Merge commit 'v2.6.35' into kbuild/kbuild

Conflicts: arch/powerpc/Makefile
author: Michal Marek <mmarek@suse.cz> 2010-08-04 13:59:13 +0200
committer: Michal Marek <mmarek@suse.cz> 2010-08-04 13:59:13 +0200
commit: 772320e84588dcbe1600ffb83e5f328f2209ac2a (patch)
tree: a7de21b79340aeaa17c58126f6b801b82c77b53a /fs
parent: 1ce53adf13a54375d2a5c7cdbe341b2558389615 (diff)
parent: 9fe6206f400646a2322096b56c59891d530e8d51 (diff)
961 files changed, 76294 insertions, 28304 deletions
diff --git a/fs/9p/cache.c b/fs/9p/cache.c
index e777961939f3..0dbe0d139ac2 100644
--- a/fs/9p/cache.c
+++ b/fs/9p/cache.c
@@ -22,6 +22,7 @@
 
 #include <linux/jiffies.h>
 #include <linux/file.h>
+#include <linux/slab.h>
 #include <linux/stat.h>
 #include <linux/sched.h>
 #include <linux/fs.h>
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index 14d944204571..7317b39b2815 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -24,6 +24,7 @@
 #include <linux/module.h>
 #include <linux/errno.h>
 #include <linux/fs.h>
+#include <linux/slab.h>
 #include <linux/sched.h>
 #include <linux/idr.h>
 #include <net/9p/9p.h>
@@ -110,7 +111,7 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
 {
 	int i, n, l, clone, any, access;
 	u32 uid;
-	struct p9_fid *fid;
+	struct p9_fid *fid, *old_fid = NULL;
 	struct dentry *d, *ds;
 	struct v9fs_session_info *v9ses;
 	char **wnames, *uname;
@@ -151,7 +152,7 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
 			if (access == V9FS_ACCESS_SINGLE)
 				return ERR_PTR(-EPERM);
 
-			if (v9fs_extended(v9ses))
+			if (v9fs_proto_dotu(v9ses))
 				uname = NULL;
 			else
 				uname = v9ses->uname;
@@ -183,10 +184,18 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
 		l = min(n - i, P9_MAXWELEM);
 		fid = p9_client_walk(fid, l, &wnames[i], clone);
 		if (IS_ERR(fid)) {
+			if (old_fid) {
+				/*
+				 * If we fail, clunk fid which are mapping
+				 * to path component and not the last component
+				 * of the path.
+				 */
+				p9_client_clunk(old_fid);
+			}
 			kfree(wnames);
 			return fid;
 		}
-
+		old_fid = fid;
 		i += l;
 		clone = 0;
 	}
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index cf62b05e296a..f8b86e92cd66 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -29,6 +29,7 @@
 #include <linux/sched.h>
 #include <linux/parser.h>
 #include <linux/idr.h>
+#include <linux/slab.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
 #include <net/9p/transport.h>
@@ -84,7 +85,7 @@ static const match_table_t tokens = {
 
 static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
 {
-	char *options;
+	char *options, *tmp_options;
 	substring_t args[MAX_OPT_ARGS];
 	char *p;
 	int option = 0;
@@ -102,9 +103,12 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
 	if (!opts)
 		return 0;
 
-	options = kstrdup(opts, GFP_KERNEL);
-	if (!options)
+	tmp_options = kstrdup(opts, GFP_KERNEL);
+	if (!tmp_options) {
+		ret = -ENOMEM;
 		goto fail_option_alloc;
+	}
+	options = tmp_options;
 
 	while ((p = strsep(&options, ",")) != NULL) {
 		int token;
@@ -159,8 +163,12 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
 			break;
 		case Opt_cache:
 			s = match_strdup(&args[0]);
-			if (!s)
-				goto fail_option_alloc;
+			if (!s) {
+				ret = -ENOMEM;
+				P9_DPRINTK(P9_DEBUG_ERROR,
+				  "problem allocating copy of cache arg\n");
+				goto free_and_return;
+			}
 
 			if (strcmp(s, "loose") == 0)
 				v9ses->cache = CACHE_LOOSE;
@@ -173,8 +181,12 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
 
 		case Opt_access:
 			s = match_strdup(&args[0]);
-			if (!s)
-				goto fail_option_alloc;
+			if (!s) {
+				ret = -ENOMEM;
+				P9_DPRINTK(P9_DEBUG_ERROR,
+				  "problem allocating copy of access arg\n");
+				goto free_and_return;
+			}
 
 			v9ses->flags &= ~V9FS_ACCESS_MASK;
 			if (strcmp(s, "user") == 0)
@@ -194,13 +206,11 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
 			continue;
 		}
 	}
-	kfree(options);
-	return ret;
 
+free_and_return:
+	kfree(tmp_options);
 fail_option_alloc:
-	P9_DPRINTK(P9_DEBUG_ERROR,
-		   "failed to allocate copy of option argument\n");
-	return -ENOMEM;
+	return ret;
 }
 
 /**
@@ -228,11 +238,18 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 		return ERR_PTR(-ENOMEM);
 	}
 
+	rc = bdi_setup_and_register(&v9ses->bdi, "9p", BDI_CAP_MAP_COPY);
+	if (rc) {
+		__putname(v9ses->aname);
+		__putname(v9ses->uname);
+		return ERR_PTR(rc);
+	}
+
 	spin_lock(&v9fs_sessionlist_lock);
 	list_add(&v9ses->slist, &v9fs_sessionlist);
 	spin_unlock(&v9fs_sessionlist_lock);
 
-	v9ses->flags = V9FS_EXTENDED | V9FS_ACCESS_USER;
+	v9ses->flags = V9FS_ACCESS_USER;
 	strcpy(v9ses->uname, V9FS_DEFUSER);
 	strcpy(v9ses->aname, V9FS_DEFANAME);
 	v9ses->uid = ~0;
@@ -253,13 +270,15 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 		goto error;
 	}
 
-	if (!v9ses->clnt->dotu)
-		v9ses->flags &= ~V9FS_EXTENDED;
+	if (p9_is_proto_dotl(v9ses->clnt))
+		v9ses->flags |= V9FS_PROTO_2000L;
+	else if (p9_is_proto_dotu(v9ses->clnt))
+		v9ses->flags |= V9FS_PROTO_2000U;
 
 	v9ses->maxdata = v9ses->clnt->msize - P9_IOHDRSZ;
 
 	/* for legacy mode, fall back to V9FS_ACCESS_ANY */
-	if (!v9fs_extended(v9ses) &&
+	if (!v9fs_proto_dotu(v9ses) &&
 		((v9ses->flags&V9FS_ACCESS_MASK) == V9FS_ACCESS_USER)) {
 
 		v9ses->flags &= ~V9FS_ACCESS_MASK;
@@ -289,6 +308,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 	return fid;
 
 error:
+	bdi_destroy(&v9ses->bdi);
 	return ERR_PTR(retval);
 }
 
@@ -314,6 +334,8 @@ void v9fs_session_close(struct v9fs_session_info *v9ses)
 	__putname(v9ses->uname);
 	__putname(v9ses->aname);
 
+	bdi_destroy(&v9ses->bdi);
+
 	spin_lock(&v9fs_sessionlist_lock);
 	list_del(&v9ses->slist);
 	spin_unlock(&v9fs_sessionlist_lock);
@@ -331,6 +353,19 @@ void v9fs_session_cancel(struct v9fs_session_info *v9ses) {
 	p9_client_disconnect(v9ses->clnt);
 }
 
+/**
+ * v9fs_session_begin_cancel - Begin terminate of a session
+ * @v9ses: session to terminate
+ *
+ * After this call we don't allow any request other than clunk.
+ */
+
+void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses)
+{
+	P9_DPRINTK(P9_DEBUG_ERROR, "begin cancel session %p\n", v9ses);
+	p9_client_begin_disconnect(v9ses->clnt);
+}
+
 extern int v9fs_error_init(void);
 
 static struct kobject *v9fs_kobj;
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 019f4ccb70c1..bec4d0bcb458 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -20,10 +20,12 @@
  *  Boston, MA  02111-1301  USA
  *
  */
+#include <linux/backing-dev.h>
 
 /**
  * enum p9_session_flags - option flags for each 9P session
- * @V9FS_EXTENDED: whether or not to use 9P2000.u extensions
+ * @V9FS_PROTO_2000U: whether or not to use 9P2000.u extensions
+ * @V9FS_PROTO_2000L: whether or not to use 9P2000.l extensions
  * @V9FS_ACCESS_SINGLE: only the mounting user can access the hierarchy
  * @V9FS_ACCESS_USER: a new attach will be issued for every user (default)
  * @V9FS_ACCESS_ANY: use a single attach for all users
@@ -32,11 +34,12 @@
  * Session flags reflect options selected by users at mount time
  */
 enum p9_session_flags {
-	V9FS_EXTENDED		= 0x01,
-	V9FS_ACCESS_SINGLE	= 0x02,
-	V9FS_ACCESS_USER	= 0x04,
-	V9FS_ACCESS_ANY		= 0x06,
-	V9FS_ACCESS_MASK	= 0x06,
+	V9FS_PROTO_2000U	= 0x01,
+	V9FS_PROTO_2000L	= 0x02,
+	V9FS_ACCESS_SINGLE	= 0x04,
+	V9FS_ACCESS_USER	= 0x08,
+	V9FS_ACCESS_ANY		= 0x0C,
+	V9FS_ACCESS_MASK	= 0x0C,
 };
 
 /* possible values of ->cache */
@@ -100,12 +103,14 @@ struct v9fs_session_info {
 	u32 uid;		/* if ACCESS_SINGLE, the uid that has access */
 	struct p9_client *clnt;	/* 9p client */
 	struct list_head slist; /* list of sessions registered with v9fs */
+	struct backing_dev_info bdi;
 };
 
 struct p9_fid *v9fs_session_init(struct v9fs_session_info *, const char *,
 									char *);
 void v9fs_session_close(struct v9fs_session_info *v9ses);
 void v9fs_session_cancel(struct v9fs_session_info *v9ses);
+void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses);
 
 #define V9FS_MAGIC 0x01021997
 
@@ -121,7 +126,12 @@ static inline struct v9fs_session_info *v9fs_inode2v9ses(struct inode *inode)
 	return (inode->i_sb->s_fs_info);
 }
 
-static inline int v9fs_extended(struct v9fs_session_info *v9ses)
+static inline int v9fs_proto_dotu(struct v9fs_session_info *v9ses)
 {
-	return v9ses->flags & V9FS_EXTENDED;
+	return v9ses->flags & V9FS_PROTO_2000U;
+}
+
+static inline int v9fs_proto_dotl(struct v9fs_session_info *v9ses)
+{
+	return v9ses->flags & V9FS_PROTO_2000L;
 }
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index 3a7560e35865..32ef4009d030 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -40,7 +40,9 @@
 extern struct file_system_type v9fs_fs_type;
 extern const struct address_space_operations v9fs_addr_operations;
 extern const struct file_operations v9fs_file_operations;
+extern const struct file_operations v9fs_file_operations_dotl;
 extern const struct file_operations v9fs_dir_operations;
+extern const struct file_operations v9fs_dir_operations_dotl;
 extern const struct dentry_operations v9fs_dentry_operations;
 extern const struct dentry_operations v9fs_cached_dentry_operations;
 
@@ -60,3 +62,4 @@ void v9fs_dentry_release(struct dentry *);
 int v9fs_uflags2omode(int uflags, int extended);
 
 ssize_t v9fs_file_readn(struct file *, char *, char __user *, u32, u64);
+void v9fs_blank_wstat(struct p9_wstat *wstat);
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index d74325295b1e..cbf4e50f3933 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -34,6 +34,7 @@
 #include <linux/namei.h>
 #include <linux/idr.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
 
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 15cce53bf61e..36d961f342af 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -32,6 +32,7 @@
 #include <linux/sched.h>
 #include <linux/inet.h>
 #include <linux/idr.h>
+#include <linux/slab.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
 
@@ -76,6 +77,15 @@ static inline int dt_type(struct p9_wstat *mistat)
 	return rettype;
 }
 
+static void p9stat_init(struct p9_wstat *stbuf)
+{
+	stbuf->name  = NULL;
+	stbuf->uid   = NULL;
+	stbuf->gid   = NULL;
+	stbuf->muid  = NULL;
+	stbuf->extension = NULL;
+}
+
 /**
  * v9fs_dir_readdir - read a directory
  * @filp: opened file structure
@@ -121,6 +131,8 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	rdir = (struct p9_rdir *) fid->rdir;
 
 	err = mutex_lock_interruptible(&rdir->mutex);
+	if (err)
+		return err;
 	while (err == 0) {
 		if (rdir->tail == rdir->head) {
 			err = v9fs_file_readn(filp, rdir->buf, NULL,
@@ -131,11 +143,11 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
 			rdir->head = 0;
 			rdir->tail = err;
 		}
-
 		while (rdir->head < rdir->tail) {
+			p9stat_init(&st);
 			err = p9stat_read(rdir->buf + rdir->head,
-						buflen - rdir->head, &st,
-						fid->clnt->dotu);
+						rdir->tail - rdir->head, &st,
+						fid->clnt->proto_version);
 			if (err) {
 				P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err);
 				err = -EIO;
@@ -191,3 +203,11 @@ const struct file_operations v9fs_dir_operations = {
 	.open = v9fs_file_open,
 	.release = v9fs_dir_release,
 };
+
+const struct file_operations v9fs_dir_operations_dotl = {
+	.read = generic_read_dir,
+	.llseek = generic_file_llseek,
+	.readdir = v9fs_dir_readdir,
+	.open = v9fs_file_open,
+	.release = v9fs_dir_release,
+};
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 3902bf43a088..2bedc6c94fc2 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -61,7 +61,7 @@ int v9fs_file_open(struct inode *inode, struct file *file)
 
 	P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p \n", inode, file);
 	v9ses = v9fs_inode2v9ses(inode);
-	omode = v9fs_uflags2omode(file->f_flags, v9fs_extended(v9ses));
+	omode = v9fs_uflags2omode(file->f_flags, v9fs_proto_dotu(v9ses));
 	fid = file->private_data;
 	if (!fid) {
 		fid = v9fs_fid_clone(file->f_path.dentry);
@@ -77,7 +77,7 @@ int v9fs_file_open(struct inode *inode, struct file *file)
 			i_size_write(inode, 0);
 			inode->i_blocks = 0;
 		}
-		if ((file->f_flags & O_APPEND) && (!v9fs_extended(v9ses)))
+		if ((file->f_flags & O_APPEND) && (!v9fs_proto_dotu(v9ses)))
 			generic_file_llseek(file, 0, SEEK_END);
 	}
 
@@ -114,7 +114,7 @@ static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl)
 	P9_DPRINTK(P9_DEBUG_VFS, "filp: %p lock: %p\n", filp, fl);
 
 	/* No mandatory locks */
-	if (__mandatory_lock(inode))
+	if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
 		return -ENOLCK;
 
 	if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) {
@@ -215,7 +215,7 @@ v9fs_file_write(struct file *filp, const char __user * data,
 	struct p9_fid *fid;
 	struct p9_client *clnt;
 	struct inode *inode = filp->f_path.dentry->d_inode;
-	int origin = *offset;
+	loff_t origin = *offset;
 	unsigned long pg_start, pg_end;
 
 	P9_DPRINTK(P9_DEBUG_VFS, "data %p count %d offset %x\n", data,
@@ -257,6 +257,21 @@ v9fs_file_write(struct file *filp, const char __user * data,
 	return total;
 }
 
+static int v9fs_file_fsync(struct file *filp, int datasync)
+{
+	struct p9_fid *fid;
+	struct p9_wstat wstat;
+	int retval;
+
+	P9_DPRINTK(P9_DEBUG_VFS, "filp %p datasync %x\n", filp, datasync);
+
+	fid = filp->private_data;
+	v9fs_blank_wstat(&wstat);
+
+	retval = p9_client_wstat(fid, &wstat);
+	return retval;
+}
+
 static const struct file_operations v9fs_cached_file_operations = {
 	.llseek = generic_file_llseek,
 	.read = do_sync_read,
@@ -266,6 +281,7 @@ static const struct file_operations v9fs_cached_file_operations = {
 	.release = v9fs_dir_release,
 	.lock = v9fs_file_lock,
 	.mmap = generic_file_readonly_mmap,
+	.fsync = v9fs_file_fsync,
 };
 
 const struct file_operations v9fs_file_operations = {
@@ -276,4 +292,16 @@ const struct file_operations v9fs_file_operations = {
 	.release = v9fs_dir_release,
 	.lock = v9fs_file_lock,
 	.mmap = generic_file_readonly_mmap,
+	.fsync = v9fs_file_fsync,
+};
+
+const struct file_operations v9fs_file_operations_dotl = {
+	.llseek = generic_file_llseek,
+	.read = v9fs_file_read,
+	.write = v9fs_file_write,
+	.open = v9fs_file_open,
+	.release = v9fs_dir_release,
+	.lock = v9fs_file_lock,
+	.mmap = generic_file_readonly_mmap,
+	.fsync = v9fs_file_fsync,
 };
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 9d03d1ebca6f..4331b3b5ee1c 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -34,6 +34,7 @@
 #include <linux/namei.h>
 #include <linux/idr.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
 
@@ -43,9 +44,12 @@
 #include "cache.h"
 
 static const struct inode_operations v9fs_dir_inode_operations;
-static const struct inode_operations v9fs_dir_inode_operations_ext;
+static const struct inode_operations v9fs_dir_inode_operations_dotu;
+static const struct inode_operations v9fs_dir_inode_operations_dotl;
 static const struct inode_operations v9fs_file_inode_operations;
+static const struct inode_operations v9fs_file_inode_operations_dotl;
 static const struct inode_operations v9fs_symlink_inode_operations;
+static const struct inode_operations v9fs_symlink_inode_operations_dotl;
 
 /**
  * unixmode2p9mode - convert unix mode bits to plan 9
@@ -60,7 +64,7 @@ static int unixmode2p9mode(struct v9fs_session_info *v9ses, int mode)
 	res = mode & 0777;
 	if (S_ISDIR(mode))
 		res |= P9_DMDIR;
-	if (v9fs_extended(v9ses)) {
+	if (v9fs_proto_dotu(v9ses)) {
 		if (S_ISLNK(mode))
 			res |= P9_DMSYMLINK;
 		if (v9ses->nodev == 0) {
@@ -102,21 +106,21 @@ static int p9mode2unixmode(struct v9fs_session_info *v9ses, int mode)
 
 	if ((mode & P9_DMDIR) == P9_DMDIR)
 		res |= S_IFDIR;
-	else if ((mode & P9_DMSYMLINK) && (v9fs_extended(v9ses)))
+	else if ((mode & P9_DMSYMLINK) && (v9fs_proto_dotu(v9ses)))
 		res |= S_IFLNK;
-	else if ((mode & P9_DMSOCKET) && (v9fs_extended(v9ses))
+	else if ((mode & P9_DMSOCKET) && (v9fs_proto_dotu(v9ses))
 		 && (v9ses->nodev == 0))
 		res |= S_IFSOCK;
-	else if ((mode & P9_DMNAMEDPIPE) && (v9fs_extended(v9ses))
+	else if ((mode & P9_DMNAMEDPIPE) && (v9fs_proto_dotu(v9ses))
 		 && (v9ses->nodev == 0))
 		res |= S_IFIFO;
-	else if ((mode & P9_DMDEVICE) && (v9fs_extended(v9ses))
+	else if ((mode & P9_DMDEVICE) && (v9fs_proto_dotu(v9ses))
 		 && (v9ses->nodev == 0))
 		res |= S_IFBLK;
 	else
 		res |= S_IFREG;
 
-	if (v9fs_extended(v9ses)) {
+	if (v9fs_proto_dotu(v9ses)) {
 		if ((mode & P9_DMSETUID) == P9_DMSETUID)
 			res |= S_ISUID;
 
@@ -176,7 +180,7 @@ int v9fs_uflags2omode(int uflags, int extended)
  *
  */
 
-static void
+void
 v9fs_blank_wstat(struct p9_wstat *wstat)
 {
 	wstat->type = ~0;
@@ -252,9 +256,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
 		return ERR_PTR(-ENOMEM);
 	}
 
-	inode->i_mode = mode;
-	inode->i_uid = current_fsuid();
-	inode->i_gid = current_fsgid();
+	inode_init_owner(inode, NULL, mode);
 	inode->i_blocks = 0;
 	inode->i_rdev = 0;
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
@@ -265,7 +267,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
 	case S_IFBLK:
 	case S_IFCHR:
 	case S_IFSOCK:
-		if (!v9fs_extended(v9ses)) {
+		if (!v9fs_proto_dotu(v9ses)) {
 			P9_DPRINTK(P9_DEBUG_ERROR,
 				   "special files without extended mode\n");
 			err = -EINVAL;
@@ -274,25 +276,44 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
 		init_special_inode(inode, inode->i_mode, inode->i_rdev);
 		break;
 	case S_IFREG:
-		inode->i_op = &v9fs_file_inode_operations;
-		inode->i_fop = &v9fs_file_operations;
+		if (v9fs_proto_dotl(v9ses)) {
+			inode->i_op = &v9fs_file_inode_operations_dotl;
+			inode->i_fop = &v9fs_file_operations_dotl;
+		} else {
+			inode->i_op = &v9fs_file_inode_operations;
+			inode->i_fop = &v9fs_file_operations;
+		}
+
 		break;
+
 	case S_IFLNK:
-		if (!v9fs_extended(v9ses)) {
-			P9_DPRINTK(P9_DEBUG_ERROR,
-				   "extended modes used w/o 9P2000.u\n");
+		if (!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses)) {
+			P9_DPRINTK(P9_DEBUG_ERROR, "extended modes used with "
+						"legacy protocol.\n");
 			err = -EINVAL;
 			goto error;
 		}
-		inode->i_op = &v9fs_symlink_inode_operations;
+
+		if (v9fs_proto_dotl(v9ses))
+			inode->i_op = &v9fs_symlink_inode_operations_dotl;
+		else
+			inode->i_op = &v9fs_symlink_inode_operations;
+
 		break;
 	case S_IFDIR:
 		inc_nlink(inode);
-		if (v9fs_extended(v9ses))
-			inode->i_op = &v9fs_dir_inode_operations_ext;
+		if (v9fs_proto_dotl(v9ses))
+			inode->i_op = &v9fs_dir_inode_operations_dotl;
+		else if (v9fs_proto_dotu(v9ses))
+			inode->i_op = &v9fs_dir_inode_operations_dotu;
 		else
 			inode->i_op = &v9fs_dir_inode_operations;
-		inode->i_fop = &v9fs_dir_operations;
+
+		if (v9fs_proto_dotl(v9ses))
+			inode->i_fop = &v9fs_dir_operations_dotl;
+		else
+			inode->i_fop = &v9fs_dir_operations;
+
 		break;
 	default:
 		P9_DPRINTK(P9_DEBUG_ERROR, "BAD mode 0x%x S_IFMT 0x%x\n",
@@ -431,20 +452,22 @@ error:
 
 static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir)
 {
+	int retval;
 	struct inode *file_inode;
-	struct v9fs_session_info *v9ses;
 	struct p9_fid *v9fid;
 
 	P9_DPRINTK(P9_DEBUG_VFS, "inode: %p dentry: %p rmdir: %d\n", dir, file,
 		rmdir);
 
 	file_inode = file->d_inode;
-	v9ses = v9fs_inode2v9ses(file_inode);
 	v9fid = v9fs_fid_clone(file);
 	if (IS_ERR(v9fid))
 		return PTR_ERR(v9fid);
 
-	return p9_client_remove(v9fid);
+	retval = p9_client_remove(v9fid);
+	if (!retval)
+		drop_nlink(file_inode);
+	return retval;
 }
 
 static int
@@ -479,12 +502,11 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
 	ofid = NULL;
 	fid = NULL;
 	name = (char *) dentry->d_name.name;
-	dfid = v9fs_fid_clone(dentry->d_parent);
+	dfid = v9fs_fid_lookup(dentry->d_parent);
 	if (IS_ERR(dfid)) {
 		err = PTR_ERR(dfid);
-		P9_DPRINTK(P9_DEBUG_VFS, "fid clone failed %d\n", err);
-		dfid = NULL;
-		goto error;
+		P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
+		return ERR_PTR(err);
 	}
 
 	/* clone a fid to use for creation */
@@ -492,8 +514,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
 	if (IS_ERR(ofid)) {
 		err = PTR_ERR(ofid);
 		P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
-		ofid = NULL;
-		goto error;
+		return ERR_PTR(err);
 	}
 
 	err = p9_client_fcreate(ofid, name, perm, mode, extension);
@@ -503,14 +524,13 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
 	}
 
 	/* now walk from the parent so we can get unopened fid */
-	fid = p9_client_walk(dfid, 1, &name, 0);
+	fid = p9_client_walk(dfid, 1, &name, 1);
 	if (IS_ERR(fid)) {
 		err = PTR_ERR(fid);
 		P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
 		fid = NULL;
 		goto error;
-	} else
-		dfid = NULL;
+	}
 
 	/* instantiate inode and assign the unopened fid to the dentry */
 	inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
@@ -533,9 +553,6 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
 	return ofid;
 
 error:
-	if (dfid)
-		p9_client_clunk(dfid);
-
 	if (ofid)
 		p9_client_clunk(ofid);
 
@@ -575,7 +592,8 @@ v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode,
 		flags = O_RDWR;
 
 	fid = v9fs_create(v9ses, dir, dentry, NULL, perm,
-				v9fs_uflags2omode(flags, v9fs_extended(v9ses)));
+				v9fs_uflags2omode(flags,
+						v9fs_proto_dotu(v9ses)));
 	if (IS_ERR(fid)) {
 		err = PTR_ERR(fid);
 		fid = NULL;
@@ -655,6 +673,9 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 	P9_DPRINTK(P9_DEBUG_VFS, "dir: %p dentry: (%s) %p nameidata: %p\n",
 		dir, dentry->d_name.name, dentry, nameidata);
 
+	if (dentry->d_name.len > NAME_MAX)
+		return ERR_PTR(-ENAMETOOLONG);
+
 	sb = dir->i_sb;
 	v9ses = v9fs_inode2v9ses(dir);
 	dfid = v9fs_fid_lookup(dentry->d_parent);
@@ -666,8 +687,8 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 	if (IS_ERR(fid)) {
 		result = PTR_ERR(fid);
 		if (result == -ENOENT) {
-			d_add(dentry, NULL);
-			return NULL;
+			inode = NULL;
+			goto inst_out;
 		}
 
 		return ERR_PTR(result);
@@ -684,7 +705,8 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 	if (result < 0)
 		goto error;
 
-	if ((fid->qid.version) && (v9ses->cache))
+inst_out:
+	if (v9ses->cache)
 		dentry->d_op = &v9fs_cached_dentry_operations;
 	else
 		dentry->d_op = &v9fs_dentry_operations;
@@ -763,6 +785,13 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		goto clunk_olddir;
 	}
 
+	if (v9fs_proto_dotl(v9ses)) {
+		retval = p9_client_rename(oldfid, newdirfid,
+					(char *) new_dentry->d_name.name);
+		if (retval != -ENOSYS)
+			goto clunk_newdir;
+	}
+
 	/* 9P can only handle file rename in the same directory */
 	if (memcmp(&olddirfid->qid, &newdirfid->qid, sizeof(newdirfid->qid))) {
 		P9_DPRINTK(P9_DEBUG_ERROR,
@@ -858,7 +887,7 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
 	if (iattr->ia_valid & ATTR_SIZE)
 		wstat.length = iattr->ia_size;
 
-	if (v9fs_extended(v9ses)) {
+	if (v9fs_proto_dotu(v9ses)) {
 		if (iattr->ia_valid & ATTR_UID)
 			wstat.n_uid = iattr->ia_uid;
 
@@ -886,6 +915,8 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
 	struct super_block *sb)
 {
 	char ext[32];
+	char tag_name[14];
+	unsigned int i_nlink;
 	struct v9fs_session_info *v9ses = sb->s_fs_info;
 
 	inode->i_nlink = 1;
@@ -897,11 +928,26 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
 	inode->i_uid = v9ses->dfltuid;
 	inode->i_gid = v9ses->dfltgid;
 
-	if (v9fs_extended(v9ses)) {
+	if (v9fs_proto_dotu(v9ses)) {
 		inode->i_uid = stat->n_uid;
 		inode->i_gid = stat->n_gid;
 	}
-
+	if ((S_ISREG(inode->i_mode)) || (S_ISDIR(inode->i_mode))) {
+		if (v9fs_proto_dotu(v9ses) && (stat->extension[0] != '\0')) {
+			/*
+			 * Hadlink support got added later to
+			 * to the .u extension. So there can be
+			 * server out there that doesn't support
+			 * this even with .u extension. So check
+			 * for non NULL stat->extension
+			 */
+			strncpy(ext, stat->extension, sizeof(ext));
+			/* HARDLINKCOUNT %u */
+			sscanf(ext, "%13s %u", tag_name, &i_nlink);
+			if (!strncmp(tag_name, "HARDLINKCOUNT", 13))
+				inode->i_nlink = i_nlink;
+		}
+	}
 	inode->i_mode = p9mode2unixmode(v9ses, stat->mode);
 	if ((S_ISBLK(inode->i_mode)) || (S_ISCHR(inode->i_mode))) {
 		char type = 0;
@@ -976,7 +1022,7 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen)
 	if (IS_ERR(fid))
 		return PTR_ERR(fid);
 
-	if (!v9fs_extended(v9ses))
+	if (!v9fs_proto_dotu(v9ses))
 		return -EBADF;
 
 	st = p9_client_stat(fid);
@@ -1066,7 +1112,7 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry,
 	struct p9_fid *fid;
 
 	v9ses = v9fs_inode2v9ses(dir);
-	if (!v9fs_extended(v9ses)) {
+	if (!v9fs_proto_dotu(v9ses)) {
 		P9_DPRINTK(P9_DEBUG_ERROR, "not extended\n");
 		return -EPERM;
 	}
@@ -1171,6 +1217,8 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
 		sprintf(name, "c %u %u", MAJOR(rdev), MINOR(rdev));
 	else if (S_ISFIFO(mode))
 		*name = 0;
+	else if (S_ISSOCK(mode))
+		*name = 0;
 	else {
 		__putname(name);
 		return -EINVAL;
@@ -1182,7 +1230,21 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
 	return retval;
 }
 
-static const struct inode_operations v9fs_dir_inode_operations_ext = {
+static const struct inode_operations v9fs_dir_inode_operations_dotu = {
+	.create = v9fs_vfs_create,
+	.lookup = v9fs_vfs_lookup,
+	.symlink = v9fs_vfs_symlink,
+	.link = v9fs_vfs_link,
+	.unlink = v9fs_vfs_unlink,
+	.mkdir = v9fs_vfs_mkdir,
+	.rmdir = v9fs_vfs_rmdir,
+	.mknod = v9fs_vfs_mknod,
+	.rename = v9fs_vfs_rename,
+	.getattr = v9fs_vfs_getattr,
+	.setattr = v9fs_vfs_setattr,
+};
+
+static const struct inode_operations v9fs_dir_inode_operations_dotl = {
 	.create = v9fs_vfs_create,
 	.lookup = v9fs_vfs_lookup,
 	.symlink = v9fs_vfs_symlink,
@@ -1213,6 +1275,11 @@ static const struct inode_operations v9fs_file_inode_operations = {
 	.setattr = v9fs_vfs_setattr,
 };
 
+static const struct inode_operations v9fs_file_inode_operations_dotl = {
+	.getattr = v9fs_vfs_getattr,
+	.setattr = v9fs_vfs_setattr,
+};
+
 static const struct inode_operations v9fs_symlink_inode_operations = {
 	.readlink = generic_readlink,
 	.follow_link = v9fs_vfs_follow_link,
@@ -1220,3 +1287,11 @@ static const struct inode_operations v9fs_symlink_inode_operations = {
 	.getattr = v9fs_vfs_getattr,
 	.setattr = v9fs_vfs_setattr,
 };
+
+static const struct inode_operations v9fs_symlink_inode_operations_dotl = {
+	.readlink = generic_readlink,
+	.follow_link = v9fs_vfs_follow_link,
+	.put_link = v9fs_vfs_put_link,
+	.getattr = v9fs_vfs_getattr,
+	.setattr = v9fs_vfs_setattr,
+};
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 69357c0d9899..be74d020436e 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -37,6 +37,8 @@
 #include <linux/mount.h>
 #include <linux/idr.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/statfs.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
 
@@ -44,7 +46,7 @@
 #include "v9fs_vfs.h"
 #include "fid.h"
 
-static const struct super_operations v9fs_super_ops;
+static const struct super_operations v9fs_super_ops, v9fs_super_ops_dotl;
 
 /**
  * v9fs_set_super - set the superblock
@@ -75,7 +77,11 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
 	sb->s_blocksize_bits = fls(v9ses->maxdata - 1);
 	sb->s_blocksize = 1 << sb->s_blocksize_bits;
 	sb->s_magic = V9FS_MAGIC;
-	sb->s_op = &v9fs_super_ops;
+	if (v9fs_proto_dotl(v9ses))
+		sb->s_op = &v9fs_super_ops_dotl;
+	else
+		sb->s_op = &v9fs_super_ops;
+	sb->s_bdi = &v9ses->bdi;
 
 	sb->s_flags = flags | MS_ACTIVE | MS_SYNCHRONOUS | MS_DIRSYNC |
 	    MS_NOATIME;
@@ -193,6 +199,7 @@ static void v9fs_kill_super(struct super_block *s)
 
 	kill_anon_super(s);
 
+	v9fs_session_cancel(v9ses);
 	v9fs_session_close(v9ses);
 	kfree(v9ses);
 	s->s_fs_info = NULL;
@@ -205,7 +212,43 @@ v9fs_umount_begin(struct super_block *sb)
 	struct v9fs_session_info *v9ses;
 
 	v9ses = sb->s_fs_info;
-	v9fs_session_cancel(v9ses);
+	v9fs_session_begin_cancel(v9ses);
+}
+
+static int v9fs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct v9fs_session_info *v9ses;
+	struct p9_fid *fid;
+	struct p9_rstatfs rs;
+	int res;
+
+	fid = v9fs_fid_lookup(dentry);
+	if (IS_ERR(fid)) {
+		res = PTR_ERR(fid);
+		goto done;
+	}
+
+	v9ses = v9fs_inode2v9ses(dentry->d_inode);
+	if (v9fs_proto_dotl(v9ses)) {
+		res = p9_client_statfs(fid, &rs);
+		if (res == 0) {
+			buf->f_type = rs.type;
+			buf->f_bsize = rs.bsize;
+			buf->f_blocks = rs.blocks;
+			buf->f_bfree = rs.bfree;
+			buf->f_bavail = rs.bavail;
+			buf->f_files = rs.files;
+			buf->f_ffree = rs.ffree;
+			buf->f_fsid.val[0] = rs.fsid & 0xFFFFFFFFUL;
+			buf->f_fsid.val[1] = (rs.fsid >> 32) & 0xFFFFFFFFUL;
+			buf->f_namelen = rs.namelen;
+		}
+		if (res != -ENOSYS)
+			goto done;
+	}
+	res = simple_statfs(dentry, buf);
+done:
+	return res;
 }
 
 static const struct super_operations v9fs_super_ops = {
@@ -219,6 +262,17 @@ static const struct super_operations v9fs_super_ops = {
 	.umount_begin = v9fs_umount_begin,
 };
 
+static const struct super_operations v9fs_super_ops_dotl = {
+#ifdef CONFIG_9P_FSCACHE
+	.alloc_inode = v9fs_alloc_inode,
+	.destroy_inode = v9fs_destroy_inode,
+#endif
+	.statfs = v9fs_statfs,
+	.clear_inode = v9fs_clear_inode,
+	.show_options = generic_show_options,
+	.umount_begin = v9fs_umount_begin,
+};
+
 struct file_system_type v9fs_fs_type = {
 	.name = "9p",
 	.get_sb = v9fs_get_sb,
diff --git a/fs/Kconfig b/fs/Kconfig
index 64d44efad7a5..5f85b5947613 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -177,6 +177,7 @@ source "fs/efs/Kconfig"
 source "fs/jffs2/Kconfig"
 # UBIFS File system configuration
 source "fs/ubifs/Kconfig"
+source "fs/logfs/Kconfig"
 source "fs/cramfs/Kconfig"
 source "fs/squashfs/Kconfig"
 source "fs/freevxfs/Kconfig"
@@ -234,6 +235,7 @@ config NFS_COMMON
 
 source "net/sunrpc/Kconfig"
 source "fs/smbfs/Kconfig"
+source "fs/ceph/Kconfig"
 source "fs/cifs/Kconfig"
 source "fs/ncpfs/Kconfig"
 source "fs/coda/Kconfig"
diff --git a/fs/Makefile b/fs/Makefile
index af6d04700d9c..e6ec1d309b1d 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -11,7 +11,7 @@ obj-y :=	open.o read_write.o file_table.o super.o \
 		attr.o bad_inode.o file.o filesystems.o namespace.o \
 		seq_file.o xattr.o libfs.o fs-writeback.o \
 		pnode.o drop_caches.o splice.o sync.o utimes.o \
-		stack.o fs_struct.o
+		stack.o fs_struct.o statfs.o
 
 ifeq ($(CONFIG_BLOCK),y)
 obj-y +=	buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o
@@ -99,6 +99,7 @@ obj-$(CONFIG_NTFS_FS)		+= ntfs/
 obj-$(CONFIG_UFS_FS)		+= ufs/
 obj-$(CONFIG_EFS_FS)		+= efs/
 obj-$(CONFIG_JFFS2_FS)		+= jffs2/
+obj-$(CONFIG_LOGFS)		+= logfs/
 obj-$(CONFIG_UBIFS_FS)		+= ubifs/
 obj-$(CONFIG_AFFS_FS)		+= affs/
 obj-$(CONFIG_ROMFS_FS)		+= romfs/
@@ -124,3 +125,4 @@ obj-$(CONFIG_OCFS2_FS)		+= ocfs2/
 obj-$(CONFIG_BTRFS_FS)		+= btrfs/
 obj-$(CONFIG_GFS2_FS)           += gfs2/
 obj-$(CONFIG_EXOFS_FS)          += exofs/
+obj-$(CONFIG_CEPH_FS)		+= ceph/
diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h
index 9cc18775b832..2ff622f6f547 100644
--- a/fs/adfs/adfs.h
+++ b/fs/adfs/adfs.h
@@ -121,7 +121,7 @@ struct adfs_discmap {
 
 /* Inode stuff */
 struct inode *adfs_iget(struct super_block *sb, struct object_info *obj);
-int adfs_write_inode(struct inode *inode,int unused);
+int adfs_write_inode(struct inode *inode, struct writeback_control *wbc);
 int adfs_notify_change(struct dentry *dentry, struct iattr *attr);
 
 /* map.c */
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index 23aa52f548a0..f4287e4de744 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -197,7 +197,7 @@ const struct file_operations adfs_dir_operations = {
 	.read		= generic_read_dir,
 	.llseek		= generic_file_llseek,
 	.readdir	= adfs_readdir,
-	.fsync		= simple_fsync,
+	.fsync		= generic_file_fsync,
 };
 
 static int
diff --git a/fs/adfs/file.c b/fs/adfs/file.c
index 005ea34d1758..a36da5382b40 100644
--- a/fs/adfs/file.c
+++ b/fs/adfs/file.c
@@ -26,7 +26,7 @@ const struct file_operations adfs_file_operations = {
 	.read		= do_sync_read,
 	.aio_read	= generic_file_aio_read,
 	.mmap		= generic_file_mmap,
-	.fsync		= simple_fsync,
+	.fsync		= generic_file_fsync,
 	.write		= do_sync_write,
 	.aio_write	= generic_file_aio_write,
 	.splice_read	= generic_file_splice_read,
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index 3f57ce4bee5d..6f850b06ab62 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -9,6 +9,7 @@
  */
 #include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
+#include <linux/writeback.h>
 #include "adfs.h"
 
 /*
@@ -321,8 +322,9 @@ adfs_notify_change(struct dentry *dentry, struct iattr *attr)
 	if (error)
 		goto out;
 
+	/* XXX: this is missing some actual on-disk truncation.. */
 	if (ia_valid & ATTR_SIZE)
-		error = vmtruncate(inode, attr->ia_size);
+		error = simple_setsize(inode, attr->ia_size);
 
 	if (error)
 		goto out;
@@ -360,7 +362,7 @@ out:
  * The adfs-specific inode data has already been updated by
  * adfs_notify_change()
  */
-int adfs_write_inode(struct inode *inode, int wait)
+int adfs_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
 	struct super_block *sb = inode->i_sb;
 	struct object_info obj;
@@ -375,7 +377,7 @@ int adfs_write_inode(struct inode *inode, int wait)
 	obj.attr	= ADFS_I(inode)->attr;
 	obj.size	= inode->i_size;
 
-	ret = adfs_dir_update(sb, &obj, wait);
+	ret = adfs_dir_update(sb, &obj, wbc->sync_mode == WB_SYNC_ALL);
 	unlock_kernel();
 	return ret;
 }
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 6910a98bd73c..4a3af7075c1d 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -13,6 +13,7 @@
 #include <linux/parser.h>
 #include <linux/mount.h>
 #include <linux/seq_file.h>
+#include <linux/slab.h>
 #include <linux/smp_lock.h>
 #include <linux/statfs.h>
 #include "adfs.h"
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index 0e40caaba456..f05b6155ccc8 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -175,14 +175,15 @@ extern void			 affs_delete_inode(struct inode *inode);
 extern void			 affs_clear_inode(struct inode *inode);
 extern struct inode		*affs_iget(struct super_block *sb,
 					unsigned long ino);
-extern int			 affs_write_inode(struct inode *inode, int);
+extern int			 affs_write_inode(struct inode *inode,
+					struct writeback_control *wbc);
 extern int			 affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s32 type);
 
 /* file.c */
 
 void		affs_free_prealloc(struct inode *inode);
 extern void	affs_truncate(struct inode *);
-int		affs_file_fsync(struct file *, struct dentry *, int);
+int		affs_file_fsync(struct file *, int);
 
 /* dir.c */
 
diff --git a/fs/affs/bitmap.c b/fs/affs/bitmap.c
index dc5ef14bdc1c..3e262711ae06 100644
--- a/fs/affs/bitmap.c
+++ b/fs/affs/bitmap.c
@@ -7,6 +7,7 @@
  *  block allocation, deallocation, calculation of free space.
  */
 
+#include <linux/slab.h>
 #include "affs.h"
 
 /* This is, of course, shamelessly stolen from fs/minix */
@@ -128,7 +129,7 @@ err_range:
 /*
  * Allocate a block in the given allocation zone.
  * Since we have to byte-swap the bitmap on little-endian
- * machines, this is rather expensive. Therefor we will
+ * machines, this is rather expensive. Therefore we will
  * preallocate up to 16 blocks from the same word, if
  * possible. We are not doing preallocations in the
  * header zone, though.
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 184e55c1c9ba..322710c3eedf 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -916,9 +916,9 @@ affs_truncate(struct inode *inode)
 	affs_free_prealloc(inode);
 }
 
-int affs_file_fsync(struct file *filp, struct dentry *dentry, int datasync)
+int affs_file_fsync(struct file *filp, int datasync)
 {
-	struct inode * inode = dentry->d_inode;
+	struct inode *inode = filp->f_mapping->host;
 	int ret, err;
 
 	ret = write_inode_now(inode, 0);
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index 3c4ec7d864c4..f4b2a4ee4f91 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -10,6 +10,7 @@
  *  (C) 1991  Linus Torvalds - minix filesystem
  */
 #include <linux/sched.h>
+#include <linux/gfp.h>
 #include "affs.h"
 
 extern const struct inode_operations affs_symlink_inode_operations;
@@ -166,7 +167,7 @@ bad_inode:
 }
 
 int
-affs_write_inode(struct inode *inode, int unused)
+affs_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
 	struct super_block	*sb = inode->i_sb;
 	struct buffer_head	*bh;
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index d70bbbac6b7b..914d1c0bc07a 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -224,7 +224,7 @@ affs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
 		affs_brelse(bh);
 		inode = affs_iget(sb, ino);
 		if (IS_ERR(inode))
-			return ERR_PTR(PTR_ERR(inode));
+			return ERR_CAST(inode);
 	}
 	dentry->d_op = AFFS_SB(sb)->s_flags & SF_INTL ? &affs_intl_dentry_operations : &affs_dentry_operations;
 	d_add(dentry, inode);
diff --git a/fs/affs/super.c b/fs/affs/super.c
index d41e9673cd97..16a3e4765f68 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -17,6 +17,7 @@
 #include <linux/magic.h>
 #include <linux/sched.h>
 #include <linux/smp_lock.h>
+#include <linux/slab.h>
 #include "affs.h"
 
 extern struct timezone sys_tz;
diff --git a/fs/afs/cache.c b/fs/afs/cache.c
index e2b1d3f16519..0fb315dd4d2a 100644
--- a/fs/afs/cache.c
+++ b/fs/afs/cache.c
@@ -9,7 +9,6 @@
  * 2 of the License, or (at your option) any later version.
  */
 
-#include <linux/slab.h>
 #include <linux/sched.h>
 #include "internal.h"
 
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index eb765489164f..a3bcec75c54a 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -11,6 +11,7 @@
 
 #include <linux/module.h>
 #include <linux/init.h>
+#include <linux/slab.h>
 #include <linux/sched.h>
 #include <linux/ip.h>
 #include "internal.h"
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 88067f36e5e7..b42d5cc1d6d2 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -12,7 +12,6 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/init.h>
-#include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
 #include <linux/ctype.h>
@@ -190,13 +189,9 @@ static struct page *afs_dir_get_page(struct inode *dir, unsigned long index,
 				     struct key *key)
 {
 	struct page *page;
-	struct file file = {
-		.private_data = key,
-	};
-
 	_enter("{%lu},%lu", dir->i_ino, index);
 
-	page = read_mapping_page(dir->i_mapping, index, &file);
+	page = read_cache_page(dir->i_mapping, index, afs_page_filler, key);
 	if (!IS_ERR(page)) {
 		kmap(page);
 		if (!PageChecked(page))
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 39b301662f22..14d89fa58fee 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -12,10 +12,10 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/init.h>
-#include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
 #include <linux/writeback.h>
+#include <linux/gfp.h>
 #include "internal.h"
 
 static int afs_readpage(struct file *file, struct page *page);
@@ -121,34 +121,19 @@ static void afs_file_readpage_read_complete(struct page *page,
 #endif
 
 /*
- * AFS read page from file, directory or symlink
+ * read page from file, directory or symlink, given a key to use
  */
-static int afs_readpage(struct file *file, struct page *page)
+int afs_page_filler(void *data, struct page *page)
 {
-	struct afs_vnode *vnode;
-	struct inode *inode;
-	struct key *key;
+	struct inode *inode = page->mapping->host;
+	struct afs_vnode *vnode = AFS_FS_I(inode);
+	struct key *key = data;
 	size_t len;
 	off_t offset;
 	int ret;
 
-	inode = page->mapping->host;
-
-	if (file) {
-		key = file->private_data;
-		ASSERT(key != NULL);
-	} else {
-		key = afs_request_key(AFS_FS_S(inode->i_sb)->volume->cell);
-		if (IS_ERR(key)) {
-			ret = PTR_ERR(key);
-			goto error_nokey;
-		}
-	}
-
 	_enter("{%x},{%lu},{%lu}", key_serial(key), inode->i_ino, page->index);
 
-	vnode = AFS_FS_I(inode);
-
 	BUG_ON(!PageLocked(page));
 
 	ret = -ESTALE;
@@ -214,31 +199,56 @@ static int afs_readpage(struct file *file, struct page *page)
 		unlock_page(page);
 	}
 
-	if (!file)
-		key_put(key);
 	_leave(" = 0");
 	return 0;
 
 error:
 	SetPageError(page);
 	unlock_page(page);
-	if (!file)
-		key_put(key);
-error_nokey:
 	_leave(" = %d", ret);
 	return ret;
 }
 
 /*
+ * read page from file, directory or symlink, given a file to nominate the key
+ * to be used
+ */
+static int afs_readpage(struct file *file, struct page *page)
+{
+	struct key *key;
+	int ret;
+
+	if (file) {
+		key = file->private_data;
+		ASSERT(key != NULL);
+		ret = afs_page_filler(key, page);
+	} else {
+		struct inode *inode = page->mapping->host;
+		key = afs_request_key(AFS_FS_S(inode->i_sb)->volume->cell);
+		if (IS_ERR(key)) {
+			ret = PTR_ERR(key);
+		} else {
+			ret = afs_page_filler(key, page);
+			key_put(key);
+		}
+	}
+	return ret;
+}
+
+/*
  * read a set of pages
  */
 static int afs_readpages(struct file *file, struct address_space *mapping,
 			 struct list_head *pages, unsigned nr_pages)
 {
+	struct key *key = file->private_data;
 	struct afs_vnode *vnode;
 	int ret = 0;
 
-	_enter(",{%lu},,%d", mapping->host->i_ino, nr_pages);
+	_enter("{%d},{%lu},,%d",
+	       key_serial(key), mapping->host->i_ino, nr_pages);
+
+	ASSERT(key != NULL);
 
 	vnode = AFS_FS_I(mapping->host);
 	if (vnode->flags & AFS_VNODE_DELETED) {
@@ -279,7 +289,7 @@ static int afs_readpages(struct file *file, struct address_space *mapping,
 	}
 
 	/* load the missing pages from the network */
-	ret = read_cache_pages(mapping, pages, (void *) afs_readpage, file);
+	ret = read_cache_pages(mapping, pages, afs_page_filler, key);
 
 	_leave(" = %d [netting]", ret);
 	return ret;
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index 023b95b0d9d7..4bd0218473a9 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -10,6 +10,7 @@
  */
 
 #include <linux/init.h>
+#include <linux/slab.h>
 #include <linux/sched.h>
 #include <linux/circ_buf.h>
 #include "internal.h"
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index c048f0658751..d00b312e3110 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -16,7 +16,6 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/init.h>
-#include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
 #include <linux/sched.h>
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 6ece2a13bf71..5f679b77ce24 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -19,6 +19,7 @@
 #include <linux/workqueue.h>
 #include <linux/sched.h>
 #include <linux/fscache.h>
+#include <linux/backing-dev.h>
 
 #include "afs.h"
 #include "afs_vl.h"
@@ -313,6 +314,7 @@ struct afs_volume {
 	unsigned short		rjservers;	/* number of servers discarded due to -ENOMEDIUM */
 	struct afs_server	*servers[8];	/* servers on which volume resides (ordered) */
 	struct rw_semaphore	server_sem;	/* lock for accessing current server */
+	struct backing_dev_info	bdi;
 };
 
 /*
@@ -492,6 +494,7 @@ extern const struct file_operations afs_file_operations;
 
 extern int afs_open(struct inode *, struct file *);
 extern int afs_release(struct inode *, struct file *);
+extern int afs_page_filler(void *, struct page *);
 
 /*
  * flock.c
@@ -733,12 +736,11 @@ extern int afs_write_end(struct file *file, struct address_space *mapping,
 			struct page *page, void *fsdata);
 extern int afs_writepage(struct page *, struct writeback_control *);
 extern int afs_writepages(struct address_space *, struct writeback_control *);
-extern int afs_write_inode(struct inode *, int);
 extern void afs_pages_written_back(struct afs_vnode *, struct afs_call *);
 extern ssize_t afs_file_write(struct kiocb *, const struct iovec *,
 			      unsigned long, loff_t);
 extern int afs_writeback_all(struct afs_vnode *);
-extern int afs_fsync(struct file *, struct dentry *, int);
+extern int afs_fsync(struct file *, int);
 
 
 /*****************************************************************************/
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 5ffb570cd3a8..a9e23039ea34 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -12,11 +12,11 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/init.h>
-#include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
 #include <linux/mount.h>
 #include <linux/namei.h>
+#include <linux/gfp.h>
 #include "internal.h"
 
 
@@ -49,9 +49,6 @@ static unsigned long afs_mntpt_expiry_timeout = 10 * 60;
  */
 int afs_mntpt_check_symlink(struct afs_vnode *vnode, struct key *key)
 {
-	struct file file = {
-		.private_data = key,
-	};
 	struct page *page;
 	size_t size;
 	char *buf;
@@ -61,7 +58,8 @@ int afs_mntpt_check_symlink(struct afs_vnode *vnode, struct key *key)
 	       vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique);
 
 	/* read the contents of the symlink into the pagecache */
-	page = read_mapping_page(AFS_VNODE_TO_I(vnode)->i_mapping, 0, &file);
+	page = read_cache_page(AFS_VNODE_TO_I(vnode)->i_mapping, 0,
+			       afs_page_filler, key);
 	if (IS_ERR(page)) {
 		ret = PTR_ERR(page);
 		goto out;
@@ -138,9 +136,9 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
 {
 	struct afs_super_info *super;
 	struct vfsmount *mnt;
-	struct page *page = NULL;
+	struct page *page;
 	size_t size;
-	char *buf, *devname = NULL, *options = NULL;
+	char *buf, *devname, *options;
 	int ret;
 
 	_enter("{%s}", mntpt->d_name.name);
@@ -150,22 +148,22 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
 	ret = -EINVAL;
 	size = mntpt->d_inode->i_size;
 	if (size > PAGE_SIZE - 1)
-		goto error;
+		goto error_no_devname;
 
 	ret = -ENOMEM;
 	devname = (char *) get_zeroed_page(GFP_KERNEL);
 	if (!devname)
-		goto error;
+		goto error_no_devname;
 
 	options = (char *) get_zeroed_page(GFP_KERNEL);
 	if (!options)
-		goto error;
+		goto error_no_options;
 
 	/* read the contents of the AFS special symlink */
 	page = read_mapping_page(mntpt->d_inode->i_mapping, 0, NULL);
 	if (IS_ERR(page)) {
 		ret = PTR_ERR(page);
-		goto error;
+		goto error_no_page;
 	}
 
 	ret = -EIO;
@@ -196,12 +194,12 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
 	return mnt;
 
 error:
-	if (page)
-		page_cache_release(page);
-	if (devname)
-		free_page((unsigned long) devname);
-	if (options)
-		free_page((unsigned long) options);
+	page_cache_release(page);
+error_no_page:
+	free_page((unsigned long) options);
+error_no_options:
+	free_page((unsigned long) devname);
+error_no_devname:
 	_leave(" = %d", ret);
 	return ERR_PTR(ret);
 }
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index bde3f19c0995..67cf810e0fd6 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -9,6 +9,7 @@
  * 2 of the License, or (at your option) any later version.
  */
 
+#include <linux/slab.h>
 #include <net/sock.h>
 #include <net/af_rxrpc.h>
 #include <rxrpc/packet.h>
diff --git a/fs/afs/security.c b/fs/afs/security.c
index 3ef504370034..bb4ed144d0e4 100644
--- a/fs/afs/security.c
+++ b/fs/afs/security.c
@@ -189,8 +189,9 @@ void afs_cache_permit(struct afs_vnode *vnode, struct key *key, long acl_order)
 	if (!permits)
 		goto out_unlock;
 
-	memcpy(permits->permits, xpermits->permits,
-	       count * sizeof(struct afs_permit));
+	if (xpermits)
+		memcpy(permits->permits, xpermits->permits,
+			count * sizeof(struct afs_permit));
 
 	_debug("key %x access %x",
 	       key_serial(key), vnode->status.caller_access);
diff --git a/fs/afs/server.c b/fs/afs/server.c
index f49099516675..9fdc7fe3a7bc 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -91,9 +91,10 @@ static struct afs_server *afs_alloc_server(struct afs_cell *cell,
 
 		memcpy(&server->addr, addr, sizeof(struct in_addr));
 		server->addr.s_addr = addr->s_addr;
+		_leave(" = %p{%d}", server, atomic_read(&server->usage));
+	} else {
+		_leave(" = NULL [nomem]");
 	}
-
-	_leave(" = %p{%d}", server, atomic_read(&server->usage));
 	return server;
 }
 
diff --git a/fs/afs/super.c b/fs/afs/super.c
index e1ea1c240b6a..e932e5a3a0c1 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -48,7 +48,6 @@ struct file_system_type afs_fs_type = {
 static const struct super_operations afs_super_ops = {
 	.statfs		= afs_statfs,
 	.alloc_inode	= afs_alloc_inode,
-	.write_inode	= afs_write_inode,
 	.destroy_inode	= afs_destroy_inode,
 	.clear_inode	= afs_clear_inode,
 	.put_super	= afs_put_super,
@@ -312,6 +311,7 @@ static int afs_fill_super(struct super_block *sb, void *data)
 	sb->s_magic		= AFS_FS_MAGIC;
 	sb->s_op		= &afs_super_ops;
 	sb->s_fs_info		= as;
+	sb->s_bdi		= &as->volume->bdi;
 
 	/* allocate the root inode and dentry */
 	fid.vid		= as->volume->vid;
diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c
index 36c1306e09e0..340afd0cd182 100644
--- a/fs/afs/vlclient.c
+++ b/fs/afs/vlclient.c
@@ -9,6 +9,7 @@
  * 2 of the License, or (at your option) any later version.
  */
 
+#include <linux/gfp.h>
 #include <linux/init.h>
 #include <linux/sched.h>
 #include "internal.h"
diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c
index 6e689208def2..9ac260d1361d 100644
--- a/fs/afs/vlocation.c
+++ b/fs/afs/vlocation.c
@@ -11,6 +11,7 @@
 
 #include <linux/kernel.h>
 #include <linux/module.h>
+#include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/sched.h>
 #include "internal.h"
diff --git a/fs/afs/vnode.c b/fs/afs/vnode.c
index 2f05c4fc2a70..25cf4c3f4ff7 100644
--- a/fs/afs/vnode.c
+++ b/fs/afs/vnode.c
@@ -12,7 +12,6 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/init.h>
-#include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/sched.h>
 #include "internal.h"
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index a353e69e2391..401eeb21869f 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -106,6 +106,10 @@ struct afs_volume *afs_volume_lookup(struct afs_mount_params *params)
 	volume->cell		= params->cell;
 	volume->vid		= vlocation->vldb.vid[params->type];
 
+	ret = bdi_setup_and_register(&volume->bdi, "afs", BDI_CAP_MAP_COPY);
+	if (ret)
+		goto error_bdi;
+
 	init_rwsem(&volume->server_sem);
 
 	/* look up all the applicable server records */
@@ -151,6 +155,8 @@ error:
 	return ERR_PTR(ret);
 
 error_discard:
+	bdi_destroy(&volume->bdi);
+error_bdi:
 	up_write(&params->cell->vl_sem);
 
 	for (loop = volume->nservers - 1; loop >= 0; loop--)
@@ -200,6 +206,7 @@ void afs_put_volume(struct afs_volume *volume)
 	for (loop = volume->nservers - 1; loop >= 0; loop--)
 		afs_put_server(volume->servers[loop]);
 
+	bdi_destroy(&volume->bdi);
 	kfree(volume);
 
 	_leave(" [destroyed]");
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 5e15a21dbf9f..722743b152d8 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -585,27 +585,6 @@ int afs_writepages(struct address_space *mapping,
 }
 
 /*
- * write an inode back
- */
-int afs_write_inode(struct inode *inode, int sync)
-{
-	struct afs_vnode *vnode = AFS_FS_I(inode);
-	int ret;
-
-	_enter("{%x:%u},", vnode->fid.vid, vnode->fid.vnode);
-
-	ret = 0;
-	if (sync) {
-		ret = filemap_fdatawait(inode->i_mapping);
-		if (ret < 0)
-			__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
-	}
-
-	_leave(" = %d", ret);
-	return ret;
-}
-
-/*
  * completion of write to server
  */
 void afs_pages_written_back(struct afs_vnode *vnode, struct afs_call *call)
@@ -701,7 +680,6 @@ int afs_writeback_all(struct afs_vnode *vnode)
 {
 	struct address_space *mapping = vnode->vfs_inode.i_mapping;
 	struct writeback_control wbc = {
-		.bdi		= mapping->backing_dev_info,
 		.sync_mode	= WB_SYNC_ALL,
 		.nr_to_write	= LONG_MAX,
 		.range_cyclic	= 1,
@@ -722,8 +700,9 @@ int afs_writeback_all(struct afs_vnode *vnode)
  * - the return status from this call provides a reliable indication of
  *   whether any write errors occurred for this process.
  */
-int afs_fsync(struct file *file, struct dentry *dentry, int datasync)
+int afs_fsync(struct file *file, int datasync)
 {
+	struct dentry *dentry = file->f_path.dentry;
 	struct afs_writeback *wb, *xwb;
 	struct afs_vnode *vnode = AFS_FS_I(dentry->d_inode);
 	int ret;
diff --git a/fs/aio.c b/fs/aio.c
index 1cf12b3dd83a..1ccf25cef1f0 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -36,6 +36,7 @@
 #include <linux/blkdev.h>
 #include <linux/mempool.h>
 #include <linux/hash.h>
+#include <linux/compat.h>
 
 #include <asm/kmap_types.h>
 #include <asm/uaccess.h>
@@ -526,7 +527,7 @@ static void aio_fput_routine(struct work_struct *data)
 
 		/* Complete the fput(s) */
 		if (req->ki_filp != NULL)
-			__fput(req->ki_filp);
+			fput(req->ki_filp);
 
 		/* Link the iocb into the context's free list */
 		spin_lock_irq(&ctx->ctx_lock);
@@ -559,11 +560,11 @@ static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
 
 	/*
 	 * Try to optimize the aio and eventfd file* puts, by avoiding to
-	 * schedule work in case it is not __fput() time. In normal cases,
+	 * schedule work in case it is not final fput() time. In normal cases,
 	 * we would not be holding the last reference to the file*, so
 	 * this function will be executed w/out any aio kthread wakeup.
 	 */
-	if (unlikely(atomic_long_dec_and_test(&req->ki_filp->f_count))) {
+	if (unlikely(!fput_atomic(req->ki_filp))) {
 		get_ioctx(ctx);
 		spin_lock(&fput_lock);
 		list_add(&req->ki_list, &fput_head);
@@ -1384,13 +1385,22 @@ static ssize_t aio_fsync(struct kiocb *iocb)
 	return ret;
 }
 
-static ssize_t aio_setup_vectored_rw(int type, struct kiocb *kiocb)
+static ssize_t aio_setup_vectored_rw(int type, struct kiocb *kiocb, bool compat)
 {
 	ssize_t ret;
 
-	ret = rw_copy_check_uvector(type, (struct iovec __user *)kiocb->ki_buf,
-				    kiocb->ki_nbytes, 1,
-				    &kiocb->ki_inline_vec, &kiocb->ki_iovec);
+#ifdef CONFIG_COMPAT
+	if (compat)
+		ret = compat_rw_copy_check_uvector(type,
+				(struct compat_iovec __user *)kiocb->ki_buf,
+				kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec,
+				&kiocb->ki_iovec);
+	else
+#endif
+		ret = rw_copy_check_uvector(type,
+				(struct iovec __user *)kiocb->ki_buf,
+				kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec,
+				&kiocb->ki_iovec);
 	if (ret < 0)
 		goto out;
 
@@ -1420,7 +1430,7 @@ static ssize_t aio_setup_single_vector(struct kiocb *kiocb)
  *	Performs the initial checks and aio retry method
  *	setup for the kiocb at the time of io submission.
  */
-static ssize_t aio_setup_iocb(struct kiocb *kiocb)
+static ssize_t aio_setup_iocb(struct kiocb *kiocb, bool compat)
 {
 	struct file *file = kiocb->ki_filp;
 	ssize_t ret = 0;
@@ -1469,7 +1479,7 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb)
 		ret = security_file_permission(file, MAY_READ);
 		if (unlikely(ret))
 			break;
-		ret = aio_setup_vectored_rw(READ, kiocb);
+		ret = aio_setup_vectored_rw(READ, kiocb, compat);
 		if (ret)
 			break;
 		ret = -EINVAL;
@@ -1483,7 +1493,7 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb)
 		ret = security_file_permission(file, MAY_WRITE);
 		if (unlikely(ret))
 			break;
-		ret = aio_setup_vectored_rw(WRITE, kiocb);
+		ret = aio_setup_vectored_rw(WRITE, kiocb, compat);
 		if (ret)
 			break;
 		ret = -EINVAL;
@@ -1548,7 +1558,8 @@ static void aio_batch_free(struct hlist_head *batch_hash)
 }
 
 static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
-			 struct iocb *iocb, struct hlist_head *batch_hash)
+			 struct iocb *iocb, struct hlist_head *batch_hash,
+			 bool compat)
 {
 	struct kiocb *req;
 	struct file *file;
@@ -1609,7 +1620,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
 	req->ki_left = req->ki_nbytes = iocb->aio_nbytes;
 	req->ki_opcode = iocb->aio_lio_opcode;
 
-	ret = aio_setup_iocb(req);
+	ret = aio_setup_iocb(req, compat);
 
 	if (ret)
 		goto out_put_req;
@@ -1637,20 +1648,8 @@ out_put_req:
 	return ret;
 }
 
-/* sys_io_submit:
- *	Queue the nr iocbs pointed to by iocbpp for processing.  Returns
- *	the number of iocbs queued.  May return -EINVAL if the aio_context
- *	specified by ctx_id is invalid, if nr is < 0, if the iocb at
- *	*iocbpp[0] is not properly initialized, if the operation specified
- *	is invalid for the file descriptor in the iocb.  May fail with
- *	-EFAULT if any of the data structures point to invalid data.  May
- *	fail with -EBADF if the file descriptor specified in the first
- *	iocb is invalid.  May fail with -EAGAIN if insufficient resources
- *	are available to queue any iocbs.  Will return 0 if nr is 0.  Will
- *	fail with -ENOSYS if not implemented.
- */
-SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
-		struct iocb __user * __user *, iocbpp)
+long do_io_submit(aio_context_t ctx_id, long nr,
+		  struct iocb __user *__user *iocbpp, bool compat)
 {
 	struct kioctx *ctx;
 	long ret = 0;
@@ -1687,7 +1686,7 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
 			break;
 		}
 
-		ret = io_submit_one(ctx, user_iocb, &tmp, batch_hash);
+		ret = io_submit_one(ctx, user_iocb, &tmp, batch_hash, compat);
 		if (ret)
 			break;
 	}
@@ -1697,6 +1696,24 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
 	return i ? i : ret;
 }
 
+/* sys_io_submit:
+ *	Queue the nr iocbs pointed to by iocbpp for processing.  Returns
+ *	the number of iocbs queued.  May return -EINVAL if the aio_context
+ *	specified by ctx_id is invalid, if nr is < 0, if the iocb at
+ *	*iocbpp[0] is not properly initialized, if the operation specified
+ *	is invalid for the file descriptor in the iocb.  May fail with
+ *	-EFAULT if any of the data structures point to invalid data.  May
+ *	fail with -EBADF if the file descriptor specified in the first
+ *	iocb is invalid.  May fail with -EAGAIN if insufficient resources
+ *	are available to queue any iocbs.  Will return 0 if nr is 0.  Will
+ *	fail with -ENOSYS if not implemented.
+ */
+SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
+		struct iocb __user * __user *, iocbpp)
+{
+	return do_io_submit(ctx_id, nr, iocbpp, 0);
+}
+
 /* lookup_kiocb
  *	Finds a given iocb for cancellation.
  */
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 9f0bf13291e5..e4b75d6eda83 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -12,7 +12,6 @@
 #include <linux/file.h>
 #include <linux/poll.h>
 #include <linux/sched.h>
-#include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/fs.h>
 #include <linux/mount.h>
@@ -209,6 +208,7 @@ static struct inode *anon_inode_mkinode(void)
 	inode->i_mode = S_IRUSR | S_IWUSR;
 	inode->i_uid = current_fsuid();
 	inode->i_gid = current_fsgid();
+	inode->i_flags |= S_PRIVATE;
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 	return inode;
 }
diff --git a/fs/attr.c b/fs/attr.c
index 96d394bdaddf..b4fa3b0aa596 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -12,7 +12,6 @@
 #include <linux/capability.h>
 #include <linux/fsnotify.h>
 #include <linux/fcntl.h>
-#include <linux/quotaops.h>
 #include <linux/security.h>
 
 /* Taken over from the old code... */
@@ -68,21 +67,21 @@ EXPORT_SYMBOL(inode_change_ok);
  * @offset:	the new size to assign to the inode
  * @Returns:	0 on success, -ve errno on failure
  *
+ * inode_newsize_ok must be called with i_mutex held.
+ *
  * inode_newsize_ok will check filesystem limits and ulimits to check that the
  * new inode size is within limits. inode_newsize_ok will also send SIGXFSZ
  * when necessary. Caller must not proceed with inode size change if failure is
  * returned. @inode must be a file (not directory), with appropriate
  * permissions to allow truncate (inode_newsize_ok does NOT check these
  * conditions).
- *
- * inode_newsize_ok must be called with i_mutex held.
  */
 int inode_newsize_ok(const struct inode *inode, loff_t offset)
 {
 	if (inode->i_size < offset) {
 		unsigned long limit;
 
-		limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
+		limit = rlimit(RLIMIT_FSIZE);
 		if (limit != RLIM_INFINITY && offset > limit)
 			goto out_sig;
 		if (offset > inode->i_sb->s_maxbytes)
@@ -105,17 +104,25 @@ out_big:
 }
 EXPORT_SYMBOL(inode_newsize_ok);
 
-int inode_setattr(struct inode * inode, struct iattr * attr)
+/**
+ * generic_setattr - copy simple metadata updates into the generic inode
+ * @inode:	the inode to be updated
+ * @attr:	the new attributes
+ *
+ * generic_setattr must be called with i_mutex held.
+ *
+ * generic_setattr updates the inode's metadata with that specified
+ * in attr. Noticably missing is inode size update, which is more complex
+ * as it requires pagecache updates. See simple_setsize.
+ *
+ * The inode is not marked as dirty after this operation. The rationale is
+ * that for "simple" filesystems, the struct inode is the inode storage.
+ * The caller is free to mark the inode dirty afterwards if needed.
+ */
+void generic_setattr(struct inode *inode, const struct iattr *attr)
 {
 	unsigned int ia_valid = attr->ia_valid;
 
-	if (ia_valid & ATTR_SIZE &&
-	    attr->ia_size != i_size_read(inode)) {
-		int error = vmtruncate(inode, attr->ia_size);
-		if (error)
-			return error;
-	}
-
 	if (ia_valid & ATTR_UID)
 		inode->i_uid = attr->ia_uid;
 	if (ia_valid & ATTR_GID)
@@ -136,6 +143,28 @@ int inode_setattr(struct inode * inode, struct iattr * attr)
 			mode &= ~S_ISGID;
 		inode->i_mode = mode;
 	}
+}
+EXPORT_SYMBOL(generic_setattr);
+
+/*
+ * note this function is deprecated, the new truncate sequence should be
+ * used instead -- see eg. simple_setsize, generic_setattr.
+ */
+int inode_setattr(struct inode *inode, const struct iattr *attr)
+{
+	unsigned int ia_valid = attr->ia_valid;
+
+	if (ia_valid & ATTR_SIZE &&
+	    attr->ia_size != i_size_read(inode)) {
+		int error;
+
+		error = vmtruncate(inode, attr->ia_size);
+		if (error)
+			return error;
+	}
+
+	generic_setattr(inode, attr);
+
 	mark_inode_dirty(inode);
 
 	return 0;
@@ -212,14 +241,8 @@ int notify_change(struct dentry * dentry, struct iattr * attr)
 		error = inode->i_op->setattr(dentry, attr);
 	} else {
 		error = inode_change_ok(inode, attr);
-		if (!error) {
-			if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
-			    (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid))
-				error = vfs_dq_transfer(inode, attr) ?
-					-EDQUOT : 0;
-			if (!error)
-				error = inode_setattr(inode, attr);
-		}
+		if (!error)
+			error = inode_setattr(inode, attr);
 	}
 
 	if (ia_valid & ATTR_SIZE)
diff --git a/fs/autofs/root.c b/fs/autofs/root.c
index 4a1401cea0a1..9a0520b50663 100644
--- a/fs/autofs/root.c
+++ b/fs/autofs/root.c
@@ -13,6 +13,7 @@
 #include <linux/capability.h>
 #include <linux/errno.h>
 #include <linux/stat.h>
+#include <linux/slab.h>
 #include <linux/param.h>
 #include <linux/time.h>
 #include <linux/smp_lock.h>
@@ -27,6 +28,7 @@ static int autofs_root_mkdir(struct inode *,struct dentry *,int);
 static int autofs_root_ioctl(struct inode *, struct file *,unsigned int,unsigned long);
 
 const struct file_operations autofs_root_operations = {
+	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
 	.readdir	= autofs_root_readdir,
 	.ioctl		= autofs_root_ioctl,
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 0118d67221b2..3d283abf67d7 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -60,11 +60,6 @@ do {							\
 		current->pid, __func__, ##args);	\
 } while (0)
 
-struct rehash_entry {
-	struct task_struct *task;
-	struct list_head list;
-};
-
 /* Unified info structure.  This is pointed to by both the dentry and
    inode structures.  Each file in the filesystem has an instance of this
    structure.  It holds a reference to the dentry, so dentries are never
@@ -81,7 +76,6 @@ struct autofs_info {
 
 	struct list_head active;
 	int active_count;
-	struct list_head rehash_list;
 
 	struct list_head expiring;
 
@@ -104,7 +98,6 @@ struct autofs_info {
 #define AUTOFS_INF_EXPIRING	(1<<0) /* dentry is in the process of expiring */
 #define AUTOFS_INF_MOUNTPOINT	(1<<1) /* mountpoint status for direct expire */
 #define AUTOFS_INF_PENDING	(1<<2) /* dentry pending mount */
-#define AUTOFS_INF_REHASH	(1<<3) /* dentry in transit to ->lookup() */
 
 struct autofs_wait_queue {
 	wait_queue_head_t queue;
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 00bf8fcb245f..ba4a38b9c22f 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -22,6 +22,7 @@
 #include <linux/magic.h>
 #include <linux/dcache.h>
 #include <linux/uaccess.h>
+#include <linux/slab.h>
 
 #include "autofs_i.h"
 
@@ -94,7 +95,7 @@ static int check_dev_ioctl_version(int cmd, struct autofs_dev_ioctl *param)
  */
 static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *in)
 {
-	struct autofs_dev_ioctl tmp, *ads;
+	struct autofs_dev_ioctl tmp;
 
 	if (copy_from_user(&tmp, in, sizeof(tmp)))
 		return ERR_PTR(-EFAULT);
@@ -102,16 +103,7 @@ static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *i
 	if (tmp.size < sizeof(tmp))
 		return ERR_PTR(-EINVAL);
 
-	ads = kmalloc(tmp.size, GFP_KERNEL);
-	if (!ads)
-		return ERR_PTR(-ENOMEM);
-
-	if (copy_from_user(ads, in, tmp.size)) {
-		kfree(ads);
-		return ERR_PTR(-EFAULT);
-	}
-
-	return ads;
+	return memdup_user(in, tmp.size);
 }
 
 static inline void free_dev_ioctl(struct autofs_dev_ioctl *param)
@@ -544,10 +536,9 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
 			goto out;
 		devid = new_encode_dev(path.mnt->mnt_sb->s_dev);
 		err = 0;
-		if (path.dentry->d_inode &&
-		    path.mnt->mnt_root == path.dentry) {
+		if (path.mnt->mnt_root == path.dentry) {
 			err = 1;
-			magic = path.dentry->d_inode->i_sb->s_magic;
+			magic = path.mnt->mnt_sb->s_magic;
 		}
 	} else {
 		dev_t dev = sbi->sb->s_dev;
@@ -560,10 +551,8 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
 
 		err = have_submounts(path.dentry);
 
-		if (path.mnt->mnt_mountpoint != path.mnt->mnt_root) {
-			if (follow_down(&path))
-				magic = path.mnt->mnt_sb->s_magic;
-		}
+		if (follow_down(&path))
+			magic = path.mnt->mnt_sb->s_magic;
 	}
 
 	param->ismountpoint.out.devid = devid;
@@ -738,11 +727,14 @@ static const struct file_operations _dev_ioctl_fops = {
 };
 
 static struct miscdevice _autofs_dev_ioctl_misc = {
-	.minor 		= MISC_DYNAMIC_MINOR,
+	.minor		= AUTOFS_MINOR,
 	.name  		= AUTOFS_DEVICE_NAME,
 	.fops  		= &_dev_ioctl_fops
 };
 
+MODULE_ALIAS_MISCDEV(AUTOFS_MINOR);
+MODULE_ALIAS("devname:autofs");
+
 /* Register/deregister misc character device */
 int autofs_dev_ioctl_init(void)
 {
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 74bc9aa6df31..a796c9417fb1 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -279,7 +279,6 @@ struct dentry *autofs4_expire_direct(struct super_block *sb,
 			root->d_mounted--;
 		}
 		ino->flags |= AUTOFS_INF_EXPIRING;
-		autofs4_add_expiring(root);
 		init_completion(&ino->expire_complete);
 		spin_unlock(&sbi->fs_lock);
 		return root;
@@ -407,7 +406,6 @@ found:
 		expired, (int)expired->d_name.len, expired->d_name.name);
 	ino = autofs4_dentry_ino(expired);
 	ino->flags |= AUTOFS_INF_EXPIRING;
-	autofs4_add_expiring(expired);
 	init_completion(&ino->expire_complete);
 	spin_unlock(&sbi->fs_lock);
 	spin_lock(&dcache_lock);
@@ -435,7 +433,7 @@ int autofs4_expire_wait(struct dentry *dentry)
 
 		DPRINTK("expire done status=%d", status);
 
-		if (d_unhashed(dentry) && IS_DEADDIR(dentry->d_inode))
+		if (d_unhashed(dentry))
 			return -EAGAIN;
 
 		return status;
@@ -475,7 +473,6 @@ int autofs4_expire_run(struct super_block *sb,
 	spin_lock(&sbi->fs_lock);
 	ino = autofs4_dentry_ino(dentry);
 	ino->flags &= ~AUTOFS_INF_EXPIRING;
-	autofs4_del_expiring(dentry);
 	complete_all(&ino->expire_complete);
 	spin_unlock(&sbi->fs_lock);
 
@@ -506,7 +503,6 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
 			ino->flags &= ~AUTOFS_INF_MOUNTPOINT;
 		}
 		ino->flags &= ~AUTOFS_INF_EXPIRING;
-		autofs4_del_expiring(dentry);
 		complete_all(&ino->expire_complete);
 		spin_unlock(&sbi->fs_lock);
 		dput(dentry);
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index d0a3de247458..821b2b955dac 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -49,7 +49,6 @@ struct autofs_info *autofs4_init_ino(struct autofs_info *ino,
 		ino->dentry = NULL;
 		ino->size = 0;
 		INIT_LIST_HEAD(&ino->active);
-		INIT_LIST_HEAD(&ino->rehash_list);
 		ino->active_count = 0;
 		INIT_LIST_HEAD(&ino->expiring);
 		atomic_set(&ino->count, 0);
@@ -97,63 +96,6 @@ void autofs4_free_ino(struct autofs_info *ino)
 	kfree(ino);
 }
 
-/*
- * Deal with the infamous "Busy inodes after umount ..." message.
- *
- * Clean up the dentry tree. This happens with autofs if the user
- * space program goes away due to a SIGKILL, SIGSEGV etc.
- */
-static void autofs4_force_release(struct autofs_sb_info *sbi)
-{
-	struct dentry *this_parent = sbi->sb->s_root;
-	struct list_head *next;
-
-	if (!sbi->sb->s_root)
-		return;
-
-	spin_lock(&dcache_lock);
-repeat:
-	next = this_parent->d_subdirs.next;
-resume:
-	while (next != &this_parent->d_subdirs) {
-		struct dentry *dentry = list_entry(next, struct dentry, d_u.d_child);
-
-		/* Negative dentry - don`t care */
-		if (!simple_positive(dentry)) {
-			next = next->next;
-			continue;
-		}
-
-		if (!list_empty(&dentry->d_subdirs)) {
-			this_parent = dentry;
-			goto repeat;
-		}
-
-		next = next->next;
-		spin_unlock(&dcache_lock);
-
-		DPRINTK("dentry %p %.*s",
-			dentry, (int)dentry->d_name.len, dentry->d_name.name);
-
-		dput(dentry);
-		spin_lock(&dcache_lock);
-	}
-
-	if (this_parent != sbi->sb->s_root) {
-		struct dentry *dentry = this_parent;
-
-		next = this_parent->d_u.d_child.next;
-		this_parent = this_parent->d_parent;
-		spin_unlock(&dcache_lock);
-		DPRINTK("parent dentry %p %.*s",
-			dentry, (int)dentry->d_name.len, dentry->d_name.name);
-		dput(dentry);
-		spin_lock(&dcache_lock);
-		goto resume;
-	}
-	spin_unlock(&dcache_lock);
-}
-
 void autofs4_kill_sb(struct super_block *sb)
 {
 	struct autofs_sb_info *sbi = autofs4_sbi(sb);
@@ -170,15 +112,12 @@ void autofs4_kill_sb(struct super_block *sb)
 	/* Free wait queues, close pipe */
 	autofs4_catatonic_mode(sbi);
 
-	/* Clean up and release dangling references */
-	autofs4_force_release(sbi);
-
 	sb->s_fs_info = NULL;
 	kfree(sbi);
 
 out_kill_sb:
 	DPRINTK("shutting down");
-	kill_anon_super(sb);
+	kill_litter_super(sb);
 }
 
 static int autofs4_show_options(struct seq_file *m, struct vfsmount *mnt)
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 30cc9ddf4b70..db4117ed7803 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -15,15 +15,17 @@
 #include <linux/capability.h>
 #include <linux/errno.h>
 #include <linux/stat.h>
+#include <linux/slab.h>
 #include <linux/param.h>
 #include <linux/time.h>
+#include <linux/smp_lock.h>
 #include "autofs_i.h"
 
 static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *);
 static int autofs4_dir_unlink(struct inode *,struct dentry *);
 static int autofs4_dir_rmdir(struct inode *,struct dentry *);
 static int autofs4_dir_mkdir(struct inode *,struct dentry *,int);
-static int autofs4_root_ioctl(struct inode *, struct file *,unsigned int,unsigned long);
+static long autofs4_root_ioctl(struct file *,unsigned int,unsigned long);
 static int autofs4_dir_open(struct inode *inode, struct file *file);
 static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *);
 static void *autofs4_follow_link(struct dentry *, struct nameidata *);
@@ -37,7 +39,7 @@ const struct file_operations autofs4_root_operations = {
 	.read		= generic_read_dir,
 	.readdir	= dcache_readdir,
 	.llseek		= dcache_dir_lseek,
-	.ioctl		= autofs4_root_ioctl,
+	.unlocked_ioctl	= autofs4_root_ioctl,
 };
 
 const struct file_operations autofs4_dir_operations = {
@@ -104,99 +106,6 @@ static void autofs4_del_active(struct dentry *dentry)
 	return;
 }
 
-static void autofs4_add_rehash_entry(struct autofs_info *ino,
-				     struct rehash_entry *entry)
-{
-	entry->task = current;
-	INIT_LIST_HEAD(&entry->list);
-	list_add(&entry->list, &ino->rehash_list);
-	return;
-}
-
-static void autofs4_remove_rehash_entry(struct autofs_info *ino)
-{
-	struct list_head *head = &ino->rehash_list;
-	struct rehash_entry *entry;
-	list_for_each_entry(entry, head, list) {
-		if (entry->task == current) {
-			list_del(&entry->list);
-			kfree(entry);
-			break;
-		}
-	}
-	return;
-}
-
-static void autofs4_remove_rehash_entrys(struct autofs_info *ino)
-{
-	struct autofs_sb_info *sbi = ino->sbi;
-	struct rehash_entry *entry, *next;
-	struct list_head *head;
-
-	spin_lock(&sbi->fs_lock);
-	spin_lock(&sbi->lookup_lock);
-	if (!(ino->flags & AUTOFS_INF_REHASH)) {
-		spin_unlock(&sbi->lookup_lock);
-		spin_unlock(&sbi->fs_lock);
-		return;
-	}
-	ino->flags &= ~AUTOFS_INF_REHASH;
-	head = &ino->rehash_list;
-	list_for_each_entry_safe(entry, next, head, list) {
-		list_del(&entry->list);
-		kfree(entry);
-	}
-	spin_unlock(&sbi->lookup_lock);
-	spin_unlock(&sbi->fs_lock);
-	dput(ino->dentry);
-
-	return;
-}
-
-static void autofs4_revalidate_drop(struct dentry *dentry,
-				    struct rehash_entry *entry)
-{
-	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
-	struct autofs_info *ino = autofs4_dentry_ino(dentry);
-	/*
-	 * Add to the active list so we can pick this up in
-	 * ->lookup(). Also add an entry to a rehash list so
-	 * we know when there are no dentrys in flight so we
-	 * know when we can rehash the dentry.
-	 */
-	spin_lock(&sbi->lookup_lock);
-	if (list_empty(&ino->active))
-		list_add(&ino->active, &sbi->active_list);
-	autofs4_add_rehash_entry(ino, entry);
-	spin_unlock(&sbi->lookup_lock);
-	if (!(ino->flags & AUTOFS_INF_REHASH)) {
-		ino->flags |= AUTOFS_INF_REHASH;
-		dget(dentry);
-		spin_lock(&dentry->d_lock);
-		__d_drop(dentry);
-		spin_unlock(&dentry->d_lock);
-	}
-	return;
-}
-
-static void autofs4_revalidate_rehash(struct dentry *dentry)
-{
-	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
-	struct autofs_info *ino = autofs4_dentry_ino(dentry);
-	if (ino->flags & AUTOFS_INF_REHASH) {
-		spin_lock(&sbi->lookup_lock);
-		autofs4_remove_rehash_entry(ino);
-		if (list_empty(&ino->rehash_list)) {
-			spin_unlock(&sbi->lookup_lock);
-			ino->flags &= ~AUTOFS_INF_REHASH;
-			d_rehash(dentry);
-			dput(ino->dentry);
-		} else
-			spin_unlock(&sbi->lookup_lock);
-	}
-	return;
-}
-
 static unsigned int autofs4_need_mount(unsigned int flags)
 {
 	unsigned int res = 0;
@@ -236,7 +145,7 @@ out:
 	return dcache_dir_open(inode, file);
 }
 
-static int try_to_fill_dentry(struct dentry *dentry)
+static int try_to_fill_dentry(struct dentry *dentry, int flags)
 {
 	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
 	struct autofs_info *ino = autofs4_dentry_ino(dentry);
@@ -249,17 +158,54 @@ static int try_to_fill_dentry(struct dentry *dentry)
 	 * Wait for a pending mount, triggering one if there
 	 * isn't one already
 	 */
-	DPRINTK("waiting for mount name=%.*s",
-		 dentry->d_name.len, dentry->d_name.name);
+	if (dentry->d_inode == NULL) {
+		DPRINTK("waiting for mount name=%.*s",
+			 dentry->d_name.len, dentry->d_name.name);
+
+		status = autofs4_wait(sbi, dentry, NFY_MOUNT);
 
-	status = autofs4_wait(sbi, dentry, NFY_MOUNT);
+		DPRINTK("mount done status=%d", status);
 
-	DPRINTK("mount done status=%d", status);
+		/* Turn this into a real negative dentry? */
+		if (status == -ENOENT) {
+			spin_lock(&sbi->fs_lock);
+			ino->flags &= ~AUTOFS_INF_PENDING;
+			spin_unlock(&sbi->fs_lock);
+			return status;
+		} else if (status) {
+			/* Return a negative dentry, but leave it "pending" */
+			return status;
+		}
+	/* Trigger mount for path component or follow link */
+	} else if (ino->flags & AUTOFS_INF_PENDING ||
+			autofs4_need_mount(flags)) {
+		DPRINTK("waiting for mount name=%.*s",
+			dentry->d_name.len, dentry->d_name.name);
 
-	/* Update expiry counter */
-	ino->last_used = jiffies;
+		spin_lock(&sbi->fs_lock);
+		ino->flags |= AUTOFS_INF_PENDING;
+		spin_unlock(&sbi->fs_lock);
+		status = autofs4_wait(sbi, dentry, NFY_MOUNT);
 
-	return status;
+		DPRINTK("mount done status=%d", status);
+
+		if (status) {
+			spin_lock(&sbi->fs_lock);
+			ino->flags &= ~AUTOFS_INF_PENDING;
+			spin_unlock(&sbi->fs_lock);
+			return status;
+		}
+	}
+
+	/* Initialize expiry counter after successful mount */
+	if (ino)
+		ino->last_used = jiffies;
+
+	spin_lock(&sbi->fs_lock);
+	ino->flags &= ~AUTOFS_INF_PENDING;
+	spin_unlock(&sbi->fs_lock);
+
+	return 0;
 }
 
 /* For autofs direct mounts the follow link triggers the mount */
@@ -313,16 +259,10 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
 	 */
 	if (ino->flags & AUTOFS_INF_PENDING ||
 	    (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs))) {
-		ino->flags |= AUTOFS_INF_PENDING;
 		spin_unlock(&dcache_lock);
 		spin_unlock(&sbi->fs_lock);
 
-		status = try_to_fill_dentry(dentry);
-
-		spin_lock(&sbi->fs_lock);
-		ino->flags &= ~AUTOFS_INF_PENDING;
-		spin_unlock(&sbi->fs_lock);
-
+		status = try_to_fill_dentry(dentry, nd->flags);
 		if (status)
 			goto out_error;
 
@@ -361,47 +301,18 @@ static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
 	struct inode *dir = dentry->d_parent->d_inode;
 	struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb);
-	struct autofs_info *ino = autofs4_dentry_ino(dentry);
-	struct rehash_entry *entry;
+	int oz_mode = autofs4_oz_mode(sbi);
 	int flags = nd ? nd->flags : 0;
-	unsigned int mutex_aquired;
-
-	DPRINTK("name = %.*s oz_mode = %d",
-		dentry->d_name.len, dentry->d_name.name, oz_mode);
+	int status = 1;
 
-	/* Daemon never causes a mount to trigger */
-	if (autofs4_oz_mode(sbi))
-		return 1;
-
-	entry = kmalloc(sizeof(struct rehash_entry), GFP_KERNEL);
-	if (!entry)
-		return -ENOMEM;
-
-	mutex_aquired = mutex_trylock(&dir->i_mutex);
-
-	spin_lock(&sbi->fs_lock);
-	spin_lock(&dcache_lock);
 	/* Pending dentry */
+	spin_lock(&sbi->fs_lock);
 	if (autofs4_ispending(dentry)) {
-		int status;
-
-		/*
-		 * We can only unhash and send this to ->lookup() if
-		 * the directory mutex is held over d_revalidate() and
-		 * ->lookup(). This prevents the VFS from incorrectly
-		 * seeing the dentry as non-existent.
-		 */
-		ino->flags |= AUTOFS_INF_PENDING;
-		if (!mutex_aquired) {
-			autofs4_revalidate_drop(dentry, entry);
-			spin_unlock(&dcache_lock);
-			spin_unlock(&sbi->fs_lock);
-			return 0;
-		}
-		spin_unlock(&dcache_lock);
+		/* The daemon never causes a mount to trigger */
 		spin_unlock(&sbi->fs_lock);
-		mutex_unlock(&dir->i_mutex);
-		kfree(entry);
+
+		if (oz_mode)
+			return 1;
 
 		/*
 		 * If the directory has gone away due to an expire
@@ -415,82 +326,45 @@ static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd)
 		 * A zero status is success otherwise we have a
 		 * negative error code.
 		 */
-		status = try_to_fill_dentry(dentry);
-
-		spin_lock(&sbi->fs_lock);
-		ino->flags &= ~AUTOFS_INF_PENDING;
-		spin_unlock(&sbi->fs_lock);
-
+		status = try_to_fill_dentry(dentry, flags);
 		if (status == 0)
 			return 1;
 
 		return status;
 	}
+	spin_unlock(&sbi->fs_lock);
+
+	/* Negative dentry.. invalidate if "old" */
+	if (dentry->d_inode == NULL)
+		return 0;
 
 	/* Check for a non-mountpoint directory with no contents */
+	spin_lock(&dcache_lock);
 	if (S_ISDIR(dentry->d_inode->i_mode) &&
 	    !d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) {
 		DPRINTK("dentry=%p %.*s, emptydir",
 			 dentry, dentry->d_name.len, dentry->d_name.name);
+		spin_unlock(&dcache_lock);
 
-		if (autofs4_need_mount(flags) || current->link_count) {
-			int status;
-
-			/*
-			 * We can only unhash and send this to ->lookup() if
-			 * the directory mutex is held over d_revalidate() and
-			 * ->lookup(). This prevents the VFS from incorrectly
-			 * seeing the dentry as non-existent.
-			 */
-			ino->flags |= AUTOFS_INF_PENDING;
-			if (!mutex_aquired) {
-				autofs4_revalidate_drop(dentry, entry);
-				spin_unlock(&dcache_lock);
-				spin_unlock(&sbi->fs_lock);
-				return 0;
-			}
-			spin_unlock(&dcache_lock);
-			spin_unlock(&sbi->fs_lock);
-			mutex_unlock(&dir->i_mutex);
-			kfree(entry);
-
-			/*
-			 * A zero status is success otherwise we have a
-			 * negative error code.
-			 */
-			status = try_to_fill_dentry(dentry);
-
-			spin_lock(&sbi->fs_lock);
-			ino->flags &= ~AUTOFS_INF_PENDING;
-			spin_unlock(&sbi->fs_lock);
+		/* The daemon never causes a mount to trigger */
+		if (oz_mode)
+			return 1;
 
-			if (status == 0)
-				return 1;
+		/*
+		 * A zero status is success otherwise we have a
+		 * negative error code.
+		 */
+		status = try_to_fill_dentry(dentry, flags);
+		if (status == 0)
+			return 1;
 
-			return status;
-		}
+		return status;
 	}
 	spin_unlock(&dcache_lock);
-	spin_unlock(&sbi->fs_lock);
-
-	if (mutex_aquired)
-		mutex_unlock(&dir->i_mutex);
-
-	kfree(entry);
 
 	return 1;
 }
 
-static void autofs4_free_rehash_entrys(struct autofs_info *inf)
-{
-	struct list_head *head = &inf->rehash_list;
-	struct rehash_entry *entry, *next;
-	list_for_each_entry_safe(entry, next, head, list) {
-		list_del(&entry->list);
-		kfree(entry);
-	}
-}
-
 void autofs4_dentry_release(struct dentry *de)
 {
 	struct autofs_info *inf;
@@ -509,8 +383,6 @@ void autofs4_dentry_release(struct dentry *de)
 				list_del(&inf->active);
 			if (!list_empty(&inf->expiring))
 				list_del(&inf->expiring);
-			if (!list_empty(&inf->rehash_list))
-				autofs4_free_rehash_entrys(inf);
 			spin_unlock(&sbi->lookup_lock);
 		}
 
@@ -543,7 +415,6 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry)
 	const unsigned char *str = name->name;
 	struct list_head *p, *head;
 
-restart:
 	spin_lock(&dcache_lock);
 	spin_lock(&sbi->lookup_lock);
 	head = &sbi->active_list;
@@ -561,19 +432,6 @@ restart:
 		if (atomic_read(&active->d_count) == 0)
 			goto next;
 
-		if (active->d_inode && IS_DEADDIR(active->d_inode)) {
-			if (!list_empty(&ino->rehash_list)) {
-				dget(active);
-				spin_unlock(&active->d_lock);
-				spin_unlock(&sbi->lookup_lock);
-				spin_unlock(&dcache_lock);
-				autofs4_remove_rehash_entrys(ino);
-				dput(active);
-				goto restart;
-			}
-			goto next;
-		}
-
 		qstr = &active->d_name;
 
 		if (active->d_name.hash != hash)
@@ -586,11 +444,13 @@ restart:
 		if (memcmp(qstr->name, str, len))
 			goto next;
 
-		dget(active);
-		spin_unlock(&active->d_lock);
-		spin_unlock(&sbi->lookup_lock);
-		spin_unlock(&dcache_lock);
-		return active;
+		if (d_unhashed(active)) {
+			dget(active);
+			spin_unlock(&active->d_lock);
+			spin_unlock(&sbi->lookup_lock);
+			spin_unlock(&dcache_lock);
+			return active;
+		}
 next:
 		spin_unlock(&active->d_lock);
 	}
@@ -639,11 +499,13 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry)
 		if (memcmp(qstr->name, str, len))
 			goto next;
 
-		dget(expiring);
-		spin_unlock(&expiring->d_lock);
-		spin_unlock(&sbi->lookup_lock);
-		spin_unlock(&dcache_lock);
-		return expiring;
+		if (d_unhashed(expiring)) {
+			dget(expiring);
+			spin_unlock(&expiring->d_lock);
+			spin_unlock(&sbi->lookup_lock);
+			spin_unlock(&dcache_lock);
+			return expiring;
+		}
 next:
 		spin_unlock(&expiring->d_lock);
 	}
@@ -653,48 +515,6 @@ next:
 	return NULL;
 }
 
-static struct autofs_info *init_new_dentry(struct autofs_sb_info *sbi,
-					   struct dentry *dentry, int oz_mode)
-{
-	struct autofs_info *ino;
-
-	/*
-	 * Mark the dentry incomplete but don't hash it. We do this
-	 * to serialize our inode creation operations (symlink and
-	 * mkdir) which prevents deadlock during the callback to
-	 * the daemon. Subsequent user space lookups for the same
-	 * dentry are placed on the wait queue while the daemon
-	 * itself is allowed passage unresticted so the create
-	 * operation itself can then hash the dentry. Finally,
-	 * we check for the hashed dentry and return the newly
-	 * hashed dentry.
-	 */
-	dentry->d_op = &autofs4_root_dentry_operations;
-
-	/*
-	 * And we need to ensure that the same dentry is used for
-	 * all following lookup calls until it is hashed so that
-	 * the dentry flags are persistent throughout the request.
-	 */
-	ino = autofs4_init_ino(NULL, sbi, 0555);
-	if (!ino)
-		return ERR_PTR(-ENOMEM);
-
-	dentry->d_fsdata = ino;
-	ino->dentry = dentry;
-
-	/*
-	 * Only set the mount pending flag for new dentrys not created
-	 * by the daemon.
-	 */
-	if (!oz_mode)
-		ino->flags |= AUTOFS_INF_PENDING;
-
-	d_instantiate(dentry, NULL);
-
-	return ino;
-}
-
 /* Lookups in the root directory */
 static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
 {
@@ -702,7 +522,6 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
 	struct autofs_info *ino;
 	struct dentry *expiring, *active;
 	int oz_mode;
-	int status = 0;
 
 	DPRINTK("name = %.*s",
 		dentry->d_name.len, dentry->d_name.name);
@@ -717,26 +536,44 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
 	DPRINTK("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d",
 		 current->pid, task_pgrp_nr(current), sbi->catatonic, oz_mode);
 
-	spin_lock(&sbi->fs_lock);
 	active = autofs4_lookup_active(dentry);
 	if (active) {
 		dentry = active;
 		ino = autofs4_dentry_ino(dentry);
-		/* If this came from revalidate, rehash it */
-		autofs4_revalidate_rehash(dentry);
-		spin_unlock(&sbi->fs_lock);
 	} else {
-		spin_unlock(&sbi->fs_lock);
-		ino = init_new_dentry(sbi, dentry, oz_mode);
-		if (IS_ERR(ino))
-			return (struct dentry *) ino;
-	}
+		/*
+		 * Mark the dentry incomplete but don't hash it. We do this
+		 * to serialize our inode creation operations (symlink and
+		 * mkdir) which prevents deadlock during the callback to
+		 * the daemon. Subsequent user space lookups for the same
+		 * dentry are placed on the wait queue while the daemon
+		 * itself is allowed passage unresticted so the create
+		 * operation itself can then hash the dentry. Finally,
+		 * we check for the hashed dentry and return the newly
+		 * hashed dentry.
+		 */
+		dentry->d_op = &autofs4_root_dentry_operations;
 
-	autofs4_add_active(dentry);
+		/*
+		 * And we need to ensure that the same dentry is used for
+		 * all following lookup calls until it is hashed so that
+		 * the dentry flags are persistent throughout the request.
+		 */
+		ino = autofs4_init_ino(NULL, sbi, 0555);
+		if (!ino)
+			return ERR_PTR(-ENOMEM);
+
+		dentry->d_fsdata = ino;
+		ino->dentry = dentry;
+
+		autofs4_add_active(dentry);
+
+		d_instantiate(dentry, NULL);
+	}
 
 	if (!oz_mode) {
-		expiring = autofs4_lookup_expiring(dentry);
 		mutex_unlock(&dir->i_mutex);
+		expiring = autofs4_lookup_expiring(dentry);
 		if (expiring) {
 			/*
 			 * If we are racing with expire the request might not
@@ -744,22 +581,23 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
 			 * so it must have been successful, so just wait for it.
 			 */
 			autofs4_expire_wait(expiring);
+			autofs4_del_expiring(expiring);
 			dput(expiring);
 		}
-		status = try_to_fill_dentry(dentry);
-		mutex_lock(&dir->i_mutex);
+
 		spin_lock(&sbi->fs_lock);
-		ino->flags &= ~AUTOFS_INF_PENDING;
+		ino->flags |= AUTOFS_INF_PENDING;
 		spin_unlock(&sbi->fs_lock);
+		if (dentry->d_op && dentry->d_op->d_revalidate)
+			(dentry->d_op->d_revalidate)(dentry, nd);
+		mutex_lock(&dir->i_mutex);
 	}
 
-	autofs4_del_active(dentry);
-
 	/*
-	 * If we had a mount fail, check if we had to handle
+	 * If we are still pending, check if we had to handle
 	 * a signal. If so we can force a restart..
 	 */
-	if (status) {
+	if (ino->flags & AUTOFS_INF_PENDING) {
 		/* See if we were interrupted */
 		if (signal_pending(current)) {
 			sigset_t *sigset = &current->pending.signal;
@@ -771,46 +609,43 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
 			    return ERR_PTR(-ERESTARTNOINTR);
 			}
 		}
-	}
-
-	/*
-	 * User space can (and has done in the past) remove and re-create
-	 * this directory during the callback. This can leave us with an
-	 * unhashed dentry, but a successful mount!  So we need to
-	 * perform another cached lookup in case the dentry now exists.
-	 */
-	if (!oz_mode && !have_submounts(dentry)) {
-		struct dentry *new;
-		new = d_lookup(dentry->d_parent, &dentry->d_name);
-		if (new) {
-			if (active)
-				dput(active);
-			return new;
-		} else {
-			if (!status)
-				status = -ENOENT;
+		if (!oz_mode) {
+			spin_lock(&sbi->fs_lock);
+			ino->flags &= ~AUTOFS_INF_PENDING;
+			spin_unlock(&sbi->fs_lock);
 		}
 	}
 
 	/*
-	 * If we had a mount failure, return status to user space.
-	 * If the mount succeeded and we used a dentry from the active queue
-	 * return it.
+	 * If this dentry is unhashed, then we shouldn't honour this
+	 * lookup.  Returning ENOENT here doesn't do the right thing
+	 * for all system calls, but it should be OK for the operations
+	 * we permit from an autofs.
 	 */
-	if (status) {
-		dentry = ERR_PTR(status);
-		if (active)
-			dput(active);
-		return dentry;
-	} else {
+	if (!oz_mode && d_unhashed(dentry)) {
 		/*
-		 * Valid successful mount, return active dentry or NULL
-		 * for a new dentry.
+		 * A user space application can (and has done in the past)
+		 * remove and re-create this directory during the callback.
+		 * This can leave us with an unhashed dentry, but a
+		 * successful mount!  So we need to perform another
+		 * cached lookup in case the dentry now exists.
 		 */
+		struct dentry *parent = dentry->d_parent;
+		struct dentry *new = d_lookup(parent, &dentry->d_name);
+		if (new != NULL)
+			dentry = new;
+		else
+			dentry = ERR_PTR(-ENOENT);
+
 		if (active)
-			return active;
+			dput(active);
+
+		return dentry;
 	}
 
+	if (active)
+		return active;
+
 	return NULL;
 }
 
@@ -834,6 +669,8 @@ static int autofs4_dir_symlink(struct inode *dir,
 	if (!ino)
 		return -ENOMEM;
 
+	autofs4_del_active(dentry);
+
 	ino->size = strlen(symname);
 	cp = kmalloc(ino->size + 1, GFP_KERNEL);
 	if (!cp) {
@@ -910,6 +747,7 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
 	dir->i_mtime = CURRENT_TIME;
 
 	spin_lock(&dcache_lock);
+	autofs4_add_expiring(dentry);
 	spin_lock(&dentry->d_lock);
 	__d_drop(dentry);
 	spin_unlock(&dentry->d_lock);
@@ -935,6 +773,7 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
 		spin_unlock(&dcache_lock);
 		return -ENOTEMPTY;
 	}
+	autofs4_add_expiring(dentry);
 	spin_lock(&dentry->d_lock);
 	__d_drop(dentry);
 	spin_unlock(&dentry->d_lock);
@@ -972,6 +811,8 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	if (!ino)
 		return -ENOMEM;
 
+	autofs4_del_active(dentry);
+
 	inode = autofs4_get_inode(dir->i_sb, ino);
 	if (!inode) {
 		if (!dentry->d_fsdata)
@@ -1062,8 +903,8 @@ int is_autofs4_dentry(struct dentry *dentry)
  * ioctl()'s on the root directory is the chief method for the daemon to
  * generate kernel reactions
  */
-static int autofs4_root_ioctl(struct inode *inode, struct file *filp,
-			     unsigned int cmd, unsigned long arg)
+static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp,
+				       unsigned int cmd, unsigned long arg)
 {
 	struct autofs_sb_info *sbi = autofs4_sbi(inode->i_sb);
 	void __user *p = (void __user *)arg;
@@ -1107,3 +948,16 @@ static int autofs4_root_ioctl(struct inode *inode, struct file *filp,
 		return -ENOSYS;
 	}
 }
+
+static long autofs4_root_ioctl(struct file *filp,
+			       unsigned int cmd, unsigned long arg)
+{
+	long ret;
+	struct inode *inode = filp->f_dentry->d_inode;
+
+	lock_kernel();
+	ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
+	unlock_kernel();
+
+	return ret;
+}
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index a05287a23f62..52e59bf4aa5f 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -93,8 +93,7 @@ static int bad_file_release(struct inode *inode, struct file *filp)
 	return -EIO;
 }
 
-static int bad_file_fsync(struct file *file, struct dentry *dentry,
-			int datasync)
+static int bad_file_fsync(struct file *file, int datasync)
 {
 	return -EIO;
 }
diff --git a/fs/befs/datastream.c b/fs/befs/datastream.c
index e3287d0d1a58..59096b5e0fc7 100644
--- a/fs/befs/datastream.c
+++ b/fs/befs/datastream.c
@@ -11,7 +11,6 @@
  */
 
 #include <linux/kernel.h>
-#include <linux/slab.h>
 #include <linux/buffer_head.h>
 #include <linux/string.h>
 
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 33baf27fac78..34ddda888e63 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -873,6 +873,7 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
 	brelse(bh);
 
       unacquire_priv_sbp:
+	kfree(befs_sb->mount_opts.iocharset);
 	kfree(sb->s_fs_info);
 
       unacquire_none:
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index 1e41aadb1068..d967e052b779 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -78,7 +78,7 @@ static int bfs_readdir(struct file *f, void *dirent, filldir_t filldir)
 const struct file_operations bfs_dir_operations = {
 	.read		= generic_read_dir,
 	.readdir	= bfs_readdir,
-	.fsync		= simple_fsync,
+	.fsync		= generic_file_fsync,
 	.llseek		= generic_file_llseek,
 };
 
@@ -105,14 +105,12 @@ static int bfs_create(struct inode *dir, struct dentry *dentry, int mode,
 	}
 	set_bit(ino, info->si_imap);
 	info->si_freei--;
-	inode->i_uid = current_fsuid();
-	inode->i_gid = (dir->i_mode & S_ISGID) ? dir->i_gid : current_fsgid();
+	inode_init_owner(inode, dir, mode);
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
 	inode->i_blocks = 0;
 	inode->i_op = &bfs_file_inops;
 	inode->i_fop = &bfs_file_operations;
 	inode->i_mapping->a_ops = &bfs_aops;
-	inode->i_mode = mode;
 	inode->i_ino = ino;
 	BFS_I(inode)->i_dsk_ino = ino;
 	BFS_I(inode)->i_sblock = 0;
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 8f3d9fd89604..f22a7d3dc362 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -15,6 +15,7 @@
 #include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include <linux/vfs.h>
+#include <linux/writeback.h>
 #include <asm/uaccess.h>
 #include "bfs.h"
 
@@ -98,7 +99,7 @@ error:
 	return ERR_PTR(-EIO);
 }
 
-static int bfs_write_inode(struct inode *inode, int wait)
+static int bfs_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
 	struct bfs_sb_info *info = BFS_SB(inode->i_sb);
 	unsigned int ino = (u16)inode->i_ino;
@@ -147,7 +148,7 @@ static int bfs_write_inode(struct inode *inode, int wait)
 	di->i_eoffset = cpu_to_le32(i_sblock * BFS_BSIZE + inode->i_size - 1);
 
 	mark_buffer_dirty(bh);
-	if (wait) {
+	if (wbc->sync_mode == WB_SYNC_ALL) {
 		sync_dirty_buffer(bh);
 		if (buffer_req(bh) && !buffer_uptodate(bh))
 			err = -EIO;
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index fdd397099172..f96eff04e11a 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -20,10 +20,11 @@
 #include <linux/fcntl.h>
 #include <linux/ptrace.h>
 #include <linux/user.h>
-#include <linux/slab.h>
 #include <linux/binfmts.h>
 #include <linux/personality.h>
 #include <linux/init.h>
+#include <linux/coredump.h>
+#include <linux/slab.h>
 
 #include <asm/system.h>
 #include <asm/uaccess.h>
@@ -60,26 +61,6 @@ static int set_brk(unsigned long start, unsigned long end)
 }
 
 /*
- * These are the only things you should do on a core-file: use only these
- * macros to write out all the necessary info.
- */
-
-static int dump_write(struct file *file, const void *addr, int nr)
-{
-	return file->f_op->write(file, addr, nr, &file->f_pos) == nr;
-}
-
-#define DUMP_WRITE(addr, nr)	\
-	if (!dump_write(file, (void *)(addr), (nr))) \
-		goto end_coredump;
-
-#define DUMP_SEEK(offset) \
-if (file->f_op->llseek) { \
-	if (file->f_op->llseek(file,(offset),0) != (offset)) \
- 		goto end_coredump; \
-} else file->f_pos = (offset)
-
-/*
  * Routine writes a core dump image in the current directory.
  * Currently only a stub-function.
  *
@@ -94,14 +75,16 @@ static int aout_core_dump(struct coredump_params *cprm)
 	struct file *file = cprm->file;
 	mm_segment_t fs;
 	int has_dumped = 0;
-	unsigned long dump_start, dump_size;
+	void __user *dump_start;
+	int dump_size;
 	struct user dump;
 #ifdef __alpha__
-#       define START_DATA(u)	(u.start_data)
+#       define START_DATA(u)	((void __user *)u.start_data)
 #else
-#	define START_DATA(u)	((u.u_tsize << PAGE_SHIFT) + u.start_code)
+#	define START_DATA(u)	((void __user *)((u.u_tsize << PAGE_SHIFT) + \
+				 u.start_code))
 #endif
-#       define START_STACK(u)   (u.start_stack)
+#       define START_STACK(u)   ((void __user *)u.start_stack)
 
 	fs = get_fs();
 	set_fs(KERNEL_DS);
@@ -123,33 +106,38 @@ static int aout_core_dump(struct coredump_params *cprm)
 
 /* make sure we actually have a data and stack area to dump */
 	set_fs(USER_DS);
-	if (!access_ok(VERIFY_READ, (void __user *)START_DATA(dump), dump.u_dsize << PAGE_SHIFT))
+	if (!access_ok(VERIFY_READ, START_DATA(dump), dump.u_dsize << PAGE_SHIFT))
 		dump.u_dsize = 0;
-	if (!access_ok(VERIFY_READ, (void __user *)START_STACK(dump), dump.u_ssize << PAGE_SHIFT))
+	if (!access_ok(VERIFY_READ, START_STACK(dump), dump.u_ssize << PAGE_SHIFT))
 		dump.u_ssize = 0;
 
 	set_fs(KERNEL_DS);
 /* struct user */
-	DUMP_WRITE(&dump,sizeof(dump));
+	if (!dump_write(file, &dump, sizeof(dump)))
+		goto end_coredump;
 /* Now dump all of the user data.  Include malloced stuff as well */
-	DUMP_SEEK(PAGE_SIZE);
+	if (!dump_seek(cprm->file, PAGE_SIZE - sizeof(dump)))
+		goto end_coredump;
 /* now we start writing out the user space info */
 	set_fs(USER_DS);
 /* Dump the data area */
 	if (dump.u_dsize != 0) {
 		dump_start = START_DATA(dump);
 		dump_size = dump.u_dsize << PAGE_SHIFT;
-		DUMP_WRITE(dump_start,dump_size);
+		if (!dump_write(file, dump_start, dump_size))
+			goto end_coredump;
 	}
 /* Now prepare to dump the stack area */
 	if (dump.u_ssize != 0) {
 		dump_start = START_STACK(dump);
 		dump_size = dump.u_ssize << PAGE_SHIFT;
-		DUMP_WRITE(dump_start,dump_size);
+		if (!dump_write(file, dump_start, dump_size))
+			goto end_coredump;
 	}
 /* Finally dump the task struct.  Not be used by gdb, but could be useful */
 	set_fs(KERNEL_DS);
-	DUMP_WRITE(current,sizeof(*current));
+	if (!dump_write(file, current, sizeof(*current)))
+		goto end_coredump;
 end_coredump:
 	set_fs(fs);
 	return has_dumped;
@@ -247,7 +235,7 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	 * size limits imposed on them by creating programs with large
 	 * arrays in the data or bss.
 	 */
-	rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur;
+	rlim = rlimit(RLIMIT_DATA);
 	if (rlim >= RLIM_INFINITY)
 		rlim = ~0;
 	if (ex.a_data + ex.a_bss > rlim)
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index fd5b2ea5d299..535e763ab1a6 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -31,6 +31,7 @@
 #include <linux/random.h>
 #include <linux/elf.h>
 #include <linux/utsname.h>
+#include <linux/coredump.h>
 #include <asm/uaccess.h>
 #include <asm/param.h>
 #include <asm/page.h>
@@ -1085,36 +1086,6 @@ out:
  * Modelled on fs/exec.c:aout_core_dump()
  * Jeremy Fitzhardinge <jeremy@sw.oz.au>
  */
-/*
- * These are the only things you should do on a core-file: use only these
- * functions to write out all the necessary info.
- */
-static int dump_write(struct file *file, const void *addr, int nr)
-{
-	return file->f_op->write(file, addr, nr, &file->f_pos) == nr;
-}
-
-static int dump_seek(struct file *file, loff_t off)
-{
-	if (file->f_op->llseek && file->f_op->llseek != no_llseek) {
-		if (file->f_op->llseek(file, off, SEEK_CUR) < 0)
-			return 0;
-	} else {
-		char *buf = (char *)get_zeroed_page(GFP_KERNEL);
-		if (!buf)
-			return 0;
-		while (off > 0) {
-			unsigned long n = off;
-			if (n > PAGE_SIZE)
-				n = PAGE_SIZE;
-			if (!dump_write(file, buf, n))
-				return 0;
-			off -= n;
-		}
-		free_page((unsigned long)buf);
-	}
-	return 1;
-}
 
 /*
  * Decide what to dump of a segment, part, all or none.
@@ -1249,11 +1220,6 @@ static int writenote(struct memelfnote *men, struct file *file,
 }
 #undef DUMP_WRITE
 
-#define DUMP_WRITE(addr, nr)				\
-	if ((size += (nr)) > cprm->limit ||		\
-	    !dump_write(cprm->file, (addr), (nr)))	\
-		goto end_coredump;
-
 static void fill_elf_header(struct elfhdr *elf, int segs,
 			    u16 machine, u32 flags, u8 osabi)
 {
@@ -1872,6 +1838,34 @@ static struct vm_area_struct *next_vma(struct vm_area_struct *this_vma,
 	return gate_vma;
 }
 
+static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum,
+			     elf_addr_t e_shoff, int segs)
+{
+	elf->e_shoff = e_shoff;
+	elf->e_shentsize = sizeof(*shdr4extnum);
+	elf->e_shnum = 1;
+	elf->e_shstrndx = SHN_UNDEF;
+
+	memset(shdr4extnum, 0, sizeof(*shdr4extnum));
+
+	shdr4extnum->sh_type = SHT_NULL;
+	shdr4extnum->sh_size = elf->e_shnum;
+	shdr4extnum->sh_link = elf->e_shstrndx;
+	shdr4extnum->sh_info = segs;
+}
+
+static size_t elf_core_vma_data_size(struct vm_area_struct *gate_vma,
+				     unsigned long mm_flags)
+{
+	struct vm_area_struct *vma;
+	size_t size = 0;
+
+	for (vma = first_vma(current, gate_vma); vma != NULL;
+	     vma = next_vma(vma, gate_vma))
+		size += vma_dump_size(vma, mm_flags);
+	return size;
+}
+
 /*
  * Actual dumper
  *
@@ -1888,8 +1882,11 @@ static int elf_core_dump(struct coredump_params *cprm)
 	struct vm_area_struct *vma, *gate_vma;
 	struct elfhdr *elf = NULL;
 	loff_t offset = 0, dataoff, foffset;
-	unsigned long mm_flags;
 	struct elf_note_info info;
+	struct elf_phdr *phdr4note = NULL;
+	struct elf_shdr *shdr4extnum = NULL;
+	Elf_Half e_phnum;
+	elf_addr_t e_shoff;
 
 	/*
 	 * We no longer stop all VM operations.
@@ -1912,20 +1909,25 @@ static int elf_core_dump(struct coredump_params *cprm)
 	 * Please check DEFAULT_MAX_MAP_COUNT definition when you modify here.
 	 */
 	segs = current->mm->map_count;
-#ifdef ELF_CORE_EXTRA_PHDRS
-	segs += ELF_CORE_EXTRA_PHDRS;
-#endif
+	segs += elf_core_extra_phdrs();
 
 	gate_vma = get_gate_vma(current);
 	if (gate_vma != NULL)
 		segs++;
 
+	/* for notes section */
+	segs++;
+
+	/* If segs > PN_XNUM(0xffff), then e_phnum overflows. To avoid
+	 * this, kernel supports extended numbering. Have a look at
+	 * include/linux/elf.h for further information. */
+	e_phnum = segs > PN_XNUM ? PN_XNUM : segs;
+
 	/*
 	 * Collect all the non-memory information about the process for the
 	 * notes.  This also sets up the file header.
 	 */
-	if (!fill_note_info(elf, segs + 1, /* including notes section */
-			    &info, cprm->signr, cprm->regs))
+	if (!fill_note_info(elf, e_phnum, &info, cprm->signr, cprm->regs))
 		goto cleanup;
 
 	has_dumped = 1;
@@ -1934,31 +1936,47 @@ static int elf_core_dump(struct coredump_params *cprm)
 	fs = get_fs();
 	set_fs(KERNEL_DS);
 
-	DUMP_WRITE(elf, sizeof(*elf));
 	offset += sizeof(*elf);				/* Elf header */
-	offset += (segs + 1) * sizeof(struct elf_phdr); /* Program headers */
+	offset += segs * sizeof(struct elf_phdr);	/* Program headers */
 	foffset = offset;
 
 	/* Write notes phdr entry */
 	{
-		struct elf_phdr phdr;
 		size_t sz = get_note_info_size(&info);
 
 		sz += elf_coredump_extra_notes_size();
 
-		fill_elf_note_phdr(&phdr, sz, offset);
+		phdr4note = kmalloc(sizeof(*phdr4note), GFP_KERNEL);
+		if (!phdr4note)
+			goto end_coredump;
+
+		fill_elf_note_phdr(phdr4note, sz, offset);
 		offset += sz;
-		DUMP_WRITE(&phdr, sizeof(phdr));
 	}
 
 	dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE);
 
-	/*
-	 * We must use the same mm->flags while dumping core to avoid
-	 * inconsistency between the program headers and bodies, otherwise an
-	 * unusable core file can be generated.
-	 */
-	mm_flags = current->mm->flags;
+	offset += elf_core_vma_data_size(gate_vma, cprm->mm_flags);
+	offset += elf_core_extra_data_size();
+	e_shoff = offset;
+
+	if (e_phnum == PN_XNUM) {
+		shdr4extnum = kmalloc(sizeof(*shdr4extnum), GFP_KERNEL);
+		if (!shdr4extnum)
+			goto end_coredump;
+		fill_extnum_info(elf, shdr4extnum, e_shoff, segs);
+	}
+
+	offset = dataoff;
+
+	size += sizeof(*elf);
+	if (size > cprm->limit || !dump_write(cprm->file, elf, sizeof(*elf)))
+		goto end_coredump;
+
+	size += sizeof(*phdr4note);
+	if (size > cprm->limit
+	    || !dump_write(cprm->file, phdr4note, sizeof(*phdr4note)))
+		goto end_coredump;
 
 	/* Write program headers for segments dump */
 	for (vma = first_vma(current, gate_vma); vma != NULL;
@@ -1969,7 +1987,7 @@ static int elf_core_dump(struct coredump_params *cprm)
 		phdr.p_offset = offset;
 		phdr.p_vaddr = vma->vm_start;
 		phdr.p_paddr = 0;
-		phdr.p_filesz = vma_dump_size(vma, mm_flags);
+		phdr.p_filesz = vma_dump_size(vma, cprm->mm_flags);
 		phdr.p_memsz = vma->vm_end - vma->vm_start;
 		offset += phdr.p_filesz;
 		phdr.p_flags = vma->vm_flags & VM_READ ? PF_R : 0;
@@ -1979,12 +1997,14 @@ static int elf_core_dump(struct coredump_params *cprm)
 			phdr.p_flags |= PF_X;
 		phdr.p_align = ELF_EXEC_PAGESIZE;
 
-		DUMP_WRITE(&phdr, sizeof(phdr));
+		size += sizeof(phdr);
+		if (size > cprm->limit
+		    || !dump_write(cprm->file, &phdr, sizeof(phdr)))
+			goto end_coredump;
 	}
 
-#ifdef ELF_CORE_WRITE_EXTRA_PHDRS
-	ELF_CORE_WRITE_EXTRA_PHDRS;
-#endif
+	if (!elf_core_write_extra_phdrs(cprm->file, offset, &size, cprm->limit))
+		goto end_coredump;
 
  	/* write out the notes section */
 	if (!write_note_info(&info, cprm->file, &foffset))
@@ -2002,7 +2022,7 @@ static int elf_core_dump(struct coredump_params *cprm)
 		unsigned long addr;
 		unsigned long end;
 
-		end = vma->vm_start + vma_dump_size(vma, mm_flags);
+		end = vma->vm_start + vma_dump_size(vma, cprm->mm_flags);
 
 		for (addr = vma->vm_start; addr < end; addr += PAGE_SIZE) {
 			struct page *page;
@@ -2023,15 +2043,24 @@ static int elf_core_dump(struct coredump_params *cprm)
 		}
 	}
 
-#ifdef ELF_CORE_WRITE_EXTRA_DATA
-	ELF_CORE_WRITE_EXTRA_DATA;
-#endif
+	if (!elf_core_write_extra_data(cprm->file, &size, cprm->limit))
+		goto end_coredump;
+
+	if (e_phnum == PN_XNUM) {
+		size += sizeof(*shdr4extnum);
+		if (size > cprm->limit
+		    || !dump_write(cprm->file, shdr4extnum,
+				   sizeof(*shdr4extnum)))
+			goto end_coredump;
+	}
 
 end_coredump:
 	set_fs(fs);
 
 cleanup:
 	free_note_info(&info);
+	kfree(shdr4extnum);
+	kfree(phdr4note);
 	kfree(elf);
 out:
 	return has_dumped;
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 18d77297ccc8..63039ed9576f 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -34,6 +34,7 @@
 #include <linux/elf.h>
 #include <linux/elf-fdpic.h>
 #include <linux/elfcore.h>
+#include <linux/coredump.h>
 
 #include <asm/uaccess.h>
 #include <asm/param.h>
@@ -989,10 +990,9 @@ static int elf_fdpic_map_file_constdisp_on_uclinux(
 
 		/* clear any space allocated but not loaded */
 		if (phdr->p_filesz < phdr->p_memsz) {
-			ret = clear_user((void *) (seg->addr + phdr->p_filesz),
-					 phdr->p_memsz - phdr->p_filesz);
-			if (ret)
-				return ret;
+			if (clear_user((void *) (seg->addr + phdr->p_filesz),
+				       phdr->p_memsz - phdr->p_filesz))
+				return -EFAULT;
 		}
 
 		if (mm) {
@@ -1004,15 +1004,8 @@ static int elf_fdpic_map_file_constdisp_on_uclinux(
 				}
 			} else if (!mm->start_data) {
 				mm->start_data = seg->addr;
-#ifndef CONFIG_MMU
 				mm->end_data = seg->addr + phdr->p_memsz;
-#endif
 			}
-
-#ifdef CONFIG_MMU
-			if (seg->addr + phdr->p_memsz > mm->end_data)
-				mm->end_data = seg->addr + phdr->p_memsz;
-#endif
 		}
 
 		seg++;
@@ -1033,7 +1026,7 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
 	struct elf32_fdpic_loadseg *seg;
 	struct elf32_phdr *phdr;
 	unsigned long load_addr, delta_vaddr;
-	int loop, dvset, ret;
+	int loop, dvset;
 
 	load_addr = params->load_addr;
 	delta_vaddr = 0;
@@ -1133,9 +1126,8 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
 		 * PT_LOAD */
 		if (prot & PROT_WRITE && disp > 0) {
 			kdebug("clear[%d] ad=%lx sz=%lx", loop, maddr, disp);
-			ret = clear_user((void __user *) maddr, disp);
-			if (ret)
-				return ret;
+			if (clear_user((void __user *) maddr, disp))
+				return -EFAULT;
 			maddr += disp;
 		}
 
@@ -1170,19 +1162,17 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
 		if (prot & PROT_WRITE && excess1 > 0) {
 			kdebug("clear[%d] ad=%lx sz=%lx",
 			       loop, maddr + phdr->p_filesz, excess1);
-			ret = clear_user((void __user *) maddr + phdr->p_filesz,
-					 excess1);
-			if (ret)
-				return ret;
+			if (clear_user((void __user *) maddr + phdr->p_filesz,
+				       excess1))
+				return -EFAULT;
 		}
 
 #else
 		if (excess > 0) {
 			kdebug("clear[%d] ad=%lx sz=%lx",
 			       loop, maddr + phdr->p_filesz, excess);
-			ret = clear_user((void *) maddr + phdr->p_filesz, excess);
-			if (ret)
-				return ret;
+			if (clear_user((void *) maddr + phdr->p_filesz, excess))
+				return -EFAULT;
 		}
 #endif
 
@@ -1216,26 +1206,6 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
 #ifdef CONFIG_ELF_CORE
 
 /*
- * These are the only things you should do on a core-file: use only these
- * functions to write out all the necessary info.
- */
-static int dump_write(struct file *file, const void *addr, int nr)
-{
-	return file->f_op->write(file, addr, nr, &file->f_pos) == nr;
-}
-
-static int dump_seek(struct file *file, loff_t off)
-{
-	if (file->f_op->llseek) {
-		if (file->f_op->llseek(file, off, SEEK_SET) != off)
-			return 0;
-	} else {
-		file->f_pos = off;
-	}
-	return 1;
-}
-
-/*
  * Decide whether a segment is worth dumping; default is yes to be
  * sure (missing info is worse than too much; etc).
  * Personally I'd include everything, and use the coredump limit...
@@ -1313,35 +1283,35 @@ static int notesize(struct memelfnote *en)
 
 /* #define DEBUG */
 
-#define DUMP_WRITE(addr, nr)	\
-	do { if (!dump_write(file, (addr), (nr))) return 0; } while(0)
-#define DUMP_SEEK(off)	\
-	do { if (!dump_seek(file, (off))) return 0; } while(0)
+#define DUMP_WRITE(addr, nr, foffset)	\
+	do { if (!dump_write(file, (addr), (nr))) return 0; *foffset += (nr); } while(0)
 
-static int writenote(struct memelfnote *men, struct file *file)
+static int alignfile(struct file *file, loff_t *foffset)
 {
-	struct elf_note en;
+	static const char buf[4] = { 0, };
+	DUMP_WRITE(buf, roundup(*foffset, 4) - *foffset, foffset);
+	return 1;
+}
 
+static int writenote(struct memelfnote *men, struct file *file,
+			loff_t *foffset)
+{
+	struct elf_note en;
 	en.n_namesz = strlen(men->name) + 1;
 	en.n_descsz = men->datasz;
 	en.n_type = men->type;
 
-	DUMP_WRITE(&en, sizeof(en));
-	DUMP_WRITE(men->name, en.n_namesz);
-	/* XXX - cast from long long to long to avoid need for libgcc.a */
-	DUMP_SEEK(roundup((unsigned long)file->f_pos, 4));	/* XXX */
-	DUMP_WRITE(men->data, men->datasz);
-	DUMP_SEEK(roundup((unsigned long)file->f_pos, 4));	/* XXX */
+	DUMP_WRITE(&en, sizeof(en), foffset);
+	DUMP_WRITE(men->name, en.n_namesz, foffset);
+	if (!alignfile(file, foffset))
+		return 0;
+	DUMP_WRITE(men->data, men->datasz, foffset);
+	if (!alignfile(file, foffset))
+		return 0;
 
 	return 1;
 }
 #undef DUMP_WRITE
-#undef DUMP_SEEK
-
-#define DUMP_WRITE(addr, nr)				\
-	if ((size += (nr)) > cprm->limit ||		\
-	    !dump_write(cprm->file, (addr), (nr)))	\
-		goto end_coredump;
 
 static inline void fill_elf_fdpic_header(struct elfhdr *elf, int segs)
 {
@@ -1393,7 +1363,7 @@ static inline void fill_note(struct memelfnote *note, const char *name, int type
 
 /*
  * fill up all the fields in prstatus from the given task struct, except
- * registers which need to be filled up seperately.
+ * registers which need to be filled up separately.
  */
 static void fill_prstatus(struct elf_prstatus *prstatus,
 			  struct task_struct *p, long signr)
@@ -1524,6 +1494,22 @@ static int elf_dump_thread_status(long signr, struct elf_thread_status *t)
 	return sz;
 }
 
+static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum,
+			     elf_addr_t e_shoff, int segs)
+{
+	elf->e_shoff = e_shoff;
+	elf->e_shentsize = sizeof(*shdr4extnum);
+	elf->e_shnum = 1;
+	elf->e_shstrndx = SHN_UNDEF;
+
+	memset(shdr4extnum, 0, sizeof(*shdr4extnum));
+
+	shdr4extnum->sh_type = SHT_NULL;
+	shdr4extnum->sh_size = elf->e_shnum;
+	shdr4extnum->sh_link = elf->e_shstrndx;
+	shdr4extnum->sh_info = segs;
+}
+
 /*
  * dump the segments for an MMU process
  */
@@ -1552,7 +1538,7 @@ static int elf_fdpic_dump_segments(struct file *file, size_t *size,
 					err = -EIO;
 				kunmap(page);
 				page_cache_release(page);
-			} else if (!dump_seek(file, file->f_pos + PAGE_SIZE))
+			} else if (!dump_seek(file, PAGE_SIZE))
 				err = -EFBIG;
 			if (err)
 				goto out;
@@ -1588,6 +1574,17 @@ static int elf_fdpic_dump_segments(struct file *file, size_t *size,
 }
 #endif
 
+static size_t elf_core_vma_data_size(unsigned long mm_flags)
+{
+	struct vm_area_struct *vma;
+	size_t size = 0;
+
+	for (vma = current->mm->mmap; vma; vma = vma->vm_next)
+		if (maydump(vma, mm_flags))
+			size += vma->vm_end - vma->vm_start;
+	return size;
+}
+
 /*
  * Actual dumper
  *
@@ -1605,7 +1602,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
 	int i;
 	struct vm_area_struct *vma;
 	struct elfhdr *elf = NULL;
-	loff_t offset = 0, dataoff;
+	loff_t offset = 0, dataoff, foffset;
 	int numnote;
 	struct memelfnote *notes = NULL;
 	struct elf_prstatus *prstatus = NULL;	/* NT_PRSTATUS */
@@ -1618,7 +1615,10 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
 #endif
 	int thread_status_size = 0;
 	elf_addr_t *auxv;
-	unsigned long mm_flags;
+	struct elf_phdr *phdr4note = NULL;
+	struct elf_shdr *shdr4extnum = NULL;
+	Elf_Half e_phnum;
+	elf_addr_t e_shoff;
 
 	/*
 	 * We no longer stop all VM operations.
@@ -1683,12 +1683,18 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
 	elf_core_copy_regs(&prstatus->pr_reg, cprm->regs);
 
 	segs = current->mm->map_count;
-#ifdef ELF_CORE_EXTRA_PHDRS
-	segs += ELF_CORE_EXTRA_PHDRS;
-#endif
+	segs += elf_core_extra_phdrs();
+
+	/* for notes section */
+	segs++;
+
+	/* If segs > PN_XNUM(0xffff), then e_phnum overflows. To avoid
+	 * this, kernel supports extended numbering. Have a look at
+	 * include/linux/elf.h for further information. */
+	e_phnum = segs > PN_XNUM ? PN_XNUM : segs;
 
 	/* Set up header */
-	fill_elf_fdpic_header(elf, segs + 1);	/* including notes section */
+	fill_elf_fdpic_header(elf, e_phnum);
 
 	has_dumped = 1;
 	current->flags |= PF_DUMPCORE;
@@ -1727,13 +1733,12 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
 	fs = get_fs();
 	set_fs(KERNEL_DS);
 
-	DUMP_WRITE(elf, sizeof(*elf));
 	offset += sizeof(*elf);				/* Elf header */
-	offset += (segs+1) * sizeof(struct elf_phdr);	/* Program headers */
+	offset += segs * sizeof(struct elf_phdr);	/* Program headers */
+	foffset = offset;
 
 	/* Write notes phdr entry */
 	{
-		struct elf_phdr phdr;
 		int sz = 0;
 
 		for (i = 0; i < numnote; i++)
@@ -1741,20 +1746,38 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
 
 		sz += thread_status_size;
 
-		fill_elf_note_phdr(&phdr, sz, offset);
+		phdr4note = kmalloc(sizeof(*phdr4note), GFP_KERNEL);
+		if (!phdr4note)
+			goto end_coredump;
+
+		fill_elf_note_phdr(phdr4note, sz, offset);
 		offset += sz;
-		DUMP_WRITE(&phdr, sizeof(phdr));
 	}
 
 	/* Page-align dumped data */
 	dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE);
 
-	/*
-	 * We must use the same mm->flags while dumping core to avoid
-	 * inconsistency between the program headers and bodies, otherwise an
-	 * unusable core file can be generated.
-	 */
-	mm_flags = current->mm->flags;
+	offset += elf_core_vma_data_size(cprm->mm_flags);
+	offset += elf_core_extra_data_size();
+	e_shoff = offset;
+
+	if (e_phnum == PN_XNUM) {
+		shdr4extnum = kmalloc(sizeof(*shdr4extnum), GFP_KERNEL);
+		if (!shdr4extnum)
+			goto end_coredump;
+		fill_extnum_info(elf, shdr4extnum, e_shoff, segs);
+	}
+
+	offset = dataoff;
+
+	size += sizeof(*elf);
+	if (size > cprm->limit || !dump_write(cprm->file, elf, sizeof(*elf)))
+		goto end_coredump;
+
+	size += sizeof(*phdr4note);
+	if (size > cprm->limit
+	    || !dump_write(cprm->file, phdr4note, sizeof(*phdr4note)))
+		goto end_coredump;
 
 	/* write program headers for segments dump */
 	for (vma = current->mm->mmap; vma; vma = vma->vm_next) {
@@ -1767,7 +1790,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
 		phdr.p_offset = offset;
 		phdr.p_vaddr = vma->vm_start;
 		phdr.p_paddr = 0;
-		phdr.p_filesz = maydump(vma, mm_flags) ? sz : 0;
+		phdr.p_filesz = maydump(vma, cprm->mm_flags) ? sz : 0;
 		phdr.p_memsz = sz;
 		offset += phdr.p_filesz;
 		phdr.p_flags = vma->vm_flags & VM_READ ? PF_R : 0;
@@ -1777,16 +1800,18 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
 			phdr.p_flags |= PF_X;
 		phdr.p_align = ELF_EXEC_PAGESIZE;
 
-		DUMP_WRITE(&phdr, sizeof(phdr));
+		size += sizeof(phdr);
+		if (size > cprm->limit
+		    || !dump_write(cprm->file, &phdr, sizeof(phdr)))
+			goto end_coredump;
 	}
 
-#ifdef ELF_CORE_WRITE_EXTRA_PHDRS
-	ELF_CORE_WRITE_EXTRA_PHDRS;
-#endif
+	if (!elf_core_write_extra_phdrs(cprm->file, offset, &size, cprm->limit))
+		goto end_coredump;
 
  	/* write out the notes section */
 	for (i = 0; i < numnote; i++)
-		if (!writenote(notes + i, cprm->file))
+		if (!writenote(notes + i, cprm->file, &foffset))
 			goto end_coredump;
 
 	/* write out the thread status notes section */
@@ -1795,20 +1820,27 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
 				list_entry(t, struct elf_thread_status, list);
 
 		for (i = 0; i < tmp->num_notes; i++)
-			if (!writenote(&tmp->notes[i], cprm->file))
+			if (!writenote(&tmp->notes[i], cprm->file, &foffset))
 				goto end_coredump;
 	}
 
-	if (!dump_seek(cprm->file, dataoff))
+	if (!dump_seek(cprm->file, dataoff - foffset))
 		goto end_coredump;
 
 	if (elf_fdpic_dump_segments(cprm->file, &size, &cprm->limit,
-				    mm_flags) < 0)
+				    cprm->mm_flags) < 0)
 		goto end_coredump;
 
-#ifdef ELF_CORE_WRITE_EXTRA_DATA
-	ELF_CORE_WRITE_EXTRA_DATA;
-#endif
+	if (!elf_core_write_extra_data(cprm->file, &size, cprm->limit))
+		goto end_coredump;
+
+	if (e_phnum == PN_XNUM) {
+		size += sizeof(*shdr4extnum);
+		if (size > cprm->limit
+		    || !dump_write(cprm->file, shdr4extnum,
+				   sizeof(*shdr4extnum)))
+			goto end_coredump;
+	}
 
 	if (cprm->file->f_pos != offset) {
 		/* Sanity check */
@@ -1826,7 +1858,7 @@ cleanup:
 		list_del(tmp);
 		kfree(list_entry(tmp, struct elf_thread_status, list));
 	}
-
+	kfree(phdr4note);
 	kfree(elf);
 	kfree(prstatus);
 	kfree(psinfo);
diff --git a/fs/binfmt_em86.c b/fs/binfmt_em86.c
index 32fb00b52cd0..b8e8b0acf9bd 100644
--- a/fs/binfmt_em86.c
+++ b/fs/binfmt_em86.c
@@ -11,7 +11,6 @@
 #include <linux/module.h>
 #include <linux/string.h>
 #include <linux/stat.h>
-#include <linux/slab.h>
 #include <linux/binfmts.h>
 #include <linux/elf.h>
 #include <linux/init.h>
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 42c6b4a54445..811384bec8de 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -56,16 +56,19 @@
 #endif
 
 /*
- * User data (stack, data section and bss) needs to be aligned
- * for the same reasons as SLAB memory is, and to the same amount.
- * Avoid duplicating architecture specific code by using the same
- * macro as with SLAB allocation:
+ * User data (data section and bss) needs to be aligned.
+ * We pick 0x20 here because it is the max value elf2flt has always
+ * used in producing FLAT files, and because it seems to be large
+ * enough to make all the gcc alignment related tests happy.
  */
-#ifdef ARCH_SLAB_MINALIGN
-#define FLAT_DATA_ALIGN	(ARCH_SLAB_MINALIGN)
-#else
-#define FLAT_DATA_ALIGN	(sizeof(void *))
-#endif
+#define FLAT_DATA_ALIGN	(0x20)
+
+/*
+ * User data (stack) also needs to be aligned.
+ * Here we can be a bit looser than the data sections since this
+ * needs to only meet arch ABI requirements.
+ */
+#define FLAT_STACK_ALIGN	max_t(unsigned long, sizeof(void *), ARCH_SLAB_MINALIGN)
 
 #define RELOC_FAILED 0xff00ff01		/* Relocation incorrect somewhere */
 #define UNLOADED_LIB 0x7ff000ff		/* Placeholder for unused library */
@@ -129,7 +132,7 @@ static unsigned long create_flat_tables(
 
 	sp = (unsigned long *)p;
 	sp -= (envc + argc + 2) + 1 + (flat_argvp_envp_on_stack() ? 2 : 0);
-	sp = (unsigned long *) ((unsigned long)sp & -FLAT_DATA_ALIGN);
+	sp = (unsigned long *) ((unsigned long)sp & -FLAT_STACK_ALIGN);
 	argv = sp + 1 + (flat_argvp_envp_on_stack() ? 2 : 0);
 	envp = argv + (argc + 1);
 
@@ -355,7 +358,7 @@ calc_reloc(unsigned long r, struct lib_info *p, int curid, int internalp)
 
 	if (!flat_reloc_valid(r, start_brk - start_data + text_len)) {
 		printk("BINFMT_FLAT: reloc outside program 0x%x (0 - 0x%x/0x%x)",
-		       (int) r,(int)(start_brk-start_code),(int)text_len);
+		       (int) r,(int)(start_brk-start_data+text_len),(int)text_len);
 		goto failed;
 	}
 
@@ -501,7 +504,7 @@ static int load_flat_file(struct linux_binprm * bprm,
 	 * size limits imposed on them by creating programs with large
 	 * arrays in the data or bss.
 	 */
-	rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur;
+	rlim = rlimit(RLIMIT_DATA);
 	if (rlim >= RLIM_INFINITY)
 		rlim = ~0;
 	if (data_len + bss_len > rlim) {
@@ -589,7 +592,7 @@ static int load_flat_file(struct linux_binprm * bprm,
 		if (IS_ERR_VALUE(result)) {
 			printk("Unable to read data+bss, errno %d\n", (int)-result);
 			do_munmap(current->mm, textpos, text_len);
-			do_munmap(current->mm, realdatastart, data_len + extra);
+			do_munmap(current->mm, realdatastart, len);
 			ret = result;
 			goto err;
 		}
@@ -876,7 +879,7 @@ static int load_flat_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	stack_len = TOP_OF_ARGS - bprm->p;             /* the strings */
 	stack_len += (bprm->argc + 1) * sizeof(char *); /* the argv array */
 	stack_len += (bprm->envc + 1) * sizeof(char *); /* the envp array */
-	stack_len += FLAT_DATA_ALIGN - 1;  /* reserve for upcoming alignment */
+	stack_len += FLAT_STACK_ALIGN - 1;  /* reserve for upcoming alignment */
 	
 	res = load_flat_file(bprm, &libinfo, 0, &stack_len);
 	if (IS_ERR_VALUE(res))
diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c
index 08343505e184..aca9d55afb22 100644
--- a/fs/binfmt_script.c
+++ b/fs/binfmt_script.c
@@ -8,7 +8,6 @@
 #include <linux/module.h>
 #include <linux/string.h>
 #include <linux/stat.h>
-#include <linux/slab.h>
 #include <linux/binfmts.h>
 #include <linux/init.h>
 #include <linux/file.h>
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 49a34e7f7306..612a5c38d3c1 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -24,6 +24,7 @@
 #include <linux/mempool.h>
 #include <linux/bio.h>
 #include <linux/workqueue.h>
+#include <linux/slab.h>
 
 struct integrity_slab {
 	struct kmem_cache *slab;
@@ -61,7 +62,7 @@ static inline unsigned int vecs_to_idx(unsigned int nr)
 
 static inline int use_bip_pool(unsigned int idx)
 {
-	if (idx == BIOVEC_NR_POOLS)
+	if (idx == BIOVEC_MAX_IDX)
 		return 1;
 
 	return 0;
@@ -95,6 +96,7 @@ struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio,
 
 	/* Use mempool if lower order alloc failed or max vecs were requested */
 	if (bip == NULL) {
+		idx = BIOVEC_MAX_IDX;  /* so we free the payload properly later */
 		bip = mempool_alloc(bs->bio_integrity_pool, gfp_mask);
 
 		if (unlikely(bip == NULL)) {
diff --git a/fs/bio.c b/fs/bio.c
index 12429c9553eb..e7bf6ca64dcf 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -264,13 +264,12 @@ EXPORT_SYMBOL(bio_init);
  * bio_alloc_bioset - allocate a bio for I/O
  * @gfp_mask:   the GFP_ mask given to the slab allocator
  * @nr_iovecs:	number of iovecs to pre-allocate
- * @bs:		the bio_set to allocate from. If %NULL, just use kmalloc
+ * @bs:		the bio_set to allocate from.
  *
  * Description:
- *   bio_alloc_bioset will first try its own mempool to satisfy the allocation.
+ *   bio_alloc_bioset will try its own mempool to satisfy the allocation.
  *   If %__GFP_WAIT is set then we will block on the internal pool waiting
- *   for a &struct bio to become free. If a %NULL @bs is passed in, we will
- *   fall back to just using @kmalloc to allocate the required memory.
+ *   for a &struct bio to become free.
  *
  *   Note that the caller must set ->bi_destructor on successful return
  *   of a bio, to do the appropriate freeing of the bio once the reference
@@ -507,10 +506,8 @@ int bio_get_nr_vecs(struct block_device *bdev)
 	int nr_pages;
 
 	nr_pages = ((queue_max_sectors(q) << 9) + PAGE_SIZE - 1) >> PAGE_SHIFT;
-	if (nr_pages > queue_max_phys_segments(q))
-		nr_pages = queue_max_phys_segments(q);
-	if (nr_pages > queue_max_hw_segments(q))
-		nr_pages = queue_max_hw_segments(q);
+	if (nr_pages > queue_max_segments(q))
+		nr_pages = queue_max_segments(q);
 
 	return nr_pages;
 }
@@ -542,17 +539,22 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
 
 		if (page == prev->bv_page &&
 		    offset == prev->bv_offset + prev->bv_len) {
+			unsigned int prev_bv_len = prev->bv_len;
 			prev->bv_len += len;
 
 			if (q->merge_bvec_fn) {
 				struct bvec_merge_data bvm = {
+					/* prev_bvec is already charged in
+					   bi_size, discharge it in order to
+					   simulate merging updated prev_bvec
+					   as new bvec. */
 					.bi_bdev = bio->bi_bdev,
 					.bi_sector = bio->bi_sector,
-					.bi_size = bio->bi_size,
+					.bi_size = bio->bi_size - prev_bv_len,
 					.bi_rw = bio->bi_rw,
 				};
 
-				if (q->merge_bvec_fn(q, &bvm, prev) < len) {
+				if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len) {
 					prev->bv_len -= len;
 					return 0;
 				}
@@ -570,8 +572,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
 	 * make this too complex.
 	 */
 
-	while (bio->bi_phys_segments >= queue_max_phys_segments(q)
-	       || bio->bi_phys_segments >= queue_max_hw_segments(q)) {
+	while (bio->bi_phys_segments >= queue_max_segments(q)) {
 
 		if (retried_segments)
 			return 0;
@@ -606,7 +607,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
 		 * merge_bvec_fn() returns number of bytes it can accept
 		 * at this offset
 		 */
-		if (q->merge_bvec_fn(q, &bvm, bvec) < len) {
+		if (q->merge_bvec_fn(q, &bvm, bvec) < bvec->bv_len) {
 			bvec->bv_page = NULL;
 			bvec->bv_len = 0;
 			bvec->bv_offset = 0;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 73d6a735b8f3..99d6af811747 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -172,8 +172,9 @@ blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_mapping->host;
 
-	return blockdev_direct_IO_no_locking(rw, iocb, inode, I_BDEV(inode),
-				iov, offset, nr_segs, blkdev_get_blocks, NULL);
+	return blockdev_direct_IO_no_locking_newtrunc(rw, iocb, inode,
+				I_BDEV(inode), iov, offset, nr_segs,
+				blkdev_get_blocks, NULL);
 }
 
 int __sync_blockdev(struct block_device *bdev, int wait)
@@ -245,36 +246,14 @@ struct super_block *freeze_bdev(struct block_device *bdev)
 	sb = get_active_super(bdev);
 	if (!sb)
 		goto out;
-	if (sb->s_flags & MS_RDONLY) {
-		deactivate_locked_super(sb);
+	error = freeze_super(sb);
+	if (error) {
+		deactivate_super(sb);
+		bdev->bd_fsfreeze_count--;
 		mutex_unlock(&bdev->bd_fsfreeze_mutex);
-		return sb;
-	}
-
-	sb->s_frozen = SB_FREEZE_WRITE;
-	smp_wmb();
-
-	sync_filesystem(sb);
-
-	sb->s_frozen = SB_FREEZE_TRANS;
-	smp_wmb();
-
-	sync_blockdev(sb->s_bdev);
-
-	if (sb->s_op->freeze_fs) {
-		error = sb->s_op->freeze_fs(sb);
-		if (error) {
-			printk(KERN_ERR
-				"VFS:Filesystem freeze failed\n");
-			sb->s_frozen = SB_UNFROZEN;
-			deactivate_locked_super(sb);
-			bdev->bd_fsfreeze_count--;
-			mutex_unlock(&bdev->bd_fsfreeze_mutex);
-			return ERR_PTR(error);
-		}
+		return ERR_PTR(error);
 	}
-	up_write(&sb->s_umount);
-
+	deactivate_super(sb);
  out:
 	sync_blockdev(bdev);
 	mutex_unlock(&bdev->bd_fsfreeze_mutex);
@@ -295,40 +274,22 @@ int thaw_bdev(struct block_device *bdev, struct super_block *sb)
 
 	mutex_lock(&bdev->bd_fsfreeze_mutex);
 	if (!bdev->bd_fsfreeze_count)
-		goto out_unlock;
+		goto out;
 
 	error = 0;
 	if (--bdev->bd_fsfreeze_count > 0)
-		goto out_unlock;
+		goto out;
 
 	if (!sb)
-		goto out_unlock;
-
-	BUG_ON(sb->s_bdev != bdev);
-	down_write(&sb->s_umount);
-	if (sb->s_flags & MS_RDONLY)
-		goto out_deactivate;
-
-	if (sb->s_op->unfreeze_fs) {
-		error = sb->s_op->unfreeze_fs(sb);
-		if (error) {
-			printk(KERN_ERR
-				"VFS:Filesystem thaw failed\n");
-			sb->s_frozen = SB_FREEZE_TRANS;
-			bdev->bd_fsfreeze_count++;
-			mutex_unlock(&bdev->bd_fsfreeze_mutex);
-			return error;
-		}
-	}
-
-	sb->s_frozen = SB_UNFROZEN;
-	smp_wmb();
-	wake_up(&sb->s_wait_unfrozen);
+		goto out;
 
-out_deactivate:
-	if (sb)
-		deactivate_locked_super(sb);
-out_unlock:
+	error = thaw_super(sb);
+	if (error) {
+		bdev->bd_fsfreeze_count++;
+		mutex_unlock(&bdev->bd_fsfreeze_mutex);
+		return error;
+	}
+out:
 	mutex_unlock(&bdev->bd_fsfreeze_mutex);
 	return 0;
 }
@@ -349,8 +310,8 @@ static int blkdev_write_begin(struct file *file, struct address_space *mapping,
 			struct page **pagep, void **fsdata)
 {
 	*pagep = NULL;
-	return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
-				blkdev_get_block);
+	return block_write_begin_newtrunc(file, mapping, pos, len, flags,
+				pagep, fsdata, blkdev_get_block);
 }
 
 static int blkdev_write_end(struct file *file, struct address_space *mapping,
@@ -398,25 +359,28 @@ static loff_t block_llseek(struct file *file, loff_t offset, int origin)
 	return retval;
 }
 	
-/*
- *	Filp is never NULL; the only case when ->fsync() is called with
- *	NULL first argument is nfsd_sync_dir() and that's not a directory.
- */
- 
-static int block_fsync(struct file *filp, struct dentry *dentry, int datasync)
+int blkdev_fsync(struct file *filp, int datasync)
 {
-	struct block_device *bdev = I_BDEV(filp->f_mapping->host);
+	struct inode *bd_inode = filp->f_mapping->host;
+	struct block_device *bdev = I_BDEV(bd_inode);
 	int error;
 
-	error = sync_blockdev(bdev);
-	if (error)
-		return error;
-	
-	error = blkdev_issue_flush(bdev, NULL);
+	/*
+	 * There is no need to serialise calls to blkdev_issue_flush with
+	 * i_mutex and doing so causes performance issues with concurrent
+	 * O_SYNC writers to a block device.
+	 */
+	mutex_unlock(&bd_inode->i_mutex);
+
+	error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL, BLKDEV_IFL_WAIT);
 	if (error == -EOPNOTSUPP)
 		error = 0;
+
+	mutex_lock(&bd_inode->i_mutex);
+
 	return error;
 }
+EXPORT_SYMBOL(blkdev_fsync);
 
 /*
  * pseudo-fs
@@ -659,41 +623,233 @@ void bd_forget(struct inode *inode)
 		iput(bdev->bd_inode);
 }
 
-int bd_claim(struct block_device *bdev, void *holder)
+/**
+ * bd_may_claim - test whether a block device can be claimed
+ * @bdev: block device of interest
+ * @whole: whole block device containing @bdev, may equal @bdev
+ * @holder: holder trying to claim @bdev
+ *
+ * Test whther @bdev can be claimed by @holder.
+ *
+ * CONTEXT:
+ * spin_lock(&bdev_lock).
+ *
+ * RETURNS:
+ * %true if @bdev can be claimed, %false otherwise.
+ */
+static bool bd_may_claim(struct block_device *bdev, struct block_device *whole,
+			 void *holder)
 {
-	int res;
-	spin_lock(&bdev_lock);
-
-	/* first decide result */
 	if (bdev->bd_holder == holder)
-		res = 0;	 /* already a holder */
+		return true;	 /* already a holder */
 	else if (bdev->bd_holder != NULL)
-		res = -EBUSY; 	 /* held by someone else */
+		return false; 	 /* held by someone else */
 	else if (bdev->bd_contains == bdev)
-		res = 0;  	 /* is a whole device which isn't held */
+		return true;  	 /* is a whole device which isn't held */
 
-	else if (bdev->bd_contains->bd_holder == bd_claim)
-		res = 0; 	 /* is a partition of a device that is being partitioned */
-	else if (bdev->bd_contains->bd_holder != NULL)
-		res = -EBUSY;	 /* is a partition of a held device */
+	else if (whole->bd_holder == bd_claim)
+		return true; 	 /* is a partition of a device that is being partitioned */
+	else if (whole->bd_holder != NULL)
+		return false;	 /* is a partition of a held device */
 	else
-		res = 0;	 /* is a partition of an un-held device */
+		return true;	 /* is a partition of an un-held device */
+}
 
-	/* now impose change */
-	if (res==0) {
-		/* note that for a whole device bd_holders
-		 * will be incremented twice, and bd_holder will
-		 * be set to bd_claim before being set to holder
-		 */
-		bdev->bd_contains->bd_holders ++;
-		bdev->bd_contains->bd_holder = bd_claim;
-		bdev->bd_holders++;
-		bdev->bd_holder = holder;
+/**
+ * bd_prepare_to_claim - prepare to claim a block device
+ * @bdev: block device of interest
+ * @whole: the whole device containing @bdev, may equal @bdev
+ * @holder: holder trying to claim @bdev
+ *
+ * Prepare to claim @bdev.  This function fails if @bdev is already
+ * claimed by another holder and waits if another claiming is in
+ * progress.  This function doesn't actually claim.  On successful
+ * return, the caller has ownership of bd_claiming and bd_holder[s].
+ *
+ * CONTEXT:
+ * spin_lock(&bdev_lock).  Might release bdev_lock, sleep and regrab
+ * it multiple times.
+ *
+ * RETURNS:
+ * 0 if @bdev can be claimed, -EBUSY otherwise.
+ */
+static int bd_prepare_to_claim(struct block_device *bdev,
+			       struct block_device *whole, void *holder)
+{
+retry:
+	/* if someone else claimed, fail */
+	if (!bd_may_claim(bdev, whole, holder))
+		return -EBUSY;
+
+	/* if someone else is claiming, wait for it to finish */
+	if (whole->bd_claiming && whole->bd_claiming != holder) {
+		wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0);
+		DEFINE_WAIT(wait);
+
+		prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
+		spin_unlock(&bdev_lock);
+		schedule();
+		finish_wait(wq, &wait);
+		spin_lock(&bdev_lock);
+		goto retry;
+	}
+
+	/* yay, all mine */
+	return 0;
+}
+
+/**
+ * bd_start_claiming - start claiming a block device
+ * @bdev: block device of interest
+ * @holder: holder trying to claim @bdev
+ *
+ * @bdev is about to be opened exclusively.  Check @bdev can be opened
+ * exclusively and mark that an exclusive open is in progress.  Each
+ * successful call to this function must be matched with a call to
+ * either bd_finish_claiming() or bd_abort_claiming() (which do not
+ * fail).
+ *
+ * This function is used to gain exclusive access to the block device
+ * without actually causing other exclusive open attempts to fail. It
+ * should be used when the open sequence itself requires exclusive
+ * access but may subsequently fail.
+ *
+ * CONTEXT:
+ * Might sleep.
+ *
+ * RETURNS:
+ * Pointer to the block device containing @bdev on success, ERR_PTR()
+ * value on failure.
+ */
+static struct block_device *bd_start_claiming(struct block_device *bdev,
+					      void *holder)
+{
+	struct gendisk *disk;
+	struct block_device *whole;
+	int partno, err;
+
+	might_sleep();
+
+	/*
+	 * @bdev might not have been initialized properly yet, look up
+	 * and grab the outer block device the hard way.
+	 */
+	disk = get_gendisk(bdev->bd_dev, &partno);
+	if (!disk)
+		return ERR_PTR(-ENXIO);
+
+	whole = bdget_disk(disk, 0);
+	module_put(disk->fops->owner);
+	put_disk(disk);
+	if (!whole)
+		return ERR_PTR(-ENOMEM);
+
+	/* prepare to claim, if successful, mark claiming in progress */
+	spin_lock(&bdev_lock);
+
+	err = bd_prepare_to_claim(bdev, whole, holder);
+	if (err == 0) {
+		whole->bd_claiming = holder;
+		spin_unlock(&bdev_lock);
+		return whole;
+	} else {
+		spin_unlock(&bdev_lock);
+		bdput(whole);
+		return ERR_PTR(err);
 	}
+}
+
+/* releases bdev_lock */
+static void __bd_abort_claiming(struct block_device *whole, void *holder)
+{
+	BUG_ON(whole->bd_claiming != holder);
+	whole->bd_claiming = NULL;
+	wake_up_bit(&whole->bd_claiming, 0);
+
 	spin_unlock(&bdev_lock);
-	return res;
+	bdput(whole);
+}
+
+/**
+ * bd_abort_claiming - abort claiming a block device
+ * @whole: whole block device returned by bd_start_claiming()
+ * @holder: holder trying to claim @bdev
+ *
+ * Abort a claiming block started by bd_start_claiming().  Note that
+ * @whole is not the block device to be claimed but the whole device
+ * returned by bd_start_claiming().
+ *
+ * CONTEXT:
+ * Grabs and releases bdev_lock.
+ */
+static void bd_abort_claiming(struct block_device *whole, void *holder)
+{
+	spin_lock(&bdev_lock);
+	__bd_abort_claiming(whole, holder);		/* releases bdev_lock */
+}
+
+/* increment holders when we have a legitimate claim. requires bdev_lock */
+static void __bd_claim(struct block_device *bdev, struct block_device *whole,
+					void *holder)
+{
+	/* note that for a whole device bd_holders
+	 * will be incremented twice, and bd_holder will
+	 * be set to bd_claim before being set to holder
+	 */
+	whole->bd_holders++;
+	whole->bd_holder = bd_claim;
+	bdev->bd_holders++;
+	bdev->bd_holder = holder;
 }
 
+/**
+ * bd_finish_claiming - finish claiming a block device
+ * @bdev: block device of interest (passed to bd_start_claiming())
+ * @whole: whole block device returned by bd_start_claiming()
+ * @holder: holder trying to claim @bdev
+ *
+ * Finish a claiming block started by bd_start_claiming().
+ *
+ * CONTEXT:
+ * Grabs and releases bdev_lock.
+ */
+static void bd_finish_claiming(struct block_device *bdev,
+				struct block_device *whole, void *holder)
+{
+	spin_lock(&bdev_lock);
+	BUG_ON(!bd_may_claim(bdev, whole, holder));
+	__bd_claim(bdev, whole, holder);
+	__bd_abort_claiming(whole, holder); /* not actually an abort */
+}
+
+/**
+ * bd_claim - claim a block device
+ * @bdev: block device to claim
+ * @holder: holder trying to claim @bdev
+ *
+ * Try to claim @bdev which must have been opened successfully.
+ *
+ * CONTEXT:
+ * Might sleep.
+ *
+ * RETURNS:
+ * 0 if successful, -EBUSY if @bdev is already claimed.
+ */
+int bd_claim(struct block_device *bdev, void *holder)
+{
+	struct block_device *whole = bdev->bd_contains;
+	int res;
+
+	might_sleep();
+
+	spin_lock(&bdev_lock);
+	res = bd_prepare_to_claim(bdev, whole, holder);
+	if (res == 0)
+		__bd_claim(bdev, whole, holder);
+	spin_unlock(&bdev_lock);
+
+	return res;
+}
 EXPORT_SYMBOL(bd_claim);
 
 void bd_release(struct block_device *bdev)
@@ -1307,6 +1463,7 @@ EXPORT_SYMBOL(blkdev_get);
 
 static int blkdev_open(struct inode * inode, struct file * filp)
 {
+	struct block_device *whole = NULL;
 	struct block_device *bdev;
 	int res;
 
@@ -1329,22 +1486,25 @@ static int blkdev_open(struct inode * inode, struct file * filp)
 	if (bdev == NULL)
 		return -ENOMEM;
 
+	if (filp->f_mode & FMODE_EXCL) {
+		whole = bd_start_claiming(bdev, filp);
+		if (IS_ERR(whole)) {
+			bdput(bdev);
+			return PTR_ERR(whole);
+		}
+	}
+
 	filp->f_mapping = bdev->bd_inode->i_mapping;
 
 	res = blkdev_get(bdev, filp->f_mode);
-	if (res)
-		return res;
 
-	if (filp->f_mode & FMODE_EXCL) {
-		res = bd_claim(bdev, filp);
-		if (res)
-			goto out_blkdev_put;
+	if (whole) {
+		if (res == 0)
+			bd_finish_claiming(bdev, whole, filp);
+		else
+			bd_abort_claiming(whole, filp);
 	}
 
-	return 0;
-
- out_blkdev_put:
-	blkdev_put(bdev, filp->f_mode);
 	return res;
 }
 
@@ -1480,7 +1640,7 @@ const struct file_operations def_blk_fops = {
   	.aio_read	= generic_file_aio_read,
 	.aio_write	= blkdev_aio_write,
 	.mmap		= generic_file_mmap,
-	.fsync		= block_fsync,
+	.fsync		= blkdev_fsync,
 	.unlocked_ioctl	= block_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= compat_blkdev_ioctl,
@@ -1555,27 +1715,34 @@ EXPORT_SYMBOL(lookup_bdev);
  */
 struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *holder)
 {
-	struct block_device *bdev;
-	int error = 0;
+	struct block_device *bdev, *whole;
+	int error;
 
 	bdev = lookup_bdev(path);
 	if (IS_ERR(bdev))
 		return bdev;
 
+	whole = bd_start_claiming(bdev, holder);
+	if (IS_ERR(whole)) {
+		bdput(bdev);
+		return whole;
+	}
+
 	error = blkdev_get(bdev, mode);
 	if (error)
-		return ERR_PTR(error);
+		goto out_abort_claiming;
+
 	error = -EACCES;
 	if ((mode & FMODE_WRITE) && bdev_read_only(bdev))
-		goto blkdev_put;
-	error = bd_claim(bdev, holder);
-	if (error)
-		goto blkdev_put;
+		goto out_blkdev_put;
 
+	bd_finish_claiming(bdev, whole, holder);
 	return bdev;
-	
-blkdev_put:
+
+out_blkdev_put:
 	blkdev_put(bdev, mode);
+out_abort_claiming:
+	bd_abort_claiming(whole, holder);
 	return ERR_PTR(error);
 }
 
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 6df6d6ed74fd..2222d161c7b6 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -22,6 +22,7 @@
 #include <linux/posix_acl_xattr.h>
 #include <linux/posix_acl.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 
 #include "ctree.h"
 #include "btrfs_inode.h"
@@ -59,6 +60,8 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
 		size = __btrfs_getxattr(inode, name, value, size);
 		if (size > 0) {
 			acl = posix_acl_from_xattr(value, size);
+			if (IS_ERR(acl))
+				return acl;
 			set_cached_acl(inode, type, acl);
 		}
 		kfree(value);
@@ -159,6 +162,12 @@ static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name,
 	int ret;
 	struct posix_acl *acl = NULL;
 
+	if (!is_owner_or_cap(dentry->d_inode))
+		return -EPERM;
+
+	if (!IS_POSIXACL(dentry->d_inode))
+		return -EOPNOTSUPP;
+
 	if (value) {
 		acl = posix_acl_from_xattr(value, size);
 		if (acl == NULL) {
@@ -281,14 +290,14 @@ int btrfs_acl_chmod(struct inode *inode)
 	return ret;
 }
 
-struct xattr_handler btrfs_xattr_acl_default_handler = {
+const struct xattr_handler btrfs_xattr_acl_default_handler = {
 	.prefix = POSIX_ACL_XATTR_DEFAULT,
 	.flags	= ACL_TYPE_DEFAULT,
 	.get	= btrfs_xattr_acl_get,
 	.set	= btrfs_xattr_acl_set,
 };
 
-struct xattr_handler btrfs_xattr_acl_access_handler = {
+const struct xattr_handler btrfs_xattr_acl_access_handler = {
 	.prefix = POSIX_ACL_XATTR_ACCESS,
 	.flags	= ACL_TYPE_ACCESS,
 	.get	= btrfs_xattr_acl_get,
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index c0861e781cdb..7ec14097fef1 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -17,6 +17,7 @@
  */
 
 #include <linux/kthread.h>
+#include <linux/slab.h>
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/freezer.h>
@@ -376,6 +377,7 @@ again:
 				if (!list_empty(&worker->pending) ||
 				    !list_empty(&worker->prio_pending)) {
 					spin_unlock_irq(&worker->lock);
+					set_current_state(TASK_RUNNING);
 					goto again;
 				}
 
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 3f1f50d9d916..6ad63f17eca0 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -137,8 +137,8 @@ struct btrfs_inode {
 	 * of extent items we've reserved metadata for.
 	 */
 	spinlock_t accounting_lock;
+	atomic_t outstanding_extents;
 	int reserved_extents;
-	int outstanding_extents;
 
 	/*
 	 * ordered_data_close is set by truncate when a file that used
@@ -151,8 +151,14 @@ struct btrfs_inode {
 	 * of these.
 	 */
 	unsigned ordered_data_close:1;
+	unsigned orphan_meta_reserved:1;
 	unsigned dummy_inode:1;
 
+	/*
+	 * always compress this one file
+	 */
+	unsigned force_compress:1;
+
 	struct inode vfs_inode;
 };
 
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index a11a32058b50..396039b3a8a2 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -31,7 +31,7 @@
 #include <linux/swap.h>
 #include <linux/writeback.h>
 #include <linux/bit_spinlock.h>
-#include <linux/pagevec.h>
+#include <linux/slab.h>
 #include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
@@ -445,7 +445,6 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 	unsigned long nr_pages = 0;
 	struct extent_map *em;
 	struct address_space *mapping = inode->i_mapping;
-	struct pagevec pvec;
 	struct extent_map_tree *em_tree;
 	struct extent_io_tree *tree;
 	u64 end;
@@ -461,7 +460,6 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 
 	end_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
 
-	pagevec_init(&pvec, 0);
 	while (last_offset < compressed_end) {
 		page_index = last_offset >> PAGE_CACHE_SHIFT;
 
@@ -478,26 +476,17 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 			goto next;
 		}
 
-		page = alloc_page(mapping_gfp_mask(mapping) | GFP_NOFS);
+		page = __page_cache_alloc(mapping_gfp_mask(mapping) &
+								~__GFP_FS);
 		if (!page)
 			break;
 
-		page->index = page_index;
-		/*
-		 * what we want to do here is call add_to_page_cache_lru,
-		 * but that isn't exported, so we reproduce it here
-		 */
-		if (add_to_page_cache(page, mapping,
-				      page->index, GFP_NOFS)) {
+		if (add_to_page_cache_lru(page, mapping, page_index,
+								GFP_NOFS)) {
 			page_cache_release(page);
 			goto next;
 		}
 
-		/* open coding of lru_cache_add, also not exported */
-		page_cache_get(page);
-		if (!pagevec_add(&pvec, page))
-			__pagevec_lru_add_file(&pvec);
-
 		end = last_offset + PAGE_CACHE_SIZE - 1;
 		/*
 		 * at this point, we have a locked page in the page cache
@@ -551,8 +540,6 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 next:
 		last_offset += PAGE_CACHE_SIZE;
 	}
-	if (pagevec_count(&pvec))
-		__pagevec_lru_add_file(&pvec);
 	return 0;
 }
 
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index c4bc570a396e..c3df14ce2cc2 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -17,6 +17,7 @@
  */
 
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -279,7 +280,8 @@ int btrfs_block_can_be_shared(struct btrfs_root *root,
 static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
 				       struct btrfs_root *root,
 				       struct extent_buffer *buf,
-				       struct extent_buffer *cow)
+				       struct extent_buffer *cow,
+				       int *last_ref)
 {
 	u64 refs;
 	u64 owner;
@@ -365,6 +367,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
 			BUG_ON(ret);
 		}
 		clean_tree_block(trans, root, buf);
+		*last_ref = 1;
 	}
 	return 0;
 }
@@ -391,6 +394,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 	struct btrfs_disk_key disk_key;
 	struct extent_buffer *cow;
 	int level;
+	int last_ref = 0;
 	int unlock_orig = 0;
 	u64 parent_start;
 
@@ -441,7 +445,10 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 			    (unsigned long)btrfs_header_fsid(cow),
 			    BTRFS_FSID_SIZE);
 
-	update_ref_for_cow(trans, root, buf, cow);
+	update_ref_for_cow(trans, root, buf, cow, &last_ref);
+
+	if (root->ref_cows)
+		btrfs_reloc_cow_block(trans, root, buf, cow);
 
 	if (buf == root->node) {
 		WARN_ON(parent && parent != buf);
@@ -456,8 +463,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 		extent_buffer_get(cow);
 		spin_unlock(&root->node_lock);
 
-		btrfs_free_tree_block(trans, root, buf->start, buf->len,
-				parent_start, root->root_key.objectid, level);
+		btrfs_free_tree_block(trans, root, buf, parent_start,
+				      last_ref);
 		free_extent_buffer(buf);
 		add_root_to_dirty_list(root);
 	} else {
@@ -472,8 +479,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 		btrfs_set_node_ptr_generation(parent, parent_slot,
 					      trans->transid);
 		btrfs_mark_buffer_dirty(parent);
-		btrfs_free_tree_block(trans, root, buf->start, buf->len,
-				parent_start, root->root_key.objectid, level);
+		btrfs_free_tree_block(trans, root, buf, parent_start,
+				      last_ref);
 	}
 	if (unlock_orig)
 		btrfs_tree_unlock(buf);
@@ -948,6 +955,22 @@ int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
 	return bin_search(eb, key, level, slot);
 }
 
+static void root_add_used(struct btrfs_root *root, u32 size)
+{
+	spin_lock(&root->accounting_lock);
+	btrfs_set_root_used(&root->root_item,
+			    btrfs_root_used(&root->root_item) + size);
+	spin_unlock(&root->accounting_lock);
+}
+
+static void root_sub_used(struct btrfs_root *root, u32 size)
+{
+	spin_lock(&root->accounting_lock);
+	btrfs_set_root_used(&root->root_item,
+			    btrfs_root_used(&root->root_item) - size);
+	spin_unlock(&root->accounting_lock);
+}
+
 /* given a node and slot number, this reads the blocks it points to.  The
  * extent buffer is returned with a reference taken (but unlocked).
  * NULL is returned on error.
@@ -1018,7 +1041,11 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		btrfs_tree_lock(child);
 		btrfs_set_lock_blocking(child);
 		ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
-		BUG_ON(ret);
+		if (ret) {
+			btrfs_tree_unlock(child);
+			free_extent_buffer(child);
+			goto enospc;
+		}
 
 		spin_lock(&root->node_lock);
 		root->node = child;
@@ -1033,11 +1060,12 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		btrfs_tree_unlock(mid);
 		/* once for the path */
 		free_extent_buffer(mid);
-		ret = btrfs_free_tree_block(trans, root, mid->start, mid->len,
-					    0, root->root_key.objectid, level);
+
+		root_sub_used(root, mid->len);
+		btrfs_free_tree_block(trans, root, mid, 0, 1);
 		/* once for the root ptr */
 		free_extent_buffer(mid);
-		return ret;
+		return 0;
 	}
 	if (btrfs_header_nritems(mid) >
 	    BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
@@ -1087,23 +1115,16 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		if (wret < 0 && wret != -ENOSPC)
 			ret = wret;
 		if (btrfs_header_nritems(right) == 0) {
-			u64 bytenr = right->start;
-			u32 blocksize = right->len;
-
 			clean_tree_block(trans, root, right);
 			btrfs_tree_unlock(right);
-			free_extent_buffer(right);
-			right = NULL;
 			wret = del_ptr(trans, root, path, level + 1, pslot +
 				       1);
 			if (wret)
 				ret = wret;
-			wret = btrfs_free_tree_block(trans, root,
-						     bytenr, blocksize, 0,
-						     root->root_key.objectid,
-						     level);
-			if (wret)
-				ret = wret;
+			root_sub_used(root, right->len);
+			btrfs_free_tree_block(trans, root, right, 0, 1);
+			free_extent_buffer(right);
+			right = NULL;
 		} else {
 			struct btrfs_disk_key right_key;
 			btrfs_node_key(right, &right_key, 0);
@@ -1135,21 +1156,15 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		BUG_ON(wret == 1);
 	}
 	if (btrfs_header_nritems(mid) == 0) {
-		/* we've managed to empty the middle node, drop it */
-		u64 bytenr = mid->start;
-		u32 blocksize = mid->len;
-
 		clean_tree_block(trans, root, mid);
 		btrfs_tree_unlock(mid);
-		free_extent_buffer(mid);
-		mid = NULL;
 		wret = del_ptr(trans, root, path, level + 1, pslot);
 		if (wret)
 			ret = wret;
-		wret = btrfs_free_tree_block(trans, root, bytenr, blocksize,
-					 0, root->root_key.objectid, level);
-		if (wret)
-			ret = wret;
+		root_sub_used(root, mid->len);
+		btrfs_free_tree_block(trans, root, mid, 0, 1);
+		free_extent_buffer(mid);
+		mid = NULL;
 	} else {
 		/* update the parent key to reflect our changes */
 		struct btrfs_disk_key mid_key;
@@ -1589,7 +1604,7 @@ read_block_for_search(struct btrfs_trans_handle *trans,
 	btrfs_release_path(NULL, p);
 
 	ret = -EAGAIN;
-	tmp = read_tree_block(root, blocknr, blocksize, gen);
+	tmp = read_tree_block(root, blocknr, blocksize, 0);
 	if (tmp) {
 		/*
 		 * If the read above didn't mark this buffer up to date,
@@ -1739,7 +1754,6 @@ again:
 					      p->nodes[level + 1],
 					      p->slots[level + 1], &b);
 			if (err) {
-				free_extent_buffer(b);
 				ret = err;
 				goto done;
 			}
@@ -2075,6 +2089,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
 	if (IS_ERR(c))
 		return PTR_ERR(c);
 
+	root_add_used(root, root->nodesize);
+
 	memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header));
 	btrfs_set_header_nritems(c, 1);
 	btrfs_set_header_level(c, level);
@@ -2133,6 +2149,7 @@ static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
 	int nritems;
 
 	BUG_ON(!path->nodes[level]);
+	btrfs_assert_tree_locked(path->nodes[level]);
 	lower = path->nodes[level];
 	nritems = btrfs_header_nritems(lower);
 	BUG_ON(slot > nritems);
@@ -2201,6 +2218,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
 	if (IS_ERR(split))
 		return PTR_ERR(split);
 
+	root_add_used(root, root->nodesize);
+
 	memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header));
 	btrfs_set_header_level(split, btrfs_header_level(c));
 	btrfs_set_header_bytenr(split, split->start);
@@ -2285,12 +2304,17 @@ noinline int btrfs_leaf_free_space(struct btrfs_root *root,
 	return ret;
 }
 
+/*
+ * min slot controls the lowest index we're willing to push to the
+ * right.  We'll push up to and including min_slot, but no lower
+ */
 static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
 				      struct btrfs_root *root,
 				      struct btrfs_path *path,
 				      int data_size, int empty,
 				      struct extent_buffer *right,
-				      int free_space, u32 left_nritems)
+				      int free_space, u32 left_nritems,
+				      u32 min_slot)
 {
 	struct extent_buffer *left = path->nodes[0];
 	struct extent_buffer *upper = path->nodes[1];
@@ -2308,7 +2332,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
 	if (empty)
 		nr = 0;
 	else
-		nr = 1;
+		nr = max_t(u32, 1, min_slot);
 
 	if (path->slots[0] >= left_nritems)
 		push_space += data_size;
@@ -2414,6 +2438,9 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
 
 	if (left_nritems)
 		btrfs_mark_buffer_dirty(left);
+	else
+		clean_tree_block(trans, root, left);
+
 	btrfs_mark_buffer_dirty(right);
 
 	btrfs_item_key(right, &disk_key, 0);
@@ -2447,10 +2474,14 @@ out_unlock:
  *
  * returns 1 if the push failed because the other node didn't have enough
  * room, 0 if everything worked out and < 0 if there were major errors.
+ *
+ * this will push starting from min_slot to the end of the leaf.  It won't
+ * push any slot lower than min_slot
  */
 static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
-			   *root, struct btrfs_path *path, int data_size,
-			   int empty)
+			   *root, struct btrfs_path *path,
+			   int min_data_size, int data_size,
+			   int empty, u32 min_slot)
 {
 	struct extent_buffer *left = path->nodes[0];
 	struct extent_buffer *right;
@@ -2492,8 +2523,8 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
 	if (left_nritems == 0)
 		goto out_unlock;
 
-	return __push_leaf_right(trans, root, path, data_size, empty,
-				right, free_space, left_nritems);
+	return __push_leaf_right(trans, root, path, min_data_size, empty,
+				right, free_space, left_nritems, min_slot);
 out_unlock:
 	btrfs_tree_unlock(right);
 	free_extent_buffer(right);
@@ -2503,12 +2534,17 @@ out_unlock:
 /*
  * push some data in the path leaf to the left, trying to free up at
  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
+ *
+ * max_slot can put a limit on how far into the leaf we'll push items.  The
+ * item at 'max_slot' won't be touched.  Use (u32)-1 to make us do all the
+ * items
  */
 static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
 				     struct btrfs_root *root,
 				     struct btrfs_path *path, int data_size,
 				     int empty, struct extent_buffer *left,
-				     int free_space, int right_nritems)
+				     int free_space, u32 right_nritems,
+				     u32 max_slot)
 {
 	struct btrfs_disk_key disk_key;
 	struct extent_buffer *right = path->nodes[0];
@@ -2527,9 +2563,9 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
 	slot = path->slots[1];
 
 	if (empty)
-		nr = right_nritems;
+		nr = min(right_nritems, max_slot);
 	else
-		nr = right_nritems - 1;
+		nr = min(right_nritems - 1, max_slot);
 
 	for (i = 0; i < nr; i++) {
 		item = btrfs_item_nr(right, i);
@@ -2659,6 +2695,8 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
 	btrfs_mark_buffer_dirty(left);
 	if (right_nritems)
 		btrfs_mark_buffer_dirty(right);
+	else
+		clean_tree_block(trans, root, right);
 
 	btrfs_item_key(right, &disk_key, 0);
 	wret = fixup_low_keys(trans, root, path, &disk_key, 1);
@@ -2668,8 +2706,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
 	/* then fixup the leaf pointer in the path */
 	if (path->slots[0] < push_items) {
 		path->slots[0] += old_left_nritems;
-		if (btrfs_header_nritems(path->nodes[0]) == 0)
-			clean_tree_block(trans, root, path->nodes[0]);
 		btrfs_tree_unlock(path->nodes[0]);
 		free_extent_buffer(path->nodes[0]);
 		path->nodes[0] = left;
@@ -2690,10 +2726,14 @@ out:
 /*
  * push some data in the path leaf to the left, trying to free up at
  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
+ *
+ * max_slot can put a limit on how far into the leaf we'll push items.  The
+ * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
+ * items
  */
 static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
-			  *root, struct btrfs_path *path, int data_size,
-			  int empty)
+			  *root, struct btrfs_path *path, int min_data_size,
+			  int data_size, int empty, u32 max_slot)
 {
 	struct extent_buffer *right = path->nodes[0];
 	struct extent_buffer *left;
@@ -2739,8 +2779,9 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 		goto out;
 	}
 
-	return __push_leaf_left(trans, root, path, data_size,
-			       empty, left, free_space, right_nritems);
+	return __push_leaf_left(trans, root, path, min_data_size,
+			       empty, left, free_space, right_nritems,
+			       max_slot);
 out:
 	btrfs_tree_unlock(left);
 	free_extent_buffer(left);
@@ -2833,6 +2874,64 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans,
 }
 
 /*
+ * double splits happen when we need to insert a big item in the middle
+ * of a leaf.  A double split can leave us with 3 mostly empty leaves:
+ * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
+ *          A                 B                 C
+ *
+ * We avoid this by trying to push the items on either side of our target
+ * into the adjacent leaves.  If all goes well we can avoid the double split
+ * completely.
+ */
+static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
+					  struct btrfs_root *root,
+					  struct btrfs_path *path,
+					  int data_size)
+{
+	int ret;
+	int progress = 0;
+	int slot;
+	u32 nritems;
+
+	slot = path->slots[0];
+
+	/*
+	 * try to push all the items after our slot into the
+	 * right leaf
+	 */
+	ret = push_leaf_right(trans, root, path, 1, data_size, 0, slot);
+	if (ret < 0)
+		return ret;
+
+	if (ret == 0)
+		progress++;
+
+	nritems = btrfs_header_nritems(path->nodes[0]);
+	/*
+	 * our goal is to get our slot at the start or end of a leaf.  If
+	 * we've done so we're done
+	 */
+	if (path->slots[0] == 0 || path->slots[0] == nritems)
+		return 0;
+
+	if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
+		return 0;
+
+	/* try to push all the items before our slot into the next leaf */
+	slot = path->slots[0];
+	ret = push_leaf_left(trans, root, path, 1, data_size, 0, slot);
+	if (ret < 0)
+		return ret;
+
+	if (ret == 0)
+		progress++;
+
+	if (progress)
+		return 0;
+	return 1;
+}
+
+/*
  * split the path's leaf in two, making sure there is at least data_size
  * available for the resulting leaf level of the path.
  *
@@ -2854,6 +2953,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
 	int wret;
 	int split;
 	int num_doubles = 0;
+	int tried_avoid_double = 0;
 
 	l = path->nodes[0];
 	slot = path->slots[0];
@@ -2862,12 +2962,14 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
 		return -EOVERFLOW;
 
 	/* first try to make some room by pushing left and right */
-	if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) {
-		wret = push_leaf_right(trans, root, path, data_size, 0);
+	if (data_size) {
+		wret = push_leaf_right(trans, root, path, data_size,
+				       data_size, 0, 0);
 		if (wret < 0)
 			return wret;
 		if (wret) {
-			wret = push_leaf_left(trans, root, path, data_size, 0);
+			wret = push_leaf_left(trans, root, path, data_size,
+					      data_size, 0, (u32)-1);
 			if (wret < 0)
 				return wret;
 		}
@@ -2901,6 +3003,8 @@ again:
 				if (mid != nritems &&
 				    leaf_space_used(l, mid, nritems - mid) +
 				    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
+					if (data_size && !tried_avoid_double)
+						goto push_for_double;
 					split = 2;
 				}
 			}
@@ -2917,6 +3021,8 @@ again:
 				if (mid != nritems &&
 				    leaf_space_used(l, mid, nritems - mid) +
 				    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
+					if (data_size && !tried_avoid_double)
+						goto push_for_double;
 					split = 2 ;
 				}
 			}
@@ -2931,10 +3037,10 @@ again:
 	right = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
 					root->root_key.objectid,
 					&disk_key, 0, l->start, 0);
-	if (IS_ERR(right)) {
-		BUG_ON(1);
+	if (IS_ERR(right))
 		return PTR_ERR(right);
-	}
+
+	root_add_used(root, root->leafsize);
 
 	memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
 	btrfs_set_header_bytenr(right, right->start);
@@ -2997,6 +3103,13 @@ again:
 	}
 
 	return ret;
+
+push_for_double:
+	push_for_double_split(trans, root, path, data_size);
+	tried_avoid_double = 1;
+	if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
+		return 0;
+	goto again;
 }
 
 static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
@@ -3040,6 +3153,10 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
 	if (ret > 0 || item_size != btrfs_item_size_nr(leaf, path->slots[0]))
 		goto err;
 
+	/* the leaf has  changed, it now has room.  return now */
+	if (btrfs_leaf_free_space(root, path->nodes[0]) >= ins_len)
+		goto err;
+
 	if (key.type == BTRFS_EXTENT_DATA_KEY) {
 		fi = btrfs_item_ptr(leaf, path->slots[0],
 				    struct btrfs_file_extent_item);
@@ -3049,7 +3166,8 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
 
 	btrfs_set_path_blocking(path);
 	ret = split_leaf(trans, root, &key, path, ins_len, 1);
-	BUG_ON(ret);
+	if (ret)
+		goto err;
 
 	path->keep_locks = 0;
 	btrfs_unlock_up_safe(path, 1);
@@ -3791,9 +3909,10 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
 	 */
 	btrfs_unlock_up_safe(path, 0);
 
-	ret = btrfs_free_tree_block(trans, root, leaf->start, leaf->len,
-				    0, root->root_key.objectid, 0);
-	return ret;
+	root_sub_used(root, leaf->len);
+
+	btrfs_free_tree_block(trans, root, leaf, 0, 1);
+	return 0;
 }
 /*
  * delete the item at the leaf level in path.  If that empties
@@ -3860,6 +3979,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		if (leaf == root->node) {
 			btrfs_set_header_level(leaf, 0);
 		} else {
+			btrfs_set_path_blocking(path);
+			clean_tree_block(trans, root, leaf);
 			ret = btrfs_del_leaf(trans, root, path, leaf);
 			BUG_ON(ret);
 		}
@@ -3885,13 +4006,15 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 			extent_buffer_get(leaf);
 
 			btrfs_set_path_blocking(path);
-			wret = push_leaf_left(trans, root, path, 1, 1);
+			wret = push_leaf_left(trans, root, path, 1, 1,
+					      1, (u32)-1);
 			if (wret < 0 && wret != -ENOSPC)
 				ret = wret;
 
 			if (path->nodes[0] == leaf &&
 			    btrfs_header_nritems(leaf)) {
-				wret = push_leaf_right(trans, root, path, 1, 1);
+				wret = push_leaf_right(trans, root, path, 1,
+						       1, 1, 0);
 				if (wret < 0 && wret != -ENOSPC)
 					ret = wret;
 			}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2aa8ec6a0981..29c20092847e 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -26,6 +26,7 @@
 #include <linux/completion.h>
 #include <linux/backing-dev.h>
 #include <linux/wait.h>
+#include <linux/slab.h>
 #include <asm/kmap_types.h>
 #include "extent_io.h"
 #include "extent_map.h"
@@ -33,6 +34,7 @@
 
 struct btrfs_trans_handle;
 struct btrfs_transaction;
+struct btrfs_pending_snapshot;
 extern struct kmem_cache *btrfs_trans_handle_cachep;
 extern struct kmem_cache *btrfs_transaction_cachep;
 extern struct kmem_cache *btrfs_bit_radix_cachep;
@@ -373,11 +375,13 @@ struct btrfs_super_block {
  * ones specified below then we will fail to mount
  */
 #define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF	(1ULL << 0)
+#define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL	(2ULL << 0)
 
 #define BTRFS_FEATURE_COMPAT_SUPP		0ULL
 #define BTRFS_FEATURE_COMPAT_RO_SUPP		0ULL
 #define BTRFS_FEATURE_INCOMPAT_SUPP		\
-	BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF
+	(BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF |	\
+	 BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)
 
 /*
  * A leaf is full of items. offset and size tell us where to find
@@ -660,6 +664,7 @@ struct btrfs_csum_item {
 #define BTRFS_BLOCK_GROUP_RAID1    (1 << 4)
 #define BTRFS_BLOCK_GROUP_DUP	   (1 << 5)
 #define BTRFS_BLOCK_GROUP_RAID10   (1 << 6)
+#define BTRFS_NR_RAID_TYPES	   5
 
 struct btrfs_block_group_item {
 	__le64 used;
@@ -671,42 +676,46 @@ struct btrfs_space_info {
 	u64 flags;
 
 	u64 total_bytes;	/* total bytes in the space */
-	u64 bytes_used;		/* total bytes used on disk */
+	u64 bytes_used;		/* total bytes used,
+				   this does't take mirrors into account */
 	u64 bytes_pinned;	/* total bytes pinned, will be freed when the
 				   transaction finishes */
 	u64 bytes_reserved;	/* total bytes the allocator has reserved for
 				   current allocations */
 	u64 bytes_readonly;	/* total bytes that are read only */
-	u64 bytes_super;	/* total bytes reserved for the super blocks */
-	u64 bytes_root;		/* the number of bytes needed to commit a
-				   transaction */
+
 	u64 bytes_may_use;	/* number of bytes that may be used for
 				   delalloc/allocations */
-	u64 bytes_delalloc;	/* number of bytes currently reserved for
-				   delayed allocation */
+	u64 disk_used;		/* total bytes used on disk */
 
 	int full;		/* indicates that we cannot allocate any more
 				   chunks for this space */
 	int force_alloc;	/* set if we need to force a chunk alloc for
 				   this space */
-	int force_delalloc;	/* make people start doing filemap_flush until
-				   we're under a threshold */
 
 	struct list_head list;
 
-	/* for controlling how we free up space for allocations */
-	wait_queue_head_t allocate_wait;
-	wait_queue_head_t flush_wait;
-	int allocating_chunk;
-	int flushing;
-
 	/* for block groups in our same type */
-	struct list_head block_groups;
+	struct list_head block_groups[BTRFS_NR_RAID_TYPES];
 	spinlock_t lock;
 	struct rw_semaphore groups_sem;
 	atomic_t caching_threads;
 };
 
+struct btrfs_block_rsv {
+	u64 size;
+	u64 reserved;
+	u64 freed[2];
+	struct btrfs_space_info *space_info;
+	struct list_head list;
+	spinlock_t lock;
+	atomic_t usage;
+	unsigned int priority:8;
+	unsigned int durable:1;
+	unsigned int refill_used:1;
+	unsigned int full:1;
+};
+
 /*
  * free clusters are used to claim free space in relatively large chunks,
  * allowing us to do less seeky writes.  They are used for all metadata
@@ -757,6 +766,7 @@ struct btrfs_block_group_cache {
 	spinlock_t lock;
 	u64 pinned;
 	u64 reserved;
+	u64 reserved_pinned;
 	u64 bytes_super;
 	u64 flags;
 	u64 sectorsize;
@@ -822,6 +832,22 @@ struct btrfs_fs_info {
 	/* logical->physical extent mapping */
 	struct btrfs_mapping_tree mapping_tree;
 
+	/* block reservation for extent, checksum and root tree */
+	struct btrfs_block_rsv global_block_rsv;
+	/* block reservation for delay allocation */
+	struct btrfs_block_rsv delalloc_block_rsv;
+	/* block reservation for metadata operations */
+	struct btrfs_block_rsv trans_block_rsv;
+	/* block reservation for chunk tree */
+	struct btrfs_block_rsv chunk_block_rsv;
+
+	struct btrfs_block_rsv empty_block_rsv;
+
+	/* list of block reservations that cross multiple transactions */
+	struct list_head durable_block_rsv_list;
+
+	struct mutex durable_block_rsv_mutex;
+
 	u64 generation;
 	u64 last_trans_committed;
 
@@ -832,7 +858,6 @@ struct btrfs_fs_info {
 	u64 last_trans_log_full_commit;
 	u64 open_ioctl_trans;
 	unsigned long mount_opt;
-	u64 max_extent;
 	u64 max_inline;
 	u64 alloc_start;
 	struct btrfs_transaction *running_transaction;
@@ -925,7 +950,6 @@ struct btrfs_fs_info {
 	struct btrfs_workers endio_meta_write_workers;
 	struct btrfs_workers endio_write_workers;
 	struct btrfs_workers submit_workers;
-	struct btrfs_workers enospc_workers;
 	/*
 	 * fixup workers take dirty pages that didn't properly go through
 	 * the cow mechanism and make them safe to write.  It happens
@@ -941,6 +965,7 @@ struct btrfs_fs_info {
 	int do_barriers;
 	int closing;
 	int log_root_recovering;
+	int enospc_unlink;
 
 	u64 total_pinned;
 
@@ -1010,6 +1035,9 @@ struct btrfs_root {
 	struct completion kobj_unregister;
 	struct mutex objectid_mutex;
 
+	spinlock_t accounting_lock;
+	struct btrfs_block_rsv *block_rsv;
+
 	struct mutex log_mutex;
 	wait_queue_head_t log_writer_wait;
 	wait_queue_head_t log_commit_wait[2];
@@ -1041,7 +1069,6 @@ struct btrfs_root {
 	int ref_cows;
 	int track_dirty;
 	int in_radix;
-	int clean_orphans;
 
 	u64 defrag_trans_start;
 	struct btrfs_key defrag_progress;
@@ -1055,8 +1082,11 @@ struct btrfs_root {
 
 	struct list_head root_list;
 
-	spinlock_t list_lock;
+	spinlock_t orphan_lock;
 	struct list_head orphan_list;
+	struct btrfs_block_rsv *orphan_block_rsv;
+	int orphan_item_inserted;
+	int orphan_cleanup_state;
 
 	spinlock_t inode_lock;
 	/* red-black tree that keeps track of in-memory inodes */
@@ -1182,7 +1212,6 @@ struct btrfs_root {
 #define BTRFS_INODE_NOATIME		(1 << 9)
 #define BTRFS_INODE_DIRSYNC		(1 << 10)
 
-
 /* some macros to generate set/get funcs for the struct fields.  This
  * assumes there is a lefoo_to_cpu for every type, so lets make a simple
  * one for u8:
@@ -1842,7 +1871,7 @@ BTRFS_SETGET_STACK_FUNCS(super_num_devices, struct btrfs_super_block,
 BTRFS_SETGET_STACK_FUNCS(super_compat_flags, struct btrfs_super_block,
 			 compat_flags, 64);
 BTRFS_SETGET_STACK_FUNCS(super_compat_ro_flags, struct btrfs_super_block,
-			 compat_flags, 64);
+			 compat_ro_flags, 64);
 BTRFS_SETGET_STACK_FUNCS(super_incompat_flags, struct btrfs_super_block,
 			 incompat_flags, 64);
 BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block,
@@ -1964,6 +1993,9 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root, unsigned long count);
 int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
+int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
+			     struct btrfs_root *root, u64 bytenr,
+			     u64 num_bytes, u64 *refs, u64 *flags);
 int btrfs_pin_extent(struct btrfs_root *root,
 		     u64 bytenr, u64 num, int reserved);
 int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
@@ -1983,10 +2015,10 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 					u64 parent, u64 root_objectid,
 					struct btrfs_disk_key *key, int level,
 					u64 hint, u64 empty_size);
-int btrfs_free_tree_block(struct btrfs_trans_handle *trans,
-			  struct btrfs_root *root,
-			  u64 bytenr, u32 blocksize,
-			  u64 parent, u64 root_objectid, int level);
+void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *root,
+			   struct extent_buffer *buf,
+			   u64 parent, int last_ref);
 struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
 					    struct btrfs_root *root,
 					    u64 bytenr, u32 blocksize,
@@ -2040,27 +2072,49 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 			   u64 size);
 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root, u64 group_start);
-int btrfs_prepare_block_group_relocation(struct btrfs_root *root,
-				struct btrfs_block_group_cache *group);
-
 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
 void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde);
 void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
-
-int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items);
-int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items);
-int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
-					  struct inode *inode, int num_items);
-int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
-					struct inode *inode, int num_items);
-int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
-				u64 bytes);
-void btrfs_free_reserved_data_space(struct btrfs_root *root,
-				    struct inode *inode, u64 bytes);
-void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
-				 u64 bytes);
-void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
-			      u64 bytes);
+int btrfs_check_data_free_space(struct inode *inode, u64 bytes);
+void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
+int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root,
+				int num_items, int *retries);
+void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root);
+int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
+				  struct inode *inode);
+void btrfs_orphan_release_metadata(struct inode *inode);
+int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
+				struct btrfs_pending_snapshot *pending);
+int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes);
+void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes);
+int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes);
+void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes);
+void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv);
+struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root);
+void btrfs_free_block_rsv(struct btrfs_root *root,
+			  struct btrfs_block_rsv *rsv);
+void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
+				 struct btrfs_block_rsv *rsv);
+int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
+			struct btrfs_root *root,
+			struct btrfs_block_rsv *block_rsv,
+			u64 num_bytes, int *retries);
+int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root,
+			  struct btrfs_block_rsv *block_rsv,
+			  u64 min_reserved, int min_factor);
+int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
+			    struct btrfs_block_rsv *dst_rsv,
+			    u64 num_bytes);
+void btrfs_block_rsv_release(struct btrfs_root *root,
+			     struct btrfs_block_rsv *block_rsv,
+			     u64 num_bytes);
+int btrfs_set_block_group_ro(struct btrfs_root *root,
+			     struct btrfs_block_group_cache *cache);
+int btrfs_set_block_group_rw(struct btrfs_root *root,
+			     struct btrfs_block_group_cache *cache);
 /* ctree.c */
 int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
 		     int level, int *slot);
@@ -2151,7 +2205,8 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
 int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf);
-int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref);
+int btrfs_drop_snapshot(struct btrfs_root *root,
+			struct btrfs_block_rsv *block_rsv, int update_ref);
 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root,
 			struct extent_buffer *node,
@@ -2244,6 +2299,12 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root,
 			   const char *name, int name_len,
 			   u64 inode_objectid, u64 ref_objectid, u64 *index);
+struct btrfs_inode_ref *
+btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans,
+			struct btrfs_root *root,
+			struct btrfs_path *path,
+			const char *name, int name_len,
+			u64 inode_objectid, u64 ref_objectid, int mod);
 int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
 			     struct btrfs_path *path, u64 objectid);
@@ -2256,6 +2317,8 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
 		    struct btrfs_root *root, u64 bytenr, u64 len);
 int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
 			  struct bio *bio, u32 *dst);
+int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
+			      struct bio *bio, u64 logical_offset, u32 *dst);
 int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
 			     u64 objectid, u64 pos,
@@ -2310,7 +2373,9 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 			       u32 min_type);
 
 int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
-int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end);
+int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput);
+int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
+			      struct extent_state **cached_state);
 int btrfs_writepages(struct address_space *mapping,
 		     struct writeback_control *wbc);
 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
@@ -2326,7 +2391,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 int btrfs_readpage(struct file *file, struct page *page);
 void btrfs_delete_inode(struct inode *inode);
 void btrfs_put_inode(struct inode *inode);
-int btrfs_write_inode(struct inode *inode, int wait);
+int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc);
 void btrfs_dirty_inode(struct inode *inode);
 struct inode *btrfs_alloc_inode(struct super_block *sb);
 void btrfs_destroy_inode(struct inode *inode);
@@ -2335,7 +2400,7 @@ int btrfs_init_cachep(void);
 void btrfs_destroy_cachep(void);
 long btrfs_ioctl_trans_end(struct file *file);
 struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
-			 struct btrfs_root *root);
+			 struct btrfs_root *root, int *was_new);
 int btrfs_commit_write(struct file *file, struct page *page,
 		       unsigned from, unsigned to);
 struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
@@ -2347,10 +2412,20 @@ int btrfs_update_inode(struct btrfs_trans_handle *trans,
 int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
 int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
 void btrfs_orphan_cleanup(struct btrfs_root *root);
+void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
+				struct btrfs_pending_snapshot *pending,
+				u64 *bytes_to_reserve);
+void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
+				struct btrfs_pending_snapshot *pending);
+void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
+			      struct btrfs_root *root);
 int btrfs_cont_expand(struct inode *inode, loff_t size);
 int btrfs_invalidate_inodes(struct btrfs_root *root);
 void btrfs_add_delayed_iput(struct inode *inode);
 void btrfs_run_delayed_iputs(struct btrfs_root *root);
+int btrfs_prealloc_file_range(struct inode *inode, int mode,
+			      u64 start, u64 num_bytes, u64 min_size,
+			      loff_t actual_len, u64 *alloc_hint);
 extern const struct dentry_operations btrfs_dentry_operations;
 
 /* ioctl.c */
@@ -2359,7 +2434,7 @@ void btrfs_update_iflags(struct inode *inode);
 void btrfs_inherit_iflags(struct inode *inode, struct inode *dir);
 
 /* file.c */
-int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync);
+int btrfs_sync_file(struct file *file, int datasync);
 int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 			    int skip_pinned);
 int btrfs_check_file(struct btrfs_root *root, struct inode *inode);
@@ -2386,7 +2461,6 @@ void btrfs_sysfs_del_super(struct btrfs_fs_info *root);
 ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
 
 /* super.c */
-u64 btrfs_parse_size(char *str);
 int btrfs_parse_options(struct btrfs_root *root, char *options);
 int btrfs_sync_fs(struct super_block *sb, int wait);
 
@@ -2408,4 +2482,12 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
 			    struct btrfs_root *root);
 int btrfs_recover_relocation(struct btrfs_root *root);
 int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len);
+void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *root, struct extent_buffer *buf,
+			   struct extent_buffer *cow);
+void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans,
+			      struct btrfs_pending_snapshot *pending,
+			      u64 *bytes_to_reserve);
+void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
+			      struct btrfs_pending_snapshot *pending);
 #endif
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 84e6781413b1..e807b143b857 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -17,6 +17,7 @@
  */
 
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/sort.h>
 #include "ctree.h"
 #include "delayed-ref.h"
@@ -318,107 +319,6 @@ out:
 }
 
 /*
- * helper function to lookup reference count and flags of extent.
- *
- * the head node for delayed ref is used to store the sum of all the
- * reference count modifications queued up in the rbtree. the head
- * node may also store the extent flags to set. This way you can check
- * to see what the reference count and extent flags would be if all of
- * the delayed refs are not processed.
- */
-int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
-			     struct btrfs_root *root, u64 bytenr,
-			     u64 num_bytes, u64 *refs, u64 *flags)
-{
-	struct btrfs_delayed_ref_node *ref;
-	struct btrfs_delayed_ref_head *head;
-	struct btrfs_delayed_ref_root *delayed_refs;
-	struct btrfs_path *path;
-	struct btrfs_extent_item *ei;
-	struct extent_buffer *leaf;
-	struct btrfs_key key;
-	u32 item_size;
-	u64 num_refs;
-	u64 extent_flags;
-	int ret;
-
-	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
-
-	key.objectid = bytenr;
-	key.type = BTRFS_EXTENT_ITEM_KEY;
-	key.offset = num_bytes;
-	delayed_refs = &trans->transaction->delayed_refs;
-again:
-	ret = btrfs_search_slot(trans, root->fs_info->extent_root,
-				&key, path, 0, 0);
-	if (ret < 0)
-		goto out;
-
-	if (ret == 0) {
-		leaf = path->nodes[0];
-		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
-		if (item_size >= sizeof(*ei)) {
-			ei = btrfs_item_ptr(leaf, path->slots[0],
-					    struct btrfs_extent_item);
-			num_refs = btrfs_extent_refs(leaf, ei);
-			extent_flags = btrfs_extent_flags(leaf, ei);
-		} else {
-#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
-			struct btrfs_extent_item_v0 *ei0;
-			BUG_ON(item_size != sizeof(*ei0));
-			ei0 = btrfs_item_ptr(leaf, path->slots[0],
-					     struct btrfs_extent_item_v0);
-			num_refs = btrfs_extent_refs_v0(leaf, ei0);
-			/* FIXME: this isn't correct for data */
-			extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
-#else
-			BUG();
-#endif
-		}
-		BUG_ON(num_refs == 0);
-	} else {
-		num_refs = 0;
-		extent_flags = 0;
-		ret = 0;
-	}
-
-	spin_lock(&delayed_refs->lock);
-	ref = find_ref_head(&delayed_refs->root, bytenr, NULL);
-	if (ref) {
-		head = btrfs_delayed_node_to_head(ref);
-		if (!mutex_trylock(&head->mutex)) {
-			atomic_inc(&ref->refs);
-			spin_unlock(&delayed_refs->lock);
-
-			btrfs_release_path(root->fs_info->extent_root, path);
-
-			mutex_lock(&head->mutex);
-			mutex_unlock(&head->mutex);
-			btrfs_put_delayed_ref(ref);
-			goto again;
-		}
-		if (head->extent_op && head->extent_op->update_flags)
-			extent_flags |= head->extent_op->flags_to_set;
-		else
-			BUG_ON(num_refs == 0);
-
-		num_refs += ref->ref_mod;
-		mutex_unlock(&head->mutex);
-	}
-	WARN_ON(num_refs == 0);
-	if (refs)
-		*refs = num_refs;
-	if (flags)
-		*flags = extent_flags;
-out:
-	spin_unlock(&delayed_refs->lock);
-	btrfs_free_path(path);
-	return ret;
-}
-
-/*
  * helper function to update an extent delayed ref in the
  * rbtree.  existing and update must both have the same
  * bytenr and parent
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index f6fc67ddad36..50e3cf92fbda 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -167,9 +167,6 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
 struct btrfs_delayed_ref_head *
 btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr);
 int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr);
-int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
-			     struct btrfs_root *root, u64 bytenr,
-			     u64 num_bytes, u64 *refs, u64 *flags);
 int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
 			  u64 bytenr, u64 num_bytes, u64 orig_parent,
 			  u64 parent, u64 orig_ref_root, u64 ref_root,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 87b25543d7d1..34f7c375567e 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -27,6 +27,7 @@
 #include <linux/kthread.h>
 #include <linux/freezer.h>
 #include <linux/crc32c.h>
+#include <linux/slab.h>
 #include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
@@ -43,8 +44,6 @@ static struct extent_io_ops btree_extent_io_ops;
 static void end_workqueue_fn(struct btrfs_work *work);
 static void free_fs_root(struct btrfs_root *root);
 
-static atomic_t btrfs_bdi_num = ATOMIC_INIT(0);
-
 /*
  * end_io_wq structs are used to do processing in task context when an IO is
  * complete.  This is used during reads to verify checksums, and it is used
@@ -75,6 +74,11 @@ struct async_submit_bio {
 	int rw;
 	int mirror_num;
 	unsigned long bio_flags;
+	/*
+	 * bio_offset is optional, can be used if the pages in the bio
+	 * can't tell us where in the file the bio should go
+	 */
+	u64 bio_offset;
 	struct btrfs_work work;
 };
 
@@ -263,13 +267,15 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
 static int verify_parent_transid(struct extent_io_tree *io_tree,
 				 struct extent_buffer *eb, u64 parent_transid)
 {
+	struct extent_state *cached_state = NULL;
 	int ret;
 
 	if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
 		return 0;
 
-	lock_extent(io_tree, eb->start, eb->start + eb->len - 1, GFP_NOFS);
-	if (extent_buffer_uptodate(io_tree, eb) &&
+	lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
+			 0, &cached_state, GFP_NOFS);
+	if (extent_buffer_uptodate(io_tree, eb, cached_state) &&
 	    btrfs_header_generation(eb) == parent_transid) {
 		ret = 0;
 		goto out;
@@ -282,10 +288,10 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
 		       (unsigned long long)btrfs_header_generation(eb));
 	}
 	ret = 1;
-	clear_extent_buffer_uptodate(io_tree, eb);
+	clear_extent_buffer_uptodate(io_tree, eb, &cached_state);
 out:
-	unlock_extent(io_tree, eb->start, eb->start + eb->len - 1,
-		      GFP_NOFS);
+	unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1,
+			     &cached_state, GFP_NOFS);
 	return ret;
 }
 
@@ -533,7 +539,8 @@ static void run_one_async_start(struct btrfs_work *work)
 	async = container_of(work, struct  async_submit_bio, work);
 	fs_info = BTRFS_I(async->inode)->root->fs_info;
 	async->submit_bio_start(async->inode, async->rw, async->bio,
-			       async->mirror_num, async->bio_flags);
+			       async->mirror_num, async->bio_flags,
+			       async->bio_offset);
 }
 
 static void run_one_async_done(struct btrfs_work *work)
@@ -555,7 +562,8 @@ static void run_one_async_done(struct btrfs_work *work)
 		wake_up(&fs_info->async_submit_wait);
 
 	async->submit_bio_done(async->inode, async->rw, async->bio,
-			       async->mirror_num, async->bio_flags);
+			       async->mirror_num, async->bio_flags,
+			       async->bio_offset);
 }
 
 static void run_one_async_free(struct btrfs_work *work)
@@ -569,6 +577,7 @@ static void run_one_async_free(struct btrfs_work *work)
 int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 			int rw, struct bio *bio, int mirror_num,
 			unsigned long bio_flags,
+			u64 bio_offset,
 			extent_submit_bio_hook_t *submit_bio_start,
 			extent_submit_bio_hook_t *submit_bio_done)
 {
@@ -591,6 +600,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 
 	async->work.flags = 0;
 	async->bio_flags = bio_flags;
+	async->bio_offset = bio_offset;
 
 	atomic_inc(&fs_info->nr_async_submits);
 
@@ -626,7 +636,8 @@ static int btree_csum_one_bio(struct bio *bio)
 
 static int __btree_submit_bio_start(struct inode *inode, int rw,
 				    struct bio *bio, int mirror_num,
-				    unsigned long bio_flags)
+				    unsigned long bio_flags,
+				    u64 bio_offset)
 {
 	/*
 	 * when we're called for a write, we're already in the async
@@ -637,7 +648,8 @@ static int __btree_submit_bio_start(struct inode *inode, int rw,
 }
 
 static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
-				 int mirror_num, unsigned long bio_flags)
+				 int mirror_num, unsigned long bio_flags,
+				 u64 bio_offset)
 {
 	/*
 	 * when we're called for a write, we're already in the async
@@ -647,7 +659,8 @@ static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
 }
 
 static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
-				 int mirror_num, unsigned long bio_flags)
+				 int mirror_num, unsigned long bio_flags,
+				 u64 bio_offset)
 {
 	int ret;
 
@@ -670,6 +683,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 	 */
 	return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
 				   inode, rw, bio, mirror_num, 0,
+				   bio_offset,
 				   __btree_submit_bio_start,
 				   __btree_submit_bio_done);
 }
@@ -893,7 +907,8 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 	root->ref_cows = 0;
 	root->track_dirty = 0;
 	root->in_radix = 0;
-	root->clean_orphans = 0;
+	root->orphan_item_inserted = 0;
+	root->orphan_cleanup_state = 0;
 
 	root->fs_info = fs_info;
 	root->objectid = objectid;
@@ -901,14 +916,17 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 	root->highest_objectid = 0;
 	root->name = NULL;
 	root->in_sysfs = 0;
-	root->inode_tree.rb_node = NULL;
+	root->inode_tree = RB_ROOT;
+	root->block_rsv = NULL;
+	root->orphan_block_rsv = NULL;
 
 	INIT_LIST_HEAD(&root->dirty_list);
 	INIT_LIST_HEAD(&root->orphan_list);
 	INIT_LIST_HEAD(&root->root_list);
 	spin_lock_init(&root->node_lock);
-	spin_lock_init(&root->list_lock);
+	spin_lock_init(&root->orphan_lock);
 	spin_lock_init(&root->inode_lock);
+	spin_lock_init(&root->accounting_lock);
 	mutex_init(&root->objectid_mutex);
 	mutex_init(&root->log_mutex);
 	init_waitqueue_head(&root->log_writer_wait);
@@ -967,42 +985,6 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
 	return 0;
 }
 
-int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
-			     struct btrfs_fs_info *fs_info)
-{
-	struct extent_buffer *eb;
-	struct btrfs_root *log_root_tree = fs_info->log_root_tree;
-	u64 start = 0;
-	u64 end = 0;
-	int ret;
-
-	if (!log_root_tree)
-		return 0;
-
-	while (1) {
-		ret = find_first_extent_bit(&log_root_tree->dirty_log_pages,
-				0, &start, &end, EXTENT_DIRTY | EXTENT_NEW);
-		if (ret)
-			break;
-
-		clear_extent_bits(&log_root_tree->dirty_log_pages, start, end,
-				  EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
-	}
-	eb = fs_info->log_root_tree->node;
-
-	WARN_ON(btrfs_header_level(eb) != 0);
-	WARN_ON(btrfs_header_nritems(eb) != 0);
-
-	ret = btrfs_free_reserved_extent(fs_info->tree_root,
-				eb->start, eb->len);
-	BUG_ON(ret);
-
-	free_extent_buffer(eb);
-	kfree(fs_info->log_root_tree);
-	fs_info->log_root_tree = NULL;
-	return 0;
-}
-
 static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
 					 struct btrfs_fs_info *fs_info)
 {
@@ -1190,19 +1172,23 @@ again:
 	if (root)
 		return root;
 
-	ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid);
-	if (ret == 0)
-		ret = -ENOENT;
-	if (ret < 0)
-		return ERR_PTR(ret);
-
 	root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location);
 	if (IS_ERR(root))
 		return root;
 
-	WARN_ON(btrfs_root_refs(&root->root_item) == 0);
 	set_anon_super(&root->anon_super, NULL);
 
+	if (btrfs_root_refs(&root->root_item) == 0) {
+		ret = -ENOENT;
+		goto fail;
+	}
+
+	ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid);
+	if (ret < 0)
+		goto fail;
+	if (ret == 0)
+		root->orphan_item_inserted = 1;
+
 	ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
 	if (ret)
 		goto fail;
@@ -1211,10 +1197,9 @@ again:
 	ret = radix_tree_insert(&fs_info->fs_roots_radix,
 				(unsigned long)root->root_key.objectid,
 				root);
-	if (ret == 0) {
+	if (ret == 0)
 		root->in_radix = 1;
-		root->clean_orphans = 1;
-	}
+
 	spin_unlock(&fs_info->fs_roots_radix_lock);
 	radix_tree_preload_end();
 	if (ret) {
@@ -1372,19 +1357,11 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
 {
 	int err;
 
-	bdi->name = "btrfs";
 	bdi->capabilities = BDI_CAP_MAP_COPY;
-	err = bdi_init(bdi);
+	err = bdi_setup_and_register(bdi, "btrfs", BDI_CAP_MAP_COPY);
 	if (err)
 		return err;
 
-	err = bdi_register(bdi, NULL, "btrfs-%d",
-				atomic_inc_return(&btrfs_bdi_num));
-	if (err) {
-		bdi_destroy(bdi);
-		return err;
-	}
-
 	bdi->ra_pages	= default_backing_dev_info.ra_pages;
 	bdi->unplug_io_fn	= btrfs_unplug_io_fn;
 	bdi->unplug_io_data	= info;
@@ -1468,10 +1445,6 @@ static int cleaner_kthread(void *arg)
 	struct btrfs_root *root = arg;
 
 	do {
-		smp_mb();
-		if (root->fs_info->closing)
-			break;
-
 		vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
 
 		if (!(root->fs_info->sb->s_flags & MS_RDONLY) &&
@@ -1484,11 +1457,9 @@ static int cleaner_kthread(void *arg)
 		if (freezing(current)) {
 			refrigerator();
 		} else {
-			smp_mb();
-			if (root->fs_info->closing)
-				break;
 			set_current_state(TASK_INTERRUPTIBLE);
-			schedule();
+			if (!kthread_should_stop())
+				schedule();
 			__set_current_state(TASK_RUNNING);
 		}
 	} while (!kthread_should_stop());
@@ -1500,36 +1471,40 @@ static int transaction_kthread(void *arg)
 	struct btrfs_root *root = arg;
 	struct btrfs_trans_handle *trans;
 	struct btrfs_transaction *cur;
+	u64 transid;
 	unsigned long now;
 	unsigned long delay;
 	int ret;
 
 	do {
-		smp_mb();
-		if (root->fs_info->closing)
-			break;
-
 		delay = HZ * 30;
 		vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
 		mutex_lock(&root->fs_info->transaction_kthread_mutex);
 
-		mutex_lock(&root->fs_info->trans_mutex);
+		spin_lock(&root->fs_info->new_trans_lock);
 		cur = root->fs_info->running_transaction;
 		if (!cur) {
-			mutex_unlock(&root->fs_info->trans_mutex);
+			spin_unlock(&root->fs_info->new_trans_lock);
 			goto sleep;
 		}
 
 		now = get_seconds();
-		if (now < cur->start_time || now - cur->start_time < 30) {
-			mutex_unlock(&root->fs_info->trans_mutex);
+		if (!cur->blocked &&
+		    (now < cur->start_time || now - cur->start_time < 30)) {
+			spin_unlock(&root->fs_info->new_trans_lock);
 			delay = HZ * 5;
 			goto sleep;
 		}
-		mutex_unlock(&root->fs_info->trans_mutex);
-		trans = btrfs_start_transaction(root, 1);
-		ret = btrfs_commit_transaction(trans, root);
+		transid = cur->transid;
+		spin_unlock(&root->fs_info->new_trans_lock);
 
+		trans = btrfs_join_transaction(root, 1);
+		if (transid == trans->transid) {
+			ret = btrfs_commit_transaction(trans, root);
+			BUG_ON(ret);
+		} else {
+			btrfs_end_transaction(trans, root);
+		}
 sleep:
 		wake_up_process(root->fs_info->cleaner_kthread);
 		mutex_unlock(&root->fs_info->transaction_kthread_mutex);
@@ -1537,10 +1512,10 @@ sleep:
 		if (freezing(current)) {
 			refrigerator();
 		} else {
-			if (root->fs_info->closing)
-				break;
 			set_current_state(TASK_INTERRUPTIBLE);
-			schedule_timeout(delay);
+			if (!kthread_should_stop() &&
+			    !btrfs_transaction_blocked(root->fs_info))
+				schedule_timeout(delay);
 			__set_current_state(TASK_RUNNING);
 		}
 	} while (!kthread_should_stop());
@@ -1627,12 +1602,18 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
 	INIT_LIST_HEAD(&fs_info->space_info);
 	btrfs_mapping_init(&fs_info->mapping_tree);
+	btrfs_init_block_rsv(&fs_info->global_block_rsv);
+	btrfs_init_block_rsv(&fs_info->delalloc_block_rsv);
+	btrfs_init_block_rsv(&fs_info->trans_block_rsv);
+	btrfs_init_block_rsv(&fs_info->chunk_block_rsv);
+	btrfs_init_block_rsv(&fs_info->empty_block_rsv);
+	INIT_LIST_HEAD(&fs_info->durable_block_rsv_list);
+	mutex_init(&fs_info->durable_block_rsv_mutex);
 	atomic_set(&fs_info->nr_async_submits, 0);
 	atomic_set(&fs_info->async_delalloc_pages, 0);
 	atomic_set(&fs_info->async_submit_draining, 0);
 	atomic_set(&fs_info->nr_async_bios, 0);
 	fs_info->sb = sb;
-	fs_info->max_extent = (u64)-1;
 	fs_info->max_inline = 8192 * 1024;
 	fs_info->metadata_ratio = 0;
 
@@ -1673,7 +1654,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	insert_inode_hash(fs_info->btree_inode);
 
 	spin_lock_init(&fs_info->block_group_cache_lock);
-	fs_info->block_group_cache_tree.rb_node = NULL;
+	fs_info->block_group_cache_tree = RB_ROOT;
 
 	extent_io_tree_init(&fs_info->freed_extents[0],
 			     fs_info->btree_inode->i_mapping, GFP_NOFS);
@@ -1767,9 +1748,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 			   min_t(u64, fs_devices->num_devices,
 			   fs_info->thread_pool_size),
 			   &fs_info->generic_worker);
-	btrfs_init_workers(&fs_info->enospc_workers, "enospc",
-			   fs_info->thread_pool_size,
-			   &fs_info->generic_worker);
 
 	/* a higher idle thresh on the submit workers makes it much more
 	 * likely that bios will be send down in a sane order to the
@@ -1817,7 +1795,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	btrfs_start_workers(&fs_info->endio_meta_workers, 1);
 	btrfs_start_workers(&fs_info->endio_meta_write_workers, 1);
 	btrfs_start_workers(&fs_info->endio_write_workers, 1);
-	btrfs_start_workers(&fs_info->enospc_workers, 1);
 
 	fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
 	fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
@@ -1920,17 +1897,22 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
 	csum_root->track_dirty = 1;
 
-	btrfs_read_block_groups(extent_root);
-
 	fs_info->generation = generation;
 	fs_info->last_trans_committed = generation;
 	fs_info->data_alloc_profile = (u64)-1;
 	fs_info->metadata_alloc_profile = (u64)-1;
 	fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
+
+	ret = btrfs_read_block_groups(extent_root);
+	if (ret) {
+		printk(KERN_ERR "Failed to read block groups: %d\n", ret);
+		goto fail_block_groups;
+	}
+
 	fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
 					       "btrfs-cleaner");
 	if (IS_ERR(fs_info->cleaner_kthread))
-		goto fail_csum_root;
+		goto fail_block_groups;
 
 	fs_info->transaction_kthread = kthread_run(transaction_kthread,
 						   tree_root,
@@ -1959,8 +1941,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 		     btrfs_level_size(tree_root,
 				      btrfs_super_log_root_level(disk_super));
 
-		log_tree_root = kzalloc(sizeof(struct btrfs_root),
-						      GFP_NOFS);
+		log_tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
+		if (!log_tree_root) {
+			err = -ENOMEM;
+			goto fail_trans_kthread;
+		}
 
 		__setup_root(nodesize, leafsize, sectorsize, stripesize,
 			     log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID);
@@ -1981,8 +1966,16 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	BUG_ON(ret);
 
 	if (!(sb->s_flags & MS_RDONLY)) {
-		ret = btrfs_recover_relocation(tree_root);
+		ret = btrfs_cleanup_fs_roots(fs_info);
 		BUG_ON(ret);
+
+		ret = btrfs_recover_relocation(tree_root);
+		if (ret < 0) {
+			printk(KERN_WARNING
+			       "btrfs: failed to recover relocation\n");
+			err = -EINVAL;
+			goto fail_trans_kthread;
+		}
 	}
 
 	location.objectid = BTRFS_FS_TREE_OBJECTID;
@@ -1992,6 +1985,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
 	if (!fs_info->fs_root)
 		goto fail_trans_kthread;
+	if (IS_ERR(fs_info->fs_root)) {
+		err = PTR_ERR(fs_info->fs_root);
+		goto fail_trans_kthread;
+	}
 
 	if (!(sb->s_flags & MS_RDONLY)) {
 		down_read(&fs_info->cleanup_work_sem);
@@ -2013,7 +2010,8 @@ fail_cleaner:
 	filemap_write_and_wait(fs_info->btree_inode->i_mapping);
 	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
 
-fail_csum_root:
+fail_block_groups:
+	btrfs_free_block_groups(fs_info);
 	free_extent_buffer(csum_root->node);
 	free_extent_buffer(csum_root->commit_root);
 fail_dev_root:
@@ -2038,7 +2036,6 @@ fail_sb_buffer:
 	btrfs_stop_workers(&fs_info->endio_meta_write_workers);
 	btrfs_stop_workers(&fs_info->endio_write_workers);
 	btrfs_stop_workers(&fs_info->submit_workers);
-	btrfs_stop_workers(&fs_info->enospc_workers);
 fail_iput:
 	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
 	iput(fs_info->btree_inode);
@@ -2403,11 +2400,11 @@ int btrfs_commit_super(struct btrfs_root *root)
 	down_write(&root->fs_info->cleanup_work_sem);
 	up_write(&root->fs_info->cleanup_work_sem);
 
-	trans = btrfs_start_transaction(root, 1);
+	trans = btrfs_join_transaction(root, 1);
 	ret = btrfs_commit_transaction(trans, root);
 	BUG_ON(ret);
 	/* run commit again to drop the original snapshot */
-	trans = btrfs_start_transaction(root, 1);
+	trans = btrfs_join_transaction(root, 1);
 	btrfs_commit_transaction(trans, root);
 	ret = btrfs_write_and_wait_transaction(NULL, root);
 	BUG_ON(ret);
@@ -2424,15 +2421,15 @@ int close_ctree(struct btrfs_root *root)
 	fs_info->closing = 1;
 	smp_mb();
 
-	kthread_stop(root->fs_info->transaction_kthread);
-	kthread_stop(root->fs_info->cleaner_kthread);
-
 	if (!(fs_info->sb->s_flags & MS_RDONLY)) {
 		ret =  btrfs_commit_super(root);
 		if (ret)
 			printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
 	}
 
+	kthread_stop(root->fs_info->transaction_kthread);
+	kthread_stop(root->fs_info->cleaner_kthread);
+
 	fs_info->closing = 2;
 	smp_mb();
 
@@ -2471,7 +2468,6 @@ int close_ctree(struct btrfs_root *root)
 	btrfs_stop_workers(&fs_info->endio_meta_write_workers);
 	btrfs_stop_workers(&fs_info->endio_write_workers);
 	btrfs_stop_workers(&fs_info->submit_workers);
-	btrfs_stop_workers(&fs_info->enospc_workers);
 
 	btrfs_close_devices(fs_info->fs_devices);
 	btrfs_mapping_tree_free(&fs_info->mapping_tree);
@@ -2492,7 +2488,8 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid)
 	int ret;
 	struct inode *btree_inode = buf->first_page->mapping->host;
 
-	ret = extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf);
+	ret = extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf,
+				     NULL);
 	if (!ret)
 		return ret;
 
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index c958ecbc1916..88e825a0bf21 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -87,7 +87,7 @@ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
 			int metadata);
 int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 			int rw, struct bio *bio, int mirror_num,
-			unsigned long bio_flags,
+			unsigned long bio_flags, u64 bio_offset,
 			extent_submit_bio_hook_t *submit_bio_start,
 			extent_submit_bio_hook_t *submit_bio_done);
 
@@ -95,8 +95,6 @@ int btrfs_congested_async(struct btrfs_fs_info *info, int iodone);
 unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info);
 int btrfs_write_tree_block(struct extent_buffer *buf);
 int btrfs_wait_tree_block_writeback(struct extent_buffer *buf);
-int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
-			     struct btrfs_fs_info *fs_info);
 int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
 			     struct btrfs_fs_info *fs_info);
 int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index ba5c3fd5ab8c..951ef09b82f4 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -95,7 +95,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
 	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
 	key.offset = 0;
 
-	inode = btrfs_iget(sb, &key, root);
+	inode = btrfs_iget(sb, &key, root, NULL);
 	if (IS_ERR(inode)) {
 		err = PTR_ERR(inode);
 		goto fail;
@@ -223,7 +223,7 @@ static struct dentry *btrfs_get_parent(struct dentry *child)
 
 	key.type = BTRFS_INODE_ITEM_KEY;
 	key.offset = 0;
-	dentry = d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root));
+	dentry = d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL));
 	if (!IS_ERR(dentry))
 		dentry->d_op = &btrfs_dentry_operations;
 	return dentry;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 432a2da4641e..32d094002a57 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -22,6 +22,7 @@
 #include <linux/sort.h>
 #include <linux/rcupdate.h>
 #include <linux/kthread.h>
+#include <linux/slab.h>
 #include "compat.h"
 #include "hash.h"
 #include "ctree.h"
@@ -34,10 +35,9 @@
 
 static int update_block_group(struct btrfs_trans_handle *trans,
 			      struct btrfs_root *root,
-			      u64 bytenr, u64 num_bytes, int alloc,
-			      int mark_free);
-static int update_reserved_extents(struct btrfs_block_group_cache *cache,
-				   u64 num_bytes, int reserve);
+			      u64 bytenr, u64 num_bytes, int alloc);
+static int update_reserved_bytes(struct btrfs_block_group_cache *cache,
+				 u64 num_bytes, int reserve, int sinfo);
 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root,
 				u64 bytenr, u64 num_bytes, u64 parent,
@@ -60,12 +60,6 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *extent_root, u64 alloc_bytes,
 			  u64 flags, int force);
-static int pin_down_bytes(struct btrfs_trans_handle *trans,
-			  struct btrfs_root *root,
-			  struct btrfs_path *path,
-			  u64 bytenr, u64 num_bytes,
-			  int is_data, int reserved,
-			  struct extent_buffer **must_clean);
 static int find_next_key(struct btrfs_path *path, int level,
 			 struct btrfs_key *key);
 static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
@@ -90,8 +84,12 @@ void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
 
 void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
 {
-	if (atomic_dec_and_test(&cache->count))
+	if (atomic_dec_and_test(&cache->count)) {
+		WARN_ON(cache->pinned > 0);
+		WARN_ON(cache->reserved > 0);
+		WARN_ON(cache->reserved_pinned > 0);
 		kfree(cache);
+	}
 }
 
 /*
@@ -318,7 +316,7 @@ static int caching_kthread(void *data)
 
 	exclude_super_stripes(extent_root, block_group);
 	spin_lock(&block_group->space_info->lock);
-	block_group->space_info->bytes_super += block_group->bytes_super;
+	block_group->space_info->bytes_readonly += block_group->bytes_super;
 	spin_unlock(&block_group->space_info->lock);
 
 	last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
@@ -506,6 +504,9 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
 	struct list_head *head = &info->space_info;
 	struct btrfs_space_info *found;
 
+	flags &= BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_SYSTEM |
+		 BTRFS_BLOCK_GROUP_METADATA;
+
 	rcu_read_lock();
 	list_for_each_entry_rcu(found, head, list) {
 		if (found->flags == flags) {
@@ -609,6 +610,113 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
 }
 
 /*
+ * helper function to lookup reference count and flags of extent.
+ *
+ * the head node for delayed ref is used to store the sum of all the
+ * reference count modifications queued up in the rbtree. the head
+ * node may also store the extent flags to set. This way you can check
+ * to see what the reference count and extent flags would be if all of
+ * the delayed refs are not processed.
+ */
+int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
+			     struct btrfs_root *root, u64 bytenr,
+			     u64 num_bytes, u64 *refs, u64 *flags)
+{
+	struct btrfs_delayed_ref_head *head;
+	struct btrfs_delayed_ref_root *delayed_refs;
+	struct btrfs_path *path;
+	struct btrfs_extent_item *ei;
+	struct extent_buffer *leaf;
+	struct btrfs_key key;
+	u32 item_size;
+	u64 num_refs;
+	u64 extent_flags;
+	int ret;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	key.objectid = bytenr;
+	key.type = BTRFS_EXTENT_ITEM_KEY;
+	key.offset = num_bytes;
+	if (!trans) {
+		path->skip_locking = 1;
+		path->search_commit_root = 1;
+	}
+again:
+	ret = btrfs_search_slot(trans, root->fs_info->extent_root,
+				&key, path, 0, 0);
+	if (ret < 0)
+		goto out_free;
+
+	if (ret == 0) {
+		leaf = path->nodes[0];
+		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+		if (item_size >= sizeof(*ei)) {
+			ei = btrfs_item_ptr(leaf, path->slots[0],
+					    struct btrfs_extent_item);
+			num_refs = btrfs_extent_refs(leaf, ei);
+			extent_flags = btrfs_extent_flags(leaf, ei);
+		} else {
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+			struct btrfs_extent_item_v0 *ei0;
+			BUG_ON(item_size != sizeof(*ei0));
+			ei0 = btrfs_item_ptr(leaf, path->slots[0],
+					     struct btrfs_extent_item_v0);
+			num_refs = btrfs_extent_refs_v0(leaf, ei0);
+			/* FIXME: this isn't correct for data */
+			extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
+#else
+			BUG();
+#endif
+		}
+		BUG_ON(num_refs == 0);
+	} else {
+		num_refs = 0;
+		extent_flags = 0;
+		ret = 0;
+	}
+
+	if (!trans)
+		goto out;
+
+	delayed_refs = &trans->transaction->delayed_refs;
+	spin_lock(&delayed_refs->lock);
+	head = btrfs_find_delayed_ref_head(trans, bytenr);
+	if (head) {
+		if (!mutex_trylock(&head->mutex)) {
+			atomic_inc(&head->node.refs);
+			spin_unlock(&delayed_refs->lock);
+
+			btrfs_release_path(root->fs_info->extent_root, path);
+
+			mutex_lock(&head->mutex);
+			mutex_unlock(&head->mutex);
+			btrfs_put_delayed_ref(&head->node);
+			goto again;
+		}
+		if (head->extent_op && head->extent_op->update_flags)
+			extent_flags |= head->extent_op->flags_to_set;
+		else
+			BUG_ON(num_refs == 0);
+
+		num_refs += head->node.ref_mod;
+		mutex_unlock(&head->mutex);
+	}
+	spin_unlock(&delayed_refs->lock);
+out:
+	WARN_ON(num_refs == 0);
+	if (refs)
+		*refs = num_refs;
+	if (flags)
+		*flags = extent_flags;
+out_free:
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
  * Back reference rules.  Back refs have three main goals:
  *
  * 1) differentiate between all holders of references to an extent so that
@@ -1588,7 +1696,7 @@ static void btrfs_issue_discard(struct block_device *bdev,
 				u64 start, u64 len)
 {
 	blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL,
-			     DISCARD_FL_BARRIER);
+			BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
 }
 
 static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
@@ -1870,7 +1978,6 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
-
 /* helper function to actually process a single delayed ref entry */
 static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root,
@@ -1890,32 +1997,14 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
 		BUG_ON(extent_op);
 		head = btrfs_delayed_node_to_head(node);
 		if (insert_reserved) {
-			int mark_free = 0;
-			struct extent_buffer *must_clean = NULL;
-
-			ret = pin_down_bytes(trans, root, NULL,
-					     node->bytenr, node->num_bytes,
-					     head->is_data, 1, &must_clean);
-			if (ret > 0)
-				mark_free = 1;
-
-			if (must_clean) {
-				clean_tree_block(NULL, root, must_clean);
-				btrfs_tree_unlock(must_clean);
-				free_extent_buffer(must_clean);
-			}
+			btrfs_pin_extent(root, node->bytenr,
+					 node->num_bytes, 1);
 			if (head->is_data) {
 				ret = btrfs_del_csums(trans, root,
 						      node->bytenr,
 						      node->num_bytes);
 				BUG_ON(ret);
 			}
-			if (mark_free) {
-				ret = btrfs_free_reserved_extent(root,
-							node->bytenr,
-							node->num_bytes);
-				BUG_ON(ret);
-			}
 		}
 		mutex_unlock(&head->mutex);
 		return 0;
@@ -2346,6 +2435,8 @@ int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
 		ret = 0;
 out:
 	btrfs_free_path(path);
+	if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
+		WARN_ON(ret > 0);
 	return ret;
 }
 
@@ -2659,12 +2750,21 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 			     struct btrfs_space_info **space_info)
 {
 	struct btrfs_space_info *found;
+	int i;
+	int factor;
+
+	if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
+		     BTRFS_BLOCK_GROUP_RAID10))
+		factor = 2;
+	else
+		factor = 1;
 
 	found = __find_space_info(info, flags);
 	if (found) {
 		spin_lock(&found->lock);
 		found->total_bytes += total_bytes;
 		found->bytes_used += bytes_used;
+		found->disk_used += bytes_used * factor;
 		found->full = 0;
 		spin_unlock(&found->lock);
 		*space_info = found;
@@ -2674,16 +2774,20 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 	if (!found)
 		return -ENOMEM;
 
-	INIT_LIST_HEAD(&found->block_groups);
+	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
+		INIT_LIST_HEAD(&found->block_groups[i]);
 	init_rwsem(&found->groups_sem);
 	spin_lock_init(&found->lock);
-	found->flags = flags;
+	found->flags = flags & (BTRFS_BLOCK_GROUP_DATA |
+				BTRFS_BLOCK_GROUP_SYSTEM |
+				BTRFS_BLOCK_GROUP_METADATA);
 	found->total_bytes = total_bytes;
 	found->bytes_used = bytes_used;
+	found->disk_used = bytes_used * factor;
 	found->bytes_pinned = 0;
 	found->bytes_reserved = 0;
 	found->bytes_readonly = 0;
-	found->bytes_delalloc = 0;
+	found->bytes_may_use = 0;
 	found->full = 0;
 	found->force_alloc = 0;
 	*space_info = found;
@@ -2708,19 +2812,6 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
 	}
 }
 
-static void set_block_group_readonly(struct btrfs_block_group_cache *cache)
-{
-	spin_lock(&cache->space_info->lock);
-	spin_lock(&cache->lock);
-	if (!cache->ro) {
-		cache->space_info->bytes_readonly += cache->key.offset -
-					btrfs_block_group_used(&cache->item);
-		cache->ro = 1;
-	}
-	spin_unlock(&cache->lock);
-	spin_unlock(&cache->space_info->lock);
-}
-
 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
 {
 	u64 num_devices = root->fs_info->fs_devices->rw_devices;
@@ -2749,492 +2840,49 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
 	return flags;
 }
 
-static u64 btrfs_get_alloc_profile(struct btrfs_root *root, u64 data)
+static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
 {
-	struct btrfs_fs_info *info = root->fs_info;
-	u64 alloc_profile;
-
-	if (data) {
-		alloc_profile = info->avail_data_alloc_bits &
-			info->data_alloc_profile;
-		data = BTRFS_BLOCK_GROUP_DATA | alloc_profile;
-	} else if (root == root->fs_info->chunk_root) {
-		alloc_profile = info->avail_system_alloc_bits &
-			info->system_alloc_profile;
-		data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
-	} else {
-		alloc_profile = info->avail_metadata_alloc_bits &
-			info->metadata_alloc_profile;
-		data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
-	}
-
-	return btrfs_reduce_alloc_profile(root, data);
+	if (flags & BTRFS_BLOCK_GROUP_DATA)
+		flags |= root->fs_info->avail_data_alloc_bits &
+			 root->fs_info->data_alloc_profile;
+	else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+		flags |= root->fs_info->avail_system_alloc_bits &
+			 root->fs_info->system_alloc_profile;
+	else if (flags & BTRFS_BLOCK_GROUP_METADATA)
+		flags |= root->fs_info->avail_metadata_alloc_bits &
+			 root->fs_info->metadata_alloc_profile;
+	return btrfs_reduce_alloc_profile(root, flags);
 }
 
-void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
+static u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
 {
-	u64 alloc_target;
-
-	alloc_target = btrfs_get_alloc_profile(root, 1);
-	BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
-						       alloc_target);
-}
-
-static u64 calculate_bytes_needed(struct btrfs_root *root, int num_items)
-{
-	u64 num_bytes;
-	int level;
-
-	level = BTRFS_MAX_LEVEL - 2;
-	/*
-	 * NOTE: these calculations are absolutely the worst possible case.
-	 * This assumes that _every_ item we insert will require a new leaf, and
-	 * that the tree has grown to its maximum level size.
-	 */
-
-	/*
-	 * for every item we insert we could insert both an extent item and a
-	 * extent ref item.  Then for ever item we insert, we will need to cow
-	 * both the original leaf, plus the leaf to the left and right of it.
-	 *
-	 * Unless we are talking about the extent root, then we just want the
-	 * number of items * 2, since we just need the extent item plus its ref.
-	 */
-	if (root == root->fs_info->extent_root)
-		num_bytes = num_items * 2;
-	else
-		num_bytes = (num_items + (2 * num_items)) * 3;
-
-	/*
-	 * num_bytes is total number of leaves we could need times the leaf
-	 * size, and then for every leaf we could end up cow'ing 2 nodes per
-	 * level, down to the leaf level.
-	 */
-	num_bytes = (num_bytes * root->leafsize) +
-		(num_bytes * (level * 2)) * root->nodesize;
-
-	return num_bytes;
-}
-
-/*
- * Unreserve metadata space for delalloc.  If we have less reserved credits than
- * we have extents, this function does nothing.
- */
-int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
-					  struct inode *inode, int num_items)
-{
-	struct btrfs_fs_info *info = root->fs_info;
-	struct btrfs_space_info *meta_sinfo;
-	u64 num_bytes;
-	u64 alloc_target;
-	bool bug = false;
-
-	/* get the space info for where the metadata will live */
-	alloc_target = btrfs_get_alloc_profile(root, 0);
-	meta_sinfo = __find_space_info(info, alloc_target);
-
-	num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
-					   num_items);
-
-	spin_lock(&meta_sinfo->lock);
-	spin_lock(&BTRFS_I(inode)->accounting_lock);
-	if (BTRFS_I(inode)->reserved_extents <=
-	    BTRFS_I(inode)->outstanding_extents) {
-		spin_unlock(&BTRFS_I(inode)->accounting_lock);
-		spin_unlock(&meta_sinfo->lock);
-		return 0;
-	}
-	spin_unlock(&BTRFS_I(inode)->accounting_lock);
-
-	BTRFS_I(inode)->reserved_extents--;
-	BUG_ON(BTRFS_I(inode)->reserved_extents < 0);
-
-	if (meta_sinfo->bytes_delalloc < num_bytes) {
-		bug = true;
-		meta_sinfo->bytes_delalloc = 0;
-	} else {
-		meta_sinfo->bytes_delalloc -= num_bytes;
-	}
-	spin_unlock(&meta_sinfo->lock);
-
-	BUG_ON(bug);
-
-	return 0;
-}
-
-static void check_force_delalloc(struct btrfs_space_info *meta_sinfo)
-{
-	u64 thresh;
-
-	thresh = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
-		meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
-		meta_sinfo->bytes_super + meta_sinfo->bytes_root +
-		meta_sinfo->bytes_may_use;
+	u64 flags;
 
-	thresh = meta_sinfo->total_bytes - thresh;
-	thresh *= 80;
-	do_div(thresh, 100);
-	if (thresh <= meta_sinfo->bytes_delalloc)
-		meta_sinfo->force_delalloc = 1;
+	if (data)
+		flags = BTRFS_BLOCK_GROUP_DATA;
+	else if (root == root->fs_info->chunk_root)
+		flags = BTRFS_BLOCK_GROUP_SYSTEM;
 	else
-		meta_sinfo->force_delalloc = 0;
-}
-
-struct async_flush {
-	struct btrfs_root *root;
-	struct btrfs_space_info *info;
-	struct btrfs_work work;
-};
-
-static noinline void flush_delalloc_async(struct btrfs_work *work)
-{
-	struct async_flush *async;
-	struct btrfs_root *root;
-	struct btrfs_space_info *info;
-
-	async = container_of(work, struct async_flush, work);
-	root = async->root;
-	info = async->info;
-
-	btrfs_start_delalloc_inodes(root, 0);
-	wake_up(&info->flush_wait);
-	btrfs_wait_ordered_extents(root, 0, 0);
-
-	spin_lock(&info->lock);
-	info->flushing = 0;
-	spin_unlock(&info->lock);
-	wake_up(&info->flush_wait);
-
-	kfree(async);
-}
-
-static void wait_on_flush(struct btrfs_space_info *info)
-{
-	DEFINE_WAIT(wait);
-	u64 used;
-
-	while (1) {
-		prepare_to_wait(&info->flush_wait, &wait,
-				TASK_UNINTERRUPTIBLE);
-		spin_lock(&info->lock);
-		if (!info->flushing) {
-			spin_unlock(&info->lock);
-			break;
-		}
-
-		used = info->bytes_used + info->bytes_reserved +
-			info->bytes_pinned + info->bytes_readonly +
-			info->bytes_super + info->bytes_root +
-			info->bytes_may_use + info->bytes_delalloc;
-		if (used < info->total_bytes) {
-			spin_unlock(&info->lock);
-			break;
-		}
-		spin_unlock(&info->lock);
-		schedule();
-	}
-	finish_wait(&info->flush_wait, &wait);
-}
-
-static void flush_delalloc(struct btrfs_root *root,
-				 struct btrfs_space_info *info)
-{
-	struct async_flush *async;
-	bool wait = false;
-
-	spin_lock(&info->lock);
-
-	if (!info->flushing) {
-		info->flushing = 1;
-		init_waitqueue_head(&info->flush_wait);
-	} else {
-		wait = true;
-	}
-
-	spin_unlock(&info->lock);
-
-	if (wait) {
-		wait_on_flush(info);
-		return;
-	}
-
-	async = kzalloc(sizeof(*async), GFP_NOFS);
-	if (!async)
-		goto flush;
-
-	async->root = root;
-	async->info = info;
-	async->work.func = flush_delalloc_async;
-
-	btrfs_queue_worker(&root->fs_info->enospc_workers,
-			   &async->work);
-	wait_on_flush(info);
-	return;
-
-flush:
-	btrfs_start_delalloc_inodes(root, 0);
-	btrfs_wait_ordered_extents(root, 0, 0);
-
-	spin_lock(&info->lock);
-	info->flushing = 0;
-	spin_unlock(&info->lock);
-	wake_up(&info->flush_wait);
-}
-
-static int maybe_allocate_chunk(struct btrfs_root *root,
-				 struct btrfs_space_info *info)
-{
-	struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
-	struct btrfs_trans_handle *trans;
-	bool wait = false;
-	int ret = 0;
-	u64 min_metadata;
-	u64 free_space;
-
-	free_space = btrfs_super_total_bytes(disk_super);
-	/*
-	 * we allow the metadata to grow to a max of either 10gb or 5% of the
-	 * space in the volume.
-	 */
-	min_metadata = min((u64)10 * 1024 * 1024 * 1024,
-			     div64_u64(free_space * 5, 100));
-	if (info->total_bytes >= min_metadata) {
-		spin_unlock(&info->lock);
-		return 0;
-	}
-
-	if (info->full) {
-		spin_unlock(&info->lock);
-		return 0;
-	}
-
-	if (!info->allocating_chunk) {
-		info->force_alloc = 1;
-		info->allocating_chunk = 1;
-		init_waitqueue_head(&info->allocate_wait);
-	} else {
-		wait = true;
-	}
-
-	spin_unlock(&info->lock);
-
-	if (wait) {
-		wait_event(info->allocate_wait,
-			   !info->allocating_chunk);
-		return 1;
-	}
-
-	trans = btrfs_start_transaction(root, 1);
-	if (!trans) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
-	ret = do_chunk_alloc(trans, root->fs_info->extent_root,
-			     4096 + 2 * 1024 * 1024,
-			     info->flags, 0);
-	btrfs_end_transaction(trans, root);
-	if (ret)
-		goto out;
-out:
-	spin_lock(&info->lock);
-	info->allocating_chunk = 0;
-	spin_unlock(&info->lock);
-	wake_up(&info->allocate_wait);
-
-	if (ret)
-		return 0;
-	return 1;
-}
-
-/*
- * Reserve metadata space for delalloc.
- */
-int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
-					struct inode *inode, int num_items)
-{
-	struct btrfs_fs_info *info = root->fs_info;
-	struct btrfs_space_info *meta_sinfo;
-	u64 num_bytes;
-	u64 used;
-	u64 alloc_target;
-	int flushed = 0;
-	int force_delalloc;
-
-	/* get the space info for where the metadata will live */
-	alloc_target = btrfs_get_alloc_profile(root, 0);
-	meta_sinfo = __find_space_info(info, alloc_target);
-
-	num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
-					   num_items);
-again:
-	spin_lock(&meta_sinfo->lock);
-
-	force_delalloc = meta_sinfo->force_delalloc;
-
-	if (unlikely(!meta_sinfo->bytes_root))
-		meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
-
-	if (!flushed)
-		meta_sinfo->bytes_delalloc += num_bytes;
-
-	used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
-		meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
-		meta_sinfo->bytes_super + meta_sinfo->bytes_root +
-		meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
-
-	if (used > meta_sinfo->total_bytes) {
-		flushed++;
-
-		if (flushed == 1) {
-			if (maybe_allocate_chunk(root, meta_sinfo))
-				goto again;
-			flushed++;
-		} else {
-			spin_unlock(&meta_sinfo->lock);
-		}
-
-		if (flushed == 2) {
-			filemap_flush(inode->i_mapping);
-			goto again;
-		} else if (flushed == 3) {
-			flush_delalloc(root, meta_sinfo);
-			goto again;
-		}
-		spin_lock(&meta_sinfo->lock);
-		meta_sinfo->bytes_delalloc -= num_bytes;
-		spin_unlock(&meta_sinfo->lock);
-		printk(KERN_ERR "enospc, has %d, reserved %d\n",
-		       BTRFS_I(inode)->outstanding_extents,
-		       BTRFS_I(inode)->reserved_extents);
-		dump_space_info(meta_sinfo, 0, 0);
-		return -ENOSPC;
-	}
-
-	BTRFS_I(inode)->reserved_extents++;
-	check_force_delalloc(meta_sinfo);
-	spin_unlock(&meta_sinfo->lock);
-
-	if (!flushed && force_delalloc)
-		filemap_flush(inode->i_mapping);
+		flags = BTRFS_BLOCK_GROUP_METADATA;
 
-	return 0;
+	return get_alloc_profile(root, flags);
 }
 
-/*
- * unreserve num_items number of items worth of metadata space.  This needs to
- * be paired with btrfs_reserve_metadata_space.
- *
- * NOTE: if you have the option, run this _AFTER_ you do a
- * btrfs_end_transaction, since btrfs_end_transaction will run delayed ref
- * oprations which will result in more used metadata, so we want to make sure we
- * can do that without issue.
- */
-int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items)
-{
-	struct btrfs_fs_info *info = root->fs_info;
-	struct btrfs_space_info *meta_sinfo;
-	u64 num_bytes;
-	u64 alloc_target;
-	bool bug = false;
-
-	/* get the space info for where the metadata will live */
-	alloc_target = btrfs_get_alloc_profile(root, 0);
-	meta_sinfo = __find_space_info(info, alloc_target);
-
-	num_bytes = calculate_bytes_needed(root, num_items);
-
-	spin_lock(&meta_sinfo->lock);
-	if (meta_sinfo->bytes_may_use < num_bytes) {
-		bug = true;
-		meta_sinfo->bytes_may_use = 0;
-	} else {
-		meta_sinfo->bytes_may_use -= num_bytes;
-	}
-	spin_unlock(&meta_sinfo->lock);
-
-	BUG_ON(bug);
-
-	return 0;
-}
-
-/*
- * Reserve some metadata space for use.  We'll calculate the worste case number
- * of bytes that would be needed to modify num_items number of items.  If we
- * have space, fantastic, if not, you get -ENOSPC.  Please call
- * btrfs_unreserve_metadata_space when you are done for the _SAME_ number of
- * items you reserved, since whatever metadata you needed should have already
- * been allocated.
- *
- * This will commit the transaction to make more space if we don't have enough
- * metadata space.  THe only time we don't do this is if we're reserving space
- * inside of a transaction, then we will just return -ENOSPC and it is the
- * callers responsibility to handle it properly.
- */
-int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items)
+void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
 {
-	struct btrfs_fs_info *info = root->fs_info;
-	struct btrfs_space_info *meta_sinfo;
-	u64 num_bytes;
-	u64 used;
-	u64 alloc_target;
-	int retries = 0;
-
-	/* get the space info for where the metadata will live */
-	alloc_target = btrfs_get_alloc_profile(root, 0);
-	meta_sinfo = __find_space_info(info, alloc_target);
-
-	num_bytes = calculate_bytes_needed(root, num_items);
-again:
-	spin_lock(&meta_sinfo->lock);
-
-	if (unlikely(!meta_sinfo->bytes_root))
-		meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
-
-	if (!retries)
-		meta_sinfo->bytes_may_use += num_bytes;
-
-	used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
-		meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
-		meta_sinfo->bytes_super + meta_sinfo->bytes_root +
-		meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
-
-	if (used > meta_sinfo->total_bytes) {
-		retries++;
-		if (retries == 1) {
-			if (maybe_allocate_chunk(root, meta_sinfo))
-				goto again;
-			retries++;
-		} else {
-			spin_unlock(&meta_sinfo->lock);
-		}
-
-		if (retries == 2) {
-			flush_delalloc(root, meta_sinfo);
-			goto again;
-		}
-		spin_lock(&meta_sinfo->lock);
-		meta_sinfo->bytes_may_use -= num_bytes;
-		spin_unlock(&meta_sinfo->lock);
-
-		dump_space_info(meta_sinfo, 0, 0);
-		return -ENOSPC;
-	}
-
-	check_force_delalloc(meta_sinfo);
-	spin_unlock(&meta_sinfo->lock);
-
-	return 0;
+	BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
+						       BTRFS_BLOCK_GROUP_DATA);
 }
 
 /*
  * This will check the space that the inode allocates from to make sure we have
  * enough space for bytes.
  */
-int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
-				u64 bytes)
+int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
 {
 	struct btrfs_space_info *data_sinfo;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	u64 used;
 	int ret = 0, committed = 0;
 
 	/* make sure bytes are sectorsize aligned */
@@ -3247,10 +2895,11 @@ int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
 again:
 	/* make sure we have enough space to handle the data first */
 	spin_lock(&data_sinfo->lock);
-	if (data_sinfo->total_bytes - data_sinfo->bytes_used -
-	    data_sinfo->bytes_delalloc - data_sinfo->bytes_reserved -
-	    data_sinfo->bytes_pinned - data_sinfo->bytes_readonly -
-	    data_sinfo->bytes_may_use - data_sinfo->bytes_super < bytes) {
+	used = data_sinfo->bytes_used + data_sinfo->bytes_reserved +
+		data_sinfo->bytes_pinned + data_sinfo->bytes_readonly +
+		data_sinfo->bytes_may_use;
+
+	if (used + bytes > data_sinfo->total_bytes) {
 		struct btrfs_trans_handle *trans;
 
 		/*
@@ -3264,15 +2913,15 @@ again:
 			spin_unlock(&data_sinfo->lock);
 alloc:
 			alloc_target = btrfs_get_alloc_profile(root, 1);
-			trans = btrfs_start_transaction(root, 1);
-			if (!trans)
-				return -ENOMEM;
+			trans = btrfs_join_transaction(root, 1);
+			if (IS_ERR(trans))
+				return PTR_ERR(trans);
 
 			ret = do_chunk_alloc(trans, root->fs_info->extent_root,
 					     bytes + 2 * 1024 * 1024,
 					     alloc_target, 0);
 			btrfs_end_transaction(trans, root);
-			if (ret)
+			if (ret < 0)
 				return ret;
 
 			if (!data_sinfo) {
@@ -3287,25 +2936,26 @@ alloc:
 		if (!committed && !root->fs_info->open_ioctl_trans) {
 			committed = 1;
 			trans = btrfs_join_transaction(root, 1);
-			if (!trans)
-				return -ENOMEM;
+			if (IS_ERR(trans))
+				return PTR_ERR(trans);
 			ret = btrfs_commit_transaction(trans, root);
 			if (ret)
 				return ret;
 			goto again;
 		}
 
-		printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes"
-		       ", %llu bytes_used, %llu bytes_reserved, "
-		       "%llu bytes_pinned, %llu bytes_readonly, %llu may use "
-		       "%llu total\n", (unsigned long long)bytes,
-		       (unsigned long long)data_sinfo->bytes_delalloc,
+#if 0 /* I hope we never need this code again, just in case */
+		printk(KERN_ERR "no space left, need %llu, %llu bytes_used, "
+		       "%llu bytes_reserved, " "%llu bytes_pinned, "
+		       "%llu bytes_readonly, %llu may use %llu total\n",
+		       (unsigned long long)bytes,
 		       (unsigned long long)data_sinfo->bytes_used,
 		       (unsigned long long)data_sinfo->bytes_reserved,
 		       (unsigned long long)data_sinfo->bytes_pinned,
 		       (unsigned long long)data_sinfo->bytes_readonly,
 		       (unsigned long long)data_sinfo->bytes_may_use,
 		       (unsigned long long)data_sinfo->total_bytes);
+#endif
 		return -ENOSPC;
 	}
 	data_sinfo->bytes_may_use += bytes;
@@ -3316,12 +2966,13 @@ alloc:
 }
 
 /*
- * if there was an error for whatever reason after calling
- * btrfs_check_data_free_space, call this so we can cleanup the counters.
+ * called when we are clearing an delalloc extent from the
+ * inode's io_tree or there was an error for whatever reason
+ * after calling btrfs_check_data_free_space
  */
-void btrfs_free_reserved_data_space(struct btrfs_root *root,
-				    struct inode *inode, u64 bytes)
+void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
 {
+	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_space_info *data_sinfo;
 
 	/* make sure bytes are sectorsize aligned */
@@ -3334,48 +2985,6 @@ void btrfs_free_reserved_data_space(struct btrfs_root *root,
 	spin_unlock(&data_sinfo->lock);
 }
 
-/* called when we are adding a delalloc extent to the inode's io_tree */
-void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
-				  u64 bytes)
-{
-	struct btrfs_space_info *data_sinfo;
-
-	/* get the space info for where this inode will be storing its data */
-	data_sinfo = BTRFS_I(inode)->space_info;
-
-	/* make sure we have enough space to handle the data first */
-	spin_lock(&data_sinfo->lock);
-	data_sinfo->bytes_delalloc += bytes;
-
-	/*
-	 * we are adding a delalloc extent without calling
-	 * btrfs_check_data_free_space first.  This happens on a weird
-	 * writepage condition, but shouldn't hurt our accounting
-	 */
-	if (unlikely(bytes > BTRFS_I(inode)->reserved_bytes)) {
-		data_sinfo->bytes_may_use -= BTRFS_I(inode)->reserved_bytes;
-		BTRFS_I(inode)->reserved_bytes = 0;
-	} else {
-		data_sinfo->bytes_may_use -= bytes;
-		BTRFS_I(inode)->reserved_bytes -= bytes;
-	}
-
-	spin_unlock(&data_sinfo->lock);
-}
-
-/* called when we are clearing an delalloc extent from the inode's io_tree */
-void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
-			      u64 bytes)
-{
-	struct btrfs_space_info *info;
-
-	info = BTRFS_I(inode)->space_info;
-
-	spin_lock(&info->lock);
-	info->bytes_delalloc -= bytes;
-	spin_unlock(&info->lock);
-}
-
 static void force_metadata_allocation(struct btrfs_fs_info *info)
 {
 	struct list_head *head = &info->space_info;
@@ -3389,13 +2998,28 @@ static void force_metadata_allocation(struct btrfs_fs_info *info)
 	rcu_read_unlock();
 }
 
+static int should_alloc_chunk(struct btrfs_space_info *sinfo,
+			      u64 alloc_bytes)
+{
+	u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
+
+	if (sinfo->bytes_used + sinfo->bytes_reserved +
+	    alloc_bytes + 256 * 1024 * 1024 < num_bytes)
+		return 0;
+
+	if (sinfo->bytes_used + sinfo->bytes_reserved +
+	    alloc_bytes < div_factor(num_bytes, 8))
+		return 0;
+
+	return 1;
+}
+
 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *extent_root, u64 alloc_bytes,
 			  u64 flags, int force)
 {
 	struct btrfs_space_info *space_info;
 	struct btrfs_fs_info *fs_info = extent_root->fs_info;
-	u64 thresh;
 	int ret = 0;
 
 	mutex_lock(&fs_info->chunk_mutex);
@@ -3418,11 +3042,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 		goto out;
 	}
 
-	thresh = space_info->total_bytes - space_info->bytes_readonly;
-	thresh = div_factor(thresh, 8);
-	if (!force &&
-	   (space_info->bytes_used + space_info->bytes_pinned +
-	    space_info->bytes_reserved + alloc_bytes) < thresh) {
+	if (!force && !should_alloc_chunk(space_info, alloc_bytes)) {
 		spin_unlock(&space_info->lock);
 		goto out;
 	}
@@ -3444,6 +3064,8 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 	spin_lock(&space_info->lock);
 	if (ret)
 		space_info->full = 1;
+	else
+		ret = 1;
 	space_info->force_alloc = 0;
 	spin_unlock(&space_info->lock);
 out:
@@ -3451,13 +3073,713 @@ out:
 	return ret;
 }
 
+static int maybe_allocate_chunk(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root,
+				struct btrfs_space_info *sinfo, u64 num_bytes)
+{
+	int ret;
+	int end_trans = 0;
+
+	if (sinfo->full)
+		return 0;
+
+	spin_lock(&sinfo->lock);
+	ret = should_alloc_chunk(sinfo, num_bytes + 2 * 1024 * 1024);
+	spin_unlock(&sinfo->lock);
+	if (!ret)
+		return 0;
+
+	if (!trans) {
+		trans = btrfs_join_transaction(root, 1);
+		BUG_ON(IS_ERR(trans));
+		end_trans = 1;
+	}
+
+	ret = do_chunk_alloc(trans, root->fs_info->extent_root,
+			     num_bytes + 2 * 1024 * 1024,
+			     get_alloc_profile(root, sinfo->flags), 0);
+
+	if (end_trans)
+		btrfs_end_transaction(trans, root);
+
+	return ret == 1 ? 1 : 0;
+}
+
+/*
+ * shrink metadata reservation for delalloc
+ */
+static int shrink_delalloc(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *root, u64 to_reclaim)
+{
+	struct btrfs_block_rsv *block_rsv;
+	u64 reserved;
+	u64 max_reclaim;
+	u64 reclaimed = 0;
+	int pause = 1;
+	int ret;
+
+	block_rsv = &root->fs_info->delalloc_block_rsv;
+	spin_lock(&block_rsv->lock);
+	reserved = block_rsv->reserved;
+	spin_unlock(&block_rsv->lock);
+
+	if (reserved == 0)
+		return 0;
+
+	max_reclaim = min(reserved, to_reclaim);
+
+	while (1) {
+		ret = btrfs_start_one_delalloc_inode(root, trans ? 1 : 0);
+		if (!ret) {
+			__set_current_state(TASK_INTERRUPTIBLE);
+			schedule_timeout(pause);
+			pause <<= 1;
+			if (pause > HZ / 10)
+				pause = HZ / 10;
+		} else {
+			pause = 1;
+		}
+
+		spin_lock(&block_rsv->lock);
+		if (reserved > block_rsv->reserved)
+			reclaimed = reserved - block_rsv->reserved;
+		reserved = block_rsv->reserved;
+		spin_unlock(&block_rsv->lock);
+
+		if (reserved == 0 || reclaimed >= max_reclaim)
+			break;
+
+		if (trans && trans->transaction->blocked)
+			return -EAGAIN;
+	}
+	return reclaimed >= to_reclaim;
+}
+
+static int should_retry_reserve(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root,
+				struct btrfs_block_rsv *block_rsv,
+				u64 num_bytes, int *retries)
+{
+	struct btrfs_space_info *space_info = block_rsv->space_info;
+	int ret;
+
+	if ((*retries) > 2)
+		return -ENOSPC;
+
+	ret = maybe_allocate_chunk(trans, root, space_info, num_bytes);
+	if (ret)
+		return 1;
+
+	if (trans && trans->transaction->in_commit)
+		return -ENOSPC;
+
+	ret = shrink_delalloc(trans, root, num_bytes);
+	if (ret)
+		return ret;
+
+	spin_lock(&space_info->lock);
+	if (space_info->bytes_pinned < num_bytes)
+		ret = 1;
+	spin_unlock(&space_info->lock);
+	if (ret)
+		return -ENOSPC;
+
+	(*retries)++;
+
+	if (trans)
+		return -EAGAIN;
+
+	trans = btrfs_join_transaction(root, 1);
+	BUG_ON(IS_ERR(trans));
+	ret = btrfs_commit_transaction(trans, root);
+	BUG_ON(ret);
+
+	return 1;
+}
+
+static int reserve_metadata_bytes(struct btrfs_block_rsv *block_rsv,
+				  u64 num_bytes)
+{
+	struct btrfs_space_info *space_info = block_rsv->space_info;
+	u64 unused;
+	int ret = -ENOSPC;
+
+	spin_lock(&space_info->lock);
+	unused = space_info->bytes_used + space_info->bytes_reserved +
+		 space_info->bytes_pinned + space_info->bytes_readonly;
+
+	if (unused < space_info->total_bytes)
+		unused = space_info->total_bytes - unused;
+	else
+		unused = 0;
+
+	if (unused >= num_bytes) {
+		if (block_rsv->priority >= 10) {
+			space_info->bytes_reserved += num_bytes;
+			ret = 0;
+		} else {
+			if ((unused + block_rsv->reserved) *
+			    block_rsv->priority >=
+			    (num_bytes + block_rsv->reserved) * 10) {
+				space_info->bytes_reserved += num_bytes;
+				ret = 0;
+			}
+		}
+	}
+	spin_unlock(&space_info->lock);
+
+	return ret;
+}
+
+static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans,
+					     struct btrfs_root *root)
+{
+	struct btrfs_block_rsv *block_rsv;
+	if (root->ref_cows)
+		block_rsv = trans->block_rsv;
+	else
+		block_rsv = root->block_rsv;
+
+	if (!block_rsv)
+		block_rsv = &root->fs_info->empty_block_rsv;
+
+	return block_rsv;
+}
+
+static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
+			       u64 num_bytes)
+{
+	int ret = -ENOSPC;
+	spin_lock(&block_rsv->lock);
+	if (block_rsv->reserved >= num_bytes) {
+		block_rsv->reserved -= num_bytes;
+		if (block_rsv->reserved < block_rsv->size)
+			block_rsv->full = 0;
+		ret = 0;
+	}
+	spin_unlock(&block_rsv->lock);
+	return ret;
+}
+
+static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
+				u64 num_bytes, int update_size)
+{
+	spin_lock(&block_rsv->lock);
+	block_rsv->reserved += num_bytes;
+	if (update_size)
+		block_rsv->size += num_bytes;
+	else if (block_rsv->reserved >= block_rsv->size)
+		block_rsv->full = 1;
+	spin_unlock(&block_rsv->lock);
+}
+
+void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
+			     struct btrfs_block_rsv *dest, u64 num_bytes)
+{
+	struct btrfs_space_info *space_info = block_rsv->space_info;
+
+	spin_lock(&block_rsv->lock);
+	if (num_bytes == (u64)-1)
+		num_bytes = block_rsv->size;
+	block_rsv->size -= num_bytes;
+	if (block_rsv->reserved >= block_rsv->size) {
+		num_bytes = block_rsv->reserved - block_rsv->size;
+		block_rsv->reserved = block_rsv->size;
+		block_rsv->full = 1;
+	} else {
+		num_bytes = 0;
+	}
+	spin_unlock(&block_rsv->lock);
+
+	if (num_bytes > 0) {
+		if (dest) {
+			block_rsv_add_bytes(dest, num_bytes, 0);
+		} else {
+			spin_lock(&space_info->lock);
+			space_info->bytes_reserved -= num_bytes;
+			spin_unlock(&space_info->lock);
+		}
+	}
+}
+
+static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src,
+				   struct btrfs_block_rsv *dst, u64 num_bytes)
+{
+	int ret;
+
+	ret = block_rsv_use_bytes(src, num_bytes);
+	if (ret)
+		return ret;
+
+	block_rsv_add_bytes(dst, num_bytes, 1);
+	return 0;
+}
+
+void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv)
+{
+	memset(rsv, 0, sizeof(*rsv));
+	spin_lock_init(&rsv->lock);
+	atomic_set(&rsv->usage, 1);
+	rsv->priority = 6;
+	INIT_LIST_HEAD(&rsv->list);
+}
+
+struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
+{
+	struct btrfs_block_rsv *block_rsv;
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	u64 alloc_target;
+
+	block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
+	if (!block_rsv)
+		return NULL;
+
+	btrfs_init_block_rsv(block_rsv);
+
+	alloc_target = btrfs_get_alloc_profile(root, 0);
+	block_rsv->space_info = __find_space_info(fs_info,
+						  BTRFS_BLOCK_GROUP_METADATA);
+
+	return block_rsv;
+}
+
+void btrfs_free_block_rsv(struct btrfs_root *root,
+			  struct btrfs_block_rsv *rsv)
+{
+	if (rsv && atomic_dec_and_test(&rsv->usage)) {
+		btrfs_block_rsv_release(root, rsv, (u64)-1);
+		if (!rsv->durable)
+			kfree(rsv);
+	}
+}
+
+/*
+ * make the block_rsv struct be able to capture freed space.
+ * the captured space will re-add to the the block_rsv struct
+ * after transaction commit
+ */
+void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
+				 struct btrfs_block_rsv *block_rsv)
+{
+	block_rsv->durable = 1;
+	mutex_lock(&fs_info->durable_block_rsv_mutex);
+	list_add_tail(&block_rsv->list, &fs_info->durable_block_rsv_list);
+	mutex_unlock(&fs_info->durable_block_rsv_mutex);
+}
+
+int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
+			struct btrfs_root *root,
+			struct btrfs_block_rsv *block_rsv,
+			u64 num_bytes, int *retries)
+{
+	int ret;
+
+	if (num_bytes == 0)
+		return 0;
+again:
+	ret = reserve_metadata_bytes(block_rsv, num_bytes);
+	if (!ret) {
+		block_rsv_add_bytes(block_rsv, num_bytes, 1);
+		return 0;
+	}
+
+	ret = should_retry_reserve(trans, root, block_rsv, num_bytes, retries);
+	if (ret > 0)
+		goto again;
+
+	return ret;
+}
+
+int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root,
+			  struct btrfs_block_rsv *block_rsv,
+			  u64 min_reserved, int min_factor)
+{
+	u64 num_bytes = 0;
+	int commit_trans = 0;
+	int ret = -ENOSPC;
+
+	if (!block_rsv)
+		return 0;
+
+	spin_lock(&block_rsv->lock);
+	if (min_factor > 0)
+		num_bytes = div_factor(block_rsv->size, min_factor);
+	if (min_reserved > num_bytes)
+		num_bytes = min_reserved;
+
+	if (block_rsv->reserved >= num_bytes) {
+		ret = 0;
+	} else {
+		num_bytes -= block_rsv->reserved;
+		if (block_rsv->durable &&
+		    block_rsv->freed[0] + block_rsv->freed[1] >= num_bytes)
+			commit_trans = 1;
+	}
+	spin_unlock(&block_rsv->lock);
+	if (!ret)
+		return 0;
+
+	if (block_rsv->refill_used) {
+		ret = reserve_metadata_bytes(block_rsv, num_bytes);
+		if (!ret) {
+			block_rsv_add_bytes(block_rsv, num_bytes, 0);
+			return 0;
+		}
+	}
+
+	if (commit_trans) {
+		if (trans)
+			return -EAGAIN;
+
+		trans = btrfs_join_transaction(root, 1);
+		BUG_ON(IS_ERR(trans));
+		ret = btrfs_commit_transaction(trans, root);
+		return 0;
+	}
+
+	WARN_ON(1);
+	printk(KERN_INFO"block_rsv size %llu reserved %llu freed %llu %llu\n",
+		block_rsv->size, block_rsv->reserved,
+		block_rsv->freed[0], block_rsv->freed[1]);
+
+	return -ENOSPC;
+}
+
+int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
+			    struct btrfs_block_rsv *dst_rsv,
+			    u64 num_bytes)
+{
+	return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
+}
+
+void btrfs_block_rsv_release(struct btrfs_root *root,
+			     struct btrfs_block_rsv *block_rsv,
+			     u64 num_bytes)
+{
+	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
+	if (global_rsv->full || global_rsv == block_rsv ||
+	    block_rsv->space_info != global_rsv->space_info)
+		global_rsv = NULL;
+	block_rsv_release_bytes(block_rsv, global_rsv, num_bytes);
+}
+
+/*
+ * helper to calculate size of global block reservation.
+ * the desired value is sum of space used by extent tree,
+ * checksum tree and root tree
+ */
+static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_space_info *sinfo;
+	u64 num_bytes;
+	u64 meta_used;
+	u64 data_used;
+	int csum_size = btrfs_super_csum_size(&fs_info->super_copy);
+#if 0
+	/*
+	 * per tree used space accounting can be inaccuracy, so we
+	 * can't rely on it.
+	 */
+	spin_lock(&fs_info->extent_root->accounting_lock);
+	num_bytes = btrfs_root_used(&fs_info->extent_root->root_item);
+	spin_unlock(&fs_info->extent_root->accounting_lock);
+
+	spin_lock(&fs_info->csum_root->accounting_lock);
+	num_bytes += btrfs_root_used(&fs_info->csum_root->root_item);
+	spin_unlock(&fs_info->csum_root->accounting_lock);
+
+	spin_lock(&fs_info->tree_root->accounting_lock);
+	num_bytes += btrfs_root_used(&fs_info->tree_root->root_item);
+	spin_unlock(&fs_info->tree_root->accounting_lock);
+#endif
+	sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
+	spin_lock(&sinfo->lock);
+	data_used = sinfo->bytes_used;
+	spin_unlock(&sinfo->lock);
+
+	sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
+	spin_lock(&sinfo->lock);
+	meta_used = sinfo->bytes_used;
+	spin_unlock(&sinfo->lock);
+
+	num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) *
+		    csum_size * 2;
+	num_bytes += div64_u64(data_used + meta_used, 50);
+
+	if (num_bytes * 3 > meta_used)
+		num_bytes = div64_u64(meta_used, 3);
+
+	return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10);
+}
+
+static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
+	struct btrfs_space_info *sinfo = block_rsv->space_info;
+	u64 num_bytes;
+
+	num_bytes = calc_global_metadata_size(fs_info);
+
+	spin_lock(&block_rsv->lock);
+	spin_lock(&sinfo->lock);
+
+	block_rsv->size = num_bytes;
+
+	num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
+		    sinfo->bytes_reserved + sinfo->bytes_readonly;
+
+	if (sinfo->total_bytes > num_bytes) {
+		num_bytes = sinfo->total_bytes - num_bytes;
+		block_rsv->reserved += num_bytes;
+		sinfo->bytes_reserved += num_bytes;
+	}
+
+	if (block_rsv->reserved >= block_rsv->size) {
+		num_bytes = block_rsv->reserved - block_rsv->size;
+		sinfo->bytes_reserved -= num_bytes;
+		block_rsv->reserved = block_rsv->size;
+		block_rsv->full = 1;
+	}
+#if 0
+	printk(KERN_INFO"global block rsv size %llu reserved %llu\n",
+		block_rsv->size, block_rsv->reserved);
+#endif
+	spin_unlock(&sinfo->lock);
+	spin_unlock(&block_rsv->lock);
+}
+
+static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_space_info *space_info;
+
+	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
+	fs_info->chunk_block_rsv.space_info = space_info;
+	fs_info->chunk_block_rsv.priority = 10;
+
+	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
+	fs_info->global_block_rsv.space_info = space_info;
+	fs_info->global_block_rsv.priority = 10;
+	fs_info->global_block_rsv.refill_used = 1;
+	fs_info->delalloc_block_rsv.space_info = space_info;
+	fs_info->trans_block_rsv.space_info = space_info;
+	fs_info->empty_block_rsv.space_info = space_info;
+	fs_info->empty_block_rsv.priority = 10;
+
+	fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
+	fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
+	fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
+	fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
+	fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
+
+	btrfs_add_durable_block_rsv(fs_info, &fs_info->global_block_rsv);
+
+	btrfs_add_durable_block_rsv(fs_info, &fs_info->delalloc_block_rsv);
+
+	update_global_block_rsv(fs_info);
+}
+
+static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
+{
+	block_rsv_release_bytes(&fs_info->global_block_rsv, NULL, (u64)-1);
+	WARN_ON(fs_info->delalloc_block_rsv.size > 0);
+	WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
+	WARN_ON(fs_info->trans_block_rsv.size > 0);
+	WARN_ON(fs_info->trans_block_rsv.reserved > 0);
+	WARN_ON(fs_info->chunk_block_rsv.size > 0);
+	WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
+}
+
+static u64 calc_trans_metadata_size(struct btrfs_root *root, int num_items)
+{
+	return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
+		3 * num_items;
+}
+
+int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root,
+				 int num_items, int *retries)
+{
+	u64 num_bytes;
+	int ret;
+
+	if (num_items == 0 || root->fs_info->chunk_root == root)
+		return 0;
+
+	num_bytes = calc_trans_metadata_size(root, num_items);
+	ret = btrfs_block_rsv_add(trans, root, &root->fs_info->trans_block_rsv,
+				  num_bytes, retries);
+	if (!ret) {
+		trans->bytes_reserved += num_bytes;
+		trans->block_rsv = &root->fs_info->trans_block_rsv;
+	}
+	return ret;
+}
+
+void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
+				  struct btrfs_root *root)
+{
+	if (!trans->bytes_reserved)
+		return;
+
+	BUG_ON(trans->block_rsv != &root->fs_info->trans_block_rsv);
+	btrfs_block_rsv_release(root, trans->block_rsv,
+				trans->bytes_reserved);
+	trans->bytes_reserved = 0;
+}
+
+int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
+				  struct inode *inode)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
+	struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;
+
+	/*
+	 * one for deleting orphan item, one for updating inode and
+	 * two for calling btrfs_truncate_inode_items.
+	 *
+	 * btrfs_truncate_inode_items is a delete operation, it frees
+	 * more space than it uses in most cases. So two units of
+	 * metadata space should be enough for calling it many times.
+	 * If all of the metadata space is used, we can commit
+	 * transaction and use space it freed.
+	 */
+	u64 num_bytes = calc_trans_metadata_size(root, 4);
+	return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
+}
+
+void btrfs_orphan_release_metadata(struct inode *inode)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	u64 num_bytes = calc_trans_metadata_size(root, 4);
+	btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
+}
+
+int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
+				struct btrfs_pending_snapshot *pending)
+{
+	struct btrfs_root *root = pending->root;
+	struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
+	struct btrfs_block_rsv *dst_rsv = &pending->block_rsv;
+	/*
+	 * two for root back/forward refs, two for directory entries
+	 * and one for root of the snapshot.
+	 */
+	u64 num_bytes = calc_trans_metadata_size(root, 5);
+	dst_rsv->space_info = src_rsv->space_info;
+	return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
+}
+
+static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes)
+{
+	return num_bytes >>= 3;
+}
+
+int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
+	u64 to_reserve;
+	int nr_extents;
+	int retries = 0;
+	int ret;
+
+	if (btrfs_transaction_in_commit(root->fs_info))
+		schedule_timeout(1);
+
+	num_bytes = ALIGN(num_bytes, root->sectorsize);
+again:
+	spin_lock(&BTRFS_I(inode)->accounting_lock);
+	nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents) + 1;
+	if (nr_extents > BTRFS_I(inode)->reserved_extents) {
+		nr_extents -= BTRFS_I(inode)->reserved_extents;
+		to_reserve = calc_trans_metadata_size(root, nr_extents);
+	} else {
+		nr_extents = 0;
+		to_reserve = 0;
+	}
+
+	to_reserve += calc_csum_metadata_size(inode, num_bytes);
+	ret = reserve_metadata_bytes(block_rsv, to_reserve);
+	if (ret) {
+		spin_unlock(&BTRFS_I(inode)->accounting_lock);
+		ret = should_retry_reserve(NULL, root, block_rsv, to_reserve,
+					   &retries);
+		if (ret > 0)
+			goto again;
+		return ret;
+	}
+
+	BTRFS_I(inode)->reserved_extents += nr_extents;
+	atomic_inc(&BTRFS_I(inode)->outstanding_extents);
+	spin_unlock(&BTRFS_I(inode)->accounting_lock);
+
+	block_rsv_add_bytes(block_rsv, to_reserve, 1);
+
+	if (block_rsv->size > 512 * 1024 * 1024)
+		shrink_delalloc(NULL, root, to_reserve);
+
+	return 0;
+}
+
+void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	u64 to_free;
+	int nr_extents;
+
+	num_bytes = ALIGN(num_bytes, root->sectorsize);
+	atomic_dec(&BTRFS_I(inode)->outstanding_extents);
+
+	spin_lock(&BTRFS_I(inode)->accounting_lock);
+	nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents);
+	if (nr_extents < BTRFS_I(inode)->reserved_extents) {
+		nr_extents = BTRFS_I(inode)->reserved_extents - nr_extents;
+		BTRFS_I(inode)->reserved_extents -= nr_extents;
+	} else {
+		nr_extents = 0;
+	}
+	spin_unlock(&BTRFS_I(inode)->accounting_lock);
+
+	to_free = calc_csum_metadata_size(inode, num_bytes);
+	if (nr_extents > 0)
+		to_free += calc_trans_metadata_size(root, nr_extents);
+
+	btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
+				to_free);
+}
+
+int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
+{
+	int ret;
+
+	ret = btrfs_check_data_free_space(inode, num_bytes);
+	if (ret)
+		return ret;
+
+	ret = btrfs_delalloc_reserve_metadata(inode, num_bytes);
+	if (ret) {
+		btrfs_free_reserved_data_space(inode, num_bytes);
+		return ret;
+	}
+
+	return 0;
+}
+
+void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
+{
+	btrfs_delalloc_release_metadata(inode, num_bytes);
+	btrfs_free_reserved_data_space(inode, num_bytes);
+}
+
 static int update_block_group(struct btrfs_trans_handle *trans,
 			      struct btrfs_root *root,
-			      u64 bytenr, u64 num_bytes, int alloc,
-			      int mark_free)
+			      u64 bytenr, u64 num_bytes, int alloc)
 {
 	struct btrfs_block_group_cache *cache;
 	struct btrfs_fs_info *info = root->fs_info;
+	int factor;
 	u64 total = num_bytes;
 	u64 old_val;
 	u64 byte_in_group;
@@ -3476,6 +3798,12 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 		cache = btrfs_lookup_block_group(info, bytenr);
 		if (!cache)
 			return -1;
+		if (cache->flags & (BTRFS_BLOCK_GROUP_DUP |
+				    BTRFS_BLOCK_GROUP_RAID1 |
+				    BTRFS_BLOCK_GROUP_RAID10))
+			factor = 2;
+		else
+			factor = 1;
 		byte_in_group = bytenr - cache->key.objectid;
 		WARN_ON(byte_in_group > cache->key.offset);
 
@@ -3488,31 +3816,24 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 			old_val += num_bytes;
 			btrfs_set_block_group_used(&cache->item, old_val);
 			cache->reserved -= num_bytes;
-			cache->space_info->bytes_used += num_bytes;
 			cache->space_info->bytes_reserved -= num_bytes;
-			if (cache->ro)
-				cache->space_info->bytes_readonly -= num_bytes;
+			cache->space_info->bytes_used += num_bytes;
+			cache->space_info->disk_used += num_bytes * factor;
 			spin_unlock(&cache->lock);
 			spin_unlock(&cache->space_info->lock);
 		} else {
 			old_val -= num_bytes;
-			cache->space_info->bytes_used -= num_bytes;
-			if (cache->ro)
-				cache->space_info->bytes_readonly += num_bytes;
 			btrfs_set_block_group_used(&cache->item, old_val);
+			cache->pinned += num_bytes;
+			cache->space_info->bytes_pinned += num_bytes;
+			cache->space_info->bytes_used -= num_bytes;
+			cache->space_info->disk_used -= num_bytes * factor;
 			spin_unlock(&cache->lock);
 			spin_unlock(&cache->space_info->lock);
-			if (mark_free) {
-				int ret;
-
-				ret = btrfs_discard_extent(root, bytenr,
-							   num_bytes);
-				WARN_ON(ret);
 
-				ret = btrfs_add_free_space(cache, bytenr,
-							   num_bytes);
-				WARN_ON(ret);
-			}
+			set_extent_dirty(info->pinned_extents,
+					 bytenr, bytenr + num_bytes - 1,
+					 GFP_NOFS | __GFP_NOFAIL);
 		}
 		btrfs_put_block_group(cache);
 		total -= num_bytes;
@@ -3536,18 +3857,10 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
 	return bytenr;
 }
 
-/*
- * this function must be called within transaction
- */
-int btrfs_pin_extent(struct btrfs_root *root,
-		     u64 bytenr, u64 num_bytes, int reserved)
+static int pin_down_extent(struct btrfs_root *root,
+			   struct btrfs_block_group_cache *cache,
+			   u64 bytenr, u64 num_bytes, int reserved)
 {
-	struct btrfs_fs_info *fs_info = root->fs_info;
-	struct btrfs_block_group_cache *cache;
-
-	cache = btrfs_lookup_block_group(fs_info, bytenr);
-	BUG_ON(!cache);
-
 	spin_lock(&cache->space_info->lock);
 	spin_lock(&cache->lock);
 	cache->pinned += num_bytes;
@@ -3559,28 +3872,68 @@ int btrfs_pin_extent(struct btrfs_root *root,
 	spin_unlock(&cache->lock);
 	spin_unlock(&cache->space_info->lock);
 
-	btrfs_put_block_group(cache);
+	set_extent_dirty(root->fs_info->pinned_extents, bytenr,
+			 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
+	return 0;
+}
+
+/*
+ * this function must be called within transaction
+ */
+int btrfs_pin_extent(struct btrfs_root *root,
+		     u64 bytenr, u64 num_bytes, int reserved)
+{
+	struct btrfs_block_group_cache *cache;
+
+	cache = btrfs_lookup_block_group(root->fs_info, bytenr);
+	BUG_ON(!cache);
 
-	set_extent_dirty(fs_info->pinned_extents,
-			 bytenr, bytenr + num_bytes - 1, GFP_NOFS);
+	pin_down_extent(root, cache, bytenr, num_bytes, reserved);
+
+	btrfs_put_block_group(cache);
 	return 0;
 }
 
-static int update_reserved_extents(struct btrfs_block_group_cache *cache,
-				   u64 num_bytes, int reserve)
+/*
+ * update size of reserved extents. this function may return -EAGAIN
+ * if 'reserve' is true or 'sinfo' is false.
+ */
+static int update_reserved_bytes(struct btrfs_block_group_cache *cache,
+				 u64 num_bytes, int reserve, int sinfo)
 {
-	spin_lock(&cache->space_info->lock);
-	spin_lock(&cache->lock);
-	if (reserve) {
-		cache->reserved += num_bytes;
-		cache->space_info->bytes_reserved += num_bytes;
+	int ret = 0;
+	if (sinfo) {
+		struct btrfs_space_info *space_info = cache->space_info;
+		spin_lock(&space_info->lock);
+		spin_lock(&cache->lock);
+		if (reserve) {
+			if (cache->ro) {
+				ret = -EAGAIN;
+			} else {
+				cache->reserved += num_bytes;
+				space_info->bytes_reserved += num_bytes;
+			}
+		} else {
+			if (cache->ro)
+				space_info->bytes_readonly += num_bytes;
+			cache->reserved -= num_bytes;
+			space_info->bytes_reserved -= num_bytes;
+		}
+		spin_unlock(&cache->lock);
+		spin_unlock(&space_info->lock);
 	} else {
-		cache->reserved -= num_bytes;
-		cache->space_info->bytes_reserved -= num_bytes;
+		spin_lock(&cache->lock);
+		if (cache->ro) {
+			ret = -EAGAIN;
+		} else {
+			if (reserve)
+				cache->reserved += num_bytes;
+			else
+				cache->reserved -= num_bytes;
+		}
+		spin_unlock(&cache->lock);
 	}
-	spin_unlock(&cache->lock);
-	spin_unlock(&cache->space_info->lock);
-	return 0;
+	return ret;
 }
 
 int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
@@ -3611,6 +3964,8 @@ int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
 		fs_info->pinned_extents = &fs_info->freed_extents[0];
 
 	up_write(&fs_info->extent_commit_sem);
+
+	update_global_block_rsv(fs_info);
 	return 0;
 }
 
@@ -3637,14 +3992,21 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
 			btrfs_add_free_space(cache, start, len);
 		}
 
+		start += len;
+
 		spin_lock(&cache->space_info->lock);
 		spin_lock(&cache->lock);
 		cache->pinned -= len;
 		cache->space_info->bytes_pinned -= len;
+		if (cache->ro) {
+			cache->space_info->bytes_readonly += len;
+		} else if (cache->reserved_pinned > 0) {
+			len = min(len, cache->reserved_pinned);
+			cache->reserved_pinned -= len;
+			cache->space_info->bytes_reserved += len;
+		}
 		spin_unlock(&cache->lock);
 		spin_unlock(&cache->space_info->lock);
-
-		start += len;
 	}
 
 	if (cache)
@@ -3657,8 +4019,11 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct extent_io_tree *unpin;
+	struct btrfs_block_rsv *block_rsv;
+	struct btrfs_block_rsv *next_rsv;
 	u64 start;
 	u64 end;
+	int idx;
 	int ret;
 
 	if (fs_info->pinned_extents == &fs_info->freed_extents[0])
@@ -3679,59 +4044,30 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 		cond_resched();
 	}
 
-	return ret;
-}
-
-static int pin_down_bytes(struct btrfs_trans_handle *trans,
-			  struct btrfs_root *root,
-			  struct btrfs_path *path,
-			  u64 bytenr, u64 num_bytes,
-			  int is_data, int reserved,
-			  struct extent_buffer **must_clean)
-{
-	int err = 0;
-	struct extent_buffer *buf;
-
-	if (is_data)
-		goto pinit;
-
-	/*
-	 * discard is sloooow, and so triggering discards on
-	 * individual btree blocks isn't a good plan.  Just
-	 * pin everything in discard mode.
-	 */
-	if (btrfs_test_opt(root, DISCARD))
-		goto pinit;
+	mutex_lock(&fs_info->durable_block_rsv_mutex);
+	list_for_each_entry_safe(block_rsv, next_rsv,
+				 &fs_info->durable_block_rsv_list, list) {
 
-	buf = btrfs_find_tree_block(root, bytenr, num_bytes);
-	if (!buf)
-		goto pinit;
+		idx = trans->transid & 0x1;
+		if (block_rsv->freed[idx] > 0) {
+			block_rsv_add_bytes(block_rsv,
+					    block_rsv->freed[idx], 0);
+			block_rsv->freed[idx] = 0;
+		}
+		if (atomic_read(&block_rsv->usage) == 0) {
+			btrfs_block_rsv_release(root, block_rsv, (u64)-1);
 
-	/* we can reuse a block if it hasn't been written
-	 * and it is from this transaction.  We can't
-	 * reuse anything from the tree log root because
-	 * it has tiny sub-transactions.
-	 */
-	if (btrfs_buffer_uptodate(buf, 0) &&
-	    btrfs_try_tree_lock(buf)) {
-		u64 header_owner = btrfs_header_owner(buf);
-		u64 header_transid = btrfs_header_generation(buf);
-		if (header_owner != BTRFS_TREE_LOG_OBJECTID &&
-		    header_transid == trans->transid &&
-		    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
-			*must_clean = buf;
-			return 1;
+			if (block_rsv->freed[0] == 0 &&
+			    block_rsv->freed[1] == 0) {
+				list_del_init(&block_rsv->list);
+				kfree(block_rsv);
+			}
+		} else {
+			btrfs_block_rsv_release(root, block_rsv, 0);
 		}
-		btrfs_tree_unlock(buf);
 	}
-	free_extent_buffer(buf);
-pinit:
-	if (path)
-		btrfs_set_path_blocking(path);
-	/* unlocks the pinned mutex */
-	btrfs_pin_extent(root, bytenr, num_bytes, reserved);
+	mutex_unlock(&fs_info->durable_block_rsv_mutex);
 
-	BUG_ON(err < 0);
 	return 0;
 }
 
@@ -3892,9 +4228,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 			BUG_ON(ret);
 		}
 	} else {
-		int mark_free = 0;
-		struct extent_buffer *must_clean = NULL;
-
 		if (found_extent) {
 			BUG_ON(is_data && refs_to_drop !=
 			       extent_data_ref_count(root, path, iref));
@@ -3907,31 +4240,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 			}
 		}
 
-		ret = pin_down_bytes(trans, root, path, bytenr,
-				     num_bytes, is_data, 0, &must_clean);
-		if (ret > 0)
-			mark_free = 1;
-		BUG_ON(ret < 0);
-		/*
-		 * it is going to be very rare for someone to be waiting
-		 * on the block we're freeing.  del_items might need to
-		 * schedule, so rather than get fancy, just force it
-		 * to blocking here
-		 */
-		if (must_clean)
-			btrfs_set_lock_blocking(must_clean);
-
 		ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
 				      num_to_del);
 		BUG_ON(ret);
 		btrfs_release_path(extent_root, path);
 
-		if (must_clean) {
-			clean_tree_block(NULL, root, must_clean);
-			btrfs_tree_unlock(must_clean);
-			free_extent_buffer(must_clean);
-		}
-
 		if (is_data) {
 			ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
 			BUG_ON(ret);
@@ -3941,8 +4254,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 			     (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT);
 		}
 
-		ret = update_block_group(trans, root, bytenr, num_bytes, 0,
-					 mark_free);
+		ret = update_block_group(trans, root, bytenr, num_bytes, 0);
 		BUG_ON(ret);
 	}
 	btrfs_free_path(path);
@@ -3950,7 +4262,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 }
 
 /*
- * when we free an extent, it is possible (and likely) that we free the last
+ * when we free an block, it is possible (and likely) that we free the last
  * delayed ref for that extent as well.  This searches the delayed ref tree for
  * a given extent, and if there are no other delayed refs to be processed, it
  * removes it from the tree.
@@ -3962,7 +4274,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
 	struct btrfs_delayed_ref_root *delayed_refs;
 	struct btrfs_delayed_ref_node *ref;
 	struct rb_node *node;
-	int ret;
+	int ret = 0;
 
 	delayed_refs = &trans->transaction->delayed_refs;
 	spin_lock(&delayed_refs->lock);
@@ -4014,17 +4326,100 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
 	list_del_init(&head->cluster);
 	spin_unlock(&delayed_refs->lock);
 
-	ret = run_one_delayed_ref(trans, root->fs_info->tree_root,
-				  &head->node, head->extent_op,
-				  head->must_insert_reserved);
-	BUG_ON(ret);
+	BUG_ON(head->extent_op);
+	if (head->must_insert_reserved)
+		ret = 1;
+
+	mutex_unlock(&head->mutex);
 	btrfs_put_delayed_ref(&head->node);
-	return 0;
+	return ret;
 out:
 	spin_unlock(&delayed_refs->lock);
 	return 0;
 }
 
+void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *root,
+			   struct extent_buffer *buf,
+			   u64 parent, int last_ref)
+{
+	struct btrfs_block_rsv *block_rsv;
+	struct btrfs_block_group_cache *cache = NULL;
+	int ret;
+
+	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
+		ret = btrfs_add_delayed_tree_ref(trans, buf->start, buf->len,
+						parent, root->root_key.objectid,
+						btrfs_header_level(buf),
+						BTRFS_DROP_DELAYED_REF, NULL);
+		BUG_ON(ret);
+	}
+
+	if (!last_ref)
+		return;
+
+	block_rsv = get_block_rsv(trans, root);
+	cache = btrfs_lookup_block_group(root->fs_info, buf->start);
+	if (block_rsv->space_info != cache->space_info)
+		goto out;
+
+	if (btrfs_header_generation(buf) == trans->transid) {
+		if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
+			ret = check_ref_cleanup(trans, root, buf->start);
+			if (!ret)
+				goto pin;
+		}
+
+		if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
+			pin_down_extent(root, cache, buf->start, buf->len, 1);
+			goto pin;
+		}
+
+		WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
+
+		btrfs_add_free_space(cache, buf->start, buf->len);
+		ret = update_reserved_bytes(cache, buf->len, 0, 0);
+		if (ret == -EAGAIN) {
+			/* block group became read-only */
+			update_reserved_bytes(cache, buf->len, 0, 1);
+			goto out;
+		}
+
+		ret = 1;
+		spin_lock(&block_rsv->lock);
+		if (block_rsv->reserved < block_rsv->size) {
+			block_rsv->reserved += buf->len;
+			ret = 0;
+		}
+		spin_unlock(&block_rsv->lock);
+
+		if (ret) {
+			spin_lock(&cache->space_info->lock);
+			cache->space_info->bytes_reserved -= buf->len;
+			spin_unlock(&cache->space_info->lock);
+		}
+		goto out;
+	}
+pin:
+	if (block_rsv->durable && !cache->ro) {
+		ret = 0;
+		spin_lock(&cache->lock);
+		if (!cache->ro) {
+			cache->reserved_pinned += buf->len;
+			ret = 1;
+		}
+		spin_unlock(&cache->lock);
+
+		if (ret) {
+			spin_lock(&block_rsv->lock);
+			block_rsv->freed[trans->transid & 0x1] += buf->len;
+			spin_unlock(&block_rsv->lock);
+		}
+	}
+out:
+	btrfs_put_block_group(cache);
+}
+
 int btrfs_free_extent(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *root,
 		      u64 bytenr, u64 num_bytes, u64 parent,
@@ -4046,8 +4441,6 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
 					parent, root_objectid, (int)owner,
 					BTRFS_DROP_DELAYED_REF, NULL);
 		BUG_ON(ret);
-		ret = check_ref_cleanup(trans, root, bytenr);
-		BUG_ON(ret);
 	} else {
 		ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
 					parent, root_objectid, owner,
@@ -4057,21 +4450,6 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
-int btrfs_free_tree_block(struct btrfs_trans_handle *trans,
-			  struct btrfs_root *root,
-			  u64 bytenr, u32 blocksize,
-			  u64 parent, u64 root_objectid, int level)
-{
-	u64 used;
-	spin_lock(&root->node_lock);
-	used = btrfs_root_used(&root->root_item) - blocksize;
-	btrfs_set_root_used(&root->root_item, used);
-	spin_unlock(&root->node_lock);
-
-	return btrfs_free_extent(trans, root, bytenr, blocksize,
-				 parent, root_objectid, level, 0);
-}
-
 static u64 stripe_align(struct btrfs_root *root, u64 val)
 {
 	u64 mask = ((u64)root->stripesize - 1);
@@ -4124,6 +4502,22 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
 	return 0;
 }
 
+static int get_block_group_index(struct btrfs_block_group_cache *cache)
+{
+	int index;
+	if (cache->flags & BTRFS_BLOCK_GROUP_RAID10)
+		index = 0;
+	else if (cache->flags & BTRFS_BLOCK_GROUP_RAID1)
+		index = 1;
+	else if (cache->flags & BTRFS_BLOCK_GROUP_DUP)
+		index = 2;
+	else if (cache->flags & BTRFS_BLOCK_GROUP_RAID0)
+		index = 3;
+	else
+		index = 4;
+	return index;
+}
+
 enum btrfs_loop_type {
 	LOOP_FIND_IDEAL = 0,
 	LOOP_CACHING_NOWAIT = 1,
@@ -4145,7 +4539,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
 				     u64 num_bytes, u64 empty_size,
 				     u64 search_start, u64 search_end,
 				     u64 hint_byte, struct btrfs_key *ins,
-				     u64 exclude_start, u64 exclude_nr,
 				     int data)
 {
 	int ret = 0;
@@ -4158,6 +4551,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
 	struct btrfs_space_info *space_info;
 	int last_ptr_loop = 0;
 	int loop = 0;
+	int index = 0;
 	bool found_uncached_bg = false;
 	bool failed_cluster_refill = false;
 	bool failed_alloc = false;
@@ -4170,6 +4564,10 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
 	ins->offset = 0;
 
 	space_info = __find_space_info(root->fs_info, data);
+	if (!space_info) {
+		printk(KERN_ERR "No space info for %d\n", data);
+		return -ENOSPC;
+	}
 
 	if (orig_root->ref_cows || empty_size)
 		allowed_chunk_alloc = 1;
@@ -4223,6 +4621,7 @@ ideal_cache:
 				btrfs_put_block_group(block_group);
 				up_read(&space_info->groups_sem);
 			} else {
+				index = get_block_group_index(block_group);
 				goto have_block_group;
 			}
 		} else if (block_group) {
@@ -4231,7 +4630,8 @@ ideal_cache:
 	}
 search:
 	down_read(&space_info->groups_sem);
-	list_for_each_entry(block_group, &space_info->block_groups, list) {
+	list_for_each_entry(block_group, &space_info->block_groups[index],
+			    list) {
 		u64 offset;
 		int cached;
 
@@ -4422,23 +4822,22 @@ checks:
 			goto loop;
 		}
 
-		if (exclude_nr > 0 &&
-		    (search_start + num_bytes > exclude_start &&
-		     search_start < exclude_start + exclude_nr)) {
-			search_start = exclude_start + exclude_nr;
+		ins->objectid = search_start;
+		ins->offset = num_bytes;
+
+		if (offset < search_start)
+			btrfs_add_free_space(block_group, offset,
+					     search_start - offset);
+		BUG_ON(offset > search_start);
 
+		ret = update_reserved_bytes(block_group, num_bytes, 1,
+					    (data & BTRFS_BLOCK_GROUP_DATA));
+		if (ret == -EAGAIN) {
 			btrfs_add_free_space(block_group, offset, num_bytes);
-			/*
-			 * if search_start is still in this block group
-			 * then we just re-search this block group
-			 */
-			if (search_start >= block_group->key.objectid &&
-			    search_start < (block_group->key.objectid +
-					    block_group->key.offset))
-				goto have_block_group;
 			goto loop;
 		}
 
+		/* we are all good, lets return */
 		ins->objectid = search_start;
 		ins->offset = num_bytes;
 
@@ -4446,18 +4845,18 @@ checks:
 			btrfs_add_free_space(block_group, offset,
 					     search_start - offset);
 		BUG_ON(offset > search_start);
-
-		update_reserved_extents(block_group, num_bytes, 1);
-
-		/* we are all good, lets return */
 		break;
 loop:
 		failed_cluster_refill = false;
 		failed_alloc = false;
+		BUG_ON(index != get_block_group_index(block_group));
 		btrfs_put_block_group(block_group);
 	}
 	up_read(&space_info->groups_sem);
 
+	if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
+		goto search;
+
 	/* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait for
 	 *			for them to make caching progress.  Also
 	 *			determine the best possible bg to cache
@@ -4471,6 +4870,7 @@ loop:
 	if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE &&
 	    (found_uncached_bg || empty_size || empty_cluster ||
 	     allowed_chunk_alloc)) {
+		index = 0;
 		if (loop == LOOP_FIND_IDEAL && found_uncached_bg) {
 			found_uncached_bg = false;
 			loop++;
@@ -4553,31 +4953,30 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
 			    int dump_block_groups)
 {
 	struct btrfs_block_group_cache *cache;
+	int index = 0;
 
 	spin_lock(&info->lock);
 	printk(KERN_INFO "space_info has %llu free, is %sfull\n",
 	       (unsigned long long)(info->total_bytes - info->bytes_used -
 				    info->bytes_pinned - info->bytes_reserved -
-				    info->bytes_super),
+				    info->bytes_readonly),
 	       (info->full) ? "" : "not ");
-	printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu,"
-	       " may_use=%llu, used=%llu, root=%llu, super=%llu, reserved=%llu"
-	       "\n",
+	printk(KERN_INFO "space_info total=%llu, used=%llu, pinned=%llu, "
+	       "reserved=%llu, may_use=%llu, readonly=%llu\n",
 	       (unsigned long long)info->total_bytes,
+	       (unsigned long long)info->bytes_used,
 	       (unsigned long long)info->bytes_pinned,
-	       (unsigned long long)info->bytes_delalloc,
+	       (unsigned long long)info->bytes_reserved,
 	       (unsigned long long)info->bytes_may_use,
-	       (unsigned long long)info->bytes_used,
-	       (unsigned long long)info->bytes_root,
-	       (unsigned long long)info->bytes_super,
-	       (unsigned long long)info->bytes_reserved);
+	       (unsigned long long)info->bytes_readonly);
 	spin_unlock(&info->lock);
 
 	if (!dump_block_groups)
 		return;
 
 	down_read(&info->groups_sem);
-	list_for_each_entry(cache, &info->block_groups, list) {
+again:
+	list_for_each_entry(cache, &info->block_groups[index], list) {
 		spin_lock(&cache->lock);
 		printk(KERN_INFO "block group %llu has %llu bytes, %llu used "
 		       "%llu pinned %llu reserved\n",
@@ -4589,6 +4988,8 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
 		btrfs_dump_free_space(cache, bytes);
 		spin_unlock(&cache->lock);
 	}
+	if (++index < BTRFS_NR_RAID_TYPES)
+		goto again;
 	up_read(&info->groups_sem);
 }
 
@@ -4614,9 +5015,8 @@ again:
 
 	WARN_ON(num_bytes < root->sectorsize);
 	ret = find_free_extent(trans, root, num_bytes, empty_size,
-			       search_start, search_end, hint_byte, ins,
-			       trans->alloc_exclude_start,
-			       trans->alloc_exclude_nr, data);
+			       search_start, search_end, hint_byte,
+			       ins, data);
 
 	if (ret == -ENOSPC && num_bytes > min_alloc_size) {
 		num_bytes = num_bytes >> 1;
@@ -4654,7 +5054,7 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
 	ret = btrfs_discard_extent(root, start, len);
 
 	btrfs_add_free_space(cache, start, len);
-	update_reserved_extents(cache, len, 0);
+	update_reserved_bytes(cache, len, 0, 1);
 	btrfs_put_block_group(cache);
 
 	return ret;
@@ -4717,8 +5117,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
 	btrfs_mark_buffer_dirty(path->nodes[0]);
 	btrfs_free_path(path);
 
-	ret = update_block_group(trans, root, ins->objectid, ins->offset,
-				 1, 0);
+	ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
 	if (ret) {
 		printk(KERN_ERR "btrfs update block group failed for %llu "
 		       "%llu\n", (unsigned long long)ins->objectid,
@@ -4778,8 +5177,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
 	btrfs_mark_buffer_dirty(leaf);
 	btrfs_free_path(path);
 
-	ret = update_block_group(trans, root, ins->objectid, ins->offset,
-				 1, 0);
+	ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
 	if (ret) {
 		printk(KERN_ERR "btrfs update block group failed for %llu "
 		       "%llu\n", (unsigned long long)ins->objectid,
@@ -4855,73 +5253,14 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
 		put_caching_control(caching_ctl);
 	}
 
-	update_reserved_extents(block_group, ins->offset, 1);
+	ret = update_reserved_bytes(block_group, ins->offset, 1, 1);
+	BUG_ON(ret);
 	btrfs_put_block_group(block_group);
 	ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
 					 0, owner, offset, ins, 1);
 	return ret;
 }
 
-/*
- * finds a free extent and does all the dirty work required for allocation
- * returns the key for the extent through ins, and a tree buffer for
- * the first block of the extent through buf.
- *
- * returns 0 if everything worked, non-zero otherwise.
- */
-static int alloc_tree_block(struct btrfs_trans_handle *trans,
-			    struct btrfs_root *root,
-			    u64 num_bytes, u64 parent, u64 root_objectid,
-			    struct btrfs_disk_key *key, int level,
-			    u64 empty_size, u64 hint_byte, u64 search_end,
-			    struct btrfs_key *ins)
-{
-	int ret;
-	u64 flags = 0;
-
-	ret = btrfs_reserve_extent(trans, root, num_bytes, num_bytes,
-				   empty_size, hint_byte, search_end,
-				   ins, 0);
-	if (ret)
-		return ret;
-
-	if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
-		if (parent == 0)
-			parent = ins->objectid;
-		flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
-	} else
-		BUG_ON(parent > 0);
-
-	if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
-		struct btrfs_delayed_extent_op *extent_op;
-		extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
-		BUG_ON(!extent_op);
-		if (key)
-			memcpy(&extent_op->key, key, sizeof(extent_op->key));
-		else
-			memset(&extent_op->key, 0, sizeof(extent_op->key));
-		extent_op->flags_to_set = flags;
-		extent_op->update_key = 1;
-		extent_op->update_flags = 1;
-		extent_op->is_data = 0;
-
-		ret = btrfs_add_delayed_tree_ref(trans, ins->objectid,
-					ins->offset, parent, root_objectid,
-					level, BTRFS_ADD_DELAYED_EXTENT,
-					extent_op);
-		BUG_ON(ret);
-	}
-
-	if (root_objectid == root->root_key.objectid) {
-		u64 used;
-		spin_lock(&root->node_lock);
-		used = btrfs_root_used(&root->root_item) + num_bytes;
-		btrfs_set_root_used(&root->root_item, used);
-		spin_unlock(&root->node_lock);
-	}
-	return ret;
-}
-
 struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
 					    struct btrfs_root *root,
 					    u64 bytenr, u32 blocksize,
@@ -4960,8 +5299,45 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
 	return buf;
 }
 
+static struct btrfs_block_rsv *
+use_block_rsv(struct btrfs_trans_handle *trans,
+	      struct btrfs_root *root, u32 blocksize)
+{
+	struct btrfs_block_rsv *block_rsv;
+	int ret;
+
+	block_rsv = get_block_rsv(trans, root);
+
+	if (block_rsv->size == 0) {
+		ret = reserve_metadata_bytes(block_rsv, blocksize);
+		if (ret)
+			return ERR_PTR(ret);
+		return block_rsv;
+	}
+
+	ret = block_rsv_use_bytes(block_rsv, blocksize);
+	if (!ret)
+		return block_rsv;
+
+	WARN_ON(1);
+	printk(KERN_INFO"block_rsv size %llu reserved %llu freed %llu %llu\n",
+		block_rsv->size, block_rsv->reserved,
+		block_rsv->freed[0], block_rsv->freed[1]);
+
+	return ERR_PTR(-ENOSPC);
+}
+
+static void unuse_block_rsv(struct btrfs_block_rsv *block_rsv, u32 blocksize)
+{
+	block_rsv_add_bytes(block_rsv, blocksize, 0);
+	block_rsv_release_bytes(block_rsv, NULL, 0);
+}
+
 /*
- * helper function to allocate a block for a given tree
+ * finds a free extent and does all the dirty work required for allocation
+ * returns the key for the extent through ins, and a tree buffer for
+ * the first block of the extent through buf.
+ *
  * returns the tree buffer or NULL.
  */
 struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
@@ -4971,18 +5347,53 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 					u64 hint, u64 empty_size)
 {
 	struct btrfs_key ins;
-	int ret;
+	struct btrfs_block_rsv *block_rsv;
 	struct extent_buffer *buf;
+	u64 flags = 0;
+	int ret;
 
-	ret = alloc_tree_block(trans, root, blocksize, parent, root_objectid,
-			       key, level, empty_size, hint, (u64)-1, &ins);
+
+	block_rsv = use_block_rsv(trans, root, blocksize);
+	if (IS_ERR(block_rsv))
+		return ERR_CAST(block_rsv);
+
+	ret = btrfs_reserve_extent(trans, root, blocksize, blocksize,
+				   empty_size, hint, (u64)-1, &ins, 0);
 	if (ret) {
-		BUG_ON(ret > 0);
+		unuse_block_rsv(block_rsv, blocksize);
 		return ERR_PTR(ret);
 	}
 
 	buf = btrfs_init_new_buffer(trans, root, ins.objectid,
 				    blocksize, level);
+	BUG_ON(IS_ERR(buf));
+
+	if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
+		if (parent == 0)
+			parent = ins.objectid;
+		flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
+	} else
+		BUG_ON(parent > 0);
+
+	if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
+		struct btrfs_delayed_extent_op *extent_op;
+		extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
+		BUG_ON(!extent_op);
+		if (key)
+			memcpy(&extent_op->key, key, sizeof(extent_op->key));
+		else
+			memset(&extent_op->key, 0, sizeof(extent_op->key));
+		extent_op->flags_to_set = flags;
+		extent_op->update_key = 1;
+		extent_op->update_flags = 1;
+		extent_op->is_data = 0;
+
+		ret = btrfs_add_delayed_tree_ref(trans, ins.objectid,
+					ins.offset, parent, root_objectid,
+					level, BTRFS_ADD_DELAYED_EXTENT,
+					extent_op);
+		BUG_ON(ret);
+	}
 	return buf;
 }
 
@@ -5205,6 +5616,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
 	next = btrfs_find_tree_block(root, bytenr, blocksize);
 	if (!next) {
 		next = btrfs_find_create_tree_block(root, bytenr, blocksize);
+		if (!next)
+			return -ENOMEM;
 		reada = 1;
 	}
 	btrfs_tree_lock(next);
@@ -5305,7 +5718,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
 				 struct btrfs_path *path,
 				 struct walk_control *wc)
 {
-	int ret = 0;
+	int ret;
 	int level = wc->level;
 	struct extent_buffer *eb = path->nodes[level];
 	u64 parent = 0;
@@ -5383,13 +5796,11 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
 			       btrfs_header_owner(path->nodes[level + 1]));
 	}
 
-	ret = btrfs_free_extent(trans, root, eb->start, eb->len, parent,
-				root->root_key.objectid, level, 0);
-	BUG_ON(ret);
+	btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
 out:
 	wc->refs[level] = 0;
 	wc->flags[level] = 0;
-	return ret;
+	return 0;
 }
 
 static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
@@ -5402,10 +5813,6 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
 	int ret;
 
 	while (level >= 0) {
-		if (path->slots[level] >=
-		    btrfs_header_nritems(path->nodes[level]))
-			break;
-
 		ret = walk_down_proc(trans, root, path, wc, lookup_info);
 		if (ret > 0)
 			break;
@@ -5413,11 +5820,16 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
 		if (level == 0)
 			break;
 
+		if (path->slots[level] >=
+		    btrfs_header_nritems(path->nodes[level]))
+			break;
+
 		ret = do_walk_down(trans, root, path, wc, &lookup_info);
 		if (ret > 0) {
 			path->slots[level]++;
 			continue;
-		}
+		} else if (ret < 0)
+			return ret;
 		level = wc->level;
 	}
 	return 0;
@@ -5466,7 +5878,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
  * also make sure backrefs for the shared block and all lower level
  * blocks are properly updated.
  */
-int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
+int btrfs_drop_snapshot(struct btrfs_root *root,
+			struct btrfs_block_rsv *block_rsv, int update_ref)
 {
 	struct btrfs_path *path;
 	struct btrfs_trans_handle *trans;
@@ -5484,7 +5897,9 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
 	wc = kzalloc(sizeof(*wc), GFP_NOFS);
 	BUG_ON(!wc);
 
-	trans = btrfs_start_transaction(tree_root, 1);
+	trans = btrfs_start_transaction(tree_root, 0);
+	if (block_rsv)
+		trans->block_rsv = block_rsv;
 
 	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
 		level = btrfs_header_level(root->node);
@@ -5572,22 +5987,16 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
 		}
 
 		BUG_ON(wc->level == 0);
-		if (trans->transaction->in_commit ||
-		    trans->transaction->delayed_refs.flushing) {
+		if (btrfs_should_end_transaction(trans, tree_root)) {
 			ret = btrfs_update_root(trans, tree_root,
 						&root->root_key,
 						root_item);
 			BUG_ON(ret);
 
-			btrfs_end_transaction(trans, tree_root);
-			trans = btrfs_start_transaction(tree_root, 1);
-		} else {
-			unsigned long update;
-			update = trans->delayed_ref_updates;
-			trans->delayed_ref_updates = 0;
-			if (update)
-				btrfs_run_delayed_refs(trans, tree_root,
-						       update);
+			btrfs_end_transaction_throttle(trans, tree_root);
+			trans = btrfs_start_transaction(tree_root, 0);
+			if (block_rsv)
+				trans->block_rsv = block_rsv;
 		}
 	}
 	btrfs_release_path(root, path);
@@ -5615,7 +6024,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
 		kfree(root);
 	}
 out:
-	btrfs_end_transaction(trans, tree_root);
+	btrfs_end_transaction_throttle(trans, tree_root);
 	kfree(wc);
 	btrfs_free_path(path);
 	return err;
@@ -6561,6 +6970,7 @@ static noinline int invalidate_extent_cache(struct btrfs_root *root,
 	struct btrfs_key key;
 	struct inode *inode = NULL;
 	struct btrfs_file_extent_item *fi;
+	struct extent_state *cached_state = NULL;
 	u64 num_bytes;
 	u64 skip_objectid = 0;
 	u32 nritems;
@@ -6589,12 +6999,14 @@ static noinline int invalidate_extent_cache(struct btrfs_root *root,
 		}
 		num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
 
-		lock_extent(&BTRFS_I(inode)->io_tree, key.offset,
-			    key.offset + num_bytes - 1, GFP_NOFS);
+		lock_extent_bits(&BTRFS_I(inode)->io_tree, key.offset,
+				 key.offset + num_bytes - 1, 0, &cached_state,
+				 GFP_NOFS);
 		btrfs_drop_extent_cache(inode, key.offset,
 					key.offset + num_bytes - 1, 1);
-		unlock_extent(&BTRFS_I(inode)->io_tree, key.offset,
-			      key.offset + num_bytes - 1, GFP_NOFS);
+		unlock_extent_cached(&BTRFS_I(inode)->io_tree, key.offset,
+				     key.offset + num_bytes - 1, &cached_state,
+				     GFP_NOFS);
 		cond_resched();
 	}
 	iput(inode);
@@ -7208,48 +7620,80 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
 	return flags;
 }
 
-static int __alloc_chunk_for_shrink(struct btrfs_root *root,
-		     struct btrfs_block_group_cache *shrink_block_group,
-		     int force)
+static int set_block_group_ro(struct btrfs_block_group_cache *cache)
 {
-	struct btrfs_trans_handle *trans;
-	u64 new_alloc_flags;
-	u64 calc;
+	struct btrfs_space_info *sinfo = cache->space_info;
+	u64 num_bytes;
+	int ret = -ENOSPC;
 
-	spin_lock(&shrink_block_group->lock);
-	if (btrfs_block_group_used(&shrink_block_group->item) +
-	    shrink_block_group->reserved > 0) {
-		spin_unlock(&shrink_block_group->lock);
+	if (cache->ro)
+		return 0;
 
-		trans = btrfs_start_transaction(root, 1);
-		spin_lock(&shrink_block_group->lock);
+	spin_lock(&sinfo->lock);
+	spin_lock(&cache->lock);
+	num_bytes = cache->key.offset - cache->reserved - cache->pinned -
+		    cache->bytes_super - btrfs_block_group_used(&cache->item);
+
+	if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
+	    sinfo->bytes_may_use + sinfo->bytes_readonly +
+	    cache->reserved_pinned + num_bytes < sinfo->total_bytes) {
+		sinfo->bytes_readonly += num_bytes;
+		sinfo->bytes_reserved += cache->reserved_pinned;
+		cache->reserved_pinned = 0;
+		cache->ro = 1;
+		ret = 0;
+	}
+	spin_unlock(&cache->lock);
+	spin_unlock(&sinfo->lock);
+	return ret;
+}
 
-		new_alloc_flags = update_block_group_flags(root,
-						   shrink_block_group->flags);
-		if (new_alloc_flags != shrink_block_group->flags) {
-			calc =
-			     btrfs_block_group_used(&shrink_block_group->item);
-		} else {
-			calc = shrink_block_group->key.offset;
-		}
-		spin_unlock(&shrink_block_group->lock);
+int btrfs_set_block_group_ro(struct btrfs_root *root,
+			     struct btrfs_block_group_cache *cache)
 
-		do_chunk_alloc(trans, root->fs_info->extent_root,
-			       calc + 2 * 1024 * 1024, new_alloc_flags, force);
+{
+	struct btrfs_trans_handle *trans;
+	u64 alloc_flags;
+	int ret;
 
-		btrfs_end_transaction(trans, root);
-	} else
-		spin_unlock(&shrink_block_group->lock);
-	return 0;
-}
+	BUG_ON(cache->ro);
 
+	trans = btrfs_join_transaction(root, 1);
+	BUG_ON(IS_ERR(trans));
 
-int btrfs_prepare_block_group_relocation(struct btrfs_root *root,
-					 struct btrfs_block_group_cache *group)
+	alloc_flags = update_block_group_flags(root, cache->flags);
+	if (alloc_flags != cache->flags)
+		do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1);
 
+	ret = set_block_group_ro(cache);
+	if (!ret)
+		goto out;
+	alloc_flags = get_alloc_profile(root, cache->space_info->flags);
+	ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1);
+	if (ret < 0)
+		goto out;
+	ret = set_block_group_ro(cache);
+out:
+	btrfs_end_transaction(trans, root);
+	return ret;
+}
+
+int btrfs_set_block_group_rw(struct btrfs_root *root,
+			      struct btrfs_block_group_cache *cache)
 {
-	__alloc_chunk_for_shrink(root, group, 1);
-	set_block_group_readonly(group);
+	struct btrfs_space_info *sinfo = cache->space_info;
+	u64 num_bytes;
+
+	BUG_ON(!cache->ro);
+
+	spin_lock(&sinfo->lock);
+	spin_lock(&cache->lock);
+	num_bytes = cache->key.offset - cache->reserved - cache->pinned -
+		    cache->bytes_super - btrfs_block_group_used(&cache->item);
+	sinfo->bytes_readonly -= num_bytes;
+	cache->ro = 0;
+	spin_unlock(&cache->lock);
+	spin_unlock(&sinfo->lock);
 	return 0;
 }
 
@@ -7366,7 +7810,6 @@ static int find_first_block_group(struct btrfs_root *root,
 		}
 		path->slots[0]++;
 	}
-	ret = -ENOENT;
 out:
 	return ret;
 }
@@ -7417,17 +7860,33 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
 	 */
 	synchronize_rcu();
 
+	release_global_block_rsv(info);
+
 	while(!list_empty(&info->space_info)) {
 		space_info = list_entry(info->space_info.next,
 					struct btrfs_space_info,
 					list);
-
+		if (space_info->bytes_pinned > 0 ||
+		    space_info->bytes_reserved > 0) {
+			WARN_ON(1);
+			dump_space_info(space_info, 0, 0);
+		}
 		list_del(&space_info->list);
 		kfree(space_info);
 	}
 	return 0;
 }
 
+static void __link_block_group(struct btrfs_space_info *space_info,
+			       struct btrfs_block_group_cache *cache)
+{
+	int index = get_block_group_index(cache);
+
+	down_write(&space_info->groups_sem);
+	list_add_tail(&cache->list, &space_info->block_groups[index]);
+	up_write(&space_info->groups_sem);
+}
+
 int btrfs_read_block_groups(struct btrfs_root *root)
 {
 	struct btrfs_path *path;
@@ -7449,10 +7908,8 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 
 	while (1) {
 		ret = find_first_block_group(root, path, &key);
-		if (ret > 0) {
-			ret = 0;
-			goto error;
-		}
+		if (ret > 0)
+			break;
 		if (ret != 0)
 			goto error;
 
@@ -7461,7 +7918,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 		cache = kzalloc(sizeof(*cache), GFP_NOFS);
 		if (!cache) {
 			ret = -ENOMEM;
-			break;
+			goto error;
 		}
 
 		atomic_set(&cache->count, 1);
@@ -7518,20 +7975,36 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 		BUG_ON(ret);
 		cache->space_info = space_info;
 		spin_lock(&cache->space_info->lock);
-		cache->space_info->bytes_super += cache->bytes_super;
+		cache->space_info->bytes_readonly += cache->bytes_super;
 		spin_unlock(&cache->space_info->lock);
 
-		down_write(&space_info->groups_sem);
-		list_add_tail(&cache->list, &space_info->block_groups);
-		up_write(&space_info->groups_sem);
+		__link_block_group(space_info, cache);
 
 		ret = btrfs_add_block_group_cache(root->fs_info, cache);
 		BUG_ON(ret);
 
 		set_avail_alloc_bits(root->fs_info, cache->flags);
 		if (btrfs_chunk_readonly(root, cache->key.objectid))
-			set_block_group_readonly(cache);
+			set_block_group_ro(cache);
 	}
+
+	list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
+		if (!(get_alloc_profile(root, space_info->flags) &
+		      (BTRFS_BLOCK_GROUP_RAID10 |
+		       BTRFS_BLOCK_GROUP_RAID1 |
+		       BTRFS_BLOCK_GROUP_DUP)))
+			continue;
+		/*
+		 * avoid allocating from un-mirrored block group if there are
+		 * mirrored block groups.
+		 */
+		list_for_each_entry(cache, &space_info->block_groups[3], list)
+			set_block_group_ro(cache);
+		list_for_each_entry(cache, &space_info->block_groups[4], list)
+			set_block_group_ro(cache);
+	}
+
+	init_global_block_rsv(info);
 	ret = 0;
 error:
 	btrfs_free_path(path);
@@ -7592,12 +8065,10 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 	BUG_ON(ret);
 
 	spin_lock(&cache->space_info->lock);
-	cache->space_info->bytes_super += cache->bytes_super;
+	cache->space_info->bytes_readonly += cache->bytes_super;
 	spin_unlock(&cache->space_info->lock);
 
-	down_write(&cache->space_info->groups_sem);
-	list_add_tail(&cache->list, &cache->space_info->block_groups);
-	up_write(&cache->space_info->groups_sem);
+	__link_block_group(cache->space_info, cache);
 
 	ret = btrfs_add_block_group_cache(root->fs_info, cache);
 	BUG_ON(ret);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 96577e8bf9fd..d74e6af9b53a 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2,7 +2,6 @@
 #include <linux/slab.h>
 #include <linux/bio.h>
 #include <linux/mm.h>
-#include <linux/gfp.h>
 #include <linux/pagemap.h>
 #include <linux/page-flags.h>
 #include <linux/module.h>
@@ -104,8 +103,8 @@ void extent_io_exit(void)
 void extent_io_tree_init(struct extent_io_tree *tree,
 			  struct address_space *mapping, gfp_t mask)
 {
-	tree->state.rb_node = NULL;
-	tree->buffer.rb_node = NULL;
+	tree->state = RB_ROOT;
+	tree->buffer = RB_ROOT;
 	tree->ops = NULL;
 	tree->dirty_bytes = 0;
 	spin_lock_init(&tree->lock);
@@ -136,7 +135,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask)
 	return state;
 }
 
-static void free_extent_state(struct extent_state *state)
+void free_extent_state(struct extent_state *state)
 {
 	if (!state)
 		return;
@@ -336,21 +335,18 @@ static int merge_state(struct extent_io_tree *tree,
 }
 
 static int set_state_cb(struct extent_io_tree *tree,
-			 struct extent_state *state,
-			 unsigned long bits)
+			 struct extent_state *state, int *bits)
 {
 	if (tree->ops && tree->ops->set_bit_hook) {
 		return tree->ops->set_bit_hook(tree->mapping->host,
-					       state->start, state->end,
-					       state->state, bits);
+					       state, bits);
 	}
 
 	return 0;
 }
 
 static void clear_state_cb(struct extent_io_tree *tree,
-			   struct extent_state *state,
-			   unsigned long bits)
+			   struct extent_state *state, int *bits)
 {
 	if (tree->ops && tree->ops->clear_bit_hook)
 		tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
@@ -368,9 +364,10 @@ static void clear_state_cb(struct extent_io_tree *tree,
  */
 static int insert_state(struct extent_io_tree *tree,
 			struct extent_state *state, u64 start, u64 end,
-			int bits)
+			int *bits)
 {
 	struct rb_node *node;
+	int bits_to_set = *bits & ~EXTENT_CTLBITS;
 	int ret;
 
 	if (end < start) {
@@ -385,9 +382,9 @@ static int insert_state(struct extent_io_tree *tree,
 	if (ret)
 		return ret;
 
-	if (bits & EXTENT_DIRTY)
+	if (bits_to_set & EXTENT_DIRTY)
 		tree->dirty_bytes += end - start + 1;
-	state->state |= bits;
+	state->state |= bits_to_set;
 	node = tree_insert(&tree->state, end, &state->rb_node);
 	if (node) {
 		struct extent_state *found;
@@ -457,13 +454,13 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
  * struct is freed and removed from the tree
  */
 static int clear_state_bit(struct extent_io_tree *tree,
-			    struct extent_state *state, int bits, int wake,
-			    int delete)
+			    struct extent_state *state,
+			    int *bits, int wake)
 {
-	int bits_to_clear = bits & ~EXTENT_DO_ACCOUNTING;
+	int bits_to_clear = *bits & ~EXTENT_CTLBITS;
 	int ret = state->state & bits_to_clear;
 
-	if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
+	if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
 		u64 range = state->end - state->start + 1;
 		WARN_ON(range > tree->dirty_bytes);
 		tree->dirty_bytes -= range;
@@ -472,9 +469,8 @@ static int clear_state_bit(struct extent_io_tree *tree,
 	state->state &= ~bits_to_clear;
 	if (wake)
 		wake_up(&state->wq);
-	if (delete || state->state == 0) {
+	if (state->state == 0) {
 		if (state->tree) {
-			clear_state_cb(tree, state, state->state);
 			rb_erase(&state->rb_node, &tree->state);
 			state->tree = NULL;
 			free_extent_state(state);
@@ -513,7 +509,14 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 	u64 last_end;
 	int err;
 	int set = 0;
+	int clear = 0;
+
+	if (delete)
+		bits |= ~EXTENT_CTLBITS;
+	bits |= EXTENT_FIRST_DELALLOC;
 
+	if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
+		clear = 1;
 again:
 	if (!prealloc && (mask & __GFP_WAIT)) {
 		prealloc = alloc_extent_state(mask);
@@ -524,14 +527,20 @@ again:
 	spin_lock(&tree->lock);
 	if (cached_state) {
 		cached = *cached_state;
-		*cached_state = NULL;
-		cached_state = NULL;
+
+		if (clear) {
+			*cached_state = NULL;
+			cached_state = NULL;
+		}
+
 		if (cached && cached->tree && cached->start == start) {
-			atomic_dec(&cached->refs);
+			if (clear)
+				atomic_dec(&cached->refs);
 			state = cached;
 			goto hit_next;
 		}
-		free_extent_state(cached);
+		if (clear)
+			free_extent_state(cached);
 	}
 	/*
 	 * this search will find the extents that end after
@@ -572,8 +581,7 @@ hit_next:
 		if (err)
 			goto out;
 		if (state->end <= end) {
-			set |= clear_state_bit(tree, state, bits, wake,
-					       delete);
+			set |= clear_state_bit(tree, state, &bits, wake);
 			if (last_end == (u64)-1)
 				goto out;
 			start = last_end + 1;
@@ -594,7 +602,7 @@ hit_next:
 		if (wake)
 			wake_up(&state->wq);
 
-		set |= clear_state_bit(tree, prealloc, bits, wake, delete);
+		set |= clear_state_bit(tree, prealloc, &bits, wake);
 
 		prealloc = NULL;
 		goto out;
@@ -605,7 +613,7 @@ hit_next:
 	else
 		next_node = NULL;
 
-	set |= clear_state_bit(tree, state, bits, wake, delete);
+	set |= clear_state_bit(tree, state, &bits, wake);
 	if (last_end == (u64)-1)
 		goto out;
 	start = last_end + 1;
@@ -698,19 +706,19 @@ out:
 
 static int set_state_bits(struct extent_io_tree *tree,
 			   struct extent_state *state,
-			   int bits)
+			   int *bits)
 {
 	int ret;
+	int bits_to_set = *bits & ~EXTENT_CTLBITS;
 
 	ret = set_state_cb(tree, state, bits);
 	if (ret)
 		return ret;
-
-	if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
+	if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
 		u64 range = state->end - state->start + 1;
 		tree->dirty_bytes += range;
 	}
-	state->state |= bits;
+	state->state |= bits_to_set;
 
 	return 0;
 }
@@ -737,10 +745,9 @@ static void cache_state(struct extent_state *state,
  * [start, end] is inclusive This takes the tree lock.
  */
 
-static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
-			  int bits, int exclusive_bits, u64 *failed_start,
-			  struct extent_state **cached_state,
-			  gfp_t mask)
+int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+		   int bits, int exclusive_bits, u64 *failed_start,
+		   struct extent_state **cached_state, gfp_t mask)
 {
 	struct extent_state *state;
 	struct extent_state *prealloc = NULL;
@@ -749,6 +756,7 @@ static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 	u64 last_start;
 	u64 last_end;
 
+	bits |= EXTENT_FIRST_DELALLOC;
 again:
 	if (!prealloc && (mask & __GFP_WAIT)) {
 		prealloc = alloc_extent_state(mask);
@@ -770,7 +778,7 @@ again:
 	 */
 	node = tree_search(tree, start);
 	if (!node) {
-		err = insert_state(tree, prealloc, start, end, bits);
+		err = insert_state(tree, prealloc, start, end, &bits);
 		prealloc = NULL;
 		BUG_ON(err == -EEXIST);
 		goto out;
@@ -794,7 +802,7 @@ hit_next:
 			goto out;
 		}
 
-		err = set_state_bits(tree, state, bits);
+		err = set_state_bits(tree, state, &bits);
 		if (err)
 			goto out;
 
@@ -844,7 +852,7 @@ hit_next:
 		if (err)
 			goto out;
 		if (state->end <= end) {
-			err = set_state_bits(tree, state, bits);
+			err = set_state_bits(tree, state, &bits);
 			if (err)
 				goto out;
 			cache_state(state, cached_state);
@@ -869,7 +877,7 @@ hit_next:
 		else
 			this_end = last_start - 1;
 		err = insert_state(tree, prealloc, start, this_end,
-				   bits);
+				   &bits);
 		BUG_ON(err == -EEXIST);
 		if (err) {
 			prealloc = NULL;
@@ -895,7 +903,7 @@ hit_next:
 		err = split_state(tree, state, prealloc, end + 1);
 		BUG_ON(err == -EEXIST);
 
-		err = set_state_bits(tree, prealloc, bits);
+		err = set_state_bits(tree, prealloc, &bits);
 		if (err) {
 			prealloc = NULL;
 			goto out;
@@ -946,11 +954,11 @@ int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
 }
 
 int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
-		     gfp_t mask)
+			struct extent_state **cached_state, gfp_t mask)
 {
 	return set_extent_bit(tree, start, end,
 			      EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE,
-			      0, NULL, NULL, mask);
+			      0, NULL, cached_state, mask);
 }
 
 int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
@@ -958,8 +966,7 @@ int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
 {
 	return clear_extent_bit(tree, start, end,
 				EXTENT_DIRTY | EXTENT_DELALLOC |
-				EXTENT_DO_ACCOUNTING, 0, 0,
-				NULL, mask);
+				EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask);
 }
 
 int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
@@ -984,10 +991,11 @@ int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
 }
 
 static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
-				 u64 end, gfp_t mask)
+				 u64 end, struct extent_state **cached_state,
+				 gfp_t mask)
 {
 	return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
-				NULL, mask);
+				cached_state, mask);
 }
 
 int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
@@ -1171,7 +1179,8 @@ out:
  * 1 is returned if we find something, 0 if nothing was in the tree
  */
 static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
-					u64 *start, u64 *end, u64 max_bytes)
+					u64 *start, u64 *end, u64 max_bytes,
+					struct extent_state **cached_state)
 {
 	struct rb_node *node;
 	struct extent_state *state;
@@ -1203,8 +1212,11 @@ static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
 				*end = state->end;
 			goto out;
 		}
-		if (!found)
+		if (!found) {
 			*start = state->start;
+			*cached_state = state;
+			atomic_inc(&state->refs);
+		}
 		found++;
 		*end = state->end;
 		cur_start = state->end + 1;
@@ -1336,10 +1348,11 @@ again:
 	delalloc_start = *start;
 	delalloc_end = 0;
 	found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
-				    max_bytes);
+				    max_bytes, &cached_state);
 	if (!found || delalloc_end <= *start) {
 		*start = delalloc_start;
 		*end = delalloc_end;
+		free_extent_state(cached_state);
 		return found;
 	}
 
@@ -1421,9 +1434,6 @@ int extent_clear_unlock_delalloc(struct inode *inode,
 	if (op & EXTENT_CLEAR_DELALLOC)
 		clear_bits |= EXTENT_DELALLOC;
 
-	if (op & EXTENT_CLEAR_ACCOUNTING)
-		clear_bits |= EXTENT_DO_ACCOUNTING;
-
 	clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
 	if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
 		    EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK |
@@ -1722,7 +1732,7 @@ static void end_bio_extent_writepage(struct bio *bio, int err)
 		}
 
 		if (!uptodate) {
-			clear_extent_uptodate(tree, start, end, GFP_NOFS);
+			clear_extent_uptodate(tree, start, end, NULL, GFP_NOFS);
 			ClearPageUptodate(page);
 			SetPageError(page);
 		}
@@ -1750,7 +1760,8 @@ static void end_bio_extent_writepage(struct bio *bio, int err)
 static void end_bio_extent_readpage(struct bio *bio, int err)
 {
 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
-	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+	struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
+	struct bio_vec *bvec = bio->bi_io_vec;
 	struct extent_io_tree *tree;
 	u64 start;
 	u64 end;
@@ -1773,7 +1784,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 		else
 			whole_page = 0;
 
-		if (--bvec >= bio->bi_io_vec)
+		if (++bvec <= bvec_end)
 			prefetchw(&bvec->bv_page->flags);
 
 		if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
@@ -1818,7 +1829,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 			}
 			check_page_locked(tree, page);
 		}
-	} while (bvec >= bio->bi_io_vec);
+	} while (bvec <= bvec_end);
 
 	bio_put(bio);
 }
@@ -1901,7 +1912,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
 
 	if (tree->ops && tree->ops->submit_bio_hook)
 		tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
-					   mirror_num, bio_flags);
+					   mirror_num, bio_flags, start);
 	else
 		submit_bio(rw, bio);
 	if (bio_flagged(bio, BIO_EOPNOTSUPP))
@@ -2005,6 +2016,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
 	sector_t sector;
 	struct extent_map *em;
 	struct block_device *bdev;
+	struct btrfs_ordered_extent *ordered;
 	int ret;
 	int nr = 0;
 	size_t page_offset = 0;
@@ -2016,7 +2028,15 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
 	set_page_extent_mapped(page);
 
 	end = page_end;
-	lock_extent(tree, start, end, GFP_NOFS);
+	while (1) {
+		lock_extent(tree, start, end, GFP_NOFS);
+		ordered = btrfs_lookup_ordered_extent(inode, start);
+		if (!ordered)
+			break;
+		unlock_extent(tree, start, end, GFP_NOFS);
+		btrfs_start_ordered_extent(inode, ordered, 1);
+		btrfs_put_ordered_extent(ordered);
+	}
 
 	if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
 		char *userpage;
@@ -2574,7 +2594,6 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
 		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
 	};
 	struct writeback_control wbc_writepages = {
-		.bdi		= wbc->bdi,
 		.sync_mode	= wbc->sync_mode,
 		.older_than_this = NULL,
 		.nr_to_write	= 64,
@@ -2608,7 +2627,6 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
 		.sync_io = mode == WB_SYNC_ALL,
 	};
 	struct writeback_control wbc_writepages = {
-		.bdi		= inode->i_mapping->backing_dev_info,
 		.sync_mode	= mode,
 		.older_than_this = NULL,
 		.nr_to_write	= nr_pages * 2,
@@ -2663,33 +2681,20 @@ int extent_readpages(struct extent_io_tree *tree,
 {
 	struct bio *bio = NULL;
 	unsigned page_idx;
-	struct pagevec pvec;
 	unsigned long bio_flags = 0;
 
-	pagevec_init(&pvec, 0);
 	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
 		struct page *page = list_entry(pages->prev, struct page, lru);
 
 		prefetchw(&page->flags);
 		list_del(&page->lru);
-		/*
-		 * what we want to do here is call add_to_page_cache_lru,
-		 * but that isn't exported, so we reproduce it here
-		 */
-		if (!add_to_page_cache(page, mapping,
+		if (!add_to_page_cache_lru(page, mapping,
 					page->index, GFP_KERNEL)) {
-
-			/* open coding of lru_cache_add, also not exported */
-			page_cache_get(page);
-			if (!pagevec_add(&pvec, page))
-				__pagevec_lru_add_file(&pvec);
 			__extent_read_full_page(tree, page, get_extent,
 						&bio, 0, &bio_flags);
 		}
 		page_cache_release(page);
 	}
-	if (pagevec_count(&pvec))
-		__pagevec_lru_add_file(&pvec);
 	BUG_ON(!list_empty(pages));
 	if (bio)
 		submit_one_bio(READ, bio, 0, bio_flags);
@@ -2704,6 +2709,7 @@ int extent_readpages(struct extent_io_tree *tree,
 int extent_invalidatepage(struct extent_io_tree *tree,
 			  struct page *page, unsigned long offset)
 {
+	struct extent_state *cached_state = NULL;
 	u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
 	u64 end = start + PAGE_CACHE_SIZE - 1;
 	size_t blocksize = page->mapping->host->i_sb->s_blocksize;
@@ -2712,12 +2718,12 @@ int extent_invalidatepage(struct extent_io_tree *tree,
 	if (start > end)
 		return 0;
 
-	lock_extent(tree, start, end, GFP_NOFS);
+	lock_extent_bits(tree, start, end, 0, &cached_state, GFP_NOFS);
 	wait_on_page_writeback(page);
 	clear_extent_bit(tree, start, end,
 			 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
 			 EXTENT_DO_ACCOUNTING,
-			 1, 1, NULL, GFP_NOFS);
+			 1, 1, &cached_state, GFP_NOFS);
 	return 0;
 }
 
@@ -2920,16 +2926,17 @@ sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
 		get_extent_t *get_extent)
 {
 	struct inode *inode = mapping->host;
+	struct extent_state *cached_state = NULL;
 	u64 start = iblock << inode->i_blkbits;
 	sector_t sector = 0;
 	size_t blksize = (1 << inode->i_blkbits);
 	struct extent_map *em;
 
-	lock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1,
-		    GFP_NOFS);
+	lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + blksize - 1,
+			 0, &cached_state, GFP_NOFS);
 	em = get_extent(inode, NULL, 0, start, blksize, 0);
-	unlock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1,
-		      GFP_NOFS);
+	unlock_extent_cached(&BTRFS_I(inode)->io_tree, start,
+			     start + blksize - 1, &cached_state, GFP_NOFS);
 	if (!em || IS_ERR(em))
 		return 0;
 
@@ -2951,6 +2958,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 	u32 flags = 0;
 	u64 disko = 0;
 	struct extent_map *em = NULL;
+	struct extent_state *cached_state = NULL;
 	int end = 0;
 	u64 em_start = 0, em_len = 0;
 	unsigned long emflags;
@@ -2959,8 +2967,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 	if (len == 0)
 		return -EINVAL;
 
-	lock_extent(&BTRFS_I(inode)->io_tree, start, start + len,
-		GFP_NOFS);
+	lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
+			 &cached_state, GFP_NOFS);
 	em = get_extent(inode, NULL, 0, off, max - off, 0);
 	if (!em)
 		goto out;
@@ -3023,8 +3031,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 out_free:
 	free_extent_map(em);
 out:
-	unlock_extent(&BTRFS_I(inode)->io_tree, start, start + len,
-			GFP_NOFS);
+	unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len,
+			     &cached_state, GFP_NOFS);
 	return ret;
 }
 
@@ -3165,10 +3173,9 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
 		spin_unlock(&tree->buffer_lock);
 		goto free_eb;
 	}
-	spin_unlock(&tree->buffer_lock);
-
 	/* add one reference for the tree */
 	atomic_inc(&eb->refs);
+	spin_unlock(&tree->buffer_lock);
 	return eb;
 
 free_eb:
@@ -3265,7 +3272,8 @@ int set_extent_buffer_dirty(struct extent_io_tree *tree,
 }
 
 int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
-				struct extent_buffer *eb)
+				struct extent_buffer *eb,
+				struct extent_state **cached_state)
 {
 	unsigned long i;
 	struct page *page;
@@ -3275,7 +3283,7 @@ int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
 	clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
 
 	clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
-			      GFP_NOFS);
+			      cached_state, GFP_NOFS);
 	for (i = 0; i < num_pages; i++) {
 		page = extent_buffer_page(eb, i);
 		if (page)
@@ -3335,7 +3343,8 @@ int extent_range_uptodate(struct extent_io_tree *tree,
 }
 
 int extent_buffer_uptodate(struct extent_io_tree *tree,
-			   struct extent_buffer *eb)
+			   struct extent_buffer *eb,
+			   struct extent_state *cached_state)
 {
 	int ret = 0;
 	unsigned long num_pages;
@@ -3347,7 +3356,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree,
 		return 1;
 
 	ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
-			   EXTENT_UPTODATE, 1, NULL);
+			   EXTENT_UPTODATE, 1, cached_state);
 	if (ret)
 		return ret;
 
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 36de250a7b2b..5691c7b590da 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -16,7 +16,9 @@
 #define EXTENT_BOUNDARY (1 << 9)
 #define EXTENT_NODATASUM (1 << 10)
 #define EXTENT_DO_ACCOUNTING (1 << 11)
+#define EXTENT_FIRST_DELALLOC (1 << 12)
 #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
+#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
 
 /* flags for bio submission */
 #define EXTENT_BIO_COMPRESSED 1
@@ -47,7 +49,7 @@ struct extent_state;
 
 typedef	int (extent_submit_bio_hook_t)(struct inode *inode, int rw,
 				       struct bio *bio, int mirror_num,
-				       unsigned long bio_flags);
+				       unsigned long bio_flags, u64 bio_offset);
 struct extent_io_ops {
 	int (*fill_delalloc)(struct inode *inode, struct page *locked_page,
 			     u64 start, u64 end, int *page_started,
@@ -69,10 +71,10 @@ struct extent_io_ops {
 				    struct extent_state *state);
 	int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
 				      struct extent_state *state, int uptodate);
-	int (*set_bit_hook)(struct inode *inode, u64 start, u64 end,
-			    unsigned long old, unsigned long bits);
+	int (*set_bit_hook)(struct inode *inode, struct extent_state *state,
+			    int *bits);
 	int (*clear_bit_hook)(struct inode *inode, struct extent_state *state,
-			      unsigned long bits);
+			      int *bits);
 	int (*merge_extent_hook)(struct inode *inode,
 				 struct extent_state *new,
 				 struct extent_state *other);
@@ -163,6 +165,8 @@ int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
 int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
 		     int bits, struct extent_state **cached, gfp_t mask);
 int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
+int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
+			 struct extent_state **cached, gfp_t mask);
 int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
 		    gfp_t mask);
 int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
@@ -174,6 +178,7 @@ u64 count_range_bits(struct extent_io_tree *tree,
 		     u64 *start, u64 search_end,
 		     u64 max_bytes, unsigned long bits);
 
+void free_extent_state(struct extent_state *state);
 int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
 		   int bits, int filled, struct extent_state *cached_state);
 int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
@@ -183,6 +188,9 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 		     gfp_t mask);
 int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
 		    int bits, gfp_t mask);
+int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+		   int bits, int exclusive_bits, u64 *failed_start,
+		   struct extent_state **cached_state, gfp_t mask);
 int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
 			gfp_t mask);
 int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
@@ -196,7 +204,7 @@ int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
 int clear_extent_ordered_metadata(struct extent_io_tree *tree, u64 start,
 				  u64 end, gfp_t mask);
 int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
-		     gfp_t mask);
+			struct extent_state **cached_state, gfp_t mask);
 int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
 		     gfp_t mask);
 int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
@@ -281,9 +289,11 @@ int test_extent_buffer_dirty(struct extent_io_tree *tree,
 int set_extent_buffer_uptodate(struct extent_io_tree *tree,
 			       struct extent_buffer *eb);
 int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
-				struct extent_buffer *eb);
+				struct extent_buffer *eb,
+				struct extent_state **cached_state);
 int extent_buffer_uptodate(struct extent_io_tree *tree,
-			   struct extent_buffer *eb);
+			   struct extent_buffer *eb,
+			   struct extent_state *cached_state);
 int map_extent_buffer(struct extent_buffer *eb, unsigned long offset,
 		      unsigned long min_len, char **token, char **map,
 		      unsigned long *map_start,
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 428fcac45f90..454ca52d6451 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -1,5 +1,4 @@
 #include <linux/err.h>
-#include <linux/gfp.h>
 #include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/spinlock.h>
@@ -35,7 +34,7 @@ void extent_map_exit(void)
  */
 void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask)
 {
-	tree->map.rb_node = NULL;
+	tree->map = RB_ROOT;
 	rwlock_init(&tree->lock);
 }
 
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 9b99886562d0..a562a250ae77 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -17,6 +17,7 @@
  */
 
 #include <linux/bio.h>
+#include <linux/slab.h>
 #include <linux/pagemap.h>
 #include <linux/highmem.h>
 #include "ctree.h"
@@ -148,13 +149,14 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
 }
 
 
-int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
-			  struct bio *bio, u32 *dst)
+static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
+				   struct inode *inode, struct bio *bio,
+				   u64 logical_offset, u32 *dst, int dio)
 {
 	u32 sum;
 	struct bio_vec *bvec = bio->bi_io_vec;
 	int bio_index = 0;
-	u64 offset;
+	u64 offset = 0;
 	u64 item_start_offset = 0;
 	u64 item_last_offset = 0;
 	u64 disk_bytenr;
@@ -173,8 +175,11 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
 	WARN_ON(bio->bi_vcnt <= 0);
 
 	disk_bytenr = (u64)bio->bi_sector << 9;
+	if (dio)
+		offset = logical_offset;
 	while (bio_index < bio->bi_vcnt) {
-		offset = page_offset(bvec->bv_page) + bvec->bv_offset;
+		if (!dio)
+			offset = page_offset(bvec->bv_page) + bvec->bv_offset;
 		ret = btrfs_find_ordered_sum(inode, offset, disk_bytenr, &sum);
 		if (ret == 0)
 			goto found;
@@ -237,6 +242,7 @@ found:
 		else
 			set_state_private(io_tree, offset, sum);
 		disk_bytenr += bvec->bv_len;
+		offset += bvec->bv_len;
 		bio_index++;
 		bvec++;
 	}
@@ -244,6 +250,18 @@ found:
 	return 0;
 }
 
+int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
+			  struct bio *bio, u32 *dst)
+{
+	return __btrfs_lookup_bio_sums(root, inode, bio, 0, dst, 0);
+}
+
+int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
+			      struct bio *bio, u64 offset, u32 *dst)
+{
+	return __btrfs_lookup_bio_sums(root, inode, bio, offset, dst, 1);
+}
+
 int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
 			     struct list_head *list)
 {
@@ -656,6 +674,9 @@ again:
 		goto found;
 	}
 	ret = PTR_ERR(item);
+	if (ret != -EFBIG && ret != -ENOENT)
+		goto fail_unlock;
+
 	if (ret == -EFBIG) {
 		u32 item_size;
 		/* we found one, but it isn't big enough yet */
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index c02033596f02..e354c33df082 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -28,6 +28,7 @@
 #include <linux/writeback.h>
 #include <linux/statfs.h>
 #include <linux/compat.h>
+#include <linux/slab.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -45,32 +46,42 @@
 static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
 					 int write_bytes,
 					 struct page **prepared_pages,
-					 const char __user *buf)
+					 struct iov_iter *i)
 {
-	long page_fault = 0;
-	int i;
+	size_t copied;
+	int pg = 0;
 	int offset = pos & (PAGE_CACHE_SIZE - 1);
 
-	for (i = 0; i < num_pages && write_bytes > 0; i++, offset = 0) {
+	while (write_bytes > 0) {
 		size_t count = min_t(size_t,
 				     PAGE_CACHE_SIZE - offset, write_bytes);
-		struct page *page = prepared_pages[i];
-		fault_in_pages_readable(buf, count);
+		struct page *page = prepared_pages[pg];
+again:
+		if (unlikely(iov_iter_fault_in_readable(i, count)))
+			return -EFAULT;
 
 		/* Copy data from userspace to the current page */
-		kmap(page);
-		page_fault = __copy_from_user(page_address(page) + offset,
-					      buf, count);
+		copied = iov_iter_copy_from_user(page, i, offset, count);
+
 		/* Flush processor's dcache for this page */
 		flush_dcache_page(page);
-		kunmap(page);
-		buf += count;
-		write_bytes -= count;
+		iov_iter_advance(i, copied);
+		write_bytes -= copied;
 
-		if (page_fault)
-			break;
+		if (unlikely(copied == 0)) {
+			count = min_t(size_t, PAGE_CACHE_SIZE - offset,
+				      iov_iter_single_seg_count(i));
+			goto again;
+		}
+
+		if (unlikely(copied < PAGE_CACHE_SIZE - offset)) {
+			offset += copied;
+		} else {
+			pg++;
+			offset = 0;
+		}
 	}
-	return page_fault ? -EFAULT : 0;
+	return 0;
 }
 
 /*
@@ -123,9 +134,9 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 		    root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
 
 	end_of_last_block = start_pos + num_bytes - 1;
-	err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
-	if (err)
-		return err;
+	err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
+					NULL);
+	BUG_ON(err);
 
 	for (i = 0; i < num_pages; i++) {
 		struct page *p = pages[i];
@@ -140,7 +151,7 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 		 * at this time.
 		 */
 	}
-	return err;
+	return 0;
 }
 
 /*
@@ -720,13 +731,15 @@ again:
 					inode->i_ino, orig_offset);
 		BUG_ON(ret);
 	}
-	fi = btrfs_item_ptr(leaf, path->slots[0],
-			   struct btrfs_file_extent_item);
 	if (del_nr == 0) {
+		fi = btrfs_item_ptr(leaf, path->slots[0],
+			   struct btrfs_file_extent_item);
 		btrfs_set_file_extent_type(leaf, fi,
 					   BTRFS_FILE_EXTENT_REG);
 		btrfs_mark_buffer_dirty(leaf);
 	} else {
+		fi = btrfs_item_ptr(leaf, del_slot - 1,
+			   struct btrfs_file_extent_item);
 		btrfs_set_file_extent_type(leaf, fi,
 					   BTRFS_FILE_EXTENT_REG);
 		btrfs_set_file_extent_num_bytes(leaf, fi,
@@ -751,6 +764,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
 			 loff_t pos, unsigned long first_index,
 			 unsigned long last_index, size_t write_bytes)
 {
+	struct extent_state *cached_state = NULL;
 	int i;
 	unsigned long index = pos >> PAGE_CACHE_SHIFT;
 	struct inode *inode = fdentry(file)->d_inode;
@@ -779,16 +793,18 @@ again:
 	}
 	if (start_pos < inode->i_size) {
 		struct btrfs_ordered_extent *ordered;
-		lock_extent(&BTRFS_I(inode)->io_tree,
-			    start_pos, last_pos - 1, GFP_NOFS);
+		lock_extent_bits(&BTRFS_I(inode)->io_tree,
+				 start_pos, last_pos - 1, 0, &cached_state,
+				 GFP_NOFS);
 		ordered = btrfs_lookup_first_ordered_extent(inode,
 							    last_pos - 1);
 		if (ordered &&
 		    ordered->file_offset + ordered->len > start_pos &&
 		    ordered->file_offset < last_pos) {
 			btrfs_put_ordered_extent(ordered);
-			unlock_extent(&BTRFS_I(inode)->io_tree,
-				      start_pos, last_pos - 1, GFP_NOFS);
+			unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+					     start_pos, last_pos - 1,
+					     &cached_state, GFP_NOFS);
 			for (i = 0; i < num_pages; i++) {
 				unlock_page(pages[i]);
 				page_cache_release(pages[i]);
@@ -800,12 +816,13 @@ again:
 		if (ordered)
 			btrfs_put_ordered_extent(ordered);
 
-		clear_extent_bits(&BTRFS_I(inode)->io_tree, start_pos,
+		clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos,
 				  last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
-				  EXTENT_DO_ACCOUNTING,
+				  EXTENT_DO_ACCOUNTING, 0, 0, &cached_state,
 				  GFP_NOFS);
-		unlock_extent(&BTRFS_I(inode)->io_tree,
-			      start_pos, last_pos - 1, GFP_NOFS);
+		unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+				     start_pos, last_pos - 1, &cached_state,
+				     GFP_NOFS);
 	}
 	for (i = 0; i < num_pages; i++) {
 		clear_page_dirty_for_io(pages[i]);
@@ -815,45 +832,46 @@ again:
 	return 0;
 }
 
-static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
-				size_t count, loff_t *ppos)
+static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
+				    const struct iovec *iov,
+				    unsigned long nr_segs, loff_t pos)
 {
-	loff_t pos;
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = fdentry(file)->d_inode;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct page *pinned[2];
+	struct page **pages = NULL;
+	struct iov_iter i;
+	loff_t *ppos = &iocb->ki_pos;
 	loff_t start_pos;
 	ssize_t num_written = 0;
 	ssize_t err = 0;
+	size_t count;
+	size_t ocount;
 	int ret = 0;
-	struct inode *inode = fdentry(file)->d_inode;
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct page **pages = NULL;
 	int nrptrs;
-	struct page *pinned[2];
 	unsigned long first_index;
 	unsigned long last_index;
 	int will_write;
+	int buffered = 0;
 
 	will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) ||
 		      (file->f_flags & O_DIRECT));
 
-	nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE,
-		     PAGE_CACHE_SIZE / (sizeof(struct page *)));
 	pinned[0] = NULL;
 	pinned[1] = NULL;
 
-	pos = *ppos;
 	start_pos = pos;
 
 	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
 
-	/* do the reserve before the mutex lock in case we have to do some
-	 * flushing.  We wouldn't deadlock, but this is more polite.
-	 */
-	err = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
-	if (err)
-		goto out_nolock;
-
 	mutex_lock(&inode->i_mutex);
 
+	err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
+	if (err)
+		goto out;
+	count = ocount;
+
 	current->backing_dev_info = inode->i_mapping->backing_dev_info;
 	err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
 	if (err)
@@ -867,15 +885,53 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 		goto out;
 
 	file_update_time(file);
+	BTRFS_I(inode)->sequence++;
 
+	if (unlikely(file->f_flags & O_DIRECT)) {
+		num_written = generic_file_direct_write(iocb, iov, &nr_segs,
+							pos, ppos, count,
+							ocount);
+		/*
+		 * the generic O_DIRECT will update in-memory i_size after the
+		 * DIOs are done.  But our endio handlers that update the on
+		 * disk i_size never update past the in memory i_size.  So we
+		 * need one more update here to catch any additions to the
+		 * file
+		 */
+		if (inode->i_size != BTRFS_I(inode)->disk_i_size) {
+			btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
+			mark_inode_dirty(inode);
+		}
+
+		if (num_written < 0) {
+			ret = num_written;
+			num_written = 0;
+			goto out;
+		} else if (num_written == count) {
+			/* pick up pos changes done by the generic code */
+			pos = *ppos;
+			goto out;
+		}
+		/*
+		 * We are going to do buffered for the rest of the range, so we
+		 * need to make sure to invalidate the buffered pages when we're
+		 * done.
+		 */
+		buffered = 1;
+		pos += num_written;
+	}
+
+	iov_iter_init(&i, iov, nr_segs, count, num_written);
+	nrptrs = min((iov_iter_count(&i) + PAGE_CACHE_SIZE - 1) /
+		     PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
+		     (sizeof(struct page *)));
 	pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
 
 	/* generic_write_checks can change our pos */
 	start_pos = pos;
 
-	BTRFS_I(inode)->sequence++;
 	first_index = pos >> PAGE_CACHE_SHIFT;
-	last_index = (pos + count) >> PAGE_CACHE_SHIFT;
+	last_index = (pos + iov_iter_count(&i)) >> PAGE_CACHE_SHIFT;
 
 	/*
 	 * there are lots of better ways to do this, but this code
@@ -892,7 +948,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 			unlock_page(pinned[0]);
 		}
 	}
-	if ((pos + count) & (PAGE_CACHE_SIZE - 1)) {
+	if ((pos + iov_iter_count(&i)) & (PAGE_CACHE_SIZE - 1)) {
 		pinned[1] = grab_cache_page(inode->i_mapping, last_index);
 		if (!PageUptodate(pinned[1])) {
 			ret = btrfs_readpage(NULL, pinned[1]);
@@ -903,10 +959,10 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 		}
 	}
 
-	while (count > 0) {
+	while (iov_iter_count(&i) > 0) {
 		size_t offset = pos & (PAGE_CACHE_SIZE - 1);
-		size_t write_bytes = min(count, nrptrs *
-					(size_t)PAGE_CACHE_SIZE -
+		size_t write_bytes = min(iov_iter_count(&i),
+					 nrptrs * (size_t)PAGE_CACHE_SIZE -
 					 offset);
 		size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >>
 					PAGE_CACHE_SHIFT;
@@ -914,7 +970,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 		WARN_ON(num_pages > nrptrs);
 		memset(pages, 0, sizeof(struct page *) * nrptrs);
 
-		ret = btrfs_check_data_free_space(root, inode, write_bytes);
+		ret = btrfs_delalloc_reserve_space(inode, write_bytes);
 		if (ret)
 			goto out;
 
@@ -922,26 +978,20 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 				    pos, first_index, last_index,
 				    write_bytes);
 		if (ret) {
-			btrfs_free_reserved_data_space(root, inode,
-						       write_bytes);
+			btrfs_delalloc_release_space(inode, write_bytes);
 			goto out;
 		}
 
 		ret = btrfs_copy_from_user(pos, num_pages,
-					   write_bytes, pages, buf);
-		if (ret) {
-			btrfs_free_reserved_data_space(root, inode,
-						       write_bytes);
-			btrfs_drop_pages(pages, num_pages);
-			goto out;
+					   write_bytes, pages, &i);
+		if (ret == 0) {
+			dirty_and_release_pages(NULL, root, file, pages,
+						num_pages, pos, write_bytes);
 		}
 
-		ret = dirty_and_release_pages(NULL, root, file, pages,
-					      num_pages, pos, write_bytes);
 		btrfs_drop_pages(pages, num_pages);
 		if (ret) {
-			btrfs_free_reserved_data_space(root, inode,
-						       write_bytes);
+			btrfs_delalloc_release_space(inode, write_bytes);
 			goto out;
 		}
 
@@ -957,8 +1007,6 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 			btrfs_throttle(root);
 		}
 
-		buf += write_bytes;
-		count -= write_bytes;
 		pos += write_bytes;
 		num_written += write_bytes;
 
@@ -968,9 +1016,7 @@ out:
 	mutex_unlock(&inode->i_mutex);
 	if (ret)
 		err = ret;
-	btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
 
-out_nolock:
 	kfree(pages);
 	if (pinned[0])
 		page_cache_release(pinned[0]);
@@ -1000,7 +1046,7 @@ out_nolock:
 			num_written = err;
 
 		if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
-			trans = btrfs_start_transaction(root, 1);
+			trans = btrfs_start_transaction(root, 0);
 			ret = btrfs_log_dentry_safe(trans, root,
 						    file->f_dentry);
 			if (ret == 0) {
@@ -1015,7 +1061,7 @@ out_nolock:
 				btrfs_end_transaction(trans, root);
 			}
 		}
-		if (file->f_flags & O_DIRECT) {
+		if (file->f_flags & O_DIRECT && buffered) {
 			invalidate_mapping_pages(inode->i_mapping,
 			      start_pos >> PAGE_CACHE_SHIFT,
 			     (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
@@ -1055,8 +1101,9 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
  * important optimization for directories because holding the mutex prevents
  * new operations on the dir while we write to disk.
  */
-int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
+int btrfs_sync_file(struct file *file, int datasync)
 {
+	struct dentry *dentry = file->f_path.dentry;
 	struct inode *inode = dentry->d_inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int ret = 0;
@@ -1093,12 +1140,12 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
 	/*
 	 * ok we haven't committed the transaction yet, lets do a commit
 	 */
-	if (file && file->private_data)
+	if (file->private_data)
 		btrfs_ioctl_trans_end(file);
 
-	trans = btrfs_start_transaction(root, 1);
-	if (!trans) {
-		ret = -ENOMEM;
+	trans = btrfs_start_transaction(root, 0);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
 		goto out;
 	}
 
@@ -1133,7 +1180,7 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
 	}
 	mutex_lock(&dentry->d_inode->i_mutex);
 out:
-	return ret > 0 ? EIO : ret;
+	return ret > 0 ? -EIO : ret;
 }
 
 static const struct vm_operations_struct btrfs_file_vm_ops = {
@@ -1143,17 +1190,25 @@ static const struct vm_operations_struct btrfs_file_vm_ops = {
 
 static int btrfs_file_mmap(struct file	*filp, struct vm_area_struct *vma)
 {
-	vma->vm_ops = &btrfs_file_vm_ops;
+	struct address_space *mapping = filp->f_mapping;
+
+	if (!mapping->a_ops->readpage)
+		return -ENOEXEC;
+
 	file_accessed(filp);
+	vma->vm_ops = &btrfs_file_vm_ops;
+	vma->vm_flags |= VM_CAN_NONLINEAR;
+
 	return 0;
 }
 
 const struct file_operations btrfs_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= do_sync_read,
+	.write		= do_sync_write,
 	.aio_read       = generic_file_aio_read,
 	.splice_read	= generic_file_splice_read,
-	.write		= btrfs_file_write,
+	.aio_write	= btrfs_file_aio_write,
 	.mmap		= btrfs_file_mmap,
 	.open		= generic_file_open,
 	.release	= btrfs_release_file,
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index cb2849f03251..f488fac04d99 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -18,6 +18,7 @@
 
 #include <linux/pagemap.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/math64.h>
 #include "ctree.h"
 #include "free-space-cache.h"
@@ -870,7 +871,7 @@ __btrfs_return_cluster_to_free_space(
 		tree_insert_offset(&block_group->free_space_offset,
 				   entry->offset, &entry->offset_index, 0);
 	}
-	cluster->root.rb_node = NULL;
+	cluster->root = RB_ROOT;
 
 out:
 	spin_unlock(&cluster->lock);
@@ -1355,7 +1356,7 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
 {
 	spin_lock_init(&cluster->lock);
 	spin_lock_init(&cluster->refill_lock);
-	cluster->root.rb_node = NULL;
+	cluster->root = RB_ROOT;
 	cluster->max_size = 0;
 	cluster->points_to_bitmap = false;
 	INIT_LIST_HEAD(&cluster->block_group_list);
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 72ce3c173d6a..64f1150bb48d 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -49,6 +49,33 @@ static int find_name_in_backref(struct btrfs_path *path, const char *name,
 	return 0;
 }
 
+struct btrfs_inode_ref *
+btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans,
+			struct btrfs_root *root,
+			struct btrfs_path *path,
+			const char *name, int name_len,
+			u64 inode_objectid, u64 ref_objectid, int mod)
+{
+	struct btrfs_key key;
+	struct btrfs_inode_ref *ref;
+	int ins_len = mod < 0 ? -1 : 0;
+	int cow = mod != 0;
+	int ret;
+
+	key.objectid = inode_objectid;
+	key.type = BTRFS_INODE_REF_KEY;
+	key.offset = ref_objectid;
+
+	ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
+	if (ret < 0)
+		return ERR_PTR(ret);
+	if (ret > 0)
+		return NULL;
+	if (!find_name_in_backref(path, name, name_len, &ref))
+		return NULL;
+	return ref;
+}
+
 int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root,
 			   const char *name, int name_len,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8cd109972fa6..1bff92ad4744 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -36,6 +36,7 @@
 #include <linux/xattr.h>
 #include <linux/posix_acl.h>
 #include <linux/falloc.h>
+#include <linux/slab.h>
 #include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
@@ -251,6 +252,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
 				   inline_len, compressed_size,
 				   compressed_pages);
 	BUG_ON(ret);
+	btrfs_delalloc_release_metadata(inode, end + 1 - start);
 	btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
 	return 0;
 }
@@ -379,7 +381,8 @@ again:
 	 * change at any time if we discover bad compression ratios.
 	 */
 	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) &&
-	    btrfs_test_opt(root, COMPRESS)) {
+	    (btrfs_test_opt(root, COMPRESS) ||
+	     (BTRFS_I(inode)->force_compress))) {
 		WARN_ON(pages);
 		pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
 
@@ -412,6 +415,7 @@ again:
 		trans = btrfs_join_transaction(root, 1);
 		BUG_ON(!trans);
 		btrfs_set_trans_block_group(trans, inode);
+		trans->block_rsv = &root->fs_info->delalloc_block_rsv;
 
 		/* lets try to make an inline extent */
 		if (ret || total_in < (actual_end - start)) {
@@ -437,7 +441,6 @@ again:
 			     start, end, NULL,
 			     EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
 			     EXTENT_CLEAR_DELALLOC |
-			     EXTENT_CLEAR_ACCOUNTING |
 			     EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK);
 
 			btrfs_end_transaction(trans, root);
@@ -483,8 +486,10 @@ again:
 		nr_pages_ret = 0;
 
 		/* flag the file so we don't compress in the future */
-		if (!btrfs_test_opt(root, FORCE_COMPRESS))
+		if (!btrfs_test_opt(root, FORCE_COMPRESS) &&
+		    !(BTRFS_I(inode)->force_compress)) {
 			BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
+		}
 	}
 	if (will_compress) {
 		*num_added += 1;
@@ -570,8 +575,8 @@ retry:
 			unsigned long nr_written = 0;
 
 			lock_extent(io_tree, async_extent->start,
-				    async_extent->start +
-				    async_extent->ram_size - 1, GFP_NOFS);
+					 async_extent->start +
+					 async_extent->ram_size - 1, GFP_NOFS);
 
 			/* allocate blocks */
 			ret = cow_file_range(inode, async_cow->locked_page,
@@ -693,6 +698,38 @@ retry:
 	return 0;
 }
 
+static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
+				      u64 num_bytes)
+{
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct extent_map *em;
+	u64 alloc_hint = 0;
+
+	read_lock(&em_tree->lock);
+	em = search_extent_mapping(em_tree, start, num_bytes);
+	if (em) {
+		/*
+		 * if block start isn't an actual block number then find the
+		 * first block in this inode and use that as a hint.  If that
+		 * block is also bogus then just don't worry about it.
+		 */
+		if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+			free_extent_map(em);
+			em = search_extent_mapping(em_tree, 0, 0);
+			if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
+				alloc_hint = em->block_start;
+			if (em)
+				free_extent_map(em);
+		} else {
+			alloc_hint = em->block_start;
+			free_extent_map(em);
+		}
+	}
+	read_unlock(&em_tree->lock);
+
+	return alloc_hint;
+}
+
 /*
  * when extent_io.c finds a delayed allocation range in the file,
  * the call backs end up in this code.  The basic idea is to
@@ -730,6 +767,7 @@ static noinline int cow_file_range(struct inode *inode,
 	trans = btrfs_join_transaction(root, 1);
 	BUG_ON(!trans);
 	btrfs_set_trans_block_group(trans, inode);
+	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
 
 	actual_end = min_t(u64, isize, end + 1);
 
@@ -749,7 +787,6 @@ static noinline int cow_file_range(struct inode *inode,
 				     EXTENT_CLEAR_UNLOCK_PAGE |
 				     EXTENT_CLEAR_UNLOCK |
 				     EXTENT_CLEAR_DELALLOC |
-				     EXTENT_CLEAR_ACCOUNTING |
 				     EXTENT_CLEAR_DIRTY |
 				     EXTENT_SET_WRITEBACK |
 				     EXTENT_END_WRITEBACK);
@@ -765,35 +802,13 @@ static noinline int cow_file_range(struct inode *inode,
 	BUG_ON(disk_num_bytes >
 	       btrfs_super_total_bytes(&root->fs_info->super_copy));
 
-
-	read_lock(&BTRFS_I(inode)->extent_tree.lock);
-	em = search_extent_mapping(&BTRFS_I(inode)->extent_tree,
-				   start, num_bytes);
-	if (em) {
-		/*
-		 * if block start isn't an actual block number then find the
-		 * first block in this inode and use that as a hint.  If that
-		 * block is also bogus then just don't worry about it.
-		 */
-		if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
-			free_extent_map(em);
-			em = search_extent_mapping(em_tree, 0, 0);
-			if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
-				alloc_hint = em->block_start;
-			if (em)
-				free_extent_map(em);
-		} else {
-			alloc_hint = em->block_start;
-			free_extent_map(em);
-		}
-	}
-	read_unlock(&BTRFS_I(inode)->extent_tree.lock);
+	alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
 	btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
 
 	while (disk_num_bytes > 0) {
 		unsigned long op;
 
-		cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent);
+		cur_alloc_size = disk_num_bytes;
 		ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
 					   root->sectorsize, 0, alloc_hint,
 					   (u64)-1, &ins, 1);
@@ -1170,6 +1185,13 @@ out_check:
 					       num_bytes, num_bytes, type);
 		BUG_ON(ret);
 
+		if (root->root_key.objectid ==
+		    BTRFS_DATA_RELOC_TREE_OBJECTID) {
+			ret = btrfs_reloc_clone_csums(inode, cur_offset,
+						      num_bytes);
+			BUG_ON(ret);
+		}
+
 		extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
 				cur_offset, cur_offset + num_bytes - 1,
 				locked_page, EXTENT_CLEAR_UNLOCK_PAGE |
@@ -1211,7 +1233,8 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
 	else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC)
 		ret = run_delalloc_nocow(inode, locked_page, start, end,
 					 page_started, 0, nr_written);
-	else if (!btrfs_test_opt(root, COMPRESS))
+	else if (!btrfs_test_opt(root, COMPRESS) &&
+		 !(BTRFS_I(inode)->force_compress))
 		ret = cow_file_range(inode, locked_page, start, end,
 				      page_started, nr_written, 1);
 	else
@@ -1221,36 +1244,13 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
 }
 
 static int btrfs_split_extent_hook(struct inode *inode,
-				    struct extent_state *orig, u64 split)
+				   struct extent_state *orig, u64 split)
 {
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	u64 size;
-
+	/* not delalloc, ignore it */
 	if (!(orig->state & EXTENT_DELALLOC))
 		return 0;
 
-	size = orig->end - orig->start + 1;
-	if (size > root->fs_info->max_extent) {
-		u64 num_extents;
-		u64 new_size;
-
-		new_size = orig->end - split + 1;
-		num_extents = div64_u64(size + root->fs_info->max_extent - 1,
-					root->fs_info->max_extent);
-
-		/*
-		 * if we break a large extent up then leave oustanding_extents
-		 * be, since we've already accounted for the large extent.
-		 */
-		if (div64_u64(new_size + root->fs_info->max_extent - 1,
-			      root->fs_info->max_extent) < num_extents)
-			return 0;
-	}
-
-	spin_lock(&BTRFS_I(inode)->accounting_lock);
-	BTRFS_I(inode)->outstanding_extents++;
-	spin_unlock(&BTRFS_I(inode)->accounting_lock);
-
+	atomic_inc(&BTRFS_I(inode)->outstanding_extents);
 	return 0;
 }
 
@@ -1264,42 +1264,11 @@ static int btrfs_merge_extent_hook(struct inode *inode,
 				   struct extent_state *new,
 				   struct extent_state *other)
 {
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	u64 new_size, old_size;
-	u64 num_extents;
-
 	/* not delalloc, ignore it */
 	if (!(other->state & EXTENT_DELALLOC))
 		return 0;
 
-	old_size = other->end - other->start + 1;
-	if (new->start < other->start)
-		new_size = other->end - new->start + 1;
-	else
-		new_size = new->end - other->start + 1;
-
-	/* we're not bigger than the max, unreserve the space and go */
-	if (new_size <= root->fs_info->max_extent) {
-		spin_lock(&BTRFS_I(inode)->accounting_lock);
-		BTRFS_I(inode)->outstanding_extents--;
-		spin_unlock(&BTRFS_I(inode)->accounting_lock);
-		return 0;
-	}
-
-	/*
-	 * If we grew by another max_extent, just return, we want to keep that
-	 * reserved amount.
-	 */
-	num_extents = div64_u64(old_size + root->fs_info->max_extent - 1,
-				root->fs_info->max_extent);
-	if (div64_u64(new_size + root->fs_info->max_extent - 1,
-		      root->fs_info->max_extent) > num_extents)
-		return 0;
-
-	spin_lock(&BTRFS_I(inode)->accounting_lock);
-	BTRFS_I(inode)->outstanding_extents--;
-	spin_unlock(&BTRFS_I(inode)->accounting_lock);
-
+	atomic_dec(&BTRFS_I(inode)->outstanding_extents);
 	return 0;
 }
 
@@ -1308,8 +1277,8 @@ static int btrfs_merge_extent_hook(struct inode *inode,
  * bytes in this file, and to maintain the list of inodes that
  * have pending delalloc work to be done.
  */
-static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
-		       unsigned long old, unsigned long bits)
+static int btrfs_set_bit_hook(struct inode *inode,
+			      struct extent_state *state, int *bits)
 {
 
 	/*
@@ -1317,16 +1286,18 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
 	 * but in this case, we are only testeing for the DELALLOC
 	 * bit, which is only set or cleared with irqs on
 	 */
-	if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
+	if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
 		struct btrfs_root *root = BTRFS_I(inode)->root;
+		u64 len = state->end + 1 - state->start;
+
+		if (*bits & EXTENT_FIRST_DELALLOC)
+			*bits &= ~EXTENT_FIRST_DELALLOC;
+		else
+			atomic_inc(&BTRFS_I(inode)->outstanding_extents);
 
-		spin_lock(&BTRFS_I(inode)->accounting_lock);
-		BTRFS_I(inode)->outstanding_extents++;
-		spin_unlock(&BTRFS_I(inode)->accounting_lock);
-		btrfs_delalloc_reserve_space(root, inode, end - start + 1);
 		spin_lock(&root->fs_info->delalloc_lock);
-		BTRFS_I(inode)->delalloc_bytes += end - start + 1;
-		root->fs_info->delalloc_bytes += end - start + 1;
+		BTRFS_I(inode)->delalloc_bytes += len;
+		root->fs_info->delalloc_bytes += len;
 		if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
 			list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
 				      &root->fs_info->delalloc_inodes);
@@ -1340,44 +1311,32 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
  * extent_io.c clear_bit_hook, see set_bit_hook for why
  */
 static int btrfs_clear_bit_hook(struct inode *inode,
-				struct extent_state *state, unsigned long bits)
+				struct extent_state *state, int *bits)
 {
 	/*
 	 * set_bit and clear bit hooks normally require _irqsave/restore
 	 * but in this case, we are only testeing for the DELALLOC
 	 * bit, which is only set or cleared with irqs on
 	 */
-	if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
+	if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
 		struct btrfs_root *root = BTRFS_I(inode)->root;
+		u64 len = state->end + 1 - state->start;
 
-		if (bits & EXTENT_DO_ACCOUNTING) {
-			spin_lock(&BTRFS_I(inode)->accounting_lock);
-			BTRFS_I(inode)->outstanding_extents--;
-			spin_unlock(&BTRFS_I(inode)->accounting_lock);
-			btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
-		}
+		if (*bits & EXTENT_FIRST_DELALLOC)
+			*bits &= ~EXTENT_FIRST_DELALLOC;
+		else if (!(*bits & EXTENT_DO_ACCOUNTING))
+			atomic_dec(&BTRFS_I(inode)->outstanding_extents);
+
+		if (*bits & EXTENT_DO_ACCOUNTING)
+			btrfs_delalloc_release_metadata(inode, len);
+
+		if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID)
+			btrfs_free_reserved_data_space(inode, len);
 
 		spin_lock(&root->fs_info->delalloc_lock);
-		if (state->end - state->start + 1 >
-		    root->fs_info->delalloc_bytes) {
-			printk(KERN_INFO "btrfs warning: delalloc account "
-			       "%llu %llu\n",
-			       (unsigned long long)
-			       state->end - state->start + 1,
-			       (unsigned long long)
-			       root->fs_info->delalloc_bytes);
-			btrfs_delalloc_free_space(root, inode, (u64)-1);
-			root->fs_info->delalloc_bytes = 0;
-			BTRFS_I(inode)->delalloc_bytes = 0;
-		} else {
-			btrfs_delalloc_free_space(root, inode,
-						  state->end -
-						  state->start + 1);
-			root->fs_info->delalloc_bytes -= state->end -
-				state->start + 1;
-			BTRFS_I(inode)->delalloc_bytes -= state->end -
-				state->start + 1;
-		}
+		root->fs_info->delalloc_bytes -= len;
+		BTRFS_I(inode)->delalloc_bytes -= len;
+
 		if (BTRFS_I(inode)->delalloc_bytes == 0 &&
 		    !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
 			list_del_init(&BTRFS_I(inode)->delalloc_inodes);
@@ -1426,7 +1385,8 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
  */
 static int __btrfs_submit_bio_start(struct inode *inode, int rw,
 				    struct bio *bio, int mirror_num,
-				    unsigned long bio_flags)
+				    unsigned long bio_flags,
+				    u64 bio_offset)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int ret = 0;
@@ -1445,7 +1405,8 @@ static int __btrfs_submit_bio_start(struct inode *inode, int rw,
  * are inserted into the btree
  */
 static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
-			  int mirror_num, unsigned long bio_flags)
+			  int mirror_num, unsigned long bio_flags,
+			  u64 bio_offset)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	return btrfs_map_bio(root, rw, bio, mirror_num, 1);
@@ -1456,7 +1417,8 @@ static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
  * on write, or reading the csums from the tree before a read
  */
 static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
-			  int mirror_num, unsigned long bio_flags)
+			  int mirror_num, unsigned long bio_flags,
+			  u64 bio_offset)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int ret = 0;
@@ -1481,7 +1443,8 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 		/* we're doing a write, do the async checksumming */
 		return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
 				   inode, rw, bio, mirror_num,
-				   bio_flags, __btrfs_submit_bio_start,
+				   bio_flags, bio_offset,
+				   __btrfs_submit_bio_start,
 				   __btrfs_submit_bio_done);
 	}
 
@@ -1508,12 +1471,13 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end)
+int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
+			      struct extent_state **cached_state)
 {
 	if ((end & (PAGE_CACHE_SIZE - 1)) == 0)
 		WARN_ON(1);
 	return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
-				   GFP_NOFS);
+				   cached_state, GFP_NOFS);
 }
 
 /* see btrfs_writepage_start_hook for details on why this is required */
@@ -1526,6 +1490,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
 {
 	struct btrfs_writepage_fixup *fixup;
 	struct btrfs_ordered_extent *ordered;
+	struct extent_state *cached_state = NULL;
 	struct page *page;
 	struct inode *inode;
 	u64 page_start;
@@ -1544,7 +1509,8 @@ again:
 	page_start = page_offset(page);
 	page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;
 
-	lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);
+	lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 0,
+			 &cached_state, GFP_NOFS);
 
 	/* already ordered? We're done */
 	if (PagePrivate2(page))
@@ -1552,17 +1518,19 @@ again:
 
 	ordered = btrfs_lookup_ordered_extent(inode, page_start);
 	if (ordered) {
-		unlock_extent(&BTRFS_I(inode)->io_tree, page_start,
-			      page_end, GFP_NOFS);
+		unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
+				     page_end, &cached_state, GFP_NOFS);
 		unlock_page(page);
 		btrfs_start_ordered_extent(inode, ordered, 1);
 		goto again;
 	}
 
-	btrfs_set_extent_delalloc(inode, page_start, page_end);
+	BUG();
+	btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state);
 	ClearPageChecked(page);
 out:
-	unlock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);
+	unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
+			     &cached_state, GFP_NOFS);
 out_page:
 	unlock_page(page);
 	page_cache_release(page);
@@ -1681,24 +1649,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
  * before we start the transaction.  It limits the amount of btree
  * reads required while inside the transaction.
  */
-static noinline void reada_csum(struct btrfs_root *root,
-				struct btrfs_path *path,
-				struct btrfs_ordered_extent *ordered_extent)
-{
-	struct btrfs_ordered_sum *sum;
-	u64 bytenr;
-
-	sum = list_entry(ordered_extent->list.next, struct btrfs_ordered_sum,
-			 list);
-	bytenr = sum->sums[0].bytenr;
-
-	/*
-	 * we don't care about the results, the point of this search is
-	 * just to get the btree leaves into ram
-	 */
-	btrfs_lookup_csum(NULL, root->fs_info->csum_root, path, bytenr, 0);
-}
-
 /* as ordered data IO finishes, this gets called so we can finish
  * an ordered extent if the range of bytes in the file it covers are
  * fully written.
@@ -1706,60 +1656,39 @@ static noinline void reada_csum(struct btrfs_root *root,
 static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct btrfs_trans_handle *trans;
+	struct btrfs_trans_handle *trans = NULL;
 	struct btrfs_ordered_extent *ordered_extent = NULL;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
-	struct btrfs_path *path;
+	struct extent_state *cached_state = NULL;
 	int compressed = 0;
 	int ret;
 
-	ret = btrfs_dec_test_ordered_pending(inode, start, end - start + 1);
+	ret = btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
+					     end - start + 1);
 	if (!ret)
 		return 0;
-
-	/*
-	 * before we join the transaction, try to do some of our IO.
-	 * This will limit the amount of IO that we have to do with
-	 * the transaction running.  We're unlikely to need to do any
-	 * IO if the file extents are new, the disk_i_size checks
-	 * covers the most common case.
-	 */
-	if (start < BTRFS_I(inode)->disk_i_size) {
-		path = btrfs_alloc_path();
-		if (path) {
-			ret = btrfs_lookup_file_extent(NULL, root, path,
-						       inode->i_ino,
-						       start, 0);
-			ordered_extent = btrfs_lookup_ordered_extent(inode,
-								     start);
-			if (!list_empty(&ordered_extent->list)) {
-				btrfs_release_path(root, path);
-				reada_csum(root, path, ordered_extent);
-			}
-			btrfs_free_path(path);
-		}
-	}
-
-	if (!ordered_extent)
-		ordered_extent = btrfs_lookup_ordered_extent(inode, start);
 	BUG_ON(!ordered_extent);
+
 	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
 		BUG_ON(!list_empty(&ordered_extent->list));
 		ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
 		if (!ret) {
 			trans = btrfs_join_transaction(root, 1);
+			btrfs_set_trans_block_group(trans, inode);
+			trans->block_rsv = &root->fs_info->delalloc_block_rsv;
 			ret = btrfs_update_inode(trans, root, inode);
 			BUG_ON(ret);
-			btrfs_end_transaction(trans, root);
 		}
 		goto out;
 	}
 
-	lock_extent(io_tree, ordered_extent->file_offset,
-		    ordered_extent->file_offset + ordered_extent->len - 1,
-		    GFP_NOFS);
+	lock_extent_bits(io_tree, ordered_extent->file_offset,
+			 ordered_extent->file_offset + ordered_extent->len - 1,
+			 0, &cached_state, GFP_NOFS);
 
 	trans = btrfs_join_transaction(root, 1);
+	btrfs_set_trans_block_group(trans, inode);
+	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
 
 	if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
 		compressed = 1;
@@ -1784,18 +1713,20 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 				   ordered_extent->len);
 		BUG_ON(ret);
 	}
-	unlock_extent(io_tree, ordered_extent->file_offset,
-		    ordered_extent->file_offset + ordered_extent->len - 1,
-		    GFP_NOFS);
+	unlock_extent_cached(io_tree, ordered_extent->file_offset,
+			     ordered_extent->file_offset +
+			     ordered_extent->len - 1, &cached_state, GFP_NOFS);
+
 	add_pending_csums(trans, inode, ordered_extent->file_offset,
 			  &ordered_extent->list);
 
-	/* this also removes the ordered extent from the tree */
 	btrfs_ordered_update_i_size(inode, 0, ordered_extent);
 	ret = btrfs_update_inode(trans, root, inode);
 	BUG_ON(ret);
-	btrfs_end_transaction(trans, root);
 out:
+	btrfs_delalloc_release_metadata(inode, ordered_extent->len);
+	if (trans)
+		btrfs_end_transaction(trans, root);
 	/* once for us */
 	btrfs_put_ordered_extent(ordered_extent);
 	/* once for the tree */
@@ -1917,7 +1848,7 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
 
 	BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
 						      failrec->last_mirror,
-						      failrec->bio_flags);
+						      failrec->bio_flags, 0);
 	return 0;
 }
 
@@ -2072,32 +2003,196 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
 }
 
 /*
+ * calculate extra metadata reservation when snapshotting a subvolume
+ * contains orphan files.
+ */
+void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
+				struct btrfs_pending_snapshot *pending,
+				u64 *bytes_to_reserve)
+{
+	struct btrfs_root *root;
+	struct btrfs_block_rsv *block_rsv;
+	u64 num_bytes;
+	int index;
+
+	root = pending->root;
+	if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
+		return;
+
+	block_rsv = root->orphan_block_rsv;
+
+	/* orphan block reservation for the snapshot */
+	num_bytes = block_rsv->size;
+
+	/*
+	 * after the snapshot is created, COWing tree blocks may use more
+	 * space than it frees. So we should make sure there is enough
+	 * reserved space.
+	 */
+	index = trans->transid & 0x1;
+	if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
+		num_bytes += block_rsv->size -
+			     (block_rsv->reserved + block_rsv->freed[index]);
+	}
+
+	*bytes_to_reserve += num_bytes;
+}
+
+void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
+				struct btrfs_pending_snapshot *pending)
+{
+	struct btrfs_root *root = pending->root;
+	struct btrfs_root *snap = pending->snap;
+	struct btrfs_block_rsv *block_rsv;
+	u64 num_bytes;
+	int index;
+	int ret;
+
+	if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
+		return;
+
+	/* refill source subvolume's orphan block reservation */
+	block_rsv = root->orphan_block_rsv;
+	index = trans->transid & 0x1;
+	if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
+		num_bytes = block_rsv->size -
+			    (block_rsv->reserved + block_rsv->freed[index]);
+		ret = btrfs_block_rsv_migrate(&pending->block_rsv,
+					      root->orphan_block_rsv,
+					      num_bytes);
+		BUG_ON(ret);
+	}
+
+	/* setup orphan block reservation for the snapshot */
+	block_rsv = btrfs_alloc_block_rsv(snap);
+	BUG_ON(!block_rsv);
+
+	btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
+	snap->orphan_block_rsv = block_rsv;
+
+	num_bytes = root->orphan_block_rsv->size;
+	ret = btrfs_block_rsv_migrate(&pending->block_rsv,
+				      block_rsv, num_bytes);
+	BUG_ON(ret);
+
+#if 0
+	/* insert orphan item for the snapshot */
+	WARN_ON(!root->orphan_item_inserted);
+	ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
+				       snap->root_key.objectid);
+	BUG_ON(ret);
+	snap->orphan_item_inserted = 1;
+#endif
+}
+
+enum btrfs_orphan_cleanup_state {
+	ORPHAN_CLEANUP_STARTED	= 1,
+	ORPHAN_CLEANUP_DONE	= 2,
+};
+
+/*
+ * This is called in transaction commmit time. If there are no orphan
+ * files in the subvolume, it removes orphan item and frees block_rsv
+ * structure.
+ */
+void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
+			      struct btrfs_root *root)
+{
+	int ret;
+
+	if (!list_empty(&root->orphan_list) ||
+	    root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
+		return;
+
+	if (root->orphan_item_inserted &&
+	    btrfs_root_refs(&root->root_item) > 0) {
+		ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root,
+					    root->root_key.objectid);
+		BUG_ON(ret);
+		root->orphan_item_inserted = 0;
+	}
+
+	if (root->orphan_block_rsv) {
+		WARN_ON(root->orphan_block_rsv->size > 0);
+		btrfs_free_block_rsv(root, root->orphan_block_rsv);
+		root->orphan_block_rsv = NULL;
+	}
+}
+
+/*
  * This creates an orphan entry for the given inode in case something goes
  * wrong in the middle of an unlink/truncate.
+ *
+ * NOTE: caller of this function should reserve 5 units of metadata for
+ *	 this function.
  */
 int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	int ret = 0;
+	struct btrfs_block_rsv *block_rsv = NULL;
+	int reserve = 0;
+	int insert = 0;
+	int ret;
 
-	spin_lock(&root->list_lock);
+	if (!root->orphan_block_rsv) {
+		block_rsv = btrfs_alloc_block_rsv(root);
+		BUG_ON(!block_rsv);
+	}
 
-	/* already on the orphan list, we're good */
-	if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
-		spin_unlock(&root->list_lock);
-		return 0;
+	spin_lock(&root->orphan_lock);
+	if (!root->orphan_block_rsv) {
+		root->orphan_block_rsv = block_rsv;
+	} else if (block_rsv) {
+		btrfs_free_block_rsv(root, block_rsv);
+		block_rsv = NULL;
 	}
 
-	list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
+	if (list_empty(&BTRFS_I(inode)->i_orphan)) {
+		list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
+#if 0
+		/*
+		 * For proper ENOSPC handling, we should do orphan
+		 * cleanup when mounting. But this introduces backward
+		 * compatibility issue.
+		 */
+		if (!xchg(&root->orphan_item_inserted, 1))
+			insert = 2;
+		else
+			insert = 1;
+#endif
+		insert = 1;
+	} else {
+		WARN_ON(!BTRFS_I(inode)->orphan_meta_reserved);
+	}
 
-	spin_unlock(&root->list_lock);
+	if (!BTRFS_I(inode)->orphan_meta_reserved) {
+		BTRFS_I(inode)->orphan_meta_reserved = 1;
+		reserve = 1;
+	}
+	spin_unlock(&root->orphan_lock);
 
-	/*
-	 * insert an orphan item to track this unlinked/truncated file
-	 */
-	ret = btrfs_insert_orphan_item(trans, root, inode->i_ino);
+	if (block_rsv)
+		btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
 
-	return ret;
+	/* grab metadata reservation from transaction handle */
+	if (reserve) {
+		ret = btrfs_orphan_reserve_metadata(trans, inode);
+		BUG_ON(ret);
+	}
+
+	/* insert an orphan item to track this unlinked/truncated file */
+	if (insert >= 1) {
+		ret = btrfs_insert_orphan_item(trans, root, inode->i_ino);
+		BUG_ON(ret);
+	}
+
+	/* insert an orphan item to track subvolume contains orphan files */
+	if (insert >= 2) {
+		ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
+					       root->root_key.objectid);
+		BUG_ON(ret);
+	}
+	return 0;
 }
 
 /*
@@ -2107,26 +2202,31 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
 int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
+	int delete_item = 0;
+	int release_rsv = 0;
 	int ret = 0;
 
-	spin_lock(&root->list_lock);
-
-	if (list_empty(&BTRFS_I(inode)->i_orphan)) {
-		spin_unlock(&root->list_lock);
-		return 0;
+	spin_lock(&root->orphan_lock);
+	if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
+		list_del_init(&BTRFS_I(inode)->i_orphan);
+		delete_item = 1;
 	}
 
-	list_del_init(&BTRFS_I(inode)->i_orphan);
-	if (!trans) {
-		spin_unlock(&root->list_lock);
-		return 0;
+	if (BTRFS_I(inode)->orphan_meta_reserved) {
+		BTRFS_I(inode)->orphan_meta_reserved = 0;
+		release_rsv = 1;
 	}
+	spin_unlock(&root->orphan_lock);
 
-	spin_unlock(&root->list_lock);
+	if (trans && delete_item) {
+		ret = btrfs_del_orphan_item(trans, root, inode->i_ino);
+		BUG_ON(ret);
+	}
 
-	ret = btrfs_del_orphan_item(trans, root, inode->i_ino);
+	if (release_rsv)
+		btrfs_orphan_release_metadata(inode);
 
-	return ret;
+	return 0;
 }
 
 /*
@@ -2143,7 +2243,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
 	struct inode *inode;
 	int ret = 0, nr_unlink = 0, nr_truncate = 0;
 
-	if (!xchg(&root->clean_orphans, 0))
+	if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
 		return;
 
 	path = btrfs_alloc_path();
@@ -2195,17 +2295,16 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
 		found_key.objectid = found_key.offset;
 		found_key.type = BTRFS_INODE_ITEM_KEY;
 		found_key.offset = 0;
-		inode = btrfs_iget(root->fs_info->sb, &found_key, root);
-		if (IS_ERR(inode))
-			break;
+		inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
+		BUG_ON(IS_ERR(inode));
 
 		/*
 		 * add this inode to the orphan list so btrfs_orphan_del does
 		 * the proper thing when we hit it
 		 */
-		spin_lock(&root->list_lock);
+		spin_lock(&root->orphan_lock);
 		list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
-		spin_unlock(&root->list_lock);
+		spin_unlock(&root->orphan_lock);
 
 		/*
 		 * if this is a bad inode, means we actually succeeded in
@@ -2214,7 +2313,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
 		 * do a destroy_inode
 		 */
 		if (is_bad_inode(inode)) {
-			trans = btrfs_start_transaction(root, 1);
+			trans = btrfs_start_transaction(root, 0);
 			btrfs_orphan_del(trans, inode);
 			btrfs_end_transaction(trans, root);
 			iput(inode);
@@ -2232,13 +2331,23 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
 		/* this will do delete_inode and everything for us */
 		iput(inode);
 	}
+	btrfs_free_path(path);
+
+	root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
+
+	if (root->orphan_block_rsv)
+		btrfs_block_rsv_release(root, root->orphan_block_rsv,
+					(u64)-1);
+
+	if (root->orphan_block_rsv || root->orphan_item_inserted) {
+		trans = btrfs_join_transaction(root, 1);
+		btrfs_end_transaction(trans, root);
+	}
 
 	if (nr_unlink)
 		printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink);
 	if (nr_truncate)
 		printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate);
-
-	btrfs_free_path(path);
 }
 
 /*
@@ -2557,29 +2666,201 @@ out:
 	return ret;
 }
 
-static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
+/* helper to check if there is any shared block in the path */
+static int check_path_shared(struct btrfs_root *root,
+			     struct btrfs_path *path)
+{
+	struct extent_buffer *eb;
+	int level;
+	int ret;
+	u64 refs = 1;
+
+	for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
+		if (!path->nodes[level])
+			break;
+		eb = path->nodes[level];
+		if (!btrfs_block_can_be_shared(root, eb))
+			continue;
+		ret = btrfs_lookup_extent_info(NULL, root, eb->start, eb->len,
+					       &refs, NULL);
+		if (refs > 1)
+			return 1;
+	}
+	return 0;
+}
+
+/*
+ * helper to start transaction for unlink and rmdir.
+ *
+ * unlink and rmdir are special in btrfs, they do not always free space.
+ * so in enospc case, we should make sure they will free space before
+ * allowing them to use the global metadata reservation.
+ */
+static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
+						       struct dentry *dentry)
 {
-	struct btrfs_root *root;
 	struct btrfs_trans_handle *trans;
+	struct btrfs_root *root = BTRFS_I(dir)->root;
+	struct btrfs_path *path;
+	struct btrfs_inode_ref *ref;
+	struct btrfs_dir_item *di;
 	struct inode *inode = dentry->d_inode;
+	u64 index;
+	int check_link = 1;
+	int err = -ENOSPC;
 	int ret;
-	unsigned long nr = 0;
 
-	root = BTRFS_I(dir)->root;
+	trans = btrfs_start_transaction(root, 10);
+	if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
+		return trans;
 
-	/*
-	 * 5 items for unlink inode
-	 * 1 for orphan
-	 */
-	ret = btrfs_reserve_metadata_space(root, 6);
-	if (ret)
-		return ret;
+	if (inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
+		return ERR_PTR(-ENOSPC);
+
+	/* check if there is someone else holds reference */
+	if (S_ISDIR(inode->i_mode) && atomic_read(&inode->i_count) > 1)
+		return ERR_PTR(-ENOSPC);
 
-	trans = btrfs_start_transaction(root, 1);
+	if (atomic_read(&inode->i_count) > 2)
+		return ERR_PTR(-ENOSPC);
+
+	if (xchg(&root->fs_info->enospc_unlink, 1))
+		return ERR_PTR(-ENOSPC);
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		root->fs_info->enospc_unlink = 0;
+		return ERR_PTR(-ENOMEM);
+	}
+
+	trans = btrfs_start_transaction(root, 0);
 	if (IS_ERR(trans)) {
-		btrfs_unreserve_metadata_space(root, 6);
-		return PTR_ERR(trans);
+		btrfs_free_path(path);
+		root->fs_info->enospc_unlink = 0;
+		return trans;
+	}
+
+	path->skip_locking = 1;
+	path->search_commit_root = 1;
+
+	ret = btrfs_lookup_inode(trans, root, path,
+				&BTRFS_I(dir)->location, 0);
+	if (ret < 0) {
+		err = ret;
+		goto out;
+	}
+	if (ret == 0) {
+		if (check_path_shared(root, path))
+			goto out;
+	} else {
+		check_link = 0;
+	}
+	btrfs_release_path(root, path);
+
+	ret = btrfs_lookup_inode(trans, root, path,
+				&BTRFS_I(inode)->location, 0);
+	if (ret < 0) {
+		err = ret;
+		goto out;
+	}
+	if (ret == 0) {
+		if (check_path_shared(root, path))
+			goto out;
+	} else {
+		check_link = 0;
+	}
+	btrfs_release_path(root, path);
+
+	if (ret == 0 && S_ISREG(inode->i_mode)) {
+		ret = btrfs_lookup_file_extent(trans, root, path,
+					       inode->i_ino, (u64)-1, 0);
+		if (ret < 0) {
+			err = ret;
+			goto out;
+		}
+		BUG_ON(ret == 0);
+		if (check_path_shared(root, path))
+			goto out;
+		btrfs_release_path(root, path);
+	}
+
+	if (!check_link) {
+		err = 0;
+		goto out;
+	}
+
+	di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
+				dentry->d_name.name, dentry->d_name.len, 0);
+	if (IS_ERR(di)) {
+		err = PTR_ERR(di);
+		goto out;
+	}
+	if (di) {
+		if (check_path_shared(root, path))
+			goto out;
+	} else {
+		err = 0;
+		goto out;
+	}
+	btrfs_release_path(root, path);
+
+	ref = btrfs_lookup_inode_ref(trans, root, path,
+				dentry->d_name.name, dentry->d_name.len,
+				inode->i_ino, dir->i_ino, 0);
+	if (IS_ERR(ref)) {
+		err = PTR_ERR(ref);
+		goto out;
+	}
+	BUG_ON(!ref);
+	if (check_path_shared(root, path))
+		goto out;
+	index = btrfs_inode_ref_index(path->nodes[0], ref);
+	btrfs_release_path(root, path);
+
+	di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, index,
+				dentry->d_name.name, dentry->d_name.len, 0);
+	if (IS_ERR(di)) {
+		err = PTR_ERR(di);
+		goto out;
+	}
+	BUG_ON(ret == -ENOENT);
+	if (check_path_shared(root, path))
+		goto out;
+
+	err = 0;
+out:
+	btrfs_free_path(path);
+	if (err) {
+		btrfs_end_transaction(trans, root);
+		root->fs_info->enospc_unlink = 0;
+		return ERR_PTR(err);
+	}
+
+	trans->block_rsv = &root->fs_info->global_block_rsv;
+	return trans;
+}
+
+static void __unlink_end_trans(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root)
+{
+	if (trans->block_rsv == &root->fs_info->global_block_rsv) {
+		BUG_ON(!root->fs_info->enospc_unlink);
+		root->fs_info->enospc_unlink = 0;
 	}
+	btrfs_end_transaction_throttle(trans, root);
+}
+
+static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct btrfs_root *root = BTRFS_I(dir)->root;
+	struct btrfs_trans_handle *trans;
+	struct inode *inode = dentry->d_inode;
+	int ret;
+	unsigned long nr = 0;
+
+	trans = __unlink_start_trans(dir, dentry);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
 
 	btrfs_set_trans_block_group(trans, dir);
 
@@ -2587,14 +2868,15 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
 
 	ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
 				 dentry->d_name.name, dentry->d_name.len);
+	BUG_ON(ret);
 
-	if (inode->i_nlink == 0)
+	if (inode->i_nlink == 0) {
 		ret = btrfs_orphan_add(trans, inode);
+		BUG_ON(ret);
+	}
 
 	nr = trans->blocks_used;
-
-	btrfs_end_transaction_throttle(trans, root);
-	btrfs_unreserve_metadata_space(root, 6);
+	__unlink_end_trans(trans, root);
 	btrfs_btree_balance_dirty(root, nr);
 	return ret;
 }
@@ -2666,7 +2948,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 {
 	struct inode *inode = dentry->d_inode;
 	int err = 0;
-	int ret;
 	struct btrfs_root *root = BTRFS_I(dir)->root;
 	struct btrfs_trans_handle *trans;
 	unsigned long nr = 0;
@@ -2675,15 +2956,9 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 	    inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
 		return -ENOTEMPTY;
 
-	ret = btrfs_reserve_metadata_space(root, 5);
-	if (ret)
-		return ret;
-
-	trans = btrfs_start_transaction(root, 1);
-	if (IS_ERR(trans)) {
-		btrfs_unreserve_metadata_space(root, 5);
+	trans = __unlink_start_trans(dir, dentry);
+	if (IS_ERR(trans))
 		return PTR_ERR(trans);
-	}
 
 	btrfs_set_trans_block_group(trans, dir);
 
@@ -2706,12 +2981,9 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 		btrfs_i_size_write(inode, 0);
 out:
 	nr = trans->blocks_used;
-	ret = btrfs_end_transaction_throttle(trans, root);
-	btrfs_unreserve_metadata_space(root, 5);
+	__unlink_end_trans(trans, root);
 	btrfs_btree_balance_dirty(root, nr);
 
-	if (ret && !err)
-		err = ret;
 	return err;
 }
 
@@ -3108,6 +3380,7 @@ out:
 	if (pending_del_nr) {
 		ret = btrfs_del_items(trans, root, path, pending_del_slot,
 				      pending_del_nr);
+		BUG_ON(ret);
 	}
 	btrfs_free_path(path);
 	return err;
@@ -3123,6 +3396,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	struct btrfs_ordered_extent *ordered;
+	struct extent_state *cached_state = NULL;
 	char *kaddr;
 	u32 blocksize = root->sectorsize;
 	pgoff_t index = from >> PAGE_CACHE_SHIFT;
@@ -3134,11 +3408,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
 
 	if ((offset & (blocksize - 1)) == 0)
 		goto out;
-	ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE);
-	if (ret)
-		goto out;
-
-	ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
+	ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
 	if (ret)
 		goto out;
 
@@ -3146,8 +3416,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
 again:
 	page = grab_cache_page(mapping, index);
 	if (!page) {
-		btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
-		btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
+		btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
 		goto out;
 	}
 
@@ -3169,12 +3438,14 @@ again:
 	}
 	wait_on_page_writeback(page);
 
-	lock_extent(io_tree, page_start, page_end, GFP_NOFS);
+	lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state,
+			 GFP_NOFS);
 	set_page_extent_mapped(page);
 
 	ordered = btrfs_lookup_ordered_extent(inode, page_start);
 	if (ordered) {
-		unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+		unlock_extent_cached(io_tree, page_start, page_end,
+				     &cached_state, GFP_NOFS);
 		unlock_page(page);
 		page_cache_release(page);
 		btrfs_start_ordered_extent(inode, ordered, 1);
@@ -3182,13 +3453,15 @@ again:
 		goto again;
 	}
 
-	clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
+	clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
 			  EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
-			  GFP_NOFS);
+			  0, 0, &cached_state, GFP_NOFS);
 
-	ret = btrfs_set_extent_delalloc(inode, page_start, page_end);
+	ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
+					&cached_state);
 	if (ret) {
-		unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+		unlock_extent_cached(io_tree, page_start, page_end,
+				     &cached_state, GFP_NOFS);
 		goto out_unlock;
 	}
 
@@ -3201,12 +3474,12 @@ again:
 	}
 	ClearPageChecked(page);
 	set_page_dirty(page);
-	unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+	unlock_extent_cached(io_tree, page_start, page_end, &cached_state,
+			     GFP_NOFS);
 
 out_unlock:
 	if (ret)
-		btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
-	btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
+		btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
 	unlock_page(page);
 	page_cache_release(page);
 out:
@@ -3218,7 +3491,8 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
 	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
-	struct extent_map *em;
+	struct extent_map *em = NULL;
+	struct extent_state *cached_state = NULL;
 	u64 mask = root->sectorsize - 1;
 	u64 hole_start = (inode->i_size + mask) & ~mask;
 	u64 block_end = (size + mask) & ~mask;
@@ -3234,11 +3508,13 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
 		struct btrfs_ordered_extent *ordered;
 		btrfs_wait_ordered_range(inode, hole_start,
 					 block_end - hole_start);
-		lock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
+		lock_extent_bits(io_tree, hole_start, block_end - 1, 0,
+				 &cached_state, GFP_NOFS);
 		ordered = btrfs_lookup_ordered_extent(inode, hole_start);
 		if (!ordered)
 			break;
-		unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
+		unlock_extent_cached(io_tree, hole_start, block_end - 1,
+				     &cached_state, GFP_NOFS);
 		btrfs_put_ordered_extent(ordered);
 	}
 
@@ -3253,11 +3529,11 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
 			u64 hint_byte = 0;
 			hole_size = last_byte - cur_offset;
 
-			err = btrfs_reserve_metadata_space(root, 2);
-			if (err)
+			trans = btrfs_start_transaction(root, 2);
+			if (IS_ERR(trans)) {
+				err = PTR_ERR(trans);
 				break;
-
-			trans = btrfs_start_transaction(root, 1);
+			}
 			btrfs_set_trans_block_group(trans, inode);
 
 			err = btrfs_drop_extents(trans, inode, cur_offset,
@@ -3275,15 +3551,17 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
 					last_byte - 1, 0);
 
 			btrfs_end_transaction(trans, root);
-			btrfs_unreserve_metadata_space(root, 2);
 		}
 		free_extent_map(em);
+		em = NULL;
 		cur_offset = last_byte;
 		if (cur_offset >= block_end)
 			break;
 	}
 
-	unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
+	free_extent_map(em);
+	unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state,
+			     GFP_NOFS);
 	return err;
 }
 
@@ -3308,11 +3586,10 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
 		}
 	}
 
-	ret = btrfs_reserve_metadata_space(root, 1);
-	if (ret)
-		return ret;
+	trans = btrfs_start_transaction(root, 5);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
 
-	trans = btrfs_start_transaction(root, 1);
 	btrfs_set_trans_block_group(trans, inode);
 
 	ret = btrfs_orphan_add(trans, inode);
@@ -3320,7 +3597,6 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
 
 	nr = trans->blocks_used;
 	btrfs_end_transaction(trans, root);
-	btrfs_unreserve_metadata_space(root, 1);
 	btrfs_btree_balance_dirty(root, nr);
 
 	if (attr->ia_size > inode->i_size) {
@@ -3333,8 +3609,11 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
 		i_size_write(inode, attr->ia_size);
 		btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
 
-		trans = btrfs_start_transaction(root, 1);
+		trans = btrfs_start_transaction(root, 0);
+		BUG_ON(IS_ERR(trans));
 		btrfs_set_trans_block_group(trans, inode);
+		trans->block_rsv = root->orphan_block_rsv;
+		BUG_ON(!trans->block_rsv);
 
 		ret = btrfs_update_inode(trans, root, inode);
 		BUG_ON(ret);
@@ -3414,10 +3693,21 @@ void btrfs_delete_inode(struct inode *inode)
 	btrfs_i_size_write(inode, 0);
 
 	while (1) {
-		trans = btrfs_start_transaction(root, 1);
+		trans = btrfs_start_transaction(root, 0);
+		BUG_ON(IS_ERR(trans));
 		btrfs_set_trans_block_group(trans, inode);
-		ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
+		trans->block_rsv = root->orphan_block_rsv;
+
+		ret = btrfs_block_rsv_check(trans, root,
+					    root->orphan_block_rsv, 0, 5);
+		if (ret) {
+			BUG_ON(ret != -EAGAIN);
+			ret = btrfs_commit_transaction(trans, root);
+			BUG_ON(ret);
+			continue;
+		}
 
+		ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
 		if (ret != -EAGAIN)
 			break;
 
@@ -3425,6 +3715,7 @@ void btrfs_delete_inode(struct inode *inode)
 		btrfs_end_transaction(trans, root);
 		trans = NULL;
 		btrfs_btree_balance_dirty(root, nr);
+
 	}
 
 	if (ret == 0) {
@@ -3665,39 +3956,10 @@ again:
 	return 0;
 }
 
-static noinline void init_btrfs_i(struct inode *inode)
-{
-	struct btrfs_inode *bi = BTRFS_I(inode);
-
-	bi->generation = 0;
-	bi->sequence = 0;
-	bi->last_trans = 0;
-	bi->last_sub_trans = 0;
-	bi->logged_trans = 0;
-	bi->delalloc_bytes = 0;
-	bi->reserved_bytes = 0;
-	bi->disk_i_size = 0;
-	bi->flags = 0;
-	bi->index_cnt = (u64)-1;
-	bi->last_unlink_trans = 0;
-	bi->ordered_data_close = 0;
-	extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
-	extent_io_tree_init(&BTRFS_I(inode)->io_tree,
-			     inode->i_mapping, GFP_NOFS);
-	extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
-			     inode->i_mapping, GFP_NOFS);
-	INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
-	INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations);
-	RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
-	btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
-	mutex_init(&BTRFS_I(inode)->log_mutex);
-}
-
 static int btrfs_init_locked_inode(struct inode *inode, void *p)
 {
 	struct btrfs_iget_args *args = p;
 	inode->i_ino = args->ino;
-	init_btrfs_i(inode);
 	BTRFS_I(inode)->root = args->root;
 	btrfs_set_inode_space_info(args->root, inode);
 	return 0;
@@ -3729,7 +3991,7 @@ static struct inode *btrfs_iget_locked(struct super_block *s,
  * Returns in *is_new if the inode was read from disk
  */
 struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
-			 struct btrfs_root *root)
+			 struct btrfs_root *root, int *new)
 {
 	struct inode *inode;
 
@@ -3744,6 +4006,8 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
 
 		inode_tree_add(inode);
 		unlock_new_inode(inode);
+		if (new)
+			*new = 1;
 	}
 
 	return inode;
@@ -3758,8 +4022,6 @@ static struct inode *new_simple_dir(struct super_block *s,
 	if (!inode)
 		return ERR_PTR(-ENOMEM);
 
-	init_btrfs_i(inode);
-
 	BTRFS_I(inode)->root = root;
 	memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
 	BTRFS_I(inode)->dummy_inode = 1;
@@ -3796,7 +4058,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
 		return NULL;
 
 	if (location.type == BTRFS_INODE_ITEM_KEY) {
-		inode = btrfs_iget(dir->i_sb, &location, root);
+		inode = btrfs_iget(dir->i_sb, &location, root, NULL);
 		return inode;
 	}
 
@@ -3811,7 +4073,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
 		else
 			inode = new_simple_dir(dir->i_sb, &location, sub_root);
 	} else {
-		inode = btrfs_iget(dir->i_sb, &location, sub_root);
+		inode = btrfs_iget(dir->i_sb, &location, sub_root, NULL);
 	}
 	srcu_read_unlock(&root->fs_info->subvol_srcu, index);
 
@@ -4010,16 +4272,16 @@ err:
 	return ret;
 }
 
-int btrfs_write_inode(struct inode *inode, int wait)
+int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_trans_handle *trans;
 	int ret = 0;
 
-	if (root->fs_info->btree_inode == inode)
+	if (BTRFS_I(inode)->dummy_inode)
 		return 0;
 
-	if (wait) {
+	if (wbc->sync_mode == WB_SYNC_ALL) {
 		trans = btrfs_join_transaction(root, 1);
 		btrfs_set_trans_block_group(trans, inode);
 		ret = btrfs_commit_transaction(trans, root);
@@ -4037,10 +4299,38 @@ void btrfs_dirty_inode(struct inode *inode)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_trans_handle *trans;
+	int ret;
+
+	if (BTRFS_I(inode)->dummy_inode)
+		return;
 
 	trans = btrfs_join_transaction(root, 1);
 	btrfs_set_trans_block_group(trans, inode);
-	btrfs_update_inode(trans, root, inode);
+
+	ret = btrfs_update_inode(trans, root, inode);
+	if (ret && ret == -ENOSPC) {
+		/* whoops, lets try again with the full transaction */
+		btrfs_end_transaction(trans, root);
+		trans = btrfs_start_transaction(root, 1);
+		if (IS_ERR(trans)) {
+			if (printk_ratelimit()) {
+				printk(KERN_ERR "btrfs: fail to "
+				       "dirty  inode %lu error %ld\n",
+				       inode->i_ino, PTR_ERR(trans));
+			}
+			return;
+		}
+		btrfs_set_trans_block_group(trans, inode);
+
+		ret = btrfs_update_inode(trans, root, inode);
+		if (ret) {
+			if (printk_ratelimit()) {
+				printk(KERN_ERR "btrfs: fail to "
+				       "dirty  inode %lu error %d\n",
+				       inode->i_ino, ret);
+			}
+		}
+	}
 	btrfs_end_transaction(trans, root);
 }
 
@@ -4158,7 +4448,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	 * btrfs_get_inode_index_count has an explanation for the magic
 	 * number
 	 */
-	init_btrfs_i(inode);
 	BTRFS_I(inode)->index_cnt = 2;
 	BTRFS_I(inode)->root = root;
 	BTRFS_I(inode)->generation = trans->transid;
@@ -4187,16 +4476,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	if (ret != 0)
 		goto fail;
 
-	inode->i_uid = current_fsuid();
-
-	if (dir && (dir->i_mode & S_ISGID)) {
-		inode->i_gid = dir->i_gid;
-		if (S_ISDIR(mode))
-			mode |= S_ISGID;
-	} else
-		inode->i_gid = current_fsgid();
-
-	inode->i_mode = mode;
+	inode_init_owner(inode, dir, mode);
 	inode->i_ino = objectid;
 	inode_set_bytes(inode, 0);
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
@@ -4322,26 +4602,21 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
 	if (!new_valid_dev(rdev))
 		return -EINVAL;
 
+	err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
+	if (err)
+		return err;
+
 	/*
 	 * 2 for inode item and ref
 	 * 2 for dir items
 	 * 1 for xattr if selinux is on
 	 */
-	err = btrfs_reserve_metadata_space(root, 5);
-	if (err)
-		return err;
+	trans = btrfs_start_transaction(root, 5);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
 
-	trans = btrfs_start_transaction(root, 1);
-	if (!trans)
-		goto fail;
 	btrfs_set_trans_block_group(trans, dir);
 
-	err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
-	if (err) {
-		err = -ENOSPC;
-		goto out_unlock;
-	}
-
 	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
 				dentry->d_name.len,
 				dentry->d_parent->d_inode->i_ino, objectid,
@@ -4370,13 +4645,11 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
 out_unlock:
 	nr = trans->blocks_used;
 	btrfs_end_transaction_throttle(trans, root);
-fail:
-	btrfs_unreserve_metadata_space(root, 5);
+	btrfs_btree_balance_dirty(root, nr);
 	if (drop_inode) {
 		inode_dec_link_count(inode);
 		iput(inode);
 	}
-	btrfs_btree_balance_dirty(root, nr);
 	return err;
 }
 
@@ -4386,32 +4659,26 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root = BTRFS_I(dir)->root;
 	struct inode *inode = NULL;
-	int err;
 	int drop_inode = 0;
+	int err;
 	unsigned long nr = 0;
 	u64 objectid;
 	u64 index = 0;
 
+	err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
+	if (err)
+		return err;
 	/*
 	 * 2 for inode item and ref
 	 * 2 for dir items
 	 * 1 for xattr if selinux is on
 	 */
-	err = btrfs_reserve_metadata_space(root, 5);
-	if (err)
-		return err;
+	trans = btrfs_start_transaction(root, 5);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
 
-	trans = btrfs_start_transaction(root, 1);
-	if (!trans)
-		goto fail;
 	btrfs_set_trans_block_group(trans, dir);
 
-	err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
-	if (err) {
-		err = -ENOSPC;
-		goto out_unlock;
-	}
-
 	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
 				dentry->d_name.len,
 				dentry->d_parent->d_inode->i_ino,
@@ -4443,8 +4710,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 out_unlock:
 	nr = trans->blocks_used;
 	btrfs_end_transaction_throttle(trans, root);
-fail:
-	btrfs_unreserve_metadata_space(root, 5);
 	if (drop_inode) {
 		inode_dec_link_count(inode);
 		iput(inode);
@@ -4471,21 +4736,21 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 	if (root->objectid != BTRFS_I(inode)->root->objectid)
 		return -EPERM;
 
-	/*
-	 * 1 item for inode ref
-	 * 2 items for dir items
-	 */
-	err = btrfs_reserve_metadata_space(root, 3);
-	if (err)
-		return err;
-
 	btrfs_inc_nlink(inode);
 
 	err = btrfs_set_inode_index(dir, &index);
 	if (err)
 		goto fail;
 
-	trans = btrfs_start_transaction(root, 1);
+	/*
+	 * 1 item for inode ref
+	 * 2 items for dir items
+	 */
+	trans = btrfs_start_transaction(root, 3);
+	if (IS_ERR(trans)) {
+		err = PTR_ERR(trans);
+		goto fail;
+	}
 
 	btrfs_set_trans_block_group(trans, dir);
 	atomic_inc(&inode->i_count);
@@ -4504,7 +4769,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 	nr = trans->blocks_used;
 	btrfs_end_transaction_throttle(trans, root);
 fail:
-	btrfs_unreserve_metadata_space(root, 3);
 	if (drop_inode) {
 		inode_dec_link_count(inode);
 		iput(inode);
@@ -4524,28 +4788,20 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	u64 index = 0;
 	unsigned long nr = 1;
 
+	err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
+	if (err)
+		return err;
+
 	/*
 	 * 2 items for inode and ref
 	 * 2 items for dir items
 	 * 1 for xattr if selinux is on
 	 */
-	err = btrfs_reserve_metadata_space(root, 5);
-	if (err)
-		return err;
-
-	trans = btrfs_start_transaction(root, 1);
-	if (!trans) {
-		err = -ENOMEM;
-		goto out_unlock;
-	}
+	trans = btrfs_start_transaction(root, 5);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
 	btrfs_set_trans_block_group(trans, dir);
 
-	err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
-	if (err) {
-		err = -ENOSPC;
-		goto out_unlock;
-	}
-
 	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
 				dentry->d_name.len,
 				dentry->d_parent->d_inode->i_ino, objectid,
@@ -4585,9 +4841,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 out_fail:
 	nr = trans->blocks_used;
 	btrfs_end_transaction_throttle(trans, root);
-
-out_unlock:
-	btrfs_unreserve_metadata_space(root, 5);
 	if (drop_on_err)
 		iput(inode);
 	btrfs_btree_balance_dirty(root, nr);
@@ -4845,6 +5098,7 @@ again:
 			}
 			flush_dcache_page(page);
 		} else if (create && PageUptodate(page)) {
+			WARN_ON(1);
 			if (!trans) {
 				kunmap(page);
 				free_extent_map(em);
@@ -4941,11 +5195,651 @@ out:
 	return em;
 }
 
+static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
+						  u64 start, u64 len)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_trans_handle *trans;
+	struct extent_map *em;
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct btrfs_key ins;
+	u64 alloc_hint;
+	int ret;
+
+	btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
+
+	trans = btrfs_join_transaction(root, 0);
+	if (!trans)
+		return ERR_PTR(-ENOMEM);
+
+	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
+
+	alloc_hint = get_extent_allocation_hint(inode, start, len);
+	ret = btrfs_reserve_extent(trans, root, len, root->sectorsize, 0,
+				   alloc_hint, (u64)-1, &ins, 1);
+	if (ret) {
+		em = ERR_PTR(ret);
+		goto out;
+	}
+
+	em = alloc_extent_map(GFP_NOFS);
+	if (!em) {
+		em = ERR_PTR(-ENOMEM);
+		goto out;
+	}
+
+	em->start = start;
+	em->orig_start = em->start;
+	em->len = ins.offset;
+
+	em->block_start = ins.objectid;
+	em->block_len = ins.offset;
+	em->bdev = root->fs_info->fs_devices->latest_bdev;
+	set_bit(EXTENT_FLAG_PINNED, &em->flags);
+
+	while (1) {
+		write_lock(&em_tree->lock);
+		ret = add_extent_mapping(em_tree, em);
+		write_unlock(&em_tree->lock);
+		if (ret != -EEXIST)
+			break;
+		btrfs_drop_extent_cache(inode, start, start + em->len - 1, 0);
+	}
+
+	ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
+					   ins.offset, ins.offset, 0);
+	if (ret) {
+		btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
+		em = ERR_PTR(ret);
+	}
+out:
+	btrfs_end_transaction(trans, root);
+	return em;
+}
+
+/*
+ * returns 1 when the nocow is safe, < 1 on error, 0 if the
+ * block must be cow'd
+ */
+static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
+				      struct inode *inode, u64 offset, u64 len)
+{
+	struct btrfs_path *path;
+	int ret;
+	struct extent_buffer *leaf;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_file_extent_item *fi;
+	struct btrfs_key key;
+	u64 disk_bytenr;
+	u64 backref_offset;
+	u64 extent_end;
+	u64 num_bytes;
+	int slot;
+	int found_type;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
+				       offset, 0);
+	if (ret < 0)
+		goto out;
+
+	slot = path->slots[0];
+	if (ret == 1) {
+		if (slot == 0) {
+			/* can't find the item, must cow */
+			ret = 0;
+			goto out;
+		}
+		slot--;
+	}
+	ret = 0;
+	leaf = path->nodes[0];
+	btrfs_item_key_to_cpu(leaf, &key, slot);
+	if (key.objectid != inode->i_ino ||
+	    key.type != BTRFS_EXTENT_DATA_KEY) {
+		/* not our file or wrong item type, must cow */
+		goto out;
+	}
+
+	if (key.offset > offset) {
+		/* Wrong offset, must cow */
+		goto out;
+	}
+
+	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+	found_type = btrfs_file_extent_type(leaf, fi);
+	if (found_type != BTRFS_FILE_EXTENT_REG &&
+	    found_type != BTRFS_FILE_EXTENT_PREALLOC) {
+		/* not a regular extent, must cow */
+		goto out;
+	}
+	disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+	backref_offset = btrfs_file_extent_offset(leaf, fi);
+
+	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
+	if (extent_end < offset + len) {
+		/* extent doesn't include our full range, must cow */
+		goto out;
+	}
+
+	if (btrfs_extent_readonly(root, disk_bytenr))
+		goto out;
+
+	/*
+	 * look for other files referencing this extent, if we
+	 * find any we must cow
+	 */
+	if (btrfs_cross_ref_exist(trans, root, inode->i_ino,
+				  key.offset - backref_offset, disk_bytenr))
+		goto out;
+
+	/*
+	 * adjust disk_bytenr and num_bytes to cover just the bytes
+	 * in this extent we are about to write.  If there
+	 * are any csums in that range we have to cow in order
+	 * to keep the csums correct
+	 */
+	disk_bytenr += backref_offset;
+	disk_bytenr += offset - key.offset;
+	num_bytes = min(offset + len, extent_end) - offset;
+	if (csum_exist_in_range(root, disk_bytenr, num_bytes))
+				goto out;
+	/*
+	 * all of the above have passed, it is safe to overwrite this extent
+	 * without cow
+	 */
+	ret = 1;
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
+				   struct buffer_head *bh_result, int create)
+{
+	struct extent_map *em;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	u64 start = iblock << inode->i_blkbits;
+	u64 len = bh_result->b_size;
+	struct btrfs_trans_handle *trans;
+
+	em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
+	if (IS_ERR(em))
+		return PTR_ERR(em);
+
+	/*
+	 * Ok for INLINE and COMPRESSED extents we need to fallback on buffered
+	 * io.  INLINE is special, and we could probably kludge it in here, but
+	 * it's still buffered so for safety lets just fall back to the generic
+	 * buffered path.
+	 *
+	 * For COMPRESSED we _have_ to read the entire extent in so we can
+	 * decompress it, so there will be buffering required no matter what we
+	 * do, so go ahead and fallback to buffered.
+	 *
+	 * We return -ENOTBLK because thats what makes DIO go ahead and go back
+	 * to buffered IO.  Don't blame me, this is the price we pay for using
+	 * the generic code.
+	 */
+	if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
+	    em->block_start == EXTENT_MAP_INLINE) {
+		free_extent_map(em);
+		return -ENOTBLK;
+	}
+
+	/* Just a good old fashioned hole, return */
+	if (!create && (em->block_start == EXTENT_MAP_HOLE ||
+			test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
+		free_extent_map(em);
+		/* DIO will do one hole at a time, so just unlock a sector */
+		unlock_extent(&BTRFS_I(inode)->io_tree, start,
+			      start + root->sectorsize - 1, GFP_NOFS);
+		return 0;
+	}
+
+	/*
+	 * We don't allocate a new extent in the following cases
+	 *
+	 * 1) The inode is marked as NODATACOW.  In this case we'll just use the
+	 * existing extent.
+	 * 2) The extent is marked as PREALLOC.  We're good to go here and can
+	 * just use the extent.
+	 *
+	 */
+	if (!create) {
+		len = em->len - (start - em->start);
+		goto map;
+	}
+
+	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
+	    ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
+	     em->block_start != EXTENT_MAP_HOLE)) {
+		int type;
+		int ret;
+		u64 block_start;
+
+		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+			type = BTRFS_ORDERED_PREALLOC;
+		else
+			type = BTRFS_ORDERED_NOCOW;
+		len = min(len, em->len - (start - em->start));
+		block_start = em->block_start + (start - em->start);
+
+		/*
+		 * we're not going to log anything, but we do need
+		 * to make sure the current transaction stays open
+		 * while we look for nocow cross refs
+		 */
+		trans = btrfs_join_transaction(root, 0);
+		if (!trans)
+			goto must_cow;
+
+		if (can_nocow_odirect(trans, inode, start, len) == 1) {
+			ret = btrfs_add_ordered_extent_dio(inode, start,
+					   block_start, len, len, type);
+			btrfs_end_transaction(trans, root);
+			if (ret) {
+				free_extent_map(em);
+				return ret;
+			}
+			goto unlock;
+		}
+		btrfs_end_transaction(trans, root);
+	}
+must_cow:
+	/*
+	 * this will cow the extent, reset the len in case we changed
+	 * it above
+	 */
+	len = bh_result->b_size;
+	free_extent_map(em);
+	em = btrfs_new_extent_direct(inode, start, len);
+	if (IS_ERR(em))
+		return PTR_ERR(em);
+	len = min(len, em->len - (start - em->start));
+unlock:
+	clear_extent_bit(&BTRFS_I(inode)->io_tree, start, start + len - 1,
+			  EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DIRTY, 1,
+			  0, NULL, GFP_NOFS);
+map:
+	bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
+		inode->i_blkbits;
+	bh_result->b_size = len;
+	bh_result->b_bdev = em->bdev;
+	set_buffer_mapped(bh_result);
+	if (create && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+		set_buffer_new(bh_result);
+
+	free_extent_map(em);
+
+	return 0;
+}
+
+struct btrfs_dio_private {
+	struct inode *inode;
+	u64 logical_offset;
+	u64 disk_bytenr;
+	u64 bytes;
+	u32 *csums;
+	void *private;
+};
+
+static void btrfs_endio_direct_read(struct bio *bio, int err)
+{
+	struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
+	struct bio_vec *bvec = bio->bi_io_vec;
+	struct btrfs_dio_private *dip = bio->bi_private;
+	struct inode *inode = dip->inode;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	u64 start;
+	u32 *private = dip->csums;
+
+	start = dip->logical_offset;
+	do {
+		if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
+			struct page *page = bvec->bv_page;
+			char *kaddr;
+			u32 csum = ~(u32)0;
+			unsigned long flags;
+
+			local_irq_save(flags);
+			kaddr = kmap_atomic(page, KM_IRQ0);
+			csum = btrfs_csum_data(root, kaddr + bvec->bv_offset,
+					       csum, bvec->bv_len);
+			btrfs_csum_final(csum, (char *)&csum);
+			kunmap_atomic(kaddr, KM_IRQ0);
+			local_irq_restore(flags);
+
+			flush_dcache_page(bvec->bv_page);
+			if (csum != *private) {
+				printk(KERN_ERR "btrfs csum failed ino %lu off"
+				      " %llu csum %u private %u\n",
+				      inode->i_ino, (unsigned long long)start,
+				      csum, *private);
+				err = -EIO;
+			}
+		}
+
+		start += bvec->bv_len;
+		private++;
+		bvec++;
+	} while (bvec <= bvec_end);
+
+	unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
+		      dip->logical_offset + dip->bytes - 1, GFP_NOFS);
+	bio->bi_private = dip->private;
+
+	kfree(dip->csums);
+	kfree(dip);
+	dio_end_io(bio, err);
+}
+
+static void btrfs_endio_direct_write(struct bio *bio, int err)
+{
+	struct btrfs_dio_private *dip = bio->bi_private;
+	struct inode *inode = dip->inode;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_trans_handle *trans;
+	struct btrfs_ordered_extent *ordered = NULL;
+	struct extent_state *cached_state = NULL;
+	int ret;
+
+	if (err)
+		goto out_done;
+
+	ret = btrfs_dec_test_ordered_pending(inode, &ordered,
+					     dip->logical_offset, dip->bytes);
+	if (!ret)
+		goto out_done;
+
+	BUG_ON(!ordered);
+
+	trans = btrfs_join_transaction(root, 1);
+	if (!trans) {
+		err = -ENOMEM;
+		goto out;
+	}
+	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
+
+	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
+		ret = btrfs_ordered_update_i_size(inode, 0, ordered);
+		if (!ret)
+			ret = btrfs_update_inode(trans, root, inode);
+		err = ret;
+		goto out;
+	}
+
+	lock_extent_bits(&BTRFS_I(inode)->io_tree, ordered->file_offset,
+			 ordered->file_offset + ordered->len - 1, 0,
+			 &cached_state, GFP_NOFS);
+
+	if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) {
+		ret = btrfs_mark_extent_written(trans, inode,
+						ordered->file_offset,
+						ordered->file_offset +
+						ordered->len);
+		if (ret) {
+			err = ret;
+			goto out_unlock;
+		}
+	} else {
+		ret = insert_reserved_file_extent(trans, inode,
+						  ordered->file_offset,
+						  ordered->start,
+						  ordered->disk_len,
+						  ordered->len,
+						  ordered->len,
+						  0, 0, 0,
+						  BTRFS_FILE_EXTENT_REG);
+		unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
+				   ordered->file_offset, ordered->len);
+		if (ret) {
+			err = ret;
+			WARN_ON(1);
+			goto out_unlock;
+		}
+	}
+
+	add_pending_csums(trans, inode, ordered->file_offset, &ordered->list);
+	btrfs_ordered_update_i_size(inode, 0, ordered);
+	btrfs_update_inode(trans, root, inode);
+out_unlock:
+	unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset,
+			     ordered->file_offset + ordered->len - 1,
+			     &cached_state, GFP_NOFS);
+out:
+	btrfs_delalloc_release_metadata(inode, ordered->len);
+	btrfs_end_transaction(trans, root);
+	btrfs_put_ordered_extent(ordered);
+	btrfs_put_ordered_extent(ordered);
+out_done:
+	bio->bi_private = dip->private;
+
+	kfree(dip->csums);
+	kfree(dip);
+	dio_end_io(bio, err);
+}
+
+static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw,
+				    struct bio *bio, int mirror_num,
+				    unsigned long bio_flags, u64 offset)
+{
+	int ret;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	ret = btrfs_csum_one_bio(root, inode, bio, offset, 1);
+	BUG_ON(ret);
+	return 0;
+}
+
+static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
+				loff_t file_offset)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_dio_private *dip;
+	struct bio_vec *bvec = bio->bi_io_vec;
+	u64 start;
+	int skip_sum;
+	int write = rw & (1 << BIO_RW);
+	int ret = 0;
+
+	skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
+
+	dip = kmalloc(sizeof(*dip), GFP_NOFS);
+	if (!dip) {
+		ret = -ENOMEM;
+		goto free_ordered;
+	}
+	dip->csums = NULL;
+
+	if (!skip_sum) {
+		dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS);
+		if (!dip->csums) {
+			ret = -ENOMEM;
+			goto free_ordered;
+		}
+	}
+
+	dip->private = bio->bi_private;
+	dip->inode = inode;
+	dip->logical_offset = file_offset;
+
+	start = dip->logical_offset;
+	dip->bytes = 0;
+	do {
+		dip->bytes += bvec->bv_len;
+		bvec++;
+	} while (bvec <= (bio->bi_io_vec + bio->bi_vcnt - 1));
+
+	dip->disk_bytenr = (u64)bio->bi_sector << 9;
+	bio->bi_private = dip;
+
+	if (write)
+		bio->bi_end_io = btrfs_endio_direct_write;
+	else
+		bio->bi_end_io = btrfs_endio_direct_read;
+
+	ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+	if (ret)
+		goto out_err;
+
+	if (write && !skip_sum) {
+		ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
+				   inode, rw, bio, 0, 0,
+				   dip->logical_offset,
+				   __btrfs_submit_bio_start_direct_io,
+				   __btrfs_submit_bio_done);
+		if (ret)
+			goto out_err;
+		return;
+	} else if (!skip_sum)
+		btrfs_lookup_bio_sums_dio(root, inode, bio,
+					  dip->logical_offset, dip->csums);
+
+	ret = btrfs_map_bio(root, rw, bio, 0, 1);
+	if (ret)
+		goto out_err;
+	return;
+out_err:
+	kfree(dip->csums);
+	kfree(dip);
+free_ordered:
+	/*
+	 * If this is a write, we need to clean up the reserved space and kill
+	 * the ordered extent.
+	 */
+	if (write) {
+		struct btrfs_ordered_extent *ordered;
+		ordered = btrfs_lookup_ordered_extent(inode,
+						      dip->logical_offset);
+		if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) &&
+		    !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
+			btrfs_free_reserved_extent(root, ordered->start,
+						   ordered->disk_len);
+		btrfs_put_ordered_extent(ordered);
+		btrfs_put_ordered_extent(ordered);
+	}
+	bio_endio(bio, ret);
+}
+
+static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb,
+			const struct iovec *iov, loff_t offset,
+			unsigned long nr_segs)
+{
+	int seg;
+	size_t size;
+	unsigned long addr;
+	unsigned blocksize_mask = root->sectorsize - 1;
+	ssize_t retval = -EINVAL;
+	loff_t end = offset;
+
+	if (offset & blocksize_mask)
+		goto out;
+
+	/* Check the memory alignment.  Blocks cannot straddle pages */
+	for (seg = 0; seg < nr_segs; seg++) {
+		addr = (unsigned long)iov[seg].iov_base;
+		size = iov[seg].iov_len;
+		end += size;
+		if ((addr & blocksize_mask) || (size & blocksize_mask)) 
+			goto out;
+	}
+	retval = 0;
+out:
+	return retval;
+}
 static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
 			const struct iovec *iov, loff_t offset,
 			unsigned long nr_segs)
 {
-	return -EINVAL;
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file->f_mapping->host;
+	struct btrfs_ordered_extent *ordered;
+	struct extent_state *cached_state = NULL;
+	u64 lockstart, lockend;
+	ssize_t ret;
+	int writing = rw & WRITE;
+	int write_bits = 0;
+	size_t count = iov_length(iov, nr_segs);
+
+	if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov,
+			    offset, nr_segs)) {
+		return 0;
+	}
+
+	lockstart = offset;
+	lockend = offset + count - 1;
+
+	if (writing) {
+		ret = btrfs_delalloc_reserve_space(inode, count);
+		if (ret)
+			goto out;
+	}
+
+	while (1) {
+		lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+				 0, &cached_state, GFP_NOFS);
+		/*
+		 * We're concerned with the entire range that we're going to be
+		 * doing DIO to, so we need to make sure theres no ordered
+		 * extents in this range.
+		 */
+		ordered = btrfs_lookup_ordered_range(inode, lockstart,
+						     lockend - lockstart + 1);
+		if (!ordered)
+			break;
+		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+				     &cached_state, GFP_NOFS);
+		btrfs_start_ordered_extent(inode, ordered, 1);
+		btrfs_put_ordered_extent(ordered);
+		cond_resched();
+	}
+
+	/*
+	 * we don't use btrfs_set_extent_delalloc because we don't want
+	 * the dirty or uptodate bits
+	 */
+	if (writing) {
+		write_bits = EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING;
+		ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+				     EXTENT_DELALLOC, 0, NULL, &cached_state,
+				     GFP_NOFS);
+		if (ret) {
+			clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
+					 lockend, EXTENT_LOCKED | write_bits,
+					 1, 0, &cached_state, GFP_NOFS);
+			goto out;
+		}
+	}
+
+	free_extent_state(cached_state);
+	cached_state = NULL;
+
+	ret = __blockdev_direct_IO(rw, iocb, inode,
+		   BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
+		   iov, offset, nr_segs, btrfs_get_blocks_direct, NULL,
+		   btrfs_submit_direct, 0);
+
+	if (ret < 0 && ret != -EIOCBQUEUED) {
+		clear_extent_bit(&BTRFS_I(inode)->io_tree, offset,
+			      offset + iov_length(iov, nr_segs) - 1,
+			      EXTENT_LOCKED | write_bits, 1, 0,
+			      &cached_state, GFP_NOFS);
+	} else if (ret >= 0 && ret < iov_length(iov, nr_segs)) {
+		/*
+		 * We're falling back to buffered, unlock the section we didn't
+		 * do IO on.
+		 */
+		clear_extent_bit(&BTRFS_I(inode)->io_tree, offset + ret,
+			      offset + iov_length(iov, nr_segs) - 1,
+			      EXTENT_LOCKED | write_bits, 1, 0,
+			      &cached_state, GFP_NOFS);
+	}
+out:
+	free_extent_state(cached_state);
+	return ret;
 }
 
 static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
@@ -5021,6 +5915,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
 {
 	struct extent_io_tree *tree;
 	struct btrfs_ordered_extent *ordered;
+	struct extent_state *cached_state = NULL;
 	u64 page_start = page_offset(page);
 	u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
 
@@ -5039,7 +5934,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
 		btrfs_releasepage(page, GFP_NOFS);
 		return;
 	}
-	lock_extent(tree, page_start, page_end, GFP_NOFS);
+	lock_extent_bits(tree, page_start, page_end, 0, &cached_state,
+			 GFP_NOFS);
 	ordered = btrfs_lookup_ordered_extent(page->mapping->host,
 					   page_offset(page));
 	if (ordered) {
@@ -5050,7 +5946,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
 		clear_extent_bit(tree, page_start, page_end,
 				 EXTENT_DIRTY | EXTENT_DELALLOC |
 				 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0,
-				 NULL, GFP_NOFS);
+				 &cached_state, GFP_NOFS);
 		/*
 		 * whoever cleared the private bit is responsible
 		 * for the finish_ordered_io
@@ -5060,11 +5956,13 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
 						page_start, page_end);
 		}
 		btrfs_put_ordered_extent(ordered);
-		lock_extent(tree, page_start, page_end, GFP_NOFS);
+		cached_state = NULL;
+		lock_extent_bits(tree, page_start, page_end, 0, &cached_state,
+				 GFP_NOFS);
 	}
 	clear_extent_bit(tree, page_start, page_end,
 		 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
-		 EXTENT_DO_ACCOUNTING, 1, 1, NULL, GFP_NOFS);
+		 EXTENT_DO_ACCOUNTING, 1, 1, &cached_state, GFP_NOFS);
 	__btrfs_releasepage(page, GFP_NOFS);
 
 	ClearPageChecked(page);
@@ -5097,6 +5995,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	struct btrfs_ordered_extent *ordered;
+	struct extent_state *cached_state = NULL;
 	char *kaddr;
 	unsigned long zero_start;
 	loff_t size;
@@ -5104,7 +6003,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	u64 page_start;
 	u64 page_end;
 
-	ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE);
+	ret  = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
 	if (ret) {
 		if (ret == -ENOMEM)
 			ret = VM_FAULT_OOM;
@@ -5113,13 +6012,6 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 		goto out;
 	}
 
-	ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
-	if (ret) {
-		btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
-		ret = VM_FAULT_SIGBUS;
-		goto out;
-	}
-
 	ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
 again:
 	lock_page(page);
@@ -5129,13 +6021,13 @@ again:
 
 	if ((page->mapping != inode->i_mapping) ||
 	    (page_start >= size)) {
-		btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
 		/* page got truncated out from underneath us */
 		goto out_unlock;
 	}
 	wait_on_page_writeback(page);
 
-	lock_extent(io_tree, page_start, page_end, GFP_NOFS);
+	lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state,
+			 GFP_NOFS);
 	set_page_extent_mapped(page);
 
 	/*
@@ -5144,7 +6036,8 @@ again:
 	 */
 	ordered = btrfs_lookup_ordered_extent(inode, page_start);
 	if (ordered) {
-		unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+		unlock_extent_cached(io_tree, page_start, page_end,
+				     &cached_state, GFP_NOFS);
 		unlock_page(page);
 		btrfs_start_ordered_extent(inode, ordered, 1);
 		btrfs_put_ordered_extent(ordered);
@@ -5158,15 +6051,16 @@ again:
 	 * is probably a better way to do this, but for now keep consistent with
 	 * prepare_pages in the normal write path.
 	 */
-	clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
+	clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
 			  EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
-			  GFP_NOFS);
+			  0, 0, &cached_state, GFP_NOFS);
 
-	ret = btrfs_set_extent_delalloc(inode, page_start, page_end);
+	ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
+					&cached_state);
 	if (ret) {
-		unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+		unlock_extent_cached(io_tree, page_start, page_end,
+				     &cached_state, GFP_NOFS);
 		ret = VM_FAULT_SIGBUS;
-		btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
 		goto out_unlock;
 	}
 	ret = 0;
@@ -5190,13 +6084,13 @@ again:
 	BTRFS_I(inode)->last_trans = root->fs_info->generation;
 	BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
 
-	unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+	unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);
 
 out_unlock:
-	btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
 	if (!ret)
 		return VM_FAULT_LOCKED;
 	unlock_page(page);
+	btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
 out:
 	return ret;
 }
@@ -5221,8 +6115,10 @@ static void btrfs_truncate(struct inode *inode)
 	btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
 	btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
 
-	trans = btrfs_start_transaction(root, 1);
+	trans = btrfs_start_transaction(root, 0);
+	BUG_ON(IS_ERR(trans));
 	btrfs_set_trans_block_group(trans, inode);
+	trans->block_rsv = root->orphan_block_rsv;
 
 	/*
 	 * setattr is responsible for setting the ordered_data_close flag,
@@ -5245,6 +6141,23 @@ static void btrfs_truncate(struct inode *inode)
 		btrfs_add_ordered_operation(trans, root, inode);
 
 	while (1) {
+		if (!trans) {
+			trans = btrfs_start_transaction(root, 0);
+			BUG_ON(IS_ERR(trans));
+			btrfs_set_trans_block_group(trans, inode);
+			trans->block_rsv = root->orphan_block_rsv;
+		}
+
+		ret = btrfs_block_rsv_check(trans, root,
+					    root->orphan_block_rsv, 0, 5);
+		if (ret) {
+			BUG_ON(ret != -EAGAIN);
+			ret = btrfs_commit_transaction(trans, root);
+			BUG_ON(ret);
+			trans = NULL;
+			continue;
+		}
+
 		ret = btrfs_truncate_inode_items(trans, root, inode,
 						 inode->i_size,
 						 BTRFS_EXTENT_DATA_KEY);
@@ -5256,10 +6169,8 @@ static void btrfs_truncate(struct inode *inode)
 
 		nr = trans->blocks_used;
 		btrfs_end_transaction(trans, root);
+		trans = NULL;
 		btrfs_btree_balance_dirty(root, nr);
-
-		trans = btrfs_start_transaction(root, 1);
-		btrfs_set_trans_block_group(trans, inode);
 	}
 
 	if (ret == 0 && inode->i_nlink > 0) {
@@ -5320,21 +6231,47 @@ unsigned long btrfs_force_ra(struct address_space *mapping,
 struct inode *btrfs_alloc_inode(struct super_block *sb)
 {
 	struct btrfs_inode *ei;
+	struct inode *inode;
 
 	ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
 	if (!ei)
 		return NULL;
+
+	ei->root = NULL;
+	ei->space_info = NULL;
+	ei->generation = 0;
+	ei->sequence = 0;
 	ei->last_trans = 0;
 	ei->last_sub_trans = 0;
 	ei->logged_trans = 0;
-	ei->outstanding_extents = 0;
-	ei->reserved_extents = 0;
-	ei->root = NULL;
+	ei->delalloc_bytes = 0;
+	ei->reserved_bytes = 0;
+	ei->disk_i_size = 0;
+	ei->flags = 0;
+	ei->index_cnt = (u64)-1;
+	ei->last_unlink_trans = 0;
+
 	spin_lock_init(&ei->accounting_lock);
+	atomic_set(&ei->outstanding_extents, 0);
+	ei->reserved_extents = 0;
+
+	ei->ordered_data_close = 0;
+	ei->orphan_meta_reserved = 0;
+	ei->dummy_inode = 0;
+	ei->force_compress = 0;
+
+	inode = &ei->vfs_inode;
+	extent_map_tree_init(&ei->extent_tree, GFP_NOFS);
+	extent_io_tree_init(&ei->io_tree, &inode->i_data, GFP_NOFS);
+	extent_io_tree_init(&ei->io_failure_tree, &inode->i_data, GFP_NOFS);
+	mutex_init(&ei->log_mutex);
 	btrfs_ordered_inode_tree_init(&ei->ordered_tree);
 	INIT_LIST_HEAD(&ei->i_orphan);
+	INIT_LIST_HEAD(&ei->delalloc_inodes);
 	INIT_LIST_HEAD(&ei->ordered_operations);
-	return &ei->vfs_inode;
+	RB_CLEAR_NODE(&ei->rb_node);
+
+	return inode;
 }
 
 void btrfs_destroy_inode(struct inode *inode)
@@ -5344,6 +6281,8 @@ void btrfs_destroy_inode(struct inode *inode)
 
 	WARN_ON(!list_empty(&inode->i_dentry));
 	WARN_ON(inode->i_data.nrpages);
+	WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents));
+	WARN_ON(BTRFS_I(inode)->reserved_extents);
 
 	/*
 	 * This can happen where we create an inode, but somebody else also
@@ -5364,13 +6303,13 @@ void btrfs_destroy_inode(struct inode *inode)
 		spin_unlock(&root->fs_info->ordered_extent_lock);
 	}
 
-	spin_lock(&root->list_lock);
+	spin_lock(&root->orphan_lock);
 	if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
 		printk(KERN_INFO "BTRFS: inode %lu still on the orphan list\n",
 		       inode->i_ino);
 		list_del_init(&BTRFS_I(inode)->i_orphan);
 	}
-	spin_unlock(&root->list_lock);
+	spin_unlock(&root->orphan_lock);
 
 	while (1) {
 		ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
@@ -5395,7 +6334,6 @@ free:
 void btrfs_drop_inode(struct inode *inode)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-
 	if (inode->i_nlink > 0 && btrfs_root_refs(&root->root_item) == 0)
 		generic_delete_inode(inode);
 	else
@@ -5492,19 +6430,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	if (S_ISDIR(old_inode->i_mode) && new_inode &&
 	    new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
 		return -ENOTEMPTY;
-
-	/*
-	 * We want to reserve the absolute worst case amount of items.  So if
-	 * both inodes are subvols and we need to unlink them then that would
-	 * require 4 item modifications, but if they are both normal inodes it
-	 * would require 5 item modifications, so we'll assume their normal
-	 * inodes.  So 5 * 2 is 10, plus 1 for the new link, so 11 total items
-	 * should cover the worst case number of items we'll modify.
-	 */
-	ret = btrfs_reserve_metadata_space(root, 11);
-	if (ret)
-		return ret;
-
 	/*
 	 * we're using rename to replace one file with another.
 	 * and the replacement file is large.  Start IO on it now so
@@ -5517,8 +6442,18 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	/* close the racy window with snapshot create/destroy ioctl */
 	if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
 		down_read(&root->fs_info->subvol_sem);
+	/*
+	 * We want to reserve the absolute worst case amount of items.  So if
+	 * both inodes are subvols and we need to unlink them then that would
+	 * require 4 item modifications, but if they are both normal inodes it
+	 * would require 5 item modifications, so we'll assume their normal
+	 * inodes.  So 5 * 2 is 10, plus 1 for the new link, so 11 total items
+	 * should cover the worst case number of items we'll modify.
+	 */
+	trans = btrfs_start_transaction(root, 20);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
 
-	trans = btrfs_start_transaction(root, 1);
 	btrfs_set_trans_block_group(trans, new_dir);
 
 	if (dest != root)
@@ -5617,7 +6552,6 @@ out_fail:
 	if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
 		up_read(&root->fs_info->subvol_sem);
 
-	btrfs_unreserve_metadata_space(root, 11);
 	return ret;
 }
 
@@ -5669,6 +6603,38 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
 	return 0;
 }
 
+int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput)
+{
+	struct btrfs_inode *binode;
+	struct inode *inode = NULL;
+
+	spin_lock(&root->fs_info->delalloc_lock);
+	while (!list_empty(&root->fs_info->delalloc_inodes)) {
+		binode = list_entry(root->fs_info->delalloc_inodes.next,
+				    struct btrfs_inode, delalloc_inodes);
+		inode = igrab(&binode->vfs_inode);
+		if (inode) {
+			list_move_tail(&binode->delalloc_inodes,
+				       &root->fs_info->delalloc_inodes);
+			break;
+		}
+
+		list_del_init(&binode->delalloc_inodes);
+		cond_resched_lock(&root->fs_info->delalloc_lock);
+	}
+	spin_unlock(&root->fs_info->delalloc_lock);
+
+	if (inode) {
+		write_inode_now(inode, 0);
+		if (delay_iput)
+			btrfs_add_delayed_iput(inode);
+		else
+			iput(inode);
+		return 1;
+	}
+	return 0;
+}
+
 static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 			 const char *symname)
 {
@@ -5692,26 +6658,20 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 	if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
 		return -ENAMETOOLONG;
 
+	err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
+	if (err)
+		return err;
 	/*
 	 * 2 items for inode item and ref
 	 * 2 items for dir items
 	 * 1 item for xattr if selinux is on
 	 */
-	err = btrfs_reserve_metadata_space(root, 5);
-	if (err)
-		return err;
+	trans = btrfs_start_transaction(root, 5);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
 
-	trans = btrfs_start_transaction(root, 1);
-	if (!trans)
-		goto out_fail;
 	btrfs_set_trans_block_group(trans, dir);
 
-	err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
-	if (err) {
-		err = -ENOSPC;
-		goto out_unlock;
-	}
-
 	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
 				dentry->d_name.len,
 				dentry->d_parent->d_inode->i_ino, objectid,
@@ -5783,8 +6743,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 out_unlock:
 	nr = trans->blocks_used;
 	btrfs_end_transaction_throttle(trans, root);
-out_fail:
-	btrfs_unreserve_metadata_space(root, 5);
 	if (drop_inode) {
 		inode_dec_link_count(inode);
 		iput(inode);
@@ -5793,36 +6751,28 @@ out_fail:
 	return err;
 }
 
-static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
-			u64 alloc_hint, int mode, loff_t actual_len)
+int btrfs_prealloc_file_range(struct inode *inode, int mode,
+			      u64 start, u64 num_bytes, u64 min_size,
+			      loff_t actual_len, u64 *alloc_hint)
 {
 	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_key ins;
-	u64 alloc_size;
 	u64 cur_offset = start;
-	u64 num_bytes = end - start;
 	int ret = 0;
-	u64 i_size;
 
 	while (num_bytes > 0) {
-		alloc_size = min(num_bytes, root->fs_info->max_extent);
-
-		trans = btrfs_start_transaction(root, 1);
-
-		ret = btrfs_reserve_extent(trans, root, alloc_size,
-					   root->sectorsize, 0, alloc_hint,
-					   (u64)-1, &ins, 1);
-		if (ret) {
-			WARN_ON(1);
-			goto stop_trans;
+		trans = btrfs_start_transaction(root, 3);
+		if (IS_ERR(trans)) {
+			ret = PTR_ERR(trans);
+			break;
 		}
 
-		ret = btrfs_reserve_metadata_space(root, 3);
+		ret = btrfs_reserve_extent(trans, root, num_bytes, min_size,
+					   0, *alloc_hint, (u64)-1, &ins, 1);
 		if (ret) {
-			btrfs_free_reserved_extent(root, ins.objectid,
-						   ins.offset);
-			goto stop_trans;
+			btrfs_end_transaction(trans, root);
+			break;
 		}
 
 		ret = insert_reserved_file_extent(trans, inode,
@@ -5836,37 +6786,33 @@ static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
 
 		num_bytes -= ins.offset;
 		cur_offset += ins.offset;
-		alloc_hint = ins.objectid + ins.offset;
+		*alloc_hint = ins.objectid + ins.offset;
 
 		inode->i_ctime = CURRENT_TIME;
 		BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
 		if (!(mode & FALLOC_FL_KEEP_SIZE) &&
-		    cur_offset > inode->i_size) {
+		    (actual_len > inode->i_size) &&
+		    (cur_offset > inode->i_size)) {
 			if (cur_offset > actual_len)
-				i_size  = actual_len;
+				i_size_write(inode, actual_len);
 			else
-				i_size = cur_offset;
-			i_size_write(inode, i_size);
-			btrfs_ordered_update_i_size(inode, i_size, NULL);
+				i_size_write(inode, cur_offset);
+			i_size_write(inode, cur_offset);
+			btrfs_ordered_update_i_size(inode, cur_offset, NULL);
 		}
 
 		ret = btrfs_update_inode(trans, root, inode);
 		BUG_ON(ret);
 
 		btrfs_end_transaction(trans, root);
-		btrfs_unreserve_metadata_space(root, 3);
 	}
 	return ret;
-
-stop_trans:
-	btrfs_end_transaction(trans, root);
-	return ret;
-
 }
 
 static long btrfs_fallocate(struct inode *inode, int mode,
 			    loff_t offset, loff_t len)
 {
+	struct extent_state *cached_state = NULL;
 	u64 cur_offset;
 	u64 last_byte;
 	u64 alloc_start;
@@ -5893,8 +6839,7 @@ static long btrfs_fallocate(struct inode *inode, int mode,
 			goto out;
 	}
 
-	ret = btrfs_check_data_free_space(BTRFS_I(inode)->root, inode,
-					  alloc_end - alloc_start);
+	ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
 	if (ret)
 		goto out;
 
@@ -5905,16 +6850,17 @@ static long btrfs_fallocate(struct inode *inode, int mode,
 		/* the extent lock is ordered inside the running
 		 * transaction
 		 */
-		lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
-			    GFP_NOFS);
+		lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
+				 locked_end, 0, &cached_state, GFP_NOFS);
 		ordered = btrfs_lookup_first_ordered_extent(inode,
 							    alloc_end - 1);
 		if (ordered &&
 		    ordered->file_offset + ordered->len > alloc_start &&
 		    ordered->file_offset < alloc_end) {
 			btrfs_put_ordered_extent(ordered);
-			unlock_extent(&BTRFS_I(inode)->io_tree,
-				      alloc_start, locked_end, GFP_NOFS);
+			unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+					     alloc_start, locked_end,
+					     &cached_state, GFP_NOFS);
 			/*
 			 * we can't wait on the range with the transaction
 			 * running or with the extent lock held
@@ -5938,16 +6884,16 @@ static long btrfs_fallocate(struct inode *inode, int mode,
 		if (em->block_start == EXTENT_MAP_HOLE ||
 		    (cur_offset >= inode->i_size &&
 		     !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
-			ret = prealloc_file_range(inode,
-						  cur_offset, last_byte,
-						alloc_hint, mode, offset+len);
+			ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
+							last_byte - cur_offset,
+							1 << inode->i_blkbits,
+							offset + len,
+							&alloc_hint);
 			if (ret < 0) {
 				free_extent_map(em);
 				break;
 			}
 		}
-		if (em->block_start <= EXTENT_MAP_LAST_BYTE)
-			alloc_hint = em->block_start;
 		free_extent_map(em);
 
 		cur_offset = last_byte;
@@ -5956,11 +6902,10 @@ static long btrfs_fallocate(struct inode *inode, int mode,
 			break;
 		}
 	}
-	unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
-		      GFP_NOFS);
+	unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
+			     &cached_state, GFP_NOFS);
 
-	btrfs_free_reserved_data_space(BTRFS_I(inode)->root, inode,
-				       alloc_end - alloc_start);
+	btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
 out:
 	mutex_unlock(&inode->i_mutex);
 	return ret;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 645a17927a8f..9254b3d58dbe 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -39,6 +39,7 @@
 #include <linux/security.h>
 #include <linux/xattr.h>
 #include <linux/vmalloc.h>
+#include <linux/slab.h>
 #include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
@@ -238,23 +239,19 @@ static noinline int create_subvol(struct btrfs_root *root,
 	u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
 	u64 index = 0;
 
+	ret = btrfs_find_free_objectid(NULL, root->fs_info->tree_root,
+				       0, &objectid);
+	if (ret)
+		return ret;
 	/*
 	 * 1 - inode item
 	 * 2 - refs
 	 * 1 - root item
 	 * 2 - dir items
 	 */
-	ret = btrfs_reserve_metadata_space(root, 6);
-	if (ret)
-		return ret;
-
-	trans = btrfs_start_transaction(root, 1);
-	BUG_ON(!trans);
-
-	ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root,
-				       0, &objectid);
-	if (ret)
-		goto fail;
+	trans = btrfs_start_transaction(root, 6);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
 
 	leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
 				      0, objectid, NULL, 0, 0, 0);
@@ -344,13 +341,10 @@ fail:
 	err = btrfs_commit_transaction(trans, root);
 	if (err && !ret)
 		ret = err;
-
-	btrfs_unreserve_metadata_space(root, 6);
 	return ret;
 }
 
-static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
-			   char *name, int namelen)
+static int create_snapshot(struct btrfs_root *root, struct dentry *dentry)
 {
 	struct inode *inode;
 	struct btrfs_pending_snapshot *pending_snapshot;
@@ -360,40 +354,33 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
 	if (!root->ref_cows)
 		return -EINVAL;
 
-	/*
-	 * 1 - inode item
-	 * 2 - refs
-	 * 1 - root item
-	 * 2 - dir items
-	 */
-	ret = btrfs_reserve_metadata_space(root, 6);
-	if (ret)
-		goto fail;
-
 	pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
-	if (!pending_snapshot) {
-		ret = -ENOMEM;
-		btrfs_unreserve_metadata_space(root, 6);
-		goto fail;
-	}
-	pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS);
-	if (!pending_snapshot->name) {
-		ret = -ENOMEM;
-		kfree(pending_snapshot);
-		btrfs_unreserve_metadata_space(root, 6);
-		goto fail;
-	}
-	memcpy(pending_snapshot->name, name, namelen);
-	pending_snapshot->name[namelen] = '\0';
+	if (!pending_snapshot)
+		return -ENOMEM;
+
+	btrfs_init_block_rsv(&pending_snapshot->block_rsv);
 	pending_snapshot->dentry = dentry;
-	trans = btrfs_start_transaction(root, 1);
-	BUG_ON(!trans);
 	pending_snapshot->root = root;
+
+	trans = btrfs_start_transaction(root->fs_info->extent_root, 5);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		goto fail;
+	}
+
+	ret = btrfs_snap_reserve_metadata(trans, pending_snapshot);
+	BUG_ON(ret);
+
 	list_add(&pending_snapshot->list,
 		 &trans->transaction->pending_snapshots);
-	ret = btrfs_commit_transaction(trans, root);
+	ret = btrfs_commit_transaction(trans, root->fs_info->extent_root);
 	BUG_ON(ret);
-	btrfs_unreserve_metadata_space(root, 6);
+
+	ret = pending_snapshot->error;
+	if (ret)
+		goto fail;
+
+	btrfs_orphan_cleanup(pending_snapshot->snap);
 
 	inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry);
 	if (IS_ERR(inode)) {
@@ -404,6 +391,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
 	d_instantiate(dentry, inode);
 	ret = 0;
 fail:
+	kfree(pending_snapshot);
 	return ret;
 }
 
@@ -455,8 +443,7 @@ static noinline int btrfs_mksubvol(struct path *parent,
 		goto out_up_read;
 
 	if (snap_src) {
-		error = create_snapshot(snap_src, dentry,
-					name, namelen);
+		error = create_snapshot(snap_src, dentry);
 	} else {
 		error = create_subvol(BTRFS_I(dir)->root, dentry,
 				      name, namelen);
@@ -474,7 +461,79 @@ out_unlock:
 	return error;
 }
 
-static int btrfs_defrag_file(struct file *file)
+static int should_defrag_range(struct inode *inode, u64 start, u64 len,
+			       int thresh, u64 *last_len, u64 *skip,
+			       u64 *defrag_end)
+{
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	struct extent_map *em = NULL;
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	int ret = 1;
+
+
+	if (thresh == 0)
+		thresh = 256 * 1024;
+
+	/*
+	 * make sure that once we start defragging and extent, we keep on
+	 * defragging it
+	 */
+	if (start < *defrag_end)
+		return 1;
+
+	*skip = 0;
+
+	/*
+	 * hopefully we have this extent in the tree already, try without
+	 * the full extent lock
+	 */
+	read_lock(&em_tree->lock);
+	em = lookup_extent_mapping(em_tree, start, len);
+	read_unlock(&em_tree->lock);
+
+	if (!em) {
+		/* get the big lock and read metadata off disk */
+		lock_extent(io_tree, start, start + len - 1, GFP_NOFS);
+		em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
+		unlock_extent(io_tree, start, start + len - 1, GFP_NOFS);
+
+		if (IS_ERR(em))
+			return 0;
+	}
+
+	/* this will cover holes, and inline extents */
+	if (em->block_start >= EXTENT_MAP_LAST_BYTE)
+		ret = 0;
+
+	/*
+	 * we hit a real extent, if it is big don't bother defragging it again
+	 */
+	if ((*last_len == 0 || *last_len >= thresh) && em->len >= thresh)
+		ret = 0;
+
+	/*
+	 * last_len ends up being a counter of how many bytes we've defragged.
+	 * every time we choose not to defrag an extent, we reset *last_len
+	 * so that the next tiny extent will force a defrag.
+	 *
+	 * The end result of this is that tiny extents before a single big
+	 * extent will force at least part of that big extent to be defragged.
+	 */
+	if (ret) {
+		*last_len += len;
+		*defrag_end = extent_map_end(em);
+	} else {
+		*last_len = 0;
+		*skip = extent_map_end(em);
+		*defrag_end = 0;
+	}
+
+	free_extent_map(em);
+	return ret;
+}
+
+static int btrfs_defrag_file(struct file *file,
+			     struct btrfs_ioctl_defrag_range_args *range)
 {
 	struct inode *inode = fdentry(file)->d_inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -486,37 +545,88 @@ static int btrfs_defrag_file(struct file *file)
 	unsigned long total_read = 0;
 	u64 page_start;
 	u64 page_end;
+	u64 last_len = 0;
+	u64 skip = 0;
+	u64 defrag_end = 0;
 	unsigned long i;
 	int ret;
 
-	ret = btrfs_check_data_free_space(root, inode, inode->i_size);
-	if (ret)
-		return -ENOSPC;
+	if (inode->i_size == 0)
+		return 0;
+
+	if (range->start + range->len > range->start) {
+		last_index = min_t(u64, inode->i_size - 1,
+			 range->start + range->len - 1) >> PAGE_CACHE_SHIFT;
+	} else {
+		last_index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT;
+	}
+
+	i = range->start >> PAGE_CACHE_SHIFT;
+	while (i <= last_index) {
+		if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT,
+					PAGE_CACHE_SIZE,
+					range->extent_thresh,
+					&last_len, &skip,
+					&defrag_end)) {
+			unsigned long next;
+			/*
+			 * the should_defrag function tells us how much to skip
+			 * bump our counter by the suggested amount
+			 */
+			next = (skip + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+			i = max(i + 1, next);
+			continue;
+		}
 
-	mutex_lock(&inode->i_mutex);
-	last_index = inode->i_size >> PAGE_CACHE_SHIFT;
-	for (i = 0; i <= last_index; i++) {
 		if (total_read % ra_pages == 0) {
 			btrfs_force_ra(inode->i_mapping, &file->f_ra, file, i,
 				       min(last_index, i + ra_pages - 1));
 		}
 		total_read++;
+		mutex_lock(&inode->i_mutex);
+		if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
+			BTRFS_I(inode)->force_compress = 1;
+
+		ret  = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
+		if (ret)
+			goto err_unlock;
 again:
+		if (inode->i_size == 0 ||
+		    i > ((inode->i_size - 1) >> PAGE_CACHE_SHIFT)) {
+			ret = 0;
+			goto err_reservations;
+		}
+
 		page = grab_cache_page(inode->i_mapping, i);
-		if (!page)
-			goto out_unlock;
+		if (!page) {
+			ret = -ENOMEM;
+			goto err_reservations;
+		}
+
 		if (!PageUptodate(page)) {
 			btrfs_readpage(NULL, page);
 			lock_page(page);
 			if (!PageUptodate(page)) {
 				unlock_page(page);
 				page_cache_release(page);
-				goto out_unlock;
+				ret = -EIO;
+				goto err_reservations;
 			}
 		}
 
+		if (page->mapping != inode->i_mapping) {
+			unlock_page(page);
+			page_cache_release(page);
+			goto again;
+		}
+
 		wait_on_page_writeback(page);
 
+		if (PageDirty(page)) {
+			btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
+			goto loop_unlock;
+		}
+
 		page_start = (u64)page->index << PAGE_CACHE_SHIFT;
 		page_end = page_start + PAGE_CACHE_SIZE - 1;
 		lock_extent(io_tree, page_start, page_end, GFP_NOFS);
@@ -537,18 +647,53 @@ again:
 		 * page if it is dirtied again later
 		 */
 		clear_page_dirty_for_io(page);
+		clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start,
+				  page_end, EXTENT_DIRTY | EXTENT_DELALLOC |
+				  EXTENT_DO_ACCOUNTING, GFP_NOFS);
 
-		btrfs_set_extent_delalloc(inode, page_start, page_end);
+		btrfs_set_extent_delalloc(inode, page_start, page_end, NULL);
+		ClearPageChecked(page);
 		set_page_dirty(page);
 		unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+
+loop_unlock:
 		unlock_page(page);
 		page_cache_release(page);
+		mutex_unlock(&inode->i_mutex);
+
 		balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1);
+		i++;
+	}
+
+	if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO))
+		filemap_flush(inode->i_mapping);
+
+	if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
+		/* the filemap_flush will queue IO into the worker threads, but
+		 * we have to make sure the IO is actually started and that
+		 * ordered extents get created before we return
+		 */
+		atomic_inc(&root->fs_info->async_submit_draining);
+		while (atomic_read(&root->fs_info->nr_async_submits) ||
+		      atomic_read(&root->fs_info->async_delalloc_pages)) {
+			wait_event(root->fs_info->async_submit_wait,
+			   (atomic_read(&root->fs_info->nr_async_submits) == 0 &&
+			    atomic_read(&root->fs_info->async_delalloc_pages) == 0));
+		}
+		atomic_dec(&root->fs_info->async_submit_draining);
+
+		mutex_lock(&inode->i_mutex);
+		BTRFS_I(inode)->force_compress = 0;
+		mutex_unlock(&inode->i_mutex);
 	}
 
-out_unlock:
-	mutex_unlock(&inode->i_mutex);
 	return 0;
+
+err_reservations:
+	btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
+err_unlock:
+	mutex_unlock(&inode->i_mutex);
+	return ret;
 }
 
 static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
@@ -608,7 +753,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
 			mod = 1;
 			sizestr++;
 		}
-		new_size = btrfs_parse_size(sizestr);
+		new_size = memparse(sizestr, NULL);
 		if (new_size == 0) {
 			ret = -EINVAL;
 			goto out_unlock;
@@ -643,7 +788,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
 		device->name, (unsigned long long)new_size);
 
 	if (new_size > old_size) {
-		trans = btrfs_start_transaction(root, 1);
+		trans = btrfs_start_transaction(root, 0);
 		ret = btrfs_grow_device(trans, device, new_size);
 		btrfs_commit_transaction(trans, root);
 	} else {
@@ -743,6 +888,330 @@ out:
 	return ret;
 }
 
+static noinline int key_in_sk(struct btrfs_key *key,
+			      struct btrfs_ioctl_search_key *sk)
+{
+	struct btrfs_key test;
+	int ret;
+
+	test.objectid = sk->min_objectid;
+	test.type = sk->min_type;
+	test.offset = sk->min_offset;
+
+	ret = btrfs_comp_cpu_keys(key, &test);
+	if (ret < 0)
+		return 0;
+
+	test.objectid = sk->max_objectid;
+	test.type = sk->max_type;
+	test.offset = sk->max_offset;
+
+	ret = btrfs_comp_cpu_keys(key, &test);
+	if (ret > 0)
+		return 0;
+	return 1;
+}
+
+static noinline int copy_to_sk(struct btrfs_root *root,
+			       struct btrfs_path *path,
+			       struct btrfs_key *key,
+			       struct btrfs_ioctl_search_key *sk,
+			       char *buf,
+			       unsigned long *sk_offset,
+			       int *num_found)
+{
+	u64 found_transid;
+	struct extent_buffer *leaf;
+	struct btrfs_ioctl_search_header sh;
+	unsigned long item_off;
+	unsigned long item_len;
+	int nritems;
+	int i;
+	int slot;
+	int found = 0;
+	int ret = 0;
+
+	leaf = path->nodes[0];
+	slot = path->slots[0];
+	nritems = btrfs_header_nritems(leaf);
+
+	if (btrfs_header_generation(leaf) > sk->max_transid) {
+		i = nritems;
+		goto advance_key;
+	}
+	found_transid = btrfs_header_generation(leaf);
+
+	for (i = slot; i < nritems; i++) {
+		item_off = btrfs_item_ptr_offset(leaf, i);
+		item_len = btrfs_item_size_nr(leaf, i);
+
+		if (item_len > BTRFS_SEARCH_ARGS_BUFSIZE)
+			item_len = 0;
+
+		if (sizeof(sh) + item_len + *sk_offset >
+		    BTRFS_SEARCH_ARGS_BUFSIZE) {
+			ret = 1;
+			goto overflow;
+		}
+
+		btrfs_item_key_to_cpu(leaf, key, i);
+		if (!key_in_sk(key, sk))
+			continue;
+
+		sh.objectid = key->objectid;
+		sh.offset = key->offset;
+		sh.type = key->type;
+		sh.len = item_len;
+		sh.transid = found_transid;
+
+		/* copy search result header */
+		memcpy(buf + *sk_offset, &sh, sizeof(sh));
+		*sk_offset += sizeof(sh);
+
+		if (item_len) {
+			char *p = buf + *sk_offset;
+			/* copy the item */
+			read_extent_buffer(leaf, p,
+					   item_off, item_len);
+			*sk_offset += item_len;
+		}
+		found++;
+
+		if (*num_found >= sk->nr_items)
+			break;
+	}
+advance_key:
+	ret = 0;
+	if (key->offset < (u64)-1 && key->offset < sk->max_offset)
+		key->offset++;
+	else if (key->type < (u8)-1 && key->type < sk->max_type) {
+		key->offset = 0;
+		key->type++;
+	} else if (key->objectid < (u64)-1 && key->objectid < sk->max_objectid) {
+		key->offset = 0;
+		key->type = 0;
+		key->objectid++;
+	} else
+		ret = 1;
+overflow:
+	*num_found += found;
+	return ret;
+}
+
+static noinline int search_ioctl(struct inode *inode,
+				 struct btrfs_ioctl_search_args *args)
+{
+	struct btrfs_root *root;
+	struct btrfs_key key;
+	struct btrfs_key max_key;
+	struct btrfs_path *path;
+	struct btrfs_ioctl_search_key *sk = &args->key;
+	struct btrfs_fs_info *info = BTRFS_I(inode)->root->fs_info;
+	int ret;
+	int num_found = 0;
+	unsigned long sk_offset = 0;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	if (sk->tree_id == 0) {
+		/* search the root of the inode that was passed */
+		root = BTRFS_I(inode)->root;
+	} else {
+		key.objectid = sk->tree_id;
+		key.type = BTRFS_ROOT_ITEM_KEY;
+		key.offset = (u64)-1;
+		root = btrfs_read_fs_root_no_name(info, &key);
+		if (IS_ERR(root)) {
+			printk(KERN_ERR "could not find root %llu\n",
+			       sk->tree_id);
+			btrfs_free_path(path);
+			return -ENOENT;
+		}
+	}
+
+	key.objectid = sk->min_objectid;
+	key.type = sk->min_type;
+	key.offset = sk->min_offset;
+
+	max_key.objectid = sk->max_objectid;
+	max_key.type = sk->max_type;
+	max_key.offset = sk->max_offset;
+
+	path->keep_locks = 1;
+
+	while(1) {
+		ret = btrfs_search_forward(root, &key, &max_key, path, 0,
+					   sk->min_transid);
+		if (ret != 0) {
+			if (ret > 0)
+				ret = 0;
+			goto err;
+		}
+		ret = copy_to_sk(root, path, &key, sk, args->buf,
+				 &sk_offset, &num_found);
+		btrfs_release_path(root, path);
+		if (ret || num_found >= sk->nr_items)
+			break;
+
+	}
+	ret = 0;
+err:
+	sk->nr_items = num_found;
+	btrfs_free_path(path);
+	return ret;
+}
+
+static noinline int btrfs_ioctl_tree_search(struct file *file,
+					   void __user *argp)
+{
+	 struct btrfs_ioctl_search_args *args;
+	 struct inode *inode;
+	 int ret;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	args = kmalloc(sizeof(*args), GFP_KERNEL);
+	if (!args)
+		return -ENOMEM;
+
+	if (copy_from_user(args, argp, sizeof(*args))) {
+		kfree(args);
+		return -EFAULT;
+	}
+	inode = fdentry(file)->d_inode;
+	ret = search_ioctl(inode, args);
+	if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
+		ret = -EFAULT;
+	kfree(args);
+	return ret;
+}
+
+/*
+ * Search INODE_REFs to identify path name of 'dirid' directory
+ * in a 'tree_id' tree. and sets path name to 'name'.
+ */
+static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
+				u64 tree_id, u64 dirid, char *name)
+{
+	struct btrfs_root *root;
+	struct btrfs_key key;
+	char *ptr;
+	int ret = -1;
+	int slot;
+	int len;
+	int total_len = 0;
+	struct btrfs_inode_ref *iref;
+	struct extent_buffer *l;
+	struct btrfs_path *path;
+
+	if (dirid == BTRFS_FIRST_FREE_OBJECTID) {
+		name[0]='\0';
+		return 0;
+	}
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	ptr = &name[BTRFS_INO_LOOKUP_PATH_MAX];
+
+	key.objectid = tree_id;
+	key.type = BTRFS_ROOT_ITEM_KEY;
+	key.offset = (u64)-1;
+	root = btrfs_read_fs_root_no_name(info, &key);
+	if (IS_ERR(root)) {
+		printk(KERN_ERR "could not find root %llu\n", tree_id);
+		ret = -ENOENT;
+		goto out;
+	}
+
+	key.objectid = dirid;
+	key.type = BTRFS_INODE_REF_KEY;
+	key.offset = (u64)-1;
+
+	while(1) {
+		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+		if (ret < 0)
+			goto out;
+
+		l = path->nodes[0];
+		slot = path->slots[0];
+		if (ret > 0 && slot > 0)
+			slot--;
+		btrfs_item_key_to_cpu(l, &key, slot);
+
+		if (ret > 0 && (key.objectid != dirid ||
+				key.type != BTRFS_INODE_REF_KEY)) {
+			ret = -ENOENT;
+			goto out;
+		}
+
+		iref = btrfs_item_ptr(l, slot, struct btrfs_inode_ref);
+		len = btrfs_inode_ref_name_len(l, iref);
+		ptr -= len + 1;
+		total_len += len + 1;
+		if (ptr < name)
+			goto out;
+
+		*(ptr + len) = '/';
+		read_extent_buffer(l, ptr,(unsigned long)(iref + 1), len);
+
+		if (key.offset == BTRFS_FIRST_FREE_OBJECTID)
+			break;
+
+		btrfs_release_path(root, path);
+		key.objectid = key.offset;
+		key.offset = (u64)-1;
+		dirid = key.objectid;
+
+	}
+	if (ptr < name)
+		goto out;
+	memcpy(name, ptr, total_len);
+	name[total_len]='\0';
+	ret = 0;
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+static noinline int btrfs_ioctl_ino_lookup(struct file *file,
+					   void __user *argp)
+{
+	 struct btrfs_ioctl_ino_lookup_args *args;
+	 struct inode *inode;
+	 int ret;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	args = kmalloc(sizeof(*args), GFP_KERNEL);
+	if (!args)
+		return -ENOMEM;
+
+	if (copy_from_user(args, argp, sizeof(*args))) {
+		kfree(args);
+		return -EFAULT;
+	}
+	inode = fdentry(file)->d_inode;
+
+	if (args->treeid == 0)
+		args->treeid = BTRFS_I(inode)->root->root_key.objectid;
+
+	ret = btrfs_search_path_in_tree(BTRFS_I(inode)->root->fs_info,
+					args->treeid, args->objectid,
+					args->name);
+
+	if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
+		ret = -EFAULT;
+
+	kfree(args);
+	return ret;
+}
+
 static noinline int btrfs_ioctl_snap_destroy(struct file *file,
 					     void __user *arg)
 {
@@ -808,7 +1277,13 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
 	if (err)
 		goto out_up_write;
 
-	trans = btrfs_start_transaction(root, 1);
+	trans = btrfs_start_transaction(root, 0);
+	if (IS_ERR(trans)) {
+		err = PTR_ERR(trans);
+		goto out_up_write;
+	}
+	trans->block_rsv = &root->fs_info->global_block_rsv;
+
 	ret = btrfs_unlink_subvol(trans, root, dir,
 				dest->root_key.objectid,
 				dentry->d_name.name,
@@ -822,10 +1297,12 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
 	dest->root_item.drop_level = 0;
 	btrfs_set_root_refs(&dest->root_item, 0);
 
-	ret = btrfs_insert_orphan_item(trans,
-				root->fs_info->tree_root,
-				dest->root_key.objectid);
-	BUG_ON(ret);
+	if (!xchg(&dest->orphan_item_inserted, 1)) {
+		ret = btrfs_insert_orphan_item(trans,
+					root->fs_info->tree_root,
+					dest->root_key.objectid);
+		BUG_ON(ret);
+	}
 
 	ret = btrfs_commit_transaction(trans, root);
 	BUG_ON(ret);
@@ -849,10 +1326,11 @@ out:
 	return err;
 }
 
-static int btrfs_ioctl_defrag(struct file *file)
+static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
 {
 	struct inode *inode = fdentry(file)->d_inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_ioctl_defrag_range_args *range;
 	int ret;
 
 	ret = mnt_want_write(file->f_path.mnt);
@@ -865,16 +1343,44 @@ static int btrfs_ioctl_defrag(struct file *file)
 			ret = -EPERM;
 			goto out;
 		}
-		btrfs_defrag_root(root, 0);
-		btrfs_defrag_root(root->fs_info->extent_root, 0);
+		ret = btrfs_defrag_root(root, 0);
+		if (ret)
+			goto out;
+		ret = btrfs_defrag_root(root->fs_info->extent_root, 0);
 		break;
 	case S_IFREG:
 		if (!(file->f_mode & FMODE_WRITE)) {
 			ret = -EINVAL;
 			goto out;
 		}
-		btrfs_defrag_file(file);
+
+		range = kzalloc(sizeof(*range), GFP_KERNEL);
+		if (!range) {
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		if (argp) {
+			if (copy_from_user(range, argp,
+					   sizeof(*range))) {
+				ret = -EFAULT;
+				kfree(range);
+				goto out;
+			}
+			/* compression requires us to start the IO */
+			if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
+				range->flags |= BTRFS_DEFRAG_RANGE_START_IO;
+				range->extent_thresh = (u32)-1;
+			}
+		} else {
+			/* the rest are all set to zero by kzalloc */
+			range->len = (u64)-1;
+		}
+		ret = btrfs_defrag_file(file, range);
+		kfree(range);
 		break;
+	default:
+		ret = -EINVAL;
 	}
 out:
 	mnt_drop_write(file->f_path.mnt);
@@ -952,7 +1458,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 	 */
 
 	/* the destination must be opened for writing */
-	if (!(file->f_mode & FMODE_WRITE))
+	if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND))
 		return -EINVAL;
 
 	ret = mnt_want_write(file->f_path.mnt);
@@ -964,12 +1470,17 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 		ret = -EBADF;
 		goto out_drop_write;
 	}
+
 	src = src_file->f_dentry->d_inode;
 
 	ret = -EINVAL;
 	if (src == inode)
 		goto out_fput;
 
+	/* the src must be open for reading */
+	if (!(src_file->f_mode & FMODE_READ))
+		goto out_fput;
+
 	ret = -EISDIR;
 	if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode))
 		goto out_fput;
@@ -1000,7 +1511,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 
 	/* determine range to clone */
 	ret = -EINVAL;
-	if (off >= src->i_size || off + len > src->i_size)
+	if (off + len > src->i_size || off + len < off)
 		goto out_unlock;
 	if (len == 0)
 		olen = len = src->i_size - off;
@@ -1028,12 +1539,6 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 		btrfs_wait_ordered_range(src, off, off+len);
 	}
 
-	trans = btrfs_start_transaction(root, 1);
-	BUG_ON(!trans);
-
-	/* punch hole in destination first */
-	btrfs_drop_extents(trans, inode, off, off + len, &hint_byte, 1);
-
 	/* clone data */
 	key.objectid = src->i_ino;
 	key.type = BTRFS_EXTENT_DATA_KEY;
@@ -1044,7 +1549,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 		 * note the key will change type as we walk through the
 		 * tree.
 		 */
-		ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
+		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 		if (ret < 0)
 			goto out;
 
@@ -1073,6 +1578,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 			u64 disko = 0, diskl = 0;
 			u64 datao = 0, datal = 0;
 			u8 comp;
+			u64 endoff;
 
 			size = btrfs_item_size_nr(leaf, slot);
 			read_extent_buffer(leaf, buf,
@@ -1107,12 +1613,31 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 			new_key.objectid = inode->i_ino;
 			new_key.offset = key.offset + destoff - off;
 
+			trans = btrfs_start_transaction(root, 1);
+			if (IS_ERR(trans)) {
+				ret = PTR_ERR(trans);
+				goto out;
+			}
+
 			if (type == BTRFS_FILE_EXTENT_REG ||
 			    type == BTRFS_FILE_EXTENT_PREALLOC) {
+				if (off > key.offset) {
+					datao += off - key.offset;
+					datal -= off - key.offset;
+				}
+
+				if (key.offset + datal > off + len)
+					datal = off + len - key.offset;
+
+				ret = btrfs_drop_extents(trans, inode,
+							 new_key.offset,
+							 new_key.offset + datal,
+							 &hint_byte, 1);
+				BUG_ON(ret);
+
 				ret = btrfs_insert_empty_item(trans, root, path,
 							      &new_key, size);
-				if (ret)
-					goto out;
+				BUG_ON(ret);
 
 				leaf = path->nodes[0];
 				slot = path->slots[0];
@@ -1123,14 +1648,6 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 				extent = btrfs_item_ptr(leaf, slot,
 						struct btrfs_file_extent_item);
 
-				if (off > key.offset) {
-					datao += off - key.offset;
-					datal -= off - key.offset;
-				}
-
-				if (key.offset + datal > off + len)
-					datal = off + len - key.offset;
-
 				/* disko == 0 means it's a hole */
 				if (!disko)
 					datao = 0;
@@ -1161,14 +1678,21 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 
 				if (comp && (skip || trim)) {
 					ret = -EINVAL;
+					btrfs_end_transaction(trans, root);
 					goto out;
 				}
 				size -= skip + trim;
 				datal -= skip + trim;
+
+				ret = btrfs_drop_extents(trans, inode,
+							 new_key.offset,
+							 new_key.offset + datal,
+							 &hint_byte, 1);
+				BUG_ON(ret);
+
 				ret = btrfs_insert_empty_item(trans, root, path,
 							      &new_key, size);
-				if (ret)
-					goto out;
+				BUG_ON(ret);
 
 				if (skip) {
 					u32 start =
@@ -1186,8 +1710,26 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 			}
 
 			btrfs_mark_buffer_dirty(leaf);
-		}
+			btrfs_release_path(root, path);
 
+			inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+
+			/*
+			 * we round up to the block size at eof when
+			 * determining which extents to clone above,
+			 * but shouldn't round up the file size
+			 */
+			endoff = new_key.offset + datal;
+			if (endoff > off+olen)
+				endoff = off+olen;
+			if (endoff > inode->i_size)
+				btrfs_i_size_write(inode, endoff);
+
+			BTRFS_I(inode)->flags = BTRFS_I(src)->flags;
+			ret = btrfs_update_inode(trans, root, inode);
+			BUG_ON(ret);
+			btrfs_end_transaction(trans, root);
+		}
 next:
 		btrfs_release_path(root, path);
 		key.offset++;
@@ -1195,17 +1737,7 @@ next:
 	ret = 0;
 out:
 	btrfs_release_path(root, path);
-	if (ret == 0) {
-		inode->i_mtime = inode->i_ctime = CURRENT_TIME;
-		if (destoff + olen > inode->i_size)
-			btrfs_i_size_write(inode, destoff + olen);
-		BTRFS_I(inode)->flags = BTRFS_I(src)->flags;
-		ret = btrfs_update_inode(trans, root, inode);
-	}
-	btrfs_end_transaction(trans, root);
 	unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
-	if (ret)
-		vmtruncate(inode, 0);
 out_unlock:
 	mutex_unlock(&src->i_mutex);
 	mutex_unlock(&inode->i_mutex);
@@ -1274,6 +1806,157 @@ out:
 	return ret;
 }
 
+static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
+{
+	struct inode *inode = fdentry(file)->d_inode;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_root *new_root;
+	struct btrfs_dir_item *di;
+	struct btrfs_trans_handle *trans;
+	struct btrfs_path *path;
+	struct btrfs_key location;
+	struct btrfs_disk_key disk_key;
+	struct btrfs_super_block *disk_super;
+	u64 features;
+	u64 objectid = 0;
+	u64 dir_id;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (copy_from_user(&objectid, argp, sizeof(objectid)))
+		return -EFAULT;
+
+	if (!objectid)
+		objectid = root->root_key.objectid;
+
+	location.objectid = objectid;
+	location.type = BTRFS_ROOT_ITEM_KEY;
+	location.offset = (u64)-1;
+
+	new_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
+	if (IS_ERR(new_root))
+		return PTR_ERR(new_root);
+
+	if (btrfs_root_refs(&new_root->root_item) == 0)
+		return -ENOENT;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+	path->leave_spinning = 1;
+
+	trans = btrfs_start_transaction(root, 1);
+	if (!trans) {
+		btrfs_free_path(path);
+		return -ENOMEM;
+	}
+
+	dir_id = btrfs_super_root_dir(&root->fs_info->super_copy);
+	di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path,
+				   dir_id, "default", 7, 1);
+	if (IS_ERR_OR_NULL(di)) {
+		btrfs_free_path(path);
+		btrfs_end_transaction(trans, root);
+		printk(KERN_ERR "Umm, you don't have the default dir item, "
+		       "this isn't going to work\n");
+		return -ENOENT;
+	}
+
+	btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key);
+	btrfs_set_dir_item_key(path->nodes[0], di, &disk_key);
+	btrfs_mark_buffer_dirty(path->nodes[0]);
+	btrfs_free_path(path);
+
+	disk_super = &root->fs_info->super_copy;
+	features = btrfs_super_incompat_flags(disk_super);
+	if (!(features & BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)) {
+		features |= BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL;
+		btrfs_set_super_incompat_flags(disk_super, features);
+	}
+	btrfs_end_transaction(trans, root);
+
+	return 0;
+}
+
+long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
+{
+	struct btrfs_ioctl_space_args space_args;
+	struct btrfs_ioctl_space_info space;
+	struct btrfs_ioctl_space_info *dest;
+	struct btrfs_ioctl_space_info *dest_orig;
+	struct btrfs_ioctl_space_info *user_dest;
+	struct btrfs_space_info *info;
+	int alloc_size;
+	int ret = 0;
+	int slot_count = 0;
+
+	if (copy_from_user(&space_args,
+			   (struct btrfs_ioctl_space_args __user *)arg,
+			   sizeof(space_args)))
+		return -EFAULT;
+
+	/* first we count slots */
+	rcu_read_lock();
+	list_for_each_entry_rcu(info, &root->fs_info->space_info, list)
+		slot_count++;
+	rcu_read_unlock();
+
+	/* space_slots == 0 means they are asking for a count */
+	if (space_args.space_slots == 0) {
+		space_args.total_spaces = slot_count;
+		goto out;
+	}
+	alloc_size = sizeof(*dest) * slot_count;
+	/* we generally have at most 6 or so space infos, one for each raid
+	 * level.  So, a whole page should be more than enough for everyone
+	 */
+	if (alloc_size > PAGE_CACHE_SIZE)
+		return -ENOMEM;
+
+	space_args.total_spaces = 0;
+	dest = kmalloc(alloc_size, GFP_NOFS);
+	if (!dest)
+		return -ENOMEM;
+	dest_orig = dest;
+
+	/* now we have a buffer to copy into */
+	rcu_read_lock();
+	list_for_each_entry_rcu(info, &root->fs_info->space_info, list) {
+		/* make sure we don't copy more than we allocated
+		 * in our buffer
+		 */
+		if (slot_count == 0)
+			break;
+		slot_count--;
+
+		/* make sure userland has enough room in their buffer */
+		if (space_args.total_spaces >= space_args.space_slots)
+			break;
+
+		space.flags = info->flags;
+		space.total_bytes = info->total_bytes;
+		space.used_bytes = info->bytes_used;
+		memcpy(dest, &space, sizeof(space));
+		dest++;
+		space_args.total_spaces++;
+	}
+	rcu_read_unlock();
+
+	user_dest = (struct btrfs_ioctl_space_info *)
+		(arg + sizeof(struct btrfs_ioctl_space_args));
+
+	if (copy_to_user(user_dest, dest_orig, alloc_size))
+		ret = -EFAULT;
+
+	kfree(dest_orig);
+out:
+	if (ret == 0 && copy_to_user(arg, &space_args, sizeof(space_args)))
+		ret = -EFAULT;
+
+	return ret;
+}
+
 /*
  * there are many ways the trans_start and trans_end ioctls can lead
  * to deadlocks.  They should only be used by applications that
@@ -1320,8 +2003,12 @@ long btrfs_ioctl(struct file *file, unsigned int
 		return btrfs_ioctl_snap_create(file, argp, 1);
 	case BTRFS_IOC_SNAP_DESTROY:
 		return btrfs_ioctl_snap_destroy(file, argp);
+	case BTRFS_IOC_DEFAULT_SUBVOL:
+		return btrfs_ioctl_default_subvol(file, argp);
 	case BTRFS_IOC_DEFRAG:
-		return btrfs_ioctl_defrag(file);
+		return btrfs_ioctl_defrag(file, NULL);
+	case BTRFS_IOC_DEFRAG_RANGE:
+		return btrfs_ioctl_defrag(file, argp);
 	case BTRFS_IOC_RESIZE:
 		return btrfs_ioctl_resize(root, argp);
 	case BTRFS_IOC_ADD_DEV:
@@ -1338,6 +2025,12 @@ long btrfs_ioctl(struct file *file, unsigned int
 		return btrfs_ioctl_trans_start(file);
 	case BTRFS_IOC_TRANS_END:
 		return btrfs_ioctl_trans_end(file);
+	case BTRFS_IOC_TREE_SEARCH:
+		return btrfs_ioctl_tree_search(file, argp);
+	case BTRFS_IOC_INO_LOOKUP:
+		return btrfs_ioctl_ino_lookup(file, argp);
+	case BTRFS_IOC_SPACE_INFO:
+		return btrfs_ioctl_space_info(root, argp);
 	case BTRFS_IOC_SYNC:
 		btrfs_sync_fs(file->f_dentry->d_sb, 1);
 		return 0;
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index bc49914475eb..424694aa517f 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -30,12 +30,114 @@ struct btrfs_ioctl_vol_args {
 	char name[BTRFS_PATH_NAME_MAX + 1];
 };
 
+#define BTRFS_INO_LOOKUP_PATH_MAX 4080
+struct btrfs_ioctl_ino_lookup_args {
+	__u64 treeid;
+	__u64 objectid;
+	char name[BTRFS_INO_LOOKUP_PATH_MAX];
+};
+
+struct btrfs_ioctl_search_key {
+	/* which root are we searching.  0 is the tree of tree roots */
+	__u64 tree_id;
+
+	/* keys returned will be >= min and <= max */
+	__u64 min_objectid;
+	__u64 max_objectid;
+
+	/* keys returned will be >= min and <= max */
+	__u64 min_offset;
+	__u64 max_offset;
+
+	/* max and min transids to search for */
+	__u64 min_transid;
+	__u64 max_transid;
+
+	/* keys returned will be >= min and <= max */
+	__u32 min_type;
+	__u32 max_type;
+
+	/*
+	 * how many items did userland ask for, and how many are we
+	 * returning
+	 */
+	__u32 nr_items;
+
+	/* align to 64 bits */
+	__u32 unused;
+
+	/* some extra for later */
+	__u64 unused1;
+	__u64 unused2;
+	__u64 unused3;
+	__u64 unused4;
+};
+
+struct btrfs_ioctl_search_header {
+	__u64 transid;
+	__u64 objectid;
+	__u64 offset;
+	__u32 type;
+	__u32 len;
+};
+
+#define BTRFS_SEARCH_ARGS_BUFSIZE (4096 - sizeof(struct btrfs_ioctl_search_key))
+/*
+ * the buf is an array of search headers where
+ * each header is followed by the actual item
+ * the type field is expanded to 32 bits for alignment
+ */
+struct btrfs_ioctl_search_args {
+	struct btrfs_ioctl_search_key key;
+	char buf[BTRFS_SEARCH_ARGS_BUFSIZE];
+};
+
 struct btrfs_ioctl_clone_range_args {
   __s64 src_fd;
   __u64 src_offset, src_length;
   __u64 dest_offset;
 };
 
+/* flags for the defrag range ioctl */
+#define BTRFS_DEFRAG_RANGE_COMPRESS 1
+#define BTRFS_DEFRAG_RANGE_START_IO 2
+
+struct btrfs_ioctl_defrag_range_args {
+	/* start of the defrag operation */
+	__u64 start;
+
+	/* number of bytes to defrag, use (u64)-1 to say all */
+	__u64 len;
+
+	/*
+	 * flags for the operation, which can include turning
+	 * on compression for this one defrag
+	 */
+	__u64 flags;
+
+	/*
+	 * any extent bigger than this will be considered
+	 * already defragged.  Use 0 to take the kernel default
+	 * Use 1 to say every single extent must be rewritten
+	 */
+	__u32 extent_thresh;
+
+	/* spare for later */
+	__u32 unused[5];
+};
+
+struct btrfs_ioctl_space_info {
+	__u64 flags;
+	__u64 total_bytes;
+	__u64 used_bytes;
+};
+
+struct btrfs_ioctl_space_args {
+	__u64 space_slots;
+	__u64 total_spaces;
+	struct btrfs_ioctl_space_info spaces[0];
+};
+
 #define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
 				   struct btrfs_ioctl_vol_args)
 #define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
@@ -67,4 +169,13 @@ struct btrfs_ioctl_clone_range_args {
 				   struct btrfs_ioctl_vol_args)
 #define BTRFS_IOC_SNAP_DESTROY _IOW(BTRFS_IOCTL_MAGIC, 15, \
 				struct btrfs_ioctl_vol_args)
+#define BTRFS_IOC_DEFRAG_RANGE _IOW(BTRFS_IOCTL_MAGIC, 16, \
+				struct btrfs_ioctl_defrag_range_args)
+#define BTRFS_IOC_TREE_SEARCH _IOWR(BTRFS_IOCTL_MAGIC, 17, \
+				   struct btrfs_ioctl_search_args)
+#define BTRFS_IOC_INO_LOOKUP _IOWR(BTRFS_IOCTL_MAGIC, 18, \
+				   struct btrfs_ioctl_ino_lookup_args)
+#define BTRFS_IOC_DEFAULT_SUBVOL _IOW(BTRFS_IOCTL_MAGIC, 19, u64)
+#define BTRFS_IOC_SPACE_INFO _IOWR(BTRFS_IOCTL_MAGIC, 20, \
+				    struct btrfs_ioctl_space_args)
 #endif
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 1c36e5cd8f55..6151f2ea38bb 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -16,7 +16,6 @@
  * Boston, MA 021110-1307, USA.
  */
 #include <linux/sched.h>
-#include <linux/gfp.h>
 #include <linux/pagemap.h>
 #include <linux/spinlock.h>
 #include <linux/page-flags.h>
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 5c2a9e78a949..e56c72bc5add 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -16,7 +16,6 @@
  * Boston, MA 021110-1307, USA.
  */
 
-#include <linux/gfp.h>
 #include <linux/slab.h>
 #include <linux/blkdev.h>
 #include <linux/writeback.h>
@@ -125,6 +124,15 @@ static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)
 	return 1;
 }
 
+static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset,
+			  u64 len)
+{
+	if (file_offset + len <= entry->file_offset ||
+	    entry->file_offset + entry->len <= file_offset)
+		return 0;
+	return 1;
+}
+
 /*
  * look find the first ordered struct that has this offset, otherwise
  * the first one less than this offset
@@ -162,8 +170,9 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
  * The tree is given a single reference on the ordered extent that was
  * inserted.
  */
-int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
-			     u64 start, u64 len, u64 disk_len, int type)
+static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
+				      u64 start, u64 len, u64 disk_len,
+				      int type, int dio)
 {
 	struct btrfs_ordered_inode_tree *tree;
 	struct rb_node *node;
@@ -174,7 +183,6 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
 	if (!entry)
 		return -ENOMEM;
 
-	mutex_lock(&tree->mutex);
 	entry->file_offset = file_offset;
 	entry->start = start;
 	entry->len = len;
@@ -184,26 +192,44 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
 	if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
 		set_bit(type, &entry->flags);
 
+	if (dio)
+		set_bit(BTRFS_ORDERED_DIRECT, &entry->flags);
+
 	/* one ref for the tree */
 	atomic_set(&entry->refs, 1);
 	init_waitqueue_head(&entry->wait);
 	INIT_LIST_HEAD(&entry->list);
 	INIT_LIST_HEAD(&entry->root_extent_list);
 
+	spin_lock(&tree->lock);
 	node = tree_insert(&tree->tree, file_offset,
 			   &entry->rb_node);
 	BUG_ON(node);
+	spin_unlock(&tree->lock);
 
 	spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
 	list_add_tail(&entry->root_extent_list,
 		      &BTRFS_I(inode)->root->fs_info->ordered_extents);
 	spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
 
-	mutex_unlock(&tree->mutex);
 	BUG_ON(node);
 	return 0;
 }
 
+int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
+			     u64 start, u64 len, u64 disk_len, int type)
+{
+	return __btrfs_add_ordered_extent(inode, file_offset, start, len,
+					  disk_len, type, 0);
+}
+
+int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
+				 u64 start, u64 len, u64 disk_len, int type)
+{
+	return __btrfs_add_ordered_extent(inode, file_offset, start, len,
+					  disk_len, type, 1);
+}
+
 /*
  * Add a struct btrfs_ordered_sum into the list of checksums to be inserted
  * when an ordered extent is finished.  If the list covers more than one
@@ -216,9 +242,9 @@ int btrfs_add_ordered_sum(struct inode *inode,
 	struct btrfs_ordered_inode_tree *tree;
 
 	tree = &BTRFS_I(inode)->ordered_tree;
-	mutex_lock(&tree->mutex);
+	spin_lock(&tree->lock);
 	list_add_tail(&sum->list, &entry->list);
-	mutex_unlock(&tree->mutex);
+	spin_unlock(&tree->lock);
 	return 0;
 }
 
@@ -232,15 +258,16 @@ int btrfs_add_ordered_sum(struct inode *inode,
  * to make sure this function only returns 1 once for a given ordered extent.
  */
 int btrfs_dec_test_ordered_pending(struct inode *inode,
+				   struct btrfs_ordered_extent **cached,
 				   u64 file_offset, u64 io_size)
 {
 	struct btrfs_ordered_inode_tree *tree;
 	struct rb_node *node;
-	struct btrfs_ordered_extent *entry;
+	struct btrfs_ordered_extent *entry = NULL;
 	int ret;
 
 	tree = &BTRFS_I(inode)->ordered_tree;
-	mutex_lock(&tree->mutex);
+	spin_lock(&tree->lock);
 	node = tree_search(tree, file_offset);
 	if (!node) {
 		ret = 1;
@@ -264,7 +291,11 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
 	else
 		ret = 1;
 out:
-	mutex_unlock(&tree->mutex);
+	if (!ret && cached && entry) {
+		*cached = entry;
+		atomic_inc(&entry->refs);
+	}
+	spin_unlock(&tree->lock);
 	return ret == 0;
 }
 
@@ -291,13 +322,14 @@ int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
 
 /*
  * remove an ordered extent from the tree.  No references are dropped
- * and you must wake_up entry->wait.  You must hold the tree mutex
+ * and you must wake_up entry->wait.  You must hold the tree lock
  * while you call this function.
  */
 static int __btrfs_remove_ordered_extent(struct inode *inode,
 				struct btrfs_ordered_extent *entry)
 {
 	struct btrfs_ordered_inode_tree *tree;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct rb_node *node;
 
 	tree = &BTRFS_I(inode)->ordered_tree;
@@ -306,13 +338,7 @@ static int __btrfs_remove_ordered_extent(struct inode *inode,
 	tree->last = NULL;
 	set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
 
-	spin_lock(&BTRFS_I(inode)->accounting_lock);
-	BTRFS_I(inode)->outstanding_extents--;
-	spin_unlock(&BTRFS_I(inode)->accounting_lock);
-	btrfs_unreserve_metadata_for_delalloc(BTRFS_I(inode)->root,
-					      inode, 1);
-
-	spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
+	spin_lock(&root->fs_info->ordered_extent_lock);
 	list_del_init(&entry->root_extent_list);
 
 	/*
@@ -324,7 +350,7 @@ static int __btrfs_remove_ordered_extent(struct inode *inode,
 	    !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
 		list_del_init(&BTRFS_I(inode)->ordered_operations);
 	}
-	spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
+	spin_unlock(&root->fs_info->ordered_extent_lock);
 
 	return 0;
 }
@@ -340,9 +366,9 @@ int btrfs_remove_ordered_extent(struct inode *inode,
 	int ret;
 
 	tree = &BTRFS_I(inode)->ordered_tree;
-	mutex_lock(&tree->mutex);
+	spin_lock(&tree->lock);
 	ret = __btrfs_remove_ordered_extent(inode, entry);
-	mutex_unlock(&tree->mutex);
+	spin_unlock(&tree->lock);
 	wake_up(&entry->wait);
 
 	return ret;
@@ -485,7 +511,8 @@ void btrfs_start_ordered_extent(struct inode *inode,
 	 * start IO on any dirty ones so the wait doesn't stall waiting
 	 * for pdflush to find them
 	 */
-	filemap_fdatawrite_range(inode->i_mapping, start, end);
+	if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
+		filemap_fdatawrite_range(inode->i_mapping, start, end);
 	if (wait) {
 		wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
 						 &entry->flags));
@@ -567,7 +594,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
 	struct btrfs_ordered_extent *entry = NULL;
 
 	tree = &BTRFS_I(inode)->ordered_tree;
-	mutex_lock(&tree->mutex);
+	spin_lock(&tree->lock);
 	node = tree_search(tree, file_offset);
 	if (!node)
 		goto out;
@@ -578,7 +605,48 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
 	if (entry)
 		atomic_inc(&entry->refs);
 out:
-	mutex_unlock(&tree->mutex);
+	spin_unlock(&tree->lock);
+	return entry;
+}
+
+/* Since the DIO code tries to lock a wide area we need to look for any ordered
+ * extents that exist in the range, rather than just the start of the range.
+ */
+struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
+							u64 file_offset,
+							u64 len)
+{
+	struct btrfs_ordered_inode_tree *tree;
+	struct rb_node *node;
+	struct btrfs_ordered_extent *entry = NULL;
+
+	tree = &BTRFS_I(inode)->ordered_tree;
+	spin_lock(&tree->lock);
+	node = tree_search(tree, file_offset);
+	if (!node) {
+		node = tree_search(tree, file_offset + len);
+		if (!node)
+			goto out;
+	}
+
+	while (1) {
+		entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+		if (range_overlaps(entry, file_offset, len))
+			break;
+
+		if (entry->file_offset >= file_offset + len) {
+			entry = NULL;
+			break;
+		}
+		entry = NULL;
+		node = rb_next(node);
+		if (!node)
+			break;
+	}
+out:
+	if (entry)
+		atomic_inc(&entry->refs);
+	spin_unlock(&tree->lock);
 	return entry;
 }
 
@@ -594,7 +662,7 @@ btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
 	struct btrfs_ordered_extent *entry = NULL;
 
 	tree = &BTRFS_I(inode)->ordered_tree;
-	mutex_lock(&tree->mutex);
+	spin_lock(&tree->lock);
 	node = tree_search(tree, file_offset);
 	if (!node)
 		goto out;
@@ -602,7 +670,7 @@ btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
 	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
 	atomic_inc(&entry->refs);
 out:
-	mutex_unlock(&tree->mutex);
+	spin_unlock(&tree->lock);
 	return entry;
 }
 
@@ -629,7 +697,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
 	else
 		offset = ALIGN(offset, BTRFS_I(inode)->root->sectorsize);
 
-	mutex_lock(&tree->mutex);
+	spin_lock(&tree->lock);
 	disk_i_size = BTRFS_I(inode)->disk_i_size;
 
 	/* truncate file */
@@ -735,7 +803,7 @@ out:
 	 */
 	if (ordered)
 		__btrfs_remove_ordered_extent(inode, ordered);
-	mutex_unlock(&tree->mutex);
+	spin_unlock(&tree->lock);
 	if (ordered)
 		wake_up(&ordered->wait);
 	return ret;
@@ -762,7 +830,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
 	if (!ordered)
 		return 1;
 
-	mutex_lock(&tree->mutex);
+	spin_lock(&tree->lock);
 	list_for_each_entry_reverse(ordered_sum, &ordered->list, list) {
 		if (disk_bytenr >= ordered_sum->bytenr) {
 			num_sectors = ordered_sum->len / sectorsize;
@@ -777,7 +845,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
 		}
 	}
 out:
-	mutex_unlock(&tree->mutex);
+	spin_unlock(&tree->lock);
 	btrfs_put_ordered_extent(ordered);
 	return ret;
 }
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 1fe1282ef47c..8ac365492a3f 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -21,7 +21,7 @@
 
 /* one of these per inode */
 struct btrfs_ordered_inode_tree {
-	struct mutex mutex;
+	spinlock_t lock;
 	struct rb_root tree;
 	struct rb_node *last;
 };
@@ -72,6 +72,8 @@ struct btrfs_ordered_sum {
 
 #define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */
 
+#define BTRFS_ORDERED_DIRECT 5 /* set when we're doing DIO with this extent */
+
 struct btrfs_ordered_extent {
 	/* logical offset in the file */
 	u64 file_offset;
@@ -128,8 +130,8 @@ static inline int btrfs_ordered_sum_size(struct btrfs_root *root,
 static inline void
 btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t)
 {
-	mutex_init(&t->mutex);
-	t->tree.rb_node = NULL;
+	spin_lock_init(&t->lock);
+	t->tree = RB_ROOT;
 	t->last = NULL;
 }
 
@@ -137,9 +139,12 @@ int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry);
 int btrfs_remove_ordered_extent(struct inode *inode,
 				struct btrfs_ordered_extent *entry);
 int btrfs_dec_test_ordered_pending(struct inode *inode,
-				       u64 file_offset, u64 io_size);
+				   struct btrfs_ordered_extent **cached,
+				   u64 file_offset, u64 io_size);
 int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
-			     u64 start, u64 len, u64 disk_len, int tyep);
+			     u64 start, u64 len, u64 disk_len, int type);
+int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
+				 u64 start, u64 len, u64 disk_len, int type);
 int btrfs_add_ordered_sum(struct inode *inode,
 			  struct btrfs_ordered_extent *entry,
 			  struct btrfs_ordered_sum *sum);
@@ -150,6 +155,9 @@ void btrfs_start_ordered_extent(struct inode *inode,
 int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
 struct btrfs_ordered_extent *
 btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
+struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
+							u64 file_offset,
+							u64 len);
 int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
 				struct btrfs_ordered_extent *ordered);
 int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c
index d0cc62bccb94..a97314cf6bd6 100644
--- a/fs/btrfs/ref-cache.c
+++ b/fs/btrfs/ref-cache.c
@@ -17,6 +17,7 @@
  */
 
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/sort.h>
 #include "ctree.h"
 #include "ref-cache.h"
diff --git a/fs/btrfs/ref-cache.h b/fs/btrfs/ref-cache.h
index bc283ad2db73..e2a55cb2072b 100644
--- a/fs/btrfs/ref-cache.h
+++ b/fs/btrfs/ref-cache.h
@@ -52,7 +52,7 @@ static inline size_t btrfs_leaf_ref_size(int nr_extents)
 
 static inline void btrfs_leaf_ref_tree_init(struct btrfs_leaf_ref_tree *tree)
 {
-	tree->root.rb_node = NULL;
+	tree->root = RB_ROOT;
 	INIT_LIST_HEAD(&tree->list);
 	spin_lock_init(&tree->lock);
 }
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index ed3e4a2ec2c8..b37d723b9d4a 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -21,6 +21,7 @@
 #include <linux/writeback.h>
 #include <linux/blkdev.h>
 #include <linux/rbtree.h>
+#include <linux/slab.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -43,8 +44,12 @@ struct tree_entry {
 struct backref_node {
 	struct rb_node rb_node;
 	u64 bytenr;
-	/* objectid tree block owner */
+
+	u64 new_bytenr;
+	/* objectid of tree block owner, can be not uptodate */
 	u64 owner;
+	/* link to pending, changed or detached list */
+	struct list_head list;
 	/* list of upper level blocks reference this block */
 	struct list_head upper;
 	/* list of child blocks in the cache */
@@ -55,9 +60,9 @@ struct backref_node {
 	struct extent_buffer *eb;
 	/* level of tree block */
 	unsigned int level:8;
-	/* 1 if the block is root of old snapshot */
-	unsigned int old_root:1;
-	/* 1 if no child blocks in the cache */
+	/* is the block in non-reference counted tree */
+	unsigned int cowonly:1;
+	/* 1 if no child node in the cache */
 	unsigned int lowest:1;
 	/* is the extent buffer locked */
 	unsigned int locked:1;
@@ -65,6 +70,16 @@ struct backref_node {
 	unsigned int processed:1;
 	/* have backrefs of this block been checked */
 	unsigned int checked:1;
+	/*
+	 * 1 if corresponding block has been cowed but some upper
+	 * level block pointers may not point to the new location
+	 */
+	unsigned int pending:1;
+	/*
+	 * 1 if the backref node isn't connected to any other
+	 * backref node.
+	 */
+	unsigned int detached:1;
 };
 
 /*
@@ -73,7 +88,6 @@ struct backref_node {
 struct backref_edge {
 	struct list_head list[2];
 	struct backref_node *node[2];
-	u64 blockptr;
 };
 
 #define LOWER	0
@@ -82,9 +96,25 @@ struct backref_edge {
 struct backref_cache {
 	/* red black tree of all backref nodes in the cache */
 	struct rb_root rb_root;
-	/* list of backref nodes with no child block in the cache */
+	/* for passing backref nodes to btrfs_reloc_cow_block */
+	struct backref_node *path[BTRFS_MAX_LEVEL];
+	/*
+	 * list of blocks that have been cowed but some block
+	 * pointers in upper level blocks may not reflect the
+	 * new location
+	 */
 	struct list_head pending[BTRFS_MAX_LEVEL];
-	spinlock_t lock;
+	/* list of backref nodes with no child node */
+	struct list_head leaves;
+	/* list of blocks that have been cowed in current transaction */
+	struct list_head changed;
+	/* list of detached backref node. */
+	struct list_head detached;
+
+	u64 last_trans;
+
+	int nr_nodes;
+	int nr_edges;
 };
 
 /*
@@ -112,15 +142,6 @@ struct tree_block {
 	unsigned int key_ready:1;
 };
 
-/* inode vector */
-#define INODEVEC_SIZE 16
-
-struct inodevec {
-	struct list_head list;
-	struct inode *inode[INODEVEC_SIZE];
-	int nr;
-};
-
 #define MAX_EXTENTS 128
 
 struct file_extent_cluster {
@@ -137,58 +158,130 @@ struct reloc_control {
 	struct btrfs_root *extent_root;
 	/* inode for moving data */
 	struct inode *data_inode;
-	struct btrfs_workers workers;
+
+	struct btrfs_block_rsv *block_rsv;
+
+	struct backref_cache backref_cache;
+
+	struct file_extent_cluster cluster;
 	/* tree blocks have been processed */
 	struct extent_io_tree processed_blocks;
 	/* map start of tree root to corresponding reloc tree */
 	struct mapping_tree reloc_root_tree;
 	/* list of reloc trees */
 	struct list_head reloc_roots;
+	/* size of metadata reservation for merging reloc trees */
+	u64 merging_rsv_size;
+	/* size of relocated tree nodes */
+	u64 nodes_relocated;
+
 	u64 search_start;
 	u64 extents_found;
-	u64 extents_skipped;
-	int stage;
-	int create_reloc_root;
+
+	int block_rsv_retries;
+
+	unsigned int stage:8;
+	unsigned int create_reloc_tree:1;
+	unsigned int merge_reloc_tree:1;
 	unsigned int found_file_extent:1;
-	unsigned int found_old_snapshot:1;
+	unsigned int commit_transaction:1;
 };
 
 /* stages of data relocation */
 #define MOVE_DATA_EXTENTS	0
 #define UPDATE_DATA_PTRS	1
 
-/*
- * merge reloc tree to corresponding fs tree in worker threads
- */
-struct async_merge {
-	struct btrfs_work work;
-	struct reloc_control *rc;
-	struct btrfs_root *root;
-	struct completion *done;
-	atomic_t *num_pending;
-};
+static void remove_backref_node(struct backref_cache *cache,
+				struct backref_node *node);
+static void __mark_block_processed(struct reloc_control *rc,
+				   struct backref_node *node);
 
 static void mapping_tree_init(struct mapping_tree *tree)
 {
-	tree->rb_root.rb_node = NULL;
+	tree->rb_root = RB_ROOT;
 	spin_lock_init(&tree->lock);
 }
 
 static void backref_cache_init(struct backref_cache *cache)
 {
 	int i;
-	cache->rb_root.rb_node = NULL;
+	cache->rb_root = RB_ROOT;
 	for (i = 0; i < BTRFS_MAX_LEVEL; i++)
 		INIT_LIST_HEAD(&cache->pending[i]);
-	spin_lock_init(&cache->lock);
+	INIT_LIST_HEAD(&cache->changed);
+	INIT_LIST_HEAD(&cache->detached);
+	INIT_LIST_HEAD(&cache->leaves);
+}
+
+static void backref_cache_cleanup(struct backref_cache *cache)
+{
+	struct backref_node *node;
+	int i;
+
+	while (!list_empty(&cache->detached)) {
+		node = list_entry(cache->detached.next,
+				  struct backref_node, list);
+		remove_backref_node(cache, node);
+	}
+
+	while (!list_empty(&cache->leaves)) {
+		node = list_entry(cache->leaves.next,
+				  struct backref_node, lower);
+		remove_backref_node(cache, node);
+	}
+
+	cache->last_trans = 0;
+
+	for (i = 0; i < BTRFS_MAX_LEVEL; i++)
+		BUG_ON(!list_empty(&cache->pending[i]));
+	BUG_ON(!list_empty(&cache->changed));
+	BUG_ON(!list_empty(&cache->detached));
+	BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root));
+	BUG_ON(cache->nr_nodes);
+	BUG_ON(cache->nr_edges);
+}
+
+static struct backref_node *alloc_backref_node(struct backref_cache *cache)
+{
+	struct backref_node *node;
+
+	node = kzalloc(sizeof(*node), GFP_NOFS);
+	if (node) {
+		INIT_LIST_HEAD(&node->list);
+		INIT_LIST_HEAD(&node->upper);
+		INIT_LIST_HEAD(&node->lower);
+		RB_CLEAR_NODE(&node->rb_node);
+		cache->nr_nodes++;
+	}
+	return node;
+}
+
+static void free_backref_node(struct backref_cache *cache,
+			      struct backref_node *node)
+{
+	if (node) {
+		cache->nr_nodes--;
+		kfree(node);
+	}
+}
+
+static struct backref_edge *alloc_backref_edge(struct backref_cache *cache)
+{
+	struct backref_edge *edge;
+
+	edge = kzalloc(sizeof(*edge), GFP_NOFS);
+	if (edge)
+		cache->nr_edges++;
+	return edge;
 }
 
-static void backref_node_init(struct backref_node *node)
+static void free_backref_edge(struct backref_cache *cache,
+			      struct backref_edge *edge)
 {
-	memset(node, 0, sizeof(*node));
-	INIT_LIST_HEAD(&node->upper);
-	INIT_LIST_HEAD(&node->lower);
-	RB_CLEAR_NODE(&node->rb_node);
+	if (edge) {
+		cache->nr_edges--;
+		kfree(edge);
+	}
 }
 
 static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
@@ -249,6 +342,7 @@ static struct backref_node *walk_up_backref(struct backref_node *node,
 		edges[idx++] = edge;
 		node = edge->node[UPPER];
 	}
+	BUG_ON(node->detached);
 	*index = idx;
 	return node;
 }
@@ -280,13 +374,18 @@ static struct backref_node *walk_down_backref(struct backref_edge *edges[],
 	return NULL;
 }
 
+static void unlock_node_buffer(struct backref_node *node)
+{
+	if (node->locked) {
+		btrfs_tree_unlock(node->eb);
+		node->locked = 0;
+	}
+}
+
 static void drop_node_buffer(struct backref_node *node)
 {
 	if (node->eb) {
-		if (node->locked) {
-			btrfs_tree_unlock(node->eb);
-			node->locked = 0;
-		}
+		unlock_node_buffer(node);
 		free_extent_buffer(node->eb);
 		node->eb = NULL;
 	}
@@ -295,14 +394,14 @@ static void drop_node_buffer(struct backref_node *node)
 static void drop_backref_node(struct backref_cache *tree,
 			      struct backref_node *node)
 {
-	BUG_ON(!node->lowest);
 	BUG_ON(!list_empty(&node->upper));
 
 	drop_node_buffer(node);
+	list_del(&node->list);
 	list_del(&node->lower);
-
-	rb_erase(&node->rb_node, &tree->rb_root);
-	kfree(node);
+	if (!RB_EMPTY_NODE(&node->rb_node))
+		rb_erase(&node->rb_node, &tree->rb_root);
+	free_backref_node(tree, node);
 }
 
 /*
@@ -317,27 +416,121 @@ static void remove_backref_node(struct backref_cache *cache,
 	if (!node)
 		return;
 
-	BUG_ON(!node->lowest);
+	BUG_ON(!node->lowest && !node->detached);
 	while (!list_empty(&node->upper)) {
 		edge = list_entry(node->upper.next, struct backref_edge,
 				  list[LOWER]);
 		upper = edge->node[UPPER];
 		list_del(&edge->list[LOWER]);
 		list_del(&edge->list[UPPER]);
-		kfree(edge);
+		free_backref_edge(cache, edge);
+
+		if (RB_EMPTY_NODE(&upper->rb_node)) {
+			BUG_ON(!list_empty(&node->upper));
+			drop_backref_node(cache, node);
+			node = upper;
+			node->lowest = 1;
+			continue;
+		}
 		/*
-		 * add the node to pending list if no other
+		 * add the node to leaf node list if no other
 		 * child block cached.
 		 */
 		if (list_empty(&upper->lower)) {
-			list_add_tail(&upper->lower,
-				      &cache->pending[upper->level]);
+			list_add_tail(&upper->lower, &cache->leaves);
 			upper->lowest = 1;
 		}
 	}
+
 	drop_backref_node(cache, node);
 }
 
+static void update_backref_node(struct backref_cache *cache,
+				struct backref_node *node, u64 bytenr)
+{
+	struct rb_node *rb_node;
+	rb_erase(&node->rb_node, &cache->rb_root);
+	node->bytenr = bytenr;
+	rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node);
+	BUG_ON(rb_node);
+}
+
+/*
+ * update backref cache after a transaction commit
+ */
+static int update_backref_cache(struct btrfs_trans_handle *trans,
+				struct backref_cache *cache)
+{
+	struct backref_node *node;
+	int level = 0;
+
+	if (cache->last_trans == 0) {
+		cache->last_trans = trans->transid;
+		return 0;
+	}
+
+	if (cache->last_trans == trans->transid)
+		return 0;
+
+	/*
+	 * detached nodes are used to avoid unnecessary backref
+	 * lookup. transaction commit changes the extent tree.
+	 * so the detached nodes are no longer useful.
+	 */
+	while (!list_empty(&cache->detached)) {
+		node = list_entry(cache->detached.next,
+				  struct backref_node, list);
+		remove_backref_node(cache, node);
+	}
+
+	while (!list_empty(&cache->changed)) {
+		node = list_entry(cache->changed.next,
+				  struct backref_node, list);
+		list_del_init(&node->list);
+		BUG_ON(node->pending);
+		update_backref_node(cache, node, node->new_bytenr);
+	}
+
+	/*
+	 * some nodes can be left in the pending list if there were
+	 * errors during processing the pending nodes.
+	 */
+	for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
+		list_for_each_entry(node, &cache->pending[level], list) {
+			BUG_ON(!node->pending);
+			if (node->bytenr == node->new_bytenr)
+				continue;
+			update_backref_node(cache, node, node->new_bytenr);
+		}
+	}
+
+	cache->last_trans = 0;
+	return 1;
+}
+
+static int should_ignore_root(struct btrfs_root *root)
+{
+	struct btrfs_root *reloc_root;
+
+	if (!root->ref_cows)
+		return 0;
+
+	reloc_root = root->reloc_root;
+	if (!reloc_root)
+		return 0;
+
+	if (btrfs_root_last_snapshot(&reloc_root->root_item) ==
+	    root->fs_info->running_transaction->transid - 1)
+		return 0;
+	/*
+	 * if there is reloc tree and it was created in previous
+	 * transaction backref lookup can find the reloc tree,
+	 * so backref node for the fs tree root is useless for
+	 * relocation.
+	 */
+	return 1;
+}
+
 /*
  * find reloc tree by address of tree root
  */
@@ -452,11 +645,12 @@ int find_inline_backref(struct extent_buffer *leaf, int slot,
  * for all upper level blocks that directly/indirectly reference the
  * block are also cached.
  */
-static struct backref_node *build_backref_tree(struct reloc_control *rc,
-					       struct backref_cache *cache,
-					       struct btrfs_key *node_key,
-					       int level, u64 bytenr)
+static noinline_for_stack
+struct backref_node *build_backref_tree(struct reloc_control *rc,
+					struct btrfs_key *node_key,
+					int level, u64 bytenr)
 {
+	struct backref_cache *cache = &rc->backref_cache;
 	struct btrfs_path *path1;
 	struct btrfs_path *path2;
 	struct extent_buffer *eb;
@@ -472,6 +666,8 @@ static struct backref_node *build_backref_tree(struct reloc_control *rc,
 	unsigned long end;
 	unsigned long ptr;
 	LIST_HEAD(list);
+	LIST_HEAD(useless);
+	int cowonly;
 	int ret;
 	int err = 0;
 
@@ -482,15 +678,13 @@ static struct backref_node *build_backref_tree(struct reloc_control *rc,
 		goto out;
 	}
 
-	node = kmalloc(sizeof(*node), GFP_NOFS);
+	node = alloc_backref_node(cache);
 	if (!node) {
 		err = -ENOMEM;
 		goto out;
 	}
 
-	backref_node_init(node);
 	node->bytenr = bytenr;
-	node->owner = 0;
 	node->level = level;
 	node->lowest = 1;
 	cur = node;
@@ -586,17 +780,21 @@ again:
 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
 		if (key.type == BTRFS_SHARED_BLOCK_REF_KEY ||
 		    key.type == BTRFS_EXTENT_REF_V0_KEY) {
-			if (key.objectid == key.offset &&
-			    key.type == BTRFS_EXTENT_REF_V0_KEY) {
+			if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
 				struct btrfs_extent_ref_v0 *ref0;
 				ref0 = btrfs_item_ptr(eb, path1->slots[0],
 						struct btrfs_extent_ref_v0);
-				root = find_tree_root(rc, eb, ref0);
-				if (root)
-					cur->root = root;
-				else
-					cur->old_root = 1;
-				break;
+				if (key.objectid == key.offset) {
+					root = find_tree_root(rc, eb, ref0);
+					if (root && !should_ignore_root(root))
+						cur->root = root;
+					else
+						list_add(&cur->list, &useless);
+					break;
+				}
+				if (is_cowonly_root(btrfs_ref_root_v0(eb,
+								      ref0)))
+					cur->cowonly = 1;
 			}
 #else
 		BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
@@ -613,22 +811,20 @@ again:
 				break;
 			}
 
-			edge = kzalloc(sizeof(*edge), GFP_NOFS);
+			edge = alloc_backref_edge(cache);
 			if (!edge) {
 				err = -ENOMEM;
 				goto out;
 			}
 			rb_node = tree_search(&cache->rb_root, key.offset);
 			if (!rb_node) {
-				upper = kmalloc(sizeof(*upper), GFP_NOFS);
+				upper = alloc_backref_node(cache);
 				if (!upper) {
-					kfree(edge);
+					free_backref_edge(cache, edge);
 					err = -ENOMEM;
 					goto out;
 				}
-				backref_node_init(upper);
 				upper->bytenr = key.offset;
-				upper->owner = 0;
 				upper->level = cur->level + 1;
 				/*
 				 *  backrefs for the upper level block isn't
@@ -638,11 +834,12 @@ again:
 			} else {
 				upper = rb_entry(rb_node, struct backref_node,
 						 rb_node);
+				BUG_ON(!upper->checked);
 				INIT_LIST_HEAD(&edge->list[UPPER]);
 			}
-			list_add(&edge->list[LOWER], &cur->upper);
-			edge->node[UPPER] = upper;
+			list_add_tail(&edge->list[LOWER], &cur->upper);
 			edge->node[LOWER] = cur;
+			edge->node[UPPER] = upper;
 
 			goto next;
 		} else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) {
@@ -656,11 +853,17 @@ again:
 			goto out;
 		}
 
+		if (!root->ref_cows)
+			cur->cowonly = 1;
+
 		if (btrfs_root_level(&root->root_item) == cur->level) {
 			/* tree root */
 			BUG_ON(btrfs_root_bytenr(&root->root_item) !=
 			       cur->bytenr);
-			cur->root = root;
+			if (should_ignore_root(root))
+				list_add(&cur->list, &useless);
+			else
+				cur->root = root;
 			break;
 		}
 
@@ -691,11 +894,14 @@ again:
 			if (!path2->nodes[level]) {
 				BUG_ON(btrfs_root_bytenr(&root->root_item) !=
 				       lower->bytenr);
-				lower->root = root;
+				if (should_ignore_root(root))
+					list_add(&lower->list, &useless);
+				else
+					lower->root = root;
 				break;
 			}
 
-			edge = kzalloc(sizeof(*edge), GFP_NOFS);
+			edge = alloc_backref_edge(cache);
 			if (!edge) {
 				err = -ENOMEM;
 				goto out;
@@ -704,16 +910,17 @@ again:
 			eb = path2->nodes[level];
 			rb_node = tree_search(&cache->rb_root, eb->start);
 			if (!rb_node) {
-				upper = kmalloc(sizeof(*upper), GFP_NOFS);
+				upper = alloc_backref_node(cache);
 				if (!upper) {
-					kfree(edge);
+					free_backref_edge(cache, edge);
 					err = -ENOMEM;
 					goto out;
 				}
-				backref_node_init(upper);
 				upper->bytenr = eb->start;
 				upper->owner = btrfs_header_owner(eb);
 				upper->level = lower->level + 1;
+				if (!root->ref_cows)
+					upper->cowonly = 1;
 
 				/*
 				 * if we know the block isn't shared
@@ -743,10 +950,12 @@ again:
 						 rb_node);
 				BUG_ON(!upper->checked);
 				INIT_LIST_HEAD(&edge->list[UPPER]);
+				if (!upper->owner)
+					upper->owner = btrfs_header_owner(eb);
 			}
 			list_add_tail(&edge->list[LOWER], &lower->upper);
-			edge->node[UPPER] = upper;
 			edge->node[LOWER] = lower;
+			edge->node[UPPER] = upper;
 
 			if (rb_node)
 				break;
@@ -784,8 +993,13 @@ next:
 	 * into the cache.
 	 */
 	BUG_ON(!node->checked);
-	rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node);
-	BUG_ON(rb_node);
+	cowonly = node->cowonly;
+	if (!cowonly) {
+		rb_node = tree_insert(&cache->rb_root, node->bytenr,
+				      &node->rb_node);
+		BUG_ON(rb_node);
+		list_add_tail(&node->lower, &cache->leaves);
+	}
 
 	list_for_each_entry(edge, &node->upper, list[LOWER])
 		list_add_tail(&edge->list[UPPER], &list);
@@ -794,6 +1008,14 @@ next:
 		edge = list_entry(list.next, struct backref_edge, list[UPPER]);
 		list_del_init(&edge->list[UPPER]);
 		upper = edge->node[UPPER];
+		if (upper->detached) {
+			list_del(&edge->list[LOWER]);
+			lower = edge->node[LOWER];
+			free_backref_edge(cache, edge);
+			if (list_empty(&lower->upper))
+				list_add(&lower->list, &useless);
+			continue;
+		}
 
 		if (!RB_EMPTY_NODE(&upper->rb_node)) {
 			if (upper->lowest) {
@@ -806,25 +1028,69 @@ next:
 		}
 
 		BUG_ON(!upper->checked);
-		rb_node = tree_insert(&cache->rb_root, upper->bytenr,
-				      &upper->rb_node);
-		BUG_ON(rb_node);
+		BUG_ON(cowonly != upper->cowonly);
+		if (!cowonly) {
+			rb_node = tree_insert(&cache->rb_root, upper->bytenr,
+					      &upper->rb_node);
+			BUG_ON(rb_node);
+		}
 
 		list_add_tail(&edge->list[UPPER], &upper->lower);
 
 		list_for_each_entry(edge, &upper->upper, list[LOWER])
 			list_add_tail(&edge->list[UPPER], &list);
 	}
+	/*
+	 * process useless backref nodes. backref nodes for tree leaves
+	 * are deleted from the cache. backref nodes for upper level
+	 * tree blocks are left in the cache to avoid unnecessary backref
+	 * lookup.
+	 */
+	while (!list_empty(&useless)) {
+		upper = list_entry(useless.next, struct backref_node, list);
+		list_del_init(&upper->list);
+		BUG_ON(!list_empty(&upper->upper));
+		if (upper == node)
+			node = NULL;
+		if (upper->lowest) {
+			list_del_init(&upper->lower);
+			upper->lowest = 0;
+		}
+		while (!list_empty(&upper->lower)) {
+			edge = list_entry(upper->lower.next,
+					  struct backref_edge, list[UPPER]);
+			list_del(&edge->list[UPPER]);
+			list_del(&edge->list[LOWER]);
+			lower = edge->node[LOWER];
+			free_backref_edge(cache, edge);
+
+			if (list_empty(&lower->upper))
+				list_add(&lower->list, &useless);
+		}
+		__mark_block_processed(rc, upper);
+		if (upper->level > 0) {
+			list_add(&upper->list, &cache->detached);
+			upper->detached = 1;
+		} else {
+			rb_erase(&upper->rb_node, &cache->rb_root);
+			free_backref_node(cache, upper);
+		}
+	}
 out:
 	btrfs_free_path(path1);
 	btrfs_free_path(path2);
 	if (err) {
-		INIT_LIST_HEAD(&list);
+		while (!list_empty(&useless)) {
+			lower = list_entry(useless.next,
+					   struct backref_node, upper);
+			list_del_init(&lower->upper);
+		}
 		upper = node;
+		INIT_LIST_HEAD(&list);
 		while (upper) {
 			if (RB_EMPTY_NODE(&upper->rb_node)) {
 				list_splice_tail(&upper->upper, &list);
-				kfree(upper);
+				free_backref_node(cache, upper);
 			}
 
 			if (list_empty(&list))
@@ -832,15 +1098,104 @@ out:
 
 			edge = list_entry(list.next, struct backref_edge,
 					  list[LOWER]);
+			list_del(&edge->list[LOWER]);
 			upper = edge->node[UPPER];
-			kfree(edge);
+			free_backref_edge(cache, edge);
 		}
 		return ERR_PTR(err);
 	}
+	BUG_ON(node && node->detached);
 	return node;
 }
 
 /*
+ * helper to add backref node for the newly created snapshot.
+ * the backref node is created by cloning backref node that
+ * corresponds to root of source tree
+ */
+static int clone_backref_node(struct btrfs_trans_handle *trans,
+			      struct reloc_control *rc,
+			      struct btrfs_root *src,
+			      struct btrfs_root *dest)
+{
+	struct btrfs_root *reloc_root = src->reloc_root;
+	struct backref_cache *cache = &rc->backref_cache;
+	struct backref_node *node = NULL;
+	struct backref_node *new_node;
+	struct backref_edge *edge;
+	struct backref_edge *new_edge;
+	struct rb_node *rb_node;
+
+	if (cache->last_trans > 0)
+		update_backref_cache(trans, cache);
+
+	rb_node = tree_search(&cache->rb_root, src->commit_root->start);
+	if (rb_node) {
+		node = rb_entry(rb_node, struct backref_node, rb_node);
+		if (node->detached)
+			node = NULL;
+		else
+			BUG_ON(node->new_bytenr != reloc_root->node->start);
+	}
+
+	if (!node) {
+		rb_node = tree_search(&cache->rb_root,
+				      reloc_root->commit_root->start);
+		if (rb_node) {
+			node = rb_entry(rb_node, struct backref_node,
+					rb_node);
+			BUG_ON(node->detached);
+		}
+	}
+
+	if (!node)
+		return 0;
+
+	new_node = alloc_backref_node(cache);
+	if (!new_node)
+		return -ENOMEM;
+
+	new_node->bytenr = dest->node->start;
+	new_node->level = node->level;
+	new_node->lowest = node->lowest;
+	new_node->root = dest;
+
+	if (!node->lowest) {
+		list_for_each_entry(edge, &node->lower, list[UPPER]) {
+			new_edge = alloc_backref_edge(cache);
+			if (!new_edge)
+				goto fail;
+
+			new_edge->node[UPPER] = new_node;
+			new_edge->node[LOWER] = edge->node[LOWER];
+			list_add_tail(&new_edge->list[UPPER],
+				      &new_node->lower);
+		}
+	}
+
+	rb_node = tree_insert(&cache->rb_root, new_node->bytenr,
+			      &new_node->rb_node);
+	BUG_ON(rb_node);
+
+	if (!new_node->lowest) {
+		list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) {
+			list_add_tail(&new_edge->list[LOWER],
+				      &new_edge->node[LOWER]->upper);
+		}
+	}
+	return 0;
+fail:
+	while (!list_empty(&new_node->lower)) {
+		new_edge = list_entry(new_node->lower.next,
+				      struct backref_edge, list[UPPER]);
+		list_del(&new_edge->list[UPPER]);
+		free_backref_edge(cache, new_edge);
+	}
+	free_backref_node(cache, new_node);
+	return -ENOMEM;
+}
+
+/*
  * helper to add 'address of tree root -> reloc tree' mapping
  */
 static int __add_reloc_root(struct btrfs_root *root)
@@ -900,12 +1255,8 @@ static int __update_reloc_root(struct btrfs_root *root, int del)
 	return 0;
 }
 
-/*
- * create reloc tree for a given fs tree. reloc tree is just a
- * snapshot of the fs tree with special root objectid.
- */
-int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
-			  struct btrfs_root *root)
+static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
+					struct btrfs_root *root, u64 objectid)
 {
 	struct btrfs_root *reloc_root;
 	struct extent_buffer *eb;
@@ -913,36 +1264,45 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
 	struct btrfs_key root_key;
 	int ret;
 
-	if (root->reloc_root) {
-		reloc_root = root->reloc_root;
-		reloc_root->last_trans = trans->transid;
-		return 0;
-	}
-
-	if (!root->fs_info->reloc_ctl ||
-	    !root->fs_info->reloc_ctl->create_reloc_root ||
-	    root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
-		return 0;
-
 	root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
 	BUG_ON(!root_item);
 
 	root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
 	root_key.type = BTRFS_ROOT_ITEM_KEY;
-	root_key.offset = root->root_key.objectid;
+	root_key.offset = objectid;
 
-	ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
-			      BTRFS_TREE_RELOC_OBJECTID);
-	BUG_ON(ret);
+	if (root->root_key.objectid == objectid) {
+		/* called by btrfs_init_reloc_root */
+		ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
+				      BTRFS_TREE_RELOC_OBJECTID);
+		BUG_ON(ret);
+
+		btrfs_set_root_last_snapshot(&root->root_item,
+					     trans->transid - 1);
+	} else {
+		/*
+		 * called by btrfs_reloc_post_snapshot_hook.
+		 * the source tree is a reloc tree, all tree blocks
+		 * modified after it was created have RELOC flag
+		 * set in their headers. so it's OK to not update
+		 * the 'last_snapshot'.
+		 */
+		ret = btrfs_copy_root(trans, root, root->node, &eb,
+				      BTRFS_TREE_RELOC_OBJECTID);
+		BUG_ON(ret);
+	}
 
-	btrfs_set_root_last_snapshot(&root->root_item, trans->transid - 1);
 	memcpy(root_item, &root->root_item, sizeof(*root_item));
-	btrfs_set_root_refs(root_item, 1);
 	btrfs_set_root_bytenr(root_item, eb->start);
 	btrfs_set_root_level(root_item, btrfs_header_level(eb));
 	btrfs_set_root_generation(root_item, trans->transid);
-	memset(&root_item->drop_progress, 0, sizeof(struct btrfs_disk_key));
-	root_item->drop_level = 0;
+
+	if (root->root_key.objectid == objectid) {
+		btrfs_set_root_refs(root_item, 0);
+		memset(&root_item->drop_progress, 0,
+		       sizeof(struct btrfs_disk_key));
+		root_item->drop_level = 0;
+	}
 
 	btrfs_tree_unlock(eb);
 	free_extent_buffer(eb);
@@ -956,6 +1316,37 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
 						 &root_key);
 	BUG_ON(IS_ERR(reloc_root));
 	reloc_root->last_trans = trans->transid;
+	return reloc_root;
+}
+
+/*
+ * create reloc tree for a given fs tree. reloc tree is just a
+ * snapshot of the fs tree with special root objectid.
+ */
+int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root)
+{
+	struct btrfs_root *reloc_root;
+	struct reloc_control *rc = root->fs_info->reloc_ctl;
+	int clear_rsv = 0;
+
+	if (root->reloc_root) {
+		reloc_root = root->reloc_root;
+		reloc_root->last_trans = trans->transid;
+		return 0;
+	}
+
+	if (!rc || !rc->create_reloc_tree ||
+	    root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
+		return 0;
+
+	if (!trans->block_rsv) {
+		trans->block_rsv = rc->block_rsv;
+		clear_rsv = 1;
+	}
+	reloc_root = create_reloc_root(trans, root, root->root_key.objectid);
+	if (clear_rsv)
+		trans->block_rsv = NULL;
 
 	__add_reloc_root(reloc_root);
 	root->reloc_root = reloc_root;
@@ -979,7 +1370,8 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
 	reloc_root = root->reloc_root;
 	root_item = &reloc_root->root_item;
 
-	if (btrfs_root_refs(root_item) == 0) {
+	if (root->fs_info->reloc_ctl->merge_reloc_tree &&
+	    btrfs_root_refs(root_item) == 0) {
 		root->reloc_root = NULL;
 		del = 1;
 	}
@@ -1101,8 +1493,7 @@ static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
 		goto out;
 	}
 
-	if (new_bytenr)
-		*new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+	*new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
 	ret = 0;
 out:
 	btrfs_free_path(path);
@@ -1113,19 +1504,18 @@ out:
  * update file extent items in the tree leaf to point to
  * the new locations.
  */
-static int replace_file_extents(struct btrfs_trans_handle *trans,
-				struct reloc_control *rc,
-				struct btrfs_root *root,
-				struct extent_buffer *leaf,
-				struct list_head *inode_list)
+static noinline_for_stack
+int replace_file_extents(struct btrfs_trans_handle *trans,
+			 struct reloc_control *rc,
+			 struct btrfs_root *root,
+			 struct extent_buffer *leaf)
 {
 	struct btrfs_key key;
 	struct btrfs_file_extent_item *fi;
 	struct inode *inode = NULL;
-	struct inodevec *ivec = NULL;
 	u64 parent;
 	u64 bytenr;
-	u64 new_bytenr;
+	u64 new_bytenr = 0;
 	u64 num_bytes;
 	u64 end;
 	u32 nritems;
@@ -1165,21 +1555,12 @@ static int replace_file_extents(struct btrfs_trans_handle *trans,
 		 * to complete and drop the extent cache
 		 */
 		if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
-			if (!ivec || ivec->nr == INODEVEC_SIZE) {
-				ivec = kmalloc(sizeof(*ivec), GFP_NOFS);
-				BUG_ON(!ivec);
-				ivec->nr = 0;
-				list_add_tail(&ivec->list, inode_list);
-			}
 			if (first) {
 				inode = find_next_inode(root, key.objectid);
-				if (inode)
-					ivec->inode[ivec->nr++] = inode;
 				first = 0;
 			} else if (inode && inode->i_ino < key.objectid) {
+				btrfs_add_delayed_iput(inode);
 				inode = find_next_inode(root, key.objectid);
-				if (inode)
-					ivec->inode[ivec->nr++] = inode;
 			}
 			if (inode && inode->i_ino == key.objectid) {
 				end = key.offset +
@@ -1203,8 +1584,10 @@ static int replace_file_extents(struct btrfs_trans_handle *trans,
 
 		ret = get_new_location(rc->data_inode, &new_bytenr,
 				       bytenr, num_bytes);
-		if (ret > 0)
+		if (ret > 0) {
+			WARN_ON(1);
 			continue;
+		}
 		BUG_ON(ret < 0);
 
 		btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
@@ -1224,6 +1607,8 @@ static int replace_file_extents(struct btrfs_trans_handle *trans,
 	}
 	if (dirty)
 		btrfs_mark_buffer_dirty(leaf);
+	if (inode)
+		btrfs_add_delayed_iput(inode);
 	return 0;
 }
 
@@ -1247,11 +1632,11 @@ int memcmp_node_keys(struct extent_buffer *eb, int slot,
  * if no block got replaced, 0 is returned. if there are other
  * errors, a negative error number is returned.
  */
-static int replace_path(struct btrfs_trans_handle *trans,
-			struct btrfs_root *dest, struct btrfs_root *src,
-			struct btrfs_path *path, struct btrfs_key *next_key,
-			struct extent_buffer **leaf,
-			int lowest_level, int max_level)
+static noinline_for_stack
+int replace_path(struct btrfs_trans_handle *trans,
+		 struct btrfs_root *dest, struct btrfs_root *src,
+		 struct btrfs_path *path, struct btrfs_key *next_key,
+		 int lowest_level, int max_level)
 {
 	struct extent_buffer *eb;
 	struct extent_buffer *parent;
@@ -1262,16 +1647,16 @@ static int replace_path(struct btrfs_trans_handle *trans,
 	u64 new_ptr_gen;
 	u64 last_snapshot;
 	u32 blocksize;
+	int cow = 0;
 	int level;
 	int ret;
 	int slot;
 
 	BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
 	BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
-	BUG_ON(lowest_level > 1 && leaf);
 
 	last_snapshot = btrfs_root_last_snapshot(&src->root_item);
-
+again:
 	slot = path->slots[lowest_level];
 	btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);
 
@@ -1285,8 +1670,10 @@ static int replace_path(struct btrfs_trans_handle *trans,
 		return 0;
 	}
 
-	ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
-	BUG_ON(ret);
+	if (cow) {
+		ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
+		BUG_ON(ret);
+	}
 	btrfs_set_lock_blocking(eb);
 
 	if (next_key) {
@@ -1330,7 +1717,7 @@ static int replace_path(struct btrfs_trans_handle *trans,
 
 		if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
 		    memcmp_node_keys(parent, slot, path, level)) {
-			if (level <= lowest_level && !leaf) {
+			if (level <= lowest_level) {
 				ret = 0;
 				break;
 			}
@@ -1338,16 +1725,12 @@ static int replace_path(struct btrfs_trans_handle *trans,
 			eb = read_tree_block(dest, old_bytenr, blocksize,
 					     old_ptr_gen);
 			btrfs_tree_lock(eb);
-			ret = btrfs_cow_block(trans, dest, eb, parent,
-					      slot, &eb);
-			BUG_ON(ret);
-			btrfs_set_lock_blocking(eb);
-
-			if (level <= lowest_level) {
-				*leaf = eb;
-				ret = 0;
-				break;
+			if (cow) {
+				ret = btrfs_cow_block(trans, dest, eb, parent,
+						      slot, &eb);
+				BUG_ON(ret);
 			}
+			btrfs_set_lock_blocking(eb);
 
 			btrfs_tree_unlock(parent);
 			free_extent_buffer(parent);
@@ -1356,6 +1739,13 @@ static int replace_path(struct btrfs_trans_handle *trans,
 			continue;
 		}
 
+		if (!cow) {
+			btrfs_tree_unlock(parent);
+			free_extent_buffer(parent);
+			cow = 1;
+			goto again;
+		}
+
 		btrfs_node_key_to_cpu(path->nodes[level], &key,
 				      path->slots[level]);
 		btrfs_release_path(src, path);
@@ -1561,20 +1951,6 @@ static int invalidate_extent_cache(struct btrfs_root *root,
 	return 0;
 }
 
-static void put_inodes(struct list_head *list)
-{
-	struct inodevec *ivec;
-	while (!list_empty(list)) {
-		ivec = list_entry(list->next, struct inodevec, list);
-		list_del(&ivec->list);
-		while (ivec->nr > 0) {
-			ivec->nr--;
-			iput(ivec->inode[ivec->nr]);
-		}
-		kfree(ivec);
-	}
-}
-
 static int find_next_key(struct btrfs_path *path, int level,
 			 struct btrfs_key *key)
 
@@ -1607,13 +1983,14 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
 	struct btrfs_root *reloc_root;
 	struct btrfs_root_item *root_item;
 	struct btrfs_path *path;
-	struct extent_buffer *leaf = NULL;
+	struct extent_buffer *leaf;
 	unsigned long nr;
 	int level;
 	int max_level;
 	int replaced = 0;
 	int ret;
 	int err = 0;
+	u32 min_reserved;
 
 	path = btrfs_alloc_path();
 	if (!path)
@@ -1647,34 +2024,23 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
 		btrfs_unlock_up_safe(path, 0);
 	}
 
-	if (level == 0 && rc->stage == UPDATE_DATA_PTRS) {
-		trans = btrfs_start_transaction(root, 1);
+	min_reserved = root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
+	memset(&next_key, 0, sizeof(next_key));
 
-		leaf = path->nodes[0];
-		btrfs_item_key_to_cpu(leaf, &key, 0);
-		btrfs_release_path(reloc_root, path);
+	while (1) {
+		trans = btrfs_start_transaction(root, 0);
+		trans->block_rsv = rc->block_rsv;
 
-		ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
-		if (ret < 0) {
-			err = ret;
-			goto out;
+		ret = btrfs_block_rsv_check(trans, root, rc->block_rsv,
+					    min_reserved, 0);
+		if (ret) {
+			BUG_ON(ret != -EAGAIN);
+			ret = btrfs_commit_transaction(trans, root);
+			BUG_ON(ret);
+			continue;
 		}
 
-		leaf = path->nodes[0];
-		btrfs_unlock_up_safe(path, 1);
-		ret = replace_file_extents(trans, rc, root, leaf,
-					   &inode_list);
-		if (ret < 0)
-			err = ret;
-		goto out;
-	}
-
-	memset(&next_key, 0, sizeof(next_key));
-
-	while (1) {
-		leaf = NULL;
 		replaced = 0;
-		trans = btrfs_start_transaction(root, 1);
 		max_level = level;
 
 		ret = walk_down_reloc_tree(reloc_root, path, &level);
@@ -1688,14 +2054,9 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
 		if (!find_next_key(path, level, &key) &&
 		    btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
 			ret = 0;
-		} else if (level == 1 && rc->stage == UPDATE_DATA_PTRS) {
-			ret = replace_path(trans, root, reloc_root,
-					   path, &next_key, &leaf,
-					   level, max_level);
 		} else {
-			ret = replace_path(trans, root, reloc_root,
-					   path, &next_key, NULL,
-					   level, max_level);
+			ret = replace_path(trans, root, reloc_root, path,
+					   &next_key, level, max_level);
 		}
 		if (ret < 0) {
 			err = ret;
@@ -1707,16 +2068,6 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
 			btrfs_node_key_to_cpu(path->nodes[level], &key,
 					      path->slots[level]);
 			replaced = 1;
-		} else if (leaf) {
-			/*
-			 * no block got replaced, try replacing file extents
-			 */
-			btrfs_item_key_to_cpu(leaf, &key, 0);
-			ret = replace_file_extents(trans, rc, root, leaf,
-						   &inode_list);
-			btrfs_tree_unlock(leaf);
-			free_extent_buffer(leaf);
-			BUG_ON(ret < 0);
 		}
 
 		ret = walk_up_reloc_tree(reloc_root, path, &level);
@@ -1733,15 +2084,10 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
 		root_item->drop_level = level;
 
 		nr = trans->blocks_used;
-		btrfs_end_transaction(trans, root);
+		btrfs_end_transaction_throttle(trans, root);
 
 		btrfs_btree_balance_dirty(root, nr);
 
-		/*
-		 * put inodes outside transaction, otherwise we may deadlock.
-		 */
-		put_inodes(&inode_list);
-
 		if (replaced && rc->stage == UPDATE_DATA_PTRS)
 			invalidate_extent_cache(root, &key, &next_key);
 	}
@@ -1764,87 +2110,125 @@ out:
 		       sizeof(root_item->drop_progress));
 		root_item->drop_level = 0;
 		btrfs_set_root_refs(root_item, 0);
+		btrfs_update_reloc_root(trans, root);
 	}
 
 	nr = trans->blocks_used;
-	btrfs_end_transaction(trans, root);
+	btrfs_end_transaction_throttle(trans, root);
 
 	btrfs_btree_balance_dirty(root, nr);
 
-	put_inodes(&inode_list);
-
 	if (replaced && rc->stage == UPDATE_DATA_PTRS)
 		invalidate_extent_cache(root, &key, &next_key);
 
 	return err;
 }
 
-/*
- * callback for the work threads.
- * this function merges reloc tree with corresponding fs tree,
- * and then drops the reloc tree.
- */
-static void merge_func(struct btrfs_work *work)
+static noinline_for_stack
+int prepare_to_merge(struct reloc_control *rc, int err)
 {
-	struct btrfs_trans_handle *trans;
-	struct btrfs_root *root;
+	struct btrfs_root *root = rc->extent_root;
 	struct btrfs_root *reloc_root;
-	struct async_merge *async;
+	struct btrfs_trans_handle *trans;
+	LIST_HEAD(reloc_roots);
+	u64 num_bytes = 0;
+	int ret;
+	int retries = 0;
+
+	mutex_lock(&root->fs_info->trans_mutex);
+	rc->merging_rsv_size += root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
+	rc->merging_rsv_size += rc->nodes_relocated * 2;
+	mutex_unlock(&root->fs_info->trans_mutex);
+again:
+	if (!err) {
+		num_bytes = rc->merging_rsv_size;
+		ret = btrfs_block_rsv_add(NULL, root, rc->block_rsv,
+					  num_bytes, &retries);
+		if (ret)
+			err = ret;
+	}
+
+	trans = btrfs_join_transaction(rc->extent_root, 1);
+
+	if (!err) {
+		if (num_bytes != rc->merging_rsv_size) {
+			btrfs_end_transaction(trans, rc->extent_root);
+			btrfs_block_rsv_release(rc->extent_root,
+						rc->block_rsv, num_bytes);
+			retries = 0;
+			goto again;
+		}
+	}
 
-	async = container_of(work, struct async_merge, work);
-	reloc_root = async->root;
+	rc->merge_reloc_tree = 1;
+
+	while (!list_empty(&rc->reloc_roots)) {
+		reloc_root = list_entry(rc->reloc_roots.next,
+					struct btrfs_root, root_list);
+		list_del_init(&reloc_root->root_list);
 
-	if (btrfs_root_refs(&reloc_root->root_item) > 0) {
 		root = read_fs_root(reloc_root->fs_info,
 				    reloc_root->root_key.offset);
 		BUG_ON(IS_ERR(root));
 		BUG_ON(root->reloc_root != reloc_root);
 
-		merge_reloc_root(async->rc, root);
-
-		trans = btrfs_start_transaction(root, 1);
+		/*
+		 * set reference count to 1, so btrfs_recover_relocation
+		 * knows it should resumes merging
+		 */
+		if (!err)
+			btrfs_set_root_refs(&reloc_root->root_item, 1);
 		btrfs_update_reloc_root(trans, root);
-		btrfs_end_transaction(trans, root);
-	}
 
-	btrfs_drop_snapshot(reloc_root, 0);
+		list_add(&reloc_root->root_list, &reloc_roots);
+	}
 
-	if (atomic_dec_and_test(async->num_pending))
-		complete(async->done);
+	list_splice(&reloc_roots, &rc->reloc_roots);
 
-	kfree(async);
+	if (!err)
+		btrfs_commit_transaction(trans, rc->extent_root);
+	else
+		btrfs_end_transaction(trans, rc->extent_root);
+	return err;
 }
 
-static int merge_reloc_roots(struct reloc_control *rc)
+static noinline_for_stack
+int merge_reloc_roots(struct reloc_control *rc)
 {
-	struct async_merge *async;
 	struct btrfs_root *root;
-	struct completion done;
-	atomic_t num_pending;
+	struct btrfs_root *reloc_root;
+	LIST_HEAD(reloc_roots);
+	int found = 0;
+	int ret;
+again:
+	root = rc->extent_root;
+	mutex_lock(&root->fs_info->trans_mutex);
+	list_splice_init(&rc->reloc_roots, &reloc_roots);
+	mutex_unlock(&root->fs_info->trans_mutex);
 
-	init_completion(&done);
-	atomic_set(&num_pending, 1);
+	while (!list_empty(&reloc_roots)) {
+		found = 1;
+		reloc_root = list_entry(reloc_roots.next,
+					struct btrfs_root, root_list);
 
-	while (!list_empty(&rc->reloc_roots)) {
-		root = list_entry(rc->reloc_roots.next,
-				  struct btrfs_root, root_list);
-		list_del_init(&root->root_list);
+		if (btrfs_root_refs(&reloc_root->root_item) > 0) {
+			root = read_fs_root(reloc_root->fs_info,
+					    reloc_root->root_key.offset);
+			BUG_ON(IS_ERR(root));
+			BUG_ON(root->reloc_root != reloc_root);
 
-		async = kmalloc(sizeof(*async), GFP_NOFS);
-		BUG_ON(!async);
-		async->work.func = merge_func;
-		async->work.flags = 0;
-		async->rc = rc;
-		async->root = root;
-		async->done = &done;
-		async->num_pending = &num_pending;
-		atomic_inc(&num_pending);
-		btrfs_queue_worker(&rc->workers, &async->work);
+			ret = merge_reloc_root(rc, root);
+			BUG_ON(ret);
+		} else {
+			list_del_init(&reloc_root->root_list);
+		}
+		btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0);
 	}
 
-	if (!atomic_dec_and_test(&num_pending))
-		wait_for_completion(&done);
-
+	if (found) {
+		found = 0;
+		goto again;
+	}
 	BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
 	return 0;
 }
@@ -1875,119 +2259,169 @@ static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
 	return btrfs_record_root_in_trans(trans, root);
 }
 
-/*
- * select one tree from trees that references the block.
- * for blocks in refernce counted trees, we preper reloc tree.
- * if no reloc tree found and reloc_only is true, NULL is returned.
- */
-static struct btrfs_root *__select_one_root(struct btrfs_trans_handle *trans,
-					    struct backref_node *node,
-					    struct backref_edge *edges[],
-					    int *nr, int reloc_only)
+static noinline_for_stack
+struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
+				     struct reloc_control *rc,
+				     struct backref_node *node,
+				     struct backref_edge *edges[], int *nr)
 {
 	struct backref_node *next;
 	struct btrfs_root *root;
-	int index;
-	int loop = 0;
-again:
-	index = 0;
+	int index = 0;
+
 	next = node;
 	while (1) {
 		cond_resched();
 		next = walk_up_backref(next, edges, &index);
 		root = next->root;
-		if (!root) {
-			BUG_ON(!node->old_root);
-			goto skip;
-		}
-
-		/* no other choice for non-refernce counted tree */
-		if (!root->ref_cows) {
-			BUG_ON(reloc_only);
-			break;
-		}
+		BUG_ON(!root);
+		BUG_ON(!root->ref_cows);
 
 		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
 			record_reloc_root_in_trans(trans, root);
 			break;
 		}
 
-		if (loop) {
-			btrfs_record_root_in_trans(trans, root);
+		btrfs_record_root_in_trans(trans, root);
+		root = root->reloc_root;
+
+		if (next->new_bytenr != root->node->start) {
+			BUG_ON(next->new_bytenr);
+			BUG_ON(!list_empty(&next->list));
+			next->new_bytenr = root->node->start;
+			next->root = root;
+			list_add_tail(&next->list,
+				      &rc->backref_cache.changed);
+			__mark_block_processed(rc, next);
 			break;
 		}
 
-		if (reloc_only || next != node) {
-			if (!root->reloc_root)
-				btrfs_record_root_in_trans(trans, root);
-			root = root->reloc_root;
-			/*
-			 * if the reloc tree was created in current
-			 * transation, there is no node in backref tree
-			 * corresponds to the root of the reloc tree.
-			 */
-			if (btrfs_root_last_snapshot(&root->root_item) ==
-			    trans->transid - 1)
-				break;
-		}
-skip:
+		WARN_ON(1);
 		root = NULL;
 		next = walk_down_backref(edges, &index);
 		if (!next || next->level <= node->level)
 			break;
 	}
+	if (!root)
+		return NULL;
 
-	if (!root && !loop && !reloc_only) {
-		loop = 1;
-		goto again;
+	*nr = index;
+	next = node;
+	/* setup backref node path for btrfs_reloc_cow_block */
+	while (1) {
+		rc->backref_cache.path[next->level] = next;
+		if (--index < 0)
+			break;
+		next = edges[index]->node[UPPER];
 	}
-
-	if (root)
-		*nr = index;
-	else
-		*nr = 0;
-
 	return root;
 }
 
+/*
+ * select a tree root for relocation. return NULL if the block
+ * is reference counted. we should use do_relocation() in this
+ * case. return a tree root pointer if the block isn't reference
+ * counted. return -ENOENT if the block is root of reloc tree.
+ */
 static noinline_for_stack
 struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans,
 				   struct backref_node *node)
 {
+	struct backref_node *next;
+	struct btrfs_root *root;
+	struct btrfs_root *fs_root = NULL;
 	struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
-	int nr;
-	return __select_one_root(trans, node, edges, &nr, 0);
+	int index = 0;
+
+	next = node;
+	while (1) {
+		cond_resched();
+		next = walk_up_backref(next, edges, &index);
+		root = next->root;
+		BUG_ON(!root);
+
+		/* no other choice for non-refernce counted tree */
+		if (!root->ref_cows)
+			return root;
+
+		if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID)
+			fs_root = root;
+
+		if (next != node)
+			return NULL;
+
+		next = walk_down_backref(edges, &index);
+		if (!next || next->level <= node->level)
+			break;
+	}
+
+	if (!fs_root)
+		return ERR_PTR(-ENOENT);
+	return fs_root;
 }
 
 static noinline_for_stack
-struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
-				     struct backref_node *node,
-				     struct backref_edge *edges[], int *nr)
+u64 calcu_metadata_size(struct reloc_control *rc,
+			struct backref_node *node, int reserve)
 {
-	return __select_one_root(trans, node, edges, nr, 1);
+	struct backref_node *next = node;
+	struct backref_edge *edge;
+	struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
+	u64 num_bytes = 0;
+	int index = 0;
+
+	BUG_ON(reserve && node->processed);
+
+	while (next) {
+		cond_resched();
+		while (1) {
+			if (next->processed && (reserve || next != node))
+				break;
+
+			num_bytes += btrfs_level_size(rc->extent_root,
+						      next->level);
+
+			if (list_empty(&next->upper))
+				break;
+
+			edge = list_entry(next->upper.next,
+					  struct backref_edge, list[LOWER]);
+			edges[index++] = edge;
+			next = edge->node[UPPER];
+		}
+		next = walk_down_backref(edges, &index);
+	}
+	return num_bytes;
 }
 
-static void grab_path_buffers(struct btrfs_path *path,
-			      struct backref_node *node,
-			      struct backref_edge *edges[], int nr)
+static int reserve_metadata_space(struct btrfs_trans_handle *trans,
+				  struct reloc_control *rc,
+				  struct backref_node *node)
 {
-	int i = 0;
-	while (1) {
-		drop_node_buffer(node);
-		node->eb = path->nodes[node->level];
-		BUG_ON(!node->eb);
-		if (path->locks[node->level])
-			node->locked = 1;
-		path->nodes[node->level] = NULL;
-		path->locks[node->level] = 0;
-
-		if (i >= nr)
-			break;
+	struct btrfs_root *root = rc->extent_root;
+	u64 num_bytes;
+	int ret;
+
+	num_bytes = calcu_metadata_size(rc, node, 1) * 2;
 
-		edges[i]->blockptr = node->eb->start;
-		node = edges[i]->node[UPPER];
-		i++;
+	trans->block_rsv = rc->block_rsv;
+	ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes,
+				  &rc->block_rsv_retries);
+	if (ret) {
+		if (ret == -EAGAIN)
+			rc->commit_transaction = 1;
+		return ret;
 	}
+
+	rc->block_rsv_retries = 0;
+	return 0;
+}
+
+static void release_metadata_space(struct reloc_control *rc,
+				   struct backref_node *node)
+{
+	u64 num_bytes = calcu_metadata_size(rc, node, 0) * 2;
+	btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, num_bytes);
 }
 
 /*
@@ -1998,6 +2432,7 @@ static void grab_path_buffers(struct btrfs_path *path,
  * in that case this function just updates pointers.
  */
 static int do_relocation(struct btrfs_trans_handle *trans,
+			 struct reloc_control *rc,
 			 struct backref_node *node,
 			 struct btrfs_key *key,
 			 struct btrfs_path *path, int lowest)
@@ -2018,18 +2453,25 @@ static int do_relocation(struct btrfs_trans_handle *trans,
 	BUG_ON(lowest && node->eb);
 
 	path->lowest_level = node->level + 1;
+	rc->backref_cache.path[node->level] = node;
 	list_for_each_entry(edge, &node->upper, list[LOWER]) {
 		cond_resched();
-		if (node->eb && node->eb->start == edge->blockptr)
-			continue;
 
 		upper = edge->node[UPPER];
-		root = select_reloc_root(trans, upper, edges, &nr);
-		if (!root)
-			continue;
-
-		if (upper->eb && !upper->locked)
+		root = select_reloc_root(trans, rc, upper, edges, &nr);
+		BUG_ON(!root);
+
+		if (upper->eb && !upper->locked) {
+			if (!lowest) {
+				ret = btrfs_bin_search(upper->eb, key,
+						       upper->level, &slot);
+				BUG_ON(ret);
+				bytenr = btrfs_node_blockptr(upper->eb, slot);
+				if (node->eb->start == bytenr)
+					goto next;
+			}
 			drop_node_buffer(upper);
+		}
 
 		if (!upper->eb) {
 			ret = btrfs_search_slot(trans, root, key, path, 0, 1);
@@ -2039,11 +2481,17 @@ static int do_relocation(struct btrfs_trans_handle *trans,
 			}
 			BUG_ON(ret > 0);
 
-			slot = path->slots[upper->level];
+			if (!upper->eb) {
+				upper->eb = path->nodes[upper->level];
+				path->nodes[upper->level] = NULL;
+			} else {
+				BUG_ON(upper->eb != path->nodes[upper->level]);
+			}
 
-			btrfs_unlock_up_safe(path, upper->level + 1);
-			grab_path_buffers(path, upper, edges, nr);
+			upper->locked = 1;
+			path->locks[upper->level] = 0;
 
+			slot = path->slots[upper->level];
 			btrfs_release_path(NULL, path);
 		} else {
 			ret = btrfs_bin_search(upper->eb, key, upper->level,
@@ -2052,14 +2500,11 @@ static int do_relocation(struct btrfs_trans_handle *trans,
 		}
 
 		bytenr = btrfs_node_blockptr(upper->eb, slot);
-		if (!lowest) {
-			if (node->eb->start == bytenr) {
-				btrfs_tree_unlock(upper->eb);
-				upper->locked = 0;
-				continue;
-			}
+		if (lowest) {
+			BUG_ON(bytenr != node->bytenr);
 		} else {
-			BUG_ON(node->bytenr != bytenr);
+			if (node->eb->start == bytenr)
+				goto next;
 		}
 
 		blocksize = btrfs_level_size(root, node->level);
@@ -2071,13 +2516,13 @@ static int do_relocation(struct btrfs_trans_handle *trans,
 		if (!node->eb) {
 			ret = btrfs_cow_block(trans, root, eb, upper->eb,
 					      slot, &eb);
+			btrfs_tree_unlock(eb);
+			free_extent_buffer(eb);
 			if (ret < 0) {
 				err = ret;
-				break;
+				goto next;
 			}
-			btrfs_set_lock_blocking(eb);
-			node->eb = eb;
-			node->locked = 1;
+			BUG_ON(node->eb != eb);
 		} else {
 			btrfs_set_node_blockptr(upper->eb, slot,
 						node->eb->start);
@@ -2095,67 +2540,80 @@ static int do_relocation(struct btrfs_trans_handle *trans,
 			ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
 			BUG_ON(ret);
 		}
-		if (!lowest) {
-			btrfs_tree_unlock(upper->eb);
-			upper->locked = 0;
-		}
+next:
+		if (!upper->pending)
+			drop_node_buffer(upper);
+		else
+			unlock_node_buffer(upper);
+		if (err)
+			break;
 	}
+
+	if (!err && node->pending) {
+		drop_node_buffer(node);
+		list_move_tail(&node->list, &rc->backref_cache.changed);
+		node->pending = 0;
+	}
+
 	path->lowest_level = 0;
+	BUG_ON(err == -ENOSPC);
 	return err;
 }
 
 static int link_to_upper(struct btrfs_trans_handle *trans,
+			 struct reloc_control *rc,
 			 struct backref_node *node,
 			 struct btrfs_path *path)
 {
 	struct btrfs_key key;
-	if (!node->eb || list_empty(&node->upper))
-		return 0;
 
 	btrfs_node_key_to_cpu(node->eb, &key, 0);
-	return do_relocation(trans, node, &key, path, 0);
+	return do_relocation(trans, rc, node, &key, path, 0);
 }
 
 static int finish_pending_nodes(struct btrfs_trans_handle *trans,
-				struct backref_cache *cache,
-				struct btrfs_path *path)
+				struct reloc_control *rc,
+				struct btrfs_path *path, int err)
 {
+	LIST_HEAD(list);
+	struct backref_cache *cache = &rc->backref_cache;
 	struct backref_node *node;
 	int level;
 	int ret;
-	int err = 0;
 
 	for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
 		while (!list_empty(&cache->pending[level])) {
 			node = list_entry(cache->pending[level].next,
-					  struct backref_node, lower);
-			BUG_ON(node->level != level);
+					  struct backref_node, list);
+			list_move_tail(&node->list, &list);
+			BUG_ON(!node->pending);
 
-			ret = link_to_upper(trans, node, path);
-			if (ret < 0)
-				err = ret;
-			/*
-			 * this remove the node from the pending list and
-			 * may add some other nodes to the level + 1
-			 * pending list
-			 */
-			remove_backref_node(cache, node);
+			if (!err) {
+				ret = link_to_upper(trans, rc, node, path);
+				if (ret < 0)
+					err = ret;
+			}
 		}
+		list_splice_init(&list, &cache->pending[level]);
 	}
-	BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root));
 	return err;
 }
 
 static void mark_block_processed(struct reloc_control *rc,
-				 struct backref_node *node)
+				 u64 bytenr, u32 blocksize)
+{
+	set_extent_bits(&rc->processed_blocks, bytenr, bytenr + blocksize - 1,
+			EXTENT_DIRTY, GFP_NOFS);
+}
+
+static void __mark_block_processed(struct reloc_control *rc,
+				   struct backref_node *node)
 {
 	u32 blocksize;
 	if (node->level == 0 ||
 	    in_block_group(node->bytenr, rc->block_group)) {
 		blocksize = btrfs_level_size(rc->extent_root, node->level);
-		set_extent_bits(&rc->processed_blocks, node->bytenr,
-				node->bytenr + blocksize - 1, EXTENT_DIRTY,
-				GFP_NOFS);
+		mark_block_processed(rc, node->bytenr, blocksize);
 	}
 	node->processed = 1;
 }
@@ -2178,7 +2636,7 @@ static void update_processed_blocks(struct reloc_control *rc,
 			if (next->processed)
 				break;
 
-			mark_block_processed(rc, next);
+			__mark_block_processed(rc, next);
 
 			if (list_empty(&next->upper))
 				break;
@@ -2201,138 +2659,6 @@ static int tree_block_processed(u64 bytenr, u32 blocksize,
 	return 0;
 }
 
-/*
- * check if there are any file extent pointers in the leaf point to
- * data require processing
- */
-static int check_file_extents(struct reloc_control *rc,
-			      u64 bytenr, u32 blocksize, u64 ptr_gen)
-{
-	struct btrfs_key found_key;
-	struct btrfs_file_extent_item *fi;
-	struct extent_buffer *leaf;
-	u32 nritems;
-	int i;
-	int ret = 0;
-
-	leaf = read_tree_block(rc->extent_root, bytenr, blocksize, ptr_gen);
-
-	nritems = btrfs_header_nritems(leaf);
-	for (i = 0; i < nritems; i++) {
-		cond_resched();
-		btrfs_item_key_to_cpu(leaf, &found_key, i);
-		if (found_key.type != BTRFS_EXTENT_DATA_KEY)
-			continue;
-		fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
-		if (btrfs_file_extent_type(leaf, fi) ==
-		    BTRFS_FILE_EXTENT_INLINE)
-			continue;
-		bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
-		if (bytenr == 0)
-			continue;
-		if (in_block_group(bytenr, rc->block_group)) {
-			ret = 1;
-			break;
-		}
-	}
-	free_extent_buffer(leaf);
-	return ret;
-}
-
-/*
- * scan child blocks of a given block to find blocks require processing
- */
-static int add_child_blocks(struct btrfs_trans_handle *trans,
-			    struct reloc_control *rc,
-			    struct backref_node *node,
-			    struct rb_root *blocks)
-{
-	struct tree_block *block;
-	struct rb_node *rb_node;
-	u64 bytenr;
-	u64 ptr_gen;
-	u32 blocksize;
-	u32 nritems;
-	int i;
-	int err = 0;
-
-	nritems = btrfs_header_nritems(node->eb);
-	blocksize = btrfs_level_size(rc->extent_root, node->level - 1);
-	for (i = 0; i < nritems; i++) {
-		cond_resched();
-		bytenr = btrfs_node_blockptr(node->eb, i);
-		ptr_gen = btrfs_node_ptr_generation(node->eb, i);
-		if (ptr_gen == trans->transid)
-			continue;
-		if (!in_block_group(bytenr, rc->block_group) &&
-		    (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS))
-			continue;
-		if (tree_block_processed(bytenr, blocksize, rc))
-			continue;
-
-		readahead_tree_block(rc->extent_root,
-				     bytenr, blocksize, ptr_gen);
-	}
-
-	for (i = 0; i < nritems; i++) {
-		cond_resched();
-		bytenr = btrfs_node_blockptr(node->eb, i);
-		ptr_gen = btrfs_node_ptr_generation(node->eb, i);
-		if (ptr_gen == trans->transid)
-			continue;
-		if (!in_block_group(bytenr, rc->block_group) &&
-		    (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS))
-			continue;
-		if (tree_block_processed(bytenr, blocksize, rc))
-			continue;
-		if (!in_block_group(bytenr, rc->block_group) &&
-		    !check_file_extents(rc, bytenr, blocksize, ptr_gen))
-			continue;
-
-		block = kmalloc(sizeof(*block), GFP_NOFS);
-		if (!block) {
-			err = -ENOMEM;
-			break;
-		}
-		block->bytenr = bytenr;
-		btrfs_node_key_to_cpu(node->eb, &block->key, i);
-		block->level = node->level - 1;
-		block->key_ready = 1;
-		rb_node = tree_insert(blocks, block->bytenr, &block->rb_node);
-		BUG_ON(rb_node);
-	}
-	if (err)
-		free_block_list(blocks);
-	return err;
-}
-
-/*
- * find adjacent blocks require processing
- */
-static noinline_for_stack
-int add_adjacent_blocks(struct btrfs_trans_handle *trans,
-			struct reloc_control *rc,
-			struct backref_cache *cache,
-			struct rb_root *blocks, int level,
-			struct backref_node **upper)
-{
-	struct backref_node *node;
-	int ret = 0;
-
-	WARN_ON(!list_empty(&cache->pending[level]));
-
-	if (list_empty(&cache->pending[level + 1]))
-		return 1;
-
-	node = list_entry(cache->pending[level + 1].next,
-			  struct backref_node, lower);
-	if (node->eb)
-		ret = add_child_blocks(trans, rc, node, blocks);
-
-	*upper = node;
-	return ret;
-}
-
 static int get_tree_block_key(struct reloc_control *rc,
 			      struct tree_block *block)
 {
@@ -2370,40 +2696,53 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
 				struct btrfs_path *path)
 {
 	struct btrfs_root *root;
-	int ret;
+	int release = 0;
+	int ret = 0;
 
+	if (!node)
+		return 0;
+
+	BUG_ON(node->processed);
 	root = select_one_root(trans, node);
-	if (unlikely(!root)) {
-		rc->found_old_snapshot = 1;
+	if (root == ERR_PTR(-ENOENT)) {
 		update_processed_blocks(rc, node);
-		return 0;
+		goto out;
 	}
 
-	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
-		ret = do_relocation(trans, node, key, path, 1);
-		if (ret < 0)
-			goto out;
-		if (node->level == 0 && rc->stage == UPDATE_DATA_PTRS) {
-			ret = replace_file_extents(trans, rc, root,
-						   node->eb, NULL);
-			if (ret < 0)
-				goto out;
-		}
-		drop_node_buffer(node);
-	} else if (!root->ref_cows) {
-		path->lowest_level = node->level;
-		ret = btrfs_search_slot(trans, root, key, path, 0, 1);
-		btrfs_release_path(root, path);
-		if (ret < 0)
+	if (!root || root->ref_cows) {
+		ret = reserve_metadata_space(trans, rc, node);
+		if (ret)
 			goto out;
-	} else if (root != node->root) {
-		WARN_ON(node->level > 0 || rc->stage != UPDATE_DATA_PTRS);
+		release = 1;
 	}
 
-	update_processed_blocks(rc, node);
-	ret = 0;
+	if (root) {
+		if (root->ref_cows) {
+			BUG_ON(node->new_bytenr);
+			BUG_ON(!list_empty(&node->list));
+			btrfs_record_root_in_trans(trans, root);
+			root = root->reloc_root;
+			node->new_bytenr = root->node->start;
+			node->root = root;
+			list_add_tail(&node->list, &rc->backref_cache.changed);
+		} else {
+			path->lowest_level = node->level;
+			ret = btrfs_search_slot(trans, root, key, path, 0, 1);
+			btrfs_release_path(root, path);
+			if (ret > 0)
+				ret = 0;
+		}
+		if (!ret)
+			update_processed_blocks(rc, node);
+	} else {
+		ret = do_relocation(trans, rc, node, key, path, 1);
+	}
 out:
-	drop_node_buffer(node);
+	if (ret || node->level == 0 || node->cowonly) {
+		if (release)
+			release_metadata_space(rc, node);
+		remove_backref_node(&rc->backref_cache, node);
+	}
 	return ret;
 }
 
@@ -2414,12 +2753,10 @@ static noinline_for_stack
 int relocate_tree_blocks(struct btrfs_trans_handle *trans,
 			 struct reloc_control *rc, struct rb_root *blocks)
 {
-	struct backref_cache *cache;
 	struct backref_node *node;
 	struct btrfs_path *path;
 	struct tree_block *block;
 	struct rb_node *rb_node;
-	int level = -1;
 	int ret;
 	int err = 0;
 
@@ -2427,21 +2764,9 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
 	if (!path)
 		return -ENOMEM;
 
-	cache = kmalloc(sizeof(*cache), GFP_NOFS);
-	if (!cache) {
-		btrfs_free_path(path);
-		return -ENOMEM;
-	}
-
-	backref_cache_init(cache);
-
 	rb_node = rb_first(blocks);
 	while (rb_node) {
 		block = rb_entry(rb_node, struct tree_block, rb_node);
-		if (level == -1)
-			level = block->level;
-		else
-			BUG_ON(level != block->level);
 		if (!block->key_ready)
 			reada_tree_block(rc, block);
 		rb_node = rb_next(rb_node);
@@ -2459,7 +2784,7 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
 	while (rb_node) {
 		block = rb_entry(rb_node, struct tree_block, rb_node);
 
-		node = build_backref_tree(rc, cache, &block->key,
+		node = build_backref_tree(rc, &block->key,
 					  block->level, block->bytenr);
 		if (IS_ERR(node)) {
 			err = PTR_ERR(node);
@@ -2469,79 +2794,62 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
 		ret = relocate_tree_block(trans, rc, node, &block->key,
 					  path);
 		if (ret < 0) {
-			err = ret;
+			if (ret != -EAGAIN || rb_node == rb_first(blocks))
+				err = ret;
 			goto out;
 		}
-		remove_backref_node(cache, node);
 		rb_node = rb_next(rb_node);
 	}
-
-	if (level > 0)
-		goto out;
-
+out:
 	free_block_list(blocks);
+	err = finish_pending_nodes(trans, rc, path, err);
 
-	/*
-	 * now backrefs of some upper level tree blocks have been cached,
-	 * try relocating blocks referenced by these upper level blocks.
-	 */
-	while (1) {
-		struct backref_node *upper = NULL;
-		if (trans->transaction->in_commit ||
-		    trans->transaction->delayed_refs.flushing)
-			break;
+	btrfs_free_path(path);
+	return err;
+}
 
-		ret = add_adjacent_blocks(trans, rc, cache, blocks, level,
-					  &upper);
-		if (ret < 0)
-			err = ret;
-		if (ret != 0)
-			break;
+static noinline_for_stack
+int prealloc_file_extent_cluster(struct inode *inode,
+				 struct file_extent_cluster *cluster)
+{
+	u64 alloc_hint = 0;
+	u64 start;
+	u64 end;
+	u64 offset = BTRFS_I(inode)->index_cnt;
+	u64 num_bytes;
+	int nr = 0;
+	int ret = 0;
 
-		rb_node = rb_first(blocks);
-		while (rb_node) {
-			block = rb_entry(rb_node, struct tree_block, rb_node);
-			if (trans->transaction->in_commit ||
-			    trans->transaction->delayed_refs.flushing)
-				goto out;
-			BUG_ON(!block->key_ready);
-			node = build_backref_tree(rc, cache, &block->key,
-						  level, block->bytenr);
-			if (IS_ERR(node)) {
-				err = PTR_ERR(node);
-				goto out;
-			}
+	BUG_ON(cluster->start != cluster->boundary[0]);
+	mutex_lock(&inode->i_mutex);
 
-			ret = relocate_tree_block(trans, rc, node,
-						  &block->key, path);
-			if (ret < 0) {
-				err = ret;
-				goto out;
-			}
-			remove_backref_node(cache, node);
-			rb_node = rb_next(rb_node);
-		}
-		free_block_list(blocks);
+	ret = btrfs_check_data_free_space(inode, cluster->end +
+					  1 - cluster->start);
+	if (ret)
+		goto out;
 
-		if (upper) {
-			ret = link_to_upper(trans, upper, path);
-			if (ret < 0) {
-				err = ret;
-				break;
-			}
-			remove_backref_node(cache, upper);
-		}
+	while (nr < cluster->nr) {
+		start = cluster->boundary[nr] - offset;
+		if (nr + 1 < cluster->nr)
+			end = cluster->boundary[nr + 1] - 1 - offset;
+		else
+			end = cluster->end - offset;
+
+		lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
+		num_bytes = end + 1 - start;
+		ret = btrfs_prealloc_file_range(inode, 0, start,
+						num_bytes, num_bytes,
+						end + 1, &alloc_hint);
+		unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
+		if (ret)
+			break;
+		nr++;
 	}
+	btrfs_free_reserved_data_space(inode, cluster->end +
+				       1 - cluster->start);
 out:
-	free_block_list(blocks);
-
-	ret = finish_pending_nodes(trans, cache, path);
-	if (ret < 0)
-		err = ret;
-
-	kfree(cache);
-	btrfs_free_path(path);
-	return err;
+	mutex_unlock(&inode->i_mutex);
+	return ret;
 }
 
 static noinline_for_stack
@@ -2587,7 +2895,6 @@ static int relocate_file_extent_cluster(struct inode *inode,
 	u64 offset = BTRFS_I(inode)->index_cnt;
 	unsigned long index;
 	unsigned long last_index;
-	unsigned int dirty_page = 0;
 	struct page *page;
 	struct file_ra_state *ra;
 	int nr = 0;
@@ -2600,21 +2907,24 @@ static int relocate_file_extent_cluster(struct inode *inode,
 	if (!ra)
 		return -ENOMEM;
 
-	index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
-	last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
+	ret = prealloc_file_extent_cluster(inode, cluster);
+	if (ret)
+		goto out;
 
-	mutex_lock(&inode->i_mutex);
+	file_ra_state_init(ra, inode->i_mapping);
 
-	i_size_write(inode, cluster->end + 1 - offset);
 	ret = setup_extent_mapping(inode, cluster->start - offset,
 				   cluster->end - offset, cluster->start);
 	if (ret)
-		goto out_unlock;
-
-	file_ra_state_init(ra, inode->i_mapping);
+		goto out;
 
-	WARN_ON(cluster->start != cluster->boundary[0]);
+	index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
+	last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
 	while (index <= last_index) {
+		ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE);
+		if (ret)
+			goto out;
+
 		page = find_lock_page(inode->i_mapping, index);
 		if (!page) {
 			page_cache_sync_readahead(inode->i_mapping,
@@ -2622,8 +2932,10 @@ static int relocate_file_extent_cluster(struct inode *inode,
 						  last_index + 1 - index);
 			page = grab_cache_page(inode->i_mapping, index);
 			if (!page) {
+				btrfs_delalloc_release_metadata(inode,
+							PAGE_CACHE_SIZE);
 				ret = -ENOMEM;
-				goto out_unlock;
+				goto out;
 			}
 		}
 
@@ -2639,8 +2951,10 @@ static int relocate_file_extent_cluster(struct inode *inode,
 			if (!PageUptodate(page)) {
 				unlock_page(page);
 				page_cache_release(page);
+				btrfs_delalloc_release_metadata(inode,
+							PAGE_CACHE_SIZE);
 				ret = -EIO;
-				goto out_unlock;
+				goto out;
 			}
 		}
 
@@ -2659,10 +2973,9 @@ static int relocate_file_extent_cluster(struct inode *inode,
 					EXTENT_BOUNDARY, GFP_NOFS);
 			nr++;
 		}
-		btrfs_set_extent_delalloc(inode, page_start, page_end);
 
+		btrfs_set_extent_delalloc(inode, page_start, page_end, NULL);
 		set_page_dirty(page);
-		dirty_page++;
 
 		unlock_extent(&BTRFS_I(inode)->io_tree,
 			      page_start, page_end, GFP_NOFS);
@@ -2670,20 +2983,11 @@ static int relocate_file_extent_cluster(struct inode *inode,
 		page_cache_release(page);
 
 		index++;
-		if (nr < cluster->nr &&
-		    page_end + 1 + offset == cluster->boundary[nr]) {
-			balance_dirty_pages_ratelimited_nr(inode->i_mapping,
-							   dirty_page);
-			dirty_page = 0;
-		}
-	}
-	if (dirty_page) {
-		balance_dirty_pages_ratelimited_nr(inode->i_mapping,
-						   dirty_page);
+		balance_dirty_pages_ratelimited(inode->i_mapping);
+		btrfs_throttle(BTRFS_I(inode)->root);
 	}
 	WARN_ON(nr != cluster->nr);
-out_unlock:
-	mutex_unlock(&inode->i_mutex);
+out:
 	kfree(ra);
 	return ret;
 }
@@ -2869,9 +3173,6 @@ out:
 static int block_use_full_backref(struct reloc_control *rc,
 				  struct extent_buffer *eb)
 {
-	struct btrfs_path *path;
-	struct btrfs_extent_item *ei;
-	struct btrfs_key key;
 	u64 flags;
 	int ret;
 
@@ -2879,28 +3180,14 @@ static int block_use_full_backref(struct reloc_control *rc,
 	    btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV)
 		return 1;
 
-	path = btrfs_alloc_path();
-	BUG_ON(!path);
-
-	key.objectid = eb->start;
-	key.type = BTRFS_EXTENT_ITEM_KEY;
-	key.offset = eb->len;
-
-	path->search_commit_root = 1;
-	path->skip_locking = 1;
-	ret = btrfs_search_slot(NULL, rc->extent_root,
-				&key, path, 0, 0);
+	ret = btrfs_lookup_extent_info(NULL, rc->extent_root,
+				       eb->start, eb->len, NULL, &flags);
 	BUG_ON(ret);
 
-	ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
-			    struct btrfs_extent_item);
-	flags = btrfs_extent_flags(path->nodes[0], ei);
-	BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
 	if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
 		ret = 1;
 	else
 		ret = 0;
-	btrfs_free_path(path);
 	return ret;
 }
 
@@ -3073,22 +3360,10 @@ int add_data_references(struct reloc_control *rc,
 	struct btrfs_extent_inline_ref *iref;
 	unsigned long ptr;
 	unsigned long end;
-	u32 blocksize;
+	u32 blocksize = btrfs_level_size(rc->extent_root, 0);
 	int ret;
 	int err = 0;
 
-	ret = get_new_location(rc->data_inode, NULL, extent_key->objectid,
-			       extent_key->offset);
-	BUG_ON(ret < 0);
-	if (ret > 0) {
-		/* the relocated data is fragmented */
-		rc->extents_skipped++;
-		btrfs_release_path(rc->extent_root, path);
-		return 0;
-	}
-
-	blocksize = btrfs_level_size(rc->extent_root, 0);
-
 	eb = path->nodes[0];
 	ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
 	end = ptr + btrfs_item_size_nr(eb, path->slots[0]);
@@ -3169,7 +3444,8 @@ int add_data_references(struct reloc_control *rc,
  */
 static noinline_for_stack
 int find_next_extent(struct btrfs_trans_handle *trans,
-		     struct reloc_control *rc, struct btrfs_path *path)
+		     struct reloc_control *rc, struct btrfs_path *path,
+		     struct btrfs_key *extent_key)
 {
 	struct btrfs_key key;
 	struct extent_buffer *leaf;
@@ -3224,6 +3500,7 @@ next:
 			rc->search_start = end + 1;
 		} else {
 			rc->search_start = key.objectid + key.offset;
+			memcpy(extent_key, &key, sizeof(key));
 			return 0;
 		}
 	}
@@ -3261,12 +3538,49 @@ static int check_extent_flags(u64 flags)
 	return 0;
 }
 
+static noinline_for_stack
+int prepare_to_relocate(struct reloc_control *rc)
+{
+	struct btrfs_trans_handle *trans;
+	int ret;
+
+	rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root);
+	if (!rc->block_rsv)
+		return -ENOMEM;
+
+	/*
+	 * reserve some space for creating reloc trees.
+	 * btrfs_init_reloc_root will use them when there
+	 * is no reservation in transaction handle.
+	 */
+	ret = btrfs_block_rsv_add(NULL, rc->extent_root, rc->block_rsv,
+				  rc->extent_root->nodesize * 256,
+				  &rc->block_rsv_retries);
+	if (ret)
+		return ret;
+
+	rc->block_rsv->refill_used = 1;
+	btrfs_add_durable_block_rsv(rc->extent_root->fs_info, rc->block_rsv);
+
+	memset(&rc->cluster, 0, sizeof(rc->cluster));
+	rc->search_start = rc->block_group->key.objectid;
+	rc->extents_found = 0;
+	rc->nodes_relocated = 0;
+	rc->merging_rsv_size = 0;
+	rc->block_rsv_retries = 0;
+
+	rc->create_reloc_tree = 1;
+	set_reloc_control(rc);
+
+	trans = btrfs_join_transaction(rc->extent_root, 1);
+	btrfs_commit_transaction(trans, rc->extent_root);
+	return 0;
+}
 
 static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
 {
 	struct rb_root blocks = RB_ROOT;
 	struct btrfs_key key;
-	struct file_extent_cluster *cluster;
 	struct btrfs_trans_handle *trans = NULL;
 	struct btrfs_path *path;
 	struct btrfs_extent_item *ei;
@@ -3276,33 +3590,25 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
 	int ret;
 	int err = 0;
 
-	cluster = kzalloc(sizeof(*cluster), GFP_NOFS);
-	if (!cluster)
-		return -ENOMEM;
-
 	path = btrfs_alloc_path();
-	if (!path) {
-		kfree(cluster);
+	if (!path)
 		return -ENOMEM;
-	}
-
-	rc->extents_found = 0;
-	rc->extents_skipped = 0;
-
-	rc->search_start = rc->block_group->key.objectid;
-	clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
-			  GFP_NOFS);
-
-	rc->create_reloc_root = 1;
-	set_reloc_control(rc);
 
-	trans = btrfs_start_transaction(rc->extent_root, 1);
-	btrfs_commit_transaction(trans, rc->extent_root);
+	ret = prepare_to_relocate(rc);
+	if (ret) {
+		err = ret;
+		goto out_free;
+	}
 
 	while (1) {
-		trans = btrfs_start_transaction(rc->extent_root, 1);
+		trans = btrfs_start_transaction(rc->extent_root, 0);
 
-		ret = find_next_extent(trans, rc, path);
+		if (update_backref_cache(trans, &rc->backref_cache)) {
+			btrfs_end_transaction(trans, rc->extent_root);
+			continue;
+		}
+
+		ret = find_next_extent(trans, rc, path, &key);
 		if (ret < 0)
 			err = ret;
 		if (ret != 0)
@@ -3312,9 +3618,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
 
 		ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
 				    struct btrfs_extent_item);
-		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
-		item_size = btrfs_item_size_nr(path->nodes[0],
-					       path->slots[0]);
+		item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
 		if (item_size >= sizeof(*ei)) {
 			flags = btrfs_extent_flags(path->nodes[0], ei);
 			ret = check_extent_flags(flags);
@@ -3355,73 +3659,100 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
 		if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
 			ret = add_tree_block(rc, &key, path, &blocks);
 		} else if (rc->stage == UPDATE_DATA_PTRS &&
-			 (flags & BTRFS_EXTENT_FLAG_DATA)) {
+			   (flags & BTRFS_EXTENT_FLAG_DATA)) {
 			ret = add_data_references(rc, &key, path, &blocks);
 		} else {
 			btrfs_release_path(rc->extent_root, path);
 			ret = 0;
 		}
 		if (ret < 0) {
-			err = 0;
+			err = ret;
 			break;
 		}
 
 		if (!RB_EMPTY_ROOT(&blocks)) {
 			ret = relocate_tree_blocks(trans, rc, &blocks);
 			if (ret < 0) {
+				if (ret != -EAGAIN) {
+					err = ret;
+					break;
+				}
+				rc->extents_found--;
+				rc->search_start = key.objectid;
+			}
+		}
+
+		ret = btrfs_block_rsv_check(trans, rc->extent_root,
+					    rc->block_rsv, 0, 5);
+		if (ret < 0) {
+			if (ret != -EAGAIN) {
 				err = ret;
+				WARN_ON(1);
 				break;
 			}
+			rc->commit_transaction = 1;
 		}
 
-		nr = trans->blocks_used;
-		btrfs_end_transaction(trans, rc->extent_root);
+		if (rc->commit_transaction) {
+			rc->commit_transaction = 0;
+			ret = btrfs_commit_transaction(trans, rc->extent_root);
+			BUG_ON(ret);
+		} else {
+			nr = trans->blocks_used;
+			btrfs_end_transaction_throttle(trans, rc->extent_root);
+			btrfs_btree_balance_dirty(rc->extent_root, nr);
+		}
 		trans = NULL;
-		btrfs_btree_balance_dirty(rc->extent_root, nr);
 
 		if (rc->stage == MOVE_DATA_EXTENTS &&
 		    (flags & BTRFS_EXTENT_FLAG_DATA)) {
 			rc->found_file_extent = 1;
 			ret = relocate_data_extent(rc->data_inode,
-						   &key, cluster);
+						   &key, &rc->cluster);
 			if (ret < 0) {
 				err = ret;
 				break;
 			}
 		}
 	}
-	btrfs_free_path(path);
+
+	btrfs_release_path(rc->extent_root, path);
+	clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
+			  GFP_NOFS);
 
 	if (trans) {
 		nr = trans->blocks_used;
-		btrfs_end_transaction(trans, rc->extent_root);
+		btrfs_end_transaction_throttle(trans, rc->extent_root);
 		btrfs_btree_balance_dirty(rc->extent_root, nr);
 	}
 
 	if (!err) {
-		ret = relocate_file_extent_cluster(rc->data_inode, cluster);
+		ret = relocate_file_extent_cluster(rc->data_inode,
+						   &rc->cluster);
 		if (ret < 0)
 			err = ret;
 	}
 
-	kfree(cluster);
+	rc->create_reloc_tree = 0;
+	set_reloc_control(rc);
 
-	rc->create_reloc_root = 0;
-	smp_mb();
+	backref_cache_cleanup(&rc->backref_cache);
+	btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1);
 
-	if (rc->extents_found > 0) {
-		trans = btrfs_start_transaction(rc->extent_root, 1);
-		btrfs_commit_transaction(trans, rc->extent_root);
-	}
+	err = prepare_to_merge(rc, err);
 
 	merge_reloc_roots(rc);
 
+	rc->merge_reloc_tree = 0;
 	unset_reloc_control(rc);
+	btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1);
 
 	/* get rid of pinned extents */
-	trans = btrfs_start_transaction(rc->extent_root, 1);
+	trans = btrfs_join_transaction(rc->extent_root, 1);
 	btrfs_commit_transaction(trans, rc->extent_root);
-
+out_free:
+	btrfs_free_block_rsv(rc->extent_root, rc->block_rsv);
+	btrfs_free_path(path);
 	return err;
 }
 
@@ -3447,7 +3778,8 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
 	btrfs_set_inode_generation(leaf, item, 1);
 	btrfs_set_inode_size(leaf, item, 0);
 	btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
-	btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS);
+	btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
+					  BTRFS_INODE_PREALLOC);
 	btrfs_mark_buffer_dirty(leaf);
 	btrfs_release_path(root, path);
 out:
@@ -3459,8 +3791,9 @@ out:
  * helper to create inode for data relocation.
  * the inode is in data relocation tree and its link count is 0
  */
-static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
-					struct btrfs_block_group_cache *group)
+static noinline_for_stack
+struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
+				 struct btrfs_block_group_cache *group)
 {
 	struct inode *inode = NULL;
 	struct btrfs_trans_handle *trans;
@@ -3474,8 +3807,9 @@ static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
 	if (IS_ERR(root))
 		return ERR_CAST(root);
 
-	trans = btrfs_start_transaction(root, 1);
-	BUG_ON(!trans);
+	trans = btrfs_start_transaction(root, 6);
+	if (IS_ERR(trans))
+		return ERR_CAST(trans);
 
 	err = btrfs_find_free_objectid(trans, root, objectid, &objectid);
 	if (err)
@@ -3487,7 +3821,7 @@ static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
 	key.objectid = objectid;
 	key.type = BTRFS_INODE_ITEM_KEY;
 	key.offset = 0;
-	inode = btrfs_iget(root->fs_info->sb, &key, root);
+	inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
 	BUG_ON(IS_ERR(inode) || is_bad_inode(inode));
 	BTRFS_I(inode)->index_cnt = group->key.objectid;
 
@@ -3495,7 +3829,6 @@ static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
 out:
 	nr = trans->blocks_used;
 	btrfs_end_transaction(trans, root);
-
 	btrfs_btree_balance_dirty(root, nr);
 	if (err) {
 		if (inode)
@@ -3505,6 +3838,21 @@ out:
 	return inode;
 }
 
+static struct reloc_control *alloc_reloc_control(void)
+{
+	struct reloc_control *rc;
+
+	rc = kzalloc(sizeof(*rc), GFP_NOFS);
+	if (!rc)
+		return NULL;
+
+	INIT_LIST_HEAD(&rc->reloc_roots);
+	backref_cache_init(&rc->backref_cache);
+	mapping_tree_init(&rc->reloc_root_tree);
+	extent_io_tree_init(&rc->processed_blocks, NULL, GFP_NOFS);
+	return rc;
+}
+
 /*
  * function to relocate all extents in a block group.
  */
@@ -3513,24 +3861,26 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
 	struct btrfs_fs_info *fs_info = extent_root->fs_info;
 	struct reloc_control *rc;
 	int ret;
+	int rw = 0;
 	int err = 0;
 
-	rc = kzalloc(sizeof(*rc), GFP_NOFS);
+	rc = alloc_reloc_control();
 	if (!rc)
 		return -ENOMEM;
 
-	mapping_tree_init(&rc->reloc_root_tree);
-	extent_io_tree_init(&rc->processed_blocks, NULL, GFP_NOFS);
-	INIT_LIST_HEAD(&rc->reloc_roots);
+	rc->extent_root = extent_root;
 
 	rc->block_group = btrfs_lookup_block_group(fs_info, group_start);
 	BUG_ON(!rc->block_group);
 
-	btrfs_init_workers(&rc->workers, "relocate",
-			   fs_info->thread_pool_size, NULL);
-
-	rc->extent_root = extent_root;
-	btrfs_prepare_block_group_relocation(extent_root, rc->block_group);
+	if (!rc->block_group->ro) {
+		ret = btrfs_set_block_group_ro(extent_root, rc->block_group);
+		if (ret) {
+			err = ret;
+			goto out;
+		}
+		rw = 1;
+	}
 
 	rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
 	if (IS_ERR(rc->data_inode)) {
@@ -3547,9 +3897,6 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
 	btrfs_wait_ordered_extents(fs_info->tree_root, 0, 0);
 
 	while (1) {
-		rc->extents_found = 0;
-		rc->extents_skipped = 0;
-
 		mutex_lock(&fs_info->cleaner_mutex);
 
 		btrfs_clean_old_snapshots(fs_info->tree_root);
@@ -3558,7 +3905,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
 		mutex_unlock(&fs_info->cleaner_mutex);
 		if (ret < 0) {
 			err = ret;
-			break;
+			goto out;
 		}
 
 		if (rc->extents_found == 0)
@@ -3572,18 +3919,6 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
 			invalidate_mapping_pages(rc->data_inode->i_mapping,
 						 0, -1);
 			rc->stage = UPDATE_DATA_PTRS;
-		} else if (rc->stage == UPDATE_DATA_PTRS &&
-			   rc->extents_skipped >= rc->extents_found) {
-			iput(rc->data_inode);
-			rc->data_inode = create_reloc_inode(fs_info,
-							    rc->block_group);
-			if (IS_ERR(rc->data_inode)) {
-				err = PTR_ERR(rc->data_inode);
-				rc->data_inode = NULL;
-				break;
-			}
-			rc->stage = MOVE_DATA_EXTENTS;
-			rc->found_file_extent = 0;
 		}
 	}
 
@@ -3596,8 +3931,9 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
 	WARN_ON(rc->block_group->reserved > 0);
 	WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0);
 out:
+	if (err && rw)
+		btrfs_set_block_group_rw(extent_root, rc->block_group);
 	iput(rc->data_inode);
-	btrfs_stop_workers(&rc->workers);
 	btrfs_put_block_group(rc->block_group);
 	kfree(rc);
 	return err;
@@ -3608,7 +3944,7 @@ static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
 	struct btrfs_trans_handle *trans;
 	int ret;
 
-	trans = btrfs_start_transaction(root->fs_info->tree_root, 1);
+	trans = btrfs_start_transaction(root->fs_info->tree_root, 0);
 
 	memset(&root->root_item.drop_progress, 0,
 		sizeof(root->root_item.drop_progress));
@@ -3701,20 +4037,20 @@ int btrfs_recover_relocation(struct btrfs_root *root)
 	if (list_empty(&reloc_roots))
 		goto out;
 
-	rc = kzalloc(sizeof(*rc), GFP_NOFS);
+	rc = alloc_reloc_control();
 	if (!rc) {
 		err = -ENOMEM;
 		goto out;
 	}
 
-	mapping_tree_init(&rc->reloc_root_tree);
-	INIT_LIST_HEAD(&rc->reloc_roots);
-	btrfs_init_workers(&rc->workers, "relocate",
-			   root->fs_info->thread_pool_size, NULL);
 	rc->extent_root = root->fs_info->extent_root;
 
 	set_reloc_control(rc);
 
+	trans = btrfs_join_transaction(rc->extent_root, 1);
+
+	rc->merge_reloc_tree = 1;
+
 	while (!list_empty(&reloc_roots)) {
 		reloc_root = list_entry(reloc_roots.next,
 					struct btrfs_root, root_list);
@@ -3734,20 +4070,16 @@ int btrfs_recover_relocation(struct btrfs_root *root)
 		fs_root->reloc_root = reloc_root;
 	}
 
-	trans = btrfs_start_transaction(rc->extent_root, 1);
 	btrfs_commit_transaction(trans, rc->extent_root);
 
 	merge_reloc_roots(rc);
 
 	unset_reloc_control(rc);
 
-	trans = btrfs_start_transaction(rc->extent_root, 1);
+	trans = btrfs_join_transaction(rc->extent_root, 1);
 	btrfs_commit_transaction(trans, rc->extent_root);
 out:
-	if (rc) {
-		btrfs_stop_workers(&rc->workers);
-		kfree(rc);
-	}
+	kfree(rc);
 	while (!list_empty(&reloc_roots)) {
 		reloc_root = list_entry(reloc_roots.next,
 					struct btrfs_root, root_list);
@@ -3764,7 +4096,8 @@ out:
 				       BTRFS_DATA_RELOC_TREE_OBJECTID);
 		if (IS_ERR(fs_root))
 			err = PTR_ERR(fs_root);
-		btrfs_orphan_cleanup(fs_root);
+		else
+			btrfs_orphan_cleanup(fs_root);
 	}
 	return err;
 }
@@ -3812,3 +4145,130 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
 	btrfs_put_ordered_extent(ordered);
 	return 0;
 }
+
+void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *root, struct extent_buffer *buf,
+			   struct extent_buffer *cow)
+{
+	struct reloc_control *rc;
+	struct backref_node *node;
+	int first_cow = 0;
+	int level;
+	int ret;
+
+	rc = root->fs_info->reloc_ctl;
+	if (!rc)
+		return;
+
+	BUG_ON(rc->stage == UPDATE_DATA_PTRS &&
+	       root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID);
+
+	level = btrfs_header_level(buf);
+	if (btrfs_header_generation(buf) <=
+	    btrfs_root_last_snapshot(&root->root_item))
+		first_cow = 1;
+
+	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID &&
+	    rc->create_reloc_tree) {
+		WARN_ON(!first_cow && level == 0);
+
+		node = rc->backref_cache.path[level];
+		BUG_ON(node->bytenr != buf->start &&
+		       node->new_bytenr != buf->start);
+
+		drop_node_buffer(node);
+		extent_buffer_get(cow);
+		node->eb = cow;
+		node->new_bytenr = cow->start;
+
+		if (!node->pending) {
+			list_move_tail(&node->list,
+				       &rc->backref_cache.pending[level]);
+			node->pending = 1;
+		}
+
+		if (first_cow)
+			__mark_block_processed(rc, node);
+
+		if (first_cow && level > 0)
+			rc->nodes_relocated += buf->len;
+	}
+
+	if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS) {
+		ret = replace_file_extents(trans, rc, root, cow);
+		BUG_ON(ret);
+	}
+}
+
+/*
+ * called before creating snapshot. it calculates metadata reservation
+ * requried for relocating tree blocks in the snapshot
+ */
+void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans,
+			      struct btrfs_pending_snapshot *pending,
+			      u64 *bytes_to_reserve)
+{
+	struct btrfs_root *root;
+	struct reloc_control *rc;
+
+	root = pending->root;
+	if (!root->reloc_root)
+		return;
+
+	rc = root->fs_info->reloc_ctl;
+	if (!rc->merge_reloc_tree)
+		return;
+
+	root = root->reloc_root;
+	BUG_ON(btrfs_root_refs(&root->root_item) == 0);
+	/*
+	 * relocation is in the stage of merging trees. the space
+	 * used by merging a reloc tree is twice the size of
+	 * relocated tree nodes in the worst case. half for cowing
+	 * the reloc tree, half for cowing the fs tree. the space
+	 * used by cowing the reloc tree will be freed after the
+	 * tree is dropped. if we create snapshot, cowing the fs
+	 * tree may use more space than it frees. so we need
+	 * reserve extra space.
+	 */
+	*bytes_to_reserve += rc->nodes_relocated;
+}
+
+/*
+ * called after snapshot is created. migrate block reservation
+ * and create reloc root for the newly created snapshot
+ */
+void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
+			       struct btrfs_pending_snapshot *pending)
+{
+	struct btrfs_root *root = pending->root;
+	struct btrfs_root *reloc_root;
+	struct btrfs_root *new_root;
+	struct reloc_control *rc;
+	int ret;
+
+	if (!root->reloc_root)
+		return;
+
+	rc = root->fs_info->reloc_ctl;
+	rc->merging_rsv_size += rc->nodes_relocated;
+
+	if (rc->merge_reloc_tree) {
+		ret = btrfs_block_rsv_migrate(&pending->block_rsv,
+					      rc->block_rsv,
+					      rc->nodes_relocated);
+		BUG_ON(ret);
+	}
+
+	new_root = pending->snap;
+	reloc_root = create_reloc_root(trans, root->reloc_root,
+				       new_root->root_key.objectid);
+
+	__add_reloc_root(reloc_root);
+	new_root->reloc_root = reloc_root;
+
+	if (rc->create_reloc_tree) {
+		ret = clone_backref_node(trans, rc, root, reloc_root);
+		BUG_ON(ret);
+	}
+}
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 67fa2d29d663..2d958be761c8 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -259,6 +259,8 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
 	struct extent_buffer *leaf;
 	struct btrfs_path *path;
 	struct btrfs_key key;
+	struct btrfs_key root_key;
+	struct btrfs_root *root;
 	int err = 0;
 	int ret;
 
@@ -270,6 +272,9 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
 	key.type = BTRFS_ORPHAN_ITEM_KEY;
 	key.offset = 0;
 
+	root_key.type = BTRFS_ROOT_ITEM_KEY;
+	root_key.offset = (u64)-1;
+
 	while (1) {
 		ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
 		if (ret < 0) {
@@ -294,13 +299,25 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
 		    key.type != BTRFS_ORPHAN_ITEM_KEY)
 			break;
 
-		ret = btrfs_find_dead_roots(tree_root, key.offset);
-		if (ret) {
+		root_key.objectid = key.offset;
+		key.offset++;
+
+		root = btrfs_read_fs_root_no_name(tree_root->fs_info,
+						  &root_key);
+		if (!IS_ERR(root))
+			continue;
+
+		ret = PTR_ERR(root);
+		if (ret != -ENOENT) {
 			err = ret;
 			break;
 		}
 
-		key.offset++;
+		ret = btrfs_find_dead_roots(tree_root, root_key.objectid);
+		if (ret) {
+			err = ret;
+			break;
+		}
 	}
 
 	btrfs_free_path(path);
@@ -313,7 +330,6 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 {
 	struct btrfs_path *path;
 	int ret;
-	u32 refs;
 	struct btrfs_root_item *ri;
 	struct extent_buffer *leaf;
 
@@ -327,8 +343,6 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	leaf = path->nodes[0];
 	ri = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_item);
 
-	refs = btrfs_disk_root_refs(leaf, ri);
-	BUG_ON(refs != 0);
 	ret = btrfs_del_item(trans, root, path);
 out:
 	btrfs_free_path(path);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 8a1ea6e64575..f2393b390318 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -38,6 +38,7 @@
 #include <linux/namei.h>
 #include <linux/miscdevice.h>
 #include <linux/magic.h>
+#include <linux/slab.h>
 #include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
@@ -63,22 +64,21 @@ static void btrfs_put_super(struct super_block *sb)
 }
 
 enum {
-	Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow,
-	Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier,
-	Opt_ssd, Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl,
-	Opt_compress, Opt_compress_force, Opt_notreelog, Opt_ratio,
-	Opt_flushoncommit,
+	Opt_degraded, Opt_subvol, Opt_subvolid, Opt_device, Opt_nodatasum,
+	Opt_nodatacow, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd,
+	Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress,
+	Opt_compress_force, Opt_notreelog, Opt_ratio, Opt_flushoncommit,
 	Opt_discard, Opt_err,
 };
 
 static match_table_t tokens = {
 	{Opt_degraded, "degraded"},
 	{Opt_subvol, "subvol=%s"},
+	{Opt_subvolid, "subvolid=%d"},
 	{Opt_device, "device=%s"},
 	{Opt_nodatasum, "nodatasum"},
 	{Opt_nodatacow, "nodatacow"},
 	{Opt_nobarrier, "nobarrier"},
-	{Opt_max_extent, "max_extent=%s"},
 	{Opt_max_inline, "max_inline=%s"},
 	{Opt_alloc_start, "alloc_start=%s"},
 	{Opt_thread_pool, "thread_pool=%d"},
@@ -95,31 +95,6 @@ static match_table_t tokens = {
 	{Opt_err, NULL},
 };
 
-u64 btrfs_parse_size(char *str)
-{
-	u64 res;
-	int mult = 1;
-	char *end;
-	char last;
-
-	res = simple_strtoul(str, &end, 10);
-
-	last = end[0];
-	if (isalpha(last)) {
-		last = tolower(last);
-		switch (last) {
-		case 'g':
-			mult *= 1024;
-		case 'm':
-			mult *= 1024;
-		case 'k':
-			mult *= 1024;
-		}
-		res = res * mult;
-	}
-	return res;
-}
-
 /*
  * Regular mount options parser.  Everything that is needed only when
  * reading in a new superblock is parsed here.
@@ -128,7 +103,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 {
 	struct btrfs_fs_info *info = root->fs_info;
 	substring_t args[MAX_OPT_ARGS];
-	char *p, *num;
+	char *p, *num, *orig;
 	int intarg;
 	int ret = 0;
 
@@ -143,6 +118,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 	if (!options)
 		return -ENOMEM;
 
+	orig = options;
 
 	while ((p = strsep(&options, ",")) != NULL) {
 		int token;
@@ -156,6 +132,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 			btrfs_set_opt(info->mount_opt, DEGRADED);
 			break;
 		case Opt_subvol:
+		case Opt_subvolid:
 		case Opt_device:
 			/*
 			 * These are parsed by btrfs_parse_early_options
@@ -210,22 +187,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 				       info->thread_pool_size);
 			}
 			break;
-		case Opt_max_extent:
-			num = match_strdup(&args[0]);
-			if (num) {
-				info->max_extent = btrfs_parse_size(num);
-				kfree(num);
-
-				info->max_extent = max_t(u64,
-					info->max_extent, root->sectorsize);
-				printk(KERN_INFO "btrfs: max_extent at %llu\n",
-				       (unsigned long long)info->max_extent);
-			}
-			break;
 		case Opt_max_inline:
 			num = match_strdup(&args[0]);
 			if (num) {
-				info->max_inline = btrfs_parse_size(num);
+				info->max_inline = memparse(num, NULL);
 				kfree(num);
 
 				if (info->max_inline) {
@@ -240,7 +205,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 		case Opt_alloc_start:
 			num = match_strdup(&args[0]);
 			if (num) {
-				info->alloc_start = btrfs_parse_size(num);
+				info->alloc_start = memparse(num, NULL);
 				kfree(num);
 				printk(KERN_INFO
 					"btrfs: allocations start at %llu\n",
@@ -280,7 +245,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 		}
 	}
 out:
-	kfree(options);
+	kfree(orig);
 	return ret;
 }
 
@@ -291,12 +256,13 @@ out:
  * only when we need to allocate a new super block.
  */
 static int btrfs_parse_early_options(const char *options, fmode_t flags,
-		void *holder, char **subvol_name,
+		void *holder, char **subvol_name, u64 *subvol_objectid,
 		struct btrfs_fs_devices **fs_devices)
 {
 	substring_t args[MAX_OPT_ARGS];
 	char *opts, *p;
 	int error = 0;
+	int intarg;
 
 	if (!options)
 		goto out;
@@ -319,6 +285,18 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
 		case Opt_subvol:
 			*subvol_name = match_strdup(&args[0]);
 			break;
+		case Opt_subvolid:
+			intarg = 0;
+			error = match_int(&args[0], &intarg);
+			if (!error) {
+				/* we want the original fs_tree */
+				if (!intarg)
+					*subvol_objectid =
+						BTRFS_FS_TREE_OBJECTID;
+				else
+					*subvol_objectid = intarg;
+			}
+			break;
 		case Opt_device:
 			error = btrfs_scan_one_device(match_strdup(&args[0]),
 					flags, holder, fs_devices);
@@ -346,6 +324,112 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
 	return error;
 }
 
+static struct dentry *get_default_root(struct super_block *sb,
+				       u64 subvol_objectid)
+{
+	struct btrfs_root *root = sb->s_fs_info;
+	struct btrfs_root *new_root;
+	struct btrfs_dir_item *di;
+	struct btrfs_path *path;
+	struct btrfs_key location;
+	struct inode *inode;
+	struct dentry *dentry;
+	u64 dir_id;
+	int new = 0;
+
+	/*
+	 * We have a specific subvol we want to mount, just setup location and
+	 * go look up the root.
+	 */
+	if (subvol_objectid) {
+		location.objectid = subvol_objectid;
+		location.type = BTRFS_ROOT_ITEM_KEY;
+		location.offset = (u64)-1;
+		goto find_root;
+	}
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return ERR_PTR(-ENOMEM);
+	path->leave_spinning = 1;
+
+	/*
+	 * Find the "default" dir item which points to the root item that we
+	 * will mount by default if we haven't been given a specific subvolume
+	 * to mount.
+	 */
+	dir_id = btrfs_super_root_dir(&root->fs_info->super_copy);
+	di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0);
+	if (IS_ERR(di))
+		return ERR_CAST(di);
+	if (!di) {
+		/*
+		 * Ok the default dir item isn't there.  This is weird since
+		 * it's always been there, but don't freak out, just try and
+		 * mount to root most subvolume.
+		 */
+		btrfs_free_path(path);
+		dir_id = BTRFS_FIRST_FREE_OBJECTID;
+		new_root = root->fs_info->fs_root;
+		goto setup_root;
+	}
+
+	btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
+	btrfs_free_path(path);
+
+find_root:
+	new_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
+	if (IS_ERR(new_root))
+		return ERR_PTR(PTR_ERR(new_root));
+
+	if (btrfs_root_refs(&new_root->root_item) == 0)
+		return ERR_PTR(-ENOENT);
+
+	dir_id = btrfs_root_dirid(&new_root->root_item);
+setup_root:
+	location.objectid = dir_id;
+	location.type = BTRFS_INODE_ITEM_KEY;
+	location.offset = 0;
+
+	inode = btrfs_iget(sb, &location, new_root, &new);
+	if (IS_ERR(inode))
+		return ERR_CAST(inode);
+
+	/*
+	 * If we're just mounting the root most subvol put the inode and return
+	 * a reference to the dentry.  We will have already gotten a reference
+	 * to the inode in btrfs_fill_super so we're good to go.
+	 */
+	if (!new && sb->s_root->d_inode == inode) {
+		iput(inode);
+		return dget(sb->s_root);
+	}
+
+	if (new) {
+		const struct qstr name = { .name = "/", .len = 1 };
+
+		/*
+		 * New inode, we need to make the dentry a sibling of s_root so
+		 * everything gets cleaned up properly on unmount.
+		 */
+		dentry = d_alloc(sb->s_root, &name);
+		if (!dentry) {
+			iput(inode);
+			return ERR_PTR(-ENOMEM);
+		}
+		d_splice_alias(inode, dentry);
+	} else {
+		/*
+		 * We found the inode in cache, just find a dentry for it and
+		 * put the reference to the inode we just got.
+		 */
+		dentry = d_find_alias(inode);
+		iput(inode);
+	}
+
+	return dentry;
+}
+
 static int btrfs_fill_super(struct super_block *sb,
 			    struct btrfs_fs_devices *fs_devices,
 			    void *data, int silent)
@@ -379,7 +463,7 @@ static int btrfs_fill_super(struct super_block *sb,
 	key.objectid = BTRFS_FIRST_FREE_OBJECTID;
 	key.type = BTRFS_INODE_ITEM_KEY;
 	key.offset = 0;
-	inode = btrfs_iget(sb, &key, tree_root->fs_info->fs_root);
+	inode = btrfs_iget(sb, &key, tree_root->fs_info->fs_root, NULL);
 	if (IS_ERR(inode)) {
 		err = PTR_ERR(inode);
 		goto fail_close;
@@ -391,12 +475,6 @@ static int btrfs_fill_super(struct super_block *sb,
 		err = -ENOMEM;
 		goto fail_close;
 	}
-#if 0
-	/* this does the super kobj at the same time */
-	err = btrfs_sysfs_add_super(tree_root->fs_info);
-	if (err)
-		goto fail_close;
-#endif
 
 	sb->s_root = root_dentry;
 
@@ -422,7 +500,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
 	btrfs_start_delalloc_inodes(root, 0);
 	btrfs_wait_ordered_extents(root, 0, 0);
 
-	trans = btrfs_start_transaction(root, 1);
+	trans = btrfs_start_transaction(root, 0);
 	ret = btrfs_commit_transaction(trans, root);
 	return ret;
 }
@@ -440,9 +518,6 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
 		seq_puts(seq, ",nodatacow");
 	if (btrfs_test_opt(root, NOBARRIER))
 		seq_puts(seq, ",nobarrier");
-	if (info->max_extent != (u64)-1)
-		seq_printf(seq, ",max_extent=%llu",
-			   (unsigned long long)info->max_extent);
 	if (info->max_inline != 8192 * 1024)
 		seq_printf(seq, ",max_inline=%llu",
 			   (unsigned long long)info->max_inline);
@@ -488,19 +563,22 @@ static int btrfs_test_super(struct super_block *s, void *data)
 static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
 		const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	char *subvol_name = NULL;
 	struct block_device *bdev = NULL;
 	struct super_block *s;
 	struct dentry *root;
 	struct btrfs_fs_devices *fs_devices = NULL;
 	fmode_t mode = FMODE_READ;
+	char *subvol_name = NULL;
+	u64 subvol_objectid = 0;
 	int error = 0;
+	int found = 0;
 
 	if (!(flags & MS_RDONLY))
 		mode |= FMODE_WRITE;
 
 	error = btrfs_parse_early_options(data, mode, fs_type,
-					  &subvol_name, &fs_devices);
+					  &subvol_name, &subvol_objectid,
+					  &fs_devices);
 	if (error)
 		return error;
 
@@ -529,6 +607,7 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
 			goto error_close_devices;
 		}
 
+		found = 1;
 		btrfs_close_devices(fs_devices);
 	} else {
 		char b[BDEVNAME_SIZE];
@@ -546,25 +625,35 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
 		s->s_flags |= MS_ACTIVE;
 	}
 
-	if (!strcmp(subvol_name, "."))
-		root = dget(s->s_root);
-	else {
-		mutex_lock(&s->s_root->d_inode->i_mutex);
-		root = lookup_one_len(subvol_name, s->s_root,
+	root = get_default_root(s, subvol_objectid);
+	if (IS_ERR(root)) {
+		error = PTR_ERR(root);
+		deactivate_locked_super(s);
+		goto error;
+	}
+	/* if they gave us a subvolume name bind mount into that */
+	if (strcmp(subvol_name, ".")) {
+		struct dentry *new_root;
+		mutex_lock(&root->d_inode->i_mutex);
+		new_root = lookup_one_len(subvol_name, root,
 				      strlen(subvol_name));
-		mutex_unlock(&s->s_root->d_inode->i_mutex);
+		mutex_unlock(&root->d_inode->i_mutex);
 
-		if (IS_ERR(root)) {
+		if (IS_ERR(new_root)) {
 			deactivate_locked_super(s);
-			error = PTR_ERR(root);
-			goto error_free_subvol_name;
+			error = PTR_ERR(new_root);
+			dput(root);
+			goto error_close_devices;
 		}
-		if (!root->d_inode) {
+		if (!new_root->d_inode) {
 			dput(root);
+			dput(new_root);
 			deactivate_locked_super(s);
 			error = -ENXIO;
-			goto error_free_subvol_name;
+			goto error_close_devices;
 		}
+		dput(root);
+		root = new_root;
 	}
 
 	mnt->mnt_sb = s;
@@ -579,6 +668,7 @@ error_close_devices:
 	btrfs_close_devices(fs_devices);
 error_free_subvol_name:
 	kfree(subvol_name);
+error:
 	return error;
 }
 
@@ -606,11 +696,11 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
 		if (btrfs_super_log_root(&root->fs_info->super_copy) != 0)
 			return -EINVAL;
 
-		/* recover relocation */
-		ret = btrfs_recover_relocation(root);
+		ret = btrfs_cleanup_fs_roots(root->fs_info);
 		WARN_ON(ret);
 
-		ret = btrfs_cleanup_fs_roots(root->fs_info);
+		/* recover relocation */
+		ret = btrfs_recover_relocation(root);
 		WARN_ON(ret);
 
 		sb->s_flags &= ~MS_RDONLY;
@@ -623,13 +713,20 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct btrfs_root *root = btrfs_sb(dentry->d_sb);
 	struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
+	struct list_head *head = &root->fs_info->space_info;
+	struct btrfs_space_info *found;
+	u64 total_used = 0;
 	int bits = dentry->d_sb->s_blocksize_bits;
 	__be32 *fsid = (__be32 *)root->fs_info->fsid;
 
+	rcu_read_lock();
+	list_for_each_entry_rcu(found, head, list)
+		total_used += found->disk_used;
+	rcu_read_unlock();
+
 	buf->f_namelen = BTRFS_NAME_LEN;
 	buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits;
-	buf->f_bfree = buf->f_blocks -
-		(btrfs_super_bytes_used(disk_super) >> bits);
+	buf->f_bfree = buf->f_blocks - (total_used >> bits);
 	buf->f_bavail = buf->f_bfree;
 	buf->f_bsize = dentry->d_sb->s_blocksize;
 	buf->f_type = BTRFS_SUPER_MAGIC;
@@ -721,11 +818,14 @@ static const struct file_operations btrfs_ctl_fops = {
 };
 
 static struct miscdevice btrfs_misc = {
-	.minor		= MISC_DYNAMIC_MINOR,
+	.minor		= BTRFS_MINOR,
 	.name		= "btrfs-control",
 	.fops		= &btrfs_ctl_fops
 };
 
+MODULE_ALIAS_MISCDEV(BTRFS_MINOR);
+MODULE_ALIAS("devname:btrfs-control");
+
 static int btrfs_interface_init(void)
 {
 	return misc_register(&btrfs_misc);
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index a240b6fa81df..4ce16ef702a3 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -164,12 +164,12 @@ static void btrfs_root_release(struct kobject *kobj)
 	complete(&root->kobj_unregister);
 }
 
-static struct sysfs_ops btrfs_super_attr_ops = {
+static const struct sysfs_ops btrfs_super_attr_ops = {
 	.show	= btrfs_super_attr_show,
 	.store	= btrfs_super_attr_store,
 };
 
-static struct sysfs_ops btrfs_root_attr_ops = {
+static const struct sysfs_ops btrfs_root_attr_ops = {
 	.show	= btrfs_root_attr_show,
 	.store	= btrfs_root_attr_store,
 };
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index b2acc79f1b34..66e4c66cc63b 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -17,6 +17,7 @@
  */
 
 #include <linux/fs.h>
+#include <linux/slab.h>
 #include <linux/sched.h>
 #include <linux/writeback.h>
 #include <linux/pagemap.h>
@@ -69,7 +70,7 @@ static noinline int join_transaction(struct btrfs_root *root)
 		cur_trans->commit_done = 0;
 		cur_trans->start_time = get_seconds();
 
-		cur_trans->delayed_refs.root.rb_node = NULL;
+		cur_trans->delayed_refs.root = RB_ROOT;
 		cur_trans->delayed_refs.num_entries = 0;
 		cur_trans->delayed_refs.num_heads_ready = 0;
 		cur_trans->delayed_refs.num_heads = 0;
@@ -147,18 +148,13 @@ static void wait_current_trans(struct btrfs_root *root)
 		while (1) {
 			prepare_to_wait(&root->fs_info->transaction_wait, &wait,
 					TASK_UNINTERRUPTIBLE);
-			if (cur_trans->blocked) {
-				mutex_unlock(&root->fs_info->trans_mutex);
-				schedule();
-				mutex_lock(&root->fs_info->trans_mutex);
-				finish_wait(&root->fs_info->transaction_wait,
-					    &wait);
-			} else {
-				finish_wait(&root->fs_info->transaction_wait,
-					    &wait);
+			if (!cur_trans->blocked)
 				break;
-			}
+			mutex_unlock(&root->fs_info->trans_mutex);
+			schedule();
+			mutex_lock(&root->fs_info->trans_mutex);
 		}
+		finish_wait(&root->fs_info->transaction_wait, &wait);
 		put_transaction(cur_trans);
 	}
 }
@@ -169,54 +165,89 @@ enum btrfs_trans_type {
 	TRANS_USERSPACE,
 };
 
+static int may_wait_transaction(struct btrfs_root *root, int type)
+{
+	if (!root->fs_info->log_root_recovering &&
+	    ((type == TRANS_START && !root->fs_info->open_ioctl_trans) ||
+	     type == TRANS_USERSPACE))
+		return 1;
+	return 0;
+}
+
 static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
-					     int num_blocks, int type)
+						    u64 num_items, int type)
 {
-	struct btrfs_trans_handle *h =
-		kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
+	struct btrfs_trans_handle *h;
+	struct btrfs_transaction *cur_trans;
+	int retries = 0;
 	int ret;
+again:
+	h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
+	if (!h)
+		return ERR_PTR(-ENOMEM);
 
 	mutex_lock(&root->fs_info->trans_mutex);
-	if (!root->fs_info->log_root_recovering &&
-	    ((type == TRANS_START && !root->fs_info->open_ioctl_trans) ||
-	     type == TRANS_USERSPACE))
+	if (may_wait_transaction(root, type))
 		wait_current_trans(root);
+
 	ret = join_transaction(root);
 	BUG_ON(ret);
 
-	h->transid = root->fs_info->running_transaction->transid;
-	h->transaction = root->fs_info->running_transaction;
-	h->blocks_reserved = num_blocks;
+	cur_trans = root->fs_info->running_transaction;
+	cur_trans->use_count++;
+	mutex_unlock(&root->fs_info->trans_mutex);
+
+	h->transid = cur_trans->transid;
+	h->transaction = cur_trans;
 	h->blocks_used = 0;
 	h->block_group = 0;
-	h->alloc_exclude_nr = 0;
-	h->alloc_exclude_start = 0;
+	h->bytes_reserved = 0;
 	h->delayed_ref_updates = 0;
+	h->block_rsv = NULL;
 
-	if (!current->journal_info && type != TRANS_USERSPACE)
-		current->journal_info = h;
+	smp_mb();
+	if (cur_trans->blocked && may_wait_transaction(root, type)) {
+		btrfs_commit_transaction(h, root);
+		goto again;
+	}
+
+	if (num_items > 0) {
+		ret = btrfs_trans_reserve_metadata(h, root, num_items,
+						   &retries);
+		if (ret == -EAGAIN) {
+			btrfs_commit_transaction(h, root);
+			goto again;
+		}
+		if (ret < 0) {
+			btrfs_end_transaction(h, root);
+			return ERR_PTR(ret);
+		}
+	}
 
-	root->fs_info->running_transaction->use_count++;
+	mutex_lock(&root->fs_info->trans_mutex);
 	record_root_in_trans(h, root);
 	mutex_unlock(&root->fs_info->trans_mutex);
+
+	if (!current->journal_info && type != TRANS_USERSPACE)
+		current->journal_info = h;
 	return h;
 }
 
 struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
-						   int num_blocks)
+						   int num_items)
 {
-	return start_transaction(root, num_blocks, TRANS_START);
+	return start_transaction(root, num_items, TRANS_START);
 }
 struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
 						   int num_blocks)
 {
-	return start_transaction(root, num_blocks, TRANS_JOIN);
+	return start_transaction(root, 0, TRANS_JOIN);
 }
 
 struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
 							 int num_blocks)
 {
-	return start_transaction(r, num_blocks, TRANS_USERSPACE);
+	return start_transaction(r, 0, TRANS_USERSPACE);
 }
 
 /* wait for a transaction commit to be fully complete */
@@ -290,10 +321,36 @@ void btrfs_throttle(struct btrfs_root *root)
 	mutex_unlock(&root->fs_info->trans_mutex);
 }
 
+static int should_end_transaction(struct btrfs_trans_handle *trans,
+				  struct btrfs_root *root)
+{
+	int ret;
+	ret = btrfs_block_rsv_check(trans, root,
+				    &root->fs_info->global_block_rsv, 0, 5);
+	return ret ? 1 : 0;
+}
+
+int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root)
+{
+	struct btrfs_transaction *cur_trans = trans->transaction;
+	int updates;
+
+	if (cur_trans->blocked || cur_trans->delayed_refs.flushing)
+		return 1;
+
+	updates = trans->delayed_ref_updates;
+	trans->delayed_ref_updates = 0;
+	if (updates)
+		btrfs_run_delayed_refs(trans, root, updates);
+
+	return should_end_transaction(trans, root);
+}
+
 static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root, int throttle)
 {
-	struct btrfs_transaction *cur_trans;
+	struct btrfs_transaction *cur_trans = trans->transaction;
 	struct btrfs_fs_info *info = root->fs_info;
 	int count = 0;
 
@@ -317,9 +374,21 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 		count++;
 	}
 
+	btrfs_trans_release_metadata(trans, root);
+
+	if (!root->fs_info->open_ioctl_trans &&
+	    should_end_transaction(trans, root))
+		trans->transaction->blocked = 1;
+
+	if (cur_trans->blocked && !cur_trans->in_commit) {
+		if (throttle)
+			return btrfs_commit_transaction(trans, root);
+		else
+			wake_up_process(info->transaction_kthread);
+	}
+
 	mutex_lock(&info->trans_mutex);
-	cur_trans = info->running_transaction;
-	WARN_ON(cur_trans != trans->transaction);
+	WARN_ON(cur_trans != info->running_transaction);
 	WARN_ON(cur_trans->num_writers < 1);
 	cur_trans->num_writers--;
 
@@ -607,6 +676,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
 
 			btrfs_free_log(trans, root);
 			btrfs_update_reloc_root(trans, root);
+			btrfs_orphan_commit_root(trans, root);
 
 			if (root->commit_root != root->node) {
 				switch_commit_root(root);
@@ -631,30 +701,30 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
 int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
 {
 	struct btrfs_fs_info *info = root->fs_info;
-	int ret;
 	struct btrfs_trans_handle *trans;
+	int ret;
 	unsigned long nr;
 
-	smp_mb();
-	if (root->defrag_running)
+	if (xchg(&root->defrag_running, 1))
 		return 0;
-	trans = btrfs_start_transaction(root, 1);
+
 	while (1) {
-		root->defrag_running = 1;
+		trans = btrfs_start_transaction(root, 0);
+		if (IS_ERR(trans))
+			return PTR_ERR(trans);
+
 		ret = btrfs_defrag_leaves(trans, root, cacheonly);
+
 		nr = trans->blocks_used;
 		btrfs_end_transaction(trans, root);
 		btrfs_btree_balance_dirty(info->tree_root, nr);
 		cond_resched();
 
-		trans = btrfs_start_transaction(root, 1);
 		if (root->fs_info->closing || ret != -EAGAIN)
 			break;
 	}
 	root->defrag_running = 0;
-	smp_mb();
-	btrfs_end_transaction(trans, root);
-	return 0;
+	return ret;
 }
 
 #if 0
@@ -760,29 +830,72 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	struct btrfs_root_item *new_root_item;
 	struct btrfs_root *tree_root = fs_info->tree_root;
 	struct btrfs_root *root = pending->root;
+	struct btrfs_root *parent_root;
+	struct inode *parent_inode;
+	struct dentry *dentry;
 	struct extent_buffer *tmp;
 	struct extent_buffer *old;
 	int ret;
+	int retries = 0;
+	u64 to_reserve = 0;
+	u64 index = 0;
 	u64 objectid;
 
 	new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
 	if (!new_root_item) {
-		ret = -ENOMEM;
+		pending->error = -ENOMEM;
 		goto fail;
 	}
+
 	ret = btrfs_find_free_objectid(trans, tree_root, 0, &objectid);
-	if (ret)
+	if (ret) {
+		pending->error = ret;
 		goto fail;
+	}
+
+	btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
+	btrfs_orphan_pre_snapshot(trans, pending, &to_reserve);
+
+	if (to_reserve > 0) {
+		ret = btrfs_block_rsv_add(trans, root, &pending->block_rsv,
+					  to_reserve, &retries);
+		if (ret) {
+			pending->error = ret;
+			goto fail;
+		}
+	}
+
+	key.objectid = objectid;
+	key.offset = (u64)-1;
+	key.type = BTRFS_ROOT_ITEM_KEY;
+
+	trans->block_rsv = &pending->block_rsv;
+
+	dentry = pending->dentry;
+	parent_inode = dentry->d_parent->d_inode;
+	parent_root = BTRFS_I(parent_inode)->root;
+	record_root_in_trans(trans, parent_root);
+
+	/*
+	 * insert the directory item
+	 */
+	ret = btrfs_set_inode_index(parent_inode, &index);
+	BUG_ON(ret);
+	ret = btrfs_insert_dir_item(trans, parent_root,
+				dentry->d_name.name, dentry->d_name.len,
+				parent_inode->i_ino, &key,
+				BTRFS_FT_DIR, index);
+	BUG_ON(ret);
+
+	btrfs_i_size_write(parent_inode, parent_inode->i_size +
+					 dentry->d_name.len * 2);
+	ret = btrfs_update_inode(trans, parent_root, parent_inode);
+	BUG_ON(ret);
 
 	record_root_in_trans(trans, root);
 	btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
 	memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
 
-	key.objectid = objectid;
-	/* record when the snapshot was created in key.offset */
-	key.offset = trans->transid;
-	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
-
 	old = btrfs_lock_root_node(root);
 	btrfs_cow_block(trans, root, old, NULL, 0, &old);
 	btrfs_set_lock_blocking(old);
@@ -792,62 +905,32 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	free_extent_buffer(old);
 
 	btrfs_set_root_node(new_root_item, tmp);
-	ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
-				new_root_item);
+	/* record when the snapshot was created in key.offset */
+	key.offset = trans->transid;
+	ret = btrfs_insert_root(trans, tree_root, &key, new_root_item);
 	btrfs_tree_unlock(tmp);
 	free_extent_buffer(tmp);
-	if (ret)
-		goto fail;
-
-	key.offset = (u64)-1;
-	memcpy(&pending->root_key, &key, sizeof(key));
-fail:
-	kfree(new_root_item);
-	return ret;
-}
-
-static noinline int finish_pending_snapshot(struct btrfs_fs_info *fs_info,
-				   struct btrfs_pending_snapshot *pending)
-{
-	int ret;
-	int namelen;
-	u64 index = 0;
-	struct btrfs_trans_handle *trans;
-	struct inode *parent_inode;
-	struct btrfs_root *parent_root;
-
-	parent_inode = pending->dentry->d_parent->d_inode;
-	parent_root = BTRFS_I(parent_inode)->root;
-	trans = btrfs_join_transaction(parent_root, 1);
+	BUG_ON(ret);
 
 	/*
-	 * insert the directory item
+	 * insert root back/forward references
 	 */
-	namelen = strlen(pending->name);
-	ret = btrfs_set_inode_index(parent_inode, &index);
-	ret = btrfs_insert_dir_item(trans, parent_root,
-			    pending->name, namelen,
-			    parent_inode->i_ino,
-			    &pending->root_key, BTRFS_FT_DIR, index);
-
-	if (ret)
-		goto fail;
-
-	btrfs_i_size_write(parent_inode, parent_inode->i_size + namelen * 2);
-	ret = btrfs_update_inode(trans, parent_root, parent_inode);
-	BUG_ON(ret);
-
-	ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root,
-				 pending->root_key.objectid,
+	ret = btrfs_add_root_ref(trans, tree_root, objectid,
 				 parent_root->root_key.objectid,
-				 parent_inode->i_ino, index, pending->name,
-				 namelen);
-
+				 parent_inode->i_ino, index,
+				 dentry->d_name.name, dentry->d_name.len);
 	BUG_ON(ret);
 
+	key.offset = (u64)-1;
+	pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key);
+	BUG_ON(IS_ERR(pending->snap));
+
+	btrfs_reloc_post_snapshot(trans, pending);
+	btrfs_orphan_post_snapshot(trans, pending);
 fail:
-	btrfs_end_transaction(trans, fs_info->fs_root);
-	return ret;
+	kfree(new_root_item);
+	btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1);
+	return 0;
 }
 
 /*
@@ -867,25 +950,6 @@ static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-static noinline int finish_pending_snapshots(struct btrfs_trans_handle *trans,
-					     struct btrfs_fs_info *fs_info)
-{
-	struct btrfs_pending_snapshot *pending;
-	struct list_head *head = &trans->transaction->pending_snapshots;
-	int ret;
-
-	while (!list_empty(head)) {
-		pending = list_entry(head->next,
-				     struct btrfs_pending_snapshot, list);
-		ret = finish_pending_snapshot(fs_info, pending);
-		BUG_ON(ret);
-		list_del(&pending->list);
-		kfree(pending->name);
-		kfree(pending);
-	}
-	return 0;
-}
-
 static void update_super_roots(struct btrfs_root *root)
 {
 	struct btrfs_root_item *root_item;
@@ -914,6 +978,16 @@ int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
 	return ret;
 }
 
+int btrfs_transaction_blocked(struct btrfs_fs_info *info)
+{
+	int ret = 0;
+	spin_lock(&info->new_trans_lock);
+	if (info->running_transaction)
+		ret = info->running_transaction->blocked;
+	spin_unlock(&info->new_trans_lock);
+	return ret;
+}
+
 int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root)
 {
@@ -935,6 +1009,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	ret = btrfs_run_delayed_refs(trans, root, 0);
 	BUG_ON(ret);
 
+	btrfs_trans_release_metadata(trans, root);
+
 	cur_trans = trans->transaction;
 	/*
 	 * set the flushing flag so procs in this transaction have to
@@ -987,9 +1063,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 			snap_pending = 1;
 
 		WARN_ON(cur_trans != trans->transaction);
-		prepare_to_wait(&cur_trans->writer_wait, &wait,
-				TASK_UNINTERRUPTIBLE);
-
 		if (cur_trans->num_writers > 1)
 			timeout = MAX_SCHEDULE_TIMEOUT;
 		else if (should_grow)
@@ -997,13 +1070,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
 		mutex_unlock(&root->fs_info->trans_mutex);
 
-		if (flush_on_commit) {
+		if (flush_on_commit || snap_pending) {
 			btrfs_start_delalloc_inodes(root, 1);
 			ret = btrfs_wait_ordered_extents(root, 0, 1);
 			BUG_ON(ret);
-		} else if (snap_pending) {
-			ret = btrfs_wait_ordered_extents(root, 0, 1);
-			BUG_ON(ret);
 		}
 
 		/*
@@ -1015,6 +1085,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 		 */
 		btrfs_run_ordered_operations(root, 1);
 
+		prepare_to_wait(&cur_trans->writer_wait, &wait,
+				TASK_UNINTERRUPTIBLE);
+
 		smp_mb();
 		if (cur_trans->num_writers > 1 || should_grow)
 			schedule_timeout(timeout);
@@ -1100,9 +1173,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
 	btrfs_finish_extent_commit(trans, root);
 
-	/* do the directory inserts of any pending snapshot creations */
-	finish_pending_snapshots(trans, root->fs_info);
-
 	mutex_lock(&root->fs_info->trans_mutex);
 
 	cur_trans->commit_done = 1;
@@ -1145,9 +1215,9 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root)
 
 		if (btrfs_header_backref_rev(root->node) <
 		    BTRFS_MIXED_BACKREF_REV)
-			btrfs_drop_snapshot(root, 0);
+			btrfs_drop_snapshot(root, NULL, 0);
 		else
-			btrfs_drop_snapshot(root, 1);
+			btrfs_drop_snapshot(root, NULL, 1);
 	}
 	return 0;
 }
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 93c7ccb33118..e104986d0bfd 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -45,20 +45,23 @@ struct btrfs_transaction {
 
 struct btrfs_trans_handle {
 	u64 transid;
+	u64 block_group;
+	u64 bytes_reserved;
 	unsigned long blocks_reserved;
 	unsigned long blocks_used;
-	struct btrfs_transaction *transaction;
-	u64 block_group;
-	u64 alloc_exclude_start;
-	u64 alloc_exclude_nr;
 	unsigned long delayed_ref_updates;
+	struct btrfs_transaction *transaction;
+	struct btrfs_block_rsv *block_rsv;
 };
 
 struct btrfs_pending_snapshot {
 	struct dentry *dentry;
 	struct btrfs_root *root;
-	char *name;
-	struct btrfs_key root_key;
+	struct btrfs_root *snap;
+	/* block reservation for the operation */
+	struct btrfs_block_rsv block_rsv;
+	/* extra metadata reseration for relocation */
+	int error;
 	struct list_head list;
 };
 
@@ -85,11 +88,11 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
 int btrfs_end_transaction(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root);
 struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
-						   int num_blocks);
+						   int num_items);
 struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
-						   int num_blocks);
+						  int num_blocks);
 struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
-						   int num_blocks);
+							 int num_blocks);
 int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
 				     struct btrfs_root *root);
 int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
@@ -103,6 +106,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root);
 int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root);
+int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root);
 void btrfs_throttle(struct btrfs_root *root);
 int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root);
@@ -112,5 +117,6 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
 				struct extent_io_tree *dirty_pages, int mark);
 int btrfs_wait_marked_extents(struct btrfs_root *root,
 				struct extent_io_tree *dirty_pages, int mark);
+int btrfs_transaction_blocked(struct btrfs_fs_info *info);
 int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
 #endif
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index b10eacdb1620..f7ac8e013ed7 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -117,13 +117,14 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 				 path->nodes[1], 0,
 				 cache_only, &last_ret,
 				 &root->defrag_progress);
-	WARN_ON(ret && ret != -EAGAIN);
+	if (ret) {
+		WARN_ON(ret == -EAGAIN);
+		goto out;
+	}
 	if (next_key_ret == 0) {
 		memcpy(&root->defrag_progress, &key, sizeof(key));
 		ret = -EAGAIN;
 	}
-
-	btrfs_release_path(root, path);
 out:
 	if (path)
 		btrfs_free_path(path);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 4a9434b622ec..fb102a9aee9c 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -17,6 +17,7 @@
  */
 
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include "ctree.h"
 #include "transaction.h"
 #include "disk-io.h"
@@ -134,6 +135,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root)
 {
 	int ret;
+	int err = 0;
 
 	mutex_lock(&root->log_mutex);
 	if (root->log_root) {
@@ -154,17 +156,19 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
 	mutex_lock(&root->fs_info->tree_log_mutex);
 	if (!root->fs_info->log_root_tree) {
 		ret = btrfs_init_log_root_tree(trans, root->fs_info);
-		BUG_ON(ret);
+		if (ret)
+			err = ret;
 	}
-	if (!root->log_root) {
+	if (err == 0 && !root->log_root) {
 		ret = btrfs_add_log_tree(trans, root);
-		BUG_ON(ret);
+		if (ret)
+			err = ret;
 	}
 	mutex_unlock(&root->fs_info->tree_log_mutex);
 	root->log_batch++;
 	atomic_inc(&root->log_writers);
 	mutex_unlock(&root->log_mutex);
-	return 0;
+	return err;
 }
 
 /*
@@ -375,7 +379,7 @@ insert:
 			BUG_ON(ret);
 		}
 	} else if (ret) {
-		BUG();
+		return ret;
 	}
 	dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
 					path->slots[0]);
@@ -445,7 +449,7 @@ static noinline struct inode *read_one_inode(struct btrfs_root *root,
 	key.objectid = objectid;
 	key.type = BTRFS_INODE_ITEM_KEY;
 	key.offset = 0;
-	inode = btrfs_iget(root->fs_info->sb, &key, root);
+	inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
 	if (IS_ERR(inode)) {
 		inode = NULL;
 	} else if (is_bad_inode(inode)) {
@@ -1698,9 +1702,9 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
 
 		next = btrfs_find_create_tree_block(root, bytenr, blocksize);
 
-		wc->process_func(root, next, wc, ptr_gen);
-
 		if (*level == 1) {
+			wc->process_func(root, next, wc, ptr_gen);
+
 			path->slots[*level]++;
 			if (wc->free) {
 				btrfs_read_buffer(next, ptr_gen);
@@ -1733,35 +1737,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
 	WARN_ON(*level < 0);
 	WARN_ON(*level >= BTRFS_MAX_LEVEL);
 
-	if (path->nodes[*level] == root->node)
-		parent = path->nodes[*level];
-	else
-		parent = path->nodes[*level + 1];
-
-	bytenr = path->nodes[*level]->start;
-
-	blocksize = btrfs_level_size(root, *level);
-	root_owner = btrfs_header_owner(parent);
-	root_gen = btrfs_header_generation(parent);
-
-	wc->process_func(root, path->nodes[*level], wc,
-			 btrfs_header_generation(path->nodes[*level]));
-
-	if (wc->free) {
-		next = path->nodes[*level];
-		btrfs_tree_lock(next);
-		clean_tree_block(trans, root, next);
-		btrfs_set_lock_blocking(next);
-		btrfs_wait_tree_block_writeback(next);
-		btrfs_tree_unlock(next);
-
-		WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
-		ret = btrfs_free_reserved_extent(root, bytenr, blocksize);
-		BUG_ON(ret);
-	}
-	free_extent_buffer(path->nodes[*level]);
-	path->nodes[*level] = NULL;
-	*level += 1;
+	path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
 
 	cond_resched();
 	return 0;
@@ -1780,7 +1756,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
 
 	for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
 		slot = path->slots[i];
-		if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
+		if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
 			struct extent_buffer *node;
 			node = path->nodes[i];
 			path->slots[i]++;
@@ -2046,7 +2022,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	mutex_unlock(&log_root_tree->log_mutex);
 
 	ret = update_log_root(trans, log);
-	BUG_ON(ret);
 
 	mutex_lock(&log_root_tree->log_mutex);
 	if (atomic_dec_and_test(&log_root_tree->log_writers)) {
@@ -2055,6 +2030,15 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 			wake_up(&log_root_tree->log_writer_wait);
 	}
 
+	if (ret) {
+		BUG_ON(ret != -ENOSPC);
+		root->fs_info->last_trans_log_full_commit = trans->transid;
+		btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
+		mutex_unlock(&log_root_tree->log_mutex);
+		ret = -EAGAIN;
+		goto out;
+	}
+
 	index2 = log_root_tree->log_transid % 2;
 	if (atomic_read(&log_root_tree->log_commit[index2])) {
 		btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
@@ -2128,15 +2112,10 @@ out:
 	return 0;
 }
 
-/*
- * free all the extents used by the tree log.  This should be called
- * at commit time of the full transaction
- */
-int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
+static void free_log_tree(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *log)
 {
 	int ret;
-	struct btrfs_root *log;
-	struct key;
 	u64 start;
 	u64 end;
 	struct walk_control wc = {
@@ -2144,10 +2123,6 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
 		.process_func = process_one_buffer
 	};
 
-	if (!root->log_root || root->fs_info->log_root_recovering)
-		return 0;
-
-	log = root->log_root;
 	ret = walk_log_tree(trans, log, &wc);
 	BUG_ON(ret);
 
@@ -2161,14 +2136,30 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
 				  EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
 	}
 
-	if (log->log_transid > 0) {
-		ret = btrfs_del_root(trans, root->fs_info->log_root_tree,
-				     &log->root_key);
-		BUG_ON(ret);
-	}
-	root->log_root = NULL;
 	free_extent_buffer(log->node);
 	kfree(log);
+}
+
+/*
+ * free all the extents used by the tree log.  This should be called
+ * at commit time of the full transaction
+ */
+int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
+{
+	if (root->log_root) {
+		free_log_tree(trans, root->log_root);
+		root->log_root = NULL;
+	}
+	return 0;
+}
+
+int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
+			     struct btrfs_fs_info *fs_info)
+{
+	if (fs_info->log_root_tree) {
+		free_log_tree(trans, fs_info->log_root_tree);
+		fs_info->log_root_tree = NULL;
+	}
 	return 0;
 }
 
@@ -2202,6 +2193,7 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
 	struct btrfs_dir_item *di;
 	struct btrfs_path *path;
 	int ret;
+	int err = 0;
 	int bytes_del = 0;
 
 	if (BTRFS_I(dir)->logged_trans < trans->transid)
@@ -2217,7 +2209,11 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
 	path = btrfs_alloc_path();
 	di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino,
 				   name, name_len, -1);
-	if (di && !IS_ERR(di)) {
+	if (IS_ERR(di)) {
+		err = PTR_ERR(di);
+		goto fail;
+	}
+	if (di) {
 		ret = btrfs_delete_one_dir_name(trans, log, path, di);
 		bytes_del += name_len;
 		BUG_ON(ret);
@@ -2225,7 +2221,11 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
 	btrfs_release_path(log, path);
 	di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino,
 					 index, name, name_len, -1);
-	if (di && !IS_ERR(di)) {
+	if (IS_ERR(di)) {
+		err = PTR_ERR(di);
+		goto fail;
+	}
+	if (di) {
 		ret = btrfs_delete_one_dir_name(trans, log, path, di);
 		bytes_del += name_len;
 		BUG_ON(ret);
@@ -2243,6 +2243,10 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
 		btrfs_release_path(log, path);
 
 		ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
+		if (ret < 0) {
+			err = ret;
+			goto fail;
+		}
 		if (ret == 0) {
 			struct btrfs_inode_item *item;
 			u64 i_size;
@@ -2260,9 +2264,13 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
 			ret = 0;
 		btrfs_release_path(log, path);
 	}
-
+fail:
 	btrfs_free_path(path);
 	mutex_unlock(&BTRFS_I(dir)->log_mutex);
+	if (ret == -ENOSPC) {
+		root->fs_info->last_trans_log_full_commit = trans->transid;
+		ret = 0;
+	}
 	btrfs_end_log_trans(root);
 
 	return 0;
@@ -2290,6 +2298,10 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
 	ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino,
 				  dirid, &index);
 	mutex_unlock(&BTRFS_I(inode)->log_mutex);
+	if (ret == -ENOSPC) {
+		root->fs_info->last_trans_log_full_commit = trans->transid;
+		ret = 0;
+	}
 	btrfs_end_log_trans(root);
 
 	return ret;
@@ -2317,7 +2329,8 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
 	else
 		key.type = BTRFS_DIR_LOG_INDEX_KEY;
 	ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
-	BUG_ON(ret);
+	if (ret)
+		return ret;
 
 	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
 			      struct btrfs_dir_log_item);
@@ -2342,6 +2355,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
 	struct btrfs_key max_key;
 	struct btrfs_root *log = root->log_root;
 	struct extent_buffer *src;
+	int err = 0;
 	int ret;
 	int i;
 	int nritems;
@@ -2404,6 +2418,10 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
 			ret = overwrite_item(trans, log, dst_path,
 					     path->nodes[0], path->slots[0],
 					     &tmp);
+			if (ret) {
+				err = ret;
+				goto done;
+			}
 		}
 	}
 	btrfs_release_path(root, path);
@@ -2431,7 +2449,10 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
 				goto done;
 			ret = overwrite_item(trans, log, dst_path, src, i,
 					     &min_key);
-			BUG_ON(ret);
+			if (ret) {
+				err = ret;
+				goto done;
+			}
 		}
 		path->slots[0] = nritems;
 
@@ -2453,22 +2474,30 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
 			ret = overwrite_item(trans, log, dst_path,
 					     path->nodes[0], path->slots[0],
 					     &tmp);
-
-			BUG_ON(ret);
-			last_offset = tmp.offset;
+			if (ret)
+				err = ret;
+			else
+				last_offset = tmp.offset;
 			goto done;
 		}
 	}
 done:
-	*last_offset_ret = last_offset;
 	btrfs_release_path(root, path);
 	btrfs_release_path(log, dst_path);
 
-	/* insert the log range keys to indicate where the log is valid */
-	ret = insert_dir_log_key(trans, log, path, key_type, inode->i_ino,
-				 first_offset, last_offset);
-	BUG_ON(ret);
-	return 0;
+	if (err == 0) {
+		*last_offset_ret = last_offset;
+		/*
+		 * insert the log range keys to indicate where the log
+		 * is valid
+		 */
+		ret = insert_dir_log_key(trans, log, path, key_type,
+					 inode->i_ino, first_offset,
+					 last_offset);
+		if (ret)
+			err = ret;
+	}
+	return err;
 }
 
 /*
@@ -2500,7 +2529,8 @@ again:
 		ret = log_dir_items(trans, root, inode, path,
 				    dst_path, key_type, min_key,
 				    &max_key);
-		BUG_ON(ret);
+		if (ret)
+			return ret;
 		if (max_key == (u64)-1)
 			break;
 		min_key = max_key + 1;
@@ -2534,8 +2564,8 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
 
 	while (1) {
 		ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
-
-		if (ret != 1)
+		BUG_ON(ret == 0);
+		if (ret < 0)
 			break;
 
 		if (path->slots[0] == 0)
@@ -2553,7 +2583,7 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
 		btrfs_release_path(log, path);
 	}
 	btrfs_release_path(log, path);
-	return 0;
+	return ret;
 }
 
 static noinline int copy_items(struct btrfs_trans_handle *trans,
@@ -2586,7 +2616,10 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
 	}
 	ret = btrfs_insert_empty_items(trans, log, dst_path,
 				       ins_keys, ins_sizes, nr);
-	BUG_ON(ret);
+	if (ret) {
+		kfree(ins_data);
+		return ret;
+	}
 
 	for (i = 0; i < nr; i++, dst_path->slots[0]++) {
 		dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
@@ -2659,16 +2692,17 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
 	 * we have to do this after the loop above to avoid changing the
 	 * log tree while trying to change the log tree.
 	 */
+	ret = 0;
 	while (!list_empty(&ordered_sums)) {
 		struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
 						   struct btrfs_ordered_sum,
 						   list);
-		ret = btrfs_csum_file_blocks(trans, log, sums);
-		BUG_ON(ret);
+		if (!ret)
+			ret = btrfs_csum_file_blocks(trans, log, sums);
 		list_del(&sums->list);
 		kfree(sums);
 	}
-	return 0;
+	return ret;
 }
 
 /* log a single inode in the tree log.
@@ -2696,6 +2730,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 	struct btrfs_root *log = root->log_root;
 	struct extent_buffer *src = NULL;
 	u32 size;
+	int err = 0;
 	int ret;
 	int nritems;
 	int ins_start_slot = 0;
@@ -2738,7 +2773,10 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 	} else {
 		ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0);
 	}
-	BUG_ON(ret);
+	if (ret) {
+		err = ret;
+		goto out_unlock;
+	}
 	path->keep_locks = 1;
 
 	while (1) {
@@ -2767,7 +2805,10 @@ again:
 
 		ret = copy_items(trans, log, dst_path, src, ins_start_slot,
 				 ins_nr, inode_only);
-		BUG_ON(ret);
+		if (ret) {
+			err = ret;
+			goto out_unlock;
+		}
 		ins_nr = 1;
 		ins_start_slot = path->slots[0];
 next_slot:
@@ -2783,7 +2824,10 @@ next_slot:
 			ret = copy_items(trans, log, dst_path, src,
 					 ins_start_slot,
 					 ins_nr, inode_only);
-			BUG_ON(ret);
+			if (ret) {
+				err = ret;
+				goto out_unlock;
+			}
 			ins_nr = 0;
 		}
 		btrfs_release_path(root, path);
@@ -2801,7 +2845,10 @@ next_slot:
 		ret = copy_items(trans, log, dst_path, src,
 				 ins_start_slot,
 				 ins_nr, inode_only);
-		BUG_ON(ret);
+		if (ret) {
+			err = ret;
+			goto out_unlock;
+		}
 		ins_nr = 0;
 	}
 	WARN_ON(ins_nr);
@@ -2809,14 +2856,18 @@ next_slot:
 		btrfs_release_path(root, path);
 		btrfs_release_path(log, dst_path);
 		ret = log_directory_changes(trans, root, inode, path, dst_path);
-		BUG_ON(ret);
+		if (ret) {
+			err = ret;
+			goto out_unlock;
+		}
 	}
 	BTRFS_I(inode)->logged_trans = trans->transid;
+out_unlock:
 	mutex_unlock(&BTRFS_I(inode)->log_mutex);
 
 	btrfs_free_path(path);
 	btrfs_free_path(dst_path);
-	return 0;
+	return err;
 }
 
 /*
@@ -2941,10 +2992,13 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
 		goto end_no_trans;
 	}
 
-	start_log_trans(trans, root);
+	ret = start_log_trans(trans, root);
+	if (ret)
+		goto end_trans;
 
 	ret = btrfs_log_inode(trans, root, inode, inode_only);
-	BUG_ON(ret);
+	if (ret)
+		goto end_trans;
 
 	/*
 	 * for regular files, if its inode is already on disk, we don't
@@ -2954,8 +3008,10 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
 	 */
 	if (S_ISREG(inode->i_mode) &&
 	    BTRFS_I(inode)->generation <= last_committed &&
-	    BTRFS_I(inode)->last_unlink_trans <= last_committed)
-			goto no_parent;
+	    BTRFS_I(inode)->last_unlink_trans <= last_committed) {
+		ret = 0;
+		goto end_trans;
+	}
 
 	inode_only = LOG_INODE_EXISTS;
 	while (1) {
@@ -2969,15 +3025,21 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
 		if (BTRFS_I(inode)->generation >
 		    root->fs_info->last_trans_committed) {
 			ret = btrfs_log_inode(trans, root, inode, inode_only);
-			BUG_ON(ret);
+			if (ret)
+				goto end_trans;
 		}
 		if (IS_ROOT(parent))
 			break;
 
 		parent = parent->d_parent;
 	}
-no_parent:
 	ret = 0;
+end_trans:
+	if (ret < 0) {
+		BUG_ON(ret != -ENOSPC);
+		root->fs_info->last_trans_log_full_commit = trans->transid;
+		ret = 1;
+	}
 	btrfs_end_log_trans(root);
 end_no_trans:
 	return ret;
@@ -3019,7 +3081,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
 
-	trans = btrfs_start_transaction(fs_info->tree_root, 1);
+	trans = btrfs_start_transaction(fs_info->tree_root, 0);
 
 	wc.trans = trans;
 	wc.pin = 1;
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 0776eacb5083..3dfae84c8cc8 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -25,6 +25,8 @@
 int btrfs_sync_log(struct btrfs_trans_handle *trans,
 		   struct btrfs_root *root);
 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
+int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
+			     struct btrfs_fs_info *fs_info);
 int btrfs_recover_log_trees(struct btrfs_root *tree_root);
 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root, struct dentry *dentry);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 41ecbb2347f2..d6e3af8be95b 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -17,6 +17,7 @@
  */
 #include <linux/sched.h>
 #include <linux/bio.h>
+#include <linux/slab.h>
 #include <linux/buffer_head.h>
 #include <linux/blkdev.h>
 #include <linux/random.h>
@@ -256,13 +257,13 @@ loop_lock:
 			wake_up(&fs_info->async_submit_wait);
 
 		BUG_ON(atomic_read(&cur->bi_cnt) == 0);
-		submit_bio(cur->bi_rw, cur);
-		num_run++;
-		batch_run++;
 
 		if (bio_rw_flagged(cur, BIO_RW_SYNCIO))
 			num_sync_run++;
 
+		submit_bio(cur->bi_rw, cur);
+		num_run++;
+		batch_run++;
 		if (need_resched()) {
 			if (num_sync_run) {
 				blk_run_backing_dev(bdi, NULL);
@@ -325,16 +326,6 @@ loop_lock:
 		num_sync_run = 0;
 		blk_run_backing_dev(bdi, NULL);
 	}
-
-	cond_resched();
-	if (again)
-		goto loop;
-
-	spin_lock(&device->io_lock);
-	if (device->pending_bios.head || device->pending_sync_bios.head)
-		goto loop_lock;
-	spin_unlock(&device->io_lock);
-
 	/*
 	 * IO has already been through a long path to get here.  Checksumming,
 	 * async helper threads, perhaps compression.  We've done a pretty
@@ -346,6 +337,16 @@ loop_lock:
 	 * cared about found its way down here.
 	 */
 	blk_run_backing_dev(bdi, NULL);
+
+	cond_resched();
+	if (again)
+		goto loop;
+
+	spin_lock(&device->io_lock);
+	if (device->pending_bios.head || device->pending_sync_bios.head)
+		goto loop_lock;
+	spin_unlock(&device->io_lock);
+
 done:
 	return 0;
 }
@@ -365,6 +366,7 @@ static noinline int device_list_add(const char *path,
 	struct btrfs_device *device;
 	struct btrfs_fs_devices *fs_devices;
 	u64 found_transid = btrfs_super_generation(disk_super);
+	char *name;
 
 	fs_devices = find_fsid(disk_super->fsid);
 	if (!fs_devices) {
@@ -411,6 +413,12 @@ static noinline int device_list_add(const char *path,
 
 		device->fs_devices = fs_devices;
 		fs_devices->num_devices++;
+	} else if (strcmp(device->name, path)) {
+		name = kstrdup(path, GFP_NOFS);
+		if (!name)
+			return -ENOMEM;
+		kfree(device->name);
+		device->name = name;
 	}
 
 	if (found_transid > fs_devices->latest_trans) {
@@ -592,7 +600,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 			goto error_close;
 
 		disk_super = (struct btrfs_super_block *)bh->b_data;
-		devid = le64_to_cpu(disk_super->dev_item.devid);
+		devid = btrfs_stack_device_id(&disk_super->dev_item);
 		if (devid != device->devid)
 			goto error_brelse;
 
@@ -694,7 +702,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
 		goto error_close;
 	}
 	disk_super = (struct btrfs_super_block *)bh->b_data;
-	devid = le64_to_cpu(disk_super->dev_item.devid);
+	devid = btrfs_stack_device_id(&disk_super->dev_item);
 	transid = btrfs_super_generation(disk_super);
 	if (disk_super->label[0])
 		printk(KERN_INFO "device label %s ", disk_super->label);
@@ -1089,7 +1097,7 @@ static int btrfs_rm_dev_item(struct btrfs_root *root,
 	if (!path)
 		return -ENOMEM;
 
-	trans = btrfs_start_transaction(root, 1);
+	trans = btrfs_start_transaction(root, 0);
 	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
 	key.type = BTRFS_DEV_ITEM_KEY;
 	key.offset = device->devid;
@@ -1187,7 +1195,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 			goto error_close;
 		}
 		disk_super = (struct btrfs_super_block *)bh->b_data;
-		devid = le64_to_cpu(disk_super->dev_item.devid);
+		devid = btrfs_stack_device_id(&disk_super->dev_item);
 		dev_uuid = disk_super->dev_item.uuid;
 		device = btrfs_find_device(root, devid, dev_uuid,
 					   disk_super->fsid);
@@ -1478,7 +1486,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 		goto error;
 	}
 
-	trans = btrfs_start_transaction(root, 1);
+	trans = btrfs_start_transaction(root, 0);
 	lock_chunks(root);
 
 	device->barriers = 1;
@@ -1743,9 +1751,10 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
 
 	/* step one, relocate all the extents inside this chunk */
 	ret = btrfs_relocate_block_group(extent_root, chunk_offset);
-	BUG_ON(ret);
+	if (ret)
+		return ret;
 
-	trans = btrfs_start_transaction(root, 1);
+	trans = btrfs_start_transaction(root, 0);
 	BUG_ON(!trans);
 
 	lock_chunks(root);
@@ -1917,7 +1926,7 @@ int btrfs_balance(struct btrfs_root *dev_root)
 			break;
 		BUG_ON(ret);
 
-		trans = btrfs_start_transaction(dev_root, 1);
+		trans = btrfs_start_transaction(dev_root, 0);
 		BUG_ON(!trans);
 
 		ret = btrfs_grow_device(trans, device, old_size);
@@ -2086,11 +2095,7 @@ again:
 	}
 
 	/* Shrinking succeeded, else we would be at "done". */
-	trans = btrfs_start_transaction(root, 1);
-	if (!trans) {
-		ret = -ENOMEM;
-		goto done;
-	}
+	trans = btrfs_start_transaction(root, 0);
 	lock_chunks(root);
 
 	device->disk_total_bytes = new_size;
@@ -2191,9 +2196,9 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		min_stripes = 2;
 	}
 	if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
-		num_stripes = min_t(u64, 2, fs_devices->rw_devices);
-		if (num_stripes < 2)
+		if (fs_devices->rw_devices < 2)
 			return -ENOSPC;
+		num_stripes = 2;
 		min_stripes = 2;
 	}
 	if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
@@ -2237,8 +2242,16 @@ again:
 		do_div(calc_size, stripe_len);
 		calc_size *= stripe_len;
 	}
+
 	/* we don't want tiny stripes */
-	calc_size = max_t(u64, min_stripe_size, calc_size);
+	if (!looped)
+		calc_size = max_t(u64, min_stripe_size, calc_size);
+
+	/*
+	 * we're about to do_div by the stripe_len so lets make sure
+	 * we end up with something bigger than a stripe
+	 */
+	calc_size = max_t(u64, calc_size, stripe_len * 4);
 
 	do_div(calc_size, stripe_len);
 	calc_size *= stripe_len;
@@ -3382,6 +3395,8 @@ int btrfs_read_chunk_tree(struct btrfs_root *root)
 	key.type = 0;
 again:
 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0)
+		goto error;
 	while (1) {
 		leaf = path->nodes[0];
 		slot = path->slots[0];
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 193b58f7d3f3..88ecbb215878 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -154,15 +154,10 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
 	if (trans)
 		return do_setxattr(trans, inode, name, value, size, flags);
 
-	ret = btrfs_reserve_metadata_space(root, 2);
-	if (ret)
-		return ret;
+	trans = btrfs_start_transaction(root, 2);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
 
-	trans = btrfs_start_transaction(root, 1);
-	if (!trans) {
-		ret = -ENOMEM;
-		goto out;
-	}
 	btrfs_set_trans_block_group(trans, inode);
 
 	ret = do_setxattr(trans, inode, name, value, size, flags);
@@ -174,7 +169,6 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
 	BUG_ON(ret);
 out:
 	btrfs_end_transaction_throttle(trans, root);
-	btrfs_unreserve_metadata_space(root, 2);
 	return ret;
 }
 
@@ -282,7 +276,7 @@ err:
  * List of handlers for synthetic system.* attributes.  All real ondisk
  * attributes are handled directly.
  */
-struct xattr_handler *btrfs_xattr_handlers[] = {
+const struct xattr_handler *btrfs_xattr_handlers[] = {
 #ifdef CONFIG_BTRFS_FS_POSIX_ACL
 	&btrfs_xattr_acl_access_handler,
 	&btrfs_xattr_acl_default_handler,
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
index 721efa0346e0..7a43fd640bbb 100644
--- a/fs/btrfs/xattr.h
+++ b/fs/btrfs/xattr.h
@@ -21,9 +21,9 @@
 
 #include <linux/xattr.h>
 
-extern struct xattr_handler btrfs_xattr_acl_access_handler;
-extern struct xattr_handler btrfs_xattr_acl_default_handler;
-extern struct xattr_handler *btrfs_xattr_handlers[];
+extern const struct xattr_handler btrfs_xattr_acl_access_handler;
+extern const struct xattr_handler btrfs_xattr_acl_default_handler;
+extern const struct xattr_handler *btrfs_xattr_handlers[];
 
 extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
 		void *buffer, size_t size);
diff --git a/fs/buffer.c b/fs/buffer.c
index 6fa530256bfd..d54812b198e9 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -275,6 +275,7 @@ void invalidate_bdev(struct block_device *bdev)
 		return;
 
 	invalidate_bh_lrus();
+	lru_add_drain_all();	/* make sure all lru add caches are flushed */
 	invalidate_mapping_pages(mapping, 0, -1);
 }
 EXPORT_SYMBOL(invalidate_bdev);
@@ -560,26 +561,17 @@ repeat:
 	return err;
 }
 
-static void do_thaw_all(struct work_struct *work)
+static void do_thaw_one(struct super_block *sb, void *unused)
 {
-	struct super_block *sb;
 	char b[BDEVNAME_SIZE];
+	while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
+		printk(KERN_WARNING "Emergency Thaw on %s\n",
+		       bdevname(sb->s_bdev, b));
+}
 
-	spin_lock(&sb_lock);
-restart:
-	list_for_each_entry(sb, &super_blocks, s_list) {
-		sb->s_count++;
-		spin_unlock(&sb_lock);
-		down_read(&sb->s_umount);
-		while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
-			printk(KERN_WARNING "Emergency Thaw on %s\n",
-			       bdevname(sb->s_bdev, b));
-		up_read(&sb->s_umount);
-		spin_lock(&sb_lock);
-		if (__put_super_and_need_restart(sb))
-			goto restart;
-	}
-	spin_unlock(&sb_lock);
+static void do_thaw_all(struct work_struct *work)
+{
+	iterate_supers(do_thaw_one, NULL);
 	kfree(work);
 	printk(KERN_WARNING "Emergency Thaw complete\n");
 }
@@ -1957,14 +1949,11 @@ static int __block_commit_write(struct inode *inode, struct page *page,
 }
 
 /*
- * block_write_begin takes care of the basic task of block allocation and
- * bringing partial write blocks uptodate first.
- *
- * If *pagep is not NULL, then block_write_begin uses the locked page
- * at *pagep rather than allocating its own. In this case, the page will
- * not be unlocked or deallocated on failure.
+ * Filesystems implementing the new truncate sequence should use the
+ * _newtrunc postfix variant which won't incorrectly call vmtruncate.
+ * The filesystem needs to handle block truncation upon failure.
  */
-int block_write_begin(struct file *file, struct address_space *mapping,
+int block_write_begin_newtrunc(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned flags,
 			struct page **pagep, void **fsdata,
 			get_block_t *get_block)
@@ -2000,20 +1989,50 @@ int block_write_begin(struct file *file, struct address_space *mapping,
 			unlock_page(page);
 			page_cache_release(page);
 			*pagep = NULL;
-
-			/*
-			 * prepare_write() may have instantiated a few blocks
-			 * outside i_size.  Trim these off again. Don't need
-			 * i_size_read because we hold i_mutex.
-			 */
-			if (pos + len > inode->i_size)
-				vmtruncate(inode, inode->i_size);
 		}
 	}
 
 out:
 	return status;
 }
+EXPORT_SYMBOL(block_write_begin_newtrunc);
+
+/*
+ * block_write_begin takes care of the basic task of block allocation and
+ * bringing partial write blocks uptodate first.
+ *
+ * If *pagep is not NULL, then block_write_begin uses the locked page
+ * at *pagep rather than allocating its own. In this case, the page will
+ * not be unlocked or deallocated on failure.
+ */
+int block_write_begin(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned flags,
+			struct page **pagep, void **fsdata,
+			get_block_t *get_block)
+{
+	int ret;
+
+	ret = block_write_begin_newtrunc(file, mapping, pos, len, flags,
+					pagep, fsdata, get_block);
+
+	/*
+	 * prepare_write() may have instantiated a few blocks
+	 * outside i_size.  Trim these off again. Don't need
+	 * i_size_read because we hold i_mutex.
+	 *
+	 * Filesystems which pass down their own page also cannot
+	 * call into vmtruncate here because it would lead to lock
+	 * inversion problems (*pagep is locked). This is a further
+	 * example of where the old truncate sequence is inadequate.
+	 */
+	if (unlikely(ret) && *pagep == NULL) {
+		loff_t isize = mapping->host->i_size;
+		if (pos + len > isize)
+			vmtruncate(mapping->host, isize);
+	}
+
+	return ret;
+}
 EXPORT_SYMBOL(block_write_begin);
 
 int block_write_end(struct file *file, struct address_space *mapping,
@@ -2332,7 +2351,7 @@ out:
  * For moronic filesystems that do not allow holes in file.
  * We may have to extend the file.
  */
-int cont_write_begin(struct file *file, struct address_space *mapping,
+int cont_write_begin_newtrunc(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned flags,
 			struct page **pagep, void **fsdata,
 			get_block_t *get_block, loff_t *bytes)
@@ -2353,11 +2372,30 @@ int cont_write_begin(struct file *file, struct address_space *mapping,
 	}
 
 	*pagep = NULL;
-	err = block_write_begin(file, mapping, pos, len,
+	err = block_write_begin_newtrunc(file, mapping, pos, len,
 				flags, pagep, fsdata, get_block);
 out:
 	return err;
 }
+EXPORT_SYMBOL(cont_write_begin_newtrunc);
+
+int cont_write_begin(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned flags,
+			struct page **pagep, void **fsdata,
+			get_block_t *get_block, loff_t *bytes)
+{
+	int ret;
+
+	ret = cont_write_begin_newtrunc(file, mapping, pos, len, flags,
+					pagep, fsdata, get_block, bytes);
+	if (unlikely(ret)) {
+		loff_t isize = mapping->host->i_size;
+		if (pos + len > isize)
+			vmtruncate(mapping->host, isize);
+	}
+
+	return ret;
+}
 EXPORT_SYMBOL(cont_write_begin);
 
 int block_prepare_write(struct page *page, unsigned from, unsigned to,
@@ -2389,7 +2427,7 @@ EXPORT_SYMBOL(block_commit_write);
  *
  * We are not allowed to take the i_mutex here so we have to play games to
  * protect against truncate races as the page could now be beyond EOF.  Because
- * vmtruncate() writes the inode size before removing pages, once we have the
+ * truncate writes the inode size before removing pages, once we have the
  * page lock we can determine safely if the page is beyond EOF. If it is not
  * beyond EOF, then the page is guaranteed safe against truncation until we
  * unlock the page.
@@ -2472,10 +2510,11 @@ static void attach_nobh_buffers(struct page *page, struct buffer_head *head)
 }
 
 /*
- * On entry, the page is fully not uptodate.
- * On exit the page is fully uptodate in the areas outside (from,to)
+ * Filesystems implementing the new truncate sequence should use the
+ * _newtrunc postfix variant which won't incorrectly call vmtruncate.
+ * The filesystem needs to handle block truncation upon failure.
  */
-int nobh_write_begin(struct file *file, struct address_space *mapping,
+int nobh_write_begin_newtrunc(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned flags,
 			struct page **pagep, void **fsdata,
 			get_block_t *get_block)
@@ -2508,8 +2547,8 @@ int nobh_write_begin(struct file *file, struct address_space *mapping,
 		unlock_page(page);
 		page_cache_release(page);
 		*pagep = NULL;
-		return block_write_begin(file, mapping, pos, len, flags, pagep,
-					fsdata, get_block);
+		return block_write_begin_newtrunc(file, mapping, pos, len,
+					flags, pagep, fsdata, get_block);
 	}
 
 	if (PageMappedToDisk(page))
@@ -2613,8 +2652,34 @@ out_release:
 	page_cache_release(page);
 	*pagep = NULL;
 
-	if (pos + len > inode->i_size)
-		vmtruncate(inode, inode->i_size);
+	return ret;
+}
+EXPORT_SYMBOL(nobh_write_begin_newtrunc);
+
+/*
+ * On entry, the page is fully not uptodate.
+ * On exit the page is fully uptodate in the areas outside (from,to)
+ */
+int nobh_write_begin(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned flags,
+			struct page **pagep, void **fsdata,
+			get_block_t *get_block)
+{
+	int ret;
+
+	ret = nobh_write_begin_newtrunc(file, mapping, pos, len, flags,
+					pagep, fsdata, get_block);
+
+	/*
+	 * prepare_write() may have instantiated a few blocks
+	 * outside i_size.  Trim these off again. Don't need
+	 * i_size_read because we hold i_mutex.
+	 */
+	if (unlikely(ret)) {
+		loff_t isize = mapping->host->i_size;
+		if (pos + len > isize)
+			vmtruncate(mapping->host, isize);
+	}
 
 	return ret;
 }
@@ -2893,7 +2958,7 @@ int block_write_full_page_endio(struct page *page, get_block_t *get_block,
 
 	/*
 	 * The page straddles i_size.  It must be zeroed out on each and every
-	 * writepage invokation because it may be mmapped.  "A file is mapped
+	 * writepage invocation because it may be mmapped.  "A file is mapped
 	 * in multiples of the page size.  For a file that is not a multiple of
 	 * the  page size, the remaining memory is zeroed when mapped, and
 	 * writes to that region are not written out to the file."
@@ -3265,7 +3330,7 @@ static void recalc_bh_state(void)
 	
 struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
 {
-	struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags);
+	struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
 	if (ret) {
 		INIT_LIST_HEAD(&ret->b_assoc_buffers);
 		get_cpu_var(bh_accounting).nr++;
@@ -3352,15 +3417,6 @@ int bh_submit_read(struct buffer_head *bh)
 }
 EXPORT_SYMBOL(bh_submit_read);
 
-static void
-init_buffer_head(void *data)
-{
-	struct buffer_head *bh = data;
-
-	memset(bh, 0, sizeof(*bh));
-	INIT_LIST_HEAD(&bh->b_assoc_buffers);
-}
-
 void __init buffer_init(void)
 {
 	int nrpages;
@@ -3369,7 +3425,7 @@ void __init buffer_init(void)
 			sizeof(struct buffer_head), 0,
 				(SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
 				SLAB_MEM_SPREAD),
-				init_buffer_head);
+				NULL);
 
 	/*
 	 * Limit the bh occupancy to 10% of ZONE_NORMAL
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 27089311fbea..37fe101a4e0d 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -9,6 +9,7 @@
  * 2 of the Licence, or (at your option) any later version.
  */
 
+#include <linux/slab.h>
 #include <linux/mount.h>
 #include <linux/buffer_head.h>
 #include "internal.h"
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index f7c255f9c624..a8cd821226da 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -34,6 +34,7 @@ struct cachefiles_object {
 	loff_t				i_size;		/* object size */
 	unsigned long			flags;
 #define CACHEFILES_OBJECT_ACTIVE	0		/* T if marked active */
+#define CACHEFILES_OBJECT_BURIED	1		/* T if preemptively buried */
 	atomic_t			usage;		/* object usage count */
 	uint8_t				type;		/* object type */
 	uint8_t				new;		/* T if object new */
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 14ac4806e291..f4a7840bf42c 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -19,6 +19,7 @@
 #include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/security.h>
+#include <linux/slab.h>
 #include "internal.h"
 
 #define CACHEFILES_KEYBUF_SIZE 512
@@ -92,6 +93,59 @@ static noinline void cachefiles_printk_object(struct cachefiles_object *object,
 }
 
 /*
+ * mark the owner of a dentry, if there is one, to indicate that that dentry
+ * has been preemptively deleted
+ * - the caller must hold the i_mutex on the dentry's parent as required to
+ *   call vfs_unlink(), vfs_rmdir() or vfs_rename()
+ */
+static void cachefiles_mark_object_buried(struct cachefiles_cache *cache,
+					  struct dentry *dentry)
+{
+	struct cachefiles_object *object;
+	struct rb_node *p;
+
+	_enter(",'%*.*s'",
+	       dentry->d_name.len, dentry->d_name.len, dentry->d_name.name);
+
+	write_lock(&cache->active_lock);
+
+	p = cache->active_nodes.rb_node;
+	while (p) {
+		object = rb_entry(p, struct cachefiles_object, active_node);
+		if (object->dentry > dentry)
+			p = p->rb_left;
+		else if (object->dentry < dentry)
+			p = p->rb_right;
+		else
+			goto found_dentry;
+	}
+
+	write_unlock(&cache->active_lock);
+	_leave(" [no owner]");
+	return;
+
+	/* found the dentry for  */
+found_dentry:
+	kdebug("preemptive burial: OBJ%x [%s] %p",
+	       object->fscache.debug_id,
+	       fscache_object_states[object->fscache.state],
+	       dentry);
+
+	if (object->fscache.state < FSCACHE_OBJECT_DYING) {
+		printk(KERN_ERR "\n");
+		printk(KERN_ERR "CacheFiles: Error:"
+		       " Can't preemptively bury live object\n");
+		cachefiles_printk_object(object, NULL);
+	} else if (test_and_set_bit(CACHEFILES_OBJECT_BURIED, &object->flags)) {
+		printk(KERN_ERR "CacheFiles: Error:"
+		       " Object already preemptively buried\n");
+	}
+
+	write_unlock(&cache->active_lock);
+	_leave(" [owner marked]");
+}
+
+/*
  * record the fact that an object is now active
  */
 static int cachefiles_mark_object_active(struct cachefiles_cache *cache,
@@ -218,7 +272,8 @@ requeue:
  */
 static int cachefiles_bury_object(struct cachefiles_cache *cache,
 				  struct dentry *dir,
-				  struct dentry *rep)
+				  struct dentry *rep,
+				  bool preemptive)
 {
 	struct dentry *grave, *trap;
 	char nbuffer[8 + 8 + 1];
@@ -228,11 +283,16 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache,
 	       dir->d_name.len, dir->d_name.len, dir->d_name.name,
 	       rep->d_name.len, rep->d_name.len, rep->d_name.name);
 
+	_debug("remove %p from %p", rep, dir);
+
 	/* non-directories can just be unlinked */
 	if (!S_ISDIR(rep->d_inode->i_mode)) {
 		_debug("unlink stale object");
 		ret = vfs_unlink(dir->d_inode, rep);
 
+		if (preemptive)
+			cachefiles_mark_object_buried(cache, rep);
+
 		mutex_unlock(&dir->d_inode->i_mutex);
 
 		if (ret == -EIO)
@@ -324,6 +384,9 @@ try_again:
 	if (ret != 0 && ret != -ENOMEM)
 		cachefiles_io_error(cache, "Rename failed with error %d", ret);
 
+	if (preemptive)
+		cachefiles_mark_object_buried(cache, rep);
+
 	unlock_rename(cache->graveyard, dir);
 	dput(grave);
 	_leave(" = 0");
@@ -339,7 +402,7 @@ int cachefiles_delete_object(struct cachefiles_cache *cache,
 	struct dentry *dir;
 	int ret;
 
-	_enter(",{%p}", object->dentry);
+	_enter(",OBJ%x{%p}", object->fscache.debug_id, object->dentry);
 
 	ASSERT(object->dentry);
 	ASSERT(object->dentry->d_inode);
@@ -348,7 +411,27 @@ int cachefiles_delete_object(struct cachefiles_cache *cache,
 	dir = dget_parent(object->dentry);
 
 	mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
-	ret = cachefiles_bury_object(cache, dir, object->dentry);
+
+	if (test_bit(CACHEFILES_OBJECT_BURIED, &object->flags)) {
+		/* object allocation for the same key preemptively deleted this
+		 * object's file so that it could create its own file */
+		_debug("object preemptively buried");
+		mutex_unlock(&dir->d_inode->i_mutex);
+		ret = 0;
+	} else {
+		/* we need to check that our parent is _still_ our parent - it
+		 * may have been renamed */
+		if (dir == object->dentry->d_parent) {
+			ret = cachefiles_bury_object(cache, dir,
+						     object->dentry, false);
+		} else {
+			/* it got moved, presumably by cachefilesd culling it,
+			 * so it's no longer in the key path and we can ignore
+			 * it */
+			mutex_unlock(&dir->d_inode->i_mutex);
+			ret = 0;
+		}
+	}
 
 	dput(dir);
 	_leave(" = %d", ret);
@@ -370,7 +453,9 @@ int cachefiles_walk_to_object(struct cachefiles_object *parent,
 	const char *name;
 	int ret, nlen;
 
-	_enter("{%p},,%s,", parent->dentry, key);
+	_enter("OBJ%x{%p},OBJ%x,%s,",
+	       parent->fscache.debug_id, parent->dentry,
+	       object->fscache.debug_id, key);
 
 	cache = container_of(parent->fscache.cache,
 			     struct cachefiles_cache, cache);
@@ -498,7 +583,7 @@ lookup_again:
 			 * mutex) */
 			object->dentry = NULL;
 
-			ret = cachefiles_bury_object(cache, dir, next);
+			ret = cachefiles_bury_object(cache, dir, next, true);
 			dput(next);
 			next = NULL;
 
@@ -817,7 +902,7 @@ int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir,
 	/*  actually remove the victim (drops the dir mutex) */
 	_debug("bury");
 
-	ret = cachefiles_bury_object(cache, dir, victim);
+	ret = cachefiles_bury_object(cache, dir, victim, false);
 	if (ret < 0)
 		goto error;
 
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index 1d8332563863..0f0d41fbb03f 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -10,6 +10,7 @@
  */
 
 #include <linux/mount.h>
+#include <linux/slab.h>
 #include <linux/file.h>
 #include "internal.h"
 
diff --git a/fs/cachefiles/security.c b/fs/cachefiles/security.c
index b5808cdb2232..039b5011d83b 100644
--- a/fs/cachefiles/security.c
+++ b/fs/cachefiles/security.c
@@ -77,6 +77,8 @@ static int cachefiles_check_cache_dir(struct cachefiles_cache *cache,
 /*
  * check the security details of the on-disk cache
  * - must be called with security override in force
+ * - must return with a security override in force - even in the case of an
+ *   error
  */
 int cachefiles_determine_cache_security(struct cachefiles_cache *cache,
 					struct dentry *root,
@@ -99,6 +101,8 @@ int cachefiles_determine_cache_security(struct cachefiles_cache *cache,
 	 * which create files */
 	ret = set_create_files_as(new, root->d_inode);
 	if (ret < 0) {
+		abort_creds(new);
+		cachefiles_begin_secure(cache, _saved_cred);
 		_leave(" = %d [cfa]", ret);
 		return ret;
 	}
diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c
index f3e7a0bf068b..e18b183b47e1 100644
--- a/fs/cachefiles/xattr.c
+++ b/fs/cachefiles/xattr.c
@@ -16,6 +16,7 @@
 #include <linux/fsnotify.h>
 #include <linux/quotaops.h>
 #include <linux/xattr.h>
+#include <linux/slab.h>
 #include "internal.h"
 
 static const char cachefiles_xattr_cache[] =
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig
new file mode 100644
index 000000000000..bc87b9c1d27e
--- /dev/null
+++ b/fs/ceph/Kconfig
@@ -0,0 +1,27 @@
+config CEPH_FS
+        tristate "Ceph distributed file system (EXPERIMENTAL)"
+	depends on INET && EXPERIMENTAL
+	select LIBCRC32C
+	select CRYPTO_AES
+	help
+	  Choose Y or M here to include support for mounting the
+	  experimental Ceph distributed file system.  Ceph is an extremely
+	  scalable file system designed to provide high performance,
+	  reliable access to petabytes of storage.
+
+	  More information at http://ceph.newdream.net/.
+
+	  If unsure, say N.
+
+config CEPH_FS_PRETTYDEBUG
+	bool "Include file:line in ceph debug output"
+	depends on CEPH_FS
+	default n
+	help
+	  If you say Y here, debug output will include a filename and
+	  line to aid debugging.  This icnreases kernel size and slows
+	  execution slightly when debug call sites are enabled (e.g.,
+	  via CONFIG_DYNAMIC_DEBUG).
+
+	  If unsure, say N.
+
diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
new file mode 100644
index 000000000000..6a660e610be8
--- /dev/null
+++ b/fs/ceph/Makefile
@@ -0,0 +1,39 @@
+#
+# Makefile for CEPH filesystem.
+#
+
+ifneq ($(KERNELRELEASE),)
+
+obj-$(CONFIG_CEPH_FS) += ceph.o
+
+ceph-objs := super.o inode.o dir.o file.o addr.o ioctl.o \
+	export.o caps.o snap.o xattr.o \
+	messenger.o msgpool.o buffer.o pagelist.o \
+	mds_client.o mdsmap.o \
+	mon_client.o \
+	osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \
+	debugfs.o \
+	auth.o auth_none.o \
+	crypto.o armor.o \
+	auth_x.o \
+	ceph_fs.o ceph_strings.o ceph_hash.o ceph_frag.o
+
+else
+#Otherwise we were called directly from the command
+# line; invoke the kernel build system.
+
+KERNELDIR ?= /lib/modules/$(shell uname -r)/build
+PWD := $(shell pwd)
+
+default: all
+
+all:
+	$(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules
+
+modules_install:
+	$(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules_install
+
+clean:
+	$(MAKE) -C $(KERNELDIR) M=$(PWD) clean
+
+endif
diff --git a/fs/ceph/README b/fs/ceph/README
new file mode 100644
index 000000000000..18352fab37c0
--- /dev/null
+++ b/fs/ceph/README
@@ -0,0 +1,20 @@
+#
+# The following files are shared by (and manually synchronized
+# between) the Ceph userland and kernel client.
+#
+# userland                  kernel
+src/include/ceph_fs.h	    fs/ceph/ceph_fs.h
+src/include/ceph_fs.cc	    fs/ceph/ceph_fs.c
+src/include/msgr.h	    fs/ceph/msgr.h
+src/include/rados.h	    fs/ceph/rados.h
+src/include/ceph_strings.cc fs/ceph/ceph_strings.c
+src/include/ceph_frag.h	    fs/ceph/ceph_frag.h
+src/include/ceph_frag.cc    fs/ceph/ceph_frag.c
+src/include/ceph_hash.h	    fs/ceph/ceph_hash.h
+src/include/ceph_hash.cc    fs/ceph/ceph_hash.c
+src/crush/crush.c	    fs/ceph/crush/crush.c
+src/crush/crush.h	    fs/ceph/crush/crush.h
+src/crush/mapper.c	    fs/ceph/crush/mapper.c
+src/crush/mapper.h	    fs/ceph/crush/mapper.h
+src/crush/hash.h	    fs/ceph/crush/hash.h
+src/crush/hash.c	    fs/ceph/crush/hash.c
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
new file mode 100644
index 000000000000..d9c60b84949a
--- /dev/null
+++ b/fs/ceph/addr.c
@@ -0,0 +1,1182 @@
+#include "ceph_debug.h"
+
+#include <linux/backing-dev.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/writeback.h>	/* generic_writepages */
+#include <linux/slab.h>
+#include <linux/pagevec.h>
+#include <linux/task_io_accounting_ops.h>
+
+#include "super.h"
+#include "osd_client.h"
+
+/*
+ * Ceph address space ops.
+ *
+ * There are a few funny things going on here.
+ *
+ * The page->private field is used to reference a struct
+ * ceph_snap_context for _every_ dirty page.  This indicates which
+ * snapshot the page was logically dirtied in, and thus which snap
+ * context needs to be associated with the osd write during writeback.
+ *
+ * Similarly, struct ceph_inode_info maintains a set of counters to
+ * count dirty pages on the inode.  In the absense of snapshots,
+ * i_wrbuffer_ref == i_wrbuffer_ref_head == the dirty page count.
+ *
+ * When a snapshot is taken (that is, when the client receives
+ * notification that a snapshot was taken), each inode with caps and
+ * with dirty pages (dirty pages implies there is a cap) gets a new
+ * ceph_cap_snap in the i_cap_snaps list (which is sorted in ascending
+ * order, new snaps go to the tail).  The i_wrbuffer_ref_head count is
+ * moved to capsnap->dirty. (Unless a sync write is currently in
+ * progress.  In that case, the capsnap is said to be "pending", new
+ * writes cannot start, and the capsnap isn't "finalized" until the
+ * write completes (or fails) and a final size/mtime for the inode for
+ * that snap can be settled upon.)  i_wrbuffer_ref_head is reset to 0.
+ *
+ * On writeback, we must submit writes to the osd IN SNAP ORDER.  So,
+ * we look for the first capsnap in i_cap_snaps and write out pages in
+ * that snap context _only_.  Then we move on to the next capsnap,
+ * eventually reaching the "live" or "head" context (i.e., pages that
+ * are not yet snapped) and are writing the most recently dirtied
+ * pages.
+ *
+ * Invalidate and so forth must take care to ensure the dirty page
+ * accounting is preserved.
+ */
+
+#define CONGESTION_ON_THRESH(congestion_kb) (congestion_kb >> (PAGE_SHIFT-10))
+#define CONGESTION_OFF_THRESH(congestion_kb)				\
+	(CONGESTION_ON_THRESH(congestion_kb) -				\
+	 (CONGESTION_ON_THRESH(congestion_kb) >> 2))
+
+
+
+/*
+ * Dirty a page.  Optimistically adjust accounting, on the assumption
+ * that we won't race with invalidate.  If we do, readjust.
+ */
+static int ceph_set_page_dirty(struct page *page)
+{
+	struct address_space *mapping = page->mapping;
+	struct inode *inode;
+	struct ceph_inode_info *ci;
+	int undo = 0;
+	struct ceph_snap_context *snapc;
+
+	if (unlikely(!mapping))
+		return !TestSetPageDirty(page);
+
+	if (TestSetPageDirty(page)) {
+		dout("%p set_page_dirty %p idx %lu -- already dirty\n",
+		     mapping->host, page, page->index);
+		return 0;
+	}
+
+	inode = mapping->host;
+	ci = ceph_inode(inode);
+
+	/*
+	 * Note that we're grabbing a snapc ref here without holding
+	 * any locks!
+	 */
+	snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context);
+
+	/* dirty the head */
+	spin_lock(&inode->i_lock);
+	if (ci->i_wrbuffer_ref_head == 0)
+		ci->i_head_snapc = ceph_get_snap_context(snapc);
+	++ci->i_wrbuffer_ref_head;
+	if (ci->i_wrbuffer_ref == 0)
+		igrab(inode);
+	++ci->i_wrbuffer_ref;
+	dout("%p set_page_dirty %p idx %lu head %d/%d -> %d/%d "
+	     "snapc %p seq %lld (%d snaps)\n",
+	     mapping->host, page, page->index,
+	     ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref_head-1,
+	     ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
+	     snapc, snapc->seq, snapc->num_snaps);
+	spin_unlock(&inode->i_lock);
+
+	/* now adjust page */
+	spin_lock_irq(&mapping->tree_lock);
+	if (page->mapping) {	/* Race with truncate? */
+		WARN_ON_ONCE(!PageUptodate(page));
+
+		if (mapping_cap_account_dirty(mapping)) {
+			__inc_zone_page_state(page, NR_FILE_DIRTY);
+			__inc_bdi_stat(mapping->backing_dev_info,
+					BDI_RECLAIMABLE);
+			task_io_account_write(PAGE_CACHE_SIZE);
+		}
+		radix_tree_tag_set(&mapping->page_tree,
+				page_index(page), PAGECACHE_TAG_DIRTY);
+
+		/*
+		 * Reference snap context in page->private.  Also set
+		 * PagePrivate so that we get invalidatepage callback.
+		 */
+		page->private = (unsigned long)snapc;
+		SetPagePrivate(page);
+	} else {
+		dout("ANON set_page_dirty %p (raced truncate?)\n", page);
+		undo = 1;
+	}
+
+	spin_unlock_irq(&mapping->tree_lock);
+
+	if (undo)
+		/* whoops, we failed to dirty the page */
+		ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
+
+	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+
+	BUG_ON(!PageDirty(page));
+	return 1;
+}
+
+/*
+ * If we are truncating the full page (i.e. offset == 0), adjust the
+ * dirty page counters appropriately.  Only called if there is private
+ * data on the page.
+ */
+static void ceph_invalidatepage(struct page *page, unsigned long offset)
+{
+	struct inode *inode;
+	struct ceph_inode_info *ci;
+	struct ceph_snap_context *snapc = (void *)page->private;
+
+	BUG_ON(!PageLocked(page));
+	BUG_ON(!page->private);
+	BUG_ON(!PagePrivate(page));
+	BUG_ON(!page->mapping);
+
+	inode = page->mapping->host;
+
+	/*
+	 * We can get non-dirty pages here due to races between
+	 * set_page_dirty and truncate_complete_page; just spit out a
+	 * warning, in case we end up with accounting problems later.
+	 */
+	if (!PageDirty(page))
+		pr_err("%p invalidatepage %p page not dirty\n", inode, page);
+
+	if (offset == 0)
+		ClearPageChecked(page);
+
+	ci = ceph_inode(inode);
+	if (offset == 0) {
+		dout("%p invalidatepage %p idx %lu full dirty page %lu\n",
+		     inode, page, page->index, offset);
+		ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
+		ceph_put_snap_context(snapc);
+		page->private = 0;
+		ClearPagePrivate(page);
+	} else {
+		dout("%p invalidatepage %p idx %lu partial dirty page\n",
+		     inode, page, page->index);
+	}
+}
+
+/* just a sanity check */
+static int ceph_releasepage(struct page *page, gfp_t g)
+{
+	struct inode *inode = page->mapping ? page->mapping->host : NULL;
+	dout("%p releasepage %p idx %lu\n", inode, page, page->index);
+	WARN_ON(PageDirty(page));
+	WARN_ON(page->private);
+	WARN_ON(PagePrivate(page));
+	return 0;
+}
+
+/*
+ * read a single page, without unlocking it.
+ */
+static int readpage_nounlock(struct file *filp, struct page *page)
+{
+	struct inode *inode = filp->f_dentry->d_inode;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc;
+	int err = 0;
+	u64 len = PAGE_CACHE_SIZE;
+
+	dout("readpage inode %p file %p page %p index %lu\n",
+	     inode, filp, page, page->index);
+	err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
+				  page->index << PAGE_CACHE_SHIFT, &len,
+				  ci->i_truncate_seq, ci->i_truncate_size,
+				  &page, 1);
+	if (err == -ENOENT)
+		err = 0;
+	if (err < 0) {
+		SetPageError(page);
+		goto out;
+	} else if (err < PAGE_CACHE_SIZE) {
+		/* zero fill remainder of page */
+		zero_user_segment(page, err, PAGE_CACHE_SIZE);
+	}
+	SetPageUptodate(page);
+
+out:
+	return err < 0 ? err : 0;
+}
+
+static int ceph_readpage(struct file *filp, struct page *page)
+{
+	int r = readpage_nounlock(filp, page);
+	unlock_page(page);
+	return r;
+}
+
+/*
+ * Build a vector of contiguous pages from the provided page list.
+ */
+static struct page **page_vector_from_list(struct list_head *page_list,
+					   unsigned *nr_pages)
+{
+	struct page **pages;
+	struct page *page;
+	int next_index, contig_pages = 0;
+
+	/* build page vector */
+	pages = kmalloc(sizeof(*pages) * *nr_pages, GFP_NOFS);
+	if (!pages)
+		return ERR_PTR(-ENOMEM);
+
+	BUG_ON(list_empty(page_list));
+	next_index = list_entry(page_list->prev, struct page, lru)->index;
+	list_for_each_entry_reverse(page, page_list, lru) {
+		if (page->index == next_index) {
+			dout("readpages page %d %p\n", contig_pages, page);
+			pages[contig_pages] = page;
+			contig_pages++;
+			next_index++;
+		} else {
+			break;
+		}
+	}
+	*nr_pages = contig_pages;
+	return pages;
+}
+
+/*
+ * Read multiple pages.  Leave pages we don't read + unlock in page_list;
+ * the caller (VM) cleans them up.
+ */
+static int ceph_readpages(struct file *file, struct address_space *mapping,
+			  struct list_head *page_list, unsigned nr_pages)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc;
+	int rc = 0;
+	struct page **pages;
+	loff_t offset;
+	u64 len;
+
+	dout("readpages %p file %p nr_pages %d\n",
+	     inode, file, nr_pages);
+
+	pages = page_vector_from_list(page_list, &nr_pages);
+	if (IS_ERR(pages))
+		return PTR_ERR(pages);
+
+	/* guess read extent */
+	offset = pages[0]->index << PAGE_CACHE_SHIFT;
+	len = nr_pages << PAGE_CACHE_SHIFT;
+	rc = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
+				 offset, &len,
+				 ci->i_truncate_seq, ci->i_truncate_size,
+				 pages, nr_pages);
+	if (rc == -ENOENT)
+		rc = 0;
+	if (rc < 0)
+		goto out;
+
+	for (; !list_empty(page_list) && len > 0;
+	     rc -= PAGE_CACHE_SIZE, len -= PAGE_CACHE_SIZE) {
+		struct page *page =
+			list_entry(page_list->prev, struct page, lru);
+
+		list_del(&page->lru);
+
+		if (rc < (int)PAGE_CACHE_SIZE) {
+			/* zero (remainder of) page */
+			int s = rc < 0 ? 0 : rc;
+			zero_user_segment(page, s, PAGE_CACHE_SIZE);
+		}
+
+		if (add_to_page_cache_lru(page, mapping, page->index, GFP_NOFS)) {
+			page_cache_release(page);
+			dout("readpages %p add_to_page_cache failed %p\n",
+			     inode, page);
+			continue;
+		}
+		dout("readpages %p adding %p idx %lu\n", inode, page,
+		     page->index);
+		flush_dcache_page(page);
+		SetPageUptodate(page);
+		unlock_page(page);
+		page_cache_release(page);
+	}
+	rc = 0;
+
+out:
+	kfree(pages);
+	return rc;
+}
+
+/*
+ * Get ref for the oldest snapc for an inode with dirty data... that is, the
+ * only snap context we are allowed to write back.
+ */
+static struct ceph_snap_context *get_oldest_context(struct inode *inode,
+						    u64 *snap_size)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_snap_context *snapc = NULL;
+	struct ceph_cap_snap *capsnap = NULL;
+
+	spin_lock(&inode->i_lock);
+	list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
+		dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap,
+		     capsnap->context, capsnap->dirty_pages);
+		if (capsnap->dirty_pages) {
+			snapc = ceph_get_snap_context(capsnap->context);
+			if (snap_size)
+				*snap_size = capsnap->size;
+			break;
+		}
+	}
+	if (!snapc && ci->i_head_snapc) {
+		snapc = ceph_get_snap_context(ci->i_head_snapc);
+		dout(" head snapc %p has %d dirty pages\n",
+		     snapc, ci->i_wrbuffer_ref_head);
+	}
+	spin_unlock(&inode->i_lock);
+	return snapc;
+}
+
+/*
+ * Write a single page, but leave the page locked.
+ *
+ * If we get a write error, set the page error bit, but still adjust the
+ * dirty page accounting (i.e., page is no longer dirty).
+ */
+static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
+{
+	struct inode *inode;
+	struct ceph_inode_info *ci;
+	struct ceph_client *client;
+	struct ceph_osd_client *osdc;
+	loff_t page_off = page->index << PAGE_CACHE_SHIFT;
+	int len = PAGE_CACHE_SIZE;
+	loff_t i_size;
+	int err = 0;
+	struct ceph_snap_context *snapc, *oldest;
+	u64 snap_size = 0;
+	long writeback_stat;
+
+	dout("writepage %p idx %lu\n", page, page->index);
+
+	if (!page->mapping || !page->mapping->host) {
+		dout("writepage %p - no mapping\n", page);
+		return -EFAULT;
+	}
+	inode = page->mapping->host;
+	ci = ceph_inode(inode);
+	client = ceph_inode_to_client(inode);
+	osdc = &client->osdc;
+
+	/* verify this is a writeable snap context */
+	snapc = (void *)page->private;
+	if (snapc == NULL) {
+		dout("writepage %p page %p not dirty?\n", inode, page);
+		goto out;
+	}
+	oldest = get_oldest_context(inode, &snap_size);
+	if (snapc->seq > oldest->seq) {
+		dout("writepage %p page %p snapc %p not writeable - noop\n",
+		     inode, page, (void *)page->private);
+		/* we should only noop if called by kswapd */
+		WARN_ON((current->flags & PF_MEMALLOC) == 0);
+		ceph_put_snap_context(oldest);
+		goto out;
+	}
+	ceph_put_snap_context(oldest);
+
+	/* is this a partial page at end of file? */
+	if (snap_size)
+		i_size = snap_size;
+	else
+		i_size = i_size_read(inode);
+	if (i_size < page_off + len)
+		len = i_size - page_off;
+
+	dout("writepage %p page %p index %lu on %llu~%u\n",
+	     inode, page, page->index, page_off, len);
+
+	writeback_stat = atomic_long_inc_return(&client->writeback_count);
+	if (writeback_stat >
+	    CONGESTION_ON_THRESH(client->mount_args->congestion_kb))
+		set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC);
+
+	set_page_writeback(page);
+	err = ceph_osdc_writepages(osdc, ceph_vino(inode),
+				   &ci->i_layout, snapc,
+				   page_off, len,
+				   ci->i_truncate_seq, ci->i_truncate_size,
+				   &inode->i_mtime,
+				   &page, 1, 0, 0, true);
+	if (err < 0) {
+		dout("writepage setting page/mapping error %d %p\n", err, page);
+		SetPageError(page);
+		mapping_set_error(&inode->i_data, err);
+		if (wbc)
+			wbc->pages_skipped++;
+	} else {
+		dout("writepage cleaned page %p\n", page);
+		err = 0;  /* vfs expects us to return 0 */
+	}
+	page->private = 0;
+	ClearPagePrivate(page);
+	end_page_writeback(page);
+	ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
+	ceph_put_snap_context(snapc);  /* page's reference */
+out:
+	return err;
+}
+
+static int ceph_writepage(struct page *page, struct writeback_control *wbc)
+{
+	int err;
+	struct inode *inode = page->mapping->host;
+	BUG_ON(!inode);
+	igrab(inode);
+	err = writepage_nounlock(page, wbc);
+	unlock_page(page);
+	iput(inode);
+	return err;
+}
+
+
+/*
+ * lame release_pages helper.  release_pages() isn't exported to
+ * modules.
+ */
+static void ceph_release_pages(struct page **pages, int num)
+{
+	struct pagevec pvec;
+	int i;
+
+	pagevec_init(&pvec, 0);
+	for (i = 0; i < num; i++) {
+		if (pagevec_add(&pvec, pages[i]) == 0)
+			pagevec_release(&pvec);
+	}
+	pagevec_release(&pvec);
+}
+
+
+/*
+ * async writeback completion handler.
+ *
+ * If we get an error, set the mapping error bit, but not the individual
+ * page error bits.
+ */
+static void writepages_finish(struct ceph_osd_request *req,
+			      struct ceph_msg *msg)
+{
+	struct inode *inode = req->r_inode;
+	struct ceph_osd_reply_head *replyhead;
+	struct ceph_osd_op *op;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	unsigned wrote;
+	struct page *page;
+	int i;
+	struct ceph_snap_context *snapc = req->r_snapc;
+	struct address_space *mapping = inode->i_mapping;
+	__s32 rc = -EIO;
+	u64 bytes = 0;
+	struct ceph_client *client = ceph_inode_to_client(inode);
+	long writeback_stat;
+	unsigned issued = ceph_caps_issued(ci);
+
+	/* parse reply */
+	replyhead = msg->front.iov_base;
+	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
+	op = (void *)(replyhead + 1);
+	rc = le32_to_cpu(replyhead->result);
+	bytes = le64_to_cpu(op->extent.length);
+
+	if (rc >= 0) {
+		/*
+		 * Assume we wrote the pages we originally sent.  The
+		 * osd might reply with fewer pages if our writeback
+		 * raced with a truncation and was adjusted at the osd,
+		 * so don't believe the reply.
+		 */
+		wrote = req->r_num_pages;
+	} else {
+		wrote = 0;
+		mapping_set_error(mapping, rc);
+	}
+	dout("writepages_finish %p rc %d bytes %llu wrote %d (pages)\n",
+	     inode, rc, bytes, wrote);
+
+	/* clean all pages */
+	for (i = 0; i < req->r_num_pages; i++) {
+		page = req->r_pages[i];
+		BUG_ON(!page);
+		WARN_ON(!PageUptodate(page));
+
+		writeback_stat =
+			atomic_long_dec_return(&client->writeback_count);
+		if (writeback_stat <
+		    CONGESTION_OFF_THRESH(client->mount_args->congestion_kb))
+			clear_bdi_congested(&client->backing_dev_info,
+					    BLK_RW_ASYNC);
+
+		ceph_put_snap_context((void *)page->private);
+		page->private = 0;
+		ClearPagePrivate(page);
+		dout("unlocking %d %p\n", i, page);
+		end_page_writeback(page);
+
+		/*
+		 * We lost the cache cap, need to truncate the page before
+		 * it is unlocked, otherwise we'd truncate it later in the
+		 * page truncation thread, possibly losing some data that
+		 * raced its way in
+		 */
+		if ((issued & CEPH_CAP_FILE_CACHE) == 0)
+			generic_error_remove_page(inode->i_mapping, page);
+
+		unlock_page(page);
+	}
+	dout("%p wrote+cleaned %d pages\n", inode, wrote);
+	ceph_put_wrbuffer_cap_refs(ci, req->r_num_pages, snapc);
+
+	ceph_release_pages(req->r_pages, req->r_num_pages);
+	if (req->r_pages_from_pool)
+		mempool_free(req->r_pages,
+			     ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool);
+	else
+		kfree(req->r_pages);
+	ceph_osdc_put_request(req);
+}
+
+/*
+ * allocate a page vec, either directly, or if necessary, via a the
+ * mempool.  we avoid the mempool if we can because req->r_num_pages
+ * may be less than the maximum write size.
+ */
+static void alloc_page_vec(struct ceph_client *client,
+			   struct ceph_osd_request *req)
+{
+	req->r_pages = kmalloc(sizeof(struct page *) * req->r_num_pages,
+			       GFP_NOFS);
+	if (!req->r_pages) {
+		req->r_pages = mempool_alloc(client->wb_pagevec_pool, GFP_NOFS);
+		req->r_pages_from_pool = 1;
+		WARN_ON(!req->r_pages);
+	}
+}
+
+/*
+ * initiate async writeback
+ */
+static int ceph_writepages_start(struct address_space *mapping,
+				 struct writeback_control *wbc)
+{
+	struct inode *inode = mapping->host;
+	struct backing_dev_info *bdi = mapping->backing_dev_info;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_client *client;
+	pgoff_t index, start, end;
+	int range_whole = 0;
+	int should_loop = 1;
+	pgoff_t max_pages = 0, max_pages_ever = 0;
+	struct ceph_snap_context *snapc = NULL, *last_snapc = NULL, *pgsnapc;
+	struct pagevec pvec;
+	int done = 0;
+	int rc = 0;
+	unsigned wsize = 1 << inode->i_blkbits;
+	struct ceph_osd_request *req = NULL;
+	int do_sync;
+	u64 snap_size = 0;
+
+	/*
+	 * Include a 'sync' in the OSD request if this is a data
+	 * integrity write (e.g., O_SYNC write or fsync()), or if our
+	 * cap is being revoked.
+	 */
+	do_sync = wbc->sync_mode == WB_SYNC_ALL;
+	if (ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER))
+		do_sync = 1;
+	dout("writepages_start %p dosync=%d (mode=%s)\n",
+	     inode, do_sync,
+	     wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
+	     (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
+
+	client = ceph_inode_to_client(inode);
+	if (client->mount_state == CEPH_MOUNT_SHUTDOWN) {
+		pr_warning("writepage_start %p on forced umount\n", inode);
+		return -EIO; /* we're in a forced umount, don't write! */
+	}
+	if (client->mount_args->wsize && client->mount_args->wsize < wsize)
+		wsize = client->mount_args->wsize;
+	if (wsize < PAGE_CACHE_SIZE)
+		wsize = PAGE_CACHE_SIZE;
+	max_pages_ever = wsize >> PAGE_CACHE_SHIFT;
+
+	pagevec_init(&pvec, 0);
+
+	/* ?? */
+	if (wbc->nonblocking && bdi_write_congested(bdi)) {
+		dout(" writepages congested\n");
+		wbc->encountered_congestion = 1;
+		goto out_final;
+	}
+
+	/* where to start/end? */
+	if (wbc->range_cyclic) {
+		start = mapping->writeback_index; /* Start from prev offset */
+		end = -1;
+		dout(" cyclic, start at %lu\n", start);
+	} else {
+		start = wbc->range_start >> PAGE_CACHE_SHIFT;
+		end = wbc->range_end >> PAGE_CACHE_SHIFT;
+		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+			range_whole = 1;
+		should_loop = 0;
+		dout(" not cyclic, %lu to %lu\n", start, end);
+	}
+	index = start;
+
+retry:
+	/* find oldest snap context with dirty data */
+	ceph_put_snap_context(snapc);
+	snapc = get_oldest_context(inode, &snap_size);
+	if (!snapc) {
+		/* hmm, why does writepages get called when there
+		   is no dirty data? */
+		dout(" no snap context with dirty data?\n");
+		goto out;
+	}
+	dout(" oldest snapc is %p seq %lld (%d snaps)\n",
+	     snapc, snapc->seq, snapc->num_snaps);
+	if (last_snapc && snapc != last_snapc) {
+		/* if we switched to a newer snapc, restart our scan at the
+		 * start of the original file range. */
+		dout("  snapc differs from last pass, restarting at %lu\n",
+		     index);
+		index = start;
+	}
+	last_snapc = snapc;
+
+	while (!done && index <= end) {
+		unsigned i;
+		int first;
+		pgoff_t next;
+		int pvec_pages, locked_pages;
+		struct page *page;
+		int want;
+		u64 offset, len;
+		struct ceph_osd_request_head *reqhead;
+		struct ceph_osd_op *op;
+		long writeback_stat;
+
+		next = 0;
+		locked_pages = 0;
+		max_pages = max_pages_ever;
+
+get_more_pages:
+		first = -1;
+		want = min(end - index,
+			   min((pgoff_t)PAGEVEC_SIZE,
+			       max_pages - (pgoff_t)locked_pages) - 1)
+			+ 1;
+		pvec_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+						PAGECACHE_TAG_DIRTY,
+						want);
+		dout("pagevec_lookup_tag got %d\n", pvec_pages);
+		if (!pvec_pages && !locked_pages)
+			break;
+		for (i = 0; i < pvec_pages && locked_pages < max_pages; i++) {
+			page = pvec.pages[i];
+			dout("? %p idx %lu\n", page, page->index);
+			if (locked_pages == 0)
+				lock_page(page);  /* first page */
+			else if (!trylock_page(page))
+				break;
+
+			/* only dirty pages, or our accounting breaks */
+			if (unlikely(!PageDirty(page)) ||
+			    unlikely(page->mapping != mapping)) {
+				dout("!dirty or !mapping %p\n", page);
+				unlock_page(page);
+				break;
+			}
+			if (!wbc->range_cyclic && page->index > end) {
+				dout("end of range %p\n", page);
+				done = 1;
+				unlock_page(page);
+				break;
+			}
+			if (next && (page->index != next)) {
+				dout("not consecutive %p\n", page);
+				unlock_page(page);
+				break;
+			}
+			if (wbc->sync_mode != WB_SYNC_NONE) {
+				dout("waiting on writeback %p\n", page);
+				wait_on_page_writeback(page);
+			}
+			if ((snap_size && page_offset(page) > snap_size) ||
+			    (!snap_size &&
+			     page_offset(page) > i_size_read(inode))) {
+				dout("%p page eof %llu\n", page, snap_size ?
+				     snap_size : i_size_read(inode));
+				done = 1;
+				unlock_page(page);
+				break;
+			}
+			if (PageWriteback(page)) {
+				dout("%p under writeback\n", page);
+				unlock_page(page);
+				break;
+			}
+
+			/* only if matching snap context */
+			pgsnapc = (void *)page->private;
+			if (pgsnapc->seq > snapc->seq) {
+				dout("page snapc %p %lld > oldest %p %lld\n",
+				     pgsnapc, pgsnapc->seq, snapc, snapc->seq);
+				unlock_page(page);
+				if (!locked_pages)
+					continue; /* keep looking for snap */
+				break;
+			}
+
+			if (!clear_page_dirty_for_io(page)) {
+				dout("%p !clear_page_dirty_for_io\n", page);
+				unlock_page(page);
+				break;
+			}
+
+			/* ok */
+			if (locked_pages == 0) {
+				/* prepare async write request */
+				offset = page->index << PAGE_CACHE_SHIFT;
+				len = wsize;
+				req = ceph_osdc_new_request(&client->osdc,
+					    &ci->i_layout,
+					    ceph_vino(inode),
+					    offset, &len,
+					    CEPH_OSD_OP_WRITE,
+					    CEPH_OSD_FLAG_WRITE |
+						    CEPH_OSD_FLAG_ONDISK,
+					    snapc, do_sync,
+					    ci->i_truncate_seq,
+					    ci->i_truncate_size,
+					    &inode->i_mtime, true, 1);
+				max_pages = req->r_num_pages;
+
+				alloc_page_vec(client, req);
+				req->r_callback = writepages_finish;
+				req->r_inode = inode;
+			}
+
+			/* note position of first page in pvec */
+			if (first < 0)
+				first = i;
+			dout("%p will write page %p idx %lu\n",
+			     inode, page, page->index);
+
+			writeback_stat = atomic_long_inc_return(&client->writeback_count);
+			if (writeback_stat > CONGESTION_ON_THRESH(client->mount_args->congestion_kb)) {
+				set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC);
+			}
+
+			set_page_writeback(page);
+			req->r_pages[locked_pages] = page;
+			locked_pages++;
+			next = page->index + 1;
+		}
+
+		/* did we get anything? */
+		if (!locked_pages)
+			goto release_pvec_pages;
+		if (i) {
+			int j;
+			BUG_ON(!locked_pages || first < 0);
+
+			if (pvec_pages && i == pvec_pages &&
+			    locked_pages < max_pages) {
+				dout("reached end pvec, trying for more\n");
+				pagevec_reinit(&pvec);
+				goto get_more_pages;
+			}
+
+			/* shift unused pages over in the pvec...  we
+			 * will need to release them below. */
+			for (j = i; j < pvec_pages; j++) {
+				dout(" pvec leftover page %p\n",
+				     pvec.pages[j]);
+				pvec.pages[j-i+first] = pvec.pages[j];
+			}
+			pvec.nr -= i-first;
+		}
+
+		/* submit the write */
+		offset = req->r_pages[0]->index << PAGE_CACHE_SHIFT;
+		len = min((snap_size ? snap_size : i_size_read(inode)) - offset,
+			  (u64)locked_pages << PAGE_CACHE_SHIFT);
+		dout("writepages got %d pages at %llu~%llu\n",
+		     locked_pages, offset, len);
+
+		/* revise final length, page count */
+		req->r_num_pages = locked_pages;
+		reqhead = req->r_request->front.iov_base;
+		op = (void *)(reqhead + 1);
+		op->extent.length = cpu_to_le64(len);
+		op->payload_len = cpu_to_le32(len);
+		req->r_request->hdr.data_len = cpu_to_le32(len);
+
+		ceph_osdc_start_request(&client->osdc, req, true);
+		req = NULL;
+
+		/* continue? */
+		index = next;
+		wbc->nr_to_write -= locked_pages;
+		if (wbc->nr_to_write <= 0)
+			done = 1;
+
+release_pvec_pages:
+		dout("pagevec_release on %d pages (%p)\n", (int)pvec.nr,
+		     pvec.nr ? pvec.pages[0] : NULL);
+		pagevec_release(&pvec);
+
+		if (locked_pages && !done)
+			goto retry;
+	}
+
+	if (should_loop && !done) {
+		/* more to do; loop back to beginning of file */
+		dout("writepages looping back to beginning of file\n");
+		should_loop = 0;
+		index = 0;
+		goto retry;
+	}
+
+	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
+		mapping->writeback_index = index;
+
+out:
+	if (req)
+		ceph_osdc_put_request(req);
+	if (rc > 0)
+		rc = 0;  /* vfs expects us to return 0 */
+	ceph_put_snap_context(snapc);
+	dout("writepages done, rc = %d\n", rc);
+out_final:
+	return rc;
+}
+
+
+
+/*
+ * See if a given @snapc is either writeable, or already written.
+ */
+static int context_is_writeable_or_written(struct inode *inode,
+					   struct ceph_snap_context *snapc)
+{
+	struct ceph_snap_context *oldest = get_oldest_context(inode, NULL);
+	int ret = !oldest || snapc->seq <= oldest->seq;
+
+	ceph_put_snap_context(oldest);
+	return ret;
+}
+
+/*
+ * We are only allowed to write into/dirty the page if the page is
+ * clean, or already dirty within the same snap context.
+ *
+ * called with page locked.
+ * return success with page locked,
+ * or any failure (incl -EAGAIN) with page unlocked.
+ */
+static int ceph_update_writeable_page(struct file *file,
+			    loff_t pos, unsigned len,
+			    struct page *page)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
+	loff_t page_off = pos & PAGE_CACHE_MASK;
+	int pos_in_page = pos & ~PAGE_CACHE_MASK;
+	int end_in_page = pos_in_page + len;
+	loff_t i_size;
+	int r;
+	struct ceph_snap_context *snapc, *oldest;
+
+retry_locked:
+	/* writepages currently holds page lock, but if we change that later, */
+	wait_on_page_writeback(page);
+
+	/* check snap context */
+	BUG_ON(!ci->i_snap_realm);
+	down_read(&mdsc->snap_rwsem);
+	BUG_ON(!ci->i_snap_realm->cached_context);
+	snapc = (void *)page->private;
+	if (snapc && snapc != ci->i_head_snapc) {
+		/*
+		 * this page is already dirty in another (older) snap
+		 * context!  is it writeable now?
+		 */
+		oldest = get_oldest_context(inode, NULL);
+		up_read(&mdsc->snap_rwsem);
+
+		if (snapc->seq > oldest->seq) {
+			ceph_put_snap_context(oldest);
+			dout(" page %p snapc %p not current or oldest\n",
+			     page, snapc);
+			/*
+			 * queue for writeback, and wait for snapc to
+			 * be writeable or written
+			 */
+			snapc = ceph_get_snap_context(snapc);
+			unlock_page(page);
+			ceph_queue_writeback(inode);
+			r = wait_event_interruptible(ci->i_cap_wq,
+			       context_is_writeable_or_written(inode, snapc));
+			ceph_put_snap_context(snapc);
+			if (r == -ERESTARTSYS)
+				return r;
+			return -EAGAIN;
+		}
+		ceph_put_snap_context(oldest);
+
+		/* yay, writeable, do it now (without dropping page lock) */
+		dout(" page %p snapc %p not current, but oldest\n",
+		     page, snapc);
+		if (!clear_page_dirty_for_io(page))
+			goto retry_locked;
+		r = writepage_nounlock(page, NULL);
+		if (r < 0)
+			goto fail_nosnap;
+		goto retry_locked;
+	}
+
+	if (PageUptodate(page)) {
+		dout(" page %p already uptodate\n", page);
+		return 0;
+	}
+
+	/* full page? */
+	if (pos_in_page == 0 && len == PAGE_CACHE_SIZE)
+		return 0;
+
+	/* past end of file? */
+	i_size = inode->i_size;   /* caller holds i_mutex */
+
+	if (i_size + len > inode->i_sb->s_maxbytes) {
+		/* file is too big */
+		r = -EINVAL;
+		goto fail;
+	}
+
+	if (page_off >= i_size ||
+	    (pos_in_page == 0 && (pos+len) >= i_size &&
+	     end_in_page - pos_in_page != PAGE_CACHE_SIZE)) {
+		dout(" zeroing %p 0 - %d and %d - %d\n",
+		     page, pos_in_page, end_in_page, (int)PAGE_CACHE_SIZE);
+		zero_user_segments(page,
+				   0, pos_in_page,
+				   end_in_page, PAGE_CACHE_SIZE);
+		return 0;
+	}
+
+	/* we need to read it. */
+	up_read(&mdsc->snap_rwsem);
+	r = readpage_nounlock(file, page);
+	if (r < 0)
+		goto fail_nosnap;
+	goto retry_locked;
+
+fail:
+	up_read(&mdsc->snap_rwsem);
+fail_nosnap:
+	unlock_page(page);
+	return r;
+}
+
+/*
+ * We are only allowed to write into/dirty the page if the page is
+ * clean, or already dirty within the same snap context.
+ */
+static int ceph_write_begin(struct file *file, struct address_space *mapping,
+			    loff_t pos, unsigned len, unsigned flags,
+			    struct page **pagep, void **fsdata)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	struct page *page;
+	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+	int r;
+
+	do {
+		/* get a page */
+		page = grab_cache_page_write_begin(mapping, index, 0);
+		if (!page)
+			return -ENOMEM;
+		*pagep = page;
+
+		dout("write_begin file %p inode %p page %p %d~%d\n", file,
+	     	inode, page, (int)pos, (int)len);
+
+		r = ceph_update_writeable_page(file, pos, len, page);
+	} while (r == -EAGAIN);
+
+	return r;
+}
+
+/*
+ * we don't do anything in here that simple_write_end doesn't do
+ * except adjust dirty page accounting and drop read lock on
+ * mdsc->snap_rwsem.
+ */
+static int ceph_write_end(struct file *file, struct address_space *mapping,
+			  loff_t pos, unsigned len, unsigned copied,
+			  struct page *page, void *fsdata)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	struct ceph_client *client = ceph_inode_to_client(inode);
+	struct ceph_mds_client *mdsc = &client->mdsc;
+	unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+	int check_cap = 0;
+
+	dout("write_end file %p inode %p page %p %d~%d (%d)\n", file,
+	     inode, page, (int)pos, (int)copied, (int)len);
+
+	/* zero the stale part of the page if we did a short copy */
+	if (copied < len)
+		zero_user_segment(page, from+copied, len);
+
+	/* did file size increase? */
+	/* (no need for i_size_read(); we caller holds i_mutex */
+	if (pos+copied > inode->i_size)
+		check_cap = ceph_inode_set_size(inode, pos+copied);
+
+	if (!PageUptodate(page))
+		SetPageUptodate(page);
+
+	set_page_dirty(page);
+
+	unlock_page(page);
+	up_read(&mdsc->snap_rwsem);
+	page_cache_release(page);
+
+	if (check_cap)
+		ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL);
+
+	return copied;
+}
+
+/*
+ * we set .direct_IO to indicate direct io is supported, but since we
+ * intercept O_DIRECT reads and writes early, this function should
+ * never get called.
+ */
+static ssize_t ceph_direct_io(int rw, struct kiocb *iocb,
+			      const struct iovec *iov,
+			      loff_t pos, unsigned long nr_segs)
+{
+	WARN_ON(1);
+	return -EINVAL;
+}
+
+const struct address_space_operations ceph_aops = {
+	.readpage = ceph_readpage,
+	.readpages = ceph_readpages,
+	.writepage = ceph_writepage,
+	.writepages = ceph_writepages_start,
+	.write_begin = ceph_write_begin,
+	.write_end = ceph_write_end,
+	.set_page_dirty = ceph_set_page_dirty,
+	.invalidatepage = ceph_invalidatepage,
+	.releasepage = ceph_releasepage,
+	.direct_IO = ceph_direct_io,
+};
+
+
+/*
+ * vm ops
+ */
+
+/*
+ * Reuse write_begin here for simplicity.
+ */
+static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct inode *inode = vma->vm_file->f_dentry->d_inode;
+	struct page *page = vmf->page;
+	struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
+	loff_t off = page->index << PAGE_CACHE_SHIFT;
+	loff_t size, len;
+	int ret;
+
+	size = i_size_read(inode);
+	if (off + PAGE_CACHE_SIZE <= size)
+		len = PAGE_CACHE_SIZE;
+	else
+		len = size & ~PAGE_CACHE_MASK;
+
+	dout("page_mkwrite %p %llu~%llu page %p idx %lu\n", inode,
+	     off, len, page, page->index);
+
+	lock_page(page);
+
+	ret = VM_FAULT_NOPAGE;
+	if ((off > size) ||
+	    (page->mapping != inode->i_mapping))
+		goto out;
+
+	ret = ceph_update_writeable_page(vma->vm_file, off, len, page);
+	if (ret == 0) {
+		/* success.  we'll keep the page locked. */
+		set_page_dirty(page);
+		up_read(&mdsc->snap_rwsem);
+		ret = VM_FAULT_LOCKED;
+	} else {
+		if (ret == -ENOMEM)
+			ret = VM_FAULT_OOM;
+		else
+			ret = VM_FAULT_SIGBUS;
+	}
+out:
+	dout("page_mkwrite %p %llu~%llu = %d\n", inode, off, len, ret);
+	if (ret != VM_FAULT_LOCKED)
+		unlock_page(page);
+	return ret;
+}
+
+static struct vm_operations_struct ceph_vmops = {
+	.fault		= filemap_fault,
+	.page_mkwrite	= ceph_page_mkwrite,
+};
+
+int ceph_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	struct address_space *mapping = file->f_mapping;
+
+	if (!mapping->a_ops->readpage)
+		return -ENOEXEC;
+	file_accessed(file);
+	vma->vm_ops = &ceph_vmops;
+	vma->vm_flags |= VM_CAN_NONLINEAR;
+	return 0;
+}
diff --git a/fs/ceph/armor.c b/fs/ceph/armor.c
new file mode 100644
index 000000000000..67b2c030924b
--- /dev/null
+++ b/fs/ceph/armor.c
@@ -0,0 +1,99 @@
+
+#include <linux/errno.h>
+
+/*
+ * base64 encode/decode.
+ */
+
+const char *pem_key = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+
+static int encode_bits(int c)
+{
+	return pem_key[c];
+}
+
+static int decode_bits(char c)
+{
+	if (c >= 'A' && c <= 'Z')
+		return c - 'A';
+	if (c >= 'a' && c <= 'z')
+		return c - 'a' + 26;
+	if (c >= '0' && c <= '9')
+		return c - '0' + 52;
+	if (c == '+')
+		return 62;
+	if (c == '/')
+		return 63;
+	if (c == '=')
+		return 0; /* just non-negative, please */
+	return -EINVAL;
+}
+
+int ceph_armor(char *dst, const char *src, const char *end)
+{
+	int olen = 0;
+	int line = 0;
+
+	while (src < end) {
+		unsigned char a, b, c;
+
+		a = *src++;
+		*dst++ = encode_bits(a >> 2);
+		if (src < end) {
+			b = *src++;
+			*dst++ = encode_bits(((a & 3) << 4) | (b >> 4));
+			if (src < end) {
+				c = *src++;
+				*dst++ = encode_bits(((b & 15) << 2) |
+						     (c >> 6));
+				*dst++ = encode_bits(c & 63);
+			} else {
+				*dst++ = encode_bits((b & 15) << 2);
+				*dst++ = '=';
+			}
+		} else {
+			*dst++ = encode_bits(((a & 3) << 4));
+			*dst++ = '=';
+			*dst++ = '=';
+		}
+		olen += 4;
+		line += 4;
+		if (line == 64) {
+			line = 0;
+			*(dst++) = '\n';
+			olen++;
+		}
+	}
+	return olen;
+}
+
+int ceph_unarmor(char *dst, const char *src, const char *end)
+{
+	int olen = 0;
+
+	while (src < end) {
+		int a, b, c, d;
+
+		if (src < end && src[0] == '\n')
+			src++;
+		if (src + 4 > end)
+			return -EINVAL;
+		a = decode_bits(src[0]);
+		b = decode_bits(src[1]);
+		c = decode_bits(src[2]);
+		d = decode_bits(src[3]);
+		if (a < 0 || b < 0 || c < 0 || d < 0)
+			return -EINVAL;
+
+		*dst++ = (a << 2) | (b >> 4);
+		if (src[2] == '=')
+			return olen + 1;
+		*dst++ = ((b & 15) << 4) | (c >> 2);
+		if (src[3] == '=')
+			return olen + 2;
+		*dst++ = ((c & 3) << 6) | d;
+		olen += 3;
+		src += 4;
+	}
+	return olen;
+}
diff --git a/fs/ceph/auth.c b/fs/ceph/auth.c
new file mode 100644
index 000000000000..89490beaf537
--- /dev/null
+++ b/fs/ceph/auth.c
@@ -0,0 +1,259 @@
+#include "ceph_debug.h"
+
+#include <linux/module.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+
+#include "types.h"
+#include "auth_none.h"
+#include "auth_x.h"
+#include "decode.h"
+#include "super.h"
+
+#include "messenger.h"
+
+/*
+ * get protocol handler
+ */
+static u32 supported_protocols[] = {
+	CEPH_AUTH_NONE,
+	CEPH_AUTH_CEPHX
+};
+
+int ceph_auth_init_protocol(struct ceph_auth_client *ac, int protocol)
+{
+	switch (protocol) {
+	case CEPH_AUTH_NONE:
+		return ceph_auth_none_init(ac);
+	case CEPH_AUTH_CEPHX:
+		return ceph_x_init(ac);
+	default:
+		return -ENOENT;
+	}
+}
+
+/*
+ * setup, teardown.
+ */
+struct ceph_auth_client *ceph_auth_init(const char *name, const char *secret)
+{
+	struct ceph_auth_client *ac;
+	int ret;
+
+	dout("auth_init name '%s' secret '%s'\n", name, secret);
+
+	ret = -ENOMEM;
+	ac = kzalloc(sizeof(*ac), GFP_NOFS);
+	if (!ac)
+		goto out;
+
+	ac->negotiating = true;
+	if (name)
+		ac->name = name;
+	else
+		ac->name = CEPH_AUTH_NAME_DEFAULT;
+	dout("auth_init name %s secret %s\n", ac->name, secret);
+	ac->secret = secret;
+	return ac;
+
+out:
+	return ERR_PTR(ret);
+}
+
+void ceph_auth_destroy(struct ceph_auth_client *ac)
+{
+	dout("auth_destroy %p\n", ac);
+	if (ac->ops)
+		ac->ops->destroy(ac);
+	kfree(ac);
+}
+
+/*
+ * Reset occurs when reconnecting to the monitor.
+ */
+void ceph_auth_reset(struct ceph_auth_client *ac)
+{
+	dout("auth_reset %p\n", ac);
+	if (ac->ops && !ac->negotiating)
+		ac->ops->reset(ac);
+	ac->negotiating = true;
+}
+
+int ceph_entity_name_encode(const char *name, void **p, void *end)
+{
+	int len = strlen(name);
+
+	if (*p + 2*sizeof(u32) + len > end)
+		return -ERANGE;
+	ceph_encode_32(p, CEPH_ENTITY_TYPE_CLIENT);
+	ceph_encode_32(p, len);
+	ceph_encode_copy(p, name, len);
+	return 0;
+}
+
+/*
+ * Initiate protocol negotiation with monitor.  Include entity name
+ * and list supported protocols.
+ */
+int ceph_auth_build_hello(struct ceph_auth_client *ac, void *buf, size_t len)
+{
+	struct ceph_mon_request_header *monhdr = buf;
+	void *p = monhdr + 1, *end = buf + len, *lenp;
+	int i, num;
+	int ret;
+
+	dout("auth_build_hello\n");
+	monhdr->have_version = 0;
+	monhdr->session_mon = cpu_to_le16(-1);
+	monhdr->session_mon_tid = 0;
+
+	ceph_encode_32(&p, 0);  /* no protocol, yet */
+
+	lenp = p;
+	p += sizeof(u32);
+
+	ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
+	ceph_encode_8(&p, 1);
+	num = ARRAY_SIZE(supported_protocols);
+	ceph_encode_32(&p, num);
+	ceph_decode_need(&p, end, num * sizeof(u32), bad);
+	for (i = 0; i < num; i++)
+		ceph_encode_32(&p, supported_protocols[i]);
+
+	ret = ceph_entity_name_encode(ac->name, &p, end);
+	if (ret < 0)
+		return ret;
+	ceph_decode_need(&p, end, sizeof(u64), bad);
+	ceph_encode_64(&p, ac->global_id);
+
+	ceph_encode_32(&lenp, p - lenp - sizeof(u32));
+	return p - buf;
+
+bad:
+	return -ERANGE;
+}
+
+int ceph_build_auth_request(struct ceph_auth_client *ac,
+			   void *msg_buf, size_t msg_len)
+{
+	struct ceph_mon_request_header *monhdr = msg_buf;
+	void *p = monhdr + 1;
+	void *end = msg_buf + msg_len;
+	int ret;
+
+	monhdr->have_version = 0;
+	monhdr->session_mon = cpu_to_le16(-1);
+	monhdr->session_mon_tid = 0;
+
+	ceph_encode_32(&p, ac->protocol);
+
+	ret = ac->ops->build_request(ac, p + sizeof(u32), end);
+	if (ret < 0) {
+		pr_err("error %d building auth method %s request\n", ret,
+		       ac->ops->name);
+		return ret;
+	}
+	dout(" built request %d bytes\n", ret);
+	ceph_encode_32(&p, ret);
+	return p + ret - msg_buf;
+}
+
+/*
+ * Handle auth message from monitor.
+ */
+int ceph_handle_auth_reply(struct ceph_auth_client *ac,
+			   void *buf, size_t len,
+			   void *reply_buf, size_t reply_len)
+{
+	void *p = buf;
+	void *end = buf + len;
+	int protocol;
+	s32 result;
+	u64 global_id;
+	void *payload, *payload_end;
+	int payload_len;
+	char *result_msg;
+	int result_msg_len;
+	int ret = -EINVAL;
+
+	dout("handle_auth_reply %p %p\n", p, end);
+	ceph_decode_need(&p, end, sizeof(u32) * 3 + sizeof(u64), bad);
+	protocol = ceph_decode_32(&p);
+	result = ceph_decode_32(&p);
+	global_id = ceph_decode_64(&p);
+	payload_len = ceph_decode_32(&p);
+	payload = p;
+	p += payload_len;
+	ceph_decode_need(&p, end, sizeof(u32), bad);
+	result_msg_len = ceph_decode_32(&p);
+	result_msg = p;
+	p += result_msg_len;
+	if (p != end)
+		goto bad;
+
+	dout(" result %d '%.*s' gid %llu len %d\n", result, result_msg_len,
+	     result_msg, global_id, payload_len);
+
+	payload_end = payload + payload_len;
+
+	if (global_id && ac->global_id != global_id) {
+		dout(" set global_id %lld -> %lld\n", ac->global_id, global_id);
+		ac->global_id = global_id;
+	}
+
+	if (ac->negotiating) {
+		/* server does not support our protocols? */
+		if (!protocol && result < 0) {
+			ret = result;
+			goto out;
+		}
+		/* set up (new) protocol handler? */
+		if (ac->protocol && ac->protocol != protocol) {
+			ac->ops->destroy(ac);
+			ac->protocol = 0;
+			ac->ops = NULL;
+		}
+		if (ac->protocol != protocol) {
+			ret = ceph_auth_init_protocol(ac, protocol);
+			if (ret) {
+				pr_err("error %d on auth protocol %d init\n",
+				       ret, protocol);
+				goto out;
+			}
+		}
+
+		ac->negotiating = false;
+	}
+
+	ret = ac->ops->handle_reply(ac, result, payload, payload_end);
+	if (ret == -EAGAIN) {
+		return ceph_build_auth_request(ac, reply_buf, reply_len);
+	} else if (ret) {
+		pr_err("auth method '%s' error %d\n", ac->ops->name, ret);
+		return ret;
+	}
+	return 0;
+
+bad:
+	pr_err("failed to decode auth msg\n");
+out:
+	return ret;
+}
+
+int ceph_build_auth(struct ceph_auth_client *ac,
+		    void *msg_buf, size_t msg_len)
+{
+	if (!ac->protocol)
+		return ceph_auth_build_hello(ac, msg_buf, msg_len);
+	BUG_ON(!ac->ops);
+	if (ac->ops->should_authenticate(ac))
+		return ceph_build_auth_request(ac, msg_buf, msg_len);
+	return 0;
+}
+
+int ceph_auth_is_authenticated(struct ceph_auth_client *ac)
+{
+	if (!ac->ops)
+		return 0;
+	return ac->ops->is_authenticated(ac);
+}
diff --git a/fs/ceph/auth.h b/fs/ceph/auth.h
new file mode 100644
index 000000000000..d38a2fb4a137
--- /dev/null
+++ b/fs/ceph/auth.h
@@ -0,0 +1,92 @@
+#ifndef _FS_CEPH_AUTH_H
+#define _FS_CEPH_AUTH_H
+
+#include "types.h"
+#include "buffer.h"
+
+/*
+ * Abstract interface for communicating with the authenticate module.
+ * There is some handshake that takes place between us and the monitor
+ * to acquire the necessary keys.  These are used to generate an
+ * 'authorizer' that we use when connecting to a service (mds, osd).
+ */
+
+struct ceph_auth_client;
+struct ceph_authorizer;
+
+struct ceph_auth_client_ops {
+	const char *name;
+
+	/*
+	 * true if we are authenticated and can connect to
+	 * services.
+	 */
+	int (*is_authenticated)(struct ceph_auth_client *ac);
+
+	/*
+	 * true if we should (re)authenticate, e.g., when our tickets
+	 * are getting old and crusty.
+	 */
+	int (*should_authenticate)(struct ceph_auth_client *ac);
+
+	/*
+	 * build requests and process replies during monitor
+	 * handshake.  if handle_reply returns -EAGAIN, we build
+	 * another request.
+	 */
+	int (*build_request)(struct ceph_auth_client *ac, void *buf, void *end);
+	int (*handle_reply)(struct ceph_auth_client *ac, int result,
+			    void *buf, void *end);
+
+	/*
+	 * Create authorizer for connecting to a service, and verify
+	 * the response to authenticate the service.
+	 */
+	int (*create_authorizer)(struct ceph_auth_client *ac, int peer_type,
+				 struct ceph_authorizer **a,
+				 void **buf, size_t *len,
+				 void **reply_buf, size_t *reply_len);
+	int (*verify_authorizer_reply)(struct ceph_auth_client *ac,
+				       struct ceph_authorizer *a, size_t len);
+	void (*destroy_authorizer)(struct ceph_auth_client *ac,
+				   struct ceph_authorizer *a);
+	void (*invalidate_authorizer)(struct ceph_auth_client *ac,
+				      int peer_type);
+
+	/* reset when we (re)connect to a monitor */
+	void (*reset)(struct ceph_auth_client *ac);
+
+	void (*destroy)(struct ceph_auth_client *ac);
+};
+
+struct ceph_auth_client {
+	u32 protocol;           /* CEPH_AUTH_* */
+	void *private;          /* for use by protocol implementation */
+	const struct ceph_auth_client_ops *ops;  /* null iff protocol==0 */
+
+	bool negotiating;       /* true if negotiating protocol */
+	const char *name;       /* entity name */
+	u64 global_id;          /* our unique id in system */
+	const char *secret;     /* our secret key */
+	unsigned want_keys;     /* which services we want */
+};
+
+extern struct ceph_auth_client *ceph_auth_init(const char *name,
+					       const char *secret);
+extern void ceph_auth_destroy(struct ceph_auth_client *ac);
+
+extern void ceph_auth_reset(struct ceph_auth_client *ac);
+
+extern int ceph_auth_build_hello(struct ceph_auth_client *ac,
+				 void *buf, size_t len);
+extern int ceph_handle_auth_reply(struct ceph_auth_client *ac,
+				  void *buf, size_t len,
+				  void *reply_buf, size_t reply_len);
+extern int ceph_entity_name_encode(const char *name, void **p, void *end);
+
+extern int ceph_build_auth(struct ceph_auth_client *ac,
+		    void *msg_buf, size_t msg_len);
+
+extern int ceph_auth_is_authenticated(struct ceph_auth_client *ac);
+
+#endif
diff --git a/fs/ceph/auth_none.c b/fs/ceph/auth_none.c
new file mode 100644
index 000000000000..ad1dc21286c7
--- /dev/null
+++ b/fs/ceph/auth_none.c
@@ -0,0 +1,131 @@
+
+#include "ceph_debug.h"
+
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+
+#include "auth_none.h"
+#include "auth.h"
+#include "decode.h"
+
+static void reset(struct ceph_auth_client *ac)
+{
+	struct ceph_auth_none_info *xi = ac->private;
+
+	xi->starting = true;
+	xi->built_authorizer = false;
+}
+
+static void destroy(struct ceph_auth_client *ac)
+{
+	kfree(ac->private);
+	ac->private = NULL;
+}
+
+static int is_authenticated(struct ceph_auth_client *ac)
+{
+	struct ceph_auth_none_info *xi = ac->private;
+
+	return !xi->starting;
+}
+
+static int should_authenticate(struct ceph_auth_client *ac)
+{
+	struct ceph_auth_none_info *xi = ac->private;
+
+	return xi->starting;
+}
+
+/*
+ * the generic auth code decode the global_id, and we carry no actual
+ * authenticate state, so nothing happens here.
+ */
+static int handle_reply(struct ceph_auth_client *ac, int result,
+			void *buf, void *end)
+{
+	struct ceph_auth_none_info *xi = ac->private;
+
+	xi->starting = false;
+	return result;
+}
+
+/*
+ * build an 'authorizer' with our entity_name and global_id.  we can
+ * reuse a single static copy since it is identical for all services
+ * we connect to.
+ */
+static int ceph_auth_none_create_authorizer(
+	struct ceph_auth_client *ac, int peer_type,
+	struct ceph_authorizer **a,
+	void **buf, size_t *len,
+	void **reply_buf, size_t *reply_len)
+{
+	struct ceph_auth_none_info *ai = ac->private;
+	struct ceph_none_authorizer *au = &ai->au;
+	void *p, *end;
+	int ret;
+
+	if (!ai->built_authorizer) {
+		p = au->buf;
+		end = p + sizeof(au->buf);
+		ceph_encode_8(&p, 1);
+		ret = ceph_entity_name_encode(ac->name, &p, end - 8);
+		if (ret < 0)
+			goto bad;
+		ceph_decode_need(&p, end, sizeof(u64), bad2);
+		ceph_encode_64(&p, ac->global_id);
+		au->buf_len = p - (void *)au->buf;
+		ai->built_authorizer = true;
+		dout("built authorizer len %d\n", au->buf_len);
+	}
+
+	*a = (struct ceph_authorizer *)au;
+	*buf = au->buf;
+	*len = au->buf_len;
+	*reply_buf = au->reply_buf;
+	*reply_len = sizeof(au->reply_buf);
+	return 0;
+
+bad2:
+	ret = -ERANGE;
+bad:
+	return ret;
+}
+
+static void ceph_auth_none_destroy_authorizer(struct ceph_auth_client *ac,
+				      struct ceph_authorizer *a)
+{
+	/* nothing to do */
+}
+
+static const struct ceph_auth_client_ops ceph_auth_none_ops = {
+	.name = "none",
+	.reset = reset,
+	.destroy = destroy,
+	.is_authenticated = is_authenticated,
+	.should_authenticate = should_authenticate,
+	.handle_reply = handle_reply,
+	.create_authorizer = ceph_auth_none_create_authorizer,
+	.destroy_authorizer = ceph_auth_none_destroy_authorizer,
+};
+
+int ceph_auth_none_init(struct ceph_auth_client *ac)
+{
+	struct ceph_auth_none_info *xi;
+
+	dout("ceph_auth_none_init %p\n", ac);
+	xi = kzalloc(sizeof(*xi), GFP_NOFS);
+	if (!xi)
+		return -ENOMEM;
+
+	xi->starting = true;
+	xi->built_authorizer = false;
+
+	ac->protocol = CEPH_AUTH_NONE;
+	ac->private = xi;
+	ac->ops = &ceph_auth_none_ops;
+	return 0;
+}
+
diff --git a/fs/ceph/auth_none.h b/fs/ceph/auth_none.h
new file mode 100644
index 000000000000..8164df1a08be
--- /dev/null
+++ b/fs/ceph/auth_none.h
@@ -0,0 +1,30 @@
+#ifndef _FS_CEPH_AUTH_NONE_H
+#define _FS_CEPH_AUTH_NONE_H
+
+#include <linux/slab.h>
+
+#include "auth.h"
+
+/*
+ * null security mode.
+ *
+ * we use a single static authorizer that simply encodes our entity name
+ * and global id.
+ */
+
+struct ceph_none_authorizer {
+	char buf[128];
+	int buf_len;
+	char reply_buf[0];
+};
+
+struct ceph_auth_none_info {
+	bool starting;
+	bool built_authorizer;
+	struct ceph_none_authorizer au;   /* we only need one; it's static */
+};
+
+extern int ceph_auth_none_init(struct ceph_auth_client *ac);
+
+#endif
+
diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c
new file mode 100644
index 000000000000..6d44053ecff1
--- /dev/null
+++ b/fs/ceph/auth_x.c
@@ -0,0 +1,684 @@
+
+#include "ceph_debug.h"
+
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+
+#include "auth_x.h"
+#include "auth_x_protocol.h"
+#include "crypto.h"
+#include "auth.h"
+#include "decode.h"
+
+#define TEMP_TICKET_BUF_LEN	256
+
+static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed);
+
+static int ceph_x_is_authenticated(struct ceph_auth_client *ac)
+{
+	struct ceph_x_info *xi = ac->private;
+	int need;
+
+	ceph_x_validate_tickets(ac, &need);
+	dout("ceph_x_is_authenticated want=%d need=%d have=%d\n",
+	     ac->want_keys, need, xi->have_keys);
+	return (ac->want_keys & xi->have_keys) == ac->want_keys;
+}
+
+static int ceph_x_should_authenticate(struct ceph_auth_client *ac)
+{
+	struct ceph_x_info *xi = ac->private;
+	int need;
+
+	ceph_x_validate_tickets(ac, &need);
+	dout("ceph_x_should_authenticate want=%d need=%d have=%d\n",
+	     ac->want_keys, need, xi->have_keys);
+	return need != 0;
+}
+
+static int ceph_x_encrypt_buflen(int ilen)
+{
+	return sizeof(struct ceph_x_encrypt_header) + ilen + 16 +
+		sizeof(u32);
+}
+
+static int ceph_x_encrypt(struct ceph_crypto_key *secret,
+			  void *ibuf, int ilen, void *obuf, size_t olen)
+{
+	struct ceph_x_encrypt_header head = {
+		.struct_v = 1,
+		.magic = cpu_to_le64(CEPHX_ENC_MAGIC)
+	};
+	size_t len = olen - sizeof(u32);
+	int ret;
+
+	ret = ceph_encrypt2(secret, obuf + sizeof(u32), &len,
+			    &head, sizeof(head), ibuf, ilen);
+	if (ret)
+		return ret;
+	ceph_encode_32(&obuf, len);
+	return len + sizeof(u32);
+}
+
+static int ceph_x_decrypt(struct ceph_crypto_key *secret,
+			  void **p, void *end, void *obuf, size_t olen)
+{
+	struct ceph_x_encrypt_header head;
+	size_t head_len = sizeof(head);
+	int len, ret;
+
+	len = ceph_decode_32(p);
+	if (*p + len > end)
+		return -EINVAL;
+
+	dout("ceph_x_decrypt len %d\n", len);
+	ret = ceph_decrypt2(secret, &head, &head_len, obuf, &olen,
+			    *p, len);
+	if (ret)
+		return ret;
+	if (head.struct_v != 1 || le64_to_cpu(head.magic) != CEPHX_ENC_MAGIC)
+		return -EPERM;
+	*p += len;
+	return olen;
+}
+
+/*
+ * get existing (or insert new) ticket handler
+ */
+struct ceph_x_ticket_handler *get_ticket_handler(struct ceph_auth_client *ac,
+						 int service)
+{
+	struct ceph_x_ticket_handler *th;
+	struct ceph_x_info *xi = ac->private;
+	struct rb_node *parent = NULL, **p = &xi->ticket_handlers.rb_node;
+
+	while (*p) {
+		parent = *p;
+		th = rb_entry(parent, struct ceph_x_ticket_handler, node);
+		if (service < th->service)
+			p = &(*p)->rb_left;
+		else if (service > th->service)
+			p = &(*p)->rb_right;
+		else
+			return th;
+	}
+
+	/* add it */
+	th = kzalloc(sizeof(*th), GFP_NOFS);
+	if (!th)
+		return ERR_PTR(-ENOMEM);
+	th->service = service;
+	rb_link_node(&th->node, parent, p);
+	rb_insert_color(&th->node, &xi->ticket_handlers);
+	return th;
+}
+
+static void remove_ticket_handler(struct ceph_auth_client *ac,
+				  struct ceph_x_ticket_handler *th)
+{
+	struct ceph_x_info *xi = ac->private;
+
+	dout("remove_ticket_handler %p %d\n", th, th->service);
+	rb_erase(&th->node, &xi->ticket_handlers);
+	ceph_crypto_key_destroy(&th->session_key);
+	if (th->ticket_blob)
+		ceph_buffer_put(th->ticket_blob);
+	kfree(th);
+}
+
+static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
+				    struct ceph_crypto_key *secret,
+				    void *buf, void *end)
+{
+	struct ceph_x_info *xi = ac->private;
+	int num;
+	void *p = buf;
+	int ret;
+	char *dbuf;
+	char *ticket_buf;
+	u8 reply_struct_v;
+
+	dbuf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS);
+	if (!dbuf)
+		return -ENOMEM;
+
+	ret = -ENOMEM;
+	ticket_buf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS);
+	if (!ticket_buf)
+		goto out_dbuf;
+
+	ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
+	reply_struct_v = ceph_decode_8(&p);
+	if (reply_struct_v != 1)
+		goto bad;
+	num = ceph_decode_32(&p);
+	dout("%d tickets\n", num);
+	while (num--) {
+		int type;
+		u8 tkt_struct_v, blob_struct_v;
+		struct ceph_x_ticket_handler *th;
+		void *dp, *dend;
+		int dlen;
+		char is_enc;
+		struct timespec validity;
+		struct ceph_crypto_key old_key;
+		void *tp, *tpend;
+		struct ceph_timespec new_validity;
+		struct ceph_crypto_key new_session_key;
+		struct ceph_buffer *new_ticket_blob;
+		unsigned long new_expires, new_renew_after;
+		u64 new_secret_id;
+
+		ceph_decode_need(&p, end, sizeof(u32) + 1, bad);
+
+		type = ceph_decode_32(&p);
+		dout(" ticket type %d %s\n", type, ceph_entity_type_name(type));
+
+		tkt_struct_v = ceph_decode_8(&p);
+		if (tkt_struct_v != 1)
+			goto bad;
+
+		th = get_ticket_handler(ac, type);
+		if (IS_ERR(th)) {
+			ret = PTR_ERR(th);
+			goto out;
+		}
+
+		/* blob for me */
+		dlen = ceph_x_decrypt(secret, &p, end, dbuf,
+				      TEMP_TICKET_BUF_LEN);
+		if (dlen <= 0) {
+			ret = dlen;
+			goto out;
+		}
+		dout(" decrypted %d bytes\n", dlen);
+		dend = dbuf + dlen;
+		dp = dbuf;
+
+		tkt_struct_v = ceph_decode_8(&dp);
+		if (tkt_struct_v != 1)
+			goto bad;
+
+		memcpy(&old_key, &th->session_key, sizeof(old_key));
+		ret = ceph_crypto_key_decode(&new_session_key, &dp, dend);
+		if (ret)
+			goto out;
+
+		ceph_decode_copy(&dp, &new_validity, sizeof(new_validity));
+		ceph_decode_timespec(&validity, &new_validity);
+		new_expires = get_seconds() + validity.tv_sec;
+		new_renew_after = new_expires - (validity.tv_sec / 4);
+		dout(" expires=%lu renew_after=%lu\n", new_expires,
+		     new_renew_after);
+
+		/* ticket blob for service */
+		ceph_decode_8_safe(&p, end, is_enc, bad);
+		tp = ticket_buf;
+		if (is_enc) {
+			/* encrypted */
+			dout(" encrypted ticket\n");
+			dlen = ceph_x_decrypt(&old_key, &p, end, ticket_buf,
+					      TEMP_TICKET_BUF_LEN);
+			if (dlen < 0) {
+				ret = dlen;
+				goto out;
+			}
+			dlen = ceph_decode_32(&tp);
+		} else {
+			/* unencrypted */
+			ceph_decode_32_safe(&p, end, dlen, bad);
+			ceph_decode_need(&p, end, dlen, bad);
+			ceph_decode_copy(&p, ticket_buf, dlen);
+		}
+		tpend = tp + dlen;
+		dout(" ticket blob is %d bytes\n", dlen);
+		ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad);
+		blob_struct_v = ceph_decode_8(&tp);
+		new_secret_id = ceph_decode_64(&tp);
+		ret = ceph_decode_buffer(&new_ticket_blob, &tp, tpend);
+		if (ret)
+			goto out;
+
+		/* all is well, update our ticket */
+		ceph_crypto_key_destroy(&th->session_key);
+		if (th->ticket_blob)
+			ceph_buffer_put(th->ticket_blob);
+		th->session_key = new_session_key;
+		th->ticket_blob = new_ticket_blob;
+		th->validity = new_validity;
+		th->secret_id = new_secret_id;
+		th->expires = new_expires;
+		th->renew_after = new_renew_after;
+		dout(" got ticket service %d (%s) secret_id %lld len %d\n",
+		     type, ceph_entity_type_name(type), th->secret_id,
+		     (int)th->ticket_blob->vec.iov_len);
+		xi->have_keys |= th->service;
+	}
+
+	ret = 0;
+out:
+	kfree(ticket_buf);
+out_dbuf:
+	kfree(dbuf);
+	return ret;
+
+bad:
+	ret = -EINVAL;
+	goto out;
+}
+
+static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
+				   struct ceph_x_ticket_handler *th,
+				   struct ceph_x_authorizer *au)
+{
+	int maxlen;
+	struct ceph_x_authorize_a *msg_a;
+	struct ceph_x_authorize_b msg_b;
+	void *p, *end;
+	int ret;
+	int ticket_blob_len =
+		(th->ticket_blob ? th->ticket_blob->vec.iov_len : 0);
+
+	dout("build_authorizer for %s %p\n",
+	     ceph_entity_type_name(th->service), au);
+
+	maxlen = sizeof(*msg_a) + sizeof(msg_b) +
+		ceph_x_encrypt_buflen(ticket_blob_len);
+	dout("  need len %d\n", maxlen);
+	if (au->buf && au->buf->alloc_len < maxlen) {
+		ceph_buffer_put(au->buf);
+		au->buf = NULL;
+	}
+	if (!au->buf) {
+		au->buf = ceph_buffer_new(maxlen, GFP_NOFS);
+		if (!au->buf)
+			return -ENOMEM;
+	}
+	au->service = th->service;
+
+	msg_a = au->buf->vec.iov_base;
+	msg_a->struct_v = 1;
+	msg_a->global_id = cpu_to_le64(ac->global_id);
+	msg_a->service_id = cpu_to_le32(th->service);
+	msg_a->ticket_blob.struct_v = 1;
+	msg_a->ticket_blob.secret_id = cpu_to_le64(th->secret_id);
+	msg_a->ticket_blob.blob_len = cpu_to_le32(ticket_blob_len);
+	if (ticket_blob_len) {
+		memcpy(msg_a->ticket_blob.blob, th->ticket_blob->vec.iov_base,
+		       th->ticket_blob->vec.iov_len);
+	}
+	dout(" th %p secret_id %lld %lld\n", th, th->secret_id,
+	     le64_to_cpu(msg_a->ticket_blob.secret_id));
+
+	p = msg_a + 1;
+	p += ticket_blob_len;
+	end = au->buf->vec.iov_base + au->buf->vec.iov_len;
+
+	get_random_bytes(&au->nonce, sizeof(au->nonce));
+	msg_b.struct_v = 1;
+	msg_b.nonce = cpu_to_le64(au->nonce);
+	ret = ceph_x_encrypt(&th->session_key, &msg_b, sizeof(msg_b),
+			     p, end - p);
+	if (ret < 0)
+		goto out_buf;
+	p += ret;
+	au->buf->vec.iov_len = p - au->buf->vec.iov_base;
+	dout(" built authorizer nonce %llx len %d\n", au->nonce,
+	     (int)au->buf->vec.iov_len);
+	BUG_ON(au->buf->vec.iov_len > maxlen);
+	return 0;
+
+out_buf:
+	ceph_buffer_put(au->buf);
+	au->buf = NULL;
+	return ret;
+}
+
+static int ceph_x_encode_ticket(struct ceph_x_ticket_handler *th,
+				void **p, void *end)
+{
+	ceph_decode_need(p, end, 1 + sizeof(u64), bad);
+	ceph_encode_8(p, 1);
+	ceph_encode_64(p, th->secret_id);
+	if (th->ticket_blob) {
+		const char *buf = th->ticket_blob->vec.iov_base;
+		u32 len = th->ticket_blob->vec.iov_len;
+
+		ceph_encode_32_safe(p, end, len, bad);
+		ceph_encode_copy_safe(p, end, buf, len, bad);
+	} else {
+		ceph_encode_32_safe(p, end, 0, bad);
+	}
+
+	return 0;
+bad:
+	return -ERANGE;
+}
+
+static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed)
+{
+	int want = ac->want_keys;
+	struct ceph_x_info *xi = ac->private;
+	int service;
+
+	*pneed = ac->want_keys & ~(xi->have_keys);
+
+	for (service = 1; service <= want; service <<= 1) {
+		struct ceph_x_ticket_handler *th;
+
+		if (!(ac->want_keys & service))
+			continue;
+
+		if (*pneed & service)
+			continue;
+
+		th = get_ticket_handler(ac, service);
+
+		if (!th) {
+			*pneed |= service;
+			continue;
+		}
+
+		if (get_seconds() >= th->renew_after)
+			*pneed |= service;
+		if (get_seconds() >= th->expires)
+			xi->have_keys &= ~service;
+	}
+}
+
+
+static int ceph_x_build_request(struct ceph_auth_client *ac,
+				void *buf, void *end)
+{
+	struct ceph_x_info *xi = ac->private;
+	int need;
+	struct ceph_x_request_header *head = buf;
+	int ret;
+	struct ceph_x_ticket_handler *th =
+		get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
+
+	ceph_x_validate_tickets(ac, &need);
+
+	dout("build_request want %x have %x need %x\n",
+	     ac->want_keys, xi->have_keys, need);
+
+	if (need & CEPH_ENTITY_TYPE_AUTH) {
+		struct ceph_x_authenticate *auth = (void *)(head + 1);
+		void *p = auth + 1;
+		struct ceph_x_challenge_blob tmp;
+		char tmp_enc[40];
+		u64 *u;
+
+		if (p > end)
+			return -ERANGE;
+
+		dout(" get_auth_session_key\n");
+		head->op = cpu_to_le16(CEPHX_GET_AUTH_SESSION_KEY);
+
+		/* encrypt and hash */
+		get_random_bytes(&auth->client_challenge, sizeof(u64));
+		tmp.client_challenge = auth->client_challenge;
+		tmp.server_challenge = cpu_to_le64(xi->server_challenge);
+		ret = ceph_x_encrypt(&xi->secret, &tmp, sizeof(tmp),
+				     tmp_enc, sizeof(tmp_enc));
+		if (ret < 0)
+			return ret;
+
+		auth->struct_v = 1;
+		auth->key = 0;
+		for (u = (u64 *)tmp_enc; u + 1 <= (u64 *)(tmp_enc + ret); u++)
+			auth->key ^= *u;
+		dout(" server_challenge %llx client_challenge %llx key %llx\n",
+		     xi->server_challenge, le64_to_cpu(auth->client_challenge),
+		     le64_to_cpu(auth->key));
+
+		/* now encode the old ticket if exists */
+		ret = ceph_x_encode_ticket(th, &p, end);
+		if (ret < 0)
+			return ret;
+
+		return p - buf;
+	}
+
+	if (need) {
+		void *p = head + 1;
+		struct ceph_x_service_ticket_request *req;
+
+		if (p > end)
+			return -ERANGE;
+		head->op = cpu_to_le16(CEPHX_GET_PRINCIPAL_SESSION_KEY);
+
+		BUG_ON(!th);
+		ret = ceph_x_build_authorizer(ac, th, &xi->auth_authorizer);
+		if (ret)
+			return ret;
+		ceph_encode_copy(&p, xi->auth_authorizer.buf->vec.iov_base,
+				 xi->auth_authorizer.buf->vec.iov_len);
+
+		req = p;
+		req->keys = cpu_to_le32(need);
+		p += sizeof(*req);
+		return p - buf;
+	}
+
+	return 0;
+}
+
+static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result,
+			       void *buf, void *end)
+{
+	struct ceph_x_info *xi = ac->private;
+	struct ceph_x_reply_header *head = buf;
+	struct ceph_x_ticket_handler *th;
+	int len = end - buf;
+	int op;
+	int ret;
+
+	if (result)
+		return result;  /* XXX hmm? */
+
+	if (xi->starting) {
+		/* it's a hello */
+		struct ceph_x_server_challenge *sc = buf;
+
+		if (len != sizeof(*sc))
+			return -EINVAL;
+		xi->server_challenge = le64_to_cpu(sc->server_challenge);
+		dout("handle_reply got server challenge %llx\n",
+		     xi->server_challenge);
+		xi->starting = false;
+		xi->have_keys &= ~CEPH_ENTITY_TYPE_AUTH;
+		return -EAGAIN;
+	}
+
+	op = le16_to_cpu(head->op);
+	result = le32_to_cpu(head->result);
+	dout("handle_reply op %d result %d\n", op, result);
+	switch (op) {
+	case CEPHX_GET_AUTH_SESSION_KEY:
+		/* verify auth key */
+		ret = ceph_x_proc_ticket_reply(ac, &xi->secret,
+					       buf + sizeof(*head), end);
+		break;
+
+	case CEPHX_GET_PRINCIPAL_SESSION_KEY:
+		th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
+		BUG_ON(!th);
+		ret = ceph_x_proc_ticket_reply(ac, &th->session_key,
+					       buf + sizeof(*head), end);
+		break;
+
+	default:
+		return -EINVAL;
+	}
+	if (ret)
+		return ret;
+	if (ac->want_keys == xi->have_keys)
+		return 0;
+	return -EAGAIN;
+}
+
+static int ceph_x_create_authorizer(
+	struct ceph_auth_client *ac, int peer_type,
+	struct ceph_authorizer **a,
+	void **buf, size_t *len,
+	void **reply_buf, size_t *reply_len)
+{
+	struct ceph_x_authorizer *au;
+	struct ceph_x_ticket_handler *th;
+	int ret;
+
+	th = get_ticket_handler(ac, peer_type);
+	if (IS_ERR(th))
+		return PTR_ERR(th);
+
+	au = kzalloc(sizeof(*au), GFP_NOFS);
+	if (!au)
+		return -ENOMEM;
+
+	ret = ceph_x_build_authorizer(ac, th, au);
+	if (ret) {
+		kfree(au);
+		return ret;
+	}
+
+	*a = (struct ceph_authorizer *)au;
+	*buf = au->buf->vec.iov_base;
+	*len = au->buf->vec.iov_len;
+	*reply_buf = au->reply_buf;
+	*reply_len = sizeof(au->reply_buf);
+	return 0;
+}
+
+static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac,
+					  struct ceph_authorizer *a, size_t len)
+{
+	struct ceph_x_authorizer *au = (void *)a;
+	struct ceph_x_ticket_handler *th;
+	int ret = 0;
+	struct ceph_x_authorize_reply reply;
+	void *p = au->reply_buf;
+	void *end = p + sizeof(au->reply_buf);
+
+	th = get_ticket_handler(ac, au->service);
+	if (!th)
+		return -EIO;  /* hrm! */
+	ret = ceph_x_decrypt(&th->session_key, &p, end, &reply, sizeof(reply));
+	if (ret < 0)
+		return ret;
+	if (ret != sizeof(reply))
+		return -EPERM;
+
+	if (au->nonce + 1 != le64_to_cpu(reply.nonce_plus_one))
+		ret = -EPERM;
+	else
+		ret = 0;
+	dout("verify_authorizer_reply nonce %llx got %llx ret %d\n",
+	     au->nonce, le64_to_cpu(reply.nonce_plus_one), ret);
+	return ret;
+}
+
+static void ceph_x_destroy_authorizer(struct ceph_auth_client *ac,
+				      struct ceph_authorizer *a)
+{
+	struct ceph_x_authorizer *au = (void *)a;
+
+	ceph_buffer_put(au->buf);
+	kfree(au);
+}
+
+
+static void ceph_x_reset(struct ceph_auth_client *ac)
+{
+	struct ceph_x_info *xi = ac->private;
+
+	dout("reset\n");
+	xi->starting = true;
+	xi->server_challenge = 0;
+}
+
+static void ceph_x_destroy(struct ceph_auth_client *ac)
+{
+	struct ceph_x_info *xi = ac->private;
+	struct rb_node *p;
+
+	dout("ceph_x_destroy %p\n", ac);
+	ceph_crypto_key_destroy(&xi->secret);
+
+	while ((p = rb_first(&xi->ticket_handlers)) != NULL) {
+		struct ceph_x_ticket_handler *th =
+			rb_entry(p, struct ceph_x_ticket_handler, node);
+		remove_ticket_handler(ac, th);
+	}
+
+	if (xi->auth_authorizer.buf)
+		ceph_buffer_put(xi->auth_authorizer.buf);
+
+	kfree(ac->private);
+	ac->private = NULL;
+}
+
+static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
+				   int peer_type)
+{
+	struct ceph_x_ticket_handler *th;
+
+	th = get_ticket_handler(ac, peer_type);
+	if (th && !IS_ERR(th))
+		remove_ticket_handler(ac, th);
+}
+
+
+static const struct ceph_auth_client_ops ceph_x_ops = {
+	.name = "x",
+	.is_authenticated = ceph_x_is_authenticated,
+	.should_authenticate = ceph_x_should_authenticate,
+	.build_request = ceph_x_build_request,
+	.handle_reply = ceph_x_handle_reply,
+	.create_authorizer = ceph_x_create_authorizer,
+	.verify_authorizer_reply = ceph_x_verify_authorizer_reply,
+	.destroy_authorizer = ceph_x_destroy_authorizer,
+	.invalidate_authorizer = ceph_x_invalidate_authorizer,
+	.reset =  ceph_x_reset,
+	.destroy = ceph_x_destroy,
+};
+
+
+int ceph_x_init(struct ceph_auth_client *ac)
+{
+	struct ceph_x_info *xi;
+	int ret;
+
+	dout("ceph_x_init %p\n", ac);
+	ret = -ENOMEM;
+	xi = kzalloc(sizeof(*xi), GFP_NOFS);
+	if (!xi)
+		goto out;
+
+	ret = -EINVAL;
+	if (!ac->secret) {
+		pr_err("no secret set (for auth_x protocol)\n");
+		goto out_nomem;
+	}
+
+	ret = ceph_crypto_key_unarmor(&xi->secret, ac->secret);
+	if (ret)
+		goto out_nomem;
+
+	xi->starting = true;
+	xi->ticket_handlers = RB_ROOT;
+
+	ac->protocol = CEPH_AUTH_CEPHX;
+	ac->private = xi;
+	ac->ops = &ceph_x_ops;
+	return 0;
+
+out_nomem:
+	kfree(xi);
+out:
+	return ret;
+}
+
+
diff --git a/fs/ceph/auth_x.h b/fs/ceph/auth_x.h
new file mode 100644
index 000000000000..ff6f8180e681
--- /dev/null
+++ b/fs/ceph/auth_x.h
@@ -0,0 +1,49 @@
+#ifndef _FS_CEPH_AUTH_X_H
+#define _FS_CEPH_AUTH_X_H
+
+#include <linux/rbtree.h>
+
+#include "crypto.h"
+#include "auth.h"
+#include "auth_x_protocol.h"
+
+/*
+ * Handle ticket for a single service.
+ */
+struct ceph_x_ticket_handler {
+	struct rb_node node;
+	unsigned service;
+
+	struct ceph_crypto_key session_key;
+	struct ceph_timespec validity;
+
+	u64 secret_id;
+	struct ceph_buffer *ticket_blob;
+
+	unsigned long renew_after, expires;
+};
+
+
+struct ceph_x_authorizer {
+	struct ceph_buffer *buf;
+	unsigned service;
+	u64 nonce;
+	char reply_buf[128];  /* big enough for encrypted blob */
+};
+
+struct ceph_x_info {
+	struct ceph_crypto_key secret;
+
+	bool starting;
+	u64 server_challenge;
+
+	unsigned have_keys;
+	struct rb_root ticket_handlers;
+
+	struct ceph_x_authorizer auth_authorizer;
+};
+
+extern int ceph_x_init(struct ceph_auth_client *ac);
+
+#endif
+
diff --git a/fs/ceph/auth_x_protocol.h b/fs/ceph/auth_x_protocol.h
new file mode 100644
index 000000000000..671d30576c4f
--- /dev/null
+++ b/fs/ceph/auth_x_protocol.h
@@ -0,0 +1,90 @@
+#ifndef __FS_CEPH_AUTH_X_PROTOCOL
+#define __FS_CEPH_AUTH_X_PROTOCOL
+
+#define CEPHX_GET_AUTH_SESSION_KEY      0x0100
+#define CEPHX_GET_PRINCIPAL_SESSION_KEY 0x0200
+#define CEPHX_GET_ROTATING_KEY          0x0400
+
+/* common bits */
+struct ceph_x_ticket_blob {
+	__u8 struct_v;
+	__le64 secret_id;
+	__le32 blob_len;
+	char blob[];
+} __attribute__ ((packed));
+
+
+/* common request/reply headers */
+struct ceph_x_request_header {
+	__le16 op;
+} __attribute__ ((packed));
+
+struct ceph_x_reply_header {
+	__le16 op;
+	__le32 result;
+} __attribute__ ((packed));
+
+
+/* authenticate handshake */
+
+/* initial hello (no reply header) */
+struct ceph_x_server_challenge {
+	__u8 struct_v;
+	__le64 server_challenge;
+} __attribute__ ((packed));
+
+struct ceph_x_authenticate {
+	__u8 struct_v;
+	__le64 client_challenge;
+	__le64 key;
+	/* ticket blob */
+} __attribute__ ((packed));
+
+struct ceph_x_service_ticket_request {
+	__u8 struct_v;
+	__le32 keys;
+} __attribute__ ((packed));
+
+struct ceph_x_challenge_blob {
+	__le64 server_challenge;
+	__le64 client_challenge;
+} __attribute__ ((packed));
+
+
+
+/* authorize handshake */
+
+/*
+ * The authorizer consists of two pieces:
+ *  a - service id, ticket blob
+ *  b - encrypted with session key
+ */
+struct ceph_x_authorize_a {
+	__u8 struct_v;
+	__le64 global_id;
+	__le32 service_id;
+	struct ceph_x_ticket_blob ticket_blob;
+} __attribute__ ((packed));
+
+struct ceph_x_authorize_b {
+	__u8 struct_v;
+	__le64 nonce;
+} __attribute__ ((packed));
+
+struct ceph_x_authorize_reply {
+	__u8 struct_v;
+	__le64 nonce_plus_one;
+} __attribute__ ((packed));
+
+
+/*
+ * encyption bundle
+ */
+#define CEPHX_ENC_MAGIC 0xff009cad8826aa55ull
+
+struct ceph_x_encrypt_header {
+	__u8 struct_v;
+	__le64 magic;
+} __attribute__ ((packed));
+
+#endif
diff --git a/fs/ceph/buffer.c b/fs/ceph/buffer.c
new file mode 100644
index 000000000000..c67535d70aa6
--- /dev/null
+++ b/fs/ceph/buffer.c
@@ -0,0 +1,81 @@
+
+#include "ceph_debug.h"
+
+#include <linux/slab.h>
+
+#include "buffer.h"
+#include "decode.h"
+
+struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp)
+{
+	struct ceph_buffer *b;
+
+	b = kmalloc(sizeof(*b), gfp);
+	if (!b)
+		return NULL;
+
+	b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN);
+	if (b->vec.iov_base) {
+		b->is_vmalloc = false;
+	} else {
+		b->vec.iov_base = __vmalloc(len, gfp, PAGE_KERNEL);
+		if (!b->vec.iov_base) {
+			kfree(b);
+			return NULL;
+		}
+		b->is_vmalloc = true;
+	}
+
+	kref_init(&b->kref);
+	b->alloc_len = len;
+	b->vec.iov_len = len;
+	dout("buffer_new %p\n", b);
+	return b;
+}
+
+void ceph_buffer_release(struct kref *kref)
+{
+	struct ceph_buffer *b = container_of(kref, struct ceph_buffer, kref);
+
+	dout("buffer_release %p\n", b);
+	if (b->vec.iov_base) {
+		if (b->is_vmalloc)
+			vfree(b->vec.iov_base);
+		else
+			kfree(b->vec.iov_base);
+	}
+	kfree(b);
+}
+
+int ceph_buffer_alloc(struct ceph_buffer *b, int len, gfp_t gfp)
+{
+	b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN);
+	if (b->vec.iov_base) {
+		b->is_vmalloc = false;
+	} else {
+		b->vec.iov_base = __vmalloc(len, gfp, PAGE_KERNEL);
+		b->is_vmalloc = true;
+	}
+	if (!b->vec.iov_base)
+		return -ENOMEM;
+	b->alloc_len = len;
+	b->vec.iov_len = len;
+	return 0;
+}
+
+int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end)
+{
+	size_t len;
+
+	ceph_decode_need(p, end, sizeof(u32), bad);
+	len = ceph_decode_32(p);
+	dout("decode_buffer len %d\n", (int)len);
+	ceph_decode_need(p, end, len, bad);
+	*b = ceph_buffer_new(len, GFP_NOFS);
+	if (!*b)
+		return -ENOMEM;
+	ceph_decode_copy(p, (*b)->vec.iov_base, len);
+	return 0;
+bad:
+	return -EINVAL;
+}
diff --git a/fs/ceph/buffer.h b/fs/ceph/buffer.h
new file mode 100644
index 000000000000..58d19014068f
--- /dev/null
+++ b/fs/ceph/buffer.h
@@ -0,0 +1,39 @@
+#ifndef __FS_CEPH_BUFFER_H
+#define __FS_CEPH_BUFFER_H
+
+#include <linux/kref.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <linux/types.h>
+#include <linux/uio.h>
+
+/*
+ * a simple reference counted buffer.
+ *
+ * use kmalloc for small sizes (<= one page), vmalloc for larger
+ * sizes.
+ */
+struct ceph_buffer {
+	struct kref kref;
+	struct kvec vec;
+	size_t alloc_len;
+	bool is_vmalloc;
+};
+
+extern struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp);
+extern void ceph_buffer_release(struct kref *kref);
+
+static inline struct ceph_buffer *ceph_buffer_get(struct ceph_buffer *b)
+{
+	kref_get(&b->kref);
+	return b;
+}
+
+static inline void ceph_buffer_put(struct ceph_buffer *b)
+{
+	kref_put(&b->kref, ceph_buffer_release);
+}
+
+extern int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end);
+
+#endif
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
new file mode 100644
index 000000000000..b81be9a56487
--- /dev/null
+++ b/fs/ceph/caps.c
@@ -0,0 +1,2991 @@
+#include "ceph_debug.h"
+
+#include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/wait.h>
+#include <linux/writeback.h>
+
+#include "super.h"
+#include "decode.h"
+#include "messenger.h"
+
+/*
+ * Capability management
+ *
+ * The Ceph metadata servers control client access to inode metadata
+ * and file data by issuing capabilities, granting clients permission
+ * to read and/or write both inode field and file data to OSDs
+ * (storage nodes).  Each capability consists of a set of bits
+ * indicating which operations are allowed.
+ *
+ * If the client holds a *_SHARED cap, the client has a coherent value
+ * that can be safely read from the cached inode.
+ *
+ * In the case of a *_EXCL (exclusive) or FILE_WR capabilities, the
+ * client is allowed to change inode attributes (e.g., file size,
+ * mtime), note its dirty state in the ceph_cap, and asynchronously
+ * flush that metadata change to the MDS.
+ *
+ * In the event of a conflicting operation (perhaps by another
+ * client), the MDS will revoke the conflicting client capabilities.
+ *
+ * In order for a client to cache an inode, it must hold a capability
+ * with at least one MDS server.  When inodes are released, release
+ * notifications are batched and periodically sent en masse to the MDS
+ * cluster to release server state.
+ */
+
+
+/*
+ * Generate readable cap strings for debugging output.
+ */
+#define MAX_CAP_STR 20
+static char cap_str[MAX_CAP_STR][40];
+static DEFINE_SPINLOCK(cap_str_lock);
+static int last_cap_str;
+
+static char *gcap_string(char *s, int c)
+{
+	if (c & CEPH_CAP_GSHARED)
+		*s++ = 's';
+	if (c & CEPH_CAP_GEXCL)
+		*s++ = 'x';
+	if (c & CEPH_CAP_GCACHE)
+		*s++ = 'c';
+	if (c & CEPH_CAP_GRD)
+		*s++ = 'r';
+	if (c & CEPH_CAP_GWR)
+		*s++ = 'w';
+	if (c & CEPH_CAP_GBUFFER)
+		*s++ = 'b';
+	if (c & CEPH_CAP_GLAZYIO)
+		*s++ = 'l';
+	return s;
+}
+
+const char *ceph_cap_string(int caps)
+{
+	int i;
+	char *s;
+	int c;
+
+	spin_lock(&cap_str_lock);
+	i = last_cap_str++;
+	if (last_cap_str == MAX_CAP_STR)
+		last_cap_str = 0;
+	spin_unlock(&cap_str_lock);
+
+	s = cap_str[i];
+
+	if (caps & CEPH_CAP_PIN)
+		*s++ = 'p';
+
+	c = (caps >> CEPH_CAP_SAUTH) & 3;
+	if (c) {
+		*s++ = 'A';
+		s = gcap_string(s, c);
+	}
+
+	c = (caps >> CEPH_CAP_SLINK) & 3;
+	if (c) {
+		*s++ = 'L';
+		s = gcap_string(s, c);
+	}
+
+	c = (caps >> CEPH_CAP_SXATTR) & 3;
+	if (c) {
+		*s++ = 'X';
+		s = gcap_string(s, c);
+	}
+
+	c = caps >> CEPH_CAP_SFILE;
+	if (c) {
+		*s++ = 'F';
+		s = gcap_string(s, c);
+	}
+
+	if (s == cap_str[i])
+		*s++ = '-';
+	*s = 0;
+	return cap_str[i];
+}
+
+/*
+ * Cap reservations
+ *
+ * Maintain a global pool of preallocated struct ceph_caps, referenced
+ * by struct ceph_caps_reservations.  This ensures that we preallocate
+ * memory needed to successfully process an MDS response.  (If an MDS
+ * sends us cap information and we fail to process it, we will have
+ * problems due to the client and MDS being out of sync.)
+ *
+ * Reservations are 'owned' by a ceph_cap_reservation context.
+ */
+static spinlock_t caps_list_lock;
+static struct list_head caps_list;  /* unused (reserved or unreserved) */
+static int caps_total_count;        /* total caps allocated */
+static int caps_use_count;          /* in use */
+static int caps_reserve_count;      /* unused, reserved */
+static int caps_avail_count;        /* unused, unreserved */
+static int caps_min_count;          /* keep at least this many (unreserved) */
+
+void __init ceph_caps_init(void)
+{
+	INIT_LIST_HEAD(&caps_list);
+	spin_lock_init(&caps_list_lock);
+}
+
+void ceph_caps_finalize(void)
+{
+	struct ceph_cap *cap;
+
+	spin_lock(&caps_list_lock);
+	while (!list_empty(&caps_list)) {
+		cap = list_first_entry(&caps_list, struct ceph_cap, caps_item);
+		list_del(&cap->caps_item);
+		kmem_cache_free(ceph_cap_cachep, cap);
+	}
+	caps_total_count = 0;
+	caps_avail_count = 0;
+	caps_use_count = 0;
+	caps_reserve_count = 0;
+	caps_min_count = 0;
+	spin_unlock(&caps_list_lock);
+}
+
+void ceph_adjust_min_caps(int delta)
+{
+	spin_lock(&caps_list_lock);
+	caps_min_count += delta;
+	BUG_ON(caps_min_count < 0);
+	spin_unlock(&caps_list_lock);
+}
+
+int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need)
+{
+	int i;
+	struct ceph_cap *cap;
+	int have;
+	int alloc = 0;
+	LIST_HEAD(newcaps);
+	int ret = 0;
+
+	dout("reserve caps ctx=%p need=%d\n", ctx, need);
+
+	/* first reserve any caps that are already allocated */
+	spin_lock(&caps_list_lock);
+	if (caps_avail_count >= need)
+		have = need;
+	else
+		have = caps_avail_count;
+	caps_avail_count -= have;
+	caps_reserve_count += have;
+	BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
+	       caps_avail_count);
+	spin_unlock(&caps_list_lock);
+
+	for (i = have; i < need; i++) {
+		cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
+		if (!cap) {
+			ret = -ENOMEM;
+			goto out_alloc_count;
+		}
+		list_add(&cap->caps_item, &newcaps);
+		alloc++;
+	}
+	BUG_ON(have + alloc != need);
+
+	spin_lock(&caps_list_lock);
+	caps_total_count += alloc;
+	caps_reserve_count += alloc;
+	list_splice(&newcaps, &caps_list);
+
+	BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
+	       caps_avail_count);
+	spin_unlock(&caps_list_lock);
+
+	ctx->count = need;
+	dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n",
+	     ctx, caps_total_count, caps_use_count, caps_reserve_count,
+	     caps_avail_count);
+	return 0;
+
+out_alloc_count:
+	/* we didn't manage to reserve as much as we needed */
+	pr_warning("reserve caps ctx=%p ENOMEM need=%d got=%d\n",
+		   ctx, need, have);
+	return ret;
+}
+
+int ceph_unreserve_caps(struct ceph_cap_reservation *ctx)
+{
+	dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count);
+	if (ctx->count) {
+		spin_lock(&caps_list_lock);
+		BUG_ON(caps_reserve_count < ctx->count);
+		caps_reserve_count -= ctx->count;
+		caps_avail_count += ctx->count;
+		ctx->count = 0;
+		dout("unreserve caps %d = %d used + %d resv + %d avail\n",
+		     caps_total_count, caps_use_count, caps_reserve_count,
+		     caps_avail_count);
+		BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
+		       caps_avail_count);
+		spin_unlock(&caps_list_lock);
+	}
+	return 0;
+}
+
+static struct ceph_cap *get_cap(struct ceph_cap_reservation *ctx)
+{
+	struct ceph_cap *cap = NULL;
+
+	/* temporary, until we do something about cap import/export */
+	if (!ctx) {
+		cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
+		if (cap) {
+			caps_use_count++;
+			caps_total_count++;
+		}
+		return cap;
+	}
+
+	spin_lock(&caps_list_lock);
+	dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n",
+	     ctx, ctx->count, caps_total_count, caps_use_count,
+	     caps_reserve_count, caps_avail_count);
+	BUG_ON(!ctx->count);
+	BUG_ON(ctx->count > caps_reserve_count);
+	BUG_ON(list_empty(&caps_list));
+
+	ctx->count--;
+	caps_reserve_count--;
+	caps_use_count++;
+
+	cap = list_first_entry(&caps_list, struct ceph_cap, caps_item);
+	list_del(&cap->caps_item);
+
+	BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
+	       caps_avail_count);
+	spin_unlock(&caps_list_lock);
+	return cap;
+}
+
+void ceph_put_cap(struct ceph_cap *cap)
+{
+	spin_lock(&caps_list_lock);
+	dout("put_cap %p %d = %d used + %d resv + %d avail\n",
+	     cap, caps_total_count, caps_use_count,
+	     caps_reserve_count, caps_avail_count);
+	caps_use_count--;
+	/*
+	 * Keep some preallocated caps around (ceph_min_count), to
+	 * avoid lots of free/alloc churn.
+	 */
+	if (caps_avail_count >= caps_reserve_count + caps_min_count) {
+		caps_total_count--;
+		kmem_cache_free(ceph_cap_cachep, cap);
+	} else {
+		caps_avail_count++;
+		list_add(&cap->caps_item, &caps_list);
+	}
+
+	BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
+	       caps_avail_count);
+	spin_unlock(&caps_list_lock);
+}
+
+void ceph_reservation_status(struct ceph_client *client,
+			     int *total, int *avail, int *used, int *reserved,
+			     int *min)
+{
+	if (total)
+		*total = caps_total_count;
+	if (avail)
+		*avail = caps_avail_count;
+	if (used)
+		*used = caps_use_count;
+	if (reserved)
+		*reserved = caps_reserve_count;
+	if (min)
+		*min = caps_min_count;
+}
+
+/*
+ * Find ceph_cap for given mds, if any.
+ *
+ * Called with i_lock held.
+ */
+static struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds)
+{
+	struct ceph_cap *cap;
+	struct rb_node *n = ci->i_caps.rb_node;
+
+	while (n) {
+		cap = rb_entry(n, struct ceph_cap, ci_node);
+		if (mds < cap->mds)
+			n = n->rb_left;
+		else if (mds > cap->mds)
+			n = n->rb_right;
+		else
+			return cap;
+	}
+	return NULL;
+}
+
+/*
+ * Return id of any MDS with a cap, preferably FILE_WR|WRBUFFER|EXCL, else
+ * -1.
+ */
+static int __ceph_get_cap_mds(struct ceph_inode_info *ci, u32 *mseq)
+{
+	struct ceph_cap *cap;
+	int mds = -1;
+	struct rb_node *p;
+
+	/* prefer mds with WR|WRBUFFER|EXCL caps */
+	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
+		cap = rb_entry(p, struct ceph_cap, ci_node);
+		mds = cap->mds;
+		if (mseq)
+			*mseq = cap->mseq;
+		if (cap->issued & (CEPH_CAP_FILE_WR |
+				   CEPH_CAP_FILE_BUFFER |
+				   CEPH_CAP_FILE_EXCL))
+			break;
+	}
+	return mds;
+}
+
+int ceph_get_cap_mds(struct inode *inode)
+{
+	int mds;
+	spin_lock(&inode->i_lock);
+	mds = __ceph_get_cap_mds(ceph_inode(inode), NULL);
+	spin_unlock(&inode->i_lock);
+	return mds;
+}
+
+/*
+ * Called under i_lock.
+ */
+static void __insert_cap_node(struct ceph_inode_info *ci,
+			      struct ceph_cap *new)
+{
+	struct rb_node **p = &ci->i_caps.rb_node;
+	struct rb_node *parent = NULL;
+	struct ceph_cap *cap = NULL;
+
+	while (*p) {
+		parent = *p;
+		cap = rb_entry(parent, struct ceph_cap, ci_node);
+		if (new->mds < cap->mds)
+			p = &(*p)->rb_left;
+		else if (new->mds > cap->mds)
+			p = &(*p)->rb_right;
+		else
+			BUG();
+	}
+
+	rb_link_node(&new->ci_node, parent, p);
+	rb_insert_color(&new->ci_node, &ci->i_caps);
+}
+
+/*
+ * (re)set cap hold timeouts, which control the delayed release
+ * of unused caps back to the MDS.  Should be called on cap use.
+ */
+static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
+			       struct ceph_inode_info *ci)
+{
+	struct ceph_mount_args *ma = mdsc->client->mount_args;
+
+	ci->i_hold_caps_min = round_jiffies(jiffies +
+					    ma->caps_wanted_delay_min * HZ);
+	ci->i_hold_caps_max = round_jiffies(jiffies +
+					    ma->caps_wanted_delay_max * HZ);
+	dout("__cap_set_timeouts %p min %lu max %lu\n", &ci->vfs_inode,
+	     ci->i_hold_caps_min - jiffies, ci->i_hold_caps_max - jiffies);
+}
+
+/*
+ * (Re)queue cap at the end of the delayed cap release list.
+ *
+ * If I_FLUSH is set, leave the inode at the front of the list.
+ *
+ * Caller holds i_lock
+ *    -> we take mdsc->cap_delay_lock
+ */
+static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
+				struct ceph_inode_info *ci)
+{
+	__cap_set_timeouts(mdsc, ci);
+	dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode,
+	     ci->i_ceph_flags, ci->i_hold_caps_max);
+	if (!mdsc->stopping) {
+		spin_lock(&mdsc->cap_delay_lock);
+		if (!list_empty(&ci->i_cap_delay_list)) {
+			if (ci->i_ceph_flags & CEPH_I_FLUSH)
+				goto no_change;
+			list_del_init(&ci->i_cap_delay_list);
+		}
+		list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
+no_change:
+		spin_unlock(&mdsc->cap_delay_lock);
+	}
+}
+
+/*
+ * Queue an inode for immediate writeback.  Mark inode with I_FLUSH,
+ * indicating we should send a cap message to flush dirty metadata
+ * asap, and move to the front of the delayed cap list.
+ */
+static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc,
+				      struct ceph_inode_info *ci)
+{
+	dout("__cap_delay_requeue_front %p\n", &ci->vfs_inode);
+	spin_lock(&mdsc->cap_delay_lock);
+	ci->i_ceph_flags |= CEPH_I_FLUSH;
+	if (!list_empty(&ci->i_cap_delay_list))
+		list_del_init(&ci->i_cap_delay_list);
+	list_add(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
+	spin_unlock(&mdsc->cap_delay_lock);
+}
+
+/*
+ * Cancel delayed work on cap.
+ *
+ * Caller must hold i_lock.
+ */
+static void __cap_delay_cancel(struct ceph_mds_client *mdsc,
+			       struct ceph_inode_info *ci)
+{
+	dout("__cap_delay_cancel %p\n", &ci->vfs_inode);
+	if (list_empty(&ci->i_cap_delay_list))
+		return;
+	spin_lock(&mdsc->cap_delay_lock);
+	list_del_init(&ci->i_cap_delay_list);
+	spin_unlock(&mdsc->cap_delay_lock);
+}
+
+/*
+ * Common issue checks for add_cap, handle_cap_grant.
+ */
+static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
+			      unsigned issued)
+{
+	unsigned had = __ceph_caps_issued(ci, NULL);
+
+	/*
+	 * Each time we receive FILE_CACHE anew, we increment
+	 * i_rdcache_gen.
+	 */
+	if ((issued & CEPH_CAP_FILE_CACHE) &&
+	    (had & CEPH_CAP_FILE_CACHE) == 0)
+		ci->i_rdcache_gen++;
+
+	/*
+	 * if we are newly issued FILE_SHARED, clear I_COMPLETE; we
+	 * don't know what happened to this directory while we didn't
+	 * have the cap.
+	 */
+	if ((issued & CEPH_CAP_FILE_SHARED) &&
+	    (had & CEPH_CAP_FILE_SHARED) == 0) {
+		ci->i_shared_gen++;
+		if (S_ISDIR(ci->vfs_inode.i_mode)) {
+			dout(" marking %p NOT complete\n", &ci->vfs_inode);
+			ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
+		}
+	}
+}
+
+/*
+ * Add a capability under the given MDS session.
+ *
+ * Caller should hold session snap_rwsem (read) and s_mutex.
+ *
+ * @fmode is the open file mode, if we are opening a file, otherwise
+ * it is < 0.  (This is so we can atomically add the cap and add an
+ * open file reference to it.)
+ */
+int ceph_add_cap(struct inode *inode,
+		 struct ceph_mds_session *session, u64 cap_id,
+		 int fmode, unsigned issued, unsigned wanted,
+		 unsigned seq, unsigned mseq, u64 realmino, int flags,
+		 struct ceph_cap_reservation *caps_reservation)
+{
+	struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_cap *new_cap = NULL;
+	struct ceph_cap *cap;
+	int mds = session->s_mds;
+	int actual_wanted;
+
+	dout("add_cap %p mds%d cap %llx %s seq %d\n", inode,
+	     session->s_mds, cap_id, ceph_cap_string(issued), seq);
+
+	/*
+	 * If we are opening the file, include file mode wanted bits
+	 * in wanted.
+	 */
+	if (fmode >= 0)
+		wanted |= ceph_caps_for_mode(fmode);
+
+retry:
+	spin_lock(&inode->i_lock);
+	cap = __get_cap_for_mds(ci, mds);
+	if (!cap) {
+		if (new_cap) {
+			cap = new_cap;
+			new_cap = NULL;
+		} else {
+			spin_unlock(&inode->i_lock);
+			new_cap = get_cap(caps_reservation);
+			if (new_cap == NULL)
+				return -ENOMEM;
+			goto retry;
+		}
+
+		cap->issued = 0;
+		cap->implemented = 0;
+		cap->mds = mds;
+		cap->mds_wanted = 0;
+
+		cap->ci = ci;
+		__insert_cap_node(ci, cap);
+
+		/* clear out old exporting info?  (i.e. on cap import) */
+		if (ci->i_cap_exporting_mds == mds) {
+			ci->i_cap_exporting_issued = 0;
+			ci->i_cap_exporting_mseq = 0;
+			ci->i_cap_exporting_mds = -1;
+		}
+
+		/* add to session cap list */
+		cap->session = session;
+		spin_lock(&session->s_cap_lock);
+		list_add_tail(&cap->session_caps, &session->s_caps);
+		session->s_nr_caps++;
+		spin_unlock(&session->s_cap_lock);
+	}
+
+	if (!ci->i_snap_realm) {
+		/*
+		 * add this inode to the appropriate snap realm
+		 */
+		struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc,
+							       realmino);
+		if (realm) {
+			ceph_get_snap_realm(mdsc, realm);
+			spin_lock(&realm->inodes_with_caps_lock);
+			ci->i_snap_realm = realm;
+			list_add(&ci->i_snap_realm_item,
+				 &realm->inodes_with_caps);
+			spin_unlock(&realm->inodes_with_caps_lock);
+		} else {
+			pr_err("ceph_add_cap: couldn't find snap realm %llx\n",
+			       realmino);
+		}
+	}
+
+	__check_cap_issue(ci, cap, issued);
+
+	/*
+	 * If we are issued caps we don't want, or the mds' wanted
+	 * value appears to be off, queue a check so we'll release
+	 * later and/or update the mds wanted value.
+	 */
+	actual_wanted = __ceph_caps_wanted(ci);
+	if ((wanted & ~actual_wanted) ||
+	    (issued & ~actual_wanted & CEPH_CAP_ANY_WR)) {
+		dout(" issued %s, mds wanted %s, actual %s, queueing\n",
+		     ceph_cap_string(issued), ceph_cap_string(wanted),
+		     ceph_cap_string(actual_wanted));
+		__cap_delay_requeue(mdsc, ci);
+	}
+
+	if (flags & CEPH_CAP_FLAG_AUTH)
+		ci->i_auth_cap = cap;
+	else if (ci->i_auth_cap == cap)
+		ci->i_auth_cap = NULL;
+
+	dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n",
+	     inode, ceph_vinop(inode), cap, ceph_cap_string(issued),
+	     ceph_cap_string(issued|cap->issued), seq, mds);
+	cap->cap_id = cap_id;
+	cap->issued = issued;
+	cap->implemented |= issued;
+	cap->mds_wanted |= wanted;
+	cap->seq = seq;
+	cap->issue_seq = seq;
+	cap->mseq = mseq;
+	cap->cap_gen = session->s_cap_gen;
+
+	if (fmode >= 0)
+		__ceph_get_fmode(ci, fmode);
+	spin_unlock(&inode->i_lock);
+	wake_up_all(&ci->i_cap_wq);
+	return 0;
+}
+
+/*
+ * Return true if cap has not timed out and belongs to the current
+ * generation of the MDS session (i.e. has not gone 'stale' due to
+ * us losing touch with the mds).
+ */
+static int __cap_is_valid(struct ceph_cap *cap)
+{
+	unsigned long ttl;
+	u32 gen;
+
+	spin_lock(&cap->session->s_cap_lock);
+	gen = cap->session->s_cap_gen;
+	ttl = cap->session->s_cap_ttl;
+	spin_unlock(&cap->session->s_cap_lock);
+
+	if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) {
+		dout("__cap_is_valid %p cap %p issued %s "
+		     "but STALE (gen %u vs %u)\n", &cap->ci->vfs_inode,
+		     cap, ceph_cap_string(cap->issued), cap->cap_gen, gen);
+		return 0;
+	}
+
+	return 1;
+}
+
+/*
+ * Return set of valid cap bits issued to us.  Note that caps time
+ * out, and may be invalidated in bulk if the client session times out
+ * and session->s_cap_gen is bumped.
+ */
+int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented)
+{
+	int have = ci->i_snap_caps | ci->i_cap_exporting_issued;
+	struct ceph_cap *cap;
+	struct rb_node *p;
+
+	if (implemented)
+		*implemented = 0;
+	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
+		cap = rb_entry(p, struct ceph_cap, ci_node);
+		if (!__cap_is_valid(cap))
+			continue;
+		dout("__ceph_caps_issued %p cap %p issued %s\n",
+		     &ci->vfs_inode, cap, ceph_cap_string(cap->issued));
+		have |= cap->issued;
+		if (implemented)
+			*implemented |= cap->implemented;
+	}
+	return have;
+}
+
+/*
+ * Get cap bits issued by caps other than @ocap
+ */
+int __ceph_caps_issued_other(struct ceph_inode_info *ci, struct ceph_cap *ocap)
+{
+	int have = ci->i_snap_caps;
+	struct ceph_cap *cap;
+	struct rb_node *p;
+
+	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
+		cap = rb_entry(p, struct ceph_cap, ci_node);
+		if (cap == ocap)
+			continue;
+		if (!__cap_is_valid(cap))
+			continue;
+		have |= cap->issued;
+	}
+	return have;
+}
+
+/*
+ * Move a cap to the end of the LRU (oldest caps at list head, newest
+ * at list tail).
+ */
+static void __touch_cap(struct ceph_cap *cap)
+{
+	struct ceph_mds_session *s = cap->session;
+
+	spin_lock(&s->s_cap_lock);
+	if (s->s_cap_iterator == NULL) {
+		dout("__touch_cap %p cap %p mds%d\n", &cap->ci->vfs_inode, cap,
+		     s->s_mds);
+		list_move_tail(&cap->session_caps, &s->s_caps);
+	} else {
+		dout("__touch_cap %p cap %p mds%d NOP, iterating over caps\n",
+		     &cap->ci->vfs_inode, cap, s->s_mds);
+	}
+	spin_unlock(&s->s_cap_lock);
+}
+
+/*
+ * Check if we hold the given mask.  If so, move the cap(s) to the
+ * front of their respective LRUs.  (This is the preferred way for
+ * callers to check for caps they want.)
+ */
+int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
+{
+	struct ceph_cap *cap;
+	struct rb_node *p;
+	int have = ci->i_snap_caps;
+
+	if ((have & mask) == mask) {
+		dout("__ceph_caps_issued_mask %p snap issued %s"
+		     " (mask %s)\n", &ci->vfs_inode,
+		     ceph_cap_string(have),
+		     ceph_cap_string(mask));
+		return 1;
+	}
+
+	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
+		cap = rb_entry(p, struct ceph_cap, ci_node);
+		if (!__cap_is_valid(cap))
+			continue;
+		if ((cap->issued & mask) == mask) {
+			dout("__ceph_caps_issued_mask %p cap %p issued %s"
+			     " (mask %s)\n", &ci->vfs_inode, cap,
+			     ceph_cap_string(cap->issued),
+			     ceph_cap_string(mask));
+			if (touch)
+				__touch_cap(cap);
+			return 1;
+		}
+
+		/* does a combination of caps satisfy mask? */
+		have |= cap->issued;
+		if ((have & mask) == mask) {
+			dout("__ceph_caps_issued_mask %p combo issued %s"
+			     " (mask %s)\n", &ci->vfs_inode,
+			     ceph_cap_string(cap->issued),
+			     ceph_cap_string(mask));
+			if (touch) {
+				struct rb_node *q;
+
+				/* touch this + preceeding caps */
+				__touch_cap(cap);
+				for (q = rb_first(&ci->i_caps); q != p;
+				     q = rb_next(q)) {
+					cap = rb_entry(q, struct ceph_cap,
+						       ci_node);
+					if (!__cap_is_valid(cap))
+						continue;
+					__touch_cap(cap);
+				}
+			}
+			return 1;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Return true if mask caps are currently being revoked by an MDS.
+ */
+int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
+{
+	struct inode *inode = &ci->vfs_inode;
+	struct ceph_cap *cap;
+	struct rb_node *p;
+	int ret = 0;
+
+	spin_lock(&inode->i_lock);
+	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
+		cap = rb_entry(p, struct ceph_cap, ci_node);
+		if (__cap_is_valid(cap) &&
+		    (cap->implemented & ~cap->issued & mask)) {
+			ret = 1;
+			break;
+		}
+	}
+	spin_unlock(&inode->i_lock);
+	dout("ceph_caps_revoking %p %s = %d\n", inode,
+	     ceph_cap_string(mask), ret);
+	return ret;
+}
+
+int __ceph_caps_used(struct ceph_inode_info *ci)
+{
+	int used = 0;
+	if (ci->i_pin_ref)
+		used |= CEPH_CAP_PIN;
+	if (ci->i_rd_ref)
+		used |= CEPH_CAP_FILE_RD;
+	if (ci->i_rdcache_ref || ci->i_rdcache_gen)
+		used |= CEPH_CAP_FILE_CACHE;
+	if (ci->i_wr_ref)
+		used |= CEPH_CAP_FILE_WR;
+	if (ci->i_wrbuffer_ref)
+		used |= CEPH_CAP_FILE_BUFFER;
+	return used;
+}
+
+/*
+ * wanted, by virtue of open file modes
+ */
+int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
+{
+	int want = 0;
+	int mode;
+	for (mode = 0; mode < 4; mode++)
+		if (ci->i_nr_by_mode[mode])
+			want |= ceph_caps_for_mode(mode);
+	return want;
+}
+
+/*
+ * Return caps we have registered with the MDS(s) as 'wanted'.
+ */
+int __ceph_caps_mds_wanted(struct ceph_inode_info *ci)
+{
+	struct ceph_cap *cap;
+	struct rb_node *p;
+	int mds_wanted = 0;
+
+	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
+		cap = rb_entry(p, struct ceph_cap, ci_node);
+		if (!__cap_is_valid(cap))
+			continue;
+		mds_wanted |= cap->mds_wanted;
+	}
+	return mds_wanted;
+}
+
+/*
+ * called under i_lock
+ */
+static int __ceph_is_any_caps(struct ceph_inode_info *ci)
+{
+	return !RB_EMPTY_ROOT(&ci->i_caps) || ci->i_cap_exporting_mds >= 0;
+}
+
+/*
+ * Remove a cap.  Take steps to deal with a racing iterate_session_caps.
+ *
+ * caller should hold i_lock.
+ * caller will not hold session s_mutex if called from destroy_inode.
+ */
+void __ceph_remove_cap(struct ceph_cap *cap)
+{
+	struct ceph_mds_session *session = cap->session;
+	struct ceph_inode_info *ci = cap->ci;
+	struct ceph_mds_client *mdsc =
+		&ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
+	int removed = 0;
+
+	dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
+
+	/* remove from session list */
+	spin_lock(&session->s_cap_lock);
+	if (session->s_cap_iterator == cap) {
+		/* not yet, we are iterating over this very cap */
+		dout("__ceph_remove_cap  delaying %p removal from session %p\n",
+		     cap, cap->session);
+	} else {
+		list_del_init(&cap->session_caps);
+		session->s_nr_caps--;
+		cap->session = NULL;
+		removed = 1;
+	}
+	/* protect backpointer with s_cap_lock: see iterate_session_caps */
+	cap->ci = NULL;
+	spin_unlock(&session->s_cap_lock);
+
+	/* remove from inode list */
+	rb_erase(&cap->ci_node, &ci->i_caps);
+	if (ci->i_auth_cap == cap)
+		ci->i_auth_cap = NULL;
+
+	if (removed)
+		ceph_put_cap(cap);
+
+	if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) {
+		struct ceph_snap_realm *realm = ci->i_snap_realm;
+		spin_lock(&realm->inodes_with_caps_lock);
+		list_del_init(&ci->i_snap_realm_item);
+		ci->i_snap_realm_counter++;
+		ci->i_snap_realm = NULL;
+		spin_unlock(&realm->inodes_with_caps_lock);
+		ceph_put_snap_realm(mdsc, realm);
+	}
+	if (!__ceph_is_any_real_caps(ci))
+		__cap_delay_cancel(mdsc, ci);
+}
+
+/*
+ * Build and send a cap message to the given MDS.
+ *
+ * Caller should be holding s_mutex.
+ */
+static int send_cap_msg(struct ceph_mds_session *session,
+			u64 ino, u64 cid, int op,
+			int caps, int wanted, int dirty,
+			u32 seq, u64 flush_tid, u32 issue_seq, u32 mseq,
+			u64 size, u64 max_size,
+			struct timespec *mtime, struct timespec *atime,
+			u64 time_warp_seq,
+			uid_t uid, gid_t gid, mode_t mode,
+			u64 xattr_version,
+			struct ceph_buffer *xattrs_buf,
+			u64 follows)
+{
+	struct ceph_mds_caps *fc;
+	struct ceph_msg *msg;
+
+	dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s"
+	     " seq %u/%u mseq %u follows %lld size %llu/%llu"
+	     " xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(op),
+	     cid, ino, ceph_cap_string(caps), ceph_cap_string(wanted),
+	     ceph_cap_string(dirty),
+	     seq, issue_seq, mseq, follows, size, max_size,
+	     xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0);
+
+	msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), GFP_NOFS);
+	if (!msg)
+		return -ENOMEM;
+
+	msg->hdr.tid = cpu_to_le64(flush_tid);
+
+	fc = msg->front.iov_base;
+	memset(fc, 0, sizeof(*fc));
+
+	fc->cap_id = cpu_to_le64(cid);
+	fc->op = cpu_to_le32(op);
+	fc->seq = cpu_to_le32(seq);
+	fc->issue_seq = cpu_to_le32(issue_seq);
+	fc->migrate_seq = cpu_to_le32(mseq);
+	fc->caps = cpu_to_le32(caps);
+	fc->wanted = cpu_to_le32(wanted);
+	fc->dirty = cpu_to_le32(dirty);
+	fc->ino = cpu_to_le64(ino);
+	fc->snap_follows = cpu_to_le64(follows);
+
+	fc->size = cpu_to_le64(size);
+	fc->max_size = cpu_to_le64(max_size);
+	if (mtime)
+		ceph_encode_timespec(&fc->mtime, mtime);
+	if (atime)
+		ceph_encode_timespec(&fc->atime, atime);
+	fc->time_warp_seq = cpu_to_le32(time_warp_seq);
+
+	fc->uid = cpu_to_le32(uid);
+	fc->gid = cpu_to_le32(gid);
+	fc->mode = cpu_to_le32(mode);
+
+	fc->xattr_version = cpu_to_le64(xattr_version);
+	if (xattrs_buf) {
+		msg->middle = ceph_buffer_get(xattrs_buf);
+		fc->xattr_len = cpu_to_le32(xattrs_buf->vec.iov_len);
+		msg->hdr.middle_len = cpu_to_le32(xattrs_buf->vec.iov_len);
+	}
+
+	ceph_con_send(&session->s_con, msg);
+	return 0;
+}
+
+static void __queue_cap_release(struct ceph_mds_session *session,
+				u64 ino, u64 cap_id, u32 migrate_seq,
+				u32 issue_seq)
+{
+	struct ceph_msg *msg;
+	struct ceph_mds_cap_release *head;
+	struct ceph_mds_cap_item *item;
+
+	spin_lock(&session->s_cap_lock);
+	BUG_ON(!session->s_num_cap_releases);
+	msg = list_first_entry(&session->s_cap_releases,
+			       struct ceph_msg, list_head);
+
+	dout(" adding %llx release to mds%d msg %p (%d left)\n",
+	     ino, session->s_mds, msg, session->s_num_cap_releases);
+
+	BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE);
+	head = msg->front.iov_base;
+	head->num = cpu_to_le32(le32_to_cpu(head->num) + 1);
+	item = msg->front.iov_base + msg->front.iov_len;
+	item->ino = cpu_to_le64(ino);
+	item->cap_id = cpu_to_le64(cap_id);
+	item->migrate_seq = cpu_to_le32(migrate_seq);
+	item->seq = cpu_to_le32(issue_seq);
+
+	session->s_num_cap_releases--;
+
+	msg->front.iov_len += sizeof(*item);
+	if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) {
+		dout(" release msg %p full\n", msg);
+		list_move_tail(&msg->list_head, &session->s_cap_releases_done);
+	} else {
+		dout(" release msg %p at %d/%d (%d)\n", msg,
+		     (int)le32_to_cpu(head->num),
+		     (int)CEPH_CAPS_PER_RELEASE,
+		     (int)msg->front.iov_len);
+	}
+	spin_unlock(&session->s_cap_lock);
+}
+
+/*
+ * Queue cap releases when an inode is dropped from our cache.  Since
+ * inode is about to be destroyed, there is no need for i_lock.
+ */
+void ceph_queue_caps_release(struct inode *inode)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct rb_node *p;
+
+	p = rb_first(&ci->i_caps);
+	while (p) {
+		struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
+		struct ceph_mds_session *session = cap->session;
+
+		__queue_cap_release(session, ceph_ino(inode), cap->cap_id,
+				    cap->mseq, cap->issue_seq);
+		p = rb_next(p);
+		__ceph_remove_cap(cap);
+	}
+}
+
+/*
+ * Send a cap msg on the given inode.  Update our caps state, then
+ * drop i_lock and send the message.
+ *
+ * Make note of max_size reported/requested from mds, revoked caps
+ * that have now been implemented.
+ *
+ * Make half-hearted attempt ot to invalidate page cache if we are
+ * dropping RDCACHE.  Note that this will leave behind locked pages
+ * that we'll then need to deal with elsewhere.
+ *
+ * Return non-zero if delayed release, or we experienced an error
+ * such that the caller should requeue + retry later.
+ *
+ * called with i_lock, then drops it.
+ * caller should hold snap_rwsem (read), s_mutex.
+ */
+static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
+		      int op, int used, int want, int retain, int flushing,
+		      unsigned *pflush_tid)
+	__releases(cap->ci->vfs_inode->i_lock)
+{
+	struct ceph_inode_info *ci = cap->ci;
+	struct inode *inode = &ci->vfs_inode;
+	u64 cap_id = cap->cap_id;
+	int held, revoking, dropping, keep;
+	u64 seq, issue_seq, mseq, time_warp_seq, follows;
+	u64 size, max_size;
+	struct timespec mtime, atime;
+	int wake = 0;
+	mode_t mode;
+	uid_t uid;
+	gid_t gid;
+	struct ceph_mds_session *session;
+	u64 xattr_version = 0;
+	int delayed = 0;
+	u64 flush_tid = 0;
+	int i;
+	int ret;
+
+	held = cap->issued | cap->implemented;
+	revoking = cap->implemented & ~cap->issued;
+	retain &= ~revoking;
+	dropping = cap->issued & ~retain;
+
+	dout("__send_cap %p cap %p session %p %s -> %s (revoking %s)\n",
+	     inode, cap, cap->session,
+	     ceph_cap_string(held), ceph_cap_string(held & retain),
+	     ceph_cap_string(revoking));
+	BUG_ON((retain & CEPH_CAP_PIN) == 0);
+
+	session = cap->session;
+
+	/* don't release wanted unless we've waited a bit. */
+	if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
+	    time_before(jiffies, ci->i_hold_caps_min)) {
+		dout(" delaying issued %s -> %s, wanted %s -> %s on send\n",
+		     ceph_cap_string(cap->issued),
+		     ceph_cap_string(cap->issued & retain),
+		     ceph_cap_string(cap->mds_wanted),
+		     ceph_cap_string(want));
+		want |= cap->mds_wanted;
+		retain |= cap->issued;
+		delayed = 1;
+	}
+	ci->i_ceph_flags &= ~(CEPH_I_NODELAY | CEPH_I_FLUSH);
+
+	cap->issued &= retain;  /* drop bits we don't want */
+	if (cap->implemented & ~cap->issued) {
+		/*
+		 * Wake up any waiters on wanted -> needed transition.
+		 * This is due to the weird transition from buffered
+		 * to sync IO... we need to flush dirty pages _before_
+		 * allowing sync writes to avoid reordering.
+		 */
+		wake = 1;
+	}
+	cap->implemented &= cap->issued | used;
+	cap->mds_wanted = want;
+
+	if (flushing) {
+		/*
+		 * assign a tid for flush operations so we can avoid
+		 * flush1 -> dirty1 -> flush2 -> flushack1 -> mark
+		 * clean type races.  track latest tid for every bit
+		 * so we can handle flush AxFw, flush Fw, and have the
+		 * first ack clean Ax.
+		 */
+		flush_tid = ++ci->i_cap_flush_last_tid;
+		if (pflush_tid)
+			*pflush_tid = flush_tid;
+		dout(" cap_flush_tid %d\n", (int)flush_tid);
+		for (i = 0; i < CEPH_CAP_BITS; i++)
+			if (flushing & (1 << i))
+				ci->i_cap_flush_tid[i] = flush_tid;
+	}
+
+	keep = cap->implemented;
+	seq = cap->seq;
+	issue_seq = cap->issue_seq;
+	mseq = cap->mseq;
+	size = inode->i_size;
+	ci->i_reported_size = size;
+	max_size = ci->i_wanted_max_size;
+	ci->i_requested_max_size = max_size;
+	mtime = inode->i_mtime;
+	atime = inode->i_atime;
+	time_warp_seq = ci->i_time_warp_seq;
+	follows = ci->i_snap_realm->cached_context->seq;
+	uid = inode->i_uid;
+	gid = inode->i_gid;
+	mode = inode->i_mode;
+
+	if (dropping & CEPH_CAP_XATTR_EXCL) {
+		__ceph_build_xattrs_blob(ci);
+		xattr_version = ci->i_xattrs.version + 1;
+	}
+
+	spin_unlock(&inode->i_lock);
+
+	ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id,
+		op, keep, want, flushing, seq, flush_tid, issue_seq, mseq,
+		size, max_size, &mtime, &atime, time_warp_seq,
+		uid, gid, mode,
+		xattr_version,
+		(flushing & CEPH_CAP_XATTR_EXCL) ? ci->i_xattrs.blob : NULL,
+		follows);
+	if (ret < 0) {
+		dout("error sending cap msg, must requeue %p\n", inode);
+		delayed = 1;
+	}
+
+	if (wake)
+		wake_up_all(&ci->i_cap_wq);
+
+	return delayed;
+}
+
+/*
+ * When a snapshot is taken, clients accumulate dirty metadata on
+ * inodes with capabilities in ceph_cap_snaps to describe the file
+ * state at the time the snapshot was taken.  This must be flushed
+ * asynchronously back to the MDS once sync writes complete and dirty
+ * data is written out.
+ *
+ * Called under i_lock.  Takes s_mutex as needed.
+ */
+void __ceph_flush_snaps(struct ceph_inode_info *ci,
+			struct ceph_mds_session **psession)
+{
+	struct inode *inode = &ci->vfs_inode;
+	int mds;
+	struct ceph_cap_snap *capsnap;
+	u32 mseq;
+	struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
+	struct ceph_mds_session *session = NULL; /* if session != NULL, we hold
+						    session->s_mutex */
+	u64 next_follows = 0;  /* keep track of how far we've gotten through the
+			     i_cap_snaps list, and skip these entries next time
+			     around to avoid an infinite loop */
+
+	if (psession)
+		session = *psession;
+
+	dout("__flush_snaps %p\n", inode);
+retry:
+	list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
+		/* avoid an infiniute loop after retry */
+		if (capsnap->follows < next_follows)
+			continue;
+		/*
+		 * we need to wait for sync writes to complete and for dirty
+		 * pages to be written out.
+		 */
+		if (capsnap->dirty_pages || capsnap->writing)
+			continue;
+
+		/*
+		 * if cap writeback already occurred, we should have dropped
+		 * the capsnap in ceph_put_wrbuffer_cap_refs.
+		 */
+		BUG_ON(capsnap->dirty == 0);
+
+		/* pick mds, take s_mutex */
+		mds = __ceph_get_cap_mds(ci, &mseq);
+		if (session && session->s_mds != mds) {
+			dout("oops, wrong session %p mutex\n", session);
+			mutex_unlock(&session->s_mutex);
+			ceph_put_mds_session(session);
+			session = NULL;
+		}
+		if (!session) {
+			spin_unlock(&inode->i_lock);
+			mutex_lock(&mdsc->mutex);
+			session = __ceph_lookup_mds_session(mdsc, mds);
+			mutex_unlock(&mdsc->mutex);
+			if (session) {
+				dout("inverting session/ino locks on %p\n",
+				     session);
+				mutex_lock(&session->s_mutex);
+			}
+			/*
+			 * if session == NULL, we raced against a cap
+			 * deletion.  retry, and we'll get a better
+			 * @mds value next time.
+			 */
+			spin_lock(&inode->i_lock);
+			goto retry;
+		}
+
+		capsnap->flush_tid = ++ci->i_cap_flush_last_tid;
+		atomic_inc(&capsnap->nref);
+		if (!list_empty(&capsnap->flushing_item))
+			list_del_init(&capsnap->flushing_item);
+		list_add_tail(&capsnap->flushing_item,
+			      &session->s_cap_snaps_flushing);
+		spin_unlock(&inode->i_lock);
+
+		dout("flush_snaps %p cap_snap %p follows %lld size %llu\n",
+		     inode, capsnap, next_follows, capsnap->size);
+		send_cap_msg(session, ceph_vino(inode).ino, 0,
+			     CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0,
+			     capsnap->dirty, 0, capsnap->flush_tid, 0, mseq,
+			     capsnap->size, 0,
+			     &capsnap->mtime, &capsnap->atime,
+			     capsnap->time_warp_seq,
+			     capsnap->uid, capsnap->gid, capsnap->mode,
+			     0, NULL,
+			     capsnap->follows);
+
+		next_follows = capsnap->follows + 1;
+		ceph_put_cap_snap(capsnap);
+
+		spin_lock(&inode->i_lock);
+		goto retry;
+	}
+
+	/* we flushed them all; remove this inode from the queue */
+	spin_lock(&mdsc->snap_flush_lock);
+	list_del_init(&ci->i_snap_flush_item);
+	spin_unlock(&mdsc->snap_flush_lock);
+
+	if (psession)
+		*psession = session;
+	else if (session) {
+		mutex_unlock(&session->s_mutex);
+		ceph_put_mds_session(session);
+	}
+}
+
+static void ceph_flush_snaps(struct ceph_inode_info *ci)
+{
+	struct inode *inode = &ci->vfs_inode;
+
+	spin_lock(&inode->i_lock);
+	__ceph_flush_snaps(ci, NULL);
+	spin_unlock(&inode->i_lock);
+}
+
+/*
+ * Mark caps dirty.  If inode is newly dirty, add to the global dirty
+ * list.
+ */
+void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
+{
+	struct ceph_mds_client *mdsc =
+		&ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
+	struct inode *inode = &ci->vfs_inode;
+	int was = ci->i_dirty_caps;
+	int dirty = 0;
+
+	dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->vfs_inode,
+	     ceph_cap_string(mask), ceph_cap_string(was),
+	     ceph_cap_string(was | mask));
+	ci->i_dirty_caps |= mask;
+	if (was == 0) {
+		dout(" inode %p now dirty\n", &ci->vfs_inode);
+		BUG_ON(!list_empty(&ci->i_dirty_item));
+		spin_lock(&mdsc->cap_dirty_lock);
+		list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
+		spin_unlock(&mdsc->cap_dirty_lock);
+		if (ci->i_flushing_caps == 0) {
+			igrab(inode);
+			dirty |= I_DIRTY_SYNC;
+		}
+	}
+	BUG_ON(list_empty(&ci->i_dirty_item));
+	if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) &&
+	    (mask & CEPH_CAP_FILE_BUFFER))
+		dirty |= I_DIRTY_DATASYNC;
+	if (dirty)
+		__mark_inode_dirty(inode, dirty);
+	__cap_delay_requeue(mdsc, ci);
+}
+
+/*
+ * Add dirty inode to the flushing list.  Assigned a seq number so we
+ * can wait for caps to flush without starving.
+ *
+ * Called under i_lock.
+ */
+static int __mark_caps_flushing(struct inode *inode,
+				 struct ceph_mds_session *session)
+{
+	struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	int flushing;
+
+	BUG_ON(ci->i_dirty_caps == 0);
+	BUG_ON(list_empty(&ci->i_dirty_item));
+
+	flushing = ci->i_dirty_caps;
+	dout("__mark_caps_flushing flushing %s, flushing_caps %s -> %s\n",
+	     ceph_cap_string(flushing),
+	     ceph_cap_string(ci->i_flushing_caps),
+	     ceph_cap_string(ci->i_flushing_caps | flushing));
+	ci->i_flushing_caps |= flushing;
+	ci->i_dirty_caps = 0;
+	dout(" inode %p now !dirty\n", inode);
+
+	spin_lock(&mdsc->cap_dirty_lock);
+	list_del_init(&ci->i_dirty_item);
+
+	ci->i_cap_flush_seq = ++mdsc->cap_flush_seq;
+	if (list_empty(&ci->i_flushing_item)) {
+		list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing);
+		mdsc->num_cap_flushing++;
+		dout(" inode %p now flushing seq %lld\n", inode,
+		     ci->i_cap_flush_seq);
+	} else {
+		list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing);
+		dout(" inode %p now flushing (more) seq %lld\n", inode,
+		     ci->i_cap_flush_seq);
+	}
+	spin_unlock(&mdsc->cap_dirty_lock);
+
+	return flushing;
+}
+
+/*
+ * try to invalidate mapping pages without blocking.
+ */
+static int mapping_is_empty(struct address_space *mapping)
+{
+	struct page *page = find_get_page(mapping, 0);
+
+	if (!page)
+		return 1;
+
+	put_page(page);
+	return 0;
+}
+
+static int try_nonblocking_invalidate(struct inode *inode)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	u32 invalidating_gen = ci->i_rdcache_gen;
+
+	spin_unlock(&inode->i_lock);
+	invalidate_mapping_pages(&inode->i_data, 0, -1);
+	spin_lock(&inode->i_lock);
+
+	if (mapping_is_empty(&inode->i_data) &&
+	    invalidating_gen == ci->i_rdcache_gen) {
+		/* success. */
+		dout("try_nonblocking_invalidate %p success\n", inode);
+		ci->i_rdcache_gen = 0;
+		ci->i_rdcache_revoking = 0;
+		return 0;
+	}
+	dout("try_nonblocking_invalidate %p failed\n", inode);
+	return -1;
+}
+
+/*
+ * Swiss army knife function to examine currently used and wanted
+ * versus held caps.  Release, flush, ack revoked caps to mds as
+ * appropriate.
+ *
+ *  CHECK_CAPS_NODELAY - caller is delayed work and we should not delay
+ *    cap release further.
+ *  CHECK_CAPS_AUTHONLY - we should only check the auth cap
+ *  CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without
+ *    further delay.
+ */
+void ceph_check_caps(struct ceph_inode_info *ci, int flags,
+		     struct ceph_mds_session *session)
+	__releases(session->s_mutex)
+{
+	struct ceph_client *client = ceph_inode_to_client(&ci->vfs_inode);
+	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct inode *inode = &ci->vfs_inode;
+	struct ceph_cap *cap;
+	int file_wanted, used;
+	int took_snap_rwsem = 0;             /* true if mdsc->snap_rwsem held */
+	int issued, implemented, want, retain, revoking, flushing = 0;
+	int mds = -1;   /* keep track of how far we've gone through i_caps list
+			   to avoid an infinite loop on retry */
+	struct rb_node *p;
+	int tried_invalidate = 0;
+	int delayed = 0, sent = 0, force_requeue = 0, num;
+	int queue_invalidate = 0;
+	int is_delayed = flags & CHECK_CAPS_NODELAY;
+
+	/* if we are unmounting, flush any unused caps immediately. */
+	if (mdsc->stopping)
+		is_delayed = 1;
+
+	spin_lock(&inode->i_lock);
+
+	if (ci->i_ceph_flags & CEPH_I_FLUSH)
+		flags |= CHECK_CAPS_FLUSH;
+
+	/* flush snaps first time around only */
+	if (!list_empty(&ci->i_cap_snaps))
+		__ceph_flush_snaps(ci, &session);
+	goto retry_locked;
+retry:
+	spin_lock(&inode->i_lock);
+retry_locked:
+	file_wanted = __ceph_caps_file_wanted(ci);
+	used = __ceph_caps_used(ci);
+	want = file_wanted | used;
+	issued = __ceph_caps_issued(ci, &implemented);
+	revoking = implemented & ~issued;
+
+	retain = want | CEPH_CAP_PIN;
+	if (!mdsc->stopping && inode->i_nlink > 0) {
+		if (want) {
+			retain |= CEPH_CAP_ANY;       /* be greedy */
+		} else {
+			retain |= CEPH_CAP_ANY_SHARED;
+			/*
+			 * keep RD only if we didn't have the file open RW,
+			 * because then the mds would revoke it anyway to
+			 * journal max_size=0.
+			 */
+			if (ci->i_max_size == 0)
+				retain |= CEPH_CAP_ANY_RD;
+		}
+	}
+
+	dout("check_caps %p file_want %s used %s dirty %s flushing %s"
+	     " issued %s revoking %s retain %s %s%s%s\n", inode,
+	     ceph_cap_string(file_wanted),
+	     ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps),
+	     ceph_cap_string(ci->i_flushing_caps),
+	     ceph_cap_string(issued), ceph_cap_string(revoking),
+	     ceph_cap_string(retain),
+	     (flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "",
+	     (flags & CHECK_CAPS_NODELAY) ? " NODELAY" : "",
+	     (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "");
+
+	/*
+	 * If we no longer need to hold onto old our caps, and we may
+	 * have cached pages, but don't want them, then try to invalidate.
+	 * If we fail, it's because pages are locked.... try again later.
+	 */
+	if ((!is_delayed || mdsc->stopping) &&
+	    ci->i_wrbuffer_ref == 0 &&               /* no dirty pages... */
+	    ci->i_rdcache_gen &&                     /* may have cached pages */
+	    (file_wanted == 0 ||                     /* no open files */
+	     (revoking & CEPH_CAP_FILE_CACHE)) &&     /*  or revoking cache */
+	    !tried_invalidate) {
+		dout("check_caps trying to invalidate on %p\n", inode);
+		if (try_nonblocking_invalidate(inode) < 0) {
+			if (revoking & CEPH_CAP_FILE_CACHE) {
+				dout("check_caps queuing invalidate\n");
+				queue_invalidate = 1;
+				ci->i_rdcache_revoking = ci->i_rdcache_gen;
+			} else {
+				dout("check_caps failed to invalidate pages\n");
+				/* we failed to invalidate pages.  check these
+				   caps again later. */
+				force_requeue = 1;
+				__cap_set_timeouts(mdsc, ci);
+			}
+		}
+		tried_invalidate = 1;
+		goto retry_locked;
+	}
+
+	num = 0;
+	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
+		cap = rb_entry(p, struct ceph_cap, ci_node);
+		num++;
+
+		/* avoid looping forever */
+		if (mds >= cap->mds ||
+		    ((flags & CHECK_CAPS_AUTHONLY) && cap != ci->i_auth_cap))
+			continue;
+
+		/* NOTE: no side-effects allowed, until we take s_mutex */
+
+		revoking = cap->implemented & ~cap->issued;
+		if (revoking)
+			dout(" mds%d revoking %s\n", cap->mds,
+			     ceph_cap_string(revoking));
+
+		if (cap == ci->i_auth_cap &&
+		    (cap->issued & CEPH_CAP_FILE_WR)) {
+			/* request larger max_size from MDS? */
+			if (ci->i_wanted_max_size > ci->i_max_size &&
+			    ci->i_wanted_max_size > ci->i_requested_max_size) {
+				dout("requesting new max_size\n");
+				goto ack;
+			}
+
+			/* approaching file_max? */
+			if ((inode->i_size << 1) >= ci->i_max_size &&
+			    (ci->i_reported_size << 1) < ci->i_max_size) {
+				dout("i_size approaching max_size\n");
+				goto ack;
+			}
+		}
+		/* flush anything dirty? */
+		if (cap == ci->i_auth_cap && (flags & CHECK_CAPS_FLUSH) &&
+		    ci->i_dirty_caps) {
+			dout("flushing dirty caps\n");
+			goto ack;
+		}
+
+		/* completed revocation? going down and there are no caps? */
+		if (revoking && (revoking & used) == 0) {
+			dout("completed revocation of %s\n",
+			     ceph_cap_string(cap->implemented & ~cap->issued));
+			goto ack;
+		}
+
+		/* want more caps from mds? */
+		if (want & ~(cap->mds_wanted | cap->issued))
+			goto ack;
+
+		/* things we might delay */
+		if ((cap->issued & ~retain) == 0 &&
+		    cap->mds_wanted == want)
+			continue;     /* nope, all good */
+
+		if (is_delayed)
+			goto ack;
+
+		/* delay? */
+		if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
+		    time_before(jiffies, ci->i_hold_caps_max)) {
+			dout(" delaying issued %s -> %s, wanted %s -> %s\n",
+			     ceph_cap_string(cap->issued),
+			     ceph_cap_string(cap->issued & retain),
+			     ceph_cap_string(cap->mds_wanted),
+			     ceph_cap_string(want));
+			delayed++;
+			continue;
+		}
+
+ack:
+		if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
+			dout(" skipping %p I_NOFLUSH set\n", inode);
+			continue;
+		}
+
+		if (session && session != cap->session) {
+			dout("oops, wrong session %p mutex\n", session);
+			mutex_unlock(&session->s_mutex);
+			session = NULL;
+		}
+		if (!session) {
+			session = cap->session;
+			if (mutex_trylock(&session->s_mutex) == 0) {
+				dout("inverting session/ino locks on %p\n",
+				     session);
+				spin_unlock(&inode->i_lock);
+				if (took_snap_rwsem) {
+					up_read(&mdsc->snap_rwsem);
+					took_snap_rwsem = 0;
+				}
+				mutex_lock(&session->s_mutex);
+				goto retry;
+			}
+		}
+		/* take snap_rwsem after session mutex */
+		if (!took_snap_rwsem) {
+			if (down_read_trylock(&mdsc->snap_rwsem) == 0) {
+				dout("inverting snap/in locks on %p\n",
+				     inode);
+				spin_unlock(&inode->i_lock);
+				down_read(&mdsc->snap_rwsem);
+				took_snap_rwsem = 1;
+				goto retry;
+			}
+			took_snap_rwsem = 1;
+		}
+
+		if (cap == ci->i_auth_cap && ci->i_dirty_caps)
+			flushing = __mark_caps_flushing(inode, session);
+
+		mds = cap->mds;  /* remember mds, so we don't repeat */
+		sent++;
+
+		/* __send_cap drops i_lock */
+		delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, used, want,
+				      retain, flushing, NULL);
+		goto retry; /* retake i_lock and restart our cap scan. */
+	}
+
+	/*
+	 * Reschedule delayed caps release if we delayed anything,
+	 * otherwise cancel.
+	 */
+	if (delayed && is_delayed)
+		force_requeue = 1;   /* __send_cap delayed release; requeue */
+	if (!delayed && !is_delayed)
+		__cap_delay_cancel(mdsc, ci);
+	else if (!is_delayed || force_requeue)
+		__cap_delay_requeue(mdsc, ci);
+
+	spin_unlock(&inode->i_lock);
+
+	if (queue_invalidate)
+		ceph_queue_invalidate(inode);
+
+	if (session)
+		mutex_unlock(&session->s_mutex);
+	if (took_snap_rwsem)
+		up_read(&mdsc->snap_rwsem);
+}
+
+/*
+ * Try to flush dirty caps back to the auth mds.
+ */
+static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session,
+			  unsigned *flush_tid)
+{
+	struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	int unlock_session = session ? 0 : 1;
+	int flushing = 0;
+
+retry:
+	spin_lock(&inode->i_lock);
+	if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
+		dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode);
+		goto out;
+	}
+	if (ci->i_dirty_caps && ci->i_auth_cap) {
+		struct ceph_cap *cap = ci->i_auth_cap;
+		int used = __ceph_caps_used(ci);
+		int want = __ceph_caps_wanted(ci);
+		int delayed;
+
+		if (!session) {
+			spin_unlock(&inode->i_lock);
+			session = cap->session;
+			mutex_lock(&session->s_mutex);
+			goto retry;
+		}
+		BUG_ON(session != cap->session);
+		if (cap->session->s_state < CEPH_MDS_SESSION_OPEN)
+			goto out;
+
+		flushing = __mark_caps_flushing(inode, session);
+
+		/* __send_cap drops i_lock */
+		delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want,
+				     cap->issued | cap->implemented, flushing,
+				     flush_tid);
+		if (!delayed)
+			goto out_unlocked;
+
+		spin_lock(&inode->i_lock);
+		__cap_delay_requeue(mdsc, ci);
+	}
+out:
+	spin_unlock(&inode->i_lock);
+out_unlocked:
+	if (session && unlock_session)
+		mutex_unlock(&session->s_mutex);
+	return flushing;
+}
+
+/*
+ * Return true if we've flushed caps through the given flush_tid.
+ */
+static int caps_are_flushed(struct inode *inode, unsigned tid)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	int i, ret = 1;
+
+	spin_lock(&inode->i_lock);
+	for (i = 0; i < CEPH_CAP_BITS; i++)
+		if ((ci->i_flushing_caps & (1 << i)) &&
+		    ci->i_cap_flush_tid[i] <= tid) {
+			/* still flushing this bit */
+			ret = 0;
+			break;
+		}
+	spin_unlock(&inode->i_lock);
+	return ret;
+}
+
+/*
+ * Wait on any unsafe replies for the given inode.  First wait on the
+ * newest request, and make that the upper bound.  Then, if there are
+ * more requests, keep waiting on the oldest as long as it is still older
+ * than the original request.
+ */
+static void sync_write_wait(struct inode *inode)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct list_head *head = &ci->i_unsafe_writes;
+	struct ceph_osd_request *req;
+	u64 last_tid;
+
+	spin_lock(&ci->i_unsafe_lock);
+	if (list_empty(head))
+		goto out;
+
+	/* set upper bound as _last_ entry in chain */
+	req = list_entry(head->prev, struct ceph_osd_request,
+			 r_unsafe_item);
+	last_tid = req->r_tid;
+
+	do {
+		ceph_osdc_get_request(req);
+		spin_unlock(&ci->i_unsafe_lock);
+		dout("sync_write_wait on tid %llu (until %llu)\n",
+		     req->r_tid, last_tid);
+		wait_for_completion(&req->r_safe_completion);
+		spin_lock(&ci->i_unsafe_lock);
+		ceph_osdc_put_request(req);
+
+		/*
+		 * from here on look at first entry in chain, since we
+		 * only want to wait for anything older than last_tid
+		 */
+		if (list_empty(head))
+			break;
+		req = list_entry(head->next, struct ceph_osd_request,
+				 r_unsafe_item);
+	} while (req->r_tid < last_tid);
+out:
+	spin_unlock(&ci->i_unsafe_lock);
+}
+
+int ceph_fsync(struct file *file, int datasync)
+{
+	struct inode *inode = file->f_mapping->host;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	unsigned flush_tid;
+	int ret;
+	int dirty;
+
+	dout("fsync %p%s\n", inode, datasync ? " datasync" : "");
+	sync_write_wait(inode);
+
+	ret = filemap_write_and_wait(inode->i_mapping);
+	if (ret < 0)
+		return ret;
+
+	dirty = try_flush_caps(inode, NULL, &flush_tid);
+	dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
+
+	/*
+	 * only wait on non-file metadata writeback (the mds
+	 * can recover size and mtime, so we don't need to
+	 * wait for that)
+	 */
+	if (!datasync && (dirty & ~CEPH_CAP_ANY_FILE_WR)) {
+		dout("fsync waiting for flush_tid %u\n", flush_tid);
+		ret = wait_event_interruptible(ci->i_cap_wq,
+				       caps_are_flushed(inode, flush_tid));
+	}
+
+	dout("fsync %p%s done\n", inode, datasync ? " datasync" : "");
+	return ret;
+}
+
+/*
+ * Flush any dirty caps back to the mds.  If we aren't asked to wait,
+ * queue inode for flush but don't do so immediately, because we can
+ * get by with fewer MDS messages if we wait for data writeback to
+ * complete first.
+ */
+int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	unsigned flush_tid;
+	int err = 0;
+	int dirty;
+	int wait = wbc->sync_mode == WB_SYNC_ALL;
+
+	dout("write_inode %p wait=%d\n", inode, wait);
+	if (wait) {
+		dirty = try_flush_caps(inode, NULL, &flush_tid);
+		if (dirty)
+			err = wait_event_interruptible(ci->i_cap_wq,
+				       caps_are_flushed(inode, flush_tid));
+	} else {
+		struct ceph_mds_client *mdsc =
+			&ceph_sb_to_client(inode->i_sb)->mdsc;
+
+		spin_lock(&inode->i_lock);
+		if (__ceph_caps_dirty(ci))
+			__cap_delay_requeue_front(mdsc, ci);
+		spin_unlock(&inode->i_lock);
+	}
+	return err;
+}
+
+/*
+ * After a recovering MDS goes active, we need to resend any caps
+ * we were flushing.
+ *
+ * Caller holds session->s_mutex.
+ */
+static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc,
+				   struct ceph_mds_session *session)
+{
+	struct ceph_cap_snap *capsnap;
+
+	dout("kick_flushing_capsnaps mds%d\n", session->s_mds);
+	list_for_each_entry(capsnap, &session->s_cap_snaps_flushing,
+			    flushing_item) {
+		struct ceph_inode_info *ci = capsnap->ci;
+		struct inode *inode = &ci->vfs_inode;
+		struct ceph_cap *cap;
+
+		spin_lock(&inode->i_lock);
+		cap = ci->i_auth_cap;
+		if (cap && cap->session == session) {
+			dout("kick_flushing_caps %p cap %p capsnap %p\n", inode,
+			     cap, capsnap);
+			__ceph_flush_snaps(ci, &session);
+		} else {
+			pr_err("%p auth cap %p not mds%d ???\n", inode,
+			       cap, session->s_mds);
+		}
+		spin_unlock(&inode->i_lock);
+	}
+}
+
+void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
+			     struct ceph_mds_session *session)
+{
+	struct ceph_inode_info *ci;
+
+	kick_flushing_capsnaps(mdsc, session);
+
+	dout("kick_flushing_caps mds%d\n", session->s_mds);
+	list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
+		struct inode *inode = &ci->vfs_inode;
+		struct ceph_cap *cap;
+		int delayed = 0;
+
+		spin_lock(&inode->i_lock);
+		cap = ci->i_auth_cap;
+		if (cap && cap->session == session) {
+			dout("kick_flushing_caps %p cap %p %s\n", inode,
+			     cap, ceph_cap_string(ci->i_flushing_caps));
+			delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
+					     __ceph_caps_used(ci),
+					     __ceph_caps_wanted(ci),
+					     cap->issued | cap->implemented,
+					     ci->i_flushing_caps, NULL);
+			if (delayed) {
+				spin_lock(&inode->i_lock);
+				__cap_delay_requeue(mdsc, ci);
+				spin_unlock(&inode->i_lock);
+			}
+		} else {
+			pr_err("%p auth cap %p not mds%d ???\n", inode,
+			       cap, session->s_mds);
+			spin_unlock(&inode->i_lock);
+		}
+	}
+}
+
+
+/*
+ * Take references to capabilities we hold, so that we don't release
+ * them to the MDS prematurely.
+ *
+ * Protected by i_lock.
+ */
+static void __take_cap_refs(struct ceph_inode_info *ci, int got)
+{
+	if (got & CEPH_CAP_PIN)
+		ci->i_pin_ref++;
+	if (got & CEPH_CAP_FILE_RD)
+		ci->i_rd_ref++;
+	if (got & CEPH_CAP_FILE_CACHE)
+		ci->i_rdcache_ref++;
+	if (got & CEPH_CAP_FILE_WR)
+		ci->i_wr_ref++;
+	if (got & CEPH_CAP_FILE_BUFFER) {
+		if (ci->i_wrbuffer_ref == 0)
+			igrab(&ci->vfs_inode);
+		ci->i_wrbuffer_ref++;
+		dout("__take_cap_refs %p wrbuffer %d -> %d (?)\n",
+		     &ci->vfs_inode, ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref);
+	}
+}
+
+/*
+ * Try to grab cap references.  Specify those refs we @want, and the
+ * minimal set we @need.  Also include the larger offset we are writing
+ * to (when applicable), and check against max_size here as well.
+ * Note that caller is responsible for ensuring max_size increases are
+ * requested from the MDS.
+ */
+static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
+			    int *got, loff_t endoff, int *check_max, int *err)
+{
+	struct inode *inode = &ci->vfs_inode;
+	int ret = 0;
+	int have, implemented;
+	int file_wanted;
+
+	dout("get_cap_refs %p need %s want %s\n", inode,
+	     ceph_cap_string(need), ceph_cap_string(want));
+	spin_lock(&inode->i_lock);
+
+	/* make sure file is actually open */
+	file_wanted = __ceph_caps_file_wanted(ci);
+	if ((file_wanted & need) == 0) {
+		dout("try_get_cap_refs need %s file_wanted %s, EBADF\n",
+		     ceph_cap_string(need), ceph_cap_string(file_wanted));
+		*err = -EBADF;
+		ret = 1;
+		goto out;
+	}
+
+	if (need & CEPH_CAP_FILE_WR) {
+		if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) {
+			dout("get_cap_refs %p endoff %llu > maxsize %llu\n",
+			     inode, endoff, ci->i_max_size);
+			if (endoff > ci->i_wanted_max_size) {
+				*check_max = 1;
+				ret = 1;
+			}
+			goto out;
+		}
+		/*
+		 * If a sync write is in progress, we must wait, so that we
+		 * can get a final snapshot value for size+mtime.
+		 */
+		if (__ceph_have_pending_cap_snap(ci)) {
+			dout("get_cap_refs %p cap_snap_pending\n", inode);
+			goto out;
+		}
+	}
+	have = __ceph_caps_issued(ci, &implemented);
+
+	/*
+	 * disallow writes while a truncate is pending
+	 */
+	if (ci->i_truncate_pending)
+		have &= ~CEPH_CAP_FILE_WR;
+
+	if ((have & need) == need) {
+		/*
+		 * Look at (implemented & ~have & not) so that we keep waiting
+		 * on transition from wanted -> needed caps.  This is needed
+		 * for WRBUFFER|WR -> WR to avoid a new WR sync write from
+		 * going before a prior buffered writeback happens.
+		 */
+		int not = want & ~(have & need);
+		int revoking = implemented & ~have;
+		dout("get_cap_refs %p have %s but not %s (revoking %s)\n",
+		     inode, ceph_cap_string(have), ceph_cap_string(not),
+		     ceph_cap_string(revoking));
+		if ((revoking & not) == 0) {
+			*got = need | (have & want);
+			__take_cap_refs(ci, *got);
+			ret = 1;
+		}
+	} else {
+		dout("get_cap_refs %p have %s needed %s\n", inode,
+		     ceph_cap_string(have), ceph_cap_string(need));
+	}
+out:
+	spin_unlock(&inode->i_lock);
+	dout("get_cap_refs %p ret %d got %s\n", inode,
+	     ret, ceph_cap_string(*got));
+	return ret;
+}
+
+/*
+ * Check the offset we are writing up to against our current
+ * max_size.  If necessary, tell the MDS we want to write to
+ * a larger offset.
+ */
+static void check_max_size(struct inode *inode, loff_t endoff)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	int check = 0;
+
+	/* do we need to explicitly request a larger max_size? */
+	spin_lock(&inode->i_lock);
+	if ((endoff >= ci->i_max_size ||
+	     endoff > (inode->i_size << 1)) &&
+	    endoff > ci->i_wanted_max_size) {
+		dout("write %p at large endoff %llu, req max_size\n",
+		     inode, endoff);
+		ci->i_wanted_max_size = endoff;
+		check = 1;
+	}
+	spin_unlock(&inode->i_lock);
+	if (check)
+		ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
+}
+
+/*
+ * Wait for caps, and take cap references.  If we can't get a WR cap
+ * due to a small max_size, make sure we check_max_size (and possibly
+ * ask the mds) so we don't get hung up indefinitely.
+ */
+int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, int *got,
+		  loff_t endoff)
+{
+	int check_max, ret, err;
+
+retry:
+	if (endoff > 0)
+		check_max_size(&ci->vfs_inode, endoff);
+	check_max = 0;
+	err = 0;
+	ret = wait_event_interruptible(ci->i_cap_wq,
+				       try_get_cap_refs(ci, need, want,
+							got, endoff,
+							&check_max, &err));
+	if (err)
+		ret = err;
+	if (check_max)
+		goto retry;
+	return ret;
+}
+
+/*
+ * Take cap refs.  Caller must already know we hold at least one ref
+ * on the caps in question or we don't know this is safe.
+ */
+void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps)
+{
+	spin_lock(&ci->vfs_inode.i_lock);
+	__take_cap_refs(ci, caps);
+	spin_unlock(&ci->vfs_inode.i_lock);
+}
+
+/*
+ * Release cap refs.
+ *
+ * If we released the last ref on any given cap, call ceph_check_caps
+ * to release (or schedule a release).
+ *
+ * If we are releasing a WR cap (from a sync write), finalize any affected
+ * cap_snap, and wake up any waiters.
+ */
+void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
+{
+	struct inode *inode = &ci->vfs_inode;
+	int last = 0, put = 0, flushsnaps = 0, wake = 0;
+	struct ceph_cap_snap *capsnap;
+
+	spin_lock(&inode->i_lock);
+	if (had & CEPH_CAP_PIN)
+		--ci->i_pin_ref;
+	if (had & CEPH_CAP_FILE_RD)
+		if (--ci->i_rd_ref == 0)
+			last++;
+	if (had & CEPH_CAP_FILE_CACHE)
+		if (--ci->i_rdcache_ref == 0)
+			last++;
+	if (had & CEPH_CAP_FILE_BUFFER) {
+		if (--ci->i_wrbuffer_ref == 0) {
+			last++;
+			put++;
+		}
+		dout("put_cap_refs %p wrbuffer %d -> %d (?)\n",
+		     inode, ci->i_wrbuffer_ref+1, ci->i_wrbuffer_ref);
+	}
+	if (had & CEPH_CAP_FILE_WR)
+		if (--ci->i_wr_ref == 0) {
+			last++;
+			if (!list_empty(&ci->i_cap_snaps)) {
+				capsnap = list_first_entry(&ci->i_cap_snaps,
+						     struct ceph_cap_snap,
+						     ci_item);
+				if (capsnap->writing) {
+					capsnap->writing = 0;
+					flushsnaps =
+						__ceph_finish_cap_snap(ci,
+								       capsnap);
+					wake = 1;
+				}
+			}
+		}
+	spin_unlock(&inode->i_lock);
+
+	dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had),
+	     last ? " last" : "", put ? " put" : "");
+
+	if (last && !flushsnaps)
+		ceph_check_caps(ci, 0, NULL);
+	else if (flushsnaps)
+		ceph_flush_snaps(ci);
+	if (wake)
+		wake_up_all(&ci->i_cap_wq);
+	if (put)
+		iput(inode);
+}
+
+/*
+ * Release @nr WRBUFFER refs on dirty pages for the given @snapc snap
+ * context.  Adjust per-snap dirty page accounting as appropriate.
+ * Once all dirty data for a cap_snap is flushed, flush snapped file
+ * metadata back to the MDS.  If we dropped the last ref, call
+ * ceph_check_caps.
+ */
+void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
+				struct ceph_snap_context *snapc)
+{
+	struct inode *inode = &ci->vfs_inode;
+	int last = 0;
+	int complete_capsnap = 0;
+	int drop_capsnap = 0;
+	int found = 0;
+	struct ceph_cap_snap *capsnap = NULL;
+
+	spin_lock(&inode->i_lock);
+	ci->i_wrbuffer_ref -= nr;
+	last = !ci->i_wrbuffer_ref;
+
+	if (ci->i_head_snapc == snapc) {
+		ci->i_wrbuffer_ref_head -= nr;
+		if (!ci->i_wrbuffer_ref_head) {
+			ceph_put_snap_context(ci->i_head_snapc);
+			ci->i_head_snapc = NULL;
+		}
+		dout("put_wrbuffer_cap_refs on %p head %d/%d -> %d/%d %s\n",
+		     inode,
+		     ci->i_wrbuffer_ref+nr, ci->i_wrbuffer_ref_head+nr,
+		     ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
+		     last ? " LAST" : "");
+	} else {
+		list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
+			if (capsnap->context == snapc) {
+				found = 1;
+				break;
+			}
+		}
+		BUG_ON(!found);
+		capsnap->dirty_pages -= nr;
+		if (capsnap->dirty_pages == 0) {
+			complete_capsnap = 1;
+			if (capsnap->dirty == 0)
+				/* cap writeback completed before we created
+				 * the cap_snap; no FLUSHSNAP is needed */
+				drop_capsnap = 1;
+		}
+		dout("put_wrbuffer_cap_refs on %p cap_snap %p "
+		     " snap %lld %d/%d -> %d/%d %s%s%s\n",
+		     inode, capsnap, capsnap->context->seq,
+		     ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr,
+		     ci->i_wrbuffer_ref, capsnap->dirty_pages,
+		     last ? " (wrbuffer last)" : "",
+		     complete_capsnap ? " (complete capsnap)" : "",
+		     drop_capsnap ? " (drop capsnap)" : "");
+		if (drop_capsnap) {
+			ceph_put_snap_context(capsnap->context);
+			list_del(&capsnap->ci_item);
+			list_del(&capsnap->flushing_item);
+			ceph_put_cap_snap(capsnap);
+		}
+	}
+
+	spin_unlock(&inode->i_lock);
+
+	if (last) {
+		ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
+		iput(inode);
+	} else if (complete_capsnap) {
+		ceph_flush_snaps(ci);
+		wake_up_all(&ci->i_cap_wq);
+	}
+	if (drop_capsnap)
+		iput(inode);
+}
+
+/*
+ * Handle a cap GRANT message from the MDS.  (Note that a GRANT may
+ * actually be a revocation if it specifies a smaller cap set.)
+ *
+ * caller holds s_mutex and i_lock, we drop both.
+ *
+ * return value:
+ *  0 - ok
+ *  1 - check_caps on auth cap only (writeback)
+ *  2 - check_caps (ack revoke)
+ */
+static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
+			     struct ceph_mds_session *session,
+			     struct ceph_cap *cap,
+			     struct ceph_buffer *xattr_buf)
+	__releases(inode->i_lock)
+	__releases(session->s_mutex)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	int mds = session->s_mds;
+	int seq = le32_to_cpu(grant->seq);
+	int newcaps = le32_to_cpu(grant->caps);
+	int issued, implemented, used, wanted, dirty;
+	u64 size = le64_to_cpu(grant->size);
+	u64 max_size = le64_to_cpu(grant->max_size);
+	struct timespec mtime, atime, ctime;
+	int check_caps = 0;
+	int wake = 0;
+	int writeback = 0;
+	int revoked_rdcache = 0;
+	int queue_invalidate = 0;
+
+	dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
+	     inode, cap, mds, seq, ceph_cap_string(newcaps));
+	dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
+		inode->i_size);
+
+	/*
+	 * If CACHE is being revoked, and we have no dirty buffers,
+	 * try to invalidate (once).  (If there are dirty buffers, we
+	 * will invalidate _after_ writeback.)
+	 */
+	if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
+	    !ci->i_wrbuffer_ref) {
+		if (try_nonblocking_invalidate(inode) == 0) {
+			revoked_rdcache = 1;
+		} else {
+			/* there were locked pages.. invalidate later
+			   in a separate thread. */
+			if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
+				queue_invalidate = 1;
+				ci->i_rdcache_revoking = ci->i_rdcache_gen;
+			}
+		}
+	}
+
+	/* side effects now are allowed */
+
+	issued = __ceph_caps_issued(ci, &implemented);
+	issued |= implemented | __ceph_caps_dirty(ci);
+
+	cap->cap_gen = session->s_cap_gen;
+
+	__check_cap_issue(ci, cap, newcaps);
+
+	if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
+		inode->i_mode = le32_to_cpu(grant->mode);
+		inode->i_uid = le32_to_cpu(grant->uid);
+		inode->i_gid = le32_to_cpu(grant->gid);
+		dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
+		     inode->i_uid, inode->i_gid);
+	}
+
+	if ((issued & CEPH_CAP_LINK_EXCL) == 0)
+		inode->i_nlink = le32_to_cpu(grant->nlink);
+
+	if ((issued & CEPH_CAP_XATTR_EXCL) == 0 && grant->xattr_len) {
+		int len = le32_to_cpu(grant->xattr_len);
+		u64 version = le64_to_cpu(grant->xattr_version);
+
+		if (version > ci->i_xattrs.version) {
+			dout(" got new xattrs v%llu on %p len %d\n",
+			     version, inode, len);
+			if (ci->i_xattrs.blob)
+				ceph_buffer_put(ci->i_xattrs.blob);
+			ci->i_xattrs.blob = ceph_buffer_get(xattr_buf);
+			ci->i_xattrs.version = version;
+		}
+	}
+
+	/* size/ctime/mtime/atime? */
+	ceph_fill_file_size(inode, issued,
+			    le32_to_cpu(grant->truncate_seq),
+			    le64_to_cpu(grant->truncate_size), size);
+	ceph_decode_timespec(&mtime, &grant->mtime);
+	ceph_decode_timespec(&atime, &grant->atime);
+	ceph_decode_timespec(&ctime, &grant->ctime);
+	ceph_fill_file_time(inode, issued,
+			    le32_to_cpu(grant->time_warp_seq), &ctime, &mtime,
+			    &atime);
+
+	/* max size increase? */
+	if (max_size != ci->i_max_size) {
+		dout("max_size %lld -> %llu\n", ci->i_max_size, max_size);
+		ci->i_max_size = max_size;
+		if (max_size >= ci->i_wanted_max_size) {
+			ci->i_wanted_max_size = 0;  /* reset */
+			ci->i_requested_max_size = 0;
+		}
+		wake = 1;
+	}
+
+	/* check cap bits */
+	wanted = __ceph_caps_wanted(ci);
+	used = __ceph_caps_used(ci);
+	dirty = __ceph_caps_dirty(ci);
+	dout(" my wanted = %s, used = %s, dirty %s\n",
+	     ceph_cap_string(wanted),
+	     ceph_cap_string(used),
+	     ceph_cap_string(dirty));
+	if (wanted != le32_to_cpu(grant->wanted)) {
+		dout("mds wanted %s -> %s\n",
+		     ceph_cap_string(le32_to_cpu(grant->wanted)),
+		     ceph_cap_string(wanted));
+		grant->wanted = cpu_to_le32(wanted);
+	}
+
+	cap->seq = seq;
+
+	/* file layout may have changed */
+	ci->i_layout = grant->layout;
+
+	/* revocation, grant, or no-op? */
+	if (cap->issued & ~newcaps) {
+		dout("revocation: %s -> %s\n", ceph_cap_string(cap->issued),
+		     ceph_cap_string(newcaps));
+		if ((used & ~newcaps) & CEPH_CAP_FILE_BUFFER)
+			writeback = 1; /* will delay ack */
+		else if (dirty & ~newcaps)
+			check_caps = 1;  /* initiate writeback in check_caps */
+		else if (((used & ~newcaps) & CEPH_CAP_FILE_CACHE) == 0 ||
+			   revoked_rdcache)
+			check_caps = 2;     /* send revoke ack in check_caps */
+		cap->issued = newcaps;
+		cap->implemented |= newcaps;
+	} else if (cap->issued == newcaps) {
+		dout("caps unchanged: %s -> %s\n",
+		     ceph_cap_string(cap->issued), ceph_cap_string(newcaps));
+	} else {
+		dout("grant: %s -> %s\n", ceph_cap_string(cap->issued),
+		     ceph_cap_string(newcaps));
+		cap->issued = newcaps;
+		cap->implemented |= newcaps; /* add bits only, to
+					      * avoid stepping on a
+					      * pending revocation */
+		wake = 1;
+	}
+	BUG_ON(cap->issued & ~cap->implemented);
+
+	spin_unlock(&inode->i_lock);
+	if (writeback)
+		/*
+		 * queue inode for writeback: we can't actually call
+		 * filemap_write_and_wait, etc. from message handler
+		 * context.
+		 */
+		ceph_queue_writeback(inode);
+	if (queue_invalidate)
+		ceph_queue_invalidate(inode);
+	if (wake)
+		wake_up_all(&ci->i_cap_wq);
+
+	if (check_caps == 1)
+		ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY,
+				session);
+	else if (check_caps == 2)
+		ceph_check_caps(ci, CHECK_CAPS_NODELAY, session);
+	else
+		mutex_unlock(&session->s_mutex);
+}
+
+/*
+ * Handle FLUSH_ACK from MDS, indicating that metadata we sent to the
+ * MDS has been safely committed.
+ */
+static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
+				 struct ceph_mds_caps *m,
+				 struct ceph_mds_session *session,
+				 struct ceph_cap *cap)
+	__releases(inode->i_lock)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
+	unsigned seq = le32_to_cpu(m->seq);
+	int dirty = le32_to_cpu(m->dirty);
+	int cleaned = 0;
+	int drop = 0;
+	int i;
+
+	for (i = 0; i < CEPH_CAP_BITS; i++)
+		if ((dirty & (1 << i)) &&
+		    flush_tid == ci->i_cap_flush_tid[i])
+			cleaned |= 1 << i;
+
+	dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s,"
+	     " flushing %s -> %s\n",
+	     inode, session->s_mds, seq, ceph_cap_string(dirty),
+	     ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps),
+	     ceph_cap_string(ci->i_flushing_caps & ~cleaned));
+
+	if (ci->i_flushing_caps == (ci->i_flushing_caps & ~cleaned))
+		goto out;
+
+	ci->i_flushing_caps &= ~cleaned;
+
+	spin_lock(&mdsc->cap_dirty_lock);
+	if (ci->i_flushing_caps == 0) {
+		list_del_init(&ci->i_flushing_item);
+		if (!list_empty(&session->s_cap_flushing))
+			dout(" mds%d still flushing cap on %p\n",
+			     session->s_mds,
+			     &list_entry(session->s_cap_flushing.next,
+					 struct ceph_inode_info,
+					 i_flushing_item)->vfs_inode);
+		mdsc->num_cap_flushing--;
+		wake_up_all(&mdsc->cap_flushing_wq);
+		dout(" inode %p now !flushing\n", inode);
+
+		if (ci->i_dirty_caps == 0) {
+			dout(" inode %p now clean\n", inode);
+			BUG_ON(!list_empty(&ci->i_dirty_item));
+			drop = 1;
+		} else {
+			BUG_ON(list_empty(&ci->i_dirty_item));
+		}
+	}
+	spin_unlock(&mdsc->cap_dirty_lock);
+	wake_up_all(&ci->i_cap_wq);
+
+out:
+	spin_unlock(&inode->i_lock);
+	if (drop)
+		iput(inode);
+}
+
+/*
+ * Handle FLUSHSNAP_ACK.  MDS has flushed snap data to disk and we can
+ * throw away our cap_snap.
+ *
+ * Caller hold s_mutex.
+ */
+static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
+				     struct ceph_mds_caps *m,
+				     struct ceph_mds_session *session)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	u64 follows = le64_to_cpu(m->snap_follows);
+	struct ceph_cap_snap *capsnap;
+	int drop = 0;
+
+	dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n",
+	     inode, ci, session->s_mds, follows);
+
+	spin_lock(&inode->i_lock);
+	list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
+		if (capsnap->follows == follows) {
+			if (capsnap->flush_tid != flush_tid) {
+				dout(" cap_snap %p follows %lld tid %lld !="
+				     " %lld\n", capsnap, follows,
+				     flush_tid, capsnap->flush_tid);
+				break;
+			}
+			WARN_ON(capsnap->dirty_pages || capsnap->writing);
+			dout(" removing %p cap_snap %p follows %lld\n",
+			     inode, capsnap, follows);
+			ceph_put_snap_context(capsnap->context);
+			list_del(&capsnap->ci_item);
+			list_del(&capsnap->flushing_item);
+			ceph_put_cap_snap(capsnap);
+			drop = 1;
+			break;
+		} else {
+			dout(" skipping cap_snap %p follows %lld\n",
+			     capsnap, capsnap->follows);
+		}
+	}
+	spin_unlock(&inode->i_lock);
+	if (drop)
+		iput(inode);
+}
+
+/*
+ * Handle TRUNC from MDS, indicating file truncation.
+ *
+ * caller hold s_mutex.
+ */
+static void handle_cap_trunc(struct inode *inode,
+			     struct ceph_mds_caps *trunc,
+			     struct ceph_mds_session *session)
+	__releases(inode->i_lock)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	int mds = session->s_mds;
+	int seq = le32_to_cpu(trunc->seq);
+	u32 truncate_seq = le32_to_cpu(trunc->truncate_seq);
+	u64 truncate_size = le64_to_cpu(trunc->truncate_size);
+	u64 size = le64_to_cpu(trunc->size);
+	int implemented = 0;
+	int dirty = __ceph_caps_dirty(ci);
+	int issued = __ceph_caps_issued(ceph_inode(inode), &implemented);
+	int queue_trunc = 0;
+
+	issued |= implemented | dirty;
+
+	dout("handle_cap_trunc inode %p mds%d seq %d to %lld seq %d\n",
+	     inode, mds, seq, truncate_size, truncate_seq);
+	queue_trunc = ceph_fill_file_size(inode, issued,
+					  truncate_seq, truncate_size, size);
+	spin_unlock(&inode->i_lock);
+
+	if (queue_trunc)
+		ceph_queue_vmtruncate(inode);
+}
+
+/*
+ * Handle EXPORT from MDS.  Cap is being migrated _from_ this mds to a
+ * different one.  If we are the most recent migration we've seen (as
+ * indicated by mseq), make note of the migrating cap bits for the
+ * duration (until we see the corresponding IMPORT).
+ *
+ * caller holds s_mutex
+ */
+static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
+			      struct ceph_mds_session *session)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	int mds = session->s_mds;
+	unsigned mseq = le32_to_cpu(ex->migrate_seq);
+	struct ceph_cap *cap = NULL, *t;
+	struct rb_node *p;
+	int remember = 1;
+
+	dout("handle_cap_export inode %p ci %p mds%d mseq %d\n",
+	     inode, ci, mds, mseq);
+
+	spin_lock(&inode->i_lock);
+
+	/* make sure we haven't seen a higher mseq */
+	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
+		t = rb_entry(p, struct ceph_cap, ci_node);
+		if (ceph_seq_cmp(t->mseq, mseq) > 0) {
+			dout(" higher mseq on cap from mds%d\n",
+			     t->session->s_mds);
+			remember = 0;
+		}
+		if (t->session->s_mds == mds)
+			cap = t;
+	}
+
+	if (cap) {
+		if (remember) {
+			/* make note */
+			ci->i_cap_exporting_mds = mds;
+			ci->i_cap_exporting_mseq = mseq;
+			ci->i_cap_exporting_issued = cap->issued;
+		}
+		__ceph_remove_cap(cap);
+	}
+	/* else, we already released it */
+
+	spin_unlock(&inode->i_lock);
+}
+
+/*
+ * Handle cap IMPORT.  If there are temp bits from an older EXPORT,
+ * clean them up.
+ *
+ * caller holds s_mutex.
+ */
+static void handle_cap_import(struct ceph_mds_client *mdsc,
+			      struct inode *inode, struct ceph_mds_caps *im,
+			      struct ceph_mds_session *session,
+			      void *snaptrace, int snaptrace_len)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	int mds = session->s_mds;
+	unsigned issued = le32_to_cpu(im->caps);
+	unsigned wanted = le32_to_cpu(im->wanted);
+	unsigned seq = le32_to_cpu(im->seq);
+	unsigned mseq = le32_to_cpu(im->migrate_seq);
+	u64 realmino = le64_to_cpu(im->realm);
+	u64 cap_id = le64_to_cpu(im->cap_id);
+
+	if (ci->i_cap_exporting_mds >= 0 &&
+	    ceph_seq_cmp(ci->i_cap_exporting_mseq, mseq) < 0) {
+		dout("handle_cap_import inode %p ci %p mds%d mseq %d"
+		     " - cleared exporting from mds%d\n",
+		     inode, ci, mds, mseq,
+		     ci->i_cap_exporting_mds);
+		ci->i_cap_exporting_issued = 0;
+		ci->i_cap_exporting_mseq = 0;
+		ci->i_cap_exporting_mds = -1;
+	} else {
+		dout("handle_cap_import inode %p ci %p mds%d mseq %d\n",
+		     inode, ci, mds, mseq);
+	}
+
+	down_write(&mdsc->snap_rwsem);
+	ceph_update_snap_trace(mdsc, snaptrace, snaptrace+snaptrace_len,
+			       false);
+	downgrade_write(&mdsc->snap_rwsem);
+	ceph_add_cap(inode, session, cap_id, -1,
+		     issued, wanted, seq, mseq, realmino, CEPH_CAP_FLAG_AUTH,
+		     NULL /* no caps context */);
+	try_flush_caps(inode, session, NULL);
+	up_read(&mdsc->snap_rwsem);
+}
+
+/*
+ * Handle a caps message from the MDS.
+ *
+ * Identify the appropriate session, inode, and call the right handler
+ * based on the cap op.
+ */
+void ceph_handle_caps(struct ceph_mds_session *session,
+		      struct ceph_msg *msg)
+{
+	struct ceph_mds_client *mdsc = session->s_mdsc;
+	struct super_block *sb = mdsc->client->sb;
+	struct inode *inode;
+	struct ceph_cap *cap;
+	struct ceph_mds_caps *h;
+	int mds = session->s_mds;
+	int op;
+	u32 seq, mseq;
+	struct ceph_vino vino;
+	u64 cap_id;
+	u64 size, max_size;
+	u64 tid;
+	void *snaptrace;
+
+	dout("handle_caps from mds%d\n", mds);
+
+	/* decode */
+	tid = le64_to_cpu(msg->hdr.tid);
+	if (msg->front.iov_len < sizeof(*h))
+		goto bad;
+	h = msg->front.iov_base;
+	snaptrace = h + 1;
+	op = le32_to_cpu(h->op);
+	vino.ino = le64_to_cpu(h->ino);
+	vino.snap = CEPH_NOSNAP;
+	cap_id = le64_to_cpu(h->cap_id);
+	seq = le32_to_cpu(h->seq);
+	mseq = le32_to_cpu(h->migrate_seq);
+	size = le64_to_cpu(h->size);
+	max_size = le64_to_cpu(h->max_size);
+
+	mutex_lock(&session->s_mutex);
+	session->s_seq++;
+	dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
+	     (unsigned)seq);
+
+	/* lookup ino */
+	inode = ceph_find_inode(sb, vino);
+	dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino,
+	     vino.snap, inode);
+	if (!inode) {
+		dout(" i don't have ino %llx\n", vino.ino);
+
+		if (op == CEPH_CAP_OP_IMPORT)
+			__queue_cap_release(session, vino.ino, cap_id,
+					    mseq, seq);
+
+		/*
+		 * send any full release message to try to move things
+		 * along for the mds (who clearly thinks we still have this
+		 * cap).
+		 */
+		ceph_add_cap_releases(mdsc, session, -1);
+		ceph_send_cap_releases(mdsc, session);
+		goto done;
+	}
+
+	/* these will work even if we don't have a cap yet */
+	switch (op) {
+	case CEPH_CAP_OP_FLUSHSNAP_ACK:
+		handle_cap_flushsnap_ack(inode, tid, h, session);
+		goto done;
+
+	case CEPH_CAP_OP_EXPORT:
+		handle_cap_export(inode, h, session);
+		goto done;
+
+	case CEPH_CAP_OP_IMPORT:
+		handle_cap_import(mdsc, inode, h, session,
+				  snaptrace, le32_to_cpu(h->snap_trace_len));
+		ceph_check_caps(ceph_inode(inode), CHECK_CAPS_NODELAY,
+				session);
+		goto done_unlocked;
+	}
+
+	/* the rest require a cap */
+	spin_lock(&inode->i_lock);
+	cap = __get_cap_for_mds(ceph_inode(inode), mds);
+	if (!cap) {
+		dout(" no cap on %p ino %llx.%llx from mds%d\n",
+		     inode, ceph_ino(inode), ceph_snap(inode), mds);
+		spin_unlock(&inode->i_lock);
+		goto done;
+	}
+
+	/* note that each of these drops i_lock for us */
+	switch (op) {
+	case CEPH_CAP_OP_REVOKE:
+	case CEPH_CAP_OP_GRANT:
+		handle_cap_grant(inode, h, session, cap, msg->middle);
+		goto done_unlocked;
+
+	case CEPH_CAP_OP_FLUSH_ACK:
+		handle_cap_flush_ack(inode, tid, h, session, cap);
+		break;
+
+	case CEPH_CAP_OP_TRUNC:
+		handle_cap_trunc(inode, h, session);
+		break;
+
+	default:
+		spin_unlock(&inode->i_lock);
+		pr_err("ceph_handle_caps: unknown cap op %d %s\n", op,
+		       ceph_cap_op_name(op));
+	}
+
+done:
+	mutex_unlock(&session->s_mutex);
+done_unlocked:
+	if (inode)
+		iput(inode);
+	return;
+
+bad:
+	pr_err("ceph_handle_caps: corrupt message\n");
+	ceph_msg_dump(msg);
+	return;
+}
+
+/*
+ * Delayed work handler to process end of delayed cap release LRU list.
+ */
+void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
+{
+	struct ceph_inode_info *ci;
+	int flags = CHECK_CAPS_NODELAY;
+
+	dout("check_delayed_caps\n");
+	while (1) {
+		spin_lock(&mdsc->cap_delay_lock);
+		if (list_empty(&mdsc->cap_delay_list))
+			break;
+		ci = list_first_entry(&mdsc->cap_delay_list,
+				      struct ceph_inode_info,
+				      i_cap_delay_list);
+		if ((ci->i_ceph_flags & CEPH_I_FLUSH) == 0 &&
+		    time_before(jiffies, ci->i_hold_caps_max))
+			break;
+		list_del_init(&ci->i_cap_delay_list);
+		spin_unlock(&mdsc->cap_delay_lock);
+		dout("check_delayed_caps on %p\n", &ci->vfs_inode);
+		ceph_check_caps(ci, flags, NULL);
+	}
+	spin_unlock(&mdsc->cap_delay_lock);
+}
+
+/*
+ * Flush all dirty caps to the mds
+ */
+void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
+{
+	struct ceph_inode_info *ci, *nci = NULL;
+	struct inode *inode, *ninode = NULL;
+	struct list_head *p, *n;
+
+	dout("flush_dirty_caps\n");
+	spin_lock(&mdsc->cap_dirty_lock);
+	list_for_each_safe(p, n, &mdsc->cap_dirty) {
+		if (nci) {
+			ci = nci;
+			inode = ninode;
+			ci->i_ceph_flags &= ~CEPH_I_NOFLUSH;
+			dout("flush_dirty_caps inode %p (was next inode)\n",
+			     inode);
+		} else {
+			ci = list_entry(p, struct ceph_inode_info,
+					i_dirty_item);
+			inode = igrab(&ci->vfs_inode);
+			BUG_ON(!inode);
+			dout("flush_dirty_caps inode %p\n", inode);
+		}
+		if (n != &mdsc->cap_dirty) {
+			nci = list_entry(n, struct ceph_inode_info,
+					 i_dirty_item);
+			ninode = igrab(&nci->vfs_inode);
+			BUG_ON(!ninode);
+			nci->i_ceph_flags |= CEPH_I_NOFLUSH;
+			dout("flush_dirty_caps next inode %p, noflush\n",
+			     ninode);
+		} else {
+			nci = NULL;
+			ninode = NULL;
+		}
+		spin_unlock(&mdsc->cap_dirty_lock);
+		if (inode) {
+			ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH,
+					NULL);
+			iput(inode);
+		}
+		spin_lock(&mdsc->cap_dirty_lock);
+	}
+	spin_unlock(&mdsc->cap_dirty_lock);
+}
+
+/*
+ * Drop open file reference.  If we were the last open file,
+ * we may need to release capabilities to the MDS (or schedule
+ * their delayed release).
+ */
+void ceph_put_fmode(struct ceph_inode_info *ci, int fmode)
+{
+	struct inode *inode = &ci->vfs_inode;
+	int last = 0;
+
+	spin_lock(&inode->i_lock);
+	dout("put_fmode %p fmode %d %d -> %d\n", inode, fmode,
+	     ci->i_nr_by_mode[fmode], ci->i_nr_by_mode[fmode]-1);
+	BUG_ON(ci->i_nr_by_mode[fmode] == 0);
+	if (--ci->i_nr_by_mode[fmode] == 0)
+		last++;
+	spin_unlock(&inode->i_lock);
+
+	if (last && ci->i_vino.snap == CEPH_NOSNAP)
+		ceph_check_caps(ci, 0, NULL);
+}
+
+/*
+ * Helpers for embedding cap and dentry lease releases into mds
+ * requests.
+ *
+ * @force is used by dentry_release (below) to force inclusion of a
+ * record for the directory inode, even when there aren't any caps to
+ * drop.
+ */
+int ceph_encode_inode_release(void **p, struct inode *inode,
+			      int mds, int drop, int unless, int force)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_cap *cap;
+	struct ceph_mds_request_release *rel = *p;
+	int used, dirty;
+	int ret = 0;
+
+	spin_lock(&inode->i_lock);
+	used = __ceph_caps_used(ci);
+	dirty = __ceph_caps_dirty(ci);
+
+	dout("encode_inode_release %p mds%d used|dirty %s drop %s unless %s\n",
+	     inode, mds, ceph_cap_string(used|dirty), ceph_cap_string(drop),
+	     ceph_cap_string(unless));
+
+	/* only drop unused, clean caps */
+	drop &= ~(used | dirty);
+
+	cap = __get_cap_for_mds(ci, mds);
+	if (cap && __cap_is_valid(cap)) {
+		if (force ||
+		    ((cap->issued & drop) &&
+		     (cap->issued & unless) == 0)) {
+			if ((cap->issued & drop) &&
+			    (cap->issued & unless) == 0) {
+				dout("encode_inode_release %p cap %p %s -> "
+				     "%s\n", inode, cap,
+				     ceph_cap_string(cap->issued),
+				     ceph_cap_string(cap->issued & ~drop));
+				cap->issued &= ~drop;
+				cap->implemented &= ~drop;
+				if (ci->i_ceph_flags & CEPH_I_NODELAY) {
+					int wanted = __ceph_caps_wanted(ci);
+					dout("  wanted %s -> %s (act %s)\n",
+					     ceph_cap_string(cap->mds_wanted),
+					     ceph_cap_string(cap->mds_wanted &
+							     ~wanted),
+					     ceph_cap_string(wanted));
+					cap->mds_wanted &= wanted;
+				}
+			} else {
+				dout("encode_inode_release %p cap %p %s"
+				     " (force)\n", inode, cap,
+				     ceph_cap_string(cap->issued));
+			}
+
+			rel->ino = cpu_to_le64(ceph_ino(inode));
+			rel->cap_id = cpu_to_le64(cap->cap_id);
+			rel->seq = cpu_to_le32(cap->seq);
+			rel->issue_seq = cpu_to_le32(cap->issue_seq),
+			rel->mseq = cpu_to_le32(cap->mseq);
+			rel->caps = cpu_to_le32(cap->issued);
+			rel->wanted = cpu_to_le32(cap->mds_wanted);
+			rel->dname_len = 0;
+			rel->dname_seq = 0;
+			*p += sizeof(*rel);
+			ret = 1;
+		} else {
+			dout("encode_inode_release %p cap %p %s\n",
+			     inode, cap, ceph_cap_string(cap->issued));
+		}
+	}
+	spin_unlock(&inode->i_lock);
+	return ret;
+}
+
+int ceph_encode_dentry_release(void **p, struct dentry *dentry,
+			       int mds, int drop, int unless)
+{
+	struct inode *dir = dentry->d_parent->d_inode;
+	struct ceph_mds_request_release *rel = *p;
+	struct ceph_dentry_info *di = ceph_dentry(dentry);
+	int force = 0;
+	int ret;
+
+	/*
+	 * force an record for the directory caps if we have a dentry lease.
+	 * this is racy (can't take i_lock and d_lock together), but it
+	 * doesn't have to be perfect; the mds will revoke anything we don't
+	 * release.
+	 */
+	spin_lock(&dentry->d_lock);
+	if (di->lease_session && di->lease_session->s_mds == mds)
+		force = 1;
+	spin_unlock(&dentry->d_lock);
+
+	ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force);
+
+	spin_lock(&dentry->d_lock);
+	if (ret && di->lease_session && di->lease_session->s_mds == mds) {
+		dout("encode_dentry_release %p mds%d seq %d\n",
+		     dentry, mds, (int)di->lease_seq);
+		rel->dname_len = cpu_to_le32(dentry->d_name.len);
+		memcpy(*p, dentry->d_name.name, dentry->d_name.len);
+		*p += dentry->d_name.len;
+		rel->dname_seq = cpu_to_le32(di->lease_seq);
+		__ceph_mdsc_drop_dentry_lease(dentry);
+	}
+	spin_unlock(&dentry->d_lock);
+	return ret;
+}
diff --git a/fs/ceph/ceph_debug.h b/fs/ceph/ceph_debug.h
new file mode 100644
index 000000000000..1818c2305610
--- /dev/null
+++ b/fs/ceph/ceph_debug.h
@@ -0,0 +1,37 @@
+#ifndef _FS_CEPH_DEBUG_H
+#define _FS_CEPH_DEBUG_H
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#ifdef CONFIG_CEPH_FS_PRETTYDEBUG
+
+/*
+ * wrap pr_debug to include a filename:lineno prefix on each line.
+ * this incurs some overhead (kernel size and execution time) due to
+ * the extra function call at each call site.
+ */
+
+# if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG)
+extern const char *ceph_file_part(const char *s, int len);
+#  define dout(fmt, ...)						\
+	pr_debug(" %12.12s:%-4d : " fmt,				\
+		 ceph_file_part(__FILE__, sizeof(__FILE__)),		\
+		 __LINE__, ##__VA_ARGS__)
+# else
+/* faux printk call just to see any compiler warnings. */
+#  define dout(fmt, ...)	do {				\
+		if (0)						\
+			printk(KERN_DEBUG fmt, ##__VA_ARGS__);	\
+	} while (0)
+# endif
+
+#else
+
+/*
+ * or, just wrap pr_debug
+ */
+# define dout(fmt, ...)	pr_debug(" " fmt, ##__VA_ARGS__)
+
+#endif
+
+#endif
diff --git a/fs/ceph/ceph_frag.c b/fs/ceph/ceph_frag.c
new file mode 100644
index 000000000000..ab6cf35c4091
--- /dev/null
+++ b/fs/ceph/ceph_frag.c
@@ -0,0 +1,21 @@
+/*
+ * Ceph 'frag' type
+ */
+#include "types.h"
+
+int ceph_frag_compare(__u32 a, __u32 b)
+{
+	unsigned va = ceph_frag_value(a);
+	unsigned vb = ceph_frag_value(b);
+	if (va < vb)
+		return -1;
+	if (va > vb)
+		return 1;
+	va = ceph_frag_bits(a);
+	vb = ceph_frag_bits(b);
+	if (va < vb)
+		return -1;
+	if (va > vb)
+		return 1;
+	return 0;
+}
diff --git a/fs/ceph/ceph_frag.h b/fs/ceph/ceph_frag.h
new file mode 100644
index 000000000000..793f50cb7c22
--- /dev/null
+++ b/fs/ceph/ceph_frag.h
@@ -0,0 +1,109 @@
+#ifndef _FS_CEPH_FRAG_H
+#define _FS_CEPH_FRAG_H
+
+/*
+ * "Frags" are a way to describe a subset of a 32-bit number space,
+ * using a mask and a value to match against that mask.  Any given frag
+ * (subset of the number space) can be partitioned into 2^n sub-frags.
+ *
+ * Frags are encoded into a 32-bit word:
+ *   8 upper bits = "bits"
+ *  24 lower bits = "value"
+ * (We could go to 5+27 bits, but who cares.)
+ *
+ * We use the _most_ significant bits of the 24 bit value.  This makes
+ * values logically sort.
+ *
+ * Unfortunately, because the "bits" field is still in the high bits, we
+ * can't sort encoded frags numerically.  However, it does allow you
+ * to feed encoded frags as values into frag_contains_value.
+ */
+static inline __u32 ceph_frag_make(__u32 b, __u32 v)
+{
+	return (b << 24) |
+		(v & (0xffffffu << (24-b)) & 0xffffffu);
+}
+static inline __u32 ceph_frag_bits(__u32 f)
+{
+	return f >> 24;
+}
+static inline __u32 ceph_frag_value(__u32 f)
+{
+	return f & 0xffffffu;
+}
+static inline __u32 ceph_frag_mask(__u32 f)
+{
+	return (0xffffffu << (24-ceph_frag_bits(f))) & 0xffffffu;
+}
+static inline __u32 ceph_frag_mask_shift(__u32 f)
+{
+	return 24 - ceph_frag_bits(f);
+}
+
+static inline int ceph_frag_contains_value(__u32 f, __u32 v)
+{
+	return (v & ceph_frag_mask(f)) == ceph_frag_value(f);
+}
+static inline int ceph_frag_contains_frag(__u32 f, __u32 sub)
+{
+	/* is sub as specific as us, and contained by us? */
+	return ceph_frag_bits(sub) >= ceph_frag_bits(f) &&
+	       (ceph_frag_value(sub) & ceph_frag_mask(f)) == ceph_frag_value(f);
+}
+
+static inline __u32 ceph_frag_parent(__u32 f)
+{
+	return ceph_frag_make(ceph_frag_bits(f) - 1,
+			 ceph_frag_value(f) & (ceph_frag_mask(f) << 1));
+}
+static inline int ceph_frag_is_left_child(__u32 f)
+{
+	return ceph_frag_bits(f) > 0 &&
+		(ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 0;
+}
+static inline int ceph_frag_is_right_child(__u32 f)
+{
+	return ceph_frag_bits(f) > 0 &&
+		(ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 1;
+}
+static inline __u32 ceph_frag_sibling(__u32 f)
+{
+	return ceph_frag_make(ceph_frag_bits(f),
+		      ceph_frag_value(f) ^ (0x1000000 >> ceph_frag_bits(f)));
+}
+static inline __u32 ceph_frag_left_child(__u32 f)
+{
+	return ceph_frag_make(ceph_frag_bits(f)+1, ceph_frag_value(f));
+}
+static inline __u32 ceph_frag_right_child(__u32 f)
+{
+	return ceph_frag_make(ceph_frag_bits(f)+1,
+	      ceph_frag_value(f) | (0x1000000 >> (1+ceph_frag_bits(f))));
+}
+static inline __u32 ceph_frag_make_child(__u32 f, int by, int i)
+{
+	int newbits = ceph_frag_bits(f) + by;
+	return ceph_frag_make(newbits,
+			 ceph_frag_value(f) | (i << (24 - newbits)));
+}
+static inline int ceph_frag_is_leftmost(__u32 f)
+{
+	return ceph_frag_value(f) == 0;
+}
+static inline int ceph_frag_is_rightmost(__u32 f)
+{
+	return ceph_frag_value(f) == ceph_frag_mask(f);
+}
+static inline __u32 ceph_frag_next(__u32 f)
+{
+	return ceph_frag_make(ceph_frag_bits(f),
+			 ceph_frag_value(f) + (0x1000000 >> ceph_frag_bits(f)));
+}
+
+/*
+ * comparator to sort frags logically, as when traversing the
+ * number space in ascending order...
+ */
+int ceph_frag_compare(__u32 a, __u32 b);
+
+#endif
diff --git a/fs/ceph/ceph_fs.c b/fs/ceph/ceph_fs.c
new file mode 100644
index 000000000000..79d76bc4303f
--- /dev/null
+++ b/fs/ceph/ceph_fs.c
@@ -0,0 +1,74 @@
+/*
+ * Some non-inline ceph helpers
+ */
+#include "types.h"
+
+/*
+ * return true if @layout appears to be valid
+ */
+int ceph_file_layout_is_valid(const struct ceph_file_layout *layout)
+{
+	__u32 su = le32_to_cpu(layout->fl_stripe_unit);
+	__u32 sc = le32_to_cpu(layout->fl_stripe_count);
+	__u32 os = le32_to_cpu(layout->fl_object_size);
+
+	/* stripe unit, object size must be non-zero, 64k increment */
+	if (!su || (su & (CEPH_MIN_STRIPE_UNIT-1)))
+		return 0;
+	if (!os || (os & (CEPH_MIN_STRIPE_UNIT-1)))
+		return 0;
+	/* object size must be a multiple of stripe unit */
+	if (os < su || os % su)
+		return 0;
+	/* stripe count must be non-zero */
+	if (!sc)
+		return 0;
+	return 1;
+}
+
+
+int ceph_flags_to_mode(int flags)
+{
+#ifdef O_DIRECTORY  /* fixme */
+	if ((flags & O_DIRECTORY) == O_DIRECTORY)
+		return CEPH_FILE_MODE_PIN;
+#endif
+#ifdef O_LAZY
+	if (flags & O_LAZY)
+		return CEPH_FILE_MODE_LAZY;
+#endif
+	if ((flags & O_APPEND) == O_APPEND)
+		flags |= O_WRONLY;
+
+	flags &= O_ACCMODE;
+	if ((flags & O_RDWR) == O_RDWR)
+		return CEPH_FILE_MODE_RDWR;
+	if ((flags & O_WRONLY) == O_WRONLY)
+		return CEPH_FILE_MODE_WR;
+	return CEPH_FILE_MODE_RD;
+}
+
+int ceph_caps_for_mode(int mode)
+{
+	switch (mode) {
+	case CEPH_FILE_MODE_PIN:
+		return CEPH_CAP_PIN;
+	case CEPH_FILE_MODE_RD:
+		return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
+			CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE;
+	case CEPH_FILE_MODE_RDWR:
+		return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
+			CEPH_CAP_FILE_EXCL |
+			CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE |
+			CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |
+			CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
+			CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
+	case CEPH_FILE_MODE_WR:
+		return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
+			CEPH_CAP_FILE_EXCL |
+			CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |
+			CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
+			CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
+	}
+	return 0;
+}
diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h
new file mode 100644
index 000000000000..2fa992eaf7da
--- /dev/null
+++ b/fs/ceph/ceph_fs.h
@@ -0,0 +1,705 @@
+/*
+ * ceph_fs.h - Ceph constants and data types to share between kernel and
+ * user space.
+ *
+ * Most types in this file are defined as little-endian, and are
+ * primarily intended to describe data structures that pass over the
+ * wire or that are stored on disk.
+ *
+ * LGPL2
+ */
+
+#ifndef _FS_CEPH_CEPH_FS_H
+#define _FS_CEPH_CEPH_FS_H
+
+#include "msgr.h"
+#include "rados.h"
+
+/*
+ * Ceph release version
+ */
+#define CEPH_VERSION_MAJOR 0
+#define CEPH_VERSION_MINOR 20
+#define CEPH_VERSION_PATCH 0
+
+#define _CEPH_STRINGIFY(x) #x
+#define CEPH_STRINGIFY(x) _CEPH_STRINGIFY(x)
+#define CEPH_MAKE_VERSION(x, y, z) CEPH_STRINGIFY(x) "." CEPH_STRINGIFY(y) \
+	"." CEPH_STRINGIFY(z)
+#define CEPH_VERSION CEPH_MAKE_VERSION(CEPH_VERSION_MAJOR, \
+				       CEPH_VERSION_MINOR, CEPH_VERSION_PATCH)
+
+/*
+ * subprotocol versions.  when specific messages types or high-level
+ * protocols change, bump the affected components.  we keep rev
+ * internal cluster protocols separately from the public,
+ * client-facing protocol.
+ */
+#define CEPH_OSD_PROTOCOL     8 /* cluster internal */
+#define CEPH_MDS_PROTOCOL    12 /* cluster internal */
+#define CEPH_MON_PROTOCOL     5 /* cluster internal */
+#define CEPH_OSDC_PROTOCOL   24 /* server/client */
+#define CEPH_MDSC_PROTOCOL   32 /* server/client */
+#define CEPH_MONC_PROTOCOL   15 /* server/client */
+
+
+#define CEPH_INO_ROOT  1
+#define CEPH_INO_CEPH  2        /* hidden .ceph dir */
+
+/* arbitrary limit on max # of monitors (cluster of 3 is typical) */
+#define CEPH_MAX_MON   31
+
+
+/*
+ * feature bits
+ */
+#define CEPH_FEATURE_UID        1
+#define CEPH_FEATURE_NOSRCADDR  2
+#define CEPH_FEATURE_FLOCK      4
+
+#define CEPH_FEATURE_SUPPORTED_MON  CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR
+#define CEPH_FEATURE_REQUIRED_MON   CEPH_FEATURE_UID
+#define CEPH_FEATURE_SUPPORTED_MDS  CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR|CEPH_FEATURE_FLOCK
+#define CEPH_FEATURE_REQUIRED_MDS   CEPH_FEATURE_UID
+#define CEPH_FEATURE_SUPPORTED_OSD  CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR
+#define CEPH_FEATURE_REQUIRED_OSD   CEPH_FEATURE_UID
+#define CEPH_FEATURE_SUPPORTED_CLIENT CEPH_FEATURE_NOSRCADDR
+#define CEPH_FEATURE_REQUIRED_CLIENT CEPH_FEATURE_NOSRCADDR
+
+
+/*
+ * ceph_file_layout - describe data layout for a file/inode
+ */
+struct ceph_file_layout {
+	/* file -> object mapping */
+	__le32 fl_stripe_unit;     /* stripe unit, in bytes.  must be multiple
+				      of page size. */
+	__le32 fl_stripe_count;    /* over this many objects */
+	__le32 fl_object_size;     /* until objects are this big, then move to
+				      new objects */
+	__le32 fl_cas_hash;        /* 0 = none; 1 = sha256 */
+
+	/* pg -> disk layout */
+	__le32 fl_object_stripe_unit;  /* for per-object parity, if any */
+
+	/* object -> pg layout */
+	__le32 fl_pg_preferred; /* preferred primary for pg (-1 for none) */
+	__le32 fl_pg_pool;      /* namespace, crush ruleset, rep level */
+} __attribute__ ((packed));
+
+#define CEPH_MIN_STRIPE_UNIT 65536
+
+int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
+
+
+/* crypto algorithms */
+#define CEPH_CRYPTO_NONE 0x0
+#define CEPH_CRYPTO_AES  0x1
+
+/* security/authentication protocols */
+#define CEPH_AUTH_UNKNOWN	0x0
+#define CEPH_AUTH_NONE	 	0x1
+#define CEPH_AUTH_CEPHX	 	0x2
+
+#define CEPH_AUTH_UID_DEFAULT ((__u64) -1)
+
+
+/*********************************************
+ * message layer
+ */
+
+/*
+ * message types
+ */
+
+/* misc */
+#define CEPH_MSG_SHUTDOWN               1
+#define CEPH_MSG_PING                   2
+
+/* client <-> monitor */
+#define CEPH_MSG_MON_MAP                4
+#define CEPH_MSG_MON_GET_MAP            5
+#define CEPH_MSG_STATFS                 13
+#define CEPH_MSG_STATFS_REPLY           14
+#define CEPH_MSG_MON_SUBSCRIBE          15
+#define CEPH_MSG_MON_SUBSCRIBE_ACK      16
+#define CEPH_MSG_AUTH			17
+#define CEPH_MSG_AUTH_REPLY		18
+
+/* client <-> mds */
+#define CEPH_MSG_MDS_MAP                21
+
+#define CEPH_MSG_CLIENT_SESSION         22
+#define CEPH_MSG_CLIENT_RECONNECT       23
+
+#define CEPH_MSG_CLIENT_REQUEST         24
+#define CEPH_MSG_CLIENT_REQUEST_FORWARD 25
+#define CEPH_MSG_CLIENT_REPLY           26
+#define CEPH_MSG_CLIENT_CAPS            0x310
+#define CEPH_MSG_CLIENT_LEASE           0x311
+#define CEPH_MSG_CLIENT_SNAP            0x312
+#define CEPH_MSG_CLIENT_CAPRELEASE      0x313
+
+/* pool ops */
+#define CEPH_MSG_POOLOP_REPLY           48
+#define CEPH_MSG_POOLOP                 49
+
+
+/* osd */
+#define CEPH_MSG_OSD_MAP          41
+#define CEPH_MSG_OSD_OP           42
+#define CEPH_MSG_OSD_OPREPLY      43
+
+/* pool operations */
+enum {
+  POOL_OP_CREATE			= 0x01,
+  POOL_OP_DELETE			= 0x02,
+  POOL_OP_AUID_CHANGE			= 0x03,
+  POOL_OP_CREATE_SNAP			= 0x11,
+  POOL_OP_DELETE_SNAP			= 0x12,
+  POOL_OP_CREATE_UNMANAGED_SNAP		= 0x21,
+  POOL_OP_DELETE_UNMANAGED_SNAP		= 0x22,
+};
+
+struct ceph_mon_request_header {
+	__le64 have_version;
+	__le16 session_mon;
+	__le64 session_mon_tid;
+} __attribute__ ((packed));
+
+struct ceph_mon_statfs {
+	struct ceph_mon_request_header monhdr;
+	struct ceph_fsid fsid;
+} __attribute__ ((packed));
+
+struct ceph_statfs {
+	__le64 kb, kb_used, kb_avail;
+	__le64 num_objects;
+} __attribute__ ((packed));
+
+struct ceph_mon_statfs_reply {
+	struct ceph_fsid fsid;
+	__le64 version;
+	struct ceph_statfs st;
+} __attribute__ ((packed));
+
+const char *ceph_pool_op_name(int op);
+
+struct ceph_mon_poolop {
+	struct ceph_mon_request_header monhdr;
+	struct ceph_fsid fsid;
+	__le32 pool;
+	__le32 op;
+	__le64 auid;
+	__le64 snapid;
+	__le32 name_len;
+} __attribute__ ((packed));
+
+struct ceph_mon_poolop_reply {
+	struct ceph_mon_request_header monhdr;
+	struct ceph_fsid fsid;
+	__le32 reply_code;
+	__le32 epoch;
+	char has_data;
+	char data[0];
+} __attribute__ ((packed));
+
+struct ceph_mon_unmanaged_snap {
+	__le64 snapid;
+} __attribute__ ((packed));
+
+struct ceph_osd_getmap {
+	struct ceph_mon_request_header monhdr;
+	struct ceph_fsid fsid;
+	__le32 start;
+} __attribute__ ((packed));
+
+struct ceph_mds_getmap {
+	struct ceph_mon_request_header monhdr;
+	struct ceph_fsid fsid;
+} __attribute__ ((packed));
+
+struct ceph_client_mount {
+	struct ceph_mon_request_header monhdr;
+} __attribute__ ((packed));
+
+struct ceph_mon_subscribe_item {
+	__le64 have_version;	__le64 have;
+	__u8 onetime;
+} __attribute__ ((packed));
+
+struct ceph_mon_subscribe_ack {
+	__le32 duration;         /* seconds */
+	struct ceph_fsid fsid;
+} __attribute__ ((packed));
+
+/*
+ * mds states
+ *   > 0 -> in
+ *  <= 0 -> out
+ */
+#define CEPH_MDS_STATE_DNE          0  /* down, does not exist. */
+#define CEPH_MDS_STATE_STOPPED     -1  /* down, once existed, but no subtrees.
+					  empty log. */
+#define CEPH_MDS_STATE_BOOT        -4  /* up, boot announcement. */
+#define CEPH_MDS_STATE_STANDBY     -5  /* up, idle.  waiting for assignment. */
+#define CEPH_MDS_STATE_CREATING    -6  /* up, creating MDS instance. */
+#define CEPH_MDS_STATE_STARTING    -7  /* up, starting previously stopped mds */
+#define CEPH_MDS_STATE_STANDBY_REPLAY -8 /* up, tailing active node's journal */
+
+#define CEPH_MDS_STATE_REPLAY       8  /* up, replaying journal. */
+#define CEPH_MDS_STATE_RESOLVE      9  /* up, disambiguating distributed
+					  operations (import, rename, etc.) */
+#define CEPH_MDS_STATE_RECONNECT    10 /* up, reconnect to clients */
+#define CEPH_MDS_STATE_REJOIN       11 /* up, rejoining distributed cache */
+#define CEPH_MDS_STATE_CLIENTREPLAY 12 /* up, replaying client operations */
+#define CEPH_MDS_STATE_ACTIVE       13 /* up, active */
+#define CEPH_MDS_STATE_STOPPING     14 /* up, but exporting metadata */
+
+extern const char *ceph_mds_state_name(int s);
+
+
+/*
+ * metadata lock types.
+ *  - these are bitmasks.. we can compose them
+ *  - they also define the lock ordering by the MDS
+ *  - a few of these are internal to the mds
+ */
+#define CEPH_LOCK_DVERSION    1
+#define CEPH_LOCK_DN          2
+#define CEPH_LOCK_ISNAP       16
+#define CEPH_LOCK_IVERSION    32    /* mds internal */
+#define CEPH_LOCK_IFILE       64
+#define CEPH_LOCK_IAUTH       128
+#define CEPH_LOCK_ILINK       256
+#define CEPH_LOCK_IDFT        512   /* dir frag tree */
+#define CEPH_LOCK_INEST       1024  /* mds internal */
+#define CEPH_LOCK_IXATTR      2048
+#define CEPH_LOCK_INO         8192  /* immutable inode bits; not a lock */
+
+/* client_session ops */
+enum {
+	CEPH_SESSION_REQUEST_OPEN,
+	CEPH_SESSION_OPEN,
+	CEPH_SESSION_REQUEST_CLOSE,
+	CEPH_SESSION_CLOSE,
+	CEPH_SESSION_REQUEST_RENEWCAPS,
+	CEPH_SESSION_RENEWCAPS,
+	CEPH_SESSION_STALE,
+	CEPH_SESSION_RECALL_STATE,
+};
+
+extern const char *ceph_session_op_name(int op);
+
+struct ceph_mds_session_head {
+	__le32 op;
+	__le64 seq;
+	struct ceph_timespec stamp;
+	__le32 max_caps, max_leases;
+} __attribute__ ((packed));
+
+/* client_request */
+/*
+ * metadata ops.
+ *  & 0x001000 -> write op
+ *  & 0x010000 -> follow symlink (e.g. stat(), not lstat()).
+ &  & 0x100000 -> use weird ino/path trace
+ */
+#define CEPH_MDS_OP_WRITE        0x001000
+enum {
+	CEPH_MDS_OP_LOOKUP     = 0x00100,
+	CEPH_MDS_OP_GETATTR    = 0x00101,
+	CEPH_MDS_OP_LOOKUPHASH = 0x00102,
+	CEPH_MDS_OP_LOOKUPPARENT = 0x00103,
+
+	CEPH_MDS_OP_SETXATTR   = 0x01105,
+	CEPH_MDS_OP_RMXATTR    = 0x01106,
+	CEPH_MDS_OP_SETLAYOUT  = 0x01107,
+	CEPH_MDS_OP_SETATTR    = 0x01108,
+
+	CEPH_MDS_OP_MKNOD      = 0x01201,
+	CEPH_MDS_OP_LINK       = 0x01202,
+	CEPH_MDS_OP_UNLINK     = 0x01203,
+	CEPH_MDS_OP_RENAME     = 0x01204,
+	CEPH_MDS_OP_MKDIR      = 0x01220,
+	CEPH_MDS_OP_RMDIR      = 0x01221,
+	CEPH_MDS_OP_SYMLINK    = 0x01222,
+
+	CEPH_MDS_OP_CREATE     = 0x01301,
+	CEPH_MDS_OP_OPEN       = 0x00302,
+	CEPH_MDS_OP_READDIR    = 0x00305,
+
+	CEPH_MDS_OP_LOOKUPSNAP = 0x00400,
+	CEPH_MDS_OP_MKSNAP     = 0x01400,
+	CEPH_MDS_OP_RMSNAP     = 0x01401,
+	CEPH_MDS_OP_LSSNAP     = 0x00402,
+};
+
+extern const char *ceph_mds_op_name(int op);
+
+
+#define CEPH_SETATTR_MODE   1
+#define CEPH_SETATTR_UID    2
+#define CEPH_SETATTR_GID    4
+#define CEPH_SETATTR_MTIME  8
+#define CEPH_SETATTR_ATIME 16
+#define CEPH_SETATTR_SIZE  32
+#define CEPH_SETATTR_CTIME 64
+
+union ceph_mds_request_args {
+	struct {
+		__le32 mask;                 /* CEPH_CAP_* */
+	} __attribute__ ((packed)) getattr;
+	struct {
+		__le32 mode;
+		__le32 uid;
+		__le32 gid;
+		struct ceph_timespec mtime;
+		struct ceph_timespec atime;
+		__le64 size, old_size;       /* old_size needed by truncate */
+		__le32 mask;                 /* CEPH_SETATTR_* */
+	} __attribute__ ((packed)) setattr;
+	struct {
+		__le32 frag;                 /* which dir fragment */
+		__le32 max_entries;          /* how many dentries to grab */
+		__le32 max_bytes;
+	} __attribute__ ((packed)) readdir;
+	struct {
+		__le32 mode;
+		__le32 rdev;
+	} __attribute__ ((packed)) mknod;
+	struct {
+		__le32 mode;
+	} __attribute__ ((packed)) mkdir;
+	struct {
+		__le32 flags;
+		__le32 mode;
+		__le32 stripe_unit;          /* layout for newly created file */
+		__le32 stripe_count;         /* ... */
+		__le32 object_size;
+		__le32 file_replication;
+		__le32 preferred;
+	} __attribute__ ((packed)) open;
+	struct {
+		__le32 flags;
+	} __attribute__ ((packed)) setxattr;
+	struct {
+		struct ceph_file_layout layout;
+	} __attribute__ ((packed)) setlayout;
+} __attribute__ ((packed));
+
+#define CEPH_MDS_FLAG_REPLAY        1  /* this is a replayed op */
+#define CEPH_MDS_FLAG_WANT_DENTRY   2  /* want dentry in reply */
+
+struct ceph_mds_request_head {
+	__le64 oldest_client_tid;
+	__le32 mdsmap_epoch;           /* on client */
+	__le32 flags;                  /* CEPH_MDS_FLAG_* */
+	__u8 num_retry, num_fwd;       /* count retry, fwd attempts */
+	__le16 num_releases;           /* # include cap/lease release records */
+	__le32 op;                     /* mds op code */
+	__le32 caller_uid, caller_gid;
+	__le64 ino;                    /* use this ino for openc, mkdir, mknod,
+					  etc. (if replaying) */
+	union ceph_mds_request_args args;
+} __attribute__ ((packed));
+
+/* cap/lease release record */
+struct ceph_mds_request_release {
+	__le64 ino, cap_id;            /* ino and unique cap id */
+	__le32 caps, wanted;           /* new issued, wanted */
+	__le32 seq, issue_seq, mseq;
+	__le32 dname_seq;              /* if releasing a dentry lease, a */
+	__le32 dname_len;              /* string follows. */
+} __attribute__ ((packed));
+
+/* client reply */
+struct ceph_mds_reply_head {
+	__le32 op;
+	__le32 result;
+	__le32 mdsmap_epoch;
+	__u8 safe;                     /* true if committed to disk */
+	__u8 is_dentry, is_target;     /* true if dentry, target inode records
+					  are included with reply */
+} __attribute__ ((packed));
+
+/* one for each node split */
+struct ceph_frag_tree_split {
+	__le32 frag;                   /* this frag splits... */
+	__le32 by;                     /* ...by this many bits */
+} __attribute__ ((packed));
+
+struct ceph_frag_tree_head {
+	__le32 nsplits;                /* num ceph_frag_tree_split records */
+	struct ceph_frag_tree_split splits[];
+} __attribute__ ((packed));
+
+/* capability issue, for bundling with mds reply */
+struct ceph_mds_reply_cap {
+	__le32 caps, wanted;           /* caps issued, wanted */
+	__le64 cap_id;
+	__le32 seq, mseq;
+	__le64 realm;                  /* snap realm */
+	__u8 flags;                    /* CEPH_CAP_FLAG_* */
+} __attribute__ ((packed));
+
+#define CEPH_CAP_FLAG_AUTH  1          /* cap is issued by auth mds */
+
+/* inode record, for bundling with mds reply */
+struct ceph_mds_reply_inode {
+	__le64 ino;
+	__le64 snapid;
+	__le32 rdev;
+	__le64 version;                /* inode version */
+	__le64 xattr_version;          /* version for xattr blob */
+	struct ceph_mds_reply_cap cap; /* caps issued for this inode */
+	struct ceph_file_layout layout;
+	struct ceph_timespec ctime, mtime, atime;
+	__le32 time_warp_seq;
+	__le64 size, max_size, truncate_size;
+	__le32 truncate_seq;
+	__le32 mode, uid, gid;
+	__le32 nlink;
+	__le64 files, subdirs, rbytes, rfiles, rsubdirs;  /* dir stats */
+	struct ceph_timespec rctime;
+	struct ceph_frag_tree_head fragtree;  /* (must be at end of struct) */
+} __attribute__ ((packed));
+/* followed by frag array, then symlink string, then xattr blob */
+
+/* reply_lease follows dname, and reply_inode */
+struct ceph_mds_reply_lease {
+	__le16 mask;            /* lease type(s) */
+	__le32 duration_ms;     /* lease duration */
+	__le32 seq;
+} __attribute__ ((packed));
+
+struct ceph_mds_reply_dirfrag {
+	__le32 frag;            /* fragment */
+	__le32 auth;            /* auth mds, if this is a delegation point */
+	__le32 ndist;           /* number of mds' this is replicated on */
+	__le32 dist[];
+} __attribute__ ((packed));
+
+/* file access modes */
+#define CEPH_FILE_MODE_PIN        0
+#define CEPH_FILE_MODE_RD         1
+#define CEPH_FILE_MODE_WR         2
+#define CEPH_FILE_MODE_RDWR       3  /* RD | WR */
+#define CEPH_FILE_MODE_LAZY       4  /* lazy io */
+#define CEPH_FILE_MODE_NUM        8  /* bc these are bit fields.. mostly */
+
+int ceph_flags_to_mode(int flags);
+
+
+/* capability bits */
+#define CEPH_CAP_PIN         1  /* no specific capabilities beyond the pin */
+
+/* generic cap bits */
+#define CEPH_CAP_GSHARED     1  /* client can reads */
+#define CEPH_CAP_GEXCL       2  /* client can read and update */
+#define CEPH_CAP_GCACHE      4  /* (file) client can cache reads */
+#define CEPH_CAP_GRD         8  /* (file) client can read */
+#define CEPH_CAP_GWR        16  /* (file) client can write */
+#define CEPH_CAP_GBUFFER    32  /* (file) client can buffer writes */
+#define CEPH_CAP_GWREXTEND  64  /* (file) client can extend EOF */
+#define CEPH_CAP_GLAZYIO   128  /* (file) client can perform lazy io */
+
+/* per-lock shift */
+#define CEPH_CAP_SAUTH      2
+#define CEPH_CAP_SLINK      4
+#define CEPH_CAP_SXATTR     6
+#define CEPH_CAP_SFILE      8   /* goes at the end (uses >2 cap bits) */
+
+#define CEPH_CAP_BITS       16
+
+/* composed values */
+#define CEPH_CAP_AUTH_SHARED  (CEPH_CAP_GSHARED  << CEPH_CAP_SAUTH)
+#define CEPH_CAP_AUTH_EXCL     (CEPH_CAP_GEXCL     << CEPH_CAP_SAUTH)
+#define CEPH_CAP_LINK_SHARED  (CEPH_CAP_GSHARED  << CEPH_CAP_SLINK)
+#define CEPH_CAP_LINK_EXCL     (CEPH_CAP_GEXCL     << CEPH_CAP_SLINK)
+#define CEPH_CAP_XATTR_SHARED (CEPH_CAP_GSHARED  << CEPH_CAP_SXATTR)
+#define CEPH_CAP_XATTR_EXCL    (CEPH_CAP_GEXCL     << CEPH_CAP_SXATTR)
+#define CEPH_CAP_FILE(x)    (x << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_SHARED   (CEPH_CAP_GSHARED   << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_EXCL     (CEPH_CAP_GEXCL     << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_CACHE    (CEPH_CAP_GCACHE    << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_RD       (CEPH_CAP_GRD       << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_WR       (CEPH_CAP_GWR       << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_BUFFER   (CEPH_CAP_GBUFFER   << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_WREXTEND (CEPH_CAP_GWREXTEND << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_LAZYIO   (CEPH_CAP_GLAZYIO   << CEPH_CAP_SFILE)
+
+/* cap masks (for getattr) */
+#define CEPH_STAT_CAP_INODE    CEPH_CAP_PIN
+#define CEPH_STAT_CAP_TYPE     CEPH_CAP_PIN  /* mode >> 12 */
+#define CEPH_STAT_CAP_SYMLINK  CEPH_CAP_PIN
+#define CEPH_STAT_CAP_UID      CEPH_CAP_AUTH_SHARED
+#define CEPH_STAT_CAP_GID      CEPH_CAP_AUTH_SHARED
+#define CEPH_STAT_CAP_MODE     CEPH_CAP_AUTH_SHARED
+#define CEPH_STAT_CAP_NLINK    CEPH_CAP_LINK_SHARED
+#define CEPH_STAT_CAP_LAYOUT   CEPH_CAP_FILE_SHARED
+#define CEPH_STAT_CAP_MTIME    CEPH_CAP_FILE_SHARED
+#define CEPH_STAT_CAP_SIZE     CEPH_CAP_FILE_SHARED
+#define CEPH_STAT_CAP_ATIME    CEPH_CAP_FILE_SHARED  /* fixme */
+#define CEPH_STAT_CAP_XATTR    CEPH_CAP_XATTR_SHARED
+#define CEPH_STAT_CAP_INODE_ALL (CEPH_CAP_PIN |			\
+				 CEPH_CAP_AUTH_SHARED |	\
+				 CEPH_CAP_LINK_SHARED |	\
+				 CEPH_CAP_FILE_SHARED |	\
+				 CEPH_CAP_XATTR_SHARED)
+
+#define CEPH_CAP_ANY_SHARED (CEPH_CAP_AUTH_SHARED |			\
+			      CEPH_CAP_LINK_SHARED |			\
+			      CEPH_CAP_XATTR_SHARED |			\
+			      CEPH_CAP_FILE_SHARED)
+#define CEPH_CAP_ANY_RD   (CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_RD |	\
+			   CEPH_CAP_FILE_CACHE)
+
+#define CEPH_CAP_ANY_EXCL (CEPH_CAP_AUTH_EXCL |		\
+			   CEPH_CAP_LINK_EXCL |		\
+			   CEPH_CAP_XATTR_EXCL |	\
+			   CEPH_CAP_FILE_EXCL)
+#define CEPH_CAP_ANY_FILE_WR (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |	\
+			      CEPH_CAP_FILE_EXCL)
+#define CEPH_CAP_ANY_WR   (CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_FILE_WR)
+#define CEPH_CAP_ANY      (CEPH_CAP_ANY_RD | CEPH_CAP_ANY_EXCL | \
+			   CEPH_CAP_ANY_FILE_WR | CEPH_CAP_PIN)
+
+#define CEPH_CAP_LOCKS (CEPH_LOCK_IFILE | CEPH_LOCK_IAUTH | CEPH_LOCK_ILINK | \
+			CEPH_LOCK_IXATTR)
+
+int ceph_caps_for_mode(int mode);
+
+enum {
+	CEPH_CAP_OP_GRANT,         /* mds->client grant */
+	CEPH_CAP_OP_REVOKE,        /* mds->client revoke */
+	CEPH_CAP_OP_TRUNC,         /* mds->client trunc notify */
+	CEPH_CAP_OP_EXPORT,        /* mds has exported the cap */
+	CEPH_CAP_OP_IMPORT,        /* mds has imported the cap */
+	CEPH_CAP_OP_UPDATE,        /* client->mds update */
+	CEPH_CAP_OP_DROP,          /* client->mds drop cap bits */
+	CEPH_CAP_OP_FLUSH,         /* client->mds cap writeback */
+	CEPH_CAP_OP_FLUSH_ACK,     /* mds->client flushed */
+	CEPH_CAP_OP_FLUSHSNAP,     /* client->mds flush snapped metadata */
+	CEPH_CAP_OP_FLUSHSNAP_ACK, /* mds->client flushed snapped metadata */
+	CEPH_CAP_OP_RELEASE,       /* client->mds release (clean) cap */
+	CEPH_CAP_OP_RENEW,         /* client->mds renewal request */
+};
+
+extern const char *ceph_cap_op_name(int op);
+
+/*
+ * caps message, used for capability callbacks, acks, requests, etc.
+ */
+struct ceph_mds_caps {
+	__le32 op;                  /* CEPH_CAP_OP_* */
+	__le64 ino, realm;
+	__le64 cap_id;
+	__le32 seq, issue_seq;
+	__le32 caps, wanted, dirty; /* latest issued/wanted/dirty */
+	__le32 migrate_seq;
+	__le64 snap_follows;
+	__le32 snap_trace_len;
+
+	/* authlock */
+	__le32 uid, gid, mode;
+
+	/* linklock */
+	__le32 nlink;
+
+	/* xattrlock */
+	__le32 xattr_len;
+	__le64 xattr_version;
+
+	/* filelock */
+	__le64 size, max_size, truncate_size;
+	__le32 truncate_seq;
+	struct ceph_timespec mtime, atime, ctime;
+	struct ceph_file_layout layout;
+	__le32 time_warp_seq;
+} __attribute__ ((packed));
+
+/* cap release msg head */
+struct ceph_mds_cap_release {
+	__le32 num;                /* number of cap_items that follow */
+} __attribute__ ((packed));
+
+struct ceph_mds_cap_item {
+	__le64 ino;
+	__le64 cap_id;
+	__le32 migrate_seq, seq;
+} __attribute__ ((packed));
+
+#define CEPH_MDS_LEASE_REVOKE           1  /*    mds  -> client */
+#define CEPH_MDS_LEASE_RELEASE          2  /* client  -> mds    */
+#define CEPH_MDS_LEASE_RENEW            3  /* client <-> mds    */
+#define CEPH_MDS_LEASE_REVOKE_ACK       4  /* client  -> mds    */
+
+extern const char *ceph_lease_op_name(int o);
+
+/* lease msg header */
+struct ceph_mds_lease {
+	__u8 action;            /* CEPH_MDS_LEASE_* */
+	__le16 mask;            /* which lease */
+	__le64 ino;
+	__le64 first, last;     /* snap range */
+	__le32 seq;
+	__le32 duration_ms;     /* duration of renewal */
+} __attribute__ ((packed));
+/* followed by a __le32+string for dname */
+
+/* client reconnect */
+struct ceph_mds_cap_reconnect {
+	__le64 cap_id;
+	__le32 wanted;
+	__le32 issued;
+	__le64 size;
+	struct ceph_timespec mtime, atime;
+	__le64 snaprealm;
+	__le64 pathbase;        /* base ino for our path to this ino */
+} __attribute__ ((packed));
+/* followed by encoded string */
+
+struct ceph_mds_snaprealm_reconnect {
+	__le64 ino;     /* snap realm base */
+	__le64 seq;     /* snap seq for this snap realm */
+	__le64 parent;  /* parent realm */
+} __attribute__ ((packed));
+
+/*
+ * snaps
+ */
+enum {
+	CEPH_SNAP_OP_UPDATE,  /* CREATE or DESTROY */
+	CEPH_SNAP_OP_CREATE,
+	CEPH_SNAP_OP_DESTROY,
+	CEPH_SNAP_OP_SPLIT,
+};
+
+extern const char *ceph_snap_op_name(int o);
+
+/* snap msg header */
+struct ceph_mds_snap_head {
+	__le32 op;                /* CEPH_SNAP_OP_* */
+	__le64 split;             /* ino to split off, if any */
+	__le32 num_split_inos;    /* # inos belonging to new child realm */
+	__le32 num_split_realms;  /* # child realms udner new child realm */
+	__le32 trace_len;         /* size of snap trace blob */
+} __attribute__ ((packed));
+/* followed by split ino list, then split realms, then the trace blob */
+
+/*
+ * encode info about a snaprealm, as viewed by a client
+ */
+struct ceph_mds_snap_realm {
+	__le64 ino;           /* ino */
+	__le64 created;       /* snap: when created */
+	__le64 parent;        /* ino: parent realm */
+	__le64 parent_since;  /* snap: same parent since */
+	__le64 seq;           /* snap: version */
+	__le32 num_snaps;
+	__le32 num_prior_parent_snaps;
+} __attribute__ ((packed));
+/* followed by my snap list, then prior parent snap list */
+
+#endif
diff --git a/fs/ceph/ceph_hash.c b/fs/ceph/ceph_hash.c
new file mode 100644
index 000000000000..bd570015d147
--- /dev/null
+++ b/fs/ceph/ceph_hash.c
@@ -0,0 +1,118 @@
+
+#include "types.h"
+
+/*
+ * Robert Jenkin's hash function.
+ * http://burtleburtle.net/bob/hash/evahash.html
+ * This is in the public domain.
+ */
+#define mix(a, b, c)						\
+	do {							\
+		a = a - b;  a = a - c;  a = a ^ (c >> 13);	\
+		b = b - c;  b = b - a;  b = b ^ (a << 8);	\
+		c = c - a;  c = c - b;  c = c ^ (b >> 13);	\
+		a = a - b;  a = a - c;  a = a ^ (c >> 12);	\
+		b = b - c;  b = b - a;  b = b ^ (a << 16);	\
+		c = c - a;  c = c - b;  c = c ^ (b >> 5);	\
+		a = a - b;  a = a - c;  a = a ^ (c >> 3);	\
+		b = b - c;  b = b - a;  b = b ^ (a << 10);	\
+		c = c - a;  c = c - b;  c = c ^ (b >> 15);	\
+	} while (0)
+
+unsigned ceph_str_hash_rjenkins(const char *str, unsigned length)
+{
+	const unsigned char *k = (const unsigned char *)str;
+	__u32 a, b, c;  /* the internal state */
+	__u32 len;      /* how many key bytes still need mixing */
+
+	/* Set up the internal state */
+	len = length;
+	a = 0x9e3779b9;      /* the golden ratio; an arbitrary value */
+	b = a;
+	c = 0;               /* variable initialization of internal state */
+
+	/* handle most of the key */
+	while (len >= 12) {
+		a = a + (k[0] + ((__u32)k[1] << 8) + ((__u32)k[2] << 16) +
+			 ((__u32)k[3] << 24));
+		b = b + (k[4] + ((__u32)k[5] << 8) + ((__u32)k[6] << 16) +
+			 ((__u32)k[7] << 24));
+		c = c + (k[8] + ((__u32)k[9] << 8) + ((__u32)k[10] << 16) +
+			 ((__u32)k[11] << 24));
+		mix(a, b, c);
+		k = k + 12;
+		len = len - 12;
+	}
+
+	/* handle the last 11 bytes */
+	c = c + length;
+	switch (len) {            /* all the case statements fall through */
+	case 11:
+		c = c + ((__u32)k[10] << 24);
+	case 10:
+		c = c + ((__u32)k[9] << 16);
+	case 9:
+		c = c + ((__u32)k[8] << 8);
+		/* the first byte of c is reserved for the length */
+	case 8:
+		b = b + ((__u32)k[7] << 24);
+	case 7:
+		b = b + ((__u32)k[6] << 16);
+	case 6:
+		b = b + ((__u32)k[5] << 8);
+	case 5:
+		b = b + k[4];
+	case 4:
+		a = a + ((__u32)k[3] << 24);
+	case 3:
+		a = a + ((__u32)k[2] << 16);
+	case 2:
+		a = a + ((__u32)k[1] << 8);
+	case 1:
+		a = a + k[0];
+		/* case 0: nothing left to add */
+	}
+	mix(a, b, c);
+
+	return c;
+}
+
+/*
+ * linux dcache hash
+ */
+unsigned ceph_str_hash_linux(const char *str, unsigned length)
+{
+	unsigned long hash = 0;
+	unsigned char c;
+
+	while (length--) {
+		c = *str++;
+		hash = (hash + (c << 4) + (c >> 4)) * 11;
+	}
+	return hash;
+}
+
+
+unsigned ceph_str_hash(int type, const char *s, unsigned len)
+{
+	switch (type) {
+	case CEPH_STR_HASH_LINUX:
+		return ceph_str_hash_linux(s, len);
+	case CEPH_STR_HASH_RJENKINS:
+		return ceph_str_hash_rjenkins(s, len);
+	default:
+		return -1;
+	}
+}
+
+const char *ceph_str_hash_name(int type)
+{
+	switch (type) {
+	case CEPH_STR_HASH_LINUX:
+		return "linux";
+	case CEPH_STR_HASH_RJENKINS:
+		return "rjenkins";
+	default:
+		return "unknown";
+	}
+}
diff --git a/fs/ceph/ceph_hash.h b/fs/ceph/ceph_hash.h
new file mode 100644
index 000000000000..5ac470c433c9
--- /dev/null
+++ b/fs/ceph/ceph_hash.h
@@ -0,0 +1,13 @@
+#ifndef _FS_CEPH_HASH_H
+#define _FS_CEPH_HASH_H
+
+#define CEPH_STR_HASH_LINUX      0x1  /* linux dcache hash */
+#define CEPH_STR_HASH_RJENKINS   0x2  /* robert jenkins' */
+
+extern unsigned ceph_str_hash_linux(const char *s, unsigned len);
+extern unsigned ceph_str_hash_rjenkins(const char *s, unsigned len);
+
+extern unsigned ceph_str_hash(int type, const char *s, unsigned len);
+extern const char *ceph_str_hash_name(int type);
+
+#endif
diff --git a/fs/ceph/ceph_strings.c b/fs/ceph/ceph_strings.c
new file mode 100644
index 000000000000..7503aee828ce
--- /dev/null
+++ b/fs/ceph/ceph_strings.c
@@ -0,0 +1,190 @@
+/*
+ * Ceph string constants
+ */
+#include "types.h"
+
+const char *ceph_entity_type_name(int type)
+{
+	switch (type) {
+	case CEPH_ENTITY_TYPE_MDS: return "mds";
+	case CEPH_ENTITY_TYPE_OSD: return "osd";
+	case CEPH_ENTITY_TYPE_MON: return "mon";
+	case CEPH_ENTITY_TYPE_CLIENT: return "client";
+	case CEPH_ENTITY_TYPE_AUTH: return "auth";
+	default: return "unknown";
+	}
+}
+
+const char *ceph_osd_op_name(int op)
+{
+	switch (op) {
+	case CEPH_OSD_OP_READ: return "read";
+	case CEPH_OSD_OP_STAT: return "stat";
+
+	case CEPH_OSD_OP_MASKTRUNC: return "masktrunc";
+
+	case CEPH_OSD_OP_WRITE: return "write";
+	case CEPH_OSD_OP_DELETE: return "delete";
+	case CEPH_OSD_OP_TRUNCATE: return "truncate";
+	case CEPH_OSD_OP_ZERO: return "zero";
+	case CEPH_OSD_OP_WRITEFULL: return "writefull";
+
+	case CEPH_OSD_OP_APPEND: return "append";
+	case CEPH_OSD_OP_STARTSYNC: return "startsync";
+	case CEPH_OSD_OP_SETTRUNC: return "settrunc";
+	case CEPH_OSD_OP_TRIMTRUNC: return "trimtrunc";
+
+	case CEPH_OSD_OP_TMAPUP: return "tmapup";
+	case CEPH_OSD_OP_TMAPGET: return "tmapget";
+	case CEPH_OSD_OP_TMAPPUT: return "tmapput";
+
+	case CEPH_OSD_OP_GETXATTR: return "getxattr";
+	case CEPH_OSD_OP_GETXATTRS: return "getxattrs";
+	case CEPH_OSD_OP_SETXATTR: return "setxattr";
+	case CEPH_OSD_OP_SETXATTRS: return "setxattrs";
+	case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs";
+	case CEPH_OSD_OP_RMXATTR: return "rmxattr";
+	case CEPH_OSD_OP_CMPXATTR: return "cmpxattr";
+
+	case CEPH_OSD_OP_PULL: return "pull";
+	case CEPH_OSD_OP_PUSH: return "push";
+	case CEPH_OSD_OP_BALANCEREADS: return "balance-reads";
+	case CEPH_OSD_OP_UNBALANCEREADS: return "unbalance-reads";
+	case CEPH_OSD_OP_SCRUB: return "scrub";
+
+	case CEPH_OSD_OP_WRLOCK: return "wrlock";
+	case CEPH_OSD_OP_WRUNLOCK: return "wrunlock";
+	case CEPH_OSD_OP_RDLOCK: return "rdlock";
+	case CEPH_OSD_OP_RDUNLOCK: return "rdunlock";
+	case CEPH_OSD_OP_UPLOCK: return "uplock";
+	case CEPH_OSD_OP_DNLOCK: return "dnlock";
+
+	case CEPH_OSD_OP_CALL: return "call";
+
+	case CEPH_OSD_OP_PGLS: return "pgls";
+	}
+	return "???";
+}
+
+const char *ceph_mds_state_name(int s)
+{
+	switch (s) {
+		/* down and out */
+	case CEPH_MDS_STATE_DNE:        return "down:dne";
+	case CEPH_MDS_STATE_STOPPED:    return "down:stopped";
+		/* up and out */
+	case CEPH_MDS_STATE_BOOT:       return "up:boot";
+	case CEPH_MDS_STATE_STANDBY:    return "up:standby";
+	case CEPH_MDS_STATE_STANDBY_REPLAY:    return "up:standby-replay";
+	case CEPH_MDS_STATE_CREATING:   return "up:creating";
+	case CEPH_MDS_STATE_STARTING:   return "up:starting";
+		/* up and in */
+	case CEPH_MDS_STATE_REPLAY:     return "up:replay";
+	case CEPH_MDS_STATE_RESOLVE:    return "up:resolve";
+	case CEPH_MDS_STATE_RECONNECT:  return "up:reconnect";
+	case CEPH_MDS_STATE_REJOIN:     return "up:rejoin";
+	case CEPH_MDS_STATE_CLIENTREPLAY: return "up:clientreplay";
+	case CEPH_MDS_STATE_ACTIVE:     return "up:active";
+	case CEPH_MDS_STATE_STOPPING:   return "up:stopping";
+	}
+	return "???";
+}
+
+const char *ceph_session_op_name(int op)
+{
+	switch (op) {
+	case CEPH_SESSION_REQUEST_OPEN: return "request_open";
+	case CEPH_SESSION_OPEN: return "open";
+	case CEPH_SESSION_REQUEST_CLOSE: return "request_close";
+	case CEPH_SESSION_CLOSE: return "close";
+	case CEPH_SESSION_REQUEST_RENEWCAPS: return "request_renewcaps";
+	case CEPH_SESSION_RENEWCAPS: return "renewcaps";
+	case CEPH_SESSION_STALE: return "stale";
+	case CEPH_SESSION_RECALL_STATE: return "recall_state";
+	}
+	return "???";
+}
+
+const char *ceph_mds_op_name(int op)
+{
+	switch (op) {
+	case CEPH_MDS_OP_LOOKUP:  return "lookup";
+	case CEPH_MDS_OP_LOOKUPHASH:  return "lookuphash";
+	case CEPH_MDS_OP_LOOKUPPARENT:  return "lookupparent";
+	case CEPH_MDS_OP_GETATTR:  return "getattr";
+	case CEPH_MDS_OP_SETXATTR: return "setxattr";
+	case CEPH_MDS_OP_SETATTR: return "setattr";
+	case CEPH_MDS_OP_RMXATTR: return "rmxattr";
+	case CEPH_MDS_OP_READDIR: return "readdir";
+	case CEPH_MDS_OP_MKNOD: return "mknod";
+	case CEPH_MDS_OP_LINK: return "link";
+	case CEPH_MDS_OP_UNLINK: return "unlink";
+	case CEPH_MDS_OP_RENAME: return "rename";
+	case CEPH_MDS_OP_MKDIR: return "mkdir";
+	case CEPH_MDS_OP_RMDIR: return "rmdir";
+	case CEPH_MDS_OP_SYMLINK: return "symlink";
+	case CEPH_MDS_OP_CREATE: return "create";
+	case CEPH_MDS_OP_OPEN: return "open";
+	case CEPH_MDS_OP_LOOKUPSNAP: return "lookupsnap";
+	case CEPH_MDS_OP_LSSNAP: return "lssnap";
+	case CEPH_MDS_OP_MKSNAP: return "mksnap";
+	case CEPH_MDS_OP_RMSNAP: return "rmsnap";
+	}
+	return "???";
+}
+
+const char *ceph_cap_op_name(int op)
+{
+	switch (op) {
+	case CEPH_CAP_OP_GRANT: return "grant";
+	case CEPH_CAP_OP_REVOKE: return "revoke";
+	case CEPH_CAP_OP_TRUNC: return "trunc";
+	case CEPH_CAP_OP_EXPORT: return "export";
+	case CEPH_CAP_OP_IMPORT: return "import";
+	case CEPH_CAP_OP_UPDATE: return "update";
+	case CEPH_CAP_OP_DROP: return "drop";
+	case CEPH_CAP_OP_FLUSH: return "flush";
+	case CEPH_CAP_OP_FLUSH_ACK: return "flush_ack";
+	case CEPH_CAP_OP_FLUSHSNAP: return "flushsnap";
+	case CEPH_CAP_OP_FLUSHSNAP_ACK: return "flushsnap_ack";
+	case CEPH_CAP_OP_RELEASE: return "release";
+	case CEPH_CAP_OP_RENEW: return "renew";
+	}
+	return "???";
+}
+
+const char *ceph_lease_op_name(int o)
+{
+	switch (o) {
+	case CEPH_MDS_LEASE_REVOKE: return "revoke";
+	case CEPH_MDS_LEASE_RELEASE: return "release";
+	case CEPH_MDS_LEASE_RENEW: return "renew";
+	case CEPH_MDS_LEASE_REVOKE_ACK: return "revoke_ack";
+	}
+	return "???";
+}
+
+const char *ceph_snap_op_name(int o)
+{
+	switch (o) {
+	case CEPH_SNAP_OP_UPDATE: return "update";
+	case CEPH_SNAP_OP_CREATE: return "create";
+	case CEPH_SNAP_OP_DESTROY: return "destroy";
+	case CEPH_SNAP_OP_SPLIT: return "split";
+	}
+	return "???";
+}
+
+const char *ceph_pool_op_name(int op)
+{
+	switch (op) {
+	case POOL_OP_CREATE: return "create";
+	case POOL_OP_DELETE: return "delete";
+	case POOL_OP_AUID_CHANGE: return "auid change";
+	case POOL_OP_CREATE_SNAP: return "create snap";
+	case POOL_OP_DELETE_SNAP: return "delete snap";
+	case POOL_OP_CREATE_UNMANAGED_SNAP: return "create unmanaged snap";
+	case POOL_OP_DELETE_UNMANAGED_SNAP: return "delete unmanaged snap";
+	}
+	return "???";
+}
diff --git a/fs/ceph/crush/crush.c b/fs/ceph/crush/crush.c
new file mode 100644
index 000000000000..fabd302e5779
--- /dev/null
+++ b/fs/ceph/crush/crush.c
@@ -0,0 +1,151 @@
+
+#ifdef __KERNEL__
+# include <linux/slab.h>
+#else
+# include <stdlib.h>
+# include <assert.h>
+# define kfree(x) do { if (x) free(x); } while (0)
+# define BUG_ON(x) assert(!(x))
+#endif
+
+#include "crush.h"
+
+const char *crush_bucket_alg_name(int alg)
+{
+	switch (alg) {
+	case CRUSH_BUCKET_UNIFORM: return "uniform";
+	case CRUSH_BUCKET_LIST: return "list";
+	case CRUSH_BUCKET_TREE: return "tree";
+	case CRUSH_BUCKET_STRAW: return "straw";
+	default: return "unknown";
+	}
+}
+
+/**
+ * crush_get_bucket_item_weight - Get weight of an item in given bucket
+ * @b: bucket pointer
+ * @p: item index in bucket
+ */
+int crush_get_bucket_item_weight(struct crush_bucket *b, int p)
+{
+	if (p >= b->size)
+		return 0;
+
+	switch (b->alg) {
+	case CRUSH_BUCKET_UNIFORM:
+		return ((struct crush_bucket_uniform *)b)->item_weight;
+	case CRUSH_BUCKET_LIST:
+		return ((struct crush_bucket_list *)b)->item_weights[p];
+	case CRUSH_BUCKET_TREE:
+		if (p & 1)
+			return ((struct crush_bucket_tree *)b)->node_weights[p];
+		return 0;
+	case CRUSH_BUCKET_STRAW:
+		return ((struct crush_bucket_straw *)b)->item_weights[p];
+	}
+	return 0;
+}
+
+/**
+ * crush_calc_parents - Calculate parent vectors for the given crush map.
+ * @map: crush_map pointer
+ */
+void crush_calc_parents(struct crush_map *map)
+{
+	int i, b, c;
+
+	for (b = 0; b < map->max_buckets; b++) {
+		if (map->buckets[b] == NULL)
+			continue;
+		for (i = 0; i < map->buckets[b]->size; i++) {
+			c = map->buckets[b]->items[i];
+			BUG_ON(c >= map->max_devices ||
+			       c < -map->max_buckets);
+			if (c >= 0)
+				map->device_parents[c] = map->buckets[b]->id;
+			else
+				map->bucket_parents[-1-c] = map->buckets[b]->id;
+		}
+	}
+}
+
+void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b)
+{
+	kfree(b->h.perm);
+	kfree(b->h.items);
+	kfree(b);
+}
+
+void crush_destroy_bucket_list(struct crush_bucket_list *b)
+{
+	kfree(b->item_weights);
+	kfree(b->sum_weights);
+	kfree(b->h.perm);
+	kfree(b->h.items);
+	kfree(b);
+}
+
+void crush_destroy_bucket_tree(struct crush_bucket_tree *b)
+{
+	kfree(b->node_weights);
+	kfree(b);
+}
+
+void crush_destroy_bucket_straw(struct crush_bucket_straw *b)
+{
+	kfree(b->straws);
+	kfree(b->item_weights);
+	kfree(b->h.perm);
+	kfree(b->h.items);
+	kfree(b);
+}
+
+void crush_destroy_bucket(struct crush_bucket *b)
+{
+	switch (b->alg) {
+	case CRUSH_BUCKET_UNIFORM:
+		crush_destroy_bucket_uniform((struct crush_bucket_uniform *)b);
+		break;
+	case CRUSH_BUCKET_LIST:
+		crush_destroy_bucket_list((struct crush_bucket_list *)b);
+		break;
+	case CRUSH_BUCKET_TREE:
+		crush_destroy_bucket_tree((struct crush_bucket_tree *)b);
+		break;
+	case CRUSH_BUCKET_STRAW:
+		crush_destroy_bucket_straw((struct crush_bucket_straw *)b);
+		break;
+	}
+}
+
+/**
+ * crush_destroy - Destroy a crush_map
+ * @map: crush_map pointer
+ */
+void crush_destroy(struct crush_map *map)
+{
+	int b;
+
+	/* buckets */
+	if (map->buckets) {
+		for (b = 0; b < map->max_buckets; b++) {
+			if (map->buckets[b] == NULL)
+				continue;
+			crush_destroy_bucket(map->buckets[b]);
+		}
+		kfree(map->buckets);
+	}
+
+	/* rules */
+	if (map->rules) {
+		for (b = 0; b < map->max_rules; b++)
+			kfree(map->rules[b]);
+		kfree(map->rules);
+	}
+
+	kfree(map->bucket_parents);
+	kfree(map->device_parents);
+	kfree(map);
+}
+
+
diff --git a/fs/ceph/crush/crush.h b/fs/ceph/crush/crush.h
new file mode 100644
index 000000000000..dcd7e7523700
--- /dev/null
+++ b/fs/ceph/crush/crush.h
@@ -0,0 +1,180 @@
+#ifndef _CRUSH_CRUSH_H
+#define _CRUSH_CRUSH_H
+
+#include <linux/types.h>
+
+/*
+ * CRUSH is a pseudo-random data distribution algorithm that
+ * efficiently distributes input values (typically, data objects)
+ * across a heterogeneous, structured storage cluster.
+ *
+ * The algorithm was originally described in detail in this paper
+ * (although the algorithm has evolved somewhat since then):
+ *
+ *     http://www.ssrc.ucsc.edu/Papers/weil-sc06.pdf
+ *
+ * LGPL2
+ */
+
+
+#define CRUSH_MAGIC 0x00010000ul   /* for detecting algorithm revisions */
+
+
+#define CRUSH_MAX_DEPTH 10  /* max crush hierarchy depth */
+#define CRUSH_MAX_SET   10  /* max size of a mapping result */
+
+
+/*
+ * CRUSH uses user-defined "rules" to describe how inputs should be
+ * mapped to devices.  A rule consists of sequence of steps to perform
+ * to generate the set of output devices.
+ */
+struct crush_rule_step {
+	__u32 op;
+	__s32 arg1;
+	__s32 arg2;
+};
+
+/* step op codes */
+enum {
+	CRUSH_RULE_NOOP = 0,
+	CRUSH_RULE_TAKE = 1,          /* arg1 = value to start with */
+	CRUSH_RULE_CHOOSE_FIRSTN = 2, /* arg1 = num items to pick */
+				      /* arg2 = type */
+	CRUSH_RULE_CHOOSE_INDEP = 3,  /* same */
+	CRUSH_RULE_EMIT = 4,          /* no args */
+	CRUSH_RULE_CHOOSE_LEAF_FIRSTN = 6,
+	CRUSH_RULE_CHOOSE_LEAF_INDEP = 7,
+};
+
+/*
+ * for specifying choose num (arg1) relative to the max parameter
+ * passed to do_rule
+ */
+#define CRUSH_CHOOSE_N            0
+#define CRUSH_CHOOSE_N_MINUS(x)   (-(x))
+
+/*
+ * The rule mask is used to describe what the rule is intended for.
+ * Given a ruleset and size of output set, we search through the
+ * rule list for a matching rule_mask.
+ */
+struct crush_rule_mask {
+	__u8 ruleset;
+	__u8 type;
+	__u8 min_size;
+	__u8 max_size;
+};
+
+struct crush_rule {
+	__u32 len;
+	struct crush_rule_mask mask;
+	struct crush_rule_step steps[0];
+};
+
+#define crush_rule_size(len) (sizeof(struct crush_rule) + \
+			      (len)*sizeof(struct crush_rule_step))
+
+
+
+/*
+ * A bucket is a named container of other items (either devices or
+ * other buckets).  Items within a bucket are chosen using one of a
+ * few different algorithms.  The table summarizes how the speed of
+ * each option measures up against mapping stability when items are
+ * added or removed.
+ *
+ *  Bucket Alg     Speed       Additions    Removals
+ *  ------------------------------------------------
+ *  uniform         O(1)       poor         poor
+ *  list            O(n)       optimal      poor
+ *  tree            O(log n)   good         good
+ *  straw           O(n)       optimal      optimal
+ */
+enum {
+	CRUSH_BUCKET_UNIFORM = 1,
+	CRUSH_BUCKET_LIST = 2,
+	CRUSH_BUCKET_TREE = 3,
+	CRUSH_BUCKET_STRAW = 4
+};
+extern const char *crush_bucket_alg_name(int alg);
+
+struct crush_bucket {
+	__s32 id;        /* this'll be negative */
+	__u16 type;      /* non-zero; type=0 is reserved for devices */
+	__u8 alg;        /* one of CRUSH_BUCKET_* */
+	__u8 hash;       /* which hash function to use, CRUSH_HASH_* */
+	__u32 weight;    /* 16-bit fixed point */
+	__u32 size;      /* num items */
+	__s32 *items;
+
+	/*
+	 * cached random permutation: used for uniform bucket and for
+	 * the linear search fallback for the other bucket types.
+	 */
+	__u32 perm_x;  /* @x for which *perm is defined */
+	__u32 perm_n;  /* num elements of *perm that are permuted/defined */
+	__u32 *perm;
+};
+
+struct crush_bucket_uniform {
+	struct crush_bucket h;
+	__u32 item_weight;  /* 16-bit fixed point; all items equally weighted */
+};
+
+struct crush_bucket_list {
+	struct crush_bucket h;
+	__u32 *item_weights;  /* 16-bit fixed point */
+	__u32 *sum_weights;   /* 16-bit fixed point.  element i is sum
+				 of weights 0..i, inclusive */
+};
+
+struct crush_bucket_tree {
+	struct crush_bucket h;  /* note: h.size is _tree_ size, not number of
+				   actual items */
+	__u8 num_nodes;
+	__u32 *node_weights;
+};
+
+struct crush_bucket_straw {
+	struct crush_bucket h;
+	__u32 *item_weights;   /* 16-bit fixed point */
+	__u32 *straws;         /* 16-bit fixed point */
+};
+
+
+
+/*
+ * CRUSH map includes all buckets, rules, etc.
+ */
+struct crush_map {
+	struct crush_bucket **buckets;
+	struct crush_rule **rules;
+
+	/*
+	 * Parent pointers to identify the parent bucket a device or
+	 * bucket in the hierarchy.  If an item appears more than
+	 * once, this is the _last_ time it appeared (where buckets
+	 * are processed in bucket id order, from -1 on down to
+	 * -max_buckets.
+	 */
+	__u32 *bucket_parents;
+	__u32 *device_parents;
+
+	__s32 max_buckets;
+	__u32 max_rules;
+	__s32 max_devices;
+};
+
+
+/* crush.c */
+extern int crush_get_bucket_item_weight(struct crush_bucket *b, int pos);
+extern void crush_calc_parents(struct crush_map *map);
+extern void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b);
+extern void crush_destroy_bucket_list(struct crush_bucket_list *b);
+extern void crush_destroy_bucket_tree(struct crush_bucket_tree *b);
+extern void crush_destroy_bucket_straw(struct crush_bucket_straw *b);
+extern void crush_destroy_bucket(struct crush_bucket *b);
+extern void crush_destroy(struct crush_map *map);
+
+#endif
diff --git a/fs/ceph/crush/hash.c b/fs/ceph/crush/hash.c
new file mode 100644
index 000000000000..5873aed694bf
--- /dev/null
+++ b/fs/ceph/crush/hash.c
@@ -0,0 +1,149 @@
+
+#include <linux/types.h>
+#include "hash.h"
+
+/*
+ * Robert Jenkins' function for mixing 32-bit values
+ * http://burtleburtle.net/bob/hash/evahash.html
+ * a, b = random bits, c = input and output
+ */
+#define crush_hashmix(a, b, c) do {			\
+		a = a-b;  a = a-c;  a = a^(c>>13);	\
+		b = b-c;  b = b-a;  b = b^(a<<8);	\
+		c = c-a;  c = c-b;  c = c^(b>>13);	\
+		a = a-b;  a = a-c;  a = a^(c>>12);	\
+		b = b-c;  b = b-a;  b = b^(a<<16);	\
+		c = c-a;  c = c-b;  c = c^(b>>5);	\
+		a = a-b;  a = a-c;  a = a^(c>>3);	\
+		b = b-c;  b = b-a;  b = b^(a<<10);	\
+		c = c-a;  c = c-b;  c = c^(b>>15);	\
+	} while (0)
+
+#define crush_hash_seed 1315423911
+
+static __u32 crush_hash32_rjenkins1(__u32 a)
+{
+	__u32 hash = crush_hash_seed ^ a;
+	__u32 b = a;
+	__u32 x = 231232;
+	__u32 y = 1232;
+	crush_hashmix(b, x, hash);
+	crush_hashmix(y, a, hash);
+	return hash;
+}
+
+static __u32 crush_hash32_rjenkins1_2(__u32 a, __u32 b)
+{
+	__u32 hash = crush_hash_seed ^ a ^ b;
+	__u32 x = 231232;
+	__u32 y = 1232;
+	crush_hashmix(a, b, hash);
+	crush_hashmix(x, a, hash);
+	crush_hashmix(b, y, hash);
+	return hash;
+}
+
+static __u32 crush_hash32_rjenkins1_3(__u32 a, __u32 b, __u32 c)
+{
+	__u32 hash = crush_hash_seed ^ a ^ b ^ c;
+	__u32 x = 231232;
+	__u32 y = 1232;
+	crush_hashmix(a, b, hash);
+	crush_hashmix(c, x, hash);
+	crush_hashmix(y, a, hash);
+	crush_hashmix(b, x, hash);
+	crush_hashmix(y, c, hash);
+	return hash;
+}
+
+static __u32 crush_hash32_rjenkins1_4(__u32 a, __u32 b, __u32 c, __u32 d)
+{
+	__u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d;
+	__u32 x = 231232;
+	__u32 y = 1232;
+	crush_hashmix(a, b, hash);
+	crush_hashmix(c, d, hash);
+	crush_hashmix(a, x, hash);
+	crush_hashmix(y, b, hash);
+	crush_hashmix(c, x, hash);
+	crush_hashmix(y, d, hash);
+	return hash;
+}
+
+static __u32 crush_hash32_rjenkins1_5(__u32 a, __u32 b, __u32 c, __u32 d,
+				      __u32 e)
+{
+	__u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d ^ e;
+	__u32 x = 231232;
+	__u32 y = 1232;
+	crush_hashmix(a, b, hash);
+	crush_hashmix(c, d, hash);
+	crush_hashmix(e, x, hash);
+	crush_hashmix(y, a, hash);
+	crush_hashmix(b, x, hash);
+	crush_hashmix(y, c, hash);
+	crush_hashmix(d, x, hash);
+	crush_hashmix(y, e, hash);
+	return hash;
+}
+
+
+__u32 crush_hash32(int type, __u32 a)
+{
+	switch (type) {
+	case CRUSH_HASH_RJENKINS1:
+		return crush_hash32_rjenkins1(a);
+	default:
+		return 0;
+	}
+}
+
+__u32 crush_hash32_2(int type, __u32 a, __u32 b)
+{
+	switch (type) {
+	case CRUSH_HASH_RJENKINS1:
+		return crush_hash32_rjenkins1_2(a, b);
+	default:
+		return 0;
+	}
+}
+
+__u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c)
+{
+	switch (type) {
+	case CRUSH_HASH_RJENKINS1:
+		return crush_hash32_rjenkins1_3(a, b, c);
+	default:
+		return 0;
+	}
+}
+
+__u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d)
+{
+	switch (type) {
+	case CRUSH_HASH_RJENKINS1:
+		return crush_hash32_rjenkins1_4(a, b, c, d);
+	default:
+		return 0;
+	}
+}
+
+__u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d, __u32 e)
+{
+	switch (type) {
+	case CRUSH_HASH_RJENKINS1:
+		return crush_hash32_rjenkins1_5(a, b, c, d, e);
+	default:
+		return 0;
+	}
+}
+
+const char *crush_hash_name(int type)
+{
+	switch (type) {
+	case CRUSH_HASH_RJENKINS1:
+		return "rjenkins1";
+	default:
+		return "unknown";
+	}
+}
diff --git a/fs/ceph/crush/hash.h b/fs/ceph/crush/hash.h
new file mode 100644
index 000000000000..ff48e110e4bb
--- /dev/null
+++ b/fs/ceph/crush/hash.h
@@ -0,0 +1,17 @@
+#ifndef _CRUSH_HASH_H
+#define _CRUSH_HASH_H
+
+#define CRUSH_HASH_RJENKINS1   0
+
+#define CRUSH_HASH_DEFAULT CRUSH_HASH_RJENKINS1
+
+extern const char *crush_hash_name(int type);
+
+extern __u32 crush_hash32(int type, __u32 a);
+extern __u32 crush_hash32_2(int type, __u32 a, __u32 b);
+extern __u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c);
+extern __u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d);
+extern __u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d,
+			    __u32 e);
+
+#endif
diff --git a/fs/ceph/crush/mapper.c b/fs/ceph/crush/mapper.c
new file mode 100644
index 000000000000..a4eec133258e
--- /dev/null
+++ b/fs/ceph/crush/mapper.c
@@ -0,0 +1,609 @@
+
+#ifdef __KERNEL__
+# include <linux/string.h>
+# include <linux/slab.h>
+# include <linux/bug.h>
+# include <linux/kernel.h>
+# ifndef dprintk
+#  define dprintk(args...)
+# endif
+#else
+# include <string.h>
+# include <stdio.h>
+# include <stdlib.h>
+# include <assert.h>
+# define BUG_ON(x) assert(!(x))
+# define dprintk(args...) /* printf(args) */
+# define kmalloc(x, f) malloc(x)
+# define kfree(x) free(x)
+#endif
+
+#include "crush.h"
+#include "hash.h"
+
+/*
+ * Implement the core CRUSH mapping algorithm.
+ */
+
+/**
+ * crush_find_rule - find a crush_rule id for a given ruleset, type, and size.
+ * @map: the crush_map
+ * @ruleset: the storage ruleset id (user defined)
+ * @type: storage ruleset type (user defined)
+ * @size: output set size
+ */
+int crush_find_rule(struct crush_map *map, int ruleset, int type, int size)
+{
+	int i;
+
+	for (i = 0; i < map->max_rules; i++) {
+		if (map->rules[i] &&
+		    map->rules[i]->mask.ruleset == ruleset &&
+		    map->rules[i]->mask.type == type &&
+		    map->rules[i]->mask.min_size <= size &&
+		    map->rules[i]->mask.max_size >= size)
+			return i;
+	}
+	return -1;
+}
+
+
+/*
+ * bucket choose methods
+ *
+ * For each bucket algorithm, we have a "choose" method that, given a
+ * crush input @x and replica position (usually, position in output set) @r,
+ * will produce an item in the bucket.
+ */
+
+/*
+ * Choose based on a random permutation of the bucket.
+ *
+ * We used to use some prime number arithmetic to do this, but it
+ * wasn't very random, and had some other bad behaviors.  Instead, we
+ * calculate an actual random permutation of the bucket members.
+ * Since this is expensive, we optimize for the r=0 case, which
+ * captures the vast majority of calls.
+ */
+static int bucket_perm_choose(struct crush_bucket *bucket,
+			      int x, int r)
+{
+	unsigned pr = r % bucket->size;
+	unsigned i, s;
+
+	/* start a new permutation if @x has changed */
+	if (bucket->perm_x != x || bucket->perm_n == 0) {
+		dprintk("bucket %d new x=%d\n", bucket->id, x);
+		bucket->perm_x = x;
+
+		/* optimize common r=0 case */
+		if (pr == 0) {
+			s = crush_hash32_3(bucket->hash, x, bucket->id, 0) %
+				bucket->size;
+			bucket->perm[0] = s;
+			bucket->perm_n = 0xffff;   /* magic value, see below */
+			goto out;
+		}
+
+		for (i = 0; i < bucket->size; i++)
+			bucket->perm[i] = i;
+		bucket->perm_n = 0;
+	} else if (bucket->perm_n == 0xffff) {
+		/* clean up after the r=0 case above */
+		for (i = 1; i < bucket->size; i++)
+			bucket->perm[i] = i;
+		bucket->perm[bucket->perm[0]] = 0;
+		bucket->perm_n = 1;
+	}
+
+	/* calculate permutation up to pr */
+	for (i = 0; i < bucket->perm_n; i++)
+		dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]);
+	while (bucket->perm_n <= pr) {
+		unsigned p = bucket->perm_n;
+		/* no point in swapping the final entry */
+		if (p < bucket->size - 1) {
+			i = crush_hash32_3(bucket->hash, x, bucket->id, p) %
+				(bucket->size - p);
+			if (i) {
+				unsigned t = bucket->perm[p + i];
+				bucket->perm[p + i] = bucket->perm[p];
+				bucket->perm[p] = t;
+			}
+			dprintk(" perm_choose swap %d with %d\n", p, p+i);
+		}
+		bucket->perm_n++;
+	}
+	for (i = 0; i < bucket->size; i++)
+		dprintk(" perm_choose  %d: %d\n", i, bucket->perm[i]);
+
+	s = bucket->perm[pr];
+out:
+	dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id,
+		bucket->size, x, r, pr, s);
+	return bucket->items[s];
+}
+
+/* uniform */
+static int bucket_uniform_choose(struct crush_bucket_uniform *bucket,
+				 int x, int r)
+{
+	return bucket_perm_choose(&bucket->h, x, r);
+}
+
+/* list */
+static int bucket_list_choose(struct crush_bucket_list *bucket,
+			      int x, int r)
+{
+	int i;
+
+	for (i = bucket->h.size-1; i >= 0; i--) {
+		__u64 w = crush_hash32_4(bucket->h.hash,x, bucket->h.items[i],
+					 r, bucket->h.id);
+		w &= 0xffff;
+		dprintk("list_choose i=%d x=%d r=%d item %d weight %x "
+			"sw %x rand %llx",
+			i, x, r, bucket->h.items[i], bucket->item_weights[i],
+			bucket->sum_weights[i], w);
+		w *= bucket->sum_weights[i];
+		w = w >> 16;
+		/*dprintk(" scaled %llx\n", w);*/
+		if (w < bucket->item_weights[i])
+			return bucket->h.items[i];
+	}
+
+	BUG_ON(1);
+	return 0;
+}
+
+
+/* (binary) tree */
+static int height(int n)
+{
+	int h = 0;
+	while ((n & 1) == 0) {
+		h++;
+		n = n >> 1;
+	}
+	return h;
+}
+
+static int left(int x)
+{
+	int h = height(x);
+	return x - (1 << (h-1));
+}
+
+static int right(int x)
+{
+	int h = height(x);
+	return x + (1 << (h-1));
+}
+
+static int terminal(int x)
+{
+	return x & 1;
+}
+
+static int bucket_tree_choose(struct crush_bucket_tree *bucket,
+			      int x, int r)
+{
+	int n, l;
+	__u32 w;
+	__u64 t;
+
+	/* start at root */
+	n = bucket->num_nodes >> 1;
+
+	while (!terminal(n)) {
+		/* pick point in [0, w) */
+		w = bucket->node_weights[n];
+		t = (__u64)crush_hash32_4(bucket->h.hash, x, n, r,
+					  bucket->h.id) * (__u64)w;
+		t = t >> 32;
+
+		/* descend to the left or right? */
+		l = left(n);
+		if (t < bucket->node_weights[l])
+			n = l;
+		else
+			n = right(n);
+	}
+
+	return bucket->h.items[n >> 1];
+}
+
+
+/* straw */
+
+static int bucket_straw_choose(struct crush_bucket_straw *bucket,
+			       int x, int r)
+{
+	int i;
+	int high = 0;
+	__u64 high_draw = 0;
+	__u64 draw;
+
+	for (i = 0; i < bucket->h.size; i++) {
+		draw = crush_hash32_3(bucket->h.hash, x, bucket->h.items[i], r);
+		draw &= 0xffff;
+		draw *= bucket->straws[i];
+		if (i == 0 || draw > high_draw) {
+			high = i;
+			high_draw = draw;
+		}
+	}
+	return bucket->h.items[high];
+}
+
+static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
+{
+	dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r);
+	switch (in->alg) {
+	case CRUSH_BUCKET_UNIFORM:
+		return bucket_uniform_choose((struct crush_bucket_uniform *)in,
+					  x, r);
+	case CRUSH_BUCKET_LIST:
+		return bucket_list_choose((struct crush_bucket_list *)in,
+					  x, r);
+	case CRUSH_BUCKET_TREE:
+		return bucket_tree_choose((struct crush_bucket_tree *)in,
+					  x, r);
+	case CRUSH_BUCKET_STRAW:
+		return bucket_straw_choose((struct crush_bucket_straw *)in,
+					   x, r);
+	default:
+		BUG_ON(1);
+		return in->items[0];
+	}
+}
+
+/*
+ * true if device is marked "out" (failed, fully offloaded)
+ * of the cluster
+ */
+static int is_out(struct crush_map *map, __u32 *weight, int item, int x)
+{
+	if (weight[item] >= 0x10000)
+		return 0;
+	if (weight[item] == 0)
+		return 1;
+	if ((crush_hash32_2(CRUSH_HASH_RJENKINS1, x, item) & 0xffff)
+	    < weight[item])
+		return 0;
+	return 1;
+}
+
+/**
+ * crush_choose - choose numrep distinct items of given type
+ * @map: the crush_map
+ * @bucket: the bucket we are choose an item from
+ * @x: crush input value
+ * @numrep: the number of items to choose
+ * @type: the type of item to choose
+ * @out: pointer to output vector
+ * @outpos: our position in that vector
+ * @firstn: true if choosing "first n" items, false if choosing "indep"
+ * @recurse_to_leaf: true if we want one device under each item of given type
+ * @out2: second output vector for leaf items (if @recurse_to_leaf)
+ */
+static int crush_choose(struct crush_map *map,
+			struct crush_bucket *bucket,
+			__u32 *weight,
+			int x, int numrep, int type,
+			int *out, int outpos,
+			int firstn, int recurse_to_leaf,
+			int *out2)
+{
+	int rep;
+	int ftotal, flocal;
+	int retry_descent, retry_bucket, skip_rep;
+	struct crush_bucket *in = bucket;
+	int r;
+	int i;
+	int item = 0;
+	int itemtype;
+	int collide, reject;
+	const int orig_tries = 5; /* attempts before we fall back to search */
+
+	dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "",
+		bucket->id, x, outpos, numrep);
+
+	for (rep = outpos; rep < numrep; rep++) {
+		/* keep trying until we get a non-out, non-colliding item */
+		ftotal = 0;
+		skip_rep = 0;
+		do {
+			retry_descent = 0;
+			in = bucket;               /* initial bucket */
+
+			/* choose through intervening buckets */
+			flocal = 0;
+			do {
+				collide = 0;
+				retry_bucket = 0;
+				r = rep;
+				if (in->alg == CRUSH_BUCKET_UNIFORM) {
+					/* be careful */
+					if (firstn || numrep >= in->size)
+						/* r' = r + f_total */
+						r += ftotal;
+					else if (in->size % numrep == 0)
+						/* r'=r+(n+1)*f_local */
+						r += (numrep+1) *
+							(flocal+ftotal);
+					else
+						/* r' = r + n*f_local */
+						r += numrep * (flocal+ftotal);
+				} else {
+					if (firstn)
+						/* r' = r + f_total */
+						r += ftotal;
+					else
+						/* r' = r + n*f_local */
+						r += numrep * (flocal+ftotal);
+				}
+
+				/* bucket choose */
+				if (in->size == 0) {
+					reject = 1;
+					goto reject;
+				}
+				if (flocal >= (in->size>>1) &&
+				    flocal > orig_tries)
+					item = bucket_perm_choose(in, x, r);
+				else
+					item = crush_bucket_choose(in, x, r);
+				BUG_ON(item >= map->max_devices);
+
+				/* desired type? */
+				if (item < 0)
+					itemtype = map->buckets[-1-item]->type;
+				else
+					itemtype = 0;
+				dprintk("  item %d type %d\n", item, itemtype);
+
+				/* keep going? */
+				if (itemtype != type) {
+					BUG_ON(item >= 0 ||
+					       (-1-item) >= map->max_buckets);
+					in = map->buckets[-1-item];
+					retry_bucket = 1;
+					continue;
+				}
+
+				/* collision? */
+				for (i = 0; i < outpos; i++) {
+					if (out[i] == item) {
+						collide = 1;
+						break;
+					}
+				}
+
+				reject = 0;
+				if (recurse_to_leaf) {
+					if (item < 0) {
+						if (crush_choose(map,
+							 map->buckets[-1-item],
+							 weight,
+							 x, outpos+1, 0,
+							 out2, outpos,
+							 firstn, 0,
+							 NULL) <= outpos)
+							/* didn't get leaf */
+							reject = 1;
+					} else {
+						/* we already have a leaf! */
+						out2[outpos] = item;
+					}
+				}
+
+				if (!reject) {
+					/* out? */
+					if (itemtype == 0)
+						reject = is_out(map, weight,
+								item, x);
+					else
+						reject = 0;
+				}
+
+reject:
+				if (reject || collide) {
+					ftotal++;
+					flocal++;
+
+					if (collide && flocal < 3)
+						/* retry locally a few times */
+						retry_bucket = 1;
+					else if (flocal < in->size + orig_tries)
+						/* exhaustive bucket search */
+						retry_bucket = 1;
+					else if (ftotal < 20)
+						/* then retry descent */
+						retry_descent = 1;
+					else
+						/* else give up */
+						skip_rep = 1;
+					dprintk("  reject %d  collide %d  "
+						"ftotal %d  flocal %d\n",
+						reject, collide, ftotal,
+						flocal);
+				}
+			} while (retry_bucket);
+		} while (retry_descent);
+
+		if (skip_rep) {
+			dprintk("skip rep\n");
+			continue;
+		}
+
+		dprintk("CHOOSE got %d\n", item);
+		out[outpos] = item;
+		outpos++;
+	}
+
+	dprintk("CHOOSE returns %d\n", outpos);
+	return outpos;
+}
+
+
+/**
+ * crush_do_rule - calculate a mapping with the given input and rule
+ * @map: the crush_map
+ * @ruleno: the rule id
+ * @x: hash input
+ * @result: pointer to result vector
+ * @result_max: maximum result size
+ * @force: force initial replica choice; -1 for none
+ */
+int crush_do_rule(struct crush_map *map,
+		  int ruleno, int x, int *result, int result_max,
+		  int force, __u32 *weight)
+{
+	int result_len;
+	int force_context[CRUSH_MAX_DEPTH];
+	int force_pos = -1;
+	int a[CRUSH_MAX_SET];
+	int b[CRUSH_MAX_SET];
+	int c[CRUSH_MAX_SET];
+	int recurse_to_leaf;
+	int *w;
+	int wsize = 0;
+	int *o;
+	int osize;
+	int *tmp;
+	struct crush_rule *rule;
+	int step;
+	int i, j;
+	int numrep;
+	int firstn;
+	int rc = -1;
+
+	BUG_ON(ruleno >= map->max_rules);
+
+	rule = map->rules[ruleno];
+	result_len = 0;
+	w = a;
+	o = b;
+
+	/*
+	 * determine hierarchical context of force, if any.  note
+	 * that this may or may not correspond to the specific types
+	 * referenced by the crush rule.
+	 */
+	if (force >= 0) {
+		if (force >= map->max_devices ||
+		    map->device_parents[force] == 0) {
+			/*dprintk("CRUSH: forcefed device dne\n");*/
+			rc = -1;  /* force fed device dne */
+			goto out;
+		}
+		if (!is_out(map, weight, force, x)) {
+			while (1) {
+				force_context[++force_pos] = force;
+				if (force >= 0)
+					force = map->device_parents[force];
+				else
+					force = map->bucket_parents[-1-force];
+				if (force == 0)
+					break;
+			}
+		}
+	}
+
+	for (step = 0; step < rule->len; step++) {
+		firstn = 0;
+		switch (rule->steps[step].op) {
+		case CRUSH_RULE_TAKE:
+			w[0] = rule->steps[step].arg1;
+			if (force_pos >= 0) {
+				BUG_ON(force_context[force_pos] != w[0]);
+				force_pos--;
+			}
+			wsize = 1;
+			break;
+
+		case CRUSH_RULE_CHOOSE_LEAF_FIRSTN:
+		case CRUSH_RULE_CHOOSE_FIRSTN:
+			firstn = 1;
+		case CRUSH_RULE_CHOOSE_LEAF_INDEP:
+		case CRUSH_RULE_CHOOSE_INDEP:
+			BUG_ON(wsize == 0);
+
+			recurse_to_leaf =
+				rule->steps[step].op ==
+				 CRUSH_RULE_CHOOSE_LEAF_FIRSTN ||
+				rule->steps[step].op ==
+				CRUSH_RULE_CHOOSE_LEAF_INDEP;
+
+			/* reset output */
+			osize = 0;
+
+			for (i = 0; i < wsize; i++) {
+				/*
+				 * see CRUSH_N, CRUSH_N_MINUS macros.
+				 * basically, numrep <= 0 means relative to
+				 * the provided result_max
+				 */
+				numrep = rule->steps[step].arg1;
+				if (numrep <= 0) {
+					numrep += result_max;
+					if (numrep <= 0)
+						continue;
+				}
+				j = 0;
+				if (osize == 0 && force_pos >= 0) {
+					/* skip any intermediate types */
+					while (force_pos &&
+					       force_context[force_pos] < 0 &&
+					       rule->steps[step].arg2 !=
+					       map->buckets[-1 -
+					       force_context[force_pos]]->type)
+						force_pos--;
+					o[osize] = force_context[force_pos];
+					if (recurse_to_leaf)
+						c[osize] = force_context[0];
+					j++;
+					force_pos--;
+				}
+				osize += crush_choose(map,
+						      map->buckets[-1-w[i]],
+						      weight,
+						      x, numrep,
+						      rule->steps[step].arg2,
+						      o+osize, j,
+						      firstn,
+						      recurse_to_leaf, c+osize);
+			}
+
+			if (recurse_to_leaf)
+				/* copy final _leaf_ values to output set */
+				memcpy(o, c, osize*sizeof(*o));
+
+			/* swap t and w arrays */
+			tmp = o;
+			o = w;
+			w = tmp;
+			wsize = osize;
+			break;
+
+
+		case CRUSH_RULE_EMIT:
+			for (i = 0; i < wsize && result_len < result_max; i++) {
+				result[result_len] = w[i];
+				result_len++;
+			}
+			wsize = 0;
+			break;
+
+		default:
+			BUG_ON(1);
+		}
+	}
+	rc = result_len;
+
+out:
+	return rc;
+}
+
+
diff --git a/fs/ceph/crush/mapper.h b/fs/ceph/crush/mapper.h
new file mode 100644
index 000000000000..98e90046fd9f
--- /dev/null
+++ b/fs/ceph/crush/mapper.h
@@ -0,0 +1,20 @@
+#ifndef _CRUSH_MAPPER_H
+#define _CRUSH_MAPPER_H
+
+/*
+ * CRUSH functions for find rules and then mapping an input to an
+ * output set.
+ *
+ * LGPL2
+ */
+
+#include "crush.h"
+
+extern int crush_find_rule(struct crush_map *map, int pool, int type, int size);
+extern int crush_do_rule(struct crush_map *map,
+			 int ruleno,
+			 int x, int *result, int result_max,
+			 int forcefeed,    /* -1 for none */
+			 __u32 *weights);
+
+#endif
diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c
new file mode 100644
index 000000000000..f704b3b62424
--- /dev/null
+++ b/fs/ceph/crypto.c
@@ -0,0 +1,409 @@
+
+#include "ceph_debug.h"
+
+#include <linux/err.h>
+#include <linux/scatterlist.h>
+#include <linux/slab.h>
+#include <crypto/hash.h>
+
+#include "crypto.h"
+#include "decode.h"
+
+int ceph_crypto_key_encode(struct ceph_crypto_key *key, void **p, void *end)
+{
+	if (*p + sizeof(u16) + sizeof(key->created) +
+	    sizeof(u16) + key->len > end)
+		return -ERANGE;
+	ceph_encode_16(p, key->type);
+	ceph_encode_copy(p, &key->created, sizeof(key->created));
+	ceph_encode_16(p, key->len);
+	ceph_encode_copy(p, key->key, key->len);
+	return 0;
+}
+
+int ceph_crypto_key_decode(struct ceph_crypto_key *key, void **p, void *end)
+{
+	ceph_decode_need(p, end, 2*sizeof(u16) + sizeof(key->created), bad);
+	key->type = ceph_decode_16(p);
+	ceph_decode_copy(p, &key->created, sizeof(key->created));
+	key->len = ceph_decode_16(p);
+	ceph_decode_need(p, end, key->len, bad);
+	key->key = kmalloc(key->len, GFP_NOFS);
+	if (!key->key)
+		return -ENOMEM;
+	ceph_decode_copy(p, key->key, key->len);
+	return 0;
+
+bad:
+	dout("failed to decode crypto key\n");
+	return -EINVAL;
+}
+
+int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *inkey)
+{
+	int inlen = strlen(inkey);
+	int blen = inlen * 3 / 4;
+	void *buf, *p;
+	int ret;
+
+	dout("crypto_key_unarmor %s\n", inkey);
+	buf = kmalloc(blen, GFP_NOFS);
+	if (!buf)
+		return -ENOMEM;
+	blen = ceph_unarmor(buf, inkey, inkey+inlen);
+	if (blen < 0) {
+		kfree(buf);
+		return blen;
+	}
+
+	p = buf;
+	ret = ceph_crypto_key_decode(key, &p, p + blen);
+	kfree(buf);
+	if (ret)
+		return ret;
+	dout("crypto_key_unarmor key %p type %d len %d\n", key,
+	     key->type, key->len);
+	return 0;
+}
+
+
+
+#define AES_KEY_SIZE 16
+
+static struct crypto_blkcipher *ceph_crypto_alloc_cipher(void)
+{
+	return crypto_alloc_blkcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC);
+}
+
+const u8 *aes_iv = "cephsageyudagreg";
+
+int ceph_aes_encrypt(const void *key, int key_len, void *dst, size_t *dst_len,
+		     const void *src, size_t src_len)
+{
+	struct scatterlist sg_in[2], sg_out[1];
+	struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
+	struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
+	int ret;
+	void *iv;
+	int ivsize;
+	size_t zero_padding = (0x10 - (src_len & 0x0f));
+	char pad[16];
+
+	if (IS_ERR(tfm))
+		return PTR_ERR(tfm);
+
+	memset(pad, zero_padding, zero_padding);
+
+	*dst_len = src_len + zero_padding;
+
+	crypto_blkcipher_setkey((void *)tfm, key, key_len);
+	sg_init_table(sg_in, 2);
+	sg_set_buf(&sg_in[0], src, src_len);
+	sg_set_buf(&sg_in[1], pad, zero_padding);
+	sg_init_table(sg_out, 1);
+	sg_set_buf(sg_out, dst, *dst_len);
+	iv = crypto_blkcipher_crt(tfm)->iv;
+	ivsize = crypto_blkcipher_ivsize(tfm);
+
+	memcpy(iv, aes_iv, ivsize);
+	/*
+	print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1,
+		       key, key_len, 1);
+	print_hex_dump(KERN_ERR, "enc src: ", DUMP_PREFIX_NONE, 16, 1,
+			src, src_len, 1);
+	print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1,
+			pad, zero_padding, 1);
+	*/
+	ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in,
+				     src_len + zero_padding);
+	crypto_free_blkcipher(tfm);
+	if (ret < 0)
+		pr_err("ceph_aes_crypt failed %d\n", ret);
+	/*
+	print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1,
+		       dst, *dst_len, 1);
+	*/
+	return 0;
+}
+
+int ceph_aes_encrypt2(const void *key, int key_len, void *dst, size_t *dst_len,
+		      const void *src1, size_t src1_len,
+		      const void *src2, size_t src2_len)
+{
+	struct scatterlist sg_in[3], sg_out[1];
+	struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
+	struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
+	int ret;
+	void *iv;
+	int ivsize;
+	size_t zero_padding = (0x10 - ((src1_len + src2_len) & 0x0f));
+	char pad[16];
+
+	if (IS_ERR(tfm))
+		return PTR_ERR(tfm);
+
+	memset(pad, zero_padding, zero_padding);
+
+	*dst_len = src1_len + src2_len + zero_padding;
+
+	crypto_blkcipher_setkey((void *)tfm, key, key_len);
+	sg_init_table(sg_in, 3);
+	sg_set_buf(&sg_in[0], src1, src1_len);
+	sg_set_buf(&sg_in[1], src2, src2_len);
+	sg_set_buf(&sg_in[2], pad, zero_padding);
+	sg_init_table(sg_out, 1);
+	sg_set_buf(sg_out, dst, *dst_len);
+	iv = crypto_blkcipher_crt(tfm)->iv;
+	ivsize = crypto_blkcipher_ivsize(tfm);
+
+	memcpy(iv, aes_iv, ivsize);
+	/*
+	print_hex_dump(KERN_ERR, "enc  key: ", DUMP_PREFIX_NONE, 16, 1,
+		       key, key_len, 1);
+	print_hex_dump(KERN_ERR, "enc src1: ", DUMP_PREFIX_NONE, 16, 1,
+			src1, src1_len, 1);
+	print_hex_dump(KERN_ERR, "enc src2: ", DUMP_PREFIX_NONE, 16, 1,
+			src2, src2_len, 1);
+	print_hex_dump(KERN_ERR, "enc  pad: ", DUMP_PREFIX_NONE, 16, 1,
+			pad, zero_padding, 1);
+	*/
+	ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in,
+				     src1_len + src2_len + zero_padding);
+	crypto_free_blkcipher(tfm);
+	if (ret < 0)
+		pr_err("ceph_aes_crypt2 failed %d\n", ret);
+	/*
+	print_hex_dump(KERN_ERR, "enc  out: ", DUMP_PREFIX_NONE, 16, 1,
+		       dst, *dst_len, 1);
+	*/
+	return 0;
+}
+
+int ceph_aes_decrypt(const void *key, int key_len, void *dst, size_t *dst_len,
+		     const void *src, size_t src_len)
+{
+	struct scatterlist sg_in[1], sg_out[2];
+	struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
+	struct blkcipher_desc desc = { .tfm = tfm };
+	char pad[16];
+	void *iv;
+	int ivsize;
+	int ret;
+	int last_byte;
+
+	if (IS_ERR(tfm))
+		return PTR_ERR(tfm);
+
+	crypto_blkcipher_setkey((void *)tfm, key, key_len);
+	sg_init_table(sg_in, 1);
+	sg_init_table(sg_out, 2);
+	sg_set_buf(sg_in, src, src_len);
+	sg_set_buf(&sg_out[0], dst, *dst_len);
+	sg_set_buf(&sg_out[1], pad, sizeof(pad));
+
+	iv = crypto_blkcipher_crt(tfm)->iv;
+	ivsize = crypto_blkcipher_ivsize(tfm);
+
+	memcpy(iv, aes_iv, ivsize);
+
+	/*
+	print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1,
+		       key, key_len, 1);
+	print_hex_dump(KERN_ERR, "dec  in: ", DUMP_PREFIX_NONE, 16, 1,
+		       src, src_len, 1);
+	*/
+
+	ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len);
+	crypto_free_blkcipher(tfm);
+	if (ret < 0) {
+		pr_err("ceph_aes_decrypt failed %d\n", ret);
+		return ret;
+	}
+
+	if (src_len <= *dst_len)
+		last_byte = ((char *)dst)[src_len - 1];
+	else
+		last_byte = pad[src_len - *dst_len - 1];
+	if (last_byte <= 16 && src_len >= last_byte) {
+		*dst_len = src_len - last_byte;
+	} else {
+		pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n",
+		       last_byte, (int)src_len);
+		return -EPERM;  /* bad padding */
+	}
+	/*
+	print_hex_dump(KERN_ERR, "dec out: ", DUMP_PREFIX_NONE, 16, 1,
+		       dst, *dst_len, 1);
+	*/
+	return 0;
+}
+
+int ceph_aes_decrypt2(const void *key, int key_len,
+		      void *dst1, size_t *dst1_len,
+		      void *dst2, size_t *dst2_len,
+		      const void *src, size_t src_len)
+{
+	struct scatterlist sg_in[1], sg_out[3];
+	struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
+	struct blkcipher_desc desc = { .tfm = tfm };
+	char pad[16];
+	void *iv;
+	int ivsize;
+	int ret;
+	int last_byte;
+
+	if (IS_ERR(tfm))
+		return PTR_ERR(tfm);
+
+	sg_init_table(sg_in, 1);
+	sg_set_buf(sg_in, src, src_len);
+	sg_init_table(sg_out, 3);
+	sg_set_buf(&sg_out[0], dst1, *dst1_len);
+	sg_set_buf(&sg_out[1], dst2, *dst2_len);
+	sg_set_buf(&sg_out[2], pad, sizeof(pad));
+
+	crypto_blkcipher_setkey((void *)tfm, key, key_len);
+	iv = crypto_blkcipher_crt(tfm)->iv;
+	ivsize = crypto_blkcipher_ivsize(tfm);
+
+	memcpy(iv, aes_iv, ivsize);
+
+	/*
+	print_hex_dump(KERN_ERR, "dec  key: ", DUMP_PREFIX_NONE, 16, 1,
+		       key, key_len, 1);
+	print_hex_dump(KERN_ERR, "dec   in: ", DUMP_PREFIX_NONE, 16, 1,
+		       src, src_len, 1);
+	*/
+
+	ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len);
+	crypto_free_blkcipher(tfm);
+	if (ret < 0) {
+		pr_err("ceph_aes_decrypt failed %d\n", ret);
+		return ret;
+	}
+
+	if (src_len <= *dst1_len)
+		last_byte = ((char *)dst1)[src_len - 1];
+	else if (src_len <= *dst1_len + *dst2_len)
+		last_byte = ((char *)dst2)[src_len - *dst1_len - 1];
+	else
+		last_byte = pad[src_len - *dst1_len - *dst2_len - 1];
+	if (last_byte <= 16 && src_len >= last_byte) {
+		src_len -= last_byte;
+	} else {
+		pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n",
+		       last_byte, (int)src_len);
+		return -EPERM;  /* bad padding */
+	}
+
+	if (src_len < *dst1_len) {
+		*dst1_len = src_len;
+		*dst2_len = 0;
+	} else {
+		*dst2_len = src_len - *dst1_len;
+	}
+	/*
+	print_hex_dump(KERN_ERR, "dec  out1: ", DUMP_PREFIX_NONE, 16, 1,
+		       dst1, *dst1_len, 1);
+	print_hex_dump(KERN_ERR, "dec  out2: ", DUMP_PREFIX_NONE, 16, 1,
+		       dst2, *dst2_len, 1);
+	*/
+
+	return 0;
+}
+
+
+int ceph_decrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
+		 const void *src, size_t src_len)
+{
+	switch (secret->type) {
+	case CEPH_CRYPTO_NONE:
+		if (*dst_len < src_len)
+			return -ERANGE;
+		memcpy(dst, src, src_len);
+		*dst_len = src_len;
+		return 0;
+
+	case CEPH_CRYPTO_AES:
+		return ceph_aes_decrypt(secret->key, secret->len, dst,
+					dst_len, src, src_len);
+
+	default:
+		return -EINVAL;
+	}
+}
+
+int ceph_decrypt2(struct ceph_crypto_key *secret,
+			void *dst1, size_t *dst1_len,
+			void *dst2, size_t *dst2_len,
+			const void *src, size_t src_len)
+{
+	size_t t;
+
+	switch (secret->type) {
+	case CEPH_CRYPTO_NONE:
+		if (*dst1_len + *dst2_len < src_len)
+			return -ERANGE;
+		t = min(*dst1_len, src_len);
+		memcpy(dst1, src, t);
+		*dst1_len = t;
+		src += t;
+		src_len -= t;
+		if (src_len) {
+			t = min(*dst2_len, src_len);
+			memcpy(dst2, src, t);
+			*dst2_len = t;
+		}
+		return 0;
+
+	case CEPH_CRYPTO_AES:
+		return ceph_aes_decrypt2(secret->key, secret->len,
+					 dst1, dst1_len, dst2, dst2_len,
+					 src, src_len);
+
+	default:
+		return -EINVAL;
+	}
+}
+
+int ceph_encrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
+		 const void *src, size_t src_len)
+{
+	switch (secret->type) {
+	case CEPH_CRYPTO_NONE:
+		if (*dst_len < src_len)
+			return -ERANGE;
+		memcpy(dst, src, src_len);
+		*dst_len = src_len;
+		return 0;
+
+	case CEPH_CRYPTO_AES:
+		return ceph_aes_encrypt(secret->key, secret->len, dst,
+					dst_len, src, src_len);
+
+	default:
+		return -EINVAL;
+	}
+}
+
+int ceph_encrypt2(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
+		  const void *src1, size_t src1_len,
+		  const void *src2, size_t src2_len)
+{
+	switch (secret->type) {
+	case CEPH_CRYPTO_NONE:
+		if (*dst_len < src1_len + src2_len)
+			return -ERANGE;
+		memcpy(dst, src1, src1_len);
+		memcpy(dst + src1_len, src2, src2_len);
+		*dst_len = src1_len + src2_len;
+		return 0;
+
+	case CEPH_CRYPTO_AES:
+		return ceph_aes_encrypt2(secret->key, secret->len, dst, dst_len,
+					 src1, src1_len, src2, src2_len);
+
+	default:
+		return -EINVAL;
+	}
+}
diff --git a/fs/ceph/crypto.h b/fs/ceph/crypto.h
new file mode 100644
index 000000000000..40b502e6bd89
--- /dev/null
+++ b/fs/ceph/crypto.h
@@ -0,0 +1,48 @@
+#ifndef _FS_CEPH_CRYPTO_H
+#define _FS_CEPH_CRYPTO_H
+
+#include "types.h"
+#include "buffer.h"
+
+/*
+ * cryptographic secret
+ */
+struct ceph_crypto_key {
+	int type;
+	struct ceph_timespec created;
+	int len;
+	void *key;
+};
+
+static inline void ceph_crypto_key_destroy(struct ceph_crypto_key *key)
+{
+	kfree(key->key);
+}
+
+extern int ceph_crypto_key_encode(struct ceph_crypto_key *key,
+				  void **p, void *end);
+extern int ceph_crypto_key_decode(struct ceph_crypto_key *key,
+				  void **p, void *end);
+extern int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *in);
+
+/* crypto.c */
+extern int ceph_decrypt(struct ceph_crypto_key *secret,
+			void *dst, size_t *dst_len,
+			const void *src, size_t src_len);
+extern int ceph_encrypt(struct ceph_crypto_key *secret,
+			void *dst, size_t *dst_len,
+			const void *src, size_t src_len);
+extern int ceph_decrypt2(struct ceph_crypto_key *secret,
+			void *dst1, size_t *dst1_len,
+			void *dst2, size_t *dst2_len,
+			const void *src, size_t src_len);
+extern int ceph_encrypt2(struct ceph_crypto_key *secret,
+			 void *dst, size_t *dst_len,
+			 const void *src1, size_t src1_len,
+			 const void *src2, size_t src2_len);
+
+/* armor.c */
+extern int ceph_armor(char *dst, const void *src, const void *end);
+extern int ceph_unarmor(void *dst, const char *src, const char *end);
+
+#endif
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
new file mode 100644
index 000000000000..f2f5332ddbba
--- /dev/null
+++ b/fs/ceph/debugfs.c
@@ -0,0 +1,489 @@
+#include "ceph_debug.h"
+
+#include <linux/device.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/ctype.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+#include "super.h"
+#include "mds_client.h"
+#include "mon_client.h"
+#include "auth.h"
+
+#ifdef CONFIG_DEBUG_FS
+
+/*
+ * Implement /sys/kernel/debug/ceph fun
+ *
+ * /sys/kernel/debug/ceph/client*  - an instance of the ceph client
+ *      .../osdmap      - current osdmap
+ *      .../mdsmap      - current mdsmap
+ *      .../monmap      - current monmap
+ *      .../osdc        - active osd requests
+ *      .../mdsc        - active mds requests
+ *      .../monc        - mon client state
+ *      .../dentry_lru  - dump contents of dentry lru
+ *      .../caps        - expose cap (reservation) stats
+ *      .../bdi         - symlink to ../../bdi/something
+ */
+
+static struct dentry *ceph_debugfs_dir;
+
+static int monmap_show(struct seq_file *s, void *p)
+{
+	int i;
+	struct ceph_client *client = s->private;
+
+	if (client->monc.monmap == NULL)
+		return 0;
+
+	seq_printf(s, "epoch %d\n", client->monc.monmap->epoch);
+	for (i = 0; i < client->monc.monmap->num_mon; i++) {
+		struct ceph_entity_inst *inst =
+			&client->monc.monmap->mon_inst[i];
+
+		seq_printf(s, "\t%s%lld\t%s\n",
+			   ENTITY_NAME(inst->name),
+			   pr_addr(&inst->addr.in_addr));
+	}
+	return 0;
+}
+
+static int mdsmap_show(struct seq_file *s, void *p)
+{
+	int i;
+	struct ceph_client *client = s->private;
+
+	if (client->mdsc.mdsmap == NULL)
+		return 0;
+	seq_printf(s, "epoch %d\n", client->mdsc.mdsmap->m_epoch);
+	seq_printf(s, "root %d\n", client->mdsc.mdsmap->m_root);
+	seq_printf(s, "session_timeout %d\n",
+		       client->mdsc.mdsmap->m_session_timeout);
+	seq_printf(s, "session_autoclose %d\n",
+		       client->mdsc.mdsmap->m_session_autoclose);
+	for (i = 0; i < client->mdsc.mdsmap->m_max_mds; i++) {
+		struct ceph_entity_addr *addr =
+			&client->mdsc.mdsmap->m_info[i].addr;
+		int state = client->mdsc.mdsmap->m_info[i].state;
+
+		seq_printf(s, "\tmds%d\t%s\t(%s)\n", i, pr_addr(&addr->in_addr),
+			       ceph_mds_state_name(state));
+	}
+	return 0;
+}
+
+static int osdmap_show(struct seq_file *s, void *p)
+{
+	int i;
+	struct ceph_client *client = s->private;
+	struct rb_node *n;
+
+	if (client->osdc.osdmap == NULL)
+		return 0;
+	seq_printf(s, "epoch %d\n", client->osdc.osdmap->epoch);
+	seq_printf(s, "flags%s%s\n",
+		   (client->osdc.osdmap->flags & CEPH_OSDMAP_NEARFULL) ?
+		   " NEARFULL" : "",
+		   (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ?
+		   " FULL" : "");
+	for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) {
+		struct ceph_pg_pool_info *pool =
+			rb_entry(n, struct ceph_pg_pool_info, node);
+		seq_printf(s, "pg_pool %d pg_num %d / %d, lpg_num %d / %d\n",
+			   pool->id, pool->v.pg_num, pool->pg_num_mask,
+			   pool->v.lpg_num, pool->lpg_num_mask);
+	}
+	for (i = 0; i < client->osdc.osdmap->max_osd; i++) {
+		struct ceph_entity_addr *addr =
+			&client->osdc.osdmap->osd_addr[i];
+		int state = client->osdc.osdmap->osd_state[i];
+		char sb[64];
+
+		seq_printf(s, "\tosd%d\t%s\t%3d%%\t(%s)\n",
+			   i, pr_addr(&addr->in_addr),
+			   ((client->osdc.osdmap->osd_weight[i]*100) >> 16),
+			   ceph_osdmap_state_str(sb, sizeof(sb), state));
+	}
+	return 0;
+}
+
+static int monc_show(struct seq_file *s, void *p)
+{
+	struct ceph_client *client = s->private;
+	struct ceph_mon_generic_request *req;
+	struct ceph_mon_client *monc = &client->monc;
+	struct rb_node *rp;
+
+	mutex_lock(&monc->mutex);
+
+	if (monc->have_mdsmap)
+		seq_printf(s, "have mdsmap %u\n", (unsigned)monc->have_mdsmap);
+	if (monc->have_osdmap)
+		seq_printf(s, "have osdmap %u\n", (unsigned)monc->have_osdmap);
+	if (monc->want_next_osdmap)
+		seq_printf(s, "want next osdmap\n");
+
+	for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) {
+		__u16 op;
+		req = rb_entry(rp, struct ceph_mon_generic_request, node);
+		op = le16_to_cpu(req->request->hdr.type);
+		if (op == CEPH_MSG_STATFS)
+			seq_printf(s, "%lld statfs\n", req->tid);
+		else
+			seq_printf(s, "%lld unknown\n", req->tid);
+	}
+
+	mutex_unlock(&monc->mutex);
+	return 0;
+}
+
+static int mdsc_show(struct seq_file *s, void *p)
+{
+	struct ceph_client *client = s->private;
+	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_mds_request *req;
+	struct rb_node *rp;
+	int pathlen;
+	u64 pathbase;
+	char *path;
+
+	mutex_lock(&mdsc->mutex);
+	for (rp = rb_first(&mdsc->request_tree); rp; rp = rb_next(rp)) {
+		req = rb_entry(rp, struct ceph_mds_request, r_node);
+
+		if (req->r_request)
+			seq_printf(s, "%lld\tmds%d\t", req->r_tid, req->r_mds);
+		else
+			seq_printf(s, "%lld\t(no request)\t", req->r_tid);
+
+		seq_printf(s, "%s", ceph_mds_op_name(req->r_op));
+
+		if (req->r_got_unsafe)
+			seq_printf(s, "\t(unsafe)");
+		else
+			seq_printf(s, "\t");
+
+		if (req->r_inode) {
+			seq_printf(s, " #%llx", ceph_ino(req->r_inode));
+		} else if (req->r_dentry) {
+			path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
+						    &pathbase, 0);
+			spin_lock(&req->r_dentry->d_lock);
+			seq_printf(s, " #%llx/%.*s (%s)",
+				   ceph_ino(req->r_dentry->d_parent->d_inode),
+				   req->r_dentry->d_name.len,
+				   req->r_dentry->d_name.name,
+				   path ? path : "");
+			spin_unlock(&req->r_dentry->d_lock);
+			kfree(path);
+		} else if (req->r_path1) {
+			seq_printf(s, " #%llx/%s", req->r_ino1.ino,
+				   req->r_path1);
+		}
+
+		if (req->r_old_dentry) {
+			path = ceph_mdsc_build_path(req->r_old_dentry, &pathlen,
+						    &pathbase, 0);
+			spin_lock(&req->r_old_dentry->d_lock);
+			seq_printf(s, " #%llx/%.*s (%s)",
+			   ceph_ino(req->r_old_dentry->d_parent->d_inode),
+				   req->r_old_dentry->d_name.len,
+				   req->r_old_dentry->d_name.name,
+				   path ? path : "");
+			spin_unlock(&req->r_old_dentry->d_lock);
+			kfree(path);
+		} else if (req->r_path2) {
+			if (req->r_ino2.ino)
+				seq_printf(s, " #%llx/%s", req->r_ino2.ino,
+					   req->r_path2);
+			else
+				seq_printf(s, " %s", req->r_path2);
+		}
+
+		seq_printf(s, "\n");
+	}
+	mutex_unlock(&mdsc->mutex);
+
+	return 0;
+}
+
+static int osdc_show(struct seq_file *s, void *pp)
+{
+	struct ceph_client *client = s->private;
+	struct ceph_osd_client *osdc = &client->osdc;
+	struct rb_node *p;
+
+	mutex_lock(&osdc->request_mutex);
+	for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
+		struct ceph_osd_request *req;
+		struct ceph_osd_request_head *head;
+		struct ceph_osd_op *op;
+		int num_ops;
+		int opcode, olen;
+		int i;
+
+		req = rb_entry(p, struct ceph_osd_request, r_node);
+
+		seq_printf(s, "%lld\tosd%d\t%d.%x\t", req->r_tid,
+			   req->r_osd ? req->r_osd->o_osd : -1,
+			   le32_to_cpu(req->r_pgid.pool),
+			   le16_to_cpu(req->r_pgid.ps));
+
+		head = req->r_request->front.iov_base;
+		op = (void *)(head + 1);
+
+		num_ops = le16_to_cpu(head->num_ops);
+		olen = le32_to_cpu(head->object_len);
+		seq_printf(s, "%.*s", olen,
+			   (const char *)(head->ops + num_ops));
+
+		if (req->r_reassert_version.epoch)
+			seq_printf(s, "\t%u'%llu",
+			   (unsigned)le32_to_cpu(req->r_reassert_version.epoch),
+			   le64_to_cpu(req->r_reassert_version.version));
+		else
+			seq_printf(s, "\t");
+
+		for (i = 0; i < num_ops; i++) {
+			opcode = le16_to_cpu(op->op);
+			seq_printf(s, "\t%s", ceph_osd_op_name(opcode));
+			op++;
+		}
+
+		seq_printf(s, "\n");
+	}
+	mutex_unlock(&osdc->request_mutex);
+	return 0;
+}
+
+static int caps_show(struct seq_file *s, void *p)
+{
+	struct ceph_client *client = s->private;
+	int total, avail, used, reserved, min;
+
+	ceph_reservation_status(client, &total, &avail, &used, &reserved, &min);
+	seq_printf(s, "total\t\t%d\n"
+		   "avail\t\t%d\n"
+		   "used\t\t%d\n"
+		   "reserved\t%d\n"
+		   "min\t%d\n",
+		   total, avail, used, reserved, min);
+	return 0;
+}
+
+static int dentry_lru_show(struct seq_file *s, void *ptr)
+{
+	struct ceph_client *client = s->private;
+	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_dentry_info *di;
+
+	spin_lock(&mdsc->dentry_lru_lock);
+	list_for_each_entry(di, &mdsc->dentry_lru, lru) {
+		struct dentry *dentry = di->dentry;
+		seq_printf(s, "%p %p\t%.*s\n",
+			   di, dentry, dentry->d_name.len, dentry->d_name.name);
+	}
+	spin_unlock(&mdsc->dentry_lru_lock);
+
+	return 0;
+}
+
+#define DEFINE_SHOW_FUNC(name) 						\
+static int name##_open(struct inode *inode, struct file *file)		\
+{									\
+	struct seq_file *sf;						\
+	int ret;							\
+									\
+	ret = single_open(file, name, NULL);				\
+	sf = file->private_data;					\
+	sf->private = inode->i_private;					\
+	return ret;							\
+}									\
+									\
+static const struct file_operations name##_fops = {			\
+	.open		= name##_open,					\
+	.read		= seq_read,					\
+	.llseek		= seq_lseek,					\
+	.release	= single_release,				\
+};
+
+DEFINE_SHOW_FUNC(monmap_show)
+DEFINE_SHOW_FUNC(mdsmap_show)
+DEFINE_SHOW_FUNC(osdmap_show)
+DEFINE_SHOW_FUNC(monc_show)
+DEFINE_SHOW_FUNC(mdsc_show)
+DEFINE_SHOW_FUNC(osdc_show)
+DEFINE_SHOW_FUNC(dentry_lru_show)
+DEFINE_SHOW_FUNC(caps_show)
+
+static int congestion_kb_set(void *data, u64 val)
+{
+	struct ceph_client *client = (struct ceph_client *)data;
+
+	if (client)
+		client->mount_args->congestion_kb = (int)val;
+
+	return 0;
+}
+
+static int congestion_kb_get(void *data, u64 *val)
+{
+	struct ceph_client *client = (struct ceph_client *)data;
+
+	if (client)
+		*val = (u64)client->mount_args->congestion_kb;
+
+	return 0;
+}
+
+
+DEFINE_SIMPLE_ATTRIBUTE(congestion_kb_fops, congestion_kb_get,
+			congestion_kb_set, "%llu\n");
+
+int __init ceph_debugfs_init(void)
+{
+	ceph_debugfs_dir = debugfs_create_dir("ceph", NULL);
+	if (!ceph_debugfs_dir)
+		return -ENOMEM;
+	return 0;
+}
+
+void ceph_debugfs_cleanup(void)
+{
+	debugfs_remove(ceph_debugfs_dir);
+}
+
+int ceph_debugfs_client_init(struct ceph_client *client)
+{
+	int ret = 0;
+	char name[80];
+
+	snprintf(name, sizeof(name), FSID_FORMAT ".client%lld",
+		 PR_FSID(&client->fsid), client->monc.auth->global_id);
+
+	client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir);
+	if (!client->debugfs_dir)
+		goto out;
+
+	client->monc.debugfs_file = debugfs_create_file("monc",
+						      0600,
+						      client->debugfs_dir,
+						      client,
+						      &monc_show_fops);
+	if (!client->monc.debugfs_file)
+		goto out;
+
+	client->mdsc.debugfs_file = debugfs_create_file("mdsc",
+						      0600,
+						      client->debugfs_dir,
+						      client,
+						      &mdsc_show_fops);
+	if (!client->mdsc.debugfs_file)
+		goto out;
+
+	client->osdc.debugfs_file = debugfs_create_file("osdc",
+						      0600,
+						      client->debugfs_dir,
+						      client,
+						      &osdc_show_fops);
+	if (!client->osdc.debugfs_file)
+		goto out;
+
+	client->debugfs_monmap = debugfs_create_file("monmap",
+					0600,
+					client->debugfs_dir,
+					client,
+					&monmap_show_fops);
+	if (!client->debugfs_monmap)
+		goto out;
+
+	client->debugfs_mdsmap = debugfs_create_file("mdsmap",
+					0600,
+					client->debugfs_dir,
+					client,
+					&mdsmap_show_fops);
+	if (!client->debugfs_mdsmap)
+		goto out;
+
+	client->debugfs_osdmap = debugfs_create_file("osdmap",
+					0600,
+					client->debugfs_dir,
+					client,
+					&osdmap_show_fops);
+	if (!client->debugfs_osdmap)
+		goto out;
+
+	client->debugfs_dentry_lru = debugfs_create_file("dentry_lru",
+					0600,
+					client->debugfs_dir,
+					client,
+					&dentry_lru_show_fops);
+	if (!client->debugfs_dentry_lru)
+		goto out;
+
+	client->debugfs_caps = debugfs_create_file("caps",
+						   0400,
+						   client->debugfs_dir,
+						   client,
+						   &caps_show_fops);
+	if (!client->debugfs_caps)
+		goto out;
+
+	client->debugfs_congestion_kb = debugfs_create_file("writeback_congestion_kb",
+						   0600,
+						   client->debugfs_dir,
+						   client,
+						   &congestion_kb_fops);
+	if (!client->debugfs_congestion_kb)
+		goto out;
+
+	sprintf(name, "../../bdi/%s", dev_name(client->sb->s_bdi->dev));
+	client->debugfs_bdi = debugfs_create_symlink("bdi", client->debugfs_dir,
+						     name);
+
+	return 0;
+
+out:
+	ceph_debugfs_client_cleanup(client);
+	return ret;
+}
+
+void ceph_debugfs_client_cleanup(struct ceph_client *client)
+{
+	debugfs_remove(client->debugfs_bdi);
+	debugfs_remove(client->debugfs_caps);
+	debugfs_remove(client->debugfs_dentry_lru);
+	debugfs_remove(client->debugfs_osdmap);
+	debugfs_remove(client->debugfs_mdsmap);
+	debugfs_remove(client->debugfs_monmap);
+	debugfs_remove(client->osdc.debugfs_file);
+	debugfs_remove(client->mdsc.debugfs_file);
+	debugfs_remove(client->monc.debugfs_file);
+	debugfs_remove(client->debugfs_congestion_kb);
+	debugfs_remove(client->debugfs_dir);
+}
+
+#else  // CONFIG_DEBUG_FS
+
+int __init ceph_debugfs_init(void)
+{
+	return 0;
+}
+
+void ceph_debugfs_cleanup(void)
+{
+}
+
+int ceph_debugfs_client_init(struct ceph_client *client)
+{
+	return 0;
+}
+
+void ceph_debugfs_client_cleanup(struct ceph_client *client)
+{
+}
+
+#endif  // CONFIG_DEBUG_FS
diff --git a/fs/ceph/decode.h b/fs/ceph/decode.h
new file mode 100644
index 000000000000..65b3e022eaf5
--- /dev/null
+++ b/fs/ceph/decode.h
@@ -0,0 +1,194 @@
+#ifndef __CEPH_DECODE_H
+#define __CEPH_DECODE_H
+
+#include <asm/unaligned.h>
+#include <linux/time.h>
+
+#include "types.h"
+
+/*
+ * in all cases,
+ *   void **p     pointer to position pointer
+ *   void *end    pointer to end of buffer (last byte + 1)
+ */
+
+static inline u64 ceph_decode_64(void **p)
+{
+	u64 v = get_unaligned_le64(*p);
+	*p += sizeof(u64);
+	return v;
+}
+static inline u32 ceph_decode_32(void **p)
+{
+	u32 v = get_unaligned_le32(*p);
+	*p += sizeof(u32);
+	return v;
+}
+static inline u16 ceph_decode_16(void **p)
+{
+	u16 v = get_unaligned_le16(*p);
+	*p += sizeof(u16);
+	return v;
+}
+static inline u8 ceph_decode_8(void **p)
+{
+	u8 v = *(u8 *)*p;
+	(*p)++;
+	return v;
+}
+static inline void ceph_decode_copy(void **p, void *pv, size_t n)
+{
+	memcpy(pv, *p, n);
+	*p += n;
+}
+
+/*
+ * bounds check input.
+ */
+#define ceph_decode_need(p, end, n, bad)		\
+	do {						\
+		if (unlikely(*(p) + (n) > (end))) 	\
+			goto bad;			\
+	} while (0)
+
+#define ceph_decode_64_safe(p, end, v, bad)			\
+	do {							\
+		ceph_decode_need(p, end, sizeof(u64), bad);	\
+		v = ceph_decode_64(p);				\
+	} while (0)
+#define ceph_decode_32_safe(p, end, v, bad)			\
+	do {							\
+		ceph_decode_need(p, end, sizeof(u32), bad);	\
+		v = ceph_decode_32(p);				\
+	} while (0)
+#define ceph_decode_16_safe(p, end, v, bad)			\
+	do {							\
+		ceph_decode_need(p, end, sizeof(u16), bad);	\
+		v = ceph_decode_16(p);				\
+	} while (0)
+#define ceph_decode_8_safe(p, end, v, bad)			\
+	do {							\
+		ceph_decode_need(p, end, sizeof(u8), bad);	\
+		v = ceph_decode_8(p);				\
+	} while (0)
+
+#define ceph_decode_copy_safe(p, end, pv, n, bad)		\
+	do {							\
+		ceph_decode_need(p, end, n, bad);		\
+		ceph_decode_copy(p, pv, n);			\
+	} while (0)
+
+/*
+ * struct ceph_timespec <-> struct timespec
+ */
+static inline void ceph_decode_timespec(struct timespec *ts,
+					const struct ceph_timespec *tv)
+{
+	ts->tv_sec = le32_to_cpu(tv->tv_sec);
+	ts->tv_nsec = le32_to_cpu(tv->tv_nsec);
+}
+static inline void ceph_encode_timespec(struct ceph_timespec *tv,
+					const struct timespec *ts)
+{
+	tv->tv_sec = cpu_to_le32(ts->tv_sec);
+	tv->tv_nsec = cpu_to_le32(ts->tv_nsec);
+}
+
+/*
+ * sockaddr_storage <-> ceph_sockaddr
+ */
+static inline void ceph_encode_addr(struct ceph_entity_addr *a)
+{
+	a->in_addr.ss_family = htons(a->in_addr.ss_family);
+}
+static inline void ceph_decode_addr(struct ceph_entity_addr *a)
+{
+	a->in_addr.ss_family = ntohs(a->in_addr.ss_family);
+	WARN_ON(a->in_addr.ss_family == 512);
+}
+
+/*
+ * encoders
+ */
+static inline void ceph_encode_64(void **p, u64 v)
+{
+	put_unaligned_le64(v, (__le64 *)*p);
+	*p += sizeof(u64);
+}
+static inline void ceph_encode_32(void **p, u32 v)
+{
+	put_unaligned_le32(v, (__le32 *)*p);
+	*p += sizeof(u32);
+}
+static inline void ceph_encode_16(void **p, u16 v)
+{
+	put_unaligned_le16(v, (__le16 *)*p);
+	*p += sizeof(u16);
+}
+static inline void ceph_encode_8(void **p, u8 v)
+{
+	*(u8 *)*p = v;
+	(*p)++;
+}
+static inline void ceph_encode_copy(void **p, const void *s, int len)
+{
+	memcpy(*p, s, len);
+	*p += len;
+}
+
+/*
+ * filepath, string encoders
+ */
+static inline void ceph_encode_filepath(void **p, void *end,
+					u64 ino, const char *path)
+{
+	u32 len = path ? strlen(path) : 0;
+	BUG_ON(*p + sizeof(ino) + sizeof(len) + len > end);
+	ceph_encode_8(p, 1);
+	ceph_encode_64(p, ino);
+	ceph_encode_32(p, len);
+	if (len)
+		memcpy(*p, path, len);
+	*p += len;
+}
+
+static inline void ceph_encode_string(void **p, void *end,
+				      const char *s, u32 len)
+{
+	BUG_ON(*p + sizeof(len) + len > end);
+	ceph_encode_32(p, len);
+	if (len)
+		memcpy(*p, s, len);
+	*p += len;
+}
+
+#define ceph_encode_need(p, end, n, bad)		\
+	do {						\
+		if (unlikely(*(p) + (n) > (end))) 	\
+			goto bad;			\
+	} while (0)
+
+#define ceph_encode_64_safe(p, end, v, bad)			\
+	do {							\
+		ceph_encode_need(p, end, sizeof(u64), bad);	\
+		ceph_encode_64(p, v);				\
+	} while (0)
+#define ceph_encode_32_safe(p, end, v, bad)			\
+	do {							\
+		ceph_encode_need(p, end, sizeof(u32), bad);	\
+		ceph_encode_32(p, v);			\
+	} while (0)
+#define ceph_encode_16_safe(p, end, v, bad)			\
+	do {							\
+		ceph_encode_need(p, end, sizeof(u16), bad);	\
+		ceph_encode_16(p, v);			\
+	} while (0)
+
+#define ceph_encode_copy_safe(p, end, pv, n, bad)		\
+	do {							\
+		ceph_encode_need(p, end, n, bad);		\
+		ceph_encode_copy(p, pv, n);			\
+	} while (0)
+
+
+#endif
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
new file mode 100644
index 000000000000..f94ed3c7f6a5
--- /dev/null
+++ b/fs/ceph/dir.c
@@ -0,0 +1,1254 @@
+#include "ceph_debug.h"
+
+#include <linux/spinlock.h>
+#include <linux/fs_struct.h>
+#include <linux/namei.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+
+#include "super.h"
+
+/*
+ * Directory operations: readdir, lookup, create, link, unlink,
+ * rename, etc.
+ */
+
+/*
+ * Ceph MDS operations are specified in terms of a base ino and
+ * relative path.  Thus, the client can specify an operation on a
+ * specific inode (e.g., a getattr due to fstat(2)), or as a path
+ * relative to, say, the root directory.
+ *
+ * Normally, we limit ourselves to strict inode ops (no path component)
+ * or dentry operations (a single path component relative to an ino).  The
+ * exception to this is open_root_dentry(), which will open the mount
+ * point by name.
+ */
+
+const struct inode_operations ceph_dir_iops;
+const struct file_operations ceph_dir_fops;
+struct dentry_operations ceph_dentry_ops;
+
+/*
+ * Initialize ceph dentry state.
+ */
+int ceph_init_dentry(struct dentry *dentry)
+{
+	struct ceph_dentry_info *di;
+
+	if (dentry->d_fsdata)
+		return 0;
+
+	if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP)
+		dentry->d_op = &ceph_dentry_ops;
+	else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR)
+		dentry->d_op = &ceph_snapdir_dentry_ops;
+	else
+		dentry->d_op = &ceph_snap_dentry_ops;
+
+	di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS);
+	if (!di)
+		return -ENOMEM;          /* oh well */
+
+	spin_lock(&dentry->d_lock);
+	if (dentry->d_fsdata) {
+		/* lost a race */
+		kmem_cache_free(ceph_dentry_cachep, di);
+		goto out_unlock;
+	}
+	di->dentry = dentry;
+	di->lease_session = NULL;
+	dentry->d_fsdata = di;
+	dentry->d_time = jiffies;
+	ceph_dentry_lru_add(dentry);
+out_unlock:
+	spin_unlock(&dentry->d_lock);
+	return 0;
+}
+
+
+
+/*
+ * for readdir, we encode the directory frag and offset within that
+ * frag into f_pos.
+ */
+static unsigned fpos_frag(loff_t p)
+{
+	return p >> 32;
+}
+static unsigned fpos_off(loff_t p)
+{
+	return p & 0xffffffff;
+}
+
+/*
+ * When possible, we try to satisfy a readdir by peeking at the
+ * dcache.  We make this work by carefully ordering dentries on
+ * d_u.d_child when we initially get results back from the MDS, and
+ * falling back to a "normal" sync readdir if any dentries in the dir
+ * are dropped.
+ *
+ * I_COMPLETE tells indicates we have all dentries in the dir.  It is
+ * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by
+ * the MDS if/when the directory is modified).
+ */
+static int __dcache_readdir(struct file *filp,
+			    void *dirent, filldir_t filldir)
+{
+	struct inode *inode = filp->f_dentry->d_inode;
+	struct ceph_file_info *fi = filp->private_data;
+	struct dentry *parent = filp->f_dentry;
+	struct inode *dir = parent->d_inode;
+	struct list_head *p;
+	struct dentry *dentry, *last;
+	struct ceph_dentry_info *di;
+	int err = 0;
+
+	/* claim ref on last dentry we returned */
+	last = fi->dentry;
+	fi->dentry = NULL;
+
+	dout("__dcache_readdir %p at %llu (last %p)\n", dir, filp->f_pos,
+	     last);
+
+	spin_lock(&dcache_lock);
+
+	/* start at beginning? */
+	if (filp->f_pos == 2 || (last &&
+				 filp->f_pos < ceph_dentry(last)->offset)) {
+		if (list_empty(&parent->d_subdirs))
+			goto out_unlock;
+		p = parent->d_subdirs.prev;
+		dout(" initial p %p/%p\n", p->prev, p->next);
+	} else {
+		p = last->d_u.d_child.prev;
+	}
+
+more:
+	dentry = list_entry(p, struct dentry, d_u.d_child);
+	di = ceph_dentry(dentry);
+	while (1) {
+		dout(" p %p/%p %s d_subdirs %p/%p\n", p->prev, p->next,
+		     d_unhashed(dentry) ? "!hashed" : "hashed",
+		     parent->d_subdirs.prev, parent->d_subdirs.next);
+		if (p == &parent->d_subdirs) {
+			fi->at_end = 1;
+			goto out_unlock;
+		}
+		if (!d_unhashed(dentry) && dentry->d_inode &&
+		    ceph_snap(dentry->d_inode) != CEPH_SNAPDIR &&
+		    ceph_ino(dentry->d_inode) != CEPH_INO_CEPH &&
+		    filp->f_pos <= di->offset)
+			break;
+		dout(" skipping %p %.*s at %llu (%llu)%s%s\n", dentry,
+		     dentry->d_name.len, dentry->d_name.name, di->offset,
+		     filp->f_pos, d_unhashed(dentry) ? " unhashed" : "",
+		     !dentry->d_inode ? " null" : "");
+		p = p->prev;
+		dentry = list_entry(p, struct dentry, d_u.d_child);
+		di = ceph_dentry(dentry);
+	}
+
+	atomic_inc(&dentry->d_count);
+	spin_unlock(&dcache_lock);
+	spin_unlock(&inode->i_lock);
+
+	dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos,
+	     dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
+	filp->f_pos = di->offset;
+	err = filldir(dirent, dentry->d_name.name,
+		      dentry->d_name.len, di->offset,
+		      dentry->d_inode->i_ino,
+		      dentry->d_inode->i_mode >> 12);
+
+	if (last) {
+		if (err < 0) {
+			/* remember our position */
+			fi->dentry = last;
+			fi->next_offset = di->offset;
+		} else {
+			dput(last);
+		}
+		last = NULL;
+	}
+
+	spin_lock(&inode->i_lock);
+	spin_lock(&dcache_lock);
+
+	last = dentry;
+
+	if (err < 0)
+		goto out_unlock;
+
+	p = p->prev;
+	filp->f_pos++;
+
+	/* make sure a dentry wasn't dropped while we didn't have dcache_lock */
+	if ((ceph_inode(dir)->i_ceph_flags & CEPH_I_COMPLETE))
+		goto more;
+	dout(" lost I_COMPLETE on %p; falling back to mds\n", dir);
+	err = -EAGAIN;
+
+out_unlock:
+	spin_unlock(&dcache_lock);
+
+	if (last) {
+		spin_unlock(&inode->i_lock);
+		dput(last);
+		spin_lock(&inode->i_lock);
+	}
+
+	return err;
+}
+
+/*
+ * make note of the last dentry we read, so we can
+ * continue at the same lexicographical point,
+ * regardless of what dir changes take place on the
+ * server.
+ */
+static int note_last_dentry(struct ceph_file_info *fi, const char *name,
+			    int len)
+{
+	kfree(fi->last_name);
+	fi->last_name = kmalloc(len+1, GFP_NOFS);
+	if (!fi->last_name)
+		return -ENOMEM;
+	memcpy(fi->last_name, name, len);
+	fi->last_name[len] = 0;
+	dout("note_last_dentry '%s'\n", fi->last_name);
+	return 0;
+}
+
+static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+	struct ceph_file_info *fi = filp->private_data;
+	struct inode *inode = filp->f_dentry->d_inode;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_client *client = ceph_inode_to_client(inode);
+	struct ceph_mds_client *mdsc = &client->mdsc;
+	unsigned frag = fpos_frag(filp->f_pos);
+	int off = fpos_off(filp->f_pos);
+	int err;
+	u32 ftype;
+	struct ceph_mds_reply_info_parsed *rinfo;
+	const int max_entries = client->mount_args->max_readdir;
+	const int max_bytes = client->mount_args->max_readdir_bytes;
+
+	dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off);
+	if (fi->at_end)
+		return 0;
+
+	/* always start with . and .. */
+	if (filp->f_pos == 0) {
+		/* note dir version at start of readdir so we can tell
+		 * if any dentries get dropped */
+		fi->dir_release_count = ci->i_release_count;
+
+		dout("readdir off 0 -> '.'\n");
+		if (filldir(dirent, ".", 1, ceph_make_fpos(0, 0),
+			    inode->i_ino, inode->i_mode >> 12) < 0)
+			return 0;
+		filp->f_pos = 1;
+		off = 1;
+	}
+	if (filp->f_pos == 1) {
+		dout("readdir off 1 -> '..'\n");
+		if (filldir(dirent, "..", 2, ceph_make_fpos(0, 1),
+			    filp->f_dentry->d_parent->d_inode->i_ino,
+			    inode->i_mode >> 12) < 0)
+			return 0;
+		filp->f_pos = 2;
+		off = 2;
+	}
+
+	/* can we use the dcache? */
+	spin_lock(&inode->i_lock);
+	if ((filp->f_pos == 2 || fi->dentry) &&
+	    !ceph_test_opt(client, NOASYNCREADDIR) &&
+	    ceph_snap(inode) != CEPH_SNAPDIR &&
+	    (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
+	    __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
+		err = __dcache_readdir(filp, dirent, filldir);
+		if (err != -EAGAIN) {
+			spin_unlock(&inode->i_lock);
+			return err;
+		}
+	}
+	spin_unlock(&inode->i_lock);
+	if (fi->dentry) {
+		err = note_last_dentry(fi, fi->dentry->d_name.name,
+				       fi->dentry->d_name.len);
+		if (err)
+			return err;
+		dput(fi->dentry);
+		fi->dentry = NULL;
+	}
+
+	/* proceed with a normal readdir */
+
+more:
+	/* do we have the correct frag content buffered? */
+	if (fi->frag != frag || fi->last_readdir == NULL) {
+		struct ceph_mds_request *req;
+		int op = ceph_snap(inode) == CEPH_SNAPDIR ?
+			CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR;
+
+		/* discard old result, if any */
+		if (fi->last_readdir) {
+			ceph_mdsc_put_request(fi->last_readdir);
+			fi->last_readdir = NULL;
+		}
+
+		/* requery frag tree, as the frag topology may have changed */
+		frag = ceph_choose_frag(ceph_inode(inode), frag, NULL, NULL);
+
+		dout("readdir fetching %llx.%llx frag %x offset '%s'\n",
+		     ceph_vinop(inode), frag, fi->last_name);
+		req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
+		if (IS_ERR(req))
+			return PTR_ERR(req);
+		req->r_inode = igrab(inode);
+		req->r_dentry = dget(filp->f_dentry);
+		/* hints to request -> mds selection code */
+		req->r_direct_mode = USE_AUTH_MDS;
+		req->r_direct_hash = ceph_frag_value(frag);
+		req->r_direct_is_hash = true;
+		req->r_path2 = kstrdup(fi->last_name, GFP_NOFS);
+		req->r_readdir_offset = fi->next_offset;
+		req->r_args.readdir.frag = cpu_to_le32(frag);
+		req->r_args.readdir.max_entries = cpu_to_le32(max_entries);
+		req->r_args.readdir.max_bytes = cpu_to_le32(max_bytes);
+		req->r_num_caps = max_entries + 1;
+		err = ceph_mdsc_do_request(mdsc, NULL, req);
+		if (err < 0) {
+			ceph_mdsc_put_request(req);
+			return err;
+		}
+		dout("readdir got and parsed readdir result=%d"
+		     " on frag %x, end=%d, complete=%d\n", err, frag,
+		     (int)req->r_reply_info.dir_end,
+		     (int)req->r_reply_info.dir_complete);
+
+		if (!req->r_did_prepopulate) {
+			dout("readdir !did_prepopulate");
+			fi->dir_release_count--;    /* preclude I_COMPLETE */
+		}
+
+		/* note next offset and last dentry name */
+		fi->offset = fi->next_offset;
+		fi->last_readdir = req;
+
+		if (req->r_reply_info.dir_end) {
+			kfree(fi->last_name);
+			fi->last_name = NULL;
+			fi->next_offset = 2;
+		} else {
+			rinfo = &req->r_reply_info;
+			err = note_last_dentry(fi,
+				       rinfo->dir_dname[rinfo->dir_nr-1],
+				       rinfo->dir_dname_len[rinfo->dir_nr-1]);
+			if (err)
+				return err;
+			fi->next_offset += rinfo->dir_nr;
+		}
+	}
+
+	rinfo = &fi->last_readdir->r_reply_info;
+	dout("readdir frag %x num %d off %d chunkoff %d\n", frag,
+	     rinfo->dir_nr, off, fi->offset);
+	while (off - fi->offset >= 0 && off - fi->offset < rinfo->dir_nr) {
+		u64 pos = ceph_make_fpos(frag, off);
+		struct ceph_mds_reply_inode *in =
+			rinfo->dir_in[off - fi->offset].in;
+		dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n",
+		     off, off - fi->offset, rinfo->dir_nr, pos,
+		     rinfo->dir_dname_len[off - fi->offset],
+		     rinfo->dir_dname[off - fi->offset], in);
+		BUG_ON(!in);
+		ftype = le32_to_cpu(in->mode) >> 12;
+		if (filldir(dirent,
+			    rinfo->dir_dname[off - fi->offset],
+			    rinfo->dir_dname_len[off - fi->offset],
+			    pos,
+			    le64_to_cpu(in->ino),
+			    ftype) < 0) {
+			dout("filldir stopping us...\n");
+			return 0;
+		}
+		off++;
+		filp->f_pos = pos + 1;
+	}
+
+	if (fi->last_name) {
+		ceph_mdsc_put_request(fi->last_readdir);
+		fi->last_readdir = NULL;
+		goto more;
+	}
+
+	/* more frags? */
+	if (!ceph_frag_is_rightmost(frag)) {
+		frag = ceph_frag_next(frag);
+		off = 0;
+		filp->f_pos = ceph_make_fpos(frag, off);
+		dout("readdir next frag is %x\n", frag);
+		goto more;
+	}
+	fi->at_end = 1;
+
+	/*
+	 * if dir_release_count still matches the dir, no dentries
+	 * were released during the whole readdir, and we should have
+	 * the complete dir contents in our cache.
+	 */
+	spin_lock(&inode->i_lock);
+	if (ci->i_release_count == fi->dir_release_count) {
+		dout(" marking %p complete\n", inode);
+		ci->i_ceph_flags |= CEPH_I_COMPLETE;
+		ci->i_max_offset = filp->f_pos;
+	}
+	spin_unlock(&inode->i_lock);
+
+	dout("readdir %p filp %p done.\n", inode, filp);
+	return 0;
+}
+
+static void reset_readdir(struct ceph_file_info *fi)
+{
+	if (fi->last_readdir) {
+		ceph_mdsc_put_request(fi->last_readdir);
+		fi->last_readdir = NULL;
+	}
+	kfree(fi->last_name);
+	fi->next_offset = 2;  /* compensate for . and .. */
+	if (fi->dentry) {
+		dput(fi->dentry);
+		fi->dentry = NULL;
+	}
+	fi->at_end = 0;
+}
+
+static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin)
+{
+	struct ceph_file_info *fi = file->private_data;
+	struct inode *inode = file->f_mapping->host;
+	loff_t old_offset = offset;
+	loff_t retval;
+
+	mutex_lock(&inode->i_mutex);
+	switch (origin) {
+	case SEEK_END:
+		offset += inode->i_size + 2;   /* FIXME */
+		break;
+	case SEEK_CUR:
+		offset += file->f_pos;
+	}
+	retval = -EINVAL;
+	if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) {
+		if (offset != file->f_pos) {
+			file->f_pos = offset;
+			file->f_version = 0;
+			fi->at_end = 0;
+		}
+		retval = offset;
+
+		/*
+		 * discard buffered readdir content on seekdir(0), or
+		 * seek to new frag, or seek prior to current chunk.
+		 */
+		if (offset == 0 ||
+		    fpos_frag(offset) != fpos_frag(old_offset) ||
+		    fpos_off(offset) < fi->offset) {
+			dout("dir_llseek dropping %p content\n", file);
+			reset_readdir(fi);
+		}
+
+		/* bump dir_release_count if we did a forward seek */
+		if (offset > old_offset)
+			fi->dir_release_count--;
+	}
+	mutex_unlock(&inode->i_mutex);
+	return retval;
+}
+
+/*
+ * Process result of a lookup/open request.
+ *
+ * Mainly, make sure we return the final req->r_dentry (if it already
+ * existed) in place of the original VFS-provided dentry when they
+ * differ.
+ *
+ * Gracefully handle the case where the MDS replies with -ENOENT and
+ * no trace (which it may do, at its discretion, e.g., if it doesn't
+ * care to issue a lease on the negative dentry).
+ */
+struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
+				  struct dentry *dentry, int err)
+{
+	struct ceph_client *client = ceph_sb_to_client(dentry->d_sb);
+	struct inode *parent = dentry->d_parent->d_inode;
+
+	/* .snap dir? */
+	if (err == -ENOENT &&
+	    ceph_vino(parent).ino != CEPH_INO_ROOT && /* no .snap in root dir */
+	    strcmp(dentry->d_name.name,
+		   client->mount_args->snapdir_name) == 0) {
+		struct inode *inode = ceph_get_snapdir(parent);
+		dout("ENOENT on snapdir %p '%.*s', linking to snapdir %p\n",
+		     dentry, dentry->d_name.len, dentry->d_name.name, inode);
+		BUG_ON(!d_unhashed(dentry));
+		d_add(dentry, inode);
+		err = 0;
+	}
+
+	if (err == -ENOENT) {
+		/* no trace? */
+		err = 0;
+		if (!req->r_reply_info.head->is_dentry) {
+			dout("ENOENT and no trace, dentry %p inode %p\n",
+			     dentry, dentry->d_inode);
+			if (dentry->d_inode) {
+				d_drop(dentry);
+				err = -ENOENT;
+			} else {
+				d_add(dentry, NULL);
+			}
+		}
+	}
+	if (err)
+		dentry = ERR_PTR(err);
+	else if (dentry != req->r_dentry)
+		dentry = dget(req->r_dentry);   /* we got spliced */
+	else
+		dentry = NULL;
+	return dentry;
+}
+
+static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry)
+{
+	return ceph_ino(inode) == CEPH_INO_ROOT &&
+		strncmp(dentry->d_name.name, ".ceph", 5) == 0;
+}
+
+/*
+ * Look up a single dir entry.  If there is a lookup intent, inform
+ * the MDS so that it gets our 'caps wanted' value in a single op.
+ */
+static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
+				  struct nameidata *nd)
+{
+	struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
+	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_mds_request *req;
+	int op;
+	int err;
+
+	dout("lookup %p dentry %p '%.*s'\n",
+	     dir, dentry, dentry->d_name.len, dentry->d_name.name);
+
+	if (dentry->d_name.len > NAME_MAX)
+		return ERR_PTR(-ENAMETOOLONG);
+
+	err = ceph_init_dentry(dentry);
+	if (err < 0)
+		return ERR_PTR(err);
+
+	/* open (but not create!) intent? */
+	if (nd &&
+	    (nd->flags & LOOKUP_OPEN) &&
+	    (nd->flags & LOOKUP_CONTINUE) == 0 && /* only open last component */
+	    !(nd->intent.open.flags & O_CREAT)) {
+		int mode = nd->intent.open.create_mode & ~current->fs->umask;
+		return ceph_lookup_open(dir, dentry, nd, mode, 1);
+	}
+
+	/* can we conclude ENOENT locally? */
+	if (dentry->d_inode == NULL) {
+		struct ceph_inode_info *ci = ceph_inode(dir);
+		struct ceph_dentry_info *di = ceph_dentry(dentry);
+
+		spin_lock(&dir->i_lock);
+		dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags);
+		if (strncmp(dentry->d_name.name,
+			    client->mount_args->snapdir_name,
+			    dentry->d_name.len) &&
+		    !is_root_ceph_dentry(dir, dentry) &&
+		    (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
+		    (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) {
+			spin_unlock(&dir->i_lock);
+			dout(" dir %p complete, -ENOENT\n", dir);
+			d_add(dentry, NULL);
+			di->lease_shared_gen = ci->i_shared_gen;
+			return NULL;
+		}
+		spin_unlock(&dir->i_lock);
+	}
+
+	op = ceph_snap(dir) == CEPH_SNAPDIR ?
+		CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
+	req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
+	if (IS_ERR(req))
+		return ERR_CAST(req);
+	req->r_dentry = dget(dentry);
+	req->r_num_caps = 2;
+	/* we only need inode linkage */
+	req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
+	req->r_locked_dir = dir;
+	err = ceph_mdsc_do_request(mdsc, NULL, req);
+	dentry = ceph_finish_lookup(req, dentry, err);
+	ceph_mdsc_put_request(req);  /* will dput(dentry) */
+	dout("lookup result=%p\n", dentry);
+	return dentry;
+}
+
+/*
+ * If we do a create but get no trace back from the MDS, follow up with
+ * a lookup (the VFS expects us to link up the provided dentry).
+ */
+int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry)
+{
+	struct dentry *result = ceph_lookup(dir, dentry, NULL);
+
+	if (result && !IS_ERR(result)) {
+		/*
+		 * We created the item, then did a lookup, and found
+		 * it was already linked to another inode we already
+		 * had in our cache (and thus got spliced).  Link our
+		 * dentry to that inode, but don't hash it, just in
+		 * case the VFS wants to dereference it.
+		 */
+		BUG_ON(!result->d_inode);
+		d_instantiate(dentry, result->d_inode);
+		return 0;
+	}
+	return PTR_ERR(result);
+}
+
+static int ceph_mknod(struct inode *dir, struct dentry *dentry,
+		      int mode, dev_t rdev)
+{
+	struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
+	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_mds_request *req;
+	int err;
+
+	if (ceph_snap(dir) != CEPH_NOSNAP)
+		return -EROFS;
+
+	dout("mknod in dir %p dentry %p mode 0%o rdev %d\n",
+	     dir, dentry, mode, rdev);
+	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_MKNOD, USE_AUTH_MDS);
+	if (IS_ERR(req)) {
+		d_drop(dentry);
+		return PTR_ERR(req);
+	}
+	req->r_dentry = dget(dentry);
+	req->r_num_caps = 2;
+	req->r_locked_dir = dir;
+	req->r_args.mknod.mode = cpu_to_le32(mode);
+	req->r_args.mknod.rdev = cpu_to_le32(rdev);
+	req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
+	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
+	err = ceph_mdsc_do_request(mdsc, dir, req);
+	if (!err && !req->r_reply_info.head->is_dentry)
+		err = ceph_handle_notrace_create(dir, dentry);
+	ceph_mdsc_put_request(req);
+	if (err)
+		d_drop(dentry);
+	return err;
+}
+
+static int ceph_create(struct inode *dir, struct dentry *dentry, int mode,
+		       struct nameidata *nd)
+{
+	dout("create in dir %p dentry %p name '%.*s'\n",
+	     dir, dentry, dentry->d_name.len, dentry->d_name.name);
+
+	if (ceph_snap(dir) != CEPH_NOSNAP)
+		return -EROFS;
+
+	if (nd) {
+		BUG_ON((nd->flags & LOOKUP_OPEN) == 0);
+		dentry = ceph_lookup_open(dir, dentry, nd, mode, 0);
+		/* hrm, what should i do here if we get aliased? */
+		if (IS_ERR(dentry))
+			return PTR_ERR(dentry);
+		return 0;
+	}
+
+	/* fall back to mknod */
+	return ceph_mknod(dir, dentry, (mode & ~S_IFMT) | S_IFREG, 0);
+}
+
+static int ceph_symlink(struct inode *dir, struct dentry *dentry,
+			    const char *dest)
+{
+	struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
+	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_mds_request *req;
+	int err;
+
+	if (ceph_snap(dir) != CEPH_NOSNAP)
+		return -EROFS;
+
+	dout("symlink in dir %p dentry %p to '%s'\n", dir, dentry, dest);
+	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SYMLINK, USE_AUTH_MDS);
+	if (IS_ERR(req)) {
+		d_drop(dentry);
+		return PTR_ERR(req);
+	}
+	req->r_dentry = dget(dentry);
+	req->r_num_caps = 2;
+	req->r_path2 = kstrdup(dest, GFP_NOFS);
+	req->r_locked_dir = dir;
+	req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
+	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
+	err = ceph_mdsc_do_request(mdsc, dir, req);
+	if (!err && !req->r_reply_info.head->is_dentry)
+		err = ceph_handle_notrace_create(dir, dentry);
+	ceph_mdsc_put_request(req);
+	if (err)
+		d_drop(dentry);
+	return err;
+}
+
+static int ceph_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+	struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
+	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_mds_request *req;
+	int err = -EROFS;
+	int op;
+
+	if (ceph_snap(dir) == CEPH_SNAPDIR) {
+		/* mkdir .snap/foo is a MKSNAP */
+		op = CEPH_MDS_OP_MKSNAP;
+		dout("mksnap dir %p snap '%.*s' dn %p\n", dir,
+		     dentry->d_name.len, dentry->d_name.name, dentry);
+	} else if (ceph_snap(dir) == CEPH_NOSNAP) {
+		dout("mkdir dir %p dn %p mode 0%o\n", dir, dentry, mode);
+		op = CEPH_MDS_OP_MKDIR;
+	} else {
+		goto out;
+	}
+	req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
+	if (IS_ERR(req)) {
+		err = PTR_ERR(req);
+		goto out;
+	}
+
+	req->r_dentry = dget(dentry);
+	req->r_num_caps = 2;
+	req->r_locked_dir = dir;
+	req->r_args.mkdir.mode = cpu_to_le32(mode);
+	req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
+	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
+	err = ceph_mdsc_do_request(mdsc, dir, req);
+	if (!err && !req->r_reply_info.head->is_dentry)
+		err = ceph_handle_notrace_create(dir, dentry);
+	ceph_mdsc_put_request(req);
+out:
+	if (err < 0)
+		d_drop(dentry);
+	return err;
+}
+
+static int ceph_link(struct dentry *old_dentry, struct inode *dir,
+		     struct dentry *dentry)
+{
+	struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
+	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_mds_request *req;
+	int err;
+
+	if (ceph_snap(dir) != CEPH_NOSNAP)
+		return -EROFS;
+
+	dout("link in dir %p old_dentry %p dentry %p\n", dir,
+	     old_dentry, dentry);
+	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LINK, USE_AUTH_MDS);
+	if (IS_ERR(req)) {
+		d_drop(dentry);
+		return PTR_ERR(req);
+	}
+	req->r_dentry = dget(dentry);
+	req->r_num_caps = 2;
+	req->r_old_dentry = dget(old_dentry); /* or inode? hrm. */
+	req->r_locked_dir = dir;
+	req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
+	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
+	err = ceph_mdsc_do_request(mdsc, dir, req);
+	if (err)
+		d_drop(dentry);
+	else if (!req->r_reply_info.head->is_dentry)
+		d_instantiate(dentry, igrab(old_dentry->d_inode));
+	ceph_mdsc_put_request(req);
+	return err;
+}
+
+/*
+ * For a soon-to-be unlinked file, drop the AUTH_RDCACHE caps.  If it
+ * looks like the link count will hit 0, drop any other caps (other
+ * than PIN) we don't specifically want (due to the file still being
+ * open).
+ */
+static int drop_caps_for_unlink(struct inode *inode)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
+
+	spin_lock(&inode->i_lock);
+	if (inode->i_nlink == 1) {
+		drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN);
+		ci->i_ceph_flags |= CEPH_I_NODELAY;
+	}
+	spin_unlock(&inode->i_lock);
+	return drop;
+}
+
+/*
+ * rmdir and unlink are differ only by the metadata op code
+ */
+static int ceph_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
+	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct inode *inode = dentry->d_inode;
+	struct ceph_mds_request *req;
+	int err = -EROFS;
+	int op;
+
+	if (ceph_snap(dir) == CEPH_SNAPDIR) {
+		/* rmdir .snap/foo is RMSNAP */
+		dout("rmsnap dir %p '%.*s' dn %p\n", dir, dentry->d_name.len,
+		     dentry->d_name.name, dentry);
+		op = CEPH_MDS_OP_RMSNAP;
+	} else if (ceph_snap(dir) == CEPH_NOSNAP) {
+		dout("unlink/rmdir dir %p dn %p inode %p\n",
+		     dir, dentry, inode);
+		op = ((dentry->d_inode->i_mode & S_IFMT) == S_IFDIR) ?
+			CEPH_MDS_OP_RMDIR : CEPH_MDS_OP_UNLINK;
+	} else
+		goto out;
+	req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
+	if (IS_ERR(req)) {
+		err = PTR_ERR(req);
+		goto out;
+	}
+	req->r_dentry = dget(dentry);
+	req->r_num_caps = 2;
+	req->r_locked_dir = dir;
+	req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
+	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
+	req->r_inode_drop = drop_caps_for_unlink(inode);
+	err = ceph_mdsc_do_request(mdsc, dir, req);
+	if (!err && !req->r_reply_info.head->is_dentry)
+		d_delete(dentry);
+	ceph_mdsc_put_request(req);
+out:
+	return err;
+}
+
+static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
+		       struct inode *new_dir, struct dentry *new_dentry)
+{
+	struct ceph_client *client = ceph_sb_to_client(old_dir->i_sb);
+	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_mds_request *req;
+	int err;
+
+	if (ceph_snap(old_dir) != ceph_snap(new_dir))
+		return -EXDEV;
+	if (ceph_snap(old_dir) != CEPH_NOSNAP ||
+	    ceph_snap(new_dir) != CEPH_NOSNAP)
+		return -EROFS;
+	dout("rename dir %p dentry %p to dir %p dentry %p\n",
+	     old_dir, old_dentry, new_dir, new_dentry);
+	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RENAME, USE_AUTH_MDS);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+	req->r_dentry = dget(new_dentry);
+	req->r_num_caps = 2;
+	req->r_old_dentry = dget(old_dentry);
+	req->r_locked_dir = new_dir;
+	req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED;
+	req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL;
+	req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
+	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
+	/* release LINK_RDCACHE on source inode (mds will lock it) */
+	req->r_old_inode_drop = CEPH_CAP_LINK_SHARED;
+	if (new_dentry->d_inode)
+		req->r_inode_drop = drop_caps_for_unlink(new_dentry->d_inode);
+	err = ceph_mdsc_do_request(mdsc, old_dir, req);
+	if (!err && !req->r_reply_info.head->is_dentry) {
+		/*
+		 * Normally d_move() is done by fill_trace (called by
+		 * do_request, above).  If there is no trace, we need
+		 * to do it here.
+		 */
+
+		/* d_move screws up d_subdirs order */
+		ceph_i_clear(new_dir, CEPH_I_COMPLETE);
+
+		d_move(old_dentry, new_dentry);
+
+		/* ensure target dentry is invalidated, despite
+		   rehashing bug in vfs_rename_dir */
+		ceph_invalidate_dentry_lease(new_dentry);
+	}
+	ceph_mdsc_put_request(req);
+	return err;
+}
+
+/*
+ * Ensure a dentry lease will no longer revalidate.
+ */
+void ceph_invalidate_dentry_lease(struct dentry *dentry)
+{
+	spin_lock(&dentry->d_lock);
+	dentry->d_time = jiffies;
+	ceph_dentry(dentry)->lease_shared_gen = 0;
+	spin_unlock(&dentry->d_lock);
+}
+
+/*
+ * Check if dentry lease is valid.  If not, delete the lease.  Try to
+ * renew if the least is more than half up.
+ */
+static int dentry_lease_is_valid(struct dentry *dentry)
+{
+	struct ceph_dentry_info *di;
+	struct ceph_mds_session *s;
+	int valid = 0;
+	u32 gen;
+	unsigned long ttl;
+	struct ceph_mds_session *session = NULL;
+	struct inode *dir = NULL;
+	u32 seq = 0;
+
+	spin_lock(&dentry->d_lock);
+	di = ceph_dentry(dentry);
+	if (di && di->lease_session) {
+		s = di->lease_session;
+		spin_lock(&s->s_cap_lock);
+		gen = s->s_cap_gen;
+		ttl = s->s_cap_ttl;
+		spin_unlock(&s->s_cap_lock);
+
+		if (di->lease_gen == gen &&
+		    time_before(jiffies, dentry->d_time) &&
+		    time_before(jiffies, ttl)) {
+			valid = 1;
+			if (di->lease_renew_after &&
+			    time_after(jiffies, di->lease_renew_after)) {
+				/* we should renew */
+				dir = dentry->d_parent->d_inode;
+				session = ceph_get_mds_session(s);
+				seq = di->lease_seq;
+				di->lease_renew_after = 0;
+				di->lease_renew_from = jiffies;
+			}
+		}
+	}
+	spin_unlock(&dentry->d_lock);
+
+	if (session) {
+		ceph_mdsc_lease_send_msg(session, dir, dentry,
+					 CEPH_MDS_LEASE_RENEW, seq);
+		ceph_put_mds_session(session);
+	}
+	dout("dentry_lease_is_valid - dentry %p = %d\n", dentry, valid);
+	return valid;
+}
+
+/*
+ * Check if directory-wide content lease/cap is valid.
+ */
+static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
+{
+	struct ceph_inode_info *ci = ceph_inode(dir);
+	struct ceph_dentry_info *di = ceph_dentry(dentry);
+	int valid = 0;
+
+	spin_lock(&dir->i_lock);
+	if (ci->i_shared_gen == di->lease_shared_gen)
+		valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1);
+	spin_unlock(&dir->i_lock);
+	dout("dir_lease_is_valid dir %p v%u dentry %p v%u = %d\n",
+	     dir, (unsigned)ci->i_shared_gen, dentry,
+	     (unsigned)di->lease_shared_gen, valid);
+	return valid;
+}
+
+/*
+ * Check if cached dentry can be trusted.
+ */
+static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd)
+{
+	struct inode *dir = dentry->d_parent->d_inode;
+
+	dout("d_revalidate %p '%.*s' inode %p offset %lld\n", dentry,
+	     dentry->d_name.len, dentry->d_name.name, dentry->d_inode,
+	     ceph_dentry(dentry)->offset);
+
+	/* always trust cached snapped dentries, snapdir dentry */
+	if (ceph_snap(dir) != CEPH_NOSNAP) {
+		dout("d_revalidate %p '%.*s' inode %p is SNAPPED\n", dentry,
+		     dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
+		goto out_touch;
+	}
+	if (dentry->d_inode && ceph_snap(dentry->d_inode) == CEPH_SNAPDIR)
+		goto out_touch;
+
+	if (dentry_lease_is_valid(dentry) ||
+	    dir_lease_is_valid(dir, dentry))
+		goto out_touch;
+
+	dout("d_revalidate %p invalid\n", dentry);
+	d_drop(dentry);
+	return 0;
+out_touch:
+	ceph_dentry_lru_touch(dentry);
+	return 1;
+}
+
+/*
+ * When a dentry is released, clear the dir I_COMPLETE if it was part
+ * of the current dir gen or if this is in the snapshot namespace.
+ */
+static void ceph_dentry_release(struct dentry *dentry)
+{
+	struct ceph_dentry_info *di = ceph_dentry(dentry);
+	struct inode *parent_inode = dentry->d_parent->d_inode;
+	u64 snapid = ceph_snap(parent_inode);
+
+	dout("dentry_release %p parent %p\n", dentry, parent_inode);
+
+	if (parent_inode && snapid != CEPH_SNAPDIR) {
+		struct ceph_inode_info *ci = ceph_inode(parent_inode);
+
+		spin_lock(&parent_inode->i_lock);
+		if (ci->i_shared_gen == di->lease_shared_gen ||
+		    snapid <= CEPH_MAXSNAP) {
+			dout(" clearing %p complete (d_release)\n",
+			     parent_inode);
+			ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
+			ci->i_release_count++;
+		}
+		spin_unlock(&parent_inode->i_lock);
+	}
+	if (di) {
+		ceph_dentry_lru_del(dentry);
+		if (di->lease_session)
+			ceph_put_mds_session(di->lease_session);
+		kmem_cache_free(ceph_dentry_cachep, di);
+		dentry->d_fsdata = NULL;
+	}
+}
+
+static int ceph_snapdir_d_revalidate(struct dentry *dentry,
+					  struct nameidata *nd)
+{
+	/*
+	 * Eventually, we'll want to revalidate snapped metadata
+	 * too... probably...
+	 */
+	return 1;
+}
+
+
+
+/*
+ * read() on a dir.  This weird interface hack only works if mounted
+ * with '-o dirstat'.
+ */
+static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
+			     loff_t *ppos)
+{
+	struct ceph_file_info *cf = file->private_data;
+	struct inode *inode = file->f_dentry->d_inode;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	int left;
+
+	if (!ceph_test_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT))
+		return -EISDIR;
+
+	if (!cf->dir_info) {
+		cf->dir_info = kmalloc(1024, GFP_NOFS);
+		if (!cf->dir_info)
+			return -ENOMEM;
+		cf->dir_info_len =
+			sprintf(cf->dir_info,
+				"entries:   %20lld\n"
+				" files:    %20lld\n"
+				" subdirs:  %20lld\n"
+				"rentries:  %20lld\n"
+				" rfiles:   %20lld\n"
+				" rsubdirs: %20lld\n"
+				"rbytes:    %20lld\n"
+				"rctime:    %10ld.%09ld\n",
+				ci->i_files + ci->i_subdirs,
+				ci->i_files,
+				ci->i_subdirs,
+				ci->i_rfiles + ci->i_rsubdirs,
+				ci->i_rfiles,
+				ci->i_rsubdirs,
+				ci->i_rbytes,
+				(long)ci->i_rctime.tv_sec,
+				(long)ci->i_rctime.tv_nsec);
+	}
+
+	if (*ppos >= cf->dir_info_len)
+		return 0;
+	size = min_t(unsigned, size, cf->dir_info_len-*ppos);
+	left = copy_to_user(buf, cf->dir_info + *ppos, size);
+	if (left == size)
+		return -EFAULT;
+	*ppos += (size - left);
+	return size - left;
+}
+
+/*
+ * an fsync() on a dir will wait for any uncommitted directory
+ * operations to commit.
+ */
+static int ceph_dir_fsync(struct file *file, int datasync)
+{
+	struct inode *inode = file->f_path.dentry->d_inode;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct list_head *head = &ci->i_unsafe_dirops;
+	struct ceph_mds_request *req;
+	u64 last_tid;
+	int ret = 0;
+
+	dout("dir_fsync %p\n", inode);
+	spin_lock(&ci->i_unsafe_lock);
+	if (list_empty(head))
+		goto out;
+
+	req = list_entry(head->prev,
+			 struct ceph_mds_request, r_unsafe_dir_item);
+	last_tid = req->r_tid;
+
+	do {
+		ceph_mdsc_get_request(req);
+		spin_unlock(&ci->i_unsafe_lock);
+		dout("dir_fsync %p wait on tid %llu (until %llu)\n",
+		     inode, req->r_tid, last_tid);
+		if (req->r_timeout) {
+			ret = wait_for_completion_timeout(
+				&req->r_safe_completion, req->r_timeout);
+			if (ret > 0)
+				ret = 0;
+			else if (ret == 0)
+				ret = -EIO;  /* timed out */
+		} else {
+			wait_for_completion(&req->r_safe_completion);
+		}
+		spin_lock(&ci->i_unsafe_lock);
+		ceph_mdsc_put_request(req);
+
+		if (ret || list_empty(head))
+			break;
+		req = list_entry(head->next,
+				 struct ceph_mds_request, r_unsafe_dir_item);
+	} while (req->r_tid < last_tid);
+out:
+	spin_unlock(&ci->i_unsafe_lock);
+	return ret;
+}
+
+/*
+ * We maintain a private dentry LRU.
+ *
+ * FIXME: this needs to be changed to a per-mds lru to be useful.
+ */
+void ceph_dentry_lru_add(struct dentry *dn)
+{
+	struct ceph_dentry_info *di = ceph_dentry(dn);
+	struct ceph_mds_client *mdsc;
+
+	dout("dentry_lru_add %p %p '%.*s'\n", di, dn,
+	     dn->d_name.len, dn->d_name.name);
+	if (di) {
+		mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
+		spin_lock(&mdsc->dentry_lru_lock);
+		list_add_tail(&di->lru, &mdsc->dentry_lru);
+		mdsc->num_dentry++;
+		spin_unlock(&mdsc->dentry_lru_lock);
+	}
+}
+
+void ceph_dentry_lru_touch(struct dentry *dn)
+{
+	struct ceph_dentry_info *di = ceph_dentry(dn);
+	struct ceph_mds_client *mdsc;
+
+	dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn,
+	     dn->d_name.len, dn->d_name.name, di->offset);
+	if (di) {
+		mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
+		spin_lock(&mdsc->dentry_lru_lock);
+		list_move_tail(&di->lru, &mdsc->dentry_lru);
+		spin_unlock(&mdsc->dentry_lru_lock);
+	}
+}
+
+void ceph_dentry_lru_del(struct dentry *dn)
+{
+	struct ceph_dentry_info *di = ceph_dentry(dn);
+	struct ceph_mds_client *mdsc;
+
+	dout("dentry_lru_del %p %p '%.*s'\n", di, dn,
+	     dn->d_name.len, dn->d_name.name);
+	if (di) {
+		mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
+		spin_lock(&mdsc->dentry_lru_lock);
+		list_del_init(&di->lru);
+		mdsc->num_dentry--;
+		spin_unlock(&mdsc->dentry_lru_lock);
+	}
+}
+
+const struct file_operations ceph_dir_fops = {
+	.read = ceph_read_dir,
+	.readdir = ceph_readdir,
+	.llseek = ceph_dir_llseek,
+	.open = ceph_open,
+	.release = ceph_release,
+	.unlocked_ioctl = ceph_ioctl,
+	.fsync = ceph_dir_fsync,
+};
+
+const struct inode_operations ceph_dir_iops = {
+	.lookup = ceph_lookup,
+	.permission = ceph_permission,
+	.getattr = ceph_getattr,
+	.setattr = ceph_setattr,
+	.setxattr = ceph_setxattr,
+	.getxattr = ceph_getxattr,
+	.listxattr = ceph_listxattr,
+	.removexattr = ceph_removexattr,
+	.mknod = ceph_mknod,
+	.symlink = ceph_symlink,
+	.mkdir = ceph_mkdir,
+	.link = ceph_link,
+	.unlink = ceph_unlink,
+	.rmdir = ceph_unlink,
+	.rename = ceph_rename,
+	.create = ceph_create,
+};
+
+struct dentry_operations ceph_dentry_ops = {
+	.d_revalidate = ceph_d_revalidate,
+	.d_release = ceph_dentry_release,
+};
+
+struct dentry_operations ceph_snapdir_dentry_ops = {
+	.d_revalidate = ceph_snapdir_d_revalidate,
+	.d_release = ceph_dentry_release,
+};
+
+struct dentry_operations ceph_snap_dentry_ops = {
+	.d_release = ceph_dentry_release,
+};
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
new file mode 100644
index 000000000000..4480cb1c63e7
--- /dev/null
+++ b/fs/ceph/export.c
@@ -0,0 +1,224 @@
+#include "ceph_debug.h"
+
+#include <linux/exportfs.h>
+#include <linux/slab.h>
+#include <asm/unaligned.h>
+
+#include "super.h"
+
+/*
+ * NFS export support
+ *
+ * NFS re-export of a ceph mount is, at present, only semireliable.
+ * The basic issue is that the Ceph architectures doesn't lend itself
+ * well to generating filehandles that will remain valid forever.
+ *
+ * So, we do our best.  If you're lucky, your inode will be in the
+ * client's cache.  If it's not, and you have a connectable fh, then
+ * the MDS server may be able to find it for you.  Otherwise, you get
+ * ESTALE.
+ *
+ * There are ways to this more reliable, but in the non-connectable fh
+ * case, we won't every work perfectly, and in the connectable case,
+ * some changes are needed on the MDS side to work better.
+ */
+
+/*
+ * Basic fh
+ */
+struct ceph_nfs_fh {
+	u64 ino;
+} __attribute__ ((packed));
+
+/*
+ * Larger 'connectable' fh that includes parent ino and name hash.
+ * Use this whenever possible, as it works more reliably.
+ */
+struct ceph_nfs_confh {
+	u64 ino, parent_ino;
+	u32 parent_name_hash;
+} __attribute__ ((packed));
+
+static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len,
+			  int connectable)
+{
+	struct ceph_nfs_fh *fh = (void *)rawfh;
+	struct ceph_nfs_confh *cfh = (void *)rawfh;
+	struct dentry *parent = dentry->d_parent;
+	struct inode *inode = dentry->d_inode;
+	int type;
+
+	/* don't re-export snaps */
+	if (ceph_snap(inode) != CEPH_NOSNAP)
+		return -EINVAL;
+
+	if (*max_len >= sizeof(*cfh)) {
+		dout("encode_fh %p connectable\n", dentry);
+		cfh->ino = ceph_ino(dentry->d_inode);
+		cfh->parent_ino = ceph_ino(parent->d_inode);
+		cfh->parent_name_hash = parent->d_name.hash;
+		*max_len = sizeof(*cfh);
+		type = 2;
+	} else if (*max_len > sizeof(*fh)) {
+		if (connectable)
+			return -ENOSPC;
+		dout("encode_fh %p\n", dentry);
+		fh->ino = ceph_ino(dentry->d_inode);
+		*max_len = sizeof(*fh);
+		type = 1;
+	} else {
+		return -ENOSPC;
+	}
+	return type;
+}
+
+/*
+ * convert regular fh to dentry
+ *
+ * FIXME: we should try harder by querying the mds for the ino.
+ */
+static struct dentry *__fh_to_dentry(struct super_block *sb,
+				     struct ceph_nfs_fh *fh)
+{
+	struct inode *inode;
+	struct dentry *dentry;
+	struct ceph_vino vino;
+	int err;
+
+	dout("__fh_to_dentry %llx\n", fh->ino);
+	vino.ino = fh->ino;
+	vino.snap = CEPH_NOSNAP;
+	inode = ceph_find_inode(sb, vino);
+	if (!inode)
+		return ERR_PTR(-ESTALE);
+
+	dentry = d_obtain_alias(inode);
+	if (IS_ERR(dentry)) {
+		pr_err("fh_to_dentry %llx -- inode %p but ENOMEM\n",
+		       fh->ino, inode);
+		iput(inode);
+		return dentry;
+	}
+	err = ceph_init_dentry(dentry);
+
+	if (err < 0) {
+		iput(inode);
+		return ERR_PTR(err);
+	}
+	dout("__fh_to_dentry %llx %p dentry %p\n", fh->ino, inode, dentry);
+	return dentry;
+}
+
+/*
+ * convert connectable fh to dentry
+ */
+static struct dentry *__cfh_to_dentry(struct super_block *sb,
+				      struct ceph_nfs_confh *cfh)
+{
+	struct ceph_mds_client *mdsc = &ceph_sb_to_client(sb)->mdsc;
+	struct inode *inode;
+	struct dentry *dentry;
+	struct ceph_vino vino;
+	int err;
+
+	dout("__cfh_to_dentry %llx (%llx/%x)\n",
+	     cfh->ino, cfh->parent_ino, cfh->parent_name_hash);
+
+	vino.ino = cfh->ino;
+	vino.snap = CEPH_NOSNAP;
+	inode = ceph_find_inode(sb, vino);
+	if (!inode) {
+		struct ceph_mds_request *req;
+
+		req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPHASH,
+					       USE_ANY_MDS);
+		if (IS_ERR(req))
+			return ERR_CAST(req);
+
+		req->r_ino1 = vino;
+		req->r_ino2.ino = cfh->parent_ino;
+		req->r_ino2.snap = CEPH_NOSNAP;
+		req->r_path2 = kmalloc(16, GFP_NOFS);
+		snprintf(req->r_path2, 16, "%d", cfh->parent_name_hash);
+		req->r_num_caps = 1;
+		err = ceph_mdsc_do_request(mdsc, NULL, req);
+		ceph_mdsc_put_request(req);
+		inode = ceph_find_inode(sb, vino);
+		if (!inode)
+			return ERR_PTR(err ? err : -ESTALE);
+	}
+
+	dentry = d_obtain_alias(inode);
+	if (IS_ERR(dentry)) {
+		pr_err("cfh_to_dentry %llx -- inode %p but ENOMEM\n",
+		       cfh->ino, inode);
+		iput(inode);
+		return dentry;
+	}
+	err = ceph_init_dentry(dentry);
+	if (err < 0) {
+		iput(inode);
+		return ERR_PTR(err);
+	}
+	dout("__cfh_to_dentry %llx %p dentry %p\n", cfh->ino, inode, dentry);
+	return dentry;
+}
+
+static struct dentry *ceph_fh_to_dentry(struct super_block *sb, struct fid *fid,
+					int fh_len, int fh_type)
+{
+	if (fh_type == 1)
+		return __fh_to_dentry(sb, (struct ceph_nfs_fh *)fid->raw);
+	else
+		return __cfh_to_dentry(sb, (struct ceph_nfs_confh *)fid->raw);
+}
+
+/*
+ * get parent, if possible.
+ *
+ * FIXME: we could do better by querying the mds to discover the
+ * parent.
+ */
+static struct dentry *ceph_fh_to_parent(struct super_block *sb,
+					 struct fid *fid,
+					int fh_len, int fh_type)
+{
+	struct ceph_nfs_confh *cfh = (void *)fid->raw;
+	struct ceph_vino vino;
+	struct inode *inode;
+	struct dentry *dentry;
+	int err;
+
+	if (fh_type == 1)
+		return ERR_PTR(-ESTALE);
+
+	pr_debug("fh_to_parent %llx/%d\n", cfh->parent_ino,
+		 cfh->parent_name_hash);
+
+	vino.ino = cfh->ino;
+	vino.snap = CEPH_NOSNAP;
+	inode = ceph_find_inode(sb, vino);
+	if (!inode)
+		return ERR_PTR(-ESTALE);
+
+	dentry = d_obtain_alias(inode);
+	if (IS_ERR(dentry)) {
+		pr_err("fh_to_parent %llx -- inode %p but ENOMEM\n",
+		       cfh->ino, inode);
+		iput(inode);
+		return dentry;
+	}
+	err = ceph_init_dentry(dentry);
+	if (err < 0) {
+		iput(inode);
+		return ERR_PTR(err);
+	}
+	dout("fh_to_parent %llx %p dentry %p\n", cfh->ino, inode, dentry);
+	return dentry;
+}
+
+const struct export_operations ceph_export_ops = {
+	.encode_fh = ceph_encode_fh,
+	.fh_to_dentry = ceph_fh_to_dentry,
+	.fh_to_parent = ceph_fh_to_parent,
+};
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
new file mode 100644
index 000000000000..7c08698fad3e
--- /dev/null
+++ b/fs/ceph/file.c
@@ -0,0 +1,938 @@
+#include "ceph_debug.h"
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/namei.h>
+#include <linux/writeback.h>
+
+#include "super.h"
+#include "mds_client.h"
+
+/*
+ * Ceph file operations
+ *
+ * Implement basic open/close functionality, and implement
+ * read/write.
+ *
+ * We implement three modes of file I/O:
+ *  - buffered uses the generic_file_aio_{read,write} helpers
+ *
+ *  - synchronous is used when there is multi-client read/write
+ *    sharing, avoids the page cache, and synchronously waits for an
+ *    ack from the OSD.
+ *
+ *  - direct io takes the variant of the sync path that references
+ *    user pages directly.
+ *
+ * fsync() flushes and waits on dirty pages, but just queues metadata
+ * for writeback: since the MDS can recover size and mtime there is no
+ * need to wait for MDS acknowledgement.
+ */
+
+
+/*
+ * Prepare an open request.  Preallocate ceph_cap to avoid an
+ * inopportune ENOMEM later.
+ */
+static struct ceph_mds_request *
+prepare_open_request(struct super_block *sb, int flags, int create_mode)
+{
+	struct ceph_client *client = ceph_sb_to_client(sb);
+	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_mds_request *req;
+	int want_auth = USE_ANY_MDS;
+	int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN;
+
+	if (flags & (O_WRONLY|O_RDWR|O_CREAT|O_TRUNC))
+		want_auth = USE_AUTH_MDS;
+
+	req = ceph_mdsc_create_request(mdsc, op, want_auth);
+	if (IS_ERR(req))
+		goto out;
+	req->r_fmode = ceph_flags_to_mode(flags);
+	req->r_args.open.flags = cpu_to_le32(flags);
+	req->r_args.open.mode = cpu_to_le32(create_mode);
+	req->r_args.open.preferred = cpu_to_le32(-1);
+out:
+	return req;
+}
+
+/*
+ * initialize private struct file data.
+ * if we fail, clean up by dropping fmode reference on the ceph_inode
+ */
+static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
+{
+	struct ceph_file_info *cf;
+	int ret = 0;
+
+	switch (inode->i_mode & S_IFMT) {
+	case S_IFREG:
+	case S_IFDIR:
+		dout("init_file %p %p 0%o (regular)\n", inode, file,
+		     inode->i_mode);
+		cf = kmem_cache_alloc(ceph_file_cachep, GFP_NOFS | __GFP_ZERO);
+		if (cf == NULL) {
+			ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
+			return -ENOMEM;
+		}
+		cf->fmode = fmode;
+		cf->next_offset = 2;
+		file->private_data = cf;
+		BUG_ON(inode->i_fop->release != ceph_release);
+		break;
+
+	case S_IFLNK:
+		dout("init_file %p %p 0%o (symlink)\n", inode, file,
+		     inode->i_mode);
+		ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
+		break;
+
+	default:
+		dout("init_file %p %p 0%o (special)\n", inode, file,
+		     inode->i_mode);
+		/*
+		 * we need to drop the open ref now, since we don't
+		 * have .release set to ceph_release.
+		 */
+		ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
+		BUG_ON(inode->i_fop->release == ceph_release);
+
+		/* call the proper open fop */
+		ret = inode->i_fop->open(inode, file);
+	}
+	return ret;
+}
+
+/*
+ * If the filp already has private_data, that means the file was
+ * already opened by intent during lookup, and we do nothing.
+ *
+ * If we already have the requisite capabilities, we can satisfy
+ * the open request locally (no need to request new caps from the
+ * MDS).  We do, however, need to inform the MDS (asynchronously)
+ * if our wanted caps set expands.
+ */
+int ceph_open(struct inode *inode, struct file *file)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_client *client = ceph_sb_to_client(inode->i_sb);
+	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_mds_request *req;
+	struct ceph_file_info *cf = file->private_data;
+	struct inode *parent_inode = file->f_dentry->d_parent->d_inode;
+	int err;
+	int flags, fmode, wanted;
+
+	if (cf) {
+		dout("open file %p is already opened\n", file);
+		return 0;
+	}
+
+	/* filter out O_CREAT|O_EXCL; vfs did that already.  yuck. */
+	flags = file->f_flags & ~(O_CREAT|O_EXCL);
+	if (S_ISDIR(inode->i_mode))
+		flags = O_DIRECTORY;  /* mds likes to know */
+
+	dout("open inode %p ino %llx.%llx file %p flags %d (%d)\n", inode,
+	     ceph_vinop(inode), file, flags, file->f_flags);
+	fmode = ceph_flags_to_mode(flags);
+	wanted = ceph_caps_for_mode(fmode);
+
+	/* snapped files are read-only */
+	if (ceph_snap(inode) != CEPH_NOSNAP && (file->f_mode & FMODE_WRITE))
+		return -EROFS;
+
+	/* trivially open snapdir */
+	if (ceph_snap(inode) == CEPH_SNAPDIR) {
+		spin_lock(&inode->i_lock);
+		__ceph_get_fmode(ci, fmode);
+		spin_unlock(&inode->i_lock);
+		return ceph_init_file(inode, file, fmode);
+	}
+
+	/*
+	 * No need to block if we have any caps.  Update wanted set
+	 * asynchronously.
+	 */
+	spin_lock(&inode->i_lock);
+	if (__ceph_is_any_real_caps(ci)) {
+		int mds_wanted = __ceph_caps_mds_wanted(ci);
+		int issued = __ceph_caps_issued(ci, NULL);
+
+		dout("open %p fmode %d want %s issued %s using existing\n",
+		     inode, fmode, ceph_cap_string(wanted),
+		     ceph_cap_string(issued));
+		__ceph_get_fmode(ci, fmode);
+		spin_unlock(&inode->i_lock);
+
+		/* adjust wanted? */
+		if ((issued & wanted) != wanted &&
+		    (mds_wanted & wanted) != wanted &&
+		    ceph_snap(inode) != CEPH_SNAPDIR)
+			ceph_check_caps(ci, 0, NULL);
+
+		return ceph_init_file(inode, file, fmode);
+	} else if (ceph_snap(inode) != CEPH_NOSNAP &&
+		   (ci->i_snap_caps & wanted) == wanted) {
+		__ceph_get_fmode(ci, fmode);
+		spin_unlock(&inode->i_lock);
+		return ceph_init_file(inode, file, fmode);
+	}
+	spin_unlock(&inode->i_lock);
+
+	dout("open fmode %d wants %s\n", fmode, ceph_cap_string(wanted));
+	req = prepare_open_request(inode->i_sb, flags, 0);
+	if (IS_ERR(req)) {
+		err = PTR_ERR(req);
+		goto out;
+	}
+	req->r_inode = igrab(inode);
+	req->r_num_caps = 1;
+	err = ceph_mdsc_do_request(mdsc, parent_inode, req);
+	if (!err)
+		err = ceph_init_file(inode, file, req->r_fmode);
+	ceph_mdsc_put_request(req);
+	dout("open result=%d on %llx.%llx\n", err, ceph_vinop(inode));
+out:
+	return err;
+}
+
+
+/*
+ * Do a lookup + open with a single request.
+ *
+ * If this succeeds, but some subsequent check in the vfs
+ * may_open() fails, the struct *file gets cleaned up (i.e.
+ * ceph_release gets called).  So fear not!
+ */
+/*
+ * flags
+ *  path_lookup_open   -> LOOKUP_OPEN
+ *  path_lookup_create -> LOOKUP_OPEN|LOOKUP_CREATE
+ */
+struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
+				struct nameidata *nd, int mode,
+				int locked_dir)
+{
+	struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
+	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct file *file = nd->intent.open.file;
+	struct inode *parent_inode = get_dentry_parent_inode(file->f_dentry);
+	struct ceph_mds_request *req;
+	int err;
+	int flags = nd->intent.open.flags - 1;  /* silly vfs! */
+
+	dout("ceph_lookup_open dentry %p '%.*s' flags %d mode 0%o\n",
+	     dentry, dentry->d_name.len, dentry->d_name.name, flags, mode);
+
+	/* do the open */
+	req = prepare_open_request(dir->i_sb, flags, mode);
+	if (IS_ERR(req))
+		return ERR_CAST(req);
+	req->r_dentry = dget(dentry);
+	req->r_num_caps = 2;
+	if (flags & O_CREAT) {
+		req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
+		req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
+	}
+	req->r_locked_dir = dir;           /* caller holds dir->i_mutex */
+	err = ceph_mdsc_do_request(mdsc, parent_inode, req);
+	dentry = ceph_finish_lookup(req, dentry, err);
+	if (!err && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
+		err = ceph_handle_notrace_create(dir, dentry);
+	if (!err)
+		err = ceph_init_file(req->r_dentry->d_inode, file,
+				     req->r_fmode);
+	ceph_mdsc_put_request(req);
+	dout("ceph_lookup_open result=%p\n", dentry);
+	return dentry;
+}
+
+int ceph_release(struct inode *inode, struct file *file)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_file_info *cf = file->private_data;
+
+	dout("release inode %p file %p\n", inode, file);
+	ceph_put_fmode(ci, cf->fmode);
+	if (cf->last_readdir)
+		ceph_mdsc_put_request(cf->last_readdir);
+	kfree(cf->last_name);
+	kfree(cf->dir_info);
+	dput(cf->dentry);
+	kmem_cache_free(ceph_file_cachep, cf);
+
+	/* wake up anyone waiting for caps on this inode */
+	wake_up_all(&ci->i_cap_wq);
+	return 0;
+}
+
+/*
+ * build a vector of user pages
+ */
+static struct page **get_direct_page_vector(const char __user *data,
+					    int num_pages,
+					    loff_t off, size_t len)
+{
+	struct page **pages;
+	int rc;
+
+	pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS);
+	if (!pages)
+		return ERR_PTR(-ENOMEM);
+
+	down_read(&current->mm->mmap_sem);
+	rc = get_user_pages(current, current->mm, (unsigned long)data,
+			    num_pages, 0, 0, pages, NULL);
+	up_read(&current->mm->mmap_sem);
+	if (rc < 0)
+		goto fail;
+	return pages;
+
+fail:
+	kfree(pages);
+	return ERR_PTR(rc);
+}
+
+static void put_page_vector(struct page **pages, int num_pages)
+{
+	int i;
+
+	for (i = 0; i < num_pages; i++)
+		put_page(pages[i]);
+	kfree(pages);
+}
+
+void ceph_release_page_vector(struct page **pages, int num_pages)
+{
+	int i;
+
+	for (i = 0; i < num_pages; i++)
+		__free_pages(pages[i], 0);
+	kfree(pages);
+}
+
+/*
+ * allocate a vector new pages
+ */
+struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags)
+{
+	struct page **pages;
+	int i;
+
+	pages = kmalloc(sizeof(*pages) * num_pages, flags);
+	if (!pages)
+		return ERR_PTR(-ENOMEM);
+	for (i = 0; i < num_pages; i++) {
+		pages[i] = __page_cache_alloc(flags);
+		if (pages[i] == NULL) {
+			ceph_release_page_vector(pages, i);
+			return ERR_PTR(-ENOMEM);
+		}
+	}
+	return pages;
+}
+
+/*
+ * copy user data into a page vector
+ */
+static int copy_user_to_page_vector(struct page **pages,
+				    const char __user *data,
+				    loff_t off, size_t len)
+{
+	int i = 0;
+	int po = off & ~PAGE_CACHE_MASK;
+	int left = len;
+	int l, bad;
+
+	while (left > 0) {
+		l = min_t(int, PAGE_CACHE_SIZE-po, left);
+		bad = copy_from_user(page_address(pages[i]) + po, data, l);
+		if (bad == l)
+			return -EFAULT;
+		data += l - bad;
+		left -= l - bad;
+		po += l - bad;
+		if (po == PAGE_CACHE_SIZE) {
+			po = 0;
+			i++;
+		}
+	}
+	return len;
+}
+
+/*
+ * copy user data from a page vector into a user pointer
+ */
+static int copy_page_vector_to_user(struct page **pages, char __user *data,
+				    loff_t off, size_t len)
+{
+	int i = 0;
+	int po = off & ~PAGE_CACHE_MASK;
+	int left = len;
+	int l, bad;
+
+	while (left > 0) {
+		l = min_t(int, left, PAGE_CACHE_SIZE-po);
+		bad = copy_to_user(data, page_address(pages[i]) + po, l);
+		if (bad == l)
+			return -EFAULT;
+		data += l - bad;
+		left -= l - bad;
+		if (po) {
+			po += l - bad;
+			if (po == PAGE_CACHE_SIZE)
+				po = 0;
+		}
+		i++;
+	}
+	return len;
+}
+
+/*
+ * Zero an extent within a page vector.  Offset is relative to the
+ * start of the first page.
+ */
+static void zero_page_vector_range(int off, int len, struct page **pages)
+{
+	int i = off >> PAGE_CACHE_SHIFT;
+
+	off &= ~PAGE_CACHE_MASK;
+
+	dout("zero_page_vector_page %u~%u\n", off, len);
+
+	/* leading partial page? */
+	if (off) {
+		int end = min((int)PAGE_CACHE_SIZE, off + len);
+		dout("zeroing %d %p head from %d\n", i, pages[i],
+		     (int)off);
+		zero_user_segment(pages[i], off, end);
+		len -= (end - off);
+		i++;
+	}
+	while (len >= PAGE_CACHE_SIZE) {
+		dout("zeroing %d %p len=%d\n", i, pages[i], len);
+		zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE);
+		len -= PAGE_CACHE_SIZE;
+		i++;
+	}
+	/* trailing partial page? */
+	if (len) {
+		dout("zeroing %d %p tail to %d\n", i, pages[i], (int)len);
+		zero_user_segment(pages[i], 0, len);
+	}
+}
+
+
+/*
+ * Read a range of bytes striped over one or more objects.  Iterate over
+ * objects we stripe over.  (That's not atomic, but good enough for now.)
+ *
+ * If we get a short result from the OSD, check against i_size; we need to
+ * only return a short read to the caller if we hit EOF.
+ */
+static int striped_read(struct inode *inode,
+			u64 off, u64 len,
+			struct page **pages, int num_pages,
+			int *checkeof)
+{
+	struct ceph_client *client = ceph_inode_to_client(inode);
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	u64 pos, this_len;
+	int page_off = off & ~PAGE_CACHE_MASK; /* first byte's offset in page */
+	int left, pages_left;
+	int read;
+	struct page **page_pos;
+	int ret;
+	bool hit_stripe, was_short;
+
+	/*
+	 * we may need to do multiple reads.  not atomic, unfortunately.
+	 */
+	pos = off;
+	left = len;
+	page_pos = pages;
+	pages_left = num_pages;
+	read = 0;
+
+more:
+	this_len = left;
+	ret = ceph_osdc_readpages(&client->osdc, ceph_vino(inode),
+				  &ci->i_layout, pos, &this_len,
+				  ci->i_truncate_seq,
+				  ci->i_truncate_size,
+				  page_pos, pages_left);
+	hit_stripe = this_len < left;
+	was_short = ret >= 0 && ret < this_len;
+	if (ret == -ENOENT)
+		ret = 0;
+	dout("striped_read %llu~%u (read %u) got %d%s%s\n", pos, left, read,
+	     ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : "");
+
+	if (ret > 0) {
+		int didpages =
+			((pos & ~PAGE_CACHE_MASK) + ret) >> PAGE_CACHE_SHIFT;
+
+		if (read < pos - off) {
+			dout(" zero gap %llu to %llu\n", off + read, pos);
+			zero_page_vector_range(page_off + read,
+					       pos - off - read, pages);
+		}
+		pos += ret;
+		read = pos - off;
+		left -= ret;
+		page_pos += didpages;
+		pages_left -= didpages;
+
+		/* hit stripe? */
+		if (left && hit_stripe)
+			goto more;
+	}
+
+	if (was_short) {
+		/* was original extent fully inside i_size? */
+		if (pos + left <= inode->i_size) {
+			dout("zero tail\n");
+			zero_page_vector_range(page_off + read, len - read,
+					       pages);
+			read = len;
+			goto out;
+		}
+
+		/* check i_size */
+		*checkeof = 1;
+	}
+
+out:
+	if (ret >= 0)
+		ret = read;
+	dout("striped_read returns %d\n", ret);
+	return ret;
+}
+
+/*
+ * Completely synchronous read and write methods.  Direct from __user
+ * buffer to osd, or directly to user pages (if O_DIRECT).
+ *
+ * If the read spans object boundary, just do multiple reads.
+ */
+static ssize_t ceph_sync_read(struct file *file, char __user *data,
+			      unsigned len, loff_t *poff, int *checkeof)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	struct page **pages;
+	u64 off = *poff;
+	int num_pages = calc_pages_for(off, len);
+	int ret;
+
+	dout("sync_read on file %p %llu~%u %s\n", file, off, len,
+	     (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
+
+	if (file->f_flags & O_DIRECT) {
+		pages = get_direct_page_vector(data, num_pages, off, len);
+
+		/*
+		 * flush any page cache pages in this range.  this
+		 * will make concurrent normal and O_DIRECT io slow,
+		 * but it will at least behave sensibly when they are
+		 * in sequence.
+		 */
+	} else {
+		pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
+	}
+	if (IS_ERR(pages))
+		return PTR_ERR(pages);
+
+	ret = filemap_write_and_wait(inode->i_mapping);
+	if (ret < 0)
+		goto done;
+
+	ret = striped_read(inode, off, len, pages, num_pages, checkeof);
+
+	if (ret >= 0 && (file->f_flags & O_DIRECT) == 0)
+		ret = copy_page_vector_to_user(pages, data, off, ret);
+	if (ret >= 0)
+		*poff = off + ret;
+
+done:
+	if (file->f_flags & O_DIRECT)
+		put_page_vector(pages, num_pages);
+	else
+		ceph_release_page_vector(pages, num_pages);
+	dout("sync_read result %d\n", ret);
+	return ret;
+}
+
+/*
+ * Write commit callback, called if we requested both an ACK and
+ * ONDISK commit reply from the OSD.
+ */
+static void sync_write_commit(struct ceph_osd_request *req,
+			      struct ceph_msg *msg)
+{
+	struct ceph_inode_info *ci = ceph_inode(req->r_inode);
+
+	dout("sync_write_commit %p tid %llu\n", req, req->r_tid);
+	spin_lock(&ci->i_unsafe_lock);
+	list_del_init(&req->r_unsafe_item);
+	spin_unlock(&ci->i_unsafe_lock);
+	ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR);
+}
+
+/*
+ * Synchronous write, straight from __user pointer or user pages (if
+ * O_DIRECT).
+ *
+ * If write spans object boundary, just do multiple writes.  (For a
+ * correct atomic write, we should e.g. take write locks on all
+ * objects, rollback on failure, etc.)
+ */
+static ssize_t ceph_sync_write(struct file *file, const char __user *data,
+			       size_t left, loff_t *offset)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_client *client = ceph_inode_to_client(inode);
+	struct ceph_osd_request *req;
+	struct page **pages;
+	int num_pages;
+	long long unsigned pos;
+	u64 len;
+	int written = 0;
+	int flags;
+	int do_sync = 0;
+	int check_caps = 0;
+	int ret;
+	struct timespec mtime = CURRENT_TIME;
+
+	if (ceph_snap(file->f_dentry->d_inode) != CEPH_NOSNAP)
+		return -EROFS;
+
+	dout("sync_write on file %p %lld~%u %s\n", file, *offset,
+	     (unsigned)left, (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
+
+	if (file->f_flags & O_APPEND)
+		pos = i_size_read(inode);
+	else
+		pos = *offset;
+
+	ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left);
+	if (ret < 0)
+		return ret;
+
+	ret = invalidate_inode_pages2_range(inode->i_mapping,
+					    pos >> PAGE_CACHE_SHIFT,
+					    (pos + left) >> PAGE_CACHE_SHIFT);
+	if (ret < 0)
+		dout("invalidate_inode_pages2_range returned %d\n", ret);
+
+	flags = CEPH_OSD_FLAG_ORDERSNAP |
+		CEPH_OSD_FLAG_ONDISK |
+		CEPH_OSD_FLAG_WRITE;
+	if ((file->f_flags & (O_SYNC|O_DIRECT)) == 0)
+		flags |= CEPH_OSD_FLAG_ACK;
+	else
+		do_sync = 1;
+
+	/*
+	 * we may need to do multiple writes here if we span an object
+	 * boundary.  this isn't atomic, unfortunately.  :(
+	 */
+more:
+	len = left;
+	req = ceph_osdc_new_request(&client->osdc, &ci->i_layout,
+				    ceph_vino(inode), pos, &len,
+				    CEPH_OSD_OP_WRITE, flags,
+				    ci->i_snap_realm->cached_context,
+				    do_sync,
+				    ci->i_truncate_seq, ci->i_truncate_size,
+				    &mtime, false, 2);
+	if (!req)
+		return -ENOMEM;
+
+	num_pages = calc_pages_for(pos, len);
+
+	if (file->f_flags & O_DIRECT) {
+		pages = get_direct_page_vector(data, num_pages, pos, len);
+		if (IS_ERR(pages)) {
+			ret = PTR_ERR(pages);
+			goto out;
+		}
+
+		/*
+		 * throw out any page cache pages in this range. this
+		 * may block.
+		 */
+		truncate_inode_pages_range(inode->i_mapping, pos, 
+					   (pos+len) | (PAGE_CACHE_SIZE-1));
+	} else {
+		pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
+		if (IS_ERR(pages)) {
+			ret = PTR_ERR(pages);
+			goto out;
+		}
+		ret = copy_user_to_page_vector(pages, data, pos, len);
+		if (ret < 0) {
+			ceph_release_page_vector(pages, num_pages);
+			goto out;
+		}
+
+		if ((file->f_flags & O_SYNC) == 0) {
+			/* get a second commit callback */
+			req->r_safe_callback = sync_write_commit;
+			req->r_own_pages = 1;
+		}
+	}
+	req->r_pages = pages;
+	req->r_num_pages = num_pages;
+	req->r_inode = inode;
+
+	ret = ceph_osdc_start_request(&client->osdc, req, false);
+	if (!ret) {
+		if (req->r_safe_callback) {
+			/*
+			 * Add to inode unsafe list only after we
+			 * start_request so that a tid has been assigned.
+			 */
+			spin_lock(&ci->i_unsafe_lock);
+			list_add(&ci->i_unsafe_writes, &req->r_unsafe_item);
+			spin_unlock(&ci->i_unsafe_lock);
+			ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR);
+		}
+		ret = ceph_osdc_wait_request(&client->osdc, req);
+	}
+
+	if (file->f_flags & O_DIRECT)
+		put_page_vector(pages, num_pages);
+	else if (file->f_flags & O_SYNC)
+		ceph_release_page_vector(pages, num_pages);
+
+out:
+	ceph_osdc_put_request(req);
+	if (ret == 0) {
+		pos += len;
+		written += len;
+		left -= len;
+		if (left)
+			goto more;
+
+		ret = written;
+		*offset = pos;
+		if (pos > i_size_read(inode))
+			check_caps = ceph_inode_set_size(inode, pos);
+		if (check_caps)
+			ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY,
+					NULL);
+	}
+	return ret;
+}
+
+/*
+ * Wrap generic_file_aio_read with checks for cap bits on the inode.
+ * Atomically grab references, so that those bits are not released
+ * back to the MDS mid-read.
+ *
+ * Hmm, the sync read case isn't actually async... should it be?
+ */
+static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov,
+			     unsigned long nr_segs, loff_t pos)
+{
+	struct file *filp = iocb->ki_filp;
+	loff_t *ppos = &iocb->ki_pos;
+	size_t len = iov->iov_len;
+	struct inode *inode = filp->f_dentry->d_inode;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	void *base = iov->iov_base;
+	ssize_t ret;
+	int got = 0;
+	int checkeof = 0, read = 0;
+
+	dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n",
+	     inode, ceph_vinop(inode), pos, (unsigned)len, inode);
+again:
+	__ceph_do_pending_vmtruncate(inode);
+	ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_CACHE,
+			    &got, -1);
+	if (ret < 0)
+		goto out;
+	dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n",
+	     inode, ceph_vinop(inode), pos, (unsigned)len,
+	     ceph_cap_string(got));
+
+	if ((got & CEPH_CAP_FILE_CACHE) == 0 ||
+	    (iocb->ki_filp->f_flags & O_DIRECT) ||
+	    (inode->i_sb->s_flags & MS_SYNCHRONOUS))
+		/* hmm, this isn't really async... */
+		ret = ceph_sync_read(filp, base, len, ppos, &checkeof);
+	else
+		ret = generic_file_aio_read(iocb, iov, nr_segs, pos);
+
+out:
+	dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n",
+	     inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret);
+	ceph_put_cap_refs(ci, got);
+
+	if (checkeof && ret >= 0) {
+		int statret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE);
+
+		/* hit EOF or hole? */
+		if (statret == 0 && *ppos < inode->i_size) {
+			dout("aio_read sync_read hit hole, reading more\n");
+			read += ret;
+			base += ret;
+			len -= ret;
+			checkeof = 0;
+			goto again;
+		}
+	}
+	if (ret >= 0)
+		ret += read;
+
+	return ret;
+}
+
+/*
+ * Take cap references to avoid releasing caps to MDS mid-write.
+ *
+ * If we are synchronous, and write with an old snap context, the OSD
+ * may return EOLDSNAPC.  In that case, retry the write.. _after_
+ * dropping our cap refs and allowing the pending snap to logically
+ * complete _before_ this write occurs.
+ *
+ * If we are near ENOSPC, write synchronously.
+ */
+static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
+		       unsigned long nr_segs, loff_t pos)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file->f_dentry->d_inode;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc;
+	loff_t endoff = pos + iov->iov_len;
+	int got = 0;
+	int ret, err;
+
+	if (ceph_snap(inode) != CEPH_NOSNAP)
+		return -EROFS;
+
+retry_snap:
+	if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL))
+		return -ENOSPC;
+	__ceph_do_pending_vmtruncate(inode);
+	dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n",
+	     inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
+	     inode->i_size);
+	ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER,
+			    &got, endoff);
+	if (ret < 0)
+		goto out;
+
+	dout("aio_write %p %llx.%llx %llu~%u  got cap refs on %s\n",
+	     inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
+	     ceph_cap_string(got));
+
+	if ((got & CEPH_CAP_FILE_BUFFER) == 0 ||
+	    (iocb->ki_filp->f_flags & O_DIRECT) ||
+	    (inode->i_sb->s_flags & MS_SYNCHRONOUS)) {
+		ret = ceph_sync_write(file, iov->iov_base, iov->iov_len,
+			&iocb->ki_pos);
+	} else {
+		ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
+
+		if ((ret >= 0 || ret == -EIOCBQUEUED) &&
+		    ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host)
+		     || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) {
+			err = vfs_fsync_range(file, pos, pos + ret - 1, 1);
+			if (err < 0)
+				ret = err;
+		}
+	}
+	if (ret >= 0) {
+		spin_lock(&inode->i_lock);
+		__ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
+		spin_unlock(&inode->i_lock);
+	}
+
+out:
+	dout("aio_write %p %llx.%llx %llu~%u  dropping cap refs on %s\n",
+	     inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
+	     ceph_cap_string(got));
+	ceph_put_cap_refs(ci, got);
+
+	if (ret == -EOLDSNAPC) {
+		dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n",
+		     inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len);
+		goto retry_snap;
+	}
+
+	return ret;
+}
+
+/*
+ * llseek.  be sure to verify file size on SEEK_END.
+ */
+static loff_t ceph_llseek(struct file *file, loff_t offset, int origin)
+{
+	struct inode *inode = file->f_mapping->host;
+	int ret;
+
+	mutex_lock(&inode->i_mutex);
+	__ceph_do_pending_vmtruncate(inode);
+	switch (origin) {
+	case SEEK_END:
+		ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE);
+		if (ret < 0) {
+			offset = ret;
+			goto out;
+		}
+		offset += inode->i_size;
+		break;
+	case SEEK_CUR:
+		/*
+		 * Here we special-case the lseek(fd, 0, SEEK_CUR)
+		 * position-querying operation.  Avoid rewriting the "same"
+		 * f_pos value back to the file because a concurrent read(),
+		 * write() or lseek() might have altered it
+		 */
+		if (offset == 0) {
+			offset = file->f_pos;
+			goto out;
+		}
+		offset += file->f_pos;
+		break;
+	}
+
+	if (offset < 0 || offset > inode->i_sb->s_maxbytes) {
+		offset = -EINVAL;
+		goto out;
+	}
+
+	/* Special lock needed here? */
+	if (offset != file->f_pos) {
+		file->f_pos = offset;
+		file->f_version = 0;
+	}
+
+out:
+	mutex_unlock(&inode->i_mutex);
+	return offset;
+}
+
+const struct file_operations ceph_file_fops = {
+	.open = ceph_open,
+	.release = ceph_release,
+	.llseek = ceph_llseek,
+	.read = do_sync_read,
+	.write = do_sync_write,
+	.aio_read = ceph_aio_read,
+	.aio_write = ceph_aio_write,
+	.mmap = ceph_mmap,
+	.fsync = ceph_fsync,
+	.splice_read = generic_file_splice_read,
+	.splice_write = generic_file_splice_write,
+	.unlocked_ioctl = ceph_ioctl,
+	.compat_ioctl	= ceph_ioctl,
+};
+
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
new file mode 100644
index 000000000000..389f9dbd9949
--- /dev/null
+++ b/fs/ceph/inode.c
@@ -0,0 +1,1794 @@
+#include "ceph_debug.h"
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/smp_lock.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/uaccess.h>
+#include <linux/kernel.h>
+#include <linux/namei.h>
+#include <linux/writeback.h>
+#include <linux/vmalloc.h>
+#include <linux/pagevec.h>
+
+#include "super.h"
+#include "decode.h"
+
+/*
+ * Ceph inode operations
+ *
+ * Implement basic inode helpers (get, alloc) and inode ops (getattr,
+ * setattr, etc.), xattr helpers, and helpers for assimilating
+ * metadata returned by the MDS into our cache.
+ *
+ * Also define helpers for doing asynchronous writeback, invalidation,
+ * and truncation for the benefit of those who can't afford to block
+ * (typically because they are in the message handler path).
+ */
+
+static const struct inode_operations ceph_symlink_iops;
+
+static void ceph_invalidate_work(struct work_struct *work);
+static void ceph_writeback_work(struct work_struct *work);
+static void ceph_vmtruncate_work(struct work_struct *work);
+
+/*
+ * find or create an inode, given the ceph ino number
+ */
+struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino)
+{
+	struct inode *inode;
+	ino_t t = ceph_vino_to_ino(vino);
+
+	inode = iget5_locked(sb, t, ceph_ino_compare, ceph_set_ino_cb, &vino);
+	if (inode == NULL)
+		return ERR_PTR(-ENOMEM);
+	if (inode->i_state & I_NEW) {
+		dout("get_inode created new inode %p %llx.%llx ino %llx\n",
+		     inode, ceph_vinop(inode), (u64)inode->i_ino);
+		unlock_new_inode(inode);
+	}
+
+	dout("get_inode on %lu=%llx.%llx got %p\n", inode->i_ino, vino.ino,
+	     vino.snap, inode);
+	return inode;
+}
+
+/*
+ * get/constuct snapdir inode for a given directory
+ */
+struct inode *ceph_get_snapdir(struct inode *parent)
+{
+	struct ceph_vino vino = {
+		.ino = ceph_ino(parent),
+		.snap = CEPH_SNAPDIR,
+	};
+	struct inode *inode = ceph_get_inode(parent->i_sb, vino);
+	struct ceph_inode_info *ci = ceph_inode(inode);
+
+	BUG_ON(!S_ISDIR(parent->i_mode));
+	if (IS_ERR(inode))
+		return inode;
+	inode->i_mode = parent->i_mode;
+	inode->i_uid = parent->i_uid;
+	inode->i_gid = parent->i_gid;
+	inode->i_op = &ceph_dir_iops;
+	inode->i_fop = &ceph_dir_fops;
+	ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */
+	ci->i_rbytes = 0;
+	return inode;
+}
+
+const struct inode_operations ceph_file_iops = {
+	.permission = ceph_permission,
+	.setattr = ceph_setattr,
+	.getattr = ceph_getattr,
+	.setxattr = ceph_setxattr,
+	.getxattr = ceph_getxattr,
+	.listxattr = ceph_listxattr,
+	.removexattr = ceph_removexattr,
+};
+
+
+/*
+ * We use a 'frag tree' to keep track of the MDS's directory fragments
+ * for a given inode (usually there is just a single fragment).  We
+ * need to know when a child frag is delegated to a new MDS, or when
+ * it is flagged as replicated, so we can direct our requests
+ * accordingly.
+ */
+
+/*
+ * find/create a frag in the tree
+ */
+static struct ceph_inode_frag *__get_or_create_frag(struct ceph_inode_info *ci,
+						    u32 f)
+{
+	struct rb_node **p;
+	struct rb_node *parent = NULL;
+	struct ceph_inode_frag *frag;
+	int c;
+
+	p = &ci->i_fragtree.rb_node;
+	while (*p) {
+		parent = *p;
+		frag = rb_entry(parent, struct ceph_inode_frag, node);
+		c = ceph_frag_compare(f, frag->frag);
+		if (c < 0)
+			p = &(*p)->rb_left;
+		else if (c > 0)
+			p = &(*p)->rb_right;
+		else
+			return frag;
+	}
+
+	frag = kmalloc(sizeof(*frag), GFP_NOFS);
+	if (!frag) {
+		pr_err("__get_or_create_frag ENOMEM on %p %llx.%llx "
+		       "frag %x\n", &ci->vfs_inode,
+		       ceph_vinop(&ci->vfs_inode), f);
+		return ERR_PTR(-ENOMEM);
+	}
+	frag->frag = f;
+	frag->split_by = 0;
+	frag->mds = -1;
+	frag->ndist = 0;
+
+	rb_link_node(&frag->node, parent, p);
+	rb_insert_color(&frag->node, &ci->i_fragtree);
+
+	dout("get_or_create_frag added %llx.%llx frag %x\n",
+	     ceph_vinop(&ci->vfs_inode), f);
+	return frag;
+}
+
+/*
+ * find a specific frag @f
+ */
+struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci, u32 f)
+{
+	struct rb_node *n = ci->i_fragtree.rb_node;
+
+	while (n) {
+		struct ceph_inode_frag *frag =
+			rb_entry(n, struct ceph_inode_frag, node);
+		int c = ceph_frag_compare(f, frag->frag);
+		if (c < 0)
+			n = n->rb_left;
+		else if (c > 0)
+			n = n->rb_right;
+		else
+			return frag;
+	}
+	return NULL;
+}
+
+/*
+ * Choose frag containing the given value @v.  If @pfrag is
+ * specified, copy the frag delegation info to the caller if
+ * it is present.
+ */
+u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
+		     struct ceph_inode_frag *pfrag,
+		     int *found)
+{
+	u32 t = ceph_frag_make(0, 0);
+	struct ceph_inode_frag *frag;
+	unsigned nway, i;
+	u32 n;
+
+	if (found)
+		*found = 0;
+
+	mutex_lock(&ci->i_fragtree_mutex);
+	while (1) {
+		WARN_ON(!ceph_frag_contains_value(t, v));
+		frag = __ceph_find_frag(ci, t);
+		if (!frag)
+			break; /* t is a leaf */
+		if (frag->split_by == 0) {
+			if (pfrag)
+				memcpy(pfrag, frag, sizeof(*pfrag));
+			if (found)
+				*found = 1;
+			break;
+		}
+
+		/* choose child */
+		nway = 1 << frag->split_by;
+		dout("choose_frag(%x) %x splits by %d (%d ways)\n", v, t,
+		     frag->split_by, nway);
+		for (i = 0; i < nway; i++) {
+			n = ceph_frag_make_child(t, frag->split_by, i);
+			if (ceph_frag_contains_value(n, v)) {
+				t = n;
+				break;
+			}
+		}
+		BUG_ON(i == nway);
+	}
+	dout("choose_frag(%x) = %x\n", v, t);
+
+	mutex_unlock(&ci->i_fragtree_mutex);
+	return t;
+}
+
+/*
+ * Process dirfrag (delegation) info from the mds.  Include leaf
+ * fragment in tree ONLY if ndist > 0.  Otherwise, only
+ * branches/splits are included in i_fragtree)
+ */
+static int ceph_fill_dirfrag(struct inode *inode,
+			     struct ceph_mds_reply_dirfrag *dirinfo)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_inode_frag *frag;
+	u32 id = le32_to_cpu(dirinfo->frag);
+	int mds = le32_to_cpu(dirinfo->auth);
+	int ndist = le32_to_cpu(dirinfo->ndist);
+	int i;
+	int err = 0;
+
+	mutex_lock(&ci->i_fragtree_mutex);
+	if (ndist == 0) {
+		/* no delegation info needed. */
+		frag = __ceph_find_frag(ci, id);
+		if (!frag)
+			goto out;
+		if (frag->split_by == 0) {
+			/* tree leaf, remove */
+			dout("fill_dirfrag removed %llx.%llx frag %x"
+			     " (no ref)\n", ceph_vinop(inode), id);
+			rb_erase(&frag->node, &ci->i_fragtree);
+			kfree(frag);
+		} else {
+			/* tree branch, keep and clear */
+			dout("fill_dirfrag cleared %llx.%llx frag %x"
+			     " referral\n", ceph_vinop(inode), id);
+			frag->mds = -1;
+			frag->ndist = 0;
+		}
+		goto out;
+	}
+
+
+	/* find/add this frag to store mds delegation info */
+	frag = __get_or_create_frag(ci, id);
+	if (IS_ERR(frag)) {
+		/* this is not the end of the world; we can continue
+		   with bad/inaccurate delegation info */
+		pr_err("fill_dirfrag ENOMEM on mds ref %llx.%llx fg %x\n",
+		       ceph_vinop(inode), le32_to_cpu(dirinfo->frag));
+		err = -ENOMEM;
+		goto out;
+	}
+
+	frag->mds = mds;
+	frag->ndist = min_t(u32, ndist, CEPH_MAX_DIRFRAG_REP);
+	for (i = 0; i < frag->ndist; i++)
+		frag->dist[i] = le32_to_cpu(dirinfo->dist[i]);
+	dout("fill_dirfrag %llx.%llx frag %x ndist=%d\n",
+	     ceph_vinop(inode), frag->frag, frag->ndist);
+
+out:
+	mutex_unlock(&ci->i_fragtree_mutex);
+	return err;
+}
+
+
+/*
+ * initialize a newly allocated inode.
+ */
+struct inode *ceph_alloc_inode(struct super_block *sb)
+{
+	struct ceph_inode_info *ci;
+	int i;
+
+	ci = kmem_cache_alloc(ceph_inode_cachep, GFP_NOFS);
+	if (!ci)
+		return NULL;
+
+	dout("alloc_inode %p\n", &ci->vfs_inode);
+
+	ci->i_version = 0;
+	ci->i_time_warp_seq = 0;
+	ci->i_ceph_flags = 0;
+	ci->i_release_count = 0;
+	ci->i_symlink = NULL;
+
+	ci->i_fragtree = RB_ROOT;
+	mutex_init(&ci->i_fragtree_mutex);
+
+	ci->i_xattrs.blob = NULL;
+	ci->i_xattrs.prealloc_blob = NULL;
+	ci->i_xattrs.dirty = false;
+	ci->i_xattrs.index = RB_ROOT;
+	ci->i_xattrs.count = 0;
+	ci->i_xattrs.names_size = 0;
+	ci->i_xattrs.vals_size = 0;
+	ci->i_xattrs.version = 0;
+	ci->i_xattrs.index_version = 0;
+
+	ci->i_caps = RB_ROOT;
+	ci->i_auth_cap = NULL;
+	ci->i_dirty_caps = 0;
+	ci->i_flushing_caps = 0;
+	INIT_LIST_HEAD(&ci->i_dirty_item);
+	INIT_LIST_HEAD(&ci->i_flushing_item);
+	ci->i_cap_flush_seq = 0;
+	ci->i_cap_flush_last_tid = 0;
+	memset(&ci->i_cap_flush_tid, 0, sizeof(ci->i_cap_flush_tid));
+	init_waitqueue_head(&ci->i_cap_wq);
+	ci->i_hold_caps_min = 0;
+	ci->i_hold_caps_max = 0;
+	INIT_LIST_HEAD(&ci->i_cap_delay_list);
+	ci->i_cap_exporting_mds = 0;
+	ci->i_cap_exporting_mseq = 0;
+	ci->i_cap_exporting_issued = 0;
+	INIT_LIST_HEAD(&ci->i_cap_snaps);
+	ci->i_head_snapc = NULL;
+	ci->i_snap_caps = 0;
+
+	for (i = 0; i < CEPH_FILE_MODE_NUM; i++)
+		ci->i_nr_by_mode[i] = 0;
+
+	ci->i_truncate_seq = 0;
+	ci->i_truncate_size = 0;
+	ci->i_truncate_pending = 0;
+
+	ci->i_max_size = 0;
+	ci->i_reported_size = 0;
+	ci->i_wanted_max_size = 0;
+	ci->i_requested_max_size = 0;
+
+	ci->i_pin_ref = 0;
+	ci->i_rd_ref = 0;
+	ci->i_rdcache_ref = 0;
+	ci->i_wr_ref = 0;
+	ci->i_wrbuffer_ref = 0;
+	ci->i_wrbuffer_ref_head = 0;
+	ci->i_shared_gen = 0;
+	ci->i_rdcache_gen = 0;
+	ci->i_rdcache_revoking = 0;
+
+	INIT_LIST_HEAD(&ci->i_unsafe_writes);
+	INIT_LIST_HEAD(&ci->i_unsafe_dirops);
+	spin_lock_init(&ci->i_unsafe_lock);
+
+	ci->i_snap_realm = NULL;
+	INIT_LIST_HEAD(&ci->i_snap_realm_item);
+	INIT_LIST_HEAD(&ci->i_snap_flush_item);
+
+	INIT_WORK(&ci->i_wb_work, ceph_writeback_work);
+	INIT_WORK(&ci->i_pg_inv_work, ceph_invalidate_work);
+
+	INIT_WORK(&ci->i_vmtruncate_work, ceph_vmtruncate_work);
+
+	return &ci->vfs_inode;
+}
+
+void ceph_destroy_inode(struct inode *inode)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_inode_frag *frag;
+	struct rb_node *n;
+
+	dout("destroy_inode %p ino %llx.%llx\n", inode, ceph_vinop(inode));
+
+	ceph_queue_caps_release(inode);
+
+	/*
+	 * we may still have a snap_realm reference if there are stray
+	 * caps in i_cap_exporting_issued or i_snap_caps.
+	 */
+	if (ci->i_snap_realm) {
+		struct ceph_mds_client *mdsc =
+			&ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
+		struct ceph_snap_realm *realm = ci->i_snap_realm;
+
+		dout(" dropping residual ref to snap realm %p\n", realm);
+		spin_lock(&realm->inodes_with_caps_lock);
+		list_del_init(&ci->i_snap_realm_item);
+		spin_unlock(&realm->inodes_with_caps_lock);
+		ceph_put_snap_realm(mdsc, realm);
+	}
+
+	kfree(ci->i_symlink);
+	while ((n = rb_first(&ci->i_fragtree)) != NULL) {
+		frag = rb_entry(n, struct ceph_inode_frag, node);
+		rb_erase(n, &ci->i_fragtree);
+		kfree(frag);
+	}
+
+	__ceph_destroy_xattrs(ci);
+	if (ci->i_xattrs.blob)
+		ceph_buffer_put(ci->i_xattrs.blob);
+	if (ci->i_xattrs.prealloc_blob)
+		ceph_buffer_put(ci->i_xattrs.prealloc_blob);
+
+	kmem_cache_free(ceph_inode_cachep, ci);
+}
+
+
+/*
+ * Helpers to fill in size, ctime, mtime, and atime.  We have to be
+ * careful because either the client or MDS may have more up to date
+ * info, depending on which capabilities are held, and whether
+ * time_warp_seq or truncate_seq have increased.  (Ordinarily, mtime
+ * and size are monotonically increasing, except when utimes() or
+ * truncate() increments the corresponding _seq values.)
+ */
+int ceph_fill_file_size(struct inode *inode, int issued,
+			u32 truncate_seq, u64 truncate_size, u64 size)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	int queue_trunc = 0;
+
+	if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) > 0 ||
+	    (truncate_seq == ci->i_truncate_seq && size > inode->i_size)) {
+		dout("size %lld -> %llu\n", inode->i_size, size);
+		inode->i_size = size;
+		inode->i_blocks = (size + (1<<9) - 1) >> 9;
+		ci->i_reported_size = size;
+		if (truncate_seq != ci->i_truncate_seq) {
+			dout("truncate_seq %u -> %u\n",
+			     ci->i_truncate_seq, truncate_seq);
+			ci->i_truncate_seq = truncate_seq;
+			/*
+			 * If we hold relevant caps, or in the case where we're
+			 * not the only client referencing this file and we
+			 * don't hold those caps, then we need to check whether
+			 * the file is either opened or mmaped
+			 */
+			if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_RD|
+				      CEPH_CAP_FILE_WR|CEPH_CAP_FILE_BUFFER|
+				      CEPH_CAP_FILE_EXCL)) ||
+			    mapping_mapped(inode->i_mapping) ||
+			    __ceph_caps_file_wanted(ci)) {
+				ci->i_truncate_pending++;
+				queue_trunc = 1;
+			}
+		}
+	}
+	if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) >= 0 &&
+	    ci->i_truncate_size != truncate_size) {
+		dout("truncate_size %lld -> %llu\n", ci->i_truncate_size,
+		     truncate_size);
+		ci->i_truncate_size = truncate_size;
+	}
+	return queue_trunc;
+}
+
+void ceph_fill_file_time(struct inode *inode, int issued,
+			 u64 time_warp_seq, struct timespec *ctime,
+			 struct timespec *mtime, struct timespec *atime)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	int warn = 0;
+
+	if (issued & (CEPH_CAP_FILE_EXCL|
+		      CEPH_CAP_FILE_WR|
+		      CEPH_CAP_FILE_BUFFER)) {
+		if (timespec_compare(ctime, &inode->i_ctime) > 0) {
+			dout("ctime %ld.%09ld -> %ld.%09ld inc w/ cap\n",
+			     inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec,
+			     ctime->tv_sec, ctime->tv_nsec);
+			inode->i_ctime = *ctime;
+		}
+		if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) > 0) {
+			/* the MDS did a utimes() */
+			dout("mtime %ld.%09ld -> %ld.%09ld "
+			     "tw %d -> %d\n",
+			     inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec,
+			     mtime->tv_sec, mtime->tv_nsec,
+			     ci->i_time_warp_seq, (int)time_warp_seq);
+
+			inode->i_mtime = *mtime;
+			inode->i_atime = *atime;
+			ci->i_time_warp_seq = time_warp_seq;
+		} else if (time_warp_seq == ci->i_time_warp_seq) {
+			/* nobody did utimes(); take the max */
+			if (timespec_compare(mtime, &inode->i_mtime) > 0) {
+				dout("mtime %ld.%09ld -> %ld.%09ld inc\n",
+				     inode->i_mtime.tv_sec,
+				     inode->i_mtime.tv_nsec,
+				     mtime->tv_sec, mtime->tv_nsec);
+				inode->i_mtime = *mtime;
+			}
+			if (timespec_compare(atime, &inode->i_atime) > 0) {
+				dout("atime %ld.%09ld -> %ld.%09ld inc\n",
+				     inode->i_atime.tv_sec,
+				     inode->i_atime.tv_nsec,
+				     atime->tv_sec, atime->tv_nsec);
+				inode->i_atime = *atime;
+			}
+		} else if (issued & CEPH_CAP_FILE_EXCL) {
+			/* we did a utimes(); ignore mds values */
+		} else {
+			warn = 1;
+		}
+	} else {
+		/* we have no write caps; whatever the MDS says is true */
+		if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) {
+			inode->i_ctime = *ctime;
+			inode->i_mtime = *mtime;
+			inode->i_atime = *atime;
+			ci->i_time_warp_seq = time_warp_seq;
+		} else {
+			warn = 1;
+		}
+	}
+	if (warn) /* time_warp_seq shouldn't go backwards */
+		dout("%p mds time_warp_seq %llu < %u\n",
+		     inode, time_warp_seq, ci->i_time_warp_seq);
+}
+
+/*
+ * Populate an inode based on info from mds.  May be called on new or
+ * existing inodes.
+ */
+static int fill_inode(struct inode *inode,
+		      struct ceph_mds_reply_info_in *iinfo,
+		      struct ceph_mds_reply_dirfrag *dirinfo,
+		      struct ceph_mds_session *session,
+		      unsigned long ttl_from, int cap_fmode,
+		      struct ceph_cap_reservation *caps_reservation)
+{
+	struct ceph_mds_reply_inode *info = iinfo->in;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	int i;
+	int issued, implemented;
+	struct timespec mtime, atime, ctime;
+	u32 nsplits;
+	struct ceph_buffer *xattr_blob = NULL;
+	int err = 0;
+	int queue_trunc = 0;
+
+	dout("fill_inode %p ino %llx.%llx v %llu had %llu\n",
+	     inode, ceph_vinop(inode), le64_to_cpu(info->version),
+	     ci->i_version);
+
+	/*
+	 * prealloc xattr data, if it looks like we'll need it.  only
+	 * if len > 4 (meaning there are actually xattrs; the first 4
+	 * bytes are the xattr count).
+	 */
+	if (iinfo->xattr_len > 4) {
+		xattr_blob = ceph_buffer_new(iinfo->xattr_len, GFP_NOFS);
+		if (!xattr_blob)
+			pr_err("fill_inode ENOMEM xattr blob %d bytes\n",
+			       iinfo->xattr_len);
+	}
+
+	spin_lock(&inode->i_lock);
+
+	/*
+	 * provided version will be odd if inode value is projected,
+	 * even if stable.  skip the update if we have a newer info
+	 * (e.g., due to inode info racing form multiple MDSs), or if
+	 * we are getting projected (unstable) inode info.
+	 */
+	if (le64_to_cpu(info->version) > 0 &&
+	    (ci->i_version & ~1) > le64_to_cpu(info->version))
+		goto no_change;
+
+	issued = __ceph_caps_issued(ci, &implemented);
+	issued |= implemented | __ceph_caps_dirty(ci);
+
+	/* update inode */
+	ci->i_version = le64_to_cpu(info->version);
+	inode->i_version++;
+	inode->i_rdev = le32_to_cpu(info->rdev);
+
+	if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
+		inode->i_mode = le32_to_cpu(info->mode);
+		inode->i_uid = le32_to_cpu(info->uid);
+		inode->i_gid = le32_to_cpu(info->gid);
+		dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
+		     inode->i_uid, inode->i_gid);
+	}
+
+	if ((issued & CEPH_CAP_LINK_EXCL) == 0)
+		inode->i_nlink = le32_to_cpu(info->nlink);
+
+	/* be careful with mtime, atime, size */
+	ceph_decode_timespec(&atime, &info->atime);
+	ceph_decode_timespec(&mtime, &info->mtime);
+	ceph_decode_timespec(&ctime, &info->ctime);
+	queue_trunc = ceph_fill_file_size(inode, issued,
+					  le32_to_cpu(info->truncate_seq),
+					  le64_to_cpu(info->truncate_size),
+					  le64_to_cpu(info->size));
+	ceph_fill_file_time(inode, issued,
+			    le32_to_cpu(info->time_warp_seq),
+			    &ctime, &mtime, &atime);
+
+	ci->i_max_size = le64_to_cpu(info->max_size);
+	ci->i_layout = info->layout;
+	inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
+
+	/* xattrs */
+	/* note that if i_xattrs.len <= 4, i_xattrs.data will still be NULL. */
+	if ((issued & CEPH_CAP_XATTR_EXCL) == 0 &&
+	    le64_to_cpu(info->xattr_version) > ci->i_xattrs.version) {
+		if (ci->i_xattrs.blob)
+			ceph_buffer_put(ci->i_xattrs.blob);
+		ci->i_xattrs.blob = xattr_blob;
+		if (xattr_blob)
+			memcpy(ci->i_xattrs.blob->vec.iov_base,
+			       iinfo->xattr_data, iinfo->xattr_len);
+		ci->i_xattrs.version = le64_to_cpu(info->xattr_version);
+		xattr_blob = NULL;
+	}
+
+	inode->i_mapping->a_ops = &ceph_aops;
+	inode->i_mapping->backing_dev_info =
+		&ceph_sb_to_client(inode->i_sb)->backing_dev_info;
+
+	switch (inode->i_mode & S_IFMT) {
+	case S_IFIFO:
+	case S_IFBLK:
+	case S_IFCHR:
+	case S_IFSOCK:
+		init_special_inode(inode, inode->i_mode, inode->i_rdev);
+		inode->i_op = &ceph_file_iops;
+		break;
+	case S_IFREG:
+		inode->i_op = &ceph_file_iops;
+		inode->i_fop = &ceph_file_fops;
+		break;
+	case S_IFLNK:
+		inode->i_op = &ceph_symlink_iops;
+		if (!ci->i_symlink) {
+			int symlen = iinfo->symlink_len;
+			char *sym;
+
+			BUG_ON(symlen != inode->i_size);
+			spin_unlock(&inode->i_lock);
+
+			err = -ENOMEM;
+			sym = kmalloc(symlen+1, GFP_NOFS);
+			if (!sym)
+				goto out;
+			memcpy(sym, iinfo->symlink, symlen);
+			sym[symlen] = 0;
+
+			spin_lock(&inode->i_lock);
+			if (!ci->i_symlink)
+				ci->i_symlink = sym;
+			else
+				kfree(sym); /* lost a race */
+		}
+		break;
+	case S_IFDIR:
+		inode->i_op = &ceph_dir_iops;
+		inode->i_fop = &ceph_dir_fops;
+
+		ci->i_files = le64_to_cpu(info->files);
+		ci->i_subdirs = le64_to_cpu(info->subdirs);
+		ci->i_rbytes = le64_to_cpu(info->rbytes);
+		ci->i_rfiles = le64_to_cpu(info->rfiles);
+		ci->i_rsubdirs = le64_to_cpu(info->rsubdirs);
+		ceph_decode_timespec(&ci->i_rctime, &info->rctime);
+
+		/* set dir completion flag? */
+		if (ci->i_files == 0 && ci->i_subdirs == 0 &&
+		    ceph_snap(inode) == CEPH_NOSNAP &&
+		    (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) &&
+		    (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) {
+			dout(" marking %p complete (empty)\n", inode);
+			ci->i_ceph_flags |= CEPH_I_COMPLETE;
+			ci->i_max_offset = 2;
+		}
+
+		/* it may be better to set st_size in getattr instead? */
+		if (ceph_test_opt(ceph_sb_to_client(inode->i_sb), RBYTES))
+			inode->i_size = ci->i_rbytes;
+		break;
+	default:
+		pr_err("fill_inode %llx.%llx BAD mode 0%o\n",
+		       ceph_vinop(inode), inode->i_mode);
+	}
+
+no_change:
+	spin_unlock(&inode->i_lock);
+
+	/* queue truncate if we saw i_size decrease */
+	if (queue_trunc)
+		ceph_queue_vmtruncate(inode);
+
+	/* populate frag tree */
+	/* FIXME: move me up, if/when version reflects fragtree changes */
+	nsplits = le32_to_cpu(info->fragtree.nsplits);
+	mutex_lock(&ci->i_fragtree_mutex);
+	for (i = 0; i < nsplits; i++) {
+		u32 id = le32_to_cpu(info->fragtree.splits[i].frag);
+		struct ceph_inode_frag *frag = __get_or_create_frag(ci, id);
+
+		if (IS_ERR(frag))
+			continue;
+		frag->split_by = le32_to_cpu(info->fragtree.splits[i].by);
+		dout(" frag %x split by %d\n", frag->frag, frag->split_by);
+	}
+	mutex_unlock(&ci->i_fragtree_mutex);
+
+	/* were we issued a capability? */
+	if (info->cap.caps) {
+		if (ceph_snap(inode) == CEPH_NOSNAP) {
+			ceph_add_cap(inode, session,
+				     le64_to_cpu(info->cap.cap_id),
+				     cap_fmode,
+				     le32_to_cpu(info->cap.caps),
+				     le32_to_cpu(info->cap.wanted),
+				     le32_to_cpu(info->cap.seq),
+				     le32_to_cpu(info->cap.mseq),
+				     le64_to_cpu(info->cap.realm),
+				     info->cap.flags,
+				     caps_reservation);
+		} else {
+			spin_lock(&inode->i_lock);
+			dout(" %p got snap_caps %s\n", inode,
+			     ceph_cap_string(le32_to_cpu(info->cap.caps)));
+			ci->i_snap_caps |= le32_to_cpu(info->cap.caps);
+			if (cap_fmode >= 0)
+				__ceph_get_fmode(ci, cap_fmode);
+			spin_unlock(&inode->i_lock);
+		}
+	} else if (cap_fmode >= 0) {
+		pr_warning("mds issued no caps on %llx.%llx\n",
+			   ceph_vinop(inode));
+		__ceph_get_fmode(ci, cap_fmode);
+	}
+
+	/* update delegation info? */
+	if (dirinfo)
+		ceph_fill_dirfrag(inode, dirinfo);
+
+	err = 0;
+
+out:
+	if (xattr_blob)
+		ceph_buffer_put(xattr_blob);
+	return err;
+}
+
+/*
+ * caller should hold session s_mutex.
+ */
+static void update_dentry_lease(struct dentry *dentry,
+				struct ceph_mds_reply_lease *lease,
+				struct ceph_mds_session *session,
+				unsigned long from_time)
+{
+	struct ceph_dentry_info *di = ceph_dentry(dentry);
+	long unsigned duration = le32_to_cpu(lease->duration_ms);
+	long unsigned ttl = from_time + (duration * HZ) / 1000;
+	long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000;
+	struct inode *dir;
+
+	/* only track leases on regular dentries */
+	if (dentry->d_op != &ceph_dentry_ops)
+		return;
+
+	spin_lock(&dentry->d_lock);
+	dout("update_dentry_lease %p mask %d duration %lu ms ttl %lu\n",
+	     dentry, le16_to_cpu(lease->mask), duration, ttl);
+
+	/* make lease_rdcache_gen match directory */
+	dir = dentry->d_parent->d_inode;
+	di->lease_shared_gen = ceph_inode(dir)->i_shared_gen;
+
+	if (lease->mask == 0)
+		goto out_unlock;
+
+	if (di->lease_gen == session->s_cap_gen &&
+	    time_before(ttl, dentry->d_time))
+		goto out_unlock;  /* we already have a newer lease. */
+
+	if (di->lease_session && di->lease_session != session)
+		goto out_unlock;
+
+	ceph_dentry_lru_touch(dentry);
+
+	if (!di->lease_session)
+		di->lease_session = ceph_get_mds_session(session);
+	di->lease_gen = session->s_cap_gen;
+	di->lease_seq = le32_to_cpu(lease->seq);
+	di->lease_renew_after = half_ttl;
+	di->lease_renew_from = 0;
+	dentry->d_time = ttl;
+out_unlock:
+	spin_unlock(&dentry->d_lock);
+	return;
+}
+
+/*
+ * Set dentry's directory position based on the current dir's max, and
+ * order it in d_subdirs, so that dcache_readdir behaves.
+ */
+static void ceph_set_dentry_offset(struct dentry *dn)
+{
+	struct dentry *dir = dn->d_parent;
+	struct inode *inode = dn->d_parent->d_inode;
+	struct ceph_dentry_info *di;
+
+	BUG_ON(!inode);
+
+	di = ceph_dentry(dn);
+
+	spin_lock(&inode->i_lock);
+	if ((ceph_inode(inode)->i_ceph_flags & CEPH_I_COMPLETE) == 0) {
+		spin_unlock(&inode->i_lock);
+		return;
+	}
+	di->offset = ceph_inode(inode)->i_max_offset++;
+	spin_unlock(&inode->i_lock);
+
+	spin_lock(&dcache_lock);
+	spin_lock(&dn->d_lock);
+	list_move(&dn->d_u.d_child, &dir->d_subdirs);
+	dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset,
+	     dn->d_u.d_child.prev, dn->d_u.d_child.next);
+	spin_unlock(&dn->d_lock);
+	spin_unlock(&dcache_lock);
+}
+
+/*
+ * splice a dentry to an inode.
+ * caller must hold directory i_mutex for this to be safe.
+ *
+ * we will only rehash the resulting dentry if @prehash is
+ * true; @prehash will be set to false (for the benefit of
+ * the caller) if we fail.
+ */
+static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
+				    bool *prehash)
+{
+	struct dentry *realdn;
+
+	BUG_ON(dn->d_inode);
+
+	/* dn must be unhashed */
+	if (!d_unhashed(dn))
+		d_drop(dn);
+	realdn = d_materialise_unique(dn, in);
+	if (IS_ERR(realdn)) {
+		pr_err("splice_dentry error %ld %p inode %p ino %llx.%llx\n",
+		       PTR_ERR(realdn), dn, in, ceph_vinop(in));
+		if (prehash)
+			*prehash = false; /* don't rehash on error */
+		dn = realdn; /* note realdn contains the error */
+		goto out;
+	} else if (realdn) {
+		dout("dn %p (%d) spliced with %p (%d) "
+		     "inode %p ino %llx.%llx\n",
+		     dn, atomic_read(&dn->d_count),
+		     realdn, atomic_read(&realdn->d_count),
+		     realdn->d_inode, ceph_vinop(realdn->d_inode));
+		dput(dn);
+		dn = realdn;
+	} else {
+		BUG_ON(!ceph_dentry(dn));
+		dout("dn %p attached to %p ino %llx.%llx\n",
+		     dn, dn->d_inode, ceph_vinop(dn->d_inode));
+	}
+	if ((!prehash || *prehash) && d_unhashed(dn))
+		d_rehash(dn);
+	ceph_set_dentry_offset(dn);
+out:
+	return dn;
+}
+
+/*
+ * Incorporate results into the local cache.  This is either just
+ * one inode, or a directory, dentry, and possibly linked-to inode (e.g.,
+ * after a lookup).
+ *
+ * A reply may contain
+ *         a directory inode along with a dentry.
+ *  and/or a target inode
+ *
+ * Called with snap_rwsem (read).
+ */
+int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
+		    struct ceph_mds_session *session)
+{
+	struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
+	struct inode *in = NULL;
+	struct ceph_mds_reply_inode *ininfo;
+	struct ceph_vino vino;
+	struct ceph_client *client = ceph_sb_to_client(sb);
+	int i = 0;
+	int err = 0;
+
+	dout("fill_trace %p is_dentry %d is_target %d\n", req,
+	     rinfo->head->is_dentry, rinfo->head->is_target);
+
+#if 0
+	/*
+	 * Debugging hook:
+	 *
+	 * If we resend completed ops to a recovering mds, we get no
+	 * trace.  Since that is very rare, pretend this is the case
+	 * to ensure the 'no trace' handlers in the callers behave.
+	 *
+	 * Fill in inodes unconditionally to avoid breaking cap
+	 * invariants.
+	 */
+	if (rinfo->head->op & CEPH_MDS_OP_WRITE) {
+		pr_info("fill_trace faking empty trace on %lld %s\n",
+			req->r_tid, ceph_mds_op_name(rinfo->head->op));
+		if (rinfo->head->is_dentry) {
+			rinfo->head->is_dentry = 0;
+			err = fill_inode(req->r_locked_dir,
+					 &rinfo->diri, rinfo->dirfrag,
+					 session, req->r_request_started, -1);
+		}
+		if (rinfo->head->is_target) {
+			rinfo->head->is_target = 0;
+			ininfo = rinfo->targeti.in;
+			vino.ino = le64_to_cpu(ininfo->ino);
+			vino.snap = le64_to_cpu(ininfo->snapid);
+			in = ceph_get_inode(sb, vino);
+			err = fill_inode(in, &rinfo->targeti, NULL,
+					 session, req->r_request_started,
+					 req->r_fmode);
+			iput(in);
+		}
+	}
+#endif
+
+	if (!rinfo->head->is_target && !rinfo->head->is_dentry) {
+		dout("fill_trace reply is empty!\n");
+		if (rinfo->head->result == 0 && req->r_locked_dir)
+			ceph_invalidate_dir_request(req);
+		return 0;
+	}
+
+	if (rinfo->head->is_dentry) {
+		struct inode *dir = req->r_locked_dir;
+
+		err = fill_inode(dir, &rinfo->diri, rinfo->dirfrag,
+				 session, req->r_request_started, -1,
+				 &req->r_caps_reservation);
+		if (err < 0)
+			return err;
+	}
+
+	/*
+	 * ignore null lease/binding on snapdir ENOENT, or else we
+	 * will have trouble splicing in the virtual snapdir later
+	 */
+	if (rinfo->head->is_dentry && !req->r_aborted &&
+	    (rinfo->head->is_target || strncmp(req->r_dentry->d_name.name,
+					       client->mount_args->snapdir_name,
+					       req->r_dentry->d_name.len))) {
+		/*
+		 * lookup link rename   : null -> possibly existing inode
+		 * mknod symlink mkdir  : null -> new inode
+		 * unlink               : linked -> null
+		 */
+		struct inode *dir = req->r_locked_dir;
+		struct dentry *dn = req->r_dentry;
+		bool have_dir_cap, have_lease;
+
+		BUG_ON(!dn);
+		BUG_ON(!dir);
+		BUG_ON(dn->d_parent->d_inode != dir);
+		BUG_ON(ceph_ino(dir) !=
+		       le64_to_cpu(rinfo->diri.in->ino));
+		BUG_ON(ceph_snap(dir) !=
+		       le64_to_cpu(rinfo->diri.in->snapid));
+
+		/* do we have a lease on the whole dir? */
+		have_dir_cap =
+			(le32_to_cpu(rinfo->diri.in->cap.caps) &
+			 CEPH_CAP_FILE_SHARED);
+
+		/* do we have a dn lease? */
+		have_lease = have_dir_cap ||
+			(le16_to_cpu(rinfo->dlease->mask) &
+			 CEPH_LOCK_DN);
+
+		if (!have_lease)
+			dout("fill_trace  no dentry lease or dir cap\n");
+
+		/* rename? */
+		if (req->r_old_dentry && req->r_op == CEPH_MDS_OP_RENAME) {
+			dout(" src %p '%.*s' dst %p '%.*s'\n",
+			     req->r_old_dentry,
+			     req->r_old_dentry->d_name.len,
+			     req->r_old_dentry->d_name.name,
+			     dn, dn->d_name.len, dn->d_name.name);
+			dout("fill_trace doing d_move %p -> %p\n",
+			     req->r_old_dentry, dn);
+
+			/* d_move screws up d_subdirs order */
+			ceph_i_clear(dir, CEPH_I_COMPLETE);
+
+			d_move(req->r_old_dentry, dn);
+			dout(" src %p '%.*s' dst %p '%.*s'\n",
+			     req->r_old_dentry,
+			     req->r_old_dentry->d_name.len,
+			     req->r_old_dentry->d_name.name,
+			     dn, dn->d_name.len, dn->d_name.name);
+
+			/* ensure target dentry is invalidated, despite
+			   rehashing bug in vfs_rename_dir */
+			ceph_invalidate_dentry_lease(dn);
+
+			/* take overwritten dentry's readdir offset */
+			dout("dn %p gets %p offset %lld (old offset %lld)\n",
+			     req->r_old_dentry, dn, ceph_dentry(dn)->offset,
+			     ceph_dentry(req->r_old_dentry)->offset);
+			ceph_dentry(req->r_old_dentry)->offset =
+				ceph_dentry(dn)->offset;
+
+			dn = req->r_old_dentry;  /* use old_dentry */
+			in = dn->d_inode;
+		}
+
+		/* null dentry? */
+		if (!rinfo->head->is_target) {
+			dout("fill_trace null dentry\n");
+			if (dn->d_inode) {
+				dout("d_delete %p\n", dn);
+				d_delete(dn);
+			} else {
+				dout("d_instantiate %p NULL\n", dn);
+				d_instantiate(dn, NULL);
+				if (have_lease && d_unhashed(dn))
+					d_rehash(dn);
+				update_dentry_lease(dn, rinfo->dlease,
+						    session,
+						    req->r_request_started);
+			}
+			goto done;
+		}
+
+		/* attach proper inode */
+		ininfo = rinfo->targeti.in;
+		vino.ino = le64_to_cpu(ininfo->ino);
+		vino.snap = le64_to_cpu(ininfo->snapid);
+		if (!dn->d_inode) {
+			in = ceph_get_inode(sb, vino);
+			if (IS_ERR(in)) {
+				pr_err("fill_trace bad get_inode "
+				       "%llx.%llx\n", vino.ino, vino.snap);
+				err = PTR_ERR(in);
+				d_delete(dn);
+				goto done;
+			}
+			dn = splice_dentry(dn, in, &have_lease);
+			if (IS_ERR(dn)) {
+				err = PTR_ERR(dn);
+				goto done;
+			}
+			req->r_dentry = dn;  /* may have spliced */
+			igrab(in);
+		} else if (ceph_ino(in) == vino.ino &&
+			   ceph_snap(in) == vino.snap) {
+			igrab(in);
+		} else {
+			dout(" %p links to %p %llx.%llx, not %llx.%llx\n",
+			     dn, in, ceph_ino(in), ceph_snap(in),
+			     vino.ino, vino.snap);
+			have_lease = false;
+			in = NULL;
+		}
+
+		if (have_lease)
+			update_dentry_lease(dn, rinfo->dlease, session,
+					    req->r_request_started);
+		dout(" final dn %p\n", dn);
+		i++;
+	} else if (req->r_op == CEPH_MDS_OP_LOOKUPSNAP ||
+		   req->r_op == CEPH_MDS_OP_MKSNAP) {
+		struct dentry *dn = req->r_dentry;
+
+		/* fill out a snapdir LOOKUPSNAP dentry */
+		BUG_ON(!dn);
+		BUG_ON(!req->r_locked_dir);
+		BUG_ON(ceph_snap(req->r_locked_dir) != CEPH_SNAPDIR);
+		ininfo = rinfo->targeti.in;
+		vino.ino = le64_to_cpu(ininfo->ino);
+		vino.snap = le64_to_cpu(ininfo->snapid);
+		in = ceph_get_inode(sb, vino);
+		if (IS_ERR(in)) {
+			pr_err("fill_inode get_inode badness %llx.%llx\n",
+			       vino.ino, vino.snap);
+			err = PTR_ERR(in);
+			d_delete(dn);
+			goto done;
+		}
+		dout(" linking snapped dir %p to dn %p\n", in, dn);
+		dn = splice_dentry(dn, in, NULL);
+		if (IS_ERR(dn)) {
+			err = PTR_ERR(dn);
+			goto done;
+		}
+		req->r_dentry = dn;  /* may have spliced */
+		igrab(in);
+		rinfo->head->is_dentry = 1;  /* fool notrace handlers */
+	}
+
+	if (rinfo->head->is_target) {
+		vino.ino = le64_to_cpu(rinfo->targeti.in->ino);
+		vino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
+
+		if (in == NULL || ceph_ino(in) != vino.ino ||
+		    ceph_snap(in) != vino.snap) {
+			in = ceph_get_inode(sb, vino);
+			if (IS_ERR(in)) {
+				err = PTR_ERR(in);
+				goto done;
+			}
+		}
+		req->r_target_inode = in;
+
+		err = fill_inode(in,
+				 &rinfo->targeti, NULL,
+				 session, req->r_request_started,
+				 (le32_to_cpu(rinfo->head->result) == 0) ?
+				 req->r_fmode : -1,
+				 &req->r_caps_reservation);
+		if (err < 0) {
+			pr_err("fill_inode badness %p %llx.%llx\n",
+			       in, ceph_vinop(in));
+			goto done;
+		}
+	}
+
+done:
+	dout("fill_trace done err=%d\n", err);
+	return err;
+}
+
+/*
+ * Prepopulate our cache with readdir results, leases, etc.
+ */
+int ceph_readdir_prepopulate(struct ceph_mds_request *req,
+			     struct ceph_mds_session *session)
+{
+	struct dentry *parent = req->r_dentry;
+	struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
+	struct qstr dname;
+	struct dentry *dn;
+	struct inode *in;
+	int err = 0, i;
+	struct inode *snapdir = NULL;
+	struct ceph_mds_request_head *rhead = req->r_request->front.iov_base;
+	u64 frag = le32_to_cpu(rhead->args.readdir.frag);
+	struct ceph_dentry_info *di;
+
+	if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) {
+		snapdir = ceph_get_snapdir(parent->d_inode);
+		parent = d_find_alias(snapdir);
+		dout("readdir_prepopulate %d items under SNAPDIR dn %p\n",
+		     rinfo->dir_nr, parent);
+	} else {
+		dout("readdir_prepopulate %d items under dn %p\n",
+		     rinfo->dir_nr, parent);
+		if (rinfo->dir_dir)
+			ceph_fill_dirfrag(parent->d_inode, rinfo->dir_dir);
+	}
+
+	for (i = 0; i < rinfo->dir_nr; i++) {
+		struct ceph_vino vino;
+
+		dname.name = rinfo->dir_dname[i];
+		dname.len = rinfo->dir_dname_len[i];
+		dname.hash = full_name_hash(dname.name, dname.len);
+
+		vino.ino = le64_to_cpu(rinfo->dir_in[i].in->ino);
+		vino.snap = le64_to_cpu(rinfo->dir_in[i].in->snapid);
+
+retry_lookup:
+		dn = d_lookup(parent, &dname);
+		dout("d_lookup on parent=%p name=%.*s got %p\n",
+		     parent, dname.len, dname.name, dn);
+
+		if (!dn) {
+			dn = d_alloc(parent, &dname);
+			dout("d_alloc %p '%.*s' = %p\n", parent,
+			     dname.len, dname.name, dn);
+			if (dn == NULL) {
+				dout("d_alloc badness\n");
+				err = -ENOMEM;
+				goto out;
+			}
+			err = ceph_init_dentry(dn);
+			if (err < 0) {
+				dput(dn);
+				goto out;
+			}
+		} else if (dn->d_inode &&
+			   (ceph_ino(dn->d_inode) != vino.ino ||
+			    ceph_snap(dn->d_inode) != vino.snap)) {
+			dout(" dn %p points to wrong inode %p\n",
+			     dn, dn->d_inode);
+			d_delete(dn);
+			dput(dn);
+			goto retry_lookup;
+		} else {
+			/* reorder parent's d_subdirs */
+			spin_lock(&dcache_lock);
+			spin_lock(&dn->d_lock);
+			list_move(&dn->d_u.d_child, &parent->d_subdirs);
+			spin_unlock(&dn->d_lock);
+			spin_unlock(&dcache_lock);
+		}
+
+		di = dn->d_fsdata;
+		di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset);
+
+		/* inode */
+		if (dn->d_inode) {
+			in = dn->d_inode;
+		} else {
+			in = ceph_get_inode(parent->d_sb, vino);
+			if (in == NULL) {
+				dout("new_inode badness\n");
+				d_delete(dn);
+				dput(dn);
+				err = -ENOMEM;
+				goto out;
+			}
+			dn = splice_dentry(dn, in, NULL);
+			if (IS_ERR(dn))
+				dn = NULL;
+		}
+
+		if (fill_inode(in, &rinfo->dir_in[i], NULL, session,
+			       req->r_request_started, -1,
+			       &req->r_caps_reservation) < 0) {
+			pr_err("fill_inode badness on %p\n", in);
+			goto next_item;
+		}
+		if (dn)
+			update_dentry_lease(dn, rinfo->dir_dlease[i],
+					    req->r_session,
+					    req->r_request_started);
+next_item:
+		if (dn)
+			dput(dn);
+	}
+	req->r_did_prepopulate = true;
+
+out:
+	if (snapdir) {
+		iput(snapdir);
+		dput(parent);
+	}
+	dout("readdir_prepopulate done\n");
+	return err;
+}
+
+int ceph_inode_set_size(struct inode *inode, loff_t size)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	int ret = 0;
+
+	spin_lock(&inode->i_lock);
+	dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size);
+	inode->i_size = size;
+	inode->i_blocks = (size + (1 << 9) - 1) >> 9;
+
+	/* tell the MDS if we are approaching max_size */
+	if ((size << 1) >= ci->i_max_size &&
+	    (ci->i_reported_size << 1) < ci->i_max_size)
+		ret = 1;
+
+	spin_unlock(&inode->i_lock);
+	return ret;
+}
+
+/*
+ * Write back inode data in a worker thread.  (This can't be done
+ * in the message handler context.)
+ */
+void ceph_queue_writeback(struct inode *inode)
+{
+	if (queue_work(ceph_inode_to_client(inode)->wb_wq,
+		       &ceph_inode(inode)->i_wb_work)) {
+		dout("ceph_queue_writeback %p\n", inode);
+		igrab(inode);
+	} else {
+		dout("ceph_queue_writeback %p failed\n", inode);
+	}
+}
+
+static void ceph_writeback_work(struct work_struct *work)
+{
+	struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
+						  i_wb_work);
+	struct inode *inode = &ci->vfs_inode;
+
+	dout("writeback %p\n", inode);
+	filemap_fdatawrite(&inode->i_data);
+	iput(inode);
+}
+
+/*
+ * queue an async invalidation
+ */
+void ceph_queue_invalidate(struct inode *inode)
+{
+	if (queue_work(ceph_inode_to_client(inode)->pg_inv_wq,
+		       &ceph_inode(inode)->i_pg_inv_work)) {
+		dout("ceph_queue_invalidate %p\n", inode);
+		igrab(inode);
+	} else {
+		dout("ceph_queue_invalidate %p failed\n", inode);
+	}
+}
+
+/*
+ * invalidate any pages that are not dirty or under writeback.  this
+ * includes pages that are clean and mapped.
+ */
+static void ceph_invalidate_nondirty_pages(struct address_space *mapping)
+{
+	struct pagevec pvec;
+	pgoff_t next = 0;
+	int i;
+
+	pagevec_init(&pvec, 0);
+	while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
+		for (i = 0; i < pagevec_count(&pvec); i++) {
+			struct page *page = pvec.pages[i];
+			pgoff_t index;
+			int skip_page =
+				(PageDirty(page) || PageWriteback(page));
+
+			if (!skip_page)
+				skip_page = !trylock_page(page);
+
+			/*
+			 * We really shouldn't be looking at the ->index of an
+			 * unlocked page.  But we're not allowed to lock these
+			 * pages.  So we rely upon nobody altering the ->index
+			 * of this (pinned-by-us) page.
+			 */
+			index = page->index;
+			if (index > next)
+				next = index;
+			next++;
+
+			if (skip_page)
+				continue;
+
+			generic_error_remove_page(mapping, page);
+			unlock_page(page);
+		}
+		pagevec_release(&pvec);
+		cond_resched();
+	}
+}
+
+/*
+ * Invalidate inode pages in a worker thread.  (This can't be done
+ * in the message handler context.)
+ */
+static void ceph_invalidate_work(struct work_struct *work)
+{
+	struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
+						  i_pg_inv_work);
+	struct inode *inode = &ci->vfs_inode;
+	u32 orig_gen;
+	int check = 0;
+
+	spin_lock(&inode->i_lock);
+	dout("invalidate_pages %p gen %d revoking %d\n", inode,
+	     ci->i_rdcache_gen, ci->i_rdcache_revoking);
+	if (ci->i_rdcache_gen == 0 ||
+	    ci->i_rdcache_revoking != ci->i_rdcache_gen) {
+		BUG_ON(ci->i_rdcache_revoking > ci->i_rdcache_gen);
+		/* nevermind! */
+		ci->i_rdcache_revoking = 0;
+		spin_unlock(&inode->i_lock);
+		goto out;
+	}
+	orig_gen = ci->i_rdcache_gen;
+	spin_unlock(&inode->i_lock);
+
+	ceph_invalidate_nondirty_pages(inode->i_mapping);
+
+	spin_lock(&inode->i_lock);
+	if (orig_gen == ci->i_rdcache_gen) {
+		dout("invalidate_pages %p gen %d successful\n", inode,
+		     ci->i_rdcache_gen);
+		ci->i_rdcache_gen = 0;
+		ci->i_rdcache_revoking = 0;
+		check = 1;
+	} else {
+		dout("invalidate_pages %p gen %d raced, gen now %d\n",
+		     inode, orig_gen, ci->i_rdcache_gen);
+	}
+	spin_unlock(&inode->i_lock);
+
+	if (check)
+		ceph_check_caps(ci, 0, NULL);
+out:
+	iput(inode);
+}
+
+
+/*
+ * called by trunc_wq; take i_mutex ourselves
+ *
+ * We also truncate in a separate thread as well.
+ */
+static void ceph_vmtruncate_work(struct work_struct *work)
+{
+	struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
+						  i_vmtruncate_work);
+	struct inode *inode = &ci->vfs_inode;
+
+	dout("vmtruncate_work %p\n", inode);
+	mutex_lock(&inode->i_mutex);
+	__ceph_do_pending_vmtruncate(inode);
+	mutex_unlock(&inode->i_mutex);
+	iput(inode);
+}
+
+/*
+ * Queue an async vmtruncate.  If we fail to queue work, we will handle
+ * the truncation the next time we call __ceph_do_pending_vmtruncate.
+ */
+void ceph_queue_vmtruncate(struct inode *inode)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+
+	if (queue_work(ceph_sb_to_client(inode->i_sb)->trunc_wq,
+		       &ci->i_vmtruncate_work)) {
+		dout("ceph_queue_vmtruncate %p\n", inode);
+		igrab(inode);
+	} else {
+		dout("ceph_queue_vmtruncate %p failed, pending=%d\n",
+		     inode, ci->i_truncate_pending);
+	}
+}
+
+/*
+ * called with i_mutex held.
+ *
+ * Make sure any pending truncation is applied before doing anything
+ * that may depend on it.
+ */
+void __ceph_do_pending_vmtruncate(struct inode *inode)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	u64 to;
+	int wrbuffer_refs, wake = 0;
+
+retry:
+	spin_lock(&inode->i_lock);
+	if (ci->i_truncate_pending == 0) {
+		dout("__do_pending_vmtruncate %p none pending\n", inode);
+		spin_unlock(&inode->i_lock);
+		return;
+	}
+
+	/*
+	 * make sure any dirty snapped pages are flushed before we
+	 * possibly truncate them.. so write AND block!
+	 */
+	if (ci->i_wrbuffer_ref_head < ci->i_wrbuffer_ref) {
+		dout("__do_pending_vmtruncate %p flushing snaps first\n",
+		     inode);
+		spin_unlock(&inode->i_lock);
+		filemap_write_and_wait_range(&inode->i_data, 0,
+					     inode->i_sb->s_maxbytes);
+		goto retry;
+	}
+
+	to = ci->i_truncate_size;
+	wrbuffer_refs = ci->i_wrbuffer_ref;
+	dout("__do_pending_vmtruncate %p (%d) to %lld\n", inode,
+	     ci->i_truncate_pending, to);
+	spin_unlock(&inode->i_lock);
+
+	truncate_inode_pages(inode->i_mapping, to);
+
+	spin_lock(&inode->i_lock);
+	ci->i_truncate_pending--;
+	if (ci->i_truncate_pending == 0)
+		wake = 1;
+	spin_unlock(&inode->i_lock);
+
+	if (wrbuffer_refs == 0)
+		ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
+	if (wake)
+		wake_up_all(&ci->i_cap_wq);
+}
+
+
+/*
+ * symlinks
+ */
+static void *ceph_sym_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	struct ceph_inode_info *ci = ceph_inode(dentry->d_inode);
+	nd_set_link(nd, ci->i_symlink);
+	return NULL;
+}
+
+static const struct inode_operations ceph_symlink_iops = {
+	.readlink = generic_readlink,
+	.follow_link = ceph_sym_follow_link,
+};
+
+/*
+ * setattr
+ */
+int ceph_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	struct inode *inode = dentry->d_inode;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct inode *parent_inode = dentry->d_parent->d_inode;
+	const unsigned int ia_valid = attr->ia_valid;
+	struct ceph_mds_request *req;
+	struct ceph_mds_client *mdsc = &ceph_sb_to_client(dentry->d_sb)->mdsc;
+	int issued;
+	int release = 0, dirtied = 0;
+	int mask = 0;
+	int err = 0;
+
+	if (ceph_snap(inode) != CEPH_NOSNAP)
+		return -EROFS;
+
+	__ceph_do_pending_vmtruncate(inode);
+
+	err = inode_change_ok(inode, attr);
+	if (err != 0)
+		return err;
+
+	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETATTR,
+				       USE_AUTH_MDS);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	spin_lock(&inode->i_lock);
+	issued = __ceph_caps_issued(ci, NULL);
+	dout("setattr %p issued %s\n", inode, ceph_cap_string(issued));
+
+	if (ia_valid & ATTR_UID) {
+		dout("setattr %p uid %d -> %d\n", inode,
+		     inode->i_uid, attr->ia_uid);
+		if (issued & CEPH_CAP_AUTH_EXCL) {
+			inode->i_uid = attr->ia_uid;
+			dirtied |= CEPH_CAP_AUTH_EXCL;
+		} else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
+			   attr->ia_uid != inode->i_uid) {
+			req->r_args.setattr.uid = cpu_to_le32(attr->ia_uid);
+			mask |= CEPH_SETATTR_UID;
+			release |= CEPH_CAP_AUTH_SHARED;
+		}
+	}
+	if (ia_valid & ATTR_GID) {
+		dout("setattr %p gid %d -> %d\n", inode,
+		     inode->i_gid, attr->ia_gid);
+		if (issued & CEPH_CAP_AUTH_EXCL) {
+			inode->i_gid = attr->ia_gid;
+			dirtied |= CEPH_CAP_AUTH_EXCL;
+		} else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
+			   attr->ia_gid != inode->i_gid) {
+			req->r_args.setattr.gid = cpu_to_le32(attr->ia_gid);
+			mask |= CEPH_SETATTR_GID;
+			release |= CEPH_CAP_AUTH_SHARED;
+		}
+	}
+	if (ia_valid & ATTR_MODE) {
+		dout("setattr %p mode 0%o -> 0%o\n", inode, inode->i_mode,
+		     attr->ia_mode);
+		if (issued & CEPH_CAP_AUTH_EXCL) {
+			inode->i_mode = attr->ia_mode;
+			dirtied |= CEPH_CAP_AUTH_EXCL;
+		} else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
+			   attr->ia_mode != inode->i_mode) {
+			req->r_args.setattr.mode = cpu_to_le32(attr->ia_mode);
+			mask |= CEPH_SETATTR_MODE;
+			release |= CEPH_CAP_AUTH_SHARED;
+		}
+	}
+
+	if (ia_valid & ATTR_ATIME) {
+		dout("setattr %p atime %ld.%ld -> %ld.%ld\n", inode,
+		     inode->i_atime.tv_sec, inode->i_atime.tv_nsec,
+		     attr->ia_atime.tv_sec, attr->ia_atime.tv_nsec);
+		if (issued & CEPH_CAP_FILE_EXCL) {
+			ci->i_time_warp_seq++;
+			inode->i_atime = attr->ia_atime;
+			dirtied |= CEPH_CAP_FILE_EXCL;
+		} else if ((issued & CEPH_CAP_FILE_WR) &&
+			   timespec_compare(&inode->i_atime,
+					    &attr->ia_atime) < 0) {
+			inode->i_atime = attr->ia_atime;
+			dirtied |= CEPH_CAP_FILE_WR;
+		} else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
+			   !timespec_equal(&inode->i_atime, &attr->ia_atime)) {
+			ceph_encode_timespec(&req->r_args.setattr.atime,
+					     &attr->ia_atime);
+			mask |= CEPH_SETATTR_ATIME;
+			release |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD |
+				CEPH_CAP_FILE_WR;
+		}
+	}
+	if (ia_valid & ATTR_MTIME) {
+		dout("setattr %p mtime %ld.%ld -> %ld.%ld\n", inode,
+		     inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec,
+		     attr->ia_mtime.tv_sec, attr->ia_mtime.tv_nsec);
+		if (issued & CEPH_CAP_FILE_EXCL) {
+			ci->i_time_warp_seq++;
+			inode->i_mtime = attr->ia_mtime;
+			dirtied |= CEPH_CAP_FILE_EXCL;
+		} else if ((issued & CEPH_CAP_FILE_WR) &&
+			   timespec_compare(&inode->i_mtime,
+					    &attr->ia_mtime) < 0) {
+			inode->i_mtime = attr->ia_mtime;
+			dirtied |= CEPH_CAP_FILE_WR;
+		} else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
+			   !timespec_equal(&inode->i_mtime, &attr->ia_mtime)) {
+			ceph_encode_timespec(&req->r_args.setattr.mtime,
+					     &attr->ia_mtime);
+			mask |= CEPH_SETATTR_MTIME;
+			release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
+				CEPH_CAP_FILE_WR;
+		}
+	}
+	if (ia_valid & ATTR_SIZE) {
+		dout("setattr %p size %lld -> %lld\n", inode,
+		     inode->i_size, attr->ia_size);
+		if (attr->ia_size > inode->i_sb->s_maxbytes) {
+			err = -EINVAL;
+			goto out;
+		}
+		if ((issued & CEPH_CAP_FILE_EXCL) &&
+		    attr->ia_size > inode->i_size) {
+			inode->i_size = attr->ia_size;
+			inode->i_blocks =
+				(attr->ia_size + (1 << 9) - 1) >> 9;
+			inode->i_ctime = attr->ia_ctime;
+			ci->i_reported_size = attr->ia_size;
+			dirtied |= CEPH_CAP_FILE_EXCL;
+		} else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
+			   attr->ia_size != inode->i_size) {
+			req->r_args.setattr.size = cpu_to_le64(attr->ia_size);
+			req->r_args.setattr.old_size =
+				cpu_to_le64(inode->i_size);
+			mask |= CEPH_SETATTR_SIZE;
+			release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
+				CEPH_CAP_FILE_WR;
+		}
+	}
+
+	/* these do nothing */
+	if (ia_valid & ATTR_CTIME) {
+		bool only = (ia_valid & (ATTR_SIZE|ATTR_MTIME|ATTR_ATIME|
+					 ATTR_MODE|ATTR_UID|ATTR_GID)) == 0;
+		dout("setattr %p ctime %ld.%ld -> %ld.%ld (%s)\n", inode,
+		     inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec,
+		     attr->ia_ctime.tv_sec, attr->ia_ctime.tv_nsec,
+		     only ? "ctime only" : "ignored");
+		inode->i_ctime = attr->ia_ctime;
+		if (only) {
+			/*
+			 * if kernel wants to dirty ctime but nothing else,
+			 * we need to choose a cap to dirty under, or do
+			 * a almost-no-op setattr
+			 */
+			if (issued & CEPH_CAP_AUTH_EXCL)
+				dirtied |= CEPH_CAP_AUTH_EXCL;
+			else if (issued & CEPH_CAP_FILE_EXCL)
+				dirtied |= CEPH_CAP_FILE_EXCL;
+			else if (issued & CEPH_CAP_XATTR_EXCL)
+				dirtied |= CEPH_CAP_XATTR_EXCL;
+			else
+				mask |= CEPH_SETATTR_CTIME;
+		}
+	}
+	if (ia_valid & ATTR_FILE)
+		dout("setattr %p ATTR_FILE ... hrm!\n", inode);
+
+	if (dirtied) {
+		__ceph_mark_dirty_caps(ci, dirtied);
+		inode->i_ctime = CURRENT_TIME;
+	}
+
+	release &= issued;
+	spin_unlock(&inode->i_lock);
+
+	if (mask) {
+		req->r_inode = igrab(inode);
+		req->r_inode_drop = release;
+		req->r_args.setattr.mask = cpu_to_le32(mask);
+		req->r_num_caps = 1;
+		err = ceph_mdsc_do_request(mdsc, parent_inode, req);
+	}
+	dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err,
+	     ceph_cap_string(dirtied), mask);
+
+	ceph_mdsc_put_request(req);
+	__ceph_do_pending_vmtruncate(inode);
+	return err;
+out:
+	spin_unlock(&inode->i_lock);
+	ceph_mdsc_put_request(req);
+	return err;
+}
+
+/*
+ * Verify that we have a lease on the given mask.  If not,
+ * do a getattr against an mds.
+ */
+int ceph_do_getattr(struct inode *inode, int mask)
+{
+	struct ceph_client *client = ceph_sb_to_client(inode->i_sb);
+	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_mds_request *req;
+	int err;
+
+	if (ceph_snap(inode) == CEPH_SNAPDIR) {
+		dout("do_getattr inode %p SNAPDIR\n", inode);
+		return 0;
+	}
+
+	dout("do_getattr inode %p mask %s\n", inode, ceph_cap_string(mask));
+	if (ceph_caps_issued_mask(ceph_inode(inode), mask, 1))
+		return 0;
+
+	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+	req->r_inode = igrab(inode);
+	req->r_num_caps = 1;
+	req->r_args.getattr.mask = cpu_to_le32(mask);
+	err = ceph_mdsc_do_request(mdsc, NULL, req);
+	ceph_mdsc_put_request(req);
+	dout("do_getattr result=%d\n", err);
+	return err;
+}
+
+
+/*
+ * Check inode permissions.  We verify we have a valid value for
+ * the AUTH cap, then call the generic handler.
+ */
+int ceph_permission(struct inode *inode, int mask)
+{
+	int err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED);
+
+	if (!err)
+		err = generic_permission(inode, mask, NULL);
+	return err;
+}
+
+/*
+ * Get all attributes.  Hopefully somedata we'll have a statlite()
+ * and can limit the fields we require to be accurate.
+ */
+int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
+		 struct kstat *stat)
+{
+	struct inode *inode = dentry->d_inode;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	int err;
+
+	err = ceph_do_getattr(inode, CEPH_STAT_CAP_INODE_ALL);
+	if (!err) {
+		generic_fillattr(inode, stat);
+		stat->ino = inode->i_ino;
+		if (ceph_snap(inode) != CEPH_NOSNAP)
+			stat->dev = ceph_snap(inode);
+		else
+			stat->dev = 0;
+		if (S_ISDIR(inode->i_mode)) {
+			stat->size = ci->i_rbytes;
+			stat->blocks = 0;
+			stat->blksize = 65536;
+		}
+	}
+	return err;
+}
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
new file mode 100644
index 000000000000..d085f07756b4
--- /dev/null
+++ b/fs/ceph/ioctl.c
@@ -0,0 +1,160 @@
+#include <linux/in.h>
+
+#include "ioctl.h"
+#include "super.h"
+#include "ceph_debug.h"
+
+
+/*
+ * ioctls
+ */
+
+/*
+ * get and set the file layout
+ */
+static long ceph_ioctl_get_layout(struct file *file, void __user *arg)
+{
+	struct ceph_inode_info *ci = ceph_inode(file->f_dentry->d_inode);
+	struct ceph_ioctl_layout l;
+	int err;
+
+	err = ceph_do_getattr(file->f_dentry->d_inode, CEPH_STAT_CAP_LAYOUT);
+	if (!err) {
+		l.stripe_unit = ceph_file_layout_su(ci->i_layout);
+		l.stripe_count = ceph_file_layout_stripe_count(ci->i_layout);
+		l.object_size = ceph_file_layout_object_size(ci->i_layout);
+		l.data_pool = le32_to_cpu(ci->i_layout.fl_pg_pool);
+		l.preferred_osd =
+			(s32)le32_to_cpu(ci->i_layout.fl_pg_preferred);
+		if (copy_to_user(arg, &l, sizeof(l)))
+			return -EFAULT;
+	}
+
+	return err;
+}
+
+static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	struct inode *parent_inode = file->f_dentry->d_parent->d_inode;
+	struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
+	struct ceph_mds_request *req;
+	struct ceph_ioctl_layout l;
+	int err, i;
+
+	/* copy and validate */
+	if (copy_from_user(&l, arg, sizeof(l)))
+		return -EFAULT;
+
+	if ((l.object_size & ~PAGE_MASK) ||
+	    (l.stripe_unit & ~PAGE_MASK) ||
+	    !l.stripe_unit ||
+	    (l.object_size &&
+	     (unsigned)l.object_size % (unsigned)l.stripe_unit))
+		return -EINVAL;
+
+	/* make sure it's a valid data pool */
+	if (l.data_pool > 0) {
+		mutex_lock(&mdsc->mutex);
+		err = -EINVAL;
+		for (i = 0; i < mdsc->mdsmap->m_num_data_pg_pools; i++)
+			if (mdsc->mdsmap->m_data_pg_pools[i] == l.data_pool) {
+				err = 0;
+				break;
+			}
+		mutex_unlock(&mdsc->mutex);
+		if (err)
+			return err;
+	}
+
+	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETLAYOUT,
+				       USE_AUTH_MDS);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+	req->r_inode = igrab(inode);
+	req->r_inode_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL;
+
+	req->r_args.setlayout.layout.fl_stripe_unit =
+		cpu_to_le32(l.stripe_unit);
+	req->r_args.setlayout.layout.fl_stripe_count =
+		cpu_to_le32(l.stripe_count);
+	req->r_args.setlayout.layout.fl_object_size =
+		cpu_to_le32(l.object_size);
+	req->r_args.setlayout.layout.fl_pg_pool = cpu_to_le32(l.data_pool);
+	req->r_args.setlayout.layout.fl_pg_preferred =
+		cpu_to_le32(l.preferred_osd);
+
+	err = ceph_mdsc_do_request(mdsc, parent_inode, req);
+	ceph_mdsc_put_request(req);
+	return err;
+}
+
+/*
+ * Return object name, size/offset information, and location (OSD
+ * number, network address) for a given file offset.
+ */
+static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
+{
+	struct ceph_ioctl_dataloc dl;
+	struct inode *inode = file->f_dentry->d_inode;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc;
+	u64 len = 1, olen;
+	u64 tmp;
+	struct ceph_object_layout ol;
+	struct ceph_pg pgid;
+
+	/* copy and validate */
+	if (copy_from_user(&dl, arg, sizeof(dl)))
+		return -EFAULT;
+
+	down_read(&osdc->map_sem);
+	ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, &len,
+				      &dl.object_no, &dl.object_offset, &olen);
+	dl.file_offset -= dl.object_offset;
+	dl.object_size = ceph_file_layout_object_size(ci->i_layout);
+	dl.block_size = ceph_file_layout_su(ci->i_layout);
+
+	/* block_offset = object_offset % block_size */
+	tmp = dl.object_offset;
+	dl.block_offset = do_div(tmp, dl.block_size);
+
+	snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx",
+		 ceph_ino(inode), dl.object_no);
+	ceph_calc_object_layout(&ol, dl.object_name, &ci->i_layout,
+				osdc->osdmap);
+
+	pgid = ol.ol_pgid;
+	dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid);
+	if (dl.osd >= 0) {
+		struct ceph_entity_addr *a =
+			ceph_osd_addr(osdc->osdmap, dl.osd);
+		if (a)
+			memcpy(&dl.osd_addr, &a->in_addr, sizeof(dl.osd_addr));
+	} else {
+		memset(&dl.osd_addr, 0, sizeof(dl.osd_addr));
+	}
+	up_read(&osdc->map_sem);
+
+	/* send result back to user */
+	if (copy_to_user(arg, &dl, sizeof(dl)))
+		return -EFAULT;
+
+	return 0;
+}
+
+long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	dout("ioctl file %p cmd %u arg %lu\n", file, cmd, arg);
+	switch (cmd) {
+	case CEPH_IOC_GET_LAYOUT:
+		return ceph_ioctl_get_layout(file, (void __user *)arg);
+
+	case CEPH_IOC_SET_LAYOUT:
+		return ceph_ioctl_set_layout(file, (void __user *)arg);
+
+	case CEPH_IOC_GET_DATALOC:
+		return ceph_ioctl_get_dataloc(file, (void __user *)arg);
+	}
+	return -ENOTTY;
+}
diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h
new file mode 100644
index 000000000000..25e4f1a9d059
--- /dev/null
+++ b/fs/ceph/ioctl.h
@@ -0,0 +1,40 @@
+#ifndef FS_CEPH_IOCTL_H
+#define FS_CEPH_IOCTL_H
+
+#include <linux/ioctl.h>
+#include <linux/types.h>
+
+#define CEPH_IOCTL_MAGIC 0x97
+
+/* just use u64 to align sanely on all archs */
+struct ceph_ioctl_layout {
+	__u64 stripe_unit, stripe_count, object_size;
+	__u64 data_pool;
+	__s64 preferred_osd;
+};
+
+#define CEPH_IOC_GET_LAYOUT _IOR(CEPH_IOCTL_MAGIC, 1,		\
+				   struct ceph_ioctl_layout)
+#define CEPH_IOC_SET_LAYOUT _IOW(CEPH_IOCTL_MAGIC, 2,		\
+				   struct ceph_ioctl_layout)
+
+/*
+ * Extract identity, address of the OSD and object storing a given
+ * file offset.
+ */
+struct ceph_ioctl_dataloc {
+	__u64 file_offset;           /* in+out: file offset */
+	__u64 object_offset;         /* out: offset in object */
+	__u64 object_no;             /* out: object # */
+	__u64 object_size;           /* out: object size */
+	char object_name[64];        /* out: object name */
+	__u64 block_offset;          /* out: offset in block */
+	__u64 block_size;            /* out: block length */
+	__s64 osd;                   /* out: osd # */
+	struct sockaddr_storage osd_addr; /* out: osd address */
+};
+
+#define CEPH_IOC_GET_DATALOC _IOWR(CEPH_IOCTL_MAGIC, 3,	\
+				   struct ceph_ioctl_dataloc)
+
+#endif
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
new file mode 100644
index 000000000000..dd440bd438a9
--- /dev/null
+++ b/fs/ceph/mds_client.c
@@ -0,0 +1,3176 @@
+#include "ceph_debug.h"
+
+#include <linux/wait.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+
+#include "mds_client.h"
+#include "mon_client.h"
+#include "super.h"
+#include "messenger.h"
+#include "decode.h"
+#include "auth.h"
+#include "pagelist.h"
+
+/*
+ * A cluster of MDS (metadata server) daemons is responsible for
+ * managing the file system namespace (the directory hierarchy and
+ * inodes) and for coordinating shared access to storage.  Metadata is
+ * partitioning hierarchically across a number of servers, and that
+ * partition varies over time as the cluster adjusts the distribution
+ * in order to balance load.
+ *
+ * The MDS client is primarily responsible to managing synchronous
+ * metadata requests for operations like open, unlink, and so forth.
+ * If there is a MDS failure, we find out about it when we (possibly
+ * request and) receive a new MDS map, and can resubmit affected
+ * requests.
+ *
+ * For the most part, though, we take advantage of a lossless
+ * communications channel to the MDS, and do not need to worry about
+ * timing out or resubmitting requests.
+ *
+ * We maintain a stateful "session" with each MDS we interact with.
+ * Within each session, we sent periodic heartbeat messages to ensure
+ * any capabilities or leases we have been issues remain valid.  If
+ * the session times out and goes stale, our leases and capabilities
+ * are no longer valid.
+ */
+
+static void __wake_requests(struct ceph_mds_client *mdsc,
+			    struct list_head *head);
+
+static const struct ceph_connection_operations mds_con_ops;
+
+
+/*
+ * mds reply parsing
+ */
+
+/*
+ * parse individual inode info
+ */
+static int parse_reply_info_in(void **p, void *end,
+			       struct ceph_mds_reply_info_in *info)
+{
+	int err = -EIO;
+
+	info->in = *p;
+	*p += sizeof(struct ceph_mds_reply_inode) +
+		sizeof(*info->in->fragtree.splits) *
+		le32_to_cpu(info->in->fragtree.nsplits);
+
+	ceph_decode_32_safe(p, end, info->symlink_len, bad);
+	ceph_decode_need(p, end, info->symlink_len, bad);
+	info->symlink = *p;
+	*p += info->symlink_len;
+
+	ceph_decode_32_safe(p, end, info->xattr_len, bad);
+	ceph_decode_need(p, end, info->xattr_len, bad);
+	info->xattr_data = *p;
+	*p += info->xattr_len;
+	return 0;
+bad:
+	return err;
+}
+
+/*
+ * parse a normal reply, which may contain a (dir+)dentry and/or a
+ * target inode.
+ */
+static int parse_reply_info_trace(void **p, void *end,
+				  struct ceph_mds_reply_info_parsed *info)
+{
+	int err;
+
+	if (info->head->is_dentry) {
+		err = parse_reply_info_in(p, end, &info->diri);
+		if (err < 0)
+			goto out_bad;
+
+		if (unlikely(*p + sizeof(*info->dirfrag) > end))
+			goto bad;
+		info->dirfrag = *p;
+		*p += sizeof(*info->dirfrag) +
+			sizeof(u32)*le32_to_cpu(info->dirfrag->ndist);
+		if (unlikely(*p > end))
+			goto bad;
+
+		ceph_decode_32_safe(p, end, info->dname_len, bad);
+		ceph_decode_need(p, end, info->dname_len, bad);
+		info->dname = *p;
+		*p += info->dname_len;
+		info->dlease = *p;
+		*p += sizeof(*info->dlease);
+	}
+
+	if (info->head->is_target) {
+		err = parse_reply_info_in(p, end, &info->targeti);
+		if (err < 0)
+			goto out_bad;
+	}
+
+	if (unlikely(*p != end))
+		goto bad;
+	return 0;
+
+bad:
+	err = -EIO;
+out_bad:
+	pr_err("problem parsing mds trace %d\n", err);
+	return err;
+}
+
+/*
+ * parse readdir results
+ */
+static int parse_reply_info_dir(void **p, void *end,
+				struct ceph_mds_reply_info_parsed *info)
+{
+	u32 num, i = 0;
+	int err;
+
+	info->dir_dir = *p;
+	if (*p + sizeof(*info->dir_dir) > end)
+		goto bad;
+	*p += sizeof(*info->dir_dir) +
+		sizeof(u32)*le32_to_cpu(info->dir_dir->ndist);
+	if (*p > end)
+		goto bad;
+
+	ceph_decode_need(p, end, sizeof(num) + 2, bad);
+	num = ceph_decode_32(p);
+	info->dir_end = ceph_decode_8(p);
+	info->dir_complete = ceph_decode_8(p);
+	if (num == 0)
+		goto done;
+
+	/* alloc large array */
+	info->dir_nr = num;
+	info->dir_in = kcalloc(num, sizeof(*info->dir_in) +
+			       sizeof(*info->dir_dname) +
+			       sizeof(*info->dir_dname_len) +
+			       sizeof(*info->dir_dlease),
+			       GFP_NOFS);
+	if (info->dir_in == NULL) {
+		err = -ENOMEM;
+		goto out_bad;
+	}
+	info->dir_dname = (void *)(info->dir_in + num);
+	info->dir_dname_len = (void *)(info->dir_dname + num);
+	info->dir_dlease = (void *)(info->dir_dname_len + num);
+
+	while (num) {
+		/* dentry */
+		ceph_decode_need(p, end, sizeof(u32)*2, bad);
+		info->dir_dname_len[i] = ceph_decode_32(p);
+		ceph_decode_need(p, end, info->dir_dname_len[i], bad);
+		info->dir_dname[i] = *p;
+		*p += info->dir_dname_len[i];
+		dout("parsed dir dname '%.*s'\n", info->dir_dname_len[i],
+		     info->dir_dname[i]);
+		info->dir_dlease[i] = *p;
+		*p += sizeof(struct ceph_mds_reply_lease);
+
+		/* inode */
+		err = parse_reply_info_in(p, end, &info->dir_in[i]);
+		if (err < 0)
+			goto out_bad;
+		i++;
+		num--;
+	}
+
+done:
+	if (*p != end)
+		goto bad;
+	return 0;
+
+bad:
+	err = -EIO;
+out_bad:
+	pr_err("problem parsing dir contents %d\n", err);
+	return err;
+}
+
+/*
+ * parse entire mds reply
+ */
+static int parse_reply_info(struct ceph_msg *msg,
+			    struct ceph_mds_reply_info_parsed *info)
+{
+	void *p, *end;
+	u32 len;
+	int err;
+
+	info->head = msg->front.iov_base;
+	p = msg->front.iov_base + sizeof(struct ceph_mds_reply_head);
+	end = p + msg->front.iov_len - sizeof(struct ceph_mds_reply_head);
+
+	/* trace */
+	ceph_decode_32_safe(&p, end, len, bad);
+	if (len > 0) {
+		err = parse_reply_info_trace(&p, p+len, info);
+		if (err < 0)
+			goto out_bad;
+	}
+
+	/* dir content */
+	ceph_decode_32_safe(&p, end, len, bad);
+	if (len > 0) {
+		err = parse_reply_info_dir(&p, p+len, info);
+		if (err < 0)
+			goto out_bad;
+	}
+
+	/* snap blob */
+	ceph_decode_32_safe(&p, end, len, bad);
+	info->snapblob_len = len;
+	info->snapblob = p;
+	p += len;
+
+	if (p != end)
+		goto bad;
+	return 0;
+
+bad:
+	err = -EIO;
+out_bad:
+	pr_err("mds parse_reply err %d\n", err);
+	return err;
+}
+
+static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info)
+{
+	kfree(info->dir_in);
+}
+
+
+/*
+ * sessions
+ */
+static const char *session_state_name(int s)
+{
+	switch (s) {
+	case CEPH_MDS_SESSION_NEW: return "new";
+	case CEPH_MDS_SESSION_OPENING: return "opening";
+	case CEPH_MDS_SESSION_OPEN: return "open";
+	case CEPH_MDS_SESSION_HUNG: return "hung";
+	case CEPH_MDS_SESSION_CLOSING: return "closing";
+	case CEPH_MDS_SESSION_RESTARTING: return "restarting";
+	case CEPH_MDS_SESSION_RECONNECTING: return "reconnecting";
+	default: return "???";
+	}
+}
+
+static struct ceph_mds_session *get_session(struct ceph_mds_session *s)
+{
+	if (atomic_inc_not_zero(&s->s_ref)) {
+		dout("mdsc get_session %p %d -> %d\n", s,
+		     atomic_read(&s->s_ref)-1, atomic_read(&s->s_ref));
+		return s;
+	} else {
+		dout("mdsc get_session %p 0 -- FAIL", s);
+		return NULL;
+	}
+}
+
+void ceph_put_mds_session(struct ceph_mds_session *s)
+{
+	dout("mdsc put_session %p %d -> %d\n", s,
+	     atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1);
+	if (atomic_dec_and_test(&s->s_ref)) {
+		if (s->s_authorizer)
+			s->s_mdsc->client->monc.auth->ops->destroy_authorizer(
+				s->s_mdsc->client->monc.auth, s->s_authorizer);
+		kfree(s);
+	}
+}
+
+/*
+ * called under mdsc->mutex
+ */
+struct ceph_mds_session *__ceph_lookup_mds_session(struct ceph_mds_client *mdsc,
+						   int mds)
+{
+	struct ceph_mds_session *session;
+
+	if (mds >= mdsc->max_sessions || mdsc->sessions[mds] == NULL)
+		return NULL;
+	session = mdsc->sessions[mds];
+	dout("lookup_mds_session %p %d\n", session,
+	     atomic_read(&session->s_ref));
+	get_session(session);
+	return session;
+}
+
+static bool __have_session(struct ceph_mds_client *mdsc, int mds)
+{
+	if (mds >= mdsc->max_sessions)
+		return false;
+	return mdsc->sessions[mds];
+}
+
+static int __verify_registered_session(struct ceph_mds_client *mdsc,
+				       struct ceph_mds_session *s)
+{
+	if (s->s_mds >= mdsc->max_sessions ||
+	    mdsc->sessions[s->s_mds] != s)
+		return -ENOENT;
+	return 0;
+}
+
+/*
+ * create+register a new session for given mds.
+ * called under mdsc->mutex.
+ */
+static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
+						 int mds)
+{
+	struct ceph_mds_session *s;
+
+	s = kzalloc(sizeof(*s), GFP_NOFS);
+	if (!s)
+		return ERR_PTR(-ENOMEM);
+	s->s_mdsc = mdsc;
+	s->s_mds = mds;
+	s->s_state = CEPH_MDS_SESSION_NEW;
+	s->s_ttl = 0;
+	s->s_seq = 0;
+	mutex_init(&s->s_mutex);
+
+	ceph_con_init(mdsc->client->msgr, &s->s_con);
+	s->s_con.private = s;
+	s->s_con.ops = &mds_con_ops;
+	s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS;
+	s->s_con.peer_name.num = cpu_to_le64(mds);
+
+	spin_lock_init(&s->s_cap_lock);
+	s->s_cap_gen = 0;
+	s->s_cap_ttl = 0;
+	s->s_renew_requested = 0;
+	s->s_renew_seq = 0;
+	INIT_LIST_HEAD(&s->s_caps);
+	s->s_nr_caps = 0;
+	s->s_trim_caps = 0;
+	atomic_set(&s->s_ref, 1);
+	INIT_LIST_HEAD(&s->s_waiting);
+	INIT_LIST_HEAD(&s->s_unsafe);
+	s->s_num_cap_releases = 0;
+	s->s_cap_iterator = NULL;
+	INIT_LIST_HEAD(&s->s_cap_releases);
+	INIT_LIST_HEAD(&s->s_cap_releases_done);
+	INIT_LIST_HEAD(&s->s_cap_flushing);
+	INIT_LIST_HEAD(&s->s_cap_snaps_flushing);
+
+	dout("register_session mds%d\n", mds);
+	if (mds >= mdsc->max_sessions) {
+		int newmax = 1 << get_count_order(mds+1);
+		struct ceph_mds_session **sa;
+
+		dout("register_session realloc to %d\n", newmax);
+		sa = kcalloc(newmax, sizeof(void *), GFP_NOFS);
+		if (sa == NULL)
+			goto fail_realloc;
+		if (mdsc->sessions) {
+			memcpy(sa, mdsc->sessions,
+			       mdsc->max_sessions * sizeof(void *));
+			kfree(mdsc->sessions);
+		}
+		mdsc->sessions = sa;
+		mdsc->max_sessions = newmax;
+	}
+	mdsc->sessions[mds] = s;
+	atomic_inc(&s->s_ref);  /* one ref to sessions[], one to caller */
+
+	ceph_con_open(&s->s_con, ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
+
+	return s;
+
+fail_realloc:
+	kfree(s);
+	return ERR_PTR(-ENOMEM);
+}
+
+/*
+ * called under mdsc->mutex
+ */
+static void __unregister_session(struct ceph_mds_client *mdsc,
+			       struct ceph_mds_session *s)
+{
+	dout("__unregister_session mds%d %p\n", s->s_mds, s);
+	BUG_ON(mdsc->sessions[s->s_mds] != s);
+	mdsc->sessions[s->s_mds] = NULL;
+	ceph_con_close(&s->s_con);
+	ceph_put_mds_session(s);
+}
+
+/*
+ * drop session refs in request.
+ *
+ * should be last request ref, or hold mdsc->mutex
+ */
+static void put_request_session(struct ceph_mds_request *req)
+{
+	if (req->r_session) {
+		ceph_put_mds_session(req->r_session);
+		req->r_session = NULL;
+	}
+}
+
+void ceph_mdsc_release_request(struct kref *kref)
+{
+	struct ceph_mds_request *req = container_of(kref,
+						    struct ceph_mds_request,
+						    r_kref);
+	if (req->r_request)
+		ceph_msg_put(req->r_request);
+	if (req->r_reply) {
+		ceph_msg_put(req->r_reply);
+		destroy_reply_info(&req->r_reply_info);
+	}
+	if (req->r_inode) {
+		ceph_put_cap_refs(ceph_inode(req->r_inode),
+				  CEPH_CAP_PIN);
+		iput(req->r_inode);
+	}
+	if (req->r_locked_dir)
+		ceph_put_cap_refs(ceph_inode(req->r_locked_dir),
+				  CEPH_CAP_PIN);
+	if (req->r_target_inode)
+		iput(req->r_target_inode);
+	if (req->r_dentry)
+		dput(req->r_dentry);
+	if (req->r_old_dentry) {
+		ceph_put_cap_refs(
+			ceph_inode(req->r_old_dentry->d_parent->d_inode),
+			CEPH_CAP_PIN);
+		dput(req->r_old_dentry);
+	}
+	kfree(req->r_path1);
+	kfree(req->r_path2);
+	put_request_session(req);
+	ceph_unreserve_caps(&req->r_caps_reservation);
+	kfree(req);
+}
+
+/*
+ * lookup session, bump ref if found.
+ *
+ * called under mdsc->mutex.
+ */
+static struct ceph_mds_request *__lookup_request(struct ceph_mds_client *mdsc,
+					     u64 tid)
+{
+	struct ceph_mds_request *req;
+	struct rb_node *n = mdsc->request_tree.rb_node;
+
+	while (n) {
+		req = rb_entry(n, struct ceph_mds_request, r_node);
+		if (tid < req->r_tid)
+			n = n->rb_left;
+		else if (tid > req->r_tid)
+			n = n->rb_right;
+		else {
+			ceph_mdsc_get_request(req);
+			return req;
+		}
+	}
+	return NULL;
+}
+
+static void __insert_request(struct ceph_mds_client *mdsc,
+			     struct ceph_mds_request *new)
+{
+	struct rb_node **p = &mdsc->request_tree.rb_node;
+	struct rb_node *parent = NULL;
+	struct ceph_mds_request *req = NULL;
+
+	while (*p) {
+		parent = *p;
+		req = rb_entry(parent, struct ceph_mds_request, r_node);
+		if (new->r_tid < req->r_tid)
+			p = &(*p)->rb_left;
+		else if (new->r_tid > req->r_tid)
+			p = &(*p)->rb_right;
+		else
+			BUG();
+	}
+
+	rb_link_node(&new->r_node, parent, p);
+	rb_insert_color(&new->r_node, &mdsc->request_tree);
+}
+
+/*
+ * Register an in-flight request, and assign a tid.  Link to directory
+ * are modifying (if any).
+ *
+ * Called under mdsc->mutex.
+ */
+static void __register_request(struct ceph_mds_client *mdsc,
+			       struct ceph_mds_request *req,
+			       struct inode *dir)
+{
+	req->r_tid = ++mdsc->last_tid;
+	if (req->r_num_caps)
+		ceph_reserve_caps(&req->r_caps_reservation, req->r_num_caps);
+	dout("__register_request %p tid %lld\n", req, req->r_tid);
+	ceph_mdsc_get_request(req);
+	__insert_request(mdsc, req);
+
+	if (dir) {
+		struct ceph_inode_info *ci = ceph_inode(dir);
+
+		spin_lock(&ci->i_unsafe_lock);
+		req->r_unsafe_dir = dir;
+		list_add_tail(&req->r_unsafe_dir_item, &ci->i_unsafe_dirops);
+		spin_unlock(&ci->i_unsafe_lock);
+	}
+}
+
+static void __unregister_request(struct ceph_mds_client *mdsc,
+				 struct ceph_mds_request *req)
+{
+	dout("__unregister_request %p tid %lld\n", req, req->r_tid);
+	rb_erase(&req->r_node, &mdsc->request_tree);
+	RB_CLEAR_NODE(&req->r_node);
+
+	if (req->r_unsafe_dir) {
+		struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir);
+
+		spin_lock(&ci->i_unsafe_lock);
+		list_del_init(&req->r_unsafe_dir_item);
+		spin_unlock(&ci->i_unsafe_lock);
+	}
+
+	ceph_mdsc_put_request(req);
+}
+
+/*
+ * Choose mds to send request to next.  If there is a hint set in the
+ * request (e.g., due to a prior forward hint from the mds), use that.
+ * Otherwise, consult frag tree and/or caps to identify the
+ * appropriate mds.  If all else fails, choose randomly.
+ *
+ * Called under mdsc->mutex.
+ */
+static int __choose_mds(struct ceph_mds_client *mdsc,
+			struct ceph_mds_request *req)
+{
+	struct inode *inode;
+	struct ceph_inode_info *ci;
+	struct ceph_cap *cap;
+	int mode = req->r_direct_mode;
+	int mds = -1;
+	u32 hash = req->r_direct_hash;
+	bool is_hash = req->r_direct_is_hash;
+
+	/*
+	 * is there a specific mds we should try?  ignore hint if we have
+	 * no session and the mds is not up (active or recovering).
+	 */
+	if (req->r_resend_mds >= 0 &&
+	    (__have_session(mdsc, req->r_resend_mds) ||
+	     ceph_mdsmap_get_state(mdsc->mdsmap, req->r_resend_mds) > 0)) {
+		dout("choose_mds using resend_mds mds%d\n",
+		     req->r_resend_mds);
+		return req->r_resend_mds;
+	}
+
+	if (mode == USE_RANDOM_MDS)
+		goto random;
+
+	inode = NULL;
+	if (req->r_inode) {
+		inode = req->r_inode;
+	} else if (req->r_dentry) {
+		if (req->r_dentry->d_inode) {
+			inode = req->r_dentry->d_inode;
+		} else {
+			inode = req->r_dentry->d_parent->d_inode;
+			hash = req->r_dentry->d_name.hash;
+			is_hash = true;
+		}
+	}
+	dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash,
+	     (int)hash, mode);
+	if (!inode)
+		goto random;
+	ci = ceph_inode(inode);
+
+	if (is_hash && S_ISDIR(inode->i_mode)) {
+		struct ceph_inode_frag frag;
+		int found;
+
+		ceph_choose_frag(ci, hash, &frag, &found);
+		if (found) {
+			if (mode == USE_ANY_MDS && frag.ndist > 0) {
+				u8 r;
+
+				/* choose a random replica */
+				get_random_bytes(&r, 1);
+				r %= frag.ndist;
+				mds = frag.dist[r];
+				dout("choose_mds %p %llx.%llx "
+				     "frag %u mds%d (%d/%d)\n",
+				     inode, ceph_vinop(inode),
+				     frag.frag, frag.mds,
+				     (int)r, frag.ndist);
+				return mds;
+			}
+
+			/* since this file/dir wasn't known to be
+			 * replicated, then we want to look for the
+			 * authoritative mds. */
+			mode = USE_AUTH_MDS;
+			if (frag.mds >= 0) {
+				/* choose auth mds */
+				mds = frag.mds;
+				dout("choose_mds %p %llx.%llx "
+				     "frag %u mds%d (auth)\n",
+				     inode, ceph_vinop(inode), frag.frag, mds);
+				return mds;
+			}
+		}
+	}
+
+	spin_lock(&inode->i_lock);
+	cap = NULL;
+	if (mode == USE_AUTH_MDS)
+		cap = ci->i_auth_cap;
+	if (!cap && !RB_EMPTY_ROOT(&ci->i_caps))
+		cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node);
+	if (!cap) {
+		spin_unlock(&inode->i_lock);
+		goto random;
+	}
+	mds = cap->session->s_mds;
+	dout("choose_mds %p %llx.%llx mds%d (%scap %p)\n",
+	     inode, ceph_vinop(inode), mds,
+	     cap == ci->i_auth_cap ? "auth " : "", cap);
+	spin_unlock(&inode->i_lock);
+	return mds;
+
+random:
+	mds = ceph_mdsmap_get_random_mds(mdsc->mdsmap);
+	dout("choose_mds chose random mds%d\n", mds);
+	return mds;
+}
+
+
+/*
+ * session messages
+ */
+static struct ceph_msg *create_session_msg(u32 op, u64 seq)
+{
+	struct ceph_msg *msg;
+	struct ceph_mds_session_head *h;
+
+	msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS);
+	if (!msg) {
+		pr_err("create_session_msg ENOMEM creating msg\n");
+		return NULL;
+	}
+	h = msg->front.iov_base;
+	h->op = cpu_to_le32(op);
+	h->seq = cpu_to_le64(seq);
+	return msg;
+}
+
+/*
+ * send session open request.
+ *
+ * called under mdsc->mutex
+ */
+static int __open_session(struct ceph_mds_client *mdsc,
+			  struct ceph_mds_session *session)
+{
+	struct ceph_msg *msg;
+	int mstate;
+	int mds = session->s_mds;
+
+	/* wait for mds to go active? */
+	mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds);
+	dout("open_session to mds%d (%s)\n", mds,
+	     ceph_mds_state_name(mstate));
+	session->s_state = CEPH_MDS_SESSION_OPENING;
+	session->s_renew_requested = jiffies;
+
+	/* send connect message */
+	msg = create_session_msg(CEPH_SESSION_REQUEST_OPEN, session->s_seq);
+	if (!msg)
+		return -ENOMEM;
+	ceph_con_send(&session->s_con, msg);
+	return 0;
+}
+
+/*
+ * session caps
+ */
+
+/*
+ * Free preallocated cap messages assigned to this session
+ */
+static void cleanup_cap_releases(struct ceph_mds_session *session)
+{
+	struct ceph_msg *msg;
+
+	spin_lock(&session->s_cap_lock);
+	while (!list_empty(&session->s_cap_releases)) {
+		msg = list_first_entry(&session->s_cap_releases,
+				       struct ceph_msg, list_head);
+		list_del_init(&msg->list_head);
+		ceph_msg_put(msg);
+	}
+	while (!list_empty(&session->s_cap_releases_done)) {
+		msg = list_first_entry(&session->s_cap_releases_done,
+				       struct ceph_msg, list_head);
+		list_del_init(&msg->list_head);
+		ceph_msg_put(msg);
+	}
+	spin_unlock(&session->s_cap_lock);
+}
+
+/*
+ * Helper to safely iterate over all caps associated with a session, with
+ * special care taken to handle a racing __ceph_remove_cap().
+ *
+ * Caller must hold session s_mutex.
+ */
+static int iterate_session_caps(struct ceph_mds_session *session,
+				 int (*cb)(struct inode *, struct ceph_cap *,
+					    void *), void *arg)
+{
+	struct list_head *p;
+	struct ceph_cap *cap;
+	struct inode *inode, *last_inode = NULL;
+	struct ceph_cap *old_cap = NULL;
+	int ret;
+
+	dout("iterate_session_caps %p mds%d\n", session, session->s_mds);
+	spin_lock(&session->s_cap_lock);
+	p = session->s_caps.next;
+	while (p != &session->s_caps) {
+		cap = list_entry(p, struct ceph_cap, session_caps);
+		inode = igrab(&cap->ci->vfs_inode);
+		if (!inode) {
+			p = p->next;
+			continue;
+		}
+		session->s_cap_iterator = cap;
+		spin_unlock(&session->s_cap_lock);
+
+		if (last_inode) {
+			iput(last_inode);
+			last_inode = NULL;
+		}
+		if (old_cap) {
+			ceph_put_cap(old_cap);
+			old_cap = NULL;
+		}
+
+		ret = cb(inode, cap, arg);
+		last_inode = inode;
+
+		spin_lock(&session->s_cap_lock);
+		p = p->next;
+		if (cap->ci == NULL) {
+			dout("iterate_session_caps  finishing cap %p removal\n",
+			     cap);
+			BUG_ON(cap->session != session);
+			list_del_init(&cap->session_caps);
+			session->s_nr_caps--;
+			cap->session = NULL;
+			old_cap = cap;  /* put_cap it w/o locks held */
+		}
+		if (ret < 0)
+			goto out;
+	}
+	ret = 0;
+out:
+	session->s_cap_iterator = NULL;
+	spin_unlock(&session->s_cap_lock);
+
+	if (last_inode)
+		iput(last_inode);
+	if (old_cap)
+		ceph_put_cap(old_cap);
+
+	return ret;
+}
+
+static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
+				  void *arg)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	int drop = 0;
+
+	dout("removing cap %p, ci is %p, inode is %p\n",
+	     cap, ci, &ci->vfs_inode);
+	spin_lock(&inode->i_lock);
+	__ceph_remove_cap(cap);
+	if (!__ceph_is_any_real_caps(ci)) {
+		struct ceph_mds_client *mdsc =
+			&ceph_sb_to_client(inode->i_sb)->mdsc;
+
+		spin_lock(&mdsc->cap_dirty_lock);
+		if (!list_empty(&ci->i_dirty_item)) {
+			pr_info(" dropping dirty %s state for %p %lld\n",
+				ceph_cap_string(ci->i_dirty_caps),
+				inode, ceph_ino(inode));
+			ci->i_dirty_caps = 0;
+			list_del_init(&ci->i_dirty_item);
+			drop = 1;
+		}
+		if (!list_empty(&ci->i_flushing_item)) {
+			pr_info(" dropping dirty+flushing %s state for %p %lld\n",
+				ceph_cap_string(ci->i_flushing_caps),
+				inode, ceph_ino(inode));
+			ci->i_flushing_caps = 0;
+			list_del_init(&ci->i_flushing_item);
+			mdsc->num_cap_flushing--;
+			drop = 1;
+		}
+		if (drop && ci->i_wrbuffer_ref) {
+			pr_info(" dropping dirty data for %p %lld\n",
+				inode, ceph_ino(inode));
+			ci->i_wrbuffer_ref = 0;
+			ci->i_wrbuffer_ref_head = 0;
+			drop++;
+		}
+		spin_unlock(&mdsc->cap_dirty_lock);
+	}
+	spin_unlock(&inode->i_lock);
+	while (drop--)
+		iput(inode);
+	return 0;
+}
+
+/*
+ * caller must hold session s_mutex
+ */
+static void remove_session_caps(struct ceph_mds_session *session)
+{
+	dout("remove_session_caps on %p\n", session);
+	iterate_session_caps(session, remove_session_caps_cb, NULL);
+	BUG_ON(session->s_nr_caps > 0);
+	BUG_ON(!list_empty(&session->s_cap_flushing));
+	cleanup_cap_releases(session);
+}
+
+/*
+ * wake up any threads waiting on this session's caps.  if the cap is
+ * old (didn't get renewed on the client reconnect), remove it now.
+ *
+ * caller must hold s_mutex.
+ */
+static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,
+			      void *arg)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+
+	wake_up_all(&ci->i_cap_wq);
+	if (arg) {
+		spin_lock(&inode->i_lock);
+		ci->i_wanted_max_size = 0;
+		ci->i_requested_max_size = 0;
+		spin_unlock(&inode->i_lock);
+	}
+	return 0;
+}
+
+static void wake_up_session_caps(struct ceph_mds_session *session,
+				 int reconnect)
+{
+	dout("wake_up_session_caps %p mds%d\n", session, session->s_mds);
+	iterate_session_caps(session, wake_up_session_cb,
+			     (void *)(unsigned long)reconnect);
+}
+
+/*
+ * Send periodic message to MDS renewing all currently held caps.  The
+ * ack will reset the expiration for all caps from this session.
+ *
+ * caller holds s_mutex
+ */
+static int send_renew_caps(struct ceph_mds_client *mdsc,
+			   struct ceph_mds_session *session)
+{
+	struct ceph_msg *msg;
+	int state;
+
+	if (time_after_eq(jiffies, session->s_cap_ttl) &&
+	    time_after_eq(session->s_cap_ttl, session->s_renew_requested))
+		pr_info("mds%d caps stale\n", session->s_mds);
+	session->s_renew_requested = jiffies;
+
+	/* do not try to renew caps until a recovering mds has reconnected
+	 * with its clients. */
+	state = ceph_mdsmap_get_state(mdsc->mdsmap, session->s_mds);
+	if (state < CEPH_MDS_STATE_RECONNECT) {
+		dout("send_renew_caps ignoring mds%d (%s)\n",
+		     session->s_mds, ceph_mds_state_name(state));
+		return 0;
+	}
+
+	dout("send_renew_caps to mds%d (%s)\n", session->s_mds,
+		ceph_mds_state_name(state));
+	msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS,
+				 ++session->s_renew_seq);
+	if (!msg)
+		return -ENOMEM;
+	ceph_con_send(&session->s_con, msg);
+	return 0;
+}
+
+/*
+ * Note new cap ttl, and any transition from stale -> not stale (fresh?).
+ *
+ * Called under session->s_mutex
+ */
+static void renewed_caps(struct ceph_mds_client *mdsc,
+			 struct ceph_mds_session *session, int is_renew)
+{
+	int was_stale;
+	int wake = 0;
+
+	spin_lock(&session->s_cap_lock);
+	was_stale = is_renew && (session->s_cap_ttl == 0 ||
+				 time_after_eq(jiffies, session->s_cap_ttl));
+
+	session->s_cap_ttl = session->s_renew_requested +
+		mdsc->mdsmap->m_session_timeout*HZ;
+
+	if (was_stale) {
+		if (time_before(jiffies, session->s_cap_ttl)) {
+			pr_info("mds%d caps renewed\n", session->s_mds);
+			wake = 1;
+		} else {
+			pr_info("mds%d caps still stale\n", session->s_mds);
+		}
+	}
+	dout("renewed_caps mds%d ttl now %lu, was %s, now %s\n",
+	     session->s_mds, session->s_cap_ttl, was_stale ? "stale" : "fresh",
+	     time_before(jiffies, session->s_cap_ttl) ? "stale" : "fresh");
+	spin_unlock(&session->s_cap_lock);
+
+	if (wake)
+		wake_up_session_caps(session, 0);
+}
+
+/*
+ * send a session close request
+ */
+static int request_close_session(struct ceph_mds_client *mdsc,
+				 struct ceph_mds_session *session)
+{
+	struct ceph_msg *msg;
+
+	dout("request_close_session mds%d state %s seq %lld\n",
+	     session->s_mds, session_state_name(session->s_state),
+	     session->s_seq);
+	msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq);
+	if (!msg)
+		return -ENOMEM;
+	ceph_con_send(&session->s_con, msg);
+	return 0;
+}
+
+/*
+ * Called with s_mutex held.
+ */
+static int __close_session(struct ceph_mds_client *mdsc,
+			 struct ceph_mds_session *session)
+{
+	if (session->s_state >= CEPH_MDS_SESSION_CLOSING)
+		return 0;
+	session->s_state = CEPH_MDS_SESSION_CLOSING;
+	return request_close_session(mdsc, session);
+}
+
+/*
+ * Trim old(er) caps.
+ *
+ * Because we can't cache an inode without one or more caps, we do
+ * this indirectly: if a cap is unused, we prune its aliases, at which
+ * point the inode will hopefully get dropped to.
+ *
+ * Yes, this is a bit sloppy.  Our only real goal here is to respond to
+ * memory pressure from the MDS, though, so it needn't be perfect.
+ */
+static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
+{
+	struct ceph_mds_session *session = arg;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	int used, oissued, mine;
+
+	if (session->s_trim_caps <= 0)
+		return -1;
+
+	spin_lock(&inode->i_lock);
+	mine = cap->issued | cap->implemented;
+	used = __ceph_caps_used(ci);
+	oissued = __ceph_caps_issued_other(ci, cap);
+
+	dout("trim_caps_cb %p cap %p mine %s oissued %s used %s\n",
+	     inode, cap, ceph_cap_string(mine), ceph_cap_string(oissued),
+	     ceph_cap_string(used));
+	if (ci->i_dirty_caps)
+		goto out;   /* dirty caps */
+	if ((used & ~oissued) & mine)
+		goto out;   /* we need these caps */
+
+	session->s_trim_caps--;
+	if (oissued) {
+		/* we aren't the only cap.. just remove us */
+		__ceph_remove_cap(cap);
+	} else {
+		/* try to drop referring dentries */
+		spin_unlock(&inode->i_lock);
+		d_prune_aliases(inode);
+		dout("trim_caps_cb %p cap %p  pruned, count now %d\n",
+		     inode, cap, atomic_read(&inode->i_count));
+		return 0;
+	}
+
+out:
+	spin_unlock(&inode->i_lock);
+	return 0;
+}
+
+/*
+ * Trim session cap count down to some max number.
+ */
+static int trim_caps(struct ceph_mds_client *mdsc,
+		     struct ceph_mds_session *session,
+		     int max_caps)
+{
+	int trim_caps = session->s_nr_caps - max_caps;
+
+	dout("trim_caps mds%d start: %d / %d, trim %d\n",
+	     session->s_mds, session->s_nr_caps, max_caps, trim_caps);
+	if (trim_caps > 0) {
+		session->s_trim_caps = trim_caps;
+		iterate_session_caps(session, trim_caps_cb, session);
+		dout("trim_caps mds%d done: %d / %d, trimmed %d\n",
+		     session->s_mds, session->s_nr_caps, max_caps,
+			trim_caps - session->s_trim_caps);
+		session->s_trim_caps = 0;
+	}
+	return 0;
+}
+
+/*
+ * Allocate cap_release messages.  If there is a partially full message
+ * in the queue, try to allocate enough to cover it's remainder, so that
+ * we can send it immediately.
+ *
+ * Called under s_mutex.
+ */
+int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
+			  struct ceph_mds_session *session,
+			  int extra)
+{
+	struct ceph_msg *msg;
+	struct ceph_mds_cap_release *head;
+	int err = -ENOMEM;
+
+	if (extra < 0)
+		extra = mdsc->client->mount_args->cap_release_safety;
+
+	spin_lock(&session->s_cap_lock);
+
+	if (!list_empty(&session->s_cap_releases)) {
+		msg = list_first_entry(&session->s_cap_releases,
+				       struct ceph_msg,
+				 list_head);
+		head = msg->front.iov_base;
+		extra += CEPH_CAPS_PER_RELEASE - le32_to_cpu(head->num);
+	}
+
+	while (session->s_num_cap_releases < session->s_nr_caps + extra) {
+		spin_unlock(&session->s_cap_lock);
+		msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE,
+				   GFP_NOFS);
+		if (!msg)
+			goto out_unlocked;
+		dout("add_cap_releases %p msg %p now %d\n", session, msg,
+		     (int)msg->front.iov_len);
+		head = msg->front.iov_base;
+		head->num = cpu_to_le32(0);
+		msg->front.iov_len = sizeof(*head);
+		spin_lock(&session->s_cap_lock);
+		list_add(&msg->list_head, &session->s_cap_releases);
+		session->s_num_cap_releases += CEPH_CAPS_PER_RELEASE;
+	}
+
+	if (!list_empty(&session->s_cap_releases)) {
+		msg = list_first_entry(&session->s_cap_releases,
+				       struct ceph_msg,
+				       list_head);
+		head = msg->front.iov_base;
+		if (head->num) {
+			dout(" queueing non-full %p (%d)\n", msg,
+			     le32_to_cpu(head->num));
+			list_move_tail(&msg->list_head,
+				      &session->s_cap_releases_done);
+			session->s_num_cap_releases -=
+				CEPH_CAPS_PER_RELEASE - le32_to_cpu(head->num);
+		}
+	}
+	err = 0;
+	spin_unlock(&session->s_cap_lock);
+out_unlocked:
+	return err;
+}
+
+/*
+ * flush all dirty inode data to disk.
+ *
+ * returns true if we've flushed through want_flush_seq
+ */
+static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq)
+{
+	int mds, ret = 1;
+
+	dout("check_cap_flush want %lld\n", want_flush_seq);
+	mutex_lock(&mdsc->mutex);
+	for (mds = 0; ret && mds < mdsc->max_sessions; mds++) {
+		struct ceph_mds_session *session = mdsc->sessions[mds];
+
+		if (!session)
+			continue;
+		get_session(session);
+		mutex_unlock(&mdsc->mutex);
+
+		mutex_lock(&session->s_mutex);
+		if (!list_empty(&session->s_cap_flushing)) {
+			struct ceph_inode_info *ci =
+				list_entry(session->s_cap_flushing.next,
+					   struct ceph_inode_info,
+					   i_flushing_item);
+			struct inode *inode = &ci->vfs_inode;
+
+			spin_lock(&inode->i_lock);
+			if (ci->i_cap_flush_seq <= want_flush_seq) {
+				dout("check_cap_flush still flushing %p "
+				     "seq %lld <= %lld to mds%d\n", inode,
+				     ci->i_cap_flush_seq, want_flush_seq,
+				     session->s_mds);
+				ret = 0;
+			}
+			spin_unlock(&inode->i_lock);
+		}
+		mutex_unlock(&session->s_mutex);
+		ceph_put_mds_session(session);
+
+		if (!ret)
+			return ret;
+		mutex_lock(&mdsc->mutex);
+	}
+
+	mutex_unlock(&mdsc->mutex);
+	dout("check_cap_flush ok, flushed thru %lld\n", want_flush_seq);
+	return ret;
+}
+
+/*
+ * called under s_mutex
+ */
+void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
+			    struct ceph_mds_session *session)
+{
+	struct ceph_msg *msg;
+
+	dout("send_cap_releases mds%d\n", session->s_mds);
+	spin_lock(&session->s_cap_lock);
+	while (!list_empty(&session->s_cap_releases_done)) {
+		msg = list_first_entry(&session->s_cap_releases_done,
+				 struct ceph_msg, list_head);
+		list_del_init(&msg->list_head);
+		spin_unlock(&session->s_cap_lock);
+		msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
+		dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
+		ceph_con_send(&session->s_con, msg);
+		spin_lock(&session->s_cap_lock);
+	}
+	spin_unlock(&session->s_cap_lock);
+}
+
+static void discard_cap_releases(struct ceph_mds_client *mdsc,
+				 struct ceph_mds_session *session)
+{
+	struct ceph_msg *msg;
+	struct ceph_mds_cap_release *head;
+	unsigned num;
+
+	dout("discard_cap_releases mds%d\n", session->s_mds);
+	spin_lock(&session->s_cap_lock);
+
+	/* zero out the in-progress message */
+	msg = list_first_entry(&session->s_cap_releases,
+			       struct ceph_msg, list_head);
+	head = msg->front.iov_base;
+	num = le32_to_cpu(head->num);
+	dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg, num);
+	head->num = cpu_to_le32(0);
+	session->s_num_cap_releases += num;
+
+	/* requeue completed messages */
+	while (!list_empty(&session->s_cap_releases_done)) {
+		msg = list_first_entry(&session->s_cap_releases_done,
+				 struct ceph_msg, list_head);
+		list_del_init(&msg->list_head);
+
+		head = msg->front.iov_base;
+		num = le32_to_cpu(head->num);
+		dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg,
+		     num);
+		session->s_num_cap_releases += num;
+		head->num = cpu_to_le32(0);
+		msg->front.iov_len = sizeof(*head);
+		list_add(&msg->list_head, &session->s_cap_releases);
+	}
+
+	spin_unlock(&session->s_cap_lock);
+}
+
+/*
+ * requests
+ */
+
+/*
+ * Create an mds request.
+ */
+struct ceph_mds_request *
+ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
+{
+	struct ceph_mds_request *req = kzalloc(sizeof(*req), GFP_NOFS);
+
+	if (!req)
+		return ERR_PTR(-ENOMEM);
+
+	mutex_init(&req->r_fill_mutex);
+	req->r_started = jiffies;
+	req->r_resend_mds = -1;
+	INIT_LIST_HEAD(&req->r_unsafe_dir_item);
+	req->r_fmode = -1;
+	kref_init(&req->r_kref);
+	INIT_LIST_HEAD(&req->r_wait);
+	init_completion(&req->r_completion);
+	init_completion(&req->r_safe_completion);
+	INIT_LIST_HEAD(&req->r_unsafe_item);
+
+	req->r_op = op;
+	req->r_direct_mode = mode;
+	return req;
+}
+
+/*
+ * return oldest (lowest) request, tid in request tree, 0 if none.
+ *
+ * called under mdsc->mutex.
+ */
+static struct ceph_mds_request *__get_oldest_req(struct ceph_mds_client *mdsc)
+{
+	if (RB_EMPTY_ROOT(&mdsc->request_tree))
+		return NULL;
+	return rb_entry(rb_first(&mdsc->request_tree),
+			struct ceph_mds_request, r_node);
+}
+
+static u64 __get_oldest_tid(struct ceph_mds_client *mdsc)
+{
+	struct ceph_mds_request *req = __get_oldest_req(mdsc);
+
+	if (req)
+		return req->r_tid;
+	return 0;
+}
+
+/*
+ * Build a dentry's path.  Allocate on heap; caller must kfree.  Based
+ * on build_path_from_dentry in fs/cifs/dir.c.
+ *
+ * If @stop_on_nosnap, generate path relative to the first non-snapped
+ * inode.
+ *
+ * Encode hidden .snap dirs as a double /, i.e.
+ *   foo/.snap/bar -> foo//bar
+ */
+char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
+			   int stop_on_nosnap)
+{
+	struct dentry *temp;
+	char *path;
+	int len, pos;
+
+	if (dentry == NULL)
+		return ERR_PTR(-EINVAL);
+
+retry:
+	len = 0;
+	for (temp = dentry; !IS_ROOT(temp);) {
+		struct inode *inode = temp->d_inode;
+		if (inode && ceph_snap(inode) == CEPH_SNAPDIR)
+			len++;  /* slash only */
+		else if (stop_on_nosnap && inode &&
+			 ceph_snap(inode) == CEPH_NOSNAP)
+			break;
+		else
+			len += 1 + temp->d_name.len;
+		temp = temp->d_parent;
+		if (temp == NULL) {
+			pr_err("build_path corrupt dentry %p\n", dentry);
+			return ERR_PTR(-EINVAL);
+		}
+	}
+	if (len)
+		len--;  /* no leading '/' */
+
+	path = kmalloc(len+1, GFP_NOFS);
+	if (path == NULL)
+		return ERR_PTR(-ENOMEM);
+	pos = len;
+	path[pos] = 0;	/* trailing null */
+	for (temp = dentry; !IS_ROOT(temp) && pos != 0; ) {
+		struct inode *inode = temp->d_inode;
+
+		if (inode && ceph_snap(inode) == CEPH_SNAPDIR) {
+			dout("build_path path+%d: %p SNAPDIR\n",
+			     pos, temp);
+		} else if (stop_on_nosnap && inode &&
+			   ceph_snap(inode) == CEPH_NOSNAP) {
+			break;
+		} else {
+			pos -= temp->d_name.len;
+			if (pos < 0)
+				break;
+			strncpy(path + pos, temp->d_name.name,
+				temp->d_name.len);
+		}
+		if (pos)
+			path[--pos] = '/';
+		temp = temp->d_parent;
+		if (temp == NULL) {
+			pr_err("build_path corrupt dentry\n");
+			kfree(path);
+			return ERR_PTR(-EINVAL);
+		}
+	}
+	if (pos != 0) {
+		pr_err("build_path did not end path lookup where "
+		       "expected, namelen is %d, pos is %d\n", len, pos);
+		/* presumably this is only possible if racing with a
+		   rename of one of the parent directories (we can not
+		   lock the dentries above us to prevent this, but
+		   retrying should be harmless) */
+		kfree(path);
+		goto retry;
+	}
+
+	*base = ceph_ino(temp->d_inode);
+	*plen = len;
+	dout("build_path on %p %d built %llx '%.*s'\n",
+	     dentry, atomic_read(&dentry->d_count), *base, len, path);
+	return path;
+}
+
+static int build_dentry_path(struct dentry *dentry,
+			     const char **ppath, int *ppathlen, u64 *pino,
+			     int *pfreepath)
+{
+	char *path;
+
+	if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP) {
+		*pino = ceph_ino(dentry->d_parent->d_inode);
+		*ppath = dentry->d_name.name;
+		*ppathlen = dentry->d_name.len;
+		return 0;
+	}
+	path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
+	if (IS_ERR(path))
+		return PTR_ERR(path);
+	*ppath = path;
+	*pfreepath = 1;
+	return 0;
+}
+
+static int build_inode_path(struct inode *inode,
+			    const char **ppath, int *ppathlen, u64 *pino,
+			    int *pfreepath)
+{
+	struct dentry *dentry;
+	char *path;
+
+	if (ceph_snap(inode) == CEPH_NOSNAP) {
+		*pino = ceph_ino(inode);
+		*ppathlen = 0;
+		return 0;
+	}
+	dentry = d_find_alias(inode);
+	path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
+	dput(dentry);
+	if (IS_ERR(path))
+		return PTR_ERR(path);
+	*ppath = path;
+	*pfreepath = 1;
+	return 0;
+}
+
+/*
+ * request arguments may be specified via an inode *, a dentry *, or
+ * an explicit ino+path.
+ */
+static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
+				  const char *rpath, u64 rino,
+				  const char **ppath, int *pathlen,
+				  u64 *ino, int *freepath)
+{
+	int r = 0;
+
+	if (rinode) {
+		r = build_inode_path(rinode, ppath, pathlen, ino, freepath);
+		dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode),
+		     ceph_snap(rinode));
+	} else if (rdentry) {
+		r = build_dentry_path(rdentry, ppath, pathlen, ino, freepath);
+		dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen,
+		     *ppath);
+	} else if (rpath) {
+		*ino = rino;
+		*ppath = rpath;
+		*pathlen = strlen(rpath);
+		dout(" path %.*s\n", *pathlen, rpath);
+	}
+
+	return r;
+}
+
+/*
+ * called under mdsc->mutex
+ */
+static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
+					       struct ceph_mds_request *req,
+					       int mds)
+{
+	struct ceph_msg *msg;
+	struct ceph_mds_request_head *head;
+	const char *path1 = NULL;
+	const char *path2 = NULL;
+	u64 ino1 = 0, ino2 = 0;
+	int pathlen1 = 0, pathlen2 = 0;
+	int freepath1 = 0, freepath2 = 0;
+	int len;
+	u16 releases;
+	void *p, *end;
+	int ret;
+
+	ret = set_request_path_attr(req->r_inode, req->r_dentry,
+			      req->r_path1, req->r_ino1.ino,
+			      &path1, &pathlen1, &ino1, &freepath1);
+	if (ret < 0) {
+		msg = ERR_PTR(ret);
+		goto out;
+	}
+
+	ret = set_request_path_attr(NULL, req->r_old_dentry,
+			      req->r_path2, req->r_ino2.ino,
+			      &path2, &pathlen2, &ino2, &freepath2);
+	if (ret < 0) {
+		msg = ERR_PTR(ret);
+		goto out_free1;
+	}
+
+	len = sizeof(*head) +
+		pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64));
+
+	/* calculate (max) length for cap releases */
+	len += sizeof(struct ceph_mds_request_release) *
+		(!!req->r_inode_drop + !!req->r_dentry_drop +
+		 !!req->r_old_inode_drop + !!req->r_old_dentry_drop);
+	if (req->r_dentry_drop)
+		len += req->r_dentry->d_name.len;
+	if (req->r_old_dentry_drop)
+		len += req->r_old_dentry->d_name.len;
+
+	msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, GFP_NOFS);
+	if (!msg) {
+		msg = ERR_PTR(-ENOMEM);
+		goto out_free2;
+	}
+
+	msg->hdr.tid = cpu_to_le64(req->r_tid);
+
+	head = msg->front.iov_base;
+	p = msg->front.iov_base + sizeof(*head);
+	end = msg->front.iov_base + msg->front.iov_len;
+
+	head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
+	head->op = cpu_to_le32(req->r_op);
+	head->caller_uid = cpu_to_le32(current_fsuid());
+	head->caller_gid = cpu_to_le32(current_fsgid());
+	head->args = req->r_args;
+
+	ceph_encode_filepath(&p, end, ino1, path1);
+	ceph_encode_filepath(&p, end, ino2, path2);
+
+	/* make note of release offset, in case we need to replay */
+	req->r_request_release_offset = p - msg->front.iov_base;
+
+	/* cap releases */
+	releases = 0;
+	if (req->r_inode_drop)
+		releases += ceph_encode_inode_release(&p,
+		      req->r_inode ? req->r_inode : req->r_dentry->d_inode,
+		      mds, req->r_inode_drop, req->r_inode_unless, 0);
+	if (req->r_dentry_drop)
+		releases += ceph_encode_dentry_release(&p, req->r_dentry,
+		       mds, req->r_dentry_drop, req->r_dentry_unless);
+	if (req->r_old_dentry_drop)
+		releases += ceph_encode_dentry_release(&p, req->r_old_dentry,
+		       mds, req->r_old_dentry_drop, req->r_old_dentry_unless);
+	if (req->r_old_inode_drop)
+		releases += ceph_encode_inode_release(&p,
+		      req->r_old_dentry->d_inode,
+		      mds, req->r_old_inode_drop, req->r_old_inode_unless, 0);
+	head->num_releases = cpu_to_le16(releases);
+
+	BUG_ON(p > end);
+	msg->front.iov_len = p - msg->front.iov_base;
+	msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
+
+	msg->pages = req->r_pages;
+	msg->nr_pages = req->r_num_pages;
+	msg->hdr.data_len = cpu_to_le32(req->r_data_len);
+	msg->hdr.data_off = cpu_to_le16(0);
+
+out_free2:
+	if (freepath2)
+		kfree((char *)path2);
+out_free1:
+	if (freepath1)
+		kfree((char *)path1);
+out:
+	return msg;
+}
+
+/*
+ * called under mdsc->mutex if error, under no mutex if
+ * success.
+ */
+static void complete_request(struct ceph_mds_client *mdsc,
+			     struct ceph_mds_request *req)
+{
+	if (req->r_callback)
+		req->r_callback(mdsc, req);
+	else
+		complete_all(&req->r_completion);
+}
+
+/*
+ * called under mdsc->mutex
+ */
+static int __prepare_send_request(struct ceph_mds_client *mdsc,
+				  struct ceph_mds_request *req,
+				  int mds)
+{
+	struct ceph_mds_request_head *rhead;
+	struct ceph_msg *msg;
+	int flags = 0;
+
+	req->r_mds = mds;
+	req->r_attempts++;
+	dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req,
+	     req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts);
+
+	if (req->r_got_unsafe) {
+		/*
+		 * Replay.  Do not regenerate message (and rebuild
+		 * paths, etc.); just use the original message.
+		 * Rebuilding paths will break for renames because
+		 * d_move mangles the src name.
+		 */
+		msg = req->r_request;
+		rhead = msg->front.iov_base;
+
+		flags = le32_to_cpu(rhead->flags);
+		flags |= CEPH_MDS_FLAG_REPLAY;
+		rhead->flags = cpu_to_le32(flags);
+
+		if (req->r_target_inode)
+			rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));
+
+		rhead->num_retry = req->r_attempts - 1;
+
+		/* remove cap/dentry releases from message */
+		rhead->num_releases = 0;
+		msg->hdr.front_len = cpu_to_le32(req->r_request_release_offset);
+		msg->front.iov_len = req->r_request_release_offset;
+		return 0;
+	}
+
+	if (req->r_request) {
+		ceph_msg_put(req->r_request);
+		req->r_request = NULL;
+	}
+	msg = create_request_message(mdsc, req, mds);
+	if (IS_ERR(msg)) {
+		req->r_err = PTR_ERR(msg);
+		complete_request(mdsc, req);
+		return PTR_ERR(msg);
+	}
+	req->r_request = msg;
+
+	rhead = msg->front.iov_base;
+	rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc));
+	if (req->r_got_unsafe)
+		flags |= CEPH_MDS_FLAG_REPLAY;
+	if (req->r_locked_dir)
+		flags |= CEPH_MDS_FLAG_WANT_DENTRY;
+	rhead->flags = cpu_to_le32(flags);
+	rhead->num_fwd = req->r_num_fwd;
+	rhead->num_retry = req->r_attempts - 1;
+	rhead->ino = 0;
+
+	dout(" r_locked_dir = %p\n", req->r_locked_dir);
+	return 0;
+}
+
+/*
+ * send request, or put it on the appropriate wait list.
+ */
+static int __do_request(struct ceph_mds_client *mdsc,
+			struct ceph_mds_request *req)
+{
+	struct ceph_mds_session *session = NULL;
+	int mds = -1;
+	int err = -EAGAIN;
+
+	if (req->r_err || req->r_got_result)
+		goto out;
+
+	if (req->r_timeout &&
+	    time_after_eq(jiffies, req->r_started + req->r_timeout)) {
+		dout("do_request timed out\n");
+		err = -EIO;
+		goto finish;
+	}
+
+	mds = __choose_mds(mdsc, req);
+	if (mds < 0 ||
+	    ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) {
+		dout("do_request no mds or not active, waiting for map\n");
+		list_add(&req->r_wait, &mdsc->waiting_for_map);
+		goto out;
+	}
+
+	/* get, open session */
+	session = __ceph_lookup_mds_session(mdsc, mds);
+	if (!session) {
+		session = register_session(mdsc, mds);
+		if (IS_ERR(session)) {
+			err = PTR_ERR(session);
+			goto finish;
+		}
+	}
+	dout("do_request mds%d session %p state %s\n", mds, session,
+	     session_state_name(session->s_state));
+	if (session->s_state != CEPH_MDS_SESSION_OPEN &&
+	    session->s_state != CEPH_MDS_SESSION_HUNG) {
+		if (session->s_state == CEPH_MDS_SESSION_NEW ||
+		    session->s_state == CEPH_MDS_SESSION_CLOSING)
+			__open_session(mdsc, session);
+		list_add(&req->r_wait, &session->s_waiting);
+		goto out_session;
+	}
+
+	/* send request */
+	req->r_session = get_session(session);
+	req->r_resend_mds = -1;   /* forget any previous mds hint */
+
+	if (req->r_request_started == 0)   /* note request start time */
+		req->r_request_started = jiffies;
+
+	err = __prepare_send_request(mdsc, req, mds);
+	if (!err) {
+		ceph_msg_get(req->r_request);
+		ceph_con_send(&session->s_con, req->r_request);
+	}
+
+out_session:
+	ceph_put_mds_session(session);
+out:
+	return err;
+
+finish:
+	req->r_err = err;
+	complete_request(mdsc, req);
+	goto out;
+}
+
+/*
+ * called under mdsc->mutex
+ */
+static void __wake_requests(struct ceph_mds_client *mdsc,
+			    struct list_head *head)
+{
+	struct ceph_mds_request *req, *nreq;
+
+	list_for_each_entry_safe(req, nreq, head, r_wait) {
+		list_del_init(&req->r_wait);
+		__do_request(mdsc, req);
+	}
+}
+
+/*
+ * Wake up threads with requests pending for @mds, so that they can
+ * resubmit their requests to a possibly different mds.
+ */
+static void kick_requests(struct ceph_mds_client *mdsc, int mds)
+{
+	struct ceph_mds_request *req;
+	struct rb_node *p;
+
+	dout("kick_requests mds%d\n", mds);
+	for (p = rb_first(&mdsc->request_tree); p; p = rb_next(p)) {
+		req = rb_entry(p, struct ceph_mds_request, r_node);
+		if (req->r_got_unsafe)
+			continue;
+		if (req->r_session &&
+		    req->r_session->s_mds == mds) {
+			dout(" kicking tid %llu\n", req->r_tid);
+			put_request_session(req);
+			__do_request(mdsc, req);
+		}
+	}
+}
+
+void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
+			      struct ceph_mds_request *req)
+{
+	dout("submit_request on %p\n", req);
+	mutex_lock(&mdsc->mutex);
+	__register_request(mdsc, req, NULL);
+	__do_request(mdsc, req);
+	mutex_unlock(&mdsc->mutex);
+}
+
+/*
+ * Synchrously perform an mds request.  Take care of all of the
+ * session setup, forwarding, retry details.
+ */
+int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
+			 struct inode *dir,
+			 struct ceph_mds_request *req)
+{
+	int err;
+
+	dout("do_request on %p\n", req);
+
+	/* take CAP_PIN refs for r_inode, r_locked_dir, r_old_dentry */
+	if (req->r_inode)
+		ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
+	if (req->r_locked_dir)
+		ceph_get_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN);
+	if (req->r_old_dentry)
+		ceph_get_cap_refs(
+			ceph_inode(req->r_old_dentry->d_parent->d_inode),
+			CEPH_CAP_PIN);
+
+	/* issue */
+	mutex_lock(&mdsc->mutex);
+	__register_request(mdsc, req, dir);
+	__do_request(mdsc, req);
+
+	if (req->r_err) {
+		err = req->r_err;
+		__unregister_request(mdsc, req);
+		dout("do_request early error %d\n", err);
+		goto out;
+	}
+
+	/* wait */
+	mutex_unlock(&mdsc->mutex);
+	dout("do_request waiting\n");
+	if (req->r_timeout) {
+		err = (long)wait_for_completion_killable_timeout(
+			&req->r_completion, req->r_timeout);
+		if (err == 0)
+			err = -EIO;
+	} else {
+		err = wait_for_completion_killable(&req->r_completion);
+	}
+	dout("do_request waited, got %d\n", err);
+	mutex_lock(&mdsc->mutex);
+
+	/* only abort if we didn't race with a real reply */
+	if (req->r_got_result) {
+		err = le32_to_cpu(req->r_reply_info.head->result);
+	} else if (err < 0) {
+		dout("aborted request %lld with %d\n", req->r_tid, err);
+
+		/*
+		 * ensure we aren't running concurrently with
+		 * ceph_fill_trace or ceph_readdir_prepopulate, which
+		 * rely on locks (dir mutex) held by our caller.
+		 */
+		mutex_lock(&req->r_fill_mutex);
+		req->r_err = err;
+		req->r_aborted = true;
+		mutex_unlock(&req->r_fill_mutex);
+
+		if (req->r_locked_dir &&
+		    (req->r_op & CEPH_MDS_OP_WRITE))
+			ceph_invalidate_dir_request(req);
+	} else {
+		err = req->r_err;
+	}
+
+out:
+	mutex_unlock(&mdsc->mutex);
+	dout("do_request %p done, result %d\n", req, err);
+	return err;
+}
+
+/*
+ * Invalidate dir I_COMPLETE, dentry lease state on an aborted MDS
+ * namespace request.
+ */
+void ceph_invalidate_dir_request(struct ceph_mds_request *req)
+{
+	struct inode *inode = req->r_locked_dir;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+
+	dout("invalidate_dir_request %p (I_COMPLETE, lease(s))\n", inode);
+	spin_lock(&inode->i_lock);
+	ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
+	ci->i_release_count++;
+	spin_unlock(&inode->i_lock);
+
+	if (req->r_dentry)
+		ceph_invalidate_dentry_lease(req->r_dentry);
+	if (req->r_old_dentry)
+		ceph_invalidate_dentry_lease(req->r_old_dentry);
+}
+
+/*
+ * Handle mds reply.
+ *
+ * We take the session mutex and parse and process the reply immediately.
+ * This preserves the logical ordering of replies, capabilities, etc., sent
+ * by the MDS as they are applied to our local cache.
+ */
+static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
+{
+	struct ceph_mds_client *mdsc = session->s_mdsc;
+	struct ceph_mds_request *req;
+	struct ceph_mds_reply_head *head = msg->front.iov_base;
+	struct ceph_mds_reply_info_parsed *rinfo;  /* parsed reply info */
+	u64 tid;
+	int err, result;
+	int mds = session->s_mds;
+
+	if (msg->front.iov_len < sizeof(*head)) {
+		pr_err("mdsc_handle_reply got corrupt (short) reply\n");
+		ceph_msg_dump(msg);
+		return;
+	}
+
+	/* get request, session */
+	tid = le64_to_cpu(msg->hdr.tid);
+	mutex_lock(&mdsc->mutex);
+	req = __lookup_request(mdsc, tid);
+	if (!req) {
+		dout("handle_reply on unknown tid %llu\n", tid);
+		mutex_unlock(&mdsc->mutex);
+		return;
+	}
+	dout("handle_reply %p\n", req);
+
+	/* correct session? */
+	if (req->r_session != session) {
+		pr_err("mdsc_handle_reply got %llu on session mds%d"
+		       " not mds%d\n", tid, session->s_mds,
+		       req->r_session ? req->r_session->s_mds : -1);
+		mutex_unlock(&mdsc->mutex);
+		goto out;
+	}
+
+	/* dup? */
+	if ((req->r_got_unsafe && !head->safe) ||
+	    (req->r_got_safe && head->safe)) {
+		pr_warning("got a dup %s reply on %llu from mds%d\n",
+			   head->safe ? "safe" : "unsafe", tid, mds);
+		mutex_unlock(&mdsc->mutex);
+		goto out;
+	}
+	if (req->r_got_safe && !head->safe) {
+		pr_warning("got unsafe after safe on %llu from mds%d\n",
+			   tid, mds);
+		mutex_unlock(&mdsc->mutex);
+		goto out;
+	}
+
+	result = le32_to_cpu(head->result);
+
+	/*
+	 * Tolerate 2 consecutive ESTALEs from the same mds.
+	 * FIXME: we should be looking at the cap migrate_seq.
+	 */
+	if (result == -ESTALE) {
+		req->r_direct_mode = USE_AUTH_MDS;
+		req->r_num_stale++;
+		if (req->r_num_stale <= 2) {
+			__do_request(mdsc, req);
+			mutex_unlock(&mdsc->mutex);
+			goto out;
+		}
+	} else {
+		req->r_num_stale = 0;
+	}
+
+	if (head->safe) {
+		req->r_got_safe = true;
+		__unregister_request(mdsc, req);
+		complete_all(&req->r_safe_completion);
+
+		if (req->r_got_unsafe) {
+			/*
+			 * We already handled the unsafe response, now do the
+			 * cleanup.  No need to examine the response; the MDS
+			 * doesn't include any result info in the safe
+			 * response.  And even if it did, there is nothing
+			 * useful we could do with a revised return value.
+			 */
+			dout("got safe reply %llu, mds%d\n", tid, mds);
+			list_del_init(&req->r_unsafe_item);
+
+			/* last unsafe request during umount? */
+			if (mdsc->stopping && !__get_oldest_req(mdsc))
+				complete_all(&mdsc->safe_umount_waiters);
+			mutex_unlock(&mdsc->mutex);
+			goto out;
+		}
+	} else {
+		req->r_got_unsafe = true;
+		list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe);
+	}
+
+	dout("handle_reply tid %lld result %d\n", tid, result);
+	rinfo = &req->r_reply_info;
+	err = parse_reply_info(msg, rinfo);
+	mutex_unlock(&mdsc->mutex);
+
+	mutex_lock(&session->s_mutex);
+	if (err < 0) {
+		pr_err("mdsc_handle_reply got corrupt reply mds%d\n", mds);
+		ceph_msg_dump(msg);
+		goto out_err;
+	}
+
+	/* snap trace */
+	if (rinfo->snapblob_len) {
+		down_write(&mdsc->snap_rwsem);
+		ceph_update_snap_trace(mdsc, rinfo->snapblob,
+			       rinfo->snapblob + rinfo->snapblob_len,
+			       le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP);
+		downgrade_write(&mdsc->snap_rwsem);
+	} else {
+		down_read(&mdsc->snap_rwsem);
+	}
+
+	/* insert trace into our cache */
+	mutex_lock(&req->r_fill_mutex);
+	err = ceph_fill_trace(mdsc->client->sb, req, req->r_session);
+	if (err == 0) {
+		if (result == 0 && rinfo->dir_nr)
+			ceph_readdir_prepopulate(req, req->r_session);
+		ceph_unreserve_caps(&req->r_caps_reservation);
+	}
+	mutex_unlock(&req->r_fill_mutex);
+
+	up_read(&mdsc->snap_rwsem);
+out_err:
+	mutex_lock(&mdsc->mutex);
+	if (!req->r_aborted) {
+		if (err) {
+			req->r_err = err;
+		} else {
+			req->r_reply = msg;
+			ceph_msg_get(msg);
+			req->r_got_result = true;
+		}
+	} else {
+		dout("reply arrived after request %lld was aborted\n", tid);
+	}
+	mutex_unlock(&mdsc->mutex);
+
+	ceph_add_cap_releases(mdsc, req->r_session, -1);
+	mutex_unlock(&session->s_mutex);
+
+	/* kick calling process */
+	complete_request(mdsc, req);
+out:
+	ceph_mdsc_put_request(req);
+	return;
+}
+
+
+
+/*
+ * handle mds notification that our request has been forwarded.
+ */
+static void handle_forward(struct ceph_mds_client *mdsc,
+			   struct ceph_mds_session *session,
+			   struct ceph_msg *msg)
+{
+	struct ceph_mds_request *req;
+	u64 tid = le64_to_cpu(msg->hdr.tid);
+	u32 next_mds;
+	u32 fwd_seq;
+	int err = -EINVAL;
+	void *p = msg->front.iov_base;
+	void *end = p + msg->front.iov_len;
+
+	ceph_decode_need(&p, end, 2*sizeof(u32), bad);
+	next_mds = ceph_decode_32(&p);
+	fwd_seq = ceph_decode_32(&p);
+
+	mutex_lock(&mdsc->mutex);
+	req = __lookup_request(mdsc, tid);
+	if (!req) {
+		dout("forward tid %llu to mds%d - req dne\n", tid, next_mds);
+		goto out;  /* dup reply? */
+	}
+
+	if (req->r_aborted) {
+		dout("forward tid %llu aborted, unregistering\n", tid);
+		__unregister_request(mdsc, req);
+	} else if (fwd_seq <= req->r_num_fwd) {
+		dout("forward tid %llu to mds%d - old seq %d <= %d\n",
+		     tid, next_mds, req->r_num_fwd, fwd_seq);
+	} else {
+		/* resend. forward race not possible; mds would drop */
+		dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds);
+		BUG_ON(req->r_err);
+		BUG_ON(req->r_got_result);
+		req->r_num_fwd = fwd_seq;
+		req->r_resend_mds = next_mds;
+		put_request_session(req);
+		__do_request(mdsc, req);
+	}
+	ceph_mdsc_put_request(req);
+out:
+	mutex_unlock(&mdsc->mutex);
+	return;
+
+bad:
+	pr_err("mdsc_handle_forward decode error err=%d\n", err);
+}
+
+/*
+ * handle a mds session control message
+ */
+static void handle_session(struct ceph_mds_session *session,
+			   struct ceph_msg *msg)
+{
+	struct ceph_mds_client *mdsc = session->s_mdsc;
+	u32 op;
+	u64 seq;
+	int mds = session->s_mds;
+	struct ceph_mds_session_head *h = msg->front.iov_base;
+	int wake = 0;
+
+	/* decode */
+	if (msg->front.iov_len != sizeof(*h))
+		goto bad;
+	op = le32_to_cpu(h->op);
+	seq = le64_to_cpu(h->seq);
+
+	mutex_lock(&mdsc->mutex);
+	if (op == CEPH_SESSION_CLOSE)
+		__unregister_session(mdsc, session);
+	/* FIXME: this ttl calculation is generous */
+	session->s_ttl = jiffies + HZ*mdsc->mdsmap->m_session_autoclose;
+	mutex_unlock(&mdsc->mutex);
+
+	mutex_lock(&session->s_mutex);
+
+	dout("handle_session mds%d %s %p state %s seq %llu\n",
+	     mds, ceph_session_op_name(op), session,
+	     session_state_name(session->s_state), seq);
+
+	if (session->s_state == CEPH_MDS_SESSION_HUNG) {
+		session->s_state = CEPH_MDS_SESSION_OPEN;
+		pr_info("mds%d came back\n", session->s_mds);
+	}
+
+	switch (op) {
+	case CEPH_SESSION_OPEN:
+		if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
+			pr_info("mds%d reconnect success\n", session->s_mds);
+		session->s_state = CEPH_MDS_SESSION_OPEN;
+		renewed_caps(mdsc, session, 0);
+		wake = 1;
+		if (mdsc->stopping)
+			__close_session(mdsc, session);
+		break;
+
+	case CEPH_SESSION_RENEWCAPS:
+		if (session->s_renew_seq == seq)
+			renewed_caps(mdsc, session, 1);
+		break;
+
+	case CEPH_SESSION_CLOSE:
+		if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
+			pr_info("mds%d reconnect denied\n", session->s_mds);
+		remove_session_caps(session);
+		wake = 1; /* for good measure */
+		complete_all(&mdsc->session_close_waiters);
+		kick_requests(mdsc, mds);
+		break;
+
+	case CEPH_SESSION_STALE:
+		pr_info("mds%d caps went stale, renewing\n",
+			session->s_mds);
+		spin_lock(&session->s_cap_lock);
+		session->s_cap_gen++;
+		session->s_cap_ttl = 0;
+		spin_unlock(&session->s_cap_lock);
+		send_renew_caps(mdsc, session);
+		break;
+
+	case CEPH_SESSION_RECALL_STATE:
+		trim_caps(mdsc, session, le32_to_cpu(h->max_caps));
+		break;
+
+	default:
+		pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds);
+		WARN_ON(1);
+	}
+
+	mutex_unlock(&session->s_mutex);
+	if (wake) {
+		mutex_lock(&mdsc->mutex);
+		__wake_requests(mdsc, &session->s_waiting);
+		mutex_unlock(&mdsc->mutex);
+	}
+	return;
+
+bad:
+	pr_err("mdsc_handle_session corrupt message mds%d len %d\n", mds,
+	       (int)msg->front.iov_len);
+	ceph_msg_dump(msg);
+	return;
+}
+
+
+/*
+ * called under session->mutex.
+ */
+static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
+				   struct ceph_mds_session *session)
+{
+	struct ceph_mds_request *req, *nreq;
+	int err;
+
+	dout("replay_unsafe_requests mds%d\n", session->s_mds);
+
+	mutex_lock(&mdsc->mutex);
+	list_for_each_entry_safe(req, nreq, &session->s_unsafe, r_unsafe_item) {
+		err = __prepare_send_request(mdsc, req, session->s_mds);
+		if (!err) {
+			ceph_msg_get(req->r_request);
+			ceph_con_send(&session->s_con, req->r_request);
+		}
+	}
+	mutex_unlock(&mdsc->mutex);
+}
+
+/*
+ * Encode information about a cap for a reconnect with the MDS.
+ */
+static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
+			  void *arg)
+{
+	struct ceph_mds_cap_reconnect rec;
+	struct ceph_inode_info *ci;
+	struct ceph_pagelist *pagelist = arg;
+	char *path;
+	int pathlen, err;
+	u64 pathbase;
+	struct dentry *dentry;
+
+	ci = cap->ci;
+
+	dout(" adding %p ino %llx.%llx cap %p %lld %s\n",
+	     inode, ceph_vinop(inode), cap, cap->cap_id,
+	     ceph_cap_string(cap->issued));
+	err = ceph_pagelist_encode_64(pagelist, ceph_ino(inode));
+	if (err)
+		return err;
+
+	dentry = d_find_alias(inode);
+	if (dentry) {
+		path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase, 0);
+		if (IS_ERR(path)) {
+			err = PTR_ERR(path);
+			BUG_ON(err);
+		}
+	} else {
+		path = NULL;
+		pathlen = 0;
+	}
+	err = ceph_pagelist_encode_string(pagelist, path, pathlen);
+	if (err)
+		goto out;
+
+	spin_lock(&inode->i_lock);
+	cap->seq = 0;        /* reset cap seq */
+	cap->issue_seq = 0;  /* and issue_seq */
+	rec.cap_id = cpu_to_le64(cap->cap_id);
+	rec.pathbase = cpu_to_le64(pathbase);
+	rec.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
+	rec.issued = cpu_to_le32(cap->issued);
+	rec.size = cpu_to_le64(inode->i_size);
+	ceph_encode_timespec(&rec.mtime, &inode->i_mtime);
+	ceph_encode_timespec(&rec.atime, &inode->i_atime);
+	rec.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
+	spin_unlock(&inode->i_lock);
+
+	err = ceph_pagelist_append(pagelist, &rec, sizeof(rec));
+
+out:
+	kfree(path);
+	dput(dentry);
+	return err;
+}
+
+
+/*
+ * If an MDS fails and recovers, clients need to reconnect in order to
+ * reestablish shared state.  This includes all caps issued through
+ * this session _and_ the snap_realm hierarchy.  Because it's not
+ * clear which snap realms the mds cares about, we send everything we
+ * know about.. that ensures we'll then get any new info the
+ * recovering MDS might have.
+ *
+ * This is a relatively heavyweight operation, but it's rare.
+ *
+ * called with mdsc->mutex held.
+ */
+static void send_mds_reconnect(struct ceph_mds_client *mdsc,
+			       struct ceph_mds_session *session)
+{
+	struct ceph_msg *reply;
+	struct rb_node *p;
+	int mds = session->s_mds;
+	int err = -ENOMEM;
+	struct ceph_pagelist *pagelist;
+
+	pr_info("mds%d reconnect start\n", mds);
+
+	pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS);
+	if (!pagelist)
+		goto fail_nopagelist;
+	ceph_pagelist_init(pagelist);
+
+	reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, GFP_NOFS);
+	if (!reply)
+		goto fail_nomsg;
+
+	mutex_lock(&session->s_mutex);
+	session->s_state = CEPH_MDS_SESSION_RECONNECTING;
+	session->s_seq = 0;
+
+	ceph_con_open(&session->s_con,
+		      ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
+
+	/* replay unsafe requests */
+	replay_unsafe_requests(mdsc, session);
+
+	down_read(&mdsc->snap_rwsem);
+
+	dout("session %p state %s\n", session,
+	     session_state_name(session->s_state));
+
+	/* drop old cap expires; we're about to reestablish that state */
+	discard_cap_releases(mdsc, session);
+
+	/* traverse this session's caps */
+	err = ceph_pagelist_encode_32(pagelist, session->s_nr_caps);
+	if (err)
+		goto fail;
+	err = iterate_session_caps(session, encode_caps_cb, pagelist);
+	if (err < 0)
+		goto fail;
+
+	/*
+	 * snaprealms.  we provide mds with the ino, seq (version), and
+	 * parent for all of our realms.  If the mds has any newer info,
+	 * it will tell us.
+	 */
+	for (p = rb_first(&mdsc->snap_realms); p; p = rb_next(p)) {
+		struct ceph_snap_realm *realm =
+			rb_entry(p, struct ceph_snap_realm, node);
+		struct ceph_mds_snaprealm_reconnect sr_rec;
+
+		dout(" adding snap realm %llx seq %lld parent %llx\n",
+		     realm->ino, realm->seq, realm->parent_ino);
+		sr_rec.ino = cpu_to_le64(realm->ino);
+		sr_rec.seq = cpu_to_le64(realm->seq);
+		sr_rec.parent = cpu_to_le64(realm->parent_ino);
+		err = ceph_pagelist_append(pagelist, &sr_rec, sizeof(sr_rec));
+		if (err)
+			goto fail;
+	}
+
+	reply->pagelist = pagelist;
+	reply->hdr.data_len = cpu_to_le32(pagelist->length);
+	reply->nr_pages = calc_pages_for(0, pagelist->length);
+	ceph_con_send(&session->s_con, reply);
+
+	mutex_unlock(&session->s_mutex);
+
+	mutex_lock(&mdsc->mutex);
+	__wake_requests(mdsc, &session->s_waiting);
+	mutex_unlock(&mdsc->mutex);
+
+	up_read(&mdsc->snap_rwsem);
+	return;
+
+fail:
+	ceph_msg_put(reply);
+	up_read(&mdsc->snap_rwsem);
+	mutex_unlock(&session->s_mutex);
+fail_nomsg:
+	ceph_pagelist_release(pagelist);
+	kfree(pagelist);
+fail_nopagelist:
+	pr_err("error %d preparing reconnect for mds%d\n", err, mds);
+	return;
+}
+
+
+/*
+ * compare old and new mdsmaps, kicking requests
+ * and closing out old connections as necessary
+ *
+ * called under mdsc->mutex.
+ */
+static void check_new_map(struct ceph_mds_client *mdsc,
+			  struct ceph_mdsmap *newmap,
+			  struct ceph_mdsmap *oldmap)
+{
+	int i;
+	int oldstate, newstate;
+	struct ceph_mds_session *s;
+
+	dout("check_new_map new %u old %u\n",
+	     newmap->m_epoch, oldmap->m_epoch);
+
+	for (i = 0; i < oldmap->m_max_mds && i < mdsc->max_sessions; i++) {
+		if (mdsc->sessions[i] == NULL)
+			continue;
+		s = mdsc->sessions[i];
+		oldstate = ceph_mdsmap_get_state(oldmap, i);
+		newstate = ceph_mdsmap_get_state(newmap, i);
+
+		dout("check_new_map mds%d state %s -> %s (session %s)\n",
+		     i, ceph_mds_state_name(oldstate),
+		     ceph_mds_state_name(newstate),
+		     session_state_name(s->s_state));
+
+		if (memcmp(ceph_mdsmap_get_addr(oldmap, i),
+			   ceph_mdsmap_get_addr(newmap, i),
+			   sizeof(struct ceph_entity_addr))) {
+			if (s->s_state == CEPH_MDS_SESSION_OPENING) {
+				/* the session never opened, just close it
+				 * out now */
+				__wake_requests(mdsc, &s->s_waiting);
+				__unregister_session(mdsc, s);
+			} else {
+				/* just close it */
+				mutex_unlock(&mdsc->mutex);
+				mutex_lock(&s->s_mutex);
+				mutex_lock(&mdsc->mutex);
+				ceph_con_close(&s->s_con);
+				mutex_unlock(&s->s_mutex);
+				s->s_state = CEPH_MDS_SESSION_RESTARTING;
+			}
+
+			/* kick any requests waiting on the recovering mds */
+			kick_requests(mdsc, i);
+		} else if (oldstate == newstate) {
+			continue;  /* nothing new with this mds */
+		}
+
+		/*
+		 * send reconnect?
+		 */
+		if (s->s_state == CEPH_MDS_SESSION_RESTARTING &&
+		    newstate >= CEPH_MDS_STATE_RECONNECT) {
+			mutex_unlock(&mdsc->mutex);
+			send_mds_reconnect(mdsc, s);
+			mutex_lock(&mdsc->mutex);
+		}
+
+		/*
+		 * kick request on any mds that has gone active.
+		 */
+		if (oldstate < CEPH_MDS_STATE_ACTIVE &&
+		    newstate >= CEPH_MDS_STATE_ACTIVE) {
+			if (oldstate != CEPH_MDS_STATE_CREATING &&
+			    oldstate != CEPH_MDS_STATE_STARTING)
+				pr_info("mds%d recovery completed\n", s->s_mds);
+			kick_requests(mdsc, i);
+			ceph_kick_flushing_caps(mdsc, s);
+			wake_up_session_caps(s, 1);
+		}
+	}
+}
+
+
+
+/*
+ * leases
+ */
+
+/*
+ * caller must hold session s_mutex, dentry->d_lock
+ */
+void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry)
+{
+	struct ceph_dentry_info *di = ceph_dentry(dentry);
+
+	ceph_put_mds_session(di->lease_session);
+	di->lease_session = NULL;
+}
+
+static void handle_lease(struct ceph_mds_client *mdsc,
+			 struct ceph_mds_session *session,
+			 struct ceph_msg *msg)
+{
+	struct super_block *sb = mdsc->client->sb;
+	struct inode *inode;
+	struct ceph_inode_info *ci;
+	struct dentry *parent, *dentry;
+	struct ceph_dentry_info *di;
+	int mds = session->s_mds;
+	struct ceph_mds_lease *h = msg->front.iov_base;
+	u32 seq;
+	struct ceph_vino vino;
+	int mask;
+	struct qstr dname;
+	int release = 0;
+
+	dout("handle_lease from mds%d\n", mds);
+
+	/* decode */
+	if (msg->front.iov_len < sizeof(*h) + sizeof(u32))
+		goto bad;
+	vino.ino = le64_to_cpu(h->ino);
+	vino.snap = CEPH_NOSNAP;
+	mask = le16_to_cpu(h->mask);
+	seq = le32_to_cpu(h->seq);
+	dname.name = (void *)h + sizeof(*h) + sizeof(u32);
+	dname.len = msg->front.iov_len - sizeof(*h) - sizeof(u32);
+	if (dname.len != get_unaligned_le32(h+1))
+		goto bad;
+
+	mutex_lock(&session->s_mutex);
+	session->s_seq++;
+
+	/* lookup inode */
+	inode = ceph_find_inode(sb, vino);
+	dout("handle_lease %s, mask %d, ino %llx %p %.*s\n",
+	     ceph_lease_op_name(h->action), mask, vino.ino, inode,
+	     dname.len, dname.name);
+	if (inode == NULL) {
+		dout("handle_lease no inode %llx\n", vino.ino);
+		goto release;
+	}
+	ci = ceph_inode(inode);
+
+	/* dentry */
+	parent = d_find_alias(inode);
+	if (!parent) {
+		dout("no parent dentry on inode %p\n", inode);
+		WARN_ON(1);
+		goto release;  /* hrm... */
+	}
+	dname.hash = full_name_hash(dname.name, dname.len);
+	dentry = d_lookup(parent, &dname);
+	dput(parent);
+	if (!dentry)
+		goto release;
+
+	spin_lock(&dentry->d_lock);
+	di = ceph_dentry(dentry);
+	switch (h->action) {
+	case CEPH_MDS_LEASE_REVOKE:
+		if (di && di->lease_session == session) {
+			if (ceph_seq_cmp(di->lease_seq, seq) > 0)
+				h->seq = cpu_to_le32(di->lease_seq);
+			__ceph_mdsc_drop_dentry_lease(dentry);
+		}
+		release = 1;
+		break;
+
+	case CEPH_MDS_LEASE_RENEW:
+		if (di && di->lease_session == session &&
+		    di->lease_gen == session->s_cap_gen &&
+		    di->lease_renew_from &&
+		    di->lease_renew_after == 0) {
+			unsigned long duration =
+				le32_to_cpu(h->duration_ms) * HZ / 1000;
+
+			di->lease_seq = seq;
+			dentry->d_time = di->lease_renew_from + duration;
+			di->lease_renew_after = di->lease_renew_from +
+				(duration >> 1);
+			di->lease_renew_from = 0;
+		}
+		break;
+	}
+	spin_unlock(&dentry->d_lock);
+	dput(dentry);
+
+	if (!release)
+		goto out;
+
+release:
+	/* let's just reuse the same message */
+	h->action = CEPH_MDS_LEASE_REVOKE_ACK;
+	ceph_msg_get(msg);
+	ceph_con_send(&session->s_con, msg);
+
+out:
+	iput(inode);
+	mutex_unlock(&session->s_mutex);
+	return;
+
+bad:
+	pr_err("corrupt lease message\n");
+	ceph_msg_dump(msg);
+}
+
+void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
+			      struct inode *inode,
+			      struct dentry *dentry, char action,
+			      u32 seq)
+{
+	struct ceph_msg *msg;
+	struct ceph_mds_lease *lease;
+	int len = sizeof(*lease) + sizeof(u32);
+	int dnamelen = 0;
+
+	dout("lease_send_msg inode %p dentry %p %s to mds%d\n",
+	     inode, dentry, ceph_lease_op_name(action), session->s_mds);
+	dnamelen = dentry->d_name.len;
+	len += dnamelen;
+
+	msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS);
+	if (!msg)
+		return;
+	lease = msg->front.iov_base;
+	lease->action = action;
+	lease->mask = cpu_to_le16(1);
+	lease->ino = cpu_to_le64(ceph_vino(inode).ino);
+	lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap);
+	lease->seq = cpu_to_le32(seq);
+	put_unaligned_le32(dnamelen, lease + 1);
+	memcpy((void *)(lease + 1) + 4, dentry->d_name.name, dnamelen);
+
+	/*
+	 * if this is a preemptive lease RELEASE, no need to
+	 * flush request stream, since the actual request will
+	 * soon follow.
+	 */
+	msg->more_to_follow = (action == CEPH_MDS_LEASE_RELEASE);
+
+	ceph_con_send(&session->s_con, msg);
+}
+
+/*
+ * Preemptively release a lease we expect to invalidate anyway.
+ * Pass @inode always, @dentry is optional.
+ */
+void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode,
+			     struct dentry *dentry, int mask)
+{
+	struct ceph_dentry_info *di;
+	struct ceph_mds_session *session;
+	u32 seq;
+
+	BUG_ON(inode == NULL);
+	BUG_ON(dentry == NULL);
+	BUG_ON(mask == 0);
+
+	/* is dentry lease valid? */
+	spin_lock(&dentry->d_lock);
+	di = ceph_dentry(dentry);
+	if (!di || !di->lease_session ||
+	    di->lease_session->s_mds < 0 ||
+	    di->lease_gen != di->lease_session->s_cap_gen ||
+	    !time_before(jiffies, dentry->d_time)) {
+		dout("lease_release inode %p dentry %p -- "
+		     "no lease on %d\n",
+		     inode, dentry, mask);
+		spin_unlock(&dentry->d_lock);
+		return;
+	}
+
+	/* we do have a lease on this dentry; note mds and seq */
+	session = ceph_get_mds_session(di->lease_session);
+	seq = di->lease_seq;
+	__ceph_mdsc_drop_dentry_lease(dentry);
+	spin_unlock(&dentry->d_lock);
+
+	dout("lease_release inode %p dentry %p mask %d to mds%d\n",
+	     inode, dentry, mask, session->s_mds);
+	ceph_mdsc_lease_send_msg(session, inode, dentry,
+				 CEPH_MDS_LEASE_RELEASE, seq);
+	ceph_put_mds_session(session);
+}
+
+/*
+ * drop all leases (and dentry refs) in preparation for umount
+ */
+static void drop_leases(struct ceph_mds_client *mdsc)
+{
+	int i;
+
+	dout("drop_leases\n");
+	mutex_lock(&mdsc->mutex);
+	for (i = 0; i < mdsc->max_sessions; i++) {
+		struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
+		if (!s)
+			continue;
+		mutex_unlock(&mdsc->mutex);
+		mutex_lock(&s->s_mutex);
+		mutex_unlock(&s->s_mutex);
+		ceph_put_mds_session(s);
+		mutex_lock(&mdsc->mutex);
+	}
+	mutex_unlock(&mdsc->mutex);
+}
+
+
+
+/*
+ * delayed work -- periodically trim expired leases, renew caps with mds
+ */
+static void schedule_delayed(struct ceph_mds_client *mdsc)
+{
+	int delay = 5;
+	unsigned hz = round_jiffies_relative(HZ * delay);
+	schedule_delayed_work(&mdsc->delayed_work, hz);
+}
+
+static void delayed_work(struct work_struct *work)
+{
+	int i;
+	struct ceph_mds_client *mdsc =
+		container_of(work, struct ceph_mds_client, delayed_work.work);
+	int renew_interval;
+	int renew_caps;
+
+	dout("mdsc delayed_work\n");
+	ceph_check_delayed_caps(mdsc);
+
+	mutex_lock(&mdsc->mutex);
+	renew_interval = mdsc->mdsmap->m_session_timeout >> 2;
+	renew_caps = time_after_eq(jiffies, HZ*renew_interval +
+				   mdsc->last_renew_caps);
+	if (renew_caps)
+		mdsc->last_renew_caps = jiffies;
+
+	for (i = 0; i < mdsc->max_sessions; i++) {
+		struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
+		if (s == NULL)
+			continue;
+		if (s->s_state == CEPH_MDS_SESSION_CLOSING) {
+			dout("resending session close request for mds%d\n",
+			     s->s_mds);
+			request_close_session(mdsc, s);
+			ceph_put_mds_session(s);
+			continue;
+		}
+		if (s->s_ttl && time_after(jiffies, s->s_ttl)) {
+			if (s->s_state == CEPH_MDS_SESSION_OPEN) {
+				s->s_state = CEPH_MDS_SESSION_HUNG;
+				pr_info("mds%d hung\n", s->s_mds);
+			}
+		}
+		if (s->s_state < CEPH_MDS_SESSION_OPEN) {
+			/* this mds is failed or recovering, just wait */
+			ceph_put_mds_session(s);
+			continue;
+		}
+		mutex_unlock(&mdsc->mutex);
+
+		mutex_lock(&s->s_mutex);
+		if (renew_caps)
+			send_renew_caps(mdsc, s);
+		else
+			ceph_con_keepalive(&s->s_con);
+		ceph_add_cap_releases(mdsc, s, -1);
+		if (s->s_state == CEPH_MDS_SESSION_OPEN ||
+		    s->s_state == CEPH_MDS_SESSION_HUNG)
+			ceph_send_cap_releases(mdsc, s);
+		mutex_unlock(&s->s_mutex);
+		ceph_put_mds_session(s);
+
+		mutex_lock(&mdsc->mutex);
+	}
+	mutex_unlock(&mdsc->mutex);
+
+	schedule_delayed(mdsc);
+}
+
+
+int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
+{
+	mdsc->client = client;
+	mutex_init(&mdsc->mutex);
+	mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
+	if (mdsc->mdsmap == NULL)
+		return -ENOMEM;
+
+	init_completion(&mdsc->safe_umount_waiters);
+	init_completion(&mdsc->session_close_waiters);
+	INIT_LIST_HEAD(&mdsc->waiting_for_map);
+	mdsc->sessions = NULL;
+	mdsc->max_sessions = 0;
+	mdsc->stopping = 0;
+	init_rwsem(&mdsc->snap_rwsem);
+	mdsc->snap_realms = RB_ROOT;
+	INIT_LIST_HEAD(&mdsc->snap_empty);
+	spin_lock_init(&mdsc->snap_empty_lock);
+	mdsc->last_tid = 0;
+	mdsc->request_tree = RB_ROOT;
+	INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work);
+	mdsc->last_renew_caps = jiffies;
+	INIT_LIST_HEAD(&mdsc->cap_delay_list);
+	spin_lock_init(&mdsc->cap_delay_lock);
+	INIT_LIST_HEAD(&mdsc->snap_flush_list);
+	spin_lock_init(&mdsc->snap_flush_lock);
+	mdsc->cap_flush_seq = 0;
+	INIT_LIST_HEAD(&mdsc->cap_dirty);
+	mdsc->num_cap_flushing = 0;
+	spin_lock_init(&mdsc->cap_dirty_lock);
+	init_waitqueue_head(&mdsc->cap_flushing_wq);
+	spin_lock_init(&mdsc->dentry_lru_lock);
+	INIT_LIST_HEAD(&mdsc->dentry_lru);
+
+	return 0;
+}
+
+/*
+ * Wait for safe replies on open mds requests.  If we time out, drop
+ * all requests from the tree to avoid dangling dentry refs.
+ */
+static void wait_requests(struct ceph_mds_client *mdsc)
+{
+	struct ceph_mds_request *req;
+	struct ceph_client *client = mdsc->client;
+
+	mutex_lock(&mdsc->mutex);
+	if (__get_oldest_req(mdsc)) {
+		mutex_unlock(&mdsc->mutex);
+
+		dout("wait_requests waiting for requests\n");
+		wait_for_completion_timeout(&mdsc->safe_umount_waiters,
+				    client->mount_args->mount_timeout * HZ);
+
+		/* tear down remaining requests */
+		mutex_lock(&mdsc->mutex);
+		while ((req = __get_oldest_req(mdsc))) {
+			dout("wait_requests timed out on tid %llu\n",
+			     req->r_tid);
+			__unregister_request(mdsc, req);
+		}
+	}
+	mutex_unlock(&mdsc->mutex);
+	dout("wait_requests done\n");
+}
+
+/*
+ * called before mount is ro, and before dentries are torn down.
+ * (hmm, does this still race with new lookups?)
+ */
+void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
+{
+	dout("pre_umount\n");
+	mdsc->stopping = 1;
+
+	drop_leases(mdsc);
+	ceph_flush_dirty_caps(mdsc);
+	wait_requests(mdsc);
+
+	/*
+	 * wait for reply handlers to drop their request refs and
+	 * their inode/dcache refs
+	 */
+	ceph_msgr_flush();
+}
+
+/*
+ * wait for all write mds requests to flush.
+ */
+static void wait_unsafe_requests(struct ceph_mds_client *mdsc, u64 want_tid)
+{
+	struct ceph_mds_request *req = NULL, *nextreq;
+	struct rb_node *n;
+
+	mutex_lock(&mdsc->mutex);
+	dout("wait_unsafe_requests want %lld\n", want_tid);
+restart:
+	req = __get_oldest_req(mdsc);
+	while (req && req->r_tid <= want_tid) {
+		/* find next request */
+		n = rb_next(&req->r_node);
+		if (n)
+			nextreq = rb_entry(n, struct ceph_mds_request, r_node);
+		else
+			nextreq = NULL;
+		if ((req->r_op & CEPH_MDS_OP_WRITE)) {
+			/* write op */
+			ceph_mdsc_get_request(req);
+			if (nextreq)
+				ceph_mdsc_get_request(nextreq);
+			mutex_unlock(&mdsc->mutex);
+			dout("wait_unsafe_requests  wait on %llu (want %llu)\n",
+			     req->r_tid, want_tid);
+			wait_for_completion(&req->r_safe_completion);
+			mutex_lock(&mdsc->mutex);
+			ceph_mdsc_put_request(req);
+			if (!nextreq)
+				break;  /* next dne before, so we're done! */
+			if (RB_EMPTY_NODE(&nextreq->r_node)) {
+				/* next request was removed from tree */
+				ceph_mdsc_put_request(nextreq);
+				goto restart;
+			}
+			ceph_mdsc_put_request(nextreq);  /* won't go away */
+		}
+		req = nextreq;
+	}
+	mutex_unlock(&mdsc->mutex);
+	dout("wait_unsafe_requests done\n");
+}
+
+void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
+{
+	u64 want_tid, want_flush;
+
+	if (mdsc->client->mount_state == CEPH_MOUNT_SHUTDOWN)
+		return;
+
+	dout("sync\n");
+	mutex_lock(&mdsc->mutex);
+	want_tid = mdsc->last_tid;
+	want_flush = mdsc->cap_flush_seq;
+	mutex_unlock(&mdsc->mutex);
+	dout("sync want tid %lld flush_seq %lld\n", want_tid, want_flush);
+
+	ceph_flush_dirty_caps(mdsc);
+
+	wait_unsafe_requests(mdsc, want_tid);
+	wait_event(mdsc->cap_flushing_wq, check_cap_flush(mdsc, want_flush));
+}
+
+
+/*
+ * called after sb is ro.
+ */
+void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
+{
+	struct ceph_mds_session *session;
+	int i;
+	int n;
+	struct ceph_client *client = mdsc->client;
+	unsigned long started, timeout = client->mount_args->mount_timeout * HZ;
+
+	dout("close_sessions\n");
+
+	mutex_lock(&mdsc->mutex);
+
+	/* close sessions */
+	started = jiffies;
+	while (time_before(jiffies, started + timeout)) {
+		dout("closing sessions\n");
+		n = 0;
+		for (i = 0; i < mdsc->max_sessions; i++) {
+			session = __ceph_lookup_mds_session(mdsc, i);
+			if (!session)
+				continue;
+			mutex_unlock(&mdsc->mutex);
+			mutex_lock(&session->s_mutex);
+			__close_session(mdsc, session);
+			mutex_unlock(&session->s_mutex);
+			ceph_put_mds_session(session);
+			mutex_lock(&mdsc->mutex);
+			n++;
+		}
+		if (n == 0)
+			break;
+
+		if (client->mount_state == CEPH_MOUNT_SHUTDOWN)
+			break;
+
+		dout("waiting for sessions to close\n");
+		mutex_unlock(&mdsc->mutex);
+		wait_for_completion_timeout(&mdsc->session_close_waiters,
+					    timeout);
+		mutex_lock(&mdsc->mutex);
+	}
+
+	/* tear down remaining sessions */
+	for (i = 0; i < mdsc->max_sessions; i++) {
+		if (mdsc->sessions[i]) {
+			session = get_session(mdsc->sessions[i]);
+			__unregister_session(mdsc, session);
+			mutex_unlock(&mdsc->mutex);
+			mutex_lock(&session->s_mutex);
+			remove_session_caps(session);
+			mutex_unlock(&session->s_mutex);
+			ceph_put_mds_session(session);
+			mutex_lock(&mdsc->mutex);
+		}
+	}
+
+	WARN_ON(!list_empty(&mdsc->cap_delay_list));
+
+	mutex_unlock(&mdsc->mutex);
+
+	ceph_cleanup_empty_realms(mdsc);
+
+	cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
+
+	dout("stopped\n");
+}
+
+void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
+{
+	dout("stop\n");
+	cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
+	if (mdsc->mdsmap)
+		ceph_mdsmap_destroy(mdsc->mdsmap);
+	kfree(mdsc->sessions);
+}
+
+
+/*
+ * handle mds map update.
+ */
+void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
+{
+	u32 epoch;
+	u32 maplen;
+	void *p = msg->front.iov_base;
+	void *end = p + msg->front.iov_len;
+	struct ceph_mdsmap *newmap, *oldmap;
+	struct ceph_fsid fsid;
+	int err = -EINVAL;
+
+	ceph_decode_need(&p, end, sizeof(fsid)+2*sizeof(u32), bad);
+	ceph_decode_copy(&p, &fsid, sizeof(fsid));
+	if (ceph_check_fsid(mdsc->client, &fsid) < 0)
+		return;
+	epoch = ceph_decode_32(&p);
+	maplen = ceph_decode_32(&p);
+	dout("handle_map epoch %u len %d\n", epoch, (int)maplen);
+
+	/* do we need it? */
+	ceph_monc_got_mdsmap(&mdsc->client->monc, epoch);
+	mutex_lock(&mdsc->mutex);
+	if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) {
+		dout("handle_map epoch %u <= our %u\n",
+		     epoch, mdsc->mdsmap->m_epoch);
+		mutex_unlock(&mdsc->mutex);
+		return;
+	}
+
+	newmap = ceph_mdsmap_decode(&p, end);
+	if (IS_ERR(newmap)) {
+		err = PTR_ERR(newmap);
+		goto bad_unlock;
+	}
+
+	/* swap into place */
+	if (mdsc->mdsmap) {
+		oldmap = mdsc->mdsmap;
+		mdsc->mdsmap = newmap;
+		check_new_map(mdsc, newmap, oldmap);
+		ceph_mdsmap_destroy(oldmap);
+	} else {
+		mdsc->mdsmap = newmap;  /* first mds map */
+	}
+	mdsc->client->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size;
+
+	__wake_requests(mdsc, &mdsc->waiting_for_map);
+
+	mutex_unlock(&mdsc->mutex);
+	schedule_delayed(mdsc);
+	return;
+
+bad_unlock:
+	mutex_unlock(&mdsc->mutex);
+bad:
+	pr_err("error decoding mdsmap %d\n", err);
+	return;
+}
+
+static struct ceph_connection *con_get(struct ceph_connection *con)
+{
+	struct ceph_mds_session *s = con->private;
+
+	if (get_session(s)) {
+		dout("mdsc con_get %p ok (%d)\n", s, atomic_read(&s->s_ref));
+		return con;
+	}
+	dout("mdsc con_get %p FAIL\n", s);
+	return NULL;
+}
+
+static void con_put(struct ceph_connection *con)
+{
+	struct ceph_mds_session *s = con->private;
+
+	ceph_put_mds_session(s);
+	dout("mdsc con_put %p (%d)\n", s, atomic_read(&s->s_ref));
+}
+
+/*
+ * if the client is unresponsive for long enough, the mds will kill
+ * the session entirely.
+ */
+static void peer_reset(struct ceph_connection *con)
+{
+	struct ceph_mds_session *s = con->private;
+	struct ceph_mds_client *mdsc = s->s_mdsc;
+
+	pr_warning("mds%d closed our session\n", s->s_mds);
+	send_mds_reconnect(mdsc, s);
+}
+
+static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
+{
+	struct ceph_mds_session *s = con->private;
+	struct ceph_mds_client *mdsc = s->s_mdsc;
+	int type = le16_to_cpu(msg->hdr.type);
+
+	mutex_lock(&mdsc->mutex);
+	if (__verify_registered_session(mdsc, s) < 0) {
+		mutex_unlock(&mdsc->mutex);
+		goto out;
+	}
+	mutex_unlock(&mdsc->mutex);
+
+	switch (type) {
+	case CEPH_MSG_MDS_MAP:
+		ceph_mdsc_handle_map(mdsc, msg);
+		break;
+	case CEPH_MSG_CLIENT_SESSION:
+		handle_session(s, msg);
+		break;
+	case CEPH_MSG_CLIENT_REPLY:
+		handle_reply(s, msg);
+		break;
+	case CEPH_MSG_CLIENT_REQUEST_FORWARD:
+		handle_forward(mdsc, s, msg);
+		break;
+	case CEPH_MSG_CLIENT_CAPS:
+		ceph_handle_caps(s, msg);
+		break;
+	case CEPH_MSG_CLIENT_SNAP:
+		ceph_handle_snap(mdsc, s, msg);
+		break;
+	case CEPH_MSG_CLIENT_LEASE:
+		handle_lease(mdsc, s, msg);
+		break;
+
+	default:
+		pr_err("received unknown message type %d %s\n", type,
+		       ceph_msg_type_name(type));
+	}
+out:
+	ceph_msg_put(msg);
+}
+
+/*
+ * authentication
+ */
+static int get_authorizer(struct ceph_connection *con,
+			  void **buf, int *len, int *proto,
+			  void **reply_buf, int *reply_len, int force_new)
+{
+	struct ceph_mds_session *s = con->private;
+	struct ceph_mds_client *mdsc = s->s_mdsc;
+	struct ceph_auth_client *ac = mdsc->client->monc.auth;
+	int ret = 0;
+
+	if (force_new && s->s_authorizer) {
+		ac->ops->destroy_authorizer(ac, s->s_authorizer);
+		s->s_authorizer = NULL;
+	}
+	if (s->s_authorizer == NULL) {
+		if (ac->ops->create_authorizer) {
+			ret = ac->ops->create_authorizer(
+				ac, CEPH_ENTITY_TYPE_MDS,
+				&s->s_authorizer,
+				&s->s_authorizer_buf,
+				&s->s_authorizer_buf_len,
+				&s->s_authorizer_reply_buf,
+				&s->s_authorizer_reply_buf_len);
+			if (ret)
+				return ret;
+		}
+	}
+
+	*proto = ac->protocol;
+	*buf = s->s_authorizer_buf;
+	*len = s->s_authorizer_buf_len;
+	*reply_buf = s->s_authorizer_reply_buf;
+	*reply_len = s->s_authorizer_reply_buf_len;
+	return 0;
+}
+
+
+static int verify_authorizer_reply(struct ceph_connection *con, int len)
+{
+	struct ceph_mds_session *s = con->private;
+	struct ceph_mds_client *mdsc = s->s_mdsc;
+	struct ceph_auth_client *ac = mdsc->client->monc.auth;
+
+	return ac->ops->verify_authorizer_reply(ac, s->s_authorizer, len);
+}
+
+static int invalidate_authorizer(struct ceph_connection *con)
+{
+	struct ceph_mds_session *s = con->private;
+	struct ceph_mds_client *mdsc = s->s_mdsc;
+	struct ceph_auth_client *ac = mdsc->client->monc.auth;
+
+	if (ac->ops->invalidate_authorizer)
+		ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS);
+
+	return ceph_monc_validate_auth(&mdsc->client->monc);
+}
+
+static const struct ceph_connection_operations mds_con_ops = {
+	.get = con_get,
+	.put = con_put,
+	.dispatch = dispatch,
+	.get_authorizer = get_authorizer,
+	.verify_authorizer_reply = verify_authorizer_reply,
+	.invalidate_authorizer = invalidate_authorizer,
+	.peer_reset = peer_reset,
+};
+
+
+
+
+/* eof */
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
new file mode 100644
index 000000000000..952410c60d09
--- /dev/null
+++ b/fs/ceph/mds_client.h
@@ -0,0 +1,346 @@
+#ifndef _FS_CEPH_MDS_CLIENT_H
+#define _FS_CEPH_MDS_CLIENT_H
+
+#include <linux/completion.h>
+#include <linux/kref.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/rbtree.h>
+#include <linux/spinlock.h>
+
+#include "types.h"
+#include "messenger.h"
+#include "mdsmap.h"
+
+/*
+ * Some lock dependencies:
+ *
+ * session->s_mutex
+ *         mdsc->mutex
+ *
+ *         mdsc->snap_rwsem
+ *
+ *         inode->i_lock
+ *                 mdsc->snap_flush_lock
+ *                 mdsc->cap_delay_lock
+ *
+ */
+
+struct ceph_client;
+struct ceph_cap;
+
+/*
+ * parsed info about a single inode.  pointers are into the encoded
+ * on-wire structures within the mds reply message payload.
+ */
+struct ceph_mds_reply_info_in {
+	struct ceph_mds_reply_inode *in;
+	u32 symlink_len;
+	char *symlink;
+	u32 xattr_len;
+	char *xattr_data;
+};
+
+/*
+ * parsed info about an mds reply, including information about the
+ * target inode and/or its parent directory and dentry, and directory
+ * contents (for readdir results).
+ */
+struct ceph_mds_reply_info_parsed {
+	struct ceph_mds_reply_head    *head;
+
+	struct ceph_mds_reply_info_in diri, targeti;
+	struct ceph_mds_reply_dirfrag *dirfrag;
+	char                          *dname;
+	u32                           dname_len;
+	struct ceph_mds_reply_lease   *dlease;
+
+	struct ceph_mds_reply_dirfrag *dir_dir;
+	int                           dir_nr;
+	char                          **dir_dname;
+	u32                           *dir_dname_len;
+	struct ceph_mds_reply_lease   **dir_dlease;
+	struct ceph_mds_reply_info_in *dir_in;
+	u8                            dir_complete, dir_end;
+
+	/* encoded blob describing snapshot contexts for certain
+	   operations (e.g., open) */
+	void *snapblob;
+	int snapblob_len;
+};
+
+
+/*
+ * cap releases are batched and sent to the MDS en masse.
+ */
+#define CEPH_CAPS_PER_RELEASE ((PAGE_CACHE_SIZE -			\
+				sizeof(struct ceph_mds_cap_release)) /	\
+			       sizeof(struct ceph_mds_cap_item))
+
+
+/*
+ * state associated with each MDS<->client session
+ */
+enum {
+	CEPH_MDS_SESSION_NEW = 1,
+	CEPH_MDS_SESSION_OPENING = 2,
+	CEPH_MDS_SESSION_OPEN = 3,
+	CEPH_MDS_SESSION_HUNG = 4,
+	CEPH_MDS_SESSION_CLOSING = 5,
+	CEPH_MDS_SESSION_RESTARTING = 6,
+	CEPH_MDS_SESSION_RECONNECTING = 7,
+};
+
+struct ceph_mds_session {
+	struct ceph_mds_client *s_mdsc;
+	int               s_mds;
+	int               s_state;
+	unsigned long     s_ttl;      /* time until mds kills us */
+	u64               s_seq;      /* incoming msg seq # */
+	struct mutex      s_mutex;    /* serialize session messages */
+
+	struct ceph_connection s_con;
+
+	struct ceph_authorizer *s_authorizer;
+	void             *s_authorizer_buf, *s_authorizer_reply_buf;
+	size_t            s_authorizer_buf_len, s_authorizer_reply_buf_len;
+
+	/* protected by s_cap_lock */
+	spinlock_t        s_cap_lock;
+	u32               s_cap_gen;  /* inc each time we get mds stale msg */
+	unsigned long     s_cap_ttl;  /* when session caps expire */
+	struct list_head  s_caps;     /* all caps issued by this session */
+	int               s_nr_caps, s_trim_caps;
+	int               s_num_cap_releases;
+	struct list_head  s_cap_releases; /* waiting cap_release messages */
+	struct list_head  s_cap_releases_done; /* ready to send */
+	struct ceph_cap  *s_cap_iterator;
+
+	/* protected by mutex */
+	struct list_head  s_cap_flushing;     /* inodes w/ flushing caps */
+	struct list_head  s_cap_snaps_flushing;
+	unsigned long     s_renew_requested; /* last time we sent a renew req */
+	u64               s_renew_seq;
+
+	atomic_t          s_ref;
+	struct list_head  s_waiting;  /* waiting requests */
+	struct list_head  s_unsafe;   /* unsafe requests */
+};
+
+/*
+ * modes of choosing which MDS to send a request to
+ */
+enum {
+	USE_ANY_MDS,
+	USE_RANDOM_MDS,
+	USE_AUTH_MDS,   /* prefer authoritative mds for this metadata item */
+};
+
+struct ceph_mds_request;
+struct ceph_mds_client;
+
+/*
+ * request completion callback
+ */
+typedef void (*ceph_mds_request_callback_t) (struct ceph_mds_client *mdsc,
+					     struct ceph_mds_request *req);
+
+/*
+ * an in-flight mds request
+ */
+struct ceph_mds_request {
+	u64 r_tid;                   /* transaction id */
+	struct rb_node r_node;
+
+	int r_op;                    /* mds op code */
+	int r_mds;
+
+	/* operation on what? */
+	struct inode *r_inode;              /* arg1 */
+	struct dentry *r_dentry;            /* arg1 */
+	struct dentry *r_old_dentry;        /* arg2: rename from or link from */
+	char *r_path1, *r_path2;
+	struct ceph_vino r_ino1, r_ino2;
+
+	struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */
+	struct inode *r_target_inode;       /* resulting inode */
+
+	struct mutex r_fill_mutex;
+
+	union ceph_mds_request_args r_args;
+	int r_fmode;        /* file mode, if expecting cap */
+
+	/* for choosing which mds to send this request to */
+	int r_direct_mode;
+	u32 r_direct_hash;      /* choose dir frag based on this dentry hash */
+	bool r_direct_is_hash;  /* true if r_direct_hash is valid */
+
+	/* data payload is used for xattr ops */
+	struct page **r_pages;
+	int r_num_pages;
+	int r_data_len;
+
+	/* what caps shall we drop? */
+	int r_inode_drop, r_inode_unless;
+	int r_dentry_drop, r_dentry_unless;
+	int r_old_dentry_drop, r_old_dentry_unless;
+	struct inode *r_old_inode;
+	int r_old_inode_drop, r_old_inode_unless;
+
+	struct ceph_msg  *r_request;  /* original request */
+	int r_request_release_offset;
+	struct ceph_msg  *r_reply;
+	struct ceph_mds_reply_info_parsed r_reply_info;
+	int r_err;
+	bool r_aborted;
+
+	unsigned long r_timeout;  /* optional.  jiffies */
+	unsigned long r_started;  /* start time to measure timeout against */
+	unsigned long r_request_started; /* start time for mds request only,
+					    used to measure lease durations */
+
+	/* link unsafe requests to parent directory, for fsync */
+	struct inode	*r_unsafe_dir;
+	struct list_head r_unsafe_dir_item;
+
+	struct ceph_mds_session *r_session;
+
+	int               r_attempts;   /* resend attempts */
+	int               r_num_fwd;    /* number of forward attempts */
+	int               r_num_stale;
+	int               r_resend_mds; /* mds to resend to next, if any*/
+
+	struct kref       r_kref;
+	struct list_head  r_wait;
+	struct completion r_completion;
+	struct completion r_safe_completion;
+	ceph_mds_request_callback_t r_callback;
+	struct list_head  r_unsafe_item;  /* per-session unsafe list item */
+	bool		  r_got_unsafe, r_got_safe, r_got_result;
+
+	bool              r_did_prepopulate;
+	u32               r_readdir_offset;
+
+	struct ceph_cap_reservation r_caps_reservation;
+	int r_num_caps;
+};
+
+/*
+ * mds client state
+ */
+struct ceph_mds_client {
+	struct ceph_client      *client;
+	struct mutex            mutex;         /* all nested structures */
+
+	struct ceph_mdsmap      *mdsmap;
+	struct completion       safe_umount_waiters, session_close_waiters;
+	struct list_head        waiting_for_map;
+
+	struct ceph_mds_session **sessions;    /* NULL for mds if no session */
+	int                     max_sessions;  /* len of s_mds_sessions */
+	int                     stopping;      /* true if shutting down */
+
+	/*
+	 * snap_rwsem will cover cap linkage into snaprealms, and
+	 * realm snap contexts.  (later, we can do per-realm snap
+	 * contexts locks..)  the empty list contains realms with no
+	 * references (implying they contain no inodes with caps) that
+	 * should be destroyed.
+	 */
+	struct rw_semaphore     snap_rwsem;
+	struct rb_root          snap_realms;
+	struct list_head        snap_empty;
+	spinlock_t              snap_empty_lock;  /* protect snap_empty */
+
+	u64                    last_tid;      /* most recent mds request */
+	struct rb_root         request_tree;  /* pending mds requests */
+	struct delayed_work    delayed_work;  /* delayed work */
+	unsigned long    last_renew_caps;  /* last time we renewed our caps */
+	struct list_head cap_delay_list;   /* caps with delayed release */
+	spinlock_t       cap_delay_lock;   /* protects cap_delay_list */
+	struct list_head snap_flush_list;  /* cap_snaps ready to flush */
+	spinlock_t       snap_flush_lock;
+
+	u64               cap_flush_seq;
+	struct list_head  cap_dirty;        /* inodes with dirty caps */
+	int               num_cap_flushing; /* # caps we are flushing */
+	spinlock_t        cap_dirty_lock;   /* protects above items */
+	wait_queue_head_t cap_flushing_wq;
+
+#ifdef CONFIG_DEBUG_FS
+	struct dentry 	  *debugfs_file;
+#endif
+
+	spinlock_t	  dentry_lru_lock;
+	struct list_head  dentry_lru;
+	int		  num_dentry;
+};
+
+extern const char *ceph_mds_op_name(int op);
+
+extern struct ceph_mds_session *
+__ceph_lookup_mds_session(struct ceph_mds_client *, int mds);
+
+static inline struct ceph_mds_session *
+ceph_get_mds_session(struct ceph_mds_session *s)
+{
+	atomic_inc(&s->s_ref);
+	return s;
+}
+
+extern void ceph_put_mds_session(struct ceph_mds_session *s);
+
+extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc,
+			     struct ceph_msg *msg, int mds);
+
+extern int ceph_mdsc_init(struct ceph_mds_client *mdsc,
+			   struct ceph_client *client);
+extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc);
+extern void ceph_mdsc_stop(struct ceph_mds_client *mdsc);
+
+extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc);
+
+extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc,
+				    struct inode *inode,
+				    struct dentry *dn, int mask);
+
+extern void ceph_invalidate_dir_request(struct ceph_mds_request *req);
+
+extern struct ceph_mds_request *
+ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
+extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
+				     struct ceph_mds_request *req);
+extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
+				struct inode *dir,
+				struct ceph_mds_request *req);
+static inline void ceph_mdsc_get_request(struct ceph_mds_request *req)
+{
+	kref_get(&req->r_kref);
+}
+extern void ceph_mdsc_release_request(struct kref *kref);
+static inline void ceph_mdsc_put_request(struct ceph_mds_request *req)
+{
+	kref_put(&req->r_kref, ceph_mdsc_release_request);
+}
+
+extern int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
+				 struct ceph_mds_session *session,
+				 int extra);
+extern void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
+				   struct ceph_mds_session *session);
+
+extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc);
+
+extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
+				  int stop_on_nosnap);
+
+extern void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry);
+extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
+				     struct inode *inode,
+				     struct dentry *dentry, char action,
+				     u32 seq);
+
+extern void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc,
+				 struct ceph_msg *msg);
+
+#endif
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
new file mode 100644
index 000000000000..c4c498e6dfef
--- /dev/null
+++ b/fs/ceph/mdsmap.c
@@ -0,0 +1,174 @@
+#include "ceph_debug.h"
+
+#include <linux/bug.h>
+#include <linux/err.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+
+#include "mdsmap.h"
+#include "messenger.h"
+#include "decode.h"
+
+#include "super.h"
+
+
+/*
+ * choose a random mds that is "up" (i.e. has a state > 0), or -1.
+ */
+int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m)
+{
+	int n = 0;
+	int i;
+	char r;
+
+	/* count */
+	for (i = 0; i < m->m_max_mds; i++)
+		if (m->m_info[i].state > 0)
+			n++;
+	if (n == 0)
+		return -1;
+
+	/* pick */
+	get_random_bytes(&r, 1);
+	n = r % n;
+	i = 0;
+	for (i = 0; n > 0; i++, n--)
+		while (m->m_info[i].state <= 0)
+			i++;
+
+	return i;
+}
+
+/*
+ * Decode an MDS map
+ *
+ * Ignore any fields we don't care about (there are quite a few of
+ * them).
+ */
+struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
+{
+	struct ceph_mdsmap *m;
+	const void *start = *p;
+	int i, j, n;
+	int err = -EINVAL;
+	u16 version;
+
+	m = kzalloc(sizeof(*m), GFP_NOFS);
+	if (m == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	ceph_decode_16_safe(p, end, version, bad);
+
+	ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad);
+	m->m_epoch = ceph_decode_32(p);
+	m->m_client_epoch = ceph_decode_32(p);
+	m->m_last_failure = ceph_decode_32(p);
+	m->m_root = ceph_decode_32(p);
+	m->m_session_timeout = ceph_decode_32(p);
+	m->m_session_autoclose = ceph_decode_32(p);
+	m->m_max_file_size = ceph_decode_64(p);
+	m->m_max_mds = ceph_decode_32(p);
+
+	m->m_info = kcalloc(m->m_max_mds, sizeof(*m->m_info), GFP_NOFS);
+	if (m->m_info == NULL)
+		goto badmem;
+
+	/* pick out active nodes from mds_info (state > 0) */
+	n = ceph_decode_32(p);
+	for (i = 0; i < n; i++) {
+		u64 global_id;
+		u32 namelen;
+		s32 mds, inc, state;
+		u64 state_seq;
+		u8 infoversion;
+		struct ceph_entity_addr addr;
+		u32 num_export_targets;
+		void *pexport_targets = NULL;
+
+		ceph_decode_need(p, end, sizeof(u64)*2 + 1 + sizeof(u32), bad);
+		global_id = ceph_decode_64(p);
+		infoversion = ceph_decode_8(p);
+		*p += sizeof(u64);
+		namelen = ceph_decode_32(p);  /* skip mds name */
+		*p += namelen;
+
+		ceph_decode_need(p, end,
+				 4*sizeof(u32) + sizeof(u64) +
+				 sizeof(addr) + sizeof(struct ceph_timespec),
+				 bad);
+		mds = ceph_decode_32(p);
+		inc = ceph_decode_32(p);
+		state = ceph_decode_32(p);
+		state_seq = ceph_decode_64(p);
+		ceph_decode_copy(p, &addr, sizeof(addr));
+		ceph_decode_addr(&addr);
+		*p += sizeof(struct ceph_timespec);
+		*p += sizeof(u32);
+		ceph_decode_32_safe(p, end, namelen, bad);
+		*p += namelen;
+		if (infoversion >= 2) {
+			ceph_decode_32_safe(p, end, num_export_targets, bad);
+			pexport_targets = *p;
+			*p += num_export_targets * sizeof(u32);
+		} else {
+			num_export_targets = 0;
+		}
+
+		dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n",
+		     i+1, n, global_id, mds, inc, pr_addr(&addr.in_addr),
+		     ceph_mds_state_name(state));
+		if (mds >= 0 && mds < m->m_max_mds && state > 0) {
+			m->m_info[mds].global_id = global_id;
+			m->m_info[mds].state = state;
+			m->m_info[mds].addr = addr;
+			m->m_info[mds].num_export_targets = num_export_targets;
+			if (num_export_targets) {
+				m->m_info[mds].export_targets =
+					kcalloc(num_export_targets, sizeof(u32),
+						GFP_NOFS);
+				for (j = 0; j < num_export_targets; j++)
+					m->m_info[mds].export_targets[j] =
+					       ceph_decode_32(&pexport_targets);
+			} else {
+				m->m_info[mds].export_targets = NULL;
+			}
+		}
+	}
+
+	/* pg_pools */
+	ceph_decode_32_safe(p, end, n, bad);
+	m->m_num_data_pg_pools = n;
+	m->m_data_pg_pools = kcalloc(n, sizeof(u32), GFP_NOFS);
+	if (!m->m_data_pg_pools)
+		goto badmem;
+	ceph_decode_need(p, end, sizeof(u32)*(n+1), bad);
+	for (i = 0; i < n; i++)
+		m->m_data_pg_pools[i] = ceph_decode_32(p);
+	m->m_cas_pg_pool = ceph_decode_32(p);
+
+	/* ok, we don't care about the rest. */
+	dout("mdsmap_decode success epoch %u\n", m->m_epoch);
+	return m;
+
+badmem:
+	err = -ENOMEM;
+bad:
+	pr_err("corrupt mdsmap\n");
+	print_hex_dump(KERN_DEBUG, "mdsmap: ",
+		       DUMP_PREFIX_OFFSET, 16, 1,
+		       start, end - start, true);
+	ceph_mdsmap_destroy(m);
+	return ERR_PTR(-EINVAL);
+}
+
+void ceph_mdsmap_destroy(struct ceph_mdsmap *m)
+{
+	int i;
+
+	for (i = 0; i < m->m_max_mds; i++)
+		kfree(m->m_info[i].export_targets);
+	kfree(m->m_info);
+	kfree(m->m_data_pg_pools);
+	kfree(m);
+}
diff --git a/fs/ceph/mdsmap.h b/fs/ceph/mdsmap.h
new file mode 100644
index 000000000000..eacc131aa5cb
--- /dev/null
+++ b/fs/ceph/mdsmap.h
@@ -0,0 +1,54 @@
+#ifndef _FS_CEPH_MDSMAP_H
+#define _FS_CEPH_MDSMAP_H
+
+#include "types.h"
+
+/*
+ * mds map - describe servers in the mds cluster.
+ *
+ * we limit fields to those the client actually xcares about
+ */
+struct ceph_mds_info {
+	u64 global_id;
+	struct ceph_entity_addr addr;
+	s32 state;
+	int num_export_targets;
+	u32 *export_targets;
+};
+
+struct ceph_mdsmap {
+	u32 m_epoch, m_client_epoch, m_last_failure;
+	u32 m_root;
+	u32 m_session_timeout;          /* seconds */
+	u32 m_session_autoclose;        /* seconds */
+	u64 m_max_file_size;
+	u32 m_max_mds;                  /* size of m_addr, m_state arrays */
+	struct ceph_mds_info *m_info;
+
+	/* which object pools file data can be stored in */
+	int m_num_data_pg_pools;
+	u32 *m_data_pg_pools;
+	u32 m_cas_pg_pool;
+};
+
+static inline struct ceph_entity_addr *
+ceph_mdsmap_get_addr(struct ceph_mdsmap *m, int w)
+{
+	if (w >= m->m_max_mds)
+		return NULL;
+	return &m->m_info[w].addr;
+}
+
+static inline int ceph_mdsmap_get_state(struct ceph_mdsmap *m, int w)
+{
+	BUG_ON(w < 0);
+	if (w >= m->m_max_mds)
+		return CEPH_MDS_STATE_DNE;
+	return m->m_info[w].state;
+}
+
+extern int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m);
+extern struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end);
+extern void ceph_mdsmap_destroy(struct ceph_mdsmap *m);
+
+#endif
diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
new file mode 100644
index 000000000000..15167b2daa55
--- /dev/null
+++ b/fs/ceph/messenger.c
@@ -0,0 +1,2276 @@
+#include "ceph_debug.h"
+
+#include <linux/crc32c.h>
+#include <linux/ctype.h>
+#include <linux/highmem.h>
+#include <linux/inet.h>
+#include <linux/kthread.h>
+#include <linux/net.h>
+#include <linux/slab.h>
+#include <linux/socket.h>
+#include <linux/string.h>
+#include <net/tcp.h>
+
+#include "super.h"
+#include "messenger.h"
+#include "decode.h"
+#include "pagelist.h"
+
+/*
+ * Ceph uses the messenger to exchange ceph_msg messages with other
+ * hosts in the system.  The messenger provides ordered and reliable
+ * delivery.  We tolerate TCP disconnects by reconnecting (with
+ * exponential backoff) in the case of a fault (disconnection, bad
+ * crc, protocol error).  Acks allow sent messages to be discarded by
+ * the sender.
+ */
+
+/* static tag bytes (protocol control messages) */
+static char tag_msg = CEPH_MSGR_TAG_MSG;
+static char tag_ack = CEPH_MSGR_TAG_ACK;
+static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE;
+
+#ifdef CONFIG_LOCKDEP
+static struct lock_class_key socket_class;
+#endif
+
+
+static void queue_con(struct ceph_connection *con);
+static void con_work(struct work_struct *);
+static void ceph_fault(struct ceph_connection *con);
+
+/*
+ * nicely render a sockaddr as a string.
+ */
+#define MAX_ADDR_STR 20
+#define MAX_ADDR_STR_LEN 60
+static char addr_str[MAX_ADDR_STR][MAX_ADDR_STR_LEN];
+static DEFINE_SPINLOCK(addr_str_lock);
+static int last_addr_str;
+
+const char *pr_addr(const struct sockaddr_storage *ss)
+{
+	int i;
+	char *s;
+	struct sockaddr_in *in4 = (void *)ss;
+	struct sockaddr_in6 *in6 = (void *)ss;
+
+	spin_lock(&addr_str_lock);
+	i = last_addr_str++;
+	if (last_addr_str == MAX_ADDR_STR)
+		last_addr_str = 0;
+	spin_unlock(&addr_str_lock);
+	s = addr_str[i];
+
+	switch (ss->ss_family) {
+	case AF_INET:
+		snprintf(s, MAX_ADDR_STR_LEN, "%pI4:%u", &in4->sin_addr,
+			 (unsigned int)ntohs(in4->sin_port));
+		break;
+
+	case AF_INET6:
+		snprintf(s, MAX_ADDR_STR_LEN, "[%pI6c]:%u", &in6->sin6_addr,
+			 (unsigned int)ntohs(in6->sin6_port));
+		break;
+
+	default:
+		sprintf(s, "(unknown sockaddr family %d)", (int)ss->ss_family);
+	}
+
+	return s;
+}
+
+static void encode_my_addr(struct ceph_messenger *msgr)
+{
+	memcpy(&msgr->my_enc_addr, &msgr->inst.addr, sizeof(msgr->my_enc_addr));
+	ceph_encode_addr(&msgr->my_enc_addr);
+}
+
+/*
+ * work queue for all reading and writing to/from the socket.
+ */
+struct workqueue_struct *ceph_msgr_wq;
+
+int __init ceph_msgr_init(void)
+{
+	ceph_msgr_wq = create_workqueue("ceph-msgr");
+	if (IS_ERR(ceph_msgr_wq)) {
+		int ret = PTR_ERR(ceph_msgr_wq);
+		pr_err("msgr_init failed to create workqueue: %d\n", ret);
+		ceph_msgr_wq = NULL;
+		return ret;
+	}
+	return 0;
+}
+
+void ceph_msgr_exit(void)
+{
+	destroy_workqueue(ceph_msgr_wq);
+}
+
+void ceph_msgr_flush()
+{
+	flush_workqueue(ceph_msgr_wq);
+}
+
+
+/*
+ * socket callback functions
+ */
+
+/* data available on socket, or listen socket received a connect */
+static void ceph_data_ready(struct sock *sk, int count_unused)
+{
+	struct ceph_connection *con =
+		(struct ceph_connection *)sk->sk_user_data;
+	if (sk->sk_state != TCP_CLOSE_WAIT) {
+		dout("ceph_data_ready on %p state = %lu, queueing work\n",
+		     con, con->state);
+		queue_con(con);
+	}
+}
+
+/* socket has buffer space for writing */
+static void ceph_write_space(struct sock *sk)
+{
+	struct ceph_connection *con =
+		(struct ceph_connection *)sk->sk_user_data;
+
+	/* only queue to workqueue if there is data we want to write. */
+	if (test_bit(WRITE_PENDING, &con->state)) {
+		dout("ceph_write_space %p queueing write work\n", con);
+		queue_con(con);
+	} else {
+		dout("ceph_write_space %p nothing to write\n", con);
+	}
+
+	/* since we have our own write_space, clear the SOCK_NOSPACE flag */
+	clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+}
+
+/* socket's state has changed */
+static void ceph_state_change(struct sock *sk)
+{
+	struct ceph_connection *con =
+		(struct ceph_connection *)sk->sk_user_data;
+
+	dout("ceph_state_change %p state = %lu sk_state = %u\n",
+	     con, con->state, sk->sk_state);
+
+	if (test_bit(CLOSED, &con->state))
+		return;
+
+	switch (sk->sk_state) {
+	case TCP_CLOSE:
+		dout("ceph_state_change TCP_CLOSE\n");
+	case TCP_CLOSE_WAIT:
+		dout("ceph_state_change TCP_CLOSE_WAIT\n");
+		if (test_and_set_bit(SOCK_CLOSED, &con->state) == 0) {
+			if (test_bit(CONNECTING, &con->state))
+				con->error_msg = "connection failed";
+			else
+				con->error_msg = "socket closed";
+			queue_con(con);
+		}
+		break;
+	case TCP_ESTABLISHED:
+		dout("ceph_state_change TCP_ESTABLISHED\n");
+		queue_con(con);
+		break;
+	}
+}
+
+/*
+ * set up socket callbacks
+ */
+static void set_sock_callbacks(struct socket *sock,
+			       struct ceph_connection *con)
+{
+	struct sock *sk = sock->sk;
+	sk->sk_user_data = (void *)con;
+	sk->sk_data_ready = ceph_data_ready;
+	sk->sk_write_space = ceph_write_space;
+	sk->sk_state_change = ceph_state_change;
+}
+
+
+/*
+ * socket helpers
+ */
+
+/*
+ * initiate connection to a remote socket.
+ */
+static struct socket *ceph_tcp_connect(struct ceph_connection *con)
+{
+	struct sockaddr_storage *paddr = &con->peer_addr.in_addr;
+	struct socket *sock;
+	int ret;
+
+	BUG_ON(con->sock);
+	ret = sock_create_kern(con->peer_addr.in_addr.ss_family, SOCK_STREAM,
+			       IPPROTO_TCP, &sock);
+	if (ret)
+		return ERR_PTR(ret);
+	con->sock = sock;
+	sock->sk->sk_allocation = GFP_NOFS;
+
+#ifdef CONFIG_LOCKDEP
+	lockdep_set_class(&sock->sk->sk_lock, &socket_class);
+#endif
+
+	set_sock_callbacks(sock, con);
+
+	dout("connect %s\n", pr_addr(&con->peer_addr.in_addr));
+
+	ret = sock->ops->connect(sock, (struct sockaddr *)paddr, sizeof(*paddr),
+				 O_NONBLOCK);
+	if (ret == -EINPROGRESS) {
+		dout("connect %s EINPROGRESS sk_state = %u\n",
+		     pr_addr(&con->peer_addr.in_addr),
+		     sock->sk->sk_state);
+		ret = 0;
+	}
+	if (ret < 0) {
+		pr_err("connect %s error %d\n",
+		       pr_addr(&con->peer_addr.in_addr), ret);
+		sock_release(sock);
+		con->sock = NULL;
+		con->error_msg = "connect error";
+	}
+
+	if (ret < 0)
+		return ERR_PTR(ret);
+	return sock;
+}
+
+static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len)
+{
+	struct kvec iov = {buf, len};
+	struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
+
+	return kernel_recvmsg(sock, &msg, &iov, 1, len, msg.msg_flags);
+}
+
+/*
+ * write something.  @more is true if caller will be sending more data
+ * shortly.
+ */
+static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov,
+		     size_t kvlen, size_t len, int more)
+{
+	struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
+
+	if (more)
+		msg.msg_flags |= MSG_MORE;
+	else
+		msg.msg_flags |= MSG_EOR;  /* superfluous, but what the hell */
+
+	return kernel_sendmsg(sock, &msg, iov, kvlen, len);
+}
+
+
+/*
+ * Shutdown/close the socket for the given connection.
+ */
+static int con_close_socket(struct ceph_connection *con)
+{
+	int rc;
+
+	dout("con_close_socket on %p sock %p\n", con, con->sock);
+	if (!con->sock)
+		return 0;
+	set_bit(SOCK_CLOSED, &con->state);
+	rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR);
+	sock_release(con->sock);
+	con->sock = NULL;
+	clear_bit(SOCK_CLOSED, &con->state);
+	return rc;
+}
+
+/*
+ * Reset a connection.  Discard all incoming and outgoing messages
+ * and clear *_seq state.
+ */
+static void ceph_msg_remove(struct ceph_msg *msg)
+{
+	list_del_init(&msg->list_head);
+	ceph_msg_put(msg);
+}
+static void ceph_msg_remove_list(struct list_head *head)
+{
+	while (!list_empty(head)) {
+		struct ceph_msg *msg = list_first_entry(head, struct ceph_msg,
+							list_head);
+		ceph_msg_remove(msg);
+	}
+}
+
+static void reset_connection(struct ceph_connection *con)
+{
+	/* reset connection, out_queue, msg_ and connect_seq */
+	/* discard existing out_queue and msg_seq */
+	ceph_msg_remove_list(&con->out_queue);
+	ceph_msg_remove_list(&con->out_sent);
+
+	if (con->in_msg) {
+		ceph_msg_put(con->in_msg);
+		con->in_msg = NULL;
+	}
+
+	con->connect_seq = 0;
+	con->out_seq = 0;
+	if (con->out_msg) {
+		ceph_msg_put(con->out_msg);
+		con->out_msg = NULL;
+	}
+	con->out_keepalive_pending = false;
+	con->in_seq = 0;
+	con->in_seq_acked = 0;
+}
+
+/*
+ * mark a peer down.  drop any open connections.
+ */
+void ceph_con_close(struct ceph_connection *con)
+{
+	dout("con_close %p peer %s\n", con, pr_addr(&con->peer_addr.in_addr));
+	set_bit(CLOSED, &con->state);  /* in case there's queued work */
+	clear_bit(STANDBY, &con->state);  /* avoid connect_seq bump */
+	clear_bit(LOSSYTX, &con->state);  /* so we retry next connect */
+	clear_bit(KEEPALIVE_PENDING, &con->state);
+	clear_bit(WRITE_PENDING, &con->state);
+	mutex_lock(&con->mutex);
+	reset_connection(con);
+	con->peer_global_seq = 0;
+	cancel_delayed_work(&con->work);
+	mutex_unlock(&con->mutex);
+	queue_con(con);
+}
+
+/*
+ * Reopen a closed connection, with a new peer address.
+ */
+void ceph_con_open(struct ceph_connection *con, struct ceph_entity_addr *addr)
+{
+	dout("con_open %p %s\n", con, pr_addr(&addr->in_addr));
+	set_bit(OPENING, &con->state);
+	clear_bit(CLOSED, &con->state);
+	memcpy(&con->peer_addr, addr, sizeof(*addr));
+	con->delay = 0;      /* reset backoff memory */
+	queue_con(con);
+}
+
+/*
+ * return true if this connection ever successfully opened
+ */
+bool ceph_con_opened(struct ceph_connection *con)
+{
+	return con->connect_seq > 0;
+}
+
+/*
+ * generic get/put
+ */
+struct ceph_connection *ceph_con_get(struct ceph_connection *con)
+{
+	dout("con_get %p nref = %d -> %d\n", con,
+	     atomic_read(&con->nref), atomic_read(&con->nref) + 1);
+	if (atomic_inc_not_zero(&con->nref))
+		return con;
+	return NULL;
+}
+
+void ceph_con_put(struct ceph_connection *con)
+{
+	dout("con_put %p nref = %d -> %d\n", con,
+	     atomic_read(&con->nref), atomic_read(&con->nref) - 1);
+	BUG_ON(atomic_read(&con->nref) == 0);
+	if (atomic_dec_and_test(&con->nref)) {
+		BUG_ON(con->sock);
+		kfree(con);
+	}
+}
+
+/*
+ * initialize a new connection.
+ */
+void ceph_con_init(struct ceph_messenger *msgr, struct ceph_connection *con)
+{
+	dout("con_init %p\n", con);
+	memset(con, 0, sizeof(*con));
+	atomic_set(&con->nref, 1);
+	con->msgr = msgr;
+	mutex_init(&con->mutex);
+	INIT_LIST_HEAD(&con->out_queue);
+	INIT_LIST_HEAD(&con->out_sent);
+	INIT_DELAYED_WORK(&con->work, con_work);
+}
+
+
+/*
+ * We maintain a global counter to order connection attempts.  Get
+ * a unique seq greater than @gt.
+ */
+static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt)
+{
+	u32 ret;
+
+	spin_lock(&msgr->global_seq_lock);
+	if (msgr->global_seq < gt)
+		msgr->global_seq = gt;
+	ret = ++msgr->global_seq;
+	spin_unlock(&msgr->global_seq_lock);
+	return ret;
+}
+
+
+/*
+ * Prepare footer for currently outgoing message, and finish things
+ * off.  Assumes out_kvec* are already valid.. we just add on to the end.
+ */
+static void prepare_write_message_footer(struct ceph_connection *con, int v)
+{
+	struct ceph_msg *m = con->out_msg;
+
+	dout("prepare_write_message_footer %p\n", con);
+	con->out_kvec_is_msg = true;
+	con->out_kvec[v].iov_base = &m->footer;
+	con->out_kvec[v].iov_len = sizeof(m->footer);
+	con->out_kvec_bytes += sizeof(m->footer);
+	con->out_kvec_left++;
+	con->out_more = m->more_to_follow;
+	con->out_msg_done = true;
+}
+
+/*
+ * Prepare headers for the next outgoing message.
+ */
+static void prepare_write_message(struct ceph_connection *con)
+{
+	struct ceph_msg *m;
+	int v = 0;
+
+	con->out_kvec_bytes = 0;
+	con->out_kvec_is_msg = true;
+	con->out_msg_done = false;
+
+	/* Sneak an ack in there first?  If we can get it into the same
+	 * TCP packet that's a good thing. */
+	if (con->in_seq > con->in_seq_acked) {
+		con->in_seq_acked = con->in_seq;
+		con->out_kvec[v].iov_base = &tag_ack;
+		con->out_kvec[v++].iov_len = 1;
+		con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
+		con->out_kvec[v].iov_base = &con->out_temp_ack;
+		con->out_kvec[v++].iov_len = sizeof(con->out_temp_ack);
+		con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
+	}
+
+	m = list_first_entry(&con->out_queue,
+		       struct ceph_msg, list_head);
+	con->out_msg = m;
+	if (test_bit(LOSSYTX, &con->state)) {
+		list_del_init(&m->list_head);
+	} else {
+		/* put message on sent list */
+		ceph_msg_get(m);
+		list_move_tail(&m->list_head, &con->out_sent);
+	}
+
+	/*
+	 * only assign outgoing seq # if we haven't sent this message
+	 * yet.  if it is requeued, resend with it's original seq.
+	 */
+	if (m->needs_out_seq) {
+		m->hdr.seq = cpu_to_le64(++con->out_seq);
+		m->needs_out_seq = false;
+	}
+
+	dout("prepare_write_message %p seq %lld type %d len %d+%d+%d %d pgs\n",
+	     m, con->out_seq, le16_to_cpu(m->hdr.type),
+	     le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len),
+	     le32_to_cpu(m->hdr.data_len),
+	     m->nr_pages);
+	BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len);
+
+	/* tag + hdr + front + middle */
+	con->out_kvec[v].iov_base = &tag_msg;
+	con->out_kvec[v++].iov_len = 1;
+	con->out_kvec[v].iov_base = &m->hdr;
+	con->out_kvec[v++].iov_len = sizeof(m->hdr);
+	con->out_kvec[v++] = m->front;
+	if (m->middle)
+		con->out_kvec[v++] = m->middle->vec;
+	con->out_kvec_left = v;
+	con->out_kvec_bytes += 1 + sizeof(m->hdr) + m->front.iov_len +
+		(m->middle ? m->middle->vec.iov_len : 0);
+	con->out_kvec_cur = con->out_kvec;
+
+	/* fill in crc (except data pages), footer */
+	con->out_msg->hdr.crc =
+		cpu_to_le32(crc32c(0, (void *)&m->hdr,
+				      sizeof(m->hdr) - sizeof(m->hdr.crc)));
+	con->out_msg->footer.flags = CEPH_MSG_FOOTER_COMPLETE;
+	con->out_msg->footer.front_crc =
+		cpu_to_le32(crc32c(0, m->front.iov_base, m->front.iov_len));
+	if (m->middle)
+		con->out_msg->footer.middle_crc =
+			cpu_to_le32(crc32c(0, m->middle->vec.iov_base,
+					   m->middle->vec.iov_len));
+	else
+		con->out_msg->footer.middle_crc = 0;
+	con->out_msg->footer.data_crc = 0;
+	dout("prepare_write_message front_crc %u data_crc %u\n",
+	     le32_to_cpu(con->out_msg->footer.front_crc),
+	     le32_to_cpu(con->out_msg->footer.middle_crc));
+
+	/* is there a data payload? */
+	if (le32_to_cpu(m->hdr.data_len) > 0) {
+		/* initialize page iterator */
+		con->out_msg_pos.page = 0;
+		con->out_msg_pos.page_pos =
+			le16_to_cpu(m->hdr.data_off) & ~PAGE_MASK;
+		con->out_msg_pos.data_pos = 0;
+		con->out_msg_pos.did_page_crc = 0;
+		con->out_more = 1;  /* data + footer will follow */
+	} else {
+		/* no, queue up footer too and be done */
+		prepare_write_message_footer(con, v);
+	}
+
+	set_bit(WRITE_PENDING, &con->state);
+}
+
+/*
+ * Prepare an ack.
+ */
+static void prepare_write_ack(struct ceph_connection *con)
+{
+	dout("prepare_write_ack %p %llu -> %llu\n", con,
+	     con->in_seq_acked, con->in_seq);
+	con->in_seq_acked = con->in_seq;
+
+	con->out_kvec[0].iov_base = &tag_ack;
+	con->out_kvec[0].iov_len = 1;
+	con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
+	con->out_kvec[1].iov_base = &con->out_temp_ack;
+	con->out_kvec[1].iov_len = sizeof(con->out_temp_ack);
+	con->out_kvec_left = 2;
+	con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
+	con->out_kvec_cur = con->out_kvec;
+	con->out_more = 1;  /* more will follow.. eventually.. */
+	set_bit(WRITE_PENDING, &con->state);
+}
+
+/*
+ * Prepare to write keepalive byte.
+ */
+static void prepare_write_keepalive(struct ceph_connection *con)
+{
+	dout("prepare_write_keepalive %p\n", con);
+	con->out_kvec[0].iov_base = &tag_keepalive;
+	con->out_kvec[0].iov_len = 1;
+	con->out_kvec_left = 1;
+	con->out_kvec_bytes = 1;
+	con->out_kvec_cur = con->out_kvec;
+	set_bit(WRITE_PENDING, &con->state);
+}
+
+/*
+ * Connection negotiation.
+ */
+
+static void prepare_connect_authorizer(struct ceph_connection *con)
+{
+	void *auth_buf;
+	int auth_len = 0;
+	int auth_protocol = 0;
+
+	mutex_unlock(&con->mutex);
+	if (con->ops->get_authorizer)
+		con->ops->get_authorizer(con, &auth_buf, &auth_len,
+					 &auth_protocol, &con->auth_reply_buf,
+					 &con->auth_reply_buf_len,
+					 con->auth_retry);
+	mutex_lock(&con->mutex);
+
+	con->out_connect.authorizer_protocol = cpu_to_le32(auth_protocol);
+	con->out_connect.authorizer_len = cpu_to_le32(auth_len);
+
+	con->out_kvec[con->out_kvec_left].iov_base = auth_buf;
+	con->out_kvec[con->out_kvec_left].iov_len = auth_len;
+	con->out_kvec_left++;
+	con->out_kvec_bytes += auth_len;
+}
+
+/*
+ * We connected to a peer and are saying hello.
+ */
+static void prepare_write_banner(struct ceph_messenger *msgr,
+				 struct ceph_connection *con)
+{
+	int len = strlen(CEPH_BANNER);
+
+	con->out_kvec[0].iov_base = CEPH_BANNER;
+	con->out_kvec[0].iov_len = len;
+	con->out_kvec[1].iov_base = &msgr->my_enc_addr;
+	con->out_kvec[1].iov_len = sizeof(msgr->my_enc_addr);
+	con->out_kvec_left = 2;
+	con->out_kvec_bytes = len + sizeof(msgr->my_enc_addr);
+	con->out_kvec_cur = con->out_kvec;
+	con->out_more = 0;
+	set_bit(WRITE_PENDING, &con->state);
+}
+
+static void prepare_write_connect(struct ceph_messenger *msgr,
+				  struct ceph_connection *con,
+				  int after_banner)
+{
+	unsigned global_seq = get_global_seq(con->msgr, 0);
+	int proto;
+
+	switch (con->peer_name.type) {
+	case CEPH_ENTITY_TYPE_MON:
+		proto = CEPH_MONC_PROTOCOL;
+		break;
+	case CEPH_ENTITY_TYPE_OSD:
+		proto = CEPH_OSDC_PROTOCOL;
+		break;
+	case CEPH_ENTITY_TYPE_MDS:
+		proto = CEPH_MDSC_PROTOCOL;
+		break;
+	default:
+		BUG();
+	}
+
+	dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
+	     con->connect_seq, global_seq, proto);
+
+	con->out_connect.features = cpu_to_le64(CEPH_FEATURE_SUPPORTED_CLIENT);
+	con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
+	con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
+	con->out_connect.global_seq = cpu_to_le32(global_seq);
+	con->out_connect.protocol_version = cpu_to_le32(proto);
+	con->out_connect.flags = 0;
+
+	if (!after_banner) {
+		con->out_kvec_left = 0;
+		con->out_kvec_bytes = 0;
+	}
+	con->out_kvec[con->out_kvec_left].iov_base = &con->out_connect;
+	con->out_kvec[con->out_kvec_left].iov_len = sizeof(con->out_connect);
+	con->out_kvec_left++;
+	con->out_kvec_bytes += sizeof(con->out_connect);
+	con->out_kvec_cur = con->out_kvec;
+	con->out_more = 0;
+	set_bit(WRITE_PENDING, &con->state);
+
+	prepare_connect_authorizer(con);
+}
+
+
+/*
+ * write as much of pending kvecs to the socket as we can.
+ *  1 -> done
+ *  0 -> socket full, but more to do
+ * <0 -> error
+ */
+static int write_partial_kvec(struct ceph_connection *con)
+{
+	int ret;
+
+	dout("write_partial_kvec %p %d left\n", con, con->out_kvec_bytes);
+	while (con->out_kvec_bytes > 0) {
+		ret = ceph_tcp_sendmsg(con->sock, con->out_kvec_cur,
+				       con->out_kvec_left, con->out_kvec_bytes,
+				       con->out_more);
+		if (ret <= 0)
+			goto out;
+		con->out_kvec_bytes -= ret;
+		if (con->out_kvec_bytes == 0)
+			break;            /* done */
+		while (ret > 0) {
+			if (ret >= con->out_kvec_cur->iov_len) {
+				ret -= con->out_kvec_cur->iov_len;
+				con->out_kvec_cur++;
+				con->out_kvec_left--;
+			} else {
+				con->out_kvec_cur->iov_len -= ret;
+				con->out_kvec_cur->iov_base += ret;
+				ret = 0;
+				break;
+			}
+		}
+	}
+	con->out_kvec_left = 0;
+	con->out_kvec_is_msg = false;
+	ret = 1;
+out:
+	dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con,
+	     con->out_kvec_bytes, con->out_kvec_left, ret);
+	return ret;  /* done! */
+}
+
+/*
+ * Write as much message data payload as we can.  If we finish, queue
+ * up the footer.
+ *  1 -> done, footer is now queued in out_kvec[].
+ *  0 -> socket full, but more to do
+ * <0 -> error
+ */
+static int write_partial_msg_pages(struct ceph_connection *con)
+{
+	struct ceph_msg *msg = con->out_msg;
+	unsigned data_len = le32_to_cpu(msg->hdr.data_len);
+	size_t len;
+	int crc = con->msgr->nocrc;
+	int ret;
+
+	dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n",
+	     con, con->out_msg, con->out_msg_pos.page, con->out_msg->nr_pages,
+	     con->out_msg_pos.page_pos);
+
+	while (con->out_msg_pos.page < con->out_msg->nr_pages) {
+		struct page *page = NULL;
+		void *kaddr = NULL;
+
+		/*
+		 * if we are calculating the data crc (the default), we need
+		 * to map the page.  if our pages[] has been revoked, use the
+		 * zero page.
+		 */
+		if (msg->pages) {
+			page = msg->pages[con->out_msg_pos.page];
+			if (crc)
+				kaddr = kmap(page);
+		} else if (msg->pagelist) {
+			page = list_first_entry(&msg->pagelist->head,
+						struct page, lru);
+			if (crc)
+				kaddr = kmap(page);
+		} else {
+			page = con->msgr->zero_page;
+			if (crc)
+				kaddr = page_address(con->msgr->zero_page);
+		}
+		len = min((int)(PAGE_SIZE - con->out_msg_pos.page_pos),
+			  (int)(data_len - con->out_msg_pos.data_pos));
+		if (crc && !con->out_msg_pos.did_page_crc) {
+			void *base = kaddr + con->out_msg_pos.page_pos;
+			u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc);
+
+			BUG_ON(kaddr == NULL);
+			con->out_msg->footer.data_crc =
+				cpu_to_le32(crc32c(tmpcrc, base, len));
+			con->out_msg_pos.did_page_crc = 1;
+		}
+
+		ret = kernel_sendpage(con->sock, page,
+				      con->out_msg_pos.page_pos, len,
+				      MSG_DONTWAIT | MSG_NOSIGNAL |
+				      MSG_MORE);
+
+		if (crc && (msg->pages || msg->pagelist))
+			kunmap(page);
+
+		if (ret <= 0)
+			goto out;
+
+		con->out_msg_pos.data_pos += ret;
+		con->out_msg_pos.page_pos += ret;
+		if (ret == len) {
+			con->out_msg_pos.page_pos = 0;
+			con->out_msg_pos.page++;
+			con->out_msg_pos.did_page_crc = 0;
+			if (msg->pagelist)
+				list_move_tail(&page->lru,
+					       &msg->pagelist->head);
+		}
+	}
+
+	dout("write_partial_msg_pages %p msg %p done\n", con, msg);
+
+	/* prepare and queue up footer, too */
+	if (!crc)
+		con->out_msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC;
+	con->out_kvec_bytes = 0;
+	con->out_kvec_left = 0;
+	con->out_kvec_cur = con->out_kvec;
+	prepare_write_message_footer(con, 0);
+	ret = 1;
+out:
+	return ret;
+}
+
+/*
+ * write some zeros
+ */
+static int write_partial_skip(struct ceph_connection *con)
+{
+	int ret;
+
+	while (con->out_skip > 0) {
+		struct kvec iov = {
+			.iov_base = page_address(con->msgr->zero_page),
+			.iov_len = min(con->out_skip, (int)PAGE_CACHE_SIZE)
+		};
+
+		ret = ceph_tcp_sendmsg(con->sock, &iov, 1, iov.iov_len, 1);
+		if (ret <= 0)
+			goto out;
+		con->out_skip -= ret;
+	}
+	ret = 1;
+out:
+	return ret;
+}
+
+/*
+ * Prepare to read connection handshake, or an ack.
+ */
+static void prepare_read_banner(struct ceph_connection *con)
+{
+	dout("prepare_read_banner %p\n", con);
+	con->in_base_pos = 0;
+}
+
+static void prepare_read_connect(struct ceph_connection *con)
+{
+	dout("prepare_read_connect %p\n", con);
+	con->in_base_pos = 0;
+}
+
+static void prepare_read_ack(struct ceph_connection *con)
+{
+	dout("prepare_read_ack %p\n", con);
+	con->in_base_pos = 0;
+}
+
+static void prepare_read_tag(struct ceph_connection *con)
+{
+	dout("prepare_read_tag %p\n", con);
+	con->in_base_pos = 0;
+	con->in_tag = CEPH_MSGR_TAG_READY;
+}
+
+/*
+ * Prepare to read a message.
+ */
+static int prepare_read_message(struct ceph_connection *con)
+{
+	dout("prepare_read_message %p\n", con);
+	BUG_ON(con->in_msg != NULL);
+	con->in_base_pos = 0;
+	con->in_front_crc = con->in_middle_crc = con->in_data_crc = 0;
+	return 0;
+}
+
+
+static int read_partial(struct ceph_connection *con,
+			int *to, int size, void *object)
+{
+	*to += size;
+	while (con->in_base_pos < *to) {
+		int left = *to - con->in_base_pos;
+		int have = size - left;
+		int ret = ceph_tcp_recvmsg(con->sock, object + have, left);
+		if (ret <= 0)
+			return ret;
+		con->in_base_pos += ret;
+	}
+	return 1;
+}
+
+
+/*
+ * Read all or part of the connect-side handshake on a new connection
+ */
+static int read_partial_banner(struct ceph_connection *con)
+{
+	int ret, to = 0;
+
+	dout("read_partial_banner %p at %d\n", con, con->in_base_pos);
+
+	/* peer's banner */
+	ret = read_partial(con, &to, strlen(CEPH_BANNER), con->in_banner);
+	if (ret <= 0)
+		goto out;
+	ret = read_partial(con, &to, sizeof(con->actual_peer_addr),
+			   &con->actual_peer_addr);
+	if (ret <= 0)
+		goto out;
+	ret = read_partial(con, &to, sizeof(con->peer_addr_for_me),
+			   &con->peer_addr_for_me);
+	if (ret <= 0)
+		goto out;
+out:
+	return ret;
+}
+
+static int read_partial_connect(struct ceph_connection *con)
+{
+	int ret, to = 0;
+
+	dout("read_partial_connect %p at %d\n", con, con->in_base_pos);
+
+	ret = read_partial(con, &to, sizeof(con->in_reply), &con->in_reply);
+	if (ret <= 0)
+		goto out;
+	ret = read_partial(con, &to, le32_to_cpu(con->in_reply.authorizer_len),
+			   con->auth_reply_buf);
+	if (ret <= 0)
+		goto out;
+
+	dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n",
+	     con, (int)con->in_reply.tag,
+	     le32_to_cpu(con->in_reply.connect_seq),
+	     le32_to_cpu(con->in_reply.global_seq));
+out:
+	return ret;
+
+}
+
+/*
+ * Verify the hello banner looks okay.
+ */
+static int verify_hello(struct ceph_connection *con)
+{
+	if (memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) {
+		pr_err("connect to %s got bad banner\n",
+		       pr_addr(&con->peer_addr.in_addr));
+		con->error_msg = "protocol error, bad banner";
+		return -1;
+	}
+	return 0;
+}
+
+static bool addr_is_blank(struct sockaddr_storage *ss)
+{
+	switch (ss->ss_family) {
+	case AF_INET:
+		return ((struct sockaddr_in *)ss)->sin_addr.s_addr == 0;
+	case AF_INET6:
+		return
+		     ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[0] == 0 &&
+		     ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[1] == 0 &&
+		     ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[2] == 0 &&
+		     ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[3] == 0;
+	}
+	return false;
+}
+
+static int addr_port(struct sockaddr_storage *ss)
+{
+	switch (ss->ss_family) {
+	case AF_INET:
+		return ntohs(((struct sockaddr_in *)ss)->sin_port);
+	case AF_INET6:
+		return ntohs(((struct sockaddr_in6 *)ss)->sin6_port);
+	}
+	return 0;
+}
+
+static void addr_set_port(struct sockaddr_storage *ss, int p)
+{
+	switch (ss->ss_family) {
+	case AF_INET:
+		((struct sockaddr_in *)ss)->sin_port = htons(p);
+	case AF_INET6:
+		((struct sockaddr_in6 *)ss)->sin6_port = htons(p);
+	}
+}
+
+/*
+ * Parse an ip[:port] list into an addr array.  Use the default
+ * monitor port if a port isn't specified.
+ */
+int ceph_parse_ips(const char *c, const char *end,
+		   struct ceph_entity_addr *addr,
+		   int max_count, int *count)
+{
+	int i;
+	const char *p = c;
+
+	dout("parse_ips on '%.*s'\n", (int)(end-c), c);
+	for (i = 0; i < max_count; i++) {
+		const char *ipend;
+		struct sockaddr_storage *ss = &addr[i].in_addr;
+		struct sockaddr_in *in4 = (void *)ss;
+		struct sockaddr_in6 *in6 = (void *)ss;
+		int port;
+		char delim = ',';
+
+		if (*p == '[') {
+			delim = ']';
+			p++;
+		}
+
+		memset(ss, 0, sizeof(*ss));
+		if (in4_pton(p, end - p, (u8 *)&in4->sin_addr.s_addr,
+			     delim, &ipend))
+			ss->ss_family = AF_INET;
+		else if (in6_pton(p, end - p, (u8 *)&in6->sin6_addr.s6_addr,
+				  delim, &ipend))
+			ss->ss_family = AF_INET6;
+		else
+			goto bad;
+		p = ipend;
+
+		if (delim == ']') {
+			if (*p != ']') {
+				dout("missing matching ']'\n");
+				goto bad;
+			}
+			p++;
+		}
+
+		/* port? */
+		if (p < end && *p == ':') {
+			port = 0;
+			p++;
+			while (p < end && *p >= '0' && *p <= '9') {
+				port = (port * 10) + (*p - '0');
+				p++;
+			}
+			if (port > 65535 || port == 0)
+				goto bad;
+		} else {
+			port = CEPH_MON_PORT;
+		}
+
+		addr_set_port(ss, port);
+
+		dout("parse_ips got %s\n", pr_addr(ss));
+
+		if (p == end)
+			break;
+		if (*p != ',')
+			goto bad;
+		p++;
+	}
+
+	if (p != end)
+		goto bad;
+
+	if (count)
+		*count = i + 1;
+	return 0;
+
+bad:
+	pr_err("parse_ips bad ip '%.*s'\n", (int)(end - c), c);
+	return -EINVAL;
+}
+
+static int process_banner(struct ceph_connection *con)
+{
+	dout("process_banner on %p\n", con);
+
+	if (verify_hello(con) < 0)
+		return -1;
+
+	ceph_decode_addr(&con->actual_peer_addr);
+	ceph_decode_addr(&con->peer_addr_for_me);
+
+	/*
+	 * Make sure the other end is who we wanted.  note that the other
+	 * end may not yet know their ip address, so if it's 0.0.0.0, give
+	 * them the benefit of the doubt.
+	 */
+	if (memcmp(&con->peer_addr, &con->actual_peer_addr,
+		   sizeof(con->peer_addr)) != 0 &&
+	    !(addr_is_blank(&con->actual_peer_addr.in_addr) &&
+	      con->actual_peer_addr.nonce == con->peer_addr.nonce)) {
+		pr_warning("wrong peer, want %s/%lld, got %s/%lld\n",
+			   pr_addr(&con->peer_addr.in_addr),
+			   le64_to_cpu(con->peer_addr.nonce),
+			   pr_addr(&con->actual_peer_addr.in_addr),
+			   le64_to_cpu(con->actual_peer_addr.nonce));
+		con->error_msg = "wrong peer at address";
+		return -1;
+	}
+
+	/*
+	 * did we learn our address?
+	 */
+	if (addr_is_blank(&con->msgr->inst.addr.in_addr)) {
+		int port = addr_port(&con->msgr->inst.addr.in_addr);
+
+		memcpy(&con->msgr->inst.addr.in_addr,
+		       &con->peer_addr_for_me.in_addr,
+		       sizeof(con->peer_addr_for_me.in_addr));
+		addr_set_port(&con->msgr->inst.addr.in_addr, port);
+		encode_my_addr(con->msgr);
+		dout("process_banner learned my addr is %s\n",
+		     pr_addr(&con->msgr->inst.addr.in_addr));
+	}
+
+	set_bit(NEGOTIATING, &con->state);
+	prepare_read_connect(con);
+	return 0;
+}
+
+static void fail_protocol(struct ceph_connection *con)
+{
+	reset_connection(con);
+	set_bit(CLOSED, &con->state);  /* in case there's queued work */
+
+	mutex_unlock(&con->mutex);
+	if (con->ops->bad_proto)
+		con->ops->bad_proto(con);
+	mutex_lock(&con->mutex);
+}
+
+static int process_connect(struct ceph_connection *con)
+{
+	u64 sup_feat = CEPH_FEATURE_SUPPORTED_CLIENT;
+	u64 req_feat = CEPH_FEATURE_REQUIRED_CLIENT;
+	u64 server_feat = le64_to_cpu(con->in_reply.features);
+
+	dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
+
+	switch (con->in_reply.tag) {
+	case CEPH_MSGR_TAG_FEATURES:
+		pr_err("%s%lld %s feature set mismatch,"
+		       " my %llx < server's %llx, missing %llx\n",
+		       ENTITY_NAME(con->peer_name),
+		       pr_addr(&con->peer_addr.in_addr),
+		       sup_feat, server_feat, server_feat & ~sup_feat);
+		con->error_msg = "missing required protocol features";
+		fail_protocol(con);
+		return -1;
+
+	case CEPH_MSGR_TAG_BADPROTOVER:
+		pr_err("%s%lld %s protocol version mismatch,"
+		       " my %d != server's %d\n",
+		       ENTITY_NAME(con->peer_name),
+		       pr_addr(&con->peer_addr.in_addr),
+		       le32_to_cpu(con->out_connect.protocol_version),
+		       le32_to_cpu(con->in_reply.protocol_version));
+		con->error_msg = "protocol version mismatch";
+		fail_protocol(con);
+		return -1;
+
+	case CEPH_MSGR_TAG_BADAUTHORIZER:
+		con->auth_retry++;
+		dout("process_connect %p got BADAUTHORIZER attempt %d\n", con,
+		     con->auth_retry);
+		if (con->auth_retry == 2) {
+			con->error_msg = "connect authorization failure";
+			reset_connection(con);
+			set_bit(CLOSED, &con->state);
+			return -1;
+		}
+		con->auth_retry = 1;
+		prepare_write_connect(con->msgr, con, 0);
+		prepare_read_connect(con);
+		break;
+
+	case CEPH_MSGR_TAG_RESETSESSION:
+		/*
+		 * If we connected with a large connect_seq but the peer
+		 * has no record of a session with us (no connection, or
+		 * connect_seq == 0), they will send RESETSESION to indicate
+		 * that they must have reset their session, and may have
+		 * dropped messages.
+		 */
+		dout("process_connect got RESET peer seq %u\n",
+		     le32_to_cpu(con->in_connect.connect_seq));
+		pr_err("%s%lld %s connection reset\n",
+		       ENTITY_NAME(con->peer_name),
+		       pr_addr(&con->peer_addr.in_addr));
+		reset_connection(con);
+		prepare_write_connect(con->msgr, con, 0);
+		prepare_read_connect(con);
+
+		/* Tell ceph about it. */
+		mutex_unlock(&con->mutex);
+		pr_info("reset on %s%lld\n", ENTITY_NAME(con->peer_name));
+		if (con->ops->peer_reset)
+			con->ops->peer_reset(con);
+		mutex_lock(&con->mutex);
+		break;
+
+	case CEPH_MSGR_TAG_RETRY_SESSION:
+		/*
+		 * If we sent a smaller connect_seq than the peer has, try
+		 * again with a larger value.
+		 */
+		dout("process_connect got RETRY my seq = %u, peer_seq = %u\n",
+		     le32_to_cpu(con->out_connect.connect_seq),
+		     le32_to_cpu(con->in_connect.connect_seq));
+		con->connect_seq = le32_to_cpu(con->in_connect.connect_seq);
+		prepare_write_connect(con->msgr, con, 0);
+		prepare_read_connect(con);
+		break;
+
+	case CEPH_MSGR_TAG_RETRY_GLOBAL:
+		/*
+		 * If we sent a smaller global_seq than the peer has, try
+		 * again with a larger value.
+		 */
+		dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n",
+		     con->peer_global_seq,
+		     le32_to_cpu(con->in_connect.global_seq));
+		get_global_seq(con->msgr,
+			       le32_to_cpu(con->in_connect.global_seq));
+		prepare_write_connect(con->msgr, con, 0);
+		prepare_read_connect(con);
+		break;
+
+	case CEPH_MSGR_TAG_READY:
+		if (req_feat & ~server_feat) {
+			pr_err("%s%lld %s protocol feature mismatch,"
+			       " my required %llx > server's %llx, need %llx\n",
+			       ENTITY_NAME(con->peer_name),
+			       pr_addr(&con->peer_addr.in_addr),
+			       req_feat, server_feat, req_feat & ~server_feat);
+			con->error_msg = "missing required protocol features";
+			fail_protocol(con);
+			return -1;
+		}
+		clear_bit(CONNECTING, &con->state);
+		con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);
+		con->connect_seq++;
+		con->peer_features = server_feat;
+		dout("process_connect got READY gseq %d cseq %d (%d)\n",
+		     con->peer_global_seq,
+		     le32_to_cpu(con->in_reply.connect_seq),
+		     con->connect_seq);
+		WARN_ON(con->connect_seq !=
+			le32_to_cpu(con->in_reply.connect_seq));
+
+		if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY)
+			set_bit(LOSSYTX, &con->state);
+
+		prepare_read_tag(con);
+		break;
+
+	case CEPH_MSGR_TAG_WAIT:
+		/*
+		 * If there is a connection race (we are opening
+		 * connections to each other), one of us may just have
+		 * to WAIT.  This shouldn't happen if we are the
+		 * client.
+		 */
+		pr_err("process_connect peer connecting WAIT\n");
+
+	default:
+		pr_err("connect protocol error, will retry\n");
+		con->error_msg = "protocol error, garbage tag during connect";
+		return -1;
+	}
+	return 0;
+}
+
+
+/*
+ * read (part of) an ack
+ */
+static int read_partial_ack(struct ceph_connection *con)
+{
+	int to = 0;
+
+	return read_partial(con, &to, sizeof(con->in_temp_ack),
+			    &con->in_temp_ack);
+}
+
+
+/*
+ * We can finally discard anything that's been acked.
+ */
+static void process_ack(struct ceph_connection *con)
+{
+	struct ceph_msg *m;
+	u64 ack = le64_to_cpu(con->in_temp_ack);
+	u64 seq;
+
+	while (!list_empty(&con->out_sent)) {
+		m = list_first_entry(&con->out_sent, struct ceph_msg,
+				     list_head);
+		seq = le64_to_cpu(m->hdr.seq);
+		if (seq > ack)
+			break;
+		dout("got ack for seq %llu type %d at %p\n", seq,
+		     le16_to_cpu(m->hdr.type), m);
+		ceph_msg_remove(m);
+	}
+	prepare_read_tag(con);
+}
+
+
+
+
+static int read_partial_message_section(struct ceph_connection *con,
+					struct kvec *section, unsigned int sec_len,
+					u32 *crc)
+{
+	int left;
+	int ret;
+
+	BUG_ON(!section);
+
+	while (section->iov_len < sec_len) {
+		BUG_ON(section->iov_base == NULL);
+		left = sec_len - section->iov_len;
+		ret = ceph_tcp_recvmsg(con->sock, (char *)section->iov_base +
+				       section->iov_len, left);
+		if (ret <= 0)
+			return ret;
+		section->iov_len += ret;
+		if (section->iov_len == sec_len)
+			*crc = crc32c(0, section->iov_base,
+				      section->iov_len);
+	}
+
+	return 1;
+}
+
+static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
+				struct ceph_msg_header *hdr,
+				int *skip);
+/*
+ * read (part of) a message.
+ */
+static int read_partial_message(struct ceph_connection *con)
+{
+	struct ceph_msg *m = con->in_msg;
+	void *p;
+	int ret;
+	int to, left;
+	unsigned front_len, middle_len, data_len, data_off;
+	int datacrc = con->msgr->nocrc;
+	int skip;
+	u64 seq;
+
+	dout("read_partial_message con %p msg %p\n", con, m);
+
+	/* header */
+	while (con->in_base_pos < sizeof(con->in_hdr)) {
+		left = sizeof(con->in_hdr) - con->in_base_pos;
+		ret = ceph_tcp_recvmsg(con->sock,
+				       (char *)&con->in_hdr + con->in_base_pos,
+				       left);
+		if (ret <= 0)
+			return ret;
+		con->in_base_pos += ret;
+		if (con->in_base_pos == sizeof(con->in_hdr)) {
+			u32 crc = crc32c(0, (void *)&con->in_hdr,
+				 sizeof(con->in_hdr) - sizeof(con->in_hdr.crc));
+			if (crc != le32_to_cpu(con->in_hdr.crc)) {
+				pr_err("read_partial_message bad hdr "
+				       " crc %u != expected %u\n",
+				       crc, con->in_hdr.crc);
+				return -EBADMSG;
+			}
+		}
+	}
+	front_len = le32_to_cpu(con->in_hdr.front_len);
+	if (front_len > CEPH_MSG_MAX_FRONT_LEN)
+		return -EIO;
+	middle_len = le32_to_cpu(con->in_hdr.middle_len);
+	if (middle_len > CEPH_MSG_MAX_DATA_LEN)
+		return -EIO;
+	data_len = le32_to_cpu(con->in_hdr.data_len);
+	if (data_len > CEPH_MSG_MAX_DATA_LEN)
+		return -EIO;
+	data_off = le16_to_cpu(con->in_hdr.data_off);
+
+	/* verify seq# */
+	seq = le64_to_cpu(con->in_hdr.seq);
+	if ((s64)seq - (s64)con->in_seq < 1) {
+		pr_info("skipping %s%lld %s seq %lld, expected %lld\n",
+			ENTITY_NAME(con->peer_name),
+			pr_addr(&con->peer_addr.in_addr),
+			seq, con->in_seq + 1);
+		con->in_base_pos = -front_len - middle_len - data_len -
+			sizeof(m->footer);
+		con->in_tag = CEPH_MSGR_TAG_READY;
+		con->in_seq++;
+		return 0;
+	} else if ((s64)seq - (s64)con->in_seq > 1) {
+		pr_err("read_partial_message bad seq %lld expected %lld\n",
+		       seq, con->in_seq + 1);
+		con->error_msg = "bad message sequence # for incoming message";
+		return -EBADMSG;
+	}
+
+	/* allocate message? */
+	if (!con->in_msg) {
+		dout("got hdr type %d front %d data %d\n", con->in_hdr.type,
+		     con->in_hdr.front_len, con->in_hdr.data_len);
+		skip = 0;
+		con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip);
+		if (skip) {
+			/* skip this message */
+			dout("alloc_msg said skip message\n");
+			BUG_ON(con->in_msg);
+			con->in_base_pos = -front_len - middle_len - data_len -
+				sizeof(m->footer);
+			con->in_tag = CEPH_MSGR_TAG_READY;
+			con->in_seq++;
+			return 0;
+		}
+		if (!con->in_msg) {
+			con->error_msg =
+				"error allocating memory for incoming message";
+			return -ENOMEM;
+		}
+		m = con->in_msg;
+		m->front.iov_len = 0;    /* haven't read it yet */
+		if (m->middle)
+			m->middle->vec.iov_len = 0;
+
+		con->in_msg_pos.page = 0;
+		con->in_msg_pos.page_pos = data_off & ~PAGE_MASK;
+		con->in_msg_pos.data_pos = 0;
+	}
+
+	/* front */
+	ret = read_partial_message_section(con, &m->front, front_len,
+					   &con->in_front_crc);
+	if (ret <= 0)
+		return ret;
+
+	/* middle */
+	if (m->middle) {
+		ret = read_partial_message_section(con, &m->middle->vec, middle_len,
+						   &con->in_middle_crc);
+		if (ret <= 0)
+			return ret;
+	}
+
+	/* (page) data */
+	while (con->in_msg_pos.data_pos < data_len) {
+		left = min((int)(data_len - con->in_msg_pos.data_pos),
+			   (int)(PAGE_SIZE - con->in_msg_pos.page_pos));
+		BUG_ON(m->pages == NULL);
+		p = kmap(m->pages[con->in_msg_pos.page]);
+		ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos,
+				       left);
+		if (ret > 0 && datacrc)
+			con->in_data_crc =
+				crc32c(con->in_data_crc,
+					  p + con->in_msg_pos.page_pos, ret);
+		kunmap(m->pages[con->in_msg_pos.page]);
+		if (ret <= 0)
+			return ret;
+		con->in_msg_pos.data_pos += ret;
+		con->in_msg_pos.page_pos += ret;
+		if (con->in_msg_pos.page_pos == PAGE_SIZE) {
+			con->in_msg_pos.page_pos = 0;
+			con->in_msg_pos.page++;
+		}
+	}
+
+	/* footer */
+	to = sizeof(m->hdr) + sizeof(m->footer);
+	while (con->in_base_pos < to) {
+		left = to - con->in_base_pos;
+		ret = ceph_tcp_recvmsg(con->sock, (char *)&m->footer +
+				       (con->in_base_pos - sizeof(m->hdr)),
+				       left);
+		if (ret <= 0)
+			return ret;
+		con->in_base_pos += ret;
+	}
+	dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n",
+	     m, front_len, m->footer.front_crc, middle_len,
+	     m->footer.middle_crc, data_len, m->footer.data_crc);
+
+	/* crc ok? */
+	if (con->in_front_crc != le32_to_cpu(m->footer.front_crc)) {
+		pr_err("read_partial_message %p front crc %u != exp. %u\n",
+		       m, con->in_front_crc, m->footer.front_crc);
+		return -EBADMSG;
+	}
+	if (con->in_middle_crc != le32_to_cpu(m->footer.middle_crc)) {
+		pr_err("read_partial_message %p middle crc %u != exp %u\n",
+		       m, con->in_middle_crc, m->footer.middle_crc);
+		return -EBADMSG;
+	}
+	if (datacrc &&
+	    (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 &&
+	    con->in_data_crc != le32_to_cpu(m->footer.data_crc)) {
+		pr_err("read_partial_message %p data crc %u != exp. %u\n", m,
+		       con->in_data_crc, le32_to_cpu(m->footer.data_crc));
+		return -EBADMSG;
+	}
+
+	return 1; /* done! */
+}
+
+/*
+ * Process message.  This happens in the worker thread.  The callback should
+ * be careful not to do anything that waits on other incoming messages or it
+ * may deadlock.
+ */
+static void process_message(struct ceph_connection *con)
+{
+	struct ceph_msg *msg;
+
+	msg = con->in_msg;
+	con->in_msg = NULL;
+
+	/* if first message, set peer_name */
+	if (con->peer_name.type == 0)
+		con->peer_name = msg->hdr.src;
+
+	con->in_seq++;
+	mutex_unlock(&con->mutex);
+
+	dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n",
+	     msg, le64_to_cpu(msg->hdr.seq),
+	     ENTITY_NAME(msg->hdr.src),
+	     le16_to_cpu(msg->hdr.type),
+	     ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
+	     le32_to_cpu(msg->hdr.front_len),
+	     le32_to_cpu(msg->hdr.data_len),
+	     con->in_front_crc, con->in_middle_crc, con->in_data_crc);
+	con->ops->dispatch(con, msg);
+
+	mutex_lock(&con->mutex);
+	prepare_read_tag(con);
+}
+
+
+/*
+ * Write something to the socket.  Called in a worker thread when the
+ * socket appears to be writeable and we have something ready to send.
+ */
+static int try_write(struct ceph_connection *con)
+{
+	struct ceph_messenger *msgr = con->msgr;
+	int ret = 1;
+
+	dout("try_write start %p state %lu nref %d\n", con, con->state,
+	     atomic_read(&con->nref));
+
+more:
+	dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes);
+
+	/* open the socket first? */
+	if (con->sock == NULL) {
+		/*
+		 * if we were STANDBY and are reconnecting _this_
+		 * connection, bump connect_seq now.  Always bump
+		 * global_seq.
+		 */
+		if (test_and_clear_bit(STANDBY, &con->state))
+			con->connect_seq++;
+
+		prepare_write_banner(msgr, con);
+		prepare_write_connect(msgr, con, 1);
+		prepare_read_banner(con);
+		set_bit(CONNECTING, &con->state);
+		clear_bit(NEGOTIATING, &con->state);
+
+		BUG_ON(con->in_msg);
+		con->in_tag = CEPH_MSGR_TAG_READY;
+		dout("try_write initiating connect on %p new state %lu\n",
+		     con, con->state);
+		con->sock = ceph_tcp_connect(con);
+		if (IS_ERR(con->sock)) {
+			con->sock = NULL;
+			con->error_msg = "connect error";
+			ret = -1;
+			goto out;
+		}
+	}
+
+more_kvec:
+	/* kvec data queued? */
+	if (con->out_skip) {
+		ret = write_partial_skip(con);
+		if (ret <= 0)
+			goto done;
+		if (ret < 0) {
+			dout("try_write write_partial_skip err %d\n", ret);
+			goto done;
+		}
+	}
+	if (con->out_kvec_left) {
+		ret = write_partial_kvec(con);
+		if (ret <= 0)
+			goto done;
+	}
+
+	/* msg pages? */
+	if (con->out_msg) {
+		if (con->out_msg_done) {
+			ceph_msg_put(con->out_msg);
+			con->out_msg = NULL;   /* we're done with this one */
+			goto do_next;
+		}
+
+		ret = write_partial_msg_pages(con);
+		if (ret == 1)
+			goto more_kvec;  /* we need to send the footer, too! */
+		if (ret == 0)
+			goto done;
+		if (ret < 0) {
+			dout("try_write write_partial_msg_pages err %d\n",
+			     ret);
+			goto done;
+		}
+	}
+
+do_next:
+	if (!test_bit(CONNECTING, &con->state)) {
+		/* is anything else pending? */
+		if (!list_empty(&con->out_queue)) {
+			prepare_write_message(con);
+			goto more;
+		}
+		if (con->in_seq > con->in_seq_acked) {
+			prepare_write_ack(con);
+			goto more;
+		}
+		if (test_and_clear_bit(KEEPALIVE_PENDING, &con->state)) {
+			prepare_write_keepalive(con);
+			goto more;
+		}
+	}
+
+	/* Nothing to do! */
+	clear_bit(WRITE_PENDING, &con->state);
+	dout("try_write nothing else to write.\n");
+done:
+	ret = 0;
+out:
+	dout("try_write done on %p\n", con);
+	return ret;
+}
+
+
+
+/*
+ * Read what we can from the socket.
+ */
+static int try_read(struct ceph_connection *con)
+{
+	int ret = -1;
+
+	if (!con->sock)
+		return 0;
+
+	if (test_bit(STANDBY, &con->state))
+		return 0;
+
+	dout("try_read start on %p\n", con);
+
+more:
+	dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag,
+	     con->in_base_pos);
+	if (test_bit(CONNECTING, &con->state)) {
+		if (!test_bit(NEGOTIATING, &con->state)) {
+			dout("try_read connecting\n");
+			ret = read_partial_banner(con);
+			if (ret <= 0)
+				goto done;
+			if (process_banner(con) < 0) {
+				ret = -1;
+				goto out;
+			}
+		}
+		ret = read_partial_connect(con);
+		if (ret <= 0)
+			goto done;
+		if (process_connect(con) < 0) {
+			ret = -1;
+			goto out;
+		}
+		goto more;
+	}
+
+	if (con->in_base_pos < 0) {
+		/*
+		 * skipping + discarding content.
+		 *
+		 * FIXME: there must be a better way to do this!
+		 */
+		static char buf[1024];
+		int skip = min(1024, -con->in_base_pos);
+		dout("skipping %d / %d bytes\n", skip, -con->in_base_pos);
+		ret = ceph_tcp_recvmsg(con->sock, buf, skip);
+		if (ret <= 0)
+			goto done;
+		con->in_base_pos += ret;
+		if (con->in_base_pos)
+			goto more;
+	}
+	if (con->in_tag == CEPH_MSGR_TAG_READY) {
+		/*
+		 * what's next?
+		 */
+		ret = ceph_tcp_recvmsg(con->sock, &con->in_tag, 1);
+		if (ret <= 0)
+			goto done;
+		dout("try_read got tag %d\n", (int)con->in_tag);
+		switch (con->in_tag) {
+		case CEPH_MSGR_TAG_MSG:
+			prepare_read_message(con);
+			break;
+		case CEPH_MSGR_TAG_ACK:
+			prepare_read_ack(con);
+			break;
+		case CEPH_MSGR_TAG_CLOSE:
+			set_bit(CLOSED, &con->state);   /* fixme */
+			goto done;
+		default:
+			goto bad_tag;
+		}
+	}
+	if (con->in_tag == CEPH_MSGR_TAG_MSG) {
+		ret = read_partial_message(con);
+		if (ret <= 0) {
+			switch (ret) {
+			case -EBADMSG:
+				con->error_msg = "bad crc";
+				ret = -EIO;
+				goto out;
+			case -EIO:
+				con->error_msg = "io error";
+				goto out;
+			default:
+				goto done;
+			}
+		}
+		if (con->in_tag == CEPH_MSGR_TAG_READY)
+			goto more;
+		process_message(con);
+		goto more;
+	}
+	if (con->in_tag == CEPH_MSGR_TAG_ACK) {
+		ret = read_partial_ack(con);
+		if (ret <= 0)
+			goto done;
+		process_ack(con);
+		goto more;
+	}
+
+done:
+	ret = 0;
+out:
+	dout("try_read done on %p\n", con);
+	return ret;
+
+bad_tag:
+	pr_err("try_read bad con->in_tag = %d\n", (int)con->in_tag);
+	con->error_msg = "protocol error, garbage tag";
+	ret = -1;
+	goto out;
+}
+
+
+/*
+ * Atomically queue work on a connection.  Bump @con reference to
+ * avoid races with connection teardown.
+ *
+ * There is some trickery going on with QUEUED and BUSY because we
+ * only want a _single_ thread operating on each connection at any
+ * point in time, but we want to use all available CPUs.
+ *
+ * The worker thread only proceeds if it can atomically set BUSY.  It
+ * clears QUEUED and does it's thing.  When it thinks it's done, it
+ * clears BUSY, then rechecks QUEUED.. if it's set again, it loops
+ * (tries again to set BUSY).
+ *
+ * To queue work, we first set QUEUED, _then_ if BUSY isn't set, we
+ * try to queue work.  If that fails (work is already queued, or BUSY)
+ * we give up (work also already being done or is queued) but leave QUEUED
+ * set so that the worker thread will loop if necessary.
+ */
+static void queue_con(struct ceph_connection *con)
+{
+	if (test_bit(DEAD, &con->state)) {
+		dout("queue_con %p ignoring: DEAD\n",
+		     con);
+		return;
+	}
+
+	if (!con->ops->get(con)) {
+		dout("queue_con %p ref count 0\n", con);
+		return;
+	}
+
+	set_bit(QUEUED, &con->state);
+	if (test_bit(BUSY, &con->state)) {
+		dout("queue_con %p - already BUSY\n", con);
+		con->ops->put(con);
+	} else if (!queue_work(ceph_msgr_wq, &con->work.work)) {
+		dout("queue_con %p - already queued\n", con);
+		con->ops->put(con);
+	} else {
+		dout("queue_con %p\n", con);
+	}
+}
+
+/*
+ * Do some work on a connection.  Drop a connection ref when we're done.
+ */
+static void con_work(struct work_struct *work)
+{
+	struct ceph_connection *con = container_of(work, struct ceph_connection,
+						   work.work);
+	int backoff = 0;
+
+more:
+	if (test_and_set_bit(BUSY, &con->state) != 0) {
+		dout("con_work %p BUSY already set\n", con);
+		goto out;
+	}
+	dout("con_work %p start, clearing QUEUED\n", con);
+	clear_bit(QUEUED, &con->state);
+
+	mutex_lock(&con->mutex);
+
+	if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */
+		dout("con_work CLOSED\n");
+		con_close_socket(con);
+		goto done;
+	}
+	if (test_and_clear_bit(OPENING, &con->state)) {
+		/* reopen w/ new peer */
+		dout("con_work OPENING\n");
+		con_close_socket(con);
+	}
+
+	if (test_and_clear_bit(SOCK_CLOSED, &con->state) ||
+	    try_read(con) < 0 ||
+	    try_write(con) < 0) {
+		mutex_unlock(&con->mutex);
+		backoff = 1;
+		ceph_fault(con);     /* error/fault path */
+		goto done_unlocked;
+	}
+
+done:
+	mutex_unlock(&con->mutex);
+
+done_unlocked:
+	clear_bit(BUSY, &con->state);
+	dout("con->state=%lu\n", con->state);
+	if (test_bit(QUEUED, &con->state)) {
+		if (!backoff || test_bit(OPENING, &con->state)) {
+			dout("con_work %p QUEUED reset, looping\n", con);
+			goto more;
+		}
+		dout("con_work %p QUEUED reset, but just faulted\n", con);
+		clear_bit(QUEUED, &con->state);
+	}
+	dout("con_work %p done\n", con);
+
+out:
+	con->ops->put(con);
+}
+
+
+/*
+ * Generic error/fault handler.  A retry mechanism is used with
+ * exponential backoff
+ */
+static void ceph_fault(struct ceph_connection *con)
+{
+	pr_err("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
+	       pr_addr(&con->peer_addr.in_addr), con->error_msg);
+	dout("fault %p state %lu to peer %s\n",
+	     con, con->state, pr_addr(&con->peer_addr.in_addr));
+
+	if (test_bit(LOSSYTX, &con->state)) {
+		dout("fault on LOSSYTX channel\n");
+		goto out;
+	}
+
+	mutex_lock(&con->mutex);
+	if (test_bit(CLOSED, &con->state))
+		goto out_unlock;
+
+	con_close_socket(con);
+
+	if (con->in_msg) {
+		ceph_msg_put(con->in_msg);
+		con->in_msg = NULL;
+	}
+
+	/* Requeue anything that hasn't been acked */
+	list_splice_init(&con->out_sent, &con->out_queue);
+
+	/* If there are no messages in the queue, place the connection
+	 * in a STANDBY state (i.e., don't try to reconnect just yet). */
+	if (list_empty(&con->out_queue) && !con->out_keepalive_pending) {
+		dout("fault setting STANDBY\n");
+		set_bit(STANDBY, &con->state);
+	} else {
+		/* retry after a delay. */
+		if (con->delay == 0)
+			con->delay = BASE_DELAY_INTERVAL;
+		else if (con->delay < MAX_DELAY_INTERVAL)
+			con->delay *= 2;
+		dout("fault queueing %p delay %lu\n", con, con->delay);
+		con->ops->get(con);
+		if (queue_delayed_work(ceph_msgr_wq, &con->work,
+				       round_jiffies_relative(con->delay)) == 0)
+			con->ops->put(con);
+	}
+
+out_unlock:
+	mutex_unlock(&con->mutex);
+out:
+	/*
+	 * in case we faulted due to authentication, invalidate our
+	 * current tickets so that we can get new ones.
+         */
+	if (con->auth_retry && con->ops->invalidate_authorizer) {
+		dout("calling invalidate_authorizer()\n");
+		con->ops->invalidate_authorizer(con);
+	}
+
+	if (con->ops->fault)
+		con->ops->fault(con);
+}
+
+
+
+/*
+ * create a new messenger instance
+ */
+struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr)
+{
+	struct ceph_messenger *msgr;
+
+	msgr = kzalloc(sizeof(*msgr), GFP_KERNEL);
+	if (msgr == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	spin_lock_init(&msgr->global_seq_lock);
+
+	/* the zero page is needed if a request is "canceled" while the message
+	 * is being written over the socket */
+	msgr->zero_page = __page_cache_alloc(GFP_KERNEL | __GFP_ZERO);
+	if (!msgr->zero_page) {
+		kfree(msgr);
+		return ERR_PTR(-ENOMEM);
+	}
+	kmap(msgr->zero_page);
+
+	if (myaddr)
+		msgr->inst.addr = *myaddr;
+
+	/* select a random nonce */
+	msgr->inst.addr.type = 0;
+	get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce));
+	encode_my_addr(msgr);
+
+	dout("messenger_create %p\n", msgr);
+	return msgr;
+}
+
+void ceph_messenger_destroy(struct ceph_messenger *msgr)
+{
+	dout("destroy %p\n", msgr);
+	kunmap(msgr->zero_page);
+	__free_page(msgr->zero_page);
+	kfree(msgr);
+	dout("destroyed messenger %p\n", msgr);
+}
+
+/*
+ * Queue up an outgoing message on the given connection.
+ */
+void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
+{
+	if (test_bit(CLOSED, &con->state)) {
+		dout("con_send %p closed, dropping %p\n", con, msg);
+		ceph_msg_put(msg);
+		return;
+	}
+
+	/* set src+dst */
+	msg->hdr.src = con->msgr->inst.name;
+
+	BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len));
+
+	msg->needs_out_seq = true;
+
+	/* queue */
+	mutex_lock(&con->mutex);
+	BUG_ON(!list_empty(&msg->list_head));
+	list_add_tail(&msg->list_head, &con->out_queue);
+	dout("----- %p to %s%lld %d=%s len %d+%d+%d -----\n", msg,
+	     ENTITY_NAME(con->peer_name), le16_to_cpu(msg->hdr.type),
+	     ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
+	     le32_to_cpu(msg->hdr.front_len),
+	     le32_to_cpu(msg->hdr.middle_len),
+	     le32_to_cpu(msg->hdr.data_len));
+	mutex_unlock(&con->mutex);
+
+	/* if there wasn't anything waiting to send before, queue
+	 * new work */
+	if (test_and_set_bit(WRITE_PENDING, &con->state) == 0)
+		queue_con(con);
+}
+
+/*
+ * Revoke a message that was previously queued for send
+ */
+void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg)
+{
+	mutex_lock(&con->mutex);
+	if (!list_empty(&msg->list_head)) {
+		dout("con_revoke %p msg %p - was on queue\n", con, msg);
+		list_del_init(&msg->list_head);
+		ceph_msg_put(msg);
+		msg->hdr.seq = 0;
+	}
+	if (con->out_msg == msg) {
+		dout("con_revoke %p msg %p - was sending\n", con, msg);
+		con->out_msg = NULL;
+		if (con->out_kvec_is_msg) {
+			con->out_skip = con->out_kvec_bytes;
+			con->out_kvec_is_msg = false;
+		}
+		ceph_msg_put(msg);
+		msg->hdr.seq = 0;
+	}
+	mutex_unlock(&con->mutex);
+}
+
+/*
+ * Revoke a message that we may be reading data into
+ */
+void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg)
+{
+	mutex_lock(&con->mutex);
+	if (con->in_msg && con->in_msg == msg) {
+		unsigned front_len = le32_to_cpu(con->in_hdr.front_len);
+		unsigned middle_len = le32_to_cpu(con->in_hdr.middle_len);
+		unsigned data_len = le32_to_cpu(con->in_hdr.data_len);
+
+		/* skip rest of message */
+		dout("con_revoke_pages %p msg %p revoked\n", con, msg);
+			con->in_base_pos = con->in_base_pos -
+				sizeof(struct ceph_msg_header) -
+				front_len -
+				middle_len -
+				data_len -
+				sizeof(struct ceph_msg_footer);
+		ceph_msg_put(con->in_msg);
+		con->in_msg = NULL;
+		con->in_tag = CEPH_MSGR_TAG_READY;
+		con->in_seq++;
+	} else {
+		dout("con_revoke_pages %p msg %p pages %p no-op\n",
+		     con, con->in_msg, msg);
+	}
+	mutex_unlock(&con->mutex);
+}
+
+/*
+ * Queue a keepalive byte to ensure the tcp connection is alive.
+ */
+void ceph_con_keepalive(struct ceph_connection *con)
+{
+	if (test_and_set_bit(KEEPALIVE_PENDING, &con->state) == 0 &&
+	    test_and_set_bit(WRITE_PENDING, &con->state) == 0)
+		queue_con(con);
+}
+
+
+/*
+ * construct a new message with given type, size
+ * the new msg has a ref count of 1.
+ */
+struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags)
+{
+	struct ceph_msg *m;
+
+	m = kmalloc(sizeof(*m), flags);
+	if (m == NULL)
+		goto out;
+	kref_init(&m->kref);
+	INIT_LIST_HEAD(&m->list_head);
+
+	m->hdr.tid = 0;
+	m->hdr.type = cpu_to_le16(type);
+	m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT);
+	m->hdr.version = 0;
+	m->hdr.front_len = cpu_to_le32(front_len);
+	m->hdr.middle_len = 0;
+	m->hdr.data_len = 0;
+	m->hdr.data_off = 0;
+	m->hdr.reserved = 0;
+	m->footer.front_crc = 0;
+	m->footer.middle_crc = 0;
+	m->footer.data_crc = 0;
+	m->footer.flags = 0;
+	m->front_max = front_len;
+	m->front_is_vmalloc = false;
+	m->more_to_follow = false;
+	m->pool = NULL;
+
+	/* front */
+	if (front_len) {
+		if (front_len > PAGE_CACHE_SIZE) {
+			m->front.iov_base = __vmalloc(front_len, flags,
+						      PAGE_KERNEL);
+			m->front_is_vmalloc = true;
+		} else {
+			m->front.iov_base = kmalloc(front_len, flags);
+		}
+		if (m->front.iov_base == NULL) {
+			pr_err("msg_new can't allocate %d bytes\n",
+			     front_len);
+			goto out2;
+		}
+	} else {
+		m->front.iov_base = NULL;
+	}
+	m->front.iov_len = front_len;
+
+	/* middle */
+	m->middle = NULL;
+
+	/* data */
+	m->nr_pages = 0;
+	m->pages = NULL;
+	m->pagelist = NULL;
+
+	dout("ceph_msg_new %p front %d\n", m, front_len);
+	return m;
+
+out2:
+	ceph_msg_put(m);
+out:
+	pr_err("msg_new can't create type %d front %d\n", type, front_len);
+	return NULL;
+}
+
+/*
+ * Allocate "middle" portion of a message, if it is needed and wasn't
+ * allocated by alloc_msg.  This allows us to read a small fixed-size
+ * per-type header in the front and then gracefully fail (i.e.,
+ * propagate the error to the caller based on info in the front) when
+ * the middle is too large.
+ */
+static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg)
+{
+	int type = le16_to_cpu(msg->hdr.type);
+	int middle_len = le32_to_cpu(msg->hdr.middle_len);
+
+	dout("alloc_middle %p type %d %s middle_len %d\n", msg, type,
+	     ceph_msg_type_name(type), middle_len);
+	BUG_ON(!middle_len);
+	BUG_ON(msg->middle);
+
+	msg->middle = ceph_buffer_new(middle_len, GFP_NOFS);
+	if (!msg->middle)
+		return -ENOMEM;
+	return 0;
+}
+
+/*
+ * Generic message allocator, for incoming messages.
+ */
+static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
+				struct ceph_msg_header *hdr,
+				int *skip)
+{
+	int type = le16_to_cpu(hdr->type);
+	int front_len = le32_to_cpu(hdr->front_len);
+	int middle_len = le32_to_cpu(hdr->middle_len);
+	struct ceph_msg *msg = NULL;
+	int ret;
+
+	if (con->ops->alloc_msg) {
+		mutex_unlock(&con->mutex);
+		msg = con->ops->alloc_msg(con, hdr, skip);
+		mutex_lock(&con->mutex);
+		if (!msg || *skip)
+			return NULL;
+	}
+	if (!msg) {
+		*skip = 0;
+		msg = ceph_msg_new(type, front_len, GFP_NOFS);
+		if (!msg) {
+			pr_err("unable to allocate msg type %d len %d\n",
+			       type, front_len);
+			return NULL;
+		}
+	}
+	memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr));
+
+	if (middle_len && !msg->middle) {
+		ret = ceph_alloc_middle(con, msg);
+		if (ret < 0) {
+			ceph_msg_put(msg);
+			return NULL;
+		}
+	}
+
+	return msg;
+}
+
+
+/*
+ * Free a generically kmalloc'd message.
+ */
+void ceph_msg_kfree(struct ceph_msg *m)
+{
+	dout("msg_kfree %p\n", m);
+	if (m->front_is_vmalloc)
+		vfree(m->front.iov_base);
+	else
+		kfree(m->front.iov_base);
+	kfree(m);
+}
+
+/*
+ * Drop a msg ref.  Destroy as needed.
+ */
+void ceph_msg_last_put(struct kref *kref)
+{
+	struct ceph_msg *m = container_of(kref, struct ceph_msg, kref);
+
+	dout("ceph_msg_put last one on %p\n", m);
+	WARN_ON(!list_empty(&m->list_head));
+
+	/* drop middle, data, if any */
+	if (m->middle) {
+		ceph_buffer_put(m->middle);
+		m->middle = NULL;
+	}
+	m->nr_pages = 0;
+	m->pages = NULL;
+
+	if (m->pagelist) {
+		ceph_pagelist_release(m->pagelist);
+		kfree(m->pagelist);
+		m->pagelist = NULL;
+	}
+
+	if (m->pool)
+		ceph_msgpool_put(m->pool, m);
+	else
+		ceph_msg_kfree(m);
+}
+
+void ceph_msg_dump(struct ceph_msg *msg)
+{
+	pr_debug("msg_dump %p (front_max %d nr_pages %d)\n", msg,
+		 msg->front_max, msg->nr_pages);
+	print_hex_dump(KERN_DEBUG, "header: ",
+		       DUMP_PREFIX_OFFSET, 16, 1,
+		       &msg->hdr, sizeof(msg->hdr), true);
+	print_hex_dump(KERN_DEBUG, " front: ",
+		       DUMP_PREFIX_OFFSET, 16, 1,
+		       msg->front.iov_base, msg->front.iov_len, true);
+	if (msg->middle)
+		print_hex_dump(KERN_DEBUG, "middle: ",
+			       DUMP_PREFIX_OFFSET, 16, 1,
+			       msg->middle->vec.iov_base,
+			       msg->middle->vec.iov_len, true);
+	print_hex_dump(KERN_DEBUG, "footer: ",
+		       DUMP_PREFIX_OFFSET, 16, 1,
+		       &msg->footer, sizeof(msg->footer), true);
+}
diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h
new file mode 100644
index 000000000000..76fbc957bc13
--- /dev/null
+++ b/fs/ceph/messenger.h
@@ -0,0 +1,253 @@
+#ifndef __FS_CEPH_MESSENGER_H
+#define __FS_CEPH_MESSENGER_H
+
+#include <linux/kref.h>
+#include <linux/mutex.h>
+#include <linux/net.h>
+#include <linux/radix-tree.h>
+#include <linux/uio.h>
+#include <linux/version.h>
+#include <linux/workqueue.h>
+
+#include "types.h"
+#include "buffer.h"
+
+struct ceph_msg;
+struct ceph_connection;
+
+extern struct workqueue_struct *ceph_msgr_wq;       /* receive work queue */
+
+/*
+ * Ceph defines these callbacks for handling connection events.
+ */
+struct ceph_connection_operations {
+	struct ceph_connection *(*get)(struct ceph_connection *);
+	void (*put)(struct ceph_connection *);
+
+	/* handle an incoming message. */
+	void (*dispatch) (struct ceph_connection *con, struct ceph_msg *m);
+
+	/* authorize an outgoing connection */
+	int (*get_authorizer) (struct ceph_connection *con,
+			       void **buf, int *len, int *proto,
+			       void **reply_buf, int *reply_len, int force_new);
+	int (*verify_authorizer_reply) (struct ceph_connection *con, int len);
+	int (*invalidate_authorizer)(struct ceph_connection *con);
+
+	/* protocol version mismatch */
+	void (*bad_proto) (struct ceph_connection *con);
+
+	/* there was some error on the socket (disconnect, whatever) */
+	void (*fault) (struct ceph_connection *con);
+
+	/* a remote host as terminated a message exchange session, and messages
+	 * we sent (or they tried to send us) may be lost. */
+	void (*peer_reset) (struct ceph_connection *con);
+
+	struct ceph_msg * (*alloc_msg) (struct ceph_connection *con,
+					struct ceph_msg_header *hdr,
+					int *skip);
+};
+
+/* use format string %s%d */
+#define ENTITY_NAME(n) ceph_entity_type_name((n).type), le64_to_cpu((n).num)
+
+struct ceph_messenger {
+	struct ceph_entity_inst inst;    /* my name+address */
+	struct ceph_entity_addr my_enc_addr;
+	struct page *zero_page;          /* used in certain error cases */
+
+	bool nocrc;
+
+	/*
+	 * the global_seq counts connections i (attempt to) initiate
+	 * in order to disambiguate certain connect race conditions.
+	 */
+	u32 global_seq;
+	spinlock_t global_seq_lock;
+};
+
+/*
+ * a single message.  it contains a header (src, dest, message type, etc.),
+ * footer (crc values, mainly), a "front" message body, and possibly a
+ * data payload (stored in some number of pages).
+ */
+struct ceph_msg {
+	struct ceph_msg_header hdr;	/* header */
+	struct ceph_msg_footer footer;	/* footer */
+	struct kvec front;              /* unaligned blobs of message */
+	struct ceph_buffer *middle;
+	struct page **pages;            /* data payload.  NOT OWNER. */
+	unsigned nr_pages;              /* size of page array */
+	struct ceph_pagelist *pagelist; /* instead of pages */
+	struct list_head list_head;
+	struct kref kref;
+	bool front_is_vmalloc;
+	bool more_to_follow;
+	bool needs_out_seq;
+	int front_max;
+
+	struct ceph_msgpool *pool;
+};
+
+struct ceph_msg_pos {
+	int page, page_pos;  /* which page; offset in page */
+	int data_pos;        /* offset in data payload */
+	int did_page_crc;    /* true if we've calculated crc for current page */
+};
+
+/* ceph connection fault delay defaults, for exponential backoff */
+#define BASE_DELAY_INTERVAL	(HZ/2)
+#define MAX_DELAY_INTERVAL	(5 * 60 * HZ)
+
+/*
+ * ceph_connection state bit flags
+ *
+ * QUEUED and BUSY are used together to ensure that only a single
+ * thread is currently opening, reading or writing data to the socket.
+ */
+#define LOSSYTX         0  /* we can close channel or drop messages on errors */
+#define CONNECTING	1
+#define NEGOTIATING	2
+#define KEEPALIVE_PENDING      3
+#define WRITE_PENDING	4  /* we have data ready to send */
+#define QUEUED          5  /* there is work queued on this connection */
+#define BUSY            6  /* work is being done */
+#define STANDBY		8  /* no outgoing messages, socket closed.  we keep
+			    * the ceph_connection around to maintain shared
+			    * state with the peer. */
+#define CLOSED		10 /* we've closed the connection */
+#define SOCK_CLOSED	11 /* socket state changed to closed */
+#define OPENING         13 /* open connection w/ (possibly new) peer */
+#define DEAD            14 /* dead, about to kfree */
+
+/*
+ * A single connection with another host.
+ *
+ * We maintain a queue of outgoing messages, and some session state to
+ * ensure that we can preserve the lossless, ordered delivery of
+ * messages in the case of a TCP disconnect.
+ */
+struct ceph_connection {
+	void *private;
+	atomic_t nref;
+
+	const struct ceph_connection_operations *ops;
+
+	struct ceph_messenger *msgr;
+	struct socket *sock;
+	unsigned long state;	/* connection state (see flags above) */
+	const char *error_msg;  /* error message, if any */
+
+	struct ceph_entity_addr peer_addr; /* peer address */
+	struct ceph_entity_name peer_name; /* peer name */
+	struct ceph_entity_addr peer_addr_for_me;
+	unsigned peer_features;
+	u32 connect_seq;      /* identify the most recent connection
+				 attempt for this connection, client */
+	u32 peer_global_seq;  /* peer's global seq for this connection */
+
+	int auth_retry;       /* true if we need a newer authorizer */
+	void *auth_reply_buf;   /* where to put the authorizer reply */
+	int auth_reply_buf_len;
+
+	struct mutex mutex;
+
+	/* out queue */
+	struct list_head out_queue;
+	struct list_head out_sent;   /* sending or sent but unacked */
+	u64 out_seq;		     /* last message queued for send */
+	bool out_keepalive_pending;
+
+	u64 in_seq, in_seq_acked;  /* last message received, acked */
+
+	/* connection negotiation temps */
+	char in_banner[CEPH_BANNER_MAX_LEN];
+	union {
+		struct {  /* outgoing connection */
+			struct ceph_msg_connect out_connect;
+			struct ceph_msg_connect_reply in_reply;
+		};
+		struct {  /* incoming */
+			struct ceph_msg_connect in_connect;
+			struct ceph_msg_connect_reply out_reply;
+		};
+	};
+	struct ceph_entity_addr actual_peer_addr;
+
+	/* message out temps */
+	struct ceph_msg *out_msg;        /* sending message (== tail of
+					    out_sent) */
+	bool out_msg_done;
+	struct ceph_msg_pos out_msg_pos;
+
+	struct kvec out_kvec[8],         /* sending header/footer data */
+		*out_kvec_cur;
+	int out_kvec_left;   /* kvec's left in out_kvec */
+	int out_skip;        /* skip this many bytes */
+	int out_kvec_bytes;  /* total bytes left */
+	bool out_kvec_is_msg; /* kvec refers to out_msg */
+	int out_more;        /* there is more data after the kvecs */
+	__le64 out_temp_ack; /* for writing an ack */
+
+	/* message in temps */
+	struct ceph_msg_header in_hdr;
+	struct ceph_msg *in_msg;
+	struct ceph_msg_pos in_msg_pos;
+	u32 in_front_crc, in_middle_crc, in_data_crc;  /* calculated crc */
+
+	char in_tag;         /* protocol control byte */
+	int in_base_pos;     /* bytes read */
+	__le64 in_temp_ack;  /* for reading an ack */
+
+	struct delayed_work work;	    /* send|recv work */
+	unsigned long       delay;          /* current delay interval */
+};
+
+
+extern const char *pr_addr(const struct sockaddr_storage *ss);
+extern int ceph_parse_ips(const char *c, const char *end,
+			  struct ceph_entity_addr *addr,
+			  int max_count, int *count);
+
+
+extern int ceph_msgr_init(void);
+extern void ceph_msgr_exit(void);
+extern void ceph_msgr_flush(void);
+
+extern struct ceph_messenger *ceph_messenger_create(
+	struct ceph_entity_addr *myaddr);
+extern void ceph_messenger_destroy(struct ceph_messenger *);
+
+extern void ceph_con_init(struct ceph_messenger *msgr,
+			  struct ceph_connection *con);
+extern void ceph_con_open(struct ceph_connection *con,
+			  struct ceph_entity_addr *addr);
+extern bool ceph_con_opened(struct ceph_connection *con);
+extern void ceph_con_close(struct ceph_connection *con);
+extern void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg);
+extern void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg);
+extern void ceph_con_revoke_message(struct ceph_connection *con,
+				  struct ceph_msg *msg);
+extern void ceph_con_keepalive(struct ceph_connection *con);
+extern struct ceph_connection *ceph_con_get(struct ceph_connection *con);
+extern void ceph_con_put(struct ceph_connection *con);
+
+extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags);
+extern void ceph_msg_kfree(struct ceph_msg *m);
+
+
+static inline struct ceph_msg *ceph_msg_get(struct ceph_msg *msg)
+{
+	kref_get(&msg->kref);
+	return msg;
+}
+extern void ceph_msg_last_put(struct kref *kref);
+static inline void ceph_msg_put(struct ceph_msg *msg)
+{
+	kref_put(&msg->kref, ceph_msg_last_put);
+}
+
+extern void ceph_msg_dump(struct ceph_msg *msg);
+
+#endif
diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
new file mode 100644
index 000000000000..54fe01c50706
--- /dev/null
+++ b/fs/ceph/mon_client.c
@@ -0,0 +1,882 @@
+#include "ceph_debug.h"
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/sched.h>
+
+#include "mon_client.h"
+#include "super.h"
+#include "auth.h"
+#include "decode.h"
+
+/*
+ * Interact with Ceph monitor cluster.  Handle requests for new map
+ * versions, and periodically resend as needed.  Also implement
+ * statfs() and umount().
+ *
+ * A small cluster of Ceph "monitors" are responsible for managing critical
+ * cluster configuration and state information.  An odd number (e.g., 3, 5)
+ * of cmon daemons use a modified version of the Paxos part-time parliament
+ * algorithm to manage the MDS map (mds cluster membership), OSD map, and
+ * list of clients who have mounted the file system.
+ *
+ * We maintain an open, active session with a monitor at all times in order to
+ * receive timely MDSMap updates.  We periodically send a keepalive byte on the
+ * TCP socket to ensure we detect a failure.  If the connection does break, we
+ * randomly hunt for a new monitor.  Once the connection is reestablished, we
+ * resend any outstanding requests.
+ */
+
+static const struct ceph_connection_operations mon_con_ops;
+
+static int __validate_auth(struct ceph_mon_client *monc);
+
+/*
+ * Decode a monmap blob (e.g., during mount).
+ */
+struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
+{
+	struct ceph_monmap *m = NULL;
+	int i, err = -EINVAL;
+	struct ceph_fsid fsid;
+	u32 epoch, num_mon;
+	u16 version;
+	u32 len;
+
+	ceph_decode_32_safe(&p, end, len, bad);
+	ceph_decode_need(&p, end, len, bad);
+
+	dout("monmap_decode %p %p len %d\n", p, end, (int)(end-p));
+
+	ceph_decode_16_safe(&p, end, version, bad);
+
+	ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad);
+	ceph_decode_copy(&p, &fsid, sizeof(fsid));
+	epoch = ceph_decode_32(&p);
+
+	num_mon = ceph_decode_32(&p);
+	ceph_decode_need(&p, end, num_mon*sizeof(m->mon_inst[0]), bad);
+
+	if (num_mon >= CEPH_MAX_MON)
+		goto bad;
+	m = kmalloc(sizeof(*m) + sizeof(m->mon_inst[0])*num_mon, GFP_NOFS);
+	if (m == NULL)
+		return ERR_PTR(-ENOMEM);
+	m->fsid = fsid;
+	m->epoch = epoch;
+	m->num_mon = num_mon;
+	ceph_decode_copy(&p, m->mon_inst, num_mon*sizeof(m->mon_inst[0]));
+	for (i = 0; i < num_mon; i++)
+		ceph_decode_addr(&m->mon_inst[i].addr);
+
+	dout("monmap_decode epoch %d, num_mon %d\n", m->epoch,
+	     m->num_mon);
+	for (i = 0; i < m->num_mon; i++)
+		dout("monmap_decode  mon%d is %s\n", i,
+		     pr_addr(&m->mon_inst[i].addr.in_addr));
+	return m;
+
+bad:
+	dout("monmap_decode failed with %d\n", err);
+	kfree(m);
+	return ERR_PTR(err);
+}
+
+/*
+ * return true if *addr is included in the monmap.
+ */
+int ceph_monmap_contains(struct ceph_monmap *m, struct ceph_entity_addr *addr)
+{
+	int i;
+
+	for (i = 0; i < m->num_mon; i++)
+		if (memcmp(addr, &m->mon_inst[i].addr, sizeof(*addr)) == 0)
+			return 1;
+	return 0;
+}
+
+/*
+ * Send an auth request.
+ */
+static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
+{
+	monc->pending_auth = 1;
+	monc->m_auth->front.iov_len = len;
+	monc->m_auth->hdr.front_len = cpu_to_le32(len);
+	ceph_con_revoke(monc->con, monc->m_auth);
+	ceph_msg_get(monc->m_auth);  /* keep our ref */
+	ceph_con_send(monc->con, monc->m_auth);
+}
+
+/*
+ * Close monitor session, if any.
+ */
+static void __close_session(struct ceph_mon_client *monc)
+{
+	if (monc->con) {
+		dout("__close_session closing mon%d\n", monc->cur_mon);
+		ceph_con_revoke(monc->con, monc->m_auth);
+		ceph_con_close(monc->con);
+		monc->cur_mon = -1;
+		monc->pending_auth = 0;
+		ceph_auth_reset(monc->auth);
+	}
+}
+
+/*
+ * Open a session with a (new) monitor.
+ */
+static int __open_session(struct ceph_mon_client *monc)
+{
+	char r;
+	int ret;
+
+	if (monc->cur_mon < 0) {
+		get_random_bytes(&r, 1);
+		monc->cur_mon = r % monc->monmap->num_mon;
+		dout("open_session num=%d r=%d -> mon%d\n",
+		     monc->monmap->num_mon, r, monc->cur_mon);
+		monc->sub_sent = 0;
+		monc->sub_renew_after = jiffies;  /* i.e., expired */
+		monc->want_next_osdmap = !!monc->want_next_osdmap;
+
+		dout("open_session mon%d opening\n", monc->cur_mon);
+		monc->con->peer_name.type = CEPH_ENTITY_TYPE_MON;
+		monc->con->peer_name.num = cpu_to_le64(monc->cur_mon);
+		ceph_con_open(monc->con,
+			      &monc->monmap->mon_inst[monc->cur_mon].addr);
+
+		/* initiatiate authentication handshake */
+		ret = ceph_auth_build_hello(monc->auth,
+					    monc->m_auth->front.iov_base,
+					    monc->m_auth->front_max);
+		__send_prepared_auth_request(monc, ret);
+	} else {
+		dout("open_session mon%d already open\n", monc->cur_mon);
+	}
+	return 0;
+}
+
+static bool __sub_expired(struct ceph_mon_client *monc)
+{
+	return time_after_eq(jiffies, monc->sub_renew_after);
+}
+
+/*
+ * Reschedule delayed work timer.
+ */
+static void __schedule_delayed(struct ceph_mon_client *monc)
+{
+	unsigned delay;
+
+	if (monc->cur_mon < 0 || __sub_expired(monc))
+		delay = 10 * HZ;
+	else
+		delay = 20 * HZ;
+	dout("__schedule_delayed after %u\n", delay);
+	schedule_delayed_work(&monc->delayed_work, delay);
+}
+
+/*
+ * Send subscribe request for mdsmap and/or osdmap.
+ */
+static void __send_subscribe(struct ceph_mon_client *monc)
+{
+	dout("__send_subscribe sub_sent=%u exp=%u want_osd=%d\n",
+	     (unsigned)monc->sub_sent, __sub_expired(monc),
+	     monc->want_next_osdmap);
+	if ((__sub_expired(monc) && !monc->sub_sent) ||
+	    monc->want_next_osdmap == 1) {
+		struct ceph_msg *msg = monc->m_subscribe;
+		struct ceph_mon_subscribe_item *i;
+		void *p, *end;
+
+		p = msg->front.iov_base;
+		end = p + msg->front_max;
+
+		dout("__send_subscribe to 'mdsmap' %u+\n",
+		     (unsigned)monc->have_mdsmap);
+		if (monc->want_next_osdmap) {
+			dout("__send_subscribe to 'osdmap' %u\n",
+			     (unsigned)monc->have_osdmap);
+			ceph_encode_32(&p, 3);
+			ceph_encode_string(&p, end, "osdmap", 6);
+			i = p;
+			i->have = cpu_to_le64(monc->have_osdmap);
+			i->onetime = 1;
+			p += sizeof(*i);
+			monc->want_next_osdmap = 2;  /* requested */
+		} else {
+			ceph_encode_32(&p, 2);
+		}
+		ceph_encode_string(&p, end, "mdsmap", 6);
+		i = p;
+		i->have = cpu_to_le64(monc->have_mdsmap);
+		i->onetime = 0;
+		p += sizeof(*i);
+		ceph_encode_string(&p, end, "monmap", 6);
+		i = p;
+		i->have = 0;
+		i->onetime = 0;
+		p += sizeof(*i);
+
+		msg->front.iov_len = p - msg->front.iov_base;
+		msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
+		ceph_con_revoke(monc->con, msg);
+		ceph_con_send(monc->con, ceph_msg_get(msg));
+
+		monc->sub_sent = jiffies | 1;  /* never 0 */
+	}
+}
+
+static void handle_subscribe_ack(struct ceph_mon_client *monc,
+				 struct ceph_msg *msg)
+{
+	unsigned seconds;
+	struct ceph_mon_subscribe_ack *h = msg->front.iov_base;
+
+	if (msg->front.iov_len < sizeof(*h))
+		goto bad;
+	seconds = le32_to_cpu(h->duration);
+
+	mutex_lock(&monc->mutex);
+	if (monc->hunting) {
+		pr_info("mon%d %s session established\n",
+			monc->cur_mon, pr_addr(&monc->con->peer_addr.in_addr));
+		monc->hunting = false;
+	}
+	dout("handle_subscribe_ack after %d seconds\n", seconds);
+	monc->sub_renew_after = monc->sub_sent + (seconds >> 1)*HZ - 1;
+	monc->sub_sent = 0;
+	mutex_unlock(&monc->mutex);
+	return;
+bad:
+	pr_err("got corrupt subscribe-ack msg\n");
+	ceph_msg_dump(msg);
+}
+
+/*
+ * Keep track of which maps we have
+ */
+int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 got)
+{
+	mutex_lock(&monc->mutex);
+	monc->have_mdsmap = got;
+	mutex_unlock(&monc->mutex);
+	return 0;
+}
+
+int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 got)
+{
+	mutex_lock(&monc->mutex);
+	monc->have_osdmap = got;
+	monc->want_next_osdmap = 0;
+	mutex_unlock(&monc->mutex);
+	return 0;
+}
+
+/*
+ * Register interest in the next osdmap
+ */
+void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc)
+{
+	dout("request_next_osdmap have %u\n", monc->have_osdmap);
+	mutex_lock(&monc->mutex);
+	if (!monc->want_next_osdmap)
+		monc->want_next_osdmap = 1;
+	if (monc->want_next_osdmap < 2)
+		__send_subscribe(monc);
+	mutex_unlock(&monc->mutex);
+}
+
+/*
+ *
+ */
+int ceph_monc_open_session(struct ceph_mon_client *monc)
+{
+	if (!monc->con) {
+		monc->con = kmalloc(sizeof(*monc->con), GFP_KERNEL);
+		if (!monc->con)
+			return -ENOMEM;
+		ceph_con_init(monc->client->msgr, monc->con);
+		monc->con->private = monc;
+		monc->con->ops = &mon_con_ops;
+	}
+
+	mutex_lock(&monc->mutex);
+	__open_session(monc);
+	__schedule_delayed(monc);
+	mutex_unlock(&monc->mutex);
+	return 0;
+}
+
+/*
+ * The monitor responds with mount ack indicate mount success.  The
+ * included client ticket allows the client to talk to MDSs and OSDs.
+ */
+static void ceph_monc_handle_map(struct ceph_mon_client *monc,
+				 struct ceph_msg *msg)
+{
+	struct ceph_client *client = monc->client;
+	struct ceph_monmap *monmap = NULL, *old = monc->monmap;
+	void *p, *end;
+
+	mutex_lock(&monc->mutex);
+
+	dout("handle_monmap\n");
+	p = msg->front.iov_base;
+	end = p + msg->front.iov_len;
+
+	monmap = ceph_monmap_decode(p, end);
+	if (IS_ERR(monmap)) {
+		pr_err("problem decoding monmap, %d\n",
+		       (int)PTR_ERR(monmap));
+		goto out;
+	}
+
+	if (ceph_check_fsid(monc->client, &monmap->fsid) < 0) {
+		kfree(monmap);
+		goto out;
+	}
+
+	client->monc.monmap = monmap;
+	kfree(old);
+
+out:
+	mutex_unlock(&monc->mutex);
+	wake_up_all(&client->auth_wq);
+}
+
+/*
+ * statfs
+ */
+static struct ceph_mon_generic_request *__lookup_generic_req(
+	struct ceph_mon_client *monc, u64 tid)
+{
+	struct ceph_mon_generic_request *req;
+	struct rb_node *n = monc->generic_request_tree.rb_node;
+
+	while (n) {
+		req = rb_entry(n, struct ceph_mon_generic_request, node);
+		if (tid < req->tid)
+			n = n->rb_left;
+		else if (tid > req->tid)
+			n = n->rb_right;
+		else
+			return req;
+	}
+	return NULL;
+}
+
+static void __insert_generic_request(struct ceph_mon_client *monc,
+			    struct ceph_mon_generic_request *new)
+{
+	struct rb_node **p = &monc->generic_request_tree.rb_node;
+	struct rb_node *parent = NULL;
+	struct ceph_mon_generic_request *req = NULL;
+
+	while (*p) {
+		parent = *p;
+		req = rb_entry(parent, struct ceph_mon_generic_request, node);
+		if (new->tid < req->tid)
+			p = &(*p)->rb_left;
+		else if (new->tid > req->tid)
+			p = &(*p)->rb_right;
+		else
+			BUG();
+	}
+
+	rb_link_node(&new->node, parent, p);
+	rb_insert_color(&new->node, &monc->generic_request_tree);
+}
+
+static void release_generic_request(struct kref *kref)
+{
+	struct ceph_mon_generic_request *req =
+		container_of(kref, struct ceph_mon_generic_request, kref);
+
+	if (req->reply)
+		ceph_msg_put(req->reply);
+	if (req->request)
+		ceph_msg_put(req->request);
+
+	kfree(req);
+}
+
+static void put_generic_request(struct ceph_mon_generic_request *req)
+{
+	kref_put(&req->kref, release_generic_request);
+}
+
+static void get_generic_request(struct ceph_mon_generic_request *req)
+{
+	kref_get(&req->kref);
+}
+
+static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
+					 struct ceph_msg_header *hdr,
+					 int *skip)
+{
+	struct ceph_mon_client *monc = con->private;
+	struct ceph_mon_generic_request *req;
+	u64 tid = le64_to_cpu(hdr->tid);
+	struct ceph_msg *m;
+
+	mutex_lock(&monc->mutex);
+	req = __lookup_generic_req(monc, tid);
+	if (!req) {
+		dout("get_generic_reply %lld dne\n", tid);
+		*skip = 1;
+		m = NULL;
+	} else {
+		dout("get_generic_reply %lld got %p\n", tid, req->reply);
+		m = ceph_msg_get(req->reply);
+		/*
+		 * we don't need to track the connection reading into
+		 * this reply because we only have one open connection
+		 * at a time, ever.
+		 */
+	}
+	mutex_unlock(&monc->mutex);
+	return m;
+}
+
+static void handle_statfs_reply(struct ceph_mon_client *monc,
+				struct ceph_msg *msg)
+{
+	struct ceph_mon_generic_request *req;
+	struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
+	u64 tid = le64_to_cpu(msg->hdr.tid);
+
+	if (msg->front.iov_len != sizeof(*reply))
+		goto bad;
+	dout("handle_statfs_reply %p tid %llu\n", msg, tid);
+
+	mutex_lock(&monc->mutex);
+	req = __lookup_generic_req(monc, tid);
+	if (req) {
+		*(struct ceph_statfs *)req->buf = reply->st;
+		req->result = 0;
+		get_generic_request(req);
+	}
+	mutex_unlock(&monc->mutex);
+	if (req) {
+		complete_all(&req->completion);
+		put_generic_request(req);
+	}
+	return;
+
+bad:
+	pr_err("corrupt generic reply, no tid\n");
+	ceph_msg_dump(msg);
+}
+
+/*
+ * Do a synchronous statfs().
+ */
+int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
+{
+	struct ceph_mon_generic_request *req;
+	struct ceph_mon_statfs *h;
+	int err;
+
+	req = kzalloc(sizeof(*req), GFP_NOFS);
+	if (!req)
+		return -ENOMEM;
+
+	kref_init(&req->kref);
+	req->buf = buf;
+	init_completion(&req->completion);
+
+	err = -ENOMEM;
+	req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS);
+	if (!req->request)
+		goto out;
+	req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS);
+	if (!req->reply)
+		goto out;
+
+	/* fill out request */
+	h = req->request->front.iov_base;
+	h->monhdr.have_version = 0;
+	h->monhdr.session_mon = cpu_to_le16(-1);
+	h->monhdr.session_mon_tid = 0;
+	h->fsid = monc->monmap->fsid;
+
+	/* register request */
+	mutex_lock(&monc->mutex);
+	req->tid = ++monc->last_tid;
+	req->request->hdr.tid = cpu_to_le64(req->tid);
+	__insert_generic_request(monc, req);
+	monc->num_generic_requests++;
+	mutex_unlock(&monc->mutex);
+
+	/* send request and wait */
+	ceph_con_send(monc->con, ceph_msg_get(req->request));
+	err = wait_for_completion_interruptible(&req->completion);
+
+	mutex_lock(&monc->mutex);
+	rb_erase(&req->node, &monc->generic_request_tree);
+	monc->num_generic_requests--;
+	mutex_unlock(&monc->mutex);
+
+	if (!err)
+		err = req->result;
+
+out:
+	kref_put(&req->kref, release_generic_request);
+	return err;
+}
+
+/*
+ * Resend pending statfs requests.
+ */
+static void __resend_generic_request(struct ceph_mon_client *monc)
+{
+	struct ceph_mon_generic_request *req;
+	struct rb_node *p;
+
+	for (p = rb_first(&monc->generic_request_tree); p; p = rb_next(p)) {
+		req = rb_entry(p, struct ceph_mon_generic_request, node);
+		ceph_con_revoke(monc->con, req->request);
+		ceph_con_send(monc->con, ceph_msg_get(req->request));
+	}
+}
+
+/*
+ * Delayed work.  If we haven't mounted yet, retry.  Otherwise,
+ * renew/retry subscription as needed (in case it is timing out, or we
+ * got an ENOMEM).  And keep the monitor connection alive.
+ */
+static void delayed_work(struct work_struct *work)
+{
+	struct ceph_mon_client *monc =
+		container_of(work, struct ceph_mon_client, delayed_work.work);
+
+	dout("monc delayed_work\n");
+	mutex_lock(&monc->mutex);
+	if (monc->hunting) {
+		__close_session(monc);
+		__open_session(monc);  /* continue hunting */
+	} else {
+		ceph_con_keepalive(monc->con);
+
+		__validate_auth(monc);
+
+		if (monc->auth->ops->is_authenticated(monc->auth))
+			__send_subscribe(monc);
+	}
+	__schedule_delayed(monc);
+	mutex_unlock(&monc->mutex);
+}
+
+/*
+ * On startup, we build a temporary monmap populated with the IPs
+ * provided by mount(2).
+ */
+static int build_initial_monmap(struct ceph_mon_client *monc)
+{
+	struct ceph_mount_args *args = monc->client->mount_args;
+	struct ceph_entity_addr *mon_addr = args->mon_addr;
+	int num_mon = args->num_mon;
+	int i;
+
+	/* build initial monmap */
+	monc->monmap = kzalloc(sizeof(*monc->monmap) +
+			       num_mon*sizeof(monc->monmap->mon_inst[0]),
+			       GFP_KERNEL);
+	if (!monc->monmap)
+		return -ENOMEM;
+	for (i = 0; i < num_mon; i++) {
+		monc->monmap->mon_inst[i].addr = mon_addr[i];
+		monc->monmap->mon_inst[i].addr.nonce = 0;
+		monc->monmap->mon_inst[i].name.type =
+			CEPH_ENTITY_TYPE_MON;
+		monc->monmap->mon_inst[i].name.num = cpu_to_le64(i);
+	}
+	monc->monmap->num_mon = num_mon;
+	monc->have_fsid = false;
+
+	/* release addr memory */
+	kfree(args->mon_addr);
+	args->mon_addr = NULL;
+	args->num_mon = 0;
+	return 0;
+}
+
+int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
+{
+	int err = 0;
+
+	dout("init\n");
+	memset(monc, 0, sizeof(*monc));
+	monc->client = cl;
+	monc->monmap = NULL;
+	mutex_init(&monc->mutex);
+
+	err = build_initial_monmap(monc);
+	if (err)
+		goto out;
+
+	monc->con = NULL;
+
+	/* authentication */
+	monc->auth = ceph_auth_init(cl->mount_args->name,
+				    cl->mount_args->secret);
+	if (IS_ERR(monc->auth))
+		return PTR_ERR(monc->auth);
+	monc->auth->want_keys =
+		CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON |
+		CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS;
+
+	/* msgs */
+	err = -ENOMEM;
+	monc->m_subscribe_ack = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE_ACK,
+				     sizeof(struct ceph_mon_subscribe_ack),
+				     GFP_NOFS);
+	if (!monc->m_subscribe_ack)
+		goto out_monmap;
+
+	monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS);
+	if (!monc->m_subscribe)
+		goto out_subscribe_ack;
+
+	monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096, GFP_NOFS);
+	if (!monc->m_auth_reply)
+		goto out_subscribe;
+
+	monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, GFP_NOFS);
+	monc->pending_auth = 0;
+	if (!monc->m_auth)
+		goto out_auth_reply;
+
+	monc->cur_mon = -1;
+	monc->hunting = true;
+	monc->sub_renew_after = jiffies;
+	monc->sub_sent = 0;
+
+	INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
+	monc->generic_request_tree = RB_ROOT;
+	monc->num_generic_requests = 0;
+	monc->last_tid = 0;
+
+	monc->have_mdsmap = 0;
+	monc->have_osdmap = 0;
+	monc->want_next_osdmap = 1;
+	return 0;
+
+out_auth_reply:
+	ceph_msg_put(monc->m_auth_reply);
+out_subscribe:
+	ceph_msg_put(monc->m_subscribe);
+out_subscribe_ack:
+	ceph_msg_put(monc->m_subscribe_ack);
+out_monmap:
+	kfree(monc->monmap);
+out:
+	return err;
+}
+
+void ceph_monc_stop(struct ceph_mon_client *monc)
+{
+	dout("stop\n");
+	cancel_delayed_work_sync(&monc->delayed_work);
+
+	mutex_lock(&monc->mutex);
+	__close_session(monc);
+	if (monc->con) {
+		monc->con->private = NULL;
+		monc->con->ops->put(monc->con);
+		monc->con = NULL;
+	}
+	mutex_unlock(&monc->mutex);
+
+	ceph_auth_destroy(monc->auth);
+
+	ceph_msg_put(monc->m_auth);
+	ceph_msg_put(monc->m_auth_reply);
+	ceph_msg_put(monc->m_subscribe);
+	ceph_msg_put(monc->m_subscribe_ack);
+
+	kfree(monc->monmap);
+}
+
+static void handle_auth_reply(struct ceph_mon_client *monc,
+			      struct ceph_msg *msg)
+{
+	int ret;
+	int was_auth = 0;
+
+	mutex_lock(&monc->mutex);
+	if (monc->auth->ops)
+		was_auth = monc->auth->ops->is_authenticated(monc->auth);
+	monc->pending_auth = 0;
+	ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
+				     msg->front.iov_len,
+				     monc->m_auth->front.iov_base,
+				     monc->m_auth->front_max);
+	if (ret < 0) {
+		monc->client->auth_err = ret;
+		wake_up_all(&monc->client->auth_wq);
+	} else if (ret > 0) {
+		__send_prepared_auth_request(monc, ret);
+	} else if (!was_auth && monc->auth->ops->is_authenticated(monc->auth)) {
+		dout("authenticated, starting session\n");
+
+		monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
+		monc->client->msgr->inst.name.num =
+					cpu_to_le64(monc->auth->global_id);
+
+		__send_subscribe(monc);
+		__resend_generic_request(monc);
+	}
+	mutex_unlock(&monc->mutex);
+}
+
+static int __validate_auth(struct ceph_mon_client *monc)
+{
+	int ret;
+
+	if (monc->pending_auth)
+		return 0;
+
+	ret = ceph_build_auth(monc->auth, monc->m_auth->front.iov_base,
+			      monc->m_auth->front_max);
+	if (ret <= 0)
+		return ret; /* either an error, or no need to authenticate */
+	__send_prepared_auth_request(monc, ret);
+	return 0;
+}
+
+int ceph_monc_validate_auth(struct ceph_mon_client *monc)
+{
+	int ret;
+
+	mutex_lock(&monc->mutex);
+	ret = __validate_auth(monc);
+	mutex_unlock(&monc->mutex);
+	return ret;
+}
+
+/*
+ * handle incoming message
+ */
+static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
+{
+	struct ceph_mon_client *monc = con->private;
+	int type = le16_to_cpu(msg->hdr.type);
+
+	if (!monc)
+		return;
+
+	switch (type) {
+	case CEPH_MSG_AUTH_REPLY:
+		handle_auth_reply(monc, msg);
+		break;
+
+	case CEPH_MSG_MON_SUBSCRIBE_ACK:
+		handle_subscribe_ack(monc, msg);
+		break;
+
+	case CEPH_MSG_STATFS_REPLY:
+		handle_statfs_reply(monc, msg);
+		break;
+
+	case CEPH_MSG_MON_MAP:
+		ceph_monc_handle_map(monc, msg);
+		break;
+
+	case CEPH_MSG_MDS_MAP:
+		ceph_mdsc_handle_map(&monc->client->mdsc, msg);
+		break;
+
+	case CEPH_MSG_OSD_MAP:
+		ceph_osdc_handle_map(&monc->client->osdc, msg);
+		break;
+
+	default:
+		pr_err("received unknown message type %d %s\n", type,
+		       ceph_msg_type_name(type));
+	}
+	ceph_msg_put(msg);
+}
+
+/*
+ * Allocate memory for incoming message
+ */
+static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
+				      struct ceph_msg_header *hdr,
+				      int *skip)
+{
+	struct ceph_mon_client *monc = con->private;
+	int type = le16_to_cpu(hdr->type);
+	int front_len = le32_to_cpu(hdr->front_len);
+	struct ceph_msg *m = NULL;
+
+	*skip = 0;
+
+	switch (type) {
+	case CEPH_MSG_MON_SUBSCRIBE_ACK:
+		m = ceph_msg_get(monc->m_subscribe_ack);
+		break;
+	case CEPH_MSG_STATFS_REPLY:
+		return get_generic_reply(con, hdr, skip);
+	case CEPH_MSG_AUTH_REPLY:
+		m = ceph_msg_get(monc->m_auth_reply);
+		break;
+	case CEPH_MSG_MON_MAP:
+	case CEPH_MSG_MDS_MAP:
+	case CEPH_MSG_OSD_MAP:
+		m = ceph_msg_new(type, front_len, GFP_NOFS);
+		break;
+	}
+
+	if (!m) {
+		pr_info("alloc_msg unknown type %d\n", type);
+		*skip = 1;
+	}
+	return m;
+}
+
+/*
+ * If the monitor connection resets, pick a new monitor and resubmit
+ * any pending requests.
+ */
+static void mon_fault(struct ceph_connection *con)
+{
+	struct ceph_mon_client *monc = con->private;
+
+	if (!monc)
+		return;
+
+	dout("mon_fault\n");
+	mutex_lock(&monc->mutex);
+	if (!con->private)
+		goto out;
+
+	if (monc->con && !monc->hunting)
+		pr_info("mon%d %s session lost, "
+			"hunting for new mon\n", monc->cur_mon,
+			pr_addr(&monc->con->peer_addr.in_addr));
+
+	__close_session(monc);
+	if (!monc->hunting) {
+		/* start hunting */
+		monc->hunting = true;
+		__open_session(monc);
+	} else {
+		/* already hunting, let's wait a bit */
+		__schedule_delayed(monc);
+	}
+out:
+	mutex_unlock(&monc->mutex);
+}
+
+static const struct ceph_connection_operations mon_con_ops = {
+	.get = ceph_con_get,
+	.put = ceph_con_put,
+	.dispatch = dispatch,
+	.fault = mon_fault,
+	.alloc_msg = mon_alloc_msg,
+};
diff --git a/fs/ceph/mon_client.h b/fs/ceph/mon_client.h
new file mode 100644
index 000000000000..174d794321d0
--- /dev/null
+++ b/fs/ceph/mon_client.h
@@ -0,0 +1,116 @@
+#ifndef _FS_CEPH_MON_CLIENT_H
+#define _FS_CEPH_MON_CLIENT_H
+
+#include <linux/completion.h>
+#include <linux/kref.h>
+#include <linux/rbtree.h>
+
+#include "messenger.h"
+
+struct ceph_client;
+struct ceph_mount_args;
+struct ceph_auth_client;
+
+/*
+ * The monitor map enumerates the set of all monitors.
+ */
+struct ceph_monmap {
+	struct ceph_fsid fsid;
+	u32 epoch;
+	u32 num_mon;
+	struct ceph_entity_inst mon_inst[0];
+};
+
+struct ceph_mon_client;
+struct ceph_mon_generic_request;
+
+
+/*
+ * Generic mechanism for resending monitor requests.
+ */
+typedef void (*ceph_monc_request_func_t)(struct ceph_mon_client *monc,
+					 int newmon);
+
+/* a pending monitor request */
+struct ceph_mon_request {
+	struct ceph_mon_client *monc;
+	struct delayed_work delayed_work;
+	unsigned long delay;
+	ceph_monc_request_func_t do_request;
+};
+
+/*
+ * ceph_mon_generic_request is being used for the statfs and poolop requests
+ * which are bening done a bit differently because we need to get data back
+ * to the caller
+ */
+struct ceph_mon_generic_request {
+	struct kref kref;
+	u64 tid;
+	struct rb_node node;
+	int result;
+	void *buf;
+	struct completion completion;
+	struct ceph_msg *request;  /* original request */
+	struct ceph_msg *reply;    /* and reply */
+};
+
+struct ceph_mon_client {
+	struct ceph_client *client;
+	struct ceph_monmap *monmap;
+
+	struct mutex mutex;
+	struct delayed_work delayed_work;
+
+	struct ceph_auth_client *auth;
+	struct ceph_msg *m_auth, *m_auth_reply, *m_subscribe, *m_subscribe_ack;
+	int pending_auth;
+
+	bool hunting;
+	int cur_mon;                       /* last monitor i contacted */
+	unsigned long sub_sent, sub_renew_after;
+	struct ceph_connection *con;
+	bool have_fsid;
+
+	/* pending generic requests */
+	struct rb_root generic_request_tree;
+	int num_generic_requests;
+	u64 last_tid;
+
+	/* mds/osd map */
+	int want_next_osdmap; /* 1 = want, 2 = want+asked */
+	u32 have_osdmap, have_mdsmap;
+
+#ifdef CONFIG_DEBUG_FS
+	struct dentry *debugfs_file;
+#endif
+};
+
+extern struct ceph_monmap *ceph_monmap_decode(void *p, void *end);
+extern int ceph_monmap_contains(struct ceph_monmap *m,
+				struct ceph_entity_addr *addr);
+
+extern int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl);
+extern void ceph_monc_stop(struct ceph_mon_client *monc);
+
+/*
+ * The model here is to indicate that we need a new map of at least
+ * epoch @want, and also call in when we receive a map.  We will
+ * periodically rerequest the map from the monitor cluster until we
+ * get what we want.
+ */
+extern int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 have);
+extern int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 have);
+
+extern void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc);
+
+extern int ceph_monc_do_statfs(struct ceph_mon_client *monc,
+			       struct ceph_statfs *buf);
+
+extern int ceph_monc_open_session(struct ceph_mon_client *monc);
+
+extern int ceph_monc_validate_auth(struct ceph_mon_client *monc);
+
+
+
+#endif
diff --git a/fs/ceph/msgpool.c b/fs/ceph/msgpool.c
new file mode 100644
index 000000000000..dd65a6438131
--- /dev/null
+++ b/fs/ceph/msgpool.c
@@ -0,0 +1,64 @@
+#include "ceph_debug.h"
+
+#include <linux/err.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+#include <linux/vmalloc.h>
+
+#include "msgpool.h"
+
+static void *alloc_fn(gfp_t gfp_mask, void *arg)
+{
+	struct ceph_msgpool *pool = arg;
+	void *p;
+
+	p = ceph_msg_new(0, pool->front_len, gfp_mask);
+	if (!p)
+		pr_err("msgpool %s alloc failed\n", pool->name);
+	return p;
+}
+
+static void free_fn(void *element, void *arg)
+{
+	ceph_msg_put(element);
+}
+
+int ceph_msgpool_init(struct ceph_msgpool *pool,
+		      int front_len, int size, bool blocking, const char *name)
+{
+	pool->front_len = front_len;
+	pool->pool = mempool_create(size, alloc_fn, free_fn, pool);
+	if (!pool->pool)
+		return -ENOMEM;
+	pool->name = name;
+	return 0;
+}
+
+void ceph_msgpool_destroy(struct ceph_msgpool *pool)
+{
+	mempool_destroy(pool->pool);
+}
+
+struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool,
+				  int front_len)
+{
+	if (front_len > pool->front_len) {
+		pr_err("msgpool_get pool %s need front %d, pool size is %d\n",
+		       pool->name, front_len, pool->front_len);
+		WARN_ON(1);
+
+		/* try to alloc a fresh message */
+		return ceph_msg_new(0, front_len, GFP_NOFS);
+	}
+
+	return mempool_alloc(pool->pool, GFP_NOFS);
+}
+
+void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg)
+{
+	/* reset msg front_len; user may have changed it */
+	msg->front.iov_len = pool->front_len;
+	msg->hdr.front_len = cpu_to_le32(pool->front_len);
+
+	kref_init(&msg->kref);  /* retake single ref */
+}
diff --git a/fs/ceph/msgpool.h b/fs/ceph/msgpool.h
new file mode 100644
index 000000000000..a362605f9368
--- /dev/null
+++ b/fs/ceph/msgpool.h
@@ -0,0 +1,25 @@
+#ifndef _FS_CEPH_MSGPOOL
+#define _FS_CEPH_MSGPOOL
+
+#include <linux/mempool.h>
+#include "messenger.h"
+
+/*
+ * we use memory pools for preallocating messages we may receive, to
+ * avoid unexpected OOM conditions.
+ */
+struct ceph_msgpool {
+	const char *name;
+	mempool_t *pool;
+	int front_len;          /* preallocated payload size */
+};
+
+extern int ceph_msgpool_init(struct ceph_msgpool *pool,
+			     int front_len, int size, bool blocking,
+			     const char *name);
+extern void ceph_msgpool_destroy(struct ceph_msgpool *pool);
+extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *,
+					 int front_len);
+extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *);
+
+#endif
diff --git a/fs/ceph/msgr.h b/fs/ceph/msgr.h
new file mode 100644
index 000000000000..892a0298dfdf
--- /dev/null
+++ b/fs/ceph/msgr.h
@@ -0,0 +1,175 @@
+#ifndef __MSGR_H
+#define __MSGR_H
+
+/*
+ * Data types for message passing layer used by Ceph.
+ */
+
+#define CEPH_MON_PORT    6789  /* default monitor port */
+
+/*
+ * client-side processes will try to bind to ports in this
+ * range, simply for the benefit of tools like nmap or wireshark
+ * that would like to identify the protocol.
+ */
+#define CEPH_PORT_FIRST  6789
+#define CEPH_PORT_START  6800  /* non-monitors start here */
+#define CEPH_PORT_LAST   6900
+
+/*
+ * tcp connection banner.  include a protocol version. and adjust
+ * whenever the wire protocol changes.  try to keep this string length
+ * constant.
+ */
+#define CEPH_BANNER "ceph v027"
+#define CEPH_BANNER_MAX_LEN 30
+
+
+/*
+ * Rollover-safe type and comparator for 32-bit sequence numbers.
+ * Comparator returns -1, 0, or 1.
+ */
+typedef __u32 ceph_seq_t;
+
+static inline __s32 ceph_seq_cmp(__u32 a, __u32 b)
+{
+       return (__s32)a - (__s32)b;
+}
+
+
+/*
+ * entity_name -- logical name for a process participating in the
+ * network, e.g. 'mds0' or 'osd3'.
+ */
+struct ceph_entity_name {
+	__u8 type;      /* CEPH_ENTITY_TYPE_* */
+	__le64 num;
+} __attribute__ ((packed));
+
+#define CEPH_ENTITY_TYPE_MON    0x01
+#define CEPH_ENTITY_TYPE_MDS    0x02
+#define CEPH_ENTITY_TYPE_OSD    0x04
+#define CEPH_ENTITY_TYPE_CLIENT 0x08
+#define CEPH_ENTITY_TYPE_AUTH   0x20
+
+#define CEPH_ENTITY_TYPE_ANY    0xFF
+
+extern const char *ceph_entity_type_name(int type);
+
+/*
+ * entity_addr -- network address
+ */
+struct ceph_entity_addr {
+	__le32 type;
+	__le32 nonce;  /* unique id for process (e.g. pid) */
+	struct sockaddr_storage in_addr;
+} __attribute__ ((packed));
+
+struct ceph_entity_inst {
+	struct ceph_entity_name name;
+	struct ceph_entity_addr addr;
+} __attribute__ ((packed));
+
+
+/* used by message exchange protocol */
+#define CEPH_MSGR_TAG_READY         1  /* server->client: ready for messages */
+#define CEPH_MSGR_TAG_RESETSESSION  2  /* server->client: reset, try again */
+#define CEPH_MSGR_TAG_WAIT          3  /* server->client: wait for racing
+					  incoming connection */
+#define CEPH_MSGR_TAG_RETRY_SESSION 4  /* server->client + cseq: try again
+					  with higher cseq */
+#define CEPH_MSGR_TAG_RETRY_GLOBAL  5  /* server->client + gseq: try again
+					  with higher gseq */
+#define CEPH_MSGR_TAG_CLOSE         6  /* closing pipe */
+#define CEPH_MSGR_TAG_MSG           7  /* message */
+#define CEPH_MSGR_TAG_ACK           8  /* message ack */
+#define CEPH_MSGR_TAG_KEEPALIVE     9  /* just a keepalive byte! */
+#define CEPH_MSGR_TAG_BADPROTOVER  10  /* bad protocol version */
+#define CEPH_MSGR_TAG_BADAUTHORIZER 11 /* bad authorizer */
+#define CEPH_MSGR_TAG_FEATURES      12 /* insufficient features */
+
+
+/*
+ * connection negotiation
+ */
+struct ceph_msg_connect {
+	__le64 features;     /* supported feature bits */
+	__le32 host_type;    /* CEPH_ENTITY_TYPE_* */
+	__le32 global_seq;   /* count connections initiated by this host */
+	__le32 connect_seq;  /* count connections initiated in this session */
+	__le32 protocol_version;
+	__le32 authorizer_protocol;
+	__le32 authorizer_len;
+	__u8  flags;         /* CEPH_MSG_CONNECT_* */
+} __attribute__ ((packed));
+
+struct ceph_msg_connect_reply {
+	__u8 tag;
+	__le64 features;     /* feature bits for this session */
+	__le32 global_seq;
+	__le32 connect_seq;
+	__le32 protocol_version;
+	__le32 authorizer_len;
+	__u8 flags;
+} __attribute__ ((packed));
+
+#define CEPH_MSG_CONNECT_LOSSY  1  /* messages i send may be safely dropped */
+
+
+/*
+ * message header
+ */
+struct ceph_msg_header_old {
+	__le64 seq;       /* message seq# for this session */
+	__le64 tid;       /* transaction id */
+	__le16 type;      /* message type */
+	__le16 priority;  /* priority.  higher value == higher priority */
+	__le16 version;   /* version of message encoding */
+
+	__le32 front_len; /* bytes in main payload */
+	__le32 middle_len;/* bytes in middle payload */
+	__le32 data_len;  /* bytes of data payload */
+	__le16 data_off;  /* sender: include full offset;
+			     receiver: mask against ~PAGE_MASK */
+
+	struct ceph_entity_inst src, orig_src;
+	__le32 reserved;
+	__le32 crc;       /* header crc32c */
+} __attribute__ ((packed));
+
+struct ceph_msg_header {
+	__le64 seq;       /* message seq# for this session */
+	__le64 tid;       /* transaction id */
+	__le16 type;      /* message type */
+	__le16 priority;  /* priority.  higher value == higher priority */
+	__le16 version;   /* version of message encoding */
+
+	__le32 front_len; /* bytes in main payload */
+	__le32 middle_len;/* bytes in middle payload */
+	__le32 data_len;  /* bytes of data payload */
+	__le16 data_off;  /* sender: include full offset;
+			     receiver: mask against ~PAGE_MASK */
+
+	struct ceph_entity_name src;
+	__le32 reserved;
+	__le32 crc;       /* header crc32c */
+} __attribute__ ((packed));
+
+#define CEPH_MSG_PRIO_LOW     64
+#define CEPH_MSG_PRIO_DEFAULT 127
+#define CEPH_MSG_PRIO_HIGH    196
+#define CEPH_MSG_PRIO_HIGHEST 255
+
+/*
+ * follows data payload
+ */
+struct ceph_msg_footer {
+	__le32 front_crc, middle_crc, data_crc;
+	__u8 flags;
+} __attribute__ ((packed));
+
+#define CEPH_MSG_FOOTER_COMPLETE  (1<<0)   /* msg wasn't aborted */
+#define CEPH_MSG_FOOTER_NOCRC     (1<<1)   /* no data crc */
+
+
+#endif
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
new file mode 100644
index 000000000000..e38522347898
--- /dev/null
+++ b/fs/ceph/osd_client.c
@@ -0,0 +1,1542 @@
+#include "ceph_debug.h"
+
+#include <linux/err.h>
+#include <linux/highmem.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+
+#include "super.h"
+#include "osd_client.h"
+#include "messenger.h"
+#include "decode.h"
+#include "auth.h"
+
+#define OSD_OP_FRONT_LEN	4096
+#define OSD_OPREPLY_FRONT_LEN	512
+
+static const struct ceph_connection_operations osd_con_ops;
+static int __kick_requests(struct ceph_osd_client *osdc,
+			  struct ceph_osd *kickosd);
+
+static void kick_requests(struct ceph_osd_client *osdc, struct ceph_osd *osd);
+
+/*
+ * Implement client access to distributed object storage cluster.
+ *
+ * All data objects are stored within a cluster/cloud of OSDs, or
+ * "object storage devices."  (Note that Ceph OSDs have _nothing_ to
+ * do with the T10 OSD extensions to SCSI.)  Ceph OSDs are simply
+ * remote daemons serving up and coordinating consistent and safe
+ * access to storage.
+ *
+ * Cluster membership and the mapping of data objects onto storage devices
+ * are described by the osd map.
+ *
+ * We keep track of pending OSD requests (read, write), resubmit
+ * requests to different OSDs when the cluster topology/data layout
+ * change, or retry the affected requests when the communications
+ * channel with an OSD is reset.
+ */
+
+/*
+ * calculate the mapping of a file extent onto an object, and fill out the
+ * request accordingly.  shorten extent as necessary if it crosses an
+ * object boundary.
+ *
+ * fill osd op in request message.
+ */
+static void calc_layout(struct ceph_osd_client *osdc,
+			struct ceph_vino vino, struct ceph_file_layout *layout,
+			u64 off, u64 *plen,
+			struct ceph_osd_request *req)
+{
+	struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
+	struct ceph_osd_op *op = (void *)(reqhead + 1);
+	u64 orig_len = *plen;
+	u64 objoff, objlen;    /* extent in object */
+	u64 bno;
+
+	reqhead->snapid = cpu_to_le64(vino.snap);
+
+	/* object extent? */
+	ceph_calc_file_object_mapping(layout, off, plen, &bno,
+				      &objoff, &objlen);
+	if (*plen < orig_len)
+		dout(" skipping last %llu, final file extent %llu~%llu\n",
+		     orig_len - *plen, off, *plen);
+
+	sprintf(req->r_oid, "%llx.%08llx", vino.ino, bno);
+	req->r_oid_len = strlen(req->r_oid);
+
+	op->extent.offset = cpu_to_le64(objoff);
+	op->extent.length = cpu_to_le64(objlen);
+	req->r_num_pages = calc_pages_for(off, *plen);
+
+	dout("calc_layout %s (%d) %llu~%llu (%d pages)\n",
+	     req->r_oid, req->r_oid_len, objoff, objlen, req->r_num_pages);
+}
+
+/*
+ * requests
+ */
+void ceph_osdc_release_request(struct kref *kref)
+{
+	struct ceph_osd_request *req = container_of(kref,
+						    struct ceph_osd_request,
+						    r_kref);
+
+	if (req->r_request)
+		ceph_msg_put(req->r_request);
+	if (req->r_reply)
+		ceph_msg_put(req->r_reply);
+	if (req->r_con_filling_msg) {
+		dout("release_request revoking pages %p from con %p\n",
+		     req->r_pages, req->r_con_filling_msg);
+		ceph_con_revoke_message(req->r_con_filling_msg,
+				      req->r_reply);
+		ceph_con_put(req->r_con_filling_msg);
+	}
+	if (req->r_own_pages)
+		ceph_release_page_vector(req->r_pages,
+					 req->r_num_pages);
+	ceph_put_snap_context(req->r_snapc);
+	if (req->r_mempool)
+		mempool_free(req, req->r_osdc->req_mempool);
+	else
+		kfree(req);
+}
+
+/*
+ * build new request AND message, calculate layout, and adjust file
+ * extent as needed.
+ *
+ * if the file was recently truncated, we include information about its
+ * old and new size so that the object can be updated appropriately.  (we
+ * avoid synchronously deleting truncated objects because it's slow.)
+ *
+ * if @do_sync, include a 'startsync' command so that the osd will flush
+ * data quickly.
+ */
+struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
+					       struct ceph_file_layout *layout,
+					       struct ceph_vino vino,
+					       u64 off, u64 *plen,
+					       int opcode, int flags,
+					       struct ceph_snap_context *snapc,
+					       int do_sync,
+					       u32 truncate_seq,
+					       u64 truncate_size,
+					       struct timespec *mtime,
+					       bool use_mempool, int num_reply)
+{
+	struct ceph_osd_request *req;
+	struct ceph_msg *msg;
+	struct ceph_osd_request_head *head;
+	struct ceph_osd_op *op;
+	void *p;
+	int num_op = 1 + do_sync;
+	size_t msg_size = sizeof(*head) + num_op*sizeof(*op);
+	int i;
+
+	if (use_mempool) {
+		req = mempool_alloc(osdc->req_mempool, GFP_NOFS);
+		memset(req, 0, sizeof(*req));
+	} else {
+		req = kzalloc(sizeof(*req), GFP_NOFS);
+	}
+	if (req == NULL)
+		return NULL;
+
+	req->r_osdc = osdc;
+	req->r_mempool = use_mempool;
+	kref_init(&req->r_kref);
+	init_completion(&req->r_completion);
+	init_completion(&req->r_safe_completion);
+	INIT_LIST_HEAD(&req->r_unsafe_item);
+	req->r_flags = flags;
+
+	WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0);
+
+	/* create reply message */
+	if (use_mempool)
+		msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
+	else
+		msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY,
+				   OSD_OPREPLY_FRONT_LEN, GFP_NOFS);
+	if (!msg) {
+		ceph_osdc_put_request(req);
+		return NULL;
+	}
+	req->r_reply = msg;
+
+	/* create request message; allow space for oid */
+	msg_size += 40;
+	if (snapc)
+		msg_size += sizeof(u64) * snapc->num_snaps;
+	if (use_mempool)
+		msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
+	else
+		msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, GFP_NOFS);
+	if (!msg) {
+		ceph_osdc_put_request(req);
+		return NULL;
+	}
+	msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP);
+	memset(msg->front.iov_base, 0, msg->front.iov_len);
+	head = msg->front.iov_base;
+	op = (void *)(head + 1);
+	p = (void *)(op + num_op);
+
+	req->r_request = msg;
+	req->r_snapc = ceph_get_snap_context(snapc);
+
+	head->client_inc = cpu_to_le32(1); /* always, for now. */
+	head->flags = cpu_to_le32(flags);
+	if (flags & CEPH_OSD_FLAG_WRITE)
+		ceph_encode_timespec(&head->mtime, mtime);
+	head->num_ops = cpu_to_le16(num_op);
+	op->op = cpu_to_le16(opcode);
+
+	/* calculate max write size */
+	calc_layout(osdc, vino, layout, off, plen, req);
+	req->r_file_layout = *layout;  /* keep a copy */
+
+	if (flags & CEPH_OSD_FLAG_WRITE) {
+		req->r_request->hdr.data_off = cpu_to_le16(off);
+		req->r_request->hdr.data_len = cpu_to_le32(*plen);
+		op->payload_len = cpu_to_le32(*plen);
+	}
+	op->extent.truncate_size = cpu_to_le64(truncate_size);
+	op->extent.truncate_seq = cpu_to_le32(truncate_seq);
+
+	/* fill in oid */
+	head->object_len = cpu_to_le32(req->r_oid_len);
+	memcpy(p, req->r_oid, req->r_oid_len);
+	p += req->r_oid_len;
+
+	if (do_sync) {
+		op++;
+		op->op = cpu_to_le16(CEPH_OSD_OP_STARTSYNC);
+	}
+	if (snapc) {
+		head->snap_seq = cpu_to_le64(snapc->seq);
+		head->num_snaps = cpu_to_le32(snapc->num_snaps);
+		for (i = 0; i < snapc->num_snaps; i++) {
+			put_unaligned_le64(snapc->snaps[i], p);
+			p += sizeof(u64);
+		}
+	}
+
+	BUG_ON(p > msg->front.iov_base + msg->front.iov_len);
+	msg_size = p - msg->front.iov_base;
+	msg->front.iov_len = msg_size;
+	msg->hdr.front_len = cpu_to_le32(msg_size);
+	return req;
+}
+
+/*
+ * We keep osd requests in an rbtree, sorted by ->r_tid.
+ */
+static void __insert_request(struct ceph_osd_client *osdc,
+			     struct ceph_osd_request *new)
+{
+	struct rb_node **p = &osdc->requests.rb_node;
+	struct rb_node *parent = NULL;
+	struct ceph_osd_request *req = NULL;
+
+	while (*p) {
+		parent = *p;
+		req = rb_entry(parent, struct ceph_osd_request, r_node);
+		if (new->r_tid < req->r_tid)
+			p = &(*p)->rb_left;
+		else if (new->r_tid > req->r_tid)
+			p = &(*p)->rb_right;
+		else
+			BUG();
+	}
+
+	rb_link_node(&new->r_node, parent, p);
+	rb_insert_color(&new->r_node, &osdc->requests);
+}
+
+static struct ceph_osd_request *__lookup_request(struct ceph_osd_client *osdc,
+						 u64 tid)
+{
+	struct ceph_osd_request *req;
+	struct rb_node *n = osdc->requests.rb_node;
+
+	while (n) {
+		req = rb_entry(n, struct ceph_osd_request, r_node);
+		if (tid < req->r_tid)
+			n = n->rb_left;
+		else if (tid > req->r_tid)
+			n = n->rb_right;
+		else
+			return req;
+	}
+	return NULL;
+}
+
+static struct ceph_osd_request *
+__lookup_request_ge(struct ceph_osd_client *osdc,
+		    u64 tid)
+{
+	struct ceph_osd_request *req;
+	struct rb_node *n = osdc->requests.rb_node;
+
+	while (n) {
+		req = rb_entry(n, struct ceph_osd_request, r_node);
+		if (tid < req->r_tid) {
+			if (!n->rb_left)
+				return req;
+			n = n->rb_left;
+		} else if (tid > req->r_tid) {
+			n = n->rb_right;
+		} else {
+			return req;
+		}
+	}
+	return NULL;
+}
+
+
+/*
+ * If the osd connection drops, we need to resubmit all requests.
+ */
+static void osd_reset(struct ceph_connection *con)
+{
+	struct ceph_osd *osd = con->private;
+	struct ceph_osd_client *osdc;
+
+	if (!osd)
+		return;
+	dout("osd_reset osd%d\n", osd->o_osd);
+	osdc = osd->o_osdc;
+	down_read(&osdc->map_sem);
+	kick_requests(osdc, osd);
+	up_read(&osdc->map_sem);
+}
+
+/*
+ * Track open sessions with osds.
+ */
+static struct ceph_osd *create_osd(struct ceph_osd_client *osdc)
+{
+	struct ceph_osd *osd;
+
+	osd = kzalloc(sizeof(*osd), GFP_NOFS);
+	if (!osd)
+		return NULL;
+
+	atomic_set(&osd->o_ref, 1);
+	osd->o_osdc = osdc;
+	INIT_LIST_HEAD(&osd->o_requests);
+	INIT_LIST_HEAD(&osd->o_osd_lru);
+	osd->o_incarnation = 1;
+
+	ceph_con_init(osdc->client->msgr, &osd->o_con);
+	osd->o_con.private = osd;
+	osd->o_con.ops = &osd_con_ops;
+	osd->o_con.peer_name.type = CEPH_ENTITY_TYPE_OSD;
+
+	INIT_LIST_HEAD(&osd->o_keepalive_item);
+	return osd;
+}
+
+static struct ceph_osd *get_osd(struct ceph_osd *osd)
+{
+	if (atomic_inc_not_zero(&osd->o_ref)) {
+		dout("get_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref)-1,
+		     atomic_read(&osd->o_ref));
+		return osd;
+	} else {
+		dout("get_osd %p FAIL\n", osd);
+		return NULL;
+	}
+}
+
+static void put_osd(struct ceph_osd *osd)
+{
+	dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref),
+	     atomic_read(&osd->o_ref) - 1);
+	if (atomic_dec_and_test(&osd->o_ref)) {
+		struct ceph_auth_client *ac = osd->o_osdc->client->monc.auth;
+
+		if (osd->o_authorizer)
+			ac->ops->destroy_authorizer(ac, osd->o_authorizer);
+		kfree(osd);
+	}
+}
+
+/*
+ * remove an osd from our map
+ */
+static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
+{
+	dout("__remove_osd %p\n", osd);
+	BUG_ON(!list_empty(&osd->o_requests));
+	rb_erase(&osd->o_node, &osdc->osds);
+	list_del_init(&osd->o_osd_lru);
+	ceph_con_close(&osd->o_con);
+	put_osd(osd);
+}
+
+static void __move_osd_to_lru(struct ceph_osd_client *osdc,
+			      struct ceph_osd *osd)
+{
+	dout("__move_osd_to_lru %p\n", osd);
+	BUG_ON(!list_empty(&osd->o_osd_lru));
+	list_add_tail(&osd->o_osd_lru, &osdc->osd_lru);
+	osd->lru_ttl = jiffies + osdc->client->mount_args->osd_idle_ttl * HZ;
+}
+
+static void __remove_osd_from_lru(struct ceph_osd *osd)
+{
+	dout("__remove_osd_from_lru %p\n", osd);
+	if (!list_empty(&osd->o_osd_lru))
+		list_del_init(&osd->o_osd_lru);
+}
+
+static void remove_old_osds(struct ceph_osd_client *osdc, int remove_all)
+{
+	struct ceph_osd *osd, *nosd;
+
+	dout("__remove_old_osds %p\n", osdc);
+	mutex_lock(&osdc->request_mutex);
+	list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) {
+		if (!remove_all && time_before(jiffies, osd->lru_ttl))
+			break;
+		__remove_osd(osdc, osd);
+	}
+	mutex_unlock(&osdc->request_mutex);
+}
+
+/*
+ * reset osd connect
+ */
+static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
+{
+	struct ceph_osd_request *req;
+	int ret = 0;
+
+	dout("__reset_osd %p osd%d\n", osd, osd->o_osd);
+	if (list_empty(&osd->o_requests)) {
+		__remove_osd(osdc, osd);
+	} else if (memcmp(&osdc->osdmap->osd_addr[osd->o_osd],
+			  &osd->o_con.peer_addr,
+			  sizeof(osd->o_con.peer_addr)) == 0 &&
+		   !ceph_con_opened(&osd->o_con)) {
+		dout(" osd addr hasn't changed and connection never opened,"
+		     " letting msgr retry");
+		/* touch each r_stamp for handle_timeout()'s benfit */
+		list_for_each_entry(req, &osd->o_requests, r_osd_item)
+			req->r_stamp = jiffies;
+		ret = -EAGAIN;
+	} else {
+		ceph_con_close(&osd->o_con);
+		ceph_con_open(&osd->o_con, &osdc->osdmap->osd_addr[osd->o_osd]);
+		osd->o_incarnation++;
+	}
+	return ret;
+}
+
+static void __insert_osd(struct ceph_osd_client *osdc, struct ceph_osd *new)
+{
+	struct rb_node **p = &osdc->osds.rb_node;
+	struct rb_node *parent = NULL;
+	struct ceph_osd *osd = NULL;
+
+	while (*p) {
+		parent = *p;
+		osd = rb_entry(parent, struct ceph_osd, o_node);
+		if (new->o_osd < osd->o_osd)
+			p = &(*p)->rb_left;
+		else if (new->o_osd > osd->o_osd)
+			p = &(*p)->rb_right;
+		else
+			BUG();
+	}
+
+	rb_link_node(&new->o_node, parent, p);
+	rb_insert_color(&new->o_node, &osdc->osds);
+}
+
+static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o)
+{
+	struct ceph_osd *osd;
+	struct rb_node *n = osdc->osds.rb_node;
+
+	while (n) {
+		osd = rb_entry(n, struct ceph_osd, o_node);
+		if (o < osd->o_osd)
+			n = n->rb_left;
+		else if (o > osd->o_osd)
+			n = n->rb_right;
+		else
+			return osd;
+	}
+	return NULL;
+}
+
+static void __schedule_osd_timeout(struct ceph_osd_client *osdc)
+{
+	schedule_delayed_work(&osdc->timeout_work,
+			osdc->client->mount_args->osd_keepalive_timeout * HZ);
+}
+
+static void __cancel_osd_timeout(struct ceph_osd_client *osdc)
+{
+	cancel_delayed_work(&osdc->timeout_work);
+}
+
+/*
+ * Register request, assign tid.  If this is the first request, set up
+ * the timeout event.
+ */
+static void register_request(struct ceph_osd_client *osdc,
+			     struct ceph_osd_request *req)
+{
+	mutex_lock(&osdc->request_mutex);
+	req->r_tid = ++osdc->last_tid;
+	req->r_request->hdr.tid = cpu_to_le64(req->r_tid);
+	INIT_LIST_HEAD(&req->r_req_lru_item);
+
+	dout("register_request %p tid %lld\n", req, req->r_tid);
+	__insert_request(osdc, req);
+	ceph_osdc_get_request(req);
+	osdc->num_requests++;
+
+	if (osdc->num_requests == 1) {
+		dout(" first request, scheduling timeout\n");
+		__schedule_osd_timeout(osdc);
+	}
+	mutex_unlock(&osdc->request_mutex);
+}
+
+/*
+ * called under osdc->request_mutex
+ */
+static void __unregister_request(struct ceph_osd_client *osdc,
+				 struct ceph_osd_request *req)
+{
+	dout("__unregister_request %p tid %lld\n", req, req->r_tid);
+	rb_erase(&req->r_node, &osdc->requests);
+	osdc->num_requests--;
+
+	if (req->r_osd) {
+		/* make sure the original request isn't in flight. */
+		ceph_con_revoke(&req->r_osd->o_con, req->r_request);
+
+		list_del_init(&req->r_osd_item);
+		if (list_empty(&req->r_osd->o_requests))
+			__move_osd_to_lru(osdc, req->r_osd);
+		req->r_osd = NULL;
+	}
+
+	ceph_osdc_put_request(req);
+
+	list_del_init(&req->r_req_lru_item);
+	if (osdc->num_requests == 0) {
+		dout(" no requests, canceling timeout\n");
+		__cancel_osd_timeout(osdc);
+	}
+}
+
+/*
+ * Cancel a previously queued request message
+ */
+static void __cancel_request(struct ceph_osd_request *req)
+{
+	if (req->r_sent) {
+		ceph_con_revoke(&req->r_osd->o_con, req->r_request);
+		req->r_sent = 0;
+	}
+	list_del_init(&req->r_req_lru_item);
+}
+
+/*
+ * Pick an osd (the first 'up' osd in the pg), allocate the osd struct
+ * (as needed), and set the request r_osd appropriately.  If there is
+ * no up osd, set r_osd to NULL.
+ *
+ * Return 0 if unchanged, 1 if changed, or negative on error.
+ *
+ * Caller should hold map_sem for read and request_mutex.
+ */
+static int __map_osds(struct ceph_osd_client *osdc,
+		      struct ceph_osd_request *req)
+{
+	struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
+	struct ceph_pg pgid;
+	int acting[CEPH_PG_MAX_SIZE];
+	int o = -1, num = 0;
+	int err;
+
+	dout("map_osds %p tid %lld\n", req, req->r_tid);
+	err = ceph_calc_object_layout(&reqhead->layout, req->r_oid,
+				      &req->r_file_layout, osdc->osdmap);
+	if (err)
+		return err;
+	pgid = reqhead->layout.ol_pgid;
+	req->r_pgid = pgid;
+
+	err = ceph_calc_pg_acting(osdc->osdmap, pgid, acting);
+	if (err > 0) {
+		o = acting[0];
+		num = err;
+	}
+
+	if ((req->r_osd && req->r_osd->o_osd == o &&
+	     req->r_sent >= req->r_osd->o_incarnation &&
+	     req->r_num_pg_osds == num &&
+	     memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) ||
+	    (req->r_osd == NULL && o == -1))
+		return 0;  /* no change */
+
+	dout("map_osds tid %llu pgid %d.%x osd%d (was osd%d)\n",
+	     req->r_tid, le32_to_cpu(pgid.pool), le16_to_cpu(pgid.ps), o,
+	     req->r_osd ? req->r_osd->o_osd : -1);
+
+	/* record full pg acting set */
+	memcpy(req->r_pg_osds, acting, sizeof(acting[0]) * num);
+	req->r_num_pg_osds = num;
+
+	if (req->r_osd) {
+		__cancel_request(req);
+		list_del_init(&req->r_osd_item);
+		req->r_osd = NULL;
+	}
+
+	req->r_osd = __lookup_osd(osdc, o);
+	if (!req->r_osd && o >= 0) {
+		err = -ENOMEM;
+		req->r_osd = create_osd(osdc);
+		if (!req->r_osd)
+			goto out;
+
+		dout("map_osds osd %p is osd%d\n", req->r_osd, o);
+		req->r_osd->o_osd = o;
+		req->r_osd->o_con.peer_name.num = cpu_to_le64(o);
+		__insert_osd(osdc, req->r_osd);
+
+		ceph_con_open(&req->r_osd->o_con, &osdc->osdmap->osd_addr[o]);
+	}
+
+	if (req->r_osd) {
+		__remove_osd_from_lru(req->r_osd);
+		list_add(&req->r_osd_item, &req->r_osd->o_requests);
+	}
+	err = 1;   /* osd or pg changed */
+
+out:
+	return err;
+}
+
+/*
+ * caller should hold map_sem (for read) and request_mutex
+ */
+static int __send_request(struct ceph_osd_client *osdc,
+			  struct ceph_osd_request *req)
+{
+	struct ceph_osd_request_head *reqhead;
+	int err;
+
+	err = __map_osds(osdc, req);
+	if (err < 0)
+		return err;
+	if (req->r_osd == NULL) {
+		dout("send_request %p no up osds in pg\n", req);
+		ceph_monc_request_next_osdmap(&osdc->client->monc);
+		return 0;
+	}
+
+	dout("send_request %p tid %llu to osd%d flags %d\n",
+	     req, req->r_tid, req->r_osd->o_osd, req->r_flags);
+
+	reqhead = req->r_request->front.iov_base;
+	reqhead->osdmap_epoch = cpu_to_le32(osdc->osdmap->epoch);
+	reqhead->flags |= cpu_to_le32(req->r_flags);  /* e.g., RETRY */
+	reqhead->reassert_version = req->r_reassert_version;
+
+	req->r_stamp = jiffies;
+	list_move_tail(&osdc->req_lru, &req->r_req_lru_item);
+
+	ceph_msg_get(req->r_request); /* send consumes a ref */
+	ceph_con_send(&req->r_osd->o_con, req->r_request);
+	req->r_sent = req->r_osd->o_incarnation;
+	return 0;
+}
+
+/*
+ * Timeout callback, called every N seconds when 1 or more osd
+ * requests has been active for more than N seconds.  When this
+ * happens, we ping all OSDs with requests who have timed out to
+ * ensure any communications channel reset is detected.  Reset the
+ * request timeouts another N seconds in the future as we go.
+ * Reschedule the timeout event another N seconds in future (unless
+ * there are no open requests).
+ */
+static void handle_timeout(struct work_struct *work)
+{
+	struct ceph_osd_client *osdc =
+		container_of(work, struct ceph_osd_client, timeout_work.work);
+	struct ceph_osd_request *req, *last_req = NULL;
+	struct ceph_osd *osd;
+	unsigned long timeout = osdc->client->mount_args->osd_timeout * HZ;
+	unsigned long keepalive =
+		osdc->client->mount_args->osd_keepalive_timeout * HZ;
+	unsigned long last_stamp = 0;
+	struct rb_node *p;
+	struct list_head slow_osds;
+
+	dout("timeout\n");
+	down_read(&osdc->map_sem);
+
+	ceph_monc_request_next_osdmap(&osdc->client->monc);
+
+	mutex_lock(&osdc->request_mutex);
+	for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
+		req = rb_entry(p, struct ceph_osd_request, r_node);
+
+		if (req->r_resend) {
+			int err;
+
+			dout("osdc resending prev failed %lld\n", req->r_tid);
+			err = __send_request(osdc, req);
+			if (err)
+				dout("osdc failed again on %lld\n", req->r_tid);
+			else
+				req->r_resend = false;
+			continue;
+		}
+	}
+
+	/*
+	 * reset osds that appear to be _really_ unresponsive.  this
+	 * is a failsafe measure.. we really shouldn't be getting to
+	 * this point if the system is working properly.  the monitors
+	 * should mark the osd as failed and we should find out about
+	 * it from an updated osd map.
+	 */
+	while (timeout && !list_empty(&osdc->req_lru)) {
+		req = list_entry(osdc->req_lru.next, struct ceph_osd_request,
+				 r_req_lru_item);
+
+		if (time_before(jiffies, req->r_stamp + timeout))
+			break;
+
+		BUG_ON(req == last_req && req->r_stamp == last_stamp);
+		last_req = req;
+		last_stamp = req->r_stamp;
+
+		osd = req->r_osd;
+		BUG_ON(!osd);
+		pr_warning(" tid %llu timed out on osd%d, will reset osd\n",
+			   req->r_tid, osd->o_osd);
+		__kick_requests(osdc, osd);
+	}
+
+	/*
+	 * ping osds that are a bit slow.  this ensures that if there
+	 * is a break in the TCP connection we will notice, and reopen
+	 * a connection with that osd (from the fault callback).
+	 */
+	INIT_LIST_HEAD(&slow_osds);
+	list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) {
+		if (time_before(jiffies, req->r_stamp + keepalive))
+			break;
+
+		osd = req->r_osd;
+		BUG_ON(!osd);
+		dout(" tid %llu is slow, will send keepalive on osd%d\n",
+		     req->r_tid, osd->o_osd);
+		list_move_tail(&osd->o_keepalive_item, &slow_osds);
+	}
+	while (!list_empty(&slow_osds)) {
+		osd = list_entry(slow_osds.next, struct ceph_osd,
+				 o_keepalive_item);
+		list_del_init(&osd->o_keepalive_item);
+		ceph_con_keepalive(&osd->o_con);
+	}
+
+	__schedule_osd_timeout(osdc);
+	mutex_unlock(&osdc->request_mutex);
+
+	up_read(&osdc->map_sem);
+}
+
+static void handle_osds_timeout(struct work_struct *work)
+{
+	struct ceph_osd_client *osdc =
+		container_of(work, struct ceph_osd_client,
+			     osds_timeout_work.work);
+	unsigned long delay =
+		osdc->client->mount_args->osd_idle_ttl * HZ >> 2;
+
+	dout("osds timeout\n");
+	down_read(&osdc->map_sem);
+	remove_old_osds(osdc, 0);
+	up_read(&osdc->map_sem);
+
+	schedule_delayed_work(&osdc->osds_timeout_work,
+			      round_jiffies_relative(delay));
+}
+
+/*
+ * handle osd op reply.  either call the callback if it is specified,
+ * or do the completion to wake up the waiting thread.
+ */
+static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
+			 struct ceph_connection *con)
+{
+	struct ceph_osd_reply_head *rhead = msg->front.iov_base;
+	struct ceph_osd_request *req;
+	u64 tid;
+	int numops, object_len, flags;
+	s32 result;
+
+	tid = le64_to_cpu(msg->hdr.tid);
+	if (msg->front.iov_len < sizeof(*rhead))
+		goto bad;
+	numops = le32_to_cpu(rhead->num_ops);
+	object_len = le32_to_cpu(rhead->object_len);
+	result = le32_to_cpu(rhead->result);
+	if (msg->front.iov_len != sizeof(*rhead) + object_len +
+	    numops * sizeof(struct ceph_osd_op))
+		goto bad;
+	dout("handle_reply %p tid %llu result %d\n", msg, tid, (int)result);
+
+	/* lookup */
+	mutex_lock(&osdc->request_mutex);
+	req = __lookup_request(osdc, tid);
+	if (req == NULL) {
+		dout("handle_reply tid %llu dne\n", tid);
+		mutex_unlock(&osdc->request_mutex);
+		return;
+	}
+	ceph_osdc_get_request(req);
+	flags = le32_to_cpu(rhead->flags);
+
+	/*
+	 * if this connection filled our message, drop our reference now, to
+	 * avoid a (safe but slower) revoke later.
+	 */
+	if (req->r_con_filling_msg == con && req->r_reply == msg) {
+		dout(" dropping con_filling_msg ref %p\n", con);
+		req->r_con_filling_msg = NULL;
+		ceph_con_put(con);
+	}
+
+	if (!req->r_got_reply) {
+		unsigned bytes;
+
+		req->r_result = le32_to_cpu(rhead->result);
+		bytes = le32_to_cpu(msg->hdr.data_len);
+		dout("handle_reply result %d bytes %d\n", req->r_result,
+		     bytes);
+		if (req->r_result == 0)
+			req->r_result = bytes;
+
+		/* in case this is a write and we need to replay, */
+		req->r_reassert_version = rhead->reassert_version;
+
+		req->r_got_reply = 1;
+	} else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) {
+		dout("handle_reply tid %llu dup ack\n", tid);
+		mutex_unlock(&osdc->request_mutex);
+		goto done;
+	}
+
+	dout("handle_reply tid %llu flags %d\n", tid, flags);
+
+	/* either this is a read, or we got the safe response */
+	if (result < 0 ||
+	    (flags & CEPH_OSD_FLAG_ONDISK) ||
+	    ((flags & CEPH_OSD_FLAG_WRITE) == 0))
+		__unregister_request(osdc, req);
+
+	mutex_unlock(&osdc->request_mutex);
+
+	if (req->r_callback)
+		req->r_callback(req, msg);
+	else
+		complete_all(&req->r_completion);
+
+	if (flags & CEPH_OSD_FLAG_ONDISK) {
+		if (req->r_safe_callback)
+			req->r_safe_callback(req, msg);
+		complete_all(&req->r_safe_completion);  /* fsync waiter */
+	}
+
+done:
+	ceph_osdc_put_request(req);
+	return;
+
+bad:
+	pr_err("corrupt osd_op_reply got %d %d expected %d\n",
+	       (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len),
+	       (int)sizeof(*rhead));
+	ceph_msg_dump(msg);
+}
+
+
+static int __kick_requests(struct ceph_osd_client *osdc,
+			  struct ceph_osd *kickosd)
+{
+	struct ceph_osd_request *req;
+	struct rb_node *p, *n;
+	int needmap = 0;
+	int err;
+
+	dout("kick_requests osd%d\n", kickosd ? kickosd->o_osd : -1);
+	if (kickosd) {
+		err = __reset_osd(osdc, kickosd);
+		if (err == -EAGAIN)
+			return 1;
+	} else {
+		for (p = rb_first(&osdc->osds); p; p = n) {
+			struct ceph_osd *osd =
+				rb_entry(p, struct ceph_osd, o_node);
+
+			n = rb_next(p);
+			if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) ||
+			    memcmp(&osd->o_con.peer_addr,
+				   ceph_osd_addr(osdc->osdmap,
+						 osd->o_osd),
+				   sizeof(struct ceph_entity_addr)) != 0)
+				__reset_osd(osdc, osd);
+		}
+	}
+
+	for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
+		req = rb_entry(p, struct ceph_osd_request, r_node);
+
+		if (req->r_resend) {
+			dout(" r_resend set on tid %llu\n", req->r_tid);
+			__cancel_request(req);
+			goto kick;
+		}
+		if (req->r_osd && kickosd == req->r_osd) {
+			__cancel_request(req);
+			goto kick;
+		}
+
+		err = __map_osds(osdc, req);
+		if (err == 0)
+			continue;  /* no change */
+		if (err < 0) {
+			/*
+			 * FIXME: really, we should set the request
+			 * error and fail if this isn't a 'nofail'
+			 * request, but that's a fair bit more
+			 * complicated to do.  So retry!
+			 */
+			dout(" setting r_resend on %llu\n", req->r_tid);
+			req->r_resend = true;
+			continue;
+		}
+		if (req->r_osd == NULL) {
+			dout("tid %llu maps to no valid osd\n", req->r_tid);
+			needmap++;  /* request a newer map */
+			continue;
+		}
+
+kick:
+		dout("kicking %p tid %llu osd%d\n", req, req->r_tid,
+		     req->r_osd ? req->r_osd->o_osd : -1);
+		req->r_flags |= CEPH_OSD_FLAG_RETRY;
+		err = __send_request(osdc, req);
+		if (err) {
+			dout(" setting r_resend on %llu\n", req->r_tid);
+			req->r_resend = true;
+		}
+	}
+
+	return needmap;
+}
+
+/*
+ * Resubmit osd requests whose osd or osd address has changed.  Request
+ * a new osd map if osds are down, or we are otherwise unable to determine
+ * how to direct a request.
+ *
+ * Close connections to down osds.
+ *
+ * If @who is specified, resubmit requests for that specific osd.
+ *
+ * Caller should hold map_sem for read and request_mutex.
+ */
+static void kick_requests(struct ceph_osd_client *osdc,
+			  struct ceph_osd *kickosd)
+{
+	int needmap;
+
+	mutex_lock(&osdc->request_mutex);
+	needmap = __kick_requests(osdc, kickosd);
+	mutex_unlock(&osdc->request_mutex);
+
+	if (needmap) {
+		dout("%d requests for down osds, need new map\n", needmap);
+		ceph_monc_request_next_osdmap(&osdc->client->monc);
+	}
+
+}
+/*
+ * Process updated osd map.
+ *
+ * The message contains any number of incremental and full maps, normally
+ * indicating some sort of topology change in the cluster.  Kick requests
+ * off to different OSDs as needed.
+ */
+void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
+{
+	void *p, *end, *next;
+	u32 nr_maps, maplen;
+	u32 epoch;
+	struct ceph_osdmap *newmap = NULL, *oldmap;
+	int err;
+	struct ceph_fsid fsid;
+
+	dout("handle_map have %u\n", osdc->osdmap ? osdc->osdmap->epoch : 0);
+	p = msg->front.iov_base;
+	end = p + msg->front.iov_len;
+
+	/* verify fsid */
+	ceph_decode_need(&p, end, sizeof(fsid), bad);
+	ceph_decode_copy(&p, &fsid, sizeof(fsid));
+	if (ceph_check_fsid(osdc->client, &fsid) < 0)
+		return;
+
+	down_write(&osdc->map_sem);
+
+	/* incremental maps */
+	ceph_decode_32_safe(&p, end, nr_maps, bad);
+	dout(" %d inc maps\n", nr_maps);
+	while (nr_maps > 0) {
+		ceph_decode_need(&p, end, 2*sizeof(u32), bad);
+		epoch = ceph_decode_32(&p);
+		maplen = ceph_decode_32(&p);
+		ceph_decode_need(&p, end, maplen, bad);
+		next = p + maplen;
+		if (osdc->osdmap && osdc->osdmap->epoch+1 == epoch) {
+			dout("applying incremental map %u len %d\n",
+			     epoch, maplen);
+			newmap = osdmap_apply_incremental(&p, next,
+							  osdc->osdmap,
+							  osdc->client->msgr);
+			if (IS_ERR(newmap)) {
+				err = PTR_ERR(newmap);
+				goto bad;
+			}
+			BUG_ON(!newmap);
+			if (newmap != osdc->osdmap) {
+				ceph_osdmap_destroy(osdc->osdmap);
+				osdc->osdmap = newmap;
+			}
+		} else {
+			dout("ignoring incremental map %u len %d\n",
+			     epoch, maplen);
+		}
+		p = next;
+		nr_maps--;
+	}
+	if (newmap)
+		goto done;
+
+	/* full maps */
+	ceph_decode_32_safe(&p, end, nr_maps, bad);
+	dout(" %d full maps\n", nr_maps);
+	while (nr_maps) {
+		ceph_decode_need(&p, end, 2*sizeof(u32), bad);
+		epoch = ceph_decode_32(&p);
+		maplen = ceph_decode_32(&p);
+		ceph_decode_need(&p, end, maplen, bad);
+		if (nr_maps > 1) {
+			dout("skipping non-latest full map %u len %d\n",
+			     epoch, maplen);
+		} else if (osdc->osdmap && osdc->osdmap->epoch >= epoch) {
+			dout("skipping full map %u len %d, "
+			     "older than our %u\n", epoch, maplen,
+			     osdc->osdmap->epoch);
+		} else {
+			dout("taking full map %u len %d\n", epoch, maplen);
+			newmap = osdmap_decode(&p, p+maplen);
+			if (IS_ERR(newmap)) {
+				err = PTR_ERR(newmap);
+				goto bad;
+			}
+			BUG_ON(!newmap);
+			oldmap = osdc->osdmap;
+			osdc->osdmap = newmap;
+			if (oldmap)
+				ceph_osdmap_destroy(oldmap);
+		}
+		p += maplen;
+		nr_maps--;
+	}
+
+done:
+	downgrade_write(&osdc->map_sem);
+	ceph_monc_got_osdmap(&osdc->client->monc, osdc->osdmap->epoch);
+	if (newmap)
+		kick_requests(osdc, NULL);
+	up_read(&osdc->map_sem);
+	wake_up_all(&osdc->client->auth_wq);
+	return;
+
+bad:
+	pr_err("osdc handle_map corrupt msg\n");
+	ceph_msg_dump(msg);
+	up_write(&osdc->map_sem);
+	return;
+}
+
+/*
+ * Register request, send initial attempt.
+ */
+int ceph_osdc_start_request(struct ceph_osd_client *osdc,
+			    struct ceph_osd_request *req,
+			    bool nofail)
+{
+	int rc = 0;
+
+	req->r_request->pages = req->r_pages;
+	req->r_request->nr_pages = req->r_num_pages;
+
+	register_request(osdc, req);
+
+	down_read(&osdc->map_sem);
+	mutex_lock(&osdc->request_mutex);
+	/*
+	 * a racing kick_requests() may have sent the message for us
+	 * while we dropped request_mutex above, so only send now if
+	 * the request still han't been touched yet.
+	 */
+	if (req->r_sent == 0) {
+		rc = __send_request(osdc, req);
+		if (rc) {
+			if (nofail) {
+				dout("osdc_start_request failed send, "
+				     " marking %lld\n", req->r_tid);
+				req->r_resend = true;
+				rc = 0;
+			} else {
+				__unregister_request(osdc, req);
+			}
+		}
+	}
+	mutex_unlock(&osdc->request_mutex);
+	up_read(&osdc->map_sem);
+	return rc;
+}
+
+/*
+ * wait for a request to complete
+ */
+int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
+			   struct ceph_osd_request *req)
+{
+	int rc;
+
+	rc = wait_for_completion_interruptible(&req->r_completion);
+	if (rc < 0) {
+		mutex_lock(&osdc->request_mutex);
+		__cancel_request(req);
+		__unregister_request(osdc, req);
+		mutex_unlock(&osdc->request_mutex);
+		dout("wait_request tid %llu canceled/timed out\n", req->r_tid);
+		return rc;
+	}
+
+	dout("wait_request tid %llu result %d\n", req->r_tid, req->r_result);
+	return req->r_result;
+}
+
+/*
+ * sync - wait for all in-flight requests to flush.  avoid starvation.
+ */
+void ceph_osdc_sync(struct ceph_osd_client *osdc)
+{
+	struct ceph_osd_request *req;
+	u64 last_tid, next_tid = 0;
+
+	mutex_lock(&osdc->request_mutex);
+	last_tid = osdc->last_tid;
+	while (1) {
+		req = __lookup_request_ge(osdc, next_tid);
+		if (!req)
+			break;
+		if (req->r_tid > last_tid)
+			break;
+
+		next_tid = req->r_tid + 1;
+		if ((req->r_flags & CEPH_OSD_FLAG_WRITE) == 0)
+			continue;
+
+		ceph_osdc_get_request(req);
+		mutex_unlock(&osdc->request_mutex);
+		dout("sync waiting on tid %llu (last is %llu)\n",
+		     req->r_tid, last_tid);
+		wait_for_completion(&req->r_safe_completion);
+		mutex_lock(&osdc->request_mutex);
+		ceph_osdc_put_request(req);
+	}
+	mutex_unlock(&osdc->request_mutex);
+	dout("sync done (thru tid %llu)\n", last_tid);
+}
+
+/*
+ * init, shutdown
+ */
+int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
+{
+	int err;
+
+	dout("init\n");
+	osdc->client = client;
+	osdc->osdmap = NULL;
+	init_rwsem(&osdc->map_sem);
+	init_completion(&osdc->map_waiters);
+	osdc->last_requested_map = 0;
+	mutex_init(&osdc->request_mutex);
+	osdc->last_tid = 0;
+	osdc->osds = RB_ROOT;
+	INIT_LIST_HEAD(&osdc->osd_lru);
+	osdc->requests = RB_ROOT;
+	INIT_LIST_HEAD(&osdc->req_lru);
+	osdc->num_requests = 0;
+	INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
+	INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);
+
+	schedule_delayed_work(&osdc->osds_timeout_work,
+	   round_jiffies_relative(osdc->client->mount_args->osd_idle_ttl * HZ));
+
+	err = -ENOMEM;
+	osdc->req_mempool = mempool_create_kmalloc_pool(10,
+					sizeof(struct ceph_osd_request));
+	if (!osdc->req_mempool)
+		goto out;
+
+	err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true,
+				"osd_op");
+	if (err < 0)
+		goto out_mempool;
+	err = ceph_msgpool_init(&osdc->msgpool_op_reply,
+				OSD_OPREPLY_FRONT_LEN, 10, true,
+				"osd_op_reply");
+	if (err < 0)
+		goto out_msgpool;
+	return 0;
+
+out_msgpool:
+	ceph_msgpool_destroy(&osdc->msgpool_op);
+out_mempool:
+	mempool_destroy(osdc->req_mempool);
+out:
+	return err;
+}
+
+void ceph_osdc_stop(struct ceph_osd_client *osdc)
+{
+	cancel_delayed_work_sync(&osdc->timeout_work);
+	cancel_delayed_work_sync(&osdc->osds_timeout_work);
+	if (osdc->osdmap) {
+		ceph_osdmap_destroy(osdc->osdmap);
+		osdc->osdmap = NULL;
+	}
+	remove_old_osds(osdc, 1);
+	mempool_destroy(osdc->req_mempool);
+	ceph_msgpool_destroy(&osdc->msgpool_op);
+	ceph_msgpool_destroy(&osdc->msgpool_op_reply);
+}
+
+/*
+ * Read some contiguous pages.  If we cross a stripe boundary, shorten
+ * *plen.  Return number of bytes read, or error.
+ */
+int ceph_osdc_readpages(struct ceph_osd_client *osdc,
+			struct ceph_vino vino, struct ceph_file_layout *layout,
+			u64 off, u64 *plen,
+			u32 truncate_seq, u64 truncate_size,
+			struct page **pages, int num_pages)
+{
+	struct ceph_osd_request *req;
+	int rc = 0;
+
+	dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino,
+	     vino.snap, off, *plen);
+	req = ceph_osdc_new_request(osdc, layout, vino, off, plen,
+				    CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
+				    NULL, 0, truncate_seq, truncate_size, NULL,
+				    false, 1);
+	if (!req)
+		return -ENOMEM;
+
+	/* it may be a short read due to an object boundary */
+	req->r_pages = pages;
+	num_pages = calc_pages_for(off, *plen);
+	req->r_num_pages = num_pages;
+
+	dout("readpages  final extent is %llu~%llu (%d pages)\n",
+	     off, *plen, req->r_num_pages);
+
+	rc = ceph_osdc_start_request(osdc, req, false);
+	if (!rc)
+		rc = ceph_osdc_wait_request(osdc, req);
+
+	ceph_osdc_put_request(req);
+	dout("readpages result %d\n", rc);
+	return rc;
+}
+
+/*
+ * do a synchronous write on N pages
+ */
+int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
+			 struct ceph_file_layout *layout,
+			 struct ceph_snap_context *snapc,
+			 u64 off, u64 len,
+			 u32 truncate_seq, u64 truncate_size,
+			 struct timespec *mtime,
+			 struct page **pages, int num_pages,
+			 int flags, int do_sync, bool nofail)
+{
+	struct ceph_osd_request *req;
+	int rc = 0;
+
+	BUG_ON(vino.snap != CEPH_NOSNAP);
+	req = ceph_osdc_new_request(osdc, layout, vino, off, &len,
+				    CEPH_OSD_OP_WRITE,
+				    flags | CEPH_OSD_FLAG_ONDISK |
+					    CEPH_OSD_FLAG_WRITE,
+				    snapc, do_sync,
+				    truncate_seq, truncate_size, mtime,
+				    nofail, 1);
+	if (!req)
+		return -ENOMEM;
+
+	/* it may be a short write due to an object boundary */
+	req->r_pages = pages;
+	req->r_num_pages = calc_pages_for(off, len);
+	dout("writepages %llu~%llu (%d pages)\n", off, len,
+	     req->r_num_pages);
+
+	rc = ceph_osdc_start_request(osdc, req, nofail);
+	if (!rc)
+		rc = ceph_osdc_wait_request(osdc, req);
+
+	ceph_osdc_put_request(req);
+	if (rc == 0)
+		rc = len;
+	dout("writepages result %d\n", rc);
+	return rc;
+}
+
+/*
+ * handle incoming message
+ */
+static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
+{
+	struct ceph_osd *osd = con->private;
+	struct ceph_osd_client *osdc;
+	int type = le16_to_cpu(msg->hdr.type);
+
+	if (!osd)
+		goto out;
+	osdc = osd->o_osdc;
+
+	switch (type) {
+	case CEPH_MSG_OSD_MAP:
+		ceph_osdc_handle_map(osdc, msg);
+		break;
+	case CEPH_MSG_OSD_OPREPLY:
+		handle_reply(osdc, msg, con);
+		break;
+
+	default:
+		pr_err("received unknown message type %d %s\n", type,
+		       ceph_msg_type_name(type));
+	}
+out:
+	ceph_msg_put(msg);
+}
+
+/*
+ * lookup and return message for incoming reply.  set up reply message
+ * pages.
+ */
+static struct ceph_msg *get_reply(struct ceph_connection *con,
+				  struct ceph_msg_header *hdr,
+				  int *skip)
+{
+	struct ceph_osd *osd = con->private;
+	struct ceph_osd_client *osdc = osd->o_osdc;
+	struct ceph_msg *m;
+	struct ceph_osd_request *req;
+	int front = le32_to_cpu(hdr->front_len);
+	int data_len = le32_to_cpu(hdr->data_len);
+	u64 tid;
+
+	tid = le64_to_cpu(hdr->tid);
+	mutex_lock(&osdc->request_mutex);
+	req = __lookup_request(osdc, tid);
+	if (!req) {
+		*skip = 1;
+		m = NULL;
+		pr_info("get_reply unknown tid %llu from osd%d\n", tid,
+			osd->o_osd);
+		goto out;
+	}
+
+	if (req->r_con_filling_msg) {
+		dout("get_reply revoking msg %p from old con %p\n",
+		     req->r_reply, req->r_con_filling_msg);
+		ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply);
+		ceph_con_put(req->r_con_filling_msg);
+		req->r_con_filling_msg = NULL;
+	}
+
+	if (front > req->r_reply->front.iov_len) {
+		pr_warning("get_reply front %d > preallocated %d\n",
+			   front, (int)req->r_reply->front.iov_len);
+		m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, GFP_NOFS);
+		if (!m)
+			goto out;
+		ceph_msg_put(req->r_reply);
+		req->r_reply = m;
+	}
+	m = ceph_msg_get(req->r_reply);
+
+	if (data_len > 0) {
+		unsigned data_off = le16_to_cpu(hdr->data_off);
+		int want = calc_pages_for(data_off & ~PAGE_MASK, data_len);
+
+		if (unlikely(req->r_num_pages < want)) {
+			pr_warning("tid %lld reply %d > expected %d pages\n",
+				   tid, want, m->nr_pages);
+			*skip = 1;
+			ceph_msg_put(m);
+			m = NULL;
+			goto out;
+		}
+		m->pages = req->r_pages;
+		m->nr_pages = req->r_num_pages;
+	}
+	*skip = 0;
+	req->r_con_filling_msg = ceph_con_get(con);
+	dout("get_reply tid %lld %p\n", tid, m);
+
+out:
+	mutex_unlock(&osdc->request_mutex);
+	return m;
+
+}
+
+static struct ceph_msg *alloc_msg(struct ceph_connection *con,
+				  struct ceph_msg_header *hdr,
+				  int *skip)
+{
+	struct ceph_osd *osd = con->private;
+	int type = le16_to_cpu(hdr->type);
+	int front = le32_to_cpu(hdr->front_len);
+
+	switch (type) {
+	case CEPH_MSG_OSD_MAP:
+		return ceph_msg_new(type, front, GFP_NOFS);
+	case CEPH_MSG_OSD_OPREPLY:
+		return get_reply(con, hdr, skip);
+	default:
+		pr_info("alloc_msg unexpected msg type %d from osd%d\n", type,
+			osd->o_osd);
+		*skip = 1;
+		return NULL;
+	}
+}
+
+/*
+ * Wrappers to refcount containing ceph_osd struct
+ */
+static struct ceph_connection *get_osd_con(struct ceph_connection *con)
+{
+	struct ceph_osd *osd = con->private;
+	if (get_osd(osd))
+		return con;
+	return NULL;
+}
+
+static void put_osd_con(struct ceph_connection *con)
+{
+	struct ceph_osd *osd = con->private;
+	put_osd(osd);
+}
+
+/*
+ * authentication
+ */
+static int get_authorizer(struct ceph_connection *con,
+	                  void **buf, int *len, int *proto,
+	                  void **reply_buf, int *reply_len, int force_new)
+{
+	struct ceph_osd *o = con->private;
+	struct ceph_osd_client *osdc = o->o_osdc;
+	struct ceph_auth_client *ac = osdc->client->monc.auth;
+	int ret = 0;
+
+	if (force_new && o->o_authorizer) {
+		ac->ops->destroy_authorizer(ac, o->o_authorizer);
+		o->o_authorizer = NULL;
+	}
+	if (o->o_authorizer == NULL) {
+		ret = ac->ops->create_authorizer(
+			ac, CEPH_ENTITY_TYPE_OSD,
+			&o->o_authorizer,
+			&o->o_authorizer_buf,
+			&o->o_authorizer_buf_len,
+			&o->o_authorizer_reply_buf,
+			&o->o_authorizer_reply_buf_len);
+		if (ret)
+		return ret;
+	}
+
+	*proto = ac->protocol;
+	*buf = o->o_authorizer_buf;
+	*len = o->o_authorizer_buf_len;
+	*reply_buf = o->o_authorizer_reply_buf;
+	*reply_len = o->o_authorizer_reply_buf_len;
+	return 0;
+}
+
+
+static int verify_authorizer_reply(struct ceph_connection *con, int len)
+{
+	struct ceph_osd *o = con->private;
+	struct ceph_osd_client *osdc = o->o_osdc;
+	struct ceph_auth_client *ac = osdc->client->monc.auth;
+
+	return ac->ops->verify_authorizer_reply(ac, o->o_authorizer, len);
+}
+
+static int invalidate_authorizer(struct ceph_connection *con)
+{
+	struct ceph_osd *o = con->private;
+	struct ceph_osd_client *osdc = o->o_osdc;
+	struct ceph_auth_client *ac = osdc->client->monc.auth;
+
+	if (ac->ops->invalidate_authorizer)
+		ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD);
+
+	return ceph_monc_validate_auth(&osdc->client->monc);
+}
+
+static const struct ceph_connection_operations osd_con_ops = {
+	.get = get_osd_con,
+	.put = put_osd_con,
+	.dispatch = dispatch,
+	.get_authorizer = get_authorizer,
+	.verify_authorizer_reply = verify_authorizer_reply,
+	.invalidate_authorizer = invalidate_authorizer,
+	.alloc_msg = alloc_msg,
+	.fault = osd_reset,
+};
diff --git a/fs/ceph/osd_client.h b/fs/ceph/osd_client.h
new file mode 100644
index 000000000000..ce776989ef6a
--- /dev/null
+++ b/fs/ceph/osd_client.h
@@ -0,0 +1,167 @@
+#ifndef _FS_CEPH_OSD_CLIENT_H
+#define _FS_CEPH_OSD_CLIENT_H
+
+#include <linux/completion.h>
+#include <linux/kref.h>
+#include <linux/mempool.h>
+#include <linux/rbtree.h>
+
+#include "types.h"
+#include "osdmap.h"
+#include "messenger.h"
+
+struct ceph_msg;
+struct ceph_snap_context;
+struct ceph_osd_request;
+struct ceph_osd_client;
+struct ceph_authorizer;
+
+/*
+ * completion callback for async writepages
+ */
+typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *,
+				     struct ceph_msg *);
+
+/* a given osd we're communicating with */
+struct ceph_osd {
+	atomic_t o_ref;
+	struct ceph_osd_client *o_osdc;
+	int o_osd;
+	int o_incarnation;
+	struct rb_node o_node;
+	struct ceph_connection o_con;
+	struct list_head o_requests;
+	struct list_head o_osd_lru;
+	struct ceph_authorizer *o_authorizer;
+	void *o_authorizer_buf, *o_authorizer_reply_buf;
+	size_t o_authorizer_buf_len, o_authorizer_reply_buf_len;
+	unsigned long lru_ttl;
+	int o_marked_for_keepalive;
+	struct list_head o_keepalive_item;
+};
+
+/* an in-flight request */
+struct ceph_osd_request {
+	u64             r_tid;              /* unique for this client */
+	struct rb_node  r_node;
+	struct list_head r_req_lru_item;
+	struct list_head r_osd_item;
+	struct ceph_osd *r_osd;
+	struct ceph_pg   r_pgid;
+	int              r_pg_osds[CEPH_PG_MAX_SIZE];
+	int              r_num_pg_osds;
+
+	struct ceph_connection *r_con_filling_msg;
+
+	struct ceph_msg  *r_request, *r_reply;
+	int               r_result;
+	int               r_flags;     /* any additional flags for the osd */
+	u32               r_sent;      /* >0 if r_request is sending/sent */
+	int               r_got_reply;
+
+	struct ceph_osd_client *r_osdc;
+	struct kref       r_kref;
+	bool              r_mempool;
+	struct completion r_completion, r_safe_completion;
+	ceph_osdc_callback_t r_callback, r_safe_callback;
+	struct ceph_eversion r_reassert_version;
+	struct list_head  r_unsafe_item;
+
+	struct inode *r_inode;         	      /* for use by callbacks */
+
+	char              r_oid[40];          /* object name */
+	int               r_oid_len;
+	unsigned long     r_stamp;            /* send OR check time */
+	bool              r_resend;           /* msg send failed, needs retry */
+
+	struct ceph_file_layout r_file_layout;
+	struct ceph_snap_context *r_snapc;    /* snap context for writes */
+	unsigned          r_num_pages;        /* size of page array (follows) */
+	struct page     **r_pages;            /* pages for data payload */
+	int               r_pages_from_pool;
+	int               r_own_pages;        /* if true, i own page list */
+};
+
+struct ceph_osd_client {
+	struct ceph_client     *client;
+
+	struct ceph_osdmap     *osdmap;       /* current map */
+	struct rw_semaphore    map_sem;
+	struct completion      map_waiters;
+	u64                    last_requested_map;
+
+	struct mutex           request_mutex;
+	struct rb_root         osds;          /* osds */
+	struct list_head       osd_lru;       /* idle osds */
+	u64                    timeout_tid;   /* tid of timeout triggering rq */
+	u64                    last_tid;      /* tid of last request */
+	struct rb_root         requests;      /* pending requests */
+	struct list_head       req_lru;	      /* pending requests lru */
+	int                    num_requests;
+	struct delayed_work    timeout_work;
+	struct delayed_work    osds_timeout_work;
+#ifdef CONFIG_DEBUG_FS
+	struct dentry 	       *debugfs_file;
+#endif
+
+	mempool_t              *req_mempool;
+
+	struct ceph_msgpool	msgpool_op;
+	struct ceph_msgpool	msgpool_op_reply;
+};
+
+extern int ceph_osdc_init(struct ceph_osd_client *osdc,
+			  struct ceph_client *client);
+extern void ceph_osdc_stop(struct ceph_osd_client *osdc);
+
+extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc,
+				   struct ceph_msg *msg);
+extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
+				 struct ceph_msg *msg);
+
+extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
+				      struct ceph_file_layout *layout,
+				      struct ceph_vino vino,
+				      u64 offset, u64 *len, int op, int flags,
+				      struct ceph_snap_context *snapc,
+				      int do_sync, u32 truncate_seq,
+				      u64 truncate_size,
+				      struct timespec *mtime,
+				      bool use_mempool, int num_reply);
+
+static inline void ceph_osdc_get_request(struct ceph_osd_request *req)
+{
+	kref_get(&req->r_kref);
+}
+extern void ceph_osdc_release_request(struct kref *kref);
+static inline void ceph_osdc_put_request(struct ceph_osd_request *req)
+{
+	kref_put(&req->r_kref, ceph_osdc_release_request);
+}
+
+extern int ceph_osdc_start_request(struct ceph_osd_client *osdc,
+				   struct ceph_osd_request *req,
+				   bool nofail);
+extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
+				  struct ceph_osd_request *req);
+extern void ceph_osdc_sync(struct ceph_osd_client *osdc);
+
+extern int ceph_osdc_readpages(struct ceph_osd_client *osdc,
+			       struct ceph_vino vino,
+			       struct ceph_file_layout *layout,
+			       u64 off, u64 *plen,
+			       u32 truncate_seq, u64 truncate_size,
+			       struct page **pages, int nr_pages);
+
+extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
+				struct ceph_vino vino,
+				struct ceph_file_layout *layout,
+				struct ceph_snap_context *sc,
+				u64 off, u64 len,
+				u32 truncate_seq, u64 truncate_size,
+				struct timespec *mtime,
+				struct page **pages, int nr_pages,
+				int flags, int do_sync, bool nofail);
+
+#endif
+
diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
new file mode 100644
index 000000000000..416d46adbf87
--- /dev/null
+++ b/fs/ceph/osdmap.c
@@ -0,0 +1,1087 @@
+
+#include "ceph_debug.h"
+
+#include <linux/slab.h>
+#include <asm/div64.h>
+
+#include "super.h"
+#include "osdmap.h"
+#include "crush/hash.h"
+#include "crush/mapper.h"
+#include "decode.h"
+
+char *ceph_osdmap_state_str(char *str, int len, int state)
+{
+	int flag = 0;
+
+	if (!len)
+		goto done;
+
+	*str = '\0';
+	if (state) {
+		if (state & CEPH_OSD_EXISTS) {
+			snprintf(str, len, "exists");
+			flag = 1;
+		}
+		if (state & CEPH_OSD_UP) {
+			snprintf(str, len, "%s%s%s", str, (flag ? ", " : ""),
+				 "up");
+			flag = 1;
+		}
+	} else {
+		snprintf(str, len, "doesn't exist");
+	}
+done:
+	return str;
+}
+
+/* maps */
+
+static int calc_bits_of(unsigned t)
+{
+	int b = 0;
+	while (t) {
+		t = t >> 1;
+		b++;
+	}
+	return b;
+}
+
+/*
+ * the foo_mask is the smallest value 2^n-1 that is >= foo.
+ */
+static void calc_pg_masks(struct ceph_pg_pool_info *pi)
+{
+	pi->pg_num_mask = (1 << calc_bits_of(le32_to_cpu(pi->v.pg_num)-1)) - 1;
+	pi->pgp_num_mask =
+		(1 << calc_bits_of(le32_to_cpu(pi->v.pgp_num)-1)) - 1;
+	pi->lpg_num_mask =
+		(1 << calc_bits_of(le32_to_cpu(pi->v.lpg_num)-1)) - 1;
+	pi->lpgp_num_mask =
+		(1 << calc_bits_of(le32_to_cpu(pi->v.lpgp_num)-1)) - 1;
+}
+
+/*
+ * decode crush map
+ */
+static int crush_decode_uniform_bucket(void **p, void *end,
+				       struct crush_bucket_uniform *b)
+{
+	dout("crush_decode_uniform_bucket %p to %p\n", *p, end);
+	ceph_decode_need(p, end, (1+b->h.size) * sizeof(u32), bad);
+	b->item_weight = ceph_decode_32(p);
+	return 0;
+bad:
+	return -EINVAL;
+}
+
+static int crush_decode_list_bucket(void **p, void *end,
+				    struct crush_bucket_list *b)
+{
+	int j;
+	dout("crush_decode_list_bucket %p to %p\n", *p, end);
+	b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
+	if (b->item_weights == NULL)
+		return -ENOMEM;
+	b->sum_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
+	if (b->sum_weights == NULL)
+		return -ENOMEM;
+	ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
+	for (j = 0; j < b->h.size; j++) {
+		b->item_weights[j] = ceph_decode_32(p);
+		b->sum_weights[j] = ceph_decode_32(p);
+	}
+	return 0;
+bad:
+	return -EINVAL;
+}
+
+static int crush_decode_tree_bucket(void **p, void *end,
+				    struct crush_bucket_tree *b)
+{
+	int j;
+	dout("crush_decode_tree_bucket %p to %p\n", *p, end);
+	ceph_decode_32_safe(p, end, b->num_nodes, bad);
+	b->node_weights = kcalloc(b->num_nodes, sizeof(u32), GFP_NOFS);
+	if (b->node_weights == NULL)
+		return -ENOMEM;
+	ceph_decode_need(p, end, b->num_nodes * sizeof(u32), bad);
+	for (j = 0; j < b->num_nodes; j++)
+		b->node_weights[j] = ceph_decode_32(p);
+	return 0;
+bad:
+	return -EINVAL;
+}
+
+static int crush_decode_straw_bucket(void **p, void *end,
+				     struct crush_bucket_straw *b)
+{
+	int j;
+	dout("crush_decode_straw_bucket %p to %p\n", *p, end);
+	b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
+	if (b->item_weights == NULL)
+		return -ENOMEM;
+	b->straws = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
+	if (b->straws == NULL)
+		return -ENOMEM;
+	ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
+	for (j = 0; j < b->h.size; j++) {
+		b->item_weights[j] = ceph_decode_32(p);
+		b->straws[j] = ceph_decode_32(p);
+	}
+	return 0;
+bad:
+	return -EINVAL;
+}
+
+static struct crush_map *crush_decode(void *pbyval, void *end)
+{
+	struct crush_map *c;
+	int err = -EINVAL;
+	int i, j;
+	void **p = &pbyval;
+	void *start = pbyval;
+	u32 magic;
+
+	dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p));
+
+	c = kzalloc(sizeof(*c), GFP_NOFS);
+	if (c == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	ceph_decode_need(p, end, 4*sizeof(u32), bad);
+	magic = ceph_decode_32(p);
+	if (magic != CRUSH_MAGIC) {
+		pr_err("crush_decode magic %x != current %x\n",
+		       (unsigned)magic, (unsigned)CRUSH_MAGIC);
+		goto bad;
+	}
+	c->max_buckets = ceph_decode_32(p);
+	c->max_rules = ceph_decode_32(p);
+	c->max_devices = ceph_decode_32(p);
+
+	c->device_parents = kcalloc(c->max_devices, sizeof(u32), GFP_NOFS);
+	if (c->device_parents == NULL)
+		goto badmem;
+	c->bucket_parents = kcalloc(c->max_buckets, sizeof(u32), GFP_NOFS);
+	if (c->bucket_parents == NULL)
+		goto badmem;
+
+	c->buckets = kcalloc(c->max_buckets, sizeof(*c->buckets), GFP_NOFS);
+	if (c->buckets == NULL)
+		goto badmem;
+	c->rules = kcalloc(c->max_rules, sizeof(*c->rules), GFP_NOFS);
+	if (c->rules == NULL)
+		goto badmem;
+
+	/* buckets */
+	for (i = 0; i < c->max_buckets; i++) {
+		int size = 0;
+		u32 alg;
+		struct crush_bucket *b;
+
+		ceph_decode_32_safe(p, end, alg, bad);
+		if (alg == 0) {
+			c->buckets[i] = NULL;
+			continue;
+		}
+		dout("crush_decode bucket %d off %x %p to %p\n",
+		     i, (int)(*p-start), *p, end);
+
+		switch (alg) {
+		case CRUSH_BUCKET_UNIFORM:
+			size = sizeof(struct crush_bucket_uniform);
+			break;
+		case CRUSH_BUCKET_LIST:
+			size = sizeof(struct crush_bucket_list);
+			break;
+		case CRUSH_BUCKET_TREE:
+			size = sizeof(struct crush_bucket_tree);
+			break;
+		case CRUSH_BUCKET_STRAW:
+			size = sizeof(struct crush_bucket_straw);
+			break;
+		default:
+			err = -EINVAL;
+			goto bad;
+		}
+		BUG_ON(size == 0);
+		b = c->buckets[i] = kzalloc(size, GFP_NOFS);
+		if (b == NULL)
+			goto badmem;
+
+		ceph_decode_need(p, end, 4*sizeof(u32), bad);
+		b->id = ceph_decode_32(p);
+		b->type = ceph_decode_16(p);
+		b->alg = ceph_decode_8(p);
+		b->hash = ceph_decode_8(p);
+		b->weight = ceph_decode_32(p);
+		b->size = ceph_decode_32(p);
+
+		dout("crush_decode bucket size %d off %x %p to %p\n",
+		     b->size, (int)(*p-start), *p, end);
+
+		b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS);
+		if (b->items == NULL)
+			goto badmem;
+		b->perm = kcalloc(b->size, sizeof(u32), GFP_NOFS);
+		if (b->perm == NULL)
+			goto badmem;
+		b->perm_n = 0;
+
+		ceph_decode_need(p, end, b->size*sizeof(u32), bad);
+		for (j = 0; j < b->size; j++)
+			b->items[j] = ceph_decode_32(p);
+
+		switch (b->alg) {
+		case CRUSH_BUCKET_UNIFORM:
+			err = crush_decode_uniform_bucket(p, end,
+				  (struct crush_bucket_uniform *)b);
+			if (err < 0)
+				goto bad;
+			break;
+		case CRUSH_BUCKET_LIST:
+			err = crush_decode_list_bucket(p, end,
+			       (struct crush_bucket_list *)b);
+			if (err < 0)
+				goto bad;
+			break;
+		case CRUSH_BUCKET_TREE:
+			err = crush_decode_tree_bucket(p, end,
+				(struct crush_bucket_tree *)b);
+			if (err < 0)
+				goto bad;
+			break;
+		case CRUSH_BUCKET_STRAW:
+			err = crush_decode_straw_bucket(p, end,
+				(struct crush_bucket_straw *)b);
+			if (err < 0)
+				goto bad;
+			break;
+		}
+	}
+
+	/* rules */
+	dout("rule vec is %p\n", c->rules);
+	for (i = 0; i < c->max_rules; i++) {
+		u32 yes;
+		struct crush_rule *r;
+
+		ceph_decode_32_safe(p, end, yes, bad);
+		if (!yes) {
+			dout("crush_decode NO rule %d off %x %p to %p\n",
+			     i, (int)(*p-start), *p, end);
+			c->rules[i] = NULL;
+			continue;
+		}
+
+		dout("crush_decode rule %d off %x %p to %p\n",
+		     i, (int)(*p-start), *p, end);
+
+		/* len */
+		ceph_decode_32_safe(p, end, yes, bad);
+#if BITS_PER_LONG == 32
+		err = -EINVAL;
+		if (yes > ULONG_MAX / sizeof(struct crush_rule_step))
+			goto bad;
+#endif
+		r = c->rules[i] = kmalloc(sizeof(*r) +
+					  yes*sizeof(struct crush_rule_step),
+					  GFP_NOFS);
+		if (r == NULL)
+			goto badmem;
+		dout(" rule %d is at %p\n", i, r);
+		r->len = yes;
+		ceph_decode_copy_safe(p, end, &r->mask, 4, bad); /* 4 u8's */
+		ceph_decode_need(p, end, r->len*3*sizeof(u32), bad);
+		for (j = 0; j < r->len; j++) {
+			r->steps[j].op = ceph_decode_32(p);
+			r->steps[j].arg1 = ceph_decode_32(p);
+			r->steps[j].arg2 = ceph_decode_32(p);
+		}
+	}
+
+	/* ignore trailing name maps. */
+
+	dout("crush_decode success\n");
+	return c;
+
+badmem:
+	err = -ENOMEM;
+bad:
+	dout("crush_decode fail %d\n", err);
+	crush_destroy(c);
+	return ERR_PTR(err);
+}
+
+/*
+ * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
+ * to a set of osds)
+ */
+static int pgid_cmp(struct ceph_pg l, struct ceph_pg r)
+{
+	u64 a = *(u64 *)&l;
+	u64 b = *(u64 *)&r;
+
+	if (a < b)
+		return -1;
+	if (a > b)
+		return 1;
+	return 0;
+}
+
+static int __insert_pg_mapping(struct ceph_pg_mapping *new,
+			       struct rb_root *root)
+{
+	struct rb_node **p = &root->rb_node;
+	struct rb_node *parent = NULL;
+	struct ceph_pg_mapping *pg = NULL;
+	int c;
+
+	while (*p) {
+		parent = *p;
+		pg = rb_entry(parent, struct ceph_pg_mapping, node);
+		c = pgid_cmp(new->pgid, pg->pgid);
+		if (c < 0)
+			p = &(*p)->rb_left;
+		else if (c > 0)
+			p = &(*p)->rb_right;
+		else
+			return -EEXIST;
+	}
+
+	rb_link_node(&new->node, parent, p);
+	rb_insert_color(&new->node, root);
+	return 0;
+}
+
+static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root,
+						   struct ceph_pg pgid)
+{
+	struct rb_node *n = root->rb_node;
+	struct ceph_pg_mapping *pg;
+	int c;
+
+	while (n) {
+		pg = rb_entry(n, struct ceph_pg_mapping, node);
+		c = pgid_cmp(pgid, pg->pgid);
+		if (c < 0)
+			n = n->rb_left;
+		else if (c > 0)
+			n = n->rb_right;
+		else
+			return pg;
+	}
+	return NULL;
+}
+
+/*
+ * rbtree of pg pool info
+ */
+static int __insert_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *new)
+{
+	struct rb_node **p = &root->rb_node;
+	struct rb_node *parent = NULL;
+	struct ceph_pg_pool_info *pi = NULL;
+
+	while (*p) {
+		parent = *p;
+		pi = rb_entry(parent, struct ceph_pg_pool_info, node);
+		if (new->id < pi->id)
+			p = &(*p)->rb_left;
+		else if (new->id > pi->id)
+			p = &(*p)->rb_right;
+		else
+			return -EEXIST;
+	}
+
+	rb_link_node(&new->node, parent, p);
+	rb_insert_color(&new->node, root);
+	return 0;
+}
+
+static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id)
+{
+	struct ceph_pg_pool_info *pi;
+	struct rb_node *n = root->rb_node;
+
+	while (n) {
+		pi = rb_entry(n, struct ceph_pg_pool_info, node);
+		if (id < pi->id)
+			n = n->rb_left;
+		else if (id > pi->id)
+			n = n->rb_right;
+		else
+			return pi;
+	}
+	return NULL;
+}
+
+static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi)
+{
+	rb_erase(&pi->node, root);
+	kfree(pi->name);
+	kfree(pi);
+}
+
+void __decode_pool(void **p, struct ceph_pg_pool_info *pi)
+{
+	ceph_decode_copy(p, &pi->v, sizeof(pi->v));
+	calc_pg_masks(pi);
+	*p += le32_to_cpu(pi->v.num_snaps) * sizeof(u64);
+	*p += le32_to_cpu(pi->v.num_removed_snap_intervals) * sizeof(u64) * 2;
+}
+
+static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map)
+{
+	struct ceph_pg_pool_info *pi;
+	u32 num, len, pool;
+
+	ceph_decode_32_safe(p, end, num, bad);
+	dout(" %d pool names\n", num);
+	while (num--) {
+		ceph_decode_32_safe(p, end, pool, bad);
+		ceph_decode_32_safe(p, end, len, bad);
+		dout("  pool %d len %d\n", pool, len);
+		pi = __lookup_pg_pool(&map->pg_pools, pool);
+		if (pi) {
+			kfree(pi->name);
+			pi->name = kmalloc(len + 1, GFP_NOFS);
+			if (pi->name) {
+				memcpy(pi->name, *p, len);
+				pi->name[len] = '\0';
+				dout("  name is %s\n", pi->name);
+			}
+		}
+		*p += len;
+	}
+	return 0;
+
+bad:
+	return -EINVAL;
+}
+
+/*
+ * osd map
+ */
+void ceph_osdmap_destroy(struct ceph_osdmap *map)
+{
+	dout("osdmap_destroy %p\n", map);
+	if (map->crush)
+		crush_destroy(map->crush);
+	while (!RB_EMPTY_ROOT(&map->pg_temp)) {
+		struct ceph_pg_mapping *pg =
+			rb_entry(rb_first(&map->pg_temp),
+				 struct ceph_pg_mapping, node);
+		rb_erase(&pg->node, &map->pg_temp);
+		kfree(pg);
+	}
+	while (!RB_EMPTY_ROOT(&map->pg_pools)) {
+		struct ceph_pg_pool_info *pi =
+			rb_entry(rb_first(&map->pg_pools),
+				 struct ceph_pg_pool_info, node);
+		__remove_pg_pool(&map->pg_pools, pi);
+	}
+	kfree(map->osd_state);
+	kfree(map->osd_weight);
+	kfree(map->osd_addr);
+	kfree(map);
+}
+
+/*
+ * adjust max osd value.  reallocate arrays.
+ */
+static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
+{
+	u8 *state;
+	struct ceph_entity_addr *addr;
+	u32 *weight;
+
+	state = kcalloc(max, sizeof(*state), GFP_NOFS);
+	addr = kcalloc(max, sizeof(*addr), GFP_NOFS);
+	weight = kcalloc(max, sizeof(*weight), GFP_NOFS);
+	if (state == NULL || addr == NULL || weight == NULL) {
+		kfree(state);
+		kfree(addr);
+		kfree(weight);
+		return -ENOMEM;
+	}
+
+	/* copy old? */
+	if (map->osd_state) {
+		memcpy(state, map->osd_state, map->max_osd*sizeof(*state));
+		memcpy(addr, map->osd_addr, map->max_osd*sizeof(*addr));
+		memcpy(weight, map->osd_weight, map->max_osd*sizeof(*weight));
+		kfree(map->osd_state);
+		kfree(map->osd_addr);
+		kfree(map->osd_weight);
+	}
+
+	map->osd_state = state;
+	map->osd_weight = weight;
+	map->osd_addr = addr;
+	map->max_osd = max;
+	return 0;
+}
+
+/*
+ * decode a full map.
+ */
+struct ceph_osdmap *osdmap_decode(void **p, void *end)
+{
+	struct ceph_osdmap *map;
+	u16 version;
+	u32 len, max, i;
+	u8 ev;
+	int err = -EINVAL;
+	void *start = *p;
+	struct ceph_pg_pool_info *pi;
+
+	dout("osdmap_decode %p to %p len %d\n", *p, end, (int)(end - *p));
+
+	map = kzalloc(sizeof(*map), GFP_NOFS);
+	if (map == NULL)
+		return ERR_PTR(-ENOMEM);
+	map->pg_temp = RB_ROOT;
+
+	ceph_decode_16_safe(p, end, version, bad);
+	if (version > CEPH_OSDMAP_VERSION) {
+		pr_warning("got unknown v %d > %d of osdmap\n", version,
+			   CEPH_OSDMAP_VERSION);
+		goto bad;
+	}
+
+	ceph_decode_need(p, end, 2*sizeof(u64)+6*sizeof(u32), bad);
+	ceph_decode_copy(p, &map->fsid, sizeof(map->fsid));
+	map->epoch = ceph_decode_32(p);
+	ceph_decode_copy(p, &map->created, sizeof(map->created));
+	ceph_decode_copy(p, &map->modified, sizeof(map->modified));
+
+	ceph_decode_32_safe(p, end, max, bad);
+	while (max--) {
+		ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad);
+		pi = kzalloc(sizeof(*pi), GFP_NOFS);
+		if (!pi)
+			goto bad;
+		pi->id = ceph_decode_32(p);
+		ev = ceph_decode_8(p); /* encoding version */
+		if (ev > CEPH_PG_POOL_VERSION) {
+			pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
+				   ev, CEPH_PG_POOL_VERSION);
+			kfree(pi);
+			goto bad;
+		}
+		__decode_pool(p, pi);
+		__insert_pg_pool(&map->pg_pools, pi);
+	}
+
+	if (version >= 5 && __decode_pool_names(p, end, map) < 0)
+		goto bad;
+
+	ceph_decode_32_safe(p, end, map->pool_max, bad);
+
+	ceph_decode_32_safe(p, end, map->flags, bad);
+
+	max = ceph_decode_32(p);
+
+	/* (re)alloc osd arrays */
+	err = osdmap_set_max_osd(map, max);
+	if (err < 0)
+		goto bad;
+	dout("osdmap_decode max_osd = %d\n", map->max_osd);
+
+	/* osds */
+	err = -EINVAL;
+	ceph_decode_need(p, end, 3*sizeof(u32) +
+			 map->max_osd*(1 + sizeof(*map->osd_weight) +
+				       sizeof(*map->osd_addr)), bad);
+	*p += 4; /* skip length field (should match max) */
+	ceph_decode_copy(p, map->osd_state, map->max_osd);
+
+	*p += 4; /* skip length field (should match max) */
+	for (i = 0; i < map->max_osd; i++)
+		map->osd_weight[i] = ceph_decode_32(p);
+
+	*p += 4; /* skip length field (should match max) */
+	ceph_decode_copy(p, map->osd_addr, map->max_osd*sizeof(*map->osd_addr));
+	for (i = 0; i < map->max_osd; i++)
+		ceph_decode_addr(&map->osd_addr[i]);
+
+	/* pg_temp */
+	ceph_decode_32_safe(p, end, len, bad);
+	for (i = 0; i < len; i++) {
+		int n, j;
+		struct ceph_pg pgid;
+		struct ceph_pg_mapping *pg;
+
+		ceph_decode_need(p, end, sizeof(u32) + sizeof(u64), bad);
+		ceph_decode_copy(p, &pgid, sizeof(pgid));
+		n = ceph_decode_32(p);
+		ceph_decode_need(p, end, n * sizeof(u32), bad);
+		err = -ENOMEM;
+		pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS);
+		if (!pg)
+			goto bad;
+		pg->pgid = pgid;
+		pg->len = n;
+		for (j = 0; j < n; j++)
+			pg->osds[j] = ceph_decode_32(p);
+
+		err = __insert_pg_mapping(pg, &map->pg_temp);
+		if (err)
+			goto bad;
+		dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid, len);
+	}
+
+	/* crush */
+	ceph_decode_32_safe(p, end, len, bad);
+	dout("osdmap_decode crush len %d from off 0x%x\n", len,
+	     (int)(*p - start));
+	ceph_decode_need(p, end, len, bad);
+	map->crush = crush_decode(*p, end);
+	*p += len;
+	if (IS_ERR(map->crush)) {
+		err = PTR_ERR(map->crush);
+		map->crush = NULL;
+		goto bad;
+	}
+
+	/* ignore the rest of the map */
+	*p = end;
+
+	dout("osdmap_decode done %p %p\n", *p, end);
+	return map;
+
+bad:
+	dout("osdmap_decode fail\n");
+	ceph_osdmap_destroy(map);
+	return ERR_PTR(err);
+}
+
+/*
+ * decode and apply an incremental map update.
+ */
+struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
+					     struct ceph_osdmap *map,
+					     struct ceph_messenger *msgr)
+{
+	struct crush_map *newcrush = NULL;
+	struct ceph_fsid fsid;
+	u32 epoch = 0;
+	struct ceph_timespec modified;
+	u32 len, pool;
+	__s32 new_pool_max, new_flags, max;
+	void *start = *p;
+	int err = -EINVAL;
+	u16 version;
+	struct rb_node *rbp;
+
+	ceph_decode_16_safe(p, end, version, bad);
+	if (version > CEPH_OSDMAP_INC_VERSION) {
+		pr_warning("got unknown v %d > %d of inc osdmap\n", version,
+			   CEPH_OSDMAP_INC_VERSION);
+		goto bad;
+	}
+
+	ceph_decode_need(p, end, sizeof(fsid)+sizeof(modified)+2*sizeof(u32),
+			 bad);
+	ceph_decode_copy(p, &fsid, sizeof(fsid));
+	epoch = ceph_decode_32(p);
+	BUG_ON(epoch != map->epoch+1);
+	ceph_decode_copy(p, &modified, sizeof(modified));
+	new_pool_max = ceph_decode_32(p);
+	new_flags = ceph_decode_32(p);
+
+	/* full map? */
+	ceph_decode_32_safe(p, end, len, bad);
+	if (len > 0) {
+		dout("apply_incremental full map len %d, %p to %p\n",
+		     len, *p, end);
+		return osdmap_decode(p, min(*p+len, end));
+	}
+
+	/* new crush? */
+	ceph_decode_32_safe(p, end, len, bad);
+	if (len > 0) {
+		dout("apply_incremental new crush map len %d, %p to %p\n",
+		     len, *p, end);
+		newcrush = crush_decode(*p, min(*p+len, end));
+		if (IS_ERR(newcrush))
+			return ERR_CAST(newcrush);
+		*p += len;
+	}
+
+	/* new flags? */
+	if (new_flags >= 0)
+		map->flags = new_flags;
+	if (new_pool_max >= 0)
+		map->pool_max = new_pool_max;
+
+	ceph_decode_need(p, end, 5*sizeof(u32), bad);
+
+	/* new max? */
+	max = ceph_decode_32(p);
+	if (max >= 0) {
+		err = osdmap_set_max_osd(map, max);
+		if (err < 0)
+			goto bad;
+	}
+
+	map->epoch++;
+	map->modified = map->modified;
+	if (newcrush) {
+		if (map->crush)
+			crush_destroy(map->crush);
+		map->crush = newcrush;
+		newcrush = NULL;
+	}
+
+	/* new_pool */
+	ceph_decode_32_safe(p, end, len, bad);
+	while (len--) {
+		__u8 ev;
+		struct ceph_pg_pool_info *pi;
+
+		ceph_decode_32_safe(p, end, pool, bad);
+		ceph_decode_need(p, end, 1 + sizeof(pi->v), bad);
+		ev = ceph_decode_8(p);  /* encoding version */
+		if (ev > CEPH_PG_POOL_VERSION) {
+			pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
+				   ev, CEPH_PG_POOL_VERSION);
+			goto bad;
+		}
+		pi = __lookup_pg_pool(&map->pg_pools, pool);
+		if (!pi) {
+			pi = kzalloc(sizeof(*pi), GFP_NOFS);
+			if (!pi) {
+				err = -ENOMEM;
+				goto bad;
+			}
+			pi->id = pool;
+			__insert_pg_pool(&map->pg_pools, pi);
+		}
+		__decode_pool(p, pi);
+	}
+	if (version >= 5 && __decode_pool_names(p, end, map) < 0)
+		goto bad;
+
+	/* old_pool */
+	ceph_decode_32_safe(p, end, len, bad);
+	while (len--) {
+		struct ceph_pg_pool_info *pi;
+
+		ceph_decode_32_safe(p, end, pool, bad);
+		pi = __lookup_pg_pool(&map->pg_pools, pool);
+		if (pi)
+			__remove_pg_pool(&map->pg_pools, pi);
+	}
+
+	/* new_up */
+	err = -EINVAL;
+	ceph_decode_32_safe(p, end, len, bad);
+	while (len--) {
+		u32 osd;
+		struct ceph_entity_addr addr;
+		ceph_decode_32_safe(p, end, osd, bad);
+		ceph_decode_copy_safe(p, end, &addr, sizeof(addr), bad);
+		ceph_decode_addr(&addr);
+		pr_info("osd%d up\n", osd);
+		BUG_ON(osd >= map->max_osd);
+		map->osd_state[osd] |= CEPH_OSD_UP;
+		map->osd_addr[osd] = addr;
+	}
+
+	/* new_down */
+	ceph_decode_32_safe(p, end, len, bad);
+	while (len--) {
+		u32 osd;
+		ceph_decode_32_safe(p, end, osd, bad);
+		(*p)++;  /* clean flag */
+		pr_info("osd%d down\n", osd);
+		if (osd < map->max_osd)
+			map->osd_state[osd] &= ~CEPH_OSD_UP;
+	}
+
+	/* new_weight */
+	ceph_decode_32_safe(p, end, len, bad);
+	while (len--) {
+		u32 osd, off;
+		ceph_decode_need(p, end, sizeof(u32)*2, bad);
+		osd = ceph_decode_32(p);
+		off = ceph_decode_32(p);
+		pr_info("osd%d weight 0x%x %s\n", osd, off,
+		     off == CEPH_OSD_IN ? "(in)" :
+		     (off == CEPH_OSD_OUT ? "(out)" : ""));
+		if (osd < map->max_osd)
+			map->osd_weight[osd] = off;
+	}
+
+	/* new_pg_temp */
+	rbp = rb_first(&map->pg_temp);
+	ceph_decode_32_safe(p, end, len, bad);
+	while (len--) {
+		struct ceph_pg_mapping *pg;
+		int j;
+		struct ceph_pg pgid;
+		u32 pglen;
+		ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad);
+		ceph_decode_copy(p, &pgid, sizeof(pgid));
+		pglen = ceph_decode_32(p);
+
+		/* remove any? */
+		while (rbp && pgid_cmp(rb_entry(rbp, struct ceph_pg_mapping,
+						node)->pgid, pgid) <= 0) {
+			struct ceph_pg_mapping *cur =
+				rb_entry(rbp, struct ceph_pg_mapping, node);
+			
+			rbp = rb_next(rbp);
+			dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid);
+			rb_erase(&cur->node, &map->pg_temp);
+			kfree(cur);
+		}
+
+		if (pglen) {
+			/* insert */
+			ceph_decode_need(p, end, pglen*sizeof(u32), bad);
+			pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS);
+			if (!pg) {
+				err = -ENOMEM;
+				goto bad;
+			}
+			pg->pgid = pgid;
+			pg->len = pglen;
+			for (j = 0; j < pglen; j++)
+				pg->osds[j] = ceph_decode_32(p);
+			err = __insert_pg_mapping(pg, &map->pg_temp);
+			if (err) {
+				kfree(pg);
+				goto bad;
+			}
+			dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid,
+			     pglen);
+		}
+	}
+	while (rbp) {
+		struct ceph_pg_mapping *cur =
+			rb_entry(rbp, struct ceph_pg_mapping, node);
+
+		rbp = rb_next(rbp);
+		dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid);
+		rb_erase(&cur->node, &map->pg_temp);
+		kfree(cur);
+	}
+
+	/* ignore the rest */
+	*p = end;
+	return map;
+
+bad:
+	pr_err("corrupt inc osdmap epoch %d off %d (%p of %p-%p)\n",
+	       epoch, (int)(*p - start), *p, start, end);
+	print_hex_dump(KERN_DEBUG, "osdmap: ",
+		       DUMP_PREFIX_OFFSET, 16, 1,
+		       start, end - start, true);
+	if (newcrush)
+		crush_destroy(newcrush);
+	return ERR_PTR(err);
+}
+
+
+
+
+/*
+ * calculate file layout from given offset, length.
+ * fill in correct oid, logical length, and object extent
+ * offset, length.
+ *
+ * for now, we write only a single su, until we can
+ * pass a stride back to the caller.
+ */
+void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
+				   u64 off, u64 *plen,
+				   u64 *ono,
+				   u64 *oxoff, u64 *oxlen)
+{
+	u32 osize = le32_to_cpu(layout->fl_object_size);
+	u32 su = le32_to_cpu(layout->fl_stripe_unit);
+	u32 sc = le32_to_cpu(layout->fl_stripe_count);
+	u32 bl, stripeno, stripepos, objsetno;
+	u32 su_per_object;
+	u64 t, su_offset;
+
+	dout("mapping %llu~%llu  osize %u fl_su %u\n", off, *plen,
+	     osize, su);
+	su_per_object = osize / su;
+	dout("osize %u / su %u = su_per_object %u\n", osize, su,
+	     su_per_object);
+
+	BUG_ON((su & ~PAGE_MASK) != 0);
+	/* bl = *off / su; */
+	t = off;
+	do_div(t, su);
+	bl = t;
+	dout("off %llu / su %u = bl %u\n", off, su, bl);
+
+	stripeno = bl / sc;
+	stripepos = bl % sc;
+	objsetno = stripeno / su_per_object;
+
+	*ono = objsetno * sc + stripepos;
+	dout("objset %u * sc %u = ono %u\n", objsetno, sc, (unsigned)*ono);
+
+	/* *oxoff = *off % layout->fl_stripe_unit;  # offset in su */
+	t = off;
+	su_offset = do_div(t, su);
+	*oxoff = su_offset + (stripeno % su_per_object) * su;
+
+	/*
+	 * Calculate the length of the extent being written to the selected
+	 * object. This is the minimum of the full length requested (plen) or
+	 * the remainder of the current stripe being written to.
+	 */
+	*oxlen = min_t(u64, *plen, su - su_offset);
+	*plen = *oxlen;
+
+	dout(" obj extent %llu~%llu\n", *oxoff, *oxlen);
+}
+
+/*
+ * calculate an object layout (i.e. pgid) from an oid,
+ * file_layout, and osdmap
+ */
+int ceph_calc_object_layout(struct ceph_object_layout *ol,
+			    const char *oid,
+			    struct ceph_file_layout *fl,
+			    struct ceph_osdmap *osdmap)
+{
+	unsigned num, num_mask;
+	struct ceph_pg pgid;
+	s32 preferred = (s32)le32_to_cpu(fl->fl_pg_preferred);
+	int poolid = le32_to_cpu(fl->fl_pg_pool);
+	struct ceph_pg_pool_info *pool;
+	unsigned ps;
+
+	BUG_ON(!osdmap);
+
+	pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
+	if (!pool)
+		return -EIO;
+	ps = ceph_str_hash(pool->v.object_hash, oid, strlen(oid));
+	if (preferred >= 0) {
+		ps += preferred;
+		num = le32_to_cpu(pool->v.lpg_num);
+		num_mask = pool->lpg_num_mask;
+	} else {
+		num = le32_to_cpu(pool->v.pg_num);
+		num_mask = pool->pg_num_mask;
+	}
+
+	pgid.ps = cpu_to_le16(ps);
+	pgid.preferred = cpu_to_le16(preferred);
+	pgid.pool = fl->fl_pg_pool;
+	if (preferred >= 0)
+		dout("calc_object_layout '%s' pgid %d.%xp%d\n", oid, poolid, ps,
+		     (int)preferred);
+	else
+		dout("calc_object_layout '%s' pgid %d.%x\n", oid, poolid, ps);
+
+	ol->ol_pgid = pgid;
+	ol->ol_stripe_unit = fl->fl_object_stripe_unit;
+	return 0;
+}
+
+/*
+ * Calculate raw osd vector for the given pgid.  Return pointer to osd
+ * array, or NULL on failure.
+ */
+static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
+			int *osds, int *num)
+{
+	struct ceph_pg_mapping *pg;
+	struct ceph_pg_pool_info *pool;
+	int ruleno;
+	unsigned poolid, ps, pps;
+	int preferred;
+
+	/* pg_temp? */
+	pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
+	if (pg) {
+		*num = pg->len;
+		return pg->osds;
+	}
+
+	/* crush */
+	poolid = le32_to_cpu(pgid.pool);
+	ps = le16_to_cpu(pgid.ps);
+	preferred = (s16)le16_to_cpu(pgid.preferred);
+
+	/* don't forcefeed bad device ids to crush */
+	if (preferred >= osdmap->max_osd ||
+	    preferred >= osdmap->crush->max_devices)
+		preferred = -1;
+
+	pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
+	if (!pool)
+		return NULL;
+	ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset,
+				 pool->v.type, pool->v.size);
+	if (ruleno < 0) {
+		pr_err("no crush rule pool %d type %d size %d\n",
+		       poolid, pool->v.type, pool->v.size);
+		return NULL;
+	}
+
+	if (preferred >= 0)
+		pps = ceph_stable_mod(ps,
+				      le32_to_cpu(pool->v.lpgp_num),
+				      pool->lpgp_num_mask);
+	else
+		pps = ceph_stable_mod(ps,
+				      le32_to_cpu(pool->v.pgp_num),
+				      pool->pgp_num_mask);
+	pps += poolid;
+	*num = crush_do_rule(osdmap->crush, ruleno, pps, osds,
+			     min_t(int, pool->v.size, *num),
+			     preferred, osdmap->osd_weight);
+	return osds;
+}
+
+/*
+ * Return acting set for given pgid.
+ */
+int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
+			int *acting)
+{
+	int rawosds[CEPH_PG_MAX_SIZE], *osds;
+	int i, o, num = CEPH_PG_MAX_SIZE;
+
+	osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
+	if (!osds)
+		return -1;
+
+	/* primary is first up osd */
+	o = 0;
+	for (i = 0; i < num; i++)
+		if (ceph_osd_is_up(osdmap, osds[i]))
+			acting[o++] = osds[i];
+	return o;
+}
+
+/*
+ * Return primary osd for given pgid, or -1 if none.
+ */
+int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
+{
+	int rawosds[CEPH_PG_MAX_SIZE], *osds;
+	int i, num = CEPH_PG_MAX_SIZE;
+
+	osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
+	if (!osds)
+		return -1;
+
+	/* primary is first up osd */
+	for (i = 0; i < num; i++)
+		if (ceph_osd_is_up(osdmap, osds[i]))
+			return osds[i];
+	return -1;
+}
diff --git a/fs/ceph/osdmap.h b/fs/ceph/osdmap.h
new file mode 100644
index 000000000000..970b547e510d
--- /dev/null
+++ b/fs/ceph/osdmap.h
@@ -0,0 +1,128 @@
+#ifndef _FS_CEPH_OSDMAP_H
+#define _FS_CEPH_OSDMAP_H
+
+#include <linux/rbtree.h>
+#include "types.h"
+#include "ceph_fs.h"
+#include "crush/crush.h"
+
+/*
+ * The osd map describes the current membership of the osd cluster and
+ * specifies the mapping of objects to placement groups and placement
+ * groups to (sets of) osds.  That is, it completely specifies the
+ * (desired) distribution of all data objects in the system at some
+ * point in time.
+ *
+ * Each map version is identified by an epoch, which increases monotonically.
+ *
+ * The map can be updated either via an incremental map (diff) describing
+ * the change between two successive epochs, or as a fully encoded map.
+ */
+struct ceph_pg_pool_info {
+	struct rb_node node;
+	int id;
+	struct ceph_pg_pool v;
+	int pg_num_mask, pgp_num_mask, lpg_num_mask, lpgp_num_mask;
+	char *name;
+};
+
+struct ceph_pg_mapping {
+	struct rb_node node;
+	struct ceph_pg pgid;
+	int len;
+	int osds[];
+};
+
+struct ceph_osdmap {
+	struct ceph_fsid fsid;
+	u32 epoch;
+	u32 mkfs_epoch;
+	struct ceph_timespec created, modified;
+
+	u32 flags;         /* CEPH_OSDMAP_* */
+
+	u32 max_osd;       /* size of osd_state, _offload, _addr arrays */
+	u8 *osd_state;     /* CEPH_OSD_* */
+	u32 *osd_weight;   /* 0 = failed, 0x10000 = 100% normal */
+	struct ceph_entity_addr *osd_addr;
+
+	struct rb_root pg_temp;
+	struct rb_root pg_pools;
+	u32 pool_max;
+
+	/* the CRUSH map specifies the mapping of placement groups to
+	 * the list of osds that store+replicate them. */
+	struct crush_map *crush;
+};
+
+/*
+ * file layout helpers
+ */
+#define ceph_file_layout_su(l) ((__s32)le32_to_cpu((l).fl_stripe_unit))
+#define ceph_file_layout_stripe_count(l) \
+	((__s32)le32_to_cpu((l).fl_stripe_count))
+#define ceph_file_layout_object_size(l) ((__s32)le32_to_cpu((l).fl_object_size))
+#define ceph_file_layout_cas_hash(l) ((__s32)le32_to_cpu((l).fl_cas_hash))
+#define ceph_file_layout_object_su(l) \
+	((__s32)le32_to_cpu((l).fl_object_stripe_unit))
+#define ceph_file_layout_pg_preferred(l) \
+	((__s32)le32_to_cpu((l).fl_pg_preferred))
+#define ceph_file_layout_pg_pool(l) \
+	((__s32)le32_to_cpu((l).fl_pg_pool))
+
+static inline unsigned ceph_file_layout_stripe_width(struct ceph_file_layout *l)
+{
+	return le32_to_cpu(l->fl_stripe_unit) *
+		le32_to_cpu(l->fl_stripe_count);
+}
+
+/* "period" == bytes before i start on a new set of objects */
+static inline unsigned ceph_file_layout_period(struct ceph_file_layout *l)
+{
+	return le32_to_cpu(l->fl_object_size) *
+		le32_to_cpu(l->fl_stripe_count);
+}
+
+
+static inline int ceph_osd_is_up(struct ceph_osdmap *map, int osd)
+{
+	return (osd < map->max_osd) && (map->osd_state[osd] & CEPH_OSD_UP);
+}
+
+static inline bool ceph_osdmap_flag(struct ceph_osdmap *map, int flag)
+{
+	return map && (map->flags & flag);
+}
+
+extern char *ceph_osdmap_state_str(char *str, int len, int state);
+
+static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map,
+						     int osd)
+{
+	if (osd >= map->max_osd)
+		return NULL;
+	return &map->osd_addr[osd];
+}
+
+extern struct ceph_osdmap *osdmap_decode(void **p, void *end);
+extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
+					    struct ceph_osdmap *map,
+					    struct ceph_messenger *msgr);
+extern void ceph_osdmap_destroy(struct ceph_osdmap *map);
+
+/* calculate mapping of a file extent to an object */
+extern void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
+					  u64 off, u64 *plen,
+					  u64 *bno, u64 *oxoff, u64 *oxlen);
+
+/* calculate mapping of object to a placement group */
+extern int ceph_calc_object_layout(struct ceph_object_layout *ol,
+				   const char *oid,
+				   struct ceph_file_layout *fl,
+				   struct ceph_osdmap *osdmap);
+extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
+			       int *acting);
+extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap,
+				struct ceph_pg pgid);
+
+#endif
diff --git a/fs/ceph/pagelist.c b/fs/ceph/pagelist.c
new file mode 100644
index 000000000000..b6859f47d364
--- /dev/null
+++ b/fs/ceph/pagelist.c
@@ -0,0 +1,55 @@
+
+#include <linux/gfp.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+
+#include "pagelist.h"
+
+int ceph_pagelist_release(struct ceph_pagelist *pl)
+{
+	if (pl->mapped_tail)
+		kunmap(pl->mapped_tail);
+	while (!list_empty(&pl->head)) {
+		struct page *page = list_first_entry(&pl->head, struct page,
+						     lru);
+		list_del(&page->lru);
+		__free_page(page);
+	}
+	return 0;
+}
+
+static int ceph_pagelist_addpage(struct ceph_pagelist *pl)
+{
+	struct page *page = __page_cache_alloc(GFP_NOFS);
+	if (!page)
+		return -ENOMEM;
+	pl->room += PAGE_SIZE;
+	list_add_tail(&page->lru, &pl->head);
+	if (pl->mapped_tail)
+		kunmap(pl->mapped_tail);
+	pl->mapped_tail = kmap(page);
+	return 0;
+}
+
+int ceph_pagelist_append(struct ceph_pagelist *pl, void *buf, size_t len)
+{
+	while (pl->room < len) {
+		size_t bit = pl->room;
+		int ret;
+
+		memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK),
+		       buf, bit);
+		pl->length += bit;
+		pl->room -= bit;
+		buf += bit;
+		len -= bit;
+		ret = ceph_pagelist_addpage(pl);
+		if (ret)
+			return ret;
+	}
+
+	memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK), buf, len);
+	pl->length += len;
+	pl->room -= len;
+	return 0;
+}
diff --git a/fs/ceph/pagelist.h b/fs/ceph/pagelist.h
new file mode 100644
index 000000000000..e8a4187e1087
--- /dev/null
+++ b/fs/ceph/pagelist.h
@@ -0,0 +1,54 @@
+#ifndef __FS_CEPH_PAGELIST_H
+#define __FS_CEPH_PAGELIST_H
+
+#include <linux/list.h>
+
+struct ceph_pagelist {
+	struct list_head head;
+	void *mapped_tail;
+	size_t length;
+	size_t room;
+};
+
+static inline void ceph_pagelist_init(struct ceph_pagelist *pl)
+{
+	INIT_LIST_HEAD(&pl->head);
+	pl->mapped_tail = NULL;
+	pl->length = 0;
+	pl->room = 0;
+}
+extern int ceph_pagelist_release(struct ceph_pagelist *pl);
+
+extern int ceph_pagelist_append(struct ceph_pagelist *pl, void *d, size_t l);
+
+static inline int ceph_pagelist_encode_64(struct ceph_pagelist *pl, u64 v)
+{
+	__le64 ev = cpu_to_le64(v);
+	return ceph_pagelist_append(pl, &ev, sizeof(ev));
+}
+static inline int ceph_pagelist_encode_32(struct ceph_pagelist *pl, u32 v)
+{
+	__le32 ev = cpu_to_le32(v);
+	return ceph_pagelist_append(pl, &ev, sizeof(ev));
+}
+static inline int ceph_pagelist_encode_16(struct ceph_pagelist *pl, u16 v)
+{
+	__le16 ev = cpu_to_le16(v);
+	return ceph_pagelist_append(pl, &ev, sizeof(ev));
+}
+static inline int ceph_pagelist_encode_8(struct ceph_pagelist *pl, u8 v)
+{
+	return ceph_pagelist_append(pl, &v, 1);
+}
+static inline int ceph_pagelist_encode_string(struct ceph_pagelist *pl,
+					      char *s, size_t len)
+{
+	int ret = ceph_pagelist_encode_32(pl, len);
+	if (ret)
+		return ret;
+	if (len)
+		return ceph_pagelist_append(pl, s, len);
+	return 0;
+}
+
+#endif
diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h
new file mode 100644
index 000000000000..8fcc023056c7
--- /dev/null
+++ b/fs/ceph/rados.h
@@ -0,0 +1,396 @@
+#ifndef __RADOS_H
+#define __RADOS_H
+
+/*
+ * Data types for the Ceph distributed object storage layer RADOS
+ * (Reliable Autonomic Distributed Object Store).
+ */
+
+#include "msgr.h"
+
+/*
+ * osdmap encoding versions
+ */
+#define CEPH_OSDMAP_INC_VERSION     5
+#define CEPH_OSDMAP_INC_VERSION_EXT 5
+#define CEPH_OSDMAP_VERSION         5
+#define CEPH_OSDMAP_VERSION_EXT     5
+
+/*
+ * fs id
+ */
+struct ceph_fsid {
+	unsigned char fsid[16];
+};
+
+static inline int ceph_fsid_compare(const struct ceph_fsid *a,
+				    const struct ceph_fsid *b)
+{
+	return memcmp(a, b, sizeof(*a));
+}
+
+/*
+ * ino, object, etc.
+ */
+typedef __le64 ceph_snapid_t;
+#define CEPH_SNAPDIR ((__u64)(-1))  /* reserved for hidden .snap dir */
+#define CEPH_NOSNAP  ((__u64)(-2))  /* "head", "live" revision */
+#define CEPH_MAXSNAP ((__u64)(-3))  /* largest valid snapid */
+
+struct ceph_timespec {
+	__le32 tv_sec;
+	__le32 tv_nsec;
+} __attribute__ ((packed));
+
+
+/*
+ * object layout - how objects are mapped into PGs
+ */
+#define CEPH_OBJECT_LAYOUT_HASH     1
+#define CEPH_OBJECT_LAYOUT_LINEAR   2
+#define CEPH_OBJECT_LAYOUT_HASHINO  3
+
+/*
+ * pg layout -- how PGs are mapped onto (sets of) OSDs
+ */
+#define CEPH_PG_LAYOUT_CRUSH  0
+#define CEPH_PG_LAYOUT_HASH   1
+#define CEPH_PG_LAYOUT_LINEAR 2
+#define CEPH_PG_LAYOUT_HYBRID 3
+
+#define CEPH_PG_MAX_SIZE      16  /* max # osds in a single pg */
+
+/*
+ * placement group.
+ * we encode this into one __le64.
+ */
+struct ceph_pg {
+	__le16 preferred; /* preferred primary osd */
+	__le16 ps;        /* placement seed */
+	__le32 pool;      /* object pool */
+} __attribute__ ((packed));
+
+/*
+ * pg_pool is a set of pgs storing a pool of objects
+ *
+ *  pg_num -- base number of pseudorandomly placed pgs
+ *
+ *  pgp_num -- effective number when calculating pg placement.  this
+ * is used for pg_num increases.  new pgs result in data being "split"
+ * into new pgs.  for this to proceed smoothly, new pgs are intiially
+ * colocated with their parents; that is, pgp_num doesn't increase
+ * until the new pgs have successfully split.  only _then_ are the new
+ * pgs placed independently.
+ *
+ *  lpg_num -- localized pg count (per device).  replicas are randomly
+ * selected.
+ *
+ *  lpgp_num -- as above.
+ */
+#define CEPH_PG_TYPE_REP     1
+#define CEPH_PG_TYPE_RAID4   2
+#define CEPH_PG_POOL_VERSION 2
+struct ceph_pg_pool {
+	__u8 type;                /* CEPH_PG_TYPE_* */
+	__u8 size;                /* number of osds in each pg */
+	__u8 crush_ruleset;       /* crush placement rule */
+	__u8 object_hash;         /* hash mapping object name to ps */
+	__le32 pg_num, pgp_num;   /* number of pg's */
+	__le32 lpg_num, lpgp_num; /* number of localized pg's */
+	__le32 last_change;       /* most recent epoch changed */
+	__le64 snap_seq;          /* seq for per-pool snapshot */
+	__le32 snap_epoch;        /* epoch of last snap */
+	__le32 num_snaps;
+	__le32 num_removed_snap_intervals; /* if non-empty, NO per-pool snaps */
+	__le64 auid;               /* who owns the pg */
+} __attribute__ ((packed));
+
+/*
+ * stable_mod func is used to control number of placement groups.
+ * similar to straight-up modulo, but produces a stable mapping as b
+ * increases over time.  b is the number of bins, and bmask is the
+ * containing power of 2 minus 1.
+ *
+ * b <= bmask and bmask=(2**n)-1
+ * e.g., b=12 -> bmask=15, b=123 -> bmask=127
+ */
+static inline int ceph_stable_mod(int x, int b, int bmask)
+{
+	if ((x & bmask) < b)
+		return x & bmask;
+	else
+		return x & (bmask >> 1);
+}
+
+/*
+ * object layout - how a given object should be stored.
+ */
+struct ceph_object_layout {
+	struct ceph_pg ol_pgid;   /* raw pg, with _full_ ps precision. */
+	__le32 ol_stripe_unit;    /* for per-object parity, if any */
+} __attribute__ ((packed));
+
+/*
+ * compound epoch+version, used by storage layer to serialize mutations
+ */
+struct ceph_eversion {
+	__le32 epoch;
+	__le64 version;
+} __attribute__ ((packed));
+
+/*
+ * osd map bits
+ */
+
+/* status bits */
+#define CEPH_OSD_EXISTS 1
+#define CEPH_OSD_UP     2
+
+/* osd weights.  fixed point value: 0x10000 == 1.0 ("in"), 0 == "out" */
+#define CEPH_OSD_IN  0x10000
+#define CEPH_OSD_OUT 0
+
+
+/*
+ * osd map flag bits
+ */
+#define CEPH_OSDMAP_NEARFULL (1<<0)  /* sync writes (near ENOSPC) */
+#define CEPH_OSDMAP_FULL     (1<<1)  /* no data writes (ENOSPC) */
+#define CEPH_OSDMAP_PAUSERD  (1<<2)  /* pause all reads */
+#define CEPH_OSDMAP_PAUSEWR  (1<<3)  /* pause all writes */
+#define CEPH_OSDMAP_PAUSEREC (1<<4)  /* pause recovery */
+
+/*
+ * osd ops
+ */
+#define CEPH_OSD_OP_MODE       0xf000
+#define CEPH_OSD_OP_MODE_RD    0x1000
+#define CEPH_OSD_OP_MODE_WR    0x2000
+#define CEPH_OSD_OP_MODE_RMW   0x3000
+#define CEPH_OSD_OP_MODE_SUB   0x4000
+
+#define CEPH_OSD_OP_TYPE       0x0f00
+#define CEPH_OSD_OP_TYPE_LOCK  0x0100
+#define CEPH_OSD_OP_TYPE_DATA  0x0200
+#define CEPH_OSD_OP_TYPE_ATTR  0x0300
+#define CEPH_OSD_OP_TYPE_EXEC  0x0400
+#define CEPH_OSD_OP_TYPE_PG    0x0500
+
+enum {
+	/** data **/
+	/* read */
+	CEPH_OSD_OP_READ      = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 1,
+	CEPH_OSD_OP_STAT      = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 2,
+
+	/* fancy read */
+	CEPH_OSD_OP_MASKTRUNC = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 4,
+
+	/* write */
+	CEPH_OSD_OP_WRITE     = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 1,
+	CEPH_OSD_OP_WRITEFULL = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 2,
+	CEPH_OSD_OP_TRUNCATE  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 3,
+	CEPH_OSD_OP_ZERO      = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 4,
+	CEPH_OSD_OP_DELETE    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 5,
+
+	/* fancy write */
+	CEPH_OSD_OP_APPEND    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 6,
+	CEPH_OSD_OP_STARTSYNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 7,
+	CEPH_OSD_OP_SETTRUNC  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 8,
+	CEPH_OSD_OP_TRIMTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 9,
+
+	CEPH_OSD_OP_TMAPUP  = CEPH_OSD_OP_MODE_RMW | CEPH_OSD_OP_TYPE_DATA | 10,
+	CEPH_OSD_OP_TMAPPUT = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 11,
+	CEPH_OSD_OP_TMAPGET = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 12,
+
+	CEPH_OSD_OP_CREATE  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 13,
+
+	/** attrs **/
+	/* read */
+	CEPH_OSD_OP_GETXATTR  = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1,
+	CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2,
+	CEPH_OSD_OP_CMPXATTR  = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 3,
+
+	/* write */
+	CEPH_OSD_OP_SETXATTR  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1,
+	CEPH_OSD_OP_SETXATTRS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 2,
+	CEPH_OSD_OP_RESETXATTRS = CEPH_OSD_OP_MODE_WR|CEPH_OSD_OP_TYPE_ATTR | 3,
+	CEPH_OSD_OP_RMXATTR   = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 4,
+
+	/** subop **/
+	CEPH_OSD_OP_PULL           = CEPH_OSD_OP_MODE_SUB | 1,
+	CEPH_OSD_OP_PUSH           = CEPH_OSD_OP_MODE_SUB | 2,
+	CEPH_OSD_OP_BALANCEREADS   = CEPH_OSD_OP_MODE_SUB | 3,
+	CEPH_OSD_OP_UNBALANCEREADS = CEPH_OSD_OP_MODE_SUB | 4,
+	CEPH_OSD_OP_SCRUB          = CEPH_OSD_OP_MODE_SUB | 5,
+
+	/** lock **/
+	CEPH_OSD_OP_WRLOCK    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 1,
+	CEPH_OSD_OP_WRUNLOCK  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 2,
+	CEPH_OSD_OP_RDLOCK    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 3,
+	CEPH_OSD_OP_RDUNLOCK  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 4,
+	CEPH_OSD_OP_UPLOCK    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 5,
+	CEPH_OSD_OP_DNLOCK    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 6,
+
+	/** exec **/
+	CEPH_OSD_OP_CALL    = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_EXEC | 1,
+
+	/** pg **/
+	CEPH_OSD_OP_PGLS      = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_PG | 1,
+};
+
+static inline int ceph_osd_op_type_lock(int op)
+{
+	return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_LOCK;
+}
+static inline int ceph_osd_op_type_data(int op)
+{
+	return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_DATA;
+}
+static inline int ceph_osd_op_type_attr(int op)
+{
+	return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_ATTR;
+}
+static inline int ceph_osd_op_type_exec(int op)
+{
+	return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_EXEC;
+}
+static inline int ceph_osd_op_type_pg(int op)
+{
+	return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_PG;
+}
+
+static inline int ceph_osd_op_mode_subop(int op)
+{
+	return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_SUB;
+}
+static inline int ceph_osd_op_mode_read(int op)
+{
+	return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_RD;
+}
+static inline int ceph_osd_op_mode_modify(int op)
+{
+	return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_WR;
+}
+
+#define CEPH_OSD_TMAP_HDR 'h'
+#define CEPH_OSD_TMAP_SET 's'
+#define CEPH_OSD_TMAP_RM  'r'
+
+extern const char *ceph_osd_op_name(int op);
+
+
+/*
+ * osd op flags
+ *
+ * An op may be READ, WRITE, or READ|WRITE.
+ */
+enum {
+	CEPH_OSD_FLAG_ACK = 1,          /* want (or is) "ack" ack */
+	CEPH_OSD_FLAG_ONNVRAM = 2,      /* want (or is) "onnvram" ack */
+	CEPH_OSD_FLAG_ONDISK = 4,       /* want (or is) "ondisk" ack */
+	CEPH_OSD_FLAG_RETRY = 8,        /* resend attempt */
+	CEPH_OSD_FLAG_READ = 16,        /* op may read */
+	CEPH_OSD_FLAG_WRITE = 32,       /* op may write */
+	CEPH_OSD_FLAG_ORDERSNAP = 64,   /* EOLDSNAP if snapc is out of order */
+	CEPH_OSD_FLAG_PEERSTAT = 128,   /* msg includes osd_peer_stat */
+	CEPH_OSD_FLAG_BALANCE_READS = 256,
+	CEPH_OSD_FLAG_PARALLELEXEC = 512, /* execute op in parallel */
+	CEPH_OSD_FLAG_PGOP = 1024,      /* pg op, no object */
+	CEPH_OSD_FLAG_EXEC = 2048,      /* op may exec */
+};
+
+enum {
+	CEPH_OSD_OP_FLAG_EXCL = 1,      /* EXCL object create */
+};
+
+#define EOLDSNAPC    ERESTART  /* ORDERSNAP flag set; writer has old snapc*/
+#define EBLACKLISTED ESHUTDOWN /* blacklisted */
+
+/* xattr comparison */
+enum {
+	CEPH_OSD_CMPXATTR_OP_NOP = 0,
+	CEPH_OSD_CMPXATTR_OP_EQ  = 1,
+	CEPH_OSD_CMPXATTR_OP_NE  = 2,
+	CEPH_OSD_CMPXATTR_OP_GT  = 3,
+	CEPH_OSD_CMPXATTR_OP_GTE = 4,
+	CEPH_OSD_CMPXATTR_OP_LT  = 5,
+	CEPH_OSD_CMPXATTR_OP_LTE = 6
+};
+
+enum {
+	CEPH_OSD_CMPXATTR_MODE_STRING = 1,
+	CEPH_OSD_CMPXATTR_MODE_U64    = 2
+};
+
+/*
+ * an individual object operation.  each may be accompanied by some data
+ * payload
+ */
+struct ceph_osd_op {
+	__le16 op;           /* CEPH_OSD_OP_* */
+	__le32 flags;        /* CEPH_OSD_FLAG_* */
+	union {
+		struct {
+			__le64 offset, length;
+			__le64 truncate_size;
+			__le32 truncate_seq;
+		} __attribute__ ((packed)) extent;
+		struct {
+			__le32 name_len;
+			__le32 value_len;
+			__u8 cmp_op;       /* CEPH_OSD_CMPXATTR_OP_* */
+			__u8 cmp_mode;     /* CEPH_OSD_CMPXATTR_MODE_* */
+		} __attribute__ ((packed)) xattr;
+		struct {
+			__u8 class_len;
+			__u8 method_len;
+			__u8 argc;
+			__le32 indata_len;
+		} __attribute__ ((packed)) cls;
+		struct {
+			__le64 cookie, count;
+		} __attribute__ ((packed)) pgls;
+	};
+	__le32 payload_len;
+} __attribute__ ((packed));
+
+/*
+ * osd request message header.  each request may include multiple
+ * ceph_osd_op object operations.
+ */
+struct ceph_osd_request_head {
+	__le32 client_inc;                 /* client incarnation */
+	struct ceph_object_layout layout;  /* pgid */
+	__le32 osdmap_epoch;               /* client's osdmap epoch */
+
+	__le32 flags;
+
+	struct ceph_timespec mtime;        /* for mutations only */
+	struct ceph_eversion reassert_version; /* if we are replaying op */
+
+	__le32 object_len;     /* length of object name */
+
+	__le64 snapid;         /* snapid to read */
+	__le64 snap_seq;       /* writer's snap context */
+	__le32 num_snaps;
+
+	__le16 num_ops;
+	struct ceph_osd_op ops[];  /* followed by ops[], obj, ticket, snaps */
+} __attribute__ ((packed));
+
+struct ceph_osd_reply_head {
+	__le32 client_inc;                /* client incarnation */
+	__le32 flags;
+	struct ceph_object_layout layout;
+	__le32 osdmap_epoch;
+	struct ceph_eversion reassert_version; /* for replaying uncommitted */
+
+	__le32 result;                    /* result code */
+
+	__le32 object_len;                /* length of object name */
+	__le32 num_ops;
+	struct ceph_osd_op ops[0];  /* ops[], object */
+} __attribute__ ((packed));
+
+
+#endif
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
new file mode 100644
index 000000000000..c0b26b6badba
--- /dev/null
+++ b/fs/ceph/snap.c
@@ -0,0 +1,911 @@
+#include "ceph_debug.h"
+
+#include <linux/sort.h>
+#include <linux/slab.h>
+
+#include "super.h"
+#include "decode.h"
+
+/*
+ * Snapshots in ceph are driven in large part by cooperation from the
+ * client.  In contrast to local file systems or file servers that
+ * implement snapshots at a single point in the system, ceph's
+ * distributed access to storage requires clients to help decide
+ * whether a write logically occurs before or after a recently created
+ * snapshot.
+ *
+ * This provides a perfect instantanous client-wide snapshot.  Between
+ * clients, however, snapshots may appear to be applied at slightly
+ * different points in time, depending on delays in delivering the
+ * snapshot notification.
+ *
+ * Snapshots are _not_ file system-wide.  Instead, each snapshot
+ * applies to the subdirectory nested beneath some directory.  This
+ * effectively divides the hierarchy into multiple "realms," where all
+ * of the files contained by each realm share the same set of
+ * snapshots.  An individual realm's snap set contains snapshots
+ * explicitly created on that realm, as well as any snaps in its
+ * parent's snap set _after_ the point at which the parent became it's
+ * parent (due to, say, a rename).  Similarly, snaps from prior parents
+ * during the time intervals during which they were the parent are included.
+ *
+ * The client is spared most of this detail, fortunately... it must only
+ * maintains a hierarchy of realms reflecting the current parent/child
+ * realm relationship, and for each realm has an explicit list of snaps
+ * inherited from prior parents.
+ *
+ * A snap_realm struct is maintained for realms containing every inode
+ * with an open cap in the system.  (The needed snap realm information is
+ * provided by the MDS whenever a cap is issued, i.e., on open.)  A 'seq'
+ * version number is used to ensure that as realm parameters change (new
+ * snapshot, new parent, etc.) the client's realm hierarchy is updated.
+ *
+ * The realm hierarchy drives the generation of a 'snap context' for each
+ * realm, which simply lists the resulting set of snaps for the realm.  This
+ * is attached to any writes sent to OSDs.
+ */
+/*
+ * Unfortunately error handling is a bit mixed here.  If we get a snap
+ * update, but don't have enough memory to update our realm hierarchy,
+ * it's not clear what we can do about it (besides complaining to the
+ * console).
+ */
+
+
+/*
+ * increase ref count for the realm
+ *
+ * caller must hold snap_rwsem for write.
+ */
+void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
+			 struct ceph_snap_realm *realm)
+{
+	dout("get_realm %p %d -> %d\n", realm,
+	     atomic_read(&realm->nref), atomic_read(&realm->nref)+1);
+	/*
+	 * since we _only_ increment realm refs or empty the empty
+	 * list with snap_rwsem held, adjusting the empty list here is
+	 * safe.  we do need to protect against concurrent empty list
+	 * additions, however.
+	 */
+	if (atomic_read(&realm->nref) == 0) {
+		spin_lock(&mdsc->snap_empty_lock);
+		list_del_init(&realm->empty_item);
+		spin_unlock(&mdsc->snap_empty_lock);
+	}
+
+	atomic_inc(&realm->nref);
+}
+
+static void __insert_snap_realm(struct rb_root *root,
+				struct ceph_snap_realm *new)
+{
+	struct rb_node **p = &root->rb_node;
+	struct rb_node *parent = NULL;
+	struct ceph_snap_realm *r = NULL;
+
+	while (*p) {
+		parent = *p;
+		r = rb_entry(parent, struct ceph_snap_realm, node);
+		if (new->ino < r->ino)
+			p = &(*p)->rb_left;
+		else if (new->ino > r->ino)
+			p = &(*p)->rb_right;
+		else
+			BUG();
+	}
+
+	rb_link_node(&new->node, parent, p);
+	rb_insert_color(&new->node, root);
+}
+
+/*
+ * create and get the realm rooted at @ino and bump its ref count.
+ *
+ * caller must hold snap_rwsem for write.
+ */
+static struct ceph_snap_realm *ceph_create_snap_realm(
+	struct ceph_mds_client *mdsc,
+	u64 ino)
+{
+	struct ceph_snap_realm *realm;
+
+	realm = kzalloc(sizeof(*realm), GFP_NOFS);
+	if (!realm)
+		return ERR_PTR(-ENOMEM);
+
+	atomic_set(&realm->nref, 0);    /* tree does not take a ref */
+	realm->ino = ino;
+	INIT_LIST_HEAD(&realm->children);
+	INIT_LIST_HEAD(&realm->child_item);
+	INIT_LIST_HEAD(&realm->empty_item);
+	INIT_LIST_HEAD(&realm->inodes_with_caps);
+	spin_lock_init(&realm->inodes_with_caps_lock);
+	__insert_snap_realm(&mdsc->snap_realms, realm);
+	dout("create_snap_realm %llx %p\n", realm->ino, realm);
+	return realm;
+}
+
+/*
+ * lookup the realm rooted at @ino.
+ *
+ * caller must hold snap_rwsem for write.
+ */
+struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
+					       u64 ino)
+{
+	struct rb_node *n = mdsc->snap_realms.rb_node;
+	struct ceph_snap_realm *r;
+
+	while (n) {
+		r = rb_entry(n, struct ceph_snap_realm, node);
+		if (ino < r->ino)
+			n = n->rb_left;
+		else if (ino > r->ino)
+			n = n->rb_right;
+		else {
+			dout("lookup_snap_realm %llx %p\n", r->ino, r);
+			return r;
+		}
+	}
+	return NULL;
+}
+
+static void __put_snap_realm(struct ceph_mds_client *mdsc,
+			     struct ceph_snap_realm *realm);
+
+/*
+ * called with snap_rwsem (write)
+ */
+static void __destroy_snap_realm(struct ceph_mds_client *mdsc,
+				 struct ceph_snap_realm *realm)
+{
+	dout("__destroy_snap_realm %p %llx\n", realm, realm->ino);
+
+	rb_erase(&realm->node, &mdsc->snap_realms);
+
+	if (realm->parent) {
+		list_del_init(&realm->child_item);
+		__put_snap_realm(mdsc, realm->parent);
+	}
+
+	kfree(realm->prior_parent_snaps);
+	kfree(realm->snaps);
+	ceph_put_snap_context(realm->cached_context);
+	kfree(realm);
+}
+
+/*
+ * caller holds snap_rwsem (write)
+ */
+static void __put_snap_realm(struct ceph_mds_client *mdsc,
+			     struct ceph_snap_realm *realm)
+{
+	dout("__put_snap_realm %llx %p %d -> %d\n", realm->ino, realm,
+	     atomic_read(&realm->nref), atomic_read(&realm->nref)-1);
+	if (atomic_dec_and_test(&realm->nref))
+		__destroy_snap_realm(mdsc, realm);
+}
+
+/*
+ * caller needn't hold any locks
+ */
+void ceph_put_snap_realm(struct ceph_mds_client *mdsc,
+			 struct ceph_snap_realm *realm)
+{
+	dout("put_snap_realm %llx %p %d -> %d\n", realm->ino, realm,
+	     atomic_read(&realm->nref), atomic_read(&realm->nref)-1);
+	if (!atomic_dec_and_test(&realm->nref))
+		return;
+
+	if (down_write_trylock(&mdsc->snap_rwsem)) {
+		__destroy_snap_realm(mdsc, realm);
+		up_write(&mdsc->snap_rwsem);
+	} else {
+		spin_lock(&mdsc->snap_empty_lock);
+		list_add(&mdsc->snap_empty, &realm->empty_item);
+		spin_unlock(&mdsc->snap_empty_lock);
+	}
+}
+
+/*
+ * Clean up any realms whose ref counts have dropped to zero.  Note
+ * that this does not include realms who were created but not yet
+ * used.
+ *
+ * Called under snap_rwsem (write)
+ */
+static void __cleanup_empty_realms(struct ceph_mds_client *mdsc)
+{
+	struct ceph_snap_realm *realm;
+
+	spin_lock(&mdsc->snap_empty_lock);
+	while (!list_empty(&mdsc->snap_empty)) {
+		realm = list_first_entry(&mdsc->snap_empty,
+				   struct ceph_snap_realm, empty_item);
+		list_del(&realm->empty_item);
+		spin_unlock(&mdsc->snap_empty_lock);
+		__destroy_snap_realm(mdsc, realm);
+		spin_lock(&mdsc->snap_empty_lock);
+	}
+	spin_unlock(&mdsc->snap_empty_lock);
+}
+
+void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc)
+{
+	down_write(&mdsc->snap_rwsem);
+	__cleanup_empty_realms(mdsc);
+	up_write(&mdsc->snap_rwsem);
+}
+
+/*
+ * adjust the parent realm of a given @realm.  adjust child list, and parent
+ * pointers, and ref counts appropriately.
+ *
+ * return true if parent was changed, 0 if unchanged, <0 on error.
+ *
+ * caller must hold snap_rwsem for write.
+ */
+static int adjust_snap_realm_parent(struct ceph_mds_client *mdsc,
+				    struct ceph_snap_realm *realm,
+				    u64 parentino)
+{
+	struct ceph_snap_realm *parent;
+
+	if (realm->parent_ino == parentino)
+		return 0;
+
+	parent = ceph_lookup_snap_realm(mdsc, parentino);
+	if (!parent) {
+		parent = ceph_create_snap_realm(mdsc, parentino);
+		if (IS_ERR(parent))
+			return PTR_ERR(parent);
+	}
+	dout("adjust_snap_realm_parent %llx %p: %llx %p -> %llx %p\n",
+	     realm->ino, realm, realm->parent_ino, realm->parent,
+	     parentino, parent);
+	if (realm->parent) {
+		list_del_init(&realm->child_item);
+		ceph_put_snap_realm(mdsc, realm->parent);
+	}
+	realm->parent_ino = parentino;
+	realm->parent = parent;
+	ceph_get_snap_realm(mdsc, parent);
+	list_add(&realm->child_item, &parent->children);
+	return 1;
+}
+
+
+static int cmpu64_rev(const void *a, const void *b)
+{
+	if (*(u64 *)a < *(u64 *)b)
+		return 1;
+	if (*(u64 *)a > *(u64 *)b)
+		return -1;
+	return 0;
+}
+
+/*
+ * build the snap context for a given realm.
+ */
+static int build_snap_context(struct ceph_snap_realm *realm)
+{
+	struct ceph_snap_realm *parent = realm->parent;
+	struct ceph_snap_context *snapc;
+	int err = 0;
+	int i;
+	int num = realm->num_prior_parent_snaps + realm->num_snaps;
+
+	/*
+	 * build parent context, if it hasn't been built.
+	 * conservatively estimate that all parent snaps might be
+	 * included by us.
+	 */
+	if (parent) {
+		if (!parent->cached_context) {
+			err = build_snap_context(parent);
+			if (err)
+				goto fail;
+		}
+		num += parent->cached_context->num_snaps;
+	}
+
+	/* do i actually need to update?  not if my context seq
+	   matches realm seq, and my parents' does to.  (this works
+	   because we rebuild_snap_realms() works _downward_ in
+	   hierarchy after each update.) */
+	if (realm->cached_context &&
+	    realm->cached_context->seq == realm->seq &&
+	    (!parent ||
+	     realm->cached_context->seq >= parent->cached_context->seq)) {
+		dout("build_snap_context %llx %p: %p seq %lld (%d snaps)"
+		     " (unchanged)\n",
+		     realm->ino, realm, realm->cached_context,
+		     realm->cached_context->seq,
+		     realm->cached_context->num_snaps);
+		return 0;
+	}
+
+	/* alloc new snap context */
+	err = -ENOMEM;
+	if (num > ULONG_MAX / sizeof(u64) - sizeof(*snapc))
+		goto fail;
+	snapc = kzalloc(sizeof(*snapc) + num*sizeof(u64), GFP_NOFS);
+	if (!snapc)
+		goto fail;
+	atomic_set(&snapc->nref, 1);
+
+	/* build (reverse sorted) snap vector */
+	num = 0;
+	snapc->seq = realm->seq;
+	if (parent) {
+		/* include any of parent's snaps occuring _after_ my
+		   parent became my parent */
+		for (i = 0; i < parent->cached_context->num_snaps; i++)
+			if (parent->cached_context->snaps[i] >=
+			    realm->parent_since)
+				snapc->snaps[num++] =
+					parent->cached_context->snaps[i];
+		if (parent->cached_context->seq > snapc->seq)
+			snapc->seq = parent->cached_context->seq;
+	}
+	memcpy(snapc->snaps + num, realm->snaps,
+	       sizeof(u64)*realm->num_snaps);
+	num += realm->num_snaps;
+	memcpy(snapc->snaps + num, realm->prior_parent_snaps,
+	       sizeof(u64)*realm->num_prior_parent_snaps);
+	num += realm->num_prior_parent_snaps;
+
+	sort(snapc->snaps, num, sizeof(u64), cmpu64_rev, NULL);
+	snapc->num_snaps = num;
+	dout("build_snap_context %llx %p: %p seq %lld (%d snaps)\n",
+	     realm->ino, realm, snapc, snapc->seq, snapc->num_snaps);
+
+	if (realm->cached_context)
+		ceph_put_snap_context(realm->cached_context);
+	realm->cached_context = snapc;
+	return 0;
+
+fail:
+	/*
+	 * if we fail, clear old (incorrect) cached_context... hopefully
+	 * we'll have better luck building it later
+	 */
+	if (realm->cached_context) {
+		ceph_put_snap_context(realm->cached_context);
+		realm->cached_context = NULL;
+	}
+	pr_err("build_snap_context %llx %p fail %d\n", realm->ino,
+	       realm, err);
+	return err;
+}
+
+/*
+ * rebuild snap context for the given realm and all of its children.
+ */
+static void rebuild_snap_realms(struct ceph_snap_realm *realm)
+{
+	struct ceph_snap_realm *child;
+
+	dout("rebuild_snap_realms %llx %p\n", realm->ino, realm);
+	build_snap_context(realm);
+
+	list_for_each_entry(child, &realm->children, child_item)
+		rebuild_snap_realms(child);
+}
+
+
+/*
+ * helper to allocate and decode an array of snapids.  free prior
+ * instance, if any.
+ */
+static int dup_array(u64 **dst, __le64 *src, int num)
+{
+	int i;
+
+	kfree(*dst);
+	if (num) {
+		*dst = kcalloc(num, sizeof(u64), GFP_NOFS);
+		if (!*dst)
+			return -ENOMEM;
+		for (i = 0; i < num; i++)
+			(*dst)[i] = get_unaligned_le64(src + i);
+	} else {
+		*dst = NULL;
+	}
+	return 0;
+}
+
+
+/*
+ * When a snapshot is applied, the size/mtime inode metadata is queued
+ * in a ceph_cap_snap (one for each snapshot) until writeback
+ * completes and the metadata can be flushed back to the MDS.
+ *
+ * However, if a (sync) write is currently in-progress when we apply
+ * the snapshot, we have to wait until the write succeeds or fails
+ * (and a final size/mtime is known).  In this case the
+ * cap_snap->writing = 1, and is said to be "pending."  When the write
+ * finishes, we __ceph_finish_cap_snap().
+ *
+ * Caller must hold snap_rwsem for read (i.e., the realm topology won't
+ * change).
+ */
+void ceph_queue_cap_snap(struct ceph_inode_info *ci)
+{
+	struct inode *inode = &ci->vfs_inode;
+	struct ceph_cap_snap *capsnap;
+	int used;
+
+	capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS);
+	if (!capsnap) {
+		pr_err("ENOMEM allocating ceph_cap_snap on %p\n", inode);
+		return;
+	}
+
+	spin_lock(&inode->i_lock);
+	used = __ceph_caps_used(ci);
+	if (__ceph_have_pending_cap_snap(ci)) {
+		/* there is no point in queuing multiple "pending" cap_snaps,
+		   as no new writes are allowed to start when pending, so any
+		   writes in progress now were started before the previous
+		   cap_snap.  lucky us. */
+		dout("queue_cap_snap %p already pending\n", inode);
+		kfree(capsnap);
+	} else if (ci->i_wrbuffer_ref_head || (used & CEPH_CAP_FILE_WR)) {
+		struct ceph_snap_context *snapc = ci->i_head_snapc;
+
+		igrab(inode);
+
+		atomic_set(&capsnap->nref, 1);
+		capsnap->ci = ci;
+		INIT_LIST_HEAD(&capsnap->ci_item);
+		INIT_LIST_HEAD(&capsnap->flushing_item);
+
+		capsnap->follows = snapc->seq - 1;
+		capsnap->issued = __ceph_caps_issued(ci, NULL);
+		capsnap->dirty = __ceph_caps_dirty(ci);
+
+		capsnap->mode = inode->i_mode;
+		capsnap->uid = inode->i_uid;
+		capsnap->gid = inode->i_gid;
+
+		/* fixme? */
+		capsnap->xattr_blob = NULL;
+		capsnap->xattr_len = 0;
+
+		/* dirty page count moved from _head to this cap_snap;
+		   all subsequent writes page dirties occur _after_ this
+		   snapshot. */
+		capsnap->dirty_pages = ci->i_wrbuffer_ref_head;
+		ci->i_wrbuffer_ref_head = 0;
+		capsnap->context = snapc;
+		ci->i_head_snapc = NULL;
+		list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps);
+
+		if (used & CEPH_CAP_FILE_WR) {
+			dout("queue_cap_snap %p cap_snap %p snapc %p"
+			     " seq %llu used WR, now pending\n", inode,
+			     capsnap, snapc, snapc->seq);
+			capsnap->writing = 1;
+		} else {
+			/* note mtime, size NOW. */
+			__ceph_finish_cap_snap(ci, capsnap);
+		}
+	} else {
+		dout("queue_cap_snap %p nothing dirty|writing\n", inode);
+		kfree(capsnap);
+	}
+
+	spin_unlock(&inode->i_lock);
+}
+
+/*
+ * Finalize the size, mtime for a cap_snap.. that is, settle on final values
+ * to be used for the snapshot, to be flushed back to the mds.
+ *
+ * If capsnap can now be flushed, add to snap_flush list, and return 1.
+ *
+ * Caller must hold i_lock.
+ */
+int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
+			    struct ceph_cap_snap *capsnap)
+{
+	struct inode *inode = &ci->vfs_inode;
+	struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
+
+	BUG_ON(capsnap->writing);
+	capsnap->size = inode->i_size;
+	capsnap->mtime = inode->i_mtime;
+	capsnap->atime = inode->i_atime;
+	capsnap->ctime = inode->i_ctime;
+	capsnap->time_warp_seq = ci->i_time_warp_seq;
+	if (capsnap->dirty_pages) {
+		dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu "
+		     "still has %d dirty pages\n", inode, capsnap,
+		     capsnap->context, capsnap->context->seq,
+		     ceph_cap_string(capsnap->dirty), capsnap->size,
+		     capsnap->dirty_pages);
+		return 0;
+	}
+	dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu\n",
+	     inode, capsnap, capsnap->context,
+	     capsnap->context->seq, ceph_cap_string(capsnap->dirty),
+	     capsnap->size);
+
+	spin_lock(&mdsc->snap_flush_lock);
+	list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list);
+	spin_unlock(&mdsc->snap_flush_lock);
+	return 1;  /* caller may want to ceph_flush_snaps */
+}
+
+
+/*
+ * Parse and apply a snapblob "snap trace" from the MDS.  This specifies
+ * the snap realm parameters from a given realm and all of its ancestors,
+ * up to the root.
+ *
+ * Caller must hold snap_rwsem for write.
+ */
+int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
+			   void *p, void *e, bool deletion)
+{
+	struct ceph_mds_snap_realm *ri;    /* encoded */
+	__le64 *snaps;                     /* encoded */
+	__le64 *prior_parent_snaps;        /* encoded */
+	struct ceph_snap_realm *realm;
+	int invalidate = 0;
+	int err = -ENOMEM;
+
+	dout("update_snap_trace deletion=%d\n", deletion);
+more:
+	ceph_decode_need(&p, e, sizeof(*ri), bad);
+	ri = p;
+	p += sizeof(*ri);
+	ceph_decode_need(&p, e, sizeof(u64)*(le32_to_cpu(ri->num_snaps) +
+			    le32_to_cpu(ri->num_prior_parent_snaps)), bad);
+	snaps = p;
+	p += sizeof(u64) * le32_to_cpu(ri->num_snaps);
+	prior_parent_snaps = p;
+	p += sizeof(u64) * le32_to_cpu(ri->num_prior_parent_snaps);
+
+	realm = ceph_lookup_snap_realm(mdsc, le64_to_cpu(ri->ino));
+	if (!realm) {
+		realm = ceph_create_snap_realm(mdsc, le64_to_cpu(ri->ino));
+		if (IS_ERR(realm)) {
+			err = PTR_ERR(realm);
+			goto fail;
+		}
+	}
+
+	if (le64_to_cpu(ri->seq) > realm->seq) {
+		dout("update_snap_trace updating %llx %p %lld -> %lld\n",
+		     realm->ino, realm, realm->seq, le64_to_cpu(ri->seq));
+		/*
+		 * if the realm seq has changed, queue a cap_snap for every
+		 * inode with open caps.  we do this _before_ we update
+		 * the realm info so that we prepare for writeback under the
+		 * _previous_ snap context.
+		 *
+		 * ...unless it's a snap deletion!
+		 */
+		if (!deletion) {
+			struct ceph_inode_info *ci;
+			struct inode *lastinode = NULL;
+
+			spin_lock(&realm->inodes_with_caps_lock);
+			list_for_each_entry(ci, &realm->inodes_with_caps,
+					    i_snap_realm_item) {
+				struct inode *inode = igrab(&ci->vfs_inode);
+				if (!inode)
+					continue;
+				spin_unlock(&realm->inodes_with_caps_lock);
+				if (lastinode)
+					iput(lastinode);
+				lastinode = inode;
+				ceph_queue_cap_snap(ci);
+				spin_lock(&realm->inodes_with_caps_lock);
+			}
+			spin_unlock(&realm->inodes_with_caps_lock);
+			if (lastinode)
+				iput(lastinode);
+			dout("update_snap_trace cap_snaps queued\n");
+		}
+
+	} else {
+		dout("update_snap_trace %llx %p seq %lld unchanged\n",
+		     realm->ino, realm, realm->seq);
+	}
+
+	/* ensure the parent is correct */
+	err = adjust_snap_realm_parent(mdsc, realm, le64_to_cpu(ri->parent));
+	if (err < 0)
+		goto fail;
+	invalidate += err;
+
+	if (le64_to_cpu(ri->seq) > realm->seq) {
+		/* update realm parameters, snap lists */
+		realm->seq = le64_to_cpu(ri->seq);
+		realm->created = le64_to_cpu(ri->created);
+		realm->parent_since = le64_to_cpu(ri->parent_since);
+
+		realm->num_snaps = le32_to_cpu(ri->num_snaps);
+		err = dup_array(&realm->snaps, snaps, realm->num_snaps);
+		if (err < 0)
+			goto fail;
+
+		realm->num_prior_parent_snaps =
+			le32_to_cpu(ri->num_prior_parent_snaps);
+		err = dup_array(&realm->prior_parent_snaps, prior_parent_snaps,
+				realm->num_prior_parent_snaps);
+		if (err < 0)
+			goto fail;
+
+		invalidate = 1;
+	} else if (!realm->cached_context) {
+		invalidate = 1;
+	}
+
+	dout("done with %llx %p, invalidated=%d, %p %p\n", realm->ino,
+	     realm, invalidate, p, e);
+
+	if (p < e)
+		goto more;
+
+	/* invalidate when we reach the _end_ (root) of the trace */
+	if (invalidate)
+		rebuild_snap_realms(realm);
+
+	__cleanup_empty_realms(mdsc);
+	return 0;
+
+bad:
+	err = -EINVAL;
+fail:
+	pr_err("update_snap_trace error %d\n", err);
+	return err;
+}
+
+
+/*
+ * Send any cap_snaps that are queued for flush.  Try to carry
+ * s_mutex across multiple snap flushes to avoid locking overhead.
+ *
+ * Caller holds no locks.
+ */
+static void flush_snaps(struct ceph_mds_client *mdsc)
+{
+	struct ceph_inode_info *ci;
+	struct inode *inode;
+	struct ceph_mds_session *session = NULL;
+
+	dout("flush_snaps\n");
+	spin_lock(&mdsc->snap_flush_lock);
+	while (!list_empty(&mdsc->snap_flush_list)) {
+		ci = list_first_entry(&mdsc->snap_flush_list,
+				struct ceph_inode_info, i_snap_flush_item);
+		inode = &ci->vfs_inode;
+		igrab(inode);
+		spin_unlock(&mdsc->snap_flush_lock);
+		spin_lock(&inode->i_lock);
+		__ceph_flush_snaps(ci, &session);
+		spin_unlock(&inode->i_lock);
+		iput(inode);
+		spin_lock(&mdsc->snap_flush_lock);
+	}
+	spin_unlock(&mdsc->snap_flush_lock);
+
+	if (session) {
+		mutex_unlock(&session->s_mutex);
+		ceph_put_mds_session(session);
+	}
+	dout("flush_snaps done\n");
+}
+
+
+/*
+ * Handle a snap notification from the MDS.
+ *
+ * This can take two basic forms: the simplest is just a snap creation
+ * or deletion notification on an existing realm.  This should update the
+ * realm and its children.
+ *
+ * The more difficult case is realm creation, due to snap creation at a
+ * new point in the file hierarchy, or due to a rename that moves a file or
+ * directory into another realm.
+ */
+void ceph_handle_snap(struct ceph_mds_client *mdsc,
+		      struct ceph_mds_session *session,
+		      struct ceph_msg *msg)
+{
+	struct super_block *sb = mdsc->client->sb;
+	int mds = session->s_mds;
+	u64 split;
+	int op;
+	int trace_len;
+	struct ceph_snap_realm *realm = NULL;
+	void *p = msg->front.iov_base;
+	void *e = p + msg->front.iov_len;
+	struct ceph_mds_snap_head *h;
+	int num_split_inos, num_split_realms;
+	__le64 *split_inos = NULL, *split_realms = NULL;
+	int i;
+	int locked_rwsem = 0;
+
+	/* decode */
+	if (msg->front.iov_len < sizeof(*h))
+		goto bad;
+	h = p;
+	op = le32_to_cpu(h->op);
+	split = le64_to_cpu(h->split);   /* non-zero if we are splitting an
+					  * existing realm */
+	num_split_inos = le32_to_cpu(h->num_split_inos);
+	num_split_realms = le32_to_cpu(h->num_split_realms);
+	trace_len = le32_to_cpu(h->trace_len);
+	p += sizeof(*h);
+
+	dout("handle_snap from mds%d op %s split %llx tracelen %d\n", mds,
+	     ceph_snap_op_name(op), split, trace_len);
+
+	mutex_lock(&session->s_mutex);
+	session->s_seq++;
+	mutex_unlock(&session->s_mutex);
+
+	down_write(&mdsc->snap_rwsem);
+	locked_rwsem = 1;
+
+	if (op == CEPH_SNAP_OP_SPLIT) {
+		struct ceph_mds_snap_realm *ri;
+
+		/*
+		 * A "split" breaks part of an existing realm off into
+		 * a new realm.  The MDS provides a list of inodes
+		 * (with caps) and child realms that belong to the new
+		 * child.
+		 */
+		split_inos = p;
+		p += sizeof(u64) * num_split_inos;
+		split_realms = p;
+		p += sizeof(u64) * num_split_realms;
+		ceph_decode_need(&p, e, sizeof(*ri), bad);
+		/* we will peek at realm info here, but will _not_
+		 * advance p, as the realm update will occur below in
+		 * ceph_update_snap_trace. */
+		ri = p;
+
+		realm = ceph_lookup_snap_realm(mdsc, split);
+		if (!realm) {
+			realm = ceph_create_snap_realm(mdsc, split);
+			if (IS_ERR(realm))
+				goto out;
+		}
+		ceph_get_snap_realm(mdsc, realm);
+
+		dout("splitting snap_realm %llx %p\n", realm->ino, realm);
+		for (i = 0; i < num_split_inos; i++) {
+			struct ceph_vino vino = {
+				.ino = le64_to_cpu(split_inos[i]),
+				.snap = CEPH_NOSNAP,
+			};
+			struct inode *inode = ceph_find_inode(sb, vino);
+			struct ceph_inode_info *ci;
+
+			if (!inode)
+				continue;
+			ci = ceph_inode(inode);
+
+			spin_lock(&inode->i_lock);
+			if (!ci->i_snap_realm)
+				goto skip_inode;
+			/*
+			 * If this inode belongs to a realm that was
+			 * created after our new realm, we experienced
+			 * a race (due to another split notifications
+			 * arriving from a different MDS).  So skip
+			 * this inode.
+			 */
+			if (ci->i_snap_realm->created >
+			    le64_to_cpu(ri->created)) {
+				dout(" leaving %p in newer realm %llx %p\n",
+				     inode, ci->i_snap_realm->ino,
+				     ci->i_snap_realm);
+				goto skip_inode;
+			}
+			dout(" will move %p to split realm %llx %p\n",
+			     inode, realm->ino, realm);
+			/*
+			 * Remove the inode from the realm's inode
+			 * list, but don't add it to the new realm
+			 * yet.  We don't want the cap_snap to be
+			 * queued (again) by ceph_update_snap_trace()
+			 * below.  Queue it _now_, under the old context.
+			 */
+			spin_lock(&realm->inodes_with_caps_lock);
+			list_del_init(&ci->i_snap_realm_item);
+			spin_unlock(&realm->inodes_with_caps_lock);
+			spin_unlock(&inode->i_lock);
+
+			ceph_queue_cap_snap(ci);
+
+			iput(inode);
+			continue;
+
+skip_inode:
+			spin_unlock(&inode->i_lock);
+			iput(inode);
+		}
+
+		/* we may have taken some of the old realm's children. */
+		for (i = 0; i < num_split_realms; i++) {
+			struct ceph_snap_realm *child =
+				ceph_lookup_snap_realm(mdsc,
+					   le64_to_cpu(split_realms[i]));
+			if (!child)
+				continue;
+			adjust_snap_realm_parent(mdsc, child, realm->ino);
+		}
+	}
+
+	/*
+	 * update using the provided snap trace. if we are deleting a
+	 * snap, we can avoid queueing cap_snaps.
+	 */
+	ceph_update_snap_trace(mdsc, p, e,
+			       op == CEPH_SNAP_OP_DESTROY);
+
+	if (op == CEPH_SNAP_OP_SPLIT) {
+		/*
+		 * ok, _now_ add the inodes into the new realm.
+		 */
+		for (i = 0; i < num_split_inos; i++) {
+			struct ceph_vino vino = {
+				.ino = le64_to_cpu(split_inos[i]),
+				.snap = CEPH_NOSNAP,
+			};
+			struct inode *inode = ceph_find_inode(sb, vino);
+			struct ceph_inode_info *ci;
+
+			if (!inode)
+				continue;
+			ci = ceph_inode(inode);
+			spin_lock(&inode->i_lock);
+			if (list_empty(&ci->i_snap_realm_item)) {
+				struct ceph_snap_realm *oldrealm =
+					ci->i_snap_realm;
+
+				dout(" moving %p to split realm %llx %p\n",
+				     inode, realm->ino, realm);
+				spin_lock(&realm->inodes_with_caps_lock);
+				list_add(&ci->i_snap_realm_item,
+					 &realm->inodes_with_caps);
+				ci->i_snap_realm = realm;
+				spin_unlock(&realm->inodes_with_caps_lock);
+				ceph_get_snap_realm(mdsc, realm);
+				ceph_put_snap_realm(mdsc, oldrealm);
+			}
+			spin_unlock(&inode->i_lock);
+			iput(inode);
+		}
+
+		/* we took a reference when we created the realm, above */
+		ceph_put_snap_realm(mdsc, realm);
+	}
+
+	__cleanup_empty_realms(mdsc);
+
+	up_write(&mdsc->snap_rwsem);
+
+	flush_snaps(mdsc);
+	return;
+
+bad:
+	pr_err("corrupt snap message from mds%d\n", mds);
+	ceph_msg_dump(msg);
+out:
+	if (locked_rwsem)
+		up_write(&mdsc->snap_rwsem);
+	return;
+}
+
+
+
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
new file mode 100644
index 000000000000..fa87f51e38e1
--- /dev/null
+++ b/fs/ceph/super.c
@@ -0,0 +1,1085 @@
+
+#include "ceph_debug.h"
+
+#include <linux/backing-dev.h>
+#include <linux/fs.h>
+#include <linux/inet.h>
+#include <linux/in6.h>
+#include <linux/module.h>
+#include <linux/mount.h>
+#include <linux/parser.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/statfs.h>
+#include <linux/string.h>
+
+#include "decode.h"
+#include "super.h"
+#include "mon_client.h"
+#include "auth.h"
+
+/*
+ * Ceph superblock operations
+ *
+ * Handle the basics of mounting, unmounting.
+ */
+
+
+/*
+ * find filename portion of a path (/foo/bar/baz -> baz)
+ */
+const char *ceph_file_part(const char *s, int len)
+{
+	const char *e = s + len;
+
+	while (e != s && *(e-1) != '/')
+		e--;
+	return e;
+}
+
+
+/*
+ * super ops
+ */
+static void ceph_put_super(struct super_block *s)
+{
+	struct ceph_client *client = ceph_sb_to_client(s);
+
+	dout("put_super\n");
+	ceph_mdsc_close_sessions(&client->mdsc);
+
+	/*
+	 * ensure we release the bdi before put_anon_super releases
+	 * the device name.
+	 */
+	if (s->s_bdi == &client->backing_dev_info) {
+		bdi_unregister(&client->backing_dev_info);
+		s->s_bdi = NULL;
+	}
+
+	return;
+}
+
+static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct ceph_client *client = ceph_inode_to_client(dentry->d_inode);
+	struct ceph_monmap *monmap = client->monc.monmap;
+	struct ceph_statfs st;
+	u64 fsid;
+	int err;
+
+	dout("statfs\n");
+	err = ceph_monc_do_statfs(&client->monc, &st);
+	if (err < 0)
+		return err;
+
+	/* fill in kstatfs */
+	buf->f_type = CEPH_SUPER_MAGIC;  /* ?? */
+
+	/*
+	 * express utilization in terms of large blocks to avoid
+	 * overflow on 32-bit machines.
+	 */
+	buf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
+	buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10);
+	buf->f_bfree = (le64_to_cpu(st.kb) - le64_to_cpu(st.kb_used)) >>
+		(CEPH_BLOCK_SHIFT-10);
+	buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
+
+	buf->f_files = le64_to_cpu(st.num_objects);
+	buf->f_ffree = -1;
+	buf->f_namelen = NAME_MAX;
+	buf->f_frsize = PAGE_CACHE_SIZE;
+
+	/* leave fsid little-endian, regardless of host endianness */
+	fsid = *(u64 *)(&monmap->fsid) ^ *((u64 *)&monmap->fsid + 1);
+	buf->f_fsid.val[0] = fsid & 0xffffffff;
+	buf->f_fsid.val[1] = fsid >> 32;
+
+	return 0;
+}
+
+
+static int ceph_syncfs(struct super_block *sb, int wait)
+{
+	dout("sync_fs %d\n", wait);
+	ceph_osdc_sync(&ceph_sb_to_client(sb)->osdc);
+	ceph_mdsc_sync(&ceph_sb_to_client(sb)->mdsc);
+	dout("sync_fs %d done\n", wait);
+	return 0;
+}
+
+static int default_congestion_kb(void)
+{
+	int congestion_kb;
+
+	/*
+	 * Copied from NFS
+	 *
+	 * congestion size, scale with available memory.
+	 *
+	 *  64MB:    8192k
+	 * 128MB:   11585k
+	 * 256MB:   16384k
+	 * 512MB:   23170k
+	 *   1GB:   32768k
+	 *   2GB:   46340k
+	 *   4GB:   65536k
+	 *   8GB:   92681k
+	 *  16GB:  131072k
+	 *
+	 * This allows larger machines to have larger/more transfers.
+	 * Limit the default to 256M
+	 */
+	congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
+	if (congestion_kb > 256*1024)
+		congestion_kb = 256*1024;
+
+	return congestion_kb;
+}
+
+/**
+ * ceph_show_options - Show mount options in /proc/mounts
+ * @m: seq_file to write to
+ * @mnt: mount descriptor
+ */
+static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
+{
+	struct ceph_client *client = ceph_sb_to_client(mnt->mnt_sb);
+	struct ceph_mount_args *args = client->mount_args;
+
+	if (args->flags & CEPH_OPT_FSID)
+		seq_printf(m, ",fsidmajor=%llu,fsidminor%llu",
+			   le64_to_cpu(*(__le64 *)&args->fsid.fsid[0]),
+			   le64_to_cpu(*(__le64 *)&args->fsid.fsid[8]));
+	if (args->flags & CEPH_OPT_NOSHARE)
+		seq_puts(m, ",noshare");
+	if (args->flags & CEPH_OPT_DIRSTAT)
+		seq_puts(m, ",dirstat");
+	if ((args->flags & CEPH_OPT_RBYTES) == 0)
+		seq_puts(m, ",norbytes");
+	if (args->flags & CEPH_OPT_NOCRC)
+		seq_puts(m, ",nocrc");
+	if (args->flags & CEPH_OPT_NOASYNCREADDIR)
+		seq_puts(m, ",noasyncreaddir");
+
+	if (args->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
+		seq_printf(m, ",mount_timeout=%d", args->mount_timeout);
+	if (args->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
+		seq_printf(m, ",osd_idle_ttl=%d", args->osd_idle_ttl);
+	if (args->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT)
+		seq_printf(m, ",osdtimeout=%d", args->osd_timeout);
+	if (args->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
+		seq_printf(m, ",osdkeepalivetimeout=%d",
+			 args->osd_keepalive_timeout);
+	if (args->wsize)
+		seq_printf(m, ",wsize=%d", args->wsize);
+	if (args->rsize != CEPH_MOUNT_RSIZE_DEFAULT)
+		seq_printf(m, ",rsize=%d", args->rsize);
+	if (args->congestion_kb != default_congestion_kb())
+		seq_printf(m, ",write_congestion_kb=%d", args->congestion_kb);
+	if (args->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT)
+		seq_printf(m, ",caps_wanted_delay_min=%d",
+			 args->caps_wanted_delay_min);
+	if (args->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT)
+		seq_printf(m, ",caps_wanted_delay_max=%d",
+			   args->caps_wanted_delay_max);
+	if (args->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT)
+		seq_printf(m, ",cap_release_safety=%d",
+			   args->cap_release_safety);
+	if (args->max_readdir != CEPH_MAX_READDIR_DEFAULT)
+		seq_printf(m, ",readdir_max_entries=%d", args->max_readdir);
+	if (args->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT)
+		seq_printf(m, ",readdir_max_bytes=%d", args->max_readdir_bytes);
+	if (strcmp(args->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
+		seq_printf(m, ",snapdirname=%s", args->snapdir_name);
+	if (args->name)
+		seq_printf(m, ",name=%s", args->name);
+	if (args->secret)
+		seq_puts(m, ",secret=<hidden>");
+	return 0;
+}
+
+/*
+ * caches
+ */
+struct kmem_cache *ceph_inode_cachep;
+struct kmem_cache *ceph_cap_cachep;
+struct kmem_cache *ceph_dentry_cachep;
+struct kmem_cache *ceph_file_cachep;
+
+static void ceph_inode_init_once(void *foo)
+{
+	struct ceph_inode_info *ci = foo;
+	inode_init_once(&ci->vfs_inode);
+}
+
+static int __init init_caches(void)
+{
+	ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
+				      sizeof(struct ceph_inode_info),
+				      __alignof__(struct ceph_inode_info),
+				      (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
+				      ceph_inode_init_once);
+	if (ceph_inode_cachep == NULL)
+		return -ENOMEM;
+
+	ceph_cap_cachep = KMEM_CACHE(ceph_cap,
+				     SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
+	if (ceph_cap_cachep == NULL)
+		goto bad_cap;
+
+	ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info,
+					SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
+	if (ceph_dentry_cachep == NULL)
+		goto bad_dentry;
+
+	ceph_file_cachep = KMEM_CACHE(ceph_file_info,
+				      SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
+	if (ceph_file_cachep == NULL)
+		goto bad_file;
+
+	return 0;
+
+bad_file:
+	kmem_cache_destroy(ceph_dentry_cachep);
+bad_dentry:
+	kmem_cache_destroy(ceph_cap_cachep);
+bad_cap:
+	kmem_cache_destroy(ceph_inode_cachep);
+	return -ENOMEM;
+}
+
+static void destroy_caches(void)
+{
+	kmem_cache_destroy(ceph_inode_cachep);
+	kmem_cache_destroy(ceph_cap_cachep);
+	kmem_cache_destroy(ceph_dentry_cachep);
+	kmem_cache_destroy(ceph_file_cachep);
+}
+
+
+/*
+ * ceph_umount_begin - initiate forced umount.  Tear down down the
+ * mount, skipping steps that may hang while waiting for server(s).
+ */
+static void ceph_umount_begin(struct super_block *sb)
+{
+	struct ceph_client *client = ceph_sb_to_client(sb);
+
+	dout("ceph_umount_begin - starting forced umount\n");
+	if (!client)
+		return;
+	client->mount_state = CEPH_MOUNT_SHUTDOWN;
+	return;
+}
+
+static const struct super_operations ceph_super_ops = {
+	.alloc_inode	= ceph_alloc_inode,
+	.destroy_inode	= ceph_destroy_inode,
+	.write_inode    = ceph_write_inode,
+	.sync_fs        = ceph_syncfs,
+	.put_super	= ceph_put_super,
+	.show_options   = ceph_show_options,
+	.statfs		= ceph_statfs,
+	.umount_begin   = ceph_umount_begin,
+};
+
+
+const char *ceph_msg_type_name(int type)
+{
+	switch (type) {
+	case CEPH_MSG_SHUTDOWN: return "shutdown";
+	case CEPH_MSG_PING: return "ping";
+	case CEPH_MSG_AUTH: return "auth";
+	case CEPH_MSG_AUTH_REPLY: return "auth_reply";
+	case CEPH_MSG_MON_MAP: return "mon_map";
+	case CEPH_MSG_MON_GET_MAP: return "mon_get_map";
+	case CEPH_MSG_MON_SUBSCRIBE: return "mon_subscribe";
+	case CEPH_MSG_MON_SUBSCRIBE_ACK: return "mon_subscribe_ack";
+	case CEPH_MSG_STATFS: return "statfs";
+	case CEPH_MSG_STATFS_REPLY: return "statfs_reply";
+	case CEPH_MSG_MDS_MAP: return "mds_map";
+	case CEPH_MSG_CLIENT_SESSION: return "client_session";
+	case CEPH_MSG_CLIENT_RECONNECT: return "client_reconnect";
+	case CEPH_MSG_CLIENT_REQUEST: return "client_request";
+	case CEPH_MSG_CLIENT_REQUEST_FORWARD: return "client_request_forward";
+	case CEPH_MSG_CLIENT_REPLY: return "client_reply";
+	case CEPH_MSG_CLIENT_CAPS: return "client_caps";
+	case CEPH_MSG_CLIENT_CAPRELEASE: return "client_cap_release";
+	case CEPH_MSG_CLIENT_SNAP: return "client_snap";
+	case CEPH_MSG_CLIENT_LEASE: return "client_lease";
+	case CEPH_MSG_OSD_MAP: return "osd_map";
+	case CEPH_MSG_OSD_OP: return "osd_op";
+	case CEPH_MSG_OSD_OPREPLY: return "osd_opreply";
+	default: return "unknown";
+	}
+}
+
+
+/*
+ * mount options
+ */
+enum {
+	Opt_fsidmajor,
+	Opt_fsidminor,
+	Opt_monport,
+	Opt_wsize,
+	Opt_rsize,
+	Opt_osdtimeout,
+	Opt_osdkeepalivetimeout,
+	Opt_mount_timeout,
+	Opt_osd_idle_ttl,
+	Opt_caps_wanted_delay_min,
+	Opt_caps_wanted_delay_max,
+	Opt_cap_release_safety,
+	Opt_readdir_max_entries,
+	Opt_readdir_max_bytes,
+	Opt_congestion_kb,
+	Opt_last_int,
+	/* int args above */
+	Opt_snapdirname,
+	Opt_name,
+	Opt_secret,
+	Opt_last_string,
+	/* string args above */
+	Opt_ip,
+	Opt_noshare,
+	Opt_dirstat,
+	Opt_nodirstat,
+	Opt_rbytes,
+	Opt_norbytes,
+	Opt_nocrc,
+	Opt_noasyncreaddir,
+};
+
+static match_table_t arg_tokens = {
+	{Opt_fsidmajor, "fsidmajor=%ld"},
+	{Opt_fsidminor, "fsidminor=%ld"},
+	{Opt_monport, "monport=%d"},
+	{Opt_wsize, "wsize=%d"},
+	{Opt_rsize, "rsize=%d"},
+	{Opt_osdtimeout, "osdtimeout=%d"},
+	{Opt_osdkeepalivetimeout, "osdkeepalive=%d"},
+	{Opt_mount_timeout, "mount_timeout=%d"},
+	{Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
+	{Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
+	{Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
+	{Opt_cap_release_safety, "cap_release_safety=%d"},
+	{Opt_readdir_max_entries, "readdir_max_entries=%d"},
+	{Opt_readdir_max_bytes, "readdir_max_bytes=%d"},
+	{Opt_congestion_kb, "write_congestion_kb=%d"},
+	/* int args above */
+	{Opt_snapdirname, "snapdirname=%s"},
+	{Opt_name, "name=%s"},
+	{Opt_secret, "secret=%s"},
+	/* string args above */
+	{Opt_ip, "ip=%s"},
+	{Opt_noshare, "noshare"},
+	{Opt_dirstat, "dirstat"},
+	{Opt_nodirstat, "nodirstat"},
+	{Opt_rbytes, "rbytes"},
+	{Opt_norbytes, "norbytes"},
+	{Opt_nocrc, "nocrc"},
+	{Opt_noasyncreaddir, "noasyncreaddir"},
+	{-1, NULL}
+};
+
+
+static struct ceph_mount_args *parse_mount_args(int flags, char *options,
+						const char *dev_name,
+						const char **path)
+{
+	struct ceph_mount_args *args;
+	const char *c;
+	int err = -ENOMEM;
+	substring_t argstr[MAX_OPT_ARGS];
+
+	args = kzalloc(sizeof(*args), GFP_KERNEL);
+	if (!args)
+		return ERR_PTR(-ENOMEM);
+	args->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*args->mon_addr),
+				 GFP_KERNEL);
+	if (!args->mon_addr)
+		goto out;
+
+	dout("parse_mount_args %p, dev_name '%s'\n", args, dev_name);
+
+	/* start with defaults */
+	args->sb_flags = flags;
+	args->flags = CEPH_OPT_DEFAULT;
+	args->osd_timeout = CEPH_OSD_TIMEOUT_DEFAULT;
+	args->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
+	args->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */
+	args->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT;   /* seconds */
+	args->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
+	args->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
+	args->rsize = CEPH_MOUNT_RSIZE_DEFAULT;
+	args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
+	args->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
+	args->max_readdir = CEPH_MAX_READDIR_DEFAULT;
+	args->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
+	args->congestion_kb = default_congestion_kb();
+
+	/* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
+	err = -EINVAL;
+	if (!dev_name)
+		goto out;
+	*path = strstr(dev_name, ":/");
+	if (*path == NULL) {
+		pr_err("device name is missing path (no :/ in %s)\n",
+		       dev_name);
+		goto out;
+	}
+
+	/* get mon ip(s) */
+	err = ceph_parse_ips(dev_name, *path, args->mon_addr,
+			     CEPH_MAX_MON, &args->num_mon);
+	if (err < 0)
+		goto out;
+
+	/* path on server */
+	*path += 2;
+	dout("server path '%s'\n", *path);
+
+	/* parse mount options */
+	while ((c = strsep(&options, ",")) != NULL) {
+		int token, intval, ret;
+		if (!*c)
+			continue;
+		err = -EINVAL;
+		token = match_token((char *)c, arg_tokens, argstr);
+		if (token < 0) {
+			pr_err("bad mount option at '%s'\n", c);
+			goto out;
+		}
+		if (token < Opt_last_int) {
+			ret = match_int(&argstr[0], &intval);
+			if (ret < 0) {
+				pr_err("bad mount option arg (not int) "
+				       "at '%s'\n", c);
+				continue;
+			}
+			dout("got int token %d val %d\n", token, intval);
+		} else if (token > Opt_last_int && token < Opt_last_string) {
+			dout("got string token %d val %s\n", token,
+			     argstr[0].from);
+		} else {
+			dout("got token %d\n", token);
+		}
+		switch (token) {
+		case Opt_fsidmajor:
+			*(__le64 *)&args->fsid.fsid[0] = cpu_to_le64(intval);
+			break;
+		case Opt_fsidminor:
+			*(__le64 *)&args->fsid.fsid[8] = cpu_to_le64(intval);
+			break;
+		case Opt_ip:
+			err = ceph_parse_ips(argstr[0].from,
+					     argstr[0].to,
+					     &args->my_addr,
+					     1, NULL);
+			if (err < 0)
+				goto out;
+			args->flags |= CEPH_OPT_MYIP;
+			break;
+
+		case Opt_snapdirname:
+			kfree(args->snapdir_name);
+			args->snapdir_name = kstrndup(argstr[0].from,
+					      argstr[0].to-argstr[0].from,
+					      GFP_KERNEL);
+			break;
+		case Opt_name:
+			args->name = kstrndup(argstr[0].from,
+					      argstr[0].to-argstr[0].from,
+					      GFP_KERNEL);
+			break;
+		case Opt_secret:
+			args->secret = kstrndup(argstr[0].from,
+						argstr[0].to-argstr[0].from,
+						GFP_KERNEL);
+			break;
+
+			/* misc */
+		case Opt_wsize:
+			args->wsize = intval;
+			break;
+		case Opt_rsize:
+			args->rsize = intval;
+			break;
+		case Opt_osdtimeout:
+			args->osd_timeout = intval;
+			break;
+		case Opt_osdkeepalivetimeout:
+			args->osd_keepalive_timeout = intval;
+			break;
+		case Opt_mount_timeout:
+			args->mount_timeout = intval;
+			break;
+		case Opt_caps_wanted_delay_min:
+			args->caps_wanted_delay_min = intval;
+			break;
+		case Opt_caps_wanted_delay_max:
+			args->caps_wanted_delay_max = intval;
+			break;
+		case Opt_readdir_max_entries:
+			args->max_readdir = intval;
+			break;
+		case Opt_readdir_max_bytes:
+			args->max_readdir_bytes = intval;
+			break;
+		case Opt_congestion_kb:
+			args->congestion_kb = intval;
+			break;
+
+		case Opt_noshare:
+			args->flags |= CEPH_OPT_NOSHARE;
+			break;
+
+		case Opt_dirstat:
+			args->flags |= CEPH_OPT_DIRSTAT;
+			break;
+		case Opt_nodirstat:
+			args->flags &= ~CEPH_OPT_DIRSTAT;
+			break;
+		case Opt_rbytes:
+			args->flags |= CEPH_OPT_RBYTES;
+			break;
+		case Opt_norbytes:
+			args->flags &= ~CEPH_OPT_RBYTES;
+			break;
+		case Opt_nocrc:
+			args->flags |= CEPH_OPT_NOCRC;
+			break;
+		case Opt_noasyncreaddir:
+			args->flags |= CEPH_OPT_NOASYNCREADDIR;
+			break;
+
+		default:
+			BUG_ON(token);
+		}
+	}
+	return args;
+
+out:
+	kfree(args->mon_addr);
+	kfree(args);
+	return ERR_PTR(err);
+}
+
+static void destroy_mount_args(struct ceph_mount_args *args)
+{
+	dout("destroy_mount_args %p\n", args);
+	kfree(args->snapdir_name);
+	args->snapdir_name = NULL;
+	kfree(args->name);
+	args->name = NULL;
+	kfree(args->secret);
+	args->secret = NULL;
+	kfree(args);
+}
+
+/*
+ * create a fresh client instance
+ */
+static struct ceph_client *ceph_create_client(struct ceph_mount_args *args)
+{
+	struct ceph_client *client;
+	int err = -ENOMEM;
+
+	client = kzalloc(sizeof(*client), GFP_KERNEL);
+	if (client == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	mutex_init(&client->mount_mutex);
+
+	init_waitqueue_head(&client->auth_wq);
+
+	client->sb = NULL;
+	client->mount_state = CEPH_MOUNT_MOUNTING;
+	client->mount_args = args;
+
+	client->msgr = NULL;
+
+	client->auth_err = 0;
+	atomic_long_set(&client->writeback_count, 0);
+
+	err = bdi_init(&client->backing_dev_info);
+	if (err < 0)
+		goto fail;
+
+	err = -ENOMEM;
+	client->wb_wq = create_workqueue("ceph-writeback");
+	if (client->wb_wq == NULL)
+		goto fail_bdi;
+	client->pg_inv_wq = create_singlethread_workqueue("ceph-pg-invalid");
+	if (client->pg_inv_wq == NULL)
+		goto fail_wb_wq;
+	client->trunc_wq = create_singlethread_workqueue("ceph-trunc");
+	if (client->trunc_wq == NULL)
+		goto fail_pg_inv_wq;
+
+	/* set up mempools */
+	err = -ENOMEM;
+	client->wb_pagevec_pool = mempool_create_kmalloc_pool(10,
+			      client->mount_args->wsize >> PAGE_CACHE_SHIFT);
+	if (!client->wb_pagevec_pool)
+		goto fail_trunc_wq;
+
+	/* caps */
+	client->min_caps = args->max_readdir;
+	ceph_adjust_min_caps(client->min_caps);
+
+	/* subsystems */
+	err = ceph_monc_init(&client->monc, client);
+	if (err < 0)
+		goto fail_mempool;
+	err = ceph_osdc_init(&client->osdc, client);
+	if (err < 0)
+		goto fail_monc;
+	err = ceph_mdsc_init(&client->mdsc, client);
+	if (err < 0)
+		goto fail_osdc;
+	return client;
+
+fail_osdc:
+	ceph_osdc_stop(&client->osdc);
+fail_monc:
+	ceph_monc_stop(&client->monc);
+fail_mempool:
+	mempool_destroy(client->wb_pagevec_pool);
+fail_trunc_wq:
+	destroy_workqueue(client->trunc_wq);
+fail_pg_inv_wq:
+	destroy_workqueue(client->pg_inv_wq);
+fail_wb_wq:
+	destroy_workqueue(client->wb_wq);
+fail_bdi:
+	bdi_destroy(&client->backing_dev_info);
+fail:
+	kfree(client);
+	return ERR_PTR(err);
+}
+
+static void ceph_destroy_client(struct ceph_client *client)
+{
+	dout("destroy_client %p\n", client);
+
+	/* unmount */
+	ceph_mdsc_stop(&client->mdsc);
+	ceph_osdc_stop(&client->osdc);
+
+	/*
+	 * make sure mds and osd connections close out before destroying
+	 * the auth module, which is needed to free those connections'
+	 * ceph_authorizers.
+	 */
+	ceph_msgr_flush();
+
+	ceph_monc_stop(&client->monc);
+
+	ceph_adjust_min_caps(-client->min_caps);
+
+	ceph_debugfs_client_cleanup(client);
+	destroy_workqueue(client->wb_wq);
+	destroy_workqueue(client->pg_inv_wq);
+	destroy_workqueue(client->trunc_wq);
+
+	bdi_destroy(&client->backing_dev_info);
+
+	if (client->msgr)
+		ceph_messenger_destroy(client->msgr);
+	mempool_destroy(client->wb_pagevec_pool);
+
+	destroy_mount_args(client->mount_args);
+
+	kfree(client);
+	dout("destroy_client %p done\n", client);
+}
+
+/*
+ * Initially learn our fsid, or verify an fsid matches.
+ */
+int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid)
+{
+	if (client->have_fsid) {
+		if (ceph_fsid_compare(&client->fsid, fsid)) {
+			pr_err("bad fsid, had " FSID_FORMAT " got " FSID_FORMAT,
+			       PR_FSID(&client->fsid), PR_FSID(fsid));
+			return -1;
+		}
+	} else {
+		pr_info("client%lld fsid " FSID_FORMAT "\n",
+			client->monc.auth->global_id, PR_FSID(fsid));
+		memcpy(&client->fsid, fsid, sizeof(*fsid));
+		ceph_debugfs_client_init(client);
+		client->have_fsid = true;
+	}
+	return 0;
+}
+
+/*
+ * true if we have the mon map (and have thus joined the cluster)
+ */
+static int have_mon_and_osd_map(struct ceph_client *client)
+{
+	return client->monc.monmap && client->monc.monmap->epoch &&
+	       client->osdc.osdmap && client->osdc.osdmap->epoch;
+}
+
+/*
+ * Bootstrap mount by opening the root directory.  Note the mount
+ * @started time from caller, and time out if this takes too long.
+ */
+static struct dentry *open_root_dentry(struct ceph_client *client,
+				       const char *path,
+				       unsigned long started)
+{
+	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_mds_request *req = NULL;
+	int err;
+	struct dentry *root;
+
+	/* open dir */
+	dout("open_root_inode opening '%s'\n", path);
+	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
+	if (IS_ERR(req))
+		return ERR_CAST(req);
+	req->r_path1 = kstrdup(path, GFP_NOFS);
+	req->r_ino1.ino = CEPH_INO_ROOT;
+	req->r_ino1.snap = CEPH_NOSNAP;
+	req->r_started = started;
+	req->r_timeout = client->mount_args->mount_timeout * HZ;
+	req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
+	req->r_num_caps = 2;
+	err = ceph_mdsc_do_request(mdsc, NULL, req);
+	if (err == 0) {
+		dout("open_root_inode success\n");
+		if (ceph_ino(req->r_target_inode) == CEPH_INO_ROOT &&
+		    client->sb->s_root == NULL)
+			root = d_alloc_root(req->r_target_inode);
+		else
+			root = d_obtain_alias(req->r_target_inode);
+		req->r_target_inode = NULL;
+		dout("open_root_inode success, root dentry is %p\n", root);
+	} else {
+		root = ERR_PTR(err);
+	}
+	ceph_mdsc_put_request(req);
+	return root;
+}
+
+/*
+ * mount: join the ceph cluster, and open root directory.
+ */
+static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt,
+		      const char *path)
+{
+	struct ceph_entity_addr *myaddr = NULL;
+	int err;
+	unsigned long timeout = client->mount_args->mount_timeout * HZ;
+	unsigned long started = jiffies;  /* note the start time */
+	struct dentry *root;
+
+	dout("mount start\n");
+	mutex_lock(&client->mount_mutex);
+
+	/* initialize the messenger */
+	if (client->msgr == NULL) {
+		if (ceph_test_opt(client, MYIP))
+			myaddr = &client->mount_args->my_addr;
+		client->msgr = ceph_messenger_create(myaddr);
+		if (IS_ERR(client->msgr)) {
+			err = PTR_ERR(client->msgr);
+			client->msgr = NULL;
+			goto out;
+		}
+		client->msgr->nocrc = ceph_test_opt(client, NOCRC);
+	}
+
+	/* open session, and wait for mon, mds, and osd maps */
+	err = ceph_monc_open_session(&client->monc);
+	if (err < 0)
+		goto out;
+
+	while (!have_mon_and_osd_map(client)) {
+		err = -EIO;
+		if (timeout && time_after_eq(jiffies, started + timeout))
+			goto out;
+
+		/* wait */
+		dout("mount waiting for mon_map\n");
+		err = wait_event_interruptible_timeout(client->auth_wq,
+		       have_mon_and_osd_map(client) || (client->auth_err < 0),
+		       timeout);
+		if (err == -EINTR || err == -ERESTARTSYS)
+			goto out;
+		if (client->auth_err < 0) {
+			err = client->auth_err;
+			goto out;
+		}
+	}
+
+	dout("mount opening root\n");
+	root = open_root_dentry(client, "", started);
+	if (IS_ERR(root)) {
+		err = PTR_ERR(root);
+		goto out;
+	}
+	if (client->sb->s_root)
+		dput(root);
+	else
+		client->sb->s_root = root;
+
+	if (path[0] == 0) {
+		dget(root);
+	} else {
+		dout("mount opening base mountpoint\n");
+		root = open_root_dentry(client, path, started);
+		if (IS_ERR(root)) {
+			err = PTR_ERR(root);
+			dput(client->sb->s_root);
+			client->sb->s_root = NULL;
+			goto out;
+		}
+	}
+
+	mnt->mnt_root = root;
+	mnt->mnt_sb = client->sb;
+
+	client->mount_state = CEPH_MOUNT_MOUNTED;
+	dout("mount success\n");
+	err = 0;
+
+out:
+	mutex_unlock(&client->mount_mutex);
+	return err;
+}
+
+static int ceph_set_super(struct super_block *s, void *data)
+{
+	struct ceph_client *client = data;
+	int ret;
+
+	dout("set_super %p data %p\n", s, data);
+
+	s->s_flags = client->mount_args->sb_flags;
+	s->s_maxbytes = 1ULL << 40;  /* temp value until we get mdsmap */
+
+	s->s_fs_info = client;
+	client->sb = s;
+
+	s->s_op = &ceph_super_ops;
+	s->s_export_op = &ceph_export_ops;
+
+	s->s_time_gran = 1000;  /* 1000 ns == 1 us */
+
+	ret = set_anon_super(s, NULL);  /* what is that second arg for? */
+	if (ret != 0)
+		goto fail;
+
+	return ret;
+
+fail:
+	s->s_fs_info = NULL;
+	client->sb = NULL;
+	return ret;
+}
+
+/*
+ * share superblock if same fs AND options
+ */
+static int ceph_compare_super(struct super_block *sb, void *data)
+{
+	struct ceph_client *new = data;
+	struct ceph_mount_args *args = new->mount_args;
+	struct ceph_client *other = ceph_sb_to_client(sb);
+	int i;
+
+	dout("ceph_compare_super %p\n", sb);
+	if (args->flags & CEPH_OPT_FSID) {
+		if (ceph_fsid_compare(&args->fsid, &other->fsid)) {
+			dout("fsid doesn't match\n");
+			return 0;
+		}
+	} else {
+		/* do we share (a) monitor? */
+		for (i = 0; i < new->monc.monmap->num_mon; i++)
+			if (ceph_monmap_contains(other->monc.monmap,
+					 &new->monc.monmap->mon_inst[i].addr))
+				break;
+		if (i == new->monc.monmap->num_mon) {
+			dout("mon ip not part of monmap\n");
+			return 0;
+		}
+		dout("mon ip matches existing sb %p\n", sb);
+	}
+	if (args->sb_flags != other->mount_args->sb_flags) {
+		dout("flags differ\n");
+		return 0;
+	}
+	return 1;
+}
+
+/*
+ * construct our own bdi so we can control readahead, etc.
+ */
+static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
+
+static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client)
+{
+	int err;
+
+	/* set ra_pages based on rsize mount option? */
+	if (client->mount_args->rsize >= PAGE_CACHE_SIZE)
+		client->backing_dev_info.ra_pages =
+			(client->mount_args->rsize + PAGE_CACHE_SIZE - 1)
+			>> PAGE_SHIFT;
+	err = bdi_register(&client->backing_dev_info, NULL, "ceph-%d",
+			   atomic_long_inc_return(&bdi_seq));
+	if (!err)
+		sb->s_bdi = &client->backing_dev_info;
+	return err;
+}
+
+static int ceph_get_sb(struct file_system_type *fs_type,
+		       int flags, const char *dev_name, void *data,
+		       struct vfsmount *mnt)
+{
+	struct super_block *sb;
+	struct ceph_client *client;
+	int err;
+	int (*compare_super)(struct super_block *, void *) = ceph_compare_super;
+	const char *path = NULL;
+	struct ceph_mount_args *args;
+
+	dout("ceph_get_sb\n");
+	args = parse_mount_args(flags, data, dev_name, &path);
+	if (IS_ERR(args)) {
+		err = PTR_ERR(args);
+		goto out_final;
+	}
+
+	/* create client (which we may/may not use) */
+	client = ceph_create_client(args);
+	if (IS_ERR(client)) {
+		err = PTR_ERR(client);
+		goto out_final;
+	}
+
+	if (client->mount_args->flags & CEPH_OPT_NOSHARE)
+		compare_super = NULL;
+	sb = sget(fs_type, compare_super, ceph_set_super, client);
+	if (IS_ERR(sb)) {
+		err = PTR_ERR(sb);
+		goto out;
+	}
+
+	if (ceph_sb_to_client(sb) != client) {
+		ceph_destroy_client(client);
+		client = ceph_sb_to_client(sb);
+		dout("get_sb got existing client %p\n", client);
+	} else {
+		dout("get_sb using new client %p\n", client);
+		err = ceph_register_bdi(sb, client);
+		if (err < 0)
+			goto out_splat;
+	}
+
+	err = ceph_mount(client, mnt, path);
+	if (err < 0)
+		goto out_splat;
+	dout("root %p inode %p ino %llx.%llx\n", mnt->mnt_root,
+	     mnt->mnt_root->d_inode, ceph_vinop(mnt->mnt_root->d_inode));
+	return 0;
+
+out_splat:
+	ceph_mdsc_close_sessions(&client->mdsc);
+	deactivate_locked_super(sb);
+	goto out_final;
+
+out:
+	ceph_destroy_client(client);
+out_final:
+	dout("ceph_get_sb fail %d\n", err);
+	return err;
+}
+
+static void ceph_kill_sb(struct super_block *s)
+{
+	struct ceph_client *client = ceph_sb_to_client(s);
+	dout("kill_sb %p\n", s);
+	ceph_mdsc_pre_umount(&client->mdsc);
+	kill_anon_super(s);    /* will call put_super after sb is r/o */
+	ceph_destroy_client(client);
+}
+
+static struct file_system_type ceph_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "ceph",
+	.get_sb		= ceph_get_sb,
+	.kill_sb	= ceph_kill_sb,
+	.fs_flags	= FS_RENAME_DOES_D_MOVE,
+};
+
+#define _STRINGIFY(x) #x
+#define STRINGIFY(x) _STRINGIFY(x)
+
+static int __init init_ceph(void)
+{
+	int ret = 0;
+
+	ret = ceph_debugfs_init();
+	if (ret < 0)
+		goto out;
+
+	ret = ceph_msgr_init();
+	if (ret < 0)
+		goto out_debugfs;
+
+	ret = init_caches();
+	if (ret)
+		goto out_msgr;
+
+	ceph_caps_init();
+
+	ret = register_filesystem(&ceph_fs_type);
+	if (ret)
+		goto out_icache;
+
+	pr_info("loaded (mon/mds/osd proto %d/%d/%d, osdmap %d/%d %d/%d)\n",
+		CEPH_MONC_PROTOCOL, CEPH_MDSC_PROTOCOL, CEPH_OSDC_PROTOCOL,
+		CEPH_OSDMAP_VERSION, CEPH_OSDMAP_VERSION_EXT,
+		CEPH_OSDMAP_INC_VERSION, CEPH_OSDMAP_INC_VERSION_EXT);
+	return 0;
+
+out_icache:
+	destroy_caches();
+out_msgr:
+	ceph_msgr_exit();
+out_debugfs:
+	ceph_debugfs_cleanup();
+out:
+	return ret;
+}
+
+static void __exit exit_ceph(void)
+{
+	dout("exit_ceph\n");
+	unregister_filesystem(&ceph_fs_type);
+	ceph_caps_finalize();
+	destroy_caches();
+	ceph_msgr_exit();
+	ceph_debugfs_cleanup();
+}
+
+module_init(init_ceph);
+module_exit(exit_ceph);
+
+MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
+MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
+MODULE_AUTHOR("Patience Warnick <patience@newdream.net>");
+MODULE_DESCRIPTION("Ceph filesystem for Linux");
+MODULE_LICENSE("GPL");
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
new file mode 100644
index 000000000000..10a4a406e887
--- /dev/null
+++ b/fs/ceph/super.h
@@ -0,0 +1,899 @@
+#ifndef _FS_CEPH_SUPER_H
+#define _FS_CEPH_SUPER_H
+
+#include "ceph_debug.h"
+
+#include <asm/unaligned.h>
+#include <linux/backing-dev.h>
+#include <linux/completion.h>
+#include <linux/exportfs.h>
+#include <linux/fs.h>
+#include <linux/mempool.h>
+#include <linux/pagemap.h>
+#include <linux/wait.h>
+#include <linux/writeback.h>
+#include <linux/slab.h>
+
+#include "types.h"
+#include "messenger.h"
+#include "msgpool.h"
+#include "mon_client.h"
+#include "mds_client.h"
+#include "osd_client.h"
+#include "ceph_fs.h"
+
+/* f_type in struct statfs */
+#define CEPH_SUPER_MAGIC 0x00c36400
+
+/* large granularity for statfs utilization stats to facilitate
+ * large volume sizes on 32-bit machines. */
+#define CEPH_BLOCK_SHIFT   20  /* 1 MB */
+#define CEPH_BLOCK         (1 << CEPH_BLOCK_SHIFT)
+
+/*
+ * mount options
+ */
+#define CEPH_OPT_FSID             (1<<0)
+#define CEPH_OPT_NOSHARE          (1<<1) /* don't share client with other sbs */
+#define CEPH_OPT_MYIP             (1<<2) /* specified my ip */
+#define CEPH_OPT_DIRSTAT          (1<<4) /* funky `cat dirname` for stats */
+#define CEPH_OPT_RBYTES           (1<<5) /* dir st_bytes = rbytes */
+#define CEPH_OPT_NOCRC            (1<<6) /* no data crc on writes */
+#define CEPH_OPT_NOASYNCREADDIR   (1<<7) /* no dcache readdir */
+
+#define CEPH_OPT_DEFAULT   (CEPH_OPT_RBYTES)
+
+#define ceph_set_opt(client, opt) \
+	(client)->mount_args->flags |= CEPH_OPT_##opt;
+#define ceph_test_opt(client, opt) \
+	(!!((client)->mount_args->flags & CEPH_OPT_##opt))
+
+
+struct ceph_mount_args {
+	int sb_flags;
+	int flags;
+	struct ceph_fsid fsid;
+	struct ceph_entity_addr my_addr;
+	int num_mon;
+	struct ceph_entity_addr *mon_addr;
+	int mount_timeout;
+	int osd_idle_ttl;
+	int osd_timeout;
+	int osd_keepalive_timeout;
+	int wsize;
+	int rsize;            /* max readahead */
+	int congestion_kb;    /* max writeback in flight */
+	int caps_wanted_delay_min, caps_wanted_delay_max;
+	int cap_release_safety;
+	int max_readdir;       /* max readdir result (entires) */
+	int max_readdir_bytes; /* max readdir result (bytes) */
+	char *snapdir_name;   /* default ".snap" */
+	char *name;
+	char *secret;
+};
+
+/*
+ * defaults
+ */
+#define CEPH_MOUNT_TIMEOUT_DEFAULT  60
+#define CEPH_OSD_TIMEOUT_DEFAULT    60  /* seconds */
+#define CEPH_OSD_KEEPALIVE_DEFAULT  5
+#define CEPH_OSD_IDLE_TTL_DEFAULT    60
+#define CEPH_MOUNT_RSIZE_DEFAULT    (512*1024) /* readahead */
+#define CEPH_MAX_READDIR_DEFAULT    1024
+#define CEPH_MAX_READDIR_BYTES_DEFAULT    (512*1024)
+
+#define CEPH_MSG_MAX_FRONT_LEN	(16*1024*1024)
+#define CEPH_MSG_MAX_DATA_LEN	(16*1024*1024)
+
+#define CEPH_SNAPDIRNAME_DEFAULT ".snap"
+#define CEPH_AUTH_NAME_DEFAULT   "guest"
+/*
+ * Delay telling the MDS we no longer want caps, in case we reopen
+ * the file.  Delay a minimum amount of time, even if we send a cap
+ * message for some other reason.  Otherwise, take the oppotunity to
+ * update the mds to avoid sending another message later.
+ */
+#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT      5  /* cap release delay */
+#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT     60  /* cap release delay */
+
+#define CEPH_CAP_RELEASE_SAFETY_DEFAULT        (CEPH_CAPS_PER_RELEASE * 4)
+
+/* mount state */
+enum {
+	CEPH_MOUNT_MOUNTING,
+	CEPH_MOUNT_MOUNTED,
+	CEPH_MOUNT_UNMOUNTING,
+	CEPH_MOUNT_UNMOUNTED,
+	CEPH_MOUNT_SHUTDOWN,
+};
+
+/*
+ * subtract jiffies
+ */
+static inline unsigned long time_sub(unsigned long a, unsigned long b)
+{
+	BUG_ON(time_after(b, a));
+	return (long)a - (long)b;
+}
+
+/*
+ * per-filesystem client state
+ *
+ * possibly shared by multiple mount points, if they are
+ * mounting the same ceph filesystem/cluster.
+ */
+struct ceph_client {
+	struct ceph_fsid fsid;
+	bool have_fsid;
+
+	struct mutex mount_mutex;       /* serialize mount attempts */
+	struct ceph_mount_args *mount_args;
+
+	struct super_block *sb;
+
+	unsigned long mount_state;
+	wait_queue_head_t auth_wq;
+
+	int auth_err;
+
+	int min_caps;                  /* min caps i added */
+
+	struct ceph_messenger *msgr;   /* messenger instance */
+	struct ceph_mon_client monc;
+	struct ceph_mds_client mdsc;
+	struct ceph_osd_client osdc;
+
+	/* writeback */
+	mempool_t *wb_pagevec_pool;
+	struct workqueue_struct *wb_wq;
+	struct workqueue_struct *pg_inv_wq;
+	struct workqueue_struct *trunc_wq;
+	atomic_long_t writeback_count;
+
+	struct backing_dev_info backing_dev_info;
+
+#ifdef CONFIG_DEBUG_FS
+	struct dentry *debugfs_monmap;
+	struct dentry *debugfs_mdsmap, *debugfs_osdmap;
+	struct dentry *debugfs_dir, *debugfs_dentry_lru, *debugfs_caps;
+	struct dentry *debugfs_congestion_kb;
+	struct dentry *debugfs_bdi;
+#endif
+};
+
+/*
+ * File i/o capability.  This tracks shared state with the metadata
+ * server that allows us to cache or writeback attributes or to read
+ * and write data.  For any given inode, we should have one or more
+ * capabilities, one issued by each metadata server, and our
+ * cumulative access is the OR of all issued capabilities.
+ *
+ * Each cap is referenced by the inode's i_caps rbtree and by per-mds
+ * session capability lists.
+ */
+struct ceph_cap {
+	struct ceph_inode_info *ci;
+	struct rb_node ci_node;          /* per-ci cap tree */
+	struct ceph_mds_session *session;
+	struct list_head session_caps;   /* per-session caplist */
+	int mds;
+	u64 cap_id;       /* unique cap id (mds provided) */
+	int issued;       /* latest, from the mds */
+	int implemented;  /* implemented superset of issued (for revocation) */
+	int mds_wanted;
+	u32 seq, issue_seq, mseq;
+	u32 cap_gen;      /* active/stale cycle */
+	unsigned long last_used;
+	struct list_head caps_item;
+};
+
+#define CHECK_CAPS_NODELAY    1  /* do not delay any further */
+#define CHECK_CAPS_AUTHONLY   2  /* only check auth cap */
+#define CHECK_CAPS_FLUSH      4  /* flush any dirty caps */
+
+/*
+ * Snapped cap state that is pending flush to mds.  When a snapshot occurs,
+ * we first complete any in-process sync writes and writeback any dirty
+ * data before flushing the snapped state (tracked here) back to the MDS.
+ */
+struct ceph_cap_snap {
+	atomic_t nref;
+	struct ceph_inode_info *ci;
+	struct list_head ci_item, flushing_item;
+
+	u64 follows, flush_tid;
+	int issued, dirty;
+	struct ceph_snap_context *context;
+
+	mode_t mode;
+	uid_t uid;
+	gid_t gid;
+
+	void *xattr_blob;
+	int xattr_len;
+	u64 xattr_version;
+
+	u64 size;
+	struct timespec mtime, atime, ctime;
+	u64 time_warp_seq;
+	int writing;   /* a sync write is still in progress */
+	int dirty_pages;     /* dirty pages awaiting writeback */
+};
+
+static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap)
+{
+	if (atomic_dec_and_test(&capsnap->nref))
+		kfree(capsnap);
+}
+
+/*
+ * The frag tree describes how a directory is fragmented, potentially across
+ * multiple metadata servers.  It is also used to indicate points where
+ * metadata authority is delegated, and whether/where metadata is replicated.
+ *
+ * A _leaf_ frag will be present in the i_fragtree IFF there is
+ * delegation info.  That is, if mds >= 0 || ndist > 0.
+ */
+#define CEPH_MAX_DIRFRAG_REP 4
+
+struct ceph_inode_frag {
+	struct rb_node node;
+
+	/* fragtree state */
+	u32 frag;
+	int split_by;         /* i.e. 2^(split_by) children */
+
+	/* delegation and replication info */
+	int mds;              /* -1 if same authority as parent */
+	int ndist;            /* >0 if replicated */
+	int dist[CEPH_MAX_DIRFRAG_REP];
+};
+
+/*
+ * We cache inode xattrs as an encoded blob until they are first used,
+ * at which point we parse them into an rbtree.
+ */
+struct ceph_inode_xattr {
+	struct rb_node node;
+
+	const char *name;
+	int name_len;
+	const char *val;
+	int val_len;
+	int dirty;
+
+	int should_free_name;
+	int should_free_val;
+};
+
+struct ceph_inode_xattrs_info {
+	/*
+	 * (still encoded) xattr blob. we avoid the overhead of parsing
+	 * this until someone actually calls getxattr, etc.
+	 *
+	 * blob->vec.iov_len == 4 implies there are no xattrs; blob ==
+	 * NULL means we don't know.
+	*/
+	struct ceph_buffer *blob, *prealloc_blob;
+
+	struct rb_root index;
+	bool dirty;
+	int count;
+	int names_size;
+	int vals_size;
+	u64 version, index_version;
+};
+
+/*
+ * Ceph inode.
+ */
+#define CEPH_I_COMPLETE  1  /* we have complete directory cached */
+#define CEPH_I_NODELAY   4  /* do not delay cap release */
+#define CEPH_I_FLUSH     8  /* do not delay flush of dirty metadata */
+#define CEPH_I_NOFLUSH  16  /* do not flush dirty caps */
+
+struct ceph_inode_info {
+	struct ceph_vino i_vino;   /* ceph ino + snap */
+
+	u64 i_version;
+	u32 i_time_warp_seq;
+
+	unsigned i_ceph_flags;
+	unsigned long i_release_count;
+
+	struct ceph_file_layout i_layout;
+	char *i_symlink;
+
+	/* for dirs */
+	struct timespec i_rctime;
+	u64 i_rbytes, i_rfiles, i_rsubdirs;
+	u64 i_files, i_subdirs;
+	u64 i_max_offset;  /* largest readdir offset, set with I_COMPLETE */
+
+	struct rb_root i_fragtree;
+	struct mutex i_fragtree_mutex;
+
+	struct ceph_inode_xattrs_info i_xattrs;
+
+	/* capabilities.  protected _both_ by i_lock and cap->session's
+	 * s_mutex. */
+	struct rb_root i_caps;           /* cap list */
+	struct ceph_cap *i_auth_cap;     /* authoritative cap, if any */
+	unsigned i_dirty_caps, i_flushing_caps;     /* mask of dirtied fields */
+	struct list_head i_dirty_item, i_flushing_item;
+	u64 i_cap_flush_seq;
+	/* we need to track cap writeback on a per-cap-bit basis, to allow
+	 * overlapping, pipelined cap flushes to the mds.  we can probably
+	 * reduce the tid to 8 bits if we're concerned about inode size. */
+	u16 i_cap_flush_last_tid, i_cap_flush_tid[CEPH_CAP_BITS];
+	wait_queue_head_t i_cap_wq;      /* threads waiting on a capability */
+	unsigned long i_hold_caps_min; /* jiffies */
+	unsigned long i_hold_caps_max; /* jiffies */
+	struct list_head i_cap_delay_list;  /* for delayed cap release to mds */
+	int i_cap_exporting_mds;         /* to handle cap migration between */
+	unsigned i_cap_exporting_mseq;   /*  mds's. */
+	unsigned i_cap_exporting_issued;
+	struct ceph_cap_reservation i_cap_migration_resv;
+	struct list_head i_cap_snaps;   /* snapped state pending flush to mds */
+	struct ceph_snap_context *i_head_snapc;  /* set if wr_buffer_head > 0 */
+	unsigned i_snap_caps;           /* cap bits for snapped files */
+
+	int i_nr_by_mode[CEPH_FILE_MODE_NUM];  /* open file counts */
+
+	u32 i_truncate_seq;        /* last truncate to smaller size */
+	u64 i_truncate_size;       /*  and the size we last truncated down to */
+	int i_truncate_pending;    /*  still need to call vmtruncate */
+
+	u64 i_max_size;            /* max file size authorized by mds */
+	u64 i_reported_size; /* (max_)size reported to or requested of mds */
+	u64 i_wanted_max_size;     /* offset we'd like to write too */
+	u64 i_requested_max_size;  /* max_size we've requested */
+
+	/* held references to caps */
+	int i_pin_ref;
+	int i_rd_ref, i_rdcache_ref, i_wr_ref;
+	int i_wrbuffer_ref, i_wrbuffer_ref_head;
+	u32 i_shared_gen;       /* increment each time we get FILE_SHARED */
+	u32 i_rdcache_gen;      /* we increment this each time we get
+				   FILE_CACHE.  If it's non-zero, we
+				   _may_ have cached pages. */
+	u32 i_rdcache_revoking; /* RDCACHE gen to async invalidate, if any */
+
+	struct list_head i_unsafe_writes; /* uncommitted sync writes */
+	struct list_head i_unsafe_dirops; /* uncommitted mds dir ops */
+	spinlock_t i_unsafe_lock;
+
+	struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */
+	int i_snap_realm_counter; /* snap realm (if caps) */
+	struct list_head i_snap_realm_item;
+	struct list_head i_snap_flush_item;
+
+	struct work_struct i_wb_work;  /* writeback work */
+	struct work_struct i_pg_inv_work;  /* page invalidation work */
+
+	struct work_struct i_vmtruncate_work;
+
+	struct inode vfs_inode; /* at end */
+};
+
+static inline struct ceph_inode_info *ceph_inode(struct inode *inode)
+{
+	return container_of(inode, struct ceph_inode_info, vfs_inode);
+}
+
+static inline void ceph_i_clear(struct inode *inode, unsigned mask)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+
+	spin_lock(&inode->i_lock);
+	ci->i_ceph_flags &= ~mask;
+	spin_unlock(&inode->i_lock);
+}
+
+static inline void ceph_i_set(struct inode *inode, unsigned mask)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+
+	spin_lock(&inode->i_lock);
+	ci->i_ceph_flags |= mask;
+	spin_unlock(&inode->i_lock);
+}
+
+static inline bool ceph_i_test(struct inode *inode, unsigned mask)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	bool r;
+
+	smp_mb();
+	r = (ci->i_ceph_flags & mask) == mask;
+	return r;
+}
+
+
+/* find a specific frag @f */
+extern struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci,
+						u32 f);
+
+/*
+ * choose fragment for value @v.  copy frag content to pfrag, if leaf
+ * exists
+ */
+extern u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
+			    struct ceph_inode_frag *pfrag,
+			    int *found);
+
+/*
+ * Ceph dentry state
+ */
+struct ceph_dentry_info {
+	struct ceph_mds_session *lease_session;
+	u32 lease_gen, lease_shared_gen;
+	u32 lease_seq;
+	unsigned long lease_renew_after, lease_renew_from;
+	struct list_head lru;
+	struct dentry *dentry;
+	u64 time;
+	u64 offset;
+};
+
+static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry)
+{
+	return (struct ceph_dentry_info *)dentry->d_fsdata;
+}
+
+static inline loff_t ceph_make_fpos(unsigned frag, unsigned off)
+{
+	return ((loff_t)frag << 32) | (loff_t)off;
+}
+
+/*
+ * ino_t is <64 bits on many architectures, blech.
+ *
+ * don't include snap in ino hash, at least for now.
+ */
+static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
+{
+	ino_t ino = (ino_t)vino.ino;  /* ^ (vino.snap << 20); */
+#if BITS_PER_LONG == 32
+	ino ^= vino.ino >> (sizeof(u64)-sizeof(ino_t)) * 8;
+	if (!ino)
+		ino = 1;
+#endif
+	return ino;
+}
+
+static inline int ceph_set_ino_cb(struct inode *inode, void *data)
+{
+	ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
+	inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data);
+	return 0;
+}
+
+static inline struct ceph_vino ceph_vino(struct inode *inode)
+{
+	return ceph_inode(inode)->i_vino;
+}
+
+/* for printf-style formatting */
+#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
+
+static inline u64 ceph_ino(struct inode *inode)
+{
+	return ceph_inode(inode)->i_vino.ino;
+}
+static inline u64 ceph_snap(struct inode *inode)
+{
+	return ceph_inode(inode)->i_vino.snap;
+}
+
+static inline int ceph_ino_compare(struct inode *inode, void *data)
+{
+	struct ceph_vino *pvino = (struct ceph_vino *)data;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	return ci->i_vino.ino == pvino->ino &&
+		ci->i_vino.snap == pvino->snap;
+}
+
+static inline struct inode *ceph_find_inode(struct super_block *sb,
+					    struct ceph_vino vino)
+{
+	ino_t t = ceph_vino_to_ino(vino);
+	return ilookup5(sb, t, ceph_ino_compare, &vino);
+}
+
+
+/*
+ * caps helpers
+ */
+static inline bool __ceph_is_any_real_caps(struct ceph_inode_info *ci)
+{
+	return !RB_EMPTY_ROOT(&ci->i_caps);
+}
+
+extern int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented);
+extern int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int t);
+extern int __ceph_caps_issued_other(struct ceph_inode_info *ci,
+				    struct ceph_cap *cap);
+
+static inline int ceph_caps_issued(struct ceph_inode_info *ci)
+{
+	int issued;
+	spin_lock(&ci->vfs_inode.i_lock);
+	issued = __ceph_caps_issued(ci, NULL);
+	spin_unlock(&ci->vfs_inode.i_lock);
+	return issued;
+}
+
+static inline int ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask,
+					int touch)
+{
+	int r;
+	spin_lock(&ci->vfs_inode.i_lock);
+	r = __ceph_caps_issued_mask(ci, mask, touch);
+	spin_unlock(&ci->vfs_inode.i_lock);
+	return r;
+}
+
+static inline int __ceph_caps_dirty(struct ceph_inode_info *ci)
+{
+	return ci->i_dirty_caps | ci->i_flushing_caps;
+}
+extern void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask);
+
+extern int ceph_caps_revoking(struct ceph_inode_info *ci, int mask);
+extern int __ceph_caps_used(struct ceph_inode_info *ci);
+
+extern int __ceph_caps_file_wanted(struct ceph_inode_info *ci);
+
+/*
+ * wanted, by virtue of open file modes AND cap refs (buffered/cached data)
+ */
+static inline int __ceph_caps_wanted(struct ceph_inode_info *ci)
+{
+	int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci);
+	if (w & CEPH_CAP_FILE_BUFFER)
+		w |= CEPH_CAP_FILE_EXCL;  /* we want EXCL if dirty data */
+	return w;
+}
+
+/* what the mds thinks we want */
+extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci);
+
+extern void ceph_caps_init(void);
+extern void ceph_caps_finalize(void);
+extern void ceph_adjust_min_caps(int delta);
+extern int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need);
+extern int ceph_unreserve_caps(struct ceph_cap_reservation *ctx);
+extern void ceph_reservation_status(struct ceph_client *client,
+				    int *total, int *avail, int *used,
+				    int *reserved, int *min);
+
+static inline struct ceph_client *ceph_inode_to_client(struct inode *inode)
+{
+	return (struct ceph_client *)inode->i_sb->s_fs_info;
+}
+
+static inline struct ceph_client *ceph_sb_to_client(struct super_block *sb)
+{
+	return (struct ceph_client *)sb->s_fs_info;
+}
+
+
+/*
+ * we keep buffered readdir results attached to file->private_data
+ */
+struct ceph_file_info {
+	int fmode;     /* initialized on open */
+
+	/* readdir: position within the dir */
+	u32 frag;
+	struct ceph_mds_request *last_readdir;
+	int at_end;
+
+	/* readdir: position within a frag */
+	unsigned offset;       /* offset of last chunk, adjusted for . and .. */
+	u64 next_offset;       /* offset of next chunk (last_name's + 1) */
+	char *last_name;       /* last entry in previous chunk */
+	struct dentry *dentry; /* next dentry (for dcache readdir) */
+	unsigned long dir_release_count;
+
+	/* used for -o dirstat read() on directory thing */
+	char *dir_info;
+	int dir_info_len;
+};
+
+
+
+/*
+ * snapshots
+ */
+
+/*
+ * A "snap context" is the set of existing snapshots when we
+ * write data.  It is used by the OSD to guide its COW behavior.
+ *
+ * The ceph_snap_context is refcounted, and attached to each dirty
+ * page, indicating which context the dirty data belonged when it was
+ * dirtied.
+ */
+struct ceph_snap_context {
+	atomic_t nref;
+	u64 seq;
+	int num_snaps;
+	u64 snaps[];
+};
+
+static inline struct ceph_snap_context *
+ceph_get_snap_context(struct ceph_snap_context *sc)
+{
+	/*
+	printk("get_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
+	       atomic_read(&sc->nref)+1);
+	*/
+	if (sc)
+		atomic_inc(&sc->nref);
+	return sc;
+}
+
+static inline void ceph_put_snap_context(struct ceph_snap_context *sc)
+{
+	if (!sc)
+		return;
+	/*
+	printk("put_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
+	       atomic_read(&sc->nref)-1);
+	*/
+	if (atomic_dec_and_test(&sc->nref)) {
+		/*printk(" deleting snap_context %p\n", sc);*/
+		kfree(sc);
+	}
+}
+
+/*
+ * A "snap realm" describes a subset of the file hierarchy sharing
+ * the same set of snapshots that apply to it.  The realms themselves
+ * are organized into a hierarchy, such that children inherit (some of)
+ * the snapshots of their parents.
+ *
+ * All inodes within the realm that have capabilities are linked into a
+ * per-realm list.
+ */
+struct ceph_snap_realm {
+	u64 ino;
+	atomic_t nref;
+	struct rb_node node;
+
+	u64 created, seq;
+	u64 parent_ino;
+	u64 parent_since;   /* snapid when our current parent became so */
+
+	u64 *prior_parent_snaps;      /* snaps inherited from any parents we */
+	int num_prior_parent_snaps;   /*  had prior to parent_since */
+	u64 *snaps;                   /* snaps specific to this realm */
+	int num_snaps;
+
+	struct ceph_snap_realm *parent;
+	struct list_head children;       /* list of child realms */
+	struct list_head child_item;
+
+	struct list_head empty_item;     /* if i have ref==0 */
+
+	/* the current set of snaps for this realm */
+	struct ceph_snap_context *cached_context;
+
+	struct list_head inodes_with_caps;
+	spinlock_t inodes_with_caps_lock;
+};
+
+
+
+/*
+ * calculate the number of pages a given length and offset map onto,
+ * if we align the data.
+ */
+static inline int calc_pages_for(u64 off, u64 len)
+{
+	return ((off+len+PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT) -
+		(off >> PAGE_CACHE_SHIFT);
+}
+
+
+
+/* snap.c */
+struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
+					       u64 ino);
+extern void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
+				struct ceph_snap_realm *realm);
+extern void ceph_put_snap_realm(struct ceph_mds_client *mdsc,
+				struct ceph_snap_realm *realm);
+extern int ceph_update_snap_trace(struct ceph_mds_client *m,
+				  void *p, void *e, bool deletion);
+extern void ceph_handle_snap(struct ceph_mds_client *mdsc,
+			     struct ceph_mds_session *session,
+			     struct ceph_msg *msg);
+extern void ceph_queue_cap_snap(struct ceph_inode_info *ci);
+extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
+				  struct ceph_cap_snap *capsnap);
+extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc);
+
+/*
+ * a cap_snap is "pending" if it is still awaiting an in-progress
+ * sync write (that may/may not still update size, mtime, etc.).
+ */
+static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci)
+{
+	return !list_empty(&ci->i_cap_snaps) &&
+		list_entry(ci->i_cap_snaps.prev, struct ceph_cap_snap,
+			   ci_item)->writing;
+}
+
+
+/* super.c */
+extern struct kmem_cache *ceph_inode_cachep;
+extern struct kmem_cache *ceph_cap_cachep;
+extern struct kmem_cache *ceph_dentry_cachep;
+extern struct kmem_cache *ceph_file_cachep;
+
+extern const char *ceph_msg_type_name(int type);
+extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid);
+
+#define FSID_FORMAT "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-" \
+	"%02x%02x%02x%02x%02x%02x"
+#define PR_FSID(f) (f)->fsid[0], (f)->fsid[1], (f)->fsid[2], (f)->fsid[3], \
+		(f)->fsid[4], (f)->fsid[5], (f)->fsid[6], (f)->fsid[7],    \
+		(f)->fsid[8], (f)->fsid[9], (f)->fsid[10], (f)->fsid[11],  \
+		(f)->fsid[12], (f)->fsid[13], (f)->fsid[14], (f)->fsid[15]
+
+/* inode.c */
+extern const struct inode_operations ceph_file_iops;
+
+extern struct inode *ceph_alloc_inode(struct super_block *sb);
+extern void ceph_destroy_inode(struct inode *inode);
+
+extern struct inode *ceph_get_inode(struct super_block *sb,
+				    struct ceph_vino vino);
+extern struct inode *ceph_get_snapdir(struct inode *parent);
+extern int ceph_fill_file_size(struct inode *inode, int issued,
+			       u32 truncate_seq, u64 truncate_size, u64 size);
+extern void ceph_fill_file_time(struct inode *inode, int issued,
+				u64 time_warp_seq, struct timespec *ctime,
+				struct timespec *mtime, struct timespec *atime);
+extern int ceph_fill_trace(struct super_block *sb,
+			   struct ceph_mds_request *req,
+			   struct ceph_mds_session *session);
+extern int ceph_readdir_prepopulate(struct ceph_mds_request *req,
+				    struct ceph_mds_session *session);
+
+extern int ceph_inode_holds_cap(struct inode *inode, int mask);
+
+extern int ceph_inode_set_size(struct inode *inode, loff_t size);
+extern void __ceph_do_pending_vmtruncate(struct inode *inode);
+extern void ceph_queue_vmtruncate(struct inode *inode);
+
+extern void ceph_queue_invalidate(struct inode *inode);
+extern void ceph_queue_writeback(struct inode *inode);
+
+extern int ceph_do_getattr(struct inode *inode, int mask);
+extern int ceph_permission(struct inode *inode, int mask);
+extern int ceph_setattr(struct dentry *dentry, struct iattr *attr);
+extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
+			struct kstat *stat);
+
+/* xattr.c */
+extern int ceph_setxattr(struct dentry *, const char *, const void *,
+			 size_t, int);
+extern ssize_t ceph_getxattr(struct dentry *, const char *, void *, size_t);
+extern ssize_t ceph_listxattr(struct dentry *, char *, size_t);
+extern int ceph_removexattr(struct dentry *, const char *);
+extern void __ceph_build_xattrs_blob(struct ceph_inode_info *ci);
+extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci);
+
+/* caps.c */
+extern const char *ceph_cap_string(int c);
+extern void ceph_handle_caps(struct ceph_mds_session *session,
+			     struct ceph_msg *msg);
+extern int ceph_add_cap(struct inode *inode,
+			struct ceph_mds_session *session, u64 cap_id,
+			int fmode, unsigned issued, unsigned wanted,
+			unsigned cap, unsigned seq, u64 realmino, int flags,
+			struct ceph_cap_reservation *caps_reservation);
+extern void __ceph_remove_cap(struct ceph_cap *cap);
+static inline void ceph_remove_cap(struct ceph_cap *cap)
+{
+	struct inode *inode = &cap->ci->vfs_inode;
+	spin_lock(&inode->i_lock);
+	__ceph_remove_cap(cap);
+	spin_unlock(&inode->i_lock);
+}
+extern void ceph_put_cap(struct ceph_cap *cap);
+
+extern void ceph_queue_caps_release(struct inode *inode);
+extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc);
+extern int ceph_fsync(struct file *file, int datasync);
+extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
+				    struct ceph_mds_session *session);
+extern int ceph_get_cap_mds(struct inode *inode);
+extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps);
+extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had);
+extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
+				       struct ceph_snap_context *snapc);
+extern void __ceph_flush_snaps(struct ceph_inode_info *ci,
+			       struct ceph_mds_session **psession);
+extern void ceph_check_caps(struct ceph_inode_info *ci, int flags,
+			    struct ceph_mds_session *session);
+extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc);
+extern void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc);
+
+extern int ceph_encode_inode_release(void **p, struct inode *inode,
+				     int mds, int drop, int unless, int force);
+extern int ceph_encode_dentry_release(void **p, struct dentry *dn,
+				      int mds, int drop, int unless);
+
+extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
+			 int *got, loff_t endoff);
+
+/* for counting open files by mode */
+static inline void __ceph_get_fmode(struct ceph_inode_info *ci, int mode)
+{
+	ci->i_nr_by_mode[mode]++;
+}
+extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode);
+
+/* addr.c */
+extern const struct address_space_operations ceph_aops;
+extern int ceph_mmap(struct file *file, struct vm_area_struct *vma);
+
+/* file.c */
+extern const struct file_operations ceph_file_fops;
+extern const struct address_space_operations ceph_aops;
+extern int ceph_open(struct inode *inode, struct file *file);
+extern struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
+				       struct nameidata *nd, int mode,
+				       int locked_dir);
+extern int ceph_release(struct inode *inode, struct file *filp);
+extern void ceph_release_page_vector(struct page **pages, int num_pages);
+
+/* dir.c */
+extern const struct file_operations ceph_dir_fops;
+extern const struct inode_operations ceph_dir_iops;
+extern struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops,
+	ceph_snapdir_dentry_ops;
+
+extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry);
+extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
+					 struct dentry *dentry, int err);
+
+extern void ceph_dentry_lru_add(struct dentry *dn);
+extern void ceph_dentry_lru_touch(struct dentry *dn);
+extern void ceph_dentry_lru_del(struct dentry *dn);
+extern void ceph_invalidate_dentry_lease(struct dentry *dentry);
+
+/*
+ * our d_ops vary depending on whether the inode is live,
+ * snapshotted (read-only), or a virtual ".snap" directory.
+ */
+int ceph_init_dentry(struct dentry *dentry);
+
+
+/* ioctl.c */
+extern long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+
+/* export.c */
+extern const struct export_operations ceph_export_ops;
+
+/* debugfs.c */
+extern int ceph_debugfs_init(void);
+extern void ceph_debugfs_cleanup(void);
+extern int ceph_debugfs_client_init(struct ceph_client *client);
+extern void ceph_debugfs_client_cleanup(struct ceph_client *client);
+
+static inline struct inode *get_dentry_parent_inode(struct dentry *dentry)
+{
+	if (dentry && dentry->d_parent)
+		return dentry->d_parent->d_inode;
+
+	return NULL;
+}
+
+#endif /* _FS_CEPH_SUPER_H */
diff --git a/fs/ceph/types.h b/fs/ceph/types.h
new file mode 100644
index 000000000000..28b35a005ec2
--- /dev/null
+++ b/fs/ceph/types.h
@@ -0,0 +1,29 @@
+#ifndef _FS_CEPH_TYPES_H
+#define _FS_CEPH_TYPES_H
+
+/* needed before including ceph_fs.h */
+#include <linux/in.h>
+#include <linux/types.h>
+#include <linux/fcntl.h>
+#include <linux/string.h>
+
+#include "ceph_fs.h"
+#include "ceph_frag.h"
+#include "ceph_hash.h"
+
+/*
+ * Identify inodes by both their ino AND snapshot id (a u64).
+ */
+struct ceph_vino {
+	u64 ino;
+	u64 snap;
+};
+
+
+/* context for the caps reservation mechanism */
+struct ceph_cap_reservation {
+	int count;
+};
+
+
+#endif
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
new file mode 100644
index 000000000000..68aeebc69681
--- /dev/null
+++ b/fs/ceph/xattr.c
@@ -0,0 +1,840 @@
+#include "ceph_debug.h"
+#include "super.h"
+#include "decode.h"
+
+#include <linux/xattr.h>
+#include <linux/slab.h>
+
+static bool ceph_is_valid_xattr(const char *name)
+{
+	return !strncmp(name, "ceph.", 5) ||
+	       !strncmp(name, XATTR_SECURITY_PREFIX,
+			XATTR_SECURITY_PREFIX_LEN) ||
+	       !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
+	       !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
+}
+
+/*
+ * These define virtual xattrs exposing the recursive directory
+ * statistics and layout metadata.
+ */
+struct ceph_vxattr_cb {
+	bool readonly;
+	char *name;
+	size_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val,
+			      size_t size);
+};
+
+/* directories */
+
+static size_t ceph_vxattrcb_entries(struct ceph_inode_info *ci, char *val,
+					size_t size)
+{
+	return snprintf(val, size, "%lld", ci->i_files + ci->i_subdirs);
+}
+
+static size_t ceph_vxattrcb_files(struct ceph_inode_info *ci, char *val,
+				      size_t size)
+{
+	return snprintf(val, size, "%lld", ci->i_files);
+}
+
+static size_t ceph_vxattrcb_subdirs(struct ceph_inode_info *ci, char *val,
+					size_t size)
+{
+	return snprintf(val, size, "%lld", ci->i_subdirs);
+}
+
+static size_t ceph_vxattrcb_rentries(struct ceph_inode_info *ci, char *val,
+					 size_t size)
+{
+	return snprintf(val, size, "%lld", ci->i_rfiles + ci->i_rsubdirs);
+}
+
+static size_t ceph_vxattrcb_rfiles(struct ceph_inode_info *ci, char *val,
+				       size_t size)
+{
+	return snprintf(val, size, "%lld", ci->i_rfiles);
+}
+
+static size_t ceph_vxattrcb_rsubdirs(struct ceph_inode_info *ci, char *val,
+					 size_t size)
+{
+	return snprintf(val, size, "%lld", ci->i_rsubdirs);
+}
+
+static size_t ceph_vxattrcb_rbytes(struct ceph_inode_info *ci, char *val,
+				       size_t size)
+{
+	return snprintf(val, size, "%lld", ci->i_rbytes);
+}
+
+static size_t ceph_vxattrcb_rctime(struct ceph_inode_info *ci, char *val,
+				       size_t size)
+{
+	return snprintf(val, size, "%ld.%ld", (long)ci->i_rctime.tv_sec,
+			(long)ci->i_rctime.tv_nsec);
+}
+
+static struct ceph_vxattr_cb ceph_dir_vxattrs[] = {
+	{ true, "ceph.dir.entries", ceph_vxattrcb_entries},
+	{ true, "ceph.dir.files", ceph_vxattrcb_files},
+	{ true, "ceph.dir.subdirs", ceph_vxattrcb_subdirs},
+	{ true, "ceph.dir.rentries", ceph_vxattrcb_rentries},
+	{ true, "ceph.dir.rfiles", ceph_vxattrcb_rfiles},
+	{ true, "ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs},
+	{ true, "ceph.dir.rbytes", ceph_vxattrcb_rbytes},
+	{ true, "ceph.dir.rctime", ceph_vxattrcb_rctime},
+	{ true, NULL, NULL }
+};
+
+/* files */
+
+static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
+				   size_t size)
+{
+	int ret;
+
+	ret = snprintf(val, size,
+		"chunk_bytes=%lld\nstripe_count=%lld\nobject_size=%lld\n",
+		(unsigned long long)ceph_file_layout_su(ci->i_layout),
+		(unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
+		(unsigned long long)ceph_file_layout_object_size(ci->i_layout));
+	if (ceph_file_layout_pg_preferred(ci->i_layout))
+		ret += snprintf(val + ret, size, "preferred_osd=%lld\n",
+			    (unsigned long long)ceph_file_layout_pg_preferred(
+				    ci->i_layout));
+	return ret;
+}
+
+static struct ceph_vxattr_cb ceph_file_vxattrs[] = {
+	{ true, "ceph.layout", ceph_vxattrcb_layout},
+	{ NULL, NULL }
+};
+
+static struct ceph_vxattr_cb *ceph_inode_vxattrs(struct inode *inode)
+{
+	if (S_ISDIR(inode->i_mode))
+		return ceph_dir_vxattrs;
+	else if (S_ISREG(inode->i_mode))
+		return ceph_file_vxattrs;
+	return NULL;
+}
+
+static struct ceph_vxattr_cb *ceph_match_vxattr(struct ceph_vxattr_cb *vxattr,
+						const char *name)
+{
+	do {
+		if (strcmp(vxattr->name, name) == 0)
+			return vxattr;
+		vxattr++;
+	} while (vxattr->name);
+	return NULL;
+}
+
+static int __set_xattr(struct ceph_inode_info *ci,
+			   const char *name, int name_len,
+			   const char *val, int val_len,
+			   int dirty,
+			   int should_free_name, int should_free_val,
+			   struct ceph_inode_xattr **newxattr)
+{
+	struct rb_node **p;
+	struct rb_node *parent = NULL;
+	struct ceph_inode_xattr *xattr = NULL;
+	int c;
+	int new = 0;
+
+	p = &ci->i_xattrs.index.rb_node;
+	while (*p) {
+		parent = *p;
+		xattr = rb_entry(parent, struct ceph_inode_xattr, node);
+		c = strncmp(name, xattr->name, min(name_len, xattr->name_len));
+		if (c < 0)
+			p = &(*p)->rb_left;
+		else if (c > 0)
+			p = &(*p)->rb_right;
+		else {
+			if (name_len == xattr->name_len)
+				break;
+			else if (name_len < xattr->name_len)
+				p = &(*p)->rb_left;
+			else
+				p = &(*p)->rb_right;
+		}
+		xattr = NULL;
+	}
+
+	if (!xattr) {
+		new = 1;
+		xattr = *newxattr;
+		xattr->name = name;
+		xattr->name_len = name_len;
+		xattr->should_free_name = should_free_name;
+
+		ci->i_xattrs.count++;
+		dout("__set_xattr count=%d\n", ci->i_xattrs.count);
+	} else {
+		kfree(*newxattr);
+		*newxattr = NULL;
+		if (xattr->should_free_val)
+			kfree((void *)xattr->val);
+
+		if (should_free_name) {
+			kfree((void *)name);
+			name = xattr->name;
+		}
+		ci->i_xattrs.names_size -= xattr->name_len;
+		ci->i_xattrs.vals_size -= xattr->val_len;
+	}
+	ci->i_xattrs.names_size += name_len;
+	ci->i_xattrs.vals_size += val_len;
+	if (val)
+		xattr->val = val;
+	else
+		xattr->val = "";
+
+	xattr->val_len = val_len;
+	xattr->dirty = dirty;
+	xattr->should_free_val = (val && should_free_val);
+
+	if (new) {
+		rb_link_node(&xattr->node, parent, p);
+		rb_insert_color(&xattr->node, &ci->i_xattrs.index);
+		dout("__set_xattr_val p=%p\n", p);
+	}
+
+	dout("__set_xattr_val added %llx.%llx xattr %p %s=%.*s\n",
+	     ceph_vinop(&ci->vfs_inode), xattr, name, val_len, val);
+
+	return 0;
+}
+
+static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci,
+			   const char *name)
+{
+	struct rb_node **p;
+	struct rb_node *parent = NULL;
+	struct ceph_inode_xattr *xattr = NULL;
+	int c;
+
+	p = &ci->i_xattrs.index.rb_node;
+	while (*p) {
+		parent = *p;
+		xattr = rb_entry(parent, struct ceph_inode_xattr, node);
+		c = strncmp(name, xattr->name, xattr->name_len);
+		if (c < 0)
+			p = &(*p)->rb_left;
+		else if (c > 0)
+			p = &(*p)->rb_right;
+		else {
+			dout("__get_xattr %s: found %.*s\n", name,
+			     xattr->val_len, xattr->val);
+			return xattr;
+		}
+	}
+
+	dout("__get_xattr %s: not found\n", name);
+
+	return NULL;
+}
+
+static void __free_xattr(struct ceph_inode_xattr *xattr)
+{
+	BUG_ON(!xattr);
+
+	if (xattr->should_free_name)
+		kfree((void *)xattr->name);
+	if (xattr->should_free_val)
+		kfree((void *)xattr->val);
+
+	kfree(xattr);
+}
+
+static int __remove_xattr(struct ceph_inode_info *ci,
+			  struct ceph_inode_xattr *xattr)
+{
+	if (!xattr)
+		return -EOPNOTSUPP;
+
+	rb_erase(&xattr->node, &ci->i_xattrs.index);
+
+	if (xattr->should_free_name)
+		kfree((void *)xattr->name);
+	if (xattr->should_free_val)
+		kfree((void *)xattr->val);
+
+	ci->i_xattrs.names_size -= xattr->name_len;
+	ci->i_xattrs.vals_size -= xattr->val_len;
+	ci->i_xattrs.count--;
+	kfree(xattr);
+
+	return 0;
+}
+
+static int __remove_xattr_by_name(struct ceph_inode_info *ci,
+			   const char *name)
+{
+	struct rb_node **p;
+	struct ceph_inode_xattr *xattr;
+	int err;
+
+	p = &ci->i_xattrs.index.rb_node;
+	xattr = __get_xattr(ci, name);
+	err = __remove_xattr(ci, xattr);
+	return err;
+}
+
+static char *__copy_xattr_names(struct ceph_inode_info *ci,
+				char *dest)
+{
+	struct rb_node *p;
+	struct ceph_inode_xattr *xattr = NULL;
+
+	p = rb_first(&ci->i_xattrs.index);
+	dout("__copy_xattr_names count=%d\n", ci->i_xattrs.count);
+
+	while (p) {
+		xattr = rb_entry(p, struct ceph_inode_xattr, node);
+		memcpy(dest, xattr->name, xattr->name_len);
+		dest[xattr->name_len] = '\0';
+
+		dout("dest=%s %p (%s) (%d/%d)\n", dest, xattr, xattr->name,
+		     xattr->name_len, ci->i_xattrs.names_size);
+
+		dest += xattr->name_len + 1;
+		p = rb_next(p);
+	}
+
+	return dest;
+}
+
+void __ceph_destroy_xattrs(struct ceph_inode_info *ci)
+{
+	struct rb_node *p, *tmp;
+	struct ceph_inode_xattr *xattr = NULL;
+
+	p = rb_first(&ci->i_xattrs.index);
+
+	dout("__ceph_destroy_xattrs p=%p\n", p);
+
+	while (p) {
+		xattr = rb_entry(p, struct ceph_inode_xattr, node);
+		tmp = p;
+		p = rb_next(tmp);
+		dout("__ceph_destroy_xattrs next p=%p (%.*s)\n", p,
+		     xattr->name_len, xattr->name);
+		rb_erase(tmp, &ci->i_xattrs.index);
+
+		__free_xattr(xattr);
+	}
+
+	ci->i_xattrs.names_size = 0;
+	ci->i_xattrs.vals_size = 0;
+	ci->i_xattrs.index_version = 0;
+	ci->i_xattrs.count = 0;
+	ci->i_xattrs.index = RB_ROOT;
+}
+
+static int __build_xattrs(struct inode *inode)
+{
+	u32 namelen;
+	u32 numattr = 0;
+	void *p, *end;
+	u32 len;
+	const char *name, *val;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	int xattr_version;
+	struct ceph_inode_xattr **xattrs = NULL;
+	int err = 0;
+	int i;
+
+	dout("__build_xattrs() len=%d\n",
+	     ci->i_xattrs.blob ? (int)ci->i_xattrs.blob->vec.iov_len : 0);
+
+	if (ci->i_xattrs.index_version >= ci->i_xattrs.version)
+		return 0; /* already built */
+
+	__ceph_destroy_xattrs(ci);
+
+start:
+	/* updated internal xattr rb tree */
+	if (ci->i_xattrs.blob && ci->i_xattrs.blob->vec.iov_len > 4) {
+		p = ci->i_xattrs.blob->vec.iov_base;
+		end = p + ci->i_xattrs.blob->vec.iov_len;
+		ceph_decode_32_safe(&p, end, numattr, bad);
+		xattr_version = ci->i_xattrs.version;
+		spin_unlock(&inode->i_lock);
+
+		xattrs = kcalloc(numattr, sizeof(struct ceph_xattr *),
+				 GFP_NOFS);
+		err = -ENOMEM;
+		if (!xattrs)
+			goto bad_lock;
+		memset(xattrs, 0, numattr*sizeof(struct ceph_xattr *));
+		for (i = 0; i < numattr; i++) {
+			xattrs[i] = kmalloc(sizeof(struct ceph_inode_xattr),
+					    GFP_NOFS);
+			if (!xattrs[i])
+				goto bad_lock;
+		}
+
+		spin_lock(&inode->i_lock);
+		if (ci->i_xattrs.version != xattr_version) {
+			/* lost a race, retry */
+			for (i = 0; i < numattr; i++)
+				kfree(xattrs[i]);
+			kfree(xattrs);
+			goto start;
+		}
+		err = -EIO;
+		while (numattr--) {
+			ceph_decode_32_safe(&p, end, len, bad);
+			namelen = len;
+			name = p;
+			p += len;
+			ceph_decode_32_safe(&p, end, len, bad);
+			val = p;
+			p += len;
+
+			err = __set_xattr(ci, name, namelen, val, len,
+					  0, 0, 0, &xattrs[numattr]);
+
+			if (err < 0)
+				goto bad;
+		}
+		kfree(xattrs);
+	}
+	ci->i_xattrs.index_version = ci->i_xattrs.version;
+	ci->i_xattrs.dirty = false;
+
+	return err;
+bad_lock:
+	spin_lock(&inode->i_lock);
+bad:
+	if (xattrs) {
+		for (i = 0; i < numattr; i++)
+			kfree(xattrs[i]);
+		kfree(xattrs);
+	}
+	ci->i_xattrs.names_size = 0;
+	return err;
+}
+
+static int __get_required_blob_size(struct ceph_inode_info *ci, int name_size,
+				    int val_size)
+{
+	/*
+	 * 4 bytes for the length, and additional 4 bytes per each xattr name,
+	 * 4 bytes per each value
+	 */
+	int size = 4 + ci->i_xattrs.count*(4 + 4) +
+			     ci->i_xattrs.names_size +
+			     ci->i_xattrs.vals_size;
+	dout("__get_required_blob_size c=%d names.size=%d vals.size=%d\n",
+	     ci->i_xattrs.count, ci->i_xattrs.names_size,
+	     ci->i_xattrs.vals_size);
+
+	if (name_size)
+		size += 4 + 4 + name_size + val_size;
+
+	return size;
+}
+
+/*
+ * If there are dirty xattrs, reencode xattrs into the prealloc_blob
+ * and swap into place.
+ */
+void __ceph_build_xattrs_blob(struct ceph_inode_info *ci)
+{
+	struct rb_node *p;
+	struct ceph_inode_xattr *xattr = NULL;
+	void *dest;
+
+	dout("__build_xattrs_blob %p\n", &ci->vfs_inode);
+	if (ci->i_xattrs.dirty) {
+		int need = __get_required_blob_size(ci, 0, 0);
+
+		BUG_ON(need > ci->i_xattrs.prealloc_blob->alloc_len);
+
+		p = rb_first(&ci->i_xattrs.index);
+		dest = ci->i_xattrs.prealloc_blob->vec.iov_base;
+
+		ceph_encode_32(&dest, ci->i_xattrs.count);
+		while (p) {
+			xattr = rb_entry(p, struct ceph_inode_xattr, node);
+
+			ceph_encode_32(&dest, xattr->name_len);
+			memcpy(dest, xattr->name, xattr->name_len);
+			dest += xattr->name_len;
+			ceph_encode_32(&dest, xattr->val_len);
+			memcpy(dest, xattr->val, xattr->val_len);
+			dest += xattr->val_len;
+
+			p = rb_next(p);
+		}
+
+		/* adjust buffer len; it may be larger than we need */
+		ci->i_xattrs.prealloc_blob->vec.iov_len =
+			dest - ci->i_xattrs.prealloc_blob->vec.iov_base;
+
+		if (ci->i_xattrs.blob)
+			ceph_buffer_put(ci->i_xattrs.blob);
+		ci->i_xattrs.blob = ci->i_xattrs.prealloc_blob;
+		ci->i_xattrs.prealloc_blob = NULL;
+		ci->i_xattrs.dirty = false;
+	}
+}
+
+ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
+		      size_t size)
+{
+	struct inode *inode = dentry->d_inode;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
+	int err;
+	struct ceph_inode_xattr *xattr;
+	struct ceph_vxattr_cb *vxattr = NULL;
+
+	if (!ceph_is_valid_xattr(name))
+		return -ENODATA;
+
+	/* let's see if a virtual xattr was requested */
+	if (vxattrs)
+		vxattr = ceph_match_vxattr(vxattrs, name);
+
+	spin_lock(&inode->i_lock);
+	dout("getxattr %p ver=%lld index_ver=%lld\n", inode,
+	     ci->i_xattrs.version, ci->i_xattrs.index_version);
+
+	if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&
+	    (ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
+		goto get_xattr;
+	} else {
+		spin_unlock(&inode->i_lock);
+		/* get xattrs from mds (if we don't already have them) */
+		err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR);
+		if (err)
+			return err;
+	}
+
+	spin_lock(&inode->i_lock);
+
+	if (vxattr && vxattr->readonly) {
+		err = vxattr->getxattr_cb(ci, value, size);
+		goto out;
+	}
+
+	err = __build_xattrs(inode);
+	if (err < 0)
+		goto out;
+
+get_xattr:
+	err = -ENODATA;  /* == ENOATTR */
+	xattr = __get_xattr(ci, name);
+	if (!xattr) {
+		if (vxattr)
+			err = vxattr->getxattr_cb(ci, value, size);
+		goto out;
+	}
+
+	err = -ERANGE;
+	if (size && size < xattr->val_len)
+		goto out;
+
+	err = xattr->val_len;
+	if (size == 0)
+		goto out;
+
+	memcpy(value, xattr->val, xattr->val_len);
+
+out:
+	spin_unlock(&inode->i_lock);
+	return err;
+}
+
+ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
+{
+	struct inode *inode = dentry->d_inode;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
+	u32 vir_namelen = 0;
+	u32 namelen;
+	int err;
+	u32 len;
+	int i;
+
+	spin_lock(&inode->i_lock);
+	dout("listxattr %p ver=%lld index_ver=%lld\n", inode,
+	     ci->i_xattrs.version, ci->i_xattrs.index_version);
+
+	if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&
+	    (ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
+		goto list_xattr;
+	} else {
+		spin_unlock(&inode->i_lock);
+		err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR);
+		if (err)
+			return err;
+	}
+
+	spin_lock(&inode->i_lock);
+
+	err = __build_xattrs(inode);
+	if (err < 0)
+		goto out;
+
+list_xattr:
+	vir_namelen = 0;
+	/* include virtual dir xattrs */
+	if (vxattrs)
+		for (i = 0; vxattrs[i].name; i++)
+			vir_namelen += strlen(vxattrs[i].name) + 1;
+	/* adding 1 byte per each variable due to the null termination */
+	namelen = vir_namelen + ci->i_xattrs.names_size + ci->i_xattrs.count;
+	err = -ERANGE;
+	if (size && namelen > size)
+		goto out;
+
+	err = namelen;
+	if (size == 0)
+		goto out;
+
+	names = __copy_xattr_names(ci, names);
+
+	/* virtual xattr names, too */
+	if (vxattrs)
+		for (i = 0; vxattrs[i].name; i++) {
+			len = sprintf(names, "%s", vxattrs[i].name);
+			names += len + 1;
+		}
+
+out:
+	spin_unlock(&inode->i_lock);
+	return err;
+}
+
+static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
+			      const char *value, size_t size, int flags)
+{
+	struct ceph_client *client = ceph_sb_to_client(dentry->d_sb);
+	struct inode *inode = dentry->d_inode;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct inode *parent_inode = dentry->d_parent->d_inode;
+	struct ceph_mds_request *req;
+	struct ceph_mds_client *mdsc = &client->mdsc;
+	int err;
+	int i, nr_pages;
+	struct page **pages = NULL;
+	void *kaddr;
+
+	/* copy value into some pages */
+	nr_pages = calc_pages_for(0, size);
+	if (nr_pages) {
+		pages = kmalloc(sizeof(pages[0])*nr_pages, GFP_NOFS);
+		if (!pages)
+			return -ENOMEM;
+		err = -ENOMEM;
+		for (i = 0; i < nr_pages; i++) {
+			pages[i] = __page_cache_alloc(GFP_NOFS);
+			if (!pages[i]) {
+				nr_pages = i;
+				goto out;
+			}
+			kaddr = kmap(pages[i]);
+			memcpy(kaddr, value + i*PAGE_CACHE_SIZE,
+			       min(PAGE_CACHE_SIZE, size-i*PAGE_CACHE_SIZE));
+		}
+	}
+
+	dout("setxattr value=%.*s\n", (int)size, value);
+
+	/* do request */
+	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETXATTR,
+				       USE_AUTH_MDS);
+	if (IS_ERR(req)) {
+		err = PTR_ERR(req);
+		goto out;
+	}
+	req->r_inode = igrab(inode);
+	req->r_inode_drop = CEPH_CAP_XATTR_SHARED;
+	req->r_num_caps = 1;
+	req->r_args.setxattr.flags = cpu_to_le32(flags);
+	req->r_path2 = kstrdup(name, GFP_NOFS);
+
+	req->r_pages = pages;
+	req->r_num_pages = nr_pages;
+	req->r_data_len = size;
+
+	dout("xattr.ver (before): %lld\n", ci->i_xattrs.version);
+	err = ceph_mdsc_do_request(mdsc, parent_inode, req);
+	ceph_mdsc_put_request(req);
+	dout("xattr.ver (after): %lld\n", ci->i_xattrs.version);
+
+out:
+	if (pages) {
+		for (i = 0; i < nr_pages; i++)
+			__free_page(pages[i]);
+		kfree(pages);
+	}
+	return err;
+}
+
+int ceph_setxattr(struct dentry *dentry, const char *name,
+		  const void *value, size_t size, int flags)
+{
+	struct inode *inode = dentry->d_inode;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
+	int err;
+	int name_len = strlen(name);
+	int val_len = size;
+	char *newname = NULL;
+	char *newval = NULL;
+	struct ceph_inode_xattr *xattr = NULL;
+	int issued;
+	int required_blob_size;
+
+	if (ceph_snap(inode) != CEPH_NOSNAP)
+		return -EROFS;
+
+	if (!ceph_is_valid_xattr(name))
+		return -EOPNOTSUPP;
+
+	if (vxattrs) {
+		struct ceph_vxattr_cb *vxattr =
+			ceph_match_vxattr(vxattrs, name);
+		if (vxattr && vxattr->readonly)
+			return -EOPNOTSUPP;
+	}
+
+	/* preallocate memory for xattr name, value, index node */
+	err = -ENOMEM;
+	newname = kmalloc(name_len + 1, GFP_NOFS);
+	if (!newname)
+		goto out;
+	memcpy(newname, name, name_len + 1);
+
+	if (val_len) {
+		newval = kmalloc(val_len + 1, GFP_NOFS);
+		if (!newval)
+			goto out;
+		memcpy(newval, value, val_len);
+		newval[val_len] = '\0';
+	}
+
+	xattr = kmalloc(sizeof(struct ceph_inode_xattr), GFP_NOFS);
+	if (!xattr)
+		goto out;
+
+	spin_lock(&inode->i_lock);
+retry:
+	issued = __ceph_caps_issued(ci, NULL);
+	if (!(issued & CEPH_CAP_XATTR_EXCL))
+		goto do_sync;
+	__build_xattrs(inode);
+
+	required_blob_size = __get_required_blob_size(ci, name_len, val_len);
+
+	if (!ci->i_xattrs.prealloc_blob ||
+	    required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) {
+		struct ceph_buffer *blob = NULL;
+
+		spin_unlock(&inode->i_lock);
+		dout(" preaallocating new blob size=%d\n", required_blob_size);
+		blob = ceph_buffer_new(required_blob_size, GFP_NOFS);
+		if (!blob)
+			goto out;
+		spin_lock(&inode->i_lock);
+		if (ci->i_xattrs.prealloc_blob)
+			ceph_buffer_put(ci->i_xattrs.prealloc_blob);
+		ci->i_xattrs.prealloc_blob = blob;
+		goto retry;
+	}
+
+	dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued));
+	err = __set_xattr(ci, newname, name_len, newval,
+			  val_len, 1, 1, 1, &xattr);
+	__ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
+	ci->i_xattrs.dirty = true;
+	inode->i_ctime = CURRENT_TIME;
+	spin_unlock(&inode->i_lock);
+
+	return err;
+
+do_sync:
+	spin_unlock(&inode->i_lock);
+	err = ceph_sync_setxattr(dentry, name, value, size, flags);
+out:
+	kfree(newname);
+	kfree(newval);
+	kfree(xattr);
+	return err;
+}
+
+static int ceph_send_removexattr(struct dentry *dentry, const char *name)
+{
+	struct ceph_client *client = ceph_sb_to_client(dentry->d_sb);
+	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct inode *inode = dentry->d_inode;
+	struct inode *parent_inode = dentry->d_parent->d_inode;
+	struct ceph_mds_request *req;
+	int err;
+
+	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RMXATTR,
+				       USE_AUTH_MDS);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+	req->r_inode = igrab(inode);
+	req->r_inode_drop = CEPH_CAP_XATTR_SHARED;
+	req->r_num_caps = 1;
+	req->r_path2 = kstrdup(name, GFP_NOFS);
+
+	err = ceph_mdsc_do_request(mdsc, parent_inode, req);
+	ceph_mdsc_put_request(req);
+	return err;
+}
+
+int ceph_removexattr(struct dentry *dentry, const char *name)
+{
+	struct inode *inode = dentry->d_inode;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
+	int issued;
+	int err;
+
+	if (ceph_snap(inode) != CEPH_NOSNAP)
+		return -EROFS;
+
+	if (!ceph_is_valid_xattr(name))
+		return -EOPNOTSUPP;
+
+	if (vxattrs) {
+		struct ceph_vxattr_cb *vxattr =
+			ceph_match_vxattr(vxattrs, name);
+		if (vxattr && vxattr->readonly)
+			return -EOPNOTSUPP;
+	}
+
+	spin_lock(&inode->i_lock);
+	__build_xattrs(inode);
+	issued = __ceph_caps_issued(ci, NULL);
+	dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued));
+
+	if (!(issued & CEPH_CAP_XATTR_EXCL))
+		goto do_sync;
+
+	err = __remove_xattr_by_name(ceph_inode(inode), name);
+	__ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
+	ci->i_xattrs.dirty = true;
+	inode->i_ctime = CURRENT_TIME;
+
+	spin_unlock(&inode->i_lock);
+
+	return err;
+do_sync:
+	spin_unlock(&inode->i_lock);
+	err = ceph_send_removexattr(dentry, name);
+	return err;
+}
+
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 7b2600b380d7..bc0025cdd1c9 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -1,3 +1,8 @@
+Version 1.62
+------------
+Add sockopt=TCP_NODELAY mount option. EA (xattr) routines hardened
+to more strictly handle corrupt frames.
+
 Version 1.61
 ------------
 Fix append problem to Samba servers (files opened with O_APPEND could
diff --git a/fs/cifs/asn1.c b/fs/cifs/asn1.c
index 20692fbfdb24..cfd1ce34e0bc 100644
--- a/fs/cifs/asn1.c
+++ b/fs/cifs/asn1.c
@@ -136,7 +136,7 @@ asn1_enum_decode(struct asn1_ctx *ctx, __le32 *val)
 		return 0;
 	}
 
-	ch = *(ctx->pointer)++; /* ch has 0xa, ptr points to lenght octet */
+	ch = *(ctx->pointer)++; /* ch has 0xa, ptr points to length octet */
 	if ((ch) == ASN1_ENUM)  /* if ch value is ENUM, 0xa */
 		*val = *(++(ctx->pointer)); /* value has enum value */
 	else
@@ -492,17 +492,13 @@ compare_oid(unsigned long *oid1, unsigned int oid1len,
 
 int
 decode_negTokenInit(unsigned char *security_blob, int length,
-		    enum securityEnum *secType)
+		    struct TCP_Server_Info *server)
 {
 	struct asn1_ctx ctx;
 	unsigned char *end;
 	unsigned char *sequence_end;
 	unsigned long *oid = NULL;
 	unsigned int cls, con, tag, oidlen, rc;
-	bool use_ntlmssp = false;
-	bool use_kerberos = false;
-	bool use_kerberosu2u = false;
-	bool use_mskerberos = false;
 
 	/* cifs_dump_mem(" Received SecBlob ", security_blob, length); */
 
@@ -510,11 +506,11 @@ decode_negTokenInit(unsigned char *security_blob, int length,
 
 	/* GSSAPI header */
 	if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
-		cFYI(1, ("Error decoding negTokenInit header"));
+		cFYI(1, "Error decoding negTokenInit header");
 		return 0;
 	} else if ((cls != ASN1_APL) || (con != ASN1_CON)
 		   || (tag != ASN1_EOC)) {
-		cFYI(1, ("cls = %d con = %d tag = %d", cls, con, tag));
+		cFYI(1, "cls = %d con = %d tag = %d", cls, con, tag);
 		return 0;
 	}
 
@@ -535,56 +531,52 @@ decode_negTokenInit(unsigned char *security_blob, int length,
 
 	/* SPNEGO OID not present or garbled -- bail out */
 	if (!rc) {
-		cFYI(1, ("Error decoding negTokenInit header"));
+		cFYI(1, "Error decoding negTokenInit header");
 		return 0;
 	}
 
 	/* SPNEGO */
 	if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
-		cFYI(1, ("Error decoding negTokenInit"));
+		cFYI(1, "Error decoding negTokenInit");
 		return 0;
 	} else if ((cls != ASN1_CTX) || (con != ASN1_CON)
 		   || (tag != ASN1_EOC)) {
-		cFYI(1,
-		     ("cls = %d con = %d tag = %d end = %p (%d) exit 0",
-		      cls, con, tag, end, *end));
+		cFYI(1, "cls = %d con = %d tag = %d end = %p (%d) exit 0",
+		     cls, con, tag, end, *end);
 		return 0;
 	}
 
 	/* negTokenInit */
 	if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
-		cFYI(1, ("Error decoding negTokenInit"));
+		cFYI(1, "Error decoding negTokenInit");
 		return 0;
 	} else if ((cls != ASN1_UNI) || (con != ASN1_CON)
 		   || (tag != ASN1_SEQ)) {
-		cFYI(1,
-		     ("cls = %d con = %d tag = %d end = %p (%d) exit 1",
-		      cls, con, tag, end, *end));
+		cFYI(1, "cls = %d con = %d tag = %d end = %p (%d) exit 1",
+		     cls, con, tag, end, *end);
 		return 0;
 	}
 
 	/* sequence */
 	if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
-		cFYI(1, ("Error decoding 2nd part of negTokenInit"));
+		cFYI(1, "Error decoding 2nd part of negTokenInit");
 		return 0;
 	} else if ((cls != ASN1_CTX) || (con != ASN1_CON)
 		   || (tag != ASN1_EOC)) {
-		cFYI(1,
-		     ("cls = %d con = %d tag = %d end = %p (%d) exit 0",
-		      cls, con, tag, end, *end));
+		cFYI(1, "cls = %d con = %d tag = %d end = %p (%d) exit 0",
+		     cls, con, tag, end, *end);
 		return 0;
 	}
 
 	/* sequence of */
 	if (asn1_header_decode
 	    (&ctx, &sequence_end, &cls, &con, &tag) == 0) {
-		cFYI(1, ("Error decoding 2nd part of negTokenInit"));
+		cFYI(1, "Error decoding 2nd part of negTokenInit");
 		return 0;
 	} else if ((cls != ASN1_UNI) || (con != ASN1_CON)
 		   || (tag != ASN1_SEQ)) {
-		cFYI(1,
-		     ("cls = %d con = %d tag = %d end = %p (%d) exit 1",
-		      cls, con, tag, end, *end));
+		cFYI(1, "cls = %d con = %d tag = %d end = %p (%d) exit 1",
+		     cls, con, tag, end, *end);
 		return 0;
 	}
 
@@ -592,37 +584,33 @@ decode_negTokenInit(unsigned char *security_blob, int length,
 	while (!asn1_eoc_decode(&ctx, sequence_end)) {
 		rc = asn1_header_decode(&ctx, &end, &cls, &con, &tag);
 		if (!rc) {
-			cFYI(1,
-			     ("Error decoding negTokenInit hdr exit2"));
+			cFYI(1, "Error decoding negTokenInit hdr exit2");
 			return 0;
 		}
 		if ((tag == ASN1_OJI) && (con == ASN1_PRI)) {
 			if (asn1_oid_decode(&ctx, end, &oid, &oidlen)) {
 
-				cFYI(1, ("OID len = %d oid = 0x%lx 0x%lx "
-					 "0x%lx 0x%lx", oidlen, *oid,
-					 *(oid + 1), *(oid + 2), *(oid + 3)));
+				cFYI(1, "OID len = %d oid = 0x%lx 0x%lx "
+					"0x%lx 0x%lx", oidlen, *oid,
+					*(oid + 1), *(oid + 2), *(oid + 3));
 
 				if (compare_oid(oid, oidlen, MSKRB5_OID,
-						MSKRB5_OID_LEN) &&
-						!use_mskerberos)
-					use_mskerberos = true;
+						MSKRB5_OID_LEN))
+					server->sec_mskerberos = true;
 				else if (compare_oid(oid, oidlen, KRB5U2U_OID,
-						     KRB5U2U_OID_LEN) &&
-						     !use_kerberosu2u)
-					use_kerberosu2u = true;
+						     KRB5U2U_OID_LEN))
+					server->sec_kerberosu2u = true;
 				else if (compare_oid(oid, oidlen, KRB5_OID,
-						     KRB5_OID_LEN) &&
-						     !use_kerberos)
-					use_kerberos = true;
+						     KRB5_OID_LEN))
+					server->sec_kerberos = true;
 				else if (compare_oid(oid, oidlen, NTLMSSP_OID,
 						     NTLMSSP_OID_LEN))
-					use_ntlmssp = true;
+					server->sec_ntlmssp = true;
 
 				kfree(oid);
 			}
 		} else {
-			cFYI(1, ("Should be an oid what is going on?"));
+			cFYI(1, "Should be an oid what is going on?");
 		}
 	}
 
@@ -632,54 +620,47 @@ decode_negTokenInit(unsigned char *security_blob, int length,
 		   no mechListMic (e.g. NTLMSSP instead of KRB5) */
 		if (ctx.error == ASN1_ERR_DEC_EMPTY)
 			goto decode_negtoken_exit;
-		cFYI(1, ("Error decoding last part negTokenInit exit3"));
+		cFYI(1, "Error decoding last part negTokenInit exit3");
 		return 0;
 	} else if ((cls != ASN1_CTX) || (con != ASN1_CON)) {
 		/* tag = 3 indicating mechListMIC */
-		cFYI(1, ("Exit 4 cls = %d con = %d tag = %d end = %p (%d)",
-			 cls, con, tag, end, *end));
+		cFYI(1, "Exit 4 cls = %d con = %d tag = %d end = %p (%d)",
+			cls, con, tag, end, *end);
 		return 0;
 	}
 
 	/* sequence */
 	if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
-		cFYI(1, ("Error decoding last part negTokenInit exit5"));
+		cFYI(1, "Error decoding last part negTokenInit exit5");
 		return 0;
 	} else if ((cls != ASN1_UNI) || (con != ASN1_CON)
 		   || (tag != ASN1_SEQ)) {
-		cFYI(1, ("cls = %d con = %d tag = %d end = %p (%d)",
-			cls, con, tag, end, *end));
+		cFYI(1, "cls = %d con = %d tag = %d end = %p (%d)",
+			cls, con, tag, end, *end);
 	}
 
 	/* sequence of */
 	if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
-		cFYI(1, ("Error decoding last part negTokenInit exit 7"));
+		cFYI(1, "Error decoding last part negTokenInit exit 7");
 		return 0;
 	} else if ((cls != ASN1_CTX) || (con != ASN1_CON)) {
-		cFYI(1, ("Exit 8 cls = %d con = %d tag = %d end = %p (%d)",
-			 cls, con, tag, end, *end));
+		cFYI(1, "Exit 8 cls = %d con = %d tag = %d end = %p (%d)",
+			cls, con, tag, end, *end);
 		return 0;
 	}
 
 	/* general string */
 	if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
-		cFYI(1, ("Error decoding last part negTokenInit exit9"));
+		cFYI(1, "Error decoding last part negTokenInit exit9");
 		return 0;
 	} else if ((cls != ASN1_UNI) || (con != ASN1_PRI)
 		   || (tag != ASN1_GENSTR)) {
-		cFYI(1, ("Exit10 cls = %d con = %d tag = %d end = %p (%d)",
-			 cls, con, tag, end, *end));
+		cFYI(1, "Exit10 cls = %d con = %d tag = %d end = %p (%d)",
+			cls, con, tag, end, *end);
 		return 0;
 	}
-	cFYI(1, ("Need to call asn1_octets_decode() function for %s",
-		 ctx.pointer));	/* is this UTF-8 or ASCII? */
+	cFYI(1, "Need to call asn1_octets_decode() function for %s",
+		ctx.pointer);	/* is this UTF-8 or ASCII? */
 decode_negtoken_exit:
-	if (use_kerberos)
-		*secType = Kerberos;
-	else if (use_mskerberos)
-		*secType = MSKerberos;
-	else if (use_ntlmssp)
-		*secType = RawNTLMSSP;
-
 	return 1;
 }
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 42cec2a7c0cf..4fce6e61b34e 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -60,10 +60,10 @@ cifs_dump_mem(char *label, void *data, int length)
 #ifdef CONFIG_CIFS_DEBUG2
 void cifs_dump_detail(struct smb_hdr *smb)
 {
-	cERROR(1, ("Cmd: %d Err: 0x%x Flags: 0x%x Flgs2: 0x%x Mid: %d Pid: %d",
+	cERROR(1, "Cmd: %d Err: 0x%x Flags: 0x%x Flgs2: 0x%x Mid: %d Pid: %d",
 		  smb->Command, smb->Status.CifsError,
-		  smb->Flags, smb->Flags2, smb->Mid, smb->Pid));
-	cERROR(1, ("smb buf %p len %d", smb, smbCalcSize_LE(smb)));
+		  smb->Flags, smb->Flags2, smb->Mid, smb->Pid);
+	cERROR(1, "smb buf %p len %d", smb, smbCalcSize_LE(smb));
 }
 
 
@@ -75,25 +75,25 @@ void cifs_dump_mids(struct TCP_Server_Info *server)
 	if (server == NULL)
 		return;
 
-	cERROR(1, ("Dump pending requests:"));
+	cERROR(1, "Dump pending requests:");
 	spin_lock(&GlobalMid_Lock);
 	list_for_each(tmp, &server->pending_mid_q) {
 		mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
-		cERROR(1, ("State: %d Cmd: %d Pid: %d Tsk: %p Mid %d",
+		cERROR(1, "State: %d Cmd: %d Pid: %d Tsk: %p Mid %d",
 			mid_entry->midState,
 			(int)mid_entry->command,
 			mid_entry->pid,
 			mid_entry->tsk,
-			mid_entry->mid));
+			mid_entry->mid);
 #ifdef CONFIG_CIFS_STATS2
-		cERROR(1, ("IsLarge: %d buf: %p time rcv: %ld now: %ld",
+		cERROR(1, "IsLarge: %d buf: %p time rcv: %ld now: %ld",
 			mid_entry->largeBuf,
 			mid_entry->resp_buf,
 			mid_entry->when_received,
-			jiffies));
+			jiffies);
 #endif /* STATS2 */
-		cERROR(1, ("IsMult: %d IsEnd: %d", mid_entry->multiRsp,
-			  mid_entry->multiEnd));
+		cERROR(1, "IsMult: %d IsEnd: %d", mid_entry->multiRsp,
+			  mid_entry->multiEnd);
 		if (mid_entry->resp_buf) {
 			cifs_dump_detail(mid_entry->resp_buf);
 			cifs_dump_mem("existing buf: ",
@@ -716,7 +716,7 @@ static const struct file_operations cifs_multiuser_mount_proc_fops = {
 
 static int cifs_security_flags_proc_show(struct seq_file *m, void *v)
 {
-	seq_printf(m, "0x%x\n", extended_security);
+	seq_printf(m, "0x%x\n", global_secflags);
 	return 0;
 }
 
@@ -744,13 +744,13 @@ static ssize_t cifs_security_flags_proc_write(struct file *file,
 		/* single char or single char followed by null */
 		c = flags_string[0];
 		if (c == '0' || c == 'n' || c == 'N') {
-			extended_security = CIFSSEC_DEF; /* default */
+			global_secflags = CIFSSEC_DEF; /* default */
 			return count;
 		} else if (c == '1' || c == 'y' || c == 'Y') {
-			extended_security = CIFSSEC_MAX;
+			global_secflags = CIFSSEC_MAX;
 			return count;
 		} else if (!isdigit(c)) {
-			cERROR(1, ("invalid flag %c", c));
+			cERROR(1, "invalid flag %c", c);
 			return -EINVAL;
 		}
 	}
@@ -758,26 +758,26 @@ static ssize_t cifs_security_flags_proc_write(struct file *file,
 
 	flags = simple_strtoul(flags_string, NULL, 0);
 
-	cFYI(1, ("sec flags 0x%x", flags));
+	cFYI(1, "sec flags 0x%x", flags);
 
 	if (flags <= 0)  {
-		cERROR(1, ("invalid security flags %s", flags_string));
+		cERROR(1, "invalid security flags %s", flags_string);
 		return -EINVAL;
 	}
 
 	if (flags & ~CIFSSEC_MASK) {
-		cERROR(1, ("attempt to set unsupported security flags 0x%x",
-			flags & ~CIFSSEC_MASK));
+		cERROR(1, "attempt to set unsupported security flags 0x%x",
+			flags & ~CIFSSEC_MASK);
 		return -EINVAL;
 	}
 	/* flags look ok - update the global security flags for cifs module */
-	extended_security = flags;
-	if (extended_security & CIFSSEC_MUST_SIGN) {
+	global_secflags = flags;
+	if (global_secflags & CIFSSEC_MUST_SIGN) {
 		/* requiring signing implies signing is allowed */
-		extended_security |= CIFSSEC_MAY_SIGN;
-		cFYI(1, ("packet signing now required"));
-	} else if ((extended_security & CIFSSEC_MAY_SIGN) == 0) {
-		cFYI(1, ("packet signing disabled"));
+		global_secflags |= CIFSSEC_MAY_SIGN;
+		cFYI(1, "packet signing now required");
+	} else if ((global_secflags & CIFSSEC_MAY_SIGN) == 0) {
+		cFYI(1, "packet signing disabled");
 	}
 	/* BB should we turn on MAY flags for other MUST options? */
 	return count;
diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h
index 5eb3b83bbfa7..aa316891ac0c 100644
--- a/fs/cifs/cifs_debug.h
+++ b/fs/cifs/cifs_debug.h
@@ -43,34 +43,54 @@ void dump_smb(struct smb_hdr *, int);
  */
 #ifdef CIFS_DEBUG
 
-
 /* information message: e.g., configuration, major event */
 extern int cifsFYI;
-#define cifsfyi(format,arg...) if (cifsFYI & CIFS_INFO) printk(KERN_DEBUG " " __FILE__ ": " format "\n" "" , ## arg)
+#define cifsfyi(fmt, arg...)						\
+do {									\
+	if (cifsFYI & CIFS_INFO)					\
+		printk(KERN_DEBUG "%s: " fmt "\n", __FILE__, ##arg);	\
+} while (0)
 
-#define cFYI(button,prspec) if (button) cifsfyi prspec
+#define cFYI(set, fmt, arg...)			\
+do {						\
+	if (set)				\
+		cifsfyi(fmt, ##arg);		\
+} while (0)
 
-#define cifswarn(format, arg...) printk(KERN_WARNING ": " format "\n" , ## arg)
+#define cifswarn(fmt, arg...)			\
+	printk(KERN_WARNING fmt "\n", ##arg)
 
 /* debug event message: */
 extern int cifsERROR;
 
-#define cEVENT(format,arg...) if (cifsERROR) printk(KERN_EVENT __FILE__ ": " format "\n" , ## arg)
+#define cEVENT(fmt, arg...)						\
+do {									\
+	if (cifsERROR)							\
+		printk(KERN_EVENT "%s: " fmt "\n", __FILE__, ##arg);	\
+} while (0)
 
 /* error event message: e.g., i/o error */
-#define cifserror(format,arg...) if (cifsERROR) printk(KERN_ERR " CIFS VFS: " format "\n" "" , ## arg)
+#define cifserror(fmt, arg...)					\
+do {								\
+	if (cifsERROR)						\
+		printk(KERN_ERR "CIFS VFS: " fmt "\n", ##arg);	\
+} while (0)
 
-#define cERROR(button, prspec) if (button) cifserror prspec
+#define cERROR(set, fmt, arg...)		\
+do {						\
+	if (set)				\
+		cifserror(fmt, ##arg);		\
+} while (0)
 
 /*
  *	debug OFF
  *	---------
  */
 #else		/* _CIFS_DEBUG */
-#define cERROR(button, prspec)
-#define cEVENT(format, arg...)
-#define cFYI(button, prspec)
-#define cifserror(format, arg...)
+#define cERROR(set, fmt, arg...)
+#define cEVENT(fmt, arg...)
+#define cFYI(set, fmt, arg...)
+#define cifserror(fmt, arg...)
 #endif		/* _CIFS_DEBUG */
 
 #endif				/* _H_CIFS_DEBUG */
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index b44ce0a0711c..ac19a6f3dae0 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -15,6 +15,7 @@
 #include <linux/dcache.h>
 #include <linux/mount.h>
 #include <linux/namei.h>
+#include <linux/slab.h>
 #include <linux/vfs.h>
 #include <linux/fs.h>
 #include "cifsglob.h"
@@ -54,7 +55,7 @@ void cifs_dfs_release_automount_timer(void)
  * Extracts sharename form full UNC.
  * i.e. strips from UNC trailing path that is not part of share
  * name and fixup missing '\' in the begining of DFS node refferal
- * if neccessary.
+ * if necessary.
  * Returns pointer to share name on success or ERR_PTR on error.
  * Caller is responsible for freeing returned string.
  */
@@ -84,8 +85,8 @@ static char *cifs_get_share_name(const char *node_name)
 	/* find server name end */
 	pSep = memchr(UNC+2, '\\', len-2);
 	if (!pSep) {
-		cERROR(1, ("%s: no server name end in node name: %s",
-			__func__, node_name));
+		cERROR(1, "%s: no server name end in node name: %s",
+			__func__, node_name);
 		kfree(UNC);
 		return ERR_PTR(-EINVAL);
 	}
@@ -141,8 +142,8 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
 
 	rc = dns_resolve_server_name_to_ip(*devname, &srvIP);
 	if (rc != 0) {
-		cERROR(1, ("%s: Failed to resolve server part of %s to IP: %d",
-			  __func__, *devname, rc));
+		cERROR(1, "%s: Failed to resolve server part of %s to IP: %d",
+			  __func__, *devname, rc);
 		goto compose_mount_options_err;
 	}
 	/* md_len = strlen(...) + 12 for 'sep+prefixpath='
@@ -216,8 +217,8 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
 		strcat(mountdata, fullpath + ref->path_consumed);
 	}
 
-	/*cFYI(1,("%s: parent mountdata: %s", __func__,sb_mountdata));*/
-	/*cFYI(1, ("%s: submount mountdata: %s", __func__, mountdata ));*/
+	/*cFYI(1, "%s: parent mountdata: %s", __func__,sb_mountdata);*/
+	/*cFYI(1, "%s: submount mountdata: %s", __func__, mountdata );*/
 
 compose_mount_options_out:
 	kfree(srvIP);
@@ -293,11 +294,11 @@ static int add_mount_helper(struct vfsmount *newmnt, struct nameidata *nd,
 
 static void dump_referral(const struct dfs_info3_param *ref)
 {
-	cFYI(1, ("DFS: ref path: %s", ref->path_name));
-	cFYI(1, ("DFS: node path: %s", ref->node_name));
-	cFYI(1, ("DFS: fl: %hd, srv_type: %hd", ref->flags, ref->server_type));
-	cFYI(1, ("DFS: ref_flags: %hd, path_consumed: %hd", ref->ref_flag,
-				ref->path_consumed));
+	cFYI(1, "DFS: ref path: %s", ref->path_name);
+	cFYI(1, "DFS: node path: %s", ref->node_name);
+	cFYI(1, "DFS: fl: %hd, srv_type: %hd", ref->flags, ref->server_type);
+	cFYI(1, "DFS: ref_flags: %hd, path_consumed: %hd", ref->ref_flag,
+				ref->path_consumed);
 }
 
 
@@ -313,7 +314,7 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
 	int rc = 0;
 	struct vfsmount *mnt = ERR_PTR(-ENOENT);
 
-	cFYI(1, ("in %s", __func__));
+	cFYI(1, "in %s", __func__);
 	BUG_ON(IS_ROOT(dentry));
 
 	xid = GetXid();
@@ -351,15 +352,15 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
 		/* connect to a node */
 		len = strlen(referrals[i].node_name);
 		if (len < 2) {
-			cERROR(1, ("%s: Net Address path too short: %s",
-					__func__, referrals[i].node_name));
+			cERROR(1, "%s: Net Address path too short: %s",
+					__func__, referrals[i].node_name);
 			rc = -EINVAL;
 			goto out_err;
 		}
 		mnt = cifs_dfs_do_refmount(nd->path.mnt,
 				nd->path.dentry, referrals + i);
-		cFYI(1, ("%s: cifs_dfs_do_refmount:%s , mnt:%p", __func__,
-					referrals[i].node_name, mnt));
+		cFYI(1, "%s: cifs_dfs_do_refmount:%s , mnt:%p", __func__,
+					referrals[i].node_name, mnt);
 
 		/* complete mount procedure if we accured submount */
 		if (!IS_ERR(mnt))
@@ -377,7 +378,7 @@ out:
 	FreeXid(xid);
 	free_dfs_info_array(referrals, num_referrals);
 	kfree(full_path);
-	cFYI(1, ("leaving %s" , __func__));
+	cFYI(1, "leaving %s" , __func__);
 	return ERR_PTR(rc);
 out_err:
 	path_put(&nd->path);
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index 4797787c6a44..246a167cb913 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -18,6 +18,8 @@
 #ifndef _CIFS_FS_SB_H
 #define _CIFS_FS_SB_H
 
+#include <linux/backing-dev.h>
+
 #define CIFS_MOUNT_NO_PERM      1 /* do not do client vfs_perm check */
 #define CIFS_MOUNT_SET_UID      2 /* set current's euid in create etc. */
 #define CIFS_MOUNT_SERVER_INUM  4 /* inode numbers from uniqueid from server  */
@@ -50,5 +52,6 @@ struct cifs_sb_info {
 #ifdef CONFIG_CIFS_DFS_UPCALL
 	char   *mountdata; /* mount options received at mount time */
 #endif
+	struct backing_dev_info bdi;
 };
 #endif				/* _CIFS_FS_SB_H */
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 8ec7736ce954..379bd7d9c05f 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -20,6 +20,7 @@
  */
 
 #include <linux/list.h>
+#include <linux/slab.h>
 #include <linux/string.h>
 #include <keys/user-type.h>
 #include <linux/key-type.h>
@@ -132,9 +133,9 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
 	dp = description + strlen(description);
 
 	/* for now, only sec=krb5 and sec=mskrb5 are valid */
-	if (server->secType == Kerberos)
+	if (server->sec_kerberos)
 		sprintf(dp, ";sec=krb5");
-	else if (server->secType == MSKerberos)
+	else if (server->sec_mskerberos)
 		sprintf(dp, ";sec=mskrb5");
 	else
 		goto out;
@@ -148,7 +149,7 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
 	dp = description + strlen(description);
 	sprintf(dp, ";pid=0x%x", current->pid);
 
-	cFYI(1, ("key description = %s", description));
+	cFYI(1, "key description = %s", description);
 	spnego_key = request_key(&cifs_spnego_key_type, description, "");
 
 #ifdef CONFIG_CIFS_DEBUG2
diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index 714a542cbafc..430f510a1720 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -19,6 +19,7 @@
  *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 #include <linux/fs.h>
+#include <linux/slab.h>
 #include "cifs_unicode.h"
 #include "cifs_uniupr.h"
 #include "cifspdu.h"
@@ -199,9 +200,8 @@ cifs_strtoUCS(__le16 *to, const char *from, int len,
 		/* works for 2.4.0 kernel or later */
 		charlen = codepage->char2uni(from, len, &wchar_to[i]);
 		if (charlen < 1) {
-			cERROR(1,
-			       ("strtoUCS: char2uni of %d returned %d",
-				(int)*from, charlen));
+			cERROR(1, "strtoUCS: char2uni of %d returned %d",
+				(int)*from, charlen);
 			/* A question mark */
 			to[i] = cpu_to_le16(0x003f);
 			charlen = 1;
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 7dfe0842a6f6..85d7cf7ff2c8 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -22,6 +22,7 @@
  */
 
 #include <linux/fs.h>
+#include <linux/slab.h>
 #include "cifspdu.h"
 #include "cifsglob.h"
 #include "cifsacl.h"
@@ -86,11 +87,11 @@ int match_sid(struct cifs_sid *ctsid)
 				continue; /* all sub_auth values do not match */
 		}
 
-		cFYI(1, ("matching sid: %s\n", wksidarr[i].sidname));
+		cFYI(1, "matching sid: %s\n", wksidarr[i].sidname);
 		return 0; /* sids compare/match */
 	}
 
-	cFYI(1, ("No matching sid"));
+	cFYI(1, "No matching sid");
 	return -1;
 }
 
@@ -207,14 +208,14 @@ static void access_flags_to_mode(__le32 ace_flags, int type, umode_t *pmode,
 			*pbits_to_set &= ~S_IXUGO;
 		return;
 	} else if (type != ACCESS_ALLOWED) {
-		cERROR(1, ("unknown access control type %d", type));
+		cERROR(1, "unknown access control type %d", type);
 		return;
 	}
 	/* else ACCESS_ALLOWED type */
 
 	if (flags & GENERIC_ALL) {
 		*pmode |= (S_IRWXUGO & (*pbits_to_set));
-		cFYI(DBG2, ("all perms"));
+		cFYI(DBG2, "all perms");
 		return;
 	}
 	if ((flags & GENERIC_WRITE) ||
@@ -227,7 +228,7 @@ static void access_flags_to_mode(__le32 ace_flags, int type, umode_t *pmode,
 			((flags & FILE_EXEC_RIGHTS) == FILE_EXEC_RIGHTS))
 		*pmode |= (S_IXUGO & (*pbits_to_set));
 
-	cFYI(DBG2, ("access flags 0x%x mode now 0x%x", flags, *pmode));
+	cFYI(DBG2, "access flags 0x%x mode now 0x%x", flags, *pmode);
 	return;
 }
 
@@ -256,7 +257,7 @@ static void mode_to_access_flags(umode_t mode, umode_t bits_to_use,
 	if (mode & S_IXUGO)
 		*pace_flags |= SET_FILE_EXEC_RIGHTS;
 
-	cFYI(DBG2, ("mode: 0x%x, access flags now 0x%x", mode, *pace_flags));
+	cFYI(DBG2, "mode: 0x%x, access flags now 0x%x", mode, *pace_flags);
 	return;
 }
 
@@ -296,24 +297,24 @@ static void dump_ace(struct cifs_ace *pace, char *end_of_acl)
 	/* validate that we do not go past end of acl */
 
 	if (le16_to_cpu(pace->size) < 16) {
-		cERROR(1, ("ACE too small, %d", le16_to_cpu(pace->size)));
+		cERROR(1, "ACE too small %d", le16_to_cpu(pace->size));
 		return;
 	}
 
 	if (end_of_acl < (char *)pace + le16_to_cpu(pace->size)) {
-		cERROR(1, ("ACL too small to parse ACE"));
+		cERROR(1, "ACL too small to parse ACE");
 		return;
 	}
 
 	num_subauth = pace->sid.num_subauth;
 	if (num_subauth) {
 		int i;
-		cFYI(1, ("ACE revision %d num_auth %d type %d flags %d size %d",
+		cFYI(1, "ACE revision %d num_auth %d type %d flags %d size %d",
 			pace->sid.revision, pace->sid.num_subauth, pace->type,
-			pace->flags, le16_to_cpu(pace->size)));
+			pace->flags, le16_to_cpu(pace->size));
 		for (i = 0; i < num_subauth; ++i) {
-			cFYI(1, ("ACE sub_auth[%d]: 0x%x", i,
-				le32_to_cpu(pace->sid.sub_auth[i])));
+			cFYI(1, "ACE sub_auth[%d]: 0x%x", i,
+				le32_to_cpu(pace->sid.sub_auth[i]));
 		}
 
 		/* BB add length check to make sure that we do not have huge
@@ -346,13 +347,13 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
 
 	/* validate that we do not go past end of acl */
 	if (end_of_acl < (char *)pdacl + le16_to_cpu(pdacl->size)) {
-		cERROR(1, ("ACL too small to parse DACL"));
+		cERROR(1, "ACL too small to parse DACL");
 		return;
 	}
 
-	cFYI(DBG2, ("DACL revision %d size %d num aces %d",
+	cFYI(DBG2, "DACL revision %d size %d num aces %d",
 		le16_to_cpu(pdacl->revision), le16_to_cpu(pdacl->size),
-		le32_to_cpu(pdacl->num_aces)));
+		le32_to_cpu(pdacl->num_aces));
 
 	/* reset rwx permissions for user/group/other.
 	   Also, if num_aces is 0 i.e. DACL has no ACEs,
@@ -436,25 +437,25 @@ static int parse_sid(struct cifs_sid *psid, char *end_of_acl)
 	/* validate that we do not go past end of ACL - sid must be at least 8
 	   bytes long (assuming no sub-auths - e.g. the null SID */
 	if (end_of_acl < (char *)psid + 8) {
-		cERROR(1, ("ACL too small to parse SID %p", psid));
+		cERROR(1, "ACL too small to parse SID %p", psid);
 		return -EINVAL;
 	}
 
 	if (psid->num_subauth) {
 #ifdef CONFIG_CIFS_DEBUG2
 		int i;
-		cFYI(1, ("SID revision %d num_auth %d",
-			psid->revision, psid->num_subauth));
+		cFYI(1, "SID revision %d num_auth %d",
+			psid->revision, psid->num_subauth);
 
 		for (i = 0; i < psid->num_subauth; i++) {
-			cFYI(1, ("SID sub_auth[%d]: 0x%x ", i,
-				le32_to_cpu(psid->sub_auth[i])));
+			cFYI(1, "SID sub_auth[%d]: 0x%x ", i,
+				le32_to_cpu(psid->sub_auth[i]));
 		}
 
 		/* BB add length check to make sure that we do not have huge
 			num auths and therefore go off the end */
-		cFYI(1, ("RID 0x%x",
-			le32_to_cpu(psid->sub_auth[psid->num_subauth-1])));
+		cFYI(1, "RID 0x%x",
+			le32_to_cpu(psid->sub_auth[psid->num_subauth-1]));
 #endif
 	}
 
@@ -481,11 +482,11 @@ static int parse_sec_desc(struct cifs_ntsd *pntsd, int acl_len,
 				le32_to_cpu(pntsd->gsidoffset));
 	dacloffset = le32_to_cpu(pntsd->dacloffset);
 	dacl_ptr = (struct cifs_acl *)((char *)pntsd + dacloffset);
-	cFYI(DBG2, ("revision %d type 0x%x ooffset 0x%x goffset 0x%x "
+	cFYI(DBG2, "revision %d type 0x%x ooffset 0x%x goffset 0x%x "
 		 "sacloffset 0x%x dacloffset 0x%x",
 		 pntsd->revision, pntsd->type, le32_to_cpu(pntsd->osidoffset),
 		 le32_to_cpu(pntsd->gsidoffset),
-		 le32_to_cpu(pntsd->sacloffset), dacloffset));
+		 le32_to_cpu(pntsd->sacloffset), dacloffset);
 /*	cifs_dump_mem("owner_sid: ", owner_sid_ptr, 64); */
 	rc = parse_sid(owner_sid_ptr, end_of_acl);
 	if (rc)
@@ -499,7 +500,7 @@ static int parse_sec_desc(struct cifs_ntsd *pntsd, int acl_len,
 		parse_dacl(dacl_ptr, end_of_acl, owner_sid_ptr,
 			   group_sid_ptr, fattr);
 	else
-		cFYI(1, ("no ACL")); /* BB grant all or default perms? */
+		cFYI(1, "no ACL"); /* BB grant all or default perms? */
 
 /*	cifscred->uid = owner_sid_ptr->rid;
 	cifscred->gid = group_sid_ptr->rid;
@@ -562,7 +563,7 @@ static struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb,
 	FreeXid(xid);
 
 
-	cFYI(1, ("GetCIFSACL rc = %d ACL len %d", rc, *pacllen));
+	cFYI(1, "GetCIFSACL rc = %d ACL len %d", rc, *pacllen);
 	return pntsd;
 }
 
@@ -580,12 +581,12 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb,
 			 &fid, &oplock, NULL, cifs_sb->local_nls,
 			 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
 	if (rc) {
-		cERROR(1, ("Unable to open file to get ACL"));
+		cERROR(1, "Unable to open file to get ACL");
 		goto out;
 	}
 
 	rc = CIFSSMBGetCIFSACL(xid, cifs_sb->tcon, fid, &pntsd, pacllen);
-	cFYI(1, ("GetCIFSACL rc = %d ACL len %d", rc, *pacllen));
+	cFYI(1, "GetCIFSACL rc = %d ACL len %d", rc, *pacllen);
 
 	CIFSSMBClose(xid, cifs_sb->tcon, fid);
  out:
@@ -620,7 +621,7 @@ static int set_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb, __u16 fid,
 	rc = CIFSSMBSetCIFSACL(xid, cifs_sb->tcon, fid, pnntsd, acllen);
 	FreeXid(xid);
 
-	cFYI(DBG2, ("SetCIFSACL rc = %d", rc));
+	cFYI(DBG2, "SetCIFSACL rc = %d", rc);
 	return rc;
 }
 
@@ -637,12 +638,12 @@ static int set_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, const char *path,
 			 &fid, &oplock, NULL, cifs_sb->local_nls,
 			 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
 	if (rc) {
-		cERROR(1, ("Unable to open file to set ACL"));
+		cERROR(1, "Unable to open file to set ACL");
 		goto out;
 	}
 
 	rc = CIFSSMBSetCIFSACL(xid, cifs_sb->tcon, fid, pnntsd, acllen);
-	cFYI(DBG2, ("SetCIFSACL rc = %d", rc));
+	cFYI(DBG2, "SetCIFSACL rc = %d", rc);
 
 	CIFSSMBClose(xid, cifs_sb->tcon, fid);
  out:
@@ -658,7 +659,7 @@ static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
 	struct cifsFileInfo *open_file;
 	int rc;
 
-	cFYI(DBG2, ("set ACL for %s from mode 0x%x", path, inode->i_mode));
+	cFYI(DBG2, "set ACL for %s from mode 0x%x", path, inode->i_mode);
 
 	open_file = find_readable_file(CIFS_I(inode));
 	if (!open_file)
@@ -678,7 +679,7 @@ cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
 	u32 acllen = 0;
 	int rc = 0;
 
-	cFYI(DBG2, ("converting ACL to mode for %s", path));
+	cFYI(DBG2, "converting ACL to mode for %s", path);
 
 	if (pfid)
 		pntsd = get_cifs_acl_by_fid(cifs_sb, *pfid, &acllen);
@@ -689,7 +690,7 @@ cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
 	if (pntsd)
 		rc = parse_sec_desc(pntsd, acllen, fattr);
 	if (rc)
-		cFYI(1, ("parse sec desc failed rc = %d", rc));
+		cFYI(1, "parse sec desc failed rc = %d", rc);
 
 	kfree(pntsd);
 	return;
@@ -703,7 +704,7 @@ int mode_to_acl(struct inode *inode, const char *path, __u64 nmode)
 	struct cifs_ntsd *pntsd = NULL; /* acl obtained from server */
 	struct cifs_ntsd *pnntsd = NULL; /* modified acl to be sent to server */
 
-	cFYI(DBG2, ("set ACL from mode for %s", path));
+	cFYI(DBG2, "set ACL from mode for %s", path);
 
 	/* Get the security descriptor */
 	pntsd = get_cifs_acl(CIFS_SB(inode->i_sb), inode, path, &secdesclen);
@@ -720,19 +721,19 @@ int mode_to_acl(struct inode *inode, const char *path, __u64 nmode)
 					DEFSECDESCLEN : secdesclen;
 		pnntsd = kmalloc(secdesclen, GFP_KERNEL);
 		if (!pnntsd) {
-			cERROR(1, ("Unable to allocate security descriptor"));
+			cERROR(1, "Unable to allocate security descriptor");
 			kfree(pntsd);
 			return -ENOMEM;
 		}
 
 		rc = build_sec_desc(pntsd, pnntsd, inode, nmode);
 
-		cFYI(DBG2, ("build_sec_desc rc: %d", rc));
+		cFYI(DBG2, "build_sec_desc rc: %d", rc);
 
 		if (!rc) {
 			/* Set the security descriptor */
 			rc = set_cifs_acl(pnntsd, secdesclen, inode, path);
-			cFYI(DBG2, ("set_cifs_acl rc: %d", rc));
+			cFYI(DBG2, "set_cifs_acl rc: %d", rc);
 		}
 
 		kfree(pnntsd);
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 7efe1745494d..847628dfdc44 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -20,6 +20,7 @@
  */
 
 #include <linux/fs.h>
+#include <linux/slab.h>
 #include "cifspdu.h"
 #include "cifsglob.h"
 #include "cifs_debug.h"
@@ -102,7 +103,7 @@ static int cifs_calc_signature2(const struct kvec *iov, int n_vec,
 		if (iov[i].iov_len == 0)
 			continue;
 		if (iov[i].iov_base == NULL) {
-			cERROR(1, ("null iovec entry"));
+			cERROR(1, "null iovec entry");
 			return -EIO;
 		}
 		/* The first entry includes a length field (which does not get
@@ -180,8 +181,8 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu,
 
 	/* Do not need to verify session setups with signature "BSRSPYL "  */
 	if (memcmp(cifs_pdu->Signature.SecuritySignature, "BSRSPYL ", 8) == 0)
-		cFYI(1, ("dummy signature received for smb command 0x%x",
-			cifs_pdu->Command));
+		cFYI(1, "dummy signature received for smb command 0x%x",
+			cifs_pdu->Command);
 
 	/* save off the origiginal signature so we can modify the smb and check
 		its signature against what the server sent */
@@ -290,7 +291,7 @@ void calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt,
 	if (password)
 		strncpy(password_with_pad, password, CIFS_ENCPWD_SIZE);
 
-	if (!encrypt && extended_security & CIFSSEC_MAY_PLNTXT) {
+	if (!encrypt && global_secflags & CIFSSEC_MAY_PLNTXT) {
 		memset(lnm_session_key, 0, CIFS_SESS_KEY_SIZE);
 		memcpy(lnm_session_key, password_with_pad,
 			CIFS_ENCPWD_SIZE);
@@ -397,7 +398,7 @@ void setup_ntlmv2_rsp(struct cifsSesInfo *ses, char *resp_buf,
 	/* calculate buf->ntlmv2_hash */
 	rc = calc_ntlmv2_hash(ses, nls_cp);
 	if (rc)
-		cERROR(1, ("could not get v2 hash rc %d", rc));
+		cERROR(1, "could not get v2 hash rc %d", rc);
 	CalcNTLMv2_response(ses, resp_buf);
 
 	/* now calculate the MAC key for NTLMv2 */
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 8c6a03627176..2cb1a70214d7 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -49,10 +49,6 @@
 #include "cifs_spnego.h"
 #define CIFS_MAGIC_NUMBER 0xFF534D42	/* the first four bytes of SMB PDUs */
 
-#ifdef CONFIG_CIFS_QUOTA
-static const struct quotactl_ops cifs_quotactl_ops;
-#endif /* QUOTA */
-
 int cifsFYI = 0;
 int cifsERROR = 1;
 int traceSMB = 0;
@@ -61,7 +57,7 @@ unsigned int experimEnabled = 0;
 unsigned int linuxExtEnabled = 1;
 unsigned int lookupCacheEnabled = 1;
 unsigned int multiuser_mount = 0;
-unsigned int extended_security = CIFSSEC_DEF;
+unsigned int global_secflags = CIFSSEC_DEF;
 /* unsigned int ntlmv2_support = 0; */
 unsigned int sign_CIFS_PDUs = 1;
 static const struct super_operations cifs_super_ops;
@@ -86,8 +82,6 @@ extern mempool_t *cifs_sm_req_poolp;
 extern mempool_t *cifs_req_poolp;
 extern mempool_t *cifs_mid_poolp;
 
-extern struct kmem_cache *cifs_oplock_cachep;
-
 static int
 cifs_read_super(struct super_block *sb, void *data,
 		const char *devname, int silent)
@@ -103,6 +97,12 @@ cifs_read_super(struct super_block *sb, void *data,
 	if (cifs_sb == NULL)
 		return -ENOMEM;
 
+	rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs", BDI_CAP_MAP_COPY);
+	if (rc) {
+		kfree(cifs_sb);
+		return rc;
+	}
+
 #ifdef CONFIG_CIFS_DFS_UPCALL
 	/* copy mount params to sb for use in submounts */
 	/* BB: should we move this after the mount so we
@@ -115,6 +115,7 @@ cifs_read_super(struct super_block *sb, void *data,
 		int len = strlen(data);
 		cifs_sb->mountdata = kzalloc(len + 1, GFP_KERNEL);
 		if (cifs_sb->mountdata == NULL) {
+			bdi_destroy(&cifs_sb->bdi);
 			kfree(sb->s_fs_info);
 			sb->s_fs_info = NULL;
 			return -ENOMEM;
@@ -128,19 +129,16 @@ cifs_read_super(struct super_block *sb, void *data,
 
 	if (rc) {
 		if (!silent)
-			cERROR(1,
-			       ("cifs_mount failed w/return code = %d", rc));
+			cERROR(1, "cifs_mount failed w/return code = %d", rc);
 		goto out_mount_failed;
 	}
 
 	sb->s_magic = CIFS_MAGIC_NUMBER;
 	sb->s_op = &cifs_super_ops;
+	sb->s_bdi = &cifs_sb->bdi;
 /*	if (cifs_sb->tcon->ses->server->maxBuf > MAX_CIFS_HDR_SIZE + 512)
 	    sb->s_blocksize =
 		cifs_sb->tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE; */
-#ifdef CONFIG_CIFS_QUOTA
-	sb->s_qcop = &cifs_quotactl_ops;
-#endif
 	sb->s_blocksize = CIFS_MAX_MSGSIZE;
 	sb->s_blocksize_bits = 14;	/* default 2**14 = CIFS_MAX_MSGSIZE */
 	inode = cifs_root_iget(sb, ROOT_I);
@@ -160,7 +158,7 @@ cifs_read_super(struct super_block *sb, void *data,
 
 #ifdef CONFIG_CIFS_EXPERIMENTAL
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
-		cFYI(1, ("export ops supported"));
+		cFYI(1, "export ops supported");
 		sb->s_export_op = &cifs_export_ops;
 	}
 #endif /* EXPERIMENTAL */
@@ -168,7 +166,7 @@ cifs_read_super(struct super_block *sb, void *data,
 	return 0;
 
 out_no_root:
-	cERROR(1, ("cifs_read_super: get root inode failed"));
+	cERROR(1, "cifs_read_super: get root inode failed");
 	if (inode)
 		iput(inode);
 
@@ -183,6 +181,7 @@ out_mount_failed:
 		}
 #endif
 		unload_nls(cifs_sb->local_nls);
+		bdi_destroy(&cifs_sb->bdi);
 		kfree(cifs_sb);
 	}
 	return rc;
@@ -194,10 +193,10 @@ cifs_put_super(struct super_block *sb)
 	int rc = 0;
 	struct cifs_sb_info *cifs_sb;
 
-	cFYI(1, ("In cifs_put_super"));
+	cFYI(1, "In cifs_put_super");
 	cifs_sb = CIFS_SB(sb);
 	if (cifs_sb == NULL) {
-		cFYI(1, ("Empty cifs superblock info passed to unmount"));
+		cFYI(1, "Empty cifs superblock info passed to unmount");
 		return;
 	}
 
@@ -205,7 +204,7 @@ cifs_put_super(struct super_block *sb)
 
 	rc = cifs_umount(sb, cifs_sb);
 	if (rc)
-		cERROR(1, ("cifs_umount failed with return code %d", rc));
+		cERROR(1, "cifs_umount failed with return code %d", rc);
 #ifdef CONFIG_CIFS_DFS_UPCALL
 	if (cifs_sb->mountdata) {
 		kfree(cifs_sb->mountdata);
@@ -214,6 +213,7 @@ cifs_put_super(struct super_block *sb)
 #endif
 
 	unload_nls(cifs_sb->local_nls);
+	bdi_destroy(&cifs_sb->bdi);
 	kfree(cifs_sb);
 
 	unlock_kernel();
@@ -290,7 +290,6 @@ static int cifs_permission(struct inode *inode, int mask)
 static struct kmem_cache *cifs_inode_cachep;
 static struct kmem_cache *cifs_req_cachep;
 static struct kmem_cache *cifs_mid_cachep;
-struct kmem_cache *cifs_oplock_cachep;
 static struct kmem_cache *cifs_sm_req_cachep;
 mempool_t *cifs_sm_req_poolp;
 mempool_t *cifs_req_poolp;
@@ -312,6 +311,7 @@ cifs_alloc_inode(struct super_block *sb)
 	cifs_inode->clientCanCacheRead = false;
 	cifs_inode->clientCanCacheAll = false;
 	cifs_inode->delete_pending = false;
+	cifs_inode->invalid_mapping = false;
 	cifs_inode->vfs_inode.i_blkbits = 14;  /* 2**14 = CIFS_MAX_MSGSIZE */
 	cifs_inode->server_eof = 0;
 
@@ -421,106 +421,6 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m)
 	return 0;
 }
 
-#ifdef CONFIG_CIFS_QUOTA
-int cifs_xquota_set(struct super_block *sb, int quota_type, qid_t qid,
-		struct fs_disk_quota *pdquota)
-{
-	int xid;
-	int rc = 0;
-	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
-	struct cifsTconInfo *pTcon;
-
-	if (cifs_sb)
-		pTcon = cifs_sb->tcon;
-	else
-		return -EIO;
-
-
-	xid = GetXid();
-	if (pTcon) {
-		cFYI(1, ("set type: 0x%x id: %d", quota_type, qid));
-	} else
-		rc = -EIO;
-
-	FreeXid(xid);
-	return rc;
-}
-
-int cifs_xquota_get(struct super_block *sb, int quota_type, qid_t qid,
-		    struct fs_disk_quota *pdquota)
-{
-	int xid;
-	int rc = 0;
-	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
-	struct cifsTconInfo *pTcon;
-
-	if (cifs_sb)
-		pTcon = cifs_sb->tcon;
-	else
-		return -EIO;
-
-	xid = GetXid();
-	if (pTcon) {
-		cFYI(1, ("set type: 0x%x id: %d", quota_type, qid));
-	} else
-		rc = -EIO;
-
-	FreeXid(xid);
-	return rc;
-}
-
-int cifs_xstate_set(struct super_block *sb, unsigned int flags, int operation)
-{
-	int xid;
-	int rc = 0;
-	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
-	struct cifsTconInfo *pTcon;
-
-	if (cifs_sb)
-		pTcon = cifs_sb->tcon;
-	else
-		return -EIO;
-
-	xid = GetXid();
-	if (pTcon) {
-		cFYI(1, ("flags: 0x%x operation: 0x%x", flags, operation));
-	} else
-		rc = -EIO;
-
-	FreeXid(xid);
-	return rc;
-}
-
-int cifs_xstate_get(struct super_block *sb, struct fs_quota_stat *qstats)
-{
-	int xid;
-	int rc = 0;
-	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
-	struct cifsTconInfo *pTcon;
-
-	if (cifs_sb)
-		pTcon = cifs_sb->tcon;
-	else
-		return -EIO;
-
-	xid = GetXid();
-	if (pTcon) {
-		cFYI(1, ("pqstats %p", qstats));
-	} else
-		rc = -EIO;
-
-	FreeXid(xid);
-	return rc;
-}
-
-static const struct quotactl_ops cifs_quotactl_ops = {
-	.set_xquota	= cifs_xquota_set,
-	.get_xquota	= cifs_xquota_get,
-	.set_xstate	= cifs_xstate_set,
-	.get_xstate	= cifs_xstate_get,
-};
-#endif
-
 static void cifs_umount_begin(struct super_block *sb)
 {
 	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
@@ -547,7 +447,7 @@ static void cifs_umount_begin(struct super_block *sb)
 	/* cancel_brl_requests(tcon); */ /* BB mark all brl mids as exiting */
 	/* cancel_notify_requests(tcon); */
 	if (tcon->ses && tcon->ses->server) {
-		cFYI(1, ("wake up tasks now - umount begin not complete"));
+		cFYI(1, "wake up tasks now - umount begin not complete");
 		wake_up_all(&tcon->ses->server->request_q);
 		wake_up_all(&tcon->ses->server->response_q);
 		msleep(1); /* yield */
@@ -573,14 +473,24 @@ static int cifs_remount(struct super_block *sb, int *flags, char *data)
 	return 0;
 }
 
+void cifs_drop_inode(struct inode *inode)
+{
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)
+		return generic_drop_inode(inode);
+
+	return generic_delete_inode(inode);
+}
+
 static const struct super_operations cifs_super_ops = {
 	.put_super = cifs_put_super,
 	.statfs = cifs_statfs,
 	.alloc_inode = cifs_alloc_inode,
 	.destroy_inode = cifs_destroy_inode,
-/*	.drop_inode	    = generic_delete_inode,
-	.delete_inode	= cifs_delete_inode,  */  /* Do not need above two
-	functions unless later we add lazy close of inodes or unless the
+	.drop_inode	= cifs_drop_inode,
+/*	.delete_inode	= cifs_delete_inode,  */  /* Do not need above
+	function unless later we add lazy close of inodes or unless the
 	kernel forgets to call us with the same number of releases (closes)
 	as opens */
 	.show_options = cifs_show_options,
@@ -598,7 +508,7 @@ cifs_get_sb(struct file_system_type *fs_type,
 	int rc;
 	struct super_block *sb = sget(fs_type, NULL, set_anon_super, NULL);
 
-	cFYI(1, ("Devname: %s flags: %d ", dev_name, flags));
+	cFYI(1, "Devname: %s flags: %d ", dev_name, flags);
 
 	if (IS_ERR(sb))
 		return PTR_ERR(sb);
@@ -638,14 +548,13 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin)
 		   setting the revalidate time to zero */
 		CIFS_I(file->f_path.dentry->d_inode)->time = 0;
 
-		retval = cifs_revalidate(file->f_path.dentry);
+		retval = cifs_revalidate_file(file);
 		if (retval < 0)
 			return (loff_t)retval;
 	}
 	return generic_file_llseek_unlocked(file, offset, origin);
 }
 
-#ifdef CONFIG_CIFS_EXPERIMENTAL
 static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
 {
 	/* note that this is called by vfs setlease with the BKL held
@@ -674,7 +583,6 @@ static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
 	else
 		return -EAGAIN;
 }
-#endif
 
 struct file_system_type cifs_fs_type = {
 	.owner = THIS_MODULE,
@@ -751,10 +659,7 @@ const struct file_operations cifs_file_ops = {
 #ifdef CONFIG_CIFS_POSIX
 	.unlocked_ioctl	= cifs_ioctl,
 #endif /* CONFIG_CIFS_POSIX */
-
-#ifdef CONFIG_CIFS_EXPERIMENTAL
 	.setlease = cifs_setlease,
-#endif /* CONFIG_CIFS_EXPERIMENTAL */
 };
 
 const struct file_operations cifs_file_direct_ops = {
@@ -773,9 +678,7 @@ const struct file_operations cifs_file_direct_ops = {
 	.unlocked_ioctl  = cifs_ioctl,
 #endif /* CONFIG_CIFS_POSIX */
 	.llseek = cifs_llseek,
-#ifdef CONFIG_CIFS_EXPERIMENTAL
 	.setlease = cifs_setlease,
-#endif /* CONFIG_CIFS_EXPERIMENTAL */
 };
 const struct file_operations cifs_file_nobrl_ops = {
 	.read = do_sync_read,
@@ -792,10 +695,7 @@ const struct file_operations cifs_file_nobrl_ops = {
 #ifdef CONFIG_CIFS_POSIX
 	.unlocked_ioctl	= cifs_ioctl,
 #endif /* CONFIG_CIFS_POSIX */
-
-#ifdef CONFIG_CIFS_EXPERIMENTAL
 	.setlease = cifs_setlease,
-#endif /* CONFIG_CIFS_EXPERIMENTAL */
 };
 
 const struct file_operations cifs_file_direct_nobrl_ops = {
@@ -807,14 +707,13 @@ const struct file_operations cifs_file_direct_nobrl_ops = {
 	.release = cifs_close,
 	.fsync = cifs_fsync,
 	.flush = cifs_flush,
+	.mmap = cifs_file_mmap,
 	.splice_read = generic_file_splice_read,
 #ifdef CONFIG_CIFS_POSIX
 	.unlocked_ioctl  = cifs_ioctl,
 #endif /* CONFIG_CIFS_POSIX */
 	.llseek = cifs_llseek,
-#ifdef CONFIG_CIFS_EXPERIMENTAL
 	.setlease = cifs_setlease,
-#endif /* CONFIG_CIFS_EXPERIMENTAL */
 };
 
 const struct file_operations cifs_dir_ops = {
@@ -866,7 +765,7 @@ cifs_init_request_bufs(void)
 	} else {
 		CIFSMaxBufSize &= 0x1FE00; /* Round size to even 512 byte mult*/
 	}
-/*	cERROR(1,("CIFSMaxBufSize %d 0x%x",CIFSMaxBufSize,CIFSMaxBufSize)); */
+/*	cERROR(1, "CIFSMaxBufSize %d 0x%x",CIFSMaxBufSize,CIFSMaxBufSize); */
 	cifs_req_cachep = kmem_cache_create("cifs_request",
 					    CIFSMaxBufSize +
 					    MAX_CIFS_HDR_SIZE, 0,
@@ -878,7 +777,7 @@ cifs_init_request_bufs(void)
 		cifs_min_rcv = 1;
 	else if (cifs_min_rcv > 64) {
 		cifs_min_rcv = 64;
-		cERROR(1, ("cifs_min_rcv set to maximum (64)"));
+		cERROR(1, "cifs_min_rcv set to maximum (64)");
 	}
 
 	cifs_req_poolp = mempool_create_slab_pool(cifs_min_rcv,
@@ -909,7 +808,7 @@ cifs_init_request_bufs(void)
 		cifs_min_small = 2;
 	else if (cifs_min_small > 256) {
 		cifs_min_small = 256;
-		cFYI(1, ("cifs_min_small set to maximum (256)"));
+		cFYI(1, "cifs_min_small set to maximum (256)");
 	}
 
 	cifs_sm_req_poolp = mempool_create_slab_pool(cifs_min_small,
@@ -950,15 +849,6 @@ cifs_init_mids(void)
 		return -ENOMEM;
 	}
 
-	cifs_oplock_cachep = kmem_cache_create("cifs_oplock_structs",
-					sizeof(struct oplock_q_entry), 0,
-					SLAB_HWCACHE_ALIGN, NULL);
-	if (cifs_oplock_cachep == NULL) {
-		mempool_destroy(cifs_mid_poolp);
-		kmem_cache_destroy(cifs_mid_cachep);
-		return -ENOMEM;
-	}
-
 	return 0;
 }
 
@@ -967,7 +857,6 @@ cifs_destroy_mids(void)
 {
 	mempool_destroy(cifs_mid_poolp);
 	kmem_cache_destroy(cifs_mid_cachep);
-	kmem_cache_destroy(cifs_oplock_cachep);
 }
 
 static int __init
@@ -1007,10 +896,10 @@ init_cifs(void)
 
 	if (cifs_max_pending < 2) {
 		cifs_max_pending = 2;
-		cFYI(1, ("cifs_max_pending set to min of 2"));
+		cFYI(1, "cifs_max_pending set to min of 2");
 	} else if (cifs_max_pending > 256) {
 		cifs_max_pending = 256;
-		cFYI(1, ("cifs_max_pending set to max of 256"));
+		cFYI(1, "cifs_max_pending set to max of 256");
 	}
 
 	rc = cifs_init_inodecache();
@@ -1034,7 +923,7 @@ init_cifs(void)
 		goto out_unregister_filesystem;
 #endif
 #ifdef CONFIG_CIFS_DFS_UPCALL
-	rc = register_key_type(&key_type_dns_resolver);
+	rc = cifs_init_dns_resolver();
 	if (rc)
 		goto out_unregister_key_type;
 #endif
@@ -1046,7 +935,7 @@ init_cifs(void)
 
  out_unregister_resolver_key:
 #ifdef CONFIG_CIFS_DFS_UPCALL
-	unregister_key_type(&key_type_dns_resolver);
+	cifs_exit_dns_resolver();
  out_unregister_key_type:
 #endif
 #ifdef CONFIG_CIFS_UPCALL
@@ -1068,11 +957,11 @@ init_cifs(void)
 static void __exit
 exit_cifs(void)
 {
-	cFYI(DBG2, ("exit_cifs"));
+	cFYI(DBG2, "exit_cifs");
 	cifs_proc_clean();
 #ifdef CONFIG_CIFS_DFS_UPCALL
 	cifs_dfs_release_automount_timer();
-	unregister_key_type(&key_type_dns_resolver);
+	cifs_exit_dns_resolver();
 #endif
 #ifdef CONFIG_CIFS_UPCALL
 	unregister_key_type(&cifs_spnego_key_type);
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index ac2b24c192f8..a7eb65c84b1c 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -61,7 +61,8 @@ extern int cifs_mkdir(struct inode *, struct dentry *, int);
 extern int cifs_rmdir(struct inode *, struct dentry *);
 extern int cifs_rename(struct inode *, struct dentry *, struct inode *,
 		       struct dentry *);
-extern int cifs_revalidate(struct dentry *);
+extern int cifs_revalidate_file(struct file *filp);
+extern int cifs_revalidate_dentry(struct dentry *);
 extern int cifs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
 extern int cifs_setattr(struct dentry *, struct iattr *);
 
@@ -83,7 +84,7 @@ extern ssize_t cifs_user_read(struct file *file, char __user *read_data,
 extern ssize_t cifs_user_write(struct file *file, const char __user *write_data,
 			 size_t write_size, loff_t *poffset);
 extern int cifs_lock(struct file *, int, struct file_lock *);
-extern int cifs_fsync(struct file *, struct dentry *, int);
+extern int cifs_fsync(struct file *, int);
 extern int cifs_flush(struct file *, fl_owner_t id);
 extern int cifs_file_mmap(struct file * , struct vm_area_struct *);
 extern const struct file_operations cifs_dir_ops;
@@ -113,5 +114,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
 extern const struct export_operations cifs_export_ops;
 #endif /* EXPERIMENTAL */
 
-#define CIFS_VERSION   "1.61"
+#define CIFS_VERSION   "1.64"
 #endif				/* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 4b35f7ec0583..a88479ceaad5 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -18,6 +18,7 @@
  */
 #include <linux/in.h>
 #include <linux/in6.h>
+#include <linux/slab.h>
 #include <linux/slow-work.h>
 #include "cifs_fs_sb.h"
 #include "cifsacl.h"
@@ -86,7 +87,6 @@ enum securityEnum {
 	RawNTLMSSP,		/* NTLMSSP without SPNEGO, NTLMv2 hash */
 /*	NTLMSSP, */ /* can use rawNTLMSSP instead of NTLMSSP via SPNEGO */
 	Kerberos,		/* Kerberos via SPNEGO */
-	MSKerberos,		/* MS Kerberos via SPNEGO */
 };
 
 enum protocolEnum {
@@ -149,6 +149,7 @@ struct TCP_Server_Info {
 	bool svlocal:1;			/* local server or remote */
 	bool noblocksnd;		/* use blocking sendmsg */
 	bool noautotune;		/* do not autotune send buf sizes */
+	bool tcp_nodelay;
 	atomic_t inFlight;  /* number of requests on the wire to server */
 #ifdef CONFIG_CIFS_STATS2
 	atomic_t inSend; /* requests trying to send */
@@ -183,6 +184,12 @@ struct TCP_Server_Info {
 	struct mac_key mac_signing_key;
 	char ntlmv2_hash[16];
 	unsigned long lstrp; /* when we got last response from this server */
+	u16 dialect; /* dialect index that server chose */
+	/* extended security flavors that server supports */
+	bool	sec_kerberos;		/* supports plain Kerberos */
+	bool	sec_mskerberos;		/* supports legacy MS Kerberos */
+	bool	sec_kerberosu2u;	/* supports U2U Kerberos */
+	bool	sec_ntlmssp;		/* supports NTLMSSP */
 };
 
 /*
@@ -204,7 +211,7 @@ struct cifsUidInfo {
 struct cifsSesInfo {
 	struct list_head smb_ses_list;
 	struct list_head tcon_list;
-	struct semaphore sesSem;
+	struct mutex session_mutex;
 #if 0
 	struct cifsUidInfo *uidInfo;	/* pointer to user info */
 #endif
@@ -388,6 +395,7 @@ struct cifsInodeInfo {
 	bool clientCanCacheRead:1;	/* read oplock */
 	bool clientCanCacheAll:1;	/* read and writebehind oplock */
 	bool delete_pending:1;		/* DELETE_ON_CLOSE is set */
+	bool invalid_mapping:1;		/* pagecache is invalid */
 	u64  server_eof;		/* current file size on server */
 	u64  uniqueid;			/* server inode number */
 	struct inode vfs_inode;
@@ -499,6 +507,7 @@ struct dfs_info3_param {
 #define CIFS_FATTR_DFS_REFERRAL		0x1
 #define CIFS_FATTR_DELETE_PENDING	0x2
 #define CIFS_FATTR_NEED_REVAL		0x4
+#define CIFS_FATTR_INO_COLLISION	0x8
 
 struct cifs_fattr {
 	u32		cf_flags;
@@ -714,7 +723,7 @@ GLOBAL_EXTERN unsigned int multiuser_mount; /* if enabled allows new sessions
 GLOBAL_EXTERN unsigned int oplockEnabled;
 GLOBAL_EXTERN unsigned int experimEnabled;
 GLOBAL_EXTERN unsigned int lookupCacheEnabled;
-GLOBAL_EXTERN unsigned int extended_security;	/* if on, session setup sent
+GLOBAL_EXTERN unsigned int global_secflags;	/* if on, session setup sent
 				with more secure ntlmssp2 challenge/resp */
 GLOBAL_EXTERN unsigned int sign_CIFS_PDUs;  /* enable smb packet signing */
 GLOBAL_EXTERN unsigned int linuxExtEnabled;/*enable Linux/Unix CIFS extensions*/
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index 3877737f96a6..14d036d8db11 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -415,10 +415,10 @@ struct smb_hdr {
 	__u8 WordCount;
 } __attribute__((packed));
 /* given a pointer to an smb_hdr retrieve the value of byte count */
-#define BCC(smb_var) (*(__u16 *)((char *)smb_var + sizeof(struct smb_hdr) + (2 * smb_var->WordCount)))
-#define BCC_LE(smb_var) (*(__le16 *)((char *)smb_var + sizeof(struct smb_hdr) + (2 * smb_var->WordCount)))
+#define BCC(smb_var) (*(__u16 *)((char *)(smb_var) + sizeof(struct smb_hdr) + (2 * (smb_var)->WordCount)))
+#define BCC_LE(smb_var) (*(__le16 *)((char *)(smb_var) + sizeof(struct smb_hdr) + (2 * (smb_var)->WordCount)))
 /* given a pointer to an smb_hdr retrieve the pointer to the byte area */
-#define pByteArea(smb_var) ((unsigned char *)smb_var + sizeof(struct smb_hdr) + (2 * smb_var->WordCount) + 2)
+#define pByteArea(smb_var) ((unsigned char *)(smb_var) + sizeof(struct smb_hdr) + (2 * (smb_var)->WordCount) + 2)
 
 /*
  * Computer Name Length (since Netbios name was length 16 with last byte 0x20)
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 5646727e33f5..fb6318b81509 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -39,8 +39,20 @@ extern int smb_send(struct TCP_Server_Info *, struct smb_hdr *,
 			unsigned int /* length */);
 extern unsigned int _GetXid(void);
 extern void _FreeXid(unsigned int);
-#define GetXid() (int)_GetXid(); cFYI(1,("CIFS VFS: in %s as Xid: %d with uid: %d",__func__, xid,current_fsuid()));
-#define FreeXid(curr_xid) {_FreeXid(curr_xid); cFYI(1,("CIFS VFS: leaving %s (xid = %d) rc = %d",__func__,curr_xid,(int)rc));}
+#define GetXid()						\
+({								\
+	int __xid = (int)_GetXid();				\
+	cFYI(1, "CIFS VFS: in %s as Xid: %d with uid: %d",	\
+	     __func__, __xid, current_fsuid());			\
+	__xid;							\
+})
+
+#define FreeXid(curr_xid)					\
+do {								\
+	_FreeXid(curr_xid);					\
+	cFYI(1, "CIFS VFS: leaving %s (xid = %d) rc = %d",	\
+	     __func__, curr_xid, (int)rc);			\
+} while (0)
 extern char *build_path_from_dentry(struct dentry *);
 extern char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb);
 extern char *build_wildcard_path_from_dentry(struct dentry *direntry);
@@ -73,7 +85,7 @@ extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *);
 extern unsigned int smbCalcSize(struct smb_hdr *ptr);
 extern unsigned int smbCalcSize_LE(struct smb_hdr *ptr);
 extern int decode_negTokenInit(unsigned char *security_blob, int length,
-			enum securityEnum *secType);
+			struct TCP_Server_Info *server);
 extern int cifs_convert_address(char *src, void *dst);
 extern int map_smb_to_linux_error(struct smb_hdr *smb, int logErr);
 extern void header_assemble(struct smb_hdr *, char /* command */ ,
@@ -83,7 +95,6 @@ extern int small_smb_init_no_tc(const int smb_cmd, const int wct,
 				struct cifsSesInfo *ses,
 				void **request_buf);
 extern int CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses,
-			     const int stage,
 			     const struct nls_table *nls_cp);
 extern __u16 GetNextMid(struct TCP_Server_Info *server);
 extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601);
@@ -95,8 +106,10 @@ extern struct cifsFileInfo *cifs_new_fileinfo(struct inode *newinode,
 				__u16 fileHandle, struct file *file,
 				struct vfsmount *mnt, unsigned int oflags);
 extern int cifs_posix_open(char *full_path, struct inode **pinode,
-			   struct vfsmount *mnt, int mode, int oflags,
-			   __u32 *poplock, __u16 *pnetfid, int xid);
+				struct super_block *sb,
+				int mode, int oflags,
+				__u32 *poplock, __u16 *pnetfid, int xid);
+void cifs_fill_uniqueid(struct super_block *sb, struct cifs_fattr *fattr);
 extern void cifs_unix_basic_to_fattr(struct cifs_fattr *fattr,
 				     FILE_UNIX_BASIC_INFO *info,
 				     struct cifs_sb_info *cifs_sb);
@@ -104,10 +117,12 @@ extern void cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr);
 extern struct inode *cifs_iget(struct super_block *sb,
 			       struct cifs_fattr *fattr);
 
+extern int cifs_get_file_info(struct file *filp);
 extern int cifs_get_inode_info(struct inode **pinode,
 			const unsigned char *search_path,
 			FILE_ALL_INFO *pfile_info,
 			struct super_block *sb, int xid, const __u16 *pfid);
+extern int cifs_get_file_info_unix(struct file *filp);
 extern int cifs_get_inode_info_unix(struct inode **pinode,
 			const unsigned char *search_path,
 			struct super_block *sb, int xid);
@@ -123,7 +138,9 @@ extern void cifs_dfs_release_automount_timer(void);
 void cifs_proc_init(void);
 void cifs_proc_clean(void);
 
-extern int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo,
+extern int cifs_negotiate_protocol(unsigned int xid,
+				  struct cifsSesInfo *ses);
+extern int cifs_setup_session(unsigned int xid, struct cifsSesInfo *ses,
 			struct nls_table *nls_info);
 extern int CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses);
 
@@ -142,6 +159,8 @@ extern int CIFSFindNext(const int xid, struct cifsTconInfo *tcon,
 extern int CIFSFindClose(const int, struct cifsTconInfo *tcon,
 			const __u16 search_handle);
 
+extern int CIFSSMBQFileInfo(const int xid, struct cifsTconInfo *tcon,
+			u16 netfid, FILE_ALL_INFO *pFindData);
 extern int CIFSSMBQPathInfo(const int xid, struct cifsTconInfo *tcon,
 			const unsigned char *searchName,
 			FILE_ALL_INFO *findData,
@@ -152,6 +171,8 @@ extern int SMBQueryInformation(const int xid, struct cifsTconInfo *tcon,
 			FILE_ALL_INFO *findData,
 			const struct nls_table *nls_codepage, int remap);
 
+extern int CIFSSMBUnixQFileInfo(const int xid, struct cifsTconInfo *tcon,
+			u16 netfid, FILE_UNIX_BASIC_INFO *pFindData);
 extern int CIFSSMBUnixQPathInfo(const int xid,
 			struct cifsTconInfo *tcon,
 			const unsigned char *searchName,
@@ -363,13 +384,10 @@ extern int CIFSSMBNotify(const int xid, struct cifsTconInfo *tcon,
 			__u32 filter, struct file *file, int multishot,
 			const struct nls_table *nls_codepage);
 extern ssize_t CIFSSMBQAllEAs(const int xid, struct cifsTconInfo *tcon,
-			const unsigned char *searchName, char *EAData,
+			const unsigned char *searchName,
+			const unsigned char *ea_name, char *EAData,
 			size_t bufsize, const struct nls_table *nls_codepage,
 			int remap_special_chars);
-extern ssize_t CIFSSMBQueryEA(const int xid, struct cifsTconInfo *tcon,
-		const unsigned char *searchName, const unsigned char *ea_name,
-		unsigned char *ea_value, size_t buf_size,
-		const struct nls_table *nls_codepage, int remap_special_chars);
 extern int CIFSSMBSetEA(const int xid, struct cifsTconInfo *tcon,
 		const char *fileName, const char *ea_name,
 		const void *ea_value, const __u16 ea_value_len,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 941441d3e386..c65c3419dd37 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -1,7 +1,7 @@
 /*
  *   fs/cifs/cifssmb.c
  *
- *   Copyright (C) International Business Machines  Corp., 2002,2009
+ *   Copyright (C) International Business Machines  Corp., 2002,2010
  *   Author(s): Steve French (sfrench@us.ibm.com)
  *
  *   Contains the routines for constructing the SMB PDUs themselves
@@ -30,6 +30,7 @@
 #include <linux/fs.h>
 #include <linux/kernel.h>
 #include <linux/vfs.h>
+#include <linux/slab.h>
 #include <linux/posix_acl_xattr.h>
 #include <asm/uaccess.h>
 #include "cifspdu.h"
@@ -129,8 +130,8 @@ cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command)
 		if (smb_command != SMB_COM_WRITE_ANDX &&
 		    smb_command != SMB_COM_OPEN_ANDX &&
 		    smb_command != SMB_COM_TREE_DISCONNECT) {
-			cFYI(1, ("can not send cmd %d while umounting",
-				smb_command));
+			cFYI(1, "can not send cmd %d while umounting",
+				smb_command);
 			return -ENODEV;
 		}
 	}
@@ -156,7 +157,7 @@ cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command)
 		 * back on-line
 		 */
 		if (!tcon->retry || ses->status == CifsExiting) {
-			cFYI(1, ("gave up waiting on reconnect in smb_init"));
+			cFYI(1, "gave up waiting on reconnect in smb_init");
 			return -EHOSTDOWN;
 		}
 	}
@@ -170,20 +171,21 @@ cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command)
 	 * need to prevent multiple threads trying to simultaneously
 	 * reconnect the same SMB session
 	 */
-	down(&ses->sesSem);
-	if (ses->need_reconnect)
+	mutex_lock(&ses->session_mutex);
+	rc = cifs_negotiate_protocol(0, ses);
+	if (rc == 0 && ses->need_reconnect)
 		rc = cifs_setup_session(0, ses, nls_codepage);
 
 	/* do we need to reconnect tcon? */
 	if (rc || !tcon->need_reconnect) {
-		up(&ses->sesSem);
+		mutex_unlock(&ses->session_mutex);
 		goto out;
 	}
 
 	mark_open_files_invalid(tcon);
 	rc = CIFSTCon(0, ses, tcon->treeName, tcon, nls_codepage);
-	up(&ses->sesSem);
-	cFYI(1, ("reconnect tcon rc = %d", rc));
+	mutex_unlock(&ses->session_mutex);
+	cFYI(1, "reconnect tcon rc = %d", rc);
 
 	if (rc)
 		goto out;
@@ -354,7 +356,6 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 	struct TCP_Server_Info *server;
 	u16 count;
 	unsigned int secFlags;
-	u16 dialect;
 
 	if (ses->server)
 		server = ses->server;
@@ -371,9 +372,9 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 	if (ses->overrideSecFlg & (~(CIFSSEC_MUST_SIGN | CIFSSEC_MUST_SEAL)))
 		secFlags = ses->overrideSecFlg;  /* BB FIXME fix sign flags? */
 	else /* if override flags set only sign/seal OR them with global auth */
-		secFlags = extended_security | ses->overrideSecFlg;
+		secFlags = global_secflags | ses->overrideSecFlg;
 
-	cFYI(1, ("secFlags 0x%x", secFlags));
+	cFYI(1, "secFlags 0x%x", secFlags);
 
 	pSMB->hdr.Mid = GetNextMid(server);
 	pSMB->hdr.Flags2 |= (SMBFLG2_UNICODE | SMBFLG2_ERR_STATUS);
@@ -381,14 +382,14 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 	if ((secFlags & CIFSSEC_MUST_KRB5) == CIFSSEC_MUST_KRB5)
 		pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
 	else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_KRB5) {
-		cFYI(1, ("Kerberos only mechanism, enable extended security"));
+		cFYI(1, "Kerberos only mechanism, enable extended security");
 		pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
 	}
 #ifdef CONFIG_CIFS_EXPERIMENTAL
 	else if ((secFlags & CIFSSEC_MUST_NTLMSSP) == CIFSSEC_MUST_NTLMSSP)
 		pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
 	else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_NTLMSSP) {
-		cFYI(1, ("NTLMSSP only mechanism, enable extended security"));
+		cFYI(1, "NTLMSSP only mechanism, enable extended security");
 		pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
 	}
 #endif
@@ -407,10 +408,10 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 	if (rc != 0)
 		goto neg_err_exit;
 
-	dialect = le16_to_cpu(pSMBr->DialectIndex);
-	cFYI(1, ("Dialect: %d", dialect));
+	server->dialect = le16_to_cpu(pSMBr->DialectIndex);
+	cFYI(1, "Dialect: %d", server->dialect);
 	/* Check wct = 1 error case */
-	if ((pSMBr->hdr.WordCount < 13) || (dialect == BAD_PROT)) {
+	if ((pSMBr->hdr.WordCount < 13) || (server->dialect == BAD_PROT)) {
 		/* core returns wct = 1, but we do not ask for core - otherwise
 		small wct just comes when dialect index is -1 indicating we
 		could not negotiate a common dialect */
@@ -418,8 +419,8 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 		goto neg_err_exit;
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
 	} else if ((pSMBr->hdr.WordCount == 13)
-			&& ((dialect == LANMAN_PROT)
-				|| (dialect == LANMAN2_PROT))) {
+			&& ((server->dialect == LANMAN_PROT)
+				|| (server->dialect == LANMAN2_PROT))) {
 		__s16 tmp;
 		struct lanman_neg_rsp *rsp = (struct lanman_neg_rsp *)pSMBr;
 
@@ -427,8 +428,8 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 			(secFlags & CIFSSEC_MAY_PLNTXT))
 			server->secType = LANMAN;
 		else {
-			cERROR(1, ("mount failed weak security disabled"
-				   " in /proc/fs/cifs/SecurityFlags"));
+			cERROR(1, "mount failed weak security disabled"
+				   " in /proc/fs/cifs/SecurityFlags");
 			rc = -EOPNOTSUPP;
 			goto neg_err_exit;
 		}
@@ -461,9 +462,9 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 			utc = CURRENT_TIME;
 			ts = cnvrtDosUnixTm(rsp->SrvTime.Date,
 					    rsp->SrvTime.Time, 0);
-			cFYI(1, ("SrvTime %d sec since 1970 (utc: %d) diff: %d",
+			cFYI(1, "SrvTime %d sec since 1970 (utc: %d) diff: %d",
 				(int)ts.tv_sec, (int)utc.tv_sec,
-				(int)(utc.tv_sec - ts.tv_sec)));
+				(int)(utc.tv_sec - ts.tv_sec));
 			val = (int)(utc.tv_sec - ts.tv_sec);
 			seconds = abs(val);
 			result = (seconds / MIN_TZ_ADJ) * MIN_TZ_ADJ;
@@ -477,7 +478,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 			server->timeAdj = (int)tmp;
 			server->timeAdj *= 60; /* also in seconds */
 		}
-		cFYI(1, ("server->timeAdj: %d seconds", server->timeAdj));
+		cFYI(1, "server->timeAdj: %d seconds", server->timeAdj);
 
 
 		/* BB get server time for time conversions and add
@@ -492,15 +493,15 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 			goto neg_err_exit;
 		}
 
-		cFYI(1, ("LANMAN negotiated"));
+		cFYI(1, "LANMAN negotiated");
 		/* we will not end up setting signing flags - as no signing
 		was in LANMAN and server did not return the flags on */
 		goto signing_check;
 #else /* weak security disabled */
 	} else if (pSMBr->hdr.WordCount == 13) {
-		cERROR(1, ("mount failed, cifs module not built "
-			  "with CIFS_WEAK_PW_HASH support"));
-			rc = -EOPNOTSUPP;
+		cERROR(1, "mount failed, cifs module not built "
+			  "with CIFS_WEAK_PW_HASH support");
+		rc = -EOPNOTSUPP;
 #endif /* WEAK_PW_HASH */
 		goto neg_err_exit;
 	} else if (pSMBr->hdr.WordCount != 17) {
@@ -511,14 +512,14 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 	/* else wct == 17 NTLM */
 	server->secMode = pSMBr->SecurityMode;
 	if ((server->secMode & SECMODE_USER) == 0)
-		cFYI(1, ("share mode security"));
+		cFYI(1, "share mode security");
 
 	if ((server->secMode & SECMODE_PW_ENCRYPT) == 0)
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
 		if ((secFlags & CIFSSEC_MAY_PLNTXT) == 0)
 #endif /* CIFS_WEAK_PW_HASH */
-			cERROR(1, ("Server requests plain text password"
-				  " but client support disabled"));
+			cERROR(1, "Server requests plain text password"
+				  " but client support disabled");
 
 	if ((secFlags & CIFSSEC_MUST_NTLMV2) == CIFSSEC_MUST_NTLMV2)
 		server->secType = NTLMv2;
@@ -538,7 +539,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 #endif */
 	else {
 		rc = -EOPNOTSUPP;
-		cERROR(1, ("Invalid security type"));
+		cERROR(1, "Invalid security type");
 		goto neg_err_exit;
 	}
 	/* else ... any others ...? */
@@ -550,7 +551,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 	server->maxBuf = min(le32_to_cpu(pSMBr->MaxBufferSize),
 			(__u32) CIFSMaxBufSize + MAX_CIFS_HDR_SIZE);
 	server->max_rw = le32_to_cpu(pSMBr->MaxRawSize);
-	cFYI(DBG2, ("Max buf = %d", ses->server->maxBuf));
+	cFYI(DBG2, "Max buf = %d", ses->server->maxBuf);
 	GETU32(ses->server->sessid) = le32_to_cpu(pSMBr->SessionKey);
 	server->capabilities = le32_to_cpu(pSMBr->Capabilities);
 	server->timeAdj = (int)(__s16)le16_to_cpu(pSMBr->ServerTimeZone);
@@ -581,7 +582,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 			if (memcmp(server->server_GUID,
 				   pSMBr->u.extended_response.
 				   GUID, 16) != 0) {
-				cFYI(1, ("server UID changed"));
+				cFYI(1, "server UID changed");
 				memcpy(server->server_GUID,
 					pSMBr->u.extended_response.GUID,
 					16);
@@ -596,13 +597,19 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 			server->secType = RawNTLMSSP;
 		} else {
 			rc = decode_negTokenInit(pSMBr->u.extended_response.
-						 SecurityBlob,
-						 count - 16,
-						 &server->secType);
+						 SecurityBlob, count - 16,
+						 server);
 			if (rc == 1)
 				rc = 0;
 			else
 				rc = -EINVAL;
+
+			if (server->sec_kerberos || server->sec_mskerberos)
+				server->secType = Kerberos;
+			else if (server->sec_ntlmssp)
+				server->secType = RawNTLMSSP;
+			else
+				rc = -EOPNOTSUPP;
 		}
 	} else
 		server->capabilities &= ~CAP_EXTENDED_SECURITY;
@@ -613,22 +620,21 @@ signing_check:
 	if ((secFlags & CIFSSEC_MAY_SIGN) == 0) {
 		/* MUST_SIGN already includes the MAY_SIGN FLAG
 		   so if this is zero it means that signing is disabled */
-		cFYI(1, ("Signing disabled"));
+		cFYI(1, "Signing disabled");
 		if (server->secMode & SECMODE_SIGN_REQUIRED) {
-			cERROR(1, ("Server requires "
+			cERROR(1, "Server requires "
 				   "packet signing to be enabled in "
-				   "/proc/fs/cifs/SecurityFlags."));
+				   "/proc/fs/cifs/SecurityFlags.");
 			rc = -EOPNOTSUPP;
 		}
 		server->secMode &=
 			~(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED);
 	} else if ((secFlags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN) {
 		/* signing required */
-		cFYI(1, ("Must sign - secFlags 0x%x", secFlags));
+		cFYI(1, "Must sign - secFlags 0x%x", secFlags);
 		if ((server->secMode &
 			(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED)) == 0) {
-			cERROR(1,
-				("signing required but server lacks support"));
+			cERROR(1, "signing required but server lacks support");
 			rc = -EOPNOTSUPP;
 		} else
 			server->secMode |= SECMODE_SIGN_REQUIRED;
@@ -642,7 +648,7 @@ signing_check:
 neg_err_exit:
 	cifs_buf_release(pSMB);
 
-	cFYI(1, ("negprot rc %d", rc));
+	cFYI(1, "negprot rc %d", rc);
 	return rc;
 }
 
@@ -652,7 +658,7 @@ CIFSSMBTDis(const int xid, struct cifsTconInfo *tcon)
 	struct smb_hdr *smb_buffer;
 	int rc = 0;
 
-	cFYI(1, ("In tree disconnect"));
+	cFYI(1, "In tree disconnect");
 
 	/* BB: do we need to check this? These should never be NULL. */
 	if ((tcon->ses == NULL) || (tcon->ses->server == NULL))
@@ -674,7 +680,7 @@ CIFSSMBTDis(const int xid, struct cifsTconInfo *tcon)
 
 	rc = SendReceiveNoRsp(xid, tcon->ses, smb_buffer, 0);
 	if (rc)
-		cFYI(1, ("Tree disconnect failed %d", rc));
+		cFYI(1, "Tree disconnect failed %d", rc);
 
 	/* No need to return error on this operation if tid invalidated and
 	   closed on server already e.g. due to tcp session crashing */
@@ -690,7 +696,7 @@ CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses)
 	LOGOFF_ANDX_REQ *pSMB;
 	int rc = 0;
 
-	cFYI(1, ("In SMBLogoff for session disconnect"));
+	cFYI(1, "In SMBLogoff for session disconnect");
 
 	/*
 	 * BB: do we need to check validity of ses and server? They should
@@ -700,13 +706,13 @@ CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses)
 	if (!ses || !ses->server)
 		return -EIO;
 
-	down(&ses->sesSem);
+	mutex_lock(&ses->session_mutex);
 	if (ses->need_reconnect)
 		goto session_already_dead; /* no need to send SMBlogoff if uid
 					      already closed due to reconnect */
 	rc = small_smb_init(SMB_COM_LOGOFF_ANDX, 2, NULL, (void **)&pSMB);
 	if (rc) {
-		up(&ses->sesSem);
+		mutex_unlock(&ses->session_mutex);
 		return rc;
 	}
 
@@ -721,7 +727,7 @@ CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses)
 	pSMB->AndXCommand = 0xFF;
 	rc = SendReceiveNoRsp(xid, ses, (struct smb_hdr *) pSMB, 0);
 session_already_dead:
-	up(&ses->sesSem);
+	mutex_unlock(&ses->session_mutex);
 
 	/* if session dead then we do not need to do ulogoff,
 		since server closed smb session, no sense reporting
@@ -743,7 +749,7 @@ CIFSPOSIXDelFile(const int xid, struct cifsTconInfo *tcon, const char *fileName,
 	int bytes_returned = 0;
 	__u16 params, param_offset, offset, byte_count;
 
-	cFYI(1, ("In POSIX delete"));
+	cFYI(1, "In POSIX delete");
 PsxDelete:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -795,7 +801,7 @@ PsxDelete:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc)
-		cFYI(1, ("Posix delete returned %d", rc));
+		cFYI(1, "Posix delete returned %d", rc);
 	cifs_buf_release(pSMB);
 
 	cifs_stats_inc(&tcon->num_deletes);
@@ -842,7 +848,7 @@ DelFileRetry:
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	cifs_stats_inc(&tcon->num_deletes);
 	if (rc)
-		cFYI(1, ("Error in RMFile = %d", rc));
+		cFYI(1, "Error in RMFile = %d", rc);
 
 	cifs_buf_release(pSMB);
 	if (rc == -EAGAIN)
@@ -861,7 +867,7 @@ CIFSSMBRmDir(const int xid, struct cifsTconInfo *tcon, const char *dirName,
 	int bytes_returned;
 	int name_len;
 
-	cFYI(1, ("In CIFSSMBRmDir"));
+	cFYI(1, "In CIFSSMBRmDir");
 RmDirRetry:
 	rc = smb_init(SMB_COM_DELETE_DIRECTORY, 0, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -886,7 +892,7 @@ RmDirRetry:
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	cifs_stats_inc(&tcon->num_rmdirs);
 	if (rc)
-		cFYI(1, ("Error in RMDir = %d", rc));
+		cFYI(1, "Error in RMDir = %d", rc);
 
 	cifs_buf_release(pSMB);
 	if (rc == -EAGAIN)
@@ -904,7 +910,7 @@ CIFSSMBMkDir(const int xid, struct cifsTconInfo *tcon,
 	int bytes_returned;
 	int name_len;
 
-	cFYI(1, ("In CIFSSMBMkDir"));
+	cFYI(1, "In CIFSSMBMkDir");
 MkDirRetry:
 	rc = smb_init(SMB_COM_CREATE_DIRECTORY, 0, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -929,7 +935,7 @@ MkDirRetry:
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	cifs_stats_inc(&tcon->num_mkdirs);
 	if (rc)
-		cFYI(1, ("Error in Mkdir = %d", rc));
+		cFYI(1, "Error in Mkdir = %d", rc);
 
 	cifs_buf_release(pSMB);
 	if (rc == -EAGAIN)
@@ -952,7 +958,7 @@ CIFSPOSIXCreate(const int xid, struct cifsTconInfo *tcon, __u32 posix_flags,
 	OPEN_PSX_REQ *pdata;
 	OPEN_PSX_RSP *psx_rsp;
 
-	cFYI(1, ("In POSIX Create"));
+	cFYI(1, "In POSIX Create");
 PsxCreat:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -1006,11 +1012,11 @@ PsxCreat:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cFYI(1, ("Posix create returned %d", rc));
+		cFYI(1, "Posix create returned %d", rc);
 		goto psx_create_err;
 	}
 
-	cFYI(1, ("copying inode info"));
+	cFYI(1, "copying inode info");
 	rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
 	if (rc || (pSMBr->ByteCount < sizeof(OPEN_PSX_RSP))) {
@@ -1032,11 +1038,11 @@ PsxCreat:
 	/* check to make sure response data is there */
 	if (psx_rsp->ReturnedLevel != cpu_to_le16(SMB_QUERY_FILE_UNIX_BASIC)) {
 		pRetData->Type = cpu_to_le32(-1); /* unknown */
-		cFYI(DBG2, ("unknown type"));
+		cFYI(DBG2, "unknown type");
 	} else {
 		if (pSMBr->ByteCount < sizeof(OPEN_PSX_RSP)
 					+ sizeof(FILE_UNIX_BASIC_INFO)) {
-			cERROR(1, ("Open response data too small"));
+			cERROR(1, "Open response data too small");
 			pRetData->Type = cpu_to_le32(-1);
 			goto psx_create_err;
 		}
@@ -1083,7 +1089,7 @@ static __u16 convert_disposition(int disposition)
 			ofun = SMBOPEN_OCREATE | SMBOPEN_OTRUNC;
 			break;
 		default:
-			cFYI(1, ("unknown disposition %d", disposition));
+			cFYI(1, "unknown disposition %d", disposition);
 			ofun =  SMBOPEN_OAPPEND; /* regular open */
 	}
 	return ofun;
@@ -1174,7 +1180,7 @@ OldOpenRetry:
 			(struct smb_hdr *)pSMBr, &bytes_returned, CIFS_LONG_OP);
 	cifs_stats_inc(&tcon->num_opens);
 	if (rc) {
-		cFYI(1, ("Error in Open = %d", rc));
+		cFYI(1, "Error in Open = %d", rc);
 	} else {
 	/* BB verify if wct == 15 */
 
@@ -1287,7 +1293,7 @@ openRetry:
 			(struct smb_hdr *)pSMBr, &bytes_returned, CIFS_LONG_OP);
 	cifs_stats_inc(&tcon->num_opens);
 	if (rc) {
-		cFYI(1, ("Error in Open = %d", rc));
+		cFYI(1, "Error in Open = %d", rc);
 	} else {
 		*pOplock = pSMBr->OplockLevel; /* 1 byte no need to le_to_cpu */
 		*netfid = pSMBr->Fid;	/* cifs fid stays in le */
@@ -1325,7 +1331,7 @@ CIFSSMBRead(const int xid, struct cifsTconInfo *tcon, const int netfid,
 	int resp_buf_type = 0;
 	struct kvec iov[1];
 
-	cFYI(1, ("Reading %d bytes on fid %d", count, netfid));
+	cFYI(1, "Reading %d bytes on fid %d", count, netfid);
 	if (tcon->ses->capabilities & CAP_LARGE_FILES)
 		wct = 12;
 	else {
@@ -1370,7 +1376,7 @@ CIFSSMBRead(const int xid, struct cifsTconInfo *tcon, const int netfid,
 	cifs_stats_inc(&tcon->num_reads);
 	pSMBr = (READ_RSP *)iov[0].iov_base;
 	if (rc) {
-		cERROR(1, ("Send error in read = %d", rc));
+		cERROR(1, "Send error in read = %d", rc);
 	} else {
 		int data_length = le16_to_cpu(pSMBr->DataLengthHigh);
 		data_length = data_length << 16;
@@ -1380,15 +1386,15 @@ CIFSSMBRead(const int xid, struct cifsTconInfo *tcon, const int netfid,
 		/*check that DataLength would not go beyond end of SMB */
 		if ((data_length > CIFSMaxBufSize)
 				|| (data_length > count)) {
-			cFYI(1, ("bad length %d for count %d",
-				 data_length, count));
+			cFYI(1, "bad length %d for count %d",
+				 data_length, count);
 			rc = -EIO;
 			*nbytes = 0;
 		} else {
 			pReadData = (char *) (&pSMBr->hdr.Protocol) +
 					le16_to_cpu(pSMBr->DataOffset);
 /*			if (rc = copy_to_user(buf, pReadData, data_length)) {
-				cERROR(1,("Faulting on read rc = %d",rc));
+				cERROR(1, "Faulting on read rc = %d",rc);
 				rc = -EFAULT;
 			}*/ /* can not use copy_to_user when using page cache*/
 			if (*buf)
@@ -1430,7 +1436,9 @@ CIFSSMBWrite(const int xid, struct cifsTconInfo *tcon,
 	__u32 bytes_sent;
 	__u16 byte_count;
 
-	/* cFYI(1, ("write at %lld %d bytes", offset, count));*/
+	*nbytes = 0;
+
+	/* cFYI(1, "write at %lld %d bytes", offset, count);*/
 	if (tcon->ses == NULL)
 		return -ECONNABORTED;
 
@@ -1511,12 +1519,19 @@ CIFSSMBWrite(const int xid, struct cifsTconInfo *tcon,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, long_op);
 	cifs_stats_inc(&tcon->num_writes);
 	if (rc) {
-		cFYI(1, ("Send error in write = %d", rc));
-		*nbytes = 0;
+		cFYI(1, "Send error in write = %d", rc);
 	} else {
 		*nbytes = le16_to_cpu(pSMBr->CountHigh);
 		*nbytes = (*nbytes) << 16;
 		*nbytes += le16_to_cpu(pSMBr->Count);
+
+		/*
+		 * Mask off high 16 bits when bytes written as returned by the
+		 * server is greater than bytes requested by the client. Some
+		 * OS/2 servers are known to set incorrect CountHigh values.
+		 */
+		if (*nbytes > count)
+			*nbytes &= 0xFFFF;
 	}
 
 	cifs_buf_release(pSMB);
@@ -1541,7 +1556,7 @@ CIFSSMBWrite2(const int xid, struct cifsTconInfo *tcon,
 
 	*nbytes = 0;
 
-	cFYI(1, ("write2 at %lld %d bytes", (long long)offset, count));
+	cFYI(1, "write2 at %lld %d bytes", (long long)offset, count);
 
 	if (tcon->ses->capabilities & CAP_LARGE_FILES) {
 		wct = 14;
@@ -1596,7 +1611,7 @@ CIFSSMBWrite2(const int xid, struct cifsTconInfo *tcon,
 			  long_op);
 	cifs_stats_inc(&tcon->num_writes);
 	if (rc) {
-		cFYI(1, ("Send error Write2 = %d", rc));
+		cFYI(1, "Send error Write2 = %d", rc);
 	} else if (resp_buf_type == 0) {
 		/* presumably this can not happen, but best to be safe */
 		rc = -EIO;
@@ -1605,6 +1620,14 @@ CIFSSMBWrite2(const int xid, struct cifsTconInfo *tcon,
 		*nbytes = le16_to_cpu(pSMBr->CountHigh);
 		*nbytes = (*nbytes) << 16;
 		*nbytes += le16_to_cpu(pSMBr->Count);
+
+		/*
+		 * Mask off high 16 bits when bytes written as returned by the
+		 * server is greater than bytes requested by the client. OS/2
+		 * servers are known to set incorrect CountHigh values.
+		 */
+		if (*nbytes > count)
+			*nbytes &= 0xFFFF;
 	}
 
 /*	cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */
@@ -1633,7 +1656,7 @@ CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
 	int timeout = 0;
 	__u16 count;
 
-	cFYI(1, ("CIFSSMBLock timeout %d numLock %d", (int)waitFlag, numLock));
+	cFYI(1, "CIFSSMBLock timeout %d numLock %d", (int)waitFlag, numLock);
 	rc = small_smb_init(SMB_COM_LOCKING_ANDX, 8, tcon, (void **) &pSMB);
 
 	if (rc)
@@ -1681,7 +1704,7 @@ CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
 	}
 	cifs_stats_inc(&tcon->num_locks);
 	if (rc)
-		cFYI(1, ("Send error in Lock = %d", rc));
+		cFYI(1, "Send error in Lock = %d", rc);
 
 	/* Note: On -EAGAIN error only caller can retry on handle based calls
 	since file handle passed in no longer valid */
@@ -1704,7 +1727,7 @@ CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon,
 	__u16 params, param_offset, offset, byte_count, count;
 	struct kvec iov[1];
 
-	cFYI(1, ("Posix Lock"));
+	cFYI(1, "Posix Lock");
 
 	if (pLockData == NULL)
 		return -EINVAL;
@@ -1774,7 +1797,7 @@ CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon,
 	}
 
 	if (rc) {
-		cFYI(1, ("Send error in Posix Lock = %d", rc));
+		cFYI(1, "Send error in Posix Lock = %d", rc);
 	} else if (get_flag) {
 		/* lock structure can be returned on get */
 		__u16 data_offset;
@@ -1793,8 +1816,21 @@ CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon,
 		}
 		parm_data = (struct cifs_posix_lock *)
 			((char *)&pSMBr->hdr.Protocol + data_offset);
-		if (parm_data->lock_type == cpu_to_le16(CIFS_UNLCK))
+		if (parm_data->lock_type == __constant_cpu_to_le16(CIFS_UNLCK))
 			pLockData->fl_type = F_UNLCK;
+		else {
+			if (parm_data->lock_type ==
+					__constant_cpu_to_le16(CIFS_RDLCK))
+				pLockData->fl_type = F_RDLCK;
+			else if (parm_data->lock_type ==
+					__constant_cpu_to_le16(CIFS_WRLCK))
+				pLockData->fl_type = F_WRLCK;
+
+			pLockData->fl_start = parm_data->start;
+			pLockData->fl_end = parm_data->start +
+						parm_data->length - 1;
+			pLockData->fl_pid = parm_data->pid;
+		}
 	}
 
 plk_err_exit:
@@ -1818,7 +1854,7 @@ CIFSSMBClose(const int xid, struct cifsTconInfo *tcon, int smb_file_id)
 {
 	int rc = 0;
 	CLOSE_REQ *pSMB = NULL;
-	cFYI(1, ("In CIFSSMBClose"));
+	cFYI(1, "In CIFSSMBClose");
 
 /* do not retry on dead session on close */
 	rc = small_smb_init(SMB_COM_CLOSE, 3, tcon, (void **) &pSMB);
@@ -1835,7 +1871,7 @@ CIFSSMBClose(const int xid, struct cifsTconInfo *tcon, int smb_file_id)
 	if (rc) {
 		if (rc != -EINTR) {
 			/* EINTR is expected when user ctl-c to kill app */
-			cERROR(1, ("Send error in Close = %d", rc));
+			cERROR(1, "Send error in Close = %d", rc);
 		}
 	}
 
@@ -1851,7 +1887,7 @@ CIFSSMBFlush(const int xid, struct cifsTconInfo *tcon, int smb_file_id)
 {
 	int rc = 0;
 	FLUSH_REQ *pSMB = NULL;
-	cFYI(1, ("In CIFSSMBFlush"));
+	cFYI(1, "In CIFSSMBFlush");
 
 	rc = small_smb_init(SMB_COM_FLUSH, 1, tcon, (void **) &pSMB);
 	if (rc)
@@ -1862,7 +1898,7 @@ CIFSSMBFlush(const int xid, struct cifsTconInfo *tcon, int smb_file_id)
 	rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
 	cifs_stats_inc(&tcon->num_flushes);
 	if (rc)
-		cERROR(1, ("Send error in Flush = %d", rc));
+		cERROR(1, "Send error in Flush = %d", rc);
 
 	return rc;
 }
@@ -1879,7 +1915,7 @@ CIFSSMBRename(const int xid, struct cifsTconInfo *tcon,
 	int name_len, name_len2;
 	__u16 count;
 
-	cFYI(1, ("In CIFSSMBRename"));
+	cFYI(1, "In CIFSSMBRename");
 renameRetry:
 	rc = smb_init(SMB_COM_RENAME, 1, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -1925,7 +1961,7 @@ renameRetry:
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	cifs_stats_inc(&tcon->num_renames);
 	if (rc)
-		cFYI(1, ("Send error in rename = %d", rc));
+		cFYI(1, "Send error in rename = %d", rc);
 
 	cifs_buf_release(pSMB);
 
@@ -1949,7 +1985,7 @@ int CIFSSMBRenameOpenFile(const int xid, struct cifsTconInfo *pTcon,
 	int len_of_str;
 	__u16 params, param_offset, offset, count, byte_count;
 
-	cFYI(1, ("Rename to File by handle"));
+	cFYI(1, "Rename to File by handle");
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, pTcon, (void **) &pSMB,
 			(void **) &pSMBr);
 	if (rc)
@@ -2004,7 +2040,7 @@ int CIFSSMBRenameOpenFile(const int xid, struct cifsTconInfo *pTcon,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	cifs_stats_inc(&pTcon->num_t2renames);
 	if (rc)
-		cFYI(1, ("Send error in Rename (by file handle) = %d", rc));
+		cFYI(1, "Send error in Rename (by file handle) = %d", rc);
 
 	cifs_buf_release(pSMB);
 
@@ -2026,7 +2062,7 @@ CIFSSMBCopy(const int xid, struct cifsTconInfo *tcon, const char *fromName,
 	int name_len, name_len2;
 	__u16 count;
 
-	cFYI(1, ("In CIFSSMBCopy"));
+	cFYI(1, "In CIFSSMBCopy");
 copyRetry:
 	rc = smb_init(SMB_COM_COPY, 1, tcon, (void **) &pSMB,
 			(void **) &pSMBr);
@@ -2071,8 +2107,8 @@ copyRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 		(struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cFYI(1, ("Send error in copy = %d with %d files copied",
-			rc, le16_to_cpu(pSMBr->CopyCount)));
+		cFYI(1, "Send error in copy = %d with %d files copied",
+			rc, le16_to_cpu(pSMBr->CopyCount));
 	}
 	cifs_buf_release(pSMB);
 
@@ -2096,7 +2132,7 @@ CIFSUnixCreateSymLink(const int xid, struct cifsTconInfo *tcon,
 	int bytes_returned = 0;
 	__u16 params, param_offset, offset, byte_count;
 
-	cFYI(1, ("In Symlink Unix style"));
+	cFYI(1, "In Symlink Unix style");
 createSymLinkRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -2161,7 +2197,7 @@ createSymLinkRetry:
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	cifs_stats_inc(&tcon->num_symlinks);
 	if (rc)
-		cFYI(1, ("Send error in SetPathInfo create symlink = %d", rc));
+		cFYI(1, "Send error in SetPathInfo create symlink = %d", rc);
 
 	cifs_buf_release(pSMB);
 
@@ -2185,7 +2221,7 @@ CIFSUnixCreateHardLink(const int xid, struct cifsTconInfo *tcon,
 	int bytes_returned = 0;
 	__u16 params, param_offset, offset, byte_count;
 
-	cFYI(1, ("In Create Hard link Unix style"));
+	cFYI(1, "In Create Hard link Unix style");
 createHardLinkRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -2247,7 +2283,7 @@ createHardLinkRetry:
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	cifs_stats_inc(&tcon->num_hardlinks);
 	if (rc)
-		cFYI(1, ("Send error in SetPathInfo (hard link) = %d", rc));
+		cFYI(1, "Send error in SetPathInfo (hard link) = %d", rc);
 
 	cifs_buf_release(pSMB);
 	if (rc == -EAGAIN)
@@ -2268,7 +2304,7 @@ CIFSCreateHardLink(const int xid, struct cifsTconInfo *tcon,
 	int name_len, name_len2;
 	__u16 count;
 
-	cFYI(1, ("In CIFSCreateHardLink"));
+	cFYI(1, "In CIFSCreateHardLink");
 winCreateHardLinkRetry:
 
 	rc = smb_init(SMB_COM_NT_RENAME, 4, tcon, (void **) &pSMB,
@@ -2319,7 +2355,7 @@ winCreateHardLinkRetry:
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	cifs_stats_inc(&tcon->num_hardlinks);
 	if (rc)
-		cFYI(1, ("Send error in hard link (NT rename) = %d", rc));
+		cFYI(1, "Send error in hard link (NT rename) = %d", rc);
 
 	cifs_buf_release(pSMB);
 	if (rc == -EAGAIN)
@@ -2342,7 +2378,7 @@ CIFSSMBUnixQuerySymLink(const int xid, struct cifsTconInfo *tcon,
 	__u16 params, byte_count;
 	char *data_start;
 
-	cFYI(1, ("In QPathSymLinkInfo (Unix) for path %s", searchName));
+	cFYI(1, "In QPathSymLinkInfo (Unix) for path %s", searchName);
 
 querySymLinkRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
@@ -2389,7 +2425,7 @@ querySymLinkRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cFYI(1, ("Send error in QuerySymLinkInfo = %d", rc));
+		cFYI(1, "Send error in QuerySymLinkInfo = %d", rc);
 	} else {
 		/* decode response */
 
@@ -2490,21 +2526,21 @@ validate_ntransact(char *buf, char **ppparm, char **ppdata,
 
 	/* should we also check that parm and data areas do not overlap? */
 	if (*ppparm > end_of_smb) {
-		cFYI(1, ("parms start after end of smb"));
+		cFYI(1, "parms start after end of smb");
 		return -EINVAL;
 	} else if (parm_count + *ppparm > end_of_smb) {
-		cFYI(1, ("parm end after end of smb"));
+		cFYI(1, "parm end after end of smb");
 		return -EINVAL;
 	} else if (*ppdata > end_of_smb) {
-		cFYI(1, ("data starts after end of smb"));
+		cFYI(1, "data starts after end of smb");
 		return -EINVAL;
 	} else if (data_count + *ppdata > end_of_smb) {
-		cFYI(1, ("data %p + count %d (%p) ends after end of smb %p start %p",
+		cFYI(1, "data %p + count %d (%p) past smb end %p start %p",
 			*ppdata, data_count, (data_count + *ppdata),
-			end_of_smb, pSMBr));
+			end_of_smb, pSMBr);
 		return -EINVAL;
 	} else if (parm_count + data_count > pSMBr->ByteCount) {
-		cFYI(1, ("parm count and data count larger than SMB"));
+		cFYI(1, "parm count and data count larger than SMB");
 		return -EINVAL;
 	}
 	*pdatalen = data_count;
@@ -2523,7 +2559,7 @@ CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon,
 	struct smb_com_transaction_ioctl_req *pSMB;
 	struct smb_com_transaction_ioctl_rsp *pSMBr;
 
-	cFYI(1, ("In Windows reparse style QueryLink for path %s", searchName));
+	cFYI(1, "In Windows reparse style QueryLink for path %s", searchName);
 	rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
 	if (rc)
@@ -2552,7 +2588,7 @@ CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon,
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cFYI(1, ("Send error in QueryReparseLinkInfo = %d", rc));
+		cFYI(1, "Send error in QueryReparseLinkInfo = %d", rc);
 	} else {		/* decode response */
 		__u32 data_offset = le32_to_cpu(pSMBr->DataOffset);
 		__u32 data_count = le32_to_cpu(pSMBr->DataCount);
@@ -2576,7 +2612,7 @@ CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon,
 			if ((reparse_buf->LinkNamesBuf +
 				reparse_buf->TargetNameOffset +
 				reparse_buf->TargetNameLen) > end_of_smb) {
-				cFYI(1, ("reparse buf beyond SMB"));
+				cFYI(1, "reparse buf beyond SMB");
 				rc = -EIO;
 				goto qreparse_out;
 			}
@@ -2597,12 +2633,12 @@ CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon,
 			}
 		} else {
 			rc = -EIO;
-			cFYI(1, ("Invalid return data count on "
-				 "get reparse info ioctl"));
+			cFYI(1, "Invalid return data count on "
+				 "get reparse info ioctl");
 		}
 		symlinkinfo[buflen] = 0; /* just in case so the caller
 					does not go off the end of the buffer */
-		cFYI(1, ("readlink result - %s", symlinkinfo));
+		cFYI(1, "readlink result - %s", symlinkinfo);
 	}
 
 qreparse_out:
@@ -2625,7 +2661,7 @@ static void cifs_convert_ace(posix_acl_xattr_entry *ace,
 	ace->e_perm = cpu_to_le16(cifs_ace->cifs_e_perm);
 	ace->e_tag  = cpu_to_le16(cifs_ace->cifs_e_tag);
 	ace->e_id   = cpu_to_le32(le64_to_cpu(cifs_ace->cifs_uid));
-	/* cFYI(1,("perm %d tag %d id %d",ace->e_perm,ace->e_tag,ace->e_id)); */
+	/* cFYI(1, "perm %d tag %d id %d",ace->e_perm,ace->e_tag,ace->e_id); */
 
 	return;
 }
@@ -2651,8 +2687,8 @@ static int cifs_copy_posix_acl(char *trgt, char *src, const int buflen,
 		size += sizeof(struct cifs_posix_ace) * count;
 		/* check if we would go beyond end of SMB */
 		if (size_of_data_area < size) {
-			cFYI(1, ("bad CIFS POSIX ACL size %d vs. %d",
-				size_of_data_area, size));
+			cFYI(1, "bad CIFS POSIX ACL size %d vs. %d",
+				size_of_data_area, size);
 			return -EINVAL;
 		}
 	} else if (acl_type & ACL_TYPE_DEFAULT) {
@@ -2699,7 +2735,7 @@ static __u16 convert_ace_to_cifs_ace(struct cifs_posix_ace *cifs_ace,
 		cifs_ace->cifs_uid = cpu_to_le64(-1);
 	} else
 		cifs_ace->cifs_uid = cpu_to_le64(le32_to_cpu(local_ace->e_id));
-	/*cFYI(1,("perm %d tag %d id %d",ace->e_perm,ace->e_tag,ace->e_id));*/
+	/*cFYI(1, "perm %d tag %d id %d",ace->e_perm,ace->e_tag,ace->e_id);*/
 	return rc;
 }
 
@@ -2717,12 +2753,12 @@ static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL,
 		return 0;
 
 	count = posix_acl_xattr_count((size_t)buflen);
-	cFYI(1, ("setting acl with %d entries from buf of length %d and "
+	cFYI(1, "setting acl with %d entries from buf of length %d and "
 		"version of %d",
-		count, buflen, le32_to_cpu(local_acl->a_version)));
+		count, buflen, le32_to_cpu(local_acl->a_version));
 	if (le32_to_cpu(local_acl->a_version) != 2) {
-		cFYI(1, ("unknown POSIX ACL version %d",
-		     le32_to_cpu(local_acl->a_version)));
+		cFYI(1, "unknown POSIX ACL version %d",
+		     le32_to_cpu(local_acl->a_version));
 		return 0;
 	}
 	cifs_acl->version = cpu_to_le16(1);
@@ -2731,7 +2767,7 @@ static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL,
 	else if (acl_type == ACL_TYPE_DEFAULT)
 		cifs_acl->default_entry_count = cpu_to_le16(count);
 	else {
-		cFYI(1, ("unknown ACL type %d", acl_type));
+		cFYI(1, "unknown ACL type %d", acl_type);
 		return 0;
 	}
 	for (i = 0; i < count; i++) {
@@ -2764,7 +2800,7 @@ CIFSSMBGetPosixACL(const int xid, struct cifsTconInfo *tcon,
 	int name_len;
 	__u16 params, byte_count;
 
-	cFYI(1, ("In GetPosixACL (Unix) for path %s", searchName));
+	cFYI(1, "In GetPosixACL (Unix) for path %s", searchName);
 
 queryAclRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
@@ -2816,7 +2852,7 @@ queryAclRetry:
 		(struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	cifs_stats_inc(&tcon->num_acl_get);
 	if (rc) {
-		cFYI(1, ("Send error in Query POSIX ACL = %d", rc));
+		cFYI(1, "Send error in Query POSIX ACL = %d", rc);
 	} else {
 		/* decode response */
 
@@ -2853,7 +2889,7 @@ CIFSSMBSetPosixACL(const int xid, struct cifsTconInfo *tcon,
 	int bytes_returned = 0;
 	__u16 params, byte_count, data_count, param_offset, offset;
 
-	cFYI(1, ("In SetPosixACL (Unix) for path %s", fileName));
+	cFYI(1, "In SetPosixACL (Unix) for path %s", fileName);
 setAclRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -2908,7 +2944,7 @@ setAclRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc)
-		cFYI(1, ("Set POSIX ACL returned %d", rc));
+		cFYI(1, "Set POSIX ACL returned %d", rc);
 
 setACLerrorExit:
 	cifs_buf_release(pSMB);
@@ -2928,7 +2964,7 @@ CIFSGetExtAttr(const int xid, struct cifsTconInfo *tcon,
 	int bytes_returned;
 	__u16 params, byte_count;
 
-	cFYI(1, ("In GetExtAttr"));
+	cFYI(1, "In GetExtAttr");
 	if (tcon == NULL)
 		return -ENODEV;
 
@@ -2967,7 +3003,7 @@ GetExtAttrRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cFYI(1, ("error %d in GetExtAttr", rc));
+		cFYI(1, "error %d in GetExtAttr", rc);
 	} else {
 		/* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
@@ -2982,7 +3018,7 @@ GetExtAttrRetry:
 			struct file_chattr_info *pfinfo;
 			/* BB Do we need a cast or hash here ? */
 			if (count != 16) {
-				cFYI(1, ("Illegal size ret in GetExtAttr"));
+				cFYI(1, "Illegal size ret in GetExtAttr");
 				rc = -EIO;
 				goto GetExtAttrOut;
 			}
@@ -3012,7 +3048,7 @@ CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
 	QUERY_SEC_DESC_REQ *pSMB;
 	struct kvec iov[1];
 
-	cFYI(1, ("GetCifsACL"));
+	cFYI(1, "GetCifsACL");
 
 	*pbuflen = 0;
 	*acl_inf = NULL;
@@ -3037,7 +3073,7 @@ CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
 			 CIFS_STD_OP);
 	cifs_stats_inc(&tcon->num_acl_get);
 	if (rc) {
-		cFYI(1, ("Send error in QuerySecDesc = %d", rc));
+		cFYI(1, "Send error in QuerySecDesc = %d", rc);
 	} else {                /* decode response */
 		__le32 *parm;
 		__u32 parm_len;
@@ -3052,7 +3088,7 @@ CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
 			goto qsec_out;
 		pSMBr = (struct smb_com_ntransact_rsp *)iov[0].iov_base;
 
-		cFYI(1, ("smb %p parm %p data %p", pSMBr, parm, *acl_inf));
+		cFYI(1, "smb %p parm %p data %p", pSMBr, parm, *acl_inf);
 
 		if (le32_to_cpu(pSMBr->ParameterCount) != 4) {
 			rc = -EIO;      /* bad smb */
@@ -3064,8 +3100,8 @@ CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
 
 		acl_len = le32_to_cpu(*parm);
 		if (acl_len != *pbuflen) {
-			cERROR(1, ("acl length %d does not match %d",
-				   acl_len, *pbuflen));
+			cERROR(1, "acl length %d does not match %d",
+				   acl_len, *pbuflen);
 			if (*pbuflen > acl_len)
 				*pbuflen = acl_len;
 		}
@@ -3074,7 +3110,7 @@ CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
 		   header followed by the smallest SID */
 		if ((*pbuflen < sizeof(struct cifs_ntsd) + 8) ||
 		    (*pbuflen >= 64 * 1024)) {
-			cERROR(1, ("bad acl length %d", *pbuflen));
+			cERROR(1, "bad acl length %d", *pbuflen);
 			rc = -EINVAL;
 			*pbuflen = 0;
 		} else {
@@ -3148,9 +3184,9 @@ setCifsAclRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 		(struct smb_hdr *) pSMBr, &bytes_returned, 0);
 
-	cFYI(1, ("SetCIFSACL bytes_returned: %d, rc: %d", bytes_returned, rc));
+	cFYI(1, "SetCIFSACL bytes_returned: %d, rc: %d", bytes_returned, rc);
 	if (rc)
-		cFYI(1, ("Set CIFS ACL returned %d", rc));
+		cFYI(1, "Set CIFS ACL returned %d", rc);
 	cifs_buf_release(pSMB);
 
 	if (rc == -EAGAIN)
@@ -3174,7 +3210,7 @@ int SMBQueryInformation(const int xid, struct cifsTconInfo *tcon,
 	int bytes_returned;
 	int name_len;
 
-	cFYI(1, ("In SMBQPath path %s", searchName));
+	cFYI(1, "In SMBQPath path %s", searchName);
 QInfRetry:
 	rc = smb_init(SMB_COM_QUERY_INFORMATION, 0, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -3200,7 +3236,7 @@ QInfRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cFYI(1, ("Send error in QueryInfo = %d", rc));
+		cFYI(1, "Send error in QueryInfo = %d", rc);
 	} else if (pFinfo) {
 		struct timespec ts;
 		__u32 time = le32_to_cpu(pSMBr->last_write_time);
@@ -3230,8 +3266,72 @@ QInfRetry:
 	return rc;
 }
 
+int
+CIFSSMBQFileInfo(const int xid, struct cifsTconInfo *tcon,
+		 u16 netfid, FILE_ALL_INFO *pFindData)
+{
+	struct smb_t2_qfi_req *pSMB = NULL;
+	struct smb_t2_qfi_rsp *pSMBr = NULL;
+	int rc = 0;
+	int bytes_returned;
+	__u16 params, byte_count;
+
+QFileInfoRetry:
+	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
+		      (void **) &pSMBr);
+	if (rc)
+		return rc;
+
+	params = 2 /* level */ + 2 /* fid */;
+	pSMB->t2.TotalDataCount = 0;
+	pSMB->t2.MaxParameterCount = cpu_to_le16(4);
+	/* BB find exact max data count below from sess structure BB */
+	pSMB->t2.MaxDataCount = cpu_to_le16(CIFSMaxBufSize);
+	pSMB->t2.MaxSetupCount = 0;
+	pSMB->t2.Reserved = 0;
+	pSMB->t2.Flags = 0;
+	pSMB->t2.Timeout = 0;
+	pSMB->t2.Reserved2 = 0;
+	pSMB->t2.ParameterOffset = cpu_to_le16(offsetof(struct smb_t2_qfi_req,
+					       Fid) - 4);
+	pSMB->t2.DataCount = 0;
+	pSMB->t2.DataOffset = 0;
+	pSMB->t2.SetupCount = 1;
+	pSMB->t2.Reserved3 = 0;
+	pSMB->t2.SubCommand = cpu_to_le16(TRANS2_QUERY_FILE_INFORMATION);
+	byte_count = params + 1 /* pad */ ;
+	pSMB->t2.TotalParameterCount = cpu_to_le16(params);
+	pSMB->t2.ParameterCount = pSMB->t2.TotalParameterCount;
+	pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_ALL_INFO);
+	pSMB->Pad = 0;
+	pSMB->Fid = netfid;
+	pSMB->hdr.smb_buf_length += byte_count;
+
+	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
+			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
+	if (rc) {
+		cFYI(1, "Send error in QPathInfo = %d", rc);
+	} else {		/* decode response */
+		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
+		if (rc) /* BB add auto retry on EOPNOTSUPP? */
+			rc = -EIO;
+		else if (pSMBr->ByteCount < 40)
+			rc = -EIO;	/* bad smb */
+		else if (pFindData) {
+			__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
+			memcpy((char *) pFindData,
+			       (char *) &pSMBr->hdr.Protocol +
+			       data_offset, sizeof(FILE_ALL_INFO));
+		} else
+		    rc = -ENOMEM;
+	}
+	cifs_buf_release(pSMB);
+	if (rc == -EAGAIN)
+		goto QFileInfoRetry;
 
+	return rc;
+}
 
 int
 CIFSSMBQPathInfo(const int xid, struct cifsTconInfo *tcon,
@@ -3248,7 +3348,7 @@ CIFSSMBQPathInfo(const int xid, struct cifsTconInfo *tcon,
 	int name_len;
 	__u16 params, byte_count;
 
-/* cFYI(1, ("In QPathInfo path %s", searchName)); */
+/* cFYI(1, "In QPathInfo path %s", searchName); */
 QPathInfoRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -3298,7 +3398,7 @@ QPathInfoRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cFYI(1, ("Send error in QPathInfo = %d", rc));
+		cFYI(1, "Send error in QPathInfo = %d", rc);
 	} else {		/* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
@@ -3335,6 +3435,75 @@ QPathInfoRetry:
 }
 
 int
+CIFSSMBUnixQFileInfo(const int xid, struct cifsTconInfo *tcon,
+		 u16 netfid, FILE_UNIX_BASIC_INFO *pFindData)
+{
+	struct smb_t2_qfi_req *pSMB = NULL;
+	struct smb_t2_qfi_rsp *pSMBr = NULL;
+	int rc = 0;
+	int bytes_returned;
+	__u16 params, byte_count;
+
+UnixQFileInfoRetry:
+	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
+		      (void **) &pSMBr);
+	if (rc)
+		return rc;
+
+	params = 2 /* level */ + 2 /* fid */;
+	pSMB->t2.TotalDataCount = 0;
+	pSMB->t2.MaxParameterCount = cpu_to_le16(4);
+	/* BB find exact max data count below from sess structure BB */
+	pSMB->t2.MaxDataCount = cpu_to_le16(CIFSMaxBufSize);
+	pSMB->t2.MaxSetupCount = 0;
+	pSMB->t2.Reserved = 0;
+	pSMB->t2.Flags = 0;
+	pSMB->t2.Timeout = 0;
+	pSMB->t2.Reserved2 = 0;
+	pSMB->t2.ParameterOffset = cpu_to_le16(offsetof(struct smb_t2_qfi_req,
+					       Fid) - 4);
+	pSMB->t2.DataCount = 0;
+	pSMB->t2.DataOffset = 0;
+	pSMB->t2.SetupCount = 1;
+	pSMB->t2.Reserved3 = 0;
+	pSMB->t2.SubCommand = cpu_to_le16(TRANS2_QUERY_FILE_INFORMATION);
+	byte_count = params + 1 /* pad */ ;
+	pSMB->t2.TotalParameterCount = cpu_to_le16(params);
+	pSMB->t2.ParameterCount = pSMB->t2.TotalParameterCount;
+	pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_UNIX_BASIC);
+	pSMB->Pad = 0;
+	pSMB->Fid = netfid;
+	pSMB->hdr.smb_buf_length += byte_count;
+
+	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
+			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
+	if (rc) {
+		cFYI(1, "Send error in QPathInfo = %d", rc);
+	} else {		/* decode response */
+		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
+
+		if (rc || (pSMBr->ByteCount < sizeof(FILE_UNIX_BASIC_INFO))) {
+			cERROR(1, "Malformed FILE_UNIX_BASIC_INFO response.\n"
+				   "Unix Extensions can be disabled on mount "
+				   "by specifying the nosfu mount option.");
+			rc = -EIO;	/* bad smb */
+		} else {
+			__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
+			memcpy((char *) pFindData,
+			       (char *) &pSMBr->hdr.Protocol +
+			       data_offset,
+			       sizeof(FILE_UNIX_BASIC_INFO));
+		}
+	}
+
+	cifs_buf_release(pSMB);
+	if (rc == -EAGAIN)
+		goto UnixQFileInfoRetry;
+
+	return rc;
+}
+
+int
 CIFSSMBUnixQPathInfo(const int xid, struct cifsTconInfo *tcon,
 		     const unsigned char *searchName,
 		     FILE_UNIX_BASIC_INFO *pFindData,
@@ -3348,7 +3517,7 @@ CIFSSMBUnixQPathInfo(const int xid, struct cifsTconInfo *tcon,
 	int name_len;
 	__u16 params, byte_count;
 
-	cFYI(1, ("In QPathInfo (Unix) the path %s", searchName));
+	cFYI(1, "In QPathInfo (Unix) the path %s", searchName);
 UnixQPathInfoRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -3395,14 +3564,14 @@ UnixQPathInfoRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cFYI(1, ("Send error in QPathInfo = %d", rc));
+		cFYI(1, "Send error in QPathInfo = %d", rc);
 	} else {		/* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
 		if (rc || (pSMBr->ByteCount < sizeof(FILE_UNIX_BASIC_INFO))) {
-			cERROR(1, ("Malformed FILE_UNIX_BASIC_INFO response.\n"
+			cERROR(1, "Malformed FILE_UNIX_BASIC_INFO response.\n"
 				   "Unix Extensions can be disabled on mount "
-				   "by specifying the nosfu mount option."));
+				   "by specifying the nosfu mount option.");
 			rc = -EIO;	/* bad smb */
 		} else {
 			__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
@@ -3436,7 +3605,7 @@ CIFSFindFirst(const int xid, struct cifsTconInfo *tcon,
 	int name_len;
 	__u16 params, byte_count;
 
-	cFYI(1, ("In FindFirst for %s", searchName));
+	cFYI(1, "In FindFirst for %s", searchName);
 
 findFirstRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
@@ -3513,7 +3682,7 @@ findFirstRetry:
 	if (rc) {/* BB add logic to retry regular search if Unix search
 			rejected unexpectedly by server */
 		/* BB Add code to handle unsupported level rc */
-		cFYI(1, ("Error in FindFirst = %d", rc));
+		cFYI(1, "Error in FindFirst = %d", rc);
 
 		cifs_buf_release(pSMB);
 
@@ -3552,7 +3721,7 @@ findFirstRetry:
 			lnoff = le16_to_cpu(parms->LastNameOffset);
 			if (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE <
 			      lnoff) {
-				cERROR(1, ("ignoring corrupt resume name"));
+				cERROR(1, "ignoring corrupt resume name");
 				psrch_inf->last_entry = NULL;
 				return rc;
 			}
@@ -3580,7 +3749,7 @@ int CIFSFindNext(const int xid, struct cifsTconInfo *tcon,
 	int bytes_returned, name_len;
 	__u16 params, byte_count;
 
-	cFYI(1, ("In FindNext"));
+	cFYI(1, "In FindNext");
 
 	if (psrch_inf->endOfSearch)
 		return -ENOENT;
@@ -3644,7 +3813,7 @@ int CIFSFindNext(const int xid, struct cifsTconInfo *tcon,
 			cifs_buf_release(pSMB);
 			rc = 0; /* search probably was closed at end of search*/
 		} else
-			cFYI(1, ("FindNext returned = %d", rc));
+			cFYI(1, "FindNext returned = %d", rc);
 	} else {                /* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
@@ -3680,15 +3849,15 @@ int CIFSFindNext(const int xid, struct cifsTconInfo *tcon,
 			lnoff = le16_to_cpu(parms->LastNameOffset);
 			if (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE <
 			      lnoff) {
-				cERROR(1, ("ignoring corrupt resume name"));
+				cERROR(1, "ignoring corrupt resume name");
 				psrch_inf->last_entry = NULL;
 				return rc;
 			} else
 				psrch_inf->last_entry =
 					psrch_inf->srch_entries_start + lnoff;
 
-/*  cFYI(1,("fnxt2 entries in buf %d index_of_last %d",
-	    psrch_inf->entries_in_buffer, psrch_inf->index_of_last_entry)); */
+/*  cFYI(1, "fnxt2 entries in buf %d index_of_last %d",
+	    psrch_inf->entries_in_buffer, psrch_inf->index_of_last_entry); */
 
 			/* BB fixme add unlock here */
 		}
@@ -3713,7 +3882,7 @@ CIFSFindClose(const int xid, struct cifsTconInfo *tcon,
 	int rc = 0;
 	FINDCLOSE_REQ *pSMB = NULL;
 
-	cFYI(1, ("In CIFSSMBFindClose"));
+	cFYI(1, "In CIFSSMBFindClose");
 	rc = small_smb_init(SMB_COM_FIND_CLOSE2, 1, tcon, (void **)&pSMB);
 
 	/* no sense returning error if session restarted
@@ -3727,7 +3896,7 @@ CIFSFindClose(const int xid, struct cifsTconInfo *tcon,
 	pSMB->ByteCount = 0;
 	rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
 	if (rc)
-		cERROR(1, ("Send error in FindClose = %d", rc));
+		cERROR(1, "Send error in FindClose = %d", rc);
 
 	cifs_stats_inc(&tcon->num_fclose);
 
@@ -3750,7 +3919,7 @@ CIFSGetSrvInodeNumber(const int xid, struct cifsTconInfo *tcon,
 	int name_len, bytes_returned;
 	__u16 params, byte_count;
 
-	cFYI(1, ("In GetSrvInodeNum for %s", searchName));
+	cFYI(1, "In GetSrvInodeNum for %s", searchName);
 	if (tcon == NULL)
 		return -ENODEV;
 
@@ -3800,7 +3969,7 @@ GetInodeNumberRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 		(struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cFYI(1, ("error %d in QueryInternalInfo", rc));
+		cFYI(1, "error %d in QueryInternalInfo", rc);
 	} else {
 		/* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
@@ -3815,7 +3984,7 @@ GetInodeNumberRetry:
 			struct file_internal_info *pfinfo;
 			/* BB Do we need a cast or hash here ? */
 			if (count < 8) {
-				cFYI(1, ("Illegal size ret in QryIntrnlInf"));
+				cFYI(1, "Illegal size ret in QryIntrnlInf");
 				rc = -EIO;
 				goto GetInodeNumOut;
 			}
@@ -3856,16 +4025,16 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
 	*num_of_nodes = le16_to_cpu(pSMBr->NumberOfReferrals);
 
 	if (*num_of_nodes < 1) {
-		cERROR(1, ("num_referrals: must be at least > 0,"
-			"but we get num_referrals = %d\n", *num_of_nodes));
+		cERROR(1, "num_referrals: must be at least > 0,"
+			"but we get num_referrals = %d\n", *num_of_nodes);
 		rc = -EINVAL;
 		goto parse_DFS_referrals_exit;
 	}
 
 	ref = (struct dfs_referral_level_3 *) &(pSMBr->referrals);
 	if (ref->VersionNumber != cpu_to_le16(3)) {
-		cERROR(1, ("Referrals of V%d version are not supported,"
-			"should be V3", le16_to_cpu(ref->VersionNumber)));
+		cERROR(1, "Referrals of V%d version are not supported,"
+			"should be V3", le16_to_cpu(ref->VersionNumber));
 		rc = -EINVAL;
 		goto parse_DFS_referrals_exit;
 	}
@@ -3874,19 +4043,19 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
 	data_end = (char *)(&(pSMBr->PathConsumed)) +
 				le16_to_cpu(pSMBr->t2.DataCount);
 
-	cFYI(1, ("num_referrals: %d dfs flags: 0x%x ... \n",
+	cFYI(1, "num_referrals: %d dfs flags: 0x%x ...\n",
 			*num_of_nodes,
-			le32_to_cpu(pSMBr->DFSFlags)));
+			le32_to_cpu(pSMBr->DFSFlags));
 
 	*target_nodes = kzalloc(sizeof(struct dfs_info3_param) *
 			*num_of_nodes, GFP_KERNEL);
 	if (*target_nodes == NULL) {
-		cERROR(1, ("Failed to allocate buffer for target_nodes\n"));
+		cERROR(1, "Failed to allocate buffer for target_nodes\n");
 		rc = -ENOMEM;
 		goto parse_DFS_referrals_exit;
 	}
 
-	/* collect neccessary data from referrals */
+	/* collect necessary data from referrals */
 	for (i = 0; i < *num_of_nodes; i++) {
 		char *temp;
 		int max_len;
@@ -3957,7 +4126,7 @@ CIFSGetDFSRefer(const int xid, struct cifsSesInfo *ses,
 	*num_of_nodes = 0;
 	*target_nodes = NULL;
 
-	cFYI(1, ("In GetDFSRefer the path %s", searchName));
+	cFYI(1, "In GetDFSRefer the path %s", searchName);
 	if (ses == NULL)
 		return -ENODEV;
 getDFSRetry:
@@ -4024,7 +4193,7 @@ getDFSRetry:
 	rc = SendReceive(xid, ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cFYI(1, ("Send error in GetDFSRefer = %d", rc));
+		cFYI(1, "Send error in GetDFSRefer = %d", rc);
 		goto GetDFSRefExit;
 	}
 	rc = validate_t2((struct smb_t2_rsp *)pSMBr);
@@ -4035,9 +4204,9 @@ getDFSRetry:
 		goto GetDFSRefExit;
 	}
 
-	cFYI(1, ("Decoding GetDFSRefer response BCC: %d  Offset %d",
+	cFYI(1, "Decoding GetDFSRefer response BCC: %d  Offset %d",
 				pSMBr->ByteCount,
-				le16_to_cpu(pSMBr->t2.DataOffset)));
+				le16_to_cpu(pSMBr->t2.DataOffset));
 
 	/* parse returned result into more usable form */
 	rc = parse_DFS_referrals(pSMBr, num_of_nodes,
@@ -4065,7 +4234,7 @@ SMBOldQFSInfo(const int xid, struct cifsTconInfo *tcon, struct kstatfs *FSData)
 	int bytes_returned = 0;
 	__u16 params, byte_count;
 
-	cFYI(1, ("OldQFSInfo"));
+	cFYI(1, "OldQFSInfo");
 oldQFSInfoRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		(void **) &pSMBr);
@@ -4098,7 +4267,7 @@ oldQFSInfoRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 		(struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cFYI(1, ("Send error in QFSInfo = %d", rc));
+		cFYI(1, "Send error in QFSInfo = %d", rc);
 	} else {                /* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
@@ -4106,8 +4275,8 @@ oldQFSInfoRetry:
 			rc = -EIO;      /* bad smb */
 		else {
 			__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
-			cFYI(1, ("qfsinf resp BCC: %d  Offset %d",
-				 pSMBr->ByteCount, data_offset));
+			cFYI(1, "qfsinf resp BCC: %d  Offset %d",
+				 pSMBr->ByteCount, data_offset);
 
 			response_data = (FILE_SYSTEM_ALLOC_INFO *)
 				(((char *) &pSMBr->hdr.Protocol) + data_offset);
@@ -4119,11 +4288,10 @@ oldQFSInfoRetry:
 			       le32_to_cpu(response_data->TotalAllocationUnits);
 			FSData->f_bfree = FSData->f_bavail =
 				le32_to_cpu(response_data->FreeAllocationUnits);
-			cFYI(1,
-			     ("Blocks: %lld  Free: %lld Block size %ld",
-			      (unsigned long long)FSData->f_blocks,
-			      (unsigned long long)FSData->f_bfree,
-			      FSData->f_bsize));
+			cFYI(1, "Blocks: %lld  Free: %lld Block size %ld",
+			     (unsigned long long)FSData->f_blocks,
+			     (unsigned long long)FSData->f_bfree,
+			     FSData->f_bsize);
 		}
 	}
 	cifs_buf_release(pSMB);
@@ -4145,7 +4313,7 @@ CIFSSMBQFSInfo(const int xid, struct cifsTconInfo *tcon, struct kstatfs *FSData)
 	int bytes_returned = 0;
 	__u16 params, byte_count;
 
-	cFYI(1, ("In QFSInfo"));
+	cFYI(1, "In QFSInfo");
 QFSInfoRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -4178,7 +4346,7 @@ QFSInfoRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cFYI(1, ("Send error in QFSInfo = %d", rc));
+		cFYI(1, "Send error in QFSInfo = %d", rc);
 	} else {		/* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
@@ -4199,11 +4367,10 @@ QFSInfoRetry:
 			    le64_to_cpu(response_data->TotalAllocationUnits);
 			FSData->f_bfree = FSData->f_bavail =
 			    le64_to_cpu(response_data->FreeAllocationUnits);
-			cFYI(1,
-			     ("Blocks: %lld  Free: %lld Block size %ld",
-			      (unsigned long long)FSData->f_blocks,
-			      (unsigned long long)FSData->f_bfree,
-			      FSData->f_bsize));
+			cFYI(1, "Blocks: %lld  Free: %lld Block size %ld",
+			     (unsigned long long)FSData->f_blocks,
+			     (unsigned long long)FSData->f_bfree,
+			     FSData->f_bsize);
 		}
 	}
 	cifs_buf_release(pSMB);
@@ -4225,7 +4392,7 @@ CIFSSMBQFSAttributeInfo(const int xid, struct cifsTconInfo *tcon)
 	int bytes_returned = 0;
 	__u16 params, byte_count;
 
-	cFYI(1, ("In QFSAttributeInfo"));
+	cFYI(1, "In QFSAttributeInfo");
 QFSAttributeRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -4259,7 +4426,7 @@ QFSAttributeRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cERROR(1, ("Send error in QFSAttributeInfo = %d", rc));
+		cERROR(1, "Send error in QFSAttributeInfo = %d", rc);
 	} else {		/* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
@@ -4295,7 +4462,7 @@ CIFSSMBQFSDeviceInfo(const int xid, struct cifsTconInfo *tcon)
 	int bytes_returned = 0;
 	__u16 params, byte_count;
 
-	cFYI(1, ("In QFSDeviceInfo"));
+	cFYI(1, "In QFSDeviceInfo");
 QFSDeviceRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -4330,7 +4497,7 @@ QFSDeviceRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cFYI(1, ("Send error in QFSDeviceInfo = %d", rc));
+		cFYI(1, "Send error in QFSDeviceInfo = %d", rc);
 	} else {		/* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
@@ -4365,7 +4532,7 @@ CIFSSMBQFSUnixInfo(const int xid, struct cifsTconInfo *tcon)
 	int bytes_returned = 0;
 	__u16 params, byte_count;
 
-	cFYI(1, ("In QFSUnixInfo"));
+	cFYI(1, "In QFSUnixInfo");
 QFSUnixRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -4399,7 +4566,7 @@ QFSUnixRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cERROR(1, ("Send error in QFSUnixInfo = %d", rc));
+		cERROR(1, "Send error in QFSUnixInfo = %d", rc);
 	} else {		/* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
@@ -4434,7 +4601,7 @@ CIFSSMBSetFSUnixInfo(const int xid, struct cifsTconInfo *tcon, __u64 cap)
 	int bytes_returned = 0;
 	__u16 params, param_offset, offset, byte_count;
 
-	cFYI(1, ("In SETFSUnixInfo"));
+	cFYI(1, "In SETFSUnixInfo");
 SETFSUnixRetry:
 	/* BB switch to small buf init to save memory */
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
@@ -4482,7 +4649,7 @@ SETFSUnixRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cERROR(1, ("Send error in SETFSUnixInfo = %d", rc));
+		cERROR(1, "Send error in SETFSUnixInfo = %d", rc);
 	} else {		/* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 		if (rc)
@@ -4510,7 +4677,7 @@ CIFSSMBQFSPosixInfo(const int xid, struct cifsTconInfo *tcon,
 	int bytes_returned = 0;
 	__u16 params, byte_count;
 
-	cFYI(1, ("In QFSPosixInfo"));
+	cFYI(1, "In QFSPosixInfo");
 QFSPosixRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -4544,7 +4711,7 @@ QFSPosixRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cFYI(1, ("Send error in QFSUnixInfo = %d", rc));
+		cFYI(1, "Send error in QFSUnixInfo = %d", rc);
 	} else {		/* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
@@ -4604,7 +4771,7 @@ CIFSSMBSetEOF(const int xid, struct cifsTconInfo *tcon, const char *fileName,
 	int bytes_returned = 0;
 	__u16 params, byte_count, data_count, param_offset, offset;
 
-	cFYI(1, ("In SetEOF"));
+	cFYI(1, "In SetEOF");
 SetEOFRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -4670,7 +4837,7 @@ SetEOFRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc)
-		cFYI(1, ("SetPathInfo (file size) returned %d", rc));
+		cFYI(1, "SetPathInfo (file size) returned %d", rc);
 
 	cifs_buf_release(pSMB);
 
@@ -4690,8 +4857,8 @@ CIFSSMBSetFileSize(const int xid, struct cifsTconInfo *tcon, __u64 size,
 	int rc = 0;
 	__u16 params, param_offset, offset, byte_count, count;
 
-	cFYI(1, ("SetFileSize (via SetFileInfo) %lld",
-			(long long)size));
+	cFYI(1, "SetFileSize (via SetFileInfo) %lld",
+			(long long)size);
 	rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB);
 
 	if (rc)
@@ -4750,9 +4917,7 @@ CIFSSMBSetFileSize(const int xid, struct cifsTconInfo *tcon, __u64 size,
 	pSMB->ByteCount = cpu_to_le16(byte_count);
 	rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
 	if (rc) {
-		cFYI(1,
-		     ("Send error in SetFileInfo (SetFileSize) = %d",
-		      rc));
+		cFYI(1, "Send error in SetFileInfo (SetFileSize) = %d", rc);
 	}
 
 	/* Note: On -EAGAIN error only caller can retry on handle based calls
@@ -4776,7 +4941,7 @@ CIFSSMBSetFileInfo(const int xid, struct cifsTconInfo *tcon,
 	int rc = 0;
 	__u16 params, param_offset, offset, byte_count, count;
 
-	cFYI(1, ("Set Times (via SetFileInfo)"));
+	cFYI(1, "Set Times (via SetFileInfo)");
 	rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB);
 
 	if (rc)
@@ -4821,7 +4986,7 @@ CIFSSMBSetFileInfo(const int xid, struct cifsTconInfo *tcon,
 	memcpy(data_offset, data, sizeof(FILE_BASIC_INFO));
 	rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
 	if (rc)
-		cFYI(1, ("Send error in Set Time (SetFileInfo) = %d", rc));
+		cFYI(1, "Send error in Set Time (SetFileInfo) = %d", rc);
 
 	/* Note: On -EAGAIN error only caller can retry on handle based calls
 		since file handle passed in no longer valid */
@@ -4838,7 +5003,7 @@ CIFSSMBSetFileDisposition(const int xid, struct cifsTconInfo *tcon,
 	int rc = 0;
 	__u16 params, param_offset, offset, byte_count, count;
 
-	cFYI(1, ("Set File Disposition (via SetFileInfo)"));
+	cFYI(1, "Set File Disposition (via SetFileInfo)");
 	rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB);
 
 	if (rc)
@@ -4880,7 +5045,7 @@ CIFSSMBSetFileDisposition(const int xid, struct cifsTconInfo *tcon,
 	*data_offset = delete_file ? 1 : 0;
 	rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
 	if (rc)
-		cFYI(1, ("Send error in SetFileDisposition = %d", rc));
+		cFYI(1, "Send error in SetFileDisposition = %d", rc);
 
 	return rc;
 }
@@ -4898,7 +5063,7 @@ CIFSSMBSetPathInfo(const int xid, struct cifsTconInfo *tcon,
 	char *data_offset;
 	__u16 params, param_offset, offset, byte_count, count;
 
-	cFYI(1, ("In SetTimes"));
+	cFYI(1, "In SetTimes");
 
 SetTimesRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
@@ -4954,7 +5119,7 @@ SetTimesRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc)
-		cFYI(1, ("SetPathInfo (times) returned %d", rc));
+		cFYI(1, "SetPathInfo (times) returned %d", rc);
 
 	cifs_buf_release(pSMB);
 
@@ -4979,7 +5144,7 @@ CIFSSMBSetAttrLegacy(int xid, struct cifsTconInfo *tcon, char *fileName,
 	int bytes_returned;
 	int name_len;
 
-	cFYI(1, ("In SetAttrLegacy"));
+	cFYI(1, "In SetAttrLegacy");
 
 SetAttrLgcyRetry:
 	rc = smb_init(SMB_COM_SETATTR, 8, tcon, (void **) &pSMB,
@@ -5005,7 +5170,7 @@ SetAttrLgcyRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc)
-		cFYI(1, ("Error in LegacySetAttr = %d", rc));
+		cFYI(1, "Error in LegacySetAttr = %d", rc);
 
 	cifs_buf_release(pSMB);
 
@@ -5067,7 +5232,7 @@ CIFSSMBUnixSetFileInfo(const int xid, struct cifsTconInfo *tcon,
 	int rc = 0;
 	u16 params, param_offset, offset, byte_count, count;
 
-	cFYI(1, ("Set Unix Info (via SetFileInfo)"));
+	cFYI(1, "Set Unix Info (via SetFileInfo)");
 	rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB);
 
 	if (rc)
@@ -5112,7 +5277,7 @@ CIFSSMBUnixSetFileInfo(const int xid, struct cifsTconInfo *tcon,
 
 	rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
 	if (rc)
-		cFYI(1, ("Send error in Set Time (SetFileInfo) = %d", rc));
+		cFYI(1, "Send error in Set Time (SetFileInfo) = %d", rc);
 
 	/* Note: On -EAGAIN error only caller can retry on handle based calls
 		since file handle passed in no longer valid */
@@ -5133,7 +5298,7 @@ CIFSSMBUnixSetPathInfo(const int xid, struct cifsTconInfo *tcon, char *fileName,
 	FILE_UNIX_BASIC_INFO *data_offset;
 	__u16 params, param_offset, offset, count, byte_count;
 
-	cFYI(1, ("In SetUID/GID/Mode"));
+	cFYI(1, "In SetUID/GID/Mode");
 setPermsRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -5189,7 +5354,7 @@ setPermsRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc)
-		cFYI(1, ("SetPathInfo (perms) returned %d", rc));
+		cFYI(1, "SetPathInfo (perms) returned %d", rc);
 
 	cifs_buf_release(pSMB);
 	if (rc == -EAGAIN)
@@ -5208,7 +5373,7 @@ int CIFSSMBNotify(const int xid, struct cifsTconInfo *tcon,
 	struct dir_notify_req *dnotify_req;
 	int bytes_returned;
 
-	cFYI(1, ("In CIFSSMBNotify for file handle %d", (int)netfid));
+	cFYI(1, "In CIFSSMBNotify for file handle %d", (int)netfid);
 	rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
 	if (rc)
@@ -5242,7 +5407,7 @@ int CIFSSMBNotify(const int xid, struct cifsTconInfo *tcon,
 			 (struct smb_hdr *)pSMBr, &bytes_returned,
 			 CIFS_ASYNC_OP);
 	if (rc) {
-		cFYI(1, ("Error in Notify = %d", rc));
+		cFYI(1, "Error in Notify = %d", rc);
 	} else {
 		/* Add file to outstanding requests */
 		/* BB change to kmem cache alloc */
@@ -5269,24 +5434,36 @@ int CIFSSMBNotify(const int xid, struct cifsTconInfo *tcon,
 	cifs_buf_release(pSMB);
 	return rc;
 }
+
 #ifdef CONFIG_CIFS_XATTR
+/*
+ * Do a path-based QUERY_ALL_EAS call and parse the result. This is a common
+ * function used by listxattr and getxattr type calls. When ea_name is set,
+ * it looks for that attribute name and stuffs that value into the EAData
+ * buffer. When ea_name is NULL, it stuffs a list of attribute names into the
+ * buffer. In both cases, the return value is either the length of the
+ * resulting data or a negative error code. If EAData is a NULL pointer then
+ * the data isn't copied to it, but the length is returned.
+ */
 ssize_t
 CIFSSMBQAllEAs(const int xid, struct cifsTconInfo *tcon,
-		 const unsigned char *searchName,
-		 char *EAData, size_t buf_size,
-		 const struct nls_table *nls_codepage, int remap)
+		const unsigned char *searchName, const unsigned char *ea_name,
+		char *EAData, size_t buf_size,
+		const struct nls_table *nls_codepage, int remap)
 {
 		/* BB assumes one setup word */
 	TRANSACTION2_QPI_REQ *pSMB = NULL;
 	TRANSACTION2_QPI_RSP *pSMBr = NULL;
 	int rc = 0;
 	int bytes_returned;
-	int name_len;
+	int list_len;
+	struct fealist *ea_response_data;
 	struct fea *temp_fea;
 	char *temp_ptr;
-	__u16 params, byte_count;
+	char *end_of_smb;
+	__u16 params, byte_count, data_offset;
 
-	cFYI(1, ("In Query All EAs path %s", searchName));
+	cFYI(1, "In Query All EAs path %s", searchName);
 QAllEAsRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -5294,22 +5471,22 @@ QAllEAsRetry:
 		return rc;
 
 	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
-		name_len =
+		list_len =
 		    cifsConvertToUCS((__le16 *) pSMB->FileName, searchName,
 				     PATH_MAX, nls_codepage, remap);
-		name_len++;	/* trailing null */
-		name_len *= 2;
+		list_len++;	/* trailing null */
+		list_len *= 2;
 	} else {	/* BB improve the check for buffer overruns BB */
-		name_len = strnlen(searchName, PATH_MAX);
-		name_len++;	/* trailing null */
-		strncpy(pSMB->FileName, searchName, name_len);
+		list_len = strnlen(searchName, PATH_MAX);
+		list_len++;	/* trailing null */
+		strncpy(pSMB->FileName, searchName, list_len);
 	}
 
-	params = 2 /* level */ + 4 /* reserved */ + name_len /* includes NUL */;
+	params = 2 /* level */ + 4 /* reserved */ + list_len /* includes NUL */;
 	pSMB->TotalDataCount = 0;
 	pSMB->MaxParameterCount = cpu_to_le16(2);
 	/* BB find exact max SMB PDU from sess structure BB */
-	pSMB->MaxDataCount = cpu_to_le16(4000);
+	pSMB->MaxDataCount = cpu_to_le16(CIFSMaxBufSize);
 	pSMB->MaxSetupCount = 0;
 	pSMB->Reserved = 0;
 	pSMB->Flags = 0;
@@ -5333,238 +5510,118 @@ QAllEAsRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cFYI(1, ("Send error in QueryAllEAs = %d", rc));
-	} else {		/* decode response */
-		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
+		cFYI(1, "Send error in QueryAllEAs = %d", rc);
+		goto QAllEAsOut;
+	}
 
-		/* BB also check enough total bytes returned */
-		/* BB we need to improve the validity checking
-		of these trans2 responses */
-		if (rc || (pSMBr->ByteCount < 4))
-			rc = -EIO;	/* bad smb */
-	   /* else if (pFindData){
-			memcpy((char *) pFindData,
-			       (char *) &pSMBr->hdr.Protocol +
-			       data_offset, kl);
-		}*/ else {
-			/* check that length of list is not more than bcc */
-			/* check that each entry does not go beyond length
-			   of list */
-			/* check that each element of each entry does not
-			   go beyond end of list */
-			__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
-			struct fealist *ea_response_data;
-			rc = 0;
-			/* validate_trans2_offsets() */
-			/* BB check if start of smb + data_offset > &bcc+ bcc */
-			ea_response_data = (struct fealist *)
-				(((char *) &pSMBr->hdr.Protocol) +
-				data_offset);
-			name_len = le32_to_cpu(ea_response_data->list_len);
-			cFYI(1, ("ea length %d", name_len));
-			if (name_len <= 8) {
-			/* returned EA size zeroed at top of function */
-				cFYI(1, ("empty EA list returned from server"));
-			} else {
-				/* account for ea list len */
-				name_len -= 4;
-				temp_fea = ea_response_data->list;
-				temp_ptr = (char *)temp_fea;
-				while (name_len > 0) {
-					__u16 value_len;
-					name_len -= 4;
-					temp_ptr += 4;
-					rc += temp_fea->name_len;
-				/* account for prefix user. and trailing null */
-					rc = rc + 5 + 1;
-					if (rc < (int)buf_size) {
-						memcpy(EAData, "user.", 5);
-						EAData += 5;
-						memcpy(EAData, temp_ptr,
-						       temp_fea->name_len);
-						EAData += temp_fea->name_len;
-						/* null terminate name */
-						*EAData = 0;
-						EAData = EAData + 1;
-					} else if (buf_size == 0) {
-						/* skip copy - calc size only */
-					} else {
-						/* stop before overrun buffer */
-						rc = -ERANGE;
-						break;
-					}
-					name_len -= temp_fea->name_len;
-					temp_ptr += temp_fea->name_len;
-					/* account for trailing null */
-					name_len--;
-					temp_ptr++;
-					value_len =
-					      le16_to_cpu(temp_fea->value_len);
-					name_len -= value_len;
-					temp_ptr += value_len;
-					/* BB check that temp_ptr is still
-					      within the SMB BB*/
-
-					/* no trailing null to account for
-					   in value len */
-					/* go on to next EA */
-					temp_fea = (struct fea *)temp_ptr;
-				}
-			}
-		}
+
+	/* BB also check enough total bytes returned */
+	/* BB we need to improve the validity checking
+	of these trans2 responses */
+
+	rc = validate_t2((struct smb_t2_rsp *)pSMBr);
+	if (rc || (pSMBr->ByteCount < 4)) {
+		rc = -EIO;	/* bad smb */
+		goto QAllEAsOut;
 	}
-	cifs_buf_release(pSMB);
-	if (rc == -EAGAIN)
-		goto QAllEAsRetry;
 
-	return (ssize_t)rc;
-}
+	/* check that length of list is not more than bcc */
+	/* check that each entry does not go beyond length
+	   of list */
+	/* check that each element of each entry does not
+	   go beyond end of list */
+	/* validate_trans2_offsets() */
+	/* BB check if start of smb + data_offset > &bcc+ bcc */
 
-ssize_t CIFSSMBQueryEA(const int xid, struct cifsTconInfo *tcon,
-		const unsigned char *searchName, const unsigned char *ea_name,
-		unsigned char *ea_value, size_t buf_size,
-		const struct nls_table *nls_codepage, int remap)
-{
-	TRANSACTION2_QPI_REQ *pSMB = NULL;
-	TRANSACTION2_QPI_RSP *pSMBr = NULL;
-	int rc = 0;
-	int bytes_returned;
-	int name_len;
-	struct fea *temp_fea;
-	char *temp_ptr;
-	__u16 params, byte_count;
+	data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
+	ea_response_data = (struct fealist *)
+				(((char *) &pSMBr->hdr.Protocol) + data_offset);
 
-	cFYI(1, ("In Query EA path %s", searchName));
-QEARetry:
-	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
-		      (void **) &pSMBr);
-	if (rc)
-		return rc;
+	list_len = le32_to_cpu(ea_response_data->list_len);
+	cFYI(1, "ea length %d", list_len);
+	if (list_len <= 8) {
+		cFYI(1, "empty EA list returned from server");
+		goto QAllEAsOut;
+	}
 
-	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
-		name_len =
-		    cifsConvertToUCS((__le16 *) pSMB->FileName, searchName,
-				     PATH_MAX, nls_codepage, remap);
-		name_len++;	/* trailing null */
-		name_len *= 2;
-	} else {	/* BB improve the check for buffer overruns BB */
-		name_len = strnlen(searchName, PATH_MAX);
-		name_len++;	/* trailing null */
-		strncpy(pSMB->FileName, searchName, name_len);
+	/* make sure list_len doesn't go past end of SMB */
+	end_of_smb = (char *)pByteArea(&pSMBr->hdr) + BCC(&pSMBr->hdr);
+	if ((char *)ea_response_data + list_len > end_of_smb) {
+		cFYI(1, "EA list appears to go beyond SMB");
+		rc = -EIO;
+		goto QAllEAsOut;
 	}
 
-	params = 2 /* level */ + 4 /* reserved */ + name_len /* includes NUL */;
-	pSMB->TotalDataCount = 0;
-	pSMB->MaxParameterCount = cpu_to_le16(2);
-	/* BB find exact max SMB PDU from sess structure BB */
-	pSMB->MaxDataCount = cpu_to_le16(4000);
-	pSMB->MaxSetupCount = 0;
-	pSMB->Reserved = 0;
-	pSMB->Flags = 0;
-	pSMB->Timeout = 0;
-	pSMB->Reserved2 = 0;
-	pSMB->ParameterOffset = cpu_to_le16(offsetof(
-		struct smb_com_transaction2_qpi_req, InformationLevel) - 4);
-	pSMB->DataCount = 0;
-	pSMB->DataOffset = 0;
-	pSMB->SetupCount = 1;
-	pSMB->Reserved3 = 0;
-	pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_PATH_INFORMATION);
-	byte_count = params + 1 /* pad */ ;
-	pSMB->TotalParameterCount = cpu_to_le16(params);
-	pSMB->ParameterCount = pSMB->TotalParameterCount;
-	pSMB->InformationLevel = cpu_to_le16(SMB_INFO_QUERY_ALL_EAS);
-	pSMB->Reserved4 = 0;
-	pSMB->hdr.smb_buf_length += byte_count;
-	pSMB->ByteCount = cpu_to_le16(byte_count);
+	/* account for ea list len */
+	list_len -= 4;
+	temp_fea = ea_response_data->list;
+	temp_ptr = (char *)temp_fea;
+	while (list_len > 0) {
+		unsigned int name_len;
+		__u16 value_len;
+
+		list_len -= 4;
+		temp_ptr += 4;
+		/* make sure we can read name_len and value_len */
+		if (list_len < 0) {
+			cFYI(1, "EA entry goes beyond length of list");
+			rc = -EIO;
+			goto QAllEAsOut;
+		}
 
-	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
-			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
-	if (rc) {
-		cFYI(1, ("Send error in Query EA = %d", rc));
-	} else {		/* decode response */
-		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
+		name_len = temp_fea->name_len;
+		value_len = le16_to_cpu(temp_fea->value_len);
+		list_len -= name_len + 1 + value_len;
+		if (list_len < 0) {
+			cFYI(1, "EA entry goes beyond length of list");
+			rc = -EIO;
+			goto QAllEAsOut;
+		}
 
-		/* BB also check enough total bytes returned */
-		/* BB we need to improve the validity checking
-		of these trans2 responses */
-		if (rc || (pSMBr->ByteCount < 4))
-			rc = -EIO;	/* bad smb */
-	   /* else if (pFindData){
-			memcpy((char *) pFindData,
-			       (char *) &pSMBr->hdr.Protocol +
-			       data_offset, kl);
-		}*/ else {
-			/* check that length of list is not more than bcc */
-			/* check that each entry does not go beyond length
-			   of list */
-			/* check that each element of each entry does not
-			   go beyond end of list */
-			__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
-			struct fealist *ea_response_data;
-			rc = -ENODATA;
-			/* validate_trans2_offsets() */
-			/* BB check if start of smb + data_offset > &bcc+ bcc*/
-			ea_response_data = (struct fealist *)
-				(((char *) &pSMBr->hdr.Protocol) +
-				data_offset);
-			name_len = le32_to_cpu(ea_response_data->list_len);
-			cFYI(1, ("ea length %d", name_len));
-			if (name_len <= 8) {
-			/* returned EA size zeroed at top of function */
-				cFYI(1, ("empty EA list returned from server"));
-			} else {
-				/* account for ea list len */
-				name_len -= 4;
-				temp_fea = ea_response_data->list;
-				temp_ptr = (char *)temp_fea;
-				/* loop through checking if we have a matching
-				name and then return the associated value */
-				while (name_len > 0) {
-					__u16 value_len;
-					name_len -= 4;
-					temp_ptr += 4;
-					value_len =
-					      le16_to_cpu(temp_fea->value_len);
-				/* BB validate that value_len falls within SMB,
-				even though maximum for name_len is 255 */
-					if (memcmp(temp_fea->name, ea_name,
-						  temp_fea->name_len) == 0) {
-						/* found a match */
-						rc = value_len;
-				/* account for prefix user. and trailing null */
-						if (rc <= (int)buf_size) {
-							memcpy(ea_value,
-								temp_fea->name+temp_fea->name_len+1,
-								rc);
-							/* ea values, unlike ea
-							   names, are not null
-							   terminated */
-						} else if (buf_size == 0) {
-						/* skip copy - calc size only */
-						} else {
-						/* stop before overrun buffer */
-							rc = -ERANGE;
-						}
-						break;
-					}
-					name_len -= temp_fea->name_len;
-					temp_ptr += temp_fea->name_len;
-					/* account for trailing null */
-					name_len--;
-					temp_ptr++;
-					name_len -= value_len;
-					temp_ptr += value_len;
-					/* No trailing null to account for in
-					   value_len.  Go on to next EA */
-					temp_fea = (struct fea *)temp_ptr;
+		if (ea_name) {
+			if (strncmp(ea_name, temp_ptr, name_len) == 0) {
+				temp_ptr += name_len + 1;
+				rc = value_len;
+				if (buf_size == 0)
+					goto QAllEAsOut;
+				if ((size_t)value_len > buf_size) {
+					rc = -ERANGE;
+					goto QAllEAsOut;
 				}
+				memcpy(EAData, temp_ptr, value_len);
+				goto QAllEAsOut;
+			}
+		} else {
+			/* account for prefix user. and trailing null */
+			rc += (5 + 1 + name_len);
+			if (rc < (int) buf_size) {
+				memcpy(EAData, "user.", 5);
+				EAData += 5;
+				memcpy(EAData, temp_ptr, name_len);
+				EAData += name_len;
+				/* null terminate name */
+				*EAData = 0;
+				++EAData;
+			} else if (buf_size == 0) {
+				/* skip copy - calc size only */
+			} else {
+				/* stop before overrun buffer */
+				rc = -ERANGE;
+				break;
 			}
 		}
+		temp_ptr += name_len + 1 + value_len;
+		temp_fea = (struct fea *)temp_ptr;
 	}
+
+	/* didn't find the named attribute */
+	if (ea_name)
+		rc = -ENODATA;
+
+QAllEAsOut:
 	cifs_buf_release(pSMB);
 	if (rc == -EAGAIN)
-		goto QEARetry;
+		goto QAllEAsRetry;
 
 	return (ssize_t)rc;
 }
@@ -5583,7 +5640,7 @@ CIFSSMBSetEA(const int xid, struct cifsTconInfo *tcon, const char *fileName,
 	int bytes_returned = 0;
 	__u16 params, param_offset, byte_count, offset, count;
 
-	cFYI(1, ("In SetEA"));
+	cFYI(1, "In SetEA");
 SetEARetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -5665,7 +5722,7 @@ SetEARetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc)
-		cFYI(1, ("SetPathInfo (EA) returned %d", rc));
+		cFYI(1, "SetPathInfo (EA) returned %d", rc);
 
 	cifs_buf_release(pSMB);
 
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 3bbcaa716b3c..2208f06e4c45 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -23,6 +23,7 @@
 #include <linux/string.h>
 #include <linux/list.h>
 #include <linux/wait.h>
+#include <linux/slab.h>
 #include <linux/pagemap.h>
 #include <linux/ctype.h>
 #include <linux/utsname.h>
@@ -98,9 +99,10 @@ struct smb_vol {
 	bool nostrictsync:1; /* do not force expensive SMBflush on every sync */
 	unsigned int rsize;
 	unsigned int wsize;
-	unsigned int sockopt;
+	bool sockopt_tcp_nodelay:1;
 	unsigned short int port;
 	char *prepath;
+	struct nls_table *local_nls;
 };
 
 static int ipv4_connect(struct TCP_Server_Info *server);
@@ -134,7 +136,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
 	spin_unlock(&GlobalMid_Lock);
 	server->maxBuf = 0;
 
-	cFYI(1, ("Reconnecting tcp session"));
+	cFYI(1, "Reconnecting tcp session");
 
 	/* before reconnecting the tcp session, mark the smb session (uid)
 		and the tid bad so they are not used until reconnected */
@@ -152,12 +154,12 @@ cifs_reconnect(struct TCP_Server_Info *server)
 	/* do not want to be sending data on a socket we are freeing */
 	mutex_lock(&server->srv_mutex);
 	if (server->ssocket) {
-		cFYI(1, ("State: 0x%x Flags: 0x%lx", server->ssocket->state,
-			server->ssocket->flags));
+		cFYI(1, "State: 0x%x Flags: 0x%lx", server->ssocket->state,
+			server->ssocket->flags);
 		kernel_sock_shutdown(server->ssocket, SHUT_WR);
-		cFYI(1, ("Post shutdown state: 0x%x Flags: 0x%lx",
+		cFYI(1, "Post shutdown state: 0x%x Flags: 0x%lx",
 			server->ssocket->state,
-			server->ssocket->flags));
+			server->ssocket->flags);
 		sock_release(server->ssocket);
 		server->ssocket = NULL;
 	}
@@ -186,7 +188,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
 		else
 			rc = ipv4_connect(server);
 		if (rc) {
-			cFYI(1, ("reconnect error %d", rc));
+			cFYI(1, "reconnect error %d", rc);
 			msleep(3000);
 		} else {
 			atomic_inc(&tcpSesReconnectCount);
@@ -222,7 +224,7 @@ static int check2ndT2(struct smb_hdr *pSMB, unsigned int maxBufSize)
 	/* check for plausible wct, bcc and t2 data and parm sizes */
 	/* check for parm and data offset going beyond end of smb */
 	if (pSMB->WordCount != 10) { /* coalesce_t2 depends on this */
-		cFYI(1, ("invalid transact2 word count"));
+		cFYI(1, "invalid transact2 word count");
 		return -EINVAL;
 	}
 
@@ -236,15 +238,15 @@ static int check2ndT2(struct smb_hdr *pSMB, unsigned int maxBufSize)
 	if (remaining == 0)
 		return 0;
 	else if (remaining < 0) {
-		cFYI(1, ("total data %d smaller than data in frame %d",
-			total_data_size, data_in_this_rsp));
+		cFYI(1, "total data %d smaller than data in frame %d",
+			total_data_size, data_in_this_rsp);
 		return -EINVAL;
 	} else {
-		cFYI(1, ("missing %d bytes from transact2, check next response",
-			remaining));
+		cFYI(1, "missing %d bytes from transact2, check next response",
+			remaining);
 		if (total_data_size > maxBufSize) {
-			cERROR(1, ("TotalDataSize %d is over maximum buffer %d",
-				total_data_size, maxBufSize));
+			cERROR(1, "TotalDataSize %d is over maximum buffer %d",
+				total_data_size, maxBufSize);
 			return -EINVAL;
 		}
 		return remaining;
@@ -266,7 +268,7 @@ static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB)
 	total_data_size = le16_to_cpu(pSMBt->t2_rsp.TotalDataCount);
 
 	if (total_data_size != le16_to_cpu(pSMB2->t2_rsp.TotalDataCount)) {
-		cFYI(1, ("total data size of primary and secondary t2 differ"));
+		cFYI(1, "total data size of primary and secondary t2 differ");
 	}
 
 	total_in_buf = le16_to_cpu(pSMBt->t2_rsp.DataCount);
@@ -281,7 +283,7 @@ static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB)
 
 	total_in_buf2 = le16_to_cpu(pSMB2->t2_rsp.DataCount);
 	if (remaining < total_in_buf2) {
-		cFYI(1, ("transact2 2nd response contains too much data"));
+		cFYI(1, "transact2 2nd response contains too much data");
 	}
 
 	/* find end of first SMB data area */
@@ -310,7 +312,7 @@ static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB)
 	pTargetSMB->smb_buf_length = byte_count;
 
 	if (remaining == total_in_buf2) {
-		cFYI(1, ("found the last secondary response"));
+		cFYI(1, "found the last secondary response");
 		return 0; /* we are done */
 	} else /* more responses to go */
 		return 1;
@@ -338,7 +340,7 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
 	int reconnect;
 
 	current->flags |= PF_MEMALLOC;
-	cFYI(1, ("Demultiplex PID: %d", task_pid_nr(current)));
+	cFYI(1, "Demultiplex PID: %d", task_pid_nr(current));
 
 	length = atomic_inc_return(&tcpSesAllocCount);
 	if (length > 1)
@@ -352,7 +354,7 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
 		if (bigbuf == NULL) {
 			bigbuf = cifs_buf_get();
 			if (!bigbuf) {
-				cERROR(1, ("No memory for large SMB response"));
+				cERROR(1, "No memory for large SMB response");
 				msleep(3000);
 				/* retry will check if exiting */
 				continue;
@@ -365,7 +367,7 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
 		if (smallbuf == NULL) {
 			smallbuf = cifs_small_buf_get();
 			if (!smallbuf) {
-				cERROR(1, ("No memory for SMB response"));
+				cERROR(1, "No memory for SMB response");
 				msleep(1000);
 				/* retry will check if exiting */
 				continue;
@@ -390,9 +392,9 @@ incomplete_rcv:
 		if (server->tcpStatus == CifsExiting) {
 			break;
 		} else if (server->tcpStatus == CifsNeedReconnect) {
-			cFYI(1, ("Reconnect after server stopped responding"));
+			cFYI(1, "Reconnect after server stopped responding");
 			cifs_reconnect(server);
-			cFYI(1, ("call to reconnect done"));
+			cFYI(1, "call to reconnect done");
 			csocket = server->ssocket;
 			continue;
 		} else if ((length == -ERESTARTSYS) || (length == -EAGAIN)) {
@@ -410,7 +412,7 @@ incomplete_rcv:
 				continue;
 		} else if (length <= 0) {
 			if (server->tcpStatus == CifsNew) {
-				cFYI(1, ("tcp session abend after SMBnegprot"));
+				cFYI(1, "tcp session abend after SMBnegprot");
 				/* some servers kill the TCP session rather than
 				   returning an SMB negprot error, in which
 				   case reconnecting here is not going to help,
@@ -418,18 +420,18 @@ incomplete_rcv:
 				break;
 			}
 			if (!try_to_freeze() && (length == -EINTR)) {
-				cFYI(1, ("cifsd thread killed"));
+				cFYI(1, "cifsd thread killed");
 				break;
 			}
-			cFYI(1, ("Reconnect after unexpected peek error %d",
-				length));
+			cFYI(1, "Reconnect after unexpected peek error %d",
+				length);
 			cifs_reconnect(server);
 			csocket = server->ssocket;
 			wake_up(&server->response_q);
 			continue;
 		} else if (length < pdu_length) {
-			cFYI(1, ("requested %d bytes but only got %d bytes",
-				  pdu_length, length));
+			cFYI(1, "requested %d bytes but only got %d bytes",
+				  pdu_length, length);
 			pdu_length -= length;
 			msleep(1);
 			goto incomplete_rcv;
@@ -449,18 +451,18 @@ incomplete_rcv:
 		pdu_length = be32_to_cpu((__force __be32)smb_buffer->smb_buf_length);
 		smb_buffer->smb_buf_length = pdu_length;
 
-		cFYI(1, ("rfc1002 length 0x%x", pdu_length+4));
+		cFYI(1, "rfc1002 length 0x%x", pdu_length+4);
 
 		if (temp == (char) RFC1002_SESSION_KEEP_ALIVE) {
 			continue;
 		} else if (temp == (char)RFC1002_POSITIVE_SESSION_RESPONSE) {
-			cFYI(1, ("Good RFC 1002 session rsp"));
+			cFYI(1, "Good RFC 1002 session rsp");
 			continue;
 		} else if (temp == (char)RFC1002_NEGATIVE_SESSION_RESPONSE) {
 			/* we get this from Windows 98 instead of
 			   an error on SMB negprot response */
-			cFYI(1, ("Negative RFC1002 Session Response Error 0x%x)",
-				pdu_length));
+			cFYI(1, "Negative RFC1002 Session Response Error 0x%x)",
+				pdu_length);
 			if (server->tcpStatus == CifsNew) {
 				/* if nack on negprot (rather than
 				ret of smb negprot error) reconnecting
@@ -483,7 +485,7 @@ incomplete_rcv:
 				continue;
 			}
 		} else if (temp != (char) 0) {
-			cERROR(1, ("Unknown RFC 1002 frame"));
+			cERROR(1, "Unknown RFC 1002 frame");
 			cifs_dump_mem(" Received Data: ", (char *)smb_buffer,
 				      length);
 			cifs_reconnect(server);
@@ -494,8 +496,8 @@ incomplete_rcv:
 		/* else we have an SMB response */
 		if ((pdu_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) ||
 			    (pdu_length < sizeof(struct smb_hdr) - 1 - 4)) {
-			cERROR(1, ("Invalid size SMB length %d pdu_length %d",
-					length, pdu_length+4));
+			cERROR(1, "Invalid size SMB length %d pdu_length %d",
+					length, pdu_length+4);
 			cifs_reconnect(server);
 			csocket = server->ssocket;
 			wake_up(&server->response_q);
@@ -538,8 +540,8 @@ incomplete_rcv:
 				length = 0;
 				continue;
 			} else if (length <= 0) {
-				cERROR(1, ("Received no data, expecting %d",
-					      pdu_length - total_read));
+				cERROR(1, "Received no data, expecting %d",
+					      pdu_length - total_read);
 				cifs_reconnect(server);
 				csocket = server->ssocket;
 				reconnect = 1;
@@ -587,7 +589,7 @@ incomplete_rcv:
 						}
 					} else {
 						if (!isLargeBuf) {
-							cERROR(1,("1st trans2 resp needs bigbuf"));
+							cERROR(1, "1st trans2 resp needs bigbuf");
 					/* BB maybe we can fix this up,  switch
 					   to already allocated large buffer? */
 						} else {
@@ -629,8 +631,8 @@ multi_t2_fnd:
 			wake_up_process(task_to_wake);
 		} else if (!is_valid_oplock_break(smb_buffer, server) &&
 			   !isMultiRsp) {
-			cERROR(1, ("No task to wake, unknown frame received! "
-				   "NumMids %d", midCount.counter));
+			cERROR(1, "No task to wake, unknown frame received! "
+				   "NumMids %d", midCount.counter);
 			cifs_dump_mem("Received Data is: ", (char *)smb_buffer,
 				      sizeof(struct smb_hdr));
 #ifdef CONFIG_CIFS_DEBUG2
@@ -707,8 +709,8 @@ multi_t2_fnd:
 		list_for_each(tmp, &server->pending_mid_q) {
 		mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
 			if (mid_entry->midState == MID_REQUEST_SUBMITTED) {
-				cFYI(1, ("Clearing Mid 0x%x - waking up ",
-					 mid_entry->mid));
+				cFYI(1, "Clearing Mid 0x%x - waking up ",
+					 mid_entry->mid);
 				task_to_wake = mid_entry->tsk;
 				if (task_to_wake)
 					wake_up_process(task_to_wake);
@@ -727,7 +729,7 @@ multi_t2_fnd:
 		to wait at least 45 seconds before giving up
 		on a request getting a response and going ahead
 		and killing cifsd */
-		cFYI(1, ("Wait for exit from demultiplex thread"));
+		cFYI(1, "Wait for exit from demultiplex thread");
 		msleep(46000);
 		/* if threads still have not exited they are probably never
 		coming home not much else we can do but free the memory */
@@ -848,7 +850,7 @@ cifs_parse_mount_options(char *options, const char *devname,
 			separator[0] = options[4];
 			options += 5;
 		} else {
-			cFYI(1, ("Null separator not allowed"));
+			cFYI(1, "Null separator not allowed");
 		}
 	}
 
@@ -973,7 +975,7 @@ cifs_parse_mount_options(char *options, const char *devname,
 			}
 		} else if (strnicmp(data, "sec", 3) == 0) {
 			if (!value || !*value) {
-				cERROR(1, ("no security value specified"));
+				cERROR(1, "no security value specified");
 				continue;
 			} else if (strnicmp(value, "krb5i", 5) == 0) {
 				vol->secFlg |= CIFSSEC_MAY_KRB5 |
@@ -981,7 +983,7 @@ cifs_parse_mount_options(char *options, const char *devname,
 			} else if (strnicmp(value, "krb5p", 5) == 0) {
 				/* vol->secFlg |= CIFSSEC_MUST_SEAL |
 					CIFSSEC_MAY_KRB5; */
-				cERROR(1, ("Krb5 cifs privacy not supported"));
+				cERROR(1, "Krb5 cifs privacy not supported");
 				return 1;
 			} else if (strnicmp(value, "krb5", 4) == 0) {
 				vol->secFlg |= CIFSSEC_MAY_KRB5;
@@ -1013,7 +1015,7 @@ cifs_parse_mount_options(char *options, const char *devname,
 			} else if (strnicmp(value, "none", 4) == 0) {
 				vol->nullauth = 1;
 			} else {
-				cERROR(1, ("bad security option: %s", value));
+				cERROR(1, "bad security option: %s", value);
 				return 1;
 			}
 		} else if ((strnicmp(data, "unc", 3) == 0)
@@ -1052,7 +1054,7 @@ cifs_parse_mount_options(char *options, const char *devname,
 			a domain name and need special handling? */
 			if (strnlen(value, 256) < 256) {
 				vol->domainname = value;
-				cFYI(1, ("Domain name set"));
+				cFYI(1, "Domain name set");
 			} else {
 				printk(KERN_WARNING "CIFS: domain name too "
 						    "long\n");
@@ -1075,7 +1077,7 @@ cifs_parse_mount_options(char *options, const char *devname,
 					strcpy(vol->prepath+1, value);
 				} else
 					strcpy(vol->prepath, value);
-				cFYI(1, ("prefix path %s", vol->prepath));
+				cFYI(1, "prefix path %s", vol->prepath);
 			} else {
 				printk(KERN_WARNING "CIFS: prefix too long\n");
 				return 1;
@@ -1091,7 +1093,7 @@ cifs_parse_mount_options(char *options, const char *devname,
 					vol->iocharset = value;
 				/* if iocharset not set then load_nls_default
 				   is used by caller */
-				cFYI(1, ("iocharset set to %s", value));
+				cFYI(1, "iocharset set to %s", value);
 			} else {
 				printk(KERN_WARNING "CIFS: iocharset name "
 						    "too long.\n");
@@ -1142,13 +1144,15 @@ cifs_parse_mount_options(char *options, const char *devname,
 					simple_strtoul(value, &value, 0);
 			}
 		} else if (strnicmp(data, "sockopt", 5) == 0) {
-			if (value && *value) {
-				vol->sockopt =
-					simple_strtoul(value, &value, 0);
+			if (!value || !*value) {
+				cERROR(1, "no socket option specified");
+				continue;
+			} else if (strnicmp(value, "TCP_NODELAY", 11) == 0) {
+				vol->sockopt_tcp_nodelay = 1;
 			}
 		} else if (strnicmp(data, "netbiosname", 4) == 0) {
 			if (!value || !*value || (*value == ' ')) {
-				cFYI(1, ("invalid (empty) netbiosname"));
+				cFYI(1, "invalid (empty) netbiosname");
 			} else {
 				memset(vol->source_rfc1001_name, 0x20, 15);
 				for (i = 0; i < 15; i++) {
@@ -1172,7 +1176,7 @@ cifs_parse_mount_options(char *options, const char *devname,
 		} else if (strnicmp(data, "servern", 7) == 0) {
 			/* servernetbiosname specified override *SMBSERVER */
 			if (!value || !*value || (*value == ' ')) {
-				cFYI(1, ("empty server netbiosname specified"));
+				cFYI(1, "empty server netbiosname specified");
 			} else {
 				/* last byte, type, is 0x20 for servr type */
 				memset(vol->target_rfc1001_name, 0x20, 16);
@@ -1431,7 +1435,7 @@ cifs_find_tcp_session(struct sockaddr_storage *addr, unsigned short int port)
 
 		++server->srv_count;
 		write_unlock(&cifs_tcp_ses_lock);
-		cFYI(1, ("Existing tcp session with server found"));
+		cFYI(1, "Existing tcp session with server found");
 		return server;
 	}
 	write_unlock(&cifs_tcp_ses_lock);
@@ -1472,7 +1476,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 
 	memset(&addr, 0, sizeof(struct sockaddr_storage));
 
-	cFYI(1, ("UNC: %s ip: %s", volume_info->UNC, volume_info->UNCip));
+	cFYI(1, "UNC: %s ip: %s", volume_info->UNC, volume_info->UNCip);
 
 	if (volume_info->UNCip && volume_info->UNC) {
 		rc = cifs_convert_address(volume_info->UNCip, &addr);
@@ -1484,13 +1488,12 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 	} else if (volume_info->UNCip) {
 		/* BB using ip addr as tcp_ses name to connect to the
 		   DFS root below */
-		cERROR(1, ("Connecting to DFS root not implemented yet"));
+		cERROR(1, "Connecting to DFS root not implemented yet");
 		rc = -EINVAL;
 		goto out_err;
 	} else /* which tcp_sess DFS root would we conect to */ {
-		cERROR(1,
-		       ("CIFS mount error: No UNC path (e.g. -o "
-			"unc=//192.168.1.100/public) specified"));
+		cERROR(1, "CIFS mount error: No UNC path (e.g. -o "
+			"unc=//192.168.1.100/public) specified");
 		rc = -EINVAL;
 		goto out_err;
 	}
@@ -1514,6 +1517,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 
 	tcp_ses->noblocksnd = volume_info->noblocksnd;
 	tcp_ses->noautotune = volume_info->noautotune;
+	tcp_ses->tcp_nodelay = volume_info->sockopt_tcp_nodelay;
 	atomic_set(&tcp_ses->inFlight, 0);
 	init_waitqueue_head(&tcp_ses->response_q);
 	init_waitqueue_head(&tcp_ses->request_q);
@@ -1536,7 +1540,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 	++tcp_ses->srv_count;
 
 	if (addr.ss_family == AF_INET6) {
-		cFYI(1, ("attempting ipv6 connect"));
+		cFYI(1, "attempting ipv6 connect");
 		/* BB should we allow ipv6 on port 139? */
 		/* other OS never observed in Wild doing 139 with v6 */
 		sin_server6->sin6_port = htons(volume_info->port);
@@ -1550,7 +1554,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 		rc = ipv4_connect(tcp_ses);
 	}
 	if (rc < 0) {
-		cERROR(1, ("Error connecting to socket. Aborting operation"));
+		cERROR(1, "Error connecting to socket. Aborting operation");
 		goto out_err;
 	}
 
@@ -1563,7 +1567,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 				  tcp_ses, "cifsd");
 	if (IS_ERR(tcp_ses->tsk)) {
 		rc = PTR_ERR(tcp_ses->tsk);
-		cERROR(1, ("error %d create cifsd thread", rc));
+		cERROR(1, "error %d create cifsd thread", rc);
 		module_put(THIS_MODULE);
 		goto out_err;
 	}
@@ -1612,6 +1616,7 @@ cifs_put_smb_ses(struct cifsSesInfo *ses)
 	int xid;
 	struct TCP_Server_Info *server = ses->server;
 
+	cFYI(1, "%s: ses_count=%d\n", __func__, ses->ses_count);
 	write_lock(&cifs_tcp_ses_lock);
 	if (--ses->ses_count > 0) {
 		write_unlock(&cifs_tcp_ses_lock);
@@ -1630,6 +1635,102 @@ cifs_put_smb_ses(struct cifsSesInfo *ses)
 	cifs_put_tcp_session(server);
 }
 
+static struct cifsSesInfo *
+cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
+{
+	int rc = -ENOMEM, xid;
+	struct cifsSesInfo *ses;
+
+	xid = GetXid();
+
+	ses = cifs_find_smb_ses(server, volume_info->username);
+	if (ses) {
+		cFYI(1, "Existing smb sess found (status=%d)", ses->status);
+
+		/* existing SMB ses has a server reference already */
+		cifs_put_tcp_session(server);
+
+		mutex_lock(&ses->session_mutex);
+		rc = cifs_negotiate_protocol(xid, ses);
+		if (rc) {
+			mutex_unlock(&ses->session_mutex);
+			/* problem -- put our ses reference */
+			cifs_put_smb_ses(ses);
+			FreeXid(xid);
+			return ERR_PTR(rc);
+		}
+		if (ses->need_reconnect) {
+			cFYI(1, "Session needs reconnect");
+			rc = cifs_setup_session(xid, ses,
+						volume_info->local_nls);
+			if (rc) {
+				mutex_unlock(&ses->session_mutex);
+				/* problem -- put our reference */
+				cifs_put_smb_ses(ses);
+				FreeXid(xid);
+				return ERR_PTR(rc);
+			}
+		}
+		mutex_unlock(&ses->session_mutex);
+		FreeXid(xid);
+		return ses;
+	}
+
+	cFYI(1, "Existing smb sess not found");
+	ses = sesInfoAlloc();
+	if (ses == NULL)
+		goto get_ses_fail;
+
+	/* new SMB session uses our server ref */
+	ses->server = server;
+	if (server->addr.sockAddr6.sin6_family == AF_INET6)
+		sprintf(ses->serverName, "%pI6",
+			&server->addr.sockAddr6.sin6_addr);
+	else
+		sprintf(ses->serverName, "%pI4",
+			&server->addr.sockAddr.sin_addr.s_addr);
+
+	if (volume_info->username)
+		strncpy(ses->userName, volume_info->username,
+			MAX_USERNAME_SIZE);
+
+	/* volume_info->password freed at unmount */
+	if (volume_info->password) {
+		ses->password = kstrdup(volume_info->password, GFP_KERNEL);
+		if (!ses->password)
+			goto get_ses_fail;
+	}
+	if (volume_info->domainname) {
+		int len = strlen(volume_info->domainname);
+		ses->domainName = kmalloc(len + 1, GFP_KERNEL);
+		if (ses->domainName)
+			strcpy(ses->domainName, volume_info->domainname);
+	}
+	ses->linux_uid = volume_info->linux_uid;
+	ses->overrideSecFlg = volume_info->secFlg;
+
+	mutex_lock(&ses->session_mutex);
+	rc = cifs_negotiate_protocol(xid, ses);
+	if (!rc)
+		rc = cifs_setup_session(xid, ses, volume_info->local_nls);
+	mutex_unlock(&ses->session_mutex);
+	if (rc)
+		goto get_ses_fail;
+
+	/* success, put it on the list */
+	write_lock(&cifs_tcp_ses_lock);
+	list_add(&ses->smb_ses_list, &server->smb_ses_list);
+	write_unlock(&cifs_tcp_ses_lock);
+
+	FreeXid(xid);
+	return ses;
+
+get_ses_fail:
+	sesInfoFree(ses);
+	FreeXid(xid);
+	return ERR_PTR(rc);
+}
+
 static struct cifsTconInfo *
 cifs_find_tcon(struct cifsSesInfo *ses, const char *unc)
 {
@@ -1658,6 +1759,7 @@ cifs_put_tcon(struct cifsTconInfo *tcon)
 	int xid;
 	struct cifsSesInfo *ses = tcon->ses;
 
+	cFYI(1, "%s: tc_count=%d\n", __func__, tcon->tc_count);
 	write_lock(&cifs_tcp_ses_lock);
 	if (--tcon->tc_count > 0) {
 		write_unlock(&cifs_tcp_ses_lock);
@@ -1675,6 +1777,80 @@ cifs_put_tcon(struct cifsTconInfo *tcon)
 	cifs_put_smb_ses(ses);
 }
 
+static struct cifsTconInfo *
+cifs_get_tcon(struct cifsSesInfo *ses, struct smb_vol *volume_info)
+{
+	int rc, xid;
+	struct cifsTconInfo *tcon;
+
+	tcon = cifs_find_tcon(ses, volume_info->UNC);
+	if (tcon) {
+		cFYI(1, "Found match on UNC path");
+		/* existing tcon already has a reference */
+		cifs_put_smb_ses(ses);
+		if (tcon->seal != volume_info->seal)
+			cERROR(1, "transport encryption setting "
+				   "conflicts with existing tid");
+		return tcon;
+	}
+
+	tcon = tconInfoAlloc();
+	if (tcon == NULL) {
+		rc = -ENOMEM;
+		goto out_fail;
+	}
+
+	tcon->ses = ses;
+	if (volume_info->password) {
+		tcon->password = kstrdup(volume_info->password, GFP_KERNEL);
+		if (!tcon->password) {
+			rc = -ENOMEM;
+			goto out_fail;
+		}
+	}
+
+	if (strchr(volume_info->UNC + 3, '\\') == NULL
+	    && strchr(volume_info->UNC + 3, '/') == NULL) {
+		cERROR(1, "Missing share name");
+		rc = -ENODEV;
+		goto out_fail;
+	}
+
+	/* BB Do we need to wrap session_mutex around
+	 * this TCon call and Unix SetFS as
+	 * we do on SessSetup and reconnect? */
+	xid = GetXid();
+	rc = CIFSTCon(xid, ses, volume_info->UNC, tcon, volume_info->local_nls);
+	FreeXid(xid);
+	cFYI(1, "CIFS Tcon rc = %d", rc);
+	if (rc)
+		goto out_fail;
+
+	if (volume_info->nodfs) {
+		tcon->Flags &= ~SMB_SHARE_IS_IN_DFS;
+		cFYI(1, "DFS disabled (%d)", tcon->Flags);
+	}
+	tcon->seal = volume_info->seal;
+	/* we can have only one retry value for a connection
+	   to a share so for resources mounted more than once
+	   to the same server share the last value passed in
+	   for the retry flag is used */
+	tcon->retry = volume_info->retry;
+	tcon->nocase = volume_info->nocase;
+	tcon->local_lease = volume_info->local_lease;
+
+	write_lock(&cifs_tcp_ses_lock);
+	list_add(&tcon->tcon_list, &ses->tcon_list);
+	write_unlock(&cifs_tcp_ses_lock);
+
+	return tcon;
+
+out_fail:
+	tconInfoFree(tcon);
+	return ERR_PTR(rc);
+}
+
+
 int
 get_dfs_path(int xid, struct cifsSesInfo *pSesInfo, const char *old_path,
 	     const struct nls_table *nls_codepage, unsigned int *pnum_referrals,
@@ -1699,8 +1875,7 @@ get_dfs_path(int xid, struct cifsSesInfo *pSesInfo, const char *old_path,
 		strcpy(temp_unc + 2, pSesInfo->serverName);
 		strcpy(temp_unc + 2 + strlen(pSesInfo->serverName), "\\IPC$");
 		rc = CIFSTCon(xid, pSesInfo, temp_unc, NULL, nls_codepage);
-		cFYI(1,
-		     ("CIFS Tcon rc = %d ipc_tid = %d", rc, pSesInfo->ipc_tid));
+		cFYI(1, "CIFS Tcon rc = %d ipc_tid = %d", rc, pSesInfo->ipc_tid);
 		kfree(temp_unc);
 	}
 	if (rc == 0)
@@ -1764,6 +1939,7 @@ static int
 ipv4_connect(struct TCP_Server_Info *server)
 {
 	int rc = 0;
+	int val;
 	bool connected = false;
 	__be16 orig_port = 0;
 	struct socket *socket = server->ssocket;
@@ -1772,12 +1948,12 @@ ipv4_connect(struct TCP_Server_Info *server)
 		rc = sock_create_kern(PF_INET, SOCK_STREAM,
 				      IPPROTO_TCP, &socket);
 		if (rc < 0) {
-			cERROR(1, ("Error %d creating socket", rc));
+			cERROR(1, "Error %d creating socket", rc);
 			return rc;
 		}
 
 		/* BB other socket options to set KEEPALIVE, NODELAY? */
-		cFYI(1, ("Socket created"));
+		cFYI(1, "Socket created");
 		server->ssocket = socket;
 		socket->sk->sk_allocation = GFP_NOFS;
 		cifs_reclassify_socket4(socket);
@@ -1822,7 +1998,7 @@ ipv4_connect(struct TCP_Server_Info *server)
 	if (!connected) {
 		if (orig_port)
 			server->addr.sockAddr.sin_port = orig_port;
-		cFYI(1, ("Error %d connecting to server via ipv4", rc));
+		cFYI(1, "Error %d connecting to server via ipv4", rc);
 		sock_release(socket);
 		server->ssocket = NULL;
 		return rc;
@@ -1845,9 +2021,17 @@ ipv4_connect(struct TCP_Server_Info *server)
 			socket->sk->sk_rcvbuf = 140 * 1024;
 	}
 
-	 cFYI(1, ("sndbuf %d rcvbuf %d rcvtimeo 0x%lx",
+	if (server->tcp_nodelay) {
+		val = 1;
+		rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY,
+				(char *)&val, sizeof(val));
+		if (rc)
+			cFYI(1, "set TCP_NODELAY socket option error %d", rc);
+	}
+
+	 cFYI(1, "sndbuf %d rcvbuf %d rcvtimeo 0x%lx",
 		 socket->sk->sk_sndbuf,
-		 socket->sk->sk_rcvbuf, socket->sk->sk_rcvtimeo));
+		 socket->sk->sk_rcvbuf, socket->sk->sk_rcvtimeo);
 
 	/* send RFC1001 sessinit */
 	if (server->addr.sockAddr.sin_port == htons(RFC1001_PORT)) {
@@ -1916,6 +2100,7 @@ static int
 ipv6_connect(struct TCP_Server_Info *server)
 {
 	int rc = 0;
+	int val;
 	bool connected = false;
 	__be16 orig_port = 0;
 	struct socket *socket = server->ssocket;
@@ -1924,13 +2109,13 @@ ipv6_connect(struct TCP_Server_Info *server)
 		rc = sock_create_kern(PF_INET6, SOCK_STREAM,
 				      IPPROTO_TCP, &socket);
 		if (rc < 0) {
-			cERROR(1, ("Error %d creating ipv6 socket", rc));
+			cERROR(1, "Error %d creating ipv6 socket", rc);
 			socket = NULL;
 			return rc;
 		}
 
 		/* BB other socket options to set KEEPALIVE, NODELAY? */
-		cFYI(1, ("ipv6 Socket created"));
+		cFYI(1, "ipv6 Socket created");
 		server->ssocket = socket;
 		socket->sk->sk_allocation = GFP_NOFS;
 		cifs_reclassify_socket6(socket);
@@ -1974,7 +2159,7 @@ ipv6_connect(struct TCP_Server_Info *server)
 	if (!connected) {
 		if (orig_port)
 			server->addr.sockAddr6.sin6_port = orig_port;
-		cFYI(1, ("Error %d connecting to server via ipv6", rc));
+		cFYI(1, "Error %d connecting to server via ipv6", rc);
 		sock_release(socket);
 		server->ssocket = NULL;
 		return rc;
@@ -1987,6 +2172,15 @@ ipv6_connect(struct TCP_Server_Info *server)
 	 */
 	socket->sk->sk_rcvtimeo = 7 * HZ;
 	socket->sk->sk_sndtimeo = 5 * HZ;
+
+	if (server->tcp_nodelay) {
+		val = 1;
+		rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY,
+				(char *)&val, sizeof(val));
+		if (rc)
+			cFYI(1, "set TCP_NODELAY socket option error %d", rc);
+	}
+
 	server->ssocket = socket;
 
 	return rc;
@@ -2009,13 +2203,13 @@ void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon,
 	if (vol_info && vol_info->no_linux_ext) {
 		tcon->fsUnixInfo.Capability = 0;
 		tcon->unix_ext = 0; /* Unix Extensions disabled */
-		cFYI(1, ("Linux protocol extensions disabled"));
+		cFYI(1, "Linux protocol extensions disabled");
 		return;
 	} else if (vol_info)
 		tcon->unix_ext = 1; /* Unix Extensions supported */
 
 	if (tcon->unix_ext == 0) {
-		cFYI(1, ("Unix extensions disabled so not set on reconnect"));
+		cFYI(1, "Unix extensions disabled so not set on reconnect");
 		return;
 	}
 
@@ -2031,12 +2225,11 @@ void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon,
 				cap &= ~CIFS_UNIX_POSIX_ACL_CAP;
 			if ((saved_cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) == 0) {
 				if (cap & CIFS_UNIX_POSIX_PATHNAMES_CAP)
-					cERROR(1, ("POSIXPATH support change"));
+					cERROR(1, "POSIXPATH support change");
 				cap &= ~CIFS_UNIX_POSIX_PATHNAMES_CAP;
 			} else if ((cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) == 0) {
-				cERROR(1, ("possible reconnect error"));
-				cERROR(1,
-					("server disabled POSIX path support"));
+				cERROR(1, "possible reconnect error");
+				cERROR(1, "server disabled POSIX path support");
 			}
 		}
 
@@ -2044,7 +2237,7 @@ void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon,
 		if (vol_info && vol_info->no_psx_acl)
 			cap &= ~CIFS_UNIX_POSIX_ACL_CAP;
 		else if (CIFS_UNIX_POSIX_ACL_CAP & cap) {
-			cFYI(1, ("negotiated posix acl support"));
+			cFYI(1, "negotiated posix acl support");
 			if (sb)
 				sb->s_flags |= MS_POSIXACL;
 		}
@@ -2052,7 +2245,7 @@ void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon,
 		if (vol_info && vol_info->posix_paths == 0)
 			cap &= ~CIFS_UNIX_POSIX_PATHNAMES_CAP;
 		else if (cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) {
-			cFYI(1, ("negotiate posix pathnames"));
+			cFYI(1, "negotiate posix pathnames");
 			if (sb)
 				CIFS_SB(sb)->mnt_cifs_flags |=
 					CIFS_MOUNT_POSIX_PATHS;
@@ -2067,39 +2260,38 @@ void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon,
 		if (sb && (CIFS_SB(sb)->rsize > 127 * 1024)) {
 			if ((cap & CIFS_UNIX_LARGE_READ_CAP) == 0) {
 				CIFS_SB(sb)->rsize = 127 * 1024;
-				cFYI(DBG2,
-					("larger reads not supported by srv"));
+				cFYI(DBG2, "larger reads not supported by srv");
 			}
 		}
 
 
-		cFYI(1, ("Negotiate caps 0x%x", (int)cap));
+		cFYI(1, "Negotiate caps 0x%x", (int)cap);
 #ifdef CONFIG_CIFS_DEBUG2
 		if (cap & CIFS_UNIX_FCNTL_CAP)
-			cFYI(1, ("FCNTL cap"));
+			cFYI(1, "FCNTL cap");
 		if (cap & CIFS_UNIX_EXTATTR_CAP)
-			cFYI(1, ("EXTATTR cap"));
+			cFYI(1, "EXTATTR cap");
 		if (cap & CIFS_UNIX_POSIX_PATHNAMES_CAP)
-			cFYI(1, ("POSIX path cap"));
+			cFYI(1, "POSIX path cap");
 		if (cap & CIFS_UNIX_XATTR_CAP)
-			cFYI(1, ("XATTR cap"));
+			cFYI(1, "XATTR cap");
 		if (cap & CIFS_UNIX_POSIX_ACL_CAP)
-			cFYI(1, ("POSIX ACL cap"));
+			cFYI(1, "POSIX ACL cap");
 		if (cap & CIFS_UNIX_LARGE_READ_CAP)
-			cFYI(1, ("very large read cap"));
+			cFYI(1, "very large read cap");
 		if (cap & CIFS_UNIX_LARGE_WRITE_CAP)
-			cFYI(1, ("very large write cap"));
+			cFYI(1, "very large write cap");
 #endif /* CIFS_DEBUG2 */
 		if (CIFSSMBSetFSUnixInfo(xid, tcon, cap)) {
 			if (vol_info == NULL) {
-				cFYI(1, ("resetting capabilities failed"));
+				cFYI(1, "resetting capabilities failed");
 			} else
-				cERROR(1, ("Negotiating Unix capabilities "
+				cERROR(1, "Negotiating Unix capabilities "
 					   "with the server failed.  Consider "
 					   "mounting with the Unix Extensions\n"
 					   "disabled, if problems are found, "
 					   "by specifying the nounix mount "
-					   "option."));
+					   "option.");
 
 		}
 	}
@@ -2129,8 +2321,8 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
 			  struct cifs_sb_info *cifs_sb)
 {
 	if (pvolume_info->rsize > CIFSMaxBufSize) {
-		cERROR(1, ("rsize %d too large, using MaxBufSize",
-			pvolume_info->rsize));
+		cERROR(1, "rsize %d too large, using MaxBufSize",
+			pvolume_info->rsize);
 		cifs_sb->rsize = CIFSMaxBufSize;
 	} else if ((pvolume_info->rsize) &&
 			(pvolume_info->rsize <= CIFSMaxBufSize))
@@ -2139,8 +2331,8 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
 		cifs_sb->rsize = CIFSMaxBufSize;
 
 	if (pvolume_info->wsize > PAGEVEC_SIZE * PAGE_CACHE_SIZE) {
-		cERROR(1, ("wsize %d too large, using 4096 instead",
-			  pvolume_info->wsize));
+		cERROR(1, "wsize %d too large, using 4096 instead",
+			  pvolume_info->wsize);
 		cifs_sb->wsize = 4096;
 	} else if (pvolume_info->wsize)
 		cifs_sb->wsize = pvolume_info->wsize;
@@ -2158,7 +2350,7 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
 	if (cifs_sb->rsize < 2048) {
 		cifs_sb->rsize = 2048;
 		/* Windows ME may prefer this */
-		cFYI(1, ("readsize set to minimum: 2048"));
+		cFYI(1, "readsize set to minimum: 2048");
 	}
 	/* calculate prepath */
 	cifs_sb->prepath = pvolume_info->prepath;
@@ -2176,8 +2368,8 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
 	cifs_sb->mnt_gid = pvolume_info->linux_gid;
 	cifs_sb->mnt_file_mode = pvolume_info->file_mode;
 	cifs_sb->mnt_dir_mode = pvolume_info->dir_mode;
-	cFYI(1, ("file mode: 0x%x  dir mode: 0x%x",
-		cifs_sb->mnt_file_mode, cifs_sb->mnt_dir_mode));
+	cFYI(1, "file mode: 0x%x  dir mode: 0x%x",
+		cifs_sb->mnt_file_mode, cifs_sb->mnt_dir_mode);
 
 	if (pvolume_info->noperm)
 		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_PERM;
@@ -2206,13 +2398,13 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
 	if (pvolume_info->dynperm)
 		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DYNPERM;
 	if (pvolume_info->direct_io) {
-		cFYI(1, ("mounting share using direct i/o"));
+		cFYI(1, "mounting share using direct i/o");
 		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO;
 	}
 
 	if ((pvolume_info->cifs_acl) && (pvolume_info->dynperm))
-		cERROR(1, ("mount option dynperm ignored if cifsacl "
-			   "mount option supported"));
+		cERROR(1, "mount option dynperm ignored if cifsacl "
+			   "mount option supported");
 }
 
 static int
@@ -2239,7 +2431,7 @@ cleanup_volume_info(struct smb_vol **pvolume_info)
 {
 	struct smb_vol *volume_info;
 
-	if (!pvolume_info && !*pvolume_info)
+	if (!pvolume_info || !*pvolume_info)
 		return;
 
 	volume_info = *pvolume_info;
@@ -2321,11 +2513,11 @@ try_mount_again:
 	}
 
 	if (volume_info->nullauth) {
-		cFYI(1, ("null user"));
+		cFYI(1, "null user");
 		volume_info->username = "";
 	} else if (volume_info->username) {
 		/* BB fixme parse for domain name here */
-		cFYI(1, ("Username: %s", volume_info->username));
+		cFYI(1, "Username: %s", volume_info->username);
 	} else {
 		cifserror("No username specified");
 	/* In userspace mount helper we can get user name from alternate
@@ -2334,20 +2526,20 @@ try_mount_again:
 		goto out;
 	}
 
-
 	/* this is needed for ASCII cp to Unicode converts */
 	if (volume_info->iocharset == NULL) {
-		cifs_sb->local_nls = load_nls_default();
-	/* load_nls_default can not return null */
+		/* load_nls_default cannot return null */
+		volume_info->local_nls = load_nls_default();
 	} else {
-		cifs_sb->local_nls = load_nls(volume_info->iocharset);
-		if (cifs_sb->local_nls == NULL) {
-			cERROR(1, ("CIFS mount error: iocharset %s not found",
-				 volume_info->iocharset));
+		volume_info->local_nls = load_nls(volume_info->iocharset);
+		if (volume_info->local_nls == NULL) {
+			cERROR(1, "CIFS mount error: iocharset %s not found",
+				 volume_info->iocharset);
 			rc = -ELIBACC;
 			goto out;
 		}
 	}
+	cifs_sb->local_nls = volume_info->local_nls;
 
 	/* get a reference to a tcp session */
 	srvTcp = cifs_get_tcp_session(volume_info);
@@ -2356,148 +2548,30 @@ try_mount_again:
 		goto out;
 	}
 
-	pSesInfo = cifs_find_smb_ses(srvTcp, volume_info->username);
-	if (pSesInfo) {
-		cFYI(1, ("Existing smb sess found (status=%d)",
-			pSesInfo->status));
-		/*
-		 * The existing SMB session already has a reference to srvTcp,
-		 * so we can put back the extra one we got before
-		 */
-		cifs_put_tcp_session(srvTcp);
-
-		down(&pSesInfo->sesSem);
-		if (pSesInfo->need_reconnect) {
-			cFYI(1, ("Session needs reconnect"));
-			rc = cifs_setup_session(xid, pSesInfo,
-						cifs_sb->local_nls);
-		}
-		up(&pSesInfo->sesSem);
-	} else if (!rc) {
-		cFYI(1, ("Existing smb sess not found"));
-		pSesInfo = sesInfoAlloc();
-		if (pSesInfo == NULL) {
-			rc = -ENOMEM;
-			goto mount_fail_check;
-		}
-
-		/* new SMB session uses our srvTcp ref */
-		pSesInfo->server = srvTcp;
-		if (srvTcp->addr.sockAddr6.sin6_family == AF_INET6)
-			sprintf(pSesInfo->serverName, "%pI6",
-				&srvTcp->addr.sockAddr6.sin6_addr);
-		else
-			sprintf(pSesInfo->serverName, "%pI4",
-				&srvTcp->addr.sockAddr.sin_addr.s_addr);
-
-		write_lock(&cifs_tcp_ses_lock);
-		list_add(&pSesInfo->smb_ses_list, &srvTcp->smb_ses_list);
-		write_unlock(&cifs_tcp_ses_lock);
-
-		/* volume_info->password freed at unmount */
-		if (volume_info->password) {
-			pSesInfo->password = kstrdup(volume_info->password,
-						     GFP_KERNEL);
-			if (!pSesInfo->password) {
-				rc = -ENOMEM;
-				goto mount_fail_check;
-			}
-		}
-		if (volume_info->username)
-			strncpy(pSesInfo->userName, volume_info->username,
-				MAX_USERNAME_SIZE);
-		if (volume_info->domainname) {
-			int len = strlen(volume_info->domainname);
-			pSesInfo->domainName = kmalloc(len + 1, GFP_KERNEL);
-			if (pSesInfo->domainName)
-				strcpy(pSesInfo->domainName,
-					volume_info->domainname);
-		}
-		pSesInfo->linux_uid = volume_info->linux_uid;
-		pSesInfo->overrideSecFlg = volume_info->secFlg;
-		down(&pSesInfo->sesSem);
-
-		/* BB FIXME need to pass vol->secFlgs BB */
-		rc = cifs_setup_session(xid, pSesInfo,
-					cifs_sb->local_nls);
-		up(&pSesInfo->sesSem);
+	/* get a reference to a SMB session */
+	pSesInfo = cifs_get_smb_ses(srvTcp, volume_info);
+	if (IS_ERR(pSesInfo)) {
+		rc = PTR_ERR(pSesInfo);
+		pSesInfo = NULL;
+		goto mount_fail_check;
 	}
 
-	/* search for existing tcon to this server share */
-	if (!rc) {
-		setup_cifs_sb(volume_info, cifs_sb);
-
-		tcon = cifs_find_tcon(pSesInfo, volume_info->UNC);
-		if (tcon) {
-			cFYI(1, ("Found match on UNC path"));
-			/* existing tcon already has a reference */
-			cifs_put_smb_ses(pSesInfo);
-			if (tcon->seal != volume_info->seal)
-				cERROR(1, ("transport encryption setting "
-					   "conflicts with existing tid"));
-		} else {
-			tcon = tconInfoAlloc();
-			if (tcon == NULL) {
-				rc = -ENOMEM;
-				goto mount_fail_check;
-			}
-
-			tcon->ses = pSesInfo;
-			if (volume_info->password) {
-				tcon->password = kstrdup(volume_info->password,
-							 GFP_KERNEL);
-				if (!tcon->password) {
-					rc = -ENOMEM;
-					goto mount_fail_check;
-				}
-			}
-
-			if ((strchr(volume_info->UNC + 3, '\\') == NULL)
-			    && (strchr(volume_info->UNC + 3, '/') == NULL)) {
-				cERROR(1, ("Missing share name"));
-				rc = -ENODEV;
-				goto mount_fail_check;
-			} else {
-				/* BB Do we need to wrap sesSem around
-				 * this TCon call and Unix SetFS as
-				 * we do on SessSetup and reconnect? */
-				rc = CIFSTCon(xid, pSesInfo, volume_info->UNC,
-					      tcon, cifs_sb->local_nls);
-				cFYI(1, ("CIFS Tcon rc = %d", rc));
-				if (volume_info->nodfs) {
-					tcon->Flags &= ~SMB_SHARE_IS_IN_DFS;
-					cFYI(1, ("DFS disabled (%d)",
-						tcon->Flags));
-				}
-			}
-			if (rc)
-				goto remote_path_check;
-			tcon->seal = volume_info->seal;
-			write_lock(&cifs_tcp_ses_lock);
-			list_add(&tcon->tcon_list, &pSesInfo->tcon_list);
-			write_unlock(&cifs_tcp_ses_lock);
-		}
-
-		/* we can have only one retry value for a connection
-		   to a share so for resources mounted more than once
-		   to the same server share the last value passed in
-		   for the retry flag is used */
-		tcon->retry = volume_info->retry;
-		tcon->nocase = volume_info->nocase;
-		tcon->local_lease = volume_info->local_lease;
-	}
-	if (pSesInfo) {
-		if (pSesInfo->capabilities & CAP_LARGE_FILES)
-			sb->s_maxbytes = MAX_LFS_FILESIZE;
-		else
-			sb->s_maxbytes = MAX_NON_LFS;
-	}
+	setup_cifs_sb(volume_info, cifs_sb);
+	if (pSesInfo->capabilities & CAP_LARGE_FILES)
+		sb->s_maxbytes = MAX_LFS_FILESIZE;
+	else
+		sb->s_maxbytes = MAX_NON_LFS;
 
 	/* BB FIXME fix time_gran to be larger for LANMAN sessions */
 	sb->s_time_gran = 100;
 
-	if (rc)
+	/* search for existing tcon to this server share */
+	tcon = cifs_get_tcon(pSesInfo, volume_info);
+	if (IS_ERR(tcon)) {
+		rc = PTR_ERR(tcon);
+		tcon = NULL;
 		goto remote_path_check;
+	}
 
 	cifs_sb->tcon = tcon;
 
@@ -2521,7 +2595,7 @@ try_mount_again:
 
 	if ((tcon->unix_ext == 0) && (cifs_sb->rsize > (1024 * 127))) {
 		cifs_sb->rsize = 1024 * 127;
-		cFYI(DBG2, ("no very large read support, rsize now 127K"));
+		cFYI(DBG2, "no very large read support, rsize now 127K");
 	}
 	if (!(tcon->ses->capabilities & CAP_LARGE_WRITE_X))
 		cifs_sb->wsize = min(cifs_sb->wsize,
@@ -2570,7 +2644,7 @@ remote_path_check:
 			goto mount_fail_check;
 		}
 
-		cFYI(1, ("Getting referral for: %s", full_path));
+		cFYI(1, "Getting referral for: %s", full_path);
 		rc = get_dfs_path(xid, pSesInfo , full_path + 1,
 			cifs_sb->local_nls, &num_referrals, &referrals,
 			cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
@@ -2684,7 +2758,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
 		   by Samba (not sure whether other servers allow
 		   NTLMv2 password here) */
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
-		if ((extended_security & CIFSSEC_MAY_LANMAN) &&
+		if ((global_secflags & CIFSSEC_MAY_LANMAN) &&
 		    (ses->server->secType == LANMAN))
 			calc_lanman_hash(tcon->password, ses->server->cryptKey,
 					 ses->server->secMode &
@@ -2755,13 +2829,13 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
 		if (length == 3) {
 			if ((bcc_ptr[0] == 'I') && (bcc_ptr[1] == 'P') &&
 			    (bcc_ptr[2] == 'C')) {
-				cFYI(1, ("IPC connection"));
+				cFYI(1, "IPC connection");
 				tcon->ipc = 1;
 			}
 		} else if (length == 2) {
 			if ((bcc_ptr[0] == 'A') && (bcc_ptr[1] == ':')) {
 				/* the most common case */
-				cFYI(1, ("disk share connection"));
+				cFYI(1, "disk share connection");
 			}
 		}
 		bcc_ptr += length + 1;
@@ -2774,7 +2848,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
 						      bytes_left, is_unicode,
 						      nls_codepage);
 
-		cFYI(1, ("nativeFileSystem=%s", tcon->nativeFileSystem));
+		cFYI(1, "nativeFileSystem=%s", tcon->nativeFileSystem);
 
 		if ((smb_buffer_response->WordCount == 3) ||
 			 (smb_buffer_response->WordCount == 7))
@@ -2782,7 +2856,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
 			tcon->Flags = le16_to_cpu(pSMBr->OptionalSupport);
 		else
 			tcon->Flags = 0;
-		cFYI(1, ("Tcon flags: 0x%x ", tcon->Flags));
+		cFYI(1, "Tcon flags: 0x%x ", tcon->Flags);
 	} else if ((rc == 0) && tcon == NULL) {
 		/* all we need to save for IPC$ connection */
 		ses->ipc_tid = smb_buffer_response->Tid;
@@ -2810,57 +2884,61 @@ cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb)
 	return rc;
 }
 
-int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo,
-					   struct nls_table *nls_info)
+int cifs_negotiate_protocol(unsigned int xid, struct cifsSesInfo *ses)
 {
 	int rc = 0;
-	int first_time = 0;
-	struct TCP_Server_Info *server = pSesInfo->server;
-
-	/* what if server changes its buffer size after dropping the session? */
-	if (server->maxBuf == 0) /* no need to send on reconnect */ {
-		rc = CIFSSMBNegotiate(xid, pSesInfo);
-		if (rc == -EAGAIN) {
-			/* retry only once on 1st time connection */
-			rc = CIFSSMBNegotiate(xid, pSesInfo);
-			if (rc == -EAGAIN)
-				rc = -EHOSTDOWN;
-		}
-		if (rc == 0) {
-			spin_lock(&GlobalMid_Lock);
-			if (server->tcpStatus != CifsExiting)
-				server->tcpStatus = CifsGood;
-			else
-				rc = -EHOSTDOWN;
-			spin_unlock(&GlobalMid_Lock);
+	struct TCP_Server_Info *server = ses->server;
 
-		}
-		first_time = 1;
+	/* only send once per connect */
+	if (server->maxBuf != 0)
+		return 0;
+
+	rc = CIFSSMBNegotiate(xid, ses);
+	if (rc == -EAGAIN) {
+		/* retry only once on 1st time connection */
+		rc = CIFSSMBNegotiate(xid, ses);
+		if (rc == -EAGAIN)
+			rc = -EHOSTDOWN;
 	}
+	if (rc == 0) {
+		spin_lock(&GlobalMid_Lock);
+		if (server->tcpStatus != CifsExiting)
+			server->tcpStatus = CifsGood;
+		else
+			rc = -EHOSTDOWN;
+		spin_unlock(&GlobalMid_Lock);
 
-	if (rc)
-		goto ss_err_exit;
+	}
+
+	return rc;
+}
+
+
+int cifs_setup_session(unsigned int xid, struct cifsSesInfo *ses,
+			struct nls_table *nls_info)
+{
+	int rc = 0;
+	struct TCP_Server_Info *server = ses->server;
 
-	pSesInfo->flags = 0;
-	pSesInfo->capabilities = server->capabilities;
+	ses->flags = 0;
+	ses->capabilities = server->capabilities;
 	if (linuxExtEnabled == 0)
-		pSesInfo->capabilities &= (~CAP_UNIX);
+		ses->capabilities &= (~CAP_UNIX);
 
-	cFYI(1, ("Security Mode: 0x%x Capabilities: 0x%x TimeAdjust: %d",
-		 server->secMode, server->capabilities, server->timeAdj));
+	cFYI(1, "Security Mode: 0x%x Capabilities: 0x%x TimeAdjust: %d",
+		 server->secMode, server->capabilities, server->timeAdj);
 
-	rc = CIFS_SessSetup(xid, pSesInfo, first_time, nls_info);
+	rc = CIFS_SessSetup(xid, ses, nls_info);
 	if (rc) {
-		cERROR(1, ("Send error in SessSetup = %d", rc));
+		cERROR(1, "Send error in SessSetup = %d", rc);
 	} else {
-		cFYI(1, ("CIFS Session Established successfully"));
+		cFYI(1, "CIFS Session Established successfully");
 		spin_lock(&GlobalMid_Lock);
-		pSesInfo->status = CifsGood;
-		pSesInfo->need_reconnect = false;
+		ses->status = CifsGood;
+		ses->need_reconnect = false;
 		spin_unlock(&GlobalMid_Lock);
 	}
 
-ss_err_exit:
 	return rc;
 }
 
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 6ccf7262d1b7..e7ae78b66fa1 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -25,6 +25,7 @@
 #include <linux/slab.h>
 #include <linux/namei.h>
 #include <linux/mount.h>
+#include <linux/file.h>
 #include "cifsfs.h"
 #include "cifspdu.h"
 #include "cifsglob.h"
@@ -73,7 +74,7 @@ cifs_bp_rename_retry:
 		namelen += (1 + temp->d_name.len);
 		temp = temp->d_parent;
 		if (temp == NULL) {
-			cERROR(1, ("corrupt dentry"));
+			cERROR(1, "corrupt dentry");
 			return NULL;
 		}
 	}
@@ -90,19 +91,18 @@ cifs_bp_rename_retry:
 			full_path[namelen] = dirsep;
 			strncpy(full_path + namelen + 1, temp->d_name.name,
 				temp->d_name.len);
-			cFYI(0, ("name: %s", full_path + namelen));
+			cFYI(0, "name: %s", full_path + namelen);
 		}
 		temp = temp->d_parent;
 		if (temp == NULL) {
-			cERROR(1, ("corrupt dentry"));
+			cERROR(1, "corrupt dentry");
 			kfree(full_path);
 			return NULL;
 		}
 	}
 	if (namelen != pplen + dfsplen) {
-		cERROR(1,
-		       ("did not end path lookup where expected namelen is %d",
-			namelen));
+		cERROR(1, "did not end path lookup where expected namelen is %d",
+			namelen);
 		/* presumably this is only possible if racing with a rename
 		of one of the parent directories  (we can not lock the dentries
 		above us to prevent this, but retrying should be harmless) */
@@ -130,6 +130,12 @@ cifs_bp_rename_retry:
 	return full_path;
 }
 
+/*
+ * When called with struct file pointer set to NULL, there is no way we could
+ * update file->private_data, but getting it stuck on openFileList provides a
+ * way to access it from cifs_fill_filedata and thereby set file->private_data
+ * from cifs_open.
+ */
 struct cifsFileInfo *
 cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle,
 		  struct file *file, struct vfsmount *mnt, unsigned int oflags)
@@ -173,26 +179,28 @@ cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle,
 		if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
 			pCifsInode->clientCanCacheAll = true;
 			pCifsInode->clientCanCacheRead = true;
-			cFYI(1, ("Exclusive Oplock inode %p", newinode));
+			cFYI(1, "Exclusive Oplock inode %p", newinode);
 		} else if ((oplock & 0xF) == OPLOCK_READ)
 				pCifsInode->clientCanCacheRead = true;
 	}
 	write_unlock(&GlobalSMBSeslock);
 
+	file->private_data = pCifsFile;
+
 	return pCifsFile;
 }
 
 int cifs_posix_open(char *full_path, struct inode **pinode,
-		    struct vfsmount *mnt, int mode, int oflags,
-		    __u32 *poplock, __u16 *pnetfid, int xid)
+			struct super_block *sb, int mode, int oflags,
+			__u32 *poplock, __u16 *pnetfid, int xid)
 {
 	int rc;
 	FILE_UNIX_BASIC_INFO *presp_data;
 	__u32 posix_flags = 0;
-	struct cifs_sb_info *cifs_sb = CIFS_SB(mnt->mnt_sb);
+	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
 	struct cifs_fattr fattr;
 
-	cFYI(1, ("posix open %s", full_path));
+	cFYI(1, "posix open %s", full_path);
 
 	presp_data = kzalloc(sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL);
 	if (presp_data == NULL)
@@ -242,7 +250,8 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
 
 	/* get new inode and set it up */
 	if (*pinode == NULL) {
-		*pinode = cifs_iget(mnt->mnt_sb, &fattr);
+		cifs_fill_uniqueid(sb, &fattr);
+		*pinode = cifs_iget(sb, &fattr);
 		if (!*pinode) {
 			rc = -ENOMEM;
 			goto posix_open_ret;
@@ -251,8 +260,6 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
 		cifs_fattr_to_inode(*pinode, &fattr);
 	}
 
-	cifs_new_fileinfo(*pinode, *pnetfid, NULL, mnt, oflags);
-
 posix_open_ret:
 	kfree(presp_data);
 	return rc;
@@ -280,7 +287,6 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 	int create_options = CREATE_NOT_DIR;
 	__u32 oplock = 0;
 	int oflags;
-	bool posix_create = false;
 	/*
 	 * BB below access is probably too much for mknod to request
 	 *    but we have to do query and setpathinfo so requesting
@@ -315,20 +321,19 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 	if (nd && (nd->flags & LOOKUP_OPEN))
 		oflags = nd->intent.open.flags;
 	else
-		oflags = FMODE_READ;
+		oflags = FMODE_READ | SMB_O_CREAT;
 
 	if (tcon->unix_ext && (tcon->ses->capabilities & CAP_UNIX) &&
 	    (CIFS_UNIX_POSIX_PATH_OPS_CAP &
 			le64_to_cpu(tcon->fsUnixInfo.Capability))) {
-		rc = cifs_posix_open(full_path, &newinode, nd->path.mnt,
-				     mode, oflags, &oplock, &fileHandle, xid);
+		rc = cifs_posix_open(full_path, &newinode,
+			inode->i_sb, mode, oflags, &oplock, &fileHandle, xid);
 		/* EIO could indicate that (posix open) operation is not
 		   supported, despite what server claimed in capability
 		   negotation.  EREMOTE indicates DFS junction, which is not
 		   handled in posix open */
 
 		if (rc == 0) {
-			posix_create = true;
 			if (newinode == NULL) /* query inode info */
 				goto cifs_create_get_file_info;
 			else /* success, no need to query */
@@ -358,7 +363,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 		else if ((oflags & O_CREAT) == O_CREAT)
 			disposition = FILE_OPEN_IF;
 		else
-			cFYI(1, ("Create flag not set in create function"));
+			cFYI(1, "Create flag not set in create function");
 	}
 
 	/* BB add processing to set equivalent of mode - e.g. via CreateX with
@@ -394,7 +399,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 			cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
 	}
 	if (rc) {
-		cFYI(1, ("cifs_create returned 0x%x", rc));
+		cFYI(1, "cifs_create returned 0x%x", rc);
 		goto cifs_create_out;
 	}
 
@@ -457,16 +462,30 @@ cifs_create_set_dentry:
 	if (rc == 0)
 		setup_cifs_dentry(tcon, direntry, newinode);
 	else
-		cFYI(1, ("Create worked, get_inode_info failed rc = %d", rc));
+		cFYI(1, "Create worked, get_inode_info failed rc = %d", rc);
 
-	/* nfsd case - nfs srv does not set nd */
-	if ((nd == NULL) || (!(nd->flags & LOOKUP_OPEN))) {
-		/* mknod case - do not leave file open */
+	if (newinode && nd && (nd->flags & LOOKUP_OPEN)) {
+		struct cifsFileInfo *pfile_info;
+		struct file *filp;
+
+		filp = lookup_instantiate_filp(nd, direntry, generic_file_open);
+		if (IS_ERR(filp)) {
+			rc = PTR_ERR(filp);
+			CIFSSMBClose(xid, tcon, fileHandle);
+			goto cifs_create_out;
+		}
+
+		pfile_info = cifs_new_fileinfo(newinode, fileHandle, filp,
+					       nd->path.mnt, oflags);
+		if (pfile_info == NULL) {
+			fput(filp);
+			CIFSSMBClose(xid, tcon, fileHandle);
+			rc = -ENOMEM;
+		}
+	} else {
 		CIFSSMBClose(xid, tcon, fileHandle);
-	} else if (!(posix_create) && (newinode)) {
-			cifs_new_fileinfo(newinode, fileHandle, NULL,
-						nd->path.mnt, oflags);
 	}
+
 cifs_create_out:
 	kfree(buf);
 	kfree(full_path);
@@ -531,7 +550,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
 			u16 fileHandle;
 			FILE_ALL_INFO *buf;
 
-			cFYI(1, ("sfu compat create special file"));
+			cFYI(1, "sfu compat create special file");
 
 			buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
 			if (buf == NULL) {
@@ -610,14 +629,15 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
 	bool posix_open = false;
 	struct cifs_sb_info *cifs_sb;
 	struct cifsTconInfo *pTcon;
+	struct cifsFileInfo *cfile;
 	struct inode *newInode = NULL;
 	char *full_path = NULL;
 	struct file *filp;
 
 	xid = GetXid();
 
-	cFYI(1, ("parent inode = 0x%p name is: %s and dentry = 0x%p",
-	      parent_dir_inode, direntry->d_name.name, direntry));
+	cFYI(1, "parent inode = 0x%p name is: %s and dentry = 0x%p",
+	      parent_dir_inode, direntry->d_name.name, direntry);
 
 	/* check whether path exists */
 
@@ -632,7 +652,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
 		int i;
 		for (i = 0; i < direntry->d_name.len; i++)
 			if (direntry->d_name.name[i] == '\\') {
-				cFYI(1, ("Invalid file name"));
+				cFYI(1, "Invalid file name");
 				FreeXid(xid);
 				return ERR_PTR(-EINVAL);
 			}
@@ -657,11 +677,11 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
 	}
 
 	if (direntry->d_inode != NULL) {
-		cFYI(1, ("non-NULL inode in lookup"));
+		cFYI(1, "non-NULL inode in lookup");
 	} else {
-		cFYI(1, ("NULL inode in lookup"));
+		cFYI(1, "NULL inode in lookup");
 	}
-	cFYI(1, ("Full path: %s inode = 0x%p", full_path, direntry->d_inode));
+	cFYI(1, "Full path: %s inode = 0x%p", full_path, direntry->d_inode);
 
 	/* Posix open is only called (at lookup time) for file create now.
 	 * For opens (rather than creates), because we do not know if it
@@ -677,7 +697,8 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
 		if (nd && !(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY)) &&
 		     (nd->flags & LOOKUP_OPEN) && !pTcon->broken_posix_open &&
 		     (nd->intent.open.flags & O_CREAT)) {
-			rc = cifs_posix_open(full_path, &newInode, nd->path.mnt,
+			rc = cifs_posix_open(full_path, &newInode,
+					parent_dir_inode->i_sb,
 					nd->intent.open.create_mode,
 					nd->intent.open.flags, &oplock,
 					&fileHandle, xid);
@@ -706,8 +727,25 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
 		else
 			direntry->d_op = &cifs_dentry_ops;
 		d_add(direntry, newInode);
-		if (posix_open)
-			filp = lookup_instantiate_filp(nd, direntry, NULL);
+		if (posix_open) {
+			filp = lookup_instantiate_filp(nd, direntry,
+						       generic_file_open);
+			if (IS_ERR(filp)) {
+				rc = PTR_ERR(filp);
+				CIFSSMBClose(xid, pTcon, fileHandle);
+				goto lookup_out;
+			}
+
+			cfile = cifs_new_fileinfo(newInode, fileHandle, filp,
+						  nd->path.mnt,
+						  nd->intent.open.flags);
+			if (cfile == NULL) {
+				fput(filp);
+				CIFSSMBClose(xid, pTcon, fileHandle);
+				rc = -ENOMEM;
+				goto lookup_out;
+			}
+		}
 		/* since paths are not looked up by component - the parent
 		   directories are presumed to be good here */
 		renew_parental_timestamps(direntry);
@@ -723,11 +761,12 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
 	/*	if it was once a directory (but how can we tell?) we could do
 		shrink_dcache_parent(direntry); */
 	} else if (rc != -EACCES) {
-		cERROR(1, ("Unexpected lookup error %d", rc));
+		cERROR(1, "Unexpected lookup error %d", rc);
 		/* We special case check for Access Denied - since that
 		is a common return code */
 	}
 
+lookup_out:
 	kfree(full_path);
 	FreeXid(xid);
 	return ERR_PTR(rc);
@@ -739,11 +778,11 @@ cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd)
 	int isValid = 1;
 
 	if (direntry->d_inode) {
-		if (cifs_revalidate(direntry))
+		if (cifs_revalidate_dentry(direntry))
 			return 0;
 	} else {
-		cFYI(1, ("neg dentry 0x%p name = %s",
-			 direntry, direntry->d_name.name));
+		cFYI(1, "neg dentry 0x%p name = %s",
+			 direntry, direntry->d_name.name);
 		if (time_after(jiffies, direntry->d_time + HZ) ||
 			!lookupCacheEnabled) {
 			d_drop(direntry);
@@ -758,7 +797,7 @@ cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd)
 {
 	int rc = 0;
 
-	cFYI(1, ("In cifs d_delete, name = %s", direntry->d_name.name));
+	cFYI(1, "In cifs d_delete, name = %s", direntry->d_name.name);
 
 	return rc;
 }     */
diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c
index 87948147d7ec..853a968e82d7 100644
--- a/fs/cifs/dns_resolve.c
+++ b/fs/cifs/dns_resolve.c
@@ -23,12 +23,17 @@
  *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 
+#include <linux/slab.h>
+#include <linux/keyctl.h>
+#include <linux/key-type.h>
 #include <keys/user-type.h>
 #include "dns_resolve.h"
 #include "cifsglob.h"
 #include "cifsproto.h"
 #include "cifs_debug.h"
 
+static const struct cred *dns_resolver_cache;
+
 /* Checks if supplied name is IP address
  * returns:
  * 		1 - name is IP
@@ -93,6 +98,7 @@ struct key_type key_type_dns_resolver = {
 int
 dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
 {
+	const struct cred *saved_cred;
 	int rc = -EAGAIN;
 	struct key *rkey = ERR_PTR(-EAGAIN);
 	char *name;
@@ -105,14 +111,14 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
 	/* search for server name delimiter */
 	len = strlen(unc);
 	if (len < 3) {
-		cFYI(1, ("%s: unc is too short: %s", __func__, unc));
+		cFYI(1, "%s: unc is too short: %s", __func__, unc);
 		return -EINVAL;
 	}
 	len -= 2;
 	name = memchr(unc+2, '\\', len);
 	if (!name) {
-		cFYI(1, ("%s: probably server name is whole unc: %s",
-					__func__, unc));
+		cFYI(1, "%s: probably server name is whole unc: %s",
+					__func__, unc);
 	} else {
 		len = (name - unc) - 2/* leading // */;
 	}
@@ -126,18 +132,25 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
 	name[len] = 0;
 
 	if (is_ip(name)) {
-		cFYI(1, ("%s: it is IP, skipping dns upcall: %s",
-					__func__, name));
+		cFYI(1, "%s: it is IP, skipping dns upcall: %s",
+					__func__, name);
 		data = name;
 		goto skip_upcall;
 	}
 
+	saved_cred = override_creds(dns_resolver_cache);
 	rkey = request_key(&key_type_dns_resolver, name, "");
+	revert_creds(saved_cred);
 	if (!IS_ERR(rkey)) {
+		if (!(rkey->perm & KEY_USR_VIEW)) {
+			down_read(&rkey->sem);
+			rkey->perm |= KEY_USR_VIEW;
+			up_read(&rkey->sem);
+		}
 		len = rkey->type_data.x[0];
 		data = rkey->payload.data;
 	} else {
-		cERROR(1, ("%s: unable to resolve: %s", __func__, name));
+		cERROR(1, "%s: unable to resolve: %s", __func__, name);
 		goto out;
 	}
 
@@ -147,10 +160,10 @@ skip_upcall:
 		if (*ip_addr) {
 			memcpy(*ip_addr, data, len + 1);
 			if (!IS_ERR(rkey))
-				cFYI(1, ("%s: resolved: %s to %s", __func__,
+				cFYI(1, "%s: resolved: %s to %s", __func__,
 							name,
 							*ip_addr
-					));
+					);
 			rc = 0;
 		} else {
 			rc = -ENOMEM;
@@ -164,4 +177,61 @@ out:
 	return rc;
 }
 
+int __init cifs_init_dns_resolver(void)
+{
+	struct cred *cred;
+	struct key *keyring;
+	int ret;
+
+	printk(KERN_NOTICE "Registering the %s key type\n",
+	       key_type_dns_resolver.name);
+
+	/* create an override credential set with a special thread keyring in
+	 * which DNS requests are cached
+	 *
+	 * this is used to prevent malicious redirections from being installed
+	 * with add_key().
+	 */
+	cred = prepare_kernel_cred(NULL);
+	if (!cred)
+		return -ENOMEM;
+
+	keyring = key_alloc(&key_type_keyring, ".dns_resolver", 0, 0, cred,
+			    (KEY_POS_ALL & ~KEY_POS_SETATTR) |
+			    KEY_USR_VIEW | KEY_USR_READ,
+			    KEY_ALLOC_NOT_IN_QUOTA);
+	if (IS_ERR(keyring)) {
+		ret = PTR_ERR(keyring);
+		goto failed_put_cred;
+	}
+
+	ret = key_instantiate_and_link(keyring, NULL, 0, NULL, NULL);
+	if (ret < 0)
+		goto failed_put_key;
+
+	ret = register_key_type(&key_type_dns_resolver);
+	if (ret < 0)
+		goto failed_put_key;
+
+	/* instruct request_key() to use this special keyring as a cache for
+	 * the results it looks up */
+	cred->thread_keyring = keyring;
+	cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;
+	dns_resolver_cache = cred;
+	return 0;
+
+failed_put_key:
+	key_put(keyring);
+failed_put_cred:
+	put_cred(cred);
+	return ret;
+}
 
+void cifs_exit_dns_resolver(void)
+{
+	key_revoke(dns_resolver_cache->thread_keyring);
+	unregister_key_type(&key_type_dns_resolver);
+	put_cred(dns_resolver_cache);
+	printk(KERN_NOTICE "Unregistered %s key type\n",
+	       key_type_dns_resolver.name);
+}
diff --git a/fs/cifs/dns_resolve.h b/fs/cifs/dns_resolve.h
index 966e9288930b..5d7f291df162 100644
--- a/fs/cifs/dns_resolve.h
+++ b/fs/cifs/dns_resolve.h
@@ -24,8 +24,8 @@
 #define _DNS_RESOLVE_H
 
 #ifdef __KERNEL__
-#include <linux/key-type.h>
-extern struct key_type key_type_dns_resolver;
+extern int __init cifs_init_dns_resolver(void);
+extern void cifs_exit_dns_resolver(void);
 extern int dns_resolve_server_name_to_ip(const char *unc, char **ip_addr);
 #endif /* KERNEL */
 
diff --git a/fs/cifs/export.c b/fs/cifs/export.c
index 6177f7cca16a..993f82045bf6 100644
--- a/fs/cifs/export.c
+++ b/fs/cifs/export.c
@@ -49,7 +49,7 @@
 static struct dentry *cifs_get_parent(struct dentry *dentry)
 {
 	/* BB need to add code here eventually to enable export via NFSD */
-	cFYI(1, ("get parent for %p", dentry));
+	cFYI(1, "get parent for %p", dentry);
 	return ERR_PTR(-EACCES);
 }
 
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 057e1dae12ab..409e4f523e61 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -3,7 +3,7 @@
  *
  *   vfs operations that deal with files
  *
- *   Copyright (C) International Business Machines  Corp., 2002,2007
+ *   Copyright (C) International Business Machines  Corp., 2002,2010
  *   Author(s): Steve French (sfrench@us.ibm.com)
  *              Jeremy Allison (jra@samba.org)
  *
@@ -31,6 +31,7 @@
 #include <linux/task_io_accounting_ops.h>
 #include <linux/delay.h>
 #include <linux/mount.h>
+#include <linux/slab.h>
 #include <asm/div64.h>
 #include "cifsfs.h"
 #include "cifspdu.h"
@@ -107,8 +108,7 @@ static inline int cifs_get_disposition(unsigned int flags)
 /* all arguments to this function must be checked for validity in caller */
 static inline int
 cifs_posix_open_inode_helper(struct inode *inode, struct file *file,
-			     struct cifsInodeInfo *pCifsInode,
-			     struct cifsFileInfo *pCifsFile, __u32 oplock,
+			     struct cifsInodeInfo *pCifsInode, __u32 oplock,
 			     u16 netfid)
 {
 
@@ -135,15 +135,15 @@ cifs_posix_open_inode_helper(struct inode *inode, struct file *file,
 	if (timespec_equal(&file->f_path.dentry->d_inode->i_mtime, &temp) &&
 			   (file->f_path.dentry->d_inode->i_size ==
 			    (loff_t)le64_to_cpu(buf->EndOfFile))) {
-		cFYI(1, ("inode unchanged on server"));
+		cFYI(1, "inode unchanged on server");
 	} else {
 		if (file->f_path.dentry->d_inode->i_mapping) {
 			rc = filemap_write_and_wait(file->f_path.dentry->d_inode->i_mapping);
 			if (rc != 0)
 				CIFS_I(file->f_path.dentry->d_inode)->write_behind_rc = rc;
 		}
-		cFYI(1, ("invalidating remote inode since open detected it "
-			 "changed"));
+		cFYI(1, "invalidating remote inode since open detected it "
+			 "changed");
 		invalidate_remote_inode(file->f_path.dentry->d_inode);
 	} */
 
@@ -151,8 +151,8 @@ psx_client_can_cache:
 	if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
 		pCifsInode->clientCanCacheAll = true;
 		pCifsInode->clientCanCacheRead = true;
-		cFYI(1, ("Exclusive Oplock granted on inode %p",
-			 file->f_path.dentry->d_inode));
+		cFYI(1, "Exclusive Oplock granted on inode %p",
+			 file->f_path.dentry->d_inode);
 	} else if ((oplock & 0xF) == OPLOCK_READ)
 		pCifsInode->clientCanCacheRead = true;
 
@@ -162,44 +162,12 @@ psx_client_can_cache:
 	return 0;
 }
 
-static struct cifsFileInfo *
-cifs_fill_filedata(struct file *file)
-{
-	struct list_head *tmp;
-	struct cifsFileInfo *pCifsFile = NULL;
-	struct cifsInodeInfo *pCifsInode = NULL;
-
-	/* search inode for this file and fill in file->private_data */
-	pCifsInode = CIFS_I(file->f_path.dentry->d_inode);
-	read_lock(&GlobalSMBSeslock);
-	list_for_each(tmp, &pCifsInode->openFileList) {
-		pCifsFile = list_entry(tmp, struct cifsFileInfo, flist);
-		if ((pCifsFile->pfile == NULL) &&
-		    (pCifsFile->pid == current->tgid)) {
-			/* mode set in cifs_create */
-
-			/* needed for writepage */
-			pCifsFile->pfile = file;
-			file->private_data = pCifsFile;
-			break;
-		}
-	}
-	read_unlock(&GlobalSMBSeslock);
-
-	if (file->private_data != NULL) {
-		return pCifsFile;
-	} else if ((file->f_flags & O_CREAT) && (file->f_flags & O_EXCL))
-			cERROR(1, ("could not find file instance for "
-				   "new file %p", file));
-	return NULL;
-}
-
 /* all arguments to this function must be checked for validity in caller */
-static inline int cifs_open_inode_helper(struct inode *inode, struct file *file,
-	struct cifsInodeInfo *pCifsInode, struct cifsFileInfo *pCifsFile,
+static inline int cifs_open_inode_helper(struct inode *inode,
 	struct cifsTconInfo *pTcon, int *oplock, FILE_ALL_INFO *buf,
 	char *full_path, int xid)
 {
+	struct cifsInodeInfo *pCifsInode = CIFS_I(inode);
 	struct timespec temp;
 	int rc;
 
@@ -213,36 +181,35 @@ static inline int cifs_open_inode_helper(struct inode *inode, struct file *file,
 	/* if not oplocked, invalidate inode pages if mtime or file
 	   size changed */
 	temp = cifs_NTtimeToUnix(buf->LastWriteTime);
-	if (timespec_equal(&file->f_path.dentry->d_inode->i_mtime, &temp) &&
-			   (file->f_path.dentry->d_inode->i_size ==
+	if (timespec_equal(&inode->i_mtime, &temp) &&
+			   (inode->i_size ==
 			    (loff_t)le64_to_cpu(buf->EndOfFile))) {
-		cFYI(1, ("inode unchanged on server"));
+		cFYI(1, "inode unchanged on server");
 	} else {
-		if (file->f_path.dentry->d_inode->i_mapping) {
-		/* BB no need to lock inode until after invalidate
-		   since namei code should already have it locked? */
-			rc = filemap_write_and_wait(file->f_path.dentry->d_inode->i_mapping);
+		if (inode->i_mapping) {
+			/* BB no need to lock inode until after invalidate
+			since namei code should already have it locked? */
+			rc = filemap_write_and_wait(inode->i_mapping);
 			if (rc != 0)
-				CIFS_I(file->f_path.dentry->d_inode)->write_behind_rc = rc;
+				pCifsInode->write_behind_rc = rc;
 		}
-		cFYI(1, ("invalidating remote inode since open detected it "
-			 "changed"));
-		invalidate_remote_inode(file->f_path.dentry->d_inode);
+		cFYI(1, "invalidating remote inode since open detected it "
+			 "changed");
+		invalidate_remote_inode(inode);
 	}
 
 client_can_cache:
 	if (pTcon->unix_ext)
-		rc = cifs_get_inode_info_unix(&file->f_path.dentry->d_inode,
-			full_path, inode->i_sb, xid);
+		rc = cifs_get_inode_info_unix(&inode, full_path, inode->i_sb,
+					      xid);
 	else
-		rc = cifs_get_inode_info(&file->f_path.dentry->d_inode,
-			full_path, buf, inode->i_sb, xid, NULL);
+		rc = cifs_get_inode_info(&inode, full_path, buf, inode->i_sb,
+					 xid, NULL);
 
 	if ((*oplock & 0xF) == OPLOCK_EXCLUSIVE) {
 		pCifsInode->clientCanCacheAll = true;
 		pCifsInode->clientCanCacheRead = true;
-		cFYI(1, ("Exclusive Oplock granted on inode %p",
-			 file->f_path.dentry->d_inode));
+		cFYI(1, "Exclusive Oplock granted on inode %p", inode);
 	} else if ((*oplock & 0xF) == OPLOCK_READ)
 		pCifsInode->clientCanCacheRead = true;
 
@@ -256,7 +223,7 @@ int cifs_open(struct inode *inode, struct file *file)
 	__u32 oplock;
 	struct cifs_sb_info *cifs_sb;
 	struct cifsTconInfo *tcon;
-	struct cifsFileInfo *pCifsFile;
+	struct cifsFileInfo *pCifsFile = NULL;
 	struct cifsInodeInfo *pCifsInode;
 	char *full_path = NULL;
 	int desiredAccess;
@@ -270,12 +237,6 @@ int cifs_open(struct inode *inode, struct file *file)
 	tcon = cifs_sb->tcon;
 
 	pCifsInode = CIFS_I(file->f_path.dentry->d_inode);
-	pCifsFile = cifs_fill_filedata(file);
-	if (pCifsFile) {
-		rc = 0;
-		FreeXid(xid);
-		return rc;
-	}
 
 	full_path = build_path_from_dentry(file->f_path.dentry);
 	if (full_path == NULL) {
@@ -284,8 +245,8 @@ int cifs_open(struct inode *inode, struct file *file)
 		return rc;
 	}
 
-	cFYI(1, ("inode = 0x%p file flags are 0x%x for %s",
-		 inode, file->f_flags, full_path));
+	cFYI(1, "inode = 0x%p file flags are 0x%x for %s",
+		 inode, file->f_flags, full_path);
 
 	if (oplockEnabled)
 		oplock = REQ_OPLOCK;
@@ -297,27 +258,39 @@ int cifs_open(struct inode *inode, struct file *file)
 	    (CIFS_UNIX_POSIX_PATH_OPS_CAP &
 			le64_to_cpu(tcon->fsUnixInfo.Capability))) {
 		int oflags = (int) cifs_posix_convert_flags(file->f_flags);
+		oflags |= SMB_O_CREAT;
 		/* can not refresh inode info since size could be stale */
-		rc = cifs_posix_open(full_path, &inode, file->f_path.mnt,
-				     cifs_sb->mnt_file_mode /* ignored */,
-				     oflags, &oplock, &netfid, xid);
+		rc = cifs_posix_open(full_path, &inode, inode->i_sb,
+				cifs_sb->mnt_file_mode /* ignored */,
+				oflags, &oplock, &netfid, xid);
 		if (rc == 0) {
-			cFYI(1, ("posix open succeeded"));
+			cFYI(1, "posix open succeeded");
 			/* no need for special case handling of setting mode
 			   on read only files needed here */
 
-			pCifsFile = cifs_fill_filedata(file);
-			cifs_posix_open_inode_helper(inode, file, pCifsInode,
-						     pCifsFile, oplock, netfid);
+			rc = cifs_posix_open_inode_helper(inode, file,
+					pCifsInode, oplock, netfid);
+			if (rc != 0) {
+				CIFSSMBClose(xid, tcon, netfid);
+				goto out;
+			}
+
+			pCifsFile = cifs_new_fileinfo(inode, netfid, file,
+							file->f_path.mnt,
+							oflags);
+			if (pCifsFile == NULL) {
+				CIFSSMBClose(xid, tcon, netfid);
+				rc = -ENOMEM;
+			}
 			goto out;
 		} else if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
 			if (tcon->ses->serverNOS)
-				cERROR(1, ("server %s of type %s returned"
+				cERROR(1, "server %s of type %s returned"
 					   " unexpected error on SMB posix open"
 					   ", disabling posix open support."
 					   " Check if server update available.",
 					   tcon->ses->serverName,
-					   tcon->ses->serverNOS));
+					   tcon->ses->serverNOS);
 			tcon->broken_posix_open = true;
 		} else if ((rc != -EIO) && (rc != -EREMOTE) &&
 			 (rc != -EOPNOTSUPP)) /* path not found or net err */
@@ -385,21 +358,21 @@ int cifs_open(struct inode *inode, struct file *file)
 				& CIFS_MOUNT_MAP_SPECIAL_CHR);
 	}
 	if (rc) {
-		cFYI(1, ("cifs_open returned 0x%x", rc));
+		cFYI(1, "cifs_open returned 0x%x", rc);
 		goto out;
 	}
 
+	rc = cifs_open_inode_helper(inode, tcon, &oplock, buf, full_path, xid);
+	if (rc != 0)
+		goto out;
+
 	pCifsFile = cifs_new_fileinfo(inode, netfid, file, file->f_path.mnt,
 					file->f_flags);
-	file->private_data = pCifsFile;
-	if (file->private_data == NULL) {
+	if (pCifsFile == NULL) {
 		rc = -ENOMEM;
 		goto out;
 	}
 
-	rc = cifs_open_inode_helper(inode, file, pCifsInode, pCifsFile, tcon,
-				    &oplock, buf, full_path, xid);
-
 	if (oplock & CIFS_CREATE_ACTION) {
 		/* time to set mode which we can not set earlier due to
 		   problems creating new read-only files */
@@ -468,7 +441,7 @@ static int cifs_reopen_file(struct file *file, bool can_flush)
 	}
 
 	if (file->f_path.dentry == NULL) {
-		cERROR(1, ("no valid name if dentry freed"));
+		cERROR(1, "no valid name if dentry freed");
 		dump_stack();
 		rc = -EBADF;
 		goto reopen_error_exit;
@@ -476,7 +449,7 @@ static int cifs_reopen_file(struct file *file, bool can_flush)
 
 	inode = file->f_path.dentry->d_inode;
 	if (inode == NULL) {
-		cERROR(1, ("inode not valid"));
+		cERROR(1, "inode not valid");
 		dump_stack();
 		rc = -EBADF;
 		goto reopen_error_exit;
@@ -498,8 +471,8 @@ reopen_error_exit:
 		return rc;
 	}
 
-	cFYI(1, ("inode = 0x%p file flags 0x%x for %s",
-		 inode, file->f_flags, full_path));
+	cFYI(1, "inode = 0x%p file flags 0x%x for %s",
+		 inode, file->f_flags, full_path);
 
 	if (oplockEnabled)
 		oplock = REQ_OPLOCK;
@@ -511,11 +484,11 @@ reopen_error_exit:
 			le64_to_cpu(tcon->fsUnixInfo.Capability))) {
 		int oflags = (int) cifs_posix_convert_flags(file->f_flags);
 		/* can not refresh inode info since size could be stale */
-		rc = cifs_posix_open(full_path, NULL, file->f_path.mnt,
-				     cifs_sb->mnt_file_mode /* ignored */,
-				     oflags, &oplock, &netfid, xid);
+		rc = cifs_posix_open(full_path, NULL, inode->i_sb,
+				cifs_sb->mnt_file_mode /* ignored */,
+				oflags, &oplock, &netfid, xid);
 		if (rc == 0) {
-			cFYI(1, ("posix reopen succeeded"));
+			cFYI(1, "posix reopen succeeded");
 			goto reopen_success;
 		}
 		/* fallthrough to retry open the old way on errors, especially
@@ -536,8 +509,8 @@ reopen_error_exit:
 				CIFS_MOUNT_MAP_SPECIAL_CHR);
 	if (rc) {
 		mutex_unlock(&pCifsFile->fh_mutex);
-		cFYI(1, ("cifs_open returned 0x%x", rc));
-		cFYI(1, ("oplock: %d", oplock));
+		cFYI(1, "cifs_open returned 0x%x", rc);
+		cFYI(1, "oplock: %d", oplock);
 	} else {
 reopen_success:
 		pCifsFile->netfid = netfid;
@@ -569,8 +542,8 @@ reopen_success:
 			if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
 				pCifsInode->clientCanCacheAll = true;
 				pCifsInode->clientCanCacheRead = true;
-				cFYI(1, ("Exclusive Oplock granted on inode %p",
-					 file->f_path.dentry->d_inode));
+				cFYI(1, "Exclusive Oplock granted on inode %p",
+					 file->f_path.dentry->d_inode);
 			} else if ((oplock & 0xF) == OPLOCK_READ) {
 				pCifsInode->clientCanCacheRead = true;
 				pCifsInode->clientCanCacheAll = false;
@@ -618,8 +591,7 @@ int cifs_close(struct inode *inode, struct file *file)
 					the struct would be in each open file,
 					but this should give enough time to
 					clear the socket */
-					cFYI(DBG2,
-						("close delay, write pending"));
+					cFYI(DBG2, "close delay, write pending");
 					msleep(timeout);
 					timeout *= 4;
 				}
@@ -652,7 +624,7 @@ int cifs_close(struct inode *inode, struct file *file)
 
 	read_lock(&GlobalSMBSeslock);
 	if (list_empty(&(CIFS_I(inode)->openFileList))) {
-		cFYI(1, ("closing last open instance for inode %p", inode));
+		cFYI(1, "closing last open instance for inode %p", inode);
 		/* if the file is not open we do not know if we can cache info
 		   on this inode, much less write behind and read ahead */
 		CIFS_I(inode)->clientCanCacheRead = false;
@@ -673,7 +645,7 @@ int cifs_closedir(struct inode *inode, struct file *file)
 	    (struct cifsFileInfo *)file->private_data;
 	char *ptmp;
 
-	cFYI(1, ("Closedir inode = 0x%p", inode));
+	cFYI(1, "Closedir inode = 0x%p", inode);
 
 	xid = GetXid();
 
@@ -684,22 +656,22 @@ int cifs_closedir(struct inode *inode, struct file *file)
 
 		pTcon = cifs_sb->tcon;
 
-		cFYI(1, ("Freeing private data in close dir"));
+		cFYI(1, "Freeing private data in close dir");
 		write_lock(&GlobalSMBSeslock);
 		if (!pCFileStruct->srch_inf.endOfSearch &&
 		    !pCFileStruct->invalidHandle) {
 			pCFileStruct->invalidHandle = true;
 			write_unlock(&GlobalSMBSeslock);
 			rc = CIFSFindClose(xid, pTcon, pCFileStruct->netfid);
-			cFYI(1, ("Closing uncompleted readdir with rc %d",
-				 rc));
+			cFYI(1, "Closing uncompleted readdir with rc %d",
+				 rc);
 			/* not much we can do if it fails anyway, ignore rc */
 			rc = 0;
 		} else
 			write_unlock(&GlobalSMBSeslock);
 		ptmp = pCFileStruct->srch_inf.ntwrk_buf_start;
 		if (ptmp) {
-			cFYI(1, ("closedir free smb buf in srch struct"));
+			cFYI(1, "closedir free smb buf in srch struct");
 			pCFileStruct->srch_inf.ntwrk_buf_start = NULL;
 			if (pCFileStruct->srch_inf.smallBuf)
 				cifs_small_buf_release(ptmp);
@@ -747,49 +719,49 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
 	rc = -EACCES;
 	xid = GetXid();
 
-	cFYI(1, ("Lock parm: 0x%x flockflags: "
+	cFYI(1, "Lock parm: 0x%x flockflags: "
 		 "0x%x flocktype: 0x%x start: %lld end: %lld",
 		cmd, pfLock->fl_flags, pfLock->fl_type, pfLock->fl_start,
-		pfLock->fl_end));
+		pfLock->fl_end);
 
 	if (pfLock->fl_flags & FL_POSIX)
-		cFYI(1, ("Posix"));
+		cFYI(1, "Posix");
 	if (pfLock->fl_flags & FL_FLOCK)
-		cFYI(1, ("Flock"));
+		cFYI(1, "Flock");
 	if (pfLock->fl_flags & FL_SLEEP) {
-		cFYI(1, ("Blocking lock"));
+		cFYI(1, "Blocking lock");
 		wait_flag = true;
 	}
 	if (pfLock->fl_flags & FL_ACCESS)
-		cFYI(1, ("Process suspended by mandatory locking - "
-			 "not implemented yet"));
+		cFYI(1, "Process suspended by mandatory locking - "
+			 "not implemented yet");
 	if (pfLock->fl_flags & FL_LEASE)
-		cFYI(1, ("Lease on file - not implemented yet"));
+		cFYI(1, "Lease on file - not implemented yet");
 	if (pfLock->fl_flags &
 	    (~(FL_POSIX | FL_FLOCK | FL_SLEEP | FL_ACCESS | FL_LEASE)))
-		cFYI(1, ("Unknown lock flags 0x%x", pfLock->fl_flags));
+		cFYI(1, "Unknown lock flags 0x%x", pfLock->fl_flags);
 
 	if (pfLock->fl_type == F_WRLCK) {
-		cFYI(1, ("F_WRLCK "));
+		cFYI(1, "F_WRLCK ");
 		numLock = 1;
 	} else if (pfLock->fl_type == F_UNLCK) {
-		cFYI(1, ("F_UNLCK"));
+		cFYI(1, "F_UNLCK");
 		numUnlock = 1;
 		/* Check if unlock includes more than
 		one lock range */
 	} else if (pfLock->fl_type == F_RDLCK) {
-		cFYI(1, ("F_RDLCK"));
+		cFYI(1, "F_RDLCK");
 		lockType |= LOCKING_ANDX_SHARED_LOCK;
 		numLock = 1;
 	} else if (pfLock->fl_type == F_EXLCK) {
-		cFYI(1, ("F_EXLCK"));
+		cFYI(1, "F_EXLCK");
 		numLock = 1;
 	} else if (pfLock->fl_type == F_SHLCK) {
-		cFYI(1, ("F_SHLCK"));
+		cFYI(1, "F_SHLCK");
 		lockType |= LOCKING_ANDX_SHARED_LOCK;
 		numLock = 1;
 	} else
-		cFYI(1, ("Unknown type of lock"));
+		cFYI(1, "Unknown type of lock");
 
 	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
 	tcon = cifs_sb->tcon;
@@ -832,14 +804,38 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
 					 0 /* wait flag */ );
 			pfLock->fl_type = F_UNLCK;
 			if (rc != 0)
-				cERROR(1, ("Error unlocking previously locked "
-					   "range %d during test of lock", rc));
+				cERROR(1, "Error unlocking previously locked "
+					   "range %d during test of lock", rc);
 			rc = 0;
 
 		} else {
 			/* if rc == ERR_SHARING_VIOLATION ? */
-			rc = 0;	/* do not change lock type to unlock
-				   since range in use */
+			rc = 0;
+
+			if (lockType & LOCKING_ANDX_SHARED_LOCK) {
+				pfLock->fl_type = F_WRLCK;
+			} else {
+				rc = CIFSSMBLock(xid, tcon, netfid, length,
+					pfLock->fl_start, 0, 1,
+					lockType | LOCKING_ANDX_SHARED_LOCK,
+					0 /* wait flag */);
+				if (rc == 0) {
+					rc = CIFSSMBLock(xid, tcon, netfid,
+						length, pfLock->fl_start, 1, 0,
+						lockType |
+						LOCKING_ANDX_SHARED_LOCK,
+						0 /* wait flag */);
+					pfLock->fl_type = F_RDLCK;
+					if (rc != 0)
+						cERROR(1, "Error unlocking "
+						"previously locked range %d "
+						"during test of lock", rc);
+					rc = 0;
+				} else {
+					pfLock->fl_type = F_WRLCK;
+					rc = 0;
+				}
+			}
 		}
 
 		FreeXid(xid);
@@ -898,9 +894,10 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
 							1, 0, li->type, false);
 					if (stored_rc)
 						rc = stored_rc;
-
-					list_del(&li->llist);
-					kfree(li);
+					else {
+						list_del(&li->llist);
+						kfree(li);
+					}
 				}
 			}
 			mutex_unlock(&fid->lock_mutex);
@@ -963,9 +960,8 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
 
 	pTcon = cifs_sb->tcon;
 
-	/* cFYI(1,
-	   (" write %d bytes to offset %lld of %s", write_size,
-	   *poffset, file->f_path.dentry->d_name.name)); */
+	/* cFYI(1, " write %d bytes to offset %lld of %s", write_size,
+	   *poffset, file->f_path.dentry->d_name.name); */
 
 	if (file->private_data == NULL)
 		return -EBADF;
@@ -1066,8 +1062,8 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
 
 	pTcon = cifs_sb->tcon;
 
-	cFYI(1, ("write %zd bytes to offset %lld of %s", write_size,
-	   *poffset, file->f_path.dentry->d_name.name));
+	cFYI(1, "write %zd bytes to offset %lld of %s", write_size,
+	   *poffset, file->f_path.dentry->d_name.name);
 
 	if (file->private_data == NULL)
 		return -EBADF;
@@ -1208,7 +1204,7 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode)
 	it being zero) during stress testcases so we need to check for it */
 
 	if (cifs_inode == NULL) {
-		cERROR(1, ("Null inode passed to cifs_writeable_file"));
+		cERROR(1, "Null inode passed to cifs_writeable_file");
 		dump_stack();
 		return NULL;
 	}
@@ -1252,7 +1248,7 @@ refind_writable:
 			again. Note that it would be bad
 			to hold up writepages here (rather than
 			in caller) with continuous retries */
-			cFYI(1, ("wp failed on reopen file"));
+			cFYI(1, "wp failed on reopen file");
 			read_lock(&GlobalSMBSeslock);
 			/* can not use this handle, no write
 			   pending on this one after all */
@@ -1328,7 +1324,7 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
 		else if (bytes_written < 0)
 			rc = bytes_written;
 	} else {
-		cFYI(1, ("No writeable filehandles for inode"));
+		cFYI(1, "No writeable filehandles for inode");
 		rc = -EIO;
 	}
 
@@ -1500,7 +1496,7 @@ retry:
 			 */
 			open_file = find_writable_file(CIFS_I(mapping->host));
 			if (!open_file) {
-				cERROR(1, ("No writable handles for inode"));
+				cERROR(1, "No writable handles for inode");
 				rc = -EBADF;
 			} else {
 				long_op = cifs_write_timeout(cifsi, offset);
@@ -1513,8 +1509,8 @@ retry:
 				cifs_update_eof(cifsi, offset, bytes_written);
 
 				if (rc || bytes_written < bytes_to_write) {
-					cERROR(1, ("Write2 ret %d, wrote %d",
-						  rc, bytes_written));
+					cERROR(1, "Write2 ret %d, wrote %d",
+						  rc, bytes_written);
 					/* BB what if continued retry is
 					   requested via mount flags? */
 					if (rc == -ENOSPC)
@@ -1575,7 +1571,7 @@ static int cifs_writepage(struct page *page, struct writeback_control *wbc)
 /* BB add check for wbc flags */
 	page_cache_get(page);
 	if (!PageUptodate(page))
-		cFYI(1, ("ppw - page not up to date"));
+		cFYI(1, "ppw - page not up to date");
 
 	/*
 	 * Set the "writeback" flag, and clear "dirty" in the radix tree.
@@ -1604,8 +1600,8 @@ static int cifs_write_end(struct file *file, struct address_space *mapping,
 	int rc;
 	struct inode *inode = mapping->host;
 
-	cFYI(1, ("write_end for page %p from pos %lld with %d bytes",
-		 page, pos, copied));
+	cFYI(1, "write_end for page %p from pos %lld with %d bytes",
+		 page, pos, copied);
 
 	if (PageChecked(page)) {
 		if (copied == len)
@@ -1650,7 +1646,7 @@ static int cifs_write_end(struct file *file, struct address_space *mapping,
 	return rc;
 }
 
-int cifs_fsync(struct file *file, struct dentry *dentry, int datasync)
+int cifs_fsync(struct file *file, int datasync)
 {
 	int xid;
 	int rc = 0;
@@ -1661,8 +1657,8 @@ int cifs_fsync(struct file *file, struct dentry *dentry, int datasync)
 
 	xid = GetXid();
 
-	cFYI(1, ("Sync file - name: %s datasync: 0x%x",
-		dentry->d_name.name, datasync));
+	cFYI(1, "Sync file - name: %s datasync: 0x%x",
+		file->f_path.dentry->d_name.name, datasync);
 
 	rc = filemap_write_and_wait(inode->i_mapping);
 	if (rc == 0) {
@@ -1686,7 +1682,7 @@ int cifs_fsync(struct file *file, struct dentry *dentry, int datasync)
 	unsigned int rpages = 0;
 	int rc = 0;
 
-	cFYI(1, ("sync page %p",page));
+	cFYI(1, "sync page %p", page);
 	mapping = page->mapping;
 	if (!mapping)
 		return 0;
@@ -1697,7 +1693,7 @@ int cifs_fsync(struct file *file, struct dentry *dentry, int datasync)
 /*	fill in rpages then
 	result = cifs_pagein_inode(inode, index, rpages); */ /* BB finish */
 
-/*	cFYI(1, ("rpages is %d for sync page of Index %ld", rpages, index));
+/*	cFYI(1, "rpages is %d for sync page of Index %ld", rpages, index);
 
 #if 0
 	if (rc < 0)
@@ -1731,7 +1727,7 @@ int cifs_flush(struct file *file, fl_owner_t id)
 		CIFS_I(inode)->write_behind_rc = 0;
 	}
 
-	cFYI(1, ("Flush inode %p file %p rc %d", inode, file, rc));
+	cFYI(1, "Flush inode %p file %p rc %d", inode, file, rc);
 
 	return rc;
 }
@@ -1763,7 +1759,7 @@ ssize_t cifs_user_read(struct file *file, char __user *read_data,
 	open_file = (struct cifsFileInfo *)file->private_data;
 
 	if ((file->f_flags & O_ACCMODE) == O_WRONLY)
-		cFYI(1, ("attempting read on write only file instance"));
+		cFYI(1, "attempting read on write only file instance");
 
 	for (total_read = 0, current_offset = read_data;
 	     read_size > total_read;
@@ -1844,7 +1840,7 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
 	open_file = (struct cifsFileInfo *)file->private_data;
 
 	if ((file->f_flags & O_ACCMODE) == O_WRONLY)
-		cFYI(1, ("attempting read on write only file instance"));
+		cFYI(1, "attempting read on write only file instance");
 
 	for (total_read = 0, current_offset = read_data;
 	     read_size > total_read;
@@ -1890,13 +1886,12 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
 
 int cifs_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
-	struct dentry *dentry = file->f_path.dentry;
 	int rc, xid;
 
 	xid = GetXid();
-	rc = cifs_revalidate(dentry);
+	rc = cifs_revalidate_file(file);
 	if (rc) {
-		cFYI(1, ("Validation prior to mmap failed, error=%d", rc));
+		cFYI(1, "Validation prior to mmap failed, error=%d", rc);
 		FreeXid(xid);
 		return rc;
 	}
@@ -1907,8 +1902,7 @@ int cifs_file_mmap(struct file *file, struct vm_area_struct *vma)
 
 
 static void cifs_copy_cache_pages(struct address_space *mapping,
-	struct list_head *pages, int bytes_read, char *data,
-	struct pagevec *plru_pvec)
+	struct list_head *pages, int bytes_read, char *data)
 {
 	struct page *page;
 	char *target;
@@ -1920,14 +1914,15 @@ static void cifs_copy_cache_pages(struct address_space *mapping,
 		page = list_entry(pages->prev, struct page, lru);
 		list_del(&page->lru);
 
-		if (add_to_page_cache(page, mapping, page->index,
+		if (add_to_page_cache_lru(page, mapping, page->index,
 				      GFP_KERNEL)) {
 			page_cache_release(page);
-			cFYI(1, ("Add page cache failed"));
+			cFYI(1, "Add page cache failed");
 			data += PAGE_CACHE_SIZE;
 			bytes_read -= PAGE_CACHE_SIZE;
 			continue;
 		}
+		page_cache_release(page);
 
 		target = kmap_atomic(page, KM_USER0);
 
@@ -1946,8 +1941,6 @@ static void cifs_copy_cache_pages(struct address_space *mapping,
 		flush_dcache_page(page);
 		SetPageUptodate(page);
 		unlock_page(page);
-		if (!pagevec_add(plru_pvec, page))
-			__pagevec_lru_add_file(plru_pvec);
 		data += PAGE_CACHE_SIZE;
 	}
 	return;
@@ -1966,7 +1959,6 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
 	unsigned int read_size, i;
 	char *smb_read_data = NULL;
 	struct smb_com_read_rsp *pSMBr;
-	struct pagevec lru_pvec;
 	struct cifsFileInfo *open_file;
 	int buf_type = CIFS_NO_BUFFER;
 
@@ -1980,8 +1972,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
 	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
 	pTcon = cifs_sb->tcon;
 
-	pagevec_init(&lru_pvec, 0);
-	cFYI(DBG2, ("rpages: num pages %d", num_pages));
+	cFYI(DBG2, "rpages: num pages %d", num_pages);
 	for (i = 0; i < num_pages; ) {
 		unsigned contig_pages;
 		struct page *tmp_page;
@@ -2014,8 +2005,8 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
 		/* Read size needs to be in multiples of one page */
 		read_size = min_t(const unsigned int, read_size,
 				  cifs_sb->rsize & PAGE_CACHE_MASK);
-		cFYI(DBG2, ("rpages: read size 0x%x  contiguous pages %d",
-				read_size, contig_pages));
+		cFYI(DBG2, "rpages: read size 0x%x  contiguous pages %d",
+				read_size, contig_pages);
 		rc = -EAGAIN;
 		while (rc == -EAGAIN) {
 			if ((open_file->invalidHandle) &&
@@ -2042,14 +2033,14 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
 			}
 		}
 		if ((rc < 0) || (smb_read_data == NULL)) {
-			cFYI(1, ("Read error in readpages: %d", rc));
+			cFYI(1, "Read error in readpages: %d", rc);
 			break;
 		} else if (bytes_read > 0) {
 			task_io_account_read(bytes_read);
 			pSMBr = (struct smb_com_read_rsp *)smb_read_data;
 			cifs_copy_cache_pages(mapping, page_list, bytes_read,
 				smb_read_data + 4 /* RFC1001 hdr */ +
-				le16_to_cpu(pSMBr->DataOffset), &lru_pvec);
+				le16_to_cpu(pSMBr->DataOffset));
 
 			i +=  bytes_read >> PAGE_CACHE_SHIFT;
 			cifs_stats_bytes_read(pTcon, bytes_read);
@@ -2065,9 +2056,9 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
 				/* break; */
 			}
 		} else {
-			cFYI(1, ("No bytes read (%d) at offset %lld . "
-				 "Cleaning remaining pages from readahead list",
-				 bytes_read, offset));
+			cFYI(1, "No bytes read (%d) at offset %lld . "
+				"Cleaning remaining pages from readahead list",
+				bytes_read, offset);
 			/* BB turn off caching and do new lookup on
 			   file size at server? */
 			break;
@@ -2082,8 +2073,6 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
 		bytes_read = 0;
 	}
 
-	pagevec_lru_add_file(&lru_pvec);
-
 /* need to free smb_read_data buf before exit */
 	if (smb_read_data) {
 		if (buf_type == CIFS_SMALL_BUFFER)
@@ -2112,7 +2101,7 @@ static int cifs_readpage_worker(struct file *file, struct page *page,
 	if (rc < 0)
 		goto io_error;
 	else
-		cFYI(1, ("Bytes read %d", rc));
+		cFYI(1, "Bytes read %d", rc);
 
 	file->f_path.dentry->d_inode->i_atime =
 		current_fs_time(file->f_path.dentry->d_inode->i_sb);
@@ -2144,8 +2133,8 @@ static int cifs_readpage(struct file *file, struct page *page)
 		return rc;
 	}
 
-	cFYI(1, ("readpage %p at offset %d 0x%x\n",
-		 page, (int)offset, (int)offset));
+	cFYI(1, "readpage %p at offset %d 0x%x\n",
+		 page, (int)offset, (int)offset);
 
 	rc = cifs_readpage_worker(file, page, &offset);
 
@@ -2215,7 +2204,7 @@ static int cifs_write_begin(struct file *file, struct address_space *mapping,
 	struct page *page;
 	int rc = 0;
 
-	cFYI(1, ("write_begin from %lld len %d", (long long)pos, len));
+	cFYI(1, "write_begin from %lld len %d", (long long)pos, len);
 
 	page = grab_cache_page_write_begin(mapping, index, flags);
 	if (!page) {
@@ -2287,12 +2276,10 @@ cifs_oplock_break(struct slow_work *work)
 	int rc, waitrc = 0;
 
 	if (inode && S_ISREG(inode->i_mode)) {
-#ifdef CONFIG_CIFS_EXPERIMENTAL
-		if (cinode->clientCanCacheAll == 0)
-			break_lease(inode, FMODE_READ);
-		else if (cinode->clientCanCacheRead == 0)
-			break_lease(inode, FMODE_WRITE);
-#endif
+		if (cinode->clientCanCacheRead)
+			break_lease(inode, O_RDONLY);
+		else
+			break_lease(inode, O_WRONLY);
 		rc = filemap_fdatawrite(inode->i_mapping);
 		if (cinode->clientCanCacheRead == 0) {
 			waitrc = filemap_fdatawait(inode->i_mapping);
@@ -2302,7 +2289,7 @@ cifs_oplock_break(struct slow_work *work)
 			rc = waitrc;
 		if (rc)
 			cinode->write_behind_rc = rc;
-		cFYI(1, ("Oplock flush inode %p rc %d", inode, rc));
+		cFYI(1, "Oplock flush inode %p rc %d", inode, rc);
 	}
 
 	/*
@@ -2314,7 +2301,7 @@ cifs_oplock_break(struct slow_work *work)
 	if (!cfile->closePend && !cfile->oplock_break_cancelled) {
 		rc = CIFSSMBLock(0, cifs_sb->tcon, cfile->netfid, 0, 0, 0, 0,
 				 LOCKING_ANDX_OPLOCK_RELEASE, false);
-		cFYI(1, ("Oplock release rc = %d", rc));
+		cFYI(1, "Oplock release rc = %d", rc);
 	}
 }
 
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index cf18ee765590..6f0683c68952 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1,7 +1,7 @@
 /*
  *   fs/cifs/inode.c
  *
- *   Copyright (C) International Business Machines  Corp., 2002,2008
+ *   Copyright (C) International Business Machines  Corp., 2002,2010
  *   Author(s): Steve French (sfrench@us.ibm.com)
  *
  *   This library is free software; you can redistribute it and/or modify
@@ -20,6 +20,7 @@
  */
 #include <linux/fs.h>
 #include <linux/stat.h>
+#include <linux/slab.h>
 #include <linux/pagemap.h>
 #include <asm/div64.h>
 #include "cifsfs.h"
@@ -77,6 +78,41 @@ static void cifs_set_ops(struct inode *inode, const bool is_dfs_referral)
 	}
 }
 
+/* check inode attributes against fattr. If they don't match, tag the
+ * inode for cache invalidation
+ */
+static void
+cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr)
+{
+	struct cifsInodeInfo *cifs_i = CIFS_I(inode);
+
+	cFYI(1, "%s: revalidating inode %llu", __func__, cifs_i->uniqueid);
+
+	if (inode->i_state & I_NEW) {
+		cFYI(1, "%s: inode %llu is new", __func__, cifs_i->uniqueid);
+		return;
+	}
+
+	/* don't bother with revalidation if we have an oplock */
+	if (cifs_i->clientCanCacheRead) {
+		cFYI(1, "%s: inode %llu is oplocked", __func__,
+			 cifs_i->uniqueid);
+		return;
+	}
+
+	 /* revalidate if mtime or size have changed */
+	if (timespec_equal(&inode->i_mtime, &fattr->cf_mtime) &&
+	    cifs_i->server_eof == fattr->cf_eof) {
+		cFYI(1, "%s: inode %llu is unchanged", __func__,
+			 cifs_i->uniqueid);
+		return;
+	}
+
+	cFYI(1, "%s: invalidating inode %llu mapping", __func__,
+		 cifs_i->uniqueid);
+	cifs_i->invalid_mapping = true;
+}
+
 /* populate an inode with info from a cifs_fattr struct */
 void
 cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
@@ -85,6 +121,8 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
 	unsigned long oldtime = cifs_i->time;
 
+	cifs_revalidate_cache(inode, fattr);
+
 	inode->i_atime = fattr->cf_atime;
 	inode->i_mtime = fattr->cf_mtime;
 	inode->i_ctime = fattr->cf_ctime;
@@ -99,18 +137,18 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
 		inode->i_mode = fattr->cf_mode;
 
 	cifs_i->cifsAttrs = fattr->cf_cifsattrs;
-	cifs_i->uniqueid = fattr->cf_uniqueid;
 
 	if (fattr->cf_flags & CIFS_FATTR_NEED_REVAL)
 		cifs_i->time = 0;
 	else
 		cifs_i->time = jiffies;
 
-	cFYI(1, ("inode 0x%p old_time=%ld new_time=%ld", inode,
-		 oldtime, cifs_i->time));
+	cFYI(1, "inode 0x%p old_time=%ld new_time=%ld", inode,
+		 oldtime, cifs_i->time);
 
 	cifs_i->delete_pending = fattr->cf_flags & CIFS_FATTR_DELETE_PENDING;
 
+	cifs_i->server_eof = fattr->cf_eof;
 	/*
 	 * Can't safely change the file size here if the client is writing to
 	 * it due to potential races.
@@ -131,6 +169,17 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
 	cifs_set_ops(inode, fattr->cf_flags & CIFS_FATTR_DFS_REFERRAL);
 }
 
+void
+cifs_fill_uniqueid(struct super_block *sb, struct cifs_fattr *fattr)
+{
+	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
+
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)
+		return;
+
+	fattr->cf_uniqueid = iunique(sb, ROOT_I);
+}
+
 /* Fill a cifs_fattr struct with info from FILE_UNIX_BASIC_INFO. */
 void
 cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, FILE_UNIX_BASIC_INFO *info,
@@ -188,7 +237,7 @@ cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, FILE_UNIX_BASIC_INFO *info,
 		/* safest to call it a file if we do not know */
 		fattr->cf_mode |= S_IFREG;
 		fattr->cf_dtype = DT_REG;
-		cFYI(1, ("unknown type %d", le32_to_cpu(info->Type)));
+		cFYI(1, "unknown type %d", le32_to_cpu(info->Type));
 		break;
 	}
 
@@ -217,7 +266,7 @@ cifs_create_dfs_fattr(struct cifs_fattr *fattr, struct super_block *sb)
 {
 	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
 
-	cFYI(1, ("creating fake fattr for DFS referral"));
+	cFYI(1, "creating fake fattr for DFS referral");
 
 	memset(fattr, 0, sizeof(*fattr));
 	fattr->cf_mode = S_IFDIR | S_IXUGO | S_IRWXU;
@@ -230,6 +279,31 @@ cifs_create_dfs_fattr(struct cifs_fattr *fattr, struct super_block *sb)
 	fattr->cf_flags |= CIFS_FATTR_DFS_REFERRAL;
 }
 
+int cifs_get_file_info_unix(struct file *filp)
+{
+	int rc;
+	int xid;
+	FILE_UNIX_BASIC_INFO find_data;
+	struct cifs_fattr fattr;
+	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	struct cifsTconInfo *tcon = cifs_sb->tcon;
+	struct cifsFileInfo *cfile = (struct cifsFileInfo *) filp->private_data;
+
+	xid = GetXid();
+	rc = CIFSSMBUnixQFileInfo(xid, tcon, cfile->netfid, &find_data);
+	if (!rc) {
+		cifs_unix_basic_to_fattr(&fattr, &find_data, cifs_sb);
+	} else if (rc == -EREMOTE) {
+		cifs_create_dfs_fattr(&fattr, inode->i_sb);
+		rc = 0;
+	}
+
+	cifs_fattr_to_inode(inode, &fattr);
+	FreeXid(xid);
+	return rc;
+}
+
 int cifs_get_inode_info_unix(struct inode **pinode,
 			     const unsigned char *full_path,
 			     struct super_block *sb, int xid)
@@ -241,7 +315,7 @@ int cifs_get_inode_info_unix(struct inode **pinode,
 	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
 
 	tcon = cifs_sb->tcon;
-	cFYI(1, ("Getting info on %s", full_path));
+	cFYI(1, "Getting info on %s", full_path);
 
 	/* could have done a find first instead but this returns more info */
 	rc = CIFSSMBUnixQPathInfo(xid, tcon, full_path, &find_data,
@@ -259,6 +333,7 @@ int cifs_get_inode_info_unix(struct inode **pinode,
 
 	if (*pinode == NULL) {
 		/* get new inode */
+		cifs_fill_uniqueid(sb, &fattr);
 		*pinode = cifs_iget(sb, &fattr);
 		if (!*pinode)
 			rc = -ENOMEM;
@@ -309,7 +384,7 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
 				 &bytes_read, &pbuf, &buf_type);
 		if ((rc == 0) && (bytes_read >= 8)) {
 			if (memcmp("IntxBLK", pbuf, 8) == 0) {
-				cFYI(1, ("Block device"));
+				cFYI(1, "Block device");
 				fattr->cf_mode |= S_IFBLK;
 				fattr->cf_dtype = DT_BLK;
 				if (bytes_read == 24) {
@@ -321,7 +396,7 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
 					fattr->cf_rdev = MKDEV(mjr, mnr);
 				}
 			} else if (memcmp("IntxCHR", pbuf, 8) == 0) {
-				cFYI(1, ("Char device"));
+				cFYI(1, "Char device");
 				fattr->cf_mode |= S_IFCHR;
 				fattr->cf_dtype = DT_CHR;
 				if (bytes_read == 24) {
@@ -333,7 +408,7 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
 					fattr->cf_rdev = MKDEV(mjr, mnr);
 				}
 			} else if (memcmp("IntxLNK", pbuf, 7) == 0) {
-				cFYI(1, ("Symlink"));
+				cFYI(1, "Symlink");
 				fattr->cf_mode |= S_IFLNK;
 				fattr->cf_dtype = DT_LNK;
 			} else {
@@ -366,7 +441,7 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path,
 	char ea_value[4];
 	__u32 mode;
 
-	rc = CIFSSMBQueryEA(xid, cifs_sb->tcon, path, "SETFILEBITS",
+	rc = CIFSSMBQAllEAs(xid, cifs_sb->tcon, path, "SETFILEBITS",
 			    ea_value, 4 /* size of buf */, cifs_sb->local_nls,
 			    cifs_sb->mnt_cifs_flags &
 				CIFS_MOUNT_MAP_SPECIAL_CHR);
@@ -375,10 +450,10 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path,
 	else if (rc > 3) {
 		mode = le32_to_cpu(*((__le32 *)ea_value));
 		fattr->cf_mode &= ~SFBITS_MASK;
-		cFYI(1, ("special bits 0%o org mode 0%o", mode,
-			 fattr->cf_mode));
+		cFYI(1, "special bits 0%o org mode 0%o", mode,
+			 fattr->cf_mode);
 		fattr->cf_mode = (mode & SFBITS_MASK) | fattr->cf_mode;
-		cFYI(1, ("special mode bits 0%o", mode));
+		cFYI(1, "special mode bits 0%o", mode);
 	}
 
 	return 0;
@@ -431,6 +506,47 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
 	fattr->cf_gid = cifs_sb->mnt_gid;
 }
 
+int cifs_get_file_info(struct file *filp)
+{
+	int rc;
+	int xid;
+	FILE_ALL_INFO find_data;
+	struct cifs_fattr fattr;
+	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	struct cifsTconInfo *tcon = cifs_sb->tcon;
+	struct cifsFileInfo *cfile = (struct cifsFileInfo *) filp->private_data;
+
+	xid = GetXid();
+	rc = CIFSSMBQFileInfo(xid, tcon, cfile->netfid, &find_data);
+	if (rc == -EOPNOTSUPP || rc == -EINVAL) {
+		/*
+		 * FIXME: legacy server -- fall back to path-based call?
+		 * for now, just skip revalidating and mark inode for
+		 * immediate reval.
+		 */
+		rc = 0;
+		CIFS_I(inode)->time = 0;
+		goto cgfi_exit;
+	} else if (rc == -EREMOTE) {
+		cifs_create_dfs_fattr(&fattr, inode->i_sb);
+		rc = 0;
+	} else if (rc)
+		goto cgfi_exit;
+
+	/*
+	 * don't bother with SFU junk here -- just mark inode as needing
+	 * revalidation.
+	 */
+	cifs_all_info_to_fattr(&fattr, &find_data, cifs_sb, false);
+	fattr.cf_uniqueid = CIFS_I(inode)->uniqueid;
+	fattr.cf_flags |= CIFS_FATTR_NEED_REVAL;
+	cifs_fattr_to_inode(inode, &fattr);
+cgfi_exit:
+	FreeXid(xid);
+	return rc;
+}
+
 int cifs_get_inode_info(struct inode **pinode,
 	const unsigned char *full_path, FILE_ALL_INFO *pfindData,
 	struct super_block *sb, int xid, const __u16 *pfid)
@@ -443,11 +559,11 @@ int cifs_get_inode_info(struct inode **pinode,
 	struct cifs_fattr fattr;
 
 	pTcon = cifs_sb->tcon;
-	cFYI(1, ("Getting info on %s", full_path));
+	cFYI(1, "Getting info on %s", full_path);
 
 	if ((pfindData == NULL) && (*pinode != NULL)) {
 		if (CIFS_I(*pinode)->clientCanCacheRead) {
-			cFYI(1, ("No need to revalidate cached inode sizes"));
+			cFYI(1, "No need to revalidate cached inode sizes");
 			return rc;
 		}
 	}
@@ -513,7 +629,7 @@ int cifs_get_inode_info(struct inode **pinode,
 					cifs_sb->mnt_cifs_flags &
 						CIFS_MOUNT_MAP_SPECIAL_CHR);
 			if (rc1 || !fattr.cf_uniqueid) {
-				cFYI(1, ("GetSrvInodeNum rc %d", rc1));
+				cFYI(1, "GetSrvInodeNum rc %d", rc1);
 				fattr.cf_uniqueid = iunique(sb, ROOT_I);
 				cifs_autodisable_serverino(cifs_sb);
 			}
@@ -529,13 +645,13 @@ int cifs_get_inode_info(struct inode **pinode,
 	    cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) {
 		tmprc = cifs_sfu_type(&fattr, full_path, cifs_sb, xid);
 		if (tmprc)
-			cFYI(1, ("cifs_sfu_type failed: %d", tmprc));
+			cFYI(1, "cifs_sfu_type failed: %d", tmprc);
 	}
 
 #ifdef CONFIG_CIFS_EXPERIMENTAL
 	/* fill in 0777 bits from ACL */
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
-		cFYI(1, ("Getting mode bits from ACL"));
+		cFYI(1, "Getting mode bits from ACL");
 		cifs_acl_to_fattr(cifs_sb, &fattr, *pinode, full_path, pfid);
 	}
 #endif
@@ -610,6 +726,16 @@ cifs_find_inode(struct inode *inode, void *opaque)
 	if (CIFS_I(inode)->uniqueid != fattr->cf_uniqueid)
 		return 0;
 
+	/*
+	 * uh oh -- it's a directory. We can't use it since hardlinked dirs are
+	 * verboten. Disable serverino and return it as if it were found, the
+	 * caller can discard it, generate a uniqueid and retry the find
+	 */
+	if (S_ISDIR(inode->i_mode) && !list_empty(&inode->i_dentry)) {
+		fattr->cf_flags |= CIFS_FATTR_INO_COLLISION;
+		cifs_autodisable_serverino(CIFS_SB(inode->i_sb));
+	}
+
 	return 1;
 }
 
@@ -629,15 +755,22 @@ cifs_iget(struct super_block *sb, struct cifs_fattr *fattr)
 	unsigned long hash;
 	struct inode *inode;
 
-	cFYI(1, ("looking for uniqueid=%llu", fattr->cf_uniqueid));
+retry_iget5_locked:
+	cFYI(1, "looking for uniqueid=%llu", fattr->cf_uniqueid);
 
 	/* hash down to 32-bits on 32-bit arch */
 	hash = cifs_uniqueid_to_ino_t(fattr->cf_uniqueid);
 
 	inode = iget5_locked(sb, hash, cifs_find_inode, cifs_init_inode, fattr);
-
-	/* we have fattrs in hand, update the inode */
 	if (inode) {
+		/* was there a problematic inode number collision? */
+		if (fattr->cf_flags & CIFS_FATTR_INO_COLLISION) {
+			iput(inode);
+			fattr->cf_uniqueid = iunique(sb, ROOT_I);
+			fattr->cf_flags &= ~CIFS_FATTR_INO_COLLISION;
+			goto retry_iget5_locked;
+		}
+
 		cifs_fattr_to_inode(inode, fattr);
 		if (sb->s_flags & MS_NOATIME)
 			inode->i_flags |= S_NOATIME | S_NOCMTIME;
@@ -675,7 +808,7 @@ struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino)
 		return ERR_PTR(-ENOMEM);
 
 	if (rc && cifs_sb->tcon->ipc) {
-		cFYI(1, ("ipc connection - fake read inode"));
+		cFYI(1, "ipc connection - fake read inode");
 		inode->i_mode |= S_IFDIR;
 		inode->i_nlink = 2;
 		inode->i_op = &cifs_ipc_inode_ops;
@@ -737,7 +870,7 @@ cifs_set_file_info(struct inode *inode, struct iattr *attrs, int xid,
 	 * server times.
 	 */
 	if (set_time && (attrs->ia_valid & ATTR_CTIME)) {
-		cFYI(1, ("CIFS - CTIME changed"));
+		cFYI(1, "CIFS - CTIME changed");
 		info_buf.ChangeTime =
 		    cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_ctime));
 	} else
@@ -772,8 +905,8 @@ cifs_set_file_info(struct inode *inode, struct iattr *attrs, int xid,
 			goto out;
 	}
 
-	cFYI(1, ("calling SetFileInfo since SetPathInfo for "
-		 "times not supported by this server"));
+	cFYI(1, "calling SetFileInfo since SetPathInfo for "
+		 "times not supported by this server");
 	rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_OPEN,
 			 SYNCHRONIZE | FILE_WRITE_ATTRIBUTES,
 			 CREATE_NOT_DIR, &netfid, &oplock,
@@ -931,7 +1064,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry)
 	struct iattr *attrs = NULL;
 	__u32 dosattr = 0, origattr = 0;
 
-	cFYI(1, ("cifs_unlink, dir=0x%p, dentry=0x%p", dir, dentry));
+	cFYI(1, "cifs_unlink, dir=0x%p, dentry=0x%p", dir, dentry);
 
 	xid = GetXid();
 
@@ -950,7 +1083,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry)
 		rc = CIFSPOSIXDelFile(xid, tcon, full_path,
 			SMB_POSIX_UNLINK_FILE_TARGET, cifs_sb->local_nls,
 			cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
-		cFYI(1, ("posix del rc %d", rc));
+		cFYI(1, "posix del rc %d", rc);
 		if ((rc == 0) || (rc == -ENOENT))
 			goto psx_del_no_retry;
 	}
@@ -1024,7 +1157,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 	struct inode *newinode = NULL;
 	struct cifs_fattr fattr;
 
-	cFYI(1, ("In cifs_mkdir, mode = 0x%x inode = 0x%p", mode, inode));
+	cFYI(1, "In cifs_mkdir, mode = 0x%x inode = 0x%p", mode, inode);
 
 	xid = GetXid();
 
@@ -1059,7 +1192,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 			kfree(pInfo);
 			goto mkdir_retry_old;
 		} else if (rc) {
-			cFYI(1, ("posix mkdir returned 0x%x", rc));
+			cFYI(1, "posix mkdir returned 0x%x", rc);
 			d_drop(direntry);
 		} else {
 			if (pInfo->Type == cpu_to_le32(-1)) {
@@ -1076,6 +1209,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 				direntry->d_op = &cifs_dentry_ops;
 
 			cifs_unix_basic_to_fattr(&fattr, pInfo, cifs_sb);
+			cifs_fill_uniqueid(inode->i_sb, &fattr);
 			newinode = cifs_iget(inode->i_sb, &fattr);
 			if (!newinode) {
 				kfree(pInfo);
@@ -1085,12 +1219,12 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 			d_instantiate(direntry, newinode);
 
 #ifdef CONFIG_CIFS_DEBUG2
-			cFYI(1, ("instantiated dentry %p %s to inode %p",
-				direntry, direntry->d_name.name, newinode));
+			cFYI(1, "instantiated dentry %p %s to inode %p",
+				direntry, direntry->d_name.name, newinode);
 
 			if (newinode->i_nlink != 2)
-				cFYI(1, ("unexpected number of links %d",
-					newinode->i_nlink));
+				cFYI(1, "unexpected number of links %d",
+					newinode->i_nlink);
 #endif
 		}
 		kfree(pInfo);
@@ -1101,7 +1235,7 @@ mkdir_retry_old:
 	rc = CIFSSMBMkDir(xid, pTcon, full_path, cifs_sb->local_nls,
 			  cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
 	if (rc) {
-		cFYI(1, ("cifs_mkdir returned 0x%x", rc));
+		cFYI(1, "cifs_mkdir returned 0x%x", rc);
 		d_drop(direntry);
 	} else {
 mkdir_get_info:
@@ -1204,7 +1338,7 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
 	char *full_path = NULL;
 	struct cifsInodeInfo *cifsInode;
 
-	cFYI(1, ("cifs_rmdir, inode = 0x%p", inode));
+	cFYI(1, "cifs_rmdir, inode = 0x%p", inode);
 
 	xid = GetXid();
 
@@ -1267,6 +1401,10 @@ cifs_do_rename(int xid, struct dentry *from_dentry, const char *fromPath,
 	if (rc == 0 || rc != -ETXTBSY)
 		return rc;
 
+	/* open-file renames don't work across directories */
+	if (to_dentry->d_parent != from_dentry->d_parent)
+		return rc;
+
 	/* open the file to be renamed -- we need DELETE perms */
 	rc = CIFSSMBOpen(xid, pTcon, fromPath, FILE_OPEN, DELETE,
 			 CREATE_NOT_DIR, &srcfid, &oplock, NULL,
@@ -1388,135 +1526,108 @@ cifs_rename_exit:
 	return rc;
 }
 
-int cifs_revalidate(struct dentry *direntry)
+static bool
+cifs_inode_needs_reval(struct inode *inode)
 {
-	int xid;
-	int rc = 0, wbrc = 0;
-	char *full_path;
-	struct cifs_sb_info *cifs_sb;
-	struct cifsInodeInfo *cifsInode;
-	loff_t local_size;
-	struct timespec local_mtime;
-	bool invalidate_inode = false;
+	struct cifsInodeInfo *cifs_i = CIFS_I(inode);
 
-	if (direntry->d_inode == NULL)
-		return -ENOENT;
+	if (cifs_i->clientCanCacheRead)
+		return false;
 
-	cifsInode = CIFS_I(direntry->d_inode);
+	if (!lookupCacheEnabled)
+		return true;
 
-	if (cifsInode == NULL)
-		return -ENOENT;
+	if (cifs_i->time == 0)
+		return true;
 
-	/* no sense revalidating inode info on file that no one can write */
-	if (CIFS_I(direntry->d_inode)->clientCanCacheRead)
-		return rc;
+	/* FIXME: the actimeo should be tunable */
+	if (time_after_eq(jiffies, cifs_i->time + HZ))
+		return true;
+
+	/* hardlinked files w/ noserverino get "special" treatment */
+	if (!(CIFS_SB(inode->i_sb)->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) &&
+	    S_ISREG(inode->i_mode) && inode->i_nlink != 1)
+		return true;
+
+	return false;
+}
+
+/* check invalid_mapping flag and zap the cache if it's set */
+static void
+cifs_invalidate_mapping(struct inode *inode)
+{
+	int rc;
+	struct cifsInodeInfo *cifs_i = CIFS_I(inode);
+
+	cifs_i->invalid_mapping = false;
+
+	/* write back any cached data */
+	if (inode->i_mapping && inode->i_mapping->nrpages != 0) {
+		rc = filemap_write_and_wait(inode->i_mapping);
+		if (rc)
+			cifs_i->write_behind_rc = rc;
+	}
+	invalidate_remote_inode(inode);
+}
+
+int cifs_revalidate_file(struct file *filp)
+{
+	int rc = 0;
+	struct inode *inode = filp->f_path.dentry->d_inode;
+
+	if (!cifs_inode_needs_reval(inode))
+		goto check_inval;
+
+	if (CIFS_SB(inode->i_sb)->tcon->unix_ext)
+		rc = cifs_get_file_info_unix(filp);
+	else
+		rc = cifs_get_file_info(filp);
+
+check_inval:
+	if (CIFS_I(inode)->invalid_mapping)
+		cifs_invalidate_mapping(inode);
+
+	return rc;
+}
+
+/* revalidate a dentry's inode attributes */
+int cifs_revalidate_dentry(struct dentry *dentry)
+{
+	int xid;
+	int rc = 0;
+	char *full_path = NULL;
+	struct inode *inode = dentry->d_inode;
+	struct super_block *sb = dentry->d_sb;
+
+	if (inode == NULL)
+		return -ENOENT;
 
 	xid = GetXid();
 
-	cifs_sb = CIFS_SB(direntry->d_sb);
+	if (!cifs_inode_needs_reval(inode))
+		goto check_inval;
 
 	/* can not safely grab the rename sem here if rename calls revalidate
 	   since that would deadlock */
-	full_path = build_path_from_dentry(direntry);
+	full_path = build_path_from_dentry(dentry);
 	if (full_path == NULL) {
 		rc = -ENOMEM;
-		FreeXid(xid);
-		return rc;
-	}
-	cFYI(1, ("Revalidate: %s inode 0x%p count %d dentry: 0x%p d_time %ld "
-		 "jiffies %ld", full_path, direntry->d_inode,
-		 direntry->d_inode->i_count.counter, direntry,
-		 direntry->d_time, jiffies));
-
-	if (cifsInode->time == 0) {
-		/* was set to zero previously to force revalidate */
-	} else if (time_before(jiffies, cifsInode->time + HZ) &&
-		   lookupCacheEnabled) {
-		if ((S_ISREG(direntry->d_inode->i_mode) == 0) ||
-		    (direntry->d_inode->i_nlink == 1)) {
-			kfree(full_path);
-			FreeXid(xid);
-			return rc;
-		} else {
-			cFYI(1, ("Have to revalidate file due to hardlinks"));
-		}
-	}
-
-	/* save mtime and size */
-	local_mtime = direntry->d_inode->i_mtime;
-	local_size = direntry->d_inode->i_size;
-
-	if (cifs_sb->tcon->unix_ext) {
-		rc = cifs_get_inode_info_unix(&direntry->d_inode, full_path,
-					      direntry->d_sb, xid);
-		if (rc) {
-			cFYI(1, ("error on getting revalidate info %d", rc));
-/*			if (rc != -ENOENT)
-				rc = 0; */	/* BB should we cache info on
-						   certain errors? */
-		}
-	} else {
-		rc = cifs_get_inode_info(&direntry->d_inode, full_path, NULL,
-					 direntry->d_sb, xid, NULL);
-		if (rc) {
-			cFYI(1, ("error on getting revalidate info %d", rc));
-/*			if (rc != -ENOENT)
-				rc = 0; */	/* BB should we cache info on
-						   certain errors? */
-		}
+		goto check_inval;
 	}
-	/* should we remap certain errors, access denied?, to zero */
 
-	/* if not oplocked, we invalidate inode pages if mtime or file size
-	   had changed on server */
+	cFYI(1, "Revalidate: %s inode 0x%p count %d dentry: 0x%p d_time %ld "
+		 "jiffies %ld", full_path, inode, inode->i_count.counter,
+		 dentry, dentry->d_time, jiffies);
 
-	if (timespec_equal(&local_mtime, &direntry->d_inode->i_mtime) &&
-	    (local_size == direntry->d_inode->i_size)) {
-		cFYI(1, ("cifs_revalidate - inode unchanged"));
-	} else {
-		/* file may have changed on server */
-		if (cifsInode->clientCanCacheRead) {
-			/* no need to invalidate inode pages since we were the
-			   only ones who could have modified the file and the
-			   server copy is staler than ours */
-		} else {
-			invalidate_inode = true;
-		}
-	}
+	if (CIFS_SB(sb)->tcon->unix_ext)
+		rc = cifs_get_inode_info_unix(&inode, full_path, sb, xid);
+	else
+		rc = cifs_get_inode_info(&inode, full_path, NULL, sb,
+					 xid, NULL);
 
-	/* can not grab this sem since kernel filesys locking documentation
-	   indicates i_mutex may be taken by the kernel on lookup and rename
-	   which could deadlock if we grab the i_mutex here as well */
-/*	mutex_lock(&direntry->d_inode->i_mutex);*/
-	/* need to write out dirty pages here  */
-	if (direntry->d_inode->i_mapping) {
-		/* do we need to lock inode until after invalidate completes
-		   below? */
-		wbrc = filemap_fdatawrite(direntry->d_inode->i_mapping);
-		if (wbrc)
-			CIFS_I(direntry->d_inode)->write_behind_rc = wbrc;
-	}
-	if (invalidate_inode) {
-	/* shrink_dcache not necessary now that cifs dentry ops
-	are exported for negative dentries */
-/*		if (S_ISDIR(direntry->d_inode->i_mode))
-			shrink_dcache_parent(direntry); */
-		if (S_ISREG(direntry->d_inode->i_mode)) {
-			if (direntry->d_inode->i_mapping) {
-				wbrc = filemap_fdatawait(direntry->d_inode->i_mapping);
-				if (wbrc)
-					CIFS_I(direntry->d_inode)->write_behind_rc = wbrc;
-			}
-			/* may eventually have to do this for open files too */
-			if (list_empty(&(cifsInode->openFileList))) {
-				/* changed on server - flush read ahead pages */
-				cFYI(1, ("Invalidating read ahead data on "
-					 "closed file"));
-				invalidate_remote_inode(direntry->d_inode);
-			}
-		}
-	}
-/*	mutex_unlock(&direntry->d_inode->i_mutex); */
+check_inval:
+	if (CIFS_I(inode)->invalid_mapping)
+		cifs_invalidate_mapping(inode);
 
 	kfree(full_path);
 	FreeXid(xid);
@@ -1526,7 +1637,7 @@ int cifs_revalidate(struct dentry *direntry)
 int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
 	struct kstat *stat)
 {
-	int err = cifs_revalidate(dentry);
+	int err = cifs_revalidate_dentry(dentry);
 	if (!err) {
 		generic_fillattr(dentry->d_inode, stat);
 		stat->blksize = CIFS_MAX_MSGSIZE;
@@ -1600,12 +1711,12 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
 		rc = CIFSSMBSetFileSize(xid, pTcon, attrs->ia_size, nfid,
 					npid, false);
 		cifsFileInfo_put(open_file);
-		cFYI(1, ("SetFSize for attrs rc = %d", rc));
+		cFYI(1, "SetFSize for attrs rc = %d", rc);
 		if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
 			unsigned int bytes_written;
 			rc = CIFSSMBWrite(xid, pTcon, nfid, 0, attrs->ia_size,
 					  &bytes_written, NULL, NULL, 1);
-			cFYI(1, ("Wrt seteof rc %d", rc));
+			cFYI(1, "Wrt seteof rc %d", rc);
 		}
 	} else
 		rc = -EINVAL;
@@ -1619,7 +1730,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
 				   false, cifs_sb->local_nls,
 				   cifs_sb->mnt_cifs_flags &
 					CIFS_MOUNT_MAP_SPECIAL_CHR);
-		cFYI(1, ("SetEOF by path (setattrs) rc = %d", rc));
+		cFYI(1, "SetEOF by path (setattrs) rc = %d", rc);
 		if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
 			__u16 netfid;
 			int oplock = 0;
@@ -1636,7 +1747,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
 						  attrs->ia_size,
 						  &bytes_written, NULL,
 						  NULL, 1);
-				cFYI(1, ("wrt seteof rc %d", rc));
+				cFYI(1, "wrt seteof rc %d", rc);
 				CIFSSMBClose(xid, pTcon, netfid);
 			}
 		}
@@ -1664,8 +1775,8 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
 	struct cifs_unix_set_info_args *args = NULL;
 	struct cifsFileInfo *open_file;
 
-	cFYI(1, ("setattr_unix on file %s attrs->ia_valid=0x%x",
-		 direntry->d_name.name, attrs->ia_valid));
+	cFYI(1, "setattr_unix on file %s attrs->ia_valid=0x%x",
+		 direntry->d_name.name, attrs->ia_valid);
 
 	xid = GetXid();
 
@@ -1762,8 +1873,18 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
 					CIFS_MOUNT_MAP_SPECIAL_CHR);
 	}
 
-	if (!rc)
+	if (!rc) {
 		rc = inode_setattr(inode, attrs);
+
+		/* force revalidate when any of these times are set since some
+		   of the fs types (eg ext3, fat) do not have fine enough
+		   time granularity to match protocol, and we do not have a
+		   a way (yet) to query the server fs's time granularity (and
+		   whether it rounds times down).
+		*/
+		if (!rc && (attrs->ia_valid & (ATTR_MTIME | ATTR_CTIME)))
+			cifsInode->time = 0;
+	}
 out:
 	kfree(args);
 	kfree(full_path);
@@ -1785,8 +1906,8 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
 
 	xid = GetXid();
 
-	cFYI(1, ("setattr on file %s attrs->iavalid 0x%x",
-		 direntry->d_name.name, attrs->ia_valid));
+	cFYI(1, "setattr on file %s attrs->iavalid 0x%x",
+		 direntry->d_name.name, attrs->ia_valid);
 
 	if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) == 0) {
 		/* check if we have permission to change attrs */
@@ -1843,7 +1964,7 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
 		attrs->ia_valid &= ~ATTR_MODE;
 
 	if (attrs->ia_valid & ATTR_MODE) {
-		cFYI(1, ("Mode changed to 0%o", attrs->ia_mode));
+		cFYI(1, "Mode changed to 0%o", attrs->ia_mode);
 		mode = attrs->ia_mode;
 	}
 
@@ -1929,7 +2050,7 @@ cifs_setattr(struct dentry *direntry, struct iattr *attrs)
 #if 0
 void cifs_delete_inode(struct inode *inode)
 {
-	cFYI(1, ("In cifs_delete_inode, inode = 0x%p", inode));
+	cFYI(1, "In cifs_delete_inode, inode = 0x%p", inode);
 	/* may have to add back in if and when safe distributed caching of
 	   directories added e.g. via FindNotify */
 }
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index f94650683a00..505926f1ee6b 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -47,7 +47,7 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
 
 	xid = GetXid();
 
-	cFYI(1, ("ioctl file %p  cmd %u  arg %lu", filep, command, arg));
+	cFYI(1, "ioctl file %p  cmd %u  arg %lu", filep, command, arg);
 
 	cifs_sb = CIFS_SB(inode->i_sb);
 
@@ -64,12 +64,12 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
 
 	switch (command) {
 		case CIFS_IOC_CHECKUMOUNT:
-			cFYI(1, ("User unmount attempted"));
+			cFYI(1, "User unmount attempted");
 			if (cifs_sb->mnt_uid == current_uid())
 				rc = 0;
 			else {
 				rc = -EACCES;
-				cFYI(1, ("uids do not match"));
+				cFYI(1, "uids do not match");
 			}
 			break;
 #ifdef CONFIG_CIFS_POSIX
@@ -97,11 +97,11 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
 				/* rc= CIFSGetExtAttr(xid,tcon,pSMBFile->netfid,
 					extAttrBits, &ExtAttrMask);*/
 			}
-			cFYI(1, ("set flags not implemented yet"));
+			cFYI(1, "set flags not implemented yet");
 			break;
 #endif /* CONFIG_CIFS_POSIX */
 		default:
-			cFYI(1, ("unsupported ioctl"));
+			cFYI(1, "unsupported ioctl");
 			break;
 	}
 
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index fc1e0487eaee..473ca8033656 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -20,6 +20,7 @@
  */
 #include <linux/fs.h>
 #include <linux/stat.h>
+#include <linux/slab.h>
 #include <linux/namei.h>
 #include "cifsfs.h"
 #include "cifspdu.h"
@@ -138,7 +139,7 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd)
 	if (!full_path)
 		goto out;
 
-	cFYI(1, ("Full path: %s inode = 0x%p", full_path, inode));
+	cFYI(1, "Full path: %s inode = 0x%p", full_path, inode);
 
 	rc = CIFSSMBUnixQuerySymLink(xid, tcon, full_path, &target_path,
 				     cifs_sb->local_nls);
@@ -177,8 +178,8 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
 		return rc;
 	}
 
-	cFYI(1, ("Full path: %s", full_path));
-	cFYI(1, ("symname is %s", symname));
+	cFYI(1, "Full path: %s", full_path);
+	cFYI(1, "symname is %s", symname);
 
 	/* BB what if DFS and this volume is on different share? BB */
 	if (pTcon->unix_ext)
@@ -197,8 +198,8 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
 						 inode->i_sb, xid, NULL);
 
 		if (rc != 0) {
-			cFYI(1, ("Create symlink ok, getinodeinfo fail rc = %d",
-			      rc));
+			cFYI(1, "Create symlink ok, getinodeinfo fail rc = %d",
+			      rc);
 		} else {
 			if (pTcon->nocase)
 				direntry->d_op = &cifs_ci_dentry_ops;
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index d27d4ec6579b..1394aa37f26c 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -51,7 +51,7 @@ _GetXid(void)
 	if (GlobalTotalActiveXid > GlobalMaxActiveXid)
 		GlobalMaxActiveXid = GlobalTotalActiveXid;
 	if (GlobalTotalActiveXid > 65000)
-		cFYI(1, ("warning: more than 65000 requests active"));
+		cFYI(1, "warning: more than 65000 requests active");
 	xid = GlobalCurrentXid++;
 	spin_unlock(&GlobalMid_Lock);
 	return xid;
@@ -79,7 +79,7 @@ sesInfoAlloc(void)
 		++ret_buf->ses_count;
 		INIT_LIST_HEAD(&ret_buf->smb_ses_list);
 		INIT_LIST_HEAD(&ret_buf->tcon_list);
-		init_MUTEX(&ret_buf->sesSem);
+		mutex_init(&ret_buf->session_mutex);
 	}
 	return ret_buf;
 }
@@ -88,7 +88,7 @@ void
 sesInfoFree(struct cifsSesInfo *buf_to_free)
 {
 	if (buf_to_free == NULL) {
-		cFYI(1, ("Null buffer passed to sesInfoFree"));
+		cFYI(1, "Null buffer passed to sesInfoFree");
 		return;
 	}
 
@@ -126,7 +126,7 @@ void
 tconInfoFree(struct cifsTconInfo *buf_to_free)
 {
 	if (buf_to_free == NULL) {
-		cFYI(1, ("Null buffer passed to tconInfoFree"));
+		cFYI(1, "Null buffer passed to tconInfoFree");
 		return;
 	}
 	atomic_dec(&tconInfoAllocCount);
@@ -166,7 +166,7 @@ void
 cifs_buf_release(void *buf_to_free)
 {
 	if (buf_to_free == NULL) {
-		/* cFYI(1, ("Null buffer passed to cifs_buf_release"));*/
+		/* cFYI(1, "Null buffer passed to cifs_buf_release");*/
 		return;
 	}
 	mempool_free(buf_to_free, cifs_req_poolp);
@@ -202,7 +202,7 @@ cifs_small_buf_release(void *buf_to_free)
 {
 
 	if (buf_to_free == NULL) {
-		cFYI(1, ("Null buffer passed to cifs_small_buf_release"));
+		cFYI(1, "Null buffer passed to cifs_small_buf_release");
 		return;
 	}
 	mempool_free(buf_to_free, cifs_sm_req_poolp);
@@ -345,19 +345,19 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
 		/*      with userid/password pairs found on the smb session   */
 		/*	for other target tcp/ip addresses 		BB    */
 				if (current_fsuid() != treeCon->ses->linux_uid) {
-					cFYI(1, ("Multiuser mode and UID "
-						 "did not match tcon uid"));
+					cFYI(1, "Multiuser mode and UID "
+						 "did not match tcon uid");
 					read_lock(&cifs_tcp_ses_lock);
 					list_for_each(temp_item, &treeCon->ses->server->smb_ses_list) {
 						ses = list_entry(temp_item, struct cifsSesInfo, smb_ses_list);
 						if (ses->linux_uid == current_fsuid()) {
 							if (ses->server == treeCon->ses->server) {
-								cFYI(1, ("found matching uid substitute right smb_uid"));
+								cFYI(1, "found matching uid substitute right smb_uid");
 								buffer->Uid = ses->Suid;
 								break;
 							} else {
 				/* BB eventually call cifs_setup_session here */
-								cFYI(1, ("local UID found but no smb sess with this server exists"));
+								cFYI(1, "local UID found but no smb sess with this server exists");
 							}
 						}
 					}
@@ -394,17 +394,16 @@ checkSMBhdr(struct smb_hdr *smb, __u16 mid)
 			if (smb->Command == SMB_COM_LOCKING_ANDX)
 				return 0;
 			else
-				cERROR(1, ("Received Request not response"));
+				cERROR(1, "Received Request not response");
 		}
 	} else { /* bad signature or mid */
 		if (*(__le32 *) smb->Protocol != cpu_to_le32(0x424d53ff))
-			cERROR(1,
-			       ("Bad protocol string signature header %x",
-				*(unsigned int *) smb->Protocol));
+			cERROR(1, "Bad protocol string signature header %x",
+				*(unsigned int *) smb->Protocol);
 		if (mid != smb->Mid)
-			cERROR(1, ("Mids do not match"));
+			cERROR(1, "Mids do not match");
 	}
-	cERROR(1, ("bad smb detected. The Mid=%d", smb->Mid));
+	cERROR(1, "bad smb detected. The Mid=%d", smb->Mid);
 	return 1;
 }
 
@@ -413,7 +412,7 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
 {
 	__u32 len = smb->smb_buf_length;
 	__u32 clc_len;  /* calculated length */
-	cFYI(0, ("checkSMB Length: 0x%x, smb_buf_length: 0x%x", length, len));
+	cFYI(0, "checkSMB Length: 0x%x, smb_buf_length: 0x%x", length, len);
 
 	if (length < 2 + sizeof(struct smb_hdr)) {
 		if ((length >= sizeof(struct smb_hdr) - 1)
@@ -437,15 +436,15 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
 				tmp[sizeof(struct smb_hdr)+1] = 0;
 				return 0;
 			}
-			cERROR(1, ("rcvd invalid byte count (bcc)"));
+			cERROR(1, "rcvd invalid byte count (bcc)");
 		} else {
-			cERROR(1, ("Length less than smb header size"));
+			cERROR(1, "Length less than smb header size");
 		}
 		return 1;
 	}
 	if (len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) {
-		cERROR(1, ("smb length greater than MaxBufSize, mid=%d",
-				   smb->Mid));
+		cERROR(1, "smb length greater than MaxBufSize, mid=%d",
+				   smb->Mid);
 		return 1;
 	}
 
@@ -454,8 +453,8 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
 	clc_len = smbCalcSize_LE(smb);
 
 	if (4 + len != length) {
-		cERROR(1, ("Length read does not match RFC1001 length %d",
-			   len));
+		cERROR(1, "Length read does not match RFC1001 length %d",
+			   len);
 		return 1;
 	}
 
@@ -466,8 +465,8 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
 			if (((4 + len) & 0xFFFF) == (clc_len & 0xFFFF))
 				return 0; /* bcc wrapped */
 		}
-		cFYI(1, ("Calculated size %d vs length %d mismatch for mid %d",
-				clc_len, 4 + len, smb->Mid));
+		cFYI(1, "Calculated size %d vs length %d mismatch for mid %d",
+				clc_len, 4 + len, smb->Mid);
 		/* Windows XP can return a few bytes too much, presumably
 		an illegal pad, at the end of byte range lock responses
 		so we allow for that three byte pad, as long as actual
@@ -482,8 +481,8 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
 		if ((4+len > clc_len) && (len <= clc_len + 512))
 			return 0;
 		else {
-			cERROR(1, ("RFC1001 size %d bigger than SMB for Mid=%d",
-					len, smb->Mid));
+			cERROR(1, "RFC1001 size %d bigger than SMB for Mid=%d",
+					len, smb->Mid);
 			return 1;
 		}
 	}
@@ -501,7 +500,7 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
 	struct cifsFileInfo *netfile;
 	int rc;
 
-	cFYI(1, ("Checking for oplock break or dnotify response"));
+	cFYI(1, "Checking for oplock break or dnotify response");
 	if ((pSMB->hdr.Command == SMB_COM_NT_TRANSACT) &&
 	   (pSMB->hdr.Flags & SMBFLG_RESPONSE)) {
 		struct smb_com_transaction_change_notify_rsp *pSMBr =
@@ -513,15 +512,15 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
 
 			pnotify = (struct file_notify_information *)
 				((char *)&pSMBr->hdr.Protocol + data_offset);
-			cFYI(1, ("dnotify on %s Action: 0x%x",
-				 pnotify->FileName, pnotify->Action));
+			cFYI(1, "dnotify on %s Action: 0x%x",
+				 pnotify->FileName, pnotify->Action);
 			/*   cifs_dump_mem("Rcvd notify Data: ",buf,
 				sizeof(struct smb_hdr)+60); */
 			return true;
 		}
 		if (pSMBr->hdr.Status.CifsError) {
-			cFYI(1, ("notify err 0x%d",
-				pSMBr->hdr.Status.CifsError));
+			cFYI(1, "notify err 0x%d",
+				pSMBr->hdr.Status.CifsError);
 			return true;
 		}
 		return false;
@@ -535,7 +534,7 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
 		   large dirty files cached on the client */
 		if ((NT_STATUS_INVALID_HANDLE) ==
 		   le32_to_cpu(pSMB->hdr.Status.CifsError)) {
-			cFYI(1, ("invalid handle on oplock break"));
+			cFYI(1, "invalid handle on oplock break");
 			return true;
 		} else if (ERRbadfid ==
 		   le16_to_cpu(pSMB->hdr.Status.DosError.Error)) {
@@ -547,8 +546,8 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
 	if (pSMB->hdr.WordCount != 8)
 		return false;
 
-	cFYI(1, ("oplock type 0x%d level 0x%d",
-		 pSMB->LockType, pSMB->OplockLevel));
+	cFYI(1, "oplock type 0x%d level 0x%d",
+		 pSMB->LockType, pSMB->OplockLevel);
 	if (!(pSMB->LockType & LOCKING_ANDX_OPLOCK_RELEASE))
 		return false;
 
@@ -579,15 +578,15 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
 					return true;
 				}
 
-				cFYI(1, ("file id match, oplock break"));
+				cFYI(1, "file id match, oplock break");
 				pCifsInode = CIFS_I(netfile->pInode);
 				pCifsInode->clientCanCacheAll = false;
 				if (pSMB->OplockLevel == 0)
 					pCifsInode->clientCanCacheRead = false;
 				rc = slow_work_enqueue(&netfile->oplock_break);
 				if (rc) {
-					cERROR(1, ("failed to enqueue oplock "
-						   "break: %d\n", rc));
+					cERROR(1, "failed to enqueue oplock "
+						   "break: %d\n", rc);
 				} else {
 					netfile->oplock_break_cancelled = false;
 				}
@@ -597,12 +596,12 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
 			}
 			read_unlock(&GlobalSMBSeslock);
 			read_unlock(&cifs_tcp_ses_lock);
-			cFYI(1, ("No matching file for oplock break"));
+			cFYI(1, "No matching file for oplock break");
 			return true;
 		}
 	}
 	read_unlock(&cifs_tcp_ses_lock);
-	cFYI(1, ("Can not process oplock break for non-existent connection"));
+	cFYI(1, "Can not process oplock break for non-existent connection");
 	return true;
 }
 
@@ -721,11 +720,11 @@ cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb)
 {
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
 		cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM;
-		cERROR(1, ("Autodisabling the use of server inode numbers on "
+		cERROR(1, "Autodisabling the use of server inode numbers on "
 			   "%s. This server doesn't seem to support them "
 			   "properly. Hardlinks will not be recognized on this "
 			   "mount. Consider mounting with the \"noserverino\" "
 			   "option to silence this message.",
-			   cifs_sb->tcon->treeName));
+			   cifs_sb->tcon->treeName);
 	}
 }
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index bd6d6895730d..d35d52889cb5 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -149,7 +149,7 @@ cifs_inet_pton(const int address_family, const char *cp, void *dst)
 	else if (address_family == AF_INET6)
 		ret = in6_pton(cp, -1 /* len */, dst , '\\', NULL);
 
-	cFYI(DBG2, ("address conversion returned %d for %s", ret, cp));
+	cFYI(DBG2, "address conversion returned %d for %s", ret, cp);
 	if (ret > 0)
 		ret = 1;
 	return ret;
@@ -870,8 +870,8 @@ map_smb_to_linux_error(struct smb_hdr *smb, int logErr)
 	}
 	/* else ERRHRD class errors or junk  - return EIO */
 
-	cFYI(1, ("Mapping smb error code %d to POSIX err %d",
-		 smberrcode, rc));
+	cFYI(1, "Mapping smb error code %d to POSIX err %d",
+		 smberrcode, rc);
 
 	/* generic corrective action e.g. reconnect SMB session on
 	 * ERRbaduid could be added */
@@ -940,20 +940,20 @@ struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, int offset)
 	SMB_TIME *st = (SMB_TIME *)&time;
 	SMB_DATE *sd = (SMB_DATE *)&date;
 
-	cFYI(1, ("date %d time %d", date, time));
+	cFYI(1, "date %d time %d", date, time);
 
 	sec = 2 * st->TwoSeconds;
 	min = st->Minutes;
 	if ((sec > 59) || (min > 59))
-		cERROR(1, ("illegal time min %d sec %d", min, sec));
+		cERROR(1, "illegal time min %d sec %d", min, sec);
 	sec += (min * 60);
 	sec += 60 * 60 * st->Hours;
 	if (st->Hours > 24)
-		cERROR(1, ("illegal hours %d", st->Hours));
+		cERROR(1, "illegal hours %d", st->Hours);
 	days = sd->Day;
 	month = sd->Month;
 	if ((days > 31) || (month > 12)) {
-		cERROR(1, ("illegal date, month %d day: %d", month, days));
+		cERROR(1, "illegal date, month %d day: %d", month, days);
 		if (month > 12)
 			month = 12;
 	}
@@ -979,7 +979,7 @@ struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, int offset)
 
 	ts.tv_sec = sec + offset;
 
-	/* cFYI(1,("sec after cnvrt dos to unix time %d",sec)); */
+	/* cFYI(1, "sec after cnvrt dos to unix time %d",sec); */
 
 	ts.tv_nsec = 0;
 	return ts;
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index f84062f9a985..daf1753af674 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -22,6 +22,7 @@
  */
 #include <linux/fs.h>
 #include <linux/pagemap.h>
+#include <linux/slab.h>
 #include <linux/stat.h>
 #include "cifspdu.h"
 #include "cifsglob.h"
@@ -46,15 +47,15 @@ static void dump_cifs_file_struct(struct file *file, char *label)
 	if (file) {
 		cf = file->private_data;
 		if (cf == NULL) {
-			cFYI(1, ("empty cifs private file data"));
+			cFYI(1, "empty cifs private file data");
 			return;
 		}
 		if (cf->invalidHandle)
-			cFYI(1, ("invalid handle"));
+			cFYI(1, "invalid handle");
 		if (cf->srch_inf.endOfSearch)
-			cFYI(1, ("end of search"));
+			cFYI(1, "end of search");
 		if (cf->srch_inf.emptyDir)
-			cFYI(1, ("empty dir"));
+			cFYI(1, "empty dir");
 	}
 }
 #else
@@ -75,7 +76,12 @@ cifs_readdir_lookup(struct dentry *parent, struct qstr *name,
 	struct inode *inode;
 	struct super_block *sb = parent->d_inode->i_sb;
 
-	cFYI(1, ("For %s", name->name));
+	cFYI(1, "For %s", name->name);
+
+	if (parent->d_op && parent->d_op->d_hash)
+		parent->d_op->d_hash(parent, name);
+	else
+		name->hash = full_name_hash(name->name, name->len);
 
 	dentry = d_lookup(parent, name);
 	if (dentry) {
@@ -208,7 +214,7 @@ int get_symlink_reparse_path(char *full_path, struct cifs_sb_info *cifs_sb,
 				fid,
 				cifs_sb->local_nls);
 		if (CIFSSMBClose(xid, ptcon, fid)) {
-			cFYI(1, ("Error closing temporary reparsepoint open)"));
+			cFYI(1, "Error closing temporary reparsepoint open");
 		}
 	}
 }
@@ -246,7 +252,7 @@ static int initiate_cifs_search(const int xid, struct file *file)
 	if (full_path == NULL)
 		return -ENOMEM;
 
-	cFYI(1, ("Full path: %s start at: %lld", full_path, file->f_pos));
+	cFYI(1, "Full path: %s start at: %lld", full_path, file->f_pos);
 
 ffirst_retry:
 	/* test for Unix extensions */
@@ -291,7 +297,7 @@ static int cifs_unicode_bytelen(char *str)
 		if (ustr[len] == 0)
 			return len << 1;
 	}
-	cFYI(1, ("Unicode string longer than PATH_MAX found"));
+	cFYI(1, "Unicode string longer than PATH_MAX found");
 	return len << 1;
 }
 
@@ -308,19 +314,18 @@ static char *nxt_dir_entry(char *old_entry, char *end_of_smb, int level)
 				pfData->FileNameLength;
 	} else
 		new_entry = old_entry + le32_to_cpu(pDirInfo->NextEntryOffset);
-	cFYI(1, ("new entry %p old entry %p", new_entry, old_entry));
+	cFYI(1, "new entry %p old entry %p", new_entry, old_entry);
 	/* validate that new_entry is not past end of SMB */
 	if (new_entry >= end_of_smb) {
-		cERROR(1,
-		      ("search entry %p began after end of SMB %p old entry %p",
-			new_entry, end_of_smb, old_entry));
+		cERROR(1, "search entry %p began after end of SMB %p old entry %p",
+			new_entry, end_of_smb, old_entry);
 		return NULL;
 	} else if (((level == SMB_FIND_FILE_INFO_STANDARD) &&
 		    (new_entry + sizeof(FIND_FILE_STANDARD_INFO) > end_of_smb))
 		  || ((level != SMB_FIND_FILE_INFO_STANDARD) &&
 		   (new_entry + sizeof(FILE_DIRECTORY_INFO) > end_of_smb)))  {
-		cERROR(1, ("search entry %p extends after end of SMB %p",
-			new_entry, end_of_smb));
+		cERROR(1, "search entry %p extends after end of SMB %p",
+			new_entry, end_of_smb);
 		return NULL;
 	} else
 		return new_entry;
@@ -374,8 +379,8 @@ static int cifs_entry_is_dot(char *current_entry, struct cifsFileInfo *cfile)
 		filename = &pFindData->FileName[0];
 		len = pFindData->FileNameLength;
 	} else {
-		cFYI(1, ("Unknown findfirst level %d",
-			 cfile->srch_inf.info_level));
+		cFYI(1, "Unknown findfirst level %d",
+			 cfile->srch_inf.info_level);
 	}
 
 	if (filename) {
@@ -475,7 +480,7 @@ static int cifs_save_resume_key(const char *current_entry,
 		len = (unsigned int)pFindData->FileNameLength;
 		cifsFile->srch_inf.resume_key = pFindData->ResumeKey;
 	} else {
-		cFYI(1, ("Unknown findfirst level %d", level));
+		cFYI(1, "Unknown findfirst level %d", level);
 		return -EINVAL;
 	}
 	cifsFile->srch_inf.resume_name_len = len;
@@ -519,7 +524,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
 	     is_dir_changed(file)) ||
 	   (index_to_find < first_entry_in_buffer)) {
 		/* close and restart search */
-		cFYI(1, ("search backing up - close and restart search"));
+		cFYI(1, "search backing up - close and restart search");
 		write_lock(&GlobalSMBSeslock);
 		if (!cifsFile->srch_inf.endOfSearch &&
 		    !cifsFile->invalidHandle) {
@@ -529,7 +534,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
 		} else
 			write_unlock(&GlobalSMBSeslock);
 		if (cifsFile->srch_inf.ntwrk_buf_start) {
-			cFYI(1, ("freeing SMB ff cache buf on search rewind"));
+			cFYI(1, "freeing SMB ff cache buf on search rewind");
 			if (cifsFile->srch_inf.smallBuf)
 				cifs_small_buf_release(cifsFile->srch_inf.
 						ntwrk_buf_start);
@@ -540,8 +545,8 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
 		}
 		rc = initiate_cifs_search(xid, file);
 		if (rc) {
-			cFYI(1, ("error %d reinitiating a search on rewind",
-				 rc));
+			cFYI(1, "error %d reinitiating a search on rewind",
+				 rc);
 			return rc;
 		}
 		cifs_save_resume_key(cifsFile->srch_inf.last_entry, cifsFile);
@@ -549,7 +554,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
 
 	while ((index_to_find >= cifsFile->srch_inf.index_of_last_entry) &&
 	      (rc == 0) && !cifsFile->srch_inf.endOfSearch) {
-		cFYI(1, ("calling findnext2"));
+		cFYI(1, "calling findnext2");
 		rc = CIFSFindNext(xid, pTcon, cifsFile->netfid,
 				  &cifsFile->srch_inf);
 		cifs_save_resume_key(cifsFile->srch_inf.last_entry, cifsFile);
@@ -569,7 +574,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
 		first_entry_in_buffer = cifsFile->srch_inf.index_of_last_entry
 					- cifsFile->srch_inf.entries_in_buffer;
 		pos_in_buf = index_to_find - first_entry_in_buffer;
-		cFYI(1, ("found entry - pos_in_buf %d", pos_in_buf));
+		cFYI(1, "found entry - pos_in_buf %d", pos_in_buf);
 
 		for (i = 0; (i < (pos_in_buf)) && (current_entry != NULL); i++) {
 			/* go entry by entry figuring out which is first */
@@ -578,19 +583,19 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
 		}
 		if ((current_entry == NULL) && (i < pos_in_buf)) {
 			/* BB fixme - check if we should flag this error */
-			cERROR(1, ("reached end of buf searching for pos in buf"
+			cERROR(1, "reached end of buf searching for pos in buf"
 			  " %d index to find %lld rc %d",
-			  pos_in_buf, index_to_find, rc));
+			  pos_in_buf, index_to_find, rc);
 		}
 		rc = 0;
 		*ppCurrentEntry = current_entry;
 	} else {
-		cFYI(1, ("index not in buffer - could not findnext into it"));
+		cFYI(1, "index not in buffer - could not findnext into it");
 		return 0;
 	}
 
 	if (pos_in_buf >= cifsFile->srch_inf.entries_in_buffer) {
-		cFYI(1, ("can not return entries pos_in_buf beyond last"));
+		cFYI(1, "can not return entries pos_in_buf beyond last");
 		*num_to_ret = 0;
 	} else
 		*num_to_ret = cifsFile->srch_inf.entries_in_buffer - pos_in_buf;
@@ -650,12 +655,12 @@ static int cifs_get_name_from_search_buf(struct qstr *pqst,
 		/* one byte length, no name conversion */
 		len = (unsigned int)pFindData->FileNameLength;
 	} else {
-		cFYI(1, ("Unknown findfirst level %d", level));
+		cFYI(1, "Unknown findfirst level %d", level);
 		return -EINVAL;
 	}
 
 	if (len > max_len) {
-		cERROR(1, ("bad search response length %d past smb end", len));
+		cERROR(1, "bad search response length %d past smb end", len);
 		return -EINVAL;
 	}
 
@@ -666,12 +671,11 @@ static int cifs_get_name_from_search_buf(struct qstr *pqst,
 					   min(len, max_len), nlt,
 					   cifs_sb->mnt_cifs_flags &
 						CIFS_MOUNT_MAP_SPECIAL_CHR);
+		pqst->len -= nls_nullsize(nlt);
 	} else {
 		pqst->name = filename;
 		pqst->len = len;
 	}
-	pqst->hash = full_name_hash(pqst->name, pqst->len);
-/*	cFYI(1, ("filldir on %s",pqst->name));  */
 	return rc;
 }
 
@@ -749,7 +753,7 @@ static int cifs_filldir(char *pfindEntry, struct file *file, filldir_t filldir,
 	 * case already. Why should we be clobbering other errors from it?
 	 */
 	if (rc) {
-		cFYI(1, ("filldir rc = %d", rc));
+		cFYI(1, "filldir rc = %d", rc);
 		rc = -EOVERFLOW;
 	}
 	dput(tmp_dentry);
@@ -781,7 +785,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
 	case 0:
 		if (filldir(direntry, ".", 1, file->f_pos,
 		     file->f_path.dentry->d_inode->i_ino, DT_DIR) < 0) {
-			cERROR(1, ("Filldir for current dir failed"));
+			cERROR(1, "Filldir for current dir failed");
 			rc = -ENOMEM;
 			break;
 		}
@@ -789,7 +793,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
 	case 1:
 		if (filldir(direntry, "..", 2, file->f_pos,
 		     file->f_path.dentry->d_parent->d_inode->i_ino, DT_DIR) < 0) {
-			cERROR(1, ("Filldir for parent dir failed"));
+			cERROR(1, "Filldir for parent dir failed");
 			rc = -ENOMEM;
 			break;
 		}
@@ -802,7 +806,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
 
 		if (file->private_data == NULL) {
 			rc = initiate_cifs_search(xid, file);
-			cFYI(1, ("initiate cifs search rc %d", rc));
+			cFYI(1, "initiate cifs search rc %d", rc);
 			if (rc) {
 				FreeXid(xid);
 				return rc;
@@ -816,7 +820,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
 		cifsFile = file->private_data;
 		if (cifsFile->srch_inf.endOfSearch) {
 			if (cifsFile->srch_inf.emptyDir) {
-				cFYI(1, ("End of search, empty dir"));
+				cFYI(1, "End of search, empty dir");
 				rc = 0;
 				break;
 			}
@@ -828,16 +832,16 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
 		rc = find_cifs_entry(xid, pTcon, file,
 				&current_entry, &num_to_fill);
 		if (rc) {
-			cFYI(1, ("fce error %d", rc));
+			cFYI(1, "fce error %d", rc);
 			goto rddir2_exit;
 		} else if (current_entry != NULL) {
-			cFYI(1, ("entry %lld found", file->f_pos));
+			cFYI(1, "entry %lld found", file->f_pos);
 		} else {
-			cFYI(1, ("could not find entry"));
+			cFYI(1, "could not find entry");
 			goto rddir2_exit;
 		}
-		cFYI(1, ("loop through %d times filling dir for net buf %p",
-			num_to_fill, cifsFile->srch_inf.ntwrk_buf_start));
+		cFYI(1, "loop through %d times filling dir for net buf %p",
+			num_to_fill, cifsFile->srch_inf.ntwrk_buf_start);
 		max_len = smbCalcSize((struct smb_hdr *)
 				cifsFile->srch_inf.ntwrk_buf_start);
 		end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + max_len;
@@ -846,8 +850,8 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
 		for (i = 0; (i < num_to_fill) && (rc == 0); i++) {
 			if (current_entry == NULL) {
 				/* evaluate whether this case is an error */
-				cERROR(1, ("past SMB end,  num to fill %d i %d",
-					  num_to_fill, i));
+				cERROR(1, "past SMB end,  num to fill %d i %d",
+					  num_to_fill, i);
 				break;
 			}
 			/* if buggy server returns . and .. late do
@@ -862,8 +866,8 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
 			file->f_pos++;
 			if (file->f_pos ==
 				cifsFile->srch_inf.index_of_last_entry) {
-				cFYI(1, ("last entry in buf at pos %lld %s",
-					file->f_pos, tmp_buf));
+				cFYI(1, "last entry in buf at pos %lld %s",
+					file->f_pos, tmp_buf);
 				cifs_save_resume_key(current_entry, cifsFile);
 				break;
 			} else
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 7085a6275c4c..0a57cb7db5dd 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -29,14 +29,17 @@
 #include "ntlmssp.h"
 #include "nterr.h"
 #include <linux/utsname.h>
+#include <linux/slab.h>
 #include "cifs_spnego.h"
 
 extern void SMBNTencrypt(unsigned char *passwd, unsigned char *c8,
 			 unsigned char *p24);
 
-/* Checks if this is the first smb session to be reconnected after
-   the socket has been reestablished (so we know whether to use vc 0).
-   Called while holding the cifs_tcp_ses_lock, so do not block */
+/*
+ * Checks if this is the first smb session to be reconnected after
+ * the socket has been reestablished (so we know whether to use vc 0).
+ * Called while holding the cifs_tcp_ses_lock, so do not block
+ */
 static bool is_first_ses_reconnect(struct cifsSesInfo *ses)
 {
 	struct list_head *tmp;
@@ -223,9 +226,9 @@ static void unicode_ssetup_strings(char **pbcc_area, struct cifsSesInfo *ses,
 		/* null user mount */
 		*bcc_ptr = 0;
 		*(bcc_ptr+1) = 0;
-	} else { /* 300 should be long enough for any conceivable user name */
+	} else {
 		bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, ses->userName,
-					  300, nls_cp);
+					  MAX_USERNAME_SIZE, nls_cp);
 	}
 	bcc_ptr += 2 * bytes_ret;
 	bcc_ptr += 2; /* account for null termination */
@@ -246,11 +249,10 @@ static void ascii_ssetup_strings(char **pbcc_area, struct cifsSesInfo *ses,
 	/* copy user */
 	if (ses->userName == NULL) {
 		/* BB what about null user mounts - check that we do this BB */
-	} else { /* 300 should be long enough for any conceivable user name */
-		strncpy(bcc_ptr, ses->userName, 300);
+	} else {
+		strncpy(bcc_ptr, ses->userName, MAX_USERNAME_SIZE);
 	}
-	/* BB improve check for overflow */
-	bcc_ptr += strnlen(ses->userName, 300);
+	bcc_ptr += strnlen(ses->userName, MAX_USERNAME_SIZE);
 	*bcc_ptr = 0;
 	bcc_ptr++; /* account for null termination */
 
@@ -284,7 +286,7 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifsSesInfo *ses,
 	int len;
 	char *data = *pbcc_area;
 
-	cFYI(1, ("bleft %d", bleft));
+	cFYI(1, "bleft %d", bleft);
 
 	/*
 	 * Windows servers do not always double null terminate their final
@@ -301,7 +303,7 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifsSesInfo *ses,
 
 	kfree(ses->serverOS);
 	ses->serverOS = cifs_strndup_from_ucs(data, bleft, true, nls_cp);
-	cFYI(1, ("serverOS=%s", ses->serverOS));
+	cFYI(1, "serverOS=%s", ses->serverOS);
 	len = (UniStrnlen((wchar_t *) data, bleft / 2) * 2) + 2;
 	data += len;
 	bleft -= len;
@@ -310,7 +312,7 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifsSesInfo *ses,
 
 	kfree(ses->serverNOS);
 	ses->serverNOS = cifs_strndup_from_ucs(data, bleft, true, nls_cp);
-	cFYI(1, ("serverNOS=%s", ses->serverNOS));
+	cFYI(1, "serverNOS=%s", ses->serverNOS);
 	len = (UniStrnlen((wchar_t *) data, bleft / 2) * 2) + 2;
 	data += len;
 	bleft -= len;
@@ -319,7 +321,7 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifsSesInfo *ses,
 
 	kfree(ses->serverDomain);
 	ses->serverDomain = cifs_strndup_from_ucs(data, bleft, true, nls_cp);
-	cFYI(1, ("serverDomain=%s", ses->serverDomain));
+	cFYI(1, "serverDomain=%s", ses->serverDomain);
 
 	return;
 }
@@ -332,7 +334,7 @@ static int decode_ascii_ssetup(char **pbcc_area, int bleft,
 	int len;
 	char *bcc_ptr = *pbcc_area;
 
-	cFYI(1, ("decode sessetup ascii. bleft %d", bleft));
+	cFYI(1, "decode sessetup ascii. bleft %d", bleft);
 
 	len = strnlen(bcc_ptr, bleft);
 	if (len >= bleft)
@@ -344,7 +346,7 @@ static int decode_ascii_ssetup(char **pbcc_area, int bleft,
 	if (ses->serverOS)
 		strncpy(ses->serverOS, bcc_ptr, len);
 	if (strncmp(ses->serverOS, "OS/2", 4) == 0) {
-			cFYI(1, ("OS/2 server"));
+			cFYI(1, "OS/2 server");
 			ses->flags |= CIFS_SES_OS2;
 	}
 
@@ -373,7 +375,7 @@ static int decode_ascii_ssetup(char **pbcc_area, int bleft,
 	/* BB For newer servers which do not support Unicode,
 	   but thus do return domain here we could add parsing
 	   for it later, but it is not very important */
-	cFYI(1, ("ascii: bytes left %d", bleft));
+	cFYI(1, "ascii: bytes left %d", bleft);
 
 	return rc;
 }
@@ -384,16 +386,16 @@ static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
 	CHALLENGE_MESSAGE *pblob = (CHALLENGE_MESSAGE *)bcc_ptr;
 
 	if (blob_len < sizeof(CHALLENGE_MESSAGE)) {
-		cERROR(1, ("challenge blob len %d too small", blob_len));
+		cERROR(1, "challenge blob len %d too small", blob_len);
 		return -EINVAL;
 	}
 
 	if (memcmp(pblob->Signature, "NTLMSSP", 8)) {
-		cERROR(1, ("blob signature incorrect %s", pblob->Signature));
+		cERROR(1, "blob signature incorrect %s", pblob->Signature);
 		return -EINVAL;
 	}
 	if (pblob->MessageType != NtLmChallenge) {
-		cERROR(1, ("Incorrect message type %d", pblob->MessageType));
+		cERROR(1, "Incorrect message type %d", pblob->MessageType);
 		return -EINVAL;
 	}
 
@@ -447,7 +449,7 @@ static void build_ntlmssp_negotiate_blob(unsigned char *pbuffer,
    This function returns the length of the data in the blob */
 static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
 				   struct cifsSesInfo *ses,
-				   const struct nls_table *nls_cp, int first)
+				   const struct nls_table *nls_cp, bool first)
 {
 	AUTHENTICATE_MESSAGE *sec_blob = (AUTHENTICATE_MESSAGE *)pbuffer;
 	__u32 flags;
@@ -546,7 +548,7 @@ static void setup_ntlmssp_neg_req(SESSION_SETUP_ANDX *pSMB,
 
 static int setup_ntlmssp_auth_req(SESSION_SETUP_ANDX *pSMB,
 				  struct cifsSesInfo *ses,
-				  const struct nls_table *nls, int first_time)
+				  const struct nls_table *nls, bool first_time)
 {
 	int bloblen;
 
@@ -559,8 +561,8 @@ static int setup_ntlmssp_auth_req(SESSION_SETUP_ANDX *pSMB,
 #endif
 
 int
-CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
-		const struct nls_table *nls_cp)
+CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses,
+	       const struct nls_table *nls_cp)
 {
 	int rc = 0;
 	int wct;
@@ -577,13 +579,18 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
 	int bytes_remaining;
 	struct key *spnego_key = NULL;
 	__le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */
+	bool first_time;
 
 	if (ses == NULL)
 		return -EINVAL;
 
+	read_lock(&cifs_tcp_ses_lock);
+	first_time = is_first_ses_reconnect(ses);
+	read_unlock(&cifs_tcp_ses_lock);
+
 	type = ses->server->secType;
 
-	cFYI(1, ("sess setup type %d", type));
+	cFYI(1, "sess setup type %d", type);
 ssetup_ntlmssp_authenticate:
 	if (phase == NtLmChallenge)
 		phase = NtLmAuthenticate; /* if ntlmssp, now final phase */
@@ -664,7 +671,7 @@ ssetup_ntlmssp_authenticate:
 		changed to do higher than lanman dialect and
 		we reconnected would we ever calc signing_key? */
 
-		cFYI(1, ("Negotiating LANMAN setting up strings"));
+		cFYI(1, "Negotiating LANMAN setting up strings");
 		/* Unicode not allowed for LANMAN dialects */
 		ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
 #endif
@@ -723,15 +730,7 @@ ssetup_ntlmssp_authenticate:
 
 		/* calculate session key */
 		setup_ntlmv2_rsp(ses, v2_sess_key, nls_cp);
-		if (first_time) /* should this be moved into common code
-				   with similar ntlmv2 path? */
-		/*   cifs_calculate_ntlmv2_mac_key(ses->server->mac_signing_key,
-				response BB FIXME, v2_sess_key); */
-
-		/* copy session key */
-
-	/*	memcpy(bcc_ptr, (char *)ntlm_session_key,LM2_SESS_KEY_SIZE);
-		bcc_ptr += LM2_SESS_KEY_SIZE; */
+		/* FIXME: calculate MAC key */
 		memcpy(bcc_ptr, (char *)v2_sess_key,
 		       sizeof(struct ntlmv2_resp));
 		bcc_ptr += sizeof(struct ntlmv2_resp);
@@ -744,7 +743,7 @@ ssetup_ntlmssp_authenticate:
 			unicode_ssetup_strings(&bcc_ptr, ses, nls_cp);
 		} else
 			ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
-	} else if (type == Kerberos || type == MSKerberos) {
+	} else if (type == Kerberos) {
 #ifdef CONFIG_CIFS_UPCALL
 		struct cifs_spnego_msg *msg;
 		spnego_key = cifs_get_spnego_key(ses);
@@ -758,17 +757,17 @@ ssetup_ntlmssp_authenticate:
 		/* check version field to make sure that cifs.upcall is
 		   sending us a response in an expected form */
 		if (msg->version != CIFS_SPNEGO_UPCALL_VERSION) {
-			cERROR(1, ("incorrect version of cifs.upcall (expected"
+			cERROR(1, "incorrect version of cifs.upcall (expected"
 				   " %d but got %d)",
-				   CIFS_SPNEGO_UPCALL_VERSION, msg->version));
+				   CIFS_SPNEGO_UPCALL_VERSION, msg->version);
 			rc = -EKEYREJECTED;
 			goto ssetup_exit;
 		}
 		/* bail out if key is too long */
 		if (msg->sesskey_len >
 		    sizeof(ses->server->mac_signing_key.data.krb5)) {
-			cERROR(1, ("Kerberos signing key too long (%u bytes)",
-				msg->sesskey_len));
+			cERROR(1, "Kerberos signing key too long (%u bytes)",
+				msg->sesskey_len);
 			rc = -EOVERFLOW;
 			goto ssetup_exit;
 		}
@@ -796,7 +795,7 @@ ssetup_ntlmssp_authenticate:
 		/* BB: is this right? */
 			ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
 #else /* ! CONFIG_CIFS_UPCALL */
-		cERROR(1, ("Kerberos negotiated but upcall support disabled!"));
+		cERROR(1, "Kerberos negotiated but upcall support disabled!");
 		rc = -ENOSYS;
 		goto ssetup_exit;
 #endif /* CONFIG_CIFS_UPCALL */
@@ -804,12 +803,12 @@ ssetup_ntlmssp_authenticate:
 #ifdef CONFIG_CIFS_EXPERIMENTAL
 		if (type == RawNTLMSSP) {
 			if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) {
-				cERROR(1, ("NTLMSSP requires Unicode support"));
+				cERROR(1, "NTLMSSP requires Unicode support");
 				rc = -ENOSYS;
 				goto ssetup_exit;
 			}
 
-			cFYI(1, ("ntlmssp session setup phase %d", phase));
+			cFYI(1, "ntlmssp session setup phase %d", phase);
 			pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
 			capabilities |= CAP_EXTENDED_SECURITY;
 			pSMB->req.Capabilities |= cpu_to_le32(capabilities);
@@ -827,7 +826,7 @@ ssetup_ntlmssp_authenticate:
 				   on the response (challenge) */
 				smb_buf->Uid = ses->Suid;
 			} else {
-				cERROR(1, ("invalid phase %d", phase));
+				cERROR(1, "invalid phase %d", phase);
 				rc = -ENOSYS;
 				goto ssetup_exit;
 			}
@@ -839,12 +838,12 @@ ssetup_ntlmssp_authenticate:
 			}
 			unicode_oslm_strings(&bcc_ptr, nls_cp);
 		} else {
-			cERROR(1, ("secType %d not supported!", type));
+			cERROR(1, "secType %d not supported!", type);
 			rc = -ENOSYS;
 			goto ssetup_exit;
 		}
 #else
-		cERROR(1, ("secType %d not supported!", type));
+		cERROR(1, "secType %d not supported!", type);
 		rc = -ENOSYS;
 		goto ssetup_exit;
 #endif
@@ -862,7 +861,7 @@ ssetup_ntlmssp_authenticate:
 			  CIFS_STD_OP /* not long */ | CIFS_LOG_ERROR);
 	/* SMB request buf freed in SendReceive2 */
 
-	cFYI(1, ("ssetup rc from sendrecv2 is %d", rc));
+	cFYI(1, "ssetup rc from sendrecv2 is %d", rc);
 
 	pSMB = (SESSION_SETUP_ANDX *)iov[0].iov_base;
 	smb_buf = (struct smb_hdr *)iov[0].iov_base;
@@ -870,7 +869,7 @@ ssetup_ntlmssp_authenticate:
 	if ((type == RawNTLMSSP) && (smb_buf->Status.CifsError ==
 			cpu_to_le32(NT_STATUS_MORE_PROCESSING_REQUIRED))) {
 		if (phase != NtLmNegotiate) {
-			cERROR(1, ("Unexpected more processing error"));
+			cERROR(1, "Unexpected more processing error");
 			goto ssetup_exit;
 		}
 		/* NTLMSSP Negotiate sent now processing challenge (response) */
@@ -882,14 +881,14 @@ ssetup_ntlmssp_authenticate:
 
 	if ((smb_buf->WordCount != 3) && (smb_buf->WordCount != 4)) {
 		rc = -EIO;
-		cERROR(1, ("bad word count %d", smb_buf->WordCount));
+		cERROR(1, "bad word count %d", smb_buf->WordCount);
 		goto ssetup_exit;
 	}
 	action = le16_to_cpu(pSMB->resp.Action);
 	if (action & GUEST_LOGIN)
-		cFYI(1, ("Guest login")); /* BB mark SesInfo struct? */
+		cFYI(1, "Guest login"); /* BB mark SesInfo struct? */
 	ses->Suid = smb_buf->Uid;   /* UID left in wire format (le) */
-	cFYI(1, ("UID = %d ", ses->Suid));
+	cFYI(1, "UID = %d ", ses->Suid);
 	/* response can have either 3 or 4 word count - Samba sends 3 */
 	/* and lanman response is 3 */
 	bytes_remaining = BCC(smb_buf);
@@ -899,7 +898,7 @@ ssetup_ntlmssp_authenticate:
 		__u16 blob_len;
 		blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength);
 		if (blob_len > bytes_remaining) {
-			cERROR(1, ("bad security blob length %d", blob_len));
+			cERROR(1, "bad security blob length %d", blob_len);
 			rc = -EINVAL;
 			goto ssetup_exit;
 		}
@@ -933,7 +932,7 @@ ssetup_exit:
 	}
 	kfree(str_area);
 	if (resp_buf_type == CIFS_SMALL_BUFFER) {
-		cFYI(1, ("ssetup freeing small buf %p", iov[0].iov_base));
+		cFYI(1, "ssetup freeing small buf %p", iov[0].iov_base);
 		cifs_small_buf_release(iov[0].iov_base);
 	} else if (resp_buf_type == CIFS_LARGE_BUFFER)
 		cifs_buf_release(iov[0].iov_base);
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
index 93fb09a99c69..192ea51af20f 100644
--- a/fs/cifs/smbencrypt.c
+++ b/fs/cifs/smbencrypt.c
@@ -24,6 +24,7 @@
 */
 
 #include <linux/module.h>
+#include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/string.h>
 #include <linux/kernel.h>
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 07b8e71544ee..82f78c4d6978 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -22,6 +22,7 @@
 
 #include <linux/fs.h>
 #include <linux/list.h>
+#include <linux/gfp.h>
 #include <linux/wait.h>
 #include <linux/net.h>
 #include <linux/delay.h>
@@ -34,7 +35,6 @@
 #include "cifs_debug.h"
 
 extern mempool_t *cifs_mid_poolp;
-extern struct kmem_cache *cifs_oplock_cachep;
 
 static struct mid_q_entry *
 AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
@@ -42,7 +42,7 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
 	struct mid_q_entry *temp;
 
 	if (server == NULL) {
-		cERROR(1, ("Null TCP session in AllocMidQEntry"));
+		cERROR(1, "Null TCP session in AllocMidQEntry");
 		return NULL;
 	}
 
@@ -54,7 +54,7 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
 		temp->mid = smb_buffer->Mid;	/* always LE */
 		temp->pid = current->pid;
 		temp->command = smb_buffer->Command;
-		cFYI(1, ("For smb_command %d", temp->command));
+		cFYI(1, "For smb_command %d", temp->command);
 	/*	do_gettimeofday(&temp->when_sent);*/ /* easier to use jiffies */
 		/* when mid allocated can be before when sent */
 		temp->when_alloc = jiffies;
@@ -139,7 +139,7 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
 		total_len += iov[i].iov_len;
 
 	smb_buffer->smb_buf_length = cpu_to_be32(smb_buffer->smb_buf_length);
-	cFYI(1, ("Sending smb:  total_len %d", total_len));
+	cFYI(1, "Sending smb:  total_len %d", total_len);
 	dump_smb(smb_buffer, len);
 
 	i = 0;
@@ -167,9 +167,8 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
 			   reconnect which may clear the network problem.
 			*/
 			if ((i >= 14) || (!server->noblocksnd && (i > 2))) {
-				cERROR(1,
-				   ("sends on sock %p stuck for 15 seconds",
-				    ssocket));
+				cERROR(1, "sends on sock %p stuck for 15 seconds",
+				    ssocket);
 				rc = -EAGAIN;
 				break;
 			}
@@ -183,13 +182,13 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
 			total_len = 0;
 			break;
 		} else if (rc > total_len) {
-			cERROR(1, ("sent %d requested %d", rc, total_len));
+			cERROR(1, "sent %d requested %d", rc, total_len);
 			break;
 		}
 		if (rc == 0) {
 			/* should never happen, letting socket clear before
 			   retrying is our only obvious option here */
-			cERROR(1, ("tcp sent no data"));
+			cERROR(1, "tcp sent no data");
 			msleep(500);
 			continue;
 		}
@@ -212,8 +211,8 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
 	}
 
 	if ((total_len > 0) && (total_len != smb_buf_length + 4)) {
-		cFYI(1, ("partial send (%d remaining), terminating session",
-			total_len));
+		cFYI(1, "partial send (%d remaining), terminating session",
+			total_len);
 		/* If we have only sent part of an SMB then the next SMB
 		   could be taken as the remainder of this one.  We need
 		   to kill the socket so the server throws away the partial
@@ -222,7 +221,7 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
 	}
 
 	if (rc < 0) {
-		cERROR(1, ("Error %d sending data on socket to server", rc));
+		cERROR(1, "Error %d sending data on socket to server", rc);
 	} else
 		rc = 0;
 
@@ -295,7 +294,7 @@ static int allocate_mid(struct cifsSesInfo *ses, struct smb_hdr *in_buf,
 	}
 
 	if (ses->server->tcpStatus == CifsNeedReconnect) {
-		cFYI(1, ("tcp session dead - return to caller to retry"));
+		cFYI(1, "tcp session dead - return to caller to retry");
 		return -EAGAIN;
 	}
 
@@ -347,7 +346,7 @@ static int wait_for_response(struct cifsSesInfo *ses,
 			lrt += time_to_wait;
 			if (time_after(jiffies, lrt)) {
 				/* No replies for time_to_wait. */
-				cERROR(1, ("server not responding"));
+				cERROR(1, "server not responding");
 				return -1;
 			}
 		} else {
@@ -378,7 +377,7 @@ SendReceiveNoRsp(const unsigned int xid, struct cifsSesInfo *ses,
 	iov[0].iov_len = in_buf->smb_buf_length + 4;
 	flags |= CIFS_NO_RESP;
 	rc = SendReceive2(xid, ses, iov, 1, &resp_buf_type, flags);
-	cFYI(DBG2, ("SendRcvNoRsp flags %d rc %d", flags, rc));
+	cFYI(DBG2, "SendRcvNoRsp flags %d rc %d", flags, rc);
 
 	return rc;
 }
@@ -401,7 +400,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
 
 	if ((ses == NULL) || (ses->server == NULL)) {
 		cifs_small_buf_release(in_buf);
-		cERROR(1, ("Null session"));
+		cERROR(1, "Null session");
 		return -EIO;
 	}
 
@@ -470,7 +469,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
 	else if (long_op == CIFS_BLOCKING_OP)
 		timeout = 0x7FFFFFFF; /*  large, but not so large as to wrap */
 	else {
-		cERROR(1, ("unknown timeout flag %d", long_op));
+		cERROR(1, "unknown timeout flag %d", long_op);
 		rc = -EIO;
 		goto out;
 	}
@@ -489,8 +488,8 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
 	spin_lock(&GlobalMid_Lock);
 
 	if (midQ->resp_buf == NULL) {
-		cERROR(1, ("No response to cmd %d mid %d",
-			midQ->command, midQ->mid));
+		cERROR(1, "No response to cmd %d mid %d",
+			midQ->command, midQ->mid);
 		if (midQ->midState == MID_REQUEST_SUBMITTED) {
 			if (ses->server->tcpStatus == CifsExiting)
 				rc = -EHOSTDOWN;
@@ -503,7 +502,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
 		if (rc != -EHOSTDOWN) {
 			if (midQ->midState == MID_RETRY_NEEDED) {
 				rc = -EAGAIN;
-				cFYI(1, ("marking request for retry"));
+				cFYI(1, "marking request for retry");
 			} else {
 				rc = -EIO;
 			}
@@ -520,8 +519,8 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
 	receive_len = midQ->resp_buf->smb_buf_length;
 
 	if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) {
-		cERROR(1, ("Frame too large received.  Length: %d  Xid: %d",
-			receive_len, xid));
+		cERROR(1, "Frame too large received.  Length: %d  Xid: %d",
+			receive_len, xid);
 		rc = -EIO;
 		goto out;
 	}
@@ -547,7 +546,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
 						&ses->server->mac_signing_key,
 						midQ->sequence_number+1);
 			if (rc) {
-				cERROR(1, ("Unexpected SMB signature"));
+				cERROR(1, "Unexpected SMB signature");
 				/* BB FIXME add code to kill session */
 			}
 		}
@@ -568,7 +567,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
 						   DeleteMidQEntry */
 	} else {
 		rc = -EIO;
-		cFYI(1, ("Bad MID state?"));
+		cFYI(1, "Bad MID state?");
 	}
 
 out:
@@ -590,11 +589,11 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
 	struct mid_q_entry *midQ;
 
 	if (ses == NULL) {
-		cERROR(1, ("Null smb session"));
+		cERROR(1, "Null smb session");
 		return -EIO;
 	}
 	if (ses->server == NULL) {
-		cERROR(1, ("Null tcp session"));
+		cERROR(1, "Null tcp session");
 		return -EIO;
 	}
 
@@ -606,8 +605,8 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
 	   use ses->maxReq */
 
 	if (in_buf->smb_buf_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) {
-		cERROR(1, ("Illegal length, greater than maximum frame, %d",
-			   in_buf->smb_buf_length));
+		cERROR(1, "Illegal length, greater than maximum frame, %d",
+			   in_buf->smb_buf_length);
 		return -EIO;
 	}
 
@@ -664,7 +663,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
 	else if (long_op == CIFS_BLOCKING_OP)
 		timeout = 0x7FFFFFFF; /* large but no so large as to wrap */
 	else {
-		cERROR(1, ("unknown timeout flag %d", long_op));
+		cERROR(1, "unknown timeout flag %d", long_op);
 		rc = -EIO;
 		goto out;
 	}
@@ -680,8 +679,8 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
 
 	spin_lock(&GlobalMid_Lock);
 	if (midQ->resp_buf == NULL) {
-		cERROR(1, ("No response for cmd %d mid %d",
-			  midQ->command, midQ->mid));
+		cERROR(1, "No response for cmd %d mid %d",
+			  midQ->command, midQ->mid);
 		if (midQ->midState == MID_REQUEST_SUBMITTED) {
 			if (ses->server->tcpStatus == CifsExiting)
 				rc = -EHOSTDOWN;
@@ -694,7 +693,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
 		if (rc != -EHOSTDOWN) {
 			if (midQ->midState == MID_RETRY_NEEDED) {
 				rc = -EAGAIN;
-				cFYI(1, ("marking request for retry"));
+				cFYI(1, "marking request for retry");
 			} else {
 				rc = -EIO;
 			}
@@ -711,8 +710,8 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
 	receive_len = midQ->resp_buf->smb_buf_length;
 
 	if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) {
-		cERROR(1, ("Frame too large received.  Length: %d  Xid: %d",
-			receive_len, xid));
+		cERROR(1, "Frame too large received.  Length: %d  Xid: %d",
+			receive_len, xid);
 		rc = -EIO;
 		goto out;
 	}
@@ -735,7 +734,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
 						&ses->server->mac_signing_key,
 						midQ->sequence_number+1);
 			if (rc) {
-				cERROR(1, ("Unexpected SMB signature"));
+				cERROR(1, "Unexpected SMB signature");
 				/* BB FIXME add code to kill session */
 			}
 		}
@@ -752,7 +751,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
 			BCC(out_buf) = le16_to_cpu(BCC_LE(out_buf));
 	} else {
 		rc = -EIO;
-		cERROR(1, ("Bad MID state?"));
+		cERROR(1, "Bad MID state?");
 	}
 
 out:
@@ -823,13 +822,13 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
 	struct cifsSesInfo *ses;
 
 	if (tcon == NULL || tcon->ses == NULL) {
-		cERROR(1, ("Null smb session"));
+		cERROR(1, "Null smb session");
 		return -EIO;
 	}
 	ses = tcon->ses;
 
 	if (ses->server == NULL) {
-		cERROR(1, ("Null tcp session"));
+		cERROR(1, "Null tcp session");
 		return -EIO;
 	}
 
@@ -841,8 +840,8 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
 	   use ses->maxReq */
 
 	if (in_buf->smb_buf_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) {
-		cERROR(1, ("Illegal length, greater than maximum frame, %d",
-			   in_buf->smb_buf_length));
+		cERROR(1, "Illegal length, greater than maximum frame, %d",
+			   in_buf->smb_buf_length);
 		return -EIO;
 	}
 
@@ -932,8 +931,8 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
 		spin_unlock(&GlobalMid_Lock);
 		receive_len = midQ->resp_buf->smb_buf_length;
 	} else {
-		cERROR(1, ("No response for cmd %d mid %d",
-			  midQ->command, midQ->mid));
+		cERROR(1, "No response for cmd %d mid %d",
+			  midQ->command, midQ->mid);
 		if (midQ->midState == MID_REQUEST_SUBMITTED) {
 			if (ses->server->tcpStatus == CifsExiting)
 				rc = -EHOSTDOWN;
@@ -946,7 +945,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
 		if (rc != -EHOSTDOWN) {
 			if (midQ->midState == MID_RETRY_NEEDED) {
 				rc = -EAGAIN;
-				cFYI(1, ("marking request for retry"));
+				cFYI(1, "marking request for retry");
 			} else {
 				rc = -EIO;
 			}
@@ -957,8 +956,8 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
 	}
 
 	if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) {
-		cERROR(1, ("Frame too large received.  Length: %d  Xid: %d",
-			receive_len, xid));
+		cERROR(1, "Frame too large received.  Length: %d  Xid: %d",
+			receive_len, xid);
 		rc = -EIO;
 		goto out;
 	}
@@ -967,7 +966,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
 
 	if ((out_buf == NULL) || (midQ->midState != MID_RESPONSE_RECEIVED)) {
 		rc = -EIO;
-		cERROR(1, ("Bad MID state?"));
+		cERROR(1, "Bad MID state?");
 		goto out;
 	}
 
@@ -985,7 +984,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
 					   &ses->server->mac_signing_key,
 					   midQ->sequence_number+1);
 		if (rc) {
-			cERROR(1, ("Unexpected SMB signature"));
+			cERROR(1, "Unexpected SMB signature");
 			/* BB FIXME add code to kill session */
 		}
 	}
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index a75afa3dd9e1..a1509207bfa6 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -21,6 +21,7 @@
 
 #include <linux/fs.h>
 #include <linux/posix_acl_xattr.h>
+#include <linux/slab.h>
 #include "cifsfs.h"
 #include "cifspdu.h"
 #include "cifsglob.h"
@@ -69,12 +70,12 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name)
 		return rc;
 	}
 	if (ea_name == NULL) {
-		cFYI(1, ("Null xattr names not supported"));
+		cFYI(1, "Null xattr names not supported");
 	} else if (strncmp(ea_name, CIFS_XATTR_USER_PREFIX, 5)
 		&& (strncmp(ea_name, CIFS_XATTR_OS2_PREFIX, 4))) {
 		cFYI(1,
-		    ("illegal xattr request %s (only user namespace supported)",
-			ea_name));
+		     "illegal xattr request %s (only user namespace supported)",
+		     ea_name);
 		/* BB what if no namespace prefix? */
 		/* Should we just pass them to server, except for
 		system and perhaps security prefixes? */
@@ -130,19 +131,19 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
 		search server for EAs or streams to
 		returns as xattrs */
 	if (value_size > MAX_EA_VALUE_SIZE) {
-		cFYI(1, ("size of EA value too large"));
+		cFYI(1, "size of EA value too large");
 		kfree(full_path);
 		FreeXid(xid);
 		return -EOPNOTSUPP;
 	}
 
 	if (ea_name == NULL) {
-		cFYI(1, ("Null xattr names not supported"));
+		cFYI(1, "Null xattr names not supported");
 	} else if (strncmp(ea_name, CIFS_XATTR_USER_PREFIX, 5) == 0) {
 		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
 			goto set_ea_exit;
 		if (strncmp(ea_name, CIFS_XATTR_DOS_ATTRIB, 14) == 0)
-			cFYI(1, ("attempt to set cifs inode metadata"));
+			cFYI(1, "attempt to set cifs inode metadata");
 
 		ea_name += 5; /* skip past user. prefix */
 		rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, ea_value,
@@ -168,9 +169,9 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
 					ACL_TYPE_ACCESS, cifs_sb->local_nls,
 					cifs_sb->mnt_cifs_flags &
 						CIFS_MOUNT_MAP_SPECIAL_CHR);
-			cFYI(1, ("set POSIX ACL rc %d", rc));
+			cFYI(1, "set POSIX ACL rc %d", rc);
 #else
-			cFYI(1, ("set POSIX ACL not supported"));
+			cFYI(1, "set POSIX ACL not supported");
 #endif
 		} else if (strncmp(ea_name, POSIX_ACL_XATTR_DEFAULT,
 				   strlen(POSIX_ACL_XATTR_DEFAULT)) == 0) {
@@ -181,13 +182,13 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
 					ACL_TYPE_DEFAULT, cifs_sb->local_nls,
 					cifs_sb->mnt_cifs_flags &
 						CIFS_MOUNT_MAP_SPECIAL_CHR);
-			cFYI(1, ("set POSIX default ACL rc %d", rc));
+			cFYI(1, "set POSIX default ACL rc %d", rc);
 #else
-			cFYI(1, ("set default POSIX ACL not supported"));
+			cFYI(1, "set default POSIX ACL not supported");
 #endif
 		} else {
-			cFYI(1, ("illegal xattr request %s (only user namespace"
-				 " supported)", ea_name));
+			cFYI(1, "illegal xattr request %s (only user namespace"
+				" supported)", ea_name);
 		  /* BB what if no namespace prefix? */
 		  /* Should we just pass them to server, except for
 		  system and perhaps security prefixes? */
@@ -234,17 +235,17 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
 	/* return dos attributes as pseudo xattr */
 	/* return alt name if available as pseudo attr */
 	if (ea_name == NULL) {
-		cFYI(1, ("Null xattr names not supported"));
+		cFYI(1, "Null xattr names not supported");
 	} else if (strncmp(ea_name, CIFS_XATTR_USER_PREFIX, 5) == 0) {
 		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
 			goto get_ea_exit;
 
 		if (strncmp(ea_name, CIFS_XATTR_DOS_ATTRIB, 14) == 0) {
-			cFYI(1, ("attempt to query cifs inode metadata"));
+			cFYI(1, "attempt to query cifs inode metadata");
 			/* revalidate/getattr then populate from inode */
 		} /* BB add else when above is implemented */
 		ea_name += 5; /* skip past user. prefix */
-		rc = CIFSSMBQueryEA(xid, pTcon, full_path, ea_name, ea_value,
+		rc = CIFSSMBQAllEAs(xid, pTcon, full_path, ea_name, ea_value,
 			buf_size, cifs_sb->local_nls,
 			cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
 	} else if (strncmp(ea_name, CIFS_XATTR_OS2_PREFIX, 4) == 0) {
@@ -252,7 +253,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
 			goto get_ea_exit;
 
 		ea_name += 4; /* skip past os2. prefix */
-		rc = CIFSSMBQueryEA(xid, pTcon, full_path, ea_name, ea_value,
+		rc = CIFSSMBQAllEAs(xid, pTcon, full_path, ea_name, ea_value,
 			buf_size, cifs_sb->local_nls,
 			cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
 	} else if (strncmp(ea_name, POSIX_ACL_XATTR_ACCESS,
@@ -286,7 +287,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
 		}
 #endif /* EXPERIMENTAL */
 #else
-		cFYI(1, ("query POSIX ACL not supported yet"));
+		cFYI(1, "query POSIX ACL not supported yet");
 #endif /* CONFIG_CIFS_POSIX */
 	} else if (strncmp(ea_name, POSIX_ACL_XATTR_DEFAULT,
 			  strlen(POSIX_ACL_XATTR_DEFAULT)) == 0) {
@@ -298,18 +299,18 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
 				cifs_sb->mnt_cifs_flags &
 					CIFS_MOUNT_MAP_SPECIAL_CHR);
 #else
-		cFYI(1, ("query POSIX default ACL not supported yet"));
+		cFYI(1, "query POSIX default ACL not supported yet");
 #endif
 	} else if (strncmp(ea_name,
 		  CIFS_XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) == 0) {
-		cFYI(1, ("Trusted xattr namespace not supported yet"));
+		cFYI(1, "Trusted xattr namespace not supported yet");
 	} else if (strncmp(ea_name,
 		  CIFS_XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) == 0) {
-		cFYI(1, ("Security xattr namespace not supported yet"));
+		cFYI(1, "Security xattr namespace not supported yet");
 	} else
 		cFYI(1,
-		    ("illegal xattr request %s (only user namespace supported)",
-			ea_name));
+		    "illegal xattr request %s (only user namespace supported)",
+		     ea_name);
 
 	/* We could add an additional check for streams ie
 	    if proc/fs/cifs/streamstoxattr is set then
@@ -364,8 +365,8 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size)
 	/* if proc/fs/cifs/streamstoxattr is set then
 		search server for EAs or streams to
 		returns as xattrs */
-	rc = CIFSSMBQAllEAs(xid, pTcon, full_path, data, buf_size,
-				cifs_sb->local_nls,
+	rc = CIFSSMBQAllEAs(xid, pTcon, full_path, NULL, data,
+				buf_size, cifs_sb->local_nls,
 				cifs_sb->mnt_cifs_flags &
 					CIFS_MOUNT_MAP_SPECIAL_CHR);
 
diff --git a/fs/coda/coda_int.h b/fs/coda/coda_int.h
index d99860a33890..6b443ff43a19 100644
--- a/fs/coda/coda_int.h
+++ b/fs/coda/coda_int.h
@@ -11,8 +11,7 @@ extern int coda_fake_statfs;
 
 void coda_destroy_inodecache(void);
 int coda_init_inodecache(void);
-int coda_fsync(struct file *coda_file, struct dentry *coda_dentry,
-	       int datasync);
+int coda_fsync(struct file *coda_file, int datasync);
 void coda_sysctl_init(void);
 void coda_sysctl_clean(void);
 
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 4bb9d0a5decc..ccd98b0f2b0b 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -12,6 +12,7 @@
 #include <linux/kernel.h>
 #include <linux/time.h>
 #include <linux/fs.h>
+#include <linux/slab.h>
 #include <linux/file.h>
 #include <linux/stat.h>
 #include <linux/errno.h>
diff --git a/fs/coda/file.c b/fs/coda/file.c
index ffd42815fda1..ad3cd2abeeb4 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -17,6 +17,7 @@
 #include <linux/errno.h>
 #include <linux/smp_lock.h>
 #include <linux/string.h>
+#include <linux/slab.h>
 #include <asm/uaccess.h>
 
 #include <linux/coda.h>
@@ -201,10 +202,10 @@ int coda_release(struct inode *coda_inode, struct file *coda_file)
 	return 0;
 }
 
-int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, int datasync)
+int coda_fsync(struct file *coda_file, int datasync)
 {
 	struct file *host_file;
-	struct inode *coda_inode = coda_dentry->d_inode;
+	struct inode *coda_inode = coda_file->f_path.dentry->d_inode;
 	struct coda_file_info *cfi;
 	int err = 0;
 
@@ -216,7 +217,7 @@ int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, int datasync)
 	BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
 	host_file = cfi->cfi_container;
 
-	err = vfs_fsync(host_file, host_file->f_path.dentry, datasync);
+	err = vfs_fsync(host_file, datasync);
 	if ( !err && !datasync ) {
 		lock_kernel();
 		err = venus_fsync(coda_inode->i_sb, coda_i2f(coda_inode));
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 830f51abb971..d97f9935a028 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -18,6 +18,7 @@
 #include <linux/smp_lock.h>
 #include <linux/file.h>
 #include <linux/vfs.h>
+#include <linux/slab.h>
 
 #include <asm/system.h>
 #include <asm/uaccess.h>
@@ -166,6 +167,10 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
 		return -EBUSY;
 	}
 
+	error = bdi_setup_and_register(&vc->bdi, "coda", BDI_CAP_MAP_COPY);
+	if (error)
+		goto bdi_err;
+
 	vc->vc_sb = sb;
 
 	sb->s_fs_info = vc;
@@ -174,6 +179,7 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_blocksize_bits = 12;
 	sb->s_magic = CODA_SUPER_MAGIC;
 	sb->s_op = &coda_super_operations;
+	sb->s_bdi = &vc->bdi;
 
 	/* get root fid from Venus: this needs the root inode */
 	error = venus_rootfid(sb, &fid);
@@ -199,6 +205,8 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
         return 0;
 
  error:
+	bdi_destroy(&vc->bdi);
+ bdi_err:
 	if (root)
 		iput(root);
 	if (vc)
@@ -209,6 +217,7 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
 
 static void coda_put_super(struct super_block *sb)
 {
+	bdi_destroy(&coda_vcp(sb)->bdi);
 	coda_vcp(sb)->vc_sb = NULL;
 	sb->s_fs_info = NULL;
 
diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c
index 773f2ce9aa06..ca25d96d45c9 100644
--- a/fs/coda/pioctl.c
+++ b/fs/coda/pioctl.c
@@ -1,6 +1,6 @@
 /*
  * Pioctl operations for Coda.
- * Original version: (C) 1996 Peter Braam 
+ * Original version: (C) 1996 Peter Braam
  * Rewritten for Linux 2.1: (C) 1997 Carnegie Mellon University
  *
  * Carnegie Mellon encourages users of this code to contribute improvements
@@ -23,21 +23,22 @@
 #include <linux/coda_fs_i.h>
 #include <linux/coda_psdev.h>
 
+#include <linux/smp_lock.h>
+
 /* pioctl ops */
 static int coda_ioctl_permission(struct inode *inode, int mask);
-static int coda_pioctl(struct inode * inode, struct file * filp, 
-                       unsigned int cmd, unsigned long user_data);
+static long coda_pioctl(struct file *filp, unsigned int cmd,
+			unsigned long user_data);
 
 /* exported from this file */
-const struct inode_operations coda_ioctl_inode_operations =
-{
+const struct inode_operations coda_ioctl_inode_operations = {
 	.permission	= coda_ioctl_permission,
 	.setattr	= coda_setattr,
 };
 
 const struct file_operations coda_ioctl_operations = {
 	.owner		= THIS_MODULE,
-	.ioctl		= coda_pioctl,
+	.unlocked_ioctl	= coda_pioctl,
 };
 
 /* the coda pioctl inode ops */
@@ -46,48 +47,53 @@ static int coda_ioctl_permission(struct inode *inode, int mask)
 	return (mask & MAY_EXEC) ? -EACCES : 0;
 }
 
-static int coda_pioctl(struct inode * inode, struct file * filp, 
-                       unsigned int cmd, unsigned long user_data)
+static long coda_pioctl(struct file *filp, unsigned int cmd,
+			unsigned long user_data)
 {
 	struct path path;
-        int error;
+	int error;
 	struct PioctlData data;
-        struct inode *target_inode = NULL;
-        struct coda_inode_info *cnp;
+	struct inode *inode = filp->f_dentry->d_inode;
+	struct inode *target_inode = NULL;
+	struct coda_inode_info *cnp;
 
-        /* get the Pioctl data arguments from user space */
-        if (copy_from_user(&data, (void __user *)user_data, sizeof(data))) {
-	    return -EINVAL;
-	}
-       
-        /* 
-         * Look up the pathname. Note that the pathname is in 
-         * user memory, and namei takes care of this
-         */
-        if (data.follow) {
-                error = user_path(data.path, &path);
-	} else {
-	        error = user_lpath(data.path, &path);
+	lock_kernel();
+
+	/* get the Pioctl data arguments from user space */
+	if (copy_from_user(&data, (void __user *)user_data, sizeof(data))) {
+		error = -EINVAL;
+		goto out;
 	}
-		
-	if ( error ) {
-		return error;
-        } else {
+
+	/*
+	 * Look up the pathname. Note that the pathname is in
+	 * user memory, and namei takes care of this
+	 */
+	if (data.follow)
+		error = user_path(data.path, &path);
+	else
+		error = user_lpath(data.path, &path);
+
+	if (error)
+		goto out;
+	else
 		target_inode = path.dentry->d_inode;
-	}
-	
+
 	/* return if it is not a Coda inode */
-	if ( target_inode->i_sb != inode->i_sb ) {
+	if (target_inode->i_sb != inode->i_sb) {
 		path_put(&path);
-	        return  -EINVAL;
+		error = -EINVAL;
+		goto out;
 	}
 
 	/* now proceed to make the upcall */
-        cnp = ITOC(target_inode);
+	cnp = ITOC(target_inode);
 
 	error = venus_pioctl(inode->i_sb, &(cnp->c_fid), cmd, &data);
 
 	path_put(&path);
-        return error;
-}
 
+out:
+	unlock_kernel();
+	return error;
+}
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index be4392ca2098..66b9cf79c5ba 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -73,8 +73,7 @@ static unsigned int coda_psdev_poll(struct file *file, poll_table * wait)
 	return mask;
 }
 
-static int coda_psdev_ioctl(struct inode * inode, struct file * filp, 
-			    unsigned int cmd, unsigned long arg)
+static long coda_psdev_ioctl(struct file * filp, unsigned int cmd, unsigned long arg)
 {
 	unsigned int data;
 
@@ -344,7 +343,7 @@ static const struct file_operations coda_psdev_fops = {
 	.read		= coda_psdev_read,
 	.write		= coda_psdev_write,
 	.poll		= coda_psdev_poll,
-	.ioctl		= coda_psdev_ioctl,
+	.unlocked_ioctl	= coda_psdev_ioctl,
 	.open		= coda_psdev_open,
 	.release	= coda_psdev_release,
 };
diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index c274d949179d..f09c5ed76f6c 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -26,6 +26,7 @@
 #include <linux/stat.h>
 #include <linux/errno.h>
 #include <linux/string.h>
+#include <linux/slab.h>
 #include <asm/uaccess.h>
 #include <linux/vmalloc.h>
 #include <linux/vfs.h>
diff --git a/fs/compat.c b/fs/compat.c
index 00d90c2e66f0..6490d2134ff3 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -49,6 +49,7 @@
 #include <linux/mm.h>
 #include <linux/eventpoll.h>
 #include <linux/fs_struct.h>
+#include <linux/slab.h>
 
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
@@ -567,6 +568,79 @@ out:
 	return ret;
 }
 
+/* A write operation does a read from user space and vice versa */
+#define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ)
+
+ssize_t compat_rw_copy_check_uvector(int type,
+		const struct compat_iovec __user *uvector, unsigned long nr_segs,
+		unsigned long fast_segs, struct iovec *fast_pointer,
+		struct iovec **ret_pointer)
+{
+	compat_ssize_t tot_len;
+	struct iovec *iov = *ret_pointer = fast_pointer;
+	ssize_t ret = 0;
+	int seg;
+
+	/*
+	 * SuS says "The readv() function *may* fail if the iovcnt argument
+	 * was less than or equal to 0, or greater than {IOV_MAX}.  Linux has
+	 * traditionally returned zero for zero segments, so...
+	 */
+	if (nr_segs == 0)
+		goto out;
+
+	ret = -EINVAL;
+	if (nr_segs > UIO_MAXIOV || nr_segs < 0)
+		goto out;
+	if (nr_segs > fast_segs) {
+		ret = -ENOMEM;
+		iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
+		if (iov == NULL) {
+			*ret_pointer = fast_pointer;
+			goto out;
+		}
+	}
+	*ret_pointer = iov;
+
+	/*
+	 * Single unix specification:
+	 * We should -EINVAL if an element length is not >= 0 and fitting an
+	 * ssize_t.  The total length is fitting an ssize_t
+	 *
+	 * Be careful here because iov_len is a size_t not an ssize_t
+	 */
+	tot_len = 0;
+	ret = -EINVAL;
+	for (seg = 0; seg < nr_segs; seg++) {
+		compat_ssize_t tmp = tot_len;
+		compat_uptr_t buf;
+		compat_ssize_t len;
+
+		if (__get_user(len, &uvector->iov_len) ||
+		   __get_user(buf, &uvector->iov_base)) {
+			ret = -EFAULT;
+			goto out;
+		}
+		if (len < 0)	/* size_t not fitting in compat_ssize_t .. */
+			goto out;
+		tot_len += len;
+		if (tot_len < tmp) /* maths overflow on the compat_ssize_t */
+			goto out;
+		if (!access_ok(vrfy_dir(type), compat_ptr(buf), len)) {
+			ret = -EFAULT;
+			goto out;
+		}
+		iov->iov_base = compat_ptr(buf);
+		iov->iov_len = (compat_size_t) len;
+		uvector++;
+		iov++;
+	}
+	ret = tot_len;
+
+out:
+	return ret;
+}
+
 static inline long
 copy_iocb(long nr, u32 __user *ptr32, struct iocb __user * __user *ptr64)
 {
@@ -599,7 +673,7 @@ compat_sys_io_submit(aio_context_t ctx_id, int nr, u32 __user *iocb)
 	iocb64 = compat_alloc_user_space(nr * sizeof(*iocb64));
 	ret = copy_iocb(nr, iocb, iocb64);
 	if (!ret)
-		ret = sys_io_submit(ctx_id, nr, iocb64);
+		ret = do_io_submit(ctx_id, nr, iocb64, 1);
 	return ret;
 }
 
@@ -1076,70 +1150,21 @@ static ssize_t compat_do_readv_writev(int type, struct file *file,
 {
 	compat_ssize_t tot_len;
 	struct iovec iovstack[UIO_FASTIOV];
-	struct iovec *iov=iovstack, *vector;
+	struct iovec *iov;
 	ssize_t ret;
-	int seg;
 	io_fn_t fn;
 	iov_fn_t fnv;
 
-	/*
-	 * SuS says "The readv() function *may* fail if the iovcnt argument
-	 * was less than or equal to 0, or greater than {IOV_MAX}.  Linux has
-	 * traditionally returned zero for zero segments, so...
-	 */
-	ret = 0;
-	if (nr_segs == 0)
-		goto out;
-
-	/*
-	 * First get the "struct iovec" from user memory and
-	 * verify all the pointers
-	 */
 	ret = -EINVAL;
-	if ((nr_segs > UIO_MAXIOV) || (nr_segs <= 0))
-		goto out;
 	if (!file->f_op)
 		goto out;
-	if (nr_segs > UIO_FASTIOV) {
-		ret = -ENOMEM;
-		iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
-		if (!iov)
-			goto out;
-	}
+
 	ret = -EFAULT;
 	if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector)))
 		goto out;
 
-	/*
-	 * Single unix specification:
-	 * We should -EINVAL if an element length is not >= 0 and fitting an
-	 * ssize_t.  The total length is fitting an ssize_t
-	 *
-	 * Be careful here because iov_len is a size_t not an ssize_t
-	 */
-	tot_len = 0;
-	vector = iov;
-	ret = -EINVAL;
-	for (seg = 0 ; seg < nr_segs; seg++) {
-		compat_ssize_t tmp = tot_len;
-		compat_ssize_t len;
-		compat_uptr_t buf;
-
-		if (__get_user(len, &uvector->iov_len) ||
-		    __get_user(buf, &uvector->iov_base)) {
-			ret = -EFAULT;
-			goto out;
-		}
-		if (len < 0)	/* size_t not fitting an compat_ssize_t .. */
-			goto out;
-		tot_len += len;
-		if (tot_len < tmp) /* maths overflow on the compat_ssize_t */
-			goto out;
-		vector->iov_base = compat_ptr(buf);
-		vector->iov_len = (compat_size_t) len;
-		uvector++;
-		vector++;
-	}
+	tot_len = compat_rw_copy_check_uvector(type, uvector, nr_segs,
+					       UIO_FASTIOV, iovstack, &iov);
 	if (tot_len == 0) {
 		ret = 0;
 		goto out;
@@ -1530,8 +1555,6 @@ int compat_do_execve(char * filename,
 	if (retval < 0)
 		goto out;
 
-	current->stack_start = current->mm->start_stack;
-
 	/* execve succeeded */
 	current->fs->in_exec = 0;
 	current->in_execve = 0;
@@ -1795,6 +1818,24 @@ asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp,
 	return ret;
 }
 
+struct compat_sel_arg_struct {
+	compat_ulong_t n;
+	compat_uptr_t inp;
+	compat_uptr_t outp;
+	compat_uptr_t exp;
+	compat_uptr_t tvp;
+};
+
+asmlinkage long compat_sys_old_select(struct compat_sel_arg_struct __user *arg)
+{
+	struct compat_sel_arg_struct a;
+
+	if (copy_from_user(&a, arg, sizeof(a)))
+		return -EFAULT;
+	return compat_sys_select(a.n, compat_ptr(a.inp), compat_ptr(a.outp),
+				 compat_ptr(a.exp), compat_ptr(a.tvp));
+}
+
 #ifdef HAVE_SET_RESTORE_SIGMASK
 static long do_compat_pselect(int n, compat_ulong_t __user *inp,
 	compat_ulong_t __user *outp, compat_ulong_t __user *exp,
diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c
index 0adced2f296f..112e45a17e99 100644
--- a/fs/compat_binfmt_elf.c
+++ b/fs/compat_binfmt_elf.c
@@ -28,10 +28,12 @@
 
 #undef	elfhdr
 #undef	elf_phdr
+#undef	elf_shdr
 #undef	elf_note
 #undef	elf_addr_t
 #define elfhdr		elf32_hdr
 #define elf_phdr	elf32_phdr
+#define elf_shdr	elf32_shdr
 #define elf_note	elf32_note
 #define elf_addr_t	Elf32_Addr
 
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index c5c45de1a2ee..641640dc7ae5 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -23,7 +23,6 @@
 #include <linux/ioctl.h>
 #include <linux/if.h>
 #include <linux/if_bridge.h>
-#include <linux/slab.h>
 #include <linux/raid/md_u.h>
 #include <linux/kd.h>
 #include <linux/route.h>
@@ -60,6 +59,7 @@
 #include <linux/i2c.h>
 #include <linux/i2c-dev.h>
 #include <linux/atalk.h>
+#include <linux/gfp.h>
 
 #include <net/bluetooth/bluetooth.h>
 #include <net/bluetooth/hci.h>
@@ -102,7 +102,6 @@
 #include <linux/nbd.h>
 #include <linux/random.h>
 #include <linux/filter.h>
-#include <linux/pktcdvd.h>
 
 #include <linux/hiddev.h>
 
@@ -301,6 +300,12 @@ static int sg_ioctl_trans(unsigned int fd, unsigned int cmd,
 	u32 data;
 	void __user *dxferp;
 	int err;
+	int interface_id;
+
+	if (get_user(interface_id, &sgio32->interface_id))
+		return -EFAULT;
+	if (interface_id != 'S')
+		return sys_ioctl(fd, cmd, (unsigned long)sgio32);
 
 	if (get_user(iovec_count, &sgio32->iovec_count))
 		return -EFAULT;
@@ -539,7 +544,7 @@ static int mt_ioctl_trans(unsigned int fd, unsigned int cmd, void __user *argp)
 		kcmd = MTIOCPOS;
 		karg = &pos;
 		break;
-	case MTIOCGET32:
+	default:	/* MTIOCGET32 */
 		kcmd = MTIOCGET;
 		karg = &get;
 		break;
@@ -657,7 +662,7 @@ static int raw_ioctl(unsigned fd, unsigned cmd,
 
         switch (cmd) {
         case RAW_SETBIND:
-        case RAW_GETBIND: {
+	default: {	/* RAW_GETBIND */
                 struct raw_config_request req;
                 mm_segment_t oldfs = get_fs();
 
@@ -936,6 +941,7 @@ COMPATIBLE_IOCTL(TCSETSF)
 COMPATIBLE_IOCTL(TIOCLINUX)
 COMPATIBLE_IOCTL(TIOCSBRK)
 COMPATIBLE_IOCTL(TIOCCBRK)
+COMPATIBLE_IOCTL(TIOCGSID)
 COMPATIBLE_IOCTL(TIOCGICOUNT)
 /* Little t */
 COMPATIBLE_IOCTL(TIOCGETD)
@@ -1038,6 +1044,8 @@ COMPATIBLE_IOCTL(FIOQSIZE)
 #ifdef CONFIG_BLOCK
 /* loop */
 IGNORE_IOCTL(LOOP_CLR_FD)
+/* md calls this on random blockdevs */
+IGNORE_IOCTL(RAID_VERSION)
 /* SG stuff */
 COMPATIBLE_IOCTL(SG_SET_TIMEOUT)
 COMPATIBLE_IOCTL(SG_GET_TIMEOUT)
@@ -1117,8 +1125,6 @@ COMPATIBLE_IOCTL(PPGETMODE)
 COMPATIBLE_IOCTL(PPGETPHASE)
 COMPATIBLE_IOCTL(PPGETFLAGS)
 COMPATIBLE_IOCTL(PPSETFLAGS)
-/* pktcdvd */
-COMPATIBLE_IOCTL(PACKET_CTRL_CMD)
 /* Big A */
 /* sparc only */
 /* Big Q for sound/OSS */
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 8e48b52205aa..0b502f80c691 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -645,6 +645,7 @@ static void detach_groups(struct config_group *group)
 
 		configfs_detach_group(sd->s_element);
 		child->d_inode->i_flags |= S_DEAD;
+		dont_mount(child);
 
 		mutex_unlock(&child->d_inode->i_mutex);
 
@@ -840,6 +841,7 @@ static int configfs_attach_item(struct config_item *parent_item,
 			mutex_lock(&dentry->d_inode->i_mutex);
 			configfs_remove_dir(item);
 			dentry->d_inode->i_flags |= S_DEAD;
+			dont_mount(dentry);
 			mutex_unlock(&dentry->d_inode->i_mutex);
 			d_delete(dentry);
 		}
@@ -882,6 +884,7 @@ static int configfs_attach_group(struct config_item *parent_item,
 		if (ret) {
 			configfs_detach_item(item);
 			dentry->d_inode->i_flags |= S_DEAD;
+			dont_mount(dentry);
 		}
 		configfs_adjust_dir_dirent_depth_after_populate(sd);
 		mutex_unlock(&dentry->d_inode->i_mutex);
@@ -1725,6 +1728,7 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys)
 	mutex_unlock(&configfs_symlink_mutex);
 	configfs_detach_group(&group->cg_item);
 	dentry->d_inode->i_flags |= S_DEAD;
+	dont_mount(dentry);
 	mutex_unlock(&dentry->d_inode->i_mutex);
 
 	d_delete(dentry);
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index a2f746066c5d..cf78d44a8d6a 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -34,6 +34,7 @@
 #include <linux/capability.h>
 #include <linux/sched.h>
 #include <linux/lockdep.h>
+#include <linux/slab.h>
 
 #include <linux/configfs.h>
 #include "configfs_internal.h"
@@ -72,15 +73,6 @@ int configfs_setattr(struct dentry * dentry, struct iattr * iattr)
 		return -EINVAL;
 
 	sd_iattr = sd->s_iattr;
-
-	error = inode_change_ok(inode, iattr);
-	if (error)
-		return error;
-
-	error = inode_setattr(inode, iattr);
-	if (error)
-		return error;
-
 	if (!sd_iattr) {
 		/* setting attributes for the first time, allocate now */
 		sd_iattr = kzalloc(sizeof(struct iattr), GFP_KERNEL);
@@ -93,9 +85,12 @@ int configfs_setattr(struct dentry * dentry, struct iattr * iattr)
 		sd_iattr->ia_atime = sd_iattr->ia_mtime = sd_iattr->ia_ctime = CURRENT_TIME;
 		sd->s_iattr = sd_iattr;
 	}
-
 	/* attributes were changed atleast once in past */
 
+	error = simple_setattr(dentry, iattr);
+	if (error)
+		return error;
+
 	if (ia_valid & ATTR_UID)
 		sd_iattr->ia_uid = iattr->ia_uid;
 	if (ia_valid & ATTR_GID)
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index 8421cea7d8c7..8c8d64230c2d 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -29,6 +29,7 @@
 #include <linux/mount.h>
 #include <linux/pagemap.h>
 #include <linux/init.h>
+#include <linux/slab.h>
 
 #include <linux/configfs.h>
 #include "configfs_internal.h"
diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c
index 32a5f46b1157..0f3eb41d9201 100644
--- a/fs/configfs/symlink.c
+++ b/fs/configfs/symlink.c
@@ -27,6 +27,7 @@
 #include <linux/fs.h>
 #include <linux/module.h>
 #include <linux/namei.h>
+#include <linux/slab.h>
 
 #include <linux/configfs.h>
 #include "configfs_internal.h"
diff --git a/fs/dcache.c b/fs/dcache.c
index 953173a293a9..86d4db15473e 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -257,6 +257,7 @@ kill_it:
 	if (dentry)
 		goto repeat;
 }
+EXPORT_SYMBOL(dput);
 
 /**
  * d_invalidate - invalidate a dentry
@@ -314,6 +315,7 @@ int d_invalidate(struct dentry * dentry)
 	spin_unlock(&dcache_lock);
 	return 0;
 }
+EXPORT_SYMBOL(d_invalidate);
 
 /* This should be called _only_ with dcache_lock held */
 
@@ -328,6 +330,7 @@ struct dentry * dget_locked(struct dentry *dentry)
 {
 	return __dget_locked(dentry);
 }
+EXPORT_SYMBOL(dget_locked);
 
 /**
  * d_find_alias - grab a hashed alias of inode
@@ -384,6 +387,7 @@ struct dentry * d_find_alias(struct inode *inode)
 	}
 	return de;
 }
+EXPORT_SYMBOL(d_find_alias);
 
 /*
  *	Try to kill dentries associated with this inode.
@@ -408,6 +412,7 @@ restart:
 	}
 	spin_unlock(&dcache_lock);
 }
+EXPORT_SYMBOL(d_prune_aliases);
 
 /*
  * Throw away a dentry - free the inode, dput the parent.  This requires that
@@ -531,7 +536,7 @@ restart:
  */
 static void prune_dcache(int count)
 {
-	struct super_block *sb;
+	struct super_block *sb, *n;
 	int w_count;
 	int unused = dentry_stat.nr_unused;
 	int prune_ratio;
@@ -540,13 +545,14 @@ static void prune_dcache(int count)
 	if (unused == 0 || count == 0)
 		return;
 	spin_lock(&dcache_lock);
-restart:
 	if (count >= unused)
 		prune_ratio = 1;
 	else
 		prune_ratio = unused / count;
 	spin_lock(&sb_lock);
-	list_for_each_entry(sb, &super_blocks, s_list) {
+	list_for_each_entry_safe(sb, n, &super_blocks, s_list) {
+		if (list_empty(&sb->s_instances))
+			continue;
 		if (sb->s_nr_dentry_unused == 0)
 			continue;
 		sb->s_count++;
@@ -584,15 +590,13 @@ restart:
 			up_read(&sb->s_umount);
 		}
 		spin_lock(&sb_lock);
+		/* lock was dropped, must reset next */
+		list_safe_reset_next(sb, n, s_list);
 		count -= pruned;
-		/*
-		 * restart only when sb is no longer on the list and
-		 * we have more work to do.
-		 */
-		if (__put_super_and_need_restart(sb) && count > 0) {
-			spin_unlock(&sb_lock);
-			goto restart;
-		}
+		__put_super(sb);
+		/* more work left to do? */
+		if (count <= 0)
+			break;
 	}
 	spin_unlock(&sb_lock);
 	spin_unlock(&dcache_lock);
@@ -610,6 +614,7 @@ void shrink_dcache_sb(struct super_block * sb)
 {
 	__shrink_dcache_sb(sb, NULL, 0);
 }
+EXPORT_SYMBOL(shrink_dcache_sb);
 
 /*
  * destroy a single subtree of dentries for unmount
@@ -792,6 +797,7 @@ positive:
 	spin_unlock(&dcache_lock);
 	return 1;
 }
+EXPORT_SYMBOL(have_submounts);
 
 /*
  * Search the dentry child list for the specified parent,
@@ -876,6 +882,7 @@ void shrink_dcache_parent(struct dentry * parent)
 	while ((found = select_parent(parent)) != 0)
 		__shrink_dcache_sb(sb, &found, 0);
 }
+EXPORT_SYMBOL(shrink_dcache_parent);
 
 /*
  * Scan `nr' dentries and return the number which remain.
@@ -889,7 +896,7 @@ void shrink_dcache_parent(struct dentry * parent)
  *
  * In this case we return -1 to tell the caller that we baled.
  */
-static int shrink_dcache_memory(int nr, gfp_t gfp_mask)
+static int shrink_dcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
 {
 	if (nr) {
 		if (!(gfp_mask & __GFP_FS))
@@ -968,6 +975,7 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
 
 	return dentry;
 }
+EXPORT_SYMBOL(d_alloc);
 
 struct dentry *d_alloc_name(struct dentry *parent, const char *name)
 {
@@ -1012,6 +1020,7 @@ void d_instantiate(struct dentry *entry, struct inode * inode)
 	spin_unlock(&dcache_lock);
 	security_d_instantiate(entry, inode);
 }
+EXPORT_SYMBOL(d_instantiate);
 
 /**
  * d_instantiate_unique - instantiate a non-aliased dentry
@@ -1108,6 +1117,7 @@ struct dentry * d_alloc_root(struct inode * root_inode)
 	}
 	return res;
 }
+EXPORT_SYMBOL(d_alloc_root);
 
 static inline struct hlist_head *d_hash(struct dentry *parent,
 					unsigned long hash)
@@ -1211,7 +1221,6 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
 			BUG_ON(!(new->d_flags & DCACHE_DISCONNECTED));
 			spin_unlock(&dcache_lock);
 			security_d_instantiate(new, inode);
-			d_rehash(dentry);
 			d_move(new, dentry);
 			iput(inode);
 		} else {
@@ -1225,6 +1234,7 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
 		d_add(dentry, inode);
 	return new;
 }
+EXPORT_SYMBOL(d_splice_alias);
 
 /**
  * d_add_ci - lookup or allocate new dentry with case-exact name
@@ -1314,6 +1324,7 @@ err_out:
 	iput(inode);
 	return ERR_PTR(error);
 }
+EXPORT_SYMBOL(d_add_ci);
 
 /**
  * d_lookup - search for a dentry
@@ -1357,6 +1368,7 @@ struct dentry * d_lookup(struct dentry * parent, struct qstr * name)
 	} while (read_seqretry(&rename_lock, seq));
 	return dentry;
 }
+EXPORT_SYMBOL(d_lookup);
 
 struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
 {
@@ -1483,6 +1495,7 @@ int d_validate(struct dentry *dentry, struct dentry *dparent)
 out:
 	return 0;
 }
+EXPORT_SYMBOL(d_validate);
 
 /*
  * When a file is deleted, we have two options:
@@ -1515,6 +1528,7 @@ void d_delete(struct dentry * dentry)
 	spin_lock(&dentry->d_lock);
 	isdir = S_ISDIR(dentry->d_inode->i_mode);
 	if (atomic_read(&dentry->d_count) == 1) {
+		dentry->d_flags &= ~DCACHE_CANT_MOUNT;
 		dentry_iput(dentry);
 		fsnotify_nameremove(dentry, isdir);
 		return;
@@ -1528,6 +1542,7 @@ void d_delete(struct dentry * dentry)
 
 	fsnotify_nameremove(dentry, isdir);
 }
+EXPORT_SYMBOL(d_delete);
 
 static void __d_rehash(struct dentry * entry, struct hlist_head *list)
 {
@@ -1556,6 +1571,7 @@ void d_rehash(struct dentry * entry)
 	spin_unlock(&entry->d_lock);
 	spin_unlock(&dcache_lock);
 }
+EXPORT_SYMBOL(d_rehash);
 
 /*
  * When switching names, the actual string doesn't strictly have to
@@ -1702,6 +1718,7 @@ void d_move(struct dentry * dentry, struct dentry * target)
 	d_move_locked(dentry, target);
 	spin_unlock(&dcache_lock);
 }
+EXPORT_SYMBOL(d_move);
 
 /**
  * d_ancestor - search for an ancestor
@@ -1868,6 +1885,7 @@ shouldnt_be_hashed:
 	spin_unlock(&dcache_lock);
 	BUG();
 }
+EXPORT_SYMBOL_GPL(d_materialise_unique);
 
 static int prepend(char **buffer, int *buflen, const char *str, int namelen)
 {
@@ -2005,6 +2023,7 @@ char *d_path(const struct path *path, char *buf, int buflen)
 	path_put(&root);
 	return res;
 }
+EXPORT_SYMBOL(d_path);
 
 /*
  * Helper function for dentry_operations.d_dname() members
@@ -2171,6 +2190,30 @@ int is_subdir(struct dentry *new_dentry, struct dentry *old_dentry)
 	return result;
 }
 
+int path_is_under(struct path *path1, struct path *path2)
+{
+	struct vfsmount *mnt = path1->mnt;
+	struct dentry *dentry = path1->dentry;
+	int res;
+	spin_lock(&vfsmount_lock);
+	if (mnt != path2->mnt) {
+		for (;;) {
+			if (mnt->mnt_parent == mnt) {
+				spin_unlock(&vfsmount_lock);
+				return 0;
+			}
+			if (mnt->mnt_parent == path2->mnt)
+				break;
+			mnt = mnt->mnt_parent;
+		}
+		dentry = mnt->mnt_mountpoint;
+	}
+	res = is_subdir(dentry, path2->dentry);
+	spin_unlock(&vfsmount_lock);
+	return res;
+}
+EXPORT_SYMBOL(path_is_under);
+
 void d_genocide(struct dentry *root)
 {
 	struct dentry *this_parent = root;
@@ -2228,6 +2271,7 @@ ino_t find_inode_number(struct dentry *dir, struct qstr *name)
 	}
 	return ino;
 }
+EXPORT_SYMBOL(find_inode_number);
 
 static __initdata unsigned long dhash_entries;
 static int __init set_dhash_entries(char *str)
@@ -2297,6 +2341,7 @@ static void __init dcache_init(void)
 
 /* SLAB cache for __getname() consumers */
 struct kmem_cache *names_cachep __read_mostly;
+EXPORT_SYMBOL(names_cachep);
 
 EXPORT_SYMBOL(d_genocide);
 
@@ -2326,26 +2371,3 @@ void __init vfs_caches_init(unsigned long mempages)
 	bdev_cache_init();
 	chrdev_init();
 }
-
-EXPORT_SYMBOL(d_alloc);
-EXPORT_SYMBOL(d_alloc_root);
-EXPORT_SYMBOL(d_delete);
-EXPORT_SYMBOL(d_find_alias);
-EXPORT_SYMBOL(d_instantiate);
-EXPORT_SYMBOL(d_invalidate);
-EXPORT_SYMBOL(d_lookup);
-EXPORT_SYMBOL(d_move);
-EXPORT_SYMBOL_GPL(d_materialise_unique);
-EXPORT_SYMBOL(d_path);
-EXPORT_SYMBOL(d_prune_aliases);
-EXPORT_SYMBOL(d_rehash);
-EXPORT_SYMBOL(d_splice_alias);
-EXPORT_SYMBOL(d_add_ci);
-EXPORT_SYMBOL(d_validate);
-EXPORT_SYMBOL(dget_locked);
-EXPORT_SYMBOL(dput);
-EXPORT_SYMBOL(find_inode_number);
-EXPORT_SYMBOL(have_submounts);
-EXPORT_SYMBOL(names_cachep);
-EXPORT_SYMBOL(shrink_dcache_parent);
-EXPORT_SYMBOL(shrink_dcache_sb);
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 4d74fc72c195..0210898458b2 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -277,8 +277,10 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_x32, debugfs_u32_get, debugfs_u32_set, "0x%08llx\n"
 DEFINE_SIMPLE_ATTRIBUTE(fops_x32_ro, debugfs_u32_get, NULL, "0x%08llx\n");
 DEFINE_SIMPLE_ATTRIBUTE(fops_x32_wo, NULL, debugfs_u32_set, "0x%08llx\n");
 
+DEFINE_SIMPLE_ATTRIBUTE(fops_x64, debugfs_u64_get, debugfs_u64_set, "0x%016llx\n");
+
 /*
- * debugfs_create_x{8,16,32} - create a debugfs file that is used to read and write an unsigned {8,16,32}-bit value
+ * debugfs_create_x{8,16,32,64} - create a debugfs file that is used to read and write an unsigned {8,16,32,64}-bit value
  *
  * These functions are exactly the same as the above functions (but use a hex
  * output for the decimal challenged). For details look at the above unsigned
@@ -357,6 +359,23 @@ struct dentry *debugfs_create_x32(const char *name, mode_t mode,
 }
 EXPORT_SYMBOL_GPL(debugfs_create_x32);
 
+/**
+ * debugfs_create_x64 - create a debugfs file that is used to read and write an unsigned 64-bit value
+ * @name: a pointer to a string containing the name of the file to create.
+ * @mode: the permission that the file should have
+ * @parent: a pointer to the parent dentry for this file.  This should be a
+ *          directory dentry if set.  If this parameter is %NULL, then the
+ *          file will be created in the root of the debugfs filesystem.
+ * @value: a pointer to the variable that the file should read to and write
+ *         from.
+ */
+struct dentry *debugfs_create_x64(const char *name, mode_t mode,
+				 struct dentry *parent, u64 *value)
+{
+	return debugfs_create_file(name, mode, parent, value, &fops_x64);
+}
+EXPORT_SYMBOL_GPL(debugfs_create_x64);
+
 
 static int debugfs_size_t_set(void *data, u64 val)
 {
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 274ac865bae8..30a87b3dbcac 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -27,6 +27,7 @@
 #include <linux/fsnotify.h>
 #include <linux/string.h>
 #include <linux/magic.h>
+#include <linux/slab.h>
 
 static struct vfsmount *debugfs_mount;
 static int debugfs_mount_count;
@@ -496,7 +497,7 @@ struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry,
 	}
 	d_move(old_dentry, dentry);
 	fsnotify_move(old_dir->d_inode, new_dir->d_inode, old_name,
-		old_dentry->d_name.name, S_ISDIR(old_dentry->d_inode->i_mode),
+		S_ISDIR(old_dentry->d_inode->i_mode),
 		NULL, old_dentry);
 	fsnotify_oldname_free(old_name);
 	unlock_rename(new_dir, old_dir);
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 8882ecc0f1bf..8b3ffd5b5235 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -15,6 +15,7 @@
 #include <linux/fs.h>
 #include <linux/sched.h>
 #include <linux/namei.h>
+#include <linux/slab.h>
 #include <linux/mount.h>
 #include <linux/tty.h>
 #include <linux/mutex.h>
@@ -383,18 +384,15 @@ static int devpts_get_sb(struct file_system_type *fs_type,
 		s->s_flags |= MS_ACTIVE;
 	}
 
-	simple_set_mnt(mnt, s);
-
 	memcpy(&(DEVPTS_SB(s))->mount_opts, &opts, sizeof(opts));
 
 	error = mknod_ptmx(s);
 	if (error)
-		goto out_dput;
+		goto out_undo_sget;
 
-	return 0;
+	simple_set_mnt(mnt, s);
 
-out_dput:
-	dput(s->s_root); /* undo dget() in simple_set_mnt() */
+	return 0;
 
 out_undo_sget:
 	deactivate_locked_super(s);
diff --git a/fs/direct-io.c b/fs/direct-io.c
index e82adc2debb7..7600aacf531d 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -82,6 +82,8 @@ struct dio {
 	int reap_counter;		/* rate limit reaping */
 	get_block_t *get_block;		/* block mapping function */
 	dio_iodone_t *end_io;		/* IO completion function */
+	dio_submit_t *submit_io;	/* IO submition function */
+	loff_t logical_offset_in_bio;	/* current first logical block in bio */
 	sector_t final_block_in_bio;	/* current final block in bio + 1 */
 	sector_t next_block_for_io;	/* next block to be put under IO,
 					   in dio_blocks units */
@@ -96,6 +98,7 @@ struct dio {
 	unsigned cur_page_offset;	/* Offset into it, in bytes */
 	unsigned cur_page_len;		/* Nr of bytes at cur_page_offset */
 	sector_t cur_page_block;	/* Where it starts */
+	loff_t cur_page_fs_offset;	/* Offset in file */
 
 	/* BIO completion state */
 	spinlock_t bio_lock;		/* protects BIO fields below */
@@ -300,6 +303,26 @@ static void dio_bio_end_io(struct bio *bio, int error)
 	spin_unlock_irqrestore(&dio->bio_lock, flags);
 }
 
+/**
+ * dio_end_io - handle the end io action for the given bio
+ * @bio: The direct io bio thats being completed
+ * @error: Error if there was one
+ *
+ * This is meant to be called by any filesystem that uses their own dio_submit_t
+ * so that the DIO specific endio actions are dealt with after the filesystem
+ * has done it's completion work.
+ */
+void dio_end_io(struct bio *bio, int error)
+{
+	struct dio *dio = bio->bi_private;
+
+	if (dio->is_async)
+		dio_bio_end_aio(bio, error);
+	else
+		dio_bio_end_io(bio, error);
+}
+EXPORT_SYMBOL_GPL(dio_end_io);
+
 static int
 dio_bio_alloc(struct dio *dio, struct block_device *bdev,
 		sector_t first_sector, int nr_vecs)
@@ -316,6 +339,7 @@ dio_bio_alloc(struct dio *dio, struct block_device *bdev,
 		bio->bi_end_io = dio_bio_end_io;
 
 	dio->bio = bio;
+	dio->logical_offset_in_bio = dio->cur_page_fs_offset;
 	return 0;
 }
 
@@ -340,10 +364,15 @@ static void dio_bio_submit(struct dio *dio)
 	if (dio->is_async && dio->rw == READ)
 		bio_set_pages_dirty(bio);
 
-	submit_bio(dio->rw, bio);
+	if (dio->submit_io)
+		dio->submit_io(dio->rw, bio, dio->inode,
+			       dio->logical_offset_in_bio);
+	else
+		submit_bio(dio->rw, bio);
 
 	dio->bio = NULL;
 	dio->boundary = 0;
+	dio->logical_offset_in_bio = 0;
 }
 
 /*
@@ -603,10 +632,26 @@ static int dio_send_cur_page(struct dio *dio)
 	int ret = 0;
 
 	if (dio->bio) {
+		loff_t cur_offset = dio->block_in_file << dio->blkbits;
+		loff_t bio_next_offset = dio->logical_offset_in_bio +
+			dio->bio->bi_size;
+
 		/*
-		 * See whether this new request is contiguous with the old
+		 * See whether this new request is contiguous with the old.
+		 *
+		 * Btrfs cannot handl having logically non-contiguous requests
+		 * submitted.  For exmple if you have
+		 *
+		 * Logical:  [0-4095][HOLE][8192-12287]
+		 * Phyiscal: [0-4095]      [4096-8181]
+		 *
+		 * We cannot submit those pages together as one BIO.  So if our
+		 * current logical offset in the file does not equal what would
+		 * be the next logical offset in the bio, submit the bio we
+		 * have.
 		 */
-		if (dio->final_block_in_bio != dio->cur_page_block)
+		if (dio->final_block_in_bio != dio->cur_page_block ||
+		    cur_offset != bio_next_offset)
 			dio_bio_submit(dio);
 		/*
 		 * Submit now if the underlying fs is about to perform a
@@ -701,6 +746,7 @@ submit_page_section(struct dio *dio, struct page *page,
 	dio->cur_page_offset = offset;
 	dio->cur_page_len = len;
 	dio->cur_page_block = blocknr;
+	dio->cur_page_fs_offset = dio->block_in_file << dio->blkbits;
 out:
 	return ret;
 }
@@ -935,7 +981,7 @@ static ssize_t
 direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, 
 	const struct iovec *iov, loff_t offset, unsigned long nr_segs, 
 	unsigned blkbits, get_block_t get_block, dio_iodone_t end_io,
-	struct dio *dio)
+	dio_submit_t submit_io, struct dio *dio)
 {
 	unsigned long user_addr; 
 	unsigned long flags;
@@ -952,6 +998,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 
 	dio->get_block = get_block;
 	dio->end_io = end_io;
+	dio->submit_io = submit_io;
 	dio->final_block_in_bio = -1;
 	dio->next_block_for_io = -1;
 
@@ -1008,7 +1055,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 		}
 	} /* end iovec loop */
 
-	if (ret == -ENOTBLK && (rw & WRITE)) {
+	if (ret == -ENOTBLK) {
 		/*
 		 * The remaining part of the request will be
 		 * be handled by buffered I/O when we return
@@ -1087,30 +1134,11 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 	return ret;
 }
 
-/*
- * This is a library function for use by filesystem drivers.
- *
- * The locking rules are governed by the flags parameter:
- *  - if the flags value contains DIO_LOCKING we use a fancy locking
- *    scheme for dumb filesystems.
- *    For writes this function is called under i_mutex and returns with
- *    i_mutex held, for reads, i_mutex is not held on entry, but it is
- *    taken and dropped again before returning.
- *    For reads and writes i_alloc_sem is taken in shared mode and released
- *    on I/O completion (which may happen asynchronously after returning to
- *    the caller).
- *
- *  - if the flags value does NOT contain DIO_LOCKING we don't use any
- *    internal locking but rather rely on the filesystem to synchronize
- *    direct I/O reads/writes versus each other and truncate.
- *    For reads and writes both i_mutex and i_alloc_sem are not held on
- *    entry and are never taken.
- */
 ssize_t
-__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
+__blockdev_direct_IO_newtrunc(int rw, struct kiocb *iocb, struct inode *inode,
 	struct block_device *bdev, const struct iovec *iov, loff_t offset, 
 	unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
-	int flags)
+	dio_submit_t submit_io,	int flags)
 {
 	int seg;
 	size_t size;
@@ -1197,11 +1225,49 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 		(end > i_size_read(inode)));
 
 	retval = direct_io_worker(rw, iocb, inode, iov, offset,
-				nr_segs, blkbits, get_block, end_io, dio);
+				nr_segs, blkbits, get_block, end_io,
+				submit_io, dio);
+
+out:
+	return retval;
+}
+EXPORT_SYMBOL(__blockdev_direct_IO_newtrunc);
+
+/*
+ * This is a library function for use by filesystem drivers.
+ *
+ * The locking rules are governed by the flags parameter:
+ *  - if the flags value contains DIO_LOCKING we use a fancy locking
+ *    scheme for dumb filesystems.
+ *    For writes this function is called under i_mutex and returns with
+ *    i_mutex held, for reads, i_mutex is not held on entry, but it is
+ *    taken and dropped again before returning.
+ *    For reads and writes i_alloc_sem is taken in shared mode and released
+ *    on I/O completion (which may happen asynchronously after returning to
+ *    the caller).
+ *
+ *  - if the flags value does NOT contain DIO_LOCKING we don't use any
+ *    internal locking but rather rely on the filesystem to synchronize
+ *    direct I/O reads/writes versus each other and truncate.
+ *    For reads and writes both i_mutex and i_alloc_sem are not held on
+ *    entry and are never taken.
+ */
+ssize_t
+__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
+	struct block_device *bdev, const struct iovec *iov, loff_t offset,
+	unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
+	dio_submit_t submit_io,	int flags)
+{
+	ssize_t retval;
 
+	retval = __blockdev_direct_IO_newtrunc(rw, iocb, inode, bdev, iov,
+			offset, nr_segs, get_block, end_io, submit_io, flags);
 	/*
 	 * In case of error extending write may have instantiated a few
 	 * blocks outside i_size. Trim these off again for DIO_LOCKING.
+	 * NOTE: DIO_NO_LOCK/DIO_OWN_LOCK callers have to handle this in
+	 * their own manner. This is a further example of where the old
+	 * truncate sequence is inadequate.
 	 *
 	 * NOTE: filesystems with their own locking have to handle this
 	 * on their own.
@@ -1209,12 +1275,13 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	if (flags & DIO_LOCKING) {
 		if (unlikely((rw & WRITE) && retval < 0)) {
 			loff_t isize = i_size_read(inode);
+			loff_t end = offset + iov_length(iov, nr_segs);
+
 			if (end > isize)
 				vmtruncate(inode, isize);
 		}
 	}
 
-out:
 	return retval;
 }
 EXPORT_SYMBOL(__blockdev_direct_IO);
diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
index dc2ad6008b2d..4314f0d48d85 100644
--- a/fs/dlm/ast.c
+++ b/fs/dlm/ast.c
@@ -2,7 +2,7 @@
 *******************************************************************************
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
-**  Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2004-2010 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -33,10 +33,10 @@ void dlm_del_ast(struct dlm_lkb *lkb)
 	spin_unlock(&ast_queue_lock);
 }
 
-void dlm_add_ast(struct dlm_lkb *lkb, int type, int bastmode)
+void dlm_add_ast(struct dlm_lkb *lkb, int type, int mode)
 {
 	if (lkb->lkb_flags & DLM_IFL_USER) {
-		dlm_user_add_ast(lkb, type, bastmode);
+		dlm_user_add_ast(lkb, type, mode);
 		return;
 	}
 
@@ -44,10 +44,21 @@ void dlm_add_ast(struct dlm_lkb *lkb, int type, int bastmode)
 	if (!(lkb->lkb_ast_type & (AST_COMP | AST_BAST))) {
 		kref_get(&lkb->lkb_ref);
 		list_add_tail(&lkb->lkb_astqueue, &ast_queue);
+		lkb->lkb_ast_first = type;
 	}
+
+	/* sanity check, this should not happen */
+
+	if ((type == AST_COMP) && (lkb->lkb_ast_type & AST_COMP))
+		log_print("repeat cast %d castmode %d lock %x %s",
+			  mode, lkb->lkb_castmode,
+			  lkb->lkb_id, lkb->lkb_resource->res_name);
+
 	lkb->lkb_ast_type |= type;
-	if (bastmode)
-		lkb->lkb_bastmode = bastmode;
+	if (type == AST_BAST)
+		lkb->lkb_bastmode = mode;
+	else
+		lkb->lkb_castmode = mode;
 	spin_unlock(&ast_queue_lock);
 
 	set_bit(WAKE_ASTS, &astd_wakeflags);
@@ -59,9 +70,9 @@ static void process_asts(void)
 	struct dlm_ls *ls = NULL;
 	struct dlm_rsb *r = NULL;
 	struct dlm_lkb *lkb;
-	void (*cast) (void *astparam);
-	void (*bast) (void *astparam, int mode);
-	int type = 0, bastmode;
+	void (*castfn) (void *astparam);
+	void (*bastfn) (void *astparam, int mode);
+	int type, first, bastmode, castmode, do_bast, do_cast, last_castmode;
 
 repeat:
 	spin_lock(&ast_queue_lock);
@@ -75,17 +86,48 @@ repeat:
 		list_del(&lkb->lkb_astqueue);
 		type = lkb->lkb_ast_type;
 		lkb->lkb_ast_type = 0;
+		first = lkb->lkb_ast_first;
+		lkb->lkb_ast_first = 0;
 		bastmode = lkb->lkb_bastmode;
-
+		castmode = lkb->lkb_castmode;
+		castfn = lkb->lkb_astfn;
+		bastfn = lkb->lkb_bastfn;
 		spin_unlock(&ast_queue_lock);
-		cast = lkb->lkb_astfn;
-		bast = lkb->lkb_bastfn;
-
-		if ((type & AST_COMP) && cast)
-			cast(lkb->lkb_astparam);
 
-		if ((type & AST_BAST) && bast)
-			bast(lkb->lkb_astparam, bastmode);
+		do_cast = (type & AST_COMP) && castfn;
+		do_bast = (type & AST_BAST) && bastfn;
+
+		/* Skip a bast if its blocking mode is compatible with the
+		   granted mode of the preceding cast. */
+
+		if (do_bast) {
+			if (first == AST_COMP)
+				last_castmode = castmode;
+			else
+				last_castmode = lkb->lkb_castmode_done;
+			if (dlm_modes_compat(bastmode, last_castmode))
+				do_bast = 0;
+		}
+
+		if (first == AST_COMP) {
+			if (do_cast)
+				castfn(lkb->lkb_astparam);
+			if (do_bast)
+				bastfn(lkb->lkb_astparam, bastmode);
+		} else if (first == AST_BAST) {
+			if (do_bast)
+				bastfn(lkb->lkb_astparam, bastmode);
+			if (do_cast)
+				castfn(lkb->lkb_astparam);
+		} else {
+			log_error(ls, "bad ast_first %d ast_type %d",
+				  first, type);
+		}
+
+		if (do_cast)
+			lkb->lkb_castmode_done = castmode;
+		if (do_bast)
+			lkb->lkb_bastmode_done = bastmode;
 
 		/* this removes the reference added by dlm_add_ast
 		   and may result in the lkb being freed */
diff --git a/fs/dlm/ast.h b/fs/dlm/ast.h
index 1b5fc5f428fd..bcb1aaba519d 100644
--- a/fs/dlm/ast.h
+++ b/fs/dlm/ast.h
@@ -1,7 +1,7 @@
 /******************************************************************************
 *******************************************************************************
 **
-**  Copyright (C) 2005-2008 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2005-2010 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -13,7 +13,7 @@
 #ifndef __ASTD_DOT_H__
 #define __ASTD_DOT_H__
 
-void dlm_add_ast(struct dlm_lkb *lkb, int type, int bastmode);
+void dlm_add_ast(struct dlm_lkb *lkb, int type, int mode);
 void dlm_del_ast(struct dlm_lkb *lkb);
 
 void dlm_astd_wake(void);
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index 0df243850818..b54bca03d92f 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -14,6 +14,7 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/configfs.h>
+#include <linux/slab.h>
 #include <linux/in.h>
 #include <linux/in6.h>
 #include <net/ipv6.h>
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index 375a2359b3bf..c6cf25158746 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -15,6 +15,7 @@
 #include <linux/module.h>
 #include <linux/ctype.h>
 #include <linux/debugfs.h>
+#include <linux/slab.h>
 
 #include "dlm_internal.h"
 #include "lock.h"
@@ -256,7 +257,7 @@ static int print_format3_lock(struct seq_file *s, struct dlm_lkb *lkb,
 			lkb->lkb_status,
 			lkb->lkb_grmode,
 			lkb->lkb_rqmode,
-			lkb->lkb_highbast,
+			lkb->lkb_bastmode,
 			rsb_lookup,
 			lkb->lkb_wait_type,
 			lkb->lkb_lvbseq,
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index 826d3dc6e0ab..f632b58cd222 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -2,7 +2,7 @@
 *******************************************************************************
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
-**  Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2004-2010 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -232,11 +232,17 @@ struct dlm_lkb {
 	int8_t			lkb_status;     /* granted, waiting, convert */
 	int8_t			lkb_rqmode;	/* requested lock mode */
 	int8_t			lkb_grmode;	/* granted lock mode */
-	int8_t			lkb_bastmode;	/* requested mode */
 	int8_t			lkb_highbast;	/* highest mode bast sent for */
+
 	int8_t			lkb_wait_type;	/* type of reply waiting for */
 	int8_t			lkb_wait_count;
 	int8_t			lkb_ast_type;	/* type of ast queued for */
+	int8_t			lkb_ast_first;	/* type of first ast queued */
+
+	int8_t			lkb_bastmode;	/* req mode of queued bast */
+	int8_t			lkb_castmode;	/* gr mode of queued cast */
+	int8_t			lkb_bastmode_done; /* last delivered bastmode */
+	int8_t			lkb_castmode_done; /* last delivered castmode */
 
 	struct list_head	lkb_idtbl_list;	/* lockspace lkbtbl */
 	struct list_head	lkb_statequeue;	/* rsb g/c/w list */
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 9c0c1db1e105..031dbe3a15ca 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -1,7 +1,7 @@
 /******************************************************************************
 *******************************************************************************
 **
-**  Copyright (C) 2005-2008 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2005-2010 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -56,6 +56,7 @@
    L: receive_xxxx_reply()     <-  R: send_xxxx_reply()
 */
 #include <linux/types.h>
+#include <linux/slab.h>
 #include "dlm_internal.h"
 #include <linux/dlm_device.h>
 #include "memory.h"
@@ -307,7 +308,7 @@ static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
 	lkb->lkb_lksb->sb_status = rv;
 	lkb->lkb_lksb->sb_flags = lkb->lkb_sbflags;
 
-	dlm_add_ast(lkb, AST_COMP, 0);
+	dlm_add_ast(lkb, AST_COMP, lkb->lkb_grmode);
 }
 
 static inline void queue_cast_overlap(struct dlm_rsb *r, struct dlm_lkb *lkb)
@@ -320,10 +321,12 @@ static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode)
 {
 	lkb->lkb_time_bast = ktime_get();
 
-	if (is_master_copy(lkb))
+	if (is_master_copy(lkb)) {
+		lkb->lkb_bastmode = rqmode; /* printed by debugfs */
 		send_bast(r, lkb, rqmode);
-	else
+	} else {
 		dlm_add_ast(lkb, AST_BAST, rqmode);
+	}
 }
 
 /*
@@ -730,10 +733,7 @@ static void lkb_add_ordered(struct list_head *new, struct list_head *head,
 		if (lkb->lkb_rqmode < mode)
 			break;
 
-	if (!lkb)
-		list_add_tail(new, head);
-	else
-		__list_add(new, lkb->lkb_statequeue.prev, &lkb->lkb_statequeue);
+	__list_add(new, lkb->lkb_statequeue.prev, &lkb->lkb_statequeue);
 }
 
 /* add/remove lkb to rsb's grant/convert/wait queue */
@@ -2280,20 +2280,30 @@ static int do_request(struct dlm_rsb *r, struct dlm_lkb *lkb)
 	if (can_be_queued(lkb)) {
 		error = -EINPROGRESS;
 		add_lkb(r, lkb, DLM_LKSTS_WAITING);
-		send_blocking_asts(r, lkb);
 		add_timeout(lkb);
 		goto out;
 	}
 
 	error = -EAGAIN;
-	if (force_blocking_asts(lkb))
-		send_blocking_asts_all(r, lkb);
 	queue_cast(r, lkb, -EAGAIN);
-
  out:
 	return error;
 }
 
+static void do_request_effects(struct dlm_rsb *r, struct dlm_lkb *lkb,
+			       int error)
+{
+	switch (error) {
+	case -EAGAIN:
+		if (force_blocking_asts(lkb))
+			send_blocking_asts_all(r, lkb);
+		break;
+	case -EINPROGRESS:
+		send_blocking_asts(r, lkb);
+		break;
+	}
+}
+
 static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
 {
 	int error = 0;
@@ -2304,7 +2314,6 @@ static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
 	if (can_be_granted(r, lkb, 1, &deadlk)) {
 		grant_lock(r, lkb);
 		queue_cast(r, lkb, 0);
-		grant_pending_locks(r);
 		goto out;
 	}
 
@@ -2334,7 +2343,6 @@ static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
 		if (_can_be_granted(r, lkb, 1)) {
 			grant_lock(r, lkb);
 			queue_cast(r, lkb, 0);
-			grant_pending_locks(r);
 			goto out;
 		}
 		/* else fall through and move to convert queue */
@@ -2344,28 +2352,47 @@ static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
 		error = -EINPROGRESS;
 		del_lkb(r, lkb);
 		add_lkb(r, lkb, DLM_LKSTS_CONVERT);
-		send_blocking_asts(r, lkb);
 		add_timeout(lkb);
 		goto out;
 	}
 
 	error = -EAGAIN;
-	if (force_blocking_asts(lkb))
-		send_blocking_asts_all(r, lkb);
 	queue_cast(r, lkb, -EAGAIN);
-
  out:
 	return error;
 }
 
+static void do_convert_effects(struct dlm_rsb *r, struct dlm_lkb *lkb,
+			       int error)
+{
+	switch (error) {
+	case 0:
+		grant_pending_locks(r);
+		/* grant_pending_locks also sends basts */
+		break;
+	case -EAGAIN:
+		if (force_blocking_asts(lkb))
+			send_blocking_asts_all(r, lkb);
+		break;
+	case -EINPROGRESS:
+		send_blocking_asts(r, lkb);
+		break;
+	}
+}
+
 static int do_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
 {
 	remove_lock(r, lkb);
 	queue_cast(r, lkb, -DLM_EUNLOCK);
-	grant_pending_locks(r);
 	return -DLM_EUNLOCK;
 }
 
+static void do_unlock_effects(struct dlm_rsb *r, struct dlm_lkb *lkb,
+			      int error)
+{
+	grant_pending_locks(r);
+}
+
 /* returns: 0 did nothing, -DLM_ECANCEL canceled lock */
  
 static int do_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb)
@@ -2375,12 +2402,18 @@ static int do_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb)
 	error = revert_lock(r, lkb);
 	if (error) {
 		queue_cast(r, lkb, -DLM_ECANCEL);
-		grant_pending_locks(r);
 		return -DLM_ECANCEL;
 	}
 	return 0;
 }
 
+static void do_cancel_effects(struct dlm_rsb *r, struct dlm_lkb *lkb,
+			      int error)
+{
+	if (error)
+		grant_pending_locks(r);
+}
+
 /*
  * Four stage 3 varieties:
  * _request_lock(), _convert_lock(), _unlock_lock(), _cancel_lock()
@@ -2402,11 +2435,15 @@ static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
 		goto out;
 	}
 
-	if (is_remote(r))
+	if (is_remote(r)) {
 		/* receive_request() calls do_request() on remote node */
 		error = send_request(r, lkb);
-	else
+	} else {
 		error = do_request(r, lkb);
+		/* for remote locks the request_reply is sent
+		   between do_request and do_request_effects */
+		do_request_effects(r, lkb, error);
+	}
  out:
 	return error;
 }
@@ -2417,11 +2454,15 @@ static int _convert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
 {
 	int error;
 
-	if (is_remote(r))
+	if (is_remote(r)) {
 		/* receive_convert() calls do_convert() on remote node */
 		error = send_convert(r, lkb);
-	else
+	} else {
 		error = do_convert(r, lkb);
+		/* for remote locks the convert_reply is sent
+		   between do_convert and do_convert_effects */
+		do_convert_effects(r, lkb, error);
+	}
 
 	return error;
 }
@@ -2432,11 +2473,15 @@ static int _unlock_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
 {
 	int error;
 
-	if (is_remote(r))
+	if (is_remote(r)) {
 		/* receive_unlock() calls do_unlock() on remote node */
 		error = send_unlock(r, lkb);
-	else
+	} else {
 		error = do_unlock(r, lkb);
+		/* for remote locks the unlock_reply is sent
+		   between do_unlock and do_unlock_effects */
+		do_unlock_effects(r, lkb, error);
+	}
 
 	return error;
 }
@@ -2447,11 +2492,15 @@ static int _cancel_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
 {
 	int error;
 
-	if (is_remote(r))
+	if (is_remote(r)) {
 		/* receive_cancel() calls do_cancel() on remote node */
 		error = send_cancel(r, lkb);
-	else
+	} else {
 		error = do_cancel(r, lkb);
+		/* for remote locks the cancel_reply is sent
+		   between do_cancel and do_cancel_effects */
+		do_cancel_effects(r, lkb, error);
+	}
 
 	return error;
 }
@@ -3191,6 +3240,7 @@ static void receive_request(struct dlm_ls *ls, struct dlm_message *ms)
 	attach_lkb(r, lkb);
 	error = do_request(r, lkb);
 	send_request_reply(r, lkb, error);
+	do_request_effects(r, lkb, error);
 
 	unlock_rsb(r);
 	put_rsb(r);
@@ -3226,15 +3276,19 @@ static void receive_convert(struct dlm_ls *ls, struct dlm_message *ms)
 		goto out;
 
 	receive_flags(lkb, ms);
+
 	error = receive_convert_args(ls, lkb, ms);
-	if (error)
-		goto out_reply;
+	if (error) {
+		send_convert_reply(r, lkb, error);
+		goto out;
+	}
+
 	reply = !down_conversion(lkb);
 
 	error = do_convert(r, lkb);
- out_reply:
 	if (reply)
 		send_convert_reply(r, lkb, error);
+	do_convert_effects(r, lkb, error);
  out:
 	unlock_rsb(r);
 	put_rsb(r);
@@ -3266,13 +3320,16 @@ static void receive_unlock(struct dlm_ls *ls, struct dlm_message *ms)
 		goto out;
 
 	receive_flags(lkb, ms);
+
 	error = receive_unlock_args(ls, lkb, ms);
-	if (error)
-		goto out_reply;
+	if (error) {
+		send_unlock_reply(r, lkb, error);
+		goto out;
+	}
 
 	error = do_unlock(r, lkb);
- out_reply:
 	send_unlock_reply(r, lkb, error);
+	do_unlock_effects(r, lkb, error);
  out:
 	unlock_rsb(r);
 	put_rsb(r);
@@ -3307,6 +3364,7 @@ static void receive_cancel(struct dlm_ls *ls, struct dlm_message *ms)
 
 	error = do_cancel(r, lkb);
 	send_cancel_reply(r, lkb, error);
+	do_cancel_effects(r, lkb, error);
  out:
 	unlock_rsb(r);
 	put_rsb(r);
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index c010ecfc0d29..f994a7dfda85 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -148,7 +148,7 @@ static void lockspace_kobj_release(struct kobject *k)
 	kfree(ls);
 }
 
-static struct sysfs_ops dlm_attr_ops = {
+static const struct sysfs_ops dlm_attr_ops = {
 	.show  = dlm_attr_show,
 	.store = dlm_attr_store,
 };
@@ -191,6 +191,18 @@ static int do_uevent(struct dlm_ls *ls, int in)
 	return error;
 }
 
+static int dlm_uevent(struct kset *kset, struct kobject *kobj,
+		      struct kobj_uevent_env *env)
+{
+	struct dlm_ls *ls = container_of(kobj, struct dlm_ls, ls_kobj);
+
+	add_uevent_var(env, "LOCKSPACE=%s", ls->ls_name);
+	return 0;
+}
+
+static struct kset_uevent_ops dlm_uevent_ops = {
+	.uevent = dlm_uevent,
+};
 
 int __init dlm_lockspace_init(void)
 {
@@ -199,7 +211,7 @@ int __init dlm_lockspace_init(void)
 	INIT_LIST_HEAD(&lslist);
 	spin_lock_init(&lslist_lock);
 
-	dlm_kset = kset_create_and_add("dlm", NULL, kernel_kobj);
+	dlm_kset = kset_create_and_add("dlm", &dlm_uevent_ops, kernel_kobj);
 	if (!dlm_kset) {
 		printk(KERN_WARNING "%s: can not create kset\n", __func__);
 		return -ENOMEM;
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 52cab160893c..c0d35c620526 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -51,6 +51,7 @@
 #include <linux/file.h>
 #include <linux/mutex.h>
 #include <linux/sctp.h>
+#include <linux/slab.h>
 #include <net/sctp/user.h>
 #include <net/ipv6.h>
 
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
index 84f70bfb0baf..b12532e553f8 100644
--- a/fs/dlm/member.c
+++ b/fs/dlm/member.c
@@ -312,7 +312,7 @@ int dlm_ls_stop(struct dlm_ls *ls)
 	/*
 	 * This in_recovery lock does two things:
 	 * 1) Keeps this function from returning until all threads are out
-	 *    of locking routines and locking is truely stopped.
+	 *    of locking routines and locking is truly stopped.
 	 * 2) Keeps any new requests from being processed until it's unlocked
 	 *    when recovery is complete.
 	 */
diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c
index 052095cd592f..2c6ad518100d 100644
--- a/fs/dlm/netlink.c
+++ b/fs/dlm/netlink.c
@@ -9,6 +9,7 @@
 #include <net/genetlink.h>
 #include <linux/dlm.h>
 #include <linux/dlm_netlink.h>
+#include <linux/gfp.h>
 
 #include "dlm_internal.h"
 
diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index b5f89aef3b29..d45c02db6943 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -11,6 +11,7 @@
 #include <linux/poll.h>
 #include <linux/dlm.h>
 #include <linux/dlm_plock.h>
+#include <linux/slab.h>
 
 #include "dlm_internal.h"
 #include "lockspace.h"
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index e73a4bb572aa..b6272853130c 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2006-2009 Red Hat, Inc.  All rights reserved.
+ * Copyright (C) 2006-2010 Red Hat, Inc.  All rights reserved.
  *
  * This copyrighted material is made available to anyone wishing to use,
  * modify, copy, or redistribute it subject to the terms and conditions
@@ -17,6 +17,7 @@
 #include <linux/spinlock.h>
 #include <linux/dlm.h>
 #include <linux/dlm_device.h>
+#include <linux/slab.h>
 
 #include "dlm_internal.h"
 #include "lockspace.h"
@@ -173,7 +174,7 @@ static int lkb_is_endoflife(struct dlm_lkb *lkb, int sb_status, int type)
 /* we could possibly check if the cancel of an orphan has resulted in the lkb
    being removed and then remove that lkb from the orphans list and free it */
 
-void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int bastmode)
+void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int mode)
 {
 	struct dlm_ls *ls;
 	struct dlm_user_args *ua;
@@ -206,12 +207,15 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int bastmode)
 
 	ast_type = lkb->lkb_ast_type;
 	lkb->lkb_ast_type |= type;
-	if (bastmode)
-		lkb->lkb_bastmode = bastmode;
+	if (type == AST_BAST)
+		lkb->lkb_bastmode = mode;
+	else
+		lkb->lkb_castmode = mode;
 
 	if (!ast_type) {
 		kref_get(&lkb->lkb_ref);
 		list_add_tail(&lkb->lkb_astqueue, &proc->asts);
+		lkb->lkb_ast_first = type;
 		wake_up_interruptible(&proc->wait);
 	}
 	if (type == AST_COMP && (ast_type & AST_COMP))
@@ -220,7 +224,6 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int bastmode)
 
 	eol = lkb_is_endoflife(lkb, ua->lksb.sb_status, type);
 	if (eol) {
-		lkb->lkb_ast_type &= ~AST_BAST;
 		lkb->lkb_flags |= DLM_IFL_ENDOFLIFE;
 	}
 
@@ -703,7 +706,7 @@ static int device_close(struct inode *inode, struct file *file)
 }
 
 static int copy_result_to_user(struct dlm_user_args *ua, int compat, int type,
-			       int bmode, char __user *buf, size_t count)
+			       int mode, char __user *buf, size_t count)
 {
 #ifdef CONFIG_COMPAT
 	struct dlm_lock_result32 result32;
@@ -730,7 +733,7 @@ static int copy_result_to_user(struct dlm_user_args *ua, int compat, int type,
 	if (type == AST_BAST) {
 		result.user_astaddr = ua->bastaddr;
 		result.user_astparam = ua->bastparam;
-		result.bast_mode = bmode;
+		result.bast_mode = mode;
 	} else {
 		result.user_astaddr = ua->castaddr;
 		result.user_astparam = ua->castparam;
@@ -798,7 +801,9 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,
 	struct dlm_user_proc *proc = file->private_data;
 	struct dlm_lkb *lkb;
 	DECLARE_WAITQUEUE(wait, current);
-	int error, type=0, bmode=0, removed = 0;
+	int error = 0, removed;
+	int ret_type, ret_mode;
+	int bastmode, castmode, do_bast, do_cast;
 
 	if (count == sizeof(struct dlm_device_version)) {
 		error = copy_version_to_user(buf, count);
@@ -817,6 +822,8 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,
 #endif
 		return -EINVAL;
 
+ try_another:
+
 	/* do we really need this? can a read happen after a close? */
 	if (test_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags))
 		return -EINVAL;
@@ -852,13 +859,55 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,
 
 	lkb = list_entry(proc->asts.next, struct dlm_lkb, lkb_astqueue);
 
-	if (lkb->lkb_ast_type & AST_COMP) {
-		lkb->lkb_ast_type &= ~AST_COMP;
-		type = AST_COMP;
-	} else if (lkb->lkb_ast_type & AST_BAST) {
-		lkb->lkb_ast_type &= ~AST_BAST;
-		type = AST_BAST;
-		bmode = lkb->lkb_bastmode;
+	removed = 0;
+	ret_type = 0;
+	ret_mode = 0;
+	do_bast = lkb->lkb_ast_type & AST_BAST;
+	do_cast = lkb->lkb_ast_type & AST_COMP;
+	bastmode = lkb->lkb_bastmode;
+	castmode = lkb->lkb_castmode;
+
+	/* when both are queued figure out which to do first and
+	   switch first so the other goes in the next read */
+
+	if (do_cast && do_bast) {
+		if (lkb->lkb_ast_first == AST_COMP) {
+			ret_type = AST_COMP;
+			ret_mode = castmode;
+			lkb->lkb_ast_type &= ~AST_COMP;
+			lkb->lkb_ast_first = AST_BAST;
+		} else {
+			ret_type = AST_BAST;
+			ret_mode = bastmode;
+			lkb->lkb_ast_type &= ~AST_BAST;
+			lkb->lkb_ast_first = AST_COMP;
+		}
+	} else {
+		ret_type = lkb->lkb_ast_first;
+		ret_mode = (ret_type == AST_COMP) ? castmode : bastmode;
+		lkb->lkb_ast_type &= ~ret_type;
+		lkb->lkb_ast_first = 0;
+	}
+
+	/* if we're doing a bast but the bast is unnecessary, then
+	   switch to do nothing or do a cast if that was needed next */
+
+	if ((ret_type == AST_BAST) &&
+	    dlm_modes_compat(bastmode, lkb->lkb_castmode_done)) {
+		ret_type = 0;
+		ret_mode = 0;
+
+		if (do_cast) {
+			ret_type = AST_COMP;
+			ret_mode = castmode;
+			lkb->lkb_ast_type &= ~AST_COMP;
+			lkb->lkb_ast_first = 0;
+		}
+	}
+
+	if (lkb->lkb_ast_first != lkb->lkb_ast_type) {
+		log_print("device_read %x ast_first %x ast_type %x",
+			  lkb->lkb_id, lkb->lkb_ast_first, lkb->lkb_ast_type);
 	}
 
 	if (!lkb->lkb_ast_type) {
@@ -867,15 +916,29 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,
 	}
 	spin_unlock(&proc->asts_spin);
 
-	error = copy_result_to_user(lkb->lkb_ua,
-			 	test_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags),
-				type, bmode, buf, count);
+	if (ret_type) {
+		error = copy_result_to_user(lkb->lkb_ua,
+				test_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags),
+				ret_type, ret_mode, buf, count);
+
+		if (ret_type == AST_COMP)
+			lkb->lkb_castmode_done = castmode;
+		if (ret_type == AST_BAST)
+			lkb->lkb_bastmode_done = bastmode;
+	}
 
 	/* removes reference for the proc->asts lists added by
 	   dlm_user_add_ast() and may result in the lkb being freed */
+
 	if (removed)
 		dlm_put_lkb(lkb);
 
+	/* the bast that was queued was eliminated (see unnecessary above),
+	   leaving nothing to return */
+
+	if (!ret_type)
+		goto try_another;
+
 	return error;
 }
 
diff --git a/fs/dlm/user.h b/fs/dlm/user.h
index 1c9686492286..f196091dd7ff 100644
--- a/fs/dlm/user.h
+++ b/fs/dlm/user.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2006-2008 Red Hat, Inc.  All rights reserved.
+ * Copyright (C) 2006-2010 Red Hat, Inc.  All rights reserved.
  *
  * This copyrighted material is made available to anyone wishing to use,
  * modify, copy, or redistribute it subject to the terms and conditions
@@ -9,7 +9,7 @@
 #ifndef __USER_DOT_H__
 #define __USER_DOT_H__
 
-void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int bastmode);
+void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int mode);
 int dlm_user_init(void);
 void dlm_user_exit(void);
 int dlm_device_deregister(struct dlm_ls *ls);
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index 31f4b0e6d72c..83c4f600786a 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -12,7 +12,7 @@
 /* A global variable is a bit ugly, but it keeps the code simple */
 int sysctl_drop_caches;
 
-static void drop_pagecache_sb(struct super_block *sb)
+static void drop_pagecache_sb(struct super_block *sb, void *unused)
 {
 	struct inode *inode, *toput_inode = NULL;
 
@@ -33,26 +33,6 @@ static void drop_pagecache_sb(struct super_block *sb)
 	iput(toput_inode);
 }
 
-static void drop_pagecache(void)
-{
-	struct super_block *sb;
-
-	spin_lock(&sb_lock);
-restart:
-	list_for_each_entry(sb, &super_blocks, s_list) {
-		sb->s_count++;
-		spin_unlock(&sb_lock);
-		down_read(&sb->s_umount);
-		if (sb->s_root)
-			drop_pagecache_sb(sb);
-		up_read(&sb->s_umount);
-		spin_lock(&sb_lock);
-		if (__put_super_and_need_restart(sb))
-			goto restart;
-	}
-	spin_unlock(&sb_lock);
-}
-
 static void drop_slab(void)
 {
 	int nr_objects;
@@ -68,7 +48,7 @@ int drop_caches_sysctl_handler(ctl_table *table, int write,
 	proc_dointvec_minmax(table, write, buffer, length, ppos);
 	if (write) {
 		if (sysctl_drop_caches & 1)
-			drop_pagecache();
+			iterate_supers(drop_pagecache_sb, NULL);
 		if (sysctl_drop_caches & 2)
 			drop_slab();
 	}
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 7cb0a59f4b9d..1cc087635a5e 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -33,6 +33,7 @@
 #include <linux/crypto.h>
 #include <linux/file.h>
 #include <linux/scatterlist.h>
+#include <linux/slab.h>
 #include <asm/unaligned.h>
 #include "ecryptfs_kernel.h"
 
@@ -381,8 +382,8 @@ out:
 static void ecryptfs_lower_offset_for_extent(loff_t *offset, loff_t extent_num,
 					     struct ecryptfs_crypt_stat *crypt_stat)
 {
-	(*offset) = (crypt_stat->num_header_bytes_at_front
-		     + (crypt_stat->extent_size * extent_num));
+	(*offset) = ecryptfs_lower_header_size(crypt_stat)
+		    + (crypt_stat->extent_size * extent_num);
 }
 
 /**
@@ -834,13 +835,13 @@ void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat)
 	set_extent_mask_and_shift(crypt_stat);
 	crypt_stat->iv_bytes = ECRYPTFS_DEFAULT_IV_BYTES;
 	if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR)
-		crypt_stat->num_header_bytes_at_front = 0;
+		crypt_stat->metadata_size = ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE;
 	else {
 		if (PAGE_CACHE_SIZE <= ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE)
-			crypt_stat->num_header_bytes_at_front =
+			crypt_stat->metadata_size =
 				ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE;
 		else
-			crypt_stat->num_header_bytes_at_front =	PAGE_CACHE_SIZE;
+			crypt_stat->metadata_size = PAGE_CACHE_SIZE;
 	}
 }
 
@@ -1107,9 +1108,9 @@ static void write_ecryptfs_marker(char *page_virt, size_t *written)
 	(*written) = MAGIC_ECRYPTFS_MARKER_SIZE_BYTES;
 }
 
-static void
-write_ecryptfs_flags(char *page_virt, struct ecryptfs_crypt_stat *crypt_stat,
-		     size_t *written)
+void ecryptfs_write_crypt_stat_flags(char *page_virt,
+				     struct ecryptfs_crypt_stat *crypt_stat,
+				     size_t *written)
 {
 	u32 flags = 0;
 	int i;
@@ -1237,8 +1238,7 @@ ecryptfs_write_header_metadata(char *virt,
 
 	header_extent_size = (u32)crypt_stat->extent_size;
 	num_header_extents_at_front =
-		(u16)(crypt_stat->num_header_bytes_at_front
-		      / crypt_stat->extent_size);
+		(u16)(crypt_stat->metadata_size / crypt_stat->extent_size);
 	put_unaligned_be32(header_extent_size, virt);
 	virt += 4;
 	put_unaligned_be16(num_header_extents_at_front, virt);
@@ -1291,7 +1291,8 @@ static int ecryptfs_write_headers_virt(char *page_virt, size_t max,
 	offset = ECRYPTFS_FILE_SIZE_BYTES;
 	write_ecryptfs_marker((page_virt + offset), &written);
 	offset += written;
-	write_ecryptfs_flags((page_virt + offset), crypt_stat, &written);
+	ecryptfs_write_crypt_stat_flags((page_virt + offset), crypt_stat,
+					&written);
 	offset += written;
 	ecryptfs_write_header_metadata((page_virt + offset), crypt_stat,
 				       &written);
@@ -1381,7 +1382,7 @@ int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry)
 		rc = -EINVAL;
 		goto out;
 	}
-	virt_len = crypt_stat->num_header_bytes_at_front;
+	virt_len = crypt_stat->metadata_size;
 	order = get_order(virt_len);
 	/* Released in this function */
 	virt = (char *)ecryptfs_get_zeroed_pages(GFP_KERNEL, order);
@@ -1427,16 +1428,15 @@ static int parse_header_metadata(struct ecryptfs_crypt_stat *crypt_stat,
 	header_extent_size = get_unaligned_be32(virt);
 	virt += sizeof(__be32);
 	num_header_extents_at_front = get_unaligned_be16(virt);
-	crypt_stat->num_header_bytes_at_front =
-		(((size_t)num_header_extents_at_front
-		  * (size_t)header_extent_size));
+	crypt_stat->metadata_size = (((size_t)num_header_extents_at_front
+				     * (size_t)header_extent_size));
 	(*bytes_read) = (sizeof(__be32) + sizeof(__be16));
 	if ((validate_header_size == ECRYPTFS_VALIDATE_HEADER_SIZE)
-	    && (crypt_stat->num_header_bytes_at_front
+	    && (crypt_stat->metadata_size
 		< ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE)) {
 		rc = -EINVAL;
 		printk(KERN_WARNING "Invalid header size: [%zd]\n",
-		       crypt_stat->num_header_bytes_at_front);
+		       crypt_stat->metadata_size);
 	}
 	return rc;
 }
@@ -1451,8 +1451,7 @@ static int parse_header_metadata(struct ecryptfs_crypt_stat *crypt_stat,
  */
 static void set_default_header_data(struct ecryptfs_crypt_stat *crypt_stat)
 {
-	crypt_stat->num_header_bytes_at_front =
-		ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE;
+	crypt_stat->metadata_size = ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE;
 }
 
 /**
@@ -1606,6 +1605,7 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry)
 						ecryptfs_dentry,
 						ECRYPTFS_VALIDATE_HEADER_SIZE);
 	if (rc) {
+		memset(page_virt, 0, PAGE_CACHE_SIZE);
 		rc = ecryptfs_read_xattr_region(page_virt, ecryptfs_inode);
 		if (rc) {
 			printk(KERN_DEBUG "Valid eCryptfs headers not found in "
diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c
index 8f006a0d6076..906e803f7f79 100644
--- a/fs/ecryptfs/dentry.c
+++ b/fs/ecryptfs/dentry.c
@@ -26,6 +26,7 @@
 #include <linux/namei.h>
 #include <linux/mount.h>
 #include <linux/fs_stack.h>
+#include <linux/slab.h>
 #include "ecryptfs_kernel.h"
 
 /**
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 542f625312f3..0032a9f5a3a9 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -35,6 +35,7 @@
 #include <linux/scatterlist.h>
 #include <linux/hash.h>
 #include <linux/nsproxy.h>
+#include <linux/backing-dev.h>
 
 /* Version verification for shared data structures w/ userspace */
 #define ECRYPTFS_VERSION_MAJOR 0x00
@@ -273,7 +274,7 @@ struct ecryptfs_crypt_stat {
 	u32 flags;
 	unsigned int file_version;
 	size_t iv_bytes;
-	size_t num_header_bytes_at_front;
+	size_t metadata_size;
 	size_t extent_size; /* Data extent size; default is 4096 */
 	size_t key_size;
 	size_t extent_shift;
@@ -393,6 +394,7 @@ struct ecryptfs_mount_crypt_stat {
 struct ecryptfs_sb_info {
 	struct super_block *wsi_sb;
 	struct ecryptfs_mount_crypt_stat mount_crypt_stat;
+	struct backing_dev_info bdi;
 };
 
 /* file private data. */
@@ -464,6 +466,14 @@ struct ecryptfs_daemon {
 
 extern struct mutex ecryptfs_daemon_hash_mux;
 
+static inline size_t
+ecryptfs_lower_header_size(struct ecryptfs_crypt_stat *crypt_stat)
+{
+	if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR)
+		return 0;
+	return crypt_stat->metadata_size;
+}
+
 static inline struct ecryptfs_file_info *
 ecryptfs_file_to_private(struct file *file)
 {
@@ -651,6 +661,9 @@ int ecryptfs_decrypt_page(struct page *page);
 int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry);
 int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry);
 int ecryptfs_new_file_context(struct dentry *ecryptfs_dentry);
+void ecryptfs_write_crypt_stat_flags(char *page_virt,
+				     struct ecryptfs_crypt_stat *crypt_stat,
+				     size_t *written);
 int ecryptfs_read_and_validate_header_region(char *data,
 					     struct inode *ecryptfs_inode);
 int ecryptfs_read_and_validate_xattr_region(char *page_virt,
@@ -718,15 +731,14 @@ int ecryptfs_write_lower(struct inode *ecryptfs_inode, char *data,
 int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode,
 				      struct page *page_for_lower,
 				      size_t offset_in_page, size_t size);
-int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset,
-		   size_t size);
+int ecryptfs_write(struct inode *inode, char *data, loff_t offset, size_t size);
 int ecryptfs_read_lower(char *data, loff_t offset, size_t size,
 			struct inode *ecryptfs_inode);
 int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs,
 				     pgoff_t page_index,
 				     size_t offset_in_page, size_t size,
 				     struct inode *ecryptfs_inode);
-struct page *ecryptfs_get_locked_page(struct file *file, loff_t index);
+struct page *ecryptfs_get_locked_page(struct inode *inode, loff_t index);
 int ecryptfs_exorcise_daemon(struct ecryptfs_daemon *daemon);
 int ecryptfs_find_daemon_by_euid(struct ecryptfs_daemon **daemon, uid_t euid,
 				 struct user_namespace *user_ns);
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 678172b61be2..e8fcf4e2ed7d 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -25,6 +25,7 @@
 
 #include <linux/file.h>
 #include <linux/poll.h>
+#include <linux/slab.h>
 #include <linux/mount.h>
 #include <linux/pagemap.h>
 #include <linux/security.h>
@@ -273,11 +274,9 @@ static int ecryptfs_release(struct inode *inode, struct file *file)
 }
 
 static int
-ecryptfs_fsync(struct file *file, struct dentry *dentry, int datasync)
+ecryptfs_fsync(struct file *file, int datasync)
 {
-	return vfs_fsync(ecryptfs_file_to_lower(file),
-			 ecryptfs_dentry_to_lower(dentry),
-			 datasync);
+	return vfs_fsync(ecryptfs_file_to_lower(file), datasync);
 }
 
 static int ecryptfs_fasync(int fd, struct file *file, int flag)
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 4a430ab4115c..31ef5252f0fe 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -31,6 +31,7 @@
 #include <linux/mount.h>
 #include <linux/crypto.h>
 #include <linux/fs_stack.h>
+#include <linux/slab.h>
 #include <asm/unaligned.h>
 #include "ecryptfs_kernel.h"
 
@@ -141,19 +142,10 @@ out:
 static int grow_file(struct dentry *ecryptfs_dentry)
 {
 	struct inode *ecryptfs_inode = ecryptfs_dentry->d_inode;
-	struct file fake_file;
-	struct ecryptfs_file_info tmp_file_info;
 	char zero_virt[] = { 0x00 };
 	int rc = 0;
 
-	memset(&fake_file, 0, sizeof(fake_file));
-	fake_file.f_path.dentry = ecryptfs_dentry;
-	memset(&tmp_file_info, 0, sizeof(tmp_file_info));
-	ecryptfs_set_file_private(&fake_file, &tmp_file_info);
-	ecryptfs_set_file_lower(
-		&fake_file,
-		ecryptfs_inode_to_private(ecryptfs_inode)->lower_file);
-	rc = ecryptfs_write(&fake_file, zero_virt, 0, 1);
+	rc = ecryptfs_write(ecryptfs_inode, zero_virt, 0, 1);
 	i_size_write(ecryptfs_inode, 0);
 	rc = ecryptfs_write_inode_size_to_metadata(ecryptfs_inode);
 	ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat.flags |=
@@ -323,6 +315,7 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
 	rc = ecryptfs_read_and_validate_header_region(page_virt,
 						      ecryptfs_dentry->d_inode);
 	if (rc) {
+		memset(page_virt, 0, PAGE_CACHE_SIZE);
 		rc = ecryptfs_read_and_validate_xattr_region(page_virt,
 							     ecryptfs_dentry);
 		if (rc) {
@@ -335,7 +328,7 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
 		ecryptfs_dentry->d_sb)->mount_crypt_stat;
 	if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) {
 		if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR)
-			file_size = (crypt_stat->num_header_bytes_at_front
+			file_size = (crypt_stat->metadata_size
 				     + i_size_read(lower_dentry->d_inode));
 		else
 			file_size = i_size_read(lower_dentry->d_inode);
@@ -387,9 +380,9 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
 	mutex_unlock(&lower_dir_dentry->d_inode->i_mutex);
 	if (IS_ERR(lower_dentry)) {
 		rc = PTR_ERR(lower_dentry);
-		printk(KERN_ERR "%s: lookup_one_len() returned [%d] on "
-		       "lower_dentry = [%s]\n", __func__, rc,
-		       ecryptfs_dentry->d_name.name);
+		ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned "
+				"[%d] on lower_dentry = [%s]\n", __func__, rc,
+				encrypted_and_encoded_name);
 		goto out_d_drop;
 	}
 	if (lower_dentry->d_inode)
@@ -416,9 +409,9 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
 	mutex_unlock(&lower_dir_dentry->d_inode->i_mutex);
 	if (IS_ERR(lower_dentry)) {
 		rc = PTR_ERR(lower_dentry);
-		printk(KERN_ERR "%s: lookup_one_len() returned [%d] on "
-		       "lower_dentry = [%s]\n", __func__, rc,
-		       encrypted_and_encoded_name);
+		ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned "
+				"[%d] on lower_dentry = [%s]\n", __func__, rc,
+				encrypted_and_encoded_name);
 		goto out_d_drop;
 	}
 lookup_and_interpose:
@@ -455,8 +448,8 @@ static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir,
 	rc = ecryptfs_interpose(lower_new_dentry, new_dentry, dir->i_sb, 0);
 	if (rc)
 		goto out_lock;
-	fsstack_copy_attr_times(dir, lower_new_dentry->d_inode);
-	fsstack_copy_inode_size(dir, lower_new_dentry->d_inode);
+	fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode);
+	fsstack_copy_inode_size(dir, lower_dir_dentry->d_inode);
 	old_dentry->d_inode->i_nlink =
 		ecryptfs_inode_to_lower(old_dentry->d_inode)->i_nlink;
 	i_size_write(new_dentry->d_inode, file_size_save);
@@ -647,38 +640,17 @@ out_lock:
 	return rc;
 }
 
-static int
-ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
+static int ecryptfs_readlink_lower(struct dentry *dentry, char **buf,
+				   size_t *bufsiz)
 {
+	struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
 	char *lower_buf;
-	size_t lower_bufsiz;
-	struct dentry *lower_dentry;
-	struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
-	char *plaintext_name;
-	size_t plaintext_name_size;
+	size_t lower_bufsiz = PATH_MAX;
 	mm_segment_t old_fs;
 	int rc;
 
-	lower_dentry = ecryptfs_dentry_to_lower(dentry);
-	if (!lower_dentry->d_inode->i_op->readlink) {
-		rc = -EINVAL;
-		goto out;
-	}
-	mount_crypt_stat = &ecryptfs_superblock_to_private(
-						dentry->d_sb)->mount_crypt_stat;
-	/*
-	 * If the lower filename is encrypted, it will result in a significantly
-	 * longer name.  If needed, truncate the name after decode and decrypt.
-	 */
-	if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)
-		lower_bufsiz = PATH_MAX;
-	else
-		lower_bufsiz = bufsiz;
-	/* Released in this function */
 	lower_buf = kmalloc(lower_bufsiz, GFP_KERNEL);
-	if (lower_buf == NULL) {
-		printk(KERN_ERR "%s: Out of memory whilst attempting to "
-		       "kmalloc [%zd] bytes\n", __func__, lower_bufsiz);
+	if (!lower_buf) {
 		rc = -ENOMEM;
 		goto out;
 	}
@@ -688,29 +660,31 @@ ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
 						   (char __user *)lower_buf,
 						   lower_bufsiz);
 	set_fs(old_fs);
-	if (rc >= 0) {
-		rc = ecryptfs_decode_and_decrypt_filename(&plaintext_name,
-							  &plaintext_name_size,
-							  dentry, lower_buf,
-							  rc);
-		if (rc) {
-			printk(KERN_ERR "%s: Error attempting to decode and "
-			       "decrypt filename; rc = [%d]\n", __func__,
-				rc);
-			goto out_free_lower_buf;
-		}
-		/* Check for bufsiz <= 0 done in sys_readlinkat() */
-		rc = copy_to_user(buf, plaintext_name,
-				  min((size_t) bufsiz, plaintext_name_size));
-		if (rc)
-			rc = -EFAULT;
-		else
-			rc = plaintext_name_size;
-		kfree(plaintext_name);
-		fsstack_copy_attr_atime(dentry->d_inode, lower_dentry->d_inode);
-	}
-out_free_lower_buf:
+	if (rc < 0)
+		goto out;
+	lower_bufsiz = rc;
+	rc = ecryptfs_decode_and_decrypt_filename(buf, bufsiz, dentry,
+						  lower_buf, lower_bufsiz);
+out:
 	kfree(lower_buf);
+	return rc;
+}
+
+static int
+ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
+{
+	char *kbuf;
+	size_t kbufsiz, copied;
+	int rc;
+
+	rc = ecryptfs_readlink_lower(dentry, &kbuf, &kbufsiz);
+	if (rc)
+		goto out;
+	copied = min_t(size_t, bufsiz, kbufsiz);
+	rc = copy_to_user(buf, kbuf, copied) ? -EFAULT : copied;
+	kfree(kbuf);
+	fsstack_copy_attr_atime(dentry->d_inode,
+				ecryptfs_dentry_to_lower(dentry)->d_inode);
 out:
 	return rc;
 }
@@ -768,7 +742,7 @@ upper_size_to_lower_size(struct ecryptfs_crypt_stat *crypt_stat,
 {
 	loff_t lower_size;
 
-	lower_size = crypt_stat->num_header_bytes_at_front;
+	lower_size = ecryptfs_lower_header_size(crypt_stat);
 	if (upper_size != 0) {
 		loff_t num_extents;
 
@@ -801,8 +775,6 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
 {
 	int rc = 0;
 	struct inode *inode = dentry->d_inode;
-	struct dentry *lower_dentry;
-	struct file fake_ecryptfs_file;
 	struct ecryptfs_crypt_stat *crypt_stat;
 	loff_t i_size = i_size_read(inode);
 	loff_t lower_size_before_truncate;
@@ -813,23 +785,6 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
 		goto out;
 	}
 	crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat;
-	/* Set up a fake ecryptfs file, this is used to interface with
-	 * the file in the underlying filesystem so that the
-	 * truncation has an effect there as well. */
-	memset(&fake_ecryptfs_file, 0, sizeof(fake_ecryptfs_file));
-	fake_ecryptfs_file.f_path.dentry = dentry;
-	/* Released at out_free: label */
-	ecryptfs_set_file_private(&fake_ecryptfs_file,
-				  kmem_cache_alloc(ecryptfs_file_info_cache,
-						   GFP_KERNEL));
-	if (unlikely(!ecryptfs_file_to_private(&fake_ecryptfs_file))) {
-		rc = -ENOMEM;
-		goto out;
-	}
-	lower_dentry = ecryptfs_dentry_to_lower(dentry);
-	ecryptfs_set_file_lower(
-		&fake_ecryptfs_file,
-		ecryptfs_inode_to_private(dentry->d_inode)->lower_file);
 	/* Switch on growing or shrinking file */
 	if (ia->ia_size > i_size) {
 		char zero[] = { 0x00 };
@@ -839,7 +794,7 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
 		 * this triggers code that will fill in 0's throughout
 		 * the intermediate portion of the previous end of the
 		 * file and the new and of the file */
-		rc = ecryptfs_write(&fake_ecryptfs_file, zero,
+		rc = ecryptfs_write(inode, zero,
 				    (ia->ia_size - 1), 1);
 	} else { /* ia->ia_size < i_size_read(inode) */
 		/* We're chopping off all the pages down to the page
@@ -850,12 +805,12 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
 				    - (ia->ia_size & ~PAGE_CACHE_MASK));
 
 		if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
-			rc = vmtruncate(inode, ia->ia_size);
+			rc = simple_setsize(inode, ia->ia_size);
 			if (rc)
-				goto out_free;
+				goto out;
 			lower_ia->ia_size = ia->ia_size;
 			lower_ia->ia_valid |= ATTR_SIZE;
-			goto out_free;
+			goto out;
 		}
 		if (num_zeros) {
 			char *zeros_virt;
@@ -863,25 +818,25 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
 			zeros_virt = kzalloc(num_zeros, GFP_KERNEL);
 			if (!zeros_virt) {
 				rc = -ENOMEM;
-				goto out_free;
+				goto out;
 			}
-			rc = ecryptfs_write(&fake_ecryptfs_file, zeros_virt,
+			rc = ecryptfs_write(inode, zeros_virt,
 					    ia->ia_size, num_zeros);
 			kfree(zeros_virt);
 			if (rc) {
 				printk(KERN_ERR "Error attempting to zero out "
 				       "the remainder of the end page on "
 				       "reducing truncate; rc = [%d]\n", rc);
-				goto out_free;
+				goto out;
 			}
 		}
-		vmtruncate(inode, ia->ia_size);
+		simple_setsize(inode, ia->ia_size);
 		rc = ecryptfs_write_inode_size_to_metadata(inode);
 		if (rc) {
 			printk(KERN_ERR	"Problem with "
 			       "ecryptfs_write_inode_size_to_metadata; "
 			       "rc = [%d]\n", rc);
-			goto out_free;
+			goto out;
 		}
 		/* We are reducing the size of the ecryptfs file, and need to
 		 * know if we need to reduce the size of the lower file. */
@@ -895,10 +850,6 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
 		} else
 			lower_ia->ia_valid &= ~ATTR_SIZE;
 	}
-out_free:
-	if (ecryptfs_file_to_private(&fake_ecryptfs_file))
-		kmem_cache_free(ecryptfs_file_info_cache,
-				ecryptfs_file_to_private(&fake_ecryptfs_file));
 out:
 	return rc;
 }
@@ -1015,6 +966,28 @@ out:
 	return rc;
 }
 
+int ecryptfs_getattr_link(struct vfsmount *mnt, struct dentry *dentry,
+			  struct kstat *stat)
+{
+	struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
+	int rc = 0;
+
+	mount_crypt_stat = &ecryptfs_superblock_to_private(
+						dentry->d_sb)->mount_crypt_stat;
+	generic_fillattr(dentry->d_inode, stat);
+	if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) {
+		char *target;
+		size_t targetsiz;
+
+		rc = ecryptfs_readlink_lower(dentry, &target, &targetsiz);
+		if (!rc) {
+			kfree(target);
+			stat->size = targetsiz;
+		}
+	}
+	return rc;
+}
+
 int ecryptfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
 		     struct kstat *stat)
 {
@@ -1039,7 +1012,7 @@ ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value,
 
 	lower_dentry = ecryptfs_dentry_to_lower(dentry);
 	if (!lower_dentry->d_inode->i_op->setxattr) {
-		rc = -ENOSYS;
+		rc = -EOPNOTSUPP;
 		goto out;
 	}
 	mutex_lock(&lower_dentry->d_inode->i_mutex);
@@ -1057,7 +1030,7 @@ ecryptfs_getxattr_lower(struct dentry *lower_dentry, const char *name,
 	int rc = 0;
 
 	if (!lower_dentry->d_inode->i_op->getxattr) {
-		rc = -ENOSYS;
+		rc = -EOPNOTSUPP;
 		goto out;
 	}
 	mutex_lock(&lower_dentry->d_inode->i_mutex);
@@ -1084,7 +1057,7 @@ ecryptfs_listxattr(struct dentry *dentry, char *list, size_t size)
 
 	lower_dentry = ecryptfs_dentry_to_lower(dentry);
 	if (!lower_dentry->d_inode->i_op->listxattr) {
-		rc = -ENOSYS;
+		rc = -EOPNOTSUPP;
 		goto out;
 	}
 	mutex_lock(&lower_dentry->d_inode->i_mutex);
@@ -1101,7 +1074,7 @@ static int ecryptfs_removexattr(struct dentry *dentry, const char *name)
 
 	lower_dentry = ecryptfs_dentry_to_lower(dentry);
 	if (!lower_dentry->d_inode->i_op->removexattr) {
-		rc = -ENOSYS;
+		rc = -EOPNOTSUPP;
 		goto out;
 	}
 	mutex_lock(&lower_dentry->d_inode->i_mutex);
@@ -1132,6 +1105,7 @@ const struct inode_operations ecryptfs_symlink_iops = {
 	.put_link = ecryptfs_put_link,
 	.permission = ecryptfs_permission,
 	.setattr = ecryptfs_setattr,
+	.getattr = ecryptfs_getattr_link,
 	.setxattr = ecryptfs_setxattr,
 	.getxattr = ecryptfs_getxattr,
 	.listxattr = ecryptfs_listxattr,
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index a0a7847567e9..89c5476506ef 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -32,6 +32,7 @@
 #include <linux/random.h>
 #include <linux/crypto.h>
 #include <linux/scatterlist.h>
+#include <linux/slab.h>
 #include "ecryptfs_kernel.h"
 
 /**
diff --git a/fs/ecryptfs/kthread.c b/fs/ecryptfs/kthread.c
index e14cf7e588db..d8c3a373aafa 100644
--- a/fs/ecryptfs/kthread.c
+++ b/fs/ecryptfs/kthread.c
@@ -22,6 +22,7 @@
 
 #include <linux/kthread.h>
 #include <linux/freezer.h>
+#include <linux/slab.h>
 #include <linux/wait.h>
 #include <linux/mount.h>
 #include "ecryptfs_kernel.h"
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index ea2f92101dfe..cbd4e18adb20 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -35,6 +35,7 @@
 #include <linux/key.h>
 #include <linux/parser.h>
 #include <linux/fs_stack.h>
+#include <linux/slab.h>
 #include "ecryptfs_kernel.h"
 
 /**
@@ -280,7 +281,7 @@ static void ecryptfs_init_mount_crypt_stat(
  *
  * Returns zero on success; non-zero on error
  */
-static int ecryptfs_parse_options(struct super_block *sb, char *options)
+static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options)
 {
 	char *p;
 	int rc = 0;
@@ -292,7 +293,7 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
 	int fn_cipher_key_bytes;
 	int fn_cipher_key_bytes_set = 0;
 	struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
-		&ecryptfs_superblock_to_private(sb)->mount_crypt_stat;
+		&sbi->mount_crypt_stat;
 	substring_t args[MAX_OPT_ARGS];
 	int token;
 	char *sig_src;
@@ -482,60 +483,7 @@ out:
 }
 
 struct kmem_cache *ecryptfs_sb_info_cache;
-
-/**
- * ecryptfs_fill_super
- * @sb: The ecryptfs super block
- * @raw_data: The options passed to mount
- * @silent: Not used but required by function prototype
- *
- * Sets up what we can of the sb, rest is done in ecryptfs_read_super
- *
- * Returns zero on success; non-zero otherwise
- */
-static int
-ecryptfs_fill_super(struct super_block *sb, void *raw_data, int silent)
-{
-	int rc = 0;
-
-	/* Released in ecryptfs_put_super() */
-	ecryptfs_set_superblock_private(sb,
-					kmem_cache_zalloc(ecryptfs_sb_info_cache,
-							 GFP_KERNEL));
-	if (!ecryptfs_superblock_to_private(sb)) {
-		ecryptfs_printk(KERN_WARNING, "Out of memory\n");
-		rc = -ENOMEM;
-		goto out;
-	}
-	sb->s_op = &ecryptfs_sops;
-	/* Released through deactivate_super(sb) from get_sb_nodev */
-	sb->s_root = d_alloc(NULL, &(const struct qstr) {
-			     .hash = 0,.name = "/",.len = 1});
-	if (!sb->s_root) {
-		ecryptfs_printk(KERN_ERR, "d_alloc failed\n");
-		rc = -ENOMEM;
-		goto out;
-	}
-	sb->s_root->d_op = &ecryptfs_dops;
-	sb->s_root->d_sb = sb;
-	sb->s_root->d_parent = sb->s_root;
-	/* Released in d_release when dput(sb->s_root) is called */
-	/* through deactivate_super(sb) from get_sb_nodev() */
-	ecryptfs_set_dentry_private(sb->s_root,
-				    kmem_cache_zalloc(ecryptfs_dentry_info_cache,
-						     GFP_KERNEL));
-	if (!ecryptfs_dentry_to_private(sb->s_root)) {
-		ecryptfs_printk(KERN_ERR,
-				"dentry_info_cache alloc failed\n");
-		rc = -ENOMEM;
-		goto out;
-	}
-	rc = 0;
-out:
-	/* Should be able to rely on deactivate_super called from
-	 * get_sb_nodev */
-	return rc;
-}
+static struct file_system_type ecryptfs_fs_type;
 
 /**
  * ecryptfs_read_super
@@ -556,6 +504,13 @@ static int ecryptfs_read_super(struct super_block *sb, const char *dev_name)
 		ecryptfs_printk(KERN_WARNING, "path_lookup() failed\n");
 		goto out;
 	}
+	if (path.dentry->d_sb->s_type == &ecryptfs_fs_type) {
+		rc = -EINVAL;
+		printk(KERN_ERR "Mount on filesystem of type "
+			"eCryptfs explicitly disallowed due to "
+			"known incompatibilities\n");
+		goto out_free;
+	}
 	ecryptfs_set_superblock_lower(sb, path.dentry->d_sb);
 	sb->s_maxbytes = path.dentry->d_sb->s_maxbytes;
 	sb->s_blocksize = path.dentry->d_sb->s_blocksize;
@@ -579,11 +534,8 @@ out:
  * @dev_name: The path to mount over
  * @raw_data: The options passed into the kernel
  *
- * The whole ecryptfs_get_sb process is broken into 4 functions:
+ * The whole ecryptfs_get_sb process is broken into 3 functions:
  * ecryptfs_parse_options(): handle options passed to ecryptfs, if any
- * ecryptfs_fill_super(): used by get_sb_nodev, fills out the super_block
- *                        with as much information as it can before needing
- *                        the lower filesystem.
  * ecryptfs_read_super(): this accesses the lower filesystem and uses
  *                        ecryptfs_interpose to perform most of the linking
  * ecryptfs_interpose(): links the lower filesystem into ecryptfs (inode.c)
@@ -592,30 +544,78 @@ static int ecryptfs_get_sb(struct file_system_type *fs_type, int flags,
 			const char *dev_name, void *raw_data,
 			struct vfsmount *mnt)
 {
+	struct super_block *s;
+	struct ecryptfs_sb_info *sbi;
+	struct ecryptfs_dentry_info *root_info;
+	const char *err = "Getting sb failed";
 	int rc;
-	struct super_block *sb;
 
-	rc = get_sb_nodev(fs_type, flags, raw_data, ecryptfs_fill_super, mnt);
-	if (rc < 0) {
-		printk(KERN_ERR "Getting sb failed; rc = [%d]\n", rc);
+	sbi = kmem_cache_zalloc(ecryptfs_sb_info_cache, GFP_KERNEL);
+	if (!sbi) {
+		rc = -ENOMEM;
 		goto out;
 	}
-	sb = mnt->mnt_sb;
-	rc = ecryptfs_parse_options(sb, raw_data);
+
+	rc = ecryptfs_parse_options(sbi, raw_data);
 	if (rc) {
-		printk(KERN_ERR "Error parsing options; rc = [%d]\n", rc);
-		goto out_abort;
+		err = "Error parsing options";
+		goto out;
+	}
+
+	s = sget(fs_type, NULL, set_anon_super, NULL);
+	if (IS_ERR(s)) {
+		rc = PTR_ERR(s);
+		goto out;
 	}
-	rc = ecryptfs_read_super(sb, dev_name);
+
+	s->s_flags = flags;
+	rc = bdi_setup_and_register(&sbi->bdi, "ecryptfs", BDI_CAP_MAP_COPY);
 	if (rc) {
-		printk(KERN_ERR "Reading sb failed; rc = [%d]\n", rc);
-		goto out_abort;
+		deactivate_locked_super(s);
+		goto out;
 	}
-	goto out;
-out_abort:
-	dput(sb->s_root); /* aka mnt->mnt_root, as set by get_sb_nodev() */
-	deactivate_locked_super(sb);
+
+	ecryptfs_set_superblock_private(s, sbi);
+	s->s_bdi = &sbi->bdi;
+
+	/* ->kill_sb() will take care of sbi after that point */
+	sbi = NULL;
+	s->s_op = &ecryptfs_sops;
+
+	rc = -ENOMEM;
+	s->s_root = d_alloc(NULL, &(const struct qstr) {
+			     .hash = 0,.name = "/",.len = 1});
+	if (!s->s_root) {
+		deactivate_locked_super(s);
+		goto out;
+	}
+	s->s_root->d_op = &ecryptfs_dops;
+	s->s_root->d_sb = s;
+	s->s_root->d_parent = s->s_root;
+
+	root_info = kmem_cache_zalloc(ecryptfs_dentry_info_cache, GFP_KERNEL);
+	if (!root_info) {
+		deactivate_locked_super(s);
+		goto out;
+	}
+	/* ->kill_sb() will take care of root_info */
+	ecryptfs_set_dentry_private(s->s_root, root_info);
+	s->s_flags |= MS_ACTIVE;
+	rc = ecryptfs_read_super(s, dev_name);
+	if (rc) {
+		deactivate_locked_super(s);
+		err = "Reading sb failed";
+		goto out;
+	}
+	simple_set_mnt(mnt, s);
+	return 0;
+
 out:
+	if (sbi) {
+		ecryptfs_destroy_mount_crypt_stat(&sbi->mount_crypt_stat);
+		kmem_cache_free(ecryptfs_sb_info_cache, sbi);
+	}
+	printk(KERN_ERR "%s; rc = [%d]\n", err, rc);
 	return rc;
 }
 
@@ -624,11 +624,16 @@ out:
  * @sb: The ecryptfs super block
  *
  * Used to bring the superblock down and free the private data.
- * Private data is free'd in ecryptfs_put_super()
  */
 static void ecryptfs_kill_block_super(struct super_block *sb)
 {
-	generic_shutdown_super(sb);
+	struct ecryptfs_sb_info *sb_info = ecryptfs_superblock_to_private(sb);
+	kill_anon_super(sb);
+	if (!sb_info)
+		return;
+	ecryptfs_destroy_mount_crypt_stat(&sb_info->mount_crypt_stat);
+	bdi_destroy(&sb_info->bdi);
+	kmem_cache_free(ecryptfs_sb_info_cache, sb_info);
 }
 
 static struct file_system_type ecryptfs_fs_type = {
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index f1c17e87c5fb..46c4dd8dfcc3 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -20,6 +20,7 @@
  * 02111-1307, USA.
  */
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/user_namespace.h>
 #include <linux/nsproxy.h>
 #include "ecryptfs_kernel.h"
@@ -30,9 +31,9 @@ static struct mutex ecryptfs_msg_ctx_lists_mux;
 
 static struct hlist_head *ecryptfs_daemon_hash;
 struct mutex ecryptfs_daemon_hash_mux;
-static int ecryptfs_hash_buckets;
+static int ecryptfs_hash_bits;
 #define ecryptfs_uid_hash(uid) \
-        hash_long((unsigned long)uid, ecryptfs_hash_buckets)
+        hash_long((unsigned long)uid, ecryptfs_hash_bits)
 
 static u32 ecryptfs_msg_counter;
 static struct ecryptfs_msg_ctx *ecryptfs_msg_ctx_arr;
@@ -485,18 +486,19 @@ int ecryptfs_init_messaging(void)
 	}
 	mutex_init(&ecryptfs_daemon_hash_mux);
 	mutex_lock(&ecryptfs_daemon_hash_mux);
-	ecryptfs_hash_buckets = 1;
-	while (ecryptfs_number_of_users >> ecryptfs_hash_buckets)
-		ecryptfs_hash_buckets++;
+	ecryptfs_hash_bits = 1;
+	while (ecryptfs_number_of_users >> ecryptfs_hash_bits)
+		ecryptfs_hash_bits++;
 	ecryptfs_daemon_hash = kmalloc((sizeof(struct hlist_head)
-					* ecryptfs_hash_buckets), GFP_KERNEL);
+					* (1 << ecryptfs_hash_bits)),
+				       GFP_KERNEL);
 	if (!ecryptfs_daemon_hash) {
 		rc = -ENOMEM;
 		printk(KERN_ERR "%s: Failed to allocate memory\n", __func__);
 		mutex_unlock(&ecryptfs_daemon_hash_mux);
 		goto out;
 	}
-	for (i = 0; i < ecryptfs_hash_buckets; i++)
+	for (i = 0; i < (1 << ecryptfs_hash_bits); i++)
 		INIT_HLIST_HEAD(&ecryptfs_daemon_hash[i]);
 	mutex_unlock(&ecryptfs_daemon_hash_mux);
 	ecryptfs_msg_ctx_arr = kmalloc((sizeof(struct ecryptfs_msg_ctx)
@@ -553,7 +555,7 @@ void ecryptfs_release_messaging(void)
 		int i;
 
 		mutex_lock(&ecryptfs_daemon_hash_mux);
-		for (i = 0; i < ecryptfs_hash_buckets; i++) {
+		for (i = 0; i < (1 << ecryptfs_hash_bits); i++) {
 			int rc;
 
 			hlist_for_each_entry(daemon, elem,
diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c
index 4ec8f61ccf5a..3745f612bcd4 100644
--- a/fs/ecryptfs/miscdev.c
+++ b/fs/ecryptfs/miscdev.c
@@ -24,6 +24,7 @@
 #include <linux/random.h>
 #include <linux/miscdevice.h>
 #include <linux/poll.h>
+#include <linux/slab.h>
 #include <linux/wait.h>
 #include <linux/module.h>
 #include "ecryptfs_kernel.h"
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index df4ce99d0597..b1d82756544b 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -32,6 +32,7 @@
 #include <linux/file.h>
 #include <linux/crypto.h>
 #include <linux/scatterlist.h>
+#include <linux/slab.h>
 #include <asm/unaligned.h>
 #include "ecryptfs_kernel.h"
 
@@ -43,17 +44,9 @@
  * Returns locked and up-to-date page (if ok), with increased
  * refcnt.
  */
-struct page *ecryptfs_get_locked_page(struct file *file, loff_t index)
+struct page *ecryptfs_get_locked_page(struct inode *inode, loff_t index)
 {
-	struct dentry *dentry;
-	struct inode *inode;
-	struct address_space *mapping;
-	struct page *page;
-
-	dentry = file->f_path.dentry;
-	inode = dentry->d_inode;
-	mapping = inode->i_mapping;
-	page = read_mapping_page(mapping, index, (void *)file);
+	struct page *page = read_mapping_page(inode->i_mapping, index, NULL);
 	if (!IS_ERR(page))
 		lock_page(page);
 	return page;
@@ -82,6 +75,19 @@ out:
 	return rc;
 }
 
+static void strip_xattr_flag(char *page_virt,
+			     struct ecryptfs_crypt_stat *crypt_stat)
+{
+	if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) {
+		size_t written;
+
+		crypt_stat->flags &= ~ECRYPTFS_METADATA_IN_XATTR;
+		ecryptfs_write_crypt_stat_flags(page_virt, crypt_stat,
+						&written);
+		crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR;
+	}
+}
+
 /**
  *   Header Extent:
  *     Octets 0-7:        Unencrypted file size (big-endian)
@@ -97,19 +103,6 @@ out:
  *                        (big-endian)
  *     Octet  26:         Begin RFC 2440 authentication token packet set
  */
-static void set_header_info(char *page_virt,
-			    struct ecryptfs_crypt_stat *crypt_stat)
-{
-	size_t written;
-	size_t save_num_header_bytes_at_front =
-		crypt_stat->num_header_bytes_at_front;
-
-	crypt_stat->num_header_bytes_at_front =
-		ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE;
-	ecryptfs_write_header_metadata(page_virt + 20, crypt_stat, &written);
-	crypt_stat->num_header_bytes_at_front =
-		save_num_header_bytes_at_front;
-}
 
 /**
  * ecryptfs_copy_up_encrypted_with_header
@@ -135,8 +128,7 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page,
 					   * num_extents_per_page)
 					  + extent_num_in_page);
 		size_t num_header_extents_at_front =
-			(crypt_stat->num_header_bytes_at_front
-			 / crypt_stat->extent_size);
+			(crypt_stat->metadata_size / crypt_stat->extent_size);
 
 		if (view_extent_num < num_header_extents_at_front) {
 			/* This is a header extent */
@@ -146,9 +138,14 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page,
 			memset(page_virt, 0, PAGE_CACHE_SIZE);
 			/* TODO: Support more than one header extent */
 			if (view_extent_num == 0) {
+				size_t written;
+
 				rc = ecryptfs_read_xattr_region(
 					page_virt, page->mapping->host);
-				set_header_info(page_virt, crypt_stat);
+				strip_xattr_flag(page_virt + 16, crypt_stat);
+				ecryptfs_write_header_metadata(page_virt + 20,
+							       crypt_stat,
+							       &written);
 			}
 			kunmap_atomic(page_virt, KM_USER0);
 			flush_dcache_page(page);
@@ -161,7 +158,7 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page,
 			/* This is an encrypted data extent */
 			loff_t lower_offset =
 				((view_extent_num * crypt_stat->extent_size)
-				 - crypt_stat->num_header_bytes_at_front);
+				 - crypt_stat->metadata_size);
 
 			rc = ecryptfs_read_lower_page_segment(
 				page, (lower_offset >> PAGE_CACHE_SHIFT),
@@ -193,7 +190,7 @@ out:
 static int ecryptfs_readpage(struct file *file, struct page *page)
 {
 	struct ecryptfs_crypt_stat *crypt_stat =
-		&ecryptfs_inode_to_private(file->f_path.dentry->d_inode)->crypt_stat;
+		&ecryptfs_inode_to_private(page->mapping->host)->crypt_stat;
 	int rc = 0;
 
 	if (!crypt_stat
@@ -295,8 +292,7 @@ static int ecryptfs_write_begin(struct file *file,
 
 	if (!PageUptodate(page)) {
 		struct ecryptfs_crypt_stat *crypt_stat =
-			&ecryptfs_inode_to_private(
-				file->f_path.dentry->d_inode)->crypt_stat;
+			&ecryptfs_inode_to_private(mapping->host)->crypt_stat;
 
 		if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)
 		    || (crypt_stat->flags & ECRYPTFS_NEW_FILE)) {
@@ -482,7 +478,7 @@ static int ecryptfs_write_end(struct file *file,
 	unsigned to = from + copied;
 	struct inode *ecryptfs_inode = mapping->host;
 	struct ecryptfs_crypt_stat *crypt_stat =
-		&ecryptfs_inode_to_private(file->f_path.dentry->d_inode)->crypt_stat;
+		&ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat;
 	int rc;
 
 	if (crypt_stat->flags & ECRYPTFS_NEW_FILE) {
diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c
index 0cc4fafd6552..db184ef15d3d 100644
--- a/fs/ecryptfs/read_write.c
+++ b/fs/ecryptfs/read_write.c
@@ -93,7 +93,7 @@ int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode,
 
 /**
  * ecryptfs_write
- * @ecryptfs_file: The eCryptfs file into which to write
+ * @ecryptfs_inode: The eCryptfs file into which to write
  * @data: Virtual address where data to write is located
  * @offset: Offset in the eCryptfs file at which to begin writing the
  *          data from @data
@@ -109,12 +109,11 @@ int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode,
  *
  * Returns zero on success; non-zero otherwise
  */
-int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset,
+int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset,
 		   size_t size)
 {
 	struct page *ecryptfs_page;
 	struct ecryptfs_crypt_stat *crypt_stat;
-	struct inode *ecryptfs_inode = ecryptfs_file->f_dentry->d_inode;
 	char *ecryptfs_page_virt;
 	loff_t ecryptfs_file_size = i_size_read(ecryptfs_inode);
 	loff_t data_offset = 0;
@@ -145,7 +144,7 @@ int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset,
 			if (num_bytes > total_remaining_zeros)
 				num_bytes = total_remaining_zeros;
 		}
-		ecryptfs_page = ecryptfs_get_locked_page(ecryptfs_file,
+		ecryptfs_page = ecryptfs_get_locked_page(ecryptfs_inode,
 							 ecryptfs_page_idx);
 		if (IS_ERR(ecryptfs_page)) {
 			rc = PTR_ERR(ecryptfs_page);
@@ -302,10 +301,10 @@ int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs,
 int ecryptfs_read(char *data, loff_t offset, size_t size,
 		  struct file *ecryptfs_file)
 {
+	struct inode *ecryptfs_inode = ecryptfs_file->f_dentry->d_inode;
 	struct page *ecryptfs_page;
 	char *ecryptfs_page_virt;
-	loff_t ecryptfs_file_size =
-		i_size_read(ecryptfs_file->f_dentry->d_inode);
+	loff_t ecryptfs_file_size = i_size_read(ecryptfs_inode);
 	loff_t data_offset = 0;
 	loff_t pos;
 	int rc = 0;
@@ -327,7 +326,7 @@ int ecryptfs_read(char *data, loff_t offset, size_t size,
 
 		if (num_bytes > total_remaining_bytes)
 			num_bytes = total_remaining_bytes;
-		ecryptfs_page = ecryptfs_get_locked_page(ecryptfs_file,
+		ecryptfs_page = ecryptfs_get_locked_page(ecryptfs_inode,
 							 ecryptfs_page_idx);
 		if (IS_ERR(ecryptfs_page)) {
 			rc = PTR_ERR(ecryptfs_page);
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index b15a43a80ab7..0435886e4a9f 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -26,6 +26,7 @@
 #include <linux/fs.h>
 #include <linux/mount.h>
 #include <linux/key.h>
+#include <linux/slab.h>
 #include <linux/seq_file.h>
 #include <linux/smp_lock.h>
 #include <linux/file.h>
@@ -85,7 +86,6 @@ static void ecryptfs_destroy_inode(struct inode *inode)
 		if (lower_dentry->d_inode) {
 			fput(inode_info->lower_file);
 			inode_info->lower_file = NULL;
-			d_drop(lower_dentry);
 		}
 	}
 	ecryptfs_destroy_crypt_stat(&inode_info->crypt_stat);
@@ -109,26 +109,6 @@ void ecryptfs_init_inode(struct inode *inode, struct inode *lower_inode)
 }
 
 /**
- * ecryptfs_put_super
- * @sb: Pointer to the ecryptfs super block
- *
- * Final actions when unmounting a file system.
- * This will handle deallocation and release of our private data.
- */
-static void ecryptfs_put_super(struct super_block *sb)
-{
-	struct ecryptfs_sb_info *sb_info = ecryptfs_superblock_to_private(sb);
-
-	lock_kernel();
-
-	ecryptfs_destroy_mount_crypt_stat(&sb_info->mount_crypt_stat);
-	kmem_cache_free(ecryptfs_sb_info_cache, sb_info);
-	ecryptfs_set_superblock_private(sb, NULL);
-
-	unlock_kernel();
-}
-
-/**
  * ecryptfs_statfs
  * @sb: The ecryptfs super block
  * @buf: The struct kstatfs to fill in with stats
@@ -202,7 +182,6 @@ const struct super_operations ecryptfs_sops = {
 	.alloc_inode = ecryptfs_alloc_inode,
 	.destroy_inode = ecryptfs_destroy_inode,
 	.drop_inode = generic_delete_inode,
-	.put_super = ecryptfs_put_super,
 	.statfs = ecryptfs_statfs,
 	.remount_fs = NULL,
 	.clear_inode = ecryptfs_clear_inode,
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 7758cc382ef0..6bd3f76fdf88 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -11,6 +11,7 @@
 #include <linux/fs.h>
 #include <linux/sched.h>
 #include <linux/kernel.h>
+#include <linux/slab.h>
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/anon_inodes.h>
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index bd056a5b4efc..3817149919cb 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1140,8 +1140,7 @@ retry:
 		 * ep_poll_callback() when events will become available.
 		 */
 		init_waitqueue_entry(&wait, current);
-		wait.flags |= WQ_FLAG_EXCLUSIVE;
-		__add_wait_queue(&ep->wq, &wait);
+		__add_wait_queue_exclusive(&ep->wq, &wait);
 
 		for (;;) {
 			/*
diff --git a/fs/exec.c b/fs/exec.c
index 675c3f44c2ea..e19de6a80339 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -195,7 +195,7 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
 		 *    to work from.
 		 */
 		rlim = current->signal->rlim;
-		if (size > rlim[RLIMIT_STACK].rlim_cur / 4) {
+		if (size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur) / 4) {
 			put_page(page);
 			return NULL;
 		}
@@ -242,10 +242,12 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
 	 * use STACK_TOP because that can depend on attributes which aren't
 	 * configured yet.
 	 */
+	BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP);
 	vma->vm_end = STACK_TOP_MAX;
 	vma->vm_start = vma->vm_end - PAGE_SIZE;
-	vma->vm_flags = VM_STACK_FLAGS;
+	vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP;
 	vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
+	INIT_LIST_HEAD(&vma->anon_vma_chain);
 	err = insert_vm_struct(mm, vma);
 	if (err)
 		goto err;
@@ -516,7 +518,8 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
 	/*
 	 * cover the whole range: [new_start, old_end)
 	 */
-	vma_adjust(vma, new_start, old_end, vma->vm_pgoff, NULL);
+	if (vma_adjust(vma, new_start, old_end, vma->vm_pgoff, NULL))
+		return -ENOMEM;
 
 	/*
 	 * move the page tables downwards, on failure we rely on
@@ -547,15 +550,13 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
 	tlb_finish_mmu(tlb, new_end, old_end);
 
 	/*
-	 * shrink the vma to just the new range.
+	 * Shrink the vma to just the new range.  Always succeeds.
 	 */
 	vma_adjust(vma, new_start, new_end, vma->vm_pgoff, NULL);
 
 	return 0;
 }
 
-#define EXTRA_STACK_VM_PAGES	20	/* random */
-
 /*
  * Finalizes the stack vm_area_struct. The flags and permissions are updated,
  * the stack is optionally relocated, and some extra space is added.
@@ -571,10 +572,13 @@ int setup_arg_pages(struct linux_binprm *bprm,
 	struct vm_area_struct *prev = NULL;
 	unsigned long vm_flags;
 	unsigned long stack_base;
+	unsigned long stack_size;
+	unsigned long stack_expand;
+	unsigned long rlim_stack;
 
 #ifdef CONFIG_STACK_GROWSUP
 	/* Limit stack size to 1GB */
-	stack_base = current->signal->rlim[RLIMIT_STACK].rlim_max;
+	stack_base = rlimit_max(RLIMIT_STACK);
 	if (stack_base > (1 << 30))
 		stack_base = 1 << 30;
 
@@ -613,6 +617,7 @@ int setup_arg_pages(struct linux_binprm *bprm,
 	else if (executable_stack == EXSTACK_DISABLE_X)
 		vm_flags &= ~VM_EXEC;
 	vm_flags |= mm->def_flags;
+	vm_flags |= VM_STACK_INCOMPLETE_SETUP;
 
 	ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end,
 			vm_flags);
@@ -627,10 +632,26 @@ int setup_arg_pages(struct linux_binprm *bprm,
 			goto out_unlock;
 	}
 
+	/* mprotect_fixup is overkill to remove the temporary stack flags */
+	vma->vm_flags &= ~VM_STACK_INCOMPLETE_SETUP;
+
+	stack_expand = 131072UL; /* randomly 32*4k (or 2*64k) pages */
+	stack_size = vma->vm_end - vma->vm_start;
+	/*
+	 * Align this down to a page boundary as expand_stack
+	 * will align it up.
+	 */
+	rlim_stack = rlimit(RLIMIT_STACK) & PAGE_MASK;
 #ifdef CONFIG_STACK_GROWSUP
-	stack_base = vma->vm_end + EXTRA_STACK_VM_PAGES * PAGE_SIZE;
+	if (stack_size + stack_expand > rlim_stack)
+		stack_base = vma->vm_start + rlim_stack;
+	else
+		stack_base = vma->vm_end + stack_expand;
 #else
-	stack_base = vma->vm_start - EXTRA_STACK_VM_PAGES * PAGE_SIZE;
+	if (stack_size + stack_expand > rlim_stack)
+		stack_base = vma->vm_end - rlim_stack;
+	else
+		stack_base = vma->vm_start - stack_expand;
 #endif
 	ret = expand_stack(vma, stack_base);
 	if (ret)
@@ -702,6 +723,7 @@ static int exec_mmap(struct mm_struct *mm)
 	/* Notify parent that we're no longer interested in the old VM */
 	tsk = current;
 	old_mm = current->mm;
+	sync_mm_rss(tsk, old_mm);
 	mm_release(tsk, old_mm);
 
 	if (old_mm) {
@@ -746,7 +768,6 @@ static int de_thread(struct task_struct *tsk)
 	struct signal_struct *sig = tsk->signal;
 	struct sighand_struct *oldsighand = tsk->sighand;
 	spinlock_t *lock = &oldsighand->siglock;
-	int count;
 
 	if (thread_group_empty(tsk))
 		goto no_thread_group;
@@ -763,13 +784,13 @@ static int de_thread(struct task_struct *tsk)
 		spin_unlock_irq(lock);
 		return -EAGAIN;
 	}
+
 	sig->group_exit_task = tsk;
-	zap_other_threads(tsk);
+	sig->notify_count = zap_other_threads(tsk);
+	if (!thread_group_leader(tsk))
+		sig->notify_count--;
 
-	/* Account for the thread group leader hanging around: */
-	count = thread_group_leader(tsk) ? 1 : 2;
-	sig->notify_count = count;
-	while (atomic_read(&sig->count) > count) {
+	while (sig->notify_count) {
 		__set_current_state(TASK_UNINTERRUPTIBLE);
 		spin_unlock_irq(lock);
 		schedule();
@@ -961,6 +982,11 @@ int flush_old_exec(struct linux_binprm * bprm)
 		goto out;
 
 	bprm->mm = NULL;		/* We're using it now */
+
+	current->flags &= ~PF_RANDOMIZE;
+	flush_thread();
+	current->personality &= ~bprm->per_clear;
+
 	return 0;
 
 out:
@@ -997,9 +1023,6 @@ void setup_new_exec(struct linux_binprm * bprm)
 	tcomm[i] = '\0';
 	set_task_comm(current, tcomm);
 
-	current->flags &= ~PF_RANDOMIZE;
-	flush_thread();
-
 	/* Set the new mm task size. We have to do that late because it may
 	 * depend on TIF_32BIT which is only updated in flush_thread() on
 	 * some architectures like powerpc
@@ -1015,8 +1038,6 @@ void setup_new_exec(struct linux_binprm * bprm)
 		set_dumpable(current->mm, suid_dumpable);
 	}
 
-	current->personality &= ~bprm->per_clear;
-
 	/*
 	 * Flush performance counters when crossing a
 	 * security domain:
@@ -1370,8 +1391,6 @@ int do_execve(char * filename,
 	if (retval < 0)
 		goto out;
 
-	current->stack_start = current->mm->start_stack;
-
 	/* execve succeeded */
 	current->fs->in_exec = 0;
 	current->in_execve = 0;
@@ -1516,7 +1535,7 @@ static int format_corename(char *corename, long signr)
 			/* core limit size */
 			case 'c':
 				rc = snprintf(out_ptr, out_end - out_ptr,
-					      "%lu", current->signal->rlim[RLIMIT_CORE].rlim_cur);
+					      "%lu", rlimit(RLIMIT_CORE));
 				if (rc > out_end - out_ptr)
 					goto out;
 				out_ptr += rc;
@@ -1544,12 +1563,13 @@ out:
 	return ispipe;
 }
 
-static int zap_process(struct task_struct *start)
+static int zap_process(struct task_struct *start, int exit_code)
 {
 	struct task_struct *t;
 	int nr = 0;
 
 	start->signal->flags = SIGNAL_GROUP_EXIT;
+	start->signal->group_exit_code = exit_code;
 	start->signal->group_stop_count = 0;
 
 	t = start;
@@ -1574,8 +1594,7 @@ static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
 	spin_lock_irq(&tsk->sighand->siglock);
 	if (!signal_group_exit(tsk->signal)) {
 		mm->core_state = core_state;
-		tsk->signal->group_exit_code = exit_code;
-		nr = zap_process(tsk);
+		nr = zap_process(tsk, exit_code);
 	}
 	spin_unlock_irq(&tsk->sighand->siglock);
 	if (unlikely(nr < 0))
@@ -1624,7 +1643,7 @@ static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
 			if (p->mm) {
 				if (unlikely(p->mm == mm)) {
 					lock_task_sighand(p, &flags);
-					nr += zap_process(p);
+					nr += zap_process(p, exit_code);
 					unlock_task_sighand(p, &flags);
 				}
 				break;
@@ -1642,12 +1661,15 @@ static int coredump_wait(int exit_code, struct core_state *core_state)
 	struct task_struct *tsk = current;
 	struct mm_struct *mm = tsk->mm;
 	struct completion *vfork_done;
-	int core_waiters;
+	int core_waiters = -EBUSY;
 
 	init_completion(&core_state->startup);
 	core_state->dumper.task = tsk;
 	core_state->dumper.next = NULL;
-	core_waiters = zap_threads(tsk, mm, core_state, exit_code);
+
+	down_write(&mm->mmap_sem);
+	if (!mm->core_state)
+		core_waiters = zap_threads(tsk, mm, core_state, exit_code);
 	up_write(&mm->mmap_sem);
 
 	if (unlikely(core_waiters < 0))
@@ -1731,14 +1753,19 @@ void set_dumpable(struct mm_struct *mm, int value)
 	}
 }
 
-int get_dumpable(struct mm_struct *mm)
+static int __get_dumpable(unsigned long mm_flags)
 {
 	int ret;
 
-	ret = mm->flags & 0x3;
+	ret = mm_flags & MMF_DUMPABLE_MASK;
 	return (ret >= 2) ? 2 : ret;
 }
 
+int get_dumpable(struct mm_struct *mm)
+{
+	return __get_dumpable(mm->flags);
+}
+
 static void wait_for_dump_helpers(struct file *file)
 {
 	struct pipe_inode_info *pipe;
@@ -1762,26 +1789,72 @@ static void wait_for_dump_helpers(struct file *file)
 }
 
 
+/*
+ * uhm_pipe_setup
+ * helper function to customize the process used
+ * to collect the core in userspace.  Specifically
+ * it sets up a pipe and installs it as fd 0 (stdin)
+ * for the process.  Returns 0 on success, or
+ * PTR_ERR on failure.
+ * Note that it also sets the core limit to 1.  This
+ * is a special value that we use to trap recursive
+ * core dumps
+ */
+static int umh_pipe_setup(struct subprocess_info *info)
+{
+	struct file *rp, *wp;
+	struct fdtable *fdt;
+	struct coredump_params *cp = (struct coredump_params *)info->data;
+	struct files_struct *cf = current->files;
+
+	wp = create_write_pipe(0);
+	if (IS_ERR(wp))
+		return PTR_ERR(wp);
+
+	rp = create_read_pipe(wp, 0);
+	if (IS_ERR(rp)) {
+		free_write_pipe(wp);
+		return PTR_ERR(rp);
+	}
+
+	cp->file = wp;
+
+	sys_close(0);
+	fd_install(0, rp);
+	spin_lock(&cf->file_lock);
+	fdt = files_fdtable(cf);
+	FD_SET(0, fdt->open_fds);
+	FD_CLR(0, fdt->close_on_exec);
+	spin_unlock(&cf->file_lock);
+
+	/* and disallow core files too */
+	current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1};
+
+	return 0;
+}
+
 void do_coredump(long signr, int exit_code, struct pt_regs *regs)
 {
 	struct core_state core_state;
 	char corename[CORENAME_MAX_SIZE + 1];
 	struct mm_struct *mm = current->mm;
 	struct linux_binfmt * binfmt;
-	struct inode * inode;
 	const struct cred *old_cred;
 	struct cred *cred;
 	int retval = 0;
 	int flag = 0;
-	int ispipe = 0;
-	char **helper_argv = NULL;
-	int helper_argc = 0;
-	int dump_count = 0;
+	int ispipe;
 	static atomic_t core_dump_count = ATOMIC_INIT(0);
 	struct coredump_params cprm = {
 		.signr = signr,
 		.regs = regs,
-		.limit = current->signal->rlim[RLIMIT_CORE].rlim_cur,
+		.limit = rlimit(RLIMIT_CORE),
+		/*
+		 * We must use the same mm->flags while dumping core to avoid
+		 * inconsistency of bit flags, since this flag is not protected
+		 * by any locks.
+		 */
+		.mm_flags = mm->flags,
 	};
 
 	audit_core_dumps(signr);
@@ -1789,38 +1862,26 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
 	binfmt = mm->binfmt;
 	if (!binfmt || !binfmt->core_dump)
 		goto fail;
-
-	cred = prepare_creds();
-	if (!cred) {
-		retval = -ENOMEM;
+	if (!__get_dumpable(cprm.mm_flags))
 		goto fail;
-	}
 
-	down_write(&mm->mmap_sem);
-	/*
-	 * If another thread got here first, or we are not dumpable, bail out.
-	 */
-	if (mm->core_state || !get_dumpable(mm)) {
-		up_write(&mm->mmap_sem);
-		put_cred(cred);
+	cred = prepare_creds();
+	if (!cred)
 		goto fail;
-	}
-
 	/*
 	 *	We cannot trust fsuid as being the "true" uid of the
 	 *	process nor do we know its entire history. We only know it
 	 *	was tainted so we dump it as root in mode 2.
 	 */
-	if (get_dumpable(mm) == 2) {	/* Setuid core dump mode */
+	if (__get_dumpable(cprm.mm_flags) == 2) {
+		/* Setuid core dump mode */
 		flag = O_EXCL;		/* Stop rewrite attacks */
 		cred->fsuid = 0;	/* Dump root private */
 	}
 
 	retval = coredump_wait(exit_code, &core_state);
-	if (retval < 0) {
-		put_cred(cred);
-		goto fail;
-	}
+	if (retval < 0)
+		goto fail_creds;
 
 	old_cred = override_creds(cred);
 
@@ -1838,19 +1899,19 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
 	ispipe = format_corename(corename, signr);
 	unlock_kernel();
 
-	if ((!ispipe) && (cprm.limit < binfmt->min_coredump))
-		goto fail_unlock;
-
  	if (ispipe) {
-		if (cprm.limit == 0) {
+		int dump_count;
+		char **helper_argv;
+
+		if (cprm.limit == 1) {
 			/*
 			 * Normally core limits are irrelevant to pipes, since
 			 * we're not writing to the file system, but we use
-			 * cprm.limit of 0 here as a speacial value. Any
-			 * non-zero limit gets set to RLIM_INFINITY below, but
+			 * cprm.limit of 1 here as a speacial value. Any
+			 * non-1 limit gets set to RLIM_INFINITY below, but
 			 * a limit of 0 skips the dump.  This is a consistent
 			 * way to catch recursive crashes.  We can still crash
-			 * if the core_pattern binary sets RLIM_CORE =  !0
+			 * if the core_pattern binary sets RLIM_CORE =  !1
 			 * but it runs as root, and can do lots of stupid things
 			 * Note that we use task_tgid_vnr here to grab the pid
 			 * of the process group leader.  That way we get the
@@ -1858,11 +1919,12 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
 			 * core_pattern process dies.
 			 */
 			printk(KERN_WARNING
-				"Process %d(%s) has RLIMIT_CORE set to 0\n",
+				"Process %d(%s) has RLIMIT_CORE set to 1\n",
 				task_tgid_vnr(current), current->comm);
 			printk(KERN_WARNING "Aborting core\n");
 			goto fail_unlock;
 		}
+		cprm.limit = RLIM_INFINITY;
 
 		dump_count = atomic_inc_return(&core_dump_count);
 		if (core_pipe_limit && (core_pipe_limit < dump_count)) {
@@ -1872,70 +1934,74 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
 			goto fail_dropcount;
 		}
 
-		helper_argv = argv_split(GFP_KERNEL, corename+1, &helper_argc);
+		helper_argv = argv_split(GFP_KERNEL, corename+1, NULL);
 		if (!helper_argv) {
 			printk(KERN_WARNING "%s failed to allocate memory\n",
 			       __func__);
 			goto fail_dropcount;
 		}
 
-		cprm.limit = RLIM_INFINITY;
-
-		/* SIGPIPE can happen, but it's just never processed */
-		if (call_usermodehelper_pipe(helper_argv[0], helper_argv, NULL,
-				&cprm.file)) {
+		retval = call_usermodehelper_fns(helper_argv[0], helper_argv,
+					NULL, UMH_WAIT_EXEC, umh_pipe_setup,
+					NULL, &cprm);
+		argv_free(helper_argv);
+		if (retval) {
  			printk(KERN_INFO "Core dump to %s pipe failed\n",
 			       corename);
-			goto fail_dropcount;
+			goto close_fail;
  		}
- 	} else
+	} else {
+		struct inode *inode;
+
+		if (cprm.limit < binfmt->min_coredump)
+			goto fail_unlock;
+
 		cprm.file = filp_open(corename,
 				 O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag,
 				 0600);
-	if (IS_ERR(cprm.file))
-		goto fail_dropcount;
-	inode = cprm.file->f_path.dentry->d_inode;
-	if (inode->i_nlink > 1)
-		goto close_fail;	/* multiple links - don't dump */
-	if (!ispipe && d_unhashed(cprm.file->f_path.dentry))
-		goto close_fail;
-
-	/* AK: actually i see no reason to not allow this for named pipes etc.,
-	   but keep the previous behaviour for now. */
-	if (!ispipe && !S_ISREG(inode->i_mode))
-		goto close_fail;
-	/*
-	 * Dont allow local users get cute and trick others to coredump
-	 * into their pre-created files:
-	 */
-	if (inode->i_uid != current_fsuid())
-		goto close_fail;
-	if (!cprm.file->f_op)
-		goto close_fail;
-	if (!cprm.file->f_op->write)
-		goto close_fail;
-	if (!ispipe &&
-	    do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file) != 0)
-		goto close_fail;
+		if (IS_ERR(cprm.file))
+			goto fail_unlock;
 
-	retval = binfmt->core_dump(&cprm);
+		inode = cprm.file->f_path.dentry->d_inode;
+		if (inode->i_nlink > 1)
+			goto close_fail;
+		if (d_unhashed(cprm.file->f_path.dentry))
+			goto close_fail;
+		/*
+		 * AK: actually i see no reason to not allow this for named
+		 * pipes etc, but keep the previous behaviour for now.
+		 */
+		if (!S_ISREG(inode->i_mode))
+			goto close_fail;
+		/*
+		 * Dont allow local users get cute and trick others to coredump
+		 * into their pre-created files.
+		 */
+		if (inode->i_uid != current_fsuid())
+			goto close_fail;
+		if (!cprm.file->f_op || !cprm.file->f_op->write)
+			goto close_fail;
+		if (do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file))
+			goto close_fail;
+	}
 
+	retval = binfmt->core_dump(&cprm);
 	if (retval)
 		current->signal->group_exit_code |= 0x80;
-close_fail:
+
 	if (ispipe && core_pipe_limit)
 		wait_for_dump_helpers(cprm.file);
-	filp_close(cprm.file, NULL);
+close_fail:
+	if (cprm.file)
+		filp_close(cprm.file, NULL);
 fail_dropcount:
-	if (dump_count)
+	if (ispipe)
 		atomic_dec(&core_dump_count);
 fail_unlock:
-	if (helper_argv)
-		argv_free(helper_argv);
-
+	coredump_finish(mm);
 	revert_creds(old_cred);
+fail_creds:
 	put_cred(cred);
-	coredump_finish(mm);
 fail:
 	return;
 }
diff --git a/fs/exofs/common.h b/fs/exofs/common.h
index b1b178e61718..f0d520312d8b 100644
--- a/fs/exofs/common.h
+++ b/fs/exofs/common.h
@@ -55,6 +55,8 @@
 /* exofs Application specific page/attribute */
 # define EXOFS_APAGE_FS_DATA	(OSD_APAGE_APP_DEFINED_FIRST + 3)
 # define EXOFS_ATTR_INODE_DATA	1
+# define EXOFS_ATTR_INODE_FILE_LAYOUT	2
+# define EXOFS_ATTR_INODE_DIR_LAYOUT	3
 
 /*
  * The maximum number of files we can have is limited by the size of the
@@ -206,4 +208,41 @@ enum {
 	(((name_len) + offsetof(struct exofs_dir_entry, name)  + \
 	  EXOFS_DIR_ROUND) & ~EXOFS_DIR_ROUND)
 
+/*
+ * The on-disk (optional) layout structure.
+ * sits in an EXOFS_ATTR_INODE_FILE_LAYOUT or EXOFS_ATTR_INODE_DIR_LAYOUT
+ * attribute, attached to any inode, usually to a directory.
+ */
+
+enum exofs_inode_layout_gen_functions {
+	LAYOUT_MOVING_WINDOW = 0,
+	LAYOUT_IMPLICT = 1,
+};
+
+struct exofs_on_disk_inode_layout {
+	__le16 gen_func; /* One of enum exofs_inode_layout_gen_functions */
+	__le16 pad;
+	union {
+		/* gen_func == LAYOUT_MOVING_WINDOW (default) */
+		struct exofs_layout_sliding_window {
+			__le32 num_devices; /* first n devices in global-table*/
+		} sliding_window __packed;
+
+		/* gen_func == LAYOUT_IMPLICT */
+		struct exofs_layout_implict_list {
+			struct exofs_dt_data_map data_map;
+			/* Variable array of size data_map.cb_num_comps. These
+			 * are device indexes of the devices in the global table
+			 */
+			__le32 dev_indexes[];
+		} implict __packed;
+	};
+} __packed;
+
+static inline size_t exofs_on_disk_inode_layout_size(unsigned max_devs)
+{
+	return sizeof(struct exofs_on_disk_inode_layout) +
+		max_devs * sizeof(__le32);
+}
+
 #endif /*ifndef __EXOFS_COM_H__*/
diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
index 4cfab1cc75c0..d91e9d829bc1 100644
--- a/fs/exofs/dir.c
+++ b/fs/exofs/dir.c
@@ -608,7 +608,7 @@ int exofs_make_empty(struct inode *inode, struct inode *parent)
 	de->inode_no = cpu_to_le64(parent->i_ino);
 	memcpy(de->name, PARENT_DIR, sizeof(PARENT_DIR));
 	exofs_set_de_type(de, inode);
-	kunmap_atomic(page, KM_USER0);
+	kunmap_atomic(kaddr, KM_USER0);
 	err = exofs_commit_chunk(page, 0, chunk_size);
 fail:
 	page_cache_release(page);
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index c35fd4623986..22721b2fd890 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -35,6 +35,7 @@
 
 #include <linux/fs.h>
 #include <linux/time.h>
+#include <linux/backing-dev.h>
 #include "common.h"
 
 /* FIXME: Remove once pnfs hits mainline
@@ -55,12 +56,28 @@
 /* u64 has problems with printk this will cast it to unsigned long long */
 #define _LLU(x) (unsigned long long)(x)
 
+struct exofs_layout {
+	osd_id		s_pid;			/* partition ID of file system*/
+
+	/* Our way of looking at the data_map */
+	unsigned stripe_unit;
+	unsigned mirrors_p1;
+
+	unsigned group_width;
+	u64	 group_depth;
+	unsigned group_count;
+
+	enum exofs_inode_layout_gen_functions lay_func;
+
+	unsigned	s_numdevs;		/* Num of devices in array    */
+	struct osd_dev	*s_ods[0];		/* Variable length            */
+};
+
 /*
  * our extension to the in-memory superblock
  */
 struct exofs_sb_info {
 	struct exofs_fscb s_fscb;		/* Written often, pre-allocate*/
-	osd_id		s_pid;			/* partition ID of file system*/
 	int		s_timeout;		/* timeout for OSD operations */
 	uint64_t	s_nextid;		/* highest object ID used     */
 	uint32_t	s_numfiles;		/* number of files on fs      */
@@ -68,23 +85,29 @@ struct exofs_sb_info {
 	u32		s_next_generation;	/* next gen # to use          */
 	atomic_t	s_curr_pending;		/* number of pending commands */
 	uint8_t		s_cred[OSD_CAP_LEN];	/* credential for the fscb    */
-
-	struct pnfs_osd_data_map data_map;	/* Default raid to use        */
-	unsigned	s_numdevs;		/* Num of devices in array    */
-	struct osd_dev	*s_ods[1];		/* Variable length, minimum 1 */
+	struct 		backing_dev_info bdi;	/* register our bdi with VFS  */
+
+	struct pnfs_osd_data_map data_map;	/* Default raid to use
+						 * FIXME: Needed ?
+						 */
+/*	struct exofs_layout	dir_layout;*/	/* Default dir layout */
+	struct exofs_layout	layout;		/* Default files layout,
+						 * contains the variable osd_dev
+						 * array. Keep last */
+	struct osd_dev	*_min_one_dev[1];	/* Place holder for one dev   */
 };
 
 /*
  * our extension to the in-memory inode
  */
 struct exofs_i_info {
+	struct inode   vfs_inode;          /* normal in-memory inode          */
+	wait_queue_head_t i_wq;            /* wait queue for inode            */
 	unsigned long  i_flags;            /* various atomic flags            */
 	uint32_t       i_data[EXOFS_IDATA];/*short symlink names and device #s*/
 	uint32_t       i_dir_start_lookup; /* which page to start lookup      */
-	wait_queue_head_t i_wq;            /* wait queue for inode            */
 	uint64_t       i_commit_size;      /* the object's written length     */
 	uint8_t        i_cred[OSD_CAP_LEN];/* all-powerful credential         */
-	struct inode   vfs_inode;          /* normal in-memory inode          */
 };
 
 static inline osd_id exofs_oi_objno(struct exofs_i_info *oi)
@@ -101,7 +124,7 @@ struct exofs_io_state {
 	void			*private;
 	exofs_io_done_fn	done;
 
-	struct exofs_sb_info	*sbi;
+	struct exofs_layout	*layout;
 	struct osd_obj_id	obj;
 	u8			*cred;
 
@@ -109,7 +132,11 @@ struct exofs_io_state {
 	loff_t			offset;
 	unsigned long		length;
 	void			*kern_buff;
-	struct bio		*bio;
+
+	struct page		**pages;
+	unsigned		nr_pages;
+	unsigned		pgbase;
+	unsigned		pages_consumed;
 
 	/* Attributes */
 	unsigned		in_attr_len;
@@ -122,6 +149,9 @@ struct exofs_io_state {
 	struct exofs_per_dev_state {
 		struct osd_request *or;
 		struct bio *bio;
+		loff_t offset;
+		unsigned length;
+		unsigned dev;
 	} per_dev[];
 };
 
@@ -175,6 +205,12 @@ static inline struct exofs_i_info *exofs_i(struct inode *inode)
 }
 
 /*
+ * Given a layout, object_number and stripe_index return the associated global
+ * dev_index
+ */
+unsigned exofs_layout_od_id(struct exofs_layout *layout,
+			    osd_id obj_no, unsigned layout_index);
+/*
  * Maximum count of links to a file
  */
 #define EXOFS_LINK_MAX           32000
@@ -189,7 +225,8 @@ void exofs_make_credential(u8 cred_a[OSD_CAP_LEN],
 int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj,
 		    u64 offset, void *p, unsigned length);
 
-int  exofs_get_io_state(struct exofs_sb_info *sbi, struct exofs_io_state** ios);
+int  exofs_get_io_state(struct exofs_layout *layout,
+			struct exofs_io_state **ios);
 void exofs_put_io_state(struct exofs_io_state *ios);
 
 int exofs_check_io(struct exofs_io_state *ios, u64 *resid);
@@ -226,7 +263,7 @@ int exofs_write_begin(struct file *file, struct address_space *mapping,
 		struct page **pagep, void **fsdata);
 extern struct inode *exofs_iget(struct super_block *, unsigned long);
 struct inode *exofs_new_inode(struct inode *, int);
-extern int exofs_write_inode(struct inode *, int);
+extern int exofs_write_inode(struct inode *, struct writeback_control *wbc);
 extern void exofs_delete_inode(struct inode *);
 
 /* dir.c:                */
diff --git a/fs/exofs/file.c b/fs/exofs/file.c
index 839b9dc1e70f..fef6899be397 100644
--- a/fs/exofs/file.c
+++ b/fs/exofs/file.c
@@ -40,12 +40,11 @@ static int exofs_release_file(struct inode *inode, struct file *filp)
 	return 0;
 }
 
-static int exofs_file_fsync(struct file *filp, struct dentry *dentry,
-			    int datasync)
+static int exofs_file_fsync(struct file *filp, int datasync)
 {
 	int ret;
 	struct address_space *mapping = filp->f_mapping;
-	struct inode *inode = dentry->d_inode;
+	struct inode *inode = mapping->host;
 	struct super_block *sb;
 
 	ret = filemap_write_and_wait(mapping);
@@ -66,7 +65,7 @@ static int exofs_file_fsync(struct file *filp, struct dentry *dentry,
 
 static int exofs_flush(struct file *file, fl_owner_t id)
 {
-	exofs_file_fsync(file, file->f_path.dentry, 1);
+	exofs_file_fsync(file, 1);
 	/* TODO: Flush the OSD target */
 	return 0;
 }
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 2afbcebeda71..4bb6ef822e46 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -31,6 +31,7 @@
  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 
+#include <linux/slab.h>
 #include <linux/writeback.h>
 #include <linux/buffer_head.h>
 #include <scsi/scsi_device.h>
@@ -41,16 +42,18 @@
 
 enum { BIO_MAX_PAGES_KMALLOC =
 		(PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),
+	MAX_PAGES_KMALLOC =
+		PAGE_SIZE / sizeof(struct page *),
 };
 
 struct page_collect {
 	struct exofs_sb_info *sbi;
-	struct request_queue *req_q;
 	struct inode *inode;
 	unsigned expected_pages;
 	struct exofs_io_state *ios;
 
-	struct bio *bio;
+	struct page **pages;
+	unsigned alloc_pages;
 	unsigned nr_pages;
 	unsigned long length;
 	loff_t pg_first; /* keep 64bit also in 32-arches */
@@ -62,15 +65,12 @@ static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
 	struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
 
 	pcol->sbi = sbi;
-	/* Create master bios on first Q, later on cloning, each clone will be
-	 * allocated on it's destination Q
-	 */
-	pcol->req_q = osd_request_queue(sbi->s_ods[0]);
 	pcol->inode = inode;
 	pcol->expected_pages = expected_pages;
 
 	pcol->ios = NULL;
-	pcol->bio = NULL;
+	pcol->pages = NULL;
+	pcol->alloc_pages = 0;
 	pcol->nr_pages = 0;
 	pcol->length = 0;
 	pcol->pg_first = -1;
@@ -80,7 +80,8 @@ static void _pcol_reset(struct page_collect *pcol)
 {
 	pcol->expected_pages -= min(pcol->nr_pages, pcol->expected_pages);
 
-	pcol->bio = NULL;
+	pcol->pages = NULL;
+	pcol->alloc_pages = 0;
 	pcol->nr_pages = 0;
 	pcol->length = 0;
 	pcol->pg_first = -1;
@@ -90,38 +91,43 @@ static void _pcol_reset(struct page_collect *pcol)
 	 * it might not end here. don't be left with nothing
 	 */
 	if (!pcol->expected_pages)
-		pcol->expected_pages = BIO_MAX_PAGES_KMALLOC;
+		pcol->expected_pages = MAX_PAGES_KMALLOC;
 }
 
 static int pcol_try_alloc(struct page_collect *pcol)
 {
-	int pages = min_t(unsigned, pcol->expected_pages,
-			  BIO_MAX_PAGES_KMALLOC);
+	unsigned pages = min_t(unsigned, pcol->expected_pages,
+			  MAX_PAGES_KMALLOC);
 
 	if (!pcol->ios) { /* First time allocate io_state */
-		int ret = exofs_get_io_state(pcol->sbi, &pcol->ios);
+		int ret = exofs_get_io_state(&pcol->sbi->layout, &pcol->ios);
 
 		if (ret)
 			return ret;
 	}
 
+	/* TODO: easily support bio chaining */
+	pages =  min_t(unsigned, pages,
+		       pcol->sbi->layout.group_width * BIO_MAX_PAGES_KMALLOC);
+
 	for (; pages; pages >>= 1) {
-		pcol->bio = bio_kmalloc(GFP_KERNEL, pages);
-		if (likely(pcol->bio))
+		pcol->pages = kmalloc(pages * sizeof(struct page *),
+				      GFP_KERNEL);
+		if (likely(pcol->pages)) {
+			pcol->alloc_pages = pages;
 			return 0;
+		}
 	}
 
-	EXOFS_ERR("Failed to bio_kmalloc expected_pages=%u\n",
+	EXOFS_ERR("Failed to kmalloc expected_pages=%u\n",
 		  pcol->expected_pages);
 	return -ENOMEM;
 }
 
 static void pcol_free(struct page_collect *pcol)
 {
-	if (pcol->bio) {
-		bio_put(pcol->bio);
-		pcol->bio = NULL;
-	}
+	kfree(pcol->pages);
+	pcol->pages = NULL;
 
 	if (pcol->ios) {
 		exofs_put_io_state(pcol->ios);
@@ -132,11 +138,10 @@ static void pcol_free(struct page_collect *pcol)
 static int pcol_add_page(struct page_collect *pcol, struct page *page,
 			 unsigned len)
 {
-	int added_len = bio_add_pc_page(pcol->req_q, pcol->bio, page, len, 0);
-	if (unlikely(len != added_len))
+	if (unlikely(pcol->nr_pages >= pcol->alloc_pages))
 		return -ENOMEM;
 
-	++pcol->nr_pages;
+	pcol->pages[pcol->nr_pages++] = page;
 	pcol->length += len;
 	return 0;
 }
@@ -181,7 +186,6 @@ static void update_write_page(struct page *page, int ret)
  */
 static int __readpages_done(struct page_collect *pcol, bool do_unlock)
 {
-	struct bio_vec *bvec;
 	int i;
 	u64 resid;
 	u64 good_bytes;
@@ -193,13 +197,13 @@ static int __readpages_done(struct page_collect *pcol, bool do_unlock)
 	else
 		good_bytes = pcol->length - resid;
 
-	EXOFS_DBGMSG("readpages_done(0x%lx) good_bytes=0x%llx"
+	EXOFS_DBGMSG2("readpages_done(0x%lx) good_bytes=0x%llx"
 		     " length=0x%lx nr_pages=%u\n",
 		     pcol->inode->i_ino, _LLU(good_bytes), pcol->length,
 		     pcol->nr_pages);
 
-	__bio_for_each_segment(bvec, pcol->bio, i, 0) {
-		struct page *page = bvec->bv_page;
+	for (i = 0; i < pcol->nr_pages; i++) {
+		struct page *page = pcol->pages[i];
 		struct inode *inode = page->mapping->host;
 		int page_stat;
 
@@ -218,11 +222,11 @@ static int __readpages_done(struct page_collect *pcol, bool do_unlock)
 		ret = update_read_page(page, page_stat);
 		if (do_unlock)
 			unlock_page(page);
-		length += bvec->bv_len;
+		length += PAGE_SIZE;
 	}
 
 	pcol_free(pcol);
-	EXOFS_DBGMSG("readpages_done END\n");
+	EXOFS_DBGMSG2("readpages_done END\n");
 	return ret;
 }
 
@@ -238,11 +242,10 @@ static void readpages_done(struct exofs_io_state *ios, void *p)
 
 static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw)
 {
-	struct bio_vec *bvec;
 	int i;
 
-	__bio_for_each_segment(bvec, pcol->bio, i, 0) {
-		struct page *page = bvec->bv_page;
+	for (i = 0; i < pcol->nr_pages; i++) {
+		struct page *page = pcol->pages[i];
 
 		if (rw == READ)
 			update_read_page(page, ret);
@@ -260,13 +263,14 @@ static int read_exec(struct page_collect *pcol, bool is_sync)
 	struct page_collect *pcol_copy = NULL;
 	int ret;
 
-	if (!pcol->bio)
+	if (!pcol->pages)
 		return 0;
 
 	/* see comment in _readpage() about sync reads */
 	WARN_ON(is_sync && (pcol->nr_pages != 1));
 
-	ios->bio = pcol->bio;
+	ios->pages = pcol->pages;
+	ios->nr_pages = pcol->nr_pages;
 	ios->length = pcol->length;
 	ios->offset = pcol->pg_first << PAGE_CACHE_SHIFT;
 
@@ -290,7 +294,7 @@ static int read_exec(struct page_collect *pcol, bool is_sync)
 
 	atomic_inc(&pcol->sbi->s_curr_pending);
 
-	EXOFS_DBGMSG("read_exec obj=0x%llx start=0x%llx length=0x%lx\n",
+	EXOFS_DBGMSG2("read_exec obj=0x%llx start=0x%llx length=0x%lx\n",
 		  ios->obj.id, _LLU(ios->offset), pcol->length);
 
 	/* pages ownership was passed to pcol_copy */
@@ -366,7 +370,7 @@ try_again:
 		goto try_again;
 	}
 
-	if (!pcol->bio) {
+	if (!pcol->pages) {
 		ret = pcol_try_alloc(pcol);
 		if (unlikely(ret))
 			goto fail;
@@ -448,7 +452,6 @@ static int exofs_readpage(struct file *file, struct page *page)
 static void writepages_done(struct exofs_io_state *ios, void *p)
 {
 	struct page_collect *pcol = p;
-	struct bio_vec *bvec;
 	int i;
 	u64 resid;
 	u64  good_bytes;
@@ -462,13 +465,13 @@ static void writepages_done(struct exofs_io_state *ios, void *p)
 	else
 		good_bytes = pcol->length - resid;
 
-	EXOFS_DBGMSG("writepages_done(0x%lx) good_bytes=0x%llx"
+	EXOFS_DBGMSG2("writepages_done(0x%lx) good_bytes=0x%llx"
 		     " length=0x%lx nr_pages=%u\n",
 		     pcol->inode->i_ino, _LLU(good_bytes), pcol->length,
 		     pcol->nr_pages);
 
-	__bio_for_each_segment(bvec, pcol->bio, i, 0) {
-		struct page *page = bvec->bv_page;
+	for (i = 0; i < pcol->nr_pages; i++) {
+		struct page *page = pcol->pages[i];
 		struct inode *inode = page->mapping->host;
 		int page_stat;
 
@@ -485,12 +488,12 @@ static void writepages_done(struct exofs_io_state *ios, void *p)
 		EXOFS_DBGMSG2("    writepages_done(0x%lx, 0x%lx) status=%d\n",
 			     inode->i_ino, page->index, page_stat);
 
-		length += bvec->bv_len;
+		length += PAGE_SIZE;
 	}
 
 	pcol_free(pcol);
 	kfree(pcol);
-	EXOFS_DBGMSG("writepages_done END\n");
+	EXOFS_DBGMSG2("writepages_done END\n");
 }
 
 static int write_exec(struct page_collect *pcol)
@@ -500,7 +503,7 @@ static int write_exec(struct page_collect *pcol)
 	struct page_collect *pcol_copy = NULL;
 	int ret;
 
-	if (!pcol->bio)
+	if (!pcol->pages)
 		return 0;
 
 	pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
@@ -512,9 +515,8 @@ static int write_exec(struct page_collect *pcol)
 
 	*pcol_copy = *pcol;
 
-	pcol_copy->bio->bi_rw |= (1 << BIO_RW); /* FIXME: bio_set_dir() */
-
-	ios->bio = pcol_copy->bio;
+	ios->pages = pcol_copy->pages;
+	ios->nr_pages = pcol_copy->nr_pages;
 	ios->offset = pcol_copy->pg_first << PAGE_CACHE_SHIFT;
 	ios->length = pcol_copy->length;
 	ios->done = writepages_done;
@@ -527,7 +529,7 @@ static int write_exec(struct page_collect *pcol)
 	}
 
 	atomic_inc(&pcol->sbi->s_curr_pending);
-	EXOFS_DBGMSG("write_exec(0x%lx, 0x%llx) start=0x%llx length=0x%lx\n",
+	EXOFS_DBGMSG2("write_exec(0x%lx, 0x%llx) start=0x%llx length=0x%lx\n",
 		  pcol->inode->i_ino, pcol->pg_first, _LLU(ios->offset),
 		  pcol->length);
 	/* pages ownership was passed to pcol_copy */
@@ -605,7 +607,7 @@ try_again:
 		goto try_again;
 	}
 
-	if (!pcol->bio) {
+	if (!pcol->pages) {
 		ret = pcol_try_alloc(pcol);
 		if (unlikely(ret))
 			goto fail;
@@ -616,7 +618,7 @@ try_again:
 
 	ret = pcol_add_page(pcol, page, len);
 	if (unlikely(ret)) {
-		EXOFS_DBGMSG("Failed pcol_add_page "
+		EXOFS_DBGMSG2("Failed pcol_add_page "
 			     "nr_pages=%u total_length=0x%lx\n",
 			     pcol->nr_pages, pcol->length);
 
@@ -663,7 +665,7 @@ static int exofs_writepages(struct address_space *mapping,
 	if (expected_pages < 32L)
 		expected_pages = 32L;
 
-	EXOFS_DBGMSG("inode(0x%lx) wbc->start=0x%llx wbc->end=0x%llx "
+	EXOFS_DBGMSG2("inode(0x%lx) wbc->start=0x%llx wbc->end=0x%llx "
 		     "nrpages=%lu start=0x%lx end=0x%lx expected_pages=%ld\n",
 		     mapping->host->i_ino, wbc->range_start, wbc->range_end,
 		     mapping->nrpages, start, end, expected_pages);
@@ -753,6 +755,21 @@ static int exofs_write_end(struct file *file, struct address_space *mapping,
 	return ret;
 }
 
+static int exofs_releasepage(struct page *page, gfp_t gfp)
+{
+	EXOFS_DBGMSG("page 0x%lx\n", page->index);
+	WARN_ON(1);
+	return try_to_free_buffers(page);
+}
+
+static void exofs_invalidatepage(struct page *page, unsigned long offset)
+{
+	EXOFS_DBGMSG("page_has_buffers=>%d\n", page_has_buffers(page));
+	WARN_ON(1);
+
+	block_invalidatepage(page, offset);
+}
+
 const struct address_space_operations exofs_aops = {
 	.readpage	= exofs_readpage,
 	.readpages	= exofs_readpages,
@@ -760,6 +777,21 @@ const struct address_space_operations exofs_aops = {
 	.writepages	= exofs_writepages,
 	.write_begin	= exofs_write_begin_export,
 	.write_end	= exofs_write_end,
+	.releasepage	= exofs_releasepage,
+	.set_page_dirty	= __set_page_dirty_nobuffers,
+	.invalidatepage = exofs_invalidatepage,
+
+	/* Not implemented Yet */
+	.bmap		= NULL, /* TODO: use osd's OSD_ACT_READ_MAP */
+	.direct_IO	= NULL, /* TODO: Should be trivial to do */
+
+	/* With these NULL has special meaning or default is not exported */
+	.sync_page	= NULL,
+	.get_xip_mem	= NULL,
+	.migratepage	= NULL,
+	.launder_page	= NULL,
+	.is_partially_uptodate = NULL,
+	.error_remove_page = NULL,
 };
 
 /******************************************************************************
@@ -859,20 +891,33 @@ int exofs_setattr(struct dentry *dentry, struct iattr *iattr)
 	return error;
 }
 
+static const struct osd_attr g_attr_inode_file_layout = ATTR_DEF(
+	EXOFS_APAGE_FS_DATA,
+	EXOFS_ATTR_INODE_FILE_LAYOUT,
+	0);
+static const struct osd_attr g_attr_inode_dir_layout = ATTR_DEF(
+	EXOFS_APAGE_FS_DATA,
+	EXOFS_ATTR_INODE_DIR_LAYOUT,
+	0);
+
 /*
- * Read an inode from the OSD, and return it as is.  We also return the size
- * attribute in the 'obj_size' argument.
+ * Read the Linux inode info from the OSD, and return it as is. In exofs the
+ * inode info is in an application specific page/attribute of the osd-object.
  */
 static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
-		    struct exofs_fcb *inode, uint64_t *obj_size)
+		    struct exofs_fcb *inode)
 {
 	struct exofs_sb_info *sbi = sb->s_fs_info;
-	struct osd_attr attrs[2];
+	struct osd_attr attrs[] = {
+		[0] = g_attr_inode_data,
+		[1] = g_attr_inode_file_layout,
+		[2] = g_attr_inode_dir_layout,
+	};
 	struct exofs_io_state *ios;
+	struct exofs_on_disk_inode_layout *layout;
 	int ret;
 
-	*obj_size = ~0;
-	ret = exofs_get_io_state(sbi, &ios);
+	ret = exofs_get_io_state(&sbi->layout, &ios);
 	if (unlikely(ret)) {
 		EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__);
 		return ret;
@@ -882,14 +927,25 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
 	exofs_make_credential(oi->i_cred, &ios->obj);
 	ios->cred = oi->i_cred;
 
-	attrs[0] = g_attr_inode_data;
-	attrs[1] = g_attr_logical_length;
+	attrs[1].len = exofs_on_disk_inode_layout_size(sbi->layout.s_numdevs);
+	attrs[2].len = exofs_on_disk_inode_layout_size(sbi->layout.s_numdevs);
+
 	ios->in_attr = attrs;
 	ios->in_attr_len = ARRAY_SIZE(attrs);
 
 	ret = exofs_sbi_read(ios);
-	if (ret)
+	if (unlikely(ret)) {
+		EXOFS_ERR("object(0x%llx) corrupted, return empty file=>%d\n",
+			  _LLU(ios->obj.id), ret);
+		memset(inode, 0, sizeof(*inode));
+		inode->i_mode = 0040000 | (0777 & ~022);
+		/* If object is lost on target we might as well enable it's
+		 * delete.
+		 */
+		if ((ret == -ENOENT) || (ret == -EINVAL))
+			ret = 0;
 		goto out;
+	}
 
 	ret = extract_attr_from_ios(ios, &attrs[0]);
 	if (ret) {
@@ -901,11 +957,33 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
 
 	ret = extract_attr_from_ios(ios, &attrs[1]);
 	if (ret) {
-		EXOFS_ERR("%s: extract_attr of logical_length failed\n",
-			  __func__);
+		EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__);
 		goto out;
 	}
-	*obj_size = get_unaligned_be64(attrs[1].val_ptr);
+	if (attrs[1].len) {
+		layout = attrs[1].val_ptr;
+		if (layout->gen_func != cpu_to_le16(LAYOUT_MOVING_WINDOW)) {
+			EXOFS_ERR("%s: unsupported files layout %d\n",
+				__func__, layout->gen_func);
+			ret = -ENOTSUPP;
+			goto out;
+		}
+	}
+
+	ret = extract_attr_from_ios(ios, &attrs[2]);
+	if (ret) {
+		EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__);
+		goto out;
+	}
+	if (attrs[2].len) {
+		layout = attrs[2].val_ptr;
+		if (layout->gen_func != cpu_to_le16(LAYOUT_MOVING_WINDOW)) {
+			EXOFS_ERR("%s: unsupported meta-data layout %d\n",
+				__func__, layout->gen_func);
+			ret = -ENOTSUPP;
+			goto out;
+		}
+	}
 
 out:
 	exofs_put_io_state(ios);
@@ -925,7 +1003,6 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
 	struct exofs_i_info *oi;
 	struct exofs_fcb fcb;
 	struct inode *inode;
-	uint64_t obj_size;
 	int ret;
 
 	inode = iget_locked(sb, ino);
@@ -937,7 +1014,7 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
 	__oi_init(oi);
 
 	/* read the inode from the osd */
-	ret = exofs_get_inode(sb, oi, &fcb, &obj_size);
+	ret = exofs_get_inode(sb, oi, &fcb);
 	if (ret)
 		goto bad_inode;
 
@@ -958,13 +1035,6 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
 	inode->i_blkbits = EXOFS_BLKSHIFT;
 	inode->i_generation = le32_to_cpu(fcb.i_generation);
 
-	if ((inode->i_size != obj_size) &&
-		(!exofs_inode_is_fast_symlink(inode))) {
-		EXOFS_ERR("WARNING: Size of inode=%llu != object=%llu\n",
-			  inode->i_size, _LLU(obj_size));
-		/* FIXME: call exofs_inode_recovery() */
-	}
-
 	oi->i_dir_start_lookup = 0;
 
 	if ((inode->i_nlink == 0) && (inode->i_mode == 0)) {
@@ -1043,7 +1113,7 @@ static void create_done(struct exofs_io_state *ios, void *p)
 
 	if (unlikely(ret)) {
 		EXOFS_ERR("object=0x%llx creation faild in pid=0x%llx",
-			  _LLU(exofs_oi_objno(oi)), _LLU(sbi->s_pid));
+			  _LLU(exofs_oi_objno(oi)), _LLU(sbi->layout.s_pid));
 		/*TODO: When FS is corrupted creation can fail, object already
 		 * exist. Get rid of this asynchronous creation, if exist
 		 * increment the obj counter and try the next object. Until we
@@ -1083,16 +1153,7 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
 	sbi = sb->s_fs_info;
 
 	sb->s_dirt = 1;
-	inode->i_uid = current->cred->fsuid;
-	if (dir->i_mode & S_ISGID) {
-		inode->i_gid = dir->i_gid;
-		if (S_ISDIR(mode))
-			mode |= S_ISGID;
-	} else {
-		inode->i_gid = current->cred->fsgid;
-	}
-	inode->i_mode = mode;
-
+	inode_init_owner(inode, dir, mode);
 	inode->i_ino = sbi->s_nextid++;
 	inode->i_blkbits = EXOFS_BLKSHIFT;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
@@ -1104,7 +1165,7 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
 
 	mark_inode_dirty(inode);
 
-	ret = exofs_get_io_state(sbi, &ios);
+	ret = exofs_get_io_state(&sbi->layout, &ios);
 	if (unlikely(ret)) {
 		EXOFS_ERR("exofs_new_inode: exofs_get_io_state failed\n");
 		return ERR_PTR(ret);
@@ -1170,8 +1231,10 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
 	int ret;
 
 	args = kzalloc(sizeof(*args), GFP_KERNEL);
-	if (!args)
+	if (!args) {
+		EXOFS_DBGMSG("Faild kzalloc of args\n");
 		return -ENOMEM;
+	}
 
 	fcb = &args->fcb;
 
@@ -1200,7 +1263,7 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
 	} else
 		memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data));
 
-	ret = exofs_get_io_state(sbi, &ios);
+	ret = exofs_get_io_state(&sbi->layout, &ios);
 	if (unlikely(ret)) {
 		EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__);
 		goto free_args;
@@ -1234,13 +1297,14 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
 free_args:
 	kfree(args);
 out:
-	EXOFS_DBGMSG("ret=>%d\n", ret);
+	EXOFS_DBGMSG("(0x%lx) do_sync=%d ret=>%d\n",
+		     inode->i_ino, do_sync, ret);
 	return ret;
 }
 
-int exofs_write_inode(struct inode *inode, int wait)
+int exofs_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
-	return exofs_update_inode(inode, wait);
+	return exofs_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
 }
 
 /*
@@ -1283,7 +1347,7 @@ void exofs_delete_inode(struct inode *inode)
 
 	clear_inode(inode);
 
-	ret = exofs_get_io_state(sbi, &ios);
+	ret = exofs_get_io_state(&sbi->layout, &ios);
 	if (unlikely(ret)) {
 		EXOFS_ERR("%s: exofs_get_io_state failed\n", __func__);
 		return;
diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c
index 5bad01fa1f9f..4337cad7777b 100644
--- a/fs/exofs/ios.c
+++ b/fs/exofs/ios.c
@@ -22,10 +22,15 @@
  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 
+#include <linux/slab.h>
 #include <scsi/scsi_device.h>
+#include <asm/div64.h>
 
 #include "exofs.h"
 
+#define EXOFS_DBGMSG2(M...) do {} while (0)
+/* #define EXOFS_DBGMSG2 EXOFS_DBGMSG */
+
 void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj)
 {
 	osd_sec_init_nosec_doall_caps(cred_a, obj, false, true);
@@ -64,21 +69,24 @@ out:
 	return ret;
 }
 
-int exofs_get_io_state(struct exofs_sb_info *sbi, struct exofs_io_state** pios)
+int exofs_get_io_state(struct exofs_layout *layout,
+		       struct exofs_io_state **pios)
 {
 	struct exofs_io_state *ios;
 
 	/*TODO: Maybe use kmem_cach per sbi of size
-	 * exofs_io_state_size(sbi->s_numdevs)
+	 * exofs_io_state_size(layout->s_numdevs)
 	 */
-	ios = kzalloc(exofs_io_state_size(sbi->s_numdevs), GFP_KERNEL);
+	ios = kzalloc(exofs_io_state_size(layout->s_numdevs), GFP_KERNEL);
 	if (unlikely(!ios)) {
+		EXOFS_DBGMSG("Faild kzalloc bytes=%d\n",
+			     exofs_io_state_size(layout->s_numdevs));
 		*pios = NULL;
 		return -ENOMEM;
 	}
 
-	ios->sbi = sbi;
-	ios->obj.partition = sbi->s_pid;
+	ios->layout = layout;
+	ios->obj.partition = layout->s_pid;
 	*pios = ios;
 	return 0;
 }
@@ -101,6 +109,29 @@ void exofs_put_io_state(struct exofs_io_state *ios)
 	}
 }
 
+unsigned exofs_layout_od_id(struct exofs_layout *layout,
+			    osd_id obj_no, unsigned layout_index)
+{
+/*	switch (layout->lay_func) {
+	case LAYOUT_MOVING_WINDOW:
+	{*/
+		unsigned dev_mod = obj_no;
+
+		return (layout_index + dev_mod * layout->mirrors_p1) %
+							      layout->s_numdevs;
+/*	}
+	case LAYOUT_FUNC_IMPLICT:
+		return layout->devs[layout_index];
+	}*/
+}
+
+static inline struct osd_dev *exofs_ios_od(struct exofs_io_state *ios,
+					   unsigned layout_index)
+{
+	return ios->layout->s_ods[
+		exofs_layout_od_id(ios->layout, ios->obj.id, layout_index)];
+}
+
 static void _sync_done(struct exofs_io_state *ios, void *p)
 {
 	struct completion *waiting = p;
@@ -168,6 +199,21 @@ static int exofs_io_execute(struct exofs_io_state *ios)
 	return ret;
 }
 
+static void _clear_bio(struct bio *bio)
+{
+	struct bio_vec *bv;
+	unsigned i;
+
+	__bio_for_each_segment(bv, bio, i, 0) {
+		unsigned this_count = bv->bv_len;
+
+		if (likely(PAGE_SIZE == this_count))
+			clear_highpage(bv->bv_page);
+		else
+			zero_user(bv->bv_page, bv->bv_offset, this_count);
+	}
+}
+
 int exofs_check_io(struct exofs_io_state *ios, u64 *resid)
 {
 	enum osd_err_priority acumulated_osd_err = 0;
@@ -176,16 +222,25 @@ int exofs_check_io(struct exofs_io_state *ios, u64 *resid)
 
 	for (i = 0; i < ios->numdevs; i++) {
 		struct osd_sense_info osi;
-		int ret = osd_req_decode_sense(ios->per_dev[i].or, &osi);
+		struct osd_request *or = ios->per_dev[i].or;
+		int ret;
+
+		if (unlikely(!or))
+			continue;
 
+		ret = osd_req_decode_sense(or, &osi);
 		if (likely(!ret))
 			continue;
 
-		if (unlikely(ret == -EFAULT)) {
-			EXOFS_DBGMSG("%s: EFAULT Need page clear\n", __func__);
-			/*FIXME: All the pages in this device range should:
-			 *	clear_highpage(page);
-			 */
+		if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) {
+			/* start read offset passed endof file */
+			_clear_bio(ios->per_dev[i].bio);
+			EXOFS_DBGMSG("start read offset passed end of file "
+				"offset=0x%llx, length=0x%llx\n",
+				_LLU(ios->per_dev[i].offset),
+				_LLU(ios->per_dev[i].length));
+
+			continue; /* we recovered */
 		}
 
 		if (osi.osd_err_pri >= acumulated_osd_err) {
@@ -205,14 +260,259 @@ int exofs_check_io(struct exofs_io_state *ios, u64 *resid)
 	return acumulated_lin_err;
 }
 
+/*
+ * L - logical offset into the file
+ *
+ * U - The number of bytes in a stripe within a group
+ *
+ *	U = stripe_unit * group_width
+ *
+ * T - The number of bytes striped within a group of component objects
+ *     (before advancing to the next group)
+ *
+ *	T = stripe_unit * group_width * group_depth
+ *
+ * S - The number of bytes striped across all component objects
+ *     before the pattern repeats
+ *
+ *	S = stripe_unit * group_width * group_depth * group_count
+ *
+ * M - The "major" (i.e., across all components) stripe number
+ *
+ *	M = L / S
+ *
+ * G - Counts the groups from the beginning of the major stripe
+ *
+ *	G = (L - (M * S)) / T	[or (L % S) / T]
+ *
+ * H - The byte offset within the group
+ *
+ *	H = (L - (M * S)) % T	[or (L % S) % T]
+ *
+ * N - The "minor" (i.e., across the group) stripe number
+ *
+ *	N = H / U
+ *
+ * C - The component index coresponding to L
+ *
+ *	C = (H - (N * U)) / stripe_unit + G * group_width
+ *	[or (L % U) / stripe_unit + G * group_width]
+ *
+ * O - The component offset coresponding to L
+ *
+ *	O = L % stripe_unit + N * stripe_unit + M * group_depth * stripe_unit
+ */
+struct _striping_info {
+	u64 obj_offset;
+	u64 group_length;
+	u64 total_group_length;
+	u64 Major;
+	unsigned dev;
+	unsigned unit_off;
+};
+
+static void _calc_stripe_info(struct exofs_io_state *ios, u64 file_offset,
+			      struct _striping_info *si)
+{
+	u32	stripe_unit = ios->layout->stripe_unit;
+	u32	group_width = ios->layout->group_width;
+	u64	group_depth = ios->layout->group_depth;
+
+	u32	U = stripe_unit * group_width;
+	u64	T = U * group_depth;
+	u64	S = T * ios->layout->group_count;
+	u64	M = div64_u64(file_offset, S);
+
+	/*
+	G = (L - (M * S)) / T
+	H = (L - (M * S)) % T
+	*/
+	u64	LmodS = file_offset - M * S;
+	u32	G = div64_u64(LmodS, T);
+	u64	H = LmodS - G * T;
+
+	u32	N = div_u64(H, U);
+
+	/* "H - (N * U)" is just "H % U" so it's bound to u32 */
+	si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width;
+	si->dev *= ios->layout->mirrors_p1;
+
+	div_u64_rem(file_offset, stripe_unit, &si->unit_off);
+
+	si->obj_offset = si->unit_off + (N * stripe_unit) +
+				  (M * group_depth * stripe_unit);
+
+	si->group_length = T - H;
+	si->total_group_length = T;
+	si->Major = M;
+}
+
+static int _add_stripe_unit(struct exofs_io_state *ios,  unsigned *cur_pg,
+		unsigned pgbase, struct exofs_per_dev_state *per_dev,
+		int cur_len)
+{
+	unsigned pg = *cur_pg;
+	struct request_queue *q =
+			osd_request_queue(exofs_ios_od(ios, per_dev->dev));
+
+	per_dev->length += cur_len;
+
+	if (per_dev->bio == NULL) {
+		unsigned pages_in_stripe = ios->layout->group_width *
+					(ios->layout->stripe_unit / PAGE_SIZE);
+		unsigned bio_size = (ios->nr_pages + pages_in_stripe) /
+						ios->layout->group_width;
+
+		per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size);
+		if (unlikely(!per_dev->bio)) {
+			EXOFS_DBGMSG("Faild to allocate BIO size=%u\n",
+				     bio_size);
+			return -ENOMEM;
+		}
+	}
+
+	while (cur_len > 0) {
+		unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len);
+		unsigned added_len;
+
+		BUG_ON(ios->nr_pages <= pg);
+		cur_len -= pglen;
+
+		added_len = bio_add_pc_page(q, per_dev->bio, ios->pages[pg],
+					    pglen, pgbase);
+		if (unlikely(pglen != added_len))
+			return -ENOMEM;
+		pgbase = 0;
+		++pg;
+	}
+	BUG_ON(cur_len);
+
+	*cur_pg = pg;
+	return 0;
+}
+
+static int _prepare_one_group(struct exofs_io_state *ios, u64 length,
+			      struct _striping_info *si, unsigned first_comp)
+{
+	unsigned stripe_unit = ios->layout->stripe_unit;
+	unsigned mirrors_p1 = ios->layout->mirrors_p1;
+	unsigned devs_in_group = ios->layout->group_width * mirrors_p1;
+	unsigned dev = si->dev;
+	unsigned first_dev = dev - (dev % devs_in_group);
+	unsigned comp = first_comp + (dev - first_dev);
+	unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0;
+	unsigned cur_pg = ios->pages_consumed;
+	int ret = 0;
+
+	while (length) {
+		struct exofs_per_dev_state *per_dev = &ios->per_dev[comp];
+		unsigned cur_len, page_off = 0;
+
+		if (!per_dev->length) {
+			per_dev->dev = dev;
+			if (dev < si->dev) {
+				per_dev->offset = si->obj_offset + stripe_unit -
+								   si->unit_off;
+				cur_len = stripe_unit;
+			} else if (dev == si->dev) {
+				per_dev->offset = si->obj_offset;
+				cur_len = stripe_unit - si->unit_off;
+				page_off = si->unit_off & ~PAGE_MASK;
+				BUG_ON(page_off && (page_off != ios->pgbase));
+			} else { /* dev > si->dev */
+				per_dev->offset = si->obj_offset - si->unit_off;
+				cur_len = stripe_unit;
+			}
+
+			if (max_comp < comp)
+				max_comp = comp;
+
+			dev += mirrors_p1;
+			dev = (dev % devs_in_group) + first_dev;
+		} else {
+			cur_len = stripe_unit;
+		}
+		if (cur_len >= length)
+			cur_len = length;
+
+		ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev,
+				       cur_len);
+		if (unlikely(ret))
+			goto out;
+
+		comp += mirrors_p1;
+		comp = (comp % devs_in_group) + first_comp;
+
+		length -= cur_len;
+	}
+out:
+	ios->numdevs = max_comp + mirrors_p1;
+	ios->pages_consumed = cur_pg;
+	return ret;
+}
+
+static int _prepare_for_striping(struct exofs_io_state *ios)
+{
+	u64 length = ios->length;
+	struct _striping_info si;
+	unsigned devs_in_group = ios->layout->group_width *
+				 ios->layout->mirrors_p1;
+	unsigned first_comp = 0;
+	int ret = 0;
+
+	_calc_stripe_info(ios, ios->offset, &si);
+
+	if (!ios->pages) {
+		if (ios->kern_buff) {
+			struct exofs_per_dev_state *per_dev = &ios->per_dev[0];
+
+			per_dev->offset = si.obj_offset;
+			per_dev->dev = si.dev;
+
+			/* no cross device without page array */
+			BUG_ON((ios->layout->group_width > 1) &&
+			       (si.unit_off + ios->length >
+				ios->layout->stripe_unit));
+		}
+		ios->numdevs = ios->layout->mirrors_p1;
+		return 0;
+	}
+
+	while (length) {
+		if (length < si.group_length)
+			si.group_length = length;
+
+		ret = _prepare_one_group(ios, si.group_length, &si, first_comp);
+		if (unlikely(ret))
+			goto out;
+
+		length -= si.group_length;
+
+		si.group_length = si.total_group_length;
+		si.unit_off = 0;
+		++si.Major;
+		si.obj_offset = si.Major * ios->layout->stripe_unit *
+						ios->layout->group_depth;
+
+		si.dev = (si.dev - (si.dev % devs_in_group)) + devs_in_group;
+		si.dev %= ios->layout->s_numdevs;
+
+		first_comp += devs_in_group;
+		first_comp %= ios->layout->s_numdevs;
+	}
+
+out:
+	return ret;
+}
+
 int exofs_sbi_create(struct exofs_io_state *ios)
 {
 	int i, ret;
 
-	for (i = 0; i < ios->sbi->s_numdevs; i++) {
+	for (i = 0; i < ios->layout->s_numdevs; i++) {
 		struct osd_request *or;
 
-		or = osd_start_request(ios->sbi->s_ods[i], GFP_KERNEL);
+		or = osd_start_request(exofs_ios_od(ios, i), GFP_KERNEL);
 		if (unlikely(!or)) {
 			EXOFS_ERR("%s: osd_start_request failed\n", __func__);
 			ret = -ENOMEM;
@@ -233,10 +533,10 @@ int exofs_sbi_remove(struct exofs_io_state *ios)
 {
 	int i, ret;
 
-	for (i = 0; i < ios->sbi->s_numdevs; i++) {
+	for (i = 0; i < ios->layout->s_numdevs; i++) {
 		struct osd_request *or;
 
-		or = osd_start_request(ios->sbi->s_ods[i], GFP_KERNEL);
+		or = osd_start_request(exofs_ios_od(ios, i), GFP_KERNEL);
 		if (unlikely(!or)) {
 			EXOFS_ERR("%s: osd_start_request failed\n", __func__);
 			ret = -ENOMEM;
@@ -253,51 +553,74 @@ out:
 	return ret;
 }
 
-int exofs_sbi_write(struct exofs_io_state *ios)
+static int _sbi_write_mirror(struct exofs_io_state *ios, int cur_comp)
 {
-	int i, ret;
+	struct exofs_per_dev_state *master_dev = &ios->per_dev[cur_comp];
+	unsigned dev = ios->per_dev[cur_comp].dev;
+	unsigned last_comp = cur_comp + ios->layout->mirrors_p1;
+	int ret = 0;
 
-	for (i = 0; i < ios->sbi->s_numdevs; i++) {
+	if (ios->pages && !master_dev->length)
+		return 0; /* Just an empty slot */
+
+	for (; cur_comp < last_comp; ++cur_comp, ++dev) {
+		struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp];
 		struct osd_request *or;
 
-		or = osd_start_request(ios->sbi->s_ods[i], GFP_KERNEL);
+		or = osd_start_request(exofs_ios_od(ios, dev), GFP_KERNEL);
 		if (unlikely(!or)) {
 			EXOFS_ERR("%s: osd_start_request failed\n", __func__);
 			ret = -ENOMEM;
 			goto out;
 		}
-		ios->per_dev[i].or = or;
-		ios->numdevs++;
+		per_dev->or = or;
+		per_dev->offset = master_dev->offset;
 
-		if (ios->bio) {
+		if (ios->pages) {
 			struct bio *bio;
 
-			if (i != 0) {
+			if (per_dev != master_dev) {
 				bio = bio_kmalloc(GFP_KERNEL,
-						  ios->bio->bi_max_vecs);
+						  master_dev->bio->bi_max_vecs);
 				if (unlikely(!bio)) {
+					EXOFS_DBGMSG(
+					      "Faild to allocate BIO size=%u\n",
+					      master_dev->bio->bi_max_vecs);
 					ret = -ENOMEM;
 					goto out;
 				}
 
-				__bio_clone(bio, ios->bio);
+				__bio_clone(bio, master_dev->bio);
 				bio->bi_bdev = NULL;
 				bio->bi_next = NULL;
-				ios->per_dev[i].bio =  bio;
+				per_dev->length = master_dev->length;
+				per_dev->bio =  bio;
+				per_dev->dev = dev;
 			} else {
-				bio = ios->bio;
+				bio = master_dev->bio;
+				/* FIXME: bio_set_dir() */
+				bio->bi_rw |= (1 << BIO_RW);
 			}
 
-			osd_req_write(or, &ios->obj, ios->offset, bio,
-				      ios->length);
-/*			EXOFS_DBGMSG("write sync=%d\n", sync);*/
+			osd_req_write(or, &ios->obj, per_dev->offset, bio,
+				      per_dev->length);
+			EXOFS_DBGMSG("write(0x%llx) offset=0x%llx "
+				      "length=0x%llx dev=%d\n",
+				     _LLU(ios->obj.id), _LLU(per_dev->offset),
+				     _LLU(per_dev->length), dev);
 		} else if (ios->kern_buff) {
-			osd_req_write_kern(or, &ios->obj, ios->offset,
+			ret = osd_req_write_kern(or, &ios->obj, per_dev->offset,
 					   ios->kern_buff, ios->length);
-/*			EXOFS_DBGMSG("write_kern sync=%d\n", sync);*/
+			if (unlikely(ret))
+				goto out;
+			EXOFS_DBGMSG2("write_kern(0x%llx) offset=0x%llx "
+				      "length=0x%llx dev=%d\n",
+				     _LLU(ios->obj.id), _LLU(per_dev->offset),
+				     _LLU(ios->length), dev);
 		} else {
 			osd_req_set_attributes(or, &ios->obj);
-/*			EXOFS_DBGMSG("set_attributes sync=%d\n", sync);*/
+			EXOFS_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n",
+				     _LLU(ios->obj.id), ios->out_attr_len, dev);
 		}
 
 		if (ios->out_attr)
@@ -308,54 +631,93 @@ int exofs_sbi_write(struct exofs_io_state *ios)
 			osd_req_add_get_attr_list(or, ios->in_attr,
 						  ios->in_attr_len);
 	}
-	ret = exofs_io_execute(ios);
 
 out:
 	return ret;
 }
 
-int exofs_sbi_read(struct exofs_io_state *ios)
+int exofs_sbi_write(struct exofs_io_state *ios)
 {
-	int i, ret;
+	int i;
+	int ret;
 
-	for (i = 0; i < 1; i++) {
-		struct osd_request *or;
-		unsigned first_dev = (unsigned)ios->obj.id;
+	ret = _prepare_for_striping(ios);
+	if (unlikely(ret))
+		return ret;
 
-		first_dev %= ios->sbi->s_numdevs;
-		or = osd_start_request(ios->sbi->s_ods[first_dev], GFP_KERNEL);
-		if (unlikely(!or)) {
-			EXOFS_ERR("%s: osd_start_request failed\n", __func__);
-			ret = -ENOMEM;
-			goto out;
-		}
-		ios->per_dev[i].or = or;
-		ios->numdevs++;
+	for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
+		ret = _sbi_write_mirror(ios, i);
+		if (unlikely(ret))
+			return ret;
+	}
 
-		if (ios->bio) {
-			osd_req_read(or, &ios->obj, ios->offset, ios->bio,
-				     ios->length);
-/*			EXOFS_DBGMSG("read sync=%d\n", sync);*/
-		} else if (ios->kern_buff) {
-			osd_req_read_kern(or, &ios->obj, ios->offset,
-					   ios->kern_buff, ios->length);
-/*			EXOFS_DBGMSG("read_kern sync=%d\n", sync);*/
-		} else {
-			osd_req_get_attributes(or, &ios->obj);
-/*			EXOFS_DBGMSG("get_attributes sync=%d\n", sync);*/
-		}
+	ret = exofs_io_execute(ios);
+	return ret;
+}
 
-		if (ios->out_attr)
-			osd_req_add_set_attr_list(or, ios->out_attr,
-						  ios->out_attr_len);
+static int _sbi_read_mirror(struct exofs_io_state *ios, unsigned cur_comp)
+{
+	struct osd_request *or;
+	struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp];
+	unsigned first_dev = (unsigned)ios->obj.id;
 
-		if (ios->in_attr)
-			osd_req_add_get_attr_list(or, ios->in_attr,
-						  ios->in_attr_len);
+	if (ios->pages && !per_dev->length)
+		return 0; /* Just an empty slot */
+
+	first_dev = per_dev->dev + first_dev % ios->layout->mirrors_p1;
+	or = osd_start_request(exofs_ios_od(ios, first_dev), GFP_KERNEL);
+	if (unlikely(!or)) {
+		EXOFS_ERR("%s: osd_start_request failed\n", __func__);
+		return -ENOMEM;
 	}
-	ret = exofs_io_execute(ios);
+	per_dev->or = or;
+
+	if (ios->pages) {
+		osd_req_read(or, &ios->obj, per_dev->offset,
+				per_dev->bio, per_dev->length);
+		EXOFS_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx"
+			     " dev=%d\n", _LLU(ios->obj.id),
+			     _LLU(per_dev->offset), _LLU(per_dev->length),
+			     first_dev);
+	} else if (ios->kern_buff) {
+		int ret = osd_req_read_kern(or, &ios->obj, per_dev->offset,
+					    ios->kern_buff, ios->length);
+		EXOFS_DBGMSG2("read_kern(0x%llx) offset=0x%llx "
+			      "length=0x%llx dev=%d ret=>%d\n",
+			      _LLU(ios->obj.id), _LLU(per_dev->offset),
+			      _LLU(ios->length), first_dev, ret);
+		if (unlikely(ret))
+			return ret;
+	} else {
+		osd_req_get_attributes(or, &ios->obj);
+		EXOFS_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n",
+			      _LLU(ios->obj.id), ios->in_attr_len, first_dev);
+	}
+	if (ios->out_attr)
+		osd_req_add_set_attr_list(or, ios->out_attr, ios->out_attr_len);
 
-out:
+	if (ios->in_attr)
+		osd_req_add_get_attr_list(or, ios->in_attr, ios->in_attr_len);
+
+	return 0;
+}
+
+int exofs_sbi_read(struct exofs_io_state *ios)
+{
+	int i;
+	int ret;
+
+	ret = _prepare_for_striping(ios);
+	if (unlikely(ret))
+		return ret;
+
+	for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
+		ret = _sbi_read_mirror(ios, i);
+		if (unlikely(ret))
+			return ret;
+	}
+
+	ret = exofs_io_execute(ios);
 	return ret;
 }
 
@@ -380,42 +742,82 @@ int extract_attr_from_ios(struct exofs_io_state *ios, struct osd_attr *attr)
 	return -EIO;
 }
 
+static int _truncate_mirrors(struct exofs_io_state *ios, unsigned cur_comp,
+			     struct osd_attr *attr)
+{
+	int last_comp = cur_comp + ios->layout->mirrors_p1;
+
+	for (; cur_comp < last_comp; ++cur_comp) {
+		struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp];
+		struct osd_request *or;
+
+		or = osd_start_request(exofs_ios_od(ios, cur_comp), GFP_KERNEL);
+		if (unlikely(!or)) {
+			EXOFS_ERR("%s: osd_start_request failed\n", __func__);
+			return -ENOMEM;
+		}
+		per_dev->or = or;
+
+		osd_req_set_attributes(or, &ios->obj);
+		osd_req_add_set_attr_list(or, attr, 1);
+	}
+
+	return 0;
+}
+
 int exofs_oi_truncate(struct exofs_i_info *oi, u64 size)
 {
 	struct exofs_sb_info *sbi = oi->vfs_inode.i_sb->s_fs_info;
 	struct exofs_io_state *ios;
-	struct osd_attr attr;
-	__be64 newsize;
+	struct exofs_trunc_attr {
+		struct osd_attr attr;
+		__be64 newsize;
+	} *size_attrs;
+	struct _striping_info si;
 	int i, ret;
 
-	if (exofs_get_io_state(sbi, &ios))
-		return -ENOMEM;
+	ret = exofs_get_io_state(&sbi->layout, &ios);
+	if (unlikely(ret))
+		return ret;
+
+	size_attrs = kcalloc(ios->layout->group_width, sizeof(*size_attrs),
+			     GFP_KERNEL);
+	if (unlikely(!size_attrs)) {
+		ret = -ENOMEM;
+		goto out;
+	}
 
 	ios->obj.id = exofs_oi_objno(oi);
 	ios->cred = oi->i_cred;
 
-	newsize = cpu_to_be64(size);
-	attr = g_attr_logical_length;
-	attr.val_ptr = &newsize;
+	ios->numdevs = ios->layout->s_numdevs;
+	_calc_stripe_info(ios, size, &si);
 
-	for (i = 0; i < sbi->s_numdevs; i++) {
-		struct osd_request *or;
+	for (i = 0; i < ios->layout->group_width; ++i) {
+		struct exofs_trunc_attr *size_attr = &size_attrs[i];
+		u64 obj_size;
 
-		or = osd_start_request(sbi->s_ods[i], GFP_KERNEL);
-		if (unlikely(!or)) {
-			EXOFS_ERR("%s: osd_start_request failed\n", __func__);
-			ret = -ENOMEM;
-			goto out;
-		}
-		ios->per_dev[i].or = or;
-		ios->numdevs++;
+		if (i < si.dev)
+			obj_size = si.obj_offset +
+					ios->layout->stripe_unit - si.unit_off;
+		else if (i == si.dev)
+			obj_size = si.obj_offset;
+		else /* i > si.dev */
+			obj_size = si.obj_offset - si.unit_off;
 
-		osd_req_set_attributes(or, &ios->obj);
-		osd_req_add_set_attr_list(or, &attr, 1);
+		size_attr->newsize = cpu_to_be64(obj_size);
+		size_attr->attr = g_attr_logical_length;
+		size_attr->attr.val_ptr = &size_attr->newsize;
+
+		ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1,
+					&size_attr->attr);
+		if (unlikely(ret))
+			goto out;
 	}
 	ret = exofs_io_execute(ios);
 
 out:
+	kfree(size_attrs);
 	exofs_put_io_state(ios);
 	return ret;
 }
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index a1d1e77b12eb..03149b9a5178 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -37,6 +37,7 @@
 #include <linux/vfs.h>
 #include <linux/random.h>
 #include <linux/exportfs.h>
+#include <linux/slab.h>
 
 #include "exofs.h"
 
@@ -210,7 +211,7 @@ int exofs_sync_fs(struct super_block *sb, int wait)
 	sbi = sb->s_fs_info;
 	fscb = &sbi->s_fscb;
 
-	ret = exofs_get_io_state(sbi, &ios);
+	ret = exofs_get_io_state(&sbi->layout, &ios);
 	if (ret)
 		goto out;
 
@@ -264,12 +265,12 @@ static void _exofs_print_device(const char *msg, const char *dev_path,
 
 void exofs_free_sbi(struct exofs_sb_info *sbi)
 {
-	while (sbi->s_numdevs) {
-		int i = --sbi->s_numdevs;
-		struct osd_dev *od = sbi->s_ods[i];
+	while (sbi->layout.s_numdevs) {
+		int i = --sbi->layout.s_numdevs;
+		struct osd_dev *od = sbi->layout.s_ods[i];
 
 		if (od) {
-			sbi->s_ods[i] = NULL;
+			sbi->layout.s_ods[i] = NULL;
 			osduld_put_device(od);
 		}
 	}
@@ -298,8 +299,10 @@ static void exofs_put_super(struct super_block *sb)
 				  msecs_to_jiffies(100));
 	}
 
-	_exofs_print_device("Unmounting", NULL, sbi->s_ods[0], sbi->s_pid);
+	_exofs_print_device("Unmounting", NULL, sbi->layout.s_ods[0],
+			    sbi->layout.s_pid);
 
+	bdi_destroy(&sbi->bdi);
 	exofs_free_sbi(sbi);
 	sb->s_fs_info = NULL;
 }
@@ -307,6 +310,8 @@ static void exofs_put_super(struct super_block *sb)
 static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs,
 				    struct exofs_device_table *dt)
 {
+	u64 stripe_length;
+
 	sbi->data_map.odm_num_comps   =
 				le32_to_cpu(dt->dt_data_map.cb_num_comps);
 	sbi->data_map.odm_stripe_unit =
@@ -320,14 +325,63 @@ static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs,
 	sbi->data_map.odm_raid_algorithm  =
 				le32_to_cpu(dt->dt_data_map.cb_raid_algorithm);
 
-/* FIXME: Hard coded mirror only for now. if not so do not mount */
-	if ((sbi->data_map.odm_num_comps != numdevs) ||
-	    (sbi->data_map.odm_stripe_unit != EXOFS_BLKSIZE) ||
-	    (sbi->data_map.odm_raid_algorithm != PNFS_OSD_RAID_0) ||
-	    (sbi->data_map.odm_mirror_cnt != (numdevs - 1)))
+/* FIXME: Only raid0 for now. if not so, do not mount */
+	if (sbi->data_map.odm_num_comps != numdevs) {
+		EXOFS_ERR("odm_num_comps(%u) != numdevs(%u)\n",
+			  sbi->data_map.odm_num_comps, numdevs);
 		return -EINVAL;
-	else
-		return 0;
+	}
+	if (sbi->data_map.odm_raid_algorithm != PNFS_OSD_RAID_0) {
+		EXOFS_ERR("Only RAID_0 for now\n");
+		return -EINVAL;
+	}
+	if (0 != (numdevs % (sbi->data_map.odm_mirror_cnt + 1))) {
+		EXOFS_ERR("Data Map wrong, numdevs=%d mirrors=%d\n",
+			  numdevs, sbi->data_map.odm_mirror_cnt);
+		return -EINVAL;
+	}
+
+	if (0 != (sbi->data_map.odm_stripe_unit & ~PAGE_MASK)) {
+		EXOFS_ERR("Stripe Unit(0x%llx)"
+			  " must be Multples of PAGE_SIZE(0x%lx)\n",
+			  _LLU(sbi->data_map.odm_stripe_unit), PAGE_SIZE);
+		return -EINVAL;
+	}
+
+	sbi->layout.stripe_unit = sbi->data_map.odm_stripe_unit;
+	sbi->layout.mirrors_p1 = sbi->data_map.odm_mirror_cnt + 1;
+
+	if (sbi->data_map.odm_group_width) {
+		sbi->layout.group_width = sbi->data_map.odm_group_width;
+		sbi->layout.group_depth = sbi->data_map.odm_group_depth;
+		if (!sbi->layout.group_depth) {
+			EXOFS_ERR("group_depth == 0 && group_width != 0\n");
+			return -EINVAL;
+		}
+		sbi->layout.group_count = sbi->data_map.odm_num_comps /
+						sbi->layout.mirrors_p1 /
+						sbi->data_map.odm_group_width;
+	} else {
+		if (sbi->data_map.odm_group_depth) {
+			printk(KERN_NOTICE "Warning: group_depth ignored "
+				"group_width == 0 && group_depth == %d\n",
+				sbi->data_map.odm_group_depth);
+			sbi->data_map.odm_group_depth = 0;
+		}
+		sbi->layout.group_width = sbi->data_map.odm_num_comps /
+							sbi->layout.mirrors_p1;
+		sbi->layout.group_depth = -1;
+		sbi->layout.group_count = 1;
+	}
+
+	stripe_length = (u64)sbi->layout.group_width * sbi->layout.stripe_unit;
+	if (stripe_length >= (1ULL << 32)) {
+		EXOFS_ERR("Total Stripe length(0x%llx)"
+			  " >= 32bit is not supported\n", _LLU(stripe_length));
+		return -EINVAL;
+	}
+
+	return 0;
 }
 
 /* @odi is valid only as long as @fscb_dev is valid */
@@ -361,7 +415,7 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi,
 {
 	struct exofs_sb_info *sbi = *psbi;
 	struct osd_dev *fscb_od;
-	struct osd_obj_id obj = {.partition = sbi->s_pid,
+	struct osd_obj_id obj = {.partition = sbi->layout.s_pid,
 				 .id = EXOFS_DEVTABLE_ID};
 	struct exofs_device_table *dt;
 	unsigned table_bytes = table_count * sizeof(dt->dt_dev_table[0]) +
@@ -376,9 +430,9 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi,
 		return -ENOMEM;
 	}
 
-	fscb_od = sbi->s_ods[0];
-	sbi->s_ods[0] = NULL;
-	sbi->s_numdevs = 0;
+	fscb_od = sbi->layout.s_ods[0];
+	sbi->layout.s_ods[0] = NULL;
+	sbi->layout.s_numdevs = 0;
 	ret = exofs_read_kern(fscb_od, sbi->s_cred, &obj, 0, dt, table_bytes);
 	if (unlikely(ret)) {
 		EXOFS_ERR("ERROR: reading device table\n");
@@ -397,14 +451,15 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi,
 		goto out;
 
 	if (likely(numdevs > 1)) {
-		unsigned size = numdevs * sizeof(sbi->s_ods[0]);
+		unsigned size = numdevs * sizeof(sbi->layout.s_ods[0]);
 
 		sbi = krealloc(sbi, sizeof(*sbi) + size, GFP_KERNEL);
 		if (unlikely(!sbi)) {
 			ret = -ENOMEM;
 			goto out;
 		}
-		memset(&sbi->s_ods[1], 0, size - sizeof(sbi->s_ods[0]));
+		memset(&sbi->layout.s_ods[1], 0,
+		       size - sizeof(sbi->layout.s_ods[0]));
 		*psbi = sbi;
 	}
 
@@ -427,8 +482,8 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi,
 		 * line. We always keep them in device-table order.
 		 */
 		if (fscb_od && osduld_device_same(fscb_od, &odi)) {
-			sbi->s_ods[i] = fscb_od;
-			++sbi->s_numdevs;
+			sbi->layout.s_ods[i] = fscb_od;
+			++sbi->layout.s_numdevs;
 			fscb_od = NULL;
 			continue;
 		}
@@ -441,8 +496,8 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi,
 			goto out;
 		}
 
-		sbi->s_ods[i] = od;
-		++sbi->s_numdevs;
+		sbi->layout.s_ods[i] = od;
+		++sbi->layout.s_numdevs;
 
 		/* Read the fscb of the other devices to make sure the FS
 		 * partition is there.
@@ -492,6 +547,10 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
 	if (!sbi)
 		return -ENOMEM;
 
+	ret = bdi_setup_and_register(&sbi->bdi, "exofs", BDI_CAP_MAP_COPY);
+	if (ret)
+		goto free_bdi;
+
 	/* use mount options to fill superblock */
 	od = osduld_path_lookup(opts->dev_name);
 	if (IS_ERR(od)) {
@@ -499,9 +558,15 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
 		goto free_sbi;
 	}
 
-	sbi->s_ods[0] = od;
-	sbi->s_numdevs = 1;
-	sbi->s_pid = opts->pid;
+	/* Default layout in case we do not have a device-table */
+	sbi->layout.stripe_unit = PAGE_SIZE;
+	sbi->layout.mirrors_p1 = 1;
+	sbi->layout.group_width = 1;
+	sbi->layout.group_depth = -1;
+	sbi->layout.group_count = 1;
+	sbi->layout.s_ods[0] = od;
+	sbi->layout.s_numdevs = 1;
+	sbi->layout.s_pid = opts->pid;
 	sbi->s_timeout = opts->timeout;
 
 	/* fill in some other data by hand */
@@ -514,7 +579,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_bdev = NULL;
 	sb->s_dev = 0;
 
-	obj.partition = sbi->s_pid;
+	obj.partition = sbi->layout.s_pid;
 	obj.id = EXOFS_SUPER_ID;
 	exofs_make_credential(sbi->s_cred, &obj);
 
@@ -552,6 +617,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
 	}
 
 	/* set up operation vectors */
+	sb->s_bdi = &sbi->bdi;
 	sb->s_fs_info = sbi;
 	sb->s_op = &exofs_sops;
 	sb->s_export_op = &exofs_export_ops;
@@ -578,13 +644,15 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
 		goto free_sbi;
 	}
 
-	_exofs_print_device("Mounting", opts->dev_name, sbi->s_ods[0],
-			    sbi->s_pid);
+	_exofs_print_device("Mounting", opts->dev_name, sbi->layout.s_ods[0],
+			    sbi->layout.s_pid);
 	return 0;
 
 free_sbi:
+	bdi_destroy(&sbi->bdi);
+free_bdi:
 	EXOFS_ERR("Unable to mount exofs on %s pid=0x%llx err=%d\n",
-		  opts->dev_name, sbi->s_pid, ret);
+		  opts->dev_name, sbi->layout.s_pid, ret);
 	exofs_free_sbi(sbi);
 	return ret;
 }
@@ -627,7 +695,7 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	uint8_t cred_a[OSD_CAP_LEN];
 	int ret;
 
-	ret = exofs_get_io_state(sbi, &ios);
+	ret = exofs_get_io_state(&sbi->layout, &ios);
 	if (ret) {
 		EXOFS_DBGMSG("exofs_get_io_state failed.\n");
 		return ret;
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index a99e54318c3d..2bcc0431bada 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -200,6 +200,7 @@ ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
 					return error;
 				else {
 					inode->i_mode = mode;
+					inode->i_ctime = CURRENT_TIME_SEC;
 					mark_inode_dirty(inode);
 					if (error == 0)
 						acl = NULL;
@@ -420,7 +421,7 @@ release_and_out:
 	return error;
 }
 
-struct xattr_handler ext2_xattr_acl_access_handler = {
+const struct xattr_handler ext2_xattr_acl_access_handler = {
 	.prefix	= POSIX_ACL_XATTR_ACCESS,
 	.flags	= ACL_TYPE_ACCESS,
 	.list	= ext2_xattr_list_acl_access,
@@ -428,7 +429,7 @@ struct xattr_handler ext2_xattr_acl_access_handler = {
 	.set	= ext2_xattr_set_acl,
 };
 
-struct xattr_handler ext2_xattr_acl_default_handler = {
+const struct xattr_handler ext2_xattr_acl_default_handler = {
 	.prefix	= POSIX_ACL_XATTR_DEFAULT,
 	.flags	= ACL_TYPE_DEFAULT,
 	.list	= ext2_xattr_list_acl_default,
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index 7f8d2e5a7ea6..e8766a396776 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -13,6 +13,7 @@
 
 #include "ext2.h"
 #include <linux/quotaops.h>
+#include <linux/slab.h>
 #include <linux/sched.h>
 #include <linux/buffer_head.h>
 #include <linux/capability.h>
@@ -570,7 +571,7 @@ do_more:
 error_return:
 	brelse(bitmap_bh);
 	release_blocks(sb, freed);
-	vfs_dq_free_block(inode, freed);
+	dquot_free_block(inode, freed);
 }
 
 /**
@@ -1236,6 +1237,7 @@ ext2_fsblk_t ext2_new_blocks(struct inode *inode, ext2_fsblk_t goal,
 	unsigned short windowsz = 0;
 	unsigned long ngroups;
 	unsigned long num = *count;
+	int ret;
 
 	*errp = -ENOSPC;
 	sb = inode->i_sb;
@@ -1247,8 +1249,9 @@ ext2_fsblk_t ext2_new_blocks(struct inode *inode, ext2_fsblk_t goal,
 	/*
 	 * Check quota for allocation of this block.
 	 */
-	if (vfs_dq_alloc_block(inode, num)) {
-		*errp = -EDQUOT;
+	ret = dquot_alloc_block(inode, num);
+	if (ret) {
+		*errp = ret;
 		return 0;
 	}
 
@@ -1329,6 +1332,12 @@ retry_alloc:
 
 		free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
 		/*
+		 * skip this group (and avoid loading bitmap) if there
+		 * are no free blocks
+		 */
+		if (!free_blocks)
+			continue;
+		/*
 		 * skip this group if the number of
 		 * free blocks is less than half of the reservation
 		 * window size.
@@ -1409,7 +1418,7 @@ allocated:
 
 	*errp = 0;
 	brelse(bitmap_bh);
-	vfs_dq_free_block(inode, *count-num);
+	dquot_free_block(inode, *count-num);
 	*count = num;
 	return ret_block;
 
@@ -1420,7 +1429,7 @@ out:
 	 * Undo the block allocation
 	 */
 	if (!performed_allocation)
-		vfs_dq_free_block(inode, *count);
+		dquot_free_block(inode, *count);
 	brelse(bitmap_bh);
 	return 0;
 }
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 061914add3cf..52b34f1d2738 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -118,11 +118,10 @@ extern unsigned long ext2_count_free (struct buffer_head *, unsigned);
 
 /* inode.c */
 extern struct inode *ext2_iget (struct super_block *, unsigned long);
-extern int ext2_write_inode (struct inode *, int);
+extern int ext2_write_inode (struct inode *, struct writeback_control *);
 extern void ext2_delete_inode (struct inode *);
 extern int ext2_sync_inode (struct inode *);
 extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int);
-extern void ext2_truncate (struct inode *);
 extern int ext2_setattr (struct dentry *, struct iattr *);
 extern void ext2_set_inode_flags(struct inode *inode);
 extern void ext2_get_inode_flags(struct ext2_inode_info *);
@@ -155,7 +154,7 @@ extern void ext2_write_super (struct super_block *);
 extern const struct file_operations ext2_dir_operations;
 
 /* file.c */
-extern int ext2_fsync(struct file *file, struct dentry *dentry, int datasync);
+extern int ext2_fsync(struct file *file, int datasync);
 extern const struct inode_operations ext2_file_inode_operations;
 extern const struct file_operations ext2_file_operations;
 extern const struct file_operations ext2_xip_file_operations;
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 586e3589d4c2..49eec9456c5b 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -20,6 +20,7 @@
 
 #include <linux/time.h>
 #include <linux/pagemap.h>
+#include <linux/quotaops.h>
 #include "ext2.h"
 #include "xattr.h"
 #include "acl.h"
@@ -39,13 +40,13 @@ static int ext2_release_file (struct inode * inode, struct file * filp)
 	return 0;
 }
 
-int ext2_fsync(struct file *file, struct dentry *dentry, int datasync)
+int ext2_fsync(struct file *file, int datasync)
 {
 	int ret;
-	struct super_block *sb = dentry->d_inode->i_sb;
+	struct super_block *sb = file->f_mapping->host->i_sb;
 	struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping;
 
-	ret = simple_fsync(file, dentry, datasync);
+	ret = generic_file_fsync(file, datasync);
 	if (ret == -EIO || test_and_clear_bit(AS_EIO, &mapping->flags)) {
 		/* We don't really know where the IO error happened... */
 		ext2_error(sb, __func__,
@@ -70,7 +71,7 @@ const struct file_operations ext2_file_operations = {
 	.compat_ioctl	= ext2_compat_ioctl,
 #endif
 	.mmap		= generic_file_mmap,
-	.open		= generic_file_open,
+	.open		= dquot_file_open,
 	.release	= ext2_release_file,
 	.fsync		= ext2_fsync,
 	.splice_read	= generic_file_splice_read,
@@ -87,14 +88,13 @@ const struct file_operations ext2_xip_file_operations = {
 	.compat_ioctl	= ext2_compat_ioctl,
 #endif
 	.mmap		= xip_file_mmap,
-	.open		= generic_file_open,
+	.open		= dquot_file_open,
 	.release	= ext2_release_file,
 	.fsync		= ext2_fsync,
 };
 #endif
 
 const struct inode_operations ext2_file_inode_operations = {
-	.truncate	= ext2_truncate,
 #ifdef CONFIG_EXT2_FS_XATTR
 	.setxattr	= generic_setxattr,
 	.getxattr	= generic_getxattr,
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index 15387c9c17d8..938dbc739d00 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -106,7 +106,7 @@ void ext2_free_inode (struct inode * inode)
 	struct super_block * sb = inode->i_sb;
 	int is_directory;
 	unsigned long ino;
-	struct buffer_head *bitmap_bh = NULL;
+	struct buffer_head *bitmap_bh;
 	unsigned long block_group;
 	unsigned long bit;
 	struct ext2_super_block * es;
@@ -121,8 +121,8 @@ void ext2_free_inode (struct inode * inode)
 	if (!is_bad_inode(inode)) {
 		/* Quota is already initialized in iput() */
 		ext2_xattr_delete_inode(inode);
-		vfs_dq_free_inode(inode);
-		vfs_dq_drop(inode);
+		dquot_free_inode(inode);
+		dquot_drop(inode);
 	}
 
 	es = EXT2_SB(sb)->s_es;
@@ -135,14 +135,13 @@ void ext2_free_inode (struct inode * inode)
 	    ino > le32_to_cpu(es->s_inodes_count)) {
 		ext2_error (sb, "ext2_free_inode",
 			    "reserved or nonexistent inode %lu", ino);
-		goto error_return;
+		return;
 	}
 	block_group = (ino - 1) / EXT2_INODES_PER_GROUP(sb);
 	bit = (ino - 1) % EXT2_INODES_PER_GROUP(sb);
-	brelse(bitmap_bh);
 	bitmap_bh = read_inode_bitmap(sb, block_group);
 	if (!bitmap_bh)
-		goto error_return;
+		return;
 
 	/* Ok, now we can actually update the inode bitmaps.. */
 	if (!ext2_clear_bit_atomic(sb_bgl_lock(EXT2_SB(sb), block_group),
@@ -154,7 +153,7 @@ void ext2_free_inode (struct inode * inode)
 	mark_buffer_dirty(bitmap_bh);
 	if (sb->s_flags & MS_SYNCHRONOUS)
 		sync_dirty_buffer(bitmap_bh);
-error_return:
+
 	brelse(bitmap_bh);
 }
 
@@ -550,16 +549,12 @@ got:
 
 	sb->s_dirt = 1;
 	mark_buffer_dirty(bh2);
-	inode->i_uid = current_fsuid();
-	if (test_opt (sb, GRPID))
-		inode->i_gid = dir->i_gid;
-	else if (dir->i_mode & S_ISGID) {
+	if (test_opt(sb, GRPID)) {
+		inode->i_mode = mode;
+		inode->i_uid = current_fsuid();
 		inode->i_gid = dir->i_gid;
-		if (S_ISDIR(mode))
-			mode |= S_ISGID;
 	} else
-		inode->i_gid = current_fsgid();
-	inode->i_mode = mode;
+		inode_init_owner(inode, dir, mode);
 
 	inode->i_ino = ino;
 	inode->i_blocks = 0;
@@ -586,10 +581,10 @@ got:
 		goto fail_drop;
 	}
 
-	if (vfs_dq_alloc_inode(inode)) {
-		err = -EDQUOT;
+	dquot_initialize(inode);
+	err = dquot_alloc_inode(inode);
+	if (err)
 		goto fail_drop;
-	}
 
 	err = ext2_init_acl(inode, dir);
 	if (err)
@@ -605,10 +600,10 @@ got:
 	return inode;
 
 fail_free_drop:
-	vfs_dq_free_inode(inode);
+	dquot_free_inode(inode);
 
 fail_drop:
-	vfs_dq_drop(inode);
+	dquot_drop(inode);
 	inode->i_flags |= S_NOQUOTA;
 	inode->i_nlink = 0;
 	unlock_new_inode(inode);
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 71b032c65a02..3675088cb88c 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -22,7 +22,6 @@
  *  Assorted race fixes, rewrite of ext2_get_block() by Al Viro, 2000
  */
 
-#include <linux/smp_lock.h>
 #include <linux/time.h>
 #include <linux/highuid.h>
 #include <linux/pagemap.h>
@@ -41,6 +40,8 @@ MODULE_AUTHOR("Remy Card and others");
 MODULE_DESCRIPTION("Second Extended Filesystem");
 MODULE_LICENSE("GPL");
 
+static int __ext2_write_inode(struct inode *inode, int do_sync);
+
 /*
  * Test whether an inode is a fast symlink.
  */
@@ -53,22 +54,36 @@ static inline int ext2_inode_is_fast_symlink(struct inode *inode)
 		inode->i_blocks - ea_blocks == 0);
 }
 
+static void ext2_truncate_blocks(struct inode *inode, loff_t offset);
+
+static void ext2_write_failed(struct address_space *mapping, loff_t to)
+{
+	struct inode *inode = mapping->host;
+
+	if (to > inode->i_size) {
+		truncate_pagecache(inode, to, inode->i_size);
+		ext2_truncate_blocks(inode, inode->i_size);
+	}
+}
+
 /*
  * Called at the last iput() if i_nlink is zero.
  */
 void ext2_delete_inode (struct inode * inode)
 {
+	if (!is_bad_inode(inode))
+		dquot_initialize(inode);
 	truncate_inode_pages(&inode->i_data, 0);
 
 	if (is_bad_inode(inode))
 		goto no_delete;
 	EXT2_I(inode)->i_dtime	= get_seconds();
 	mark_inode_dirty(inode);
-	ext2_write_inode(inode, inode_needs_sync(inode));
+	__ext2_write_inode(inode, inode_needs_sync(inode));
 
 	inode->i_size = 0;
 	if (inode->i_blocks)
-		ext2_truncate (inode);
+		ext2_truncate_blocks(inode, 0);
 	ext2_free_inode (inode);
 
 	return;
@@ -754,8 +769,8 @@ int __ext2_write_begin(struct file *file, struct address_space *mapping,
 		loff_t pos, unsigned len, unsigned flags,
 		struct page **pagep, void **fsdata)
 {
-	return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
-							ext2_get_block);
+	return block_write_begin_newtrunc(file, mapping, pos, len, flags,
+					pagep, fsdata, ext2_get_block);
 }
 
 static int
@@ -763,8 +778,25 @@ ext2_write_begin(struct file *file, struct address_space *mapping,
 		loff_t pos, unsigned len, unsigned flags,
 		struct page **pagep, void **fsdata)
 {
+	int ret;
+
 	*pagep = NULL;
-	return __ext2_write_begin(file, mapping, pos, len, flags, pagep,fsdata);
+	ret = __ext2_write_begin(file, mapping, pos, len, flags, pagep, fsdata);
+	if (ret < 0)
+		ext2_write_failed(mapping, pos + len);
+	return ret;
+}
+
+static int ext2_write_end(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned copied,
+			struct page *page, void *fsdata)
+{
+	int ret;
+
+	ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
+	if (ret < len)
+		ext2_write_failed(mapping, pos + len);
+	return ret;
 }
 
 static int
@@ -772,13 +804,18 @@ ext2_nobh_write_begin(struct file *file, struct address_space *mapping,
 		loff_t pos, unsigned len, unsigned flags,
 		struct page **pagep, void **fsdata)
 {
+	int ret;
+
 	/*
 	 * Dir-in-pagecache still uses ext2_write_begin. Would have to rework
 	 * directory handling code to pass around offsets rather than struct
 	 * pages in order to make this work easily.
 	 */
-	return nobh_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
-							ext2_get_block);
+	ret = nobh_write_begin_newtrunc(file, mapping, pos, len, flags, pagep,
+						fsdata, ext2_get_block);
+	if (ret < 0)
+		ext2_write_failed(mapping, pos + len);
+	return ret;
 }
 
 static int ext2_nobh_writepage(struct page *page,
@@ -797,10 +834,15 @@ ext2_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 			loff_t offset, unsigned long nr_segs)
 {
 	struct file *file = iocb->ki_filp;
-	struct inode *inode = file->f_mapping->host;
-
-	return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
-				offset, nr_segs, ext2_get_block, NULL);
+	struct address_space *mapping = file->f_mapping;
+	struct inode *inode = mapping->host;
+	ssize_t ret;
+
+	ret = blockdev_direct_IO_newtrunc(rw, iocb, inode, inode->i_sb->s_bdev,
+				iov, offset, nr_segs, ext2_get_block, NULL);
+	if (ret < 0 && (rw & WRITE))
+		ext2_write_failed(mapping, offset + iov_length(iov, nr_segs));
+	return ret;
 }
 
 static int
@@ -815,7 +857,7 @@ const struct address_space_operations ext2_aops = {
 	.writepage		= ext2_writepage,
 	.sync_page		= block_sync_page,
 	.write_begin		= ext2_write_begin,
-	.write_end		= generic_write_end,
+	.write_end		= ext2_write_end,
 	.bmap			= ext2_bmap,
 	.direct_IO		= ext2_direct_IO,
 	.writepages		= ext2_writepages,
@@ -1024,7 +1066,7 @@ static void ext2_free_branches(struct inode *inode, __le32 *p, __le32 *q, int de
 		ext2_free_data(inode, p, q);
 }
 
-void ext2_truncate(struct inode *inode)
+static void __ext2_truncate_blocks(struct inode *inode, loff_t offset)
 {
 	__le32 *i_data = EXT2_I(inode)->i_data;
 	struct ext2_inode_info *ei = EXT2_I(inode);
@@ -1036,27 +1078,8 @@ void ext2_truncate(struct inode *inode)
 	int n;
 	long iblock;
 	unsigned blocksize;
-
-	if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
-	    S_ISLNK(inode->i_mode)))
-		return;
-	if (ext2_inode_is_fast_symlink(inode))
-		return;
-	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
-		return;
-
 	blocksize = inode->i_sb->s_blocksize;
-	iblock = (inode->i_size + blocksize-1)
-					>> EXT2_BLOCK_SIZE_BITS(inode->i_sb);
-
-	if (mapping_is_xip(inode->i_mapping))
-		xip_truncate_page(inode->i_mapping, inode->i_size);
-	else if (test_opt(inode->i_sb, NOBH))
-		nobh_truncate_page(inode->i_mapping,
-				inode->i_size, ext2_get_block);
-	else
-		block_truncate_page(inode->i_mapping,
-				inode->i_size, ext2_get_block);
+	iblock = (offset + blocksize-1) >> EXT2_BLOCK_SIZE_BITS(inode->i_sb);
 
 	n = ext2_block_to_path(inode, iblock, offsets, NULL);
 	if (n == 0)
@@ -1124,6 +1147,62 @@ do_indirects:
 	ext2_discard_reservation(inode);
 
 	mutex_unlock(&ei->truncate_mutex);
+}
+
+static void ext2_truncate_blocks(struct inode *inode, loff_t offset)
+{
+	/*
+	 * XXX: it seems like a bug here that we don't allow
+	 * IS_APPEND inode to have blocks-past-i_size trimmed off.
+	 * review and fix this.
+	 *
+	 * Also would be nice to be able to handle IO errors and such,
+	 * but that's probably too much to ask.
+	 */
+	if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+	    S_ISLNK(inode->i_mode)))
+		return;
+	if (ext2_inode_is_fast_symlink(inode))
+		return;
+	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+		return;
+	__ext2_truncate_blocks(inode, offset);
+}
+
+int ext2_setsize(struct inode *inode, loff_t newsize)
+{
+	loff_t oldsize;
+	int error;
+
+	error = inode_newsize_ok(inode, newsize);
+	if (error)
+		return error;
+
+	if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+	    S_ISLNK(inode->i_mode)))
+		return -EINVAL;
+	if (ext2_inode_is_fast_symlink(inode))
+		return -EINVAL;
+	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+		return -EPERM;
+
+	if (mapping_is_xip(inode->i_mapping))
+		error = xip_truncate_page(inode->i_mapping, newsize);
+	else if (test_opt(inode->i_sb, NOBH))
+		error = nobh_truncate_page(inode->i_mapping,
+				newsize, ext2_get_block);
+	else
+		error = block_truncate_page(inode->i_mapping,
+				newsize, ext2_get_block);
+	if (error)
+		return error;
+
+	oldsize = inode->i_size;
+	i_size_write(inode, newsize);
+	truncate_pagecache(inode, oldsize, newsize);
+
+	__ext2_truncate_blocks(inode, newsize);
+
 	inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
 	if (inode_needs_sync(inode)) {
 		sync_mapping_buffers(inode->i_mapping);
@@ -1131,6 +1210,8 @@ do_indirects:
 	} else {
 		mark_inode_dirty(inode);
 	}
+
+	return 0;
 }
 
 static struct ext2_inode *ext2_get_inode(struct super_block *sb, ino_t ino,
@@ -1335,7 +1416,7 @@ bad_inode:
 	return ERR_PTR(ret);
 }
 
-int ext2_write_inode(struct inode *inode, int do_sync)
+static int __ext2_write_inode(struct inode *inode, int do_sync)
 {
 	struct ext2_inode_info *ei = EXT2_I(inode);
 	struct super_block *sb = inode->i_sb;
@@ -1402,11 +1483,11 @@ int ext2_write_inode(struct inode *inode, int do_sync)
 			       /* If this is the first large file
 				* created, add a flag to the superblock.
 				*/
-				lock_kernel();
+				spin_lock(&EXT2_SB(sb)->s_lock);
 				ext2_update_dynamic_rev(sb);
 				EXT2_SET_RO_COMPAT_FEATURE(sb,
 					EXT2_FEATURE_RO_COMPAT_LARGE_FILE);
-				unlock_kernel();
+				spin_unlock(&EXT2_SB(sb)->s_lock);
 				ext2_write_super(sb);
 			}
 		}
@@ -1440,6 +1521,11 @@ int ext2_write_inode(struct inode *inode, int do_sync)
 	return err;
 }
 
+int ext2_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+	return __ext2_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
+}
+
 int ext2_sync_inode(struct inode *inode)
 {
 	struct writeback_control wbc = {
@@ -1457,14 +1543,24 @@ int ext2_setattr(struct dentry *dentry, struct iattr *iattr)
 	error = inode_change_ok(inode, iattr);
 	if (error)
 		return error;
+
+	if (is_quota_modification(inode, iattr))
+		dquot_initialize(inode);
 	if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) ||
 	    (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) {
-		error = vfs_dq_transfer(inode, iattr) ? -EDQUOT : 0;
+		error = dquot_transfer(inode, iattr);
 		if (error)
 			return error;
 	}
-	error = inode_setattr(inode, iattr);
-	if (!error && (iattr->ia_valid & ATTR_MODE))
+	if (iattr->ia_valid & ATTR_SIZE && iattr->ia_size != inode->i_size) {
+		error = ext2_setsize(inode, iattr->ia_size);
+		if (error)
+			return error;
+	}
+	generic_setattr(inode, iattr);
+	if (iattr->ia_valid & ATTR_MODE)
 		error = ext2_acl_chmod(inode);
+	mark_inode_dirty(inode);
+
 	return error;
 }
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index dd7175ce5606..71efb0e9a3f2 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -31,6 +31,7 @@
  */
 
 #include <linux/pagemap.h>
+#include <linux/quotaops.h>
 #include "ext2.h"
 #include "xattr.h"
 #include "acl.h"
@@ -99,24 +100,27 @@ struct dentry *ext2_get_parent(struct dentry *child)
  */
 static int ext2_create (struct inode * dir, struct dentry * dentry, int mode, struct nameidata *nd)
 {
-	struct inode * inode = ext2_new_inode (dir, mode);
-	int err = PTR_ERR(inode);
-	if (!IS_ERR(inode)) {
-		inode->i_op = &ext2_file_inode_operations;
-		if (ext2_use_xip(inode->i_sb)) {
-			inode->i_mapping->a_ops = &ext2_aops_xip;
-			inode->i_fop = &ext2_xip_file_operations;
-		} else if (test_opt(inode->i_sb, NOBH)) {
-			inode->i_mapping->a_ops = &ext2_nobh_aops;
-			inode->i_fop = &ext2_file_operations;
-		} else {
-			inode->i_mapping->a_ops = &ext2_aops;
-			inode->i_fop = &ext2_file_operations;
-		}
-		mark_inode_dirty(inode);
-		err = ext2_add_nondir(dentry, inode);
+	struct inode *inode;
+
+	dquot_initialize(dir);
+
+	inode = ext2_new_inode(dir, mode);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+
+	inode->i_op = &ext2_file_inode_operations;
+	if (ext2_use_xip(inode->i_sb)) {
+		inode->i_mapping->a_ops = &ext2_aops_xip;
+		inode->i_fop = &ext2_xip_file_operations;
+	} else if (test_opt(inode->i_sb, NOBH)) {
+		inode->i_mapping->a_ops = &ext2_nobh_aops;
+		inode->i_fop = &ext2_file_operations;
+	} else {
+		inode->i_mapping->a_ops = &ext2_aops;
+		inode->i_fop = &ext2_file_operations;
 	}
-	return err;
+	mark_inode_dirty(inode);
+	return ext2_add_nondir(dentry, inode);
 }
 
 static int ext2_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_t rdev)
@@ -127,6 +131,8 @@ static int ext2_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_
 	if (!new_valid_dev(rdev))
 		return -EINVAL;
 
+	dquot_initialize(dir);
+
 	inode = ext2_new_inode (dir, mode);
 	err = PTR_ERR(inode);
 	if (!IS_ERR(inode)) {
@@ -151,6 +157,8 @@ static int ext2_symlink (struct inode * dir, struct dentry * dentry,
 	if (l > sb->s_blocksize)
 		goto out;
 
+	dquot_initialize(dir);
+
 	inode = ext2_new_inode (dir, S_IFLNK | S_IRWXUGO);
 	err = PTR_ERR(inode);
 	if (IS_ERR(inode))
@@ -194,6 +202,8 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir,
 	if (inode->i_nlink >= EXT2_LINK_MAX)
 		return -EMLINK;
 
+	dquot_initialize(dir);
+
 	inode->i_ctime = CURRENT_TIME_SEC;
 	inode_inc_link_count(inode);
 	atomic_inc(&inode->i_count);
@@ -216,6 +226,8 @@ static int ext2_mkdir(struct inode * dir, struct dentry * dentry, int mode)
 	if (dir->i_nlink >= EXT2_LINK_MAX)
 		goto out;
 
+	dquot_initialize(dir);
+
 	inode_inc_link_count(dir);
 
 	inode = ext2_new_inode (dir, S_IFDIR | mode);
@@ -262,6 +274,8 @@ static int ext2_unlink(struct inode * dir, struct dentry *dentry)
 	struct page * page;
 	int err = -ENOENT;
 
+	dquot_initialize(dir);
+
 	de = ext2_find_entry (dir, &dentry->d_name, &page);
 	if (!de)
 		goto out;
@@ -304,6 +318,9 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
 	struct ext2_dir_entry_2 * old_de;
 	int err = -ENOENT;
 
+	dquot_initialize(old_dir);
+	dquot_initialize(new_dir);
+
 	old_de = ext2_find_entry (old_dir, &old_dentry->d_name, &old_page);
 	if (!old_de)
 		goto out;
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index f9cb54a585ce..7ff43f4a59cd 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -26,7 +26,6 @@
 #include <linux/random.h>
 #include <linux/buffer_head.h>
 #include <linux/exportfs.h>
-#include <linux/smp_lock.h>
 #include <linux/vfs.h>
 #include <linux/seq_file.h>
 #include <linux/mount.h>
@@ -39,7 +38,7 @@
 #include "xip.h"
 
 static void ext2_sync_super(struct super_block *sb,
-			    struct ext2_super_block *es);
+			    struct ext2_super_block *es, int wait);
 static int ext2_remount (struct super_block * sb, int * flags, char * data);
 static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf);
 static int ext2_sync_fs(struct super_block *sb, int wait);
@@ -52,9 +51,11 @@ void ext2_error (struct super_block * sb, const char * function,
 	struct ext2_super_block *es = sbi->s_es;
 
 	if (!(sb->s_flags & MS_RDONLY)) {
+		spin_lock(&sbi->s_lock);
 		sbi->s_mount_state |= EXT2_ERROR_FS;
 		es->s_state |= cpu_to_le16(EXT2_ERROR_FS);
-		ext2_sync_super(sb, es);
+		spin_unlock(&sbi->s_lock);
+		ext2_sync_super(sb, es, 1);
 	}
 
 	va_start(args, fmt);
@@ -84,6 +85,9 @@ void ext2_msg(struct super_block *sb, const char *prefix,
 	va_end(args);
 }
 
+/*
+ * This must be called with sbi->s_lock held.
+ */
 void ext2_update_dynamic_rev(struct super_block *sb)
 {
 	struct ext2_super_block *es = EXT2_SB(sb)->s_es;
@@ -115,7 +119,7 @@ static void ext2_put_super (struct super_block * sb)
 	int i;
 	struct ext2_sb_info *sbi = EXT2_SB(sb);
 
-	lock_kernel();
+	dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
 
 	if (sb->s_dirt)
 		ext2_write_super(sb);
@@ -124,8 +128,10 @@ static void ext2_put_super (struct super_block * sb)
 	if (!(sb->s_flags & MS_RDONLY)) {
 		struct ext2_super_block *es = sbi->s_es;
 
+		spin_lock(&sbi->s_lock);
 		es->s_state = cpu_to_le16(sbi->s_mount_state);
-		ext2_sync_super(sb, es);
+		spin_unlock(&sbi->s_lock);
+		ext2_sync_super(sb, es, 1);
 	}
 	db_count = sbi->s_gdb_count;
 	for (i = 0; i < db_count; i++)
@@ -140,8 +146,6 @@ static void ext2_put_super (struct super_block * sb)
 	sb->s_fs_info = NULL;
 	kfree(sbi->s_blockgroup_lock);
 	kfree(sbi);
-
-	unlock_kernel();
 }
 
 static struct kmem_cache * ext2_inode_cachep;
@@ -194,6 +198,8 @@ static void destroy_inodecache(void)
 static void ext2_clear_inode(struct inode *inode)
 {
 	struct ext2_block_alloc_info *rsv = EXT2_I(inode)->i_block_alloc_info;
+
+	dquot_drop(inode);
 	ext2_discard_reservation(inode);
 	EXT2_I(inode)->i_block_alloc_info = NULL;
 	if (unlikely(rsv))
@@ -207,6 +213,7 @@ static int ext2_show_options(struct seq_file *seq, struct vfsmount *vfs)
 	struct ext2_super_block *es = sbi->s_es;
 	unsigned long def_mount_opts;
 
+	spin_lock(&sbi->s_lock);
 	def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
 
 	if (sbi->s_sb_block != 1)
@@ -279,6 +286,7 @@ static int ext2_show_options(struct seq_file *seq, struct vfsmount *vfs)
 	if (!test_opt(sb, RESERVATION))
 		seq_puts(seq, ",noreservation");
 
+	spin_unlock(&sbi->s_lock);
 	return 0;
 }
 
@@ -604,7 +612,6 @@ static int ext2_setup_super (struct super_block * sb,
 	if (!le16_to_cpu(es->s_max_mnt_count))
 		es->s_max_mnt_count = cpu_to_le16(EXT2_DFL_MAX_MNT_COUNT);
 	le16_add_cpu(&es->s_mnt_count, 1);
-	ext2_write_super(sb);
 	if (test_opt (sb, DEBUG))
 		ext2_msg(sb, KERN_INFO, "%s, %s, bs=%lu, fs=%lu, gc=%lu, "
 			"bpg=%lu, ipg=%lu, mo=%04lx]",
@@ -765,6 +772,8 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_fs_info = sbi;
 	sbi->s_sb_block = sb_block;
 
+	spin_lock_init(&sbi->s_lock);
+
 	/*
 	 * See what the current blocksize for the device is, and
 	 * use that as the blocksize.  Otherwise (or if the blocksize
@@ -1056,6 +1065,12 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_op = &ext2_sops;
 	sb->s_export_op = &ext2_export_ops;
 	sb->s_xattr = ext2_xattr_handlers;
+
+#ifdef CONFIG_QUOTA
+	sb->dq_op = &dquot_operations;
+	sb->s_qcop = &dquot_quotactl_ops;
+#endif
+
 	root = ext2_iget(sb, EXT2_ROOT_INO);
 	if (IS_ERR(root)) {
 		ret = PTR_ERR(root);
@@ -1077,7 +1092,9 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 	if (EXT2_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL))
 		ext2_msg(sb, KERN_WARNING,
 			"warning: mounting ext3 filesystem as ext2");
-	ext2_setup_super (sb, es, sb->s_flags & MS_RDONLY);
+	if (ext2_setup_super (sb, es, sb->s_flags & MS_RDONLY))
+		sb->s_flags |= MS_RDONLY;
+	ext2_write_super(sb);
 	return 0;
 
 cantfind_ext2:
@@ -1118,30 +1135,26 @@ static void ext2_clear_super_error(struct super_block *sb)
 		 * be remapped.  Nothing we can do but to retry the
 		 * write and hope for the best.
 		 */
-		printk(KERN_ERR "EXT2-fs: %s previous I/O error to "
-		       "superblock detected", sb->s_id);
+		ext2_msg(sb, KERN_ERR,
+		       "previous I/O error to superblock detected\n");
 		clear_buffer_write_io_error(sbh);
 		set_buffer_uptodate(sbh);
 	}
 }
 
-static void ext2_commit_super (struct super_block * sb,
-			       struct ext2_super_block * es)
-{
-	ext2_clear_super_error(sb);
-	es->s_wtime = cpu_to_le32(get_seconds());
-	mark_buffer_dirty(EXT2_SB(sb)->s_sbh);
-	sb->s_dirt = 0;
-}
-
-static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es)
+static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es,
+			    int wait)
 {
 	ext2_clear_super_error(sb);
+	spin_lock(&EXT2_SB(sb)->s_lock);
 	es->s_free_blocks_count = cpu_to_le32(ext2_count_free_blocks(sb));
 	es->s_free_inodes_count = cpu_to_le32(ext2_count_free_inodes(sb));
 	es->s_wtime = cpu_to_le32(get_seconds());
+	/* unlock before we do IO */
+	spin_unlock(&EXT2_SB(sb)->s_lock);
 	mark_buffer_dirty(EXT2_SB(sb)->s_sbh);
-	sync_dirty_buffer(EXT2_SB(sb)->s_sbh);
+	if (wait)
+		sync_dirty_buffer(EXT2_SB(sb)->s_sbh);
 	sb->s_dirt = 0;
 }
 
@@ -1155,43 +1168,18 @@ static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es)
  * may have been checked while mounted and e2fsck may have
  * set s_state to EXT2_VALID_FS after some corrections.
  */
-
 static int ext2_sync_fs(struct super_block *sb, int wait)
 {
+	struct ext2_sb_info *sbi = EXT2_SB(sb);
 	struct ext2_super_block *es = EXT2_SB(sb)->s_es;
-	struct buffer_head *sbh = EXT2_SB(sb)->s_sbh;
-
-	lock_kernel();
-	if (buffer_write_io_error(sbh)) {
-		/*
-		 * Oh, dear.  A previous attempt to write the
-		 * superblock failed.  This could happen because the
-		 * USB device was yanked out.  Or it could happen to
-		 * be a transient write error and maybe the block will
-		 * be remapped.  Nothing we can do but to retry the
-		 * write and hope for the best.
-		 */
-		ext2_msg(sb, KERN_ERR,
-		       "previous I/O error to superblock detected\n");
-		clear_buffer_write_io_error(sbh);
-		set_buffer_uptodate(sbh);
-	}
 
+	spin_lock(&sbi->s_lock);
 	if (es->s_state & cpu_to_le16(EXT2_VALID_FS)) {
 		ext2_debug("setting valid to 0\n");
 		es->s_state &= cpu_to_le16(~EXT2_VALID_FS);
-		es->s_free_blocks_count =
-			cpu_to_le32(ext2_count_free_blocks(sb));
-		es->s_free_inodes_count =
-			cpu_to_le32(ext2_count_free_inodes(sb));
-		es->s_mtime = cpu_to_le32(get_seconds());
-		ext2_sync_super(sb, es);
-	} else {
-		ext2_commit_super(sb, es);
 	}
-	sb->s_dirt = 0;
-	unlock_kernel();
-
+	spin_unlock(&sbi->s_lock);
+	ext2_sync_super(sb, es, wait);
 	return 0;
 }
 
@@ -1213,7 +1201,7 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
 	unsigned long old_sb_flags;
 	int err;
 
-	lock_kernel();
+	spin_lock(&sbi->s_lock);
 
 	/* Store the old options */
 	old_sb_flags = sb->s_flags;
@@ -1252,21 +1240,31 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
 		sbi->s_mount_opt |= old_mount_opt & EXT2_MOUNT_XIP;
 	}
 	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
-		unlock_kernel();
+		spin_unlock(&sbi->s_lock);
 		return 0;
 	}
 	if (*flags & MS_RDONLY) {
 		if (le16_to_cpu(es->s_state) & EXT2_VALID_FS ||
 		    !(sbi->s_mount_state & EXT2_VALID_FS)) {
-			unlock_kernel();
+			spin_unlock(&sbi->s_lock);
 			return 0;
 		}
+
 		/*
 		 * OK, we are remounting a valid rw partition rdonly, so set
 		 * the rdonly flag and then mark the partition as valid again.
 		 */
 		es->s_state = cpu_to_le16(sbi->s_mount_state);
 		es->s_mtime = cpu_to_le32(get_seconds());
+		spin_unlock(&sbi->s_lock);
+
+		err = dquot_suspend(sb, -1);
+		if (err < 0) {
+			spin_lock(&sbi->s_lock);
+			goto restore_opts;
+		}
+
+		ext2_sync_super(sb, es, 1);
 	} else {
 		__le32 ret = EXT2_HAS_RO_COMPAT_FEATURE(sb,
 					       ~EXT2_FEATURE_RO_COMPAT_SUPP);
@@ -1286,16 +1284,20 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
 		sbi->s_mount_state = le16_to_cpu(es->s_state);
 		if (!ext2_setup_super (sb, es, 0))
 			sb->s_flags &= ~MS_RDONLY;
+		spin_unlock(&sbi->s_lock);
+
+		ext2_write_super(sb);
+
+		dquot_resume(sb, -1);
 	}
-	ext2_sync_super(sb, es);
-	unlock_kernel();
+
 	return 0;
 restore_opts:
 	sbi->s_mount_opt = old_opts.s_mount_opt;
 	sbi->s_resuid = old_opts.s_resuid;
 	sbi->s_resgid = old_opts.s_resgid;
 	sb->s_flags = old_sb_flags;
-	unlock_kernel();
+	spin_unlock(&sbi->s_lock);
 	return err;
 }
 
@@ -1306,6 +1308,8 @@ static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf)
 	struct ext2_super_block *es = sbi->s_es;
 	u64 fsid;
 
+	spin_lock(&sbi->s_lock);
+
 	if (test_opt (sb, MINIX_DF))
 		sbi->s_overhead_last = 0;
 	else if (sbi->s_blocks_last != le32_to_cpu(es->s_blocks_count)) {
@@ -1360,6 +1364,7 @@ static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf)
 	       le64_to_cpup((void *)es->s_uuid + sizeof(u64));
 	buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
 	buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
+	spin_unlock(&sbi->s_lock);
 	return 0;
 }
 
diff --git a/fs/ext2/symlink.c b/fs/ext2/symlink.c
index 4e2426e22bbe..565cf817bbf1 100644
--- a/fs/ext2/symlink.c
+++ b/fs/ext2/symlink.c
@@ -32,6 +32,7 @@ const struct inode_operations ext2_symlink_inode_operations = {
 	.readlink	= generic_readlink,
 	.follow_link	= page_follow_link_light,
 	.put_link	= page_put_link,
+	.setattr	= ext2_setattr,
 #ifdef CONFIG_EXT2_FS_XATTR
 	.setxattr	= generic_setxattr,
 	.getxattr	= generic_getxattr,
@@ -43,6 +44,7 @@ const struct inode_operations ext2_symlink_inode_operations = {
 const struct inode_operations ext2_fast_symlink_inode_operations = {
 	.readlink	= generic_readlink,
 	.follow_link	= ext2_follow_link,
+	.setattr	= ext2_setattr,
 #ifdef CONFIG_EXT2_FS_XATTR
 	.setxattr	= generic_setxattr,
 	.getxattr	= generic_getxattr,
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index 904f00642f84..7c3915780b19 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -101,7 +101,7 @@ static void ext2_xattr_rehash(struct ext2_xattr_header *,
 
 static struct mb_cache *ext2_xattr_cache;
 
-static struct xattr_handler *ext2_xattr_handler_map[] = {
+static const struct xattr_handler *ext2_xattr_handler_map[] = {
 	[EXT2_XATTR_INDEX_USER]		     = &ext2_xattr_user_handler,
 #ifdef CONFIG_EXT2_FS_POSIX_ACL
 	[EXT2_XATTR_INDEX_POSIX_ACL_ACCESS]  = &ext2_xattr_acl_access_handler,
@@ -113,7 +113,7 @@ static struct xattr_handler *ext2_xattr_handler_map[] = {
 #endif
 };
 
-struct xattr_handler *ext2_xattr_handlers[] = {
+const struct xattr_handler *ext2_xattr_handlers[] = {
 	&ext2_xattr_user_handler,
 	&ext2_xattr_trusted_handler,
 #ifdef CONFIG_EXT2_FS_POSIX_ACL
@@ -126,10 +126,10 @@ struct xattr_handler *ext2_xattr_handlers[] = {
 	NULL
 };
 
-static inline struct xattr_handler *
+static inline const struct xattr_handler *
 ext2_xattr_handler(int name_index)
 {
-	struct xattr_handler *handler = NULL;
+	const struct xattr_handler *handler = NULL;
 
 	if (name_index > 0 && name_index < ARRAY_SIZE(ext2_xattr_handler_map))
 		handler = ext2_xattr_handler_map[name_index];
@@ -298,7 +298,7 @@ bad_block:	ext2_error(inode->i_sb, "ext2_xattr_list",
 	/* list the attribute names */
 	for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry);
 	     entry = EXT2_XATTR_NEXT(entry)) {
-		struct xattr_handler *handler =
+		const struct xattr_handler *handler =
 			ext2_xattr_handler(entry->e_name_index);
 
 		if (handler) {
@@ -345,7 +345,9 @@ static void ext2_xattr_update_super_block(struct super_block *sb)
 	if (EXT2_HAS_COMPAT_FEATURE(sb, EXT2_FEATURE_COMPAT_EXT_ATTR))
 		return;
 
+	spin_lock(&EXT2_SB(sb)->s_lock);
 	EXT2_SET_COMPAT_FEATURE(sb, EXT2_FEATURE_COMPAT_EXT_ATTR);
+	spin_unlock(&EXT2_SB(sb)->s_lock);
 	sb->s_dirt = 1;
 	mark_buffer_dirty(EXT2_SB(sb)->s_sbh);
 }
@@ -644,8 +646,8 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
 				   the inode.  */
 				ea_bdebug(new_bh, "reusing block");
 
-				error = -EDQUOT;
-				if (vfs_dq_alloc_block(inode, 1)) {
+				error = dquot_alloc_block(inode, 1);
+				if (error) {
 					unlock_buffer(new_bh);
 					goto cleanup;
 				}
@@ -702,7 +704,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
 		 * as if nothing happened and cleanup the unused block */
 		if (error && error != -ENOSPC) {
 			if (new_bh && new_bh != old_bh)
-				vfs_dq_free_block(inode, 1);
+				dquot_free_block(inode, 1);
 			goto cleanup;
 		}
 	} else
@@ -734,7 +736,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
 			le32_add_cpu(&HDR(old_bh)->h_refcount, -1);
 			if (ce)
 				mb_cache_entry_release(ce);
-			vfs_dq_free_block(inode, 1);
+			dquot_free_block(inode, 1);
 			mark_buffer_dirty(old_bh);
 			ea_bdebug(old_bh, "refcount now=%d",
 				le32_to_cpu(HDR(old_bh)->h_refcount));
@@ -797,7 +799,7 @@ ext2_xattr_delete_inode(struct inode *inode)
 		mark_buffer_dirty(bh);
 		if (IS_SYNC(inode))
 			sync_dirty_buffer(bh);
-		vfs_dq_free_block(inode, 1);
+		dquot_free_block(inode, 1);
 	}
 	EXT2_I(inode)->i_file_acl = 0;
 
diff --git a/fs/ext2/xattr.h b/fs/ext2/xattr.h
index bf8175b2ced9..a1a1c2184616 100644
--- a/fs/ext2/xattr.h
+++ b/fs/ext2/xattr.h
@@ -55,11 +55,11 @@ struct ext2_xattr_entry {
 
 # ifdef CONFIG_EXT2_FS_XATTR
 
-extern struct xattr_handler ext2_xattr_user_handler;
-extern struct xattr_handler ext2_xattr_trusted_handler;
-extern struct xattr_handler ext2_xattr_acl_access_handler;
-extern struct xattr_handler ext2_xattr_acl_default_handler;
-extern struct xattr_handler ext2_xattr_security_handler;
+extern const struct xattr_handler ext2_xattr_user_handler;
+extern const struct xattr_handler ext2_xattr_trusted_handler;
+extern const struct xattr_handler ext2_xattr_acl_access_handler;
+extern const struct xattr_handler ext2_xattr_acl_default_handler;
+extern const struct xattr_handler ext2_xattr_security_handler;
 
 extern ssize_t ext2_listxattr(struct dentry *, char *, size_t);
 
@@ -72,7 +72,7 @@ extern void ext2_xattr_put_super(struct super_block *);
 extern int init_ext2_xattr(void);
 extern void exit_ext2_xattr(void);
 
-extern struct xattr_handler *ext2_xattr_handlers[];
+extern const struct xattr_handler *ext2_xattr_handlers[];
 
 # else  /* CONFIG_EXT2_FS_XATTR */
 
diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c
index c8155845ac05..3004e15d5da5 100644
--- a/fs/ext2/xattr_security.c
+++ b/fs/ext2/xattr_security.c
@@ -4,6 +4,7 @@
  */
 
 #include <linux/module.h>
+#include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/fs.h>
 #include <linux/ext2_fs.h>
@@ -66,7 +67,7 @@ ext2_init_security(struct inode *inode, struct inode *dir)
 	return err;
 }
 
-struct xattr_handler ext2_xattr_security_handler = {
+const struct xattr_handler ext2_xattr_security_handler = {
 	.prefix	= XATTR_SECURITY_PREFIX,
 	.list	= ext2_xattr_security_list,
 	.get	= ext2_xattr_security_get,
diff --git a/fs/ext2/xattr_trusted.c b/fs/ext2/xattr_trusted.c
index 2a26d71f4771..667e46a8d62d 100644
--- a/fs/ext2/xattr_trusted.c
+++ b/fs/ext2/xattr_trusted.c
@@ -50,7 +50,7 @@ ext2_xattr_trusted_set(struct dentry *dentry, const char *name,
 			      value, size, flags);
 }
 
-struct xattr_handler ext2_xattr_trusted_handler = {
+const struct xattr_handler ext2_xattr_trusted_handler = {
 	.prefix	= XATTR_TRUSTED_PREFIX,
 	.list	= ext2_xattr_trusted_list,
 	.get	= ext2_xattr_trusted_get,
diff --git a/fs/ext2/xattr_user.c b/fs/ext2/xattr_user.c
index 3f6caf3684b4..099d20f47163 100644
--- a/fs/ext2/xattr_user.c
+++ b/fs/ext2/xattr_user.c
@@ -54,7 +54,7 @@ ext2_xattr_user_set(struct dentry *dentry, const char *name,
 			      name, value, size, flags);
 }
 
-struct xattr_handler ext2_xattr_user_handler = {
+const struct xattr_handler ext2_xattr_user_handler = {
 	.prefix	= XATTR_USER_PREFIX,
 	.list	= ext2_xattr_user_list,
 	.get	= ext2_xattr_user_get,
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index 82ba34158661..8a11fe212183 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -205,6 +205,7 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type,
 					return error;
 				else {
 					inode->i_mode = mode;
+					inode->i_ctime = CURRENT_TIME_SEC;
 					ext3_mark_inode_dirty(handle, inode);
 					if (error == 0)
 						acl = NULL;
@@ -456,7 +457,7 @@ release_and_out:
 	return error;
 }
 
-struct xattr_handler ext3_xattr_acl_access_handler = {
+const struct xattr_handler ext3_xattr_acl_access_handler = {
 	.prefix	= POSIX_ACL_XATTR_ACCESS,
 	.flags	= ACL_TYPE_ACCESS,
 	.list	= ext3_xattr_list_acl_access,
@@ -464,7 +465,7 @@ struct xattr_handler ext3_xattr_acl_access_handler = {
 	.set	= ext3_xattr_set_acl,
 };
 
-struct xattr_handler ext3_xattr_acl_default_handler = {
+const struct xattr_handler ext3_xattr_acl_default_handler = {
 	.prefix	= POSIX_ACL_XATTR_DEFAULT,
 	.flags	= ACL_TYPE_DEFAULT,
 	.list	= ext3_xattr_list_acl_default,
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 27967f92e820..4a32511f4ded 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -14,6 +14,7 @@
 #include <linux/time.h>
 #include <linux/capability.h>
 #include <linux/fs.h>
+#include <linux/slab.h>
 #include <linux/jbd.h>
 #include <linux/ext3_fs.h>
 #include <linux/ext3_jbd.h>
@@ -676,7 +677,7 @@ void ext3_free_blocks(handle_t *handle, struct inode *inode,
 	}
 	ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks);
 	if (dquot_freed_blocks)
-		vfs_dq_free_block(inode, dquot_freed_blocks);
+		dquot_free_block(inode, dquot_freed_blocks);
 	return;
 }
 
@@ -1502,8 +1503,9 @@ ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode,
 	/*
 	 * Check quota for allocation of this block.
 	 */
-	if (vfs_dq_alloc_block(inode, num)) {
-		*errp = -EDQUOT;
+	err = dquot_alloc_block(inode, num);
+	if (err) {
+		*errp = err;
 		return 0;
 	}
 
@@ -1582,6 +1584,12 @@ retry_alloc:
 			goto io_error;
 		free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
 		/*
+		 * skip this group (and avoid loading bitmap) if there
+		 * are no free blocks
+		 */
+		if (!free_blocks)
+			continue;
+		/*
 		 * skip this group if the number of
 		 * free blocks is less than half of the reservation
 		 * window size.
@@ -1713,7 +1721,7 @@ allocated:
 
 	*errp = 0;
 	brelse(bitmap_bh);
-	vfs_dq_free_block(inode, *count-num);
+	dquot_free_block(inode, *count-num);
 	*count = num;
 	return ret_block;
 
@@ -1728,7 +1736,7 @@ out:
 	 * Undo the block allocation
 	 */
 	if (!performed_allocation)
-		vfs_dq_free_block(inode, *count);
+		dquot_free_block(inode, *count);
 	brelse(bitmap_bh);
 	return 0;
 }
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index 373fa90c796a..e2e72c367cf6 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -297,7 +297,7 @@ static void free_rb_tree_fname(struct rb_root *root)
 			kfree (old);
 		}
 		if (!parent)
-			root->rb_node = NULL;
+			*root = RB_ROOT;
 		else if (parent->rb_left == n)
 			parent->rb_left = NULL;
 		else if (parent->rb_right == n)
diff --git a/fs/ext3/file.c b/fs/ext3/file.c
index 388bbdfa0b4e..f55df0e61cbd 100644
--- a/fs/ext3/file.c
+++ b/fs/ext3/file.c
@@ -21,6 +21,7 @@
 #include <linux/time.h>
 #include <linux/fs.h>
 #include <linux/jbd.h>
+#include <linux/quotaops.h>
 #include <linux/ext3_fs.h>
 #include <linux/ext3_jbd.h>
 #include "xattr.h"
@@ -33,9 +34,9 @@
  */
 static int ext3_release_file (struct inode * inode, struct file * filp)
 {
-	if (EXT3_I(inode)->i_state & EXT3_STATE_FLUSH_ON_CLOSE) {
+	if (ext3_test_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE)) {
 		filemap_flush(inode->i_mapping);
-		EXT3_I(inode)->i_state &= ~EXT3_STATE_FLUSH_ON_CLOSE;
+		ext3_clear_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE);
 	}
 	/* if we are the last writer on the inode, drop the block reservation */
 	if ((filp->f_mode & FMODE_WRITE) &&
@@ -62,7 +63,7 @@ const struct file_operations ext3_file_operations = {
 	.compat_ioctl	= ext3_compat_ioctl,
 #endif
 	.mmap		= generic_file_mmap,
-	.open		= generic_file_open,
+	.open		= dquot_file_open,
 	.release	= ext3_release_file,
 	.fsync		= ext3_sync_file,
 	.splice_read	= generic_file_splice_read,
diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c
index 8209f266e9ad..d7e9f74dc3a6 100644
--- a/fs/ext3/fsync.c
+++ b/fs/ext3/fsync.c
@@ -43,12 +43,12 @@
  * inode to disk.
  */
 
-int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync)
+int ext3_sync_file(struct file *file, int datasync)
 {
-	struct inode *inode = dentry->d_inode;
+	struct inode *inode = file->f_mapping->host;
 	struct ext3_inode_info *ei = EXT3_I(inode);
 	journal_t *journal = EXT3_SB(inode->i_sb)->s_journal;
-	int ret = 0;
+	int ret, needs_barrier = 0;
 	tid_t commit_tid;
 
 	if (inode->i_sb->s_flags & MS_RDONLY)
@@ -70,28 +70,27 @@ int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync)
 	 *  (they were dirtied by commit).  But that's OK - the blocks are
 	 *  safe in-journal, which is all fsync() needs to ensure.
 	 */
-	if (ext3_should_journal_data(inode)) {
-		ret = ext3_force_commit(inode->i_sb);
-		goto out;
-	}
+	if (ext3_should_journal_data(inode))
+		return ext3_force_commit(inode->i_sb);
 
 	if (datasync)
 		commit_tid = atomic_read(&ei->i_datasync_tid);
 	else
 		commit_tid = atomic_read(&ei->i_sync_tid);
 
-	if (log_start_commit(journal, commit_tid)) {
-		log_wait_commit(journal, commit_tid);
-		goto out;
-	}
+	if (test_opt(inode->i_sb, BARRIER) &&
+	    !journal_trans_will_send_data_barrier(journal, commit_tid))
+		needs_barrier = 1;
+	log_start_commit(journal, commit_tid);
+	ret = log_wait_commit(journal, commit_tid);
 
 	/*
 	 * In case we didn't commit a transaction, we have to flush
 	 * disk caches manually so that data really is on persistent
 	 * storage
 	 */
-	if (test_opt(inode->i_sb, BARRIER))
-		blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
-out:
+	if (needs_barrier)
+		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
+				BLKDEV_IFL_WAIT);
 	return ret;
 }
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index b39991285136..498021eb88fb 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -123,10 +123,10 @@ void ext3_free_inode (handle_t *handle, struct inode * inode)
 	 * Note: we must free any quota before locking the superblock,
 	 * as writing the quota to disk may need the lock as well.
 	 */
-	vfs_dq_init(inode);
+	dquot_initialize(inode);
 	ext3_xattr_delete_inode(handle, inode);
-	vfs_dq_free_inode(inode);
-	vfs_dq_drop(inode);
+	dquot_free_inode(inode);
+	dquot_drop(inode);
 
 	is_directory = S_ISDIR(inode->i_mode);
 
@@ -538,16 +538,13 @@ got:
 	if (S_ISDIR(mode))
 		percpu_counter_inc(&sbi->s_dirs_counter);
 
-	inode->i_uid = current_fsuid();
-	if (test_opt (sb, GRPID))
-		inode->i_gid = dir->i_gid;
-	else if (dir->i_mode & S_ISGID) {
+
+	if (test_opt(sb, GRPID)) {
+		inode->i_mode = mode;
+		inode->i_uid = current_fsuid();
 		inode->i_gid = dir->i_gid;
-		if (S_ISDIR(mode))
-			mode |= S_ISGID;
 	} else
-		inode->i_gid = current_fsgid();
-	inode->i_mode = mode;
+		inode_init_owner(inode, dir, mode);
 
 	inode->i_ino = ino;
 	/* This is the optimal IO size (for stat), not the fs block size */
@@ -582,16 +579,18 @@ got:
 	inode->i_generation = sbi->s_next_generation++;
 	spin_unlock(&sbi->s_next_gen_lock);
 
-	ei->i_state = EXT3_STATE_NEW;
+	ei->i_state_flags = 0;
+	ext3_set_inode_state(inode, EXT3_STATE_NEW);
+
 	ei->i_extra_isize =
 		(EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) ?
 		sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE : 0;
 
 	ret = inode;
-	if (vfs_dq_alloc_inode(inode)) {
-		err = -EDQUOT;
+	dquot_initialize(inode);
+	err = dquot_alloc_inode(inode);
+	if (err)
 		goto fail_drop;
-	}
 
 	err = ext3_init_acl(handle, inode, dir);
 	if (err)
@@ -619,10 +618,10 @@ really_out:
 	return ret;
 
 fail_free_drop:
-	vfs_dq_free_inode(inode);
+	dquot_free_inode(inode);
 
 fail_drop:
-	vfs_dq_drop(inode);
+	dquot_drop(inode);
 	inode->i_flags |= S_NOQUOTA;
 	inode->i_nlink = 0;
 	unlock_new_inode(inode);
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 455e6e6e5cb9..735f0190ec2a 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -196,6 +196,9 @@ void ext3_delete_inode (struct inode * inode)
 {
 	handle_t *handle;
 
+	if (!is_bad_inode(inode))
+		dquot_initialize(inode);
+
 	truncate_inode_pages(&inode->i_data, 0);
 
 	if (is_bad_inode(inode))
@@ -1378,7 +1381,7 @@ static int ext3_journalled_write_end(struct file *file,
 	 */
 	if (pos + len > inode->i_size && ext3_can_truncate(inode))
 		ext3_orphan_add(handle, inode);
-	EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
+	ext3_set_inode_state(inode, EXT3_STATE_JDATA);
 	if (inode->i_size > EXT3_I(inode)->i_disksize) {
 		EXT3_I(inode)->i_disksize = inode->i_size;
 		ret2 = ext3_mark_inode_dirty(handle, inode);
@@ -1417,7 +1420,7 @@ static sector_t ext3_bmap(struct address_space *mapping, sector_t block)
 	journal_t *journal;
 	int err;
 
-	if (EXT3_I(inode)->i_state & EXT3_STATE_JDATA) {
+	if (ext3_test_inode_state(inode, EXT3_STATE_JDATA)) {
 		/*
 		 * This is a REALLY heavyweight approach, but the use of
 		 * bmap on dirty files is expected to be extremely rare:
@@ -1436,7 +1439,7 @@ static sector_t ext3_bmap(struct address_space *mapping, sector_t block)
 		 * everything they get.
 		 */
 
-		EXT3_I(inode)->i_state &= ~EXT3_STATE_JDATA;
+		ext3_clear_inode_state(inode, EXT3_STATE_JDATA);
 		journal = EXT3_JOURNAL(inode);
 		journal_lock_updates(journal);
 		err = journal_flush(journal);
@@ -1528,6 +1531,7 @@ static int ext3_ordered_writepage(struct page *page,
 	int err;
 
 	J_ASSERT(PageLocked(page));
+	WARN_ON_ONCE(IS_RDONLY(inode));
 
 	/*
 	 * We give up here if we're reentered, because it might be for a
@@ -1600,6 +1604,9 @@ static int ext3_writeback_writepage(struct page *page,
 	int ret = 0;
 	int err;
 
+	J_ASSERT(PageLocked(page));
+	WARN_ON_ONCE(IS_RDONLY(inode));
+
 	if (ext3_journal_current_handle())
 		goto out_fail;
 
@@ -1642,6 +1649,9 @@ static int ext3_journalled_writepage(struct page *page,
 	int ret = 0;
 	int err;
 
+	J_ASSERT(PageLocked(page));
+	WARN_ON_ONCE(IS_RDONLY(inode));
+
 	if (ext3_journal_current_handle())
 		goto no_write;
 
@@ -1670,7 +1680,7 @@ static int ext3_journalled_writepage(struct page *page,
 				PAGE_CACHE_SIZE, NULL, write_end_fn);
 		if (ret == 0)
 			ret = err;
-		EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
+		ext3_set_inode_state(inode, EXT3_STATE_JDATA);
 		unlock_page(page);
 	} else {
 		/*
@@ -1785,8 +1795,9 @@ retry:
 		handle = ext3_journal_start(inode, 2);
 		if (IS_ERR(handle)) {
 			/* This is really bad luck. We've written the data
-			 * but cannot extend i_size. Bail out and pretend
-			 * the write failed... */
+			 * but cannot extend i_size. Truncate allocated blocks
+			 * and pretend the write failed... */
+			ext3_truncate(inode);
 			ret = PTR_ERR(handle);
 			goto out;
 		}
@@ -2402,7 +2413,7 @@ void ext3_truncate(struct inode *inode)
 		goto out_notrans;
 
 	if (inode->i_size == 0 && ext3_should_writeback_data(inode))
-		ei->i_state |= EXT3_STATE_FLUSH_ON_CLOSE;
+		ext3_set_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE);
 
 	/*
 	 * We have to lock the EOF page here, because lock_page() nests
@@ -2721,7 +2732,7 @@ int ext3_get_inode_loc(struct inode *inode, struct ext3_iloc *iloc)
 {
 	/* We have all inode data except xattrs in memory here. */
 	return __ext3_get_inode_loc(inode, iloc,
-		!(EXT3_I(inode)->i_state & EXT3_STATE_XATTR));
+		!ext3_test_inode_state(inode, EXT3_STATE_XATTR));
 }
 
 void ext3_set_inode_flags(struct inode *inode)
@@ -2800,7 +2811,7 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino)
 	inode->i_mtime.tv_sec = (signed)le32_to_cpu(raw_inode->i_mtime);
 	inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0;
 
-	ei->i_state = 0;
+	ei->i_state_flags = 0;
 	ei->i_dir_start_lookup = 0;
 	ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
 	/* We now have enough fields to check if the inode was active or not.
@@ -2893,7 +2904,7 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino)
 					EXT3_GOOD_OLD_INODE_SIZE +
 					ei->i_extra_isize;
 			if (*magic == cpu_to_le32(EXT3_XATTR_MAGIC))
-				 ei->i_state |= EXT3_STATE_XATTR;
+				 ext3_set_inode_state(inode, EXT3_STATE_XATTR);
 		}
 	} else
 		ei->i_extra_isize = 0;
@@ -2955,7 +2966,7 @@ again:
 
 	/* For fields not not tracking in the in-memory inode,
 	 * initialise them to zero for new inodes. */
-	if (ei->i_state & EXT3_STATE_NEW)
+	if (ext3_test_inode_state(inode, EXT3_STATE_NEW))
 		memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size);
 
 	ext3_get_inode_flags(ei);
@@ -3052,7 +3063,7 @@ again:
 	rc = ext3_journal_dirty_metadata(handle, bh);
 	if (!err)
 		err = rc;
-	ei->i_state &= ~EXT3_STATE_NEW;
+	ext3_clear_inode_state(inode, EXT3_STATE_NEW);
 
 	atomic_set(&ei->i_sync_tid, handle->h_transaction->t_tid);
 out_brelse:
@@ -3096,7 +3107,7 @@ out_brelse:
  * `stuff()' is running, and the new i_size will be lost.  Plus the inode
  * will no longer be on the superblock's dirty inode list.
  */
-int ext3_write_inode(struct inode *inode, int wait)
+int ext3_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
 	if (current->flags & PF_MEMALLOC)
 		return 0;
@@ -3107,7 +3118,7 @@ int ext3_write_inode(struct inode *inode, int wait)
 		return -EIO;
 	}
 
-	if (!wait)
+	if (wbc->sync_mode != WB_SYNC_ALL)
 		return 0;
 
 	return ext3_force_commit(inode->i_sb);
@@ -3140,6 +3151,8 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr)
 	if (error)
 		return error;
 
+	if (is_quota_modification(inode, attr))
+		dquot_initialize(inode);
 	if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
 		(ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
 		handle_t *handle;
@@ -3152,7 +3165,7 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr)
 			error = PTR_ERR(handle);
 			goto err_out;
 		}
-		error = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0;
+		error = dquot_transfer(inode, attr);
 		if (error) {
 			ext3_journal_stop(handle);
 			return error;
@@ -3237,7 +3250,7 @@ static int ext3_writepage_trans_blocks(struct inode *inode)
 		ret = 2 * (bpp + indirects) + 2;
 
 #ifdef CONFIG_QUOTA
-	/* We know that structure was already allocated during vfs_dq_init so
+	/* We know that structure was already allocated during dquot_initialize so
 	 * we will be updating only the data blocks + inodes */
 	ret += EXT3_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
 #endif
@@ -3328,7 +3341,7 @@ int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode)
  * i_size has been changed by generic_commit_write() and we thus need
  * to include the updated inode in the current transaction.
  *
- * Also, vfs_dq_alloc_space() will always dirty the inode when blocks
+ * Also, dquot_alloc_space() will always dirty the inode when blocks
  * are allocated to the file.
  *
  * If the inode is marked synchronous, we don't honour that here - doing
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 7b0e44f7d66f..ee184084ca42 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -1696,6 +1696,8 @@ static int ext3_create (struct inode * dir, struct dentry * dentry, int mode,
 	struct inode * inode;
 	int err, retries = 0;
 
+	dquot_initialize(dir);
+
 retry:
 	handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
 					EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
@@ -1730,6 +1732,8 @@ static int ext3_mknod (struct inode * dir, struct dentry *dentry,
 	if (!new_valid_dev(rdev))
 		return -EINVAL;
 
+	dquot_initialize(dir);
+
 retry:
 	handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
 					EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
@@ -1766,6 +1770,8 @@ static int ext3_mkdir(struct inode * dir, struct dentry * dentry, int mode)
 	if (dir->i_nlink >= EXT3_LINK_MAX)
 		return -EMLINK;
 
+	dquot_initialize(dir);
+
 retry:
 	handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
 					EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
@@ -2060,7 +2066,9 @@ static int ext3_rmdir (struct inode * dir, struct dentry *dentry)
 
 	/* Initialize quotas before so that eventual writes go in
 	 * separate transaction */
-	vfs_dq_init(dentry->d_inode);
+	dquot_initialize(dir);
+	dquot_initialize(dentry->d_inode);
+
 	handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb));
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
@@ -2119,7 +2127,9 @@ static int ext3_unlink(struct inode * dir, struct dentry *dentry)
 
 	/* Initialize quotas before so that eventual writes go
 	 * in separate transaction */
-	vfs_dq_init(dentry->d_inode);
+	dquot_initialize(dir);
+	dquot_initialize(dentry->d_inode);
+
 	handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb));
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
@@ -2174,6 +2184,8 @@ static int ext3_symlink (struct inode * dir,
 	if (l > dir->i_sb->s_blocksize)
 		return -ENAMETOOLONG;
 
+	dquot_initialize(dir);
+
 retry:
 	handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
 					EXT3_INDEX_EXTRA_TRANS_BLOCKS + 5 +
@@ -2228,6 +2240,9 @@ static int ext3_link (struct dentry * old_dentry,
 
 	if (inode->i_nlink >= EXT3_LINK_MAX)
 		return -EMLINK;
+
+	dquot_initialize(dir);
+
 	/*
 	 * Return -ENOENT if we've raced with unlink and i_nlink is 0.  Doing
 	 * otherwise has the potential to corrupt the orphan inode list.
@@ -2278,12 +2293,15 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
 	struct ext3_dir_entry_2 * old_de, * new_de;
 	int retval, flush_file = 0;
 
+	dquot_initialize(old_dir);
+	dquot_initialize(new_dir);
+
 	old_bh = new_bh = dir_bh = NULL;
 
 	/* Initialize quotas before so that eventual writes go
 	 * in separate transaction */
 	if (new_dentry->d_inode)
-		vfs_dq_init(new_dentry->d_inode);
+		dquot_initialize(new_dentry->d_inode);
 	handle = ext3_journal_start(old_dir, 2 *
 					EXT3_DATA_TRANS_BLOCKS(old_dir->i_sb) +
 					EXT3_INDEX_EXTRA_TRANS_BLOCKS + 2);
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index afa2b569da10..6c953bb255e7 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -164,7 +164,7 @@ void ext3_msg(struct super_block *sb, const char *prefix,
  * write out the superblock safely.
  *
  * We'll just use the journal_abort() error code to record an error in
- * the journal instead.  On recovery, the journal will compain about
+ * the journal instead.  On recovery, the journal will complain about
  * that error until we've noted it down and cleared it.
  */
 
@@ -181,7 +181,7 @@ static void ext3_handle_error(struct super_block *sb)
 	if (!test_opt (sb, ERRORS_CONT)) {
 		journal_t *journal = EXT3_SB(sb)->s_journal;
 
-		EXT3_SB(sb)->s_mount_opt |= EXT3_MOUNT_ABORT;
+		set_opt(EXT3_SB(sb)->s_mount_opt, ABORT);
 		if (journal)
 			journal_abort(journal, -EIO);
 	}
@@ -296,7 +296,7 @@ void ext3_abort (struct super_block * sb, const char * function,
 		"error: remounting filesystem read-only");
 	EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS;
 	sb->s_flags |= MS_RDONLY;
-	EXT3_SB(sb)->s_mount_opt |= EXT3_MOUNT_ABORT;
+	set_opt(EXT3_SB(sb)->s_mount_opt, ABORT);
 	if (EXT3_SB(sb)->s_journal)
 		journal_abort(EXT3_SB(sb)->s_journal, -EIO);
 }
@@ -410,6 +410,8 @@ static void ext3_put_super (struct super_block * sb)
 	struct ext3_super_block *es = sbi->s_es;
 	int i, err;
 
+	dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
+
 	lock_kernel();
 
 	ext3_xattr_put_super(sb);
@@ -528,6 +530,8 @@ static void destroy_inodecache(void)
 static void ext3_clear_inode(struct inode *inode)
 {
 	struct ext3_block_alloc_info *rsv = EXT3_I(inode)->i_block_alloc_info;
+
+	dquot_drop(inode);
 	ext3_discard_reservation(inode);
 	EXT3_I(inode)->i_block_alloc_info = NULL;
 	if (unlikely(rsv))
@@ -562,10 +566,10 @@ static inline void ext3_show_quota_options(struct seq_file *seq, struct super_bl
 	if (sbi->s_qf_names[GRPQUOTA])
 		seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]);
 
-	if (sbi->s_mount_opt & EXT3_MOUNT_USRQUOTA)
+	if (test_opt(sb, USRQUOTA))
 		seq_puts(seq, ",usrquota");
 
-	if (sbi->s_mount_opt & EXT3_MOUNT_GRPQUOTA)
+	if (test_opt(sb, GRPQUOTA))
 		seq_puts(seq, ",grpquota");
 #endif
 }
@@ -651,13 +655,16 @@ static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs)
 		seq_printf(seq, ",commit=%u",
 			   (unsigned) (sbi->s_commit_interval / HZ));
 	}
-	if (test_opt(sb, BARRIER))
-		seq_puts(seq, ",barrier=1");
+
+	/*
+	 * Always display barrier state so it's clear what the status is.
+	 */
+	seq_puts(seq, ",barrier=");
+	seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0");
 	if (test_opt(sb, NOBH))
 		seq_puts(seq, ",nobh");
 
-	seq_printf(seq, ",data=%s", data_mode_string(sbi->s_mount_opt &
-						     EXT3_MOUNT_DATA_FLAGS));
+	seq_printf(seq, ",data=%s", data_mode_string(test_opt(sb, DATA_FLAGS)));
 	if (test_opt(sb, DATA_ERR_ABORT))
 		seq_puts(seq, ",data_err=abort");
 
@@ -743,7 +750,7 @@ static int ext3_release_dquot(struct dquot *dquot);
 static int ext3_mark_dquot_dirty(struct dquot *dquot);
 static int ext3_write_info(struct super_block *sb, int type);
 static int ext3_quota_on(struct super_block *sb, int type, int format_id,
-				char *path, int remount);
+				char *path);
 static int ext3_quota_on_mount(struct super_block *sb, int type);
 static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data,
 			       size_t len, loff_t off);
@@ -751,13 +758,6 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type,
 				const char *data, size_t len, loff_t off);
 
 static const struct dquot_operations ext3_quota_operations = {
-	.initialize	= dquot_initialize,
-	.drop		= dquot_drop,
-	.alloc_space	= dquot_alloc_space,
-	.alloc_inode	= dquot_alloc_inode,
-	.free_space	= dquot_free_space,
-	.free_inode	= dquot_free_inode,
-	.transfer	= dquot_transfer,
 	.write_dquot	= ext3_write_dquot,
 	.acquire_dquot	= ext3_acquire_dquot,
 	.release_dquot	= ext3_release_dquot,
@@ -769,12 +769,12 @@ static const struct dquot_operations ext3_quota_operations = {
 
 static const struct quotactl_ops ext3_qctl_operations = {
 	.quota_on	= ext3_quota_on,
-	.quota_off	= vfs_quota_off,
-	.quota_sync	= vfs_quota_sync,
-	.get_info	= vfs_get_dqinfo,
-	.set_info	= vfs_set_dqinfo,
-	.get_dqblk	= vfs_get_dqblk,
-	.set_dqblk	= vfs_set_dqblk
+	.quota_off	= dquot_quota_off,
+	.quota_sync	= dquot_quota_sync,
+	.get_info	= dquot_get_dqinfo,
+	.set_info	= dquot_set_dqinfo,
+	.get_dqblk	= dquot_get_dqblk,
+	.set_dqblk	= dquot_set_dqblk
 };
 #endif
 
@@ -816,8 +816,8 @@ enum {
 	Opt_data_err_abort, Opt_data_err_ignore,
 	Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
 	Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
-	Opt_noquota, Opt_ignore, Opt_barrier, Opt_err, Opt_resize,
-	Opt_usrquota, Opt_grpquota
+	Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err,
+	Opt_resize, Opt_usrquota, Opt_grpquota
 };
 
 static const match_table_t tokens = {
@@ -871,6 +871,8 @@ static const match_table_t tokens = {
 	{Opt_quota, "quota"},
 	{Opt_usrquota, "usrquota"},
 	{Opt_barrier, "barrier=%u"},
+	{Opt_barrier, "barrier"},
+	{Opt_nobarrier, "nobarrier"},
 	{Opt_resize, "resize"},
 	{Opt_err, NULL},
 };
@@ -896,6 +898,63 @@ static ext3_fsblk_t get_sb_block(void **data, struct super_block *sb)
 	return sb_block;
 }
 
+#ifdef CONFIG_QUOTA
+static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
+{
+	struct ext3_sb_info *sbi = EXT3_SB(sb);
+	char *qname;
+
+	if (sb_any_quota_loaded(sb) &&
+		!sbi->s_qf_names[qtype]) {
+		ext3_msg(sb, KERN_ERR,
+			"Cannot change journaled "
+			"quota options when quota turned on");
+		return 0;
+	}
+	qname = match_strdup(args);
+	if (!qname) {
+		ext3_msg(sb, KERN_ERR,
+			"Not enough memory for storing quotafile name");
+		return 0;
+	}
+	if (sbi->s_qf_names[qtype] &&
+		strcmp(sbi->s_qf_names[qtype], qname)) {
+		ext3_msg(sb, KERN_ERR,
+			"%s quota file already specified", QTYPE2NAME(qtype));
+		kfree(qname);
+		return 0;
+	}
+	sbi->s_qf_names[qtype] = qname;
+	if (strchr(sbi->s_qf_names[qtype], '/')) {
+		ext3_msg(sb, KERN_ERR,
+			"quotafile must be on filesystem root");
+		kfree(sbi->s_qf_names[qtype]);
+		sbi->s_qf_names[qtype] = NULL;
+		return 0;
+	}
+	set_opt(sbi->s_mount_opt, QUOTA);
+	return 1;
+}
+
+static int clear_qf_name(struct super_block *sb, int qtype) {
+
+	struct ext3_sb_info *sbi = EXT3_SB(sb);
+
+	if (sb_any_quota_loaded(sb) &&
+		sbi->s_qf_names[qtype]) {
+		ext3_msg(sb, KERN_ERR, "Cannot change journaled quota options"
+			" when quota turned on");
+		return 0;
+	}
+	/*
+	 * The space will be released later when all options are confirmed
+	 * to be correct
+	 */
+	sbi->s_qf_names[qtype] = NULL;
+	return 1;
+}
+#endif
+
 static int parse_options (char *options, struct super_block *sb,
 			  unsigned int *inum, unsigned long *journal_devnum,
 			  ext3_fsblk_t *n_blocks_count, int is_remount)
@@ -906,8 +965,7 @@ static int parse_options (char *options, struct super_block *sb,
 	int data_opt = 0;
 	int option;
 #ifdef CONFIG_QUOTA
-	int qtype, qfmt;
-	char *qname;
+	int qfmt;
 #endif
 
 	if (!options)
@@ -917,7 +975,11 @@ static int parse_options (char *options, struct super_block *sb,
 		int token;
 		if (!*p)
 			continue;
-
+		/*
+		 * Initialize args struct so we know whether arg was
+		 * found; some options take optional arguments.
+		 */
+		args[0].to = args[0].from = 0;
 		token = match_token(p, tokens, args);
 		switch (token) {
 		case Opt_bsd_df:
@@ -1065,20 +1127,19 @@ static int parse_options (char *options, struct super_block *sb,
 			data_opt = EXT3_MOUNT_WRITEBACK_DATA;
 		datacheck:
 			if (is_remount) {
-				if ((sbi->s_mount_opt & EXT3_MOUNT_DATA_FLAGS)
-						== data_opt)
+				if (test_opt(sb, DATA_FLAGS) == data_opt)
 					break;
 				ext3_msg(sb, KERN_ERR,
 					"error: cannot change "
 					"data mode on remount. The filesystem "
 					"is mounted in data=%s mode and you "
 					"try to remount it in data=%s mode.",
-					data_mode_string(sbi->s_mount_opt &
-							EXT3_MOUNT_DATA_FLAGS),
+					data_mode_string(test_opt(sb,
+							DATA_FLAGS)),
 					data_mode_string(data_opt));
 				return 0;
 			} else {
-				sbi->s_mount_opt &= ~EXT3_MOUNT_DATA_FLAGS;
+				clear_opt(sbi->s_mount_opt, DATA_FLAGS);
 				sbi->s_mount_opt |= data_opt;
 			}
 			break;
@@ -1090,62 +1151,20 @@ static int parse_options (char *options, struct super_block *sb,
 			break;
 #ifdef CONFIG_QUOTA
 		case Opt_usrjquota:
-			qtype = USRQUOTA;
-			goto set_qf_name;
-		case Opt_grpjquota:
-			qtype = GRPQUOTA;
-set_qf_name:
-			if (sb_any_quota_loaded(sb) &&
-			    !sbi->s_qf_names[qtype]) {
-				ext3_msg(sb, KERN_ERR,
-					"error: cannot change journaled "
-					"quota options when quota turned on.");
-				return 0;
-			}
-			qname = match_strdup(&args[0]);
-			if (!qname) {
-				ext3_msg(sb, KERN_ERR,
-					"error: not enough memory for "
-					"storing quotafile name.");
-				return 0;
-			}
-			if (sbi->s_qf_names[qtype] &&
-			    strcmp(sbi->s_qf_names[qtype], qname)) {
-				ext3_msg(sb, KERN_ERR,
-					"error: %s quota file already "
-					"specified.", QTYPE2NAME(qtype));
-				kfree(qname);
+			if (!set_qf_name(sb, USRQUOTA, &args[0]))
 				return 0;
-			}
-			sbi->s_qf_names[qtype] = qname;
-			if (strchr(sbi->s_qf_names[qtype], '/')) {
-				ext3_msg(sb, KERN_ERR,
-					"error: quotafile must be on "
-					"filesystem root.");
-				kfree(sbi->s_qf_names[qtype]);
-				sbi->s_qf_names[qtype] = NULL;
+			break;
+		case Opt_grpjquota:
+			if (!set_qf_name(sb, GRPQUOTA, &args[0]))
 				return 0;
-			}
-			set_opt(sbi->s_mount_opt, QUOTA);
 			break;
 		case Opt_offusrjquota:
-			qtype = USRQUOTA;
-			goto clear_qf_name;
+			if (!clear_qf_name(sb, USRQUOTA))
+				return 0;
+			break;
 		case Opt_offgrpjquota:
-			qtype = GRPQUOTA;
-clear_qf_name:
-			if (sb_any_quota_loaded(sb) &&
-			    sbi->s_qf_names[qtype]) {
-				ext3_msg(sb, KERN_ERR, "error: cannot change "
-					"journaled quota options when "
-					"quota turned on.");
+			if (!clear_qf_name(sb, GRPQUOTA))
 				return 0;
-			}
-			/*
-			 * The space will be released later when all options
-			 * are confirmed to be correct
-			 */
-			sbi->s_qf_names[qtype] = NULL;
 			break;
 		case Opt_jqfmt_vfsold:
 			qfmt = QFMT_VFS_OLD;
@@ -1208,9 +1227,15 @@ set_qf_format:
 		case Opt_abort:
 			set_opt(sbi->s_mount_opt, ABORT);
 			break;
+		case Opt_nobarrier:
+			clear_opt(sbi->s_mount_opt, BARRIER);
+			break;
 		case Opt_barrier:
-			if (match_int(&args[0], &option))
-				return 0;
+			if (args[0].from) {
+				if (match_int(&args[0], &option))
+					return 0;
+			} else
+				option = 1;	/* No argument, default to 1 */
 			if (option)
 				set_opt(sbi->s_mount_opt, BARRIER);
 			else
@@ -1244,18 +1269,12 @@ set_qf_format:
 	}
 #ifdef CONFIG_QUOTA
 	if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
-		if ((sbi->s_mount_opt & EXT3_MOUNT_USRQUOTA) &&
-		     sbi->s_qf_names[USRQUOTA])
+		if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA])
 			clear_opt(sbi->s_mount_opt, USRQUOTA);
-
-		if ((sbi->s_mount_opt & EXT3_MOUNT_GRPQUOTA) &&
-		     sbi->s_qf_names[GRPQUOTA])
+		if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA])
 			clear_opt(sbi->s_mount_opt, GRPQUOTA);
 
-		if ((sbi->s_qf_names[USRQUOTA] &&
-				(sbi->s_mount_opt & EXT3_MOUNT_GRPQUOTA)) ||
-		    (sbi->s_qf_names[GRPQUOTA] &&
-				(sbi->s_mount_opt & EXT3_MOUNT_USRQUOTA))) {
+		if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) {
 			ext3_msg(sb, KERN_ERR, "error: old and new quota "
 					"format mixing.");
 			return 0;
@@ -1478,7 +1497,7 @@ static void ext3_orphan_cleanup (struct super_block * sb,
 		}
 
 		list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan);
-		vfs_dq_init(inode);
+		dquot_initialize(inode);
 		if (inode->i_nlink) {
 			printk(KERN_DEBUG
 				"%s: truncating inode %lu to %Ld bytes\n",
@@ -1510,7 +1529,7 @@ static void ext3_orphan_cleanup (struct super_block * sb,
 	/* Turn quotas off */
 	for (i = 0; i < MAXQUOTAS; i++) {
 		if (sb_dqopt(sb)->files[i])
-			vfs_quota_off(sb, i, 0);
+			dquot_quota_off(sb, i);
 	}
 #endif
 	sb->s_flags = s_flags; /* Restore MS_RDONLY status */
@@ -1671,11 +1690,11 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 		set_opt(sbi->s_mount_opt, POSIX_ACL);
 #endif
 	if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_DATA)
-		sbi->s_mount_opt |= EXT3_MOUNT_JOURNAL_DATA;
+		set_opt(sbi->s_mount_opt, JOURNAL_DATA);
 	else if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_ORDERED)
-		sbi->s_mount_opt |= EXT3_MOUNT_ORDERED_DATA;
+		set_opt(sbi->s_mount_opt, ORDERED_DATA);
 	else if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_WBACK)
-		sbi->s_mount_opt |= EXT3_MOUNT_WRITEBACK_DATA;
+		set_opt(sbi->s_mount_opt, WRITEBACK_DATA);
 
 	if (le16_to_cpu(sbi->s_es->s_errors) == EXT3_ERRORS_PANIC)
 		set_opt(sbi->s_mount_opt, ERRORS_PANIC);
@@ -1694,7 +1713,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 		goto failed_mount;
 
 	sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
-		((sbi->s_mount_opt & EXT3_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
+		(test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
 
 	if (le32_to_cpu(es->s_rev_level) == EXT3_GOOD_OLD_REV &&
 	    (EXT3_HAS_COMPAT_FEATURE(sb, ~0U) ||
@@ -1889,21 +1908,6 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 	get_random_bytes(&sbi->s_next_generation, sizeof(u32));
 	spin_lock_init(&sbi->s_next_gen_lock);
 
-	err = percpu_counter_init(&sbi->s_freeblocks_counter,
-			ext3_count_free_blocks(sb));
-	if (!err) {
-		err = percpu_counter_init(&sbi->s_freeinodes_counter,
-				ext3_count_free_inodes(sb));
-	}
-	if (!err) {
-		err = percpu_counter_init(&sbi->s_dirs_counter,
-				ext3_count_dirs(sb));
-	}
-	if (err) {
-		ext3_msg(sb, KERN_ERR, "error: insufficient memory");
-		goto failed_mount3;
-	}
-
 	/* per fileystem reservation list head & lock */
 	spin_lock_init(&sbi->s_rsv_window_lock);
 	sbi->s_rsv_window_root = RB_ROOT;
@@ -1944,15 +1948,29 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 	if (!test_opt(sb, NOLOAD) &&
 	    EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) {
 		if (ext3_load_journal(sb, es, journal_devnum))
-			goto failed_mount3;
+			goto failed_mount2;
 	} else if (journal_inum) {
 		if (ext3_create_journal(sb, es, journal_inum))
-			goto failed_mount3;
+			goto failed_mount2;
 	} else {
 		if (!silent)
 			ext3_msg(sb, KERN_ERR,
 				"error: no journal found. "
 				"mounting ext3 over ext2?");
+		goto failed_mount2;
+	}
+	err = percpu_counter_init(&sbi->s_freeblocks_counter,
+			ext3_count_free_blocks(sb));
+	if (!err) {
+		err = percpu_counter_init(&sbi->s_freeinodes_counter,
+				ext3_count_free_inodes(sb));
+	}
+	if (!err) {
+		err = percpu_counter_init(&sbi->s_dirs_counter,
+				ext3_count_dirs(sb));
+	}
+	if (err) {
+		ext3_msg(sb, KERN_ERR, "error: insufficient memory");
 		goto failed_mount3;
 	}
 
@@ -1977,7 +1995,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 			ext3_msg(sb, KERN_ERR,
 				"error: journal does not support "
 				"requested data journaling mode");
-			goto failed_mount4;
+			goto failed_mount3;
 		}
 	default:
 		break;
@@ -2000,19 +2018,19 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 	if (IS_ERR(root)) {
 		ext3_msg(sb, KERN_ERR, "error: get root inode failed");
 		ret = PTR_ERR(root);
-		goto failed_mount4;
+		goto failed_mount3;
 	}
 	if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
 		iput(root);
 		ext3_msg(sb, KERN_ERR, "error: corrupt root inode, run e2fsck");
-		goto failed_mount4;
+		goto failed_mount3;
 	}
 	sb->s_root = d_alloc_root(root);
 	if (!sb->s_root) {
 		ext3_msg(sb, KERN_ERR, "error: get root dentry failed");
 		iput(root);
 		ret = -ENOMEM;
-		goto failed_mount4;
+		goto failed_mount3;
 	}
 
 	ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY);
@@ -2038,12 +2056,11 @@ cantfind_ext3:
 		       sb->s_id);
 	goto failed_mount;
 
-failed_mount4:
-	journal_destroy(sbi->s_journal);
 failed_mount3:
 	percpu_counter_destroy(&sbi->s_freeblocks_counter);
 	percpu_counter_destroy(&sbi->s_freeinodes_counter);
 	percpu_counter_destroy(&sbi->s_dirs_counter);
+	journal_destroy(sbi->s_journal);
 failed_mount2:
 	for (i = 0; i < db_count; i++)
 		brelse(sbi->s_group_desc[i]);
@@ -2277,6 +2294,9 @@ static int ext3_load_journal(struct super_block *sb,
 			return -EINVAL;
 	}
 
+	if (!(journal->j_flags & JFS_BARRIER))
+		printk(KERN_INFO "EXT3-fs: barriers not enabled\n");
+
 	if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) {
 		err = journal_update_format(journal);
 		if (err)  {
@@ -2533,6 +2553,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
 	ext3_fsblk_t n_blocks_count = 0;
 	unsigned long old_sb_flags;
 	struct ext3_mount_options old_opts;
+	int enable_quota = 0;
 	int err;
 #ifdef CONFIG_QUOTA
 	int i;
@@ -2561,11 +2582,11 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
 		goto restore_opts;
 	}
 
-	if (sbi->s_mount_opt & EXT3_MOUNT_ABORT)
+	if (test_opt(sb, ABORT))
 		ext3_abort(sb, __func__, "Abort forced by user");
 
 	sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
-		((sbi->s_mount_opt & EXT3_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
+		(test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
 
 	es = sbi->s_es;
 
@@ -2573,12 +2594,16 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
 
 	if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) ||
 		n_blocks_count > le32_to_cpu(es->s_blocks_count)) {
-		if (sbi->s_mount_opt & EXT3_MOUNT_ABORT) {
+		if (test_opt(sb, ABORT)) {
 			err = -EROFS;
 			goto restore_opts;
 		}
 
 		if (*flags & MS_RDONLY) {
+			err = dquot_suspend(sb, -1);
+			if (err < 0)
+				goto restore_opts;
+
 			/*
 			 * First of all, the unconditional stuff we have to do
 			 * to disable replay of the journal when we next remount
@@ -2633,6 +2658,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
 				goto restore_opts;
 			if (!ext3_setup_super (sb, es, 0))
 				sb->s_flags &= ~MS_RDONLY;
+			enable_quota = 1;
 		}
 	}
 #ifdef CONFIG_QUOTA
@@ -2644,6 +2670,9 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
 #endif
 	unlock_super(sb);
 	unlock_kernel();
+
+	if (enable_quota)
+		dquot_resume(sb, -1);
 	return 0;
 restore_opts:
 	sb->s_flags = old_sb_flags;
@@ -2734,7 +2763,7 @@ static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf)
  * Process 1                         Process 2
  * ext3_create()                     quota_sync()
  *   journal_start()                   write_dquot()
- *   vfs_dq_init()                       down(dqio_mutex)
+ *   dquot_initialize()                       down(dqio_mutex)
  *     down(dqio_mutex)                    journal_start()
  *
  */
@@ -2833,24 +2862,21 @@ static int ext3_write_info(struct super_block *sb, int type)
  */
 static int ext3_quota_on_mount(struct super_block *sb, int type)
 {
-	return vfs_quota_on_mount(sb, EXT3_SB(sb)->s_qf_names[type],
-			EXT3_SB(sb)->s_jquota_fmt, type);
+	return dquot_quota_on_mount(sb, EXT3_SB(sb)->s_qf_names[type],
+					EXT3_SB(sb)->s_jquota_fmt, type);
 }
 
 /*
  * Standard function to be called on quota_on
  */
 static int ext3_quota_on(struct super_block *sb, int type, int format_id,
-			 char *name, int remount)
+			 char *name)
 {
 	int err;
 	struct path path;
 
 	if (!test_opt(sb, QUOTA))
 		return -EINVAL;
-	/* When remounting, no checks are needed and in fact, name is NULL */
-	if (remount)
-		return vfs_quota_on(sb, type, format_id, name, remount);
 
 	err = kern_path(name, LOOKUP_FOLLOW, &path);
 	if (err)
@@ -2888,7 +2914,7 @@ static int ext3_quota_on(struct super_block *sb, int type, int format_id,
 		}
 	}
 
-	err = vfs_quota_on_path(sb, type, format_id, &path);
+	err = dquot_quota_on_path(sb, type, format_id, &path);
 	path_put(&path);
 	return err;
 }
@@ -2942,9 +2968,7 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type,
 	sector_t blk = off >> EXT3_BLOCK_SIZE_BITS(sb);
 	int err = 0;
 	int offset = off & (sb->s_blocksize - 1);
-	int tocopy;
 	int journal_quota = EXT3_SB(sb)->s_qf_names[type] != NULL;
-	size_t towrite = len;
 	struct buffer_head *bh;
 	handle_t *handle = journal_current_handle();
 
@@ -2955,53 +2979,54 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type,
 			(unsigned long long)off, (unsigned long long)len);
 		return -EIO;
 	}
+
+	/*
+	 * Since we account only one data block in transaction credits,
+	 * then it is impossible to cross a block boundary.
+	 */
+	if (sb->s_blocksize - offset < len) {
+		ext3_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)"
+			" cancelled because not block aligned",
+			(unsigned long long)off, (unsigned long long)len);
+		return -EIO;
+	}
 	mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
-	while (towrite > 0) {
-		tocopy = sb->s_blocksize - offset < towrite ?
-				sb->s_blocksize - offset : towrite;
-		bh = ext3_bread(handle, inode, blk, 1, &err);
-		if (!bh)
+	bh = ext3_bread(handle, inode, blk, 1, &err);
+	if (!bh)
+		goto out;
+	if (journal_quota) {
+		err = ext3_journal_get_write_access(handle, bh);
+		if (err) {
+			brelse(bh);
 			goto out;
-		if (journal_quota) {
-			err = ext3_journal_get_write_access(handle, bh);
-			if (err) {
-				brelse(bh);
-				goto out;
-			}
-		}
-		lock_buffer(bh);
-		memcpy(bh->b_data+offset, data, tocopy);
-		flush_dcache_page(bh->b_page);
-		unlock_buffer(bh);
-		if (journal_quota)
-			err = ext3_journal_dirty_metadata(handle, bh);
-		else {
-			/* Always do at least ordered writes for quotas */
-			err = ext3_journal_dirty_data(handle, bh);
-			mark_buffer_dirty(bh);
 		}
-		brelse(bh);
-		if (err)
-			goto out;
-		offset = 0;
-		towrite -= tocopy;
-		data += tocopy;
-		blk++;
 	}
+	lock_buffer(bh);
+	memcpy(bh->b_data+offset, data, len);
+	flush_dcache_page(bh->b_page);
+	unlock_buffer(bh);
+	if (journal_quota)
+		err = ext3_journal_dirty_metadata(handle, bh);
+	else {
+		/* Always do at least ordered writes for quotas */
+		err = ext3_journal_dirty_data(handle, bh);
+		mark_buffer_dirty(bh);
+	}
+	brelse(bh);
 out:
-	if (len == towrite) {
+	if (err) {
 		mutex_unlock(&inode->i_mutex);
 		return err;
 	}
-	if (inode->i_size < off+len-towrite) {
-		i_size_write(inode, off+len-towrite);
+	if (inode->i_size < off + len) {
+		i_size_write(inode, off + len);
 		EXT3_I(inode)->i_disksize = inode->i_size;
 	}
 	inode->i_version++;
 	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 	ext3_mark_inode_dirty(handle, inode);
 	mutex_unlock(&inode->i_mutex);
-	return len - towrite;
+	return len;
 }
 
 #endif
diff --git a/fs/ext3/symlink.c b/fs/ext3/symlink.c
index ff7b4ccd8983..7c4898207776 100644
--- a/fs/ext3/symlink.c
+++ b/fs/ext3/symlink.c
@@ -34,6 +34,7 @@ const struct inode_operations ext3_symlink_inode_operations = {
 	.readlink	= generic_readlink,
 	.follow_link	= page_follow_link_light,
 	.put_link	= page_put_link,
+	.setattr	= ext3_setattr,
 #ifdef CONFIG_EXT3_FS_XATTR
 	.setxattr	= generic_setxattr,
 	.getxattr	= generic_getxattr,
@@ -45,6 +46,7 @@ const struct inode_operations ext3_symlink_inode_operations = {
 const struct inode_operations ext3_fast_symlink_inode_operations = {
 	.readlink	= generic_readlink,
 	.follow_link	= ext3_follow_link,
+	.setattr	= ext3_setattr,
 #ifdef CONFIG_EXT3_FS_XATTR
 	.setxattr	= generic_setxattr,
 	.getxattr	= generic_getxattr,
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
index 66895ccf76c7..71fb8d65e54c 100644
--- a/fs/ext3/xattr.c
+++ b/fs/ext3/xattr.c
@@ -104,7 +104,7 @@ static int ext3_xattr_list(struct dentry *dentry, char *buffer,
 
 static struct mb_cache *ext3_xattr_cache;
 
-static struct xattr_handler *ext3_xattr_handler_map[] = {
+static const struct xattr_handler *ext3_xattr_handler_map[] = {
 	[EXT3_XATTR_INDEX_USER]		     = &ext3_xattr_user_handler,
 #ifdef CONFIG_EXT3_FS_POSIX_ACL
 	[EXT3_XATTR_INDEX_POSIX_ACL_ACCESS]  = &ext3_xattr_acl_access_handler,
@@ -116,7 +116,7 @@ static struct xattr_handler *ext3_xattr_handler_map[] = {
 #endif
 };
 
-struct xattr_handler *ext3_xattr_handlers[] = {
+const struct xattr_handler *ext3_xattr_handlers[] = {
 	&ext3_xattr_user_handler,
 	&ext3_xattr_trusted_handler,
 #ifdef CONFIG_EXT3_FS_POSIX_ACL
@@ -129,10 +129,10 @@ struct xattr_handler *ext3_xattr_handlers[] = {
 	NULL
 };
 
-static inline struct xattr_handler *
+static inline const struct xattr_handler *
 ext3_xattr_handler(int name_index)
 {
-	struct xattr_handler *handler = NULL;
+	const struct xattr_handler *handler = NULL;
 
 	if (name_index > 0 && name_index < ARRAY_SIZE(ext3_xattr_handler_map))
 		handler = ext3_xattr_handler_map[name_index];
@@ -274,7 +274,7 @@ ext3_xattr_ibody_get(struct inode *inode, int name_index, const char *name,
 	void *end;
 	int error;
 
-	if (!(EXT3_I(inode)->i_state & EXT3_STATE_XATTR))
+	if (!ext3_test_inode_state(inode, EXT3_STATE_XATTR))
 		return -ENODATA;
 	error = ext3_get_inode_loc(inode, &iloc);
 	if (error)
@@ -338,7 +338,7 @@ ext3_xattr_list_entries(struct dentry *dentry, struct ext3_xattr_entry *entry,
 	size_t rest = buffer_size;
 
 	for (; !IS_LAST_ENTRY(entry); entry = EXT3_XATTR_NEXT(entry)) {
-		struct xattr_handler *handler =
+		const struct xattr_handler *handler =
 			ext3_xattr_handler(entry->e_name_index);
 
 		if (handler) {
@@ -403,7 +403,7 @@ ext3_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size)
 	void *end;
 	int error;
 
-	if (!(EXT3_I(inode)->i_state & EXT3_STATE_XATTR))
+	if (!ext3_test_inode_state(inode, EXT3_STATE_XATTR))
 		return 0;
 	error = ext3_get_inode_loc(inode, &iloc);
 	if (error)
@@ -500,7 +500,7 @@ ext3_xattr_release_block(handle_t *handle, struct inode *inode,
 		error = ext3_journal_dirty_metadata(handle, bh);
 		if (IS_SYNC(inode))
 			handle->h_sync = 1;
-		vfs_dq_free_block(inode, 1);
+		dquot_free_block(inode, 1);
 		ea_bdebug(bh, "refcount now=%d; releasing",
 			  le32_to_cpu(BHDR(bh)->h_refcount));
 		if (ce)
@@ -775,8 +775,8 @@ inserted:
 			else {
 				/* The old block is released after updating
 				   the inode. */
-				error = -EDQUOT;
-				if (vfs_dq_alloc_block(inode, 1))
+				error = dquot_alloc_block(inode, 1);
+				if (error)
 					goto cleanup;
 				error = ext3_journal_get_write_access(handle,
 								      new_bh);
@@ -850,7 +850,7 @@ cleanup:
 	return error;
 
 cleanup_dquot:
-	vfs_dq_free_block(inode, 1);
+	dquot_free_block(inode, 1);
 	goto cleanup;
 
 bad_block:
@@ -882,7 +882,7 @@ ext3_xattr_ibody_find(struct inode *inode, struct ext3_xattr_info *i,
 	is->s.base = is->s.first = IFIRST(header);
 	is->s.here = is->s.first;
 	is->s.end = (void *)raw_inode + EXT3_SB(inode->i_sb)->s_inode_size;
-	if (EXT3_I(inode)->i_state & EXT3_STATE_XATTR) {
+	if (ext3_test_inode_state(inode, EXT3_STATE_XATTR)) {
 		error = ext3_xattr_check_names(IFIRST(header), is->s.end);
 		if (error)
 			return error;
@@ -914,10 +914,10 @@ ext3_xattr_ibody_set(handle_t *handle, struct inode *inode,
 	header = IHDR(inode, ext3_raw_inode(&is->iloc));
 	if (!IS_LAST_ENTRY(s->first)) {
 		header->h_magic = cpu_to_le32(EXT3_XATTR_MAGIC);
-		EXT3_I(inode)->i_state |= EXT3_STATE_XATTR;
+		ext3_set_inode_state(inode, EXT3_STATE_XATTR);
 	} else {
 		header->h_magic = cpu_to_le32(0);
-		EXT3_I(inode)->i_state &= ~EXT3_STATE_XATTR;
+		ext3_clear_inode_state(inode, EXT3_STATE_XATTR);
 	}
 	return 0;
 }
@@ -967,10 +967,10 @@ ext3_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
 	if (error)
 		goto cleanup;
 
-	if (EXT3_I(inode)->i_state & EXT3_STATE_NEW) {
+	if (ext3_test_inode_state(inode, EXT3_STATE_NEW)) {
 		struct ext3_inode *raw_inode = ext3_raw_inode(&is.iloc);
 		memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size);
-		EXT3_I(inode)->i_state &= ~EXT3_STATE_NEW;
+		ext3_clear_inode_state(inode, EXT3_STATE_NEW);
 	}
 
 	error = ext3_xattr_ibody_find(inode, &i, &is);
diff --git a/fs/ext3/xattr.h b/fs/ext3/xattr.h
index 148a4dfc82ab..377fe7201169 100644
--- a/fs/ext3/xattr.h
+++ b/fs/ext3/xattr.h
@@ -58,11 +58,11 @@ struct ext3_xattr_entry {
 
 # ifdef CONFIG_EXT3_FS_XATTR
 
-extern struct xattr_handler ext3_xattr_user_handler;
-extern struct xattr_handler ext3_xattr_trusted_handler;
-extern struct xattr_handler ext3_xattr_acl_access_handler;
-extern struct xattr_handler ext3_xattr_acl_default_handler;
-extern struct xattr_handler ext3_xattr_security_handler;
+extern const struct xattr_handler ext3_xattr_user_handler;
+extern const struct xattr_handler ext3_xattr_trusted_handler;
+extern const struct xattr_handler ext3_xattr_acl_access_handler;
+extern const struct xattr_handler ext3_xattr_acl_default_handler;
+extern const struct xattr_handler ext3_xattr_security_handler;
 
 extern ssize_t ext3_listxattr(struct dentry *, char *, size_t);
 
@@ -76,7 +76,7 @@ extern void ext3_xattr_put_super(struct super_block *);
 extern int init_ext3_xattr(void);
 extern void exit_ext3_xattr(void);
 
-extern struct xattr_handler *ext3_xattr_handlers[];
+extern const struct xattr_handler *ext3_xattr_handlers[];
 
 # else  /* CONFIG_EXT3_FS_XATTR */
 
diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c
index 474348788dd9..03a99bfc59f9 100644
--- a/fs/ext3/xattr_security.c
+++ b/fs/ext3/xattr_security.c
@@ -4,6 +4,7 @@
  */
 
 #include <linux/module.h>
+#include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/fs.h>
 #include <linux/ext3_jbd.h>
@@ -68,7 +69,7 @@ ext3_init_security(handle_t *handle, struct inode *inode, struct inode *dir)
 	return err;
 }
 
-struct xattr_handler ext3_xattr_security_handler = {
+const struct xattr_handler ext3_xattr_security_handler = {
 	.prefix	= XATTR_SECURITY_PREFIX,
 	.list	= ext3_xattr_security_list,
 	.get	= ext3_xattr_security_get,
diff --git a/fs/ext3/xattr_trusted.c b/fs/ext3/xattr_trusted.c
index e5562845ed96..dc8edda9ffe0 100644
--- a/fs/ext3/xattr_trusted.c
+++ b/fs/ext3/xattr_trusted.c
@@ -51,7 +51,7 @@ ext3_xattr_trusted_set(struct dentry *dentry, const char *name,
 			      value, size, flags);
 }
 
-struct xattr_handler ext3_xattr_trusted_handler = {
+const struct xattr_handler ext3_xattr_trusted_handler = {
 	.prefix	= XATTR_TRUSTED_PREFIX,
 	.list	= ext3_xattr_trusted_list,
 	.get	= ext3_xattr_trusted_get,
diff --git a/fs/ext3/xattr_user.c b/fs/ext3/xattr_user.c
index 3bcfe9ee0a68..7a321974d584 100644
--- a/fs/ext3/xattr_user.c
+++ b/fs/ext3/xattr_user.c
@@ -54,7 +54,7 @@ ext3_xattr_user_set(struct dentry *dentry, const char *name,
 			      name, value, size, flags);
 }
 
-struct xattr_handler ext3_xattr_user_handler = {
+const struct xattr_handler ext3_xattr_user_handler = {
 	.prefix	= XATTR_USER_PREFIX,
 	.list	= ext3_xattr_user_list,
 	.get	= ext3_xattr_user_get,
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 8a2a29d35a6f..feaf498feaa6 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -454,7 +454,7 @@ release_and_out:
 	return error;
 }
 
-struct xattr_handler ext4_xattr_acl_access_handler = {
+const struct xattr_handler ext4_xattr_acl_access_handler = {
 	.prefix	= POSIX_ACL_XATTR_ACCESS,
 	.flags	= ACL_TYPE_ACCESS,
 	.list	= ext4_xattr_list_acl_access,
@@ -462,7 +462,7 @@ struct xattr_handler ext4_xattr_acl_access_handler = {
 	.set	= ext4_xattr_set_acl,
 };
 
-struct xattr_handler ext4_xattr_acl_default_handler = {
+const struct xattr_handler ext4_xattr_acl_default_handler = {
 	.prefix	= POSIX_ACL_XATTR_DEFAULT,
 	.flags	= ACL_TYPE_DEFAULT,
 	.list	= ext4_xattr_list_acl_default,
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 22bc7435d913..95b7594c76f9 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -97,8 +97,8 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
 		/* If checksum is bad mark all blocks used to prevent allocation
 		 * essentially implementing a per-group read-only flag. */
 		if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
-			ext4_error(sb, __func__,
-				  "Checksum bad for group %u", block_group);
+			ext4_error(sb, "Checksum bad for group %u",
+					block_group);
 			ext4_free_blks_set(sb, gdp, 0);
 			ext4_free_inodes_set(sb, gdp, 0);
 			ext4_itable_unused_set(sb, gdp, 0);
@@ -130,8 +130,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
 		 * to make sure we calculate the right free blocks
 		 */
 		group_blocks = ext4_blocks_count(sbi->s_es) -
-			le32_to_cpu(sbi->s_es->s_first_data_block) -
-			(EXT4_BLOCKS_PER_GROUP(sb) * (ngroups - 1));
+			ext4_group_first_block_no(sb, ngroups - 1);
 	} else {
 		group_blocks = EXT4_BLOCKS_PER_GROUP(sb);
 	}
@@ -189,9 +188,6 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
  * when a file system is mounted (see ext4_fill_super).
  */
 
-
-#define in_range(b, first, len)	((b) >= (first) && (b) <= (first) + (len) - 1)
-
 /**
  * ext4_get_group_desc() -- load group descriptor from disk
  * @sb:			super block
@@ -210,10 +206,8 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 
 	if (block_group >= ngroups) {
-		ext4_error(sb, "ext4_get_group_desc",
-			   "block_group >= groups_count - "
-			   "block_group = %u, groups_count = %u",
-			   block_group, ngroups);
+		ext4_error(sb, "block_group >= groups_count - block_group = %u,"
+			   " groups_count = %u", block_group, ngroups);
 
 		return NULL;
 	}
@@ -221,8 +215,7 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
 	group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb);
 	offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1);
 	if (!sbi->s_group_desc[group_desc]) {
-		ext4_error(sb, "ext4_get_group_desc",
-			   "Group descriptor not loaded - "
+		ext4_error(sb, "Group descriptor not loaded - "
 			   "block_group = %u, group_desc = %u, desc = %u",
 			   block_group, group_desc, offset);
 		return NULL;
@@ -282,9 +275,7 @@ static int ext4_valid_block_bitmap(struct super_block *sb,
 		return 1;
 
 err_out:
-	ext4_error(sb, __func__,
-			"Invalid block bitmap - "
-			"block_group = %d, block = %llu",
+	ext4_error(sb, "Invalid block bitmap - block_group = %d, block = %llu",
 			block_group, bitmap_blk);
 	return 0;
 }
@@ -311,8 +302,7 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 	bitmap_blk = ext4_block_bitmap(sb, desc);
 	bh = sb_getblk(sb, bitmap_blk);
 	if (unlikely(!bh)) {
-		ext4_error(sb, __func__,
-			    "Cannot read block bitmap - "
+		ext4_error(sb, "Cannot read block bitmap - "
 			    "block_group = %u, block_bitmap = %llu",
 			    block_group, bitmap_blk);
 		return NULL;
@@ -354,8 +344,7 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 	set_bitmap_uptodate(bh);
 	if (bh_submit_read(bh) < 0) {
 		put_bh(bh);
-		ext4_error(sb, __func__,
-			    "Cannot read block bitmap - "
+		ext4_error(sb, "Cannot read block bitmap - "
 			    "block_group = %u, block_bitmap = %llu",
 			    block_group, bitmap_blk);
 		return NULL;
@@ -419,8 +408,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
 	    in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
 	    in_range(block + count - 1, ext4_inode_table(sb, desc),
 		     sbi->s_itb_per_group)) {
-		ext4_error(sb, __func__,
-			   "Adding blocks in system zones - "
+		ext4_error(sb, "Adding blocks in system zones - "
 			   "Block = %llu, count = %lu",
 			   block, count);
 		goto error_return;
@@ -453,8 +441,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
 		BUFFER_TRACE(bitmap_bh, "clear bit");
 		if (!ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group),
 						bit + i, bitmap_bh->b_data)) {
-			ext4_error(sb, __func__,
-				   "bit already cleared for block %llu",
+			ext4_error(sb, "bit already cleared for block %llu",
 				   (ext4_fsblk_t)(block + i));
 			BUFFER_TRACE(bitmap_bh, "bit already cleared");
 		} else {
@@ -604,14 +591,15 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
 	ret = ext4_mb_new_blocks(handle, &ar, errp);
 	if (count)
 		*count = ar.len;
-
 	/*
-	 * Account for the allocated meta blocks
+	 * Account for the allocated meta blocks.  We will never
+	 * fail EDQUOT for metdata, but we do account for it.
 	 */
 	if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) {
 		spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
 		EXT4_I(inode)->i_allocated_meta_blocks += ar.len;
 		spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+		dquot_alloc_block_nofail(inode, ar.len);
 	}
 	return ret;
 }
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index a60ab9aad57d..5b6973fbf1bd 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -18,6 +18,7 @@
 #include <linux/pagemap.h>
 #include <linux/blkdev.h>
 #include <linux/mutex.h>
+#include <linux/slab.h>
 #include "ext4.h"
 
 struct ext4_system_zone {
@@ -71,9 +72,9 @@ static int add_system_zone(struct ext4_sb_info *sbi,
 		else if (start_blk >= (entry->start_blk + entry->count))
 			n = &(*n)->rb_right;
 		else {
-			if (start_blk + count > (entry->start_blk + 
+			if (start_blk + count > (entry->start_blk +
 						 entry->count))
-				entry->count = (start_blk + count - 
+				entry->count = (start_blk + count -
 						entry->start_blk);
 			new_node = *n;
 			new_entry = rb_entry(new_node, struct ext4_system_zone,
@@ -205,14 +206,14 @@ void ext4_release_system_zone(struct super_block *sb)
 		entry = rb_entry(n, struct ext4_system_zone, node);
 		kmem_cache_free(ext4_system_zone_cachep, entry);
 		if (!parent)
-			EXT4_SB(sb)->system_blks.rb_node = NULL;
+			EXT4_SB(sb)->system_blks = RB_ROOT;
 		else if (parent->rb_left == n)
 			parent->rb_left = NULL;
 		else if (parent->rb_right == n)
 			parent->rb_right = NULL;
 		n = parent;
 	}
-	EXT4_SB(sb)->system_blks.rb_node = NULL;
+	EXT4_SB(sb)->system_blks = RB_ROOT;
 }
 
 /*
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 9dc93168e262..ea5e6cb7e2a5 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -83,10 +83,11 @@ int ext4_check_dir_entry(const char *function, struct inode *dir,
 		error_msg = "inode out of bounds";
 
 	if (error_msg != NULL)
-		ext4_error(dir->i_sb, function,
-			"bad entry in directory #%lu: %s - "
-			"offset=%u, inode=%u, rec_len=%d, name_len=%d",
-			dir->i_ino, error_msg, offset,
+		ext4_error_inode(function, dir,
+			"bad entry in directory: %s - block=%llu"
+			"offset=%u(%u), inode=%u, rec_len=%d, name_len=%d",
+			error_msg, (unsigned long long) bh->b_blocknr,
+			(unsigned) (offset%bh->b_size), offset,
 			le32_to_cpu(de->inode),
 			rlen, de->name_len);
 	return error_msg == NULL ? 1 : 0;
@@ -109,7 +110,7 @@ static int ext4_readdir(struct file *filp,
 
 	if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
 				    EXT4_FEATURE_COMPAT_DIR_INDEX) &&
-	    ((EXT4_I(inode)->i_flags & EXT4_INDEX_FL) ||
+	    ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) ||
 	     ((inode->i_size >> sb->s_blocksize_bits) == 1))) {
 		err = ext4_dx_readdir(filp, dirent, filldir);
 		if (err != ERR_BAD_DX_DIR) {
@@ -120,20 +121,20 @@ static int ext4_readdir(struct file *filp,
 		 * We don't set the inode dirty flag since it's not
 		 * critical that it get flushed back to the disk.
 		 */
-		EXT4_I(filp->f_path.dentry->d_inode)->i_flags &= ~EXT4_INDEX_FL;
+		ext4_clear_inode_flag(filp->f_path.dentry->d_inode, EXT4_INODE_INDEX);
 	}
 	stored = 0;
 	offset = filp->f_pos & (sb->s_blocksize - 1);
 
 	while (!error && !stored && filp->f_pos < inode->i_size) {
-		ext4_lblk_t blk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb);
-		struct buffer_head map_bh;
+		struct ext4_map_blocks map;
 		struct buffer_head *bh = NULL;
 
-		map_bh.b_state = 0;
-		err = ext4_get_blocks(NULL, inode, blk, 1, &map_bh, 0);
+		map.m_lblk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb);
+		map.m_len = 1;
+		err = ext4_map_blocks(NULL, inode, &map, 0);
 		if (err > 0) {
-			pgoff_t index = map_bh.b_blocknr >>
+			pgoff_t index = map.m_pblk >>
 					(PAGE_CACHE_SHIFT - inode->i_blkbits);
 			if (!ra_has_index(&filp->f_ra, index))
 				page_cache_sync_readahead(
@@ -141,7 +142,7 @@ static int ext4_readdir(struct file *filp,
 					&filp->f_ra, filp,
 					index, 1);
 			filp->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
-			bh = ext4_bread(NULL, inode, blk, 0, &err);
+			bh = ext4_bread(NULL, inode, map.m_lblk, 0, &err);
 		}
 
 		/*
@@ -150,9 +151,8 @@ static int ext4_readdir(struct file *filp,
 		 */
 		if (!bh) {
 			if (!dir_has_error) {
-				ext4_error(sb, __func__, "directory #%lu "
+				EXT4_ERROR_INODE(inode, "directory "
 					   "contains a hole at offset %Lu",
-					   inode->i_ino,
 					   (unsigned long long) filp->f_pos);
 				dir_has_error = 1;
 			}
@@ -303,7 +303,7 @@ static void free_rb_tree_fname(struct rb_root *root)
 			kfree(old);
 		}
 		if (!parent)
-			root->rb_node = NULL;
+			*root = RB_ROOT;
 		else if (parent->rb_left == n)
 			parent->rb_left = NULL;
 		else if (parent->rb_right == n)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 874d169a193e..19a4de57128a 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -29,6 +29,9 @@
 #include <linux/wait.h>
 #include <linux/blockgroup_lock.h>
 #include <linux/percpu_counter.h>
+#ifdef __KERNEL__
+#include <linux/compat.h>
+#endif
 
 /*
  * The fourth extended filesystem constants/structures
@@ -53,6 +56,12 @@
 #define ext4_debug(f, a...)	do {} while (0)
 #endif
 
+#define EXT4_ERROR_INODE(inode, fmt, a...) \
+	ext4_error_inode(__func__, (inode), (fmt), ## a)
+
+#define EXT4_ERROR_FILE(file, fmt, a...)	\
+	ext4_error_file(__func__, (file), (fmt), ## a)
+
 /* data type for block offset of block group */
 typedef int ext4_grpblk_t;
 
@@ -66,7 +75,7 @@ typedef __u32 ext4_lblk_t;
 typedef unsigned int ext4_group_t;
 
 /*
- * Flags used in mballoc's allocation_context flags field.  
+ * Flags used in mballoc's allocation_context flags field.
  *
  * Also used to show what's going on for debugging purposes when the
  * flag field is exported via the traceport interface
@@ -120,6 +129,29 @@ struct ext4_allocation_request {
 };
 
 /*
+ * Logical to physical block mapping, used by ext4_map_blocks()
+ *
+ * This structure is used to pass requests into ext4_map_blocks() as
+ * well as to store the information returned by ext4_map_blocks().  It
+ * takes less room on the stack than a struct buffer_head.
+ */
+#define EXT4_MAP_NEW		(1 << BH_New)
+#define EXT4_MAP_MAPPED		(1 << BH_Mapped)
+#define EXT4_MAP_UNWRITTEN	(1 << BH_Unwritten)
+#define EXT4_MAP_BOUNDARY	(1 << BH_Boundary)
+#define EXT4_MAP_UNINIT		(1 << BH_Uninit)
+#define EXT4_MAP_FLAGS		(EXT4_MAP_NEW | EXT4_MAP_MAPPED |\
+				 EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\
+				 EXT4_MAP_UNINIT)
+
+struct ext4_map_blocks {
+	ext4_fsblk_t m_pblk;
+	ext4_lblk_t m_lblk;
+	unsigned int m_len;
+	unsigned int m_flags;
+};
+
+/*
  * For delayed allocation tracking
  */
 struct mpage_da_data {
@@ -133,14 +165,14 @@ struct mpage_da_data {
 	int pages_written;
 	int retval;
 };
-#define	DIO_AIO_UNWRITTEN	0x1
+#define	EXT4_IO_UNWRITTEN	0x1
 typedef struct ext4_io_end {
 	struct list_head	list;		/* per-file finished AIO list */
 	struct inode		*inode;		/* file being written to */
 	unsigned int		flag;		/* unwritten or not */
-	int			error;		/* I/O error code */
-	ext4_lblk_t		offset;		/* offset in the file */
-	size_t			size;		/* size of the extent */
+	struct page		*page;		/* page struct for buffer write */
+	loff_t			offset;		/* offset in the file */
+	ssize_t			size;		/* size of the extent */
 	struct work_struct	work;		/* data work queue */
 } ext4_io_end_t;
 
@@ -284,10 +316,12 @@ struct flex_groups {
 #define EXT4_TOPDIR_FL			0x00020000 /* Top of directory hierarchies*/
 #define EXT4_HUGE_FILE_FL               0x00040000 /* Set to each huge file */
 #define EXT4_EXTENTS_FL			0x00080000 /* Inode uses extents */
+#define EXT4_EA_INODE_FL	        0x00200000 /* Inode used for large EA */
+#define EXT4_EOFBLOCKS_FL		0x00400000 /* Blocks allocated beyond EOF */
 #define EXT4_RESERVED_FL		0x80000000 /* reserved for ext4 lib */
 
-#define EXT4_FL_USER_VISIBLE		0x000BDFFF /* User visible flags */
-#define EXT4_FL_USER_MODIFIABLE		0x000B80FF /* User modifiable flags */
+#define EXT4_FL_USER_VISIBLE		0x004BDFFF /* User visible flags */
+#define EXT4_FL_USER_MODIFIABLE		0x004B80FF /* User modifiable flags */
 
 /* Flags that should be inherited by new inodes from their parent. */
 #define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\
@@ -314,15 +348,81 @@ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
 }
 
 /*
- * Inode dynamic state flags
+ * Inode flags used for atomic set/get
  */
-#define EXT4_STATE_JDATA		0x00000001 /* journaled data exists */
-#define EXT4_STATE_NEW			0x00000002 /* inode is newly created */
-#define EXT4_STATE_XATTR		0x00000004 /* has in-inode xattrs */
-#define EXT4_STATE_NO_EXPAND		0x00000008 /* No space for expansion */
-#define EXT4_STATE_DA_ALLOC_CLOSE	0x00000010 /* Alloc DA blks on close */
-#define EXT4_STATE_EXT_MIGRATE		0x00000020 /* Inode is migrating */
-#define EXT4_STATE_DIO_UNWRITTEN	0x00000040 /* need convert on dio done*/
+enum {
+	EXT4_INODE_SECRM	= 0,	/* Secure deletion */
+	EXT4_INODE_UNRM		= 1,	/* Undelete */
+	EXT4_INODE_COMPR	= 2,	/* Compress file */
+	EXT4_INODE_SYNC		= 3,	/* Synchronous updates */
+	EXT4_INODE_IMMUTABLE	= 4,	/* Immutable file */
+	EXT4_INODE_APPEND	= 5,	/* writes to file may only append */
+	EXT4_INODE_NODUMP	= 6,	/* do not dump file */
+	EXT4_INODE_NOATIME	= 7,	/* do not update atime */
+/* Reserved for compression usage... */
+	EXT4_INODE_DIRTY	= 8,
+	EXT4_INODE_COMPRBLK	= 9,	/* One or more compressed clusters */
+	EXT4_INODE_NOCOMPR	= 10,	/* Don't compress */
+	EXT4_INODE_ECOMPR	= 11,	/* Compression error */
+/* End compression flags --- maybe not all used */
+	EXT4_INODE_INDEX	= 12,	/* hash-indexed directory */
+	EXT4_INODE_IMAGIC	= 13,	/* AFS directory */
+	EXT4_INODE_JOURNAL_DATA	= 14,	/* file data should be journaled */
+	EXT4_INODE_NOTAIL	= 15,	/* file tail should not be merged */
+	EXT4_INODE_DIRSYNC	= 16,	/* dirsync behaviour (directories only) */
+	EXT4_INODE_TOPDIR	= 17,	/* Top of directory hierarchies*/
+	EXT4_INODE_HUGE_FILE	= 18,	/* Set to each huge file */
+	EXT4_INODE_EXTENTS	= 19,	/* Inode uses extents */
+	EXT4_INODE_EA_INODE	= 21,	/* Inode used for large EA */
+	EXT4_INODE_EOFBLOCKS	= 22,	/* Blocks allocated beyond EOF */
+	EXT4_INODE_RESERVED	= 31,	/* reserved for ext4 lib */
+};
+
+#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG))
+#define CHECK_FLAG_VALUE(FLAG) if (!TEST_FLAG_VALUE(FLAG)) { \
+	printk(KERN_EMERG "EXT4 flag fail: " #FLAG ": %d %d\n", \
+		EXT4_##FLAG##_FL, EXT4_INODE_##FLAG); BUG_ON(1); }
+
+/*
+ * Since it's pretty easy to mix up bit numbers and hex values, and we
+ * can't do a compile-time test for ENUM values, we use a run-time
+ * test to make sure that EXT4_XXX_FL is consistent with respect to
+ * EXT4_INODE_XXX.  If all is well the printk and BUG_ON will all drop
+ * out so it won't cost any extra space in the compiled kernel image.
+ * But it's important that these values are the same, since we are
+ * using EXT4_INODE_XXX to test for the flag values, but EXT4_XX_FL
+ * must be consistent with the values of FS_XXX_FL defined in
+ * include/linux/fs.h and the on-disk values found in ext2, ext3, and
+ * ext4 filesystems, and of course the values defined in e2fsprogs.
+ *
+ * It's not paranoia if the Murphy's Law really *is* out to get you.  :-)
+ */
+static inline void ext4_check_flag_values(void)
+{
+	CHECK_FLAG_VALUE(SECRM);
+	CHECK_FLAG_VALUE(UNRM);
+	CHECK_FLAG_VALUE(COMPR);
+	CHECK_FLAG_VALUE(SYNC);
+	CHECK_FLAG_VALUE(IMMUTABLE);
+	CHECK_FLAG_VALUE(APPEND);
+	CHECK_FLAG_VALUE(NODUMP);
+	CHECK_FLAG_VALUE(NOATIME);
+	CHECK_FLAG_VALUE(DIRTY);
+	CHECK_FLAG_VALUE(COMPRBLK);
+	CHECK_FLAG_VALUE(NOCOMPR);
+	CHECK_FLAG_VALUE(ECOMPR);
+	CHECK_FLAG_VALUE(INDEX);
+	CHECK_FLAG_VALUE(IMAGIC);
+	CHECK_FLAG_VALUE(JOURNAL_DATA);
+	CHECK_FLAG_VALUE(NOTAIL);
+	CHECK_FLAG_VALUE(DIRSYNC);
+	CHECK_FLAG_VALUE(TOPDIR);
+	CHECK_FLAG_VALUE(HUGE_FILE);
+	CHECK_FLAG_VALUE(EXTENTS);
+	CHECK_FLAG_VALUE(EA_INODE);
+	CHECK_FLAG_VALUE(EOFBLOCKS);
+	CHECK_FLAG_VALUE(RESERVED);
+}
 
 /* Used to pass group descriptor data when online resize is done */
 struct ext4_new_group_input {
@@ -335,6 +435,18 @@ struct ext4_new_group_input {
 	__u16 unused;
 };
 
+#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
+struct compat_ext4_new_group_input {
+	u32 group;
+	compat_u64 block_bitmap;
+	compat_u64 inode_bitmap;
+	compat_u64 inode_table;
+	u32 blocks_count;
+	u16 reserved_blocks;
+	u16 unused;
+};
+#endif
+
 /* The struct ext4_new_group_input in kernel space, with free_blocks_count */
 struct ext4_new_group_data {
 	__u32 group;
@@ -358,25 +470,26 @@ struct ext4_new_group_data {
 #define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT	(EXT4_GET_BLOCKS_UNINIT_EXT|\
 						 EXT4_GET_BLOCKS_CREATE)
 	/* Caller is from the delayed allocation writeout path,
-	   so set the magic i_delalloc_reserve_flag after taking the 
+	   so set the magic i_delalloc_reserve_flag after taking the
 	   inode allocation semaphore for */
 #define EXT4_GET_BLOCKS_DELALLOC_RESERVE	0x0004
 	/* caller is from the direct IO path, request to creation of an
 	unitialized extents if not allocated, split the uninitialized
 	extent if blocks has been preallocated already*/
-#define EXT4_GET_BLOCKS_DIO			0x0008
+#define EXT4_GET_BLOCKS_PRE_IO			0x0008
 #define EXT4_GET_BLOCKS_CONVERT			0x0010
-#define EXT4_GET_BLOCKS_DIO_CREATE_EXT		(EXT4_GET_BLOCKS_DIO|\
+#define EXT4_GET_BLOCKS_IO_CREATE_EXT		(EXT4_GET_BLOCKS_PRE_IO|\
+					 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
+	/* Convert extent to initialized after IO complete */
+#define EXT4_GET_BLOCKS_IO_CONVERT_EXT		(EXT4_GET_BLOCKS_CONVERT|\
 					 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
-	/* Convert extent to initialized after direct IO complete */
-#define EXT4_GET_BLOCKS_DIO_CONVERT_EXT		(EXT4_GET_BLOCKS_CONVERT|\
-					 EXT4_GET_BLOCKS_DIO_CREATE_EXT)
 
 /*
  * Flags used by ext4_free_blocks
  */
 #define EXT4_FREE_BLOCKS_METADATA	0x0001
 #define EXT4_FREE_BLOCKS_FORGET		0x0002
+#define EXT4_FREE_BLOCKS_VALIDATED	0x0004
 
 /*
  * ioctl commands
@@ -400,6 +513,7 @@ struct ext4_new_group_data {
 #define EXT4_IOC_ALLOC_DA_BLKS		_IO('f', 12)
 #define EXT4_IOC_MOVE_EXT		_IOWR('f', 15, struct move_extent)
 
+#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
 /*
  * ioctl commands in 32 bit emulation
  */
@@ -410,11 +524,13 @@ struct ext4_new_group_data {
 #define EXT4_IOC32_GETRSVSZ		_IOR('f', 5, int)
 #define EXT4_IOC32_SETRSVSZ		_IOW('f', 6, int)
 #define EXT4_IOC32_GROUP_EXTEND		_IOW('f', 7, unsigned int)
+#define EXT4_IOC32_GROUP_ADD		_IOW('f', 8, struct compat_ext4_new_group_input)
 #ifdef CONFIG_JBD2_DEBUG
 #define EXT4_IOC32_WAIT_FOR_READONLY	_IOR('f', 99, int)
 #endif
 #define EXT4_IOC32_GETVERSION_OLD	FS_IOC32_GETVERSION
 #define EXT4_IOC32_SETVERSION_OLD	FS_IOC32_SETVERSION
+#endif
 
 
 /*
@@ -618,9 +734,8 @@ struct ext4_ext_cache {
  */
 struct ext4_inode_info {
 	__le32	i_data[15];	/* unconverted */
-	__u32	i_flags;
-	ext4_fsblk_t	i_file_acl;
 	__u32	i_dtime;
+	ext4_fsblk_t	i_file_acl;
 
 	/*
 	 * i_block_group is the number of the block group which contains
@@ -630,7 +745,8 @@ struct ext4_inode_info {
 	 * near to their parent directory's inode.
 	 */
 	ext4_group_t	i_block_group;
-	__u32	i_state;		/* Dynamic state flags for ext4 */
+	unsigned long	i_state_flags;		/* Dynamic state flags */
+	unsigned long	i_flags;
 
 	ext4_lblk_t		i_dir_start_lookup;
 #ifdef CONFIG_EXT4_FS_XATTR
@@ -708,8 +824,9 @@ struct ext4_inode_info {
 	qsize_t i_reserved_quota;
 #endif
 
-	/* completed async DIOs that might need unwritten extents handling */
-	struct list_head i_aio_dio_complete_list;
+	/* completed IOs that might need unwritten extents handling */
+	struct list_head i_completed_io_list;
+	spinlock_t i_completed_io_lock;
 	/* current io_end structure for async DIO write*/
 	ext4_io_end_t *cur_aio_dio;
 
@@ -760,6 +877,7 @@ struct ext4_inode_info {
 #define EXT4_MOUNT_QUOTA		0x80000 /* Some quota option set */
 #define EXT4_MOUNT_USRQUOTA		0x100000 /* "old" user quota */
 #define EXT4_MOUNT_GRPQUOTA		0x200000 /* "old" group quota */
+#define EXT4_MOUNT_DIOREAD_NOLOCK	0x400000 /* Enable support for dio read nolocking */
 #define EXT4_MOUNT_JOURNAL_CHECKSUM	0x800000 /* Journal checksums */
 #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT	0x1000000 /* Journal Async Commit */
 #define EXT4_MOUNT_I_VERSION            0x2000000 /* i_version support */
@@ -1014,7 +1132,7 @@ struct ext4_sb_info {
 	atomic_t s_lock_busy;
 
 	/* locality groups */
-	struct ext4_locality_group *s_locality_groups;
+	struct ext4_locality_group __percpu *s_locality_groups;
 
 	/* for write statistics */
 	unsigned long s_sectors_written_start;
@@ -1050,6 +1168,37 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
 		(ino >= EXT4_FIRST_INO(sb) &&
 		 ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count));
 }
+
+/*
+ * Inode dynamic state flags
+ */
+enum {
+	EXT4_STATE_JDATA,		/* journaled data exists */
+	EXT4_STATE_NEW,			/* inode is newly created */
+	EXT4_STATE_XATTR,		/* has in-inode xattrs */
+	EXT4_STATE_NO_EXPAND,		/* No space for expansion */
+	EXT4_STATE_DA_ALLOC_CLOSE,	/* Alloc DA blks on close */
+	EXT4_STATE_EXT_MIGRATE,		/* Inode is migrating */
+	EXT4_STATE_DIO_UNWRITTEN,	/* need convert on dio done*/
+	EXT4_STATE_NEWENTRY,		/* File just added to dir */
+};
+
+#define EXT4_INODE_BIT_FNS(name, field)					\
+static inline int ext4_test_inode_##name(struct inode *inode, int bit)	\
+{									\
+	return test_bit(bit, &EXT4_I(inode)->i_##field);		\
+}									\
+static inline void ext4_set_inode_##name(struct inode *inode, int bit)	\
+{									\
+	set_bit(bit, &EXT4_I(inode)->i_##field);			\
+}									\
+static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \
+{									\
+	clear_bit(bit, &EXT4_I(inode)->i_##field);			\
+}
+
+EXT4_INODE_BIT_FNS(flag, flags)
+EXT4_INODE_BIT_FNS(state, state_flags)
 #else
 /* Assume that user mode programs are passing in an ext4fs superblock, not
  * a kernel struct super_block.  This will allow us to call the feature-test
@@ -1126,6 +1275,8 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
 #define EXT4_FEATURE_INCOMPAT_64BIT		0x0080
 #define EXT4_FEATURE_INCOMPAT_MMP               0x0100
 #define EXT4_FEATURE_INCOMPAT_FLEX_BG		0x0200
+#define EXT4_FEATURE_INCOMPAT_EA_INODE		0x0400 /* EA in inode */
+#define EXT4_FEATURE_INCOMPAT_DIRDATA		0x1000 /* data in dirent */
 
 #define EXT4_FEATURE_COMPAT_SUPP	EXT2_FEATURE_COMPAT_EXT_ATTR
 #define EXT4_FEATURE_INCOMPAT_SUPP	(EXT4_FEATURE_INCOMPAT_FILETYPE| \
@@ -1234,7 +1385,7 @@ struct ext4_dir_entry_2 {
 
 #define is_dx(dir) (EXT4_HAS_COMPAT_FEATURE(dir->i_sb, \
 				      EXT4_FEATURE_COMPAT_DIR_INDEX) && \
-		      (EXT4_I(dir)->i_flags & EXT4_INDEX_FL))
+		    ext4_test_inode_flag((dir), EXT4_INODE_INDEX))
 #define EXT4_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT4_LINK_MAX)
 #define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1)
 
@@ -1368,7 +1519,7 @@ extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
 extern void ext4_htree_free_dir_info(struct dir_private_info *p);
 
 /* fsync.c */
-extern int ext4_sync_file(struct file *, struct dentry *, int);
+extern int ext4_sync_file(struct file *, int);
 
 /* hash.c */
 extern int ext4fs_dirhash(const char *name, int len, struct
@@ -1416,7 +1567,7 @@ int ext4_get_block(struct inode *inode, sector_t iblock,
 				struct buffer_head *bh_result, int create);
 
 extern struct inode *ext4_iget(struct super_block *, unsigned long);
-extern int  ext4_write_inode(struct inode *, int);
+extern int  ext4_write_inode(struct inode *, struct writeback_control *);
 extern int  ext4_setattr(struct dentry *, struct iattr *);
 extern int  ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
 				struct kstat *stat);
@@ -1439,7 +1590,7 @@ extern int ext4_block_truncate_page(handle_t *handle,
 		struct address_space *mapping, loff_t from);
 extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 extern qsize_t *ext4_get_reserved_space(struct inode *inode);
-extern int flush_aio_dio_completed_IO(struct inode *inode);
+extern int flush_completed_IO(struct inode *inode);
 extern void ext4_da_update_reserve_space(struct inode *inode,
 					int used, int quota_claim);
 /* ioctl.c */
@@ -1465,13 +1616,20 @@ extern int ext4_group_extend(struct super_block *sb,
 				ext4_fsblk_t n_blocks_count);
 
 /* super.c */
-extern void ext4_error(struct super_block *, const char *, const char *, ...)
+extern void __ext4_error(struct super_block *, const char *, const char *, ...)
+	__attribute__ ((format (printf, 3, 4)));
+#define ext4_error(sb, message...)	__ext4_error(sb, __func__, ## message)
+extern void ext4_error_inode(const char *, struct inode *, const char *, ...)
+	__attribute__ ((format (printf, 3, 4)));
+extern void ext4_error_file(const char *, struct file *, const char *, ...)
 	__attribute__ ((format (printf, 3, 4)));
 extern void __ext4_std_error(struct super_block *, const char *, int);
 extern void ext4_abort(struct super_block *, const char *, const char *, ...)
 	__attribute__ ((format (printf, 3, 4)));
-extern void ext4_warning(struct super_block *, const char *, const char *, ...)
+extern void __ext4_warning(struct super_block *, const char *,
+			  const char *, ...)
 	__attribute__ ((format (printf, 3, 4)));
+#define ext4_warning(sb, message...)	__ext4_warning(sb, __func__, ## message)
 extern void ext4_msg(struct super_block *, const char *, const char *, ...)
 	__attribute__ ((format (printf, 3, 4)));
 extern void ext4_grp_locked_error(struct super_block *, ext4_group_t,
@@ -1641,6 +1799,7 @@ struct ext4_group_info {
 	ext4_grpblk_t	bb_first_free;	/* first free block */
 	ext4_grpblk_t	bb_free;	/* total free blocks */
 	ext4_grpblk_t	bb_fragments;	/* nr of freespace fragments */
+	ext4_grpblk_t	bb_largest_free_order;/* order of largest frag in BG */
 	struct          list_head bb_prealloc_list;
 #ifdef DOUBLE_CHECK
 	void            *bb_bitmap;
@@ -1735,16 +1894,17 @@ extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
 extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
 extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
 				       int chunk);
-extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
-			       ext4_lblk_t iblock, unsigned int max_blocks,
-			       struct buffer_head *bh_result, int flags);
+extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
+			       struct ext4_map_blocks *map, int flags);
 extern void ext4_ext_truncate(struct inode *);
 extern void ext4_ext_init(struct super_block *);
 extern void ext4_ext_release(struct super_block *);
 extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
 			  loff_t len);
 extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
-			  loff_t len);
+			  ssize_t len);
+extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
+			   struct ext4_map_blocks *map, int flags);
 extern int ext4_get_blocks(handle_t *handle, struct inode *inode,
 			   sector_t block, unsigned int max_blocks,
 			   struct buffer_head *bh, int flags);
@@ -1756,6 +1916,15 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
 			     __u64 len, __u64 *moved_len);
 
 
+/* BH_Uninit flag: blocks are allocated but uninitialized on disk */
+enum ext4_state_bits {
+	BH_Uninit	/* blocks are allocated but uninitialized on disk */
+	  = BH_JBDPrivateStart,
+};
+
+BUFFER_FNS(Uninit, uninit)
+TAS_BUFFER_FNS(Uninit, uninit)
+
 /*
  * Add new method to test wether block and inode bitmaps are properly
  * initialized. With uninit_bg reading the block from disk is not enough
@@ -1773,6 +1942,8 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh)
 	set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state);
 }
 
+#define in_range(b, first, len)	((b) >= (first) && (b) <= (first) + (len) - 1)
+
 #endif	/* __KERNEL__ */
 
 #endif	/* _EXT4_H */
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index b57e5c711b6d..53d2764d71ca 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -125,14 +125,14 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
 			ext4_journal_abort_handle(where, __func__, bh,
 						  handle, err);
 	} else {
-		if (inode && bh)
+		if (inode)
 			mark_buffer_dirty_inode(bh, inode);
 		else
 			mark_buffer_dirty(bh);
 		if (inode && inode_needs_sync(inode)) {
 			sync_dirty_buffer(bh);
 			if (buffer_req(bh) && !buffer_uptodate(bh)) {
-				ext4_error(inode->i_sb, __func__,
+				ext4_error(inode->i_sb,
 					   "IO error syncing inode, "
 					   "inode=%lu, block=%llu",
 					   inode->i_ino,
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 05eca817d704..dade0c024797 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -273,7 +273,7 @@ static inline int ext4_should_journal_data(struct inode *inode)
 		return 1;
 	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
 		return 1;
-	if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL)
+	if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
 		return 1;
 	return 0;
 }
@@ -284,7 +284,7 @@ static inline int ext4_should_order_data(struct inode *inode)
 		return 0;
 	if (!S_ISREG(inode->i_mode))
 		return 0;
-	if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL)
+	if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
 		return 0;
 	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
 		return 1;
@@ -297,11 +297,35 @@ static inline int ext4_should_writeback_data(struct inode *inode)
 		return 0;
 	if (EXT4_JOURNAL(inode) == NULL)
 		return 1;
-	if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL)
+	if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
 		return 0;
 	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
 		return 1;
 	return 0;
 }
 
+/*
+ * This function controls whether or not we should try to go down the
+ * dioread_nolock code paths, which makes it safe to avoid taking
+ * i_mutex for direct I/O reads.  This only works for extent-based
+ * files, and it doesn't work for nobh or if data journaling is
+ * enabled, since the dioread_nolock code uses b_private to pass
+ * information back to the I/O completion handler, and this conflicts
+ * with the jbd's use of b_private.
+ */
+static inline int ext4_should_dioread_nolock(struct inode *inode)
+{
+	if (!test_opt(inode->i_sb, DIOREAD_NOLOCK))
+		return 0;
+	if (test_opt(inode->i_sb, NOBH))
+		return 0;
+	if (!S_ISREG(inode->i_mode))
+		return 0;
+	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
+		return 0;
+	if (ext4_should_journal_data(inode))
+		return 0;
+	return 1;
+}
+
 #endif	/* _EXT4_JBD2_H */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 765a4826b118..377309c1af65 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -107,11 +107,8 @@ static int ext4_ext_truncate_extend_restart(handle_t *handle,
 	if (err <= 0)
 		return err;
 	err = ext4_truncate_restart_trans(handle, inode, needed);
-	/*
-	 * We have dropped i_data_sem so someone might have cached again
-	 * an extent we are going to truncate.
-	 */
-	ext4_ext_invalidate_cache(inode);
+	if (err == 0)
+		err = -EAGAIN;
 
 	return err;
 }
@@ -185,18 +182,17 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
 	if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
 		/*
 		 * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME
-		 * block groups per flexgroup, reserve the first block 
-		 * group for directories and special files.  Regular 
+		 * block groups per flexgroup, reserve the first block
+		 * group for directories and special files.  Regular
 		 * files will start at the second block group.  This
-		 * tends to speed up directory access and improves 
+		 * tends to speed up directory access and improves
 		 * fsck times.
 		 */
 		block_group &= ~(flex_size-1);
 		if (S_ISREG(inode->i_mode))
 			block_group++;
 	}
-	bg_start = (block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) +
-		le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_first_data_block);
+	bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
 	last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
 
 	/*
@@ -440,10 +436,10 @@ static int __ext4_ext_check(const char *function, struct inode *inode,
 	return 0;
 
 corrupted:
-	ext4_error(inode->i_sb, function,
-			"bad header/extent in inode #%lu: %s - magic %x, "
+	ext4_error_inode(function, inode,
+			"bad header/extent: %s - magic %x, "
 			"entries %u, max %u(%u), depth %u(%u)",
-			inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic),
+			error_msg, le16_to_cpu(eh->eh_magic),
 			le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
 			max, le16_to_cpu(eh->eh_depth), depth);
 
@@ -703,7 +699,12 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
 		}
 		eh = ext_block_hdr(bh);
 		ppos++;
-		BUG_ON(ppos > depth);
+		if (unlikely(ppos > depth)) {
+			put_bh(bh);
+			EXT4_ERROR_INODE(inode,
+					 "ppos %d > depth %d", ppos, depth);
+			goto err;
+		}
 		path[ppos].p_bh = bh;
 		path[ppos].p_hdr = eh;
 		i--;
@@ -749,7 +750,12 @@ int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
 	if (err)
 		return err;
 
-	BUG_ON(logical == le32_to_cpu(curp->p_idx->ei_block));
+	if (unlikely(logical == le32_to_cpu(curp->p_idx->ei_block))) {
+		EXT4_ERROR_INODE(inode,
+				 "logical %d == ei_block %d!",
+				 logical, le32_to_cpu(curp->p_idx->ei_block));
+		return -EIO;
+	}
 	len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx;
 	if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
 		/* insert after */
@@ -779,9 +785,17 @@ int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
 	ext4_idx_store_pblock(ix, ptr);
 	le16_add_cpu(&curp->p_hdr->eh_entries, 1);
 
-	BUG_ON(le16_to_cpu(curp->p_hdr->eh_entries)
-			     > le16_to_cpu(curp->p_hdr->eh_max));
-	BUG_ON(ix > EXT_LAST_INDEX(curp->p_hdr));
+	if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries)
+			     > le16_to_cpu(curp->p_hdr->eh_max))) {
+		EXT4_ERROR_INODE(inode,
+				 "logical %d == ei_block %d!",
+				 logical, le32_to_cpu(curp->p_idx->ei_block));
+		return -EIO;
+	}
+	if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) {
+		EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!");
+		return -EIO;
+	}
 
 	err = ext4_ext_dirty(handle, inode, curp);
 	ext4_std_error(inode->i_sb, err);
@@ -819,7 +833,10 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 
 	/* if current leaf will be split, then we should use
 	 * border from split point */
-	BUG_ON(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr));
+	if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) {
+		EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!");
+		return -EIO;
+	}
 	if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
 		border = path[depth].p_ext[1].ee_block;
 		ext_debug("leaf will be split."
@@ -860,7 +877,11 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 
 	/* initialize new leaf */
 	newblock = ablocks[--a];
-	BUG_ON(newblock == 0);
+	if (unlikely(newblock == 0)) {
+		EXT4_ERROR_INODE(inode, "newblock == 0!");
+		err = -EIO;
+		goto cleanup;
+	}
 	bh = sb_getblk(inode->i_sb, newblock);
 	if (!bh) {
 		err = -EIO;
@@ -880,7 +901,14 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 	ex = EXT_FIRST_EXTENT(neh);
 
 	/* move remainder of path[depth] to the new leaf */
-	BUG_ON(path[depth].p_hdr->eh_entries != path[depth].p_hdr->eh_max);
+	if (unlikely(path[depth].p_hdr->eh_entries !=
+		     path[depth].p_hdr->eh_max)) {
+		EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!",
+				 path[depth].p_hdr->eh_entries,
+				 path[depth].p_hdr->eh_max);
+		err = -EIO;
+		goto cleanup;
+	}
 	/* start copy from next extent */
 	/* TODO: we could do it by single memmove */
 	m = 0;
@@ -927,7 +955,11 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 
 	/* create intermediate indexes */
 	k = depth - at - 1;
-	BUG_ON(k < 0);
+	if (unlikely(k < 0)) {
+		EXT4_ERROR_INODE(inode, "k %d < 0!", k);
+		err = -EIO;
+		goto cleanup;
+	}
 	if (k)
 		ext_debug("create %d intermediate indices\n", k);
 	/* insert new index into current index block */
@@ -964,8 +996,14 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 
 		ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
 				EXT_MAX_INDEX(path[i].p_hdr));
-		BUG_ON(EXT_MAX_INDEX(path[i].p_hdr) !=
-				EXT_LAST_INDEX(path[i].p_hdr));
+		if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) !=
+					EXT_LAST_INDEX(path[i].p_hdr))) {
+			EXT4_ERROR_INODE(inode,
+					 "EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!",
+					 le32_to_cpu(path[i].p_ext->ee_block));
+			err = -EIO;
+			goto cleanup;
+		}
 		while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) {
 			ext_debug("%d: move %d:%llu in new index %llu\n", i,
 					le32_to_cpu(path[i].p_idx->ei_block),
@@ -1203,7 +1241,10 @@ ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path,
 	struct ext4_extent *ex;
 	int depth, ee_len;
 
-	BUG_ON(path == NULL);
+	if (unlikely(path == NULL)) {
+		EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
+		return -EIO;
+	}
 	depth = path->p_depth;
 	*phys = 0;
 
@@ -1217,15 +1258,33 @@ ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path,
 	ex = path[depth].p_ext;
 	ee_len = ext4_ext_get_actual_len(ex);
 	if (*logical < le32_to_cpu(ex->ee_block)) {
-		BUG_ON(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex);
+		if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
+			EXT4_ERROR_INODE(inode,
+					 "EXT_FIRST_EXTENT != ex *logical %d ee_block %d!",
+					 *logical, le32_to_cpu(ex->ee_block));
+			return -EIO;
+		}
 		while (--depth >= 0) {
 			ix = path[depth].p_idx;
-			BUG_ON(ix != EXT_FIRST_INDEX(path[depth].p_hdr));
+			if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
+				EXT4_ERROR_INODE(inode,
+				  "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!",
+				  ix != NULL ? ix->ei_block : 0,
+				  EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ?
+				    EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block : 0,
+				  depth);
+				return -EIO;
+			}
 		}
 		return 0;
 	}
 
-	BUG_ON(*logical < (le32_to_cpu(ex->ee_block) + ee_len));
+	if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
+		EXT4_ERROR_INODE(inode,
+				 "logical %d < ee_block %d + ee_len %d!",
+				 *logical, le32_to_cpu(ex->ee_block), ee_len);
+		return -EIO;
+	}
 
 	*logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
 	*phys = ext_pblock(ex) + ee_len - 1;
@@ -1251,7 +1310,10 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
 	int depth;	/* Note, NOT eh_depth; depth from top of tree */
 	int ee_len;
 
-	BUG_ON(path == NULL);
+	if (unlikely(path == NULL)) {
+		EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
+		return -EIO;
+	}
 	depth = path->p_depth;
 	*phys = 0;
 
@@ -1265,17 +1327,32 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
 	ex = path[depth].p_ext;
 	ee_len = ext4_ext_get_actual_len(ex);
 	if (*logical < le32_to_cpu(ex->ee_block)) {
-		BUG_ON(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex);
+		if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
+			EXT4_ERROR_INODE(inode,
+					 "first_extent(path[%d].p_hdr) != ex",
+					 depth);
+			return -EIO;
+		}
 		while (--depth >= 0) {
 			ix = path[depth].p_idx;
-			BUG_ON(ix != EXT_FIRST_INDEX(path[depth].p_hdr));
+			if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
+				EXT4_ERROR_INODE(inode,
+						 "ix != EXT_FIRST_INDEX *logical %d!",
+						 *logical);
+				return -EIO;
+			}
 		}
 		*logical = le32_to_cpu(ex->ee_block);
 		*phys = ext_pblock(ex);
 		return 0;
 	}
 
-	BUG_ON(*logical < (le32_to_cpu(ex->ee_block) + ee_len));
+	if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
+		EXT4_ERROR_INODE(inode,
+				 "logical %d < ee_block %d + ee_len %d!",
+				 *logical, le32_to_cpu(ex->ee_block), ee_len);
+		return -EIO;
+	}
 
 	if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {
 		/* next allocated block in this leaf */
@@ -1414,8 +1491,12 @@ static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
 
 	eh = path[depth].p_hdr;
 	ex = path[depth].p_ext;
-	BUG_ON(ex == NULL);
-	BUG_ON(eh == NULL);
+
+	if (unlikely(ex == NULL || eh == NULL)) {
+		EXT4_ERROR_INODE(inode,
+				 "ex %p == NULL or eh %p == NULL", ex, eh);
+		return -EIO;
+	}
 
 	if (depth == 0) {
 		/* there is no tree at all */
@@ -1538,8 +1619,7 @@ int ext4_ext_try_to_merge(struct inode *inode,
 		merge_done = 1;
 		WARN_ON(eh->eh_entries == 0);
 		if (!eh->eh_entries)
-			ext4_error(inode->i_sb, "ext4_ext_try_to_merge",
-			   "inode#%lu, eh->eh_entries = 0!", inode->i_ino);
+			EXT4_ERROR_INODE(inode, "eh->eh_entries = 0!");
 	}
 
 	return merge_done;
@@ -1612,13 +1692,19 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
 	ext4_lblk_t next;
 	unsigned uninitialized = 0;
 
-	BUG_ON(ext4_ext_get_actual_len(newext) == 0);
+	if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
+		EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
+		return -EIO;
+	}
 	depth = ext_depth(inode);
 	ex = path[depth].p_ext;
-	BUG_ON(path[depth].p_hdr == NULL);
+	if (unlikely(path[depth].p_hdr == NULL)) {
+		EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
+		return -EIO;
+	}
 
 	/* try to insert block into found extent and return */
-	if (ex && (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT)
+	if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)
 		&& ext4_can_extents_be_merged(inode, ex, newext)) {
 		ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
 				ext4_ext_is_uninitialized(newext),
@@ -1739,7 +1825,7 @@ has_space:
 
 merge:
 	/* try to merge extents to the right */
-	if (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT)
+	if (!(flag & EXT4_GET_BLOCKS_PRE_IO))
 		ext4_ext_try_to_merge(inode, path, nearex);
 
 	/* try to merge extents to the left */
@@ -1787,7 +1873,11 @@ int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
 		}
 
 		depth = ext_depth(inode);
-		BUG_ON(path[depth].p_hdr == NULL);
+		if (unlikely(path[depth].p_hdr == NULL)) {
+			EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
+			err = -EIO;
+			break;
+		}
 		ex = path[depth].p_ext;
 		next = ext4_ext_next_allocated_block(path);
 
@@ -1838,7 +1928,11 @@ int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
 			cbex.ec_type = EXT4_EXT_CACHE_EXTENT;
 		}
 
-		BUG_ON(cbex.ec_len == 0);
+		if (unlikely(cbex.ec_len == 0)) {
+			EXT4_ERROR_INODE(inode, "cbex.ec_len == 0");
+			err = -EIO;
+			break;
+		}
 		err = func(inode, path, &cbex, ex, cbdata);
 		ext4_ext_drop_refs(path);
 
@@ -1940,7 +2034,7 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
 	struct ext4_ext_cache *cex;
 	int ret = EXT4_EXT_CACHE_NO;
 
-	/* 
+	/*
 	 * We borrow i_block_reservation_lock to protect i_cached_extent
 	 */
 	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
@@ -1952,7 +2046,7 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
 
 	BUG_ON(cex->ec_type != EXT4_EXT_CACHE_GAP &&
 			cex->ec_type != EXT4_EXT_CACHE_EXTENT);
-	if (block >= cex->ec_block && block < cex->ec_block + cex->ec_len) {
+	if (in_range(block, cex->ec_block, cex->ec_len)) {
 		ex->ee_block = cpu_to_le32(cex->ec_block);
 		ext4_ext_store_pblock(ex, cex->ec_start);
 		ex->ee_len = cpu_to_le16(cex->ec_len);
@@ -1981,7 +2075,10 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
 	/* free index block */
 	path--;
 	leaf = idx_pblock(path->p_idx);
-	BUG_ON(path->p_hdr->eh_entries == 0);
+	if (unlikely(path->p_hdr->eh_entries == 0)) {
+		EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0");
+		return -EIO;
+	}
 	err = ext4_ext_get_access(handle, inode, path);
 	if (err)
 		return err;
@@ -2119,8 +2216,10 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 	if (!path[depth].p_hdr)
 		path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);
 	eh = path[depth].p_hdr;
-	BUG_ON(eh == NULL);
-
+	if (unlikely(path[depth].p_hdr == NULL)) {
+		EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
+		return -EIO;
+	}
 	/* find where to start removing */
 	ex = EXT_LAST_EXTENT(eh);
 
@@ -2257,7 +2356,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
 	int depth = ext_depth(inode);
 	struct ext4_ext_path *path;
 	handle_t *handle;
-	int i = 0, err = 0;
+	int i, err;
 
 	ext_debug("truncate since %u\n", start);
 
@@ -2266,23 +2365,26 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
 
+again:
 	ext4_ext_invalidate_cache(inode);
 
 	/*
 	 * We start scanning from right side, freeing all the blocks
 	 * after i_size and walking into the tree depth-wise.
 	 */
+	depth = ext_depth(inode);
 	path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_NOFS);
 	if (path == NULL) {
 		ext4_journal_stop(handle);
 		return -ENOMEM;
 	}
+	path[0].p_depth = depth;
 	path[0].p_hdr = ext_inode_hdr(inode);
 	if (ext4_ext_check(inode, path[0].p_hdr, depth)) {
 		err = -EIO;
 		goto out;
 	}
-	path[0].p_depth = depth;
+	i = err = 0;
 
 	while (i >= 0 && err == 0) {
 		if (i == depth) {
@@ -2376,6 +2478,8 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
 out:
 	ext4_ext_drop_refs(path);
 	kfree(path);
+	if (err == -EAGAIN)
+		goto again;
 	ext4_journal_stop(handle);
 
 	return err;
@@ -2440,7 +2544,7 @@ static void bi_complete(struct bio *bio, int error)
 /* FIXME!! we need to try to merge to left or right after zero-out  */
 static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
 {
-	int ret = -EIO;
+	int ret;
 	struct bio *bio;
 	int blkbits, blocksize;
 	sector_t ee_pblock;
@@ -2464,6 +2568,9 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
 			len = ee_len;
 
 		bio = bio_alloc(GFP_NOIO, len);
+		if (!bio)
+			return -ENOMEM;
+
 		bio->bi_sector = ee_pblock;
 		bio->bi_bdev   = inode->i_sb->s_bdev;
 
@@ -2491,22 +2598,20 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
 		submit_bio(WRITE, bio);
 		wait_for_completion(&event);
 
-		if (test_bit(BIO_UPTODATE, &bio->bi_flags))
-			ret = 0;
-		else {
-			ret = -EIO;
-			break;
+		if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
+			bio_put(bio);
+			return -EIO;
 		}
 		bio_put(bio);
 		ee_len    -= done;
 		ee_pblock += done  << (blkbits - 9);
 	}
-	return ret;
+	return 0;
 }
 
 #define EXT4_EXT_ZERO_LEN 7
 /*
- * This function is called by ext4_ext_get_blocks() if someone tries to write
+ * This function is called by ext4_ext_map_blocks() if someone tries to write
  * to an uninitialized extent. It may result in splitting the uninitialized
  * extent into multiple extents (upto three - one initialized and two
  * uninitialized).
@@ -2516,39 +2621,55 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
  *   c> Splits in three extents: Somone is writing in middle of the extent
  */
 static int ext4_ext_convert_to_initialized(handle_t *handle,
-						struct inode *inode,
-						struct ext4_ext_path *path,
-						ext4_lblk_t iblock,
-						unsigned int max_blocks)
+					   struct inode *inode,
+					   struct ext4_map_blocks *map,
+					   struct ext4_ext_path *path)
 {
 	struct ext4_extent *ex, newex, orig_ex;
 	struct ext4_extent *ex1 = NULL;
 	struct ext4_extent *ex2 = NULL;
 	struct ext4_extent *ex3 = NULL;
 	struct ext4_extent_header *eh;
-	ext4_lblk_t ee_block;
+	ext4_lblk_t ee_block, eof_block;
 	unsigned int allocated, ee_len, depth;
 	ext4_fsblk_t newblock;
 	int err = 0;
 	int ret = 0;
+	int may_zeroout;
+
+	ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical"
+		"block %llu, max_blocks %u\n", inode->i_ino,
+		(unsigned long long)map->m_lblk, map->m_len);
+
+	eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
+		inode->i_sb->s_blocksize_bits;
+	if (eof_block < map->m_lblk + map->m_len)
+		eof_block = map->m_lblk + map->m_len;
 
 	depth = ext_depth(inode);
 	eh = path[depth].p_hdr;
 	ex = path[depth].p_ext;
 	ee_block = le32_to_cpu(ex->ee_block);
 	ee_len = ext4_ext_get_actual_len(ex);
-	allocated = ee_len - (iblock - ee_block);
-	newblock = iblock - ee_block + ext_pblock(ex);
+	allocated = ee_len - (map->m_lblk - ee_block);
+	newblock = map->m_lblk - ee_block + ext_pblock(ex);
+
 	ex2 = ex;
 	orig_ex.ee_block = ex->ee_block;
 	orig_ex.ee_len   = cpu_to_le16(ee_len);
 	ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
 
+	/*
+	 * It is safe to convert extent to initialized via explicit
+	 * zeroout only if extent is fully insde i_size or new_size.
+	 */
+	may_zeroout = ee_block + ee_len <= eof_block;
+
 	err = ext4_ext_get_access(handle, inode, path + depth);
 	if (err)
 		goto out;
 	/* If extent has less than 2*EXT4_EXT_ZERO_LEN zerout directly */
-	if (ee_len <= 2*EXT4_EXT_ZERO_LEN) {
+	if (ee_len <= 2*EXT4_EXT_ZERO_LEN && may_zeroout) {
 		err =  ext4_ext_zeroout(inode, &orig_ex);
 		if (err)
 			goto fix_extent_len;
@@ -2561,10 +2682,10 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 		return allocated;
 	}
 
-	/* ex1: ee_block to iblock - 1 : uninitialized */
-	if (iblock > ee_block) {
+	/* ex1: ee_block to map->m_lblk - 1 : uninitialized */
+	if (map->m_lblk > ee_block) {
 		ex1 = ex;
-		ex1->ee_len = cpu_to_le16(iblock - ee_block);
+		ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
 		ext4_ext_mark_uninitialized(ex1);
 		ex2 = &newex;
 	}
@@ -2573,15 +2694,15 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 	 * we insert ex3, if ex1 is NULL. This is to avoid temporary
 	 * overlap of blocks.
 	 */
-	if (!ex1 && allocated > max_blocks)
-		ex2->ee_len = cpu_to_le16(max_blocks);
+	if (!ex1 && allocated > map->m_len)
+		ex2->ee_len = cpu_to_le16(map->m_len);
 	/* ex3: to ee_block + ee_len : uninitialised */
-	if (allocated > max_blocks) {
+	if (allocated > map->m_len) {
 		unsigned int newdepth;
 		/* If extent has less than EXT4_EXT_ZERO_LEN zerout directly */
-		if (allocated <= EXT4_EXT_ZERO_LEN) {
+		if (allocated <= EXT4_EXT_ZERO_LEN && may_zeroout) {
 			/*
-			 * iblock == ee_block is handled by the zerouout
+			 * map->m_lblk == ee_block is handled by the zerouout
 			 * at the beginning.
 			 * Mark first half uninitialized.
 			 * Mark second half initialized and zero out the
@@ -2594,7 +2715,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 			ext4_ext_dirty(handle, inode, path + depth);
 
 			ex3 = &newex;
-			ex3->ee_block = cpu_to_le32(iblock);
+			ex3->ee_block = cpu_to_le32(map->m_lblk);
 			ext4_ext_store_pblock(ex3, newblock);
 			ex3->ee_len = cpu_to_le16(allocated);
 			err = ext4_ext_insert_extent(handle, inode, path,
@@ -2607,7 +2728,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 				ex->ee_len   = orig_ex.ee_len;
 				ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
 				ext4_ext_dirty(handle, inode, path + depth);
-				/* blocks available from iblock */
+				/* blocks available from map->m_lblk */
 				return allocated;
 
 			} else if (err)
@@ -2629,8 +2750,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 				 */
 				depth = ext_depth(inode);
 				ext4_ext_drop_refs(path);
-				path = ext4_ext_find_extent(inode,
-								iblock, path);
+				path = ext4_ext_find_extent(inode, map->m_lblk,
+							    path);
 				if (IS_ERR(path)) {
 					err = PTR_ERR(path);
 					return err;
@@ -2650,12 +2771,12 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 			return allocated;
 		}
 		ex3 = &newex;
-		ex3->ee_block = cpu_to_le32(iblock + max_blocks);
-		ext4_ext_store_pblock(ex3, newblock + max_blocks);
-		ex3->ee_len = cpu_to_le16(allocated - max_blocks);
+		ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len);
+		ext4_ext_store_pblock(ex3, newblock + map->m_len);
+		ex3->ee_len = cpu_to_le16(allocated - map->m_len);
 		ext4_ext_mark_uninitialized(ex3);
 		err = ext4_ext_insert_extent(handle, inode, path, ex3, 0);
-		if (err == -ENOSPC) {
+		if (err == -ENOSPC && may_zeroout) {
 			err =  ext4_ext_zeroout(inode, &orig_ex);
 			if (err)
 				goto fix_extent_len;
@@ -2665,7 +2786,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 			ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
 			ext4_ext_dirty(handle, inode, path + depth);
 			/* zeroed the full extent */
-			/* blocks available from iblock */
+			/* blocks available from map->m_lblk */
 			return allocated;
 
 		} else if (err)
@@ -2679,11 +2800,13 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 		 * update the extent length after successful insert of the
 		 * split extent
 		 */
-		orig_ex.ee_len = cpu_to_le16(ee_len -
-						ext4_ext_get_actual_len(ex3));
+		ee_len -= ext4_ext_get_actual_len(ex3);
+		orig_ex.ee_len = cpu_to_le16(ee_len);
+		may_zeroout = ee_block + ee_len <= eof_block;
+
 		depth = newdepth;
 		ext4_ext_drop_refs(path);
-		path = ext4_ext_find_extent(inode, iblock, path);
+		path = ext4_ext_find_extent(inode, map->m_lblk, path);
 		if (IS_ERR(path)) {
 			err = PTR_ERR(path);
 			goto out;
@@ -2697,14 +2820,14 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 		if (err)
 			goto out;
 
-		allocated = max_blocks;
+		allocated = map->m_len;
 
 		/* If extent has less than EXT4_EXT_ZERO_LEN and we are trying
 		 * to insert a extent in the middle zerout directly
 		 * otherwise give the extent a chance to merge to left
 		 */
 		if (le16_to_cpu(orig_ex.ee_len) <= EXT4_EXT_ZERO_LEN &&
-							iblock != ee_block) {
+			map->m_lblk != ee_block && may_zeroout) {
 			err =  ext4_ext_zeroout(inode, &orig_ex);
 			if (err)
 				goto fix_extent_len;
@@ -2714,7 +2837,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 			ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
 			ext4_ext_dirty(handle, inode, path + depth);
 			/* zero out the first half */
-			/* blocks available from iblock */
+			/* blocks available from map->m_lblk */
 			return allocated;
 		}
 	}
@@ -2725,12 +2848,12 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 	 */
 	if (ex1 && ex1 != ex) {
 		ex1 = ex;
-		ex1->ee_len = cpu_to_le16(iblock - ee_block);
+		ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
 		ext4_ext_mark_uninitialized(ex1);
 		ex2 = &newex;
 	}
-	/* ex2: iblock to iblock + maxblocks-1 : initialised */
-	ex2->ee_block = cpu_to_le32(iblock);
+	/* ex2: map->m_lblk to map->m_lblk + maxblocks-1 : initialised */
+	ex2->ee_block = cpu_to_le32(map->m_lblk);
 	ext4_ext_store_pblock(ex2, newblock);
 	ex2->ee_len = cpu_to_le16(allocated);
 	if (ex2 != ex)
@@ -2773,7 +2896,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 	goto out;
 insert:
 	err = ext4_ext_insert_extent(handle, inode, path, &newex, 0);
-	if (err == -ENOSPC) {
+	if (err == -ENOSPC && may_zeroout) {
 		err =  ext4_ext_zeroout(inode, &orig_ex);
 		if (err)
 			goto fix_extent_len;
@@ -2800,7 +2923,7 @@ fix_extent_len:
 }
 
 /*
- * This function is called by ext4_ext_get_blocks() from
+ * This function is called by ext4_ext_map_blocks() from
  * ext4_get_blocks_dio_write() when DIO to write
  * to an uninitialized extent.
  *
@@ -2823,9 +2946,8 @@ fix_extent_len:
  */
 static int ext4_split_unwritten_extents(handle_t *handle,
 					struct inode *inode,
+					struct ext4_map_blocks *map,
 					struct ext4_ext_path *path,
-					ext4_lblk_t iblock,
-					unsigned int max_blocks,
 					int flags)
 {
 	struct ext4_extent *ex, newex, orig_ex;
@@ -2833,41 +2955,55 @@ static int ext4_split_unwritten_extents(handle_t *handle,
 	struct ext4_extent *ex2 = NULL;
 	struct ext4_extent *ex3 = NULL;
 	struct ext4_extent_header *eh;
-	ext4_lblk_t ee_block;
+	ext4_lblk_t ee_block, eof_block;
 	unsigned int allocated, ee_len, depth;
 	ext4_fsblk_t newblock;
 	int err = 0;
+	int may_zeroout;
+
+	ext_debug("ext4_split_unwritten_extents: inode %lu, logical"
+		"block %llu, max_blocks %u\n", inode->i_ino,
+		(unsigned long long)map->m_lblk, map->m_len);
+
+	eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
+		inode->i_sb->s_blocksize_bits;
+	if (eof_block < map->m_lblk + map->m_len)
+		eof_block = map->m_lblk + map->m_len;
 
-	ext_debug("ext4_split_unwritten_extents: inode %lu,"
-		  "iblock %llu, max_blocks %u\n", inode->i_ino,
-		  (unsigned long long)iblock, max_blocks);
 	depth = ext_depth(inode);
 	eh = path[depth].p_hdr;
 	ex = path[depth].p_ext;
 	ee_block = le32_to_cpu(ex->ee_block);
 	ee_len = ext4_ext_get_actual_len(ex);
-	allocated = ee_len - (iblock - ee_block);
-	newblock = iblock - ee_block + ext_pblock(ex);
+	allocated = ee_len - (map->m_lblk - ee_block);
+	newblock = map->m_lblk - ee_block + ext_pblock(ex);
+
 	ex2 = ex;
 	orig_ex.ee_block = ex->ee_block;
 	orig_ex.ee_len   = cpu_to_le16(ee_len);
 	ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
 
 	/*
+	 * It is safe to convert extent to initialized via explicit
+	 * zeroout only if extent is fully insde i_size or new_size.
+	 */
+	may_zeroout = ee_block + ee_len <= eof_block;
+
+	/*
  	 * If the uninitialized extent begins at the same logical
  	 * block where the write begins, and the write completely
  	 * covers the extent, then we don't need to split it.
  	 */
-	if ((iblock == ee_block) && (allocated <= max_blocks))
+	if ((map->m_lblk == ee_block) && (allocated <= map->m_len))
 		return allocated;
 
 	err = ext4_ext_get_access(handle, inode, path + depth);
 	if (err)
 		goto out;
-	/* ex1: ee_block to iblock - 1 : uninitialized */
-	if (iblock > ee_block) {
+	/* ex1: ee_block to map->m_lblk - 1 : uninitialized */
+	if (map->m_lblk > ee_block) {
 		ex1 = ex;
-		ex1->ee_len = cpu_to_le16(iblock - ee_block);
+		ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
 		ext4_ext_mark_uninitialized(ex1);
 		ex2 = &newex;
 	}
@@ -2876,18 +3012,18 @@ static int ext4_split_unwritten_extents(handle_t *handle,
 	 * we insert ex3, if ex1 is NULL. This is to avoid temporary
 	 * overlap of blocks.
 	 */
-	if (!ex1 && allocated > max_blocks)
-		ex2->ee_len = cpu_to_le16(max_blocks);
+	if (!ex1 && allocated > map->m_len)
+		ex2->ee_len = cpu_to_le16(map->m_len);
 	/* ex3: to ee_block + ee_len : uninitialised */
-	if (allocated > max_blocks) {
+	if (allocated > map->m_len) {
 		unsigned int newdepth;
 		ex3 = &newex;
-		ex3->ee_block = cpu_to_le32(iblock + max_blocks);
-		ext4_ext_store_pblock(ex3, newblock + max_blocks);
-		ex3->ee_len = cpu_to_le16(allocated - max_blocks);
+		ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len);
+		ext4_ext_store_pblock(ex3, newblock + map->m_len);
+		ex3->ee_len = cpu_to_le16(allocated - map->m_len);
 		ext4_ext_mark_uninitialized(ex3);
 		err = ext4_ext_insert_extent(handle, inode, path, ex3, flags);
-		if (err == -ENOSPC) {
+		if (err == -ENOSPC && may_zeroout) {
 			err =  ext4_ext_zeroout(inode, &orig_ex);
 			if (err)
 				goto fix_extent_len;
@@ -2897,7 +3033,7 @@ static int ext4_split_unwritten_extents(handle_t *handle,
 			ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
 			ext4_ext_dirty(handle, inode, path + depth);
 			/* zeroed the full extent */
-			/* blocks available from iblock */
+			/* blocks available from map->m_lblk */
 			return allocated;
 
 		} else if (err)
@@ -2911,11 +3047,13 @@ static int ext4_split_unwritten_extents(handle_t *handle,
 		 * update the extent length after successful insert of the
 		 * split extent
 		 */
-		orig_ex.ee_len = cpu_to_le16(ee_len -
-						ext4_ext_get_actual_len(ex3));
+		ee_len -= ext4_ext_get_actual_len(ex3);
+		orig_ex.ee_len = cpu_to_le16(ee_len);
+		may_zeroout = ee_block + ee_len <= eof_block;
+
 		depth = newdepth;
 		ext4_ext_drop_refs(path);
-		path = ext4_ext_find_extent(inode, iblock, path);
+		path = ext4_ext_find_extent(inode, map->m_lblk, path);
 		if (IS_ERR(path)) {
 			err = PTR_ERR(path);
 			goto out;
@@ -2929,7 +3067,7 @@ static int ext4_split_unwritten_extents(handle_t *handle,
 		if (err)
 			goto out;
 
-		allocated = max_blocks;
+		allocated = map->m_len;
 	}
 	/*
 	 * If there was a change of depth as part of the
@@ -2938,15 +3076,15 @@ static int ext4_split_unwritten_extents(handle_t *handle,
 	 */
 	if (ex1 && ex1 != ex) {
 		ex1 = ex;
-		ex1->ee_len = cpu_to_le16(iblock - ee_block);
+		ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
 		ext4_ext_mark_uninitialized(ex1);
 		ex2 = &newex;
 	}
 	/*
-	 * ex2: iblock to iblock + maxblocks-1 : to be direct IO written,
-	 * uninitialised still.
+	 * ex2: map->m_lblk to map->m_lblk + map->m_len-1 : to be written
+	 * using direct I/O, uninitialised still.
 	 */
-	ex2->ee_block = cpu_to_le32(iblock);
+	ex2->ee_block = cpu_to_le32(map->m_lblk);
 	ext4_ext_store_pblock(ex2, newblock);
 	ex2->ee_len = cpu_to_le16(allocated);
 	ext4_ext_mark_uninitialized(ex2);
@@ -2958,7 +3096,7 @@ static int ext4_split_unwritten_extents(handle_t *handle,
 	goto out;
 insert:
 	err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
-	if (err == -ENOSPC) {
+	if (err == -ENOSPC && may_zeroout) {
 		err =  ext4_ext_zeroout(inode, &orig_ex);
 		if (err)
 			goto fix_extent_len;
@@ -2983,7 +3121,7 @@ fix_extent_len:
 	ext4_ext_dirty(handle, inode, path + depth);
 	return err;
 }
-static int ext4_convert_unwritten_extents_dio(handle_t *handle,
+static int ext4_convert_unwritten_extents_endio(handle_t *handle,
 					      struct inode *inode,
 					      struct ext4_ext_path *path)
 {
@@ -3048,10 +3186,9 @@ static void unmap_underlying_metadata_blocks(struct block_device *bdev,
 
 static int
 ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
-			ext4_lblk_t iblock, unsigned int max_blocks,
+			struct ext4_map_blocks *map,
 			struct ext4_ext_path *path, int flags,
-			unsigned int allocated, struct buffer_head *bh_result,
-			ext4_fsblk_t newblock)
+			unsigned int allocated, ext4_fsblk_t newblock)
 {
 	int ret = 0;
 	int err = 0;
@@ -3059,29 +3196,30 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
 
 	ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical"
 		  "block %llu, max_blocks %u, flags %d, allocated %u",
-		  inode->i_ino, (unsigned long long)iblock, max_blocks,
+		  inode->i_ino, (unsigned long long)map->m_lblk, map->m_len,
 		  flags, allocated);
 	ext4_ext_show_leaf(inode, path);
 
-	/* DIO get_block() before submit the IO, split the extent */
-	if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) {
-		ret = ext4_split_unwritten_extents(handle,
-						inode, path, iblock,
-						max_blocks, flags);
+	/* get_block() before submit the IO, split the extent */
+	if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
+		ret = ext4_split_unwritten_extents(handle, inode, map,
+						   path, flags);
 		/*
 		 * Flag the inode(non aio case) or end_io struct (aio case)
 		 * that this IO needs to convertion to written when IO is
 		 * completed
 		 */
 		if (io)
-			io->flag = DIO_AIO_UNWRITTEN;
+			io->flag = EXT4_IO_UNWRITTEN;
 		else
-			EXT4_I(inode)->i_state |= EXT4_STATE_DIO_UNWRITTEN;
+			ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
+		if (ext4_should_dioread_nolock(inode))
+			map->m_flags |= EXT4_MAP_UNINIT;
 		goto out;
 	}
-	/* async DIO end_io complete, convert the filled extent to written */
-	if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) {
-		ret = ext4_convert_unwritten_extents_dio(handle, inode,
+	/* IO end_io complete, convert the filled extent to written */
+	if ((flags & EXT4_GET_BLOCKS_CONVERT)) {
+		ret = ext4_convert_unwritten_extents_endio(handle, inode,
 							path);
 		if (ret >= 0)
 			ext4_update_inode_fsync_trans(handle, inode, 1);
@@ -3104,14 +3242,12 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
 		 * the buffer head will be unmapped so that
 		 * a read from the block returns 0s.
 		 */
-		set_buffer_unwritten(bh_result);
+		map->m_flags |= EXT4_MAP_UNWRITTEN;
 		goto out1;
 	}
 
 	/* buffered write, writepage time, convert*/
-	ret = ext4_ext_convert_to_initialized(handle, inode,
-						path, iblock,
-						max_blocks);
+	ret = ext4_ext_convert_to_initialized(handle, inode, map, path);
 	if (ret >= 0)
 		ext4_update_inode_fsync_trans(handle, inode, 1);
 out:
@@ -3120,7 +3256,7 @@ out:
 		goto out2;
 	} else
 		allocated = ret;
-	set_buffer_new(bh_result);
+	map->m_flags |= EXT4_MAP_NEW;
 	/*
 	 * if we allocated more blocks than requested
 	 * we need to make sure we unmap the extra block
@@ -3128,11 +3264,11 @@ out:
 	 * unmapped later when we find the buffer_head marked
 	 * new.
 	 */
-	if (allocated > max_blocks) {
+	if (allocated > map->m_len) {
 		unmap_underlying_metadata_blocks(inode->i_sb->s_bdev,
-					newblock + max_blocks,
-					allocated - max_blocks);
-		allocated = max_blocks;
+					newblock + map->m_len,
+					allocated - map->m_len);
+		allocated = map->m_len;
 	}
 
 	/*
@@ -3146,13 +3282,13 @@ out:
 		ext4_da_update_reserve_space(inode, allocated, 0);
 
 map_out:
-	set_buffer_mapped(bh_result);
+	map->m_flags |= EXT4_MAP_MAPPED;
 out1:
-	if (allocated > max_blocks)
-		allocated = max_blocks;
+	if (allocated > map->m_len)
+		allocated = map->m_len;
 	ext4_ext_show_leaf(inode, path);
-	bh_result->b_bdev = inode->i_sb->s_bdev;
-	bh_result->b_blocknr = newblock;
+	map->m_pblk = newblock;
+	map->m_len = allocated;
 out2:
 	if (path) {
 		ext4_ext_drop_refs(path);
@@ -3178,26 +3314,23 @@ out2:
  *
  * return < 0, error case.
  */
-int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
-			ext4_lblk_t iblock,
-			unsigned int max_blocks, struct buffer_head *bh_result,
-			int flags)
+int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
+			struct ext4_map_blocks *map, int flags)
 {
 	struct ext4_ext_path *path = NULL;
 	struct ext4_extent_header *eh;
-	struct ext4_extent newex, *ex;
+	struct ext4_extent newex, *ex, *last_ex;
 	ext4_fsblk_t newblock;
-	int err = 0, depth, ret, cache_type;
+	int i, err = 0, depth, ret, cache_type;
 	unsigned int allocated = 0;
 	struct ext4_allocation_request ar;
 	ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
 
-	__clear_bit(BH_New, &bh_result->b_state);
 	ext_debug("blocks %u/%u requested for inode %lu\n",
-			iblock, max_blocks, inode->i_ino);
+		  map->m_lblk, map->m_len, inode->i_ino);
 
 	/* check in cache */
-	cache_type = ext4_ext_in_cache(inode, iblock, &newex);
+	cache_type = ext4_ext_in_cache(inode, map->m_lblk, &newex);
 	if (cache_type) {
 		if (cache_type == EXT4_EXT_CACHE_GAP) {
 			if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
@@ -3210,12 +3343,12 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 			/* we should allocate requested block */
 		} else if (cache_type == EXT4_EXT_CACHE_EXTENT) {
 			/* block is already allocated */
-			newblock = iblock
+			newblock = map->m_lblk
 				   - le32_to_cpu(newex.ee_block)
 				   + ext_pblock(&newex);
 			/* number of remaining blocks in the extent */
 			allocated = ext4_ext_get_actual_len(&newex) -
-					(iblock - le32_to_cpu(newex.ee_block));
+				(map->m_lblk - le32_to_cpu(newex.ee_block));
 			goto out;
 		} else {
 			BUG();
@@ -3223,7 +3356,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	}
 
 	/* find extent for this block */
-	path = ext4_ext_find_extent(inode, iblock, NULL);
+	path = ext4_ext_find_extent(inode, map->m_lblk, NULL);
 	if (IS_ERR(path)) {
 		err = PTR_ERR(path);
 		path = NULL;
@@ -3237,10 +3370,11 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	 * this situation is possible, though, _during_ tree modification;
 	 * this is why assert can't be put in ext4_ext_find_extent()
 	 */
-	if (path[depth].p_ext == NULL && depth != 0) {
-		ext4_error(inode->i_sb, __func__, "bad extent address "
-			   "inode: %lu, iblock: %d, depth: %d",
-			   inode->i_ino, iblock, depth);
+	if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
+		EXT4_ERROR_INODE(inode, "bad extent address "
+				 "lblock: %lu, depth: %d pblock %lld",
+				 (unsigned long) map->m_lblk, depth,
+				 path[depth].p_block);
 		err = -EIO;
 		goto out2;
 	}
@@ -3258,12 +3392,12 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 		 */
 		ee_len = ext4_ext_get_actual_len(ex);
 		/* if found extent covers block, simply return it */
-		if (iblock >= ee_block && iblock < ee_block + ee_len) {
-			newblock = iblock - ee_block + ee_start;
+		if (in_range(map->m_lblk, ee_block, ee_len)) {
+			newblock = map->m_lblk - ee_block + ee_start;
 			/* number of remaining blocks in the extent */
-			allocated = ee_len - (iblock - ee_block);
-			ext_debug("%u fit into %u:%d -> %llu\n", iblock,
-					ee_block, ee_len, newblock);
+			allocated = ee_len - (map->m_lblk - ee_block);
+			ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
+				  ee_block, ee_len, newblock);
 
 			/* Do not put uninitialized extent in the cache */
 			if (!ext4_ext_is_uninitialized(ex)) {
@@ -3273,8 +3407,8 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 				goto out;
 			}
 			ret = ext4_ext_handle_uninitialized_extents(handle,
-					inode, iblock, max_blocks, path,
-					flags, allocated, bh_result, newblock);
+					inode, map, path, flags, allocated,
+					newblock);
 			return ret;
 		}
 	}
@@ -3288,7 +3422,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 		 * put just found gap into cache to speed up
 		 * subsequent requests
 		 */
-		ext4_ext_put_gap_in_cache(inode, path, iblock);
+		ext4_ext_put_gap_in_cache(inode, path, map->m_lblk);
 		goto out2;
 	}
 	/*
@@ -3296,11 +3430,11 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	 */
 
 	/* find neighbour allocated blocks */
-	ar.lleft = iblock;
+	ar.lleft = map->m_lblk;
 	err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft);
 	if (err)
 		goto out2;
-	ar.lright = iblock;
+	ar.lright = map->m_lblk;
 	err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright);
 	if (err)
 		goto out2;
@@ -3311,26 +3445,26 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	 * EXT_INIT_MAX_LEN and for an uninitialized extent this limit is
 	 * EXT_UNINIT_MAX_LEN.
 	 */
-	if (max_blocks > EXT_INIT_MAX_LEN &&
+	if (map->m_len > EXT_INIT_MAX_LEN &&
 	    !(flags & EXT4_GET_BLOCKS_UNINIT_EXT))
-		max_blocks = EXT_INIT_MAX_LEN;
-	else if (max_blocks > EXT_UNINIT_MAX_LEN &&
+		map->m_len = EXT_INIT_MAX_LEN;
+	else if (map->m_len > EXT_UNINIT_MAX_LEN &&
 		 (flags & EXT4_GET_BLOCKS_UNINIT_EXT))
-		max_blocks = EXT_UNINIT_MAX_LEN;
+		map->m_len = EXT_UNINIT_MAX_LEN;
 
-	/* Check if we can really insert (iblock)::(iblock+max_blocks) extent */
-	newex.ee_block = cpu_to_le32(iblock);
-	newex.ee_len = cpu_to_le16(max_blocks);
+	/* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */
+	newex.ee_block = cpu_to_le32(map->m_lblk);
+	newex.ee_len = cpu_to_le16(map->m_len);
 	err = ext4_ext_check_overlap(inode, &newex, path);
 	if (err)
 		allocated = ext4_ext_get_actual_len(&newex);
 	else
-		allocated = max_blocks;
+		allocated = map->m_len;
 
 	/* allocate new block */
 	ar.inode = inode;
-	ar.goal = ext4_ext_find_goal(inode, path, iblock);
-	ar.logical = iblock;
+	ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk);
+	ar.logical = map->m_lblk;
 	ar.len = allocated;
 	if (S_ISREG(inode->i_mode))
 		ar.flags = EXT4_MB_HINT_DATA;
@@ -3350,21 +3484,47 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){
 		ext4_ext_mark_uninitialized(&newex);
 		/*
-		 * io_end structure was created for every async
-		 * direct IO write to the middle of the file.
-		 * To avoid unecessary convertion for every aio dio rewrite
-		 * to the mid of file, here we flag the IO that is really
-		 * need the convertion.
+		 * io_end structure was created for every IO write to an
+		 * uninitialized extent. To avoid unecessary conversion,
+		 * here we flag the IO that really needs the conversion.
 		 * For non asycn direct IO case, flag the inode state
 		 * that we need to perform convertion when IO is done.
 		 */
-		if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) {
+		if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
 			if (io)
-				io->flag = DIO_AIO_UNWRITTEN;
+				io->flag = EXT4_IO_UNWRITTEN;
 			else
-				EXT4_I(inode)->i_state |=
-					EXT4_STATE_DIO_UNWRITTEN;;
+				ext4_set_inode_state(inode,
+						     EXT4_STATE_DIO_UNWRITTEN);
 		}
+		if (ext4_should_dioread_nolock(inode))
+			map->m_flags |= EXT4_MAP_UNINIT;
+	}
+
+	if (unlikely(ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))) {
+		if (unlikely(!eh->eh_entries)) {
+			EXT4_ERROR_INODE(inode,
+					 "eh->eh_entries == 0 and "
+					 "EOFBLOCKS_FL set");
+			err = -EIO;
+			goto out2;
+		}
+		last_ex = EXT_LAST_EXTENT(eh);
+		/*
+		 * If the current leaf block was reached by looking at
+		 * the last index block all the way down the tree, and
+		 * we are extending the inode beyond the last extent
+		 * in the current leaf block, then clear the
+		 * EOFBLOCKS_FL flag.
+		 */
+		for (i = depth-1; i >= 0; i--) {
+			if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
+				break;
+		}
+		if ((i < 0) &&
+		    (map->m_lblk + ar.len > le32_to_cpu(last_ex->ee_block) +
+		     ext4_ext_get_actual_len(last_ex)))
+			ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
 	}
 	err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
 	if (err) {
@@ -3380,9 +3540,9 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	/* previous routine could use block we allocated */
 	newblock = ext_pblock(&newex);
 	allocated = ext4_ext_get_actual_len(&newex);
-	if (allocated > max_blocks)
-		allocated = max_blocks;
-	set_buffer_new(bh_result);
+	if (allocated > map->m_len)
+		allocated = map->m_len;
+	map->m_flags |= EXT4_MAP_NEW;
 
 	/*
 	 * Update reserved blocks/metadata blocks after successful
@@ -3396,18 +3556,18 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	 * when it is _not_ an uninitialized extent.
 	 */
 	if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) {
-		ext4_ext_put_in_cache(inode, iblock, allocated, newblock,
+		ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock,
 						EXT4_EXT_CACHE_EXTENT);
 		ext4_update_inode_fsync_trans(handle, inode, 1);
 	} else
 		ext4_update_inode_fsync_trans(handle, inode, 0);
 out:
-	if (allocated > max_blocks)
-		allocated = max_blocks;
+	if (allocated > map->m_len)
+		allocated = map->m_len;
 	ext4_ext_show_leaf(inode, path);
-	set_buffer_mapped(bh_result);
-	bh_result->b_bdev = inode->i_sb->s_bdev;
-	bh_result->b_blocknr = newblock;
+	map->m_flags |= EXT4_MAP_MAPPED;
+	map->m_pblk = newblock;
+	map->m_len = allocated;
 out2:
 	if (path) {
 		ext4_ext_drop_refs(path);
@@ -3499,6 +3659,13 @@ static void ext4_falloc_update_inode(struct inode *inode,
 			i_size_write(inode, new_size);
 		if (new_size > EXT4_I(inode)->i_disksize)
 			ext4_update_i_disksize(inode, new_size);
+	} else {
+		/*
+		 * Mark that we allocate beyond EOF so the subsequent truncate
+		 * can proceed even if the new size is the same as i_size.
+		 */
+		if (new_size > i_size_read(inode))
+			ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
 	}
 
 }
@@ -3513,55 +3680,57 @@ static void ext4_falloc_update_inode(struct inode *inode,
 long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
 {
 	handle_t *handle;
-	ext4_lblk_t block;
 	loff_t new_size;
 	unsigned int max_blocks;
 	int ret = 0;
 	int ret2 = 0;
 	int retries = 0;
-	struct buffer_head map_bh;
+	struct ext4_map_blocks map;
 	unsigned int credits, blkbits = inode->i_blkbits;
 
 	/*
 	 * currently supporting (pre)allocate mode for extent-based
 	 * files _only_
 	 */
-	if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
 		return -EOPNOTSUPP;
 
 	/* preallocation to directories is currently not supported */
 	if (S_ISDIR(inode->i_mode))
 		return -ENODEV;
 
-	block = offset >> blkbits;
+	map.m_lblk = offset >> blkbits;
 	/*
 	 * We can't just convert len to max_blocks because
 	 * If blocksize = 4096 offset = 3072 and len = 2048
 	 */
 	max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
-							- block;
+		- map.m_lblk;
 	/*
 	 * credits to insert 1 extent into extent tree
 	 */
 	credits = ext4_chunk_trans_blocks(inode, max_blocks);
 	mutex_lock(&inode->i_mutex);
+	ret = inode_newsize_ok(inode, (len + offset));
+	if (ret) {
+		mutex_unlock(&inode->i_mutex);
+		return ret;
+	}
 retry:
 	while (ret >= 0 && ret < max_blocks) {
-		block = block + ret;
-		max_blocks = max_blocks - ret;
+		map.m_lblk = map.m_lblk + ret;
+		map.m_len = max_blocks = max_blocks - ret;
 		handle = ext4_journal_start(inode, credits);
 		if (IS_ERR(handle)) {
 			ret = PTR_ERR(handle);
 			break;
 		}
-		map_bh.b_state = 0;
-		ret = ext4_get_blocks(handle, inode, block,
-				      max_blocks, &map_bh,
+		ret = ext4_map_blocks(handle, inode, &map,
 				      EXT4_GET_BLOCKS_CREATE_UNINIT_EXT);
 		if (ret <= 0) {
 #ifdef EXT4FS_DEBUG
 			WARN_ON(ret <= 0);
-			printk(KERN_ERR "%s: ext4_ext_get_blocks "
+			printk(KERN_ERR "%s: ext4_ext_map_blocks "
 				    "returned error inode#%lu, block=%u, "
 				    "max_blocks=%u", __func__,
 				    inode->i_ino, block, max_blocks);
@@ -3570,14 +3739,14 @@ retry:
 			ret2 = ext4_journal_stop(handle);
 			break;
 		}
-		if ((block + ret) >= (EXT4_BLOCK_ALIGN(offset + len,
+		if ((map.m_lblk + ret) >= (EXT4_BLOCK_ALIGN(offset + len,
 						blkbits) >> blkbits))
 			new_size = offset + len;
 		else
-			new_size = (block + ret) << blkbits;
+			new_size = (map.m_lblk + ret) << blkbits;
 
 		ext4_falloc_update_inode(inode, mode, new_size,
-						buffer_new(&map_bh));
+					 (map.m_flags & EXT4_MAP_NEW));
 		ext4_mark_inode_dirty(handle, inode);
 		ret2 = ext4_journal_stop(handle);
 		if (ret2)
@@ -3603,45 +3772,42 @@ retry:
  * Returns 0 on success.
  */
 int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
-				    loff_t len)
+				    ssize_t len)
 {
 	handle_t *handle;
-	ext4_lblk_t block;
 	unsigned int max_blocks;
 	int ret = 0;
 	int ret2 = 0;
-	struct buffer_head map_bh;
+	struct ext4_map_blocks map;
 	unsigned int credits, blkbits = inode->i_blkbits;
 
-	block = offset >> blkbits;
+	map.m_lblk = offset >> blkbits;
 	/*
 	 * We can't just convert len to max_blocks because
 	 * If blocksize = 4096 offset = 3072 and len = 2048
 	 */
-	max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
-							- block;
+	max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) -
+		      map.m_lblk);
 	/*
 	 * credits to insert 1 extent into extent tree
 	 */
 	credits = ext4_chunk_trans_blocks(inode, max_blocks);
 	while (ret >= 0 && ret < max_blocks) {
-		block = block + ret;
-		max_blocks = max_blocks - ret;
+		map.m_lblk += ret;
+		map.m_len = (max_blocks -= ret);
 		handle = ext4_journal_start(inode, credits);
 		if (IS_ERR(handle)) {
 			ret = PTR_ERR(handle);
 			break;
 		}
-		map_bh.b_state = 0;
-		ret = ext4_get_blocks(handle, inode, block,
-				      max_blocks, &map_bh,
-				      EXT4_GET_BLOCKS_DIO_CONVERT_EXT);
+		ret = ext4_map_blocks(handle, inode, &map,
+				      EXT4_GET_BLOCKS_IO_CONVERT_EXT);
 		if (ret <= 0) {
 			WARN_ON(ret <= 0);
-			printk(KERN_ERR "%s: ext4_ext_get_blocks "
+			printk(KERN_ERR "%s: ext4_ext_map_blocks "
 				    "returned error inode#%lu, block=%u, "
 				    "max_blocks=%u", __func__,
-				    inode->i_ino, block, max_blocks);
+				    inode->i_ino, map.m_lblk, map.m_len);
 		}
 		ext4_mark_inode_dirty(handle, inode);
 		ret2 = ext4_journal_stop(handle);
@@ -3739,7 +3905,7 @@ static int ext4_xattr_fiemap(struct inode *inode,
 	int error = 0;
 
 	/* in-inode? */
-	if (EXT4_I(inode)->i_state & EXT4_STATE_XATTR) {
+	if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
 		struct ext4_iloc iloc;
 		int offset;	/* offset of xattr in inode */
 
@@ -3752,6 +3918,7 @@ static int ext4_xattr_fiemap(struct inode *inode,
 		physical += offset;
 		length = EXT4_SB(inode->i_sb)->s_inode_size - offset;
 		flags |= FIEMAP_EXTENT_DATA_INLINE;
+		brelse(iloc.bh);
 	} else { /* external block */
 		physical = EXT4_I(inode)->i_file_acl << blockbits;
 		length = inode->i_sb->s_blocksize;
@@ -3767,11 +3934,10 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		__u64 start, __u64 len)
 {
 	ext4_lblk_t start_blk;
-	ext4_lblk_t len_blks;
 	int error = 0;
 
 	/* fallback to generic here if not in extents fmt */
-	if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
 		return generic_block_fiemap(inode, fieinfo, start, len,
 			ext4_get_block);
 
@@ -3781,8 +3947,14 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 	if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
 		error = ext4_xattr_fiemap(inode, fieinfo);
 	} else {
+		ext4_lblk_t len_blks;
+		__u64 last_blk;
+
 		start_blk = start >> inode->i_sb->s_blocksize_bits;
-		len_blks = len >> inode->i_sb->s_blocksize_bits;
+		last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits;
+		if (last_blk >= EXT_MAX_BLOCK)
+			last_blk = EXT_MAX_BLOCK-1;
+		len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1;
 
 		/*
 		 * Walk the extent tree gathering extent information.
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 9630583cef28..5313ae4cda2d 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -23,6 +23,7 @@
 #include <linux/jbd2.h>
 #include <linux/mount.h>
 #include <linux/path.h>
+#include <linux/quotaops.h>
 #include "ext4.h"
 #include "ext4_jbd2.h"
 #include "xattr.h"
@@ -35,9 +36,9 @@
  */
 static int ext4_release_file(struct inode *inode, struct file *filp)
 {
-	if (EXT4_I(inode)->i_state & EXT4_STATE_DA_ALLOC_CLOSE) {
+	if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE)) {
 		ext4_alloc_da_blocks(inode);
-		EXT4_I(inode)->i_state &= ~EXT4_STATE_DA_ALLOC_CLOSE;
+		ext4_clear_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
 	}
 	/* if we are the last writer on the inode, drop the block reservation */
 	if ((filp->f_mode & FMODE_WRITE) &&
@@ -65,7 +66,7 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
 	 * is smaller than s_maxbytes, which is for extent-mapped files.
 	 */
 
-	if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) {
+	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
 		struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 		size_t length = iov_length(iov, nr_segs);
 
@@ -116,18 +117,16 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
 		 * devices or filesystem images.
 		 */
 		memset(buf, 0, sizeof(buf));
-		path.mnt = mnt->mnt_parent;
-		path.dentry = mnt->mnt_mountpoint;
-		path_get(&path);
+		path.mnt = mnt;
+		path.dentry = mnt->mnt_root;
 		cp = d_path(&path, buf, sizeof(buf));
-		path_put(&path);
 		if (!IS_ERR(cp)) {
 			memcpy(sbi->s_es->s_last_mounted, cp,
 			       sizeof(sbi->s_es->s_last_mounted));
 			sb->s_dirt = 1;
 		}
 	}
-	return generic_file_open(inode, filp);
+	return dquot_file_open(inode, filp);
 }
 
 const struct file_operations ext4_file_operations = {
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 98bd140aad01..592adf2e546e 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -35,6 +35,29 @@
 #include <trace/events/ext4.h>
 
 /*
+ * If we're not journaling and this is a just-created file, we have to
+ * sync our parent directory (if it was freshly created) since
+ * otherwise it will only be written by writeback, leaving a huge
+ * window during which a crash may lose the file.  This may apply for
+ * the parent directory's parent as well, and so on recursively, if
+ * they are also freshly created.
+ */
+static void ext4_sync_parent(struct inode *inode)
+{
+	struct dentry *dentry = NULL;
+
+	while (inode && ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) {
+		ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY);
+		dentry = list_entry(inode->i_dentry.next,
+				    struct dentry, d_alias);
+		if (!dentry || !dentry->d_parent || !dentry->d_parent->d_inode)
+			break;
+		inode = dentry->d_parent->d_inode;
+		sync_mapping_buffers(inode->i_mapping);
+	}
+}
+
+/*
  * akpm: A new design for ext4_sync_file().
  *
  * This is only called from sys_fsync(), sys_fdatasync() and sys_msync().
@@ -48,9 +71,9 @@
  * i_mutex lock is held when entering and exiting this function
  */
 
-int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
+int ext4_sync_file(struct file *file, int datasync)
 {
-	struct inode *inode = dentry->d_inode;
+	struct inode *inode = file->f_mapping->host;
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
 	int ret;
@@ -58,17 +81,21 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
 
 	J_ASSERT(ext4_journal_current_handle() == NULL);
 
-	trace_ext4_sync_file(file, dentry, datasync);
+	trace_ext4_sync_file(file, datasync);
 
 	if (inode->i_sb->s_flags & MS_RDONLY)
 		return 0;
 
-	ret = flush_aio_dio_completed_IO(inode);
+	ret = flush_completed_IO(inode);
 	if (ret < 0)
 		return ret;
-	
-	if (!journal)
-		return simple_fsync(file, dentry, datasync);
+
+	if (!journal) {
+		ret = generic_file_fsync(file, datasync);
+		if (!ret && !list_empty(&inode->i_dentry))
+			ext4_sync_parent(inode);
+		return ret;
+	}
 
 	/*
 	 * data=writeback,ordered:
@@ -100,9 +127,11 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
 		if (ext4_should_writeback_data(inode) &&
 		    (journal->j_fs_dev != journal->j_dev) &&
 		    (journal->j_flags & JBD2_BARRIER))
-			blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
-		jbd2_log_wait_commit(journal, commit_tid);
+			blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL,
+					NULL, BLKDEV_IFL_WAIT);
+		ret = jbd2_log_wait_commit(journal, commit_tid);
 	} else if (journal->j_flags & JBD2_BARRIER)
-		blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
+		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
+			BLKDEV_IFL_WAIT);
 	return ret;
 }
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index f3624ead4f6c..25c4b3173fd9 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -76,8 +76,7 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh,
 	/* If checksum is bad mark all blocks and inodes use to prevent
 	 * allocation, essentially implementing a per-group read-only flag. */
 	if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
-		ext4_error(sb, __func__, "Checksum bad for group %u",
-			   block_group);
+		ext4_error(sb, "Checksum bad for group %u", block_group);
 		ext4_free_blks_set(sb, gdp, 0);
 		ext4_free_inodes_set(sb, gdp, 0);
 		ext4_itable_unused_set(sb, gdp, 0);
@@ -111,8 +110,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
 	bitmap_blk = ext4_inode_bitmap(sb, desc);
 	bh = sb_getblk(sb, bitmap_blk);
 	if (unlikely(!bh)) {
-		ext4_error(sb, __func__,
-			    "Cannot read inode bitmap - "
+		ext4_error(sb, "Cannot read inode bitmap - "
 			    "block_group = %u, inode_bitmap = %llu",
 			    block_group, bitmap_blk);
 		return NULL;
@@ -153,8 +151,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
 	set_bitmap_uptodate(bh);
 	if (bh_submit_read(bh) < 0) {
 		put_bh(bh);
-		ext4_error(sb, __func__,
-			    "Cannot read inode bitmap - "
+		ext4_error(sb, "Cannot read inode bitmap - "
 			    "block_group = %u, inode_bitmap = %llu",
 			    block_group, bitmap_blk);
 		return NULL;
@@ -217,10 +214,10 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
 	 * Note: we must free any quota before locking the superblock,
 	 * as writing the quota to disk may need the lock as well.
 	 */
-	vfs_dq_init(inode);
+	dquot_initialize(inode);
 	ext4_xattr_delete_inode(handle, inode);
-	vfs_dq_free_inode(inode);
-	vfs_dq_drop(inode);
+	dquot_free_inode(inode);
+	dquot_drop(inode);
 
 	is_directory = S_ISDIR(inode->i_mode);
 
@@ -229,8 +226,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
 
 	es = EXT4_SB(sb)->s_es;
 	if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
-		ext4_error(sb, "ext4_free_inode",
-			   "reserved or nonexistent inode %lu", ino);
+		ext4_error(sb, "reserved or nonexistent inode %lu", ino);
 		goto error_return;
 	}
 	block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
@@ -244,57 +240,49 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
 	if (fatal)
 		goto error_return;
 
-	/* Ok, now we can actually update the inode bitmaps.. */
-	cleared = ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group),
-					bit, bitmap_bh->b_data);
-	if (!cleared)
-		ext4_error(sb, "ext4_free_inode",
-			   "bit already cleared for inode %lu", ino);
-	else {
-		gdp = ext4_get_group_desc(sb, block_group, &bh2);
-
+	fatal = -ESRCH;
+	gdp = ext4_get_group_desc(sb, block_group, &bh2);
+	if (gdp) {
 		BUFFER_TRACE(bh2, "get_write_access");
 		fatal = ext4_journal_get_write_access(handle, bh2);
-		if (fatal) goto error_return;
-
-		if (gdp) {
-			ext4_lock_group(sb, block_group);
-			count = ext4_free_inodes_count(sb, gdp) + 1;
-			ext4_free_inodes_set(sb, gdp, count);
-			if (is_directory) {
-				count = ext4_used_dirs_count(sb, gdp) - 1;
-				ext4_used_dirs_set(sb, gdp, count);
-				if (sbi->s_log_groups_per_flex) {
-					ext4_group_t f;
-
-					f = ext4_flex_group(sbi, block_group);
-					atomic_dec(&sbi->s_flex_groups[f].free_inodes);
-				}
+	}
+	ext4_lock_group(sb, block_group);
+	cleared = ext4_clear_bit(bit, bitmap_bh->b_data);
+	if (fatal || !cleared) {
+		ext4_unlock_group(sb, block_group);
+		goto out;
+	}
 
-			}
-			gdp->bg_checksum = ext4_group_desc_csum(sbi,
-							block_group, gdp);
-			ext4_unlock_group(sb, block_group);
-			percpu_counter_inc(&sbi->s_freeinodes_counter);
-			if (is_directory)
-				percpu_counter_dec(&sbi->s_dirs_counter);
-
-			if (sbi->s_log_groups_per_flex) {
-				ext4_group_t f;
-
-				f = ext4_flex_group(sbi, block_group);
-				atomic_inc(&sbi->s_flex_groups[f].free_inodes);
-			}
-		}
-		BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
-		err = ext4_handle_dirty_metadata(handle, NULL, bh2);
-		if (!fatal) fatal = err;
+	count = ext4_free_inodes_count(sb, gdp) + 1;
+	ext4_free_inodes_set(sb, gdp, count);
+	if (is_directory) {
+		count = ext4_used_dirs_count(sb, gdp) - 1;
+		ext4_used_dirs_set(sb, gdp, count);
+		percpu_counter_dec(&sbi->s_dirs_counter);
 	}
-	BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata");
-	err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
-	if (!fatal)
-		fatal = err;
-	sb->s_dirt = 1;
+	gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
+	ext4_unlock_group(sb, block_group);
+
+	percpu_counter_inc(&sbi->s_freeinodes_counter);
+	if (sbi->s_log_groups_per_flex) {
+		ext4_group_t f = ext4_flex_group(sbi, block_group);
+
+		atomic_inc(&sbi->s_flex_groups[f].free_inodes);
+		if (is_directory)
+			atomic_dec(&sbi->s_flex_groups[f].used_dirs);
+	}
+	BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
+	fatal = ext4_handle_dirty_metadata(handle, NULL, bh2);
+out:
+	if (cleared) {
+		BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata");
+		err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
+		if (!fatal)
+			fatal = err;
+		sb->s_dirt = 1;
+	} else
+		ext4_error(sb, "bit already cleared for inode %lu", ino);
+
 error_return:
 	brelse(bitmap_bh);
 	ext4_std_error(sb, fatal);
@@ -504,7 +492,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
 
 	if (S_ISDIR(mode) &&
 	    ((parent == sb->s_root->d_inode) ||
-	     (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL))) {
+	     (ext4_test_inode_flag(parent, EXT4_INODE_TOPDIR)))) {
 		int best_ndir = inodes_per_group;
 		int ret = -1;
 
@@ -736,8 +724,7 @@ static int ext4_claim_inode(struct super_block *sb,
 	if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
 			ino > EXT4_INODES_PER_GROUP(sb)) {
 		ext4_unlock_group(sb, group);
-		ext4_error(sb, __func__,
-			   "reserved inode or inode > inodes count - "
+		ext4_error(sb, "reserved inode or inode > inodes count - "
 			   "block_group = %u, inode=%lu", group,
 			   ino + group * EXT4_INODES_PER_GROUP(sb));
 		return 1;
@@ -779,7 +766,7 @@ static int ext4_claim_inode(struct super_block *sb,
 		if (sbi->s_log_groups_per_flex) {
 			ext4_group_t f = ext4_flex_group(sbi, group);
 
-			atomic_inc(&sbi->s_flex_groups[f].free_inodes);
+			atomic_inc(&sbi->s_flex_groups[f].used_dirs);
 		}
 	}
 	gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
@@ -904,7 +891,7 @@ repeat_in_this_group:
 				BUFFER_TRACE(inode_bitmap_bh,
 					"call ext4_handle_dirty_metadata");
 				err = ext4_handle_dirty_metadata(handle,
-								 inode,
+								 NULL,
 							inode_bitmap_bh);
 				if (err)
 					goto fail;
@@ -985,16 +972,12 @@ got:
 		atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes);
 	}
 
-	inode->i_uid = current_fsuid();
-	if (test_opt(sb, GRPID))
-		inode->i_gid = dir->i_gid;
-	else if (dir->i_mode & S_ISGID) {
+	if (test_opt(sb, GRPID)) {
+		inode->i_mode = mode;
+		inode->i_uid = current_fsuid();
 		inode->i_gid = dir->i_gid;
-		if (S_ISDIR(mode))
-			mode |= S_ISGID;
 	} else
-		inode->i_gid = current_fsgid();
-	inode->i_mode = mode;
+		inode_init_owner(inode, dir, mode);
 
 	inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb);
 	/* This is the optimal IO size (for stat), not the fs block size */
@@ -1029,15 +1012,16 @@ got:
 	inode->i_generation = sbi->s_next_generation++;
 	spin_unlock(&sbi->s_next_gen_lock);
 
-	ei->i_state = EXT4_STATE_NEW;
+	ei->i_state_flags = 0;
+	ext4_set_inode_state(inode, EXT4_STATE_NEW);
 
 	ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize;
 
 	ret = inode;
-	if (vfs_dq_alloc_inode(inode)) {
-		err = -EDQUOT;
+	dquot_initialize(inode);
+	err = dquot_alloc_inode(inode);
+	if (err)
 		goto fail_drop;
-	}
 
 	err = ext4_init_acl(handle, inode, dir);
 	if (err)
@@ -1050,7 +1034,7 @@ got:
 	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
 		/* set extent flag only for directory, file and normal symlink*/
 		if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) {
-			EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL;
+			ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
 			ext4_ext_tree_init(handle, inode);
 		}
 	}
@@ -1074,10 +1058,10 @@ really_out:
 	return ret;
 
 fail_free_drop:
-	vfs_dq_free_inode(inode);
+	dquot_free_inode(inode);
 
 fail_drop:
-	vfs_dq_drop(inode);
+	dquot_drop(inode);
 	inode->i_flags |= S_NOQUOTA;
 	inode->i_nlink = 0;
 	unlock_new_inode(inode);
@@ -1098,8 +1082,7 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
 
 	/* Error cases - e2fsck has already cleaned up for us */
 	if (ino > max_ino) {
-		ext4_warning(sb, __func__,
-			     "bad orphan ino %lu!  e2fsck was run?", ino);
+		ext4_warning(sb, "bad orphan ino %lu!  e2fsck was run?", ino);
 		goto error;
 	}
 
@@ -1107,8 +1090,7 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
 	bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
 	bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
 	if (!bitmap_bh) {
-		ext4_warning(sb, __func__,
-			     "inode bitmap error for orphan %lu", ino);
+		ext4_warning(sb, "inode bitmap error for orphan %lu", ino);
 		goto error;
 	}
 
@@ -1140,8 +1122,7 @@ iget_failed:
 	err = PTR_ERR(inode);
 	inode = NULL;
 bad_orphan:
-	ext4_warning(sb, __func__,
-		     "bad orphan inode %lu!  e2fsck was run?", ino);
+	ext4_warning(sb, "bad orphan inode %lu!  e2fsck was run?", ino);
 	printk(KERN_NOTICE "ext4_test_bit(bit=%d, block=%llu) = %d\n",
 	       bit, (unsigned long long)bitmap_bh->b_blocknr,
 	       ext4_test_bit(bit, bitmap_bh->b_data));
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index e11952404e02..42272d67955a 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -38,6 +38,8 @@
 #include <linux/uio.h>
 #include <linux/bio.h>
 #include <linux/workqueue.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
 
 #include "ext4_jbd2.h"
 #include "xattr.h"
@@ -147,7 +149,7 @@ int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
 	int ret;
 
 	/*
-	 * Drop i_data_sem to avoid deadlock with ext4_get_blocks At this
+	 * Drop i_data_sem to avoid deadlock with ext4_map_blocks.  At this
 	 * moment, get_block can be called only for blocks inside i_size since
 	 * page cache has been already dropped and writes are blocked by
 	 * i_mutex. So we can safely drop the i_data_sem here.
@@ -170,6 +172,9 @@ void ext4_delete_inode(struct inode *inode)
 	handle_t *handle;
 	int err;
 
+	if (!is_bad_inode(inode))
+		dquot_initialize(inode);
+
 	if (ext4_should_order_data(inode))
 		ext4_begin_ordered_truncate(inode, 0);
 	truncate_inode_pages(&inode->i_data, 0);
@@ -194,7 +199,7 @@ void ext4_delete_inode(struct inode *inode)
 	inode->i_size = 0;
 	err = ext4_mark_inode_dirty(handle, inode);
 	if (err) {
-		ext4_warning(inode->i_sb, __func__,
+		ext4_warning(inode->i_sb,
 			     "couldn't mark inode dirty (err %d)", err);
 		goto stop_handle;
 	}
@@ -212,7 +217,7 @@ void ext4_delete_inode(struct inode *inode)
 		if (err > 0)
 			err = ext4_journal_restart(handle, 3);
 		if (err != 0) {
-			ext4_warning(inode->i_sb, __func__,
+			ext4_warning(inode->i_sb,
 				     "couldn't extend journal (err %d)", err);
 		stop_handle:
 			ext4_journal_stop(handle);
@@ -323,8 +328,7 @@ static int ext4_block_to_path(struct inode *inode,
 		offsets[n++] = i_block & (ptrs - 1);
 		final = ptrs;
 	} else {
-		ext4_warning(inode->i_sb, "ext4_block_to_path",
-			     "block %lu > max in inode %lu",
+		ext4_warning(inode->i_sb, "block %lu > max in inode %lu",
 			     i_block + direct_blocks +
 			     indirect_blocks + double_blocks, inode->i_ino);
 	}
@@ -344,9 +348,8 @@ static int __ext4_check_blockref(const char *function, struct inode *inode,
 		if (blk &&
 		    unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb),
 						    blk, 1))) {
-			ext4_error(inode->i_sb, function,
-				   "invalid block reference %u "
-				   "in inode #%lu", blk, inode->i_ino);
+			ext4_error_inode(function, inode,
+					 "invalid block reference %u", blk);
 			return -EIO;
 		}
 	}
@@ -607,7 +610,14 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
 		if (*err)
 			goto failed_out;
 
-		BUG_ON(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS);
+		if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) {
+			EXT4_ERROR_INODE(inode,
+					 "current_block %llu + count %lu > %d!",
+					 current_block, count,
+					 EXT4_MAX_BLOCK_FILE_PHYS);
+			*err = -EIO;
+			goto failed_out;
+		}
 
 		target -= count;
 		/* allocate blocks for indirect blocks */
@@ -643,7 +653,14 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
 		ar.flags = EXT4_MB_HINT_DATA;
 
 	current_block = ext4_mb_new_blocks(handle, &ar, err);
-	BUG_ON(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS);
+	if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) {
+		EXT4_ERROR_INODE(inode,
+				 "current_block %llu + ar.len %d > %d!",
+				 current_block, ar.len,
+				 EXT4_MAX_BLOCK_FILE_PHYS);
+		*err = -EIO;
+		goto failed_out;
+	}
 
 	if (*err && (target == blks)) {
 		/*
@@ -767,7 +784,7 @@ failed:
 	/* Allocation failed, free what we already allocated */
 	ext4_free_blocks(handle, inode, 0, new_blocks[0], 1, 0);
 	for (i = 1; i <= n ; i++) {
-		/* 
+		/*
 		 * branch[i].bh is newly allocated, so there is no
 		 * need to revoke the block, which is why we don't
 		 * need to set EXT4_FREE_BLOCKS_METADATA.
@@ -857,7 +874,7 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
 
 err_out:
 	for (i = 1; i <= num; i++) {
-		/* 
+		/*
 		 * branch[i].bh is newly allocated, so there is no
 		 * need to revoke the block, which is why we don't
 		 * need to set EXT4_FREE_BLOCKS_METADATA.
@@ -872,9 +889,9 @@ err_out:
 }
 
 /*
- * The ext4_ind_get_blocks() function handles non-extents inodes
+ * The ext4_ind_map_blocks() function handles non-extents inodes
  * (i.e., using the traditional indirect/double-indirect i_blocks
- * scheme) for ext4_get_blocks().
+ * scheme) for ext4_map_blocks().
  *
  * Allocation strategy is simple: if we have to allocate something, we will
  * have to go the whole way to leaf. So let's do it before attaching anything
@@ -899,9 +916,8 @@ err_out:
  * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system
  * blocks.
  */
-static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
-			       ext4_lblk_t iblock, unsigned int maxblocks,
-			       struct buffer_head *bh_result,
+static int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
+			       struct ext4_map_blocks *map,
 			       int flags)
 {
 	int err = -EIO;
@@ -915,9 +931,9 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
 	int count = 0;
 	ext4_fsblk_t first_block = 0;
 
-	J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL));
+	J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)));
 	J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
-	depth = ext4_block_to_path(inode, iblock, offsets,
+	depth = ext4_block_to_path(inode, map->m_lblk, offsets,
 				   &blocks_to_boundary);
 
 	if (depth == 0)
@@ -928,10 +944,9 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
 	/* Simplest case - block found, no allocation needed */
 	if (!partial) {
 		first_block = le32_to_cpu(chain[depth - 1].key);
-		clear_buffer_new(bh_result);
 		count++;
 		/*map more blocks*/
-		while (count < maxblocks && count <= blocks_to_boundary) {
+		while (count < map->m_len && count <= blocks_to_boundary) {
 			ext4_fsblk_t blk;
 
 			blk = le32_to_cpu(*(chain[depth-1].p + count));
@@ -951,7 +966,7 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
 	/*
 	 * Okay, we need to do block allocation.
 	*/
-	goal = ext4_find_goal(inode, iblock, partial);
+	goal = ext4_find_goal(inode, map->m_lblk, partial);
 
 	/* the number of blocks need to allocate for [d,t]indirect blocks */
 	indirect_blks = (chain + depth) - partial - 1;
@@ -961,11 +976,11 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
 	 * direct blocks to allocate for this branch.
 	 */
 	count = ext4_blks_to_allocate(partial, indirect_blks,
-					maxblocks, blocks_to_boundary);
+				      map->m_len, blocks_to_boundary);
 	/*
 	 * Block out ext4_truncate while we alter the tree
 	 */
-	err = ext4_alloc_branch(handle, inode, iblock, indirect_blks,
+	err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks,
 				&count, goal,
 				offsets + (partial - chain), partial);
 
@@ -977,18 +992,20 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
 	 * may need to return -EAGAIN upwards in the worst case.  --sct
 	 */
 	if (!err)
-		err = ext4_splice_branch(handle, inode, iblock,
+		err = ext4_splice_branch(handle, inode, map->m_lblk,
 					 partial, indirect_blks, count);
 	if (err)
 		goto cleanup;
 
-	set_buffer_new(bh_result);
+	map->m_flags |= EXT4_MAP_NEW;
 
 	ext4_update_inode_fsync_trans(handle, inode, 1);
 got_it:
-	map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
+	map->m_flags |= EXT4_MAP_MAPPED;
+	map->m_pblk = le32_to_cpu(chain[depth-1].key);
+	map->m_len = count;
 	if (count > blocks_to_boundary)
-		set_buffer_boundary(bh_result);
+		map->m_flags |= EXT4_MAP_BOUNDARY;
 	err = count;
 	/* Clean up and exit */
 	partial = chain + depth - 1;	/* the whole chain */
@@ -998,7 +1015,6 @@ cleanup:
 		brelse(partial->bh);
 		partial--;
 	}
-	BUFFER_TRACE(bh_result, "returned");
 out:
 	return err;
 }
@@ -1018,7 +1034,7 @@ static int ext4_indirect_calc_metadata_amount(struct inode *inode,
 					      sector_t lblock)
 {
 	struct ext4_inode_info *ei = EXT4_I(inode);
-	int dind_mask = EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1;
+	sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1);
 	int blk_bits;
 
 	if (lblock < EXT4_NDIR_BLOCKS)
@@ -1033,7 +1049,7 @@ static int ext4_indirect_calc_metadata_amount(struct inode *inode,
 	}
 	ei->i_da_metadata_calc_last_lblock = lblock & dind_mask;
 	ei->i_da_metadata_calc_len = 1;
-	blk_bits = roundup_pow_of_two(lblock + 1);
+	blk_bits = order_base_2(lblock);
 	return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1;
 }
 
@@ -1043,7 +1059,7 @@ static int ext4_indirect_calc_metadata_amount(struct inode *inode,
  */
 static int ext4_calc_metadata_amount(struct inode *inode, sector_t lblock)
 {
-	if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
+	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
 		return ext4_ext_calc_metadata_amount(inode, lblock);
 
 	return ext4_indirect_calc_metadata_amount(inode, lblock);
@@ -1058,9 +1074,9 @@ void ext4_da_update_reserve_space(struct inode *inode,
 {
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 	struct ext4_inode_info *ei = EXT4_I(inode);
-	int mdb_free = 0, allocated_meta_blocks = 0;
 
 	spin_lock(&ei->i_block_reservation_lock);
+	trace_ext4_da_update_reserve_space(inode, used);
 	if (unlikely(used > ei->i_reserved_data_blocks)) {
 		ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d "
 			 "with only %d reserved data blocks\n",
@@ -1072,11 +1088,10 @@ void ext4_da_update_reserve_space(struct inode *inode,
 
 	/* Update per-inode reservations */
 	ei->i_reserved_data_blocks -= used;
-	used += ei->i_allocated_meta_blocks;
 	ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks;
-	allocated_meta_blocks = ei->i_allocated_meta_blocks;
+	percpu_counter_sub(&sbi->s_dirtyblocks_counter,
+			   used + ei->i_allocated_meta_blocks);
 	ei->i_allocated_meta_blocks = 0;
-	percpu_counter_sub(&sbi->s_dirtyblocks_counter, used);
 
 	if (ei->i_reserved_data_blocks == 0) {
 		/*
@@ -1084,30 +1099,23 @@ void ext4_da_update_reserve_space(struct inode *inode,
 		 * only when we have written all of the delayed
 		 * allocation blocks.
 		 */
-		mdb_free = ei->i_reserved_meta_blocks;
+		percpu_counter_sub(&sbi->s_dirtyblocks_counter,
+				   ei->i_reserved_meta_blocks);
 		ei->i_reserved_meta_blocks = 0;
 		ei->i_da_metadata_calc_len = 0;
-		percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free);
 	}
 	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
 
-	/* Update quota subsystem */
-	if (quota_claim) {
-		vfs_dq_claim_block(inode, used);
-		if (mdb_free)
-			vfs_dq_release_reservation_block(inode, mdb_free);
-	} else {
+	/* Update quota subsystem for data blocks */
+	if (quota_claim)
+		dquot_claim_block(inode, used);
+	else {
 		/*
 		 * We did fallocate with an offset that is already delayed
 		 * allocated. So on delayed allocated writeback we should
-		 * not update the quota for allocated blocks. But then
-		 * converting an fallocate region to initialized region would
-		 * have caused a metadata allocation. So claim quota for
-		 * that
+		 * not re-claim the quota for fallocated blocks.
 		 */
-		if (allocated_meta_blocks)
-			vfs_dq_claim_block(inode, allocated_meta_blocks);
-		vfs_dq_release_reservation_block(inode, mdb_free + used);
+		dquot_release_reservation_block(inode, used);
 	}
 
 	/*
@@ -1120,15 +1128,15 @@ void ext4_da_update_reserve_space(struct inode *inode,
 		ext4_discard_preallocations(inode);
 }
 
-static int check_block_validity(struct inode *inode, const char *msg,
-				sector_t logical, sector_t phys, int len)
+static int check_block_validity(struct inode *inode, const char *func,
+				struct ext4_map_blocks *map)
 {
-	if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) {
-		ext4_error(inode->i_sb, msg,
-			   "inode #%lu logical block %llu mapped to %llu "
-			   "(size %d)", inode->i_ino,
-			   (unsigned long long) logical,
-			   (unsigned long long) phys, len);
+	if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk,
+				   map->m_len)) {
+		ext4_error_inode(func, inode,
+			   "lblock %lu mapped to illegal pblock %llu "
+			   "(length %d)", (unsigned long) map->m_lblk,
+				 map->m_pblk, map->m_len);
 		return -EIO;
 	}
 	return 0;
@@ -1193,15 +1201,15 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
 }
 
 /*
- * The ext4_get_blocks() function tries to look up the requested blocks,
+ * The ext4_map_blocks() function tries to look up the requested blocks,
  * and returns if the blocks are already mapped.
  *
  * Otherwise it takes the write lock of the i_data_sem and allocate blocks
  * and store the allocated blocks in the result buffer head and mark it
  * mapped.
  *
- * If file type is extents based, it will call ext4_ext_get_blocks(),
- * Otherwise, call with ext4_ind_get_blocks() to handle indirect mapping
+ * If file type is extents based, it will call ext4_ext_map_blocks(),
+ * Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping
  * based files
  *
  * On success, it returns the number of blocks being mapped or allocate.
@@ -1214,35 +1222,29 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
  *
  * It returns the error in case of allocation failure.
  */
-int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
-		    unsigned int max_blocks, struct buffer_head *bh,
-		    int flags)
+int ext4_map_blocks(handle_t *handle, struct inode *inode,
+		    struct ext4_map_blocks *map, int flags)
 {
 	int retval;
 
-	clear_buffer_mapped(bh);
-	clear_buffer_unwritten(bh);
-
-	ext_debug("ext4_get_blocks(): inode %lu, flag %d, max_blocks %u,"
-		  "logical block %lu\n", inode->i_ino, flags, max_blocks,
-		  (unsigned long)block);
+	map->m_flags = 0;
+	ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u,"
+		  "logical block %lu\n", inode->i_ino, flags, map->m_len,
+		  (unsigned long) map->m_lblk);
 	/*
 	 * Try to see if we can get the block without requesting a new
 	 * file system block.
 	 */
 	down_read((&EXT4_I(inode)->i_data_sem));
-	if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
-		retval =  ext4_ext_get_blocks(handle, inode, block, max_blocks,
-				bh, 0);
+	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+		retval = ext4_ext_map_blocks(handle, inode, map, 0);
 	} else {
-		retval = ext4_ind_get_blocks(handle, inode, block, max_blocks,
-					     bh, 0);
+		retval = ext4_ind_map_blocks(handle, inode, map, 0);
 	}
 	up_read((&EXT4_I(inode)->i_data_sem));
 
-	if (retval > 0 && buffer_mapped(bh)) {
-		int ret = check_block_validity(inode, "file system corruption",
-					       block, bh->b_blocknr, retval);
+	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
+		int ret = check_block_validity(inode, __func__, map);
 		if (ret != 0)
 			return ret;
 	}
@@ -1258,7 +1260,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
 	 * ext4_ext_get_block() returns th create = 0
 	 * with buffer head unmapped.
 	 */
-	if (retval > 0 && buffer_mapped(bh))
+	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
 		return retval;
 
 	/*
@@ -1271,7 +1273,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
 	 * of BH_Unwritten and BH_Mapped flags being simultaneously
 	 * set on the buffer_head.
 	 */
-	clear_buffer_unwritten(bh);
+	map->m_flags &= ~EXT4_MAP_UNWRITTEN;
 
 	/*
 	 * New blocks allocate and/or writing to uninitialized extent
@@ -1293,20 +1295,18 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
 	 * We need to check for EXT4 here because migrate
 	 * could have changed the inode type in between
 	 */
-	if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
-		retval =  ext4_ext_get_blocks(handle, inode, block, max_blocks,
-					      bh, flags);
+	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+		retval = ext4_ext_map_blocks(handle, inode, map, flags);
 	} else {
-		retval = ext4_ind_get_blocks(handle, inode, block,
-					     max_blocks, bh, flags);
+		retval = ext4_ind_map_blocks(handle, inode, map, flags);
 
-		if (retval > 0 && buffer_new(bh)) {
+		if (retval > 0 && map->m_flags & EXT4_MAP_NEW) {
 			/*
 			 * We allocated new blocks which will result in
 			 * i_data's format changing.  Force the migrate
 			 * to fail by clearing migrate flags
 			 */
-			EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE;
+			ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
 		}
 
 		/*
@@ -1323,10 +1323,10 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
 		EXT4_I(inode)->i_delalloc_reserved_flag = 0;
 
 	up_write((&EXT4_I(inode)->i_data_sem));
-	if (retval > 0 && buffer_mapped(bh)) {
-		int ret = check_block_validity(inode, "file system "
-					       "corruption after allocation",
-					       block, bh->b_blocknr, retval);
+	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
+		int ret = check_block_validity(inode,
+					       "ext4_map_blocks_after_alloc",
+					       map);
 		if (ret != 0)
 			return ret;
 	}
@@ -1336,109 +1336,109 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
 /* Maximum number of blocks we map for direct IO at once. */
 #define DIO_MAX_BLOCKS 4096
 
-int ext4_get_block(struct inode *inode, sector_t iblock,
-		   struct buffer_head *bh_result, int create)
+static int _ext4_get_block(struct inode *inode, sector_t iblock,
+			   struct buffer_head *bh, int flags)
 {
 	handle_t *handle = ext4_journal_current_handle();
+	struct ext4_map_blocks map;
 	int ret = 0, started = 0;
-	unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
 	int dio_credits;
 
-	if (create && !handle) {
+	map.m_lblk = iblock;
+	map.m_len = bh->b_size >> inode->i_blkbits;
+
+	if (flags && !handle) {
 		/* Direct IO write... */
-		if (max_blocks > DIO_MAX_BLOCKS)
-			max_blocks = DIO_MAX_BLOCKS;
-		dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
+		if (map.m_len > DIO_MAX_BLOCKS)
+			map.m_len = DIO_MAX_BLOCKS;
+		dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
 		handle = ext4_journal_start(inode, dio_credits);
 		if (IS_ERR(handle)) {
 			ret = PTR_ERR(handle);
-			goto out;
+			return ret;
 		}
 		started = 1;
 	}
 
-	ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
-			      create ? EXT4_GET_BLOCKS_CREATE : 0);
+	ret = ext4_map_blocks(handle, inode, &map, flags);
 	if (ret > 0) {
-		bh_result->b_size = (ret << inode->i_blkbits);
+		map_bh(bh, inode->i_sb, map.m_pblk);
+		bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
+		bh->b_size = inode->i_sb->s_blocksize * map.m_len;
 		ret = 0;
 	}
 	if (started)
 		ext4_journal_stop(handle);
-out:
 	return ret;
 }
 
+int ext4_get_block(struct inode *inode, sector_t iblock,
+		   struct buffer_head *bh, int create)
+{
+	return _ext4_get_block(inode, iblock, bh,
+			       create ? EXT4_GET_BLOCKS_CREATE : 0);
+}
+
 /*
  * `handle' can be NULL if create is zero
  */
 struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
 				ext4_lblk_t block, int create, int *errp)
 {
-	struct buffer_head dummy;
+	struct ext4_map_blocks map;
+	struct buffer_head *bh;
 	int fatal = 0, err;
-	int flags = 0;
 
 	J_ASSERT(handle != NULL || create == 0);
 
-	dummy.b_state = 0;
-	dummy.b_blocknr = -1000;
-	buffer_trace_init(&dummy.b_history);
-	if (create)
-		flags |= EXT4_GET_BLOCKS_CREATE;
-	err = ext4_get_blocks(handle, inode, block, 1, &dummy, flags);
-	/*
-	 * ext4_get_blocks() returns number of blocks mapped. 0 in
-	 * case of a HOLE.
-	 */
-	if (err > 0) {
-		if (err > 1)
-			WARN_ON(1);
-		err = 0;
+	map.m_lblk = block;
+	map.m_len = 1;
+	err = ext4_map_blocks(handle, inode, &map,
+			      create ? EXT4_GET_BLOCKS_CREATE : 0);
+
+	if (err < 0)
+		*errp = err;
+	if (err <= 0)
+		return NULL;
+	*errp = 0;
+
+	bh = sb_getblk(inode->i_sb, map.m_pblk);
+	if (!bh) {
+		*errp = -EIO;
+		return NULL;
 	}
-	*errp = err;
-	if (!err && buffer_mapped(&dummy)) {
-		struct buffer_head *bh;
-		bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
-		if (!bh) {
-			*errp = -EIO;
-			goto err;
-		}
-		if (buffer_new(&dummy)) {
-			J_ASSERT(create != 0);
-			J_ASSERT(handle != NULL);
+	if (map.m_flags & EXT4_MAP_NEW) {
+		J_ASSERT(create != 0);
+		J_ASSERT(handle != NULL);
 
-			/*
-			 * Now that we do not always journal data, we should
-			 * keep in mind whether this should always journal the
-			 * new buffer as metadata.  For now, regular file
-			 * writes use ext4_get_block instead, so it's not a
-			 * problem.
-			 */
-			lock_buffer(bh);
-			BUFFER_TRACE(bh, "call get_create_access");
-			fatal = ext4_journal_get_create_access(handle, bh);
-			if (!fatal && !buffer_uptodate(bh)) {
-				memset(bh->b_data, 0, inode->i_sb->s_blocksize);
-				set_buffer_uptodate(bh);
-			}
-			unlock_buffer(bh);
-			BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
-			err = ext4_handle_dirty_metadata(handle, inode, bh);
-			if (!fatal)
-				fatal = err;
-		} else {
-			BUFFER_TRACE(bh, "not a new buffer");
-		}
-		if (fatal) {
-			*errp = fatal;
-			brelse(bh);
-			bh = NULL;
+		/*
+		 * Now that we do not always journal data, we should
+		 * keep in mind whether this should always journal the
+		 * new buffer as metadata.  For now, regular file
+		 * writes use ext4_get_block instead, so it's not a
+		 * problem.
+		 */
+		lock_buffer(bh);
+		BUFFER_TRACE(bh, "call get_create_access");
+		fatal = ext4_journal_get_create_access(handle, bh);
+		if (!fatal && !buffer_uptodate(bh)) {
+			memset(bh->b_data, 0, inode->i_sb->s_blocksize);
+			set_buffer_uptodate(bh);
 		}
-		return bh;
+		unlock_buffer(bh);
+		BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+		err = ext4_handle_dirty_metadata(handle, inode, bh);
+		if (!fatal)
+			fatal = err;
+	} else {
+		BUFFER_TRACE(bh, "not a new buffer");
 	}
-err:
-	return NULL;
+	if (fatal) {
+		*errp = fatal;
+		brelse(bh);
+		bh = NULL;
+	}
+	return bh;
 }
 
 struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
@@ -1534,6 +1534,8 @@ static void ext4_truncate_failed_write(struct inode *inode)
 	ext4_truncate(inode);
 }
 
+static int ext4_get_block_write(struct inode *inode, sector_t iblock,
+		   struct buffer_head *bh_result, int create);
 static int ext4_write_begin(struct file *file, struct address_space *mapping,
 			    loff_t pos, unsigned len, unsigned flags,
 			    struct page **pagep, void **fsdata)
@@ -1575,8 +1577,12 @@ retry:
 	}
 	*pagep = page;
 
-	ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
-				ext4_get_block);
+	if (ext4_should_dioread_nolock(inode))
+		ret = block_write_begin(file, mapping, pos, len, flags, pagep,
+				fsdata, ext4_get_block_write);
+	else
+		ret = block_write_begin(file, mapping, pos, len, flags, pagep,
+				fsdata, ext4_get_block);
 
 	if (!ret && ext4_should_journal_data(inode)) {
 		ret = walk_page_buffers(handle, page_buffers(page),
@@ -1793,7 +1799,7 @@ static int ext4_journalled_write_end(struct file *file,
 	new_i_size = pos + copied;
 	if (new_i_size > inode->i_size)
 		i_size_write(inode, pos+copied);
-	EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
+	ext4_set_inode_state(inode, EXT4_STATE_JDATA);
 	if (new_i_size > EXT4_I(inode)->i_disksize) {
 		ext4_update_i_disksize(inode, new_i_size);
 		ret2 = ext4_mark_inode_dirty(handle, inode);
@@ -1835,7 +1841,8 @@ static int ext4_da_reserve_space(struct inode *inode, sector_t lblock)
 	int retries = 0;
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 	struct ext4_inode_info *ei = EXT4_I(inode);
-	unsigned long md_needed, md_reserved;
+	unsigned long md_needed;
+	int ret;
 
 	/*
 	 * recalculate the amount of metadata blocks to reserve
@@ -1844,20 +1851,24 @@ static int ext4_da_reserve_space(struct inode *inode, sector_t lblock)
 	 */
 repeat:
 	spin_lock(&ei->i_block_reservation_lock);
-	md_reserved = ei->i_reserved_meta_blocks;
 	md_needed = ext4_calc_metadata_amount(inode, lblock);
+	trace_ext4_da_reserve_space(inode, md_needed);
 	spin_unlock(&ei->i_block_reservation_lock);
 
 	/*
-	 * Make quota reservation here to prevent quota overflow
-	 * later. Real quota accounting is done at pages writeout
-	 * time.
+	 * We will charge metadata quota at writeout time; this saves
+	 * us from metadata over-estimation, though we may go over by
+	 * a small amount in the end.  Here we just reserve for data.
+	 */
+	ret = dquot_reserve_block(inode, 1);
+	if (ret)
+		return ret;
+	/*
+	 * We do still charge estimated metadata to the sb though;
+	 * we cannot afford to run out of free blocks.
 	 */
-	if (vfs_dq_reserve_block(inode, md_needed + 1))
-		return -EDQUOT;
-
 	if (ext4_claim_free_blocks(sbi, md_needed + 1)) {
-		vfs_dq_release_reservation_block(inode, md_needed + 1);
+		dquot_release_reservation_block(inode, 1);
 		if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
 			yield();
 			goto repeat;
@@ -1882,6 +1893,7 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
 
 	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
 
+	trace_ext4_da_release_space(inode, to_free);
 	if (unlikely(to_free > ei->i_reserved_data_blocks)) {
 		/*
 		 * if there aren't enough reserved blocks, then the
@@ -1904,17 +1916,18 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
 		 * only when we have written all of the delayed
 		 * allocation blocks.
 		 */
-		to_free += ei->i_reserved_meta_blocks;
+		percpu_counter_sub(&sbi->s_dirtyblocks_counter,
+				   ei->i_reserved_meta_blocks);
 		ei->i_reserved_meta_blocks = 0;
 		ei->i_da_metadata_calc_len = 0;
 	}
 
-	/* update fs dirty blocks counter */
+	/* update fs dirty data blocks counter */
 	percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free);
 
 	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
 
-	vfs_dq_release_reservation_block(inode, to_free);
+	dquot_release_reservation_block(inode, to_free);
 }
 
 static void ext4_da_page_release_reservation(struct page *page,
@@ -2014,28 +2027,23 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
 /*
  * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers
  *
- * @mpd->inode - inode to walk through
- * @exbh->b_blocknr - first block on a disk
- * @exbh->b_size - amount of space in bytes
- * @logical - first logical block to start assignment with
- *
  * the function goes through all passed space and put actual disk
  * block numbers into buffer heads, dropping BH_Delay and BH_Unwritten
  */
-static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
-				 struct buffer_head *exbh)
+static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd,
+				 struct ext4_map_blocks *map)
 {
 	struct inode *inode = mpd->inode;
 	struct address_space *mapping = inode->i_mapping;
-	int blocks = exbh->b_size >> inode->i_blkbits;
-	sector_t pblock = exbh->b_blocknr, cur_logical;
+	int blocks = map->m_len;
+	sector_t pblock = map->m_pblk, cur_logical;
 	struct buffer_head *head, *bh;
 	pgoff_t index, end;
 	struct pagevec pvec;
 	int nr_pages, i;
 
-	index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
-	end = (logical + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
+	index = map->m_lblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
+	end = (map->m_lblk + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
 	cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
 
 	pagevec_init(&pvec, 0);
@@ -2062,17 +2070,16 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
 
 			/* skip blocks out of the range */
 			do {
-				if (cur_logical >= logical)
+				if (cur_logical >= map->m_lblk)
 					break;
 				cur_logical++;
 			} while ((bh = bh->b_this_page) != head);
 
 			do {
-				if (cur_logical >= logical + blocks)
+				if (cur_logical >= map->m_lblk + blocks)
 					break;
 
-				if (buffer_delay(bh) ||
-						buffer_unwritten(bh)) {
+				if (buffer_delay(bh) || buffer_unwritten(bh)) {
 
 					BUG_ON(bh->b_bdev != inode->i_sb->s_bdev);
 
@@ -2091,6 +2098,8 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
 				} else if (buffer_mapped(bh))
 					BUG_ON(bh->b_blocknr != pblock);
 
+				if (map->m_flags & EXT4_MAP_UNINIT)
+					set_buffer_uninit(bh);
 				cur_logical++;
 				pblock++;
 			} while ((bh = bh->b_this_page) != head);
@@ -2100,21 +2109,6 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
 }
 
 
-/*
- * __unmap_underlying_blocks - just a helper function to unmap
- * set of blocks described by @bh
- */
-static inline void __unmap_underlying_blocks(struct inode *inode,
-					     struct buffer_head *bh)
-{
-	struct block_device *bdev = inode->i_sb->s_bdev;
-	int blocks, i;
-
-	blocks = bh->b_size >> inode->i_blkbits;
-	for (i = 0; i < blocks; i++)
-		unmap_underlying_metadata(bdev, bh->b_blocknr + i);
-}
-
 static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
 					sector_t logical, long blk_cnt)
 {
@@ -2133,17 +2127,16 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
 			break;
 		for (i = 0; i < nr_pages; i++) {
 			struct page *page = pvec.pages[i];
-			index = page->index;
-			if (index > end)
+			if (page->index > end)
 				break;
-			index++;
-
 			BUG_ON(!PageLocked(page));
 			BUG_ON(PageWriteback(page));
 			block_invalidatepage(page, 0);
 			ClearPageUptodate(page);
 			unlock_page(page);
 		}
+		index = pvec.pages[nr_pages - 1]->index + 1;
+		pagevec_release(&pvec);
 	}
 	return;
 }
@@ -2177,7 +2170,7 @@ static void ext4_print_free_blocks(struct inode *inode)
 static int mpage_da_map_blocks(struct mpage_da_data *mpd)
 {
 	int err, blks, get_blocks_flags;
-	struct buffer_head new;
+	struct ext4_map_blocks map;
 	sector_t next = mpd->b_blocknr;
 	unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
 	loff_t disksize = EXT4_I(mpd->inode)->i_disksize;
@@ -2218,13 +2211,15 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
 	 * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting
 	 * variables are updated after the blocks have been allocated.
 	 */
-	new.b_state = 0;
+	map.m_lblk = next;
+	map.m_len = max_blocks;
 	get_blocks_flags = EXT4_GET_BLOCKS_CREATE;
+	if (ext4_should_dioread_nolock(mpd->inode))
+		get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
 	if (mpd->b_state & (1 << BH_Delay))
 		get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
 
-	blks = ext4_get_blocks(handle, mpd->inode, next, max_blocks,
-			       &new, get_blocks_flags);
+	blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags);
 	if (blks < 0) {
 		err = blks;
 		/*
@@ -2251,7 +2246,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
 		ext4_msg(mpd->inode->i_sb, KERN_CRIT,
 			 "delayed block allocation failed for inode %lu at "
 			 "logical offset %llu with max blocks %zd with "
-			 "error %d\n", mpd->inode->i_ino,
+			 "error %d", mpd->inode->i_ino,
 			 (unsigned long long) next,
 			 mpd->b_size >> mpd->inode->i_blkbits, err);
 		printk(KERN_CRIT "This should not happen!!  "
@@ -2266,10 +2261,13 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
 	}
 	BUG_ON(blks == 0);
 
-	new.b_size = (blks << mpd->inode->i_blkbits);
+	if (map.m_flags & EXT4_MAP_NEW) {
+		struct block_device *bdev = mpd->inode->i_sb->s_bdev;
+		int i;
 
-	if (buffer_new(&new))
-		__unmap_underlying_blocks(mpd->inode, &new);
+		for (i = 0; i < map.m_len; i++)
+			unmap_underlying_metadata(bdev, map.m_pblk + i);
+	}
 
 	/*
 	 * If blocks are delayed marked, we need to
@@ -2277,7 +2275,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
 	 */
 	if ((mpd->b_state & (1 << BH_Delay)) ||
 	    (mpd->b_state & (1 << BH_Unwritten)))
-		mpage_put_bnr_to_bhs(mpd, next, &new);
+		mpage_put_bnr_to_bhs(mpd, &map);
 
 	if (ext4_should_order_data(mpd->inode)) {
 		err = ext4_jbd2_file_inode(handle, mpd->inode);
@@ -2318,8 +2316,17 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
 	sector_t next;
 	int nrblocks = mpd->b_size >> mpd->inode->i_blkbits;
 
+	/*
+	 * XXX Don't go larger than mballoc is willing to allocate
+	 * This is a stopgap solution.  We eventually need to fold
+	 * mpage_da_submit_io() into this function and then call
+	 * ext4_get_blocks() multiple times in a loop
+	 */
+	if (nrblocks >= 8*1024*1024/mpd->inode->i_sb->s_blocksize)
+		goto flush_it;
+
 	/* check if thereserved journal credits might overflow */
-	if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) {
+	if (!(ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS))) {
 		if (nrblocks >= EXT4_MAX_TRANS_DATA) {
 			/*
 			 * With non-extent format we are limited by the journal
@@ -2392,17 +2399,6 @@ static int __mpage_da_writepage(struct page *page,
 	struct buffer_head *bh, *head;
 	sector_t logical;
 
-	if (mpd->io_done) {
-		/*
-		 * Rest of the page in the page_vec
-		 * redirty then and skip then. We will
-		 * try to write them again after
-		 * starting a new transaction
-		 */
-		redirty_page_for_writepage(wbc, page);
-		unlock_page(page);
-		return MPAGE_DA_EXTENT_TAIL;
-	}
 	/*
 	 * Can we merge this page to current extent?
 	 */
@@ -2497,8 +2493,9 @@ static int __mpage_da_writepage(struct page *page,
  * initialized properly.
  */
 static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
-				  struct buffer_head *bh_result, int create)
+				  struct buffer_head *bh, int create)
 {
+	struct ext4_map_blocks map;
 	int ret = 0;
 	sector_t invalid_block = ~((sector_t) 0xffff);
 
@@ -2506,16 +2503,22 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
 		invalid_block = ~0;
 
 	BUG_ON(create == 0);
-	BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
+	BUG_ON(bh->b_size != inode->i_sb->s_blocksize);
+
+	map.m_lblk = iblock;
+	map.m_len = 1;
 
 	/*
 	 * first, we need to know whether the block is allocated already
 	 * preallocated blocks are unmapped but should treated
 	 * the same as allocated blocks.
 	 */
-	ret = ext4_get_blocks(NULL, inode, iblock, 1,  bh_result, 0);
-	if ((ret == 0) && !buffer_delay(bh_result)) {
-		/* the block isn't (pre)allocated yet, let's reserve space */
+	ret = ext4_map_blocks(NULL, inode, &map, 0);
+	if (ret < 0)
+		return ret;
+	if (ret == 0) {
+		if (buffer_delay(bh))
+			return 0; /* Not sure this could or should happen */
 		/*
 		 * XXX: __block_prepare_write() unmaps passed block,
 		 * is it OK?
@@ -2525,26 +2528,26 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
 			/* not enough space to reserve */
 			return ret;
 
-		map_bh(bh_result, inode->i_sb, invalid_block);
-		set_buffer_new(bh_result);
-		set_buffer_delay(bh_result);
-	} else if (ret > 0) {
-		bh_result->b_size = (ret << inode->i_blkbits);
-		if (buffer_unwritten(bh_result)) {
-			/* A delayed write to unwritten bh should
-			 * be marked new and mapped.  Mapped ensures
-			 * that we don't do get_block multiple times
-			 * when we write to the same offset and new
-			 * ensures that we do proper zero out for
-			 * partial write.
-			 */
-			set_buffer_new(bh_result);
-			set_buffer_mapped(bh_result);
-		}
-		ret = 0;
+		map_bh(bh, inode->i_sb, invalid_block);
+		set_buffer_new(bh);
+		set_buffer_delay(bh);
+		return 0;
 	}
 
-	return ret;
+	map_bh(bh, inode->i_sb, map.m_pblk);
+	bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
+
+	if (buffer_unwritten(bh)) {
+		/* A delayed write to unwritten bh should be marked
+		 * new and mapped.  Mapped ensures that we don't do
+		 * get_block multiple times when we write to the same
+		 * offset and new ensures that we do proper zero out
+		 * for partial write.
+		 */
+		set_buffer_new(bh);
+		set_buffer_mapped(bh);
+	}
+	return 0;
 }
 
 /*
@@ -2566,21 +2569,8 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
 static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
 				   struct buffer_head *bh_result, int create)
 {
-	int ret = 0;
-	unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
-
 	BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
-
-	/*
-	 * we don't want to do block allocation in writepage
-	 * so call get_block_wrap with create = 0
-	 */
-	ret = ext4_get_blocks(NULL, inode, iblock, max_blocks, bh_result, 0);
-	if (ret > 0) {
-		bh_result->b_size = (ret << inode->i_blkbits);
-		ret = 0;
-	}
-	return ret;
+	return _ext4_get_block(inode, iblock, bh_result, 0);
 }
 
 static int bget_one(handle_t *handle, struct buffer_head *bh)
@@ -2630,11 +2620,14 @@ static int __ext4_journalled_writepage(struct page *page,
 		ret = err;
 
 	walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one);
-	EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
+	ext4_set_inode_state(inode, EXT4_STATE_JDATA);
 out:
 	return ret;
 }
 
+static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
+static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
+
 /*
  * Note that we don't need to start a transaction unless we're journaling data
  * because we should have holes filled from ext4_page_mkwrite(). We even don't
@@ -2682,7 +2675,7 @@ static int ext4_writepage(struct page *page,
 	int ret = 0;
 	loff_t size;
 	unsigned int len;
-	struct buffer_head *page_bufs;
+	struct buffer_head *page_bufs = NULL;
 	struct inode *inode = page->mapping->host;
 
 	trace_ext4_writepage(inode, page);
@@ -2758,7 +2751,11 @@ static int ext4_writepage(struct page *page,
 
 	if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
 		ret = nobh_writepage(page, noalloc_get_block_write, wbc);
-	else
+	else if (page_bufs && buffer_uninit(page_bufs)) {
+		ext4_set_bh_endio(page_bufs, inode);
+		ret = block_write_full_page_endio(page, noalloc_get_block_write,
+					    wbc, ext4_end_io_buffer_write);
+	} else
 		ret = block_write_full_page(page, noalloc_get_block_write,
 					    wbc);
 
@@ -2783,13 +2780,131 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
 	 * number of contiguous block. So we will limit
 	 * number of contiguous block to a sane value
 	 */
-	if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) &&
+	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) &&
 	    (max_blocks > EXT4_MAX_TRANS_DATA))
 		max_blocks = EXT4_MAX_TRANS_DATA;
 
 	return ext4_chunk_trans_blocks(inode, max_blocks);
 }
 
+/*
+ * write_cache_pages_da - walk the list of dirty pages of the given
+ * address space and call the callback function (which usually writes
+ * the pages).
+ *
+ * This is a forked version of write_cache_pages().  Differences:
+ *	Range cyclic is ignored.
+ *	no_nrwrite_index_update is always presumed true
+ */
+static int write_cache_pages_da(struct address_space *mapping,
+				struct writeback_control *wbc,
+				struct mpage_da_data *mpd)
+{
+	int ret = 0;
+	int done = 0;
+	struct pagevec pvec;
+	int nr_pages;
+	pgoff_t index;
+	pgoff_t end;		/* Inclusive */
+	long nr_to_write = wbc->nr_to_write;
+
+	pagevec_init(&pvec, 0);
+	index = wbc->range_start >> PAGE_CACHE_SHIFT;
+	end = wbc->range_end >> PAGE_CACHE_SHIFT;
+
+	while (!done && (index <= end)) {
+		int i;
+
+		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+			      PAGECACHE_TAG_DIRTY,
+			      min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
+		if (nr_pages == 0)
+			break;
+
+		for (i = 0; i < nr_pages; i++) {
+			struct page *page = pvec.pages[i];
+
+			/*
+			 * At this point, the page may be truncated or
+			 * invalidated (changing page->mapping to NULL), or
+			 * even swizzled back from swapper_space to tmpfs file
+			 * mapping. However, page->index will not change
+			 * because we have a reference on the page.
+			 */
+			if (page->index > end) {
+				done = 1;
+				break;
+			}
+
+			lock_page(page);
+
+			/*
+			 * Page truncated or invalidated. We can freely skip it
+			 * then, even for data integrity operations: the page
+			 * has disappeared concurrently, so there could be no
+			 * real expectation of this data interity operation
+			 * even if there is now a new, dirty page at the same
+			 * pagecache address.
+			 */
+			if (unlikely(page->mapping != mapping)) {
+continue_unlock:
+				unlock_page(page);
+				continue;
+			}
+
+			if (!PageDirty(page)) {
+				/* someone wrote it for us */
+				goto continue_unlock;
+			}
+
+			if (PageWriteback(page)) {
+				if (wbc->sync_mode != WB_SYNC_NONE)
+					wait_on_page_writeback(page);
+				else
+					goto continue_unlock;
+			}
+
+			BUG_ON(PageWriteback(page));
+			if (!clear_page_dirty_for_io(page))
+				goto continue_unlock;
+
+			ret = __mpage_da_writepage(page, wbc, mpd);
+			if (unlikely(ret)) {
+				if (ret == AOP_WRITEPAGE_ACTIVATE) {
+					unlock_page(page);
+					ret = 0;
+				} else {
+					done = 1;
+					break;
+				}
+			}
+
+			if (nr_to_write > 0) {
+				nr_to_write--;
+				if (nr_to_write == 0 &&
+				    wbc->sync_mode == WB_SYNC_NONE) {
+					/*
+					 * We stop writing back only if we are
+					 * not doing integrity sync. In case of
+					 * integrity sync we have to keep going
+					 * because someone may be concurrently
+					 * dirtying pages, and we might have
+					 * synced a lot of newly appeared dirty
+					 * pages, but have not synced all of the
+					 * old dirty pages.
+					 */
+					done = 1;
+					break;
+				}
+			}
+		}
+		pagevec_release(&pvec);
+		cond_resched();
+	}
+	return ret;
+}
+
+
 static int ext4_da_writepages(struct address_space *mapping,
 			      struct writeback_control *wbc)
 {
@@ -2798,7 +2913,6 @@ static int ext4_da_writepages(struct address_space *mapping,
 	handle_t *handle = NULL;
 	struct mpage_da_data mpd;
 	struct inode *inode = mapping->host;
-	int no_nrwrite_index_update;
 	int pages_written = 0;
 	long pages_skipped;
 	unsigned int max_pages;
@@ -2878,12 +2992,6 @@ static int ext4_da_writepages(struct address_space *mapping,
 	mpd.wbc = wbc;
 	mpd.inode = mapping->host;
 
-	/*
-	 * we don't want write_cache_pages to update
-	 * nr_to_write and writeback_index
-	 */
-	no_nrwrite_index_update = wbc->no_nrwrite_index_update;
-	wbc->no_nrwrite_index_update = 1;
 	pages_skipped = wbc->pages_skipped;
 
 retry:
@@ -2903,7 +3011,7 @@ retry:
 		if (IS_ERR(handle)) {
 			ret = PTR_ERR(handle);
 			ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
-			       "%ld pages, ino %lu; err %d\n", __func__,
+			       "%ld pages, ino %lu; err %d", __func__,
 				wbc->nr_to_write, inode->i_ino, ret);
 			goto out_writepages;
 		}
@@ -2925,8 +3033,7 @@ retry:
 		mpd.io_done = 0;
 		mpd.pages_written = 0;
 		mpd.retval = 0;
-		ret = write_cache_pages(mapping, wbc, __mpage_da_writepage,
-					&mpd);
+		ret = write_cache_pages_da(mapping, wbc, &mpd);
 		/*
 		 * If we have a contiguous extent of pages and we
 		 * haven't done the I/O yet, map the blocks and submit
@@ -2978,7 +3085,7 @@ retry:
 	if (pages_skipped != wbc->pages_skipped)
 		ext4_msg(inode->i_sb, KERN_CRIT,
 			 "This should not happen leaving %s "
-			 "with nr_to_write = %ld ret = %d\n",
+			 "with nr_to_write = %ld ret = %d",
 			 __func__, wbc->nr_to_write, ret);
 
 	/* Update index */
@@ -2992,8 +3099,6 @@ retry:
 		mapping->writeback_index = index;
 
 out_writepages:
-	if (!no_nrwrite_index_update)
-		wbc->no_nrwrite_index_update = 0;
 	wbc->nr_to_write -= nr_to_writebump;
 	wbc->range_start = range_start;
 	trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
@@ -3038,7 +3143,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
 			       loff_t pos, unsigned len, unsigned flags,
 			       struct page **pagep, void **fsdata)
 {
-	int ret, retries = 0, quota_retries = 0;
+	int ret, retries = 0;
 	struct page *page;
 	pgoff_t index;
 	unsigned from, to;
@@ -3097,22 +3202,6 @@ retry:
 
 	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
 		goto retry;
-
-	if ((ret == -EDQUOT) &&
-	    EXT4_I(inode)->i_reserved_meta_blocks &&
-	    (quota_retries++ < 3)) {
-		/*
-		 * Since we often over-estimate the number of meta
-		 * data blocks required, we may sometimes get a
-		 * spurios out of quota error even though there would
-		 * be enough space once we write the data blocks and
-		 * find out how many meta data blocks were _really_
-		 * required.  So try forcing the inode write to see if
-		 * that helps.
-		 */
-		write_inode_now(inode, (quota_retries == 3));
-		goto retry;
-	}
 out:
 	return ret;
 }
@@ -3301,7 +3390,8 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
 		filemap_write_and_wait(mapping);
 	}
 
-	if (EXT4_JOURNAL(inode) && EXT4_I(inode)->i_state & EXT4_STATE_JDATA) {
+	if (EXT4_JOURNAL(inode) &&
+	    ext4_test_inode_state(inode, EXT4_STATE_JDATA)) {
 		/*
 		 * This is a REALLY heavyweight approach, but the use of
 		 * bmap on dirty files is expected to be extremely rare:
@@ -3320,7 +3410,7 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
 		 * everything they get.
 		 */
 
-		EXT4_I(inode)->i_state &= ~EXT4_STATE_JDATA;
+		ext4_clear_inode_state(inode, EXT4_STATE_JDATA);
 		journal = EXT4_JOURNAL(inode);
 		jbd2_journal_lock_updates(journal);
 		err = jbd2_journal_flush(journal);
@@ -3345,11 +3435,45 @@ ext4_readpages(struct file *file, struct address_space *mapping,
 	return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
 }
 
+static void ext4_free_io_end(ext4_io_end_t *io)
+{
+	BUG_ON(!io);
+	if (io->page)
+		put_page(io->page);
+	iput(io->inode);
+	kfree(io);
+}
+
+static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset)
+{
+	struct buffer_head *head, *bh;
+	unsigned int curr_off = 0;
+
+	if (!page_has_buffers(page))
+		return;
+	head = bh = page_buffers(page);
+	do {
+		if (offset <= curr_off && test_clear_buffer_uninit(bh)
+					&& bh->b_private) {
+			ext4_free_io_end(bh->b_private);
+			bh->b_private = NULL;
+			bh->b_end_io = NULL;
+		}
+		curr_off = curr_off + bh->b_size;
+		bh = bh->b_this_page;
+	} while (bh != head);
+}
+
 static void ext4_invalidatepage(struct page *page, unsigned long offset)
 {
 	journal_t *journal = EXT4_JOURNAL(page->mapping->host);
 
 	/*
+	 * free any io_end structure allocated for buffers to be discarded
+	 */
+	if (ext4_should_dioread_nolock(page->mapping->host))
+		ext4_invalidatepage_free_endio(page, offset);
+	/*
 	 * If it's a full truncate we just forget about the pending dirtying
 	 */
 	if (offset == 0)
@@ -3420,7 +3544,14 @@ static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
 	}
 
 retry:
-	ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
+	if (rw == READ && ext4_should_dioread_nolock(inode))
+		ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
+				 inode->i_sb->s_bdev, iov,
+				 offset, nr_segs,
+				 ext4_get_block, NULL);
+	else
+		ret = blockdev_direct_IO(rw, iocb, inode,
+				 inode->i_sb->s_bdev, iov,
 				 offset, nr_segs,
 				 ext4_get_block, NULL);
 	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
@@ -3436,6 +3567,9 @@ retry:
 			 * but cannot extend i_size. Bail out and pretend
 			 * the write failed... */
 			ret = PTR_ERR(handle);
+			if (inode->i_nlink)
+				ext4_orphan_del(NULL, inode);
+
 			goto out;
 		}
 		if (inode->i_nlink)
@@ -3463,75 +3597,35 @@ out:
 	return ret;
 }
 
-static int ext4_get_block_dio_write(struct inode *inode, sector_t iblock,
+/*
+ * ext4_get_block used when preparing for a DIO write or buffer write.
+ * We allocate an uinitialized extent if blocks haven't been allocated.
+ * The extent will be converted to initialized after the IO is complete.
+ */
+static int ext4_get_block_write(struct inode *inode, sector_t iblock,
 		   struct buffer_head *bh_result, int create)
 {
-	handle_t *handle = NULL;
-	int ret = 0;
-	unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
-	int dio_credits;
-
-	ext4_debug("ext4_get_block_dio_write: inode %lu, create flag %d\n",
+	ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n",
 		   inode->i_ino, create);
-	/*
-	 * DIO VFS code passes create = 0 flag for write to
-	 * the middle of file. It does this to avoid block
-	 * allocation for holes, to prevent expose stale data
-	 * out when there is parallel buffered read (which does
-	 * not hold the i_mutex lock) while direct IO write has
-	 * not completed. DIO request on holes finally falls back
-	 * to buffered IO for this reason.
-	 *
-	 * For ext4 extent based file, since we support fallocate,
-	 * new allocated extent as uninitialized, for holes, we
-	 * could fallocate blocks for holes, thus parallel
-	 * buffered IO read will zero out the page when read on
-	 * a hole while parallel DIO write to the hole has not completed.
-	 *
-	 * when we come here, we know it's a direct IO write to
-	 * to the middle of file (<i_size)
-	 * so it's safe to override the create flag from VFS.
-	 */
-	create = EXT4_GET_BLOCKS_DIO_CREATE_EXT;
-
-	if (max_blocks > DIO_MAX_BLOCKS)
-		max_blocks = DIO_MAX_BLOCKS;
-	dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
-	handle = ext4_journal_start(inode, dio_credits);
-	if (IS_ERR(handle)) {
-		ret = PTR_ERR(handle);
-		goto out;
-	}
-	ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
-			      create);
-	if (ret > 0) {
-		bh_result->b_size = (ret << inode->i_blkbits);
-		ret = 0;
-	}
-	ext4_journal_stop(handle);
-out:
-	return ret;
+	return _ext4_get_block(inode, iblock, bh_result,
+			       EXT4_GET_BLOCKS_IO_CREATE_EXT);
 }
 
-static void ext4_free_io_end(ext4_io_end_t *io)
-{
-	BUG_ON(!io);
-	iput(io->inode);
-	kfree(io);
-}
-static void dump_aio_dio_list(struct inode * inode)
+static void dump_completed_IO(struct inode * inode)
 {
 #ifdef	EXT4_DEBUG
 	struct list_head *cur, *before, *after;
 	ext4_io_end_t *io, *io0, *io1;
+	unsigned long flags;
 
-	if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){
-		ext4_debug("inode %lu aio dio list is empty\n", inode->i_ino);
+	if (list_empty(&EXT4_I(inode)->i_completed_io_list)){
+		ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino);
 		return;
 	}
 
-	ext4_debug("Dump inode %lu aio_dio_completed_IO list \n", inode->i_ino);
-	list_for_each_entry(io, &EXT4_I(inode)->i_aio_dio_complete_list, list){
+	ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino);
+	spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
+	list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){
 		cur = &io->list;
 		before = cur->prev;
 		io0 = container_of(before, ext4_io_end_t, list);
@@ -3541,32 +3635,31 @@ static void dump_aio_dio_list(struct inode * inode)
 		ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
 			    io, inode->i_ino, io0, io1);
 	}
+	spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
 #endif
 }
 
 /*
  * check a range of space and convert unwritten extents to written.
  */
-static int ext4_end_aio_dio_nolock(ext4_io_end_t *io)
+static int ext4_end_io_nolock(ext4_io_end_t *io)
 {
 	struct inode *inode = io->inode;
 	loff_t offset = io->offset;
-	size_t size = io->size;
+	ssize_t size = io->size;
 	int ret = 0;
 
-	ext4_debug("end_aio_dio_onlock: io 0x%p from inode %lu,list->next 0x%p,"
+	ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
 		   "list->prev 0x%p\n",
 	           io, inode->i_ino, io->list.next, io->list.prev);
 
 	if (list_empty(&io->list))
 		return ret;
 
-	if (io->flag != DIO_AIO_UNWRITTEN)
+	if (io->flag != EXT4_IO_UNWRITTEN)
 		return ret;
 
-	if (offset + size <= i_size_read(inode))
-		ret = ext4_convert_unwritten_extents(inode, offset, size);
-
+	ret = ext4_convert_unwritten_extents(inode, offset, size);
 	if (ret < 0) {
 		printk(KERN_EMERG "%s: failed to convert unwritten"
 			"extents to written extents, error is %d"
@@ -3579,50 +3672,64 @@ static int ext4_end_aio_dio_nolock(ext4_io_end_t *io)
 	io->flag = 0;
 	return ret;
 }
+
 /*
  * work on completed aio dio IO, to convert unwritten extents to extents
  */
-static void ext4_end_aio_dio_work(struct work_struct *work)
+static void ext4_end_io_work(struct work_struct *work)
 {
-	ext4_io_end_t *io  = container_of(work, ext4_io_end_t, work);
-	struct inode *inode = io->inode;
-	int ret = 0;
+	ext4_io_end_t		*io = container_of(work, ext4_io_end_t, work);
+	struct inode		*inode = io->inode;
+	struct ext4_inode_info	*ei = EXT4_I(inode);
+	unsigned long		flags;
+	int			ret;
 
 	mutex_lock(&inode->i_mutex);
-	ret = ext4_end_aio_dio_nolock(io);
-	if (ret >= 0) {
-		if (!list_empty(&io->list))
-			list_del_init(&io->list);
-		ext4_free_io_end(io);
+	ret = ext4_end_io_nolock(io);
+	if (ret < 0) {
+		mutex_unlock(&inode->i_mutex);
+		return;
 	}
+
+	spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+	if (!list_empty(&io->list))
+		list_del_init(&io->list);
+	spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
 	mutex_unlock(&inode->i_mutex);
+	ext4_free_io_end(io);
 }
+
 /*
  * This function is called from ext4_sync_file().
  *
- * When AIO DIO IO is completed, the work to convert unwritten
- * extents to written is queued on workqueue but may not get immediately
+ * When IO is completed, the work to convert unwritten extents to
+ * written is queued on workqueue but may not get immediately
  * scheduled. When fsync is called, we need to ensure the
  * conversion is complete before fsync returns.
- * The inode keeps track of a list of completed AIO from DIO path
- * that might needs to do the conversion. This function walks through
- * the list and convert the related unwritten extents to written.
+ * The inode keeps track of a list of pending/completed IO that
+ * might needs to do the conversion. This function walks through
+ * the list and convert the related unwritten extents for completed IO
+ * to written.
+ * The function return the number of pending IOs on success.
  */
-int flush_aio_dio_completed_IO(struct inode *inode)
+int flush_completed_IO(struct inode *inode)
 {
 	ext4_io_end_t *io;
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	unsigned long flags;
 	int ret = 0;
 	int ret2 = 0;
 
-	if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list))
+	if (list_empty(&ei->i_completed_io_list))
 		return ret;
 
-	dump_aio_dio_list(inode);
-	while (!list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){
-		io = list_entry(EXT4_I(inode)->i_aio_dio_complete_list.next,
+	dump_completed_IO(inode);
+	spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+	while (!list_empty(&ei->i_completed_io_list)){
+		io = list_entry(ei->i_completed_io_list.next,
 				ext4_io_end_t, list);
 		/*
-		 * Calling ext4_end_aio_dio_nolock() to convert completed
+		 * Calling ext4_end_io_nolock() to convert completed
 		 * IO to written.
 		 *
 		 * When ext4_sync_file() is called, run_queue() may already
@@ -3635,20 +3742,23 @@ int flush_aio_dio_completed_IO(struct inode *inode)
 		 * avoid double converting from both fsync and background work
 		 * queue work.
 		 */
-		ret = ext4_end_aio_dio_nolock(io);
+		spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+		ret = ext4_end_io_nolock(io);
+		spin_lock_irqsave(&ei->i_completed_io_lock, flags);
 		if (ret < 0)
 			ret2 = ret;
 		else
 			list_del_init(&io->list);
 	}
+	spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
 	return (ret2 < 0) ? ret2 : 0;
 }
 
-static ext4_io_end_t *ext4_init_io_end (struct inode *inode)
+static ext4_io_end_t *ext4_init_io_end (struct inode *inode, gfp_t flags)
 {
 	ext4_io_end_t *io = NULL;
 
-	io = kmalloc(sizeof(*io), GFP_NOFS);
+	io = kmalloc(sizeof(*io), flags);
 
 	if (io) {
 		igrab(inode);
@@ -3656,8 +3766,8 @@ static ext4_io_end_t *ext4_init_io_end (struct inode *inode)
 		io->flag = 0;
 		io->offset = 0;
 		io->size = 0;
-		io->error = 0;
-		INIT_WORK(&io->work, ext4_end_aio_dio_work);
+		io->page = NULL;
+		INIT_WORK(&io->work, ext4_end_io_work);
 		INIT_LIST_HEAD(&io->list);
 	}
 
@@ -3669,6 +3779,8 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
 {
         ext4_io_end_t *io_end = iocb->private;
 	struct workqueue_struct *wq;
+	unsigned long flags;
+	struct ext4_inode_info *ei;
 
 	/* if not async direct IO or dio with 0 bytes write, just return */
 	if (!io_end || !size)
@@ -3680,7 +3792,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
 		  size);
 
 	/* if not aio dio with unwritten extents, just free io and return */
-	if (io_end->flag != DIO_AIO_UNWRITTEN){
+	if (io_end->flag != EXT4_IO_UNWRITTEN){
 		ext4_free_io_end(io_end);
 		iocb->private = NULL;
 		return;
@@ -3688,16 +3800,85 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
 
 	io_end->offset = offset;
 	io_end->size = size;
+	io_end->flag = EXT4_IO_UNWRITTEN;
 	wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
 
 	/* queue the work to convert unwritten extents to written */
 	queue_work(wq, &io_end->work);
 
 	/* Add the io_end to per-inode completed aio dio list*/
-	list_add_tail(&io_end->list,
-		 &EXT4_I(io_end->inode)->i_aio_dio_complete_list);
+	ei = EXT4_I(io_end->inode);
+	spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+	list_add_tail(&io_end->list, &ei->i_completed_io_list);
+	spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
 	iocb->private = NULL;
 }
+
+static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
+{
+	ext4_io_end_t *io_end = bh->b_private;
+	struct workqueue_struct *wq;
+	struct inode *inode;
+	unsigned long flags;
+
+	if (!test_clear_buffer_uninit(bh) || !io_end)
+		goto out;
+
+	if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) {
+		printk("sb umounted, discard end_io request for inode %lu\n",
+			io_end->inode->i_ino);
+		ext4_free_io_end(io_end);
+		goto out;
+	}
+
+	io_end->flag = EXT4_IO_UNWRITTEN;
+	inode = io_end->inode;
+
+	/* Add the io_end to per-inode completed io list*/
+	spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
+	list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
+	spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
+
+	wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
+	/* queue the work to convert unwritten extents to written */
+	queue_work(wq, &io_end->work);
+out:
+	bh->b_private = NULL;
+	bh->b_end_io = NULL;
+	clear_buffer_uninit(bh);
+	end_buffer_async_write(bh, uptodate);
+}
+
+static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode)
+{
+	ext4_io_end_t *io_end;
+	struct page *page = bh->b_page;
+	loff_t offset = (sector_t)page->index << PAGE_CACHE_SHIFT;
+	size_t size = bh->b_size;
+
+retry:
+	io_end = ext4_init_io_end(inode, GFP_ATOMIC);
+	if (!io_end) {
+		if (printk_ratelimit())
+			printk(KERN_WARNING "%s: allocation fail\n", __func__);
+		schedule();
+		goto retry;
+	}
+	io_end->offset = offset;
+	io_end->size = size;
+	/*
+	 * We need to hold a reference to the page to make sure it
+	 * doesn't get evicted before ext4_end_io_work() has a chance
+	 * to convert the extent from written to unwritten.
+	 */
+	io_end->page = page;
+	get_page(io_end->page);
+
+	bh->b_private = io_end;
+	bh->b_end_io = ext4_end_io_buffer_write;
+	return 0;
+}
+
 /*
  * For ext4 extent files, ext4 will do direct-io write to holes,
  * preallocated extents, and those write extend the file, no need to
@@ -3751,7 +3932,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
 		iocb->private = NULL;
 		EXT4_I(inode)->cur_aio_dio = NULL;
 		if (!is_sync_kiocb(iocb)) {
-			iocb->private = ext4_init_io_end(inode);
+			iocb->private = ext4_init_io_end(inode, GFP_NOFS);
 			if (!iocb->private)
 				return -ENOMEM;
 			/*
@@ -3767,7 +3948,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
 		ret = blockdev_direct_IO(rw, iocb, inode,
 					 inode->i_sb->s_bdev, iov,
 					 offset, nr_segs,
-					 ext4_get_block_dio_write,
+					 ext4_get_block_write,
 					 ext4_end_io_dio);
 		if (iocb->private)
 			EXT4_I(inode)->cur_aio_dio = NULL;
@@ -3788,8 +3969,8 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
 		if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
 			ext4_free_io_end(iocb->private);
 			iocb->private = NULL;
-		} else if (ret > 0 && (EXT4_I(inode)->i_state &
-				       EXT4_STATE_DIO_UNWRITTEN)) {
+		} else if (ret > 0 && ext4_test_inode_state(inode,
+						EXT4_STATE_DIO_UNWRITTEN)) {
 			int err;
 			/*
 			 * for non AIO case, since the IO is already
@@ -3799,7 +3980,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
 							     offset, ret);
 			if (err < 0)
 				ret = err;
-			EXT4_I(inode)->i_state &= ~EXT4_STATE_DIO_UNWRITTEN;
+			ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
 		}
 		return ret;
 	}
@@ -3815,7 +3996,7 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_mapping->host;
 
-	if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
+	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
 		return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
 
 	return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
@@ -4130,18 +4311,26 @@ no_top:
  * We release `count' blocks on disk, but (last - first) may be greater
  * than `count' because there can be holes in there.
  */
-static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
-			      struct buffer_head *bh,
-			      ext4_fsblk_t block_to_free,
-			      unsigned long count, __le32 *first,
-			      __le32 *last)
+static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
+			     struct buffer_head *bh,
+			     ext4_fsblk_t block_to_free,
+			     unsigned long count, __le32 *first,
+			     __le32 *last)
 {
 	__le32 *p;
-	int	flags = EXT4_FREE_BLOCKS_FORGET;
+	int	flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED;
 
 	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
 		flags |= EXT4_FREE_BLOCKS_METADATA;
 
+	if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free,
+				   count)) {
+		EXT4_ERROR_INODE(inode, "attempt to clear invalid "
+				 "blocks %llu len %lu",
+				 (unsigned long long) block_to_free, count);
+		return 1;
+	}
+
 	if (try_to_extend_transaction(handle, inode)) {
 		if (bh) {
 			BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
@@ -4160,6 +4349,7 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
 		*p = 0;
 
 	ext4_free_blocks(handle, inode, 0, block_to_free, count, flags);
+	return 0;
 }
 
 /**
@@ -4215,9 +4405,10 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
 			} else if (nr == block_to_free + count) {
 				count++;
 			} else {
-				ext4_clear_blocks(handle, inode, this_bh,
-						  block_to_free,
-						  count, block_to_free_p, p);
+				if (ext4_clear_blocks(handle, inode, this_bh,
+						      block_to_free, count,
+						      block_to_free_p, p))
+					break;
 				block_to_free = nr;
 				block_to_free_p = p;
 				count = 1;
@@ -4241,11 +4432,10 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
 		if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh))
 			ext4_handle_dirty_metadata(handle, inode, this_bh);
 		else
-			ext4_error(inode->i_sb, __func__,
-				   "circular indirect block detected, "
-				   "inode=%lu, block=%llu",
-				   inode->i_ino,
-				   (unsigned long long) this_bh->b_blocknr);
+			EXT4_ERROR_INODE(inode,
+					 "circular indirect block detected at "
+					 "block %llu",
+				(unsigned long long) this_bh->b_blocknr);
 	}
 }
 
@@ -4281,6 +4471,15 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
 			if (!nr)
 				continue;		/* A hole */
 
+			if (!ext4_data_block_valid(EXT4_SB(inode->i_sb),
+						   nr, 1)) {
+				EXT4_ERROR_INODE(inode,
+						 "invalid indirect mapped "
+						 "block %lu (level %d)",
+						 (unsigned long) nr, depth);
+				break;
+			}
+
 			/* Go read the buffer for the next level down */
 			bh = sb_bread(inode->i_sb, nr);
 
@@ -4289,9 +4488,9 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
 			 * (should be rare).
 			 */
 			if (!bh) {
-				ext4_error(inode->i_sb, "ext4_free_branches",
-					   "Read failure, inode=%lu, block=%llu",
-					   inode->i_ino, nr);
+				EXT4_ERROR_INODE(inode,
+						 "Read failure block=%llu",
+						 (unsigned long long) nr);
 				continue;
 			}
 
@@ -4433,10 +4632,12 @@ void ext4_truncate(struct inode *inode)
 	if (!ext4_can_truncate(inode))
 		return;
 
+	ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
+
 	if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
-		ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE;
+		ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
 
-	if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
+	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
 		ext4_ext_truncate(inode);
 		return;
 	}
@@ -4604,9 +4805,8 @@ static int __ext4_get_inode_loc(struct inode *inode,
 
 	bh = sb_getblk(sb, block);
 	if (!bh) {
-		ext4_error(sb, "ext4_get_inode_loc", "unable to read "
-			   "inode block - inode=%lu, block=%llu",
-			   inode->i_ino, block);
+		EXT4_ERROR_INODE(inode, "unable to read inode block - "
+				 "block %llu", block);
 		return -EIO;
 	}
 	if (!buffer_uptodate(bh)) {
@@ -4704,9 +4904,8 @@ make_io:
 		submit_bh(READ_META, bh);
 		wait_on_buffer(bh);
 		if (!buffer_uptodate(bh)) {
-			ext4_error(sb, __func__,
-				   "unable to read inode block - inode=%lu, "
-				   "block=%llu", inode->i_ino, block);
+			EXT4_ERROR_INODE(inode, "unable to read inode "
+					 "block %llu", block);
 			brelse(bh);
 			return -EIO;
 		}
@@ -4720,7 +4919,7 @@ int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
 {
 	/* We have all inode data except xattrs in memory here. */
 	return __ext4_get_inode_loc(inode, iloc,
-		!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR));
+		!ext4_test_inode_state(inode, EXT4_STATE_XATTR));
 }
 
 void ext4_set_inode_flags(struct inode *inode)
@@ -4743,20 +4942,26 @@ void ext4_set_inode_flags(struct inode *inode)
 /* Propagate flags from i_flags to EXT4_I(inode)->i_flags */
 void ext4_get_inode_flags(struct ext4_inode_info *ei)
 {
-	unsigned int flags = ei->vfs_inode.i_flags;
-
-	ei->i_flags &= ~(EXT4_SYNC_FL|EXT4_APPEND_FL|
-			EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL|EXT4_DIRSYNC_FL);
-	if (flags & S_SYNC)
-		ei->i_flags |= EXT4_SYNC_FL;
-	if (flags & S_APPEND)
-		ei->i_flags |= EXT4_APPEND_FL;
-	if (flags & S_IMMUTABLE)
-		ei->i_flags |= EXT4_IMMUTABLE_FL;
-	if (flags & S_NOATIME)
-		ei->i_flags |= EXT4_NOATIME_FL;
-	if (flags & S_DIRSYNC)
-		ei->i_flags |= EXT4_DIRSYNC_FL;
+	unsigned int vfs_fl;
+	unsigned long old_fl, new_fl;
+
+	do {
+		vfs_fl = ei->vfs_inode.i_flags;
+		old_fl = ei->i_flags;
+		new_fl = old_fl & ~(EXT4_SYNC_FL|EXT4_APPEND_FL|
+				EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL|
+				EXT4_DIRSYNC_FL);
+		if (vfs_fl & S_SYNC)
+			new_fl |= EXT4_SYNC_FL;
+		if (vfs_fl & S_APPEND)
+			new_fl |= EXT4_APPEND_FL;
+		if (vfs_fl & S_IMMUTABLE)
+			new_fl |= EXT4_IMMUTABLE_FL;
+		if (vfs_fl & S_NOATIME)
+			new_fl |= EXT4_NOATIME_FL;
+		if (vfs_fl & S_DIRSYNC)
+			new_fl |= EXT4_DIRSYNC_FL;
+	} while (cmpxchg(&ei->i_flags, old_fl, new_fl) != old_fl);
 }
 
 static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
@@ -4814,7 +5019,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 	}
 	inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
 
-	ei->i_state = 0;
+	ei->i_state_flags = 0;
 	ei->i_dir_start_lookup = 0;
 	ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
 	/* We now have enough fields to check if the inode was active or not.
@@ -4897,7 +5102,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 					EXT4_GOOD_OLD_INODE_SIZE +
 					ei->i_extra_isize;
 			if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC))
-				ei->i_state |= EXT4_STATE_XATTR;
+				ext4_set_inode_state(inode, EXT4_STATE_XATTR);
 		}
 	} else
 		ei->i_extra_isize = 0;
@@ -4917,9 +5122,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 	ret = 0;
 	if (ei->i_file_acl &&
 	    !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) {
-		ext4_error(sb, __func__,
-			   "bad extended attribute block %llu in inode #%lu",
-			   ei->i_file_acl, inode->i_ino);
+		EXT4_ERROR_INODE(inode, "bad extended attribute block %llu",
+				 ei->i_file_acl);
 		ret = -EIO;
 		goto bad_inode;
 	} else if (ei->i_flags & EXT4_EXTENTS_FL) {
@@ -4964,9 +5168,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 			   new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
 	} else {
 		ret = -EIO;
-		ext4_error(inode->i_sb, __func__,
-			   "bogus i_mode (%o) for inode=%lu",
-			   inode->i_mode, inode->i_ino);
+		EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode);
 		goto bad_inode;
 	}
 	brelse(iloc.bh);
@@ -4995,7 +5197,7 @@ static int ext4_inode_blocks_set(handle_t *handle,
 		 */
 		raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
 		raw_inode->i_blocks_high = 0;
-		ei->i_flags &= ~EXT4_HUGE_FILE_FL;
+		ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
 		return 0;
 	}
 	if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE))
@@ -5008,9 +5210,9 @@ static int ext4_inode_blocks_set(handle_t *handle,
 		 */
 		raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
 		raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
-		ei->i_flags &= ~EXT4_HUGE_FILE_FL;
+		ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
 	} else {
-		ei->i_flags |= EXT4_HUGE_FILE_FL;
+		ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE);
 		/* i_block is stored in file system block size */
 		i_blocks = i_blocks >> (inode->i_blkbits - 9);
 		raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
@@ -5037,7 +5239,7 @@ static int ext4_do_update_inode(handle_t *handle,
 
 	/* For fields not not tracking in the in-memory inode,
 	 * initialise them to zero for new inodes. */
-	if (ei->i_state & EXT4_STATE_NEW)
+	if (ext4_test_inode_state(inode, EXT4_STATE_NEW))
 		memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
 
 	ext4_get_inode_flags(ei);
@@ -5101,7 +5303,7 @@ static int ext4_do_update_inode(handle_t *handle,
 					EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
 			sb->s_dirt = 1;
 			ext4_handle_sync(handle);
-			err = ext4_handle_dirty_metadata(handle, inode,
+			err = ext4_handle_dirty_metadata(handle, NULL,
 					EXT4_SB(sb)->s_sbh);
 		}
 	}
@@ -5130,10 +5332,10 @@ static int ext4_do_update_inode(handle_t *handle,
 	}
 
 	BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
-	rc = ext4_handle_dirty_metadata(handle, inode, bh);
+	rc = ext4_handle_dirty_metadata(handle, NULL, bh);
 	if (!err)
 		err = rc;
-	ei->i_state &= ~EXT4_STATE_NEW;
+	ext4_clear_inode_state(inode, EXT4_STATE_NEW);
 
 	ext4_update_inode_fsync_trans(handle, inode, 0);
 out_brelse:
@@ -5177,7 +5379,7 @@ out_brelse:
  * `stuff()' is running, and the new i_size will be lost.  Plus the inode
  * will no longer be on the superblock's dirty inode list.
  */
-int ext4_write_inode(struct inode *inode, int wait)
+int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
 	int err;
 
@@ -5191,26 +5393,25 @@ int ext4_write_inode(struct inode *inode, int wait)
 			return -EIO;
 		}
 
-		if (!wait)
+		if (wbc->sync_mode != WB_SYNC_ALL)
 			return 0;
 
 		err = ext4_force_commit(inode->i_sb);
 	} else {
 		struct ext4_iloc iloc;
 
-		err = ext4_get_inode_loc(inode, &iloc);
+		err = __ext4_get_inode_loc(inode, &iloc, 0);
 		if (err)
 			return err;
-		if (wait)
+		if (wbc->sync_mode == WB_SYNC_ALL)
 			sync_dirty_buffer(iloc.bh);
 		if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
-			ext4_error(inode->i_sb, __func__,
-				   "IO error syncing inode, "
-				   "inode=%lu, block=%llu",
-				   inode->i_ino,
-				   (unsigned long long)iloc.bh->b_blocknr);
+			EXT4_ERROR_INODE(inode,
+				"IO error syncing inode (block=%llu)",
+				(unsigned long long) iloc.bh->b_blocknr);
 			err = -EIO;
 		}
+		brelse(iloc.bh);
 	}
 	return err;
 }
@@ -5249,6 +5450,8 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 	if (error)
 		return error;
 
+	if (is_quota_modification(inode, attr))
+		dquot_initialize(inode);
 	if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
 		(ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
 		handle_t *handle;
@@ -5261,7 +5464,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 			error = PTR_ERR(handle);
 			goto err_out;
 		}
-		error = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0;
+		error = dquot_transfer(inode, attr);
 		if (error) {
 			ext4_journal_stop(handle);
 			return error;
@@ -5277,7 +5480,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 	}
 
 	if (attr->ia_valid & ATTR_SIZE) {
-		if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) {
+		if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
 			struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 
 			if (attr->ia_size > sbi->s_bitmap_maxbytes) {
@@ -5288,7 +5491,9 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 	}
 
 	if (S_ISREG(inode->i_mode) &&
-	    attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) {
+	    attr->ia_valid & ATTR_SIZE &&
+	    (attr->ia_size < inode->i_size ||
+	     (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))) {
 		handle_t *handle;
 
 		handle = ext4_journal_start(inode, 3);
@@ -5319,6 +5524,9 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 				goto err_out;
 			}
 		}
+		/* ext4_truncate will clear the flag */
+		if ((ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))
+			ext4_truncate(inode);
 	}
 
 	rc = inode_setattr(inode, attr);
@@ -5393,7 +5601,7 @@ static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks,
 
 static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
 {
-	if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
 		return ext4_indirect_trans_blocks(inode, nrblocks, chunk);
 	return ext4_ext_index_trans_blocks(inode, nrblocks, chunk);
 }
@@ -5557,8 +5765,8 @@ static int ext4_expand_extra_isize(struct inode *inode,
 	entry = IFIRST(header);
 
 	/* No extended attributes present */
-	if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR) ||
-		header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {
+	if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) ||
+	    header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {
 		memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0,
 			new_extra_isize);
 		EXT4_I(inode)->i_extra_isize = new_extra_isize;
@@ -5602,7 +5810,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
 	err = ext4_reserve_inode_write(handle, inode, &iloc);
 	if (ext4_handle_valid(handle) &&
 	    EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
-	    !(EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND)) {
+	    !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) {
 		/*
 		 * We need extra buffer credits since we may write into EA block
 		 * with this same handle. If journal_extend fails, then it will
@@ -5616,10 +5824,11 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
 						      sbi->s_want_extra_isize,
 						      iloc, handle);
 			if (ret) {
-				EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND;
+				ext4_set_inode_state(inode,
+						     EXT4_STATE_NO_EXPAND);
 				if (mnt_count !=
 					le16_to_cpu(sbi->s_es->s_mnt_count)) {
-					ext4_warning(inode->i_sb, __func__,
+					ext4_warning(inode->i_sb,
 					"Unable to expand inode %lu. Delete"
 					" some EAs or run e2fsck.",
 					inode->i_ino);
@@ -5641,7 +5850,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
  * i_size has been changed by generic_commit_write() and we thus need
  * to include the updated inode in the current transaction.
  *
- * Also, vfs_dq_alloc_block() will always dirty the inode when blocks
+ * Also, dquot_alloc_block() will always dirty the inode when blocks
  * are allocated to the file.
  *
  * If the inode is marked synchronous, we don't honour that here - doing
@@ -5683,7 +5892,7 @@ static int ext4_pin_inode(handle_t *handle, struct inode *inode)
 			err = jbd2_journal_get_write_access(handle, iloc.bh);
 			if (!err)
 				err = ext4_handle_dirty_metadata(handle,
-								 inode,
+								 NULL,
 								 iloc.bh);
 			brelse(iloc.bh);
 		}
@@ -5727,9 +5936,9 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
 	 */
 
 	if (val)
-		EXT4_I(inode)->i_flags |= EXT4_JOURNAL_DATA_FL;
+		ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
 	else
-		EXT4_I(inode)->i_flags &= ~EXT4_JOURNAL_DATA_FL;
+		ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
 	ext4_set_aops(inode);
 
 	jbd2_journal_unlock_updates(journal);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index b63d193126db..bf5ae883b1bd 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -92,6 +92,15 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 			flags &= ~EXT4_EXTENTS_FL;
 		}
 
+		if (flags & EXT4_EOFBLOCKS_FL) {
+			/* we don't support adding EOFBLOCKS flag */
+			if (!(oldflags & EXT4_EOFBLOCKS_FL)) {
+				err = -EOPNOTSUPP;
+				goto flags_out;
+			}
+		} else if (oldflags & EXT4_EOFBLOCKS_FL)
+			ext4_truncate(inode);
+
 		handle = ext4_journal_start(inode, 1);
 		if (IS_ERR(handle)) {
 			err = PTR_ERR(handle);
@@ -249,7 +258,8 @@ setversion_out:
 		if (me.moved_len > 0)
 			file_remove_suid(donor_filp);
 
-		if (copy_to_user((struct move_extent *)arg, &me, sizeof(me)))
+		if (copy_to_user((struct move_extent __user *)arg,
+				 &me, sizeof(me)))
 			err = -EFAULT;
 mext_out:
 		fput(donor_filp);
@@ -363,7 +373,30 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	case EXT4_IOC32_SETRSVSZ:
 		cmd = EXT4_IOC_SETRSVSZ;
 		break;
-	case EXT4_IOC_GROUP_ADD:
+	case EXT4_IOC32_GROUP_ADD: {
+		struct compat_ext4_new_group_input __user *uinput;
+		struct ext4_new_group_input input;
+		mm_segment_t old_fs;
+		int err;
+
+		uinput = compat_ptr(arg);
+		err = get_user(input.group, &uinput->group);
+		err |= get_user(input.block_bitmap, &uinput->block_bitmap);
+		err |= get_user(input.inode_bitmap, &uinput->inode_bitmap);
+		err |= get_user(input.inode_table, &uinput->inode_table);
+		err |= get_user(input.blocks_count, &uinput->blocks_count);
+		err |= get_user(input.reserved_blocks,
+				&uinput->reserved_blocks);
+		if (err)
+			return -EFAULT;
+		old_fs = get_fs();
+		set_fs(KERNEL_DS);
+		err = ext4_ioctl(file, EXT4_IOC_GROUP_ADD,
+				 (unsigned long) &input);
+		set_fs(old_fs);
+		return err;
+	}
+	case EXT4_IOC_MOVE_EXT:
 		break;
 	default:
 		return -ENOIOCTLCMD;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index d34afad3e137..12b3bc026a68 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -23,6 +23,7 @@
 
 #include "mballoc.h"
 #include <linux/debugfs.h>
+#include <linux/slab.h>
 #include <trace/events/ext4.h>
 
 /*
@@ -69,7 +70,7 @@
  *
  * pa_lstart -> the logical start block for this prealloc space
  * pa_pstart -> the physical start block for this prealloc space
- * pa_len    -> lenght for this prealloc space
+ * pa_len    -> length for this prealloc space
  * pa_free   ->  free space available in this prealloc space
  *
  * The inode preallocation space is used looking at the _logical_ start
@@ -441,10 +442,9 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
 	for (i = 0; i < count; i++) {
 		if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) {
 			ext4_fsblk_t blocknr;
-			blocknr = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb);
+
+			blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
 			blocknr += first + i;
-			blocknr +=
-			    le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
 			ext4_grp_locked_error(sb, e4b->bd_group,
 				   __func__, "double-free of inode"
 				   " %lu's block %llu(bit %u in group %u)",
@@ -658,6 +658,27 @@ static void ext4_mb_mark_free_simple(struct super_block *sb,
 	}
 }
 
+/*
+ * Cache the order of the largest free extent we have available in this block
+ * group.
+ */
+static void
+mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp)
+{
+	int i;
+	int bits;
+
+	grp->bb_largest_free_order = -1; /* uninit */
+
+	bits = sb->s_blocksize_bits + 1;
+	for (i = bits; i >= 0; i--) {
+		if (grp->bb_counters[i] > 0) {
+			grp->bb_largest_free_order = i;
+			break;
+		}
+	}
+}
+
 static noinline_for_stack
 void ext4_mb_generate_buddy(struct super_block *sb,
 				void *buddy, void *bitmap, ext4_group_t group)
@@ -700,6 +721,7 @@ void ext4_mb_generate_buddy(struct super_block *sb,
 		 */
 		grp->bb_free = free;
 	}
+	mb_set_largest_free_order(sb, grp);
 
 	clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
 
@@ -725,6 +747,9 @@ void ext4_mb_generate_buddy(struct super_block *sb,
  * contain blocks_per_page (PAGE_CACHE_SIZE / blocksize)  blocks.
  * So it can have information regarding groups_per_page which
  * is blocks_per_page/2
+ *
+ * Locking note:  This routine takes the block group lock of all groups
+ * for this page; do not hold this lock when calling this routine!
  */
 
 static int ext4_mb_init_cache(struct page *page, char *incore)
@@ -865,6 +890,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
 			BUG_ON(incore == NULL);
 			mb_debug(1, "put buddy for group %u in page %lu/%x\n",
 				group, page->index, i * blocksize);
+			trace_ext4_mb_buddy_bitmap_load(sb, group);
 			grinfo = ext4_get_group_info(sb, group);
 			grinfo->bb_fragments = 0;
 			memset(grinfo->bb_counters, 0,
@@ -882,6 +908,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
 			BUG_ON(incore != NULL);
 			mb_debug(1, "put bitmap for group %u in page %lu/%x\n",
 				group, page->index, i * blocksize);
+			trace_ext4_mb_bitmap_load(sb, group);
 
 			/* see comments in ext4_mb_put_pa() */
 			ext4_lock_group(sb, group);
@@ -910,6 +937,11 @@ out:
 	return err;
 }
 
+/*
+ * Locking note:  This routine calls ext4_mb_init_cache(), which takes the
+ * block group lock of all groups for this page; do not hold the BG lock when
+ * calling this routine!
+ */
 static noinline_for_stack
 int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
 {
@@ -1004,6 +1036,11 @@ err:
 	return ret;
 }
 
+/*
+ * Locking note:  This routine calls ext4_mb_init_cache(), which takes the
+ * block group lock of all groups for this page; do not hold the BG lock when
+ * calling this routine!
+ */
 static noinline_for_stack int
 ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
 					struct ext4_buddy *e4b)
@@ -1150,7 +1187,7 @@ err:
 	return ret;
 }
 
-static void ext4_mb_release_desc(struct ext4_buddy *e4b)
+static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
 {
 	if (e4b->bd_bitmap_page)
 		page_cache_release(e4b->bd_bitmap_page);
@@ -1255,10 +1292,9 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
 
 		if (!mb_test_bit(block, EXT4_MB_BITMAP(e4b))) {
 			ext4_fsblk_t blocknr;
-			blocknr = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb);
+
+			blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
 			blocknr += block;
-			blocknr +=
-			    le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
 			ext4_grp_locked_error(sb, e4b->bd_group,
 				   __func__, "double-free of inode"
 				   " %lu's block %llu(bit %u in group %u)",
@@ -1300,6 +1336,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
 			buddy = buddy2;
 		} while (1);
 	}
+	mb_set_largest_free_order(sb, e4b->bd_info);
 	mb_check_buddy(e4b);
 }
 
@@ -1428,6 +1465,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
 		e4b->bd_info->bb_counters[ord]++;
 		e4b->bd_info->bb_counters[ord]++;
 	}
+	mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);
 
 	mb_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0);
 	mb_check_buddy(e4b);
@@ -1618,7 +1656,7 @@ int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
 	}
 
 	ext4_unlock_group(ac->ac_sb, group);
-	ext4_mb_release_desc(e4b);
+	ext4_mb_unload_buddy(e4b);
 
 	return 0;
 }
@@ -1631,7 +1669,6 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
 	int max;
 	int err;
 	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
-	struct ext4_super_block *es = sbi->s_es;
 	struct ext4_free_extent ex;
 
 	if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL))
@@ -1648,8 +1685,8 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
 	if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) {
 		ext4_fsblk_t start;
 
-		start = (e4b->bd_group * EXT4_BLOCKS_PER_GROUP(ac->ac_sb)) +
-			ex.fe_start + le32_to_cpu(es->s_first_data_block);
+		start = ext4_group_first_block_no(ac->ac_sb, e4b->bd_group) +
+			ex.fe_start;
 		/* use do_div to get remainder (would be 64-bit modulo) */
 		if (do_div(start, sbi->s_stripe) == 0) {
 			ac->ac_found++;
@@ -1674,7 +1711,7 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
 		ext4_mb_use_best_found(ac, e4b);
 	}
 	ext4_unlock_group(ac->ac_sb, group);
-	ext4_mb_release_desc(e4b);
+	ext4_mb_unload_buddy(e4b);
 
 	return 0;
 }
@@ -1803,8 +1840,8 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
 	BUG_ON(sbi->s_stripe == 0);
 
 	/* find first stripe-aligned block in group */
-	first_group_block = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb)
-		+ le32_to_cpu(sbi->s_es->s_first_data_block);
+	first_group_block = ext4_group_first_block_no(sb, e4b->bd_group);
+
 	a = first_group_block + sbi->s_stripe - 1;
 	do_div(a, sbi->s_stripe);
 	i = (a * sbi->s_stripe) - first_group_block;
@@ -1823,16 +1860,22 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
 	}
 }
 
+/* This is now called BEFORE we load the buddy bitmap. */
 static int ext4_mb_good_group(struct ext4_allocation_context *ac,
 				ext4_group_t group, int cr)
 {
 	unsigned free, fragments;
-	unsigned i, bits;
 	int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
 	struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
 
 	BUG_ON(cr < 0 || cr >= 4);
-	BUG_ON(EXT4_MB_GRP_NEED_INIT(grp));
+
+	/* We only do this if the grp has never been initialized */
+	if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
+		int ret = ext4_mb_init_group(ac->ac_sb, group);
+		if (ret)
+			return 0;
+	}
 
 	free = grp->bb_free;
 	fragments = grp->bb_fragments;
@@ -1845,17 +1888,16 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
 	case 0:
 		BUG_ON(ac->ac_2order == 0);
 
+		if (grp->bb_largest_free_order < ac->ac_2order)
+			return 0;
+
 		/* Avoid using the first bg of a flexgroup for data files */
 		if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
 		    (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
 		    ((group % flex_size) == 0))
 			return 0;
 
-		bits = ac->ac_sb->s_blocksize_bits + 1;
-		for (i = ac->ac_2order; i <= bits; i++)
-			if (grp->bb_counters[i] > 0)
-				return 1;
-		break;
+		return 1;
 	case 1:
 		if ((free / fragments) >= ac->ac_g_ex.fe_len)
 			return 1;
@@ -1966,7 +2008,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 	sbi = EXT4_SB(sb);
 	ngroups = ext4_get_groups_count(sb);
 	/* non-extent files are limited to low blocks/groups */
-	if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL))
+	if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)))
 		ngroups = sbi->s_blockfile_groups;
 
 	BUG_ON(ac->ac_status == AC_STATUS_FOUND);
@@ -2026,15 +2068,11 @@ repeat:
 		group = ac->ac_g_ex.fe_group;
 
 		for (i = 0; i < ngroups; group++, i++) {
-			struct ext4_group_info *grp;
-			struct ext4_group_desc *desc;
-
 			if (group == ngroups)
 				group = 0;
 
-			/* quick check to skip empty groups */
-			grp = ext4_get_group_info(sb, group);
-			if (grp->bb_free == 0)
+			/* This now checks without needing the buddy page */
+			if (!ext4_mb_good_group(ac, group, cr))
 				continue;
 
 			err = ext4_mb_load_buddy(sb, group, &e4b);
@@ -2042,15 +2080,18 @@ repeat:
 				goto out;
 
 			ext4_lock_group(sb, group);
+
+			/*
+			 * We need to check again after locking the
+			 * block group
+			 */
 			if (!ext4_mb_good_group(ac, group, cr)) {
-				/* someone did allocation from this group */
 				ext4_unlock_group(sb, group);
-				ext4_mb_release_desc(&e4b);
+				ext4_mb_unload_buddy(&e4b);
 				continue;
 			}
 
 			ac->ac_groups_scanned++;
-			desc = ext4_get_group_desc(sb, group, NULL);
 			if (cr == 0)
 				ext4_mb_simple_scan_group(ac, &e4b);
 			else if (cr == 1 &&
@@ -2060,7 +2101,7 @@ repeat:
 				ext4_mb_complex_scan_group(ac, &e4b);
 
 			ext4_unlock_group(sb, group);
-			ext4_mb_release_desc(&e4b);
+			ext4_mb_unload_buddy(&e4b);
 
 			if (ac->ac_status != AC_STATUS_CONTINUE)
 				break;
@@ -2150,7 +2191,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
 	ext4_lock_group(sb, group);
 	memcpy(&sg, ext4_get_group_info(sb, group), i);
 	ext4_unlock_group(sb, group);
-	ext4_mb_release_desc(&e4b);
+	ext4_mb_unload_buddy(&e4b);
 
 	seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free,
 			sg.info.bb_fragments, sg.info.bb_first_free);
@@ -2256,7 +2297,8 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
 
 	INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
 	init_rwsem(&meta_group_info[i]->alloc_sem);
-	meta_group_info[i]->bb_free_root.rb_node = NULL;
+	meta_group_info[i]->bb_free_root = RB_ROOT;
+	meta_group_info[i]->bb_largest_free_order = -1;  /* uninit */
 
 #ifdef DOUBLE_CHECK
 	{
@@ -2537,6 +2579,23 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
 		mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
 			 entry->count, entry->group, entry);
 
+		if (test_opt(sb, DISCARD)) {
+			int ret;
+			ext4_fsblk_t discard_block;
+
+			discard_block = entry->start_blk +
+				ext4_group_first_block_no(sb, entry->group);
+			trace_ext4_discard_blocks(sb,
+					(unsigned long long)discard_block,
+					entry->count);
+			ret = sb_issue_discard(sb, discard_block, entry->count);
+			if (ret == EOPNOTSUPP) {
+				ext4_warning(sb,
+					"discard not supported, disabling");
+				clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD);
+			}
+		}
+
 		err = ext4_mb_load_buddy(sb, entry->group, &e4b);
 		/* we expect to find existing buddy because it's pinned */
 		BUG_ON(err != 0);
@@ -2558,21 +2617,8 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
 			page_cache_release(e4b.bd_bitmap_page);
 		}
 		ext4_unlock_group(sb, entry->group);
-		if (test_opt(sb, DISCARD)) {
-			ext4_fsblk_t discard_block;
-			struct ext4_super_block *es = EXT4_SB(sb)->s_es;
-
-			discard_block = (ext4_fsblk_t)entry->group *
-						EXT4_BLOCKS_PER_GROUP(sb)
-					+ entry->start_blk
-					+ le32_to_cpu(es->s_first_data_block);
-			trace_ext4_discard_blocks(sb,
-					(unsigned long long)discard_block,
-					entry->count);
-			sb_issue_discard(sb, discard_block, entry->count);
-		}
 		kmem_cache_free(ext4_free_ext_cachep, entry);
-		ext4_mb_release_desc(&e4b);
+		ext4_mb_unload_buddy(&e4b);
 	}
 
 	mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
@@ -2645,7 +2691,7 @@ int __init init_ext4_mballoc(void)
 
 void exit_ext4_mballoc(void)
 {
-	/* 
+	/*
 	 * Wait for completion of call_rcu()'s on ext4_pspace_cachep
 	 * before destroying the slab cache.
 	 */
@@ -2703,14 +2749,11 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
 	if (err)
 		goto out_err;
 
-	block = ac->ac_b_ex.fe_group * EXT4_BLOCKS_PER_GROUP(sb)
-		+ ac->ac_b_ex.fe_start
-		+ le32_to_cpu(es->s_first_data_block);
+	block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
 
 	len = ac->ac_b_ex.fe_len;
 	if (!ext4_data_block_valid(sbi, block, len)) {
-		ext4_error(sb, __func__,
-			   "Allocating blocks %llu-%llu which overlap "
+		ext4_error(sb, "Allocating blocks %llu-%llu which overlap "
 			   "fs metadata\n", block, block+len);
 		/* File system mounted not to panic on error
 		 * Fix the bitmap and repeat the block allocation
@@ -2988,7 +3031,7 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
 	if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) {
 		atomic_inc(&sbi->s_bal_reqs);
 		atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated);
-		if (ac->ac_o_ex.fe_len >= ac->ac_g_ex.fe_len)
+		if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len)
 			atomic_inc(&sbi->s_bal_success);
 		atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned);
 		if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
@@ -3130,7 +3173,7 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
 			continue;
 
 		/* non-extent files can't have physical blocks past 2^32 */
-		if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL) &&
+		if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) &&
 			pa->pa_pstart + pa->pa_len > EXT4_MAX_BLOCK_FILE_PHYS)
 			continue;
 
@@ -3161,9 +3204,7 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
 		/* The max size of hash table is PREALLOC_TB_SIZE */
 		order = PREALLOC_TB_SIZE - 1;
 
-	goal_block = ac->ac_g_ex.fe_group * EXT4_BLOCKS_PER_GROUP(ac->ac_sb) +
-		     ac->ac_g_ex.fe_start +
-		     le32_to_cpu(EXT4_SB(ac->ac_sb)->s_es->s_first_data_block);
+	goal_block = ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex);
 	/*
 	 * search for the prealloc space that is having
 	 * minimal distance from the goal block.
@@ -3289,7 +3330,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
 	spin_unlock(&pa->pa_lock);
 
 	grp_blk = pa->pa_pstart;
-	/* 
+	/*
 	 * If doing group-based preallocation, pa_pstart may be in the
 	 * next group when pa is used up
 	 */
@@ -3526,8 +3567,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
 		if (bit >= end)
 			break;
 		next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
-		start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit +
-				le32_to_cpu(sbi->s_es->s_first_data_block);
+		start = ext4_group_first_block_no(sb, group) + bit;
 		mb_debug(1, "    free preallocated %u/%u in group %u\n",
 				(unsigned) start, (unsigned) next - bit,
 				(unsigned) group);
@@ -3623,15 +3663,13 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
 
 	bitmap_bh = ext4_read_block_bitmap(sb, group);
 	if (bitmap_bh == NULL) {
-		ext4_error(sb, __func__, "Error in reading block "
-				"bitmap for %u", group);
+		ext4_error(sb, "Error reading block bitmap for %u", group);
 		return 0;
 	}
 
 	err = ext4_mb_load_buddy(sb, group, &e4b);
 	if (err) {
-		ext4_error(sb, __func__, "Error in loading buddy "
-				"information for %u", group);
+		ext4_error(sb, "Error loading buddy information for %u", group);
 		put_bh(bitmap_bh);
 		return 0;
 	}
@@ -3709,7 +3747,7 @@ out:
 	ext4_unlock_group(sb, group);
 	if (ac)
 		kmem_cache_free(ext4_ac_cachep, ac);
-	ext4_mb_release_desc(&e4b);
+	ext4_mb_unload_buddy(&e4b);
 	put_bh(bitmap_bh);
 	return free;
 }
@@ -3804,16 +3842,16 @@ repeat:
 
 		err = ext4_mb_load_buddy(sb, group, &e4b);
 		if (err) {
-			ext4_error(sb, __func__, "Error in loading buddy "
-					"information for %u", group);
+			ext4_error(sb, "Error loading buddy information for %u",
+					group);
 			continue;
 		}
 
 		bitmap_bh = ext4_read_block_bitmap(sb, group);
 		if (bitmap_bh == NULL) {
-			ext4_error(sb, __func__, "Error in reading block "
-					"bitmap for %u", group);
-			ext4_mb_release_desc(&e4b);
+			ext4_error(sb, "Error reading block bitmap for %u",
+					group);
+			ext4_mb_unload_buddy(&e4b);
 			continue;
 		}
 
@@ -3822,7 +3860,7 @@ repeat:
 		ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac);
 		ext4_unlock_group(sb, group);
 
-		ext4_mb_release_desc(&e4b);
+		ext4_mb_unload_buddy(&e4b);
 		put_bh(bitmap_bh);
 
 		list_del(&pa->u.pa_tmp_list);
@@ -3938,7 +3976,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
 
 	/* don't use group allocation for large files */
 	size = max(size, isize);
-	if (size >= sbi->s_mb_stream_request) {
+	if (size > sbi->s_mb_stream_request) {
 		ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
 		return;
 	}
@@ -4077,8 +4115,8 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
 
 		ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
 		if (ext4_mb_load_buddy(sb, group, &e4b)) {
-			ext4_error(sb, __func__, "Error in loading buddy "
-					"information for %u", group);
+			ext4_error(sb, "Error loading buddy information for %u",
+					group);
 			continue;
 		}
 		ext4_lock_group(sb, group);
@@ -4086,7 +4124,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
 		ext4_mb_release_group_pa(&e4b, pa, ac);
 		ext4_unlock_group(sb, group);
 
-		ext4_mb_release_desc(&e4b);
+		ext4_mb_unload_buddy(&e4b);
 		list_del(&pa->u.pa_tmp_list);
 		call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
 	}
@@ -4254,7 +4292,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
 			return 0;
 		}
 		reserv_blks = ar->len;
-		while (ar->len && vfs_dq_alloc_block(ar->inode, ar->len)) {
+		while (ar->len && dquot_alloc_block(ar->inode, ar->len)) {
 			ar->flags |= EXT4_MB_HINT_NOPREALLOC;
 			ar->len--;
 		}
@@ -4331,7 +4369,7 @@ out2:
 	kmem_cache_free(ext4_ac_cachep, ac);
 out1:
 	if (inquota && ar->len < inquota)
-		vfs_dq_free_block(ar->inode, inquota - ar->len);
+		dquot_free_block(ar->inode, inquota - ar->len);
 out3:
 	if (!ar->len) {
 		if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag)
@@ -4476,10 +4514,10 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
 
 	sbi = EXT4_SB(sb);
 	es = EXT4_SB(sb)->s_es;
-	if (!ext4_data_block_valid(sbi, block, count)) {
-		ext4_error(sb, __func__,
-			    "Freeing blocks not in datazone - "
-			    "block = %llu, count = %lu", block, count);
+	if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
+	    !ext4_data_block_valid(sbi, block, count)) {
+		ext4_error(sb, "Freeing blocks not in datazone - "
+			   "block = %llu, count = %lu", block, count);
 		goto error_return;
 	}
 
@@ -4496,12 +4534,12 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
 			if (!bh)
 				tbh = sb_find_get_block(inode->i_sb,
 							block + i);
-			ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, 
+			ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
 				    inode, tbh, block + i);
 		}
 	}
 
-	/* 
+	/*
 	 * We need to make sure we don't reuse the freed block until
 	 * after the transaction is committed, which we can do by
 	 * treating the block as metadata, below.  We make an
@@ -4547,8 +4585,7 @@ do_more:
 	    in_range(block + count - 1, ext4_inode_table(sb, gdp),
 		      EXT4_SB(sb)->s_itb_per_group)) {
 
-		ext4_error(sb, __func__,
-			   "Freeing blocks in system zone - "
+		ext4_error(sb, "Freeing blocks in system zone - "
 			   "Block = %llu, count = %lu", block, count);
 		/* err = 0. ext4_std_error should be a no op */
 		goto error_return;
@@ -4623,7 +4660,7 @@ do_more:
 		atomic_add(count, &sbi->s_flex_groups[flex_group].free_blocks);
 	}
 
-	ext4_mb_release_desc(&e4b);
+	ext4_mb_unload_buddy(&e4b);
 
 	freed += count;
 
@@ -4646,7 +4683,7 @@ do_more:
 	sb->s_dirt = 1;
 error_return:
 	if (freed)
-		vfs_dq_free_block(inode, freed);
+		dquot_free_block(inode, freed);
 	brelse(bitmap_bh);
 	ext4_std_error(sb, err);
 	if (ac)
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 436521cae456..b619322c76f0 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -220,16 +220,9 @@ struct ext4_buddy {
 #define EXT4_MB_BITMAP(e4b)	((e4b)->bd_bitmap)
 #define EXT4_MB_BUDDY(e4b)	((e4b)->bd_buddy)
 
-#define in_range(b, first, len)	((b) >= (first) && (b) <= (first) + (len) - 1)
-
 static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
 					struct ext4_free_extent *fex)
 {
-	ext4_fsblk_t block;
-
-	block = (ext4_fsblk_t) fex->fe_group * EXT4_BLOCKS_PER_GROUP(sb)
-			+ fex->fe_start
-			+ le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
-	return block;
+	return ext4_group_first_block_no(sb, fex->fe_group) + fex->fe_start;
 }
 #endif
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 81415814b00b..6f3a27ec30bf 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -13,6 +13,7 @@
  */
 
 #include <linux/module.h>
+#include <linux/slab.h>
 #include "ext4_jbd2.h"
 #include "ext4_extents.h"
 
@@ -365,12 +366,12 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
 	 * happened after we started the migrate. We need to
 	 * fail the migrate
 	 */
-	if (!(EXT4_I(inode)->i_state & EXT4_STATE_EXT_MIGRATE)) {
+	if (!ext4_test_inode_state(inode, EXT4_STATE_EXT_MIGRATE)) {
 		retval = -EAGAIN;
 		up_write(&EXT4_I(inode)->i_data_sem);
 		goto err_out;
 	} else
-		EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE;
+		ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
 	/*
 	 * We have the extent map build with the tmp inode.
 	 * Now copy the i_data across
@@ -474,7 +475,7 @@ int ext4_ext_migrate(struct inode *inode)
 	 */
 	if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb,
 				       EXT4_FEATURE_INCOMPAT_EXTENTS) ||
-	    (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+	    (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
 		return -EINVAL;
 
 	if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0)
@@ -503,14 +504,10 @@ int ext4_ext_migrate(struct inode *inode)
 	}
 	i_size_write(tmp_inode, i_size_read(inode));
 	/*
-	 * We don't want the inode to be reclaimed
-	 * if we got interrupted in between. We have
-	 * this tmp inode carrying reference to the
-	 * data blocks of the original file. We set
-	 * the i_nlink to zero at the last stage after
-	 * switching the original file to extent format
+	 * Set the i_nlink to zero so it will be deleted later
+	 * when we drop inode reference.
 	 */
-	tmp_inode->i_nlink = 1;
+	tmp_inode->i_nlink = 0;
 
 	ext4_ext_tree_init(handle, tmp_inode);
 	ext4_orphan_add(handle, tmp_inode);
@@ -533,10 +530,20 @@ int ext4_ext_migrate(struct inode *inode)
 	 * allocation.
 	 */
 	down_read((&EXT4_I(inode)->i_data_sem));
-	EXT4_I(inode)->i_state |= EXT4_STATE_EXT_MIGRATE;
+	ext4_set_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
 	up_read((&EXT4_I(inode)->i_data_sem));
 
 	handle = ext4_journal_start(inode, 1);
+	if (IS_ERR(handle)) {
+		/*
+		 * It is impossible to update on-disk structures without
+		 * a handle, so just rollback in-core changes and live other
+		 * work to orphan_list_cleanup()
+		 */
+		ext4_orphan_del(NULL, tmp_inode);
+		retval = PTR_ERR(handle);
+		goto out;
+	}
 
 	ei = EXT4_I(inode);
 	i_data = ei->i_data;
@@ -618,15 +625,8 @@ err_out:
 
 	/* Reset the extent details */
 	ext4_ext_tree_init(handle, tmp_inode);
-
-	/*
-	 * Set the i_nlink to zero so that
-	 * generic_drop_inode really deletes the
-	 * inode
-	 */
-	tmp_inode->i_nlink = 0;
-
 	ext4_journal_stop(handle);
+out:
 	unlock_new_inode(tmp_inode);
 	iput(tmp_inode);
 
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 82c415be87a4..52abfa12762a 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -15,6 +15,7 @@
 
 #include <linux/fs.h>
 #include <linux/quotaops.h>
+#include <linux/slab.h>
 #include "ext4_jbd2.h"
 #include "ext4_extents.h"
 #include "ext4.h"
@@ -152,12 +153,12 @@ mext_check_null_inode(struct inode *inode1, struct inode *inode2,
 	int ret = 0;
 
 	if (inode1 == NULL) {
-		ext4_error(inode2->i_sb, function,
+		__ext4_error(inode2->i_sb, function,
 			"Both inodes should not be NULL: "
 			"inode1 NULL inode2 %lu", inode2->i_ino);
 		ret = -EIO;
 	} else if (inode2 == NULL) {
-		ext4_error(inode1->i_sb, function,
+		__ext4_error(inode1->i_sb, function,
 			"Both inodes should not be NULL: "
 			"inode1 %lu inode2 NULL", inode1->i_ino);
 		ret = -EIO;
@@ -252,6 +253,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
 		}
 
 		o_start->ee_len = start_ext->ee_len;
+		eblock = le32_to_cpu(start_ext->ee_block);
 		new_flag = 1;
 
 	} else if (start_ext->ee_len && new_ext->ee_len &&
@@ -262,6 +264,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
 		 * orig  |------------------------------|
 		 */
 		o_start->ee_len = start_ext->ee_len;
+		eblock = le32_to_cpu(start_ext->ee_block);
 		new_flag = 1;
 
 	} else if (!start_ext->ee_len && new_ext->ee_len &&
@@ -475,11 +478,11 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
 	struct ext4_extent *oext, *o_start, *o_end, *prev_ext;
 	struct ext4_extent new_ext, start_ext, end_ext;
 	ext4_lblk_t new_ext_end;
-	ext4_fsblk_t new_phys_end;
 	int oext_alen, new_ext_alen, end_ext_alen;
 	int depth = ext_depth(orig_inode);
 	int ret;
 
+	start_ext.ee_block = end_ext.ee_block = 0;
 	o_start = o_end = oext = orig_path[depth].p_ext;
 	oext_alen = ext4_ext_get_actual_len(oext);
 	start_ext.ee_len = end_ext.ee_len = 0;
@@ -489,7 +492,6 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
 	new_ext.ee_len = dext->ee_len;
 	new_ext_alen = ext4_ext_get_actual_len(&new_ext);
 	new_ext_end = le32_to_cpu(new_ext.ee_block) + new_ext_alen - 1;
-	new_phys_end = ext_pblock(&new_ext) + new_ext_alen - 1;
 
 	/*
 	 * Case: original extent is first
@@ -502,6 +504,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
 		le32_to_cpu(oext->ee_block) + oext_alen) {
 		start_ext.ee_len = cpu_to_le16(le32_to_cpu(new_ext.ee_block) -
 					       le32_to_cpu(oext->ee_block));
+		start_ext.ee_block = oext->ee_block;
 		copy_extent_status(oext, &start_ext);
 	} else if (oext > EXT_FIRST_EXTENT(orig_path[depth].p_hdr)) {
 		prev_ext = oext - 1;
@@ -515,6 +518,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
 			start_ext.ee_len = cpu_to_le16(
 				ext4_ext_get_actual_len(prev_ext) +
 				new_ext_alen);
+			start_ext.ee_block = oext->ee_block;
 			copy_extent_status(prev_ext, &start_ext);
 			new_ext.ee_len = 0;
 		}
@@ -526,7 +530,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
 	 * new_ext       |-------|
 	 */
 	if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) {
-		ext4_error(orig_inode->i_sb, __func__,
+		EXT4_ERROR_INODE(orig_inode,
 			"new_ext_end(%u) should be less than or equal to "
 			"oext->ee_block(%u) + oext_alen(%d) - 1",
 			new_ext_end, le32_to_cpu(oext->ee_block),
@@ -689,12 +693,12 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
 	while (1) {
 		/* The extent for donor must be found. */
 		if (!dext) {
-			ext4_error(donor_inode->i_sb, __func__,
+			EXT4_ERROR_INODE(donor_inode,
 				   "The extent for donor must be found");
 			*err = -EIO;
 			goto out;
 		} else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) {
-			ext4_error(donor_inode->i_sb, __func__,
+			EXT4_ERROR_INODE(donor_inode,
 				"Donor offset(%u) and the first block of donor "
 				"extent(%u) should be equal",
 				donor_off,
@@ -928,7 +932,7 @@ out2:
 }
 
 /**
- * mext_check_argumants - Check whether move extent can be done
+ * mext_check_arguments - Check whether move extent can be done
  *
  * @orig_inode:		original inode
  * @donor_inode:	donor inode
@@ -949,14 +953,6 @@ mext_check_arguments(struct inode *orig_inode,
 	unsigned int blkbits = orig_inode->i_blkbits;
 	unsigned int blocksize = 1 << blkbits;
 
-	/* Regular file check */
-	if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) {
-		ext4_debug("ext4 move extent: The argument files should be "
-			"regular file [ino:orig %lu, donor %lu]\n",
-			orig_inode->i_ino, donor_inode->i_ino);
-		return -EINVAL;
-	}
-
 	if (donor_inode->i_mode & (S_ISUID|S_ISGID)) {
 		ext4_debug("ext4 move extent: suid or sgid is set"
 			   " to donor file [ino:orig %lu, donor %lu]\n",
@@ -964,6 +960,9 @@ mext_check_arguments(struct inode *orig_inode,
 		return -EINVAL;
 	}
 
+	if (IS_IMMUTABLE(donor_inode) || IS_APPEND(donor_inode))
+		return -EPERM;
+
 	/* Ext4 move extent does not support swapfile */
 	if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) {
 		ext4_debug("ext4 move extent: The argument files should "
@@ -981,11 +980,11 @@ mext_check_arguments(struct inode *orig_inode,
 	}
 
 	/* Ext4 move extent supports only extent based file */
-	if (!(EXT4_I(orig_inode)->i_flags & EXT4_EXTENTS_FL)) {
+	if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS))) {
 		ext4_debug("ext4 move extent: orig file is not extents "
 			"based file [ino:orig %lu]\n", orig_inode->i_ino);
 		return -EOPNOTSUPP;
-	} else if (!(EXT4_I(donor_inode)->i_flags & EXT4_EXTENTS_FL)) {
+	} else if (!(ext4_test_inode_flag(donor_inode, EXT4_INODE_EXTENTS))) {
 		ext4_debug("ext4 move extent: donor file is not extents "
 			"based file [ino:donor %lu]\n", donor_inode->i_ino);
 		return -EOPNOTSUPP;
@@ -1204,6 +1203,14 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
 		return -EINVAL;
 	}
 
+	/* Regular file check */
+	if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) {
+		ext4_debug("ext4 move extent: The argument files should be "
+			"regular file [ino:orig %lu, donor %lu]\n",
+			orig_inode->i_ino, donor_inode->i_ino);
+		return -EINVAL;
+	}
+
 	/* Protect orig and donor inodes against a truncate */
 	ret1 = mext_inode_double_lock(orig_inode, donor_inode);
 	if (ret1 < 0)
@@ -1351,7 +1358,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
 			if (ret1 < 0)
 				break;
 			if (*moved_len > len) {
-				ext4_error(orig_inode->i_sb, __func__,
+				EXT4_ERROR_INODE(orig_inode,
 					"We replaced blocks too much! "
 					"sum of replaced: %llu requested: %llu",
 					*moved_len, len);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 17a17e10dd60..a43e6617b351 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -187,7 +187,7 @@ unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize)
 		return blocksize;
 	return (len & 65532) | ((len & 3) << 16);
 }
-  
+
 __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
 {
 	if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3))
@@ -197,7 +197,7 @@ __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
 	if (len == blocksize) {
 		if (blocksize == 65536)
 			return cpu_to_le16(EXT4_MAX_REC_LEN);
-		else 
+		else
 			return cpu_to_le16(0);
 	}
 	return cpu_to_le16((len & 65532) | ((len >> 16) & 3));
@@ -349,7 +349,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
 		brelse(bh);
 	}
 	if (bcount)
-		printk(KERN_DEBUG "%snames %u, fullness %u (%u%%)\n", 
+		printk(KERN_DEBUG "%snames %u, fullness %u (%u%%)\n",
 		       levels ? "" : "   ", names, space/bcount,
 		       (space/bcount)*100/blocksize);
 	return (struct stats) { names, space, bcount};
@@ -383,8 +383,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
 	if (root->info.hash_version != DX_HASH_TEA &&
 	    root->info.hash_version != DX_HASH_HALF_MD4 &&
 	    root->info.hash_version != DX_HASH_LEGACY) {
-		ext4_warning(dir->i_sb, __func__,
-			     "Unrecognised inode hash code %d",
+		ext4_warning(dir->i_sb, "Unrecognised inode hash code %d",
 			     root->info.hash_version);
 		brelse(bh);
 		*err = ERR_BAD_DX_DIR;
@@ -399,8 +398,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
 	hash = hinfo->hash;
 
 	if (root->info.unused_flags & 1) {
-		ext4_warning(dir->i_sb, __func__,
-			     "Unimplemented inode hash flags: %#06x",
+		ext4_warning(dir->i_sb, "Unimplemented inode hash flags: %#06x",
 			     root->info.unused_flags);
 		brelse(bh);
 		*err = ERR_BAD_DX_DIR;
@@ -408,8 +406,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
 	}
 
 	if ((indirect = root->info.indirect_levels) > 1) {
-		ext4_warning(dir->i_sb, __func__,
-			     "Unimplemented inode hash depth: %#06x",
+		ext4_warning(dir->i_sb, "Unimplemented inode hash depth: %#06x",
 			     root->info.indirect_levels);
 		brelse(bh);
 		*err = ERR_BAD_DX_DIR;
@@ -421,8 +418,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
 
 	if (dx_get_limit(entries) != dx_root_limit(dir,
 						   root->info.info_length)) {
-		ext4_warning(dir->i_sb, __func__,
-			     "dx entry: limit != root limit");
+		ext4_warning(dir->i_sb, "dx entry: limit != root limit");
 		brelse(bh);
 		*err = ERR_BAD_DX_DIR;
 		goto fail;
@@ -433,7 +429,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
 	{
 		count = dx_get_count(entries);
 		if (!count || count > dx_get_limit(entries)) {
-			ext4_warning(dir->i_sb, __func__,
+			ext4_warning(dir->i_sb,
 				     "dx entry: no count or count > limit");
 			brelse(bh);
 			*err = ERR_BAD_DX_DIR;
@@ -478,7 +474,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
 			goto fail2;
 		at = entries = ((struct dx_node *) bh->b_data)->entries;
 		if (dx_get_limit(entries) != dx_node_limit (dir)) {
-			ext4_warning(dir->i_sb, __func__,
+			ext4_warning(dir->i_sb,
 				     "dx entry: limit != node limit");
 			brelse(bh);
 			*err = ERR_BAD_DX_DIR;
@@ -494,7 +490,7 @@ fail2:
 	}
 fail:
 	if (*err == ERR_BAD_DX_DIR)
-		ext4_warning(dir->i_sb, __func__,
+		ext4_warning(dir->i_sb,
 			     "Corrupt dir inode %ld, running e2fsck is "
 			     "recommended.", dir->i_ino);
 	return NULL;
@@ -657,10 +653,10 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
 	int ret, err;
 	__u32 hashval;
 
-	dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n", 
+	dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n",
 		       start_hash, start_minor_hash));
 	dir = dir_file->f_path.dentry->d_inode;
-	if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) {
+	if (!(ext4_test_inode_flag(dir, EXT4_INODE_INDEX))) {
 		hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
 		if (hinfo.hash_version <= DX_HASH_TEA)
 			hinfo.hash_version +=
@@ -805,7 +801,7 @@ static void ext4_update_dx_flag(struct inode *inode)
 {
 	if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
 				     EXT4_FEATURE_COMPAT_DIR_INDEX))
-		EXT4_I(inode)->i_flags &= ~EXT4_INDEX_FL;
+		ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
 }
 
 /*
@@ -947,9 +943,8 @@ restart:
 		wait_on_buffer(bh);
 		if (!buffer_uptodate(bh)) {
 			/* read error, skip block & hope for the best */
-			ext4_error(sb, __func__, "reading directory #%lu "
-				   "offset %lu", dir->i_ino,
-				   (unsigned long)block);
+			EXT4_ERROR_INODE(dir, "reading directory lblock %lu",
+					 (unsigned long) block);
 			brelse(bh);
 			goto next;
 		}
@@ -1041,7 +1036,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q
 		retval = ext4_htree_next_block(dir, hash, frame,
 					       frames, NULL);
 		if (retval < 0) {
-			ext4_warning(sb, __func__,
+			ext4_warning(sb,
 			     "error reading index page in directory #%lu",
 			     dir->i_ino);
 			*err = retval;
@@ -1071,16 +1066,15 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
 		__u32 ino = le32_to_cpu(de->inode);
 		brelse(bh);
 		if (!ext4_valid_inum(dir->i_sb, ino)) {
-			ext4_error(dir->i_sb, "ext4_lookup",
-				   "bad inode number: %u", ino);
+			EXT4_ERROR_INODE(dir, "bad inode number: %u", ino);
 			return ERR_PTR(-EIO);
 		}
 		inode = ext4_iget(dir->i_sb, ino);
 		if (unlikely(IS_ERR(inode))) {
 			if (PTR_ERR(inode) == -ESTALE) {
-				ext4_error(dir->i_sb, __func__,
-						"deleted inode referenced: %u",
-						ino);
+				EXT4_ERROR_INODE(dir,
+						 "deleted inode referenced: %u",
+						 ino);
 				return ERR_PTR(-EIO);
 			} else {
 				return ERR_CAST(inode);
@@ -1110,8 +1104,8 @@ struct dentry *ext4_get_parent(struct dentry *child)
 	brelse(bh);
 
 	if (!ext4_valid_inum(child->d_inode->i_sb, ino)) {
-		ext4_error(child->d_inode->i_sb, "ext4_get_parent",
-			   "bad inode number: %u", ino);
+		EXT4_ERROR_INODE(child->d_inode,
+				 "bad parent inode number: %u", ino);
 		return ERR_PTR(-EIO);
 	}
 
@@ -1147,7 +1141,7 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count,
 	unsigned rec_len = 0;
 
 	while (count--) {
-		struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) 
+		struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *)
 						(from + (map->offs<<2));
 		rec_len = EXT4_DIR_REC_LEN(de->name_len);
 		memcpy (to, de, rec_len);
@@ -1410,9 +1404,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
 	de = (struct ext4_dir_entry_2 *)((char *)fde +
 		ext4_rec_len_from_disk(fde->rec_len, blocksize));
 	if ((char *) de >= (((char *) root) + blocksize)) {
-		ext4_error(dir->i_sb, __func__,
-			   "invalid rec_len for '..' in inode %lu",
-			   dir->i_ino);
+		EXT4_ERROR_INODE(dir, "invalid rec_len for '..'");
 		brelse(bh);
 		return -EIO;
 	}
@@ -1424,7 +1416,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
 		brelse(bh);
 		return retval;
 	}
-	EXT4_I(dir)->i_flags |= EXT4_INDEX_FL;
+	ext4_set_inode_flag(dir, EXT4_INODE_INDEX);
 	data1 = bh2->b_data;
 
 	memcpy (data1, de, len);
@@ -1497,7 +1489,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
 		retval = ext4_dx_add_entry(handle, dentry, inode);
 		if (!retval || (retval != ERR_BAD_DX_DIR))
 			return retval;
-		EXT4_I(dir)->i_flags &= ~EXT4_INDEX_FL;
+		ext4_clear_inode_flag(dir, EXT4_INODE_INDEX);
 		dx_fallback++;
 		ext4_mark_inode_dirty(handle, dir);
 	}
@@ -1525,6 +1517,8 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
 	de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize);
 	retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
 	brelse(bh);
+	if (retval == 0)
+		ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY);
 	return retval;
 }
 
@@ -1575,8 +1569,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
 
 		if (levels && (dx_get_count(frames->entries) ==
 			       dx_get_limit(frames->entries))) {
-			ext4_warning(sb, __func__,
-				     "Directory index full!");
+			ext4_warning(sb, "Directory index full!");
 			err = -ENOSPC;
 			goto cleanup;
 		}
@@ -1766,6 +1759,8 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, int mode,
 	struct inode *inode;
 	int err, retries = 0;
 
+	dquot_initialize(dir);
+
 retry:
 	handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
 					EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
@@ -1800,6 +1795,8 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry,
 	if (!new_valid_dev(rdev))
 		return -EINVAL;
 
+	dquot_initialize(dir);
+
 retry:
 	handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
 					EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
@@ -1837,6 +1834,8 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	if (EXT4_DIR_LINK_MAX(dir))
 		return -EMLINK;
 
+	dquot_initialize(dir);
+
 retry:
 	handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
 					EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
@@ -1916,11 +1915,10 @@ static int empty_dir(struct inode *inode)
 	if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) ||
 	    !(bh = ext4_bread(NULL, inode, 0, 0, &err))) {
 		if (err)
-			ext4_error(inode->i_sb, __func__,
-				   "error %d reading directory #%lu offset 0",
-				   err, inode->i_ino);
+			EXT4_ERROR_INODE(inode,
+				"error %d reading directory lblock 0", err);
 		else
-			ext4_warning(inode->i_sb, __func__,
+			ext4_warning(inode->i_sb,
 				     "bad directory (dir #%lu) - no data block",
 				     inode->i_ino);
 		return 1;
@@ -1931,7 +1929,7 @@ static int empty_dir(struct inode *inode)
 			!le32_to_cpu(de1->inode) ||
 			strcmp(".", de->name) ||
 			strcmp("..", de1->name)) {
-		ext4_warning(inode->i_sb, "empty_dir",
+		ext4_warning(inode->i_sb,
 			     "bad directory (dir #%lu) - no `.' or `..'",
 			     inode->i_ino);
 		brelse(bh);
@@ -1942,17 +1940,17 @@ static int empty_dir(struct inode *inode)
 	de = ext4_next_entry(de1, sb->s_blocksize);
 	while (offset < inode->i_size) {
 		if (!bh ||
-			(void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
+		    (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
+			unsigned int lblock;
 			err = 0;
 			brelse(bh);
-			bh = ext4_bread(NULL, inode,
-				offset >> EXT4_BLOCK_SIZE_BITS(sb), 0, &err);
+			lblock = offset >> EXT4_BLOCK_SIZE_BITS(sb);
+			bh = ext4_bread(NULL, inode, lblock, 0, &err);
 			if (!bh) {
 				if (err)
-					ext4_error(sb, __func__,
-						   "error %d reading directory"
-						   " #%lu offset %u",
-						   err, inode->i_ino, offset);
+					EXT4_ERROR_INODE(inode,
+						"error %d reading directory "
+						"lblock %u", err, lblock);
 				offset += sb->s_blocksize;
 				continue;
 			}
@@ -2020,11 +2018,18 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
 	err = ext4_reserve_inode_write(handle, inode, &iloc);
 	if (err)
 		goto out_unlock;
+	/*
+	 * Due to previous errors inode may be already a part of on-disk
+	 * orphan list. If so skip on-disk list modification.
+	 */
+	if (NEXT_ORPHAN(inode) && NEXT_ORPHAN(inode) <=
+		(le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)))
+			goto mem_insert;
 
 	/* Insert this inode at the head of the on-disk orphan list... */
 	NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan);
 	EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
-	err = ext4_handle_dirty_metadata(handle, inode, EXT4_SB(sb)->s_sbh);
+	err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
 	rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
 	if (!err)
 		err = rc;
@@ -2037,6 +2042,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
 	 *
 	 * This is safe: on error we're going to ignore the orphan list
 	 * anyway on the next recovery. */
+mem_insert:
 	if (!err)
 		list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
 
@@ -2096,7 +2102,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
 		if (err)
 			goto out_brelse;
 		sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
-		err = ext4_handle_dirty_metadata(handle, inode, sbi->s_sbh);
+		err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
 	} else {
 		struct ext4_iloc iloc2;
 		struct inode *i_prev =
@@ -2136,7 +2142,9 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
 
 	/* Initialize quotas before so that eventual writes go in
 	 * separate transaction */
-	vfs_dq_init(dentry->d_inode);
+	dquot_initialize(dir);
+	dquot_initialize(dentry->d_inode);
+
 	handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb));
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
@@ -2163,7 +2171,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
 	if (retval)
 		goto end_rmdir;
 	if (!EXT4_DIR_LINK_EMPTY(inode))
-		ext4_warning(inode->i_sb, "ext4_rmdir",
+		ext4_warning(inode->i_sb,
 			     "empty directory has too many links (%d)",
 			     inode->i_nlink);
 	inode->i_version++;
@@ -2195,7 +2203,9 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
 
 	/* Initialize quotas before so that eventual writes go
 	 * in separate transaction */
-	vfs_dq_init(dentry->d_inode);
+	dquot_initialize(dir);
+	dquot_initialize(dentry->d_inode);
+
 	handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb));
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
@@ -2215,7 +2225,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
 		goto end_unlink;
 
 	if (!inode->i_nlink) {
-		ext4_warning(inode->i_sb, "ext4_unlink",
+		ext4_warning(inode->i_sb,
 			     "Deleting nonexistent file (%lu), %d",
 			     inode->i_ino, inode->i_nlink);
 		inode->i_nlink = 1;
@@ -2250,6 +2260,8 @@ static int ext4_symlink(struct inode *dir,
 	if (l > dir->i_sb->s_blocksize)
 		return -ENAMETOOLONG;
 
+	dquot_initialize(dir);
+
 retry:
 	handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
 					EXT4_INDEX_EXTRA_TRANS_BLOCKS + 5 +
@@ -2284,7 +2296,7 @@ retry:
 		}
 	} else {
 		/* clear the extent format for fast symlink */
-		EXT4_I(inode)->i_flags &= ~EXT4_EXTENTS_FL;
+		ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
 		inode->i_op = &ext4_fast_symlink_inode_operations;
 		memcpy((char *)&EXT4_I(inode)->i_data, symname, l);
 		inode->i_size = l-1;
@@ -2308,6 +2320,8 @@ static int ext4_link(struct dentry *old_dentry,
 	if (inode->i_nlink >= EXT4_LINK_MAX)
 		return -EMLINK;
 
+	dquot_initialize(dir);
+
 	/*
 	 * Return -ENOENT if we've raced with unlink and i_nlink is 0.  Doing
 	 * otherwise has the potential to corrupt the orphan inode list.
@@ -2358,12 +2372,15 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct ext4_dir_entry_2 *old_de, *new_de;
 	int retval, force_da_alloc = 0;
 
+	dquot_initialize(old_dir);
+	dquot_initialize(new_dir);
+
 	old_bh = new_bh = dir_bh = NULL;
 
 	/* Initialize quotas before so that eventual writes go
 	 * in separate transaction */
 	if (new_dentry->d_inode)
-		vfs_dq_init(new_dentry->d_inode);
+		dquot_initialize(new_dentry->d_inode);
 	handle = ext4_journal_start(old_dir, 2 *
 					EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) +
 					EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2);
@@ -2462,7 +2479,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
 		}
 	}
 	if (retval) {
-		ext4_warning(old_dir->i_sb, "ext4_rename",
+		ext4_warning(old_dir->i_sb,
 				"Deleting old file (%lu), %d, error=%d",
 				old_dir->i_ino, old_dir->i_nlink, retval);
 	}
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 3b2c5541d8a6..6df797eb9aeb 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -48,65 +48,54 @@ static int verify_group_input(struct super_block *sb,
 
 	ext4_get_group_no_and_offset(sb, start, NULL, &offset);
 	if (group != sbi->s_groups_count)
-		ext4_warning(sb, __func__,
-			     "Cannot add at group %u (only %u groups)",
+		ext4_warning(sb, "Cannot add at group %u (only %u groups)",
 			     input->group, sbi->s_groups_count);
 	else if (offset != 0)
-			ext4_warning(sb, __func__, "Last group not full");
+			ext4_warning(sb, "Last group not full");
 	else if (input->reserved_blocks > input->blocks_count / 5)
-		ext4_warning(sb, __func__, "Reserved blocks too high (%u)",
+		ext4_warning(sb, "Reserved blocks too high (%u)",
 			     input->reserved_blocks);
 	else if (free_blocks_count < 0)
-		ext4_warning(sb, __func__, "Bad blocks count %u",
+		ext4_warning(sb, "Bad blocks count %u",
 			     input->blocks_count);
 	else if (!(bh = sb_bread(sb, end - 1)))
-		ext4_warning(sb, __func__,
-			     "Cannot read last block (%llu)",
+		ext4_warning(sb, "Cannot read last block (%llu)",
 			     end - 1);
 	else if (outside(input->block_bitmap, start, end))
-		ext4_warning(sb, __func__,
-			     "Block bitmap not in group (block %llu)",
+		ext4_warning(sb, "Block bitmap not in group (block %llu)",
 			     (unsigned long long)input->block_bitmap);
 	else if (outside(input->inode_bitmap, start, end))
-		ext4_warning(sb, __func__,
-			     "Inode bitmap not in group (block %llu)",
+		ext4_warning(sb, "Inode bitmap not in group (block %llu)",
 			     (unsigned long long)input->inode_bitmap);
 	else if (outside(input->inode_table, start, end) ||
 		 outside(itend - 1, start, end))
-		ext4_warning(sb, __func__,
-			     "Inode table not in group (blocks %llu-%llu)",
+		ext4_warning(sb, "Inode table not in group (blocks %llu-%llu)",
 			     (unsigned long long)input->inode_table, itend - 1);
 	else if (input->inode_bitmap == input->block_bitmap)
-		ext4_warning(sb, __func__,
-			     "Block bitmap same as inode bitmap (%llu)",
+		ext4_warning(sb, "Block bitmap same as inode bitmap (%llu)",
 			     (unsigned long long)input->block_bitmap);
 	else if (inside(input->block_bitmap, input->inode_table, itend))
-		ext4_warning(sb, __func__,
-			     "Block bitmap (%llu) in inode table (%llu-%llu)",
+		ext4_warning(sb, "Block bitmap (%llu) in inode table "
+			     "(%llu-%llu)",
 			     (unsigned long long)input->block_bitmap,
 			     (unsigned long long)input->inode_table, itend - 1);
 	else if (inside(input->inode_bitmap, input->inode_table, itend))
-		ext4_warning(sb, __func__,
-			     "Inode bitmap (%llu) in inode table (%llu-%llu)",
+		ext4_warning(sb, "Inode bitmap (%llu) in inode table "
+			     "(%llu-%llu)",
 			     (unsigned long long)input->inode_bitmap,
 			     (unsigned long long)input->inode_table, itend - 1);
 	else if (inside(input->block_bitmap, start, metaend))
-		ext4_warning(sb, __func__,
-			     "Block bitmap (%llu) in GDT table"
-			     " (%llu-%llu)",
+		ext4_warning(sb, "Block bitmap (%llu) in GDT table (%llu-%llu)",
 			     (unsigned long long)input->block_bitmap,
 			     start, metaend - 1);
 	else if (inside(input->inode_bitmap, start, metaend))
-		ext4_warning(sb, __func__,
-			     "Inode bitmap (%llu) in GDT table"
-			     " (%llu-%llu)",
+		ext4_warning(sb, "Inode bitmap (%llu) in GDT table (%llu-%llu)",
 			     (unsigned long long)input->inode_bitmap,
 			     start, metaend - 1);
 	else if (inside(input->inode_table, start, metaend) ||
 		 inside(itend - 1, start, metaend))
-		ext4_warning(sb, __func__,
-			     "Inode table (%llu-%llu) overlaps"
-			     "GDT table (%llu-%llu)",
+		ext4_warning(sb, "Inode table (%llu-%llu) overlaps GDT table "
+			     "(%llu-%llu)",
 			     (unsigned long long)input->inode_table,
 			     itend - 1, start, metaend - 1);
 	else
@@ -364,8 +353,7 @@ static int verify_reserved_gdb(struct super_block *sb,
 	while ((grp = ext4_list_backups(sb, &three, &five, &seven)) < end) {
 		if (le32_to_cpu(*p++) !=
 		    grp * EXT4_BLOCKS_PER_GROUP(sb) + blk){
-			ext4_warning(sb, __func__,
-				     "reserved GDT %llu"
+			ext4_warning(sb, "reserved GDT %llu"
 				     " missing grp %d (%llu)",
 				     blk, grp,
 				     grp *
@@ -420,8 +408,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
          */
 	if (EXT4_SB(sb)->s_sbh->b_blocknr !=
 	    le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) {
-		ext4_warning(sb, __func__,
-			"won't resize using backup superblock at %llu",
+		ext4_warning(sb, "won't resize using backup superblock at %llu",
 			(unsigned long long)EXT4_SB(sb)->s_sbh->b_blocknr);
 		return -EPERM;
 	}
@@ -444,8 +431,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
 
 	data = (__le32 *)dind->b_data;
 	if (le32_to_cpu(data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)]) != gdblock) {
-		ext4_warning(sb, __func__,
-			     "new group %u GDT block %llu not reserved",
+		ext4_warning(sb, "new group %u GDT block %llu not reserved",
 			     input->group, gdblock);
 		err = -EINVAL;
 		goto exit_dind;
@@ -468,7 +454,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
 			GFP_NOFS);
 	if (!n_group_desc) {
 		err = -ENOMEM;
-		ext4_warning(sb, __func__,
+		ext4_warning(sb,
 			      "not enough memory for %lu groups", gdb_num + 1);
 		goto exit_inode;
 	}
@@ -567,8 +553,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
 	/* Get each reserved primary GDT block and verify it holds backups */
 	for (res = 0; res < reserved_gdb; res++, blk++) {
 		if (le32_to_cpu(*data) != blk) {
-			ext4_warning(sb, __func__,
-				     "reserved block %llu"
+			ext4_warning(sb, "reserved block %llu"
 				     " not at offset %ld",
 				     blk,
 				     (long)(data - (__le32 *)dind->b_data));
@@ -713,8 +698,7 @@ static void update_backups(struct super_block *sb,
 	 */
 exit_err:
 	if (err) {
-		ext4_warning(sb, __func__,
-			     "can't update backup for group %u (err %d), "
+		ext4_warning(sb, "can't update backup for group %u (err %d), "
 			     "forcing fsck on next reboot", group, err);
 		sbi->s_mount_state &= ~EXT4_VALID_FS;
 		sbi->s_es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
@@ -753,20 +737,19 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 
 	if (gdb_off == 0 && !EXT4_HAS_RO_COMPAT_FEATURE(sb,
 					EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER)) {
-		ext4_warning(sb, __func__,
-			     "Can't resize non-sparse filesystem further");
+		ext4_warning(sb, "Can't resize non-sparse filesystem further");
 		return -EPERM;
 	}
 
 	if (ext4_blocks_count(es) + input->blocks_count <
 	    ext4_blocks_count(es)) {
-		ext4_warning(sb, __func__, "blocks_count overflow");
+		ext4_warning(sb, "blocks_count overflow");
 		return -EINVAL;
 	}
 
 	if (le32_to_cpu(es->s_inodes_count) + EXT4_INODES_PER_GROUP(sb) <
 	    le32_to_cpu(es->s_inodes_count)) {
-		ext4_warning(sb, __func__, "inodes_count overflow");
+		ext4_warning(sb, "inodes_count overflow");
 		return -EINVAL;
 	}
 
@@ -774,14 +757,13 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 		if (!EXT4_HAS_COMPAT_FEATURE(sb,
 					     EXT4_FEATURE_COMPAT_RESIZE_INODE)
 		    || !le16_to_cpu(es->s_reserved_gdt_blocks)) {
-			ext4_warning(sb, __func__,
+			ext4_warning(sb,
 				     "No reserved GDT blocks, can't resize");
 			return -EPERM;
 		}
 		inode = ext4_iget(sb, EXT4_RESIZE_INO);
 		if (IS_ERR(inode)) {
-			ext4_warning(sb, __func__,
-				     "Error opening resize inode");
+			ext4_warning(sb, "Error opening resize inode");
 			return PTR_ERR(inode);
 		}
 	}
@@ -810,8 +792,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 
 	mutex_lock(&sbi->s_resize_lock);
 	if (input->group != sbi->s_groups_count) {
-		ext4_warning(sb, __func__,
-			     "multiple resizers run on filesystem!");
+		ext4_warning(sb, "multiple resizers run on filesystem!");
 		err = -EBUSY;
 		goto exit_journal;
 	}
@@ -930,7 +911,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 	percpu_counter_add(&sbi->s_freeinodes_counter,
 			   EXT4_INODES_PER_GROUP(sb));
 
-	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
+	    sbi->s_log_groups_per_flex) {
 		ext4_group_t flex_group;
 		flex_group = ext4_flex_group(sbi, input->group);
 		atomic_add(input->free_blocks_count,
@@ -997,13 +979,12 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
 			" too large to resize to %llu blocks safely\n",
 			sb->s_id, n_blocks_count);
 		if (sizeof(sector_t) < 8)
-			ext4_warning(sb, __func__, "CONFIG_LBDAF not enabled");
+			ext4_warning(sb, "CONFIG_LBDAF not enabled");
 		return -EINVAL;
 	}
 
 	if (n_blocks_count < o_blocks_count) {
-		ext4_warning(sb, __func__,
-			     "can't shrink FS - resize aborted");
+		ext4_warning(sb, "can't shrink FS - resize aborted");
 		return -EBUSY;
 	}
 
@@ -1011,15 +992,14 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
 	ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last);
 
 	if (last == 0) {
-		ext4_warning(sb, __func__,
-			     "need to use ext2online to resize further");
+		ext4_warning(sb, "need to use ext2online to resize further");
 		return -EPERM;
 	}
 
 	add = EXT4_BLOCKS_PER_GROUP(sb) - last;
 
 	if (o_blocks_count + add < o_blocks_count) {
-		ext4_warning(sb, __func__, "blocks_count overflow");
+		ext4_warning(sb, "blocks_count overflow");
 		return -EINVAL;
 	}
 
@@ -1027,16 +1007,13 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
 		add = n_blocks_count - o_blocks_count;
 
 	if (o_blocks_count + add < n_blocks_count)
-		ext4_warning(sb, __func__,
-			     "will only finish group (%llu"
-			     " blocks, %u new)",
+		ext4_warning(sb, "will only finish group (%llu blocks, %u new)",
 			     o_blocks_count + add, add);
 
 	/* See if the device is actually as big as what was requested */
 	bh = sb_bread(sb, o_blocks_count + add - 1);
 	if (!bh) {
-		ext4_warning(sb, __func__,
-			     "can't read last block, resize aborted");
+		ext4_warning(sb, "can't read last block, resize aborted");
 		return -ENOSPC;
 	}
 	brelse(bh);
@@ -1047,14 +1024,13 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
 	handle = ext4_journal_start_sb(sb, 3);
 	if (IS_ERR(handle)) {
 		err = PTR_ERR(handle);
-		ext4_warning(sb, __func__, "error %d on journal start", err);
+		ext4_warning(sb, "error %d on journal start", err);
 		goto exit_put;
 	}
 
 	mutex_lock(&EXT4_SB(sb)->s_resize_lock);
 	if (o_blocks_count != ext4_blocks_count(es)) {
-		ext4_warning(sb, __func__,
-			     "multiple resizers run on filesystem!");
+		ext4_warning(sb, "multiple resizers run on filesystem!");
 		mutex_unlock(&EXT4_SB(sb)->s_resize_lock);
 		ext4_journal_stop(handle);
 		err = -EBUSY;
@@ -1063,8 +1039,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
 
 	if ((err = ext4_journal_get_write_access(handle,
 						 EXT4_SB(sb)->s_sbh))) {
-		ext4_warning(sb, __func__,
-			     "error %d on journal write access", err);
+		ext4_warning(sb, "error %d on journal write access", err);
 		mutex_unlock(&EXT4_SB(sb)->s_resize_lock);
 		ext4_journal_stop(handle);
 		goto exit_put;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 735c20d5fd56..4e8983a9811b 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -68,7 +68,21 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
 static int ext4_unfreeze(struct super_block *sb);
 static void ext4_write_super(struct super_block *sb);
 static int ext4_freeze(struct super_block *sb);
+static int ext4_get_sb(struct file_system_type *fs_type, int flags,
+		       const char *dev_name, void *data, struct vfsmount *mnt);
 
+#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
+static struct file_system_type ext3_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "ext3",
+	.get_sb		= ext4_get_sb,
+	.kill_sb	= kill_block_super,
+	.fs_flags	= FS_REQUIRES_DEV,
+};
+#define IS_EXT3_SB(sb) ((sb)->s_bdev->bd_holder == &ext3_fs_type)
+#else
+#define IS_EXT3_SB(sb) (0)
+#endif
 
 ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
 			       struct ext4_group_desc *bg)
@@ -227,6 +241,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
 	if (sb->s_flags & MS_RDONLY)
 		return ERR_PTR(-EROFS);
 
+	vfs_check_frozen(sb, SB_FREEZE_WRITE);
 	/* Special case here: if the journal has aborted behind our
 	 * backs (eg. EIO in the commit thread), then we still need to
 	 * take the FS itself readonly cleanly. */
@@ -302,7 +317,7 @@ void ext4_journal_abort_handle(const char *caller, const char *err_fn,
  * write out the superblock safely.
  *
  * We'll just use the jbd2_journal_abort() error code to record an error in
- * the journal instead.  On recovery, the journal will compain about
+ * the journal instead.  On recovery, the journal will complain about
  * that error until we've noted it down and cleared it.
  */
 
@@ -333,7 +348,7 @@ static void ext4_handle_error(struct super_block *sb)
 			sb->s_id);
 }
 
-void ext4_error(struct super_block *sb, const char *function,
+void __ext4_error(struct super_block *sb, const char *function,
 		const char *fmt, ...)
 {
 	va_list args;
@@ -347,6 +362,42 @@ void ext4_error(struct super_block *sb, const char *function,
 	ext4_handle_error(sb);
 }
 
+void ext4_error_inode(const char *function, struct inode *inode,
+		      const char *fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+	printk(KERN_CRIT "EXT4-fs error (device %s): %s: inode #%lu: (comm %s) ",
+	       inode->i_sb->s_id, function, inode->i_ino, current->comm);
+	vprintk(fmt, args);
+	printk("\n");
+	va_end(args);
+
+	ext4_handle_error(inode->i_sb);
+}
+
+void ext4_error_file(const char *function, struct file *file,
+		     const char *fmt, ...)
+{
+	va_list args;
+	struct inode *inode = file->f_dentry->d_inode;
+	char pathname[80], *path;
+
+	va_start(args, fmt);
+	path = d_path(&(file->f_path), pathname, sizeof(pathname));
+	if (!path)
+		path = "(unknown)";
+	printk(KERN_CRIT
+	       "EXT4-fs error (device %s): %s: inode #%lu (comm %s path %s): ",
+	       inode->i_sb->s_id, function, inode->i_ino, current->comm, path);
+	vprintk(fmt, args);
+	printk("\n");
+	va_end(args);
+
+	ext4_handle_error(inode->i_sb);
+}
+
 static const char *ext4_decode_error(struct super_block *sb, int errno,
 				     char nbuf[16])
 {
@@ -450,7 +501,7 @@ void ext4_msg (struct super_block * sb, const char *prefix,
 	va_end(args);
 }
 
-void ext4_warning(struct super_block *sb, const char *function,
+void __ext4_warning(struct super_block *sb, const char *function,
 		  const char *fmt, ...)
 {
 	va_list args;
@@ -507,7 +558,7 @@ void ext4_update_dynamic_rev(struct super_block *sb)
 	if (le32_to_cpu(es->s_rev_level) > EXT4_GOOD_OLD_REV)
 		return;
 
-	ext4_warning(sb, __func__,
+	ext4_warning(sb,
 		     "updating to rev %d because of new feature flag, "
 		     "running e2fsck is recommended",
 		     EXT4_DYNAMIC_REV);
@@ -595,6 +646,8 @@ static void ext4_put_super(struct super_block *sb)
 	struct ext4_super_block *es = sbi->s_es;
 	int i, err;
 
+	dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
+
 	flush_workqueue(sbi->dio_unwritten_wq);
 	destroy_workqueue(sbi->dio_unwritten_wq);
 
@@ -708,7 +761,8 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
 #ifdef CONFIG_QUOTA
 	ei->i_reserved_quota = 0;
 #endif
-	INIT_LIST_HEAD(&ei->i_aio_dio_complete_list);
+	INIT_LIST_HEAD(&ei->i_completed_io_list);
+	spin_lock_init(&ei->i_completed_io_lock);
 	ei->cur_aio_dio = NULL;
 	ei->i_sync_tid = 0;
 	ei->i_datasync_tid = 0;
@@ -761,6 +815,7 @@ static void destroy_inodecache(void)
 
 static void ext4_clear_inode(struct inode *inode)
 {
+	dquot_drop(inode);
 	ext4_discard_preallocations(inode);
 	if (EXT4_JOURNAL(inode))
 		jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal,
@@ -796,10 +851,10 @@ static inline void ext4_show_quota_options(struct seq_file *seq,
 	if (sbi->s_qf_names[GRPQUOTA])
 		seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]);
 
-	if (sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA)
+	if (test_opt(sb, USRQUOTA))
 		seq_puts(seq, ",usrquota");
 
-	if (sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA)
+	if (test_opt(sb, GRPQUOTA))
 		seq_puts(seq, ",grpquota");
 #endif
 }
@@ -889,6 +944,8 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
 	seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0");
 	if (test_opt(sb, JOURNAL_ASYNC_COMMIT))
 		seq_puts(seq, ",journal_async_commit");
+	else if (test_opt(sb, JOURNAL_CHECKSUM))
+		seq_puts(seq, ",journal_checksum");
 	if (test_opt(sb, NOBH))
 		seq_puts(seq, ",nobh");
 	if (test_opt(sb, I_VERSION))
@@ -926,6 +983,9 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
 	if (test_opt(sb, NOLOAD))
 		seq_puts(seq, ",norecovery");
 
+	if (test_opt(sb, DIOREAD_NOLOCK))
+		seq_puts(seq, ",dioread_nolock");
+
 	ext4_show_quota_options(seq, sb);
 
 	return 0;
@@ -1004,7 +1064,7 @@ static int ext4_release_dquot(struct dquot *dquot);
 static int ext4_mark_dquot_dirty(struct dquot *dquot);
 static int ext4_write_info(struct super_block *sb, int type);
 static int ext4_quota_on(struct super_block *sb, int type, int format_id,
-				char *path, int remount);
+				char *path);
 static int ext4_quota_on_mount(struct super_block *sb, int type);
 static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
 			       size_t len, loff_t off);
@@ -1012,19 +1072,9 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
 				const char *data, size_t len, loff_t off);
 
 static const struct dquot_operations ext4_quota_operations = {
-	.initialize	= dquot_initialize,
-	.drop		= dquot_drop,
-	.alloc_space	= dquot_alloc_space,
-	.reserve_space	= dquot_reserve_space,
-	.claim_space	= dquot_claim_space,
-	.release_rsv	= dquot_release_reserved_space,
 #ifdef CONFIG_QUOTA
 	.get_reserved_space = ext4_get_reserved_space,
 #endif
-	.alloc_inode	= dquot_alloc_inode,
-	.free_space	= dquot_free_space,
-	.free_inode	= dquot_free_inode,
-	.transfer	= dquot_transfer,
 	.write_dquot	= ext4_write_dquot,
 	.acquire_dquot	= ext4_acquire_dquot,
 	.release_dquot	= ext4_release_dquot,
@@ -1036,12 +1086,12 @@ static const struct dquot_operations ext4_quota_operations = {
 
 static const struct quotactl_ops ext4_qctl_operations = {
 	.quota_on	= ext4_quota_on,
-	.quota_off	= vfs_quota_off,
-	.quota_sync	= vfs_quota_sync,
-	.get_info	= vfs_get_dqinfo,
-	.set_info	= vfs_set_dqinfo,
-	.get_dqblk	= vfs_get_dqblk,
-	.set_dqblk	= vfs_set_dqblk
+	.quota_off	= dquot_quota_off,
+	.quota_sync	= dquot_quota_sync,
+	.get_info	= dquot_get_dqinfo,
+	.set_info	= dquot_set_dqinfo,
+	.get_dqblk	= dquot_get_dqblk,
+	.set_dqblk	= dquot_set_dqblk
 };
 #endif
 
@@ -1109,6 +1159,7 @@ enum {
 	Opt_stripe, Opt_delalloc, Opt_nodelalloc,
 	Opt_block_validity, Opt_noblock_validity,
 	Opt_inode_readahead_blks, Opt_journal_ioprio,
+	Opt_dioread_nolock, Opt_dioread_lock,
 	Opt_discard, Opt_nodiscard,
 };
 
@@ -1176,6 +1227,8 @@ static const match_table_t tokens = {
 	{Opt_auto_da_alloc, "auto_da_alloc=%u"},
 	{Opt_auto_da_alloc, "auto_da_alloc"},
 	{Opt_noauto_da_alloc, "noauto_da_alloc"},
+	{Opt_dioread_nolock, "dioread_nolock"},
+	{Opt_dioread_lock, "dioread_lock"},
 	{Opt_discard, "discard"},
 	{Opt_nodiscard, "nodiscard"},
 	{Opt_err, NULL},
@@ -1205,6 +1258,66 @@ static ext4_fsblk_t get_sb_block(void **data)
 }
 
 #define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3))
+static char deprecated_msg[] = "Mount option \"%s\" will be removed by %s\n"
+	"Contact linux-ext4@vger.kernel.org if you think we should keep it.\n";
+
+#ifdef CONFIG_QUOTA
+static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	char *qname;
+
+	if (sb_any_quota_loaded(sb) &&
+		!sbi->s_qf_names[qtype]) {
+		ext4_msg(sb, KERN_ERR,
+			"Cannot change journaled "
+			"quota options when quota turned on");
+		return 0;
+	}
+	qname = match_strdup(args);
+	if (!qname) {
+		ext4_msg(sb, KERN_ERR,
+			"Not enough memory for storing quotafile name");
+		return 0;
+	}
+	if (sbi->s_qf_names[qtype] &&
+		strcmp(sbi->s_qf_names[qtype], qname)) {
+		ext4_msg(sb, KERN_ERR,
+			"%s quota file already specified", QTYPE2NAME(qtype));
+		kfree(qname);
+		return 0;
+	}
+	sbi->s_qf_names[qtype] = qname;
+	if (strchr(sbi->s_qf_names[qtype], '/')) {
+		ext4_msg(sb, KERN_ERR,
+			"quotafile must be on filesystem root");
+		kfree(sbi->s_qf_names[qtype]);
+		sbi->s_qf_names[qtype] = NULL;
+		return 0;
+	}
+	set_opt(sbi->s_mount_opt, QUOTA);
+	return 1;
+}
+
+static int clear_qf_name(struct super_block *sb, int qtype)
+{
+
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+	if (sb_any_quota_loaded(sb) &&
+		sbi->s_qf_names[qtype]) {
+		ext4_msg(sb, KERN_ERR, "Cannot change journaled quota options"
+			" when quota turned on");
+		return 0;
+	}
+	/*
+	 * The space will be released later when all options are confirmed
+	 * to be correct
+	 */
+	sbi->s_qf_names[qtype] = NULL;
+	return 1;
+}
+#endif
 
 static int parse_options(char *options, struct super_block *sb,
 			 unsigned long *journal_devnum,
@@ -1217,8 +1330,7 @@ static int parse_options(char *options, struct super_block *sb,
 	int data_opt = 0;
 	int option;
 #ifdef CONFIG_QUOTA
-	int qtype, qfmt;
-	char *qname;
+	int qfmt;
 #endif
 
 	if (!options)
@@ -1229,19 +1341,31 @@ static int parse_options(char *options, struct super_block *sb,
 		if (!*p)
 			continue;
 
+		/*
+		 * Initialize args struct so we know whether arg was
+		 * found; some options take optional arguments.
+		 */
+		args[0].to = args[0].from = 0;
 		token = match_token(p, tokens, args);
 		switch (token) {
 		case Opt_bsd_df:
+			ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
 			clear_opt(sbi->s_mount_opt, MINIX_DF);
 			break;
 		case Opt_minix_df:
+			ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
 			set_opt(sbi->s_mount_opt, MINIX_DF);
+
 			break;
 		case Opt_grpid:
+			ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
 			set_opt(sbi->s_mount_opt, GRPID);
+
 			break;
 		case Opt_nogrpid:
+			ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
 			clear_opt(sbi->s_mount_opt, GRPID);
+
 			break;
 		case Opt_resuid:
 			if (match_int(&args[0], &option))
@@ -1378,14 +1502,13 @@ static int parse_options(char *options, struct super_block *sb,
 			data_opt = EXT4_MOUNT_WRITEBACK_DATA;
 		datacheck:
 			if (is_remount) {
-				if ((sbi->s_mount_opt & EXT4_MOUNT_DATA_FLAGS)
-						!= data_opt) {
+				if (test_opt(sb, DATA_FLAGS) != data_opt) {
 					ext4_msg(sb, KERN_ERR,
 						"Cannot change data mode on remount");
 					return 0;
 				}
 			} else {
-				sbi->s_mount_opt &= ~EXT4_MOUNT_DATA_FLAGS;
+				clear_opt(sbi->s_mount_opt, DATA_FLAGS);
 				sbi->s_mount_opt |= data_opt;
 			}
 			break;
@@ -1397,63 +1520,22 @@ static int parse_options(char *options, struct super_block *sb,
 			break;
 #ifdef CONFIG_QUOTA
 		case Opt_usrjquota:
-			qtype = USRQUOTA;
-			goto set_qf_name;
-		case Opt_grpjquota:
-			qtype = GRPQUOTA;
-set_qf_name:
-			if (sb_any_quota_loaded(sb) &&
-			    !sbi->s_qf_names[qtype]) {
-				ext4_msg(sb, KERN_ERR,
-				       "Cannot change journaled "
-				       "quota options when quota turned on");
-				return 0;
-			}
-			qname = match_strdup(&args[0]);
-			if (!qname) {
-				ext4_msg(sb, KERN_ERR,
-					"Not enough memory for "
-					"storing quotafile name");
-				return 0;
-			}
-			if (sbi->s_qf_names[qtype] &&
-			    strcmp(sbi->s_qf_names[qtype], qname)) {
-				ext4_msg(sb, KERN_ERR,
-					"%s quota file already "
-					"specified", QTYPE2NAME(qtype));
-				kfree(qname);
+			if (!set_qf_name(sb, USRQUOTA, &args[0]))
 				return 0;
-			}
-			sbi->s_qf_names[qtype] = qname;
-			if (strchr(sbi->s_qf_names[qtype], '/')) {
-				ext4_msg(sb, KERN_ERR,
-					"quotafile must be on "
-					"filesystem root");
-				kfree(sbi->s_qf_names[qtype]);
-				sbi->s_qf_names[qtype] = NULL;
+			break;
+		case Opt_grpjquota:
+			if (!set_qf_name(sb, GRPQUOTA, &args[0]))
 				return 0;
-			}
-			set_opt(sbi->s_mount_opt, QUOTA);
 			break;
 		case Opt_offusrjquota:
-			qtype = USRQUOTA;
-			goto clear_qf_name;
+			if (!clear_qf_name(sb, USRQUOTA))
+				return 0;
+			break;
 		case Opt_offgrpjquota:
-			qtype = GRPQUOTA;
-clear_qf_name:
-			if (sb_any_quota_loaded(sb) &&
-			    sbi->s_qf_names[qtype]) {
-				ext4_msg(sb, KERN_ERR, "Cannot change "
-					"journaled quota options when "
-					"quota turned on");
+			if (!clear_qf_name(sb, GRPQUOTA))
 				return 0;
-			}
-			/*
-			 * The space will be released later when all options
-			 * are confirmed to be correct
-			 */
-			sbi->s_qf_names[qtype] = NULL;
 			break;
+
 		case Opt_jqfmt_vfsold:
 			qfmt = QFMT_VFS_OLD;
 			goto set_qf_format;
@@ -1518,10 +1600,11 @@ set_qf_format:
 			clear_opt(sbi->s_mount_opt, BARRIER);
 			break;
 		case Opt_barrier:
-			if (match_int(&args[0], &option)) {
-				set_opt(sbi->s_mount_opt, BARRIER);
-				break;
-			}
+			if (args[0].from) {
+				if (match_int(&args[0], &option))
+					return 0;
+			} else
+				option = 1;	/* No argument, default to 1 */
 			if (option)
 				set_opt(sbi->s_mount_opt, BARRIER);
 			else
@@ -1594,10 +1677,11 @@ set_qf_format:
 			set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC);
 			break;
 		case Opt_auto_da_alloc:
-			if (match_int(&args[0], &option)) {
-				clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC);
-				break;
-			}
+			if (args[0].from) {
+				if (match_int(&args[0], &option))
+					return 0;
+			} else
+				option = 1;	/* No argument, default to 1 */
 			if (option)
 				clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC);
 			else
@@ -1609,6 +1693,12 @@ set_qf_format:
 		case Opt_nodiscard:
 			clear_opt(sbi->s_mount_opt, DISCARD);
 			break;
+		case Opt_dioread_nolock:
+			set_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
+			break;
+		case Opt_dioread_lock:
+			clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
+			break;
 		default:
 			ext4_msg(sb, KERN_ERR,
 			       "Unrecognized mount option \"%s\" "
@@ -1618,18 +1708,13 @@ set_qf_format:
 	}
 #ifdef CONFIG_QUOTA
 	if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
-		if ((sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA) &&
-		     sbi->s_qf_names[USRQUOTA])
+		if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA])
 			clear_opt(sbi->s_mount_opt, USRQUOTA);
 
-		if ((sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA) &&
-		     sbi->s_qf_names[GRPQUOTA])
+		if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA])
 			clear_opt(sbi->s_mount_opt, GRPQUOTA);
 
-		if ((sbi->s_qf_names[USRQUOTA] &&
-				(sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA)) ||
-		    (sbi->s_qf_names[GRPQUOTA] &&
-				(sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA))) {
+		if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) {
 			ext4_msg(sb, KERN_ERR, "old and new quota "
 					"format mixing");
 			return 0;
@@ -1939,7 +2024,7 @@ static void ext4_orphan_cleanup(struct super_block *sb,
 		}
 
 		list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
-		vfs_dq_init(inode);
+		dquot_initialize(inode);
 		if (inode->i_nlink) {
 			ext4_msg(sb, KERN_DEBUG,
 				"%s: truncating inode %lu to %lld bytes",
@@ -1971,7 +2056,7 @@ static void ext4_orphan_cleanup(struct super_block *sb,
 	/* Turn quotas off */
 	for (i = 0; i < MAXQUOTAS; i++) {
 		if (sb_dqopt(sb)->files[i])
-			vfs_quota_off(sb, i, 0);
+			dquot_quota_off(sb, i);
 	}
 #endif
 	sb->s_flags = s_flags; /* Restore MS_RDONLY status */
@@ -2133,7 +2218,7 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
 struct ext4_attr {
 	struct attribute attr;
 	ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *);
-	ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *, 
+	ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *,
 			 const char *, size_t);
 	int offset;
 };
@@ -2292,7 +2377,7 @@ static void ext4_sb_release(struct kobject *kobj)
 }
 
 
-static struct sysfs_ops ext4_attr_ops = {
+static const struct sysfs_ops ext4_attr_ops = {
 	.show	= ext4_attr_show,
 	.store	= ext4_attr_store,
 };
@@ -2350,6 +2435,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 				__releases(kernel_lock)
 				__acquires(kernel_lock)
 {
+	char *orig_data = kstrdup(data, GFP_KERNEL);
 	struct buffer_head *bh;
 	struct ext4_super_block *es = NULL;
 	struct ext4_sb_info *sbi;
@@ -2432,8 +2518,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
 	if (def_mount_opts & EXT4_DEFM_DEBUG)
 		set_opt(sbi->s_mount_opt, DEBUG);
-	if (def_mount_opts & EXT4_DEFM_BSDGROUPS)
+	if (def_mount_opts & EXT4_DEFM_BSDGROUPS) {
+		ext4_msg(sb, KERN_WARNING, deprecated_msg, "bsdgroups",
+			"2.6.38");
 		set_opt(sbi->s_mount_opt, GRPID);
+	}
 	if (def_mount_opts & EXT4_DEFM_UID16)
 		set_opt(sbi->s_mount_opt, NO_UID32);
 #ifdef CONFIG_EXT4_FS_XATTR
@@ -2445,11 +2534,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		set_opt(sbi->s_mount_opt, POSIX_ACL);
 #endif
 	if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA)
-		sbi->s_mount_opt |= EXT4_MOUNT_JOURNAL_DATA;
+		set_opt(sbi->s_mount_opt, JOURNAL_DATA);
 	else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED)
-		sbi->s_mount_opt |= EXT4_MOUNT_ORDERED_DATA;
+		set_opt(sbi->s_mount_opt, ORDERED_DATA);
 	else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK)
-		sbi->s_mount_opt |= EXT4_MOUNT_WRITEBACK_DATA;
+		set_opt(sbi->s_mount_opt, WRITEBACK_DATA);
 
 	if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC)
 		set_opt(sbi->s_mount_opt, ERRORS_PANIC);
@@ -2470,14 +2559,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	 * enable delayed allocation by default
 	 * Use -o nodelalloc to turn it off
 	 */
-	set_opt(sbi->s_mount_opt, DELALLOC);
+	if (!IS_EXT3_SB(sb))
+		set_opt(sbi->s_mount_opt, DELALLOC);
 
 	if (!parse_options((char *) data, sb, &journal_devnum,
 			   &journal_ioprio, NULL, 0))
 		goto failed_mount;
 
 	sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
-		((sbi->s_mount_opt & EXT4_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
+		(test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
 
 	if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV &&
 	    (EXT4_HAS_COMPAT_FEATURE(sb, ~0U) ||
@@ -2709,24 +2799,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	get_random_bytes(&sbi->s_next_generation, sizeof(u32));
 	spin_lock_init(&sbi->s_next_gen_lock);
 
-	err = percpu_counter_init(&sbi->s_freeblocks_counter,
-			ext4_count_free_blocks(sb));
-	if (!err) {
-		err = percpu_counter_init(&sbi->s_freeinodes_counter,
-				ext4_count_free_inodes(sb));
-	}
-	if (!err) {
-		err = percpu_counter_init(&sbi->s_dirs_counter,
-				ext4_count_dirs(sb));
-	}
-	if (!err) {
-		err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
-	}
-	if (err) {
-		ext4_msg(sb, KERN_ERR, "insufficient memory");
-		goto failed_mount3;
-	}
-
 	sbi->s_stripe = ext4_get_stripe_size(sbi);
 	sbi->s_max_writeback_mb_bump = 128;
 
@@ -2766,7 +2838,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	      EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
 		ext4_msg(sb, KERN_ERR, "required journal recovery "
 		       "suppressed and not mounted read-only");
-		goto failed_mount4;
+		goto failed_mount_wq;
 	} else {
 		clear_opt(sbi->s_mount_opt, DATA_FLAGS);
 		set_opt(sbi->s_mount_opt, WRITEBACK_DATA);
@@ -2779,7 +2851,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	    !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
 				       JBD2_FEATURE_INCOMPAT_64BIT)) {
 		ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature");
-		goto failed_mount4;
+		goto failed_mount_wq;
 	}
 
 	if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
@@ -2818,7 +2890,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		    (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
 			ext4_msg(sb, KERN_ERR, "Journal does not support "
 			       "requested data journaling mode");
-			goto failed_mount4;
+			goto failed_mount_wq;
 		}
 	default:
 		break;
@@ -2826,13 +2898,31 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
 
 no_journal:
-
+	err = percpu_counter_init(&sbi->s_freeblocks_counter,
+				  ext4_count_free_blocks(sb));
+	if (!err)
+		err = percpu_counter_init(&sbi->s_freeinodes_counter,
+					  ext4_count_free_inodes(sb));
+	if (!err)
+		err = percpu_counter_init(&sbi->s_dirs_counter,
+					  ext4_count_dirs(sb));
+	if (!err)
+		err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
+	if (err) {
+		ext4_msg(sb, KERN_ERR, "insufficient memory");
+		goto failed_mount_wq;
+	}
 	if (test_opt(sb, NOBH)) {
 		if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) {
 			ext4_msg(sb, KERN_WARNING, "Ignoring nobh option - "
 				"its supported only with writeback mode");
 			clear_opt(sbi->s_mount_opt, NOBH);
 		}
+		if (test_opt(sb, DIOREAD_NOLOCK)) {
+			ext4_msg(sb, KERN_WARNING, "dioread_nolock option is "
+				"not supported with nobh mode");
+			goto failed_mount_wq;
+		}
 	}
 	EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten");
 	if (!EXT4_SB(sb)->dio_unwritten_wq) {
@@ -2897,11 +2987,23 @@ no_journal:
 			 "requested data journaling mode");
 		clear_opt(sbi->s_mount_opt, DELALLOC);
 	}
+	if (test_opt(sb, DIOREAD_NOLOCK)) {
+		if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
+			ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock "
+				"option - requested data journaling mode");
+			clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
+		}
+		if (sb->s_blocksize < PAGE_SIZE) {
+			ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock "
+				"option - block size is too small");
+			clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
+		}
+	}
 
 	err = ext4_setup_system_zone(sb);
 	if (err) {
 		ext4_msg(sb, KERN_ERR, "failed to initialize system "
-			 "zone (%d)\n", err);
+			 "zone (%d)", err);
 		goto failed_mount4;
 	}
 
@@ -2940,9 +3042,11 @@ no_journal:
 	} else
 		descr = "out journal";
 
-	ext4_msg(sb, KERN_INFO, "mounted filesystem with%s", descr);
+	ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. "
+		"Opts: %s", descr, orig_data);
 
 	lock_kernel();
+	kfree(orig_data);
 	return 0;
 
 cantfind_ext4:
@@ -2959,6 +3063,10 @@ failed_mount_wq:
 		jbd2_journal_destroy(sbi->s_journal);
 		sbi->s_journal = NULL;
 	}
+	percpu_counter_destroy(&sbi->s_freeblocks_counter);
+	percpu_counter_destroy(&sbi->s_freeinodes_counter);
+	percpu_counter_destroy(&sbi->s_dirs_counter);
+	percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
 failed_mount3:
 	if (sbi->s_flex_groups) {
 		if (is_vmalloc_addr(sbi->s_flex_groups))
@@ -2966,10 +3074,6 @@ failed_mount3:
 		else
 			kfree(sbi->s_flex_groups);
 	}
-	percpu_counter_destroy(&sbi->s_freeblocks_counter);
-	percpu_counter_destroy(&sbi->s_freeinodes_counter);
-	percpu_counter_destroy(&sbi->s_dirs_counter);
-	percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
 failed_mount2:
 	for (i = 0; i < db_count; i++)
 		brelse(sbi->s_group_desc[i]);
@@ -2989,6 +3093,7 @@ out_fail:
 	kfree(sbi->s_blockgroup_lock);
 	kfree(sbi);
 	lock_kernel();
+	kfree(orig_data);
 	return ret;
 }
 
@@ -3280,7 +3385,7 @@ static int ext4_commit_super(struct super_block *sb, int sync)
 	if (!(sb->s_flags & MS_RDONLY))
 		es->s_wtime = cpu_to_le32(get_seconds());
 	es->s_kbytes_written =
-		cpu_to_le64(EXT4_SB(sb)->s_kbytes_written + 
+		cpu_to_le64(EXT4_SB(sb)->s_kbytes_written +
 			    ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
 			      EXT4_SB(sb)->s_sectors_written_start) >> 1));
 	ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
@@ -3360,10 +3465,9 @@ static void ext4_clear_journal_err(struct super_block *sb,
 		char nbuf[16];
 
 		errstr = ext4_decode_error(sb, j_errno, nbuf);
-		ext4_warning(sb, __func__, "Filesystem error recorded "
+		ext4_warning(sb, "Filesystem error recorded "
 			     "from previous mount: %s", errstr);
-		ext4_warning(sb, __func__, "Marking fs in need of "
-			     "filesystem check.");
+		ext4_warning(sb, "Marking fs in need of filesystem check.");
 
 		EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
 		es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
@@ -3386,8 +3490,10 @@ int ext4_force_commit(struct super_block *sb)
 		return 0;
 
 	journal = EXT4_SB(sb)->s_journal;
-	if (journal)
+	if (journal) {
+		vfs_check_frozen(sb, SB_FREEZE_WRITE);
 		ret = ext4_journal_force_commit(journal);
+	}
 
 	return ret;
 }
@@ -3436,18 +3542,16 @@ static int ext4_freeze(struct super_block *sb)
 	 * the journal.
 	 */
 	error = jbd2_journal_flush(journal);
-	if (error < 0) {
-	out:
-		jbd2_journal_unlock_updates(journal);
-		return error;
-	}
+	if (error < 0)
+		goto out;
 
 	/* Journal blocked and flushed, clear needs_recovery flag. */
 	EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
 	error = ext4_commit_super(sb, 1);
-	if (error)
-		goto out;
-	return 0;
+out:
+	/* we rely on s_frozen to stop further updates */
+	jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+	return error;
 }
 
 /*
@@ -3464,7 +3568,6 @@ static int ext4_unfreeze(struct super_block *sb)
 	EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
 	ext4_commit_super(sb, 1);
 	unlock_super(sb);
-	jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
 	return 0;
 }
 
@@ -3475,12 +3578,14 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 	ext4_fsblk_t n_blocks_count = 0;
 	unsigned long old_sb_flags;
 	struct ext4_mount_options old_opts;
+	int enable_quota = 0;
 	ext4_group_t g;
 	unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
 	int err;
 #ifdef CONFIG_QUOTA
 	int i;
 #endif
+	char *orig_data = kstrdup(data, GFP_KERNEL);
 
 	lock_kernel();
 
@@ -3514,7 +3619,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 		ext4_abort(sb, __func__, "Abort forced by user");
 
 	sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
-		((sbi->s_mount_opt & EXT4_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
+		(test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
 
 	es = sbi->s_es;
 
@@ -3531,6 +3636,10 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 		}
 
 		if (*flags & MS_RDONLY) {
+			err = dquot_suspend(sb, -1);
+			if (err < 0)
+				goto restore_opts;
+
 			/*
 			 * First of all, the unconditional stuff we have to do
 			 * to disable replay of the journal when we next remount
@@ -3599,6 +3708,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 				goto restore_opts;
 			if (!ext4_setup_super(sb, es, 0))
 				sb->s_flags &= ~MS_RDONLY;
+			enable_quota = 1;
 		}
 	}
 	ext4_setup_system_zone(sb);
@@ -3614,6 +3724,11 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 #endif
 	unlock_super(sb);
 	unlock_kernel();
+	if (enable_quota)
+		dquot_resume(sb, -1);
+
+	ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s", orig_data);
+	kfree(orig_data);
 	return 0;
 
 restore_opts:
@@ -3635,6 +3750,7 @@ restore_opts:
 #endif
 	unlock_super(sb);
 	unlock_kernel();
+	kfree(orig_data);
 	return err;
 }
 
@@ -3708,7 +3824,7 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
  * Process 1                         Process 2
  * ext4_create()                     quota_sync()
  *   jbd2_journal_start()                  write_dquot()
- *   vfs_dq_init()                         down(dqio_mutex)
+ *   dquot_initialize()                         down(dqio_mutex)
  *     down(dqio_mutex)                    jbd2_journal_start()
  *
  */
@@ -3807,24 +3923,21 @@ static int ext4_write_info(struct super_block *sb, int type)
  */
 static int ext4_quota_on_mount(struct super_block *sb, int type)
 {
-	return vfs_quota_on_mount(sb, EXT4_SB(sb)->s_qf_names[type],
-				  EXT4_SB(sb)->s_jquota_fmt, type);
+	return dquot_quota_on_mount(sb, EXT4_SB(sb)->s_qf_names[type],
+					EXT4_SB(sb)->s_jquota_fmt, type);
 }
 
 /*
  * Standard function to be called on quota_on
  */
 static int ext4_quota_on(struct super_block *sb, int type, int format_id,
-			 char *name, int remount)
+			 char *name)
 {
 	int err;
 	struct path path;
 
 	if (!test_opt(sb, QUOTA))
 		return -EINVAL;
-	/* When remounting, no checks are needed and in fact, name is NULL */
-	if (remount)
-		return vfs_quota_on(sb, type, format_id, name, remount);
 
 	err = kern_path(name, LOOKUP_FOLLOW, &path);
 	if (err)
@@ -3863,7 +3976,7 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
 		}
 	}
 
-	err = vfs_quota_on_path(sb, type, format_id, &path);
+	err = dquot_quota_on_path(sb, type, format_id, &path);
 	path_put(&path);
 	return err;
 }
@@ -3917,9 +4030,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
 	ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
 	int err = 0;
 	int offset = off & (sb->s_blocksize - 1);
-	int tocopy;
 	int journal_quota = EXT4_SB(sb)->s_qf_names[type] != NULL;
-	size_t towrite = len;
 	struct buffer_head *bh;
 	handle_t *handle = journal_current_handle();
 
@@ -3929,52 +4040,53 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
 			(unsigned long long)off, (unsigned long long)len);
 		return -EIO;
 	}
+	/*
+	 * Since we account only one data block in transaction credits,
+	 * then it is impossible to cross a block boundary.
+	 */
+	if (sb->s_blocksize - offset < len) {
+		ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)"
+			" cancelled because not block aligned",
+			(unsigned long long)off, (unsigned long long)len);
+		return -EIO;
+	}
+
 	mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
-	while (towrite > 0) {
-		tocopy = sb->s_blocksize - offset < towrite ?
-				sb->s_blocksize - offset : towrite;
-		bh = ext4_bread(handle, inode, blk, 1, &err);
-		if (!bh)
+	bh = ext4_bread(handle, inode, blk, 1, &err);
+	if (!bh)
+		goto out;
+	if (journal_quota) {
+		err = ext4_journal_get_write_access(handle, bh);
+		if (err) {
+			brelse(bh);
 			goto out;
-		if (journal_quota) {
-			err = ext4_journal_get_write_access(handle, bh);
-			if (err) {
-				brelse(bh);
-				goto out;
-			}
-		}
-		lock_buffer(bh);
-		memcpy(bh->b_data+offset, data, tocopy);
-		flush_dcache_page(bh->b_page);
-		unlock_buffer(bh);
-		if (journal_quota)
-			err = ext4_handle_dirty_metadata(handle, NULL, bh);
-		else {
-			/* Always do at least ordered writes for quotas */
-			err = ext4_jbd2_file_inode(handle, inode);
-			mark_buffer_dirty(bh);
 		}
-		brelse(bh);
-		if (err)
-			goto out;
-		offset = 0;
-		towrite -= tocopy;
-		data += tocopy;
-		blk++;
 	}
+	lock_buffer(bh);
+	memcpy(bh->b_data+offset, data, len);
+	flush_dcache_page(bh->b_page);
+	unlock_buffer(bh);
+	if (journal_quota)
+		err = ext4_handle_dirty_metadata(handle, NULL, bh);
+	else {
+		/* Always do at least ordered writes for quotas */
+		err = ext4_jbd2_file_inode(handle, inode);
+		mark_buffer_dirty(bh);
+	}
+	brelse(bh);
 out:
-	if (len == towrite) {
+	if (err) {
 		mutex_unlock(&inode->i_mutex);
 		return err;
 	}
-	if (inode->i_size < off+len-towrite) {
-		i_size_write(inode, off+len-towrite);
+	if (inode->i_size < off + len) {
+		i_size_write(inode, off + len);
 		EXT4_I(inode)->i_disksize = inode->i_size;
 	}
 	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 	ext4_mark_inode_dirty(handle, inode);
 	mutex_unlock(&inode->i_mutex);
-	return len - towrite;
+	return len;
 }
 
 #endif
@@ -3985,7 +4097,7 @@ static int ext4_get_sb(struct file_system_type *fs_type, int flags,
 	return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt);
 }
 
-#if !defined(CONTIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
+#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
 static struct file_system_type ext2_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "ext2",
@@ -4012,15 +4124,7 @@ static inline void register_as_ext2(void) { }
 static inline void unregister_as_ext2(void) { }
 #endif
 
-#if !defined(CONTIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
-static struct file_system_type ext3_fs_type = {
-	.owner		= THIS_MODULE,
-	.name		= "ext3",
-	.get_sb		= ext4_get_sb,
-	.kill_sb	= kill_block_super,
-	.fs_flags	= FS_REQUIRES_DEV,
-};
-
+#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
 static inline void register_as_ext3(void)
 {
 	int err = register_filesystem(&ext3_fs_type);
@@ -4051,6 +4155,7 @@ static int __init init_ext4_fs(void)
 {
 	int err;
 
+	ext4_check_flag_values();
 	err = init_ext4_system_zone();
 	if (err)
 		return err;
diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
index 00740cb32be3..ed9354aff279 100644
--- a/fs/ext4/symlink.c
+++ b/fs/ext4/symlink.c
@@ -34,6 +34,7 @@ const struct inode_operations ext4_symlink_inode_operations = {
 	.readlink	= generic_readlink,
 	.follow_link	= page_follow_link_light,
 	.put_link	= page_put_link,
+	.setattr	= ext4_setattr,
 #ifdef CONFIG_EXT4_FS_XATTR
 	.setxattr	= generic_setxattr,
 	.getxattr	= generic_getxattr,
@@ -45,6 +46,7 @@ const struct inode_operations ext4_symlink_inode_operations = {
 const struct inode_operations ext4_fast_symlink_inode_operations = {
 	.readlink	= generic_readlink,
 	.follow_link	= ext4_follow_link,
+	.setattr	= ext4_setattr,
 #ifdef CONFIG_EXT4_FS_XATTR
 	.setxattr	= generic_setxattr,
 	.getxattr	= generic_getxattr,
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index f3a2f7ed45aa..04338009793a 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -97,7 +97,7 @@ static int ext4_xattr_list(struct dentry *dentry, char *buffer,
 
 static struct mb_cache *ext4_xattr_cache;
 
-static struct xattr_handler *ext4_xattr_handler_map[] = {
+static const struct xattr_handler *ext4_xattr_handler_map[] = {
 	[EXT4_XATTR_INDEX_USER]		     = &ext4_xattr_user_handler,
 #ifdef CONFIG_EXT4_FS_POSIX_ACL
 	[EXT4_XATTR_INDEX_POSIX_ACL_ACCESS]  = &ext4_xattr_acl_access_handler,
@@ -109,7 +109,7 @@ static struct xattr_handler *ext4_xattr_handler_map[] = {
 #endif
 };
 
-struct xattr_handler *ext4_xattr_handlers[] = {
+const struct xattr_handler *ext4_xattr_handlers[] = {
 	&ext4_xattr_user_handler,
 	&ext4_xattr_trusted_handler,
 #ifdef CONFIG_EXT4_FS_POSIX_ACL
@@ -122,10 +122,10 @@ struct xattr_handler *ext4_xattr_handlers[] = {
 	NULL
 };
 
-static inline struct xattr_handler *
+static inline const struct xattr_handler *
 ext4_xattr_handler(int name_index)
 {
-	struct xattr_handler *handler = NULL;
+	const struct xattr_handler *handler = NULL;
 
 	if (name_index > 0 && name_index < ARRAY_SIZE(ext4_xattr_handler_map))
 		handler = ext4_xattr_handler_map[name_index];
@@ -227,9 +227,9 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
 	ea_bdebug(bh, "b_count=%d, refcount=%d",
 		atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
 	if (ext4_xattr_check_block(bh)) {
-bad_block:	ext4_error(inode->i_sb, __func__,
-			   "inode %lu: bad block %llu", inode->i_ino,
-			   EXT4_I(inode)->i_file_acl);
+bad_block:
+		EXT4_ERROR_INODE(inode, "bad block %llu",
+				 EXT4_I(inode)->i_file_acl);
 		error = -EIO;
 		goto cleanup;
 	}
@@ -267,7 +267,7 @@ ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name,
 	void *end;
 	int error;
 
-	if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR))
+	if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
 		return -ENODATA;
 	error = ext4_get_inode_loc(inode, &iloc);
 	if (error)
@@ -331,7 +331,7 @@ ext4_xattr_list_entries(struct dentry *dentry, struct ext4_xattr_entry *entry,
 	size_t rest = buffer_size;
 
 	for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) {
-		struct xattr_handler *handler =
+		const struct xattr_handler *handler =
 			ext4_xattr_handler(entry->e_name_index);
 
 		if (handler) {
@@ -371,9 +371,8 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
 	ea_bdebug(bh, "b_count=%d, refcount=%d",
 		atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
 	if (ext4_xattr_check_block(bh)) {
-		ext4_error(inode->i_sb, __func__,
-			   "inode %lu: bad block %llu", inode->i_ino,
-			   EXT4_I(inode)->i_file_acl);
+		EXT4_ERROR_INODE(inode, "bad block %llu",
+				 EXT4_I(inode)->i_file_acl);
 		error = -EIO;
 		goto cleanup;
 	}
@@ -396,7 +395,7 @@ ext4_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size)
 	void *end;
 	int error;
 
-	if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR))
+	if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
 		return 0;
 	error = ext4_get_inode_loc(inode, &iloc);
 	if (error)
@@ -494,7 +493,7 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
 		error = ext4_handle_dirty_metadata(handle, inode, bh);
 		if (IS_SYNC(inode))
 			ext4_handle_sync(handle);
-		vfs_dq_free_block(inode, 1);
+		dquot_free_block(inode, 1);
 		ea_bdebug(bh, "refcount now=%d; releasing",
 			  le32_to_cpu(BHDR(bh)->h_refcount));
 		if (ce)
@@ -665,9 +664,8 @@ ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i,
 			atomic_read(&(bs->bh->b_count)),
 			le32_to_cpu(BHDR(bs->bh)->h_refcount));
 		if (ext4_xattr_check_block(bs->bh)) {
-			ext4_error(sb, __func__,
-				"inode %lu: bad block %llu", inode->i_ino,
-				EXT4_I(inode)->i_file_acl);
+			EXT4_ERROR_INODE(inode, "bad block %llu",
+					 EXT4_I(inode)->i_file_acl);
 			error = -EIO;
 			goto cleanup;
 		}
@@ -787,8 +785,8 @@ inserted:
 			else {
 				/* The old block is released after updating
 				   the inode. */
-				error = -EDQUOT;
-				if (vfs_dq_alloc_block(inode, 1))
+				error = dquot_alloc_block(inode, 1);
+				if (error)
 					goto cleanup;
 				error = ext4_journal_get_write_access(handle,
 								      new_bh);
@@ -820,7 +818,7 @@ inserted:
 						EXT4_I(inode)->i_block_group);
 
 			/* non-extent files can't have physical blocks past 2^32 */
-			if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+			if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
 				goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
 
 			block = ext4_new_meta_blocks(handle, inode,
@@ -828,7 +826,7 @@ inserted:
 			if (error)
 				goto cleanup;
 
-			if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+			if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
 				BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS);
 
 			ea_idebug(inode, "creating block %d", block);
@@ -876,13 +874,12 @@ cleanup:
 	return error;
 
 cleanup_dquot:
-	vfs_dq_free_block(inode, 1);
+	dquot_free_block(inode, 1);
 	goto cleanup;
 
 bad_block:
-	ext4_error(inode->i_sb, __func__,
-		   "inode %lu: bad block %llu", inode->i_ino,
-		   EXT4_I(inode)->i_file_acl);
+	EXT4_ERROR_INODE(inode, "bad block %llu",
+			 EXT4_I(inode)->i_file_acl);
 	goto cleanup;
 
 #undef header
@@ -908,7 +905,7 @@ ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
 	is->s.base = is->s.first = IFIRST(header);
 	is->s.here = is->s.first;
 	is->s.end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
-	if (EXT4_I(inode)->i_state & EXT4_STATE_XATTR) {
+	if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
 		error = ext4_xattr_check_names(IFIRST(header), is->s.end);
 		if (error)
 			return error;
@@ -940,10 +937,10 @@ ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
 	header = IHDR(inode, ext4_raw_inode(&is->iloc));
 	if (!IS_LAST_ENTRY(s->first)) {
 		header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC);
-		EXT4_I(inode)->i_state |= EXT4_STATE_XATTR;
+		ext4_set_inode_state(inode, EXT4_STATE_XATTR);
 	} else {
 		header->h_magic = cpu_to_le32(0);
-		EXT4_I(inode)->i_state &= ~EXT4_STATE_XATTR;
+		ext4_clear_inode_state(inode, EXT4_STATE_XATTR);
 	}
 	return 0;
 }
@@ -986,8 +983,8 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
 	if (strlen(name) > 255)
 		return -ERANGE;
 	down_write(&EXT4_I(inode)->xattr_sem);
-	no_expand = EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND;
-	EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND;
+	no_expand = ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND);
+	ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND);
 
 	error = ext4_get_inode_loc(inode, &is.iloc);
 	if (error)
@@ -997,10 +994,10 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
 	if (error)
 		goto cleanup;
 
-	if (EXT4_I(inode)->i_state & EXT4_STATE_NEW) {
+	if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) {
 		struct ext4_inode *raw_inode = ext4_raw_inode(&is.iloc);
 		memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
-		EXT4_I(inode)->i_state &= ~EXT4_STATE_NEW;
+		ext4_clear_inode_state(inode, EXT4_STATE_NEW);
 	}
 
 	error = ext4_xattr_ibody_find(inode, &i, &is);
@@ -1052,7 +1049,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
 		ext4_xattr_update_super_block(handle, inode->i_sb);
 		inode->i_ctime = ext4_current_time(inode);
 		if (!value)
-			EXT4_I(inode)->i_state &= ~EXT4_STATE_NO_EXPAND;
+			ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND);
 		error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
 		/*
 		 * The bh is consumed by ext4_mark_iloc_dirty, even with
@@ -1067,7 +1064,7 @@ cleanup:
 	brelse(is.iloc.bh);
 	brelse(bs.bh);
 	if (no_expand == 0)
-		EXT4_I(inode)->i_state &= ~EXT4_STATE_NO_EXPAND;
+		ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND);
 	up_write(&EXT4_I(inode)->xattr_sem);
 	return error;
 }
@@ -1195,9 +1192,8 @@ retry:
 		if (!bh)
 			goto cleanup;
 		if (ext4_xattr_check_block(bh)) {
-			ext4_error(inode->i_sb, __func__,
-				"inode %lu: bad block %llu", inode->i_ino,
-				EXT4_I(inode)->i_file_acl);
+			EXT4_ERROR_INODE(inode, "bad block %llu",
+					 EXT4_I(inode)->i_file_acl);
 			error = -EIO;
 			goto cleanup;
 		}
@@ -1302,6 +1298,8 @@ retry:
 
 		/* Remove the chosen entry from the inode */
 		error = ext4_xattr_ibody_set(handle, inode, &i, is);
+		if (error)
+			goto cleanup;
 
 		entry = IFIRST(header);
 		if (entry_size + EXT4_XATTR_SIZE(size) >= new_extra_isize)
@@ -1372,16 +1370,14 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode)
 		goto cleanup;
 	bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
 	if (!bh) {
-		ext4_error(inode->i_sb, __func__,
-			"inode %lu: block %llu read error", inode->i_ino,
-			EXT4_I(inode)->i_file_acl);
+		EXT4_ERROR_INODE(inode, "block %llu read error",
+				 EXT4_I(inode)->i_file_acl);
 		goto cleanup;
 	}
 	if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
 	    BHDR(bh)->h_blocks != cpu_to_le32(1)) {
-		ext4_error(inode->i_sb, __func__,
-			"inode %lu: bad block %llu", inode->i_ino,
-			EXT4_I(inode)->i_file_acl);
+		EXT4_ERROR_INODE(inode, "bad block %llu",
+				 EXT4_I(inode)->i_file_acl);
 		goto cleanup;
 	}
 	ext4_xattr_release_block(handle, inode, bh);
@@ -1506,9 +1502,8 @@ again:
 		}
 		bh = sb_bread(inode->i_sb, ce->e_block);
 		if (!bh) {
-			ext4_error(inode->i_sb, __func__,
-				"inode %lu: block %lu read error",
-				inode->i_ino, (unsigned long) ce->e_block);
+			EXT4_ERROR_INODE(inode, "block %lu read error",
+					 (unsigned long) ce->e_block);
 		} else if (le32_to_cpu(BHDR(bh)->h_refcount) >=
 				EXT4_XATTR_REFCOUNT_MAX) {
 			ea_idebug(inode, "block %lu refcount %d>=%d",
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 8ede88b18c29..518e96e43905 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -65,11 +65,11 @@ struct ext4_xattr_entry {
 
 # ifdef CONFIG_EXT4_FS_XATTR
 
-extern struct xattr_handler ext4_xattr_user_handler;
-extern struct xattr_handler ext4_xattr_trusted_handler;
-extern struct xattr_handler ext4_xattr_acl_access_handler;
-extern struct xattr_handler ext4_xattr_acl_default_handler;
-extern struct xattr_handler ext4_xattr_security_handler;
+extern const struct xattr_handler ext4_xattr_user_handler;
+extern const struct xattr_handler ext4_xattr_trusted_handler;
+extern const struct xattr_handler ext4_xattr_acl_access_handler;
+extern const struct xattr_handler ext4_xattr_acl_default_handler;
+extern const struct xattr_handler ext4_xattr_security_handler;
 
 extern ssize_t ext4_listxattr(struct dentry *, char *, size_t);
 
@@ -86,7 +86,7 @@ extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
 extern int init_ext4_xattr(void);
 extern void exit_ext4_xattr(void);
 
-extern struct xattr_handler *ext4_xattr_handlers[];
+extern const struct xattr_handler *ext4_xattr_handlers[];
 
 # else  /* CONFIG_EXT4_FS_XATTR */
 
diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c
index 983c253999a7..9b21268e121c 100644
--- a/fs/ext4/xattr_security.c
+++ b/fs/ext4/xattr_security.c
@@ -7,6 +7,7 @@
 #include <linux/string.h>
 #include <linux/fs.h>
 #include <linux/security.h>
+#include <linux/slab.h>
 #include "ext4_jbd2.h"
 #include "ext4.h"
 #include "xattr.h"
@@ -68,7 +69,7 @@ ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir)
 	return err;
 }
 
-struct xattr_handler ext4_xattr_security_handler = {
+const struct xattr_handler ext4_xattr_security_handler = {
 	.prefix	= XATTR_SECURITY_PREFIX,
 	.list	= ext4_xattr_security_list,
 	.get	= ext4_xattr_security_get,
diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c
index 15b50edc6587..37e6ebca2cc3 100644
--- a/fs/ext4/xattr_trusted.c
+++ b/fs/ext4/xattr_trusted.c
@@ -51,7 +51,7 @@ ext4_xattr_trusted_set(struct dentry *dentry, const char *name,
 			      name, value, size, flags);
 }
 
-struct xattr_handler ext4_xattr_trusted_handler = {
+const struct xattr_handler ext4_xattr_trusted_handler = {
 	.prefix	= XATTR_TRUSTED_PREFIX,
 	.list	= ext4_xattr_trusted_list,
 	.get	= ext4_xattr_trusted_get,
diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c
index c4ce05746ce1..98c375352d0e 100644
--- a/fs/ext4/xattr_user.c
+++ b/fs/ext4/xattr_user.c
@@ -54,7 +54,7 @@ ext4_xattr_user_set(struct dentry *dentry, const char *name,
 			      name, value, size, flags);
 }
 
-struct xattr_handler ext4_xattr_user_handler = {
+const struct xattr_handler ext4_xattr_user_handler = {
 	.prefix	= XATTR_USER_PREFIX,
 	.list	= ext4_xattr_user_list,
 	.get	= ext4_xattr_user_get,
diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index 923990e4f16e..ae8200f84e39 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -9,6 +9,7 @@
  */
 
 #include <linux/fs.h>
+#include <linux/slab.h>
 #include <linux/buffer_head.h>
 #include "fat.h"
 
@@ -241,9 +242,10 @@ int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus)
 	while (*fclus < cluster) {
 		/* prevent the infinite loop of cluster chain */
 		if (*fclus > limit) {
-			fat_fs_error(sb, "%s: detected the cluster chain loop"
-				     " (i_pos %lld)", __func__,
-				     MSDOS_I(inode)->i_pos);
+			fat_fs_error_ratelimit(sb,
+					"%s: detected the cluster chain loop"
+					" (i_pos %lld)", __func__,
+					MSDOS_I(inode)->i_pos);
 			nr = -EIO;
 			goto out;
 		}
@@ -252,9 +254,9 @@ int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus)
 		if (nr < 0)
 			goto out;
 		else if (nr == FAT_ENT_FREE) {
-			fat_fs_error(sb, "%s: invalid cluster chain"
-				     " (i_pos %lld)", __func__,
-				     MSDOS_I(inode)->i_pos);
+			fat_fs_error_ratelimit(sb, "%s: invalid cluster chain"
+					       " (i_pos %lld)", __func__,
+					       MSDOS_I(inode)->i_pos);
 			nr = -EIO;
 			goto out;
 		} else if (nr == FAT_ENT_EOF) {
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 530b4ca01510..ee42b9e0b16a 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -19,6 +19,7 @@
 #include <linux/buffer_head.h>
 #include <linux/compat.h>
 #include <asm/uaccess.h>
+#include <linux/kernel.h>
 #include "fat.h"
 
 /*
@@ -140,28 +141,22 @@ static int uni16_to_x8(unsigned char *ascii, const wchar_t *uni, int len,
 {
 	const wchar_t *ip;
 	wchar_t ec;
-	unsigned char *op, nc;
+	unsigned char *op;
 	int charlen;
-	int k;
 
 	ip = uni;
 	op = ascii;
 
 	while (*ip && ((len - NLS_MAX_CHARSET_SIZE) > 0)) {
 		ec = *ip++;
-		if ( (charlen = nls->uni2char(ec, op, NLS_MAX_CHARSET_SIZE)) > 0) {
+		if ((charlen = nls->uni2char(ec, op, NLS_MAX_CHARSET_SIZE)) > 0) {
 			op += charlen;
 			len -= charlen;
 		} else {
 			if (uni_xlate == 1) {
-				*op = ':';
-				for (k = 4; k > 0; k--) {
-					nc = ec & 0xF;
-					op[k] = nc > 9	? nc + ('a' - 10)
-							: nc + '0';
-					ec >>= 4;
-				}
-				op += 5;
+				*op++ = ':';
+				op = pack_hex_byte(op, ec >> 8);
+				op = pack_hex_byte(op, ec);
 				len -= 5;
 			} else {
 				*op++ = '?';
@@ -758,9 +753,10 @@ static int fat_ioctl_readdir(struct inode *inode, struct file *filp,
 	return ret;
 }
 
-static int fat_dir_ioctl(struct inode *inode, struct file *filp,
-			 unsigned int cmd, unsigned long arg)
+static long fat_dir_ioctl(struct file *filp, unsigned int cmd,
+			  unsigned long arg)
 {
+	struct inode *inode = filp->f_path.dentry->d_inode;
 	struct __fat_dirent __user *d1 = (struct __fat_dirent __user *)arg;
 	int short_only, both;
 
@@ -774,7 +770,7 @@ static int fat_dir_ioctl(struct inode *inode, struct file *filp,
 		both = 1;
 		break;
 	default:
-		return fat_generic_ioctl(inode, filp, cmd, arg);
+		return fat_generic_ioctl(filp, cmd, arg);
 	}
 
 	if (!access_ok(VERIFY_WRITE, d1, sizeof(struct __fat_dirent[2])))
@@ -814,7 +810,7 @@ static long fat_compat_dir_ioctl(struct file *filp, unsigned cmd,
 		both = 1;
 		break;
 	default:
-		return -ENOIOCTLCMD;
+		return fat_generic_ioctl(filp, cmd, (unsigned long)arg);
 	}
 
 	if (!access_ok(VERIFY_WRITE, d1, sizeof(struct compat_dirent[2])))
@@ -836,7 +832,7 @@ const struct file_operations fat_dir_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
 	.readdir	= fat_readdir,
-	.ioctl		= fat_dir_ioctl,
+	.unlocked_ioctl	= fat_dir_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= fat_compat_dir_ioctl,
 #endif
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index e6efdfa0f6db..27ac25725954 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -6,6 +6,7 @@
 #include <linux/nls.h>
 #include <linux/fs.h>
 #include <linux/mutex.h>
+#include <linux/ratelimit.h>
 #include <linux/msdos_fs.h>
 
 /*
@@ -82,6 +83,8 @@ struct msdos_sb_info {
 	struct fatent_operations *fatent_ops;
 	struct inode *fat_inode;
 
+	struct ratelimit_state ratelimit;
+
 	spinlock_t inode_hash_lock;
 	struct hlist_head inode_hashtable[FAT_HASH_SIZE];
 };
@@ -298,16 +301,16 @@ extern int fat_free_clusters(struct inode *inode, int cluster);
 extern int fat_count_free_clusters(struct super_block *sb);
 
 /* fat/file.c */
-extern int fat_generic_ioctl(struct inode *inode, struct file *filp,
-			     unsigned int cmd, unsigned long arg);
+extern long fat_generic_ioctl(struct file *filp, unsigned int cmd,
+			      unsigned long arg);
 extern const struct file_operations fat_file_operations;
 extern const struct inode_operations fat_file_inode_operations;
 extern int fat_setattr(struct dentry * dentry, struct iattr * attr);
-extern void fat_truncate(struct inode *inode);
+extern int fat_setsize(struct inode *inode, loff_t offset);
+extern void fat_truncate_blocks(struct inode *inode, loff_t offset);
 extern int fat_getattr(struct vfsmount *mnt, struct dentry *dentry,
 		       struct kstat *stat);
-extern int fat_file_fsync(struct file *file, struct dentry *dentry,
-			  int datasync);
+extern int fat_file_fsync(struct file *file, int datasync);
 
 /* fat/inode.c */
 extern void fat_attach(struct inode *inode, loff_t i_pos);
@@ -322,8 +325,13 @@ extern int fat_fill_super(struct super_block *sb, void *data, int silent,
 extern int fat_flush_inodes(struct super_block *sb, struct inode *i1,
 		            struct inode *i2);
 /* fat/misc.c */
-extern void fat_fs_error(struct super_block *s, const char *fmt, ...)
-	__attribute__ ((format (printf, 2, 3))) __cold;
+extern void
+__fat_fs_error(struct super_block *s, int report, const char *fmt, ...)
+	__attribute__ ((format (printf, 3, 4))) __cold;
+#define fat_fs_error(s, fmt, args...)		\
+	__fat_fs_error(s, 1, fmt , ## args)
+#define fat_fs_error_ratelimit(s, fmt, args...) \
+	__fat_fs_error(s, __ratelimit(&MSDOS_SB(s)->ratelimit), fmt , ## args)
 extern int fat_clusters_flush(struct super_block *sb);
 extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster);
 extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts,
diff --git a/fs/fat/file.c b/fs/fat/file.c
index e8c159de236b..990dfae022e5 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -8,6 +8,7 @@
 
 #include <linux/capability.h>
 #include <linux/module.h>
+#include <linux/compat.h>
 #include <linux/mount.h>
 #include <linux/time.h>
 #include <linux/buffer_head.h>
@@ -114,9 +115,9 @@ out:
 	return err;
 }
 
-int fat_generic_ioctl(struct inode *inode, struct file *filp,
-		      unsigned int cmd, unsigned long arg)
+long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
+	struct inode *inode = filp->f_path.dentry->d_inode;
 	u32 __user *user_attr = (u32 __user *)arg;
 
 	switch (cmd) {
@@ -129,6 +130,15 @@ int fat_generic_ioctl(struct inode *inode, struct file *filp,
 	}
 }
 
+#ifdef CONFIG_COMPAT
+static long fat_generic_compat_ioctl(struct file *filp, unsigned int cmd,
+				      unsigned long arg)
+
+{
+	return fat_generic_ioctl(filp, cmd, (unsigned long)compat_ptr(arg));
+}
+#endif
+
 static int fat_file_release(struct inode *inode, struct file *filp)
 {
 	if ((filp->f_mode & FMODE_WRITE) &&
@@ -139,12 +149,12 @@ static int fat_file_release(struct inode *inode, struct file *filp)
 	return 0;
 }
 
-int fat_file_fsync(struct file *filp, struct dentry *dentry, int datasync)
+int fat_file_fsync(struct file *filp, int datasync)
 {
-	struct inode *inode = dentry->d_inode;
+	struct inode *inode = filp->f_mapping->host;
 	int res, err;
 
-	res = simple_fsync(filp, dentry, datasync);
+	res = generic_file_fsync(filp, datasync);
 	err = sync_mapping_buffers(MSDOS_SB(inode->i_sb)->fat_inode->i_mapping);
 
 	return res ? res : err;
@@ -159,7 +169,10 @@ const struct file_operations fat_file_operations = {
 	.aio_write	= generic_file_aio_write,
 	.mmap		= generic_file_mmap,
 	.release	= fat_file_release,
-	.ioctl		= fat_generic_ioctl,
+	.unlocked_ioctl	= fat_generic_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= fat_generic_compat_ioctl,
+#endif
 	.fsync		= fat_file_fsync,
 	.splice_read	= generic_file_splice_read,
 };
@@ -270,7 +283,7 @@ static int fat_free(struct inode *inode, int skip)
 	return fat_free_clusters(inode, free_start);
 }
 
-void fat_truncate(struct inode *inode)
+void fat_truncate_blocks(struct inode *inode, loff_t offset)
 {
 	struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
 	const unsigned int cluster_size = sbi->cluster_size;
@@ -280,10 +293,10 @@ void fat_truncate(struct inode *inode)
 	 * This protects against truncating a file bigger than it was then
 	 * trying to write into the hole.
 	 */
-	if (MSDOS_I(inode)->mmu_private > inode->i_size)
-		MSDOS_I(inode)->mmu_private = inode->i_size;
+	if (MSDOS_I(inode)->mmu_private > offset)
+		MSDOS_I(inode)->mmu_private = offset;
 
-	nr_clusters = (inode->i_size + (cluster_size - 1)) >> sbi->cluster_bits;
+	nr_clusters = (offset + (cluster_size - 1)) >> sbi->cluster_bits;
 
 	fat_free(inode, nr_clusters);
 	fat_flush_inodes(inode->i_sb, inode, NULL);
@@ -351,6 +364,18 @@ static int fat_allow_set_time(struct msdos_sb_info *sbi, struct inode *inode)
 	return 0;
 }
 
+int fat_setsize(struct inode *inode, loff_t offset)
+{
+	int error;
+
+	error = simple_setsize(inode, offset);
+	if (error)
+		return error;
+	fat_truncate_blocks(inode, offset);
+
+	return error;
+}
+
 #define TIMES_SET_FLAGS	(ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)
 /* valid file mode bits */
 #define FAT_VALID_MODE	(S_IFREG | S_IFDIR | S_IRWXUGO)
@@ -365,7 +390,8 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
 	/*
 	 * Expand the file. Since inode_setattr() updates ->i_size
 	 * before calling the ->truncate(), but FAT needs to fill the
-	 * hole before it.
+	 * hole before it. XXX: this is no longer true with new truncate
+	 * sequence.
 	 */
 	if (attr->ia_valid & ATTR_SIZE) {
 		if (attr->ia_size > inode->i_size) {
@@ -414,15 +440,20 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
 			attr->ia_valid &= ~ATTR_MODE;
 	}
 
-	if (attr->ia_valid)
-		error = inode_setattr(inode, attr);
+	if (attr->ia_valid & ATTR_SIZE) {
+		error = fat_setsize(inode, attr->ia_size);
+		if (error)
+			goto out;
+	}
+
+	generic_setattr(inode, attr);
+	mark_inode_dirty(inode);
 out:
 	return error;
 }
 EXPORT_SYMBOL_GPL(fat_setattr);
 
 const struct inode_operations fat_file_inode_operations = {
-	.truncate	= fat_truncate,
 	.setattr	= fat_setattr,
 	.getattr	= fat_getattr,
 };
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 14da530b05ca..7bf45aee56d7 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -142,14 +142,29 @@ static int fat_readpages(struct file *file, struct address_space *mapping,
 	return mpage_readpages(mapping, pages, nr_pages, fat_get_block);
 }
 
+static void fat_write_failed(struct address_space *mapping, loff_t to)
+{
+	struct inode *inode = mapping->host;
+
+	if (to > inode->i_size) {
+		truncate_pagecache(inode, to, inode->i_size);
+		fat_truncate_blocks(inode, inode->i_size);
+	}
+}
+
 static int fat_write_begin(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned flags,
 			struct page **pagep, void **fsdata)
 {
+	int err;
+
 	*pagep = NULL;
-	return cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
-				fat_get_block,
+	err = cont_write_begin_newtrunc(file, mapping, pos, len, flags,
+				pagep, fsdata, fat_get_block,
 				&MSDOS_I(mapping->host)->mmu_private);
+	if (err < 0)
+		fat_write_failed(mapping, pos + len);
+	return err;
 }
 
 static int fat_write_end(struct file *file, struct address_space *mapping,
@@ -159,6 +174,8 @@ static int fat_write_end(struct file *file, struct address_space *mapping,
 	struct inode *inode = mapping->host;
 	int err;
 	err = generic_write_end(file, mapping, pos, len, copied, pagep, fsdata);
+	if (err < len)
+		fat_write_failed(mapping, pos + len);
 	if (!(err < 0) && !(MSDOS_I(inode)->i_attrs & ATTR_ARCH)) {
 		inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
 		MSDOS_I(inode)->i_attrs |= ATTR_ARCH;
@@ -172,7 +189,9 @@ static ssize_t fat_direct_IO(int rw, struct kiocb *iocb,
 			     loff_t offset, unsigned long nr_segs)
 {
 	struct file *file = iocb->ki_filp;
-	struct inode *inode = file->f_mapping->host;
+	struct address_space *mapping = file->f_mapping;
+	struct inode *inode = mapping->host;
+	ssize_t ret;
 
 	if (rw == WRITE) {
 		/*
@@ -193,8 +212,12 @@ static ssize_t fat_direct_IO(int rw, struct kiocb *iocb,
 	 * FAT need to use the DIO_LOCKING for avoiding the race
 	 * condition of fat_get_block() and ->truncate().
 	 */
-	return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
-				  offset, nr_segs, fat_get_block, NULL);
+	ret = blockdev_direct_IO_newtrunc(rw, iocb, inode, inode->i_sb->s_bdev,
+				iov, offset, nr_segs, fat_get_block, NULL);
+	if (ret < 0 && (rw & WRITE))
+		fat_write_failed(mapping, offset + iov_length(iov, nr_segs));
+
+	return ret;
 }
 
 static sector_t _fat_bmap(struct address_space *mapping, sector_t block)
@@ -429,7 +452,7 @@ static void fat_delete_inode(struct inode *inode)
 {
 	truncate_inode_pages(&inode->i_data, 0);
 	inode->i_size = 0;
-	fat_truncate(inode);
+	fat_truncate_blocks(inode, 0);
 	clear_inode(inode);
 }
 
@@ -558,7 +581,7 @@ static int fat_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_bavail = sbi->free_clusters;
 	buf->f_fsid.val[0] = (u32)id;
 	buf->f_fsid.val[1] = (u32)(id >> 32);
-	buf->f_namelen = sbi->options.isvfat ? 260 : 12;
+	buf->f_namelen = sbi->options.isvfat ? FAT_LFN_LEN : 12;
 
 	return 0;
 }
@@ -577,7 +600,7 @@ static inline loff_t fat_i_pos_read(struct msdos_sb_info *sbi,
 	return i_pos;
 }
 
-static int fat_write_inode(struct inode *inode, int wait)
+static int __fat_write_inode(struct inode *inode, int wait)
 {
 	struct super_block *sb = inode->i_sb;
 	struct msdos_sb_info *sbi = MSDOS_SB(sb);
@@ -634,9 +657,14 @@ retry:
 	return err;
 }
 
+static int fat_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+	return __fat_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
+}
+
 int fat_sync_inode(struct inode *inode)
 {
-	return fat_write_inode(inode, 1);
+	return __fat_write_inode(inode, 1);
 }
 
 EXPORT_SYMBOL_GPL(fat_sync_inode);
@@ -1245,6 +1273,8 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
 	sb->s_op = &fat_sops;
 	sb->s_export_op = &fat_export_ops;
 	sbi->dir_ops = fs_dir_inode_ops;
+	ratelimit_state_init(&sbi->ratelimit, DEFAULT_RATELIMIT_INTERVAL,
+			     DEFAULT_RATELIMIT_BURST);
 
 	error = parse_options(data, isvfat, silent, &debug, &sbi->options);
 	if (error)
@@ -1492,10 +1522,8 @@ out_fail:
 		iput(fat_inode);
 	if (root_inode)
 		iput(root_inode);
-	if (sbi->nls_io)
-		unload_nls(sbi->nls_io);
-	if (sbi->nls_disk)
-		unload_nls(sbi->nls_disk);
+	unload_nls(sbi->nls_io);
+	unload_nls(sbi->nls_disk);
 	if (sbi->options.iocharset != fat_default_iocharset)
 		kfree(sbi->options.iocharset);
 	sb->s_fs_info = NULL;
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index d3da05f26465..1fa23f6ffba5 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -20,27 +20,29 @@
  * In case the file system is remounted read-only, it can be made writable
  * again by remounting it.
  */
-void fat_fs_error(struct super_block *s, const char *fmt, ...)
+void __fat_fs_error(struct super_block *s, int report, const char *fmt, ...)
 {
 	struct fat_mount_options *opts = &MSDOS_SB(s)->options;
 	va_list args;
 
-	printk(KERN_ERR "FAT: Filesystem error (dev %s)\n", s->s_id);
+	if (report) {
+		printk(KERN_ERR "FAT: Filesystem error (dev %s)\n", s->s_id);
 
-	printk(KERN_ERR "    ");
-	va_start(args, fmt);
-	vprintk(fmt, args);
-	va_end(args);
-	printk("\n");
+		printk(KERN_ERR "    ");
+		va_start(args, fmt);
+		vprintk(fmt, args);
+		va_end(args);
+		printk("\n");
+	}
 
 	if (opts->errors == FAT_ERRORS_PANIC)
-		panic("    FAT fs panic from previous error\n");
+		panic("FAT: fs panic from previous error\n");
 	else if (opts->errors == FAT_ERRORS_RO && !(s->s_flags & MS_RDONLY)) {
 		s->s_flags |= MS_RDONLY;
-		printk(KERN_ERR "    File system has been set read-only\n");
+		printk(KERN_ERR "FAT: Filesystem has been set read-only\n");
 	}
 }
-EXPORT_SYMBOL_GPL(fat_fs_error);
+EXPORT_SYMBOL_GPL(__fat_fs_error);
 
 /* Flushes the number of free clusters on FAT32 */
 /* XXX: Need to write one per FSINFO block.  Currently only writes 1 */
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index f565f24019b5..6fcc7e71fbaa 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -309,7 +309,7 @@ static int vfat_create_shortname(struct inode *dir, struct nls_table *nls,
 {
 	struct fat_mount_options *opts = &MSDOS_SB(dir->i_sb)->options;
 	wchar_t *ip, *ext_start, *end, *name_start;
-	unsigned char base[9], ext[4], buf[8], *p;
+	unsigned char base[9], ext[4], buf[5], *p;
 	unsigned char charbuf[NLS_MAX_CHARSET_SIZE];
 	int chl, chi;
 	int sz = 0, extlen, baselen, i, numtail_baselen, numtail2_baselen;
@@ -467,7 +467,7 @@ static int vfat_create_shortname(struct inode *dir, struct nls_table *nls,
 			return 0;
 	}
 
-	i = jiffies & 0xffff;
+	i = jiffies;
 	sz = (jiffies >> 16) & 0x7;
 	if (baselen > 2) {
 		baselen = numtail2_baselen;
@@ -476,7 +476,7 @@ static int vfat_create_shortname(struct inode *dir, struct nls_table *nls,
 	name_res[baselen + 4] = '~';
 	name_res[baselen + 5] = '1' + sz;
 	while (1) {
-		sprintf(buf, "%04X", i);
+		snprintf(buf, sizeof(buf), "%04X", i & 0xffff);
 		memcpy(&name_res[baselen], buf, 4);
 		if (vfat_find_form(dir, name_res) < 0)
 			break;
@@ -502,14 +502,14 @@ xlate_to_uni(const unsigned char *name, int len, unsigned char *outname,
 		*outlen = utf8s_to_utf16s(name, len, (wchar_t *)outname);
 		if (*outlen < 0)
 			return *outlen;
-		else if (*outlen > 255)
+		else if (*outlen > FAT_LFN_LEN)
 			return -ENAMETOOLONG;
 
 		op = &outname[*outlen * sizeof(wchar_t)];
 	} else {
 		if (nls) {
 			for (i = 0, ip = name, op = outname, *outlen = 0;
-			     i < len && *outlen <= 255;
+			     i < len && *outlen <= FAT_LFN_LEN;
 			     *outlen += 1)
 			{
 				if (escape && (*ip == ':')) {
@@ -549,7 +549,7 @@ xlate_to_uni(const unsigned char *name, int len, unsigned char *outname,
 				return -ENAMETOOLONG;
 		} else {
 			for (i = 0, ip = name, op = outname, *outlen = 0;
-			     i < len && *outlen <= 255;
+			     i < len && *outlen <= FAT_LFN_LEN;
 			     i++, *outlen += 1)
 			{
 				*op++ = *ip++;
@@ -701,6 +701,15 @@ static int vfat_find(struct inode *dir, struct qstr *qname,
 	return fat_search_long(dir, qname->name, len, sinfo);
 }
 
+/*
+ * (nfsd's) anonymous disconnected dentry?
+ * NOTE: !IS_ROOT() is not anonymous (I.e. d_splice_alias() did the job).
+ */
+static int vfat_d_anon_disconn(struct dentry *dentry)
+{
+	return IS_ROOT(dentry) && (dentry->d_flags & DCACHE_DISCONNECTED);
+}
+
 static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
 				  struct nameidata *nd)
 {
@@ -729,11 +738,11 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
 	}
 
 	alias = d_find_alias(inode);
-	if (alias && !(alias->d_flags & DCACHE_DISCONNECTED)) {
+	if (alias && !vfat_d_anon_disconn(alias)) {
 		/*
-		 * This inode has non DCACHE_DISCONNECTED dentry. This
-		 * means, the user did ->lookup() by an another name
-		 * (longname vs 8.3 alias of it) in past.
+		 * This inode has non anonymous-DCACHE_DISCONNECTED
+		 * dentry. This means, the user did ->lookup() by an
+		 * another name (longname vs 8.3 alias of it) in past.
 		 *
 		 * Switch to new one for reason of locality if possible.
 		 */
@@ -743,7 +752,9 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
 		iput(inode);
 		unlock_super(sb);
 		return alias;
-	}
+	} else
+		dput(alias);
+
 out:
 	unlock_super(sb);
 	dentry->d_op = sb->s_root->d_op;
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 5ef953e6f908..9d175d623aab 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -14,6 +14,7 @@
 #include <linux/dnotify.h>
 #include <linux/slab.h>
 #include <linux/module.h>
+#include <linux/pipe_fs_i.h>
 #include <linux/security.h>
 #include <linux/ptrace.h>
 #include <linux/signal.h>
@@ -199,9 +200,7 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
 static void f_modown(struct file *filp, struct pid *pid, enum pid_type type,
                      int force)
 {
-	unsigned long flags;
-
-	write_lock_irqsave(&filp->f_owner.lock, flags);
+	write_lock_irq(&filp->f_owner.lock);
 	if (force || !filp->f_owner.pid) {
 		put_pid(filp->f_owner.pid);
 		filp->f_owner.pid = get_pid(pid);
@@ -213,7 +212,7 @@ static void f_modown(struct file *filp, struct pid *pid, enum pid_type type,
 			filp->f_owner.euid = cred->euid;
 		}
 	}
-	write_unlock_irqrestore(&filp->f_owner.lock, flags);
+	write_unlock_irq(&filp->f_owner.lock);
 }
 
 int __f_setown(struct file *filp, struct pid *pid, enum pid_type type,
@@ -275,7 +274,7 @@ static int f_setown_ex(struct file *filp, unsigned long arg)
 
 	ret = copy_from_user(&owner, owner_p, sizeof(owner));
 	if (ret)
-		return ret;
+		return -EFAULT;
 
 	switch (owner.type) {
 	case F_OWNER_TID:
@@ -333,8 +332,11 @@ static int f_getown_ex(struct file *filp, unsigned long arg)
 	}
 	read_unlock(&filp->f_owner.lock);
 
-	if (!ret)
+	if (!ret) {
 		ret = copy_to_user(owner_p, &owner, sizeof(owner));
+		if (ret)
+			ret = -EFAULT;
+	}
 	return ret;
 }
 
@@ -346,7 +348,7 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
 	switch (cmd) {
 	case F_DUPFD:
 	case F_DUPFD_CLOEXEC:
-		if (arg >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur)
+		if (arg >= rlimit(RLIMIT_NOFILE))
 			break;
 		err = alloc_fd(arg, cmd == F_DUPFD_CLOEXEC ? O_CLOEXEC : 0);
 		if (err >= 0) {
@@ -414,6 +416,10 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
 	case F_NOTIFY:
 		err = fcntl_dirnotify(fd, filp, arg);
 		break;
+	case F_SETPIPE_SZ:
+	case F_GETPIPE_SZ:
+		err = pipe_fcntl(filp, cmd, arg);
+		break;
 	default:
 		break;
 	}
@@ -616,9 +622,15 @@ int send_sigurg(struct fown_struct *fown)
 	return ret;
 }
 
-static DEFINE_RWLOCK(fasync_lock);
+static DEFINE_SPINLOCK(fasync_lock);
 static struct kmem_cache *fasync_cache __read_mostly;
 
+static void fasync_free_rcu(struct rcu_head *head)
+{
+	kmem_cache_free(fasync_cache,
+			container_of(head, struct fasync_struct, fa_rcu));
+}
+
 /*
  * Remove a fasync entry. If successfully removed, return
  * positive and clear the FASYNC flag. If no entry exists,
@@ -627,8 +639,6 @@ static struct kmem_cache *fasync_cache __read_mostly;
  * NOTE! It is very important that the FASYNC flag always
  * match the state "is the filp on a fasync list".
  *
- * We always take the 'filp->f_lock', in since fasync_lock
- * needs to be irq-safe.
  */
 static int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp)
 {
@@ -636,17 +646,22 @@ static int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp)
 	int result = 0;
 
 	spin_lock(&filp->f_lock);
-	write_lock_irq(&fasync_lock);
+	spin_lock(&fasync_lock);
 	for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) {
 		if (fa->fa_file != filp)
 			continue;
+
+		spin_lock_irq(&fa->fa_lock);
+		fa->fa_file = NULL;
+		spin_unlock_irq(&fa->fa_lock);
+
 		*fp = fa->fa_next;
-		kmem_cache_free(fasync_cache, fa);
+		call_rcu(&fa->fa_rcu, fasync_free_rcu);
 		filp->f_flags &= ~FASYNC;
 		result = 1;
 		break;
 	}
-	write_unlock_irq(&fasync_lock);
+	spin_unlock(&fasync_lock);
 	spin_unlock(&filp->f_lock);
 	return result;
 }
@@ -668,25 +683,30 @@ static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fa
 		return -ENOMEM;
 
 	spin_lock(&filp->f_lock);
-	write_lock_irq(&fasync_lock);
+	spin_lock(&fasync_lock);
 	for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) {
 		if (fa->fa_file != filp)
 			continue;
+
+		spin_lock_irq(&fa->fa_lock);
 		fa->fa_fd = fd;
+		spin_unlock_irq(&fa->fa_lock);
+
 		kmem_cache_free(fasync_cache, new);
 		goto out;
 	}
 
+	spin_lock_init(&new->fa_lock);
 	new->magic = FASYNC_MAGIC;
 	new->fa_file = filp;
 	new->fa_fd = fd;
 	new->fa_next = *fapp;
-	*fapp = new;
+	rcu_assign_pointer(*fapp, new);
 	result = 1;
 	filp->f_flags |= FASYNC;
 
 out:
-	write_unlock_irq(&fasync_lock);
+	spin_unlock(&fasync_lock);
 	spin_unlock(&filp->f_lock);
 	return result;
 }
@@ -706,37 +726,43 @@ int fasync_helper(int fd, struct file * filp, int on, struct fasync_struct **fap
 
 EXPORT_SYMBOL(fasync_helper);
 
-void __kill_fasync(struct fasync_struct *fa, int sig, int band)
+/*
+ * rcu_read_lock() is held
+ */
+static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band)
 {
 	while (fa) {
-		struct fown_struct * fown;
+		struct fown_struct *fown;
+		unsigned long flags;
+
 		if (fa->magic != FASYNC_MAGIC) {
 			printk(KERN_ERR "kill_fasync: bad magic number in "
 			       "fasync_struct!\n");
 			return;
 		}
-		fown = &fa->fa_file->f_owner;
-		/* Don't send SIGURG to processes which have not set a
-		   queued signum: SIGURG has its own default signalling
-		   mechanism. */
-		if (!(sig == SIGURG && fown->signum == 0))
-			send_sigio(fown, fa->fa_fd, band);
-		fa = fa->fa_next;
+		spin_lock_irqsave(&fa->fa_lock, flags);
+		if (fa->fa_file) {
+			fown = &fa->fa_file->f_owner;
+			/* Don't send SIGURG to processes which have not set a
+			   queued signum: SIGURG has its own default signalling
+			   mechanism. */
+			if (!(sig == SIGURG && fown->signum == 0))
+				send_sigio(fown, fa->fa_fd, band);
+		}
+		spin_unlock_irqrestore(&fa->fa_lock, flags);
+		fa = rcu_dereference(fa->fa_next);
 	}
 }
 
-EXPORT_SYMBOL(__kill_fasync);
-
 void kill_fasync(struct fasync_struct **fp, int sig, int band)
 {
 	/* First a quick test without locking: usually
 	 * the list is empty.
 	 */
 	if (*fp) {
-		read_lock(&fasync_lock);
-		/* reread *fp after obtaining the lock */
-		__kill_fasync(*fp, sig, band);
-		read_unlock(&fasync_lock);
+		rcu_read_lock();
+		kill_fasync_rcu(rcu_dereference(*fp), sig, band);
+		rcu_read_unlock();
 	}
 }
 EXPORT_SYMBOL(kill_fasync);
diff --git a/fs/fifo.c b/fs/fifo.c
index f8f97b8b6d44..5d6606ffc2d2 100644
--- a/fs/fifo.c
+++ b/fs/fifo.c
@@ -10,7 +10,6 @@
  */
 
 #include <linux/mm.h>
-#include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/sched.h>
 #include <linux/pipe_fs_i.h>
diff --git a/fs/file.c b/fs/file.c
index 87e129030ab1..34bb7f71d994 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -257,7 +257,7 @@ int expand_files(struct files_struct *files, int nr)
 	 * N.B. For clone tasks sharing a files structure, this test
 	 * will limit the total number of files that can be opened.
 	 */
-	if (nr >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur)
+	if (nr >= rlimit(RLIMIT_NOFILE))
 		return -EMFILE;
 
 	/* Do we need to expand? */
@@ -478,7 +478,7 @@ repeat:
 	error = fd;
 #if 1
 	/* Sanity check */
-	if (rcu_dereference(fdt->fd[fd]) != NULL) {
+	if (rcu_dereference_raw(fdt->fd[fd]) != NULL) {
 		printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd);
 		rcu_assign_pointer(fdt->fd[fd], NULL);
 	}
diff --git a/fs/file_table.c b/fs/file_table.c
index 69652c5bd5f0..5c7d10ead4ad 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -194,14 +194,6 @@ struct file *alloc_file(struct path *path, fmode_t mode,
 }
 EXPORT_SYMBOL(alloc_file);
 
-void fput(struct file *file)
-{
-	if (atomic_long_dec_and_test(&file->f_count))
-		__fput(file);
-}
-
-EXPORT_SYMBOL(fput);
-
 /**
  * drop_file_write_access - give up ability to write to a file
  * @file: the file to which we will stop writing
@@ -227,10 +219,9 @@ void drop_file_write_access(struct file *file)
 }
 EXPORT_SYMBOL_GPL(drop_file_write_access);
 
-/* __fput is called from task context when aio completion releases the last
- * last use of a struct file *.  Do not use otherwise.
+/* the real guts of fput() - releasing the last reference to file
  */
-void __fput(struct file *file)
+static void __fput(struct file *file)
 {
 	struct dentry *dentry = file->f_path.dentry;
 	struct vfsmount *mnt = file->f_path.mnt;
@@ -253,6 +244,7 @@ void __fput(struct file *file)
 	if (file->f_op && file->f_op->release)
 		file->f_op->release(inode, file);
 	security_file_free(file);
+	ima_file_free(file);
 	if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL))
 		cdev_put(inode->i_cdev);
 	fops_put(file->f_op);
@@ -267,6 +259,14 @@ void __fput(struct file *file)
 	mntput(mnt);
 }
 
+void fput(struct file *file)
+{
+	if (atomic_long_dec_and_test(&file->f_count))
+		__fput(file);
+}
+
+EXPORT_SYMBOL(fput);
+
 struct file *fget(unsigned int fd)
 {
 	struct file *file;
@@ -392,7 +392,9 @@ retry:
 			continue;
 		if (!(f->f_mode & FMODE_WRITE))
 			continue;
+		spin_lock(&f->f_lock);
 		f->f_mode &= ~FMODE_WRITE;
+		spin_unlock(&f->f_lock);
 		if (file_check_writeable(f) != 0)
 			continue;
 		file_release_write(f);
diff --git a/fs/filesystems.c b/fs/filesystems.c
index a24c58e181db..68ba492d8eef 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -10,10 +10,10 @@
 #include <linux/fs.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
-#include <linux/slab.h>
 #include <linux/kmod.h>
 #include <linux/init.h>
 #include <linux/module.h>
+#include <linux/slab.h>
 #include <asm/uaccess.h>
 
 /*
diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c
index aee049cb9f84..0ec7bb2c95c6 100644
--- a/fs/freevxfs/vxfs_lookup.c
+++ b/fs/freevxfs/vxfs_lookup.c
@@ -57,6 +57,8 @@ const struct inode_operations vxfs_dir_inode_ops = {
 };
 
 const struct file_operations vxfs_dir_operations = {
+	.llseek =		generic_file_llseek,
+	.read =			generic_read_dir,
 	.readdir =		vxfs_readdir,
 };
 
diff --git a/fs/freevxfs/vxfs_subr.c b/fs/freevxfs/vxfs_subr.c
index ed8f0b0dd880..1429f3ae1e86 100644
--- a/fs/freevxfs/vxfs_subr.c
+++ b/fs/freevxfs/vxfs_subr.c
@@ -33,7 +33,6 @@
 #include <linux/fs.h>
 #include <linux/buffer_head.h>
 #include <linux/kernel.h>
-#include <linux/slab.h>
 #include <linux/pagemap.h>
 
 #include "vxfs_extern.h"
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 1a7c42c64ff4..d5be1693ac93 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -16,6 +16,7 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/spinlock.h>
+#include <linux/slab.h>
 #include <linux/sched.h>
 #include <linux/fs.h>
 #include <linux/mm.h>
@@ -37,51 +38,18 @@ int nr_pdflush_threads;
 /*
  * Passed into wb_writeback(), essentially a subset of writeback_control
  */
-struct wb_writeback_args {
+struct wb_writeback_work {
 	long nr_pages;
 	struct super_block *sb;
 	enum writeback_sync_modes sync_mode;
-	int for_kupdate:1;
-	int range_cyclic:1;
-	int for_background:1;
-};
+	unsigned int for_kupdate:1;
+	unsigned int range_cyclic:1;
+	unsigned int for_background:1;
 
-/*
- * Work items for the bdi_writeback threads
- */
-struct bdi_work {
 	struct list_head list;		/* pending work list */
-	struct rcu_head rcu_head;	/* for RCU free/clear of work */
-
-	unsigned long seen;		/* threads that have seen this work */
-	atomic_t pending;		/* number of threads still to do work */
-
-	struct wb_writeback_args args;	/* writeback arguments */
-
-	unsigned long state;		/* flag bits, see WS_* */
-};
-
-enum {
-	WS_USED_B = 0,
-	WS_ONSTACK_B,
+	struct completion *done;	/* set if the caller waits */
 };
 
-#define WS_USED (1 << WS_USED_B)
-#define WS_ONSTACK (1 << WS_ONSTACK_B)
-
-static inline bool bdi_work_on_stack(struct bdi_work *work)
-{
-	return test_bit(WS_ONSTACK_B, &work->state);
-}
-
-static inline void bdi_work_init(struct bdi_work *work,
-				 struct wb_writeback_args *args)
-{
-	INIT_RCU_HEAD(&work->rcu_head);
-	work->args = *args;
-	work->state = WS_USED;
-}
-
 /**
  * writeback_in_progress - determine whether there is writeback in progress
  * @bdi: the device's backing_dev_info structure.
@@ -94,76 +62,11 @@ int writeback_in_progress(struct backing_dev_info *bdi)
 	return !list_empty(&bdi->work_list);
 }
 
-static void bdi_work_clear(struct bdi_work *work)
-{
-	clear_bit(WS_USED_B, &work->state);
-	smp_mb__after_clear_bit();
-	/*
-	 * work can have disappeared at this point. bit waitq functions
-	 * should be able to tolerate this, provided bdi_sched_wait does
-	 * not dereference it's pointer argument.
-	*/
-	wake_up_bit(&work->state, WS_USED_B);
-}
-
-static void bdi_work_free(struct rcu_head *head)
-{
-	struct bdi_work *work = container_of(head, struct bdi_work, rcu_head);
-
-	if (!bdi_work_on_stack(work))
-		kfree(work);
-	else
-		bdi_work_clear(work);
-}
-
-static void wb_work_complete(struct bdi_work *work)
-{
-	const enum writeback_sync_modes sync_mode = work->args.sync_mode;
-	int onstack = bdi_work_on_stack(work);
-
-	/*
-	 * For allocated work, we can clear the done/seen bit right here.
-	 * For on-stack work, we need to postpone both the clear and free
-	 * to after the RCU grace period, since the stack could be invalidated
-	 * as soon as bdi_work_clear() has done the wakeup.
-	 */
-	if (!onstack)
-		bdi_work_clear(work);
-	if (sync_mode == WB_SYNC_NONE || onstack)
-		call_rcu(&work->rcu_head, bdi_work_free);
-}
-
-static void wb_clear_pending(struct bdi_writeback *wb, struct bdi_work *work)
+static void bdi_queue_work(struct backing_dev_info *bdi,
+		struct wb_writeback_work *work)
 {
-	/*
-	 * The caller has retrieved the work arguments from this work,
-	 * drop our reference. If this is the last ref, delete and free it
-	 */
-	if (atomic_dec_and_test(&work->pending)) {
-		struct backing_dev_info *bdi = wb->bdi;
-
-		spin_lock(&bdi->wb_lock);
-		list_del_rcu(&work->list);
-		spin_unlock(&bdi->wb_lock);
-
-		wb_work_complete(work);
-	}
-}
-
-static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work)
-{
-	work->seen = bdi->wb_mask;
-	BUG_ON(!work->seen);
-	atomic_set(&work->pending, bdi->wb_cnt);
-	BUG_ON(!bdi->wb_cnt);
-
-	/*
-	 * list_add_tail_rcu() contains the necessary barriers to
-	 * make sure the above stores are seen before the item is
-	 * noticed on the list
-	 */
 	spin_lock(&bdi->wb_lock);
-	list_add_tail_rcu(&work->list, &bdi->work_list);
+	list_add_tail(&work->list, &bdi->work_list);
 	spin_unlock(&bdi->wb_lock);
 
 	/*
@@ -180,97 +83,59 @@ static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work)
 	}
 }
 
-/*
- * Used for on-stack allocated work items. The caller needs to wait until
- * the wb threads have acked the work before it's safe to continue.
- */
-static void bdi_wait_on_work_clear(struct bdi_work *work)
+static void
+__bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
+		bool range_cyclic, bool for_background)
 {
-	wait_on_bit(&work->state, WS_USED_B, bdi_sched_wait,
-		    TASK_UNINTERRUPTIBLE);
-}
-
-static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
-				 struct wb_writeback_args *args)
-{
-	struct bdi_work *work;
+	struct wb_writeback_work *work;
 
 	/*
 	 * This is WB_SYNC_NONE writeback, so if allocation fails just
 	 * wakeup the thread for old dirty data writeback
 	 */
-	work = kmalloc(sizeof(*work), GFP_ATOMIC);
-	if (work) {
-		bdi_work_init(work, args);
-		bdi_queue_work(bdi, work);
-	} else {
-		struct bdi_writeback *wb = &bdi->wb;
-
-		if (wb->task)
-			wake_up_process(wb->task);
+	work = kzalloc(sizeof(*work), GFP_ATOMIC);
+	if (!work) {
+		if (bdi->wb.task)
+			wake_up_process(bdi->wb.task);
+		return;
 	}
+
+	work->sync_mode	= WB_SYNC_NONE;
+	work->nr_pages	= nr_pages;
+	work->range_cyclic = range_cyclic;
+	work->for_background = for_background;
+
+	bdi_queue_work(bdi, work);
 }
 
 /**
- * bdi_sync_writeback - start and wait for writeback
+ * bdi_start_writeback - start writeback
  * @bdi: the backing device to write from
- * @sb: write inodes from this super_block
+ * @nr_pages: the number of pages to write
  *
  * Description:
- *   This does WB_SYNC_ALL data integrity writeback and waits for the
- *   IO to complete. Callers must hold the sb s_umount semaphore for
- *   reading, to avoid having the super disappear before we are done.
+ *   This does WB_SYNC_NONE opportunistic writeback. The IO is only
+ *   started when this function returns, we make no guarentees on
+ *   completion. Caller need not hold sb s_umount semaphore.
+ *
  */
-static void bdi_sync_writeback(struct backing_dev_info *bdi,
-			       struct super_block *sb)
+void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
 {
-	struct wb_writeback_args args = {
-		.sb		= sb,
-		.sync_mode	= WB_SYNC_ALL,
-		.nr_pages	= LONG_MAX,
-		.range_cyclic	= 0,
-	};
-	struct bdi_work work;
-
-	bdi_work_init(&work, &args);
-	work.state |= WS_ONSTACK;
-
-	bdi_queue_work(bdi, &work);
-	bdi_wait_on_work_clear(&work);
+	__bdi_start_writeback(bdi, nr_pages, true, false);
 }
 
 /**
- * bdi_start_writeback - start writeback
+ * bdi_start_background_writeback - start background writeback
  * @bdi: the backing device to write from
- * @sb: write inodes from this super_block
- * @nr_pages: the number of pages to write
  *
  * Description:
- *   This does WB_SYNC_NONE opportunistic writeback. The IO is only
+ *   This does WB_SYNC_NONE background writeback. The IO is only
  *   started when this function returns, we make no guarentees on
  *   completion. Caller need not hold sb s_umount semaphore.
- *
  */
-void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
-			 long nr_pages)
+void bdi_start_background_writeback(struct backing_dev_info *bdi)
 {
-	struct wb_writeback_args args = {
-		.sb		= sb,
-		.sync_mode	= WB_SYNC_NONE,
-		.nr_pages	= nr_pages,
-		.range_cyclic	= 1,
-	};
-
-	/*
-	 * We treat @nr_pages=0 as the special case to do background writeback,
-	 * ie. to sync pages until the background dirty threshold is reached.
-	 */
-	if (!nr_pages) {
-		args.nr_pages = LONG_MAX;
-		args.for_background = 1;
-	}
-
-	bdi_alloc_queue_work(bdi, &args);
+	__bdi_start_writeback(bdi, LONG_MAX, true, true);
 }
 
 /*
@@ -381,10 +246,10 @@ static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
 	move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
 }
 
-static int write_inode(struct inode *inode, int sync)
+static int write_inode(struct inode *inode, struct writeback_control *wbc)
 {
 	if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode))
-		return inode->i_sb->s_op->write_inode(inode, sync);
+		return inode->i_sb->s_op->write_inode(inode, wbc);
 	return 0;
 }
 
@@ -397,11 +262,11 @@ static void inode_wait_for_writeback(struct inode *inode)
 	wait_queue_head_t *wqh;
 
 	wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
-	do {
+	 while (inode->i_state & I_SYNC) {
 		spin_unlock(&inode_lock);
 		__wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);
 		spin_lock(&inode_lock);
-	} while (inode->i_state & I_SYNC);
+	}
 }
 
 /*
@@ -421,7 +286,6 @@ static int
 writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 {
 	struct address_space *mapping = inode->i_mapping;
-	int wait = wbc->sync_mode == WB_SYNC_ALL;
 	unsigned dirty;
 	int ret;
 
@@ -439,7 +303,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 		 * We'll have another go at writing back this inode when we
 		 * completed a full scan of b_io.
 		 */
-		if (!wait) {
+		if (wbc->sync_mode != WB_SYNC_ALL) {
 			requeue_io(inode);
 			return 0;
 		}
@@ -452,24 +316,36 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 
 	BUG_ON(inode->i_state & I_SYNC);
 
-	/* Set I_SYNC, reset I_DIRTY */
-	dirty = inode->i_state & I_DIRTY;
+	/* Set I_SYNC, reset I_DIRTY_PAGES */
 	inode->i_state |= I_SYNC;
-	inode->i_state &= ~I_DIRTY;
-
+	inode->i_state &= ~I_DIRTY_PAGES;
 	spin_unlock(&inode_lock);
 
 	ret = do_writepages(mapping, wbc);
 
-	/* Don't write the inode if only I_DIRTY_PAGES was set */
-	if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
-		int err = write_inode(inode, wait);
+	/*
+	 * Make sure to wait on the data before writing out the metadata.
+	 * This is important for filesystems that modify metadata on data
+	 * I/O completion.
+	 */
+	if (wbc->sync_mode == WB_SYNC_ALL) {
+		int err = filemap_fdatawait(mapping);
 		if (ret == 0)
 			ret = err;
 	}
 
-	if (wait) {
-		int err = filemap_fdatawait(mapping);
+	/*
+	 * Some filesystems may redirty the inode during the writeback
+	 * due to delalloc, clear dirty metadata flags right before
+	 * write_inode()
+	 */
+	spin_lock(&inode_lock);
+	dirty = inode->i_state & I_DIRTY;
+	inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC);
+	spin_unlock(&inode_lock);
+	/* Don't write the inode if only I_DIRTY_PAGES was set */
+	if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
+		int err = write_inode(inode, wbc);
 		if (ret == 0)
 			ret = err;
 	}
@@ -549,108 +425,79 @@ select_queue:
 	return ret;
 }
 
-static void unpin_sb_for_writeback(struct super_block **psb)
-{
-	struct super_block *sb = *psb;
-
-	if (sb) {
-		up_read(&sb->s_umount);
-		put_super(sb);
-		*psb = NULL;
-	}
-}
-
 /*
- * For WB_SYNC_NONE writeback, the caller does not have the sb pinned
+ * For background writeback the caller does not have the sb pinned
  * before calling writeback. So make sure that we do pin it, so it doesn't
  * go away while we are writing inodes from it.
- *
- * Returns 0 if the super was successfully pinned (or pinning wasn't needed),
- * 1 if we failed.
  */
-static int pin_sb_for_writeback(struct writeback_control *wbc,
-				struct inode *inode, struct super_block **psb)
+static bool pin_sb_for_writeback(struct super_block *sb)
 {
-	struct super_block *sb = inode->i_sb;
-
-	/*
-	 * If this sb is already pinned, nothing more to do. If not and
-	 * *psb is non-NULL, unpin the old one first
-	 */
-	if (sb == *psb)
-		return 0;
-	else if (*psb)
-		unpin_sb_for_writeback(psb);
-
-	/*
-	 * Caller must already hold the ref for this
-	 */
-	if (wbc->sync_mode == WB_SYNC_ALL) {
-		WARN_ON(!rwsem_is_locked(&sb->s_umount));
-		return 0;
+	spin_lock(&sb_lock);
+	if (list_empty(&sb->s_instances)) {
+		spin_unlock(&sb_lock);
+		return false;
 	}
 
-	spin_lock(&sb_lock);
 	sb->s_count++;
+	spin_unlock(&sb_lock);
+
 	if (down_read_trylock(&sb->s_umount)) {
-		if (sb->s_root) {
-			spin_unlock(&sb_lock);
-			goto pinned;
-		}
-		/*
-		 * umounted, drop rwsem again and fall through to failure
-		 */
+		if (sb->s_root)
+			return true;
 		up_read(&sb->s_umount);
 	}
 
-	sb->s_count--;
-	spin_unlock(&sb_lock);
-	return 1;
-pinned:
-	*psb = sb;
-	return 0;
+	put_super(sb);
+	return false;
 }
 
-static void writeback_inodes_wb(struct bdi_writeback *wb,
-				struct writeback_control *wbc)
+/*
+ * Write a portion of b_io inodes which belong to @sb.
+ *
+ * If @only_this_sb is true, then find and write all such
+ * inodes. Otherwise write only ones which go sequentially
+ * in reverse order.
+ *
+ * Return 1, if the caller writeback routine should be
+ * interrupted. Otherwise return 0.
+ */
+static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
+		struct writeback_control *wbc, bool only_this_sb)
 {
-	struct super_block *sb = wbc->sb, *pin_sb = NULL;
-	const unsigned long start = jiffies;	/* livelock avoidance */
-
-	spin_lock(&inode_lock);
-
-	if (!wbc->for_kupdate || list_empty(&wb->b_io))
-		queue_io(wb, wbc->older_than_this);
-
 	while (!list_empty(&wb->b_io)) {
-		struct inode *inode = list_entry(wb->b_io.prev,
-						struct inode, i_list);
 		long pages_skipped;
+		struct inode *inode = list_entry(wb->b_io.prev,
+						 struct inode, i_list);
 
-		/*
-		 * super block given and doesn't match, skip this inode
-		 */
-		if (sb && sb != inode->i_sb) {
-			redirty_tail(inode);
-			continue;
+		if (inode->i_sb != sb) {
+			if (only_this_sb) {
+				/*
+				 * We only want to write back data for this
+				 * superblock, move all inodes not belonging
+				 * to it back onto the dirty list.
+				 */
+				redirty_tail(inode);
+				continue;
+			}
+
+			/*
+			 * The inode belongs to a different superblock.
+			 * Bounce back to the caller to unpin this and
+			 * pin the next superblock.
+			 */
+			return 0;
 		}
 
 		if (inode->i_state & (I_NEW | I_WILL_FREE)) {
 			requeue_io(inode);
 			continue;
 		}
-
 		/*
 		 * Was this inode dirtied after sync_sb_inodes was called?
 		 * This keeps sync from extra jobs and livelock.
 		 */
-		if (inode_dirtied_after(inode, start))
-			break;
-
-		if (pin_sb_for_writeback(wbc, inode, &pin_sb)) {
-			requeue_io(inode);
-			continue;
-		}
+		if (inode_dirtied_after(inode, wbc->wb_start))
+			return 1;
 
 		BUG_ON(inode->i_state & (I_FREEING | I_CLEAR));
 		__iget(inode);
@@ -669,23 +516,55 @@ static void writeback_inodes_wb(struct bdi_writeback *wb,
 		spin_lock(&inode_lock);
 		if (wbc->nr_to_write <= 0) {
 			wbc->more_io = 1;
-			break;
+			return 1;
 		}
 		if (!list_empty(&wb->b_more_io))
 			wbc->more_io = 1;
 	}
+	/* b_io is empty */
+	return 1;
+}
 
-	unpin_sb_for_writeback(&pin_sb);
+void writeback_inodes_wb(struct bdi_writeback *wb,
+		struct writeback_control *wbc)
+{
+	int ret = 0;
 
+	wbc->wb_start = jiffies; /* livelock avoidance */
+	spin_lock(&inode_lock);
+	if (!wbc->for_kupdate || list_empty(&wb->b_io))
+		queue_io(wb, wbc->older_than_this);
+
+	while (!list_empty(&wb->b_io)) {
+		struct inode *inode = list_entry(wb->b_io.prev,
+						 struct inode, i_list);
+		struct super_block *sb = inode->i_sb;
+
+		if (!pin_sb_for_writeback(sb)) {
+			requeue_io(inode);
+			continue;
+		}
+		ret = writeback_sb_inodes(sb, wb, wbc, false);
+		drop_super(sb);
+
+		if (ret)
+			break;
+	}
 	spin_unlock(&inode_lock);
 	/* Leave any unwritten inodes on b_io */
 }
 
-void writeback_inodes_wbc(struct writeback_control *wbc)
+static void __writeback_inodes_sb(struct super_block *sb,
+		struct bdi_writeback *wb, struct writeback_control *wbc)
 {
-	struct backing_dev_info *bdi = wbc->bdi;
+	WARN_ON(!rwsem_is_locked(&sb->s_umount));
 
-	writeback_inodes_wb(&bdi->wb, wbc);
+	wbc->wb_start = jiffies; /* livelock avoidance */
+	spin_lock(&inode_lock);
+	if (!wbc->for_kupdate || list_empty(&wb->b_io))
+		queue_io(wb, wbc->older_than_this);
+	writeback_sb_inodes(sb, wb, wbc, true);
+	spin_unlock(&inode_lock);
 }
 
 /*
@@ -723,16 +602,14 @@ static inline bool over_bground_thresh(void)
  * all dirty pages if they are all attached to "old" mappings.
  */
 static long wb_writeback(struct bdi_writeback *wb,
-			 struct wb_writeback_args *args)
+			 struct wb_writeback_work *work)
 {
 	struct writeback_control wbc = {
-		.bdi			= wb->bdi,
-		.sb			= args->sb,
-		.sync_mode		= args->sync_mode,
+		.sync_mode		= work->sync_mode,
 		.older_than_this	= NULL,
-		.for_kupdate		= args->for_kupdate,
-		.for_background		= args->for_background,
-		.range_cyclic		= args->range_cyclic,
+		.for_kupdate		= work->for_kupdate,
+		.for_background		= work->for_background,
+		.range_cyclic		= work->range_cyclic,
 	};
 	unsigned long oldest_jif;
 	long wrote = 0;
@@ -752,21 +629,24 @@ static long wb_writeback(struct bdi_writeback *wb,
 		/*
 		 * Stop writeback when nr_pages has been consumed
 		 */
-		if (args->nr_pages <= 0)
+		if (work->nr_pages <= 0)
 			break;
 
 		/*
 		 * For background writeout, stop when we are below the
 		 * background dirty threshold
 		 */
-		if (args->for_background && !over_bground_thresh())
+		if (work->for_background && !over_bground_thresh())
 			break;
 
 		wbc.more_io = 0;
 		wbc.nr_to_write = MAX_WRITEBACK_PAGES;
 		wbc.pages_skipped = 0;
-		writeback_inodes_wb(wb, &wbc);
-		args->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
+		if (work->sb)
+			__writeback_inodes_sb(work->sb, wb, &wbc);
+		else
+			writeback_inodes_wb(wb, &wbc);
+		work->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
 		wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;
 
 		/*
@@ -802,31 +682,21 @@ static long wb_writeback(struct bdi_writeback *wb,
 }
 
 /*
- * Return the next bdi_work struct that hasn't been processed by this
- * wb thread yet. ->seen is initially set for each thread that exists
- * for this device, when a thread first notices a piece of work it
- * clears its bit. Depending on writeback type, the thread will notify
- * completion on either receiving the work (WB_SYNC_NONE) or after
- * it is done (WB_SYNC_ALL).
+ * Return the next wb_writeback_work struct that hasn't been processed yet.
  */
-static struct bdi_work *get_next_work_item(struct backing_dev_info *bdi,
-					   struct bdi_writeback *wb)
+static struct wb_writeback_work *
+get_next_work_item(struct backing_dev_info *bdi, struct bdi_writeback *wb)
 {
-	struct bdi_work *work, *ret = NULL;
+	struct wb_writeback_work *work = NULL;
 
-	rcu_read_lock();
-
-	list_for_each_entry_rcu(work, &bdi->work_list, list) {
-		if (!test_bit(wb->nr, &work->seen))
-			continue;
-		clear_bit(wb->nr, &work->seen);
-
-		ret = work;
-		break;
+	spin_lock(&bdi->wb_lock);
+	if (!list_empty(&bdi->work_list)) {
+		work = list_entry(bdi->work_list.next,
+				  struct wb_writeback_work, list);
+		list_del_init(&work->list);
 	}
-
-	rcu_read_unlock();
-	return ret;
+	spin_unlock(&bdi->wb_lock);
+	return work;
 }
 
 static long wb_check_old_data_flush(struct bdi_writeback *wb)
@@ -834,6 +704,12 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
 	unsigned long expired;
 	long nr_pages;
 
+	/*
+	 * When set to zero, disable periodic writeback
+	 */
+	if (!dirty_writeback_interval)
+		return 0;
+
 	expired = wb->last_old_flush +
 			msecs_to_jiffies(dirty_writeback_interval * 10);
 	if (time_before(jiffies, expired))
@@ -845,14 +721,14 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
 			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
 
 	if (nr_pages) {
-		struct wb_writeback_args args = {
+		struct wb_writeback_work work = {
 			.nr_pages	= nr_pages,
 			.sync_mode	= WB_SYNC_NONE,
 			.for_kupdate	= 1,
 			.range_cyclic	= 1,
 		};
 
-		return wb_writeback(wb, &args);
+		return wb_writeback(wb, &work);
 	}
 
 	return 0;
@@ -864,33 +740,27 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
 long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
 {
 	struct backing_dev_info *bdi = wb->bdi;
-	struct bdi_work *work;
+	struct wb_writeback_work *work;
 	long wrote = 0;
 
 	while ((work = get_next_work_item(bdi, wb)) != NULL) {
-		struct wb_writeback_args args = work->args;
-
 		/*
 		 * Override sync mode, in case we must wait for completion
+		 * because this thread is exiting now.
 		 */
 		if (force_wait)
-			work->args.sync_mode = args.sync_mode = WB_SYNC_ALL;
+			work->sync_mode = WB_SYNC_ALL;
 
-		/*
-		 * If this isn't a data integrity operation, just notify
-		 * that we have seen this work and we are now starting it.
-		 */
-		if (args.sync_mode == WB_SYNC_NONE)
-			wb_clear_pending(wb, work);
-
-		wrote += wb_writeback(wb, &args);
+		wrote += wb_writeback(wb, work);
 
 		/*
-		 * This is a data integrity writeback, so only do the
-		 * notification when we have completed the work.
+		 * Notify the caller of completion if this is a synchronous
+		 * work item, otherwise just free it.
 		 */
-		if (args.sync_mode == WB_SYNC_ALL)
-			wb_clear_pending(wb, work);
+		if (work->done)
+			complete(work->done);
+		else
+			kfree(work);
 	}
 
 	/*
@@ -929,8 +799,17 @@ int bdi_writeback_task(struct bdi_writeback *wb)
 				break;
 		}
 
-		wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10);
-		schedule_timeout_interruptible(wait_jiffies);
+		if (dirty_writeback_interval) {
+			wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10);
+			schedule_timeout_interruptible(wait_jiffies);
+		} else {
+			set_current_state(TASK_INTERRUPTIBLE);
+			if (list_empty_careful(&wb->bdi->work_list) &&
+			    !kthread_should_stop())
+				schedule();
+			__set_current_state(TASK_RUNNING);
+		}
+
 		try_to_freeze();
 	}
 
@@ -938,42 +817,27 @@ int bdi_writeback_task(struct bdi_writeback *wb)
 }
 
 /*
- * Schedule writeback for all backing devices. This does WB_SYNC_NONE
- * writeback, for integrity writeback see bdi_sync_writeback().
+ * Start writeback of `nr_pages' pages.  If `nr_pages' is zero, write back
+ * the whole world.
  */
-static void bdi_writeback_all(struct super_block *sb, long nr_pages)
+void wakeup_flusher_threads(long nr_pages)
 {
-	struct wb_writeback_args args = {
-		.sb		= sb,
-		.nr_pages	= nr_pages,
-		.sync_mode	= WB_SYNC_NONE,
-	};
 	struct backing_dev_info *bdi;
 
-	rcu_read_lock();
+	if (!nr_pages) {
+		nr_pages = global_page_state(NR_FILE_DIRTY) +
+				global_page_state(NR_UNSTABLE_NFS);
+	}
 
+	rcu_read_lock();
 	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
 		if (!bdi_has_dirty_io(bdi))
 			continue;
-
-		bdi_alloc_queue_work(bdi, &args);
+		__bdi_start_writeback(bdi, nr_pages, false, false);
 	}
-
 	rcu_read_unlock();
 }
 
-/*
- * Start writeback of `nr_pages' pages.  If `nr_pages' is zero, write back
- * the whole world.
- */
-void wakeup_flusher_threads(long nr_pages)
-{
-	if (nr_pages == 0)
-		nr_pages = global_page_state(NR_FILE_DIRTY) +
-				global_page_state(NR_UNSTABLE_NFS);
-	bdi_writeback_all(NULL, nr_pages);
-}
-
 static noinline void block_dump___mark_inode_dirty(struct inode *inode)
 {
 	if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
@@ -1178,12 +1042,20 @@ void writeback_inodes_sb(struct super_block *sb)
 {
 	unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
 	unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
-	long nr_to_write;
+	DECLARE_COMPLETION_ONSTACK(done);
+	struct wb_writeback_work work = {
+		.sb		= sb,
+		.sync_mode	= WB_SYNC_NONE,
+		.done		= &done,
+	};
+
+	WARN_ON(!rwsem_is_locked(&sb->s_umount));
 
-	nr_to_write = nr_dirty + nr_unstable +
+	work.nr_pages = nr_dirty + nr_unstable +
 			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
 
-	bdi_start_writeback(sb->s_bdi, sb, nr_to_write);
+	bdi_queue_work(sb->s_bdi, &work);
+	wait_for_completion(&done);
 }
 EXPORT_SYMBOL(writeback_inodes_sb);
 
@@ -1197,7 +1069,9 @@ EXPORT_SYMBOL(writeback_inodes_sb);
 int writeback_inodes_sb_if_idle(struct super_block *sb)
 {
 	if (!writeback_in_progress(sb->s_bdi)) {
+		down_read(&sb->s_umount);
 		writeback_inodes_sb(sb);
+		up_read(&sb->s_umount);
 		return 1;
 	} else
 		return 0;
@@ -1213,7 +1087,20 @@ EXPORT_SYMBOL(writeback_inodes_sb_if_idle);
  */
 void sync_inodes_sb(struct super_block *sb)
 {
-	bdi_sync_writeback(sb->s_bdi, sb);
+	DECLARE_COMPLETION_ONSTACK(done);
+	struct wb_writeback_work work = {
+		.sb		= sb,
+		.sync_mode	= WB_SYNC_ALL,
+		.nr_pages	= LONG_MAX,
+		.range_cyclic	= 0,
+		.done		= &done,
+	};
+
+	WARN_ON(!rwsem_is_locked(&sb->s_umount));
+
+	bdi_queue_work(sb->s_bdi, &work);
+	wait_for_completion(&done);
+
 	wait_sb_inodes(sb);
 }
 EXPORT_SYMBOL(sync_inodes_sb);
diff --git a/fs/fscache/Kconfig b/fs/fscache/Kconfig
index 864dac20a242..cc94bb9563f2 100644
--- a/fs/fscache/Kconfig
+++ b/fs/fscache/Kconfig
@@ -1,7 +1,6 @@
 
 config FSCACHE
 	tristate "General filesystem local caching manager"
-	depends on EXPERIMENTAL
 	select SLOW_WORK
 	help
 	  This option enables a generic filesystem caching manager that can be
diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c
index 3221a0c7944e..4a8eb31c5338 100644
--- a/fs/fscache/object-list.c
+++ b/fs/fscache/object-list.c
@@ -12,6 +12,7 @@
 #define FSCACHE_DEBUG_LEVEL COOKIE
 #include <linux/module.h>
 #include <linux/seq_file.h>
+#include <linux/slab.h>
 #include <linux/key.h>
 #include <keys/user-type.h>
 #include "internal.h"
@@ -102,7 +103,7 @@ static struct fscache_object *fscache_objlist_lookup(loff_t *_pos)
 	/* banners (can't represent line 0 by pos 0 as that would involve
 	 * returning a NULL pointer) */
 	if (pos == 0)
-		return (struct fscache_object *) ++(*_pos);
+		return (struct fscache_object *)(long)++(*_pos);
 	if (pos < 3)
 		return (struct fscache_object *)pos;
 
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
index e513ac599c8e..0b589a9b4ffc 100644
--- a/fs/fscache/object.c
+++ b/fs/fscache/object.c
@@ -53,7 +53,7 @@ const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5] = {
 static void fscache_object_slow_work_put_ref(struct slow_work *);
 static int  fscache_object_slow_work_get_ref(struct slow_work *);
 static void fscache_object_slow_work_execute(struct slow_work *);
-#ifdef CONFIG_SLOW_WORK_PROC
+#ifdef CONFIG_SLOW_WORK_DEBUG
 static void fscache_object_slow_work_desc(struct slow_work *, struct seq_file *);
 #endif
 static void fscache_initialise_object(struct fscache_object *);
@@ -69,7 +69,7 @@ const struct slow_work_ops fscache_object_slow_work_ops = {
 	.get_ref	= fscache_object_slow_work_get_ref,
 	.put_ref	= fscache_object_slow_work_put_ref,
 	.execute	= fscache_object_slow_work_execute,
-#ifdef CONFIG_SLOW_WORK_PROC
+#ifdef CONFIG_SLOW_WORK_DEBUG
 	.desc		= fscache_object_slow_work_desc,
 #endif
 };
@@ -364,7 +364,7 @@ static void fscache_object_slow_work_execute(struct slow_work *work)
 /*
  * describe an object for slow-work debugging
  */
-#ifdef CONFIG_SLOW_WORK_PROC
+#ifdef CONFIG_SLOW_WORK_DEBUG
 static void fscache_object_slow_work_desc(struct slow_work *work,
 					  struct seq_file *m)
 {
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index 313e79a14266..f17cecafae44 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -14,6 +14,7 @@
 #define FSCACHE_DEBUG_LEVEL OPERATION
 #include <linux/module.h>
 #include <linux/seq_file.h>
+#include <linux/slab.h>
 #include "internal.h"
 
 atomic_t fscache_op_debug_id;
@@ -500,7 +501,7 @@ static void fscache_op_execute(struct slow_work *work)
 /*
  * describe an operation for slow-work debugging
  */
-#ifdef CONFIG_SLOW_WORK_PROC
+#ifdef CONFIG_SLOW_WORK_DEBUG
 static void fscache_op_desc(struct slow_work *work, struct seq_file *m)
 {
 	struct fscache_operation *op =
@@ -517,7 +518,7 @@ const struct slow_work_ops fscache_op_slow_work_ops = {
 	.get_ref	= fscache_op_get_ref,
 	.put_ref	= fscache_op_put_ref,
 	.execute	= fscache_op_execute,
-#ifdef CONFIG_SLOW_WORK_PROC
+#ifdef CONFIG_SLOW_WORK_DEBUG
 	.desc		= fscache_op_desc,
 #endif
 };
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index c598ea4c4e7d..723b889fd219 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -14,6 +14,7 @@
 #include <linux/fscache-cache.h>
 #include <linux/buffer_head.h>
 #include <linux/pagevec.h>
+#include <linux/slab.h>
 #include "internal.h"
 
 /*
@@ -709,30 +710,26 @@ static void fscache_write_op(struct fscache_operation *_op)
 		goto superseded;
 	}
 
-	if (page) {
-		radix_tree_tag_set(&cookie->stores, page->index,
-				   FSCACHE_COOKIE_STORING_TAG);
-		radix_tree_tag_clear(&cookie->stores, page->index,
-				     FSCACHE_COOKIE_PENDING_TAG);
-	}
+	radix_tree_tag_set(&cookie->stores, page->index,
+			   FSCACHE_COOKIE_STORING_TAG);
+	radix_tree_tag_clear(&cookie->stores, page->index,
+			     FSCACHE_COOKIE_PENDING_TAG);
 
 	spin_unlock(&cookie->stores_lock);
 	spin_unlock(&object->lock);
 
-	if (page) {
-		fscache_set_op_state(&op->op, "Store");
-		fscache_stat(&fscache_n_store_pages);
-		fscache_stat(&fscache_n_cop_write_page);
-		ret = object->cache->ops->write_page(op, page);
-		fscache_stat_d(&fscache_n_cop_write_page);
-		fscache_set_op_state(&op->op, "EndWrite");
-		fscache_end_page_write(object, page);
-		if (ret < 0) {
-			fscache_set_op_state(&op->op, "Abort");
-			fscache_abort_object(object);
-		} else {
-			fscache_enqueue_operation(&op->op);
-		}
+	fscache_set_op_state(&op->op, "Store");
+	fscache_stat(&fscache_n_store_pages);
+	fscache_stat(&fscache_n_cop_write_page);
+	ret = object->cache->ops->write_page(op, page);
+	fscache_stat_d(&fscache_n_cop_write_page);
+	fscache_set_op_state(&op->op, "EndWrite");
+	fscache_end_page_write(object, page);
+	if (ret < 0) {
+		fscache_set_op_state(&op->op, "Abort");
+		fscache_abort_object(object);
+	} else {
+		fscache_enqueue_operation(&op->op);
 	}
 
 	_leave("");
@@ -881,6 +878,7 @@ submit_failed:
 	goto nobufs;
 
 nobufs_unlock_obj:
+	spin_unlock(&cookie->stores_lock);
 	spin_unlock(&object->lock);
 nobufs:
 	spin_unlock(&cookie->lock);
diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c
index 46435f3aae68..4765190d537f 100644
--- a/fs/fscache/stats.c
+++ b/fs/fscache/stats.c
@@ -165,8 +165,8 @@ static int fscache_stats_show(struct seq_file *m, void *v)
 		   atomic_read(&fscache_n_object_lookups),
 		   atomic_read(&fscache_n_object_lookups_negative),
 		   atomic_read(&fscache_n_object_lookups_positive),
-		   atomic_read(&fscache_n_object_lookups_timed_out),
-		   atomic_read(&fscache_n_object_created));
+		   atomic_read(&fscache_n_object_created),
+		   atomic_read(&fscache_n_object_lookups_timed_out));
 
 	seq_printf(m, "Updates: n=%u nul=%u run=%u\n",
 		   atomic_read(&fscache_n_updates),
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index de792dcf3274..e1f8171278bd 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -44,6 +44,7 @@
 #include <linux/magic.h>
 #include <linux/miscdevice.h>
 #include <linux/mutex.h>
+#include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/stat.h>
 
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 51d9e33d634f..9424796d6634 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -16,8 +16,12 @@
 #include <linux/pagemap.h>
 #include <linux/file.h>
 #include <linux/slab.h>
+#include <linux/pipe_fs_i.h>
+#include <linux/swap.h>
+#include <linux/splice.h>
 
 MODULE_ALIAS_MISCDEV(FUSE_MINOR);
+MODULE_ALIAS("devname:fuse");
 
 static struct kmem_cache *fuse_req_cachep;
 
@@ -498,6 +502,9 @@ struct fuse_copy_state {
 	int write;
 	struct fuse_req *req;
 	const struct iovec *iov;
+	struct pipe_buffer *pipebufs;
+	struct pipe_buffer *currbuf;
+	struct pipe_inode_info *pipe;
 	unsigned long nr_segs;
 	unsigned long seglen;
 	unsigned long addr;
@@ -505,16 +512,16 @@ struct fuse_copy_state {
 	void *mapaddr;
 	void *buf;
 	unsigned len;
+	unsigned move_pages:1;
 };
 
 static void fuse_copy_init(struct fuse_copy_state *cs, struct fuse_conn *fc,
-			   int write, struct fuse_req *req,
+			   int write,
 			   const struct iovec *iov, unsigned long nr_segs)
 {
 	memset(cs, 0, sizeof(*cs));
 	cs->fc = fc;
 	cs->write = write;
-	cs->req = req;
 	cs->iov = iov;
 	cs->nr_segs = nr_segs;
 }
@@ -522,7 +529,18 @@ static void fuse_copy_init(struct fuse_copy_state *cs, struct fuse_conn *fc,
 /* Unmap and put previous page of userspace buffer */
 static void fuse_copy_finish(struct fuse_copy_state *cs)
 {
-	if (cs->mapaddr) {
+	if (cs->currbuf) {
+		struct pipe_buffer *buf = cs->currbuf;
+
+		if (!cs->write) {
+			buf->ops->unmap(cs->pipe, buf, cs->mapaddr);
+		} else {
+			kunmap_atomic(cs->mapaddr, KM_USER0);
+			buf->len = PAGE_SIZE - cs->len;
+		}
+		cs->currbuf = NULL;
+		cs->mapaddr = NULL;
+	} else if (cs->mapaddr) {
 		kunmap_atomic(cs->mapaddr, KM_USER0);
 		if (cs->write) {
 			flush_dcache_page(cs->pg);
@@ -544,26 +562,61 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
 
 	unlock_request(cs->fc, cs->req);
 	fuse_copy_finish(cs);
-	if (!cs->seglen) {
-		BUG_ON(!cs->nr_segs);
-		cs->seglen = cs->iov[0].iov_len;
-		cs->addr = (unsigned long) cs->iov[0].iov_base;
-		cs->iov++;
-		cs->nr_segs--;
+	if (cs->pipebufs) {
+		struct pipe_buffer *buf = cs->pipebufs;
+
+		if (!cs->write) {
+			err = buf->ops->confirm(cs->pipe, buf);
+			if (err)
+				return err;
+
+			BUG_ON(!cs->nr_segs);
+			cs->currbuf = buf;
+			cs->mapaddr = buf->ops->map(cs->pipe, buf, 1);
+			cs->len = buf->len;
+			cs->buf = cs->mapaddr + buf->offset;
+			cs->pipebufs++;
+			cs->nr_segs--;
+		} else {
+			struct page *page;
+
+			if (cs->nr_segs == cs->pipe->buffers)
+				return -EIO;
+
+			page = alloc_page(GFP_HIGHUSER);
+			if (!page)
+				return -ENOMEM;
+
+			buf->page = page;
+			buf->offset = 0;
+			buf->len = 0;
+
+			cs->currbuf = buf;
+			cs->mapaddr = kmap_atomic(page, KM_USER0);
+			cs->buf = cs->mapaddr;
+			cs->len = PAGE_SIZE;
+			cs->pipebufs++;
+			cs->nr_segs++;
+		}
+	} else {
+		if (!cs->seglen) {
+			BUG_ON(!cs->nr_segs);
+			cs->seglen = cs->iov[0].iov_len;
+			cs->addr = (unsigned long) cs->iov[0].iov_base;
+			cs->iov++;
+			cs->nr_segs--;
+		}
+		err = get_user_pages_fast(cs->addr, 1, cs->write, &cs->pg);
+		if (err < 0)
+			return err;
+		BUG_ON(err != 1);
+		offset = cs->addr % PAGE_SIZE;
+		cs->mapaddr = kmap_atomic(cs->pg, KM_USER0);
+		cs->buf = cs->mapaddr + offset;
+		cs->len = min(PAGE_SIZE - offset, cs->seglen);
+		cs->seglen -= cs->len;
+		cs->addr += cs->len;
 	}
-	down_read(&current->mm->mmap_sem);
-	err = get_user_pages(current, current->mm, cs->addr, 1, cs->write, 0,
-			     &cs->pg, NULL);
-	up_read(&current->mm->mmap_sem);
-	if (err < 0)
-		return err;
-	BUG_ON(err != 1);
-	offset = cs->addr % PAGE_SIZE;
-	cs->mapaddr = kmap_atomic(cs->pg, KM_USER0);
-	cs->buf = cs->mapaddr + offset;
-	cs->len = min(PAGE_SIZE - offset, cs->seglen);
-	cs->seglen -= cs->len;
-	cs->addr += cs->len;
 
 	return lock_request(cs->fc, cs->req);
 }
@@ -585,23 +638,178 @@ static int fuse_copy_do(struct fuse_copy_state *cs, void **val, unsigned *size)
 	return ncpy;
 }
 
+static int fuse_check_page(struct page *page)
+{
+	if (page_mapcount(page) ||
+	    page->mapping != NULL ||
+	    page_count(page) != 1 ||
+	    (page->flags & PAGE_FLAGS_CHECK_AT_PREP &
+	     ~(1 << PG_locked |
+	       1 << PG_referenced |
+	       1 << PG_uptodate |
+	       1 << PG_lru |
+	       1 << PG_active |
+	       1 << PG_reclaim))) {
+		printk(KERN_WARNING "fuse: trying to steal weird page\n");
+		printk(KERN_WARNING "  page=%p index=%li flags=%08lx, count=%i, mapcount=%i, mapping=%p\n", page, page->index, page->flags, page_count(page), page_mapcount(page), page->mapping);
+		return 1;
+	}
+	return 0;
+}
+
+static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
+{
+	int err;
+	struct page *oldpage = *pagep;
+	struct page *newpage;
+	struct pipe_buffer *buf = cs->pipebufs;
+	struct address_space *mapping;
+	pgoff_t index;
+
+	unlock_request(cs->fc, cs->req);
+	fuse_copy_finish(cs);
+
+	err = buf->ops->confirm(cs->pipe, buf);
+	if (err)
+		return err;
+
+	BUG_ON(!cs->nr_segs);
+	cs->currbuf = buf;
+	cs->len = buf->len;
+	cs->pipebufs++;
+	cs->nr_segs--;
+
+	if (cs->len != PAGE_SIZE)
+		goto out_fallback;
+
+	if (buf->ops->steal(cs->pipe, buf) != 0)
+		goto out_fallback;
+
+	newpage = buf->page;
+
+	if (WARN_ON(!PageUptodate(newpage)))
+		return -EIO;
+
+	ClearPageMappedToDisk(newpage);
+
+	if (fuse_check_page(newpage) != 0)
+		goto out_fallback_unlock;
+
+	mapping = oldpage->mapping;
+	index = oldpage->index;
+
+	/*
+	 * This is a new and locked page, it shouldn't be mapped or
+	 * have any special flags on it
+	 */
+	if (WARN_ON(page_mapped(oldpage)))
+		goto out_fallback_unlock;
+	if (WARN_ON(page_has_private(oldpage)))
+		goto out_fallback_unlock;
+	if (WARN_ON(PageDirty(oldpage) || PageWriteback(oldpage)))
+		goto out_fallback_unlock;
+	if (WARN_ON(PageMlocked(oldpage)))
+		goto out_fallback_unlock;
+
+	remove_from_page_cache(oldpage);
+	page_cache_release(oldpage);
+
+	err = add_to_page_cache_locked(newpage, mapping, index, GFP_KERNEL);
+	if (err) {
+		printk(KERN_WARNING "fuse_try_move_page: failed to add page");
+		goto out_fallback_unlock;
+	}
+	page_cache_get(newpage);
+
+	if (!(buf->flags & PIPE_BUF_FLAG_LRU))
+		lru_cache_add_file(newpage);
+
+	err = 0;
+	spin_lock(&cs->fc->lock);
+	if (cs->req->aborted)
+		err = -ENOENT;
+	else
+		*pagep = newpage;
+	spin_unlock(&cs->fc->lock);
+
+	if (err) {
+		unlock_page(newpage);
+		page_cache_release(newpage);
+		return err;
+	}
+
+	unlock_page(oldpage);
+	page_cache_release(oldpage);
+	cs->len = 0;
+
+	return 0;
+
+out_fallback_unlock:
+	unlock_page(newpage);
+out_fallback:
+	cs->mapaddr = buf->ops->map(cs->pipe, buf, 1);
+	cs->buf = cs->mapaddr + buf->offset;
+
+	err = lock_request(cs->fc, cs->req);
+	if (err)
+		return err;
+
+	return 1;
+}
+
+static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page,
+			 unsigned offset, unsigned count)
+{
+	struct pipe_buffer *buf;
+
+	if (cs->nr_segs == cs->pipe->buffers)
+		return -EIO;
+
+	unlock_request(cs->fc, cs->req);
+	fuse_copy_finish(cs);
+
+	buf = cs->pipebufs;
+	page_cache_get(page);
+	buf->page = page;
+	buf->offset = offset;
+	buf->len = count;
+
+	cs->pipebufs++;
+	cs->nr_segs++;
+	cs->len = 0;
+
+	return 0;
+}
+
 /*
  * Copy a page in the request to/from the userspace buffer.  Must be
  * done atomically
  */
-static int fuse_copy_page(struct fuse_copy_state *cs, struct page *page,
+static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep,
 			  unsigned offset, unsigned count, int zeroing)
 {
+	int err;
+	struct page *page = *pagep;
+
 	if (page && zeroing && count < PAGE_SIZE) {
 		void *mapaddr = kmap_atomic(page, KM_USER1);
 		memset(mapaddr, 0, PAGE_SIZE);
 		kunmap_atomic(mapaddr, KM_USER1);
 	}
 	while (count) {
-		if (!cs->len) {
-			int err = fuse_copy_fill(cs);
-			if (err)
-				return err;
+		if (cs->write && cs->pipebufs && page) {
+			return fuse_ref_page(cs, page, offset, count);
+		} else if (!cs->len) {
+			if (cs->move_pages && page &&
+			    offset == 0 && count == PAGE_SIZE) {
+				err = fuse_try_move_page(cs, pagep);
+				if (err <= 0)
+					return err;
+			} else {
+				err = fuse_copy_fill(cs);
+				if (err)
+					return err;
+			}
 		}
 		if (page) {
 			void *mapaddr = kmap_atomic(page, KM_USER1);
@@ -626,8 +834,10 @@ static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes,
 	unsigned count = min(nbytes, (unsigned) PAGE_SIZE - offset);
 
 	for (i = 0; i < req->num_pages && (nbytes || zeroing); i++) {
-		struct page *page = req->pages[i];
-		int err = fuse_copy_page(cs, page, offset, count, zeroing);
+		int err;
+
+		err = fuse_copy_page(cs, &req->pages[i], offset, count,
+				     zeroing);
 		if (err)
 			return err;
 
@@ -704,11 +914,10 @@ __acquires(&fc->lock)
  *
  * Called with fc->lock held, releases it
  */
-static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_req *req,
-			       const struct iovec *iov, unsigned long nr_segs)
+static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_copy_state *cs,
+			       size_t nbytes, struct fuse_req *req)
 __releases(&fc->lock)
 {
-	struct fuse_copy_state cs;
 	struct fuse_in_header ih;
 	struct fuse_interrupt_in arg;
 	unsigned reqsize = sizeof(ih) + sizeof(arg);
@@ -724,14 +933,13 @@ __releases(&fc->lock)
 	arg.unique = req->in.h.unique;
 
 	spin_unlock(&fc->lock);
-	if (iov_length(iov, nr_segs) < reqsize)
+	if (nbytes < reqsize)
 		return -EINVAL;
 
-	fuse_copy_init(&cs, fc, 1, NULL, iov, nr_segs);
-	err = fuse_copy_one(&cs, &ih, sizeof(ih));
+	err = fuse_copy_one(cs, &ih, sizeof(ih));
 	if (!err)
-		err = fuse_copy_one(&cs, &arg, sizeof(arg));
-	fuse_copy_finish(&cs);
+		err = fuse_copy_one(cs, &arg, sizeof(arg));
+	fuse_copy_finish(cs);
 
 	return err ? err : reqsize;
 }
@@ -745,18 +953,13 @@ __releases(&fc->lock)
  * request_end().  Otherwise add it to the processing list, and set
  * the 'sent' flag.
  */
-static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
-			      unsigned long nr_segs, loff_t pos)
+static ssize_t fuse_dev_do_read(struct fuse_conn *fc, struct file *file,
+				struct fuse_copy_state *cs, size_t nbytes)
 {
 	int err;
 	struct fuse_req *req;
 	struct fuse_in *in;
-	struct fuse_copy_state cs;
 	unsigned reqsize;
-	struct file *file = iocb->ki_filp;
-	struct fuse_conn *fc = fuse_get_conn(file);
-	if (!fc)
-		return -EPERM;
 
  restart:
 	spin_lock(&fc->lock);
@@ -776,7 +979,7 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
 	if (!list_empty(&fc->interrupts)) {
 		req = list_entry(fc->interrupts.next, struct fuse_req,
 				 intr_entry);
-		return fuse_read_interrupt(fc, req, iov, nr_segs);
+		return fuse_read_interrupt(fc, cs, nbytes, req);
 	}
 
 	req = list_entry(fc->pending.next, struct fuse_req, list);
@@ -786,7 +989,7 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
 	in = &req->in;
 	reqsize = in->h.len;
 	/* If request is too large, reply with an error and restart the read */
-	if (iov_length(iov, nr_segs) < reqsize) {
+	if (nbytes < reqsize) {
 		req->out.h.error = -EIO;
 		/* SETXATTR is special, since it may contain too large data */
 		if (in->h.opcode == FUSE_SETXATTR)
@@ -795,12 +998,12 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
 		goto restart;
 	}
 	spin_unlock(&fc->lock);
-	fuse_copy_init(&cs, fc, 1, req, iov, nr_segs);
-	err = fuse_copy_one(&cs, &in->h, sizeof(in->h));
+	cs->req = req;
+	err = fuse_copy_one(cs, &in->h, sizeof(in->h));
 	if (!err)
-		err = fuse_copy_args(&cs, in->numargs, in->argpages,
+		err = fuse_copy_args(cs, in->numargs, in->argpages,
 				     (struct fuse_arg *) in->args, 0);
-	fuse_copy_finish(&cs);
+	fuse_copy_finish(cs);
 	spin_lock(&fc->lock);
 	req->locked = 0;
 	if (req->aborted) {
@@ -828,6 +1031,110 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
 	return err;
 }
 
+static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
+			      unsigned long nr_segs, loff_t pos)
+{
+	struct fuse_copy_state cs;
+	struct file *file = iocb->ki_filp;
+	struct fuse_conn *fc = fuse_get_conn(file);
+	if (!fc)
+		return -EPERM;
+
+	fuse_copy_init(&cs, fc, 1, iov, nr_segs);
+
+	return fuse_dev_do_read(fc, file, &cs, iov_length(iov, nr_segs));
+}
+
+static int fuse_dev_pipe_buf_steal(struct pipe_inode_info *pipe,
+				   struct pipe_buffer *buf)
+{
+	return 1;
+}
+
+static const struct pipe_buf_operations fuse_dev_pipe_buf_ops = {
+	.can_merge = 0,
+	.map = generic_pipe_buf_map,
+	.unmap = generic_pipe_buf_unmap,
+	.confirm = generic_pipe_buf_confirm,
+	.release = generic_pipe_buf_release,
+	.steal = fuse_dev_pipe_buf_steal,
+	.get = generic_pipe_buf_get,
+};
+
+static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
+				    struct pipe_inode_info *pipe,
+				    size_t len, unsigned int flags)
+{
+	int ret;
+	int page_nr = 0;
+	int do_wakeup = 0;
+	struct pipe_buffer *bufs;
+	struct fuse_copy_state cs;
+	struct fuse_conn *fc = fuse_get_conn(in);
+	if (!fc)
+		return -EPERM;
+
+	bufs = kmalloc(pipe->buffers * sizeof (struct pipe_buffer), GFP_KERNEL);
+	if (!bufs)
+		return -ENOMEM;
+
+	fuse_copy_init(&cs, fc, 1, NULL, 0);
+	cs.pipebufs = bufs;
+	cs.pipe = pipe;
+	ret = fuse_dev_do_read(fc, in, &cs, len);
+	if (ret < 0)
+		goto out;
+
+	ret = 0;
+	pipe_lock(pipe);
+
+	if (!pipe->readers) {
+		send_sig(SIGPIPE, current, 0);
+		if (!ret)
+			ret = -EPIPE;
+		goto out_unlock;
+	}
+
+	if (pipe->nrbufs + cs.nr_segs > pipe->buffers) {
+		ret = -EIO;
+		goto out_unlock;
+	}
+
+	while (page_nr < cs.nr_segs) {
+		int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
+		struct pipe_buffer *buf = pipe->bufs + newbuf;
+
+		buf->page = bufs[page_nr].page;
+		buf->offset = bufs[page_nr].offset;
+		buf->len = bufs[page_nr].len;
+		buf->ops = &fuse_dev_pipe_buf_ops;
+
+		pipe->nrbufs++;
+		page_nr++;
+		ret += buf->len;
+
+		if (pipe->inode)
+			do_wakeup = 1;
+	}
+
+out_unlock:
+	pipe_unlock(pipe);
+
+	if (do_wakeup) {
+		smp_mb();
+		if (waitqueue_active(&pipe->wait))
+			wake_up_interruptible(&pipe->wait);
+		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
+	}
+
+out:
+	for (; page_nr < cs.nr_segs; page_nr++)
+		page_cache_release(bufs[page_nr].page);
+
+	kfree(bufs);
+	return ret;
+}
+
 static int fuse_notify_poll(struct fuse_conn *fc, unsigned int size,
 			    struct fuse_copy_state *cs)
 {
@@ -865,13 +1172,10 @@ static int fuse_notify_inval_inode(struct fuse_conn *fc, unsigned int size,
 
 	down_read(&fc->killsb);
 	err = -ENOENT;
-	if (!fc->sb)
-		goto err_unlock;
-
-	err = fuse_reverse_inval_inode(fc->sb, outarg.ino,
-				       outarg.off, outarg.len);
-
-err_unlock:
+	if (fc->sb) {
+		err = fuse_reverse_inval_inode(fc->sb, outarg.ino,
+					       outarg.off, outarg.len);
+	}
 	up_read(&fc->killsb);
 	return err;
 
@@ -884,10 +1188,15 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size,
 				   struct fuse_copy_state *cs)
 {
 	struct fuse_notify_inval_entry_out outarg;
-	int err = -EINVAL;
-	char buf[FUSE_NAME_MAX+1];
+	int err = -ENOMEM;
+	char *buf;
 	struct qstr name;
 
+	buf = kzalloc(FUSE_NAME_MAX + 1, GFP_KERNEL);
+	if (!buf)
+		goto err;
+
+	err = -EINVAL;
 	if (size < sizeof(outarg))
 		goto err;
 
@@ -910,16 +1219,14 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size,
 
 	down_read(&fc->killsb);
 	err = -ENOENT;
-	if (!fc->sb)
-		goto err_unlock;
-
-	err = fuse_reverse_inval_entry(fc->sb, outarg.parent, &name);
-
-err_unlock:
+	if (fc->sb)
+		err = fuse_reverse_inval_entry(fc->sb, outarg.parent, &name);
 	up_read(&fc->killsb);
+	kfree(buf);
 	return err;
 
 err:
+	kfree(buf);
 	fuse_copy_finish(cs);
 	return err;
 }
@@ -987,23 +1294,17 @@ static int copy_out_args(struct fuse_copy_state *cs, struct fuse_out *out,
  * it from the list and copy the rest of the buffer to the request.
  * The request is finished by calling request_end()
  */
-static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
-			       unsigned long nr_segs, loff_t pos)
+static ssize_t fuse_dev_do_write(struct fuse_conn *fc,
+				 struct fuse_copy_state *cs, size_t nbytes)
 {
 	int err;
-	size_t nbytes = iov_length(iov, nr_segs);
 	struct fuse_req *req;
 	struct fuse_out_header oh;
-	struct fuse_copy_state cs;
-	struct fuse_conn *fc = fuse_get_conn(iocb->ki_filp);
-	if (!fc)
-		return -EPERM;
 
-	fuse_copy_init(&cs, fc, 0, NULL, iov, nr_segs);
 	if (nbytes < sizeof(struct fuse_out_header))
 		return -EINVAL;
 
-	err = fuse_copy_one(&cs, &oh, sizeof(oh));
+	err = fuse_copy_one(cs, &oh, sizeof(oh));
 	if (err)
 		goto err_finish;
 
@@ -1016,7 +1317,7 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
 	 * and error contains notification code.
 	 */
 	if (!oh.unique) {
-		err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), &cs);
+		err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), cs);
 		return err ? err : nbytes;
 	}
 
@@ -1035,7 +1336,7 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
 
 	if (req->aborted) {
 		spin_unlock(&fc->lock);
-		fuse_copy_finish(&cs);
+		fuse_copy_finish(cs);
 		spin_lock(&fc->lock);
 		request_end(fc, req);
 		return -ENOENT;
@@ -1052,7 +1353,7 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
 			queue_interrupt(fc, req);
 
 		spin_unlock(&fc->lock);
-		fuse_copy_finish(&cs);
+		fuse_copy_finish(cs);
 		return nbytes;
 	}
 
@@ -1060,11 +1361,13 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
 	list_move(&req->list, &fc->io);
 	req->out.h = oh;
 	req->locked = 1;
-	cs.req = req;
+	cs->req = req;
+	if (!req->out.page_replace)
+		cs->move_pages = 0;
 	spin_unlock(&fc->lock);
 
-	err = copy_out_args(&cs, &req->out, nbytes);
-	fuse_copy_finish(&cs);
+	err = copy_out_args(cs, &req->out, nbytes);
+	fuse_copy_finish(cs);
 
 	spin_lock(&fc->lock);
 	req->locked = 0;
@@ -1080,10 +1383,101 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
  err_unlock:
 	spin_unlock(&fc->lock);
  err_finish:
-	fuse_copy_finish(&cs);
+	fuse_copy_finish(cs);
 	return err;
 }
 
+static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
+			      unsigned long nr_segs, loff_t pos)
+{
+	struct fuse_copy_state cs;
+	struct fuse_conn *fc = fuse_get_conn(iocb->ki_filp);
+	if (!fc)
+		return -EPERM;
+
+	fuse_copy_init(&cs, fc, 0, iov, nr_segs);
+
+	return fuse_dev_do_write(fc, &cs, iov_length(iov, nr_segs));
+}
+
+static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
+				     struct file *out, loff_t *ppos,
+				     size_t len, unsigned int flags)
+{
+	unsigned nbuf;
+	unsigned idx;
+	struct pipe_buffer *bufs;
+	struct fuse_copy_state cs;
+	struct fuse_conn *fc;
+	size_t rem;
+	ssize_t ret;
+
+	fc = fuse_get_conn(out);
+	if (!fc)
+		return -EPERM;
+
+	bufs = kmalloc(pipe->buffers * sizeof (struct pipe_buffer), GFP_KERNEL);
+	if (!bufs)
+		return -ENOMEM;
+
+	pipe_lock(pipe);
+	nbuf = 0;
+	rem = 0;
+	for (idx = 0; idx < pipe->nrbufs && rem < len; idx++)
+		rem += pipe->bufs[(pipe->curbuf + idx) & (pipe->buffers - 1)].len;
+
+	ret = -EINVAL;
+	if (rem < len) {
+		pipe_unlock(pipe);
+		goto out;
+	}
+
+	rem = len;
+	while (rem) {
+		struct pipe_buffer *ibuf;
+		struct pipe_buffer *obuf;
+
+		BUG_ON(nbuf >= pipe->buffers);
+		BUG_ON(!pipe->nrbufs);
+		ibuf = &pipe->bufs[pipe->curbuf];
+		obuf = &bufs[nbuf];
+
+		if (rem >= ibuf->len) {
+			*obuf = *ibuf;
+			ibuf->ops = NULL;
+			pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
+			pipe->nrbufs--;
+		} else {
+			ibuf->ops->get(pipe, ibuf);
+			*obuf = *ibuf;
+			obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
+			obuf->len = rem;
+			ibuf->offset += obuf->len;
+			ibuf->len -= obuf->len;
+		}
+		nbuf++;
+		rem -= obuf->len;
+	}
+	pipe_unlock(pipe);
+
+	fuse_copy_init(&cs, fc, 0, NULL, nbuf);
+	cs.pipebufs = bufs;
+	cs.pipe = pipe;
+
+	if (flags & SPLICE_F_MOVE)
+		cs.move_pages = 1;
+
+	ret = fuse_dev_do_write(fc, &cs, len);
+
+	for (idx = 0; idx < nbuf; idx++) {
+		struct pipe_buffer *buf = &bufs[idx];
+		buf->ops->release(pipe, buf);
+	}
+out:
+	kfree(bufs);
+	return ret;
+}
+
 static unsigned fuse_dev_poll(struct file *file, poll_table *wait)
 {
 	unsigned mask = POLLOUT | POLLWRNORM;
@@ -1225,8 +1619,10 @@ const struct file_operations fuse_dev_operations = {
 	.llseek		= no_llseek,
 	.read		= do_sync_read,
 	.aio_read	= fuse_dev_read,
+	.splice_read	= fuse_dev_splice_read,
 	.write		= do_sync_write,
 	.aio_write	= fuse_dev_write,
+	.splice_write	= fuse_dev_splice_write,
 	.poll		= fuse_dev_poll,
 	.release	= fuse_dev_release,
 	.fasync		= fuse_dev_fasync,
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 4787ae6c5c1c..3cdc5f78a406 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1156,10 +1156,9 @@ static int fuse_dir_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
-static int fuse_dir_fsync(struct file *file, struct dentry *de, int datasync)
+static int fuse_dir_fsync(struct file *file, int datasync)
 {
-	/* nfsd can call this with no file */
-	return file ? fuse_fsync_common(file, de, datasync, 1) : 0;
+	return fuse_fsync_common(file, datasync, 1);
 }
 
 static bool update_mtime(unsigned ivalid)
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index c18913a777ae..ada0adeb3bb5 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -351,10 +351,9 @@ static void fuse_sync_writes(struct inode *inode)
 	fuse_release_nowrite(inode);
 }
 
-int fuse_fsync_common(struct file *file, struct dentry *de, int datasync,
-		      int isdir)
+int fuse_fsync_common(struct file *file, int datasync, int isdir)
 {
-	struct inode *inode = de->d_inode;
+	struct inode *inode = file->f_mapping->host;
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	struct fuse_file *ff = file->private_data;
 	struct fuse_req *req;
@@ -403,9 +402,9 @@ int fuse_fsync_common(struct file *file, struct dentry *de, int datasync,
 	return err;
 }
 
-static int fuse_fsync(struct file *file, struct dentry *de, int datasync)
+static int fuse_fsync(struct file *file, int datasync)
 {
-	return fuse_fsync_common(file, de, datasync, 0);
+	return fuse_fsync_common(file, datasync, 0);
 }
 
 void fuse_read_fill(struct fuse_req *req, struct file *file, loff_t pos,
@@ -517,17 +516,26 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
 	int i;
 	size_t count = req->misc.read.in.size;
 	size_t num_read = req->out.args[0].size;
-	struct inode *inode = req->pages[0]->mapping->host;
+	struct address_space *mapping = NULL;
 
-	/*
-	 * Short read means EOF.  If file size is larger, truncate it
-	 */
-	if (!req->out.h.error && num_read < count) {
-		loff_t pos = page_offset(req->pages[0]) + num_read;
-		fuse_read_update_size(inode, pos, req->misc.read.attr_ver);
-	}
+	for (i = 0; mapping == NULL && i < req->num_pages; i++)
+		mapping = req->pages[i]->mapping;
 
-	fuse_invalidate_attr(inode); /* atime changed */
+	if (mapping) {
+		struct inode *inode = mapping->host;
+
+		/*
+		 * Short read means EOF. If file size is larger, truncate it
+		 */
+		if (!req->out.h.error && num_read < count) {
+			loff_t pos;
+
+			pos = page_offset(req->pages[0]) + num_read;
+			fuse_read_update_size(inode, pos,
+					      req->misc.read.attr_ver);
+		}
+		fuse_invalidate_attr(inode); /* atime changed */
+	}
 
 	for (i = 0; i < req->num_pages; i++) {
 		struct page *page = req->pages[i];
@@ -536,6 +544,7 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
 		else
 			SetPageError(page);
 		unlock_page(page);
+		page_cache_release(page);
 	}
 	if (req->ff)
 		fuse_file_put(req->ff);
@@ -550,6 +559,7 @@ static void fuse_send_readpages(struct fuse_req *req, struct file *file)
 
 	req->out.argpages = 1;
 	req->out.page_zeroing = 1;
+	req->out.page_replace = 1;
 	fuse_read_fill(req, file, pos, count, FUSE_READ);
 	req->misc.read.attr_ver = fuse_get_attr_version(fc);
 	if (fc->async_read) {
@@ -589,6 +599,7 @@ static int fuse_readpages_fill(void *_data, struct page *page)
 			return PTR_ERR(req);
 		}
 	}
+	page_cache_get(page);
 	req->pages[req->num_pages] = page;
 	req->num_pages++;
 	return 0;
@@ -828,6 +839,9 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,
 		if (!page)
 			break;
 
+		if (mapping_writably_mapped(mapping))
+			flush_dcache_page(page);
+
 		pagefault_disable();
 		tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes);
 		pagefault_enable();
@@ -991,10 +1005,7 @@ static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf,
 	nbytes = min_t(size_t, nbytes, FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT);
 	npages = (nbytes + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	npages = clamp(npages, 1, FUSE_MAX_PAGES_PER_REQ);
-	down_read(&current->mm->mmap_sem);
-	npages = get_user_pages(current, current->mm, user_addr, npages, !write,
-				0, req->pages, NULL);
-	up_read(&current->mm->mmap_sem);
+	npages = get_user_pages_fast(user_addr, npages, !write, req->pages);
 	if (npages < 0)
 		return npages;
 
@@ -1577,9 +1588,9 @@ static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov,
 	while (iov_iter_count(&ii)) {
 		struct page *page = pages[page_idx++];
 		size_t todo = min_t(size_t, PAGE_SIZE, iov_iter_count(&ii));
-		void *kaddr, *map;
+		void *kaddr;
 
-		kaddr = map = kmap(page);
+		kaddr = kmap(page);
 
 		while (todo) {
 			char __user *uaddr = ii.iov->iov_base + ii.iov_offset;
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 01cc462ff45d..8f309f04064e 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -177,6 +177,9 @@ struct fuse_out {
 	/** Zero partially or not copied pages */
 	unsigned page_zeroing:1;
 
+	/** Pages may be replaced with new ones */
+	unsigned page_replace:1;
+
 	/** Number or arguments */
 	unsigned numargs;
 
@@ -568,8 +571,7 @@ void fuse_release_common(struct file *file, int opcode);
 /**
  * Send FSYNC or FSYNCDIR request
  */
-int fuse_fsync_common(struct file *file, struct dentry *de, int datasync,
-		      int isdir);
+int fuse_fsync_common(struct file *file, int datasync, int isdir);
 
 /**
  * Notify poll wakeup
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 1a822ce2b24b..ec14d19ce501 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -850,7 +850,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
 	req->in.args[0].size = sizeof(*arg);
 	req->in.args[0].value = arg;
 	req->out.numargs = 1;
-	/* Variable length arguement used for backward compatibility
+	/* Variable length argument used for backward compatibility
 	   with interface version < 7.5.  Rest of init_out is zeroed
 	   by do_get_request(), so a short reply is not a problem */
 	req->out.argvar = 1;
diff --git a/fs/generic_acl.c b/fs/generic_acl.c
index 55458031e501..99800e564157 100644
--- a/fs/generic_acl.c
+++ b/fs/generic_acl.c
@@ -7,6 +7,7 @@
  */
 
 #include <linux/sched.h>
+#include <linux/gfp.h>
 #include <linux/fs.h>
 #include <linux/generic_acl.h>
 #include <linux/posix_acl.h>
@@ -200,7 +201,7 @@ generic_check_acl(struct inode *inode, int mask)
 	return -EAGAIN;
 }
 
-struct xattr_handler generic_acl_access_handler = {
+const struct xattr_handler generic_acl_access_handler = {
 	.prefix = POSIX_ACL_XATTR_ACCESS,
 	.flags	= ACL_TYPE_ACCESS,
 	.list	= generic_acl_list,
@@ -208,7 +209,7 @@ struct xattr_handler generic_acl_access_handler = {
 	.set	= generic_acl_set,
 };
 
-struct xattr_handler generic_acl_default_handler = {
+const struct xattr_handler generic_acl_default_handler = {
 	.prefix = POSIX_ACL_XATTR_DEFAULT,
 	.flags	= ACL_TYPE_DEFAULT,
 	.list	= generic_acl_list,
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index 4dcddf83326f..a47b43107112 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -8,7 +8,6 @@ config GFS2_FS
 	select FS_POSIX_ACL
 	select CRC32
 	select SLOW_WORK
-	select QUOTA
 	select QUOTACTL
 	help
 	  A cluster filesystem.
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 87ee309d4c24..48171f4c943d 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -236,10 +236,14 @@ static int gfs2_xattr_system_get(struct dentry *dentry, const char *name,
 				 void *buffer, size_t size, int xtype)
 {
 	struct inode *inode = dentry->d_inode;
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
 	struct posix_acl *acl;
 	int type;
 	int error;
 
+	if (!sdp->sd_args.ar_posix_acl)
+		return -EOPNOTSUPP;
+
 	type = gfs2_acl_type(name);
 	if (type < 0)
 		return type;
@@ -335,7 +339,7 @@ out:
 	return error;
 }
 
-struct xattr_handler gfs2_xattr_system_handler = {
+const struct xattr_handler gfs2_xattr_system_handler = {
 	.prefix = XATTR_SYSTEM_PREFIX,
 	.flags  = GFS2_EATYPE_SYS,
 	.get    = gfs2_xattr_system_get,
diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h
index 9306a2e6620c..b522b0cb39ea 100644
--- a/fs/gfs2/acl.h
+++ b/fs/gfs2/acl.h
@@ -19,6 +19,6 @@
 extern int gfs2_check_acl(struct inode *inode, int mask);
 extern int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode);
 extern int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr);
-extern struct xattr_handler gfs2_xattr_system_handler;
+extern const struct xattr_handler gfs2_xattr_system_handler;
 
 #endif /* __ACL_DOT_H__ */
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 7b8da9415267..9f8b52500d63 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -418,6 +418,7 @@ static int gfs2_jdata_writepages(struct address_space *mapping,
 static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
 {
 	struct buffer_head *dibh;
+	u64 dsize = i_size_read(&ip->i_inode);
 	void *kaddr;
 	int error;
 
@@ -437,9 +438,10 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
 		return error;
 
 	kaddr = kmap_atomic(page, KM_USER0);
-	memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode),
-	       ip->i_disksize);
-	memset(kaddr + ip->i_disksize, 0, PAGE_CACHE_SIZE - ip->i_disksize);
+	if (dsize > (dibh->b_size - sizeof(struct gfs2_dinode)))
+		dsize = (dibh->b_size - sizeof(struct gfs2_dinode));
+	memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize);
+	memset(kaddr + dsize, 0, PAGE_CACHE_SIZE - dsize);
 	kunmap_atomic(kaddr, KM_USER0);
 	flush_dcache_page(page);
 	brelse(dibh);
@@ -698,8 +700,14 @@ out:
 		return 0;
 
 	page_cache_release(page);
+
+	/*
+	 * XXX(hch): the call below should probably be replaced with
+	 * a call to the gfs2-specific truncate blocks helper to actually
+	 * release disk blocks..
+	 */
 	if (pos + len > ip->i_inode.i_size)
-		vmtruncate(&ip->i_inode, ip->i_inode.i_size);
+		simple_setsize(&ip->i_inode, ip->i_inode.i_size);
 out_endtrans:
 	gfs2_trans_end(sdp);
 out_trans_fail:
@@ -1061,8 +1069,8 @@ out:
 
 int gfs2_releasepage(struct page *page, gfp_t gfp_mask)
 {
-	struct inode *aspace = page->mapping->host;
-	struct gfs2_sbd *sdp = aspace->i_sb->s_fs_info;
+	struct address_space *mapping = page->mapping;
+	struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping);
 	struct buffer_head *bh, *head;
 	struct gfs2_bufdata *bd;
 
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 6d47379e794b..84da64b551b2 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -7,7 +7,6 @@
  * of the GNU General Public License version 2.
  */
 
-#include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/completion.h>
 #include <linux/buffer_head.h>
@@ -72,11 +71,13 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
 
 	if (!PageUptodate(page)) {
 		void *kaddr = kmap(page);
+		u64 dsize = i_size_read(inode);
+ 
+		if (dsize > (dibh->b_size - sizeof(struct gfs2_dinode)))
+			dsize = dibh->b_size - sizeof(struct gfs2_dinode);
 
-		memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode),
-		       ip->i_disksize);
-		memset(kaddr + ip->i_disksize, 0,
-		       PAGE_CACHE_SIZE - ip->i_disksize);
+		memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize);
+		memset(kaddr + dsize, 0, PAGE_CACHE_SIZE - dsize);
 		kunmap(page);
 
 		SetPageUptodate(page);
@@ -541,7 +542,7 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,
 				*ptr++ = cpu_to_be64(bn++);
 			break;
 		}
-	} while (state != ALLOC_DATA);
+	} while ((state != ALLOC_DATA) || !dblock);
 
 	ip->i_height = height;
 	gfs2_add_inode_blocks(&ip->i_inode, alloced);
@@ -1039,13 +1040,15 @@ static int trunc_start(struct gfs2_inode *ip, u64 size)
 		goto out;
 
 	if (gfs2_is_stuffed(ip)) {
+		u64 dsize = size + sizeof(struct gfs2_inode);
 		ip->i_disksize = size;
 		ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
 		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
 		gfs2_dinode_out(ip, dibh->b_data);
-		gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + size);
+		if (dsize > dibh->b_size)
+			dsize = dibh->b_size;
+		gfs2_buffer_clear_tail(dibh, dsize);
 		error = 1;
-
 	} else {
 		if (size & (u64)(sdp->sd_sb.sb_bsize - 1))
 			error = gfs2_block_truncate_page(ip->i_inode.i_mapping);
diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c
index 91beddadd388..bb7907bde3d8 100644
--- a/fs/gfs2/dentry.c
+++ b/fs/gfs2/dentry.c
@@ -7,7 +7,6 @@
  * of the GNU General Public License version 2.
  */
 
-#include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/completion.h>
 #include <linux/buffer_head.h>
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 25fddc100f18..6b48d7c268b2 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -392,7 +392,7 @@ static int gfs2_dirent_find_space(const struct gfs2_dirent *dent,
 	unsigned totlen = be16_to_cpu(dent->de_rec_len);
 
 	if (gfs2_dirent_sentinel(dent))
-		actual = GFS2_DIRENT_SIZE(0);
+		actual = 0;
 	if (totlen - actual >= required)
 		return 1;
 	return 0;
@@ -1231,6 +1231,25 @@ static int do_filldir_main(struct gfs2_inode *dip, u64 *offset,
 	return 0;
 }
 
+static void *gfs2_alloc_sort_buffer(unsigned size)
+{
+	void *ptr = NULL;
+
+	if (size < KMALLOC_MAX_SIZE)
+		ptr = kmalloc(size, GFP_NOFS | __GFP_NOWARN);
+	if (!ptr)
+		ptr = __vmalloc(size, GFP_NOFS, PAGE_KERNEL);
+	return ptr;
+}
+
+static void gfs2_free_sort_buffer(void *ptr)
+{
+	if (is_vmalloc_addr(ptr))
+		vfree(ptr);
+	else
+		kfree(ptr);
+}
+
 static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
 			      filldir_t filldir, int *copied, unsigned *depth,
 			      u64 leaf_no)
@@ -1271,7 +1290,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
 	 * 99 is the maximum number of entries that can fit in a single
 	 * leaf block.
 	 */
-	larr = vmalloc((leaves + entries + 99) * sizeof(void *));
+	larr = gfs2_alloc_sort_buffer((leaves + entries + 99) * sizeof(void *));
 	if (!larr)
 		goto out;
 	darr = (const struct gfs2_dirent **)(larr + leaves);
@@ -1282,7 +1301,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
 	do {
 		error = get_leaf(ip, lfn, &bh);
 		if (error)
-			goto out_kfree;
+			goto out_free;
 		lf = (struct gfs2_leaf *)bh->b_data;
 		lfn = be64_to_cpu(lf->lf_next);
 		if (lf->lf_entries) {
@@ -1291,7 +1310,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
 						gfs2_dirent_gather, NULL, &g);
 			error = PTR_ERR(dent);
 			if (IS_ERR(dent))
-				goto out_kfree;
+				goto out_free;
 			if (entries2 != g.offset) {
 				fs_warn(sdp, "Number of entries corrupt in dir "
 						"leaf %llu, entries2 (%u) != "
@@ -1300,7 +1319,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
 					entries2, g.offset);
 					
 				error = -EIO;
-				goto out_kfree;
+				goto out_free;
 			}
 			error = 0;
 			larr[leaf++] = bh;
@@ -1312,10 +1331,10 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
 	BUG_ON(entries2 != entries);
 	error = do_filldir_main(ip, offset, opaque, filldir, darr,
 				entries, copied);
-out_kfree:
+out_free:
 	for(i = 0; i < leaf; i++)
 		brelse(larr[i]);
-	vfree(larr);
+	gfs2_free_sort_buffer(larr);
 out:
 	return error;
 }
@@ -1475,7 +1494,7 @@ struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name)
 		inode = gfs2_inode_lookup(dir->i_sb, 
 				be16_to_cpu(dent->de_type),
 				be64_to_cpu(dent->de_inum.no_addr),
-				be64_to_cpu(dent->de_inum.no_formal_ino), 0);
+				be64_to_cpu(dent->de_inum.no_formal_ino));
 		brelse(bh);
 		return inode;
 	}
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index d15876e9aa26..dfe237a3f8ad 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -7,7 +7,6 @@
  * of the GNU General Public License version 2.
  */
 
-#include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/completion.h>
 #include <linux/buffer_head.h>
@@ -169,7 +168,7 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb,
 	if (error)
 		goto fail;
 
-	inode = gfs2_inode_lookup(sb, DT_UNKNOWN, inum->no_addr, 0, 0);
+	inode = gfs2_inode_lookup(sb, DT_UNKNOWN, inum->no_addr, 0);
 	if (IS_ERR(inode)) {
 		error = PTR_ERR(inode);
 		goto fail;
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index a6abbae8a278..ed9a94f0ef15 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -218,6 +218,11 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
 	if (error)
 		goto out_drop_write;
 
+	error = -EACCES;
+	if (!is_owner_or_cap(inode))
+		goto out;
+
+	error = 0;
 	flags = ip->i_diskflags;
 	new_flags = (flags & ~mask) | (reqflags & mask);
 	if ((new_flags ^ flags) == 0)
@@ -275,8 +280,10 @@ static int gfs2_set_flags(struct file *filp, u32 __user *ptr)
 {
 	struct inode *inode = filp->f_path.dentry->d_inode;
 	u32 fsflags, gfsflags;
+
 	if (get_user(fsflags, ptr))
 		return -EFAULT;
+
 	gfsflags = fsflags_cvt(fsflags_to_gfs2, fsflags);
 	if (!S_ISDIR(inode->i_mode)) {
 		if (gfsflags & GFS2_DIF_INHERIT_JDATA)
@@ -547,9 +554,9 @@ static int gfs2_close(struct inode *inode, struct file *file)
  * Returns: errno
  */
 
-static int gfs2_fsync(struct file *file, struct dentry *dentry, int datasync)
+static int gfs2_fsync(struct file *file, int datasync)
 {
-	struct inode *inode = dentry->d_inode;
+	struct inode *inode = file->f_mapping->host;
 	int sync_state = inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC);
 	int ret = 0;
 
@@ -640,7 +647,7 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl)
 
 	if (!(fl->fl_flags & FL_POSIX))
 		return -ENOLCK;
-	if (__mandatory_lock(&ip->i_inode))
+	if (__mandatory_lock(&ip->i_inode) && fl->fl_type != F_UNLCK)
 		return -ENOLCK;
 
 	if (cmd == F_CANCELLK) {
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index f455a03a09e2..0898f3ec8212 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -19,7 +19,6 @@
 #include <linux/list.h>
 #include <linux/wait.h>
 #include <linux/module.h>
-#include <linux/rwsem.h>
 #include <asm/uaccess.h>
 #include <linux/seq_file.h>
 #include <linux/debugfs.h>
@@ -60,7 +59,6 @@ static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl);
 #define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { __dump_glock(NULL, gl); BUG(); } } while(0)
 static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target);
 
-static DECLARE_RWSEM(gfs2_umount_flush_sem);
 static struct dentry *gfs2_root;
 static struct workqueue_struct *glock_workqueue;
 struct workqueue_struct *gfs2_delete_workqueue;
@@ -154,12 +152,14 @@ static unsigned int gl_hash(const struct gfs2_sbd *sdp,
 static void glock_free(struct gfs2_glock *gl)
 {
 	struct gfs2_sbd *sdp = gl->gl_sbd;
-	struct inode *aspace = gl->gl_aspace;
+	struct address_space *mapping = gfs2_glock2aspace(gl);
+	struct kmem_cache *cachep = gfs2_glock_cachep;
 
-	if (aspace)
-		gfs2_aspace_put(aspace);
+	GLOCK_BUG_ON(gl, mapping && mapping->nrpages);
 	trace_gfs2_glock_put(gl);
-	sdp->sd_lockstruct.ls_ops->lm_put_lock(gfs2_glock_cachep, gl);
+	if (mapping)
+		cachep = gfs2_glock_aspace_cachep;
+	sdp->sd_lockstruct.ls_ops->lm_put_lock(cachep, gl);
 }
 
 /**
@@ -706,13 +706,22 @@ static void glock_work_func(struct work_struct *work)
 {
 	unsigned long delay = 0;
 	struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_work.work);
+	struct gfs2_holder *gh;
 	int drop_ref = 0;
 
+	if (unlikely(test_bit(GLF_FROZEN, &gl->gl_flags))) {
+		spin_lock(&gl->gl_spin);
+		gh = find_first_waiter(gl);
+		if (gh && (gh->gh_flags & LM_FLAG_NOEXP) &&
+		    test_and_clear_bit(GLF_FROZEN, &gl->gl_flags))
+			set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
+		spin_unlock(&gl->gl_spin);
+	}
+
 	if (test_and_clear_bit(GLF_REPLY_PENDING, &gl->gl_flags)) {
 		finish_xmote(gl, gl->gl_reply);
 		drop_ref = 1;
 	}
-	down_read(&gfs2_umount_flush_sem);
 	spin_lock(&gl->gl_spin);
 	if (test_and_clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
 	    gl->gl_state != LM_ST_UNLOCKED &&
@@ -725,7 +734,6 @@ static void glock_work_func(struct work_struct *work)
 	}
 	run_queue(gl, 0);
 	spin_unlock(&gl->gl_spin);
-	up_read(&gfs2_umount_flush_sem);
 	if (!delay ||
 	    queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0)
 		gfs2_glock_put(gl);
@@ -750,10 +758,11 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
 		   const struct gfs2_glock_operations *glops, int create,
 		   struct gfs2_glock **glp)
 {
+	struct super_block *s = sdp->sd_vfs;
 	struct lm_lockname name = { .ln_number = number, .ln_type = glops->go_type };
 	struct gfs2_glock *gl, *tmp;
 	unsigned int hash = gl_hash(sdp, &name);
-	int error;
+	struct address_space *mapping;
 
 	read_lock(gl_lock_addr(hash));
 	gl = search_bucket(hash, sdp, &name);
@@ -765,10 +774,14 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
 	if (!create)
 		return -ENOENT;
 
-	gl = kmem_cache_alloc(gfs2_glock_cachep, GFP_KERNEL);
+	if (glops->go_flags & GLOF_ASPACE)
+		gl = kmem_cache_alloc(gfs2_glock_aspace_cachep, GFP_KERNEL);
+	else
+		gl = kmem_cache_alloc(gfs2_glock_cachep, GFP_KERNEL);
 	if (!gl)
 		return -ENOMEM;
 
+	atomic_inc(&sdp->sd_glock_disposal);
 	gl->gl_flags = 0;
 	gl->gl_name = name;
 	atomic_set(&gl->gl_ref, 1);
@@ -783,18 +796,18 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
 	gl->gl_tchange = jiffies;
 	gl->gl_object = NULL;
 	gl->gl_sbd = sdp;
-	gl->gl_aspace = NULL;
 	INIT_DELAYED_WORK(&gl->gl_work, glock_work_func);
 	INIT_WORK(&gl->gl_delete, delete_work_func);
 
-	/* If this glock protects actual on-disk data or metadata blocks,
-	   create a VFS inode to manage the pages/buffers holding them. */
-	if (glops == &gfs2_inode_glops || glops == &gfs2_rgrp_glops) {
-		gl->gl_aspace = gfs2_aspace_get(sdp);
-		if (!gl->gl_aspace) {
-			error = -ENOMEM;
-			goto fail;
-		}
+	mapping = gfs2_glock2aspace(gl);
+	if (mapping) {
+                mapping->a_ops = &gfs2_meta_aops;
+		mapping->host = s->s_bdev->bd_inode;
+		mapping->flags = 0;
+		mapping_set_gfp_mask(mapping, GFP_NOFS);
+		mapping->assoc_mapping = NULL;
+		mapping->backing_dev_info = s->s_bdi;
+		mapping->writeback_index = 0;
 	}
 
 	write_lock(gl_lock_addr(hash));
@@ -811,10 +824,6 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
 	*glp = gl;
 
 	return 0;
-
-fail:
-	kmem_cache_free(gfs2_glock_cachep, gl);
-	return error;
 }
 
 /**
@@ -856,6 +865,9 @@ void gfs2_holder_reinit(unsigned int state, unsigned flags, struct gfs2_holder *
 	gh->gh_flags = flags;
 	gh->gh_iflags = 0;
 	gh->gh_ip = (unsigned long)__builtin_return_address(0);
+	if (gh->gh_owner_pid)
+		put_pid(gh->gh_owner_pid);
+	gh->gh_owner_pid = get_pid(task_pid(current));
 }
 
 /**
@@ -1346,7 +1358,7 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
 }
 
 
-static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask)
+static int gfs2_shrink_glock_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
 {
 	struct gfs2_glock *gl;
 	int may_demote;
@@ -1509,35 +1521,13 @@ void gfs2_glock_thaw(struct gfs2_sbd *sdp)
 
 void gfs2_gl_hash_clear(struct gfs2_sbd *sdp)
 {
-	unsigned long t;
 	unsigned int x;
-	int cont;
-
-	t = jiffies;
 
-	for (;;) {
-		cont = 0;
-		for (x = 0; x < GFS2_GL_HASH_SIZE; x++) {
-			if (examine_bucket(clear_glock, sdp, x))
-				cont = 1;
-		}
-
-		if (!cont)
-			break;
-
-		if (time_after_eq(jiffies,
-				  t + gfs2_tune_get(sdp, gt_stall_secs) * HZ)) {
-			fs_warn(sdp, "Unmount seems to be stalled. "
-				     "Dumping lock state...\n");
-			gfs2_dump_lockstate(sdp);
-			t = jiffies;
-		}
-
-		down_write(&gfs2_umount_flush_sem);
-		invalidate_inodes(sdp->sd_vfs);
-		up_write(&gfs2_umount_flush_sem);
-		msleep(10);
-	}
+	for (x = 0; x < GFS2_GL_HASH_SIZE; x++)
+		examine_bucket(clear_glock, sdp, x);
+	flush_workqueue(glock_workqueue);
+	wait_event(sdp->sd_glock_wait, atomic_read(&sdp->sd_glock_disposal) == 0);
+	gfs2_dump_lockstate(sdp);
 }
 
 void gfs2_glock_finish_truncate(struct gfs2_inode *ip)
@@ -1681,7 +1671,7 @@ static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl)
 	dtime *= 1000000/HZ; /* demote time in uSec */
 	if (!test_bit(GLF_DEMOTE, &gl->gl_flags))
 		dtime = 0;
-	gfs2_print_dbg(seq, "G:  s:%s n:%u/%llu f:%s t:%s d:%s/%llu a:%d r:%d\n",
+	gfs2_print_dbg(seq, "G:  s:%s n:%u/%llx f:%s t:%s d:%s/%llu a:%d r:%d\n",
 		  state2str(gl->gl_state),
 		  gl->gl_name.ln_type,
 		  (unsigned long long)gl->gl_name.ln_number,
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 13f0bd228132..2bda1911b156 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -123,7 +123,7 @@ struct lm_lockops {
 	int (*lm_mount) (struct gfs2_sbd *sdp, const char *fsname);
  	void (*lm_unmount) (struct gfs2_sbd *sdp);
 	void (*lm_withdraw) (struct gfs2_sbd *sdp);
-	void (*lm_put_lock) (struct kmem_cache *cachep, void *gl);
+	void (*lm_put_lock) (struct kmem_cache *cachep, struct gfs2_glock *gl);
 	unsigned int (*lm_lock) (struct gfs2_glock *gl,
 				 unsigned int req_state, unsigned int flags);
 	void (*lm_cancel) (struct gfs2_glock *gl);
@@ -180,6 +180,13 @@ static inline int gfs2_glock_is_held_shrd(struct gfs2_glock *gl)
 	return gl->gl_state == LM_ST_SHARED;
 }
 
+static inline struct address_space *gfs2_glock2aspace(struct gfs2_glock *gl)
+{
+	if (gl->gl_ops->go_flags & GLOF_ASPACE)
+		return (struct address_space *)(gl + 1);
+	return NULL;
+}
+
 int gfs2_glock_get(struct gfs2_sbd *sdp,
 		   u64 number, const struct gfs2_glock_operations *glops,
 		   int create, struct gfs2_glock **glp);
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 78554acc0605..49f97d3bb690 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -7,7 +7,6 @@
  * of the GNU General Public License version 2.
  */
 
-#include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/completion.h>
 #include <linux/buffer_head.h>
@@ -87,7 +86,7 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
 
 static void rgrp_go_sync(struct gfs2_glock *gl)
 {
-	struct address_space *metamapping = gl->gl_aspace->i_mapping;
+	struct address_space *metamapping = gfs2_glock2aspace(gl);
 	int error;
 
 	if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags))
@@ -113,7 +112,7 @@ static void rgrp_go_sync(struct gfs2_glock *gl)
 
 static void rgrp_go_inval(struct gfs2_glock *gl, int flags)
 {
-	struct address_space *mapping = gl->gl_aspace->i_mapping;
+	struct address_space *mapping = gfs2_glock2aspace(gl);
 
 	BUG_ON(!(flags & DIO_METADATA));
 	gfs2_assert_withdraw(gl->gl_sbd, !atomic_read(&gl->gl_ail_count));
@@ -134,7 +133,7 @@ static void rgrp_go_inval(struct gfs2_glock *gl, int flags)
 static void inode_go_sync(struct gfs2_glock *gl)
 {
 	struct gfs2_inode *ip = gl->gl_object;
-	struct address_space *metamapping = gl->gl_aspace->i_mapping;
+	struct address_space *metamapping = gfs2_glock2aspace(gl);
 	int error;
 
 	if (ip && !S_ISREG(ip->i_inode.i_mode))
@@ -183,7 +182,7 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags)
 	gfs2_assert_withdraw(gl->gl_sbd, !atomic_read(&gl->gl_ail_count));
 
 	if (flags & DIO_METADATA) {
-		struct address_space *mapping = gl->gl_aspace->i_mapping;
+		struct address_space *mapping = gfs2_glock2aspace(gl);
 		truncate_inode_pages(mapping, 0);
 		if (ip) {
 			set_bit(GIF_INVALID, &ip->i_flags);
@@ -282,7 +281,8 @@ static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
 
 static int rgrp_go_demote_ok(const struct gfs2_glock *gl)
 {
-	return !gl->gl_aspace->i_mapping->nrpages;
+	const struct address_space *mapping = (const struct address_space *)(gl + 1);
+	return !mapping->nrpages;
 }
 
 /**
@@ -387,8 +387,7 @@ static void iopen_go_callback(struct gfs2_glock *gl)
 	struct gfs2_inode *ip = (struct gfs2_inode *)gl->gl_object;
 
 	if (gl->gl_demote_state == LM_ST_UNLOCKED &&
-	    gl->gl_state == LM_ST_SHARED &&
-	    ip && test_bit(GIF_USER, &ip->i_flags)) {
+	    gl->gl_state == LM_ST_SHARED && ip) {
 		gfs2_glock_hold(gl);
 		if (queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0)
 			gfs2_glock_put_nolock(gl);
@@ -407,6 +406,7 @@ const struct gfs2_glock_operations gfs2_inode_glops = {
 	.go_dump = inode_go_dump,
 	.go_type = LM_TYPE_INODE,
 	.go_min_hold_time = HZ / 5,
+	.go_flags = GLOF_ASPACE,
 };
 
 const struct gfs2_glock_operations gfs2_rgrp_glops = {
@@ -418,6 +418,7 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = {
 	.go_dump = gfs2_rgrp_dump,
 	.go_type = LM_TYPE_RGRP,
 	.go_min_hold_time = HZ / 5,
+	.go_flags = GLOF_ASPACE,
 };
 
 const struct gfs2_glock_operations gfs2_trans_glops = {
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 4792200978c8..b5d7363b22da 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -162,6 +162,8 @@ struct gfs2_glock_operations {
 	void (*go_callback) (struct gfs2_glock *gl);
 	const int go_type;
 	const unsigned long go_min_hold_time;
+	const unsigned long go_flags;
+#define GLOF_ASPACE 1
 };
 
 enum {
@@ -225,7 +227,6 @@ struct gfs2_glock {
 
 	struct gfs2_sbd *gl_sbd;
 
-	struct inode *gl_aspace;
 	struct list_head gl_ail_list;
 	atomic_t gl_ail_count;
 	struct delayed_work gl_work;
@@ -258,7 +259,6 @@ enum {
 	GIF_INVALID		= 0,
 	GIF_QD_LOCKED		= 1,
 	GIF_SW_PAGED		= 3,
-	GIF_USER                = 4, /* user inode, not metadata addr space */
 };
 
 
@@ -439,9 +439,6 @@ struct gfs2_args {
 struct gfs2_tune {
 	spinlock_t gt_spin;
 
-	unsigned int gt_incore_log_blocks;
-	unsigned int gt_log_flush_secs;
-
 	unsigned int gt_logd_secs;
 
 	unsigned int gt_quota_simul_sync; /* Max quotavals to sync at once */
@@ -451,7 +448,6 @@ struct gfs2_tune {
 	unsigned int gt_quota_quantum; /* Secs between syncs to quota file */
 	unsigned int gt_new_files_jdata;
 	unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */
-	unsigned int gt_stall_secs; /* Detects trouble! */
 	unsigned int gt_complain_secs;
 	unsigned int gt_statfs_quantum;
 	unsigned int gt_statfs_slow;
@@ -463,6 +459,7 @@ enum {
 	SDF_SHUTDOWN		= 2,
 	SDF_NOBARRIERS		= 3,
 	SDF_NORECOVERY		= 4,
+	SDF_DEMOTE		= 5,
 };
 
 #define GFS2_FSNAME_LEN		256
@@ -544,6 +541,8 @@ struct gfs2_sbd {
 	struct gfs2_holder sd_live_gh;
 	struct gfs2_glock *sd_rename_gl;
 	struct gfs2_glock *sd_trans_gl;
+	wait_queue_head_t sd_glock_wait;
+	atomic_t sd_glock_disposal;
 
 	/* Inode Stuff */
 
@@ -615,8 +614,9 @@ struct gfs2_sbd {
 	unsigned int sd_log_blks_reserved;
 	unsigned int sd_log_commited_buf;
 	unsigned int sd_log_commited_databuf;
-	unsigned int sd_log_commited_revoke;
+	int sd_log_commited_revoke;
 
+	atomic_t sd_log_pinned;
 	unsigned int sd_log_num_buf;
 	unsigned int sd_log_num_revoke;
 	unsigned int sd_log_num_rg;
@@ -628,15 +628,17 @@ struct gfs2_sbd {
 	struct list_head sd_log_le_databuf;
 	struct list_head sd_log_le_ordered;
 
+	atomic_t sd_log_thresh1;
+	atomic_t sd_log_thresh2;
 	atomic_t sd_log_blks_free;
-	struct mutex sd_log_reserve_mutex;
+	wait_queue_head_t sd_log_waitq;
+	wait_queue_head_t sd_logd_waitq;
 
 	u64 sd_log_sequence;
 	unsigned int sd_log_head;
 	unsigned int sd_log_tail;
 	int sd_log_idle;
 
-	unsigned long sd_log_flush_time;
 	struct rw_semaphore sd_log_flush_lock;
 	atomic_t sd_log_in_flight;
 	wait_queue_head_t sd_log_flush_wait;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 6e220f4eee7d..f03afd9c44bc 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -45,7 +45,7 @@ static int iget_test(struct inode *inode, void *opaque)
 	struct gfs2_inode *ip = GFS2_I(inode);
 	u64 *no_addr = opaque;
 
-	if (ip->i_no_addr == *no_addr && test_bit(GIF_USER, &ip->i_flags))
+	if (ip->i_no_addr == *no_addr)
 		return 1;
 
 	return 0;
@@ -58,7 +58,6 @@ static int iget_set(struct inode *inode, void *opaque)
 
 	inode->i_ino = (unsigned long)*no_addr;
 	ip->i_no_addr = *no_addr;
-	set_bit(GIF_USER, &ip->i_flags);
 	return 0;
 }
 
@@ -84,7 +83,7 @@ static int iget_skip_test(struct inode *inode, void *opaque)
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_skip_data *data = opaque;
 
-	if (ip->i_no_addr == data->no_addr && test_bit(GIF_USER, &ip->i_flags)){
+	if (ip->i_no_addr == data->no_addr) {
 		if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)){
 			data->skipped = 1;
 			return 0;
@@ -103,7 +102,6 @@ static int iget_skip_set(struct inode *inode, void *opaque)
 		return 1;
 	inode->i_ino = (unsigned long)(data->no_addr);
 	ip->i_no_addr = data->no_addr;
-	set_bit(GIF_USER, &ip->i_flags);
 	return 0;
 }
 
@@ -160,7 +158,6 @@ void gfs2_set_iop(struct inode *inode)
  * @sb: The super block
  * @no_addr: The inode number
  * @type: The type of the inode
- * @skip_freeing: set this not return an inode if it is currently being freed.
  *
  * Returns: A VFS inode, or an error
  */
@@ -168,17 +165,14 @@ void gfs2_set_iop(struct inode *inode)
 struct inode *gfs2_inode_lookup(struct super_block *sb,
 				unsigned int type,
 				u64 no_addr,
-				u64 no_formal_ino, int skip_freeing)
+				u64 no_formal_ino)
 {
 	struct inode *inode;
 	struct gfs2_inode *ip;
-	struct gfs2_glock *io_gl;
+	struct gfs2_glock *io_gl = NULL;
 	int error;
 
-	if (skip_freeing)
-		inode = gfs2_iget_skip(sb, no_addr);
-	else
-		inode = gfs2_iget(sb, no_addr);
+	inode = gfs2_iget(sb, no_addr);
 	ip = GFS2_I(inode);
 
 	if (!inode)
@@ -204,6 +198,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb,
 		ip->i_iopen_gh.gh_gl->gl_object = ip;
 
 		gfs2_glock_put(io_gl);
+		io_gl = NULL;
 
 		if ((type == DT_UNKNOWN) && (no_formal_ino == 0))
 			goto gfs2_nfsbypass;
@@ -234,13 +229,107 @@ gfs2_nfsbypass:
 fail_glock:
 	gfs2_glock_dq(&ip->i_iopen_gh);
 fail_iopen:
+	if (io_gl)
+		gfs2_glock_put(io_gl);
+fail_put:
+	if (inode->i_state & I_NEW)
+		ip->i_gl->gl_object = NULL;
+	gfs2_glock_put(ip->i_gl);
+fail:
+	if (inode->i_state & I_NEW)
+		iget_failed(inode);
+	else
+		iput(inode);
+	return ERR_PTR(error);
+}
+
+/**
+ * gfs2_process_unlinked_inode - Lookup an unlinked inode for reclamation
+ *                               and try to reclaim it by doing iput.
+ *
+ * This function assumes no rgrp locks are currently held.
+ *
+ * @sb: The super block
+ * no_addr: The inode number
+ *
+ */
+
+void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr)
+{
+	struct gfs2_sbd *sdp;
+	struct gfs2_inode *ip;
+	struct gfs2_glock *io_gl = NULL;
+	int error;
+	struct gfs2_holder gh;
+	struct inode *inode;
+
+	inode = gfs2_iget_skip(sb, no_addr);
+
+	if (!inode)
+		return;
+
+	/* If it's not a new inode, someone's using it, so leave it alone. */
+	if (!(inode->i_state & I_NEW)) {
+		iput(inode);
+		return;
+	}
+
+	ip = GFS2_I(inode);
+	sdp = GFS2_SB(inode);
+	ip->i_no_formal_ino = -1;
+
+	error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl);
+	if (unlikely(error))
+		goto fail;
+	ip->i_gl->gl_object = ip;
+
+	error = gfs2_glock_get(sdp, no_addr, &gfs2_iopen_glops, CREATE, &io_gl);
+	if (unlikely(error))
+		goto fail_put;
+
+	set_bit(GIF_INVALID, &ip->i_flags);
+	error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, LM_FLAG_TRY | GL_EXACT,
+				   &ip->i_iopen_gh);
+	if (unlikely(error))
+		goto fail_iopen;
+
+	ip->i_iopen_gh.gh_gl->gl_object = ip;
 	gfs2_glock_put(io_gl);
+	io_gl = NULL;
+
+	inode->i_mode = DT2IF(DT_UNKNOWN);
+
+	/*
+	 * We must read the inode in order to work out its type in
+	 * this case. Note that this doesn't happen often as we normally
+	 * know the type beforehand. This code path only occurs during
+	 * unlinked inode recovery (where it is safe to do this glock,
+	 * which is not true in the general case).
+	 */
+	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, LM_FLAG_TRY,
+				   &gh);
+	if (unlikely(error))
+		goto fail_glock;
+
+	/* Inode is now uptodate */
+	gfs2_glock_dq_uninit(&gh);
+	gfs2_set_iop(inode);
+
+	/* The iput will cause it to be deleted. */
+	iput(inode);
+	return;
+
+fail_glock:
+	gfs2_glock_dq(&ip->i_iopen_gh);
+fail_iopen:
+	if (io_gl)
+		gfs2_glock_put(io_gl);
 fail_put:
 	ip->i_gl->gl_object = NULL;
 	gfs2_glock_put(ip->i_gl);
 fail:
 	iget_failed(inode);
-	return ERR_PTR(error);
+	return;
 }
 
 static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
@@ -864,7 +953,7 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
 		goto fail_gunlock2;
 
 	inode = gfs2_inode_lookup(dir->i_sb, IF2DT(mode), inum.no_addr,
-				  inum.no_formal_ino, 0);
+				  inum.no_formal_ino);
 	if (IS_ERR(inode))
 		goto fail_gunlock2;
 
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index c341aaf67adb..300ada3f21de 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -83,8 +83,8 @@ static inline void gfs2_inum_out(const struct gfs2_inode *ip,
 
 extern void gfs2_set_iop(struct inode *inode);
 extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, 
-				       u64 no_addr, u64 no_formal_ino,
-				       int skip_freeing);
+				       u64 no_addr, u64 no_formal_ino);
+extern void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr);
 extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr);
 
 extern int gfs2_inode_refresh(struct gfs2_inode *ip);
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 46df988323bc..0e0470ed34c2 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -9,6 +9,7 @@
 
 #include <linux/fs.h>
 #include <linux/dlm.h>
+#include <linux/slab.h>
 #include <linux/types.h>
 #include <linux/gfs2_ondisk.h>
 
@@ -21,6 +22,7 @@ static void gdlm_ast(void *arg)
 {
 	struct gfs2_glock *gl = arg;
 	unsigned ret = gl->gl_state;
+	struct gfs2_sbd *sdp = gl->gl_sbd;
 
 	BUG_ON(gl->gl_lksb.sb_flags & DLM_SBF_DEMOTED);
 
@@ -29,7 +31,12 @@ static void gdlm_ast(void *arg)
 
 	switch (gl->gl_lksb.sb_status) {
 	case -DLM_EUNLOCK: /* Unlocked, so glock can be freed */
-		kmem_cache_free(gfs2_glock_cachep, gl);
+		if (gl->gl_ops->go_flags & GLOF_ASPACE)
+			kmem_cache_free(gfs2_glock_aspace_cachep, gl);
+		else
+			kmem_cache_free(gfs2_glock_cachep, gl);
+		if (atomic_dec_and_test(&sdp->sd_glock_disposal))
+			wake_up(&sdp->sd_glock_wait);
 		return;
 	case -DLM_ECANCEL: /* Cancel while getting lock */
 		ret |= LM_OUT_CANCELED;
@@ -164,14 +171,16 @@ static unsigned int gdlm_lock(struct gfs2_glock *gl,
 	return LM_OUT_ASYNC;
 }
 
-static void gdlm_put_lock(struct kmem_cache *cachep, void *ptr)
+static void gdlm_put_lock(struct kmem_cache *cachep, struct gfs2_glock *gl)
 {
-	struct gfs2_glock *gl = ptr;
-	struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct;
+	struct gfs2_sbd *sdp = gl->gl_sbd;
+	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
 	int error;
 
 	if (gl->gl_lksb.sb_lkid == 0) {
 		kmem_cache_free(cachep, gl);
+		if (atomic_dec_and_test(&sdp->sd_glock_disposal))
+			wake_up(&sdp->sd_glock_wait);
 		return;
 	}
 
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 4511b08fc451..6a857e24f947 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -168,12 +168,11 @@ static int gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai, int fl
 	return list_empty(&ai->ai_ail1_list);
 }
 
-static void gfs2_ail1_start(struct gfs2_sbd *sdp, int flags)
+static void gfs2_ail1_start(struct gfs2_sbd *sdp)
 {
 	struct list_head *head;
 	u64 sync_gen;
-	struct list_head *first;
-	struct gfs2_ail *first_ai, *ai, *tmp;
+	struct gfs2_ail *ai;
 	int done = 0;
 
 	gfs2_log_lock(sdp);
@@ -184,21 +183,9 @@ static void gfs2_ail1_start(struct gfs2_sbd *sdp, int flags)
 	}
 	sync_gen = sdp->sd_ail_sync_gen++;
 
-	first = head->prev;
-	first_ai = list_entry(first, struct gfs2_ail, ai_list);
-	first_ai->ai_sync_gen = sync_gen;
-	gfs2_ail1_start_one(sdp, first_ai); /* This may drop log lock */
-
-	if (flags & DIO_ALL)
-		first = NULL;
-
 	while(!done) {
-		if (first && (head->prev != first ||
-			      gfs2_ail1_empty_one(sdp, first_ai, 0)))
-			break;
-
 		done = 1;
-		list_for_each_entry_safe_reverse(ai, tmp, head, ai_list) {
+		list_for_each_entry_reverse(ai, head, ai_list) {
 			if (ai->ai_sync_gen >= sync_gen)
 				continue;
 			ai->ai_sync_gen = sync_gen;
@@ -290,58 +277,57 @@ static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail)
  * flush time, so we ensure that we have just enough free blocks at all
  * times to avoid running out during a log flush.
  *
+ * We no longer flush the log here, instead we wake up logd to do that
+ * for us. To avoid the thundering herd and to ensure that we deal fairly
+ * with queued waiters, we use an exclusive wait. This means that when we
+ * get woken with enough journal space to get our reservation, we need to
+ * wake the next waiter on the list.
+ *
  * Returns: errno
  */
 
 int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks)
 {
-	unsigned int try = 0;
 	unsigned reserved_blks = 6 * (4096 / sdp->sd_vfs->s_blocksize);
+	unsigned wanted = blks + reserved_blks;
+	DEFINE_WAIT(wait);
+	int did_wait = 0;
+	unsigned int free_blocks;
 
 	if (gfs2_assert_warn(sdp, blks) ||
 	    gfs2_assert_warn(sdp, blks <= sdp->sd_jdesc->jd_blocks))
 		return -EINVAL;
-
-	mutex_lock(&sdp->sd_log_reserve_mutex);
-	gfs2_log_lock(sdp);
-	while(atomic_read(&sdp->sd_log_blks_free) <= (blks + reserved_blks)) {
-		gfs2_log_unlock(sdp);
-		gfs2_ail1_empty(sdp, 0);
-		gfs2_log_flush(sdp, NULL);
-
-		if (try++)
-			gfs2_ail1_start(sdp, 0);
-		gfs2_log_lock(sdp);
+retry:
+	free_blocks = atomic_read(&sdp->sd_log_blks_free);
+	if (unlikely(free_blocks <= wanted)) {
+		do {
+			prepare_to_wait_exclusive(&sdp->sd_log_waitq, &wait,
+					TASK_UNINTERRUPTIBLE);
+			wake_up(&sdp->sd_logd_waitq);
+			did_wait = 1;
+			if (atomic_read(&sdp->sd_log_blks_free) <= wanted)
+				io_schedule();
+			free_blocks = atomic_read(&sdp->sd_log_blks_free);
+		} while(free_blocks <= wanted);
+		finish_wait(&sdp->sd_log_waitq, &wait);
 	}
-	atomic_sub(blks, &sdp->sd_log_blks_free);
+	if (atomic_cmpxchg(&sdp->sd_log_blks_free, free_blocks,
+				free_blocks - blks) != free_blocks)
+		goto retry;
 	trace_gfs2_log_blocks(sdp, -blks);
-	gfs2_log_unlock(sdp);
-	mutex_unlock(&sdp->sd_log_reserve_mutex);
+
+	/*
+	 * If we waited, then so might others, wake them up _after_ we get
+	 * our share of the log.
+	 */
+	if (unlikely(did_wait))
+		wake_up(&sdp->sd_log_waitq);
 
 	down_read(&sdp->sd_log_flush_lock);
 
 	return 0;
 }
 
-/**
- * gfs2_log_release - Release a given number of log blocks
- * @sdp: The GFS2 superblock
- * @blks: The number of blocks
- *
- */
-
-void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks)
-{
-
-	gfs2_log_lock(sdp);
-	atomic_add(blks, &sdp->sd_log_blks_free);
-	trace_gfs2_log_blocks(sdp, blks);
-	gfs2_assert_withdraw(sdp,
-			     atomic_read(&sdp->sd_log_blks_free) <= sdp->sd_jdesc->jd_blocks);
-	gfs2_log_unlock(sdp);
-	up_read(&sdp->sd_log_flush_lock);
-}
-
 static u64 log_bmap(struct gfs2_sbd *sdp, unsigned int lbn)
 {
 	struct gfs2_journal_extent *je;
@@ -417,7 +403,7 @@ static unsigned int calc_reserved(struct gfs2_sbd *sdp)
 	databufhdrs_needed = (sdp->sd_log_commited_databuf +
 			      (dbuf_limit - 1)) / dbuf_limit;
 
-	if (sdp->sd_log_commited_revoke)
+	if (sdp->sd_log_commited_revoke > 0)
 		revokes = gfs2_struct2blk(sdp, sdp->sd_log_commited_revoke,
 					  sizeof(u64));
 
@@ -559,11 +545,10 @@ static void log_pull_tail(struct gfs2_sbd *sdp, unsigned int new_tail)
 
 	ail2_empty(sdp, new_tail);
 
-	gfs2_log_lock(sdp);
 	atomic_add(dist, &sdp->sd_log_blks_free);
 	trace_gfs2_log_blocks(sdp, dist);
-	gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <= sdp->sd_jdesc->jd_blocks);
-	gfs2_log_unlock(sdp);
+	gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <=
+			     sdp->sd_jdesc->jd_blocks);
 
 	sdp->sd_log_tail = new_tail;
 }
@@ -615,6 +600,7 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
 	if (buffer_eopnotsupp(bh)) {
 		clear_buffer_eopnotsupp(bh);
 		set_buffer_uptodate(bh);
+		fs_info(sdp, "barrier sync failed - disabling barriers\n");
 		set_bit(SDF_NOBARRIERS, &sdp->sd_flags);
 		lock_buffer(bh);
 skip_barrier:
@@ -710,7 +696,7 @@ static void gfs2_ordered_wait(struct gfs2_sbd *sdp)
  *
  */
 
-void __gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
+void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
 {
 	struct gfs2_ail *ai;
 
@@ -790,7 +776,6 @@ static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
 	gfs2_assert_withdraw(sdp, (((int)sdp->sd_log_commited_buf) >= 0) ||
 			     (((int)sdp->sd_log_commited_databuf) >= 0));
 	sdp->sd_log_commited_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm;
-	gfs2_assert_withdraw(sdp, ((int)sdp->sd_log_commited_revoke) >= 0);
 	reserved = calc_reserved(sdp);
 	gfs2_assert_withdraw(sdp, sdp->sd_log_blks_reserved + tr->tr_reserved >= reserved);
 	unused = sdp->sd_log_blks_reserved - reserved + tr->tr_reserved;
@@ -823,6 +808,13 @@ static void buf_lo_incore_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
  * @sdp: the filesystem
  * @tr: the transaction
  *
+ * We wake up gfs2_logd if the number of pinned blocks exceed thresh1
+ * or the total number of used blocks (pinned blocks plus AIL blocks)
+ * is greater than thresh2.
+ *
+ * At mount time thresh1 is 1/3rd of journal size, thresh2 is 2/3rd of
+ * journal size.
+ *
  * Returns: errno
  */
 
@@ -833,10 +825,10 @@ void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
 
 	up_read(&sdp->sd_log_flush_lock);
 
-	gfs2_log_lock(sdp);
-	if (sdp->sd_log_num_buf > gfs2_tune_get(sdp, gt_incore_log_blocks))
-		wake_up_process(sdp->sd_logd_process);
-	gfs2_log_unlock(sdp);
+	if (atomic_read(&sdp->sd_log_pinned) > atomic_read(&sdp->sd_log_thresh1) ||
+	    ((sdp->sd_jdesc->jd_blocks - atomic_read(&sdp->sd_log_blks_free)) >
+	    atomic_read(&sdp->sd_log_thresh2)))
+		wake_up(&sdp->sd_logd_waitq);
 }
 
 /**
@@ -883,13 +875,23 @@ void gfs2_meta_syncfs(struct gfs2_sbd *sdp)
 {
 	gfs2_log_flush(sdp, NULL);
 	for (;;) {
-		gfs2_ail1_start(sdp, DIO_ALL);
+		gfs2_ail1_start(sdp);
 		if (gfs2_ail1_empty(sdp, DIO_ALL))
 			break;
 		msleep(10);
 	}
 }
 
+static inline int gfs2_jrnl_flush_reqd(struct gfs2_sbd *sdp)
+{
+	return (atomic_read(&sdp->sd_log_pinned) >= atomic_read(&sdp->sd_log_thresh1));
+}
+
+static inline int gfs2_ail_flush_reqd(struct gfs2_sbd *sdp)
+{
+	unsigned int used_blocks = sdp->sd_jdesc->jd_blocks - atomic_read(&sdp->sd_log_blks_free);
+	return used_blocks >= atomic_read(&sdp->sd_log_thresh2);
+}
 
 /**
  * gfs2_logd - Update log tail as Active Items get flushed to in-place blocks
@@ -902,28 +904,43 @@ void gfs2_meta_syncfs(struct gfs2_sbd *sdp)
 int gfs2_logd(void *data)
 {
 	struct gfs2_sbd *sdp = data;
-	unsigned long t;
-	int need_flush;
+	unsigned long t = 1;
+	DEFINE_WAIT(wait);
+	unsigned preflush;
 
 	while (!kthread_should_stop()) {
-		/* Advance the log tail */
 
-		t = sdp->sd_log_flush_time +
-		    gfs2_tune_get(sdp, gt_log_flush_secs) * HZ;
+		preflush = atomic_read(&sdp->sd_log_pinned);
+		if (gfs2_jrnl_flush_reqd(sdp) || t == 0) {
+			gfs2_ail1_empty(sdp, DIO_ALL);
+			gfs2_log_flush(sdp, NULL);
+			gfs2_ail1_empty(sdp, DIO_ALL);
+		}
 
-		gfs2_ail1_empty(sdp, DIO_ALL);
-		gfs2_log_lock(sdp);
-		need_flush = sdp->sd_log_num_buf > gfs2_tune_get(sdp, gt_incore_log_blocks);
-		gfs2_log_unlock(sdp);
-		if (need_flush || time_after_eq(jiffies, t)) {
+		if (gfs2_ail_flush_reqd(sdp)) {
+			gfs2_ail1_start(sdp);
+			io_schedule();
+			gfs2_ail1_empty(sdp, 0);
 			gfs2_log_flush(sdp, NULL);
-			sdp->sd_log_flush_time = jiffies;
+			gfs2_ail1_empty(sdp, DIO_ALL);
 		}
 
+		wake_up(&sdp->sd_log_waitq);
 		t = gfs2_tune_get(sdp, gt_logd_secs) * HZ;
 		if (freezing(current))
 			refrigerator();
-		schedule_timeout_interruptible(t);
+
+		do {
+			prepare_to_wait(&sdp->sd_logd_waitq, &wait,
+					TASK_UNINTERRUPTIBLE);
+			if (!gfs2_ail_flush_reqd(sdp) &&
+			    !gfs2_jrnl_flush_reqd(sdp) &&
+			    !kthread_should_stop())
+				t = schedule_timeout(t);
+		} while(t && !gfs2_ail_flush_reqd(sdp) &&
+			!gfs2_jrnl_flush_reqd(sdp) &&
+			!kthread_should_stop());
+		finish_wait(&sdp->sd_logd_waitq, &wait);
 	}
 
 	return 0;
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index 7c64510ccfd2..0d007f920234 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -47,29 +47,21 @@ static inline void gfs2_log_pointers_init(struct gfs2_sbd *sdp,
 	sdp->sd_log_head = sdp->sd_log_tail = value;
 }
 
-unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
+extern unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
 			    unsigned int ssize);
 
-int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks);
-void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks);
-void gfs2_log_incr_head(struct gfs2_sbd *sdp);
+extern int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks);
+extern void gfs2_log_incr_head(struct gfs2_sbd *sdp);
 
-struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp);
-struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp,
+extern struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp);
+extern struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp,
 				      struct buffer_head *real);
-void __gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl);
+extern void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl);
+extern void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans);
+extern void gfs2_remove_from_ail(struct gfs2_bufdata *bd);
 
-static inline void gfs2_log_flush(struct gfs2_sbd *sbd, struct gfs2_glock *gl)
-{
-	if (!gl || test_bit(GLF_LFLUSH, &gl->gl_flags))
-		__gfs2_log_flush(sbd, gl);
-}
-
-void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans);
-void gfs2_remove_from_ail(struct gfs2_bufdata *bd);
-
-void gfs2_log_shutdown(struct gfs2_sbd *sdp);
-void gfs2_meta_syncfs(struct gfs2_sbd *sdp);
-int gfs2_logd(void *data);
+extern void gfs2_log_shutdown(struct gfs2_sbd *sdp);
+extern void gfs2_meta_syncfs(struct gfs2_sbd *sdp);
+extern int gfs2_logd(void *data);
 
 #endif /* __LOG_DOT_H__ */
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index de97632ba32f..bf33f822058d 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -54,6 +54,7 @@ static void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh)
 	if (bd->bd_ail)
 		list_move(&bd->bd_ail_st_list, &bd->bd_ail->ai_ail2_list);
 	get_bh(bh);
+	atomic_inc(&sdp->sd_log_pinned);
 	trace_gfs2_pin(bd, 1);
 }
 
@@ -94,6 +95,7 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
 	trace_gfs2_pin(bd, 0);
 	gfs2_log_unlock(sdp);
 	unlock_buffer(bh);
+	atomic_dec(&sdp->sd_log_pinned);
 }
 
 
@@ -528,9 +530,9 @@ static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
 		gfs2_pin(sdp, bd->bd_bh);
 		tr->tr_num_databuf_new++;
 		sdp->sd_log_num_databuf++;
-		list_add(&le->le_list, &sdp->sd_log_le_databuf);
+		list_add_tail(&le->le_list, &sdp->sd_log_le_databuf);
 	} else {
-		list_add(&le->le_list, &sdp->sd_log_le_ordered);
+		list_add_tail(&le->le_list, &sdp->sd_log_le_ordered);
 	}
 out:
 	gfs2_log_unlock(sdp);
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 5b31f7741a8f..fb2a5f93b7c3 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -52,6 +52,22 @@ static void gfs2_init_glock_once(void *foo)
 	atomic_set(&gl->gl_ail_count, 0);
 }
 
+static void gfs2_init_gl_aspace_once(void *foo)
+{
+	struct gfs2_glock *gl = foo;
+	struct address_space *mapping = (struct address_space *)(gl + 1);
+
+	gfs2_init_glock_once(gl);
+	memset(mapping, 0, sizeof(*mapping));
+	INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
+	spin_lock_init(&mapping->tree_lock);
+	spin_lock_init(&mapping->i_mmap_lock);
+	INIT_LIST_HEAD(&mapping->private_list);
+	spin_lock_init(&mapping->private_lock);
+	INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
+	INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
+}
+
 /**
  * init_gfs2_fs - Register GFS2 as a filesystem
  *
@@ -78,6 +94,14 @@ static int __init init_gfs2_fs(void)
 	if (!gfs2_glock_cachep)
 		goto fail;
 
+	gfs2_glock_aspace_cachep = kmem_cache_create("gfs2_glock(aspace)",
+					sizeof(struct gfs2_glock) +
+					sizeof(struct address_space),
+					0, 0, gfs2_init_gl_aspace_once);
+
+	if (!gfs2_glock_aspace_cachep)
+		goto fail;
+
 	gfs2_inode_cachep = kmem_cache_create("gfs2_inode",
 					      sizeof(struct gfs2_inode),
 					      0,  SLAB_RECLAIM_ACCOUNT|
@@ -144,6 +168,9 @@ fail:
 	if (gfs2_inode_cachep)
 		kmem_cache_destroy(gfs2_inode_cachep);
 
+	if (gfs2_glock_aspace_cachep)
+		kmem_cache_destroy(gfs2_glock_aspace_cachep);
+
 	if (gfs2_glock_cachep)
 		kmem_cache_destroy(gfs2_glock_cachep);
 
@@ -169,6 +196,7 @@ static void __exit exit_gfs2_fs(void)
 	kmem_cache_destroy(gfs2_rgrpd_cachep);
 	kmem_cache_destroy(gfs2_bufdata_cachep);
 	kmem_cache_destroy(gfs2_inode_cachep);
+	kmem_cache_destroy(gfs2_glock_aspace_cachep);
 	kmem_cache_destroy(gfs2_glock_cachep);
 
 	gfs2_sys_uninit();
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 6f68a5f18eb8..18176d0b75d7 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -34,7 +34,6 @@
 
 static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wbc)
 {
-	int err;
 	struct buffer_head *bh, *head;
 	int nr_underway = 0;
 	int write_op = (1 << BIO_RW_META) | ((wbc->sync_mode == WB_SYNC_ALL ?
@@ -86,56 +85,19 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb
 	} while (bh != head);
 	unlock_page(page);
 
-	err = 0;
 	if (nr_underway == 0)
 		end_page_writeback(page);
 
-	return err;
+	return 0;
 }
 
-static const struct address_space_operations aspace_aops = {
+const struct address_space_operations gfs2_meta_aops = {
 	.writepage = gfs2_aspace_writepage,
 	.releasepage = gfs2_releasepage,
 	.sync_page = block_sync_page,
 };
 
 /**
- * gfs2_aspace_get - Create and initialize a struct inode structure
- * @sdp: the filesystem the aspace is in
- *
- * Right now a struct inode is just a struct inode.  Maybe Linux
- * will supply a more lightweight address space construct (that works)
- * in the future.
- *
- * Make sure pages/buffers in this aspace aren't in high memory.
- *
- * Returns: the aspace
- */
-
-struct inode *gfs2_aspace_get(struct gfs2_sbd *sdp)
-{
-	struct inode *aspace;
-	struct gfs2_inode *ip;
-
-	aspace = new_inode(sdp->sd_vfs);
-	if (aspace) {
-		mapping_set_gfp_mask(aspace->i_mapping, GFP_NOFS);
-		aspace->i_mapping->a_ops = &aspace_aops;
-		aspace->i_size = MAX_LFS_FILESIZE;
-		ip = GFS2_I(aspace);
-		clear_bit(GIF_USER, &ip->i_flags);
-		insert_inode_hash(aspace);
-	}
-	return aspace;
-}
-
-void gfs2_aspace_put(struct inode *aspace)
-{
-	remove_inode_hash(aspace);
-	iput(aspace);
-}
-
-/**
  * gfs2_meta_sync - Sync all buffers associated with a glock
  * @gl: The glock
  *
@@ -143,7 +105,7 @@ void gfs2_aspace_put(struct inode *aspace)
 
 void gfs2_meta_sync(struct gfs2_glock *gl)
 {
-	struct address_space *mapping = gl->gl_aspace->i_mapping;
+	struct address_space *mapping = gfs2_glock2aspace(gl);
 	int error;
 
 	filemap_fdatawrite(mapping);
@@ -164,7 +126,7 @@ void gfs2_meta_sync(struct gfs2_glock *gl)
 
 struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create)
 {
-	struct address_space *mapping = gl->gl_aspace->i_mapping;
+	struct address_space *mapping = gfs2_glock2aspace(gl);
 	struct gfs2_sbd *sdp = gl->gl_sbd;
 	struct page *page;
 	struct buffer_head *bh;
@@ -344,9 +306,12 @@ void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh,
 
 void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int meta)
 {
-	struct gfs2_sbd *sdp = GFS2_SB(bh->b_page->mapping->host);
+	struct address_space *mapping = bh->b_page->mapping;
+	struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping);
 	struct gfs2_bufdata *bd = bh->b_private;
+
 	if (test_clear_buffer_pinned(bh)) {
+		atomic_dec(&sdp->sd_log_pinned);
 		list_del_init(&bd->bd_le.le_list);
 		if (meta) {
 			gfs2_assert_warn(sdp, sdp->sd_log_num_buf);
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
index de270c2f9b63..6a1d9ba16411 100644
--- a/fs/gfs2/meta_io.h
+++ b/fs/gfs2/meta_io.h
@@ -37,8 +37,16 @@ static inline void gfs2_buffer_copy_tail(struct buffer_head *to_bh,
 	       0, from_head - to_head);
 }
 
-struct inode *gfs2_aspace_get(struct gfs2_sbd *sdp);
-void gfs2_aspace_put(struct inode *aspace);
+extern const struct address_space_operations gfs2_meta_aops;
+
+static inline struct gfs2_sbd *gfs2_mapping2sbd(struct address_space *mapping)
+{
+	struct inode *inode = mapping->host;
+	if (mapping->a_ops == &gfs2_meta_aops)
+		return (((struct gfs2_glock *)mapping) - 1)->gl_sbd;
+	else
+		return inode->i_sb->s_fs_info;
+}
 
 void gfs2_meta_sync(struct gfs2_glock *gl);
 
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index edfee24f3636..3593b3a7290e 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -57,15 +57,12 @@ static void gfs2_tune_init(struct gfs2_tune *gt)
 {
 	spin_lock_init(&gt->gt_spin);
 
-	gt->gt_incore_log_blocks = 1024;
-	gt->gt_logd_secs = 1;
 	gt->gt_quota_simul_sync = 64;
 	gt->gt_quota_warn_period = 10;
 	gt->gt_quota_scale_num = 1;
 	gt->gt_quota_scale_den = 1;
 	gt->gt_new_files_jdata = 0;
 	gt->gt_max_readahead = 1 << 18;
-	gt->gt_stall_secs = 600;
 	gt->gt_complain_secs = 10;
 }
 
@@ -82,6 +79,8 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
 
 	gfs2_tune_init(&sdp->sd_tune);
 
+	init_waitqueue_head(&sdp->sd_glock_wait);
+	atomic_set(&sdp->sd_glock_disposal, 0);
 	spin_lock_init(&sdp->sd_statfs_spin);
 
 	spin_lock_init(&sdp->sd_rindex_spin);
@@ -100,14 +99,15 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
 	spin_lock_init(&sdp->sd_trunc_lock);
 
 	spin_lock_init(&sdp->sd_log_lock);
-
+	atomic_set(&sdp->sd_log_pinned, 0);
 	INIT_LIST_HEAD(&sdp->sd_log_le_buf);
 	INIT_LIST_HEAD(&sdp->sd_log_le_revoke);
 	INIT_LIST_HEAD(&sdp->sd_log_le_rg);
 	INIT_LIST_HEAD(&sdp->sd_log_le_databuf);
 	INIT_LIST_HEAD(&sdp->sd_log_le_ordered);
 
-	mutex_init(&sdp->sd_log_reserve_mutex);
+	init_waitqueue_head(&sdp->sd_log_waitq);
+	init_waitqueue_head(&sdp->sd_logd_waitq);
 	INIT_LIST_HEAD(&sdp->sd_ail1_list);
 	INIT_LIST_HEAD(&sdp->sd_ail2_list);
 
@@ -486,7 +486,7 @@ static int gfs2_lookup_root(struct super_block *sb, struct dentry **dptr,
 	struct dentry *dentry;
 	struct inode *inode;
 
-	inode = gfs2_inode_lookup(sb, DT_DIR, no_addr, 0, 0);
+	inode = gfs2_inode_lookup(sb, DT_DIR, no_addr, 0);
 	if (IS_ERR(inode)) {
 		fs_err(sdp, "can't read in %s inode: %ld\n", name, PTR_ERR(inode));
 		return PTR_ERR(inode);
@@ -723,7 +723,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
 		goto fail;
 	}
 
-	error = -EINVAL;
+	error = -EUSERS;
 	if (!gfs2_jindex_size(sdp)) {
 		fs_err(sdp, "no journals!\n");
 		goto fail_jindex;
@@ -732,6 +732,8 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
 	if (sdp->sd_args.ar_spectator) {
 		sdp->sd_jdesc = gfs2_jdesc_find(sdp, 0);
 		atomic_set(&sdp->sd_log_blks_free, sdp->sd_jdesc->jd_blocks);
+		atomic_set(&sdp->sd_log_thresh1, 2*sdp->sd_jdesc->jd_blocks/5);
+		atomic_set(&sdp->sd_log_thresh2, 4*sdp->sd_jdesc->jd_blocks/5);
 	} else {
 		if (sdp->sd_lockstruct.ls_jid >= gfs2_jindex_size(sdp)) {
 			fs_err(sdp, "can't mount journal #%u\n",
@@ -769,6 +771,8 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
 			goto fail_jinode_gh;
 		}
 		atomic_set(&sdp->sd_log_blks_free, sdp->sd_jdesc->jd_blocks);
+		atomic_set(&sdp->sd_log_thresh1, 2*sdp->sd_jdesc->jd_blocks/5);
+		atomic_set(&sdp->sd_log_thresh2, 4*sdp->sd_jdesc->jd_blocks/5);
 
 		/* Map the extents for this journal's blocks */
 		map_journal_extents(sdp);
@@ -950,8 +954,6 @@ static int init_threads(struct gfs2_sbd *sdp, int undo)
 	if (undo)
 		goto fail_quotad;
 
-	sdp->sd_log_flush_time = jiffies;
-
 	p = kthread_run(gfs2_logd, sdp, "gfs2_logd");
 	error = IS_ERR(p);
 	if (error) {
@@ -983,16 +985,24 @@ static const match_table_t nolock_tokens = {
 	{ Opt_err, NULL },
 };
 
+static void nolock_put_lock(struct kmem_cache *cachep, struct gfs2_glock *gl)
+{
+	struct gfs2_sbd *sdp = gl->gl_sbd;
+	kmem_cache_free(cachep, gl);
+	if (atomic_dec_and_test(&sdp->sd_glock_disposal))
+		wake_up(&sdp->sd_glock_wait);
+}
+
 static const struct lm_lockops nolock_ops = {
 	.lm_proto_name = "lock_nolock",
-	.lm_put_lock = kmem_cache_free,
+	.lm_put_lock = nolock_put_lock,
 	.lm_tokens = &nolock_tokens,
 };
 
 /**
  * gfs2_lm_mount - mount a locking protocol
  * @sdp: the filesystem
- * @args: mount arguements
+ * @args: mount arguments
  * @silent: if 1, don't complain if the FS isn't a GFS2 fs
  *
  * Returns: errno
@@ -1151,7 +1161,7 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
                                GFS2_BASIC_BLOCK_SHIFT;
 	sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
 
-	sdp->sd_tune.gt_log_flush_secs = sdp->sd_args.ar_commit;
+	sdp->sd_tune.gt_logd_secs = sdp->sd_args.ar_commit;
 	sdp->sd_tune.gt_quota_quantum = sdp->sd_args.ar_quota_quantum;
 	if (sdp->sd_args.ar_statfs_quantum) {
 		sdp->sd_tune.gt_statfs_slow = 0;
@@ -1231,10 +1241,9 @@ fail_sb:
 fail_locking:
 	init_locking(sdp, &mount_gh, UNDO);
 fail_lm:
+	invalidate_inodes(sb);
 	gfs2_gl_hash_clear(sdp);
 	gfs2_lm_unmount(sdp);
-	while (invalidate_inodes(sb))
-		yield();
 fail_sys:
 	gfs2_sys_fs_del(sdp);
 fail:
@@ -1315,7 +1324,7 @@ static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
 	memset(&args, 0, sizeof(args));
 	args.ar_quota = GFS2_QUOTA_DEFAULT;
 	args.ar_data = GFS2_DATA_DEFAULT;
-	args.ar_commit = 60;
+	args.ar_commit = 30;
 	args.ar_statfs_quantum = 30;
 	args.ar_quota_quantum = 60;
 	args.ar_errors = GFS2_ERRORS_DEFAULT;
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 78f73ca1ef3e..98cdd05f3316 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -976,121 +976,62 @@ out:
 }
 
 /**
- * gfs2_readlinki - return the contents of a symlink
- * @ip: the symlink's inode
- * @buf: a pointer to the buffer to be filled
- * @len: a pointer to the length of @buf
+ * gfs2_follow_link - Follow a symbolic link
+ * @dentry: The dentry of the link
+ * @nd: Data that we pass to vfs_follow_link()
  *
- * If @buf is too small, a piece of memory is kmalloc()ed and needs
- * to be freed by the caller.
+ * This can handle symlinks of any size.
  *
- * Returns: errno
+ * Returns: 0 on success or error code
  */
 
-static int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len)
+static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
 {
+	struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
 	struct gfs2_holder i_gh;
 	struct buffer_head *dibh;
 	unsigned int x;
+	char *buf;
 	int error;
 
 	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh);
 	error = gfs2_glock_nq(&i_gh);
 	if (error) {
 		gfs2_holder_uninit(&i_gh);
-		return error;
+		nd_set_link(nd, ERR_PTR(error));
+		return NULL;
 	}
 
 	if (!ip->i_disksize) {
 		gfs2_consist_inode(ip);
-		error = -EIO;
+		buf = ERR_PTR(-EIO);
 		goto out;
 	}
 
 	error = gfs2_meta_inode_buffer(ip, &dibh);
-	if (error)
+	if (error) {
+		buf = ERR_PTR(error);
 		goto out;
-
-	x = ip->i_disksize + 1;
-	if (x > *len) {
-		*buf = kmalloc(x, GFP_NOFS);
-		if (!*buf) {
-			error = -ENOMEM;
-			goto out_brelse;
-		}
 	}
 
-	memcpy(*buf, dibh->b_data + sizeof(struct gfs2_dinode), x);
-	*len = x;
-
-out_brelse:
+	x = ip->i_disksize + 1;
+	buf = kmalloc(x, GFP_NOFS);
+	if (!buf)
+		buf = ERR_PTR(-ENOMEM);
+	else
+		memcpy(buf, dibh->b_data + sizeof(struct gfs2_dinode), x);
 	brelse(dibh);
 out:
 	gfs2_glock_dq_uninit(&i_gh);
-	return error;
-}
-
-/**
- * gfs2_readlink - Read the value of a symlink
- * @dentry: the symlink
- * @buf: the buffer to read the symlink data into
- * @size: the size of the buffer
- *
- * Returns: errno
- */
-
-static int gfs2_readlink(struct dentry *dentry, char __user *user_buf,
-			 int user_size)
-{
-	struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
-	char array[GFS2_FAST_NAME_SIZE], *buf = array;
-	unsigned int len = GFS2_FAST_NAME_SIZE;
-	int error;
-
-	error = gfs2_readlinki(ip, &buf, &len);
-	if (error)
-		return error;
-
-	if (user_size > len - 1)
-		user_size = len - 1;
-
-	if (copy_to_user(user_buf, buf, user_size))
-		error = -EFAULT;
-	else
-		error = user_size;
-
-	if (buf != array)
-		kfree(buf);
-
-	return error;
+	nd_set_link(nd, buf);
+	return NULL;
 }
 
-/**
- * gfs2_follow_link - Follow a symbolic link
- * @dentry: The dentry of the link
- * @nd: Data that we pass to vfs_follow_link()
- *
- * This can handle symlinks of any size. It is optimised for symlinks
- * under GFS2_FAST_NAME_SIZE.
- *
- * Returns: 0 on success or error code
- */
-
-static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
+static void gfs2_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
 {
-	struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
-	char array[GFS2_FAST_NAME_SIZE], *buf = array;
-	unsigned int len = GFS2_FAST_NAME_SIZE;
-	int error;
-
-	error = gfs2_readlinki(ip, &buf, &len);
-	if (!error) {
-		error = vfs_follow_link(nd, buf);
-		if (buf != array)
-			kfree(buf);
-	}
-
-	return ERR_PTR(error);
+	char *s = nd_get_link(nd);
+	if (!IS_ERR(s))
+		kfree(s);
 }
 
 /**
@@ -1130,6 +1071,9 @@ int gfs2_permission(struct inode *inode, int mask)
 	return error;
 }
 
+/*
+ * XXX: should be changed to have proper ordering by opencoding simple_setsize
+ */
 static int setattr_size(struct inode *inode, struct iattr *attr)
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
@@ -1140,7 +1084,7 @@ static int setattr_size(struct inode *inode, struct iattr *attr)
 		error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
 		if (error)
 			return error;
-		error = vmtruncate(inode, attr->ia_size);
+		error = simple_setsize(inode, attr->ia_size);
 		gfs2_trans_end(sdp);
 		if (error) 
 			return error;
@@ -1425,8 +1369,9 @@ const struct inode_operations gfs2_dir_iops = {
 };
 
 const struct inode_operations gfs2_symlink_iops = {
-	.readlink = gfs2_readlink,
+	.readlink = generic_readlink,
 	.follow_link = gfs2_follow_link,
+	.put_link = gfs2_put_link,
 	.permission = gfs2_permission,
 	.setattr = gfs2_setattr,
 	.getattr = gfs2_getattr,
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index e3bf6eab8750..8f02d3db8f42 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -77,7 +77,7 @@ static LIST_HEAD(qd_lru_list);
 static atomic_t qd_lru_count = ATOMIC_INIT(0);
 static DEFINE_SPINLOCK(qd_lru_lock);
 
-int gfs2_shrink_qd_memory(int nr, gfp_t gfp_mask)
+int gfs2_shrink_qd_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
 {
 	struct gfs2_quota_data *qd;
 	struct gfs2_sbd *sdp;
@@ -637,15 +637,40 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
 	unsigned blocksize, iblock, pos;
 	struct buffer_head *bh, *dibh;
 	struct page *page;
-	void *kaddr;
-	struct gfs2_quota *qp;
-	s64 value;
-	int err = -EIO;
+	void *kaddr, *ptr;
+	struct gfs2_quota q, *qp;
+	int err, nbytes;
 	u64 size;
 
 	if (gfs2_is_stuffed(ip))
 		gfs2_unstuff_dinode(ip, NULL);
-	
+
+	memset(&q, 0, sizeof(struct gfs2_quota));
+	err = gfs2_internal_read(ip, NULL, (char *)&q, &loc, sizeof(q));
+	if (err < 0)
+		return err;
+
+	err = -EIO;
+	qp = &q;
+	qp->qu_value = be64_to_cpu(qp->qu_value);
+	qp->qu_value += change;
+	qp->qu_value = cpu_to_be64(qp->qu_value);
+	qd->qd_qb.qb_value = qp->qu_value;
+	if (fdq) {
+		if (fdq->d_fieldmask & FS_DQ_BSOFT) {
+			qp->qu_warn = cpu_to_be64(fdq->d_blk_softlimit);
+			qd->qd_qb.qb_warn = qp->qu_warn;
+		}
+		if (fdq->d_fieldmask & FS_DQ_BHARD) {
+			qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit);
+			qd->qd_qb.qb_limit = qp->qu_limit;
+		}
+	}
+
+	/* Write the quota into the quota file on disk */
+	ptr = qp;
+	nbytes = sizeof(struct gfs2_quota);
+get_a_page:
 	page = grab_cache_page(mapping, index);
 	if (!page)
 		return -ENOMEM;
@@ -667,7 +692,10 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
 	if (!buffer_mapped(bh)) {
 		gfs2_block_map(inode, iblock, bh, 1);
 		if (!buffer_mapped(bh))
-			goto unlock;
+			goto unlock_out;
+		/* If it's a newly allocated disk block for quota, zero it */
+		if (buffer_new(bh))
+			zero_user(page, pos - blocksize, bh->b_size);
 	}
 
 	if (PageUptodate(page))
@@ -677,32 +705,34 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
 		ll_rw_block(READ_META, 1, &bh);
 		wait_on_buffer(bh);
 		if (!buffer_uptodate(bh))
-			goto unlock;
+			goto unlock_out;
 	}
 
 	gfs2_trans_add_bh(ip->i_gl, bh, 0);
 
 	kaddr = kmap_atomic(page, KM_USER0);
-	qp = kaddr + offset;
-	value = (s64)be64_to_cpu(qp->qu_value) + change;
-	qp->qu_value = cpu_to_be64(value);
-	qd->qd_qb.qb_value = qp->qu_value;
-	if (fdq) {
-		if (fdq->d_fieldmask & FS_DQ_BSOFT) {
-			qp->qu_warn = cpu_to_be64(fdq->d_blk_softlimit);
-			qd->qd_qb.qb_warn = qp->qu_warn;
-		}
-		if (fdq->d_fieldmask & FS_DQ_BHARD) {
-			qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit);
-			qd->qd_qb.qb_limit = qp->qu_limit;
-		}
-	}
+	if (offset + sizeof(struct gfs2_quota) > PAGE_CACHE_SIZE)
+		nbytes = PAGE_CACHE_SIZE - offset;
+	memcpy(kaddr + offset, ptr, nbytes);
 	flush_dcache_page(page);
 	kunmap_atomic(kaddr, KM_USER0);
+	unlock_page(page);
+	page_cache_release(page);
 
+	/* If quota straddles page boundary, we need to update the rest of the
+	 * quota at the beginning of the next page */
+	if ((offset + sizeof(struct gfs2_quota)) > PAGE_CACHE_SIZE) {
+		ptr = ptr + nbytes;
+		nbytes = sizeof(struct gfs2_quota) - nbytes;
+		offset = 0;
+		index++;
+		goto get_a_page;
+	}
+
+	/* Update the disk inode timestamp and size (if extended) */
 	err = gfs2_meta_inode_buffer(ip, &dibh);
 	if (err)
-		goto unlock;
+		goto out;
 
 	size = loc + sizeof(struct gfs2_quota);
 	if (size > inode->i_size) {
@@ -715,7 +745,9 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
 	brelse(dibh);
 	mark_inode_dirty(inode);
 
-unlock:
+out:
+	return err;
+unlock_out:
 	unlock_page(page);
 	page_cache_release(page);
 	return err;
@@ -779,8 +811,10 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
 	 * rgrp since it won't be allocated during the transaction
 	 */
 	al->al_requested = 1;
-	/* +1 in the end for block requested above for unstuffing */
-	blocks = num_qd * data_blocks + RES_DINODE + num_qd + 1;
+	/* +3 in the end for unstuffing block, inode size update block
+	 * and another block in case quota straddles page boundary and 
+	 * two blocks need to be updated instead of 1 */
+	blocks = num_qd * data_blocks + RES_DINODE + num_qd + 3;
 
 	if (nalloc)
 		al->al_requested += nalloc * (data_blocks + ind_blocks);		
@@ -1083,7 +1117,7 @@ void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
 	}
 }
 
-int gfs2_quota_sync(struct super_block *sb, int type)
+int gfs2_quota_sync(struct super_block *sb, int type, int wait)
 {
 	struct gfs2_sbd *sdp = sb->s_fs_info;
 	struct gfs2_quota_data **qda;
@@ -1127,6 +1161,11 @@ int gfs2_quota_sync(struct super_block *sb, int type)
 	return error;
 }
 
+static int gfs2_quota_sync_timeo(struct super_block *sb, int type)
+{
+	return gfs2_quota_sync(sb, type, 0);
+}
+
 int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id)
 {
 	struct gfs2_quota_data *qd;
@@ -1382,7 +1421,7 @@ int gfs2_quotad(void *data)
 					   &tune->gt_statfs_quantum);
 
 		/* Update quota file */
-		quotad_check_timeo(sdp, "sync", gfs2_quota_sync, t,
+		quotad_check_timeo(sdp, "sync", gfs2_quota_sync_timeo, t,
 				   &quotad_timeo, &tune->gt_quota_quantum);
 
 		/* Check for & recover partially truncated inodes */
@@ -1413,10 +1452,18 @@ static int gfs2_quota_get_xstate(struct super_block *sb,
 
 	memset(fqs, 0, sizeof(struct fs_quota_stat));
 	fqs->qs_version = FS_QSTAT_VERSION;
-	if (sdp->sd_args.ar_quota == GFS2_QUOTA_ON)
-		fqs->qs_flags = (XFS_QUOTA_UDQ_ENFD | XFS_QUOTA_GDQ_ENFD);
-	else if (sdp->sd_args.ar_quota == GFS2_QUOTA_ACCOUNT)
-		fqs->qs_flags = (XFS_QUOTA_UDQ_ACCT | XFS_QUOTA_GDQ_ACCT);
+
+	switch (sdp->sd_args.ar_quota) {
+	case GFS2_QUOTA_ON:
+		fqs->qs_flags |= (XFS_QUOTA_UDQ_ENFD | XFS_QUOTA_GDQ_ENFD);
+		/*FALLTHRU*/
+	case GFS2_QUOTA_ACCOUNT:
+		fqs->qs_flags |= (XFS_QUOTA_UDQ_ACCT | XFS_QUOTA_GDQ_ACCT);
+		break;
+	case GFS2_QUOTA_OFF:
+		break;
+	}
+
 	if (sdp->sd_quota_inode) {
 		fqs->qs_uquota.qfs_ino = GFS2_I(sdp->sd_quota_inode)->i_no_addr;
 		fqs->qs_uquota.qfs_nblks = sdp->sd_quota_inode->i_blocks;
@@ -1427,8 +1474,8 @@ static int gfs2_quota_get_xstate(struct super_block *sb,
 	return 0;
 }
 
-static int gfs2_xquota_get(struct super_block *sb, int type, qid_t id,
-			   struct fs_disk_quota *fdq)
+static int gfs2_get_dqblk(struct super_block *sb, int type, qid_t id,
+			  struct fs_disk_quota *fdq)
 {
 	struct gfs2_sbd *sdp = sb->s_fs_info;
 	struct gfs2_quota_lvb *qlvb;
@@ -1472,8 +1519,8 @@ out:
 /* GFS2 only supports a subset of the XFS fields */
 #define GFS2_FIELDMASK (FS_DQ_BSOFT|FS_DQ_BHARD)
 
-static int gfs2_xquota_set(struct super_block *sb, int type, qid_t id,
-			   struct fs_disk_quota *fdq)
+static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
+			  struct fs_disk_quota *fdq)
 {
 	struct gfs2_sbd *sdp = sb->s_fs_info;
 	struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
@@ -1580,7 +1627,7 @@ out_put:
 const struct quotactl_ops gfs2_quotactl_ops = {
 	.quota_sync     = gfs2_quota_sync,
 	.get_xstate     = gfs2_quota_get_xstate,
-	.get_xquota	= gfs2_xquota_get,
-	.set_xquota	= gfs2_xquota_set,
+	.get_dqblk	= gfs2_get_dqblk,
+	.set_dqblk	= gfs2_set_dqblk,
 };
 
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index e271fa07ad02..e7d236ca48bd 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -25,7 +25,7 @@ extern int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid);
 extern void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
 			      u32 uid, u32 gid);
 
-extern int gfs2_quota_sync(struct super_block *sb, int type);
+extern int gfs2_quota_sync(struct super_block *sb, int type, int wait);
 extern int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id);
 
 extern int gfs2_quota_init(struct gfs2_sbd *sdp);
@@ -51,7 +51,7 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip)
 	return ret;
 }
 
-extern int gfs2_shrink_qd_memory(int nr, gfp_t gfp_mask);
+extern int gfs2_shrink_qd_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask);
 extern const struct quotactl_ops gfs2_quotactl_ops;
 
 #endif /* __QUOTA_DOT_H__ */
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 0608f490c295..171a744f8e45 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -591,11 +591,7 @@ static int gfs2_ri_update(struct gfs2_inode *ip)
 	u64 rgrp_count = ip->i_disksize;
 	int error;
 
-	if (do_div(rgrp_count, sizeof(struct gfs2_rindex))) {
-		gfs2_consist_inode(ip);
-		return -EIO;
-	}
-
+	do_div(rgrp_count, sizeof(struct gfs2_rindex));
 	clear_rgrpdi(sdp);
 
 	file_ra_state_init(&ra_state, inode->i_mapping);
@@ -858,7 +854,8 @@ static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
 				if ((start + nr_sects) != blk) {
 					rv = blkdev_issue_discard(bdev, start,
 							    nr_sects, GFP_NOFS,
-							    DISCARD_FL_BARRIER);
+							    BLKDEV_IFL_WAIT |
+							    BLKDEV_IFL_BARRIER);
 					if (rv)
 						goto fail;
 					nr_sects = 0;
@@ -873,7 +870,7 @@ start_new_extent:
 	}
 	if (nr_sects) {
 		rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS,
-					 DISCARD_FL_BARRIER);
+					 BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
 		if (rv)
 			goto fail;
 	}
@@ -915,7 +912,7 @@ void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd)
 struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip)
 {
 	BUG_ON(ip->i_alloc != NULL);
-	ip->i_alloc = kzalloc(sizeof(struct gfs2_alloc), GFP_KERNEL);
+	ip->i_alloc = kzalloc(sizeof(struct gfs2_alloc), GFP_NOFS);
 	return ip->i_alloc;
 }
 
@@ -952,13 +949,13 @@ static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al)
  * try_rgrp_unlink - Look for any unlinked, allocated, but unused inodes
  * @rgd: The rgrp
  *
- * Returns: The inode, if one has been found
+ * Returns: 0 if no error
+ *          The inode, if one has been found, in inode.
  */
 
-static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked,
-				     u64 skip)
+static u64 try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked,
+			   u64 skip)
 {
-	struct inode *inode;
 	u32 goal = 0, block;
 	u64 no_addr;
 	struct gfs2_sbd *sdp = rgd->rd_sbd;
@@ -983,14 +980,11 @@ static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked,
 		if (no_addr == skip)
 			continue;
 		*last_unlinked = no_addr;
-		inode = gfs2_inode_lookup(rgd->rd_sbd->sd_vfs, DT_UNKNOWN,
-					  no_addr, -1, 1);
-		if (!IS_ERR(inode))
-			return inode;
+		return no_addr;
 	}
 
 	rgd->rd_flags &= ~GFS2_RDF_CHECK;
-	return NULL;
+	return 0;
 }
 
 /**
@@ -1071,11 +1065,12 @@ static void forward_rgrp_set(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd)
  * Try to acquire rgrp in way which avoids contending with others.
  *
  * Returns: errno
+ *          unlinked: the block address of an unlinked block to be reclaimed
  */
 
-static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
+static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked,
+			  u64 *last_unlinked)
 {
-	struct inode *inode = NULL;
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct gfs2_rgrpd *rgd, *begin = NULL;
 	struct gfs2_alloc *al = ip->i_alloc;
@@ -1084,6 +1079,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
 	int loops = 0;
 	int error, rg_locked;
 
+	*unlinked = 0;
 	rgd = gfs2_blk2rgrpd(sdp, ip->i_goal);
 
 	while (rgd) {
@@ -1100,19 +1096,24 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
 		case 0:
 			if (try_rgrp_fit(rgd, al))
 				goto out;
-			if (rgd->rd_flags & GFS2_RDF_CHECK)
-				inode = try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr);
+			/* If the rg came in already locked, there's no
+			   way we can recover from a failed try_rgrp_unlink
+			   because that would require an iput which can only
+			   happen after the rgrp is unlocked. */
+			if (!rg_locked && rgd->rd_flags & GFS2_RDF_CHECK)
+				*unlinked = try_rgrp_unlink(rgd, last_unlinked,
+							   ip->i_no_addr);
 			if (!rg_locked)
 				gfs2_glock_dq_uninit(&al->al_rgd_gh);
-			if (inode)
-				return inode;
+			if (*unlinked)
+				return -EAGAIN;
 			/* fall through */
 		case GLR_TRYFAILED:
 			rgd = recent_rgrp_next(rgd);
 			break;
 
 		default:
-			return ERR_PTR(error);
+			return error;
 		}
 	}
 
@@ -1134,12 +1135,13 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
 		case 0:
 			if (try_rgrp_fit(rgd, al))
 				goto out;
-			if (rgd->rd_flags & GFS2_RDF_CHECK)
-				inode = try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr);
+			if (!rg_locked && rgd->rd_flags & GFS2_RDF_CHECK)
+				*unlinked = try_rgrp_unlink(rgd, last_unlinked,
+							    ip->i_no_addr);
 			if (!rg_locked)
 				gfs2_glock_dq_uninit(&al->al_rgd_gh);
-			if (inode)
-				return inode;
+			if (*unlinked)
+				return -EAGAIN;
 			break;
 
 		case GLR_TRYFAILED:
@@ -1147,7 +1149,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
 			break;
 
 		default:
-			return ERR_PTR(error);
+			return error;
 		}
 
 		rgd = gfs2_rgrpd_get_next(rgd);
@@ -1156,7 +1158,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
 
 		if (rgd == begin) {
 			if (++loops >= 3)
-				return ERR_PTR(-ENOSPC);
+				return -ENOSPC;
 			if (!skipped)
 				loops++;
 			flags = 0;
@@ -1176,7 +1178,7 @@ out:
 		forward_rgrp_set(sdp, rgd);
 	}
 
-	return NULL;
+	return 0;
 }
 
 /**
@@ -1190,9 +1192,8 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct gfs2_alloc *al = ip->i_alloc;
-	struct inode *inode;
 	int error = 0;
-	u64 last_unlinked = NO_BLOCK;
+	u64 last_unlinked = NO_BLOCK, unlinked;
 
 	if (gfs2_assert_warn(sdp, al->al_requested))
 		return -EINVAL;
@@ -1208,17 +1209,27 @@ try_again:
 	if (error)
 		return error;
 
-	inode = get_local_rgrp(ip, &last_unlinked);
-	if (inode) {
+	/* Find an rgrp suitable for allocation.  If it encounters any unlinked
+	   dinodes along the way, error will equal -EAGAIN and unlinked will
+	   contains it block address. We then need to look up that inode and
+	   try to free it, and try the allocation again. */
+	error = get_local_rgrp(ip, &unlinked, &last_unlinked);
+	if (error) {
 		if (ip != GFS2_I(sdp->sd_rindex))
 			gfs2_glock_dq_uninit(&al->al_ri_gh);
-		if (IS_ERR(inode))
-			return PTR_ERR(inode);
-		iput(inode);
+		if (error != -EAGAIN)
+			return error;
+
+		gfs2_process_unlinked_inode(ip->i_inode.i_sb, unlinked);
+		/* regardless of whether or not gfs2_process_unlinked_inode
+		   was successful, we don't want to repeat it again. */
+		last_unlinked = unlinked;
 		gfs2_log_flush(sdp, NULL);
+		error = 0;
+
 		goto try_again;
 	}
-
+	/* no error, so we have the rgrp set in the inode's allocation. */
 	al->al_file = file;
 	al->al_line = line;
 
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index b4106ddaaa98..f07119d89557 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -10,6 +10,8 @@
 #ifndef __RGRP_DOT_H__
 #define __RGRP_DOT_H__
 
+#include <linux/slab.h>
+
 struct gfs2_rgrpd;
 struct gfs2_sbd;
 struct gfs2_holder;
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index c282ad41f3d1..4d1aad38f1b1 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -21,6 +21,8 @@
 #include <linux/gfs2_ondisk.h>
 #include <linux/crc32.h>
 #include <linux/time.h>
+#include <linux/wait.h>
+#include <linux/writeback.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -710,7 +712,7 @@ void gfs2_unfreeze_fs(struct gfs2_sbd *sdp)
  * Returns: errno
  */
 
-static int gfs2_write_inode(struct inode *inode, int sync)
+static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
@@ -721,8 +723,7 @@ static int gfs2_write_inode(struct inode *inode, int sync)
 	int ret = 0;
 
 	/* Check this is a "normal" inode, etc */
-	if (!test_bit(GIF_USER, &ip->i_flags) ||
-	    (current->flags & PF_MEMALLOC))
+	if (current->flags & PF_MEMALLOC)
 		return 0;
 	ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
 	if (ret)
@@ -745,7 +746,7 @@ static int gfs2_write_inode(struct inode *inode, int sync)
 do_unlock:
 	gfs2_glock_dq_uninit(&gh);
 do_flush:
-	if (sync != 0)
+	if (wbc->sync_mode == WB_SYNC_ALL)
 		gfs2_log_flush(GFS2_SB(inode), ip->i_gl);
 	return ret;
 }
@@ -763,7 +764,7 @@ static int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
 	int error;
 
 	flush_workqueue(gfs2_delete_workqueue);
-	gfs2_quota_sync(sdp->sd_vfs, 0);
+	gfs2_quota_sync(sdp->sd_vfs, 0, 1);
 	gfs2_statfs_sync(sdp->sd_vfs, 0);
 
 	error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, GL_NOCACHE,
@@ -859,6 +860,7 @@ restart:
 	gfs2_clear_rgrpd(sdp);
 	gfs2_jindex_free(sdp);
 	/*  Take apart glock structures and buffer lists  */
+	invalidate_inodes(sdp->sd_vfs);
 	gfs2_gl_hash_clear(sdp);
 	/*  Unmount the locking protocol  */
 	gfs2_lm_unmount(sdp);
@@ -1111,7 +1113,7 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
 	int error;
 
 	spin_lock(&gt->gt_spin);
-	args.ar_commit = gt->gt_log_flush_secs;
+	args.ar_commit = gt->gt_logd_secs;
 	args.ar_quota_quantum = gt->gt_quota_quantum;
 	if (gt->gt_statfs_slow)
 		args.ar_statfs_quantum = 0;
@@ -1158,7 +1160,7 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
 	else
 		clear_bit(SDF_NOBARRIERS, &sdp->sd_flags);
 	spin_lock(&gt->gt_spin);
-	gt->gt_log_flush_secs = args.ar_commit;
+	gt->gt_logd_secs = args.ar_commit;
 	gt->gt_quota_quantum = args.ar_quota_quantum;
 	if (args.ar_statfs_quantum) {
 		gt->gt_statfs_slow = 0;
@@ -1193,7 +1195,7 @@ static void gfs2_drop_inode(struct inode *inode)
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
 
-	if (test_bit(GIF_USER, &ip->i_flags) && inode->i_nlink) {
+	if (inode->i_nlink) {
 		struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl;
 		if (gl && test_bit(GLF_DEMOTE, &gl->gl_flags))
 			clear_nlink(inode);
@@ -1211,18 +1213,12 @@ static void gfs2_clear_inode(struct inode *inode)
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
 
-	/* This tells us its a "real" inode and not one which only
-	 * serves to contain an address space (see rgrp.c, meta_io.c)
-	 * which therefore doesn't have its own glocks.
-	 */
-	if (test_bit(GIF_USER, &ip->i_flags)) {
-		ip->i_gl->gl_object = NULL;
-		gfs2_glock_put(ip->i_gl);
-		ip->i_gl = NULL;
-		if (ip->i_iopen_gh.gh_gl) {
-			ip->i_iopen_gh.gh_gl->gl_object = NULL;
-			gfs2_glock_dq_uninit(&ip->i_iopen_gh);
-		}
+	ip->i_gl->gl_object = NULL;
+	gfs2_glock_put(ip->i_gl);
+	ip->i_gl = NULL;
+	if (ip->i_iopen_gh.gh_gl) {
+		ip->i_iopen_gh.gh_gl->gl_object = NULL;
+		gfs2_glock_dq_uninit(&ip->i_iopen_gh);
 	}
 }
 
@@ -1309,8 +1305,8 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
 	}
 	if (args->ar_discard)
 		seq_printf(s, ",discard");
-	val = sdp->sd_tune.gt_log_flush_secs;
-	if (val != 60)
+	val = sdp->sd_tune.gt_logd_secs;
+	if (val != 30)
 		seq_printf(s, ",commit=%d", val);
 	val = sdp->sd_tune.gt_statfs_quantum;
 	if (val != 30)
@@ -1338,7 +1334,8 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
 	}
 	if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags))
 		seq_printf(s, ",nobarrier");
-
+	if (test_bit(SDF_DEMOTE, &sdp->sd_flags))
+		seq_printf(s, ",demote_interface_used");
 	return 0;
 }
 
@@ -1357,9 +1354,6 @@ static void gfs2_delete_inode(struct inode *inode)
 	struct gfs2_holder gh;
 	int error;
 
-	if (!test_bit(GIF_USER, &ip->i_flags))
-		goto out;
-
 	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
 	if (unlikely(error)) {
 		gfs2_glock_dq_uninit(&ip->i_iopen_gh);
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index 3df60f2d84e3..a0464680af0b 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -54,7 +54,7 @@ extern struct file_system_type gfs2meta_fs_type;
 extern const struct export_operations gfs2_export_ops;
 extern const struct super_operations gfs2_super_ops;
 extern const struct dentry_operations gfs2_dops;
-extern struct xattr_handler *gfs2_xattr_handlers[];
+extern const struct xattr_handler *gfs2_xattr_handlers[];
 
 #endif /* __SUPER_DOT_H__ */
 
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 0dc34621f6a6..37f5393e68e6 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -8,7 +8,6 @@
  */
 
 #include <linux/sched.h>
-#include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/completion.h>
 #include <linux/buffer_head.h>
@@ -49,7 +48,7 @@ static ssize_t gfs2_attr_store(struct kobject *kobj, struct attribute *attr,
 	return a->store ? a->store(sdp, buf, len) : len;
 }
 
-static struct sysfs_ops gfs2_attr_ops = {
+static const struct sysfs_ops gfs2_attr_ops = {
 	.show  = gfs2_attr_show,
 	.store = gfs2_attr_store,
 };
@@ -167,7 +166,7 @@ static ssize_t quota_sync_store(struct gfs2_sbd *sdp, const char *buf,
 	if (simple_strtol(buf, NULL, 0) != 1)
 		return -EINVAL;
 
-	gfs2_quota_sync(sdp->sd_vfs, 0);
+	gfs2_quota_sync(sdp->sd_vfs, 0, 1);
 	return len;
 }
 
@@ -233,6 +232,8 @@ static ssize_t demote_rq_store(struct gfs2_sbd *sdp, const char *buf, size_t len
 	glops = gfs2_glops_list[gltype];
 	if (glops == NULL)
 		return -EINVAL;
+	if (!test_and_set_bit(SDF_DEMOTE, &sdp->sd_flags))
+		fs_info(sdp, "demote interface used\n");
 	rv = gfs2_glock_get(sdp, glnum, glops, 0, &gl);
 	if (rv)
 		return rv;
@@ -469,8 +470,6 @@ static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\
 }                                                                             \
 TUNE_ATTR_2(name, name##_store)
 
-TUNE_ATTR(incore_log_blocks, 0);
-TUNE_ATTR(log_flush_secs, 0);
 TUNE_ATTR(quota_warn_period, 0);
 TUNE_ATTR(quota_quantum, 0);
 TUNE_ATTR(max_readahead, 0);
@@ -478,20 +477,16 @@ TUNE_ATTR(complain_secs, 0);
 TUNE_ATTR(statfs_slow, 0);
 TUNE_ATTR(new_files_jdata, 0);
 TUNE_ATTR(quota_simul_sync, 1);
-TUNE_ATTR(stall_secs, 1);
 TUNE_ATTR(statfs_quantum, 1);
 TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store);
 
 static struct attribute *tune_attrs[] = {
-	&tune_attr_incore_log_blocks.attr,
-	&tune_attr_log_flush_secs.attr,
 	&tune_attr_quota_warn_period.attr,
 	&tune_attr_quota_quantum.attr,
 	&tune_attr_max_readahead.attr,
 	&tune_attr_complain_secs.attr,
 	&tune_attr_statfs_slow.attr,
 	&tune_attr_quota_simul_sync.attr,
-	&tune_attr_stall_secs.attr,
 	&tune_attr_statfs_quantum.attr,
 	&tune_attr_quota_scale.attr,
 	&tune_attr_new_files_jdata.attr,
@@ -576,7 +571,7 @@ static int gfs2_uevent(struct kset *kset, struct kobject *kobj,
 	return 0;
 }
 
-static struct kset_uevent_ops gfs2_uevent_ops = {
+static const struct kset_uevent_ops gfs2_uevent_ops = {
 	.uevent = gfs2_uevent,
 };
 
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index 4ef0e9fa3549..9ec73a854111 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -23,6 +23,7 @@
 #include "meta_io.h"
 #include "trans.h"
 #include "util.h"
+#include "trace_gfs2.h"
 
 int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
 		     unsigned int revokes)
@@ -75,6 +76,23 @@ fail_holder_uninit:
 	return error;
 }
 
+/**
+ * gfs2_log_release - Release a given number of log blocks
+ * @sdp: The GFS2 superblock
+ * @blks: The number of blocks
+ *
+ */
+
+static void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks)
+{
+
+	atomic_add(blks, &sdp->sd_log_blks_free);
+	trace_gfs2_log_blocks(sdp, blks);
+	gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <=
+				  sdp->sd_jdesc->jd_blocks);
+	up_read(&sdp->sd_log_flush_lock);
+}
+
 void gfs2_trans_end(struct gfs2_sbd *sdp)
 {
 	struct gfs2_trans *tr = current->journal_info;
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index f6a7efa34eb9..53511291fe36 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -7,7 +7,6 @@
  * of the GNU General Public License version 2.
  */
 
-#include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/completion.h>
 #include <linux/buffer_head.h>
@@ -21,6 +20,7 @@
 #include "util.h"
 
 struct kmem_cache *gfs2_glock_cachep __read_mostly;
+struct kmem_cache *gfs2_glock_aspace_cachep __read_mostly;
 struct kmem_cache *gfs2_inode_cachep __read_mostly;
 struct kmem_cache *gfs2_bufdata_cachep __read_mostly;
 struct kmem_cache *gfs2_rgrpd_cachep __read_mostly;
diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h
index 33e96b0ce9ab..b432e04600de 100644
--- a/fs/gfs2/util.h
+++ b/fs/gfs2/util.h
@@ -145,6 +145,7 @@ gfs2_io_error_bh_i((sdp), (bh), __func__, __FILE__, __LINE__);
 
 
 extern struct kmem_cache *gfs2_glock_cachep;
+extern struct kmem_cache *gfs2_glock_aspace_cachep;
 extern struct kmem_cache *gfs2_inode_cachep;
 extern struct kmem_cache *gfs2_bufdata_cachep;
 extern struct kmem_cache *gfs2_rgrpd_cachep;
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index c2ebdf2c01d4..82f93da00d1b 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -1535,21 +1535,21 @@ out_alloc:
 	return error;
 }
 
-static struct xattr_handler gfs2_xattr_user_handler = {
+static const struct xattr_handler gfs2_xattr_user_handler = {
 	.prefix = XATTR_USER_PREFIX,
 	.flags  = GFS2_EATYPE_USR,
 	.get    = gfs2_xattr_get,
 	.set    = gfs2_xattr_set,
 };
 
-static struct xattr_handler gfs2_xattr_security_handler = {
+static const struct xattr_handler gfs2_xattr_security_handler = {
 	.prefix = XATTR_SECURITY_PREFIX,
 	.flags  = GFS2_EATYPE_SECURITY,
 	.get    = gfs2_xattr_get,
 	.set    = gfs2_xattr_set,
 };
 
-struct xattr_handler *gfs2_xattr_handlers[] = {
+const struct xattr_handler *gfs2_xattr_handlers[] = {
 	&gfs2_xattr_user_handler,
 	&gfs2_xattr_security_handler,
 	&gfs2_xattr_system_handler,
diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c
index 0d200068d0af..cdb41a1f6a64 100644
--- a/fs/hfs/bnode.c
+++ b/fs/hfs/bnode.c
@@ -9,6 +9,7 @@
  */
 
 #include <linux/pagemap.h>
+#include <linux/slab.h>
 #include <linux/swap.h>
 
 #include "btree.h"
diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c
index 052f214ea6f0..38a0a9917d7f 100644
--- a/fs/hfs/btree.c
+++ b/fs/hfs/btree.c
@@ -9,6 +9,7 @@
  */
 
 #include <linux/pagemap.h>
+#include <linux/slab.h>
 #include <linux/log2.h>
 
 #include "btree.h"
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index 052387e11671..fe35e3b626c4 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -188,7 +188,7 @@ extern const struct address_space_operations hfs_btree_aops;
 
 extern struct inode *hfs_new_inode(struct inode *, struct qstr *, int);
 extern void hfs_inode_write_fork(struct inode *, struct hfs_extent *, __be32 *, __be32 *);
-extern int hfs_write_inode(struct inode *, int);
+extern int hfs_write_inode(struct inode *, struct writeback_control *);
 extern int hfs_inode_setattr(struct dentry *, struct iattr *);
 extern void hfs_inode_read_fork(struct inode *inode, struct hfs_extent *ext,
 			__be32 log_size, __be32 phys_size, u32 clump_size);
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index a1cbff2b4d99..14f5cb1b9fdc 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -381,7 +381,7 @@ void hfs_inode_write_fork(struct inode *inode, struct hfs_extent *ext,
 					 HFS_SB(inode->i_sb)->alloc_blksz);
 }
 
-int hfs_write_inode(struct inode *inode, int unused)
+int hfs_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
 	struct inode *main_inode = inode;
 	struct hfs_find_data fd;
diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c
index 8bbe03c3f6d5..86428f5ac991 100644
--- a/fs/hfs/mdb.c
+++ b/fs/hfs/mdb.c
@@ -11,6 +11,7 @@
 #include <linux/cdrom.h>
 #include <linux/genhd.h>
 #include <linux/nls.h>
+#include <linux/slab.h>
 
 #include "hfs_fs.h"
 #include "btree.h"
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 5ed7252b7b23..0a81eb7111f3 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -19,6 +19,7 @@
 #include <linux/nls.h>
 #include <linux/parser.h>
 #include <linux/seq_file.h>
+#include <linux/slab.h>
 #include <linux/smp_lock.h>
 #include <linux/vfs.h>
 
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 5f4023678251..764fd1bdca88 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -494,7 +494,7 @@ const struct inode_operations hfsplus_dir_inode_operations = {
 const struct file_operations hfsplus_dir_operations = {
 	.read		= generic_read_dir,
 	.readdir	= hfsplus_readdir,
-	.ioctl          = hfsplus_ioctl,
+	.unlocked_ioctl = hfsplus_ioctl,
 	.llseek		= generic_file_llseek,
 	.release	= hfsplus_dir_release,
 };
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 5c10d803d9df..6505c30ad965 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -337,8 +337,7 @@ struct inode *hfsplus_new_inode(struct super_block *, int);
 void hfsplus_delete_inode(struct inode *);
 
 /* ioctl.c */
-int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
-		  unsigned long arg);
+long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
 int hfsplus_setxattr(struct dentry *dentry, const char *name,
 		     const void *value, size_t size, int flags);
 ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name,
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 1bcf597c0562..9bbb82924a22 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -285,7 +285,7 @@ static const struct file_operations hfsplus_file_operations = {
 	.fsync		= file_fsync,
 	.open		= hfsplus_file_open,
 	.release	= hfsplus_file_release,
-	.ioctl          = hfsplus_ioctl,
+	.unlocked_ioctl = hfsplus_ioctl,
 };
 
 struct inode *hfsplus_new_inode(struct super_block *sb, int mode)
diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c
index f457d2ca51ab..ac405f099026 100644
--- a/fs/hfsplus/ioctl.c
+++ b/fs/hfsplus/ioctl.c
@@ -17,14 +17,16 @@
 #include <linux/mount.h>
 #include <linux/sched.h>
 #include <linux/xattr.h>
+#include <linux/smp_lock.h>
 #include <asm/uaccess.h>
 #include "hfsplus_fs.h"
 
-int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
-		  unsigned long arg)
+long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
+	struct inode *inode = filp->f_path.dentry->d_inode;
 	unsigned int flags;
 
+	lock_kernel();
 	switch (cmd) {
 	case HFSPLUS_IOC_EXT2_GETFLAGS:
 		flags = 0;
@@ -38,8 +40,10 @@ int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
 	case HFSPLUS_IOC_EXT2_SETFLAGS: {
 		int err = 0;
 		err = mnt_want_write(filp->f_path.mnt);
-		if (err)
+		if (err) {
+			unlock_kernel();
 			return err;
+		}
 
 		if (!is_owner_or_cap(inode)) {
 			err = -EACCES;
@@ -85,9 +89,11 @@ int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
 		mark_inode_dirty(inode);
 setflags_out:
 		mnt_drop_write(filp->f_path.mnt);
+		unlock_kernel();
 		return err;
 	}
 	default:
+		unlock_kernel();
 		return -ENOTTY;
 	}
 }
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c
index 3fcbb0e1f6fc..572628b4b07d 100644
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -15,6 +15,7 @@
 #include <linux/nls.h>
 #include <linux/mount.h>
 #include <linux/seq_file.h>
+#include <linux/slab.h>
 #include "hfsplus_fs.h"
 
 enum {
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 43022f3d5148..74b473a8ef92 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -87,7 +87,8 @@ bad_inode:
 	return ERR_PTR(err);
 }
 
-static int hfsplus_write_inode(struct inode *inode, int unused)
+static int hfsplus_write_inode(struct inode *inode,
+		struct writeback_control *wbc)
 {
 	struct hfsplus_vh *vhdr;
 	int ret = 0;
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 032604e5ef2c..87ac1891a185 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -11,6 +11,7 @@
 #include <linux/mm.h>
 #include <linux/pagemap.h>
 #include <linux/statfs.h>
+#include <linux/slab.h>
 #include <linux/seq_file.h>
 #include <linux/mount.h>
 #include "hostfs.h"
@@ -410,9 +411,9 @@ int hostfs_file_open(struct inode *ino, struct file *file)
 	return 0;
 }
 
-int hostfs_fsync(struct file *file, struct dentry *dentry, int datasync)
+int hostfs_fsync(struct file *file, int datasync)
 {
-	return fsync_file(HOSTFS_I(dentry->d_inode)->fd, datasync);
+	return fsync_file(HOSTFS_I(file->f_mapping->host)->fd, datasync);
 }
 
 static const struct file_operations hostfs_file_fops = {
diff --git a/fs/hpfs/anode.c b/fs/hpfs/anode.c
index 1aa88c4e0964..6a2f04bf3df0 100644
--- a/fs/hpfs/anode.c
+++ b/fs/hpfs/anode.c
@@ -353,7 +353,7 @@ int hpfs_ea_read(struct super_block *s, secno a, int ano, unsigned pos,
 }
 
 int hpfs_ea_write(struct super_block *s, secno a, int ano, unsigned pos,
-	     unsigned len, char *buf)
+	     unsigned len, const char *buf)
 {
 	struct buffer_head *bh;
 	char *data;
diff --git a/fs/hpfs/buffer.c b/fs/hpfs/buffer.c
index b6fca543544c..eac5f96323e3 100644
--- a/fs/hpfs/buffer.c
+++ b/fs/hpfs/buffer.c
@@ -6,6 +6,7 @@
  *  general buffer i/o
  */
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include "hpfs_fn.h"
 
 void hpfs_lock_creation(struct super_block *s)
diff --git a/fs/hpfs/dentry.c b/fs/hpfs/dentry.c
index 940d6d150bee..67d9d36b3d5f 100644
--- a/fs/hpfs/dentry.c
+++ b/fs/hpfs/dentry.c
@@ -20,8 +20,8 @@ static int hpfs_hash_dentry(struct dentry *dentry, struct qstr *qstr)
 
 	if (l == 1) if (qstr->name[0]=='.') goto x;
 	if (l == 2) if (qstr->name[0]=='.' || qstr->name[1]=='.') goto x;
-	hpfs_adjust_length((char *)qstr->name, &l);
-	/*if (hpfs_chk_name((char *)qstr->name,&l))*/
+	hpfs_adjust_length(qstr->name, &l);
+	/*if (hpfs_chk_name(qstr->name,&l))*/
 		/*return -ENAMETOOLONG;*/
 		/*return -ENOENT;*/
 	x:
@@ -38,14 +38,16 @@ static int hpfs_compare_dentry(struct dentry *dentry, struct qstr *a, struct qst
 {
 	unsigned al=a->len;
 	unsigned bl=b->len;
-	hpfs_adjust_length((char *)a->name, &al);
-	/*hpfs_adjust_length((char *)b->name, &bl);*/
+	hpfs_adjust_length(a->name, &al);
+	/*hpfs_adjust_length(b->name, &bl);*/
 	/* 'a' is the qstr of an already existing dentry, so the name
 	 * must be valid. 'b' must be validated first.
 	 */
 
-	if (hpfs_chk_name((char *)b->name, &bl)) return 1;
-	if (hpfs_compare_names(dentry->d_sb, (char *)a->name, al, (char *)b->name, bl, 0)) return 1;
+	if (hpfs_chk_name(b->name, &bl))
+		return 1;
+	if (hpfs_compare_names(dentry->d_sb, a->name, al, b->name, bl, 0))
+		return 1;
 	return 0;
 }
 
diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c
index 8865c94f55f6..2338130cceba 100644
--- a/fs/hpfs/dir.c
+++ b/fs/hpfs/dir.c
@@ -7,6 +7,7 @@
  */
 
 #include <linux/smp_lock.h>
+#include <linux/slab.h>
 #include "hpfs_fn.h"
 
 static int hpfs_dir_release(struct inode *inode, struct file *filp)
@@ -59,7 +60,7 @@ static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	struct hpfs_dirent *de;
 	int lc;
 	long old_pos;
-	char *tempname;
+	unsigned char *tempname;
 	int c1, c2 = 0;
 	int ret = 0;
 
@@ -158,11 +159,11 @@ static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		tempname = hpfs_translate_name(inode->i_sb, de->name, de->namelen, lc, de->not_8x3);
 		if (filldir(dirent, tempname, de->namelen, old_pos, de->fnode, DT_UNKNOWN) < 0) {
 			filp->f_pos = old_pos;
-			if (tempname != (char *)de->name) kfree(tempname);
+			if (tempname != de->name) kfree(tempname);
 			hpfs_brelse4(&qbh);
 			goto out;
 		}
-		if (tempname != (char *)de->name) kfree(tempname);
+		if (tempname != de->name) kfree(tempname);
 		hpfs_brelse4(&qbh);
 	}
 out:
@@ -187,7 +188,7 @@ out:
 
 struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
 {
-	const char *name = dentry->d_name.name;
+	const unsigned char *name = dentry->d_name.name;
 	unsigned len = dentry->d_name.len;
 	struct quad_buffer_head qbh;
 	struct hpfs_dirent *de;
@@ -197,7 +198,7 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct name
 	struct hpfs_inode_info *hpfs_result;
 
 	lock_kernel();
-	if ((err = hpfs_chk_name((char *)name, &len))) {
+	if ((err = hpfs_chk_name(name, &len))) {
 		if (err == -ENAMETOOLONG) {
 			unlock_kernel();
 			return ERR_PTR(-ENAMETOOLONG);
@@ -209,7 +210,7 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct name
 	 * '.' and '..' will never be passed here.
 	 */
 
-	de = map_dirent(dir, hpfs_i(dir)->i_dno, (char *) name, len, NULL, &qbh);
+	de = map_dirent(dir, hpfs_i(dir)->i_dno, name, len, NULL, &qbh);
 
 	/*
 	 * This is not really a bailout, just means file not found.
@@ -250,7 +251,7 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct name
 	hpfs_result = hpfs_i(result);
 	if (!de->directory) hpfs_result->i_parent_dir = dir->i_ino;
 
-	hpfs_decide_conv(result, (char *)name, len);
+	hpfs_decide_conv(result, name, len);
 
 	if (de->has_acl || de->has_xtd_perm) if (!(dir->i_sb->s_flags & MS_RDONLY)) {
 		hpfs_error(result->i_sb, "ACLs or XPERM found. This is probably HPFS386. This driver doesn't support it now. Send me some info on these structures");
diff --git a/fs/hpfs/dnode.c b/fs/hpfs/dnode.c
index fe83c2b7d2d8..9b2ffadfc8c4 100644
--- a/fs/hpfs/dnode.c
+++ b/fs/hpfs/dnode.c
@@ -158,7 +158,8 @@ static void set_last_pointer(struct super_block *s, struct dnode *d, dnode_secno
 
 /* Add an entry to dnode and don't care if it grows over 2048 bytes */
 
-struct hpfs_dirent *hpfs_add_de(struct super_block *s, struct dnode *d, unsigned char *name,
+struct hpfs_dirent *hpfs_add_de(struct super_block *s, struct dnode *d,
+				const unsigned char *name,
 				unsigned namelen, secno down_ptr)
 {
 	struct hpfs_dirent *de;
@@ -223,7 +224,7 @@ static void fix_up_ptrs(struct super_block *s, struct dnode *d)
 /* Add an entry to dnode and do dnode splitting if required */
 
 static int hpfs_add_to_dnode(struct inode *i, dnode_secno dno,
-			     unsigned char *name, unsigned namelen,
+			     const unsigned char *name, unsigned namelen,
 			     struct hpfs_dirent *new_de, dnode_secno down_ptr)
 {
 	struct quad_buffer_head qbh, qbh1, qbh2;
@@ -231,7 +232,7 @@ static int hpfs_add_to_dnode(struct inode *i, dnode_secno dno,
 	dnode_secno adno, rdno;
 	struct hpfs_dirent *de;
 	struct hpfs_dirent nde;
-	char *nname;
+	unsigned char *nname;
 	int h;
 	int pos;
 	struct buffer_head *bh;
@@ -305,7 +306,9 @@ static int hpfs_add_to_dnode(struct inode *i, dnode_secno dno,
 		pos++;
 	}
 	copy_de(new_de = &nde, de);
-	memcpy(name = nname, de->name, namelen = de->namelen);
+	memcpy(nname, de->name, de->namelen);
+	name = nname;
+	namelen = de->namelen;
 	for_all_poss(i, hpfs_pos_subst, ((loff_t)dno << 4) | pos, 4);
 	down_ptr = adno;
 	set_last_pointer(i->i_sb, ad, de->down ? de_down_pointer(de) : 0);
@@ -368,7 +371,8 @@ static int hpfs_add_to_dnode(struct inode *i, dnode_secno dno,
  * I hope, now it's finally bug-free.
  */
 
-int hpfs_add_dirent(struct inode *i, unsigned char *name, unsigned namelen,
+int hpfs_add_dirent(struct inode *i,
+		    const unsigned char *name, unsigned namelen,
 		    struct hpfs_dirent *new_de, int cdepth)
 {
 	struct hpfs_inode_info *hpfs_inode = hpfs_i(i);
@@ -897,7 +901,8 @@ struct hpfs_dirent *map_pos_dirent(struct inode *inode, loff_t *posp,
 
 /* Find a dirent in tree */
 
-struct hpfs_dirent *map_dirent(struct inode *inode, dnode_secno dno, char *name, unsigned len,
+struct hpfs_dirent *map_dirent(struct inode *inode, dnode_secno dno,
+			       const unsigned char *name, unsigned len,
 			       dnode_secno *dd, struct quad_buffer_head *qbh)
 {
 	struct dnode *dnode;
@@ -988,8 +993,8 @@ void hpfs_remove_dtree(struct super_block *s, dnode_secno dno)
 struct hpfs_dirent *map_fnode_dirent(struct super_block *s, fnode_secno fno,
 				     struct fnode *f, struct quad_buffer_head *qbh)
 {
-	char *name1;
-	char *name2;
+	unsigned char *name1;
+	unsigned char *name2;
 	int name1len, name2len;
 	struct dnode *d;
 	dnode_secno dno, downd;
diff --git a/fs/hpfs/ea.c b/fs/hpfs/ea.c
index 547a8384571f..45e53d972b42 100644
--- a/fs/hpfs/ea.c
+++ b/fs/hpfs/ea.c
@@ -62,8 +62,8 @@ static char *get_indirect_ea(struct super_block *s, int ano, secno a, int size)
 	return ret;
 }
 
-static void set_indirect_ea(struct super_block *s, int ano, secno a, char *data,
-			    int size)
+static void set_indirect_ea(struct super_block *s, int ano, secno a,
+			    const char *data, int size)
 {
 	hpfs_ea_write(s, a, ano, 0, size, data);
 }
@@ -186,7 +186,8 @@ char *hpfs_get_ea(struct super_block *s, struct fnode *fnode, char *key, int *si
  * This driver can't change sizes of eas ('cause I just don't need it).
  */
 
-void hpfs_set_ea(struct inode *inode, struct fnode *fnode, char *key, char *data, int size)
+void hpfs_set_ea(struct inode *inode, struct fnode *fnode, const char *key,
+		 const char *data, int size)
 {
 	fnode_secno fno = inode->i_ino;
 	struct super_block *s = inode->i_sb;
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index 3efabff00367..a9ae9bfa752f 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -19,9 +19,9 @@ static int hpfs_file_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
-int hpfs_file_fsync(struct file *file, struct dentry *dentry, int datasync)
+int hpfs_file_fsync(struct file *file, int datasync)
 {
-	/*return file_fsync(file, dentry);*/
+	/*return file_fsync(file, datasync);*/
 	return 0; /* Don't fsync :-) */
 }
 
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index 701ca54c0867..75f9d4324851 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -215,7 +215,7 @@ secno hpfs_bplus_lookup(struct super_block *, struct inode *, struct bplus_heade
 secno hpfs_add_sector_to_btree(struct super_block *, secno, int, unsigned);
 void hpfs_remove_btree(struct super_block *, struct bplus_header *);
 int hpfs_ea_read(struct super_block *, secno, int, unsigned, unsigned, char *);
-int hpfs_ea_write(struct super_block *, secno, int, unsigned, unsigned, char *);
+int hpfs_ea_write(struct super_block *, secno, int, unsigned, unsigned, const char *);
 void hpfs_ea_remove(struct super_block *, secno, int, unsigned);
 void hpfs_truncate_btree(struct super_block *, secno, int, unsigned);
 void hpfs_remove_fnode(struct super_block *, fnode_secno fno);
@@ -244,13 +244,17 @@ extern const struct file_operations hpfs_dir_ops;
 
 void hpfs_add_pos(struct inode *, loff_t *);
 void hpfs_del_pos(struct inode *, loff_t *);
-struct hpfs_dirent *hpfs_add_de(struct super_block *, struct dnode *, unsigned char *, unsigned, secno);
-int hpfs_add_dirent(struct inode *, unsigned char *, unsigned, struct hpfs_dirent *, int);
+struct hpfs_dirent *hpfs_add_de(struct super_block *, struct dnode *,
+				const unsigned char *, unsigned, secno);
+int hpfs_add_dirent(struct inode *, const unsigned char *, unsigned,
+		    struct hpfs_dirent *, int);
 int hpfs_remove_dirent(struct inode *, dnode_secno, struct hpfs_dirent *, struct quad_buffer_head *, int);
 void hpfs_count_dnodes(struct super_block *, dnode_secno, int *, int *, int *);
 dnode_secno hpfs_de_as_down_as_possible(struct super_block *, dnode_secno dno);
 struct hpfs_dirent *map_pos_dirent(struct inode *, loff_t *, struct quad_buffer_head *);
-struct hpfs_dirent *map_dirent(struct inode *, dnode_secno, char *, unsigned, dnode_secno *, struct quad_buffer_head *);
+struct hpfs_dirent *map_dirent(struct inode *, dnode_secno,
+			       const unsigned char *, unsigned, dnode_secno *,
+			       struct quad_buffer_head *);
 void hpfs_remove_dtree(struct super_block *, dnode_secno);
 struct hpfs_dirent *map_fnode_dirent(struct super_block *, fnode_secno, struct fnode *, struct quad_buffer_head *);
 
@@ -259,11 +263,12 @@ struct hpfs_dirent *map_fnode_dirent(struct super_block *, fnode_secno, struct f
 void hpfs_ea_ext_remove(struct super_block *, secno, int, unsigned);
 int hpfs_read_ea(struct super_block *, struct fnode *, char *, char *, int);
 char *hpfs_get_ea(struct super_block *, struct fnode *, char *, int *);
-void hpfs_set_ea(struct inode *, struct fnode *, char *, char *, int);
+void hpfs_set_ea(struct inode *, struct fnode *, const char *,
+		 const char *, int);
 
 /* file.c */
 
-int hpfs_file_fsync(struct file *, struct dentry *, int);
+int hpfs_file_fsync(struct file *, int);
 extern const struct file_operations hpfs_file_ops;
 extern const struct inode_operations hpfs_file_iops;
 extern const struct address_space_operations hpfs_aops;
@@ -282,7 +287,7 @@ void hpfs_delete_inode(struct inode *);
 
 unsigned *hpfs_map_dnode_bitmap(struct super_block *, struct quad_buffer_head *);
 unsigned *hpfs_map_bitmap(struct super_block *, unsigned, struct quad_buffer_head *, char *);
-char *hpfs_load_code_page(struct super_block *, secno);
+unsigned char *hpfs_load_code_page(struct super_block *, secno);
 secno *hpfs_load_bitmap_directory(struct super_block *, secno bmp);
 struct fnode *hpfs_map_fnode(struct super_block *s, ino_t, struct buffer_head **);
 struct anode *hpfs_map_anode(struct super_block *s, anode_secno, struct buffer_head **);
@@ -292,12 +297,13 @@ dnode_secno hpfs_fnode_dno(struct super_block *s, ino_t ino);
 /* name.c */
 
 unsigned char hpfs_upcase(unsigned char *, unsigned char);
-int hpfs_chk_name(unsigned char *, unsigned *);
-char *hpfs_translate_name(struct super_block *, unsigned char *, unsigned, int, int);
-int hpfs_compare_names(struct super_block *, unsigned char *, unsigned, unsigned char *, unsigned, int);
-int hpfs_is_name_long(unsigned char *, unsigned);
-void hpfs_adjust_length(unsigned char *, unsigned *);
-void hpfs_decide_conv(struct inode *, unsigned char *, unsigned);
+int hpfs_chk_name(const unsigned char *, unsigned *);
+unsigned char *hpfs_translate_name(struct super_block *, unsigned char *, unsigned, int, int);
+int hpfs_compare_names(struct super_block *, const unsigned char *, unsigned,
+		       const unsigned char *, unsigned, int);
+int hpfs_is_name_long(const unsigned char *, unsigned);
+void hpfs_adjust_length(const unsigned char *, unsigned *);
+void hpfs_decide_conv(struct inode *, const unsigned char *, unsigned);
 
 /* namei.c */
 
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index fe703ae46bc7..1042a9bc97f3 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -7,6 +7,7 @@
  */
 
 #include <linux/smp_lock.h>
+#include <linux/slab.h>
 #include "hpfs_fn.h"
 
 void hpfs_init_inode(struct inode *i)
@@ -46,7 +47,7 @@ void hpfs_read_inode(struct inode *i)
 	struct fnode *fnode;
 	struct super_block *sb = i->i_sb;
 	struct hpfs_inode_info *hpfs_inode = hpfs_i(i);
-	unsigned char *ea;
+	void *ea;
 	int ea_size;
 
 	if (!(fnode = hpfs_map_fnode(sb, i->i_ino, &bh))) {
@@ -112,7 +113,7 @@ void hpfs_read_inode(struct inode *i)
 		}
 	}
 	if (fnode->dirflag) {
-		unsigned n_dnodes, n_subdirs;
+		int n_dnodes, n_subdirs;
 		i->i_mode |= S_IFDIR;
 		i->i_op = &hpfs_dir_iops;
 		i->i_fop = &hpfs_dir_ops;
diff --git a/fs/hpfs/map.c b/fs/hpfs/map.c
index c4724589b2eb..840d033ecee8 100644
--- a/fs/hpfs/map.c
+++ b/fs/hpfs/map.c
@@ -35,7 +35,7 @@ unsigned int *hpfs_map_bitmap(struct super_block *s, unsigned bmp_block,
  * lowercasing table
  */
 
-char *hpfs_load_code_page(struct super_block *s, secno cps)
+unsigned char *hpfs_load_code_page(struct super_block *s, secno cps)
 {
 	struct buffer_head *bh;
 	secno cpds;
@@ -71,7 +71,7 @@ char *hpfs_load_code_page(struct super_block *s, secno cps)
 		brelse(bh);
 		return NULL;
 	}
-	ptr = (char *)cpd + cpd->offs[cpi] + 6;
+	ptr = (unsigned char *)cpd + cpd->offs[cpi] + 6;
 	if (!(cp_table = kmalloc(256, GFP_KERNEL))) {
 		printk("HPFS: out of memory for code page table\n");
 		brelse(bh);
@@ -217,7 +217,7 @@ struct dnode *hpfs_map_dnode(struct super_block *s, unsigned secno,
 	if ((dnode = hpfs_map_4sectors(s, secno, qbh, DNODE_RD_AHEAD)))
 		if (hpfs_sb(s)->sb_chk) {
 			unsigned p, pp = 0;
-			unsigned char *d = (char *)dnode;
+			unsigned char *d = (unsigned char *)dnode;
 			int b = 0;
 			if (dnode->magic != DNODE_MAGIC) {
 				hpfs_error(s, "bad magic on dnode %08x", secno);
diff --git a/fs/hpfs/name.c b/fs/hpfs/name.c
index 1f4a964384eb..f24736d7a439 100644
--- a/fs/hpfs/name.c
+++ b/fs/hpfs/name.c
@@ -8,16 +8,16 @@
 
 #include "hpfs_fn.h"
 
-static char *text_postfix[]={
+static const char *text_postfix[]={
 ".ASM", ".BAS", ".BAT", ".C", ".CC", ".CFG", ".CMD", ".CON", ".CPP", ".DEF",
 ".DOC", ".DPR", ".ERX", ".H", ".HPP", ".HTM", ".HTML", ".JAVA", ".LOG", ".PAS",
 ".RC", ".TEX", ".TXT", ".Y", ""};
 
-static char *text_prefix[]={
+static const char *text_prefix[]={
 "AUTOEXEC.", "CHANGES", "COPYING", "CONFIG.", "CREDITS", "FAQ", "FILE_ID.DIZ",
 "MAKEFILE", "READ.ME", "README", "TERMCAP", ""};
 
-void hpfs_decide_conv(struct inode *inode, unsigned char *name, unsigned len)
+void hpfs_decide_conv(struct inode *inode, const unsigned char *name, unsigned len)
 {
 	struct hpfs_inode_info *hpfs_inode = hpfs_i(inode);
 	int i;
@@ -71,7 +71,7 @@ static inline unsigned char locase(unsigned char *dir, unsigned char a)
 	return dir[a];
 }
 
-int hpfs_chk_name(unsigned char *name, unsigned *len)
+int hpfs_chk_name(const unsigned char *name, unsigned *len)
 {
 	int i;
 	if (*len > 254) return -ENAMETOOLONG;
@@ -83,10 +83,10 @@ int hpfs_chk_name(unsigned char *name, unsigned *len)
 	return 0;
 }
 
-char *hpfs_translate_name(struct super_block *s, unsigned char *from,
+unsigned char *hpfs_translate_name(struct super_block *s, unsigned char *from,
 			  unsigned len, int lc, int lng)
 {
-	char *to;
+	unsigned char *to;
 	int i;
 	if (hpfs_sb(s)->sb_chk >= 2) if (hpfs_is_name_long(from, len) != lng) {
 		printk("HPFS: Long name flag mismatch - name ");
@@ -103,8 +103,9 @@ char *hpfs_translate_name(struct super_block *s, unsigned char *from,
 	return to;
 }
 
-int hpfs_compare_names(struct super_block *s, unsigned char *n1, unsigned l1,
-		       unsigned char *n2, unsigned l2, int last)
+int hpfs_compare_names(struct super_block *s,
+		       const unsigned char *n1, unsigned l1,
+		       const unsigned char *n2, unsigned l2, int last)
 {
 	unsigned l = l1 < l2 ? l1 : l2;
 	unsigned i;
@@ -120,7 +121,7 @@ int hpfs_compare_names(struct super_block *s, unsigned char *n1, unsigned l1,
 	return 0;
 }
 
-int hpfs_is_name_long(unsigned char *name, unsigned len)
+int hpfs_is_name_long(const unsigned char *name, unsigned len)
 {
 	int i,j;
 	for (i = 0; i < len && name[i] != '.'; i++)
@@ -134,7 +135,7 @@ int hpfs_is_name_long(unsigned char *name, unsigned len)
 
 /* OS/2 clears dots and spaces at the end of file name, so we have to */
 
-void hpfs_adjust_length(unsigned char *name, unsigned *len)
+void hpfs_adjust_length(const unsigned char *name, unsigned *len)
 {
 	if (!*len) return;
 	if (*len == 1 && name[0] == '.') return;
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index 82b9c4ba9ed0..11c2b4080f65 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -11,7 +11,7 @@
 
 static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 {
-	const char *name = dentry->d_name.name;
+	const unsigned char *name = dentry->d_name.name;
 	unsigned len = dentry->d_name.len;
 	struct quad_buffer_head qbh0;
 	struct buffer_head *bh;
@@ -24,7 +24,7 @@ static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	int r;
 	struct hpfs_dirent dee;
 	int err;
-	if ((err = hpfs_chk_name((char *)name, &len))) return err==-ENOENT ? -EINVAL : err;
+	if ((err = hpfs_chk_name(name, &len))) return err==-ENOENT ? -EINVAL : err;
 	lock_kernel();
 	err = -ENOSPC;
 	fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh);
@@ -62,7 +62,7 @@ static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 		result->i_mode &= ~0222;
 
 	mutex_lock(&hpfs_i(dir)->i_mutex);
-	r = hpfs_add_dirent(dir, (char *)name, len, &dee, 0);
+	r = hpfs_add_dirent(dir, name, len, &dee, 0);
 	if (r == 1)
 		goto bail3;
 	if (r == -1) {
@@ -121,7 +121,7 @@ bail:
 
 static int hpfs_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd)
 {
-	const char *name = dentry->d_name.name;
+	const unsigned char *name = dentry->d_name.name;
 	unsigned len = dentry->d_name.len;
 	struct inode *result = NULL;
 	struct buffer_head *bh;
@@ -130,7 +130,7 @@ static int hpfs_create(struct inode *dir, struct dentry *dentry, int mode, struc
 	int r;
 	struct hpfs_dirent dee;
 	int err;
-	if ((err = hpfs_chk_name((char *)name, &len)))
+	if ((err = hpfs_chk_name(name, &len)))
 		return err==-ENOENT ? -EINVAL : err;
 	lock_kernel();
 	err = -ENOSPC;
@@ -155,7 +155,7 @@ static int hpfs_create(struct inode *dir, struct dentry *dentry, int mode, struc
 	result->i_op = &hpfs_file_iops;
 	result->i_fop = &hpfs_file_ops;
 	result->i_nlink = 1;
-	hpfs_decide_conv(result, (char *)name, len);
+	hpfs_decide_conv(result, name, len);
 	hpfs_i(result)->i_parent_dir = dir->i_ino;
 	result->i_ctime.tv_sec = result->i_mtime.tv_sec = result->i_atime.tv_sec = local_to_gmt(dir->i_sb, dee.creation_date);
 	result->i_ctime.tv_nsec = 0;
@@ -170,7 +170,7 @@ static int hpfs_create(struct inode *dir, struct dentry *dentry, int mode, struc
 	hpfs_i(result)->mmu_private = 0;
 
 	mutex_lock(&hpfs_i(dir)->i_mutex);
-	r = hpfs_add_dirent(dir, (char *)name, len, &dee, 0);
+	r = hpfs_add_dirent(dir, name, len, &dee, 0);
 	if (r == 1)
 		goto bail2;
 	if (r == -1) {
@@ -211,7 +211,7 @@ bail:
 
 static int hpfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
 {
-	const char *name = dentry->d_name.name;
+	const unsigned char *name = dentry->d_name.name;
 	unsigned len = dentry->d_name.len;
 	struct buffer_head *bh;
 	struct fnode *fnode;
@@ -220,7 +220,7 @@ static int hpfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t
 	struct hpfs_dirent dee;
 	struct inode *result = NULL;
 	int err;
-	if ((err = hpfs_chk_name((char *)name, &len))) return err==-ENOENT ? -EINVAL : err;
+	if ((err = hpfs_chk_name(name, &len))) return err==-ENOENT ? -EINVAL : err;
 	if (hpfs_sb(dir->i_sb)->sb_eas < 2) return -EPERM;
 	if (!new_valid_dev(rdev))
 		return -EINVAL;
@@ -256,7 +256,7 @@ static int hpfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t
 	init_special_inode(result, mode, rdev);
 
 	mutex_lock(&hpfs_i(dir)->i_mutex);
-	r = hpfs_add_dirent(dir, (char *)name, len, &dee, 0);
+	r = hpfs_add_dirent(dir, name, len, &dee, 0);
 	if (r == 1)
 		goto bail2;
 	if (r == -1) {
@@ -289,7 +289,7 @@ bail:
 
 static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *symlink)
 {
-	const char *name = dentry->d_name.name;
+	const unsigned char *name = dentry->d_name.name;
 	unsigned len = dentry->d_name.len;
 	struct buffer_head *bh;
 	struct fnode *fnode;
@@ -298,7 +298,7 @@ static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *sy
 	struct hpfs_dirent dee;
 	struct inode *result;
 	int err;
-	if ((err = hpfs_chk_name((char *)name, &len))) return err==-ENOENT ? -EINVAL : err;
+	if ((err = hpfs_chk_name(name, &len))) return err==-ENOENT ? -EINVAL : err;
 	lock_kernel();
 	if (hpfs_sb(dir->i_sb)->sb_eas < 2) {
 		unlock_kernel();
@@ -335,7 +335,7 @@ static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *sy
 	result->i_data.a_ops = &hpfs_symlink_aops;
 
 	mutex_lock(&hpfs_i(dir)->i_mutex);
-	r = hpfs_add_dirent(dir, (char *)name, len, &dee, 0);
+	r = hpfs_add_dirent(dir, name, len, &dee, 0);
 	if (r == 1)
 		goto bail2;
 	if (r == -1) {
@@ -345,7 +345,7 @@ static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *sy
 	fnode->len = len;
 	memcpy(fnode->name, name, len > 15 ? 15 : len);
 	fnode->up = dir->i_ino;
-	hpfs_set_ea(result, fnode, "SYMLINK", (char *)symlink, strlen(symlink));
+	hpfs_set_ea(result, fnode, "SYMLINK", symlink, strlen(symlink));
 	mark_buffer_dirty(bh);
 	brelse(bh);
 
@@ -369,7 +369,7 @@ bail:
 
 static int hpfs_unlink(struct inode *dir, struct dentry *dentry)
 {
-	const char *name = dentry->d_name.name;
+	const unsigned char *name = dentry->d_name.name;
 	unsigned len = dentry->d_name.len;
 	struct quad_buffer_head qbh;
 	struct hpfs_dirent *de;
@@ -381,12 +381,12 @@ static int hpfs_unlink(struct inode *dir, struct dentry *dentry)
 	int err;
 
 	lock_kernel();
-	hpfs_adjust_length((char *)name, &len);
+	hpfs_adjust_length(name, &len);
 again:
 	mutex_lock(&hpfs_i(inode)->i_parent_mutex);
 	mutex_lock(&hpfs_i(dir)->i_mutex);
 	err = -ENOENT;
-	de = map_dirent(dir, hpfs_i(dir)->i_dno, (char *)name, len, &dno, &qbh);
+	de = map_dirent(dir, hpfs_i(dir)->i_dno, name, len, &dno, &qbh);
 	if (!de)
 		goto out;
 
@@ -413,22 +413,25 @@ again:
 
 		mutex_unlock(&hpfs_i(dir)->i_mutex);
 		mutex_unlock(&hpfs_i(inode)->i_parent_mutex);
-		d_drop(dentry);
-		spin_lock(&dentry->d_lock);
-		if (atomic_read(&dentry->d_count) > 1 ||
-		    generic_permission(inode, MAY_WRITE, NULL) ||
+		dentry_unhash(dentry);
+		if (!d_unhashed(dentry)) {
+			dput(dentry);
+			unlock_kernel();
+			return -ENOSPC;
+		}
+		if (generic_permission(inode, MAY_WRITE, NULL) ||
 		    !S_ISREG(inode->i_mode) ||
 		    get_write_access(inode)) {
-			spin_unlock(&dentry->d_lock);
 			d_rehash(dentry);
+			dput(dentry);
 		} else {
 			struct iattr newattrs;
-			spin_unlock(&dentry->d_lock);
 			/*printk("HPFS: truncating file before delete.\n");*/
 			newattrs.ia_size = 0;
 			newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME;
 			err = notify_change(dentry, &newattrs);
 			put_write_access(inode);
+			dput(dentry);
 			if (!err)
 				goto again;
 		}
@@ -451,7 +454,7 @@ out:
 
 static int hpfs_rmdir(struct inode *dir, struct dentry *dentry)
 {
-	const char *name = dentry->d_name.name;
+	const unsigned char *name = dentry->d_name.name;
 	unsigned len = dentry->d_name.len;
 	struct quad_buffer_head qbh;
 	struct hpfs_dirent *de;
@@ -462,12 +465,12 @@ static int hpfs_rmdir(struct inode *dir, struct dentry *dentry)
 	int err;
 	int r;
 
-	hpfs_adjust_length((char *)name, &len);
+	hpfs_adjust_length(name, &len);
 	lock_kernel();
 	mutex_lock(&hpfs_i(inode)->i_parent_mutex);
 	mutex_lock(&hpfs_i(dir)->i_mutex);
 	err = -ENOENT;
-	de = map_dirent(dir, hpfs_i(dir)->i_dno, (char *)name, len, &dno, &qbh);
+	de = map_dirent(dir, hpfs_i(dir)->i_dno, name, len, &dno, &qbh);
 	if (!de)
 		goto out;
 
@@ -546,10 +549,10 @@ const struct address_space_operations hpfs_symlink_aops = {
 static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		struct inode *new_dir, struct dentry *new_dentry)
 {
-	char *old_name = (char *)old_dentry->d_name.name;
-	int old_len = old_dentry->d_name.len;
-	char *new_name = (char *)new_dentry->d_name.name;
-	int new_len = new_dentry->d_name.len;
+	const unsigned char *old_name = old_dentry->d_name.name;
+	unsigned old_len = old_dentry->d_name.len;
+	const unsigned char *new_name = new_dentry->d_name.name;
+	unsigned new_len = new_dentry->d_name.len;
 	struct inode *i = old_dentry->d_inode;
 	struct inode *new_inode = new_dentry->d_inode;
 	struct quad_buffer_head qbh, qbh1;
@@ -560,9 +563,9 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct buffer_head *bh;
 	struct fnode *fnode;
 	int err;
-	if ((err = hpfs_chk_name((char *)new_name, &new_len))) return err;
+	if ((err = hpfs_chk_name(new_name, &new_len))) return err;
 	err = 0;
-	hpfs_adjust_length((char *)old_name, &old_len);
+	hpfs_adjust_length(old_name, &old_len);
 
 	lock_kernel();
 	/* order doesn't matter, due to VFS exclusion */
@@ -579,7 +582,7 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		goto end1;
 	}
 
-	if (!(dep = map_dirent(old_dir, hpfs_i(old_dir)->i_dno, (char *)old_name, old_len, &dno, &qbh))) {
+	if (!(dep = map_dirent(old_dir, hpfs_i(old_dir)->i_dno, old_name, old_len, &dno, &qbh))) {
 		hpfs_error(i->i_sb, "lookup succeeded but map dirent failed");
 		err = -ENOENT;
 		goto end1;
@@ -590,7 +593,7 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	if (new_inode) {
 		int r;
 		if ((r = hpfs_remove_dirent(old_dir, dno, dep, &qbh, 1)) != 2) {
-			if ((nde = map_dirent(new_dir, hpfs_i(new_dir)->i_dno, (char *)new_name, new_len, NULL, &qbh1))) {
+			if ((nde = map_dirent(new_dir, hpfs_i(new_dir)->i_dno, new_name, new_len, NULL, &qbh1))) {
 				clear_nlink(new_inode);
 				copy_de(nde, &de);
 				memcpy(nde->name, new_name, new_len);
@@ -618,7 +621,7 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	}
 	
 	if (new_dir == old_dir)
-		if (!(dep = map_dirent(old_dir, hpfs_i(old_dir)->i_dno, (char *)old_name, old_len, &dno, &qbh))) {
+		if (!(dep = map_dirent(old_dir, hpfs_i(old_dir)->i_dno, old_name, old_len, &dno, &qbh))) {
 			hpfs_unlock_creation(i->i_sb);
 			hpfs_error(i->i_sb, "lookup succeeded but map dirent failed at #2");
 			err = -ENOENT;
@@ -648,7 +651,7 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		brelse(bh);
 	}
 	hpfs_i(i)->i_conv = hpfs_sb(i->i_sb)->sb_conv;
-	hpfs_decide_conv(i, (char *)new_name, new_len);
+	hpfs_decide_conv(i, new_name, new_len);
 end1:
 	if (old_dir != new_dir)
 		mutex_unlock(&hpfs_i(new_dir)->i_mutex);
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index cadc4ce48656..aa53842c599c 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -15,6 +15,7 @@
 #include <linux/sched.h>
 #include <linux/smp_lock.h>
 #include <linux/bitmap.h>
+#include <linux/slab.h>
 
 /* Mark the filesystem dirty, so that chkdsk checks it when os/2 booted */
 
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index 7239efc690d8..826c3f9d29ac 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -587,7 +587,7 @@ static int hppfs_readdir(struct file *file, void *ent, filldir_t filldir)
 	return err;
 }
 
-static int hppfs_fsync(struct file *file, struct dentry *dentry, int datasync)
+static int hppfs_fsync(struct file *file, int datasync)
 {
 	return 0;
 }
@@ -718,7 +718,7 @@ static int hppfs_fill_super(struct super_block *sb, void *d, int silent)
 	struct vfsmount *proc_mnt;
 	int err = -ENOENT;
 
-	proc_mnt = do_kern_mount("proc", 0, "proc", NULL);
+	proc_mnt = mntget(current->nsproxy->pid_ns->proc_mnt);
 	if (IS_ERR(proc_mnt))
 		goto out;
 
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index a0bbd3d1b41a..a4e9a7ec3691 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -688,7 +688,7 @@ static void init_once(void *foo)
 const struct file_operations hugetlbfs_file_operations = {
 	.read			= hugetlbfs_read,
 	.mmap			= hugetlbfs_file_mmap,
-	.fsync			= simple_sync_file,
+	.fsync			= noop_fsync,
 	.get_unmapped_area	= hugetlb_get_unmapped_area,
 };
 
diff --git a/fs/inode.c b/fs/inode.c
index 03dfeb2e3928..722860b323a9 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -8,7 +8,6 @@
 #include <linux/mm.h>
 #include <linux/dcache.h>
 #include <linux/init.h>
-#include <linux/quotaops.h>
 #include <linux/slab.h>
 #include <linux/writeback.h>
 #include <linux/module.h>
@@ -287,11 +286,9 @@ static void init_once(void *foo)
  */
 void __iget(struct inode *inode)
 {
-	if (atomic_read(&inode->i_count)) {
-		atomic_inc(&inode->i_count);
+	if (atomic_inc_return(&inode->i_count) != 1)
 		return;
-	}
-	atomic_inc(&inode->i_count);
+
 	if (!(inode->i_state & (I_DIRTY|I_SYNC)))
 		list_move(&inode->i_list, &inode_in_use);
 	inodes_stat.nr_unused--;
@@ -314,7 +311,6 @@ void clear_inode(struct inode *inode)
 	BUG_ON(!(inode->i_state & I_FREEING));
 	BUG_ON(inode->i_state & I_CLEAR);
 	inode_sync_wait(inode);
-	vfs_dq_drop(inode);
 	if (inode->i_sb->s_op->clear_inode)
 		inode->i_sb->s_op->clear_inode(inode);
 	if (S_ISBLK(inode->i_mode) && inode->i_bdev)
@@ -516,7 +512,7 @@ static void prune_icache(int nr_to_scan)
  * This function is passed the number of inodes to scan, and it returns the
  * total number of remaining possibly-reclaimable inodes.
  */
-static int shrink_icache_memory(int nr, gfp_t gfp_mask)
+static int shrink_icache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
 {
 	if (nr) {
 		/*
@@ -1207,12 +1203,8 @@ void generic_delete_inode(struct inode *inode)
 	inodes_stat.nr_inodes--;
 	spin_unlock(&inode_lock);
 
-	security_inode_delete(inode);
-
 	if (op->delete_inode) {
 		void (*delete)(struct inode *) = op->delete_inode;
-		if (!is_bad_inode(inode))
-			vfs_dq_init(inode);
 		/* Filesystems implementing their own
 		 * s_op->delete_inode are required to call
 		 * truncate_inode_pages and clear_inode()
@@ -1614,3 +1606,23 @@ void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
 				  inode->i_ino);
 }
 EXPORT_SYMBOL(init_special_inode);
+
+/**
+ * Init uid,gid,mode for new inode according to posix standards
+ * @inode: New inode
+ * @dir: Directory inode
+ * @mode: mode of the new inode
+ */
+void inode_init_owner(struct inode *inode, const struct inode *dir,
+			mode_t mode)
+{
+	inode->i_uid = current_fsuid();
+	if (dir && dir->i_mode & S_ISGID) {
+		inode->i_gid = dir->i_gid;
+		if (S_ISDIR(mode))
+			mode |= S_ISGID;
+	} else
+		inode->i_gid = current_fsgid();
+	inode->i_mode = mode;
+}
+EXPORT_SYMBOL(inode_init_owner);
diff --git a/fs/internal.h b/fs/internal.h
index e96a1667d749..6b706bc60a66 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -70,6 +70,8 @@ extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int);
 
 extern void __init mnt_init(void);
 
+extern spinlock_t vfsmount_lock;
+
 /*
  * fs_struct.c
  */
@@ -85,6 +87,8 @@ extern struct file *get_empty_filp(void);
  * super.c
  */
 extern int do_remount_sb(struct super_block *, int, void *, int);
+extern void __put_super(struct super_block *sb);
+extern void put_super(struct super_block *sb);
 
 /*
  * open.c
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 6c751106c2e5..2d140a713861 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -228,14 +228,23 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg)
 
 #ifdef CONFIG_BLOCK
 
-#define blk_to_logical(inode, blk) (blk << (inode)->i_blkbits)
-#define logical_to_blk(inode, offset) (offset >> (inode)->i_blkbits);
+static inline sector_t logical_to_blk(struct inode *inode, loff_t offset)
+{
+	return (offset >> inode->i_blkbits);
+}
+
+static inline loff_t blk_to_logical(struct inode *inode, sector_t blk)
+{
+	return (blk << inode->i_blkbits);
+}
 
 /**
  * __generic_block_fiemap - FIEMAP for block based inodes (no locking)
- * @inode - the inode to map
- * @arg - the pointer to userspace where we copy everything to
- * @get_block - the fs's get_block function
+ * @inode: the inode to map
+ * @fieinfo: the fiemap info struct that will be passed back to userspace
+ * @start: where to start mapping in the inode
+ * @len: how much space to map
+ * @get_block: the fs's get_block function
  *
  * This does FIEMAP for block based inodes.  Basically it will just loop
  * through get_block until we hit the number of extents we want to map, or we
@@ -250,58 +259,63 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg)
  */
 
 int __generic_block_fiemap(struct inode *inode,
-			   struct fiemap_extent_info *fieinfo, u64 start,
-			   u64 len, get_block_t *get_block)
+			   struct fiemap_extent_info *fieinfo, loff_t start,
+			   loff_t len, get_block_t *get_block)
 {
-	struct buffer_head tmp;
-	unsigned long long start_blk;
-	long long length = 0, map_len = 0;
+	struct buffer_head map_bh;
+	sector_t start_blk, last_blk;
+	loff_t isize = i_size_read(inode);
 	u64 logical = 0, phys = 0, size = 0;
 	u32 flags = FIEMAP_EXTENT_MERGED;
-	int ret = 0, past_eof = 0, whole_file = 0;
+	bool past_eof = false, whole_file = false;
+	int ret = 0;
 
-	if ((ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC)))
+	ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC);
+	if (ret)
 		return ret;
 
-	start_blk = logical_to_blk(inode, start);
-
-	length = (long long)min_t(u64, len, i_size_read(inode));
-	if (length < len)
-		whole_file = 1;
+	/*
+	 * Either the i_mutex or other appropriate locking needs to be held
+	 * since we expect isize to not change at all through the duration of
+	 * this call.
+	 */
+	if (len >= isize) {
+		whole_file = true;
+		len = isize;
+	}
 
-	map_len = length;
+	start_blk = logical_to_blk(inode, start);
+	last_blk = logical_to_blk(inode, start + len - 1);
 
 	do {
 		/*
 		 * we set b_size to the total size we want so it will map as
 		 * many contiguous blocks as possible at once
 		 */
-		memset(&tmp, 0, sizeof(struct buffer_head));
-		tmp.b_size = map_len;
+		memset(&map_bh, 0, sizeof(struct buffer_head));
+		map_bh.b_size = len;
 
-		ret = get_block(inode, start_blk, &tmp, 0);
+		ret = get_block(inode, start_blk, &map_bh, 0);
 		if (ret)
 			break;
 
 		/* HOLE */
-		if (!buffer_mapped(&tmp)) {
-			length -= blk_to_logical(inode, 1);
+		if (!buffer_mapped(&map_bh)) {
 			start_blk++;
 
 			/*
-			 * we want to handle the case where there is an
+			 * We want to handle the case where there is an
 			 * allocated block at the front of the file, and then
 			 * nothing but holes up to the end of the file properly,
 			 * to make sure that extent at the front gets properly
 			 * marked with FIEMAP_EXTENT_LAST
 			 */
 			if (!past_eof &&
-			    blk_to_logical(inode, start_blk) >=
-			    blk_to_logical(inode, 0)+i_size_read(inode))
+			    blk_to_logical(inode, start_blk) >= isize)
 				past_eof = 1;
 
 			/*
-			 * first hole after going past the EOF, this is our
+			 * First hole after going past the EOF, this is our
 			 * last extent
 			 */
 			if (past_eof && size) {
@@ -309,15 +323,18 @@ int __generic_block_fiemap(struct inode *inode,
 				ret = fiemap_fill_next_extent(fieinfo, logical,
 							      phys, size,
 							      flags);
-				break;
+			} else if (size) {
+				ret = fiemap_fill_next_extent(fieinfo, logical,
+							      phys, size, flags);
+				size = 0;
 			}
 
 			/* if we have holes up to/past EOF then we're done */
-			if (length <= 0 || past_eof)
+			if (start_blk > last_blk || past_eof || ret)
 				break;
 		} else {
 			/*
-			 * we have gone over the length of what we wanted to
+			 * We have gone over the length of what we wanted to
 			 * map, and it wasn't the entire file, so add the extent
 			 * we got last time and exit.
 			 *
@@ -331,7 +348,7 @@ int __generic_block_fiemap(struct inode *inode,
 			 * are good to go, just add the extent to the fieinfo
 			 * and break
 			 */
-			if (length <= 0 && !whole_file) {
+			if (start_blk > last_blk && !whole_file) {
 				ret = fiemap_fill_next_extent(fieinfo, logical,
 							      phys, size,
 							      flags);
@@ -351,11 +368,10 @@ int __generic_block_fiemap(struct inode *inode,
 			}
 
 			logical = blk_to_logical(inode, start_blk);
-			phys = blk_to_logical(inode, tmp.b_blocknr);
-			size = tmp.b_size;
+			phys = blk_to_logical(inode, map_bh.b_blocknr);
+			size = map_bh.b_size;
 			flags = FIEMAP_EXTENT_MERGED;
 
-			length -= tmp.b_size;
 			start_blk += logical_to_blk(inode, size);
 
 			/*
@@ -363,15 +379,13 @@ int __generic_block_fiemap(struct inode *inode,
 			 * soon as we find a hole that the last extent we found
 			 * is marked with FIEMAP_EXTENT_LAST
 			 */
-			if (!past_eof &&
-			    logical+size >=
-			    blk_to_logical(inode, 0)+i_size_read(inode))
-				past_eof = 1;
+			if (!past_eof && logical + size >= isize)
+				past_eof = true;
 		}
 		cond_resched();
 	} while (1);
 
-	/* if ret is 1 then we just hit the end of the extent array */
+	/* If ret is 1 then we just hit the end of the extent array */
 	if (ret == 1)
 		ret = 0;
 
@@ -511,15 +525,8 @@ static int ioctl_fsfreeze(struct file *filp)
 	if (sb->s_op->freeze_fs == NULL)
 		return -EOPNOTSUPP;
 
-	/* If a blockdevice-backed filesystem isn't specified, return. */
-	if (sb->s_bdev == NULL)
-		return -EINVAL;
-
 	/* Freeze */
-	sb = freeze_bdev(sb->s_bdev);
-	if (IS_ERR(sb))
-		return PTR_ERR(sb);
-	return 0;
+	return freeze_super(sb);
 }
 
 static int ioctl_fsthaw(struct file *filp)
@@ -529,12 +536,8 @@ static int ioctl_fsthaw(struct file *filp)
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	/* If a blockdevice-backed filesystem isn't specified, return EINVAL. */
-	if (sb->s_bdev == NULL)
-		return -EINVAL;
-
 	/* Thaw */
-	return thaw_bdev(sb->s_bdev, sb);
+	return thaw_super(sb);
 }
 
 /*
diff --git a/fs/ioprio.c b/fs/ioprio.c
index c7c0b28d7d21..748cfb92dcc6 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -19,6 +19,7 @@
  * See also Documentation/block/ioprio.txt
  *
  */
+#include <linux/gfp.h>
 #include <linux/kernel.h>
 #include <linux/ioprio.h>
 #include <linux/blkdev.h>
diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c
index 8ba5441063be..e0aca9a0ac68 100644
--- a/fs/isofs/dir.c
+++ b/fs/isofs/dir.c
@@ -11,6 +11,7 @@
  *  isofs directory handling functions
  */
 #include <linux/smp_lock.h>
+#include <linux/gfp.h>
 #include "isofs.h"
 
 int isofs_name_translate(struct iso_directory_record *de, char *new, struct inode *inode)
@@ -271,6 +272,7 @@ static int isofs_readdir(struct file *filp,
 
 const struct file_operations isofs_dir_operations =
 {
+	.llseek = generic_file_llseek,
 	.read = generic_read_dir,
 	.readdir = isofs_readdir,
 };
diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c
index eaa831311c9c..ab438beb867c 100644
--- a/fs/isofs/namei.c
+++ b/fs/isofs/namei.c
@@ -7,6 +7,7 @@
  */
 
 #include <linux/smp_lock.h>
+#include <linux/gfp.h>
 #include "isofs.h"
 
 /*
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 4bd882548c45..28a9ddaa0c49 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -17,7 +17,6 @@
 #include <linux/fs.h>
 #include <linux/jbd.h>
 #include <linux/errno.h>
-#include <linux/slab.h>
 #include <linux/mm.h>
 #include <linux/pagemap.h>
 #include <linux/bio.h>
@@ -787,6 +786,12 @@ wait_for_iobuf:
 
 	jbd_debug(3, "JBD: commit phase 6\n");
 
+	/* All metadata is written, now write commit record and do cleanup */
+	spin_lock(&journal->j_state_lock);
+	J_ASSERT(commit_transaction->t_state == T_COMMIT);
+	commit_transaction->t_state = T_COMMIT_RECORD;
+	spin_unlock(&journal->j_state_lock);
+
 	if (journal_write_commit_record(journal, commit_transaction))
 		err = -EIO;
 
@@ -862,12 +867,12 @@ restart_loop:
 		/* A buffer which has been freed while still being
 		 * journaled by a previous transaction may end up still
 		 * being dirty here, but we want to avoid writing back
-		 * that buffer in the future now that the last use has
-		 * been committed.  That's not only a performance gain,
-		 * it also stops aliasing problems if the buffer is left
-		 * behind for writeback and gets reallocated for another
+		 * that buffer in the future after the "add to orphan"
+		 * operation been committed,  That's not only a performance
+		 * gain, it also stops aliasing problems if the buffer is
+		 * left behind for writeback and gets reallocated for another
 		 * use in a different page. */
-		if (buffer_freed(bh)) {
+		if (buffer_freed(bh) && !jh->b_next_transaction) {
 			clear_buffer_freed(bh);
 			clear_buffer_jbddirty(bh);
 		}
@@ -924,7 +929,7 @@ restart_loop:
 
 	jbd_debug(3, "JBD: commit phase 8\n");
 
-	J_ASSERT(commit_transaction->t_state == T_COMMIT);
+	J_ASSERT(commit_transaction->t_state == T_COMMIT_RECORD);
 
 	commit_transaction->t_state = T_FINISHED;
 	J_ASSERT(commit_transaction == journal->j_committing_transaction);
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index bd224eec9b07..93d1e47647bd 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -565,6 +565,38 @@ int log_wait_commit(journal_t *journal, tid_t tid)
 }
 
 /*
+ * Return 1 if a given transaction has not yet sent barrier request
+ * connected with a transaction commit. If 0 is returned, transaction
+ * may or may not have sent the barrier. Used to avoid sending barrier
+ * twice in common cases.
+ */
+int journal_trans_will_send_data_barrier(journal_t *journal, tid_t tid)
+{
+	int ret = 0;
+	transaction_t *commit_trans;
+
+	if (!(journal->j_flags & JFS_BARRIER))
+		return 0;
+	spin_lock(&journal->j_state_lock);
+	/* Transaction already committed? */
+	if (tid_geq(journal->j_commit_sequence, tid))
+		goto out;
+	/*
+	 * Transaction is being committed and we already proceeded to
+	 * writing commit record?
+	 */
+	commit_trans = journal->j_committing_transaction;
+	if (commit_trans && commit_trans->t_tid == tid &&
+	    commit_trans->t_state >= T_COMMIT_RECORD)
+		goto out;
+	ret = 1;
+out:
+	spin_unlock(&journal->j_state_lock);
+	return ret;
+}
+EXPORT_SYMBOL(journal_trans_will_send_data_barrier);
+
+/*
  * Log buffer allocation routines:
  */
 
@@ -1157,6 +1189,7 @@ int journal_destroy(journal_t *journal)
 {
 	int err = 0;
 
+	
 	/* Wait for the commit thread to wake up and die. */
 	journal_kill_thread(journal);
 
diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c
index cb1a49ae605e..54c9bc9e1b17 100644
--- a/fs/jbd/recovery.c
+++ b/fs/jbd/recovery.c
@@ -20,7 +20,6 @@
 #include <linux/fs.h>
 #include <linux/jbd.h>
 #include <linux/errno.h>
-#include <linux/slab.h>
 #endif
 
 /*
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 006f9ad838a2..5ae71e75a491 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -1398,7 +1398,7 @@ int journal_stop(handle_t *handle)
 	 * the case where our storage is so fast that it is more optimal to go
 	 * ahead and force a flush and wait for the transaction to be committed
 	 * than it is to wait for an arbitrary amount of time for new writers to
-	 * join the transaction.  We acheive this by measuring how long it takes
+	 * join the transaction.  We achieve this by measuring how long it takes
 	 * to commit a transaction, and compare it with how long this
 	 * transaction has been running, and if run time < commit time then we
 	 * sleep for the delta and commit.  This greatly helps super fast disks
@@ -1864,6 +1864,21 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
 	if (!jh)
 		goto zap_buffer_no_jh;
 
+	/*
+	 * We cannot remove the buffer from checkpoint lists until the
+	 * transaction adding inode to orphan list (let's call it T)
+	 * is committed.  Otherwise if the transaction changing the
+	 * buffer would be cleaned from the journal before T is
+	 * committed, a crash will cause that the correct contents of
+	 * the buffer will be lost.  On the other hand we have to
+	 * clear the buffer dirty bit at latest at the moment when the
+	 * transaction marking the buffer as freed in the filesystem
+	 * structures is committed because from that moment on the
+	 * buffer can be reallocated and used by a different page.
+	 * Since the block hasn't been freed yet but the inode has
+	 * already been added to orphan list, it is safe for us to add
+	 * the buffer to BJ_Forget list of the newest transaction.
+	 */
 	transaction = jh->b_transaction;
 	if (transaction == NULL) {
 		/* First case: not on any transaction.  If it
@@ -1929,16 +1944,15 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
 			goto zap_buffer;
 		}
 		/*
-		 * If it is committing, we simply cannot touch it.  We
-		 * can remove it's next_transaction pointer from the
-		 * running transaction if that is set, but nothing
-		 * else. */
+		 * The buffer is committing, we simply cannot touch
+		 * it. So we just set j_next_transaction to the
+		 * running transaction (if there is one) and mark
+		 * buffer as freed so that commit code knows it should
+		 * clear dirty bits when it is done with the buffer.
+		 */
 		set_buffer_freed(bh);
-		if (jh->b_next_transaction) {
-			J_ASSERT(jh->b_next_transaction ==
-					journal->j_running_transaction);
-			jh->b_next_transaction = NULL;
-		}
+		if (journal->j_running_transaction && buffer_jbddirty(bh))
+			jh->b_next_transaction = journal->j_running_transaction;
 		journal_put_journal_head(jh);
 		spin_unlock(&journal->j_list_lock);
 		jbd_unlock_bh_state(bh);
@@ -2120,7 +2134,7 @@ void journal_file_buffer(struct journal_head *jh,
  */
 void __journal_refile_buffer(struct journal_head *jh)
 {
-	int was_dirty;
+	int was_dirty, jlist;
 	struct buffer_head *bh = jh2bh(jh);
 
 	J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
@@ -2142,8 +2156,13 @@ void __journal_refile_buffer(struct journal_head *jh)
 	__journal_temp_unlink_buffer(jh);
 	jh->b_transaction = jh->b_next_transaction;
 	jh->b_next_transaction = NULL;
-	__journal_file_buffer(jh, jh->b_transaction,
-				jh->b_modified ? BJ_Metadata : BJ_Reserved);
+	if (buffer_freed(bh))
+		jlist = BJ_Forget;
+	else if (jh->b_modified)
+		jlist = BJ_Metadata;
+	else
+		jlist = BJ_Reserved;
+	__journal_file_buffer(jh, jh->b_transaction, jlist);
 	J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING);
 
 	if (was_dirty)
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 886849370950..076d1cc44f95 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -507,6 +507,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
 	if (blocknr < journal->j_tail)
 		freed = freed + journal->j_last - journal->j_first;
 
+	trace_jbd2_cleanup_journal_tail(journal, first_tid, blocknr, freed);
 	jbd_debug(1,
 		  "Cleaning journal tail from %d to %d (offset %lu), "
 		  "freeing %lu\n",
@@ -529,7 +530,8 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
 	 */
 	if ((journal->j_fs_dev != journal->j_dev) &&
 	    (journal->j_flags & JBD2_BARRIER))
-		blkdev_issue_flush(journal->j_fs_dev, NULL);
+		blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL,
+			BLKDEV_IFL_WAIT);
 	if (!(journal->j_flags & JBD2_ABORT))
 		jbd2_journal_update_superblock(journal, 1);
 	return 0;
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 1bc74b6f26d2..75716d3d2be0 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -717,7 +717,8 @@ start_journal_io:
 	if (commit_transaction->t_flushed_data_blocks &&
 	    (journal->j_fs_dev != journal->j_dev) &&
 	    (journal->j_flags & JBD2_BARRIER))
-		blkdev_issue_flush(journal->j_fs_dev, NULL);
+		blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL,
+			BLKDEV_IFL_WAIT);
 
 	/* Done it all: now write the commit record asynchronously. */
 	if (JBD2_HAS_INCOMPAT_FEATURE(journal,
@@ -727,7 +728,8 @@ start_journal_io:
 		if (err)
 			__jbd2_journal_abort_hard(journal);
 		if (journal->j_flags & JBD2_BARRIER)
-			blkdev_issue_flush(journal->j_dev, NULL);
+			blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL,
+				BLKDEV_IFL_WAIT);
 	}
 
 	err = journal_finish_inode_data_buffers(journal, commit_transaction);
@@ -883,8 +885,7 @@ restart_loop:
 		spin_unlock(&journal->j_list_lock);
 		bh = jh2bh(jh);
 		jbd_lock_bh_state(bh);
-		J_ASSERT_JH(jh,	jh->b_transaction == commit_transaction ||
-			jh->b_transaction == journal->j_running_transaction);
+		J_ASSERT_JH(jh,	jh->b_transaction == commit_transaction);
 
 		/*
 		 * If there is undo-protected committed data against
@@ -930,12 +931,12 @@ restart_loop:
 		/* A buffer which has been freed while still being
 		 * journaled by a previous transaction may end up still
 		 * being dirty here, but we want to avoid writing back
-		 * that buffer in the future now that the last use has
-		 * been committed.  That's not only a performance gain,
-		 * it also stops aliasing problems if the buffer is left
-		 * behind for writeback and gets reallocated for another
+		 * that buffer in the future after the "add to orphan"
+		 * operation been committed,  That's not only a performance
+		 * gain, it also stops aliasing problems if the buffer is
+		 * left behind for writeback and gets reallocated for another
 		 * use in a different page. */
-		if (buffer_freed(bh)) {
+		if (buffer_freed(bh) && !jh->b_next_transaction) {
 			clear_buffer_freed(bh);
 			clear_buffer_jbddirty(bh);
 		}
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index ac0d027595d0..036880895bfc 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -39,6 +39,8 @@
 #include <linux/seq_file.h>
 #include <linux/math64.h>
 #include <linux/hash.h>
+#include <linux/log2.h>
+#include <linux/vmalloc.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/jbd2.h>
@@ -93,6 +95,7 @@ EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
 
 static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
 static void __journal_abort_soft (journal_t *journal, int errno);
+static int jbd2_journal_create_slab(size_t slab_size);
 
 /*
  * Helper function used to manage commit timeouts
@@ -294,7 +297,6 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
 	struct page *new_page;
 	unsigned int new_offset;
 	struct buffer_head *bh_in = jh2bh(jh_in);
-	struct jbd2_buffer_trigger_type *triggers;
 	journal_t *journal = transaction->t_journal;
 
 	/*
@@ -325,21 +327,21 @@ repeat:
 		done_copy_out = 1;
 		new_page = virt_to_page(jh_in->b_frozen_data);
 		new_offset = offset_in_page(jh_in->b_frozen_data);
-		triggers = jh_in->b_frozen_triggers;
 	} else {
 		new_page = jh2bh(jh_in)->b_page;
 		new_offset = offset_in_page(jh2bh(jh_in)->b_data);
-		triggers = jh_in->b_triggers;
 	}
 
 	mapped_data = kmap_atomic(new_page, KM_USER0);
 	/*
-	 * Fire any commit trigger.  Do this before checking for escaping,
-	 * as the trigger may modify the magic offset.  If a copy-out
-	 * happens afterwards, it will have the correct data in the buffer.
+	 * Fire data frozen trigger if data already wasn't frozen.  Do this
+	 * before checking for escaping, as the trigger may modify the magic
+	 * offset.  If a copy-out happens afterwards, it will have the correct
+	 * data in the buffer.
 	 */
-	jbd2_buffer_commit_trigger(jh_in, mapped_data + new_offset,
-				   triggers);
+	if (!done_copy_out)
+		jbd2_buffer_frozen_trigger(jh_in, mapped_data + new_offset,
+					   jh_in->b_triggers);
 
 	/*
 	 * Check for escaping
@@ -1248,6 +1250,13 @@ int jbd2_journal_load(journal_t *journal)
 		}
 	}
 
+	/*
+	 * Create a slab for this blocksize
+	 */
+	err = jbd2_journal_create_slab(be32_to_cpu(sb->s_blocksize));
+	if (err)
+		return err;
+
 	/* Let the recovery code check whether it needs to recover any
 	 * data from the journal. */
 	if (jbd2_journal_recover(journal))
@@ -1807,6 +1816,127 @@ size_t journal_tag_bytes(journal_t *journal)
 }
 
 /*
+ * JBD memory management
+ *
+ * These functions are used to allocate block-sized chunks of memory
+ * used for making copies of buffer_head data.  Very often it will be
+ * page-sized chunks of data, but sometimes it will be in
+ * sub-page-size chunks.  (For example, 16k pages on Power systems
+ * with a 4k block file system.)  For blocks smaller than a page, we
+ * use a SLAB allocator.  There are slab caches for each block size,
+ * which are allocated at mount time, if necessary, and we only free
+ * (all of) the slab caches when/if the jbd2 module is unloaded.  For
+ * this reason we don't need to a mutex to protect access to
+ * jbd2_slab[] allocating or releasing memory; only in
+ * jbd2_journal_create_slab().
+ */
+#define JBD2_MAX_SLABS 8
+static struct kmem_cache *jbd2_slab[JBD2_MAX_SLABS];
+static DECLARE_MUTEX(jbd2_slab_create_sem);
+
+static const char *jbd2_slab_names[JBD2_MAX_SLABS] = {
+	"jbd2_1k", "jbd2_2k", "jbd2_4k", "jbd2_8k",
+	"jbd2_16k", "jbd2_32k", "jbd2_64k", "jbd2_128k"
+};
+
+
+static void jbd2_journal_destroy_slabs(void)
+{
+	int i;
+
+	for (i = 0; i < JBD2_MAX_SLABS; i++) {
+		if (jbd2_slab[i])
+			kmem_cache_destroy(jbd2_slab[i]);
+		jbd2_slab[i] = NULL;
+	}
+}
+
+static int jbd2_journal_create_slab(size_t size)
+{
+	int i = order_base_2(size) - 10;
+	size_t slab_size;
+
+	if (size == PAGE_SIZE)
+		return 0;
+
+	if (i >= JBD2_MAX_SLABS)
+		return -EINVAL;
+
+	if (unlikely(i < 0))
+		i = 0;
+	down(&jbd2_slab_create_sem);
+	if (jbd2_slab[i]) {
+		up(&jbd2_slab_create_sem);
+		return 0;	/* Already created */
+	}
+
+	slab_size = 1 << (i+10);
+	jbd2_slab[i] = kmem_cache_create(jbd2_slab_names[i], slab_size,
+					 slab_size, 0, NULL);
+	up(&jbd2_slab_create_sem);
+	if (!jbd2_slab[i]) {
+		printk(KERN_EMERG "JBD2: no memory for jbd2_slab cache\n");
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+static struct kmem_cache *get_slab(size_t size)
+{
+	int i = order_base_2(size) - 10;
+
+	BUG_ON(i >= JBD2_MAX_SLABS);
+	if (unlikely(i < 0))
+		i = 0;
+	BUG_ON(jbd2_slab[i] == NULL);
+	return jbd2_slab[i];
+}
+
+void *jbd2_alloc(size_t size, gfp_t flags)
+{
+	void *ptr;
+
+	BUG_ON(size & (size-1)); /* Must be a power of 2 */
+
+	flags |= __GFP_REPEAT;
+	if (size == PAGE_SIZE)
+		ptr = (void *)__get_free_pages(flags, 0);
+	else if (size > PAGE_SIZE) {
+		int order = get_order(size);
+
+		if (order < 3)
+			ptr = (void *)__get_free_pages(flags, order);
+		else
+			ptr = vmalloc(size);
+	} else
+		ptr = kmem_cache_alloc(get_slab(size), flags);
+
+	/* Check alignment; SLUB has gotten this wrong in the past,
+	 * and this can lead to user data corruption! */
+	BUG_ON(((unsigned long) ptr) & (size-1));
+
+	return ptr;
+}
+
+void jbd2_free(void *ptr, size_t size)
+{
+	if (size == PAGE_SIZE) {
+		free_pages((unsigned long)ptr, 0);
+		return;
+	}
+	if (size > PAGE_SIZE) {
+		int order = get_order(size);
+
+		if (order < 3)
+			free_pages((unsigned long)ptr, order);
+		else
+			vfree(ptr);
+		return;
+	}
+	kmem_cache_free(get_slab(size), ptr);
+};
+
+/*
  * Journal_head storage management
  */
 static struct kmem_cache *jbd2_journal_head_cache;
@@ -2204,6 +2334,7 @@ static void jbd2_journal_destroy_caches(void)
 	jbd2_journal_destroy_revoke_caches();
 	jbd2_journal_destroy_jbd2_journal_head_cache();
 	jbd2_journal_destroy_handle_cache();
+	jbd2_journal_destroy_slabs();
 }
 
 static int __init journal_init(void)
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 73063285b13f..049281b7cb89 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -20,7 +20,6 @@
 #include <linux/fs.h>
 #include <linux/jbd2.h>
 #include <linux/errno.h>
-#include <linux/slab.h>
 #include <linux/crc32.h>
 #endif
 
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index a0512700542f..b8e0806681bb 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -725,6 +725,9 @@ done:
 		page = jh2bh(jh)->b_page;
 		offset = ((unsigned long) jh2bh(jh)->b_data) & ~PAGE_MASK;
 		source = kmap_atomic(page, KM_USER0);
+		/* Fire data frozen trigger just before we copy the data */
+		jbd2_buffer_frozen_trigger(jh, source + offset,
+					   jh->b_triggers);
 		memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size);
 		kunmap_atomic(source, KM_USER0);
 
@@ -963,15 +966,15 @@ void jbd2_journal_set_triggers(struct buffer_head *bh,
 	jh->b_triggers = type;
 }
 
-void jbd2_buffer_commit_trigger(struct journal_head *jh, void *mapped_data,
+void jbd2_buffer_frozen_trigger(struct journal_head *jh, void *mapped_data,
 				struct jbd2_buffer_trigger_type *triggers)
 {
 	struct buffer_head *bh = jh2bh(jh);
 
-	if (!triggers || !triggers->t_commit)
+	if (!triggers || !triggers->t_frozen)
 		return;
 
-	triggers->t_commit(triggers, bh, mapped_data, bh->b_size);
+	triggers->t_frozen(triggers, bh, mapped_data, bh->b_size);
 }
 
 void jbd2_buffer_abort_trigger(struct journal_head *jh,
@@ -1311,7 +1314,6 @@ int jbd2_journal_stop(handle_t *handle)
 	if (handle->h_sync)
 		transaction->t_synchronous_commit = 1;
 	current->journal_info = NULL;
-	spin_lock(&journal->j_state_lock);
 	spin_lock(&transaction->t_handle_lock);
 	transaction->t_outstanding_credits -= handle->h_buffer_credits;
 	transaction->t_updates--;
@@ -1340,8 +1342,7 @@ int jbd2_journal_stop(handle_t *handle)
 		jbd_debug(2, "transaction too old, requesting commit for "
 					"handle %p\n", handle);
 		/* This is non-blocking */
-		__jbd2_log_start_commit(journal, transaction->t_tid);
-		spin_unlock(&journal->j_state_lock);
+		jbd2_log_start_commit(journal, transaction->t_tid);
 
 		/*
 		 * Special case: JBD2_SYNC synchronous updates require us
@@ -1351,7 +1352,6 @@ int jbd2_journal_stop(handle_t *handle)
 			err = jbd2_log_wait_commit(journal, tid);
 	} else {
 		spin_unlock(&transaction->t_handle_lock);
-		spin_unlock(&journal->j_state_lock);
 	}
 
 	lock_map_release(&handle->h_lockdep_map);
@@ -1727,6 +1727,21 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
 	if (!jh)
 		goto zap_buffer_no_jh;
 
+	/*
+	 * We cannot remove the buffer from checkpoint lists until the
+	 * transaction adding inode to orphan list (let's call it T)
+	 * is committed.  Otherwise if the transaction changing the
+	 * buffer would be cleaned from the journal before T is
+	 * committed, a crash will cause that the correct contents of
+	 * the buffer will be lost.  On the other hand we have to
+	 * clear the buffer dirty bit at latest at the moment when the
+	 * transaction marking the buffer as freed in the filesystem
+	 * structures is committed because from that moment on the
+	 * buffer can be reallocated and used by a different page.
+	 * Since the block hasn't been freed yet but the inode has
+	 * already been added to orphan list, it is safe for us to add
+	 * the buffer to BJ_Forget list of the newest transaction.
+	 */
 	transaction = jh->b_transaction;
 	if (transaction == NULL) {
 		/* First case: not on any transaction.  If it
@@ -1783,16 +1798,15 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
 	} else if (transaction == journal->j_committing_transaction) {
 		JBUFFER_TRACE(jh, "on committing transaction");
 		/*
-		 * If it is committing, we simply cannot touch it.  We
-		 * can remove it's next_transaction pointer from the
-		 * running transaction if that is set, but nothing
-		 * else. */
+		 * The buffer is committing, we simply cannot touch
+		 * it. So we just set j_next_transaction to the
+		 * running transaction (if there is one) and mark
+		 * buffer as freed so that commit code knows it should
+		 * clear dirty bits when it is done with the buffer.
+		 */
 		set_buffer_freed(bh);
-		if (jh->b_next_transaction) {
-			J_ASSERT(jh->b_next_transaction ==
-					journal->j_running_transaction);
-			jh->b_next_transaction = NULL;
-		}
+		if (journal->j_running_transaction && buffer_jbddirty(bh))
+			jh->b_next_transaction = journal->j_running_transaction;
 		jbd2_journal_put_journal_head(jh);
 		spin_unlock(&journal->j_list_lock);
 		jbd_unlock_bh_state(bh);
@@ -1969,7 +1983,7 @@ void jbd2_journal_file_buffer(struct journal_head *jh,
  */
 void __jbd2_journal_refile_buffer(struct journal_head *jh)
 {
-	int was_dirty;
+	int was_dirty, jlist;
 	struct buffer_head *bh = jh2bh(jh);
 
 	J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
@@ -1991,8 +2005,13 @@ void __jbd2_journal_refile_buffer(struct journal_head *jh)
 	__jbd2_journal_temp_unlink_buffer(jh);
 	jh->b_transaction = jh->b_next_transaction;
 	jh->b_next_transaction = NULL;
-	__jbd2_journal_file_buffer(jh, jh->b_transaction,
-				jh->b_modified ? BJ_Metadata : BJ_Reserved);
+	if (buffer_freed(bh))
+		jlist = BJ_Forget;
+	else if (jh->b_modified)
+		jlist = BJ_Metadata;
+	else
+		jlist = BJ_Reserved;
+	__jbd2_journal_file_buffer(jh, jh->b_transaction, jlist);
 	J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING);
 
 	if (was_dirty)
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index 7cdc3196476a..54a92fd02bbd 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -234,8 +234,9 @@ static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
 			if (inode->i_mode != mode) {
 				struct iattr attr;
 
-				attr.ia_valid = ATTR_MODE;
+				attr.ia_valid = ATTR_MODE | ATTR_CTIME;
 				attr.ia_mode = mode;
+				attr.ia_ctime = CURRENT_TIME_SEC;
 				rc = jffs2_do_setattr(inode, &attr);
 				if (rc < 0)
 					return rc;
@@ -419,7 +420,7 @@ static int jffs2_acl_setxattr(struct dentry *dentry, const char *name,
 	return rc;
 }
 
-struct xattr_handler jffs2_acl_access_xattr_handler = {
+const struct xattr_handler jffs2_acl_access_xattr_handler = {
 	.prefix	= POSIX_ACL_XATTR_ACCESS,
 	.flags	= ACL_TYPE_DEFAULT,
 	.list	= jffs2_acl_access_listxattr,
@@ -427,7 +428,7 @@ struct xattr_handler jffs2_acl_access_xattr_handler = {
 	.set	= jffs2_acl_setxattr,
 };
 
-struct xattr_handler jffs2_acl_default_xattr_handler = {
+const struct xattr_handler jffs2_acl_default_xattr_handler = {
 	.prefix	= POSIX_ACL_XATTR_DEFAULT,
 	.flags	= ACL_TYPE_DEFAULT,
 	.list	= jffs2_acl_default_listxattr,
diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h
index f0ba63e3c36b..5e42de8d9541 100644
--- a/fs/jffs2/acl.h
+++ b/fs/jffs2/acl.h
@@ -31,8 +31,8 @@ extern int jffs2_acl_chmod(struct inode *);
 extern int jffs2_init_acl_pre(struct inode *, struct inode *, int *);
 extern int jffs2_init_acl_post(struct inode *);
 
-extern struct xattr_handler jffs2_acl_access_xattr_handler;
-extern struct xattr_handler jffs2_acl_default_xattr_handler;
+extern const struct xattr_handler jffs2_acl_access_xattr_handler;
+extern const struct xattr_handler jffs2_acl_default_xattr_handler;
 
 #else
 
diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c
index 3ff50da94789..55f1dde2fa8b 100644
--- a/fs/jffs2/background.c
+++ b/fs/jffs2/background.c
@@ -23,10 +23,9 @@ static int jffs2_garbage_collect_thread(void *);
 
 void jffs2_garbage_collect_trigger(struct jffs2_sb_info *c)
 {
-	spin_lock(&c->erase_completion_lock);
+	assert_spin_locked(&c->erase_completion_lock);
 	if (c->gc_task && jffs2_thread_should_wake(c))
 		send_sig(SIGHUP, c->gc_task, 1);
-	spin_unlock(&c->erase_completion_lock);
 }
 
 /* This must only ever be called when no GC thread is currently running */
diff --git a/fs/jffs2/compr_lzo.c b/fs/jffs2/compr_lzo.c
index 90cb60d09787..cd02acafde8a 100644
--- a/fs/jffs2/compr_lzo.c
+++ b/fs/jffs2/compr_lzo.c
@@ -11,7 +11,6 @@
 
 #include <linux/kernel.h>
 #include <linux/sched.h>
-#include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/init.h>
 #include <linux/lzo.h>
diff --git a/fs/jffs2/compr_zlib.c b/fs/jffs2/compr_zlib.c
index cfd301a5edfc..b46661a42758 100644
--- a/fs/jffs2/compr_zlib.c
+++ b/fs/jffs2/compr_zlib.c
@@ -14,7 +14,6 @@
 #endif
 
 #include <linux/kernel.h>
-#include <linux/slab.h>
 #include <linux/zlib.h>
 #include <linux/zutil.h>
 #include "nodelist.h"
diff --git a/fs/jffs2/debug.c b/fs/jffs2/debug.c
index 5544d31c066b..ec3538413926 100644
--- a/fs/jffs2/debug.c
+++ b/fs/jffs2/debug.c
@@ -15,6 +15,7 @@
 #include <linux/crc32.h>
 #include <linux/jffs2.h>
 #include <linux/mtd/mtd.h>
+#include <linux/slab.h>
 #include "nodelist.h"
 #include "debug.h"
 
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 7aa4417e085f..166062a68230 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -222,15 +222,18 @@ static int jffs2_create(struct inode *dir_i, struct dentry *dentry, int mode,
 	dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(ri->ctime));
 
 	jffs2_free_raw_inode(ri);
-	d_instantiate(dentry, inode);
 
 	D1(printk(KERN_DEBUG "jffs2_create: Created ino #%lu with mode %o, nlink %d(%d). nrpages %ld\n",
 		  inode->i_ino, inode->i_mode, inode->i_nlink,
 		  f->inocache->pino_nlink, inode->i_mapping->nrpages));
+
+	d_instantiate(dentry, inode);
+	unlock_new_inode(inode);
 	return 0;
 
  fail:
 	make_bad_inode(inode);
+	unlock_new_inode(inode);
 	iput(inode);
 	jffs2_free_raw_inode(ri);
 	return ret;
@@ -360,8 +363,8 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
 		/* Eeek. Wave bye bye */
 		mutex_unlock(&f->sem);
 		jffs2_complete_reservation(c);
-		jffs2_clear_inode(inode);
-		return PTR_ERR(fn);
+		ret = PTR_ERR(fn);
+		goto fail;
 	}
 
 	/* We use f->target field to store the target path. */
@@ -370,8 +373,8 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
 		printk(KERN_WARNING "Can't allocate %d bytes of memory\n", targetlen + 1);
 		mutex_unlock(&f->sem);
 		jffs2_complete_reservation(c);
-		jffs2_clear_inode(inode);
-		return -ENOMEM;
+		ret = -ENOMEM;
+		goto fail;
 	}
 
 	memcpy(f->target, target, targetlen + 1);
@@ -386,30 +389,24 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
 	jffs2_complete_reservation(c);
 
 	ret = jffs2_init_security(inode, dir_i);
-	if (ret) {
-		jffs2_clear_inode(inode);
-		return ret;
-	}
+	if (ret)
+		goto fail;
+
 	ret = jffs2_init_acl_post(inode);
-	if (ret) {
-		jffs2_clear_inode(inode);
-		return ret;
-	}
+	if (ret)
+		goto fail;
 
 	ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen,
 				  ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
-	if (ret) {
-		/* Eep. */
-		jffs2_clear_inode(inode);
-		return ret;
-	}
+	if (ret)
+		goto fail;
 
 	rd = jffs2_alloc_raw_dirent();
 	if (!rd) {
 		/* Argh. Now we treat it like a normal delete */
 		jffs2_complete_reservation(c);
-		jffs2_clear_inode(inode);
-		return -ENOMEM;
+		ret = -ENOMEM;
+		goto fail;
 	}
 
 	dir_f = JFFS2_INODE_INFO(dir_i);
@@ -437,8 +434,8 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
 		jffs2_complete_reservation(c);
 		jffs2_free_raw_dirent(rd);
 		mutex_unlock(&dir_f->sem);
-		jffs2_clear_inode(inode);
-		return PTR_ERR(fd);
+		ret = PTR_ERR(fd);
+		goto fail;
 	}
 
 	dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime));
@@ -453,7 +450,14 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
 	jffs2_complete_reservation(c);
 
 	d_instantiate(dentry, inode);
+	unlock_new_inode(inode);
 	return 0;
+
+ fail:
+	make_bad_inode(inode);
+	unlock_new_inode(inode);
+	iput(inode);
+	return ret;
 }
 
 
@@ -519,8 +523,8 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode)
 		/* Eeek. Wave bye bye */
 		mutex_unlock(&f->sem);
 		jffs2_complete_reservation(c);
-		jffs2_clear_inode(inode);
-		return PTR_ERR(fn);
+		ret = PTR_ERR(fn);
+		goto fail;
 	}
 	/* No data here. Only a metadata node, which will be
 	   obsoleted by the first data write
@@ -531,30 +535,24 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode)
 	jffs2_complete_reservation(c);
 
 	ret = jffs2_init_security(inode, dir_i);
-	if (ret) {
-		jffs2_clear_inode(inode);
-		return ret;
-	}
+	if (ret)
+		goto fail;
+
 	ret = jffs2_init_acl_post(inode);
-	if (ret) {
-		jffs2_clear_inode(inode);
-		return ret;
-	}
+	if (ret)
+		goto fail;
 
 	ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen,
 				  ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
-	if (ret) {
-		/* Eep. */
-		jffs2_clear_inode(inode);
-		return ret;
-	}
+	if (ret)
+		goto fail;
 
 	rd = jffs2_alloc_raw_dirent();
 	if (!rd) {
 		/* Argh. Now we treat it like a normal delete */
 		jffs2_complete_reservation(c);
-		jffs2_clear_inode(inode);
-		return -ENOMEM;
+		ret = -ENOMEM;
+		goto fail;
 	}
 
 	dir_f = JFFS2_INODE_INFO(dir_i);
@@ -582,8 +580,8 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode)
 		jffs2_complete_reservation(c);
 		jffs2_free_raw_dirent(rd);
 		mutex_unlock(&dir_f->sem);
-		jffs2_clear_inode(inode);
-		return PTR_ERR(fd);
+		ret = PTR_ERR(fd);
+		goto fail;
 	}
 
 	dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime));
@@ -599,7 +597,14 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode)
 	jffs2_complete_reservation(c);
 
 	d_instantiate(dentry, inode);
+	unlock_new_inode(inode);
 	return 0;
+
+ fail:
+	make_bad_inode(inode);
+	unlock_new_inode(inode);
+	iput(inode);
+	return ret;
 }
 
 static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry)
@@ -693,8 +698,8 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de
 		/* Eeek. Wave bye bye */
 		mutex_unlock(&f->sem);
 		jffs2_complete_reservation(c);
-		jffs2_clear_inode(inode);
-		return PTR_ERR(fn);
+		ret = PTR_ERR(fn);
+		goto fail;
 	}
 	/* No data here. Only a metadata node, which will be
 	   obsoleted by the first data write
@@ -705,30 +710,24 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de
 	jffs2_complete_reservation(c);
 
 	ret = jffs2_init_security(inode, dir_i);
-	if (ret) {
-		jffs2_clear_inode(inode);
-		return ret;
-	}
+	if (ret)
+		goto fail;
+
 	ret = jffs2_init_acl_post(inode);
-	if (ret) {
-		jffs2_clear_inode(inode);
-		return ret;
-	}
+	if (ret)
+		goto fail;
 
 	ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen,
 				  ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
-	if (ret) {
-		/* Eep. */
-		jffs2_clear_inode(inode);
-		return ret;
-	}
+	if (ret)
+		goto fail;
 
 	rd = jffs2_alloc_raw_dirent();
 	if (!rd) {
 		/* Argh. Now we treat it like a normal delete */
 		jffs2_complete_reservation(c);
-		jffs2_clear_inode(inode);
-		return -ENOMEM;
+		ret = -ENOMEM;
+		goto fail;
 	}
 
 	dir_f = JFFS2_INODE_INFO(dir_i);
@@ -759,8 +758,8 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de
 		jffs2_complete_reservation(c);
 		jffs2_free_raw_dirent(rd);
 		mutex_unlock(&dir_f->sem);
-		jffs2_clear_inode(inode);
-		return PTR_ERR(fd);
+		ret = PTR_ERR(fd);
+		goto fail;
 	}
 
 	dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime));
@@ -775,8 +774,14 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de
 	jffs2_complete_reservation(c);
 
 	d_instantiate(dentry, inode);
-
+	unlock_new_inode(inode);
 	return 0;
+
+ fail:
+	make_bad_inode(inode);
+	unlock_new_inode(inode);
+	iput(inode);
+	return ret;
 }
 
 static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c
index b47679be118a..6286ad9b00f7 100644
--- a/fs/jffs2/erase.c
+++ b/fs/jffs2/erase.c
@@ -103,9 +103,10 @@ static void jffs2_erase_block(struct jffs2_sb_info *c,
 	jffs2_erase_failed(c, jeb, bad_offset);
 }
 
-void jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count)
+int jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count)
 {
 	struct jffs2_eraseblock *jeb;
+	int work_done = 0;
 
 	mutex_lock(&c->erase_free_sem);
 
@@ -121,6 +122,7 @@ void jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count)
 			mutex_unlock(&c->erase_free_sem);
 			jffs2_mark_erased_block(c, jeb);
 
+			work_done++;
 			if (!--count) {
 				D1(printk(KERN_DEBUG "Count reached. jffs2_erase_pending_blocks leaving\n"));
 				goto done;
@@ -157,6 +159,7 @@ void jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count)
 	mutex_unlock(&c->erase_free_sem);
  done:
 	D1(printk(KERN_DEBUG "jffs2_erase_pending_blocks completed\n"));
+	return work_done;
 }
 
 static void jffs2_erase_succeeded(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb)
@@ -165,10 +168,11 @@ static void jffs2_erase_succeeded(struct jffs2_sb_info *c, struct jffs2_eraseblo
 	mutex_lock(&c->erase_free_sem);
 	spin_lock(&c->erase_completion_lock);
 	list_move_tail(&jeb->list, &c->erase_complete_list);
+	/* Wake the GC thread to mark them clean */
+	jffs2_garbage_collect_trigger(c);
 	spin_unlock(&c->erase_completion_lock);
 	mutex_unlock(&c->erase_free_sem);
-	/* Ensure that kupdated calls us again to mark them clean */
-	jffs2_erase_pending_trigger(c);
+	wake_up(&c->erase_wait);
 }
 
 static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, uint32_t bad_offset)
@@ -487,9 +491,9 @@ filebad:
 
 refile:
 	/* Stick it back on the list from whence it came and come back later */
-	jffs2_erase_pending_trigger(c);
 	mutex_lock(&c->erase_free_sem);
 	spin_lock(&c->erase_completion_lock);
+	jffs2_garbage_collect_trigger(c);
 	list_move(&jeb->list, &c->erase_complete_list);
 	spin_unlock(&c->erase_completion_lock);
 	mutex_unlock(&c->erase_free_sem);
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index b7b74e299142..813497024437 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -10,7 +10,6 @@
  */
 
 #include <linux/kernel.h>
-#include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/time.h>
 #include <linux/pagemap.h>
@@ -27,9 +26,9 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
 			struct page **pagep, void **fsdata);
 static int jffs2_readpage (struct file *filp, struct page *pg);
 
-int jffs2_fsync(struct file *filp, struct dentry *dentry, int datasync)
+int jffs2_fsync(struct file *filp, int datasync)
 {
-	struct inode *inode = dentry->d_inode;
+	struct inode *inode = filp->f_mapping->host;
 	struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
 
 	/* Trigger GC to flush any pending writes for this inode */
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 3451a81b2142..459d39d1ea0b 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -169,13 +169,13 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr)
 	mutex_unlock(&f->sem);
 	jffs2_complete_reservation(c);
 
-	/* We have to do the vmtruncate() without f->sem held, since
+	/* We have to do the simple_setsize() without f->sem held, since
 	   some pages may be locked and waiting for it in readpage().
 	   We are protected from a simultaneous write() extending i_size
 	   back past iattr->ia_size, because do_truncate() holds the
 	   generic inode semaphore. */
 	if (ivalid & ATTR_SIZE && inode->i_size > iattr->ia_size) {
-		vmtruncate(inode, iattr->ia_size);	
+		simple_setsize(inode, iattr->ia_size);
 		inode->i_blocks = (inode->i_size + 511) >> 9;
 	}	
 
@@ -313,8 +313,8 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino)
 	case S_IFBLK:
 	case S_IFCHR:
 		/* Read the device numbers from the media */
-		if (f->metadata->size != sizeof(jdev.old) &&
-		    f->metadata->size != sizeof(jdev.new)) {
+		if (f->metadata->size != sizeof(jdev.old_id) &&
+		    f->metadata->size != sizeof(jdev.new_id)) {
 			printk(KERN_NOTICE "Device node has strange size %d\n", f->metadata->size);
 			goto error_io;
 		}
@@ -325,10 +325,10 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino)
 			printk(KERN_NOTICE "Read device numbers for inode %lu failed\n", (unsigned long)inode->i_ino);
 			goto error;
 		}
-		if (f->metadata->size == sizeof(jdev.old))
-			rdev = old_decode_dev(je16_to_cpu(jdev.old));
+		if (f->metadata->size == sizeof(jdev.old_id))
+			rdev = old_decode_dev(je16_to_cpu(jdev.old_id));
 		else
-			rdev = new_decode_dev(je32_to_cpu(jdev.new));
+			rdev = new_decode_dev(je32_to_cpu(jdev.new_id));
 
 	case S_IFSOCK:
 	case S_IFIFO:
@@ -465,7 +465,12 @@ struct inode *jffs2_new_inode (struct inode *dir_i, int mode, struct jffs2_raw_i
 	inode->i_blocks = 0;
 	inode->i_size = 0;
 
-	insert_inode_hash(inode);
+	if (insert_inode_locked(inode) < 0) {
+		make_bad_inode(inode);
+		unlock_new_inode(inode);
+		iput(inode);
+		return ERR_PTR(-EINVAL);
+	}
 
 	return inode;
 }
diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c
index 3b6f2fa12cff..f5e96bd656e8 100644
--- a/fs/jffs2/gc.c
+++ b/fs/jffs2/gc.c
@@ -214,6 +214,19 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
 		return ret;
 	}
 
+	/* If there are any blocks which need erasing, erase them now */
+	if (!list_empty(&c->erase_complete_list) ||
+	    !list_empty(&c->erase_pending_list)) {
+		spin_unlock(&c->erase_completion_lock);
+		D1(printk(KERN_DEBUG "jffs2_garbage_collect_pass() erasing pending blocks\n"));
+		if (jffs2_erase_pending_blocks(c, 1)) {
+			mutex_unlock(&c->alloc_sem);
+			return 0;
+		}
+		D1(printk(KERN_DEBUG "No progress from erasing blocks; doing GC anyway\n"));
+		spin_lock(&c->erase_completion_lock);
+	}
+
 	/* First, work out which block we're garbage-collecting */
 	jeb = c->gcblock;
 
@@ -222,7 +235,7 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
 
 	if (!jeb) {
 		/* Couldn't find a free block. But maybe we can just erase one and make 'progress'? */
-		if (!list_empty(&c->erase_pending_list)) {
+		if (c->nr_erasing_blocks) {
 			spin_unlock(&c->erase_completion_lock);
 			mutex_unlock(&c->alloc_sem);
 			return -EAGAIN;
@@ -435,7 +448,7 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
 		list_add_tail(&c->gcblock->list, &c->erase_pending_list);
 		c->gcblock = NULL;
 		c->nr_erasing_blocks++;
-		jffs2_erase_pending_trigger(c);
+		jffs2_garbage_collect_trigger(c);
 	}
 	spin_unlock(&c->erase_completion_lock);
 
diff --git a/fs/jffs2/nodelist.c b/fs/jffs2/nodelist.c
index 87c6f555e1a0..af02bd138469 100644
--- a/fs/jffs2/nodelist.c
+++ b/fs/jffs2/nodelist.c
@@ -15,7 +15,6 @@
 #include <linux/mtd/mtd.h>
 #include <linux/rbtree.h>
 #include <linux/crc32.h>
-#include <linux/slab.h>
 #include <linux/pagemap.h>
 #include "nodelist.h"
 
diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h
index 507ed6ec1847..a881a42f19e3 100644
--- a/fs/jffs2/nodelist.h
+++ b/fs/jffs2/nodelist.h
@@ -312,11 +312,11 @@ static inline int jffs2_blocks_use_vmalloc(struct jffs2_sb_info *c)
 static inline int jffs2_encode_dev(union jffs2_device_node *jdev, dev_t rdev)
 {
 	if (old_valid_dev(rdev)) {
-		jdev->old = cpu_to_je16(old_encode_dev(rdev));
-		return sizeof(jdev->old);
+		jdev->old_id = cpu_to_je16(old_encode_dev(rdev));
+		return sizeof(jdev->old_id);
 	} else {
-		jdev->new = cpu_to_je32(new_encode_dev(rdev));
-		return sizeof(jdev->new);
+		jdev->new_id = cpu_to_je32(new_encode_dev(rdev));
+		return sizeof(jdev->new_id);
 	}
 }
 
@@ -464,7 +464,7 @@ int jffs2_scan_dirty_space(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb
 int jffs2_do_mount_fs(struct jffs2_sb_info *c);
 
 /* erase.c */
-void jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count);
+int jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count);
 void jffs2_free_jeb_node_refs(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb);
 
 #ifdef CONFIG_JFFS2_FS_WRITEBUFFER
diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c
index 21a052915aa9..694aa5b03505 100644
--- a/fs/jffs2/nodemgmt.c
+++ b/fs/jffs2/nodemgmt.c
@@ -10,7 +10,6 @@
  */
 
 #include <linux/kernel.h>
-#include <linux/slab.h>
 #include <linux/mtd/mtd.h>
 #include <linux/compiler.h>
 #include <linux/sched.h> /* For cond_resched() */
@@ -117,9 +116,21 @@ int jffs2_reserve_space(struct jffs2_sb_info *c, uint32_t minsize,
 
 			ret = jffs2_garbage_collect_pass(c);
 
-			if (ret == -EAGAIN)
-				jffs2_erase_pending_blocks(c, 1);
-			else if (ret)
+			if (ret == -EAGAIN) {
+				spin_lock(&c->erase_completion_lock);
+				if (c->nr_erasing_blocks &&
+				    list_empty(&c->erase_pending_list) &&
+				    list_empty(&c->erase_complete_list)) {
+					DECLARE_WAITQUEUE(wait, current);
+					set_current_state(TASK_UNINTERRUPTIBLE);
+					add_wait_queue(&c->erase_wait, &wait);
+					D1(printk(KERN_DEBUG "%s waiting for erase to complete\n", __func__));
+					spin_unlock(&c->erase_completion_lock);
+
+					schedule();
+				} else
+					spin_unlock(&c->erase_completion_lock);
+			} else if (ret)
 				return ret;
 
 			cond_resched();
@@ -218,7 +229,7 @@ static int jffs2_find_nextblock(struct jffs2_sb_info *c)
 			ejeb = list_entry(c->erasable_list.next, struct jffs2_eraseblock, list);
 			list_move_tail(&ejeb->list, &c->erase_pending_list);
 			c->nr_erasing_blocks++;
-			jffs2_erase_pending_trigger(c);
+			jffs2_garbage_collect_trigger(c);
 			D1(printk(KERN_DEBUG "jffs2_find_nextblock: Triggering erase of erasable block at 0x%08x\n",
 				  ejeb->offset));
 		}
@@ -470,7 +481,9 @@ struct jffs2_raw_node_ref *jffs2_add_physical_node_ref(struct jffs2_sb_info *c,
 void jffs2_complete_reservation(struct jffs2_sb_info *c)
 {
 	D1(printk(KERN_DEBUG "jffs2_complete_reservation()\n"));
+	spin_lock(&c->erase_completion_lock);
 	jffs2_garbage_collect_trigger(c);
+	spin_unlock(&c->erase_completion_lock);
 	mutex_unlock(&c->alloc_sem);
 }
 
@@ -612,7 +625,7 @@ void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref
 				D1(printk(KERN_DEBUG "...and adding to erase_pending_list\n"));
 				list_add_tail(&jeb->list, &c->erase_pending_list);
 				c->nr_erasing_blocks++;
-				jffs2_erase_pending_trigger(c);
+				jffs2_garbage_collect_trigger(c);
 			} else {
 				/* Sometimes, however, we leave it elsewhere so it doesn't get
 				   immediately reused, and we spread the load a bit. */
@@ -733,6 +746,10 @@ int jffs2_thread_should_wake(struct jffs2_sb_info *c)
 	int nr_very_dirty = 0;
 	struct jffs2_eraseblock *jeb;
 
+	if (!list_empty(&c->erase_complete_list) ||
+	    !list_empty(&c->erase_pending_list))
+		return 1;
+
 	if (c->unchecked_size) {
 		D1(printk(KERN_DEBUG "jffs2_thread_should_wake(): unchecked_size %d, checked_ino #%d\n",
 			  c->unchecked_size, c->checked_ino));
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index a7f03b7ebcb3..4791aacf3084 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -140,8 +140,7 @@ void jffs2_nor_wbuf_flash_cleanup(struct jffs2_sb_info *c);
 
 #endif /* WRITEBUFFER */
 
-/* erase.c */
-static inline void jffs2_erase_pending_trigger(struct jffs2_sb_info *c)
+static inline void jffs2_dirty_trigger(struct jffs2_sb_info *c)
 {
 	OFNI_BS_2SFFJ(c)->s_dirt = 1;
 }
@@ -159,7 +158,7 @@ extern const struct inode_operations jffs2_dir_inode_operations;
 extern const struct file_operations jffs2_file_operations;
 extern const struct inode_operations jffs2_file_inode_operations;
 extern const struct address_space_operations jffs2_file_address_operations;
-int jffs2_fsync(struct file *, struct dentry *, int);
+int jffs2_fsync(struct file *, int);
 int jffs2_do_readpage_unlock (struct inode *inode, struct page *pg);
 
 /* ioctl.c */
diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c
index e22de8397b74..d32ee9412cb9 100644
--- a/fs/jffs2/readinode.c
+++ b/fs/jffs2/readinode.c
@@ -567,7 +567,7 @@ static void jffs2_free_tmp_dnode_info_list(struct rb_root *list)
 			else BUG();
 		}
 	}
-	list->rb_node = NULL;
+	*list = RB_ROOT;
 }
 
 static void jffs2_free_full_dirent_list(struct jffs2_full_dirent *fd)
diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index 696686cc206e..46f870d1cc36 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -260,7 +260,9 @@ int jffs2_scan_medium(struct jffs2_sb_info *c)
 			ret = -EIO;
 			goto out;
 		}
-		jffs2_erase_pending_trigger(c);
+		spin_lock(&c->erase_completion_lock);
+		jffs2_garbage_collect_trigger(c);
+		spin_unlock(&c->erase_completion_lock);
 	}
 	ret = 0;
  out:
diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c
index eaccee058583..239f51216a68 100644
--- a/fs/jffs2/security.c
+++ b/fs/jffs2/security.c
@@ -77,7 +77,7 @@ static size_t jffs2_security_listxattr(struct dentry *dentry, char *list,
 	return retlen;
 }
 
-struct xattr_handler jffs2_security_xattr_handler = {
+const struct xattr_handler jffs2_security_xattr_handler = {
 	.prefix = XATTR_SECURITY_PREFIX,
 	.list = jffs2_security_listxattr,
 	.set = jffs2_security_setxattr,
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 9a80e8e595d0..511e2d609d12 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -63,8 +63,6 @@ static void jffs2_write_super(struct super_block *sb)
 
 	if (!(sb->s_flags & MS_RDONLY)) {
 		D1(printk(KERN_DEBUG "jffs2_write_super()\n"));
-		jffs2_garbage_collect_trigger(c);
-		jffs2_erase_pending_blocks(c, 0);
 		jffs2_flush_wbuf_gc(c, 0);
 	}
 
diff --git a/fs/jffs2/symlink.c b/fs/jffs2/symlink.c
index 4ec11e8bda8c..b955626071c2 100644
--- a/fs/jffs2/symlink.c
+++ b/fs/jffs2/symlink.c
@@ -10,7 +10,6 @@
  */
 
 #include <linux/kernel.h>
-#include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/namei.h>
 #include "nodelist.h"
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index 5ef7bac265e5..07ee1546b2fa 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -84,7 +84,7 @@ static void jffs2_wbuf_dirties_inode(struct jffs2_sb_info *c, uint32_t ino)
 	struct jffs2_inodirty *new;
 
 	/* Mark the superblock dirty so that kupdated will flush... */
-	jffs2_erase_pending_trigger(c);
+	jffs2_dirty_trigger(c);
 
 	if (jffs2_wbuf_pending_for_ino(c, ino))
 		return;
@@ -121,7 +121,7 @@ static inline void jffs2_refile_wbuf_blocks(struct jffs2_sb_info *c)
 			D1(printk(KERN_DEBUG "...and adding to erase_pending_list\n"));
 			list_add_tail(&jeb->list, &c->erase_pending_list);
 			c->nr_erasing_blocks++;
-			jffs2_erase_pending_trigger(c);
+			jffs2_garbage_collect_trigger(c);
 		} else {
 			/* Sometimes, however, we leave it elsewhere so it doesn't get
 			   immediately reused, and we spread the load a bit. */
@@ -152,7 +152,7 @@ static void jffs2_block_refile(struct jffs2_sb_info *c, struct jffs2_eraseblock
 		D1(printk("Refiling block at %08x to erase_pending_list\n", jeb->offset));
 		list_add(&jeb->list, &c->erase_pending_list);
 		c->nr_erasing_blocks++;
-		jffs2_erase_pending_trigger(c);
+		jffs2_garbage_collect_trigger(c);
 	}
 
 	if (!jffs2_prealloc_raw_node_refs(c, jeb, 1)) {
@@ -543,7 +543,7 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c)
 		D1(printk(KERN_DEBUG "Failing block at %08x is now empty. Moving to erase_pending_list\n", jeb->offset));
 		list_move(&jeb->list, &c->erase_pending_list);
 		c->nr_erasing_blocks++;
-		jffs2_erase_pending_trigger(c);
+		jffs2_garbage_collect_trigger(c);
 	}
 
 	jffs2_dbg_acct_sanity_check_nolock(c, jeb);
diff --git a/fs/jffs2/write.c b/fs/jffs2/write.c
index ca29440e9435..c819eb0e982d 100644
--- a/fs/jffs2/write.c
+++ b/fs/jffs2/write.c
@@ -12,7 +12,6 @@
 #include <linux/kernel.h>
 #include <linux/fs.h>
 #include <linux/crc32.h>
-#include <linux/slab.h>
 #include <linux/pagemap.h>
 #include <linux/mtd/mtd.h>
 #include "nodelist.h"
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index 9e75c62c85d6..d258e261bdc7 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -626,7 +626,7 @@ void jffs2_xattr_free_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *i
 
 static int check_xattr_ref_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic)
 {
-	/* success of check_xattr_ref_inode() means taht inode (ic) dose not have
+	/* success of check_xattr_ref_inode() means that inode (ic) dose not have
 	 * duplicate name/value pairs. If duplicate name/value pair would be found,
 	 * one will be removed.
 	 */
@@ -904,7 +904,7 @@ struct jffs2_xattr_datum *jffs2_setup_xattr_datum(struct jffs2_sb_info *c,
  * do_jffs2_setxattr(inode, xprefix, xname, buffer, size, flags)
  *   is an implementation of setxattr handler on jffs2.
  * -------------------------------------------------- */
-struct xattr_handler *jffs2_xattr_handlers[] = {
+const struct xattr_handler *jffs2_xattr_handlers[] = {
 	&jffs2_user_xattr_handler,
 #ifdef CONFIG_JFFS2_FS_SECURITY
 	&jffs2_security_xattr_handler,
@@ -917,8 +917,8 @@ struct xattr_handler *jffs2_xattr_handlers[] = {
 	NULL
 };
 
-static struct xattr_handler *xprefix_to_handler(int xprefix) {
-	struct xattr_handler *ret;
+static const struct xattr_handler *xprefix_to_handler(int xprefix) {
+	const struct xattr_handler *ret;
 
 	switch (xprefix) {
 	case JFFS2_XPREFIX_USER:
@@ -955,7 +955,7 @@ ssize_t jffs2_listxattr(struct dentry *dentry, char *buffer, size_t size)
 	struct jffs2_inode_cache *ic = f->inocache;
 	struct jffs2_xattr_ref *ref, **pref;
 	struct jffs2_xattr_datum *xd;
-	struct xattr_handler *xhandle;
+	const struct xattr_handler *xhandle;
 	ssize_t len, rc;
 	int retry = 0;
 
diff --git a/fs/jffs2/xattr.h b/fs/jffs2/xattr.h
index 6e3b5ddfb7ab..cf4f5759b42b 100644
--- a/fs/jffs2/xattr.h
+++ b/fs/jffs2/xattr.h
@@ -93,9 +93,9 @@ extern int do_jffs2_getxattr(struct inode *inode, int xprefix, const char *xname
 extern int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname,
 			     const char *buffer, size_t size, int flags);
 
-extern struct xattr_handler *jffs2_xattr_handlers[];
-extern struct xattr_handler jffs2_user_xattr_handler;
-extern struct xattr_handler jffs2_trusted_xattr_handler;
+extern const struct xattr_handler *jffs2_xattr_handlers[];
+extern const struct xattr_handler jffs2_user_xattr_handler;
+extern const struct xattr_handler jffs2_trusted_xattr_handler;
 
 extern ssize_t jffs2_listxattr(struct dentry *, char *, size_t);
 #define jffs2_getxattr		generic_getxattr
@@ -122,7 +122,7 @@ extern ssize_t jffs2_listxattr(struct dentry *, char *, size_t);
 
 #ifdef CONFIG_JFFS2_FS_SECURITY
 extern int jffs2_init_security(struct inode *inode, struct inode *dir);
-extern struct xattr_handler jffs2_security_xattr_handler;
+extern const struct xattr_handler jffs2_security_xattr_handler;
 #else
 #define jffs2_init_security(inode,dir)	(0)
 #endif /* CONFIG_JFFS2_FS_SECURITY */
diff --git a/fs/jffs2/xattr_trusted.c b/fs/jffs2/xattr_trusted.c
index 3e5a5e356e05..1c868194c504 100644
--- a/fs/jffs2/xattr_trusted.c
+++ b/fs/jffs2/xattr_trusted.c
@@ -47,7 +47,7 @@ static size_t jffs2_trusted_listxattr(struct dentry *dentry, char *list,
 	return retlen;
 }
 
-struct xattr_handler jffs2_trusted_xattr_handler = {
+const struct xattr_handler jffs2_trusted_xattr_handler = {
 	.prefix = XATTR_TRUSTED_PREFIX,
 	.list = jffs2_trusted_listxattr,
 	.set = jffs2_trusted_setxattr,
diff --git a/fs/jffs2/xattr_user.c b/fs/jffs2/xattr_user.c
index 8544af67dffe..916b5c966039 100644
--- a/fs/jffs2/xattr_user.c
+++ b/fs/jffs2/xattr_user.c
@@ -47,7 +47,7 @@ static size_t jffs2_user_listxattr(struct dentry *dentry, char *list,
 	return retlen;
 }
 
-struct xattr_handler jffs2_user_xattr_handler = {
+const struct xattr_handler jffs2_user_xattr_handler = {
 	.prefix = XATTR_USER_PREFIX,
 	.list = jffs2_user_listxattr,
 	.set = jffs2_user_setxattr,
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index d66477c34306..1057a4998e4e 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -19,8 +19,8 @@
  */
 
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/fs.h>
-#include <linux/quotaops.h>
 #include <linux/posix_acl_xattr.h>
 #include "jfs_incore.h"
 #include "jfs_txnmgr.h"
@@ -174,7 +174,7 @@ cleanup:
 	return rc;
 }
 
-static int jfs_acl_chmod(struct inode *inode)
+int jfs_acl_chmod(struct inode *inode)
 {
 	struct posix_acl *acl, *clone;
 	int rc;
@@ -205,26 +205,3 @@ static int jfs_acl_chmod(struct inode *inode)
 	posix_acl_release(clone);
 	return rc;
 }
-
-int jfs_setattr(struct dentry *dentry, struct iattr *iattr)
-{
-	struct inode *inode = dentry->d_inode;
-	int rc;
-
-	rc = inode_change_ok(inode, iattr);
-	if (rc)
-		return rc;
-
-	if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) ||
-	    (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) {
-		if (vfs_dq_transfer(inode, iattr))
-			return -EDQUOT;
-	}
-
-	rc = inode_setattr(inode, iattr);
-
-	if (!rc && (iattr->ia_valid & ATTR_MODE))
-		rc = jfs_acl_chmod(inode);
-
-	return rc;
-}
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 2b70fa78e4a7..127263cc8657 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -18,6 +18,7 @@
  */
 
 #include <linux/fs.h>
+#include <linux/quotaops.h>
 #include "jfs_incore.h"
 #include "jfs_inode.h"
 #include "jfs_dmap.h"
@@ -26,9 +27,9 @@
 #include "jfs_acl.h"
 #include "jfs_debug.h"
 
-int jfs_fsync(struct file *file, struct dentry *dentry, int datasync)
+int jfs_fsync(struct file *file, int datasync)
 {
-	struct inode *inode = dentry->d_inode;
+	struct inode *inode = file->f_mapping->host;
 	int rc = 0;
 
 	if (!(inode->i_state & I_DIRTY) ||
@@ -47,7 +48,7 @@ static int jfs_open(struct inode *inode, struct file *file)
 {
 	int rc;
 
-	if ((rc = generic_file_open(inode, file)))
+	if ((rc = dquot_file_open(inode, file)))
 		return rc;
 
 	/*
@@ -88,14 +89,40 @@ static int jfs_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
+int jfs_setattr(struct dentry *dentry, struct iattr *iattr)
+{
+	struct inode *inode = dentry->d_inode;
+	int rc;
+
+	rc = inode_change_ok(inode, iattr);
+	if (rc)
+		return rc;
+
+	if (is_quota_modification(inode, iattr))
+		dquot_initialize(inode);
+	if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) ||
+	    (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) {
+		rc = dquot_transfer(inode, iattr);
+		if (rc)
+			return rc;
+	}
+
+	rc = inode_setattr(inode, iattr);
+
+	if (!rc && (iattr->ia_valid & ATTR_MODE))
+		rc = jfs_acl_chmod(inode);
+
+	return rc;
+}
+
 const struct inode_operations jfs_file_inode_operations = {
 	.truncate	= jfs_truncate,
 	.setxattr	= jfs_setxattr,
 	.getxattr	= jfs_getxattr,
 	.listxattr	= jfs_listxattr,
 	.removexattr	= jfs_removexattr,
-#ifdef CONFIG_JFS_POSIX_ACL
 	.setattr	= jfs_setattr,
+#ifdef CONFIG_JFS_POSIX_ACL
 	.check_acl	= jfs_check_acl,
 #endif
 };
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index b2ae190a77ba..ed9ba6fe04f5 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -22,6 +22,7 @@
 #include <linux/buffer_head.h>
 #include <linux/pagemap.h>
 #include <linux/quotaops.h>
+#include <linux/writeback.h>
 #include "jfs_incore.h"
 #include "jfs_inode.h"
 #include "jfs_filsys.h"
@@ -60,7 +61,7 @@ struct inode *jfs_iget(struct super_block *sb, unsigned long ino)
 			inode->i_op = &page_symlink_inode_operations;
 			inode->i_mapping->a_ops = &jfs_aops;
 		} else {
-			inode->i_op = &jfs_symlink_inode_operations;
+			inode->i_op = &jfs_fast_symlink_inode_operations;
 			/*
 			 * The inline data should be null-terminated, but
 			 * don't let on-disk corruption crash the kernel
@@ -120,8 +121,10 @@ int jfs_commit_inode(struct inode *inode, int wait)
 	return rc;
 }
 
-int jfs_write_inode(struct inode *inode, int wait)
+int jfs_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
+	int wait = wbc->sync_mode == WB_SYNC_ALL;
+
 	if (test_cflag(COMMIT_Nolink, inode))
 		return 0;
 	/*
@@ -146,6 +149,9 @@ void jfs_delete_inode(struct inode *inode)
 {
 	jfs_info("In jfs_delete_inode, inode = 0x%p", inode);
 
+	if (!is_bad_inode(inode))
+		dquot_initialize(inode);
+
 	if (!is_bad_inode(inode) &&
 	    (JFS_IP(inode)->fileset == FILESYSTEM_I)) {
 		truncate_inode_pages(&inode->i_data, 0);
@@ -158,9 +164,9 @@ void jfs_delete_inode(struct inode *inode)
 		/*
 		 * Free the inode from the quota allocation.
 		 */
-		vfs_dq_init(inode);
-		vfs_dq_free_inode(inode);
-		vfs_dq_drop(inode);
+		dquot_initialize(inode);
+		dquot_free_inode(inode);
+		dquot_drop(inode);
 	}
 
 	clear_inode(inode);
diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h
index b07bd417ef85..54e07559878d 100644
--- a/fs/jfs/jfs_acl.h
+++ b/fs/jfs/jfs_acl.h
@@ -22,7 +22,7 @@
 
 int jfs_check_acl(struct inode *, int);
 int jfs_init_acl(tid_t, struct inode *, struct inode *);
-int jfs_setattr(struct dentry *, struct iattr *);
+int jfs_acl_chmod(struct inode *inode);
 
 #else
 
@@ -32,5 +32,10 @@ static inline int jfs_init_acl(tid_t tid, struct inode *inode,
 	return 0;
 }
 
+static inline int jfs_acl_chmod(struct inode *inode)
+{
+	return 0;
+}
+
 #endif
 #endif		/* _H_JFS_ACL */
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index d9b031cf69f5..c92ea3b3ea5e 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -17,6 +17,7 @@
  */
 
 #include <linux/fs.h>
+#include <linux/slab.h>
 #include "jfs_incore.h"
 #include "jfs_superblock.h"
 #include "jfs_dmap.h"
@@ -195,7 +196,7 @@ int dbMount(struct inode *ipbmap)
 	bmp->db_maxag = le32_to_cpu(dbmp_le->dn_maxag);
 	bmp->db_agpref = le32_to_cpu(dbmp_le->dn_agpref);
 	bmp->db_aglevel = le32_to_cpu(dbmp_le->dn_aglevel);
-	bmp->db_agheigth = le32_to_cpu(dbmp_le->dn_agheigth);
+	bmp->db_agheight = le32_to_cpu(dbmp_le->dn_agheight);
 	bmp->db_agwidth = le32_to_cpu(dbmp_le->dn_agwidth);
 	bmp->db_agstart = le32_to_cpu(dbmp_le->dn_agstart);
 	bmp->db_agl2size = le32_to_cpu(dbmp_le->dn_agl2size);
@@ -287,7 +288,7 @@ int dbSync(struct inode *ipbmap)
 	dbmp_le->dn_maxag = cpu_to_le32(bmp->db_maxag);
 	dbmp_le->dn_agpref = cpu_to_le32(bmp->db_agpref);
 	dbmp_le->dn_aglevel = cpu_to_le32(bmp->db_aglevel);
-	dbmp_le->dn_agheigth = cpu_to_le32(bmp->db_agheigth);
+	dbmp_le->dn_agheight = cpu_to_le32(bmp->db_agheight);
 	dbmp_le->dn_agwidth = cpu_to_le32(bmp->db_agwidth);
 	dbmp_le->dn_agstart = cpu_to_le32(bmp->db_agstart);
 	dbmp_le->dn_agl2size = cpu_to_le32(bmp->db_agl2size);
@@ -1440,7 +1441,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
 	 * tree index of this allocation group within the control page.
 	 */
 	agperlev =
-	    (1 << (L2LPERCTL - (bmp->db_agheigth << 1))) / bmp->db_agwidth;
+	    (1 << (L2LPERCTL - (bmp->db_agheight << 1))) / bmp->db_agwidth;
 	ti = bmp->db_agstart + bmp->db_agwidth * (agno & (agperlev - 1));
 
 	/* dmap control page trees fan-out by 4 and a single allocation
@@ -1459,7 +1460,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
 		 * the subtree to find the leftmost leaf that describes this
 		 * free space.
 		 */
-		for (k = bmp->db_agheigth; k > 0; k--) {
+		for (k = bmp->db_agheight; k > 0; k--) {
 			for (n = 0, m = (ti << 2) + 1; n < 4; n++) {
 				if (l2nb <= dcp->stree[m + n]) {
 					ti = m + n;
@@ -2437,7 +2438,7 @@ dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level)
 
 	/* check if this is a control page update for an allocation.
 	 * if so, update the leaf to reflect the new leaf value using
-	 * dbSplit(); otherwise (deallocation), use dbJoin() to udpate
+	 * dbSplit(); otherwise (deallocation), use dbJoin() to update
 	 * the leaf with the new value.  in addition to updating the
 	 * leaf, dbSplit() will also split the binary buddy system of
 	 * the leaves, if required, and bubble new values within the
@@ -3606,7 +3607,7 @@ void dbFinalizeBmap(struct inode *ipbmap)
 	}
 
 	/*
-	 * compute db_aglevel, db_agheigth, db_width, db_agstart:
+	 * compute db_aglevel, db_agheight, db_width, db_agstart:
 	 * an ag is covered in aglevel dmapctl summary tree,
 	 * at agheight level height (from leaf) with agwidth number of nodes
 	 * each, which starts at agstart index node of the smmary tree node
@@ -3615,9 +3616,9 @@ void dbFinalizeBmap(struct inode *ipbmap)
 	bmp->db_aglevel = BMAPSZTOLEV(bmp->db_agsize);
 	l2nl =
 	    bmp->db_agl2size - (L2BPERDMAP + bmp->db_aglevel * L2LPERCTL);
-	bmp->db_agheigth = l2nl >> 1;
-	bmp->db_agwidth = 1 << (l2nl - (bmp->db_agheigth << 1));
-	for (i = 5 - bmp->db_agheigth, bmp->db_agstart = 0, n = 1; i > 0;
+	bmp->db_agheight = l2nl >> 1;
+	bmp->db_agwidth = 1 << (l2nl - (bmp->db_agheight << 1));
+	for (i = 5 - bmp->db_agheight, bmp->db_agstart = 0, n = 1; i > 0;
 	     i--) {
 		bmp->db_agstart += n;
 		n <<= 2;
diff --git a/fs/jfs/jfs_dmap.h b/fs/jfs/jfs_dmap.h
index 1a6eb41569bc..6dcb906c55d8 100644
--- a/fs/jfs/jfs_dmap.h
+++ b/fs/jfs/jfs_dmap.h
@@ -210,7 +210,7 @@ struct dbmap_disk {
 	__le32 dn_maxag;	/* 4: max active alloc group number	*/
 	__le32 dn_agpref;	/* 4: preferred alloc group (hint)	*/
 	__le32 dn_aglevel;	/* 4: dmapctl level holding the AG	*/
-	__le32 dn_agheigth;	/* 4: height in dmapctl of the AG	*/
+	__le32 dn_agheight;	/* 4: height in dmapctl of the AG	*/
 	__le32 dn_agwidth;	/* 4: width in dmapctl of the AG	*/
 	__le32 dn_agstart;	/* 4: start tree index at AG height	*/
 	__le32 dn_agl2size;	/* 4: l2 num of blks per alloc group	*/
@@ -229,7 +229,7 @@ struct dbmap {
 	int dn_maxag;		/* max active alloc group number	*/
 	int dn_agpref;		/* preferred alloc group (hint)		*/
 	int dn_aglevel;		/* dmapctl level holding the AG		*/
-	int dn_agheigth;	/* height in dmapctl of the AG		*/
+	int dn_agheight;	/* height in dmapctl of the AG		*/
 	int dn_agwidth;		/* width in dmapctl of the AG		*/
 	int dn_agstart;		/* start tree index at AG height	*/
 	int dn_agl2size;	/* l2 num of blks per alloc group	*/
@@ -255,7 +255,7 @@ struct bmap {
 #define	db_agsize	db_bmap.dn_agsize
 #define	db_agl2size	db_bmap.dn_agl2size
 #define	db_agwidth	db_bmap.dn_agwidth
-#define	db_agheigth	db_bmap.dn_agheigth
+#define	db_agheight	db_bmap.dn_agheight
 #define	db_agstart	db_bmap.dn_agstart
 #define	db_numag	db_bmap.dn_numag
 #define	db_maxlevel	db_bmap.dn_maxlevel
diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c
index 925871e9887b..9197a1b0d02d 100644
--- a/fs/jfs/jfs_dtree.c
+++ b/fs/jfs/jfs_dtree.c
@@ -102,6 +102,7 @@
 
 #include <linux/fs.h>
 #include <linux/quotaops.h>
+#include <linux/slab.h>
 #include "jfs_incore.h"
 #include "jfs_superblock.h"
 #include "jfs_filsys.h"
@@ -381,10 +382,10 @@ static u32 add_index(tid_t tid, struct inode *ip, s64 bn, int slot)
 		 * It's time to move the inline table to an external
 		 * page and begin to build the xtree
 		 */
-		if (vfs_dq_alloc_block(ip, sbi->nbperpage))
+		if (dquot_alloc_block(ip, sbi->nbperpage))
 			goto clean_up;
 		if (dbAlloc(ip, 0, sbi->nbperpage, &xaddr)) {
-			vfs_dq_free_block(ip, sbi->nbperpage);
+			dquot_free_block(ip, sbi->nbperpage);
 			goto clean_up;
 		}
 
@@ -408,7 +409,7 @@ static u32 add_index(tid_t tid, struct inode *ip, s64 bn, int slot)
 			memcpy(&jfs_ip->i_dirtable, temp_table,
 			       sizeof (temp_table));
 			dbFree(ip, xaddr, sbi->nbperpage);
-			vfs_dq_free_block(ip, sbi->nbperpage);
+			dquot_free_block(ip, sbi->nbperpage);
 			goto clean_up;
 		}
 		ip->i_size = PSIZE;
@@ -1027,10 +1028,9 @@ static int dtSplitUp(tid_t tid,
 			n = xlen;
 
 		/* Allocate blocks to quota. */
-		if (vfs_dq_alloc_block(ip, n)) {
-			rc = -EDQUOT;
+		rc = dquot_alloc_block(ip, n);
+		if (rc)
 			goto extendOut;
-		}
 		quota_allocation += n;
 
 		if ((rc = dbReAlloc(sbi->ipbmap, xaddr, (s64) xlen,
@@ -1308,7 +1308,7 @@ static int dtSplitUp(tid_t tid,
 
 	/* Rollback quota allocation */
 	if (rc && quota_allocation)
-		vfs_dq_free_block(ip, quota_allocation);
+		dquot_free_block(ip, quota_allocation);
 
       dtSplitUp_Exit:
 
@@ -1369,9 +1369,10 @@ static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split,
 		return -EIO;
 
 	/* Allocate blocks to quota. */
-	if (vfs_dq_alloc_block(ip, lengthPXD(pxd))) {
+	rc = dquot_alloc_block(ip, lengthPXD(pxd));
+	if (rc) {
 		release_metapage(rmp);
-		return -EDQUOT;
+		return rc;
 	}
 
 	jfs_info("dtSplitPage: ip:0x%p smp:0x%p rmp:0x%p", ip, smp, rmp);
@@ -1892,6 +1893,7 @@ static int dtSplitRoot(tid_t tid,
 	struct dt_lock *dtlck;
 	struct tlock *tlck;
 	struct lv *lv;
+	int rc;
 
 	/* get split root page */
 	smp = split->mp;
@@ -1916,9 +1918,10 @@ static int dtSplitRoot(tid_t tid,
 	rp = rmp->data;
 
 	/* Allocate blocks to quota. */
-	if (vfs_dq_alloc_block(ip, lengthPXD(pxd))) {
+	rc = dquot_alloc_block(ip, lengthPXD(pxd));
+	if (rc) {
 		release_metapage(rmp);
-		return -EDQUOT;
+		return rc;
 	}
 
 	BT_MARK_DIRTY(rmp, ip);
@@ -2287,7 +2290,7 @@ static int dtDeleteUp(tid_t tid, struct inode *ip,
 	xlen = lengthPXD(&fp->header.self);
 
 	/* Free quota allocation. */
-	vfs_dq_free_block(ip, xlen);
+	dquot_free_block(ip, xlen);
 
 	/* free/invalidate its buffer page */
 	discard_metapage(fmp);
@@ -2363,7 +2366,7 @@ static int dtDeleteUp(tid_t tid, struct inode *ip,
 				xlen = lengthPXD(&p->header.self);
 
 				/* Free quota allocation */
-				vfs_dq_free_block(ip, xlen);
+				dquot_free_block(ip, xlen);
 
 				/* free/invalidate its buffer page */
 				discard_metapage(mp);
diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c
index 41d6045dbeb0..5d3bbd10f8db 100644
--- a/fs/jfs/jfs_extent.c
+++ b/fs/jfs/jfs_extent.c
@@ -141,10 +141,11 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, bool abnr)
 	}
 
 	/* Allocate blocks to quota. */
-	if (vfs_dq_alloc_block(ip, nxlen)) {
+	rc = dquot_alloc_block(ip, nxlen);
+	if (rc) {
 		dbFree(ip, nxaddr, (s64) nxlen);
 		mutex_unlock(&JFS_IP(ip)->commit_mutex);
-		return -EDQUOT;
+		return rc;
 	}
 
 	/* determine the value of the extent flag */
@@ -164,7 +165,7 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, bool abnr)
 	 */
 	if (rc) {
 		dbFree(ip, nxaddr, nxlen);
-		vfs_dq_free_block(ip, nxlen);
+		dquot_free_block(ip, nxlen);
 		mutex_unlock(&JFS_IP(ip)->commit_mutex);
 		return (rc);
 	}
@@ -256,10 +257,11 @@ int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, bool abnr)
 		goto exit;
 
 	/* Allocat blocks to quota. */
-	if (vfs_dq_alloc_block(ip, nxlen)) {
+	rc = dquot_alloc_block(ip, nxlen);
+	if (rc) {
 		dbFree(ip, nxaddr, (s64) nxlen);
 		mutex_unlock(&JFS_IP(ip)->commit_mutex);
-		return -EDQUOT;
+		return rc;
 	}
 
 	delta = nxlen - xlen;
@@ -297,7 +299,7 @@ int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, bool abnr)
 		/* extend the extent */
 		if ((rc = xtExtend(0, ip, xoff + xlen, (int) nextend, 0))) {
 			dbFree(ip, xaddr + xlen, delta);
-			vfs_dq_free_block(ip, nxlen);
+			dquot_free_block(ip, nxlen);
 			goto exit;
 		}
 	} else {
@@ -308,7 +310,7 @@ int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, bool abnr)
 		 */
 		if ((rc = xtTailgate(0, ip, xoff, (int) ntail, nxaddr, 0))) {
 			dbFree(ip, nxaddr, nxlen);
-			vfs_dq_free_block(ip, nxlen);
+			dquot_free_block(ip, nxlen);
 			goto exit;
 		}
 	}
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index 0fc30407f039..f8332dc8eeb2 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -45,6 +45,7 @@
 #include <linux/buffer_head.h>
 #include <linux/pagemap.h>
 #include <linux/quotaops.h>
+#include <linux/slab.h>
 
 #include "jfs_incore.h"
 #include "jfs_inode.h"
diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c
index dc0e02159ac9..2686531e235a 100644
--- a/fs/jfs/jfs_inode.c
+++ b/fs/jfs/jfs_inode.c
@@ -98,14 +98,7 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
 		goto fail_unlock;
 	}
 
-	inode->i_uid = current_fsuid();
-	if (parent->i_mode & S_ISGID) {
-		inode->i_gid = parent->i_gid;
-		if (S_ISDIR(mode))
-			mode |= S_ISGID;
-	} else
-		inode->i_gid = current_fsgid();
-
+	inode_init_owner(inode, parent, mode);
 	/*
 	 * New inodes need to save sane values on disk when
 	 * uid & gid mount options are used
@@ -116,12 +109,11 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
 	/*
 	 * Allocate inode to quota.
 	 */
-	if (vfs_dq_alloc_inode(inode)) {
-		rc = -EDQUOT;
+	dquot_initialize(inode);
+	rc = dquot_alloc_inode(inode);
+	if (rc)
 		goto fail_drop;
-	}
 
-	inode->i_mode = mode;
 	/* inherit flags from parent */
 	jfs_inode->mode2 = JFS_IP(parent)->mode2 & JFS_FL_INHERIT;
 
@@ -134,7 +126,7 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
 		if (S_ISLNK(mode))
 			jfs_inode->mode2 &= ~(JFS_IMMUTABLE_FL|JFS_APPEND_FL);
 	}
-	jfs_inode->mode2 |= mode;
+	jfs_inode->mode2 |= inode->i_mode;
 
 	inode->i_blocks = 0;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
@@ -162,7 +154,7 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
 	return inode;
 
 fail_drop:
-	vfs_dq_drop(inode);
+	dquot_drop(inode);
 	inode->i_flags |= S_NOQUOTA;
 fail_unlock:
 	inode->i_nlink = 0;
diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h
index 1eff7db34d63..11042b1f44b5 100644
--- a/fs/jfs/jfs_inode.h
+++ b/fs/jfs/jfs_inode.h
@@ -21,12 +21,12 @@
 struct fid;
 
 extern struct inode *ialloc(struct inode *, umode_t);
-extern int jfs_fsync(struct file *, struct dentry *, int);
+extern int jfs_fsync(struct file *, int);
 extern long jfs_ioctl(struct file *, unsigned int, unsigned long);
 extern long jfs_compat_ioctl(struct file *, unsigned int, unsigned long);
 extern struct inode *jfs_iget(struct super_block *, unsigned long);
 extern int jfs_commit_inode(struct inode *, int);
-extern int jfs_write_inode(struct inode*, int);
+extern int jfs_write_inode(struct inode *, struct writeback_control *);
 extern void jfs_delete_inode(struct inode *);
 extern void jfs_dirty_inode(struct inode *);
 extern void jfs_truncate(struct inode *);
@@ -40,6 +40,7 @@ extern struct dentry *jfs_fh_to_parent(struct super_block *sb, struct fid *fid,
 	int fh_len, int fh_type);
 extern void jfs_set_inode_flags(struct inode *);
 extern int jfs_get_block(struct inode *, sector_t, struct buffer_head *, int);
+extern int jfs_setattr(struct dentry *, struct iattr *);
 
 extern const struct address_space_operations jfs_aops;
 extern const struct inode_operations jfs_dir_inode_operations;
@@ -47,5 +48,6 @@ extern const struct file_operations jfs_dir_operations;
 extern const struct inode_operations jfs_file_inode_operations;
 extern const struct file_operations jfs_file_operations;
 extern const struct inode_operations jfs_symlink_inode_operations;
+extern const struct inode_operations jfs_fast_symlink_inode_operations;
 extern const struct dentry_operations jfs_ci_dentry_operations;
 #endif				/* _H_JFS_INODE */
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index 335c4de6552d..c51af2a14516 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -70,6 +70,7 @@
 #include <linux/delay.h>
 #include <linux/mutex.h>
 #include <linux/seq_file.h>
+#include <linux/slab.h>
 #include "jfs_incore.h"
 #include "jfs_filsys.h"
 #include "jfs_metapage.h"
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 07b6c5dfb4b6..48b44bd8267b 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -21,6 +21,7 @@
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/bio.h>
+#include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/buffer_head.h>
 #include <linux/mempool.h>
diff --git a/fs/jfs/jfs_unicode.h b/fs/jfs/jfs_unicode.h
index 3fbb3a225590..8f0f02cb6ca6 100644
--- a/fs/jfs/jfs_unicode.h
+++ b/fs/jfs/jfs_unicode.h
@@ -19,6 +19,7 @@
 #ifndef _H_JFS_UNICODE
 #define _H_JFS_UNICODE
 
+#include <linux/slab.h>
 #include <asm/byteorder.h>
 #include "jfs_types.h"
 
diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c
index d654a6458648..6c50871e6220 100644
--- a/fs/jfs/jfs_xtree.c
+++ b/fs/jfs/jfs_xtree.c
@@ -585,10 +585,10 @@ int xtInsert(tid_t tid,		/* transaction id */
 			hint = addressXAD(xad) + lengthXAD(xad) - 1;
 		} else
 			hint = 0;
-		if ((rc = vfs_dq_alloc_block(ip, xlen)))
+		if ((rc = dquot_alloc_block(ip, xlen)))
 			goto out;
 		if ((rc = dbAlloc(ip, hint, (s64) xlen, &xaddr))) {
-			vfs_dq_free_block(ip, xlen);
+			dquot_free_block(ip, xlen);
 			goto out;
 		}
 	}
@@ -617,7 +617,7 @@ int xtInsert(tid_t tid,		/* transaction id */
 			/* undo data extent allocation */
 			if (*xaddrp == 0) {
 				dbFree(ip, xaddr, (s64) xlen);
-				vfs_dq_free_block(ip, xlen);
+				dquot_free_block(ip, xlen);
 			}
 			return rc;
 		}
@@ -985,10 +985,9 @@ xtSplitPage(tid_t tid, struct inode *ip,
 	rbn = addressPXD(pxd);
 
 	/* Allocate blocks to quota. */
-	if (vfs_dq_alloc_block(ip, lengthPXD(pxd))) {
-		rc = -EDQUOT;
+	rc = dquot_alloc_block(ip, lengthPXD(pxd));
+	if (rc)
 		goto clean_up;
-	}
 
 	quota_allocation += lengthPXD(pxd);
 
@@ -1195,7 +1194,7 @@ xtSplitPage(tid_t tid, struct inode *ip,
 
 	/* Rollback quota allocation. */
 	if (quota_allocation)
-		vfs_dq_free_block(ip, quota_allocation);
+		dquot_free_block(ip, quota_allocation);
 
 	return (rc);
 }
@@ -1235,6 +1234,7 @@ xtSplitRoot(tid_t tid,
 	struct pxdlist *pxdlist;
 	struct tlock *tlck;
 	struct xtlock *xtlck;
+	int rc;
 
 	sp = &JFS_IP(ip)->i_xtroot;
 
@@ -1252,9 +1252,10 @@ xtSplitRoot(tid_t tid,
 		return -EIO;
 
 	/* Allocate blocks to quota. */
-	if (vfs_dq_alloc_block(ip, lengthPXD(pxd))) {
+	rc = dquot_alloc_block(ip, lengthPXD(pxd));
+	if (rc) {
 		release_metapage(rmp);
-		return -EDQUOT;
+		return rc;
 	}
 
 	jfs_info("xtSplitRoot: ip:0x%p rmp:0x%p", ip, rmp);
@@ -3680,7 +3681,7 @@ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag)
 		ip->i_size = newsize;
 
 	/* update quota allocation to reflect freed blocks */
-	vfs_dq_free_block(ip, nfreed);
+	dquot_free_block(ip, nfreed);
 
 	/*
 	 * free tlock of invalidated pages
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index c79a4270f083..a9cf8e8675be 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -85,6 +85,8 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, int mode,
 
 	jfs_info("jfs_create: dip:0x%p name:%s", dip, dentry->d_name.name);
 
+	dquot_initialize(dip);
+
 	/*
 	 * search parent directory for entry/freespace
 	 * (dtSearch() returns parent directory page pinned)
@@ -215,6 +217,8 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode)
 
 	jfs_info("jfs_mkdir: dip:0x%p name:%s", dip, dentry->d_name.name);
 
+	dquot_initialize(dip);
+
 	/* link count overflow on parent directory ? */
 	if (dip->i_nlink == JFS_LINK_MAX) {
 		rc = -EMLINK;
@@ -356,7 +360,8 @@ static int jfs_rmdir(struct inode *dip, struct dentry *dentry)
 	jfs_info("jfs_rmdir: dip:0x%p name:%s", dip, dentry->d_name.name);
 
 	/* Init inode for quota operations. */
-	vfs_dq_init(ip);
+	dquot_initialize(dip);
+	dquot_initialize(ip);
 
 	/* directory must be empty to be removed */
 	if (!dtEmpty(ip)) {
@@ -483,7 +488,8 @@ static int jfs_unlink(struct inode *dip, struct dentry *dentry)
 	jfs_info("jfs_unlink: dip:0x%p name:%s", dip, dentry->d_name.name);
 
 	/* Init inode for quota operations. */
-	vfs_dq_init(ip);
+	dquot_initialize(dip);
+	dquot_initialize(ip);
 
 	if ((rc = get_UCSname(&dname, dentry)))
 		goto out;
@@ -805,6 +811,8 @@ static int jfs_link(struct dentry *old_dentry,
 	if (ip->i_nlink == 0)
 		return -ENOENT;
 
+	dquot_initialize(dir);
+
 	tid = txBegin(ip->i_sb, 0);
 
 	mutex_lock_nested(&JFS_IP(dir)->commit_mutex, COMMIT_MUTEX_PARENT);
@@ -896,6 +904,8 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
 
 	jfs_info("jfs_symlink: dip:0x%p name:%s", dip, name);
 
+	dquot_initialize(dip);
+
 	ssize = strlen(name) + 1;
 
 	/*
@@ -946,7 +956,7 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
 	 */
 
 	if (ssize <= IDATASIZE) {
-		ip->i_op = &jfs_symlink_inode_operations;
+		ip->i_op = &jfs_fast_symlink_inode_operations;
 
 		i_fastsymlink = JFS_IP(ip)->i_inline;
 		memcpy(i_fastsymlink, name, ssize);
@@ -968,7 +978,7 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
 	else {
 		jfs_info("jfs_symlink: allocate extent ip:0x%p", ip);
 
-		ip->i_op = &page_symlink_inode_operations;
+		ip->i_op = &jfs_symlink_inode_operations;
 		ip->i_mapping->a_ops = &jfs_aops;
 
 		/*
@@ -1087,6 +1097,9 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	jfs_info("jfs_rename: %s %s", old_dentry->d_name.name,
 		 new_dentry->d_name.name);
 
+	dquot_initialize(old_dir);
+	dquot_initialize(new_dir);
+
 	old_ip = old_dentry->d_inode;
 	new_ip = new_dentry->d_inode;
 
@@ -1136,7 +1149,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	} else if (new_ip) {
 		IWRITE_LOCK(new_ip, RDWRLOCK_NORMAL);
 		/* Init inode for quota operations. */
-		vfs_dq_init(new_ip);
+		dquot_initialize(new_ip);
 	}
 
 	/*
@@ -1360,6 +1373,8 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry,
 
 	jfs_info("jfs_mknod: %s", dentry->d_name.name);
 
+	dquot_initialize(dir);
+
 	if ((rc = get_UCSname(&dname, dentry)))
 		goto out;
 
@@ -1541,8 +1556,8 @@ const struct inode_operations jfs_dir_inode_operations = {
 	.getxattr	= jfs_getxattr,
 	.listxattr	= jfs_listxattr,
 	.removexattr	= jfs_removexattr,
-#ifdef CONFIG_JFS_POSIX_ACL
 	.setattr	= jfs_setattr,
+#ifdef CONFIG_JFS_POSIX_ACL
 	.check_acl	= jfs_check_acl,
 #endif
 };
diff --git a/fs/jfs/resize.c b/fs/jfs/resize.c
index 7f24a0bb08ca..1aba0039f1c9 100644
--- a/fs/jfs/resize.c
+++ b/fs/jfs/resize.c
@@ -81,6 +81,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
 	struct inode *iplist[1];
 	struct jfs_superblock *j_sb, *j_sb2;
 	uint old_agsize;
+	int agsizechanged = 0;
 	struct buffer_head *bh, *bh2;
 
 	/* If the volume hasn't grown, get out now */
@@ -333,6 +334,9 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
 	 */
 	if ((rc = dbExtendFS(ipbmap, XAddress, nblocks)))
 		goto error_out;
+
+	agsizechanged |= (bmp->db_agsize != old_agsize);
+
 	/*
 	 * the map now has extended to cover additional nblocks:
 	 * dn_mapsize = oldMapsize + nblocks;
@@ -432,7 +436,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
 	 * will correctly identify the new ag);
 	 */
 	/* if new AG size the same as old AG size, done! */
-	if (bmp->db_agsize != old_agsize) {
+	if (agsizechanged) {
 		if ((rc = diExtendFS(ipimap, ipbmap)))
 			goto error_out;
 
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index d929a822a74e..b38f96bef829 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -30,6 +30,7 @@
 #include <linux/buffer_head.h>
 #include <linux/exportfs.h>
 #include <linux/crc32.h>
+#include <linux/slab.h>
 #include <asm/uaccess.h>
 #include <linux/seq_file.h>
 #include <linux/smp_lock.h>
@@ -131,6 +132,11 @@ static void jfs_destroy_inode(struct inode *inode)
 	kmem_cache_free(jfs_inode_cachep, ji);
 }
 
+static void jfs_clear_inode(struct inode *inode)
+{
+	dquot_drop(inode);
+}
+
 static int jfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct jfs_sb_info *sbi = JFS_SBI(dentry->d_sb);
@@ -173,6 +179,8 @@ static void jfs_put_super(struct super_block *sb)
 
 	jfs_info("In jfs_put_super");
 
+	dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
+
 	lock_kernel();
 
 	rc = jfs_umount(sb);
@@ -390,10 +398,20 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data)
 
 		JFS_SBI(sb)->flag = flag;
 		ret = jfs_mount_rw(sb, 1);
+
+		/* mark the fs r/w for quota activity */
+		sb->s_flags &= ~MS_RDONLY;
+
 		unlock_kernel();
+		dquot_resume(sb, -1);
 		return ret;
 	}
 	if ((!(sb->s_flags & MS_RDONLY)) && (*flags & MS_RDONLY)) {
+		rc = dquot_suspend(sb, -1);
+		if (rc < 0) {
+			unlock_kernel();
+			return rc;
+		}
 		rc = jfs_umount_rw(sb);
 		JFS_SBI(sb)->flag = flag;
 		unlock_kernel();
@@ -440,10 +458,8 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
 	/* initialize the mount flag and determine the default error handler */
 	flag = JFS_ERR_REMOUNT_RO;
 
-	if (!parse_options((char *) data, sb, &newLVSize, &flag)) {
-		kfree(sbi);
-		return -EINVAL;
-	}
+	if (!parse_options((char *) data, sb, &newLVSize, &flag))
+		goto out_kfree;
 	sbi->flag = flag;
 
 #ifdef CONFIG_JFS_POSIX_ACL
@@ -452,7 +468,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
 
 	if (newLVSize) {
 		printk(KERN_ERR "resize option for remount only\n");
-		return -EINVAL;
+		goto out_kfree;
 	}
 
 	/*
@@ -465,6 +481,10 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
 	 */
 	sb->s_op = &jfs_super_operations;
 	sb->s_export_op = &jfs_export_operations;
+#ifdef CONFIG_QUOTA
+	sb->dq_op = &dquot_operations;
+	sb->s_qcop = &dquot_quotactl_ops;
+#endif
 
 	/*
 	 * Initialize direct-mapping inode/address-space
@@ -472,7 +492,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
 	inode = new_inode(sb);
 	if (inode == NULL) {
 		ret = -ENOMEM;
-		goto out_kfree;
+		goto out_unload;
 	}
 	inode->i_ino = 0;
 	inode->i_nlink = 1;
@@ -544,9 +564,10 @@ out_mount_failed:
 	make_bad_inode(sbi->direct_inode);
 	iput(sbi->direct_inode);
 	sbi->direct_inode = NULL;
-out_kfree:
+out_unload:
 	if (sbi->nls_tab)
 		unload_nls(sbi->nls_tab);
+out_kfree:
 	kfree(sbi);
 	return ret;
 }
@@ -745,6 +766,7 @@ static const struct super_operations jfs_super_operations = {
 	.dirty_inode	= jfs_dirty_inode,
 	.write_inode	= jfs_write_inode,
 	.delete_inode	= jfs_delete_inode,
+	.clear_inode	= jfs_clear_inode,
 	.put_super	= jfs_put_super,
 	.sync_fs	= jfs_sync_fs,
 	.freeze_fs	= jfs_freeze,
diff --git a/fs/jfs/symlink.c b/fs/jfs/symlink.c
index 4af1a05aad0a..205b946d8e0d 100644
--- a/fs/jfs/symlink.c
+++ b/fs/jfs/symlink.c
@@ -29,9 +29,21 @@ static void *jfs_follow_link(struct dentry *dentry, struct nameidata *nd)
 	return NULL;
 }
 
-const struct inode_operations jfs_symlink_inode_operations = {
+const struct inode_operations jfs_fast_symlink_inode_operations = {
 	.readlink	= generic_readlink,
 	.follow_link	= jfs_follow_link,
+	.setattr	= jfs_setattr,
+	.setxattr	= jfs_setxattr,
+	.getxattr	= jfs_getxattr,
+	.listxattr	= jfs_listxattr,
+	.removexattr	= jfs_removexattr,
+};
+
+const struct inode_operations jfs_symlink_inode_operations = {
+	.readlink	= generic_readlink,
+	.follow_link	= page_follow_link_light,
+	.put_link	= page_put_link,
+	.setattr	= jfs_setattr,
 	.setxattr	= jfs_setxattr,
 	.getxattr	= jfs_getxattr,
 	.listxattr	= jfs_listxattr,
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index fad364548bc9..fa96bbb26343 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -21,6 +21,7 @@
 #include <linux/fs.h>
 #include <linux/xattr.h>
 #include <linux/posix_acl_xattr.h>
+#include <linux/slab.h>
 #include <linux/quotaops.h>
 #include <linux/security.h>
 #include "jfs_incore.h"
@@ -260,14 +261,14 @@ static int ea_write(struct inode *ip, struct jfs_ea_list *ealist, int size,
 	nblocks = (size + (sb->s_blocksize - 1)) >> sb->s_blocksize_bits;
 
 	/* Allocate new blocks to quota. */
-	if (vfs_dq_alloc_block(ip, nblocks)) {
-		return -EDQUOT;
-	}
+	rc = dquot_alloc_block(ip, nblocks);
+	if (rc)
+		return rc;
 
 	rc = dbAlloc(ip, INOHINT(ip), nblocks, &blkno);
 	if (rc) {
 		/*Rollback quota allocation. */
-		vfs_dq_free_block(ip, nblocks);
+		dquot_free_block(ip, nblocks);
 		return rc;
 	}
 
@@ -332,7 +333,7 @@ static int ea_write(struct inode *ip, struct jfs_ea_list *ealist, int size,
 
       failed:
 	/* Rollback quota allocation. */
-	vfs_dq_free_block(ip, nblocks);
+	dquot_free_block(ip, nblocks);
 
 	dbFree(ip, blkno, nblocks);
 	return rc;
@@ -538,7 +539,8 @@ static int ea_get(struct inode *inode, struct ea_buffer *ea_buf, int min_size)
 
 	if (blocks_needed > current_blocks) {
 		/* Allocate new blocks to quota. */
-		if (vfs_dq_alloc_block(inode, blocks_needed))
+		rc = dquot_alloc_block(inode, blocks_needed);
+		if (rc)
 			return -EDQUOT;
 
 		quota_allocation = blocks_needed;
@@ -602,7 +604,7 @@ static int ea_get(struct inode *inode, struct ea_buffer *ea_buf, int min_size)
       clean_up:
 	/* Rollback quota allocation */
 	if (quota_allocation)
-		vfs_dq_free_block(inode, quota_allocation);
+		dquot_free_block(inode, quota_allocation);
 
 	return (rc);
 }
@@ -677,7 +679,7 @@ static int ea_put(tid_t tid, struct inode *inode, struct ea_buffer *ea_buf,
 
 	/* If old blocks exist, they must be removed from quota allocation. */
 	if (old_blocks)
-		vfs_dq_free_block(inode, old_blocks);
+		dquot_free_block(inode, old_blocks);
 
 	inode->i_ctime = CURRENT_TIME;
 
diff --git a/fs/libfs.c b/fs/libfs.c
index 6e8d17e1dc4c..dcaf972cbf1b 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -5,8 +5,10 @@
 
 #include <linux/module.h>
 #include <linux/pagemap.h>
+#include <linux/slab.h>
 #include <linux/mount.h>
 #include <linux/vfs.h>
+#include <linux/quotaops.h>
 #include <linux/mutex.h>
 #include <linux/exportfs.h>
 #include <linux/writeback.h>
@@ -57,11 +59,6 @@ struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, struct na
 	return NULL;
 }
 
-int simple_sync_file(struct file * file, struct dentry *dentry, int datasync)
-{
-	return 0;
-}
- 
 int dcache_dir_open(struct inode *inode, struct file *file)
 {
 	static struct qstr cursor_name = {.len = 1, .name = "."};
@@ -189,7 +186,7 @@ const struct file_operations simple_dir_operations = {
 	.llseek		= dcache_dir_lseek,
 	.read		= generic_read_dir,
 	.readdir	= dcache_readdir,
-	.fsync		= simple_sync_file,
+	.fsync		= noop_fsync,
 };
 
 const struct inode_operations simple_dir_inode_operations = {
@@ -329,6 +326,81 @@ int simple_rename(struct inode *old_dir, struct dentry *old_dentry,
 	return 0;
 }
 
+/**
+ * simple_setsize - handle core mm and vfs requirements for file size change
+ * @inode: inode
+ * @newsize: new file size
+ *
+ * Returns 0 on success, -error on failure.
+ *
+ * simple_setsize must be called with inode_mutex held.
+ *
+ * simple_setsize will check that the requested new size is OK (see
+ * inode_newsize_ok), and then will perform the necessary i_size update
+ * and pagecache truncation (if necessary). It will be typically be called
+ * from the filesystem's setattr function when ATTR_SIZE is passed in.
+ *
+ * The inode itself must have correct permissions and attributes to allow
+ * i_size to be changed, this function then just checks that the new size
+ * requested is valid.
+ *
+ * In the case of simple in-memory filesystems with inodes stored solely
+ * in the inode cache, and file data in the pagecache, nothing more needs
+ * to be done to satisfy a truncate request. Filesystems with on-disk
+ * blocks for example will need to free them in the case of truncate, in
+ * that case it may be easier not to use simple_setsize (but each of its
+ * components will likely be required at some point to update pagecache
+ * and inode etc).
+ */
+int simple_setsize(struct inode *inode, loff_t newsize)
+{
+	loff_t oldsize;
+	int error;
+
+	error = inode_newsize_ok(inode, newsize);
+	if (error)
+		return error;
+
+	oldsize = inode->i_size;
+	i_size_write(inode, newsize);
+	truncate_pagecache(inode, oldsize, newsize);
+
+	return error;
+}
+EXPORT_SYMBOL(simple_setsize);
+
+/**
+ * simple_setattr - setattr for simple in-memory filesystem
+ * @dentry: dentry
+ * @iattr: iattr structure
+ *
+ * Returns 0 on success, -error on failure.
+ *
+ * simple_setattr implements setattr for an in-memory filesystem which
+ * does not store its own file data or metadata (eg. uses the page cache
+ * and inode cache as its data store).
+ */
+int simple_setattr(struct dentry *dentry, struct iattr *iattr)
+{
+	struct inode *inode = dentry->d_inode;
+	int error;
+
+	error = inode_change_ok(inode, iattr);
+	if (error)
+		return error;
+
+	if (iattr->ia_valid & ATTR_SIZE) {
+		error = simple_setsize(inode, iattr->ia_size);
+		if (error)
+			return error;
+	}
+
+	generic_setattr(inode, iattr);
+
+	return error;
+}
+EXPORT_SYMBOL(simple_setattr);
+
 int simple_readpage(struct file *file, struct page *page)
 {
 	clear_highpage(page);
@@ -338,28 +410,14 @@ int simple_readpage(struct file *file, struct page *page)
 	return 0;
 }
 
-int simple_prepare_write(struct file *file, struct page *page,
-			unsigned from, unsigned to)
-{
-	if (!PageUptodate(page)) {
-		if (to - from != PAGE_CACHE_SIZE)
-			zero_user_segments(page,
-				0, from,
-				to, PAGE_CACHE_SIZE);
-	}
-	return 0;
-}
-
 int simple_write_begin(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned flags,
 			struct page **pagep, void **fsdata)
 {
 	struct page *page;
 	pgoff_t index;
-	unsigned from;
 
 	index = pos >> PAGE_CACHE_SHIFT;
-	from = pos & (PAGE_CACHE_SIZE - 1);
 
 	page = grab_cache_page_write_begin(mapping, index, flags);
 	if (!page)
@@ -367,43 +425,59 @@ int simple_write_begin(struct file *file, struct address_space *mapping,
 
 	*pagep = page;
 
-	return simple_prepare_write(file, page, from, from+len);
-}
-
-static int simple_commit_write(struct file *file, struct page *page,
-			       unsigned from, unsigned to)
-{
-	struct inode *inode = page->mapping->host;
-	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
+	if (!PageUptodate(page) && (len != PAGE_CACHE_SIZE)) {
+		unsigned from = pos & (PAGE_CACHE_SIZE - 1);
 
-	if (!PageUptodate(page))
-		SetPageUptodate(page);
-	/*
-	 * No need to use i_size_read() here, the i_size
-	 * cannot change under us because we hold the i_mutex.
-	 */
-	if (pos > inode->i_size)
-		i_size_write(inode, pos);
-	set_page_dirty(page);
+		zero_user_segments(page, 0, from, from + len, PAGE_CACHE_SIZE);
+	}
 	return 0;
 }
 
+/**
+ * simple_write_end - .write_end helper for non-block-device FSes
+ * @available: See .write_end of address_space_operations
+ * @file: 		"
+ * @mapping: 		"
+ * @pos: 		"
+ * @len: 		"
+ * @copied: 		"
+ * @page: 		"
+ * @fsdata: 		"
+ *
+ * simple_write_end does the minimum needed for updating a page after writing is
+ * done. It has the same API signature as the .write_end of
+ * address_space_operations vector. So it can just be set onto .write_end for
+ * FSes that don't need any other processing. i_mutex is assumed to be held.
+ * Block based filesystems should use generic_write_end().
+ * NOTE: Even though i_size might get updated by this function, mark_inode_dirty
+ * is not called, so a filesystem that actually does store data in .write_inode
+ * should extend on what's done here with a call to mark_inode_dirty() in the
+ * case that i_size has changed.
+ */
 int simple_write_end(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned copied,
 			struct page *page, void *fsdata)
 {
-	unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+	struct inode *inode = page->mapping->host;
+	loff_t last_pos = pos + copied;
 
 	/* zero the stale part of the page if we did a short copy */
 	if (copied < len) {
-		void *kaddr = kmap_atomic(page, KM_USER0);
-		memset(kaddr + from + copied, 0, len - copied);
-		flush_dcache_page(page);
-		kunmap_atomic(kaddr, KM_USER0);
+		unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+
+		zero_user(page, from + copied, len - copied);
 	}
 
-	simple_commit_write(file, page, from, from+copied);
+	if (!PageUptodate(page))
+		SetPageUptodate(page);
+	/*
+	 * No need to use i_size_read() here, the i_size
+	 * cannot change under us because we hold the i_mutex.
+	 */
+	if (last_pos > inode->i_size)
+		i_size_write(inode, last_pos);
 
+	set_page_dirty(page);
 	unlock_page(page);
 	page_cache_release(page);
 
@@ -415,7 +489,8 @@ int simple_write_end(struct file *file, struct address_space *mapping,
  * unique inode values later for this filesystem, then you must take care
  * to pass it an appropriate max_reserved value to avoid collisions.
  */
-int simple_fill_super(struct super_block *s, int magic, struct tree_descr *files)
+int simple_fill_super(struct super_block *s, unsigned long magic,
+		      struct tree_descr *files)
 {
 	struct inode *inode;
 	struct dentry *root;
@@ -544,6 +619,40 @@ ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos,
 }
 
 /**
+ * simple_write_to_buffer - copy data from user space to the buffer
+ * @to: the buffer to write to
+ * @available: the size of the buffer
+ * @ppos: the current position in the buffer
+ * @from: the user space buffer to read from
+ * @count: the maximum number of bytes to read
+ *
+ * The simple_write_to_buffer() function reads up to @count bytes from the user
+ * space address starting at @from into the buffer @to at offset @ppos.
+ *
+ * On success, the number of bytes written is returned and the offset @ppos is
+ * advanced by this number, or negative value is returned on error.
+ **/
+ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos,
+		const void __user *from, size_t count)
+{
+	loff_t pos = *ppos;
+	size_t res;
+
+	if (pos < 0)
+		return -EINVAL;
+	if (pos >= available || !count)
+		return 0;
+	if (count > available - pos)
+		count = available - pos;
+	res = copy_from_user(to + pos, from, count);
+	if (res == count)
+		return -EFAULT;
+	count -= res;
+	*ppos = pos + count;
+	return count;
+}
+
+/**
  * memory_read_from_buffer - copy data from the buffer
  * @to: the kernel space buffer to read to
  * @count: the maximum number of bytes to read
@@ -814,13 +923,22 @@ struct dentry *generic_fh_to_parent(struct super_block *sb, struct fid *fid,
 }
 EXPORT_SYMBOL_GPL(generic_fh_to_parent);
 
-int simple_fsync(struct file *file, struct dentry *dentry, int datasync)
+/**
+ * generic_file_fsync - generic fsync implementation for simple filesystems
+ * @file:	file to synchronize
+ * @datasync:	only synchronize essential metadata if true
+ *
+ * This is a generic implementation of the fsync method for simple
+ * filesystems which track all non-inode metadata in the buffers list
+ * hanging off the address_space structure.
+ */
+int generic_file_fsync(struct file *file, int datasync)
 {
 	struct writeback_control wbc = {
 		.sync_mode = WB_SYNC_ALL,
 		.nr_to_write = 0, /* metadata-only; caller takes care of data */
 	};
-	struct inode *inode = dentry->d_inode;
+	struct inode *inode = file->f_mapping->host;
 	int err;
 	int ret;
 
@@ -835,7 +953,15 @@ int simple_fsync(struct file *file, struct dentry *dentry, int datasync)
 		ret = err;
 	return ret;
 }
-EXPORT_SYMBOL(simple_fsync);
+EXPORT_SYMBOL(generic_file_fsync);
+
+/*
+ * No-op implementation of ->fsync for in-memory filesystems.
+ */
+int noop_fsync(struct file *file, int datasync)
+{
+	return 0;
+}
 
 EXPORT_SYMBOL(dcache_dir_close);
 EXPORT_SYMBOL(dcache_dir_lseek);
@@ -853,15 +979,15 @@ EXPORT_SYMBOL(simple_getattr);
 EXPORT_SYMBOL(simple_link);
 EXPORT_SYMBOL(simple_lookup);
 EXPORT_SYMBOL(simple_pin_fs);
-EXPORT_UNUSED_SYMBOL(simple_prepare_write);
 EXPORT_SYMBOL(simple_readpage);
 EXPORT_SYMBOL(simple_release_fs);
 EXPORT_SYMBOL(simple_rename);
 EXPORT_SYMBOL(simple_rmdir);
 EXPORT_SYMBOL(simple_statfs);
-EXPORT_SYMBOL(simple_sync_file);
+EXPORT_SYMBOL(noop_fsync);
 EXPORT_SYMBOL(simple_unlink);
 EXPORT_SYMBOL(simple_read_from_buffer);
+EXPORT_SYMBOL(simple_write_to_buffer);
 EXPORT_SYMBOL(memory_read_from_buffer);
 EXPORT_SYMBOL(simple_transaction_set);
 EXPORT_SYMBOL(simple_transaction_get);
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index fc9032dc8862..64fd427c993c 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -8,6 +8,7 @@
 
 #include <linux/module.h>
 #include <linux/types.h>
+#include <linux/slab.h>
 #include <linux/time.h>
 #include <linux/nfs_fs.h>
 #include <linux/sunrpc/clnt.h>
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index c81249fef11f..7932c399fab4 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -8,6 +8,7 @@
 
 #include <linux/module.h>
 #include <linux/smp_lock.h>
+#include <linux/slab.h>
 #include <linux/types.h>
 #include <linux/errno.h>
 #include <linux/fs.h>
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 4600c2037b8b..bb464d12104c 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -479,8 +479,8 @@ again:	mutex_lock(&nlm_host_mutex);
 			}
 		}
 	}
-
 	mutex_unlock(&nlm_host_mutex);
+	nsm_release(nsm);
 }
 
 /*
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index f956651d0f65..e3015464fbab 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -10,6 +10,7 @@
 #include <linux/utsname.h>
 #include <linux/kernel.h>
 #include <linux/ktime.h>
+#include <linux/slab.h>
 
 #include <linux/sunrpc/clnt.h>
 #include <linux/sunrpc/xprtsock.h>
@@ -349,9 +350,9 @@ retry:
  * nsm_reboot_lookup - match NLMPROC_SM_NOTIFY arguments to an nsm_handle
  * @info: pointer to NLMPROC_SM_NOTIFY arguments
  *
- * Returns a matching nsm_handle if found in the nsm cache; the returned
- * nsm_handle's reference count is bumped and sm_monitored is cleared.
- * Otherwise returns NULL if some error occurred.
+ * Returns a matching nsm_handle if found in the nsm cache. The returned
+ * nsm_handle's reference count is bumped. Otherwise returns NULL if some
+ * error occurred.
  */
 struct nsm_handle *nsm_reboot_lookup(const struct nlm_reboot *info)
 {
@@ -370,12 +371,6 @@ struct nsm_handle *nsm_reboot_lookup(const struct nlm_reboot *info)
 	atomic_inc(&cached->sm_count);
 	spin_unlock(&nsm_lock);
 
-	/*
-	 * During subsequent lock activity, force a fresh
-	 * notification to be set up for this host.
-	 */
-	cached->sm_monitored = 0;
-
 	dprintk("lockd: host %s (%s) rebooted, cnt %d\n",
 			cached->sm_name, cached->sm_addrbuf,
 			atomic_read(&cached->sm_count));
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index e50cfa3d9654..f1bacf1a0391 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -21,7 +21,6 @@
 #include <linux/errno.h>
 #include <linux/in.h>
 #include <linux/uio.h>
-#include <linux/slab.h>
 #include <linux/smp.h>
 #include <linux/smp_lock.h>
 #include <linux/mutex.h>
@@ -243,11 +242,9 @@ static int make_socks(struct svc_serv *serv)
 	if (err < 0)
 		goto out_err;
 
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
 	err = create_lockd_family(serv, PF_INET6);
 	if (err < 0 && err != -EAFNOSUPPORT)
 		goto out_err;
-#endif	/* CONFIG_IPV6 || CONFIG_IPV6_MODULE */
 
 	warned = 0;
 	return 0;
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index a7966eed3c17..031c6569a134 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -9,7 +9,6 @@
 
 #include <linux/types.h>
 #include <linux/time.h>
-#include <linux/slab.h>
 #include <linux/smp_lock.h>
 #include <linux/lockd/lockd.h>
 #include <linux/lockd/share.h>
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index d1001790fa9a..84055d31bfc5 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -21,6 +21,7 @@
  */
 
 #include <linux/types.h>
+#include <linux/slab.h>
 #include <linux/errno.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 56c9519d900a..0f2ab741ae7c 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -9,7 +9,6 @@
 
 #include <linux/types.h>
 #include <linux/time.h>
-#include <linux/slab.h>
 #include <linux/smp_lock.h>
 #include <linux/lockd/lockd.h>
 #include <linux/lockd/share.h>
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index ad478da7ca63..d0ef94cfb3da 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -10,6 +10,7 @@
 #include <linux/string.h>
 #include <linux/time.h>
 #include <linux/in.h>
+#include <linux/slab.h>
 #include <linux/mutex.h>
 #include <linux/sunrpc/svc.h>
 #include <linux/sunrpc/clnt.h>
diff --git a/fs/locks.c b/fs/locks.c
index a8794f233bc9..ab24d49fc048 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1182,8 +1182,9 @@ int __break_lease(struct inode *inode, unsigned int mode)
 	struct file_lock *fl;
 	unsigned long break_time;
 	int i_have_this_lease = 0;
+	int want_write = (mode & O_ACCMODE) != O_RDONLY;
 
-	new_fl = lease_alloc(NULL, mode & FMODE_WRITE ? F_WRLCK : F_RDLCK);
+	new_fl = lease_alloc(NULL, want_write ? F_WRLCK : F_RDLCK);
 
 	lock_kernel();
 
@@ -1197,7 +1198,7 @@ int __break_lease(struct inode *inode, unsigned int mode)
 		if (fl->fl_owner == current->files)
 			i_have_this_lease = 1;
 
-	if (mode & FMODE_WRITE) {
+	if (want_write) {
 		/* If we want write access, we have to revoke any lease. */
 		future = F_UNLCK | F_INPROGRESS;
 	} else if (flock->fl_type & F_INPROGRESS) {
@@ -1454,7 +1455,7 @@ EXPORT_SYMBOL(generic_setlease);
  *	leases held by processes on this node.
  *
  *	There is also no break_lease method; filesystems that
- *	handle their own leases shoud break leases themselves from the
+ *	handle their own leases should break leases themselves from the
  *	filesystem's open, create, and (on truncate) setattr methods.
  *
  *	Warning: the only current setlease methods exist only to disable
diff --git a/fs/logfs/Kconfig b/fs/logfs/Kconfig
new file mode 100644
index 000000000000..daf9a9b32dd3
--- /dev/null
+++ b/fs/logfs/Kconfig
@@ -0,0 +1,17 @@
+config LOGFS
+	tristate "LogFS file system (EXPERIMENTAL)"
+	depends on (MTD || BLOCK) && EXPERIMENTAL
+	select ZLIB_INFLATE
+	select ZLIB_DEFLATE
+	select CRC32
+	select BTREE
+	help
+	  Flash filesystem aimed to scale efficiently to large devices.
+	  In comparison to JFFS2 it offers significantly faster mount
+	  times and potentially less RAM usage, although the latter has
+	  not been measured yet.
+
+	  In its current state it is still very experimental and should
+	  not be used for other than testing purposes.
+
+	  If unsure, say N.
diff --git a/fs/logfs/Makefile b/fs/logfs/Makefile
new file mode 100644
index 000000000000..4820027787ee
--- /dev/null
+++ b/fs/logfs/Makefile
@@ -0,0 +1,13 @@
+obj-$(CONFIG_LOGFS)	+= logfs.o
+
+logfs-y	+= compr.o
+logfs-y	+= dir.o
+logfs-y	+= file.o
+logfs-y	+= gc.o
+logfs-y	+= inode.o
+logfs-y	+= journal.o
+logfs-y	+= readwrite.o
+logfs-y	+= segment.o
+logfs-y	+= super.o
+logfs-$(CONFIG_BLOCK)	+= dev_bdev.o
+logfs-$(CONFIG_MTD)	+= dev_mtd.o
diff --git a/fs/logfs/compr.c b/fs/logfs/compr.c
new file mode 100644
index 000000000000..44bbfd249abc
--- /dev/null
+++ b/fs/logfs/compr.c
@@ -0,0 +1,95 @@
+/*
+ * fs/logfs/compr.c	- compression routines
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
+ */
+#include "logfs.h"
+#include <linux/vmalloc.h>
+#include <linux/zlib.h>
+
+#define COMPR_LEVEL 3
+
+static DEFINE_MUTEX(compr_mutex);
+static struct z_stream_s stream;
+
+int logfs_compress(void *in, void *out, size_t inlen, size_t outlen)
+{
+	int err, ret;
+
+	ret = -EIO;
+	mutex_lock(&compr_mutex);
+	err = zlib_deflateInit(&stream, COMPR_LEVEL);
+	if (err != Z_OK)
+		goto error;
+
+	stream.next_in = in;
+	stream.avail_in = inlen;
+	stream.total_in = 0;
+	stream.next_out = out;
+	stream.avail_out = outlen;
+	stream.total_out = 0;
+
+	err = zlib_deflate(&stream, Z_FINISH);
+	if (err != Z_STREAM_END)
+		goto error;
+
+	err = zlib_deflateEnd(&stream);
+	if (err != Z_OK)
+		goto error;
+
+	if (stream.total_out >= stream.total_in)
+		goto error;
+
+	ret = stream.total_out;
+error:
+	mutex_unlock(&compr_mutex);
+	return ret;
+}
+
+int logfs_uncompress(void *in, void *out, size_t inlen, size_t outlen)
+{
+	int err, ret;
+
+	ret = -EIO;
+	mutex_lock(&compr_mutex);
+	err = zlib_inflateInit(&stream);
+	if (err != Z_OK)
+		goto error;
+
+	stream.next_in = in;
+	stream.avail_in = inlen;
+	stream.total_in = 0;
+	stream.next_out = out;
+	stream.avail_out = outlen;
+	stream.total_out = 0;
+
+	err = zlib_inflate(&stream, Z_FINISH);
+	if (err != Z_STREAM_END)
+		goto error;
+
+	err = zlib_inflateEnd(&stream);
+	if (err != Z_OK)
+		goto error;
+
+	ret = 0;
+error:
+	mutex_unlock(&compr_mutex);
+	return ret;
+}
+
+int __init logfs_compr_init(void)
+{
+	size_t size = max(zlib_deflate_workspacesize(),
+			zlib_inflate_workspacesize());
+	stream.workspace = vmalloc(size);
+	if (!stream.workspace)
+		return -ENOMEM;
+	return 0;
+}
+
+void logfs_compr_exit(void)
+{
+	vfree(stream.workspace);
+}
diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
new file mode 100644
index 000000000000..9bd2ce2a3040
--- /dev/null
+++ b/fs/logfs/dev_bdev.c
@@ -0,0 +1,339 @@
+/*
+ * fs/logfs/dev_bdev.c	- Device access methods for block devices
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
+ */
+#include "logfs.h"
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/buffer_head.h>
+#include <linux/gfp.h>
+
+#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1))
+
+static void request_complete(struct bio *bio, int err)
+{
+	complete((struct completion *)bio->bi_private);
+}
+
+static int sync_request(struct page *page, struct block_device *bdev, int rw)
+{
+	struct bio bio;
+	struct bio_vec bio_vec;
+	struct completion complete;
+
+	bio_init(&bio);
+	bio.bi_io_vec = &bio_vec;
+	bio_vec.bv_page = page;
+	bio_vec.bv_len = PAGE_SIZE;
+	bio_vec.bv_offset = 0;
+	bio.bi_vcnt = 1;
+	bio.bi_idx = 0;
+	bio.bi_size = PAGE_SIZE;
+	bio.bi_bdev = bdev;
+	bio.bi_sector = page->index * (PAGE_SIZE >> 9);
+	init_completion(&complete);
+	bio.bi_private = &complete;
+	bio.bi_end_io = request_complete;
+
+	submit_bio(rw, &bio);
+	generic_unplug_device(bdev_get_queue(bdev));
+	wait_for_completion(&complete);
+	return test_bit(BIO_UPTODATE, &bio.bi_flags) ? 0 : -EIO;
+}
+
+static int bdev_readpage(void *_sb, struct page *page)
+{
+	struct super_block *sb = _sb;
+	struct block_device *bdev = logfs_super(sb)->s_bdev;
+	int err;
+
+	err = sync_request(page, bdev, READ);
+	if (err) {
+		ClearPageUptodate(page);
+		SetPageError(page);
+	} else {
+		SetPageUptodate(page);
+		ClearPageError(page);
+	}
+	unlock_page(page);
+	return err;
+}
+
+static DECLARE_WAIT_QUEUE_HEAD(wq);
+
+static void writeseg_end_io(struct bio *bio, int err)
+{
+	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+	struct super_block *sb = bio->bi_private;
+	struct logfs_super *super = logfs_super(sb);
+	struct page *page;
+
+	BUG_ON(!uptodate); /* FIXME: Retry io or write elsewhere */
+	BUG_ON(err);
+	BUG_ON(bio->bi_vcnt == 0);
+	do {
+		page = bvec->bv_page;
+		if (--bvec >= bio->bi_io_vec)
+			prefetchw(&bvec->bv_page->flags);
+
+		end_page_writeback(page);
+		page_cache_release(page);
+	} while (bvec >= bio->bi_io_vec);
+	bio_put(bio);
+	if (atomic_dec_and_test(&super->s_pending_writes))
+		wake_up(&wq);
+}
+
+static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
+		size_t nr_pages)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct address_space *mapping = super->s_mapping_inode->i_mapping;
+	struct bio *bio;
+	struct page *page;
+	struct request_queue *q = bdev_get_queue(sb->s_bdev);
+	unsigned int max_pages = queue_max_hw_sectors(q) >> (PAGE_SHIFT - 9);
+	int i;
+
+	if (max_pages > BIO_MAX_PAGES)
+		max_pages = BIO_MAX_PAGES;
+	bio = bio_alloc(GFP_NOFS, max_pages);
+	BUG_ON(!bio);
+
+	for (i = 0; i < nr_pages; i++) {
+		if (i >= max_pages) {
+			/* Block layer cannot split bios :( */
+			bio->bi_vcnt = i;
+			bio->bi_idx = 0;
+			bio->bi_size = i * PAGE_SIZE;
+			bio->bi_bdev = super->s_bdev;
+			bio->bi_sector = ofs >> 9;
+			bio->bi_private = sb;
+			bio->bi_end_io = writeseg_end_io;
+			atomic_inc(&super->s_pending_writes);
+			submit_bio(WRITE, bio);
+
+			ofs += i * PAGE_SIZE;
+			index += i;
+			nr_pages -= i;
+			i = 0;
+
+			bio = bio_alloc(GFP_NOFS, max_pages);
+			BUG_ON(!bio);
+		}
+		page = find_lock_page(mapping, index + i);
+		BUG_ON(!page);
+		bio->bi_io_vec[i].bv_page = page;
+		bio->bi_io_vec[i].bv_len = PAGE_SIZE;
+		bio->bi_io_vec[i].bv_offset = 0;
+
+		BUG_ON(PageWriteback(page));
+		set_page_writeback(page);
+		unlock_page(page);
+	}
+	bio->bi_vcnt = nr_pages;
+	bio->bi_idx = 0;
+	bio->bi_size = nr_pages * PAGE_SIZE;
+	bio->bi_bdev = super->s_bdev;
+	bio->bi_sector = ofs >> 9;
+	bio->bi_private = sb;
+	bio->bi_end_io = writeseg_end_io;
+	atomic_inc(&super->s_pending_writes);
+	submit_bio(WRITE, bio);
+	return 0;
+}
+
+static void bdev_writeseg(struct super_block *sb, u64 ofs, size_t len)
+{
+	struct logfs_super *super = logfs_super(sb);
+	int head;
+
+	BUG_ON(super->s_flags & LOGFS_SB_FLAG_RO);
+
+	if (len == 0) {
+		/* This can happen when the object fit perfectly into a
+		 * segment, the segment gets written per sync and subsequently
+		 * closed.
+		 */
+		return;
+	}
+	head = ofs & (PAGE_SIZE - 1);
+	if (head) {
+		ofs -= head;
+		len += head;
+	}
+	len = PAGE_ALIGN(len);
+	__bdev_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT);
+	generic_unplug_device(bdev_get_queue(logfs_super(sb)->s_bdev));
+}
+
+
+static void erase_end_io(struct bio *bio, int err) 
+{ 
+	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 
+	struct super_block *sb = bio->bi_private; 
+	struct logfs_super *super = logfs_super(sb); 
+
+	BUG_ON(!uptodate); /* FIXME: Retry io or write elsewhere */ 
+	BUG_ON(err); 
+	BUG_ON(bio->bi_vcnt == 0); 
+	bio_put(bio); 
+	if (atomic_dec_and_test(&super->s_pending_writes))
+		wake_up(&wq); 
+} 
+
+static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index,
+		size_t nr_pages)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct bio *bio;
+	struct request_queue *q = bdev_get_queue(sb->s_bdev);
+	unsigned int max_pages = queue_max_hw_sectors(q) >> (PAGE_SHIFT - 9);
+	int i;
+
+	if (max_pages > BIO_MAX_PAGES)
+		max_pages = BIO_MAX_PAGES;
+	bio = bio_alloc(GFP_NOFS, max_pages);
+	BUG_ON(!bio);
+
+	for (i = 0; i < nr_pages; i++) {
+		if (i >= max_pages) {
+			/* Block layer cannot split bios :( */
+			bio->bi_vcnt = i;
+			bio->bi_idx = 0;
+			bio->bi_size = i * PAGE_SIZE;
+			bio->bi_bdev = super->s_bdev;
+			bio->bi_sector = ofs >> 9;
+			bio->bi_private = sb;
+			bio->bi_end_io = erase_end_io;
+			atomic_inc(&super->s_pending_writes);
+			submit_bio(WRITE, bio);
+
+			ofs += i * PAGE_SIZE;
+			index += i;
+			nr_pages -= i;
+			i = 0;
+
+			bio = bio_alloc(GFP_NOFS, max_pages);
+			BUG_ON(!bio);
+		}
+		bio->bi_io_vec[i].bv_page = super->s_erase_page;
+		bio->bi_io_vec[i].bv_len = PAGE_SIZE;
+		bio->bi_io_vec[i].bv_offset = 0;
+	}
+	bio->bi_vcnt = nr_pages;
+	bio->bi_idx = 0;
+	bio->bi_size = nr_pages * PAGE_SIZE;
+	bio->bi_bdev = super->s_bdev;
+	bio->bi_sector = ofs >> 9;
+	bio->bi_private = sb;
+	bio->bi_end_io = erase_end_io;
+	atomic_inc(&super->s_pending_writes);
+	submit_bio(WRITE, bio);
+	return 0;
+}
+
+static int bdev_erase(struct super_block *sb, loff_t to, size_t len,
+		int ensure_write)
+{
+	struct logfs_super *super = logfs_super(sb);
+
+	BUG_ON(to & (PAGE_SIZE - 1));
+	BUG_ON(len & (PAGE_SIZE - 1));
+
+	if (super->s_flags & LOGFS_SB_FLAG_RO)
+		return -EROFS;
+
+	if (ensure_write) {
+		/*
+		 * Object store doesn't care whether erases happen or not.
+		 * But for the journal they are required.  Otherwise a scan
+		 * can find an old commit entry and assume it is the current
+		 * one, travelling back in time.
+		 */
+		do_erase(sb, to, to >> PAGE_SHIFT, len >> PAGE_SHIFT);
+	}
+
+	return 0;
+}
+
+static void bdev_sync(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+
+	wait_event(wq, atomic_read(&super->s_pending_writes) == 0);
+}
+
+static struct page *bdev_find_first_sb(struct super_block *sb, u64 *ofs)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct address_space *mapping = super->s_mapping_inode->i_mapping;
+	filler_t *filler = bdev_readpage;
+
+	*ofs = 0;
+	return read_cache_page(mapping, 0, filler, sb);
+}
+
+static struct page *bdev_find_last_sb(struct super_block *sb, u64 *ofs)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct address_space *mapping = super->s_mapping_inode->i_mapping;
+	filler_t *filler = bdev_readpage;
+	u64 pos = (super->s_bdev->bd_inode->i_size & ~0xfffULL) - 0x1000;
+	pgoff_t index = pos >> PAGE_SHIFT;
+
+	*ofs = pos;
+	return read_cache_page(mapping, index, filler, sb);
+}
+
+static int bdev_write_sb(struct super_block *sb, struct page *page)
+{
+	struct block_device *bdev = logfs_super(sb)->s_bdev;
+
+	/* Nothing special to do for block devices. */
+	return sync_request(page, bdev, WRITE);
+}
+
+static void bdev_put_device(struct super_block *sb)
+{
+	close_bdev_exclusive(logfs_super(sb)->s_bdev, FMODE_READ|FMODE_WRITE);
+}
+
+static int bdev_can_write_buf(struct super_block *sb, u64 ofs)
+{
+	return 0;
+}
+
+static const struct logfs_device_ops bd_devops = {
+	.find_first_sb	= bdev_find_first_sb,
+	.find_last_sb	= bdev_find_last_sb,
+	.write_sb	= bdev_write_sb,
+	.readpage	= bdev_readpage,
+	.writeseg	= bdev_writeseg,
+	.erase		= bdev_erase,
+	.can_write_buf	= bdev_can_write_buf,
+	.sync		= bdev_sync,
+	.put_device	= bdev_put_device,
+};
+
+int logfs_get_sb_bdev(struct file_system_type *type, int flags,
+		const char *devname, struct vfsmount *mnt)
+{
+	struct block_device *bdev;
+
+	bdev = open_bdev_exclusive(devname, FMODE_READ|FMODE_WRITE, type);
+	if (IS_ERR(bdev))
+		return PTR_ERR(bdev);
+
+	if (MAJOR(bdev->bd_dev) == MTD_BLOCK_MAJOR) {
+		int mtdnr = MINOR(bdev->bd_dev);
+		close_bdev_exclusive(bdev, FMODE_READ|FMODE_WRITE);
+		return logfs_get_sb_mtd(type, flags, mtdnr, mnt);
+	}
+
+	return logfs_get_sb_device(type, flags, NULL, bdev, &bd_devops, mnt);
+}
diff --git a/fs/logfs/dev_mtd.c b/fs/logfs/dev_mtd.c
new file mode 100644
index 000000000000..a85d47d13e4b
--- /dev/null
+++ b/fs/logfs/dev_mtd.c
@@ -0,0 +1,278 @@
+/*
+ * fs/logfs/dev_mtd.c	- Device access methods for MTD
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
+ */
+#include "logfs.h"
+#include <linux/completion.h>
+#include <linux/mount.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+
+#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1))
+
+static int mtd_read(struct super_block *sb, loff_t ofs, size_t len, void *buf)
+{
+	struct mtd_info *mtd = logfs_super(sb)->s_mtd;
+	size_t retlen;
+	int ret;
+
+	ret = mtd->read(mtd, ofs, len, &retlen, buf);
+	BUG_ON(ret == -EINVAL);
+	if (ret)
+		return ret;
+
+	/* Not sure if we should loop instead. */
+	if (retlen != len)
+		return -EIO;
+
+	return 0;
+}
+
+static int mtd_write(struct super_block *sb, loff_t ofs, size_t len, void *buf)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct mtd_info *mtd = super->s_mtd;
+	size_t retlen;
+	loff_t page_start, page_end;
+	int ret;
+
+	if (super->s_flags & LOGFS_SB_FLAG_RO)
+		return -EROFS;
+
+	BUG_ON((ofs >= mtd->size) || (len > mtd->size - ofs));
+	BUG_ON(ofs != (ofs >> super->s_writeshift) << super->s_writeshift);
+	BUG_ON(len > PAGE_CACHE_SIZE);
+	page_start = ofs & PAGE_CACHE_MASK;
+	page_end = PAGE_CACHE_ALIGN(ofs + len) - 1;
+	ret = mtd->write(mtd, ofs, len, &retlen, buf);
+	if (ret || (retlen != len))
+		return -EIO;
+
+	return 0;
+}
+
+/*
+ * For as long as I can remember (since about 2001) mtd->erase has been an
+ * asynchronous interface lacking the first driver to actually use the
+ * asynchronous properties.  So just to prevent the first implementor of such
+ * a thing from breaking logfs in 2350, we do the usual pointless dance to
+ * declare a completion variable and wait for completion before returning
+ * from mtd_erase().  What an excercise in futility!
+ */
+static void logfs_erase_callback(struct erase_info *ei)
+{
+	complete((struct completion *)ei->priv);
+}
+
+static int mtd_erase_mapping(struct super_block *sb, loff_t ofs, size_t len)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct address_space *mapping = super->s_mapping_inode->i_mapping;
+	struct page *page;
+	pgoff_t index = ofs >> PAGE_SHIFT;
+
+	for (index = ofs >> PAGE_SHIFT; index < (ofs + len) >> PAGE_SHIFT; index++) {
+		page = find_get_page(mapping, index);
+		if (!page)
+			continue;
+		memset(page_address(page), 0xFF, PAGE_SIZE);
+		page_cache_release(page);
+	}
+	return 0;
+}
+
+static int mtd_erase(struct super_block *sb, loff_t ofs, size_t len,
+		int ensure_write)
+{
+	struct mtd_info *mtd = logfs_super(sb)->s_mtd;
+	struct erase_info ei;
+	DECLARE_COMPLETION_ONSTACK(complete);
+	int ret;
+
+	BUG_ON(len % mtd->erasesize);
+	if (logfs_super(sb)->s_flags & LOGFS_SB_FLAG_RO)
+		return -EROFS;
+
+	memset(&ei, 0, sizeof(ei));
+	ei.mtd = mtd;
+	ei.addr = ofs;
+	ei.len = len;
+	ei.callback = logfs_erase_callback;
+	ei.priv = (long)&complete;
+	ret = mtd->erase(mtd, &ei);
+	if (ret)
+		return -EIO;
+
+	wait_for_completion(&complete);
+	if (ei.state != MTD_ERASE_DONE)
+		return -EIO;
+	return mtd_erase_mapping(sb, ofs, len);
+}
+
+static void mtd_sync(struct super_block *sb)
+{
+	struct mtd_info *mtd = logfs_super(sb)->s_mtd;
+
+	if (mtd->sync)
+		mtd->sync(mtd);
+}
+
+static int mtd_readpage(void *_sb, struct page *page)
+{
+	struct super_block *sb = _sb;
+	int err;
+
+	err = mtd_read(sb, page->index << PAGE_SHIFT, PAGE_SIZE,
+			page_address(page));
+	if (err == -EUCLEAN || err == -EBADMSG) {
+		/* -EBADMSG happens regularly on power failures */
+		err = 0;
+		/* FIXME: force GC this segment */
+	}
+	if (err) {
+		ClearPageUptodate(page);
+		SetPageError(page);
+	} else {
+		SetPageUptodate(page);
+		ClearPageError(page);
+	}
+	unlock_page(page);
+	return err;
+}
+
+static struct page *mtd_find_first_sb(struct super_block *sb, u64 *ofs)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct address_space *mapping = super->s_mapping_inode->i_mapping;
+	filler_t *filler = mtd_readpage;
+	struct mtd_info *mtd = super->s_mtd;
+
+	if (!mtd->block_isbad)
+		return NULL;
+
+	*ofs = 0;
+	while (mtd->block_isbad(mtd, *ofs)) {
+		*ofs += mtd->erasesize;
+		if (*ofs >= mtd->size)
+			return NULL;
+	}
+	BUG_ON(*ofs & ~PAGE_MASK);
+	return read_cache_page(mapping, *ofs >> PAGE_SHIFT, filler, sb);
+}
+
+static struct page *mtd_find_last_sb(struct super_block *sb, u64 *ofs)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct address_space *mapping = super->s_mapping_inode->i_mapping;
+	filler_t *filler = mtd_readpage;
+	struct mtd_info *mtd = super->s_mtd;
+
+	if (!mtd->block_isbad)
+		return NULL;
+
+	*ofs = mtd->size - mtd->erasesize;
+	while (mtd->block_isbad(mtd, *ofs)) {
+		*ofs -= mtd->erasesize;
+		if (*ofs <= 0)
+			return NULL;
+	}
+	*ofs = *ofs + mtd->erasesize - 0x1000;
+	BUG_ON(*ofs & ~PAGE_MASK);
+	return read_cache_page(mapping, *ofs >> PAGE_SHIFT, filler, sb);
+}
+
+static int __mtd_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
+		size_t nr_pages)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct address_space *mapping = super->s_mapping_inode->i_mapping;
+	struct page *page;
+	int i, err;
+
+	for (i = 0; i < nr_pages; i++) {
+		page = find_lock_page(mapping, index + i);
+		BUG_ON(!page);
+
+		err = mtd_write(sb, page->index << PAGE_SHIFT, PAGE_SIZE,
+				page_address(page));
+		unlock_page(page);
+		page_cache_release(page);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
+static void mtd_writeseg(struct super_block *sb, u64 ofs, size_t len)
+{
+	struct logfs_super *super = logfs_super(sb);
+	int head;
+
+	if (super->s_flags & LOGFS_SB_FLAG_RO)
+		return;
+
+	if (len == 0) {
+		/* This can happen when the object fit perfectly into a
+		 * segment, the segment gets written per sync and subsequently
+		 * closed.
+		 */
+		return;
+	}
+	head = ofs & (PAGE_SIZE - 1);
+	if (head) {
+		ofs -= head;
+		len += head;
+	}
+	len = PAGE_ALIGN(len);
+	__mtd_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT);
+}
+
+static void mtd_put_device(struct super_block *sb)
+{
+	put_mtd_device(logfs_super(sb)->s_mtd);
+}
+
+static int mtd_can_write_buf(struct super_block *sb, u64 ofs)
+{
+	struct logfs_super *super = logfs_super(sb);
+	void *buf;
+	int err;
+
+	buf = kmalloc(super->s_writesize, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+	err = mtd_read(sb, ofs, super->s_writesize, buf);
+	if (err)
+		goto out;
+	if (memchr_inv(buf, 0xff, super->s_writesize))
+		err = -EIO;
+	kfree(buf);
+out:
+	return err;
+}
+
+static const struct logfs_device_ops mtd_devops = {
+	.find_first_sb	= mtd_find_first_sb,
+	.find_last_sb	= mtd_find_last_sb,
+	.readpage	= mtd_readpage,
+	.writeseg	= mtd_writeseg,
+	.erase		= mtd_erase,
+	.can_write_buf	= mtd_can_write_buf,
+	.sync		= mtd_sync,
+	.put_device	= mtd_put_device,
+};
+
+int logfs_get_sb_mtd(struct file_system_type *type, int flags,
+		int mtdnr, struct vfsmount *mnt)
+{
+	struct mtd_info *mtd;
+	const struct logfs_device_ops *devops = &mtd_devops;
+
+	mtd = get_mtd_device(NULL, mtdnr);
+	if (IS_ERR(mtd))
+		return PTR_ERR(mtd);
+	return logfs_get_sb_device(type, flags, mtd, NULL, devops, mnt);
+}
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
new file mode 100644
index 000000000000..72d1893ddd36
--- /dev/null
+++ b/fs/logfs/dir.c
@@ -0,0 +1,827 @@
+/*
+ * fs/logfs/dir.c	- directory-related code
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
+ */
+#include "logfs.h"
+#include <linux/slab.h>
+
+/*
+ * Atomic dir operations
+ *
+ * Directory operations are by default not atomic.  Dentries and Inodes are
+ * created/removed/altered in separate operations.  Therefore we need to do
+ * a small amount of journaling.
+ *
+ * Create, link, mkdir, mknod and symlink all share the same function to do
+ * the work: __logfs_create.  This function works in two atomic steps:
+ * 1. allocate inode (remember in journal)
+ * 2. allocate dentry (clear journal)
+ *
+ * As we can only get interrupted between the two, when the inode we just
+ * created is simply stored in the anchor.  On next mount, if we were
+ * interrupted, we delete the inode.  From a users point of view the
+ * operation never happened.
+ *
+ * Unlink and rmdir also share the same function: unlink.  Again, this
+ * function works in two atomic steps
+ * 1. remove dentry (remember inode in journal)
+ * 2. unlink inode (clear journal)
+ *
+ * And again, on the next mount, if we were interrupted, we delete the inode.
+ * From a users point of view the operation succeeded.
+ *
+ * Rename is the real pain to deal with, harder than all the other methods
+ * combined.  Depending on the circumstances we can run into three cases.
+ * A "target rename" where the target dentry already existed, a "local
+ * rename" where both parent directories are identical or a "cross-directory
+ * rename" in the remaining case.
+ *
+ * Local rename is atomic, as the old dentry is simply rewritten with a new
+ * name.
+ *
+ * Cross-directory rename works in two steps, similar to __logfs_create and
+ * logfs_unlink:
+ * 1. Write new dentry (remember old dentry in journal)
+ * 2. Remove old dentry (clear journal)
+ *
+ * Here we remember a dentry instead of an inode.  On next mount, if we were
+ * interrupted, we delete the dentry.  From a users point of view, the
+ * operation succeeded.
+ *
+ * Target rename works in three atomic steps:
+ * 1. Attach old inode to new dentry (remember old dentry and new inode)
+ * 2. Remove old dentry (still remember the new inode)
+ * 3. Remove victim inode
+ *
+ * Here we remember both an inode an a dentry.  If we get interrupted
+ * between steps 1 and 2, we delete both the dentry and the inode.  If
+ * we get interrupted between steps 2 and 3, we delete just the inode.
+ * In either case, the remaining objects are deleted on next mount.  From
+ * a users point of view, the operation succeeded.
+ */
+
+static int write_dir(struct inode *dir, struct logfs_disk_dentry *dd,
+		loff_t pos)
+{
+	return logfs_inode_write(dir, dd, sizeof(*dd), pos, WF_LOCK, NULL);
+}
+
+static int write_inode(struct inode *inode)
+{
+	return __logfs_write_inode(inode, WF_LOCK);
+}
+
+static s64 dir_seek_data(struct inode *inode, s64 pos)
+{
+	s64 new_pos = logfs_seek_data(inode, pos);
+
+	return max(pos, new_pos - 1);
+}
+
+static int beyond_eof(struct inode *inode, loff_t bix)
+{
+	loff_t pos = bix << inode->i_sb->s_blocksize_bits;
+	return pos >= i_size_read(inode);
+}
+
+/*
+ * Prime value was chosen to be roughly 256 + 26.  r5 hash uses 11,
+ * so short names (len <= 9) don't even occupy the complete 32bit name
+ * space.  A prime >256 ensures short names quickly spread the 32bit
+ * name space.  Add about 26 for the estimated amount of information
+ * of each character and pick a prime nearby, preferrably a bit-sparse
+ * one.
+ */
+static u32 hash_32(const char *s, int len, u32 seed)
+{
+	u32 hash = seed;
+	int i;
+
+	for (i = 0; i < len; i++)
+		hash = hash * 293 + s[i];
+	return hash;
+}
+
+/*
+ * We have to satisfy several conflicting requirements here.  Small
+ * directories should stay fairly compact and not require too many
+ * indirect blocks.  The number of possible locations for a given hash
+ * should be small to make lookup() fast.  And we should try hard not
+ * to overflow the 32bit name space or nfs and 32bit host systems will
+ * be unhappy.
+ *
+ * So we use the following scheme.  First we reduce the hash to 0..15
+ * and try a direct block.  If that is occupied we reduce the hash to
+ * 16..255 and try an indirect block.  Same for 2x and 3x indirect
+ * blocks.  Lastly we reduce the hash to 0x800_0000 .. 0xffff_ffff,
+ * but use buckets containing eight entries instead of a single one.
+ *
+ * Using 16 entries should allow for a reasonable amount of hash
+ * collisions, so the 32bit name space can be packed fairly tight
+ * before overflowing.  Oh and currently we don't overflow but return
+ * and error.
+ *
+ * How likely are collisions?  Doing the appropriate math is beyond me
+ * and the Bronstein textbook.  But running a test program to brute
+ * force collisions for a couple of days showed that on average the
+ * first collision occurs after 598M entries, with 290M being the
+ * smallest result.  Obviously 21 entries could already cause a
+ * collision if all entries are carefully chosen.
+ */
+static pgoff_t hash_index(u32 hash, int round)
+{
+	u32 i0_blocks = I0_BLOCKS;
+	u32 i1_blocks = I1_BLOCKS;
+	u32 i2_blocks = I2_BLOCKS;
+	u32 i3_blocks = I3_BLOCKS;
+
+	switch (round) {
+	case 0:
+		return hash % i0_blocks;
+	case 1:
+		return i0_blocks + hash % (i1_blocks - i0_blocks);
+	case 2:
+		return i1_blocks + hash % (i2_blocks - i1_blocks);
+	case 3:
+		return i2_blocks + hash % (i3_blocks - i2_blocks);
+	case 4 ... 19:
+		return i3_blocks + 16 * (hash % (((1<<31) - i3_blocks) / 16))
+			+ round - 4;
+	}
+	BUG();
+}
+
+static struct page *logfs_get_dd_page(struct inode *dir, struct dentry *dentry)
+{
+	struct qstr *name = &dentry->d_name;
+	struct page *page;
+	struct logfs_disk_dentry *dd;
+	u32 hash = hash_32(name->name, name->len, 0);
+	pgoff_t index;
+	int round;
+
+	if (name->len > LOGFS_MAX_NAMELEN)
+		return ERR_PTR(-ENAMETOOLONG);
+
+	for (round = 0; round < 20; round++) {
+		index = hash_index(hash, round);
+
+		if (beyond_eof(dir, index))
+			return NULL;
+		if (!logfs_exist_block(dir, index))
+			continue;
+		page = read_cache_page(dir->i_mapping, index,
+				(filler_t *)logfs_readpage, NULL);
+		if (IS_ERR(page))
+			return page;
+		dd = kmap_atomic(page, KM_USER0);
+		BUG_ON(dd->namelen == 0);
+
+		if (name->len != be16_to_cpu(dd->namelen) ||
+				memcmp(name->name, dd->name, name->len)) {
+			kunmap_atomic(dd, KM_USER0);
+			page_cache_release(page);
+			continue;
+		}
+
+		kunmap_atomic(dd, KM_USER0);
+		return page;
+	}
+	return NULL;
+}
+
+static int logfs_remove_inode(struct inode *inode)
+{
+	int ret;
+
+	inode->i_nlink--;
+	ret = write_inode(inode);
+	LOGFS_BUG_ON(ret, inode->i_sb);
+	return ret;
+}
+
+static void abort_transaction(struct inode *inode, struct logfs_transaction *ta)
+{
+	if (logfs_inode(inode)->li_block)
+		logfs_inode(inode)->li_block->ta = NULL;
+	kfree(ta);
+}
+
+static int logfs_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct logfs_super *super = logfs_super(dir->i_sb);
+	struct inode *inode = dentry->d_inode;
+	struct logfs_transaction *ta;
+	struct page *page;
+	pgoff_t index;
+	int ret;
+
+	ta = kzalloc(sizeof(*ta), GFP_KERNEL);
+	if (!ta)
+		return -ENOMEM;
+
+	ta->state = UNLINK_1;
+	ta->ino = inode->i_ino;
+
+	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+
+	page = logfs_get_dd_page(dir, dentry);
+	if (!page) {
+		kfree(ta);
+		return -ENOENT;
+	}
+	if (IS_ERR(page)) {
+		kfree(ta);
+		return PTR_ERR(page);
+	}
+	index = page->index;
+	page_cache_release(page);
+
+	mutex_lock(&super->s_dirop_mutex);
+	logfs_add_transaction(dir, ta);
+
+	ret = logfs_delete(dir, index, NULL);
+	if (!ret)
+		ret = write_inode(dir);
+
+	if (ret) {
+		abort_transaction(dir, ta);
+		printk(KERN_ERR"LOGFS: unable to delete inode\n");
+		goto out;
+	}
+
+	ta->state = UNLINK_2;
+	logfs_add_transaction(inode, ta);
+	ret = logfs_remove_inode(inode);
+out:
+	mutex_unlock(&super->s_dirop_mutex);
+	return ret;
+}
+
+static inline int logfs_empty_dir(struct inode *dir)
+{
+	u64 data;
+
+	data = logfs_seek_data(dir, 0) << dir->i_sb->s_blocksize_bits;
+	return data >= i_size_read(dir);
+}
+
+static int logfs_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	struct inode *inode = dentry->d_inode;
+
+	if (!logfs_empty_dir(inode))
+		return -ENOTEMPTY;
+
+	return logfs_unlink(dir, dentry);
+}
+
+/* FIXME: readdir currently has it's own dir_walk code.  I don't see a good
+ * way to combine the two copies */
+#define IMPLICIT_NODES 2
+static int __logfs_readdir(struct file *file, void *buf, filldir_t filldir)
+{
+	struct inode *dir = file->f_dentry->d_inode;
+	loff_t pos = file->f_pos - IMPLICIT_NODES;
+	struct page *page;
+	struct logfs_disk_dentry *dd;
+	int full;
+
+	BUG_ON(pos < 0);
+	for (;; pos++) {
+		if (beyond_eof(dir, pos))
+			break;
+		if (!logfs_exist_block(dir, pos)) {
+			/* deleted dentry */
+			pos = dir_seek_data(dir, pos);
+			continue;
+		}
+		page = read_cache_page(dir->i_mapping, pos,
+				(filler_t *)logfs_readpage, NULL);
+		if (IS_ERR(page))
+			return PTR_ERR(page);
+		dd = kmap(page);
+		BUG_ON(dd->namelen == 0);
+
+		full = filldir(buf, (char *)dd->name, be16_to_cpu(dd->namelen),
+				pos, be64_to_cpu(dd->ino), dd->type);
+		kunmap(page);
+		page_cache_release(page);
+		if (full)
+			break;
+	}
+
+	file->f_pos = pos + IMPLICIT_NODES;
+	return 0;
+}
+
+static int logfs_readdir(struct file *file, void *buf, filldir_t filldir)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	ino_t pino = parent_ino(file->f_dentry);
+	int err;
+
+	if (file->f_pos < 0)
+		return -EINVAL;
+
+	if (file->f_pos == 0) {
+		if (filldir(buf, ".", 1, 1, inode->i_ino, DT_DIR) < 0)
+			return 0;
+		file->f_pos++;
+	}
+	if (file->f_pos == 1) {
+		if (filldir(buf, "..", 2, 2, pino, DT_DIR) < 0)
+			return 0;
+		file->f_pos++;
+	}
+
+	err = __logfs_readdir(file, buf, filldir);
+	return err;
+}
+
+static void logfs_set_name(struct logfs_disk_dentry *dd, struct qstr *name)
+{
+	dd->namelen = cpu_to_be16(name->len);
+	memcpy(dd->name, name->name, name->len);
+}
+
+static struct dentry *logfs_lookup(struct inode *dir, struct dentry *dentry,
+		struct nameidata *nd)
+{
+	struct page *page;
+	struct logfs_disk_dentry *dd;
+	pgoff_t index;
+	u64 ino = 0;
+	struct inode *inode;
+
+	page = logfs_get_dd_page(dir, dentry);
+	if (IS_ERR(page))
+		return ERR_CAST(page);
+	if (!page) {
+		d_add(dentry, NULL);
+		return NULL;
+	}
+	index = page->index;
+	dd = kmap_atomic(page, KM_USER0);
+	ino = be64_to_cpu(dd->ino);
+	kunmap_atomic(dd, KM_USER0);
+	page_cache_release(page);
+
+	inode = logfs_iget(dir->i_sb, ino);
+	if (IS_ERR(inode)) {
+		printk(KERN_ERR"LogFS: Cannot read inode #%llx for dentry (%lx, %lx)n",
+				ino, dir->i_ino, index);
+		return ERR_CAST(inode);
+	}
+	return d_splice_alias(inode, dentry);
+}
+
+static void grow_dir(struct inode *dir, loff_t index)
+{
+	index = (index + 1) << dir->i_sb->s_blocksize_bits;
+	if (i_size_read(dir) < index)
+		i_size_write(dir, index);
+}
+
+static int logfs_write_dir(struct inode *dir, struct dentry *dentry,
+		struct inode *inode)
+{
+	struct page *page;
+	struct logfs_disk_dentry *dd;
+	u32 hash = hash_32(dentry->d_name.name, dentry->d_name.len, 0);
+	pgoff_t index;
+	int round, err;
+
+	for (round = 0; round < 20; round++) {
+		index = hash_index(hash, round);
+
+		if (logfs_exist_block(dir, index))
+			continue;
+		page = find_or_create_page(dir->i_mapping, index, GFP_KERNEL);
+		if (!page)
+			return -ENOMEM;
+
+		dd = kmap_atomic(page, KM_USER0);
+		memset(dd, 0, sizeof(*dd));
+		dd->ino = cpu_to_be64(inode->i_ino);
+		dd->type = logfs_type(inode);
+		logfs_set_name(dd, &dentry->d_name);
+		kunmap_atomic(dd, KM_USER0);
+
+		err = logfs_write_buf(dir, page, WF_LOCK);
+		unlock_page(page);
+		page_cache_release(page);
+		if (!err)
+			grow_dir(dir, index);
+		return err;
+	}
+	/* FIXME: Is there a better return value?  In most cases neither
+	 * the filesystem nor the directory are full.  But we have had
+	 * too many collisions for this particular hash and no fallback.
+	 */
+	return -ENOSPC;
+}
+
+static int __logfs_create(struct inode *dir, struct dentry *dentry,
+		struct inode *inode, const char *dest, long destlen)
+{
+	struct logfs_super *super = logfs_super(dir->i_sb);
+	struct logfs_inode *li = logfs_inode(inode);
+	struct logfs_transaction *ta;
+	int ret;
+
+	ta = kzalloc(sizeof(*ta), GFP_KERNEL);
+	if (!ta)
+		return -ENOMEM;
+
+	ta->state = CREATE_1;
+	ta->ino = inode->i_ino;
+	mutex_lock(&super->s_dirop_mutex);
+	logfs_add_transaction(inode, ta);
+
+	if (dest) {
+		/* symlink */
+		ret = logfs_inode_write(inode, dest, destlen, 0, WF_LOCK, NULL);
+		if (!ret)
+			ret = write_inode(inode);
+	} else {
+		/* creat/mkdir/mknod */
+		ret = write_inode(inode);
+	}
+	if (ret) {
+		abort_transaction(inode, ta);
+		li->li_flags |= LOGFS_IF_STILLBORN;
+		/* FIXME: truncate symlink */
+		inode->i_nlink--;
+		iput(inode);
+		goto out;
+	}
+
+	ta->state = CREATE_2;
+	logfs_add_transaction(dir, ta);
+	ret = logfs_write_dir(dir, dentry, inode);
+	/* sync directory */
+	if (!ret)
+		ret = write_inode(dir);
+
+	if (ret) {
+		logfs_del_transaction(dir, ta);
+		ta->state = CREATE_2;
+		logfs_add_transaction(inode, ta);
+		logfs_remove_inode(inode);
+		iput(inode);
+		goto out;
+	}
+	d_instantiate(dentry, inode);
+out:
+	mutex_unlock(&super->s_dirop_mutex);
+	return ret;
+}
+
+static int logfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+	struct inode *inode;
+
+	/*
+	 * FIXME: why do we have to fill in S_IFDIR, while the mode is
+	 * correct for mknod, creat, etc.?  Smells like the vfs *should*
+	 * do it for us but for some reason fails to do so.
+	 */
+	inode = logfs_new_inode(dir, S_IFDIR | mode);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+
+	inode->i_op = &logfs_dir_iops;
+	inode->i_fop = &logfs_dir_fops;
+
+	return __logfs_create(dir, dentry, inode, NULL, 0);
+}
+
+static int logfs_create(struct inode *dir, struct dentry *dentry, int mode,
+		struct nameidata *nd)
+{
+	struct inode *inode;
+
+	inode = logfs_new_inode(dir, mode);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+
+	inode->i_op = &logfs_reg_iops;
+	inode->i_fop = &logfs_reg_fops;
+	inode->i_mapping->a_ops = &logfs_reg_aops;
+
+	return __logfs_create(dir, dentry, inode, NULL, 0);
+}
+
+static int logfs_mknod(struct inode *dir, struct dentry *dentry, int mode,
+		dev_t rdev)
+{
+	struct inode *inode;
+
+	if (dentry->d_name.len > LOGFS_MAX_NAMELEN)
+		return -ENAMETOOLONG;
+
+	inode = logfs_new_inode(dir, mode);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+
+	init_special_inode(inode, mode, rdev);
+
+	return __logfs_create(dir, dentry, inode, NULL, 0);
+}
+
+static int logfs_symlink(struct inode *dir, struct dentry *dentry,
+		const char *target)
+{
+	struct inode *inode;
+	size_t destlen = strlen(target) + 1;
+
+	if (destlen > dir->i_sb->s_blocksize)
+		return -ENAMETOOLONG;
+
+	inode = logfs_new_inode(dir, S_IFLNK | 0777);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+
+	inode->i_op = &logfs_symlink_iops;
+	inode->i_mapping->a_ops = &logfs_reg_aops;
+
+	return __logfs_create(dir, dentry, inode, target, destlen);
+}
+
+static int logfs_permission(struct inode *inode, int mask)
+{
+	return generic_permission(inode, mask, NULL);
+}
+
+static int logfs_link(struct dentry *old_dentry, struct inode *dir,
+		struct dentry *dentry)
+{
+	struct inode *inode = old_dentry->d_inode;
+
+	if (inode->i_nlink >= LOGFS_LINK_MAX)
+		return -EMLINK;
+
+	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+	atomic_inc(&inode->i_count);
+	inode->i_nlink++;
+	mark_inode_dirty_sync(inode);
+
+	return __logfs_create(dir, dentry, inode, NULL, 0);
+}
+
+static int logfs_get_dd(struct inode *dir, struct dentry *dentry,
+		struct logfs_disk_dentry *dd, loff_t *pos)
+{
+	struct page *page;
+	void *map;
+
+	page = logfs_get_dd_page(dir, dentry);
+	if (IS_ERR(page))
+		return PTR_ERR(page);
+	*pos = page->index;
+	map = kmap_atomic(page, KM_USER0);
+	memcpy(dd, map, sizeof(*dd));
+	kunmap_atomic(map, KM_USER0);
+	page_cache_release(page);
+	return 0;
+}
+
+static int logfs_delete_dd(struct inode *dir, loff_t pos)
+{
+	/*
+	 * Getting called with pos somewhere beyond eof is either a goofup
+	 * within this file or means someone maliciously edited the
+	 * (crc-protected) journal.
+	 */
+	BUG_ON(beyond_eof(dir, pos));
+	dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+	log_dir(" Delete dentry (%lx, %llx)\n", dir->i_ino, pos);
+	return logfs_delete(dir, pos, NULL);
+}
+
+/*
+ * Cross-directory rename, target does not exist.  Just a little nasty.
+ * Create a new dentry in the target dir, then remove the old dentry,
+ * all the while taking care to remember our operation in the journal.
+ */
+static int logfs_rename_cross(struct inode *old_dir, struct dentry *old_dentry,
+			      struct inode *new_dir, struct dentry *new_dentry)
+{
+	struct logfs_super *super = logfs_super(old_dir->i_sb);
+	struct logfs_disk_dentry dd;
+	struct logfs_transaction *ta;
+	loff_t pos;
+	int err;
+
+	/* 1. locate source dd */
+	err = logfs_get_dd(old_dir, old_dentry, &dd, &pos);
+	if (err)
+		return err;
+
+	ta = kzalloc(sizeof(*ta), GFP_KERNEL);
+	if (!ta)
+		return -ENOMEM;
+
+	ta->state = CROSS_RENAME_1;
+	ta->dir = old_dir->i_ino;
+	ta->pos = pos;
+
+	/* 2. write target dd */
+	mutex_lock(&super->s_dirop_mutex);
+	logfs_add_transaction(new_dir, ta);
+	err = logfs_write_dir(new_dir, new_dentry, old_dentry->d_inode);
+	if (!err)
+		err = write_inode(new_dir);
+
+	if (err) {
+		super->s_rename_dir = 0;
+		super->s_rename_pos = 0;
+		abort_transaction(new_dir, ta);
+		goto out;
+	}
+
+	/* 3. remove source dd */
+	ta->state = CROSS_RENAME_2;
+	logfs_add_transaction(old_dir, ta);
+	err = logfs_delete_dd(old_dir, pos);
+	if (!err)
+		err = write_inode(old_dir);
+	LOGFS_BUG_ON(err, old_dir->i_sb);
+out:
+	mutex_unlock(&super->s_dirop_mutex);
+	return err;
+}
+
+static int logfs_replace_inode(struct inode *dir, struct dentry *dentry,
+		struct logfs_disk_dentry *dd, struct inode *inode)
+{
+	loff_t pos;
+	int err;
+
+	err = logfs_get_dd(dir, dentry, dd, &pos);
+	if (err)
+		return err;
+	dd->ino = cpu_to_be64(inode->i_ino);
+	dd->type = logfs_type(inode);
+
+	err = write_dir(dir, dd, pos);
+	if (err)
+		return err;
+	log_dir("Replace dentry (%lx, %llx) %s -> %llx\n", dir->i_ino, pos,
+			dd->name, be64_to_cpu(dd->ino));
+	return write_inode(dir);
+}
+
+/* Target dentry exists - the worst case.  We need to attach the source
+ * inode to the target dentry, then remove the orphaned target inode and
+ * source dentry.
+ */
+static int logfs_rename_target(struct inode *old_dir, struct dentry *old_dentry,
+			       struct inode *new_dir, struct dentry *new_dentry)
+{
+	struct logfs_super *super = logfs_super(old_dir->i_sb);
+	struct inode *old_inode = old_dentry->d_inode;
+	struct inode *new_inode = new_dentry->d_inode;
+	int isdir = S_ISDIR(old_inode->i_mode);
+	struct logfs_disk_dentry dd;
+	struct logfs_transaction *ta;
+	loff_t pos;
+	int err;
+
+	BUG_ON(isdir != S_ISDIR(new_inode->i_mode));
+	if (isdir) {
+		if (!logfs_empty_dir(new_inode))
+			return -ENOTEMPTY;
+	}
+
+	/* 1. locate source dd */
+	err = logfs_get_dd(old_dir, old_dentry, &dd, &pos);
+	if (err)
+		return err;
+
+	ta = kzalloc(sizeof(*ta), GFP_KERNEL);
+	if (!ta)
+		return -ENOMEM;
+
+	ta->state = TARGET_RENAME_1;
+	ta->dir = old_dir->i_ino;
+	ta->pos = pos;
+	ta->ino = new_inode->i_ino;
+
+	/* 2. attach source inode to target dd */
+	mutex_lock(&super->s_dirop_mutex);
+	logfs_add_transaction(new_dir, ta);
+	err = logfs_replace_inode(new_dir, new_dentry, &dd, old_inode);
+	if (err) {
+		super->s_rename_dir = 0;
+		super->s_rename_pos = 0;
+		super->s_victim_ino = 0;
+		abort_transaction(new_dir, ta);
+		goto out;
+	}
+
+	/* 3. remove source dd */
+	ta->state = TARGET_RENAME_2;
+	logfs_add_transaction(old_dir, ta);
+	err = logfs_delete_dd(old_dir, pos);
+	if (!err)
+		err = write_inode(old_dir);
+	LOGFS_BUG_ON(err, old_dir->i_sb);
+
+	/* 4. remove target inode */
+	ta->state = TARGET_RENAME_3;
+	logfs_add_transaction(new_inode, ta);
+	err = logfs_remove_inode(new_inode);
+
+out:
+	mutex_unlock(&super->s_dirop_mutex);
+	return err;
+}
+
+static int logfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+			struct inode *new_dir, struct dentry *new_dentry)
+{
+	if (new_dentry->d_inode)
+		return logfs_rename_target(old_dir, old_dentry,
+					   new_dir, new_dentry);
+	return logfs_rename_cross(old_dir, old_dentry, new_dir, new_dentry);
+}
+
+/* No locking done here, as this is called before .get_sb() returns. */
+int logfs_replay_journal(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct inode *inode;
+	u64 ino, pos;
+	int err;
+
+	if (super->s_victim_ino) {
+		/* delete victim inode */
+		ino = super->s_victim_ino;
+		printk(KERN_INFO"LogFS: delete unmapped inode #%llx\n", ino);
+		inode = logfs_iget(sb, ino);
+		if (IS_ERR(inode))
+			goto fail;
+
+		LOGFS_BUG_ON(i_size_read(inode) > 0, sb);
+		super->s_victim_ino = 0;
+		err = logfs_remove_inode(inode);
+		iput(inode);
+		if (err) {
+			super->s_victim_ino = ino;
+			goto fail;
+		}
+	}
+	if (super->s_rename_dir) {
+		/* delete old dd from rename */
+		ino = super->s_rename_dir;
+		pos = super->s_rename_pos;
+		printk(KERN_INFO"LogFS: delete unbacked dentry (%llx, %llx)\n",
+				ino, pos);
+		inode = logfs_iget(sb, ino);
+		if (IS_ERR(inode))
+			goto fail;
+
+		super->s_rename_dir = 0;
+		super->s_rename_pos = 0;
+		err = logfs_delete_dd(inode, pos);
+		iput(inode);
+		if (err) {
+			super->s_rename_dir = ino;
+			super->s_rename_pos = pos;
+			goto fail;
+		}
+	}
+	return 0;
+fail:
+	LOGFS_BUG(sb);
+	return -EIO;
+}
+
+const struct inode_operations logfs_symlink_iops = {
+	.readlink	= generic_readlink,
+	.follow_link	= page_follow_link_light,
+};
+
+const struct inode_operations logfs_dir_iops = {
+	.create		= logfs_create,
+	.link		= logfs_link,
+	.lookup		= logfs_lookup,
+	.mkdir		= logfs_mkdir,
+	.mknod		= logfs_mknod,
+	.rename		= logfs_rename,
+	.rmdir		= logfs_rmdir,
+	.permission	= logfs_permission,
+	.symlink	= logfs_symlink,
+	.unlink		= logfs_unlink,
+};
+const struct file_operations logfs_dir_fops = {
+	.fsync		= logfs_fsync,
+	.ioctl		= logfs_ioctl,
+	.readdir	= logfs_readdir,
+	.read		= generic_read_dir,
+};
diff --git a/fs/logfs/file.c b/fs/logfs/file.c
new file mode 100644
index 000000000000..abe1cafbd4c2
--- /dev/null
+++ b/fs/logfs/file.c
@@ -0,0 +1,271 @@
+/*
+ * fs/logfs/file.c	- prepare_write, commit_write and friends
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
+ */
+#include "logfs.h"
+#include <linux/sched.h>
+#include <linux/writeback.h>
+
+static int logfs_write_begin(struct file *file, struct address_space *mapping,
+		loff_t pos, unsigned len, unsigned flags,
+		struct page **pagep, void **fsdata)
+{
+	struct inode *inode = mapping->host;
+	struct page *page;
+	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+
+	page = grab_cache_page_write_begin(mapping, index, flags);
+	if (!page)
+		return -ENOMEM;
+	*pagep = page;
+
+	if ((len == PAGE_CACHE_SIZE) || PageUptodate(page))
+		return 0;
+	if ((pos & PAGE_CACHE_MASK) >= i_size_read(inode)) {
+		unsigned start = pos & (PAGE_CACHE_SIZE - 1);
+		unsigned end = start + len;
+
+		/* Reading beyond i_size is simple: memset to zero */
+		zero_user_segments(page, 0, start, end, PAGE_CACHE_SIZE);
+		return 0;
+	}
+	return logfs_readpage_nolock(page);
+}
+
+static int logfs_write_end(struct file *file, struct address_space *mapping,
+		loff_t pos, unsigned len, unsigned copied, struct page *page,
+		void *fsdata)
+{
+	struct inode *inode = mapping->host;
+	pgoff_t index = page->index;
+	unsigned start = pos & (PAGE_CACHE_SIZE - 1);
+	unsigned end = start + copied;
+	int ret = 0;
+
+	BUG_ON(PAGE_CACHE_SIZE != inode->i_sb->s_blocksize);
+	BUG_ON(page->index > I3_BLOCKS);
+
+	if (copied < len) {
+		/*
+		 * Short write of a non-initialized paged.  Just tell userspace
+		 * to retry the entire page.
+		 */
+		if (!PageUptodate(page)) {
+			copied = 0;
+			goto out;
+		}
+	}
+	if (copied == 0)
+		goto out; /* FIXME: do we need to update inode? */
+
+	if (i_size_read(inode) < (index << PAGE_CACHE_SHIFT) + end) {
+		i_size_write(inode, (index << PAGE_CACHE_SHIFT) + end);
+		mark_inode_dirty_sync(inode);
+	}
+
+	SetPageUptodate(page);
+	if (!PageDirty(page)) {
+		if (!get_page_reserve(inode, page))
+			__set_page_dirty_nobuffers(page);
+		else
+			ret = logfs_write_buf(inode, page, WF_LOCK);
+	}
+out:
+	unlock_page(page);
+	page_cache_release(page);
+	return ret ? ret : copied;
+}
+
+int logfs_readpage(struct file *file, struct page *page)
+{
+	int ret;
+
+	ret = logfs_readpage_nolock(page);
+	unlock_page(page);
+	return ret;
+}
+
+/* Clear the page's dirty flag in the radix tree. */
+/* TODO: mucking with PageWriteback is silly.  Add a generic function to clear
+ * the dirty bit from the radix tree for filesystems that don't have to wait
+ * for page writeback to finish (i.e. any compressing filesystem).
+ */
+static void clear_radix_tree_dirty(struct page *page)
+{
+	BUG_ON(PagePrivate(page) || page->private);
+	set_page_writeback(page);
+	end_page_writeback(page);
+}
+
+static int __logfs_writepage(struct page *page)
+{
+	struct inode *inode = page->mapping->host;
+	int err;
+
+	err = logfs_write_buf(inode, page, WF_LOCK);
+	if (err)
+		set_page_dirty(page);
+	else
+		clear_radix_tree_dirty(page);
+	unlock_page(page);
+	return err;
+}
+
+static int logfs_writepage(struct page *page, struct writeback_control *wbc)
+{
+	struct inode *inode = page->mapping->host;
+	loff_t i_size = i_size_read(inode);
+	pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+	unsigned offset;
+	u64 bix;
+	level_t level;
+
+	log_file("logfs_writepage(%lx, %lx, %p)\n", inode->i_ino, page->index,
+			page);
+
+	logfs_unpack_index(page->index, &bix, &level);
+
+	/* Indirect blocks are never truncated */
+	if (level != 0)
+		return __logfs_writepage(page);
+
+	/*
+	 * TODO: everything below is a near-verbatim copy of nobh_writepage().
+	 * The relevant bits should be factored out after logfs is merged.
+	 */
+
+	/* Is the page fully inside i_size? */
+	if (bix < end_index)
+		return __logfs_writepage(page);
+
+	 /* Is the page fully outside i_size? (truncate in progress) */
+	offset = i_size & (PAGE_CACHE_SIZE-1);
+	if (bix > end_index || offset == 0) {
+		unlock_page(page);
+		return 0; /* don't care */
+	}
+
+	/*
+	 * The page straddles i_size.  It must be zeroed out on each and every
+	 * writepage invokation because it may be mmapped.  "A file is mapped
+	 * in multiples of the page size.  For a file that is not a multiple of
+	 * the  page size, the remaining memory is zeroed when mapped, and
+	 * writes to that region are not written out to the file."
+	 */
+	zero_user_segment(page, offset, PAGE_CACHE_SIZE);
+	return __logfs_writepage(page);
+}
+
+static void logfs_invalidatepage(struct page *page, unsigned long offset)
+{
+	struct logfs_block *block = logfs_block(page);
+
+	if (block->reserved_bytes) {
+		struct super_block *sb = page->mapping->host->i_sb;
+		struct logfs_super *super = logfs_super(sb);
+
+		super->s_dirty_pages -= block->reserved_bytes;
+		block->ops->free_block(sb, block);
+		BUG_ON(bitmap_weight(block->alias_map, LOGFS_BLOCK_FACTOR));
+	} else
+		move_page_to_btree(page);
+	BUG_ON(PagePrivate(page) || page->private);
+}
+
+static int logfs_releasepage(struct page *page, gfp_t only_xfs_uses_this)
+{
+	return 0; /* None of these are easy to release */
+}
+
+
+int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
+		unsigned long arg)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	unsigned int oldflags, flags;
+	int err;
+
+	switch (cmd) {
+	case FS_IOC_GETFLAGS:
+		flags = li->li_flags & LOGFS_FL_USER_VISIBLE;
+		return put_user(flags, (int __user *)arg);
+	case FS_IOC_SETFLAGS:
+		if (IS_RDONLY(inode))
+			return -EROFS;
+
+		if (!is_owner_or_cap(inode))
+			return -EACCES;
+
+		err = get_user(flags, (int __user *)arg);
+		if (err)
+			return err;
+
+		mutex_lock(&inode->i_mutex);
+		oldflags = li->li_flags;
+		flags &= LOGFS_FL_USER_MODIFIABLE;
+		flags |= oldflags & ~LOGFS_FL_USER_MODIFIABLE;
+		li->li_flags = flags;
+		mutex_unlock(&inode->i_mutex);
+
+		inode->i_ctime = CURRENT_TIME;
+		mark_inode_dirty_sync(inode);
+		return 0;
+
+	default:
+		return -ENOTTY;
+	}
+}
+
+int logfs_fsync(struct file *file, int datasync)
+{
+	struct super_block *sb = file->f_mapping->host->i_sb;
+
+	logfs_write_anchor(sb);
+	return 0;
+}
+
+static int logfs_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	struct inode *inode = dentry->d_inode;
+	int err = 0;
+
+	if (attr->ia_valid & ATTR_SIZE)
+		err = logfs_truncate(inode, attr->ia_size);
+	attr->ia_valid &= ~ATTR_SIZE;
+
+	if (!err)
+		err = inode_change_ok(inode, attr);
+	if (!err)
+		err = inode_setattr(inode, attr);
+	return err;
+}
+
+const struct inode_operations logfs_reg_iops = {
+	.setattr	= logfs_setattr,
+};
+
+const struct file_operations logfs_reg_fops = {
+	.aio_read	= generic_file_aio_read,
+	.aio_write	= generic_file_aio_write,
+	.fsync		= logfs_fsync,
+	.ioctl		= logfs_ioctl,
+	.llseek		= generic_file_llseek,
+	.mmap		= generic_file_readonly_mmap,
+	.open		= generic_file_open,
+	.read		= do_sync_read,
+	.write		= do_sync_write,
+};
+
+const struct address_space_operations logfs_reg_aops = {
+	.invalidatepage	= logfs_invalidatepage,
+	.readpage	= logfs_readpage,
+	.releasepage	= logfs_releasepage,
+	.set_page_dirty	= __set_page_dirty_nobuffers,
+	.writepage	= logfs_writepage,
+	.writepages	= generic_writepages,
+	.write_begin	= logfs_write_begin,
+	.write_end	= logfs_write_end,
+};
diff --git a/fs/logfs/gc.c b/fs/logfs/gc.c
new file mode 100644
index 000000000000..caa4419285dc
--- /dev/null
+++ b/fs/logfs/gc.c
@@ -0,0 +1,732 @@
+/*
+ * fs/logfs/gc.c	- garbage collection code
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
+ */
+#include "logfs.h"
+#include <linux/sched.h>
+#include <linux/slab.h>
+
+/*
+ * Wear leveling needs to kick in when the difference between low erase
+ * counts and high erase counts gets too big.  A good value for "too big"
+ * may be somewhat below 10% of maximum erase count for the device.
+ * Why not 397, to pick a nice round number with no specific meaning? :)
+ *
+ * WL_RATELIMIT is the minimum time between two wear level events.  A huge
+ * number of segments may fulfil the requirements for wear leveling at the
+ * same time.  If that happens we don't want to cause a latency from hell,
+ * but just gently pick one segment every so often and minimize overhead.
+ */
+#define WL_DELTA 397
+#define WL_RATELIMIT 100
+#define MAX_OBJ_ALIASES	2600
+#define SCAN_RATIO 512	/* number of scanned segments per gc'd segment */
+#define LIST_SIZE 64	/* base size of candidate lists */
+#define SCAN_ROUNDS 128	/* maximum number of complete medium scans */
+#define SCAN_ROUNDS_HIGH 4 /* maximum number of higher-level scans */
+
+static int no_free_segments(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+
+	return super->s_free_list.count;
+}
+
+/* journal has distance -1, top-most ifile layer distance 0 */
+static u8 root_distance(struct super_block *sb, gc_level_t __gc_level)
+{
+	struct logfs_super *super = logfs_super(sb);
+	u8 gc_level = (__force u8)__gc_level;
+
+	switch (gc_level) {
+	case 0: /* fall through */
+	case 1: /* fall through */
+	case 2: /* fall through */
+	case 3:
+		/* file data or indirect blocks */
+		return super->s_ifile_levels + super->s_iblock_levels - gc_level;
+	case 6: /* fall through */
+	case 7: /* fall through */
+	case 8: /* fall through */
+	case 9:
+		/* inode file data or indirect blocks */
+		return super->s_ifile_levels - (gc_level - 6);
+	default:
+		printk(KERN_ERR"LOGFS: segment of unknown level %x found\n",
+				gc_level);
+		WARN_ON(1);
+		return super->s_ifile_levels + super->s_iblock_levels;
+	}
+}
+
+static int segment_is_reserved(struct super_block *sb, u32 segno)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_area *area;
+	void *reserved;
+	int i;
+
+	/* Some segments are reserved.  Just pretend they were all valid */
+	reserved = btree_lookup32(&super->s_reserved_segments, segno);
+	if (reserved)
+		return 1;
+
+	/* Currently open segments */
+	for_each_area(i) {
+		area = super->s_area[i];
+		if (area->a_is_open && area->a_segno == segno)
+			return 1;
+	}
+
+	return 0;
+}
+
+static void logfs_mark_segment_bad(struct super_block *sb, u32 segno)
+{
+	BUG();
+}
+
+/*
+ * Returns the bytes consumed by valid objects in this segment.  Object headers
+ * are counted, the segment header is not.
+ */
+static u32 logfs_valid_bytes(struct super_block *sb, u32 segno, u32 *ec,
+		gc_level_t *gc_level)
+{
+	struct logfs_segment_entry se;
+	u32 ec_level;
+
+	logfs_get_segment_entry(sb, segno, &se);
+	if (se.ec_level == cpu_to_be32(BADSEG) ||
+			se.valid == cpu_to_be32(RESERVED))
+		return RESERVED;
+
+	ec_level = be32_to_cpu(se.ec_level);
+	*ec = ec_level >> 4;
+	*gc_level = GC_LEVEL(ec_level & 0xf);
+	return be32_to_cpu(se.valid);
+}
+
+static void logfs_cleanse_block(struct super_block *sb, u64 ofs, u64 ino,
+		u64 bix, gc_level_t gc_level)
+{
+	struct inode *inode;
+	int err, cookie;
+
+	inode = logfs_safe_iget(sb, ino, &cookie);
+	err = logfs_rewrite_block(inode, bix, ofs, gc_level, 0);
+	BUG_ON(err);
+	logfs_safe_iput(inode, cookie);
+}
+
+static u32 logfs_gc_segment(struct super_block *sb, u32 segno)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_segment_header sh;
+	struct logfs_object_header oh;
+	u64 ofs, ino, bix;
+	u32 seg_ofs, logical_segno, cleaned = 0;
+	int err, len, valid;
+	gc_level_t gc_level;
+
+	LOGFS_BUG_ON(segment_is_reserved(sb, segno), sb);
+
+	btree_insert32(&super->s_reserved_segments, segno, (void *)1, GFP_NOFS);
+	err = wbuf_read(sb, dev_ofs(sb, segno, 0), sizeof(sh), &sh);
+	BUG_ON(err);
+	gc_level = GC_LEVEL(sh.level);
+	logical_segno = be32_to_cpu(sh.segno);
+	if (sh.crc != logfs_crc32(&sh, sizeof(sh), 4)) {
+		logfs_mark_segment_bad(sb, segno);
+		cleaned = -1;
+		goto out;
+	}
+
+	for (seg_ofs = LOGFS_SEGMENT_HEADERSIZE;
+			seg_ofs + sizeof(oh) < super->s_segsize; ) {
+		ofs = dev_ofs(sb, logical_segno, seg_ofs);
+		err = wbuf_read(sb, dev_ofs(sb, segno, seg_ofs), sizeof(oh),
+				&oh);
+		BUG_ON(err);
+
+		if (!memchr_inv(&oh, 0xff, sizeof(oh)))
+			break;
+
+		if (oh.crc != logfs_crc32(&oh, sizeof(oh) - 4, 4)) {
+			logfs_mark_segment_bad(sb, segno);
+			cleaned = super->s_segsize - 1;
+			goto out;
+		}
+
+		ino = be64_to_cpu(oh.ino);
+		bix = be64_to_cpu(oh.bix);
+		len = sizeof(oh) + be16_to_cpu(oh.len);
+		valid = logfs_is_valid_block(sb, ofs, ino, bix, gc_level);
+		if (valid == 1) {
+			logfs_cleanse_block(sb, ofs, ino, bix, gc_level);
+			cleaned += len;
+		} else if (valid == 2) {
+			/* Will be invalid upon journal commit */
+			cleaned += len;
+		}
+		seg_ofs += len;
+	}
+out:
+	btree_remove32(&super->s_reserved_segments, segno);
+	return cleaned;
+}
+
+static struct gc_candidate *add_list(struct gc_candidate *cand,
+		struct candidate_list *list)
+{
+	struct rb_node **p = &list->rb_tree.rb_node;
+	struct rb_node *parent = NULL;
+	struct gc_candidate *cur;
+	int comp;
+
+	cand->list = list;
+	while (*p) {
+		parent = *p;
+		cur = rb_entry(parent, struct gc_candidate, rb_node);
+
+		if (list->sort_by_ec)
+			comp = cand->erase_count < cur->erase_count;
+		else
+			comp = cand->valid < cur->valid;
+
+		if (comp)
+			p = &parent->rb_left;
+		else
+			p = &parent->rb_right;
+	}
+	rb_link_node(&cand->rb_node, parent, p);
+	rb_insert_color(&cand->rb_node, &list->rb_tree);
+
+	if (list->count <= list->maxcount) {
+		list->count++;
+		return NULL;
+	}
+	cand = rb_entry(rb_last(&list->rb_tree), struct gc_candidate, rb_node);
+	rb_erase(&cand->rb_node, &list->rb_tree);
+	cand->list = NULL;
+	return cand;
+}
+
+static void remove_from_list(struct gc_candidate *cand)
+{
+	struct candidate_list *list = cand->list;
+
+	rb_erase(&cand->rb_node, &list->rb_tree);
+	list->count--;
+}
+
+static void free_candidate(struct super_block *sb, struct gc_candidate *cand)
+{
+	struct logfs_super *super = logfs_super(sb);
+
+	btree_remove32(&super->s_cand_tree, cand->segno);
+	kfree(cand);
+}
+
+u32 get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec)
+{
+	struct gc_candidate *cand;
+	u32 segno;
+
+	BUG_ON(list->count == 0);
+
+	cand = rb_entry(rb_first(&list->rb_tree), struct gc_candidate, rb_node);
+	remove_from_list(cand);
+	segno = cand->segno;
+	if (ec)
+		*ec = cand->erase_count;
+	free_candidate(sb, cand);
+	return segno;
+}
+
+/*
+ * We have several lists to manage segments with.  The reserve_list is used to
+ * deal with bad blocks.  We try to keep the best (lowest ec) segments on this
+ * list.
+ * The free_list contains free segments for normal usage.  It usually gets the
+ * second pick after the reserve_list.  But when the free_list is running short
+ * it is more important to keep the free_list full than to keep a reserve.
+ *
+ * Segments that are not free are put onto a per-level low_list.  If we have
+ * to run garbage collection, we pick a candidate from there.  All segments on
+ * those lists should have at least some free space so GC will make progress.
+ *
+ * And last we have the ec_list, which is used to pick segments for wear
+ * leveling.
+ *
+ * If all appropriate lists are full, we simply free the candidate and forget
+ * about that segment for a while.  We have better candidates for each purpose.
+ */
+static void __add_candidate(struct super_block *sb, struct gc_candidate *cand)
+{
+	struct logfs_super *super = logfs_super(sb);
+	u32 full = super->s_segsize - LOGFS_SEGMENT_RESERVE;
+
+	if (cand->valid == 0) {
+		/* 100% free segments */
+		log_gc_noisy("add reserve segment %x (ec %x) at %llx\n",
+				cand->segno, cand->erase_count,
+				dev_ofs(sb, cand->segno, 0));
+		cand = add_list(cand, &super->s_reserve_list);
+		if (cand) {
+			log_gc_noisy("add free segment %x (ec %x) at %llx\n",
+					cand->segno, cand->erase_count,
+					dev_ofs(sb, cand->segno, 0));
+			cand = add_list(cand, &super->s_free_list);
+		}
+	} else {
+		/* good candidates for Garbage Collection */
+		if (cand->valid < full)
+			cand = add_list(cand, &super->s_low_list[cand->dist]);
+		/* good candidates for wear leveling,
+		 * segments that were recently written get ignored */
+		if (cand)
+			cand = add_list(cand, &super->s_ec_list);
+	}
+	if (cand)
+		free_candidate(sb, cand);
+}
+
+static int add_candidate(struct super_block *sb, u32 segno, u32 valid, u32 ec,
+		u8 dist)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct gc_candidate *cand;
+
+	cand = kmalloc(sizeof(*cand), GFP_NOFS);
+	if (!cand)
+		return -ENOMEM;
+
+	cand->segno = segno;
+	cand->valid = valid;
+	cand->erase_count = ec;
+	cand->dist = dist;
+
+	btree_insert32(&super->s_cand_tree, segno, cand, GFP_NOFS);
+	__add_candidate(sb, cand);
+	return 0;
+}
+
+static void remove_segment_from_lists(struct super_block *sb, u32 segno)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct gc_candidate *cand;
+
+	cand = btree_lookup32(&super->s_cand_tree, segno);
+	if (cand) {
+		remove_from_list(cand);
+		free_candidate(sb, cand);
+	}
+}
+
+static void scan_segment(struct super_block *sb, u32 segno)
+{
+	u32 valid, ec = 0;
+	gc_level_t gc_level = 0;
+	u8 dist;
+
+	if (segment_is_reserved(sb, segno))
+		return;
+
+	remove_segment_from_lists(sb, segno);
+	valid = logfs_valid_bytes(sb, segno, &ec, &gc_level);
+	if (valid == RESERVED)
+		return;
+
+	dist = root_distance(sb, gc_level);
+	add_candidate(sb, segno, valid, ec, dist);
+}
+
+static struct gc_candidate *first_in_list(struct candidate_list *list)
+{
+	if (list->count == 0)
+		return NULL;
+	return rb_entry(rb_first(&list->rb_tree), struct gc_candidate, rb_node);
+}
+
+/*
+ * Find the best segment for garbage collection.  Main criterion is
+ * the segment requiring the least effort to clean.  Secondary
+ * criterion is to GC on the lowest level available.
+ *
+ * So we search the least effort segment on the lowest level first,
+ * then move up and pick another segment iff is requires significantly
+ * less effort.  Hence the LOGFS_MAX_OBJECTSIZE in the comparison.
+ */
+static struct gc_candidate *get_candidate(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	int i, max_dist;
+	struct gc_candidate *cand = NULL, *this;
+
+	max_dist = min(no_free_segments(sb), LOGFS_NO_AREAS);
+
+	for (i = max_dist; i >= 0; i--) {
+		this = first_in_list(&super->s_low_list[i]);
+		if (!this)
+			continue;
+		if (!cand)
+			cand = this;
+		if (this->valid + LOGFS_MAX_OBJECTSIZE <= cand->valid)
+			cand = this;
+	}
+	return cand;
+}
+
+static int __logfs_gc_once(struct super_block *sb, struct gc_candidate *cand)
+{
+	struct logfs_super *super = logfs_super(sb);
+	gc_level_t gc_level;
+	u32 cleaned, valid, segno, ec;
+	u8 dist;
+
+	if (!cand) {
+		log_gc("GC attempted, but no candidate found\n");
+		return 0;
+	}
+
+	segno = cand->segno;
+	dist = cand->dist;
+	valid = logfs_valid_bytes(sb, segno, &ec, &gc_level);
+	free_candidate(sb, cand);
+	log_gc("GC segment #%02x at %llx, %x required, %x free, %x valid, %llx free\n",
+			segno, (u64)segno << super->s_segshift,
+			dist, no_free_segments(sb), valid,
+			super->s_free_bytes);
+	cleaned = logfs_gc_segment(sb, segno);
+	log_gc("GC segment #%02x complete - now %x valid\n", segno,
+			valid - cleaned);
+	BUG_ON(cleaned != valid);
+	return 1;
+}
+
+static int logfs_gc_once(struct super_block *sb)
+{
+	struct gc_candidate *cand;
+
+	cand = get_candidate(sb);
+	if (cand)
+		remove_from_list(cand);
+	return __logfs_gc_once(sb, cand);
+}
+
+/* returns 1 if a wrap occurs, 0 otherwise */
+static int logfs_scan_some(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	u32 segno;
+	int i, ret = 0;
+
+	segno = super->s_sweeper;
+	for (i = SCAN_RATIO; i > 0; i--) {
+		segno++;
+		if (segno >= super->s_no_segs) {
+			segno = 0;
+			ret = 1;
+			/* Break out of the loop.  We want to read a single
+			 * block from the segment size on next invocation if
+			 * SCAN_RATIO is set to match block size
+			 */
+			break;
+		}
+
+		scan_segment(sb, segno);
+	}
+	super->s_sweeper = segno;
+	return ret;
+}
+
+/*
+ * In principle, this function should loop forever, looking for GC candidates
+ * and moving data.  LogFS is designed in such a way that this loop is
+ * guaranteed to terminate.
+ *
+ * Limiting the loop to some iterations serves purely to catch cases when
+ * these guarantees have failed.  An actual endless loop is an obvious bug
+ * and should be reported as such.
+ */
+static void __logfs_gc_pass(struct super_block *sb, int target)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_block *block;
+	int round, progress, last_progress = 0;
+
+	/*
+	 * Doing too many changes to the segfile at once would result
+	 * in a large number of aliases.  Write the journal before
+	 * things get out of hand.
+	 */
+	if (super->s_shadow_tree.no_shadowed_segments >= MAX_OBJ_ALIASES)
+		logfs_write_anchor(sb);
+
+	if (no_free_segments(sb) >= target &&
+			super->s_no_object_aliases < MAX_OBJ_ALIASES)
+		return;
+
+	log_gc("__logfs_gc_pass(%x)\n", target);
+	for (round = 0; round < SCAN_ROUNDS; ) {
+		if (no_free_segments(sb) >= target)
+			goto write_alias;
+
+		/* Sync in-memory state with on-medium state in case they
+		 * diverged */
+		logfs_write_anchor(sb);
+		round += logfs_scan_some(sb);
+		if (no_free_segments(sb) >= target)
+			goto write_alias;
+		progress = logfs_gc_once(sb);
+		if (progress)
+			last_progress = round;
+		else if (round - last_progress > 2)
+			break;
+		continue;
+
+		/*
+		 * The goto logic is nasty, I just don't know a better way to
+		 * code it.  GC is supposed to ensure two things:
+		 * 1. Enough free segments are available.
+		 * 2. The number of aliases is bounded.
+		 * When 1. is achieved, we take a look at 2. and write back
+		 * some alias-containing blocks, if necessary.  However, after
+		 * each such write we need to go back to 1., as writes can
+		 * consume free segments.
+		 */
+write_alias:
+		if (super->s_no_object_aliases < MAX_OBJ_ALIASES)
+			return;
+		if (list_empty(&super->s_object_alias)) {
+			/* All aliases are still in btree */
+			return;
+		}
+		log_gc("Write back one alias\n");
+		block = list_entry(super->s_object_alias.next,
+				struct logfs_block, alias_list);
+		block->ops->write_block(block);
+		/*
+		 * To round off the nasty goto logic, we reset round here.  It
+		 * is a safety-net for GC not making any progress and limited
+		 * to something reasonably small.  If incremented it for every
+		 * single alias, the loop could terminate rather quickly.
+		 */
+		round = 0;
+	}
+	LOGFS_BUG(sb);
+}
+
+static int wl_ratelimit(struct super_block *sb, u64 *next_event)
+{
+	struct logfs_super *super = logfs_super(sb);
+
+	if (*next_event < super->s_gec) {
+		*next_event = super->s_gec + WL_RATELIMIT;
+		return 0;
+	}
+	return 1;
+}
+
+static void logfs_wl_pass(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct gc_candidate *wl_cand, *free_cand;
+
+	if (wl_ratelimit(sb, &super->s_wl_gec_ostore))
+		return;
+
+	wl_cand = first_in_list(&super->s_ec_list);
+	if (!wl_cand)
+		return;
+	free_cand = first_in_list(&super->s_free_list);
+	if (!free_cand)
+		return;
+
+	if (wl_cand->erase_count < free_cand->erase_count + WL_DELTA) {
+		remove_from_list(wl_cand);
+		__logfs_gc_once(sb, wl_cand);
+	}
+}
+
+/*
+ * The journal needs wear leveling as well.  But moving the journal is an
+ * expensive operation so we try to avoid it as much as possible.  And if we
+ * have to do it, we move the whole journal, not individual segments.
+ *
+ * Ratelimiting is not strictly necessary here, it mainly serves to avoid the
+ * calculations.  First we check whether moving the journal would be a
+ * significant improvement.  That means that a) the current journal segments
+ * have more wear than the future journal segments and b) the current journal
+ * segments have more wear than normal ostore segments.
+ * Rationale for b) is that we don't have to move the journal if it is aging
+ * less than the ostore, even if the reserve segments age even less (they are
+ * excluded from wear leveling, after all).
+ * Next we check that the superblocks have less wear than the journal.  Since
+ * moving the journal requires writing the superblocks, we have to protect the
+ * superblocks even more than the journal.
+ *
+ * Also we double the acceptable wear difference, compared to ostore wear
+ * leveling.  Journal data is read and rewritten rapidly, comparatively.  So
+ * soft errors have much less time to accumulate and we allow the journal to
+ * be a bit worse than the ostore.
+ */
+static void logfs_journal_wl_pass(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct gc_candidate *cand;
+	u32 min_journal_ec = -1, max_reserve_ec = 0;
+	int i;
+
+	if (wl_ratelimit(sb, &super->s_wl_gec_journal))
+		return;
+
+	if (super->s_reserve_list.count < super->s_no_journal_segs) {
+		/* Reserve is not full enough to move complete journal */
+		return;
+	}
+
+	journal_for_each(i)
+		if (super->s_journal_seg[i])
+			min_journal_ec = min(min_journal_ec,
+					super->s_journal_ec[i]);
+	cand = rb_entry(rb_first(&super->s_free_list.rb_tree),
+			struct gc_candidate, rb_node);
+	max_reserve_ec = cand->erase_count;
+	for (i = 0; i < 2; i++) {
+		struct logfs_segment_entry se;
+		u32 segno = seg_no(sb, super->s_sb_ofs[i]);
+		u32 ec;
+
+		logfs_get_segment_entry(sb, segno, &se);
+		ec = be32_to_cpu(se.ec_level) >> 4;
+		max_reserve_ec = max(max_reserve_ec, ec);
+	}
+
+	if (min_journal_ec > max_reserve_ec + 2 * WL_DELTA) {
+		do_logfs_journal_wl_pass(sb);
+	}
+}
+
+void logfs_gc_pass(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+
+	//BUG_ON(mutex_trylock(&logfs_super(sb)->s_w_mutex));
+	/* Write journal before free space is getting saturated with dirty
+	 * objects.
+	 */
+	if (super->s_dirty_used_bytes + super->s_dirty_free_bytes
+			+ LOGFS_MAX_OBJECTSIZE >= super->s_free_bytes)
+		logfs_write_anchor(sb);
+	__logfs_gc_pass(sb, super->s_total_levels);
+	logfs_wl_pass(sb);
+	logfs_journal_wl_pass(sb);
+}
+
+static int check_area(struct super_block *sb, int i)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_area *area = super->s_area[i];
+	gc_level_t gc_level;
+	u32 cleaned, valid, ec;
+	u32 segno = area->a_segno;
+	u64 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes);
+
+	if (!area->a_is_open)
+		return 0;
+
+	if (super->s_devops->can_write_buf(sb, ofs) == 0)
+		return 0;
+
+	printk(KERN_INFO"LogFS: Possibly incomplete write at %llx\n", ofs);
+	/*
+	 * The device cannot write back the write buffer.  Most likely the
+	 * wbuf was already written out and the system crashed at some point
+	 * before the journal commit happened.  In that case we wouldn't have
+	 * to do anything.  But if the crash happened before the wbuf was
+	 * written out correctly, we must GC this segment.  So assume the
+	 * worst and always do the GC run.
+	 */
+	area->a_is_open = 0;
+	valid = logfs_valid_bytes(sb, segno, &ec, &gc_level);
+	cleaned = logfs_gc_segment(sb, segno);
+	if (cleaned != valid)
+		return -EIO;
+	return 0;
+}
+
+int logfs_check_areas(struct super_block *sb)
+{
+	int i, err;
+
+	for_each_area(i) {
+		err = check_area(sb, i);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
+static void logfs_init_candlist(struct candidate_list *list, int maxcount,
+		int sort_by_ec)
+{
+	list->count = 0;
+	list->maxcount = maxcount;
+	list->sort_by_ec = sort_by_ec;
+	list->rb_tree = RB_ROOT;
+}
+
+int logfs_init_gc(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	int i;
+
+	btree_init_mempool32(&super->s_cand_tree, super->s_btree_pool);
+	logfs_init_candlist(&super->s_free_list, LIST_SIZE + SCAN_RATIO, 1);
+	logfs_init_candlist(&super->s_reserve_list,
+			super->s_bad_seg_reserve, 1);
+	for_each_area(i)
+		logfs_init_candlist(&super->s_low_list[i], LIST_SIZE, 0);
+	logfs_init_candlist(&super->s_ec_list, LIST_SIZE, 1);
+	return 0;
+}
+
+static void logfs_cleanup_list(struct super_block *sb,
+		struct candidate_list *list)
+{
+	struct gc_candidate *cand;
+
+	while (list->count) {
+		cand = rb_entry(list->rb_tree.rb_node, struct gc_candidate,
+				rb_node);
+		remove_from_list(cand);
+		free_candidate(sb, cand);
+	}
+	BUG_ON(list->rb_tree.rb_node);
+}
+
+void logfs_cleanup_gc(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	int i;
+
+	if (!super->s_free_list.count)
+		return;
+
+	/*
+	 * FIXME: The btree may still contain a single empty node.  So we
+	 * call the grim visitor to clean up that mess.  Btree code should
+	 * do it for us, really.
+	 */
+	btree_grim_visitor32(&super->s_cand_tree, 0, NULL);
+	logfs_cleanup_list(sb, &super->s_free_list);
+	logfs_cleanup_list(sb, &super->s_reserve_list);
+	for_each_area(i)
+		logfs_cleanup_list(sb, &super->s_low_list[i]);
+	logfs_cleanup_list(sb, &super->s_ec_list);
+}
diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c
new file mode 100644
index 000000000000..f602e230e162
--- /dev/null
+++ b/fs/logfs/inode.c
@@ -0,0 +1,411 @@
+/*
+ * fs/logfs/inode.c	- inode handling code
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
+ */
+#include "logfs.h"
+#include <linux/slab.h>
+#include <linux/writeback.h>
+#include <linux/backing-dev.h>
+
+/*
+ * How soon to reuse old inode numbers?  LogFS doesn't store deleted inodes
+ * on the medium.  It therefore also lacks a method to store the previous
+ * generation number for deleted inodes.  Instead a single generation number
+ * is stored which will be used for new inodes.  Being just a 32bit counter,
+ * this can obvious wrap relatively quickly.  So we only reuse inodes if we
+ * know that a fair number of inodes can be created before we have to increment
+ * the generation again - effectively adding some bits to the counter.
+ * But being too aggressive here means we keep a very large and very sparse
+ * inode file, wasting space on indirect blocks.
+ * So what is a good value?  Beats me.  64k seems moderately bad on both
+ * fronts, so let's use that for now...
+ *
+ * NFS sucks, as everyone already knows.
+ */
+#define INOS_PER_WRAP (0x10000)
+
+/*
+ * Logfs' requirement to read inodes for garbage collection makes life a bit
+ * harder.  GC may have to read inodes that are in I_FREEING state, when they
+ * are being written out - and waiting for GC to make progress, naturally.
+ *
+ * So we cannot just call iget() or some variant of it, but first have to check
+ * wether the inode in question might be in I_FREEING state.  Therefore we
+ * maintain our own per-sb list of "almost deleted" inodes and check against
+ * that list first.  Normally this should be at most 1-2 entries long.
+ *
+ * Also, inodes have logfs-specific reference counting on top of what the vfs
+ * does.  When .destroy_inode is called, normally the reference count will drop
+ * to zero and the inode gets deleted.  But if GC accessed the inode, its
+ * refcount will remain nonzero and final deletion will have to wait.
+ *
+ * As a result we have two sets of functions to get/put inodes:
+ * logfs_safe_iget/logfs_safe_iput	- safe to call from GC context
+ * logfs_iget/iput			- normal version
+ */
+static struct kmem_cache *logfs_inode_cache;
+
+static DEFINE_SPINLOCK(logfs_inode_lock);
+
+static void logfs_inode_setops(struct inode *inode)
+{
+	switch (inode->i_mode & S_IFMT) {
+	case S_IFDIR:
+		inode->i_op = &logfs_dir_iops;
+		inode->i_fop = &logfs_dir_fops;
+		inode->i_mapping->a_ops = &logfs_reg_aops;
+		break;
+	case S_IFREG:
+		inode->i_op = &logfs_reg_iops;
+		inode->i_fop = &logfs_reg_fops;
+		inode->i_mapping->a_ops = &logfs_reg_aops;
+		break;
+	case S_IFLNK:
+		inode->i_op = &logfs_symlink_iops;
+		inode->i_mapping->a_ops = &logfs_reg_aops;
+		break;
+	case S_IFSOCK:	/* fall through */
+	case S_IFBLK:	/* fall through */
+	case S_IFCHR:	/* fall through */
+	case S_IFIFO:
+		init_special_inode(inode, inode->i_mode, inode->i_rdev);
+		break;
+	default:
+		BUG();
+	}
+}
+
+static struct inode *__logfs_iget(struct super_block *sb, ino_t ino)
+{
+	struct inode *inode = iget_locked(sb, ino);
+	int err;
+
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+	if (!(inode->i_state & I_NEW))
+		return inode;
+
+	err = logfs_read_inode(inode);
+	if (err || inode->i_nlink == 0) {
+		/* inode->i_nlink == 0 can be true when called from
+		 * block validator */
+		/* set i_nlink to 0 to prevent caching */
+		inode->i_nlink = 0;
+		logfs_inode(inode)->li_flags |= LOGFS_IF_ZOMBIE;
+		iget_failed(inode);
+		if (!err)
+			err = -ENOENT;
+		return ERR_PTR(err);
+	}
+
+	logfs_inode_setops(inode);
+	unlock_new_inode(inode);
+	return inode;
+}
+
+struct inode *logfs_iget(struct super_block *sb, ino_t ino)
+{
+	BUG_ON(ino == LOGFS_INO_MASTER);
+	BUG_ON(ino == LOGFS_INO_SEGFILE);
+	return __logfs_iget(sb, ino);
+}
+
+/*
+ * is_cached is set to 1 if we hand out a cached inode, 0 otherwise.
+ * this allows logfs_iput to do the right thing later
+ */
+struct inode *logfs_safe_iget(struct super_block *sb, ino_t ino, int *is_cached)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_inode *li;
+
+	if (ino == LOGFS_INO_MASTER)
+		return super->s_master_inode;
+	if (ino == LOGFS_INO_SEGFILE)
+		return super->s_segfile_inode;
+
+	spin_lock(&logfs_inode_lock);
+	list_for_each_entry(li, &super->s_freeing_list, li_freeing_list)
+		if (li->vfs_inode.i_ino == ino) {
+			li->li_refcount++;
+			spin_unlock(&logfs_inode_lock);
+			*is_cached = 1;
+			return &li->vfs_inode;
+		}
+	spin_unlock(&logfs_inode_lock);
+
+	*is_cached = 0;
+	return __logfs_iget(sb, ino);
+}
+
+static void __logfs_destroy_inode(struct inode *inode)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+
+	BUG_ON(li->li_block);
+	list_del(&li->li_freeing_list);
+	kmem_cache_free(logfs_inode_cache, li);
+}
+
+static void logfs_destroy_inode(struct inode *inode)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+
+	BUG_ON(list_empty(&li->li_freeing_list));
+	spin_lock(&logfs_inode_lock);
+	li->li_refcount--;
+	if (li->li_refcount == 0)
+		__logfs_destroy_inode(inode);
+	spin_unlock(&logfs_inode_lock);
+}
+
+void logfs_safe_iput(struct inode *inode, int is_cached)
+{
+	if (inode->i_ino == LOGFS_INO_MASTER)
+		return;
+	if (inode->i_ino == LOGFS_INO_SEGFILE)
+		return;
+
+	if (is_cached) {
+		logfs_destroy_inode(inode);
+		return;
+	}
+
+	iput(inode);
+}
+
+static void logfs_init_inode(struct super_block *sb, struct inode *inode)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	int i;
+
+	li->li_flags	= 0;
+	li->li_height	= 0;
+	li->li_used_bytes = 0;
+	li->li_block	= NULL;
+	inode->i_uid	= 0;
+	inode->i_gid	= 0;
+	inode->i_size	= 0;
+	inode->i_blocks	= 0;
+	inode->i_ctime	= CURRENT_TIME;
+	inode->i_mtime	= CURRENT_TIME;
+	inode->i_nlink	= 1;
+	li->li_refcount = 1;
+	INIT_LIST_HEAD(&li->li_freeing_list);
+
+	for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
+		li->li_data[i] = 0;
+
+	return;
+}
+
+static struct inode *logfs_alloc_inode(struct super_block *sb)
+{
+	struct logfs_inode *li;
+
+	li = kmem_cache_alloc(logfs_inode_cache, GFP_NOFS);
+	if (!li)
+		return NULL;
+	logfs_init_inode(sb, &li->vfs_inode);
+	return &li->vfs_inode;
+}
+
+/*
+ * In logfs inodes are written to an inode file.  The inode file, like any
+ * other file, is managed with a inode.  The inode file's inode, aka master
+ * inode, requires special handling in several respects.  First, it cannot be
+ * written to the inode file, so it is stored in the journal instead.
+ *
+ * Secondly, this inode cannot be written back and destroyed before all other
+ * inodes have been written.  The ordering is important.  Linux' VFS is happily
+ * unaware of the ordering constraint and would ordinarily destroy the master
+ * inode at umount time while other inodes are still in use and dirty.  Not
+ * good.
+ *
+ * So logfs makes sure the master inode is not written until all other inodes
+ * have been destroyed.  Sadly, this method has another side-effect.  The VFS
+ * will notice one remaining inode and print a frightening warning message.
+ * Worse, it is impossible to judge whether such a warning was caused by the
+ * master inode or any other inodes have leaked as well.
+ *
+ * Our attempt of solving this is with logfs_new_meta_inode() below.  Its
+ * purpose is to create a new inode that will not trigger the warning if such
+ * an inode is still in use.  An ugly hack, no doubt.  Suggections for
+ * improvement are welcome.
+ */
+struct inode *logfs_new_meta_inode(struct super_block *sb, u64 ino)
+{
+	struct inode *inode;
+
+	inode = logfs_alloc_inode(sb);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+
+	inode->i_mode = S_IFREG;
+	inode->i_ino = ino;
+	inode->i_sb = sb;
+
+	/* This is a blatant copy of alloc_inode code.  We'd need alloc_inode
+	 * to be nonstatic, alas. */
+	{
+		struct address_space * const mapping = &inode->i_data;
+
+		mapping->a_ops = &logfs_reg_aops;
+		mapping->host = inode;
+		mapping->flags = 0;
+		mapping_set_gfp_mask(mapping, GFP_NOFS);
+		mapping->assoc_mapping = NULL;
+		mapping->backing_dev_info = &default_backing_dev_info;
+		inode->i_mapping = mapping;
+		inode->i_nlink = 1;
+	}
+
+	return inode;
+}
+
+struct inode *logfs_read_meta_inode(struct super_block *sb, u64 ino)
+{
+	struct inode *inode;
+	int err;
+
+	inode = logfs_new_meta_inode(sb, ino);
+	if (IS_ERR(inode))
+		return inode;
+
+	err = logfs_read_inode(inode);
+	if (err) {
+		destroy_meta_inode(inode);
+		return ERR_PTR(err);
+	}
+	logfs_inode_setops(inode);
+	return inode;
+}
+
+static int logfs_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+	int ret;
+	long flags = WF_LOCK;
+
+	/* Can only happen if creat() failed.  Safe to skip. */
+	if (logfs_inode(inode)->li_flags & LOGFS_IF_STILLBORN)
+		return 0;
+
+	ret = __logfs_write_inode(inode, flags);
+	LOGFS_BUG_ON(ret, inode->i_sb);
+	return ret;
+}
+
+void destroy_meta_inode(struct inode *inode)
+{
+	if (inode) {
+		if (inode->i_data.nrpages)
+			truncate_inode_pages(&inode->i_data, 0);
+		logfs_clear_inode(inode);
+		kmem_cache_free(logfs_inode_cache, logfs_inode(inode));
+	}
+}
+
+/* called with inode_lock held */
+static void logfs_drop_inode(struct inode *inode)
+{
+	struct logfs_super *super = logfs_super(inode->i_sb);
+	struct logfs_inode *li = logfs_inode(inode);
+
+	spin_lock(&logfs_inode_lock);
+	list_move(&li->li_freeing_list, &super->s_freeing_list);
+	spin_unlock(&logfs_inode_lock);
+	generic_drop_inode(inode);
+}
+
+static void logfs_set_ino_generation(struct super_block *sb,
+		struct inode *inode)
+{
+	struct logfs_super *super = logfs_super(sb);
+	u64 ino;
+
+	mutex_lock(&super->s_journal_mutex);
+	ino = logfs_seek_hole(super->s_master_inode, super->s_last_ino + 1);
+	super->s_last_ino = ino;
+	super->s_inos_till_wrap--;
+	if (super->s_inos_till_wrap < 0) {
+		super->s_last_ino = LOGFS_RESERVED_INOS;
+		super->s_generation++;
+		super->s_inos_till_wrap = INOS_PER_WRAP;
+	}
+	inode->i_ino = ino;
+	inode->i_generation = super->s_generation;
+	mutex_unlock(&super->s_journal_mutex);
+}
+
+struct inode *logfs_new_inode(struct inode *dir, int mode)
+{
+	struct super_block *sb = dir->i_sb;
+	struct inode *inode;
+
+	inode = new_inode(sb);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+
+	logfs_init_inode(sb, inode);
+
+	/* inherit parent flags */
+	logfs_inode(inode)->li_flags |=
+		logfs_inode(dir)->li_flags & LOGFS_FL_INHERITED;
+
+	inode->i_mode = mode;
+	logfs_set_ino_generation(sb, inode);
+
+	inode_init_owner(inode, dir, mode);
+	logfs_inode_setops(inode);
+	insert_inode_hash(inode);
+
+	return inode;
+}
+
+static void logfs_init_once(void *_li)
+{
+	struct logfs_inode *li = _li;
+	int i;
+
+	li->li_flags = 0;
+	li->li_used_bytes = 0;
+	li->li_refcount = 1;
+	for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
+		li->li_data[i] = 0;
+	inode_init_once(&li->vfs_inode);
+}
+
+static int logfs_sync_fs(struct super_block *sb, int wait)
+{
+	logfs_write_anchor(sb);
+	return 0;
+}
+
+const struct super_operations logfs_super_operations = {
+	.alloc_inode	= logfs_alloc_inode,
+	.clear_inode	= logfs_clear_inode,
+	.delete_inode	= logfs_delete_inode,
+	.destroy_inode	= logfs_destroy_inode,
+	.drop_inode	= logfs_drop_inode,
+	.write_inode	= logfs_write_inode,
+	.statfs		= logfs_statfs,
+	.sync_fs	= logfs_sync_fs,
+};
+
+int logfs_init_inode_cache(void)
+{
+	logfs_inode_cache = kmem_cache_create("logfs_inode_cache",
+			sizeof(struct logfs_inode), 0, SLAB_RECLAIM_ACCOUNT,
+			logfs_init_once);
+	if (!logfs_inode_cache)
+		return -ENOMEM;
+	return 0;
+}
+
+void logfs_destroy_inode_cache(void)
+{
+	kmem_cache_destroy(logfs_inode_cache);
+}
diff --git a/fs/logfs/journal.c b/fs/logfs/journal.c
new file mode 100644
index 000000000000..4b0e0616b357
--- /dev/null
+++ b/fs/logfs/journal.c
@@ -0,0 +1,897 @@
+/*
+ * fs/logfs/journal.c	- journal handling code
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
+ */
+#include "logfs.h"
+#include <linux/slab.h>
+
+static void logfs_calc_free(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	u64 reserve, no_segs = super->s_no_segs;
+	s64 free;
+	int i;
+
+	/* superblock segments */
+	no_segs -= 2;
+	super->s_no_journal_segs = 0;
+	/* journal */
+	journal_for_each(i)
+		if (super->s_journal_seg[i]) {
+			no_segs--;
+			super->s_no_journal_segs++;
+		}
+
+	/* open segments plus one extra per level for GC */
+	no_segs -= 2 * super->s_total_levels;
+
+	free = no_segs * (super->s_segsize - LOGFS_SEGMENT_RESERVE);
+	free -= super->s_used_bytes;
+	/* just a bit extra */
+	free -= super->s_total_levels * 4096;
+
+	/* Bad blocks are 'paid' for with speed reserve - the filesystem
+	 * simply gets slower as bad blocks accumulate.  Until the bad blocks
+	 * exceed the speed reserve - then the filesystem gets smaller.
+	 */
+	reserve = super->s_bad_segments + super->s_bad_seg_reserve;
+	reserve *= super->s_segsize - LOGFS_SEGMENT_RESERVE;
+	reserve = max(reserve, super->s_speed_reserve);
+	free -= reserve;
+	if (free < 0)
+		free = 0;
+
+	super->s_free_bytes = free;
+}
+
+static void reserve_sb_and_journal(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct btree_head32 *head = &super->s_reserved_segments;
+	int i, err;
+
+	err = btree_insert32(head, seg_no(sb, super->s_sb_ofs[0]), (void *)1,
+			GFP_KERNEL);
+	BUG_ON(err);
+
+	err = btree_insert32(head, seg_no(sb, super->s_sb_ofs[1]), (void *)1,
+			GFP_KERNEL);
+	BUG_ON(err);
+
+	journal_for_each(i) {
+		if (!super->s_journal_seg[i])
+			continue;
+		err = btree_insert32(head, super->s_journal_seg[i], (void *)1,
+				GFP_KERNEL);
+		BUG_ON(err);
+	}
+}
+
+static void read_dynsb(struct super_block *sb,
+		struct logfs_je_dynsb *dynsb)
+{
+	struct logfs_super *super = logfs_super(sb);
+
+	super->s_gec		= be64_to_cpu(dynsb->ds_gec);
+	super->s_sweeper	= be64_to_cpu(dynsb->ds_sweeper);
+	super->s_victim_ino	= be64_to_cpu(dynsb->ds_victim_ino);
+	super->s_rename_dir	= be64_to_cpu(dynsb->ds_rename_dir);
+	super->s_rename_pos	= be64_to_cpu(dynsb->ds_rename_pos);
+	super->s_used_bytes	= be64_to_cpu(dynsb->ds_used_bytes);
+	super->s_generation	= be32_to_cpu(dynsb->ds_generation);
+}
+
+static void read_anchor(struct super_block *sb,
+		struct logfs_je_anchor *da)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct inode *inode = super->s_master_inode;
+	struct logfs_inode *li = logfs_inode(inode);
+	int i;
+
+	super->s_last_ino = be64_to_cpu(da->da_last_ino);
+	li->li_flags	= 0;
+	li->li_height	= da->da_height;
+	i_size_write(inode, be64_to_cpu(da->da_size));
+	li->li_used_bytes = be64_to_cpu(da->da_used_bytes);
+
+	for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
+		li->li_data[i] = be64_to_cpu(da->da_data[i]);
+}
+
+static void read_erasecount(struct super_block *sb,
+		struct logfs_je_journal_ec *ec)
+{
+	struct logfs_super *super = logfs_super(sb);
+	int i;
+
+	journal_for_each(i)
+		super->s_journal_ec[i] = be32_to_cpu(ec->ec[i]);
+}
+
+static int read_area(struct super_block *sb, struct logfs_je_area *a)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_area *area = super->s_area[a->gc_level];
+	u64 ofs;
+	u32 writemask = ~(super->s_writesize - 1);
+
+	if (a->gc_level >= LOGFS_NO_AREAS)
+		return -EIO;
+	if (a->vim != VIM_DEFAULT)
+		return -EIO; /* TODO: close area and continue */
+
+	area->a_used_bytes = be32_to_cpu(a->used_bytes);
+	area->a_written_bytes = area->a_used_bytes & writemask;
+	area->a_segno = be32_to_cpu(a->segno);
+	if (area->a_segno)
+		area->a_is_open = 1;
+
+	ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes);
+	if (super->s_writesize > 1)
+		return logfs_buf_recover(area, ofs, a + 1, super->s_writesize);
+	else
+		return logfs_buf_recover(area, ofs, NULL, 0);
+}
+
+static void *unpack(void *from, void *to)
+{
+	struct logfs_journal_header *jh = from;
+	void *data = from + sizeof(struct logfs_journal_header);
+	int err;
+	size_t inlen, outlen;
+
+	inlen = be16_to_cpu(jh->h_len);
+	outlen = be16_to_cpu(jh->h_datalen);
+
+	if (jh->h_compr == COMPR_NONE)
+		memcpy(to, data, inlen);
+	else {
+		err = logfs_uncompress(data, to, inlen, outlen);
+		BUG_ON(err);
+	}
+	return to;
+}
+
+static int __read_je_header(struct super_block *sb, u64 ofs,
+		struct logfs_journal_header *jh)
+{
+	struct logfs_super *super = logfs_super(sb);
+	size_t bufsize = max_t(size_t, sb->s_blocksize, super->s_writesize)
+		+ MAX_JOURNAL_HEADER;
+	u16 type, len, datalen;
+	int err;
+
+	/* read header only */
+	err = wbuf_read(sb, ofs, sizeof(*jh), jh);
+	if (err)
+		return err;
+	type = be16_to_cpu(jh->h_type);
+	len = be16_to_cpu(jh->h_len);
+	datalen = be16_to_cpu(jh->h_datalen);
+	if (len > sb->s_blocksize)
+		return -EIO;
+	if ((type < JE_FIRST) || (type > JE_LAST))
+		return -EIO;
+	if (datalen > bufsize)
+		return -EIO;
+	return 0;
+}
+
+static int __read_je_payload(struct super_block *sb, u64 ofs,
+		struct logfs_journal_header *jh)
+{
+	u16 len;
+	int err;
+
+	len = be16_to_cpu(jh->h_len);
+	err = wbuf_read(sb, ofs + sizeof(*jh), len, jh + 1);
+	if (err)
+		return err;
+	if (jh->h_crc != logfs_crc32(jh, len + sizeof(*jh), 4)) {
+		/* Old code was confused.  It forgot about the header length
+		 * and stopped calculating the crc 16 bytes before the end
+		 * of data - ick!
+		 * FIXME: Remove this hack once the old code is fixed.
+		 */
+		if (jh->h_crc == logfs_crc32(jh, len, 4))
+			WARN_ON_ONCE(1);
+		else
+			return -EIO;
+	}
+	return 0;
+}
+
+/*
+ * jh needs to be large enough to hold the complete entry, not just the header
+ */
+static int __read_je(struct super_block *sb, u64 ofs,
+		struct logfs_journal_header *jh)
+{
+	int err;
+
+	err = __read_je_header(sb, ofs, jh);
+	if (err)
+		return err;
+	return __read_je_payload(sb, ofs, jh);
+}
+
+static int read_je(struct super_block *sb, u64 ofs)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_journal_header *jh = super->s_compressed_je;
+	void *scratch = super->s_je;
+	u16 type, datalen;
+	int err;
+
+	err = __read_je(sb, ofs, jh);
+	if (err)
+		return err;
+	type = be16_to_cpu(jh->h_type);
+	datalen = be16_to_cpu(jh->h_datalen);
+
+	switch (type) {
+	case JE_DYNSB:
+		read_dynsb(sb, unpack(jh, scratch));
+		break;
+	case JE_ANCHOR:
+		read_anchor(sb, unpack(jh, scratch));
+		break;
+	case JE_ERASECOUNT:
+		read_erasecount(sb, unpack(jh, scratch));
+		break;
+	case JE_AREA:
+		err = read_area(sb, unpack(jh, scratch));
+		break;
+	case JE_OBJ_ALIAS:
+		err = logfs_load_object_aliases(sb, unpack(jh, scratch),
+				datalen);
+		break;
+	default:
+		WARN_ON_ONCE(1);
+		return -EIO;
+	}
+	return err;
+}
+
+static int logfs_read_segment(struct super_block *sb, u32 segno)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_journal_header *jh = super->s_compressed_je;
+	u64 ofs, seg_ofs = dev_ofs(sb, segno, 0);
+	u32 h_ofs, last_ofs = 0;
+	u16 len, datalen, last_len = 0;
+	int i, err;
+
+	/* search for most recent commit */
+	for (h_ofs = 0; h_ofs < super->s_segsize; h_ofs += sizeof(*jh)) {
+		ofs = seg_ofs + h_ofs;
+		err = __read_je_header(sb, ofs, jh);
+		if (err)
+			continue;
+		if (jh->h_type != cpu_to_be16(JE_COMMIT))
+			continue;
+		err = __read_je_payload(sb, ofs, jh);
+		if (err)
+			continue;
+		len = be16_to_cpu(jh->h_len);
+		datalen = be16_to_cpu(jh->h_datalen);
+		if ((datalen > sizeof(super->s_je_array)) ||
+				(datalen % sizeof(__be64)))
+			continue;
+		last_ofs = h_ofs;
+		last_len = datalen;
+		h_ofs += ALIGN(len, sizeof(*jh)) - sizeof(*jh);
+	}
+	/* read commit */
+	if (last_ofs == 0)
+		return -ENOENT;
+	ofs = seg_ofs + last_ofs;
+	log_journal("Read commit from %llx\n", ofs);
+	err = __read_je(sb, ofs, jh);
+	BUG_ON(err); /* We should have caught it in the scan loop already */
+	if (err)
+		return err;
+	/* uncompress */
+	unpack(jh, super->s_je_array);
+	super->s_no_je = last_len / sizeof(__be64);
+	/* iterate over array */
+	for (i = 0; i < super->s_no_je; i++) {
+		err = read_je(sb, be64_to_cpu(super->s_je_array[i]));
+		if (err)
+			return err;
+	}
+	super->s_journal_area->a_segno = segno;
+	return 0;
+}
+
+static u64 read_gec(struct super_block *sb, u32 segno)
+{
+	struct logfs_segment_header sh;
+	__be32 crc;
+	int err;
+
+	if (!segno)
+		return 0;
+	err = wbuf_read(sb, dev_ofs(sb, segno, 0), sizeof(sh), &sh);
+	if (err)
+		return 0;
+	crc = logfs_crc32(&sh, sizeof(sh), 4);
+	if (crc != sh.crc) {
+		WARN_ON(sh.gec != cpu_to_be64(0xffffffffffffffffull));
+		/* Most likely it was just erased */
+		return 0;
+	}
+	return be64_to_cpu(sh.gec);
+}
+
+static int logfs_read_journal(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	u64 gec[LOGFS_JOURNAL_SEGS], max;
+	u32 segno;
+	int i, max_i;
+
+	max = 0;
+	max_i = -1;
+	journal_for_each(i) {
+		segno = super->s_journal_seg[i];
+		gec[i] = read_gec(sb, super->s_journal_seg[i]);
+		if (gec[i] > max) {
+			max = gec[i];
+			max_i = i;
+		}
+	}
+	if (max_i == -1)
+		return -EIO;
+	/* FIXME: Try older segments in case of error */
+	return logfs_read_segment(sb, super->s_journal_seg[max_i]);
+}
+
+/*
+ * First search the current segment (outer loop), then pick the next segment
+ * in the array, skipping any zero entries (inner loop).
+ */
+static void journal_get_free_segment(struct logfs_area *area)
+{
+	struct logfs_super *super = logfs_super(area->a_sb);
+	int i;
+
+	journal_for_each(i) {
+		if (area->a_segno != super->s_journal_seg[i])
+			continue;
+
+		do {
+			i++;
+			if (i == LOGFS_JOURNAL_SEGS)
+				i = 0;
+		} while (!super->s_journal_seg[i]);
+
+		area->a_segno = super->s_journal_seg[i];
+		area->a_erase_count = ++(super->s_journal_ec[i]);
+		log_journal("Journal now at %x (ec %x)\n", area->a_segno,
+				area->a_erase_count);
+		return;
+	}
+	BUG();
+}
+
+static void journal_get_erase_count(struct logfs_area *area)
+{
+	/* erase count is stored globally and incremented in
+	 * journal_get_free_segment() - nothing to do here */
+}
+
+static int journal_erase_segment(struct logfs_area *area)
+{
+	struct super_block *sb = area->a_sb;
+	union {
+		struct logfs_segment_header sh;
+		unsigned char c[ALIGN(sizeof(struct logfs_segment_header), 16)];
+	} u;
+	u64 ofs;
+	int err;
+
+	err = logfs_erase_segment(sb, area->a_segno, 1);
+	if (err)
+		return err;
+
+	memset(&u, 0, sizeof(u));
+	u.sh.pad = 0;
+	u.sh.type = SEG_JOURNAL;
+	u.sh.level = 0;
+	u.sh.segno = cpu_to_be32(area->a_segno);
+	u.sh.ec = cpu_to_be32(area->a_erase_count);
+	u.sh.gec = cpu_to_be64(logfs_super(sb)->s_gec);
+	u.sh.crc = logfs_crc32(&u.sh, sizeof(u.sh), 4);
+
+	/* This causes a bug in segment.c.  Not yet. */
+	//logfs_set_segment_erased(sb, area->a_segno, area->a_erase_count, 0);
+
+	ofs = dev_ofs(sb, area->a_segno, 0);
+	area->a_used_bytes = sizeof(u);
+	logfs_buf_write(area, ofs, &u, sizeof(u));
+	return 0;
+}
+
+static size_t __logfs_write_header(struct logfs_super *super,
+		struct logfs_journal_header *jh, size_t len, size_t datalen,
+		u16 type, u8 compr)
+{
+	jh->h_len	= cpu_to_be16(len);
+	jh->h_type	= cpu_to_be16(type);
+	jh->h_datalen	= cpu_to_be16(datalen);
+	jh->h_compr	= compr;
+	jh->h_pad[0]	= 'H';
+	jh->h_pad[1]	= 'E';
+	jh->h_pad[2]	= 'A';
+	jh->h_pad[3]	= 'D';
+	jh->h_pad[4]	= 'R';
+	jh->h_crc	= logfs_crc32(jh, len + sizeof(*jh), 4);
+	return ALIGN(len, 16) + sizeof(*jh);
+}
+
+static size_t logfs_write_header(struct logfs_super *super,
+		struct logfs_journal_header *jh, size_t datalen, u16 type)
+{
+	size_t len = datalen;
+
+	return __logfs_write_header(super, jh, len, datalen, type, COMPR_NONE);
+}
+
+static inline size_t logfs_journal_erasecount_size(struct logfs_super *super)
+{
+	return LOGFS_JOURNAL_SEGS * sizeof(__be32);
+}
+
+static void *logfs_write_erasecount(struct super_block *sb, void *_ec,
+		u16 *type, size_t *len)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_je_journal_ec *ec = _ec;
+	int i;
+
+	journal_for_each(i)
+		ec->ec[i] = cpu_to_be32(super->s_journal_ec[i]);
+	*type = JE_ERASECOUNT;
+	*len = logfs_journal_erasecount_size(super);
+	return ec;
+}
+
+static void account_shadow(void *_shadow, unsigned long _sb, u64 ignore,
+		size_t ignore2)
+{
+	struct logfs_shadow *shadow = _shadow;
+	struct super_block *sb = (void *)_sb;
+	struct logfs_super *super = logfs_super(sb);
+
+	/* consume new space */
+	super->s_free_bytes	  -= shadow->new_len;
+	super->s_used_bytes	  += shadow->new_len;
+	super->s_dirty_used_bytes -= shadow->new_len;
+
+	/* free up old space */
+	super->s_free_bytes	  += shadow->old_len;
+	super->s_used_bytes	  -= shadow->old_len;
+	super->s_dirty_free_bytes -= shadow->old_len;
+
+	logfs_set_segment_used(sb, shadow->old_ofs, -shadow->old_len);
+	logfs_set_segment_used(sb, shadow->new_ofs, shadow->new_len);
+
+	log_journal("account_shadow(%llx, %llx, %x) %llx->%llx %x->%x\n",
+			shadow->ino, shadow->bix, shadow->gc_level,
+			shadow->old_ofs, shadow->new_ofs,
+			shadow->old_len, shadow->new_len);
+	mempool_free(shadow, super->s_shadow_pool);
+}
+
+static void account_shadows(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct inode *inode = super->s_master_inode;
+	struct logfs_inode *li = logfs_inode(inode);
+	struct shadow_tree *tree = &super->s_shadow_tree;
+
+	btree_grim_visitor64(&tree->new, (unsigned long)sb, account_shadow);
+	btree_grim_visitor64(&tree->old, (unsigned long)sb, account_shadow);
+	btree_grim_visitor32(&tree->segment_map, 0, NULL);
+	tree->no_shadowed_segments = 0;
+
+	if (li->li_block) {
+		/*
+		 * We never actually use the structure, when attached to the
+		 * master inode.  But it is easier to always free it here than
+		 * to have checks in several places elsewhere when allocating
+		 * it.
+		 */
+		li->li_block->ops->free_block(sb, li->li_block);
+	}
+	BUG_ON((s64)li->li_used_bytes < 0);
+}
+
+static void *__logfs_write_anchor(struct super_block *sb, void *_da,
+		u16 *type, size_t *len)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_je_anchor *da = _da;
+	struct inode *inode = super->s_master_inode;
+	struct logfs_inode *li = logfs_inode(inode);
+	int i;
+
+	da->da_height	= li->li_height;
+	da->da_last_ino = cpu_to_be64(super->s_last_ino);
+	da->da_size	= cpu_to_be64(i_size_read(inode));
+	da->da_used_bytes = cpu_to_be64(li->li_used_bytes);
+	for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
+		da->da_data[i] = cpu_to_be64(li->li_data[i]);
+	*type = JE_ANCHOR;
+	*len = sizeof(*da);
+	return da;
+}
+
+static void *logfs_write_dynsb(struct super_block *sb, void *_dynsb,
+		u16 *type, size_t *len)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_je_dynsb *dynsb = _dynsb;
+
+	dynsb->ds_gec		= cpu_to_be64(super->s_gec);
+	dynsb->ds_sweeper	= cpu_to_be64(super->s_sweeper);
+	dynsb->ds_victim_ino	= cpu_to_be64(super->s_victim_ino);
+	dynsb->ds_rename_dir	= cpu_to_be64(super->s_rename_dir);
+	dynsb->ds_rename_pos	= cpu_to_be64(super->s_rename_pos);
+	dynsb->ds_used_bytes	= cpu_to_be64(super->s_used_bytes);
+	dynsb->ds_generation	= cpu_to_be32(super->s_generation);
+	*type = JE_DYNSB;
+	*len = sizeof(*dynsb);
+	return dynsb;
+}
+
+static void write_wbuf(struct super_block *sb, struct logfs_area *area,
+		void *wbuf)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct address_space *mapping = super->s_mapping_inode->i_mapping;
+	u64 ofs;
+	pgoff_t index;
+	int page_ofs;
+	struct page *page;
+
+	ofs = dev_ofs(sb, area->a_segno,
+			area->a_used_bytes & ~(super->s_writesize - 1));
+	index = ofs >> PAGE_SHIFT;
+	page_ofs = ofs & (PAGE_SIZE - 1);
+
+	page = find_lock_page(mapping, index);
+	BUG_ON(!page);
+	memcpy(wbuf, page_address(page) + page_ofs, super->s_writesize);
+	unlock_page(page);
+}
+
+static void *logfs_write_area(struct super_block *sb, void *_a,
+		u16 *type, size_t *len)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_area *area = super->s_area[super->s_sum_index];
+	struct logfs_je_area *a = _a;
+
+	a->vim = VIM_DEFAULT;
+	a->gc_level = super->s_sum_index;
+	a->used_bytes = cpu_to_be32(area->a_used_bytes);
+	a->segno = cpu_to_be32(area->a_segno);
+	if (super->s_writesize > 1)
+		write_wbuf(sb, area, a + 1);
+
+	*type = JE_AREA;
+	*len = sizeof(*a) + super->s_writesize;
+	return a;
+}
+
+static void *logfs_write_commit(struct super_block *sb, void *h,
+		u16 *type, size_t *len)
+{
+	struct logfs_super *super = logfs_super(sb);
+
+	*type = JE_COMMIT;
+	*len = super->s_no_je * sizeof(__be64);
+	return super->s_je_array;
+}
+
+static size_t __logfs_write_je(struct super_block *sb, void *buf, u16 type,
+		size_t len)
+{
+	struct logfs_super *super = logfs_super(sb);
+	void *header = super->s_compressed_je;
+	void *data = header + sizeof(struct logfs_journal_header);
+	ssize_t compr_len, pad_len;
+	u8 compr = COMPR_ZLIB;
+
+	if (len == 0)
+		return logfs_write_header(super, header, 0, type);
+
+	BUG_ON(len > sb->s_blocksize);
+	compr_len = logfs_compress(buf, data, len, sb->s_blocksize);
+	if (compr_len < 0 || type == JE_ANCHOR) {
+		memcpy(data, buf, len);
+		compr_len = len;
+		compr = COMPR_NONE;
+	}
+
+	pad_len = ALIGN(compr_len, 16);
+	memset(data + compr_len, 0, pad_len - compr_len);
+
+	return __logfs_write_header(super, header, compr_len, len, type, compr);
+}
+
+static s64 logfs_get_free_bytes(struct logfs_area *area, size_t *bytes,
+		int must_pad)
+{
+	u32 writesize = logfs_super(area->a_sb)->s_writesize;
+	s32 ofs;
+	int ret;
+
+	ret = logfs_open_area(area, *bytes);
+	if (ret)
+		return -EAGAIN;
+
+	ofs = area->a_used_bytes;
+	area->a_used_bytes += *bytes;
+
+	if (must_pad) {
+		area->a_used_bytes = ALIGN(area->a_used_bytes, writesize);
+		*bytes = area->a_used_bytes - ofs;
+	}
+
+	return dev_ofs(area->a_sb, area->a_segno, ofs);
+}
+
+static int logfs_write_je_buf(struct super_block *sb, void *buf, u16 type,
+		size_t buf_len)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_area *area = super->s_journal_area;
+	struct logfs_journal_header *jh = super->s_compressed_je;
+	size_t len;
+	int must_pad = 0;
+	s64 ofs;
+
+	len = __logfs_write_je(sb, buf, type, buf_len);
+	if (jh->h_type == cpu_to_be16(JE_COMMIT))
+		must_pad = 1;
+
+	ofs = logfs_get_free_bytes(area, &len, must_pad);
+	if (ofs < 0)
+		return ofs;
+	logfs_buf_write(area, ofs, super->s_compressed_je, len);
+	BUG_ON(super->s_no_je >= MAX_JOURNAL_ENTRIES);
+	super->s_je_array[super->s_no_je++] = cpu_to_be64(ofs);
+	return 0;
+}
+
+static int logfs_write_je(struct super_block *sb,
+		void* (*write)(struct super_block *sb, void *scratch,
+			u16 *type, size_t *len))
+{
+	void *buf;
+	size_t len;
+	u16 type;
+
+	buf = write(sb, logfs_super(sb)->s_je, &type, &len);
+	return logfs_write_je_buf(sb, buf, type, len);
+}
+
+int write_alias_journal(struct super_block *sb, u64 ino, u64 bix,
+		level_t level, int child_no, __be64 val)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_obj_alias *oa = super->s_je;
+	int err = 0, fill = super->s_je_fill;
+
+	log_aliases("logfs_write_obj_aliases #%x(%llx, %llx, %x, %x) %llx\n",
+			fill, ino, bix, level, child_no, be64_to_cpu(val));
+	oa[fill].ino = cpu_to_be64(ino);
+	oa[fill].bix = cpu_to_be64(bix);
+	oa[fill].val = val;
+	oa[fill].level = (__force u8)level;
+	oa[fill].child_no = cpu_to_be16(child_no);
+	fill++;
+	if (fill >= sb->s_blocksize / sizeof(*oa)) {
+		err = logfs_write_je_buf(sb, oa, JE_OBJ_ALIAS, sb->s_blocksize);
+		fill = 0;
+	}
+
+	super->s_je_fill = fill;
+	return err;
+}
+
+static int logfs_write_obj_aliases(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	int err;
+
+	log_journal("logfs_write_obj_aliases: %d aliases to write\n",
+			super->s_no_object_aliases);
+	super->s_je_fill = 0;
+	err = logfs_write_obj_aliases_pagecache(sb);
+	if (err)
+		return err;
+
+	if (super->s_je_fill)
+		err = logfs_write_je_buf(sb, super->s_je, JE_OBJ_ALIAS,
+				super->s_je_fill
+				* sizeof(struct logfs_obj_alias));
+	return err;
+}
+
+/*
+ * Write all journal entries.  The goto logic ensures that all journal entries
+ * are written whenever a new segment is used.  It is ugly and potentially a
+ * bit wasteful, but robustness is more important.  With this we can *always*
+ * erase all journal segments except the one containing the most recent commit.
+ */
+void logfs_write_anchor(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_area *area = super->s_journal_area;
+	int i, err;
+
+	if (!(super->s_flags & LOGFS_SB_FLAG_DIRTY))
+		return;
+	super->s_flags &= ~LOGFS_SB_FLAG_DIRTY;
+
+	BUG_ON(super->s_flags & LOGFS_SB_FLAG_SHUTDOWN);
+	mutex_lock(&super->s_journal_mutex);
+
+	/* Do this first or suffer corruption */
+	logfs_sync_segments(sb);
+	account_shadows(sb);
+
+again:
+	super->s_no_je = 0;
+	for_each_area(i) {
+		if (!super->s_area[i]->a_is_open)
+			continue;
+		super->s_sum_index = i;
+		err = logfs_write_je(sb, logfs_write_area);
+		if (err)
+			goto again;
+	}
+	err = logfs_write_obj_aliases(sb);
+	if (err)
+		goto again;
+	err = logfs_write_je(sb, logfs_write_erasecount);
+	if (err)
+		goto again;
+	err = logfs_write_je(sb, __logfs_write_anchor);
+	if (err)
+		goto again;
+	err = logfs_write_je(sb, logfs_write_dynsb);
+	if (err)
+		goto again;
+	/*
+	 * Order is imperative.  First we sync all writes, including the
+	 * non-committed journal writes.  Then we write the final commit and
+	 * sync the current journal segment.
+	 * There is a theoretical bug here.  Syncing the journal segment will
+	 * write a number of journal entries and the final commit.  All these
+	 * are written in a single operation.  If the device layer writes the
+	 * data back-to-front, the commit will precede the other journal
+	 * entries, leaving a race window.
+	 * Two fixes are possible.  Preferred is to fix the device layer to
+	 * ensure writes happen front-to-back.  Alternatively we can insert
+	 * another logfs_sync_area() super->s_devops->sync() combo before
+	 * writing the commit.
+	 */
+	/*
+	 * On another subject, super->s_devops->sync is usually not necessary.
+	 * Unless called from sys_sync or friends, a barrier would suffice.
+	 */
+	super->s_devops->sync(sb);
+	err = logfs_write_je(sb, logfs_write_commit);
+	if (err)
+		goto again;
+	log_journal("Write commit to %llx\n",
+			be64_to_cpu(super->s_je_array[super->s_no_je - 1]));
+	logfs_sync_area(area);
+	BUG_ON(area->a_used_bytes != area->a_written_bytes);
+	super->s_devops->sync(sb);
+
+	mutex_unlock(&super->s_journal_mutex);
+	return;
+}
+
+void do_logfs_journal_wl_pass(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_area *area = super->s_journal_area;
+	struct btree_head32 *head = &super->s_reserved_segments;
+	u32 segno, ec;
+	int i, err;
+
+	log_journal("Journal requires wear-leveling.\n");
+	/* Drop old segments */
+	journal_for_each(i)
+		if (super->s_journal_seg[i]) {
+			btree_remove32(head, super->s_journal_seg[i]);
+			logfs_set_segment_unreserved(sb,
+					super->s_journal_seg[i],
+					super->s_journal_ec[i]);
+			super->s_journal_seg[i] = 0;
+			super->s_journal_ec[i] = 0;
+		}
+	/* Get new segments */
+	for (i = 0; i < super->s_no_journal_segs; i++) {
+		segno = get_best_cand(sb, &super->s_reserve_list, &ec);
+		super->s_journal_seg[i] = segno;
+		super->s_journal_ec[i] = ec;
+		logfs_set_segment_reserved(sb, segno);
+		err = btree_insert32(head, segno, (void *)1, GFP_KERNEL);
+		BUG_ON(err); /* mempool should prevent this */
+		err = logfs_erase_segment(sb, segno, 1);
+		BUG_ON(err); /* FIXME: remount-ro would be nicer */
+	}
+	/* Manually move journal_area */
+	freeseg(sb, area->a_segno);
+	area->a_segno = super->s_journal_seg[0];
+	area->a_is_open = 0;
+	area->a_used_bytes = 0;
+	/* Write journal */
+	logfs_write_anchor(sb);
+	/* Write superblocks */
+	err = logfs_write_sb(sb);
+	BUG_ON(err);
+}
+
+static const struct logfs_area_ops journal_area_ops = {
+	.get_free_segment	= journal_get_free_segment,
+	.get_erase_count	= journal_get_erase_count,
+	.erase_segment		= journal_erase_segment,
+};
+
+int logfs_init_journal(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	size_t bufsize = max_t(size_t, sb->s_blocksize, super->s_writesize)
+		+ MAX_JOURNAL_HEADER;
+	int ret = -ENOMEM;
+
+	mutex_init(&super->s_journal_mutex);
+	btree_init_mempool32(&super->s_reserved_segments, super->s_btree_pool);
+
+	super->s_je = kzalloc(bufsize, GFP_KERNEL);
+	if (!super->s_je)
+		return ret;
+
+	super->s_compressed_je = kzalloc(bufsize, GFP_KERNEL);
+	if (!super->s_compressed_je)
+		return ret;
+
+	super->s_master_inode = logfs_new_meta_inode(sb, LOGFS_INO_MASTER);
+	if (IS_ERR(super->s_master_inode))
+		return PTR_ERR(super->s_master_inode);
+
+	ret = logfs_read_journal(sb);
+	if (ret)
+		return -EIO;
+
+	reserve_sb_and_journal(sb);
+	logfs_calc_free(sb);
+
+	super->s_journal_area->a_ops = &journal_area_ops;
+	return 0;
+}
+
+void logfs_cleanup_journal(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+
+	btree_grim_visitor32(&super->s_reserved_segments, 0, NULL);
+	destroy_meta_inode(super->s_master_inode);
+	super->s_master_inode = NULL;
+
+	kfree(super->s_compressed_je);
+	kfree(super->s_je);
+}
diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h
new file mode 100644
index 000000000000..c838c4d72111
--- /dev/null
+++ b/fs/logfs/logfs.h
@@ -0,0 +1,739 @@
+/*
+ * fs/logfs/logfs.h
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
+ *
+ * Private header for logfs.
+ */
+#ifndef FS_LOGFS_LOGFS_H
+#define FS_LOGFS_LOGFS_H
+
+#undef __CHECK_ENDIAN__
+#define __CHECK_ENDIAN__
+
+#include <linux/btree.h>
+#include <linux/crc32.h>
+#include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/mempool.h>
+#include <linux/pagemap.h>
+#include <linux/mtd/mtd.h>
+#include "logfs_abi.h"
+
+#define LOGFS_DEBUG_SUPER	(0x0001)
+#define LOGFS_DEBUG_SEGMENT	(0x0002)
+#define LOGFS_DEBUG_JOURNAL	(0x0004)
+#define LOGFS_DEBUG_DIR		(0x0008)
+#define LOGFS_DEBUG_FILE	(0x0010)
+#define LOGFS_DEBUG_INODE	(0x0020)
+#define LOGFS_DEBUG_READWRITE	(0x0040)
+#define LOGFS_DEBUG_GC		(0x0080)
+#define LOGFS_DEBUG_GC_NOISY	(0x0100)
+#define LOGFS_DEBUG_ALIASES	(0x0200)
+#define LOGFS_DEBUG_BLOCKMOVE	(0x0400)
+#define LOGFS_DEBUG_ALL		(0xffffffff)
+
+#define LOGFS_DEBUG		(0x01)
+/*
+ * To enable specific log messages, simply define LOGFS_DEBUG to match any
+ * or all of the above.
+ */
+#ifndef LOGFS_DEBUG
+#define LOGFS_DEBUG		(0)
+#endif
+
+#define log_cond(cond, fmt, arg...) do {	\
+	if (cond)				\
+		printk(KERN_DEBUG fmt, ##arg);	\
+} while (0)
+
+#define log_super(fmt, arg...) \
+	log_cond(LOGFS_DEBUG & LOGFS_DEBUG_SUPER, fmt, ##arg)
+#define log_segment(fmt, arg...) \
+	log_cond(LOGFS_DEBUG & LOGFS_DEBUG_SEGMENT, fmt, ##arg)
+#define log_journal(fmt, arg...) \
+	log_cond(LOGFS_DEBUG & LOGFS_DEBUG_JOURNAL, fmt, ##arg)
+#define log_dir(fmt, arg...) \
+	log_cond(LOGFS_DEBUG & LOGFS_DEBUG_DIR, fmt, ##arg)
+#define log_file(fmt, arg...) \
+	log_cond(LOGFS_DEBUG & LOGFS_DEBUG_FILE, fmt, ##arg)
+#define log_inode(fmt, arg...) \
+	log_cond(LOGFS_DEBUG & LOGFS_DEBUG_INODE, fmt, ##arg)
+#define log_readwrite(fmt, arg...) \
+	log_cond(LOGFS_DEBUG & LOGFS_DEBUG_READWRITE, fmt, ##arg)
+#define log_gc(fmt, arg...) \
+	log_cond(LOGFS_DEBUG & LOGFS_DEBUG_GC, fmt, ##arg)
+#define log_gc_noisy(fmt, arg...) \
+	log_cond(LOGFS_DEBUG & LOGFS_DEBUG_GC_NOISY, fmt, ##arg)
+#define log_aliases(fmt, arg...) \
+	log_cond(LOGFS_DEBUG & LOGFS_DEBUG_ALIASES, fmt, ##arg)
+#define log_blockmove(fmt, arg...) \
+	log_cond(LOGFS_DEBUG & LOGFS_DEBUG_BLOCKMOVE, fmt, ##arg)
+
+#define PG_pre_locked		PG_owner_priv_1
+#define PagePreLocked(page)	test_bit(PG_pre_locked, &(page)->flags)
+#define SetPagePreLocked(page)	set_bit(PG_pre_locked, &(page)->flags)
+#define ClearPagePreLocked(page) clear_bit(PG_pre_locked, &(page)->flags)
+
+/* FIXME: This should really be somewhere in the 64bit area. */
+#define LOGFS_LINK_MAX		(1<<30)
+
+/* Read-only filesystem */
+#define LOGFS_SB_FLAG_RO	0x0001
+#define LOGFS_SB_FLAG_DIRTY	0x0002
+#define LOGFS_SB_FLAG_OBJ_ALIAS	0x0004
+#define LOGFS_SB_FLAG_SHUTDOWN	0x0008
+
+/* Write Control Flags */
+#define WF_LOCK			0x01 /* take write lock */
+#define WF_WRITE		0x02 /* write block */
+#define WF_DELETE		0x04 /* delete old block */
+
+typedef u8 __bitwise level_t;
+typedef u8 __bitwise gc_level_t;
+
+#define LEVEL(level) ((__force level_t)(level))
+#define GC_LEVEL(gc_level) ((__force gc_level_t)(gc_level))
+
+#define SUBLEVEL(level) ( (void)((level) == LEVEL(1)),	\
+		(__force level_t)((__force u8)(level) - 1) )
+
+/**
+ * struct logfs_area - area management information
+ *
+ * @a_sb:			the superblock this area belongs to
+ * @a_is_open:			1 if the area is currently open, else 0
+ * @a_segno:			segment number of area
+ * @a_written_bytes:		number of bytes already written back
+ * @a_used_bytes:		number of used bytes
+ * @a_ops:			area operations (either journal or ostore)
+ * @a_erase_count:		erase count
+ * @a_level:			GC level
+ */
+struct logfs_area { /* a segment open for writing */
+	struct super_block *a_sb;
+	int	a_is_open;
+	u32	a_segno;
+	u32	a_written_bytes;
+	u32	a_used_bytes;
+	const struct logfs_area_ops *a_ops;
+	u32	a_erase_count;
+	gc_level_t a_level;
+};
+
+/**
+ * struct logfs_area_ops - area operations
+ *
+ * @get_free_segment:		fill area->ofs with the offset of a free segment
+ * @get_erase_count:		fill area->erase_count (needs area->ofs)
+ * @erase_segment:		erase and setup segment
+ */
+struct logfs_area_ops {
+	void	(*get_free_segment)(struct logfs_area *area);
+	void	(*get_erase_count)(struct logfs_area *area);
+	int	(*erase_segment)(struct logfs_area *area);
+};
+
+/**
+ * struct logfs_device_ops - device access operations
+ *
+ * @readpage:			read one page (mm page)
+ * @writeseg:			write one segment.  may be a partial segment
+ * @erase:			erase one segment
+ * @read:			read from the device
+ * @erase:			erase part of the device
+ * @can_write_buf:		decide whether wbuf can be written to ofs
+ */
+struct logfs_device_ops {
+	struct page *(*find_first_sb)(struct super_block *sb, u64 *ofs);
+	struct page *(*find_last_sb)(struct super_block *sb, u64 *ofs);
+	int (*write_sb)(struct super_block *sb, struct page *page);
+	int (*readpage)(void *_sb, struct page *page);
+	void (*writeseg)(struct super_block *sb, u64 ofs, size_t len);
+	int (*erase)(struct super_block *sb, loff_t ofs, size_t len,
+			int ensure_write);
+	int (*can_write_buf)(struct super_block *sb, u64 ofs);
+	void (*sync)(struct super_block *sb);
+	void (*put_device)(struct super_block *sb);
+};
+
+/**
+ * struct candidate_list - list of similar candidates
+ */
+struct candidate_list {
+	struct rb_root rb_tree;
+	int count;
+	int maxcount;
+	int sort_by_ec;
+};
+
+/**
+ * struct gc_candidate - "candidate" segment to be garbage collected next
+ *
+ * @list:			list (either free of low)
+ * @segno:			segment number
+ * @valid:			number of valid bytes
+ * @erase_count:		erase count of segment
+ * @dist:			distance from tree root
+ *
+ * Candidates can be on two lists.  The free list contains electees rather
+ * than candidates - segments that no longer contain any valid data.  The
+ * low list contains candidates to be picked for GC.  It should be kept
+ * short.  It is not required to always pick a perfect candidate.  In the
+ * worst case GC will have to move more data than absolutely necessary.
+ */
+struct gc_candidate {
+	struct rb_node rb_node;
+	struct candidate_list *list;
+	u32	segno;
+	u32	valid;
+	u32	erase_count;
+	u8	dist;
+};
+
+/**
+ * struct logfs_journal_entry - temporary structure used during journal scan
+ *
+ * @used:
+ * @version:			normalized version
+ * @len:			length
+ * @offset:			offset
+ */
+struct logfs_journal_entry {
+	int used;
+	s16 version;
+	u16 len;
+	u16 datalen;
+	u64 offset;
+};
+
+enum transaction_state {
+	CREATE_1 = 1,
+	CREATE_2,
+	UNLINK_1,
+	UNLINK_2,
+	CROSS_RENAME_1,
+	CROSS_RENAME_2,
+	TARGET_RENAME_1,
+	TARGET_RENAME_2,
+	TARGET_RENAME_3
+};
+
+/**
+ * struct logfs_transaction - essential fields to support atomic dirops
+ *
+ * @ino:			target inode
+ * @dir:			inode of directory containing dentry
+ * @pos:			pos of dentry in directory
+ */
+struct logfs_transaction {
+	enum transaction_state state;
+	u64	 ino;
+	u64	 dir;
+	u64	 pos;
+};
+
+/**
+ * struct logfs_shadow - old block in the shadow of a not-yet-committed new one
+ * @old_ofs:			offset of old block on medium
+ * @new_ofs:			offset of new block on medium
+ * @ino:			inode number
+ * @bix:			block index
+ * @old_len:			size of old block, including header
+ * @new_len:			size of new block, including header
+ * @level:			block level
+ */
+struct logfs_shadow {
+	u64 old_ofs;
+	u64 new_ofs;
+	u64 ino;
+	u64 bix;
+	int old_len;
+	int new_len;
+	gc_level_t gc_level;
+};
+
+/**
+ * struct shadow_tree
+ * @new:			shadows where old_ofs==0, indexed by new_ofs
+ * @old:			shadows where old_ofs!=0, indexed by old_ofs
+ * @segment_map:		bitfield of segments containing shadows
+ * @no_shadowed_segment:	number of segments containing shadows
+ */
+struct shadow_tree {
+	struct btree_head64 new;
+	struct btree_head64 old;
+	struct btree_head32 segment_map;
+	int no_shadowed_segments;
+};
+
+struct object_alias_item {
+	struct list_head list;
+	__be64 val;
+	int child_no;
+};
+
+/**
+ * struct logfs_block - contains any block state
+ * @type:			indirect block or inode
+ * @full:			number of fully populated children
+ * @partial:			number of partially populated children
+ *
+ * Most blocks are directly represented by page cache pages.  But when a block
+ * becomes dirty, is part of a transaction, contains aliases or is otherwise
+ * special, a struct logfs_block is allocated to track the additional state.
+ * Inodes are very similar to indirect blocks, so they can also get one of
+ * these structures added when appropriate.
+ */
+#define BLOCK_INDIRECT	1	/* Indirect block */
+#define BLOCK_INODE	2	/* Inode */
+struct logfs_block_ops;
+struct logfs_block {
+	struct list_head alias_list;
+	struct list_head item_list;
+	struct super_block *sb;
+	u64 ino;
+	u64 bix;
+	level_t level;
+	struct page *page;
+	struct inode *inode;
+	struct logfs_transaction *ta;
+	unsigned long alias_map[LOGFS_BLOCK_FACTOR / BITS_PER_LONG];
+	struct logfs_block_ops *ops;
+	int full;
+	int partial;
+	int reserved_bytes;
+};
+
+typedef int write_alias_t(struct super_block *sb, u64 ino, u64 bix,
+		level_t level, int child_no, __be64 val);
+struct logfs_block_ops {
+	void	(*write_block)(struct logfs_block *block);
+	void	(*free_block)(struct super_block *sb, struct logfs_block*block);
+	int	(*write_alias)(struct super_block *sb,
+			struct logfs_block *block,
+			write_alias_t *write_one_alias);
+};
+
+#define MAX_JOURNAL_ENTRIES 256
+
+struct logfs_super {
+	struct mtd_info *s_mtd;			/* underlying device */
+	struct block_device *s_bdev;		/* underlying device */
+	const struct logfs_device_ops *s_devops;/* device access */
+	struct inode	*s_master_inode;	/* inode file */
+	struct inode	*s_segfile_inode;	/* segment file */
+	struct inode *s_mapping_inode;		/* device mapping */
+	atomic_t s_pending_writes;		/* outstanting bios */
+	long	 s_flags;
+	mempool_t *s_btree_pool;		/* for btree nodes */
+	mempool_t *s_alias_pool;		/* aliases in segment.c */
+	u64	 s_feature_incompat;
+	u64	 s_feature_ro_compat;
+	u64	 s_feature_compat;
+	u64	 s_feature_flags;
+	u64	 s_sb_ofs[2];
+	struct page *s_erase_page;		/* for dev_bdev.c */
+	/* alias.c fields */
+	struct btree_head32 s_segment_alias;	/* remapped segments */
+	int	 s_no_object_aliases;
+	struct list_head s_object_alias;	/* remapped objects */
+	struct btree_head128 s_object_alias_tree; /* remapped objects */
+	struct mutex s_object_alias_mutex;
+	/* dir.c fields */
+	struct mutex s_dirop_mutex;		/* for creat/unlink/rename */
+	u64	 s_victim_ino;			/* used for atomic dir-ops */
+	u64	 s_rename_dir;			/* source directory ino */
+	u64	 s_rename_pos;			/* position of source dd */
+	/* gc.c fields */
+	long	 s_segsize;			/* size of a segment */
+	int	 s_segshift;			/* log2 of segment size */
+	long	 s_segmask;			/* 1 << s_segshift - 1 */
+	long	 s_no_segs;			/* segments on device */
+	long	 s_no_journal_segs;		/* segments used for journal */
+	long	 s_no_blocks;			/* blocks per segment */
+	long	 s_writesize;			/* minimum write size */
+	int	 s_writeshift;			/* log2 of write size */
+	u64	 s_size;			/* filesystem size */
+	struct logfs_area *s_area[LOGFS_NO_AREAS];	/* open segment array */
+	u64	 s_gec;				/* global erase count */
+	u64	 s_wl_gec_ostore;		/* time of last wl event */
+	u64	 s_wl_gec_journal;		/* time of last wl event */
+	u64	 s_sweeper;			/* current sweeper pos */
+	u8	 s_ifile_levels;		/* max level of ifile */
+	u8	 s_iblock_levels;		/* max level of regular files */
+	u8	 s_data_levels;			/* # of segments to leaf block*/
+	u8	 s_total_levels;		/* sum of above three */
+	struct btree_head32 s_cand_tree;	/* all candidates */
+	struct candidate_list s_free_list;	/* 100% free segments */
+	struct candidate_list s_reserve_list;	/* Bad segment reserve */
+	struct candidate_list s_low_list[LOGFS_NO_AREAS];/* good candidates */
+	struct candidate_list s_ec_list;	/* wear level candidates */
+	struct btree_head32 s_reserved_segments;/* sb, journal, bad, etc. */
+	/* inode.c fields */
+	u64	 s_last_ino;			/* highest ino used */
+	long	 s_inos_till_wrap;
+	u32	 s_generation;			/* i_generation for new files */
+	struct list_head s_freeing_list;	/* inodes being freed */
+	/* journal.c fields */
+	struct mutex s_journal_mutex;
+	void	*s_je;				/* journal entry to compress */
+	void	*s_compressed_je;		/* block to write to journal */
+	u32	 s_journal_seg[LOGFS_JOURNAL_SEGS]; /* journal segments */
+	u32	 s_journal_ec[LOGFS_JOURNAL_SEGS]; /* journal erasecounts */
+	u64	 s_last_version;
+	struct logfs_area *s_journal_area;	/* open journal segment */
+	__be64	s_je_array[MAX_JOURNAL_ENTRIES];
+	int	s_no_je;
+
+	int	 s_sum_index;			/* for the 12 summaries */
+	struct shadow_tree s_shadow_tree;
+	int	 s_je_fill;			/* index of current je */
+	/* readwrite.c fields */
+	struct mutex s_write_mutex;
+	int	 s_lock_count;
+	mempool_t *s_block_pool;		/* struct logfs_block pool */
+	mempool_t *s_shadow_pool;		/* struct logfs_shadow pool */
+	struct list_head s_writeback_list;	/* writeback pages */
+	/*
+	 * Space accounting:
+	 * - s_used_bytes specifies space used to store valid data objects.
+	 * - s_dirty_used_bytes is space used to store non-committed data
+	 *   objects.  Those objects have already been written themselves,
+	 *   but they don't become valid until all indirect blocks up to the
+	 *   journal have been written as well.
+	 * - s_dirty_free_bytes is space used to store the old copy of a
+	 *   replaced object, as long as the replacement is non-committed.
+	 *   In other words, it is the amount of space freed when all dirty
+	 *   blocks are written back.
+	 * - s_free_bytes is the amount of free space available for any
+	 *   purpose.
+	 * - s_root_reserve is the amount of free space available only to
+	 *   the root user.  Non-privileged users can no longer write once
+	 *   this watermark has been reached.
+	 * - s_speed_reserve is space which remains unused to speed up
+	 *   garbage collection performance.
+	 * - s_dirty_pages is the space reserved for currently dirty pages.
+	 *   It is a pessimistic estimate, so some/most will get freed on
+	 *   page writeback.
+	 *
+	 * s_used_bytes + s_free_bytes + s_speed_reserve = total usable size
+	 */
+	u64	 s_free_bytes;
+	u64	 s_used_bytes;
+	u64	 s_dirty_free_bytes;
+	u64	 s_dirty_used_bytes;
+	u64	 s_root_reserve;
+	u64	 s_speed_reserve;
+	u64	 s_dirty_pages;
+	/* Bad block handling:
+	 * - s_bad_seg_reserve is a number of segments usually kept
+	 *   free.  When encountering bad blocks, the affected segment's data
+	 *   is _temporarily_ moved to a reserved segment.
+	 * - s_bad_segments is the number of known bad segments.
+	 */
+	u32	 s_bad_seg_reserve;
+	u32	 s_bad_segments;
+};
+
+/**
+ * struct logfs_inode - in-memory inode
+ *
+ * @vfs_inode:			struct inode
+ * @li_data:			data pointers
+ * @li_used_bytes:		number of used bytes
+ * @li_freeing_list:		used to track inodes currently being freed
+ * @li_flags:			inode flags
+ * @li_refcount:		number of internal (GC-induced) references
+ */
+struct logfs_inode {
+	struct inode vfs_inode;
+	u64	li_data[LOGFS_EMBEDDED_FIELDS];
+	u64	li_used_bytes;
+	struct list_head li_freeing_list;
+	struct logfs_block *li_block;
+	u32	li_flags;
+	u8	li_height;
+	int	li_refcount;
+};
+
+#define journal_for_each(__i) for (__i = 0; __i < LOGFS_JOURNAL_SEGS; __i++)
+#define for_each_area(__i) for (__i = 0; __i < LOGFS_NO_AREAS; __i++)
+#define for_each_area_down(__i) for (__i = LOGFS_NO_AREAS - 1; __i >= 0; __i--)
+
+/* compr.c */
+int logfs_compress(void *in, void *out, size_t inlen, size_t outlen);
+int logfs_uncompress(void *in, void *out, size_t inlen, size_t outlen);
+int __init logfs_compr_init(void);
+void logfs_compr_exit(void);
+
+/* dev_bdev.c */
+#ifdef CONFIG_BLOCK
+int logfs_get_sb_bdev(struct file_system_type *type, int flags,
+		const char *devname, struct vfsmount *mnt);
+#else
+static inline int logfs_get_sb_bdev(struct file_system_type *type, int flags,
+		const char *devname, struct vfsmount *mnt)
+{
+	return -ENODEV;
+}
+#endif
+
+/* dev_mtd.c */
+#ifdef CONFIG_MTD
+int logfs_get_sb_mtd(struct file_system_type *type, int flags,
+		int mtdnr, struct vfsmount *mnt);
+#else
+static inline int logfs_get_sb_mtd(struct file_system_type *type, int flags,
+		int mtdnr, struct vfsmount *mnt)
+{
+	return -ENODEV;
+}
+#endif
+
+/* dir.c */
+extern const struct inode_operations logfs_symlink_iops;
+extern const struct inode_operations logfs_dir_iops;
+extern const struct file_operations logfs_dir_fops;
+int logfs_replay_journal(struct super_block *sb);
+
+/* file.c */
+extern const struct inode_operations logfs_reg_iops;
+extern const struct file_operations logfs_reg_fops;
+extern const struct address_space_operations logfs_reg_aops;
+int logfs_readpage(struct file *file, struct page *page);
+int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
+		unsigned long arg);
+int logfs_fsync(struct file *file, int datasync);
+
+/* gc.c */
+u32 get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec);
+void logfs_gc_pass(struct super_block *sb);
+int logfs_check_areas(struct super_block *sb);
+int logfs_init_gc(struct super_block *sb);
+void logfs_cleanup_gc(struct super_block *sb);
+
+/* inode.c */
+extern const struct super_operations logfs_super_operations;
+struct inode *logfs_iget(struct super_block *sb, ino_t ino);
+struct inode *logfs_safe_iget(struct super_block *sb, ino_t ino, int *cookie);
+void logfs_safe_iput(struct inode *inode, int cookie);
+struct inode *logfs_new_inode(struct inode *dir, int mode);
+struct inode *logfs_new_meta_inode(struct super_block *sb, u64 ino);
+struct inode *logfs_read_meta_inode(struct super_block *sb, u64 ino);
+int logfs_init_inode_cache(void);
+void logfs_destroy_inode_cache(void);
+void destroy_meta_inode(struct inode *inode);
+void logfs_set_blocks(struct inode *inode, u64 no);
+/* these logically belong into inode.c but actually reside in readwrite.c */
+int logfs_read_inode(struct inode *inode);
+int __logfs_write_inode(struct inode *inode, long flags);
+void logfs_delete_inode(struct inode *inode);
+void logfs_clear_inode(struct inode *inode);
+
+/* journal.c */
+void logfs_write_anchor(struct super_block *sb);
+int logfs_init_journal(struct super_block *sb);
+void logfs_cleanup_journal(struct super_block *sb);
+int write_alias_journal(struct super_block *sb, u64 ino, u64 bix,
+		level_t level, int child_no, __be64 val);
+void do_logfs_journal_wl_pass(struct super_block *sb);
+
+/* readwrite.c */
+pgoff_t logfs_pack_index(u64 bix, level_t level);
+void logfs_unpack_index(pgoff_t index, u64 *bix, level_t *level);
+int logfs_inode_write(struct inode *inode, const void *buf, size_t count,
+		loff_t bix, long flags, struct shadow_tree *shadow_tree);
+int logfs_readpage_nolock(struct page *page);
+int logfs_write_buf(struct inode *inode, struct page *page, long flags);
+int logfs_delete(struct inode *inode, pgoff_t index,
+		struct shadow_tree *shadow_tree);
+int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs,
+		gc_level_t gc_level, long flags);
+int logfs_is_valid_block(struct super_block *sb, u64 ofs, u64 ino, u64 bix,
+		gc_level_t gc_level);
+int logfs_truncate(struct inode *inode, u64 size);
+u64 logfs_seek_hole(struct inode *inode, u64 bix);
+u64 logfs_seek_data(struct inode *inode, u64 bix);
+int logfs_open_segfile(struct super_block *sb);
+int logfs_init_rw(struct super_block *sb);
+void logfs_cleanup_rw(struct super_block *sb);
+void logfs_add_transaction(struct inode *inode, struct logfs_transaction *ta);
+void logfs_del_transaction(struct inode *inode, struct logfs_transaction *ta);
+void logfs_write_block(struct logfs_block *block, long flags);
+int logfs_write_obj_aliases_pagecache(struct super_block *sb);
+void logfs_get_segment_entry(struct super_block *sb, u32 segno,
+		struct logfs_segment_entry *se);
+void logfs_set_segment_used(struct super_block *sb, u64 ofs, int increment);
+void logfs_set_segment_erased(struct super_block *sb, u32 segno, u32 ec,
+		gc_level_t gc_level);
+void logfs_set_segment_reserved(struct super_block *sb, u32 segno);
+void logfs_set_segment_unreserved(struct super_block *sb, u32 segno, u32 ec);
+struct logfs_block *__alloc_block(struct super_block *sb,
+		u64 ino, u64 bix, level_t level);
+void __free_block(struct super_block *sb, struct logfs_block *block);
+void btree_write_block(struct logfs_block *block);
+void initialize_block_counters(struct page *page, struct logfs_block *block,
+		__be64 *array, int page_is_empty);
+int logfs_exist_block(struct inode *inode, u64 bix);
+int get_page_reserve(struct inode *inode, struct page *page);
+extern struct logfs_block_ops indirect_block_ops;
+
+/* segment.c */
+int logfs_erase_segment(struct super_block *sb, u32 ofs, int ensure_erase);
+int wbuf_read(struct super_block *sb, u64 ofs, size_t len, void *buf);
+int logfs_segment_read(struct inode *inode, struct page *page, u64 ofs, u64 bix,
+		level_t level);
+int logfs_segment_write(struct inode *inode, struct page *page,
+		struct logfs_shadow *shadow);
+int logfs_segment_delete(struct inode *inode, struct logfs_shadow *shadow);
+int logfs_load_object_aliases(struct super_block *sb,
+		struct logfs_obj_alias *oa, int count);
+void move_page_to_btree(struct page *page);
+int logfs_init_mapping(struct super_block *sb);
+void logfs_sync_area(struct logfs_area *area);
+void logfs_sync_segments(struct super_block *sb);
+void freeseg(struct super_block *sb, u32 segno);
+
+/* area handling */
+int logfs_init_areas(struct super_block *sb);
+void logfs_cleanup_areas(struct super_block *sb);
+int logfs_open_area(struct logfs_area *area, size_t bytes);
+int __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
+		int use_filler);
+
+static inline int logfs_buf_write(struct logfs_area *area, u64 ofs,
+		void *buf, size_t len)
+{
+	return __logfs_buf_write(area, ofs, buf, len, 0);
+}
+
+static inline int logfs_buf_recover(struct logfs_area *area, u64 ofs,
+		void *buf, size_t len)
+{
+	return __logfs_buf_write(area, ofs, buf, len, 1);
+}
+
+/* super.c */
+struct page *emergency_read_begin(struct address_space *mapping, pgoff_t index);
+void emergency_read_end(struct page *page);
+void logfs_crash_dump(struct super_block *sb);
+void *memchr_inv(const void *s, int c, size_t n);
+int logfs_statfs(struct dentry *dentry, struct kstatfs *stats);
+int logfs_get_sb_device(struct file_system_type *type, int flags,
+		struct mtd_info *mtd, struct block_device *bdev,
+		const struct logfs_device_ops *devops, struct vfsmount *mnt);
+int logfs_check_ds(struct logfs_disk_super *ds);
+int logfs_write_sb(struct super_block *sb);
+
+static inline struct logfs_super *logfs_super(struct super_block *sb)
+{
+	return sb->s_fs_info;
+}
+
+static inline struct logfs_inode *logfs_inode(struct inode *inode)
+{
+	return container_of(inode, struct logfs_inode, vfs_inode);
+}
+
+static inline void logfs_set_ro(struct super_block *sb)
+{
+	logfs_super(sb)->s_flags |= LOGFS_SB_FLAG_RO;
+}
+
+#define LOGFS_BUG(sb) do {					\
+	struct super_block *__sb = sb;				\
+	logfs_crash_dump(__sb);					\
+	logfs_super(__sb)->s_flags |= LOGFS_SB_FLAG_RO;		\
+	BUG();							\
+} while (0)
+
+#define LOGFS_BUG_ON(condition, sb) \
+	do { if (unlikely(condition)) LOGFS_BUG((sb)); } while (0)
+
+static inline __be32 logfs_crc32(void *data, size_t len, size_t skip)
+{
+	return cpu_to_be32(crc32(~0, data+skip, len-skip));
+}
+
+static inline u8 logfs_type(struct inode *inode)
+{
+	return (inode->i_mode >> 12) & 15;
+}
+
+static inline pgoff_t logfs_index(struct super_block *sb, u64 pos)
+{
+	return pos >> sb->s_blocksize_bits;
+}
+
+static inline u64 dev_ofs(struct super_block *sb, u32 segno, u32 ofs)
+{
+	return ((u64)segno << logfs_super(sb)->s_segshift) + ofs;
+}
+
+static inline u32 seg_no(struct super_block *sb, u64 ofs)
+{
+	return ofs >> logfs_super(sb)->s_segshift;
+}
+
+static inline u32 seg_ofs(struct super_block *sb, u64 ofs)
+{
+	return ofs & logfs_super(sb)->s_segmask;
+}
+
+static inline u64 seg_align(struct super_block *sb, u64 ofs)
+{
+	return ofs & ~logfs_super(sb)->s_segmask;
+}
+
+static inline struct logfs_block *logfs_block(struct page *page)
+{
+	return (void *)page->private;
+}
+
+static inline level_t shrink_level(gc_level_t __level)
+{
+	u8 level = (__force u8)__level;
+
+	if (level >= LOGFS_MAX_LEVELS)
+		level -= LOGFS_MAX_LEVELS;
+	return (__force level_t)level;
+}
+
+static inline gc_level_t expand_level(u64 ino, level_t __level)
+{
+	u8 level = (__force u8)__level;
+
+	if (ino == LOGFS_INO_MASTER) {
+		/* ifile has separate areas */
+		level += LOGFS_MAX_LEVELS;
+	}
+	return (__force gc_level_t)level;
+}
+
+static inline int logfs_block_shift(struct super_block *sb, level_t level)
+{
+	level = shrink_level((__force gc_level_t)level);
+	return (__force int)level * (sb->s_blocksize_bits - 3);
+}
+
+static inline u64 logfs_block_mask(struct super_block *sb, level_t level)
+{
+	return ~0ull << logfs_block_shift(sb, level);
+}
+
+static inline struct logfs_area *get_area(struct super_block *sb,
+		gc_level_t gc_level)
+{
+	return logfs_super(sb)->s_area[(__force u8)gc_level];
+}
+
+static inline void logfs_mempool_destroy(mempool_t *pool)
+{
+	if (pool)
+		mempool_destroy(pool);
+}
+
+#endif
diff --git a/fs/logfs/logfs_abi.h b/fs/logfs/logfs_abi.h
new file mode 100644
index 000000000000..ae960519c54a
--- /dev/null
+++ b/fs/logfs/logfs_abi.h
@@ -0,0 +1,629 @@
+/*
+ * fs/logfs/logfs_abi.h
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
+ *
+ * Public header for logfs.
+ */
+#ifndef FS_LOGFS_LOGFS_ABI_H
+#define FS_LOGFS_LOGFS_ABI_H
+
+/* For out-of-kernel compiles */
+#ifndef BUILD_BUG_ON
+#define BUILD_BUG_ON(condition) /**/
+#endif
+
+#define SIZE_CHECK(type, size)					\
+static inline void check_##type(void)				\
+{								\
+	BUILD_BUG_ON(sizeof(struct type) != (size));		\
+}
+
+/*
+ * Throughout the logfs code, we're constantly dealing with blocks at
+ * various positions or offsets.  To remove confusion, we stricly
+ * distinguish between a "position" - the logical position within a
+ * file and an "offset" - the physical location within the device.
+ *
+ * Any usage of the term offset for a logical location or position for
+ * a physical one is a bug and should get fixed.
+ */
+
+/*
+ * Block are allocated in one of several segments depending on their
+ * level.  The following levels are used:
+ *  0	- regular data block
+ *  1	- i1 indirect blocks
+ *  2	- i2 indirect blocks
+ *  3	- i3 indirect blocks
+ *  4	- i4 indirect blocks
+ *  5	- i5 indirect blocks
+ *  6	- ifile data blocks
+ *  7	- ifile i1 indirect blocks
+ *  8	- ifile i2 indirect blocks
+ *  9	- ifile i3 indirect blocks
+ * 10	- ifile i4 indirect blocks
+ * 11	- ifile i5 indirect blocks
+ * Potential levels to be used in the future:
+ * 12	- gc recycled blocks, long-lived data
+ * 13	- replacement blocks, short-lived data
+ *
+ * Levels 1-11 are necessary for robust gc operations and help separate
+ * short-lived metadata from longer-lived file data.  In the future,
+ * file data should get separated into several segments based on simple
+ * heuristics.  Old data recycled during gc operation is expected to be
+ * long-lived.  New data is of uncertain life expectancy.  New data
+ * used to replace older blocks in existing files is expected to be
+ * short-lived.
+ */
+
+
+/* Magic numbers.  64bit for superblock, 32bit for statfs f_type */
+#define LOGFS_MAGIC		0x7a3a8e5cb9d5bf67ull
+#define LOGFS_MAGIC_U32		0xc97e8168u
+
+/*
+ * Various blocksize related macros.  Blocksize is currently fixed at 4KiB.
+ * Sooner or later that should become configurable and the macros replaced
+ * by something superblock-dependent.  Pointers in indirect blocks are and
+ * will remain 64bit.
+ *
+ * LOGFS_BLOCKSIZE	- self-explaining
+ * LOGFS_BLOCK_FACTOR	- number of pointers per indirect block
+ * LOGFS_BLOCK_BITS	- log2 of LOGFS_BLOCK_FACTOR, used for shifts
+ */
+#define LOGFS_BLOCKSIZE		(4096ull)
+#define LOGFS_BLOCK_FACTOR	(LOGFS_BLOCKSIZE / sizeof(u64))
+#define LOGFS_BLOCK_BITS	(9)
+
+/*
+ * Number of blocks at various levels of indirection.  There are 16 direct
+ * block pointers plus a single indirect pointer.
+ */
+#define I0_BLOCKS		(16)
+#define I1_BLOCKS		LOGFS_BLOCK_FACTOR
+#define I2_BLOCKS		(LOGFS_BLOCK_FACTOR * I1_BLOCKS)
+#define I3_BLOCKS		(LOGFS_BLOCK_FACTOR * I2_BLOCKS)
+#define I4_BLOCKS		(LOGFS_BLOCK_FACTOR * I3_BLOCKS)
+#define I5_BLOCKS		(LOGFS_BLOCK_FACTOR * I4_BLOCKS)
+
+#define INDIRECT_INDEX		I0_BLOCKS
+#define LOGFS_EMBEDDED_FIELDS	(I0_BLOCKS + 1)
+
+/*
+ * Sizes at which files require another level of indirection.  Files smaller
+ * than LOGFS_EMBEDDED_SIZE can be completely stored in the inode itself,
+ * similar like ext2 fast symlinks.
+ *
+ * Data at a position smaller than LOGFS_I0_SIZE is accessed through the
+ * direct pointers, else through the 1x indirect pointer and so forth.
+ */
+#define LOGFS_EMBEDDED_SIZE	(LOGFS_EMBEDDED_FIELDS * sizeof(u64))
+#define LOGFS_I0_SIZE		(I0_BLOCKS * LOGFS_BLOCKSIZE)
+#define LOGFS_I1_SIZE		(I1_BLOCKS * LOGFS_BLOCKSIZE)
+#define LOGFS_I2_SIZE		(I2_BLOCKS * LOGFS_BLOCKSIZE)
+#define LOGFS_I3_SIZE		(I3_BLOCKS * LOGFS_BLOCKSIZE)
+#define LOGFS_I4_SIZE		(I4_BLOCKS * LOGFS_BLOCKSIZE)
+#define LOGFS_I5_SIZE		(I5_BLOCKS * LOGFS_BLOCKSIZE)
+
+/*
+ * Each indirect block pointer must have this flag set, if all block pointers
+ * behind it are set, i.e. there is no hole hidden in the shadow of this
+ * indirect block pointer.
+ */
+#define LOGFS_FULLY_POPULATED (1ULL << 63)
+#define pure_ofs(ofs) (ofs & ~LOGFS_FULLY_POPULATED)
+
+/*
+ * LogFS needs to separate data into levels.  Each level is defined as the
+ * maximal possible distance from the master inode (inode of the inode file).
+ * Data blocks reside on level 0, 1x indirect block on level 1, etc.
+ * Inodes reside on level 6, indirect blocks for the inode file on levels 7-11.
+ * This effort is necessary to guarantee garbage collection to always make
+ * progress.
+ *
+ * LOGFS_MAX_INDIRECT is the maximal indirection through indirect blocks,
+ * LOGFS_MAX_LEVELS is one more for the actual data level of a file.  It is
+ * the maximal number of levels for one file.
+ * LOGFS_NO_AREAS is twice that, as the inode file and regular files are
+ * effectively stacked on top of each other.
+ */
+#define LOGFS_MAX_INDIRECT	(5)
+#define LOGFS_MAX_LEVELS	(LOGFS_MAX_INDIRECT + 1)
+#define LOGFS_NO_AREAS		(2 * LOGFS_MAX_LEVELS)
+
+/* Maximum size of filenames */
+#define LOGFS_MAX_NAMELEN	(255)
+
+/* Number of segments in the primary journal. */
+#define LOGFS_JOURNAL_SEGS	(16)
+
+/* Maximum number of free/erased/etc. segments in journal entries */
+#define MAX_CACHED_SEGS		(64)
+
+
+/*
+ * LOGFS_OBJECT_HEADERSIZE is the size of a single header in the object store,
+ * LOGFS_MAX_OBJECTSIZE the size of the largest possible object, including
+ * its header,
+ * LOGFS_SEGMENT_RESERVE is the amount of space reserved for each segment for
+ * its segment header and the padded space at the end when no further objects
+ * fit.
+ */
+#define LOGFS_OBJECT_HEADERSIZE	(0x1c)
+#define LOGFS_SEGMENT_HEADERSIZE (0x18)
+#define LOGFS_MAX_OBJECTSIZE	(LOGFS_OBJECT_HEADERSIZE + LOGFS_BLOCKSIZE)
+#define LOGFS_SEGMENT_RESERVE	\
+	(LOGFS_SEGMENT_HEADERSIZE + LOGFS_MAX_OBJECTSIZE - 1)
+
+/*
+ * Segment types:
+ * SEG_SUPER	- Data or indirect block
+ * SEG_JOURNAL	- Inode
+ * SEG_OSTORE	- Dentry
+ */
+enum {
+	SEG_SUPER	= 0x01,
+	SEG_JOURNAL	= 0x02,
+	SEG_OSTORE	= 0x03,
+};
+
+/**
+ * struct logfs_segment_header - per-segment header in the ostore
+ *
+ * @crc:			crc32 of header (there is no data)
+ * @pad:			unused, must be 0
+ * @type:			segment type, see above
+ * @level:			GC level for all objects in this segment
+ * @segno:			segment number
+ * @ec:				erase count for this segment
+ * @gec:			global erase count at time of writing
+ */
+struct logfs_segment_header {
+	__be32	crc;
+	__be16	pad;
+	__u8	type;
+	__u8	level;
+	__be32	segno;
+	__be32	ec;
+	__be64	gec;
+};
+
+SIZE_CHECK(logfs_segment_header, LOGFS_SEGMENT_HEADERSIZE);
+
+#define LOGFS_FEATURES_INCOMPAT		(0ull)
+#define LOGFS_FEATURES_RO_COMPAT	(0ull)
+#define LOGFS_FEATURES_COMPAT		(0ull)
+
+/**
+ * struct logfs_disk_super - on-medium superblock
+ *
+ * @ds_magic:			magic number, must equal LOGFS_MAGIC
+ * @ds_crc:			crc32 of structure starting with the next field
+ * @ds_ifile_levels:		maximum number of levels for ifile
+ * @ds_iblock_levels:		maximum number of levels for regular files
+ * @ds_data_levels:		number of separate levels for data
+ * @pad0:			reserved, must be 0
+ * @ds_feature_incompat:	incompatible filesystem features
+ * @ds_feature_ro_compat:	read-only compatible filesystem features
+ * @ds_feature_compat:		compatible filesystem features
+ * @ds_flags:			flags
+ * @ds_segment_shift:		log2 of segment size
+ * @ds_block_shift:		log2 of block size
+ * @ds_write_shift:		log2 of write size
+ * @pad1:			reserved, must be 0
+ * @ds_journal_seg:		segments used by primary journal
+ * @ds_root_reserve:		bytes reserved for the superuser
+ * @ds_speed_reserve:		bytes reserved to speed up GC
+ * @ds_bad_seg_reserve:		number of segments reserved to handle bad blocks
+ * @pad2:			reserved, must be 0
+ * @pad3:			reserved, must be 0
+ *
+ * Contains only read-only fields.  Read-write fields like the amount of used
+ * space is tracked in the dynamic superblock, which is stored in the journal.
+ */
+struct logfs_disk_super {
+	struct logfs_segment_header ds_sh;
+	__be64	ds_magic;
+
+	__be32	ds_crc;
+	__u8	ds_ifile_levels;
+	__u8	ds_iblock_levels;
+	__u8	ds_data_levels;
+	__u8	ds_segment_shift;
+	__u8	ds_block_shift;
+	__u8	ds_write_shift;
+	__u8	pad0[6];
+
+	__be64	ds_filesystem_size;
+	__be32	ds_segment_size;
+	__be32  ds_bad_seg_reserve;
+
+	__be64	ds_feature_incompat;
+	__be64	ds_feature_ro_compat;
+
+	__be64	ds_feature_compat;
+	__be64	ds_feature_flags;
+
+	__be64	ds_root_reserve;
+	__be64  ds_speed_reserve;
+
+	__be32	ds_journal_seg[LOGFS_JOURNAL_SEGS];
+
+	__be64	ds_super_ofs[2];
+	__be64	pad3[8];
+};
+
+SIZE_CHECK(logfs_disk_super, 256);
+
+/*
+ * Object types:
+ * OBJ_BLOCK	- Data or indirect block
+ * OBJ_INODE	- Inode
+ * OBJ_DENTRY	- Dentry
+ */
+enum {
+	OBJ_BLOCK	= 0x04,
+	OBJ_INODE	= 0x05,
+	OBJ_DENTRY	= 0x06,
+};
+
+/**
+ * struct logfs_object_header - per-object header in the ostore
+ *
+ * @crc:			crc32 of header, excluding data_crc
+ * @len:			length of data
+ * @type:			object type, see above
+ * @compr:			compression type
+ * @ino:			inode number
+ * @bix:			block index
+ * @data_crc:			crc32 of payload
+ */
+struct logfs_object_header {
+	__be32	crc;
+	__be16	len;
+	__u8	type;
+	__u8	compr;
+	__be64	ino;
+	__be64	bix;
+	__be32	data_crc;
+} __attribute__((packed));
+
+SIZE_CHECK(logfs_object_header, LOGFS_OBJECT_HEADERSIZE);
+
+/*
+ * Reserved inode numbers:
+ * LOGFS_INO_MASTER	- master inode (for inode file)
+ * LOGFS_INO_ROOT	- root directory
+ * LOGFS_INO_SEGFILE	- per-segment used bytes and erase count
+ */
+enum {
+	LOGFS_INO_MAPPING	= 0x00,
+	LOGFS_INO_MASTER	= 0x01,
+	LOGFS_INO_ROOT		= 0x02,
+	LOGFS_INO_SEGFILE	= 0x03,
+	LOGFS_RESERVED_INOS	= 0x10,
+};
+
+/*
+ * Inode flags.  High bits should never be written to the medium.  They are
+ * reserved for in-memory usage.
+ * Low bits should either remain in sync with the corresponding FS_*_FL or
+ * reuse slots that obviously don't make sense for logfs.
+ *
+ * LOGFS_IF_DIRTY	Inode must be written back
+ * LOGFS_IF_ZOMBIE	Inode has been deleted
+ * LOGFS_IF_STILLBORN	-ENOSPC happened when creating inode
+ */
+#define LOGFS_IF_COMPRESSED	0x00000004 /* == FS_COMPR_FL */
+#define LOGFS_IF_DIRTY		0x20000000
+#define LOGFS_IF_ZOMBIE		0x40000000
+#define LOGFS_IF_STILLBORN	0x80000000
+
+/* Flags available to chattr */
+#define LOGFS_FL_USER_VISIBLE	(LOGFS_IF_COMPRESSED)
+#define LOGFS_FL_USER_MODIFIABLE (LOGFS_IF_COMPRESSED)
+/* Flags inherited from parent directory on file/directory creation */
+#define LOGFS_FL_INHERITED	(LOGFS_IF_COMPRESSED)
+
+/**
+ * struct logfs_disk_inode - on-medium inode
+ *
+ * @di_mode:			file mode
+ * @di_pad:			reserved, must be 0
+ * @di_flags:			inode flags, see above
+ * @di_uid:			user id
+ * @di_gid:			group id
+ * @di_ctime:			change time
+ * @di_mtime:			modify time
+ * @di_refcount:		reference count (aka nlink or link count)
+ * @di_generation:		inode generation, for nfs
+ * @di_used_bytes:		number of bytes used
+ * @di_size:			file size
+ * @di_data:			data pointers
+ */
+struct logfs_disk_inode {
+	__be16	di_mode;
+	__u8	di_height;
+	__u8	di_pad;
+	__be32	di_flags;
+	__be32	di_uid;
+	__be32	di_gid;
+
+	__be64	di_ctime;
+	__be64	di_mtime;
+
+	__be64	di_atime;
+	__be32	di_refcount;
+	__be32	di_generation;
+
+	__be64	di_used_bytes;
+	__be64	di_size;
+
+	__be64	di_data[LOGFS_EMBEDDED_FIELDS];
+};
+
+SIZE_CHECK(logfs_disk_inode, 200);
+
+#define INODE_POINTER_OFS \
+	(offsetof(struct logfs_disk_inode, di_data) / sizeof(__be64))
+#define INODE_USED_OFS \
+	(offsetof(struct logfs_disk_inode, di_used_bytes) / sizeof(__be64))
+#define INODE_SIZE_OFS \
+	(offsetof(struct logfs_disk_inode, di_size) / sizeof(__be64))
+#define INODE_HEIGHT_OFS	(0)
+
+/**
+ * struct logfs_disk_dentry - on-medium dentry structure
+ *
+ * @ino:			inode number
+ * @namelen:			length of file name
+ * @type:			file type, identical to bits 12..15 of mode
+ * @name:			file name
+ */
+/* FIXME: add 6 bytes of padding to remove the __packed */
+struct logfs_disk_dentry {
+	__be64	ino;
+	__be16	namelen;
+	__u8	type;
+	__u8	name[LOGFS_MAX_NAMELEN];
+} __attribute__((packed));
+
+SIZE_CHECK(logfs_disk_dentry, 266);
+
+#define RESERVED		0xffffffff
+#define BADSEG			0xffffffff
+/**
+ * struct logfs_segment_entry - segment file entry
+ *
+ * @ec_level:			erase count and level
+ * @valid:			number of valid bytes
+ *
+ * Segment file contains one entry for every segment.  ec_level contains the
+ * erasecount in the upper 28 bits and the level in the lower 4 bits.  An
+ * ec_level of BADSEG (-1) identifies bad segments.  valid contains the number
+ * of valid bytes or RESERVED (-1 again) if the segment is used for either the
+ * superblock or the journal, or when the segment is bad.
+ */
+struct logfs_segment_entry {
+	__be32	ec_level;
+	__be32	valid;
+};
+
+SIZE_CHECK(logfs_segment_entry, 8);
+
+/**
+ * struct logfs_journal_header - header for journal entries (JEs)
+ *
+ * @h_crc:			crc32 of journal entry
+ * @h_len:			length of compressed journal entry,
+ *				not including header
+ * @h_datalen:			length of uncompressed data
+ * @h_type:			JE type
+ * @h_compr:			compression type
+ * @h_pad:			reserved
+ */
+struct logfs_journal_header {
+	__be32	h_crc;
+	__be16	h_len;
+	__be16	h_datalen;
+	__be16	h_type;
+	__u8	h_compr;
+	__u8	h_pad[5];
+};
+
+SIZE_CHECK(logfs_journal_header, 16);
+
+/*
+ * Life expectency of data.
+ * VIM_DEFAULT		- default vim
+ * VIM_SEGFILE		- for segment file only - very short-living
+ * VIM_GC		- GC'd data - likely long-living
+ */
+enum logfs_vim {
+	VIM_DEFAULT	= 0,
+	VIM_SEGFILE	= 1,
+};
+
+/**
+ * struct logfs_je_area - wbuf header
+ *
+ * @segno:			segment number of area
+ * @used_bytes:			number of bytes already used
+ * @gc_level:			GC level
+ * @vim:			life expectancy of data
+ *
+ * "Areas" are segments currently being used for writing.  There is at least
+ * one area per GC level.  Several may be used to separate long-living from
+ * short-living data.  If an area with unknown vim is encountered, it can
+ * simply be closed.
+ * The write buffer immediately follow this header.
+ */
+struct logfs_je_area {
+	__be32	segno;
+	__be32	used_bytes;
+	__u8	gc_level;
+	__u8	vim;
+} __attribute__((packed));
+
+SIZE_CHECK(logfs_je_area, 10);
+
+#define MAX_JOURNAL_HEADER \
+	(sizeof(struct logfs_journal_header) + sizeof(struct logfs_je_area))
+
+/**
+ * struct logfs_je_dynsb - dynamic superblock
+ *
+ * @ds_gec:			global erase count
+ * @ds_sweeper:			current position of GC "sweeper"
+ * @ds_rename_dir:		source directory ino (see dir.c documentation)
+ * @ds_rename_pos:		position of source dd (see dir.c documentation)
+ * @ds_victim_ino:		victims of incomplete dir operation (see dir.c)
+ * @ds_victim_ino:		parent inode of victim (see dir.c)
+ * @ds_used_bytes:		number of used bytes
+ */
+struct logfs_je_dynsb {
+	__be64	ds_gec;
+	__be64	ds_sweeper;
+
+	__be64	ds_rename_dir;
+	__be64	ds_rename_pos;
+
+	__be64	ds_victim_ino;
+	__be64	ds_victim_parent; /* XXX */
+
+	__be64	ds_used_bytes;
+	__be32	ds_generation;
+	__be32	pad;
+};
+
+SIZE_CHECK(logfs_je_dynsb, 64);
+
+/**
+ * struct logfs_je_anchor - anchor of filesystem tree, aka master inode
+ *
+ * @da_size:			size of inode file
+ * @da_last_ino:		last created inode
+ * @da_used_bytes:		number of bytes used
+ * @da_data:			data pointers
+ */
+struct logfs_je_anchor {
+	__be64	da_size;
+	__be64	da_last_ino;
+
+	__be64	da_used_bytes;
+	u8	da_height;
+	u8	pad[7];
+
+	__be64	da_data[LOGFS_EMBEDDED_FIELDS];
+};
+
+SIZE_CHECK(logfs_je_anchor, 168);
+
+/**
+ * struct logfs_je_spillout - spillout entry (from 1st to 2nd journal)
+ *
+ * @so_segment:			segments used for 2nd journal
+ *
+ * Length of the array is given by h_len field in the header.
+ */
+struct logfs_je_spillout {
+	__be64	so_segment[0];
+};
+
+SIZE_CHECK(logfs_je_spillout, 0);
+
+/**
+ * struct logfs_je_journal_ec - erase counts for all journal segments
+ *
+ * @ec:				erase count
+ *
+ * Length of the array is given by h_len field in the header.
+ */
+struct logfs_je_journal_ec {
+	__be32	ec[0];
+};
+
+SIZE_CHECK(logfs_je_journal_ec, 0);
+
+/**
+ * struct logfs_je_free_segments - list of free segmetns with erase count
+ */
+struct logfs_je_free_segments {
+	__be32	segno;
+	__be32	ec;
+};
+
+SIZE_CHECK(logfs_je_free_segments, 8);
+
+/**
+ * struct logfs_seg_alias - list of segment aliases
+ */
+struct logfs_seg_alias {
+	__be32	old_segno;
+	__be32	new_segno;
+};
+
+SIZE_CHECK(logfs_seg_alias, 8);
+
+/**
+ * struct logfs_obj_alias - list of object aliases
+ */
+struct logfs_obj_alias {
+	__be64	ino;
+	__be64	bix;
+	__be64	val;
+	u8	level;
+	u8	pad[5];
+	__be16	child_no;
+};
+
+SIZE_CHECK(logfs_obj_alias, 32);
+
+/**
+ * Compression types.
+ *
+ * COMPR_NONE	- uncompressed
+ * COMPR_ZLIB	- compressed with zlib
+ */
+enum {
+	COMPR_NONE	= 0,
+	COMPR_ZLIB	= 1,
+};
+
+/*
+ * Journal entries come in groups of 16.  First group contains unique
+ * entries, next groups contain one entry per level
+ *
+ * JE_FIRST	- smallest possible journal entry number
+ *
+ * JEG_BASE	- base group, containing unique entries
+ * JE_COMMIT	- commit entry, validates all previous entries
+ * JE_DYNSB	- dynamic superblock, anything that ought to be in the
+ *		  superblock but cannot because it is read-write data
+ * JE_ANCHOR	- anchor aka master inode aka inode file's inode
+ * JE_ERASECOUNT  erasecounts for all journal segments
+ * JE_SPILLOUT	- unused
+ * JE_SEG_ALIAS	- aliases segments
+ * JE_AREA	- area description
+ *
+ * JE_LAST	- largest possible journal entry number
+ */
+enum {
+	JE_FIRST	= 0x01,
+
+	JEG_BASE	= 0x00,
+	JE_COMMIT	= 0x02,
+	JE_DYNSB	= 0x03,
+	JE_ANCHOR	= 0x04,
+	JE_ERASECOUNT	= 0x05,
+	JE_SPILLOUT	= 0x06,
+	JE_OBJ_ALIAS	= 0x0d,
+	JE_AREA		= 0x0e,
+
+	JE_LAST		= 0x0e,
+};
+
+#endif
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
new file mode 100644
index 000000000000..0718d112a1a5
--- /dev/null
+++ b/fs/logfs/readwrite.c
@@ -0,0 +1,2278 @@
+/*
+ * fs/logfs/readwrite.c
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
+ *
+ *
+ * Actually contains five sets of very similar functions:
+ * read		read blocks from a file
+ * seek_hole	find next hole
+ * seek_data	find next data block
+ * valid	check whether a block still belongs to a file
+ * write	write blocks to a file
+ * delete	delete a block (for directories and ifile)
+ * rewrite	move existing blocks of a file to a new location (gc helper)
+ * truncate	truncate a file
+ */
+#include "logfs.h"
+#include <linux/sched.h>
+#include <linux/slab.h>
+
+static u64 adjust_bix(u64 bix, level_t level)
+{
+	switch (level) {
+	case 0:
+		return bix;
+	case LEVEL(1):
+		return max_t(u64, bix, I0_BLOCKS);
+	case LEVEL(2):
+		return max_t(u64, bix, I1_BLOCKS);
+	case LEVEL(3):
+		return max_t(u64, bix, I2_BLOCKS);
+	case LEVEL(4):
+		return max_t(u64, bix, I3_BLOCKS);
+	case LEVEL(5):
+		return max_t(u64, bix, I4_BLOCKS);
+	default:
+		WARN_ON(1);
+		return bix;
+	}
+}
+
+static inline u64 maxbix(u8 height)
+{
+	return 1ULL << (LOGFS_BLOCK_BITS * height);
+}
+
+/**
+ * The inode address space is cut in two halves.  Lower half belongs to data
+ * pages, upper half to indirect blocks.  If the high bit (INDIRECT_BIT) is
+ * set, the actual block index (bix) and level can be derived from the page
+ * index.
+ *
+ * The lowest three bits of the block index are set to 0 after packing and
+ * unpacking.  Since the lowest n bits (9 for 4KiB blocksize) are ignored
+ * anyway this is harmless.
+ */
+#define ARCH_SHIFT	(BITS_PER_LONG - 32)
+#define INDIRECT_BIT	(0x80000000UL << ARCH_SHIFT)
+#define LEVEL_SHIFT	(28 + ARCH_SHIFT)
+static inline pgoff_t first_indirect_block(void)
+{
+	return INDIRECT_BIT | (1ULL << LEVEL_SHIFT);
+}
+
+pgoff_t logfs_pack_index(u64 bix, level_t level)
+{
+	pgoff_t index;
+
+	BUG_ON(bix >= INDIRECT_BIT);
+	if (level == 0)
+		return bix;
+
+	index  = INDIRECT_BIT;
+	index |= (__force long)level << LEVEL_SHIFT;
+	index |= bix >> ((__force u8)level * LOGFS_BLOCK_BITS);
+	return index;
+}
+
+void logfs_unpack_index(pgoff_t index, u64 *bix, level_t *level)
+{
+	u8 __level;
+
+	if (!(index & INDIRECT_BIT)) {
+		*bix = index;
+		*level = 0;
+		return;
+	}
+
+	__level = (index & ~INDIRECT_BIT) >> LEVEL_SHIFT;
+	*level = LEVEL(__level);
+	*bix = (index << (__level * LOGFS_BLOCK_BITS)) & ~INDIRECT_BIT;
+	*bix = adjust_bix(*bix, *level);
+	return;
+}
+#undef ARCH_SHIFT
+#undef INDIRECT_BIT
+#undef LEVEL_SHIFT
+
+/*
+ * Time is stored as nanoseconds since the epoch.
+ */
+static struct timespec be64_to_timespec(__be64 betime)
+{
+	return ns_to_timespec(be64_to_cpu(betime));
+}
+
+static __be64 timespec_to_be64(struct timespec tsp)
+{
+	return cpu_to_be64((u64)tsp.tv_sec * NSEC_PER_SEC + tsp.tv_nsec);
+}
+
+static void logfs_disk_to_inode(struct logfs_disk_inode *di, struct inode*inode)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	int i;
+
+	inode->i_mode	= be16_to_cpu(di->di_mode);
+	li->li_height	= di->di_height;
+	li->li_flags	= be32_to_cpu(di->di_flags);
+	inode->i_uid	= be32_to_cpu(di->di_uid);
+	inode->i_gid	= be32_to_cpu(di->di_gid);
+	inode->i_size	= be64_to_cpu(di->di_size);
+	logfs_set_blocks(inode, be64_to_cpu(di->di_used_bytes));
+	inode->i_atime	= be64_to_timespec(di->di_atime);
+	inode->i_ctime	= be64_to_timespec(di->di_ctime);
+	inode->i_mtime	= be64_to_timespec(di->di_mtime);
+	inode->i_nlink	= be32_to_cpu(di->di_refcount);
+	inode->i_generation = be32_to_cpu(di->di_generation);
+
+	switch (inode->i_mode & S_IFMT) {
+	case S_IFSOCK:	/* fall through */
+	case S_IFBLK:	/* fall through */
+	case S_IFCHR:	/* fall through */
+	case S_IFIFO:
+		inode->i_rdev = be64_to_cpu(di->di_data[0]);
+		break;
+	case S_IFDIR:	/* fall through */
+	case S_IFREG:	/* fall through */
+	case S_IFLNK:
+		for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
+			li->li_data[i] = be64_to_cpu(di->di_data[i]);
+		break;
+	default:
+		BUG();
+	}
+}
+
+static void logfs_inode_to_disk(struct inode *inode, struct logfs_disk_inode*di)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	int i;
+
+	di->di_mode	= cpu_to_be16(inode->i_mode);
+	di->di_height	= li->li_height;
+	di->di_pad	= 0;
+	di->di_flags	= cpu_to_be32(li->li_flags);
+	di->di_uid	= cpu_to_be32(inode->i_uid);
+	di->di_gid	= cpu_to_be32(inode->i_gid);
+	di->di_size	= cpu_to_be64(i_size_read(inode));
+	di->di_used_bytes = cpu_to_be64(li->li_used_bytes);
+	di->di_atime	= timespec_to_be64(inode->i_atime);
+	di->di_ctime	= timespec_to_be64(inode->i_ctime);
+	di->di_mtime	= timespec_to_be64(inode->i_mtime);
+	di->di_refcount	= cpu_to_be32(inode->i_nlink);
+	di->di_generation = cpu_to_be32(inode->i_generation);
+
+	switch (inode->i_mode & S_IFMT) {
+	case S_IFSOCK:	/* fall through */
+	case S_IFBLK:	/* fall through */
+	case S_IFCHR:	/* fall through */
+	case S_IFIFO:
+		di->di_data[0] = cpu_to_be64(inode->i_rdev);
+		break;
+	case S_IFDIR:	/* fall through */
+	case S_IFREG:	/* fall through */
+	case S_IFLNK:
+		for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
+			di->di_data[i] = cpu_to_be64(li->li_data[i]);
+		break;
+	default:
+		BUG();
+	}
+}
+
+static void __logfs_set_blocks(struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+	struct logfs_inode *li = logfs_inode(inode);
+
+	inode->i_blocks = ULONG_MAX;
+	if (li->li_used_bytes >> sb->s_blocksize_bits < ULONG_MAX)
+		inode->i_blocks = ALIGN(li->li_used_bytes, 512) >> 9;
+}
+
+void logfs_set_blocks(struct inode *inode, u64 bytes)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+
+	li->li_used_bytes = bytes;
+	__logfs_set_blocks(inode);
+}
+
+static void prelock_page(struct super_block *sb, struct page *page, int lock)
+{
+	struct logfs_super *super = logfs_super(sb);
+
+	BUG_ON(!PageLocked(page));
+	if (lock) {
+		BUG_ON(PagePreLocked(page));
+		SetPagePreLocked(page);
+	} else {
+		/* We are in GC path. */
+		if (PagePreLocked(page))
+			super->s_lock_count++;
+		else
+			SetPagePreLocked(page);
+	}
+}
+
+static void preunlock_page(struct super_block *sb, struct page *page, int lock)
+{
+	struct logfs_super *super = logfs_super(sb);
+
+	BUG_ON(!PageLocked(page));
+	if (lock)
+		ClearPagePreLocked(page);
+	else {
+		/* We are in GC path. */
+		BUG_ON(!PagePreLocked(page));
+		if (super->s_lock_count)
+			super->s_lock_count--;
+		else
+			ClearPagePreLocked(page);
+	}
+}
+
+/*
+ * Logfs is prone to an AB-BA deadlock where one task tries to acquire
+ * s_write_mutex with a locked page and GC tries to get that page while holding
+ * s_write_mutex.
+ * To solve this issue logfs will ignore the page lock iff the page in question
+ * is waiting for s_write_mutex.  We annotate this fact by setting PG_pre_locked
+ * in addition to PG_locked.
+ */
+static void logfs_get_wblocks(struct super_block *sb, struct page *page,
+		int lock)
+{
+	struct logfs_super *super = logfs_super(sb);
+
+	if (page)
+		prelock_page(sb, page, lock);
+
+	if (lock) {
+		mutex_lock(&super->s_write_mutex);
+		logfs_gc_pass(sb);
+		/* FIXME: We also have to check for shadowed space
+		 * and mempool fill grade */
+	}
+}
+
+static void logfs_put_wblocks(struct super_block *sb, struct page *page,
+		int lock)
+{
+	struct logfs_super *super = logfs_super(sb);
+
+	if (page)
+		preunlock_page(sb, page, lock);
+	/* Order matters - we must clear PG_pre_locked before releasing
+	 * s_write_mutex or we could race against another task. */
+	if (lock)
+		mutex_unlock(&super->s_write_mutex);
+}
+
+static struct page *logfs_get_read_page(struct inode *inode, u64 bix,
+		level_t level)
+{
+	return find_or_create_page(inode->i_mapping,
+			logfs_pack_index(bix, level), GFP_NOFS);
+}
+
+static void logfs_put_read_page(struct page *page)
+{
+	unlock_page(page);
+	page_cache_release(page);
+}
+
+static void logfs_lock_write_page(struct page *page)
+{
+	int loop = 0;
+
+	while (unlikely(!trylock_page(page))) {
+		if (loop++ > 0x1000) {
+			/* Has been observed once so far... */
+			printk(KERN_ERR "stack at %p\n", &loop);
+			BUG();
+		}
+		if (PagePreLocked(page)) {
+			/* Holder of page lock is waiting for us, it
+			 * is safe to use this page. */
+			break;
+		}
+		/* Some other process has this page locked and has
+		 * nothing to do with us.  Wait for it to finish.
+		 */
+		schedule();
+	}
+	BUG_ON(!PageLocked(page));
+}
+
+static struct page *logfs_get_write_page(struct inode *inode, u64 bix,
+		level_t level)
+{
+	struct address_space *mapping = inode->i_mapping;
+	pgoff_t index = logfs_pack_index(bix, level);
+	struct page *page;
+	int err;
+
+repeat:
+	page = find_get_page(mapping, index);
+	if (!page) {
+		page = __page_cache_alloc(GFP_NOFS);
+		if (!page)
+			return NULL;
+		err = add_to_page_cache_lru(page, mapping, index, GFP_NOFS);
+		if (unlikely(err)) {
+			page_cache_release(page);
+			if (err == -EEXIST)
+				goto repeat;
+			return NULL;
+		}
+	} else logfs_lock_write_page(page);
+	BUG_ON(!PageLocked(page));
+	return page;
+}
+
+static void logfs_unlock_write_page(struct page *page)
+{
+	if (!PagePreLocked(page))
+		unlock_page(page);
+}
+
+static void logfs_put_write_page(struct page *page)
+{
+	logfs_unlock_write_page(page);
+	page_cache_release(page);
+}
+
+static struct page *logfs_get_page(struct inode *inode, u64 bix, level_t level,
+		int rw)
+{
+	if (rw == READ)
+		return logfs_get_read_page(inode, bix, level);
+	else
+		return logfs_get_write_page(inode, bix, level);
+}
+
+static void logfs_put_page(struct page *page, int rw)
+{
+	if (rw == READ)
+		logfs_put_read_page(page);
+	else
+		logfs_put_write_page(page);
+}
+
+static unsigned long __get_bits(u64 val, int skip, int no)
+{
+	u64 ret = val;
+
+	ret >>= skip * no;
+	ret <<= 64 - no;
+	ret >>= 64 - no;
+	return ret;
+}
+
+static unsigned long get_bits(u64 val, level_t skip)
+{
+	return __get_bits(val, (__force int)skip, LOGFS_BLOCK_BITS);
+}
+
+static inline void init_shadow_tree(struct super_block *sb,
+		struct shadow_tree *tree)
+{
+	struct logfs_super *super = logfs_super(sb);
+
+	btree_init_mempool64(&tree->new, super->s_btree_pool);
+	btree_init_mempool64(&tree->old, super->s_btree_pool);
+}
+
+static void indirect_write_block(struct logfs_block *block)
+{
+	struct page *page;
+	struct inode *inode;
+	int ret;
+
+	page = block->page;
+	inode = page->mapping->host;
+	logfs_lock_write_page(page);
+	ret = logfs_write_buf(inode, page, 0);
+	logfs_unlock_write_page(page);
+	/*
+	 * This needs some rework.  Unless you want your filesystem to run
+	 * completely synchronously (you don't), the filesystem will always
+	 * report writes as 'successful' before the actual work has been
+	 * done.  The actual work gets done here and this is where any errors
+	 * will show up.  And there isn't much we can do about it, really.
+	 *
+	 * Some attempts to fix the errors (move from bad blocks, retry io,...)
+	 * have already been done, so anything left should be either a broken
+	 * device or a bug somewhere in logfs itself.  Being relatively new,
+	 * the odds currently favor a bug, so for now the line below isn't
+	 * entirely tasteles.
+	 */
+	BUG_ON(ret);
+}
+
+static void inode_write_block(struct logfs_block *block)
+{
+	struct inode *inode;
+	int ret;
+
+	inode = block->inode;
+	if (inode->i_ino == LOGFS_INO_MASTER)
+		logfs_write_anchor(inode->i_sb);
+	else {
+		ret = __logfs_write_inode(inode, 0);
+		/* see indirect_write_block comment */
+		BUG_ON(ret);
+	}
+}
+
+/*
+ * This silences a false, yet annoying gcc warning.  I hate it when my editor
+ * jumps into bitops.h each time I recompile this file.
+ * TODO: Complain to gcc folks about this and upgrade compiler.
+ */
+static unsigned long fnb(const unsigned long *addr,
+		unsigned long size, unsigned long offset)
+{
+	return find_next_bit(addr, size, offset);
+}
+
+static __be64 inode_val0(struct inode *inode)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	u64 val;
+
+	/*
+	 * Explicit shifting generates good code, but must match the format
+	 * of the structure.  Add some paranoia just in case.
+	 */
+	BUILD_BUG_ON(offsetof(struct logfs_disk_inode, di_mode) != 0);
+	BUILD_BUG_ON(offsetof(struct logfs_disk_inode, di_height) != 2);
+	BUILD_BUG_ON(offsetof(struct logfs_disk_inode, di_flags) != 4);
+
+	val =	(u64)inode->i_mode << 48 |
+		(u64)li->li_height << 40 |
+		(u64)li->li_flags;
+	return cpu_to_be64(val);
+}
+
+static int inode_write_alias(struct super_block *sb,
+		struct logfs_block *block, write_alias_t *write_one_alias)
+{
+	struct inode *inode = block->inode;
+	struct logfs_inode *li = logfs_inode(inode);
+	unsigned long pos;
+	u64 ino , bix;
+	__be64 val;
+	level_t level;
+	int err;
+
+	for (pos = 0; ; pos++) {
+		pos = fnb(block->alias_map, LOGFS_BLOCK_FACTOR, pos);
+		if (pos >= LOGFS_EMBEDDED_FIELDS + INODE_POINTER_OFS)
+			return 0;
+
+		switch (pos) {
+		case INODE_HEIGHT_OFS:
+			val = inode_val0(inode);
+			break;
+		case INODE_USED_OFS:
+			val = cpu_to_be64(li->li_used_bytes);;
+			break;
+		case INODE_SIZE_OFS:
+			val = cpu_to_be64(i_size_read(inode));
+			break;
+		case INODE_POINTER_OFS ... INODE_POINTER_OFS + LOGFS_EMBEDDED_FIELDS - 1:
+			val = cpu_to_be64(li->li_data[pos - INODE_POINTER_OFS]);
+			break;
+		default:
+			BUG();
+		}
+
+		ino = LOGFS_INO_MASTER;
+		bix = inode->i_ino;
+		level = LEVEL(0);
+		err = write_one_alias(sb, ino, bix, level, pos, val);
+		if (err)
+			return err;
+	}
+}
+
+static int indirect_write_alias(struct super_block *sb,
+		struct logfs_block *block, write_alias_t *write_one_alias)
+{
+	unsigned long pos;
+	struct page *page = block->page;
+	u64 ino , bix;
+	__be64 *child, val;
+	level_t level;
+	int err;
+
+	for (pos = 0; ; pos++) {
+		pos = fnb(block->alias_map, LOGFS_BLOCK_FACTOR, pos);
+		if (pos >= LOGFS_BLOCK_FACTOR)
+			return 0;
+
+		ino = page->mapping->host->i_ino;
+		logfs_unpack_index(page->index, &bix, &level);
+		child = kmap_atomic(page, KM_USER0);
+		val = child[pos];
+		kunmap_atomic(child, KM_USER0);
+		err = write_one_alias(sb, ino, bix, level, pos, val);
+		if (err)
+			return err;
+	}
+}
+
+int logfs_write_obj_aliases_pagecache(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_block *block;
+	int err;
+
+	list_for_each_entry(block, &super->s_object_alias, alias_list) {
+		err = block->ops->write_alias(sb, block, write_alias_journal);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
+void __free_block(struct super_block *sb, struct logfs_block *block)
+{
+	BUG_ON(!list_empty(&block->item_list));
+	list_del(&block->alias_list);
+	mempool_free(block, logfs_super(sb)->s_block_pool);
+}
+
+static void inode_free_block(struct super_block *sb, struct logfs_block *block)
+{
+	struct inode *inode = block->inode;
+
+	logfs_inode(inode)->li_block = NULL;
+	__free_block(sb, block);
+}
+
+static void indirect_free_block(struct super_block *sb,
+		struct logfs_block *block)
+{
+	ClearPagePrivate(block->page);
+	block->page->private = 0;
+	__free_block(sb, block);
+}
+
+
+static struct logfs_block_ops inode_block_ops = {
+	.write_block = inode_write_block,
+	.free_block = inode_free_block,
+	.write_alias = inode_write_alias,
+};
+
+struct logfs_block_ops indirect_block_ops = {
+	.write_block = indirect_write_block,
+	.free_block = indirect_free_block,
+	.write_alias = indirect_write_alias,
+};
+
+struct logfs_block *__alloc_block(struct super_block *sb,
+		u64 ino, u64 bix, level_t level)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_block *block;
+
+	block = mempool_alloc(super->s_block_pool, GFP_NOFS);
+	memset(block, 0, sizeof(*block));
+	INIT_LIST_HEAD(&block->alias_list);
+	INIT_LIST_HEAD(&block->item_list);
+	block->sb = sb;
+	block->ino = ino;
+	block->bix = bix;
+	block->level = level;
+	return block;
+}
+
+static void alloc_inode_block(struct inode *inode)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	struct logfs_block *block;
+
+	if (li->li_block)
+		return;
+
+	block = __alloc_block(inode->i_sb, LOGFS_INO_MASTER, inode->i_ino, 0);
+	block->inode = inode;
+	li->li_block = block;
+	block->ops = &inode_block_ops;
+}
+
+void initialize_block_counters(struct page *page, struct logfs_block *block,
+		__be64 *array, int page_is_empty)
+{
+	u64 ptr;
+	int i, start;
+
+	block->partial = 0;
+	block->full = 0;
+	start = 0;
+	if (page->index < first_indirect_block()) {
+		/* Counters are pointless on level 0 */
+		return;
+	}
+	if (page->index == first_indirect_block()) {
+		/* Skip unused pointers */
+		start = I0_BLOCKS;
+		block->full = I0_BLOCKS;
+	}
+	if (!page_is_empty) {
+		for (i = start; i < LOGFS_BLOCK_FACTOR; i++) {
+			ptr = be64_to_cpu(array[i]);
+			if (ptr)
+				block->partial++;
+			if (ptr & LOGFS_FULLY_POPULATED)
+				block->full++;
+		}
+	}
+}
+
+static void alloc_data_block(struct inode *inode, struct page *page)
+{
+	struct logfs_block *block;
+	u64 bix;
+	level_t level;
+
+	if (PagePrivate(page))
+		return;
+
+	logfs_unpack_index(page->index, &bix, &level);
+	block = __alloc_block(inode->i_sb, inode->i_ino, bix, level);
+	block->page = page;
+	SetPagePrivate(page);
+	page->private = (unsigned long)block;
+	block->ops = &indirect_block_ops;
+}
+
+static void alloc_indirect_block(struct inode *inode, struct page *page,
+		int page_is_empty)
+{
+	struct logfs_block *block;
+	__be64 *array;
+
+	if (PagePrivate(page))
+		return;
+
+	alloc_data_block(inode, page);
+
+	block = logfs_block(page);
+	array = kmap_atomic(page, KM_USER0);
+	initialize_block_counters(page, block, array, page_is_empty);
+	kunmap_atomic(array, KM_USER0);
+}
+
+static void block_set_pointer(struct page *page, int index, u64 ptr)
+{
+	struct logfs_block *block = logfs_block(page);
+	__be64 *array;
+	u64 oldptr;
+
+	BUG_ON(!block);
+	array = kmap_atomic(page, KM_USER0);
+	oldptr = be64_to_cpu(array[index]);
+	array[index] = cpu_to_be64(ptr);
+	kunmap_atomic(array, KM_USER0);
+	SetPageUptodate(page);
+
+	block->full += !!(ptr & LOGFS_FULLY_POPULATED)
+		- !!(oldptr & LOGFS_FULLY_POPULATED);
+	block->partial += !!ptr - !!oldptr;
+}
+
+static u64 block_get_pointer(struct page *page, int index)
+{
+	__be64 *block;
+	u64 ptr;
+
+	block = kmap_atomic(page, KM_USER0);
+	ptr = be64_to_cpu(block[index]);
+	kunmap_atomic(block, KM_USER0);
+	return ptr;
+}
+
+static int logfs_read_empty(struct page *page)
+{
+	zero_user_segment(page, 0, PAGE_CACHE_SIZE);
+	return 0;
+}
+
+static int logfs_read_direct(struct inode *inode, struct page *page)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	pgoff_t index = page->index;
+	u64 block;
+
+	block = li->li_data[index];
+	if (!block)
+		return logfs_read_empty(page);
+
+	return logfs_segment_read(inode, page, block, index, 0);
+}
+
+static int logfs_read_loop(struct inode *inode, struct page *page,
+		int rw_context)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	u64 bix, bofs = li->li_data[INDIRECT_INDEX];
+	level_t level, target_level;
+	int ret;
+	struct page *ipage;
+
+	logfs_unpack_index(page->index, &bix, &target_level);
+	if (!bofs)
+		return logfs_read_empty(page);
+
+	if (bix >= maxbix(li->li_height))
+		return logfs_read_empty(page);
+
+	for (level = LEVEL(li->li_height);
+			(__force u8)level > (__force u8)target_level;
+			level = SUBLEVEL(level)){
+		ipage = logfs_get_page(inode, bix, level, rw_context);
+		if (!ipage)
+			return -ENOMEM;
+
+		ret = logfs_segment_read(inode, ipage, bofs, bix, level);
+		if (ret) {
+			logfs_put_read_page(ipage);
+			return ret;
+		}
+
+		bofs = block_get_pointer(ipage, get_bits(bix, SUBLEVEL(level)));
+		logfs_put_page(ipage, rw_context);
+		if (!bofs)
+			return logfs_read_empty(page);
+	}
+
+	return logfs_segment_read(inode, page, bofs, bix, 0);
+}
+
+static int logfs_read_block(struct inode *inode, struct page *page,
+		int rw_context)
+{
+	pgoff_t index = page->index;
+
+	if (index < I0_BLOCKS)
+		return logfs_read_direct(inode, page);
+	return logfs_read_loop(inode, page, rw_context);
+}
+
+static int logfs_exist_loop(struct inode *inode, u64 bix)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	u64 bofs = li->li_data[INDIRECT_INDEX];
+	level_t level;
+	int ret;
+	struct page *ipage;
+
+	if (!bofs)
+		return 0;
+	if (bix >= maxbix(li->li_height))
+		return 0;
+
+	for (level = LEVEL(li->li_height); level != 0; level = SUBLEVEL(level)) {
+		ipage = logfs_get_read_page(inode, bix, level);
+		if (!ipage)
+			return -ENOMEM;
+
+		ret = logfs_segment_read(inode, ipage, bofs, bix, level);
+		if (ret) {
+			logfs_put_read_page(ipage);
+			return ret;
+		}
+
+		bofs = block_get_pointer(ipage, get_bits(bix, SUBLEVEL(level)));
+		logfs_put_read_page(ipage);
+		if (!bofs)
+			return 0;
+	}
+
+	return 1;
+}
+
+int logfs_exist_block(struct inode *inode, u64 bix)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+
+	if (bix < I0_BLOCKS)
+		return !!li->li_data[bix];
+	return logfs_exist_loop(inode, bix);
+}
+
+static u64 seek_holedata_direct(struct inode *inode, u64 bix, int data)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+
+	for (; bix < I0_BLOCKS; bix++)
+		if (data ^ (li->li_data[bix] == 0))
+			return bix;
+	return I0_BLOCKS;
+}
+
+static u64 seek_holedata_loop(struct inode *inode, u64 bix, int data)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	__be64 *rblock;
+	u64 increment, bofs = li->li_data[INDIRECT_INDEX];
+	level_t level;
+	int ret, slot;
+	struct page *page;
+
+	BUG_ON(!bofs);
+
+	for (level = LEVEL(li->li_height); level != 0; level = SUBLEVEL(level)) {
+		increment = 1 << (LOGFS_BLOCK_BITS * ((__force u8)level-1));
+		page = logfs_get_read_page(inode, bix, level);
+		if (!page)
+			return bix;
+
+		ret = logfs_segment_read(inode, page, bofs, bix, level);
+		if (ret) {
+			logfs_put_read_page(page);
+			return bix;
+		}
+
+		slot = get_bits(bix, SUBLEVEL(level));
+		rblock = kmap_atomic(page, KM_USER0);
+		while (slot < LOGFS_BLOCK_FACTOR) {
+			if (data && (rblock[slot] != 0))
+				break;
+			if (!data && !(be64_to_cpu(rblock[slot]) & LOGFS_FULLY_POPULATED))
+				break;
+			slot++;
+			bix += increment;
+			bix &= ~(increment - 1);
+		}
+		if (slot >= LOGFS_BLOCK_FACTOR) {
+			kunmap_atomic(rblock, KM_USER0);
+			logfs_put_read_page(page);
+			return bix;
+		}
+		bofs = be64_to_cpu(rblock[slot]);
+		kunmap_atomic(rblock, KM_USER0);
+		logfs_put_read_page(page);
+		if (!bofs) {
+			BUG_ON(data);
+			return bix;
+		}
+	}
+	return bix;
+}
+
+/**
+ * logfs_seek_hole - find next hole starting at a given block index
+ * @inode:		inode to search in
+ * @bix:		block index to start searching
+ *
+ * Returns next hole.  If the file doesn't contain any further holes, the
+ * block address next to eof is returned instead.
+ */
+u64 logfs_seek_hole(struct inode *inode, u64 bix)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+
+	if (bix < I0_BLOCKS) {
+		bix = seek_holedata_direct(inode, bix, 0);
+		if (bix < I0_BLOCKS)
+			return bix;
+	}
+
+	if (!li->li_data[INDIRECT_INDEX])
+		return bix;
+	else if (li->li_data[INDIRECT_INDEX] & LOGFS_FULLY_POPULATED)
+		bix = maxbix(li->li_height);
+	else if (bix >= maxbix(li->li_height))
+		return bix;
+	else {
+		bix = seek_holedata_loop(inode, bix, 0);
+		if (bix < maxbix(li->li_height))
+			return bix;
+		/* Should not happen anymore.  But if some port writes semi-
+		 * corrupt images (as this one used to) we might run into it.
+		 */
+		WARN_ON_ONCE(bix == maxbix(li->li_height));
+	}
+
+	return bix;
+}
+
+static u64 __logfs_seek_data(struct inode *inode, u64 bix)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+
+	if (bix < I0_BLOCKS) {
+		bix = seek_holedata_direct(inode, bix, 1);
+		if (bix < I0_BLOCKS)
+			return bix;
+	}
+
+	if (bix < maxbix(li->li_height)) {
+		if (!li->li_data[INDIRECT_INDEX])
+			bix = maxbix(li->li_height);
+		else
+			return seek_holedata_loop(inode, bix, 1);
+	}
+
+	return bix;
+}
+
+/**
+ * logfs_seek_data - find next data block after a given block index
+ * @inode:		inode to search in
+ * @bix:		block index to start searching
+ *
+ * Returns next data block.  If the file doesn't contain any further data
+ * blocks, the last block in the file is returned instead.
+ */
+u64 logfs_seek_data(struct inode *inode, u64 bix)
+{
+	struct super_block *sb = inode->i_sb;
+	u64 ret, end;
+
+	ret = __logfs_seek_data(inode, bix);
+	end = i_size_read(inode) >> sb->s_blocksize_bits;
+	if (ret >= end)
+		ret = max(bix, end);
+	return ret;
+}
+
+static int logfs_is_valid_direct(struct logfs_inode *li, u64 bix, u64 ofs)
+{
+	return pure_ofs(li->li_data[bix]) == ofs;
+}
+
+static int __logfs_is_valid_loop(struct inode *inode, u64 bix,
+		u64 ofs, u64 bofs)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	level_t level;
+	int ret;
+	struct page *page;
+
+	for (level = LEVEL(li->li_height); level != 0; level = SUBLEVEL(level)){
+		page = logfs_get_write_page(inode, bix, level);
+		BUG_ON(!page);
+
+		ret = logfs_segment_read(inode, page, bofs, bix, level);
+		if (ret) {
+			logfs_put_write_page(page);
+			return 0;
+		}
+
+		bofs = block_get_pointer(page, get_bits(bix, SUBLEVEL(level)));
+		logfs_put_write_page(page);
+		if (!bofs)
+			return 0;
+
+		if (pure_ofs(bofs) == ofs)
+			return 1;
+	}
+	return 0;
+}
+
+static int logfs_is_valid_loop(struct inode *inode, u64 bix, u64 ofs)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	u64 bofs = li->li_data[INDIRECT_INDEX];
+
+	if (!bofs)
+		return 0;
+
+	if (bix >= maxbix(li->li_height))
+		return 0;
+
+	if (pure_ofs(bofs) == ofs)
+		return 1;
+
+	return __logfs_is_valid_loop(inode, bix, ofs, bofs);
+}
+
+static int __logfs_is_valid_block(struct inode *inode, u64 bix, u64 ofs)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+
+	if ((inode->i_nlink == 0) && atomic_read(&inode->i_count) == 1)
+		return 0;
+
+	if (bix < I0_BLOCKS)
+		return logfs_is_valid_direct(li, bix, ofs);
+	return logfs_is_valid_loop(inode, bix, ofs);
+}
+
+/**
+ * logfs_is_valid_block - check whether this block is still valid
+ *
+ * @sb	- superblock
+ * @ofs	- block physical offset
+ * @ino	- block inode number
+ * @bix	- block index
+ * @level - block level
+ *
+ * Returns 0 if the block is invalid, 1 if it is valid and 2 if it will
+ * become invalid once the journal is written.
+ */
+int logfs_is_valid_block(struct super_block *sb, u64 ofs, u64 ino, u64 bix,
+		gc_level_t gc_level)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct inode *inode;
+	int ret, cookie;
+
+	/* Umount closes a segment with free blocks remaining.  Those
+	 * blocks are by definition invalid. */
+	if (ino == -1)
+		return 0;
+
+	LOGFS_BUG_ON((u64)(u_long)ino != ino, sb);
+
+	inode = logfs_safe_iget(sb, ino, &cookie);
+	if (IS_ERR(inode))
+		goto invalid;
+
+	ret = __logfs_is_valid_block(inode, bix, ofs);
+	logfs_safe_iput(inode, cookie);
+	if (ret)
+		return ret;
+
+invalid:
+	/* Block is nominally invalid, but may still sit in the shadow tree,
+	 * waiting for a journal commit.
+	 */
+	if (btree_lookup64(&super->s_shadow_tree.old, ofs))
+		return 2;
+	return 0;
+}
+
+int logfs_readpage_nolock(struct page *page)
+{
+	struct inode *inode = page->mapping->host;
+	int ret = -EIO;
+
+	ret = logfs_read_block(inode, page, READ);
+
+	if (ret) {
+		ClearPageUptodate(page);
+		SetPageError(page);
+	} else {
+		SetPageUptodate(page);
+		ClearPageError(page);
+	}
+	flush_dcache_page(page);
+
+	return ret;
+}
+
+static int logfs_reserve_bytes(struct inode *inode, int bytes)
+{
+	struct logfs_super *super = logfs_super(inode->i_sb);
+	u64 available = super->s_free_bytes + super->s_dirty_free_bytes
+			- super->s_dirty_used_bytes - super->s_dirty_pages;
+
+	if (!bytes)
+		return 0;
+
+	if (available < bytes)
+		return -ENOSPC;
+
+	if (available < bytes + super->s_root_reserve &&
+			!capable(CAP_SYS_RESOURCE))
+		return -ENOSPC;
+
+	return 0;
+}
+
+int get_page_reserve(struct inode *inode, struct page *page)
+{
+	struct logfs_super *super = logfs_super(inode->i_sb);
+	struct logfs_block *block = logfs_block(page);
+	int ret;
+
+	if (block && block->reserved_bytes)
+		return 0;
+
+	logfs_get_wblocks(inode->i_sb, page, WF_LOCK);
+	while ((ret = logfs_reserve_bytes(inode, 6 * LOGFS_MAX_OBJECTSIZE)) &&
+			!list_empty(&super->s_writeback_list)) {
+		block = list_entry(super->s_writeback_list.next,
+				struct logfs_block, alias_list);
+		block->ops->write_block(block);
+	}
+	if (!ret) {
+		alloc_data_block(inode, page);
+		block = logfs_block(page);
+		block->reserved_bytes += 6 * LOGFS_MAX_OBJECTSIZE;
+		super->s_dirty_pages += 6 * LOGFS_MAX_OBJECTSIZE;
+		list_move_tail(&block->alias_list, &super->s_writeback_list);
+	}
+	logfs_put_wblocks(inode->i_sb, page, WF_LOCK);
+	return ret;
+}
+
+/*
+ * We are protected by write lock.  Push victims up to superblock level
+ * and release transaction when appropriate.
+ */
+/* FIXME: This is currently called from the wrong spots. */
+static void logfs_handle_transaction(struct inode *inode,
+		struct logfs_transaction *ta)
+{
+	struct logfs_super *super = logfs_super(inode->i_sb);
+
+	if (!ta)
+		return;
+	logfs_inode(inode)->li_block->ta = NULL;
+
+	if (inode->i_ino != LOGFS_INO_MASTER) {
+		BUG(); /* FIXME: Yes, this needs more thought */
+		/* just remember the transaction until inode is written */
+		//BUG_ON(logfs_inode(inode)->li_transaction);
+		//logfs_inode(inode)->li_transaction = ta;
+		return;
+	}
+
+	switch (ta->state) {
+	case CREATE_1: /* fall through */
+	case UNLINK_1:
+		BUG_ON(super->s_victim_ino);
+		super->s_victim_ino = ta->ino;
+		break;
+	case CREATE_2: /* fall through */
+	case UNLINK_2:
+		BUG_ON(super->s_victim_ino != ta->ino);
+		super->s_victim_ino = 0;
+		/* transaction ends here - free it */
+		kfree(ta);
+		break;
+	case CROSS_RENAME_1:
+		BUG_ON(super->s_rename_dir);
+		BUG_ON(super->s_rename_pos);
+		super->s_rename_dir = ta->dir;
+		super->s_rename_pos = ta->pos;
+		break;
+	case CROSS_RENAME_2:
+		BUG_ON(super->s_rename_dir != ta->dir);
+		BUG_ON(super->s_rename_pos != ta->pos);
+		super->s_rename_dir = 0;
+		super->s_rename_pos = 0;
+		kfree(ta);
+		break;
+	case TARGET_RENAME_1:
+		BUG_ON(super->s_rename_dir);
+		BUG_ON(super->s_rename_pos);
+		BUG_ON(super->s_victim_ino);
+		super->s_rename_dir = ta->dir;
+		super->s_rename_pos = ta->pos;
+		super->s_victim_ino = ta->ino;
+		break;
+	case TARGET_RENAME_2:
+		BUG_ON(super->s_rename_dir != ta->dir);
+		BUG_ON(super->s_rename_pos != ta->pos);
+		BUG_ON(super->s_victim_ino != ta->ino);
+		super->s_rename_dir = 0;
+		super->s_rename_pos = 0;
+		break;
+	case TARGET_RENAME_3:
+		BUG_ON(super->s_rename_dir);
+		BUG_ON(super->s_rename_pos);
+		BUG_ON(super->s_victim_ino != ta->ino);
+		super->s_victim_ino = 0;
+		kfree(ta);
+		break;
+	default:
+		BUG();
+	}
+}
+
+/*
+ * Not strictly a reservation, but rather a check that we still have enough
+ * space to satisfy the write.
+ */
+static int logfs_reserve_blocks(struct inode *inode, int blocks)
+{
+	return logfs_reserve_bytes(inode, blocks * LOGFS_MAX_OBJECTSIZE);
+}
+
+struct write_control {
+	u64 ofs;
+	long flags;
+};
+
+static struct logfs_shadow *alloc_shadow(struct inode *inode, u64 bix,
+		level_t level, u64 old_ofs)
+{
+	struct logfs_super *super = logfs_super(inode->i_sb);
+	struct logfs_shadow *shadow;
+
+	shadow = mempool_alloc(super->s_shadow_pool, GFP_NOFS);
+	memset(shadow, 0, sizeof(*shadow));
+	shadow->ino = inode->i_ino;
+	shadow->bix = bix;
+	shadow->gc_level = expand_level(inode->i_ino, level);
+	shadow->old_ofs = old_ofs & ~LOGFS_FULLY_POPULATED;
+	return shadow;
+}
+
+static void free_shadow(struct inode *inode, struct logfs_shadow *shadow)
+{
+	struct logfs_super *super = logfs_super(inode->i_sb);
+
+	mempool_free(shadow, super->s_shadow_pool);
+}
+
+static void mark_segment(struct shadow_tree *tree, u32 segno)
+{
+	int err;
+
+	if (!btree_lookup32(&tree->segment_map, segno)) {
+		err = btree_insert32(&tree->segment_map, segno, (void *)1,
+				GFP_NOFS);
+		BUG_ON(err);
+		tree->no_shadowed_segments++;
+	}
+}
+
+/**
+ * fill_shadow_tree - Propagate shadow tree changes due to a write
+ * @inode:	Inode owning the page
+ * @page:	Struct page that was written
+ * @shadow:	Shadow for the current write
+ *
+ * Writes in logfs can result in two semi-valid objects.  The old object
+ * is still valid as long as it can be reached by following pointers on
+ * the medium.  Only when writes propagate all the way up to the journal
+ * has the new object safely replaced the old one.
+ *
+ * To handle this problem, a struct logfs_shadow is used to represent
+ * every single write.  It is attached to the indirect block, which is
+ * marked dirty.  When the indirect block is written, its shadows are
+ * handed up to the next indirect block (or inode).  Untimately they
+ * will reach the master inode and be freed upon journal commit.
+ *
+ * This function handles a single step in the propagation.  It adds the
+ * shadow for the current write to the tree, along with any shadows in
+ * the page's tree, in case it was an indirect block.  If a page is
+ * written, the inode parameter is left NULL, if an inode is written,
+ * the page parameter is left NULL.
+ */
+static void fill_shadow_tree(struct inode *inode, struct page *page,
+		struct logfs_shadow *shadow)
+{
+	struct logfs_super *super = logfs_super(inode->i_sb);
+	struct logfs_block *block = logfs_block(page);
+	struct shadow_tree *tree = &super->s_shadow_tree;
+
+	if (PagePrivate(page)) {
+		if (block->alias_map)
+			super->s_no_object_aliases -= bitmap_weight(
+					block->alias_map, LOGFS_BLOCK_FACTOR);
+		logfs_handle_transaction(inode, block->ta);
+		block->ops->free_block(inode->i_sb, block);
+	}
+	if (shadow) {
+		if (shadow->old_ofs)
+			btree_insert64(&tree->old, shadow->old_ofs, shadow,
+					GFP_NOFS);
+		else
+			btree_insert64(&tree->new, shadow->new_ofs, shadow,
+					GFP_NOFS);
+
+		super->s_dirty_used_bytes += shadow->new_len;
+		super->s_dirty_free_bytes += shadow->old_len;
+		mark_segment(tree, shadow->old_ofs >> super->s_segshift);
+		mark_segment(tree, shadow->new_ofs >> super->s_segshift);
+	}
+}
+
+static void logfs_set_alias(struct super_block *sb, struct logfs_block *block,
+		long child_no)
+{
+	struct logfs_super *super = logfs_super(sb);
+
+	if (block->inode && block->inode->i_ino == LOGFS_INO_MASTER) {
+		/* Aliases in the master inode are pointless. */
+		return;
+	}
+
+	if (!test_bit(child_no, block->alias_map)) {
+		set_bit(child_no, block->alias_map);
+		super->s_no_object_aliases++;
+	}
+	list_move_tail(&block->alias_list, &super->s_object_alias);
+}
+
+/*
+ * Object aliases can and often do change the size and occupied space of a
+ * file.  So not only do we have to change the pointers, we also have to
+ * change inode->i_size and li->li_used_bytes.  Which is done by setting
+ * another two object aliases for the inode itself.
+ */
+static void set_iused(struct inode *inode, struct logfs_shadow *shadow)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+
+	if (shadow->new_len == shadow->old_len)
+		return;
+
+	alloc_inode_block(inode);
+	li->li_used_bytes += shadow->new_len - shadow->old_len;
+	__logfs_set_blocks(inode);
+	logfs_set_alias(inode->i_sb, li->li_block, INODE_USED_OFS);
+	logfs_set_alias(inode->i_sb, li->li_block, INODE_SIZE_OFS);
+}
+
+static int logfs_write_i0(struct inode *inode, struct page *page,
+		struct write_control *wc)
+{
+	struct logfs_shadow *shadow;
+	u64 bix;
+	level_t level;
+	int full, err = 0;
+
+	logfs_unpack_index(page->index, &bix, &level);
+	if (wc->ofs == 0)
+		if (logfs_reserve_blocks(inode, 1))
+			return -ENOSPC;
+
+	shadow = alloc_shadow(inode, bix, level, wc->ofs);
+	if (wc->flags & WF_WRITE)
+		err = logfs_segment_write(inode, page, shadow);
+	if (wc->flags & WF_DELETE)
+		logfs_segment_delete(inode, shadow);
+	if (err) {
+		free_shadow(inode, shadow);
+		return err;
+	}
+
+	set_iused(inode, shadow);
+	full = 1;
+	if (level != 0) {
+		alloc_indirect_block(inode, page, 0);
+		full = logfs_block(page)->full == LOGFS_BLOCK_FACTOR;
+	}
+	fill_shadow_tree(inode, page, shadow);
+	wc->ofs = shadow->new_ofs;
+	if (wc->ofs && full)
+		wc->ofs |= LOGFS_FULLY_POPULATED;
+	return 0;
+}
+
+static int logfs_write_direct(struct inode *inode, struct page *page,
+		long flags)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	struct write_control wc = {
+		.ofs = li->li_data[page->index],
+		.flags = flags,
+	};
+	int err;
+
+	alloc_inode_block(inode);
+
+	err = logfs_write_i0(inode, page, &wc);
+	if (err)
+		return err;
+
+	li->li_data[page->index] = wc.ofs;
+	logfs_set_alias(inode->i_sb, li->li_block,
+			page->index + INODE_POINTER_OFS);
+	return 0;
+}
+
+static int ptr_change(u64 ofs, struct page *page)
+{
+	struct logfs_block *block = logfs_block(page);
+	int empty0, empty1, full0, full1;
+
+	empty0 = ofs == 0;
+	empty1 = block->partial == 0;
+	if (empty0 != empty1)
+		return 1;
+
+	/* The !! is necessary to shrink result to int */
+	full0 = !!(ofs & LOGFS_FULLY_POPULATED);
+	full1 = block->full == LOGFS_BLOCK_FACTOR;
+	if (full0 != full1)
+		return 1;
+	return 0;
+}
+
+static int __logfs_write_rec(struct inode *inode, struct page *page,
+		struct write_control *this_wc,
+		pgoff_t bix, level_t target_level, level_t level)
+{
+	int ret, page_empty = 0;
+	int child_no = get_bits(bix, SUBLEVEL(level));
+	struct page *ipage;
+	struct write_control child_wc = {
+		.flags = this_wc->flags,
+	};
+
+	ipage = logfs_get_write_page(inode, bix, level);
+	if (!ipage)
+		return -ENOMEM;
+
+	if (this_wc->ofs) {
+		ret = logfs_segment_read(inode, ipage, this_wc->ofs, bix, level);
+		if (ret)
+			goto out;
+	} else if (!PageUptodate(ipage)) {
+		page_empty = 1;
+		logfs_read_empty(ipage);
+	}
+
+	child_wc.ofs = block_get_pointer(ipage, child_no);
+
+	if ((__force u8)level-1 > (__force u8)target_level)
+		ret = __logfs_write_rec(inode, page, &child_wc, bix,
+				target_level, SUBLEVEL(level));
+	else
+		ret = logfs_write_i0(inode, page, &child_wc);
+
+	if (ret)
+		goto out;
+
+	alloc_indirect_block(inode, ipage, page_empty);
+	block_set_pointer(ipage, child_no, child_wc.ofs);
+	/* FIXME: first condition seems superfluous */
+	if (child_wc.ofs || logfs_block(ipage)->partial)
+		this_wc->flags |= WF_WRITE;
+	/* the condition on this_wc->ofs ensures that we won't consume extra
+	 * space for indirect blocks in the future, which we cannot reserve */
+	if (!this_wc->ofs || ptr_change(this_wc->ofs, ipage))
+		ret = logfs_write_i0(inode, ipage, this_wc);
+	else
+		logfs_set_alias(inode->i_sb, logfs_block(ipage), child_no);
+out:
+	logfs_put_write_page(ipage);
+	return ret;
+}
+
+static int logfs_write_rec(struct inode *inode, struct page *page,
+		pgoff_t bix, level_t target_level, long flags)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	struct write_control wc = {
+		.ofs = li->li_data[INDIRECT_INDEX],
+		.flags = flags,
+	};
+	int ret;
+
+	alloc_inode_block(inode);
+
+	if (li->li_height > (__force u8)target_level)
+		ret = __logfs_write_rec(inode, page, &wc, bix, target_level,
+				LEVEL(li->li_height));
+	else
+		ret = logfs_write_i0(inode, page, &wc);
+	if (ret)
+		return ret;
+
+	if (li->li_data[INDIRECT_INDEX] != wc.ofs) {
+		li->li_data[INDIRECT_INDEX] = wc.ofs;
+		logfs_set_alias(inode->i_sb, li->li_block,
+				INDIRECT_INDEX + INODE_POINTER_OFS);
+	}
+	return ret;
+}
+
+void logfs_add_transaction(struct inode *inode, struct logfs_transaction *ta)
+{
+	alloc_inode_block(inode);
+	logfs_inode(inode)->li_block->ta = ta;
+}
+
+void logfs_del_transaction(struct inode *inode, struct logfs_transaction *ta)
+{
+	struct logfs_block *block = logfs_inode(inode)->li_block;
+
+	if (block && block->ta)
+		block->ta = NULL;
+}
+
+static int grow_inode(struct inode *inode, u64 bix, level_t level)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	u8 height = (__force u8)level;
+	struct page *page;
+	struct write_control wc = {
+		.flags = WF_WRITE,
+	};
+	int err;
+
+	BUG_ON(height > 5 || li->li_height > 5);
+	while (height > li->li_height || bix >= maxbix(li->li_height)) {
+		page = logfs_get_write_page(inode, I0_BLOCKS + 1,
+				LEVEL(li->li_height + 1));
+		if (!page)
+			return -ENOMEM;
+		logfs_read_empty(page);
+		alloc_indirect_block(inode, page, 1);
+		block_set_pointer(page, 0, li->li_data[INDIRECT_INDEX]);
+		err = logfs_write_i0(inode, page, &wc);
+		logfs_put_write_page(page);
+		if (err)
+			return err;
+		li->li_data[INDIRECT_INDEX] = wc.ofs;
+		wc.ofs = 0;
+		li->li_height++;
+		logfs_set_alias(inode->i_sb, li->li_block, INODE_HEIGHT_OFS);
+	}
+	return 0;
+}
+
+static int __logfs_write_buf(struct inode *inode, struct page *page, long flags)
+{
+	struct logfs_super *super = logfs_super(inode->i_sb);
+	pgoff_t index = page->index;
+	u64 bix;
+	level_t level;
+	int err;
+
+	flags |= WF_WRITE | WF_DELETE;
+	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+
+	logfs_unpack_index(index, &bix, &level);
+	if (logfs_block(page) && logfs_block(page)->reserved_bytes)
+		super->s_dirty_pages -= logfs_block(page)->reserved_bytes;
+
+	if (index < I0_BLOCKS)
+		return logfs_write_direct(inode, page, flags);
+
+	bix = adjust_bix(bix, level);
+	err = grow_inode(inode, bix, level);
+	if (err)
+		return err;
+	return logfs_write_rec(inode, page, bix, level, flags);
+}
+
+int logfs_write_buf(struct inode *inode, struct page *page, long flags)
+{
+	struct super_block *sb = inode->i_sb;
+	int ret;
+
+	logfs_get_wblocks(sb, page, flags & WF_LOCK);
+	ret = __logfs_write_buf(inode, page, flags);
+	logfs_put_wblocks(sb, page, flags & WF_LOCK);
+	return ret;
+}
+
+static int __logfs_delete(struct inode *inode, struct page *page)
+{
+	long flags = WF_DELETE;
+
+	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+
+	if (page->index < I0_BLOCKS)
+		return logfs_write_direct(inode, page, flags);
+	return logfs_write_rec(inode, page, page->index, 0, flags);
+}
+
+int logfs_delete(struct inode *inode, pgoff_t index,
+		struct shadow_tree *shadow_tree)
+{
+	struct super_block *sb = inode->i_sb;
+	struct page *page;
+	int ret;
+
+	page = logfs_get_read_page(inode, index, 0);
+	if (!page)
+		return -ENOMEM;
+
+	logfs_get_wblocks(sb, page, 1);
+	ret = __logfs_delete(inode, page);
+	logfs_put_wblocks(sb, page, 1);
+
+	logfs_put_read_page(page);
+
+	return ret;
+}
+
+int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs,
+		gc_level_t gc_level, long flags)
+{
+	level_t level = shrink_level(gc_level);
+	struct page *page;
+	int err;
+
+	page = logfs_get_write_page(inode, bix, level);
+	if (!page)
+		return -ENOMEM;
+
+	err = logfs_segment_read(inode, page, ofs, bix, level);
+	if (!err) {
+		if (level != 0)
+			alloc_indirect_block(inode, page, 0);
+		err = logfs_write_buf(inode, page, flags);
+		if (!err && shrink_level(gc_level) == 0) {
+			/* Rewrite cannot mark the inode dirty but has to
+			 * write it immediatly.
+			 * Q: Can't we just create an alias for the inode
+			 * instead?  And if not, why not?
+			 */
+			if (inode->i_ino == LOGFS_INO_MASTER)
+				logfs_write_anchor(inode->i_sb);
+			else {
+				err = __logfs_write_inode(inode, flags);
+			}
+		}
+	}
+	logfs_put_write_page(page);
+	return err;
+}
+
+static int truncate_data_block(struct inode *inode, struct page *page,
+		u64 ofs, struct logfs_shadow *shadow, u64 size)
+{
+	loff_t pageofs = page->index << inode->i_sb->s_blocksize_bits;
+	u64 bix;
+	level_t level;
+	int err;
+
+	/* Does truncation happen within this page? */
+	if (size <= pageofs || size - pageofs >= PAGE_SIZE)
+		return 0;
+
+	logfs_unpack_index(page->index, &bix, &level);
+	BUG_ON(level != 0);
+
+	err = logfs_segment_read(inode, page, ofs, bix, level);
+	if (err)
+		return err;
+
+	zero_user_segment(page, size - pageofs, PAGE_CACHE_SIZE);
+	return logfs_segment_write(inode, page, shadow);
+}
+
+static int logfs_truncate_i0(struct inode *inode, struct page *page,
+		struct write_control *wc, u64 size)
+{
+	struct logfs_shadow *shadow;
+	u64 bix;
+	level_t level;
+	int err = 0;
+
+	logfs_unpack_index(page->index, &bix, &level);
+	BUG_ON(level != 0);
+	shadow = alloc_shadow(inode, bix, level, wc->ofs);
+
+	err = truncate_data_block(inode, page, wc->ofs, shadow, size);
+	if (err) {
+		free_shadow(inode, shadow);
+		return err;
+	}
+
+	logfs_segment_delete(inode, shadow);
+	set_iused(inode, shadow);
+	fill_shadow_tree(inode, page, shadow);
+	wc->ofs = shadow->new_ofs;
+	return 0;
+}
+
+static int logfs_truncate_direct(struct inode *inode, u64 size)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	struct write_control wc;
+	struct page *page;
+	int e;
+	int err;
+
+	alloc_inode_block(inode);
+
+	for (e = I0_BLOCKS - 1; e >= 0; e--) {
+		if (size > (e+1) * LOGFS_BLOCKSIZE)
+			break;
+
+		wc.ofs = li->li_data[e];
+		if (!wc.ofs)
+			continue;
+
+		page = logfs_get_write_page(inode, e, 0);
+		if (!page)
+			return -ENOMEM;
+		err = logfs_segment_read(inode, page, wc.ofs, e, 0);
+		if (err) {
+			logfs_put_write_page(page);
+			return err;
+		}
+		err = logfs_truncate_i0(inode, page, &wc, size);
+		logfs_put_write_page(page);
+		if (err)
+			return err;
+
+		li->li_data[e] = wc.ofs;
+	}
+	return 0;
+}
+
+/* FIXME: these need to become per-sb once we support different blocksizes */
+static u64 __logfs_step[] = {
+	1,
+	I1_BLOCKS,
+	I2_BLOCKS,
+	I3_BLOCKS,
+};
+
+static u64 __logfs_start_index[] = {
+	I0_BLOCKS,
+	I1_BLOCKS,
+	I2_BLOCKS,
+	I3_BLOCKS
+};
+
+static inline u64 logfs_step(level_t level)
+{
+	return __logfs_step[(__force u8)level];
+}
+
+static inline u64 logfs_factor(u8 level)
+{
+	return __logfs_step[level] * LOGFS_BLOCKSIZE;
+}
+
+static inline u64 logfs_start_index(level_t level)
+{
+	return __logfs_start_index[(__force u8)level];
+}
+
+static void logfs_unpack_raw_index(pgoff_t index, u64 *bix, level_t *level)
+{
+	logfs_unpack_index(index, bix, level);
+	if (*bix <= logfs_start_index(SUBLEVEL(*level)))
+		*bix = 0;
+}
+
+static int __logfs_truncate_rec(struct inode *inode, struct page *ipage,
+		struct write_control *this_wc, u64 size)
+{
+	int truncate_happened = 0;
+	int e, err = 0;
+	u64 bix, child_bix, next_bix;
+	level_t level;
+	struct page *page;
+	struct write_control child_wc = { /* FIXME: flags */ };
+
+	logfs_unpack_raw_index(ipage->index, &bix, &level);
+	err = logfs_segment_read(inode, ipage, this_wc->ofs, bix, level);
+	if (err)
+		return err;
+
+	for (e = LOGFS_BLOCK_FACTOR - 1; e >= 0; e--) {
+		child_bix = bix + e * logfs_step(SUBLEVEL(level));
+		next_bix = child_bix + logfs_step(SUBLEVEL(level));
+		if (size > next_bix * LOGFS_BLOCKSIZE)
+			break;
+
+		child_wc.ofs = pure_ofs(block_get_pointer(ipage, e));
+		if (!child_wc.ofs)
+			continue;
+
+		page = logfs_get_write_page(inode, child_bix, SUBLEVEL(level));
+		if (!page)
+			return -ENOMEM;
+
+		if ((__force u8)level > 1)
+			err = __logfs_truncate_rec(inode, page, &child_wc, size);
+		else
+			err = logfs_truncate_i0(inode, page, &child_wc, size);
+		logfs_put_write_page(page);
+		if (err)
+			return err;
+
+		truncate_happened = 1;
+		alloc_indirect_block(inode, ipage, 0);
+		block_set_pointer(ipage, e, child_wc.ofs);
+	}
+
+	if (!truncate_happened) {
+		printk("ineffectual truncate (%lx, %lx, %llx)\n", inode->i_ino, ipage->index, size);
+		return 0;
+	}
+
+	this_wc->flags = WF_DELETE;
+	if (logfs_block(ipage)->partial)
+		this_wc->flags |= WF_WRITE;
+
+	return logfs_write_i0(inode, ipage, this_wc);
+}
+
+static int logfs_truncate_rec(struct inode *inode, u64 size)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	struct write_control wc = {
+		.ofs = li->li_data[INDIRECT_INDEX],
+	};
+	struct page *page;
+	int err;
+
+	alloc_inode_block(inode);
+
+	if (!wc.ofs)
+		return 0;
+
+	page = logfs_get_write_page(inode, 0, LEVEL(li->li_height));
+	if (!page)
+		return -ENOMEM;
+
+	err = __logfs_truncate_rec(inode, page, &wc, size);
+	logfs_put_write_page(page);
+	if (err)
+		return err;
+
+	if (li->li_data[INDIRECT_INDEX] != wc.ofs)
+		li->li_data[INDIRECT_INDEX] = wc.ofs;
+	return 0;
+}
+
+static int __logfs_truncate(struct inode *inode, u64 size)
+{
+	int ret;
+
+	if (size >= logfs_factor(logfs_inode(inode)->li_height))
+		return 0;
+
+	ret = logfs_truncate_rec(inode, size);
+	if (ret)
+		return ret;
+
+	return logfs_truncate_direct(inode, size);
+}
+
+/*
+ * Truncate, by changing the segment file, can consume a fair amount
+ * of resources.  So back off from time to time and do some GC.
+ * 8 or 2048 blocks should be well within safety limits even if
+ * every single block resided in a different segment.
+ */
+#define TRUNCATE_STEP	(8 * 1024 * 1024)
+int logfs_truncate(struct inode *inode, u64 target)
+{
+	struct super_block *sb = inode->i_sb;
+	u64 size = i_size_read(inode);
+	int err = 0;
+
+	size = ALIGN(size, TRUNCATE_STEP);
+	while (size > target) {
+		if (size > TRUNCATE_STEP)
+			size -= TRUNCATE_STEP;
+		else
+			size = 0;
+		if (size < target)
+			size = target;
+
+		logfs_get_wblocks(sb, NULL, 1);
+		err = __logfs_truncate(inode, size);
+		if (!err)
+			err = __logfs_write_inode(inode, 0);
+		logfs_put_wblocks(sb, NULL, 1);
+	}
+
+	if (!err)
+		err = vmtruncate(inode, target);
+
+	/* I don't trust error recovery yet. */
+	WARN_ON(err);
+	return err;
+}
+
+static void move_page_to_inode(struct inode *inode, struct page *page)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	struct logfs_block *block = logfs_block(page);
+
+	if (!block)
+		return;
+
+	log_blockmove("move_page_to_inode(%llx, %llx, %x)\n",
+			block->ino, block->bix, block->level);
+	BUG_ON(li->li_block);
+	block->ops = &inode_block_ops;
+	block->inode = inode;
+	li->li_block = block;
+
+	block->page = NULL;
+	page->private = 0;
+	ClearPagePrivate(page);
+}
+
+static void move_inode_to_page(struct page *page, struct inode *inode)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	struct logfs_block *block = li->li_block;
+
+	if (!block)
+		return;
+
+	log_blockmove("move_inode_to_page(%llx, %llx, %x)\n",
+			block->ino, block->bix, block->level);
+	BUG_ON(PagePrivate(page));
+	block->ops = &indirect_block_ops;
+	block->page = page;
+	page->private = (unsigned long)block;
+	SetPagePrivate(page);
+
+	block->inode = NULL;
+	li->li_block = NULL;
+}
+
+int logfs_read_inode(struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+	struct logfs_super *super = logfs_super(sb);
+	struct inode *master_inode = super->s_master_inode;
+	struct page *page;
+	struct logfs_disk_inode *di;
+	u64 ino = inode->i_ino;
+
+	if (ino << sb->s_blocksize_bits > i_size_read(master_inode))
+		return -ENODATA;
+	if (!logfs_exist_block(master_inode, ino))
+		return -ENODATA;
+
+	page = read_cache_page(master_inode->i_mapping, ino,
+			(filler_t *)logfs_readpage, NULL);
+	if (IS_ERR(page))
+		return PTR_ERR(page);
+
+	di = kmap_atomic(page, KM_USER0);
+	logfs_disk_to_inode(di, inode);
+	kunmap_atomic(di, KM_USER0);
+	move_page_to_inode(inode, page);
+	page_cache_release(page);
+	return 0;
+}
+
+/* Caller must logfs_put_write_page(page); */
+static struct page *inode_to_page(struct inode *inode)
+{
+	struct inode *master_inode = logfs_super(inode->i_sb)->s_master_inode;
+	struct logfs_disk_inode *di;
+	struct page *page;
+
+	BUG_ON(inode->i_ino == LOGFS_INO_MASTER);
+
+	page = logfs_get_write_page(master_inode, inode->i_ino, 0);
+	if (!page)
+		return NULL;
+
+	di = kmap_atomic(page, KM_USER0);
+	logfs_inode_to_disk(inode, di);
+	kunmap_atomic(di, KM_USER0);
+	move_inode_to_page(page, inode);
+	return page;
+}
+
+/* Cheaper version of write_inode.  All changes are concealed in
+ * aliases, which are moved back.  No write to the medium happens.
+ */
+void logfs_clear_inode(struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+	struct logfs_inode *li = logfs_inode(inode);
+	struct logfs_block *block = li->li_block;
+	struct page *page;
+
+	/* Only deleted files may be dirty at this point */
+	BUG_ON(inode->i_state & I_DIRTY && inode->i_nlink);
+	if (!block)
+		return;
+	if ((logfs_super(sb)->s_flags & LOGFS_SB_FLAG_SHUTDOWN)) {
+		block->ops->free_block(inode->i_sb, block);
+		return;
+	}
+
+	BUG_ON(inode->i_ino < LOGFS_RESERVED_INOS);
+	page = inode_to_page(inode);
+	BUG_ON(!page); /* FIXME: Use emergency page */
+	logfs_put_write_page(page);
+}
+
+static int do_write_inode(struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+	struct inode *master_inode = logfs_super(sb)->s_master_inode;
+	loff_t size = (inode->i_ino + 1) << inode->i_sb->s_blocksize_bits;
+	struct page *page;
+	int err;
+
+	BUG_ON(inode->i_ino == LOGFS_INO_MASTER);
+	/* FIXME: lock inode */
+
+	if (i_size_read(master_inode) < size)
+		i_size_write(master_inode, size);
+
+	/* TODO: Tell vfs this inode is clean now */
+
+	page = inode_to_page(inode);
+	if (!page)
+		return -ENOMEM;
+
+	/* FIXME: transaction is part of logfs_block now.  Is that enough? */
+	err = logfs_write_buf(master_inode, page, 0);
+	logfs_put_write_page(page);
+	return err;
+}
+
+static void logfs_mod_segment_entry(struct super_block *sb, u32 segno,
+		int write,
+		void (*change_se)(struct logfs_segment_entry *, long),
+		long arg)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct inode *inode;
+	struct page *page;
+	struct logfs_segment_entry *se;
+	pgoff_t page_no;
+	int child_no;
+
+	page_no = segno >> (sb->s_blocksize_bits - 3);
+	child_no = segno & ((sb->s_blocksize >> 3) - 1);
+
+	inode = super->s_segfile_inode;
+	page = logfs_get_write_page(inode, page_no, 0);
+	BUG_ON(!page); /* FIXME: We need some reserve page for this case */
+	if (!PageUptodate(page))
+		logfs_read_block(inode, page, WRITE);
+
+	if (write)
+		alloc_indirect_block(inode, page, 0);
+	se = kmap_atomic(page, KM_USER0);
+	change_se(se + child_no, arg);
+	if (write) {
+		logfs_set_alias(sb, logfs_block(page), child_no);
+		BUG_ON((int)be32_to_cpu(se[child_no].valid) > super->s_segsize);
+	}
+	kunmap_atomic(se, KM_USER0);
+
+	logfs_put_write_page(page);
+}
+
+static void __get_segment_entry(struct logfs_segment_entry *se, long _target)
+{
+	struct logfs_segment_entry *target = (void *)_target;
+
+	*target = *se;
+}
+
+void logfs_get_segment_entry(struct super_block *sb, u32 segno,
+		struct logfs_segment_entry *se)
+{
+	logfs_mod_segment_entry(sb, segno, 0, __get_segment_entry, (long)se);
+}
+
+static void __set_segment_used(struct logfs_segment_entry *se, long increment)
+{
+	u32 valid;
+
+	valid = be32_to_cpu(se->valid);
+	valid += increment;
+	se->valid = cpu_to_be32(valid);
+}
+
+void logfs_set_segment_used(struct super_block *sb, u64 ofs, int increment)
+{
+	struct logfs_super *super = logfs_super(sb);
+	u32 segno = ofs >> super->s_segshift;
+
+	if (!increment)
+		return;
+
+	logfs_mod_segment_entry(sb, segno, 1, __set_segment_used, increment);
+}
+
+static void __set_segment_erased(struct logfs_segment_entry *se, long ec_level)
+{
+	se->ec_level = cpu_to_be32(ec_level);
+}
+
+void logfs_set_segment_erased(struct super_block *sb, u32 segno, u32 ec,
+		gc_level_t gc_level)
+{
+	u32 ec_level = ec << 4 | (__force u8)gc_level;
+
+	logfs_mod_segment_entry(sb, segno, 1, __set_segment_erased, ec_level);
+}
+
+static void __set_segment_reserved(struct logfs_segment_entry *se, long ignore)
+{
+	se->valid = cpu_to_be32(RESERVED);
+}
+
+void logfs_set_segment_reserved(struct super_block *sb, u32 segno)
+{
+	logfs_mod_segment_entry(sb, segno, 1, __set_segment_reserved, 0);
+}
+
+static void __set_segment_unreserved(struct logfs_segment_entry *se,
+		long ec_level)
+{
+	se->valid = 0;
+	se->ec_level = cpu_to_be32(ec_level);
+}
+
+void logfs_set_segment_unreserved(struct super_block *sb, u32 segno, u32 ec)
+{
+	u32 ec_level = ec << 4;
+
+	logfs_mod_segment_entry(sb, segno, 1, __set_segment_unreserved,
+			ec_level);
+}
+
+int __logfs_write_inode(struct inode *inode, long flags)
+{
+	struct super_block *sb = inode->i_sb;
+	int ret;
+
+	logfs_get_wblocks(sb, NULL, flags & WF_LOCK);
+	ret = do_write_inode(inode);
+	logfs_put_wblocks(sb, NULL, flags & WF_LOCK);
+	return ret;
+}
+
+static int do_delete_inode(struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+	struct inode *master_inode = logfs_super(sb)->s_master_inode;
+	struct page *page;
+	int ret;
+
+	page = logfs_get_write_page(master_inode, inode->i_ino, 0);
+	if (!page)
+		return -ENOMEM;
+
+	move_inode_to_page(page, inode);
+
+	logfs_get_wblocks(sb, page, 1);
+	ret = __logfs_delete(master_inode, page);
+	logfs_put_wblocks(sb, page, 1);
+
+	logfs_put_write_page(page);
+	return ret;
+}
+
+/*
+ * ZOMBIE inodes have already been deleted before and should remain dead,
+ * if it weren't for valid checking.  No need to kill them again here.
+ */
+void logfs_delete_inode(struct inode *inode)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+
+	if (!(li->li_flags & LOGFS_IF_ZOMBIE)) {
+		li->li_flags |= LOGFS_IF_ZOMBIE;
+		if (i_size_read(inode) > 0)
+			logfs_truncate(inode, 0);
+		do_delete_inode(inode);
+	}
+	truncate_inode_pages(&inode->i_data, 0);
+	clear_inode(inode);
+}
+
+void btree_write_block(struct logfs_block *block)
+{
+	struct inode *inode;
+	struct page *page;
+	int err, cookie;
+
+	inode = logfs_safe_iget(block->sb, block->ino, &cookie);
+	page = logfs_get_write_page(inode, block->bix, block->level);
+
+	err = logfs_readpage_nolock(page);
+	BUG_ON(err);
+	BUG_ON(!PagePrivate(page));
+	BUG_ON(logfs_block(page) != block);
+	err = __logfs_write_buf(inode, page, 0);
+	BUG_ON(err);
+	BUG_ON(PagePrivate(page) || page->private);
+
+	logfs_put_write_page(page);
+	logfs_safe_iput(inode, cookie);
+}
+
+/**
+ * logfs_inode_write - write inode or dentry objects
+ *
+ * @inode:		parent inode (ifile or directory)
+ * @buf:		object to write (inode or dentry)
+ * @n:			object size
+ * @_pos:		object number (file position in blocks/objects)
+ * @flags:		write flags
+ * @lock:		0 if write lock is already taken, 1 otherwise
+ * @shadow_tree:	shadow below this inode
+ *
+ * FIXME: All caller of this put a 200-300 byte variable on the stack,
+ * only to call here and do a memcpy from that stack variable.  A good
+ * example of wasted performance and stack space.
+ */
+int logfs_inode_write(struct inode *inode, const void *buf, size_t count,
+		loff_t bix, long flags, struct shadow_tree *shadow_tree)
+{
+	loff_t pos = bix << inode->i_sb->s_blocksize_bits;
+	int err;
+	struct page *page;
+	void *pagebuf;
+
+	BUG_ON(pos & (LOGFS_BLOCKSIZE-1));
+	BUG_ON(count > LOGFS_BLOCKSIZE);
+	page = logfs_get_write_page(inode, bix, 0);
+	if (!page)
+		return -ENOMEM;
+
+	pagebuf = kmap_atomic(page, KM_USER0);
+	memcpy(pagebuf, buf, count);
+	flush_dcache_page(page);
+	kunmap_atomic(pagebuf, KM_USER0);
+
+	if (i_size_read(inode) < pos + LOGFS_BLOCKSIZE)
+		i_size_write(inode, pos + LOGFS_BLOCKSIZE);
+
+	err = logfs_write_buf(inode, page, flags);
+	logfs_put_write_page(page);
+	return err;
+}
+
+int logfs_open_segfile(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct inode *inode;
+
+	inode = logfs_read_meta_inode(sb, LOGFS_INO_SEGFILE);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+	super->s_segfile_inode = inode;
+	return 0;
+}
+
+int logfs_init_rw(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	int min_fill = 3 * super->s_no_blocks;
+
+	INIT_LIST_HEAD(&super->s_object_alias);
+	INIT_LIST_HEAD(&super->s_writeback_list);
+	mutex_init(&super->s_write_mutex);
+	super->s_block_pool = mempool_create_kmalloc_pool(min_fill,
+			sizeof(struct logfs_block));
+	super->s_shadow_pool = mempool_create_kmalloc_pool(min_fill,
+			sizeof(struct logfs_shadow));
+	return 0;
+}
+
+void logfs_cleanup_rw(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+
+	destroy_meta_inode(super->s_segfile_inode);
+	logfs_mempool_destroy(super->s_block_pool);
+	logfs_mempool_destroy(super->s_shadow_pool);
+}
diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c
new file mode 100644
index 000000000000..a9657afb70ad
--- /dev/null
+++ b/fs/logfs/segment.c
@@ -0,0 +1,933 @@
+/*
+ * fs/logfs/segment.c	- Handling the Object Store
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
+ *
+ * Object store or ostore makes up the complete device with exception of
+ * the superblock and journal areas.  Apart from its own metadata it stores
+ * three kinds of objects: inodes, dentries and blocks, both data and indirect.
+ */
+#include "logfs.h"
+#include <linux/slab.h>
+
+static int logfs_mark_segment_bad(struct super_block *sb, u32 segno)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct btree_head32 *head = &super->s_reserved_segments;
+	int err;
+
+	err = btree_insert32(head, segno, (void *)1, GFP_NOFS);
+	if (err)
+		return err;
+	logfs_super(sb)->s_bad_segments++;
+	/* FIXME: write to journal */
+	return 0;
+}
+
+int logfs_erase_segment(struct super_block *sb, u32 segno, int ensure_erase)
+{
+	struct logfs_super *super = logfs_super(sb);
+
+	super->s_gec++;
+
+	return super->s_devops->erase(sb, (u64)segno << super->s_segshift,
+			super->s_segsize, ensure_erase);
+}
+
+static s64 logfs_get_free_bytes(struct logfs_area *area, size_t bytes)
+{
+	s32 ofs;
+
+	logfs_open_area(area, bytes);
+
+	ofs = area->a_used_bytes;
+	area->a_used_bytes += bytes;
+	BUG_ON(area->a_used_bytes >= logfs_super(area->a_sb)->s_segsize);
+
+	return dev_ofs(area->a_sb, area->a_segno, ofs);
+}
+
+static struct page *get_mapping_page(struct super_block *sb, pgoff_t index,
+		int use_filler)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct address_space *mapping = super->s_mapping_inode->i_mapping;
+	filler_t *filler = super->s_devops->readpage;
+	struct page *page;
+
+	BUG_ON(mapping_gfp_mask(mapping) & __GFP_FS);
+	if (use_filler)
+		page = read_cache_page(mapping, index, filler, sb);
+	else {
+		page = find_or_create_page(mapping, index, GFP_NOFS);
+		unlock_page(page);
+	}
+	return page;
+}
+
+int __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
+		int use_filler)
+{
+	pgoff_t index = ofs >> PAGE_SHIFT;
+	struct page *page;
+	long offset = ofs & (PAGE_SIZE-1);
+	long copylen;
+
+	/* Only logfs_wbuf_recover may use len==0 */
+	BUG_ON(!len && !use_filler);
+	do {
+		copylen = min((ulong)len, PAGE_SIZE - offset);
+
+		page = get_mapping_page(area->a_sb, index, use_filler);
+		if (IS_ERR(page))
+			return PTR_ERR(page);
+		BUG_ON(!page); /* FIXME: reserve a pool */
+		SetPageUptodate(page);
+		memcpy(page_address(page) + offset, buf, copylen);
+		SetPagePrivate(page);
+		page_cache_release(page);
+
+		buf += copylen;
+		len -= copylen;
+		offset = 0;
+		index++;
+	} while (len);
+	return 0;
+}
+
+static void pad_partial_page(struct logfs_area *area)
+{
+	struct super_block *sb = area->a_sb;
+	struct page *page;
+	u64 ofs = dev_ofs(sb, area->a_segno, area->a_used_bytes);
+	pgoff_t index = ofs >> PAGE_SHIFT;
+	long offset = ofs & (PAGE_SIZE-1);
+	u32 len = PAGE_SIZE - offset;
+
+	if (len % PAGE_SIZE) {
+		page = get_mapping_page(sb, index, 0);
+		BUG_ON(!page); /* FIXME: reserve a pool */
+		memset(page_address(page) + offset, 0xff, len);
+		SetPagePrivate(page);
+		page_cache_release(page);
+	}
+}
+
+static void pad_full_pages(struct logfs_area *area)
+{
+	struct super_block *sb = area->a_sb;
+	struct logfs_super *super = logfs_super(sb);
+	u64 ofs = dev_ofs(sb, area->a_segno, area->a_used_bytes);
+	u32 len = super->s_segsize - area->a_used_bytes;
+	pgoff_t index = PAGE_CACHE_ALIGN(ofs) >> PAGE_CACHE_SHIFT;
+	pgoff_t no_indizes = len >> PAGE_CACHE_SHIFT;
+	struct page *page;
+
+	while (no_indizes) {
+		page = get_mapping_page(sb, index, 0);
+		BUG_ON(!page); /* FIXME: reserve a pool */
+		SetPageUptodate(page);
+		memset(page_address(page), 0xff, PAGE_CACHE_SIZE);
+		SetPagePrivate(page);
+		page_cache_release(page);
+		index++;
+		no_indizes--;
+	}
+}
+
+/*
+ * bdev_writeseg will write full pages.  Memset the tail to prevent data leaks.
+ * Also make sure we allocate (and memset) all pages for final writeout.
+ */
+static void pad_wbuf(struct logfs_area *area, int final)
+{
+	pad_partial_page(area);
+	if (final)
+		pad_full_pages(area);
+}
+
+/*
+ * We have to be careful with the alias tree.  Since lookup is done by bix,
+ * it needs to be normalized, so 14, 15, 16, etc. all match when dealing with
+ * indirect blocks.  So always use it through accessor functions.
+ */
+static void *alias_tree_lookup(struct super_block *sb, u64 ino, u64 bix,
+		level_t level)
+{
+	struct btree_head128 *head = &logfs_super(sb)->s_object_alias_tree;
+	pgoff_t index = logfs_pack_index(bix, level);
+
+	return btree_lookup128(head, ino, index);
+}
+
+static int alias_tree_insert(struct super_block *sb, u64 ino, u64 bix,
+		level_t level, void *val)
+{
+	struct btree_head128 *head = &logfs_super(sb)->s_object_alias_tree;
+	pgoff_t index = logfs_pack_index(bix, level);
+
+	return btree_insert128(head, ino, index, val, GFP_NOFS);
+}
+
+static int btree_write_alias(struct super_block *sb, struct logfs_block *block,
+		write_alias_t *write_one_alias)
+{
+	struct object_alias_item *item;
+	int err;
+
+	list_for_each_entry(item, &block->item_list, list) {
+		err = write_alias_journal(sb, block->ino, block->bix,
+				block->level, item->child_no, item->val);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
+static struct logfs_block_ops btree_block_ops = {
+	.write_block	= btree_write_block,
+	.free_block	= __free_block,
+	.write_alias	= btree_write_alias,
+};
+
+int logfs_load_object_aliases(struct super_block *sb,
+		struct logfs_obj_alias *oa, int count)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_block *block;
+	struct object_alias_item *item;
+	u64 ino, bix;
+	level_t level;
+	int i, err;
+
+	super->s_flags |= LOGFS_SB_FLAG_OBJ_ALIAS;
+	count /= sizeof(*oa);
+	for (i = 0; i < count; i++) {
+		item = mempool_alloc(super->s_alias_pool, GFP_NOFS);
+		if (!item)
+			return -ENOMEM;
+		memset(item, 0, sizeof(*item));
+
+		super->s_no_object_aliases++;
+		item->val = oa[i].val;
+		item->child_no = be16_to_cpu(oa[i].child_no);
+
+		ino = be64_to_cpu(oa[i].ino);
+		bix = be64_to_cpu(oa[i].bix);
+		level = LEVEL(oa[i].level);
+
+		log_aliases("logfs_load_object_aliases(%llx, %llx, %x, %x) %llx\n",
+				ino, bix, level, item->child_no,
+				be64_to_cpu(item->val));
+		block = alias_tree_lookup(sb, ino, bix, level);
+		if (!block) {
+			block = __alloc_block(sb, ino, bix, level);
+			block->ops = &btree_block_ops;
+			err = alias_tree_insert(sb, ino, bix, level, block);
+			BUG_ON(err); /* mempool empty */
+		}
+		if (test_and_set_bit(item->child_no, block->alias_map)) {
+			printk(KERN_ERR"LogFS: Alias collision detected\n");
+			return -EIO;
+		}
+		list_move_tail(&block->alias_list, &super->s_object_alias);
+		list_add(&item->list, &block->item_list);
+	}
+	return 0;
+}
+
+static void kill_alias(void *_block, unsigned long ignore0,
+		u64 ignore1, u64 ignore2, size_t ignore3)
+{
+	struct logfs_block *block = _block;
+	struct super_block *sb = block->sb;
+	struct logfs_super *super = logfs_super(sb);
+	struct object_alias_item *item;
+
+	while (!list_empty(&block->item_list)) {
+		item = list_entry(block->item_list.next, typeof(*item), list);
+		list_del(&item->list);
+		mempool_free(item, super->s_alias_pool);
+	}
+	block->ops->free_block(sb, block);
+}
+
+static int obj_type(struct inode *inode, level_t level)
+{
+	if (level == 0) {
+		if (S_ISDIR(inode->i_mode))
+			return OBJ_DENTRY;
+		if (inode->i_ino == LOGFS_INO_MASTER)
+			return OBJ_INODE;
+	}
+	return OBJ_BLOCK;
+}
+
+static int obj_len(struct super_block *sb, int obj_type)
+{
+	switch (obj_type) {
+	case OBJ_DENTRY:
+		return sizeof(struct logfs_disk_dentry);
+	case OBJ_INODE:
+		return sizeof(struct logfs_disk_inode);
+	case OBJ_BLOCK:
+		return sb->s_blocksize;
+	default:
+		BUG();
+	}
+}
+
+static int __logfs_segment_write(struct inode *inode, void *buf,
+		struct logfs_shadow *shadow, int type, int len, int compr)
+{
+	struct logfs_area *area;
+	struct super_block *sb = inode->i_sb;
+	s64 ofs;
+	struct logfs_object_header h;
+	int acc_len;
+
+	if (shadow->gc_level == 0)
+		acc_len = len;
+	else
+		acc_len = obj_len(sb, type);
+
+	area = get_area(sb, shadow->gc_level);
+	ofs = logfs_get_free_bytes(area, len + LOGFS_OBJECT_HEADERSIZE);
+	LOGFS_BUG_ON(ofs <= 0, sb);
+	/*
+	 * Order is important.  logfs_get_free_bytes(), by modifying the
+	 * segment file, may modify the content of the very page we're about
+	 * to write now.  Which is fine, as long as the calculated crc and
+	 * written data still match.  So do the modifications _before_
+	 * calculating the crc.
+	 */
+
+	h.len	= cpu_to_be16(len);
+	h.type	= type;
+	h.compr	= compr;
+	h.ino	= cpu_to_be64(inode->i_ino);
+	h.bix	= cpu_to_be64(shadow->bix);
+	h.crc	= logfs_crc32(&h, sizeof(h) - 4, 4);
+	h.data_crc = logfs_crc32(buf, len, 0);
+
+	logfs_buf_write(area, ofs, &h, sizeof(h));
+	logfs_buf_write(area, ofs + LOGFS_OBJECT_HEADERSIZE, buf, len);
+
+	shadow->new_ofs = ofs;
+	shadow->new_len = acc_len + LOGFS_OBJECT_HEADERSIZE;
+
+	return 0;
+}
+
+static s64 logfs_segment_write_compress(struct inode *inode, void *buf,
+		struct logfs_shadow *shadow, int type, int len)
+{
+	struct super_block *sb = inode->i_sb;
+	void *compressor_buf = logfs_super(sb)->s_compressed_je;
+	ssize_t compr_len;
+	int ret;
+
+	mutex_lock(&logfs_super(sb)->s_journal_mutex);
+	compr_len = logfs_compress(buf, compressor_buf, len, len);
+
+	if (compr_len >= 0) {
+		ret = __logfs_segment_write(inode, compressor_buf, shadow,
+				type, compr_len, COMPR_ZLIB);
+	} else {
+		ret = __logfs_segment_write(inode, buf, shadow, type, len,
+				COMPR_NONE);
+	}
+	mutex_unlock(&logfs_super(sb)->s_journal_mutex);
+	return ret;
+}
+
+/**
+ * logfs_segment_write - write data block to object store
+ * @inode:		inode containing data
+ *
+ * Returns an errno or zero.
+ */
+int logfs_segment_write(struct inode *inode, struct page *page,
+		struct logfs_shadow *shadow)
+{
+	struct super_block *sb = inode->i_sb;
+	struct logfs_super *super = logfs_super(sb);
+	int do_compress, type, len;
+	int ret;
+	void *buf;
+
+	super->s_flags |= LOGFS_SB_FLAG_DIRTY;
+	BUG_ON(super->s_flags & LOGFS_SB_FLAG_SHUTDOWN);
+	do_compress = logfs_inode(inode)->li_flags & LOGFS_IF_COMPRESSED;
+	if (shadow->gc_level != 0) {
+		/* temporarily disable compression for indirect blocks */
+		do_compress = 0;
+	}
+
+	type = obj_type(inode, shrink_level(shadow->gc_level));
+	len = obj_len(sb, type);
+	buf = kmap(page);
+	if (do_compress)
+		ret = logfs_segment_write_compress(inode, buf, shadow, type,
+				len);
+	else
+		ret = __logfs_segment_write(inode, buf, shadow, type, len,
+				COMPR_NONE);
+	kunmap(page);
+
+	log_segment("logfs_segment_write(%llx, %llx, %x) %llx->%llx %x->%x\n",
+			shadow->ino, shadow->bix, shadow->gc_level,
+			shadow->old_ofs, shadow->new_ofs,
+			shadow->old_len, shadow->new_len);
+	/* this BUG_ON did catch a locking bug.  useful */
+	BUG_ON(!(shadow->new_ofs & (super->s_segsize - 1)));
+	return ret;
+}
+
+int wbuf_read(struct super_block *sb, u64 ofs, size_t len, void *buf)
+{
+	pgoff_t index = ofs >> PAGE_SHIFT;
+	struct page *page;
+	long offset = ofs & (PAGE_SIZE-1);
+	long copylen;
+
+	while (len) {
+		copylen = min((ulong)len, PAGE_SIZE - offset);
+
+		page = get_mapping_page(sb, index, 1);
+		if (IS_ERR(page))
+			return PTR_ERR(page);
+		memcpy(buf, page_address(page) + offset, copylen);
+		page_cache_release(page);
+
+		buf += copylen;
+		len -= copylen;
+		offset = 0;
+		index++;
+	}
+	return 0;
+}
+
+/*
+ * The "position" of indirect blocks is ambiguous.  It can be the position
+ * of any data block somewhere behind this indirect block.  So we need to
+ * normalize the positions through logfs_block_mask() before comparing.
+ */
+static int check_pos(struct super_block *sb, u64 pos1, u64 pos2, level_t level)
+{
+	return	(pos1 & logfs_block_mask(sb, level)) !=
+		(pos2 & logfs_block_mask(sb, level));
+}
+
+#if 0
+static int read_seg_header(struct super_block *sb, u64 ofs,
+		struct logfs_segment_header *sh)
+{
+	__be32 crc;
+	int err;
+
+	err = wbuf_read(sb, ofs, sizeof(*sh), sh);
+	if (err)
+		return err;
+	crc = logfs_crc32(sh, sizeof(*sh), 4);
+	if (crc != sh->crc) {
+		printk(KERN_ERR"LOGFS: header crc error at %llx: expected %x, "
+				"got %x\n", ofs, be32_to_cpu(sh->crc),
+				be32_to_cpu(crc));
+		return -EIO;
+	}
+	return 0;
+}
+#endif
+
+static int read_obj_header(struct super_block *sb, u64 ofs,
+		struct logfs_object_header *oh)
+{
+	__be32 crc;
+	int err;
+
+	err = wbuf_read(sb, ofs, sizeof(*oh), oh);
+	if (err)
+		return err;
+	crc = logfs_crc32(oh, sizeof(*oh) - 4, 4);
+	if (crc != oh->crc) {
+		printk(KERN_ERR"LOGFS: header crc error at %llx: expected %x, "
+				"got %x\n", ofs, be32_to_cpu(oh->crc),
+				be32_to_cpu(crc));
+		return -EIO;
+	}
+	return 0;
+}
+
+static void move_btree_to_page(struct inode *inode, struct page *page,
+		__be64 *data)
+{
+	struct super_block *sb = inode->i_sb;
+	struct logfs_super *super = logfs_super(sb);
+	struct btree_head128 *head = &super->s_object_alias_tree;
+	struct logfs_block *block;
+	struct object_alias_item *item, *next;
+
+	if (!(super->s_flags & LOGFS_SB_FLAG_OBJ_ALIAS))
+		return;
+
+	block = btree_remove128(head, inode->i_ino, page->index);
+	if (!block)
+		return;
+
+	log_blockmove("move_btree_to_page(%llx, %llx, %x)\n",
+			block->ino, block->bix, block->level);
+	list_for_each_entry_safe(item, next, &block->item_list, list) {
+		data[item->child_no] = item->val;
+		list_del(&item->list);
+		mempool_free(item, super->s_alias_pool);
+	}
+	block->page = page;
+	SetPagePrivate(page);
+	page->private = (unsigned long)block;
+	block->ops = &indirect_block_ops;
+	initialize_block_counters(page, block, data, 0);
+}
+
+/*
+ * This silences a false, yet annoying gcc warning.  I hate it when my editor
+ * jumps into bitops.h each time I recompile this file.
+ * TODO: Complain to gcc folks about this and upgrade compiler.
+ */
+static unsigned long fnb(const unsigned long *addr,
+		unsigned long size, unsigned long offset)
+{
+	return find_next_bit(addr, size, offset);
+}
+
+void move_page_to_btree(struct page *page)
+{
+	struct logfs_block *block = logfs_block(page);
+	struct super_block *sb = block->sb;
+	struct logfs_super *super = logfs_super(sb);
+	struct object_alias_item *item;
+	unsigned long pos;
+	__be64 *child;
+	int err;
+
+	if (super->s_flags & LOGFS_SB_FLAG_SHUTDOWN) {
+		block->ops->free_block(sb, block);
+		return;
+	}
+	log_blockmove("move_page_to_btree(%llx, %llx, %x)\n",
+			block->ino, block->bix, block->level);
+	super->s_flags |= LOGFS_SB_FLAG_OBJ_ALIAS;
+
+	for (pos = 0; ; pos++) {
+		pos = fnb(block->alias_map, LOGFS_BLOCK_FACTOR, pos);
+		if (pos >= LOGFS_BLOCK_FACTOR)
+			break;
+
+		item = mempool_alloc(super->s_alias_pool, GFP_NOFS);
+		BUG_ON(!item); /* mempool empty */
+		memset(item, 0, sizeof(*item));
+
+		child = kmap_atomic(page, KM_USER0);
+		item->val = child[pos];
+		kunmap_atomic(child, KM_USER0);
+		item->child_no = pos;
+		list_add(&item->list, &block->item_list);
+	}
+	block->page = NULL;
+	ClearPagePrivate(page);
+	page->private = 0;
+	block->ops = &btree_block_ops;
+	err = alias_tree_insert(block->sb, block->ino, block->bix, block->level,
+			block);
+	BUG_ON(err); /* mempool empty */
+	ClearPageUptodate(page);
+}
+
+static int __logfs_segment_read(struct inode *inode, void *buf,
+		u64 ofs, u64 bix, level_t level)
+{
+	struct super_block *sb = inode->i_sb;
+	void *compressor_buf = logfs_super(sb)->s_compressed_je;
+	struct logfs_object_header oh;
+	__be32 crc;
+	u16 len;
+	int err, block_len;
+
+	block_len = obj_len(sb, obj_type(inode, level));
+	err = read_obj_header(sb, ofs, &oh);
+	if (err)
+		goto out_err;
+
+	err = -EIO;
+	if (be64_to_cpu(oh.ino) != inode->i_ino
+			|| check_pos(sb, be64_to_cpu(oh.bix), bix, level)) {
+		printk(KERN_ERR"LOGFS: (ino, bix) don't match at %llx: "
+				"expected (%lx, %llx), got (%llx, %llx)\n",
+				ofs, inode->i_ino, bix,
+				be64_to_cpu(oh.ino), be64_to_cpu(oh.bix));
+		goto out_err;
+	}
+
+	len = be16_to_cpu(oh.len);
+
+	switch (oh.compr) {
+	case COMPR_NONE:
+		err = wbuf_read(sb, ofs + LOGFS_OBJECT_HEADERSIZE, len, buf);
+		if (err)
+			goto out_err;
+		crc = logfs_crc32(buf, len, 0);
+		if (crc != oh.data_crc) {
+			printk(KERN_ERR"LOGFS: uncompressed data crc error at "
+					"%llx: expected %x, got %x\n", ofs,
+					be32_to_cpu(oh.data_crc),
+					be32_to_cpu(crc));
+			goto out_err;
+		}
+		break;
+	case COMPR_ZLIB:
+		mutex_lock(&logfs_super(sb)->s_journal_mutex);
+		err = wbuf_read(sb, ofs + LOGFS_OBJECT_HEADERSIZE, len,
+				compressor_buf);
+		if (err) {
+			mutex_unlock(&logfs_super(sb)->s_journal_mutex);
+			goto out_err;
+		}
+		crc = logfs_crc32(compressor_buf, len, 0);
+		if (crc != oh.data_crc) {
+			printk(KERN_ERR"LOGFS: compressed data crc error at "
+					"%llx: expected %x, got %x\n", ofs,
+					be32_to_cpu(oh.data_crc),
+					be32_to_cpu(crc));
+			mutex_unlock(&logfs_super(sb)->s_journal_mutex);
+			goto out_err;
+		}
+		err = logfs_uncompress(compressor_buf, buf, len, block_len);
+		mutex_unlock(&logfs_super(sb)->s_journal_mutex);
+		if (err) {
+			printk(KERN_ERR"LOGFS: uncompress error at %llx\n", ofs);
+			goto out_err;
+		}
+		break;
+	default:
+		LOGFS_BUG(sb);
+		err = -EIO;
+		goto out_err;
+	}
+	return 0;
+
+out_err:
+	logfs_set_ro(sb);
+	printk(KERN_ERR"LOGFS: device is read-only now\n");
+	LOGFS_BUG(sb);
+	return err;
+}
+
+/**
+ * logfs_segment_read - read data block from object store
+ * @inode:		inode containing data
+ * @buf:		data buffer
+ * @ofs:		physical data offset
+ * @bix:		block index
+ * @level:		block level
+ *
+ * Returns 0 on success or a negative errno.
+ */
+int logfs_segment_read(struct inode *inode, struct page *page,
+		u64 ofs, u64 bix, level_t level)
+{
+	int err;
+	void *buf;
+
+	if (PageUptodate(page))
+		return 0;
+
+	ofs &= ~LOGFS_FULLY_POPULATED;
+
+	buf = kmap(page);
+	err = __logfs_segment_read(inode, buf, ofs, bix, level);
+	if (!err) {
+		move_btree_to_page(inode, page, buf);
+		SetPageUptodate(page);
+	}
+	kunmap(page);
+	log_segment("logfs_segment_read(%lx, %llx, %x) %llx (%d)\n",
+			inode->i_ino, bix, level, ofs, err);
+	return err;
+}
+
+int logfs_segment_delete(struct inode *inode, struct logfs_shadow *shadow)
+{
+	struct super_block *sb = inode->i_sb;
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_object_header h;
+	u16 len;
+	int err;
+
+	super->s_flags |= LOGFS_SB_FLAG_DIRTY;
+	BUG_ON(super->s_flags & LOGFS_SB_FLAG_SHUTDOWN);
+	BUG_ON(shadow->old_ofs & LOGFS_FULLY_POPULATED);
+	if (!shadow->old_ofs)
+		return 0;
+
+	log_segment("logfs_segment_delete(%llx, %llx, %x) %llx->%llx %x->%x\n",
+			shadow->ino, shadow->bix, shadow->gc_level,
+			shadow->old_ofs, shadow->new_ofs,
+			shadow->old_len, shadow->new_len);
+	err = read_obj_header(sb, shadow->old_ofs, &h);
+	LOGFS_BUG_ON(err, sb);
+	LOGFS_BUG_ON(be64_to_cpu(h.ino) != inode->i_ino, sb);
+	LOGFS_BUG_ON(check_pos(sb, shadow->bix, be64_to_cpu(h.bix),
+				shrink_level(shadow->gc_level)), sb);
+
+	if (shadow->gc_level == 0)
+		len = be16_to_cpu(h.len);
+	else
+		len = obj_len(sb, h.type);
+	shadow->old_len = len + sizeof(h);
+	return 0;
+}
+
+void freeseg(struct super_block *sb, u32 segno)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct address_space *mapping = super->s_mapping_inode->i_mapping;
+	struct page *page;
+	u64 ofs, start, end;
+
+	start = dev_ofs(sb, segno, 0);
+	end = dev_ofs(sb, segno + 1, 0);
+	for (ofs = start; ofs < end; ofs += PAGE_SIZE) {
+		page = find_get_page(mapping, ofs >> PAGE_SHIFT);
+		if (!page)
+			continue;
+		ClearPagePrivate(page);
+		page_cache_release(page);
+	}
+}
+
+int logfs_open_area(struct logfs_area *area, size_t bytes)
+{
+	struct super_block *sb = area->a_sb;
+	struct logfs_super *super = logfs_super(sb);
+	int err, closed = 0;
+
+	if (area->a_is_open && area->a_used_bytes + bytes <= super->s_segsize)
+		return 0;
+
+	if (area->a_is_open) {
+		u64 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes);
+		u32 len = super->s_segsize - area->a_written_bytes;
+
+		log_gc("logfs_close_area(%x)\n", area->a_segno);
+		pad_wbuf(area, 1);
+		super->s_devops->writeseg(area->a_sb, ofs, len);
+		freeseg(sb, area->a_segno);
+		closed = 1;
+	}
+
+	area->a_used_bytes = 0;
+	area->a_written_bytes = 0;
+again:
+	area->a_ops->get_free_segment(area);
+	area->a_ops->get_erase_count(area);
+
+	log_gc("logfs_open_area(%x, %x)\n", area->a_segno, area->a_level);
+	err = area->a_ops->erase_segment(area);
+	if (err) {
+		printk(KERN_WARNING "LogFS: Error erasing segment %x\n",
+				area->a_segno);
+		logfs_mark_segment_bad(sb, area->a_segno);
+		goto again;
+	}
+	area->a_is_open = 1;
+	return closed;
+}
+
+void logfs_sync_area(struct logfs_area *area)
+{
+	struct super_block *sb = area->a_sb;
+	struct logfs_super *super = logfs_super(sb);
+	u64 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes);
+	u32 len = (area->a_used_bytes - area->a_written_bytes);
+
+	if (super->s_writesize)
+		len &= ~(super->s_writesize - 1);
+	if (len == 0)
+		return;
+	pad_wbuf(area, 0);
+	super->s_devops->writeseg(sb, ofs, len);
+	area->a_written_bytes += len;
+}
+
+void logfs_sync_segments(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	int i;
+
+	for_each_area(i)
+		logfs_sync_area(super->s_area[i]);
+}
+
+/*
+ * Pick a free segment to be used for this area.  Effectively takes a
+ * candidate from the free list (not really a candidate anymore).
+ */
+static void ostore_get_free_segment(struct logfs_area *area)
+{
+	struct super_block *sb = area->a_sb;
+	struct logfs_super *super = logfs_super(sb);
+
+	if (super->s_free_list.count == 0) {
+		printk(KERN_ERR"LOGFS: ran out of free segments\n");
+		LOGFS_BUG(sb);
+	}
+
+	area->a_segno = get_best_cand(sb, &super->s_free_list, NULL);
+}
+
+static void ostore_get_erase_count(struct logfs_area *area)
+{
+	struct logfs_segment_entry se;
+	u32 ec_level;
+
+	logfs_get_segment_entry(area->a_sb, area->a_segno, &se);
+	BUG_ON(se.ec_level == cpu_to_be32(BADSEG) ||
+			se.valid == cpu_to_be32(RESERVED));
+
+	ec_level = be32_to_cpu(se.ec_level);
+	area->a_erase_count = (ec_level >> 4) + 1;
+}
+
+static int ostore_erase_segment(struct logfs_area *area)
+{
+	struct super_block *sb = area->a_sb;
+	struct logfs_segment_header sh;
+	u64 ofs;
+	int err;
+
+	err = logfs_erase_segment(sb, area->a_segno, 0);
+	if (err)
+		return err;
+
+	sh.pad = 0;
+	sh.type = SEG_OSTORE;
+	sh.level = (__force u8)area->a_level;
+	sh.segno = cpu_to_be32(area->a_segno);
+	sh.ec = cpu_to_be32(area->a_erase_count);
+	sh.gec = cpu_to_be64(logfs_super(sb)->s_gec);
+	sh.crc = logfs_crc32(&sh, sizeof(sh), 4);
+
+	logfs_set_segment_erased(sb, area->a_segno, area->a_erase_count,
+			area->a_level);
+
+	ofs = dev_ofs(sb, area->a_segno, 0);
+	area->a_used_bytes = sizeof(sh);
+	logfs_buf_write(area, ofs, &sh, sizeof(sh));
+	return 0;
+}
+
+static const struct logfs_area_ops ostore_area_ops = {
+	.get_free_segment	= ostore_get_free_segment,
+	.get_erase_count	= ostore_get_erase_count,
+	.erase_segment		= ostore_erase_segment,
+};
+
+static void free_area(struct logfs_area *area)
+{
+	if (area)
+		freeseg(area->a_sb, area->a_segno);
+	kfree(area);
+}
+
+static struct logfs_area *alloc_area(struct super_block *sb)
+{
+	struct logfs_area *area;
+
+	area = kzalloc(sizeof(*area), GFP_KERNEL);
+	if (!area)
+		return NULL;
+
+	area->a_sb = sb;
+	return area;
+}
+
+static void map_invalidatepage(struct page *page, unsigned long l)
+{
+	BUG();
+}
+
+static int map_releasepage(struct page *page, gfp_t g)
+{
+	/* Don't release these pages */
+	return 0;
+}
+
+static const struct address_space_operations mapping_aops = {
+	.invalidatepage = map_invalidatepage,
+	.releasepage	= map_releasepage,
+	.set_page_dirty = __set_page_dirty_nobuffers,
+};
+
+int logfs_init_mapping(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct address_space *mapping;
+	struct inode *inode;
+
+	inode = logfs_new_meta_inode(sb, LOGFS_INO_MAPPING);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+	super->s_mapping_inode = inode;
+	mapping = inode->i_mapping;
+	mapping->a_ops = &mapping_aops;
+	/* Would it be possible to use __GFP_HIGHMEM as well? */
+	mapping_set_gfp_mask(mapping, GFP_NOFS);
+	return 0;
+}
+
+int logfs_init_areas(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	int i = -1;
+
+	super->s_alias_pool = mempool_create_kmalloc_pool(600,
+			sizeof(struct object_alias_item));
+	if (!super->s_alias_pool)
+		return -ENOMEM;
+
+	super->s_journal_area = alloc_area(sb);
+	if (!super->s_journal_area)
+		goto err;
+
+	for_each_area(i) {
+		super->s_area[i] = alloc_area(sb);
+		if (!super->s_area[i])
+			goto err;
+		super->s_area[i]->a_level = GC_LEVEL(i);
+		super->s_area[i]->a_ops = &ostore_area_ops;
+	}
+	btree_init_mempool128(&super->s_object_alias_tree,
+			super->s_btree_pool);
+	return 0;
+
+err:
+	for (i--; i >= 0; i--)
+		free_area(super->s_area[i]);
+	free_area(super->s_journal_area);
+	logfs_mempool_destroy(super->s_alias_pool);
+	return -ENOMEM;
+}
+
+void logfs_cleanup_areas(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	int i;
+
+	btree_grim_visitor128(&super->s_object_alias_tree, 0, kill_alias);
+	for_each_area(i)
+		free_area(super->s_area[i]);
+	free_area(super->s_journal_area);
+	destroy_meta_inode(super->s_mapping_inode);
+}
diff --git a/fs/logfs/super.c b/fs/logfs/super.c
new file mode 100644
index 000000000000..d651e10a1e9c
--- /dev/null
+++ b/fs/logfs/super.c
@@ -0,0 +1,661 @@
+/*
+ * fs/logfs/super.c
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
+ *
+ * Generally contains mount/umount code and also serves as a dump area for
+ * any functions that don't fit elsewhere and neither justify a file of their
+ * own.
+ */
+#include "logfs.h"
+#include <linux/bio.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <linux/mtd/mtd.h>
+#include <linux/statfs.h>
+#include <linux/buffer_head.h>
+
+static DEFINE_MUTEX(emergency_mutex);
+static struct page *emergency_page;
+
+struct page *emergency_read_begin(struct address_space *mapping, pgoff_t index)
+{
+	filler_t *filler = (filler_t *)mapping->a_ops->readpage;
+	struct page *page;
+	int err;
+
+	page = read_cache_page(mapping, index, filler, NULL);
+	if (page)
+		return page;
+
+	/* No more pages available, switch to emergency page */
+	printk(KERN_INFO"Logfs: Using emergency page\n");
+	mutex_lock(&emergency_mutex);
+	err = filler(NULL, emergency_page);
+	if (err) {
+		mutex_unlock(&emergency_mutex);
+		printk(KERN_EMERG"Logfs: Error reading emergency page\n");
+		return ERR_PTR(err);
+	}
+	return emergency_page;
+}
+
+void emergency_read_end(struct page *page)
+{
+	if (page == emergency_page)
+		mutex_unlock(&emergency_mutex);
+	else
+		page_cache_release(page);
+}
+
+static void dump_segfile(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_segment_entry se;
+	u32 segno;
+
+	for (segno = 0; segno < super->s_no_segs; segno++) {
+		logfs_get_segment_entry(sb, segno, &se);
+		printk("%3x: %6x %8x", segno, be32_to_cpu(se.ec_level),
+				be32_to_cpu(se.valid));
+		if (++segno < super->s_no_segs) {
+			logfs_get_segment_entry(sb, segno, &se);
+			printk(" %6x %8x", be32_to_cpu(se.ec_level),
+					be32_to_cpu(se.valid));
+		}
+		if (++segno < super->s_no_segs) {
+			logfs_get_segment_entry(sb, segno, &se);
+			printk(" %6x %8x", be32_to_cpu(se.ec_level),
+					be32_to_cpu(se.valid));
+		}
+		if (++segno < super->s_no_segs) {
+			logfs_get_segment_entry(sb, segno, &se);
+			printk(" %6x %8x", be32_to_cpu(se.ec_level),
+					be32_to_cpu(se.valid));
+		}
+		printk("\n");
+	}
+}
+
+/*
+ * logfs_crash_dump - dump debug information to device
+ *
+ * The LogFS superblock only occupies part of a segment.  This function will
+ * write as much debug information as it can gather into the spare space.
+ */
+void logfs_crash_dump(struct super_block *sb)
+{
+	dump_segfile(sb);
+}
+
+/*
+ * TODO: move to lib/string.c
+ */
+/**
+ * memchr_inv - Find a character in an area of memory.
+ * @s: The memory area
+ * @c: The byte to search for
+ * @n: The size of the area.
+ *
+ * returns the address of the first character other than @c, or %NULL
+ * if the whole buffer contains just @c.
+ */
+void *memchr_inv(const void *s, int c, size_t n)
+{
+	const unsigned char *p = s;
+	while (n-- != 0)
+		if ((unsigned char)c != *p++)
+			return (void *)(p - 1);
+
+	return NULL;
+}
+
+/*
+ * FIXME: There should be a reserve for root, similar to ext2.
+ */
+int logfs_statfs(struct dentry *dentry, struct kstatfs *stats)
+{
+	struct super_block *sb = dentry->d_sb;
+	struct logfs_super *super = logfs_super(sb);
+
+	stats->f_type		= LOGFS_MAGIC_U32;
+	stats->f_bsize		= sb->s_blocksize;
+	stats->f_blocks		= super->s_size >> LOGFS_BLOCK_BITS >> 3;
+	stats->f_bfree		= super->s_free_bytes >> sb->s_blocksize_bits;
+	stats->f_bavail		= super->s_free_bytes >> sb->s_blocksize_bits;
+	stats->f_files		= 0;
+	stats->f_ffree		= 0;
+	stats->f_namelen	= LOGFS_MAX_NAMELEN;
+	return 0;
+}
+
+static int logfs_sb_set(struct super_block *sb, void *_super)
+{
+	struct logfs_super *super = _super;
+
+	sb->s_fs_info = super;
+	sb->s_mtd = super->s_mtd;
+	sb->s_bdev = super->s_bdev;
+#ifdef CONFIG_BLOCK
+	if (sb->s_bdev)
+		sb->s_bdi = &bdev_get_queue(sb->s_bdev)->backing_dev_info;
+#endif
+#ifdef CONFIG_MTD
+	if (sb->s_mtd)
+		sb->s_bdi = sb->s_mtd->backing_dev_info;
+#endif
+	return 0;
+}
+
+static int logfs_sb_test(struct super_block *sb, void *_super)
+{
+	struct logfs_super *super = _super;
+	struct mtd_info *mtd = super->s_mtd;
+
+	if (mtd && sb->s_mtd == mtd)
+		return 1;
+	if (super->s_bdev && sb->s_bdev == super->s_bdev)
+		return 1;
+	return 0;
+}
+
+static void set_segment_header(struct logfs_segment_header *sh, u8 type,
+		u8 level, u32 segno, u32 ec)
+{
+	sh->pad = 0;
+	sh->type = type;
+	sh->level = level;
+	sh->segno = cpu_to_be32(segno);
+	sh->ec = cpu_to_be32(ec);
+	sh->gec = cpu_to_be64(segno);
+	sh->crc = logfs_crc32(sh, LOGFS_SEGMENT_HEADERSIZE, 4);
+}
+
+static void logfs_write_ds(struct super_block *sb, struct logfs_disk_super *ds,
+		u32 segno, u32 ec)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_segment_header *sh = &ds->ds_sh;
+	int i;
+
+	memset(ds, 0, sizeof(*ds));
+	set_segment_header(sh, SEG_SUPER, 0, segno, ec);
+
+	ds->ds_ifile_levels	= super->s_ifile_levels;
+	ds->ds_iblock_levels	= super->s_iblock_levels;
+	ds->ds_data_levels	= super->s_data_levels; /* XXX: Remove */
+	ds->ds_segment_shift	= super->s_segshift;
+	ds->ds_block_shift	= sb->s_blocksize_bits;
+	ds->ds_write_shift	= super->s_writeshift;
+	ds->ds_filesystem_size	= cpu_to_be64(super->s_size);
+	ds->ds_segment_size	= cpu_to_be32(super->s_segsize);
+	ds->ds_bad_seg_reserve	= cpu_to_be32(super->s_bad_seg_reserve);
+	ds->ds_feature_incompat	= cpu_to_be64(super->s_feature_incompat);
+	ds->ds_feature_ro_compat= cpu_to_be64(super->s_feature_ro_compat);
+	ds->ds_feature_compat	= cpu_to_be64(super->s_feature_compat);
+	ds->ds_feature_flags	= cpu_to_be64(super->s_feature_flags);
+	ds->ds_root_reserve	= cpu_to_be64(super->s_root_reserve);
+	ds->ds_speed_reserve	= cpu_to_be64(super->s_speed_reserve);
+	journal_for_each(i)
+		ds->ds_journal_seg[i] = cpu_to_be32(super->s_journal_seg[i]);
+	ds->ds_magic		= cpu_to_be64(LOGFS_MAGIC);
+	ds->ds_crc = logfs_crc32(ds, sizeof(*ds),
+			LOGFS_SEGMENT_HEADERSIZE + 12);
+}
+
+static int write_one_sb(struct super_block *sb,
+		struct page *(*find_sb)(struct super_block *sb, u64 *ofs))
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_disk_super *ds;
+	struct logfs_segment_entry se;
+	struct page *page;
+	u64 ofs;
+	u32 ec, segno;
+	int err;
+
+	page = find_sb(sb, &ofs);
+	if (!page)
+		return -EIO;
+	ds = page_address(page);
+	segno = seg_no(sb, ofs);
+	logfs_get_segment_entry(sb, segno, &se);
+	ec = be32_to_cpu(se.ec_level) >> 4;
+	ec++;
+	logfs_set_segment_erased(sb, segno, ec, 0);
+	logfs_write_ds(sb, ds, segno, ec);
+	err = super->s_devops->write_sb(sb, page);
+	page_cache_release(page);
+	return err;
+}
+
+int logfs_write_sb(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	int err;
+
+	/* First superblock */
+	err = write_one_sb(sb, super->s_devops->find_first_sb);
+	if (err)
+		return err;
+
+	/* Last superblock */
+	err = write_one_sb(sb, super->s_devops->find_last_sb);
+	if (err)
+		return err;
+	return 0;
+}
+
+static int ds_cmp(const void *ds0, const void *ds1)
+{
+	size_t len = sizeof(struct logfs_disk_super);
+
+	/* We know the segment headers differ, so ignore them */
+	len -= LOGFS_SEGMENT_HEADERSIZE;
+	ds0 += LOGFS_SEGMENT_HEADERSIZE;
+	ds1 += LOGFS_SEGMENT_HEADERSIZE;
+	return memcmp(ds0, ds1, len);
+}
+
+static int logfs_recover_sb(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_disk_super _ds0, *ds0 = &_ds0;
+	struct logfs_disk_super _ds1, *ds1 = &_ds1;
+	int err, valid0, valid1;
+
+	/* read first superblock */
+	err = wbuf_read(sb, super->s_sb_ofs[0], sizeof(*ds0), ds0);
+	if (err)
+		return err;
+	/* read last superblock */
+	err = wbuf_read(sb, super->s_sb_ofs[1], sizeof(*ds1), ds1);
+	if (err)
+		return err;
+	valid0 = logfs_check_ds(ds0) == 0;
+	valid1 = logfs_check_ds(ds1) == 0;
+
+	if (!valid0 && valid1) {
+		printk(KERN_INFO"First superblock is invalid - fixing.\n");
+		return write_one_sb(sb, super->s_devops->find_first_sb);
+	}
+	if (valid0 && !valid1) {
+		printk(KERN_INFO"Last superblock is invalid - fixing.\n");
+		return write_one_sb(sb, super->s_devops->find_last_sb);
+	}
+	if (valid0 && valid1 && ds_cmp(ds0, ds1)) {
+		printk(KERN_INFO"Superblocks don't match - fixing.\n");
+		return logfs_write_sb(sb);
+	}
+	/* If neither is valid now, something's wrong.  Didn't we properly
+	 * check them before?!? */
+	BUG_ON(!valid0 && !valid1);
+	return 0;
+}
+
+static int logfs_make_writeable(struct super_block *sb)
+{
+	int err;
+
+	err = logfs_open_segfile(sb);
+	if (err)
+		return err;
+
+	/* Repair any broken superblock copies */
+	err = logfs_recover_sb(sb);
+	if (err)
+		return err;
+
+	/* Check areas for trailing unaccounted data */
+	err = logfs_check_areas(sb);
+	if (err)
+		return err;
+
+	/* Do one GC pass before any data gets dirtied */
+	logfs_gc_pass(sb);
+
+	/* after all initializations are done, replay the journal
+	 * for rw-mounts, if necessary */
+	err = logfs_replay_journal(sb);
+	if (err)
+		return err;
+
+	return 0;
+}
+
+static int logfs_get_sb_final(struct super_block *sb, struct vfsmount *mnt)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct inode *rootdir;
+	int err;
+
+	/* root dir */
+	rootdir = logfs_iget(sb, LOGFS_INO_ROOT);
+	if (IS_ERR(rootdir))
+		goto fail;
+
+	sb->s_root = d_alloc_root(rootdir);
+	if (!sb->s_root) {
+		iput(rootdir);
+		goto fail;
+	}
+
+	super->s_erase_page = alloc_pages(GFP_KERNEL, 0);
+	if (!super->s_erase_page)
+		goto fail;
+	memset(page_address(super->s_erase_page), 0xFF, PAGE_SIZE);
+
+	/* FIXME: check for read-only mounts */
+	err = logfs_make_writeable(sb);
+	if (err)
+		goto fail1;
+
+	log_super("LogFS: Finished mounting\n");
+	simple_set_mnt(mnt, sb);
+	return 0;
+
+fail1:
+	__free_page(super->s_erase_page);
+fail:
+	iput(logfs_super(sb)->s_master_inode);
+	return -EIO;
+}
+
+int logfs_check_ds(struct logfs_disk_super *ds)
+{
+	struct logfs_segment_header *sh = &ds->ds_sh;
+
+	if (ds->ds_magic != cpu_to_be64(LOGFS_MAGIC))
+		return -EINVAL;
+	if (sh->crc != logfs_crc32(sh, LOGFS_SEGMENT_HEADERSIZE, 4))
+		return -EINVAL;
+	if (ds->ds_crc != logfs_crc32(ds, sizeof(*ds),
+				LOGFS_SEGMENT_HEADERSIZE + 12))
+		return -EINVAL;
+	return 0;
+}
+
+static struct page *find_super_block(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct page *first, *last;
+
+	first = super->s_devops->find_first_sb(sb, &super->s_sb_ofs[0]);
+	if (!first || IS_ERR(first))
+		return NULL;
+	last = super->s_devops->find_last_sb(sb, &super->s_sb_ofs[1]);
+	if (!last || IS_ERR(last)) {
+		page_cache_release(first);
+		return NULL;
+	}
+
+	if (!logfs_check_ds(page_address(first))) {
+		page_cache_release(last);
+		return first;
+	}
+
+	/* First one didn't work, try the second superblock */
+	if (!logfs_check_ds(page_address(last))) {
+		page_cache_release(first);
+		return last;
+	}
+
+	/* Neither worked, sorry folks */
+	page_cache_release(first);
+	page_cache_release(last);
+	return NULL;
+}
+
+static int __logfs_read_sb(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct page *page;
+	struct logfs_disk_super *ds;
+	int i;
+
+	page = find_super_block(sb);
+	if (!page)
+		return -EINVAL;
+
+	ds = page_address(page);
+	super->s_size = be64_to_cpu(ds->ds_filesystem_size);
+	super->s_root_reserve = be64_to_cpu(ds->ds_root_reserve);
+	super->s_speed_reserve = be64_to_cpu(ds->ds_speed_reserve);
+	super->s_bad_seg_reserve = be32_to_cpu(ds->ds_bad_seg_reserve);
+	super->s_segsize = 1 << ds->ds_segment_shift;
+	super->s_segmask = (1 << ds->ds_segment_shift) - 1;
+	super->s_segshift = ds->ds_segment_shift;
+	sb->s_blocksize = 1 << ds->ds_block_shift;
+	sb->s_blocksize_bits = ds->ds_block_shift;
+	super->s_writesize = 1 << ds->ds_write_shift;
+	super->s_writeshift = ds->ds_write_shift;
+	super->s_no_segs = super->s_size >> super->s_segshift;
+	super->s_no_blocks = super->s_segsize >> sb->s_blocksize_bits;
+	super->s_feature_incompat = be64_to_cpu(ds->ds_feature_incompat);
+	super->s_feature_ro_compat = be64_to_cpu(ds->ds_feature_ro_compat);
+	super->s_feature_compat = be64_to_cpu(ds->ds_feature_compat);
+	super->s_feature_flags = be64_to_cpu(ds->ds_feature_flags);
+
+	journal_for_each(i)
+		super->s_journal_seg[i] = be32_to_cpu(ds->ds_journal_seg[i]);
+
+	super->s_ifile_levels = ds->ds_ifile_levels;
+	super->s_iblock_levels = ds->ds_iblock_levels;
+	super->s_data_levels = ds->ds_data_levels;
+	super->s_total_levels = super->s_ifile_levels + super->s_iblock_levels
+		+ super->s_data_levels;
+	page_cache_release(page);
+	return 0;
+}
+
+static int logfs_read_sb(struct super_block *sb, int read_only)
+{
+	struct logfs_super *super = logfs_super(sb);
+	int ret;
+
+	super->s_btree_pool = mempool_create(32, btree_alloc, btree_free, NULL);
+	if (!super->s_btree_pool)
+		return -ENOMEM;
+
+	btree_init_mempool64(&super->s_shadow_tree.new, super->s_btree_pool);
+	btree_init_mempool64(&super->s_shadow_tree.old, super->s_btree_pool);
+	btree_init_mempool32(&super->s_shadow_tree.segment_map,
+			super->s_btree_pool);
+
+	ret = logfs_init_mapping(sb);
+	if (ret)
+		return ret;
+
+	ret = __logfs_read_sb(sb);
+	if (ret)
+		return ret;
+
+	if (super->s_feature_incompat & ~LOGFS_FEATURES_INCOMPAT)
+		return -EIO;
+	if ((super->s_feature_ro_compat & ~LOGFS_FEATURES_RO_COMPAT) &&
+			!read_only)
+		return -EIO;
+
+	mutex_init(&super->s_dirop_mutex);
+	mutex_init(&super->s_object_alias_mutex);
+	INIT_LIST_HEAD(&super->s_freeing_list);
+
+	ret = logfs_init_rw(sb);
+	if (ret)
+		return ret;
+
+	ret = logfs_init_areas(sb);
+	if (ret)
+		return ret;
+
+	ret = logfs_init_gc(sb);
+	if (ret)
+		return ret;
+
+	ret = logfs_init_journal(sb);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
+static void logfs_kill_sb(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+
+	log_super("LogFS: Start unmounting\n");
+	/* Alias entries slow down mount, so evict as many as possible */
+	sync_filesystem(sb);
+	logfs_write_anchor(sb);
+
+	/*
+	 * From this point on alias entries are simply dropped - and any
+	 * writes to the object store are considered bugs.
+	 */
+	super->s_flags |= LOGFS_SB_FLAG_SHUTDOWN;
+	log_super("LogFS: Now in shutdown\n");
+	generic_shutdown_super(sb);
+
+	BUG_ON(super->s_dirty_used_bytes || super->s_dirty_free_bytes);
+
+	logfs_cleanup_gc(sb);
+	logfs_cleanup_journal(sb);
+	logfs_cleanup_areas(sb);
+	logfs_cleanup_rw(sb);
+	if (super->s_erase_page)
+		__free_page(super->s_erase_page);
+	super->s_devops->put_device(sb);
+	logfs_mempool_destroy(super->s_btree_pool);
+	logfs_mempool_destroy(super->s_alias_pool);
+	kfree(super);
+	log_super("LogFS: Finished unmounting\n");
+}
+
+int logfs_get_sb_device(struct file_system_type *type, int flags,
+		struct mtd_info *mtd, struct block_device *bdev,
+		const struct logfs_device_ops *devops, struct vfsmount *mnt)
+{
+	struct logfs_super *super;
+	struct super_block *sb;
+	int err = -ENOMEM;
+	static int mount_count;
+
+	log_super("LogFS: Start mount %x\n", mount_count++);
+	super = kzalloc(sizeof(*super), GFP_KERNEL);
+	if (!super)
+		goto err0;
+
+	super->s_mtd	= mtd;
+	super->s_bdev	= bdev;
+	err = -EINVAL;
+	sb = sget(type, logfs_sb_test, logfs_sb_set, super);
+	if (IS_ERR(sb))
+		goto err0;
+
+	if (sb->s_root) {
+		/* Device is already in use */
+		err = 0;
+		simple_set_mnt(mnt, sb);
+		goto err0;
+	}
+
+	super->s_devops = devops;
+
+	/*
+	 * sb->s_maxbytes is limited to 8TB.  On 32bit systems, the page cache
+	 * only covers 16TB and the upper 8TB are used for indirect blocks.
+	 * On 64bit system we could bump up the limit, but that would make
+	 * the filesystem incompatible with 32bit systems.
+	 */
+	sb->s_maxbytes	= (1ull << 43) - 1;
+	sb->s_op	= &logfs_super_operations;
+	sb->s_flags	= flags | MS_NOATIME;
+
+	err = logfs_read_sb(sb, sb->s_flags & MS_RDONLY);
+	if (err)
+		goto err1;
+
+	sb->s_flags |= MS_ACTIVE;
+	err = logfs_get_sb_final(sb, mnt);
+	if (err)
+		goto err1;
+	return 0;
+
+err1:
+	deactivate_locked_super(sb);
+	return err;
+err0:
+	kfree(super);
+	//devops->put_device(sb);
+	return err;
+}
+
+static int logfs_get_sb(struct file_system_type *type, int flags,
+		const char *devname, void *data, struct vfsmount *mnt)
+{
+	ulong mtdnr;
+
+	if (!devname)
+		return logfs_get_sb_bdev(type, flags, devname, mnt);
+	if (strncmp(devname, "mtd", 3))
+		return logfs_get_sb_bdev(type, flags, devname, mnt);
+
+	{
+		char *garbage;
+		mtdnr = simple_strtoul(devname+3, &garbage, 0);
+		if (*garbage)
+			return -EINVAL;
+	}
+
+	return logfs_get_sb_mtd(type, flags, mtdnr, mnt);
+}
+
+static struct file_system_type logfs_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "logfs",
+	.get_sb		= logfs_get_sb,
+	.kill_sb	= logfs_kill_sb,
+	.fs_flags	= FS_REQUIRES_DEV,
+
+};
+
+static int __init logfs_init(void)
+{
+	int ret;
+
+	emergency_page = alloc_pages(GFP_KERNEL, 0);
+	if (!emergency_page)
+		return -ENOMEM;
+
+	ret = logfs_compr_init();
+	if (ret)
+		goto out1;
+
+	ret = logfs_init_inode_cache();
+	if (ret)
+		goto out2;
+
+	return register_filesystem(&logfs_fs_type);
+out2:
+	logfs_compr_exit();
+out1:
+	__free_pages(emergency_page, 0);
+	return ret;
+}
+
+static void __exit logfs_exit(void)
+{
+	unregister_filesystem(&logfs_fs_type);
+	logfs_destroy_inode_cache();
+	logfs_compr_exit();
+	__free_pages(emergency_page, 0);
+}
+
+module_init(logfs_init);
+module_exit(logfs_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Joern Engel <joern@logfs.org>");
+MODULE_DESCRIPTION("scalable flash filesystem");
diff --git a/fs/mbcache.c b/fs/mbcache.c
index ec88ff3d04a9..e28f21b95344 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -115,7 +115,7 @@ mb_cache_indexes(struct mb_cache *cache)
  * What the mbcache registers as to get shrunk dynamically.
  */
 
-static int mb_cache_shrink_fn(int nr_to_scan, gfp_t gfp_mask);
+static int mb_cache_shrink_fn(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask);
 
 static struct shrinker mb_cache_shrinker = {
 	.shrink = mb_cache_shrink_fn,
@@ -191,13 +191,14 @@ forget:
  * This function is called by the kernel memory management when memory
  * gets low.
  *
+ * @shrink: (ignored)
  * @nr_to_scan: Number of objects to scan
  * @gfp_mask: (ignored)
  *
  * Returns the number of objects which are present in the cache.
  */
 static int
-mb_cache_shrink_fn(int nr_to_scan, gfp_t gfp_mask)
+mb_cache_shrink_fn(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
 {
 	LIST_HEAD(free_list);
 	struct list_head *l, *ltmp;
diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c
index 6ac693faae49..482779fe4e7c 100644
--- a/fs/minix/bitmap.c
+++ b/fs/minix/bitmap.c
@@ -221,7 +221,7 @@ void minix_free_inode(struct inode * inode)
 	clear_inode(inode);		/* clear in-memory copy */
 }
 
-struct inode * minix_new_inode(const struct inode * dir, int * error)
+struct inode *minix_new_inode(const struct inode *dir, int mode, int *error)
 {
 	struct super_block *sb = dir->i_sb;
 	struct minix_sb_info *sbi = minix_sb(sb);
@@ -263,8 +263,7 @@ struct inode * minix_new_inode(const struct inode * dir, int * error)
 		iput(inode);
 		return NULL;
 	}
-	inode->i_uid = current_fsuid();
-	inode->i_gid = (dir->i_mode & S_ISGID) ? dir->i_gid : current_fsgid();
+	inode_init_owner(inode, dir, mode);
 	inode->i_ino = j;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
 	inode->i_blocks = 0;
diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index 6198731d7fcd..1dbf921ca44b 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -22,7 +22,7 @@ const struct file_operations minix_dir_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
 	.readdir	= minix_readdir,
-	.fsync		= simple_fsync,
+	.fsync		= generic_file_fsync,
 };
 
 static inline void dir_put_page(struct page *page)
@@ -72,16 +72,9 @@ static struct page * dir_get_page(struct inode *dir, unsigned long n)
 {
 	struct address_space *mapping = dir->i_mapping;
 	struct page *page = read_mapping_page(mapping, n, NULL);
-	if (!IS_ERR(page)) {
+	if (!IS_ERR(page))
 		kmap(page);
-		if (!PageUptodate(page))
-			goto fail;
-	}
 	return page;
-
-fail:
-	dir_put_page(page);
-	return ERR_PTR(-EIO);
 }
 
 static inline void *minix_next_entry(void *de, struct minix_sb_info *sbi)
diff --git a/fs/minix/file.c b/fs/minix/file.c
index 3eec3e607a87..d5320ff23faf 100644
--- a/fs/minix/file.c
+++ b/fs/minix/file.c
@@ -19,7 +19,7 @@ const struct file_operations minix_file_operations = {
 	.write		= do_sync_write,
 	.aio_write	= generic_file_aio_write,
 	.mmap		= generic_file_mmap,
-	.fsync		= simple_fsync,
+	.fsync		= generic_file_fsync,
 	.splice_read	= generic_file_splice_read,
 };
 
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 74ea82d72164..756f8c93780c 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -17,8 +17,10 @@
 #include <linux/init.h>
 #include <linux/highuid.h>
 #include <linux/vfs.h>
+#include <linux/writeback.h>
 
-static int minix_write_inode(struct inode * inode, int wait);
+static int minix_write_inode(struct inode *inode,
+		struct writeback_control *wbc);
 static int minix_statfs(struct dentry *dentry, struct kstatfs *buf);
 static int minix_remount (struct super_block * sb, int * flags, char * data);
 
@@ -552,7 +554,7 @@ static struct buffer_head * V2_minix_update_inode(struct inode * inode)
 	return bh;
 }
 
-static int minix_write_inode(struct inode *inode, int wait)
+static int minix_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
 	int err = 0;
 	struct buffer_head *bh;
@@ -563,7 +565,7 @@ static int minix_write_inode(struct inode *inode, int wait)
 		bh = V2_minix_update_inode(inode);
 	if (!bh)
 		return -EIO;
-	if (wait && buffer_dirty(bh)) {
+	if (wbc->sync_mode == WB_SYNC_ALL && buffer_dirty(bh)) {
 		sync_dirty_buffer(bh);
 		if (buffer_req(bh) && !buffer_uptodate(bh)) {
 			printk("IO error syncing minix inode [%s:%08lx]\n",
diff --git a/fs/minix/itree_v1.c b/fs/minix/itree_v1.c
index 82d6554b02fe..282e15ad8cd8 100644
--- a/fs/minix/itree_v1.c
+++ b/fs/minix/itree_v1.c
@@ -1,4 +1,5 @@
 #include <linux/buffer_head.h>
+#include <linux/slab.h>
 #include "minix.h"
 
 enum {DEPTH = 3, DIRECT = 7};	/* Only double indirect */
diff --git a/fs/minix/itree_v2.c b/fs/minix/itree_v2.c
index f23010969369..13487ad16894 100644
--- a/fs/minix/itree_v2.c
+++ b/fs/minix/itree_v2.c
@@ -20,6 +20,9 @@ static inline block_t *i_data(struct inode *inode)
 	return (block_t *)minix_i(inode)->u.i2_data;
 }
 
+#define DIRCOUNT 7
+#define INDIRCOUNT(sb) (1 << ((sb)->s_blocksize_bits - 2))
+
 static int block_to_path(struct inode * inode, long block, int offsets[DEPTH])
 {
 	int n = 0;
@@ -34,21 +37,21 @@ static int block_to_path(struct inode * inode, long block, int offsets[DEPTH])
 			printk("MINIX-fs: block_to_path: "
 			       "block %ld too big on dev %s\n",
 				block, bdevname(sb->s_bdev, b));
-	} else if (block < 7) {
+	} else if (block < DIRCOUNT) {
 		offsets[n++] = block;
-	} else if ((block -= 7) < 256) {
-		offsets[n++] = 7;
+	} else if ((block -= DIRCOUNT) < INDIRCOUNT(sb)) {
+		offsets[n++] = DIRCOUNT;
 		offsets[n++] = block;
-	} else if ((block -= 256) < 256*256) {
-		offsets[n++] = 8;
-		offsets[n++] = block>>8;
-		offsets[n++] = block & 255;
+	} else if ((block -= INDIRCOUNT(sb)) < INDIRCOUNT(sb) * INDIRCOUNT(sb)) {
+		offsets[n++] = DIRCOUNT + 1;
+		offsets[n++] = block / INDIRCOUNT(sb);
+		offsets[n++] = block % INDIRCOUNT(sb);
 	} else {
-		block -= 256*256;
-		offsets[n++] = 9;
-		offsets[n++] = block>>16;
-		offsets[n++] = (block>>8) & 255;
-		offsets[n++] = block & 255;
+		block -= INDIRCOUNT(sb) * INDIRCOUNT(sb);
+		offsets[n++] = DIRCOUNT + 2;
+		offsets[n++] = (block / INDIRCOUNT(sb)) / INDIRCOUNT(sb);
+		offsets[n++] = (block / INDIRCOUNT(sb)) % INDIRCOUNT(sb);
+		offsets[n++] = block % INDIRCOUNT(sb);
 	}
 	return n;
 }
diff --git a/fs/minix/minix.h b/fs/minix/minix.h
index 9dcf95b42116..111f34ee9e3b 100644
--- a/fs/minix/minix.h
+++ b/fs/minix/minix.h
@@ -46,7 +46,7 @@ struct minix_sb_info {
 extern struct inode *minix_iget(struct super_block *, unsigned long);
 extern struct minix_inode * minix_V1_raw_inode(struct super_block *, ino_t, struct buffer_head **);
 extern struct minix2_inode * minix_V2_raw_inode(struct super_block *, ino_t, struct buffer_head **);
-extern struct inode * minix_new_inode(const struct inode * dir, int * error);
+extern struct inode * minix_new_inode(const struct inode *, int, int *);
 extern void minix_free_inode(struct inode * inode);
 extern unsigned long minix_count_free_inodes(struct minix_sb_info *sbi);
 extern int minix_new_block(struct inode * inode);
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index 32b131cd6121..e20ee85955d1 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -46,10 +46,9 @@ static int minix_mknod(struct inode * dir, struct dentry *dentry, int mode, dev_
 	if (!old_valid_dev(rdev))
 		return -EINVAL;
 
-	inode = minix_new_inode(dir, &error);
+	inode = minix_new_inode(dir, mode, &error);
 
 	if (inode) {
-		inode->i_mode = mode;
 		minix_set_inode(inode, rdev);
 		mark_inode_dirty(inode);
 		error = add_nondir(dentry, inode);
@@ -73,11 +72,10 @@ static int minix_symlink(struct inode * dir, struct dentry *dentry,
 	if (i > dir->i_sb->s_blocksize)
 		goto out;
 
-	inode = minix_new_inode(dir, &err);
+	inode = minix_new_inode(dir, S_IFLNK | 0777, &err);
 	if (!inode)
 		goto out;
 
-	inode->i_mode = S_IFLNK | 0777;
 	minix_set_inode(inode, 0);
 	err = page_symlink(inode, symname, i);
 	if (err)
@@ -117,13 +115,10 @@ static int minix_mkdir(struct inode * dir, struct dentry *dentry, int mode)
 
 	inode_inc_link_count(dir);
 
-	inode = minix_new_inode(dir, &err);
+	inode = minix_new_inode(dir, mode, &err);
 	if (!inode)
 		goto out_dir;
 
-	inode->i_mode = S_IFDIR | mode;
-	if (dir->i_mode & S_ISGID)
-		inode->i_mode |= S_ISGID;
 	minix_set_inode(inode, 0);
 
 	inode_inc_link_count(inode);
diff --git a/fs/mpage.c b/fs/mpage.c
index 42381bd6543b..fd56ca2ea556 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -16,6 +16,7 @@
 #include <linux/module.h>
 #include <linux/mm.h>
 #include <linux/kdev_t.h>
+#include <linux/gfp.h>
 #include <linux/bio.h>
 #include <linux/fs.h>
 #include <linux/buffer_head.h>
@@ -561,7 +562,7 @@ page_is_mapped:
 	if (page->index >= end_index) {
 		/*
 		 * The page straddles i_size.  It must be zeroed out on each
-		 * and every writepage invokation because it may be mmapped.
+		 * and every writepage invocation because it may be mmapped.
 		 * "A file is mapped in multiples of the page size.  For a file
 		 * that is not a multiple of the page size, the remaining memory
 		 * is zeroed when mapped, and writes to that region are not
diff --git a/fs/namei.c b/fs/namei.c
index 94a5e60779f9..868d0cb9d473 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -19,7 +19,6 @@
 #include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/namei.h>
-#include <linux/quotaops.h>
 #include <linux/pagemap.h>
 #include <linux/fsnotify.h>
 #include <linux/personality.h>
@@ -498,8 +497,6 @@ static int link_path_walk(const char *, struct nameidata *);
 
 static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link)
 {
-	int res = 0;
-	char *name;
 	if (IS_ERR(link))
 		goto fail;
 
@@ -510,22 +507,7 @@ static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *l
 		path_get(&nd->root);
 	}
 
-	res = link_path_walk(link, nd);
-	if (nd->depth || res || nd->last_type!=LAST_NORM)
-		return res;
-	/*
-	 * If it is an iterative symlinks resolution in open_namei() we
-	 * have to copy the last component. And all that crap because of
-	 * bloody create() on broken symlinks. Furrfu...
-	 */
-	name = __getname();
-	if (unlikely(!name)) {
-		path_put(&nd->path);
-		return -ENOMEM;
-	}
-	strcpy(name, nd->last.name);
-	nd->last.name = name;
-	return 0;
+	return link_path_walk(link, nd);
 fail:
 	path_put(&nd->path);
 	return PTR_ERR(link);
@@ -541,16 +523,17 @@ static void path_put_conditional(struct path *path, struct nameidata *nd)
 static inline void path_to_nameidata(struct path *path, struct nameidata *nd)
 {
 	dput(nd->path.dentry);
-	if (nd->path.mnt != path->mnt)
+	if (nd->path.mnt != path->mnt) {
 		mntput(nd->path.mnt);
-	nd->path.mnt = path->mnt;
+		nd->path.mnt = path->mnt;
+	}
 	nd->path.dentry = path->dentry;
 }
 
-static __always_inline int __do_follow_link(struct path *path, struct nameidata *nd)
+static __always_inline int
+__do_follow_link(struct path *path, struct nameidata *nd, void **p)
 {
 	int error;
-	void *cookie;
 	struct dentry *dentry = path->dentry;
 
 	touch_atime(path->mnt, dentry);
@@ -562,9 +545,9 @@ static __always_inline int __do_follow_link(struct path *path, struct nameidata
 	}
 	mntget(path->mnt);
 	nd->last_type = LAST_BIND;
-	cookie = dentry->d_inode->i_op->follow_link(dentry, nd);
-	error = PTR_ERR(cookie);
-	if (!IS_ERR(cookie)) {
+	*p = dentry->d_inode->i_op->follow_link(dentry, nd);
+	error = PTR_ERR(*p);
+	if (!IS_ERR(*p)) {
 		char *s = nd_get_link(nd);
 		error = 0;
 		if (s)
@@ -574,8 +557,6 @@ static __always_inline int __do_follow_link(struct path *path, struct nameidata
 			if (error)
 				path_put(&nd->path);
 		}
-		if (dentry->d_inode->i_op->put_link)
-			dentry->d_inode->i_op->put_link(dentry, nd, cookie);
 	}
 	return error;
 }
@@ -589,6 +570,7 @@ static __always_inline int __do_follow_link(struct path *path, struct nameidata
  */
 static inline int do_follow_link(struct path *path, struct nameidata *nd)
 {
+	void *cookie;
 	int err = -ELOOP;
 	if (current->link_count >= MAX_NESTED_LINKS)
 		goto loop;
@@ -602,7 +584,9 @@ static inline int do_follow_link(struct path *path, struct nameidata *nd)
 	current->link_count++;
 	current->total_link_count++;
 	nd->depth++;
-	err = __do_follow_link(path, nd);
+	err = __do_follow_link(path, nd, &cookie);
+	if (!IS_ERR(cookie) && path->dentry->d_inode->i_op->put_link)
+		path->dentry->d_inode->i_op->put_link(path->dentry, nd, cookie);
 	path_put(path);
 	current->link_count--;
 	nd->depth--;
@@ -689,33 +673,20 @@ static __always_inline void follow_dotdot(struct nameidata *nd)
 	set_root(nd);
 
 	while(1) {
-		struct vfsmount *parent;
 		struct dentry *old = nd->path.dentry;
 
 		if (nd->path.dentry == nd->root.dentry &&
 		    nd->path.mnt == nd->root.mnt) {
 			break;
 		}
-		spin_lock(&dcache_lock);
 		if (nd->path.dentry != nd->path.mnt->mnt_root) {
-			nd->path.dentry = dget(nd->path.dentry->d_parent);
-			spin_unlock(&dcache_lock);
+			/* rare case of legitimate dget_parent()... */
+			nd->path.dentry = dget_parent(nd->path.dentry);
 			dput(old);
 			break;
 		}
-		spin_unlock(&dcache_lock);
-		spin_lock(&vfsmount_lock);
-		parent = nd->path.mnt->mnt_parent;
-		if (parent == nd->path.mnt) {
-			spin_unlock(&vfsmount_lock);
+		if (!follow_up(&nd->path))
 			break;
-		}
-		mntget(parent);
-		nd->path.dentry = dget(nd->path.mnt->mnt_mountpoint);
-		spin_unlock(&vfsmount_lock);
-		dput(old);
-		mntput(nd->path.mnt);
-		nd->path.mnt = parent;
 	}
 	follow_mount(&nd->path);
 }
@@ -823,6 +794,17 @@ fail:
 }
 
 /*
+ * This is a temporary kludge to deal with "automount" symlinks; proper
+ * solution is to trigger them on follow_mount(), so that do_lookup()
+ * would DTRT.  To be killed before 2.6.34-final.
+ */
+static inline int follow_on_final(struct inode *inode, unsigned lookup_flags)
+{
+	return inode && unlikely(inode->i_op->follow_link) &&
+		((lookup_flags & LOOKUP_FOLLOW) || S_ISDIR(inode->i_mode));
+}
+
+/*
  * Name resolution.
  * This is the basic name resolution function, turning a pathname into
  * the final dentry. We expect 'base' to be positive and a directory.
@@ -942,8 +924,7 @@ last_component:
 		if (err)
 			break;
 		inode = next.dentry->d_inode;
-		if ((lookup_flags & LOOKUP_FOLLOW)
-		    && inode && inode->i_op->follow_link) {
+		if (follow_on_final(inode, lookup_flags)) {
 			err = do_follow_link(&next, nd);
 			if (err)
 				goto return_err;
@@ -1337,7 +1318,7 @@ static int may_delete(struct inode *dir,struct dentry *victim,int isdir)
 		return -ENOENT;
 
 	BUG_ON(victim->d_parent->d_inode != dir);
-	audit_inode_child(victim->d_name.name, victim, dir);
+	audit_inode_child(victim, dir);
 
 	error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
 	if (error)
@@ -1378,22 +1359,6 @@ static inline int may_create(struct inode *dir, struct dentry *child)
 	return inode_permission(dir, MAY_WRITE | MAY_EXEC);
 }
 
-/* 
- * O_DIRECTORY translates into forcing a directory lookup.
- */
-static inline int lookup_flags(unsigned int f)
-{
-	unsigned long retval = LOOKUP_FOLLOW;
-
-	if (f & O_NOFOLLOW)
-		retval &= ~LOOKUP_FOLLOW;
-	
-	if (f & O_DIRECTORY)
-		retval |= LOOKUP_DIRECTORY;
-
-	return retval;
-}
-
 /*
  * p1 and p2 should be directories on the same fs.
  */
@@ -1451,7 +1416,6 @@ int vfs_create(struct inode *dir, struct dentry *dentry, int mode,
 	error = security_inode_create(dir, dentry, mode);
 	if (error)
 		return error;
-	vfs_dq_init(dir);
 	error = dir->i_op->create(dir, dentry, mode, nd);
 	if (!error)
 		fsnotify_create(dir, dentry);
@@ -1493,7 +1457,7 @@ int may_open(struct path *path, int acc_mode, int flag)
 	 * An append-only file must be opened in append mode for writing.
 	 */
 	if (IS_APPEND(inode)) {
-		if  ((flag & FMODE_WRITE) && !(flag & O_APPEND))
+		if  ((flag & O_ACCMODE) != O_RDONLY && !(flag & O_APPEND))
 			return -EPERM;
 		if (flag & O_TRUNC)
 			return -EPERM;
@@ -1537,7 +1501,7 @@ static int handle_truncate(struct path *path)
  * what get passed to sys_open().
  */
 static int __open_namei_create(struct nameidata *nd, struct path *path,
-				int flag, int mode)
+				int open_flag, int mode)
 {
 	int error;
 	struct dentry *dir = nd->path.dentry;
@@ -1555,7 +1519,7 @@ out_unlock:
 	if (error)
 		return error;
 	/* Don't check for write permission, don't truncate */
-	return may_open(&nd->path, 0, flag & ~O_TRUNC);
+	return may_open(&nd->path, 0, open_flag & ~O_TRUNC);
 }
 
 /*
@@ -1593,129 +1557,133 @@ static int open_will_truncate(int flag, struct inode *inode)
 	return (flag & O_TRUNC);
 }
 
-/*
- * Note that the low bits of the passed in "open_flag"
- * are not the same as in the local variable "flag". See
- * open_to_namei_flags() for more details.
- */
-struct file *do_filp_open(int dfd, const char *pathname,
-		int open_flag, int mode, int acc_mode)
+static struct file *finish_open(struct nameidata *nd,
+				int open_flag, int acc_mode)
 {
 	struct file *filp;
-	struct nameidata nd;
-	int error;
-	struct path path;
-	struct dentry *dir;
-	int count = 0;
 	int will_truncate;
-	int flag = open_to_namei_flags(open_flag);
-	int force_reval = 0;
+	int error;
 
+	will_truncate = open_will_truncate(open_flag, nd->path.dentry->d_inode);
+	if (will_truncate) {
+		error = mnt_want_write(nd->path.mnt);
+		if (error)
+			goto exit;
+	}
+	error = may_open(&nd->path, acc_mode, open_flag);
+	if (error) {
+		if (will_truncate)
+			mnt_drop_write(nd->path.mnt);
+		goto exit;
+	}
+	filp = nameidata_to_filp(nd);
+	if (!IS_ERR(filp)) {
+		error = ima_file_check(filp, acc_mode);
+		if (error) {
+			fput(filp);
+			filp = ERR_PTR(error);
+		}
+	}
+	if (!IS_ERR(filp)) {
+		if (will_truncate) {
+			error = handle_truncate(&nd->path);
+			if (error) {
+				fput(filp);
+				filp = ERR_PTR(error);
+			}
+		}
+	}
 	/*
-	 * O_SYNC is implemented as __O_SYNC|O_DSYNC.  As many places only
-	 * check for O_DSYNC if the need any syncing at all we enforce it's
-	 * always set instead of having to deal with possibly weird behaviour
-	 * for malicious applications setting only __O_SYNC.
+	 * It is now safe to drop the mnt write
+	 * because the filp has had a write taken
+	 * on its behalf.
 	 */
-	if (open_flag & __O_SYNC)
-		open_flag |= O_DSYNC;
-
-	if (!acc_mode)
-		acc_mode = MAY_OPEN | ACC_MODE(open_flag);
+	if (will_truncate)
+		mnt_drop_write(nd->path.mnt);
+	return filp;
 
-	/* O_TRUNC implies we need access checks for write permissions */
-	if (flag & O_TRUNC)
-		acc_mode |= MAY_WRITE;
+exit:
+	if (!IS_ERR(nd->intent.open.file))
+		release_open_intent(nd);
+	path_put(&nd->path);
+	return ERR_PTR(error);
+}
 
-	/* Allow the LSM permission hook to distinguish append 
-	   access from general write access. */
-	if (flag & O_APPEND)
-		acc_mode |= MAY_APPEND;
+static struct file *do_last(struct nameidata *nd, struct path *path,
+			    int open_flag, int acc_mode,
+			    int mode, const char *pathname)
+{
+	struct dentry *dir = nd->path.dentry;
+	struct file *filp;
+	int error = -EISDIR;
 
-	/*
-	 * The simplest case - just a plain lookup.
-	 */
-	if (!(flag & O_CREAT)) {
-		filp = get_empty_filp();
-
-		if (filp == NULL)
-			return ERR_PTR(-ENFILE);
-		nd.intent.open.file = filp;
-		filp->f_flags = open_flag;
-		nd.intent.open.flags = flag;
-		nd.intent.open.create_mode = 0;
-		error = do_path_lookup(dfd, pathname,
-					lookup_flags(flag)|LOOKUP_OPEN, &nd);
-		if (IS_ERR(nd.intent.open.file)) {
-			if (error == 0) {
-				error = PTR_ERR(nd.intent.open.file);
-				path_put(&nd.path);
+	switch (nd->last_type) {
+	case LAST_DOTDOT:
+		follow_dotdot(nd);
+		dir = nd->path.dentry;
+	case LAST_DOT:
+		if (nd->path.mnt->mnt_sb->s_type->fs_flags & FS_REVAL_DOT) {
+			if (!dir->d_op->d_revalidate(dir, nd)) {
+				error = -ESTALE;
+				goto exit;
 			}
-		} else if (error)
-			release_open_intent(&nd);
-		if (error)
-			return ERR_PTR(error);
+		}
+		/* fallthrough */
+	case LAST_ROOT:
+		if (open_flag & O_CREAT)
+			goto exit;
+		/* fallthrough */
+	case LAST_BIND:
+		audit_inode(pathname, dir);
 		goto ok;
 	}
 
-	/*
-	 * Create - we need to know the parent.
-	 */
-reval:
-	error = path_init(dfd, pathname, LOOKUP_PARENT, &nd);
-	if (error)
-		return ERR_PTR(error);
-	if (force_reval)
-		nd.flags |= LOOKUP_REVAL;
-	error = path_walk(pathname, &nd);
-	if (error) {
-		if (nd.root.mnt)
-			path_put(&nd.root);
-		return ERR_PTR(error);
+	/* trailing slashes? */
+	if (nd->last.name[nd->last.len]) {
+		if (open_flag & O_CREAT)
+			goto exit;
+		nd->flags |= LOOKUP_DIRECTORY | LOOKUP_FOLLOW;
 	}
-	if (unlikely(!audit_dummy_context()))
-		audit_inode(pathname, nd.path.dentry);
 
-	/*
-	 * We have the parent and last component. First of all, check
-	 * that we are not asked to creat(2) an obvious directory - that
-	 * will not do.
-	 */
-	error = -EISDIR;
-	if (nd.last_type != LAST_NORM || nd.last.name[nd.last.len])
-		goto exit_parent;
+	/* just plain open? */
+	if (!(open_flag & O_CREAT)) {
+		error = do_lookup(nd, &nd->last, path);
+		if (error)
+			goto exit;
+		error = -ENOENT;
+		if (!path->dentry->d_inode)
+			goto exit_dput;
+		if (path->dentry->d_inode->i_op->follow_link)
+			return NULL;
+		error = -ENOTDIR;
+		if (nd->flags & LOOKUP_DIRECTORY) {
+			if (!path->dentry->d_inode->i_op->lookup)
+				goto exit_dput;
+		}
+		path_to_nameidata(path, nd);
+		audit_inode(pathname, nd->path.dentry);
+		goto ok;
+	}
 
-	error = -ENFILE;
-	filp = get_empty_filp();
-	if (filp == NULL)
-		goto exit_parent;
-	nd.intent.open.file = filp;
-	filp->f_flags = open_flag;
-	nd.intent.open.flags = flag;
-	nd.intent.open.create_mode = mode;
-	dir = nd.path.dentry;
-	nd.flags &= ~LOOKUP_PARENT;
-	nd.flags |= LOOKUP_CREATE | LOOKUP_OPEN;
-	if (flag & O_EXCL)
-		nd.flags |= LOOKUP_EXCL;
+	/* OK, it's O_CREAT */
 	mutex_lock(&dir->d_inode->i_mutex);
-	path.dentry = lookup_hash(&nd);
-	path.mnt = nd.path.mnt;
 
-do_last:
-	error = PTR_ERR(path.dentry);
-	if (IS_ERR(path.dentry)) {
+	path->dentry = lookup_hash(nd);
+	path->mnt = nd->path.mnt;
+
+	error = PTR_ERR(path->dentry);
+	if (IS_ERR(path->dentry)) {
 		mutex_unlock(&dir->d_inode->i_mutex);
 		goto exit;
 	}
 
-	if (IS_ERR(nd.intent.open.file)) {
-		error = PTR_ERR(nd.intent.open.file);
+	if (IS_ERR(nd->intent.open.file)) {
+		error = PTR_ERR(nd->intent.open.file);
 		goto exit_mutex_unlock;
 	}
 
 	/* Negative dentry, just create the file */
-	if (!path.dentry->d_inode) {
+	if (!path->dentry->d_inode) {
 		/*
 		 * This write is needed to ensure that a
 		 * ro->rw transition does not occur between
@@ -1723,21 +1691,18 @@ do_last:
 		 * a permanent write count is taken through
 		 * the 'struct file' in nameidata_to_filp().
 		 */
-		error = mnt_want_write(nd.path.mnt);
+		error = mnt_want_write(nd->path.mnt);
 		if (error)
 			goto exit_mutex_unlock;
-		error = __open_namei_create(&nd, &path, flag, mode);
+		error = __open_namei_create(nd, path, open_flag, mode);
 		if (error) {
-			mnt_drop_write(nd.path.mnt);
+			mnt_drop_write(nd->path.mnt);
 			goto exit;
 		}
-		filp = nameidata_to_filp(&nd);
-		mnt_drop_write(nd.path.mnt);
-		if (nd.root.mnt)
-			path_put(&nd.root);
+		filp = nameidata_to_filp(nd);
+		mnt_drop_write(nd->path.mnt);
 		if (!IS_ERR(filp)) {
-			error = ima_path_check(&filp->f_path, filp->f_mode &
-				       (MAY_READ | MAY_WRITE | MAY_EXEC));
+			error = ima_file_check(filp, acc_mode);
 			if (error) {
 				fput(filp);
 				filp = ERR_PTR(error);
@@ -1750,151 +1715,184 @@ do_last:
 	 * It already exists.
 	 */
 	mutex_unlock(&dir->d_inode->i_mutex);
-	audit_inode(pathname, path.dentry);
+	audit_inode(pathname, path->dentry);
 
 	error = -EEXIST;
-	if (flag & O_EXCL)
+	if (open_flag & O_EXCL)
 		goto exit_dput;
 
-	if (__follow_mount(&path)) {
+	if (__follow_mount(path)) {
 		error = -ELOOP;
-		if (flag & O_NOFOLLOW)
+		if (open_flag & O_NOFOLLOW)
 			goto exit_dput;
 	}
 
 	error = -ENOENT;
-	if (!path.dentry->d_inode)
+	if (!path->dentry->d_inode)
 		goto exit_dput;
-	if (path.dentry->d_inode->i_op->follow_link)
-		goto do_link;
 
-	path_to_nameidata(&path, &nd);
+	if (path->dentry->d_inode->i_op->follow_link)
+		return NULL;
+
+	path_to_nameidata(path, nd);
 	error = -EISDIR;
-	if (S_ISDIR(path.dentry->d_inode->i_mode))
+	if (S_ISDIR(path->dentry->d_inode->i_mode))
 		goto exit;
 ok:
+	filp = finish_open(nd, open_flag, acc_mode);
+	return filp;
+
+exit_mutex_unlock:
+	mutex_unlock(&dir->d_inode->i_mutex);
+exit_dput:
+	path_put_conditional(path, nd);
+exit:
+	if (!IS_ERR(nd->intent.open.file))
+		release_open_intent(nd);
+	path_put(&nd->path);
+	return ERR_PTR(error);
+}
+
+/*
+ * Note that the low bits of the passed in "open_flag"
+ * are not the same as in the local variable "flag". See
+ * open_to_namei_flags() for more details.
+ */
+struct file *do_filp_open(int dfd, const char *pathname,
+		int open_flag, int mode, int acc_mode)
+{
+	struct file *filp;
+	struct nameidata nd;
+	int error;
+	struct path path;
+	int count = 0;
+	int flag = open_to_namei_flags(open_flag);
+	int force_reval = 0;
+
+	if (!(open_flag & O_CREAT))
+		mode = 0;
+
 	/*
-	 * Consider:
-	 * 1. may_open() truncates a file
-	 * 2. a rw->ro mount transition occurs
-	 * 3. nameidata_to_filp() fails due to
-	 *    the ro mount.
-	 * That would be inconsistent, and should
-	 * be avoided. Taking this mnt write here
-	 * ensures that (2) can not occur.
+	 * O_SYNC is implemented as __O_SYNC|O_DSYNC.  As many places only
+	 * check for O_DSYNC if the need any syncing at all we enforce it's
+	 * always set instead of having to deal with possibly weird behaviour
+	 * for malicious applications setting only __O_SYNC.
 	 */
-	will_truncate = open_will_truncate(flag, nd.path.dentry->d_inode);
-	if (will_truncate) {
-		error = mnt_want_write(nd.path.mnt);
-		if (error)
-			goto exit;
-	}
-	error = may_open(&nd.path, acc_mode, flag);
+	if (open_flag & __O_SYNC)
+		open_flag |= O_DSYNC;
+
+	if (!acc_mode)
+		acc_mode = MAY_OPEN | ACC_MODE(open_flag);
+
+	/* O_TRUNC implies we need access checks for write permissions */
+	if (open_flag & O_TRUNC)
+		acc_mode |= MAY_WRITE;
+
+	/* Allow the LSM permission hook to distinguish append 
+	   access from general write access. */
+	if (open_flag & O_APPEND)
+		acc_mode |= MAY_APPEND;
+
+	/* find the parent */
+reval:
+	error = path_init(dfd, pathname, LOOKUP_PARENT, &nd);
+	if (error)
+		return ERR_PTR(error);
+	if (force_reval)
+		nd.flags |= LOOKUP_REVAL;
+
+	current->total_link_count = 0;
+	error = link_path_walk(pathname, &nd);
 	if (error) {
-		if (will_truncate)
-			mnt_drop_write(nd.path.mnt);
-		goto exit;
-	}
-	filp = nameidata_to_filp(&nd);
-	if (!IS_ERR(filp)) {
-		error = ima_path_check(&filp->f_path, filp->f_mode &
-			       (MAY_READ | MAY_WRITE | MAY_EXEC));
-		if (error) {
-			fput(filp);
-			filp = ERR_PTR(error);
-		}
+		filp = ERR_PTR(error);
+		goto out;
 	}
-	if (!IS_ERR(filp)) {
-		if (acc_mode & MAY_WRITE)
-			vfs_dq_init(nd.path.dentry->d_inode);
+	if (unlikely(!audit_dummy_context()) && (open_flag & O_CREAT))
+		audit_inode(pathname, nd.path.dentry);
 
-		if (will_truncate) {
-			error = handle_truncate(&nd.path);
-			if (error) {
-				fput(filp);
-				filp = ERR_PTR(error);
-			}
-		}
-	}
 	/*
-	 * It is now safe to drop the mnt write
-	 * because the filp has had a write taken
-	 * on its behalf.
+	 * We have the parent and last component.
 	 */
-	if (will_truncate)
-		mnt_drop_write(nd.path.mnt);
+
+	error = -ENFILE;
+	filp = get_empty_filp();
+	if (filp == NULL)
+		goto exit_parent;
+	nd.intent.open.file = filp;
+	filp->f_flags = open_flag;
+	nd.intent.open.flags = flag;
+	nd.intent.open.create_mode = mode;
+	nd.flags &= ~LOOKUP_PARENT;
+	nd.flags |= LOOKUP_OPEN;
+	if (open_flag & O_CREAT) {
+		nd.flags |= LOOKUP_CREATE;
+		if (open_flag & O_EXCL)
+			nd.flags |= LOOKUP_EXCL;
+	}
+	if (open_flag & O_DIRECTORY)
+		nd.flags |= LOOKUP_DIRECTORY;
+	if (!(open_flag & O_NOFOLLOW))
+		nd.flags |= LOOKUP_FOLLOW;
+	filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
+	while (unlikely(!filp)) { /* trailing symlink */
+		struct path holder;
+		struct inode *inode = path.dentry->d_inode;
+		void *cookie;
+		error = -ELOOP;
+		/* S_ISDIR part is a temporary automount kludge */
+		if (!(nd.flags & LOOKUP_FOLLOW) && !S_ISDIR(inode->i_mode))
+			goto exit_dput;
+		if (count++ == 32)
+			goto exit_dput;
+		/*
+		 * This is subtle. Instead of calling do_follow_link() we do
+		 * the thing by hands. The reason is that this way we have zero
+		 * link_count and path_walk() (called from ->follow_link)
+		 * honoring LOOKUP_PARENT.  After that we have the parent and
+		 * last component, i.e. we are in the same situation as after
+		 * the first path_walk().  Well, almost - if the last component
+		 * is normal we get its copy stored in nd->last.name and we will
+		 * have to putname() it when we are done. Procfs-like symlinks
+		 * just set LAST_BIND.
+		 */
+		nd.flags |= LOOKUP_PARENT;
+		error = security_inode_follow_link(path.dentry, &nd);
+		if (error)
+			goto exit_dput;
+		error = __do_follow_link(&path, &nd, &cookie);
+		if (unlikely(error)) {
+			/* nd.path had been dropped */
+			if (!IS_ERR(cookie) && inode->i_op->put_link)
+				inode->i_op->put_link(path.dentry, &nd, cookie);
+			path_put(&path);
+			release_open_intent(&nd);
+			filp = ERR_PTR(error);
+			goto out;
+		}
+		holder = path;
+		nd.flags &= ~LOOKUP_PARENT;
+		filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
+		if (inode->i_op->put_link)
+			inode->i_op->put_link(holder.dentry, &nd, cookie);
+		path_put(&holder);
+	}
+out:
 	if (nd.root.mnt)
 		path_put(&nd.root);
+	if (filp == ERR_PTR(-ESTALE) && !force_reval) {
+		force_reval = 1;
+		goto reval;
+	}
 	return filp;
 
-exit_mutex_unlock:
-	mutex_unlock(&dir->d_inode->i_mutex);
 exit_dput:
 	path_put_conditional(&path, &nd);
-exit:
 	if (!IS_ERR(nd.intent.open.file))
 		release_open_intent(&nd);
 exit_parent:
-	if (nd.root.mnt)
-		path_put(&nd.root);
 	path_put(&nd.path);
-	return ERR_PTR(error);
-
-do_link:
-	error = -ELOOP;
-	if (flag & O_NOFOLLOW)
-		goto exit_dput;
-	/*
-	 * This is subtle. Instead of calling do_follow_link() we do the
-	 * thing by hands. The reason is that this way we have zero link_count
-	 * and path_walk() (called from ->follow_link) honoring LOOKUP_PARENT.
-	 * After that we have the parent and last component, i.e.
-	 * we are in the same situation as after the first path_walk().
-	 * Well, almost - if the last component is normal we get its copy
-	 * stored in nd->last.name and we will have to putname() it when we
-	 * are done. Procfs-like symlinks just set LAST_BIND.
-	 */
-	nd.flags |= LOOKUP_PARENT;
-	error = security_inode_follow_link(path.dentry, &nd);
-	if (error)
-		goto exit_dput;
-	error = __do_follow_link(&path, &nd);
-	path_put(&path);
-	if (error) {
-		/* Does someone understand code flow here? Or it is only
-		 * me so stupid? Anathema to whoever designed this non-sense
-		 * with "intent.open".
-		 */
-		release_open_intent(&nd);
-		if (nd.root.mnt)
-			path_put(&nd.root);
-		if (error == -ESTALE && !force_reval) {
-			force_reval = 1;
-			goto reval;
-		}
-		return ERR_PTR(error);
-	}
-	nd.flags &= ~LOOKUP_PARENT;
-	if (nd.last_type == LAST_BIND)
-		goto ok;
-	error = -EISDIR;
-	if (nd.last_type != LAST_NORM)
-		goto exit;
-	if (nd.last.name[nd.last.len]) {
-		__putname(nd.last.name);
-		goto exit;
-	}
-	error = -ELOOP;
-	if (count++==32) {
-		__putname(nd.last.name);
-		goto exit;
-	}
-	dir = nd.path.dentry;
-	mutex_lock(&dir->d_inode->i_mutex);
-	path.dentry = lookup_hash(&nd);
-	path.mnt = nd.path.mnt;
-	__putname(nd.last.name);
-	goto do_last;
+	filp = ERR_PTR(error);
+	goto out;
 }
 
 /**
@@ -1988,7 +1986,6 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
 	if (error)
 		return error;
 
-	vfs_dq_init(dir);
 	error = dir->i_op->mknod(dir, dentry, mode, dev);
 	if (!error)
 		fsnotify_create(dir, dentry);
@@ -2087,7 +2084,6 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	if (error)
 		return error;
 
-	vfs_dq_init(dir);
 	error = dir->i_op->mkdir(dir, dentry, mode);
 	if (!error)
 		fsnotify_mkdir(dir, dentry);
@@ -2173,8 +2169,6 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
 	if (!dir->i_op->rmdir)
 		return -EPERM;
 
-	vfs_dq_init(dir);
-
 	mutex_lock(&dentry->d_inode->i_mutex);
 	dentry_unhash(dentry);
 	if (d_mountpoint(dentry))
@@ -2183,8 +2177,10 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
 		error = security_inode_rmdir(dir, dentry);
 		if (!error) {
 			error = dir->i_op->rmdir(dir, dentry);
-			if (!error)
+			if (!error) {
 				dentry->d_inode->i_flags |= S_DEAD;
+				dont_mount(dentry);
+			}
 		}
 	}
 	mutex_unlock(&dentry->d_inode->i_mutex);
@@ -2260,15 +2256,16 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry)
 	if (!dir->i_op->unlink)
 		return -EPERM;
 
-	vfs_dq_init(dir);
-
 	mutex_lock(&dentry->d_inode->i_mutex);
 	if (d_mountpoint(dentry))
 		error = -EBUSY;
 	else {
 		error = security_inode_unlink(dir, dentry);
-		if (!error)
+		if (!error) {
 			error = dir->i_op->unlink(dir, dentry);
+			if (!error)
+				dont_mount(dentry);
+		}
 	}
 	mutex_unlock(&dentry->d_inode->i_mutex);
 
@@ -2371,7 +2368,6 @@ int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
 	if (error)
 		return error;
 
-	vfs_dq_init(dir);
 	error = dir->i_op->symlink(dir, dentry, oldname);
 	if (!error)
 		fsnotify_create(dir, dentry);
@@ -2455,7 +2451,6 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
 		return error;
 
 	mutex_lock(&inode->i_mutex);
-	vfs_dq_init(dir);
 	error = dir->i_op->link(old_dentry, dir, new_dentry);
 	mutex_unlock(&inode->i_mutex);
 	if (!error)
@@ -2556,7 +2551,7 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname
  *	e) conversion from fhandle to dentry may come in the wrong moment - when
  *	   we are removing the target. Solution: we will have to grab ->i_mutex
  *	   in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on
- *	   ->i_mutex on parents, which works but leads to some truely excessive
+ *	   ->i_mutex on parents, which works but leads to some truly excessive
  *	   locking].
  */
 static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
@@ -2580,17 +2575,20 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
 		return error;
 
 	target = new_dentry->d_inode;
-	if (target) {
+	if (target)
 		mutex_lock(&target->i_mutex);
-		dentry_unhash(new_dentry);
-	}
 	if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry))
 		error = -EBUSY;
-	else 
+	else {
+		if (target)
+			dentry_unhash(new_dentry);
 		error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
+	}
 	if (target) {
-		if (!error)
+		if (!error) {
 			target->i_flags |= S_DEAD;
+			dont_mount(new_dentry);
+		}
 		mutex_unlock(&target->i_mutex);
 		if (d_unhashed(new_dentry))
 			d_rehash(new_dentry);
@@ -2621,6 +2619,8 @@ static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
 	else
 		error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
 	if (!error) {
+		if (target)
+			dont_mount(new_dentry);
 		if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
 			d_move(old_dentry, new_dentry);
 	}
@@ -2654,20 +2654,15 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	if (!old_dir->i_op->rename)
 		return -EPERM;
 
-	vfs_dq_init(old_dir);
-	vfs_dq_init(new_dir);
-
 	old_name = fsnotify_oldname_init(old_dentry->d_name.name);
 
 	if (is_dir)
 		error = vfs_rename_dir(old_dir,old_dentry,new_dir,new_dentry);
 	else
 		error = vfs_rename_other(old_dir,old_dentry,new_dir,new_dentry);
-	if (!error) {
-		const char *new_name = old_dentry->d_name.name;
-		fsnotify_move(old_dir, new_dir, old_name, new_name, is_dir,
+	if (!error)
+		fsnotify_move(old_dir, new_dir, old_name, is_dir,
 			      new_dentry->d_inode, old_dentry);
-	}
 	fsnotify_oldname_free(old_name);
 
 	return error;
diff --git a/fs/namespace.c b/fs/namespace.c
index c768f733c8d6..88058de59c7c 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -573,7 +573,7 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
 			mnt->mnt_master = old;
 			CLEAR_MNT_SHARED(mnt);
 		} else if (!(flag & CL_PRIVATE)) {
-			if ((flag & CL_PROPAGATION) || IS_MNT_SHARED(old))
+			if ((flag & CL_MAKE_SHARED) || IS_MNT_SHARED(old))
 				list_add(&mnt->mnt_share, &old->mnt_share);
 			if (IS_MNT_SLAVE(old))
 				list_add(&mnt->mnt_slave, &old->mnt_slave);
@@ -628,7 +628,6 @@ repeat:
 		mnt->mnt_pinned = 0;
 		spin_unlock(&vfsmount_lock);
 		acct_auto_close_mnt(mnt);
-		security_sb_umount_close(mnt);
 		goto repeat;
 	}
 }
@@ -737,6 +736,21 @@ static void m_stop(struct seq_file *m, void *v)
 	up_read(&namespace_sem);
 }
 
+int mnt_had_events(struct proc_mounts *p)
+{
+	struct mnt_namespace *ns = p->ns;
+	int res = 0;
+
+	spin_lock(&vfsmount_lock);
+	if (p->event != ns->event) {
+		p->event = ns->event;
+		res = 1;
+	}
+	spin_unlock(&vfsmount_lock);
+
+	return res;
+}
+
 struct proc_fs_info {
 	int flag;
 	const char *str;
@@ -1102,8 +1116,6 @@ static int do_umount(struct vfsmount *mnt, int flags)
 		retval = 0;
 	}
 	spin_unlock(&vfsmount_lock);
-	if (retval)
-		security_sb_umount_busy(mnt);
 	up_write(&namespace_sem);
 	release_mounts(&umount_list);
 	return retval;
@@ -1121,8 +1133,15 @@ SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
 {
 	struct path path;
 	int retval;
+	int lookup_flags = 0;
+
+	if (flags & ~(MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW))
+		return -EINVAL;
+
+	if (!(flags & UMOUNT_NOFOLLOW))
+		lookup_flags |= LOOKUP_FOLLOW;
 
-	retval = user_path(name, &path);
+	retval = user_path_at(AT_FDCWD, name, lookup_flags, &path);
 	if (retval)
 		goto out;
 	retval = -EINVAL;
@@ -1246,6 +1265,21 @@ void drop_collected_mounts(struct vfsmount *mnt)
 	release_mounts(&umount_list);
 }
 
+int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg,
+		   struct vfsmount *root)
+{
+	struct vfsmount *mnt;
+	int res = f(root, arg);
+	if (res)
+		return res;
+	list_for_each_entry(mnt, &root->mnt_list, mnt_list) {
+		res = f(mnt, arg);
+		if (res)
+			return res;
+	}
+	return 0;
+}
+
 static void cleanup_group_ids(struct vfsmount *mnt, struct vfsmount *end)
 {
 	struct vfsmount *p;
@@ -1395,20 +1429,13 @@ static int graft_tree(struct vfsmount *mnt, struct path *path)
 
 	err = -ENOENT;
 	mutex_lock(&path->dentry->d_inode->i_mutex);
-	if (IS_DEADDIR(path->dentry->d_inode))
+	if (cant_mount(path->dentry))
 		goto out_unlock;
 
-	err = security_sb_check_sb(mnt, path);
-	if (err)
-		goto out_unlock;
-
-	err = -ENOENT;
 	if (!d_unlinked(path->dentry))
 		err = attach_recursive_mnt(mnt, path, NULL);
 out_unlock:
 	mutex_unlock(&path->dentry->d_inode->i_mutex);
-	if (!err)
-		security_sb_post_addmount(mnt, path);
 	return err;
 }
 
@@ -1538,14 +1565,12 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
 		err = do_remount_sb(sb, flags, data, 0);
 	if (!err) {
 		spin_lock(&vfsmount_lock);
-		mnt_flags |= path->mnt->mnt_flags & MNT_PNODE_MASK;
+		mnt_flags |= path->mnt->mnt_flags & MNT_PROPAGATION_MASK;
 		path->mnt->mnt_flags = mnt_flags;
 		spin_unlock(&vfsmount_lock);
 	}
 	up_write(&sb->s_umount);
 	if (!err) {
-		security_sb_post_remount(path->mnt, flags, data);
-
 		spin_lock(&vfsmount_lock);
 		touch_mnt_namespace(path->mnt->mnt_ns);
 		spin_unlock(&vfsmount_lock);
@@ -1586,7 +1611,7 @@ static int do_move_mount(struct path *path, char *old_name)
 
 	err = -ENOENT;
 	mutex_lock(&path->dentry->d_inode->i_mutex);
-	if (IS_DEADDIR(path->dentry->d_inode))
+	if (cant_mount(path->dentry))
 		goto out1;
 
 	if (d_unlinked(path->dentry))
@@ -1671,7 +1696,7 @@ int do_add_mount(struct vfsmount *newmnt, struct path *path,
 {
 	int err;
 
-	mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD);
+	mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL);
 
 	down_write(&namespace_sem);
 	/* Something was mounted here while we slept */
@@ -2197,7 +2222,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
 	if (!check_mnt(root.mnt))
 		goto out2;
 	error = -ENOENT;
-	if (IS_DEADDIR(new.dentry->d_inode))
+	if (cant_mount(old.dentry))
 		goto out2;
 	if (d_unlinked(new.dentry))
 		goto out2;
@@ -2240,7 +2265,6 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
 	touch_mnt_namespace(current->nsproxy->mnt_ns);
 	spin_unlock(&vfsmount_lock);
 	chroot_fs_refs(&root, &new);
-	security_sb_post_pivotroot(&root, &new);
 	error = 0;
 	path_put(&root_parent);
 	path_put(&parent_path);
@@ -2314,17 +2338,13 @@ void __init mnt_init(void)
 
 void put_mnt_ns(struct mnt_namespace *ns)
 {
-	struct vfsmount *root;
 	LIST_HEAD(umount_list);
 
-	if (!atomic_dec_and_lock(&ns->count, &vfsmount_lock))
+	if (!atomic_dec_and_test(&ns->count))
 		return;
-	root = ns->root;
-	ns->root = NULL;
-	spin_unlock(&vfsmount_lock);
 	down_write(&namespace_sem);
 	spin_lock(&vfsmount_lock);
-	umount_tree(root, 0, &umount_list);
+	umount_tree(ns->root, 0, &umount_list);
 	spin_unlock(&vfsmount_lock);
 	up_write(&namespace_sem);
 	release_mounts(&umount_list);
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index b8b5b30d53f0..9578cbe0cd58 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -15,7 +15,6 @@
 #include <linux/errno.h>
 #include <linux/stat.h>
 #include <linux/kernel.h>
-#include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/mm.h>
 #include <asm/uaccess.h>
@@ -50,9 +49,10 @@ extern int ncp_symlink(struct inode *, struct dentry *, const char *);
 		      
 const struct file_operations ncp_dir_operations =
 {
+	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
 	.readdir	= ncp_readdir,
-	.ioctl		= ncp_ioctl,
+	.unlocked_ioctl	= ncp_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= ncp_compat_ioctl,
 #endif
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index 6a7d901f1936..3639cc5cbdae 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -15,7 +15,6 @@
 #include <linux/fcntl.h>
 #include <linux/stat.h>
 #include <linux/mm.h>
-#include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/sched.h>
 #include <linux/smp_lock.h>
@@ -23,7 +22,7 @@
 #include <linux/ncp_fs.h>
 #include "ncplib_kernel.h"
 
-static int ncp_fsync(struct file *file, struct dentry *dentry, int datasync)
+static int ncp_fsync(struct file *file, int datasync)
 {
 	return 0;
 }
@@ -296,7 +295,7 @@ const struct file_operations ncp_file_operations =
 	.llseek 	= ncp_remote_llseek,
 	.read		= ncp_file_read,
 	.write		= ncp_file_write,
-	.ioctl		= ncp_ioctl,
+	.unlocked_ioctl	= ncp_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= ncp_compat_ioctl,
 #endif
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index cf98da1be23e..fa3385154023 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -526,10 +526,15 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
 	sb->s_blocksize_bits = 10;
 	sb->s_magic = NCP_SUPER_MAGIC;
 	sb->s_op = &ncp_sops;
+	sb->s_bdi = &server->bdi;
 
 	server = NCP_SBP(sb);
 	memset(server, 0, sizeof(*server));
 
+	error = bdi_setup_and_register(&server->bdi, "ncpfs", BDI_CAP_MAP_COPY);
+	if (error)
+		goto out_bdi;
+
 	server->ncp_filp = ncp_filp;
 	server->ncp_sock = sock;
 	
@@ -719,6 +724,8 @@ out_fput2:
 	if (server->info_filp)
 		fput(server->info_filp);
 out_fput:
+	bdi_destroy(&server->bdi);
+out_bdi:
 	/* 23/12/1998 Marcin Dalecki <dalecki@cs.net.pl>:
 	 * 
 	 * The previously used put_filp(ncp_filp); was bogous, since
@@ -756,6 +763,7 @@ static void ncp_put_super(struct super_block *sb)
 	kill_pid(server->m.wdog_pid, SIGTERM, 1);
 	put_pid(server->m.wdog_pid);
 
+	bdi_destroy(&server->bdi);
 	kfree(server->priv.data);
 	kfree(server->auth.object_name);
 	vfree(server->rxbuf);
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index ec8f45f12e05..023c03d02070 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -15,10 +15,12 @@
 #include <linux/time.h>
 #include <linux/mm.h>
 #include <linux/mount.h>
+#include <linux/slab.h>
 #include <linux/highuid.h>
 #include <linux/smp_lock.h>
 #include <linux/vmalloc.h>
 #include <linux/sched.h>
+#include <linux/smp_lock.h>
 
 #include <linux/ncp_fs.h>
 
@@ -260,9 +262,9 @@ ncp_get_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg)
 }
 #endif /* CONFIG_NCPFS_NLS */
 
-static int __ncp_ioctl(struct inode *inode, struct file *filp,
-	      unsigned int cmd, unsigned long arg)
+static long __ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
+	struct inode *inode = filp->f_dentry->d_inode;
 	struct ncp_server *server = NCP_SERVER(inode);
 	int result;
 	struct ncp_ioctl_request request;
@@ -840,11 +842,11 @@ static int ncp_ioctl_need_write(unsigned int cmd)
 	}
 }
 
-int ncp_ioctl(struct inode *inode, struct file *filp,
-	      unsigned int cmd, unsigned long arg)
+long ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
-	int ret;
+	long ret;
 
+	lock_kernel();
 	if (ncp_ioctl_need_write(cmd)) {
 		/*
 		 * inside the ioctl(), any failures which
@@ -852,24 +854,28 @@ int ncp_ioctl(struct inode *inode, struct file *filp,
 		 * -EACCESS, so it seems consistent to keep
 		 *  that here.
 		 */
-		if (mnt_want_write(filp->f_path.mnt))
-			return -EACCES;
+		if (mnt_want_write(filp->f_path.mnt)) {
+			ret = -EACCES;
+			goto out;
+		}
 	}
-	ret = __ncp_ioctl(inode, filp, cmd, arg);
+	ret = __ncp_ioctl(filp, cmd, arg);
 	if (ncp_ioctl_need_write(cmd))
 		mnt_drop_write(filp->f_path.mnt);
+
+out:
+	unlock_kernel();
 	return ret;
 }
 
 #ifdef CONFIG_COMPAT
 long ncp_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
-	int ret;
+	long ret;
 
 	lock_kernel();
 	arg = (unsigned long) compat_ptr(arg);
-	ret = ncp_ioctl(inode, file, cmd, arg);
+	ret = ncp_ioctl(file, cmd, arg);
 	unlock_kernel();
 	return ret;
 }
diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c
index 15458decdb8a..56f5b3a0e1ee 100644
--- a/fs/ncpfs/mmap.c
+++ b/fs/ncpfs/mmap.c
@@ -9,12 +9,12 @@
 #include <linux/stat.h>
 #include <linux/time.h>
 #include <linux/kernel.h>
+#include <linux/gfp.h>
 #include <linux/mm.h>
 #include <linux/shm.h>
 #include <linux/errno.h>
 #include <linux/mman.h>
 #include <linux/string.h>
-#include <linux/slab.h>
 #include <linux/fcntl.h>
 #include <linux/ncp_fs.h>
 
diff --git a/fs/ncpfs/sock.c b/fs/ncpfs/sock.c
index e37df8d5fe70..c7ff6c700a6e 100644
--- a/fs/ncpfs/sock.c
+++ b/fs/ncpfs/sock.c
@@ -21,6 +21,7 @@
 #include <linux/mm.h>
 #include <linux/netdevice.h>
 #include <linux/signal.h>
+#include <linux/slab.h>
 #include <net/scm.h>
 #include <net/sock.h>
 #include <linux/ipx.h>
diff --git a/fs/ncpfs/symlink.c b/fs/ncpfs/symlink.c
index e3d26c1bd105..c634fd17b337 100644
--- a/fs/ncpfs/symlink.c
+++ b/fs/ncpfs/symlink.c
@@ -27,6 +27,7 @@
 #include <linux/fs.h>
 #include <linux/ncp_fs.h>
 #include <linux/time.h>
+#include <linux/slab.h>
 #include <linux/mm.h>
 #include <linux/stat.h>
 #include "ncplib_kernel.h"
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index 59e5673b4597..a43d07e7b924 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -95,8 +95,7 @@ config ROOT_NFS
 	  Most people say N here.
 
 config NFS_FSCACHE
-	bool "Provide NFS client caching support (EXPERIMENTAL)"
-	depends on EXPERIMENTAL
+	bool "Provide NFS client caching support"
 	depends on NFS_FS=m && FSCACHE || NFS_FS=y && FSCACHE=y
 	help
 	  Say Y here if you want NFS data to be cached locally on disc through
diff --git a/fs/nfs/cache_lib.c b/fs/nfs/cache_lib.c
index b4ffd0146ea6..84690319e625 100644
--- a/fs/nfs/cache_lib.c
+++ b/fs/nfs/cache_lib.c
@@ -10,6 +10,7 @@
 #include <linux/moduleparam.h>
 #include <linux/mount.h>
 #include <linux/namei.h>
+#include <linux/slab.h>
 #include <linux/sunrpc/cache.h>
 #include <linux/sunrpc/rpc_pipe_fs.h>
 
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 73ab220354df..36dfdae95123 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -118,7 +118,6 @@ nfs4_callback_up(struct svc_serv *serv)
 	dprintk("NFS: Callback listener port = %u (af %u)\n",
 			nfs_callback_tcpport, PF_INET);
 
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
 	ret = svc_create_xprt(serv, "tcp", PF_INET6,
 				nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
 	if (ret > 0) {
@@ -129,7 +128,6 @@ nfs4_callback_up(struct svc_serv *serv)
 		ret = 0;
 	else
 		goto out_err;
-#endif	/* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */
 
 	return svc_prepare_thread(serv, &serv->sv_pools[0]);
 
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index d4036be0b589..85a7cfd1b8dd 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -119,6 +119,14 @@ struct cb_recallanyargs {
 };
 
 extern unsigned nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy);
+
+struct cb_recallslotargs {
+	struct sockaddr	*crsa_addr;
+	uint32_t	crsa_target_max_slots;
+};
+extern unsigned nfs4_callback_recallslot(struct cb_recallslotargs *args,
+					  void *dummy);
+
 #endif /* CONFIG_NFS_V4_1 */
 
 extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res);
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index defa9b4c470e..a08770a7e857 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -7,6 +7,7 @@
  */
 #include <linux/nfs4.h>
 #include <linux/nfs_fs.h>
+#include <linux/slab.h>
 #include "nfs4_fs.h"
 #include "callback.h"
 #include "delegation.h"
@@ -143,44 +144,49 @@ int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const n
  * Return success if the sequenceID is one more than what we last saw on
  * this slot, accounting for wraparound.  Increments the slot's sequence.
  *
- * We don't yet implement a duplicate request cache, so at this time
- * we will log replays, and process them as if we had not seen them before,
- * but we don't bump the sequence in the slot.  Not too worried about it,
+ * We don't yet implement a duplicate request cache, instead we set the
+ * back channel ca_maxresponsesize_cached to zero. This is OK for now
  * since we only currently implement idempotent callbacks anyway.
  *
  * We have a single slot backchannel at this time, so we don't bother
  * checking the used_slots bit array on the table.  The lower layer guarantees
  * a single outstanding callback request at a time.
  */
-static int
-validate_seqid(struct nfs4_slot_table *tbl, u32 slotid, u32 seqid)
+static __be32
+validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args)
 {
 	struct nfs4_slot *slot;
 
 	dprintk("%s enter. slotid %d seqid %d\n",
-		__func__, slotid, seqid);
+		__func__, args->csa_slotid, args->csa_sequenceid);
 
-	if (slotid > NFS41_BC_MAX_CALLBACKS)
+	if (args->csa_slotid > NFS41_BC_MAX_CALLBACKS)
 		return htonl(NFS4ERR_BADSLOT);
 
-	slot = tbl->slots + slotid;
+	slot = tbl->slots + args->csa_slotid;
 	dprintk("%s slot table seqid: %d\n", __func__, slot->seq_nr);
 
 	/* Normal */
-	if (likely(seqid == slot->seq_nr + 1)) {
+	if (likely(args->csa_sequenceid == slot->seq_nr + 1)) {
 		slot->seq_nr++;
 		return htonl(NFS4_OK);
 	}
 
 	/* Replay */
-	if (seqid == slot->seq_nr) {
-		dprintk("%s seqid %d is a replay - no DRC available\n",
-			__func__, seqid);
-		return htonl(NFS4_OK);
+	if (args->csa_sequenceid == slot->seq_nr) {
+		dprintk("%s seqid %d is a replay\n",
+			__func__, args->csa_sequenceid);
+		/* Signal process_op to set this error on next op */
+		if (args->csa_cachethis == 0)
+			return htonl(NFS4ERR_RETRY_UNCACHED_REP);
+
+		/* The ca_maxresponsesize_cached is 0 with no DRC */
+		else if (args->csa_cachethis == 1)
+			return htonl(NFS4ERR_REP_TOO_BIG_TO_CACHE);
 	}
 
 	/* Wraparound */
-	if (seqid == 1 && (slot->seq_nr + 1) == 0) {
+	if (args->csa_sequenceid == 1 && (slot->seq_nr + 1) == 0) {
 		slot->seq_nr = 1;
 		return htonl(NFS4_OK);
 	}
@@ -225,27 +231,87 @@ validate_seqid(struct nfs4_slot_table *tbl, u32 slotid, u32 seqid)
 	return NULL;
 }
 
-/* FIXME: referring calls should be processed */
-unsigned nfs4_callback_sequence(struct cb_sequenceargs *args,
+/*
+ * For each referring call triple, check the session's slot table for
+ * a match.  If the slot is in use and the sequence numbers match, the
+ * client is still waiting for a response to the original request.
+ */
+static bool referring_call_exists(struct nfs_client *clp,
+				  uint32_t nrclists,
+				  struct referring_call_list *rclists)
+{
+	bool status = 0;
+	int i, j;
+	struct nfs4_session *session;
+	struct nfs4_slot_table *tbl;
+	struct referring_call_list *rclist;
+	struct referring_call *ref;
+
+	/*
+	 * XXX When client trunking is implemented, this becomes
+	 * a session lookup from within the loop
+	 */
+	session = clp->cl_session;
+	tbl = &session->fc_slot_table;
+
+	for (i = 0; i < nrclists; i++) {
+		rclist = &rclists[i];
+		if (memcmp(session->sess_id.data,
+			   rclist->rcl_sessionid.data,
+			   NFS4_MAX_SESSIONID_LEN) != 0)
+			continue;
+
+		for (j = 0; j < rclist->rcl_nrefcalls; j++) {
+			ref = &rclist->rcl_refcalls[j];
+
+			dprintk("%s: sessionid %x:%x:%x:%x sequenceid %u "
+				"slotid %u\n", __func__,
+				((u32 *)&rclist->rcl_sessionid.data)[0],
+				((u32 *)&rclist->rcl_sessionid.data)[1],
+				((u32 *)&rclist->rcl_sessionid.data)[2],
+				((u32 *)&rclist->rcl_sessionid.data)[3],
+				ref->rc_sequenceid, ref->rc_slotid);
+
+			spin_lock(&tbl->slot_tbl_lock);
+			status = (test_bit(ref->rc_slotid, tbl->used_slots) &&
+				  tbl->slots[ref->rc_slotid].seq_nr ==
+					ref->rc_sequenceid);
+			spin_unlock(&tbl->slot_tbl_lock);
+			if (status)
+				goto out;
+		}
+	}
+
+out:
+	return status;
+}
+
+__be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
 				struct cb_sequenceres *res)
 {
 	struct nfs_client *clp;
-	int i, status;
-
-	for (i = 0; i < args->csa_nrclists; i++)
-		kfree(args->csa_rclists[i].rcl_refcalls);
-	kfree(args->csa_rclists);
+	int i;
+	__be32 status;
 
 	status = htonl(NFS4ERR_BADSESSION);
 	clp = find_client_with_session(args->csa_addr, 4, &args->csa_sessionid);
 	if (clp == NULL)
 		goto out;
 
-	status = validate_seqid(&clp->cl_session->bc_slot_table,
-				args->csa_slotid, args->csa_sequenceid);
+	status = validate_seqid(&clp->cl_session->bc_slot_table, args);
 	if (status)
 		goto out_putclient;
 
+	/*
+	 * Check for pending referring calls.  If a match is found, a
+	 * related callback was received before the response to the original
+	 * call.
+	 */
+	if (referring_call_exists(clp, args->csa_nrclists, args->csa_rclists)) {
+		status = htonl(NFS4ERR_DELAY);
+		goto out_putclient;
+	}
+
 	memcpy(&res->csr_sessionid, &args->csa_sessionid,
 	       sizeof(res->csr_sessionid));
 	res->csr_sequenceid = args->csa_sequenceid;
@@ -256,15 +322,23 @@ unsigned nfs4_callback_sequence(struct cb_sequenceargs *args,
 out_putclient:
 	nfs_put_client(clp);
 out:
-	dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
-	res->csr_status = status;
-	return res->csr_status;
+	for (i = 0; i < args->csa_nrclists; i++)
+		kfree(args->csa_rclists[i].rcl_refcalls);
+	kfree(args->csa_rclists);
+
+	if (status == htonl(NFS4ERR_RETRY_UNCACHED_REP))
+		res->csr_status = 0;
+	else
+		res->csr_status = status;
+	dprintk("%s: exit with status = %d res->csr_status %d\n", __func__,
+		ntohl(status), ntohl(res->csr_status));
+	return status;
 }
 
-unsigned nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy)
+__be32 nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy)
 {
 	struct nfs_client *clp;
-	int status;
+	__be32 status;
 	fmode_t flags = 0;
 
 	status = htonl(NFS4ERR_OP_NOT_IN_SESSION);
@@ -289,4 +363,40 @@ out:
 	dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
 	return status;
 }
+
+/* Reduce the fore channel's max_slots to the target value */
+__be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy)
+{
+	struct nfs_client *clp;
+	struct nfs4_slot_table *fc_tbl;
+	__be32 status;
+
+	status = htonl(NFS4ERR_OP_NOT_IN_SESSION);
+	clp = nfs_find_client(args->crsa_addr, 4);
+	if (clp == NULL)
+		goto out;
+
+	dprintk("NFS: CB_RECALL_SLOT request from %s target max slots %d\n",
+		rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR),
+		args->crsa_target_max_slots);
+
+	fc_tbl = &clp->cl_session->fc_slot_table;
+
+	status = htonl(NFS4ERR_BAD_HIGH_SLOT);
+	if (args->crsa_target_max_slots > fc_tbl->max_slots ||
+	    args->crsa_target_max_slots < 1)
+		goto out_putclient;
+
+	status = htonl(NFS4_OK);
+	if (args->crsa_target_max_slots == fc_tbl->max_slots)
+		goto out_putclient;
+
+	fc_tbl->target_max_slots = args->crsa_target_max_slots;
+	nfs41_handle_recall_slot(clp);
+out_putclient:
+	nfs_put_client(clp);	/* balance nfs_find_client */
+out:
+	dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
+	return status;
+}
 #endif /* CONFIG_NFS_V4_1 */
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 8e1a2511c8be..05af212f0edf 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -9,6 +9,7 @@
 #include <linux/sunrpc/svc.h>
 #include <linux/nfs4.h>
 #include <linux/nfs_fs.h>
+#include <linux/slab.h>
 #include "nfs4_fs.h"
 #include "callback.h"
 
@@ -24,10 +25,14 @@
 #define CB_OP_SEQUENCE_RES_MAXSZ	(CB_OP_HDR_RES_MAXSZ + \
 					4 + 1 + 3)
 #define CB_OP_RECALLANY_RES_MAXSZ	(CB_OP_HDR_RES_MAXSZ)
+#define CB_OP_RECALLSLOT_RES_MAXSZ	(CB_OP_HDR_RES_MAXSZ)
 #endif /* CONFIG_NFS_V4_1 */
 
 #define NFSDBG_FACILITY NFSDBG_CALLBACK
 
+/* Internal error code */
+#define NFS4ERR_RESOURCE_HDR	11050
+
 typedef __be32 (*callback_process_op_t)(void *, void *);
 typedef __be32 (*callback_decode_arg_t)(struct svc_rqst *, struct xdr_stream *, void *);
 typedef __be32 (*callback_encode_res_t)(struct svc_rqst *, struct xdr_stream *, void *);
@@ -173,7 +178,7 @@ static __be32 decode_op_hdr(struct xdr_stream *xdr, unsigned int *op)
 	__be32 *p;
 	p = read_buf(xdr, 4);
 	if (unlikely(p == NULL))
-		return htonl(NFS4ERR_RESOURCE);
+		return htonl(NFS4ERR_RESOURCE_HDR);
 	*op = ntohl(*p);
 	return 0;
 }
@@ -215,10 +220,10 @@ out:
 
 #if defined(CONFIG_NFS_V4_1)
 
-static unsigned decode_sessionid(struct xdr_stream *xdr,
+static __be32 decode_sessionid(struct xdr_stream *xdr,
 				 struct nfs4_sessionid *sid)
 {
-	uint32_t *p;
+	__be32 *p;
 	int len = NFS4_MAX_SESSIONID_LEN;
 
 	p = read_buf(xdr, len);
@@ -229,12 +234,12 @@ static unsigned decode_sessionid(struct xdr_stream *xdr,
 	return 0;
 }
 
-static unsigned decode_rc_list(struct xdr_stream *xdr,
+static __be32 decode_rc_list(struct xdr_stream *xdr,
 			       struct referring_call_list *rc_list)
 {
-	uint32_t *p;
+	__be32 *p;
 	int i;
-	unsigned status;
+	__be32 status;
 
 	status = decode_sessionid(xdr, &rc_list->rcl_sessionid);
 	if (status)
@@ -267,13 +272,13 @@ out:
 	return status;
 }
 
-static unsigned decode_cb_sequence_args(struct svc_rqst *rqstp,
+static __be32 decode_cb_sequence_args(struct svc_rqst *rqstp,
 					struct xdr_stream *xdr,
 					struct cb_sequenceargs *args)
 {
-	uint32_t *p;
+	__be32 *p;
 	int i;
-	unsigned status;
+	__be32 status;
 
 	status = decode_sessionid(xdr, &args->csa_sessionid);
 	if (status)
@@ -327,11 +332,11 @@ out_free:
 	goto out;
 }
 
-static unsigned decode_recallany_args(struct svc_rqst *rqstp,
+static __be32 decode_recallany_args(struct svc_rqst *rqstp,
 				      struct xdr_stream *xdr,
 				      struct cb_recallanyargs *args)
 {
-	uint32_t *p;
+	__be32 *p;
 
 	args->craa_addr = svc_addr(rqstp);
 	p = read_buf(xdr, 4);
@@ -346,6 +351,20 @@ static unsigned decode_recallany_args(struct svc_rqst *rqstp,
 	return 0;
 }
 
+static __be32 decode_recallslot_args(struct svc_rqst *rqstp,
+					struct xdr_stream *xdr,
+					struct cb_recallslotargs *args)
+{
+	__be32 *p;
+
+	args->crsa_addr = svc_addr(rqstp);
+	p = read_buf(xdr, 4);
+	if (unlikely(p == NULL))
+		return htonl(NFS4ERR_BADXDR);
+	args->crsa_target_max_slots = ntohl(*p++);
+	return 0;
+}
+
 #endif /* CONFIG_NFS_V4_1 */
 
 static __be32 encode_string(struct xdr_stream *xdr, unsigned int len, const char *str)
@@ -465,7 +484,7 @@ static __be32 encode_op_hdr(struct xdr_stream *xdr, uint32_t op, __be32 res)
 	
 	p = xdr_reserve_space(xdr, 8);
 	if (unlikely(p == NULL))
-		return htonl(NFS4ERR_RESOURCE);
+		return htonl(NFS4ERR_RESOURCE_HDR);
 	*p++ = htonl(op);
 	*p = res;
 	return 0;
@@ -499,10 +518,10 @@ out:
 
 #if defined(CONFIG_NFS_V4_1)
 
-static unsigned encode_sessionid(struct xdr_stream *xdr,
+static __be32 encode_sessionid(struct xdr_stream *xdr,
 				 const struct nfs4_sessionid *sid)
 {
-	uint32_t *p;
+	__be32 *p;
 	int len = NFS4_MAX_SESSIONID_LEN;
 
 	p = xdr_reserve_space(xdr, len);
@@ -513,11 +532,11 @@ static unsigned encode_sessionid(struct xdr_stream *xdr,
 	return 0;
 }
 
-static unsigned encode_cb_sequence_res(struct svc_rqst *rqstp,
+static __be32 encode_cb_sequence_res(struct svc_rqst *rqstp,
 				       struct xdr_stream *xdr,
 				       const struct cb_sequenceres *res)
 {
-	uint32_t *p;
+	__be32 *p;
 	unsigned status = res->csr_status;
 
 	if (unlikely(status != 0))
@@ -554,6 +573,7 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
 	case OP_CB_RECALL:
 	case OP_CB_SEQUENCE:
 	case OP_CB_RECALL_ANY:
+	case OP_CB_RECALL_SLOT:
 		*op = &callback_ops[op_nr];
 		break;
 
@@ -562,7 +582,6 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
 	case OP_CB_NOTIFY:
 	case OP_CB_PUSH_DELEG:
 	case OP_CB_RECALLABLE_OBJ_AVAIL:
-	case OP_CB_RECALL_SLOT:
 	case OP_CB_WANTS_CANCELLED:
 	case OP_CB_NOTIFY_LOCK:
 		return htonl(NFS4ERR_NOTSUPP);
@@ -602,20 +621,18 @@ preprocess_nfs4_op(unsigned int op_nr, struct callback_op **op)
 static __be32 process_op(uint32_t minorversion, int nop,
 		struct svc_rqst *rqstp,
 		struct xdr_stream *xdr_in, void *argp,
-		struct xdr_stream *xdr_out, void *resp)
+		struct xdr_stream *xdr_out, void *resp, int* drc_status)
 {
 	struct callback_op *op = &callback_ops[0];
-	unsigned int op_nr = OP_CB_ILLEGAL;
+	unsigned int op_nr;
 	__be32 status;
 	long maxlen;
 	__be32 res;
 
 	dprintk("%s: start\n", __func__);
 	status = decode_op_hdr(xdr_in, &op_nr);
-	if (unlikely(status)) {
-		status = htonl(NFS4ERR_OP_ILLEGAL);
-		goto out;
-	}
+	if (unlikely(status))
+		return status;
 
 	dprintk("%s: minorversion=%d nop=%d op_nr=%u\n",
 		__func__, minorversion, nop, op_nr);
@@ -624,19 +641,32 @@ static __be32 process_op(uint32_t minorversion, int nop,
 				preprocess_nfs4_op(op_nr, &op);
 	if (status == htonl(NFS4ERR_OP_ILLEGAL))
 		op_nr = OP_CB_ILLEGAL;
-out:
+	if (status)
+		goto encode_hdr;
+
+	if (*drc_status) {
+		status = *drc_status;
+		goto encode_hdr;
+	}
+
 	maxlen = xdr_out->end - xdr_out->p;
 	if (maxlen > 0 && maxlen < PAGE_SIZE) {
-		if (likely(status == 0 && op->decode_args != NULL))
-			status = op->decode_args(rqstp, xdr_in, argp);
-		if (likely(status == 0 && op->process_op != NULL))
+		status = op->decode_args(rqstp, xdr_in, argp);
+		if (likely(status == 0))
 			status = op->process_op(argp, resp);
 	} else
 		status = htonl(NFS4ERR_RESOURCE);
 
+	/* Only set by OP_CB_SEQUENCE processing */
+	if (status == htonl(NFS4ERR_RETRY_UNCACHED_REP)) {
+		*drc_status = status;
+		status = 0;
+	}
+
+encode_hdr:
 	res = encode_op_hdr(xdr_out, op_nr, status);
-	if (status == 0)
-		status = res;
+	if (unlikely(res))
+		return res;
 	if (op->encode_res != NULL && status == 0)
 		status = op->encode_res(rqstp, xdr_out, resp);
 	dprintk("%s: done, status = %d\n", __func__, ntohl(status));
@@ -652,7 +682,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
 	struct cb_compound_hdr_res hdr_res = { NULL };
 	struct xdr_stream xdr_in, xdr_out;
 	__be32 *p;
-	__be32 status;
+	__be32 status, drc_status = 0;
 	unsigned int nops = 0;
 
 	dprintk("%s: start\n", __func__);
@@ -672,11 +702,18 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
 		return rpc_system_err;
 
 	while (status == 0 && nops != hdr_arg.nops) {
-		status = process_op(hdr_arg.minorversion, nops,
-				    rqstp, &xdr_in, argp, &xdr_out, resp);
+		status = process_op(hdr_arg.minorversion, nops, rqstp,
+				    &xdr_in, argp, &xdr_out, resp, &drc_status);
 		nops++;
 	}
 
+	/* Buffer overflow in decode_ops_hdr or encode_ops_hdr. Return
+	* resource error in cb_compound status without returning op */
+	if (unlikely(status == htonl(NFS4ERR_RESOURCE_HDR))) {
+		status = htonl(NFS4ERR_RESOURCE);
+		nops--;
+	}
+
 	*hdr_res.status = status;
 	*hdr_res.nops = htonl(nops);
 	dprintk("%s: done, status = %u\n", __func__, ntohl(status));
@@ -713,6 +750,11 @@ static struct callback_op callback_ops[] = {
 		.decode_args = (callback_decode_arg_t)decode_recallany_args,
 		.res_maxsize = CB_OP_RECALLANY_RES_MAXSZ,
 	},
+	[OP_CB_RECALL_SLOT] = {
+		.process_op = (callback_process_op_t)nfs4_callback_recallslot,
+		.decode_args = (callback_decode_arg_t)decode_recallslot_args,
+		.res_maxsize = CB_OP_RECALLSLOT_RES_MAXSZ,
+	},
 #endif /* CONFIG_NFS_V4_1 */
 };
 
@@ -741,6 +783,7 @@ struct svc_version nfs4_callback_version1 = {
 	.vs_proc = nfs4_callback_procedures1,
 	.vs_xdrsize = NFS4_CALLBACK_XDRSIZE,
 	.vs_dispatch = NULL,
+	.vs_hidden = 1,
 };
 
 struct svc_version nfs4_callback_version4 = {
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index ee77713ce68b..d25b5257b7a1 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -35,6 +35,7 @@
 #include <linux/vfs.h>
 #include <linux/inet.h>
 #include <linux/in6.h>
+#include <linux/slab.h>
 #include <net/ipv6.h>
 #include <linux/nfs_xdr.h>
 #include <linux/sunrpc/bc_xprt.h>
@@ -164,30 +165,7 @@ error_0:
 	return ERR_PTR(err);
 }
 
-static void nfs4_shutdown_client(struct nfs_client *clp)
-{
-#ifdef CONFIG_NFS_V4
-	if (__test_and_clear_bit(NFS_CS_RENEWD, &clp->cl_res_state))
-		nfs4_kill_renewd(clp);
-	BUG_ON(!RB_EMPTY_ROOT(&clp->cl_state_owners));
-	if (__test_and_clear_bit(NFS_CS_IDMAP, &clp->cl_res_state))
-		nfs_idmap_delete(clp);
-
-	rpc_destroy_wait_queue(&clp->cl_rpcwaitq);
-#endif
-}
-
-/*
- * Destroy the NFS4 callback service
- */
-static void nfs4_destroy_callback(struct nfs_client *clp)
-{
 #ifdef CONFIG_NFS_V4
-	if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state))
-		nfs_callback_down(clp->cl_minorversion);
-#endif /* CONFIG_NFS_V4 */
-}
-
 /*
  * Clears/puts all minor version specific parts from an nfs_client struct
  * reverting it to minorversion 0.
@@ -202,9 +180,33 @@ static void nfs4_clear_client_minor_version(struct nfs_client *clp)
 
 	clp->cl_call_sync = _nfs4_call_sync;
 #endif /* CONFIG_NFS_V4_1 */
+}
 
+/*
+ * Destroy the NFS4 callback service
+ */
+static void nfs4_destroy_callback(struct nfs_client *clp)
+{
+	if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state))
+		nfs_callback_down(clp->cl_minorversion);
+}
+
+static void nfs4_shutdown_client(struct nfs_client *clp)
+{
+	if (__test_and_clear_bit(NFS_CS_RENEWD, &clp->cl_res_state))
+		nfs4_kill_renewd(clp);
+	nfs4_clear_client_minor_version(clp);
 	nfs4_destroy_callback(clp);
+	if (__test_and_clear_bit(NFS_CS_IDMAP, &clp->cl_res_state))
+		nfs_idmap_delete(clp);
+
+	rpc_destroy_wait_queue(&clp->cl_rpcwaitq);
 }
+#else
+static void nfs4_shutdown_client(struct nfs_client *clp)
+{
+}
+#endif /* CONFIG_NFS_V4 */
 
 /*
  * Destroy a shared client record
@@ -213,7 +215,6 @@ static void nfs_free_client(struct nfs_client *clp)
 {
 	dprintk("--> nfs_free_client(%u)\n", clp->rpc_ops->version);
 
-	nfs4_clear_client_minor_version(clp);
 	nfs4_shutdown_client(clp);
 
 	nfs_fscache_release_client_cookie(clp);
@@ -933,7 +934,6 @@ static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, str
 	}
 
 	fsinfo.fattr = fattr;
-	nfs_fattr_init(fattr);
 	error = clp->rpc_ops->fsinfo(server, mntfh, &fsinfo);
 	if (error < 0)
 		goto out_error;
@@ -965,6 +965,8 @@ out_error:
 static void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_server *source)
 {
 	target->flags = source->flags;
+	target->rsize = source->rsize;
+	target->wsize = source->wsize;
 	target->acregmin = source->acregmin;
 	target->acregmax = source->acregmax;
 	target->acdirmin = source->acdirmin;
@@ -1044,13 +1046,18 @@ struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data,
 				     struct nfs_fh *mntfh)
 {
 	struct nfs_server *server;
-	struct nfs_fattr fattr;
+	struct nfs_fattr *fattr;
 	int error;
 
 	server = nfs_alloc_server();
 	if (!server)
 		return ERR_PTR(-ENOMEM);
 
+	error = -ENOMEM;
+	fattr = nfs_alloc_fattr();
+	if (fattr == NULL)
+		goto error;
+
 	/* Get a client representation */
 	error = nfs_init_server(server, data);
 	if (error < 0)
@@ -1061,7 +1068,7 @@ struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data,
 	BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);
 
 	/* Probe the root fh to retrieve its FSID */
-	error = nfs_probe_fsinfo(server, mntfh, &fattr);
+	error = nfs_probe_fsinfo(server, mntfh, fattr);
 	if (error < 0)
 		goto error;
 	if (server->nfs_client->rpc_ops->version == 3) {
@@ -1074,14 +1081,14 @@ struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data,
 			server->namelen = NFS2_MAXNAMLEN;
 	}
 
-	if (!(fattr.valid & NFS_ATTR_FATTR)) {
-		error = server->nfs_client->rpc_ops->getattr(server, mntfh, &fattr);
+	if (!(fattr->valid & NFS_ATTR_FATTR)) {
+		error = server->nfs_client->rpc_ops->getattr(server, mntfh, fattr);
 		if (error < 0) {
 			dprintk("nfs_create_server: getattr error = %d\n", -error);
 			goto error;
 		}
 	}
-	memcpy(&server->fsid, &fattr.fsid, sizeof(server->fsid));
+	memcpy(&server->fsid, &fattr->fsid, sizeof(server->fsid));
 
 	dprintk("Server FSID: %llx:%llx\n",
 		(unsigned long long) server->fsid.major,
@@ -1093,9 +1100,11 @@ struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data,
 	spin_unlock(&nfs_client_lock);
 
 	server->mount_time = jiffies;
+	nfs_free_fattr(fattr);
 	return server;
 
 error:
+	nfs_free_fattr(fattr);
 	nfs_free_server(server);
 	return ERR_PTR(error);
 }
@@ -1277,6 +1286,55 @@ static void nfs4_session_set_rwsize(struct nfs_server *server)
 #endif /* CONFIG_NFS_V4_1 */
 }
 
+static int nfs4_server_common_setup(struct nfs_server *server,
+		struct nfs_fh *mntfh)
+{
+	struct nfs_fattr *fattr;
+	int error;
+
+	BUG_ON(!server->nfs_client);
+	BUG_ON(!server->nfs_client->rpc_ops);
+	BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);
+
+	fattr = nfs_alloc_fattr();
+	if (fattr == NULL)
+		return -ENOMEM;
+
+	/* We must ensure the session is initialised first */
+	error = nfs4_init_session(server);
+	if (error < 0)
+		goto out;
+
+	/* Probe the root fh to retrieve its FSID and filehandle */
+	error = nfs4_get_rootfh(server, mntfh);
+	if (error < 0)
+		goto out;
+
+	dprintk("Server FSID: %llx:%llx\n",
+			(unsigned long long) server->fsid.major,
+			(unsigned long long) server->fsid.minor);
+	dprintk("Mount FH: %d\n", mntfh->size);
+
+	nfs4_session_set_rwsize(server);
+
+	error = nfs_probe_fsinfo(server, mntfh, fattr);
+	if (error < 0)
+		goto out;
+
+	if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN)
+		server->namelen = NFS4_MAXNAMLEN;
+
+	spin_lock(&nfs_client_lock);
+	list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks);
+	list_add_tail(&server->master_link, &nfs_volume_list);
+	spin_unlock(&nfs_client_lock);
+
+	server->mount_time = jiffies;
+out:
+	nfs_free_fattr(fattr);
+	return error;
+}
+
 /*
  * Create a version 4 volume record
  */
@@ -1293,7 +1351,8 @@ static int nfs4_init_server(struct nfs_server *server,
 
 	/* Initialise the client representation from the mount data */
 	server->flags = data->flags;
-	server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR;
+	server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR|
+		NFS_CAP_POSIX_LOCK;
 	server->options = data->options;
 
 	/* Get a client record */
@@ -1336,7 +1395,6 @@ error:
 struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data,
 				      struct nfs_fh *mntfh)
 {
-	struct nfs_fattr fattr;
 	struct nfs_server *server;
 	int error;
 
@@ -1351,39 +1409,10 @@ struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data,
 	if (error < 0)
 		goto error;
 
-	BUG_ON(!server->nfs_client);
-	BUG_ON(!server->nfs_client->rpc_ops);
-	BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);
-
-	error = nfs4_init_session(server);
-	if (error < 0)
-		goto error;
-
-	/* Probe the root fh to retrieve its FSID */
-	error = nfs4_path_walk(server, mntfh, data->nfs_server.export_path);
-	if (error < 0)
-		goto error;
-
-	dprintk("Server FSID: %llx:%llx\n",
-		(unsigned long long) server->fsid.major,
-		(unsigned long long) server->fsid.minor);
-	dprintk("Mount FH: %d\n", mntfh->size);
-
-	nfs4_session_set_rwsize(server);
-
-	error = nfs_probe_fsinfo(server, mntfh, &fattr);
+	error = nfs4_server_common_setup(server, mntfh);
 	if (error < 0)
 		goto error;
 
-	if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN)
-		server->namelen = NFS4_MAXNAMLEN;
-
-	spin_lock(&nfs_client_lock);
-	list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks);
-	list_add_tail(&server->master_link, &nfs_volume_list);
-	spin_unlock(&nfs_client_lock);
-
-	server->mount_time = jiffies;
 	dprintk("<-- nfs4_create_server() = %p\n", server);
 	return server;
 
@@ -1401,7 +1430,6 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
 {
 	struct nfs_client *parent_client;
 	struct nfs_server *server, *parent_server;
-	struct nfs_fattr fattr;
 	int error;
 
 	dprintk("--> nfs4_create_referral_server()\n");
@@ -1434,34 +1462,10 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
 	if (error < 0)
 		goto error;
 
-	BUG_ON(!server->nfs_client);
-	BUG_ON(!server->nfs_client->rpc_ops);
-	BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);
-
-	/* Probe the root fh to retrieve its FSID and filehandle */
-	error = nfs4_path_walk(server, mntfh, data->mnt_path);
+	error = nfs4_server_common_setup(server, mntfh);
 	if (error < 0)
 		goto error;
 
-	/* probe the filesystem info for this server filesystem */
-	error = nfs_probe_fsinfo(server, mntfh, &fattr);
-	if (error < 0)
-		goto error;
-
-	if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN)
-		server->namelen = NFS4_MAXNAMLEN;
-
-	dprintk("Referral FSID: %llx:%llx\n",
-		(unsigned long long) server->fsid.major,
-		(unsigned long long) server->fsid.minor);
-
-	spin_lock(&nfs_client_lock);
-	list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks);
-	list_add_tail(&server->master_link, &nfs_volume_list);
-	spin_unlock(&nfs_client_lock);
-
-	server->mount_time = jiffies;
-
 	dprintk("<-- nfs_create_referral_server() = %p\n", server);
 	return server;
 
@@ -1481,7 +1485,7 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source,
 				    struct nfs_fattr *fattr)
 {
 	struct nfs_server *server;
-	struct nfs_fattr fattr_fsinfo;
+	struct nfs_fattr *fattr_fsinfo;
 	int error;
 
 	dprintk("--> nfs_clone_server(,%llx:%llx,)\n",
@@ -1492,6 +1496,11 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source,
 	if (!server)
 		return ERR_PTR(-ENOMEM);
 
+	error = -ENOMEM;
+	fattr_fsinfo = nfs_alloc_fattr();
+	if (fattr_fsinfo == NULL)
+		goto out_free_server;
+
 	/* Copy data from the source */
 	server->nfs_client = source->nfs_client;
 	atomic_inc(&server->nfs_client->cl_count);
@@ -1508,7 +1517,7 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source,
 		nfs_init_server_aclclient(server);
 
 	/* probe the filesystem info for this server filesystem */
-	error = nfs_probe_fsinfo(server, fh, &fattr_fsinfo);
+	error = nfs_probe_fsinfo(server, fh, fattr_fsinfo);
 	if (error < 0)
 		goto out_free_server;
 
@@ -1530,10 +1539,12 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source,
 
 	server->mount_time = jiffies;
 
+	nfs_free_fattr(fattr_fsinfo);
 	dprintk("<-- nfs_clone_server() = %p\n", server);
 	return server;
 
 out_free_server:
+	nfs_free_fattr(fattr_fsinfo);
 	nfs_free_server(server);
 	dprintk("<-- nfs_clone_server() = error %d\n", error);
 	return ERR_PTR(error);
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 2563bebc4c67..301634543974 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -10,6 +10,7 @@
 #include <linux/kthread.h>
 #include <linux/module.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/smp_lock.h>
 #include <linux/spinlock.h>
 
@@ -23,6 +24,8 @@
 
 static void nfs_do_free_delegation(struct nfs_delegation *delegation)
 {
+	if (delegation->cred)
+		put_rpccred(delegation->cred);
 	kfree(delegation);
 }
 
@@ -35,13 +38,7 @@ static void nfs_free_delegation_callback(struct rcu_head *head)
 
 static void nfs_free_delegation(struct nfs_delegation *delegation)
 {
-	struct rpc_cred *cred;
-
-	cred = rcu_dereference(delegation->cred);
-	rcu_assign_pointer(delegation->cred, NULL);
 	call_rcu(&delegation->rcu, nfs_free_delegation_callback);
-	if (cred)
-		put_rpccred(cred);
 }
 
 void nfs_mark_delegation_referenced(struct nfs_delegation *delegation)
@@ -128,21 +125,35 @@ again:
  */
 void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res)
 {
-	struct nfs_delegation *delegation = NFS_I(inode)->delegation;
-	struct rpc_cred *oldcred;
+	struct nfs_delegation *delegation;
+	struct rpc_cred *oldcred = NULL;
 
-	if (delegation == NULL)
-		return;
-	memcpy(delegation->stateid.data, res->delegation.data,
-			sizeof(delegation->stateid.data));
-	delegation->type = res->delegation_type;
-	delegation->maxsize = res->maxsize;
-	oldcred = delegation->cred;
-	delegation->cred = get_rpccred(cred);
-	clear_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags);
-	NFS_I(inode)->delegation_state = delegation->type;
-	smp_wmb();
-	put_rpccred(oldcred);
+	rcu_read_lock();
+	delegation = rcu_dereference(NFS_I(inode)->delegation);
+	if (delegation != NULL) {
+		spin_lock(&delegation->lock);
+		if (delegation->inode != NULL) {
+			memcpy(delegation->stateid.data, res->delegation.data,
+			       sizeof(delegation->stateid.data));
+			delegation->type = res->delegation_type;
+			delegation->maxsize = res->maxsize;
+			oldcred = delegation->cred;
+			delegation->cred = get_rpccred(cred);
+			clear_bit(NFS_DELEGATION_NEED_RECLAIM,
+				  &delegation->flags);
+			NFS_I(inode)->delegation_state = delegation->type;
+			spin_unlock(&delegation->lock);
+			put_rpccred(oldcred);
+			rcu_read_unlock();
+		} else {
+			/* We appear to have raced with a delegation return. */
+			spin_unlock(&delegation->lock);
+			rcu_read_unlock();
+			nfs_inode_set_delegation(inode, cred, res);
+		}
+	} else {
+		rcu_read_unlock();
+	}
 }
 
 static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *delegation, int issync)
@@ -165,9 +176,13 @@ static struct inode *nfs_delegation_grab_inode(struct nfs_delegation *delegation
 	return inode;
 }
 
-static struct nfs_delegation *nfs_detach_delegation_locked(struct nfs_inode *nfsi, const nfs4_stateid *stateid)
+static struct nfs_delegation *nfs_detach_delegation_locked(struct nfs_inode *nfsi,
+							   const nfs4_stateid *stateid,
+							   struct nfs_client *clp)
 {
-	struct nfs_delegation *delegation = rcu_dereference(nfsi->delegation);
+	struct nfs_delegation *delegation =
+		rcu_dereference_protected(nfsi->delegation,
+					  lockdep_is_held(&clp->cl_lock));
 
 	if (delegation == NULL)
 		goto nomatch;
@@ -194,11 +209,11 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
 {
 	struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
 	struct nfs_inode *nfsi = NFS_I(inode);
-	struct nfs_delegation *delegation;
+	struct nfs_delegation *delegation, *old_delegation;
 	struct nfs_delegation *freeme = NULL;
 	int status = 0;
 
-	delegation = kmalloc(sizeof(*delegation), GFP_KERNEL);
+	delegation = kmalloc(sizeof(*delegation), GFP_NOFS);
 	if (delegation == NULL)
 		return -ENOMEM;
 	memcpy(delegation->stateid.data, res->delegation.data,
@@ -212,10 +227,12 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
 	spin_lock_init(&delegation->lock);
 
 	spin_lock(&clp->cl_lock);
-	if (rcu_dereference(nfsi->delegation) != NULL) {
-		if (memcmp(&delegation->stateid, &nfsi->delegation->stateid,
-					sizeof(delegation->stateid)) == 0 &&
-				delegation->type == nfsi->delegation->type) {
+	old_delegation = rcu_dereference_protected(nfsi->delegation,
+						   lockdep_is_held(&clp->cl_lock));
+	if (old_delegation != NULL) {
+		if (memcmp(&delegation->stateid, &old_delegation->stateid,
+					sizeof(old_delegation->stateid)) == 0 &&
+				delegation->type == old_delegation->type) {
 			goto out;
 		}
 		/*
@@ -225,12 +242,12 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
 		dfprintk(FILE, "%s: server %s handed out "
 				"a duplicate delegation!\n",
 				__func__, clp->cl_hostname);
-		if (delegation->type <= nfsi->delegation->type) {
+		if (delegation->type <= old_delegation->type) {
 			freeme = delegation;
 			delegation = NULL;
 			goto out;
 		}
-		freeme = nfs_detach_delegation_locked(nfsi, NULL);
+		freeme = nfs_detach_delegation_locked(nfsi, NULL, clp);
 	}
 	list_add_rcu(&delegation->super_list, &clp->cl_delegations);
 	nfsi->delegation_state = delegation->type;
@@ -300,7 +317,7 @@ restart:
 		if (inode == NULL)
 			continue;
 		spin_lock(&clp->cl_lock);
-		delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL);
+		delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL, clp);
 		spin_unlock(&clp->cl_lock);
 		rcu_read_unlock();
 		if (delegation != NULL) {
@@ -329,9 +346,9 @@ void nfs_inode_return_delegation_noreclaim(struct inode *inode)
 	struct nfs_inode *nfsi = NFS_I(inode);
 	struct nfs_delegation *delegation;
 
-	if (rcu_dereference(nfsi->delegation) != NULL) {
+	if (rcu_access_pointer(nfsi->delegation) != NULL) {
 		spin_lock(&clp->cl_lock);
-		delegation = nfs_detach_delegation_locked(nfsi, NULL);
+		delegation = nfs_detach_delegation_locked(nfsi, NULL, clp);
 		spin_unlock(&clp->cl_lock);
 		if (delegation != NULL)
 			nfs_do_return_delegation(inode, delegation, 0);
@@ -345,9 +362,9 @@ int nfs_inode_return_delegation(struct inode *inode)
 	struct nfs_delegation *delegation;
 	int err = 0;
 
-	if (rcu_dereference(nfsi->delegation) != NULL) {
+	if (rcu_access_pointer(nfsi->delegation) != NULL) {
 		spin_lock(&clp->cl_lock);
-		delegation = nfs_detach_delegation_locked(nfsi, NULL);
+		delegation = nfs_detach_delegation_locked(nfsi, NULL, clp);
 		spin_unlock(&clp->cl_lock);
 		if (delegation != NULL) {
 			nfs_msync_inode(inode);
@@ -525,7 +542,7 @@ restart:
 		if (inode == NULL)
 			continue;
 		spin_lock(&clp->cl_lock);
-		delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL);
+		delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL, clp);
 		spin_unlock(&clp->cl_lock);
 		rcu_read_unlock();
 		if (delegation != NULL)
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index 944b627ec6e1..69e7b8140122 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -71,4 +71,10 @@ static inline int nfs_inode_return_delegation(struct inode *inode)
 }
 #endif
 
+static inline int nfs_have_delegated_attributes(struct inode *inode)
+{
+	return nfs_have_delegation(inode, FMODE_READ) &&
+		!(NFS_I(inode)->cache_validity & NFS_INO_REVAL_FORCED);
+}
+
 #endif
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 3c7f03b669fb..e60416d3f818 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -53,7 +53,7 @@ static int nfs_link(struct dentry *, struct inode *, struct dentry *);
 static int nfs_mknod(struct inode *, struct dentry *, int, dev_t);
 static int nfs_rename(struct inode *, struct dentry *,
 		      struct inode *, struct dentry *);
-static int nfs_fsync_dir(struct file *, struct dentry *, int);
+static int nfs_fsync_dir(struct file *, int);
 static loff_t nfs_llseek_dir(struct file *, loff_t, int);
 
 const struct file_operations nfs_dir_operations = {
@@ -530,9 +530,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	nfs_readdir_descriptor_t my_desc,
 			*desc = &my_desc;
 	struct nfs_entry my_entry;
-	struct nfs_fh	 fh;
-	struct nfs_fattr fattr;
-	long		res;
+	int res = -ENOMEM;
 
 	dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n",
 			dentry->d_parent->d_name.name, dentry->d_name.name,
@@ -554,13 +552,15 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 
 	my_entry.cookie = my_entry.prev_cookie = 0;
 	my_entry.eof = 0;
-	my_entry.fh = &fh;
-	my_entry.fattr = &fattr;
-	nfs_fattr_init(&fattr);
+	my_entry.fh = nfs_alloc_fhandle();
+	my_entry.fattr = nfs_alloc_fattr();
+	if (my_entry.fh == NULL || my_entry.fattr == NULL)
+		goto out_alloc_failed;
+
 	desc->entry = &my_entry;
 
 	nfs_block_sillyrename(dentry);
-	res = nfs_revalidate_mapping_nolock(inode, filp->f_mapping);
+	res = nfs_revalidate_mapping(inode, filp->f_mapping);
 	if (res < 0)
 		goto out;
 
@@ -598,7 +598,10 @@ out:
 	nfs_unblock_sillyrename(dentry);
 	if (res > 0)
 		res = 0;
-	dfprintk(FILE, "NFS: readdir(%s/%s) returns %ld\n",
+out_alloc_failed:
+	nfs_free_fattr(my_entry.fattr);
+	nfs_free_fhandle(my_entry.fh);
+	dfprintk(FILE, "NFS: readdir(%s/%s) returns %d\n",
 			dentry->d_parent->d_name.name, dentry->d_name.name,
 			res);
 	return res;
@@ -638,8 +641,10 @@ out:
  * All directory operations under NFS are synchronous, so fsync()
  * is a dummy operation.
  */
-static int nfs_fsync_dir(struct file *filp, struct dentry *dentry, int datasync)
+static int nfs_fsync_dir(struct file *filp, int datasync)
 {
+	struct dentry *dentry = filp->f_path.dentry;
+
 	dfprintk(FILE, "NFS: fsync dir(%s/%s) datasync %d\n",
 			dentry->d_parent->d_name.name, dentry->d_name.name,
 			datasync);
@@ -776,9 +781,9 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd)
 	struct inode *dir;
 	struct inode *inode;
 	struct dentry *parent;
+	struct nfs_fh *fhandle = NULL;
+	struct nfs_fattr *fattr = NULL;
 	int error;
-	struct nfs_fh fhandle;
-	struct nfs_fattr fattr;
 
 	parent = dget_parent(dentry);
 	dir = parent->d_inode;
@@ -811,14 +816,22 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd)
 	if (NFS_STALE(inode))
 		goto out_bad;
 
-	error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, &fhandle, &fattr);
+	error = -ENOMEM;
+	fhandle = nfs_alloc_fhandle();
+	fattr = nfs_alloc_fattr();
+	if (fhandle == NULL || fattr == NULL)
+		goto out_error;
+
+	error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr);
 	if (error)
 		goto out_bad;
-	if (nfs_compare_fh(NFS_FH(inode), &fhandle))
+	if (nfs_compare_fh(NFS_FH(inode), fhandle))
 		goto out_bad;
-	if ((error = nfs_refresh_inode(inode, &fattr)) != 0)
+	if ((error = nfs_refresh_inode(inode, fattr)) != 0)
 		goto out_bad;
 
+	nfs_free_fattr(fattr);
+	nfs_free_fhandle(fhandle);
 out_set_verifier:
 	nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
  out_valid:
@@ -837,14 +850,26 @@ out_zap_parent:
 		/* If we have submounts, don't unhash ! */
 		if (have_submounts(dentry))
 			goto out_valid;
+		if (dentry->d_flags & DCACHE_DISCONNECTED)
+			goto out_valid;
 		shrink_dcache_parent(dentry);
 	}
 	d_drop(dentry);
+	nfs_free_fattr(fattr);
+	nfs_free_fhandle(fhandle);
 	dput(parent);
 	dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is invalid\n",
 			__func__, dentry->d_parent->d_name.name,
 			dentry->d_name.name);
 	return 0;
+out_error:
+	nfs_free_fattr(fattr);
+	nfs_free_fhandle(fhandle);
+	dput(parent);
+	dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) lookup returned error %d\n",
+			__func__, dentry->d_parent->d_name.name,
+			dentry->d_name.name, error);
+	return error;
 }
 
 /*
@@ -909,9 +934,9 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
 	struct dentry *res;
 	struct dentry *parent;
 	struct inode *inode = NULL;
+	struct nfs_fh *fhandle = NULL;
+	struct nfs_fattr *fattr = NULL;
 	int error;
-	struct nfs_fh fhandle;
-	struct nfs_fattr fattr;
 
 	dfprintk(VFS, "NFS: lookup(%s/%s)\n",
 		dentry->d_parent->d_name.name, dentry->d_name.name);
@@ -921,7 +946,6 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
 	if (dentry->d_name.len > NFS_SERVER(dir)->namelen)
 		goto out;
 
-	res = ERR_PTR(-ENOMEM);
 	dentry->d_op = NFS_PROTO(dir)->dentry_ops;
 
 	/*
@@ -934,17 +958,23 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
 		goto out;
 	}
 
+	res = ERR_PTR(-ENOMEM);
+	fhandle = nfs_alloc_fhandle();
+	fattr = nfs_alloc_fattr();
+	if (fhandle == NULL || fattr == NULL)
+		goto out;
+
 	parent = dentry->d_parent;
 	/* Protect against concurrent sillydeletes */
 	nfs_block_sillyrename(parent);
-	error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, &fhandle, &fattr);
+	error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr);
 	if (error == -ENOENT)
 		goto no_entry;
 	if (error < 0) {
 		res = ERR_PTR(error);
 		goto out_unblock_sillyrename;
 	}
-	inode = nfs_fhget(dentry->d_sb, &fhandle, &fattr);
+	inode = nfs_fhget(dentry->d_sb, fhandle, fattr);
 	res = (struct dentry *)inode;
 	if (IS_ERR(res))
 		goto out_unblock_sillyrename;
@@ -960,6 +990,8 @@ no_entry:
 out_unblock_sillyrename:
 	nfs_unblock_sillyrename(parent);
 out:
+	nfs_free_fattr(fattr);
+	nfs_free_fhandle(fhandle);
 	return res;
 }
 
@@ -1025,12 +1057,12 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
 				res = NULL;
 				goto out;
 			/* This turned out not to be a regular file */
+			case -EISDIR:
 			case -ENOTDIR:
 				goto no_open;
 			case -ELOOP:
 				if (!(nd->intent.open.flags & O_NOFOLLOW))
 					goto no_open;
-			/* case -EISDIR: */
 			/* case -EINVAL: */
 			default:
 				goto out;
@@ -1050,7 +1082,7 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
 	struct inode *dir;
 	int openflags, ret = 0;
 
-	if (!is_atomic_open(nd))
+	if (!is_atomic_open(nd) || d_mountpoint(dentry))
 		goto no_open;
 	parent = dget_parent(dentry);
 	dir = parent->d_inode;
@@ -1667,28 +1699,33 @@ static void nfs_access_free_entry(struct nfs_access_entry *entry)
 	smp_mb__after_atomic_dec();
 }
 
-int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask)
+static void nfs_access_free_list(struct list_head *head)
+{
+	struct nfs_access_entry *cache;
+
+	while (!list_empty(head)) {
+		cache = list_entry(head->next, struct nfs_access_entry, lru);
+		list_del(&cache->lru);
+		nfs_access_free_entry(cache);
+	}
+}
+
+int nfs_access_cache_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
 {
 	LIST_HEAD(head);
 	struct nfs_inode *nfsi;
 	struct nfs_access_entry *cache;
 
-restart:
+	if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL)
+		return (nr_to_scan == 0) ? 0 : -1;
+
 	spin_lock(&nfs_access_lru_lock);
 	list_for_each_entry(nfsi, &nfs_access_lru_list, access_cache_inode_lru) {
-		struct rw_semaphore *s_umount;
 		struct inode *inode;
 
 		if (nr_to_scan-- == 0)
 			break;
-		s_umount = &nfsi->vfs_inode.i_sb->s_umount;
-		if (!down_read_trylock(s_umount))
-			continue;
-		inode = igrab(&nfsi->vfs_inode);
-		if (inode == NULL) {
-			up_read(s_umount);
-			continue;
-		}
+		inode = &nfsi->vfs_inode;
 		spin_lock(&inode->i_lock);
 		if (list_empty(&nfsi->access_cache_entry_lru))
 			goto remove_lru_entry;
@@ -1702,61 +1739,48 @@ restart:
 		else {
 remove_lru_entry:
 			list_del_init(&nfsi->access_cache_inode_lru);
+			smp_mb__before_clear_bit();
 			clear_bit(NFS_INO_ACL_LRU_SET, &nfsi->flags);
+			smp_mb__after_clear_bit();
 		}
 		spin_unlock(&inode->i_lock);
-		spin_unlock(&nfs_access_lru_lock);
-		iput(inode);
-		up_read(s_umount);
-		goto restart;
 	}
 	spin_unlock(&nfs_access_lru_lock);
-	while (!list_empty(&head)) {
-		cache = list_entry(head.next, struct nfs_access_entry, lru);
-		list_del(&cache->lru);
-		nfs_access_free_entry(cache);
-	}
+	nfs_access_free_list(&head);
 	return (atomic_long_read(&nfs_access_nr_entries) / 100) * sysctl_vfs_cache_pressure;
 }
 
-static void __nfs_access_zap_cache(struct inode *inode)
+static void __nfs_access_zap_cache(struct nfs_inode *nfsi, struct list_head *head)
 {
-	struct nfs_inode *nfsi = NFS_I(inode);
 	struct rb_root *root_node = &nfsi->access_cache;
-	struct rb_node *n, *dispose = NULL;
+	struct rb_node *n;
 	struct nfs_access_entry *entry;
 
 	/* Unhook entries from the cache */
 	while ((n = rb_first(root_node)) != NULL) {
 		entry = rb_entry(n, struct nfs_access_entry, rb_node);
 		rb_erase(n, root_node);
-		list_del(&entry->lru);
-		n->rb_left = dispose;
-		dispose = n;
+		list_move(&entry->lru, head);
 	}
 	nfsi->cache_validity &= ~NFS_INO_INVALID_ACCESS;
-	spin_unlock(&inode->i_lock);
-
-	/* Now kill them all! */
-	while (dispose != NULL) {
-		n = dispose;
-		dispose = n->rb_left;
-		nfs_access_free_entry(rb_entry(n, struct nfs_access_entry, rb_node));
-	}
 }
 
 void nfs_access_zap_cache(struct inode *inode)
 {
+	LIST_HEAD(head);
+
+	if (test_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags) == 0)
+		return;
 	/* Remove from global LRU init */
-	if (test_and_clear_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) {
-		spin_lock(&nfs_access_lru_lock);
+	spin_lock(&nfs_access_lru_lock);
+	if (test_and_clear_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags))
 		list_del_init(&NFS_I(inode)->access_cache_inode_lru);
-		spin_unlock(&nfs_access_lru_lock);
-	}
 
 	spin_lock(&inode->i_lock);
-	/* This will release the spinlock */
-	__nfs_access_zap_cache(inode);
+	__nfs_access_zap_cache(NFS_I(inode), &head);
+	spin_unlock(&inode->i_lock);
+	spin_unlock(&nfs_access_lru_lock);
+	nfs_access_free_list(&head);
 }
 
 static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, struct rpc_cred *cred)
@@ -1789,7 +1813,7 @@ static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, str
 	cache = nfs_access_search_rbtree(inode, cred);
 	if (cache == NULL)
 		goto out;
-	if (!nfs_have_delegation(inode, FMODE_READ) &&
+	if (!nfs_have_delegated_attributes(inode) &&
 	    !time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo))
 		goto out_stale;
 	res->jiffies = cache->jiffies;
@@ -1807,8 +1831,8 @@ out_stale:
 	nfs_access_free_entry(cache);
 	return -ENOENT;
 out_zap:
-	/* This will release the spinlock */
-	__nfs_access_zap_cache(inode);
+	spin_unlock(&inode->i_lock);
+	nfs_access_zap_cache(inode);
 	return -ENOENT;
 }
 
@@ -1863,9 +1887,11 @@ static void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *s
 	smp_mb__after_atomic_inc();
 
 	/* Add inode to global LRU list */
-	if (!test_and_set_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) {
+	if (!test_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) {
 		spin_lock(&nfs_access_lru_lock);
-		list_add_tail(&NFS_I(inode)->access_cache_inode_lru, &nfs_access_lru_list);
+		if (!test_and_set_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags))
+			list_add_tail(&NFS_I(inode)->access_cache_inode_lru,
+					&nfs_access_lru_list);
 		spin_unlock(&nfs_access_lru_lock);
 	}
 }
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index e1d415e97849..ad4cd31d6050 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -44,6 +44,7 @@
 #include <linux/file.h>
 #include <linux/pagemap.h>
 #include <linux/kref.h>
+#include <linux/slab.h>
 
 #include <linux/nfs_fs.h>
 #include <linux/nfs_page.h>
@@ -342,6 +343,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq,
 		data->res.fattr = &data->fattr;
 		data->res.eof = 0;
 		data->res.count = bytes;
+		nfs_fattr_init(&data->fattr);
 		msg.rpc_argp = &data->args;
 		msg.rpc_resp = &data->res;
 
@@ -575,6 +577,7 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
 	data->res.count = 0;
 	data->res.fattr = &data->fattr;
 	data->res.verf = &data->verf;
+	nfs_fattr_init(&data->fattr);
 
 	NFS_PROTO(data->inode)->commit_setup(data, &msg);
 
@@ -766,6 +769,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq,
 		data->res.fattr = &data->fattr;
 		data->res.count = bytes;
 		data->res.verf = &data->verf;
+		nfs_fattr_init(&data->fattr);
 
 		task_setup_data.task = &data->task;
 		task_setup_data.callback_data = data;
diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c
index 95e1ca765d47..76fd235d0024 100644
--- a/fs/nfs/dns_resolve.c
+++ b/fs/nfs/dns_resolve.c
@@ -9,6 +9,7 @@
 #include <linux/hash.h>
 #include <linux/string.h>
 #include <linux/kmod.h>
+#include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/socket.h>
 #include <linux/seq_file.h>
@@ -36,6 +37,19 @@ struct nfs_dns_ent {
 };
 
 
+static void nfs_dns_ent_update(struct cache_head *cnew,
+		struct cache_head *ckey)
+{
+	struct nfs_dns_ent *new;
+	struct nfs_dns_ent *key;
+
+	new = container_of(cnew, struct nfs_dns_ent, h);
+	key = container_of(ckey, struct nfs_dns_ent, h);
+
+	memcpy(&new->addr, &key->addr, key->addrlen);
+	new->addrlen = key->addrlen;
+}
+
 static void nfs_dns_ent_init(struct cache_head *cnew,
 		struct cache_head *ckey)
 {
@@ -49,8 +63,7 @@ static void nfs_dns_ent_init(struct cache_head *cnew,
 	new->hostname = kstrndup(key->hostname, key->namelen, GFP_KERNEL);
 	if (new->hostname) {
 		new->namelen = key->namelen;
-		memcpy(&new->addr, &key->addr, key->addrlen);
-		new->addrlen = key->addrlen;
+		nfs_dns_ent_update(cnew, ckey);
 	} else {
 		new->namelen = 0;
 		new->addrlen = 0;
@@ -234,7 +247,7 @@ static struct cache_detail nfs_dns_resolve = {
 	.cache_show = nfs_dns_show,
 	.match = nfs_dns_match,
 	.init = nfs_dns_ent_init,
-	.update = nfs_dns_ent_init,
+	.update = nfs_dns_ent_update,
 	.alloc = nfs_dns_ent_alloc,
 };
 
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 6b891328f332..f036153d9f50 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -24,9 +24,10 @@
 #include <linux/nfs_fs.h>
 #include <linux/nfs_mount.h>
 #include <linux/mm.h>
-#include <linux/slab.h>
 #include <linux/pagemap.h>
 #include <linux/aio.h>
+#include <linux/gfp.h>
+#include <linux/swap.h>
 
 #include <asm/uaccess.h>
 #include <asm/system.h>
@@ -53,7 +54,7 @@ static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe,
 static ssize_t nfs_file_write(struct kiocb *, const struct iovec *iov,
 				unsigned long nr_segs, loff_t pos);
 static int  nfs_file_flush(struct file *, fl_owner_t id);
-static int  nfs_file_fsync(struct file *, struct dentry *dentry, int datasync);
+static int  nfs_file_fsync(struct file *, int datasync);
 static int nfs_check_flags(int flags);
 static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl);
 static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl);
@@ -123,11 +124,11 @@ nfs_file_open(struct inode *inode, struct file *filp)
 			filp->f_path.dentry->d_parent->d_name.name,
 			filp->f_path.dentry->d_name.name);
 
+	nfs_inc_stats(inode, NFSIOS_VFSOPEN);
 	res = nfs_check_flags(filp->f_flags);
 	if (res)
 		return res;
 
-	nfs_inc_stats(inode, NFSIOS_VFSOPEN);
 	res = nfs_open(inode, filp);
 	return res;
 }
@@ -161,14 +162,17 @@ static int nfs_revalidate_file_size(struct inode *inode, struct file *filp)
 	struct nfs_server *server = NFS_SERVER(inode);
 	struct nfs_inode *nfsi = NFS_I(inode);
 
-	if (server->flags & NFS_MOUNT_NOAC)
-		goto force_reval;
+	if (nfs_have_delegated_attributes(inode))
+		goto out_noreval;
+
 	if (filp->f_flags & O_DIRECT)
 		goto force_reval;
-	if (nfsi->npages != 0)
-		return 0;
-	if (!(nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE) && !nfs_attribute_timeout(inode))
-		return 0;
+	if (nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE)
+		goto force_reval;
+	if (nfs_attribute_timeout(inode))
+		goto force_reval;
+out_noreval:
+	return 0;
 force_reval:
 	return __nfs_revalidate_inode(server, inode);
 }
@@ -237,9 +241,9 @@ nfs_file_flush(struct file *file, fl_owner_t id)
 			dentry->d_parent->d_name.name,
 			dentry->d_name.name);
 
+	nfs_inc_stats(inode, NFSIOS_VFSFLUSH);
 	if ((file->f_mode & FMODE_WRITE) == 0)
 		return 0;
-	nfs_inc_stats(inode, NFSIOS_VFSFLUSH);
 
 	/* Flush writes to the server and return any errors */
 	return nfs_do_fsync(ctx, inode);
@@ -262,9 +266,11 @@ nfs_file_read(struct kiocb *iocb, const struct iovec *iov,
 		(unsigned long) count, (unsigned long) pos);
 
 	result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping);
-	nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, count);
-	if (!result)
+	if (!result) {
 		result = generic_file_aio_read(iocb, iov, nr_segs, pos);
+		if (result > 0)
+			nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, result);
+	}
 	return result;
 }
 
@@ -282,8 +288,11 @@ nfs_file_splice_read(struct file *filp, loff_t *ppos,
 		(unsigned long) count, (unsigned long long) *ppos);
 
 	res = nfs_revalidate_mapping(inode, filp->f_mapping);
-	if (!res)
+	if (!res) {
 		res = generic_file_splice_read(filp, ppos, pipe, count, flags);
+		if (res > 0)
+			nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, res);
+	}
 	return res;
 }
 
@@ -314,8 +323,9 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma)
  * whether any write errors occurred for this process.
  */
 static int
-nfs_file_fsync(struct file *file, struct dentry *dentry, int datasync)
+nfs_file_fsync(struct file *file, int datasync)
 {
+	struct dentry *dentry = file->f_path.dentry;
 	struct nfs_open_context *ctx = nfs_file_open_context(file);
 	struct inode *inode = dentry->d_inode;
 
@@ -484,8 +494,19 @@ static void nfs_invalidate_page(struct page *page, unsigned long offset)
  */
 static int nfs_release_page(struct page *page, gfp_t gfp)
 {
+	struct address_space *mapping = page->mapping;
+
 	dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page);
 
+	/* Only do I/O if gfp is a superset of GFP_KERNEL */
+	if (mapping && (gfp & GFP_KERNEL) == GFP_KERNEL) {
+		int how = FLUSH_SYNC;
+
+		/* Don't let kswapd deadlock waiting for OOM RPC calls */
+		if (current_is_kswapd())
+			how = 0;
+		nfs_commit_inode(mapping->host, how);
+	}
 	/* If PagePrivate() is set, then the page is not freeable */
 	if (PagePrivate(page))
 		return 0;
@@ -594,6 +615,7 @@ static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov,
 {
 	struct dentry * dentry = iocb->ki_filp->f_path.dentry;
 	struct inode * inode = dentry->d_inode;
+	unsigned long written = 0;
 	ssize_t result;
 	size_t count = iov_length(iov, nr_segs);
 
@@ -620,14 +642,18 @@ static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov,
 	if (!count)
 		goto out;
 
-	nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, count);
 	result = generic_file_aio_write(iocb, iov, nr_segs, pos);
+	if (result > 0)
+		written = result;
+
 	/* Return error values for O_DSYNC and IS_SYNC() */
 	if (result >= 0 && nfs_need_sync_write(iocb->ki_filp, inode)) {
 		int err = nfs_do_fsync(nfs_file_open_context(iocb->ki_filp), inode);
 		if (err < 0)
 			result = err;
 	}
+	if (result > 0)
+		nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written);
 out:
 	return result;
 
@@ -642,6 +668,7 @@ static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe,
 {
 	struct dentry *dentry = filp->f_path.dentry;
 	struct inode *inode = dentry->d_inode;
+	unsigned long written = 0;
 	ssize_t ret;
 
 	dprintk("NFS splice_write(%s/%s, %lu@%llu)\n",
@@ -652,14 +679,17 @@ static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe,
 	 * The combination of splice and an O_APPEND destination is disallowed.
 	 */
 
-	nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, count);
-
 	ret = generic_file_splice_write(pipe, filp, ppos, count, flags);
+	if (ret > 0)
+		written = ret;
+
 	if (ret >= 0 && nfs_need_sync_write(filp, inode)) {
 		int err = nfs_do_fsync(nfs_file_open_context(filp), inode);
 		if (err < 0)
 			ret = err;
 	}
+	if (ret > 0)
+		nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written);
 	return ret;
 }
 
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index fa588006588d..ce153a6b3aec 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -17,6 +17,7 @@
 #include <linux/nfs_fs_sb.h>
 #include <linux/in6.h>
 #include <linux/seq_file.h>
+#include <linux/slab.h>
 
 #include "internal.h"
 #include "iostat.h"
@@ -354,12 +355,11 @@ void nfs_fscache_reset_inode_cookie(struct inode *inode)
  */
 int nfs_fscache_release_page(struct page *page, gfp_t gfp)
 {
-	struct nfs_inode *nfsi = NFS_I(page->mapping->host);
-	struct fscache_cookie *cookie = nfsi->fscache;
-
-	BUG_ON(!cookie);
-
 	if (PageFsCache(page)) {
+		struct nfs_inode *nfsi = NFS_I(page->mapping->host);
+		struct fscache_cookie *cookie = nfsi->fscache;
+
+		BUG_ON(!cookie);
 		dfprintk(FSCACHE, "NFS: fscache releasepage (0x%p/0x%p/0x%p)\n",
 			 cookie, page, nfsi);
 
@@ -467,7 +467,8 @@ int __nfs_readpages_from_fscache(struct nfs_open_context *ctx,
 				 struct list_head *pages,
 				 unsigned *nr_pages)
 {
-	int ret, npages = *nr_pages;
+	unsigned npages = *nr_pages;
+	int ret;
 
 	dfprintk(FSCACHE, "NFS: nfs_getpages_from_fscache (0x%p/%u/0x%p)\n",
 		 NFS_I(inode)->fscache, npages, inode);
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index b35d2a616066..a70e446e1605 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -78,159 +78,94 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh)
 {
 	struct nfs_server *server = NFS_SB(sb);
 	struct nfs_fsinfo fsinfo;
-	struct nfs_fattr fattr;
-	struct dentry *mntroot;
+	struct dentry *ret;
 	struct inode *inode;
 	int error;
 
 	/* get the actual root for this mount */
-	fsinfo.fattr = &fattr;
+	fsinfo.fattr = nfs_alloc_fattr();
+	if (fsinfo.fattr == NULL)
+		return ERR_PTR(-ENOMEM);
 
 	error = server->nfs_client->rpc_ops->getroot(server, mntfh, &fsinfo);
 	if (error < 0) {
 		dprintk("nfs_get_root: getattr error = %d\n", -error);
-		return ERR_PTR(error);
+		ret = ERR_PTR(error);
+		goto out;
 	}
 
 	inode = nfs_fhget(sb, mntfh, fsinfo.fattr);
 	if (IS_ERR(inode)) {
 		dprintk("nfs_get_root: get root inode failed\n");
-		return ERR_CAST(inode);
+		ret = ERR_CAST(inode);
+		goto out;
 	}
 
 	error = nfs_superblock_set_dummy_root(sb, inode);
-	if (error != 0)
-		return ERR_PTR(error);
+	if (error != 0) {
+		ret = ERR_PTR(error);
+		goto out;
+	}
 
 	/* root dentries normally start off anonymous and get spliced in later
 	 * if the dentry tree reaches them; however if the dentry already
 	 * exists, we'll pick it up at this point and use it as the root
 	 */
-	mntroot = d_obtain_alias(inode);
-	if (IS_ERR(mntroot)) {
+	ret = d_obtain_alias(inode);
+	if (IS_ERR(ret)) {
 		dprintk("nfs_get_root: get root dentry failed\n");
-		return mntroot;
+		goto out;
 	}
 
-	security_d_instantiate(mntroot, inode);
-
-	if (!mntroot->d_op)
-		mntroot->d_op = server->nfs_client->rpc_ops->dentry_ops;
+	security_d_instantiate(ret, inode);
 
-	return mntroot;
+	if (ret->d_op == NULL)
+		ret->d_op = server->nfs_client->rpc_ops->dentry_ops;
+out:
+	nfs_free_fattr(fsinfo.fattr);
+	return ret;
 }
 
 #ifdef CONFIG_NFS_V4
 
-/*
- * Do a simple pathwalk from the root FH of the server to the nominated target
- * of the mountpoint
- * - give error on symlinks
- * - give error on ".." occurring in the path
- * - follow traversals
- */
-int nfs4_path_walk(struct nfs_server *server,
-		   struct nfs_fh *mntfh,
-		   const char *path)
+int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh)
 {
 	struct nfs_fsinfo fsinfo;
-	struct nfs_fattr fattr;
-	struct nfs_fh lastfh;
-	struct qstr name;
-	int ret;
+	int ret = -ENOMEM;
 
-	dprintk("--> nfs4_path_walk(,,%s)\n", path);
+	dprintk("--> nfs4_get_rootfh()\n");
 
-	fsinfo.fattr = &fattr;
-	nfs_fattr_init(&fattr);
-
-	/* Eat leading slashes */
-	while (*path == '/')
-		path++;
+	fsinfo.fattr = nfs_alloc_fattr();
+	if (fsinfo.fattr == NULL)
+		goto out;
 
 	/* Start by getting the root filehandle from the server */
 	ret = server->nfs_client->rpc_ops->getroot(server, mntfh, &fsinfo);
 	if (ret < 0) {
-		dprintk("nfs4_get_root: getroot error = %d\n", -ret);
-		return ret;
+		dprintk("nfs4_get_rootfh: getroot error = %d\n", -ret);
+		goto out;
 	}
 
-	if (!S_ISDIR(fattr.mode)) {
-		printk(KERN_ERR "nfs4_get_root:"
+	if (!(fsinfo.fattr->valid & NFS_ATTR_FATTR_TYPE)
+			|| !S_ISDIR(fsinfo.fattr->mode)) {
+		printk(KERN_ERR "nfs4_get_rootfh:"
 		       " getroot encountered non-directory\n");
-		return -ENOTDIR;
+		ret = -ENOTDIR;
+		goto out;
 	}
 
-	/* FIXME: It is quite valid for the server to return a referral here */
-	if (fattr.valid & NFS_ATTR_FATTR_V4_REFERRAL) {
-		printk(KERN_ERR "nfs4_get_root:"
+	if (fsinfo.fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) {
+		printk(KERN_ERR "nfs4_get_rootfh:"
 		       " getroot obtained referral\n");
-		return -EREMOTE;
-	}
-
-next_component:
-	dprintk("Next: %s\n", path);
-
-	/* extract the next bit of the path */
-	if (!*path)
-		goto path_walk_complete;
-
-	name.name = path;
-	while (*path && *path != '/')
-		path++;
-	name.len = path - (const char *) name.name;
-
-	if (name.len > NFS4_MAXNAMLEN)
-		return -ENAMETOOLONG;
-
-eat_dot_dir:
-	while (*path == '/')
-		path++;
-
-	if (path[0] == '.' && (path[1] == '/' || !path[1])) {
-		path += 2;
-		goto eat_dot_dir;
-	}
-
-	/* FIXME: Why shouldn't the user be able to use ".." in the path? */
-	if (path[0] == '.' && path[1] == '.' && (path[2] == '/' || !path[2])
-	    ) {
-		printk(KERN_ERR "nfs4_get_root:"
-		       " Mount path contains reference to \"..\"\n");
-		return -EINVAL;
+		ret = -EREMOTE;
+		goto out;
 	}
 
-	/* lookup the next FH in the sequence */
-	memcpy(&lastfh, mntfh, sizeof(lastfh));
-
-	dprintk("LookupFH: %*.*s [%s]\n", name.len, name.len, name.name, path);
-
-	ret = server->nfs_client->rpc_ops->lookupfh(server, &lastfh, &name,
-						    mntfh, &fattr);
-	if (ret < 0) {
-		dprintk("nfs4_get_root: getroot error = %d\n", -ret);
-		return ret;
-	}
-
-	if (!S_ISDIR(fattr.mode)) {
-		printk(KERN_ERR "nfs4_get_root:"
-		       " lookupfh encountered non-directory\n");
-		return -ENOTDIR;
-	}
-
-	/* FIXME: Referrals are quite valid here too */
-	if (fattr.valid & NFS_ATTR_FATTR_V4_REFERRAL) {
-		printk(KERN_ERR "nfs4_get_root:"
-		       " lookupfh obtained referral\n");
-		return -EREMOTE;
-	}
-
-	goto next_component;
-
-path_walk_complete:
-	memcpy(&server->fsid, &fattr.fsid, sizeof(server->fsid));
-	dprintk("<-- nfs4_path_walk() = 0\n");
-	return 0;
+	memcpy(&server->fsid, &fsinfo.fattr->fsid, sizeof(server->fsid));
+out:
+	nfs_free_fattr(fsinfo.fattr);
+	dprintk("<-- nfs4_get_rootfh() = %d\n", ret);
+	return ret;
 }
 
 /*
@@ -239,8 +174,8 @@ path_walk_complete:
 struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh)
 {
 	struct nfs_server *server = NFS_SB(sb);
-	struct nfs_fattr fattr;
-	struct dentry *mntroot;
+	struct nfs_fattr *fattr = NULL;
+	struct dentry *ret;
 	struct inode *inode;
 	int error;
 
@@ -254,40 +189,50 @@ struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh)
 		return ERR_PTR(error);
 	}
 
+	fattr = nfs_alloc_fattr();
+	if (fattr == NULL)
+		return ERR_PTR(-ENOMEM);;
+
 	/* get the actual root for this mount */
-	error = server->nfs_client->rpc_ops->getattr(server, mntfh, &fattr);
+	error = server->nfs_client->rpc_ops->getattr(server, mntfh, fattr);
 	if (error < 0) {
 		dprintk("nfs_get_root: getattr error = %d\n", -error);
-		return ERR_PTR(error);
+		ret = ERR_PTR(error);
+		goto out;
 	}
 
-	inode = nfs_fhget(sb, mntfh, &fattr);
+	inode = nfs_fhget(sb, mntfh, fattr);
 	if (IS_ERR(inode)) {
 		dprintk("nfs_get_root: get root inode failed\n");
-		return ERR_CAST(inode);
+		ret = ERR_CAST(inode);
+		goto out;
 	}
 
 	error = nfs_superblock_set_dummy_root(sb, inode);
-	if (error != 0)
-		return ERR_PTR(error);
+	if (error != 0) {
+		ret = ERR_PTR(error);
+		goto out;
+	}
 
 	/* root dentries normally start off anonymous and get spliced in later
 	 * if the dentry tree reaches them; however if the dentry already
 	 * exists, we'll pick it up at this point and use it as the root
 	 */
-	mntroot = d_obtain_alias(inode);
-	if (IS_ERR(mntroot)) {
+	ret = d_obtain_alias(inode);
+	if (IS_ERR(ret)) {
 		dprintk("nfs_get_root: get root dentry failed\n");
-		return mntroot;
+		goto out;
 	}
 
-	security_d_instantiate(mntroot, inode);
+	security_d_instantiate(ret, inode);
 
-	if (!mntroot->d_op)
-		mntroot->d_op = server->nfs_client->rpc_ops->dentry_ops;
+	if (ret->d_op == NULL)
+		ret->d_op = server->nfs_client->rpc_ops->dentry_ops;
 
+out:
+	nfs_free_fattr(fattr);
 	dprintk("<-- nfs4_get_root()\n");
-	return mntroot;
+	return ret;
 }
 
 #endif /* CONFIG_NFS_V4 */
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index faa091865ad0..099b3518feea 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -36,6 +36,7 @@
 #include <linux/vfs.h>
 #include <linux/inet.h>
 #include <linux/nfs_xdr.h>
+#include <linux/slab.h>
 
 #include <asm/system.h>
 #include <asm/uaccess.h>
@@ -97,22 +98,6 @@ u64 nfs_compat_user_ino64(u64 fileid)
 	return ino;
 }
 
-int nfs_write_inode(struct inode *inode, int sync)
-{
-	int ret;
-
-	if (sync) {
-		ret = filemap_fdatawait(inode->i_mapping);
-		if (ret == 0)
-			ret = nfs_commit_inode(inode, FLUSH_SYNC);
-	} else
-		ret = nfs_commit_inode(inode, 0);
-	if (ret >= 0)
-		return 0;
-	__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
-	return ret;
-}
-
 void nfs_clear_inode(struct inode *inode)
 {
 	/*
@@ -130,16 +115,12 @@ void nfs_clear_inode(struct inode *inode)
  */
 int nfs_sync_mapping(struct address_space *mapping)
 {
-	int ret;
+	int ret = 0;
 
-	if (mapping->nrpages == 0)
-		return 0;
-	unmap_mapping_range(mapping, 0, 0, 0);
-	ret = filemap_write_and_wait(mapping);
-	if (ret != 0)
-		goto out;
-	ret = nfs_wb_all(mapping->host);
-out:
+	if (mapping->nrpages != 0) {
+		unmap_mapping_range(mapping, 0, 0, 0);
+		ret = nfs_wb_all(mapping->host);
+	}
 	return ret;
 }
 
@@ -412,8 +393,8 @@ int
 nfs_setattr(struct dentry *dentry, struct iattr *attr)
 {
 	struct inode *inode = dentry->d_inode;
-	struct nfs_fattr fattr;
-	int error;
+	struct nfs_fattr *fattr;
+	int error = -ENOMEM;
 
 	nfs_inc_stats(inode, NFSIOS_VFSSETATTR);
 
@@ -436,14 +417,20 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
 		filemap_write_and_wait(inode->i_mapping);
 		nfs_wb_all(inode);
 	}
+
+	fattr = nfs_alloc_fattr();
+	if (fattr == NULL)
+		goto out;
 	/*
 	 * Return any delegations if we're going to change ACLs
 	 */
 	if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0)
 		nfs_inode_return_delegation(inode);
-	error = NFS_PROTO(inode)->setattr(dentry, &fattr, attr);
+	error = NFS_PROTO(inode)->setattr(dentry, fattr, attr);
 	if (error == 0)
-		nfs_refresh_inode(inode, &fattr);
+		nfs_refresh_inode(inode, fattr);
+	nfs_free_fattr(fattr);
+out:
 	return error;
 }
 
@@ -511,17 +498,11 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
 	int need_atime = NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATIME;
 	int err;
 
-	/*
-	 * Flush out writes to the server in order to update c/mtime.
-	 *
-	 * Hold the i_mutex to suspend application writes temporarily;
-	 * this prevents long-running writing applications from blocking
-	 * nfs_wb_nocommit.
-	 */
+	/* Flush out writes to the server in order to update c/mtime.  */
 	if (S_ISREG(inode->i_mode)) {
-		mutex_lock(&inode->i_mutex);
-		nfs_wb_nocommit(inode);
-		mutex_unlock(&inode->i_mutex);
+		err = filemap_write_and_wait(inode->i_mapping);
+		if (err)
+			goto out;
 	}
 
 	/*
@@ -545,6 +526,7 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
 		generic_fillattr(inode, stat);
 		stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode));
 	}
+out:
 	return err;
 }
 
@@ -574,14 +556,14 @@ void nfs_close_context(struct nfs_open_context *ctx, int is_sync)
 	nfs_revalidate_inode(server, inode);
 }
 
-static struct nfs_open_context *alloc_nfs_open_context(struct vfsmount *mnt, struct dentry *dentry, struct rpc_cred *cred)
+static struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct rpc_cred *cred)
 {
 	struct nfs_open_context *ctx;
 
 	ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
 	if (ctx != NULL) {
-		ctx->path.dentry = dget(dentry);
-		ctx->path.mnt = mntget(mnt);
+		ctx->path = *path;
+		path_get(&ctx->path);
 		ctx->cred = get_rpccred(cred);
 		ctx->state = NULL;
 		ctx->lockowner = current->files;
@@ -620,11 +602,6 @@ void put_nfs_open_context(struct nfs_open_context *ctx)
 	__put_nfs_open_context(ctx, 0);
 }
 
-static void put_nfs_open_context_sync(struct nfs_open_context *ctx)
-{
-	__put_nfs_open_context(ctx, 1);
-}
-
 /*
  * Ensure that mmap has a recent RPC credential for use when writing out
  * shared pages
@@ -652,10 +629,10 @@ struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_c
 	list_for_each_entry(pos, &nfsi->open_files, list) {
 		if (cred != NULL && pos->cred != cred)
 			continue;
-		if ((pos->mode & mode) == mode) {
-			ctx = get_nfs_open_context(pos);
-			break;
-		}
+		if ((pos->mode & (FMODE_READ|FMODE_WRITE)) != mode)
+			continue;
+		ctx = get_nfs_open_context(pos);
+		break;
 	}
 	spin_unlock(&inode->i_lock);
 	return ctx;
@@ -671,7 +648,7 @@ static void nfs_file_clear_open_context(struct file *filp)
 		spin_lock(&inode->i_lock);
 		list_move_tail(&ctx->list, &NFS_I(inode)->open_files);
 		spin_unlock(&inode->i_lock);
-		put_nfs_open_context_sync(ctx);
+		__put_nfs_open_context(ctx, filp->f_flags & O_DIRECT ? 0 : 1);
 	}
 }
 
@@ -686,7 +663,7 @@ int nfs_open(struct inode *inode, struct file *filp)
 	cred = rpc_lookup_cred();
 	if (IS_ERR(cred))
 		return PTR_ERR(cred);
-	ctx = alloc_nfs_open_context(filp->f_path.mnt, filp->f_path.dentry, cred);
+	ctx = alloc_nfs_open_context(&filp->f_path, cred);
 	put_rpccred(cred);
 	if (ctx == NULL)
 		return -ENOMEM;
@@ -711,7 +688,7 @@ int
 __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
 {
 	int		 status = -ESTALE;
-	struct nfs_fattr fattr;
+	struct nfs_fattr *fattr = NULL;
 	struct nfs_inode *nfsi = NFS_I(inode);
 
 	dfprintk(PAGECACHE, "NFS: revalidating (%s/%Ld)\n",
@@ -722,8 +699,13 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
 	if (NFS_STALE(inode))
 		goto out;
 
+	status = -ENOMEM;
+	fattr = nfs_alloc_fattr();
+	if (fattr == NULL)
+		goto out;
+
 	nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE);
-	status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), &fattr);
+	status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), fattr);
 	if (status != 0) {
 		dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) getattr failed, error=%d\n",
 			 inode->i_sb->s_id,
@@ -736,7 +718,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
 		goto out;
 	}
 
-	status = nfs_refresh_inode(inode, &fattr);
+	status = nfs_refresh_inode(inode, fattr);
 	if (status) {
 		dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) refresh failed, error=%d\n",
 			 inode->i_sb->s_id,
@@ -752,6 +734,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
 		(long long)NFS_FILEID(inode));
 
  out:
+	nfs_free_fattr(fattr);
 	return status;
 }
 
@@ -759,11 +742,16 @@ int nfs_attribute_timeout(struct inode *inode)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
 
-	if (nfs_have_delegation(inode, FMODE_READ))
-		return 0;
 	return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo);
 }
 
+static int nfs_attribute_cache_expired(struct inode *inode)
+{
+	if (nfs_have_delegated_attributes(inode))
+		return 0;
+	return nfs_attribute_timeout(inode);
+}
+
 /**
  * nfs_revalidate_inode - Revalidate the inode attributes
  * @server - pointer to nfs_server struct
@@ -774,12 +762,12 @@ int nfs_attribute_timeout(struct inode *inode)
 int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
 {
 	if (!(NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATTR)
-			&& !nfs_attribute_timeout(inode))
+			&& !nfs_attribute_cache_expired(inode))
 		return NFS_STALE(inode) ? -ESTALE : 0;
 	return __nfs_revalidate_inode(server, inode);
 }
 
-static int nfs_invalidate_mapping_nolock(struct inode *inode, struct address_space *mapping)
+static int nfs_invalidate_mapping(struct inode *inode, struct address_space *mapping)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
 	
@@ -800,49 +788,10 @@ static int nfs_invalidate_mapping_nolock(struct inode *inode, struct address_spa
 	return 0;
 }
 
-static int nfs_invalidate_mapping(struct inode *inode, struct address_space *mapping)
-{
-	int ret = 0;
-
-	mutex_lock(&inode->i_mutex);
-	if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_DATA) {
-		ret = nfs_sync_mapping(mapping);
-		if (ret == 0)
-			ret = nfs_invalidate_mapping_nolock(inode, mapping);
-	}
-	mutex_unlock(&inode->i_mutex);
-	return ret;
-}
-
-/**
- * nfs_revalidate_mapping_nolock - Revalidate the pagecache
- * @inode - pointer to host inode
- * @mapping - pointer to mapping
- */
-int nfs_revalidate_mapping_nolock(struct inode *inode, struct address_space *mapping)
-{
-	struct nfs_inode *nfsi = NFS_I(inode);
-	int ret = 0;
-
-	if ((nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE)
-			|| nfs_attribute_timeout(inode) || NFS_STALE(inode)) {
-		ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
-		if (ret < 0)
-			goto out;
-	}
-	if (nfsi->cache_validity & NFS_INO_INVALID_DATA)
-		ret = nfs_invalidate_mapping_nolock(inode, mapping);
-out:
-	return ret;
-}
-
 /**
  * nfs_revalidate_mapping - Revalidate the pagecache
  * @inode - pointer to host inode
  * @mapping - pointer to mapping
- *
- * This version of the function will take the inode->i_mutex and attempt to
- * flush out all dirty data if it needs to invalidate the page cache.
  */
 int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
 {
@@ -850,7 +799,8 @@ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
 	int ret = 0;
 
 	if ((nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE)
-			|| nfs_attribute_timeout(inode) || NFS_STALE(inode)) {
+			|| nfs_attribute_cache_expired(inode)
+			|| NFS_STALE(inode)) {
 		ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
 		if (ret < 0)
 			goto out;
@@ -984,6 +934,26 @@ void nfs_fattr_init(struct nfs_fattr *fattr)
 	fattr->gencount = nfs_inc_attr_generation_counter();
 }
 
+struct nfs_fattr *nfs_alloc_fattr(void)
+{
+	struct nfs_fattr *fattr;
+
+	fattr = kmalloc(sizeof(*fattr), GFP_NOFS);
+	if (fattr != NULL)
+		nfs_fattr_init(fattr);
+	return fattr;
+}
+
+struct nfs_fh *nfs_alloc_fhandle(void)
+{
+	struct nfs_fh *fh;
+
+	fh = kmalloc(sizeof(struct nfs_fh), GFP_NOFS);
+	if (fh != NULL)
+		fh->size = 0;
+	return fh;
+}
+
 /**
  * nfs_inode_attrs_need_update - check if the inode attributes need updating
  * @inode - pointer to inode
@@ -1261,8 +1231,10 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 
 	if (fattr->valid & NFS_ATTR_FATTR_MODE) {
 		if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) {
+			umode_t newmode = inode->i_mode & S_IFMT;
+			newmode |= fattr->mode & S_IALLUGO;
+			inode->i_mode = newmode;
 			invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
-			inode->i_mode = fattr->mode;
 		}
 	} else if (server->caps & NFS_CAP_MODE)
 		invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
@@ -1418,6 +1390,7 @@ static void init_once(void *foo)
 	INIT_LIST_HEAD(&nfsi->access_cache_inode_lru);
 	INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC);
 	nfsi->npages = 0;
+	nfsi->ncommit = 0;
 	atomic_set(&nfsi->silly_count, 1);
 	INIT_HLIST_HEAD(&nfsi->silly_list);
 	init_waitqueue_head(&nfsi->waitqueue);
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 29e464d23b32..e70f44b9b3f4 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -205,13 +205,14 @@ extern struct rpc_procinfo nfs4_procedures[];
 void nfs_close_context(struct nfs_open_context *ctx, int is_sync);
 
 /* dir.c */
-extern int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask);
+extern int nfs_access_cache_shrinker(struct shrinker *shrink,
+					int nr_to_scan, gfp_t gfp_mask);
 
 /* inode.c */
 extern struct workqueue_struct *nfsiod_workqueue;
 extern struct inode *nfs_alloc_inode(struct super_block *sb);
 extern void nfs_destroy_inode(struct inode *);
-extern int nfs_write_inode(struct inode *,int);
+extern int nfs_write_inode(struct inode *, struct writeback_control *);
 extern void nfs_clear_inode(struct inode *);
 #ifdef CONFIG_NFS_V4
 extern void nfs4_clear_inode(struct inode *);
@@ -244,9 +245,7 @@ extern struct dentry *nfs_get_root(struct super_block *, struct nfs_fh *);
 #ifdef CONFIG_NFS_V4
 extern struct dentry *nfs4_get_root(struct super_block *, struct nfs_fh *);
 
-extern int nfs4_path_walk(struct nfs_server *server,
-			  struct nfs_fh *mntfh,
-			  const char *path);
+extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh);
 #endif
 
 /* read.c */
diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h
index 46d779abafd3..c5832487c456 100644
--- a/fs/nfs/iostat.h
+++ b/fs/nfs/iostat.h
@@ -36,14 +36,14 @@ static inline void nfs_inc_stats(const struct inode *inode,
 
 static inline void nfs_add_server_stats(const struct nfs_server *server,
 					enum nfs_stat_bytecounters stat,
-					unsigned long addend)
+					long addend)
 {
 	this_cpu_add(server->io_stats->bytes[stat], addend);
 }
 
 static inline void nfs_add_stats(const struct inode *inode,
 				 enum nfs_stat_bytecounters stat,
-				 unsigned long addend)
+				 long addend)
 {
 	nfs_add_server_stats(NFS_SERVER(inode), stat, addend);
 }
@@ -51,18 +51,18 @@ static inline void nfs_add_stats(const struct inode *inode,
 #ifdef CONFIG_NFS_FSCACHE
 static inline void nfs_add_fscache_stats(struct inode *inode,
 					 enum nfs_stat_fscachecounters stat,
-					 unsigned long addend)
+					 long addend)
 {
 	this_cpu_add(NFS_SERVER(inode)->io_stats->fscache[stat], addend);
 }
 #endif
 
-static inline struct nfs_iostats *nfs_alloc_iostats(void)
+static inline struct nfs_iostats __percpu *nfs_alloc_iostats(void)
 {
 	return alloc_percpu(struct nfs_iostats);
 }
 
-static inline void nfs_free_iostats(struct nfs_iostats *stats)
+static inline void nfs_free_iostats(struct nfs_iostats __percpu *stats)
 {
 	if (stats != NULL)
 		free_percpu(stats);
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index 0adefc40cc89..59047f8d7d72 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -120,7 +120,7 @@ static struct {
 	{ .status = MNT3ERR_INVAL,		.errno = -EINVAL,	},
 	{ .status = MNT3ERR_NAMETOOLONG,	.errno = -ENAMETOOLONG,	},
 	{ .status = MNT3ERR_NOTSUPP,		.errno = -ENOTSUPP,	},
-	{ .status = MNT3ERR_SERVERFAULT,	.errno = -ESERVERFAULT,	},
+	{ .status = MNT3ERR_SERVERFAULT,	.errno = -EREMOTEIO,	},
 };
 
 struct mountres {
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 40c766782891..db6aa3673cf3 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -8,6 +8,7 @@
  */
 
 #include <linux/dcache.h>
+#include <linux/gfp.h>
 #include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/nfs_fs.h>
@@ -104,8 +105,8 @@ static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
 	struct vfsmount *mnt;
 	struct nfs_server *server = NFS_SERVER(dentry->d_inode);
 	struct dentry *parent;
-	struct nfs_fh fh;
-	struct nfs_fattr fattr;
+	struct nfs_fh *fh = NULL;
+	struct nfs_fattr *fattr = NULL;
 	int err;
 
 	dprintk("--> nfs_follow_mountpoint()\n");
@@ -114,6 +115,12 @@ static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
 	if (IS_ROOT(dentry))
 		goto out_err;
 
+	err = -ENOMEM;
+	fh = nfs_alloc_fhandle();
+	fattr = nfs_alloc_fattr();
+	if (fh == NULL || fattr == NULL)
+		goto out_err;
+
 	dprintk("%s: enter\n", __func__);
 	dput(nd->path.dentry);
 	nd->path.dentry = dget(dentry);
@@ -122,16 +129,16 @@ static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
 	parent = dget_parent(nd->path.dentry);
 	err = server->nfs_client->rpc_ops->lookup(parent->d_inode,
 						  &nd->path.dentry->d_name,
-						  &fh, &fattr);
+						  fh, fattr);
 	dput(parent);
 	if (err != 0)
 		goto out_err;
 
-	if (fattr.valid & NFS_ATTR_FATTR_V4_REFERRAL)
+	if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL)
 		mnt = nfs_do_refmount(nd->path.mnt, nd->path.dentry);
 	else
-		mnt = nfs_do_submount(nd->path.mnt, nd->path.dentry, &fh,
-				      &fattr);
+		mnt = nfs_do_submount(nd->path.mnt, nd->path.dentry, fh,
+				      fattr);
 	err = PTR_ERR(mnt);
 	if (IS_ERR(mnt))
 		goto out_err;
@@ -150,6 +157,8 @@ static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
 	nd->path.dentry = dget(mnt->mnt_root);
 	schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout);
 out:
+	nfs_free_fattr(fattr);
+	nfs_free_fhandle(fh);
 	dprintk("%s: done, returned %d\n", __func__, err);
 
 	dprintk("<-- nfs_follow_mountpoint() = %d\n", err);
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 5e078b222b4e..81cf14257916 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -12,7 +12,6 @@
 #include <linux/param.h>
 #include <linux/time.h>
 #include <linux/mm.h>
-#include <linux/slab.h>
 #include <linux/errno.h>
 #include <linux/string.h>
 #include <linux/in.h>
@@ -699,7 +698,7 @@ static struct {
 	{ NFSERR_BAD_COOKIE,	-EBADCOOKIE	},
 	{ NFSERR_NOTSUPP,	-ENOTSUPP	},
 	{ NFSERR_TOOSMALL,	-ETOOSMALL	},
-	{ NFSERR_SERVERFAULT,	-ESERVERFAULT	},
+	{ NFSERR_SERVERFAULT,	-EREMOTEIO	},
 	{ NFSERR_BADTYPE,	-EBADTYPE	},
 	{ NFSERR_JUKEBOX,	-EJUKEBOX	},
 	{ -1,			-EIO		}
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index bac60515a4b3..9f88c5f4c7e2 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -1,4 +1,5 @@
 #include <linux/fs.h>
+#include <linux/gfp.h>
 #include <linux/nfs.h>
 #include <linux/nfs3.h>
 #include <linux/nfs_fs.h>
@@ -184,7 +185,6 @@ static void nfs3_cache_acls(struct inode *inode, struct posix_acl *acl,
 struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
 {
 	struct nfs_server *server = NFS_SERVER(inode);
-	struct nfs_fattr fattr;
 	struct page *pages[NFSACL_MAXPAGES] = { };
 	struct nfs3_getaclargs args = {
 		.fh = NFS_FH(inode),
@@ -192,7 +192,7 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
 		.pages = pages,
 	};
 	struct nfs3_getaclres res = {
-		.fattr =	&fattr,
+		0
 	};
 	struct rpc_message msg = {
 		.rpc_argp	= &args,
@@ -227,7 +227,10 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
 
 	dprintk("NFS call getacl\n");
 	msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_GETACL];
-	nfs_fattr_init(&fattr);
+	res.fattr = nfs_alloc_fattr();
+	if (res.fattr == NULL)
+		return ERR_PTR(-ENOMEM);
+
 	status = rpc_call_sync(server->client_acl, &msg, 0);
 	dprintk("NFS reply getacl: %d\n", status);
 
@@ -237,7 +240,7 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
 
 	switch (status) {
 		case 0:
-			status = nfs_refresh_inode(inode, &fattr);
+			status = nfs_refresh_inode(inode, res.fattr);
 			break;
 		case -EPFNOSUPPORT:
 		case -EPROTONOSUPPORT:
@@ -277,6 +280,7 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
 getout:
 	posix_acl_release(res.acl_access);
 	posix_acl_release(res.acl_default);
+	nfs_free_fattr(res.fattr);
 
 	if (status != 0) {
 		posix_acl_release(acl);
@@ -289,7 +293,7 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
 		  struct posix_acl *dfacl)
 {
 	struct nfs_server *server = NFS_SERVER(inode);
-	struct nfs_fattr fattr;
+	struct nfs_fattr *fattr;
 	struct page *pages[NFSACL_MAXPAGES];
 	struct nfs3_setaclargs args = {
 		.inode = inode,
@@ -334,8 +338,13 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
 	}
 
 	dprintk("NFS call setacl\n");
+	status = -ENOMEM;
+	fattr = nfs_alloc_fattr();
+	if (fattr == NULL)
+		goto out_freepages;
+
 	msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_SETACL];
-	nfs_fattr_init(&fattr);
+	msg.rpc_resp = fattr;
 	status = rpc_call_sync(server->client_acl, &msg, 0);
 	nfs_access_zap_cache(inode);
 	nfs_zap_acl_cache(inode);
@@ -343,7 +352,7 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
 
 	switch (status) {
 		case 0:
-			status = nfs_refresh_inode(inode, &fattr);
+			status = nfs_refresh_inode(inode, fattr);
 			nfs3_cache_acls(inode, acl, dfacl);
 			break;
 		case -EPFNOSUPPORT:
@@ -354,6 +363,7 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
 		case -ENOTSUPP:
 			status = -EOPNOTSUPP;
 	}
+	nfs_free_fattr(fattr);
 out_freepages:
 	while (args.npages != 0) {
 		args.npages--;
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 3f8881d1a050..fabb4f2849a1 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -10,6 +10,7 @@
 #include <linux/errno.h>
 #include <linux/string.h>
 #include <linux/sunrpc/clnt.h>
+#include <linux/slab.h>
 #include <linux/nfs.h>
 #include <linux/nfs3.h>
 #include <linux/nfs_fs.h>
@@ -22,14 +23,14 @@
 
 #define NFSDBG_FACILITY		NFSDBG_PROC
 
-/* A wrapper to handle the EJUKEBOX error message */
+/* A wrapper to handle the EJUKEBOX and EKEYEXPIRED error messages */
 static int
 nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags)
 {
 	int res;
 	do {
 		res = rpc_call_sync(clnt, msg, flags);
-		if (res != -EJUKEBOX)
+		if (res != -EJUKEBOX && res != -EKEYEXPIRED)
 			break;
 		schedule_timeout_killable(NFS_JUKEBOX_RETRY_TIME);
 		res = -ERESTARTSYS;
@@ -42,9 +43,10 @@ nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags)
 static int
 nfs3_async_handle_jukebox(struct rpc_task *task, struct inode *inode)
 {
-	if (task->tk_status != -EJUKEBOX)
+	if (task->tk_status != -EJUKEBOX && task->tk_status != -EKEYEXPIRED)
 		return 0;
-	nfs_inc_stats(inode, NFSIOS_DELAY);
+	if (task->tk_status == -EJUKEBOX)
+		nfs_inc_stats(inode, NFSIOS_DELAY);
 	task->tk_status = 0;
 	rpc_restart_call(task);
 	rpc_delay(task, NFS_JUKEBOX_RETRY_TIME);
@@ -142,14 +144,12 @@ static int
 nfs3_proc_lookup(struct inode *dir, struct qstr *name,
 		 struct nfs_fh *fhandle, struct nfs_fattr *fattr)
 {
-	struct nfs_fattr	dir_attr;
 	struct nfs3_diropargs	arg = {
 		.fh		= NFS_FH(dir),
 		.name		= name->name,
 		.len		= name->len
 	};
 	struct nfs3_diropres	res = {
-		.dir_attr	= &dir_attr,
 		.fh		= fhandle,
 		.fattr		= fattr
 	};
@@ -161,29 +161,30 @@ nfs3_proc_lookup(struct inode *dir, struct qstr *name,
 	int			status;
 
 	dprintk("NFS call  lookup %s\n", name->name);
-	nfs_fattr_init(&dir_attr);
+	res.dir_attr = nfs_alloc_fattr();
+	if (res.dir_attr == NULL)
+		return -ENOMEM;
+
 	nfs_fattr_init(fattr);
 	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
-	nfs_refresh_inode(dir, &dir_attr);
+	nfs_refresh_inode(dir, res.dir_attr);
 	if (status >= 0 && !(fattr->valid & NFS_ATTR_FATTR)) {
 		msg.rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR];
 		msg.rpc_argp = fhandle;
 		msg.rpc_resp = fattr;
 		status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
 	}
+	nfs_free_fattr(res.dir_attr);
 	dprintk("NFS reply lookup: %d\n", status);
 	return status;
 }
 
 static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry)
 {
-	struct nfs_fattr	fattr;
 	struct nfs3_accessargs	arg = {
 		.fh		= NFS_FH(inode),
 	};
-	struct nfs3_accessres	res = {
-		.fattr		= &fattr,
-	};
+	struct nfs3_accessres	res;
 	struct rpc_message msg = {
 		.rpc_proc	= &nfs3_procedures[NFS3PROC_ACCESS],
 		.rpc_argp	= &arg,
@@ -191,7 +192,7 @@ static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry)
 		.rpc_cred	= entry->cred,
 	};
 	int mode = entry->mask;
-	int status;
+	int status = -ENOMEM;
 
 	dprintk("NFS call  access\n");
 
@@ -208,9 +209,13 @@ static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry)
 		if (mode & MAY_EXEC)
 			arg.access |= NFS3_ACCESS_EXECUTE;
 	}
-	nfs_fattr_init(&fattr);
+
+	res.fattr = nfs_alloc_fattr();
+	if (res.fattr == NULL)
+		goto out;
+
 	status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
-	nfs_refresh_inode(inode, &fattr);
+	nfs_refresh_inode(inode, res.fattr);
 	if (status == 0) {
 		entry->mask = 0;
 		if (res.access & NFS3_ACCESS_READ)
@@ -220,6 +225,8 @@ static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry)
 		if (res.access & (NFS3_ACCESS_LOOKUP|NFS3_ACCESS_EXECUTE))
 			entry->mask |= MAY_EXEC;
 	}
+	nfs_free_fattr(res.fattr);
+out:
 	dprintk("NFS reply access: %d\n", status);
 	return status;
 }
@@ -227,7 +234,7 @@ static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry)
 static int nfs3_proc_readlink(struct inode *inode, struct page *page,
 		unsigned int pgbase, unsigned int pglen)
 {
-	struct nfs_fattr	fattr;
+	struct nfs_fattr	*fattr;
 	struct nfs3_readlinkargs args = {
 		.fh		= NFS_FH(inode),
 		.pgbase		= pgbase,
@@ -237,14 +244,19 @@ static int nfs3_proc_readlink(struct inode *inode, struct page *page,
 	struct rpc_message msg = {
 		.rpc_proc	= &nfs3_procedures[NFS3PROC_READLINK],
 		.rpc_argp	= &args,
-		.rpc_resp	= &fattr,
 	};
-	int			status;
+	int status = -ENOMEM;
 
 	dprintk("NFS call  readlink\n");
-	nfs_fattr_init(&fattr);
+	fattr = nfs_alloc_fattr();
+	if (fattr == NULL)
+		goto out;
+	msg.rpc_resp = fattr;
+
 	status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
-	nfs_refresh_inode(inode, &fattr);
+	nfs_refresh_inode(inode, fattr);
+	nfs_free_fattr(fattr);
+out:
 	dprintk("NFS reply readlink: %d\n", status);
 	return status;
 }
@@ -394,12 +406,17 @@ nfs3_proc_remove(struct inode *dir, struct qstr *name)
 		.rpc_argp = &arg,
 		.rpc_resp = &res,
 	};
-	int			status;
+	int status = -ENOMEM;
 
 	dprintk("NFS call  remove %s\n", name->name);
-	nfs_fattr_init(&res.dir_attr);
+	res.dir_attr = nfs_alloc_fattr();
+	if (res.dir_attr == NULL)
+		goto out;
+
 	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
-	nfs_post_op_update_inode(dir, &res.dir_attr);
+	nfs_post_op_update_inode(dir, res.dir_attr);
+	nfs_free_fattr(res.dir_attr);
+out:
 	dprintk("NFS reply remove: %d\n", status);
 	return status;
 }
@@ -417,7 +434,7 @@ nfs3_proc_unlink_done(struct rpc_task *task, struct inode *dir)
 	if (nfs3_async_handle_jukebox(task, dir))
 		return 0;
 	res = task->tk_msg.rpc_resp;
-	nfs_post_op_update_inode(dir, &res->dir_attr);
+	nfs_post_op_update_inode(dir, res->dir_attr);
 	return 1;
 }
 
@@ -425,7 +442,6 @@ static int
 nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name,
 		 struct inode *new_dir, struct qstr *new_name)
 {
-	struct nfs_fattr	old_dir_attr, new_dir_attr;
 	struct nfs3_renameargs	arg = {
 		.fromfh		= NFS_FH(old_dir),
 		.fromname	= old_name->name,
@@ -434,23 +450,27 @@ nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name,
 		.toname		= new_name->name,
 		.tolen		= new_name->len
 	};
-	struct nfs3_renameres	res = {
-		.fromattr	= &old_dir_attr,
-		.toattr		= &new_dir_attr
-	};
+	struct nfs3_renameres res;
 	struct rpc_message msg = {
 		.rpc_proc	= &nfs3_procedures[NFS3PROC_RENAME],
 		.rpc_argp	= &arg,
 		.rpc_resp	= &res,
 	};
-	int			status;
+	int status = -ENOMEM;
 
 	dprintk("NFS call  rename %s -> %s\n", old_name->name, new_name->name);
-	nfs_fattr_init(&old_dir_attr);
-	nfs_fattr_init(&new_dir_attr);
+
+	res.fromattr = nfs_alloc_fattr();
+	res.toattr = nfs_alloc_fattr();
+	if (res.fromattr == NULL || res.toattr == NULL)
+		goto out;
+
 	status = rpc_call_sync(NFS_CLIENT(old_dir), &msg, 0);
-	nfs_post_op_update_inode(old_dir, &old_dir_attr);
-	nfs_post_op_update_inode(new_dir, &new_dir_attr);
+	nfs_post_op_update_inode(old_dir, res.fromattr);
+	nfs_post_op_update_inode(new_dir, res.toattr);
+out:
+	nfs_free_fattr(res.toattr);
+	nfs_free_fattr(res.fromattr);
 	dprintk("NFS reply rename: %d\n", status);
 	return status;
 }
@@ -458,30 +478,32 @@ nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name,
 static int
 nfs3_proc_link(struct inode *inode, struct inode *dir, struct qstr *name)
 {
-	struct nfs_fattr	dir_attr, fattr;
 	struct nfs3_linkargs	arg = {
 		.fromfh		= NFS_FH(inode),
 		.tofh		= NFS_FH(dir),
 		.toname		= name->name,
 		.tolen		= name->len
 	};
-	struct nfs3_linkres	res = {
-		.dir_attr	= &dir_attr,
-		.fattr		= &fattr
-	};
+	struct nfs3_linkres	res;
 	struct rpc_message msg = {
 		.rpc_proc	= &nfs3_procedures[NFS3PROC_LINK],
 		.rpc_argp	= &arg,
 		.rpc_resp	= &res,
 	};
-	int			status;
+	int status = -ENOMEM;
 
 	dprintk("NFS call  link %s\n", name->name);
-	nfs_fattr_init(&dir_attr);
-	nfs_fattr_init(&fattr);
+	res.fattr = nfs_alloc_fattr();
+	res.dir_attr = nfs_alloc_fattr();
+	if (res.fattr == NULL || res.dir_attr == NULL)
+		goto out;
+
 	status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
-	nfs_post_op_update_inode(dir, &dir_attr);
-	nfs_post_op_update_inode(inode, &fattr);
+	nfs_post_op_update_inode(dir, res.dir_attr);
+	nfs_post_op_update_inode(inode, res.fattr);
+out:
+	nfs_free_fattr(res.dir_attr);
+	nfs_free_fattr(res.fattr);
 	dprintk("NFS reply link: %d\n", status);
 	return status;
 }
@@ -552,7 +574,7 @@ out:
 static int
 nfs3_proc_rmdir(struct inode *dir, struct qstr *name)
 {
-	struct nfs_fattr	dir_attr;
+	struct nfs_fattr	*dir_attr;
 	struct nfs3_diropargs	arg = {
 		.fh		= NFS_FH(dir),
 		.name		= name->name,
@@ -561,14 +583,19 @@ nfs3_proc_rmdir(struct inode *dir, struct qstr *name)
 	struct rpc_message msg = {
 		.rpc_proc	= &nfs3_procedures[NFS3PROC_RMDIR],
 		.rpc_argp	= &arg,
-		.rpc_resp	= &dir_attr,
 	};
-	int			status;
+	int status = -ENOMEM;
 
 	dprintk("NFS call  rmdir %s\n", name->name);
-	nfs_fattr_init(&dir_attr);
+	dir_attr = nfs_alloc_fattr();
+	if (dir_attr == NULL)
+		goto out;
+
+	msg.rpc_resp = dir_attr;
 	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
-	nfs_post_op_update_inode(dir, &dir_attr);
+	nfs_post_op_update_inode(dir, dir_attr);
+	nfs_free_fattr(dir_attr);
+out:
 	dprintk("NFS reply rmdir: %d\n", status);
 	return status;
 }
@@ -587,7 +614,6 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
 		  u64 cookie, struct page *page, unsigned int count, int plus)
 {
 	struct inode		*dir = dentry->d_inode;
-	struct nfs_fattr	dir_attr;
 	__be32			*verf = NFS_COOKIEVERF(dir);
 	struct nfs3_readdirargs	arg = {
 		.fh		= NFS_FH(dir),
@@ -598,7 +624,6 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
 		.pages		= &page
 	};
 	struct nfs3_readdirres	res = {
-		.dir_attr	= &dir_attr,
 		.verf		= verf,
 		.plus		= plus
 	};
@@ -608,7 +633,7 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
 		.rpc_resp	= &res,
 		.rpc_cred	= cred
 	};
-	int			status;
+	int status = -ENOMEM;
 
 	if (plus)
 		msg.rpc_proc = &nfs3_procedures[NFS3PROC_READDIRPLUS];
@@ -616,12 +641,17 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
 	dprintk("NFS call  readdir%s %d\n",
 			plus? "plus" : "", (unsigned int) cookie);
 
-	nfs_fattr_init(&dir_attr);
+	res.dir_attr = nfs_alloc_fattr();
+	if (res.dir_attr == NULL)
+		goto out;
+
 	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
 
 	nfs_invalidate_atime(dir);
+	nfs_refresh_inode(dir, res.dir_attr);
 
-	nfs_refresh_inode(dir, &dir_attr);
+	nfs_free_fattr(res.dir_attr);
+out:
 	dprintk("NFS reply readdir: %d\n", status);
 	return status;
 }
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 5fe5492fbd29..75dcfc7da365 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -9,7 +9,6 @@
 #include <linux/param.h>
 #include <linux/time.h>
 #include <linux/mm.h>
-#include <linux/slab.h>
 #include <linux/errno.h>
 #include <linux/string.h>
 #include <linux/in.h>
@@ -763,7 +762,7 @@ nfs3_xdr_wccstat(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
 static int
 nfs3_xdr_removeres(struct rpc_rqst *req, __be32 *p, struct nfs_removeres *res)
 {
-	return nfs3_xdr_wccstat(req, p, &res->dir_attr);
+	return nfs3_xdr_wccstat(req, p, res->dir_attr);
 }
 
 /*
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 865265bdca03..c538c6106e16 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -46,6 +46,7 @@ enum nfs4_client_state {
 	NFS4CLNT_DELEGRETURN,
 	NFS4CLNT_SESSION_RESET,
 	NFS4CLNT_SESSION_DRAINING,
+	NFS4CLNT_RECALL_SLOT,
 };
 
 /*
@@ -146,6 +147,7 @@ enum {
 	NFS_O_RDWR_STATE,		/* OPEN stateid has read/write state */
 	NFS_STATE_RECLAIM_REBOOT,	/* OPEN stateid server rebooted */
 	NFS_STATE_RECLAIM_NOGRACE,	/* OPEN stateid needs to recover state */
+	NFS_STATE_POSIX_LOCKS,		/* Posix locks are supported */
 };
 
 struct nfs4_state {
@@ -204,14 +206,14 @@ extern ssize_t nfs4_listxattr(struct dentry *, char *, size_t);
 
 
 /* nfs4proc.c */
-extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *);
-extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct rpc_cred *);
+extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *);
+extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, struct rpc_cred *);
 extern int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred);
 extern int nfs4_proc_async_renew(struct nfs_client *, struct rpc_cred *);
 extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *);
 extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *);
 extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *);
-extern int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait);
+extern int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait);
 extern struct dentry *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *);
 extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *);
 extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle);
@@ -277,12 +279,14 @@ extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t);
 extern void nfs4_schedule_state_recovery(struct nfs_client *);
 extern void nfs4_schedule_state_manager(struct nfs_client *);
 extern int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state);
+extern int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state);
 extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags);
+extern void nfs41_handle_recall_slot(struct nfs_client *clp);
 extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp);
 extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl);
 extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t);
 
-extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter);
+extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask);
 extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task);
 extern void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid);
 extern void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid);
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index fa3408f20112..3c2a1724fbd2 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -11,6 +11,7 @@
 #include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/nfs_fs.h>
+#include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/sunrpc/clnt.h>
 #include <linux/vfs.h>
@@ -114,6 +115,7 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
 				     char *page, char *page2,
 				     const struct nfs4_fs_location *location)
 {
+	const size_t addr_bufsize = sizeof(struct sockaddr_storage);
 	struct vfsmount *mnt = ERR_PTR(-ENOENT);
 	char *mnt_path;
 	unsigned int maxbuflen;
@@ -125,9 +127,12 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
 	mountdata->mnt_path = mnt_path;
 	maxbuflen = mnt_path - 1 - page2;
 
+	mountdata->addr = kmalloc(addr_bufsize, GFP_KERNEL);
+	if (mountdata->addr == NULL)
+		return ERR_PTR(-ENOMEM);
+
 	for (s = 0; s < location->nservers; s++) {
 		const struct nfs4_string *buf = &location->servers[s];
-		struct sockaddr_storage addr;
 
 		if (buf->len <= 0 || buf->len >= maxbuflen)
 			continue;
@@ -136,11 +141,10 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
 			continue;
 
 		mountdata->addrlen = nfs_parse_server_name(buf->data, buf->len,
-				(struct sockaddr *)&addr, sizeof(addr));
+				mountdata->addr, addr_bufsize);
 		if (mountdata->addrlen == 0)
 			continue;
 
-		mountdata->addr = (struct sockaddr *)&addr;
 		rpc_set_port(mountdata->addr, NFS_PORT);
 
 		memcpy(page2, buf->data, buf->len);
@@ -155,6 +159,7 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
 		if (!IS_ERR(mnt))
 			break;
 	}
+	kfree(mountdata->addr);
 	return mnt;
 }
 
@@ -220,8 +225,8 @@ out:
 
 /*
  * nfs_do_refmount - handle crossing a referral on server
+ * @mnt_parent - mountpoint of referral
  * @dentry - dentry of referral
- * @nd - nameidata info
  *
  */
 struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry)
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 198d51d17c13..70015dd60a98 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -39,6 +39,7 @@
 #include <linux/delay.h>
 #include <linux/errno.h>
 #include <linux/string.h>
+#include <linux/slab.h>
 #include <linux/sunrpc/clnt.h>
 #include <linux/nfs.h>
 #include <linux/nfs4.h>
@@ -69,6 +70,9 @@ static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinf
 static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *);
 static int _nfs4_proc_lookup(struct inode *dir, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr);
 static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr);
+static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
+			    struct nfs_fattr *fattr, struct iattr *sattr,
+			    struct nfs4_state *state);
 
 /* Prevent leaks of NFSv4 errors into userland */
 static int nfs4_map_errors(int err)
@@ -249,19 +253,15 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode,
 			if (state == NULL)
 				break;
 			nfs4_state_mark_reclaim_nograce(clp, state);
-		case -NFS4ERR_STALE_CLIENTID:
+			goto do_state_recovery;
 		case -NFS4ERR_STALE_STATEID:
-		case -NFS4ERR_EXPIRED:
-			nfs4_schedule_state_recovery(clp);
-			ret = nfs4_wait_clnt_recover(clp);
-			if (ret == 0)
-				exception->retry = 1;
-#if !defined(CONFIG_NFS_V4_1)
-			break;
-#else /* !defined(CONFIG_NFS_V4_1) */
-			if (!nfs4_has_session(server->nfs_client))
+			if (state == NULL)
 				break;
-			/* FALLTHROUGH */
+			nfs4_state_mark_reclaim_reboot(clp, state);
+		case -NFS4ERR_STALE_CLIENTID:
+		case -NFS4ERR_EXPIRED:
+			goto do_state_recovery;
+#if defined(CONFIG_NFS_V4_1)
 		case -NFS4ERR_BADSESSION:
 		case -NFS4ERR_BADSLOT:
 		case -NFS4ERR_BAD_HIGH_SLOT:
@@ -274,7 +274,7 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode,
 			nfs4_schedule_state_recovery(clp);
 			exception->retry = 1;
 			break;
-#endif /* !defined(CONFIG_NFS_V4_1) */
+#endif /* defined(CONFIG_NFS_V4_1) */
 		case -NFS4ERR_FILE_OPEN:
 			if (exception->timeout > HZ) {
 				/* We have retried a decent amount, time to
@@ -285,6 +285,7 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode,
 			}
 		case -NFS4ERR_GRACE:
 		case -NFS4ERR_DELAY:
+		case -EKEYEXPIRED:
 			ret = nfs4_delay(server->client, &exception->timeout);
 			if (ret != 0)
 				break;
@@ -293,6 +294,12 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode,
 	}
 	/* We failed to handle the error */
 	return nfs4_map_errors(ret);
+do_state_recovery:
+	nfs4_schedule_state_recovery(clp);
+	ret = nfs4_wait_clnt_recover(clp);
+	if (ret == 0)
+		exception->retry = 1;
+	return ret;
 }
 
 
@@ -416,7 +423,8 @@ static void nfs41_sequence_done(struct nfs_client *clp,
 			clp->cl_last_renewal = timestamp;
 		spin_unlock(&clp->cl_lock);
 		/* Check sequence flags */
-		nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags);
+		if (atomic_read(&clp->cl_count) > 1)
+			nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags);
 	}
 out:
 	/* The session may be reset by one of the error handlers. */
@@ -709,21 +717,22 @@ static void nfs4_init_opendata_res(struct nfs4_opendata *p)
 
 static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path,
 		struct nfs4_state_owner *sp, fmode_t fmode, int flags,
-		const struct iattr *attrs)
+		const struct iattr *attrs,
+		gfp_t gfp_mask)
 {
 	struct dentry *parent = dget_parent(path->dentry);
 	struct inode *dir = parent->d_inode;
 	struct nfs_server *server = NFS_SERVER(dir);
 	struct nfs4_opendata *p;
 
-	p = kzalloc(sizeof(*p), GFP_KERNEL);
+	p = kzalloc(sizeof(*p), gfp_mask);
 	if (p == NULL)
 		goto err;
-	p->o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid);
+	p->o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid, gfp_mask);
 	if (p->o_arg.seqid == NULL)
 		goto err_free;
-	p->path.mnt = mntget(path->mnt);
-	p->path.dentry = dget(path->dentry);
+	path_get(path);
+	p->path = *path;
 	p->dir = parent;
 	p->owner = sp;
 	atomic_inc(&sp->so_count);
@@ -1055,7 +1064,7 @@ static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context
 {
 	struct nfs4_opendata *opendata;
 
-	opendata = nfs4_opendata_alloc(&ctx->path, state->owner, 0, 0, NULL);
+	opendata = nfs4_opendata_alloc(&ctx->path, state->owner, 0, 0, NULL, GFP_NOFS);
 	if (opendata == NULL)
 		return ERR_PTR(-ENOMEM);
 	opendata->state = state;
@@ -1161,7 +1170,7 @@ static int nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state
 	int err;
 	do {
 		err = _nfs4_do_open_reclaim(ctx, state);
-		if (err != -NFS4ERR_DELAY)
+		if (err != -NFS4ERR_DELAY && err != -EKEYEXPIRED)
 			break;
 		nfs4_handle_exception(server, err, &exception);
 	} while (exception.retry);
@@ -1518,6 +1527,8 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
 		nfs_post_op_update_inode(dir, o_res->dir_attr);
 	} else
 		nfs_refresh_inode(dir, o_res->dir_attr);
+	if ((o_res->rflags & NFS4_OPEN_RESULT_LOCKTYPE_POSIX) == 0)
+		server->caps &= ~NFS_CAP_POSIX_LOCK;
 	if(o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) {
 		status = _nfs4_proc_open_confirm(data);
 		if (status != 0)
@@ -1580,6 +1591,7 @@ static int nfs4_do_open_expired(struct nfs_open_context *ctx, struct nfs4_state
 			goto out;
 		case -NFS4ERR_GRACE:
 		case -NFS4ERR_DELAY:
+		case -EKEYEXPIRED:
 			nfs4_handle_exception(server, err, &exception);
 			err = 0;
 		}
@@ -1640,7 +1652,7 @@ static int _nfs4_do_open(struct inode *dir, struct path *path, fmode_t fmode, in
 	if (path->dentry->d_inode != NULL)
 		nfs4_return_incompatible_delegation(path->dentry->d_inode, fmode);
 	status = -ENOMEM;
-	opendata = nfs4_opendata_alloc(path, sp, fmode, flags, sattr);
+	opendata = nfs4_opendata_alloc(path, sp, fmode, flags, sattr, GFP_KERNEL);
 	if (opendata == NULL)
 		goto err_put_state_owner;
 
@@ -1651,13 +1663,24 @@ static int _nfs4_do_open(struct inode *dir, struct path *path, fmode_t fmode, in
 	if (status != 0)
 		goto err_opendata_put;
 
-	if (opendata->o_arg.open_flags & O_EXCL)
-		nfs4_exclusive_attrset(opendata, sattr);
-
 	state = nfs4_opendata_to_nfs4_state(opendata);
 	status = PTR_ERR(state);
 	if (IS_ERR(state))
 		goto err_opendata_put;
+	if (server->caps & NFS_CAP_POSIX_LOCK)
+		set_bit(NFS_STATE_POSIX_LOCKS, &state->flags);
+
+	if (opendata->o_arg.open_flags & O_EXCL) {
+		nfs4_exclusive_attrset(opendata, sattr);
+
+		nfs_fattr_init(opendata->o_res.f_attr);
+		status = nfs4_do_setattr(state->inode, cred,
+				opendata->o_res.f_attr, sattr,
+				state);
+		if (status == 0)
+			nfs_setattr_update_inode(state->inode, sattr);
+		nfs_post_op_update_inode(state->inode, opendata->o_res.f_attr);
+	}
 	nfs4_opendata_put(opendata);
 	nfs4_put_state_owner(sp);
 	*res = state;
@@ -1904,7 +1927,7 @@ static const struct rpc_call_ops nfs4_close_ops = {
  *
  * NOTE: Caller must be holding the sp->so_owner semaphore!
  */
-int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait)
+int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait)
 {
 	struct nfs_server *server = NFS_SERVER(state->inode);
 	struct nfs4_closedata *calldata;
@@ -1923,7 +1946,7 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait)
 	};
 	int status = -ENOMEM;
 
-	calldata = kzalloc(sizeof(*calldata), GFP_KERNEL);
+	calldata = kzalloc(sizeof(*calldata), gfp_mask);
 	if (calldata == NULL)
 		goto out;
 	calldata->inode = state->inode;
@@ -1931,7 +1954,7 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait)
 	calldata->arg.fh = NFS_FH(state->inode);
 	calldata->arg.stateid = &state->open_stateid;
 	/* Serialization for the sequence id */
-	calldata->arg.seqid = nfs_alloc_seqid(&state->owner->so_seqid);
+	calldata->arg.seqid = nfs_alloc_seqid(&state->owner->so_seqid, gfp_mask);
 	if (calldata->arg.seqid == NULL)
 		goto out_free_calldata;
 	calldata->arg.fmode = 0;
@@ -1940,8 +1963,8 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait)
 	calldata->res.seqid = calldata->arg.seqid;
 	calldata->res.server = server;
 	calldata->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
-	calldata->path.mnt = mntget(path->mnt);
-	calldata->path.dentry = dget(path->dentry);
+	path_get(path);
+	calldata->path = *path;
 
 	msg.rpc_argp = &calldata->arg,
 	msg.rpc_resp = &calldata->res,
@@ -2060,8 +2083,7 @@ nfs4_open_revalidate(struct inode *dir, struct dentry *dentry, int openflags, st
 			case -EDQUOT:
 			case -ENOSPC:
 			case -EROFS:
-				lookup_instantiate_filp(nd, (struct dentry *)state, NULL);
-				return 1;
+				return PTR_ERR(state);
 			default:
 				goto out_drop;
 		}
@@ -2395,14 +2417,12 @@ static int nfs4_proc_lookup(struct inode *dir, struct qstr *name, struct nfs_fh
 static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry)
 {
 	struct nfs_server *server = NFS_SERVER(inode);
-	struct nfs_fattr fattr;
 	struct nfs4_accessargs args = {
 		.fh = NFS_FH(inode),
 		.bitmask = server->attr_bitmask,
 	};
 	struct nfs4_accessres res = {
 		.server = server,
-		.fattr = &fattr,
 	};
 	struct rpc_message msg = {
 		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ACCESS],
@@ -2429,7 +2449,11 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry
 		if (mode & MAY_EXEC)
 			args.access |= NFS4_ACCESS_EXECUTE;
 	}
-	nfs_fattr_init(&fattr);
+
+	res.fattr = nfs_alloc_fattr();
+	if (res.fattr == NULL)
+		return -ENOMEM;
+
 	status = nfs4_call_sync(server, &msg, &args, &res, 0);
 	if (!status) {
 		entry->mask = 0;
@@ -2439,8 +2463,9 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry
 			entry->mask |= MAY_WRITE;
 		if (res.access & (NFS4_ACCESS_LOOKUP|NFS4_ACCESS_EXECUTE))
 			entry->mask |= MAY_EXEC;
-		nfs_refresh_inode(inode, &fattr);
+		nfs_refresh_inode(inode, res.fattr);
 	}
+	nfs_free_fattr(res.fattr);
 	return status;
 }
 
@@ -2553,13 +2578,6 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 	}
 	d_add(dentry, igrab(state->inode));
 	nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
-	if (flags & O_EXCL) {
-		struct nfs_fattr fattr;
-		status = nfs4_do_setattr(state->inode, cred, &fattr, sattr, state);
-		if (status == 0)
-			nfs_setattr_update_inode(state->inode, sattr);
-		nfs_post_op_update_inode(state->inode, &fattr);
-	}
 	if (status == 0 && (nd->flags & LOOKUP_OPEN) != 0)
 		status = nfs4_intent_set_file(nd, &path, state, fmode);
 	else
@@ -2587,14 +2605,19 @@ static int _nfs4_proc_remove(struct inode *dir, struct qstr *name)
 		.rpc_argp = &args,
 		.rpc_resp = &res,
 	};
-	int			status;
+	int status = -ENOMEM;
+
+	res.dir_attr = nfs_alloc_fattr();
+	if (res.dir_attr == NULL)
+		goto out;
 
-	nfs_fattr_init(&res.dir_attr);
 	status = nfs4_call_sync(server, &msg, &args, &res, 1);
 	if (status == 0) {
 		update_changeattr(dir, &res.cinfo);
-		nfs_post_op_update_inode(dir, &res.dir_attr);
+		nfs_post_op_update_inode(dir, res.dir_attr);
 	}
+	nfs_free_fattr(res.dir_attr);
+out:
 	return status;
 }
 
@@ -2629,7 +2652,7 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir)
 	if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN)
 		return 0;
 	update_changeattr(dir, &res->cinfo);
-	nfs_post_op_update_inode(dir, &res->dir_attr);
+	nfs_post_op_update_inode(dir, res->dir_attr);
 	return 1;
 }
 
@@ -2644,29 +2667,31 @@ static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name,
 		.new_name = new_name,
 		.bitmask = server->attr_bitmask,
 	};
-	struct nfs_fattr old_fattr, new_fattr;
 	struct nfs4_rename_res res = {
 		.server = server,
-		.old_fattr = &old_fattr,
-		.new_fattr = &new_fattr,
 	};
 	struct rpc_message msg = {
 		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENAME],
 		.rpc_argp = &arg,
 		.rpc_resp = &res,
 	};
-	int			status;
+	int status = -ENOMEM;
 	
-	nfs_fattr_init(res.old_fattr);
-	nfs_fattr_init(res.new_fattr);
-	status = nfs4_call_sync(server, &msg, &arg, &res, 1);
+	res.old_fattr = nfs_alloc_fattr();
+	res.new_fattr = nfs_alloc_fattr();
+	if (res.old_fattr == NULL || res.new_fattr == NULL)
+		goto out;
 
+	status = nfs4_call_sync(server, &msg, &arg, &res, 1);
 	if (!status) {
 		update_changeattr(old_dir, &res.old_cinfo);
 		nfs_post_op_update_inode(old_dir, res.old_fattr);
 		update_changeattr(new_dir, &res.new_cinfo);
 		nfs_post_op_update_inode(new_dir, res.new_fattr);
 	}
+out:
+	nfs_free_fattr(res.new_fattr);
+	nfs_free_fattr(res.old_fattr);
 	return status;
 }
 
@@ -2693,28 +2718,30 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *
 		.name   = name,
 		.bitmask = server->attr_bitmask,
 	};
-	struct nfs_fattr fattr, dir_attr;
 	struct nfs4_link_res res = {
 		.server = server,
-		.fattr = &fattr,
-		.dir_attr = &dir_attr,
 	};
 	struct rpc_message msg = {
 		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LINK],
 		.rpc_argp = &arg,
 		.rpc_resp = &res,
 	};
-	int			status;
+	int status = -ENOMEM;
+
+	res.fattr = nfs_alloc_fattr();
+	res.dir_attr = nfs_alloc_fattr();
+	if (res.fattr == NULL || res.dir_attr == NULL)
+		goto out;
 
-	nfs_fattr_init(res.fattr);
-	nfs_fattr_init(res.dir_attr);
 	status = nfs4_call_sync(server, &msg, &arg, &res, 1);
 	if (!status) {
 		update_changeattr(dir, &res.cinfo);
 		nfs_post_op_update_inode(dir, res.dir_attr);
 		nfs_post_op_update_inode(inode, res.fattr);
 	}
-
+out:
+	nfs_free_fattr(res.dir_attr);
+	nfs_free_fattr(res.fattr);
 	return status;
 }
 
@@ -3137,14 +3164,31 @@ static void nfs4_proc_commit_setup(struct nfs_write_data *data, struct rpc_messa
 	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT];
 }
 
+struct nfs4_renewdata {
+	struct nfs_client	*client;
+	unsigned long		timestamp;
+};
+
 /*
  * nfs4_proc_async_renew(): This is not one of the nfs_rpc_ops; it is a special
  * standalone procedure for queueing an asynchronous RENEW.
  */
-static void nfs4_renew_done(struct rpc_task *task, void *data)
+static void nfs4_renew_release(void *calldata)
 {
-	struct nfs_client *clp = (struct nfs_client *)task->tk_msg.rpc_argp;
-	unsigned long timestamp = (unsigned long)data;
+	struct nfs4_renewdata *data = calldata;
+	struct nfs_client *clp = data->client;
+
+	if (atomic_read(&clp->cl_count) > 1)
+		nfs4_schedule_state_renewal(clp);
+	nfs_put_client(clp);
+	kfree(data);
+}
+
+static void nfs4_renew_done(struct rpc_task *task, void *calldata)
+{
+	struct nfs4_renewdata *data = calldata;
+	struct nfs_client *clp = data->client;
+	unsigned long timestamp = data->timestamp;
 
 	if (task->tk_status < 0) {
 		/* Unless we're shutting down, schedule state recovery! */
@@ -3160,6 +3204,7 @@ static void nfs4_renew_done(struct rpc_task *task, void *data)
 
 static const struct rpc_call_ops nfs4_renew_ops = {
 	.rpc_call_done = nfs4_renew_done,
+	.rpc_release = nfs4_renew_release,
 };
 
 int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred)
@@ -3169,9 +3214,17 @@ int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred)
 		.rpc_argp	= clp,
 		.rpc_cred	= cred,
 	};
+	struct nfs4_renewdata *data;
 
+	if (!atomic_inc_not_zero(&clp->cl_count))
+		return -EIO;
+	data = kmalloc(sizeof(*data), GFP_KERNEL);
+	if (data == NULL)
+		return -ENOMEM;
+	data->client = clp;
+	data->timestamp = jiffies;
 	return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_SOFT,
-			&nfs4_renew_ops, (void *)jiffies);
+			&nfs4_renew_ops, data);
 }
 
 int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred)
@@ -3422,15 +3475,14 @@ _nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
 			if (state == NULL)
 				break;
 			nfs4_state_mark_reclaim_nograce(clp, state);
-		case -NFS4ERR_STALE_CLIENTID:
+			goto do_state_recovery;
 		case -NFS4ERR_STALE_STATEID:
+			if (state == NULL)
+				break;
+			nfs4_state_mark_reclaim_reboot(clp, state);
+		case -NFS4ERR_STALE_CLIENTID:
 		case -NFS4ERR_EXPIRED:
-			rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL);
-			nfs4_schedule_state_recovery(clp);
-			if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0)
-				rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task);
-			task->tk_status = 0;
-			return -EAGAIN;
+			goto do_state_recovery;
 #if defined(CONFIG_NFS_V4_1)
 		case -NFS4ERR_BADSESSION:
 		case -NFS4ERR_BADSLOT:
@@ -3449,6 +3501,7 @@ _nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
 			if (server)
 				nfs_inc_server_stats(server, NFSIOS_DELAY);
 		case -NFS4ERR_GRACE:
+		case -EKEYEXPIRED:
 			rpc_delay(task, NFS4_POLL_RETRY_MAX);
 			task->tk_status = 0;
 			return -EAGAIN;
@@ -3458,6 +3511,13 @@ _nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
 	}
 	task->tk_status = nfs4_map_errors(task->tk_status);
 	return 0;
+do_state_recovery:
+	rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL);
+	nfs4_schedule_state_recovery(clp);
+	if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0)
+		rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task);
+	task->tk_status = 0;
+	return -EAGAIN;
 }
 
 static int
@@ -3466,7 +3526,9 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
 	return _nfs4_async_handle_error(task, server, server->nfs_client, state);
 }
 
-int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, unsigned short port, struct rpc_cred *cred)
+int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
+		unsigned short port, struct rpc_cred *cred,
+		struct nfs4_setclientid_res *res)
 {
 	nfs4_verifier sc_verifier;
 	struct nfs4_setclientid setclientid = {
@@ -3476,7 +3538,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, unsigned short po
 	struct rpc_message msg = {
 		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID],
 		.rpc_argp = &setclientid,
-		.rpc_resp = clp,
+		.rpc_resp = res,
 		.rpc_cred = cred,
 	};
 	__be32 *p;
@@ -3519,12 +3581,14 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, unsigned short po
 	return status;
 }
 
-static int _nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cred *cred)
+static int _nfs4_proc_setclientid_confirm(struct nfs_client *clp,
+		struct nfs4_setclientid_res *arg,
+		struct rpc_cred *cred)
 {
 	struct nfs_fsinfo fsinfo;
 	struct rpc_message msg = {
 		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID_CONFIRM],
-		.rpc_argp = clp,
+		.rpc_argp = arg,
 		.rpc_resp = &fsinfo,
 		.rpc_cred = cred,
 	};
@@ -3542,18 +3606,21 @@ static int _nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cre
 	return status;
 }
 
-int nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cred *cred)
+int nfs4_proc_setclientid_confirm(struct nfs_client *clp,
+		struct nfs4_setclientid_res *arg,
+		struct rpc_cred *cred)
 {
 	long timeout = 0;
 	int err;
 	do {
-		err = _nfs4_proc_setclientid_confirm(clp, cred);
+		err = _nfs4_proc_setclientid_confirm(clp, arg, cred);
 		switch (err) {
 			case 0:
 				return err;
 			case -NFS4ERR_RESOURCE:
 				/* The IBM lawyers misread another document! */
 			case -NFS4ERR_DELAY:
+			case -EKEYEXPIRED:
 				err = nfs4_delay(clp->cl_rpcclient, &timeout);
 		}
 	} while (err == 0);
@@ -3638,7 +3705,7 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
 	};
 	int status = 0;
 
-	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	data = kzalloc(sizeof(*data), GFP_NOFS);
 	if (data == NULL)
 		return -ENOMEM;
 	data->args.fhandle = &data->fh;
@@ -3794,7 +3861,7 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl,
 	struct nfs4_unlockdata *p;
 	struct inode *inode = lsp->ls_state->inode;
 
-	p = kzalloc(sizeof(*p), GFP_KERNEL);
+	p = kzalloc(sizeof(*p), GFP_NOFS);
 	if (p == NULL)
 		return NULL;
 	p->arg.fh = NFS_FH(inode);
@@ -3932,7 +3999,7 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *
 	if (test_bit(NFS_DELEGATED_STATE, &state->flags))
 		goto out;
 	lsp = request->fl_u.nfs4_fl.owner;
-	seqid = nfs_alloc_seqid(&lsp->ls_seqid);
+	seqid = nfs_alloc_seqid(&lsp->ls_seqid, GFP_KERNEL);
 	status = -ENOMEM;
 	if (seqid == NULL)
 		goto out;
@@ -3960,22 +4027,23 @@ struct nfs4_lockdata {
 };
 
 static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl,
-		struct nfs_open_context *ctx, struct nfs4_lock_state *lsp)
+		struct nfs_open_context *ctx, struct nfs4_lock_state *lsp,
+		gfp_t gfp_mask)
 {
 	struct nfs4_lockdata *p;
 	struct inode *inode = lsp->ls_state->inode;
 	struct nfs_server *server = NFS_SERVER(inode);
 
-	p = kzalloc(sizeof(*p), GFP_KERNEL);
+	p = kzalloc(sizeof(*p), gfp_mask);
 	if (p == NULL)
 		return NULL;
 
 	p->arg.fh = NFS_FH(inode);
 	p->arg.fl = &p->fl;
-	p->arg.open_seqid = nfs_alloc_seqid(&lsp->ls_state->owner->so_seqid);
+	p->arg.open_seqid = nfs_alloc_seqid(&lsp->ls_state->owner->so_seqid, gfp_mask);
 	if (p->arg.open_seqid == NULL)
 		goto out_free;
-	p->arg.lock_seqid = nfs_alloc_seqid(&lsp->ls_seqid);
+	p->arg.lock_seqid = nfs_alloc_seqid(&lsp->ls_seqid, gfp_mask);
 	if (p->arg.lock_seqid == NULL)
 		goto out_free_seqid;
 	p->arg.lock_stateid = &lsp->ls_stateid;
@@ -4088,6 +4156,28 @@ static const struct rpc_call_ops nfs4_recover_lock_ops = {
 	.rpc_release = nfs4_lock_release,
 };
 
+static void nfs4_handle_setlk_error(struct nfs_server *server, struct nfs4_lock_state *lsp, int new_lock_owner, int error)
+{
+	struct nfs_client *clp = server->nfs_client;
+	struct nfs4_state *state = lsp->ls_state;
+
+	switch (error) {
+	case -NFS4ERR_ADMIN_REVOKED:
+	case -NFS4ERR_BAD_STATEID:
+	case -NFS4ERR_EXPIRED:
+		if (new_lock_owner != 0 ||
+		   (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0)
+			nfs4_state_mark_reclaim_nograce(clp, state);
+		lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED;
+		break;
+	case -NFS4ERR_STALE_STATEID:
+		if (new_lock_owner != 0 ||
+		    (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0)
+			nfs4_state_mark_reclaim_reboot(clp, state);
+		lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED;
+	};
+}
+
 static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *fl, int recovery_type)
 {
 	struct nfs4_lockdata *data;
@@ -4107,7 +4197,8 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
 
 	dprintk("%s: begin!\n", __func__);
 	data = nfs4_alloc_lockdata(fl, nfs_file_open_context(fl->fl_file),
-			fl->fl_u.nfs4_fl.owner);
+			fl->fl_u.nfs4_fl.owner,
+			recovery_type == NFS_LOCK_NEW ? GFP_KERNEL : GFP_NOFS);
 	if (data == NULL)
 		return -ENOMEM;
 	if (IS_SETLKW(cmd))
@@ -4126,6 +4217,9 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
 	ret = nfs4_wait_for_completion_rpc_task(task);
 	if (ret == 0) {
 		ret = data->rpc_status;
+		if (ret)
+			nfs4_handle_setlk_error(data->server, data->lsp,
+					data->arg.new_lock_owner, ret);
 	} else
 		data->cancelled = 1;
 	rpc_put_task(task);
@@ -4144,7 +4238,7 @@ static int nfs4_lock_reclaim(struct nfs4_state *state, struct file_lock *request
 		if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0)
 			return 0;
 		err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_RECLAIM);
-		if (err != -NFS4ERR_DELAY)
+		if (err != -NFS4ERR_DELAY && err != -EKEYEXPIRED)
 			break;
 		nfs4_handle_exception(server, err, &exception);
 	} while (exception.retry);
@@ -4169,6 +4263,7 @@ static int nfs4_lock_expired(struct nfs4_state *state, struct file_lock *request
 			goto out;
 		case -NFS4ERR_GRACE:
 		case -NFS4ERR_DELAY:
+		case -EKEYEXPIRED:
 			nfs4_handle_exception(server, err, &exception);
 			err = 0;
 		}
@@ -4181,8 +4276,11 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock
 {
 	struct nfs_inode *nfsi = NFS_I(state->inode);
 	unsigned char fl_flags = request->fl_flags;
-	int status;
+	int status = -ENOLCK;
 
+	if ((fl_flags & FL_POSIX) &&
+			!test_bit(NFS_STATE_POSIX_LOCKS, &state->flags))
+		goto out;
 	/* Is this a delegated open? */
 	status = nfs4_set_lock_state(state, request);
 	if (status != 0)
@@ -4317,6 +4415,7 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl)
 				err = 0;
 				goto out;
 			case -NFS4ERR_DELAY:
+			case -EKEYEXPIRED:
 				break;
 		}
 		err = nfs4_handle_exception(server, err, &exception);
@@ -4462,7 +4561,7 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
 
 		status = rpc_call_sync(clp->cl_rpcclient, &msg, 0);
 
-		if (status != NFS4ERR_CLID_INUSE)
+		if (status != -NFS4ERR_CLID_INUSE)
 			break;
 
 		if (signalled())
@@ -4516,6 +4615,7 @@ static void nfs4_get_lease_time_done(struct rpc_task *task, void *calldata)
 	switch (task->tk_status) {
 	case -NFS4ERR_DELAY:
 	case -NFS4ERR_GRACE:
+	case -EKEYEXPIRED:
 		dprintk("%s Retry: tk_status %d\n", __func__, task->tk_status);
 		rpc_delay(task, NFS4_POLL_RETRY_MIN);
 		task->tk_status = 0;
@@ -4573,26 +4673,32 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo)
 /*
  * Reset a slot table
  */
-static int nfs4_reset_slot_table(struct nfs4_slot_table *tbl, int max_slots,
-		int old_max_slots, int ivalue)
+static int nfs4_reset_slot_table(struct nfs4_slot_table *tbl, u32 max_reqs,
+				 int ivalue)
 {
+	struct nfs4_slot *new = NULL;
 	int i;
 	int ret = 0;
 
-	dprintk("--> %s: max_reqs=%u, tbl %p\n", __func__, max_slots, tbl);
+	dprintk("--> %s: max_reqs=%u, tbl->max_slots %d\n", __func__,
+		max_reqs, tbl->max_slots);
 
-	/*
-	 * Until we have dynamic slot table adjustment, insist
-	 * upon the same slot table size
-	 */
-	if (max_slots != old_max_slots) {
-		dprintk("%s reset slot table does't match old\n",
-			__func__);
-		ret = -EINVAL; /*XXX NFS4ERR_REQ_TOO_BIG ? */
-		goto out;
+	/* Does the newly negotiated max_reqs match the existing slot table? */
+	if (max_reqs != tbl->max_slots) {
+		ret = -ENOMEM;
+		new = kmalloc(max_reqs * sizeof(struct nfs4_slot),
+			      GFP_NOFS);
+		if (!new)
+			goto out;
+		ret = 0;
+		kfree(tbl->slots);
 	}
 	spin_lock(&tbl->slot_tbl_lock);
-	for (i = 0; i < max_slots; ++i)
+	if (new) {
+		tbl->slots = new;
+		tbl->max_slots = max_reqs;
+	}
+	for (i = 0; i < tbl->max_slots; ++i)
 		tbl->slots[i].seq_nr = ivalue;
 	spin_unlock(&tbl->slot_tbl_lock);
 	dprintk("%s: tbl=%p slots=%p max_slots=%d\n", __func__,
@@ -4610,16 +4716,12 @@ static int nfs4_reset_slot_tables(struct nfs4_session *session)
 	int status;
 
 	status = nfs4_reset_slot_table(&session->fc_slot_table,
-			session->fc_attrs.max_reqs,
-			session->fc_slot_table.max_slots,
-			1);
+			session->fc_attrs.max_reqs, 1);
 	if (status)
 		return status;
 
 	status = nfs4_reset_slot_table(&session->bc_slot_table,
-			session->bc_attrs.max_reqs,
-			session->bc_slot_table.max_slots,
-			0);
+			session->bc_attrs.max_reqs, 0);
 	return status;
 }
 
@@ -4650,7 +4752,7 @@ static int nfs4_init_slot_table(struct nfs4_slot_table *tbl,
 
 	dprintk("--> %s: max_reqs=%u\n", __func__, max_slots);
 
-	slot = kcalloc(max_slots, sizeof(struct nfs4_slot), GFP_KERNEL);
+	slot = kcalloc(max_slots, sizeof(struct nfs4_slot), GFP_NOFS);
 	if (!slot)
 		goto out;
 	ret = 0;
@@ -4699,7 +4801,7 @@ struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp)
 	struct nfs4_session *session;
 	struct nfs4_slot_table *tbl;
 
-	session = kzalloc(sizeof(struct nfs4_session), GFP_KERNEL);
+	session = kzalloc(sizeof(struct nfs4_session), GFP_NOFS);
 	if (!session)
 		return NULL;
 
@@ -4760,16 +4862,14 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args)
 	args->fc_attrs.headerpadsz = 0;
 	args->fc_attrs.max_rqst_sz = mxrqst_sz;
 	args->fc_attrs.max_resp_sz = mxresp_sz;
-	args->fc_attrs.max_resp_sz_cached = mxresp_sz;
 	args->fc_attrs.max_ops = NFS4_MAX_OPS;
 	args->fc_attrs.max_reqs = session->clp->cl_rpcclient->cl_xprt->max_reqs;
 
 	dprintk("%s: Fore Channel : max_rqst_sz=%u max_resp_sz=%u "
-		"max_resp_sz_cached=%u max_ops=%u max_reqs=%u\n",
+		"max_ops=%u max_reqs=%u\n",
 		__func__,
 		args->fc_attrs.max_rqst_sz, args->fc_attrs.max_resp_sz,
-		args->fc_attrs.max_resp_sz_cached, args->fc_attrs.max_ops,
-		args->fc_attrs.max_reqs);
+		args->fc_attrs.max_ops, args->fc_attrs.max_reqs);
 
 	/* Back channel attributes */
 	args->bc_attrs.headerpadsz = 0;
@@ -4978,7 +5078,16 @@ static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred)
 				       &res, args.sa_cache_this, 1);
 }
 
-void nfs41_sequence_call_done(struct rpc_task *task, void *data)
+static void nfs41_sequence_release(void *data)
+{
+	struct nfs_client *clp = (struct nfs_client *)data;
+
+	if (atomic_read(&clp->cl_count) > 1)
+		nfs4_schedule_state_renewal(clp);
+	nfs_put_client(clp);
+}
+
+static void nfs41_sequence_call_done(struct rpc_task *task, void *data)
 {
 	struct nfs_client *clp = (struct nfs_client *)data;
 
@@ -4986,6 +5095,8 @@ void nfs41_sequence_call_done(struct rpc_task *task, void *data)
 
 	if (task->tk_status < 0) {
 		dprintk("%s ERROR %d\n", __func__, task->tk_status);
+		if (atomic_read(&clp->cl_count) == 1)
+			goto out;
 
 		if (_nfs4_async_handle_error(task, NULL, clp, NULL)
 								== -EAGAIN) {
@@ -4994,7 +5105,7 @@ void nfs41_sequence_call_done(struct rpc_task *task, void *data)
 		}
 	}
 	dprintk("%s rpc_cred %p\n", __func__, task->tk_msg.rpc_cred);
-
+out:
 	kfree(task->tk_msg.rpc_argp);
 	kfree(task->tk_msg.rpc_resp);
 
@@ -5019,6 +5130,7 @@ static void nfs41_sequence_prepare(struct rpc_task *task, void *data)
 static const struct rpc_call_ops nfs41_sequence_ops = {
 	.rpc_call_done = nfs41_sequence_call_done,
 	.rpc_call_prepare = nfs41_sequence_prepare,
+	.rpc_release = nfs41_sequence_release,
 };
 
 static int nfs41_proc_async_sequence(struct nfs_client *clp,
@@ -5031,12 +5143,14 @@ static int nfs41_proc_async_sequence(struct nfs_client *clp,
 		.rpc_cred = cred,
 	};
 
-	args = kzalloc(sizeof(*args), GFP_KERNEL);
-	if (!args)
-		return -ENOMEM;
-	res = kzalloc(sizeof(*res), GFP_KERNEL);
-	if (!res) {
+	if (!atomic_inc_not_zero(&clp->cl_count))
+		return -EIO;
+	args = kzalloc(sizeof(*args), GFP_NOFS);
+	res = kzalloc(sizeof(*res), GFP_NOFS);
+	if (!args || !res) {
 		kfree(args);
+		kfree(res);
+		nfs_put_client(clp);
 		return -ENOMEM;
 	}
 	res->sr_slotid = NFS4_MAX_SLOT_TABLE;
@@ -5133,7 +5247,7 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp)
 	int status = -ENOMEM;
 
 	dprintk("--> %s\n", __func__);
-	calldata = kzalloc(sizeof(*calldata), GFP_KERNEL);
+	calldata = kzalloc(sizeof(*calldata), GFP_NOFS);
 	if (calldata == NULL)
 		goto out;
 	calldata->clp = clp;
@@ -5144,9 +5258,12 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp)
 	msg.rpc_resp = &calldata->res;
 	task_setup_data.callback_data = calldata;
 	task = rpc_run_task(&task_setup_data);
-	if (IS_ERR(task))
+	if (IS_ERR(task)) {
 		status = PTR_ERR(task);
+		goto out;
+	}
 	rpc_put_task(task);
+	return 0;
 out:
 	dprintk("<-- %s status=%d\n", __func__, status);
 	return status;
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index 0156c01c212c..d87f10327b72 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -36,11 +36,6 @@
  * as an rpc_task, not a real kernel thread, so it always runs in rpciod's
  * context.  There is one renewd per nfs_server.
  *
- * TODO: If the send queue gets backlogged (e.g., if the server goes down),
- * we will keep filling the queue with periodic RENEW requests.  We need a
- * mechanism for ensuring that if renewd successfully sends off a request,
- * then it only wakes up when the request is finished.  Maybe use the
- * child task framework of the RPC layer?
  */
 
 #include <linux/mm.h>
@@ -63,7 +58,7 @@ nfs4_renew_state(struct work_struct *work)
 	struct nfs_client *clp =
 		container_of(work, struct nfs_client, cl_renewd.work);
 	struct rpc_cred *cred;
-	long lease, timeout;
+	long lease;
 	unsigned long last, now;
 
 	ops = nfs4_state_renewal_ops[clp->cl_minorversion];
@@ -75,7 +70,6 @@ nfs4_renew_state(struct work_struct *work)
 	lease = clp->cl_lease_time;
 	last = clp->cl_last_renewal;
 	now = jiffies;
-	timeout = (2 * lease) / 3 + (long)last - (long)now;
 	/* Are we close to a lease timeout? */
 	if (time_after(now, last + lease/3)) {
 		cred = ops->get_state_renewal_cred_locked(clp);
@@ -90,19 +84,15 @@ nfs4_renew_state(struct work_struct *work)
 			/* Queue an asynchronous RENEW. */
 			ops->sched_state_renewal(clp, cred);
 			put_rpccred(cred);
+			goto out_exp;
 		}
-		timeout = (2 * lease) / 3;
-		spin_lock(&clp->cl_lock);
-	} else
+	} else {
 		dprintk("%s: failed to call renewd. Reason: lease not expired \n",
 				__func__);
-	if (timeout < 5 * HZ)    /* safeguard */
-		timeout = 5 * HZ;
-	dprintk("%s: requeueing work. Lease period = %ld\n",
-			__func__, (timeout + HZ - 1) / HZ);
-	cancel_delayed_work(&clp->cl_renewd);
-	schedule_delayed_work(&clp->cl_renewd, timeout);
-	spin_unlock(&clp->cl_lock);
+		spin_unlock(&clp->cl_lock);
+	}
+	nfs4_schedule_state_renewal(clp);
+out_exp:
 	nfs_expire_unreferenced_delegations(clp);
 out:
 	dprintk("%s: done\n", __func__);
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 6d263ed79e92..34acf5926fdc 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -62,6 +62,7 @@ static LIST_HEAD(nfs4_clientid_list);
 
 int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
 {
+	struct nfs4_setclientid_res clid;
 	unsigned short port;
 	int status;
 
@@ -69,11 +70,15 @@ int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
 	if (clp->cl_addr.ss_family == AF_INET6)
 		port = nfs_callback_tcpport6;
 
-	status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, port, cred);
-	if (status == 0)
-		status = nfs4_proc_setclientid_confirm(clp, cred);
-	if (status == 0)
-		nfs4_schedule_state_renewal(clp);
+	status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, port, cred, &clid);
+	if (status != 0)
+		goto out;
+	status = nfs4_proc_setclientid_confirm(clp, &clid, cred);
+	if (status != 0)
+		goto out;
+	clp->cl_clientid = clid.clientid;
+	nfs4_schedule_state_renewal(clp);
+out:
 	return status;
 }
 
@@ -361,7 +366,7 @@ nfs4_alloc_state_owner(void)
 {
 	struct nfs4_state_owner *sp;
 
-	sp = kzalloc(sizeof(*sp),GFP_KERNEL);
+	sp = kzalloc(sizeof(*sp),GFP_NOFS);
 	if (!sp)
 		return NULL;
 	spin_lock_init(&sp->so_lock);
@@ -435,7 +440,7 @@ nfs4_alloc_open_state(void)
 {
 	struct nfs4_state *state;
 
-	state = kzalloc(sizeof(*state), GFP_KERNEL);
+	state = kzalloc(sizeof(*state), GFP_NOFS);
 	if (!state)
 		return NULL;
 	atomic_set(&state->count, 1);
@@ -537,7 +542,8 @@ void nfs4_put_open_state(struct nfs4_state *state)
 /*
  * Close the current file.
  */
-static void __nfs4_close(struct path *path, struct nfs4_state *state, fmode_t fmode, int wait)
+static void __nfs4_close(struct path *path, struct nfs4_state *state,
+		fmode_t fmode, gfp_t gfp_mask, int wait)
 {
 	struct nfs4_state_owner *owner = state->owner;
 	int call_close = 0;
@@ -578,17 +584,17 @@ static void __nfs4_close(struct path *path, struct nfs4_state *state, fmode_t fm
 		nfs4_put_open_state(state);
 		nfs4_put_state_owner(owner);
 	} else
-		nfs4_do_close(path, state, wait);
+		nfs4_do_close(path, state, gfp_mask, wait);
 }
 
 void nfs4_close_state(struct path *path, struct nfs4_state *state, fmode_t fmode)
 {
-	__nfs4_close(path, state, fmode, 0);
+	__nfs4_close(path, state, fmode, GFP_NOFS, 0);
 }
 
 void nfs4_close_sync(struct path *path, struct nfs4_state *state, fmode_t fmode)
 {
-	__nfs4_close(path, state, fmode, 1);
+	__nfs4_close(path, state, fmode, GFP_KERNEL, 1);
 }
 
 /*
@@ -618,7 +624,7 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
 	struct nfs4_lock_state *lsp;
 	struct nfs_client *clp = state->owner->so_client;
 
-	lsp = kzalloc(sizeof(*lsp), GFP_KERNEL);
+	lsp = kzalloc(sizeof(*lsp), GFP_NOFS);
 	if (lsp == NULL)
 		return NULL;
 	rpc_init_wait_queue(&lsp->ls_sequence.wait, "lock_seqid_waitqueue");
@@ -754,11 +760,11 @@ void nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t f
 	nfs4_put_lock_state(lsp);
 }
 
-struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter)
+struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask)
 {
 	struct nfs_seqid *new;
 
-	new = kmalloc(sizeof(*new), GFP_KERNEL);
+	new = kmalloc(sizeof(*new), gfp_mask);
 	if (new != NULL) {
 		new->sequence = counter;
 		INIT_LIST_HEAD(&new->list);
@@ -901,7 +907,7 @@ void nfs4_schedule_state_recovery(struct nfs_client *clp)
 	nfs4_schedule_state_manager(clp);
 }
 
-static int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state)
+int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state)
 {
 
 	set_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags);
@@ -1249,26 +1255,65 @@ static int nfs4_reclaim_lease(struct nfs_client *clp)
 }
 
 #ifdef CONFIG_NFS_V4_1
+void nfs41_handle_recall_slot(struct nfs_client *clp)
+{
+	set_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state);
+	nfs4_schedule_state_recovery(clp);
+}
+
+static void nfs4_reset_all_state(struct nfs_client *clp)
+{
+	if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) {
+		clp->cl_boot_time = CURRENT_TIME;
+		nfs4_state_start_reclaim_nograce(clp);
+		nfs4_schedule_state_recovery(clp);
+	}
+}
+
+static void nfs41_handle_server_reboot(struct nfs_client *clp)
+{
+	if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) {
+		nfs4_state_start_reclaim_reboot(clp);
+		nfs4_schedule_state_recovery(clp);
+	}
+}
+
+static void nfs41_handle_state_revoked(struct nfs_client *clp)
+{
+	/* Temporary */
+	nfs4_reset_all_state(clp);
+}
+
+static void nfs41_handle_recallable_state_revoked(struct nfs_client *clp)
+{
+	/* This will need to handle layouts too */
+	nfs_expire_all_delegations(clp);
+}
+
+static void nfs41_handle_cb_path_down(struct nfs_client *clp)
+{
+	nfs_expire_all_delegations(clp);
+	if (test_and_set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) == 0)
+		nfs4_schedule_state_recovery(clp);
+}
+
 void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags)
 {
 	if (!flags)
 		return;
-	else if (flags & SEQ4_STATUS_RESTART_RECLAIM_NEEDED) {
-		set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
-		nfs4_state_start_reclaim_reboot(clp);
-		nfs4_schedule_state_recovery(clp);
-	} else if (flags & (SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED |
+	else if (flags & SEQ4_STATUS_RESTART_RECLAIM_NEEDED)
+		nfs41_handle_server_reboot(clp);
+	else if (flags & (SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED |
 			    SEQ4_STATUS_EXPIRED_SOME_STATE_REVOKED |
 			    SEQ4_STATUS_ADMIN_STATE_REVOKED |
-			    SEQ4_STATUS_RECALLABLE_STATE_REVOKED |
-			    SEQ4_STATUS_LEASE_MOVED)) {
-		set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
-		nfs4_state_start_reclaim_nograce(clp);
-		nfs4_schedule_state_recovery(clp);
-	} else if (flags & (SEQ4_STATUS_CB_PATH_DOWN |
+			    SEQ4_STATUS_LEASE_MOVED))
+		nfs41_handle_state_revoked(clp);
+	else if (flags & SEQ4_STATUS_RECALLABLE_STATE_REVOKED)
+		nfs41_handle_recallable_state_revoked(clp);
+	else if (flags & (SEQ4_STATUS_CB_PATH_DOWN |
 			    SEQ4_STATUS_BACKCHANNEL_FAULT |
 			    SEQ4_STATUS_CB_PATH_DOWN_SESSION))
-		nfs_expire_all_delegations(clp);
+		nfs41_handle_cb_path_down(clp);
 }
 
 static int nfs4_reset_session(struct nfs_client *clp)
@@ -1285,23 +1330,52 @@ static int nfs4_reset_session(struct nfs_client *clp)
 
 	memset(clp->cl_session->sess_id.data, 0, NFS4_MAX_SESSIONID_LEN);
 	status = nfs4_proc_create_session(clp);
-	if (status)
+	if (status) {
 		status = nfs4_recovery_handle_error(clp, status);
+		goto out;
+	}
+	/* create_session negotiated new slot table */
+	clear_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state);
 
-out:
-	/*
-	 * Let the state manager reestablish state
-	 */
-	if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) &&
-	    status == 0)
+	 /* Let the state manager reestablish state */
+	if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state))
 		nfs41_setup_state_renewal(clp);
-
+out:
 	return status;
 }
 
+static int nfs4_recall_slot(struct nfs_client *clp)
+{
+	struct nfs4_slot_table *fc_tbl = &clp->cl_session->fc_slot_table;
+	struct nfs4_channel_attrs *fc_attrs = &clp->cl_session->fc_attrs;
+	struct nfs4_slot *new, *old;
+	int i;
+
+	nfs4_begin_drain_session(clp);
+	new = kmalloc(fc_tbl->target_max_slots * sizeof(struct nfs4_slot),
+		      GFP_NOFS);
+        if (!new)
+		return -ENOMEM;
+
+	spin_lock(&fc_tbl->slot_tbl_lock);
+	for (i = 0; i < fc_tbl->target_max_slots; i++)
+		new[i].seq_nr = fc_tbl->slots[i].seq_nr;
+	old = fc_tbl->slots;
+	fc_tbl->slots = new;
+	fc_tbl->max_slots = fc_tbl->target_max_slots;
+	fc_tbl->target_max_slots = 0;
+	fc_attrs->max_reqs = fc_tbl->max_slots;
+	spin_unlock(&fc_tbl->slot_tbl_lock);
+
+	kfree(old);
+	nfs4_end_drain_session(clp);
+	return 0;
+}
+
 #else /* CONFIG_NFS_V4_1 */
 static int nfs4_reset_session(struct nfs_client *clp) { return 0; }
 static int nfs4_end_drain_session(struct nfs_client *clp) { return 0; }
+static int nfs4_recall_slot(struct nfs_client *clp) { return 0; }
 #endif /* CONFIG_NFS_V4_1 */
 
 /* Set NFS4CLNT_LEASE_EXPIRED for all v4.0 errors and for recoverable errors
@@ -1314,6 +1388,7 @@ static void nfs4_set_lease_expired(struct nfs_client *clp, int status)
 		case -NFS4ERR_DELAY:
 		case -NFS4ERR_CLID_INUSE:
 		case -EAGAIN:
+		case -EKEYEXPIRED:
 			break;
 
 		case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery
@@ -1397,6 +1472,15 @@ static void nfs4_state_manager(struct nfs_client *clp)
 			nfs_client_return_marked_delegations(clp);
 			continue;
 		}
+		/* Recall session slots */
+		if (test_and_clear_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state)
+		   && nfs4_has_session(clp)) {
+			status = nfs4_recall_slot(clp);
+			if (status < 0)
+				goto out_error;
+			continue;
+		}
+
 
 		nfs4_clear_state_manager_bit(clp);
 		/* Did we race with an attempt to give us more work? */
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index e437fd6a819f..65c8dae4b267 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -38,7 +38,6 @@
 #include <linux/param.h>
 #include <linux/time.h>
 #include <linux/mm.h>
-#include <linux/slab.h>
 #include <linux/errno.h>
 #include <linux/string.h>
 #include <linux/in.h>
@@ -863,8 +862,8 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
 		bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET;
 		*p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME);
 		*p++ = cpu_to_be32(0);
-		*p++ = cpu_to_be32(iap->ia_mtime.tv_sec);
-		*p++ = cpu_to_be32(iap->ia_mtime.tv_nsec);
+		*p++ = cpu_to_be32(iap->ia_atime.tv_sec);
+		*p++ = cpu_to_be32(iap->ia_atime.tv_nsec);
 	}
 	else if (iap->ia_valid & ATTR_ATIME) {
 		bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET;
@@ -1505,14 +1504,14 @@ static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclie
 	hdr->replen += decode_setclientid_maxsz;
 }
 
-static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_client *client_state, struct compound_hdr *hdr)
+static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs4_setclientid_res *arg, struct compound_hdr *hdr)
 {
 	__be32 *p;
 
 	p = reserve_space(xdr, 12 + NFS4_VERIFIER_SIZE);
 	*p++ = cpu_to_be32(OP_SETCLIENTID_CONFIRM);
-	p = xdr_encode_hyper(p, client_state->cl_clientid);
-	xdr_encode_opaque_fixed(p, client_state->cl_confirm.data, NFS4_VERIFIER_SIZE);
+	p = xdr_encode_hyper(p, arg->clientid);
+	xdr_encode_opaque_fixed(p, arg->confirm.data, NFS4_VERIFIER_SIZE);
 	hdr->nops++;
 	hdr->replen += decode_setclientid_confirm_maxsz;
 }
@@ -1578,6 +1577,14 @@ static void encode_create_session(struct xdr_stream *xdr,
 	char machine_name[NFS4_MAX_MACHINE_NAME_LEN];
 	uint32_t len;
 	struct nfs_client *clp = args->client;
+	u32 max_resp_sz_cached;
+
+	/*
+	 * Assumes OPEN is the biggest non-idempotent compound.
+	 * 2 is the verifier.
+	 */
+	max_resp_sz_cached = (NFS4_dec_open_sz + RPC_REPHDRSIZE +
+			      RPC_MAX_AUTH_SIZE + 2) * XDR_UNIT;
 
 	len = scnprintf(machine_name, sizeof(machine_name), "%s",
 			clp->cl_ipaddr);
@@ -1592,7 +1599,7 @@ static void encode_create_session(struct xdr_stream *xdr,
 	*p++ = cpu_to_be32(args->fc_attrs.headerpadsz);	/* header padding size */
 	*p++ = cpu_to_be32(args->fc_attrs.max_rqst_sz);	/* max req size */
 	*p++ = cpu_to_be32(args->fc_attrs.max_resp_sz);	/* max resp size */
-	*p++ = cpu_to_be32(args->fc_attrs.max_resp_sz_cached);	/* Max resp sz cached */
+	*p++ = cpu_to_be32(max_resp_sz_cached);		/* Max resp sz cached */
 	*p++ = cpu_to_be32(args->fc_attrs.max_ops);	/* max operations */
 	*p++ = cpu_to_be32(args->fc_attrs.max_reqs);	/* max requests */
 	*p++ = cpu_to_be32(0);				/* rdmachannel_attrs */
@@ -2317,7 +2324,7 @@ static int nfs4_xdr_enc_setclientid(struct rpc_rqst *req, __be32 *p, struct nfs4
 /*
  * a SETCLIENTID_CONFIRM request
  */
-static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, __be32 *p, struct nfs_client *clp)
+static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, __be32 *p, struct nfs4_setclientid_res *arg)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
@@ -2327,7 +2334,7 @@ static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, __be32 *p, str
 
 	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
 	encode_compound_hdr(&xdr, req, &hdr);
-	encode_setclientid_confirm(&xdr, clp, &hdr);
+	encode_setclientid_confirm(&xdr, arg, &hdr);
 	encode_putrootfh(&xdr, &hdr);
 	encode_fsinfo(&xdr, lease_bitmap, &hdr);
 	encode_nops(&hdr);
@@ -4390,7 +4397,7 @@ out_overflow:
 	return -EIO;
 }
 
-static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp)
+static int decode_setclientid(struct xdr_stream *xdr, struct nfs4_setclientid_res *res)
 {
 	__be32 *p;
 	uint32_t opnum;
@@ -4410,8 +4417,8 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp)
 		p = xdr_inline_decode(xdr, 8 + NFS4_VERIFIER_SIZE);
 		if (unlikely(!p))
 			goto out_overflow;
-		p = xdr_decode_hyper(p, &clp->cl_clientid);
-		memcpy(clp->cl_confirm.data, p, NFS4_VERIFIER_SIZE);
+		p = xdr_decode_hyper(p, &res->clientid);
+		memcpy(res->confirm.data, p, NFS4_VERIFIER_SIZE);
 	} else if (nfserr == NFSERR_CLID_INUSE) {
 		uint32_t len;
 
@@ -4631,7 +4638,7 @@ static int decode_sequence(struct xdr_stream *xdr,
 	 * If the server returns different values for sessionID, slotID or
 	 * sequence number, the server is looney tunes.
 	 */
-	status = -ESERVERFAULT;
+	status = -EREMOTEIO;
 
 	if (memcmp(id.data, res->sr_session->sess_id.data,
 		   NFS4_MAX_SESSIONID_LEN)) {
@@ -4808,7 +4815,7 @@ static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, __be32 *p, struct nfs_rem
 		goto out;
 	if ((status = decode_remove(&xdr, &res->cinfo)) != 0)
 		goto out;
-	decode_getfattr(&xdr, &res->dir_attr, res->server,
+	decode_getfattr(&xdr, res->dir_attr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
 	return status;
@@ -5491,7 +5498,7 @@ static int nfs4_xdr_dec_renew(struct rpc_rqst *rqstp, __be32 *p, void *dummy)
  * Decode SETCLIENTID response
  */
 static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, __be32 *p,
-		struct nfs_client *clp)
+		struct nfs4_setclientid_res *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -5500,7 +5507,7 @@ static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, __be32 *p,
 	xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
 	status = decode_compound_hdr(&xdr, &hdr);
 	if (!status)
-		status = decode_setclientid(&xdr, clp);
+		status = decode_setclientid(&xdr, res);
 	return status;
 }
 
@@ -5544,6 +5551,8 @@ static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, __be32 *p, struct nf
 	if (status != 0)
 		goto out;
 	status = decode_delegreturn(&xdr);
+	if (status != 0)
+		goto out;
 	decode_getfattr(&xdr, res->fattr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
@@ -5774,7 +5783,7 @@ static struct {
 	{ NFS4ERR_BAD_COOKIE,	-EBADCOOKIE	},
 	{ NFS4ERR_NOTSUPP,	-ENOTSUPP	},
 	{ NFS4ERR_TOOSMALL,	-ETOOSMALL	},
-	{ NFS4ERR_SERVERFAULT,	-ESERVERFAULT	},
+	{ NFS4ERR_SERVERFAULT,	-EREMOTEIO	},
 	{ NFS4ERR_BADTYPE,	-EBADTYPE	},
 	{ NFS4ERR_LOCKED,	-EAGAIN		},
 	{ NFS4ERR_SYMLINK,	-ELOOP		},
@@ -5801,7 +5810,7 @@ nfs4_stat_to_errno(int stat)
 	}
 	if (stat <= 10000 || stat > 10100) {
 		/* The server is looney tunes. */
-		return -ESERVERFAULT;
+		return -EREMOTEIO;
 	}
 	/* If we cannot translate the error, the recovery routines should
 	 * handle it.
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index 8c55b27c0de4..df101d9f546a 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -105,7 +105,7 @@ static char nfs_root_name[256] __initdata = "";
 static __be32 servaddr __initdata = 0;
 
 /* Name of directory to mount */
-static char nfs_export_path[NFS_MAXPATHLEN] __initdata = { 0, };
+static char nfs_export_path[NFS_MAXPATHLEN + 1] __initdata = { 0, };
 
 /* NFS-related data */
 static struct nfs_mount_data nfs_data __initdata = { 0, };/* NFS mount info */
@@ -488,7 +488,6 @@ static int __init root_nfs_ports(void)
  */
 static int __init root_nfs_get_handle(void)
 {
-	struct nfs_fh fh;
 	struct sockaddr_in sin;
 	unsigned int auth_flav_len = 0;
 	struct nfs_mount_request request = {
@@ -499,21 +498,24 @@ static int __init root_nfs_get_handle(void)
 					NFS_MNT3_VERSION : NFS_MNT_VERSION,
 		.protocol	= (nfs_data.flags & NFS_MOUNT_TCP) ?
 					XPRT_TRANSPORT_TCP : XPRT_TRANSPORT_UDP,
-		.fh		= &fh,
 		.auth_flav_len	= &auth_flav_len,
 	};
-	int status;
+	int status = -ENOMEM;
 
+	request.fh = nfs_alloc_fhandle();
+	if (!request.fh)
+		goto out;
 	set_sockaddr(&sin, servaddr, htons(mount_port));
 	status = nfs_mount(&request);
 	if (status < 0)
 		printk(KERN_ERR "Root-NFS: Server returned error %d "
 				"while mounting %s\n", status, nfs_export_path);
 	else {
-		nfs_data.root.size = fh.size;
-		memcpy(nfs_data.root.data, fh.data, fh.size);
+		nfs_data.root.size = request.fh->size;
+		memcpy(&nfs_data.root.data, request.fh->data, request.fh->size);
 	}
-
+	nfs_free_fhandle(request.fh);
+out:
 	return status;
 }
 
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index e2975939126a..a3654e57b589 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -60,16 +60,10 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode,
 {
 	struct nfs_page		*req;
 
-	for (;;) {
-		/* try to allocate the request struct */
-		req = nfs_page_alloc();
-		if (req != NULL)
-			break;
-
-		if (fatal_signal_pending(current))
-			return ERR_PTR(-ERESTARTSYS);
-		yield();
-	}
+	/* try to allocate the request struct */
+	req = nfs_page_alloc();
+	if (req == NULL)
+		return ERR_PTR(-ENOMEM);
 
 	/* Initialize the request struct. Initially, we assume a
 	 * long write-back delay. This will be adjusted in
@@ -112,12 +106,10 @@ void nfs_unlock_request(struct nfs_page *req)
  */
 int nfs_set_page_tag_locked(struct nfs_page *req)
 {
-	struct nfs_inode *nfsi = NFS_I(req->wb_context->path.dentry->d_inode);
-
 	if (!nfs_lock_request_dontget(req))
 		return 0;
 	if (req->wb_page != NULL)
-		radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED);
+		radix_tree_tag_set(&NFS_I(req->wb_context->path.dentry->d_inode)->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED);
 	return 1;
 }
 
@@ -126,10 +118,10 @@ int nfs_set_page_tag_locked(struct nfs_page *req)
  */
 void nfs_clear_page_tag_locked(struct nfs_page *req)
 {
-	struct inode *inode = req->wb_context->path.dentry->d_inode;
-	struct nfs_inode *nfsi = NFS_I(inode);
-
 	if (req->wb_page != NULL) {
+		struct inode *inode = req->wb_context->path.dentry->d_inode;
+		struct nfs_inode *nfsi = NFS_I(inode);
+
 		spin_lock(&inode->i_lock);
 		radix_tree_tag_clear(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED);
 		nfs_unlock_request(req);
@@ -142,16 +134,22 @@ void nfs_clear_page_tag_locked(struct nfs_page *req)
  * nfs_clear_request - Free up all resources allocated to the request
  * @req:
  *
- * Release page resources associated with a write request after it
- * has completed.
+ * Release page and open context resources associated with a read/write
+ * request after it has completed.
  */
 void nfs_clear_request(struct nfs_page *req)
 {
 	struct page *page = req->wb_page;
+	struct nfs_open_context *ctx = req->wb_context;
+
 	if (page != NULL) {
 		page_cache_release(page);
 		req->wb_page = NULL;
 	}
+	if (ctx != NULL) {
+		put_nfs_open_context(ctx);
+		req->wb_context = NULL;
+	}
 }
 
 
@@ -165,9 +163,8 @@ static void nfs_free_request(struct kref *kref)
 {
 	struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref);
 
-	/* Release struct file or cached credential */
+	/* Release struct file and open context */
 	nfs_clear_request(req);
-	put_nfs_open_context(req->wb_context);
 	nfs_page_free(req);
 }
 
@@ -176,6 +173,12 @@ void nfs_release_request(struct nfs_page *req)
 	kref_put(&req->wb_kref, nfs_free_request);
 }
 
+static int nfs_wait_bit_uninterruptible(void *word)
+{
+	io_schedule();
+	return 0;
+}
+
 /**
  * nfs_wait_on_request - Wait for a request to complete.
  * @req: request to wait upon.
@@ -186,14 +189,9 @@ void nfs_release_request(struct nfs_page *req)
 int
 nfs_wait_on_request(struct nfs_page *req)
 {
-	int ret = 0;
-
-	if (!test_bit(PG_BUSY, &req->wb_flags))
-		goto out;
-	ret = out_of_line_wait_on_bit(&req->wb_flags, PG_BUSY,
-			nfs_wait_bit_killable, TASK_KILLABLE);
-out:
-	return ret;
+	return wait_on_bit(&req->wb_flags, PG_BUSY,
+			nfs_wait_bit_uninterruptible,
+			TASK_UNINTERRUPTIBLE);
 }
 
 /**
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index ef583854d8d0..611bec22f552 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -29,7 +29,6 @@
 
 #include <linux/types.h>
 #include <linux/param.h>
-#include <linux/slab.h>
 #include <linux/time.h>
 #include <linux/mm.h>
 #include <linux/errno.h>
@@ -47,6 +46,39 @@
 #define NFSDBG_FACILITY		NFSDBG_PROC
 
 /*
+ * wrapper to handle the -EKEYEXPIRED error message. This should generally
+ * only happen if using krb5 auth and a user's TGT expires. NFSv2 doesn't
+ * support the NFSERR_JUKEBOX error code, but we handle this situation in the
+ * same way that we handle that error with NFSv3.
+ */
+static int
+nfs_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags)
+{
+	int res;
+	do {
+		res = rpc_call_sync(clnt, msg, flags);
+		if (res != -EKEYEXPIRED)
+			break;
+		schedule_timeout_killable(NFS_JUKEBOX_RETRY_TIME);
+		res = -ERESTARTSYS;
+	} while (!fatal_signal_pending(current));
+	return res;
+}
+
+#define rpc_call_sync(clnt, msg, flags)	nfs_rpc_wrapper(clnt, msg, flags)
+
+static int
+nfs_async_handle_expired_key(struct rpc_task *task)
+{
+	if (task->tk_status != -EKEYEXPIRED)
+		return 0;
+	task->tk_status = 0;
+	rpc_restart_call(task);
+	rpc_delay(task, NFS_JUKEBOX_RETRY_TIME);
+	return 1;
+}
+
+/*
  * Bare-bones access to getattr: this is for nfs_read_super.
  */
 static int
@@ -192,35 +224,60 @@ static int nfs_proc_readlink(struct inode *inode, struct page *page,
 	return status;
 }
 
+struct nfs_createdata {
+	struct nfs_createargs arg;
+	struct nfs_diropok res;
+	struct nfs_fh fhandle;
+	struct nfs_fattr fattr;
+};
+
+static struct nfs_createdata *nfs_alloc_createdata(struct inode *dir,
+		struct dentry *dentry, struct iattr *sattr)
+{
+	struct nfs_createdata *data;
+
+	data = kmalloc(sizeof(*data), GFP_KERNEL);
+
+	if (data != NULL) {
+		data->arg.fh = NFS_FH(dir);
+		data->arg.name = dentry->d_name.name;
+		data->arg.len = dentry->d_name.len;
+		data->arg.sattr = sattr;
+		nfs_fattr_init(&data->fattr);
+		data->fhandle.size = 0;
+		data->res.fh = &data->fhandle;
+		data->res.fattr = &data->fattr;
+	}
+	return data;
+};
+
+static void nfs_free_createdata(const struct nfs_createdata *data)
+{
+	kfree(data);
+}
+
 static int
 nfs_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 		int flags, struct nameidata *nd)
 {
-	struct nfs_fh		fhandle;
-	struct nfs_fattr	fattr;
-	struct nfs_createargs	arg = {
-		.fh		= NFS_FH(dir),
-		.name		= dentry->d_name.name,
-		.len		= dentry->d_name.len,
-		.sattr		= sattr
-	};
-	struct nfs_diropok	res = {
-		.fh		= &fhandle,
-		.fattr		= &fattr
-	};
+	struct nfs_createdata *data;
 	struct rpc_message msg = {
 		.rpc_proc	= &nfs_procedures[NFSPROC_CREATE],
-		.rpc_argp	= &arg,
-		.rpc_resp	= &res,
 	};
-	int			status;
+	int status = -ENOMEM;
 
-	nfs_fattr_init(&fattr);
 	dprintk("NFS call  create %s\n", dentry->d_name.name);
+	data = nfs_alloc_createdata(dir, dentry, sattr);
+	if (data == NULL)
+		goto out;
+	msg.rpc_argp = &data->arg;
+	msg.rpc_resp = &data->res;
 	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
 	nfs_mark_for_revalidate(dir);
 	if (status == 0)
-		status = nfs_instantiate(dentry, &fhandle, &fattr);
+		status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
+	nfs_free_createdata(data);
+out:
 	dprintk("NFS reply create: %d\n", status);
 	return status;
 }
@@ -232,24 +289,12 @@ static int
 nfs_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 	       dev_t rdev)
 {
-	struct nfs_fh fhandle;
-	struct nfs_fattr fattr;
-	struct nfs_createargs	arg = {
-		.fh		= NFS_FH(dir),
-		.name		= dentry->d_name.name,
-		.len		= dentry->d_name.len,
-		.sattr		= sattr
-	};
-	struct nfs_diropok	res = {
-		.fh		= &fhandle,
-		.fattr		= &fattr
-	};
+	struct nfs_createdata *data;
 	struct rpc_message msg = {
 		.rpc_proc	= &nfs_procedures[NFSPROC_CREATE],
-		.rpc_argp	= &arg,
-		.rpc_resp	= &res,
 	};
-	int status, mode;
+	umode_t mode;
+	int status = -ENOMEM;
 
 	dprintk("NFS call  mknod %s\n", dentry->d_name.name);
 
@@ -262,17 +307,24 @@ nfs_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 		sattr->ia_size = new_encode_dev(rdev);/* get out your barf bag */
 	}
 
-	nfs_fattr_init(&fattr);
+	data = nfs_alloc_createdata(dir, dentry, sattr);
+	if (data == NULL)
+		goto out;
+	msg.rpc_argp = &data->arg;
+	msg.rpc_resp = &data->res;
+
 	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
 	nfs_mark_for_revalidate(dir);
 
 	if (status == -EINVAL && S_ISFIFO(mode)) {
 		sattr->ia_mode = mode;
-		nfs_fattr_init(&fattr);
+		nfs_fattr_init(data->res.fattr);
 		status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
 	}
 	if (status == 0)
-		status = nfs_instantiate(dentry, &fhandle, &fattr);
+		status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
+	nfs_free_createdata(data);
+out:
 	dprintk("NFS reply mknod: %d\n", status);
 	return status;
 }
@@ -307,6 +359,8 @@ nfs_proc_unlink_setup(struct rpc_message *msg, struct inode *dir)
 
 static int nfs_proc_unlink_done(struct rpc_task *task, struct inode *dir)
 {
+	if (nfs_async_handle_expired_key(task))
+		return 0;
 	nfs_mark_for_revalidate(dir);
 	return 1;
 }
@@ -364,8 +418,8 @@ static int
 nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
 		 unsigned int len, struct iattr *sattr)
 {
-	struct nfs_fh fhandle;
-	struct nfs_fattr fattr;
+	struct nfs_fh *fh;
+	struct nfs_fattr *fattr;
 	struct nfs_symlinkargs	arg = {
 		.fromfh		= NFS_FH(dir),
 		.fromname	= dentry->d_name.name,
@@ -378,12 +432,18 @@ nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
 		.rpc_proc	= &nfs_procedures[NFSPROC_SYMLINK],
 		.rpc_argp	= &arg,
 	};
-	int			status;
+	int status = -ENAMETOOLONG;
+
+	dprintk("NFS call  symlink %s\n", dentry->d_name.name);
 
 	if (len > NFS2_MAXPATHLEN)
-		return -ENAMETOOLONG;
+		goto out;
 
-	dprintk("NFS call  symlink %s\n", dentry->d_name.name);
+	fh = nfs_alloc_fhandle();
+	fattr = nfs_alloc_fattr();
+	status = -ENOMEM;
+	if (fh == NULL || fattr == NULL)
+		goto out;
 
 	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
 	nfs_mark_for_revalidate(dir);
@@ -393,12 +453,12 @@ nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
 	 * filehandle size to zero indicates to nfs_instantiate that it
 	 * should fill in the data with a LOOKUP call on the wire.
 	 */
-	if (status == 0) {
-		nfs_fattr_init(&fattr);
-		fhandle.size = 0;
-		status = nfs_instantiate(dentry, &fhandle, &fattr);
-	}
+	if (status == 0)
+		status = nfs_instantiate(dentry, fh, fattr);
 
+	nfs_free_fattr(fattr);
+	nfs_free_fhandle(fh);
+out:
 	dprintk("NFS reply symlink: %d\n", status);
 	return status;
 }
@@ -406,31 +466,25 @@ nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
 static int
 nfs_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
 {
-	struct nfs_fh fhandle;
-	struct nfs_fattr fattr;
-	struct nfs_createargs	arg = {
-		.fh		= NFS_FH(dir),
-		.name		= dentry->d_name.name,
-		.len		= dentry->d_name.len,
-		.sattr		= sattr
-	};
-	struct nfs_diropok	res = {
-		.fh		= &fhandle,
-		.fattr		= &fattr
-	};
+	struct nfs_createdata *data;
 	struct rpc_message msg = {
 		.rpc_proc	= &nfs_procedures[NFSPROC_MKDIR],
-		.rpc_argp	= &arg,
-		.rpc_resp	= &res,
 	};
-	int			status;
+	int status = -ENOMEM;
 
 	dprintk("NFS call  mkdir %s\n", dentry->d_name.name);
-	nfs_fattr_init(&fattr);
+	data = nfs_alloc_createdata(dir, dentry, sattr);
+	if (data == NULL)
+		goto out;
+	msg.rpc_argp = &data->arg;
+	msg.rpc_resp = &data->res;
+
 	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
 	nfs_mark_for_revalidate(dir);
 	if (status == 0)
-		status = nfs_instantiate(dentry, &fhandle, &fattr);
+		status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
+	nfs_free_createdata(data);
+out:
 	dprintk("NFS reply mkdir: %d\n", status);
 	return status;
 }
@@ -560,6 +614,9 @@ nfs_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
 
 static int nfs_read_done(struct rpc_task *task, struct nfs_read_data *data)
 {
+	if (nfs_async_handle_expired_key(task))
+		return -EAGAIN;
+
 	nfs_invalidate_atime(data->inode);
 	if (task->tk_status >= 0) {
 		nfs_refresh_inode(data->inode, data->res.fattr);
@@ -579,6 +636,9 @@ static void nfs_proc_read_setup(struct nfs_read_data *data, struct rpc_message *
 
 static int nfs_write_done(struct rpc_task *task, struct nfs_write_data *data)
 {
+	if (nfs_async_handle_expired_key(task))
+		return -EAGAIN;
+
 	if (task->tk_status >= 0)
 		nfs_post_op_update_inode_force_wcc(data->inode, data->res.fattr);
 	return 0;
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index db9b360ae19d..6e2b06e6ca79 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -40,7 +40,7 @@ static mempool_t *nfs_rdata_mempool;
 
 struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount)
 {
-	struct nfs_read_data *p = mempool_alloc(nfs_rdata_mempool, GFP_NOFS);
+	struct nfs_read_data *p = mempool_alloc(nfs_rdata_mempool, GFP_KERNEL);
 
 	if (p) {
 		memset(p, 0, sizeof(*p));
@@ -50,7 +50,7 @@ struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount)
 		if (pagecount <= ARRAY_SIZE(p->page_array))
 			p->pagevec = p->page_array;
 		else {
-			p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_NOFS);
+			p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_KERNEL);
 			if (!p->pagevec) {
 				mempool_free(p, nfs_rdata_mempool);
 				p = NULL;
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index ce907efc5508..f9df16de4a56 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -48,6 +48,7 @@
 #include <linux/vfs.h>
 #include <linux/inet.h>
 #include <linux/in6.h>
+#include <linux/slab.h>
 #include <net/ipv6.h>
 #include <linux/netdevice.h>
 #include <linux/nfs_xdr.h>
@@ -140,7 +141,6 @@ static const match_table_t nfs_mount_option_tokens = {
 	{ Opt_resvport, "resvport" },
 	{ Opt_noresvport, "noresvport" },
 	{ Opt_fscache, "fsc" },
-	{ Opt_fscache_uniq, "fsc=%s" },
 	{ Opt_nofscache, "nofsc" },
 
 	{ Opt_port, "port=%s" },
@@ -170,6 +170,7 @@ static const match_table_t nfs_mount_option_tokens = {
 	{ Opt_mountaddr, "mountaddr=%s" },
 
 	{ Opt_lookupcache, "lookupcache=%s" },
+	{ Opt_fscache_uniq, "fsc=%s" },
 
 	{ Opt_err, NULL }
 };
@@ -243,6 +244,7 @@ static int  nfs_show_stats(struct seq_file *, struct vfsmount *);
 static int nfs_get_sb(struct file_system_type *, int, const char *, void *, struct vfsmount *);
 static int nfs_xdev_get_sb(struct file_system_type *fs_type,
 		int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
+static void nfs_put_super(struct super_block *);
 static void nfs_kill_super(struct super_block *);
 static int nfs_remount(struct super_block *sb, int *flags, char *raw_data);
 
@@ -266,6 +268,7 @@ static const struct super_operations nfs_sops = {
 	.alloc_inode	= nfs_alloc_inode,
 	.destroy_inode	= nfs_destroy_inode,
 	.write_inode	= nfs_write_inode,
+	.put_super	= nfs_put_super,
 	.statfs		= nfs_statfs,
 	.clear_inode	= nfs_clear_inode,
 	.umount_begin	= nfs_umount_begin,
@@ -335,6 +338,7 @@ static const struct super_operations nfs4_sops = {
 	.alloc_inode	= nfs_alloc_inode,
 	.destroy_inode	= nfs_destroy_inode,
 	.write_inode	= nfs_write_inode,
+	.put_super	= nfs_put_super,
 	.statfs		= nfs_statfs,
 	.clear_inode	= nfs4_clear_inode,
 	.umount_begin	= nfs_umount_begin,
@@ -419,15 +423,19 @@ static int nfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	unsigned char blockbits;
 	unsigned long blockres;
 	struct nfs_fh *fh = NFS_FH(dentry->d_inode);
-	struct nfs_fattr fattr;
-	struct nfs_fsstat res = {
-			.fattr = &fattr,
-	};
-	int error;
+	struct nfs_fsstat res;
+	int error = -ENOMEM;
+
+	res.fattr = nfs_alloc_fattr();
+	if (res.fattr == NULL)
+		goto out_err;
 
 	error = server->nfs_client->rpc_ops->statfs(server, fh, &res);
+
+	nfs_free_fattr(res.fattr);
 	if (error < 0)
 		goto out_err;
+
 	buf->f_type = NFS_SUPER_MAGIC;
 
 	/*
@@ -562,6 +570,22 @@ static void nfs_show_mountd_options(struct seq_file *m, struct nfs_server *nfss,
 	nfs_show_mountd_netid(m, nfss, showdefaults);
 }
 
+#ifdef CONFIG_NFS_V4
+static void nfs_show_nfsv4_options(struct seq_file *m, struct nfs_server *nfss,
+				    int showdefaults)
+{
+	struct nfs_client *clp = nfss->nfs_client;
+
+	seq_printf(m, ",clientaddr=%s", clp->cl_ipaddr);
+	seq_printf(m, ",minorversion=%u", clp->cl_minorversion);
+}
+#else
+static void nfs_show_nfsv4_options(struct seq_file *m, struct nfs_server *nfss,
+				    int showdefaults)
+{
+}
+#endif
+
 /*
  * Describe the mount options in force on this server representation
  */
@@ -623,11 +647,9 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
 
 	if (version != 4)
 		nfs_show_mountd_options(m, nfss, showdefaults);
+	else
+		nfs_show_nfsv4_options(m, nfss, showdefaults);
 
-#ifdef CONFIG_NFS_V4
-	if (clp->rpc_ops->version == 4)
-		seq_printf(m, ",clientaddr=%s", clp->cl_ipaddr);
-#endif
 	if (nfss->options & NFS_OPTION_FSCACHE)
 		seq_printf(m, ",fsc");
 }
@@ -1042,14 +1064,6 @@ static int nfs_parse_mount_options(char *raw,
 			kfree(mnt->fscache_uniq);
 			mnt->fscache_uniq = NULL;
 			break;
-		case Opt_fscache_uniq:
-			string = match_strdup(args);
-			if (!string)
-				goto out_nomem;
-			kfree(mnt->fscache_uniq);
-			mnt->fscache_uniq = string;
-			mnt->options |= NFS_OPTION_FSCACHE;
-			break;
 
 		/*
 		 * options that take numeric values
@@ -1060,7 +1074,7 @@ static int nfs_parse_mount_options(char *raw,
 				goto out_nomem;
 			rc = strict_strtoul(string, 10, &option);
 			kfree(string);
-			if (rc != 0 || option > USHORT_MAX)
+			if (rc != 0 || option > USHRT_MAX)
 				goto out_invalid_value;
 			mnt->nfs_server.port = option;
 			break;
@@ -1181,7 +1195,7 @@ static int nfs_parse_mount_options(char *raw,
 				goto out_nomem;
 			rc = strict_strtoul(string, 10, &option);
 			kfree(string);
-			if (rc != 0 || option > USHORT_MAX)
+			if (rc != 0 || option > USHRT_MAX)
 				goto out_invalid_value;
 			mnt->mount_server.port = option;
 			break;
@@ -1380,6 +1394,14 @@ static int nfs_parse_mount_options(char *raw,
 					return 0;
 			};
 			break;
+		case Opt_fscache_uniq:
+			string = match_strdup(args);
+			if (string == NULL)
+				goto out_nomem;
+			kfree(mnt->fscache_uniq);
+			mnt->fscache_uniq = string;
+			mnt->options |= NFS_OPTION_FSCACHE;
+			break;
 
 		/*
 		 * Special options
@@ -2168,7 +2190,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
 	int error = -ENOMEM;
 
 	data = nfs_alloc_parsed_mount_data(3);
-	mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL);
+	mntfh = nfs_alloc_fhandle();
 	if (data == NULL || mntfh == NULL)
 		goto out_free_fh;
 
@@ -2183,6 +2205,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
 	if (data->version == 4) {
 		error = nfs4_try_mount(flags, dev_name, data, mnt);
 		kfree(data->client_address);
+		kfree(data->nfs_server.export_path);
 		goto out;
 	}
 #endif	/* CONFIG_NFS_V4 */
@@ -2211,7 +2234,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
 	} else {
 		error = nfs_bdi_register(server);
 		if (error)
-			goto error_splat_super;
+			goto error_splat_bdi;
 	}
 
 	if (!s->s_root) {
@@ -2242,7 +2265,7 @@ out:
 	kfree(data->fscache_uniq);
 	security_free_mnt_opts(&data->lsm_opts);
 out_free_fh:
-	kfree(mntfh);
+	nfs_free_fhandle(mntfh);
 	kfree(data);
 	return error;
 
@@ -2253,11 +2276,25 @@ out_err_nosb:
 error_splat_root:
 	dput(mntroot);
 error_splat_super:
+	if (server && !s->s_root)
+		bdi_unregister(&server->backing_dev_info);
+error_splat_bdi:
 	deactivate_locked_super(s);
 	goto out;
 }
 
 /*
+ * Ensure that we unregister the bdi before kill_anon_super
+ * releases the device name
+ */
+static void nfs_put_super(struct super_block *s)
+{
+	struct nfs_server *server = NFS_SB(s);
+
+	bdi_unregister(&server->backing_dev_info);
+}
+
+/*
  * Destroy an NFS2/3 superblock
  */
 static void nfs_kill_super(struct super_block *s)
@@ -2265,7 +2302,6 @@ static void nfs_kill_super(struct super_block *s)
 	struct nfs_server *server = NFS_SB(s);
 
 	kill_anon_super(s);
-	bdi_unregister(&server->backing_dev_info);
 	nfs_fscache_release_super_cookie(s);
 	nfs_free_server(server);
 }
@@ -2313,7 +2349,7 @@ static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags,
 	} else {
 		error = nfs_bdi_register(server);
 		if (error)
-			goto error_splat_super;
+			goto error_splat_bdi;
 	}
 
 	if (!s->s_root) {
@@ -2350,6 +2386,9 @@ out_err_noserver:
 	return error;
 
 error_splat_super:
+	if (server && !s->s_root)
+		bdi_unregister(&server->backing_dev_info);
+error_splat_bdi:
 	deactivate_locked_super(s);
 	dprintk("<-- nfs_xdev_get_sb() = %d [splat]\n", error);
 	return error;
@@ -2535,7 +2574,7 @@ static int nfs4_remote_get_sb(struct file_system_type *fs_type,
 	};
 	int error = -ENOMEM;
 
-	mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL);
+	mntfh = nfs_alloc_fhandle();
 	if (data == NULL || mntfh == NULL)
 		goto out_free_fh;
 
@@ -2565,7 +2604,7 @@ static int nfs4_remote_get_sb(struct file_system_type *fs_type,
 	} else {
 		error = nfs_bdi_register(server);
 		if (error)
-			goto error_splat_super;
+			goto error_splat_bdi;
 	}
 
 	if (!s->s_root) {
@@ -2593,7 +2632,7 @@ static int nfs4_remote_get_sb(struct file_system_type *fs_type,
 out:
 	security_free_mnt_opts(&data->lsm_opts);
 out_free_fh:
-	kfree(mntfh);
+	nfs_free_fhandle(mntfh);
 	return error;
 
 out_free:
@@ -2603,6 +2642,9 @@ out_free:
 error_splat_root:
 	dput(mntroot);
 error_splat_super:
+	if (server && !s->s_root)
+		bdi_unregister(&server->backing_dev_info);
+error_splat_bdi:
 	deactivate_locked_super(s);
 	goto out;
 }
@@ -2634,7 +2676,7 @@ static void nfs_fix_devname(const struct path *path, struct vfsmount *mnt)
 	devname = nfs_path(path->mnt->mnt_devname,
 			path->mnt->mnt_root, path->dentry,
 			page, PAGE_SIZE);
-	if (devname == NULL)
+	if (IS_ERR(devname))
 		goto out_freepage;
 	tmp = kstrdup(devname, GFP_KERNEL);
 	if (tmp == NULL)
@@ -2645,41 +2687,120 @@ out_freepage:
 	free_page((unsigned long)page);
 }
 
+struct nfs_referral_count {
+	struct list_head list;
+	const struct task_struct *task;
+	unsigned int referral_count;
+};
+
+static LIST_HEAD(nfs_referral_count_list);
+static DEFINE_SPINLOCK(nfs_referral_count_list_lock);
+
+static struct nfs_referral_count *nfs_find_referral_count(void)
+{
+	struct nfs_referral_count *p;
+
+	list_for_each_entry(p, &nfs_referral_count_list, list) {
+		if (p->task == current)
+			return p;
+	}
+	return NULL;
+}
+
+#define NFS_MAX_NESTED_REFERRALS 2
+
+static int nfs_referral_loop_protect(void)
+{
+	struct nfs_referral_count *p, *new;
+	int ret = -ENOMEM;
+
+	new = kmalloc(sizeof(*new), GFP_KERNEL);
+	if (!new)
+		goto out;
+	new->task = current;
+	new->referral_count = 1;
+
+	ret = 0;
+	spin_lock(&nfs_referral_count_list_lock);
+	p = nfs_find_referral_count();
+	if (p != NULL) {
+		if (p->referral_count >= NFS_MAX_NESTED_REFERRALS)
+			ret = -ELOOP;
+		else
+			p->referral_count++;
+	} else {
+		list_add(&new->list, &nfs_referral_count_list);
+		new = NULL;
+	}
+	spin_unlock(&nfs_referral_count_list_lock);
+	kfree(new);
+out:
+	return ret;
+}
+
+static void nfs_referral_loop_unprotect(void)
+{
+	struct nfs_referral_count *p;
+
+	spin_lock(&nfs_referral_count_list_lock);
+	p = nfs_find_referral_count();
+	p->referral_count--;
+	if (p->referral_count == 0)
+		list_del(&p->list);
+	else
+		p = NULL;
+	spin_unlock(&nfs_referral_count_list_lock);
+	kfree(p);
+}
+
 static int nfs_follow_remote_path(struct vfsmount *root_mnt,
 		const char *export_path, struct vfsmount *mnt_target)
 {
+	struct nameidata *nd = NULL;
 	struct mnt_namespace *ns_private;
-	struct nameidata nd;
 	struct super_block *s;
 	int ret;
 
+	nd = kmalloc(sizeof(*nd), GFP_KERNEL);
+	if (nd == NULL)
+		return -ENOMEM;
+
 	ns_private = create_mnt_ns(root_mnt);
 	ret = PTR_ERR(ns_private);
 	if (IS_ERR(ns_private))
 		goto out_mntput;
 
+	ret = nfs_referral_loop_protect();
+	if (ret != 0)
+		goto out_put_mnt_ns;
+
 	ret = vfs_path_lookup(root_mnt->mnt_root, root_mnt,
-			export_path, LOOKUP_FOLLOW, &nd);
+			export_path, LOOKUP_FOLLOW, nd);
 
+	nfs_referral_loop_unprotect();
 	put_mnt_ns(ns_private);
 
 	if (ret != 0)
 		goto out_err;
 
-	s = nd.path.mnt->mnt_sb;
+	s = nd->path.mnt->mnt_sb;
 	atomic_inc(&s->s_active);
 	mnt_target->mnt_sb = s;
-	mnt_target->mnt_root = dget(nd.path.dentry);
+	mnt_target->mnt_root = dget(nd->path.dentry);
 
 	/* Correct the device pathname */
-	nfs_fix_devname(&nd.path, mnt_target);
+	nfs_fix_devname(&nd->path, mnt_target);
 
-	path_put(&nd.path);
+	path_put(&nd->path);
+	kfree(nd);
 	down_write(&s->s_umount);
 	return 0;
+out_put_mnt_ns:
+	put_mnt_ns(ns_private);
 out_mntput:
 	mntput(root_mnt);
 out_err:
+	kfree(nd);
 	return ret;
 }
 
@@ -2798,7 +2919,7 @@ static int nfs4_xdev_get_sb(struct file_system_type *fs_type, int flags,
 	} else {
 		error = nfs_bdi_register(server);
 		if (error)
-			goto error_splat_super;
+			goto error_splat_bdi;
 	}
 
 	if (!s->s_root) {
@@ -2834,6 +2955,9 @@ out_err_noserver:
 	return error;
 
 error_splat_super:
+	if (server && !s->s_root)
+		bdi_unregister(&server->backing_dev_info);
+error_splat_bdi:
 	deactivate_locked_super(s);
 	dprintk("<-- nfs4_xdev_get_sb() = %d [splat]\n", error);
 	return error;
@@ -2847,17 +2971,21 @@ static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type,
 	struct super_block *s;
 	struct nfs_server *server;
 	struct dentry *mntroot;
-	struct nfs_fh mntfh;
+	struct nfs_fh *mntfh;
 	int (*compare_super)(struct super_block *, void *) = nfs_compare_super;
 	struct nfs_sb_mountdata sb_mntdata = {
 		.mntflags = flags,
 	};
-	int error;
+	int error = -ENOMEM;
 
 	dprintk("--> nfs4_referral_get_sb()\n");
 
+	mntfh = nfs_alloc_fhandle();
+	if (mntfh == NULL)
+		goto out_err_nofh;
+
 	/* create a new volume representation */
-	server = nfs4_create_referral_server(data, &mntfh);
+	server = nfs4_create_referral_server(data, mntfh);
 	if (IS_ERR(server)) {
 		error = PTR_ERR(server);
 		goto out_err_noserver;
@@ -2880,7 +3008,7 @@ static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type,
 	} else {
 		error = nfs_bdi_register(server);
 		if (error)
-			goto error_splat_super;
+			goto error_splat_bdi;
 	}
 
 	if (!s->s_root) {
@@ -2889,7 +3017,7 @@ static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type,
 		nfs_fscache_get_super_cookie(s, NULL, data);
 	}
 
-	mntroot = nfs4_get_root(s, &mntfh);
+	mntroot = nfs4_get_root(s, mntfh);
 	if (IS_ERR(mntroot)) {
 		error = PTR_ERR(mntroot);
 		goto error_splat_super;
@@ -2906,17 +3034,24 @@ static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type,
 
 	security_sb_clone_mnt_opts(data->sb, s);
 
+	nfs_free_fhandle(mntfh);
 	dprintk("<-- nfs4_referral_get_sb() = 0\n");
 	return 0;
 
 out_err_nosb:
 	nfs_free_server(server);
 out_err_noserver:
+	nfs_free_fhandle(mntfh);
+out_err_nofh:
 	dprintk("<-- nfs4_referral_get_sb() = %d [error]\n", error);
 	return error;
 
 error_splat_super:
+	if (server && !s->s_root)
+		bdi_unregister(&server->backing_dev_info);
+error_splat_bdi:
 	deactivate_locked_super(s);
+	nfs_free_fhandle(mntfh);
 	dprintk("<-- nfs4_referral_get_sb() = %d [splat]\n", error);
 	return error;
 }
diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c
index 412738dbfbc7..05c9e02f4153 100644
--- a/fs/nfs/symlink.c
+++ b/fs/nfs/symlink.c
@@ -19,7 +19,6 @@
 #include <linux/pagemap.h>
 #include <linux/stat.h>
 #include <linux/mm.h>
-#include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/namei.h>
 
@@ -50,7 +49,7 @@ static void *nfs_follow_link(struct dentry *dentry, struct nameidata *nd)
 	struct page *page;
 	void *err;
 
-	err = ERR_PTR(nfs_revalidate_mapping_nolock(inode, inode->i_mapping));
+	err = ERR_PTR(nfs_revalidate_mapping(inode, inode->i_mapping));
 	if (err)
 		goto read_failed;
 	page = read_cache_page(&inode->i_data, 0,
diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c
index 70e1fbbaaeab..ad4d2e787b20 100644
--- a/fs/nfs/sysctl.c
+++ b/fs/nfs/sysctl.c
@@ -15,8 +15,10 @@
 
 #include "callback.h"
 
+#ifdef CONFIG_NFS_V4
 static const int nfs_set_port_min = 0;
 static const int nfs_set_port_max = 65535;
+#endif
 static struct ctl_table_header *nfs_callback_sysctl_table;
 
 static ctl_table nfs_cb_sysctls[] = {
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 6da3d3ff6edd..a2242af6a17d 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -23,6 +23,7 @@ struct nfs_unlinkdata {
 	struct nfs_removeres res;
 	struct inode *dir;
 	struct rpc_cred	*cred;
+	struct nfs_fattr dir_attr;
 };
 
 /**
@@ -169,7 +170,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
 	}
 	nfs_sb_active(dir->i_sb);
 	data->args.fh = NFS_FH(dir);
-	nfs_fattr_init(&data->res.dir_attr);
+	nfs_fattr_init(data->res.dir_attr);
 
 	NFS_PROTO(dir)->unlink_setup(&msg, dir);
 
@@ -259,6 +260,7 @@ nfs_async_unlink(struct inode *dir, struct dentry *dentry)
 		goto out_free;
 	}
 	data->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
+	data->res.dir_attr = &data->dir_attr;
 
 	status = -EBUSY;
 	spin_lock(&dentry->d_lock);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index d171696017f4..9f81bdd91c55 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -201,6 +201,7 @@ static int nfs_set_page_writeback(struct page *page)
 		struct inode *inode = page->mapping->host;
 		struct nfs_server *nfss = NFS_SERVER(inode);
 
+		page_cache_get(page);
 		if (atomic_long_inc_return(&nfss->writeback) >
 				NFS_CONGESTION_ON_THRESH) {
 			set_bdi_congested(&nfss->backing_dev_info,
@@ -216,11 +217,12 @@ static void nfs_end_page_writeback(struct page *page)
 	struct nfs_server *nfss = NFS_SERVER(inode);
 
 	end_page_writeback(page);
+	page_cache_release(page);
 	if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH)
 		clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC);
 }
 
-static struct nfs_page *nfs_find_and_lock_request(struct page *page)
+static struct nfs_page *nfs_find_and_lock_request(struct page *page, bool nonblock)
 {
 	struct inode *inode = page->mapping->host;
 	struct nfs_page *req;
@@ -239,7 +241,10 @@ static struct nfs_page *nfs_find_and_lock_request(struct page *page)
 		 *	 request as dirty (in which case we don't care).
 		 */
 		spin_unlock(&inode->i_lock);
-		ret = nfs_wait_on_request(req);
+		if (!nonblock)
+			ret = nfs_wait_on_request(req);
+		else
+			ret = -EAGAIN;
 		nfs_release_request(req);
 		if (ret != 0)
 			return ERR_PTR(ret);
@@ -254,12 +259,12 @@ static struct nfs_page *nfs_find_and_lock_request(struct page *page)
  * May return an error if the user signalled nfs_wait_on_request().
  */
 static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
-				struct page *page)
+				struct page *page, bool nonblock)
 {
 	struct nfs_page *req;
 	int ret = 0;
 
-	req = nfs_find_and_lock_request(page);
+	req = nfs_find_and_lock_request(page, nonblock);
 	if (!req)
 		goto out;
 	ret = PTR_ERR(req);
@@ -281,12 +286,20 @@ out:
 static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio)
 {
 	struct inode *inode = page->mapping->host;
+	int ret;
 
 	nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
 	nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1);
 
 	nfs_pageio_cond_complete(pgio, page->index);
-	return nfs_page_async_flush(pgio, page);
+	ret = nfs_page_async_flush(pgio, page,
+			wbc->sync_mode == WB_SYNC_NONE ||
+			wbc->nonblocking != 0);
+	if (ret == -EAGAIN) {
+		redirty_page_for_writepage(wbc, page);
+		ret = 0;
+	}
+	return ret;
 }
 
 /*
@@ -421,6 +434,7 @@ static void
 nfs_mark_request_dirty(struct nfs_page *req)
 {
 	__set_page_dirty_nobuffers(req->wb_page);
+	__mark_inode_dirty(req->wb_page->mapping->host, I_DIRTY_DATASYNC);
 }
 
 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
@@ -438,6 +452,7 @@ nfs_mark_request_commit(struct nfs_page *req)
 	radix_tree_tag_set(&nfsi->nfs_page_tree,
 			req->wb_index,
 			NFS_PAGE_TAG_COMMIT);
+	nfsi->ncommit++;
 	spin_unlock(&inode->i_lock);
 	inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
 	inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE);
@@ -501,57 +516,6 @@ int nfs_reschedule_unstable_write(struct nfs_page *req)
 }
 #endif
 
-/*
- * Wait for a request to complete.
- *
- * Interruptible by fatal signals only.
- */
-static int nfs_wait_on_requests_locked(struct inode *inode, pgoff_t idx_start, unsigned int npages)
-{
-	struct nfs_inode *nfsi = NFS_I(inode);
-	struct nfs_page *req;
-	pgoff_t idx_end, next;
-	unsigned int		res = 0;
-	int			error;
-
-	if (npages == 0)
-		idx_end = ~0;
-	else
-		idx_end = idx_start + npages - 1;
-
-	next = idx_start;
-	while (radix_tree_gang_lookup_tag(&nfsi->nfs_page_tree, (void **)&req, next, 1, NFS_PAGE_TAG_LOCKED)) {
-		if (req->wb_index > idx_end)
-			break;
-
-		next = req->wb_index + 1;
-		BUG_ON(!NFS_WBACK_BUSY(req));
-
-		kref_get(&req->wb_kref);
-		spin_unlock(&inode->i_lock);
-		error = nfs_wait_on_request(req);
-		nfs_release_request(req);
-		spin_lock(&inode->i_lock);
-		if (error < 0)
-			return error;
-		res++;
-	}
-	return res;
-}
-
-static void nfs_cancel_commit_list(struct list_head *head)
-{
-	struct nfs_page *req;
-
-	while(!list_empty(head)) {
-		req = nfs_list_entry(head->next);
-		nfs_list_remove_request(req);
-		nfs_clear_request_commit(req);
-		nfs_inode_remove_request(req);
-		nfs_unlock_request(req);
-	}
-}
-
 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
 static int
 nfs_need_commit(struct nfs_inode *nfsi)
@@ -573,11 +537,17 @@ static int
 nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
+	int ret;
 
 	if (!nfs_need_commit(nfsi))
 		return 0;
 
-	return nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT);
+	ret = nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT);
+	if (ret > 0)
+		nfsi->ncommit -= ret;
+	if (nfs_need_commit(NFS_I(inode)))
+		__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
+	return ret;
 }
 #else
 static inline int nfs_need_commit(struct nfs_inode *nfsi)
@@ -642,9 +612,10 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
 		spin_lock(&inode->i_lock);
 	}
 
-	if (nfs_clear_request_commit(req))
-		radix_tree_tag_clear(&NFS_I(inode)->nfs_page_tree,
-				req->wb_index, NFS_PAGE_TAG_COMMIT);
+	if (nfs_clear_request_commit(req) &&
+			radix_tree_tag_clear(&NFS_I(inode)->nfs_page_tree,
+				req->wb_index, NFS_PAGE_TAG_COMMIT) != NULL)
+		NFS_I(inode)->ncommit--;
 
 	/* Okay, the request matches. Update the region */
 	if (offset < req->wb_offset) {
@@ -703,9 +674,11 @@ static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page,
 	req = nfs_setup_write_request(ctx, page, offset, count);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
+	nfs_mark_request_dirty(req);
 	/* Update file length */
 	nfs_grow_file(page, offset, count);
 	nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes);
+	nfs_mark_request_dirty(req);
 	nfs_clear_page_tag_locked(req);
 	return 0;
 }
@@ -782,8 +755,6 @@ int nfs_updatepage(struct file *file, struct page *page,
 	status = nfs_writepage_setup(ctx, page, offset, count);
 	if (status < 0)
 		nfs_set_pageerror(page);
-	else
-		__set_page_dirty_nobuffers(page);
 
 	dprintk("NFS:       nfs_updatepage returns %d (isize %lld)\n",
 			status, (long long)i_size_read(inode));
@@ -792,13 +763,12 @@ int nfs_updatepage(struct file *file, struct page *page,
 
 static void nfs_writepage_release(struct nfs_page *req)
 {
+	struct page *page = req->wb_page;
 
-	if (PageError(req->wb_page) || !nfs_reschedule_unstable_write(req)) {
-		nfs_end_page_writeback(req->wb_page);
+	if (PageError(req->wb_page) || !nfs_reschedule_unstable_write(req))
 		nfs_inode_remove_request(req);
-	} else
-		nfs_end_page_writeback(req->wb_page);
 	nfs_clear_page_tag_locked(req);
+	nfs_end_page_writeback(page);
 }
 
 static int flush_task_priority(int how)
@@ -822,7 +792,6 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
 		int how)
 {
 	struct inode *inode = req->wb_context->path.dentry->d_inode;
-	int flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC;
 	int priority = flush_task_priority(how);
 	struct rpc_task *task;
 	struct rpc_message msg = {
@@ -837,9 +806,10 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
 		.callback_ops = call_ops,
 		.callback_data = data,
 		.workqueue = nfsiod_workqueue,
-		.flags = flags,
+		.flags = RPC_TASK_ASYNC,
 		.priority = priority,
 	};
+	int ret = 0;
 
 	/* Set up the RPC argument and reply structs
 	 * NB: take care not to mess about with data->commit et al. */
@@ -878,10 +848,18 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
 		(unsigned long long)data->args.offset);
 
 	task = rpc_run_task(&task_setup_data);
-	if (IS_ERR(task))
-		return PTR_ERR(task);
+	if (IS_ERR(task)) {
+		ret = PTR_ERR(task);
+		goto out;
+	}
+	if (how & FLUSH_SYNC) {
+		ret = rpc_wait_for_completion_task(task);
+		if (ret == 0)
+			ret = task->tk_status;
+	}
 	rpc_put_task(task);
-	return 0;
+out:
+	return ret;
 }
 
 /* If a nfs_flush_* function fails, it should remove reqs from @head and
@@ -890,9 +868,11 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
  */
 static void nfs_redirty_request(struct nfs_page *req)
 {
+	struct page *page = req->wb_page;
+
 	nfs_mark_request_dirty(req);
-	nfs_end_page_writeback(req->wb_page);
 	nfs_clear_page_tag_locked(req);
+	nfs_end_page_writeback(page);
 }
 
 /*
@@ -1127,16 +1107,15 @@ static void nfs_writeback_release_full(void *calldata)
 		if (nfs_write_need_commit(data)) {
 			memcpy(&req->wb_verf, &data->verf, sizeof(req->wb_verf));
 			nfs_mark_request_commit(req);
-			nfs_end_page_writeback(page);
 			dprintk(" marked for commit\n");
 			goto next;
 		}
 		dprintk(" OK\n");
 remove_request:
-		nfs_end_page_writeback(page);
 		nfs_inode_remove_request(req);
 	next:
 		nfs_clear_page_tag_locked(req);
+		nfs_end_page_writeback(page);
 	}
 	nfs_writedata_release(calldata);
 }
@@ -1233,7 +1212,26 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
 
 
 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
-void nfs_commitdata_release(void *data)
+static int nfs_commit_set_lock(struct nfs_inode *nfsi, int may_wait)
+{
+	if (!test_and_set_bit(NFS_INO_COMMIT, &nfsi->flags))
+		return 1;
+	if (may_wait && !out_of_line_wait_on_bit_lock(&nfsi->flags,
+				NFS_INO_COMMIT, nfs_wait_bit_killable,
+				TASK_KILLABLE))
+		return 1;
+	return 0;
+}
+
+static void nfs_commit_clear_lock(struct nfs_inode *nfsi)
+{
+	clear_bit(NFS_INO_COMMIT, &nfsi->flags);
+	smp_mb__after_clear_bit();
+	wake_up_bit(&nfsi->flags, NFS_INO_COMMIT);
+}
+
+
+static void nfs_commitdata_release(void *data)
 {
 	struct nfs_write_data *wdata = data;
 
@@ -1250,7 +1248,6 @@ static int nfs_commit_rpcsetup(struct list_head *head,
 {
 	struct nfs_page *first = nfs_list_entry(head->next);
 	struct inode *inode = first->wb_context->path.dentry->d_inode;
-	int flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC;
 	int priority = flush_task_priority(how);
 	struct rpc_task *task;
 	struct rpc_message msg = {
@@ -1265,7 +1262,7 @@ static int nfs_commit_rpcsetup(struct list_head *head,
 		.callback_ops = &nfs_commit_ops,
 		.callback_data = data,
 		.workqueue = nfsiod_workqueue,
-		.flags = flags,
+		.flags = RPC_TASK_ASYNC,
 		.priority = priority,
 	};
 
@@ -1325,6 +1322,7 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how)
 				BDI_RECLAIMABLE);
 		nfs_clear_page_tag_locked(req);
 	}
+	nfs_commit_clear_lock(NFS_I(inode));
 	return -ENOMEM;
 }
 
@@ -1380,6 +1378,7 @@ static void nfs_commit_release(void *calldata)
 	next:
 		nfs_clear_page_tag_locked(req);
 	}
+	nfs_commit_clear_lock(NFS_I(data->inode));
 	nfs_commitdata_release(calldata);
 }
 
@@ -1394,8 +1393,11 @@ static const struct rpc_call_ops nfs_commit_ops = {
 int nfs_commit_inode(struct inode *inode, int how)
 {
 	LIST_HEAD(head);
-	int res;
+	int may_wait = how & FLUSH_SYNC;
+	int res = 0;
 
+	if (!nfs_commit_set_lock(NFS_I(inode), may_wait))
+		goto out_mark_dirty;
 	spin_lock(&inode->i_lock);
 	res = nfs_scan_commit(inode, &head, 0, 0);
 	spin_unlock(&inode->i_lock);
@@ -1403,95 +1405,64 @@ int nfs_commit_inode(struct inode *inode, int how)
 		int error = nfs_commit_list(inode, &head, how);
 		if (error < 0)
 			return error;
-	}
+		if (may_wait)
+			wait_on_bit(&NFS_I(inode)->flags, NFS_INO_COMMIT,
+					nfs_wait_bit_killable,
+					TASK_KILLABLE);
+		else
+			goto out_mark_dirty;
+	} else
+		nfs_commit_clear_lock(NFS_I(inode));
+	return res;
+	/* Note: If we exit without ensuring that the commit is complete,
+	 * we must mark the inode as dirty. Otherwise, future calls to
+	 * sync_inode() with the WB_SYNC_ALL flag set will fail to ensure
+	 * that the data is on the disk.
+	 */
+out_mark_dirty:
+	__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
 	return res;
 }
-#else
-static inline int nfs_commit_list(struct inode *inode, struct list_head *head, int how)
-{
-	return 0;
-}
-#endif
 
-long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_control *wbc, int how)
+static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc)
 {
-	struct inode *inode = mapping->host;
-	pgoff_t idx_start, idx_end;
-	unsigned int npages = 0;
-	LIST_HEAD(head);
-	int nocommit = how & FLUSH_NOCOMMIT;
-	long pages, ret;
-
-	/* FIXME */
-	if (wbc->range_cyclic)
-		idx_start = 0;
-	else {
-		idx_start = wbc->range_start >> PAGE_CACHE_SHIFT;
-		idx_end = wbc->range_end >> PAGE_CACHE_SHIFT;
-		if (idx_end > idx_start) {
-			pgoff_t l_npages = 1 + idx_end - idx_start;
-			npages = l_npages;
-			if (sizeof(npages) != sizeof(l_npages) &&
-					(pgoff_t)npages != l_npages)
-				npages = 0;
+	struct nfs_inode *nfsi = NFS_I(inode);
+	int flags = FLUSH_SYNC;
+	int ret = 0;
+
+	/* Don't commit yet if this is a non-blocking flush and there are
+	 * lots of outstanding writes for this mapping.
+	 */
+	if (wbc->sync_mode == WB_SYNC_NONE &&
+	    nfsi->ncommit <= (nfsi->npages >> 1))
+		goto out_mark_dirty;
+
+	if (wbc->nonblocking || wbc->for_background)
+		flags = 0;
+	ret = nfs_commit_inode(inode, flags);
+	if (ret >= 0) {
+		if (wbc->sync_mode == WB_SYNC_NONE) {
+			if (ret < wbc->nr_to_write)
+				wbc->nr_to_write -= ret;
+			else
+				wbc->nr_to_write = 0;
 		}
+		return 0;
 	}
-	how &= ~FLUSH_NOCOMMIT;
-	spin_lock(&inode->i_lock);
-	do {
-		ret = nfs_wait_on_requests_locked(inode, idx_start, npages);
-		if (ret != 0)
-			continue;
-		if (nocommit)
-			break;
-		pages = nfs_scan_commit(inode, &head, idx_start, npages);
-		if (pages == 0)
-			break;
-		if (how & FLUSH_INVALIDATE) {
-			spin_unlock(&inode->i_lock);
-			nfs_cancel_commit_list(&head);
-			ret = pages;
-			spin_lock(&inode->i_lock);
-			continue;
-		}
-		pages += nfs_scan_commit(inode, &head, 0, 0);
-		spin_unlock(&inode->i_lock);
-		ret = nfs_commit_list(inode, &head, how);
-		spin_lock(&inode->i_lock);
-
-	} while (ret >= 0);
-	spin_unlock(&inode->i_lock);
+out_mark_dirty:
+	__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
 	return ret;
 }
-
-static int __nfs_write_mapping(struct address_space *mapping, struct writeback_control *wbc, int how)
+#else
+static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc)
 {
-	int ret;
-
-	ret = nfs_writepages(mapping, wbc);
-	if (ret < 0)
-		goto out;
-	ret = nfs_sync_mapping_wait(mapping, wbc, how);
-	if (ret < 0)
-		goto out;
 	return 0;
-out:
-	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
-	return ret;
 }
+#endif
 
-/* Two pass sync: first using WB_SYNC_NONE, then WB_SYNC_ALL */
-static int nfs_write_mapping(struct address_space *mapping, int how)
+int nfs_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
-	struct writeback_control wbc = {
-		.bdi = mapping->backing_dev_info,
-		.sync_mode = WB_SYNC_ALL,
-		.nr_to_write = LONG_MAX,
-		.range_start = 0,
-		.range_end = LLONG_MAX,
-	};
-
-	return __nfs_write_mapping(mapping, &wbc, how);
+	return nfs_commit_unstable_pages(inode, wbc);
 }
 
 /*
@@ -1499,37 +1470,27 @@ static int nfs_write_mapping(struct address_space *mapping, int how)
  */
 int nfs_wb_all(struct inode *inode)
 {
-	return nfs_write_mapping(inode->i_mapping, 0);
-}
+	struct writeback_control wbc = {
+		.sync_mode = WB_SYNC_ALL,
+		.nr_to_write = LONG_MAX,
+		.range_start = 0,
+		.range_end = LLONG_MAX,
+	};
 
-int nfs_wb_nocommit(struct inode *inode)
-{
-	return nfs_write_mapping(inode->i_mapping, FLUSH_NOCOMMIT);
+	return sync_inode(inode, &wbc);
 }
 
 int nfs_wb_page_cancel(struct inode *inode, struct page *page)
 {
 	struct nfs_page *req;
-	loff_t range_start = page_offset(page);
-	loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1);
-	struct writeback_control wbc = {
-		.bdi = page->mapping->backing_dev_info,
-		.sync_mode = WB_SYNC_ALL,
-		.nr_to_write = LONG_MAX,
-		.range_start = range_start,
-		.range_end = range_end,
-	};
 	int ret = 0;
 
 	BUG_ON(!PageLocked(page));
 	for (;;) {
+		wait_on_page_writeback(page);
 		req = nfs_page_find_request(page);
 		if (req == NULL)
-			goto out;
-		if (test_bit(PG_CLEAN, &req->wb_flags)) {
-			nfs_release_request(req);
 			break;
-		}
 		if (nfs_lock_request_dontget(req)) {
 			nfs_inode_remove_request(req);
 			/*
@@ -1541,55 +1502,47 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page)
 			break;
 		}
 		ret = nfs_wait_on_request(req);
+		nfs_release_request(req);
 		if (ret < 0)
-			goto out;
+			break;
 	}
-	if (!PagePrivate(page))
-		return 0;
-	ret = nfs_sync_mapping_wait(page->mapping, &wbc, FLUSH_INVALIDATE);
-out:
 	return ret;
 }
 
-static int nfs_wb_page_priority(struct inode *inode, struct page *page,
-				int how)
+/*
+ * Write back all requests on one page - we do this before reading it.
+ */
+int nfs_wb_page(struct inode *inode, struct page *page)
 {
 	loff_t range_start = page_offset(page);
 	loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1);
 	struct writeback_control wbc = {
-		.bdi = page->mapping->backing_dev_info,
 		.sync_mode = WB_SYNC_ALL,
-		.nr_to_write = LONG_MAX,
+		.nr_to_write = 0,
 		.range_start = range_start,
 		.range_end = range_end,
 	};
 	int ret;
 
-	do {
+	for (;;) {
+		wait_on_page_writeback(page);
 		if (clear_page_dirty_for_io(page)) {
 			ret = nfs_writepage_locked(page, &wbc);
 			if (ret < 0)
 				goto out_error;
-		} else if (!PagePrivate(page))
+			continue;
+		}
+		if (!PagePrivate(page))
 			break;
-		ret = nfs_sync_mapping_wait(page->mapping, &wbc, how);
+		ret = nfs_commit_inode(inode, FLUSH_SYNC);
 		if (ret < 0)
 			goto out_error;
-	} while (PagePrivate(page));
+	}
 	return 0;
 out_error:
-	__mark_inode_dirty(inode, I_DIRTY_PAGES);
 	return ret;
 }
 
-/*
- * Write back all requests on one page - we do this before reading it.
- */
-int nfs_wb_page(struct inode *inode, struct page* page)
-{
-	return nfs_wb_page_priority(inode, page, FLUSH_STABLE);
-}
-
 #ifdef CONFIG_MIGRATION
 int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
 		struct page *page)
@@ -1597,10 +1550,9 @@ int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
 	struct nfs_page *req;
 	int ret;
 
-	if (PageFsCache(page))
-		nfs_fscache_release_page(page, GFP_KERNEL);
+	nfs_fscache_release_page(page, GFP_KERNEL);
 
-	req = nfs_find_and_lock_request(page);
+	req = nfs_find_and_lock_request(page, false);
 	ret = PTR_ERR(req);
 	if (IS_ERR(req))
 		goto out;
diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c
index 04133aacb1e5..fc1c52571c03 100644
--- a/fs/nfs_common/nfsacl.c
+++ b/fs/nfs_common/nfsacl.c
@@ -22,6 +22,7 @@
 
 #include <linux/module.h>
 #include <linux/fs.h>
+#include <linux/gfp.h>
 #include <linux/sunrpc/xdr.h>
 #include <linux/nfsacl.h>
 #include <linux/nfs3.h>
diff --git a/fs/nfsctl.c b/fs/nfsctl.c
index d3854d94b7cf..bf9cbd242ddd 100644
--- a/fs/nfsctl.c
+++ b/fs/nfsctl.c
@@ -36,10 +36,9 @@ static struct file *do_open(char *name, int flags)
 		return ERR_PTR(error);
 
 	if (flags == O_RDWR)
-		error = may_open(&nd.path, MAY_READ|MAY_WRITE,
-					   FMODE_READ|FMODE_WRITE);
+		error = may_open(&nd.path, MAY_READ|MAY_WRITE, flags);
 	else
-		error = may_open(&nd.path, MAY_WRITE, FMODE_WRITE);
+		error = may_open(&nd.path, MAY_WRITE, flags);
 
 	if (!error)
 		return dentry_open(nd.path.dentry, nd.path.mnt, flags,
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index c487810a2366..c2a4f71d87dd 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -12,6 +12,7 @@
  * Copyright (C) 1995, 1996 Olaf Kirch, <okir@monad.swb.de>
  */
 
+#include <linux/slab.h>
 #include <linux/namei.h>
 #include <linux/module.h>
 #include <linux/exportfs.h>
@@ -258,10 +259,9 @@ static struct cache_detail svc_expkey_cache = {
 	.alloc		= expkey_alloc,
 };
 
-static struct svc_expkey *
-svc_expkey_lookup(struct svc_expkey *item)
+static int
+svc_expkey_hash(struct svc_expkey *item)
 {
-	struct cache_head *ch;
 	int hash = item->ek_fsidtype;
 	char * cp = (char*)item->ek_fsid;
 	int len = key_len(item->ek_fsidtype);
@@ -269,6 +269,14 @@ svc_expkey_lookup(struct svc_expkey *item)
 	hash ^= hash_mem(cp, len, EXPKEY_HASHBITS);
 	hash ^= hash_ptr(item->ek_client, EXPKEY_HASHBITS);
 	hash &= EXPKEY_HASHMASK;
+	return hash;
+}
+
+static struct svc_expkey *
+svc_expkey_lookup(struct svc_expkey *item)
+{
+	struct cache_head *ch;
+	int hash = svc_expkey_hash(item);
 
 	ch = sunrpc_cache_lookup(&svc_expkey_cache, &item->h,
 				 hash);
@@ -282,13 +290,7 @@ static struct svc_expkey *
 svc_expkey_update(struct svc_expkey *new, struct svc_expkey *old)
 {
 	struct cache_head *ch;
-	int hash = new->ek_fsidtype;
-	char * cp = (char*)new->ek_fsid;
-	int len = key_len(new->ek_fsidtype);
-
-	hash ^= hash_mem(cp, len, EXPKEY_HASHBITS);
-	hash ^= hash_ptr(new->ek_client, EXPKEY_HASHBITS);
-	hash &= EXPKEY_HASHMASK;
+	int hash = svc_expkey_hash(new);
 
 	ch = sunrpc_cache_update(&svc_expkey_cache, &new->h,
 				 &old->h, hash);
@@ -737,14 +739,22 @@ struct cache_detail svc_export_cache = {
 	.alloc		= svc_export_alloc,
 };
 
-static struct svc_export *
-svc_export_lookup(struct svc_export *exp)
+static int
+svc_export_hash(struct svc_export *exp)
 {
-	struct cache_head *ch;
 	int hash;
+
 	hash = hash_ptr(exp->ex_client, EXPORT_HASHBITS);
 	hash ^= hash_ptr(exp->ex_path.dentry, EXPORT_HASHBITS);
 	hash ^= hash_ptr(exp->ex_path.mnt, EXPORT_HASHBITS);
+	return hash;
+}
+
+static struct svc_export *
+svc_export_lookup(struct svc_export *exp)
+{
+	struct cache_head *ch;
+	int hash = svc_export_hash(exp);
 
 	ch = sunrpc_cache_lookup(&svc_export_cache, &exp->h,
 				 hash);
@@ -758,10 +768,7 @@ static struct svc_export *
 svc_export_update(struct svc_export *new, struct svc_export *old)
 {
 	struct cache_head *ch;
-	int hash;
-	hash = hash_ptr(old->ex_client, EXPORT_HASHBITS);
-	hash ^= hash_ptr(old->ex_path.dentry, EXPORT_HASHBITS);
-	hash ^= hash_ptr(old->ex_path.mnt, EXPORT_HASHBITS);
+	int hash = svc_export_hash(old);
 
 	ch = sunrpc_cache_update(&svc_export_cache, &new->h,
 				 &old->h,
@@ -1070,9 +1077,9 @@ exp_export(struct nfsctl_export *nxp)
 		err = 0;
 finish:
 	kfree(new.ex_pathname);
-	if (exp)
+	if (!IS_ERR_OR_NULL(exp))
 		exp_put(exp);
-	if (fsid_key && !IS_ERR(fsid_key))
+	if (!IS_ERR_OR_NULL(fsid_key))
 		cache_put(&fsid_key->h, &svc_expkey_cache);
 	path_put(&path);
 out_put_clp:
@@ -1316,19 +1323,11 @@ rqst_exp_parent(struct svc_rqst *rqstp, struct path *path)
 
 static struct svc_export *find_fsidzero_export(struct svc_rqst *rqstp)
 {
-	struct svc_export *exp;
 	u32 fsidv[2];
 
 	mk_fsid(FSID_NUM, fsidv, 0, 0, 0, NULL);
 
-	exp = rqst_exp_find(rqstp, FSID_NUM, fsidv);
-	/*
-	 * We shouldn't have accepting an nfsv4 request at all if we
-	 * don't have a pseudoexport!:
-	 */
-	if (IS_ERR(exp) && PTR_ERR(exp) == -ENOENT)
-		exp = ERR_PTR(-ESERVERFAULT);
-	return exp;
+	return rqst_exp_find(rqstp, FSID_NUM, fsidv);
 }
 
 /*
diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
index f20589d2ae27..6aa5590c3679 100644
--- a/fs/nfsd/nfs2acl.c
+++ b/fs/nfsd/nfs2acl.c
@@ -7,6 +7,7 @@
 #include "nfsd.h"
 /* FIXME: nfsacl.h is a broken header */
 #include <linux/nfsacl.h>
+#include <linux/gfp.h>
 #include "cache.h"
 #include "xdr3.h"
 #include "vfs.h"
diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c
index e0c4846bad92..a596e9d987e4 100644
--- a/fs/nfsd/nfs3acl.c
+++ b/fs/nfsd/nfs3acl.c
@@ -7,6 +7,7 @@
 #include "nfsd.h"
 /* FIXME: nfsacl.h is a broken header */
 #include <linux/nfsacl.h>
+#include <linux/gfp.h>
 #include "cache.h"
 #include "xdr3.h"
 #include "vfs.h"
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index 88150685df34..e48052615159 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -34,6 +34,7 @@
  *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
+#include <linux/slab.h>
 #include <linux/nfs_fs.h>
 #include <linux/nfs4_acl.h>
 
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index c6eed2a3b093..eb78e7e22077 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -32,6 +32,8 @@
  */
 
 #include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/svc_xprt.h>
+#include <linux/slab.h>
 #include "nfsd.h"
 #include "state.h"
 
@@ -78,11 +80,6 @@ enum nfs_cb_opnum4 {
 					cb_sequence_dec_sz +            \
 					op_dec_sz)
 
-struct nfs4_rpc_args {
-	void				*args_op;
-	struct nfsd4_cb_sequence	args_seq;
-};
-
 /*
 * Generic encode routines from fs/nfs/nfs4xdr.c
 */
@@ -427,13 +424,19 @@ static struct rpc_procinfo     nfs4_cb_procedures[] = {
 };
 
 static struct rpc_version       nfs_cb_version4 = {
+/*
+ * Note on the callback rpc program version number: despite language in rfc
+ * 5661 section 18.36.3 requiring servers to use 4 in this field, the
+ * official xdr descriptions for both 4.0 and 4.1 specify version 1, and
+ * in practice that appears to be what implementations use.  The section
+ * 18.36.3 language is expected to be fixed in an erratum.
+ */
         .number                 = 1,
         .nrprocs                = ARRAY_SIZE(nfs4_cb_procedures),
         .procs                  = nfs4_cb_procedures
 };
 
 static struct rpc_version *	nfs_cb_version[] = {
-	NULL,
 	&nfs_cb_version4,
 };
 
@@ -455,15 +458,14 @@ static struct rpc_program cb_program = {
 
 static int max_cb_time(void)
 {
-	return max(NFSD_LEASE_TIME/10, (time_t)1) * HZ;
+	return max(nfsd4_lease/10, (time_t)1) * HZ;
 }
 
 /* Reference counting, callback cleanup, etc., all look racy as heck.
- * And why is cb_set an atomic? */
+ * And why is cl_cb_set an atomic? */
 
-int setup_callback_client(struct nfs4_client *clp)
+int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *cb)
 {
-	struct nfs4_cb_conn *cb = &clp->cl_cb_conn;
 	struct rpc_timeout	timeparms = {
 		.to_initval	= max_cb_time(),
 		.to_retries	= 0,
@@ -475,7 +477,7 @@ int setup_callback_client(struct nfs4_client *clp)
 		.timeout	= &timeparms,
 		.program	= &cb_program,
 		.prognumber	= cb->cb_prog,
-		.version	= nfs_cb_version[1]->number,
+		.version	= 0,
 		.authflavor	= clp->cl_flavor,
 		.flags		= (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET),
 		.client_name    = clp->cl_principal,
@@ -485,7 +487,7 @@ int setup_callback_client(struct nfs4_client *clp)
 	if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5))
 		return -EINVAL;
 	if (cb->cb_minorversion) {
-		args.bc_xprt = clp->cl_cb_xprt;
+		args.bc_xprt = cb->cb_xprt;
 		args.protocol = XPRT_TRANSPORT_BC_TCP;
 	}
 	/* Create RPC client */
@@ -495,7 +497,7 @@ int setup_callback_client(struct nfs4_client *clp)
 			PTR_ERR(client));
 		return PTR_ERR(client);
 	}
-	cb->cb_client = client;
+	nfsd4_set_callback_client(clp, client);
 	return 0;
 
 }
@@ -513,8 +515,7 @@ static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata)
 	if (task->tk_status)
 		warn_no_callback_path(clp, task->tk_status);
 	else
-		atomic_set(&clp->cl_cb_conn.cb_set, 1);
-	put_nfs4_client(clp);
+		atomic_set(&clp->cl_cb_set, 1);
 }
 
 static const struct rpc_call_ops nfsd4_cb_probe_ops = {
@@ -525,6 +526,8 @@ static struct rpc_cred *callback_cred;
 
 int set_callback_cred(void)
 {
+	if (callback_cred)
+		return 0;
 	callback_cred = rpc_lookup_machine_cred();
 	if (!callback_cred)
 		return -ENOMEM;
@@ -534,7 +537,6 @@ int set_callback_cred(void)
 
 void do_probe_callback(struct nfs4_client *clp)
 {
-	struct nfs4_cb_conn *cb = &clp->cl_cb_conn;
 	struct rpc_message msg = {
 		.rpc_proc       = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL],
 		.rpc_argp       = clp,
@@ -542,33 +544,27 @@ void do_probe_callback(struct nfs4_client *clp)
 	};
 	int status;
 
-	status = rpc_call_async(cb->cb_client, &msg, RPC_TASK_SOFT,
+	status = rpc_call_async(clp->cl_cb_client, &msg,
+				RPC_TASK_SOFT | RPC_TASK_SOFTCONN,
 				&nfsd4_cb_probe_ops, (void *)clp);
-	if (status) {
+	if (status)
 		warn_no_callback_path(clp, status);
-		put_nfs4_client(clp);
-	}
 }
 
 /*
  * Set up the callback client and put a NFSPROC4_CB_NULL on the wire...
  */
-void
-nfsd4_probe_callback(struct nfs4_client *clp)
+void nfsd4_probe_callback(struct nfs4_client *clp, struct nfs4_cb_conn *cb)
 {
 	int status;
 
-	BUG_ON(atomic_read(&clp->cl_cb_conn.cb_set));
+	BUG_ON(atomic_read(&clp->cl_cb_set));
 
-	status = setup_callback_client(clp);
+	status = setup_callback_client(clp, cb);
 	if (status) {
 		warn_no_callback_path(clp, status);
 		return;
 	}
-
-	/* the task holds a reference to the nfs4_client struct */
-	atomic_inc(&clp->cl_count);
-
 	do_probe_callback(clp);
 }
 
@@ -654,18 +650,32 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
 	}
 }
 
+
 static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
 {
 	struct nfs4_delegation *dp = calldata;
 	struct nfs4_client *clp = dp->dl_client;
+	struct rpc_clnt *current_rpc_client = clp->cl_cb_client;
 
 	nfsd4_cb_done(task, calldata);
 
+	if (current_rpc_client == NULL) {
+		/* We're shutting down; give up. */
+		/* XXX: err, or is it ok just to fall through
+		 * and rpc_restart_call? */
+		return;
+	}
+
 	switch (task->tk_status) {
 	case -EIO:
 		/* Network partition? */
-		atomic_set(&clp->cl_cb_conn.cb_set, 0);
+		atomic_set(&clp->cl_cb_set, 0);
 		warn_no_callback_path(clp, task->tk_status);
+		if (current_rpc_client != task->tk_client) {
+			/* queue a callback on the new connection: */
+			nfsd4_cb_recall(dp);
+			return;
+		}
 	case -EBADHANDLE:
 	case -NFS4ERR_BAD_STATEID:
 		/* Race: client probably got cb_recall
@@ -673,7 +683,7 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
 		break;
 	default:
 		/* success, or error we can't handle */
-		goto done;
+		return;
 	}
 	if (dp->dl_retries--) {
 		rpc_delay(task, 2*HZ);
@@ -681,20 +691,16 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
 		rpc_restart_call(task);
 		return;
 	} else {
-		atomic_set(&clp->cl_cb_conn.cb_set, 0);
+		atomic_set(&clp->cl_cb_set, 0);
 		warn_no_callback_path(clp, task->tk_status);
 	}
-done:
-	kfree(task->tk_msg.rpc_argp);
 }
 
 static void nfsd4_cb_recall_release(void *calldata)
 {
 	struct nfs4_delegation *dp = calldata;
-	struct nfs4_client *clp = dp->dl_client;
 
 	nfs4_put_delegation(dp);
-	put_nfs4_client(clp);
 }
 
 static const struct rpc_call_ops nfsd4_cb_recall_ops = {
@@ -703,33 +709,75 @@ static const struct rpc_call_ops nfsd4_cb_recall_ops = {
 	.rpc_release = nfsd4_cb_recall_release,
 };
 
+static struct workqueue_struct *callback_wq;
+
+int nfsd4_create_callback_queue(void)
+{
+	callback_wq = create_singlethread_workqueue("nfsd4_callbacks");
+	if (!callback_wq)
+		return -ENOMEM;
+	return 0;
+}
+
+void nfsd4_destroy_callback_queue(void)
+{
+	destroy_workqueue(callback_wq);
+}
+
+/* must be called under the state lock */
+void nfsd4_set_callback_client(struct nfs4_client *clp, struct rpc_clnt *new)
+{
+	struct rpc_clnt *old = clp->cl_cb_client;
+
+	clp->cl_cb_client = new;
+	/*
+	 * After this, any work that saw the old value of cl_cb_client will
+	 * be gone:
+	 */
+	flush_workqueue(callback_wq);
+	/* So we can safely shut it down: */
+	if (old)
+		rpc_shutdown_client(old);
+}
+
 /*
  * called with dp->dl_count inc'ed.
  */
-void
-nfsd4_cb_recall(struct nfs4_delegation *dp)
+static void _nfsd4_cb_recall(struct nfs4_delegation *dp)
 {
 	struct nfs4_client *clp = dp->dl_client;
-	struct rpc_clnt *clnt = clp->cl_cb_conn.cb_client;
-	struct nfs4_rpc_args *args;
+	struct rpc_clnt *clnt = clp->cl_cb_client;
+	struct nfs4_rpc_args *args = &dp->dl_recall.cb_args;
 	struct rpc_message msg = {
 		.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL],
 		.rpc_cred = callback_cred
 	};
-	int status = -ENOMEM;
+	int status;
+
+	if (clnt == NULL)
+		return; /* Client is shutting down; give up. */
 
-	args = kzalloc(sizeof(*args), GFP_KERNEL);
-	if (!args)
-		goto out;
 	args->args_op = dp;
 	msg.rpc_argp = args;
 	dp->dl_retries = 1;
 	status = rpc_call_async(clnt, &msg, RPC_TASK_SOFT,
 				&nfsd4_cb_recall_ops, dp);
-out:
-	if (status) {
-		kfree(args);
-		put_nfs4_client(clp);
+	if (status)
 		nfs4_put_delegation(dp);
-	}
+}
+
+void nfsd4_do_callback_rpc(struct work_struct *w)
+{
+	/* XXX: for now, just send off delegation recall. */
+	/* In future, generalize to handle any sort of callback. */
+	struct nfsd4_callback *c = container_of(w, struct nfsd4_callback, cb_work);
+	struct nfs4_delegation *dp = container_of(c, struct nfs4_delegation, dl_recall);
+
+	_nfsd4_cb_recall(dp);
+}
+
+
+void nfsd4_cb_recall(struct nfs4_delegation *dp)
+{
+	queue_work(callback_wq, &dp->dl_recall.cb_work);
 }
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index 6e2983b27f3c..c78dbf493424 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -36,6 +36,7 @@
 #include <linux/nfsd_idmap.h>
 #include <linux/seq_file.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 
 /*
  * Cache entry
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 37514c469846..59ec449b0c7f 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -33,6 +33,7 @@
  *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 #include <linux/file.h>
+#include <linux/slab.h>
 
 #include "cache.h"
 #include "xdr4.h"
@@ -968,20 +969,36 @@ static struct nfsd4_operation nfsd4_ops[];
 static const char *nfsd4_op_name(unsigned opnum);
 
 /*
- * Enforce NFSv4.1 COMPOUND ordering rules.
+ * Enforce NFSv4.1 COMPOUND ordering rules:
  *
- * TODO:
- * - enforce NFS4ERR_NOT_ONLY_OP,
- * - DESTROY_SESSION MUST be the final operation in the COMPOUND request.
+ * Also note, enforced elsewhere:
+ *	- SEQUENCE other than as first op results in
+ *	  NFS4ERR_SEQUENCE_POS. (Enforced in nfsd4_sequence().)
+ *	- BIND_CONN_TO_SESSION must be the only op in its compound
+ *	  (Will be enforced in nfsd4_bind_conn_to_session().)
+ *	- DESTROY_SESSION must be the final operation in a compound, if
+ *	  sessionid's in SEQUENCE and DESTROY_SESSION are the same.
+ *	  (Enforced in nfsd4_destroy_session().)
  */
-static bool nfs41_op_ordering_ok(struct nfsd4_compoundargs *args)
+static __be32 nfs41_check_op_ordering(struct nfsd4_compoundargs *args)
 {
-	if (args->minorversion && args->opcnt > 0) {
-		struct nfsd4_op *op = &args->ops[0];
-		return (op->status == nfserr_op_illegal) ||
-		       (nfsd4_ops[op->opnum].op_flags & ALLOWED_AS_FIRST_OP);
-	}
-	return true;
+	struct nfsd4_op *op = &args->ops[0];
+
+	/* These ordering requirements don't apply to NFSv4.0: */
+	if (args->minorversion == 0)
+		return nfs_ok;
+	/* This is weird, but OK, not our problem: */
+	if (args->opcnt == 0)
+		return nfs_ok;
+	if (op->status == nfserr_op_illegal)
+		return nfs_ok;
+	if (!(nfsd4_ops[op->opnum].op_flags & ALLOWED_AS_FIRST_OP))
+		return nfserr_op_not_in_session;
+	if (op->opnum == OP_SEQUENCE)
+		return nfs_ok;
+	if (args->opcnt != 1)
+		return nfserr_not_only_op;
+	return nfs_ok;
 }
 
 /*
@@ -1011,6 +1028,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
 	resp->rqstp = rqstp;
 	resp->cstate.minorversion = args->minorversion;
 	resp->cstate.replay_owner = NULL;
+	resp->cstate.session = NULL;
 	fh_init(&resp->cstate.current_fh, NFS4_FHSIZE);
 	fh_init(&resp->cstate.save_fh, NFS4_FHSIZE);
 	/* Use the deferral mechanism only for NFSv4.0 compounds */
@@ -1023,13 +1041,13 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
 	if (args->minorversion > nfsd_supported_minorversion)
 		goto out;
 
-	if (!nfs41_op_ordering_ok(args)) {
+	status = nfs41_check_op_ordering(args);
+	if (status) {
 		op = &args->ops[0];
-		op->status = nfserr_sequence_pos;
+		op->status = status;
 		goto encode_op;
 	}
 
-	status = nfs_ok;
 	while (!status && resp->opcnt < args->opcnt) {
 		op = &args->ops[resp->opcnt++];
 
@@ -1294,6 +1312,11 @@ static struct nfsd4_operation nfsd4_ops[] = {
 		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
 		.op_name = "OP_SEQUENCE",
 	},
+	[OP_RECLAIM_COMPLETE] = {
+		.op_func = (nfsd4op_func)nfsd4_reclaim_complete,
+		.op_flags = ALLOWED_WITHOUT_FH,
+		.op_name = "OP_RECLAIM_COMPLETE",
+	},
 };
 
 static const char *nfsd4_op_name(unsigned opnum)
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 5a754f7b71ed..7e26caab2a26 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -32,6 +32,7 @@
 */
 
 #include <linux/file.h>
+#include <linux/slab.h>
 #include <linux/namei.h>
 #include <linux/crypto.h>
 #include <linux/sched.h>
@@ -43,8 +44,7 @@
 #define NFSDDBG_FACILITY                NFSDDBG_PROC
 
 /* Globals */
-static struct path rec_dir;
-static int rec_dir_init = 0;
+static struct file *rec_file;
 
 static int
 nfs4_save_creds(const struct cred **original_creds)
@@ -116,35 +116,28 @@ out_no_tfm:
 	return status;
 }
 
-static void
-nfsd4_sync_rec_dir(void)
-{
-	mutex_lock(&rec_dir.dentry->d_inode->i_mutex);
-	nfsd_sync_dir(rec_dir.dentry);
-	mutex_unlock(&rec_dir.dentry->d_inode->i_mutex);
-}
-
 int
 nfsd4_create_clid_dir(struct nfs4_client *clp)
 {
 	const struct cred *original_cred;
 	char *dname = clp->cl_recdir;
-	struct dentry *dentry;
+	struct dentry *dir, *dentry;
 	int status;
 
 	dprintk("NFSD: nfsd4_create_clid_dir for \"%s\"\n", dname);
 
-	if (!rec_dir_init || clp->cl_firststate)
+	if (!rec_file || clp->cl_firststate)
 		return 0;
 
 	status = nfs4_save_creds(&original_cred);
 	if (status < 0)
 		return status;
 
+	dir = rec_file->f_path.dentry;
 	/* lock the parent */
-	mutex_lock(&rec_dir.dentry->d_inode->i_mutex);
+	mutex_lock(&dir->d_inode->i_mutex);
 
-	dentry = lookup_one_len(dname, rec_dir.dentry, HEXDIR_LEN-1);
+	dentry = lookup_one_len(dname, dir, HEXDIR_LEN-1);
 	if (IS_ERR(dentry)) {
 		status = PTR_ERR(dentry);
 		goto out_unlock;
@@ -154,18 +147,18 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
 		dprintk("NFSD: nfsd4_create_clid_dir: DIRECTORY EXISTS\n");
 		goto out_put;
 	}
-	status = mnt_want_write(rec_dir.mnt);
+	status = mnt_want_write(rec_file->f_path.mnt);
 	if (status)
 		goto out_put;
-	status = vfs_mkdir(rec_dir.dentry->d_inode, dentry, S_IRWXU);
-	mnt_drop_write(rec_dir.mnt);
+	status = vfs_mkdir(dir->d_inode, dentry, S_IRWXU);
+	mnt_drop_write(rec_file->f_path.mnt);
 out_put:
 	dput(dentry);
 out_unlock:
-	mutex_unlock(&rec_dir.dentry->d_inode->i_mutex);
+	mutex_unlock(&dir->d_inode->i_mutex);
 	if (status == 0) {
 		clp->cl_firststate = 1;
-		nfsd4_sync_rec_dir();
+		vfs_fsync(rec_file, 0);
 	}
 	nfs4_reset_creds(original_cred);
 	dprintk("NFSD: nfsd4_create_clid_dir returns %d\n", status);
@@ -207,14 +200,14 @@ nfsd4_list_rec_dir(struct dentry *dir, recdir_func *f)
 	struct dentry *dentry;
 	int status;
 
-	if (!rec_dir_init)
+	if (!rec_file)
 		return 0;
 
 	status = nfs4_save_creds(&original_cred);
 	if (status < 0)
 		return status;
 
-	filp = dentry_open(dget(dir), mntget(rec_dir.mnt), O_RDONLY,
+	filp = dentry_open(dget(dir), mntget(rec_file->f_path.mnt), O_RDONLY,
 			   current_cred());
 	status = PTR_ERR(filp);
 	if (IS_ERR(filp))
@@ -251,13 +244,14 @@ out:
 static int
 nfsd4_unlink_clid_dir(char *name, int namlen)
 {
-	struct dentry *dentry;
+	struct dentry *dir, *dentry;
 	int status;
 
 	dprintk("NFSD: nfsd4_unlink_clid_dir. name %.*s\n", namlen, name);
 
-	mutex_lock_nested(&rec_dir.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
-	dentry = lookup_one_len(name, rec_dir.dentry, namlen);
+	dir = rec_file->f_path.dentry;
+	mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
+	dentry = lookup_one_len(name, dir, namlen);
 	if (IS_ERR(dentry)) {
 		status = PTR_ERR(dentry);
 		goto out_unlock;
@@ -265,11 +259,11 @@ nfsd4_unlink_clid_dir(char *name, int namlen)
 	status = -ENOENT;
 	if (!dentry->d_inode)
 		goto out;
-	status = vfs_rmdir(rec_dir.dentry->d_inode, dentry);
+	status = vfs_rmdir(dir->d_inode, dentry);
 out:
 	dput(dentry);
 out_unlock:
-	mutex_unlock(&rec_dir.dentry->d_inode->i_mutex);
+	mutex_unlock(&dir->d_inode->i_mutex);
 	return status;
 }
 
@@ -279,10 +273,10 @@ nfsd4_remove_clid_dir(struct nfs4_client *clp)
 	const struct cred *original_cred;
 	int status;
 
-	if (!rec_dir_init || !clp->cl_firststate)
+	if (!rec_file || !clp->cl_firststate)
 		return;
 
-	status = mnt_want_write(rec_dir.mnt);
+	status = mnt_want_write(rec_file->f_path.mnt);
 	if (status)
 		goto out;
 	clp->cl_firststate = 0;
@@ -294,8 +288,8 @@ nfsd4_remove_clid_dir(struct nfs4_client *clp)
 	status = nfsd4_unlink_clid_dir(clp->cl_recdir, HEXDIR_LEN-1);
 	nfs4_reset_creds(original_cred);
 	if (status == 0)
-		nfsd4_sync_rec_dir();
-	mnt_drop_write(rec_dir.mnt);
+		vfs_fsync(rec_file, 0);
+	mnt_drop_write(rec_file->f_path.mnt);
 out:
 	if (status)
 		printk("NFSD: Failed to remove expired client state directory"
@@ -324,19 +318,19 @@ void
 nfsd4_recdir_purge_old(void) {
 	int status;
 
-	if (!rec_dir_init)
+	if (!rec_file)
 		return;
-	status = mnt_want_write(rec_dir.mnt);
+	status = mnt_want_write(rec_file->f_path.mnt);
 	if (status)
 		goto out;
-	status = nfsd4_list_rec_dir(rec_dir.dentry, purge_old);
+	status = nfsd4_list_rec_dir(rec_file->f_path.dentry, purge_old);
 	if (status == 0)
-		nfsd4_sync_rec_dir();
-	mnt_drop_write(rec_dir.mnt);
+		vfs_fsync(rec_file, 0);
+	mnt_drop_write(rec_file->f_path.mnt);
 out:
 	if (status)
 		printk("nfsd4: failed to purge old clients from recovery"
-			" directory %s\n", rec_dir.dentry->d_name.name);
+			" directory %s\n", rec_file->f_path.dentry->d_name.name);
 }
 
 static int
@@ -356,10 +350,13 @@ int
 nfsd4_recdir_load(void) {
 	int status;
 
-	status = nfsd4_list_rec_dir(rec_dir.dentry, load_recdir);
+	if (!rec_file)
+		return 0;
+
+	status = nfsd4_list_rec_dir(rec_file->f_path.dentry, load_recdir);
 	if (status)
 		printk("nfsd4: failed loading clients from recovery"
-			" directory %s\n", rec_dir.dentry->d_name.name);
+			" directory %s\n", rec_file->f_path.dentry->d_name.name);
 	return status;
 }
 
@@ -376,7 +373,7 @@ nfsd4_init_recdir(char *rec_dirname)
 	printk("NFSD: Using %s as the NFSv4 state recovery directory\n",
 			rec_dirname);
 
-	BUG_ON(rec_dir_init);
+	BUG_ON(rec_file);
 
 	status = nfs4_save_creds(&original_cred);
 	if (status < 0) {
@@ -386,22 +383,21 @@ nfsd4_init_recdir(char *rec_dirname)
 		return;
 	}
 
-	status = kern_path(rec_dirname, LOOKUP_FOLLOW | LOOKUP_DIRECTORY,
-			&rec_dir);
-	if (status)
+	rec_file = filp_open(rec_dirname, O_RDONLY | O_DIRECTORY, 0);
+	if (IS_ERR(rec_file)) {
 		printk("NFSD: unable to find recovery directory %s\n",
 				rec_dirname);
+		rec_file = NULL;
+	}
 
-	if (!status)
-		rec_dir_init = 1;
 	nfs4_reset_creds(original_cred);
 }
 
 void
 nfsd4_shutdown_recdir(void)
 {
-	if (!rec_dir_init)
+	if (!rec_file)
 		return;
-	rec_dir_init = 0;
-	path_put(&rec_dir);
+	fput(rec_file);
+	rec_file = NULL;
 }
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index f19ed866c95f..4a2734758778 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -34,6 +34,7 @@
 
 #include <linux/file.h>
 #include <linux/smp_lock.h>
+#include <linux/slab.h>
 #include <linux/namei.h>
 #include <linux/swap.h>
 #include <linux/sunrpc/svcauth_gss.h>
@@ -44,8 +45,8 @@
 #define NFSDDBG_FACILITY                NFSDDBG_PROC
 
 /* Globals */
-static time_t lease_time = 90;     /* default lease time */
-static time_t user_lease_time = 90;
+time_t nfsd4_lease = 90;     /* default lease time */
+time_t nfsd4_grace = 90;
 static time_t boot_time;
 static u32 current_ownerid = 1;
 static u32 current_fileid = 1;
@@ -189,7 +190,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
 	dp->dl_vfs_file = stp->st_vfs_file;
 	dp->dl_type = type;
 	dp->dl_ident = cb->cb_ident;
-	dp->dl_stateid.si_boot = get_seconds();
+	dp->dl_stateid.si_boot = boot_time;
 	dp->dl_stateid.si_stateownerid = current_delegid++;
 	dp->dl_stateid.si_fileid = 0;
 	dp->dl_stateid.si_generation = 0;
@@ -198,6 +199,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
 	atomic_set(&dp->dl_count, 1);
 	list_add(&dp->dl_perfile, &fp->fi_delegations);
 	list_add(&dp->dl_perclnt, &clp->cl_delegations);
+	INIT_WORK(&dp->dl_recall.cb_work, nfsd4_do_callback_rpc);
 	return dp;
 }
 
@@ -248,6 +250,9 @@ unhash_delegation(struct nfs4_delegation *dp)
  * SETCLIENTID state 
  */
 
+/* client_lock protects the client lru list and session hash table */
+static DEFINE_SPINLOCK(client_lock);
+
 /* Hash tables for nfs4_clientid state */
 #define CLIENT_HASH_BITS                 4
 #define CLIENT_HASH_SIZE                (1 << CLIENT_HASH_BITS)
@@ -366,7 +371,6 @@ static void release_openowner(struct nfs4_stateowner *sop)
 	nfs4_put_stateowner(sop);
 }
 
-static DEFINE_SPINLOCK(sessionid_lock);
 #define SESSION_HASH_SIZE	512
 static struct list_head sessionid_hashtbl[SESSION_HASH_SIZE];
 
@@ -564,10 +568,10 @@ alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp,
 
 	new->se_flags = cses->flags;
 	kref_init(&new->se_ref);
-	spin_lock(&sessionid_lock);
+	spin_lock(&client_lock);
 	list_add(&new->se_hash, &sessionid_hashtbl[idx]);
 	list_add(&new->se_perclnt, &clp->cl_sessions);
-	spin_unlock(&sessionid_lock);
+	spin_unlock(&client_lock);
 
 	status = nfs_ok;
 out:
@@ -578,7 +582,7 @@ out_free:
 	goto out;
 }
 
-/* caller must hold sessionid_lock */
+/* caller must hold client_lock */
 static struct nfsd4_session *
 find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid)
 {
@@ -601,7 +605,7 @@ find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid)
 	return NULL;
 }
 
-/* caller must hold sessionid_lock */
+/* caller must hold client_lock */
 static void
 unhash_session(struct nfsd4_session *ses)
 {
@@ -609,15 +613,6 @@ unhash_session(struct nfsd4_session *ses)
 	list_del(&ses->se_perclnt);
 }
 
-static void
-release_session(struct nfsd4_session *ses)
-{
-	spin_lock(&sessionid_lock);
-	unhash_session(ses);
-	spin_unlock(&sessionid_lock);
-	nfsd4_put_session(ses);
-}
-
 void
 free_session(struct kref *kref)
 {
@@ -633,9 +628,18 @@ free_session(struct kref *kref)
 	kfree(ses);
 }
 
+/* must be called under the client_lock */
 static inline void
-renew_client(struct nfs4_client *clp)
+renew_client_locked(struct nfs4_client *clp)
 {
+	if (is_client_expired(clp)) {
+		dprintk("%s: client (clientid %08x/%08x) already expired\n",
+			__func__,
+			clp->cl_clientid.cl_boot,
+			clp->cl_clientid.cl_id);
+		return;
+	}
+
 	/*
 	* Move client to the end to the LRU list.
 	*/
@@ -646,6 +650,14 @@ renew_client(struct nfs4_client *clp)
 	clp->cl_time = get_seconds();
 }
 
+static inline void
+renew_client(struct nfs4_client *clp)
+{
+	spin_lock(&client_lock);
+	renew_client_locked(clp);
+	spin_unlock(&client_lock);
+}
+
 /* SETCLIENTID and SETCLIENTID_CONFIRM Helper functions */
 static int
 STALE_CLIENTID(clientid_t *clid)
@@ -679,27 +691,9 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
 	return clp;
 }
 
-static void
-shutdown_callback_client(struct nfs4_client *clp)
-{
-	struct rpc_clnt *clnt = clp->cl_cb_conn.cb_client;
-
-	if (clnt) {
-		/*
-		 * Callback threads take a reference on the client, so there
-		 * should be no outstanding callbacks at this point.
-		 */
-		clp->cl_cb_conn.cb_client = NULL;
-		rpc_shutdown_client(clnt);
-	}
-}
-
 static inline void
 free_client(struct nfs4_client *clp)
 {
-	shutdown_callback_client(clp);
-	if (clp->cl_cb_xprt)
-		svc_xprt_put(clp->cl_cb_xprt);
 	if (clp->cl_cred.cr_group_info)
 		put_group_info(clp->cl_cred.cr_group_info);
 	kfree(clp->cl_principal);
@@ -708,10 +702,34 @@ free_client(struct nfs4_client *clp)
 }
 
 void
-put_nfs4_client(struct nfs4_client *clp)
+release_session_client(struct nfsd4_session *session)
 {
-	if (atomic_dec_and_test(&clp->cl_count))
+	struct nfs4_client *clp = session->se_client;
+
+	if (!atomic_dec_and_lock(&clp->cl_refcount, &client_lock))
+		return;
+	if (is_client_expired(clp)) {
 		free_client(clp);
+		session->se_client = NULL;
+	} else
+		renew_client_locked(clp);
+	spin_unlock(&client_lock);
+	nfsd4_put_session(session);
+}
+
+/* must be called under the client_lock */
+static inline void
+unhash_client_locked(struct nfs4_client *clp)
+{
+	mark_client_expired(clp);
+	list_del(&clp->cl_lru);
+	while (!list_empty(&clp->cl_sessions)) {
+		struct nfsd4_session  *ses;
+		ses = list_entry(clp->cl_sessions.next, struct nfsd4_session,
+				 se_perclnt);
+		unhash_session(ses);
+		nfsd4_put_session(ses);
+	}
 }
 
 static void
@@ -721,9 +739,6 @@ expire_client(struct nfs4_client *clp)
 	struct nfs4_delegation *dp;
 	struct list_head reaplist;
 
-	dprintk("NFSD: expire_client cl_count %d\n",
-	                    atomic_read(&clp->cl_count));
-
 	INIT_LIST_HEAD(&reaplist);
 	spin_lock(&recall_lock);
 	while (!list_empty(&clp->cl_delegations)) {
@@ -739,20 +754,20 @@ expire_client(struct nfs4_client *clp)
 		list_del_init(&dp->dl_recall_lru);
 		unhash_delegation(dp);
 	}
-	list_del(&clp->cl_idhash);
-	list_del(&clp->cl_strhash);
-	list_del(&clp->cl_lru);
 	while (!list_empty(&clp->cl_openowners)) {
 		sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient);
 		release_openowner(sop);
 	}
-	while (!list_empty(&clp->cl_sessions)) {
-		struct nfsd4_session  *ses;
-		ses = list_entry(clp->cl_sessions.next, struct nfsd4_session,
-				 se_perclnt);
-		release_session(ses);
-	}
-	put_nfs4_client(clp);
+	nfsd4_set_callback_client(clp, NULL);
+	if (clp->cl_cb_conn.cb_xprt)
+		svc_xprt_put(clp->cl_cb_conn.cb_xprt);
+	list_del(&clp->cl_idhash);
+	list_del(&clp->cl_strhash);
+	spin_lock(&client_lock);
+	unhash_client_locked(clp);
+	if (atomic_read(&clp->cl_refcount) == 0)
+		free_client(clp);
+	spin_unlock(&client_lock);
 }
 
 static void copy_verf(struct nfs4_client *target, nfs4_verifier *source)
@@ -838,14 +853,15 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
 	}
 
 	memcpy(clp->cl_recdir, recdir, HEXDIR_LEN);
-	atomic_set(&clp->cl_count, 1);
-	atomic_set(&clp->cl_cb_conn.cb_set, 0);
+	atomic_set(&clp->cl_refcount, 0);
+	atomic_set(&clp->cl_cb_set, 0);
 	INIT_LIST_HEAD(&clp->cl_idhash);
 	INIT_LIST_HEAD(&clp->cl_strhash);
 	INIT_LIST_HEAD(&clp->cl_openowners);
 	INIT_LIST_HEAD(&clp->cl_delegations);
 	INIT_LIST_HEAD(&clp->cl_sessions);
 	INIT_LIST_HEAD(&clp->cl_lru);
+	clp->cl_time = get_seconds();
 	clear_bit(0, &clp->cl_cb_slot_busy);
 	rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
 	copy_verf(clp, verf);
@@ -876,8 +892,7 @@ add_to_unconfirmed(struct nfs4_client *clp, unsigned int strhashval)
 	list_add(&clp->cl_strhash, &unconf_str_hashtbl[strhashval]);
 	idhashval = clientid_hashval(clp->cl_clientid.cl_id);
 	list_add(&clp->cl_idhash, &unconf_id_hashtbl[idhashval]);
-	list_add_tail(&clp->cl_lru, &client_lru);
-	clp->cl_time = get_seconds();
+	renew_client(clp);
 }
 
 static void
@@ -887,10 +902,9 @@ move_to_confirmed(struct nfs4_client *clp)
 	unsigned int strhashval;
 
 	dprintk("NFSD: move_to_confirm nfs4_client %p\n", clp);
-	list_del_init(&clp->cl_strhash);
 	list_move(&clp->cl_idhash, &conf_id_hashtbl[idhashval]);
 	strhashval = clientstr_hashval(clp->cl_recdir);
-	list_add(&clp->cl_strhash, &conf_str_hashtbl[strhashval]);
+	list_move(&clp->cl_strhash, &conf_str_hashtbl[strhashval]);
 	renew_client(clp);
 }
 
@@ -1326,15 +1340,9 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 		cs_slot->sl_seqid++; /* from 0 to 1 */
 		move_to_confirmed(unconf);
 
-		/*
-		 * We do not support RDMA or persistent sessions
-		 */
-		cr_ses->flags &= ~SESSION4_PERSIST;
-		cr_ses->flags &= ~SESSION4_RDMA;
-
 		if (cr_ses->flags & SESSION4_BACK_CHAN) {
-			unconf->cl_cb_xprt = rqstp->rq_xprt;
-			svc_xprt_get(unconf->cl_cb_xprt);
+			unconf->cl_cb_conn.cb_xprt = rqstp->rq_xprt;
+			svc_xprt_get(rqstp->rq_xprt);
 			rpc_copy_addr(
 				(struct sockaddr *)&unconf->cl_cb_conn.cb_addr,
 				sa);
@@ -1343,7 +1351,7 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 				cstate->minorversion;
 			unconf->cl_cb_conn.cb_prog = cr_ses->callback_prog;
 			unconf->cl_cb_seq_nr = 1;
-			nfsd4_probe_callback(unconf);
+			nfsd4_probe_callback(unconf, &unconf->cl_cb_conn);
 		}
 		conf = unconf;
 	} else {
@@ -1351,6 +1359,12 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 		goto out;
 	}
 
+	/*
+	 * We do not support RDMA or persistent sessions
+	 */
+	cr_ses->flags &= ~SESSION4_PERSIST;
+	cr_ses->flags &= ~SESSION4_RDMA;
+
 	status = alloc_init_session(rqstp, conf, cr_ses);
 	if (status)
 		goto out;
@@ -1368,6 +1382,21 @@ out:
 	return status;
 }
 
+static bool nfsd4_last_compound_op(struct svc_rqst *rqstp)
+{
+	struct nfsd4_compoundres *resp = rqstp->rq_resp;
+	struct nfsd4_compoundargs *argp = rqstp->rq_argp;
+
+	return argp->opcnt == resp->opcnt;
+}
+
+static bool nfsd4_compound_in_session(struct nfsd4_session *session, struct nfs4_sessionid *sid)
+{
+	if (!session)
+		return 0;
+	return !memcmp(sid, &session->se_sessionid, sizeof(*sid));
+}
+
 __be32
 nfsd4_destroy_session(struct svc_rqst *r,
 		      struct nfsd4_compound_state *cstate,
@@ -1383,19 +1412,25 @@ nfsd4_destroy_session(struct svc_rqst *r,
 	 * - Do we need to clear any callback info from previous session?
 	 */
 
+	if (nfsd4_compound_in_session(cstate->session, &sessionid->sessionid)) {
+		if (!nfsd4_last_compound_op(r))
+			return nfserr_not_only_op;
+	}
 	dump_sessionid(__func__, &sessionid->sessionid);
-	spin_lock(&sessionid_lock);
+	spin_lock(&client_lock);
 	ses = find_in_sessionid_hashtbl(&sessionid->sessionid);
 	if (!ses) {
-		spin_unlock(&sessionid_lock);
+		spin_unlock(&client_lock);
 		goto out;
 	}
 
 	unhash_session(ses);
-	spin_unlock(&sessionid_lock);
+	spin_unlock(&client_lock);
 
+	nfs4_lock_state();
 	/* wait for callbacks */
-	shutdown_callback_client(ses->se_client);
+	nfsd4_set_callback_client(ses->se_client, NULL);
+	nfs4_unlock_state();
 	nfsd4_put_session(ses);
 	status = nfs_ok;
 out:
@@ -1416,7 +1451,7 @@ nfsd4_sequence(struct svc_rqst *rqstp,
 	if (resp->opcnt != 1)
 		return nfserr_sequence_pos;
 
-	spin_lock(&sessionid_lock);
+	spin_lock(&client_lock);
 	status = nfserr_badsession;
 	session = find_in_sessionid_hashtbl(&seq->sessionid);
 	if (!session)
@@ -1455,23 +1490,47 @@ nfsd4_sequence(struct svc_rqst *rqstp,
 	cstate->slot = slot;
 	cstate->session = session;
 
-	/* Hold a session reference until done processing the compound:
-	 * nfsd4_put_session called only if the cstate slot is set.
-	 */
-	nfsd4_get_session(session);
 out:
-	spin_unlock(&sessionid_lock);
-	/* Renew the clientid on success and on replay */
+	/* Hold a session reference until done processing the compound. */
 	if (cstate->session) {
-		nfs4_lock_state();
-		renew_client(session->se_client);
-		nfs4_unlock_state();
+		nfsd4_get_session(cstate->session);
+		atomic_inc(&session->se_client->cl_refcount);
 	}
+	spin_unlock(&client_lock);
 	dprintk("%s: return %d\n", __func__, ntohl(status));
 	return status;
 }
 
 __be32
+nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_reclaim_complete *rc)
+{
+	if (rc->rca_one_fs) {
+		if (!cstate->current_fh.fh_dentry)
+			return nfserr_nofilehandle;
+		/*
+		 * We don't take advantage of the rca_one_fs case.
+		 * That's OK, it's optional, we can safely ignore it.
+		 */
+		 return nfs_ok;
+	}
+	nfs4_lock_state();
+	if (is_client_expired(cstate->session->se_client)) {
+		nfs4_unlock_state();
+		/*
+		 * The following error isn't really legal.
+		 * But we only get here if the client just explicitly
+		 * destroyed the client.  Surely it no longer cares what
+		 * error it gets back on an operation for the dead
+		 * client.
+		 */
+		return nfserr_stale_clientid;
+	}
+	nfsd4_create_clid_dir(cstate->session->se_client);
+	nfs4_unlock_state();
+	return nfs_ok;
+}
+
+__be32
 nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		  struct nfsd4_setclientid *setclid)
 {
@@ -1630,9 +1689,8 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
 		if (!same_creds(&conf->cl_cred, &unconf->cl_cred))
 			status = nfserr_clid_inuse;
 		else {
-			/* XXX: We just turn off callbacks until we can handle
-			  * change request correctly. */
-			atomic_set(&conf->cl_cb_conn.cb_set, 0);
+			atomic_set(&conf->cl_cb_set, 0);
+			nfsd4_probe_callback(conf, &unconf->cl_cb_conn);
 			expire_client(unconf);
 			status = nfs_ok;
 
@@ -1666,7 +1724,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
 			}
 			move_to_confirmed(unconf);
 			conf = unconf;
-			nfsd4_probe_callback(conf);
+			nfsd4_probe_callback(conf, &conf->cl_cb_conn);
 			status = nfs_ok;
 		}
 	} else if ((!conf || (conf && !same_verf(&conf->cl_confirm, &confirm)))
@@ -1699,12 +1757,12 @@ alloc_init_file(struct inode *ino)
 		INIT_LIST_HEAD(&fp->fi_hash);
 		INIT_LIST_HEAD(&fp->fi_stateids);
 		INIT_LIST_HEAD(&fp->fi_delegations);
-		spin_lock(&recall_lock);
-		list_add(&fp->fi_hash, &file_hashtbl[hashval]);
-		spin_unlock(&recall_lock);
 		fp->fi_inode = igrab(ino);
 		fp->fi_id = current_fileid++;
 		fp->fi_had_conflict = false;
+		spin_lock(&recall_lock);
+		list_add(&fp->fi_hash, &file_hashtbl[hashval]);
+		spin_unlock(&recall_lock);
 		return fp;
 	}
 	return NULL;
@@ -1826,7 +1884,7 @@ init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *
 	stp->st_stateowner = sop;
 	get_nfs4_file(fp);
 	stp->st_file = fp;
-	stp->st_stateid.si_boot = get_seconds();
+	stp->st_stateid.si_boot = boot_time;
 	stp->st_stateid.si_stateownerid = sop->so_id;
 	stp->st_stateid.si_fileid = fp->fi_id;
 	stp->st_stateid.si_generation = 0;
@@ -1998,7 +2056,9 @@ nfs4_file_downgrade(struct file *filp, unsigned int share_access)
 {
 	if (share_access & NFS4_SHARE_ACCESS_WRITE) {
 		drop_file_write_access(filp);
+		spin_lock(&filp->f_lock);
 		filp->f_mode = (filp->f_mode | FMODE_READ) & ~FMODE_WRITE;
+		spin_unlock(&filp->f_lock);
 	}
 }
 
@@ -2025,7 +2085,6 @@ void nfsd_break_deleg_cb(struct file_lock *fl)
 	 * lock) we know the server hasn't removed the lease yet, we know
 	 * it's safe to take a reference: */
 	atomic_inc(&dp->dl_count);
-	atomic_inc(&dp->dl_client->cl_count);
 
 	spin_lock(&recall_lock);
 	list_add_tail(&dp->dl_recall_lru, &del_recall_lru);
@@ -2344,7 +2403,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
 {
 	struct nfs4_delegation *dp;
 	struct nfs4_stateowner *sop = stp->st_stateowner;
-	struct nfs4_cb_conn *cb = &sop->so_client->cl_cb_conn;
+	int cb_up = atomic_read(&sop->so_client->cl_cb_set);
 	struct file_lock fl, *flp = &fl;
 	int status, flag = 0;
 
@@ -2352,7 +2411,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
 	open->op_recall = 0;
 	switch (open->op_claim_type) {
 		case NFS4_OPEN_CLAIM_PREVIOUS:
-			if (!atomic_read(&cb->cb_set))
+			if (!cb_up)
 				open->op_recall = 1;
 			flag = open->op_delegate_type;
 			if (flag == NFS4_OPEN_DELEGATE_NONE)
@@ -2363,7 +2422,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
 			 * had the chance to reclaim theirs.... */
 			if (locks_in_grace())
 				goto out;
-			if (!atomic_read(&cb->cb_set) || !sop->so_confirmed)
+			if (!cb_up || !sop->so_confirmed)
 				goto out;
 			if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE)
 				flag = NFS4_OPEN_DELEGATE_WRITE;
@@ -2532,7 +2591,7 @@ nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	renew_client(clp);
 	status = nfserr_cb_path_down;
 	if (!list_empty(&clp->cl_delegations)
-			&& !atomic_read(&clp->cl_cb_conn.cb_set))
+			&& !atomic_read(&clp->cl_cb_set))
 		goto out;
 	status = nfs_ok;
 out:
@@ -2549,6 +2608,12 @@ nfsd4_end_grace(void)
 	dprintk("NFSD: end of grace period\n");
 	nfsd4_recdir_purge_old();
 	locks_end_grace(&nfsd4_manager);
+	/*
+	 * Now that every NFSv4 client has had the chance to recover and
+	 * to see the (possibly new, possibly shorter) lease time, we
+	 * can safely set the next grace time to the current lease time:
+	 */
+	nfsd4_grace = nfsd4_lease;
 }
 
 static time_t
@@ -2558,15 +2623,17 @@ nfs4_laundromat(void)
 	struct nfs4_stateowner *sop;
 	struct nfs4_delegation *dp;
 	struct list_head *pos, *next, reaplist;
-	time_t cutoff = get_seconds() - NFSD_LEASE_TIME;
-	time_t t, clientid_val = NFSD_LEASE_TIME;
-	time_t u, test_val = NFSD_LEASE_TIME;
+	time_t cutoff = get_seconds() - nfsd4_lease;
+	time_t t, clientid_val = nfsd4_lease;
+	time_t u, test_val = nfsd4_lease;
 
 	nfs4_lock_state();
 
 	dprintk("NFSD: laundromat service - starting\n");
 	if (locks_in_grace())
 		nfsd4_end_grace();
+	INIT_LIST_HEAD(&reaplist);
+	spin_lock(&client_lock);
 	list_for_each_safe(pos, next, &client_lru) {
 		clp = list_entry(pos, struct nfs4_client, cl_lru);
 		if (time_after((unsigned long)clp->cl_time, (unsigned long)cutoff)) {
@@ -2575,12 +2642,22 @@ nfs4_laundromat(void)
 				clientid_val = t;
 			break;
 		}
+		if (atomic_read(&clp->cl_refcount)) {
+			dprintk("NFSD: client in use (clientid %08x)\n",
+				clp->cl_clientid.cl_id);
+			continue;
+		}
+		unhash_client_locked(clp);
+		list_add(&clp->cl_lru, &reaplist);
+	}
+	spin_unlock(&client_lock);
+	list_for_each_safe(pos, next, &reaplist) {
+		clp = list_entry(pos, struct nfs4_client, cl_lru);
 		dprintk("NFSD: purging unused client (clientid %08x)\n",
 			clp->cl_clientid.cl_id);
 		nfsd4_remove_clid_dir(clp);
 		expire_client(clp);
 	}
-	INIT_LIST_HEAD(&reaplist);
 	spin_lock(&recall_lock);
 	list_for_each_safe(pos, next, &del_recall_lru) {
 		dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
@@ -2600,7 +2677,7 @@ nfs4_laundromat(void)
 		list_del_init(&dp->dl_recall_lru);
 		unhash_delegation(dp);
 	}
-	test_val = NFSD_LEASE_TIME;
+	test_val = nfsd4_lease;
 	list_for_each_safe(pos, next, &close_lru) {
 		sop = list_entry(pos, struct nfs4_stateowner, so_close_lru);
 		if (time_after((unsigned long)sop->so_time, (unsigned long)cutoff)) {
@@ -2656,39 +2733,11 @@ nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stateid *stp)
 static int
 STALE_STATEID(stateid_t *stateid)
 {
-	if (time_after((unsigned long)boot_time,
-			(unsigned long)stateid->si_boot)) {
-		dprintk("NFSD: stale stateid " STATEID_FMT "!\n",
-			STATEID_VAL(stateid));
-		return 1;
-	}
-	return 0;
-}
-
-static int
-EXPIRED_STATEID(stateid_t *stateid)
-{
-	if (time_before((unsigned long)boot_time,
-			((unsigned long)stateid->si_boot)) &&
-	    time_before((unsigned long)(stateid->si_boot + lease_time), get_seconds())) {
-		dprintk("NFSD: expired stateid " STATEID_FMT "!\n",
-			STATEID_VAL(stateid));
-		return 1;
-	}
-	return 0;
-}
-
-static __be32
-stateid_error_map(stateid_t *stateid)
-{
-	if (STALE_STATEID(stateid))
-		return nfserr_stale_stateid;
-	if (EXPIRED_STATEID(stateid))
-		return nfserr_expired;
-
-	dprintk("NFSD: bad stateid " STATEID_FMT "!\n",
+	if (stateid->si_boot == boot_time)
+		return 0;
+	dprintk("NFSD: stale stateid " STATEID_FMT "!\n",
 		STATEID_VAL(stateid));
-	return nfserr_bad_stateid;
+	return 1;
 }
 
 static inline int
@@ -2812,10 +2861,8 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
 	status = nfserr_bad_stateid;
 	if (is_delegation_stateid(stateid)) {
 		dp = find_delegation_stateid(ino, stateid);
-		if (!dp) {
-			status = stateid_error_map(stateid);
+		if (!dp)
 			goto out;
-		}
 		status = check_stateid_generation(stateid, &dp->dl_stateid,
 						  flags);
 		if (status)
@@ -2828,10 +2875,8 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
 			*filpp = dp->dl_vfs_file;
 	} else { /* open or lock stateid */
 		stp = find_stateid(stateid, flags);
-		if (!stp) {
-			status = stateid_error_map(stateid);
+		if (!stp)
 			goto out;
-		}
 		if (nfs4_check_fh(current_fh, stp))
 			goto out;
 		if (!stp->st_stateowner->so_confirmed)
@@ -2903,7 +2948,7 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
 		 */
 		sop = search_close_lru(stateid->si_stateownerid, flags);
 		if (sop == NULL)
-			return stateid_error_map(stateid);
+			return nfserr_bad_stateid;
 		*sopp = sop;
 		goto check_replay;
 	}
@@ -3170,10 +3215,8 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	if (!is_delegation_stateid(stateid))
 		goto out;
 	dp = find_delegation_stateid(inode, stateid);
-	if (!dp) {
-		status = stateid_error_map(stateid);
+	if (!dp)
 		goto out;
-	}
 	status = check_stateid_generation(stateid, &dp->dl_stateid, flags);
 	if (status)
 		goto out;
@@ -3399,7 +3442,7 @@ alloc_init_lock_stateid(struct nfs4_stateowner *sop, struct nfs4_file *fp, struc
 	stp->st_stateowner = sop;
 	get_nfs4_file(fp);
 	stp->st_file = fp;
-	stp->st_stateid.si_boot = get_seconds();
+	stp->st_stateid.si_boot = boot_time;
 	stp->st_stateid.si_stateownerid = sop->so_id;
 	stp->st_stateid.si_fileid = fp->fi_id;
 	stp->st_stateid.si_generation = 0;
@@ -3971,12 +4014,6 @@ nfsd4_load_reboot_recovery_data(void)
 		printk("NFSD: Failure reading reboot recovery data\n");
 }
 
-unsigned long
-get_nfs4_grace_period(void)
-{
-	return max(user_lease_time, lease_time) * HZ;
-}
-
 /*
  * Since the lifetime of a delegation isn't limited to that of an open, a
  * client may quite reasonably hang on to a delegation as long as it has
@@ -4003,20 +4040,27 @@ set_max_delegations(void)
 static int
 __nfs4_state_start(void)
 {
-	unsigned long grace_time;
+	int ret;
 
 	boot_time = get_seconds();
-	grace_time = get_nfs4_grace_period();
-	lease_time = user_lease_time;
 	locks_start_grace(&nfsd4_manager);
 	printk(KERN_INFO "NFSD: starting %ld-second grace period\n",
-	       grace_time/HZ);
+	       nfsd4_grace);
+	ret = set_callback_cred();
+	if (ret)
+		return -ENOMEM;
 	laundry_wq = create_singlethread_workqueue("nfsd4");
 	if (laundry_wq == NULL)
 		return -ENOMEM;
-	queue_delayed_work(laundry_wq, &laundromat_work, grace_time);
+	ret = nfsd4_create_callback_queue();
+	if (ret)
+		goto out_free_laundry;
+	queue_delayed_work(laundry_wq, &laundromat_work, nfsd4_grace * HZ);
 	set_max_delegations();
-	return set_callback_cred();
+	return 0;
+out_free_laundry:
+	destroy_workqueue(laundry_wq);
+	return ret;
 }
 
 int
@@ -4034,12 +4078,6 @@ nfs4_state_start(void)
 	return 0;
 }
 
-time_t
-nfs4_lease_time(void)
-{
-	return lease_time;
-}
-
 static void
 __nfs4_state_shutdown(void)
 {
@@ -4085,6 +4123,7 @@ nfs4_state_shutdown(void)
 	nfs4_release_reclaim();
 	__nfs4_state_shutdown();
 	nfs4_unlock_state();
+	nfsd4_destroy_callback_queue();
 }
 
 /*
@@ -4123,21 +4162,3 @@ nfs4_recoverydir(void)
 {
 	return user_recovery_dirname;
 }
-
-/*
- * Called when leasetime is changed.
- *
- * The only way the protocol gives us to handle on-the-fly lease changes is to
- * simulate a reboot.  Instead of doing that, we just wait till the next time
- * we start to register any changes in lease time.  If the administrator
- * really wants to change the lease time *now*, they can go ahead and bring
- * nfsd down and then back up again after changing the lease time.
- *
- * user_lease_time is protected by nfsd_mutex since it's only really accessed
- * when nfsd is starting
- */
-void
-nfs4_reset_lease(time_t leasetime)
-{
-	user_lease_time = leasetime;
-}
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index a8587e90fd5a..ac17a7080239 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -40,6 +40,7 @@
  * at the end of nfs4svc_decode_compoundargs.
  */
 
+#include <linux/slab.h>
 #include <linux/namei.h>
 #include <linux/statfs.h>
 #include <linux/utsname.h>
@@ -160,10 +161,10 @@ static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes)
 	argp->p = page_address(argp->pagelist[0]);
 	argp->pagelist++;
 	if (argp->pagelen < PAGE_SIZE) {
-		argp->end = p + (argp->pagelen>>2);
+		argp->end = argp->p + (argp->pagelen>>2);
 		argp->pagelen = 0;
 	} else {
-		argp->end = p + (PAGE_SIZE>>2);
+		argp->end = argp->p + (PAGE_SIZE>>2);
 		argp->pagelen -= PAGE_SIZE;
 	}
 	memcpy(((char*)p)+avail, argp->p, (nbytes - avail));
@@ -1233,6 +1234,16 @@ nfsd4_decode_sequence(struct nfsd4_compoundargs *argp,
 	DECODE_TAIL;
 }
 
+static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, struct nfsd4_reclaim_complete *rc)
+{
+	DECODE_HEAD;
+
+	READ_BUF(4);
+	READ32(rc->rca_one_fs);
+
+	DECODE_TAIL;
+}
+
 static __be32
 nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p)
 {
@@ -1345,7 +1356,7 @@ static nfsd4_dec nfsd41_dec_ops[] = {
 	[OP_TEST_STATEID]	= (nfsd4_dec)nfsd4_decode_notsupp,
 	[OP_WANT_DELEGATION]	= (nfsd4_dec)nfsd4_decode_notsupp,
 	[OP_DESTROY_CLIENTID]	= (nfsd4_dec)nfsd4_decode_notsupp,
-	[OP_RECLAIM_COMPLETE]	= (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_RECLAIM_COMPLETE]	= (nfsd4_dec)nfsd4_decode_reclaim_complete,
 };
 
 struct nfsd4_minorversion_ops {
@@ -1425,16 +1436,16 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
 			argp->p = page_address(argp->pagelist[0]);
 			argp->pagelist++;
 			if (argp->pagelen < PAGE_SIZE) {
-				argp->end = p + (argp->pagelen>>2);
+				argp->end = argp->p + (argp->pagelen>>2);
 				argp->pagelen = 0;
 			} else {
-				argp->end = p + (PAGE_SIZE>>2);
+				argp->end = argp->p + (PAGE_SIZE>>2);
 				argp->pagelen -= PAGE_SIZE;
 			}
 		}
 		op->opnum = ntohl(*argp->p++);
 
-		if (op->opnum >= OP_ACCESS && op->opnum < ops->nops)
+		if (op->opnum >= FIRST_NFS4_OP && op->opnum <= LAST_NFS4_OP)
 			op->status = ops->decoders[op->opnum](argp, &op->u);
 		else {
 			op->opnum = OP_ILLEGAL;
@@ -1528,7 +1539,7 @@ static void write_cinfo(__be32 **p, struct nfsd4_change_info *c)
 	} } while (0);
 
 /* Encode as an array of strings the string given with components
- * seperated @sep.
+ * separated @sep.
  */
 static __be32 nfsd4_encode_components(char sep, char *components,
 				   __be32 **pp, int *buflen)
@@ -1899,7 +1910,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
 	if (bmval0 & FATTR4_WORD0_LEASE_TIME) {
 		if ((buflen -= 4) < 0)
 			goto out_resource;
-		WRITE32(NFSD_LEASE_TIME);
+		WRITE32(nfsd4_lease);
 	}
 	if (bmval0 & FATTR4_WORD0_RDATTR_ERROR) {
 		if ((buflen -= 4) < 0)
@@ -2121,9 +2132,15 @@ out_acl:
 		 * and this is the root of a cross-mounted filesystem.
 		 */
 		if (ignore_crossmnt == 0 &&
-		    exp->ex_path.mnt->mnt_root->d_inode == dentry->d_inode) {
-			err = vfs_getattr(exp->ex_path.mnt->mnt_parent,
-				exp->ex_path.mnt->mnt_mountpoint, &stat);
+		    dentry == exp->ex_path.mnt->mnt_root) {
+			struct path path = exp->ex_path;
+			path_get(&path);
+			while (follow_up(&path)) {
+				if (path.dentry != path.mnt->mnt_root)
+					break;
+			}
+			err = vfs_getattr(path.mnt, path.dentry, &stat);
+			path_put(&path);
 			if (err)
 				goto out_nfserr;
 		}
@@ -3300,11 +3317,14 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo
 		iov = &rqstp->rq_res.head[0];
 	iov->iov_len = ((char*)resp->p) - (char*)iov->iov_base;
 	BUG_ON(iov->iov_len > PAGE_SIZE);
-	if (nfsd4_has_session(cs) && cs->status != nfserr_replay_cache) {
-		nfsd4_store_cache_entry(resp);
-		dprintk("%s: SET SLOT STATE TO AVAILABLE\n", __func__);
-		resp->cstate.slot->sl_inuse = false;
-		nfsd4_put_session(resp->cstate.session);
+	if (nfsd4_has_session(cs)) {
+		if (cs->status != nfserr_replay_cache) {
+			nfsd4_store_cache_entry(resp);
+			dprintk("%s: SET SLOT STATE TO AVAILABLE\n", __func__);
+			cs->slot->sl_inuse = false;
+		}
+		/* Renew the clientid on success and on replay */
+		release_session_client(cs->session);
 	}
 	return 1;
 }
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index da08560c4818..4666a209678a 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -8,6 +8,8 @@
  * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
  */
 
+#include <linux/slab.h>
+
 #include "nfsd.h"
 #include "cache.h"
 
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 2604c3e70ea5..508941c23af7 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -4,6 +4,7 @@
  * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
  */
 
+#include <linux/slab.h>
 #include <linux/namei.h>
 #include <linux/ctype.h>
 
@@ -45,6 +46,7 @@ enum {
 	 */
 #ifdef CONFIG_NFSD_V4
 	NFSD_Leasetime,
+	NFSD_Gracetime,
 	NFSD_RecoveryDir,
 #endif
 };
@@ -69,6 +71,7 @@ static ssize_t write_ports(struct file *file, char *buf, size_t size);
 static ssize_t write_maxblksize(struct file *file, char *buf, size_t size);
 #ifdef CONFIG_NFSD_V4
 static ssize_t write_leasetime(struct file *file, char *buf, size_t size);
+static ssize_t write_gracetime(struct file *file, char *buf, size_t size);
 static ssize_t write_recoverydir(struct file *file, char *buf, size_t size);
 #endif
 
@@ -90,6 +93,7 @@ static ssize_t (*write_op[])(struct file *, char *, size_t) = {
 	[NFSD_MaxBlkSize] = write_maxblksize,
 #ifdef CONFIG_NFSD_V4
 	[NFSD_Leasetime] = write_leasetime,
+	[NFSD_Gracetime] = write_gracetime,
 	[NFSD_RecoveryDir] = write_recoverydir,
 #endif
 };
@@ -988,12 +992,13 @@ static ssize_t __write_ports_delfd(char *buf)
 static ssize_t __write_ports_addxprt(char *buf)
 {
 	char transport[16];
+	struct svc_xprt *xprt;
 	int port, err;
 
 	if (sscanf(buf, "%15s %4u", transport, &port) != 2)
 		return -EINVAL;
 
-	if (port < 1 || port > USHORT_MAX)
+	if (port < 1 || port > USHRT_MAX)
 		return -EINVAL;
 
 	err = nfsd_create_serv();
@@ -1002,13 +1007,24 @@ static ssize_t __write_ports_addxprt(char *buf)
 
 	err = svc_create_xprt(nfsd_serv, transport,
 				PF_INET, port, SVC_SOCK_ANONYMOUS);
-	if (err < 0) {
-		/* Give a reasonable perror msg for bad transport string */
-		if (err == -ENOENT)
-			err = -EPROTONOSUPPORT;
-		return err;
-	}
+	if (err < 0)
+		goto out_err;
+
+	err = svc_create_xprt(nfsd_serv, transport,
+				PF_INET6, port, SVC_SOCK_ANONYMOUS);
+	if (err < 0 && err != -EAFNOSUPPORT)
+		goto out_close;
 	return 0;
+out_close:
+	xprt = svc_find_xprt(nfsd_serv, transport, PF_INET, port);
+	if (xprt != NULL) {
+		svc_close_xprt(xprt);
+		svc_xprt_put(xprt);
+	}
+out_err:
+	/* Decrease the count, but don't shut down the service */
+	nfsd_serv->sv_nrthreads--;
+	return err;
 }
 
 /*
@@ -1024,7 +1040,7 @@ static ssize_t __write_ports_delxprt(char *buf)
 	if (sscanf(&buf[1], "%15s %4u", transport, &port) != 2)
 		return -EINVAL;
 
-	if (port < 1 || port > USHORT_MAX || nfsd_serv == NULL)
+	if (port < 1 || port > USHRT_MAX || nfsd_serv == NULL)
 		return -EINVAL;
 
 	xprt = svc_find_xprt(nfsd_serv, transport, AF_UNSPEC, port);
@@ -1191,29 +1207,45 @@ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)
 }
 
 #ifdef CONFIG_NFSD_V4
-extern time_t nfs4_leasetime(void);
-
-static ssize_t __write_leasetime(struct file *file, char *buf, size_t size)
+static ssize_t __nfsd4_write_time(struct file *file, char *buf, size_t size, time_t *time)
 {
-	/* if size > 10 seconds, call
-	 * nfs4_reset_lease() then write out the new lease (seconds) as reply
-	 */
 	char *mesg = buf;
-	int rv, lease;
+	int rv, i;
 
 	if (size > 0) {
 		if (nfsd_serv)
 			return -EBUSY;
-		rv = get_int(&mesg, &lease);
+		rv = get_int(&mesg, &i);
 		if (rv)
 			return rv;
-		if (lease < 10 || lease > 3600)
+		/*
+		 * Some sanity checking.  We don't have a reason for
+		 * these particular numbers, but problems with the
+		 * extremes are:
+		 *	- Too short: the briefest network outage may
+		 *	  cause clients to lose all their locks.  Also,
+		 *	  the frequent polling may be wasteful.
+		 *	- Too long: do you really want reboot recovery
+		 *	  to take more than an hour?  Or to make other
+		 *	  clients wait an hour before being able to
+		 *	  revoke a dead client's locks?
+		 */
+		if (i < 10 || i > 3600)
 			return -EINVAL;
-		nfs4_reset_lease(lease);
+		*time = i;
 	}
 
-	return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%ld\n",
-							nfs4_lease_time());
+	return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%ld\n", *time);
+}
+
+static ssize_t nfsd4_write_time(struct file *file, char *buf, size_t size, time_t *time)
+{
+	ssize_t rv;
+
+	mutex_lock(&nfsd_mutex);
+	rv = __nfsd4_write_time(file, buf, size, time);
+	mutex_unlock(&nfsd_mutex);
+	return rv;
 }
 
 /**
@@ -1239,12 +1271,22 @@ static ssize_t __write_leasetime(struct file *file, char *buf, size_t size)
  */
 static ssize_t write_leasetime(struct file *file, char *buf, size_t size)
 {
-	ssize_t rv;
+	return nfsd4_write_time(file, buf, size, &nfsd4_lease);
+}
 
-	mutex_lock(&nfsd_mutex);
-	rv = __write_leasetime(file, buf, size);
-	mutex_unlock(&nfsd_mutex);
-	return rv;
+/**
+ * write_gracetime - Set or report current NFSv4 grace period time
+ *
+ * As above, but sets the time of the NFSv4 grace period.
+ *
+ * Note this should never be set to less than the *previous*
+ * lease-period time, but we don't try to enforce this.  (In the common
+ * case (a new boot), we don't know what the previous lease time was
+ * anyway.)
+ */
+static ssize_t write_gracetime(struct file *file, char *buf, size_t size)
+{
+	return nfsd4_write_time(file, buf, size, &nfsd4_grace);
 }
 
 extern char *nfs4_recoverydir(void);
@@ -1338,6 +1380,7 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
 		[NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO},
 #ifdef CONFIG_NFSD_V4
 		[NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR},
+		[NFSD_Gracetime] = {"nfsv4gracetime", &transaction_ops, S_IWUSR|S_IRUSR},
 		[NFSD_RecoveryDir] = {"nfsv4recoverydir", &transaction_ops, S_IWUSR|S_IRUSR},
 #endif
 		/* last one */ {""}
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index e942a1aaac92..72377761270e 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -82,7 +82,6 @@ int nfs4_state_init(void);
 void nfsd4_free_slabs(void);
 int nfs4_state_start(void);
 void nfs4_state_shutdown(void);
-time_t nfs4_lease_time(void);
 void nfs4_reset_lease(time_t leasetime);
 int nfs4_reset_recoverydir(char *recdir);
 #else
@@ -90,7 +89,6 @@ static inline int nfs4_state_init(void) { return 0; }
 static inline void nfsd4_free_slabs(void) { }
 static inline int nfs4_state_start(void) { return 0; }
 static inline void nfs4_state_shutdown(void) { }
-static inline time_t nfs4_lease_time(void) { return 0; }
 static inline void nfs4_reset_lease(time_t leasetime) { }
 static inline int nfs4_reset_recoverydir(char *recdir) { return 0; }
 #endif
@@ -229,6 +227,9 @@ extern struct timeval	nfssvc_boot;
 
 #ifdef CONFIG_NFSD_V4
 
+extern time_t nfsd4_lease;
+extern time_t nfsd4_grace;
+
 /* before processing a COMPOUND operation, we have to check that there
  * is enough space in the buffer for XDR encode to succeed.  otherwise,
  * we might process an operation with side effects, and be unable to
@@ -247,7 +248,6 @@ extern struct timeval	nfssvc_boot;
 #define	COMPOUND_SLACK_SPACE		140    /* OP_GETFH */
 #define COMPOUND_ERR_SLACK_SPACE	12     /* OP_SETATTR */
 
-#define NFSD_LEASE_TIME                 (nfs4_lease_time())
 #define NFSD_LAUNDROMAT_MINTIMEOUT      10   /* seconds */
 
 /*
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 171699eb07c8..06b2a26edfe0 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -120,7 +120,7 @@ u32 nfsd_supported_minorversion;
 int nfsd_vers(int vers, enum vers_op change)
 {
 	if (vers < NFSD_MINVERS || vers >= NFSD_NRVERS)
-		return -1;
+		return 0;
 	switch(change) {
 	case NFSD_SET:
 		nfsd_versions[vers] = nfsd_version[vers];
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index fefeae27f25e..006c84230c7c 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -70,6 +70,16 @@ struct nfsd4_cb_sequence {
 	struct nfs4_client	*cbs_clp;
 };
 
+struct nfs4_rpc_args {
+	void				*args_op;
+	struct nfsd4_cb_sequence	args_seq;
+};
+
+struct nfsd4_callback {
+	struct nfs4_rpc_args cb_args;
+	struct work_struct cb_work;
+};
+
 struct nfs4_delegation {
 	struct list_head	dl_perfile;
 	struct list_head	dl_perclnt;
@@ -86,6 +96,7 @@ struct nfs4_delegation {
 	stateid_t		dl_stateid;
 	struct knfsd_fh		dl_fh;
 	int			dl_retries;
+	struct nfsd4_callback	dl_recall;
 };
 
 /* client delegation callback info */
@@ -96,9 +107,7 @@ struct nfs4_cb_conn {
 	u32                     cb_prog;
 	u32			cb_minorversion;
 	u32                     cb_ident;	/* minorversion 0 only */
-	/* RPC client info */
-	atomic_t		cb_set;     /* successful CB_NULL call */
-	struct rpc_clnt *       cb_client;
+	struct svc_xprt		*cb_xprt;	/* minorversion 1 only */
 };
 
 /* Maximum number of slots per session. 160 is useful for long haul TCP */
@@ -157,7 +166,7 @@ struct nfsd4_session {
 	struct list_head	se_hash;	/* hash by sessionid */
 	struct list_head	se_perclnt;
 	u32			se_flags;
-	struct nfs4_client	*se_client;	/* for expire_client */
+	struct nfs4_client	*se_client;
 	struct nfs4_sessionid	se_sessionid;
 	struct nfsd4_channel_attrs se_fchannel;
 	struct nfsd4_channel_attrs se_bchannel;
@@ -212,25 +221,41 @@ struct nfs4_client {
 	struct svc_cred		cl_cred; 	/* setclientid principal */
 	clientid_t		cl_clientid;	/* generated by server */
 	nfs4_verifier		cl_confirm;	/* generated by server */
-	struct nfs4_cb_conn	cl_cb_conn;     /* callback info */
-	atomic_t		cl_count;	/* ref count */
 	u32			cl_firststate;	/* recovery dir creation */
 
+	/* for v4.0 and v4.1 callbacks: */
+	struct nfs4_cb_conn	cl_cb_conn;
+	struct rpc_clnt		*cl_cb_client;
+	atomic_t		cl_cb_set;
+
 	/* for nfs41 */
 	struct list_head	cl_sessions;
 	struct nfsd4_clid_slot	cl_cs_slot;	/* create_session slot */
 	u32			cl_exchange_flags;
 	struct nfs4_sessionid	cl_sessionid;
+	/* number of rpc's in progress over an associated session: */
+	atomic_t		cl_refcount;
 
 	/* for nfs41 callbacks */
 	/* We currently support a single back channel with a single slot */
 	unsigned long		cl_cb_slot_busy;
 	u32			cl_cb_seq_nr;
-	struct svc_xprt		*cl_cb_xprt;	/* 4.1 callback transport */
 	struct rpc_wait_queue	cl_cb_waitq;	/* backchannel callers may */
 						/* wait here for slots */
 };
 
+static inline void
+mark_client_expired(struct nfs4_client *clp)
+{
+	clp->cl_time = 0;
+}
+
+static inline bool
+is_client_expired(struct nfs4_client *clp)
+{
+	return clp->cl_time == 0;
+}
+
 /* struct nfs4_client_reset
  * one per old client. Populates reset_str_hashtbl. Filled from conf_id_hashtbl
  * upon lease reset, or from upcall to state_daemon (to read in state
@@ -377,11 +402,14 @@ extern void nfs4_lock_state(void);
 extern void nfs4_unlock_state(void);
 extern int nfs4_in_grace(void);
 extern __be32 nfs4_check_open_reclaim(clientid_t *clid);
-extern void put_nfs4_client(struct nfs4_client *clp);
 extern void nfs4_free_stateowner(struct kref *kref);
 extern int set_callback_cred(void);
-extern void nfsd4_probe_callback(struct nfs4_client *clp);
+extern void nfsd4_probe_callback(struct nfs4_client *clp, struct nfs4_cb_conn *);
+extern void nfsd4_do_callback_rpc(struct work_struct *);
 extern void nfsd4_cb_recall(struct nfs4_delegation *dp);
+extern int nfsd4_create_callback_queue(void);
+extern void nfsd4_destroy_callback_queue(void);
+extern void nfsd4_set_callback_client(struct nfs4_client *, struct rpc_clnt *);
 extern void nfs4_put_delegation(struct nfs4_delegation *dp);
 extern __be32 nfs4_make_rec_clidname(char *clidname, struct xdr_netobj *clname);
 extern void nfsd4_init_recdir(char *recdir_name);
@@ -392,6 +420,7 @@ extern int nfs4_has_reclaimed_state(const char *name, bool use_exchange_id);
 extern void nfsd4_recdir_purge_old(void);
 extern int nfsd4_create_clid_dir(struct nfs4_client *clp);
 extern void nfsd4_remove_clid_dir(struct nfs4_client *clp);
+extern void release_session_client(struct nfsd4_session *);
 
 static inline void
 nfs4_put_stateowner(struct nfs4_stateowner *so)
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index c194793b642b..3c111120b619 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -20,13 +20,15 @@
 #include <linux/fcntl.h>
 #include <linux/namei.h>
 #include <linux/delay.h>
-#include <linux/quotaops.h>
 #include <linux/fsnotify.h>
 #include <linux/posix_acl_xattr.h>
 #include <linux/xattr.h>
 #include <linux/jhash.h>
 #include <linux/ima.h>
+#include <linux/slab.h>
 #include <asm/uaccess.h>
+#include <linux/exportfs.h>
+#include <linux/writeback.h>
 
 #ifdef CONFIG_NFSD_V3
 #include "xdr3.h"
@@ -271,6 +273,32 @@ out:
 	return err;
 }
 
+/*
+ * Commit metadata changes to stable storage.
+ */
+static int
+commit_metadata(struct svc_fh *fhp)
+{
+	struct inode *inode = fhp->fh_dentry->d_inode;
+	const struct export_operations *export_ops = inode->i_sb->s_export_op;
+	int error = 0;
+
+	if (!EX_ISSYNC(fhp->fh_export))
+		return 0;
+
+	if (export_ops->commit_metadata) {
+		error = export_ops->commit_metadata(inode);
+	} else {
+		struct writeback_control wbc = {
+			.sync_mode = WB_SYNC_ALL,
+			.nr_to_write = 0, /* metadata only */
+		};
+
+		error = sync_inode(inode, &wbc);
+	}
+
+	return error;
+}
 
 /*
  * Set various file attributes.
@@ -361,7 +389,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
 		 * If we are changing the size of the file, then
 		 * we need to break all leases.
 		 */
-		host_err = break_lease(inode, FMODE_WRITE | O_NONBLOCK);
+		host_err = break_lease(inode, O_WRONLY | O_NONBLOCK);
 		if (host_err == -EWOULDBLOCK)
 			host_err = -ETIMEDOUT;
 		if (host_err) /* ENOMEM or EWOULDBLOCK */
@@ -377,7 +405,6 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
 			put_write_access(inode);
 			goto out_nfserr;
 		}
-		vfs_dq_init(inode);
 	}
 
 	/* sanitize the mode change */
@@ -416,8 +443,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
 	if (size_change)
 		put_write_access(inode);
 	if (!err)
-		if (EX_ISSYNC(fhp->fh_export))
-			write_inode_now(inode, 1);
+		commit_metadata(fhp);
 out:
 	return err;
 
@@ -697,7 +723,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
 	struct inode	*inode;
 	int		flags = O_RDONLY|O_LARGEFILE;
 	__be32		err;
-	int		host_err;
+	int		host_err = 0;
 
 	validate_process_creds();
 
@@ -734,7 +760,8 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
 	 * Check to see if there are any leases on this file.
 	 * This may block while leases are broken.
 	 */
-	host_err = break_lease(inode, O_NONBLOCK | ((access & NFSD_MAY_WRITE) ? FMODE_WRITE : 0));
+	if (!(access & NFSD_MAY_NOT_BREAK_LEASE))
+		host_err = break_lease(inode, O_NONBLOCK | ((access & NFSD_MAY_WRITE) ? O_WRONLY : 0));
 	if (host_err == -EWOULDBLOCK)
 		host_err = -ETIMEDOUT;
 	if (host_err) /* NOMEM or WOULDBLOCK */
@@ -745,13 +772,13 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
 			flags = O_RDWR|O_LARGEFILE;
 		else
 			flags = O_WRONLY|O_LARGEFILE;
-
-		vfs_dq_init(inode);
 	}
 	*filp = dentry_open(dget(dentry), mntget(fhp->fh_export->ex_path.mnt),
 			    flags, current_cred());
 	if (IS_ERR(*filp))
 		host_err = PTR_ERR(*filp);
+	else
+		host_err = ima_file_check(*filp, access);
 out_nfserr:
 	err = nfserrno(host_err);
 out:
@@ -769,43 +796,6 @@ nfsd_close(struct file *filp)
 }
 
 /*
- * Sync a file
- * As this calls fsync (not fdatasync) there is no need for a write_inode
- * after it.
- */
-static inline int nfsd_dosync(struct file *filp, struct dentry *dp,
-			      const struct file_operations *fop)
-{
-	struct inode *inode = dp->d_inode;
-	int (*fsync) (struct file *, struct dentry *, int);
-	int err;
-
-	err = filemap_write_and_wait(inode->i_mapping);
-	if (err == 0 && fop && (fsync = fop->fsync))
-		err = fsync(filp, dp, 0);
-	return err;
-}
-
-static int
-nfsd_sync(struct file *filp)
-{
-        int err;
-	struct inode *inode = filp->f_path.dentry->d_inode;
-	dprintk("nfsd: sync file %s\n", filp->f_path.dentry->d_name.name);
-	mutex_lock(&inode->i_mutex);
-	err=nfsd_dosync(filp, filp->f_path.dentry, filp->f_op);
-	mutex_unlock(&inode->i_mutex);
-
-	return err;
-}
-
-int
-nfsd_sync_dir(struct dentry *dp)
-{
-	return nfsd_dosync(NULL, dp, dp->d_inode->i_fop);
-}
-
-/*
  * Obtain the readahead parameters for the file
  * specified by (dev, ino).
  */
@@ -1008,7 +998,7 @@ static int wait_for_concurrent_writes(struct file *file)
 
 	if (inode->i_state & I_DIRTY) {
 		dprintk("nfsd: write sync %d\n", task_pid_nr(current));
-		err = nfsd_sync(file);
+		err = vfs_fsync(file, 0);
 	}
 	last_ino = inode->i_ino;
 	last_dev = inode->i_sb->s_dev;
@@ -1156,8 +1146,9 @@ out:
 #ifdef CONFIG_NFSD_V3
 /*
  * Commit all pending writes to stable storage.
- * Strictly speaking, we could sync just the indicated file region here,
- * but there's currently no way we can ask the VFS to do so.
+ *
+ * Note: we only guarantee that data that lies within the range specified
+ * by the 'offset' and 'count' parameters will be synced.
  *
  * Unfortunately we cannot lock the file to make sure we return full WCC
  * data to the client, as locking happens lower down in the filesystem.
@@ -1167,23 +1158,32 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp,
                loff_t offset, unsigned long count)
 {
 	struct file	*file;
-	__be32		err;
+	loff_t		end = LLONG_MAX;
+	__be32		err = nfserr_inval;
 
-	if ((u64)count > ~(u64)offset)
-		return nfserr_inval;
+	if (offset < 0)
+		goto out;
+	if (count != 0) {
+		end = offset + (loff_t)count - 1;
+		if (end < offset)
+			goto out;
+	}
 
-	err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_WRITE, &file);
+	err = nfsd_open(rqstp, fhp, S_IFREG,
+			NFSD_MAY_WRITE|NFSD_MAY_NOT_BREAK_LEASE, &file);
 	if (err)
-		return err;
+		goto out;
 	if (EX_ISSYNC(fhp->fh_export)) {
-		if (file->f_op && file->f_op->fsync) {
-			err = nfserrno(nfsd_sync(file));
-		} else {
+		int err2 = vfs_fsync_range(file, offset, end, 0);
+
+		if (err2 != -EINVAL)
+			err = nfserrno(err2);
+		else
 			err = nfserr_notsupp;
-		}
 	}
 
 	nfsd_close(file);
+out:
 	return err;
 }
 #endif /* CONFIG_NFSD_V3 */
@@ -1336,12 +1336,14 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
 		goto out_nfserr;
 	}
 
-	if (EX_ISSYNC(fhp->fh_export)) {
-		err = nfserrno(nfsd_sync_dir(dentry));
-		write_inode_now(dchild->d_inode, 1);
-	}
+	err = nfsd_create_setattr(rqstp, resfhp, iap);
 
-	err2 = nfsd_create_setattr(rqstp, resfhp, iap);
+	/*
+	 * nfsd_setattr already committed the child.  Transactional filesystems
+	 * had a chance to commit changes for both parent and child
+	 * simultaneously making the following commit_metadata a noop.
+	 */
+	err2 = nfserrno(commit_metadata(fhp));
 	if (err2)
 		err = err2;
 	mnt_drop_write(fhp->fh_export->ex_path.mnt);
@@ -1373,7 +1375,6 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	struct dentry	*dentry, *dchild = NULL;
 	struct inode	*dirp;
 	__be32		err;
-	__be32		err2;
 	int		host_err;
 	__u32		v_mtime=0, v_atime=0;
 
@@ -1468,11 +1469,6 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	if (created)
 		*created = 1;
 
-	if (EX_ISSYNC(fhp->fh_export)) {
-		err = nfserrno(nfsd_sync_dir(dentry));
-		/* setattr will sync the child (or not) */
-	}
-
 	nfsd_check_ignore_resizing(iap);
 
 	if (createmode == NFS3_CREATE_EXCLUSIVE) {
@@ -1487,9 +1483,13 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	}
 
  set_attr:
-	err2 = nfsd_create_setattr(rqstp, resfhp, iap);
-	if (err2)
-		err = err2;
+	err = nfsd_create_setattr(rqstp, resfhp, iap);
+
+	/*
+	 * nfsd_setattr already committed the child (and possibly also the parent).
+	 */
+	if (!err)
+		err = nfserrno(commit_metadata(fhp));
 
 	mnt_drop_write(fhp->fh_export->ex_path.mnt);
 	/*
@@ -1604,12 +1604,9 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
 		}
 	} else
 		host_err = vfs_symlink(dentry->d_inode, dnew, path);
-
-	if (!host_err) {
-		if (EX_ISSYNC(fhp->fh_export))
-			host_err = nfsd_sync_dir(dentry);
-	}
 	err = nfserrno(host_err);
+	if (!err)
+		err = nfserrno(commit_metadata(fhp));
 	fh_unlock(fhp);
 
 	mnt_drop_write(fhp->fh_export->ex_path.mnt);
@@ -1671,11 +1668,9 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
 	}
 	host_err = vfs_link(dold, dirp, dnew);
 	if (!host_err) {
-		if (EX_ISSYNC(ffhp->fh_export)) {
-			err = nfserrno(nfsd_sync_dir(ddir));
-			write_inode_now(dest, 1);
-		}
-		err = 0;
+		err = nfserrno(commit_metadata(ffhp));
+		if (!err)
+			err = nfserrno(commit_metadata(tfhp));
 	} else {
 		if (host_err == -EXDEV && rqstp->rq_vers == 2)
 			err = nfserr_acces;
@@ -1771,10 +1766,10 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
 		goto out_dput_new;
 
 	host_err = vfs_rename(fdir, odentry, tdir, ndentry);
-	if (!host_err && EX_ISSYNC(tfhp->fh_export)) {
-		host_err = nfsd_sync_dir(tdentry);
+	if (!host_err) {
+		host_err = commit_metadata(tfhp);
 		if (!host_err)
-			host_err = nfsd_sync_dir(fdentry);
+			host_err = commit_metadata(ffhp);
 	}
 
 	mnt_drop_write(ffhp->fh_export->ex_path.mnt);
@@ -1855,12 +1850,9 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
 
 	dput(rdentry);
 
-	if (host_err)
-		goto out_drop;
-	if (EX_ISSYNC(fhp->fh_export))
-		host_err = nfsd_sync_dir(dentry);
+	if (!host_err)
+		host_err = commit_metadata(fhp);
 
-out_drop:
 	mnt_drop_write(fhp->fh_export->ex_path.mnt);
 out_nfserr:
 	err = nfserrno(host_err);
@@ -2127,7 +2119,6 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
 	 */
 	path.mnt = exp->ex_path.mnt;
 	path.dentry = dentry;
-	err = ima_path_check(&path, acc & (MAY_READ | MAY_WRITE | MAY_EXEC));
 nfsd_out:
 	return err? nfserrno(err) : 0;
 }
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index 4b1de0a9ea75..217a62c2a357 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -20,6 +20,7 @@
 #define NFSD_MAY_OWNER_OVERRIDE	64
 #define NFSD_MAY_LOCAL_ACCESS	128 /* IRIX doing local access check on device special file*/
 #define NFSD_MAY_BYPASS_GSS_ON_ROOT 256
+#define NFSD_MAY_NOT_BREAK_LEASE 512
 
 #define NFSD_MAY_CREATE		(NFSD_MAY_EXEC|NFSD_MAY_WRITE)
 #define NFSD_MAY_REMOVE		(NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC)
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index efa337739534..4d476ff08ae6 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -381,6 +381,10 @@ struct nfsd4_destroy_session {
 	struct nfs4_sessionid	sessionid;
 };
 
+struct nfsd4_reclaim_complete {
+	u32 rca_one_fs;
+};
+
 struct nfsd4_op {
 	int					opnum;
 	__be32					status;
@@ -421,6 +425,7 @@ struct nfsd4_op {
 		struct nfsd4_create_session	create_session;
 		struct nfsd4_destroy_session	destroy_session;
 		struct nfsd4_sequence		sequence;
+		struct nfsd4_reclaim_complete	reclaim_complete;
 	} u;
 	struct nfs4_replay *			replay;
 };
@@ -513,9 +518,8 @@ extern void nfsd4_store_cache_entry(struct nfsd4_compoundres *resp);
 extern __be32 nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
 		struct nfsd4_sequence *seq);
 extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp,
-		struct nfsd4_compound_state *,
-struct nfsd4_exchange_id *);
-		extern __be32 nfsd4_create_session(struct svc_rqst *,
+		struct nfsd4_compound_state *, struct nfsd4_exchange_id *);
+extern __be32 nfsd4_create_session(struct svc_rqst *,
 		struct nfsd4_compound_state *,
 		struct nfsd4_create_session *);
 extern __be32 nfsd4_sequence(struct svc_rqst *,
@@ -524,6 +528,7 @@ extern __be32 nfsd4_sequence(struct svc_rqst *,
 extern __be32 nfsd4_destroy_session(struct svc_rqst *,
 		struct nfsd4_compound_state *,
 		struct nfsd4_destroy_session *);
+__be32 nfsd4_reclaim_complete(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_reclaim_complete *);
 extern __be32 nfsd4_process_open1(struct nfsd4_compound_state *,
 		struct nfsd4_open *open);
 extern __be32 nfsd4_process_open2(struct svc_rqst *rqstp,
diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index 3f959f1879d8..d7fd696e595c 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -26,10 +26,16 @@
 #include <linux/buffer_head.h>
 #include <linux/fs.h>
 #include <linux/bitops.h>
+#include <linux/slab.h>
 #include "mdt.h"
 #include "alloc.h"
 
 
+/**
+ * nilfs_palloc_groups_per_desc_block - get the number of groups that a group
+ *					descriptor block can maintain
+ * @inode: inode of metadata file using this allocator
+ */
 static inline unsigned long
 nilfs_palloc_groups_per_desc_block(const struct inode *inode)
 {
@@ -37,12 +43,21 @@ nilfs_palloc_groups_per_desc_block(const struct inode *inode)
 		sizeof(struct nilfs_palloc_group_desc);
 }
 
+/**
+ * nilfs_palloc_groups_count - get maximum number of groups
+ * @inode: inode of metadata file using this allocator
+ */
 static inline unsigned long
 nilfs_palloc_groups_count(const struct inode *inode)
 {
 	return 1UL << (BITS_PER_LONG - (inode->i_blkbits + 3 /* log2(8) */));
 }
 
+/**
+ * nilfs_palloc_init_blockgroup - initialize private variables for allocator
+ * @inode: inode of metadata file using this allocator
+ * @entry_size: size of the persistent object
+ */
 int nilfs_palloc_init_blockgroup(struct inode *inode, unsigned entry_size)
 {
 	struct nilfs_mdt_info *mi = NILFS_MDT(inode);
@@ -68,6 +83,12 @@ int nilfs_palloc_init_blockgroup(struct inode *inode, unsigned entry_size)
 	return 0;
 }
 
+/**
+ * nilfs_palloc_group - get group number and offset from an entry number
+ * @inode: inode of metadata file using this allocator
+ * @nr: serial number of the entry (e.g. inode number)
+ * @offset: pointer to store offset number in the group
+ */
 static unsigned long nilfs_palloc_group(const struct inode *inode, __u64 nr,
 					unsigned long *offset)
 {
@@ -77,6 +98,14 @@ static unsigned long nilfs_palloc_group(const struct inode *inode, __u64 nr,
 	return group;
 }
 
+/**
+ * nilfs_palloc_desc_blkoff - get block offset of a group descriptor block
+ * @inode: inode of metadata file using this allocator
+ * @group: group number
+ *
+ * nilfs_palloc_desc_blkoff() returns block offset of the descriptor
+ * block which contains a descriptor of the specified group.
+ */
 static unsigned long
 nilfs_palloc_desc_blkoff(const struct inode *inode, unsigned long group)
 {
@@ -85,6 +114,14 @@ nilfs_palloc_desc_blkoff(const struct inode *inode, unsigned long group)
 	return desc_block * NILFS_MDT(inode)->mi_blocks_per_desc_block;
 }
 
+/**
+ * nilfs_palloc_bitmap_blkoff - get block offset of a bitmap block
+ * @inode: inode of metadata file using this allocator
+ * @group: group number
+ *
+ * nilfs_palloc_bitmap_blkoff() returns block offset of the bitmap
+ * block used to allocate/deallocate entries in the specified group.
+ */
 static unsigned long
 nilfs_palloc_bitmap_blkoff(const struct inode *inode, unsigned long group)
 {
@@ -94,6 +131,12 @@ nilfs_palloc_bitmap_blkoff(const struct inode *inode, unsigned long group)
 		desc_offset * NILFS_MDT(inode)->mi_blocks_per_group;
 }
 
+/**
+ * nilfs_palloc_group_desc_nfrees - get the number of free entries in a group
+ * @inode: inode of metadata file using this allocator
+ * @group: group number
+ * @desc: pointer to descriptor structure for the group
+ */
 static unsigned long
 nilfs_palloc_group_desc_nfrees(struct inode *inode, unsigned long group,
 			       const struct nilfs_palloc_group_desc *desc)
@@ -106,6 +149,13 @@ nilfs_palloc_group_desc_nfrees(struct inode *inode, unsigned long group,
 	return nfree;
 }
 
+/**
+ * nilfs_palloc_group_desc_add_entries - adjust count of free entries
+ * @inode: inode of metadata file using this allocator
+ * @group: group number
+ * @desc: pointer to descriptor structure for the group
+ * @n: delta to be added
+ */
 static void
 nilfs_palloc_group_desc_add_entries(struct inode *inode,
 				    unsigned long group,
@@ -117,6 +167,11 @@ nilfs_palloc_group_desc_add_entries(struct inode *inode,
 	spin_unlock(nilfs_mdt_bgl_lock(inode, group));
 }
 
+/**
+ * nilfs_palloc_entry_blkoff - get block offset of an entry block
+ * @inode: inode of metadata file using this allocator
+ * @nr: serial number of the entry (e.g. inode number)
+ */
 static unsigned long
 nilfs_palloc_entry_blkoff(const struct inode *inode, __u64 nr)
 {
@@ -128,6 +183,12 @@ nilfs_palloc_entry_blkoff(const struct inode *inode, __u64 nr)
 		group_offset / NILFS_MDT(inode)->mi_entries_per_block;
 }
 
+/**
+ * nilfs_palloc_desc_block_init - initialize buffer of a group descriptor block
+ * @inode: inode of metadata file
+ * @bh: buffer head of the buffer to be initialized
+ * @kaddr: kernel address mapped for the page including the buffer
+ */
 static void nilfs_palloc_desc_block_init(struct inode *inode,
 					 struct buffer_head *bh, void *kaddr)
 {
@@ -178,6 +239,13 @@ static int nilfs_palloc_get_block(struct inode *inode, unsigned long blkoff,
 	return ret;
 }
 
+/**
+ * nilfs_palloc_get_desc_block - get buffer head of a group descriptor block
+ * @inode: inode of metadata file using this allocator
+ * @group: group number
+ * @create: create flag
+ * @bhp: pointer to store the resultant buffer head
+ */
 static int nilfs_palloc_get_desc_block(struct inode *inode,
 				       unsigned long group,
 				       int create, struct buffer_head **bhp)
@@ -190,6 +258,13 @@ static int nilfs_palloc_get_desc_block(struct inode *inode,
 				      bhp, &cache->prev_desc, &cache->lock);
 }
 
+/**
+ * nilfs_palloc_get_bitmap_block - get buffer head of a bitmap block
+ * @inode: inode of metadata file using this allocator
+ * @group: group number
+ * @create: create flag
+ * @bhp: pointer to store the resultant buffer head
+ */
 static int nilfs_palloc_get_bitmap_block(struct inode *inode,
 					 unsigned long group,
 					 int create, struct buffer_head **bhp)
@@ -202,6 +277,13 @@ static int nilfs_palloc_get_bitmap_block(struct inode *inode,
 				      &cache->prev_bitmap, &cache->lock);
 }
 
+/**
+ * nilfs_palloc_get_entry_block - get buffer head of an entry block
+ * @inode: inode of metadata file using this allocator
+ * @nr: serial number of the entry (e.g. inode number)
+ * @create: create flag
+ * @bhp: pointer to store the resultant buffer head
+ */
 int nilfs_palloc_get_entry_block(struct inode *inode, __u64 nr,
 				 int create, struct buffer_head **bhp)
 {
@@ -213,6 +295,13 @@ int nilfs_palloc_get_entry_block(struct inode *inode, __u64 nr,
 				      &cache->prev_entry, &cache->lock);
 }
 
+/**
+ * nilfs_palloc_block_get_group_desc - get kernel address of a group descriptor
+ * @inode: inode of metadata file using this allocator
+ * @group: group number
+ * @bh: buffer head of the buffer storing the group descriptor block
+ * @kaddr: kernel address mapped for the page including the buffer
+ */
 static struct nilfs_palloc_group_desc *
 nilfs_palloc_block_get_group_desc(const struct inode *inode,
 				  unsigned long group,
@@ -222,6 +311,13 @@ nilfs_palloc_block_get_group_desc(const struct inode *inode,
 		group % nilfs_palloc_groups_per_desc_block(inode);
 }
 
+/**
+ * nilfs_palloc_block_get_entry - get kernel address of an entry
+ * @inode: inode of metadata file using this allocator
+ * @nr: serial number of the entry (e.g. inode number)
+ * @bh: buffer head of the buffer storing the entry block
+ * @kaddr: kernel address mapped for the page including the buffer
+ */
 void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr,
 				   const struct buffer_head *bh, void *kaddr)
 {
@@ -234,11 +330,19 @@ void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr,
 		entry_offset * NILFS_MDT(inode)->mi_entry_size;
 }
 
+/**
+ * nilfs_palloc_find_available_slot - find available slot in a group
+ * @inode: inode of metadata file using this allocator
+ * @group: group number
+ * @target: offset number of an entry in the group (start point)
+ * @bitmap: bitmap of the group
+ * @bsize: size in bits
+ */
 static int nilfs_palloc_find_available_slot(struct inode *inode,
 					    unsigned long group,
 					    unsigned long target,
 					    unsigned char *bitmap,
-					    int bsize)  /* size in bits */
+					    int bsize)
 {
 	int curr, pos, end, i;
 
@@ -276,6 +380,13 @@ static int nilfs_palloc_find_available_slot(struct inode *inode,
 	return -ENOSPC;
 }
 
+/**
+ * nilfs_palloc_rest_groups_in_desc_block - get the remaining number of groups
+ *					    in a group descriptor block
+ * @inode: inode of metadata file using this allocator
+ * @curr: current group number
+ * @max: maximum number of groups
+ */
 static unsigned long
 nilfs_palloc_rest_groups_in_desc_block(const struct inode *inode,
 				       unsigned long curr, unsigned long max)
@@ -286,6 +397,11 @@ nilfs_palloc_rest_groups_in_desc_block(const struct inode *inode,
 		     max - curr + 1);
 }
 
+/**
+ * nilfs_palloc_prepare_alloc_entry - prepare to allocate a persistent object
+ * @inode: inode of metadata file using this allocator
+ * @req: nilfs_palloc_req structure exchanged for the allocation
+ */
 int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
 				     struct nilfs_palloc_req *req)
 {
@@ -365,6 +481,11 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
 	return ret;
 }
 
+/**
+ * nilfs_palloc_commit_alloc_entry - finish allocation of a persistent object
+ * @inode: inode of metadata file using this allocator
+ * @req: nilfs_palloc_req structure exchanged for the allocation
+ */
 void nilfs_palloc_commit_alloc_entry(struct inode *inode,
 				     struct nilfs_palloc_req *req)
 {
@@ -376,6 +497,11 @@ void nilfs_palloc_commit_alloc_entry(struct inode *inode,
 	brelse(req->pr_desc_bh);
 }
 
+/**
+ * nilfs_palloc_commit_free_entry - finish deallocating a persistent object
+ * @inode: inode of metadata file using this allocator
+ * @req: nilfs_palloc_req structure exchanged for the removal
+ */
 void nilfs_palloc_commit_free_entry(struct inode *inode,
 				    struct nilfs_palloc_req *req)
 {
@@ -409,6 +535,11 @@ void nilfs_palloc_commit_free_entry(struct inode *inode,
 	brelse(req->pr_desc_bh);
 }
 
+/**
+ * nilfs_palloc_abort_alloc_entry - cancel allocation of a persistent object
+ * @inode: inode of metadata file using this allocator
+ * @req: nilfs_palloc_req structure exchanged for the allocation
+ */
 void nilfs_palloc_abort_alloc_entry(struct inode *inode,
 				    struct nilfs_palloc_req *req)
 {
@@ -425,7 +556,7 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode,
 	bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh);
 	if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group),
 				    group_offset, bitmap))
-		printk(KERN_WARNING "%s: entry numer %llu already freed\n",
+		printk(KERN_WARNING "%s: entry number %llu already freed\n",
 		       __func__, (unsigned long long)req->pr_entry_nr);
 
 	nilfs_palloc_group_desc_add_entries(inode, group, desc, 1);
@@ -441,6 +572,11 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode,
 	req->pr_desc_bh = NULL;
 }
 
+/**
+ * nilfs_palloc_prepare_free_entry - prepare to deallocate a persistent object
+ * @inode: inode of metadata file using this allocator
+ * @req: nilfs_palloc_req structure exchanged for the removal
+ */
 int nilfs_palloc_prepare_free_entry(struct inode *inode,
 				    struct nilfs_palloc_req *req)
 {
@@ -463,6 +599,11 @@ int nilfs_palloc_prepare_free_entry(struct inode *inode,
 	return 0;
 }
 
+/**
+ * nilfs_palloc_abort_free_entry - cancel deallocating a persistent object
+ * @inode: inode of metadata file using this allocator
+ * @req: nilfs_palloc_req structure exchanged for the removal
+ */
 void nilfs_palloc_abort_free_entry(struct inode *inode,
 				   struct nilfs_palloc_req *req)
 {
@@ -474,6 +615,12 @@ void nilfs_palloc_abort_free_entry(struct inode *inode,
 	req->pr_desc_bh = NULL;
 }
 
+/**
+ * nilfs_palloc_group_is_in - judge if an entry is in a group
+ * @inode: inode of metadata file using this allocator
+ * @group: group number
+ * @nr: serial number of the entry (e.g. inode number)
+ */
 static int
 nilfs_palloc_group_is_in(struct inode *inode, unsigned long group, __u64 nr)
 {
@@ -484,6 +631,12 @@ nilfs_palloc_group_is_in(struct inode *inode, unsigned long group, __u64 nr)
 	return (nr >= first) && (nr <= last);
 }
 
+/**
+ * nilfs_palloc_freev - deallocate a set of persistent objects
+ * @inode: inode of metadata file using this allocator
+ * @entry_nrs: array of entry numbers to be deallocated
+ * @nitems: number of entries stored in @entry_nrs
+ */
 int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
 {
 	struct buffer_head *desc_bh, *bitmap_bh;
diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h
index f4543ac4f560..9af34a7e6e13 100644
--- a/fs/nilfs2/alloc.h
+++ b/fs/nilfs2/alloc.h
@@ -29,6 +29,13 @@
 #include <linux/buffer_head.h>
 #include <linux/fs.h>
 
+/**
+ * nilfs_palloc_entries_per_group - get the number of entries per group
+ * @inode: inode of metadata file using this allocator
+ *
+ * The number of entries per group is defined by the number of bits
+ * that a bitmap block can maintain.
+ */
 static inline unsigned long
 nilfs_palloc_entries_per_group(const struct inode *inode)
 {
@@ -42,7 +49,7 @@ void *nilfs_palloc_block_get_entry(const struct inode *, __u64,
 				   const struct buffer_head *, void *);
 
 /**
- * nilfs_palloc_req - persistent alloctor request and reply
+ * nilfs_palloc_req - persistent allocator request and reply
  * @pr_entry_nr: entry number (vblocknr or inode number)
  * @pr_desc_bh: buffer head of the buffer containing block group descriptors
  * @pr_bitmap_bh: buffer head of the buffer containing a block group bitmap
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index 471e269536ae..447ce47a3306 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -27,6 +27,7 @@
 #include <linux/buffer_head.h>
 #include <linux/mm.h>
 #include <linux/backing-dev.h>
+#include <linux/gfp.h>
 #include "nilfs.h"
 #include "mdt.h"
 #include "dat.h"
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index 7cdd98b8d514..b27a342c5af6 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -31,63 +31,16 @@
 #include "alloc.h"
 #include "dat.h"
 
-/**
- * struct nilfs_btree_path - A path on which B-tree operations are executed
- * @bp_bh: buffer head of node block
- * @bp_sib_bh: buffer head of sibling node block
- * @bp_index: index of child node
- * @bp_oldreq: ptr end request for old ptr
- * @bp_newreq: ptr alloc request for new ptr
- * @bp_op: rebalance operation
- */
-struct nilfs_btree_path {
-	struct buffer_head *bp_bh;
-	struct buffer_head *bp_sib_bh;
-	int bp_index;
-	union nilfs_bmap_ptr_req bp_oldreq;
-	union nilfs_bmap_ptr_req bp_newreq;
-	struct nilfs_btnode_chkey_ctxt bp_ctxt;
-	void (*bp_op)(struct nilfs_btree *, struct nilfs_btree_path *,
-		      int, __u64 *, __u64 *);
-};
-
-/*
- * B-tree path operations
- */
-
-static struct kmem_cache *nilfs_btree_path_cache;
-
-int __init nilfs_btree_path_cache_init(void)
-{
-	nilfs_btree_path_cache =
-		kmem_cache_create("nilfs2_btree_path_cache",
-				  sizeof(struct nilfs_btree_path) *
-				  NILFS_BTREE_LEVEL_MAX, 0, 0, NULL);
-	return (nilfs_btree_path_cache != NULL) ? 0 : -ENOMEM;
-}
-
-void nilfs_btree_path_cache_destroy(void)
-{
-	kmem_cache_destroy(nilfs_btree_path_cache);
-}
-
-static inline struct nilfs_btree_path *nilfs_btree_alloc_path(void)
-{
-	return kmem_cache_alloc(nilfs_btree_path_cache, GFP_NOFS);
-}
-
-static inline void nilfs_btree_free_path(struct nilfs_btree_path *path)
+static struct nilfs_btree_path *nilfs_btree_alloc_path(void)
 {
-	kmem_cache_free(nilfs_btree_path_cache, path);
-}
+	struct nilfs_btree_path *path;
+	int level = NILFS_BTREE_LEVEL_DATA;
 
-static void nilfs_btree_init_path(struct nilfs_btree_path *path)
-{
-	int level;
+	path = kmem_cache_alloc(nilfs_btree_path_cache, GFP_NOFS);
+	if (path == NULL)
+		goto out;
 
-	for (level = NILFS_BTREE_LEVEL_DATA;
-	     level < NILFS_BTREE_LEVEL_MAX;
-	     level++) {
+	for (; level < NILFS_BTREE_LEVEL_MAX; level++) {
 		path[level].bp_bh = NULL;
 		path[level].bp_sib_bh = NULL;
 		path[level].bp_index = 0;
@@ -95,15 +48,19 @@ static void nilfs_btree_init_path(struct nilfs_btree_path *path)
 		path[level].bp_newreq.bpr_ptr = NILFS_BMAP_INVALID_PTR;
 		path[level].bp_op = NULL;
 	}
+
+out:
+	return path;
 }
 
-static void nilfs_btree_release_path(struct nilfs_btree_path *path)
+static void nilfs_btree_free_path(struct nilfs_btree_path *path)
 {
-	int level;
+	int level = NILFS_BTREE_LEVEL_DATA;
 
-	for (level = NILFS_BTREE_LEVEL_DATA; level < NILFS_BTREE_LEVEL_MAX;
-	     level++)
+	for (; level < NILFS_BTREE_LEVEL_MAX; level++)
 		brelse(path[level].bp_bh);
+
+	kmem_cache_free(nilfs_btree_path_cache, path);
 }
 
 /*
@@ -566,14 +523,12 @@ static int nilfs_btree_lookup(const struct nilfs_bmap *bmap,
 	path = nilfs_btree_alloc_path();
 	if (path == NULL)
 		return -ENOMEM;
-	nilfs_btree_init_path(path);
 
 	ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level);
 
 	if (ptrp != NULL)
 		*ptrp = ptr;
 
-	nilfs_btree_release_path(path);
 	nilfs_btree_free_path(path);
 
 	return ret;
@@ -594,7 +549,7 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
 	path = nilfs_btree_alloc_path();
 	if (path == NULL)
 		return -ENOMEM;
-	nilfs_btree_init_path(path);
+
 	ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level);
 	if (ret < 0)
 		goto out;
@@ -655,7 +610,6 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
 	*ptrp = ptr;
 	ret = cnt;
  out:
-	nilfs_btree_release_path(path);
 	nilfs_btree_free_path(path);
 	return ret;
 }
@@ -1123,7 +1077,6 @@ static int nilfs_btree_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
 	path = nilfs_btree_alloc_path();
 	if (path == NULL)
 		return -ENOMEM;
-	nilfs_btree_init_path(path);
 
 	ret = nilfs_btree_do_lookup(btree, path, key, NULL,
 				    NILFS_BTREE_LEVEL_NODE_MIN);
@@ -1140,7 +1093,6 @@ static int nilfs_btree_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
 	nilfs_bmap_add_blocks(bmap, stats.bs_nblocks);
 
  out:
-	nilfs_btree_release_path(path);
 	nilfs_btree_free_path(path);
 	return ret;
 }
@@ -1456,7 +1408,7 @@ static int nilfs_btree_delete(struct nilfs_bmap *bmap, __u64 key)
 	path = nilfs_btree_alloc_path();
 	if (path == NULL)
 		return -ENOMEM;
-	nilfs_btree_init_path(path);
+
 	ret = nilfs_btree_do_lookup(btree, path, key, NULL,
 				    NILFS_BTREE_LEVEL_NODE_MIN);
 	if (ret < 0)
@@ -1473,7 +1425,6 @@ static int nilfs_btree_delete(struct nilfs_bmap *bmap, __u64 key)
 	nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks);
 
 out:
-	nilfs_btree_release_path(path);
 	nilfs_btree_free_path(path);
 	return ret;
 }
@@ -1488,11 +1439,9 @@ static int nilfs_btree_last_key(const struct nilfs_bmap *bmap, __u64 *keyp)
 	path = nilfs_btree_alloc_path();
 	if (path == NULL)
 		return -ENOMEM;
-	nilfs_btree_init_path(path);
 
 	ret = nilfs_btree_do_lookup_last(btree, path, keyp, NULL);
 
-	nilfs_btree_release_path(path);
 	nilfs_btree_free_path(path);
 
 	return ret;
@@ -1879,7 +1828,7 @@ static int nilfs_btree_propagate_v(struct nilfs_btree *btree,
 				   struct nilfs_btree_path *path,
 				   int level, struct buffer_head *bh)
 {
-	int maxlevel, ret;
+	int maxlevel = 0, ret;
 	struct nilfs_btree_node *parent;
 	struct inode *dat = nilfs_bmap_get_dat(&btree->bt_bmap);
 	__u64 ptr;
@@ -1923,7 +1872,6 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
 	path = nilfs_btree_alloc_path();
 	if (path == NULL)
 		return -ENOMEM;
-	nilfs_btree_init_path(path);
 
 	if (buffer_nilfs_node(bh)) {
 		node = (struct nilfs_btree_node *)bh->b_data;
@@ -1947,7 +1895,6 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
 		nilfs_btree_propagate_p(btree, path, level, bh);
 
  out:
-	nilfs_btree_release_path(path);
 	nilfs_btree_free_path(path);
 
 	return ret;
@@ -2108,7 +2055,6 @@ static int nilfs_btree_assign(struct nilfs_bmap *bmap,
 	path = nilfs_btree_alloc_path();
 	if (path == NULL)
 		return -ENOMEM;
-	nilfs_btree_init_path(path);
 
 	if (buffer_nilfs_node(*bh)) {
 		node = (struct nilfs_btree_node *)(*bh)->b_data;
@@ -2130,7 +2076,6 @@ static int nilfs_btree_assign(struct nilfs_bmap *bmap,
 		nilfs_btree_assign_p(btree, path, level, bh, blocknr, binfo);
 
  out:
-	nilfs_btree_release_path(path);
 	nilfs_btree_free_path(path);
 
 	return ret;
@@ -2175,7 +2120,6 @@ static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level)
 	path = nilfs_btree_alloc_path();
 	if (path == NULL)
 		return -ENOMEM;
-	nilfs_btree_init_path(path);
 
 	ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level + 1);
 	if (ret < 0) {
@@ -2195,7 +2139,6 @@ static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level)
 		nilfs_bmap_set_dirty(&btree->bt_bmap);
 
  out:
-	nilfs_btree_release_path(path);
 	nilfs_btree_free_path(path);
 	return ret;
 }
diff --git a/fs/nilfs2/btree.h b/fs/nilfs2/btree.h
index 4b82d84ade75..43c8c5b541fd 100644
--- a/fs/nilfs2/btree.h
+++ b/fs/nilfs2/btree.h
@@ -30,9 +30,6 @@
 #include "btnode.h"
 #include "bmap.h"
 
-struct nilfs_btree;
-struct nilfs_btree_path;
-
 /**
  * struct nilfs_btree - B-tree structure
  * @bt_bmap: bmap base structure
@@ -41,6 +38,25 @@ struct nilfs_btree {
 	struct nilfs_bmap bt_bmap;
 };
 
+/**
+ * struct nilfs_btree_path - A path on which B-tree operations are executed
+ * @bp_bh: buffer head of node block
+ * @bp_sib_bh: buffer head of sibling node block
+ * @bp_index: index of child node
+ * @bp_oldreq: ptr end request for old ptr
+ * @bp_newreq: ptr alloc request for new ptr
+ * @bp_op: rebalance operation
+ */
+struct nilfs_btree_path {
+	struct buffer_head *bp_bh;
+	struct buffer_head *bp_sib_bh;
+	int bp_index;
+	union nilfs_bmap_ptr_req bp_oldreq;
+	union nilfs_bmap_ptr_req bp_newreq;
+	struct nilfs_btnode_chkey_ctxt bp_ctxt;
+	void (*bp_op)(struct nilfs_btree *, struct nilfs_btree_path *,
+		      int, __u64 *, __u64 *);
+};
 
 #define NILFS_BTREE_ROOT_SIZE		NILFS_BMAP_SIZE
 #define NILFS_BTREE_ROOT_NCHILDREN_MAX					\
@@ -57,9 +73,8 @@ struct nilfs_btree {
 #define NILFS_BTREE_KEY_MIN	((__u64)0)
 #define NILFS_BTREE_KEY_MAX	(~(__u64)0)
 
+extern struct kmem_cache *nilfs_btree_path_cache;
 
-int nilfs_btree_path_cache_init(void);
-void nilfs_btree_path_cache_destroy(void);
 int nilfs_btree_init(struct nilfs_bmap *);
 int nilfs_btree_convert_and_insert(struct nilfs_bmap *, __u64, __u64,
 				   const __u64 *, const __u64 *, int);
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index 187dd07ba86c..013146755683 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -288,7 +288,7 @@ int nilfs_dat_mark_dirty(struct inode *dat, __u64 vblocknr)
  * @vblocknrs and @nitems.
  *
  * Return Value: On success, 0 is returned. On error, one of the following
- * nagative error codes is returned.
+ * negative error codes is returned.
  *
  * %-EIO - I/O error.
  *
@@ -388,8 +388,7 @@ int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp)
 		ret = -ENOENT;
 		goto out;
 	}
-	if (blocknrp != NULL)
-		*blocknrp = blocknr;
+	*blocknrp = blocknr;
 
  out:
 	kunmap_atomic(kaddr, KM_USER0);
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index 76d803e060a9..85c89dfc71f0 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -224,7 +224,7 @@ fail:
  * len <= NILFS_NAME_LEN and de != NULL are guaranteed by caller.
  */
 static int
-nilfs_match(int len, const char * const name, struct nilfs_dir_entry *de)
+nilfs_match(int len, const unsigned char *name, struct nilfs_dir_entry *de)
 {
 	if (len != de->name_len)
 		return 0;
@@ -349,11 +349,11 @@ done:
  * Entry is guaranteed to be valid.
  */
 struct nilfs_dir_entry *
-nilfs_find_entry(struct inode *dir, struct dentry *dentry,
+nilfs_find_entry(struct inode *dir, const struct qstr *qstr,
 		 struct page **res_page)
 {
-	const char *name = dentry->d_name.name;
-	int namelen = dentry->d_name.len;
+	const unsigned char *name = qstr->name;
+	int namelen = qstr->len;
 	unsigned reclen = NILFS_DIR_REC_LEN(namelen);
 	unsigned long start, n;
 	unsigned long npages = dir_pages(dir);
@@ -396,7 +396,7 @@ nilfs_find_entry(struct inode *dir, struct dentry *dentry,
 		/* next page is past the blocks we've got */
 		if (unlikely(n > (dir->i_blocks >> (PAGE_CACHE_SHIFT - 9)))) {
 			nilfs_error(dir->i_sb, __func__,
-			       "dir %lu size %lld exceeds block cout %llu",
+			       "dir %lu size %lld exceeds block count %llu",
 			       dir->i_ino, dir->i_size,
 			       (unsigned long long)dir->i_blocks);
 			goto out;
@@ -424,13 +424,13 @@ struct nilfs_dir_entry *nilfs_dotdot(struct inode *dir, struct page **p)
 	return de;
 }
 
-ino_t nilfs_inode_by_name(struct inode *dir, struct dentry *dentry)
+ino_t nilfs_inode_by_name(struct inode *dir, const struct qstr *qstr)
 {
 	ino_t res = 0;
 	struct nilfs_dir_entry *de;
 	struct page *page;
 
-	de = nilfs_find_entry(dir, dentry, &page);
+	de = nilfs_find_entry(dir, qstr, &page);
 	if (de) {
 		res = le64_to_cpu(de->inode);
 		kunmap(page);
@@ -465,7 +465,7 @@ void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de,
 int nilfs_add_link(struct dentry *dentry, struct inode *inode)
 {
 	struct inode *dir = dentry->d_parent->d_inode;
-	const char *name = dentry->d_name.name;
+	const unsigned char *name = dentry->d_name.name;
 	int namelen = dentry->d_name.len;
 	unsigned chunk_size = nilfs_chunk_size(dir);
 	unsigned reclen = NILFS_DIR_REC_LEN(namelen);
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 30292df443ce..c9a30d7ff6fc 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -27,7 +27,7 @@
 #include "nilfs.h"
 #include "segment.h"
 
-int nilfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
+int nilfs_sync_file(struct file *file, int datasync)
 {
 	/*
 	 * Called from fsync() system call
@@ -37,7 +37,7 @@ int nilfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
 	 * This function should be implemented when the writeback function
 	 * will be implemented.
 	 */
-	struct inode *inode = dentry->d_inode;
+	struct inode *inode = file->f_mapping->host;
 	int err;
 
 	if (!nilfs_inode_dirty(inode))
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index e16a6664dfa2..145f03cd7d3e 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -28,10 +28,10 @@
  * gcinodes), and this file provides lookup function of the dummy
  * inodes and their buffer read function.
  *
- * Since NILFS2 keeps up multiple checkpoints/snapshots accross GC, it
+ * Since NILFS2 keeps up multiple checkpoints/snapshots across GC, it
  * has to treat blocks that belong to a same file but have different
  * checkpoint numbers.  To avoid interference among generations, dummy
- * inodes are managed separatly from actual inodes, and their lookup
+ * inodes are managed separately from actual inodes, and their lookup
  * function (nilfs_gc_iget) is designed to be specified with a
  * checkpoint number argument as well as an inode number.
  *
@@ -45,6 +45,7 @@
 #include <linux/buffer_head.h>
 #include <linux/mpage.h>
 #include <linux/hash.h>
+#include <linux/slab.h>
 #include <linux/swap.h>
 #include "nilfs.h"
 #include "page.h"
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 7868cc122ac7..39e038ac8fcb 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -22,6 +22,7 @@
  */
 
 #include <linux/buffer_head.h>
+#include <linux/gfp.h>
 #include <linux/mpage.h>
 #include <linux/writeback.h>
 #include <linux/uio.h>
@@ -279,16 +280,7 @@ struct inode *nilfs_new_inode(struct inode *dir, int mode)
 	/* reference count of i_bh inherits from nilfs_mdt_read_block() */
 
 	atomic_inc(&sbi->s_inodes_count);
-
-	inode->i_uid = current_fsuid();
-	if (dir->i_mode & S_ISGID) {
-		inode->i_gid = dir->i_gid;
-		if (S_ISDIR(mode))
-			mode |= S_ISGID;
-	} else
-		inode->i_gid = current_fsgid();
-
-	inode->i_mode = mode;
+	inode_init_owner(inode, dir, mode);
 	inode->i_ino = ino;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
 
@@ -450,7 +442,7 @@ static int __nilfs_read_inode(struct super_block *sb, unsigned long ino,
 		inode->i_op = &nilfs_special_inode_operations;
 		init_special_inode(
 			inode, inode->i_mode,
-			new_decode_dev(le64_to_cpu(raw_inode->i_device_code)));
+			huge_decode_dev(le64_to_cpu(raw_inode->i_device_code)));
 	}
 	nilfs_ifile_unmap_inode(sbi->s_ifile, ino, bh);
 	brelse(bh);
@@ -510,7 +502,7 @@ void nilfs_write_inode_common(struct inode *inode,
 		nilfs_bmap_write(ii->i_bmap, raw_inode);
 	else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
 		raw_inode->i_device_code =
-			cpu_to_le64(new_encode_dev(inode->i_rdev));
+			cpu_to_le64(huge_encode_dev(inode->i_rdev));
 	/* When extending inode, nilfs->ns_inode_size should be checked
 	   for substitutions of appended fields */
 }
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index d6b2b83de363..f90a33d9a5b0 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -23,9 +23,11 @@
 #include <linux/fs.h>
 #include <linux/wait.h>
 #include <linux/smp_lock.h>	/* lock_kernel(), unlock_kernel() */
+#include <linux/slab.h>
 #include <linux/capability.h>	/* capable() */
 #include <linux/uaccess.h>	/* copy_from_user(), copy_to_user() */
 #include <linux/vmalloc.h>
+#include <linux/mount.h>	/* mnt_want_write(), mnt_drop_write() */
 #include <linux/nilfs2_fs.h>
 #include "nilfs.h"
 #include "segment.h"
@@ -107,20 +109,28 @@ static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp,
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
+
+	ret = mnt_want_write(filp->f_path.mnt);
+	if (ret)
+		return ret;
+
+	ret = -EFAULT;
 	if (copy_from_user(&cpmode, argp, sizeof(cpmode)))
-		return -EFAULT;
+		goto out;
 
 	mutex_lock(&nilfs->ns_mount_mutex);
+
 	nilfs_transaction_begin(inode->i_sb, &ti, 0);
 	ret = nilfs_cpfile_change_cpmode(
 		cpfile, cpmode.cm_cno, cpmode.cm_mode);
-	if (unlikely(ret < 0)) {
+	if (unlikely(ret < 0))
 		nilfs_transaction_abort(inode->i_sb);
-		mutex_unlock(&nilfs->ns_mount_mutex);
-		return ret;
-	}
-	nilfs_transaction_commit(inode->i_sb); /* never fails */
+	else
+		nilfs_transaction_commit(inode->i_sb); /* never fails */
+
 	mutex_unlock(&nilfs->ns_mount_mutex);
+out:
+	mnt_drop_write(filp->f_path.mnt);
 	return ret;
 }
 
@@ -135,16 +145,23 @@ nilfs_ioctl_delete_checkpoint(struct inode *inode, struct file *filp,
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
+
+	ret = mnt_want_write(filp->f_path.mnt);
+	if (ret)
+		return ret;
+
+	ret = -EFAULT;
 	if (copy_from_user(&cno, argp, sizeof(cno)))
-		return -EFAULT;
+		goto out;
 
 	nilfs_transaction_begin(inode->i_sb, &ti, 0);
 	ret = nilfs_cpfile_delete_checkpoint(cpfile, cno);
-	if (unlikely(ret < 0)) {
+	if (unlikely(ret < 0))
 		nilfs_transaction_abort(inode->i_sb);
-		return ret;
-	}
-	nilfs_transaction_commit(inode->i_sb); /* never fails */
+	else
+		nilfs_transaction_commit(inode->i_sb); /* never fails */
+out:
+	mnt_drop_write(filp->f_path.mnt);
 	return ret;
 }
 
@@ -496,12 +513,19 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
+	ret = mnt_want_write(filp->f_path.mnt);
+	if (ret)
+		return ret;
+
+	ret = -EFAULT;
 	if (copy_from_user(argv, argp, sizeof(argv)))
-		return -EFAULT;
+		goto out;
 
+	ret = -EINVAL;
 	nsegs = argv[4].v_nmembs;
 	if (argv[4].v_size != argsz[4])
-		return -EINVAL;
+		goto out;
+
 	/*
 	 * argv[4] points to segment numbers this ioctl cleans.  We
 	 * use kmalloc() for its buffer because memory used for the
@@ -509,9 +533,10 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
 	 */
 	kbufs[4] = memdup_user((void __user *)(unsigned long)argv[4].v_base,
 			       nsegs * sizeof(__u64));
-	if (IS_ERR(kbufs[4]))
-		return PTR_ERR(kbufs[4]);
-
+	if (IS_ERR(kbufs[4])) {
+		ret = PTR_ERR(kbufs[4]);
+		goto out;
+	}
 	nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
 
 	for (n = 0; n < 4; n++) {
@@ -563,10 +588,12 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
 		nilfs_remove_all_gcinode(nilfs);
 	clear_nilfs_gc_running(nilfs);
 
- out_free:
+out_free:
 	while (--n >= 0)
 		vfree(kbufs[n]);
 	kfree(kbufs[4]);
+out:
+	mnt_drop_write(filp->f_path.mnt);
 	return ret;
 }
 
@@ -575,13 +602,17 @@ static int nilfs_ioctl_sync(struct inode *inode, struct file *filp,
 {
 	__u64 cno;
 	int ret;
+	struct the_nilfs *nilfs;
 
 	ret = nilfs_construct_segment(inode->i_sb);
 	if (ret < 0)
 		return ret;
 
 	if (argp != NULL) {
-		cno = NILFS_SB(inode->i_sb)->s_nilfs->ns_cno - 1;
+		nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
+		down_read(&nilfs->ns_segctor_sem);
+		cno = nilfs->ns_cno - 1;
+		up_read(&nilfs->ns_segctor_sem);
 		if (copy_to_user(argp, &cno, sizeof(cno)))
 			return -EFAULT;
 	}
@@ -618,7 +649,7 @@ static int nilfs_ioctl_get_info(struct inode *inode, struct file *filp,
 long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
 	struct inode *inode = filp->f_dentry->d_inode;
-	void __user *argp = (void * __user *)arg;
+	void __user *argp = (void __user *)arg;
 
 	switch (cmd) {
 	case NILFS_IOCTL_CHANGE_CPMODE:
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 06713ffcc7f2..024be8c35bb6 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -26,6 +26,7 @@
 #include <linux/writeback.h>
 #include <linux/backing-dev.h>
 #include <linux/swap.h>
+#include <linux/slab.h>
 #include "nilfs.h"
 #include "segment.h"
 #include "page.h"
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 07ba838ef089..ad6ed2cf19b4 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -67,7 +67,7 @@ nilfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
 	if (dentry->d_name.len > NILFS_NAME_LEN)
 		return ERR_PTR(-ENAMETOOLONG);
 
-	ino = nilfs_inode_by_name(dir, dentry);
+	ino = nilfs_inode_by_name(dir, &dentry->d_name);
 	inode = NULL;
 	if (ino) {
 		inode = nilfs_iget(dir->i_sb, ino);
@@ -81,10 +81,7 @@ struct dentry *nilfs_get_parent(struct dentry *child)
 {
 	unsigned long ino;
 	struct inode *inode;
-	struct dentry dotdot;
-
-	dotdot.d_name.name = "..";
-	dotdot.d_name.len = 2;
+	struct qstr dotdot = {.name = "..", .len = 2};
 
 	ino = nilfs_inode_by_name(child->d_inode, &dotdot);
 	if (!ino)
@@ -296,7 +293,7 @@ static int nilfs_do_unlink(struct inode *dir, struct dentry *dentry)
 	int err;
 
 	err = -ENOENT;
-	de = nilfs_find_entry(dir, dentry, &page);
+	de = nilfs_find_entry(dir, &dentry->d_name, &page);
 	if (!de)
 		goto out;
 
@@ -389,7 +386,7 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		return err;
 
 	err = -ENOENT;
-	old_de = nilfs_find_entry(old_dir, old_dentry, &old_page);
+	old_de = nilfs_find_entry(old_dir, &old_dentry->d_name, &old_page);
 	if (!old_de)
 		goto out;
 
@@ -409,7 +406,7 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 			goto out_dir;
 
 		err = -ENOENT;
-		new_de = nilfs_find_entry(new_dir, new_dentry, &new_page);
+		new_de = nilfs_find_entry(new_dir, &new_dentry->d_name, &new_page);
 		if (!new_de)
 			goto out_dir;
 		inc_nlink(old_inode);
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 4da6f67e9a91..47d6d7928122 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -217,10 +217,10 @@ static inline int nilfs_init_acl(struct inode *inode, struct inode *dir)
 
 /* dir.c */
 extern int nilfs_add_link(struct dentry *, struct inode *);
-extern ino_t nilfs_inode_by_name(struct inode *, struct dentry *);
+extern ino_t nilfs_inode_by_name(struct inode *, const struct qstr *);
 extern int nilfs_make_empty(struct inode *, struct inode *);
 extern struct nilfs_dir_entry *
-nilfs_find_entry(struct inode *, struct dentry *, struct page **);
+nilfs_find_entry(struct inode *, const struct qstr *, struct page **);
 extern int nilfs_delete_entry(struct nilfs_dir_entry *, struct page *);
 extern int nilfs_empty_dir(struct inode *);
 extern struct nilfs_dir_entry *nilfs_dotdot(struct inode *, struct page **);
@@ -228,7 +228,7 @@ extern void nilfs_set_link(struct inode *, struct nilfs_dir_entry *,
 			   struct page *, struct inode *);
 
 /* file.c */
-extern int nilfs_sync_file(struct file *, struct dentry *, int);
+extern int nilfs_sync_file(struct file *, int);
 
 /* ioctl.c */
 long nilfs_ioctl(struct file *, unsigned int, unsigned long);
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index a2692bbc7b50..8de3e1e48130 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -29,6 +29,7 @@
 #include <linux/list.h>
 #include <linux/highmem.h>
 #include <linux/pagevec.h>
+#include <linux/gfp.h>
 #include "nilfs.h"
 #include "page.h"
 #include "mdt.h"
@@ -292,7 +293,7 @@ void nilfs_free_private_page(struct page *page)
  * @src: source page
  * @copy_dirty: flag whether to copy dirty states on the page's buffer heads.
  *
- * This fuction is for both data pages and btnode pages.  The dirty flag
+ * This function is for both data pages and btnode pages.  The dirty flag
  * should be treated by caller.  The page must not be under i/o.
  * Both src and dst page must be locked
  */
@@ -388,7 +389,7 @@ repeat:
 }
 
 /**
- * nilfs_copy_back_pages -- copy back pages to orignal cache from shadow cache
+ * nilfs_copy_back_pages -- copy back pages to original cache from shadow cache
  * @dmap: destination page cache
  * @smap: source page cache
  *
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index c9c96c7825dc..bae2a516b4ee 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -23,6 +23,7 @@
 #include <linux/buffer_head.h>
 #include <linux/blkdev.h>
 #include <linux/swap.h>
+#include <linux/slab.h>
 #include <linux/crc32.h>
 #include "nilfs.h"
 #include "segment.h"
@@ -39,7 +40,6 @@ enum {
 	NILFS_SEG_FAIL_IO,
 	NILFS_SEG_FAIL_MAGIC,
 	NILFS_SEG_FAIL_SEQ,
-	NILFS_SEG_FAIL_CHECKSUM_SEGSUM,
 	NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT,
 	NILFS_SEG_FAIL_CHECKSUM_FULL,
 	NILFS_SEG_FAIL_CONSISTENCY,
@@ -71,10 +71,6 @@ static int nilfs_warn_segment_error(int err)
 		printk(KERN_WARNING
 		       "NILFS warning: Sequence number mismatch\n");
 		break;
-	case NILFS_SEG_FAIL_CHECKSUM_SEGSUM:
-		printk(KERN_WARNING
-		       "NILFS warning: Checksum error in segment summary\n");
-		break;
 	case NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT:
 		printk(KERN_WARNING
 		       "NILFS warning: Checksum error in super root\n");
@@ -109,6 +105,8 @@ static void store_segsum_info(struct nilfs_segsum_info *ssi,
 
 	ssi->nsumblk = DIV_ROUND_UP(ssi->sumbytes, blocksize);
 	ssi->nfileblk = ssi->nblocks - ssi->nsumblk - !!NILFS_SEG_HAS_SR(ssi);
+
+	/* need to verify ->ss_bytes field if read ->ss_cno */
 }
 
 /**
@@ -206,19 +204,15 @@ int nilfs_read_super_root_block(struct super_block *sb, sector_t sr_block,
  * @pseg_start: start disk block number of partial segment
  * @seg_seq: sequence number requested
  * @ssi: pointer to nilfs_segsum_info struct to store information
- * @full_check: full check flag
- *              (0: only checks segment summary CRC, 1: data CRC)
  */
 static int
 load_segment_summary(struct nilfs_sb_info *sbi, sector_t pseg_start,
-		     u64 seg_seq, struct nilfs_segsum_info *ssi,
-		     int full_check)
+		     u64 seg_seq, struct nilfs_segsum_info *ssi)
 {
 	struct buffer_head *bh_sum;
 	struct nilfs_segment_summary *sum;
-	unsigned long offset, nblock;
-	u64 check_bytes;
-	u32 crc, crc_sum;
+	unsigned long nblock;
+	u32 crc;
 	int ret = NILFS_SEG_FAIL_IO;
 
 	bh_sum = sb_bread(sbi->s_super, pseg_start);
@@ -237,34 +231,24 @@ load_segment_summary(struct nilfs_sb_info *sbi, sector_t pseg_start,
 		ret = NILFS_SEG_FAIL_SEQ;
 		goto failed;
 	}
-	if (full_check) {
-		offset = sizeof(sum->ss_datasum);
-		check_bytes =
-			((u64)ssi->nblocks << sbi->s_super->s_blocksize_bits);
-		nblock = ssi->nblocks;
-		crc_sum = le32_to_cpu(sum->ss_datasum);
-		ret = NILFS_SEG_FAIL_CHECKSUM_FULL;
-	} else { /* only checks segment summary */
-		offset = sizeof(sum->ss_datasum) + sizeof(sum->ss_sumsum);
-		check_bytes = ssi->sumbytes;
-		nblock = ssi->nsumblk;
-		crc_sum = le32_to_cpu(sum->ss_sumsum);
-		ret = NILFS_SEG_FAIL_CHECKSUM_SEGSUM;
-	}
 
+	nblock = ssi->nblocks;
 	if (unlikely(nblock == 0 ||
 		     nblock > sbi->s_nilfs->ns_blocks_per_segment)) {
 		/* This limits the number of blocks read in the CRC check */
 		ret = NILFS_SEG_FAIL_CONSISTENCY;
 		goto failed;
 	}
-	if (calc_crc_cont(sbi, bh_sum, &crc, offset, check_bytes,
+	if (calc_crc_cont(sbi, bh_sum, &crc, sizeof(sum->ss_datasum),
+			  ((u64)nblock << sbi->s_super->s_blocksize_bits),
 			  pseg_start, nblock)) {
 		ret = NILFS_SEG_FAIL_IO;
 		goto failed;
 	}
-	if (crc == crc_sum)
+	if (crc == le32_to_cpu(sum->ss_datasum))
 		ret = 0;
+	else
+		ret = NILFS_SEG_FAIL_CHECKSUM_FULL;
  failed:
 	brelse(bh_sum);
  out:
@@ -598,7 +582,7 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
 
 	while (segnum != ri->ri_segnum || pseg_start <= ri->ri_pseg_start) {
 
-		ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi, 1);
+		ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi);
 		if (ret) {
 			if (ret == NILFS_SEG_FAIL_IO) {
 				err = -EIO;
@@ -821,7 +805,7 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi,
 
 	for (;;) {
 		/* Load segment summary */
-		ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi, 1);
+		ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi);
 		if (ret) {
 			if (ret == NILFS_SEG_FAIL_IO)
 				goto failed;
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 645c78656aa0..2e6a2723b8fa 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -25,6 +25,7 @@
 #include <linux/writeback.h>
 #include <linux/crc32.h>
 #include <linux/backing-dev.h>
+#include <linux/slab.h>
 #include "page.h"
 #include "segbuf.h"
 
@@ -32,36 +33,16 @@
 struct nilfs_write_info {
 	struct the_nilfs       *nilfs;
 	struct bio	       *bio;
-	int 			start, end; /* The region to be submitted */
+	int			start, end; /* The region to be submitted */
 	int			rest_blocks;
 	int			max_pages;
 	int			nr_vecs;
 	sector_t		blocknr;
 };
 
-
-static struct kmem_cache *nilfs_segbuf_cachep;
-
-static void nilfs_segbuf_init_once(void *obj)
-{
-	memset(obj, 0, sizeof(struct nilfs_segment_buffer));
-}
-
-int __init nilfs_init_segbuf_cache(void)
-{
-	nilfs_segbuf_cachep =
-		kmem_cache_create("nilfs2_segbuf_cache",
-				  sizeof(struct nilfs_segment_buffer),
-				  0, SLAB_RECLAIM_ACCOUNT,
-				  nilfs_segbuf_init_once);
-
-	return (nilfs_segbuf_cachep == NULL) ? -ENOMEM : 0;
-}
-
-void nilfs_destroy_segbuf_cache(void)
-{
-	kmem_cache_destroy(nilfs_segbuf_cachep);
-}
+static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
+			      struct the_nilfs *nilfs);
+static int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf);
 
 struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *sb)
 {
@@ -75,6 +56,7 @@ struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *sb)
 	INIT_LIST_HEAD(&segbuf->sb_list);
 	INIT_LIST_HEAD(&segbuf->sb_segsum_buffers);
 	INIT_LIST_HEAD(&segbuf->sb_payload_buffers);
+	segbuf->sb_super_root = NULL;
 
 	init_completion(&segbuf->sb_bio_event);
 	atomic_set(&segbuf->sb_err, 0);
@@ -152,7 +134,7 @@ int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *segbuf,
 }
 
 int nilfs_segbuf_reset(struct nilfs_segment_buffer *segbuf, unsigned flags,
-		       time_t ctime)
+		       time_t ctime, __u64 cno)
 {
 	int err;
 
@@ -165,11 +147,12 @@ int nilfs_segbuf_reset(struct nilfs_segment_buffer *segbuf, unsigned flags,
 	segbuf->sb_sum.sumbytes = sizeof(struct nilfs_segment_summary);
 	segbuf->sb_sum.nfinfo = segbuf->sb_sum.nfileblk = 0;
 	segbuf->sb_sum.ctime = ctime;
+	segbuf->sb_sum.cno = cno;
 	return 0;
 }
 
 /*
- * Setup segument summary
+ * Setup segment summary
  */
 void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *segbuf)
 {
@@ -190,13 +173,14 @@ void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *segbuf)
 	raw_sum->ss_nfinfo   = cpu_to_le32(segbuf->sb_sum.nfinfo);
 	raw_sum->ss_sumbytes = cpu_to_le32(segbuf->sb_sum.sumbytes);
 	raw_sum->ss_pad      = 0;
+	raw_sum->ss_cno      = cpu_to_le64(segbuf->sb_sum.cno);
 }
 
 /*
  * CRC calculation routines
  */
-void nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *segbuf,
-				     u32 seed)
+static void
+nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *segbuf, u32 seed)
 {
 	struct buffer_head *bh;
 	struct nilfs_segment_summary *raw_sum;
@@ -223,8 +207,8 @@ void nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *segbuf,
 	raw_sum->ss_sumsum = cpu_to_le32(crc);
 }
 
-void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *segbuf,
-				   u32 seed)
+static void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *segbuf,
+					  u32 seed)
 {
 	struct buffer_head *bh;
 	struct nilfs_segment_summary *raw_sum;
@@ -250,6 +234,20 @@ void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *segbuf,
 	raw_sum->ss_datasum = cpu_to_le32(crc);
 }
 
+static void
+nilfs_segbuf_fill_in_super_root_crc(struct nilfs_segment_buffer *segbuf,
+				    u32 seed)
+{
+	struct nilfs_super_root *raw_sr;
+	u32 crc;
+
+	raw_sr = (struct nilfs_super_root *)segbuf->sb_super_root->b_data;
+	crc = crc32_le(seed,
+		       (unsigned char *)raw_sr + sizeof(raw_sr->sr_sum),
+		       NILFS_SR_BYTES - sizeof(raw_sr->sr_sum));
+	raw_sr->sr_sum = cpu_to_le32(crc);
+}
+
 static void nilfs_release_buffers(struct list_head *list)
 {
 	struct buffer_head *bh, *n;
@@ -276,6 +274,7 @@ static void nilfs_segbuf_clear(struct nilfs_segment_buffer *segbuf)
 {
 	nilfs_release_buffers(&segbuf->sb_segsum_buffers);
 	nilfs_release_buffers(&segbuf->sb_payload_buffers);
+	segbuf->sb_super_root = NULL;
 }
 
 /*
@@ -302,17 +301,47 @@ void nilfs_truncate_logs(struct list_head *logs,
 	}
 }
 
+int nilfs_write_logs(struct list_head *logs, struct the_nilfs *nilfs)
+{
+	struct nilfs_segment_buffer *segbuf;
+	int ret = 0;
+
+	list_for_each_entry(segbuf, logs, sb_list) {
+		ret = nilfs_segbuf_write(segbuf, nilfs);
+		if (ret)
+			break;
+	}
+	return ret;
+}
+
 int nilfs_wait_on_logs(struct list_head *logs)
 {
 	struct nilfs_segment_buffer *segbuf;
-	int err;
+	int err, ret = 0;
 
 	list_for_each_entry(segbuf, logs, sb_list) {
 		err = nilfs_segbuf_wait(segbuf);
-		if (err)
-			return err;
+		if (err && !ret)
+			ret = err;
+	}
+	return ret;
+}
+
+/**
+ * nilfs_add_checksums_on_logs - add checksums on the logs
+ * @logs: list of segment buffers storing target logs
+ * @seed: checksum seed value
+ */
+void nilfs_add_checksums_on_logs(struct list_head *logs, u32 seed)
+{
+	struct nilfs_segment_buffer *segbuf;
+
+	list_for_each_entry(segbuf, logs, sb_list) {
+		if (segbuf->sb_super_root)
+			nilfs_segbuf_fill_in_super_root_crc(segbuf, seed);
+		nilfs_segbuf_fill_in_segsum_crc(segbuf, seed);
+		nilfs_segbuf_fill_in_data_crc(segbuf, seed);
 	}
-	return 0;
 }
 
 /*
@@ -452,8 +481,8 @@ static int nilfs_segbuf_submit_bh(struct nilfs_segment_buffer *segbuf,
  *
  * %-ENOMEM - Insufficient memory available.
  */
-int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
-		       struct the_nilfs *nilfs)
+static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
+			      struct the_nilfs *nilfs)
 {
 	struct nilfs_write_info wi;
 	struct buffer_head *bh;
@@ -496,7 +525,7 @@ int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
  *
  * %-EIO - I/O error
  */
-int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf)
+static int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf)
 {
 	int err = 0;
 
diff --git a/fs/nilfs2/segbuf.h b/fs/nilfs2/segbuf.h
index 6af1630fb401..85fbb66455e2 100644
--- a/fs/nilfs2/segbuf.h
+++ b/fs/nilfs2/segbuf.h
@@ -37,6 +37,7 @@
  * @sumbytes: Byte count of segment summary
  * @nfileblk: Total number of file blocks
  * @seg_seq: Segment sequence number
+ * @cno: Checkpoint number
  * @ctime: Creation time
  * @next: Block number of the next full segment
  */
@@ -48,6 +49,7 @@ struct nilfs_segsum_info {
 	unsigned long		sumbytes;
 	unsigned long		nfileblk;
 	u64			seg_seq;
+	__u64			cno;
 	time_t			ctime;
 	sector_t		next;
 };
@@ -76,6 +78,7 @@ struct nilfs_segsum_info {
  * @sb_rest_blocks: Number of residual blocks in the current segment
  * @sb_segsum_buffers: List of buffers for segment summaries
  * @sb_payload_buffers: List of buffers for segment payload
+ * @sb_super_root: Pointer to buffer storing a super root block (if exists)
  * @sb_nbio: Number of flying bio requests
  * @sb_err: I/O error status
  * @sb_bio_event: Completion event of log writing
@@ -95,6 +98,7 @@ struct nilfs_segment_buffer {
 	/* Buffers */
 	struct list_head	sb_segsum_buffers;
 	struct list_head	sb_payload_buffers; /* including super root */
+	struct buffer_head     *sb_super_root;
 
 	/* io status */
 	int			sb_nbio;
@@ -121,9 +125,8 @@ struct nilfs_segment_buffer {
 		    b_assoc_buffers))
 #define NILFS_SEGBUF_BH_IS_LAST(bh, head)  ((bh)->b_assoc_buffers.next == head)
 
+extern struct kmem_cache *nilfs_segbuf_cachep;
 
-int __init nilfs_init_segbuf_cache(void);
-void nilfs_destroy_segbuf_cache(void);
 struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *);
 void nilfs_segbuf_free(struct nilfs_segment_buffer *);
 void nilfs_segbuf_map(struct nilfs_segment_buffer *, __u64, unsigned long,
@@ -132,13 +135,11 @@ void nilfs_segbuf_map_cont(struct nilfs_segment_buffer *segbuf,
 			   struct nilfs_segment_buffer *prev);
 void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *, __u64,
 				  struct the_nilfs *);
-int nilfs_segbuf_reset(struct nilfs_segment_buffer *, unsigned, time_t);
+int nilfs_segbuf_reset(struct nilfs_segment_buffer *, unsigned, time_t, __u64);
 int nilfs_segbuf_extend_segsum(struct nilfs_segment_buffer *);
 int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *,
 				struct buffer_head **);
 void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *);
-void nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *, u32);
-void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *, u32);
 
 static inline void
 nilfs_segbuf_add_segsum_buffer(struct nilfs_segment_buffer *segbuf,
@@ -166,14 +167,12 @@ nilfs_segbuf_add_file_buffer(struct nilfs_segment_buffer *segbuf,
 	segbuf->sb_sum.nfileblk++;
 }
 
-int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
-		       struct the_nilfs *nilfs);
-int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf);
-
 void nilfs_clear_logs(struct list_head *logs);
 void nilfs_truncate_logs(struct list_head *logs,
 			 struct nilfs_segment_buffer *last);
+int nilfs_write_logs(struct list_head *logs, struct the_nilfs *nilfs);
 int nilfs_wait_on_logs(struct list_head *logs);
+void nilfs_add_checksums_on_logs(struct list_head *logs, u32 seed);
 
 static inline void nilfs_destroy_logs(struct list_head *logs)
 {
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 17584c524486..c9201649cc49 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -32,6 +32,7 @@
 #include <linux/kthread.h>
 #include <linux/crc32.h>
 #include <linux/pagevec.h>
+#include <linux/slab.h>
 #include "nilfs.h"
 #include "btnode.h"
 #include "page.h"
@@ -115,42 +116,6 @@ static void nilfs_dispose_list(struct nilfs_sb_info *, struct list_head *,
 #define nilfs_cnt32_lt(a, b)  nilfs_cnt32_gt(b, a)
 #define nilfs_cnt32_le(a, b)  nilfs_cnt32_ge(b, a)
 
-/*
- * Transaction
- */
-static struct kmem_cache *nilfs_transaction_cachep;
-
-/**
- * nilfs_init_transaction_cache - create a cache for nilfs_transaction_info
- *
- * nilfs_init_transaction_cache() creates a slab cache for the struct
- * nilfs_transaction_info.
- *
- * Return Value: On success, it returns 0. On error, one of the following
- * negative error code is returned.
- *
- * %-ENOMEM - Insufficient memory available.
- */
-int nilfs_init_transaction_cache(void)
-{
-	nilfs_transaction_cachep =
-		kmem_cache_create("nilfs2_transaction_cache",
-				  sizeof(struct nilfs_transaction_info),
-				  0, SLAB_RECLAIM_ACCOUNT, NULL);
-	return (nilfs_transaction_cachep == NULL) ? -ENOMEM : 0;
-}
-
-/**
- * nilfs_detroy_transaction_cache - destroy the cache for transaction info
- *
- * nilfs_destroy_transaction_cache() frees the slab cache for the struct
- * nilfs_transaction_info.
- */
-void nilfs_destroy_transaction_cache(void)
-{
-	kmem_cache_destroy(nilfs_transaction_cachep);
-}
-
 static int nilfs_prepare_segment_lock(struct nilfs_transaction_info *ti)
 {
 	struct nilfs_transaction_info *cur_ti = current->journal_info;
@@ -201,7 +166,7 @@ static int nilfs_prepare_segment_lock(struct nilfs_transaction_info *ti)
  * This function allocates a nilfs_transaction_info struct to keep context
  * information on it.  It is initialized and hooked onto the current task in
  * the outermost call.  If a pre-allocated struct is given to @ti, it is used
- * instead; othewise a new struct is assigned from a slab.
+ * instead; otherwise a new struct is assigned from a slab.
  *
  * When @vacancy_check flag is set, this function will check the amount of
  * free space, and will wait for the GC to reclaim disk space if low capacity.
@@ -401,7 +366,8 @@ static int nilfs_segctor_reset_segment_buffer(struct nilfs_sc_info *sci)
 
 	if (nilfs_doing_gc())
 		flags = NILFS_SS_GC;
-	err = nilfs_segbuf_reset(segbuf, flags, sci->sc_seg_ctime);
+	err = nilfs_segbuf_reset(segbuf, flags, sci->sc_seg_ctime,
+				 sci->sc_sbi->s_nilfs->ns_cno);
 	if (unlikely(err))
 		return err;
 
@@ -434,7 +400,7 @@ static int nilfs_segctor_add_super_root(struct nilfs_sc_info *sci)
 			return err;
 		segbuf = sci->sc_curseg;
 	}
-	err = nilfs_segbuf_extend_payload(segbuf, &sci->sc_super_root);
+	err = nilfs_segbuf_extend_payload(segbuf, &segbuf->sb_super_root);
 	if (likely(!err))
 		segbuf->sb_sum.flags |= NILFS_SS_SR;
 	return err;
@@ -598,7 +564,7 @@ static void nilfs_write_file_node_binfo(struct nilfs_sc_info *sci,
 	*vblocknr = binfo->bi_v.bi_vblocknr;
 }
 
-struct nilfs_sc_operations nilfs_sc_file_ops = {
+static struct nilfs_sc_operations nilfs_sc_file_ops = {
 	.collect_data = nilfs_collect_file_data,
 	.collect_node = nilfs_collect_file_node,
 	.collect_bmap = nilfs_collect_file_bmap,
@@ -648,7 +614,7 @@ static void nilfs_write_dat_node_binfo(struct nilfs_sc_info *sci,
 	*binfo_dat = binfo->bi_dat;
 }
 
-struct nilfs_sc_operations nilfs_sc_dat_ops = {
+static struct nilfs_sc_operations nilfs_sc_dat_ops = {
 	.collect_data = nilfs_collect_dat_data,
 	.collect_node = nilfs_collect_file_node,
 	.collect_bmap = nilfs_collect_dat_bmap,
@@ -656,7 +622,7 @@ struct nilfs_sc_operations nilfs_sc_dat_ops = {
 	.write_node_binfo = nilfs_write_dat_node_binfo,
 };
 
-struct nilfs_sc_operations nilfs_sc_dsync_ops = {
+static struct nilfs_sc_operations nilfs_sc_dsync_ops = {
 	.collect_data = nilfs_collect_file_data,
 	.collect_node = NULL,
 	.collect_bmap = NULL,
@@ -931,43 +897,16 @@ static void nilfs_segctor_fill_in_file_bmap(struct nilfs_sc_info *sci,
 	}
 }
 
-/*
- * CRC calculation routines
- */
-static void nilfs_fill_in_super_root_crc(struct buffer_head *bh_sr, u32 seed)
-{
-	struct nilfs_super_root *raw_sr =
-		(struct nilfs_super_root *)bh_sr->b_data;
-	u32 crc;
-
-	crc = crc32_le(seed,
-		       (unsigned char *)raw_sr + sizeof(raw_sr->sr_sum),
-		       NILFS_SR_BYTES - sizeof(raw_sr->sr_sum));
-	raw_sr->sr_sum = cpu_to_le32(crc);
-}
-
-static void nilfs_segctor_fill_in_checksums(struct nilfs_sc_info *sci,
-					    u32 seed)
-{
-	struct nilfs_segment_buffer *segbuf;
-
-	if (sci->sc_super_root)
-		nilfs_fill_in_super_root_crc(sci->sc_super_root, seed);
-
-	list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
-		nilfs_segbuf_fill_in_segsum_crc(segbuf, seed);
-		nilfs_segbuf_fill_in_data_crc(segbuf, seed);
-	}
-}
-
 static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci,
 					     struct the_nilfs *nilfs)
 {
-	struct buffer_head *bh_sr = sci->sc_super_root;
-	struct nilfs_super_root *raw_sr =
-		(struct nilfs_super_root *)bh_sr->b_data;
+	struct buffer_head *bh_sr;
+	struct nilfs_super_root *raw_sr;
 	unsigned isz = nilfs->ns_inode_size;
 
+	bh_sr = NILFS_LAST_SEGBUF(&sci->sc_segbufs)->sb_super_root;
+	raw_sr = (struct nilfs_super_root *)bh_sr->b_data;
+
 	raw_sr->sr_bytes = cpu_to_le16(NILFS_SR_BYTES);
 	raw_sr->sr_nongc_ctime
 		= cpu_to_le64(nilfs_doing_gc() ?
@@ -1490,7 +1429,6 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci,
 
 	/* Collection retry loop */
 	for (;;) {
-		sci->sc_super_root = NULL;
 		sci->sc_nblk_this_inc = 0;
 		sci->sc_curseg = NILFS_FIRST_SEGBUF(&sci->sc_segbufs);
 
@@ -1510,6 +1448,12 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci,
 		if (mode != SC_LSEG_SR || sci->sc_stage.scnt < NILFS_ST_CPFILE)
 			break;
 
+		nilfs_clear_logs(&sci->sc_segbufs);
+
+		err = nilfs_segctor_extend_segments(sci, nilfs, nadd);
+		if (unlikely(err))
+			return err;
+
 		if (sci->sc_stage.flags & NILFS_CF_SUFREED) {
 			err = nilfs_sufile_cancel_freev(nilfs->ns_sufile,
 							sci->sc_freesegs,
@@ -1517,12 +1461,6 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci,
 							NULL);
 			WARN_ON(err); /* do not happen */
 		}
-		nilfs_clear_logs(&sci->sc_segbufs);
-
-		err = nilfs_segctor_extend_segments(sci, nilfs, nadd);
-		if (unlikely(err))
-			return err;
-
 		nadd = min_t(int, nadd << 1, SC_MAX_SEGDELTA);
 		sci->sc_stage = prev_stage;
 	}
@@ -1567,7 +1505,7 @@ nilfs_segctor_update_payload_blocknr(struct nilfs_sc_info *sci,
 	ssp.offset = sizeof(struct nilfs_segment_summary);
 
 	list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) {
-		if (bh == sci->sc_super_root)
+		if (bh == segbuf->sb_super_root)
 			break;
 		if (!finfo) {
 			finfo =	nilfs_segctor_map_segsum_entry(
@@ -1728,7 +1666,7 @@ static int nilfs_segctor_prepare_write(struct nilfs_sc_info *sci,
 
 		list_for_each_entry(bh, &segbuf->sb_payload_buffers,
 				    b_assoc_buffers) {
-			if (bh == sci->sc_super_root) {
+			if (bh == segbuf->sb_super_root) {
 				if (bh->b_page != bd_page) {
 					lock_page(bd_page);
 					clear_page_dirty_for_io(bd_page);
@@ -1764,14 +1702,9 @@ static int nilfs_segctor_prepare_write(struct nilfs_sc_info *sci,
 static int nilfs_segctor_write(struct nilfs_sc_info *sci,
 			       struct the_nilfs *nilfs)
 {
-	struct nilfs_segment_buffer *segbuf;
-	int ret = 0;
+	int ret;
 
-	list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
-		ret = nilfs_segbuf_write(segbuf, nilfs);
-		if (ret)
-			break;
-	}
+	ret = nilfs_write_logs(&sci->sc_segbufs, nilfs);
 	list_splice_tail_init(&sci->sc_segbufs, &sci->sc_write_logs);
 	return ret;
 }
@@ -1852,7 +1785,7 @@ static void nilfs_clear_copied_buffers(struct list_head *list, int err)
 }
 
 static void nilfs_abort_logs(struct list_head *logs, struct page *failed_page,
-			     struct buffer_head *bh_sr, int err)
+			     int err)
 {
 	struct nilfs_segment_buffer *segbuf;
 	struct page *bd_page = NULL, *fs_page = NULL;
@@ -1873,7 +1806,7 @@ static void nilfs_abort_logs(struct list_head *logs, struct page *failed_page,
 
 		list_for_each_entry(bh, &segbuf->sb_payload_buffers,
 				    b_assoc_buffers) {
-			if (bh == bh_sr) {
+			if (bh == segbuf->sb_super_root) {
 				if (bh->b_page != bd_page) {
 					end_page_writeback(bd_page);
 					bd_page = bh->b_page;
@@ -1902,8 +1835,7 @@ static void nilfs_segctor_abort_construction(struct nilfs_sc_info *sci,
 
 	list_splice_tail_init(&sci->sc_write_logs, &logs);
 	ret = nilfs_wait_on_logs(&logs);
-	if (ret)
-		nilfs_abort_logs(&logs, NULL, sci->sc_super_root, ret);
+	nilfs_abort_logs(&logs, NULL, ret ? : err);
 
 	list_splice_tail_init(&sci->sc_segbufs, &logs);
 	nilfs_cancel_segusage(&logs, nilfs->ns_sufile);
@@ -1919,7 +1851,6 @@ static void nilfs_segctor_abort_construction(struct nilfs_sc_info *sci,
 	}
 
 	nilfs_destroy_logs(&logs);
-	sci->sc_super_root = NULL;
 }
 
 static void nilfs_set_next_segment(struct the_nilfs *nilfs,
@@ -1937,9 +1868,8 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
 {
 	struct nilfs_segment_buffer *segbuf;
 	struct page *bd_page = NULL, *fs_page = NULL;
-	struct nilfs_sb_info *sbi = sci->sc_sbi;
-	struct the_nilfs *nilfs = sbi->s_nilfs;
-	int update_sr = (sci->sc_super_root != NULL);
+	struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs;
+	int update_sr = false;
 
 	list_for_each_entry(segbuf, &sci->sc_write_logs, sb_list) {
 		struct buffer_head *bh;
@@ -1970,11 +1900,12 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
 			set_buffer_uptodate(bh);
 			clear_buffer_dirty(bh);
 			clear_buffer_nilfs_volatile(bh);
-			if (bh == sci->sc_super_root) {
+			if (bh == segbuf->sb_super_root) {
 				if (bh->b_page != bd_page) {
 					end_page_writeback(bd_page);
 					bd_page = bh->b_page;
 				}
+				update_sr = true;
 				break;
 			}
 			if (bh->b_page != fs_page) {
@@ -2020,7 +1951,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
 	if (update_sr) {
 		nilfs_set_last_segment(nilfs, segbuf->sb_pseg_start,
 				       segbuf->sb_sum.seg_seq, nilfs->ns_cno++);
-		sbi->s_super->s_dirt = 1;
+		set_nilfs_sb_dirty(nilfs);
 
 		clear_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags);
 		clear_bit(NILFS_SC_DIRTY, &sci->sc_flags);
@@ -2121,7 +2052,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
 	struct nilfs_sb_info *sbi = sci->sc_sbi;
 	struct the_nilfs *nilfs = sbi->s_nilfs;
 	struct page *failed_page;
-	int err, has_sr = 0;
+	int err;
 
 	sci->sc_stage.scnt = NILFS_ST_INIT;
 
@@ -2149,8 +2080,6 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
 		if (unlikely(err))
 			goto failed;
 
-		has_sr = (sci->sc_super_root != NULL);
-
 		/* Avoid empty segment */
 		if (sci->sc_stage.scnt == NILFS_ST_DONE &&
 		    NILFS_SEG_EMPTY(&sci->sc_curseg->sb_sum)) {
@@ -2165,7 +2094,8 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
 		if (sci->sc_stage.flags & NILFS_CF_IFILE_STARTED)
 			nilfs_segctor_fill_in_file_bmap(sci, sbi->s_ifile);
 
-		if (has_sr) {
+		if (mode == SC_LSEG_SR &&
+		    sci->sc_stage.scnt >= NILFS_ST_CPFILE) {
 			err = nilfs_segctor_fill_in_checkpoint(sci);
 			if (unlikely(err))
 				goto failed_to_write;
@@ -2177,11 +2107,12 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
 		/* Write partial segments */
 		err = nilfs_segctor_prepare_write(sci, &failed_page);
 		if (err) {
-			nilfs_abort_logs(&sci->sc_segbufs, failed_page,
-					 sci->sc_super_root, err);
+			nilfs_abort_logs(&sci->sc_segbufs, failed_page, err);
 			goto failed_to_write;
 		}
-		nilfs_segctor_fill_in_checksums(sci, nilfs->ns_crc_seed);
+
+		nilfs_add_checksums_on_logs(&sci->sc_segbufs,
+					    nilfs->ns_crc_seed);
 
 		err = nilfs_segctor_write(sci, nilfs);
 		if (unlikely(err))
@@ -2202,8 +2133,6 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
 		}
 	} while (sci->sc_stage.scnt != NILFS_ST_DONE);
 
-	sci->sc_super_root = NULL;
-
  out:
 	nilfs_segctor_check_out_files(sci, sbi);
 	return err;
@@ -2220,7 +2149,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
 }
 
 /**
- * nilfs_secgtor_start_timer - set timer of background write
+ * nilfs_segctor_start_timer - set timer of background write
  * @sci: nilfs_sc_info
  *
  * If the timer has already been set, it ignores the new request.
@@ -2230,9 +2159,9 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
 static void nilfs_segctor_start_timer(struct nilfs_sc_info *sci)
 {
 	spin_lock(&sci->sc_state_lock);
-	if (sci->sc_timer && !(sci->sc_state & NILFS_SEGCTOR_COMMIT)) {
-		sci->sc_timer->expires = jiffies + sci->sc_interval;
-		add_timer(sci->sc_timer);
+	if (!(sci->sc_state & NILFS_SEGCTOR_COMMIT)) {
+		sci->sc_timer.expires = jiffies + sci->sc_interval;
+		add_timer(&sci->sc_timer);
 		sci->sc_state |= NILFS_SEGCTOR_COMMIT;
 	}
 	spin_unlock(&sci->sc_state_lock);
@@ -2425,77 +2354,82 @@ int nilfs_construct_dsync_segment(struct super_block *sb, struct inode *inode,
 	return err;
 }
 
-struct nilfs_segctor_req {
-	int mode;
-	__u32 seq_accepted;
-	int sc_err;  /* construction failure */
-	int sb_err;  /* super block writeback failure */
-};
-
 #define FLUSH_FILE_BIT	(0x1) /* data file only */
 #define FLUSH_DAT_BIT	(1 << NILFS_DAT_INO) /* DAT only */
 
-static void nilfs_segctor_accept(struct nilfs_sc_info *sci,
-				 struct nilfs_segctor_req *req)
+/**
+ * nilfs_segctor_accept - record accepted sequence count of log-write requests
+ * @sci: segment constructor object
+ */
+static void nilfs_segctor_accept(struct nilfs_sc_info *sci)
 {
-	req->sc_err = req->sb_err = 0;
 	spin_lock(&sci->sc_state_lock);
-	req->seq_accepted = sci->sc_seq_request;
+	sci->sc_seq_accepted = sci->sc_seq_request;
 	spin_unlock(&sci->sc_state_lock);
-
-	if (sci->sc_timer)
-		del_timer_sync(sci->sc_timer);
+	del_timer_sync(&sci->sc_timer);
 }
 
-static void nilfs_segctor_notify(struct nilfs_sc_info *sci,
-				 struct nilfs_segctor_req *req)
+/**
+ * nilfs_segctor_notify - notify the result of request to caller threads
+ * @sci: segment constructor object
+ * @mode: mode of log forming
+ * @err: error code to be notified
+ */
+static void nilfs_segctor_notify(struct nilfs_sc_info *sci, int mode, int err)
 {
 	/* Clear requests (even when the construction failed) */
 	spin_lock(&sci->sc_state_lock);
 
-	if (req->mode == SC_LSEG_SR) {
+	if (mode == SC_LSEG_SR) {
 		sci->sc_state &= ~NILFS_SEGCTOR_COMMIT;
-		sci->sc_seq_done = req->seq_accepted;
-		nilfs_segctor_wakeup(sci, req->sc_err ? : req->sb_err);
+		sci->sc_seq_done = sci->sc_seq_accepted;
+		nilfs_segctor_wakeup(sci, err);
 		sci->sc_flush_request = 0;
 	} else {
-		if (req->mode == SC_FLUSH_FILE)
+		if (mode == SC_FLUSH_FILE)
 			sci->sc_flush_request &= ~FLUSH_FILE_BIT;
-		else if (req->mode == SC_FLUSH_DAT)
+		else if (mode == SC_FLUSH_DAT)
 			sci->sc_flush_request &= ~FLUSH_DAT_BIT;
 
 		/* re-enable timer if checkpoint creation was not done */
-		if (sci->sc_timer && (sci->sc_state & NILFS_SEGCTOR_COMMIT) &&
-		    time_before(jiffies, sci->sc_timer->expires))
-			add_timer(sci->sc_timer);
+		if ((sci->sc_state & NILFS_SEGCTOR_COMMIT) &&
+		    time_before(jiffies, sci->sc_timer.expires))
+			add_timer(&sci->sc_timer);
 	}
 	spin_unlock(&sci->sc_state_lock);
 }
 
-static int nilfs_segctor_construct(struct nilfs_sc_info *sci,
-				   struct nilfs_segctor_req *req)
+/**
+ * nilfs_segctor_construct - form logs and write them to disk
+ * @sci: segment constructor object
+ * @mode: mode of log forming
+ */
+static int nilfs_segctor_construct(struct nilfs_sc_info *sci, int mode)
 {
 	struct nilfs_sb_info *sbi = sci->sc_sbi;
 	struct the_nilfs *nilfs = sbi->s_nilfs;
 	int err = 0;
 
+	nilfs_segctor_accept(sci);
+
 	if (nilfs_discontinued(nilfs))
-		req->mode = SC_LSEG_SR;
-	if (!nilfs_segctor_confirm(sci)) {
-		err = nilfs_segctor_do_construct(sci, req->mode);
-		req->sc_err = err;
-	}
+		mode = SC_LSEG_SR;
+	if (!nilfs_segctor_confirm(sci))
+		err = nilfs_segctor_do_construct(sci, mode);
+
 	if (likely(!err)) {
-		if (req->mode != SC_FLUSH_DAT)
+		if (mode != SC_FLUSH_DAT)
 			atomic_set(&nilfs->ns_ndirtyblks, 0);
 		if (test_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags) &&
 		    nilfs_discontinued(nilfs)) {
 			down_write(&nilfs->ns_sem);
-			req->sb_err = nilfs_commit_super(sbi,
-					nilfs_altsb_need_update(nilfs));
+			err = nilfs_commit_super(
+				sbi, nilfs_altsb_need_update(nilfs));
 			up_write(&nilfs->ns_sem);
 		}
 	}
+
+	nilfs_segctor_notify(sci, mode, err);
 	return err;
 }
 
@@ -2526,7 +2460,6 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
 	struct nilfs_sc_info *sci = NILFS_SC(sbi);
 	struct the_nilfs *nilfs = sbi->s_nilfs;
 	struct nilfs_transaction_info ti;
-	struct nilfs_segctor_req req = { .mode = SC_LSEG_SR };
 	int err;
 
 	if (unlikely(!sci))
@@ -2547,10 +2480,8 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
 	list_splice_tail_init(&nilfs->ns_gc_inodes, &sci->sc_gc_inodes);
 
 	for (;;) {
-		nilfs_segctor_accept(sci, &req);
-		err = nilfs_segctor_construct(sci, &req);
+		err = nilfs_segctor_construct(sci, SC_LSEG_SR);
 		nilfs_remove_written_gcinodes(nilfs, &sci->sc_gc_inodes);
-		nilfs_segctor_notify(sci, &req);
 
 		if (likely(!err))
 			break;
@@ -2560,6 +2491,16 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
 		set_current_state(TASK_INTERRUPTIBLE);
 		schedule_timeout(sci->sc_interval);
 	}
+	if (nilfs_test_opt(sbi, DISCARD)) {
+		int ret = nilfs_discard_segments(nilfs, sci->sc_freesegs,
+						 sci->sc_nfreesegs);
+		if (ret) {
+			printk(KERN_WARNING
+			       "NILFS warning: error %d on discard request, "
+			       "turning discards off for the device\n", ret);
+			nilfs_clear_opt(sbi, DISCARD);
+		}
+	}
 
  out_unlock:
 	sci->sc_freesegs = NULL;
@@ -2573,13 +2514,9 @@ static void nilfs_segctor_thread_construct(struct nilfs_sc_info *sci, int mode)
 {
 	struct nilfs_sb_info *sbi = sci->sc_sbi;
 	struct nilfs_transaction_info ti;
-	struct nilfs_segctor_req req = { .mode = mode };
 
 	nilfs_transaction_lock(sbi, &ti, 0);
-
-	nilfs_segctor_accept(sci, &req);
-	nilfs_segctor_construct(sci, &req);
-	nilfs_segctor_notify(sci, &req);
+	nilfs_segctor_construct(sci, mode);
 
 	/*
 	 * Unclosed segment should be retried.  We do this using sc_timer.
@@ -2635,13 +2572,11 @@ static int nilfs_segctor_flush_mode(struct nilfs_sc_info *sci)
 static int nilfs_segctor_thread(void *arg)
 {
 	struct nilfs_sc_info *sci = (struct nilfs_sc_info *)arg;
-	struct timer_list timer;
+	struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs;
 	int timeout = 0;
 
-	init_timer(&timer);
-	timer.data = (unsigned long)current;
-	timer.function = nilfs_construction_timeout;
-	sci->sc_timer = &timer;
+	sci->sc_timer.data = (unsigned long)current;
+	sci->sc_timer.function = nilfs_construction_timeout;
 
 	/* start sync. */
 	sci->sc_task = current;
@@ -2680,7 +2615,6 @@ static int nilfs_segctor_thread(void *arg)
 	} else {
 		DEFINE_WAIT(wait);
 		int should_sleep = 1;
-		struct the_nilfs *nilfs;
 
 		prepare_to_wait(&sci->sc_wait_daemon, &wait,
 				TASK_INTERRUPTIBLE);
@@ -2691,7 +2625,7 @@ static int nilfs_segctor_thread(void *arg)
 			should_sleep = 0;
 		else if (sci->sc_state & NILFS_SEGCTOR_COMMIT)
 			should_sleep = time_before(jiffies,
-						   sci->sc_timer->expires);
+					sci->sc_timer.expires);
 
 		if (should_sleep) {
 			spin_unlock(&sci->sc_state_lock);
@@ -2700,17 +2634,15 @@ static int nilfs_segctor_thread(void *arg)
 		}
 		finish_wait(&sci->sc_wait_daemon, &wait);
 		timeout = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) &&
-			   time_after_eq(jiffies, sci->sc_timer->expires));
-		nilfs = sci->sc_sbi->s_nilfs;
-		if (sci->sc_super->s_dirt && nilfs_sb_need_update(nilfs))
+			   time_after_eq(jiffies, sci->sc_timer.expires));
+
+		if (nilfs_sb_dirty(nilfs) && nilfs_sb_need_update(nilfs))
 			set_nilfs_discontinued(nilfs);
 	}
 	goto loop;
 
  end_thread:
 	spin_unlock(&sci->sc_state_lock);
-	del_timer_sync(sci->sc_timer);
-	sci->sc_timer = NULL;
 
 	/* end sync. */
 	sci->sc_task = NULL;
@@ -2746,13 +2678,6 @@ static void nilfs_segctor_kill_thread(struct nilfs_sc_info *sci)
 	}
 }
 
-static int nilfs_segctor_init(struct nilfs_sc_info *sci)
-{
-	sci->sc_seq_done = sci->sc_seq_request;
-
-	return nilfs_segctor_start_thread(sci);
-}
-
 /*
  * Setup & clean-up functions
  */
@@ -2776,6 +2701,7 @@ static struct nilfs_sc_info *nilfs_segctor_new(struct nilfs_sb_info *sbi)
 	INIT_LIST_HEAD(&sci->sc_write_logs);
 	INIT_LIST_HEAD(&sci->sc_gc_inodes);
 	INIT_LIST_HEAD(&sci->sc_copied_buffers);
+	init_timer(&sci->sc_timer);
 
 	sci->sc_interval = HZ * NILFS_SC_DEFAULT_TIMEOUT;
 	sci->sc_mjcp_freq = HZ * NILFS_SC_DEFAULT_SR_FREQ;
@@ -2797,12 +2723,9 @@ static void nilfs_segctor_write_out(struct nilfs_sc_info *sci)
 	do {
 		struct nilfs_sb_info *sbi = sci->sc_sbi;
 		struct nilfs_transaction_info ti;
-		struct nilfs_segctor_req req = { .mode = SC_LSEG_SR };
 
 		nilfs_transaction_lock(sbi, &ti, 0);
-		nilfs_segctor_accept(sci, &req);
-		ret = nilfs_segctor_construct(sci, &req);
-		nilfs_segctor_notify(sci, &req);
+		ret = nilfs_segctor_construct(sci, SC_LSEG_SR);
 		nilfs_transaction_unlock(sbi);
 
 	} while (ret && retrycount-- > 0);
@@ -2829,7 +2752,7 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
 		|| sci->sc_seq_request != sci->sc_seq_done);
 	spin_unlock(&sci->sc_state_lock);
 
-	if (flag || nilfs_segctor_confirm(sci))
+	if (flag || !nilfs_segctor_confirm(sci))
 		nilfs_segctor_write_out(sci);
 
 	WARN_ON(!list_empty(&sci->sc_copied_buffers));
@@ -2845,6 +2768,7 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
 
 	down_write(&sbi->s_nilfs->ns_segctor_sem);
 
+	del_timer_sync(&sci->sc_timer);
 	kfree(sci);
 }
 
@@ -2853,7 +2777,7 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
  * @sbi: nilfs_sb_info
  *
  * nilfs_attach_segment_constructor() allocates a struct nilfs_sc_info,
- * initilizes it, and starts the segment constructor.
+ * initializes it, and starts the segment constructor.
  *
  * Return Value: On success, 0 is returned. On error, one of the following
  * negative error code is returned.
@@ -2865,14 +2789,21 @@ int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi)
 	struct the_nilfs *nilfs = sbi->s_nilfs;
 	int err;
 
-	/* Each field of nilfs_segctor is cleared through the initialization
-	   of super-block info */
+	if (NILFS_SC(sbi)) {
+		/*
+		 * This happens if the filesystem was remounted
+		 * read/write after nilfs_error degenerated it into a
+		 * read-only mount.
+		 */
+		nilfs_detach_segment_constructor(sbi);
+	}
+
 	sbi->s_sc_info = nilfs_segctor_new(sbi);
 	if (!sbi->s_sc_info)
 		return -ENOMEM;
 
 	nilfs_attach_writer(nilfs, sbi);
-	err = nilfs_segctor_init(NILFS_SC(sbi));
+	err = nilfs_segctor_start_thread(NILFS_SC(sbi));
 	if (err) {
 		nilfs_detach_writer(nilfs, sbi);
 		kfree(sbi->s_sc_info);
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index 3d3ab2f9864c..01e20dbb217d 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -30,7 +30,7 @@
 #include "sb.h"
 
 /**
- * struct nilfs_recovery_info - Recovery infomation
+ * struct nilfs_recovery_info - Recovery information
  * @ri_need_recovery: Recovery status
  * @ri_super_root: Block number of the last super root
  * @ri_ri_cno: Number of the last checkpoint
@@ -71,7 +71,7 @@ struct nilfs_recovery_info {
  */
 struct nilfs_cstage {
 	int			scnt;
-	unsigned 		flags;
+	unsigned		flags;
 	struct nilfs_inode_info *dirty_file_ptr;
 	struct nilfs_inode_info *gc_inode_ptr;
 };
@@ -100,7 +100,6 @@ struct nilfs_segsum_pointer {
  * @sc_write_logs: List of segment buffers to hold logs under writing
  * @sc_segbuf_nblocks: Number of available blocks in segment buffers.
  * @sc_curseg: Current segment buffer
- * @sc_super_root: Pointer to the super root buffer
  * @sc_stage: Collection stage
  * @sc_finfo_ptr: pointer to the current finfo struct in the segment summary
  * @sc_binfo_ptr: pointer to the current binfo struct in the segment summary
@@ -116,6 +115,7 @@ struct nilfs_segsum_pointer {
  * @sc_wait_daemon: Daemon wait queue
  * @sc_wait_task: Start/end wait queue to control segctord task
  * @sc_seq_request: Request counter
+ * @sc_seq_accept: Accepted request count
  * @sc_seq_done: Completion counter
  * @sc_sync: Request of explicit sync operation
  * @sc_interval: Timeout value of background construction
@@ -147,7 +147,6 @@ struct nilfs_sc_info {
 	struct list_head	sc_write_logs;
 	unsigned long		sc_segbuf_nblocks;
 	struct nilfs_segment_buffer *sc_curseg;
-	struct buffer_head     *sc_super_root;
 
 	struct nilfs_cstage	sc_stage;
 
@@ -169,6 +168,7 @@ struct nilfs_sc_info {
 	wait_queue_head_t	sc_wait_task;
 
 	__u32			sc_seq_request;
+	__u32			sc_seq_accepted;
 	__u32			sc_seq_done;
 
 	int			sc_sync;
@@ -177,7 +177,7 @@ struct nilfs_sc_info {
 	unsigned long		sc_lseg_stime;	/* in 1/HZ seconds */
 	unsigned long		sc_watermark;
 
-	struct timer_list      *sc_timer;
+	struct timer_list	sc_timer;
 	struct task_struct     *sc_task;
 };
 
@@ -217,10 +217,10 @@ enum {
  */
 #define NILFS_SC_DEFAULT_WATERMARK  3600
 
+/* super.c */
+extern struct kmem_cache *nilfs_transaction_cachep;
 
 /* segment.c */
-extern int nilfs_init_transaction_cache(void);
-extern void nilfs_destroy_transaction_cache(void);
 extern void nilfs_relax_pressure_in_lock(struct super_block *);
 
 extern int nilfs_construct_segment(struct super_block *);
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index b6c36d0cc331..3c6cc6005c2e 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -18,7 +18,7 @@
  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  *
  * Written by Koji Sato <koji@osrg.net>.
- * Rivised by Ryusuke Konishi <ryusuke@osrg.net>.
+ * Revised by Ryusuke Konishi <ryusuke@osrg.net>.
  */
 
 #include <linux/kernel.h>
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 8173faee31e6..414ef68931cf 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -67,6 +67,11 @@ MODULE_DESCRIPTION("A New Implementation of the Log-structured Filesystem "
 		   "(NILFS)");
 MODULE_LICENSE("GPL");
 
+struct kmem_cache *nilfs_inode_cachep;
+struct kmem_cache *nilfs_transaction_cachep;
+struct kmem_cache *nilfs_segbuf_cachep;
+struct kmem_cache *nilfs_btree_path_cache;
+
 static int nilfs_remount(struct super_block *sb, int *flags, char *data);
 
 /**
@@ -96,9 +101,6 @@ void nilfs_error(struct super_block *sb, const char *function,
 	if (!(sb->s_flags & MS_RDONLY)) {
 		struct the_nilfs *nilfs = sbi->s_nilfs;
 
-		if (!nilfs_test_opt(sbi, ERRORS_CONT))
-			nilfs_detach_segment_constructor(sbi);
-
 		down_write(&nilfs->ns_sem);
 		if (!(nilfs->ns_mount_state & NILFS_ERROR_FS)) {
 			nilfs->ns_mount_state |= NILFS_ERROR_FS;
@@ -132,7 +134,6 @@ void nilfs_warning(struct super_block *sb, const char *function,
 	va_end(args);
 }
 
-static struct kmem_cache *nilfs_inode_cachep;
 
 struct inode *nilfs_alloc_inode_common(struct the_nilfs *nilfs)
 {
@@ -158,34 +159,6 @@ void nilfs_destroy_inode(struct inode *inode)
 	kmem_cache_free(nilfs_inode_cachep, NILFS_I(inode));
 }
 
-static void init_once(void *obj)
-{
-	struct nilfs_inode_info *ii = obj;
-
-	INIT_LIST_HEAD(&ii->i_dirty);
-#ifdef CONFIG_NILFS_XATTR
-	init_rwsem(&ii->xattr_sem);
-#endif
-	nilfs_btnode_cache_init_once(&ii->i_btnode_cache);
-	ii->i_bmap = (struct nilfs_bmap *)&ii->i_bmap_union;
-	inode_init_once(&ii->vfs_inode);
-}
-
-static int nilfs_init_inode_cache(void)
-{
-	nilfs_inode_cachep = kmem_cache_create("nilfs2_inode_cache",
-					       sizeof(struct nilfs_inode_info),
-					       0, SLAB_RECLAIM_ACCOUNT,
-					       init_once);
-
-	return (nilfs_inode_cachep == NULL) ? -ENOMEM : 0;
-}
-
-static inline void nilfs_destroy_inode_cache(void)
-{
-	kmem_cache_destroy(nilfs_inode_cachep);
-}
-
 static void nilfs_clear_inode(struct inode *inode)
 {
 	struct nilfs_inode_info *ii = NILFS_I(inode);
@@ -269,8 +242,8 @@ int nilfs_commit_super(struct nilfs_sb_info *sbi, int dupsb)
 	int err;
 
 	/* nilfs->sem must be locked by the caller. */
-	if (sbp[0]->s_magic != NILFS_SUPER_MAGIC) {
-		if (sbp[1] && sbp[1]->s_magic == NILFS_SUPER_MAGIC)
+	if (sbp[0]->s_magic != cpu_to_le16(NILFS_SUPER_MAGIC)) {
+		if (sbp[1] && sbp[1]->s_magic == cpu_to_le16(NILFS_SUPER_MAGIC))
 			nilfs_swap_super_block(nilfs);
 		else {
 			printk(KERN_CRIT "NILFS: superblock broke on dev %s\n",
@@ -301,7 +274,7 @@ int nilfs_commit_super(struct nilfs_sb_info *sbi, int dupsb)
 		memcpy(sbp[1], sbp[0], nilfs->ns_sbsize);
 		nilfs->ns_sbwtime[1] = t;
 	}
-	sbi->s_super->s_dirt = 0;
+	clear_nilfs_sb_dirty(nilfs);
 	return nilfs_sync_super(sbi, dupsb);
 }
 
@@ -345,7 +318,7 @@ static int nilfs_sync_fs(struct super_block *sb, int wait)
 		err = nilfs_construct_segment(sb);
 
 	down_write(&nilfs->ns_sem);
-	if (sb->s_dirt)
+	if (nilfs_sb_dirty(nilfs))
 		nilfs_commit_super(sbi, 1);
 	up_write(&nilfs->ns_sem);
 
@@ -439,7 +412,7 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	/*
 	 * Compute the overhead
 	 *
-	 * When distributing meta data blocks outside semgent structure,
+	 * When distributing meta data blocks outside segment structure,
 	 * We must count them as the overhead.
 	 */
 	overhead = 0;
@@ -473,14 +446,16 @@ static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
 	if (nilfs_test_opt(sbi, SNAPSHOT))
 		seq_printf(seq, ",cp=%llu",
 			   (unsigned long long int)sbi->s_snapshot_cno);
-	if (nilfs_test_opt(sbi, ERRORS_RO))
-		seq_printf(seq, ",errors=remount-ro");
 	if (nilfs_test_opt(sbi, ERRORS_PANIC))
 		seq_printf(seq, ",errors=panic");
+	if (nilfs_test_opt(sbi, ERRORS_CONT))
+		seq_printf(seq, ",errors=continue");
 	if (nilfs_test_opt(sbi, STRICT_ORDER))
 		seq_printf(seq, ",order=strict");
 	if (nilfs_test_opt(sbi, NORECOVERY))
 		seq_printf(seq, ",norecovery");
+	if (nilfs_test_opt(sbi, DISCARD))
+		seq_printf(seq, ",discard");
 
 	return 0;
 }
@@ -550,7 +525,7 @@ static const struct export_operations nilfs_export_ops = {
 enum {
 	Opt_err_cont, Opt_err_panic, Opt_err_ro,
 	Opt_nobarrier, Opt_snapshot, Opt_order, Opt_norecovery,
-	Opt_err,
+	Opt_discard, Opt_err,
 };
 
 static match_table_t tokens = {
@@ -561,6 +536,7 @@ static match_table_t tokens = {
 	{Opt_snapshot, "cp=%u"},
 	{Opt_order, "order=%s"},
 	{Opt_norecovery, "norecovery"},
+	{Opt_discard, "discard"},
 	{Opt_err, NULL}
 };
 
@@ -614,6 +590,9 @@ static int parse_options(char *options, struct super_block *sb)
 		case Opt_norecovery:
 			nilfs_set_opt(sbi, NORECOVERY);
 			break;
+		case Opt_discard:
+			nilfs_set_opt(sbi, DISCARD);
+			break;
 		default:
 			printk(KERN_ERR
 			       "NILFS: Unrecognized mount option \"%s\"\n", p);
@@ -628,7 +607,7 @@ nilfs_set_default_options(struct nilfs_sb_info *sbi,
 			  struct nilfs_super_block *sbp)
 {
 	sbi->s_mount_opt =
-		NILFS_MOUNT_ERRORS_CONT | NILFS_MOUNT_BARRIER;
+		NILFS_MOUNT_ERRORS_RO | NILFS_MOUNT_BARRIER;
 }
 
 static int nilfs_setup_super(struct nilfs_sb_info *sbi)
@@ -746,6 +725,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
 	sb->s_export_op = &nilfs_export_ops;
 	sb->s_root = NULL;
 	sb->s_time_gran = 1;
+	sb->s_bdi = nilfs->ns_bdi;
 
 	err = load_nilfs(nilfs, sbi);
 	if (err)
@@ -774,9 +754,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
 				goto failed_sbi;
 			}
 			cno = sbi->s_snapshot_cno;
-		} else
-			/* Read-only mount */
-			sbi->s_snapshot_cno = cno;
+		}
 	}
 
 	err = nilfs_attach_checkpoint(sbi, cno);
@@ -845,7 +823,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
 	struct the_nilfs *nilfs = sbi->s_nilfs;
 	unsigned long old_sb_flags;
 	struct nilfs_mount_options old_opts;
-	int err;
+	int was_snapshot, err;
 
 	lock_kernel();
 
@@ -853,6 +831,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
 	old_sb_flags = sb->s_flags;
 	old_opts.mount_opt = sbi->s_mount_opt;
 	old_opts.snapshot_cno = sbi->s_snapshot_cno;
+	was_snapshot = nilfs_test_opt(sbi, SNAPSHOT);
 
 	if (!parse_options(data, sb)) {
 		err = -EINVAL;
@@ -860,20 +839,32 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
 	}
 	sb->s_flags = (sb->s_flags & ~MS_POSIXACL);
 
-	if ((*flags & MS_RDONLY) &&
-	    sbi->s_snapshot_cno != old_opts.snapshot_cno) {
-		printk(KERN_WARNING "NILFS (device %s): couldn't "
-		       "remount to a different snapshot. \n",
-		       sb->s_id);
-		err = -EINVAL;
-		goto restore_opts;
+	err = -EINVAL;
+	if (was_snapshot) {
+		if (!(*flags & MS_RDONLY)) {
+			printk(KERN_ERR "NILFS (device %s): cannot remount "
+			       "snapshot read/write.\n",
+			       sb->s_id);
+			goto restore_opts;
+		} else if (sbi->s_snapshot_cno != old_opts.snapshot_cno) {
+			printk(KERN_ERR "NILFS (device %s): cannot "
+			       "remount to a different snapshot.\n",
+			       sb->s_id);
+			goto restore_opts;
+		}
+	} else {
+		if (nilfs_test_opt(sbi, SNAPSHOT)) {
+			printk(KERN_ERR "NILFS (device %s): cannot change "
+			       "a regular mount to a snapshot.\n",
+			       sb->s_id);
+			goto restore_opts;
+		}
 	}
 
 	if (!nilfs_valid_fs(nilfs)) {
 		printk(KERN_WARNING "NILFS (device %s): couldn't "
 		       "remount because the filesystem is in an "
 		       "incomplete recovery state.\n", sb->s_id);
-		err = -EINVAL;
 		goto restore_opts;
 	}
 
@@ -884,9 +875,6 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
 		nilfs_detach_segment_constructor(sbi);
 		sb->s_flags |= MS_RDONLY;
 
-		sbi->s_snapshot_cno = nilfs_last_cno(nilfs);
-		/* nilfs_set_opt(sbi, SNAPSHOT); */
-
 		/*
 		 * Remounting a valid RW partition RDONLY, so set
 		 * the RDONLY flag and then mark the partition as valid again.
@@ -905,24 +893,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
 		 * store the current valid flag.  (It may have been changed
 		 * by fsck since we originally mounted the partition.)
 		 */
-		if (nilfs->ns_current && nilfs->ns_current != sbi) {
-			printk(KERN_WARNING "NILFS (device %s): couldn't "
-			       "remount because an RW-mount exists.\n",
-			       sb->s_id);
-			err = -EBUSY;
-			goto restore_opts;
-		}
-		if (sbi->s_snapshot_cno != nilfs_last_cno(nilfs)) {
-			printk(KERN_WARNING "NILFS (device %s): couldn't "
-			       "remount because the current RO-mount is not "
-			       "the latest one.\n",
-			       sb->s_id);
-			err = -EINVAL;
-			goto restore_opts;
-		}
 		sb->s_flags &= ~MS_RDONLY;
-		nilfs_clear_opt(sbi, SNAPSHOT);
-		sbi->s_snapshot_cno = 0;
 
 		err = nilfs_attach_segment_constructor(sbi);
 		if (err)
@@ -931,8 +902,6 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
 		down_write(&nilfs->ns_sem);
 		nilfs_setup_super(sbi);
 		up_write(&nilfs->ns_sem);
-
-		nilfs->ns_current = sbi;
 	}
  out:
 	up_write(&nilfs->ns_super_sem);
@@ -1018,10 +987,14 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
 {
 	struct nilfs_super_data sd;
 	struct super_block *s;
+	fmode_t mode = FMODE_READ;
 	struct the_nilfs *nilfs;
 	int err, need_to_close = 1;
 
-	sd.bdev = open_bdev_exclusive(dev_name, flags, fs_type);
+	if (!(flags & MS_RDONLY))
+		mode |= FMODE_WRITE;
+
+	sd.bdev = open_bdev_exclusive(dev_name, mode, fs_type);
 	if (IS_ERR(sd.bdev))
 		return PTR_ERR(sd.bdev);
 
@@ -1088,10 +1061,12 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
 
 		/* New superblock instance created */
 		s->s_flags = flags;
+		s->s_mode = mode;
 		strlcpy(s->s_id, bdevname(sd.bdev, b), sizeof(s->s_id));
 		sb_set_blocksize(s, block_size(sd.bdev));
 
-		err = nilfs_fill_super(s, data, flags & MS_VERBOSE, nilfs);
+		err = nilfs_fill_super(s, data, flags & MS_SILENT ? 1 : 0,
+				       nilfs);
 		if (err)
 			goto cancel_new;
 
@@ -1102,7 +1077,7 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
 	mutex_unlock(&nilfs->ns_mount_mutex);
 	put_nilfs(nilfs);
 	if (need_to_close)
-		close_bdev_exclusive(sd.bdev, flags);
+		close_bdev_exclusive(sd.bdev, mode);
 	simple_set_mnt(mnt, s);
 	return 0;
 
@@ -1110,7 +1085,7 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
 	mutex_unlock(&nilfs->ns_mount_mutex);
 	put_nilfs(nilfs);
  failed:
-	close_bdev_exclusive(sd.bdev, flags);
+	close_bdev_exclusive(sd.bdev, mode);
 
 	return err;
 
@@ -1120,7 +1095,7 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
 	put_nilfs(nilfs);
 	deactivate_locked_super(s);
 	/*
-	 * deactivate_super() invokes close_bdev_exclusive().
+	 * deactivate_locked_super() invokes close_bdev_exclusive().
 	 * We must finish all post-cleaning before this call;
 	 * put_nilfs() needs the block device.
 	 */
@@ -1135,54 +1110,93 @@ struct file_system_type nilfs_fs_type = {
 	.fs_flags = FS_REQUIRES_DEV,
 };
 
-static int __init init_nilfs_fs(void)
+static void nilfs_inode_init_once(void *obj)
 {
-	int err;
-
-	err = nilfs_init_inode_cache();
-	if (err)
-		goto failed;
+	struct nilfs_inode_info *ii = obj;
 
-	err = nilfs_init_transaction_cache();
-	if (err)
-		goto failed_inode_cache;
+	INIT_LIST_HEAD(&ii->i_dirty);
+#ifdef CONFIG_NILFS_XATTR
+	init_rwsem(&ii->xattr_sem);
+#endif
+	nilfs_btnode_cache_init_once(&ii->i_btnode_cache);
+	ii->i_bmap = (struct nilfs_bmap *)&ii->i_bmap_union;
+	inode_init_once(&ii->vfs_inode);
+}
 
-	err = nilfs_init_segbuf_cache();
-	if (err)
-		goto failed_transaction_cache;
+static void nilfs_segbuf_init_once(void *obj)
+{
+	memset(obj, 0, sizeof(struct nilfs_segment_buffer));
+}
 
-	err = nilfs_btree_path_cache_init();
-	if (err)
-		goto failed_segbuf_cache;
+static void nilfs_destroy_cachep(void)
+{
+	if (nilfs_inode_cachep)
+		kmem_cache_destroy(nilfs_inode_cachep);
+	if (nilfs_transaction_cachep)
+		kmem_cache_destroy(nilfs_transaction_cachep);
+	if (nilfs_segbuf_cachep)
+		kmem_cache_destroy(nilfs_segbuf_cachep);
+	if (nilfs_btree_path_cache)
+		kmem_cache_destroy(nilfs_btree_path_cache);
+}
 
-	err = register_filesystem(&nilfs_fs_type);
-	if (err)
-		goto failed_btree_path_cache;
+static int __init nilfs_init_cachep(void)
+{
+	nilfs_inode_cachep = kmem_cache_create("nilfs2_inode_cache",
+			sizeof(struct nilfs_inode_info), 0,
+			SLAB_RECLAIM_ACCOUNT, nilfs_inode_init_once);
+	if (!nilfs_inode_cachep)
+		goto fail;
+
+	nilfs_transaction_cachep = kmem_cache_create("nilfs2_transaction_cache",
+			sizeof(struct nilfs_transaction_info), 0,
+			SLAB_RECLAIM_ACCOUNT, NULL);
+	if (!nilfs_transaction_cachep)
+		goto fail;
+
+	nilfs_segbuf_cachep = kmem_cache_create("nilfs2_segbuf_cache",
+			sizeof(struct nilfs_segment_buffer), 0,
+			SLAB_RECLAIM_ACCOUNT, nilfs_segbuf_init_once);
+	if (!nilfs_segbuf_cachep)
+		goto fail;
+
+	nilfs_btree_path_cache = kmem_cache_create("nilfs2_btree_path_cache",
+			sizeof(struct nilfs_btree_path) * NILFS_BTREE_LEVEL_MAX,
+			0, 0, NULL);
+	if (!nilfs_btree_path_cache)
+		goto fail;
 
 	return 0;
 
- failed_btree_path_cache:
-	nilfs_btree_path_cache_destroy();
+fail:
+	nilfs_destroy_cachep();
+	return -ENOMEM;
+}
+
+static int __init init_nilfs_fs(void)
+{
+	int err;
 
- failed_segbuf_cache:
-	nilfs_destroy_segbuf_cache();
+	err = nilfs_init_cachep();
+	if (err)
+		goto fail;
 
- failed_transaction_cache:
-	nilfs_destroy_transaction_cache();
+	err = register_filesystem(&nilfs_fs_type);
+	if (err)
+		goto free_cachep;
 
- failed_inode_cache:
-	nilfs_destroy_inode_cache();
+	printk(KERN_INFO "NILFS version 2 loaded\n");
+	return 0;
 
- failed:
+free_cachep:
+	nilfs_destroy_cachep();
+fail:
 	return err;
 }
 
 static void __exit exit_nilfs_fs(void)
 {
-	nilfs_destroy_segbuf_cache();
-	nilfs_destroy_transaction_cache();
-	nilfs_destroy_inode_cache();
-	nilfs_btree_path_cache_destroy();
+	nilfs_destroy_cachep();
 	unregister_filesystem(&nilfs_fs_type);
 }
 
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 6241e1722efc..8c1097327abc 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -386,7 +386,7 @@ static int nilfs_store_disk_layout(struct the_nilfs *nilfs,
 
 	nilfs->ns_blocks_per_segment = le32_to_cpu(sbp->s_blocks_per_segment);
 	if (nilfs->ns_blocks_per_segment < NILFS_SEG_MIN_BLOCKS) {
-		printk(KERN_ERR "NILFS: too short segment. \n");
+		printk(KERN_ERR "NILFS: too short segment.\n");
 		return -EINVAL;
 	}
 
@@ -486,11 +486,15 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs,
 		printk(KERN_WARNING
 		       "NILFS warning: unable to read secondary superblock\n");
 
+	/*
+	 * Compare two super blocks and set 1 in swp if the secondary
+	 * super block is valid and newer.  Otherwise, set 0 in swp.
+	 */
 	valid[0] = nilfs_valid_sb(sbp[0]);
 	valid[1] = nilfs_valid_sb(sbp[1]);
-	swp = valid[1] &&
-		(!valid[0] ||
-		 le64_to_cpu(sbp[1]->s_wtime) > le64_to_cpu(sbp[0]->s_wtime));
+	swp = valid[1] && (!valid[0] ||
+			   le64_to_cpu(sbp[1]->s_last_cno) >
+			   le64_to_cpu(sbp[0]->s_last_cno));
 
 	if (valid[swp] && nilfs_sb2_bad_offset(sbp[swp], sb2off)) {
 		brelse(sbh[1]);
@@ -646,6 +650,44 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
 	goto out;
 }
 
+int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
+			    size_t nsegs)
+{
+	sector_t seg_start, seg_end;
+	sector_t start = 0, nblocks = 0;
+	unsigned int sects_per_block;
+	__u64 *sn;
+	int ret = 0;
+
+	sects_per_block = (1 << nilfs->ns_blocksize_bits) /
+		bdev_logical_block_size(nilfs->ns_bdev);
+	for (sn = segnump; sn < segnump + nsegs; sn++) {
+		nilfs_get_segment_range(nilfs, *sn, &seg_start, &seg_end);
+
+		if (!nblocks) {
+			start = seg_start;
+			nblocks = seg_end - seg_start + 1;
+		} else if (start + nblocks == seg_start) {
+			nblocks += seg_end - seg_start + 1;
+		} else {
+			ret = blkdev_issue_discard(nilfs->ns_bdev,
+						   start * sects_per_block,
+						   nblocks * sects_per_block,
+						   GFP_NOFS,
+						   BLKDEV_IFL_BARRIER);
+			if (ret < 0)
+				return ret;
+			nblocks = 0;
+		}
+	}
+	if (nblocks)
+		ret = blkdev_issue_discard(nilfs->ns_bdev,
+					   start * sects_per_block,
+					   nblocks * sects_per_block,
+					   GFP_NOFS, BLKDEV_IFL_BARRIER);
+	return ret;
+}
+
 int nilfs_count_free_blocks(struct the_nilfs *nilfs, sector_t *nblocks)
 {
 	struct inode *dat = nilfs_dat_inode(nilfs);
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index 589786e33464..1ab974533697 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -29,6 +29,7 @@
 #include <linux/fs.h>
 #include <linux/blkdev.h>
 #include <linux/backing-dev.h>
+#include <linux/slab.h>
 #include "sb.h"
 
 /* the_nilfs struct */
@@ -38,6 +39,7 @@ enum {
 				   the latest checkpoint was loaded */
 	THE_NILFS_DISCONTINUED,	/* 'next' pointer chain has broken */
 	THE_NILFS_GC_RUNNING,	/* gc process is running */
+	THE_NILFS_SB_DIRTY,	/* super block is dirty */
 };
 
 /**
@@ -197,6 +199,7 @@ THE_NILFS_FNS(INIT, init)
 THE_NILFS_FNS(LOADED, loaded)
 THE_NILFS_FNS(DISCONTINUED, discontinued)
 THE_NILFS_FNS(GC_RUNNING, gc_running)
+THE_NILFS_FNS(SB_DIRTY, sb_dirty)
 
 /* Minimum interval of periodical update of superblocks (in seconds) */
 #define NILFS_SB_FREQ		10
@@ -221,6 +224,7 @@ struct the_nilfs *find_or_create_nilfs(struct block_device *);
 void put_nilfs(struct the_nilfs *);
 int init_nilfs(struct the_nilfs *, struct nilfs_sb_info *, char *);
 int load_nilfs(struct the_nilfs *, struct nilfs_sb_info *);
+int nilfs_discard_segments(struct the_nilfs *, __u64 *, size_t);
 int nilfs_count_free_blocks(struct the_nilfs *, sector_t *);
 struct nilfs_sb_info *nilfs_find_sbinfo(struct the_nilfs *, int, __u64);
 int nilfs_checkpoint_is_mounted(struct the_nilfs *, __u64, int);
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 037e878e03fc..fcc2f064af83 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -18,6 +18,7 @@
 
 #include <linux/dcache.h>
 #include <linux/fs.h>
+#include <linux/gfp.h>
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/srcu.h>
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index 3165d85aada2..0399bcbe09c8 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -87,7 +87,6 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/mutex.h>
-#include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/writeback.h> /* for inode_lock */
 
diff --git a/fs/notify/inotify/Kconfig b/fs/notify/inotify/Kconfig
index 3e56dbffe729..b3a159b21cfd 100644
--- a/fs/notify/inotify/Kconfig
+++ b/fs/notify/inotify/Kconfig
@@ -15,6 +15,7 @@ config INOTIFY
 
 config INOTIFY_USER
 	bool "Inotify support for userspace"
+	select ANON_INODES
 	select FSNOTIFY
 	default y
 	---help---
diff --git a/fs/notify/inotify/inotify.c b/fs/notify/inotify/inotify.c
index 40b1cf914ccb..27b75ebc7460 100644
--- a/fs/notify/inotify/inotify.c
+++ b/fs/notify/inotify/inotify.c
@@ -110,14 +110,10 @@ EXPORT_SYMBOL_GPL(get_inotify_watch);
 int pin_inotify_watch(struct inotify_watch *watch)
 {
 	struct super_block *sb = watch->inode->i_sb;
-	spin_lock(&sb_lock);
-	if (sb->s_count >= S_BIAS) {
-		atomic_inc(&sb->s_active);
-		spin_unlock(&sb_lock);
+	if (atomic_inc_not_zero(&sb->s_active)) {
 		atomic_inc(&watch->count);
 		return 1;
 	}
-	spin_unlock(&sb_lock);
 	return 0;
 }
 
@@ -515,34 +511,8 @@ EXPORT_SYMBOL_GPL(inotify_init_watch);
  * done.  Cleanup is just deactivate_super().  However, that leaves a messy
  * case - what if we *are* racing with umount() and active references to
  * superblock can't be acquired anymore?  We can bump ->s_count, grab
- * ->s_umount, which will almost certainly wait until the superblock is shut
- * down and the watch in question is pining for fjords.  That's fine, but
- * there is a problem - we might have hit the window between ->s_active
- * getting to 0 / ->s_count - below S_BIAS (i.e. the moment when superblock
- * is past the point of no return and is heading for shutdown) and the
- * moment when deactivate_super() acquires ->s_umount.  We could just do
- * drop_super() yield() and retry, but that's rather antisocial and this
- * stuff is luser-triggerable.  OTOH, having grabbed ->s_umount and having
- * found that we'd got there first (i.e. that ->s_root is non-NULL) we know
- * that we won't race with inotify_umount_inodes().  So we could grab a
- * reference to watch and do the rest as above, just with drop_super() instead
- * of deactivate_super(), right?  Wrong.  We had to drop ih->mutex before we
- * could grab ->s_umount.  So the watch could've been gone already.
- *
- * That still can be dealt with - we need to save watch->wd, do idr_find()
- * and compare its result with our pointer.  If they match, we either have
- * the damn thing still alive or we'd lost not one but two races at once,
- * the watch had been killed and a new one got created with the same ->wd
- * at the same address.  That couldn't have happened in inotify_destroy(),
- * but inotify_rm_wd() could run into that.  Still, "new one got created"
- * is not a problem - we have every right to kill it or leave it alone,
- * whatever's more convenient.
- *
- * So we can use idr_find(...) == watch && watch->inode->i_sb == sb as
- * "grab it and kill it" check.  If it's been our original watch, we are
- * fine, if it's a newcomer - nevermind, just pretend that we'd won the
- * race and kill the fscker anyway; we are safe since we know that its
- * superblock won't be going away.
+ * ->s_umount, which will wait until the superblock is shut down and the
+ * watch in question is pining for fjords.
  *
  * And yes, this is far beyond mere "not very pretty"; so's the entire
  * concept of inotify to start with.
@@ -556,57 +526,31 @@ EXPORT_SYMBOL_GPL(inotify_init_watch);
  * Called with ih->mutex held, drops it.  Possible return values:
  * 0 - nothing to do, it has died
  * 1 - remove it, drop the reference and deactivate_super()
- * 2 - remove it, drop the reference and drop_super(); we tried hard to avoid
- * that variant, since it involved a lot of PITA, but that's the best that
- * could've been done.
  */
 static int pin_to_kill(struct inotify_handle *ih, struct inotify_watch *watch)
 {
 	struct super_block *sb = watch->inode->i_sb;
-	s32 wd = watch->wd;
 
-	spin_lock(&sb_lock);
-	if (sb->s_count >= S_BIAS) {
-		atomic_inc(&sb->s_active);
-		spin_unlock(&sb_lock);
+	if (atomic_inc_not_zero(&sb->s_active)) {
 		get_inotify_watch(watch);
 		mutex_unlock(&ih->mutex);
 		return 1;	/* the best outcome */
 	}
+	spin_lock(&sb_lock);
 	sb->s_count++;
 	spin_unlock(&sb_lock);
 	mutex_unlock(&ih->mutex); /* can't grab ->s_umount under it */
 	down_read(&sb->s_umount);
-	if (likely(!sb->s_root)) {
-		/* fs is already shut down; the watch is dead */
-		drop_super(sb);
-		return 0;
-	}
-	/* raced with the final deactivate_super() */
-	mutex_lock(&ih->mutex);
-	if (idr_find(&ih->idr, wd) != watch || watch->inode->i_sb != sb) {
-		/* the watch is dead */
-		mutex_unlock(&ih->mutex);
-		drop_super(sb);
-		return 0;
-	}
-	/* still alive or freed and reused with the same sb and wd; kill */
-	get_inotify_watch(watch);
-	mutex_unlock(&ih->mutex);
-	return 2;
+	/* fs is already shut down; the watch is dead */
+	drop_super(sb);
+	return 0;
 }
 
-static void unpin_and_kill(struct inotify_watch *watch, int how)
+static void unpin_and_kill(struct inotify_watch *watch)
 {
 	struct super_block *sb = watch->inode->i_sb;
 	put_inotify_watch(watch);
-	switch (how) {
-	case 1:
-		deactivate_super(sb);
-		break;
-	case 2:
-		drop_super(sb);
-	}
+	deactivate_super(sb);
 }
 
 /**
@@ -628,7 +572,6 @@ void inotify_destroy(struct inotify_handle *ih)
 		struct list_head *watches;
 		struct super_block *sb;
 		struct inode *inode;
-		int how;
 
 		mutex_lock(&ih->mutex);
 		watches = &ih->watches;
@@ -638,8 +581,7 @@ void inotify_destroy(struct inotify_handle *ih)
 		}
 		watch = list_first_entry(watches, struct inotify_watch, h_list);
 		sb = watch->inode->i_sb;
-		how = pin_to_kill(ih, watch);
-		if (!how)
+		if (!pin_to_kill(ih, watch))
 			continue;
 
 		inode = watch->inode;
@@ -654,7 +596,7 @@ void inotify_destroy(struct inotify_handle *ih)
 
 		mutex_unlock(&ih->mutex);
 		mutex_unlock(&inode->inotify_mutex);
-		unpin_and_kill(watch, how);
+		unpin_and_kill(watch);
 	}
 
 	/* free this handle: the put matching the get in inotify_init() */
@@ -857,7 +799,6 @@ int inotify_rm_wd(struct inotify_handle *ih, u32 wd)
 	struct inotify_watch *watch;
 	struct super_block *sb;
 	struct inode *inode;
-	int how;
 
 	mutex_lock(&ih->mutex);
 	watch = idr_find(&ih->idr, wd);
@@ -866,8 +807,7 @@ int inotify_rm_wd(struct inotify_handle *ih, u32 wd)
 		return -EINVAL;
 	}
 	sb = watch->inode->i_sb;
-	how = pin_to_kill(ih, watch);
-	if (!how)
+	if (!pin_to_kill(ih, watch))
 		return 0;
 
 	inode = watch->inode;
@@ -881,7 +821,7 @@ int inotify_rm_wd(struct inotify_handle *ih, u32 wd)
 
 	mutex_unlock(&ih->mutex);
 	mutex_unlock(&inode->inotify_mutex);
-	unpin_and_kill(watch, how);
+	unpin_and_kill(watch);
 
 	return 0;
 }
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 1afb0a10229f..e27960cd76ab 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -28,6 +28,7 @@
 #include <linux/path.h> /* struct path */
 #include <linux/slab.h> /* kmem_* */
 #include <linux/types.h>
+#include <linux/sched.h>
 
 #include "inotify.h"
 
@@ -146,6 +147,7 @@ static void inotify_free_group_priv(struct fsnotify_group *group)
 	idr_for_each(&group->inotify_data.idr, idr_callback, group);
 	idr_remove_all(&group->inotify_data.idr);
 	idr_destroy(&group->inotify_data.idr);
+	free_uid(group->inotify_data.user);
 }
 
 void inotify_free_event_priv(struct fsnotify_event_private_data *fsn_event_priv)
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index a94e8bd8eb1f..e46ca685b9be 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -29,14 +29,12 @@
 #include <linux/init.h> /* module_init */
 #include <linux/inotify.h>
 #include <linux/kernel.h> /* roundup() */
-#include <linux/magic.h> /* superblock magic number */
-#include <linux/mount.h> /* mntget */
 #include <linux/namei.h> /* LOOKUP_FOLLOW */
-#include <linux/path.h> /* struct path */
 #include <linux/sched.h> /* struct user */
 #include <linux/slab.h> /* struct kmem_cache */
 #include <linux/syscalls.h>
 #include <linux/types.h>
+#include <linux/anon_inodes.h>
 #include <linux/uaccess.h>
 #include <linux/poll.h>
 #include <linux/wait.h>
@@ -45,8 +43,6 @@
 
 #include <asm/ioctls.h>
 
-static struct vfsmount *inotify_mnt __read_mostly;
-
 /* these are configurable via /proc/sys/fs/inotify/ */
 static int inotify_max_user_instances __read_mostly;
 static int inotify_max_queued_events __read_mostly;
@@ -550,21 +546,24 @@ retry:
 	if (unlikely(!idr_pre_get(&group->inotify_data.idr, GFP_KERNEL)))
 		goto out_err;
 
+	/* we are putting the mark on the idr, take a reference */
+	fsnotify_get_mark(&tmp_ientry->fsn_entry);
+
 	spin_lock(&group->inotify_data.idr_lock);
 	ret = idr_get_new_above(&group->inotify_data.idr, &tmp_ientry->fsn_entry,
 				group->inotify_data.last_wd+1,
 				&tmp_ientry->wd);
 	spin_unlock(&group->inotify_data.idr_lock);
 	if (ret) {
+		/* we didn't get on the idr, drop the idr reference */
+		fsnotify_put_mark(&tmp_ientry->fsn_entry);
+
 		/* idr was out of memory allocate and try again */
 		if (ret == -EAGAIN)
 			goto retry;
 		goto out_err;
 	}
 
-	/* we put the mark on the idr, take a reference */
-	fsnotify_get_mark(&tmp_ientry->fsn_entry);
-
 	/* we are on the idr, now get on the inode */
 	ret = fsnotify_add_mark(&tmp_ientry->fsn_entry, group, inode);
 	if (ret) {
@@ -582,16 +581,13 @@ retry:
 	/* return the watch descriptor for this new entry */
 	ret = tmp_ientry->wd;
 
-	/* match the ref from fsnotify_init_markentry() */
-	fsnotify_put_mark(&tmp_ientry->fsn_entry);
-
 	/* if this mark added a new event update the group mask */
 	if (mask & ~group->mask)
 		fsnotify_recalc_group_mask(group);
 
 out_err:
-	if (ret < 0)
-		kmem_cache_free(inotify_inode_mark_cachep, tmp_ientry);
+	/* match the ref from fsnotify_init_markentry() */
+	fsnotify_put_mark(&tmp_ientry->fsn_entry);
 
 	return ret;
 }
@@ -645,9 +641,7 @@ SYSCALL_DEFINE1(inotify_init1, int, flags)
 {
 	struct fsnotify_group *group;
 	struct user_struct *user;
-	struct file *filp;
-	struct path path;
-	int fd, ret;
+	int ret;
 
 	/* Check the IN_* constants for consistency.  */
 	BUILD_BUG_ON(IN_CLOEXEC != O_CLOEXEC);
@@ -656,10 +650,6 @@ SYSCALL_DEFINE1(inotify_init1, int, flags)
 	if (flags & ~(IN_CLOEXEC | IN_NONBLOCK))
 		return -EINVAL;
 
-	fd = get_unused_fd_flags(flags & O_CLOEXEC);
-	if (fd < 0)
-		return fd;
-
 	user = get_current_user();
 	if (unlikely(atomic_read(&user->inotify_devs) >=
 			inotify_max_user_instances)) {
@@ -676,27 +666,14 @@ SYSCALL_DEFINE1(inotify_init1, int, flags)
 
 	atomic_inc(&user->inotify_devs);
 
-	path.mnt = inotify_mnt;
-	path.dentry = inotify_mnt->mnt_root;
-	path_get(&path);
-	filp = alloc_file(&path, FMODE_READ, &inotify_fops);
-	if (!filp)
-		goto Enfile;
-
-	filp->f_flags = O_RDONLY | (flags & O_NONBLOCK);
-	filp->private_data = group;
-
-	fd_install(fd, filp);
-
-	return fd;
+	ret = anon_inode_getfd("inotify", &inotify_fops, group,
+				  O_RDONLY | flags);
+	if (ret >= 0)
+		return ret;
 
-Enfile:
-	ret = -ENFILE;
-	path_put(&path);
 	atomic_dec(&user->inotify_devs);
 out_free_uid:
 	free_uid(user);
-	put_unused_fd(fd);
 	return ret;
 }
 
@@ -783,20 +760,6 @@ out:
 	return ret;
 }
 
-static int
-inotify_get_sb(struct file_system_type *fs_type, int flags,
-	       const char *dev_name, void *data, struct vfsmount *mnt)
-{
-	return get_sb_pseudo(fs_type, "inotify", NULL,
-			INOTIFYFS_SUPER_MAGIC, mnt);
-}
-
-static struct file_system_type inotify_fs_type = {
-    .name	= "inotifyfs",
-    .get_sb	= inotify_get_sb,
-    .kill_sb	= kill_anon_super,
-};
-
 /*
  * inotify_user_setup - Our initialization function.  Note that we cannnot return
  * error because we have compiled-in VFS hooks.  So an (unlikely) failure here
@@ -804,16 +767,6 @@ static struct file_system_type inotify_fs_type = {
  */
 static int __init inotify_user_setup(void)
 {
-	int ret;
-
-	ret = register_filesystem(&inotify_fs_type);
-	if (unlikely(ret))
-		panic("inotify: register_filesystem returned %d!\n", ret);
-
-	inotify_mnt = kern_mount(&inotify_fs_type);
-	if (IS_ERR(inotify_mnt))
-		panic("inotify: kern_mount ret %ld!\n", PTR_ERR(inotify_mnt));
-
 	inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark_entry, SLAB_PANIC);
 	event_priv_cachep = KMEM_CACHE(inotify_event_private_data, SLAB_PANIC);
 
diff --git a/fs/ntfs/ChangeLog b/fs/ntfs/ChangeLog
deleted file mode 100644
index 37c11e194372..000000000000
--- a/fs/ntfs/ChangeLog
+++ /dev/null
@@ -1,1702 +0,0 @@
-ToDo/Notes:
-	- Find and fix bugs.
-	- The only places in the kernel where a file is resized are
-	  ntfs_file_write*() and ntfs_truncate() for both of which i_mutex is
-	  held.  Just have to be careful in read-/writepage and other helpers
-	  not running under i_mutex that we play nice.  Also need to be careful
-	  with initialized_size extension in ntfs_file_write*() and writepage.
-	  UPDATE: The only things that need to be checked are the compressed
-	  write and the other attribute resize/write cases like index
-	  attributes, etc.  For now none of these are implemented so are safe.
-	- Implement filling in of holes in aops.c::ntfs_writepage() and its
-	  helpers.
-	- Implement mft.c::sync_mft_mirror_umount().  We currently will just
-	  leave the volume dirty on umount if the final iput(vol->mft_ino)
-	  causes a write of any mirrored mft records due to the mft mirror
-	  inode having been discarded already.  Whether this can actually ever
-	  happen is unclear however so it is worth waiting until someone hits
-	  the problem.
-
-2.1.29 - Fix a deadlock at mount time.
-
-	- During mount the VFS holds s_umount lock on the superblock.  So when
-	  we try to empty the journal $LogFile contents by calling
-	  ntfs_attr_set() when the machine does not have much memory and the
-	  journal is large ntfs_attr_set() results in the VM trying to balance
-	  dirty pages which in turn tries to that the s_umount lock and thus we
-	  get a deadlock.  The solution is to not use ntfs_attr_set() and
-	  instead do the zeroing by hand at the block level rather than page
-	  cache level.
-	- Fix sparse warnings.
-
-2.1.28 - Fix a deadlock.
-
-	- Fix deadlock in fs/ntfs/inode.c::ntfs_put_inode().  Thanks to Sergey
-	  Vlasov for the report and detailed analysis of the deadlock.  The fix
-	  involved getting rid of ntfs_put_inode() altogether and hence NTFS no
-	  longer has a ->put_inode super operation.
-
-2.1.27 - Various bug fixes and cleanups.
-
-	- Fix two compiler warnings on Alpha.  Thanks to Andrew Morton for
-	  reporting them.
-	- Fix an (innocent) off-by-one error in the runlist code.
-	- Fix a buggette in an "should be impossible" case handling where we
-	  continued the attribute lookup loop instead of aborting it.
-	- Use buffer_migrate_page() for the ->migratepage function of all ntfs
-	  address space operations.
-	- Fix comparison of $MFT and $MFTMirr to not bail out when there are
-	  unused, invalid mft records which are the same in both $MFT and
-	  $MFTMirr.
-	- Add support for sparse files which have a compression unit of 0.
-	- Remove all the make_bad_inode() calls.  This should only be called
-	  from read inode and new inode code paths.
-	- Limit name length in fs/ntfs/unistr.c::ntfs_nlstoucs() to maximum
-	  allowed by NTFS, i.e. 255 Unicode characters, not including the
-	  terminating NULL (which is not stored on disk).
-	- Improve comments on file attribute flags in fs/ntfs/layout.h.
-	- Fix a bug in fs/ntfs/inode.c::ntfs_read_locked_index_inode() where we
-	  forgot to update a temporary variable so loading index inodes which
-	  have an index allocation attribute failed.
-	- Add a missing call to flush_dcache_mft_record_page() in
-	  fs/ntfs/inode.c::ntfs_write_inode().
-	- Handle the recently introduced -ENAMETOOLONG return value from
-	  fs/ntfs/unistr.c::ntfs_nlstoucs() in fs/ntfs/namei.c::ntfs_lookup().
-	- Semaphore to mutex conversion.  (Ingo Molnar)
-
-2.1.26 - Minor bug fixes and updates.
-
-	- Fix a potential overflow in file.c where a cast to s64 was missing in
-	  a left shift of a page index.
-	- The struct inode has had its i_sem semaphore changed to a mutex named
-	  i_mutex.
-	- We have struct kmem_cache now so use it instead of the typedef
-	  kmem_cache_t.  (Pekka Enberg)
-	- Implement support for sector sizes above 512 bytes (up to the maximum
-	  supported by NTFS which is 4096 bytes).
-	- Do more detailed reporting of why we cannot mount read-write by
-	  special casing the VOLUME_MODIFIED_BY_CHKDSK flag.
-	- Miscellaneous updates to layout.h.
-	- Cope with attribute list attribute having invalid flags.  Windows
-	  copes with this and even chkdsk does not detect or fix this so we
-	  have to cope with it, too.  Thanks to Pawel Kot for reporting the
-	  problem.
-
-2.1.25 - (Almost) fully implement write(2) and truncate(2).
-
-	- Change ntfs_map_runlist_nolock(), ntfs_attr_find_vcn_nolock() and
-	  {__,}ntfs_cluster_free() to also take an optional attribute search
-	  context as argument.  This allows calling these functions with the
-	  mft record mapped.  Update all callers.
-	- Fix potential deadlock in ntfs_mft_data_extend_allocation_nolock()
-	  error handling by passing in the active search context when calling
-	  ntfs_cluster_free().
-	- Change ntfs_cluster_alloc() to take an extra boolean parameter
-	  specifying whether the cluster are being allocated to extend an
-	  attribute or to fill a hole.
-	- Change ntfs_attr_make_non_resident() to call ntfs_cluster_alloc()
-	  with @is_extension set to TRUE and remove the runlist terminator
-	  fixup code as this is now done by ntfs_cluster_alloc().
-	- Change ntfs_attr_make_non_resident to take the attribute value size
-	  as an extra parameter.  This is needed since we need to know the size
-	  before we can map the mft record and our callers always know it.  The
-	  reason we cannot simply read the size from the vfs inode i_size is
-	  that this is not necessarily uptodate.  This happens when
-	  ntfs_attr_make_non_resident() is called in the ->truncate call path.
-	- Fix ntfs_attr_make_non_resident() to update the vfs inode i_blocks
-	  which is zero for a resident attribute but should no longer be zero
-	  once the attribute is non-resident as it then has real clusters
-	  allocated.
-	- Add fs/ntfs/attrib.[hc]::ntfs_attr_extend_allocation(), a function to
-	  extend the allocation of an attributes.  Optionally, the data size,
-	  but not the initialized size can be extended, too.
-	- Implement fs/ntfs/inode.[hc]::ntfs_truncate().  It only supports
-	  uncompressed and unencrypted files and it never creates sparse files
-	  at least for the moment (making a file sparse requires us to modify
-	  its directory entries and we do not support directory operations at
-	  the moment).  Also, support for highly fragmented files, i.e. ones
-	  whose data attribute is split across multiple extents, is severly
-	  limited.  When such a case is encountered, EOPNOTSUPP is returned.
-	- Enable ATTR_SIZE attribute changes in ntfs_setattr().  This completes
-	  the initial implementation of file truncation.  Now both open(2)ing
-	  a file with the O_TRUNC flag and the {,f}truncate(2) system calls
-	  will resize a file appropriately.  The limitations are that only
-	  uncompressed and unencrypted files are supported.  Also, there is
-	  only very limited support for highly fragmented files (the ones whose
-	  $DATA attribute is split into multiple attribute extents).
-	- In attrib.c::ntfs_attr_set() call balance_dirty_pages_ratelimited()
-	  and cond_resched() in the main loop as we could be dirtying a lot of
-	  pages and this ensures we play nice with the VM and the system as a
-	  whole.
-	- Implement file operations ->write, ->aio_write, ->writev for regular
-	  files.  This replaces the old use of generic_file_write(), et al and
-	  the address space operations ->prepare_write and ->commit_write.
-	  This means that both sparse and non-sparse (unencrypted and
-	  uncompressed) files can now be extended using the normal write(2)
-	  code path.  There are two limitations at present and these are that
-	  we never create sparse files and that we only have limited support
-	  for highly fragmented files, i.e. ones whose data attribute is split
-	  across multiple extents.   When such a case is encountered,
-	  EOPNOTSUPP is returned.
-	- $EA attributes can be both resident and non-resident.
-	- Use %z for size_t to fix compilation warnings.  (Andrew Morton)
-	- Fix compilation warnings with gcc-4.0.2 on SUSE 10.0.
-	- Document extended attribute ($EA) NEED_EA flag.  (Based on libntfs
-	  patch by Yura Pakhuchiy.)
-
-2.1.24 - Lots of bug fixes and support more clean journal states.
-
-	- Support journals ($LogFile) which have been modified by chkdsk.  This
-	  means users can boot into Windows after we marked the volume dirty.
-	  The Windows boot will run chkdsk and then reboot.  The user can then
-	  immediately boot into Linux rather than having to do a full Windows
-	  boot first before rebooting into Linux and we will recognize such a
-	  journal and empty it as it is clean by definition.  Note, this only
-	  works if chkdsk left the journal in an obviously clean state.
-	- Support journals ($LogFile) with only one restart page as well as
-	  journals with two different restart pages.  We sanity check both and
-	  either use the only sane one or the more recent one of the two in the
-	  case that both are valid.
-	- Add fs/ntfs/malloc.h::ntfs_malloc_nofs_nofail() which is analogous to
-	  ntfs_malloc_nofs() but it performs allocations with __GFP_NOFAIL and
-	  hence cannot fail.
-	- Use ntfs_malloc_nofs_nofail() in the two critical regions in
-	  fs/ntfs/runlist.c::ntfs_runlists_merge().  This means we no longer
-	  need to panic() if the allocation fails as it now cannot fail.
-	- Fix two nasty runlist merging bugs that had gone unnoticed so far.
-	  Thanks to Stefano Picerno for the bug report.
-	- Remove two bogus BUG_ON()s from fs/ntfs/mft.c.
-	- Fix handling of valid but empty mapping pairs array in
-	  fs/ntfs/runlist.c::ntfs_mapping_pairs_decompress().
-	- Report unrepresentable inodes during ntfs_readdir() as KERN_WARNING
-	  messages and include the inode number.  Thanks to Yura Pakhuchiy for
-	  pointing this out.
-	- Change ntfs_rl_truncate_nolock() to throw away the runlist if the new
-	  length is zero.
-	- Add runlist.[hc]::ntfs_rl_punch_nolock() which punches a caller
-	  specified hole into a runlist.
-	- Fix a bug in fs/ntfs/index.c::ntfs_index_lookup().  When the returned
-	  index entry is in the index root, we forgot to set the @ir pointer in
-	  the index context.  Thanks to Yura Pakhuchiy for finding this bug.
-	- Remove bogus setting of PageError in ntfs_read_compressed_block().
-	- Add fs/ntfs/attrib.[hc]::ntfs_resident_attr_value_resize().
-	- Fix a bug in ntfs_map_runlist_nolock() where we forgot to protect
-	  access to the allocated size in the ntfs inode with the size lock.
-	- Fix ntfs_attr_vcn_to_lcn_nolock() and ntfs_attr_find_vcn_nolock() to
-	  return LCN_ENOENT when there is no runlist and the allocated size is
-	  zero.
-	- Fix load_attribute_list() to handle the case of a NULL runlist.
-	- Fix handling of sparse attributes in ntfs_attr_make_non_resident().
-	- Add BUG() checks to ntfs_attr_make_non_resident() and ntfs_attr_set()
-	  to ensure that these functions are never called for compressed or
-	  encrypted attributes.
-	- Fix cluster (de)allocators to work when the runlist is NULL and more
-	  importantly to take a locked runlist rather than them locking it
-	  which leads to lock reversal.
-	- Truncate {a,c,m}time to the ntfs supported time granularity when
-	  updating the times in the inode in ntfs_setattr().
-	- Fixup handling of sparse, compressed, and encrypted attributes in
-	  fs/ntfs/inode.c::ntfs_read_locked_{,attr_,index_}inode(),
-	  fs/ntfs/aops.c::ntfs_{read,write}page().
-	- Make ntfs_write_block() not instantiate sparse blocks if they contain
-	  only zeroes.
-	- Optimize fs/ntfs/aops.c::ntfs_write_block() by extending the page
-	  lock protection over the buffer submission for i/o which allows the
-	  removal of the get_bh()/put_bh() pairs for each buffer.
-	- Fix fs/ntfs/aops.c::ntfs_{read,write}_block() to handle the case
-	  where a concurrent truncate has truncated the runlist under our feet.
-	- Fix page_has_buffers()/page_buffers() handling in fs/ntfs/aops.c.
-	- In fs/ntfs/aops.c::ntfs_end_buffer_async_read(), use a bit spin lock
-	  in the first buffer head instead of a driver global spin lock to
-	  improve scalability.
-	- Minor fix to error handling and error message display in
-	  fs/ntfs/aops.c::ntfs_prepare_nonresident_write().
-	- Change the mount options {u,f,d}mask to always parse the number as
-	  an octal number to conform to how chmod(1) works, too.  Thanks to
-	  Giuseppe Bilotta and Horst von Brand for pointing out the errors of
-	  my ways.
-	- Fix various bugs in the runlist merging code.  (Based on libntfs
-	  changes by Richard Russon.)
-	- Fix sparse warnings that have crept in over time.
-	- Change ntfs_cluster_free() to require a write locked runlist on entry
-	  since we otherwise get into a lock reversal deadlock if a read locked
-	  runlist is passed in. In the process also change it to take an ntfs
-	  inode instead of a vfs inode as parameter.
-	- Fix the definition of the CHKD ntfs record magic.  It had an off by
-	  two error causing it to be CHKB instead of CHKD.
-	- Fix a stupid bug in __ntfs_bitmap_set_bits_in_run() which caused the
-	  count to become negative and hence we had a wild memset() scribbling
-	  all over the system's ram.
-
-2.1.23 - Implement extension of resident files and make writing safe as well as
-	 many bug fixes, cleanups, and enhancements...
-
-	- Add printk rate limiting for ntfs_warning() and ntfs_error() when
-	  compiled without debug.  This avoids a possible denial of service
-	  attack.  Thanks to Carl-Daniel Hailfinger from SuSE for pointing this
-	  out.
-	- Fix compilation warnings on ia64.  (Randy Dunlap)
-	- Use i_size_{read,write}() instead of reading i_size by hand and cache
-	  the value where apropriate.
-	- Add size_lock to the ntfs_inode structure.  This is an rw spinlock
-	  and it locks against access to the inode sizes.  Note, ->size_lock
-	  is also accessed from irq context so you must use the _irqsave and
-	  _irqrestore lock and unlock functions, respectively.  Protect all
-	  accesses to allocated_size, initialized_size, and compressed_size.
-	- Minor optimization to fs/ntfs/super.c::ntfs_statfs() and its helpers.
-	- Implement extension of resident files in the regular file write code
-	  paths (fs/ntfs/aops.c::ntfs_{prepare,commit}_write()).  At present
-	  this only works until the data attribute becomes too big for the mft
-	  record after which we abort the write returning -EOPNOTSUPP from
-	  ntfs_prepare_write().
-	- Add disable_sparse mount option together with a per volume sparse
-	  enable bit which is set appropriately and a per inode sparse disable
-	  bit which is preset on some system file inodes as appropriate.
-	- Enforce that sparse support is disabled on NTFS volumes pre 3.0.
-	- Fix a bug in fs/ntfs/runlist.c::ntfs_mapping_pairs_decompress() in
-	  the creation of the unmapped runlist element for the base attribute
-	  extent.
-	- Split ntfs_map_runlist() into ntfs_map_runlist() and a non-locking
-	  helper ntfs_map_runlist_nolock() which is used by ntfs_map_runlist().
-	  This allows us to map runlist fragments with the runlist lock already
-	  held without having to drop and reacquire it around the call.  Adapt
-	  all callers.
-	- Change ntfs_find_vcn() to ntfs_find_vcn_nolock() which takes a locked
-	  runlist.  This allows us to find runlist elements with the runlist
-	  lock already held without having to drop and reacquire it around the
-	  call.  Adapt all callers.
-	- Change time to u64 in time.h::ntfs2utc() as it otherwise generates a
-	  warning in the do_div() call on sparc32.  Thanks to Meelis Roos for
-	  the report and analysis of the warning.
-	- Fix a nasty runlist merge bug when merging two holes.
-	- Set the ntfs_inode->allocated_size to the real allocated size in the
-	  mft record for resident attributes (fs/ntfs/inode.c).
-	- Small readability cleanup to use "a" instead of "ctx->attr"
-	  everywhere (fs/ntfs/inode.c).
-	- Make fs/ntfs/namei.c::ntfs_get_{parent,dentry} static and move the
-	  definition of ntfs_export_ops from fs/ntfs/super.c to namei.c.  Also,
-	  declare ntfs_export_ops in fs/ntfs/ntfs.h.
-	- Correct sparse file handling.  The compressed values need to be
-	  checked and set in the ntfs inode as done for compressed files and
-	  the compressed size needs to be used for vfs inode->i_blocks instead
-	  of the allocated size, again, as done for compressed files.
-	- Add AT_EA in addition to AT_DATA to whitelist for being allowed to be
-	  non-resident in fs/ntfs/attrib.c::ntfs_attr_can_be_non_resident().
-	- Add fs/ntfs/attrib.c::ntfs_attr_vcn_to_lcn_nolock() used by the new
-	  write code.
-	- Fix bug in fs/ntfs/attrib.c::ntfs_find_vcn_nolock() where after
-	  dropping the read lock and taking the write lock we were not checking
-	  whether someone else did not already do the work we wanted to do.
-	- Rename fs/ntfs/attrib.c::ntfs_find_vcn_nolock() to
-	  ntfs_attr_find_vcn_nolock() and update all callers.
-	- Add fs/ntfs/attrib.[hc]::ntfs_attr_make_non_resident().
-	- Fix sign of various error return values to be negative in
-	  fs/ntfs/lcnalloc.c.
-	- Modify ->readpage and ->writepage (fs/ntfs/aops.c) so they detect and
-	  handle the case where an attribute is converted from resident to
-	  non-resident by a concurrent file write.
-	- Remove checks for NULL before calling kfree() since kfree() does the
-	  checking itself.  (Jesper Juhl)
-	- Some utilities modify the boot sector but do not update the checksum.
-	  Thus, relax the checking in fs/ntfs/super.c::is_boot_sector_ntfs() to
-	  only emit a warning when the checksum is incorrect rather than
-	  refusing the mount.  Thanks to Bernd Casimir for pointing this
-	  problem out.
-	- Update attribute definition handling.
-	- Add NTFS_MAX_CLUSTER_SIZE and NTFS_MAX_PAGES_PER_CLUSTER constants.
-	- Use NTFS_MAX_CLUSTER_SIZE in super.c instead of hard coding 0x10000.
-	- Use MAX_BUF_PER_PAGE instead of variable sized array allocation for
-	  better code generation and one less sparse warning in fs/ntfs/aops.c.
-	- Remove spurious void pointer casts from fs/ntfs/.  (Pekka Enberg)
-	- Use C99 style structure initialization after memory allocation where
-	  possible (fs/ntfs/{attrib.c,index.c,super.c}).  Thanks to Al Viro and
-	  Pekka Enberg.
-	- Stamp the transaction log ($UsnJrnl), aka user space journal, if it
-	  is active on the volume and we are mounting read-write or remounting
-	  from read-only to read-write.
-	- Fix a bug in address space operations error recovery code paths where
-	  if the runlist was not mapped at all and a mapping error occured we
-	  would leave the runlist locked on exit to the function so that the
-	  next access to the same file would try to take the lock and deadlock.
-	- Detect the case when Windows has been suspended to disk on the volume
-	  to be mounted and if this is the case do not allow (re)mounting
-	  read-write.  This is done by parsing hiberfil.sys if present.
-	- Fix several occurences of a bug where we would perform 'var & ~const'
-	  with a 64-bit variable and a int, i.e. 32-bit, constant.  This causes
-	  the higher order 32-bits of the 64-bit variable to be zeroed.  To fix
-	  this cast the 'const' to the same 64-bit type as 'var'.
-	- Change the runlist terminator of the newly allocated cluster(s) to
-	  LCN_ENOENT in ntfs_attr_make_non_resident().  Otherwise the runlist
-	  code gets confused.
-	- Add an extra parameter @last_vcn to ntfs_get_size_for_mapping_pairs()
-	  and ntfs_mapping_pairs_build() to allow the runlist encoding to be
-	  partial which is desirable when filling holes in sparse attributes.
-	  Update all callers.
-	- Change ntfs_map_runlist_nolock() to only decompress the mapping pairs
-	  if the requested vcn is inside it.  Otherwise we get into problems
-	  when we try to map an out of bounds vcn because we then try to map
-	  the already mapped runlist fragment which causes
-	  ntfs_mapping_pairs_decompress() to fail and return error.  Update
-	  ntfs_attr_find_vcn_nolock() accordingly.
-	- Fix a nasty deadlock that appeared in recent kernels.
-	  The situation: VFS inode X on a mounted ntfs volume is dirty.  For
-	  same inode X, the ntfs_inode is dirty and thus corresponding on-disk
-	  inode, i.e. mft record, which is in a dirty PAGE_CACHE_PAGE belonging
-	  to the table of inodes, i.e. $MFT, inode 0.
-	  What happens:
-	  Process 1: sys_sync()/umount()/whatever...  calls
-	  __sync_single_inode() for $MFT -> do_writepages() -> write_page for
-	  the dirty page containing the on-disk inode X, the page is now locked
-	  -> ntfs_write_mst_block() which clears PageUptodate() on the page to
-	  prevent anyone else getting hold of it whilst it does the write out.
-	  This is necessary as the on-disk inode needs "fixups" applied before
-	  the write to disk which are removed again after the write and
-	  PageUptodate is then set again.  It then analyses the page looking
-	  for dirty on-disk inodes and when it finds one it calls
-	  ntfs_may_write_mft_record() to see if it is safe to write this
-	  on-disk inode.  This then calls ilookup5() to check if the
-	  corresponding VFS inode is in icache().  This in turn calls ifind()
-	  which waits on the inode lock via wait_on_inode whilst holding the
-	  global inode_lock.
-	  Process 2: pdflush results in a call to __sync_single_inode for the
-	  same VFS inode X on the ntfs volume.  This locks the inode (I_LOCK)
-	  then calls write-inode -> ntfs_write_inode -> map_mft_record() ->
-	  read_cache_page() for the page (in page cache of table of inodes
-	  $MFT, inode 0) containing the on-disk inode.  This page has
-	  PageUptodate() clear because of Process 1 (see above) so
-	  read_cache_page() blocks when it tries to take the page lock for the
-	  page so it can call ntfs_read_page().
-	  Thus Process 1 is holding the page lock on the page containing the
-	  on-disk inode X and it is waiting on the inode X to be unlocked in
-	  ifind() so it can write the page out and then unlock the page.
-	  And Process 2 is holding the inode lock on inode X and is waiting for
-	  the page to be unlocked so it can call ntfs_readpage() or discover
-	  that Process 1 set PageUptodate() again and use the page.
-	  Thus we have a deadlock due to ifind() waiting on the inode lock.
-	  The solution: The fix is to use the newly introduced
-	  ilookup5_nowait() which does not wait on the inode's lock and hence
-	  avoids the deadlock.  This is safe as we do not care about the VFS
-	  inode and only use the fact that it is in the VFS inode cache and the
-	  fact that the vfs and ntfs inodes are one struct in memory to find
-	  the ntfs inode in memory if present.  Also, the ntfs inode has its
-	  own locking so it does not matter if the vfs inode is locked.
-	- Fix bug in mft record writing where we forgot to set the device in
-	  the buffers when mapping them after the VM had discarded them.
-	  Thanks to Martin MOKREJÅ for the bug report.
-
-2.1.22 - Many bug and race fixes and error handling improvements.
-
-	- Improve error handling in fs/ntfs/inode.c::ntfs_truncate().
-	- Change fs/ntfs/inode.c::ntfs_truncate() to return an error code
-	  instead of void and provide a helper ntfs_truncate_vfs() for the
-	  vfs ->truncate method.
-	- Add a new ntfs inode flag NInoTruncateFailed() and modify
-	  fs/ntfs/inode.c::ntfs_truncate() to set and clear it appropriately.
-	- Fix min_size and max_size definitions in ATTR_DEF structure in
-	  fs/ntfs/layout.h to be signed.
-	- Add attribute definition handling helpers to fs/ntfs/attrib.[hc]:
-	  ntfs_attr_size_bounds_check(), ntfs_attr_can_be_non_resident(), and
-	  ntfs_attr_can_be_resident(), which in turn use the new private helper
-	  ntfs_attr_find_in_attrdef().
-	- In fs/ntfs/aops.c::mark_ntfs_record_dirty(), take the
-	  mapping->private_lock around the dirtying of the buffer heads
-	  analagous to the way it is done in __set_page_dirty_buffers().
-	- Ensure the mft record size does not exceed the PAGE_CACHE_SIZE at
-	  mount time as this cannot work with the current implementation.
-	- Check for location of attribute name and improve error handling in
-	  general in fs/ntfs/inode.c::ntfs_read_locked_inode() and friends.
-	- In fs/ntfs/aops.c::ntfs_writepage(), if the page is fully outside
-	  i_size, i.e. race with truncate, invalidate the buffers on the page
-	  so that they become freeable and hence the page does not leak.
-	- Remove unused function fs/ntfs/runlist.c::ntfs_rl_merge().  (Adrian
-	  Bunk)
-	- Fix stupid bug in fs/ntfs/attrib.c::ntfs_attr_find() that resulted in
-	  a NULL pointer dereference in the error code path when a corrupt
-	  attribute was found.  (Thanks to Domen Puncer for the bug report.)
-	- Add MODULE_VERSION() to fs/ntfs/super.c.
-	- Make several functions and variables static.  (Adrian Bunk)
-	- Modify fs/ntfs/aops.c::mark_ntfs_record_dirty() so it allocates
-	  buffers for the page if they are not present and then marks the
-	  buffers belonging to the ntfs record dirty.  This causes the buffers
-	  to become busy and hence they are safe from removal until the page
-	  has been written out.
-	- Fix stupid bug in fs/ntfs/attrib.c::ntfs_external_attr_find() in the
-	  error handling code path that resulted in a BUG() due to trying to
-	  unmap an extent mft record when the mapping of it had failed and it
-	  thus was not mapped.  (Thanks to Ken MacFerrin for the bug report.)
-	- Drop the runlist lock after the vcn has been read in
-	  fs/ntfs/lcnalloc.c::__ntfs_cluster_free().
-	- Rewrite handling of multi sector transfer errors.  We now do not set
-	  PageError() when such errors are detected in the async i/o handler
-	  fs/ntfs/aops.c::ntfs_end_buffer_async_read().  All users of mst
-	  protected attributes now check the magic of each ntfs record as they
-	  use it and act appropriately.  This has the effect of making errors
-	  granular per ntfs record rather than per page which solves the case
-	  where we cannot access any of the ntfs records in a page when a
-	  single one of them had an mst error.  (Thanks to Ken MacFerrin for
-	  the bug report.)
-	- Fix error handling in fs/ntfs/quota.c::ntfs_mark_quotas_out_of_date()
-	  where we failed to release i_mutex on the $Quota/$Q attribute inode.
-	- Fix bug in handling of bad inodes in fs/ntfs/namei.c::ntfs_lookup().
-	- Add mapping of unmapped buffers to all remaining code paths, i.e.
-	  fs/ntfs/aops.c::ntfs_write_mst_block(), mft.c::ntfs_sync_mft_mirror(),
-	  and write_mft_record_nolock().  From now on we require that the
-	  complete runlist for the mft mirror is always mapped into memory.
-	- Add creation of buffers to fs/ntfs/mft.c::ntfs_sync_mft_mirror().
-	- Improve error handling in fs/ntfs/aops.c::ntfs_{read,write}_block().
-	- Cleanup fs/ntfs/aops.c::ntfs_{read,write}page() since we know that a
-	  resident attribute will be smaller than a page which makes the code
-	  simpler.  Also make the code more tolerant to concurrent ->truncate.
-
-2.1.21 - Fix some races and bugs, rewrite mft write code, add mft allocator.
-
-	- Implement extent mft record deallocation
-	  fs/ntfs/mft.c::ntfs_extent_mft_record_free().
-	- Splitt runlist related functions off from attrib.[hc] to runlist.[hc].
-	- Add vol->mft_data_pos and initialize it at mount time.
-	- Rename init_runlist() to ntfs_init_runlist(), ntfs_vcn_to_lcn() to
-	  ntfs_rl_vcn_to_lcn(), decompress_mapping_pairs() to
-	  ntfs_mapping_pairs_decompress(), ntfs_merge_runlists() to
-	  ntfs_runlists_merge() and adapt all callers.
-	- Add fs/ntfs/runlist.[hc]::ntfs_get_nr_significant_bytes(),
-	  ntfs_get_size_for_mapping_pairs(), ntfs_write_significant_bytes(),
-	  and ntfs_mapping_pairs_build(), adapted from libntfs.
-	- Make fs/ntfs/lcnalloc.c::ntfs_cluster_free_from_rl_nolock() not
-	  static and add a declaration for it to lcnalloc.h.
-	- Add fs/ntfs/lcnalloc.h::ntfs_cluster_free_from_rl() which is a static
-	  inline wrapper for ntfs_cluster_free_from_rl_nolock() which takes the
-	  cluster bitmap lock for the duration of the call.
-	- Add fs/ntfs/attrib.[hc]::ntfs_attr_record_resize().
-	- Implement the equivalent of memset() for an ntfs attribute in
-	  fs/ntfs/attrib.[hc]::ntfs_attr_set() and switch
-	  fs/ntfs/logfile.c::ntfs_empty_logfile() to using it.
-	- Remove unnecessary casts from LCN_* constants.
-	- Implement fs/ntfs/runlist.c::ntfs_rl_truncate_nolock().
-	- Add MFT_RECORD_OLD as a copy of MFT_RECORD in fs/ntfs/layout.h and
-	  change MFT_RECORD to contain the NTFS 3.1+ specific fields.
-	- Add a helper function fs/ntfs/aops.c::mark_ntfs_record_dirty() which
-	  marks all buffers belonging to an ntfs record dirty, followed by
-	  marking the page the ntfs record is in dirty and also marking the vfs
-	  inode containing the ntfs record dirty (I_DIRTY_PAGES).
-	- Switch fs/ntfs/index.h::ntfs_index_entry_mark_dirty() to using the
-	  new helper fs/ntfs/aops.c::mark_ntfs_record_dirty() and remove the no
-	  longer needed fs/ntfs/index.[hc]::__ntfs_index_entry_mark_dirty().
-	- Move ntfs_{un,}map_page() from ntfs.h to aops.h and fix resulting
-	  include errors.
-	- Move the typedefs for runlist_element and runlist from types.h to
-	  runlist.h and fix resulting include errors.
-	- Remove unused {__,}format_mft_record() from fs/ntfs/mft.c.
-	- Modify fs/ntfs/mft.c::__mark_mft_record_dirty() to use the helper
-	  mark_ntfs_record_dirty() which also changes the behaviour in that we
-	  now set the buffers belonging to the mft record dirty as well as the
-	  page itself.
-	- Update fs/ntfs/mft.c::write_mft_record_nolock() and sync_mft_mirror()
-	  to cope with the fact that there now are dirty buffers in mft pages.
-	- Update fs/ntfs/inode.c::ntfs_write_inode() to also use the helper
-	  mark_ntfs_record_dirty() and thus to set the buffers belonging to the
-	  mft record dirty as well as the page itself.
-	- Fix compiler warnings on x86-64 in fs/ntfs/dir.c.  (Randy Dunlap,
-	  slightly modified by me)
-	- Add fs/ntfs/mft.c::try_map_mft_record() which fails with -EALREADY if
-	  the mft record is already locked and otherwise behaves the same way
-	  as fs/ntfs/mft.c::map_mft_record().
-	- Modify fs/ntfs/mft.c::write_mft_record_nolock() so that it only
-	  writes the mft record if the buffers belonging to it are dirty.
-	  Otherwise we assume that it was written out by other means already.
-	- Attempting to write outside initialized size is _not_ a bug so remove
-	  the bug check from fs/ntfs/aops.c::ntfs_write_mst_block().  It is in
-	  fact required to write outside initialized size when preparing to
-	  extend the initialized size.
-	- Map the page instead of using page_address() before writing to it in
-	  fs/ntfs/aops.c::ntfs_mft_writepage().
-	- Provide exclusion between opening an inode / mapping an mft record
-	  and accessing the mft record in fs/ntfs/mft.c::ntfs_mft_writepage()
-	  by setting the page not uptodate throughout ntfs_mft_writepage().
-	- Clear the page uptodate flag in fs/ntfs/aops.c::ntfs_write_mst_block()
-	  to ensure noone can see the page whilst the mst fixups are applied.
-	- Add the helper fs/ntfs/mft.c::ntfs_may_write_mft_record() which
-	  checks if an mft record may be written out safely obtaining any
-	  necessary locks in the process.  This is used by
-	  fs/ntfs/aops.c::ntfs_write_mst_block().
-	- Modify fs/ntfs/aops.c::ntfs_write_mst_block() to also work for
-	  writing mft records and improve its error handling in the process.
-	  Now if any of the records in the page fail to be written out, all
-	  other records will be written out instead of aborting completely.
-	- Remove ntfs_mft_aops and update all users to use ntfs_mst_aops.
-	- Modify fs/ntfs/inode.c::ntfs_read_locked_inode() to set the
-	  ntfs_mst_aops for all inodes which are NInoMstProtected() and
-	  ntfs_aops for all other inodes.
-	- Rename fs/ntfs/mft.c::sync_mft_mirror{,_umount}() to
-	  ntfs_sync_mft_mirror{,_umount}() and change their parameters so they
-	  no longer require an ntfs inode to be present.  Update all callers.
-	- Cleanup the error handling in fs/ntfs/mft.c::ntfs_sync_mft_mirror().
-	- Clear the page uptodate flag in fs/ntfs/mft.c::ntfs_sync_mft_mirror()
-	  to ensure noone can see the page whilst the mst fixups are applied.
-	- Remove the no longer needed fs/ntfs/mft.c::ntfs_mft_writepage() and
-	  fs/ntfs/mft.c::try_map_mft_record().
-	- Fix callers of fs/ntfs/aops.c::mark_ntfs_record_dirty() to call it
-	  with the ntfs inode which contains the page rather than the ntfs
-	  inode the mft record of which is in the page.
-	- Fix race condition in fs/ntfs/inode.c::ntfs_put_inode() by moving the
-	  index inode bitmap inode release code from there to
-	  fs/ntfs/inode.c::ntfs_clear_big_inode().  (Thanks to Christoph
-	  Hellwig for spotting this.)
-	- Fix race condition in fs/ntfs/inode.c::ntfs_put_inode() by taking the
-	  inode semaphore around the code that sets ni->itype.index.bmp_ino to
-	  NULL and reorganize the code to optimize it a bit.  (Thanks to
-	  Christoph Hellwig for spotting this.)
-	- Modify fs/ntfs/aops.c::mark_ntfs_record_dirty() to no longer take the
-	  ntfs inode as a parameter as this is confusing and misleading and the
-	  needed ntfs inode is available via NTFS_I(page->mapping->host).
-	  Adapt all callers to this change.
-	- Modify fs/ntfs/mft.c::write_mft_record_nolock() and
-	  fs/ntfs/aops.c::ntfs_write_mst_block() to only check the dirty state
-	  of the first buffer in a record and to take this as the ntfs record
-	  dirty state.  We cannot look at the dirty state for subsequent
-	  buffers because we might be racing with
-	  fs/ntfs/aops.c::mark_ntfs_record_dirty().
-	- Move the static inline ntfs_init_big_inode() from fs/ntfs/inode.c to
-	  inode.h and make fs/ntfs/inode.c::__ntfs_init_inode() non-static and
-	  add a declaration for it to inode.h.  Fix some compilation issues
-	  that resulted due to #includes and header file interdependencies.
-	- Simplify setup of i_mode in fs/ntfs/inode.c::ntfs_read_locked_inode().
-	- Add helpers fs/ntfs/layout.h::MK_MREF() and MK_LE_MREF().
-	- Modify fs/ntfs/mft.c::map_extent_mft_record() to only verify the mft
-	  record sequence number if it is specified (i.e. not zero).
-	- Add fs/ntfs/mft.[hc]::ntfs_mft_record_alloc() and various helper
-	  functions used by it.
-	- Update Documentation/filesystems/ntfs.txt with instructions on how to
-	  use the Device-Mapper driver with NTFS ftdisk/LDM raid.  This removes
-	  the linear raid problem with the Software RAID / MD driver when one
-	  or more of the devices has an odd number of sectors.
-
-2.1.20 - Fix two stupid bugs introduced in 2.1.18 release.
-
-	- Fix stupid bug in fs/ntfs/attrib.c::ntfs_attr_reinit_search_ctx()
-	  where we did not clear ctx->al_entry but it was still set due to
-	  changes in ntfs_attr_lookup() and ntfs_external_attr_find() in
-	  particular.
-	- Fix another stupid bug in fs/ntfs/attrib.c::ntfs_external_attr_find()
-	  where we forgot to unmap the extent mft record when we had finished
-	  enumerating an attribute which caused a bug check to trigger when the
-	  VFS calls ->clear_inode.
-
-2.1.19 - Many cleanups, improvements, and a minor bug fix.
-
-	- Update ->setattr (fs/ntfs/inode.c::ntfs_setattr()) to refuse to
-	  change the uid, gid, and mode of an inode as we do not support NTFS
-	  ACLs yet.
-	- Remove BKL use from ntfs_setattr() syncing up with the rest of the
-	  kernel.
-	- Get rid of the ugly transparent union in fs/ntfs/dir.c::ntfs_readdir()
-	  and ntfs_filldir() as per suggestion from Al Viro.
-	- Change '\0' and L'\0' to simply 0 as per advice from Linus Torvalds.
-	- Update ->truncate (fs/ntfs/inode.c::ntfs_truncate()) to check if the
-	  inode size has changed and to only output an error if so.
-	- Rename fs/ntfs/attrib.h::attribute_value_length() to ntfs_attr_size().
-	- Add le{16,32,64} as well as sle{16,32,64} data types to
-	  fs/ntfs/types.h.
-	- Change ntfschar to be le16 instead of u16 in fs/ntfs/types.h.
-	- Add le versions of VCN, LCN, and LSN called leVCN, leLCN, and leLSN,
-	  respectively, to fs/ntfs/types.h.
-	- Update endianness conversion macros in fs/ntfs/endian.h to use the
-	  new types as appropriate.
-	- Do proper type casting when using sle64_to_cpup() in fs/ntfs/dir.c
-	  and index.c.
-	- Add leMFT_REF data type to fs/ntfs/layout.h.
-	- Update all NTFS header files with the new little endian data types.
-	  Affected files are fs/ntfs/layout.h, logfile.h, and time.h.
-	- Do proper type casting when using ntfs_is_*_recordp() in
-	  fs/ntfs/logfile.c, mft.c, and super.c. 
-	- Fix all the sparse bitwise warnings.  Had to change all the typedef
-	  enums storing little endian values to simple enums plus a typedef for
-	  the datatype to make sparse happy.
-	- Fix a bug found by the new sparse bitwise warnings where the default
-	  upcase table was defined as a pointer to wchar_t rather than ntfschar
-	  in fs/ntfs/ntfs.h and super.c.
-	- Change {const_,}cpu_to_le{16,32}(0) to just 0 as suggested by Al Viro.
-
-2.1.18 - Fix scheduling latencies at mount time as well as an endianness bug.
-
-	- Remove vol->nr_mft_records as it was pretty meaningless and optimize
-	  the calculation of total/free inodes as used by statfs().
-	- Fix scheduling latencies in ntfs_fill_super() by dropping the BKL
-	  because the code itself is using the ntfs_lock semaphore which
-	  provides safe locking.  (Ingo Molnar)
-	- Fix a potential bug in fs/ntfs/mft.c::map_extent_mft_record() that
-	  could occur in the future for when we start closing/freeing extent
-	  inodes if we don't set base_ni->ext.extent_ntfs_inos to NULL after
-	  we free it.
-	- Rename {find,lookup}_attr() to ntfs_attr_{find,lookup}() as well as
-	  find_external_attr() to ntfs_external_attr_find() to cleanup the
-	  namespace a bit and to be more consistent with libntfs.
-	- Rename {{re,}init,get,put}_attr_search_ctx() to
-	  ntfs_attr_{{re,}init,get,put}_search_ctx() as well as the type
-	  attr_search_context to ntfs_attr_search_ctx.
-	- Force use of ntfs_attr_find() in ntfs_attr_lookup() when searching
-	  for the attribute list attribute itself.
-	- Fix endianness bug in ntfs_external_attr_find().
-	- Change ntfs_{external_,}attr_find() to return 0 on success, -ENOENT
-	  if the attribute is not found, and -EIO on real error.  In the case
-	  of -ENOENT, the search context is updated to describe the attribute
-	  before which the attribute being searched for would need to be
-	  inserted if such an action were to be desired and in the case of
-	  ntfs_external_attr_find() the search context is also updated to
-	  indicate the attribute list entry before which the attribute list
-	  entry of the attribute being searched for would need to be inserted
-	  if such an action were to be desired.  Also make ntfs_find_attr()
-	  static and remove its prototype from attrib.h as it is not used
-	  anywhere other than attrib.c.  Update ntfs_attr_lookup() and all
-	  callers of ntfs_{external,}attr_{find,lookup}() for the new return
-	  values.
-	- Minor cleanup of fs/ntfs/inode.c::ntfs_init_locked_inode().
-
-2.1.17 - Fix bugs in mount time error code paths and other updates.
-
-	- Implement bitmap modification code (fs/ntfs/bitmap.[hc]).  This
-	  includes functions to set/clear a single bit or a run of bits.
-	- Add fs/ntfs/attrib.[hc]::ntfs_find_vcn() which returns the locked
-	  runlist element containing a particular vcn.  It also takes care of
-	  mapping any needed runlist fragments.
-	- Implement cluster (de-)allocation code (fs/ntfs/lcnalloc.[hc]).
-	- Load attribute definition table from $AttrDef at mount time.
-	- Fix bugs in mount time error code paths involving (de)allocation of
-	  the default and volume upcase tables.
-	- Remove ntfs_nr_mounts as it is no longer used.
-
-2.1.16 - Implement access time updates, file sync, async io, and read/writev.
-
-	- Add support for readv/writev and aio_read/aio_write (fs/ntfs/file.c).
-	  This is done by setting the appropriate file operations pointers to
-	  the generic helper functions provided by mm/filemap.c.
-	- Implement fsync, fdatasync, and msync both for files (fs/ntfs/file.c)
-	  and directories (fs/ntfs/dir.c).
-	- Add support for {a,m,c}time updates to inode.c::ntfs_write_inode().
-	  Note, except for the root directory and any other system files opened
-	  by the user, the system files will not have their access times
-	  updated as they are only accessed at the inode level an hence the
-	  file level functions which cause the times to be updated are never
-	  invoked.
-
-2.1.15 - Invalidate quotas when (re)mounting read-write.
-
-	- Add new element itype.index.collation_rule to the ntfs inode
-	  structure and set it appropriately in ntfs_read_locked_inode().
-	- Implement a new inode type "index" to allow efficient access to the
-	  indices found in various system files and adapt inode handling
-	  accordingly (fs/ntfs/inode.[hc]).  An index inode is essentially an
-	  attribute inode (NInoAttr() is true) with an attribute type of
-	  AT_INDEX_ALLOCATION.  As such, it is no longer allowed to call
-	  ntfs_attr_iget() with an attribute type of AT_INDEX_ALLOCATION as
-	  there would be no way to distinguish between normal attribute inodes
-	  and index inodes.  The function to obtain an index inode is
-	  ntfs_index_iget() and it uses the helper function
-	  ntfs_read_locked_index_inode().  Note, we do not overload
-	  ntfs_attr_iget() as indices consist of multiple attributes so using
-	  ntfs_attr_iget() to obtain an index inode would be confusing.
-	- Ensure that there is no overflow when doing page->index <<
-	  PAGE_CACHE_SHIFT by casting page->index to s64 in fs/ntfs/aops.c.
-	- Use atomic kmap instead of kmap() in fs/ntfs/aops.c::ntfs_read_page()
-	  and ntfs_read_block().
-	- Use case sensitive attribute lookups instead of case insensitive ones.
-	- Lock all page cache pages belonging to mst protected attributes while
-	  accessing them to ensure we never see corrupt data while the page is
-	  under writeout.
-	- Add framework for generic ntfs collation (fs/ntfs/collation.[hc]).
-	  We have ntfs_is_collation_rule_supported() to check if the collation
-	  rule you want to use is supported and ntfs_collation() which actually
-	  collates two data items.  We currently only support COLLATION_BINARY
-	  and COLLATION_NTOFS_ULONG but support for other collation rules will
-	  be added as the need arises.
-	- Add a new type, ntfs_index_context, to allow retrieval of an index
-	  entry using the corresponding index key.  To get an index context,
-	  use ntfs_index_ctx_get() and to release it, use ntfs_index_ctx_put().
-	  This also adds a new slab cache for the index contexts.  To lookup a
-	  key in an index inode, use ntfs_index_lookup().  After modifying an
-	  index entry, call ntfs_index_entry_flush_dcache_page() followed by
-	  ntfs_index_entry_mark_dirty() to ensure the changes are written out
-	  to disk.  For details see fs/ntfs/index.[hc].  Note, at present, if
-	  an index entry is in the index allocation attribute rather than the
-	  index root attribute it will not be written out (you will get a
-	  warning message about discarded changes instead).
-	- Load the quota file ($Quota) and check if quota tracking is enabled
-	  and if so, mark the quotas out of date.  This causes windows to
-	  rescan the volume on boot and update all quota entries.
-	- Add a set_page_dirty address space operation for ntfs_m[fs]t_aops.
-	  It is simply set to __set_page_dirty_nobuffers() to make sure that
-	  running set_page_dirty() on a page containing mft/ntfs records will
-	  not affect the dirty state of the page buffers.
-	- Add fs/ntfs/index.c::__ntfs_index_entry_mark_dirty() which sets all
-	  buffers that are inside the ntfs record in the page dirty after which
-	  it sets the page dirty.  This allows ->writepage to only write the
-	  dirty index records rather than having to write all the records in
-	  the page.  Modify fs/ntfs/index.h::ntfs_index_entry_mark_dirty() to
-	  use this rather than __set_page_dirty_nobuffers().
-	- Implement fs/ntfs/aops.c::ntfs_write_mst_block() which enables the
-	  writing of page cache pages belonging to mst protected attributes
-	  like the index allocation attribute in directory indices and other
-	  indices like $Quota/$Q, etc.  This means that the quota is now marked
-	  out of date on all volumes rather than only on ones where the quota
-	  defaults entry is in the index root attribute of the $Quota/$Q index.
-
-2.1.14 - Fix an NFSd caused deadlock reported by several users.
-
-	- Modify fs/ntfs/ntfs_readdir() to copy the index root attribute value
-	  to a buffer so that we can put the search context and unmap the mft
-	  record before calling the filldir() callback.  We need to do this
-	  because of NFSd which calls ->lookup() from its filldir callback()
-	  and this causes NTFS to deadlock as ntfs_lookup() maps the mft record
-	  of the directory and since ntfs_readdir() has got it mapped already
-	  ntfs_lookup() deadlocks.
-
-2.1.13 - Enable overwriting of resident files and housekeeping of system files.
-
-	- Implement writing of mft records (fs/ntfs/mft.[hc]), which includes
-	  keeping the mft mirror in sync with the mft when mirrored mft records
-	  are written.  The functions are write_mft_record{,_nolock}().  The
-	  implementation is quite rudimentary for now with lots of things not
-	  implemented yet but I am not sure any of them can actually occur so
-	  I will wait for people to hit each one and only then implement it.
-	- Commit open system inodes at umount time.  This should make it
-	  virtually impossible for sync_mft_mirror_umount() to ever be needed.
-	- Implement ->write_inode (fs/ntfs/inode.c::ntfs_write_inode()) for the
-	  ntfs super operations.  This gives us inode writing via the VFS inode
-	  dirty code paths.  Note:  Access time updates are not implemented yet.
-	- Implement fs/ntfs/mft.[hc]::{,__}mark_mft_record_dirty() and make
-	  fs/ntfs/aops.c::ntfs_writepage() and ntfs_commit_write() use it, thus
-	  finally enabling resident file overwrite!  (-8  This also includes a
-	  placeholder for ->writepage (ntfs_mft_writepage()), which for now
-	  just redirties the page and returns.  Also, at umount time, we for
-	  now throw away all mft data page cache pages after the last call to
-	  ntfs_commit_inode() in the hope that all inodes will have been
-	  written out by then and hence no dirty (meta)data will be lost.  We
-	  also check for this case and emit an error message telling the user
-	  to run chkdsk.
-	- Use set_page_writeback() and end_page_writeback() in the resident
-	  attribute code path of fs/ntfs/aops.c::ntfs_writepage() otherwise
-	  the radix-tree tag PAGECACHE_TAG_DIRTY remains set even though the
-	  page is clean.
-	- Implement ntfs_mft_writepage() so it now checks if any of the mft
-	  records in the page are dirty and if so redirties the page and
-	  returns.  Otherwise it just returns (after doing set_page_writeback(),
-	  unlock_page(), end_page_writeback() or the radix-tree tag
-	  PAGECACHE_TAG_DIRTY remains set even though the page is clean), thus
-	  alowing the VM to do with the page as it pleases.  Also, at umount
-	  time, now only throw away dirty mft (meta)data pages if dirty inodes
-	  are present and ask the user to email us if they see this happening.
-	- Add functions ntfs_{clear,set}_volume_flags(), to modify the volume
-	  information flags (fs/ntfs/super.c).
-	- Mark the volume dirty when (re)mounting read-write and mark it clean
-	  when unmounting or remounting read-only.  If any volume errors are
-	  found, the volume is left marked dirty to force chkdsk to run.
-	- Add code to set the NT4 compatibility flag when (re)mounting
-	  read-write for newer NTFS versions but leave it commented out for now
-	  since we do not make any modifications that are NTFS 1.2 specific yet
-	  and since setting this flag breaks Captive-NTFS which is not nice.
-	  This code must be enabled once we start writing NTFS 1.2 specific
-	  changes otherwise Windows NTFS driver might crash / cause corruption.
-
-2.1.12 - Fix the second fix to the decompression engine and some cleanups.
-
-	- Add a new address space operations struct, ntfs_mst_aops, for mst
-	  protected attributes.  This is because the default ntfs_aops do not
-	  make sense with mst protected data and were they to write anything to
-	  such an attribute they would cause data corruption so we provide
-	  ntfs_mst_aops which does not have any write related operations set.
-	- Cleanup dirty ntfs inode handling (fs/ntfs/inode.[hc]) which also
-	  includes an adapted ntfs_commit_inode() and an implementation of
-	  ntfs_write_inode() which for now just cleans dirty inodes without
-	  writing them (it does emit a warning that this is happening).
-	- Undo the second decompression engine fix (see 2.1.9 release ChangeLog
-	  entry) as it was only fixing a theoretical bug but at the same time
-	  it badly broke the handling of sparse and uncompressed compression
-	  blocks.
-
-2.1.11 - Driver internal cleanups.
-
-	- Only build logfile.o if building the driver with read-write support.
-	- Really final white space cleanups.
-	- Use generic_ffs() instead of ffs() in logfile.c which allows the
-	  log_page_size variable to be optimized by gcc into a constant.
-	- Rename uchar_t to ntfschar everywhere as uchar_t is unsigned 1-byte
-	  char as defined by POSIX and as found on some systems.
-
-2.1.10 - Force read-only (re)mounting of volumes with unsupported volume flags.
-
-	- Finish off the white space cleanups (remove trailing spaces, etc).
-	- Clean up ntfs_fill_super() and ntfs_read_inode_mount() by removing
-	  the kludges around the first iget().  Instead of (re)setting ->s_op
-	  we have the $MFT inode set up by explicit new_inode() / set ->i_ino /
-	  insert_inode_hash() / call ntfs_read_inode_mount() directly.  This
-	  kills the need for second super_operations and allows to return error
-	  from ntfs_read_inode_mount() without resorting to ugly "poisoning"
-	  tricks.  (Al Viro)
-	- Force read-only (re)mounting if any of the following bits are set in
-	  the volume information flags:
-	  	VOLUME_IS_DIRTY, VOLUME_RESIZE_LOG_FILE,
-		VOLUME_UPGRADE_ON_MOUNT, VOLUME_DELETE_USN_UNDERWAY,
-		VOLUME_REPAIR_OBJECT_ID, VOLUME_MODIFIED_BY_CHKDSK
-	  To make this easier we define VOLUME_MUST_MOUNT_RO_MASK with all the
-	  above bits set so the test is made easy.
-
-2.1.9 - Fix two bugs in decompression engine.
-
-	- Fix a bug where we would not always detect that we have reached the
-	  end of a compression block because we were ending at minus one byte
-	  which is effectively the same as being at the end.  The fix is to
-	  check whether the uncompressed buffer has been fully filled and if so
-	  we assume we have reached the end of the compression block.  A big
-	  thank you to Marcin Gibuła for the bug report, the assistance in
-	  tracking down the bug and testing the fix.
-	- Fix a possible bug where when a compressed read is truncated to the
-	  end of the file, the offset inside the last page was not truncated.
-
-2.1.8 - Handle $MFT mirror and $LogFile, improve time handling, and cleanups.
-
-	- Use get_bh() instead of manual atomic_inc() in fs/ntfs/compress.c.
-	- Modify fs/ntfs/time.c::ntfs2utc(), get_current_ntfs_time(), and
-	  utc2ntfs() to work with struct timespec instead of time_t on the
-	  Linux UTC time side thus preserving the full precision of the NTFS
-	  time and only loosing up to 99 nano-seconds in the Linux UTC time.
-	- Move fs/ntfs/time.c to fs/ntfs/time.h and make the time functions
-	  static inline.
-	- Remove unused ntfs_dirty_inode().
-	- Cleanup super operations declaration in fs/ntfs/super.c.
-	- Wrap flush_dcache_mft_record_page() in #ifdef NTFS_RW.
-	- Add NInoTestSetFoo() and NInoTestClearFoo() macro magic to
-	  fs/ntfs/inode.h and use it to declare NInoTest{Set,Clear}Dirty.
-	- Move typedefs for ntfs_attr and test_t from fs/ntfs/inode.c to
-	  fs/ntfs/inode.h so they can be used elsewhere.
-	- Determine the mft mirror size as the number of mirrored mft records
-	  and store it in ntfs_volume->mftmirr_size (fs/ntfs/super.c).
-	- Load the mft mirror at mount time and compare the mft records stored
-	  in it to the ones in the mft.  Force a read-only mount if the two do
-	  not match (fs/ntfs/super.c).
-	- Fix type casting related warnings on 64-bit architectures.  Thanks
-	  to Meelis Roos for reporting them.
-	- Move %L to %ll as %L is floating point and %ll is integer which is
-	  what we want.
-	- Read the journal ($LogFile) and determine if the volume has been
-	  shutdown cleanly and force a read-only mount if not (fs/ntfs/super.c
-	  and fs/ntfs/logfile.c).  This is a little bit of a crude check in
-	  that we only look at the restart areas and not at the actual log
-	  records so that there will be a very small number of cases where we
-	  think that a volume is dirty when in fact it is clean.  This should
-	  only affect volumes that have not been shutdown cleanly and did not
-	  have any pending, non-check-pointed i/o.
-	- If the $LogFile indicates a clean shutdown and a read-write (re)mount
-	  is requested, empty $LogFile by overwriting it with 0xff bytes to
-	  ensure that Windows cannot cause data corruption by replaying a stale
-	  journal after Linux has written to the volume.
-
-2.1.7 - Enable NFS exporting of mounted NTFS volumes.
-
-	- Set i_generation in the VFS inode from the seq_no of the NTFS inode.
-	- Make ntfs_lookup() NFS export safe, i.e. use d_splice_alias(), etc.
-	- Implement ->get_dentry() in fs/ntfs/namei.c::ntfs_get_dentry() as the
-	  default doesn't allow inode number 0 which is a valid inode on NTFS
-	  and even if it did allow that it uses iget() instead of ntfs_iget()
-	  which makes it useless for us.
-	- Implement ->get_parent() in fs/ntfs/namei.c::ntfs_get_parent() as the
-	  default just returns -EACCES which is not very useful.
-	- Define export operations (->s_export_op) for NTFS (ntfs_export_ops)
-	  and set them up in the super block at mount time (super.c) this
-	  allows mounted NTFS volumes to be exported via NFS.
-	- Add missing return -EOPNOTSUPP; in
-	  fs/ntfs/aops.c::ntfs_commit_nonresident_write().
-	- Enforce no atime and no dir atime updates at mount/remount time as
-	  they are not implemented yet anyway.
-	- Move a few assignments in fs/ntfs/attrib.c::load_attribute_list() to
-	  after a NULL check.  Thanks to Dave Jones for pointing this out.
-
-2.1.6 - Fix minor bug in handling of compressed directories.
-
-	- Fix bug in handling of compressed directories.  A compressed
-	  directory is not really compressed so when we set the ->i_blocks
-	  field of a compressed directory inode we were setting it from the
-	  non-existing field ni->itype.compressed.size which gave random
-	  results...  For directories we now always use ni->allocated_size.
-
-2.1.5 - Fix minor bug in attribute list attribute handling.
-
-	- Fix bug in attribute list handling.  Actually it is not as much a bug
-	  as too much protection in that we were not allowing attribute lists
-	  which waste space on disk while Windows XP clearly allows it and in
-	  fact creates such attribute lists so our driver was failing.
-	- Update NTFS documentation ready for 2.6 kernel release.
-
-2.1.4 - Reduce compiler requirements.
-
-	- Remove all uses of unnamed structs and unions in the driver to make
-	  old and newer gcc versions happy. Makes it a bit uglier IMO but at
-	  least people will stop hassling me about it.
-
-2.1.3 - Important bug fixes in corner cases.
-
-	- super.c::parse_ntfs_boot_sector(): Correct the check for 64-bit
-	  clusters. (Philipp Thomas)
-	- attrib.c::load_attribute_list(): Fix bug when initialized_size is a
-	  multiple of the block_size but not the cluster size. (Szabolcs
-	  Szakacsits)
-
-2.1.2 - Important bug fixes aleviating the hangs in statfs.
-
-	- Fix buggy free cluster and free inode determination logic.
-
-2.1.1 - Minor updates.
-
-	- Add handling for initialized_size != data_size in compressed files.
-	- Reduce function local stack usage from 0x3d4 bytes to just noise in
-	  fs/ntfs/upcase.c. (Randy Dunlap)
-	- Remove compiler warnings for newer gcc.
-	- Pages are no longer kmapped by mm/filemap.c::generic_file_write()
-	  around calls to ->{prepare,commit}_write.  Adapt NTFS appropriately
-	  in fs/ntfs/aops.c::ntfs_prepare_nonresident_write() by using
-	  kmap_atomic(KM_USER0).
-
-2.1.0 - First steps towards write support: implement file overwrite.
-
-	- Add configuration option for developmental write support with an
-	  appropriately scary configuration help text.
-	- Initial implementation of fs/ntfs/aops.c::ntfs_writepage() and its
-	  helper fs/ntfs/aops.c::ntfs_write_block(). This enables mmap(2) based
-	  overwriting of existing files on ntfs. Note: Resident files are
-	  only written into memory, and not written out to disk at present, so
-	  avoid writing to files smaller than about 1kiB.
-	- Initial implementation of fs/ntfs/aops.c::ntfs_prepare_write(), its
-	  helper fs/ntfs/aops.c::ntfs_prepare_nonresident_write() and their
-	  counterparts, fs/ntfs/aops.c::ntfs_commit_write(), and
-	  fs/ntfs/aops.c::ntfs_commit_nonresident_write(), respectively. Also,
-	  add generic_file_write() to the ntfs file operations (fs/ntfs/file.c).
-	  This enables write(2) based overwriting of existing files on ntfs.
-	  Note: As with mmap(2) based overwriting, resident files are only
-	  written into memory, and not written out to disk at present, so avoid
-	  writing to files smaller than about 1kiB.
-	- Implement ->truncate (fs/ntfs/inode.c::ntfs_truncate()) and
-	  ->setattr() (fs/ntfs/inode.c::ntfs_setattr()) inode operations for
-	  files with the purpose of intercepting and aborting all i_size
-	  changes which we do not support yet. ntfs_truncate() actually only
-	  emits a warning message but AFAICS our interception of i_size changes
-	  elsewhere means ntfs_truncate() never gets called for i_size changes.
-	  It is only called from generic_file_write() when we fail in
-	  ntfs_prepare_{,nonresident_}write() in order to discard any
-	  instantiated buffers beyond i_size. Thus i_size is not actually
-	  changed so our warning message is enough. Unfortunately it is not
-	  possible to easily determine if i_size is being changed or not hence
-	  we just emit an appropriately worded error message.
-
-2.0.25 - Small bug fixes and cleanups.
-
-	- Unlock the page in an out of memory error code path in
-	  fs/ntfs/aops.c::ntfs_read_block().
-	- If fs/ntfs/aops.c::ntfs_read_page() is called on an uptodate page,
-	  just unlock the page and return. (This can happen due to ->writepage
-	  clearing PageUptodate() during write out of MstProtected()
-	  attributes.
-	- Remove leaked write code again.
-
-2.0.24 - Cleanups.
-
-	- Treat BUG_ON() as ASSERT() not VERIFY(), i.e. do not use side effects
-	  inside BUG_ON(). (Adam J. Richter)
-	- Split logical OR expressions inside BUG_ON() into individual BUG_ON()
-	  calls for improved debugging. (Adam J. Richter)
-	- Add errors flag to the ntfs volume state, accessed via
-	  NVol{,Set,Clear}Errors(vol).
-	- Do not allow read-write remounts of read-only volumes with errors.
-	- Clarify comment for ntfs file operation sendfile which was added by
-	  Christoph Hellwig a while ago (just using generic_file_sendfile())
-	  to say that ntfs ->sendfile is only used for the case where the
-	  source data is on the ntfs partition and the destination is
-	  somewhere else, i.e. nothing we need to concern ourselves with.
-	- Add generic_file_write() as our ntfs file write operation.
-
-2.0.23 - Major bug fixes (races, deadlocks, non-i386 architectures).
-
-	- Massive internal locking changes to mft record locking. Fixes lock
-	  recursion and replaces the mrec_lock read/write semaphore with a
-	  mutex. Also removes the now superfluous mft_count. This fixes several
-	  race conditions and deadlocks, especially in the future write code.
-	- Fix ntfs over loopback for compressed files by adding an
-	  optimization barrier. (gcc was screwing up otherwise ?)
-	- Miscellaneous cleanups all over the code and a fix or two in error
-	  handling code paths.
-	Thanks go to Christoph Hellwig for pointing out the following two:
-	- Remove now unused function fs/ntfs/malloc.h::vmalloc_nofs().
-	- Fix ntfs_free() for ia64 and parisc by checking for VMALLOC_END, too.
-
-2.0.22 - Cleanups, mainly to ntfs_readdir(), and use C99 initializers.
-
-	- Change fs/ntfs/dir.c::ntfs_reddir() to only read/write ->f_pos once
-	  at entry/exit respectively.
-	- Use C99 initializers for structures.
-	- Remove unused variable blocks from fs/ntfs/aops.c::ntfs_read_block().
-
-2.0.21 - Check for, and refuse to work with too large files/directories/volumes.
-
-	- Limit volume size at mount time to 2TiB on architectures where
-	  unsigned long is 32-bits (fs/ntfs/super.c::parse_ntfs_boot_sector()).
-	  This is the most we can do without overflowing the 32-bit limit of
-	  the block device size imposed on us by sb_bread() and sb_getblk()
-	  for the time being.
-	- Limit file/directory size at open() time to 16TiB on architectures
-	  where unsigned long is 32-bits (fs/ntfs/file.c::ntfs_file_open() and
-	  fs/ntfs/dir.c::ntfs_dir_open()). This is the most we can do without
-	  overflowing the page cache page index.
-
-2.0.20 - Support non-resident directory index bitmaps, fix page leak in readdir.
-
-	- Move the directory index bitmap to use an attribute inode instead of
-	  having special fields for it inside the ntfs inode structure. This
-	  means that the index bitmaps now use the page cache for i/o, too,
-	  and also as a side effect we get support for non-resident index
-	  bitmaps for free.
-	- Simplify/cleanup error handling in fs/ntfs/dir.c::ntfs_readdir() and
-	  fix a page leak that manifested itself in some cases.
-	- Add fs/ntfs/inode.c::ntfs_put_inode(), which we need to release the
-	  index bitmap inode on the final iput().
-
-2.0.19 - Fix race condition, improvements, and optimizations in i/o interface.
-
-	- Apply block optimization added to fs/ntfs/aops.c::ntfs_read_block()
-	  to fs/ntfs/compress.c::ntfs_file_read_compressed_block() as well.
-	- Drop the "file" from ntfs_file_read_compressed_block().
-	- Rename fs/ntfs/aops.c::ntfs_enb_buffer_read_async() to
-	  ntfs_end_buffer_async_read() (more like the fs/buffer.c counterpart).
-	- Update ntfs_end_buffer_async_read() with the improved logic from
-	  its updated counterpart fs/buffer.c::end_buffer_async_read(). Apply
-	  further logic improvements to better determine when we set PageError.
-	- Update submission of buffers in fs/ntfs/aops.c::ntfs_read_block() to
-	  check for the buffers being uptodate first in line with the updated
-	  fs/buffer.c::block_read_full_page(). This plugs a small race
-	  condition.
-
-2.0.18 - Fix race condition in reading of compressed files.
-
-	- There was a narrow window between checking a buffer head for being
-	  uptodate and locking it in ntfs_file_read_compressed_block(). We now
-	  lock the buffer and then check whether it is uptodate or not.
-
-2.0.17 - Cleanups and optimizations - shrinking the ToDo list.
-
-	- Modify fs/ntfs/inode.c::ntfs_read_locked_inode() to return an error
-	  code and update callers, i.e. ntfs_iget(), to pass that error code
-	  up instead of just using -EIO.
-	- Modifications to super.c to ensure that both mount and remount
-	  cannot set any write related options when the driver is compiled
-	  read-only.
-	- Optimize block resolution in fs/ntfs/aops.c::ntfs_read_block() to
-	  cache the current runlist element. This should improve performance
-	  when reading very large and/or very fragmented data.
-
-2.0.16 - Convert access to $MFT/$BITMAP to attribute inode API.
-
-	- Fix a stupid bug introduced in 2.0.15 where we were unmapping the
-	  wrong inode in fs/ntfs/inode.c::ntfs_attr_iget().
-	- Fix debugging check in fs/ntfs/aops.c::ntfs_read_block().
-	- Convert $MFT/$BITMAP access to attribute inode API and remove all
-	  remnants of the ugly mftbmp address space and operations hack. This
-	  means we finally have only one readpage function as well as only one
-	  async io completion handler. Yey! The mft bitmap is now just an
-	  attribute inode and is accessed from vol->mftbmp_ino just as if it
-	  were a normal file. Fake inodes rule. (-:
-
-2.0.15 - Fake inodes based attribute i/o via the pagecache, fixes and cleanups.
-
-	- Fix silly bug in fs/ntfs/super.c::parse_options() which was causing
-	  remounts to fail when the partition had an entry in /etc/fstab and
-	  the entry specified the nls= option.
-	- Apply same macro magic used in fs/ntfs/inode.h to fs/ntfs/volume.h to
-	  expand all the helper functions NVolFoo(), NVolSetFoo(), and
-	  NVolClearFoo().
-	- Move copyright statement from driver initialisation message to
-	  module description (fs/super.c). This makes the initialisation
-	  message fit on one line and fits in better with rest of kernel.
-	- Update fs/ntfs/attrib.c::map_run_list() to work on both real and
-	  attribute inodes, and both for files and directories.
-	- Implement fake attribute inodes allowing all attribute i/o to go via
-	  the page cache and to use all the normal vfs/mm functionality:
-	  - Add ntfs_attr_iget() and its helper ntfs_read_locked_attr_inode()
-	    to fs/ntfs/inode.c.
-	  - Add needed cleanup code to ntfs_clear_big_inode().
-	- Merge address space operations for files and directories (aops.c),
-	  now just have ntfs_aops:
-	  - Rename:
-		end_buffer_read_attr_async() ->	ntfs_end_buffer_read_async(),
-		ntfs_attr_read_block()	     ->	ntfs_read_block(),
-		ntfs_file_read_page()	     ->	ntfs_readpage().
-	  - Rewrite fs/ntfs/aops.c::ntfs_readpage() to work on both real and
-	    attribute inodes, and both for files and directories.
-	  - Remove obsolete fs/ntfs/aops.c::ntfs_mst_readpage().
-
-2.0.14 - Run list merging code cleanup, minor locking changes, typo fixes.
-
-	- Change fs/ntfs/super.c::ntfs_statfs() to not rely on BKL by moving
-	  the locking out of super.c::get_nr_free_mft_records() and taking and
-	  dropping the mftbmp_lock rw_semaphore in ntfs_statfs() itself.
-	- Bring attribute runlist merging code (fs/ntfs/attrib.c) in sync with
-	  current userspace ntfs library code. This means that if a merge
-	  fails the original runlists are always left unmodified instead of
-	  being silently corrupted.
-	- Misc typo fixes.
-
-2.0.13 - Use iget5_locked() in preparation for fake inodes and small cleanups.
-
-	- Remove nr_mft_bits and the now superfluous union with nr_mft_records
-	  from ntfs_volume structure.
-	- Remove nr_lcn_bits and the now superfluous union with nr_clusters
-	  from ntfs_volume structure.
-	- Use iget5_locked() and friends instead of conventional iget(). Wrap
-	  the call in fs/ntfs/inode.c::ntfs_iget() and update callers of iget()
-	  to use ntfs_iget(). Leave only one iget() call at mount time so we
-	  don't need an ntfs_iget_mount().
-	- Change fs/ntfs/inode.c::ntfs_new_extent_inode() to take mft_no as an
-	  additional argument.
-
-2.0.12 - Initial cleanup of address space operations following 2.0.11 changes.
-
-	- Merge fs/ntfs/aops.c::end_buffer_read_mst_async() and
-	  fs/ntfs/aops.c::end_buffer_read_file_async() into one function
-	  fs/ntfs/aops.c::end_buffer_read_attr_async() using NInoMstProtected()
-	  to determine whether to apply mst fixups or not.
-	- Above change allows merging fs/ntfs/aops.c::ntfs_file_read_block()
-	  and fs/ntfs/aops.c::ntfs_mst_readpage() into one function
-	  fs/ntfs/aops.c::ntfs_attr_read_block(). Also, create a tiny wrapper
-	  fs/ntfs/aops.c::ntfs_mst_readpage() to transform the parameters from
-	  the VFS readpage function prototype to the ntfs_attr_read_block()
-	  function prototype.
-
-2.0.11 - Initial preparations for fake inode based attribute i/o.
-
-	- Move definition of ntfs_inode_state_bits to fs/ntfs/inode.h and
-	  do some macro magic (adapted from include/linux/buffer_head.h) to
-	  expand all the helper functions NInoFoo(), NInoSetFoo(), and
-	  NInoClearFoo().
-	- Add new flag to ntfs_inode_state_bits: NI_Sparse.
-	- Add new fields to ntfs_inode structure to allow use of fake inodes
-	  for attribute i/o: type, name, name_len. Also add new state bits:
-	  NI_Attr, which, if set, indicates the inode is a fake inode, and
-	  NI_MstProtected, which, if set, indicates the attribute uses multi
-	  sector transfer protection, i.e. fixups need to be applied after
-	  reads and before/after writes.
-	- Rename fs/ntfs/inode.c::ntfs_{new,clear,destroy}_inode() to
-	  ntfs_{new,clear,destroy}_extent_inode() and update callers.
-	- Use ntfs_clear_extent_inode() in fs/ntfs/inode.c::__ntfs_clear_inode()
-	  instead of ntfs_destroy_extent_inode().
-	- Cleanup memory deallocations in {__,}ntfs_clear_{,big_}inode().
-	- Make all operations on ntfs inode state bits use the NIno* functions.
-	- Set up the new ntfs inode fields and state bits in
-	  fs/ntfs/inode.c::ntfs_read_inode() and add appropriate cleanup of
-	  allocated memory to __ntfs_clear_inode().
-	- Cleanup ntfs_inode structure a bit for better ordering of elements
-	  w.r.t. their size to allow better packing of the structure in memory.
-
-2.0.10 - There can only be 2^32 - 1 inodes on an NTFS volume.
-
-	- Add check at mount time to verify that the number of inodes on the
-	  volume does not exceed 2^32 - 1, which is the maximum allowed for
-	  NTFS according to Microsoft.
-	- Change mft_no member of ntfs_inode structure to be unsigned long.
-	  Update all users. This makes ntfs_inode->mft_no just a copy of struct
-	  inode->i_ino. But we can't just always use struct inode->i_ino and
-	  remove mft_no because extent inodes do not have an attached struct
-	  inode.
-
-2.0.9 - Decompression engine now uses a single buffer and other cleanups.
-
-	- Change decompression engine to use a single buffer protected by a
-	  spin lock instead of per-CPU buffers. (Rusty Russell)
-	- Do not update cb_pos when handling a partial final page during
-	  decompression of a sparse compression block, as the value is later
-	  reset without being read/used. (Rusty Russell)
-	- Switch to using the new KM_BIO_SRC_IRQ for atomic kmap()s. (Andrew
-	  Morton)
-	- Change buffer size in ntfs_readdir()/ntfs_filldir() to use
-	  NLS_MAX_CHARSET_SIZE which makes the buffers almost 1kiB each but
-	  it also makes everything safer so it is a good thing.
-	- Miscellaneous minor cleanups to comments.
-
-2.0.8 - Major updates for handling of case sensitivity and dcache aliasing.
-
-	Big thanks go to Al Viro and other inhabitants of #kernel for investing
-	their time to discuss the case sensitivity and dcache aliasing issues.
-
-	- Remove unused source file fs/ntfs/attraops.c.
-	- Remove show_inodes mount option(s), thus dropping support for
-	  displaying of short file names.
-	- Remove deprecated mount option posix.
-	- Restore show_sys_files mount option.
-	- Add new mount option case_sensitive, to determine if the driver
-	  treats file names as case sensitive or not. If case sensitive, create
-	  file names in the POSIX namespace. Otherwise create file names in the
-	  LONG/WIN32 namespace. Note, files remain accessible via their short
-	  file name, if it exists.
-	- Remove really dumb logic bug in boot sector recovery code.
-	- Fix dcache aliasing issues wrt short/long file names via changes
-	  to fs/ntfs/dir.c::ntfs_lookup_inode_by_name() and
-	  fs/ntfs/namei.c::ntfs_lookup():
-	  - Add additional argument to ntfs_lookup_inode_by_name() in which we
-	    return information about the matching file name if the case is not
-	    matching or the match is a short file name. See comments above the
-	    function definition for details.
-	  - Change ntfs_lookup() to only create dcache entries for the correctly
-	    cased file name and only for the WIN32 namespace counterpart of DOS
-	    namespace file names. This ensures we have only one dentry per
-	    directory and also removes all dcache aliasing issues between short
-	    and long file names once we add write support. See comments above
-	    function for details.
-	- Fix potential 1 byte overflow in fs/ntfs/unistr.c::ntfs_ucstonls().
-
-2.0.7 - Minor cleanups and updates for changes in core kernel code.
-
-	- Remove much of the NULL struct element initializers.
-	- Various updates to make compatible with recent kernels.
-	- Remove defines of MAX_BUF_PER_PAGE and include linux/buffer_head.h
-	  in fs/ntfs/ntfs.h instead.
-	- Remove no longer needed KERNEL_VERSION checks. We are now in the
-	  kernel proper so they are no longer needed.
-
-2.0.6 - Major bugfix to make compatible with other kernel changes.
-
-	- Initialize the mftbmp address space properly now that there are more
-	  fields in the struct address_space. This was leading to hangs and
-	  oopses on umount since 2.5.12 because of changes to other parts of
-	  the kernel. We probably want a kernel generic init_address_space()
-	  function...
-	- Drop BKL from ntfs_readdir() after consultation with Al Viro. The
-	  only caller of ->readdir() is vfs_readdir() which holds i_mutex
-	  during the call, and i_mutex is sufficient protection against changes
-	  in the directory inode (including ->i_size).
-	- Use generic_file_llseek() for directories (as opposed to
-	  default_llseek()) as this downs i_mutex instead of the BKL which is
-	  what we now need for exclusion against ->f_pos changes considering we
-	  no longer take the BKL in ntfs_readdir().
-
-2.0.5 - Major bugfix. Buffer overflow in extent inode handling.
-
-	- No need to set old blocksize in super.c::ntfs_fill_super() as the
-	  VFS does so via invocation of deactivate_super() calling
-	  fs->fill_super() calling block_kill_super() which does it.
-	- BKL moved from VFS into dir.c::ntfs_readdir(). (Linus Torvalds)
-	  -> Do we really need it? I don't think so as we have exclusion on
-	  the directory ntfs_inode rw_semaphore mrec_lock. We mmight have to
-	  move the ->f_pos accesses under the mrec_lock though. Check this...
-	- Fix really, really, really stupid buffer overflow in extent inode
-	  handling in mft.c::map_extent_mft_record().
-
-2.0.4 - Cleanups and updates for kernel 2.5.11.
-
-	- Add documentation on how to use the MD driver to be able to use NTFS
-	  stripe and volume sets in Linux and generally cleanup documentation
-	  a bit.
-	Remove all uses of kdev_t in favour of struct block_device *:
-	- Change compress.c::ntfs_file_read_compressed_block() to use
-	  sb_getblk() instead of getblk().
-	- Change super.c::ntfs_fill_super() to use bdev_hardsect_size() instead
-	  of get_hardsect_size().
-	- No need to get old blocksize in super.c::ntfs_fill_super() as
-	  fs/super.c::get_sb_bdev() already does this.
-	- Set bh->b_bdev instead of bh->b_dev throughout aops.c.
-
-2.0.3 - Small bug fixes, cleanups, and performance improvements.
-
-	- Remove some dead code from mft.c.
-	- Optimize readpage and read_block functions throughout aops.c so that
-	  only initialized blocks are read. Non-initialized ones have their
-	  buffer head mapped, zeroed, and set up to date, without scheduling
-	  any i/o. Thanks to Al Viro for advice on how to avoid the device i/o.
-	Thanks go to Andrew Morton for spotting the below:
-	- Fix buglet in allocate_compression_buffers() error code path.
-	- Call flush_dcache_page() after modifying page cache page contents in
-	  ntfs_file_readpage().
-	- Check for existence of page buffers throughout aops.c before calling
-	  create_empty_buffers(). This happens when an I/O error occurs and the
-	  read is retried. (It also happens once writing is implemented so that
-	  needed doing anyway but I had left it for later...)
-	- Don't BUG_ON() uptodate and/or mapped buffers throughout aops.c in
-	  readpage and read_block functions. Reasoning same as above (i.e. I/O
-	  error retries and future write code paths.)
-
-2.0.2 - Minor updates and cleanups.
-
-	- Cleanup: rename mst.c::__post_read_mst_fixup to post_write_mst_fixup
-	  and cleanup the code a bit, removing the unused size parameter.
-	- Change default fmask to 0177 and update documentation.
-	- Change attrib.c::get_attr_search_ctx() to return the search context
-	  directly instead of taking the address of a pointer. A return value
-	  of NULL means the allocation failed. Updated all callers
-	  appropriately.
-	- Update to 2.5.9 kernel (preserving backwards compatibility) by
-	  replacing all occurences of page->buffers with page_buffers(page).
-	- Fix minor bugs in runlist merging, also minor cleanup.
-	- Updates to bootsector layout and mft mirror contents descriptions.
-	- Small bug fix in error detection in unistr.c and some cleanups.
-	- Grow name buffer allocations in unistr.c in aligned mutlipled of 64
-	  bytes.
-
-2.0.1 - Minor updates.
-
-	- Make default umask correspond to documentation.
-	- Improve documentation.
-	- Set default mode to include execute bit. The {u,f,d}mask can be used
-	  to take it away if desired. This allows binaries to be executed from
-	  a mounted ntfs partition.
-
-2.0.0 - New version number. Remove TNG from the name. Now in the kernel.
-
-	- Add kill_super, just keeping up with the vfs changes in the kernel.
-	- Repeat some changes from tng-0.0.8 that somehow got lost on the way
-	  from the CVS import into BitKeeper.
-	- Begin to implement proper handling of allocated_size vs
-	  initialized_size vs data_size (i.e. i_size). Done are
-	  mft.c::ntfs_mft_readpage(), aops.c::end_buffer_read_index_async(),
-	  and attrib.c::load_attribute_list().
-	- Lock the runlist in attrib.c::load_attribute_list() while using it.
-	- Fix memory leak in ntfs_file_read_compressed_block() and generally
-	  clean up compress.c a little, removing some uncommented/unused debug
-	  code.
-	- Tidy up dir.c a little bit.
-	- Don't bother getting the runlist in inode.c::ntfs_read_inode().
-	- Merge mft.c::ntfs_mft_readpage() and aops.c::ntfs_index_readpage()
-	  creating aops.c::ntfs_mst_readpage(), improving the handling of
-	  holes and overflow in the process and implementing the correct
-	  equivalent of ntfs_file_get_block() in ntfs_mst_readpage() itself.
-	  I am aiming for correctness at the moment. Modularisation can come
-	  later.
-	- Rename aops.c::end_buffer_read_index_async() to
-	  end_buffer_read_mst_async() and optimize the overflow checking and
-	  handling.
-	- Use the host of the mftbmp address space mapping to hold the ntfs
-	  volume. This is needed so the async i/o completion handler can
-	  retrieve a pointer to the volume. Hopefully this will not cause
-	  problems elsewhere in the kernel... Otherwise will need to use a
-	  fake inode.
-	- Complete implementation of proper handling of allocated_size vs
-	  initialized_size vs data_size (i.e. i_size) in whole driver.
-	  Basically aops.c is now completely rewritten.
-	- Change NTFS driver name to just NTFS and set version number to 2.0.0
-	  to make a clear distinction from the old driver which is still on
-	  version 1.1.22.
-
-tng-0.0.8 - 08/03/2002 - Now using BitKeeper, http://linux-ntfs.bkbits.net/
-
-	- Replace bdevname(sb->s_dev) with sb->s_id.
-	- Remove now superfluous new-line characters in all callers of
-	  ntfs_debug().
-	- Apply kludge in ntfs_read_inode(), setting i_nlink to 1 for
-	  directories. Without this the "find" utility gets very upset which is
-	  fair enough as Linux/Unix do not support directory hard links.
-	- Further runlist merging work. (Richard Russon)
-	- Backwards compatibility for gcc-2.95. (Richard Russon)
-	- Update to kernel 2.5.5-pre1 and rediff the now tiny patch.
-	- Convert to new filesystem declaration using ->ntfs_get_sb() and
-	  replacing ntfs_read_super() with ntfs_fill_super().
-	- Set s_maxbytes to MAX_LFS_FILESIZE to avoid page cache page index
-	  overflow on 32-bit architectures.
-	- Cleanup upcase loading code to use ntfs_(un)map_page().
-	- Disable/reenable preemtion in critical sections of compession engine.
-	- Replace device size determination in ntfs_fill_super() with
-	  sb->s_bdev->bd_inode->i_size (in bytes) and remove now superfluous
-	  function super.c::get_nr_blocks().
-	- Implement a mount time option (show_inodes) allowing choice of which
-	  types of inode names readdir() returns and modify ntfs_filldir()
-	  accordingly. There are several parameters to show_inodes:
-		system:	system files
-		win32:	long file names (including POSIX file names) [DEFAULT]
-		long:	same as win32
-		dos:	short file names only (excluding POSIX file names)
-		short:	same as dos
-		posix:	same as both win32 and dos
-		all:	all file names
-	  Note that the options are additive, i.e. specifying:
-		-o show_inodes=system,show_inodes=win32,show_inodes=dos
-	  is the same as specifying:
-		-o show_inodes=all
-	  Note that the "posix" and "all" options will show all directory
-	  names, BUT the link count on each directory inode entry is set to 1,
-	  due to Linux not supporting directory hard links. This may well
-	  confuse some userspace applications, since the directory names will
-	  have the same inode numbers. Thus it is NOT advisable to use the
-	  "posix" or "all" options. We provide them only for completeness sake.
-	- Add copies of allocated_size, initialized_size, and compressed_size to
-	  the ntfs inode structure and set them up in
-	  inode.c::ntfs_read_inode(). These reflect the unnamed data attribute
-	  for files and the index allocation attribute for directories.
-	- Add copies of allocated_size and initialized_size to ntfs inode for
-	  $BITMAP attribute of large directories and set them up in
-	  inode.c::ntfs_read_inode().
-	- Add copies of allocated_size and initialized_size to ntfs volume for
-	  $BITMAP attribute of $MFT and set them up in
-	  super.c::load_system_files().
-	- Parse deprecated ntfs driver options (iocharset, show_sys_files,
-	  posix, and utf8) and tell user what the new options to use are. Note
-	  we still do support them but they will be removed with kernel 2.7.x.
-	- Change all occurences of integer long long printf formatting to hex
-	  as printk() will not support long long integer format if/when the
-	  div64 patch goes into the kernel.
-	- Make slab caches have stable names and change the names to what they
-	  were intended to be. These changes are required/made possible by the
-	  new slab cache name handling which removes the length limitation by
-	  requiring the caller of kmem_cache_create() to supply a stable name
-	  which is then referenced but not copied.
-	- Rename run_list structure to run_list_element and create a new
-	  run_list structure containing a pointer to a run_list_element
-	  structure and a read/write semaphore. Adapt all users of runlists
-	  to new scheme and take and release the lock as needed. This fixes a
-	  nasty race as the run_list changes even when inodes are locked for
-	  reading and even when the inode isn't locked at all, so we really
-	  needed the serialization. We use a semaphore rather than a spinlock
-	  as memory allocations can sleep and doing everything GFP_ATOMIC
-	  would be silly.
-	- Cleanup read_inode() removing all code checking for lowest_vcn != 0.
-	  This can never happen due to the nature of lookup_attr() and how we
-	  support attribute lists. If it did happen it would imply the inode
-	  being corrupt.
-	- Check for lowest_vcn != 0 in ntfs_read_inode() and mark the inode as
-	  bad if found.
-	- Update to 2.5.6-pre2 changes in struct address_space.
-	- Use parent_ino() when accessing d_parent inode number in dir.c.
-	- Import Sourceforge CVS repository into BitKeeper repository:
-		http://linux-ntfs.bkbits.net/ntfs-tng-2.5
-	- Update fs/Makefile, fs/Config.help, fs/Config.in, and
-	  Documentation/filesystems/ntfs.txt for NTFS TNG.
-	- Create kernel configuration option controlling whether debugging
-	  is enabled or not.
-	- Add the required export of end_buffer_io_sync() from the patches
-	  directory to the kernel code.
-	- Update inode.c::ntfs_show_options() with show_inodes mount option.
-	- Update errors mount option.
-
-tng-0.0.7 - 13/02/2002 - The driver is now feature complete for read-only!
-
-	- Cleanup mft.c and it's debug/error output in particular. Fix a minor
-	  bug in mapping of extent inodes. Update all the comments to fit all
-	  the recent code changes.
-	- Modify vcn_to_lcn() to cope with entirely unmapped runlists.
-	- Cleanups in compress.c, mostly comments and folding help.
-	- Implement attrib.c::map_run_list() as a generic helper.
-	- Make compress.c::ntfs_file_read_compressed_block() use map_run_list()
-	  thus making code shorter and enabling attribute list support.
-	- Cleanup incorrect use of [su]64 with %L printf format specifier in
-	  all source files. Type casts to [unsigned] long long added to correct
-	  the mismatches (important for architectures which have long long not
-	  being 64 bits).
-	- Merge async io completion handlers for directory indexes and $MFT
-	  data into one by setting the index_block_size{_bits} of the ntfs
-	  inode for $MFT to the mft_record_size{_bits} of the ntfs_volume.
-	- Cleanup aops.c, update comments.
-	- Make ntfs_file_get_block() use map_run_list() so all files now
-	  support attribute lists.
-	- Make ntfs_dir_readpage() almost verbatim copy of
-	  block_read_full_page() by using ntfs_file_get_block() with only real
-	  difference being the use of our own async io completion handler
-	  rather than the default one, thus reducing the amount of code and
-	  automatically enabling attribute list support for directory indices.
-	- Fix bug in load_attribute_list() - forgot to call brelse in error
-	  code path.
-	- Change parameters to find_attr() and lookup_attr(). We no longer
-	  pass in the upcase table and its length. These can be gotten from
-	  ctx->ntfs_ino->vol->upcase{_len}. Update all callers.
-	- Cleanups in attrib.c.
-	- Implement merging of runlists, attrib.c::merge_run_lists() and its
-	  helpers. (Richard Russon)
-	- Attribute lists part 2, attribute extents and multi part runlists:
-	  enable proper support for LCN_RL_NOT_MAPPED and automatic mapping of
-	  further runlist parts via attrib.c::map_run_list().
-	- Tiny endianness bug fix in decompress_mapping_pairs().
-
-tng-0.0.6 - Encrypted directories, bug fixes, cleanups, debugging enhancements.
-
-	- Enable encrypted directories. (Their index root is marked encrypted
-	  to indicate that new files in that directory should be created
-	  encrypted.)
-	- Fix bug in NInoBmpNonResident() macro. (Cut and paste error.)
-	- Enable $Extend system directory. Most (if not all) extended system
-	  files do not have unnamed data attributes so ntfs_read_inode() had to
-	  special case them but that is ok, as the special casing recovery
-	  happens inside an error code path so there is zero slow down in the
-	  normal fast path. The special casing is done by introducing a new
-	  function inode.c::ntfs_is_extended_system_file() which checks if any
-	  of the hard links in the inode point to $Extend as being their parent
-	  directory and if they do we assume this is an extended system file.
-	- Create a sysctl/proc interface to allow {dis,en}abling of debug output
-	  when compiled with -DDEBUG. Default is debug messages to be disabled.
-	  To enable them, one writes a non-zero value to /proc/sys/fs/ntfs-debug
-	  (if /proc is enabled) or uses sysctl(2) to effect the same (if sysctl
-	  interface is enabled). Inspired by old ntfs driver.
-	- Add debug_msgs insmod/kernel boot parameter to set whether debug
-	  messages are {dis,en}abled. This is useful to enable debug messages
-	  during ntfs initialization and is the only way to activate debugging
-	  when the sysctl interface is not enabled.
-	- Cleanup debug output in various places.
-	- Remove all dollar signs ($) from the source (except comments) to
-	  enable compilation on architectures whose gcc compiler does not
-	  support dollar signs in the names of variables/constants. Attribute
-	  types now start with AT_ instead of $ and $I30 is now just I30.
-	- Cleanup ntfs_lookup() and add consistency check of sequence numbers.
-	- Load complete runlist for $MFT/$BITMAP during mount and cleanup
-	  access functions. This means we now cope with $MFT/$BITMAP being
-	  spread accross several mft records.
-	- Disable modification of mft_zone_multiplier on remount. We can always
-	  reenable this later on if we really want to, but we will need to make
-	  sure we readjust the mft_zone size / layout accordingly.
-
-tng-0.0.5 - Modernize for 2.5.x and further in line-ing with Al Viro's comments.
-
-	- Use sb_set_blocksize() instead of set_blocksize() and verify the
-	  return value.
-	- Use sb_bread() instead of bread() throughout.
-	- Add index_vcn_size{_bits} to ntfs_inode structure to store the size
-	  of a directory index block vcn. Apply resulting simplifications in
-	  dir.c everywhere.
-	- Fix a small bug somewhere (but forgot what it was).
-	- Change ntfs_{debug,error,warning} to enable gcc to do type checking
-	  on the printf-format parameter list and fix bugs reported by gcc
-	  as a result. (Richard Russon)
-	- Move inode allocation strategy to Al's new stuff but maintain the
-	  divorce of ntfs_inode from struct inode. To achieve this we have two
-	  separate slab caches, one for big ntfs inodes containing a struct
-	  inode and pure ntfs inodes and at the same time fix some faulty
-	  error code paths in ntfs_read_inode().
-	- Show mount options in proc (inode.c::ntfs_show_options()).
-
-tng-0.0.4 - Big changes, getting in line with Al Viro's comments.
-
-	- Modified (un)map_mft_record functions to be common for read and write
-	  case. To specify which is which, added extra parameter at front of
-	  parameter list. Pass either READ or WRITE to this, each has the
-	  obvious meaning.
-	- General cleanups to allow for easier folding in vi.
-	- attrib.c::decompress_mapping_pairs() now accepts the old runlist
-	  argument, and invokes attrib.c::merge_run_lists() to merge the old
-	  and the new runlists.
-	- Removed attrib.c::find_first_attr().
-	- Implemented loading of attribute list and complete runlist for $MFT.
-	  This means we now cope with $MFT being spread across several mft
-	  records.
-	- Adapt to 2.5.2-pre9 and the changed create_empty_buffers() syntax.
-	- Adapt major/minor/kdev_t/[bk]devname stuff to new 2.5.x kernels.
-	- Make ntfs_volume be allocated via kmalloc() instead of using a slab
-	  cache. There are too little ntfs_volume structures at any one time
-	  to justify a private slab cache.
-	- Fix bogus kmap() use in async io completion. Now use kmap_atomic().
-	  Use KM_BIO_IRQ on advice from IRC/kernel...
-	- Use ntfs_map_page() in map_mft_record() and create ->readpage method
-	  for reading $MFT (ntfs_mft_readpage). In the process create dedicated
-	  address space operations (ntfs_mft_aops) for $MFT inode mapping. Also
-	  removed the now superfluous exports from the kernel core patch.
-	- Fix a bug where kfree() was used instead of ntfs_free().
-	- Change map_mft_record() to take ntfs_inode as argument instead of
-	  vfs inode. Dito for unmap_mft_record(). Adapt all callers.
-	- Add pointer to ntfs_volume to ntfs_inode.
-	- Add mft record number and sequence number to ntfs_inode. Stop using
-	  i_ino and i_generation for in-driver purposes.
-	- Implement attrib.c::merge_run_lists(). (Richard Russon)
-	- Remove use of proper inodes by extent inodes. Move i_ino and
-	  i_generation to ntfs_inode to do this. Apply simplifications that
-	  result and remove iget_no_wait(), etc.
-	- Pass ntfs_inode everywhere in the driver (used to be struct inode).
-	- Add reference counting in ntfs_inode for the ntfs inode itself and
-	  for the mapped mft record.
-	- Extend mft record mapping so we can (un)map extent mft records (new
-	  functions (un)map_extent_mft_record), and so mappings are reference
-	  counted and don't have to happen twice if already mapped - just ref
-	  count increases.
-	- Add -o iocharset as alias to -o nls for backwards compatibility.
-	- The latest core patch is now tiny. In fact just a single additional
-	  export is necessary over the base kernel.
-
-tng-0.0.3 - Cleanups, enhancements, bug fixes.
-
-	- Work on attrib.c::decompress_mapping_pairs() to detect base extents
-	  and setup the runlist appropriately using knowledge provided by the
-	  sizes in the base attribute record.
-	- Balance the get_/put_attr_search_ctx() calls so we don't leak memory
-	  any more.
-	- Introduce ntfs_malloc_nofs() and ntfs_free() to allocate/free a single
-	  page or use vmalloc depending on the amount of memory requested.
-	- Cleanup error output. The __FUNCTION__ "(): " is now added
-	  automatically. Introduced a new header file debug.h to support this
-	  and also moved ntfs_debug() function into it.
-	- Make reading of compressed files more intelligent and especially get
-	  rid of the vmalloc_nofs() from readpage(). This now uses per CPU
-	  buffers (allocated at first mount with cluster size <= 4kiB and
-	  deallocated on last umount with cluster size <= 4kiB), and
-	  asynchronous io for the compressed data using a list of buffer heads.
-	  Er, we use synchronous io as async io only works on whole pages
-	  covered by buffers and not on individual buffer heads...
-	- Bug fix for reading compressed files with sparse compression blocks.
-
-tng-0.0.2 - Now handles larger/fragmented/compressed volumes/files/dirs.
-
-	- Fixed handling of directories when cluster size exceeds index block
-	  size.
-	- Hide DOS only name space directory entries from readdir() but allow
-	  them in lookup(). This should fix the problem that Linux doesn't
-	  support directory hard links, while still allowing access to entries
-	  via their short file name. This also has the benefit of mimicking
-	  what Windows users are used to, so it is the ideal solution.
-	- Implemented sync_page everywhere so no more hangs in D state when
-	  waiting for a page.
-	- Stop using bforget() in favour of brelse().
-	- Stop locking buffers unnecessarily.
-	- Implemented compressed files (inode->mapping contains uncompressed
-	  data, raw compressed data is currently bread() into a vmalloc()ed
-	  memory buffer).
-	- Enable compressed directories. (Their index root is marked compressed
-	  to indicate that new files in that directory should be created
-	  compressed.)
-	- Use vsnprintf rather than vsprintf in the ntfs_error and ntfs_warning
-	  functions. (Thanks to Will Dyson for pointing this out.)
-	- Moved the ntfs_inode and ntfs_volume (the former ntfs_inode_info and
-	  ntfs_sb_info) out of the common inode and super_block structures and
-	  started using the generic_ip and generic_sbp pointers instead. This
-	  makes ntfs entirely private with respect to the kernel tree.
-	- Detect compiler version and abort with error message if gcc less than
-	  2.96 is used.
-	- Fix bug in name comparison function in unistr.c.
-	- Implement attribute lists part 1, the infrastructure: search contexts
-	  and operations, find_external_attr(), lookup_attr()) and make the
-	  code use the infrastructure.
-	- Fix stupid buffer overflow bug that became apparent on larger run
-	  list containing attributes.
-	- Fix bugs in readdir() that became apparent on larger directories.
-
-	The driver is now really useful and survives the test
-		find . -type f -exec md5sum "{}" \;
-	without any error messages on a over 1GiB sized partition with >16k
-	files on it, including compressed files and directories and many files
-	and directories with attribute lists.
-
-tng-0.0.1 - The first useful version.
-
-	- Added ntfs_lookup().
-	- Added default upcase generation and handling.
-	- Added compile options to be shown on module init.
-	- Many bug fixes that were "hidden" before.
-	- Update to latest kernel.
-	- Added ntfs_readdir().
-	- Added file operations for mmap(), read(), open() and llseek(). We just
-	  use the generic ones. The whole point of going through implementing
-	  readpage() methods and where possible get_block() call backs is that
-	  this allows us to make use of the generic high level methods provided
-	  by the kernel.
-
-	The driver is now actually useful! Yey. (-: It undoubtedly has got bugs
-	though and it doesn't implement accesssing compressed files yet. Also,
-	accessing files with attribute list attributes is not implemented yet
-	either. But for small or simple filesystems it should work and allow
-	you to list directories, use stat on directory entries and the file
-	system, open, read, mmap and llseek around in files. A big mile stone
-	has been reached!
-
-tng-0.0.0 - Initial version tag.
-
-	Initial driver implementation. The driver can mount and umount simple
-	NTFS filesystems (i.e. ones without attribute lists in the system
-	files). If the mount fails there might be problems in the error handling
-	code paths, so be warned. Otherwise it seems to be loading the system
-	files nicely and the mft record read mapping/unmapping seems to be
-	working nicely, too. Proof of inode metadata in the page cache and non-
-	resident file unnamed stream data in the page cache concepts is thus
-	complete.
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index cfce53cb65d7..c3c2c7ac9020 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -23,6 +23,7 @@
 
 #include <linux/errno.h>
 #include <linux/fs.h>
+#include <linux/gfp.h>
 #include <linux/mm.h>
 #include <linux/pagemap.h>
 #include <linux/swap.h>
diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
index 50d3b0c258e3..f5094ee224c1 100644
--- a/fs/ntfs/attrib.c
+++ b/fs/ntfs/attrib.c
@@ -22,6 +22,7 @@
 
 #include <linux/buffer_head.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/swap.h>
 #include <linux/writeback.h>
 
diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c
index 08f7530e9341..6551c7cbad92 100644
--- a/fs/ntfs/compress.c
+++ b/fs/ntfs/compress.c
@@ -25,6 +25,7 @@
 #include <linux/buffer_head.h>
 #include <linux/blkdev.h>
 #include <linux/vmalloc.h>
+#include <linux/slab.h>
 
 #include "attrib.h"
 #include "inode.h"
diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c
index 5a9e34475e37..0f48e7c5d9e1 100644
--- a/fs/ntfs/dir.c
+++ b/fs/ntfs/dir.c
@@ -21,6 +21,7 @@
  */
 
 #include <linux/buffer_head.h>
+#include <linux/slab.h>
 
 #include "dir.h"
 #include "aops.h"
@@ -1526,10 +1527,9 @@ static int ntfs_dir_open(struct inode *vi, struct file *filp)
  * this problem for now.  We do write the $BITMAP attribute if it is present
  * which is the important one for a directory so things are not too bad.
  */
-static int ntfs_dir_fsync(struct file *filp, struct dentry *dentry,
-		int datasync)
+static int ntfs_dir_fsync(struct file *filp, int datasync)
 {
-	struct inode *bmp_vi, *vi = dentry->d_inode;
+	struct inode *bmp_vi, *vi = filp->f_mapping->host;
 	int err, ret;
 	ntfs_attr na;
 
@@ -1545,7 +1545,7 @@ static int ntfs_dir_fsync(struct file *filp, struct dentry *dentry,
  		write_inode_now(bmp_vi, !datasync);
 		iput(bmp_vi);
 	}
-	ret = ntfs_write_inode(vi, 1);
+	ret = __ntfs_write_inode(vi, 1);
 	write_inode_now(vi, !datasync);
 	err = sync_blockdev(vi->i_sb->s_bdev);
 	if (unlikely(err && !ret))
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 43179ddd336f..113ebd9f25a4 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -20,6 +20,7 @@
  */
 
 #include <linux/buffer_head.h>
+#include <linux/gfp.h>
 #include <linux/pagemap.h>
 #include <linux/pagevec.h>
 #include <linux/sched.h>
@@ -97,9 +98,6 @@ static int ntfs_file_open(struct inode *vi, struct file *filp)
  * the page at all.  For a more detailed explanation see ntfs_truncate() in
  * fs/ntfs/inode.c.
  *
- * @cached_page and @lru_pvec are just optimizations for dealing with multiple
- * pages.
- *
  * Return 0 on success and -errno on error.  In the case that an error is
  * encountered it is possible that the initialized size will already have been
  * incremented some way towards @new_init_size but it is guaranteed that if
@@ -109,8 +107,7 @@ static int ntfs_file_open(struct inode *vi, struct file *filp)
  * Locking: i_mutex on the vfs inode corrseponsind to the ntfs inode @ni must be
  *	    held by the caller.
  */
-static int ntfs_attr_extend_initialized(ntfs_inode *ni, const s64 new_init_size,
-		struct page **cached_page, struct pagevec *lru_pvec)
+static int ntfs_attr_extend_initialized(ntfs_inode *ni, const s64 new_init_size)
 {
 	s64 old_init_size;
 	loff_t old_i_size;
@@ -402,18 +399,13 @@ static inline void ntfs_fault_in_pages_readable_iovec(const struct iovec *iov,
  * Obtain @nr_pages locked page cache pages from the mapping @mapping and
  * starting at index @index.
  *
- * If a page is newly created, increment its refcount and add it to the
- * caller's lru-buffering pagevec @lru_pvec.
- *
- * This is the same as mm/filemap.c::__grab_cache_page(), except that @nr_pages
- * are obtained at once instead of just one page and that 0 is returned on
- * success and -errno on error.
+ * If a page is newly created, add it to lru list
  *
  * Note, the page locks are obtained in ascending page index order.
  */
 static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
 		pgoff_t index, const unsigned nr_pages, struct page **pages,
-		struct page **cached_page, struct pagevec *lru_pvec)
+		struct page **cached_page)
 {
 	int err, nr;
 
@@ -429,7 +421,7 @@ static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
 					goto err_out;
 				}
 			}
-			err = add_to_page_cache(*cached_page, mapping, index,
+			err = add_to_page_cache_lru(*cached_page, mapping, index,
 					GFP_KERNEL);
 			if (unlikely(err)) {
 				if (err == -EEXIST)
@@ -437,9 +429,6 @@ static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
 				goto err_out;
 			}
 			pages[nr] = *cached_page;
-			page_cache_get(*cached_page);
-			if (unlikely(!pagevec_add(lru_pvec, *cached_page)))
-				__pagevec_lru_add_file(lru_pvec);
 			*cached_page = NULL;
 		}
 		index++;
@@ -1799,7 +1788,6 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
 	ssize_t status, written;
 	unsigned nr_pages;
 	int err;
-	struct pagevec lru_pvec;
 
 	ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, "
 			"pos 0x%llx, count 0x%lx.",
@@ -1911,7 +1899,6 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
 			}
 		}
 	}
-	pagevec_init(&lru_pvec, 0);
 	written = 0;
 	/*
 	 * If the write starts beyond the initialized size, extend it up to the
@@ -1924,8 +1911,7 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
 	ll = ni->initialized_size;
 	read_unlock_irqrestore(&ni->size_lock, flags);
 	if (pos > ll) {
-		err = ntfs_attr_extend_initialized(ni, pos, &cached_page,
-				&lru_pvec);
+		err = ntfs_attr_extend_initialized(ni, pos);
 		if (err < 0) {
 			ntfs_error(vol->sb, "Cannot perform write to inode "
 					"0x%lx, attribute type 0x%x, because "
@@ -2011,7 +1997,7 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
 			ntfs_fault_in_pages_readable_iovec(iov, iov_ofs, bytes);
 		/* Get and lock @do_pages starting at index @start_idx. */
 		status = __ntfs_grab_cache_pages(mapping, start_idx, do_pages,
-				pages, &cached_page, &lru_pvec);
+				pages, &cached_page);
 		if (unlikely(status))
 			break;
 		/*
@@ -2076,7 +2062,6 @@ err_out:
 	*ppos = pos;
 	if (cached_page)
 		page_cache_release(cached_page);
-	pagevec_lru_add_file(&lru_pvec);
 	ntfs_debug("Done.  Returning %s (written 0x%lx, status %li).",
 			written ? "written" : "status", (unsigned long)written,
 			(long)status);
@@ -2148,7 +2133,6 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 /**
  * ntfs_file_fsync - sync a file to disk
  * @filp:	file to be synced
- * @dentry:	dentry describing the file to sync
  * @datasync:	if non-zero only flush user data and not metadata
  *
  * Data integrity sync of a file to disk.  Used for fsync, fdatasync, and msync
@@ -2164,25 +2148,21 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
  * Also, if @datasync is true, we do not wait on the inode to be written out
  * but we always wait on the page cache pages to be written out.
  *
- * Note: In the past @filp could be NULL so we ignore it as we don't need it
- * anyway.
- *
  * Locking: Caller must hold i_mutex on the inode.
  *
  * TODO: We should probably also write all attribute/index inodes associated
  * with this inode but since we have no simple way of getting to them we ignore
  * this problem for now.
  */
-static int ntfs_file_fsync(struct file *filp, struct dentry *dentry,
-		int datasync)
+static int ntfs_file_fsync(struct file *filp, int datasync)
 {
-	struct inode *vi = dentry->d_inode;
+	struct inode *vi = filp->f_mapping->host;
 	int err, ret = 0;
 
 	ntfs_debug("Entering for inode 0x%lx.", vi->i_ino);
 	BUG_ON(S_ISDIR(vi->i_mode));
 	if (!datasync || !NInoNonResident(NTFS_I(vi)))
-		ret = ntfs_write_inode(vi, 1);
+		ret = __ntfs_write_inode(vi, 1);
 	write_inode_now(vi, !datasync);
 	/*
 	 * NOTE: If we were to use mapping->private_list (see ext2 and
diff --git a/fs/ntfs/index.c b/fs/ntfs/index.c
index 2194eff49743..096c135691ae 100644
--- a/fs/ntfs/index.c
+++ b/fs/ntfs/index.c
@@ -19,6 +19,8 @@
  * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
+#include <linux/slab.h>
+
 #include "aops.h"
 #include "collate.h"
 #include "debug.h"
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index dc2505abb6d7..4b57fb1eac2a 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -2957,7 +2957,7 @@ out:
  *
  * Return 0 on success and -errno on error.
  */
-int ntfs_write_inode(struct inode *vi, int sync)
+int __ntfs_write_inode(struct inode *vi, int sync)
 {
 	sle64 nt;
 	ntfs_inode *ni = NTFS_I(vi);
diff --git a/fs/ntfs/inode.h b/fs/ntfs/inode.h
index 117eaf8032a3..9a113544605d 100644
--- a/fs/ntfs/inode.h
+++ b/fs/ntfs/inode.h
@@ -307,12 +307,12 @@ extern void ntfs_truncate_vfs(struct inode *vi);
 
 extern int ntfs_setattr(struct dentry *dentry, struct iattr *attr);
 
-extern int ntfs_write_inode(struct inode *vi, int sync);
+extern int __ntfs_write_inode(struct inode *vi, int sync);
 
 static inline void ntfs_commit_inode(struct inode *vi)
 {
 	if (!is_bad_inode(vi))
-		ntfs_write_inode(vi, 1);
+		__ntfs_write_inode(vi, 1);
 	return;
 }
 
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index 1caa0ef0b2bb..b572b6727181 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -21,6 +21,7 @@
  */
 
 #include <linux/buffer_head.h>
+#include <linux/slab.h>
 #include <linux/swap.h>
 
 #include "attrib.h"
diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c
index 2ca00153b6ec..358273e59ade 100644
--- a/fs/ntfs/namei.c
+++ b/fs/ntfs/namei.c
@@ -23,6 +23,7 @@
 #include <linux/dcache.h>
 #include <linux/exportfs.h>
 #include <linux/security.h>
+#include <linux/slab.h>
 
 #include "attrib.h"
 #include "debug.h"
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 80b04770e8e9..0de1db6cddbf 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -31,6 +31,7 @@
 #include <linux/vfs.h>
 #include <linux/moduleparam.h>
 #include <linux/smp_lock.h>
+#include <linux/bitmap.h>
 
 #include "sysctl.h"
 #include "logfile.h"
@@ -39,6 +40,7 @@
 #include "dir.h"
 #include "debug.h"
 #include "index.h"
+#include "inode.h"
 #include "aops.h"
 #include "layout.h"
 #include "malloc.h"
@@ -2457,7 +2459,6 @@ static void ntfs_put_super(struct super_block *sb)
 static s64 get_nr_free_clusters(ntfs_volume *vol)
 {
 	s64 nr_free = vol->nr_clusters;
-	u32 *kaddr;
 	struct address_space *mapping = vol->lcnbmp_ino->i_mapping;
 	struct page *page;
 	pgoff_t index, max_index;
@@ -2476,7 +2477,8 @@ static s64 get_nr_free_clusters(ntfs_volume *vol)
 	ntfs_debug("Reading $Bitmap, max_index = 0x%lx, max_size = 0x%lx.",
 			max_index, PAGE_CACHE_SIZE / 4);
 	for (index = 0; index < max_index; index++) {
-		unsigned int i;
+		unsigned long *kaddr;
+
 		/*
 		 * Read the page from page cache, getting it from backing store
 		 * if necessary, and increment the use count.
@@ -2489,16 +2491,16 @@ static s64 get_nr_free_clusters(ntfs_volume *vol)
 			nr_free -= PAGE_CACHE_SIZE * 8;
 			continue;
 		}
-		kaddr = (u32*)kmap_atomic(page, KM_USER0);
+		kaddr = kmap_atomic(page, KM_USER0);
 		/*
-		 * For each 4 bytes, subtract the number of set bits. If this
+		 * Subtract the number of set bits. If this
 		 * is the last page and it is partial we don't really care as
 		 * it just means we do a little extra work but it won't affect
 		 * the result as all out of range bytes are set to zero by
 		 * ntfs_readpage().
 		 */
-	  	for (i = 0; i < PAGE_CACHE_SIZE / 4; i++)
-			nr_free -= (s64)hweight32(kaddr[i]);
+		nr_free -= bitmap_weight(kaddr,
+					PAGE_CACHE_SIZE * BITS_PER_BYTE);
 		kunmap_atomic(kaddr, KM_USER0);
 		page_cache_release(page);
 	}
@@ -2537,7 +2539,6 @@ static s64 get_nr_free_clusters(ntfs_volume *vol)
 static unsigned long __get_nr_free_mft_records(ntfs_volume *vol,
 		s64 nr_free, const pgoff_t max_index)
 {
-	u32 *kaddr;
 	struct address_space *mapping = vol->mftbmp_ino->i_mapping;
 	struct page *page;
 	pgoff_t index;
@@ -2547,7 +2548,8 @@ static unsigned long __get_nr_free_mft_records(ntfs_volume *vol,
 	ntfs_debug("Reading $MFT/$BITMAP, max_index = 0x%lx, max_size = "
 			"0x%lx.", max_index, PAGE_CACHE_SIZE / 4);
 	for (index = 0; index < max_index; index++) {
-		unsigned int i;
+		unsigned long *kaddr;
+
 		/*
 		 * Read the page from page cache, getting it from backing store
 		 * if necessary, and increment the use count.
@@ -2560,16 +2562,16 @@ static unsigned long __get_nr_free_mft_records(ntfs_volume *vol,
 			nr_free -= PAGE_CACHE_SIZE * 8;
 			continue;
 		}
-		kaddr = (u32*)kmap_atomic(page, KM_USER0);
+		kaddr = kmap_atomic(page, KM_USER0);
 		/*
-		 * For each 4 bytes, subtract the number of set bits. If this
+		 * Subtract the number of set bits. If this
 		 * is the last page and it is partial we don't really care as
 		 * it just means we do a little extra work but it won't affect
 		 * the result as all out of range bytes are set to zero by
 		 * ntfs_readpage().
 		 */
-	  	for (i = 0; i < PAGE_CACHE_SIZE / 4; i++)
-			nr_free -= (s64)hweight32(kaddr[i]);
+		nr_free -= bitmap_weight(kaddr,
+					PAGE_CACHE_SIZE * BITS_PER_BYTE);
 		kunmap_atomic(kaddr, KM_USER0);
 		page_cache_release(page);
 	}
@@ -2662,6 +2664,13 @@ static int ntfs_statfs(struct dentry *dentry, struct kstatfs *sfs)
 	return 0;
 }
 
+#ifdef NTFS_RW
+static int ntfs_write_inode(struct inode *vi, struct writeback_control *wbc)
+{
+	return __ntfs_write_inode(vi, wbc->sync_mode == WB_SYNC_ALL);
+}
+#endif
+
 /**
  * The complete super operations.
  */
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index 600d2d2ade11..07d9fd854350 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -29,6 +29,7 @@ ocfs2-objs := \
 	mmap.o 			\
 	namei.o 		\
 	refcounttree.o		\
+	reservations.o		\
 	resize.o		\
 	slot_map.o 		\
 	suballoc.o 		\
@@ -46,6 +47,7 @@ ocfs2_stackglue-objs := stackglue.o
 ocfs2_stack_o2cb-objs := stack_o2cb.o
 ocfs2_stack_user-objs := stack_user.o
 
+obj-$(CONFIG_OCFS2_FS) += dlmfs/
 # cluster/ is always needed when OCFS2_FS for masklog support
 obj-$(CONFIG_OCFS2_FS) += cluster/
 obj-$(CONFIG_OCFS2_FS_O2CB) += dlm/
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 0501974bedd0..da702294d7e7 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -21,6 +21,7 @@
 
 #include <linux/init.h>
 #include <linux/module.h>
+#include <linux/slab.h>
 #include <linux/string.h>
 
 #define MLOG_MASK_PREFIX ML_INODE
@@ -30,6 +31,8 @@
 #include "alloc.h"
 #include "dlmglue.h"
 #include "file.h"
+#include "inode.h"
+#include "journal.h"
 #include "ocfs2_fs.h"
 
 #include "xattr.h"
@@ -166,6 +169,60 @@ static struct posix_acl *ocfs2_get_acl(struct inode *inode, int type)
 }
 
 /*
+ * Helper function to set i_mode in memory and disk. Some call paths
+ * will not have di_bh or a journal handle to pass, in which case it
+ * will create it's own.
+ */
+static int ocfs2_acl_set_mode(struct inode *inode, struct buffer_head *di_bh,
+			      handle_t *handle, umode_t new_mode)
+{
+	int ret, commit_handle = 0;
+	struct ocfs2_dinode *di;
+
+	if (di_bh == NULL) {
+		ret = ocfs2_read_inode_block(inode, &di_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	} else
+		get_bh(di_bh);
+
+	if (handle == NULL) {
+		handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb),
+					   OCFS2_INODE_UPDATE_CREDITS);
+		if (IS_ERR(handle)) {
+			ret = PTR_ERR(handle);
+			mlog_errno(ret);
+			goto out_brelse;
+		}
+
+		commit_handle = 1;
+	}
+
+	di = (struct ocfs2_dinode *)di_bh->b_data;
+	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	inode->i_mode = new_mode;
+	di->i_mode = cpu_to_le16(inode->i_mode);
+
+	ocfs2_journal_dirty(handle, di_bh);
+
+out_commit:
+	if (commit_handle)
+		ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
+out_brelse:
+	brelse(di_bh);
+out:
+	return ret;
+}
+
+/*
  * Set the access or default ACL of an inode.
  */
 static int ocfs2_set_acl(handle_t *handle,
@@ -193,9 +250,14 @@ static int ocfs2_set_acl(handle_t *handle,
 			if (ret < 0)
 				return ret;
 			else {
-				inode->i_mode = mode;
 				if (ret == 0)
 					acl = NULL;
+
+				ret = ocfs2_acl_set_mode(inode, di_bh,
+							 handle, mode);
+				if (ret)
+					return ret;
+
 			}
 		}
 		break;
@@ -283,6 +345,7 @@ int ocfs2_init_acl(handle_t *handle,
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct posix_acl *acl = NULL;
 	int ret = 0;
+	mode_t mode;
 
 	if (!S_ISLNK(inode->i_mode)) {
 		if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) {
@@ -291,12 +354,17 @@ int ocfs2_init_acl(handle_t *handle,
 			if (IS_ERR(acl))
 				return PTR_ERR(acl);
 		}
-		if (!acl)
-			inode->i_mode &= ~current_umask();
+		if (!acl) {
+			mode = inode->i_mode & ~current_umask();
+			ret = ocfs2_acl_set_mode(inode, di_bh, handle, mode);
+			if (ret) {
+				mlog_errno(ret);
+				goto cleanup;
+			}
+		}
 	}
 	if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) {
 		struct posix_acl *clone;
-		mode_t mode;
 
 		if (S_ISDIR(inode->i_mode)) {
 			ret = ocfs2_set_acl(handle, inode, di_bh,
@@ -313,7 +381,7 @@ int ocfs2_init_acl(handle_t *handle,
 		mode = inode->i_mode;
 		ret = posix_acl_create_masq(clone, &mode);
 		if (ret >= 0) {
-			inode->i_mode = mode;
+			ret = ocfs2_acl_set_mode(inode, di_bh, handle, mode);
 			if (ret > 0) {
 				ret = ocfs2_set_acl(handle, inode,
 						    di_bh, ACL_TYPE_ACCESS,
@@ -421,7 +489,7 @@ cleanup:
 	return ret;
 }
 
-struct xattr_handler ocfs2_xattr_acl_access_handler = {
+const struct xattr_handler ocfs2_xattr_acl_access_handler = {
 	.prefix	= POSIX_ACL_XATTR_ACCESS,
 	.flags	= ACL_TYPE_ACCESS,
 	.list	= ocfs2_xattr_list_acl_access,
@@ -429,7 +497,7 @@ struct xattr_handler ocfs2_xattr_acl_access_handler = {
 	.set	= ocfs2_xattr_set_acl,
 };
 
-struct xattr_handler ocfs2_xattr_acl_default_handler = {
+const struct xattr_handler ocfs2_xattr_acl_default_handler = {
 	.prefix	= POSIX_ACL_XATTR_DEFAULT,
 	.flags	= ACL_TYPE_DEFAULT,
 	.list	= ocfs2_xattr_list_acl_default,
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index d17bdc718f74..215e12ce1d85 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -1006,7 +1006,7 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle,
 	int count, status, i;
 	u16 suballoc_bit_start;
 	u32 num_got;
-	u64 first_blkno;
+	u64 suballoc_loc, first_blkno;
 	struct ocfs2_super *osb =
 		OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
 	struct ocfs2_extent_block *eb;
@@ -1015,10 +1015,10 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle,
 
 	count = 0;
 	while (count < wanted) {
-		status = ocfs2_claim_metadata(osb,
-					      handle,
+		status = ocfs2_claim_metadata(handle,
 					      meta_ac,
 					      wanted - count,
+					      &suballoc_loc,
 					      &suballoc_bit_start,
 					      &num_got,
 					      &first_blkno);
@@ -1050,7 +1050,9 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle,
 			strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
 			eb->h_blkno = cpu_to_le64(first_blkno);
 			eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
-			eb->h_suballoc_slot = cpu_to_le16(osb->slot_num);
+			eb->h_suballoc_slot =
+				cpu_to_le16(meta_ac->ac_alloc_slot);
+			eb->h_suballoc_loc = cpu_to_le64(suballoc_loc);
 			eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
 			eb->h_list.l_count =
 				cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
@@ -1060,11 +1062,7 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle,
 
 			/* We'll also be dirtied by the caller, so
 			 * this isn't absolutely necessary. */
-			status = ocfs2_journal_dirty(handle, bhs[i]);
-			if (status < 0) {
-				mlog_errno(status);
-				goto bail;
-			}
+			ocfs2_journal_dirty(handle, bhs[i]);
 		}
 
 		count += num_got;
@@ -1128,8 +1126,7 @@ static int ocfs2_adjust_rightmost_branch(handle_t *handle,
 		goto out;
 	}
 
-	status = ocfs2_extend_trans(handle, path_num_items(path) +
-				    handle->h_buffer_credits);
+	status = ocfs2_extend_trans(handle, path_num_items(path));
 	if (status < 0) {
 		mlog_errno(status);
 		goto out;
@@ -1269,12 +1266,7 @@ static int ocfs2_add_branch(handle_t *handle,
 		if (!eb_el->l_tree_depth)
 			new_last_eb_blk = le64_to_cpu(eb->h_blkno);
 
-		status = ocfs2_journal_dirty(handle, bh);
-		if (status < 0) {
-			mlog_errno(status);
-			goto bail;
-		}
-
+		ocfs2_journal_dirty(handle, bh);
 		next_blkno = le64_to_cpu(eb->h_blkno);
 	}
 
@@ -1320,17 +1312,10 @@ static int ocfs2_add_branch(handle_t *handle,
 	eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
 	eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
 
-	status = ocfs2_journal_dirty(handle, *last_eb_bh);
-	if (status < 0)
-		mlog_errno(status);
-	status = ocfs2_journal_dirty(handle, et->et_root_bh);
-	if (status < 0)
-		mlog_errno(status);
-	if (eb_bh) {
-		status = ocfs2_journal_dirty(handle, eb_bh);
-		if (status < 0)
-			mlog_errno(status);
-	}
+	ocfs2_journal_dirty(handle, *last_eb_bh);
+	ocfs2_journal_dirty(handle, et->et_root_bh);
+	if (eb_bh)
+		ocfs2_journal_dirty(handle, eb_bh);
 
 	/*
 	 * Some callers want to track the rightmost leaf so pass it
@@ -1398,11 +1383,7 @@ static int ocfs2_shift_tree_depth(handle_t *handle,
 	for (i = 0; i < le16_to_cpu(root_el->l_next_free_rec); i++)
 		eb_el->l_recs[i] = root_el->l_recs[i];
 
-	status = ocfs2_journal_dirty(handle, new_eb_bh);
-	if (status < 0) {
-		mlog_errno(status);
-		goto bail;
-	}
+	ocfs2_journal_dirty(handle, new_eb_bh);
 
 	status = ocfs2_et_root_journal_access(handle, et,
 					      OCFS2_JOURNAL_ACCESS_WRITE);
@@ -1427,11 +1408,7 @@ static int ocfs2_shift_tree_depth(handle_t *handle,
 	if (root_el->l_tree_depth == cpu_to_le16(1))
 		ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
 
-	status = ocfs2_journal_dirty(handle, et->et_root_bh);
-	if (status < 0) {
-		mlog_errno(status);
-		goto bail;
-	}
+	ocfs2_journal_dirty(handle, et->et_root_bh);
 
 	*ret_new_eb_bh = new_eb_bh;
 	new_eb_bh = NULL;
@@ -2063,7 +2040,7 @@ static void ocfs2_complete_edge_insert(handle_t *handle,
 				       struct ocfs2_path *right_path,
 				       int subtree_index)
 {
-	int ret, i, idx;
+	int i, idx;
 	struct ocfs2_extent_list *el, *left_el, *right_el;
 	struct ocfs2_extent_rec *left_rec, *right_rec;
 	struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
@@ -2101,13 +2078,8 @@ static void ocfs2_complete_edge_insert(handle_t *handle,
 		ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec,
 					      right_el);
 
-		ret = ocfs2_journal_dirty(handle, left_path->p_node[i].bh);
-		if (ret)
-			mlog_errno(ret);
-
-		ret = ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
-		if (ret)
-			mlog_errno(ret);
+		ocfs2_journal_dirty(handle, left_path->p_node[i].bh);
+		ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
 
 		/*
 		 * Setup our list pointers now so that the current
@@ -2131,9 +2103,7 @@ static void ocfs2_complete_edge_insert(handle_t *handle,
 
 	root_bh = left_path->p_node[subtree_index].bh;
 
-	ret = ocfs2_journal_dirty(handle, root_bh);
-	if (ret)
-		mlog_errno(ret);
+	ocfs2_journal_dirty(handle, root_bh);
 }
 
 static int ocfs2_rotate_subtree_right(handle_t *handle,
@@ -2206,11 +2176,7 @@ static int ocfs2_rotate_subtree_right(handle_t *handle,
 
 	ocfs2_create_empty_extent(right_el);
 
-	ret = ocfs2_journal_dirty(handle, right_leaf_bh);
-	if (ret) {
-		mlog_errno(ret);
-		goto out;
-	}
+	ocfs2_journal_dirty(handle, right_leaf_bh);
 
 	/* Do the copy now. */
 	i = le16_to_cpu(left_el->l_next_free_rec) - 1;
@@ -2229,11 +2195,7 @@ static int ocfs2_rotate_subtree_right(handle_t *handle,
 	memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
 	le16_add_cpu(&left_el->l_next_free_rec, 1);
 
-	ret = ocfs2_journal_dirty(handle, left_leaf_bh);
-	if (ret) {
-		mlog_errno(ret);
-		goto out;
-	}
+	ocfs2_journal_dirty(handle, left_leaf_bh);
 
 	ocfs2_complete_edge_insert(handle, left_path, right_path,
 				   subtree_index);
@@ -2248,8 +2210,8 @@ out:
  *
  * Will return zero if the path passed in is already the leftmost path.
  */
-static int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
-					 struct ocfs2_path *path, u32 *cpos)
+int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
+				  struct ocfs2_path *path, u32 *cpos)
 {
 	int i, j, ret = 0;
 	u64 blkno;
@@ -2326,20 +2288,14 @@ static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
 					   int op_credits,
 					   struct ocfs2_path *path)
 {
-	int ret;
+	int ret = 0;
 	int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits;
 
-	if (handle->h_buffer_credits < credits) {
+	if (handle->h_buffer_credits < credits)
 		ret = ocfs2_extend_trans(handle,
 					 credits - handle->h_buffer_credits);
-		if (ret)
-			return ret;
 
-		if (unlikely(handle->h_buffer_credits < credits))
-			return ocfs2_extend_trans(handle, credits);
-	}
-
-	return 0;
+	return ret;
 }
 
 /*
@@ -2583,8 +2539,7 @@ static int ocfs2_update_edge_lengths(handle_t *handle,
 	 * records for all the bh in the path.
 	 * So we have to allocate extra credits and access them.
 	 */
-	ret = ocfs2_extend_trans(handle,
-				 handle->h_buffer_credits + subtree_index);
+	ret = ocfs2_extend_trans(handle, subtree_index);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -2822,12 +2777,8 @@ static int ocfs2_rotate_subtree_left(handle_t *handle,
 		ocfs2_remove_empty_extent(right_leaf_el);
 	}
 
-	ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
-	if (ret)
-		mlog_errno(ret);
-	ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
-	if (ret)
-		mlog_errno(ret);
+	ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
+	ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
 
 	if (del_right_subtree) {
 		ocfs2_unlink_subtree(handle, et, left_path, right_path,
@@ -2850,9 +2801,7 @@ static int ocfs2_rotate_subtree_left(handle_t *handle,
 		if (right_has_empty)
 			ocfs2_remove_empty_extent(left_leaf_el);
 
-		ret = ocfs2_journal_dirty(handle, et_root_bh);
-		if (ret)
-			mlog_errno(ret);
+		ocfs2_journal_dirty(handle, et_root_bh);
 
 		*deleted = 1;
 	} else
@@ -2961,10 +2910,7 @@ static int ocfs2_rotate_rightmost_leaf_left(handle_t *handle,
 	}
 
 	ocfs2_remove_empty_extent(el);
-
-	ret = ocfs2_journal_dirty(handle, bh);
-	if (ret)
-		mlog_errno(ret);
+	ocfs2_journal_dirty(handle, bh);
 
 out:
 	return ret;
@@ -3505,15 +3451,9 @@ static int ocfs2_merge_rec_right(struct ocfs2_path *left_path,
 
 	ocfs2_cleanup_merge(el, index);
 
-	ret = ocfs2_journal_dirty(handle, bh);
-	if (ret)
-		mlog_errno(ret);
-
+	ocfs2_journal_dirty(handle, bh);
 	if (right_path) {
-		ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
-		if (ret)
-			mlog_errno(ret);
-
+		ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
 		ocfs2_complete_edge_insert(handle, left_path, right_path,
 					   subtree_index);
 	}
@@ -3682,14 +3622,9 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path,
 
 	ocfs2_cleanup_merge(el, index);
 
-	ret = ocfs2_journal_dirty(handle, bh);
-	if (ret)
-		mlog_errno(ret);
-
+	ocfs2_journal_dirty(handle, bh);
 	if (left_path) {
-		ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
-		if (ret)
-			mlog_errno(ret);
+		ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
 
 		/*
 		 * In the situation that the right_rec is empty and the extent
@@ -4015,10 +3950,7 @@ static void ocfs2_adjust_rightmost_records(handle_t *handle,
 		le32_add_cpu(&rec->e_int_clusters,
 			     -le32_to_cpu(rec->e_cpos));
 
-		ret = ocfs2_journal_dirty(handle, bh);
-		if (ret)
-			mlog_errno(ret);
-
+		ocfs2_journal_dirty(handle, bh);
 	}
 }
 
@@ -4202,17 +4134,13 @@ static int ocfs2_insert_path(handle_t *handle,
 	struct buffer_head *leaf_bh = path_leaf_bh(right_path);
 
 	if (left_path) {
-		int credits = handle->h_buffer_credits;
-
 		/*
 		 * There's a chance that left_path got passed back to
 		 * us without being accounted for in the
 		 * journal. Extend our transaction here to be sure we
 		 * can change those blocks.
 		 */
-		credits += left_path->p_tree_depth;
-
-		ret = ocfs2_extend_trans(handle, credits);
+		ret = ocfs2_extend_trans(handle, left_path->p_tree_depth);
 		if (ret < 0) {
 			mlog_errno(ret);
 			goto out;
@@ -4250,17 +4178,13 @@ static int ocfs2_insert_path(handle_t *handle,
 		 * dirty this for us.
 		 */
 		if (left_path)
-			ret = ocfs2_journal_dirty(handle,
-						  path_leaf_bh(left_path));
-			if (ret)
-				mlog_errno(ret);
+			ocfs2_journal_dirty(handle,
+					    path_leaf_bh(left_path));
 	} else
 		ocfs2_insert_at_leaf(et, insert_rec, path_leaf_el(right_path),
 				     insert);
 
-	ret = ocfs2_journal_dirty(handle, leaf_bh);
-	if (ret)
-		mlog_errno(ret);
+	ocfs2_journal_dirty(handle, leaf_bh);
 
 	if (left_path) {
 		/*
@@ -4383,9 +4307,7 @@ out_update_clusters:
 		ocfs2_et_update_clusters(et,
 					 le16_to_cpu(insert_rec->e_leaf_clusters));
 
-	ret = ocfs2_journal_dirty(handle, et->et_root_bh);
-	if (ret)
-		mlog_errno(ret);
+	ocfs2_journal_dirty(handle, et->et_root_bh);
 
 out:
 	ocfs2_free_path(left_path);
@@ -4865,7 +4787,7 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,
 		goto leave;
 	}
 
-	status = __ocfs2_claim_clusters(osb, handle, data_ac, 1,
+	status = __ocfs2_claim_clusters(handle, data_ac, 1,
 					clusters_to_add, &bit_off, &num_bits);
 	if (status < 0) {
 		if (status != -ENOSPC)
@@ -4894,11 +4816,7 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,
 		goto leave;
 	}
 
-	status = ocfs2_journal_dirty(handle, et->et_root_bh);
-	if (status < 0) {
-		mlog_errno(status);
-		goto leave;
-	}
+	ocfs2_journal_dirty(handle, et->et_root_bh);
 
 	clusters_to_add -= num_bits;
 	*logical_offset += num_bits;
@@ -5308,7 +5226,7 @@ static int ocfs2_split_tree(handle_t *handle, struct ocfs2_extent_tree *et,
 			    int index, u32 new_range,
 			    struct ocfs2_alloc_context *meta_ac)
 {
-	int ret, depth, credits = handle->h_buffer_credits;
+	int ret, depth, credits;
 	struct buffer_head *last_eb_bh = NULL;
 	struct ocfs2_extent_block *eb;
 	struct ocfs2_extent_list *rightmost_el, *el;
@@ -5339,8 +5257,8 @@ static int ocfs2_split_tree(handle_t *handle, struct ocfs2_extent_tree *et,
 	} else
 		rightmost_el = path_leaf_el(path);
 
-	credits += path->p_tree_depth +
-		   ocfs2_extend_meta_needed(et->et_root_el);
+	credits = path->p_tree_depth +
+		  ocfs2_extend_meta_needed(et->et_root_el);
 	ret = ocfs2_extend_trans(handle, credits);
 	if (ret) {
 		mlog_errno(ret);
@@ -5670,19 +5588,97 @@ out:
 	return ret;
 }
 
+/*
+ * ocfs2_reserve_blocks_for_rec_trunc() would look basically the
+ * same as ocfs2_lock_alloctors(), except for it accepts a blocks
+ * number to reserve some extra blocks, and it only handles meta
+ * data allocations.
+ *
+ * Currently, only ocfs2_remove_btree_range() uses it for truncating
+ * and punching holes.
+ */
+static int ocfs2_reserve_blocks_for_rec_trunc(struct inode *inode,
+					      struct ocfs2_extent_tree *et,
+					      u32 extents_to_split,
+					      struct ocfs2_alloc_context **ac,
+					      int extra_blocks)
+{
+	int ret = 0, num_free_extents;
+	unsigned int max_recs_needed = 2 * extents_to_split;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	*ac = NULL;
+
+	num_free_extents = ocfs2_num_free_extents(osb, et);
+	if (num_free_extents < 0) {
+		ret = num_free_extents;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (!num_free_extents ||
+	    (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed))
+		extra_blocks += ocfs2_extend_meta_needed(et->et_root_el);
+
+	if (extra_blocks) {
+		ret = ocfs2_reserve_new_metadata_blocks(osb, extra_blocks, ac);
+		if (ret < 0) {
+			if (ret != -ENOSPC)
+				mlog_errno(ret);
+			goto out;
+		}
+	}
+
+out:
+	if (ret) {
+		if (*ac) {
+			ocfs2_free_alloc_context(*ac);
+			*ac = NULL;
+		}
+	}
+
+	return ret;
+}
+
 int ocfs2_remove_btree_range(struct inode *inode,
 			     struct ocfs2_extent_tree *et,
-			     u32 cpos, u32 phys_cpos, u32 len,
-			     struct ocfs2_cached_dealloc_ctxt *dealloc)
+			     u32 cpos, u32 phys_cpos, u32 len, int flags,
+			     struct ocfs2_cached_dealloc_ctxt *dealloc,
+			     u64 refcount_loc)
 {
-	int ret;
+	int ret, credits = 0, extra_blocks = 0;
 	u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct inode *tl_inode = osb->osb_tl_inode;
 	handle_t *handle;
 	struct ocfs2_alloc_context *meta_ac = NULL;
+	struct ocfs2_refcount_tree *ref_tree = NULL;
+
+	if ((flags & OCFS2_EXT_REFCOUNTED) && len) {
+		BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
+			 OCFS2_HAS_REFCOUNT_FL));
+
+		ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1,
+					       &ref_tree, NULL);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
 
-	ret = ocfs2_lock_allocators(inode, et, 0, 1, NULL, &meta_ac);
+		ret = ocfs2_prepare_refcount_change_for_del(inode,
+							    refcount_loc,
+							    phys_blkno,
+							    len,
+							    &credits,
+							    &extra_blocks);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	ret = ocfs2_reserve_blocks_for_rec_trunc(inode, et, 1, &meta_ac,
+						 extra_blocks);
 	if (ret) {
 		mlog_errno(ret);
 		return ret;
@@ -5698,7 +5694,8 @@ int ocfs2_remove_btree_range(struct inode *inode,
 		}
 	}
 
-	handle = ocfs2_start_trans(osb, ocfs2_remove_extent_credits(osb->sb));
+	handle = ocfs2_start_trans(osb,
+			ocfs2_remove_extent_credits(osb->sb) + credits);
 	if (IS_ERR(handle)) {
 		ret = PTR_ERR(handle);
 		mlog_errno(ret);
@@ -5712,7 +5709,7 @@ int ocfs2_remove_btree_range(struct inode *inode,
 		goto out;
 	}
 
-	vfs_dq_free_space_nodirty(inode,
+	dquot_free_space_nodirty(inode,
 				  ocfs2_clusters_to_bytes(inode->i_sb, len));
 
 	ret = ocfs2_remove_extent(handle, et, cpos, len, meta_ac, dealloc);
@@ -5723,15 +5720,22 @@ int ocfs2_remove_btree_range(struct inode *inode,
 
 	ocfs2_et_update_clusters(et, -len);
 
-	ret = ocfs2_journal_dirty(handle, et->et_root_bh);
-	if (ret) {
-		mlog_errno(ret);
-		goto out_commit;
-	}
+	ocfs2_journal_dirty(handle, et->et_root_bh);
 
-	ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len);
-	if (ret)
-		mlog_errno(ret);
+	if (phys_blkno) {
+		if (flags & OCFS2_EXT_REFCOUNTED)
+			ret = ocfs2_decrease_refcount(inode, handle,
+					ocfs2_blocks_to_clusters(osb->sb,
+								 phys_blkno),
+					len, meta_ac,
+					dealloc, 1);
+		else
+			ret = ocfs2_truncate_log_append(osb, handle,
+							phys_blkno, len);
+		if (ret)
+			mlog_errno(ret);
+
+	}
 
 out_commit:
 	ocfs2_commit_trans(osb, handle);
@@ -5741,6 +5745,9 @@ out:
 	if (meta_ac)
 		ocfs2_free_alloc_context(meta_ac);
 
+	if (ref_tree)
+		ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
+
 	return ret;
 }
 
@@ -5849,11 +5856,7 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb,
 	}
 	tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters);
 
-	status = ocfs2_journal_dirty(handle, tl_bh);
-	if (status < 0) {
-		mlog_errno(status);
-		goto bail;
-	}
+	ocfs2_journal_dirty(handle, tl_bh);
 
 bail:
 	mlog_exit(status);
@@ -5892,11 +5895,7 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
 
 		tl->tl_used = cpu_to_le16(i);
 
-		status = ocfs2_journal_dirty(handle, tl_bh);
-		if (status < 0) {
-			mlog_errno(status);
-			goto bail;
-		}
+		ocfs2_journal_dirty(handle, tl_bh);
 
 		/* TODO: Perhaps we can calculate the bulk of the
 		 * credits up front rather than extending like
@@ -6037,7 +6036,7 @@ static void ocfs2_truncate_log_worker(struct work_struct *work)
 	if (status < 0)
 		mlog_errno(status);
 	else
-		ocfs2_init_inode_steal_slot(osb);
+		ocfs2_init_steal_slots(osb);
 
 	mlog_exit(status);
 }
@@ -6297,6 +6296,7 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb)
  */
 struct ocfs2_cached_block_free {
 	struct ocfs2_cached_block_free		*free_next;
+	u64					free_bg;
 	u64					free_blk;
 	unsigned int				free_bit;
 };
@@ -6343,8 +6343,11 @@ static int ocfs2_free_cached_blocks(struct ocfs2_super *osb,
 	}
 
 	while (head) {
-		bg_blkno = ocfs2_which_suballoc_group(head->free_blk,
-						      head->free_bit);
+		if (head->free_bg)
+			bg_blkno = head->free_bg;
+		else
+			bg_blkno = ocfs2_which_suballoc_group(head->free_blk,
+							      head->free_bit);
 		mlog(0, "Free bit: (bit %u, blkno %llu)\n",
 		     head->free_bit, (unsigned long long)head->free_blk);
 
@@ -6392,7 +6395,7 @@ int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
 	int ret = 0;
 	struct ocfs2_cached_block_free *item;
 
-	item = kmalloc(sizeof(*item), GFP_NOFS);
+	item = kzalloc(sizeof(*item), GFP_NOFS);
 	if (item == NULL) {
 		ret = -ENOMEM;
 		mlog_errno(ret);
@@ -6532,8 +6535,8 @@ ocfs2_find_per_slot_free_list(int type,
 }
 
 int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
-			      int type, int slot, u64 blkno,
-			      unsigned int bit)
+			      int type, int slot, u64 suballoc,
+			      u64 blkno, unsigned int bit)
 {
 	int ret;
 	struct ocfs2_per_slot_free_list *fl;
@@ -6546,7 +6549,7 @@ int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
 		goto out;
 	}
 
-	item = kmalloc(sizeof(*item), GFP_NOFS);
+	item = kzalloc(sizeof(*item), GFP_NOFS);
 	if (item == NULL) {
 		ret = -ENOMEM;
 		mlog_errno(ret);
@@ -6556,6 +6559,7 @@ int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
 	mlog(0, "Insert: (type %d, slot %u, bit %u, blk %llu)\n",
 	     type, slot, bit, (unsigned long long)blkno);
 
+	item->free_bg = suballoc;
 	item->free_blk = blkno;
 	item->free_bit = bit;
 	item->free_next = fl->f_first;
@@ -6572,433 +6576,11 @@ static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
 {
 	return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE,
 					 le16_to_cpu(eb->h_suballoc_slot),
+					 le64_to_cpu(eb->h_suballoc_loc),
 					 le64_to_cpu(eb->h_blkno),
 					 le16_to_cpu(eb->h_suballoc_bit));
 }
 
-/* This function will figure out whether the currently last extent
- * block will be deleted, and if it will, what the new last extent
- * block will be so we can update his h_next_leaf_blk field, as well
- * as the dinodes i_last_eb_blk */
-static int ocfs2_find_new_last_ext_blk(struct inode *inode,
-				       unsigned int clusters_to_del,
-				       struct ocfs2_path *path,
-				       struct buffer_head **new_last_eb)
-{
-	int next_free, ret = 0;
-	u32 cpos;
-	struct ocfs2_extent_rec *rec;
-	struct ocfs2_extent_block *eb;
-	struct ocfs2_extent_list *el;
-	struct buffer_head *bh = NULL;
-
-	*new_last_eb = NULL;
-
-	/* we have no tree, so of course, no last_eb. */
-	if (!path->p_tree_depth)
-		goto out;
-
-	/* trunc to zero special case - this makes tree_depth = 0
-	 * regardless of what it is.  */
-	if (OCFS2_I(inode)->ip_clusters == clusters_to_del)
-		goto out;
-
-	el = path_leaf_el(path);
-	BUG_ON(!el->l_next_free_rec);
-
-	/*
-	 * Make sure that this extent list will actually be empty
-	 * after we clear away the data. We can shortcut out if
-	 * there's more than one non-empty extent in the
-	 * list. Otherwise, a check of the remaining extent is
-	 * necessary.
-	 */
-	next_free = le16_to_cpu(el->l_next_free_rec);
-	rec = NULL;
-	if (ocfs2_is_empty_extent(&el->l_recs[0])) {
-		if (next_free > 2)
-			goto out;
-
-		/* We may have a valid extent in index 1, check it. */
-		if (next_free == 2)
-			rec = &el->l_recs[1];
-
-		/*
-		 * Fall through - no more nonempty extents, so we want
-		 * to delete this leaf.
-		 */
-	} else {
-		if (next_free > 1)
-			goto out;
-
-		rec = &el->l_recs[0];
-	}
-
-	if (rec) {
-		/*
-		 * Check it we'll only be trimming off the end of this
-		 * cluster.
-		 */
-		if (le16_to_cpu(rec->e_leaf_clusters) > clusters_to_del)
-			goto out;
-	}
-
-	ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
-	if (ret) {
-		mlog_errno(ret);
-		goto out;
-	}
-
-	ret = ocfs2_find_leaf(INODE_CACHE(inode), path_root_el(path), cpos, &bh);
-	if (ret) {
-		mlog_errno(ret);
-		goto out;
-	}
-
-	eb = (struct ocfs2_extent_block *) bh->b_data;
-	el = &eb->h_list;
-
-	/* ocfs2_find_leaf() gets the eb from ocfs2_read_extent_block().
-	 * Any corruption is a code bug. */
-	BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
-
-	*new_last_eb = bh;
-	get_bh(*new_last_eb);
-	mlog(0, "returning block %llu, (cpos: %u)\n",
-	     (unsigned long long)le64_to_cpu(eb->h_blkno), cpos);
-out:
-	brelse(bh);
-
-	return ret;
-}
-
-/*
- * Trim some clusters off the rightmost edge of a tree. Only called
- * during truncate.
- *
- * The caller needs to:
- *   - start journaling of each path component.
- *   - compute and fully set up any new last ext block
- */
-static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path,
-			   handle_t *handle, struct ocfs2_truncate_context *tc,
-			   u32 clusters_to_del, u64 *delete_start, u8 *flags)
-{
-	int ret, i, index = path->p_tree_depth;
-	u32 new_edge = 0;
-	u64 deleted_eb = 0;
-	struct buffer_head *bh;
-	struct ocfs2_extent_list *el;
-	struct ocfs2_extent_rec *rec;
-
-	*delete_start = 0;
-	*flags = 0;
-
-	while (index >= 0) {
-		bh = path->p_node[index].bh;
-		el = path->p_node[index].el;
-
-		mlog(0, "traveling tree (index = %d, block = %llu)\n",
-		     index,  (unsigned long long)bh->b_blocknr);
-
-		BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
-
-		if (index !=
-		    (path->p_tree_depth - le16_to_cpu(el->l_tree_depth))) {
-			ocfs2_error(inode->i_sb,
-				    "Inode %lu has invalid ext. block %llu",
-				    inode->i_ino,
-				    (unsigned long long)bh->b_blocknr);
-			ret = -EROFS;
-			goto out;
-		}
-
-find_tail_record:
-		i = le16_to_cpu(el->l_next_free_rec) - 1;
-		rec = &el->l_recs[i];
-
-		mlog(0, "Extent list before: record %d: (%u, %u, %llu), "
-		     "next = %u\n", i, le32_to_cpu(rec->e_cpos),
-		     ocfs2_rec_clusters(el, rec),
-		     (unsigned long long)le64_to_cpu(rec->e_blkno),
-		     le16_to_cpu(el->l_next_free_rec));
-
-		BUG_ON(ocfs2_rec_clusters(el, rec) < clusters_to_del);
-
-		if (le16_to_cpu(el->l_tree_depth) == 0) {
-			/*
-			 * If the leaf block contains a single empty
-			 * extent and no records, we can just remove
-			 * the block.
-			 */
-			if (i == 0 && ocfs2_is_empty_extent(rec)) {
-				memset(rec, 0,
-				       sizeof(struct ocfs2_extent_rec));
-				el->l_next_free_rec = cpu_to_le16(0);
-
-				goto delete;
-			}
-
-			/*
-			 * Remove any empty extents by shifting things
-			 * left. That should make life much easier on
-			 * the code below. This condition is rare
-			 * enough that we shouldn't see a performance
-			 * hit.
-			 */
-			if (ocfs2_is_empty_extent(&el->l_recs[0])) {
-				le16_add_cpu(&el->l_next_free_rec, -1);
-
-				for(i = 0;
-				    i < le16_to_cpu(el->l_next_free_rec); i++)
-					el->l_recs[i] = el->l_recs[i + 1];
-
-				memset(&el->l_recs[i], 0,
-				       sizeof(struct ocfs2_extent_rec));
-
-				/*
-				 * We've modified our extent list. The
-				 * simplest way to handle this change
-				 * is to being the search from the
-				 * start again.
-				 */
-				goto find_tail_record;
-			}
-
-			le16_add_cpu(&rec->e_leaf_clusters, -clusters_to_del);
-
-			/*
-			 * We'll use "new_edge" on our way back up the
-			 * tree to know what our rightmost cpos is.
-			 */
-			new_edge = le16_to_cpu(rec->e_leaf_clusters);
-			new_edge += le32_to_cpu(rec->e_cpos);
-
-			/*
-			 * The caller will use this to delete data blocks.
-			 */
-			*delete_start = le64_to_cpu(rec->e_blkno)
-				+ ocfs2_clusters_to_blocks(inode->i_sb,
-					le16_to_cpu(rec->e_leaf_clusters));
-			*flags = rec->e_flags;
-
-			/*
-			 * If it's now empty, remove this record.
-			 */
-			if (le16_to_cpu(rec->e_leaf_clusters) == 0) {
-				memset(rec, 0,
-				       sizeof(struct ocfs2_extent_rec));
-				le16_add_cpu(&el->l_next_free_rec, -1);
-			}
-		} else {
-			if (le64_to_cpu(rec->e_blkno) == deleted_eb) {
-				memset(rec, 0,
-				       sizeof(struct ocfs2_extent_rec));
-				le16_add_cpu(&el->l_next_free_rec, -1);
-
-				goto delete;
-			}
-
-			/* Can this actually happen? */
-			if (le16_to_cpu(el->l_next_free_rec) == 0)
-				goto delete;
-
-			/*
-			 * We never actually deleted any clusters
-			 * because our leaf was empty. There's no
-			 * reason to adjust the rightmost edge then.
-			 */
-			if (new_edge == 0)
-				goto delete;
-
-			rec->e_int_clusters = cpu_to_le32(new_edge);
-			le32_add_cpu(&rec->e_int_clusters,
-				     -le32_to_cpu(rec->e_cpos));
-
-			 /*
-			  * A deleted child record should have been
-			  * caught above.
-			  */
-			 BUG_ON(le32_to_cpu(rec->e_int_clusters) == 0);
-		}
-
-delete:
-		ret = ocfs2_journal_dirty(handle, bh);
-		if (ret) {
-			mlog_errno(ret);
-			goto out;
-		}
-
-		mlog(0, "extent list container %llu, after: record %d: "
-		     "(%u, %u, %llu), next = %u.\n",
-		     (unsigned long long)bh->b_blocknr, i,
-		     le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec),
-		     (unsigned long long)le64_to_cpu(rec->e_blkno),
-		     le16_to_cpu(el->l_next_free_rec));
-
-		/*
-		 * We must be careful to only attempt delete of an
-		 * extent block (and not the root inode block).
-		 */
-		if (index > 0 && le16_to_cpu(el->l_next_free_rec) == 0) {
-			struct ocfs2_extent_block *eb =
-				(struct ocfs2_extent_block *)bh->b_data;
-
-			/*
-			 * Save this for use when processing the
-			 * parent block.
-			 */
-			deleted_eb = le64_to_cpu(eb->h_blkno);
-
-			mlog(0, "deleting this extent block.\n");
-
-			ocfs2_remove_from_cache(INODE_CACHE(inode), bh);
-
-			BUG_ON(ocfs2_rec_clusters(el, &el->l_recs[0]));
-			BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos));
-			BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno));
-
-			ret = ocfs2_cache_extent_block_free(&tc->tc_dealloc, eb);
-			/* An error here is not fatal. */
-			if (ret < 0)
-				mlog_errno(ret);
-		} else {
-			deleted_eb = 0;
-		}
-
-		index--;
-	}
-
-	ret = 0;
-out:
-	return ret;
-}
-
-static int ocfs2_do_truncate(struct ocfs2_super *osb,
-			     unsigned int clusters_to_del,
-			     struct inode *inode,
-			     struct buffer_head *fe_bh,
-			     handle_t *handle,
-			     struct ocfs2_truncate_context *tc,
-			     struct ocfs2_path *path,
-			     struct ocfs2_alloc_context *meta_ac)
-{
-	int status;
-	struct ocfs2_dinode *fe;
-	struct ocfs2_extent_block *last_eb = NULL;
-	struct ocfs2_extent_list *el;
-	struct buffer_head *last_eb_bh = NULL;
-	u64 delete_blk = 0;
-	u8 rec_flags;
-
-	fe = (struct ocfs2_dinode *) fe_bh->b_data;
-
-	status = ocfs2_find_new_last_ext_blk(inode, clusters_to_del,
-					     path, &last_eb_bh);
-	if (status < 0) {
-		mlog_errno(status);
-		goto bail;
-	}
-
-	/*
-	 * Each component will be touched, so we might as well journal
-	 * here to avoid having to handle errors later.
-	 */
-	status = ocfs2_journal_access_path(INODE_CACHE(inode), handle, path);
-	if (status < 0) {
-		mlog_errno(status);
-		goto bail;
-	}
-
-	if (last_eb_bh) {
-		status = ocfs2_journal_access_eb(handle, INODE_CACHE(inode), last_eb_bh,
-						 OCFS2_JOURNAL_ACCESS_WRITE);
-		if (status < 0) {
-			mlog_errno(status);
-			goto bail;
-		}
-
-		last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
-	}
-
-	el = &(fe->id2.i_list);
-
-	/*
-	 * Lower levels depend on this never happening, but it's best
-	 * to check it up here before changing the tree.
-	 */
-	if (el->l_tree_depth && el->l_recs[0].e_int_clusters == 0) {
-		ocfs2_error(inode->i_sb,
-			    "Inode %lu has an empty extent record, depth %u\n",
-			    inode->i_ino, le16_to_cpu(el->l_tree_depth));
-		status = -EROFS;
-		goto bail;
-	}
-
-	vfs_dq_free_space_nodirty(inode,
-			ocfs2_clusters_to_bytes(osb->sb, clusters_to_del));
-	spin_lock(&OCFS2_I(inode)->ip_lock);
-	OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
-				      clusters_to_del;
-	spin_unlock(&OCFS2_I(inode)->ip_lock);
-	le32_add_cpu(&fe->i_clusters, -clusters_to_del);
-	inode->i_blocks = ocfs2_inode_sector_count(inode);
-
-	status = ocfs2_trim_tree(inode, path, handle, tc,
-				 clusters_to_del, &delete_blk, &rec_flags);
-	if (status) {
-		mlog_errno(status);
-		goto bail;
-	}
-
-	if (le32_to_cpu(fe->i_clusters) == 0) {
-		/* trunc to zero is a special case. */
-		el->l_tree_depth = 0;
-		fe->i_last_eb_blk = 0;
-	} else if (last_eb)
-		fe->i_last_eb_blk = last_eb->h_blkno;
-
-	status = ocfs2_journal_dirty(handle, fe_bh);
-	if (status < 0) {
-		mlog_errno(status);
-		goto bail;
-	}
-
-	if (last_eb) {
-		/* If there will be a new last extent block, then by
-		 * definition, there cannot be any leaves to the right of
-		 * him. */
-		last_eb->h_next_leaf_blk = 0;
-		status = ocfs2_journal_dirty(handle, last_eb_bh);
-		if (status < 0) {
-			mlog_errno(status);
-			goto bail;
-		}
-	}
-
-	if (delete_blk) {
-		if (rec_flags & OCFS2_EXT_REFCOUNTED)
-			status = ocfs2_decrease_refcount(inode, handle,
-					ocfs2_blocks_to_clusters(osb->sb,
-								 delete_blk),
-					clusters_to_del, meta_ac,
-					&tc->tc_dealloc, 1);
-		else
-			status = ocfs2_truncate_log_append(osb, handle,
-							   delete_blk,
-							   clusters_to_del);
-		if (status < 0) {
-			mlog_errno(status);
-			goto bail;
-		}
-	}
-	status = 0;
-bail:
-	brelse(last_eb_bh);
-	mlog_exit(status);
-	return status;
-}
-
 static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh)
 {
 	set_buffer_uptodate(bh);
@@ -7300,14 +6882,15 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
 		unsigned int page_end;
 		u64 phys;
 
-		if (vfs_dq_alloc_space_nodirty(inode,
-				       ocfs2_clusters_to_bytes(osb->sb, 1))) {
-			ret = -EDQUOT;
+		ret = dquot_alloc_space_nodirty(inode,
+				       ocfs2_clusters_to_bytes(osb->sb, 1));
+		if (ret)
 			goto out_commit;
-		}
 		did_quota = 1;
 
-		ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off,
+		data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
+
+		ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off,
 					   &num);
 		if (ret) {
 			mlog_errno(ret);
@@ -7380,7 +6963,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
 
 out_commit:
 	if (ret < 0 && did_quota)
-		vfs_dq_free_space_nodirty(inode,
+		dquot_free_space_nodirty(inode,
 					  ocfs2_clusters_to_bytes(osb->sb, 1));
 
 	ocfs2_commit_trans(osb, handle);
@@ -7406,26 +6989,29 @@ out:
  */
 int ocfs2_commit_truncate(struct ocfs2_super *osb,
 			  struct inode *inode,
-			  struct buffer_head *fe_bh,
-			  struct ocfs2_truncate_context *tc)
+			  struct buffer_head *di_bh)
 {
-	int status, i, credits, tl_sem = 0;
-	u32 clusters_to_del, new_highest_cpos, range;
+	int status = 0, i, flags = 0;
+	u32 new_highest_cpos, range, trunc_cpos, trunc_len, phys_cpos, coff;
 	u64 blkno = 0;
 	struct ocfs2_extent_list *el;
-	handle_t *handle = NULL;
-	struct inode *tl_inode = osb->osb_tl_inode;
+	struct ocfs2_extent_rec *rec;
 	struct ocfs2_path *path = NULL;
-	struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data;
-	struct ocfs2_alloc_context *meta_ac = NULL;
-	struct ocfs2_refcount_tree *ref_tree = NULL;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	struct ocfs2_extent_list *root_el = &(di->id2.i_list);
+	u64 refcount_loc = le64_to_cpu(di->i_refcount_loc);
+	struct ocfs2_extent_tree et;
+	struct ocfs2_cached_dealloc_ctxt dealloc;
 
 	mlog_entry_void();
 
+	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
+	ocfs2_init_dealloc_ctxt(&dealloc);
+
 	new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
 						     i_size_read(inode));
 
-	path = ocfs2_new_path(fe_bh, &di->id2.i_list,
+	path = ocfs2_new_path(di_bh, &di->id2.i_list,
 			      ocfs2_journal_access_di);
 	if (!path) {
 		status = -ENOMEM;
@@ -7444,8 +7030,6 @@ start:
 		goto bail;
 	}
 
-	credits = 0;
-
 	/*
 	 * Truncate always works against the rightmost tree branch.
 	 */
@@ -7480,101 +7064,62 @@ start:
 	}
 
 	i = le16_to_cpu(el->l_next_free_rec) - 1;
-	range = le32_to_cpu(el->l_recs[i].e_cpos) +
-		ocfs2_rec_clusters(el, &el->l_recs[i]);
-	if (i == 0 && ocfs2_is_empty_extent(&el->l_recs[i])) {
-		clusters_to_del = 0;
-	} else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) {
-		clusters_to_del = ocfs2_rec_clusters(el, &el->l_recs[i]);
-		blkno = le64_to_cpu(el->l_recs[i].e_blkno);
+	rec = &el->l_recs[i];
+	flags = rec->e_flags;
+	range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
+
+	if (i == 0 && ocfs2_is_empty_extent(rec)) {
+		/*
+		 * Lower levels depend on this never happening, but it's best
+		 * to check it up here before changing the tree.
+		*/
+		if (root_el->l_tree_depth && rec->e_int_clusters == 0) {
+			ocfs2_error(inode->i_sb, "Inode %lu has an empty "
+				    "extent record, depth %u\n", inode->i_ino,
+				    le16_to_cpu(root_el->l_tree_depth));
+			status = -EROFS;
+			goto bail;
+		}
+		trunc_cpos = le32_to_cpu(rec->e_cpos);
+		trunc_len = 0;
+		blkno = 0;
+	} else if (le32_to_cpu(rec->e_cpos) >= new_highest_cpos) {
+		/*
+		 * Truncate entire record.
+		 */
+		trunc_cpos = le32_to_cpu(rec->e_cpos);
+		trunc_len = ocfs2_rec_clusters(el, rec);
+		blkno = le64_to_cpu(rec->e_blkno);
 	} else if (range > new_highest_cpos) {
-		clusters_to_del = (ocfs2_rec_clusters(el, &el->l_recs[i]) +
-				   le32_to_cpu(el->l_recs[i].e_cpos)) -
-				  new_highest_cpos;
-		blkno = le64_to_cpu(el->l_recs[i].e_blkno) +
-			ocfs2_clusters_to_blocks(inode->i_sb,
-				ocfs2_rec_clusters(el, &el->l_recs[i]) -
-				clusters_to_del);
+		/*
+		 * Partial truncate. it also should be
+		 * the last truncate we're doing.
+		 */
+		trunc_cpos = new_highest_cpos;
+		trunc_len = range - new_highest_cpos;
+		coff = new_highest_cpos - le32_to_cpu(rec->e_cpos);
+		blkno = le64_to_cpu(rec->e_blkno) +
+				ocfs2_clusters_to_blocks(inode->i_sb, coff);
 	} else {
+		/*
+		 * Truncate completed, leave happily.
+		 */
 		status = 0;
 		goto bail;
 	}
 
-	mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n",
-	     clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr);
-
-	if (el->l_recs[i].e_flags & OCFS2_EXT_REFCOUNTED && clusters_to_del) {
-		BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
-			 OCFS2_HAS_REFCOUNT_FL));
-
-		status = ocfs2_lock_refcount_tree(osb,
-						le64_to_cpu(di->i_refcount_loc),
-						1, &ref_tree, NULL);
-		if (status) {
-			mlog_errno(status);
-			goto bail;
-		}
-
-		status = ocfs2_prepare_refcount_change_for_del(inode, fe_bh,
-							       blkno,
-							       clusters_to_del,
-							       &credits,
-							       &meta_ac);
-		if (status < 0) {
-			mlog_errno(status);
-			goto bail;
-		}
-	}
-
-	mutex_lock(&tl_inode->i_mutex);
-	tl_sem = 1;
-	/* ocfs2_truncate_log_needs_flush guarantees us at least one
-	 * record is free for use. If there isn't any, we flush to get
-	 * an empty truncate log.  */
-	if (ocfs2_truncate_log_needs_flush(osb)) {
-		status = __ocfs2_flush_truncate_log(osb);
-		if (status < 0) {
-			mlog_errno(status);
-			goto bail;
-		}
-	}
-
-	credits += ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del,
-						(struct ocfs2_dinode *)fe_bh->b_data,
-						el);
-	handle = ocfs2_start_trans(osb, credits);
-	if (IS_ERR(handle)) {
-		status = PTR_ERR(handle);
-		handle = NULL;
-		mlog_errno(status);
-		goto bail;
-	}
+	phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
 
-	status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle,
-				   tc, path, meta_ac);
+	status = ocfs2_remove_btree_range(inode, &et, trunc_cpos,
+					  phys_cpos, trunc_len, flags, &dealloc,
+					  refcount_loc);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
 	}
 
-	mutex_unlock(&tl_inode->i_mutex);
-	tl_sem = 0;
-
-	ocfs2_commit_trans(osb, handle);
-	handle = NULL;
-
 	ocfs2_reinit_path(path, 1);
 
-	if (meta_ac) {
-		ocfs2_free_alloc_context(meta_ac);
-		meta_ac = NULL;
-	}
-
-	if (ref_tree) {
-		ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
-		ref_tree = NULL;
-	}
-
 	/*
 	 * The check above will catch the case where we've truncated
 	 * away all allocation.
@@ -7585,25 +7130,10 @@ bail:
 
 	ocfs2_schedule_truncate_log_flush(osb, 1);
 
-	if (tl_sem)
-		mutex_unlock(&tl_inode->i_mutex);
-
-	if (handle)
-		ocfs2_commit_trans(osb, handle);
-
-	if (meta_ac)
-		ocfs2_free_alloc_context(meta_ac);
-
-	if (ref_tree)
-		ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
-
-	ocfs2_run_deallocs(osb, &tc->tc_dealloc);
+	ocfs2_run_deallocs(osb, &dealloc);
 
 	ocfs2_free_path(path);
 
-	/* This will drop the ext_alloc cluster lock for us */
-	ocfs2_free_truncate_context(tc);
-
 	mlog_exit(status);
 	return status;
 }
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 1db4359ccb90..55762b554b99 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -140,8 +140,9 @@ int ocfs2_remove_extent(handle_t *handle, struct ocfs2_extent_tree *et,
 			struct ocfs2_cached_dealloc_ctxt *dealloc);
 int ocfs2_remove_btree_range(struct inode *inode,
 			     struct ocfs2_extent_tree *et,
-			     u32 cpos, u32 phys_cpos, u32 len,
-			     struct ocfs2_cached_dealloc_ctxt *dealloc);
+			     u32 cpos, u32 phys_cpos, u32 len, int flags,
+			     struct ocfs2_cached_dealloc_ctxt *dealloc,
+			     u64 refcount_loc);
 
 int ocfs2_num_free_extents(struct ocfs2_super *osb,
 			   struct ocfs2_extent_tree *et);
@@ -209,7 +210,7 @@ static inline void ocfs2_init_dealloc_ctxt(struct ocfs2_cached_dealloc_ctxt *c)
 int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
 				u64 blkno, unsigned int bit);
 int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
-			      int type, int slot, u64 blkno,
+			      int type, int slot, u64 suballoc, u64 blkno,
 			      unsigned int bit);
 static inline int ocfs2_dealloc_has_cluster(struct ocfs2_cached_dealloc_ctxt *c)
 {
@@ -233,8 +234,7 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
 			   struct ocfs2_truncate_context **tc);
 int ocfs2_commit_truncate(struct ocfs2_super *osb,
 			  struct inode *inode,
-			  struct buffer_head *fe_bh,
-			  struct ocfs2_truncate_context *tc);
+			  struct buffer_head *di_bh);
 int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
 			  unsigned int start, unsigned int end, int trunc);
 
@@ -319,6 +319,8 @@ int ocfs2_journal_access_path(struct ocfs2_caching_info *ci,
 			      struct ocfs2_path *path);
 int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
 				   struct ocfs2_path *path, u32 *cpos);
+int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
+				  struct ocfs2_path *path, u32 *cpos);
 int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et,
 			    struct ocfs2_path *left,
 			    struct ocfs2_path *right);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 3dae4a13f6e4..356e976772bf 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -196,15 +196,14 @@ int ocfs2_get_block(struct inode *inode, sector_t iblock,
 			dump_stack();
 			goto bail;
 		}
-
-		past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
-		mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino,
-		     (unsigned long long)past_eof);
-
-		if (create && (iblock >= past_eof))
-			set_buffer_new(bh_result);
 	}
 
+	past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
+	mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino,
+	     (unsigned long long)past_eof);
+	if (create && (iblock >= past_eof))
+		set_buffer_new(bh_result);
+
 bail:
 	if (err < 0)
 		err = -EIO;
@@ -459,36 +458,6 @@ int walk_page_buffers(	handle_t *handle,
 	return ret;
 }
 
-handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
-							 struct page *page,
-							 unsigned from,
-							 unsigned to)
-{
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-	handle_t *handle;
-	int ret = 0;
-
-	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
-	if (IS_ERR(handle)) {
-		ret = -ENOMEM;
-		mlog_errno(ret);
-		goto out;
-	}
-
-	if (ocfs2_should_order_data(inode)) {
-		ret = ocfs2_jbd2_file_inode(handle, inode);
-		if (ret < 0)
-			mlog_errno(ret);
-	}
-out:
-	if (ret) {
-		if (!IS_ERR(handle))
-			ocfs2_commit_trans(osb, handle);
-		handle = ERR_PTR(ret);
-	}
-	return handle;
-}
-
 static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
 {
 	sector_t status;
@@ -577,8 +546,9 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
 		goto bail;
 	}
 
-	/* We should already CoW the refcounted extent. */
-	BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED);
+	/* We should already CoW the refcounted extent in case of create. */
+	BUG_ON(create && (ext_flags & OCFS2_EXT_REFCOUNTED));
+
 	/*
 	 * get_more_blocks() expects us to describe a hole by clearing
 	 * the mapped bit on bh_result().
@@ -599,7 +569,7 @@ bail:
 	return ret;
 }
 
-/* 
+/*
  * ocfs2_dio_end_io is called by the dio core when a dio is finished.  We're
  * particularly interested in the aio/dio case.  Like the core uses
  * i_alloc_sem, we use the rw_lock DLM lock to protect io on one node from
@@ -670,7 +640,7 @@ static ssize_t ocfs2_direct_IO(int rw,
 
 	ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
 					    inode->i_sb->s_bdev, iov, offset,
-					    nr_segs, 
+					    nr_segs,
 					    ocfs2_direct_IO_get_blocks,
 					    ocfs2_dio_end_io);
 
@@ -1130,23 +1100,37 @@ out:
  */
 static int ocfs2_grab_pages_for_write(struct address_space *mapping,
 				      struct ocfs2_write_ctxt *wc,
-				      u32 cpos, loff_t user_pos, int new,
+				      u32 cpos, loff_t user_pos,
+				      unsigned user_len, int new,
 				      struct page *mmap_page)
 {
 	int ret = 0, i;
-	unsigned long start, target_index, index;
+	unsigned long start, target_index, end_index, index;
 	struct inode *inode = mapping->host;
+	loff_t last_byte;
 
 	target_index = user_pos >> PAGE_CACHE_SHIFT;
 
 	/*
 	 * Figure out how many pages we'll be manipulating here. For
 	 * non allocating write, we just change the one
-	 * page. Otherwise, we'll need a whole clusters worth.
+	 * page. Otherwise, we'll need a whole clusters worth.  If we're
+	 * writing past i_size, we only need enough pages to cover the
+	 * last page of the write.
 	 */
 	if (new) {
 		wc->w_num_pages = ocfs2_pages_per_cluster(inode->i_sb);
 		start = ocfs2_align_clusters_to_page_index(inode->i_sb, cpos);
+		/*
+		 * We need the index *past* the last page we could possibly
+		 * touch.  This is the page past the end of the write or
+		 * i_size, whichever is greater.
+		 */
+		last_byte = max(user_pos + user_len, i_size_read(inode));
+		BUG_ON(last_byte < 1);
+		end_index = ((last_byte - 1) >> PAGE_CACHE_SHIFT) + 1;
+		if ((start + wc->w_num_pages) > end_index)
+			wc->w_num_pages = end_index - start;
 	} else {
 		wc->w_num_pages = 1;
 		start = target_index;
@@ -1619,21 +1603,20 @@ out:
  * write path can treat it as an non-allocating write, which has no
  * special case code for sparse/nonsparse files.
  */
-static int ocfs2_expand_nonsparse_inode(struct inode *inode, loff_t pos,
-					unsigned len,
+static int ocfs2_expand_nonsparse_inode(struct inode *inode,
+					struct buffer_head *di_bh,
+					loff_t pos, unsigned len,
 					struct ocfs2_write_ctxt *wc)
 {
 	int ret;
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	loff_t newsize = pos + len;
 
-	if (ocfs2_sparse_alloc(osb))
-		return 0;
+	BUG_ON(ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)));
 
 	if (newsize <= i_size_read(inode))
 		return 0;
 
-	ret = ocfs2_extend_no_holes(inode, newsize, pos);
+	ret = ocfs2_extend_no_holes(inode, di_bh, newsize, pos);
 	if (ret)
 		mlog_errno(ret);
 
@@ -1643,6 +1626,18 @@ static int ocfs2_expand_nonsparse_inode(struct inode *inode, loff_t pos,
 	return ret;
 }
 
+static int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh,
+			   loff_t pos)
+{
+	int ret = 0;
+
+	BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)));
+	if (pos > i_size_read(inode))
+		ret = ocfs2_zero_extend(inode, di_bh, pos);
+
+	return ret;
+}
+
 int ocfs2_write_begin_nolock(struct address_space *mapping,
 			     loff_t pos, unsigned len, unsigned flags,
 			     struct page **pagep, void **fsdata,
@@ -1678,7 +1673,11 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
 		}
 	}
 
-	ret = ocfs2_expand_nonsparse_inode(inode, pos, len, wc);
+	if (ocfs2_sparse_alloc(osb))
+		ret = ocfs2_zero_tail(inode, di_bh, pos);
+	else
+		ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos, len,
+						   wc);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -1734,6 +1733,9 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
 			goto out;
 		}
 
+		if (data_ac)
+			data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
+
 		credits = ocfs2_calc_extend_credits(inode->i_sb,
 						    &di->id2.i_list,
 						    clusters_to_alloc);
@@ -1763,10 +1765,11 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
 
 	wc->w_handle = handle;
 
-	if (clusters_to_alloc && vfs_dq_alloc_space_nodirty(inode,
-			ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc))) {
-		ret = -EDQUOT;
-		goto out_commit;
+	if (clusters_to_alloc) {
+		ret = dquot_alloc_space_nodirty(inode,
+			ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc));
+		if (ret)
+			goto out_commit;
 	}
 	/*
 	 * We don't want this to fail in ocfs2_write_end(), so do it
@@ -1784,7 +1787,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
 	 * that we can zero and flush if we error after adding the
 	 * extent.
 	 */
-	ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos,
+	ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, len,
 					 cluster_of_pages, mmap_page);
 	if (ret) {
 		mlog_errno(ret);
@@ -1809,7 +1812,7 @@ success:
 	return 0;
 out_quota:
 	if (clusters_to_alloc)
-		vfs_dq_free_space(inode,
+		dquot_free_space(inode,
 			  ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc));
 out_commit:
 	ocfs2_commit_trans(osb, handle);
diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c
index b7428c5d0d3b..ec6d12339593 100644
--- a/fs/ocfs2/blockcheck.c
+++ b/fs/ocfs2/blockcheck.c
@@ -403,7 +403,7 @@ void ocfs2_block_check_compute(void *data, size_t blocksize,
 	 * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no
 	 * larger than 16 bits.
 	 */
-	BUG_ON(ecc > USHORT_MAX);
+	BUG_ON(ecc > USHRT_MAX);
 
 	bc->bc_crc32e = cpu_to_le32(crc);
 	bc->bc_ecc = cpu_to_le16((u16)ecc);
@@ -508,7 +508,7 @@ void ocfs2_block_check_compute_bhs(struct buffer_head **bhs, int nr,
 	 * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no
 	 * larger than 16 bits.
 	 */
-	BUG_ON(ecc > USHORT_MAX);
+	BUG_ON(ecc > USHRT_MAX);
 
 	bc->bc_crc32e = cpu_to_le32(crc);
 	bc->bc_ecc = cpu_to_le16((u16)ecc);
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index d43d34a1dd31..f9d5d3ffc75a 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -25,7 +25,6 @@
 
 #include <linux/fs.h>
 #include <linux/types.h>
-#include <linux/slab.h>
 #include <linux/highmem.h>
 
 #include <cluster/masklog.h>
@@ -368,7 +367,7 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
 	}
 	ocfs2_metadata_cache_io_unlock(ci);
 
-	mlog(ML_BH_IO, "block=(%llu), nr=(%d), cached=%s, flags=0x%x\n", 
+	mlog(ML_BH_IO, "block=(%llu), nr=(%d), cached=%s, flags=0x%x\n",
 	     (unsigned long long)block, nr,
 	     ((flags & OCFS2_BH_IGNORE_CACHE) || ignore_cache) ? "no" : "yes",
 	     flags);
@@ -407,6 +406,7 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
 				struct buffer_head *bh)
 {
 	int ret = 0;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
 
 	mlog_entry_void();
 
@@ -426,6 +426,7 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
 
 	get_bh(bh); /* for end_buffer_write_sync() */
 	bh->b_end_io = end_buffer_write_sync;
+	ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &di->i_check);
 	submit_bh(WRITE, bh);
 
 	wait_on_buffer(bh);
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index eda5b8bcddd5..41d5f1f92d56 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -34,6 +34,7 @@
 #include <linux/crc32.h>
 #include <linux/time.h>
 #include <linux/debugfs.h>
+#include <linux/slab.h>
 
 #include "heartbeat.h"
 #include "tcp.h"
@@ -78,7 +79,7 @@ static struct o2hb_callback *hbcall_from_type(enum o2hb_callback_type type);
 
 unsigned int o2hb_dead_threshold = O2HB_DEFAULT_DEAD_THRESHOLD;
 
-/* Only sets a new threshold if there are no active regions. 
+/* Only sets a new threshold if there are no active regions.
  *
  * No locking or otherwise interesting code is required for reading
  * o2hb_dead_threshold as it can't change once regions are active and
@@ -170,7 +171,7 @@ static void o2hb_write_timeout(struct work_struct *work)
 
 	mlog(ML_ERROR, "Heartbeat write timeout to device %s after %u "
 	     "milliseconds\n", reg->hr_dev_name,
-	     jiffies_to_msecs(jiffies - reg->hr_last_timeout_start)); 
+	     jiffies_to_msecs(jiffies - reg->hr_last_timeout_start));
 	o2quo_disk_timeout();
 }
 
@@ -624,7 +625,7 @@ static int o2hb_check_slot(struct o2hb_region *reg,
 	     "seq %llu last %llu changed %u equal %u\n",
 	     slot->ds_node_num, (long long)slot->ds_last_generation,
 	     le32_to_cpu(hb_block->hb_cksum),
-	     (unsigned long long)le64_to_cpu(hb_block->hb_seq), 
+	     (unsigned long long)le64_to_cpu(hb_block->hb_seq),
 	     (unsigned long long)slot->ds_last_time, slot->ds_changed_samples,
 	     slot->ds_equal_samples);
 
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index 1cd2934de615..c7fba396392d 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -112,9 +112,11 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
 	define_mask(XATTR),
 	define_mask(QUOTA),
 	define_mask(REFCOUNT),
+	define_mask(BASTS),
 	define_mask(ERROR),
 	define_mask(NOTICE),
 	define_mask(KTHREAD),
+	define_mask(RESERVATIONS),
 };
 
 static struct attribute *mlog_attr_ptrs[MLOG_MAX_BITS] = {NULL, };
@@ -135,7 +137,7 @@ static ssize_t mlog_store(struct kobject *obj, struct attribute *attr,
 	return mlog_mask_store(mlog_attr->mask, buf, count);
 }
 
-static struct sysfs_ops mlog_attr_ops = {
+static const struct sysfs_ops mlog_attr_ops = {
 	.show  = mlog_show,
 	.store = mlog_store,
 };
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index 9b4d11726cf2..fd96e2a2fa56 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -114,10 +114,12 @@
 #define ML_XATTR	0x0000000020000000ULL /* ocfs2 extended attributes */
 #define ML_QUOTA	0x0000000040000000ULL /* ocfs2 quota operations */
 #define ML_REFCOUNT	0x0000000080000000ULL /* refcount tree operations */
+#define ML_BASTS	0x0000001000000000ULL /* dlmglue asts and basts */
 /* bits that are infrequently given and frequently matched in the high word */
 #define ML_ERROR	0x0000000100000000ULL /* sent to KERN_ERR */
 #define ML_NOTICE	0x0000000200000000ULL /* setn to KERN_NOTICE */
 #define ML_KTHREAD	0x0000000400000000ULL /* kernel thread activity */
+#define	ML_RESERVATIONS	0x0000000800000000ULL /* ocfs2 alloc reservations */
 
 #define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE)
 #define MLOG_INITIAL_NOT_MASK (ML_ENTRY|ML_EXIT)
@@ -194,9 +196,9 @@ extern struct mlog_bits mlog_and_bits, mlog_not_bits;
  * previous token if args expands to nothing.
  */
 #define __mlog_printk(level, fmt, args...)				\
-	printk(level "(%u,%lu):%s:%d " fmt, task_pid_nr(current),	\
-	       __mlog_cpu_guess, __PRETTY_FUNCTION__, __LINE__ ,	\
-	       ##args)
+	printk(level "(%s,%u,%lu):%s:%d " fmt, current->comm,		\
+	       task_pid_nr(current), __mlog_cpu_guess,			\
+	       __PRETTY_FUNCTION__, __LINE__ , ##args)
 
 #define mlog(mask, fmt, args...) do {					\
 	u64 __m = MLOG_MASK_PREFIX | (mask);				\
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index c81142e3ef84..ed0c9f367fed 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -19,6 +19,7 @@
  * Boston, MA 021110-1307, USA.
  */
 
+#include <linux/slab.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/configfs.h>
diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c
index 639024033fce..cf3e16696216 100644
--- a/fs/ocfs2/cluster/quorum.c
+++ b/fs/ocfs2/cluster/quorum.c
@@ -44,7 +44,6 @@
  * and if they're the last, they fire off the decision.
  */
 #include <linux/kernel.h>
-#include <linux/slab.h>
 #include <linux/workqueue.h>
 #include <linux/reboot.h>
 
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 334f231a422c..aa75ca3f78da 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -72,9 +72,9 @@
 
 #include "tcp_internal.h"
 
-#define SC_NODEF_FMT "node %s (num %u) at %u.%u.%u.%u:%u"
+#define SC_NODEF_FMT "node %s (num %u) at %pI4:%u"
 #define SC_NODEF_ARGS(sc) sc->sc_node->nd_name, sc->sc_node->nd_num,	\
-			  NIPQUAD(sc->sc_node->nd_ipv4_address),	\
+			  &sc->sc_node->nd_ipv4_address,		\
 			  ntohs(sc->sc_node->nd_ipv4_port)
 
 /*
@@ -485,7 +485,7 @@ static void o2net_set_nn_state(struct o2net_node *nn,
 	}
 
 	if (was_valid && !valid) {
-		printk(KERN_INFO "o2net: no longer connected to "
+		printk(KERN_NOTICE "o2net: no longer connected to "
 		       SC_NODEF_FMT "\n", SC_NODEF_ARGS(old_sc));
 		o2net_complete_nodes_nsw(nn);
 	}
@@ -493,7 +493,7 @@ static void o2net_set_nn_state(struct o2net_node *nn,
 	if (!was_valid && valid) {
 		o2quo_conn_up(o2net_num_from_nn(nn));
 		cancel_delayed_work(&nn->nn_connect_expired);
-		printk(KERN_INFO "o2net: %s " SC_NODEF_FMT "\n",
+		printk(KERN_NOTICE "o2net: %s " SC_NODEF_FMT "\n",
 		       o2nm_this_node() > sc->sc_node->nd_num ?
 		       		"connected to" : "accepted connection from",
 		       SC_NODEF_ARGS(sc));
@@ -583,6 +583,9 @@ static void o2net_state_change(struct sock *sk)
 			o2net_sc_queue_work(sc, &sc->sc_connect_work);
 			break;
 		default:
+			printk(KERN_INFO "o2net: connection to " SC_NODEF_FMT
+			      " shutdown, state %d\n",
+			      SC_NODEF_ARGS(sc), sk->sk_state);
 			o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
 			break;
 	}
@@ -930,7 +933,7 @@ static void o2net_sendpage(struct o2net_sock_container *sc,
 			cond_resched();
 			continue;
 		}
-		mlog(ML_ERROR, "sendpage of size %zu to " SC_NODEF_FMT 
+		mlog(ML_ERROR, "sendpage of size %zu to " SC_NODEF_FMT
 		     " failed with %zd\n", size, SC_NODEF_ARGS(sc), ret);
 		o2net_ensure_shutdown(nn, sc, 0);
 		break;
@@ -1476,14 +1479,14 @@ static void o2net_idle_timer(unsigned long data)
 
 	do_gettimeofday(&now);
 
-	printk(KERN_INFO "o2net: connection to " SC_NODEF_FMT " has been idle for %u.%u "
+	printk(KERN_NOTICE "o2net: connection to " SC_NODEF_FMT " has been idle for %u.%u "
 	     "seconds, shutting it down.\n", SC_NODEF_ARGS(sc),
 		     o2net_idle_timeout() / 1000,
 		     o2net_idle_timeout() % 1000);
 	mlog(ML_NOTICE, "here are some times that might help debug the "
 	     "situation: (tmr %ld.%ld now %ld.%ld dr %ld.%ld adv "
 	     "%ld.%ld:%ld.%ld func (%08x:%u) %ld.%ld:%ld.%ld)\n",
-	     sc->sc_tv_timer.tv_sec, (long) sc->sc_tv_timer.tv_usec, 
+	     sc->sc_tv_timer.tv_sec, (long) sc->sc_tv_timer.tv_usec,
 	     now.tv_sec, (long) now.tv_usec,
 	     sc->sc_tv_data_ready.tv_sec, (long) sc->sc_tv_data_ready.tv_usec,
 	     sc->sc_tv_advance_start.tv_sec,
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index 8d58cfe410b1..96fa7ebc530c 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -32,10 +32,10 @@
  * on their number */
 #define O2NET_QUORUM_DELAY_MS	((o2hb_dead_threshold + 2) * O2HB_REGION_TIMEOUT_MS)
 
-/* 
+/*
  * This version number represents quite a lot, unfortunately.  It not
  * only represents the raw network message protocol on the wire but also
- * locking semantics of the file system using the protocol.  It should 
+ * locking semantics of the file system using the protocol.  It should
  * be somewhere else, I'm sure, but right now it isn't.
  *
  * With version 11, we separate out the filesystem locking portion.  The
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 28c3ec238796..f04ebcfffc4a 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -1194,7 +1194,7 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
 			else
 				de->inode = 0;
 			dir->i_version++;
-			status = ocfs2_journal_dirty(handle, bh);
+			ocfs2_journal_dirty(handle, bh);
 			goto bail;
 		}
 		i += le16_to_cpu(de->rec_len);
@@ -1752,7 +1752,7 @@ int __ocfs2_add_entry(handle_t *handle,
 				ocfs2_recalc_free_list(dir, handle, lookup);
 
 			dir->i_version++;
-			status = ocfs2_journal_dirty(handle, insert_bh);
+			ocfs2_journal_dirty(handle, insert_bh);
 			retval = 0;
 			goto bail;
 		}
@@ -2297,12 +2297,7 @@ static int ocfs2_fill_new_dir_id(struct ocfs2_super *osb,
 	}
 
 	ocfs2_fill_initial_dirents(inode, parent, data->id_data, size);
-
 	ocfs2_journal_dirty(handle, di_bh);
-	if (ret) {
-		mlog_errno(ret);
-		goto out;
-	}
 
 	i_size_write(inode, size);
 	inode->i_nlink = 2;
@@ -2366,11 +2361,7 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
 		ocfs2_init_dir_trailer(inode, new_bh, size);
 	}
 
-	status = ocfs2_journal_dirty(handle, new_bh);
-	if (status < 0) {
-		mlog_errno(status);
-		goto bail;
-	}
+	ocfs2_journal_dirty(handle, new_bh);
 
 	i_size_write(inode, inode->i_sb->s_blocksize);
 	inode->i_nlink = 2;
@@ -2404,15 +2395,15 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
 	int ret;
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
 	u16 dr_suballoc_bit;
-	u64 dr_blkno;
+	u64 suballoc_loc, dr_blkno;
 	unsigned int num_bits;
 	struct buffer_head *dx_root_bh = NULL;
 	struct ocfs2_dx_root_block *dx_root;
 	struct ocfs2_dir_block_trailer *trailer =
 		ocfs2_trailer_from_bh(dirdata_bh, dir->i_sb);
 
-	ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1, &dr_suballoc_bit,
-				   &num_bits, &dr_blkno);
+	ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
+				   &dr_suballoc_bit, &num_bits, &dr_blkno);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -2439,7 +2430,8 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
 	dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
 	memset(dx_root, 0, osb->sb->s_blocksize);
 	strcpy(dx_root->dr_signature, OCFS2_DX_ROOT_SIGNATURE);
-	dx_root->dr_suballoc_slot = cpu_to_le16(osb->slot_num);
+	dx_root->dr_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
+	dx_root->dr_suballoc_loc = cpu_to_le64(suballoc_loc);
 	dx_root->dr_suballoc_bit = cpu_to_le16(dr_suballoc_bit);
 	dx_root->dr_fs_generation = cpu_to_le32(osb->fs_generation);
 	dx_root->dr_blkno = cpu_to_le64(dr_blkno);
@@ -2458,10 +2450,7 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
 		dx_root->dr_list.l_count =
 			cpu_to_le16(ocfs2_extent_recs_per_dx_root(osb->sb));
 	}
-
-	ret = ocfs2_journal_dirty(handle, dx_root_bh);
-	if (ret)
-		mlog_errno(ret);
+	ocfs2_journal_dirty(handle, dx_root_bh);
 
 	ret = ocfs2_journal_access_di(handle, INODE_CACHE(dir), di_bh,
 				      OCFS2_JOURNAL_ACCESS_CREATE);
@@ -2475,9 +2464,7 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
 	OCFS2_I(dir)->ip_dyn_features |= OCFS2_INDEXED_DIR_FL;
 	di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features);
 
-	ret = ocfs2_journal_dirty(handle, di_bh);
-	if (ret)
-		mlog_errno(ret);
+	ocfs2_journal_dirty(handle, di_bh);
 
 	*ret_dx_root_bh = dx_root_bh;
 	dx_root_bh = NULL;
@@ -2558,7 +2545,7 @@ static int __ocfs2_dx_dir_new_cluster(struct inode *dir,
 	 * chance of contiguousness as the directory grows in number
 	 * of entries.
 	 */
-	ret = __ocfs2_claim_clusters(osb, handle, data_ac, 1, 1, &phys, &num);
+	ret = __ocfs2_claim_clusters(handle, data_ac, 1, 1, &phys, &num);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -2964,12 +2951,10 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 		goto out;
 	}
 
-	if (vfs_dq_alloc_space_nodirty(dir,
-				ocfs2_clusters_to_bytes(osb->sb,
-							alloc + dx_alloc))) {
-		ret = -EDQUOT;
+	ret = dquot_alloc_space_nodirty(dir,
+		ocfs2_clusters_to_bytes(osb->sb, alloc + dx_alloc));
+	if (ret)
 		goto out_commit;
-	}
 	did_quota = 1;
 
 	if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) {
@@ -2993,7 +2978,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 	 * if we only get one now, that's enough to continue. The rest
 	 * will be claimed after the conversion to extents.
 	 */
-	ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, &len);
+	if (ocfs2_dir_resv_allowed(osb))
+		data_ac->ac_resv = &oi->ip_la_data_resv;
+	ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off, &len);
 	if (ret) {
 		mlog_errno(ret);
 		goto out_commit;
@@ -3036,11 +3023,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 		ocfs2_init_dir_trailer(dir, dirdata_bh, i);
 	}
 
-	ret = ocfs2_journal_dirty(handle, dirdata_bh);
-	if (ret) {
-		mlog_errno(ret);
-		goto out_commit;
-	}
+	ocfs2_journal_dirty(handle, dirdata_bh);
 
 	if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) {
 		/*
@@ -3106,11 +3089,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 	 */
 	dir->i_blocks = ocfs2_inode_sector_count(dir);
 
-	ret = ocfs2_journal_dirty(handle, di_bh);
-	if (ret) {
-		mlog_errno(ret);
-		goto out_commit;
-	}
+	ocfs2_journal_dirty(handle, di_bh);
 
 	if (ocfs2_supports_indexed_dirs(osb)) {
 		ret = ocfs2_dx_dir_attach_index(osb, handle, dir, di_bh,
@@ -3140,7 +3119,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 	 * pass. Claim the 2nd cluster as a separate extent.
 	 */
 	if (alloc > len) {
-		ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off,
+		ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off,
 					   &len);
 		if (ret) {
 			mlog_errno(ret);
@@ -3178,7 +3157,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 
 out_commit:
 	if (ret < 0 && did_quota)
-		vfs_dq_free_space_nodirty(dir, bytes_allocated);
+		dquot_free_space_nodirty(dir, bytes_allocated);
 
 	ocfs2_commit_trans(osb, handle);
 
@@ -3221,11 +3200,10 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
 	if (extend) {
 		u32 offset = OCFS2_I(dir)->ip_clusters;
 
-		if (vfs_dq_alloc_space_nodirty(dir,
-					ocfs2_clusters_to_bytes(sb, 1))) {
-			status = -EDQUOT;
+		status = dquot_alloc_space_nodirty(dir,
+					ocfs2_clusters_to_bytes(sb, 1));
+		if (status)
 			goto bail;
-		}
 		did_quota = 1;
 
 		status = ocfs2_add_inode_data(OCFS2_SB(sb), dir, &offset,
@@ -3254,7 +3232,7 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
 	status = 0;
 bail:
 	if (did_quota && status < 0)
-		vfs_dq_free_space_nodirty(dir, ocfs2_clusters_to_bytes(sb, 1));
+		dquot_free_space_nodirty(dir, ocfs2_clusters_to_bytes(sb, 1));
 	mlog_exit(status);
 	return status;
 }
@@ -3372,6 +3350,9 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
 			goto bail;
 		}
 
+		if (ocfs2_dir_resv_allowed(osb))
+			data_ac->ac_resv = &OCFS2_I(dir)->ip_la_data_resv;
+
 		credits = ocfs2_calc_extend_credits(sb, el, 1);
 	} else {
 		spin_unlock(&OCFS2_I(dir)->ip_lock);
@@ -3426,11 +3407,7 @@ do_extend:
 	} else {
 		de->rec_len = cpu_to_le16(sb->s_blocksize);
 	}
-	status = ocfs2_journal_dirty(handle, new_bh);
-	if (status < 0) {
-		mlog_errno(status);
-		goto bail;
-	}
+	ocfs2_journal_dirty(handle, new_bh);
 
 	dir_i_size += dir->i_sb->s_blocksize;
 	i_size_write(dir, dir_i_size);
@@ -3889,11 +3866,10 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
 		goto out;
 	}
 
-	if (vfs_dq_alloc_space_nodirty(dir,
-				       ocfs2_clusters_to_bytes(dir->i_sb, 1))) {
-		ret = -EDQUOT;
+	ret = dquot_alloc_space_nodirty(dir,
+				       ocfs2_clusters_to_bytes(dir->i_sb, 1));
+	if (ret)
 		goto out_commit;
-	}
 	did_quota = 1;
 
 	ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), dx_leaf_bh,
@@ -3910,11 +3886,7 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
 	     sizeof(struct ocfs2_dx_entry), dx_leaf_sort_cmp,
 	     dx_leaf_sort_swap);
 
-	ret = ocfs2_journal_dirty(handle, dx_leaf_bh);
-	if (ret) {
-		mlog_errno(ret);
-		goto out_commit;
-	}
+	ocfs2_journal_dirty(handle, dx_leaf_bh);
 
 	ret = ocfs2_dx_dir_find_leaf_split(dx_leaf, leaf_cpos, insert_hash,
 					   &split_hash);
@@ -3983,7 +3955,7 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
 
 out_commit:
 	if (ret < 0 && did_quota)
-		vfs_dq_free_space_nodirty(dir,
+		dquot_free_space_nodirty(dir,
 				ocfs2_clusters_to_bytes(dir->i_sb, 1));
 
 	ocfs2_commit_trans(osb, handle);
@@ -4165,11 +4137,10 @@ static int ocfs2_expand_inline_dx_root(struct inode *dir,
 		goto out;
 	}
 
-	if (vfs_dq_alloc_space_nodirty(dir,
-				       ocfs2_clusters_to_bytes(osb->sb, 1))) {
-		ret = -EDQUOT;
+	ret = dquot_alloc_space_nodirty(dir,
+				       ocfs2_clusters_to_bytes(osb->sb, 1));
+	if (ret)
 		goto out_commit;
-	}
 	did_quota = 1;
 
 	/*
@@ -4229,7 +4200,7 @@ static int ocfs2_expand_inline_dx_root(struct inode *dir,
 
 out_commit:
 	if (ret < 0 && did_quota)
-		vfs_dq_free_space_nodirty(dir,
+		dquot_free_space_nodirty(dir,
 					  ocfs2_clusters_to_bytes(dir->i_sb, 1));
 
 	ocfs2_commit_trans(osb, handle);
@@ -4495,7 +4466,10 @@ static int ocfs2_dx_dir_remove_index(struct inode *dir,
 
 	blk = le64_to_cpu(dx_root->dr_blkno);
 	bit = le16_to_cpu(dx_root->dr_suballoc_bit);
-	bg_blkno = ocfs2_which_suballoc_group(blk, bit);
+	if (dx_root->dr_suballoc_loc)
+		bg_blkno = le64_to_cpu(dx_root->dr_suballoc_loc);
+	else
+		bg_blkno = ocfs2_which_suballoc_group(blk, bit);
 	ret = ocfs2_free_suballoc_bits(handle, dx_alloc_inode, dx_alloc_bh,
 				       bit, bg_blkno, 1);
 	if (ret)
@@ -4556,8 +4530,8 @@ int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh)
 
 		p_cpos = ocfs2_blocks_to_clusters(dir->i_sb, blkno);
 
-		ret = ocfs2_remove_btree_range(dir, &et, cpos, p_cpos, clen,
-					       &dealloc);
+		ret = ocfs2_remove_btree_range(dir, &et, cpos, p_cpos, clen, 0,
+					       &dealloc, 0);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
diff --git a/fs/ocfs2/dlm/Makefile b/fs/ocfs2/dlm/Makefile
index 190361375700..dcebf0d920fa 100644
--- a/fs/ocfs2/dlm/Makefile
+++ b/fs/ocfs2/dlm/Makefile
@@ -1,8 +1,7 @@
 EXTRA_CFLAGS += -Ifs/ocfs2
 
-obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o ocfs2_dlmfs.o
+obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o
 
 ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \
 	dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o dlmver.o
 
-ocfs2_dlmfs-objs := userdlm.o dlmfs.o dlmfsver.o
diff --git a/fs/ocfs2/dlm/dlmapi.h b/fs/ocfs2/dlm/dlmapi.h
index b5786a787fab..3cfa114aa391 100644
--- a/fs/ocfs2/dlm/dlmapi.h
+++ b/fs/ocfs2/dlm/dlmapi.h
@@ -95,7 +95,7 @@ const char *dlm_errname(enum dlm_status err);
 		mlog(ML_ERROR, "dlm status = %s\n", dlm_errname((st)));	\
 } while (0)
 
-#define DLM_LKSB_UNUSED1           0x01  
+#define DLM_LKSB_UNUSED1           0x01
 #define DLM_LKSB_PUT_LVB           0x02
 #define DLM_LKSB_GET_LVB           0x04
 #define DLM_LKSB_UNUSED2           0x08
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index 01cf8cc3d286..f44999156839 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -28,7 +28,6 @@
 #include <linux/module.h>
 #include <linux/fs.h>
 #include <linux/types.h>
-#include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/init.h>
 #include <linux/sysctl.h>
@@ -89,7 +88,7 @@ static int dlm_should_cancel_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
 	return 0;
 }
 
-static void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
+void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
 {
 	mlog_entry_void();
 
@@ -123,7 +122,7 @@ static void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
 		dlm_lock_put(lock);
 		/* free up the reserved bast that we are cancelling.
 		 * guaranteed that this will not be the last reserved
-		 * ast because *both* an ast and a bast were reserved 
+		 * ast because *both* an ast and a bast were reserved
 		 * to get to this point.  the res->spinlock will not be
 		 * taken here */
 		dlm_lockres_release_ast(dlm, res);
@@ -146,7 +145,7 @@ void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
 }
 
 
-static void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
+void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
 {
 	mlog_entry_void();
 
@@ -185,9 +184,8 @@ static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
 	BUG_ON(!lksb);
 
 	/* only updates if this node masters the lockres */
+	spin_lock(&res->spinlock);
 	if (res->owner == dlm->node_num) {
-
-		spin_lock(&res->spinlock);
 		/* check the lksb flags for the direction */
 		if (lksb->flags & DLM_LKSB_GET_LVB) {
 			mlog(0, "getting lvb from lockres for %s node\n",
@@ -202,8 +200,8 @@ static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
  		 * here. In the future we might want to clear it at the time
  		 * the put is actually done.
 		 */
-		spin_unlock(&res->spinlock);
 	}
+	spin_unlock(&res->spinlock);
 
 	/* reset any lvb flags on the lksb */
 	lksb->flags &= ~(DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB);
@@ -453,7 +451,9 @@ int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
 	ret = o2net_send_message_vec(DLM_PROXY_AST_MSG, dlm->key, vec, veclen,
 				     lock->ml.node, &status);
 	if (ret < 0)
-		mlog_errno(ret);
+		mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
+		     "node %u\n", ret, DLM_PROXY_AST_MSG, dlm->key,
+		     lock->ml.node);
 	else {
 		if (status == DLM_RECOVERING) {
 			mlog(ML_ERROR, "sent AST to node %u, it thinks this "
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 0102be35980c..4b6ae2c13b47 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -37,7 +37,7 @@
 #define DLM_THREAD_SHUFFLE_INTERVAL    5     // flush everything every 5 passes
 #define DLM_THREAD_MS                  200   // flush at least every 200 ms
 
-#define DLM_HASH_SIZE_DEFAULT	(1 << 14)
+#define DLM_HASH_SIZE_DEFAULT	(1 << 17)
 #if DLM_HASH_SIZE_DEFAULT < PAGE_SIZE
 # define DLM_HASH_PAGES		1
 #else
@@ -904,6 +904,8 @@ void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
 
 void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
 void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
+void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
+void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
 void dlm_do_local_ast(struct dlm_ctxt *dlm,
 		      struct dlm_lock_resource *res,
 		      struct dlm_lock *lock);
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index ca96bce50e18..9f30491e5e88 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -28,7 +28,6 @@
 #include <linux/module.h>
 #include <linux/fs.h>
 #include <linux/types.h>
-#include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/init.h>
 #include <linux/sysctl.h>
@@ -391,12 +390,14 @@ static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm,
 		} else if (ret != DLM_NORMAL && ret != DLM_NOTQUEUED)
 			dlm_error(ret);
 	} else {
-		mlog_errno(tmpret);
+		mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
+		     "node %u\n", tmpret, DLM_CONVERT_LOCK_MSG, dlm->key,
+		     res->owner);
 		if (dlm_is_host_down(tmpret)) {
 			/* instead of logging the same network error over
 			 * and over, sleep here and wait for the heartbeat
 			 * to notice the node is dead.  times out after 5s. */
-			dlm_wait_for_node_death(dlm, res->owner, 
+			dlm_wait_for_node_death(dlm, res->owner,
 						DLM_NODE_DEATH_WAIT_MAX);
 			ret = DLM_RECOVERING;
 			mlog(0, "node %u died so returning DLM_RECOVERING "
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 42b0bad7a612..0cd24cf54396 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -102,7 +102,7 @@ void __dlm_print_one_lock_resource(struct dlm_lock_resource *res)
 	assert_spin_locked(&res->spinlock);
 
 	stringify_lockname(res->lockname.name, res->lockname.len,
-			   buf, sizeof(buf) - 1);
+			   buf, sizeof(buf));
 	printk("lockres: %s, owner=%u, state=%u\n",
 	       buf, res->owner, res->state);
 	printk("  last used: %lu, refcnt: %u, on purge list: %s\n",
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 0334000676d3..153abb5abef0 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -511,7 +511,7 @@ static void __dlm_print_nodes(struct dlm_ctxt *dlm)
 
 	assert_spin_locked(&dlm->spinlock);
 
-	printk(KERN_INFO "ocfs2_dlm: Nodes in domain (\"%s\"): ", dlm->name);
+	printk(KERN_NOTICE "o2dlm: Nodes in domain %s: ", dlm->name);
 
 	while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES,
 				     node + 1)) < O2NM_MAX_NODES) {
@@ -534,7 +534,7 @@ static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
 
 	node = exit_msg->node_idx;
 
-	printk(KERN_INFO "ocfs2_dlm: Node %u leaves domain %s\n", node, dlm->name);
+	printk(KERN_NOTICE "o2dlm: Node %u leaves domain %s\n", node, dlm->name);
 
 	spin_lock(&dlm->spinlock);
 	clear_bit(node, dlm->domain_map);
@@ -565,7 +565,9 @@ static int dlm_send_one_domain_exit(struct dlm_ctxt *dlm,
 	status = o2net_send_message(DLM_EXIT_DOMAIN_MSG, dlm->key,
 				    &leave_msg, sizeof(leave_msg), node,
 				    NULL);
-
+	if (status < 0)
+		mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
+		     "node %u\n", status, DLM_EXIT_DOMAIN_MSG, dlm->key, node);
 	mlog(0, "status return %d from o2net_send_message\n", status);
 
 	return status;
@@ -816,7 +818,7 @@ static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data,
 	}
 
 	/* Once the dlm ctxt is marked as leaving then we don't want
-	 * to be put in someone's domain map. 
+	 * to be put in someone's domain map.
 	 * Also, explicitly disallow joining at certain troublesome
 	 * times (ie. during recovery). */
 	if (dlm && dlm->dlm_state != DLM_CTXT_LEAVING) {
@@ -904,7 +906,7 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data,
 		set_bit(assert->node_idx, dlm->domain_map);
 		__dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
 
-		printk(KERN_INFO "ocfs2_dlm: Node %u joins domain %s\n",
+		printk(KERN_NOTICE "o2dlm: Node %u joins domain %s\n",
 		       assert->node_idx, dlm->name);
 		__dlm_print_nodes(dlm);
 
@@ -962,7 +964,9 @@ static int dlm_send_one_join_cancel(struct dlm_ctxt *dlm,
 				    &cancel_msg, sizeof(cancel_msg), node,
 				    NULL);
 	if (status < 0) {
-		mlog_errno(status);
+		mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
+		     "node %u\n", status, DLM_CANCEL_JOIN_MSG, DLM_MOD_KEY,
+		     node);
 		goto bail;
 	}
 
@@ -1029,10 +1033,11 @@ static int dlm_request_join(struct dlm_ctxt *dlm,
 	byte_copymap(join_msg.node_map, dlm->live_nodes_map, O2NM_MAX_NODES);
 
 	status = o2net_send_message(DLM_QUERY_JOIN_MSG, DLM_MOD_KEY, &join_msg,
-				    sizeof(join_msg), node,
-				    &join_resp);
+				    sizeof(join_msg), node, &join_resp);
 	if (status < 0 && status != -ENOPROTOOPT) {
-		mlog_errno(status);
+		mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
+		     "node %u\n", status, DLM_QUERY_JOIN_MSG, DLM_MOD_KEY,
+		     node);
 		goto bail;
 	}
 	dlm_query_join_wire_to_packet(join_resp, &packet);
@@ -1103,7 +1108,9 @@ static int dlm_send_one_join_assert(struct dlm_ctxt *dlm,
 				    &assert_msg, sizeof(assert_msg), node,
 				    NULL);
 	if (status < 0)
-		mlog_errno(status);
+		mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
+		     "node %u\n", status, DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY,
+		     node);
 
 	return status;
 }
@@ -1516,7 +1523,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
 		goto leave;
 	}
 
-	dlm->name = kmalloc(strlen(domain) + 1, GFP_KERNEL);
+	dlm->name = kstrdup(domain, GFP_KERNEL);
 	if (dlm->name == NULL) {
 		mlog_errno(-ENOMEM);
 		kfree(dlm);
@@ -1550,7 +1557,6 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
 	for (i = 0; i < DLM_HASH_BUCKETS; i++)
 		INIT_HLIST_HEAD(dlm_master_hash(dlm, i));
 
-	strcpy(dlm->name, domain);
 	dlm->key = key;
 	dlm->node_num = o2nm_this_node();
 
@@ -1665,7 +1671,7 @@ struct dlm_ctxt * dlm_register_domain(const char *domain,
 	struct dlm_ctxt *dlm = NULL;
 	struct dlm_ctxt *new_ctxt = NULL;
 
-	if (strlen(domain) > O2NM_MAX_NAME_LEN) {
+	if (strlen(domain) >= O2NM_MAX_NAME_LEN) {
 		ret = -ENAMETOOLONG;
 		mlog(ML_ERROR, "domain name length too long\n");
 		goto leave;
@@ -1703,6 +1709,7 @@ retry:
 		}
 
 		if (dlm_protocol_compare(&dlm->fs_locking_proto, fs_proto)) {
+			spin_unlock(&dlm_domain_lock);
 			mlog(ML_ERROR,
 			     "Requested locking protocol version is not "
 			     "compatible with already registered domain "
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 437698e9465f..69cf369961c4 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -269,7 +269,7 @@ static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm,
 		}
 		dlm_revert_pending_lock(res, lock);
 		dlm_lock_put(lock);
-	} else if (dlm_is_recovery_lock(res->lockname.name, 
+	} else if (dlm_is_recovery_lock(res->lockname.name,
 					res->lockname.len)) {
 		/* special case for the $RECOVERY lock.
 		 * there will never be an AST delivered to put
@@ -329,7 +329,9 @@ static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm,
 			BUG();
 		}
 	} else {
-		mlog_errno(tmpret);
+		mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
+		     "node %u\n", tmpret, DLM_CREATE_LOCK_MSG, dlm->key,
+		     res->owner);
 		if (dlm_is_host_down(tmpret)) {
 			ret = DLM_RECOVERING;
 			mlog(0, "node %u died so returning DLM_RECOVERING "
@@ -429,7 +431,7 @@ struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie,
 	struct dlm_lock *lock;
 	int kernel_allocated = 0;
 
-	lock = (struct dlm_lock *) kmem_cache_zalloc(dlm_lock_cache, GFP_NOFS);
+	lock = kmem_cache_zalloc(dlm_lock_cache, GFP_NOFS);
 	if (!lock)
 		return NULL;
 
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 03ccf9a7b1f4..94b97fc6a88e 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -366,7 +366,7 @@ void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up)
 	struct dlm_master_list_entry *mle;
 
 	assert_spin_locked(&dlm->spinlock);
-	
+
 	list_for_each_entry(mle, &dlm->mle_hb_events, hb_events) {
 		if (node_up)
 			dlm_mle_node_up(dlm, mle, NULL, idx);
@@ -617,13 +617,11 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
 {
 	struct dlm_lock_resource *res = NULL;
 
-	res = (struct dlm_lock_resource *)
-				kmem_cache_zalloc(dlm_lockres_cache, GFP_NOFS);
+	res = kmem_cache_zalloc(dlm_lockres_cache, GFP_NOFS);
 	if (!res)
 		goto error;
 
-	res->lockname.name = (char *)
-				kmem_cache_zalloc(dlm_lockname_cache, GFP_NOFS);
+	res->lockname.name = kmem_cache_zalloc(dlm_lockname_cache, GFP_NOFS);
 	if (!res->lockname.name)
 		goto error;
 
@@ -757,8 +755,7 @@ lookup:
 		spin_unlock(&dlm->spinlock);
 		mlog(0, "allocating a new resource\n");
 		/* nothing found and we need to allocate one. */
-		alloc_mle = (struct dlm_master_list_entry *)
-			kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
+		alloc_mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
 		if (!alloc_mle)
 			goto leave;
 		res = dlm_new_lockres(dlm, lockid, namelen);
@@ -833,7 +830,7 @@ lookup:
 		__dlm_insert_mle(dlm, mle);
 
 		/* still holding the dlm spinlock, check the recovery map
-		 * to see if there are any nodes that still need to be 
+		 * to see if there are any nodes that still need to be
 		 * considered.  these will not appear in the mle nodemap
 		 * but they might own this lockres.  wait on them. */
 		bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
@@ -883,7 +880,7 @@ redo_request:
 				msleep(500);
 			}
 			continue;
-		} 
+		}
 
 		dlm_kick_recovery_thread(dlm);
 		msleep(1000);
@@ -939,8 +936,8 @@ wait:
 		     res->lockname.name, blocked);
 		if (++tries > 20) {
 			mlog(ML_ERROR, "%s:%.*s: spinning on "
-			     "dlm_wait_for_lock_mastery, blocked=%d\n", 
-			     dlm->name, res->lockname.len, 
+			     "dlm_wait_for_lock_mastery, blocked=%d\n",
+			     dlm->name, res->lockname.len,
 			     res->lockname.name, blocked);
 			dlm_print_one_lock_resource(res);
 			dlm_print_one_mle(mle);
@@ -1029,7 +1026,7 @@ recheck:
 		ret = dlm_restart_lock_mastery(dlm, res, mle, *blocked);
 		b = (mle->type == DLM_MLE_BLOCK);
 		if ((*blocked && !b) || (!*blocked && b)) {
-			mlog(0, "%s:%.*s: status change: old=%d new=%d\n", 
+			mlog(0, "%s:%.*s: status change: old=%d new=%d\n",
 			     dlm->name, res->lockname.len, res->lockname.name,
 			     *blocked, b);
 			*blocked = b;
@@ -1542,8 +1539,7 @@ way_up_top:
 			spin_unlock(&dlm->master_lock);
 			spin_unlock(&dlm->spinlock);
 
-			mle = (struct dlm_master_list_entry *)
-				kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
+			mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
 			if (!mle) {
 				response = DLM_MASTER_RESP_ERROR;
 				mlog_errno(-ENOMEM);
@@ -1602,7 +1598,7 @@ send_response:
 		}
 		mlog(0, "%u is the owner of %.*s, cleaning everyone else\n",
 			     dlm->node_num, res->lockname.len, res->lockname.name);
-		ret = dlm_dispatch_assert_master(dlm, res, 0, request->node_idx, 
+		ret = dlm_dispatch_assert_master(dlm, res, 0, request->node_idx,
 						 DLM_ASSERT_MASTER_MLE_CLEANUP);
 		if (ret < 0) {
 			mlog(ML_ERROR, "failed to dispatch assert master work\n");
@@ -1666,7 +1662,9 @@ again:
 		tmpret = o2net_send_message(DLM_ASSERT_MASTER_MSG, dlm->key,
 					    &assert, sizeof(assert), to, &r);
 		if (tmpret < 0) {
-			mlog(0, "assert_master returned %d!\n", tmpret);
+			mlog(ML_ERROR, "Error %d when sending message %u (key "
+			     "0x%x) to node %u\n", tmpret,
+			     DLM_ASSERT_MASTER_MSG, dlm->key, to);
 			if (!dlm_is_host_down(tmpret)) {
 				mlog(ML_ERROR, "unhandled error=%d!\n", tmpret);
 				BUG();
@@ -1701,7 +1699,7 @@ again:
 
 		if (r & DLM_ASSERT_RESPONSE_REASSERT) {
 			mlog(0, "%.*s: node %u create mles on other "
-			     "nodes and requests a re-assert\n", 
+			     "nodes and requests a re-assert\n",
 			     namelen, lockname, to);
 			reassert = 1;
 		}
@@ -1812,7 +1810,7 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data,
 				spin_unlock(&dlm->master_lock);
 				spin_unlock(&dlm->spinlock);
 				goto done;
-			}	
+			}
 		}
 	}
 	spin_unlock(&dlm->master_lock);
@@ -1875,7 +1873,6 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data,
 ok:
 		spin_unlock(&res->spinlock);
 	}
-	spin_unlock(&dlm->spinlock);
 
 	// mlog(0, "woo!  got an assert_master from node %u!\n",
 	// 	     assert->node_idx);
@@ -1883,7 +1880,7 @@ ok:
 		int extra_ref = 0;
 		int nn = -1;
 		int rr, err = 0;
-		
+
 		spin_lock(&mle->spinlock);
 		if (mle->type == DLM_MLE_BLOCK || mle->type == DLM_MLE_MIGRATION)
 			extra_ref = 1;
@@ -1891,7 +1888,7 @@ ok:
 			/* MASTER mle: if any bits set in the response map
 			 * then the calling node needs to re-assert to clear
 			 * up nodes that this node contacted */
-			while ((nn = find_next_bit (mle->response_map, O2NM_MAX_NODES, 
+			while ((nn = find_next_bit (mle->response_map, O2NM_MAX_NODES,
 						    nn+1)) < O2NM_MAX_NODES) {
 				if (nn != dlm->node_num && nn != assert->node_idx)
 					master_request = 1;
@@ -1926,7 +1923,6 @@ ok:
 		/* master is known, detach if not already detached.
 		 * ensures that only one assert_master call will happen
 		 * on this mle. */
-		spin_lock(&dlm->spinlock);
 		spin_lock(&dlm->master_lock);
 
 		rr = atomic_read(&mle->mle_refs.refcount);
@@ -1959,7 +1955,6 @@ ok:
 			__dlm_put_mle(mle);
 		}
 		spin_unlock(&dlm->master_lock);
-		spin_unlock(&dlm->spinlock);
 	} else if (res) {
 		if (res->owner != assert->node_idx) {
 			mlog(0, "assert_master from %u, but current "
@@ -1967,6 +1962,7 @@ ok:
 			     res->owner, namelen, name);
 		}
 	}
+	spin_unlock(&dlm->spinlock);
 
 done:
 	ret = 0;
@@ -2002,7 +1998,7 @@ kill:
 	__dlm_print_one_lock_resource(res);
 	spin_unlock(&res->spinlock);
 	spin_unlock(&dlm->spinlock);
-	*ret_data = (void *)res; 
+	*ret_data = (void *)res;
 	dlm_put(dlm);
 	return -EINVAL;
 }
@@ -2040,10 +2036,10 @@ int dlm_dispatch_assert_master(struct dlm_ctxt *dlm,
 	item->u.am.request_from = request_from;
 	item->u.am.flags = flags;
 
-	if (ignore_higher) 
-		mlog(0, "IGNORE HIGHER: %.*s\n", res->lockname.len, 
+	if (ignore_higher)
+		mlog(0, "IGNORE HIGHER: %.*s\n", res->lockname.len,
 		     res->lockname.name);
-		
+
 	spin_lock(&dlm->work_lock);
 	list_add_tail(&item->list, &dlm->work_list);
 	spin_unlock(&dlm->work_lock);
@@ -2133,7 +2129,7 @@ put:
  * think that $RECOVERY is currently mastered by a dead node.  If so,
  * we wait a short time to allow that node to get notified by its own
  * heartbeat stack, then check again.  All $RECOVERY lock resources
- * mastered by dead nodes are purged when the hearbeat callback is 
+ * mastered by dead nodes are purged when the hearbeat callback is
  * fired, so we can know for sure that it is safe to continue once
  * the node returns a live node or no node.  */
 static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm,
@@ -2174,7 +2170,7 @@ static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm,
 				ret = -EAGAIN;
 			}
 			spin_unlock(&dlm->spinlock);
-			mlog(0, "%s: reco lock master is %u\n", dlm->name, 
+			mlog(0, "%s: reco lock master is %u\n", dlm->name,
 			     master);
 			break;
 		}
@@ -2207,7 +2203,9 @@ int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
 	ret = o2net_send_message(DLM_DEREF_LOCKRES_MSG, dlm->key,
 				 &deref, sizeof(deref), res->owner, &r);
 	if (ret < 0)
-		mlog_errno(ret);
+		mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
+		     "node %u\n", ret, DLM_DEREF_LOCKRES_MSG, dlm->key,
+		     res->owner);
 	else if (r < 0) {
 		/* BAD.  other node says I did not have a ref. */
 		mlog(ML_ERROR,"while dropping ref on %s:%.*s "
@@ -2454,8 +2452,7 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
 		goto leave;
 	}
 
-	mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache,
-								GFP_NOFS);
+	mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
 	if (!mle) {
 		mlog_errno(ret);
 		goto leave;
@@ -2602,7 +2599,7 @@ fail:
 
 			mlog(0, "%s:%.*s: timed out during migration\n",
 			     dlm->name, res->lockname.len, res->lockname.name);
-			/* avoid hang during shutdown when migrating lockres 
+			/* avoid hang during shutdown when migrating lockres
 			 * to a node which also goes down */
 			if (dlm_is_node_dead(dlm, target)) {
 				mlog(0, "%s:%.*s: expected migration "
@@ -2738,7 +2735,7 @@ static int dlm_migration_can_proceed(struct dlm_ctxt *dlm,
 	can_proceed = !!(res->state & DLM_LOCK_RES_MIGRATING);
 	spin_unlock(&res->spinlock);
 
-	/* target has died, so make the caller break out of the 
+	/* target has died, so make the caller break out of the
 	 * wait_event, but caller must recheck the domain_map */
 	spin_lock(&dlm->spinlock);
 	if (!test_bit(mig_target, dlm->domain_map))
@@ -2811,14 +2808,8 @@ again:
 		mlog(0, "trying again...\n");
 		goto again;
 	}
-	/* now that we are sure the MIGRATING state is there, drop
-	 * the unneded state which blocked threads trying to DIRTY */
-	spin_lock(&res->spinlock);
-	BUG_ON(!(res->state & DLM_LOCK_RES_BLOCK_DIRTY));
-	BUG_ON(!(res->state & DLM_LOCK_RES_MIGRATING));
-	res->state &= ~DLM_LOCK_RES_BLOCK_DIRTY;
-	spin_unlock(&res->spinlock);
 
+	ret = 0;
 	/* did the target go down or die? */
 	spin_lock(&dlm->spinlock);
 	if (!test_bit(target, dlm->domain_map)) {
@@ -2829,9 +2820,21 @@ again:
 	spin_unlock(&dlm->spinlock);
 
 	/*
+	 * if target is down, we need to clear DLM_LOCK_RES_BLOCK_DIRTY for
+	 * another try; otherwise, we are sure the MIGRATING state is there,
+	 * drop the unneded state which blocked threads trying to DIRTY
+	 */
+	spin_lock(&res->spinlock);
+	BUG_ON(!(res->state & DLM_LOCK_RES_BLOCK_DIRTY));
+	res->state &= ~DLM_LOCK_RES_BLOCK_DIRTY;
+	if (!ret)
+		BUG_ON(!(res->state & DLM_LOCK_RES_MIGRATING));
+	spin_unlock(&res->spinlock);
+
+	/*
 	 * at this point:
 	 *
-	 *   o the DLM_LOCK_RES_MIGRATING flag is set
+	 *   o the DLM_LOCK_RES_MIGRATING flag is set if target not down
 	 *   o there are no pending asts on this lockres
 	 *   o all processes trying to reserve an ast on this
 	 *     lockres must wait for the MIGRATING flag to clear
@@ -2977,7 +2980,9 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
 					 &migrate, sizeof(migrate), nodenum,
 					 &status);
 		if (ret < 0) {
-			mlog(0, "migrate_request returned %d!\n", ret);
+			mlog(ML_ERROR, "Error %d when sending message %u (key "
+			     "0x%x) to node %u\n", ret, DLM_MIGRATE_REQUEST_MSG,
+			     dlm->key, nodenum);
 			if (!dlm_is_host_down(ret)) {
 				mlog(ML_ERROR, "unhandled error=%d!\n", ret);
 				BUG();
@@ -3035,8 +3040,7 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
 	hash = dlm_lockid_hash(name, namelen);
 
 	/* preallocate.. if this fails, abort */
-	mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache,
-							 GFP_NOFS);
+	mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
 
 	if (!mle) {
 		ret = -ENOMEM;
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 2f9e4e19a4f2..9dfaac73b36d 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -310,7 +310,7 @@ static int dlm_recovery_thread(void *data)
 	mlog(0, "dlm thread running for %s...\n", dlm->name);
 
 	while (!kthread_should_stop()) {
-		if (dlm_joined(dlm)) {
+		if (dlm_domain_fully_joined(dlm)) {
 			status = dlm_do_recovery(dlm);
 			if (status == -EAGAIN) {
 				/* do not sleep, recheck immediately. */
@@ -463,7 +463,7 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
 	if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) {
 		int bit;
 
-		bit = find_next_bit (dlm->recovery_map, O2NM_MAX_NODES+1, 0);
+		bit = find_next_bit (dlm->recovery_map, O2NM_MAX_NODES, 0);
 		if (bit >= O2NM_MAX_NODES || bit < 0)
 			dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM);
 		else
@@ -803,7 +803,9 @@ static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from,
 
 	/* negative status is handled by caller */
 	if (ret < 0)
-		mlog_errno(ret);
+		mlog(ML_ERROR, "Error %d when sending message %u (key "
+		     "0x%x) to node %u\n", ret, DLM_LOCK_REQUEST_MSG,
+		     dlm->key, request_from);
 
 	// return from here, then
 	// sleep until all received or error
@@ -955,10 +957,10 @@ static int dlm_send_all_done_msg(struct dlm_ctxt *dlm, u8 dead_node, u8 send_to)
 	ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg,
 				 sizeof(done_msg), send_to, &tmpret);
 	if (ret < 0) {
+		mlog(ML_ERROR, "Error %d when sending message %u (key "
+		     "0x%x) to node %u\n", ret, DLM_RECO_DATA_DONE_MSG,
+		     dlm->key, send_to);
 		if (!dlm_is_host_down(ret)) {
-			mlog_errno(ret);
-			mlog(ML_ERROR, "%s: unknown error sending data-done "
-			     "to %u\n", dlm->name, send_to);
 			BUG();
 		}
 	} else
@@ -1050,7 +1052,7 @@ static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm,
 				if (lock->ml.node == dead_node) {
 					mlog(0, "AHA! there was "
 					     "a $RECOVERY lock for dead "
-					     "node %u (%s)!\n", 
+					     "node %u (%s)!\n",
 					     dead_node, dlm->name);
 					list_del_init(&lock->list);
 					dlm_lock_put(lock);
@@ -1126,7 +1128,9 @@ static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm,
 	if (ret < 0) {
 		/* XXX: negative status is not handled.
 		 * this will end up killing this node. */
-		mlog_errno(ret);
+		mlog(ML_ERROR, "Error %d when sending message %u (key "
+		     "0x%x) to node %u\n", ret, DLM_MIG_LOCKRES_MSG,
+		     dlm->key, send_to);
 	} else {
 		/* might get an -ENOMEM back here */
 		ret = status;
@@ -1164,6 +1168,39 @@ static void dlm_init_migratable_lockres(struct dlm_migratable_lockres *mres,
 	mres->master = master;
 }
 
+static void dlm_prepare_lvb_for_migration(struct dlm_lock *lock,
+					  struct dlm_migratable_lockres *mres,
+					  int queue)
+{
+	if (!lock->lksb)
+	       return;
+
+	/* Ignore lvb in all locks in the blocked list */
+	if (queue == DLM_BLOCKED_LIST)
+		return;
+
+	/* Only consider lvbs in locks with granted EX or PR lock levels */
+	if (lock->ml.type != LKM_EXMODE && lock->ml.type != LKM_PRMODE)
+		return;
+
+	if (dlm_lvb_is_empty(mres->lvb)) {
+		memcpy(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN);
+		return;
+	}
+
+	/* Ensure the lvb copied for migration matches in other valid locks */
+	if (!memcmp(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN))
+		return;
+
+	mlog(ML_ERROR, "Mismatched lvb in lock cookie=%u:%llu, name=%.*s, "
+	     "node=%u\n",
+	     dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+	     dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
+	     lock->lockres->lockname.len, lock->lockres->lockname.name,
+	     lock->ml.node);
+	dlm_print_one_lock_resource(lock->lockres);
+	BUG();
+}
 
 /* returns 1 if this lock fills the network structure,
  * 0 otherwise */
@@ -1181,20 +1218,7 @@ static int dlm_add_lock_to_array(struct dlm_lock *lock,
 	ml->list = queue;
 	if (lock->lksb) {
 		ml->flags = lock->lksb->flags;
-		/* send our current lvb */
-		if (ml->type == LKM_EXMODE ||
-		    ml->type == LKM_PRMODE) {
-			/* if it is already set, this had better be a PR
-			 * and it has to match */
-			if (!dlm_lvb_is_empty(mres->lvb) &&
-			    (ml->type == LKM_EXMODE ||
-			     memcmp(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN))) {
-				mlog(ML_ERROR, "mismatched lvbs!\n");
-				dlm_print_one_lock_resource(lock->lockres);
-				BUG();
-			}
-			memcpy(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN);
-		}
+		dlm_prepare_lvb_for_migration(lock, mres, queue);
 	}
 	ml->node = lock->ml.node;
 	mres->num_locks++;
@@ -1622,7 +1646,9 @@ int dlm_do_master_requery(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
 				 &req, sizeof(req), nodenum, &status);
 	/* XXX: negative status not handled properly here. */
 	if (ret < 0)
-		mlog_errno(ret);
+		mlog(ML_ERROR, "Error %d when sending message %u (key "
+		     "0x%x) to node %u\n", ret, DLM_MASTER_REQUERY_MSG,
+		     dlm->key, nodenum);
 	else {
 		BUG_ON(status < 0);
 		BUG_ON(status > DLM_LOCK_RES_OWNER_UNKNOWN);
@@ -1730,6 +1756,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
 	struct dlm_lock *lock = NULL;
 	u8 from = O2NM_MAX_NODES;
 	unsigned int added = 0;
+	__be64 c;
 
 	mlog(0, "running %d locks for this lockres\n", mres->num_locks);
 	for (i=0; i<mres->num_locks; i++) {
@@ -1777,19 +1804,48 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
 			/* lock is always created locally first, and
 			 * destroyed locally last.  it must be on the list */
 			if (!lock) {
-				__be64 c = ml->cookie;
-				mlog(ML_ERROR, "could not find local lock "
-					       "with cookie %u:%llu!\n",
+				c = ml->cookie;
+				mlog(ML_ERROR, "Could not find local lock "
+					       "with cookie %u:%llu, node %u, "
+					       "list %u, flags 0x%x, type %d, "
+					       "conv %d, highest blocked %d\n",
 				     dlm_get_lock_cookie_node(be64_to_cpu(c)),
-				     dlm_get_lock_cookie_seq(be64_to_cpu(c)));
+				     dlm_get_lock_cookie_seq(be64_to_cpu(c)),
+				     ml->node, ml->list, ml->flags, ml->type,
+				     ml->convert_type, ml->highest_blocked);
+				__dlm_print_one_lock_resource(res);
+				BUG();
+			}
+
+			if (lock->ml.node != ml->node) {
+				c = lock->ml.cookie;
+				mlog(ML_ERROR, "Mismatched node# in lock "
+				     "cookie %u:%llu, name %.*s, node %u\n",
+				     dlm_get_lock_cookie_node(be64_to_cpu(c)),
+				     dlm_get_lock_cookie_seq(be64_to_cpu(c)),
+				     res->lockname.len, res->lockname.name,
+				     lock->ml.node);
+				c = ml->cookie;
+				mlog(ML_ERROR, "Migrate lock cookie %u:%llu, "
+				     "node %u, list %u, flags 0x%x, type %d, "
+				     "conv %d, highest blocked %d\n",
+				     dlm_get_lock_cookie_node(be64_to_cpu(c)),
+				     dlm_get_lock_cookie_seq(be64_to_cpu(c)),
+				     ml->node, ml->list, ml->flags, ml->type,
+				     ml->convert_type, ml->highest_blocked);
 				__dlm_print_one_lock_resource(res);
 				BUG();
 			}
-			BUG_ON(lock->ml.node != ml->node);
 
 			if (tmpq != queue) {
-				mlog(0, "lock was on %u instead of %u for %.*s\n",
-				     j, ml->list, res->lockname.len, res->lockname.name);
+				c = ml->cookie;
+				mlog(0, "Lock cookie %u:%llu was on list %u "
+				     "instead of list %u for %.*s\n",
+				     dlm_get_lock_cookie_node(be64_to_cpu(c)),
+				     dlm_get_lock_cookie_seq(be64_to_cpu(c)),
+				     j, ml->list, res->lockname.len,
+				     res->lockname.name);
+				__dlm_print_one_lock_resource(res);
 				spin_unlock(&res->spinlock);
 				continue;
 			}
@@ -1839,7 +1895,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
 				 * the lvb. */
 				memcpy(res->lvb, mres->lvb, DLM_LVB_LEN);
 			} else {
-				/* otherwise, the node is sending its 
+				/* otherwise, the node is sending its
 				 * most recent valid lvb info */
 				BUG_ON(ml->type != LKM_EXMODE &&
 				       ml->type != LKM_PRMODE);
@@ -1886,7 +1942,7 @@ skip_lvb:
 		spin_lock(&res->spinlock);
 		list_for_each_entry(lock, queue, list) {
 			if (lock->ml.cookie == ml->cookie) {
-				__be64 c = lock->ml.cookie;
+				c = lock->ml.cookie;
 				mlog(ML_ERROR, "%s:%.*s: %u:%llu: lock already "
 				     "exists on this lockres!\n", dlm->name,
 				     res->lockname.len, res->lockname.name,
@@ -2114,7 +2170,7 @@ static void dlm_revalidate_lvb(struct dlm_ctxt *dlm,
 	assert_spin_locked(&res->spinlock);
 
 	if (res->owner == dlm->node_num)
-		/* if this node owned the lockres, and if the dead node 
+		/* if this node owned the lockres, and if the dead node
 		 * had an EX when he died, blank out the lvb */
 		search_node = dead_node;
 	else {
@@ -2152,7 +2208,7 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
 
 	/* this node is the lockres master:
 	 * 1) remove any stale locks for the dead node
-	 * 2) if the dead node had an EX when he died, blank out the lvb 
+	 * 2) if the dead node had an EX when he died, blank out the lvb
 	 */
 	assert_spin_locked(&dlm->spinlock);
 	assert_spin_locked(&res->spinlock);
@@ -2193,7 +2249,12 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
 		mlog(0, "%s:%.*s: freed %u locks for dead node %u, "
 		     "dropping ref from lockres\n", dlm->name,
 		     res->lockname.len, res->lockname.name, freed, dead_node);
-		BUG_ON(!test_bit(dead_node, res->refmap));
+		if(!test_bit(dead_node, res->refmap)) {
+			mlog(ML_ERROR, "%s:%.*s: freed %u locks for dead node %u, "
+			     "but ref was not set\n", dlm->name,
+			     res->lockname.len, res->lockname.name, freed, dead_node);
+			__dlm_print_one_lock_resource(res);
+		}
 		dlm_lockres_clear_refmap_bit(dead_node, res);
 	} else if (test_bit(dead_node, res->refmap)) {
 		mlog(0, "%s:%.*s: dead node %u had a ref, but had "
@@ -2260,7 +2321,7 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
 				}
 				spin_unlock(&res->spinlock);
 				continue;
-			}			
+			}
 			spin_lock(&res->spinlock);
 			/* zero the lvb if necessary */
 			dlm_revalidate_lvb(dlm, res, dead_node);
@@ -2411,7 +2472,7 @@ static void dlm_reco_unlock_ast(void *astdata, enum dlm_status st)
  * this function on each node racing to become the recovery
  * master will not stop attempting this until either:
  * a) this node gets the EX (and becomes the recovery master),
- * or b) dlm->reco.new_master gets set to some nodenum 
+ * or b) dlm->reco.new_master gets set to some nodenum
  * != O2NM_INVALID_NODE_NUM (another node will do the reco).
  * so each time a recovery master is needed, the entire cluster
  * will sync at this point.  if the new master dies, that will
@@ -2424,7 +2485,7 @@ static int dlm_pick_recovery_master(struct dlm_ctxt *dlm)
 
 	mlog(0, "starting recovery of %s at %lu, dead=%u, this=%u\n",
 	     dlm->name, jiffies, dlm->reco.dead_node, dlm->node_num);
-again:	
+again:
 	memset(&lksb, 0, sizeof(lksb));
 
 	ret = dlmlock(dlm, LKM_EXMODE, &lksb, LKM_NOQUEUE|LKM_RECOVERY,
@@ -2437,8 +2498,8 @@ again:
 	if (ret == DLM_NORMAL) {
 		mlog(0, "dlm=%s dlmlock says I got it (this=%u)\n",
 		     dlm->name, dlm->node_num);
-		
-		/* got the EX lock.  check to see if another node 
+
+		/* got the EX lock.  check to see if another node
 		 * just became the reco master */
 		if (dlm_reco_master_ready(dlm)) {
 			mlog(0, "%s: got reco EX lock, but %u will "
@@ -2451,12 +2512,12 @@ again:
 			/* see if recovery was already finished elsewhere */
 			spin_lock(&dlm->spinlock);
 			if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) {
-				status = -EINVAL;	
+				status = -EINVAL;
 				mlog(0, "%s: got reco EX lock, but "
 				     "node got recovered already\n", dlm->name);
 				if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM) {
 					mlog(ML_ERROR, "%s: new master is %u "
-					     "but no dead node!\n", 
+					     "but no dead node!\n",
 					     dlm->name, dlm->reco.new_master);
 					BUG();
 				}
@@ -2468,7 +2529,7 @@ again:
 		 * set the master and send the messages to begin recovery */
 		if (!status) {
 			mlog(0, "%s: dead=%u, this=%u, sending "
-			     "begin_reco now\n", dlm->name, 
+			     "begin_reco now\n", dlm->name,
 			     dlm->reco.dead_node, dlm->node_num);
 			status = dlm_send_begin_reco_message(dlm,
 				      dlm->reco.dead_node);
@@ -2501,7 +2562,7 @@ again:
 		mlog(0, "dlm=%s dlmlock says another node got it (this=%u)\n",
 		     dlm->name, dlm->node_num);
 		/* another node is master. wait on
-		 * reco.new_master != O2NM_INVALID_NODE_NUM 
+		 * reco.new_master != O2NM_INVALID_NODE_NUM
 		 * for at most one second */
 		wait_event_timeout(dlm->dlm_reco_thread_wq,
 					 dlm_reco_master_ready(dlm),
@@ -2585,11 +2646,17 @@ retry:
 		if (dlm_is_host_down(ret)) {
 			/* node is down.  not involved in recovery
 			 * so just keep going */
-			mlog(0, "%s: node %u was down when sending "
+			mlog(ML_NOTICE, "%s: node %u was down when sending "
 			     "begin reco msg (%d)\n", dlm->name, nodenum, ret);
 			ret = 0;
 		}
-		if (ret == -EAGAIN) {
+
+		/*
+		 * Prior to commit aad1b15310b9bcd59fa81ab8f2b1513b59553ea8,
+		 * dlm_begin_reco_handler() returned EAGAIN and not -EAGAIN.
+		 * We are handling both for compatibility reasons.
+		 */
+		if (ret == -EAGAIN || ret == EAGAIN) {
 			mlog(0, "%s: trying to start recovery of node "
 			     "%u, but node %u is waiting for last recovery "
 			     "to complete, backoff for a bit\n", dlm->name,
@@ -2599,11 +2666,12 @@ retry:
 		}
 		if (ret < 0) {
 			struct dlm_lock_resource *res;
-			/* this is now a serious problem, possibly ENOMEM 
+
+			/* this is now a serious problem, possibly ENOMEM
 			 * in the network stack.  must retry */
 			mlog_errno(ret);
 			mlog(ML_ERROR, "begin reco of dlm %s to node %u "
-			    " returned %d\n", dlm->name, nodenum, ret);
+			     "returned %d\n", dlm->name, nodenum, ret);
 			res = dlm_lookup_lockres(dlm, DLM_RECOVERY_LOCK_NAME,
 						 DLM_RECOVERY_LOCK_NAME_LEN);
 			if (res) {
@@ -2612,7 +2680,7 @@ retry:
 			} else {
 				mlog(ML_ERROR, "recovery lock not found\n");
 			}
-			/* sleep for a bit in hopes that we can avoid 
+			/* sleep for a bit in hopes that we can avoid
 			 * another ENOMEM */
 			msleep(100);
 			goto retry;
@@ -2664,7 +2732,7 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data,
 	}
 	if (dlm->reco.dead_node != O2NM_INVALID_NODE_NUM) {
 		mlog(ML_NOTICE, "%s: dead_node previously set to %u, "
-		     "node %u changing it to %u\n", dlm->name, 
+		     "node %u changing it to %u\n", dlm->name,
 		     dlm->reco.dead_node, br->node_idx, br->dead_node);
 	}
 	dlm_set_reco_master(dlm, br->node_idx);
@@ -2728,10 +2796,12 @@ stage2:
 		if (ret >= 0)
 			ret = status;
 		if (ret < 0) {
-			mlog_errno(ret);
+			mlog(ML_ERROR, "Error %d when sending message %u (key "
+			     "0x%x) to node %u\n", ret, DLM_FINALIZE_RECO_MSG,
+			     dlm->key, nodenum);
 			if (dlm_is_host_down(ret)) {
-				/* this has no effect on this recovery 
-				 * session, so set the status to zero to 
+				/* this has no effect on this recovery
+				 * session, so set the status to zero to
 				 * finish out the last recovery */
 				mlog(ML_ERROR, "node %u went down after this "
 				     "node finished recovery.\n", nodenum);
@@ -2768,7 +2838,7 @@ int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data,
 	mlog(0, "%s: node %u finalizing recovery stage%d of "
 	     "node %u (%u:%u)\n", dlm->name, fr->node_idx, stage,
 	     fr->dead_node, dlm->reco.dead_node, dlm->reco.new_master);
- 
+
 	spin_lock(&dlm->spinlock);
 
 	if (dlm->reco.new_master != fr->node_idx) {
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 52ec020ea78b..d4f73ca68fe5 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -28,7 +28,6 @@
 #include <linux/module.h>
 #include <linux/fs.h>
 #include <linux/types.h>
-#include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/init.h>
 #include <linux/sysctl.h>
@@ -310,6 +309,7 @@ static void dlm_shuffle_lists(struct dlm_ctxt *dlm,
 	 * spinlock, and because we know that it is not migrating/
 	 * recovering/in-progress, it is fine to reserve asts and
 	 * basts right before queueing them all throughout */
+	assert_spin_locked(&dlm->ast_lock);
 	assert_spin_locked(&res->spinlock);
 	BUG_ON((res->state & (DLM_LOCK_RES_MIGRATING|
 			      DLM_LOCK_RES_RECOVERING|
@@ -338,7 +338,7 @@ converting:
 			/* queue the BAST if not already */
 			if (lock->ml.highest_blocked == LKM_IVMODE) {
 				__dlm_lockres_reserve_ast(res);
-				dlm_queue_bast(dlm, lock);
+				__dlm_queue_bast(dlm, lock);
 			}
 			/* update the highest_blocked if needed */
 			if (lock->ml.highest_blocked < target->ml.convert_type)
@@ -356,7 +356,7 @@ converting:
 			can_grant = 0;
 			if (lock->ml.highest_blocked == LKM_IVMODE) {
 				__dlm_lockres_reserve_ast(res);
-				dlm_queue_bast(dlm, lock);
+				__dlm_queue_bast(dlm, lock);
 			}
 			if (lock->ml.highest_blocked < target->ml.convert_type)
 				lock->ml.highest_blocked =
@@ -384,7 +384,7 @@ converting:
 		spin_unlock(&target->spinlock);
 
 		__dlm_lockres_reserve_ast(res);
-		dlm_queue_ast(dlm, target);
+		__dlm_queue_ast(dlm, target);
 		/* go back and check for more */
 		goto converting;
 	}
@@ -403,7 +403,7 @@ blocked:
 			can_grant = 0;
 			if (lock->ml.highest_blocked == LKM_IVMODE) {
 				__dlm_lockres_reserve_ast(res);
-				dlm_queue_bast(dlm, lock);
+				__dlm_queue_bast(dlm, lock);
 			}
 			if (lock->ml.highest_blocked < target->ml.type)
 				lock->ml.highest_blocked = target->ml.type;
@@ -419,7 +419,7 @@ blocked:
 			can_grant = 0;
 			if (lock->ml.highest_blocked == LKM_IVMODE) {
 				__dlm_lockres_reserve_ast(res);
-				dlm_queue_bast(dlm, lock);
+				__dlm_queue_bast(dlm, lock);
 			}
 			if (lock->ml.highest_blocked < target->ml.type)
 				lock->ml.highest_blocked = target->ml.type;
@@ -445,7 +445,7 @@ blocked:
 		spin_unlock(&target->spinlock);
 
 		__dlm_lockres_reserve_ast(res);
-		dlm_queue_ast(dlm, target);
+		__dlm_queue_ast(dlm, target);
 		/* go back and check for more */
 		goto converting;
 	}
@@ -675,6 +675,7 @@ static int dlm_thread(void *data)
 		 	/* lockres can be re-dirtied/re-added to the
 			 * dirty_list in this gap, but that is ok */
 
+			spin_lock(&dlm->ast_lock);
 			spin_lock(&res->spinlock);
 			if (res->owner != dlm->node_num) {
 				__dlm_print_one_lock_resource(res);
@@ -695,6 +696,7 @@ static int dlm_thread(void *data)
 				/* move it to the tail and keep going */
 				res->state &= ~DLM_LOCK_RES_DIRTY;
 				spin_unlock(&res->spinlock);
+				spin_unlock(&dlm->ast_lock);
 				mlog(0, "delaying list shuffling for in-"
 				     "progress lockres %.*s, state=%d\n",
 				     res->lockname.len, res->lockname.name,
@@ -716,6 +718,7 @@ static int dlm_thread(void *data)
 			dlm_shuffle_lists(dlm, res);
 			res->state &= ~DLM_LOCK_RES_DIRTY;
 			spin_unlock(&res->spinlock);
+			spin_unlock(&dlm->ast_lock);
 
 			dlm_lockres_calc_usage(dlm, res);
 
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index 00f53b2aea76..817287c6a6db 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -28,7 +28,6 @@
 #include <linux/module.h>
 #include <linux/fs.h>
 #include <linux/types.h>
-#include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/init.h>
 #include <linux/sysctl.h>
@@ -190,8 +189,8 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
 			actions &= ~(DLM_UNLOCK_REMOVE_LOCK|
 				     DLM_UNLOCK_REGRANT_LOCK|
 				     DLM_UNLOCK_CLEAR_CONVERT_TYPE);
-		} else if (status == DLM_RECOVERING || 
-			   status == DLM_MIGRATING || 
+		} else if (status == DLM_RECOVERING ||
+			   status == DLM_MIGRATING ||
 			   status == DLM_FORWARD) {
 			/* must clear the actions because this unlock
 			 * is about to be retried.  cannot free or do
@@ -355,7 +354,8 @@ static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm,
 			mlog(0, "master was in-progress.  retry\n");
 		ret = status;
 	} else {
-		mlog_errno(tmpret);
+		mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
+		     "node %u\n", tmpret, DLM_UNLOCK_LOCK_MSG, dlm->key, owner);
 		if (dlm_is_host_down(tmpret)) {
 			/* NOTE: this seems strange, but it is what we want.
 			 * when the master goes down during a cancel or
@@ -661,14 +661,14 @@ retry:
 	if (call_ast) {
 		mlog(0, "calling unlockast(%p, %d)\n", data, status);
 		if (is_master) {
-			/* it is possible that there is one last bast 
+			/* it is possible that there is one last bast
 			 * pending.  make sure it is flushed, then
 			 * call the unlockast.
 			 * not an issue if this is a mastered remotely,
 			 * since this lock has been removed from the
 			 * lockres queues and cannot be found. */
 			dlm_kick_thread(dlm, NULL);
-			wait_event(dlm->ast_wq, 
+			wait_event(dlm->ast_wq,
 				   dlm_lock_basts_flushed(dlm, lock));
 		}
 		(*unlockast)(data, status);
diff --git a/fs/ocfs2/dlmfs/Makefile b/fs/ocfs2/dlmfs/Makefile
new file mode 100644
index 000000000000..df69b4856d0d
--- /dev/null
+++ b/fs/ocfs2/dlmfs/Makefile
@@ -0,0 +1,5 @@
+EXTRA_CFLAGS += -Ifs/ocfs2
+
+obj-$(CONFIG_OCFS2_FS) += ocfs2_dlmfs.o
+
+ocfs2_dlmfs-objs := userdlm.o dlmfs.o dlmfsver.o
diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 02bf17808bdc..b83d6107a1f5 100644
--- a/fs/ocfs2/dlm/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -43,24 +43,17 @@
 #include <linux/init.h>
 #include <linux/string.h>
 #include <linux/backing-dev.h>
+#include <linux/poll.h>
 
 #include <asm/uaccess.h>
 
-
-#include "cluster/nodemanager.h"
-#include "cluster/heartbeat.h"
-#include "cluster/tcp.h"
-
-#include "dlmapi.h"
-
+#include "stackglue.h"
 #include "userdlm.h"
-
 #include "dlmfsver.h"
 
 #define MLOG_MASK_PREFIX ML_DLMFS
 #include "cluster/masklog.h"
 
-#include "ocfs2_lockingver.h"
 
 static const struct super_operations dlmfs_ops;
 static const struct file_operations dlmfs_file_operations;
@@ -71,15 +64,46 @@ static struct kmem_cache *dlmfs_inode_cache;
 
 struct workqueue_struct *user_dlm_worker;
 
+
+
 /*
- * This is the userdlmfs locking protocol version.
+ * These are the ABI capabilities of dlmfs.
+ *
+ * Over time, dlmfs has added some features that were not part of the
+ * initial ABI.  Unfortunately, some of these features are not detectable
+ * via standard usage.  For example, Linux's default poll always returns
+ * POLLIN, so there is no way for a caller of poll(2) to know when dlmfs
+ * added poll support.  Instead, we provide this list of new capabilities.
+ *
+ * Capabilities is a read-only attribute.  We do it as a module parameter
+ * so we can discover it whether dlmfs is built in, loaded, or even not
+ * loaded.
  *
- * See fs/ocfs2/dlmglue.c for more details on locking versions.
+ * The ABI features are local to this machine's dlmfs mount.  This is
+ * distinct from the locking protocol, which is concerned with inter-node
+ * interaction.
+ *
+ * Capabilities:
+ * - bast	: POLLIN against the file descriptor of a held lock
+ *		  signifies a bast fired on the lock.
  */
-static const struct dlm_protocol_version user_locking_protocol = {
-	.pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR,
-	.pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR,
-};
+#define DLMFS_CAPABILITIES "bast stackglue"
+extern int param_set_dlmfs_capabilities(const char *val,
+					struct kernel_param *kp)
+{
+	printk(KERN_ERR "%s: readonly parameter\n", kp->name);
+	return -EINVAL;
+}
+static int param_get_dlmfs_capabilities(char *buffer,
+					struct kernel_param *kp)
+{
+	return strlcpy(buffer, DLMFS_CAPABILITIES,
+		       strlen(DLMFS_CAPABILITIES) + 1);
+}
+module_param_call(capabilities, param_set_dlmfs_capabilities,
+		  param_get_dlmfs_capabilities, NULL, 0444);
+MODULE_PARM_DESC(capabilities, DLMFS_CAPABILITIES);
+
 
 /*
  * decodes a set of open flags into a valid lock level and a set of flags.
@@ -88,20 +112,20 @@ static const struct dlm_protocol_version user_locking_protocol = {
  * O_RDONLY -> PRMODE level
  * O_WRONLY -> EXMODE level
  *
- * O_NONBLOCK -> LKM_NOQUEUE
+ * O_NONBLOCK -> NOQUEUE
  */
 static int dlmfs_decode_open_flags(int open_flags,
 				   int *level,
 				   int *flags)
 {
 	if (open_flags & (O_WRONLY|O_RDWR))
-		*level = LKM_EXMODE;
+		*level = DLM_LOCK_EX;
 	else
-		*level = LKM_PRMODE;
+		*level = DLM_LOCK_PR;
 
 	*flags = 0;
 	if (open_flags & O_NONBLOCK)
-		*flags |= LKM_NOQUEUE;
+		*flags |= DLM_LKF_NOQUEUE;
 
 	return 0;
 }
@@ -142,7 +166,7 @@ static int dlmfs_file_open(struct inode *inode,
 		 * to be able userspace to be able to distinguish a
 		 * valid lock request from one that simply couldn't be
 		 * granted. */
-		if (flags & LKM_NOQUEUE && status == -EAGAIN)
+		if (flags & DLM_LKF_NOQUEUE && status == -EAGAIN)
 			status = -ETXTBSY;
 		kfree(fp);
 		goto bail;
@@ -169,7 +193,7 @@ static int dlmfs_file_release(struct inode *inode,
 	status = 0;
 	if (fp) {
 		level = fp->fp_lock_level;
-		if (level != LKM_IVMODE)
+		if (level != DLM_LOCK_IV)
 			user_dlm_cluster_unlock(&ip->ip_lockres, level);
 
 		kfree(fp);
@@ -179,13 +203,46 @@ static int dlmfs_file_release(struct inode *inode,
 	return 0;
 }
 
+/*
+ * We do ->setattr() just to override size changes.  Our size is the size
+ * of the LVB and nothing else.
+ */
+static int dlmfs_file_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	int error;
+	struct inode *inode = dentry->d_inode;
+
+	attr->ia_valid &= ~ATTR_SIZE;
+	error = inode_change_ok(inode, attr);
+	if (!error)
+		error = inode_setattr(inode, attr);
+
+	return error;
+}
+
+static unsigned int dlmfs_file_poll(struct file *file, poll_table *wait)
+{
+	int event = 0;
+	struct inode *inode = file->f_path.dentry->d_inode;
+	struct dlmfs_inode_private *ip = DLMFS_I(inode);
+
+	poll_wait(file, &ip->ip_lockres.l_event, wait);
+
+	spin_lock(&ip->ip_lockres.l_lock);
+	if (ip->ip_lockres.l_flags & USER_LOCK_BLOCKED)
+		event = POLLIN | POLLRDNORM;
+	spin_unlock(&ip->ip_lockres.l_lock);
+
+	return event;
+}
+
 static ssize_t dlmfs_file_read(struct file *filp,
 			       char __user *buf,
 			       size_t count,
 			       loff_t *ppos)
 {
 	int bytes_left;
-	ssize_t readlen;
+	ssize_t readlen, got;
 	char *lvb_buf;
 	struct inode *inode = filp->f_path.dentry->d_inode;
 
@@ -205,15 +262,19 @@ static ssize_t dlmfs_file_read(struct file *filp,
 	if ((count + *ppos) > i_size_read(inode))
 		readlen = i_size_read(inode) - *ppos;
 	else
-		readlen = count - *ppos;
+		readlen = count;
 
 	lvb_buf = kmalloc(readlen, GFP_NOFS);
 	if (!lvb_buf)
 		return -ENOMEM;
 
-	user_dlm_read_lvb(inode, lvb_buf, readlen);
-	bytes_left = __copy_to_user(buf, lvb_buf, readlen);
-	readlen -= bytes_left;
+	got = user_dlm_read_lvb(inode, lvb_buf, readlen);
+	if (got) {
+		BUG_ON(got != readlen);
+		bytes_left = __copy_to_user(buf, lvb_buf, readlen);
+		readlen -= bytes_left;
+	} else
+		readlen = 0;
 
 	kfree(lvb_buf);
 
@@ -272,7 +333,7 @@ static void dlmfs_init_once(void *foo)
 	struct dlmfs_inode_private *ip =
 		(struct dlmfs_inode_private *) foo;
 
-	ip->ip_dlm = NULL;
+	ip->ip_conn = NULL;
 	ip->ip_parent = NULL;
 
 	inode_init_once(&ip->ip_vfs_inode);
@@ -314,14 +375,14 @@ static void dlmfs_clear_inode(struct inode *inode)
 		goto clear_fields;
 	}
 
-	mlog(0, "we're a directory, ip->ip_dlm = 0x%p\n", ip->ip_dlm);
+	mlog(0, "we're a directory, ip->ip_conn = 0x%p\n", ip->ip_conn);
 	/* we must be a directory. If required, lets unregister the
 	 * dlm context now. */
-	if (ip->ip_dlm)
-		user_dlm_unregister_context(ip->ip_dlm);
+	if (ip->ip_conn)
+		user_dlm_unregister(ip->ip_conn);
 clear_fields:
 	ip->ip_parent = NULL;
-	ip->ip_dlm = NULL;
+	ip->ip_conn = NULL;
 }
 
 static struct backing_dev_info dlmfs_backing_dev_info = {
@@ -371,7 +432,7 @@ static struct inode *dlmfs_get_inode(struct inode *parent,
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 
 	ip = DLMFS_I(inode);
-	ip->ip_dlm = DLMFS_I(parent)->ip_dlm;
+	ip->ip_conn = DLMFS_I(parent)->ip_conn;
 
 	switch (mode & S_IFMT) {
 	default:
@@ -425,13 +486,12 @@ static int dlmfs_mkdir(struct inode * dir,
 	struct inode *inode = NULL;
 	struct qstr *domain = &dentry->d_name;
 	struct dlmfs_inode_private *ip;
-	struct dlm_ctxt *dlm;
-	struct dlm_protocol_version proto = user_locking_protocol;
+	struct ocfs2_cluster_connection *conn;
 
 	mlog(0, "mkdir %.*s\n", domain->len, domain->name);
 
 	/* verify that we have a proper domain */
-	if (domain->len >= O2NM_MAX_NAME_LEN) {
+	if (domain->len >= GROUP_NAME_MAX) {
 		status = -EINVAL;
 		mlog(ML_ERROR, "invalid domain name for directory.\n");
 		goto bail;
@@ -446,14 +506,14 @@ static int dlmfs_mkdir(struct inode * dir,
 
 	ip = DLMFS_I(inode);
 
-	dlm = user_dlm_register_context(domain, &proto);
-	if (IS_ERR(dlm)) {
-		status = PTR_ERR(dlm);
+	conn = user_dlm_register(domain);
+	if (IS_ERR(conn)) {
+		status = PTR_ERR(conn);
 		mlog(ML_ERROR, "Error %d could not register domain \"%.*s\"\n",
 		     status, domain->len, domain->name);
 		goto bail;
 	}
-	ip->ip_dlm = dlm;
+	ip->ip_conn = conn;
 
 	inc_nlink(dir);
 	d_instantiate(dentry, inode);
@@ -549,6 +609,7 @@ static int dlmfs_fill_super(struct super_block * sb,
 static const struct file_operations dlmfs_file_operations = {
 	.open		= dlmfs_file_open,
 	.release	= dlmfs_file_release,
+	.poll		= dlmfs_file_poll,
 	.read		= dlmfs_file_read,
 	.write		= dlmfs_file_write,
 };
@@ -576,6 +637,7 @@ static const struct super_operations dlmfs_ops = {
 
 static const struct inode_operations dlmfs_file_inode_operations = {
 	.getattr	= simple_getattr,
+	.setattr	= dlmfs_file_setattr,
 };
 
 static int dlmfs_get_sb(struct file_system_type *fs_type,
@@ -620,6 +682,7 @@ static int __init init_dlmfs_fs(void)
 	}
 	cleanup_worker = 1;
 
+	user_dlm_set_locking_protocol();
 	status = register_filesystem(&dlmfs_fs_type);
 bail:
 	if (status) {
diff --git a/fs/ocfs2/dlm/dlmfsver.c b/fs/ocfs2/dlmfs/dlmfsver.c
index a733b3321f83..a733b3321f83 100644
--- a/fs/ocfs2/dlm/dlmfsver.c
+++ b/fs/ocfs2/dlmfs/dlmfsver.c
diff --git a/fs/ocfs2/dlm/dlmfsver.h b/fs/ocfs2/dlmfs/dlmfsver.h
index f35eadbed25c..f35eadbed25c 100644
--- a/fs/ocfs2/dlm/dlmfsver.h
+++ b/fs/ocfs2/dlmfs/dlmfsver.h
diff --git a/fs/ocfs2/dlm/userdlm.c b/fs/ocfs2/dlmfs/userdlm.c
index 4cb1d3dae250..0499e3fb7bdb 100644
--- a/fs/ocfs2/dlm/userdlm.c
+++ b/fs/ocfs2/dlmfs/userdlm.c
@@ -34,18 +34,19 @@
 #include <linux/types.h>
 #include <linux/crc32.h>
 
-
-#include "cluster/nodemanager.h"
-#include "cluster/heartbeat.h"
-#include "cluster/tcp.h"
-
-#include "dlmapi.h"
-
+#include "ocfs2_lockingver.h"
+#include "stackglue.h"
 #include "userdlm.h"
 
 #define MLOG_MASK_PREFIX ML_DLMFS
 #include "cluster/masklog.h"
 
+
+static inline struct user_lock_res *user_lksb_to_lock_res(struct ocfs2_dlm_lksb *lksb)
+{
+	return container_of(lksb, struct user_lock_res, l_lksb);
+}
+
 static inline int user_check_wait_flag(struct user_lock_res *lockres,
 				       int flag)
 {
@@ -73,15 +74,15 @@ static inline void user_wait_on_blocked_lock(struct user_lock_res *lockres)
 }
 
 /* I heart container_of... */
-static inline struct dlm_ctxt *
-dlm_ctxt_from_user_lockres(struct user_lock_res *lockres)
+static inline struct ocfs2_cluster_connection *
+cluster_connection_from_user_lockres(struct user_lock_res *lockres)
 {
 	struct dlmfs_inode_private *ip;
 
 	ip = container_of(lockres,
 			  struct dlmfs_inode_private,
 			  ip_lockres);
-	return ip->ip_dlm;
+	return ip->ip_conn;
 }
 
 static struct inode *
@@ -103,9 +104,9 @@ static inline void user_recover_from_dlm_error(struct user_lock_res *lockres)
 }
 
 #define user_log_dlm_error(_func, _stat, _lockres) do {			\
-	mlog(ML_ERROR, "Dlm error \"%s\" while calling %s on "		\
-		"resource %.*s: %s\n", dlm_errname(_stat), _func,	\
-		_lockres->l_namelen, _lockres->l_name, dlm_errmsg(_stat)); \
+	mlog(ML_ERROR, "Dlm error %d while calling %s on "		\
+		"resource %.*s\n", _stat, _func,			\
+		_lockres->l_namelen, _lockres->l_name); 		\
 } while (0)
 
 /* WARNING: This function lives in a world where the only three lock
@@ -113,34 +114,35 @@ static inline void user_recover_from_dlm_error(struct user_lock_res *lockres)
  * lock types are added. */
 static inline int user_highest_compat_lock_level(int level)
 {
-	int new_level = LKM_EXMODE;
+	int new_level = DLM_LOCK_EX;
 
-	if (level == LKM_EXMODE)
-		new_level = LKM_NLMODE;
-	else if (level == LKM_PRMODE)
-		new_level = LKM_PRMODE;
+	if (level == DLM_LOCK_EX)
+		new_level = DLM_LOCK_NL;
+	else if (level == DLM_LOCK_PR)
+		new_level = DLM_LOCK_PR;
 	return new_level;
 }
 
-static void user_ast(void *opaque)
+static void user_ast(struct ocfs2_dlm_lksb *lksb)
 {
-	struct user_lock_res *lockres = opaque;
-	struct dlm_lockstatus *lksb;
+	struct user_lock_res *lockres = user_lksb_to_lock_res(lksb);
+	int status;
 
-	mlog(0, "AST fired for lockres %.*s\n", lockres->l_namelen,
-	     lockres->l_name);
+	mlog(ML_BASTS, "AST fired for lockres %.*s, level %d => %d\n",
+	     lockres->l_namelen, lockres->l_name, lockres->l_level,
+	     lockres->l_requested);
 
 	spin_lock(&lockres->l_lock);
 
-	lksb = &(lockres->l_lksb);
-	if (lksb->status != DLM_NORMAL) {
+	status = ocfs2_dlm_lock_status(&lockres->l_lksb);
+	if (status) {
 		mlog(ML_ERROR, "lksb status value of %u on lockres %.*s\n",
-		     lksb->status, lockres->l_namelen, lockres->l_name);
+		     status, lockres->l_namelen, lockres->l_name);
 		spin_unlock(&lockres->l_lock);
 		return;
 	}
 
-	mlog_bug_on_msg(lockres->l_requested == LKM_IVMODE,
+	mlog_bug_on_msg(lockres->l_requested == DLM_LOCK_IV,
 			"Lockres %.*s, requested ivmode. flags 0x%x\n",
 			lockres->l_namelen, lockres->l_name, lockres->l_flags);
 
@@ -148,13 +150,13 @@ static void user_ast(void *opaque)
 	if (lockres->l_requested < lockres->l_level) {
 		if (lockres->l_requested <=
 		    user_highest_compat_lock_level(lockres->l_blocking)) {
-			lockres->l_blocking = LKM_NLMODE;
+			lockres->l_blocking = DLM_LOCK_NL;
 			lockres->l_flags &= ~USER_LOCK_BLOCKED;
 		}
 	}
 
 	lockres->l_level = lockres->l_requested;
-	lockres->l_requested = LKM_IVMODE;
+	lockres->l_requested = DLM_LOCK_IV;
 	lockres->l_flags |= USER_LOCK_ATTACHED;
 	lockres->l_flags &= ~USER_LOCK_BUSY;
 
@@ -193,11 +195,11 @@ static void __user_dlm_cond_queue_lockres(struct user_lock_res *lockres)
 		return;
 
 	switch (lockres->l_blocking) {
-	case LKM_EXMODE:
+	case DLM_LOCK_EX:
 		if (!lockres->l_ex_holders && !lockres->l_ro_holders)
 			queue = 1;
 		break;
-	case LKM_PRMODE:
+	case DLM_LOCK_PR:
 		if (!lockres->l_ex_holders)
 			queue = 1;
 		break;
@@ -209,12 +211,12 @@ static void __user_dlm_cond_queue_lockres(struct user_lock_res *lockres)
 		__user_dlm_queue_lockres(lockres);
 }
 
-static void user_bast(void *opaque, int level)
+static void user_bast(struct ocfs2_dlm_lksb *lksb, int level)
 {
-	struct user_lock_res *lockres = opaque;
+	struct user_lock_res *lockres = user_lksb_to_lock_res(lksb);
 
-	mlog(0, "Blocking AST fired for lockres %.*s. Blocking level %d\n",
-	     lockres->l_namelen, lockres->l_name, level);
+	mlog(ML_BASTS, "BAST fired for lockres %.*s, blocking %d, level %d\n",
+	     lockres->l_namelen, lockres->l_name, level, lockres->l_level);
 
 	spin_lock(&lockres->l_lock);
 	lockres->l_flags |= USER_LOCK_BLOCKED;
@@ -227,15 +229,15 @@ static void user_bast(void *opaque, int level)
 	wake_up(&lockres->l_event);
 }
 
-static void user_unlock_ast(void *opaque, enum dlm_status status)
+static void user_unlock_ast(struct ocfs2_dlm_lksb *lksb, int status)
 {
-	struct user_lock_res *lockres = opaque;
+	struct user_lock_res *lockres = user_lksb_to_lock_res(lksb);
 
-	mlog(0, "UNLOCK AST called on lock %.*s\n", lockres->l_namelen,
-	     lockres->l_name);
+	mlog(ML_BASTS, "UNLOCK AST fired for lockres %.*s, flags 0x%x\n",
+	     lockres->l_namelen, lockres->l_name, lockres->l_flags);
 
-	if (status != DLM_NORMAL && status != DLM_CANCELGRANT)
-		mlog(ML_ERROR, "Dlm returns status %d\n", status);
+	if (status)
+		mlog(ML_ERROR, "dlm returns status %d\n", status);
 
 	spin_lock(&lockres->l_lock);
 	/* The teardown flag gets set early during the unlock process,
@@ -243,7 +245,7 @@ static void user_unlock_ast(void *opaque, enum dlm_status status)
 	 * for a concurrent cancel. */
 	if (lockres->l_flags & USER_LOCK_IN_TEARDOWN
 	    && !(lockres->l_flags & USER_LOCK_IN_CANCEL)) {
-		lockres->l_level = LKM_IVMODE;
+		lockres->l_level = DLM_LOCK_IV;
 	} else if (status == DLM_CANCELGRANT) {
 		/* We tried to cancel a convert request, but it was
 		 * already granted. Don't clear the busy flag - the
@@ -254,7 +256,7 @@ static void user_unlock_ast(void *opaque, enum dlm_status status)
 	} else {
 		BUG_ON(!(lockres->l_flags & USER_LOCK_IN_CANCEL));
 		/* Cancel succeeded, we want to re-queue */
-		lockres->l_requested = LKM_IVMODE; /* cancel an
+		lockres->l_requested = DLM_LOCK_IV; /* cancel an
 						    * upconvert
 						    * request. */
 		lockres->l_flags &= ~USER_LOCK_IN_CANCEL;
@@ -271,6 +273,21 @@ out_noclear:
 	wake_up(&lockres->l_event);
 }
 
+/*
+ * This is the userdlmfs locking protocol version.
+ *
+ * See fs/ocfs2/dlmglue.c for more details on locking versions.
+ */
+static struct ocfs2_locking_protocol user_dlm_lproto = {
+	.lp_max_version = {
+		.pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR,
+		.pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR,
+	},
+	.lp_lock_ast		= user_ast,
+	.lp_blocking_ast	= user_bast,
+	.lp_unlock_ast		= user_unlock_ast,
+};
+
 static inline void user_dlm_drop_inode_ref(struct user_lock_res *lockres)
 {
 	struct inode *inode;
@@ -283,10 +300,10 @@ static void user_dlm_unblock_lock(struct work_struct *work)
 	int new_level, status;
 	struct user_lock_res *lockres =
 		container_of(work, struct user_lock_res, l_work);
-	struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres);
+	struct ocfs2_cluster_connection *conn =
+		cluster_connection_from_user_lockres(lockres);
 
-	mlog(0, "processing lockres %.*s\n", lockres->l_namelen,
-	     lockres->l_name);
+	mlog(0, "lockres %.*s\n", lockres->l_namelen, lockres->l_name);
 
 	spin_lock(&lockres->l_lock);
 
@@ -304,17 +321,23 @@ static void user_dlm_unblock_lock(struct work_struct *work)
 	 * flag, and finally we might get another bast which re-queues
 	 * us before our ast for the downconvert is called. */
 	if (!(lockres->l_flags & USER_LOCK_BLOCKED)) {
+		mlog(ML_BASTS, "lockres %.*s USER_LOCK_BLOCKED\n",
+		     lockres->l_namelen, lockres->l_name);
 		spin_unlock(&lockres->l_lock);
 		goto drop_ref;
 	}
 
 	if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) {
+		mlog(ML_BASTS, "lockres %.*s USER_LOCK_IN_TEARDOWN\n",
+		     lockres->l_namelen, lockres->l_name);
 		spin_unlock(&lockres->l_lock);
 		goto drop_ref;
 	}
 
 	if (lockres->l_flags & USER_LOCK_BUSY) {
 		if (lockres->l_flags & USER_LOCK_IN_CANCEL) {
+			mlog(ML_BASTS, "lockres %.*s USER_LOCK_IN_CANCEL\n",
+			     lockres->l_namelen, lockres->l_name);
 			spin_unlock(&lockres->l_lock);
 			goto drop_ref;
 		}
@@ -322,32 +345,31 @@ static void user_dlm_unblock_lock(struct work_struct *work)
 		lockres->l_flags |= USER_LOCK_IN_CANCEL;
 		spin_unlock(&lockres->l_lock);
 
-		status = dlmunlock(dlm,
-				   &lockres->l_lksb,
-				   LKM_CANCEL,
-				   user_unlock_ast,
-				   lockres);
-		if (status != DLM_NORMAL)
-			user_log_dlm_error("dlmunlock", status, lockres);
+		status = ocfs2_dlm_unlock(conn, &lockres->l_lksb,
+					  DLM_LKF_CANCEL);
+		if (status)
+			user_log_dlm_error("ocfs2_dlm_unlock", status, lockres);
 		goto drop_ref;
 	}
 
 	/* If there are still incompat holders, we can exit safely
 	 * without worrying about re-queueing this lock as that will
 	 * happen on the last call to user_cluster_unlock. */
-	if ((lockres->l_blocking == LKM_EXMODE)
+	if ((lockres->l_blocking == DLM_LOCK_EX)
 	    && (lockres->l_ex_holders || lockres->l_ro_holders)) {
 		spin_unlock(&lockres->l_lock);
-		mlog(0, "can't downconvert for ex: ro = %u, ex = %u\n",
-			lockres->l_ro_holders, lockres->l_ex_holders);
+		mlog(ML_BASTS, "lockres %.*s, EX/PR Holders %u,%u\n",
+		     lockres->l_namelen, lockres->l_name,
+		     lockres->l_ex_holders, lockres->l_ro_holders);
 		goto drop_ref;
 	}
 
-	if ((lockres->l_blocking == LKM_PRMODE)
+	if ((lockres->l_blocking == DLM_LOCK_PR)
 	    && lockres->l_ex_holders) {
 		spin_unlock(&lockres->l_lock);
-		mlog(0, "can't downconvert for pr: ex = %u\n",
-			lockres->l_ex_holders);
+		mlog(ML_BASTS, "lockres %.*s, EX Holders %u\n",
+		     lockres->l_namelen, lockres->l_name,
+		     lockres->l_ex_holders);
 		goto drop_ref;
 	}
 
@@ -355,22 +377,17 @@ static void user_dlm_unblock_lock(struct work_struct *work)
 	new_level = user_highest_compat_lock_level(lockres->l_blocking);
 	lockres->l_requested = new_level;
 	lockres->l_flags |= USER_LOCK_BUSY;
-	mlog(0, "Downconvert lock from %d to %d\n",
-		lockres->l_level, new_level);
+	mlog(ML_BASTS, "lockres %.*s, downconvert %d => %d\n",
+	     lockres->l_namelen, lockres->l_name, lockres->l_level, new_level);
 	spin_unlock(&lockres->l_lock);
 
 	/* need lock downconvert request now... */
-	status = dlmlock(dlm,
-			 new_level,
-			 &lockres->l_lksb,
-			 LKM_CONVERT|LKM_VALBLK,
-			 lockres->l_name,
-			 lockres->l_namelen,
-			 user_ast,
-			 lockres,
-			 user_bast);
-	if (status != DLM_NORMAL) {
-		user_log_dlm_error("dlmlock", status, lockres);
+	status = ocfs2_dlm_lock(conn, new_level, &lockres->l_lksb,
+				DLM_LKF_CONVERT|DLM_LKF_VALBLK,
+				lockres->l_name,
+				lockres->l_namelen);
+	if (status) {
+		user_log_dlm_error("ocfs2_dlm_lock", status, lockres);
 		user_recover_from_dlm_error(lockres);
 	}
 
@@ -382,10 +399,10 @@ static inline void user_dlm_inc_holders(struct user_lock_res *lockres,
 					int level)
 {
 	switch(level) {
-	case LKM_EXMODE:
+	case DLM_LOCK_EX:
 		lockres->l_ex_holders++;
 		break;
-	case LKM_PRMODE:
+	case DLM_LOCK_PR:
 		lockres->l_ro_holders++;
 		break;
 	default:
@@ -410,20 +427,19 @@ int user_dlm_cluster_lock(struct user_lock_res *lockres,
 			  int lkm_flags)
 {
 	int status, local_flags;
-	struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres);
+	struct ocfs2_cluster_connection *conn =
+		cluster_connection_from_user_lockres(lockres);
 
-	if (level != LKM_EXMODE &&
-	    level != LKM_PRMODE) {
+	if (level != DLM_LOCK_EX &&
+	    level != DLM_LOCK_PR) {
 		mlog(ML_ERROR, "lockres %.*s: invalid request!\n",
 		     lockres->l_namelen, lockres->l_name);
 		status = -EINVAL;
 		goto bail;
 	}
 
-	mlog(0, "lockres %.*s: asking for %s lock, passed flags = 0x%x\n",
-	     lockres->l_namelen, lockres->l_name,
-	     (level == LKM_EXMODE) ? "LKM_EXMODE" : "LKM_PRMODE",
-	     lkm_flags);
+	mlog(ML_BASTS, "lockres %.*s, level %d, flags = 0x%x\n",
+	     lockres->l_namelen, lockres->l_name, level, lkm_flags);
 
 again:
 	if (signal_pending(current)) {
@@ -457,35 +473,26 @@ again:
 	}
 
 	if (level > lockres->l_level) {
-		local_flags = lkm_flags | LKM_VALBLK;
-		if (lockres->l_level != LKM_IVMODE)
-			local_flags |= LKM_CONVERT;
+		local_flags = lkm_flags | DLM_LKF_VALBLK;
+		if (lockres->l_level != DLM_LOCK_IV)
+			local_flags |= DLM_LKF_CONVERT;
 
 		lockres->l_requested = level;
 		lockres->l_flags |= USER_LOCK_BUSY;
 		spin_unlock(&lockres->l_lock);
 
-		BUG_ON(level == LKM_IVMODE);
-		BUG_ON(level == LKM_NLMODE);
+		BUG_ON(level == DLM_LOCK_IV);
+		BUG_ON(level == DLM_LOCK_NL);
 
 		/* call dlm_lock to upgrade lock now */
-		status = dlmlock(dlm,
-				 level,
-				 &lockres->l_lksb,
-				 local_flags,
-				 lockres->l_name,
-				 lockres->l_namelen,
-				 user_ast,
-				 lockres,
-				 user_bast);
-		if (status != DLM_NORMAL) {
-			if ((lkm_flags & LKM_NOQUEUE) &&
-			    (status == DLM_NOTQUEUED))
-				status = -EAGAIN;
-			else {
-				user_log_dlm_error("dlmlock", status, lockres);
-				status = -EINVAL;
-			}
+		status = ocfs2_dlm_lock(conn, level, &lockres->l_lksb,
+					local_flags, lockres->l_name,
+					lockres->l_namelen);
+		if (status) {
+			if ((lkm_flags & DLM_LKF_NOQUEUE) &&
+			    (status != -EAGAIN))
+				user_log_dlm_error("ocfs2_dlm_lock",
+						   status, lockres);
 			user_recover_from_dlm_error(lockres);
 			goto bail;
 		}
@@ -506,11 +513,11 @@ static inline void user_dlm_dec_holders(struct user_lock_res *lockres,
 					int level)
 {
 	switch(level) {
-	case LKM_EXMODE:
+	case DLM_LOCK_EX:
 		BUG_ON(!lockres->l_ex_holders);
 		lockres->l_ex_holders--;
 		break;
-	case LKM_PRMODE:
+	case DLM_LOCK_PR:
 		BUG_ON(!lockres->l_ro_holders);
 		lockres->l_ro_holders--;
 		break;
@@ -522,8 +529,8 @@ static inline void user_dlm_dec_holders(struct user_lock_res *lockres,
 void user_dlm_cluster_unlock(struct user_lock_res *lockres,
 			     int level)
 {
-	if (level != LKM_EXMODE &&
-	    level != LKM_PRMODE) {
+	if (level != DLM_LOCK_EX &&
+	    level != DLM_LOCK_PR) {
 		mlog(ML_ERROR, "lockres %.*s: invalid request!\n",
 		     lockres->l_namelen, lockres->l_name);
 		return;
@@ -540,33 +547,40 @@ void user_dlm_write_lvb(struct inode *inode,
 			unsigned int len)
 {
 	struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres;
-	char *lvb = lockres->l_lksb.lvb;
+	char *lvb;
 
 	BUG_ON(len > DLM_LVB_LEN);
 
 	spin_lock(&lockres->l_lock);
 
-	BUG_ON(lockres->l_level < LKM_EXMODE);
+	BUG_ON(lockres->l_level < DLM_LOCK_EX);
+	lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
 	memcpy(lvb, val, len);
 
 	spin_unlock(&lockres->l_lock);
 }
 
-void user_dlm_read_lvb(struct inode *inode,
-		       char *val,
-		       unsigned int len)
+ssize_t user_dlm_read_lvb(struct inode *inode,
+			  char *val,
+			  unsigned int len)
 {
 	struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres;
-	char *lvb = lockres->l_lksb.lvb;
+	char *lvb;
+	ssize_t ret = len;
 
 	BUG_ON(len > DLM_LVB_LEN);
 
 	spin_lock(&lockres->l_lock);
 
-	BUG_ON(lockres->l_level < LKM_PRMODE);
-	memcpy(val, lvb, len);
+	BUG_ON(lockres->l_level < DLM_LOCK_PR);
+	if (ocfs2_dlm_lvb_valid(&lockres->l_lksb)) {
+		lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
+		memcpy(val, lvb, len);
+	} else
+		ret = 0;
 
 	spin_unlock(&lockres->l_lock);
+	return ret;
 }
 
 void user_dlm_lock_res_init(struct user_lock_res *lockres,
@@ -576,9 +590,9 @@ void user_dlm_lock_res_init(struct user_lock_res *lockres,
 
 	spin_lock_init(&lockres->l_lock);
 	init_waitqueue_head(&lockres->l_event);
-	lockres->l_level = LKM_IVMODE;
-	lockres->l_requested = LKM_IVMODE;
-	lockres->l_blocking = LKM_IVMODE;
+	lockres->l_level = DLM_LOCK_IV;
+	lockres->l_requested = DLM_LOCK_IV;
+	lockres->l_blocking = DLM_LOCK_IV;
 
 	/* should have been checked before getting here. */
 	BUG_ON(dentry->d_name.len >= USER_DLM_LOCK_ID_MAX_LEN);
@@ -592,9 +606,10 @@ void user_dlm_lock_res_init(struct user_lock_res *lockres,
 int user_dlm_destroy_lock(struct user_lock_res *lockres)
 {
 	int status = -EBUSY;
-	struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres);
+	struct ocfs2_cluster_connection *conn =
+		cluster_connection_from_user_lockres(lockres);
 
-	mlog(0, "asked to destroy %.*s\n", lockres->l_namelen, lockres->l_name);
+	mlog(ML_BASTS, "lockres %.*s\n", lockres->l_namelen, lockres->l_name);
 
 	spin_lock(&lockres->l_lock);
 	if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) {
@@ -627,14 +642,9 @@ int user_dlm_destroy_lock(struct user_lock_res *lockres)
 	lockres->l_flags |= USER_LOCK_BUSY;
 	spin_unlock(&lockres->l_lock);
 
-	status = dlmunlock(dlm,
-			   &lockres->l_lksb,
-			   LKM_VALBLK,
-			   user_unlock_ast,
-			   lockres);
-	if (status != DLM_NORMAL) {
-		user_log_dlm_error("dlmunlock", status, lockres);
-		status = -EINVAL;
+	status = ocfs2_dlm_unlock(conn, &lockres->l_lksb, DLM_LKF_VALBLK);
+	if (status) {
+		user_log_dlm_error("ocfs2_dlm_unlock", status, lockres);
 		goto bail;
 	}
 
@@ -645,32 +655,34 @@ bail:
 	return status;
 }
 
-struct dlm_ctxt *user_dlm_register_context(struct qstr *name,
-					   struct dlm_protocol_version *proto)
+static void user_dlm_recovery_handler_noop(int node_num,
+					   void *recovery_data)
 {
-	struct dlm_ctxt *dlm;
-	u32 dlm_key;
-	char *domain;
-
-	domain = kmalloc(name->len + 1, GFP_NOFS);
-	if (!domain) {
-		mlog_errno(-ENOMEM);
-		return ERR_PTR(-ENOMEM);
-	}
+	/* We ignore recovery events */
+	return;
+}
 
-	dlm_key = crc32_le(0, name->name, name->len);
+void user_dlm_set_locking_protocol(void)
+{
+	ocfs2_stack_glue_set_max_proto_version(&user_dlm_lproto.lp_max_version);
+}
 
-	snprintf(domain, name->len + 1, "%.*s", name->len, name->name);
+struct ocfs2_cluster_connection *user_dlm_register(struct qstr *name)
+{
+	int rc;
+	struct ocfs2_cluster_connection *conn;
 
-	dlm = dlm_register_domain(domain, dlm_key, proto);
-	if (IS_ERR(dlm))
-		mlog_errno(PTR_ERR(dlm));
+	rc = ocfs2_cluster_connect_agnostic(name->name, name->len,
+					    &user_dlm_lproto,
+					    user_dlm_recovery_handler_noop,
+					    NULL, &conn);
+	if (rc)
+		mlog_errno(rc);
 
-	kfree(domain);
-	return dlm;
+	return rc ? ERR_PTR(rc) : conn;
 }
 
-void user_dlm_unregister_context(struct dlm_ctxt *dlm)
+void user_dlm_unregister(struct ocfs2_cluster_connection *conn)
 {
-	dlm_unregister_domain(dlm);
+	ocfs2_cluster_disconnect(conn, 0);
 }
diff --git a/fs/ocfs2/dlm/userdlm.h b/fs/ocfs2/dlmfs/userdlm.h
index 0c3cc03c61fa..3b42d79531d7 100644
--- a/fs/ocfs2/dlm/userdlm.h
+++ b/fs/ocfs2/dlmfs/userdlm.h
@@ -57,7 +57,7 @@ struct user_lock_res {
 	int                      l_level;
 	unsigned int             l_ro_holders;
 	unsigned int             l_ex_holders;
-	struct dlm_lockstatus    l_lksb;
+	struct ocfs2_dlm_lksb    l_lksb;
 
 	int                      l_requested;
 	int                      l_blocking;
@@ -80,15 +80,15 @@ void user_dlm_cluster_unlock(struct user_lock_res *lockres,
 void user_dlm_write_lvb(struct inode *inode,
 			const char *val,
 			unsigned int len);
-void user_dlm_read_lvb(struct inode *inode,
-		       char *val,
-		       unsigned int len);
-struct dlm_ctxt *user_dlm_register_context(struct qstr *name,
-					   struct dlm_protocol_version *proto);
-void user_dlm_unregister_context(struct dlm_ctxt *dlm);
+ssize_t user_dlm_read_lvb(struct inode *inode,
+			  char *val,
+			  unsigned int len);
+struct ocfs2_cluster_connection *user_dlm_register(struct qstr *name);
+void user_dlm_unregister(struct ocfs2_cluster_connection *conn);
+void user_dlm_set_locking_protocol(void);
 
 struct dlmfs_inode_private {
-	struct dlm_ctxt             *ip_dlm;
+	struct ocfs2_cluster_connection	*ip_conn;
 
 	struct user_lock_res ip_lockres; /* unused for directories. */
 	struct inode         *ip_parent;
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index c5e4a49e3a12..39eb16ac5f98 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -297,6 +297,11 @@ static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
 		lockres->l_type == OCFS2_LOCK_TYPE_OPEN;
 }
 
+static inline struct ocfs2_lock_res *ocfs2_lksb_to_lock_res(struct ocfs2_dlm_lksb *lksb)
+{
+	return container_of(lksb, struct ocfs2_lock_res, l_lksb);
+}
+
 static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres)
 {
 	BUG_ON(!ocfs2_is_inode_lock(lockres));
@@ -875,6 +880,14 @@ static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lo
 		lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
 
 	lockres->l_level = lockres->l_requested;
+
+	/*
+	 * We set the OCFS2_LOCK_UPCONVERT_FINISHING flag before clearing
+	 * the OCFS2_LOCK_BUSY flag to prevent the dc thread from
+	 * downconverting the lock before the upconvert has fully completed.
+	 */
+	lockres_or_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING);
+
 	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
 
 	mlog_exit_void();
@@ -907,8 +920,6 @@ static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres,
 
 	assert_spin_locked(&lockres->l_lock);
 
-	lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED);
-
 	if (level > lockres->l_blocking) {
 		/* only schedule a downconvert if we haven't already scheduled
 		 * one that goes low enough to satisfy the level we're
@@ -921,6 +932,13 @@ static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres,
 		lockres->l_blocking = level;
 	}
 
+	mlog(ML_BASTS, "lockres %s, block %d, level %d, l_block %d, dwn %d\n",
+	     lockres->l_name, level, lockres->l_level, lockres->l_blocking,
+	     needs_downconvert);
+
+	if (needs_downconvert)
+		lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED);
+
 	mlog_exit(needs_downconvert);
 	return needs_downconvert;
 }
@@ -1031,18 +1049,17 @@ static unsigned int lockres_set_pending(struct ocfs2_lock_res *lockres)
 	return lockres->l_pending_gen;
 }
 
-
-static void ocfs2_blocking_ast(void *opaque, int level)
+static void ocfs2_blocking_ast(struct ocfs2_dlm_lksb *lksb, int level)
 {
-	struct ocfs2_lock_res *lockres = opaque;
+	struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb);
 	struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres);
 	int needs_downconvert;
 	unsigned long flags;
 
 	BUG_ON(level <= DLM_LOCK_NL);
 
-	mlog(0, "BAST fired for lockres %s, blocking %d, level %d type %s\n",
-	     lockres->l_name, level, lockres->l_level,
+	mlog(ML_BASTS, "BAST fired for lockres %s, blocking %d, level %d, "
+	     "type %s\n", lockres->l_name, level, lockres->l_level,
 	     ocfs2_lock_type_string(lockres->l_type));
 
 	/*
@@ -1063,9 +1080,9 @@ static void ocfs2_blocking_ast(void *opaque, int level)
 	ocfs2_wake_downconvert_thread(osb);
 }
 
-static void ocfs2_locking_ast(void *opaque)
+static void ocfs2_locking_ast(struct ocfs2_dlm_lksb *lksb)
 {
-	struct ocfs2_lock_res *lockres = opaque;
+	struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb);
 	struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres);
 	unsigned long flags;
 	int status;
@@ -1086,6 +1103,10 @@ static void ocfs2_locking_ast(void *opaque)
 		return;
 	}
 
+	mlog(ML_BASTS, "AST fired for lockres %s, action %d, unlock %d, "
+	     "level %d => %d\n", lockres->l_name, lockres->l_action,
+	     lockres->l_unlock_action, lockres->l_level, lockres->l_requested);
+
 	switch(lockres->l_action) {
 	case OCFS2_AST_ATTACH:
 		ocfs2_generic_handle_attach_action(lockres);
@@ -1098,8 +1119,8 @@ static void ocfs2_locking_ast(void *opaque)
 		ocfs2_generic_handle_downconvert_action(lockres);
 		break;
 	default:
-		mlog(ML_ERROR, "lockres %s: ast fired with invalid action: %u "
-		     "lockres flags = 0x%lx, unlock action: %u\n",
+		mlog(ML_ERROR, "lockres %s: AST fired with invalid action: %u, "
+		     "flags 0x%lx, unlock: %u\n",
 		     lockres->l_name, lockres->l_action, lockres->l_flags,
 		     lockres->l_unlock_action);
 		BUG();
@@ -1125,6 +1146,88 @@ out:
 	spin_unlock_irqrestore(&lockres->l_lock, flags);
 }
 
+static void ocfs2_unlock_ast(struct ocfs2_dlm_lksb *lksb, int error)
+{
+	struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb);
+	unsigned long flags;
+
+	mlog_entry_void();
+
+	mlog(ML_BASTS, "UNLOCK AST fired for lockres %s, action = %d\n",
+	     lockres->l_name, lockres->l_unlock_action);
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+	if (error) {
+		mlog(ML_ERROR, "Dlm passes error %d for lock %s, "
+		     "unlock_action %d\n", error, lockres->l_name,
+		     lockres->l_unlock_action);
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+		mlog_exit_void();
+		return;
+	}
+
+	switch(lockres->l_unlock_action) {
+	case OCFS2_UNLOCK_CANCEL_CONVERT:
+		mlog(0, "Cancel convert success for %s\n", lockres->l_name);
+		lockres->l_action = OCFS2_AST_INVALID;
+		/* Downconvert thread may have requeued this lock, we
+		 * need to wake it. */
+		if (lockres->l_flags & OCFS2_LOCK_BLOCKED)
+			ocfs2_wake_downconvert_thread(ocfs2_get_lockres_osb(lockres));
+		break;
+	case OCFS2_UNLOCK_DROP_LOCK:
+		lockres->l_level = DLM_LOCK_IV;
+		break;
+	default:
+		BUG();
+	}
+
+	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
+	lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
+	wake_up(&lockres->l_event);
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	mlog_exit_void();
+}
+
+/*
+ * This is the filesystem locking protocol.  It provides the lock handling
+ * hooks for the underlying DLM.  It has a maximum version number.
+ * The version number allows interoperability with systems running at
+ * the same major number and an equal or smaller minor number.
+ *
+ * Whenever the filesystem does new things with locks (adds or removes a
+ * lock, orders them differently, does different things underneath a lock),
+ * the version must be changed.  The protocol is negotiated when joining
+ * the dlm domain.  A node may join the domain if its major version is
+ * identical to all other nodes and its minor version is greater than
+ * or equal to all other nodes.  When its minor version is greater than
+ * the other nodes, it will run at the minor version specified by the
+ * other nodes.
+ *
+ * If a locking change is made that will not be compatible with older
+ * versions, the major number must be increased and the minor version set
+ * to zero.  If a change merely adds a behavior that can be disabled when
+ * speaking to older versions, the minor version must be increased.  If a
+ * change adds a fully backwards compatible change (eg, LVB changes that
+ * are just ignored by older versions), the version does not need to be
+ * updated.
+ */
+static struct ocfs2_locking_protocol lproto = {
+	.lp_max_version = {
+		.pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR,
+		.pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR,
+	},
+	.lp_lock_ast		= ocfs2_locking_ast,
+	.lp_blocking_ast	= ocfs2_blocking_ast,
+	.lp_unlock_ast		= ocfs2_unlock_ast,
+};
+
+void ocfs2_set_locking_protocol(void)
+{
+	ocfs2_stack_glue_set_max_proto_version(&lproto.lp_max_version);
+}
+
 static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
 						int convert)
 {
@@ -1133,6 +1236,7 @@ static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
 	mlog_entry_void();
 	spin_lock_irqsave(&lockres->l_lock, flags);
 	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
+	lockres_clear_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING);
 	if (convert)
 		lockres->l_action = OCFS2_AST_INVALID;
 	else
@@ -1179,8 +1283,7 @@ static int ocfs2_lock_create(struct ocfs2_super *osb,
 			     &lockres->l_lksb,
 			     dlm_flags,
 			     lockres->l_name,
-			     OCFS2_LOCK_ID_MAX_LEN - 1,
-			     lockres);
+			     OCFS2_LOCK_ID_MAX_LEN - 1);
 	lockres_clear_pending(lockres, gen, osb);
 	if (ret) {
 		ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres);
@@ -1323,13 +1426,13 @@ static int __ocfs2_cluster_lock(struct ocfs2_super *osb,
 again:
 	wait = 0;
 
+	spin_lock_irqsave(&lockres->l_lock, flags);
+
 	if (catch_signals && signal_pending(current)) {
 		ret = -ERESTARTSYS;
-		goto out;
+		goto unlock;
 	}
 
-	spin_lock_irqsave(&lockres->l_lock, flags);
-
 	mlog_bug_on_msg(lockres->l_flags & OCFS2_LOCK_FREEING,
 			"Cluster lock called on freeing lockres %s! flags "
 			"0x%lx\n", lockres->l_name, lockres->l_flags);
@@ -1346,6 +1449,25 @@ again:
 		goto unlock;
 	}
 
+	if (lockres->l_flags & OCFS2_LOCK_UPCONVERT_FINISHING) {
+		/*
+		 * We've upconverted. If the lock now has a level we can
+		 * work with, we take it. If, however, the lock is not at the
+		 * required level, we go thru the full cycle. One way this could
+		 * happen is if a process requesting an upconvert to PR is
+		 * closely followed by another requesting upconvert to an EX.
+		 * If the process requesting EX lands here, we want it to
+		 * continue attempting to upconvert and let the process
+		 * requesting PR take the lock.
+		 * If multiple processes request upconvert to PR, the first one
+		 * here will take the lock. The others will have to go thru the
+		 * OCFS2_LOCK_BLOCKED check to ensure that there is no pending
+		 * downconvert request.
+		 */
+		if (level <= lockres->l_level)
+			goto update_holders;
+	}
+
 	if (lockres->l_flags & OCFS2_LOCK_BLOCKED &&
 	    !ocfs2_may_continue_on_blocked_lock(lockres, level)) {
 		/* is the lock is currently blocked on behalf of
@@ -1383,7 +1505,7 @@ again:
 		BUG_ON(level == DLM_LOCK_IV);
 		BUG_ON(level == DLM_LOCK_NL);
 
-		mlog(0, "lock %s, convert from %d to level = %d\n",
+		mlog(ML_BASTS, "lockres %s, convert from %d to %d\n",
 		     lockres->l_name, lockres->l_level, level);
 
 		/* call dlm_lock to upgrade lock now */
@@ -1392,8 +1514,7 @@ again:
 				     &lockres->l_lksb,
 				     lkm_flags,
 				     lockres->l_name,
-				     OCFS2_LOCK_ID_MAX_LEN - 1,
-				     lockres);
+				     OCFS2_LOCK_ID_MAX_LEN - 1);
 		lockres_clear_pending(lockres, gen, osb);
 		if (ret) {
 			if (!(lkm_flags & DLM_LKF_NOQUEUE) ||
@@ -1416,11 +1537,14 @@ again:
 		goto again;
 	}
 
+update_holders:
 	/* Ok, if we get here then we're good to go. */
 	ocfs2_inc_holders(lockres, level);
 
 	ret = 0;
 unlock:
+	lockres_clear_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING);
+
 	spin_unlock_irqrestore(&lockres->l_lock, flags);
 out:
 	/*
@@ -1757,7 +1881,7 @@ out:
  * ocfs2_file_lock() and ocfs2_file_unlock() map to a single pair of
  * flock() calls. The locking approach this requires is sufficiently
  * different from all other cluster lock types that we implement a
- * seperate path to the "low-level" dlm calls. In particular:
+ * separate path to the "low-level" dlm calls. In particular:
  *
  * - No optimization of lock levels is done - we take at exactly
  *   what's been requested.
@@ -1827,8 +1951,7 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock)
 	spin_unlock_irqrestore(&lockres->l_lock, flags);
 
 	ret = ocfs2_dlm_lock(osb->cconn, level, &lockres->l_lksb, lkm_flags,
-			     lockres->l_name, OCFS2_LOCK_ID_MAX_LEN - 1,
-			     lockres);
+			     lockres->l_name, OCFS2_LOCK_ID_MAX_LEN - 1);
 	if (ret) {
 		if (!trylock || (ret != -EAGAIN)) {
 			ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres);
@@ -2957,7 +3080,7 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
 	status = ocfs2_cluster_connect(osb->osb_cluster_stack,
 				       osb->uuid_str,
 				       strlen(osb->uuid_str),
-				       ocfs2_do_node_down, osb,
+				       &lproto, ocfs2_do_node_down, osb,
 				       &conn);
 	if (status) {
 		mlog_errno(status);
@@ -3024,50 +3147,6 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb,
 	mlog_exit_void();
 }
 
-static void ocfs2_unlock_ast(void *opaque, int error)
-{
-	struct ocfs2_lock_res *lockres = opaque;
-	unsigned long flags;
-
-	mlog_entry_void();
-
-	mlog(0, "UNLOCK AST called on lock %s, action = %d\n", lockres->l_name,
-	     lockres->l_unlock_action);
-
-	spin_lock_irqsave(&lockres->l_lock, flags);
-	if (error) {
-		mlog(ML_ERROR, "Dlm passes error %d for lock %s, "
-		     "unlock_action %d\n", error, lockres->l_name,
-		     lockres->l_unlock_action);
-		spin_unlock_irqrestore(&lockres->l_lock, flags);
-		mlog_exit_void();
-		return;
-	}
-
-	switch(lockres->l_unlock_action) {
-	case OCFS2_UNLOCK_CANCEL_CONVERT:
-		mlog(0, "Cancel convert success for %s\n", lockres->l_name);
-		lockres->l_action = OCFS2_AST_INVALID;
-		/* Downconvert thread may have requeued this lock, we
-		 * need to wake it. */
-		if (lockres->l_flags & OCFS2_LOCK_BLOCKED)
-			ocfs2_wake_downconvert_thread(ocfs2_get_lockres_osb(lockres));
-		break;
-	case OCFS2_UNLOCK_DROP_LOCK:
-		lockres->l_level = DLM_LOCK_IV;
-		break;
-	default:
-		BUG();
-	}
-
-	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
-	lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
-	wake_up(&lockres->l_event);
-	spin_unlock_irqrestore(&lockres->l_lock, flags);
-
-	mlog_exit_void();
-}
-
 static int ocfs2_drop_lock(struct ocfs2_super *osb,
 			   struct ocfs2_lock_res *lockres)
 {
@@ -3135,8 +3214,7 @@ static int ocfs2_drop_lock(struct ocfs2_super *osb,
 
 	mlog(0, "lock %s\n", lockres->l_name);
 
-	ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb, lkm_flags,
-			       lockres);
+	ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb, lkm_flags);
 	if (ret) {
 		ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres);
 		mlog(ML_ERROR, "lockres flags: %lu\n", lockres->l_flags);
@@ -3155,7 +3233,7 @@ out:
 /* Mark the lockres as being dropped. It will no longer be
  * queued if blocking, but we still may have to wait on it
  * being dequeued from the downconvert thread before we can consider
- * it safe to drop. 
+ * it safe to drop.
  *
  * You can *not* attempt to call cluster_lock on this lockres anymore. */
 void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres)
@@ -3244,13 +3322,20 @@ static unsigned int ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
 	BUG_ON(lockres->l_blocking <= DLM_LOCK_NL);
 
 	if (lockres->l_level <= new_level) {
-		mlog(ML_ERROR, "lockres->l_level (%d) <= new_level (%d)\n",
-		     lockres->l_level, new_level);
+		mlog(ML_ERROR, "lockres %s, lvl %d <= %d, blcklst %d, mask %d, "
+		     "type %d, flags 0x%lx, hold %d %d, act %d %d, req %d, "
+		     "block %d, pgen %d\n", lockres->l_name, lockres->l_level,
+		     new_level, list_empty(&lockres->l_blocked_list),
+		     list_empty(&lockres->l_mask_waiters), lockres->l_type,
+		     lockres->l_flags, lockres->l_ro_holders,
+		     lockres->l_ex_holders, lockres->l_action,
+		     lockres->l_unlock_action, lockres->l_requested,
+		     lockres->l_blocking, lockres->l_pending_gen);
 		BUG();
 	}
 
-	mlog(0, "lock %s, new_level = %d, l_blocking = %d\n",
-	     lockres->l_name, new_level, lockres->l_blocking);
+	mlog(ML_BASTS, "lockres %s, level %d => %d, blocking %d\n",
+	     lockres->l_name, lockres->l_level, new_level, lockres->l_blocking);
 
 	lockres->l_action = OCFS2_AST_DOWNCONVERT;
 	lockres->l_requested = new_level;
@@ -3269,6 +3354,9 @@ static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
 
 	mlog_entry_void();
 
+	mlog(ML_BASTS, "lockres %s, level %d => %d\n", lockres->l_name,
+	     lockres->l_level, new_level);
+
 	if (lvb)
 		dlm_flags |= DLM_LKF_VALBLK;
 
@@ -3277,8 +3365,7 @@ static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
 			     &lockres->l_lksb,
 			     dlm_flags,
 			     lockres->l_name,
-			     OCFS2_LOCK_ID_MAX_LEN - 1,
-			     lockres);
+			     OCFS2_LOCK_ID_MAX_LEN - 1);
 	lockres_clear_pending(lockres, generation, osb);
 	if (ret) {
 		ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres);
@@ -3299,14 +3386,12 @@ static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
 	assert_spin_locked(&lockres->l_lock);
 
 	mlog_entry_void();
-	mlog(0, "lock %s\n", lockres->l_name);
 
 	if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) {
 		/* If we're already trying to cancel a lock conversion
 		 * then just drop the spinlock and allow the caller to
 		 * requeue this lock. */
-
-		mlog(0, "Lockres %s, skip convert\n", lockres->l_name);
+		mlog(ML_BASTS, "lockres %s, skip convert\n", lockres->l_name);
 		return 0;
 	}
 
@@ -3321,6 +3406,8 @@ static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
 			"lock %s, invalid flags: 0x%lx\n",
 			lockres->l_name, lockres->l_flags);
 
+	mlog(ML_BASTS, "lockres %s\n", lockres->l_name);
+
 	return 1;
 }
 
@@ -3330,16 +3417,15 @@ static int ocfs2_cancel_convert(struct ocfs2_super *osb,
 	int ret;
 
 	mlog_entry_void();
-	mlog(0, "lock %s\n", lockres->l_name);
 
 	ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb,
-			       DLM_LKF_CANCEL, lockres);
+			       DLM_LKF_CANCEL);
 	if (ret) {
 		ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres);
 		ocfs2_recover_from_dlm_error(lockres, 0);
 	}
 
-	mlog(0, "lock %s return from ocfs2_dlm_unlock\n", lockres->l_name);
+	mlog(ML_BASTS, "lockres %s\n", lockres->l_name);
 
 	mlog_exit(ret);
 	return ret;
@@ -3352,6 +3438,7 @@ static int ocfs2_unblock_lock(struct ocfs2_super *osb,
 	unsigned long flags;
 	int blocking;
 	int new_level;
+	int level;
 	int ret = 0;
 	int set_lvb = 0;
 	unsigned int gen;
@@ -3360,9 +3447,17 @@ static int ocfs2_unblock_lock(struct ocfs2_super *osb,
 
 	spin_lock_irqsave(&lockres->l_lock, flags);
 
-	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
-
 recheck:
+	/*
+	 * Is it still blocking? If not, we have no more work to do.
+	 */
+	if (!(lockres->l_flags & OCFS2_LOCK_BLOCKED)) {
+		BUG_ON(lockres->l_blocking != DLM_LOCK_NL);
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+		ret = 0;
+		goto leave;
+	}
+
 	if (lockres->l_flags & OCFS2_LOCK_BUSY) {
 		/* XXX
 		 * This is a *big* race.  The OCFS2_LOCK_PENDING flag
@@ -3387,8 +3482,11 @@ recheck:
 		 * at the same time they set OCFS2_DLM_BUSY.  They must
 		 * clear OCFS2_DLM_PENDING after dlm_lock() returns.
 		 */
-		if (lockres->l_flags & OCFS2_LOCK_PENDING)
+		if (lockres->l_flags & OCFS2_LOCK_PENDING) {
+			mlog(ML_BASTS, "lockres %s, ReQ: Pending\n",
+			     lockres->l_name);
 			goto leave_requeue;
+		}
 
 		ctl->requeue = 1;
 		ret = ocfs2_prepare_cancel_convert(osb, lockres);
@@ -3401,31 +3499,70 @@ recheck:
 		goto leave;
 	}
 
+	/*
+	 * This prevents livelocks. OCFS2_LOCK_UPCONVERT_FINISHING flag is
+	 * set when the ast is received for an upconvert just before the
+	 * OCFS2_LOCK_BUSY flag is cleared. Now if the fs received a bast
+	 * on the heels of the ast, we want to delay the downconvert just
+	 * enough to allow the up requestor to do its task. Because this
+	 * lock is in the blocked queue, the lock will be downconverted
+	 * as soon as the requestor is done with the lock.
+	 */
+	if (lockres->l_flags & OCFS2_LOCK_UPCONVERT_FINISHING)
+		goto leave_requeue;
+
+	/*
+	 * How can we block and yet be at NL?  We were trying to upconvert
+	 * from NL and got canceled.  The code comes back here, and now
+	 * we notice and clear BLOCKING.
+	 */
+	if (lockres->l_level == DLM_LOCK_NL) {
+		BUG_ON(lockres->l_ex_holders || lockres->l_ro_holders);
+		mlog(ML_BASTS, "lockres %s, Aborting dc\n", lockres->l_name);
+		lockres->l_blocking = DLM_LOCK_NL;
+		lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED);
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+		goto leave;
+	}
+
 	/* if we're blocking an exclusive and we have *any* holders,
 	 * then requeue. */
 	if ((lockres->l_blocking == DLM_LOCK_EX)
-	    && (lockres->l_ex_holders || lockres->l_ro_holders))
+	    && (lockres->l_ex_holders || lockres->l_ro_holders)) {
+		mlog(ML_BASTS, "lockres %s, ReQ: EX/PR Holders %u,%u\n",
+		     lockres->l_name, lockres->l_ex_holders,
+		     lockres->l_ro_holders);
 		goto leave_requeue;
+	}
 
 	/* If it's a PR we're blocking, then only
 	 * requeue if we've got any EX holders */
 	if (lockres->l_blocking == DLM_LOCK_PR &&
-	    lockres->l_ex_holders)
+	    lockres->l_ex_holders) {
+		mlog(ML_BASTS, "lockres %s, ReQ: EX Holders %u\n",
+		     lockres->l_name, lockres->l_ex_holders);
 		goto leave_requeue;
+	}
 
 	/*
 	 * Can we get a lock in this state if the holder counts are
 	 * zero? The meta data unblock code used to check this.
 	 */
 	if ((lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH)
-	    && (lockres->l_flags & OCFS2_LOCK_REFRESHING))
+	    && (lockres->l_flags & OCFS2_LOCK_REFRESHING)) {
+		mlog(ML_BASTS, "lockres %s, ReQ: Lock Refreshing\n",
+		     lockres->l_name);
 		goto leave_requeue;
+	}
 
 	new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking);
 
 	if (lockres->l_ops->check_downconvert
-	    && !lockres->l_ops->check_downconvert(lockres, new_level))
+	    && !lockres->l_ops->check_downconvert(lockres, new_level)) {
+		mlog(ML_BASTS, "lockres %s, ReQ: Checkpointing\n",
+		     lockres->l_name);
 		goto leave_requeue;
+	}
 
 	/* If we get here, then we know that there are no more
 	 * incompatible holders (and anyone asking for an incompatible
@@ -3438,17 +3575,24 @@ recheck:
 	 * may sleep, so we save off a copy of what we're blocking as
 	 * it may change while we're not holding the spin lock. */
 	blocking = lockres->l_blocking;
+	level = lockres->l_level;
 	spin_unlock_irqrestore(&lockres->l_lock, flags);
 
 	ctl->unblock_action = lockres->l_ops->downconvert_worker(lockres, blocking);
 
-	if (ctl->unblock_action == UNBLOCK_STOP_POST)
+	if (ctl->unblock_action == UNBLOCK_STOP_POST) {
+		mlog(ML_BASTS, "lockres %s, UNBLOCK_STOP_POST\n",
+		     lockres->l_name);
 		goto leave;
+	}
 
 	spin_lock_irqsave(&lockres->l_lock, flags);
-	if (blocking != lockres->l_blocking) {
+	if ((blocking != lockres->l_blocking) || (level != lockres->l_level)) {
 		/* If this changed underneath us, then we can't drop
 		 * it just yet. */
+		mlog(ML_BASTS, "lockres %s, block=%d:%d, level=%d:%d, "
+		     "Recheck\n", lockres->l_name, blocking,
+		     lockres->l_blocking, level, lockres->l_level);
 		goto recheck;
 	}
 
@@ -3753,7 +3897,8 @@ static int ocfs2_refresh_qinfo(struct ocfs2_mem_dqinfo *oinfo)
 		oinfo->dqi_gi.dqi_free_entry =
 					be32_to_cpu(lvb->lvb_free_entry);
 	} else {
-		status = ocfs2_read_quota_block(oinfo->dqi_gqinode, 0, &bh);
+		status = ocfs2_read_quota_phys_block(oinfo->dqi_gqinode,
+						     oinfo->dqi_giblk, &bh);
 		if (status) {
 			mlog_errno(status);
 			goto bail;
@@ -3843,45 +3988,6 @@ void ocfs2_refcount_unlock(struct ocfs2_refcount_tree *ref_tree, int ex)
 		ocfs2_cluster_unlock(osb, lockres, level);
 }
 
-/*
- * This is the filesystem locking protocol.  It provides the lock handling
- * hooks for the underlying DLM.  It has a maximum version number.
- * The version number allows interoperability with systems running at
- * the same major number and an equal or smaller minor number.
- *
- * Whenever the filesystem does new things with locks (adds or removes a
- * lock, orders them differently, does different things underneath a lock),
- * the version must be changed.  The protocol is negotiated when joining
- * the dlm domain.  A node may join the domain if its major version is
- * identical to all other nodes and its minor version is greater than
- * or equal to all other nodes.  When its minor version is greater than
- * the other nodes, it will run at the minor version specified by the
- * other nodes.
- *
- * If a locking change is made that will not be compatible with older
- * versions, the major number must be increased and the minor version set
- * to zero.  If a change merely adds a behavior that can be disabled when
- * speaking to older versions, the minor version must be increased.  If a
- * change adds a fully backwards compatible change (eg, LVB changes that
- * are just ignored by older versions), the version does not need to be
- * updated.
- */
-static struct ocfs2_locking_protocol lproto = {
-	.lp_max_version = {
-		.pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR,
-		.pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR,
-	},
-	.lp_lock_ast		= ocfs2_locking_ast,
-	.lp_blocking_ast	= ocfs2_blocking_ast,
-	.lp_unlock_ast		= ocfs2_unlock_ast,
-};
-
-void ocfs2_set_locking_protocol(void)
-{
-	ocfs2_stack_glue_set_locking_protocol(&lproto);
-}
-
-
 static void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
 				       struct ocfs2_lock_res *lockres)
 {
@@ -3898,7 +4004,7 @@ static void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
 	BUG_ON(!lockres);
 	BUG_ON(!lockres->l_ops);
 
-	mlog(0, "lockres %s blocked.\n", lockres->l_name);
+	mlog(ML_BASTS, "lockres %s blocked\n", lockres->l_name);
 
 	/* Detect whether a lock has been marked as going away while
 	 * the downconvert thread was processing other things. A lock can
@@ -3921,7 +4027,7 @@ unqueue:
 	} else
 		ocfs2_schedule_blocked_lock(osb, lockres);
 
-	mlog(0, "lockres %s, requeue = %s.\n", lockres->l_name,
+	mlog(ML_BASTS, "lockres %s, requeue = %s.\n", lockres->l_name,
 	     ctl.requeue ? "yes" : "no");
 	spin_unlock_irqrestore(&lockres->l_lock, flags);
 
@@ -3943,7 +4049,7 @@ static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
 		/* Do not schedule a lock for downconvert when it's on
 		 * the way to destruction - any nodes wanting access
 		 * to the resource will get it soon. */
-		mlog(0, "Lockres %s won't be scheduled: flags 0x%lx\n",
+		mlog(ML_BASTS, "lockres %s won't be scheduled: flags 0x%lx\n",
 		     lockres->l_name, lockres->l_flags);
 		return;
 	}
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 15713cbb865c..19ad145d2af3 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -239,7 +239,7 @@ static int ocfs2_encode_fh(struct dentry *dentry, u32 *fh_in, int *max_len,
 		mlog(0, "Encoding parent: blkno: %llu, generation: %u\n",
 		     (unsigned long long)blkno, generation);
 	}
-	
+
 	*max_len = len;
 
 bail:
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index d35a27f4523e..09e3fdfa6d33 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -24,6 +24,7 @@
 
 #include <linux/fs.h>
 #include <linux/init.h>
+#include <linux/slab.h>
 #include <linux/types.h>
 #include <linux/fiemap.h>
 
@@ -192,7 +193,7 @@ static int ocfs2_try_to_merge_extent_map(struct ocfs2_extent_map_item *emi,
 		emi->ei_clusters += ins->ei_clusters;
 		return 1;
 	} else if ((ins->ei_phys + ins->ei_clusters) == emi->ei_phys &&
-		   (ins->ei_cpos + ins->ei_clusters) == emi->ei_phys &&
+		   (ins->ei_cpos + ins->ei_clusters) == emi->ei_cpos &&
 		   ins->ei_flags == emi->ei_flags) {
 		emi->ei_phys = ins->ei_phys;
 		emi->ei_cpos = ins->ei_cpos;
@@ -453,7 +454,7 @@ static int ocfs2_get_clusters_nocache(struct inode *inode,
 	if (i == -1) {
 		/*
 		 * Holes can be larger than the maximum size of an
-		 * extent, so we return their lengths in a seperate
+		 * extent, so we return their lengths in a separate
 		 * field.
 		 */
 		if (hole_len) {
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 06ccf6a86d35..2b10b36d1577 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -107,6 +107,9 @@ static int ocfs2_file_open(struct inode *inode, struct file *file)
 	mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file,
 		   file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name);
 
+	if (file->f_mode & FMODE_WRITE)
+		dquot_initialize(inode);
+
 	spin_lock(&oi->ip_lock);
 
 	/* Check that the inode hasn't been wiped from disk by another
@@ -172,13 +175,12 @@ static int ocfs2_dir_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
-static int ocfs2_sync_file(struct file *file,
-			   struct dentry *dentry,
-			   int datasync)
+static int ocfs2_sync_file(struct file *file, int datasync)
 {
 	int err = 0;
 	journal_t *journal;
-	struct inode *inode = dentry->d_inode;
+	struct dentry *dentry = file->f_path.dentry;
+	struct inode *inode = file->f_mapping->host;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
 	mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync,
@@ -275,10 +277,7 @@ int ocfs2_update_inode_atime(struct inode *inode,
 	inode->i_atime = CURRENT_TIME;
 	di->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
 	di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
-
-	ret = ocfs2_journal_dirty(handle, bh);
-	if (ret < 0)
-		mlog_errno(ret);
+	ocfs2_journal_dirty(handle, bh);
 
 out_commit:
 	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
@@ -427,9 +426,7 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
 	di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
 	di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
 
-	status = ocfs2_journal_dirty(handle, fe_bh);
-	if (status < 0)
-		mlog_errno(status);
+	ocfs2_journal_dirty(handle, fe_bh);
 
 out_commit:
 	ocfs2_commit_trans(osb, handle);
@@ -446,7 +443,6 @@ static int ocfs2_truncate_file(struct inode *inode,
 	int status = 0;
 	struct ocfs2_dinode *fe = NULL;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-	struct ocfs2_truncate_context *tc = NULL;
 
 	mlog_entry("(inode = %llu, new_i_size = %llu\n",
 		   (unsigned long long)OCFS2_I(inode)->ip_blkno,
@@ -485,6 +481,9 @@ static int ocfs2_truncate_file(struct inode *inode,
 
 	down_write(&OCFS2_I(inode)->ip_alloc_sem);
 
+	ocfs2_resv_discard(&osb->osb_la_resmap,
+			   &OCFS2_I(inode)->ip_la_data_resv);
+
 	/*
 	 * The inode lock forced other nodes to sync and drop their
 	 * pages, which (correctly) happens even if we have a truncate
@@ -514,13 +513,7 @@ static int ocfs2_truncate_file(struct inode *inode,
 		goto bail_unlock_sem;
 	}
 
-	status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc);
-	if (status < 0) {
-		mlog_errno(status);
-		goto bail_unlock_sem;
-	}
-
-	status = ocfs2_commit_truncate(osb, inode, di_bh, tc);
+	status = ocfs2_commit_truncate(osb, inode, di_bh);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail_unlock_sem;
@@ -629,11 +622,10 @@ restart_all:
 	}
 
 restarted_transaction:
-	if (vfs_dq_alloc_space_nodirty(inode, ocfs2_clusters_to_bytes(osb->sb,
-	    clusters_to_add))) {
-		status = -EDQUOT;
+	status = dquot_alloc_space_nodirty(inode,
+			ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
+	if (status)
 		goto leave;
-	}
 	did_quota = 1;
 
 	/* reserve a write to the file entry early on - that we if we
@@ -664,17 +656,13 @@ restarted_transaction:
 		goto leave;
 	}
 
-	status = ocfs2_journal_dirty(handle, bh);
-	if (status < 0) {
-		mlog_errno(status);
-		goto leave;
-	}
+	ocfs2_journal_dirty(handle, bh);
 
 	spin_lock(&OCFS2_I(inode)->ip_lock);
 	clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
 	spin_unlock(&OCFS2_I(inode)->ip_lock);
 	/* Release unused quota reservation */
-	vfs_dq_free_space(inode,
+	dquot_free_space(inode,
 			ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
 	did_quota = 0;
 
@@ -682,6 +670,7 @@ restarted_transaction:
 		if (why == RESTART_META) {
 			mlog(0, "restarting function.\n");
 			restart_func = 1;
+			status = 0;
 		} else {
 			BUG_ON(why != RESTART_TRANS);
 
@@ -710,7 +699,7 @@ restarted_transaction:
 
 leave:
 	if (status < 0 && did_quota)
-		vfs_dq_free_space(inode,
+		dquot_free_space(inode,
 			ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
 	if (handle) {
 		ocfs2_commit_trans(osb, handle);
@@ -735,28 +724,55 @@ leave:
 	return status;
 }
 
+/*
+ * While a write will already be ordering the data, a truncate will not.
+ * Thus, we need to explicitly order the zeroed pages.
+ */
+static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	handle_t *handle = NULL;
+	int ret = 0;
+
+	if (!ocfs2_should_order_data(inode))
+		goto out;
+
+	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+	if (IS_ERR(handle)) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_jbd2_file_inode(handle, inode);
+	if (ret < 0)
+		mlog_errno(ret);
+
+out:
+	if (ret) {
+		if (!IS_ERR(handle))
+			ocfs2_commit_trans(osb, handle);
+		handle = ERR_PTR(ret);
+	}
+	return handle;
+}
+
 /* Some parts of this taken from generic_cont_expand, which turned out
  * to be too fragile to do exactly what we need without us having to
  * worry about recursive locking in ->write_begin() and ->write_end(). */
-static int ocfs2_write_zero_page(struct inode *inode,
-				 u64 size)
+static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
+				 u64 abs_to)
 {
 	struct address_space *mapping = inode->i_mapping;
 	struct page *page;
-	unsigned long index;
-	unsigned int offset;
+	unsigned long index = abs_from >> PAGE_CACHE_SHIFT;
 	handle_t *handle = NULL;
-	int ret;
+	int ret = 0;
+	unsigned zero_from, zero_to, block_start, block_end;
 
-	offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */
-	/* ugh.  in prepare/commit_write, if from==to==start of block, we 
-	** skip the prepare.  make sure we never send an offset for the start
-	** of a block
-	*/
-	if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
-		offset++;
-	}
-	index = size >> PAGE_CACHE_SHIFT;
+	BUG_ON(abs_from >= abs_to);
+	BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT));
+	BUG_ON(abs_from & (inode->i_blkbits - 1));
 
 	page = grab_cache_page(mapping, index);
 	if (!page) {
@@ -765,31 +781,56 @@ static int ocfs2_write_zero_page(struct inode *inode,
 		goto out;
 	}
 
-	ret = ocfs2_prepare_write_nolock(inode, page, offset, offset);
-	if (ret < 0) {
-		mlog_errno(ret);
-		goto out_unlock;
-	}
+	/* Get the offsets within the page that we want to zero */
+	zero_from = abs_from & (PAGE_CACHE_SIZE - 1);
+	zero_to = abs_to & (PAGE_CACHE_SIZE - 1);
+	if (!zero_to)
+		zero_to = PAGE_CACHE_SIZE;
 
-	if (ocfs2_should_order_data(inode)) {
-		handle = ocfs2_start_walk_page_trans(inode, page, offset,
-						     offset);
-		if (IS_ERR(handle)) {
-			ret = PTR_ERR(handle);
-			handle = NULL;
+	mlog(0,
+	     "abs_from = %llu, abs_to = %llu, index = %lu, zero_from = %u, zero_to = %u\n",
+	     (unsigned long long)abs_from, (unsigned long long)abs_to,
+	     index, zero_from, zero_to);
+
+	/* We know that zero_from is block aligned */
+	for (block_start = zero_from; block_start < zero_to;
+	     block_start = block_end) {
+		block_end = block_start + (1 << inode->i_blkbits);
+
+		/*
+		 * block_start is block-aligned.  Bump it by one to
+		 * force ocfs2_{prepare,commit}_write() to zero the
+		 * whole block.
+		 */
+		ret = ocfs2_prepare_write_nolock(inode, page,
+						 block_start + 1,
+						 block_start + 1);
+		if (ret < 0) {
+			mlog_errno(ret);
 			goto out_unlock;
 		}
-	}
 
-	/* must not update i_size! */
-	ret = block_commit_write(page, offset, offset);
-	if (ret < 0)
-		mlog_errno(ret);
-	else
-		ret = 0;
+		if (!handle) {
+			handle = ocfs2_zero_start_ordered_transaction(inode);
+			if (IS_ERR(handle)) {
+				ret = PTR_ERR(handle);
+				handle = NULL;
+				break;
+			}
+		}
+
+		/* must not update i_size! */
+		ret = block_commit_write(page, block_start + 1,
+					 block_start + 1);
+		if (ret < 0)
+			mlog_errno(ret);
+		else
+			ret = 0;
+	}
 
 	if (handle)
 		ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
+
 out_unlock:
 	unlock_page(page);
 	page_cache_release(page);
@@ -797,22 +838,114 @@ out:
 	return ret;
 }
 
-static int ocfs2_zero_extend(struct inode *inode,
-			     u64 zero_to_size)
+/*
+ * Find the next range to zero.  We do this in terms of bytes because
+ * that's what ocfs2_zero_extend() wants, and it is dealing with the
+ * pagecache.  We may return multiple extents.
+ *
+ * zero_start and zero_end are ocfs2_zero_extend()s current idea of what
+ * needs to be zeroed.  range_start and range_end return the next zeroing
+ * range.  A subsequent call should pass the previous range_end as its
+ * zero_start.  If range_end is 0, there's nothing to do.
+ *
+ * Unwritten extents are skipped over.  Refcounted extents are CoWd.
+ */
+static int ocfs2_zero_extend_get_range(struct inode *inode,
+				       struct buffer_head *di_bh,
+				       u64 zero_start, u64 zero_end,
+				       u64 *range_start, u64 *range_end)
 {
-	int ret = 0;
-	u64 start_off;
-	struct super_block *sb = inode->i_sb;
+	int rc = 0, needs_cow = 0;
+	u32 p_cpos, zero_clusters = 0;
+	u32 zero_cpos =
+		zero_start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
+	u32 last_cpos = ocfs2_clusters_for_bytes(inode->i_sb, zero_end);
+	unsigned int num_clusters = 0;
+	unsigned int ext_flags = 0;
 
-	start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
-	while (start_off < zero_to_size) {
-		ret = ocfs2_write_zero_page(inode, start_off);
-		if (ret < 0) {
-			mlog_errno(ret);
+	while (zero_cpos < last_cpos) {
+		rc = ocfs2_get_clusters(inode, zero_cpos, &p_cpos,
+					&num_clusters, &ext_flags);
+		if (rc) {
+			mlog_errno(rc);
 			goto out;
 		}
 
-		start_off += sb->s_blocksize;
+		if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) {
+			zero_clusters = num_clusters;
+			if (ext_flags & OCFS2_EXT_REFCOUNTED)
+				needs_cow = 1;
+			break;
+		}
+
+		zero_cpos += num_clusters;
+	}
+	if (!zero_clusters) {
+		*range_end = 0;
+		goto out;
+	}
+
+	while ((zero_cpos + zero_clusters) < last_cpos) {
+		rc = ocfs2_get_clusters(inode, zero_cpos + zero_clusters,
+					&p_cpos, &num_clusters,
+					&ext_flags);
+		if (rc) {
+			mlog_errno(rc);
+			goto out;
+		}
+
+		if (!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN))
+			break;
+		if (ext_flags & OCFS2_EXT_REFCOUNTED)
+			needs_cow = 1;
+		zero_clusters += num_clusters;
+	}
+	if ((zero_cpos + zero_clusters) > last_cpos)
+		zero_clusters = last_cpos - zero_cpos;
+
+	if (needs_cow) {
+		rc = ocfs2_refcount_cow(inode, di_bh, zero_cpos, zero_clusters,
+					UINT_MAX);
+		if (rc) {
+			mlog_errno(rc);
+			goto out;
+		}
+	}
+
+	*range_start = ocfs2_clusters_to_bytes(inode->i_sb, zero_cpos);
+	*range_end = ocfs2_clusters_to_bytes(inode->i_sb,
+					     zero_cpos + zero_clusters);
+
+out:
+	return rc;
+}
+
+/*
+ * Zero one range returned from ocfs2_zero_extend_get_range().  The caller
+ * has made sure that the entire range needs zeroing.
+ */
+static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start,
+				   u64 range_end)
+{
+	int rc = 0;
+	u64 next_pos;
+	u64 zero_pos = range_start;
+
+	mlog(0, "range_start = %llu, range_end = %llu\n",
+	     (unsigned long long)range_start,
+	     (unsigned long long)range_end);
+	BUG_ON(range_start >= range_end);
+
+	while (zero_pos < range_end) {
+		next_pos = (zero_pos & PAGE_CACHE_MASK) + PAGE_CACHE_SIZE;
+		if (next_pos > range_end)
+			next_pos = range_end;
+		rc = ocfs2_write_zero_page(inode, zero_pos, next_pos);
+		if (rc < 0) {
+			mlog_errno(rc);
+			break;
+		}
+		zero_pos = next_pos;
 
 		/*
 		 * Very large extends have the potential to lock up
@@ -821,16 +954,63 @@ static int ocfs2_zero_extend(struct inode *inode,
 		cond_resched();
 	}
 
-out:
+	return rc;
+}
+
+int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
+		      loff_t zero_to_size)
+{
+	int ret = 0;
+	u64 zero_start, range_start = 0, range_end = 0;
+	struct super_block *sb = inode->i_sb;
+
+	zero_start = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
+	mlog(0, "zero_start %llu for i_size %llu\n",
+	     (unsigned long long)zero_start,
+	     (unsigned long long)i_size_read(inode));
+	while (zero_start < zero_to_size) {
+		ret = ocfs2_zero_extend_get_range(inode, di_bh, zero_start,
+						  zero_to_size,
+						  &range_start,
+						  &range_end);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+		if (!range_end)
+			break;
+		/* Trim the ends */
+		if (range_start < zero_start)
+			range_start = zero_start;
+		if (range_end > zero_to_size)
+			range_end = zero_to_size;
+
+		ret = ocfs2_zero_extend_range(inode, range_start,
+					      range_end);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+		zero_start = range_end;
+	}
+
 	return ret;
 }
 
-int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, u64 zero_to)
+int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh,
+			  u64 new_i_size, u64 zero_to)
 {
 	int ret;
 	u32 clusters_to_add;
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 
+	/*
+	 * Only quota files call this without a bh, and they can't be
+	 * refcounted.
+	 */
+	BUG_ON(!di_bh && (oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
+	BUG_ON(!di_bh && !(oi->ip_flags & OCFS2_INODE_SYSTEM_FILE));
+
 	clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size);
 	if (clusters_to_add < oi->ip_clusters)
 		clusters_to_add = 0;
@@ -851,7 +1031,7 @@ int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, u64 zero_to)
 	 * still need to zero the area between the old i_size and the
 	 * new i_size.
 	 */
-	ret = ocfs2_zero_extend(inode, zero_to);
+	ret = ocfs2_zero_extend(inode, di_bh, zero_to);
 	if (ret < 0)
 		mlog_errno(ret);
 
@@ -873,27 +1053,15 @@ static int ocfs2_extend_file(struct inode *inode,
 		goto out;
 
 	if (i_size_read(inode) == new_i_size)
-  		goto out;
+		goto out;
 	BUG_ON(new_i_size < i_size_read(inode));
 
 	/*
-	 * Fall through for converting inline data, even if the fs
-	 * supports sparse files.
-	 *
-	 * The check for inline data here is legal - nobody can add
-	 * the feature since we have i_mutex. We must check it again
-	 * after acquiring ip_alloc_sem though, as paths like mmap
-	 * might have raced us to converting the inode to extents.
-	 */
-	if (!(oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
-	    && ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
-		goto out_update_size;
-
-	/*
 	 * The alloc sem blocks people in read/write from reading our
 	 * allocation until we're done changing it. We depend on
 	 * i_mutex to block other extend/truncate calls while we're
-	 * here.
+	 * here.  We even have to hold it for sparse files because there
+	 * might be some tail zeroing.
 	 */
 	down_write(&oi->ip_alloc_sem);
 
@@ -910,14 +1078,16 @@ static int ocfs2_extend_file(struct inode *inode,
 		ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
 		if (ret) {
 			up_write(&oi->ip_alloc_sem);
-
 			mlog_errno(ret);
 			goto out;
 		}
 	}
 
-	if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
-		ret = ocfs2_extend_no_holes(inode, new_i_size, new_i_size);
+	if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
+		ret = ocfs2_zero_extend(inode, di_bh, new_i_size);
+	else
+		ret = ocfs2_extend_no_holes(inode, di_bh, new_i_size,
+					    new_i_size);
 
 	up_write(&oi->ip_alloc_sem);
 
@@ -943,9 +1113,8 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
 	struct ocfs2_super *osb = OCFS2_SB(sb);
 	struct buffer_head *bh = NULL;
 	handle_t *handle = NULL;
-	int qtype;
-	struct dquot *transfer_from[MAXQUOTAS] = { };
 	struct dquot *transfer_to[MAXQUOTAS] = { };
+	int qtype;
 
 	mlog_entry("(0x%p, '%.*s')\n", dentry,
 	           dentry->d_name.len, dentry->d_name.name);
@@ -976,6 +1145,8 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
 	if (status)
 		return status;
 
+	if (is_quota_modification(inode, attr))
+		dquot_initialize(inode);
 	size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE;
 	if (size_change) {
 		status = ocfs2_rw_lock(inode, 1);
@@ -993,10 +1164,9 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
 	}
 
 	if (size_change && attr->ia_size != i_size_read(inode)) {
-		if (attr->ia_size > sb->s_maxbytes) {
-			status = -EFBIG;
+		status = inode_newsize_ok(inode, attr->ia_size);
+		if (status)
 			goto bail_unlock;
-		}
 
 		if (i_size_read(inode) > attr->ia_size) {
 			if (ocfs2_should_order_data(inode)) {
@@ -1021,16 +1191,14 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
 		/*
 		 * Gather pointers to quota structures so that allocation /
 		 * freeing of quota structures happens here and not inside
-		 * vfs_dq_transfer() where we have problems with lock ordering
+		 * dquot_transfer() where we have problems with lock ordering
 		 */
 		if (attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid
 		    && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
 		    OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
 			transfer_to[USRQUOTA] = dqget(sb, attr->ia_uid,
 						      USRQUOTA);
-			transfer_from[USRQUOTA] = dqget(sb, inode->i_uid,
-							USRQUOTA);
-			if (!transfer_to[USRQUOTA] || !transfer_from[USRQUOTA]) {
+			if (!transfer_to[USRQUOTA]) {
 				status = -ESRCH;
 				goto bail_unlock;
 			}
@@ -1040,9 +1208,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
 		    OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
 			transfer_to[GRPQUOTA] = dqget(sb, attr->ia_gid,
 						      GRPQUOTA);
-			transfer_from[GRPQUOTA] = dqget(sb, inode->i_gid,
-							GRPQUOTA);
-			if (!transfer_to[GRPQUOTA] || !transfer_from[GRPQUOTA]) {
+			if (!transfer_to[GRPQUOTA]) {
 				status = -ESRCH;
 				goto bail_unlock;
 			}
@@ -1054,7 +1220,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
 			mlog_errno(status);
 			goto bail_unlock;
 		}
-		status = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0;
+		status = __dquot_transfer(inode, transfer_to);
 		if (status < 0)
 			goto bail_commit;
 	} else {
@@ -1067,7 +1233,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
 	}
 
 	/*
-	 * This will intentionally not wind up calling vmtruncate(),
+	 * This will intentionally not wind up calling simple_setsize(),
 	 * since all the work for a size change has been done above.
 	 * Otherwise, we could get into problems with truncate as
 	 * ip_alloc_sem is used there to protect against i_size
@@ -1094,10 +1260,8 @@ bail:
 	brelse(bh);
 
 	/* Release quota pointers in case we acquired them */
-	for (qtype = 0; qtype < MAXQUOTAS; qtype++) {
+	for (qtype = 0; qtype < MAXQUOTAS; qtype++)
 		dqput(transfer_to[qtype]);
-		dqput(transfer_from[qtype]);
-	}
 
 	if (!status && attr->ia_valid & ATTR_MODE) {
 		status = ocfs2_acl_chmod(inode);
@@ -1191,9 +1355,7 @@ static int __ocfs2_write_remove_suid(struct inode *inode,
 	di = (struct ocfs2_dinode *) bh->b_data;
 	di->i_mode = cpu_to_le16(inode->i_mode);
 
-	ret = ocfs2_journal_dirty(handle, bh);
-	if (ret < 0)
-		mlog_errno(ret);
+	ocfs2_journal_dirty(handle, bh);
 
 out_trans:
 	ocfs2_commit_trans(osb, handle);
@@ -1430,16 +1592,90 @@ out:
 	return ret;
 }
 
+static int ocfs2_find_rec(struct ocfs2_extent_list *el, u32 pos)
+{
+	int i;
+	struct ocfs2_extent_rec *rec = NULL;
+
+	for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
+
+		rec = &el->l_recs[i];
+
+		if (le32_to_cpu(rec->e_cpos) < pos)
+			break;
+	}
+
+	return i;
+}
+
+/*
+ * Helper to calculate the punching pos and length in one run, we handle the
+ * following three cases in order:
+ *
+ * - remove the entire record
+ * - remove a partial record
+ * - no record needs to be removed (hole-punching completed)
+*/
+static void ocfs2_calc_trunc_pos(struct inode *inode,
+				 struct ocfs2_extent_list *el,
+				 struct ocfs2_extent_rec *rec,
+				 u32 trunc_start, u32 *trunc_cpos,
+				 u32 *trunc_len, u32 *trunc_end,
+				 u64 *blkno, int *done)
+{
+	int ret = 0;
+	u32 coff, range;
+
+	range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
+
+	if (le32_to_cpu(rec->e_cpos) >= trunc_start) {
+		*trunc_cpos = le32_to_cpu(rec->e_cpos);
+		/*
+		 * Skip holes if any.
+		 */
+		if (range < *trunc_end)
+			*trunc_end = range;
+		*trunc_len = *trunc_end - le32_to_cpu(rec->e_cpos);
+		*blkno = le64_to_cpu(rec->e_blkno);
+		*trunc_end = le32_to_cpu(rec->e_cpos);
+	} else if (range > trunc_start) {
+		*trunc_cpos = trunc_start;
+		*trunc_len = *trunc_end - trunc_start;
+		coff = trunc_start - le32_to_cpu(rec->e_cpos);
+		*blkno = le64_to_cpu(rec->e_blkno) +
+				ocfs2_clusters_to_blocks(inode->i_sb, coff);
+		*trunc_end = trunc_start;
+	} else {
+		/*
+		 * It may have two following possibilities:
+		 *
+		 * - last record has been removed
+		 * - trunc_start was within a hole
+		 *
+		 * both two cases mean the completion of hole punching.
+		 */
+		ret = 1;
+	}
+
+	*done = ret;
+}
+
 static int ocfs2_remove_inode_range(struct inode *inode,
 				    struct buffer_head *di_bh, u64 byte_start,
 				    u64 byte_len)
 {
-	int ret = 0;
-	u32 trunc_start, trunc_len, cpos, phys_cpos, alloc_size;
+	int ret = 0, flags = 0, done = 0, i;
+	u32 trunc_start, trunc_len, trunc_end, trunc_cpos, phys_cpos;
+	u32 cluster_in_el;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct ocfs2_cached_dealloc_ctxt dealloc;
 	struct address_space *mapping = inode->i_mapping;
 	struct ocfs2_extent_tree et;
+	struct ocfs2_path *path = NULL;
+	struct ocfs2_extent_list *el = NULL;
+	struct ocfs2_extent_rec *rec = NULL;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	u64 blkno, refcount_loc = le64_to_cpu(di->i_refcount_loc);
 
 	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
 	ocfs2_init_dealloc_ctxt(&dealloc);
@@ -1465,17 +1701,35 @@ static int ocfs2_remove_inode_range(struct inode *inode,
 		goto out;
 	}
 
+	/*
+	 * For reflinks, we may need to CoW 2 clusters which might be
+	 * partially zero'd later, if hole's start and end offset were
+	 * within one cluster(means is not exactly aligned to clustersize).
+	 */
+
+	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) {
+
+		ret = ocfs2_cow_file_pos(inode, di_bh, byte_start);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		ret = ocfs2_cow_file_pos(inode, di_bh, byte_start + byte_len);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
 	trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start);
-	trunc_len = (byte_start + byte_len) >> osb->s_clustersize_bits;
-	if (trunc_len >= trunc_start)
-		trunc_len -= trunc_start;
-	else
-		trunc_len = 0;
+	trunc_end = (byte_start + byte_len) >> osb->s_clustersize_bits;
+	cluster_in_el = trunc_end;
 
-	mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u\n",
+	mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, cend: %u\n",
 	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
 	     (unsigned long long)byte_start,
-	     (unsigned long long)byte_len, trunc_start, trunc_len);
+	     (unsigned long long)byte_len, trunc_start, trunc_end);
 
 	ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len);
 	if (ret) {
@@ -1483,31 +1737,79 @@ static int ocfs2_remove_inode_range(struct inode *inode,
 		goto out;
 	}
 
-	cpos = trunc_start;
-	while (trunc_len) {
-		ret = ocfs2_get_clusters(inode, cpos, &phys_cpos,
-					 &alloc_size, NULL);
+	path = ocfs2_new_path_from_et(&et);
+	if (!path) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	while (trunc_end > trunc_start) {
+
+		ret = ocfs2_find_path(INODE_CACHE(inode), path,
+				      cluster_in_el);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
 		}
 
-		if (alloc_size > trunc_len)
-			alloc_size = trunc_len;
+		el = path_leaf_el(path);
 
-		/* Only do work for non-holes */
-		if (phys_cpos != 0) {
-			ret = ocfs2_remove_btree_range(inode, &et, cpos,
-						       phys_cpos, alloc_size,
-						       &dealloc);
+		i = ocfs2_find_rec(el, trunc_end);
+		/*
+		 * Need to go to previous extent block.
+		 */
+		if (i < 0) {
+			if (path->p_tree_depth == 0)
+				break;
+
+			ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
+							    path,
+							    &cluster_in_el);
 			if (ret) {
 				mlog_errno(ret);
 				goto out;
 			}
+
+			/*
+			 * We've reached the leftmost extent block,
+			 * it's safe to leave.
+			 */
+			if (cluster_in_el == 0)
+				break;
+
+			/*
+			 * The 'pos' searched for previous extent block is
+			 * always one cluster less than actual trunc_end.
+			 */
+			trunc_end = cluster_in_el + 1;
+
+			ocfs2_reinit_path(path, 1);
+
+			continue;
+
+		} else
+			rec = &el->l_recs[i];
+
+		ocfs2_calc_trunc_pos(inode, el, rec, trunc_start, &trunc_cpos,
+				     &trunc_len, &trunc_end, &blkno, &done);
+		if (done)
+			break;
+
+		flags = rec->e_flags;
+		phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
+
+		ret = ocfs2_remove_btree_range(inode, &et, trunc_cpos,
+					       phys_cpos, trunc_len, flags,
+					       &dealloc, refcount_loc);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
 		}
 
-		cpos += alloc_size;
-		trunc_len -= alloc_size;
+		cluster_in_el = trunc_end;
+
+		ocfs2_reinit_path(path, 1);
 	}
 
 	ocfs2_truncate_cluster_pages(inode, byte_start, byte_len);
@@ -1779,7 +2081,7 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
 	struct inode *inode = dentry->d_inode;
 	loff_t saved_pos, end;
 
-	/* 
+	/*
 	 * We start with a read level meta lock and only jump to an ex
 	 * if we need to make modifications here.
 	 */
@@ -1836,6 +2138,8 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
 							       &meta_level);
 			if (has_refcount)
 				*has_refcount = 1;
+			if (direct_io)
+				*direct_io = 0;
 		}
 
 		if (ret < 0) {
@@ -1859,10 +2163,6 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
 			break;
 		}
 
-		if (has_refcount && *has_refcount == 1) {
-			*direct_io = 0;
-			break;
-		}
 		/*
 		 * Allowing concurrent direct writes means
 		 * i_size changes wouldn't be synchronized, so
@@ -1980,18 +2280,18 @@ relock:
 	/* communicate with ocfs2_dio_end_io */
 	ocfs2_iocb_set_rw_locked(iocb, rw_level);
 
-	if (direct_io) {
-		ret = generic_segment_checks(iov, &nr_segs, &ocount,
-					     VERIFY_READ);
-		if (ret)
-			goto out_dio;
+	ret = generic_segment_checks(iov, &nr_segs, &ocount,
+				     VERIFY_READ);
+	if (ret)
+		goto out_dio;
 
-		count = ocount;
-		ret = generic_write_checks(file, ppos, &count,
-					   S_ISBLK(inode->i_mode));
-		if (ret)
-			goto out_dio;
+	count = ocount;
+	ret = generic_write_checks(file, ppos, &count,
+				   S_ISBLK(inode->i_mode));
+	if (ret)
+		goto out_dio;
 
+	if (direct_io) {
 		written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos,
 						    ppos, count, ocount);
 		if (written < 0) {
@@ -1999,30 +2299,37 @@ relock:
 			 * direct write may have instantiated a few
 			 * blocks outside i_size. Trim these off again.
 			 * Don't need i_size_read because we hold i_mutex.
+			 *
+			 * XXX(hch): this looks buggy because ocfs2 did not
+			 * actually implement ->truncate.  Take a look at
+			 * the new truncate sequence and update this accordingly
 			 */
 			if (*ppos + count > inode->i_size)
-				vmtruncate(inode, inode->i_size);
+				simple_setsize(inode, inode->i_size);
 			ret = written;
 			goto out_dio;
 		}
 	} else {
-		written = __generic_file_aio_write(iocb, iov, nr_segs, ppos);
+		current->backing_dev_info = file->f_mapping->backing_dev_info;
+		written = generic_file_buffered_write(iocb, iov, nr_segs, *ppos,
+						      ppos, count, 0);
+		current->backing_dev_info = NULL;
 	}
 
 out_dio:
 	/* buffered aio wouldn't have proper lock coverage today */
 	BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT));
 
-	if ((file->f_flags & O_DSYNC && !direct_io) || IS_SYNC(inode) ||
-	    (file->f_flags & O_DIRECT && has_refcount)) {
+	if (((file->f_flags & O_DSYNC) && !direct_io) || IS_SYNC(inode) ||
+	    ((file->f_flags & O_DIRECT) && has_refcount)) {
 		ret = filemap_fdatawrite_range(file->f_mapping, pos,
 					       pos + count - 1);
 		if (ret < 0)
 			written = ret;
 
-		if (!ret && (old_size != i_size_read(inode) ||
-		    old_clusters != OCFS2_I(inode)->ip_clusters ||
-		    has_refcount)) {
+		if (!ret && ((old_size != i_size_read(inode)) ||
+			     (old_clusters != OCFS2_I(inode)->ip_clusters) ||
+			     has_refcount)) {
 			ret = jbd2_journal_force_commit(osb->journal->j_journal);
 			if (ret < 0)
 				written = ret;
@@ -2033,7 +2340,7 @@ out_dio:
 						      pos + count - 1);
 	}
 
-	/* 
+	/*
 	 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
 	 * function pointer which is called when o_direct io completes so that
 	 * it can unlock our rw lock.  (it's the clustered equivalent of
@@ -2043,7 +2350,7 @@ out_dio:
 	 * async dio is going to do it in the future or an end_io after an
 	 * error has already done it.
 	 */
-	if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
+	if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) {
 		rw_level = -1;
 		have_alloc_sem = 0;
 	}
@@ -2198,7 +2505,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
 		goto bail;
 	}
 
-	/* 
+	/*
 	 * buffered reads protect themselves in ->readpage().  O_DIRECT reads
 	 * need locks to protect pending reads from racing with truncate.
 	 */
@@ -2220,10 +2527,10 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
 	 * We're fine letting folks race truncates and extending
 	 * writes with read across the cluster, just like they can
 	 * locally. Hence no rw_lock during read.
-	 * 
+	 *
 	 * Take and drop the meta data lock to update inode fields
 	 * like i_size. This allows the checks down below
-	 * generic_file_aio_read() a chance of actually working. 
+	 * generic_file_aio_read() a chance of actually working.
 	 */
 	ret = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level);
 	if (ret < 0) {
@@ -2248,7 +2555,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
 bail:
 	if (have_alloc_sem)
 		up_read(&inode->i_alloc_sem);
-	if (rw_level != -1) 
+	if (rw_level != -1)
 		ocfs2_rw_unlock(inode, rw_level);
 	mlog_exit(ret);
 
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index d66cf4f7c70e..97bf761c9e7c 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -54,8 +54,10 @@ int ocfs2_add_inode_data(struct ocfs2_super *osb,
 int ocfs2_simple_size_update(struct inode *inode,
 			     struct buffer_head *di_bh,
 			     u64 new_i_size);
-int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size,
-			  u64 zero_to);
+int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh,
+			  u64 new_i_size, u64 zero_to);
+int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
+		      loff_t zero_to);
 int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
 int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
 		  struct kstat *stat);
diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c
index c6e7213db868..1aa863dd901f 100644
--- a/fs/ocfs2/heartbeat.c
+++ b/fs/ocfs2/heartbeat.c
@@ -26,7 +26,6 @@
 
 #include <linux/fs.h>
 #include <linux/types.h>
-#include <linux/slab.h>
 #include <linux/highmem.h>
 
 #define MLOG_MASK_PREFIX ML_SUPER
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 0297fb8982b8..abb0a95cc717 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -25,7 +25,6 @@
 
 #include <linux/fs.h>
 #include <linux/types.h>
-#include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
 #include <linux/quotaops.h>
@@ -377,6 +376,10 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
 
 	OCFS2_I(inode)->ip_last_used_slot = 0;
 	OCFS2_I(inode)->ip_last_used_group = 0;
+
+	if (S_ISDIR(inode->i_mode))
+		ocfs2_resv_set_type(&OCFS2_I(inode)->ip_la_data_resv,
+				    OCFS2_RESV_FLAG_DIR);
 	mlog_exit_void();
 }
 
@@ -475,7 +478,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
 	if (args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY) {
 		status = ocfs2_try_open_lock(inode, 0);
 		if (status) {
-			make_bad_inode(inode);	
+			make_bad_inode(inode);
 			return status;
 		}
 	}
@@ -540,7 +543,6 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
 				     struct buffer_head *fe_bh)
 {
 	int status = 0;
-	struct ocfs2_truncate_context *tc = NULL;
 	struct ocfs2_dinode *fe;
 	handle_t *handle = NULL;
 
@@ -559,6 +561,7 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
 		handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
 		if (IS_ERR(handle)) {
 			status = PTR_ERR(handle);
+			handle = NULL;
 			mlog_errno(status);
 			goto out;
 		}
@@ -582,13 +585,7 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
 		ocfs2_commit_trans(osb, handle);
 		handle = NULL;
 
-		status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc);
-		if (status < 0) {
-			mlog_errno(status);
-			goto out;
-		}
-
-		status = ocfs2_commit_truncate(osb, inode, fe_bh, tc);
+		status = ocfs2_commit_truncate(osb, inode, fe_bh);
 		if (status < 0) {
 			mlog_errno(status);
 			goto out;
@@ -640,11 +637,13 @@ static int ocfs2_remove_inode(struct inode *inode,
 		goto bail_unlock;
 	}
 
-	status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode,
-				  orphan_dir_bh);
-	if (status < 0) {
-		mlog_errno(status);
-		goto bail_commit;
+	if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) {
+		status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode,
+					  orphan_dir_bh);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail_commit;
+		}
 	}
 
 	/* set the inodes dtime */
@@ -657,15 +656,10 @@ static int ocfs2_remove_inode(struct inode *inode,
 
 	di->i_dtime = cpu_to_le64(CURRENT_TIME.tv_sec);
 	di->i_flags &= cpu_to_le32(~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL));
-
-	status = ocfs2_journal_dirty(handle, di_bh);
-	if (status < 0) {
-		mlog_errno(status);
-		goto bail_commit;
-	}
+	ocfs2_journal_dirty(handle, di_bh);
 
 	ocfs2_remove_from_cache(INODE_CACHE(inode), di_bh);
-	vfs_dq_free_inode(inode);
+	dquot_free_inode(inode);
 
 	status = ocfs2_free_dinode(handle, inode_alloc_inode,
 				   inode_alloc_bh, di);
@@ -684,7 +678,7 @@ bail:
 	return status;
 }
 
-/* 
+/*
  * Serialize with orphan dir recovery. If the process doing
  * recovery on this orphan dir does an iget() with the dir
  * i_mutex held, we'll deadlock here. Instead we detect this
@@ -723,38 +717,39 @@ static void ocfs2_signal_wipe_completion(struct ocfs2_super *osb,
 static int ocfs2_wipe_inode(struct inode *inode,
 			    struct buffer_head *di_bh)
 {
-	int status, orphaned_slot;
+	int status, orphaned_slot = -1;
 	struct inode *orphan_dir_inode = NULL;
 	struct buffer_head *orphan_dir_bh = NULL;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-	struct ocfs2_dinode *di;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
 
-	di = (struct ocfs2_dinode *) di_bh->b_data;
-	orphaned_slot = le16_to_cpu(di->i_orphaned_slot);
+	if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) {
+		orphaned_slot = le16_to_cpu(di->i_orphaned_slot);
 
-	status = ocfs2_check_orphan_recovery_state(osb, orphaned_slot);
-	if (status)
-		return status;
+		status = ocfs2_check_orphan_recovery_state(osb, orphaned_slot);
+		if (status)
+			return status;
 
-	orphan_dir_inode = ocfs2_get_system_file_inode(osb,
-						       ORPHAN_DIR_SYSTEM_INODE,
-						       orphaned_slot);
-	if (!orphan_dir_inode) {
-		status = -EEXIST;
-		mlog_errno(status);
-		goto bail;
-	}
+		orphan_dir_inode = ocfs2_get_system_file_inode(osb,
+							       ORPHAN_DIR_SYSTEM_INODE,
+							       orphaned_slot);
+		if (!orphan_dir_inode) {
+			status = -EEXIST;
+			mlog_errno(status);
+			goto bail;
+		}
 
-	/* Lock the orphan dir. The lock will be held for the entire
-	 * delete_inode operation. We do this now to avoid races with
-	 * recovery completion on other nodes. */
-	mutex_lock(&orphan_dir_inode->i_mutex);
-	status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
-	if (status < 0) {
-		mutex_unlock(&orphan_dir_inode->i_mutex);
+		/* Lock the orphan dir. The lock will be held for the entire
+		 * delete_inode operation. We do this now to avoid races with
+		 * recovery completion on other nodes. */
+		mutex_lock(&orphan_dir_inode->i_mutex);
+		status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
+		if (status < 0) {
+			mutex_unlock(&orphan_dir_inode->i_mutex);
 
-		mlog_errno(status);
-		goto bail;
+			mlog_errno(status);
+			goto bail;
+		}
 	}
 
 	/* we do this while holding the orphan dir lock because we
@@ -795,6 +790,9 @@ static int ocfs2_wipe_inode(struct inode *inode,
 		mlog_errno(status);
 
 bail_unlock_dir:
+	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)
+		return status;
+
 	ocfs2_inode_unlock(orphan_dir_inode, 1);
 	mutex_unlock(&orphan_dir_inode->i_mutex);
 	brelse(orphan_dir_bh);
@@ -890,7 +888,23 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
 
 	/* Do some basic inode verification... */
 	di = (struct ocfs2_dinode *) di_bh->b_data;
-	if (!(di->i_flags & cpu_to_le32(OCFS2_ORPHANED_FL))) {
+	if (!(di->i_flags & cpu_to_le32(OCFS2_ORPHANED_FL)) &&
+	    !(oi->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) {
+		/*
+		 * Inodes in the orphan dir must have ORPHANED_FL.  The only
+		 * inodes that come back out of the orphan dir are reflink
+		 * targets. A reflink target may be moved out of the orphan
+		 * dir between the time we scan the directory and the time we
+		 * process it. This would lead to HAS_REFCOUNT_FL being set but
+		 * ORPHANED_FL not.
+		 */
+		if (di->i_dyn_features & cpu_to_le16(OCFS2_HAS_REFCOUNT_FL)) {
+			mlog(0, "Reflinked inode %llu is no longer orphaned.  "
+			     "it shouldn't be deleted\n",
+			     (unsigned long long)oi->ip_blkno);
+			goto bail;
+		}
+
 		/* for lack of a better error? */
 		status = -EEXIST;
 		mlog(ML_ERROR,
@@ -958,7 +972,7 @@ static void ocfs2_cleanup_delete_inode(struct inode *inode,
 void ocfs2_delete_inode(struct inode *inode)
 {
 	int wipe, status;
-	sigset_t blocked, oldset;
+	sigset_t oldset;
 	struct buffer_head *di_bh = NULL;
 
 	mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
@@ -971,6 +985,8 @@ void ocfs2_delete_inode(struct inode *inode)
 		goto bail;
 	}
 
+	dquot_initialize(inode);
+
 	if (!ocfs2_inode_is_valid_to_delete(inode)) {
 		/* It's probably not necessary to truncate_inode_pages
 		 * here but we do it for safety anyway (it will most
@@ -983,13 +999,7 @@ void ocfs2_delete_inode(struct inode *inode)
 	 * messaging paths may return us -ERESTARTSYS. Which would
 	 * cause us to exit early, resulting in inodes being orphaned
 	 * forever. */
-	sigfillset(&blocked);
-	status = sigprocmask(SIG_BLOCK, &blocked, &oldset);
-	if (status < 0) {
-		mlog_errno(status);
-		ocfs2_cleanup_delete_inode(inode, 1);
-		goto bail;
-	}
+	ocfs2_block_signals(&oldset);
 
 	/*
 	 * Synchronize us against ocfs2_get_dentry. We take this in
@@ -1063,9 +1073,7 @@ bail_unlock_nfs_sync:
 	ocfs2_nfs_sync_unlock(OCFS2_SB(inode->i_sb), 0);
 
 bail_unblock:
-	status = sigprocmask(SIG_SETMASK, &oldset, NULL);
-	if (status < 0)
-		mlog_errno(status);
+	ocfs2_unblock_signals(&oldset);
 bail:
 	clear_inode(inode);
 	mlog_exit_void();
@@ -1087,6 +1095,8 @@ void ocfs2_clear_inode(struct inode *inode)
 	mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL,
 			"Inode=%lu\n", inode->i_ino);
 
+	dquot_drop(inode);
+
 	/* To preven remote deletes we hold open lock before, now it
 	 * is time to unlock PR and EX open locks. */
 	ocfs2_open_unlock(inode);
@@ -1097,6 +1107,10 @@ void ocfs2_clear_inode(struct inode *inode)
 	ocfs2_mark_lockres_freeing(&oi->ip_inode_lockres);
 	ocfs2_mark_lockres_freeing(&oi->ip_open_lockres);
 
+	ocfs2_resv_discard(&OCFS2_SB(inode->i_sb)->osb_la_resmap,
+			   &oi->ip_la_data_resv);
+	ocfs2_resv_init_once(&oi->ip_la_data_resv);
+
 	/* We very well may get a clear_inode before all an inodes
 	 * metadata has hit disk. Of course, we can't drop any cluster
 	 * locks until the journal has finished with it. The only
@@ -1272,13 +1286,8 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
 	fe->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec);
 	fe->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
 
-	status = ocfs2_journal_dirty(handle, bh);
-	if (status < 0)
-		mlog_errno(status);
-
-	status = 0;
+	ocfs2_journal_dirty(handle, bh);
 leave:
-
 	mlog_exit(status);
 	return status;
 }
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index ba4fe07b293c..9f5f5fcadc45 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -70,6 +70,8 @@ struct ocfs2_inode_info
 	/* Only valid if the inode is the dir. */
 	u32				ip_last_used_slot;
 	u64				ip_last_used_group;
+
+	struct ocfs2_alloc_reservation	ip_la_data_resv;
 };
 
 /*
@@ -100,6 +102,8 @@ struct ocfs2_inode_info
 #define OCFS2_INODE_MAYBE_ORPHANED	0x00000020
 /* Does someone have the file open O_DIRECT */
 #define OCFS2_INODE_OPEN_DIRECT		0x00000040
+/* Tell the inode wipe code it's not in orphan dir */
+#define OCFS2_INODE_SKIP_ORPHAN_DIR     0x00000080
 
 static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode)
 {
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 31fbb0619510..7d9d9c132cef 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -7,6 +7,7 @@
 
 #include <linux/fs.h>
 #include <linux/mount.h>
+#include <linux/compat.h>
 
 #define MLOG_MASK_PREFIX ML_INODE
 #include <cluster/masklog.h>
@@ -181,6 +182,10 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 #ifdef CONFIG_COMPAT
 long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 {
+	bool preserve;
+	struct reflink_arguments args;
+	struct inode *inode = file->f_path.dentry->d_inode;
+
 	switch (cmd) {
 	case OCFS2_IOC32_GETFLAGS:
 		cmd = OCFS2_IOC_GETFLAGS;
@@ -195,8 +200,15 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 	case OCFS2_IOC_GROUP_EXTEND:
 	case OCFS2_IOC_GROUP_ADD:
 	case OCFS2_IOC_GROUP_ADD64:
-	case OCFS2_IOC_REFLINK:
 		break;
+	case OCFS2_IOC_REFLINK:
+		if (copy_from_user(&args, (struct reflink_arguments *)arg,
+				   sizeof(args)))
+			return -EFAULT;
+		preserve = (args.preserve != 0);
+
+		return ocfs2_reflink_ioctl(inode, compat_ptr(args.old_path),
+					   compat_ptr(args.new_path), preserve);
 	default:
 		return -ENOIOCTLCMD;
 	}
diff --git a/fs/ocfs2/ioctl.h b/fs/ocfs2/ioctl.h
index cf9a5ee30fef..0cd5323bd3f0 100644
--- a/fs/ocfs2/ioctl.h
+++ b/fs/ocfs2/ioctl.h
@@ -7,10 +7,10 @@
  *
  */
 
-#ifndef OCFS2_IOCTL_H
-#define OCFS2_IOCTL_H
+#ifndef OCFS2_IOCTL_PROTO_H
+#define OCFS2_IOCTL_PROTO_H
 
 long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
 long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg);
 
-#endif /* OCFS2_IOCTL_H */
+#endif /* OCFS2_IOCTL_PROTO_H */
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index bf34c491ae96..625de9d7088c 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -402,9 +402,7 @@ int ocfs2_commit_trans(struct ocfs2_super *osb,
 }
 
 /*
- * 'nblocks' is what you want to add to the current
- * transaction. extend_trans will either extend the current handle by
- * nblocks, or commit it and start a new one with nblocks credits.
+ * 'nblocks' is what you want to add to the current transaction.
  *
  * This might call jbd2_journal_restart() which will commit dirty buffers
  * and then restart the transaction. Before calling
@@ -422,11 +420,15 @@ int ocfs2_commit_trans(struct ocfs2_super *osb,
  */
 int ocfs2_extend_trans(handle_t *handle, int nblocks)
 {
-	int status;
+	int status, old_nblocks;
 
 	BUG_ON(!handle);
-	BUG_ON(!nblocks);
+	BUG_ON(nblocks < 0);
+
+	if (!nblocks)
+		return 0;
 
+	old_nblocks = handle->h_buffer_credits;
 	mlog_entry_void();
 
 	mlog(0, "Trying to extend transaction by %d blocks\n", nblocks);
@@ -445,7 +447,8 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks)
 		mlog(0,
 		     "jbd2_journal_extend failed, trying "
 		     "jbd2_journal_restart\n");
-		status = jbd2_journal_restart(handle, nblocks);
+		status = jbd2_journal_restart(handle,
+					      old_nblocks + nblocks);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
@@ -469,7 +472,7 @@ static inline struct ocfs2_triggers *to_ocfs2_trigger(struct jbd2_buffer_trigger
 	return container_of(triggers, struct ocfs2_triggers, ot_triggers);
 }
 
-static void ocfs2_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
+static void ocfs2_frozen_trigger(struct jbd2_buffer_trigger_type *triggers,
 				 struct buffer_head *bh,
 				 void *data, size_t size)
 {
@@ -488,7 +491,7 @@ static void ocfs2_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
  * Quota blocks have their own trigger because the struct ocfs2_block_check
  * offset depends on the blocksize.
  */
-static void ocfs2_dq_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
+static void ocfs2_dq_frozen_trigger(struct jbd2_buffer_trigger_type *triggers,
 				 struct buffer_head *bh,
 				 void *data, size_t size)
 {
@@ -508,7 +511,7 @@ static void ocfs2_dq_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
  * Directory blocks also have their own trigger because the
  * struct ocfs2_block_check offset depends on the blocksize.
  */
-static void ocfs2_db_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
+static void ocfs2_db_frozen_trigger(struct jbd2_buffer_trigger_type *triggers,
 				 struct buffer_head *bh,
 				 void *data, size_t size)
 {
@@ -541,7 +544,7 @@ static void ocfs2_abort_trigger(struct jbd2_buffer_trigger_type *triggers,
 
 static struct ocfs2_triggers di_triggers = {
 	.ot_triggers = {
-		.t_commit = ocfs2_commit_trigger,
+		.t_frozen = ocfs2_frozen_trigger,
 		.t_abort = ocfs2_abort_trigger,
 	},
 	.ot_offset	= offsetof(struct ocfs2_dinode, i_check),
@@ -549,7 +552,7 @@ static struct ocfs2_triggers di_triggers = {
 
 static struct ocfs2_triggers eb_triggers = {
 	.ot_triggers = {
-		.t_commit = ocfs2_commit_trigger,
+		.t_frozen = ocfs2_frozen_trigger,
 		.t_abort = ocfs2_abort_trigger,
 	},
 	.ot_offset	= offsetof(struct ocfs2_extent_block, h_check),
@@ -557,7 +560,7 @@ static struct ocfs2_triggers eb_triggers = {
 
 static struct ocfs2_triggers rb_triggers = {
 	.ot_triggers = {
-		.t_commit = ocfs2_commit_trigger,
+		.t_frozen = ocfs2_frozen_trigger,
 		.t_abort = ocfs2_abort_trigger,
 	},
 	.ot_offset	= offsetof(struct ocfs2_refcount_block, rf_check),
@@ -565,7 +568,7 @@ static struct ocfs2_triggers rb_triggers = {
 
 static struct ocfs2_triggers gd_triggers = {
 	.ot_triggers = {
-		.t_commit = ocfs2_commit_trigger,
+		.t_frozen = ocfs2_frozen_trigger,
 		.t_abort = ocfs2_abort_trigger,
 	},
 	.ot_offset	= offsetof(struct ocfs2_group_desc, bg_check),
@@ -573,14 +576,14 @@ static struct ocfs2_triggers gd_triggers = {
 
 static struct ocfs2_triggers db_triggers = {
 	.ot_triggers = {
-		.t_commit = ocfs2_db_commit_trigger,
+		.t_frozen = ocfs2_db_frozen_trigger,
 		.t_abort = ocfs2_abort_trigger,
 	},
 };
 
 static struct ocfs2_triggers xb_triggers = {
 	.ot_triggers = {
-		.t_commit = ocfs2_commit_trigger,
+		.t_frozen = ocfs2_frozen_trigger,
 		.t_abort = ocfs2_abort_trigger,
 	},
 	.ot_offset	= offsetof(struct ocfs2_xattr_block, xb_check),
@@ -588,14 +591,14 @@ static struct ocfs2_triggers xb_triggers = {
 
 static struct ocfs2_triggers dq_triggers = {
 	.ot_triggers = {
-		.t_commit = ocfs2_dq_commit_trigger,
+		.t_frozen = ocfs2_dq_frozen_trigger,
 		.t_abort = ocfs2_abort_trigger,
 	},
 };
 
 static struct ocfs2_triggers dr_triggers = {
 	.ot_triggers = {
-		.t_commit = ocfs2_commit_trigger,
+		.t_frozen = ocfs2_frozen_trigger,
 		.t_abort = ocfs2_abort_trigger,
 	},
 	.ot_offset	= offsetof(struct ocfs2_dx_root_block, dr_check),
@@ -603,7 +606,7 @@ static struct ocfs2_triggers dr_triggers = {
 
 static struct ocfs2_triggers dl_triggers = {
 	.ot_triggers = {
-		.t_commit = ocfs2_commit_trigger,
+		.t_frozen = ocfs2_frozen_trigger,
 		.t_abort = ocfs2_abort_trigger,
 	},
 	.ot_offset	= offsetof(struct ocfs2_dx_leaf, dl_check),
@@ -734,8 +737,7 @@ int ocfs2_journal_access(handle_t *handle, struct ocfs2_caching_info *ci,
 	return __ocfs2_journal_access(handle, ci, bh, NULL, type);
 }
 
-int ocfs2_journal_dirty(handle_t *handle,
-			struct buffer_head *bh)
+void ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh)
 {
 	int status;
 
@@ -743,13 +745,9 @@ int ocfs2_journal_dirty(handle_t *handle,
 		   (unsigned long long)bh->b_blocknr);
 
 	status = jbd2_journal_dirty_metadata(handle, bh);
-	if (status < 0)
-		mlog(ML_ERROR, "Could not dirty metadata buffer. "
-		     "(bh->b_blocknr=%llu)\n",
-		     (unsigned long long)bh->b_blocknr);
+	BUG_ON(status);
 
-	mlog_exit(status);
-	return status;
+	mlog_exit_void();
 }
 
 #define OCFS2_DEFAULT_COMMIT_INTERVAL	(HZ * JBD2_DEFAULT_MAX_COMMIT_AGE)
@@ -1938,7 +1936,7 @@ void ocfs2_orphan_scan_work(struct work_struct *work)
 	mutex_lock(&os->os_lock);
 	ocfs2_queue_orphan_scan(osb);
 	if (atomic_read(&os->os_state) == ORPHAN_SCAN_ACTIVE)
-		schedule_delayed_work(&os->os_orphan_scan_work,
+		queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work,
 				      ocfs2_orphan_scan_timeout());
 	mutex_unlock(&os->os_lock);
 }
@@ -1978,8 +1976,8 @@ void ocfs2_orphan_scan_start(struct ocfs2_super *osb)
 		atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE);
 	else {
 		atomic_set(&os->os_state, ORPHAN_SCAN_ACTIVE);
-		schedule_delayed_work(&os->os_orphan_scan_work,
-				      ocfs2_orphan_scan_timeout());
+		queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work,
+				   ocfs2_orphan_scan_timeout());
 	}
 }
 
@@ -2034,7 +2032,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb,
 		status = -ENOENT;
 		mlog_errno(status);
 		return status;
-	}	
+	}
 
 	mutex_lock(&orphan_dir_inode->i_mutex);
 	status = ocfs2_inode_lock(orphan_dir_inode, NULL, 0);
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 3f74e09b0d80..b5baaa8e710f 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -325,8 +325,7 @@ int ocfs2_journal_access(handle_t *handle, struct ocfs2_caching_info *ci,
  *	<modify the bh>
  * 	ocfs2_journal_dirty(handle, bh);
  */
-int                  ocfs2_journal_dirty(handle_t *handle,
-					 struct buffer_head *bh);
+void ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh);
 
 /*
  *  Credit Macros:
@@ -562,6 +561,18 @@ static inline int ocfs2_calc_group_alloc_credits(struct super_block *sb,
 	return blocks;
 }
 
+/*
+ * Allocating a discontiguous block group requires the credits from
+ * ocfs2_calc_group_alloc_credits() as well as enough credits to fill
+ * the group descriptor's extent list.  The caller already has started
+ * the transaction with ocfs2_calc_group_alloc_credits().  They extend
+ * it with these credits.
+ */
+static inline int ocfs2_calc_bg_discontig_credits(struct super_block *sb)
+{
+	return ocfs2_extent_recs_per_gd(sb);
+}
+
 static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb,
 						unsigned int clusters_to_del,
 						struct ocfs2_dinode *fe,
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index ac10f83edb95..ec6adbf8f551 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -52,7 +52,8 @@ static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc);
 
 static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
 					     struct ocfs2_dinode *alloc,
-					     u32 numbits);
+					     u32 *numbits,
+					     struct ocfs2_alloc_reservation *resv);
 
 static void ocfs2_clear_local_alloc(struct ocfs2_dinode *alloc);
 
@@ -74,6 +75,151 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
 static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
 					  struct inode *local_alloc_inode);
 
+/*
+ * ocfs2_la_default_mb() - determine a default size, in megabytes of
+ * the local alloc.
+ *
+ * Generally, we'd like to pick as large a local alloc as
+ * possible. Performance on large workloads tends to scale
+ * proportionally to la size. In addition to that, the reservations
+ * code functions more efficiently as it can reserve more windows for
+ * write.
+ *
+ * Some things work against us when trying to choose a large local alloc:
+ *
+ * - We need to ensure our sizing is picked to leave enough space in
+ *   group descriptors for other allocations (such as block groups,
+ *   etc). Picking default sizes which are a multiple of 4 could help
+ *   - block groups are allocated in 2mb and 4mb chunks.
+ *
+ * - Likewise, we don't want to starve other nodes of bits on small
+ *   file systems. This can easily be taken care of by limiting our
+ *   default to a reasonable size (256M) on larger cluster sizes.
+ *
+ * - Some file systems can't support very large sizes - 4k and 8k in
+ *   particular are limited to less than 128 and 256 megabytes respectively.
+ *
+ * The following reference table shows group descriptor and local
+ * alloc maximums at various cluster sizes (4k blocksize)
+ *
+ * csize: 4K	group: 126M	la: 121M
+ * csize: 8K	group: 252M	la: 243M
+ * csize: 16K	group: 504M	la: 486M
+ * csize: 32K	group: 1008M	la: 972M
+ * csize: 64K	group: 2016M	la: 1944M
+ * csize: 128K	group: 4032M	la: 3888M
+ * csize: 256K	group: 8064M	la: 7776M
+ * csize: 512K	group: 16128M	la: 15552M
+ * csize: 1024K	group: 32256M	la: 31104M
+ */
+#define	OCFS2_LA_MAX_DEFAULT_MB	256
+#define	OCFS2_LA_OLD_DEFAULT	8
+unsigned int ocfs2_la_default_mb(struct ocfs2_super *osb)
+{
+	unsigned int la_mb;
+	unsigned int gd_mb;
+	unsigned int la_max_mb;
+	unsigned int megs_per_slot;
+	struct super_block *sb = osb->sb;
+
+	gd_mb = ocfs2_clusters_to_megabytes(osb->sb,
+		8 * ocfs2_group_bitmap_size(sb, 0, osb->s_feature_incompat));
+
+	/*
+	 * This takes care of files systems with very small group
+	 * descriptors - 512 byte blocksize at cluster sizes lower
+	 * than 16K and also 1k blocksize with 4k cluster size.
+	 */
+	if ((sb->s_blocksize == 512 && osb->s_clustersize <= 8192)
+	    || (sb->s_blocksize == 1024 && osb->s_clustersize == 4096))
+		return OCFS2_LA_OLD_DEFAULT;
+
+	/*
+	 * Leave enough room for some block groups and make the final
+	 * value we work from a multiple of 4.
+	 */
+	gd_mb -= 16;
+	gd_mb &= 0xFFFFFFFB;
+
+	la_mb = gd_mb;
+
+	/*
+	 * Keep window sizes down to a reasonable default
+	 */
+	if (la_mb > OCFS2_LA_MAX_DEFAULT_MB) {
+		/*
+		 * Some clustersize / blocksize combinations will have
+		 * given us a larger than OCFS2_LA_MAX_DEFAULT_MB
+		 * default size, but get poor distribution when
+		 * limited to exactly 256 megabytes.
+		 *
+		 * As an example, 16K clustersize at 4K blocksize
+		 * gives us a cluster group size of 504M. Paring the
+		 * local alloc size down to 256 however, would give us
+		 * only one window and around 200MB left in the
+		 * cluster group. Instead, find the first size below
+		 * 256 which would give us an even distribution.
+		 *
+		 * Larger cluster group sizes actually work out pretty
+		 * well when pared to 256, so we don't have to do this
+		 * for any group that fits more than two
+		 * OCFS2_LA_MAX_DEFAULT_MB windows.
+		 */
+		if (gd_mb > (2 * OCFS2_LA_MAX_DEFAULT_MB))
+			la_mb = 256;
+		else {
+			unsigned int gd_mult = gd_mb;
+
+			while (gd_mult > 256)
+				gd_mult = gd_mult >> 1;
+
+			la_mb = gd_mult;
+		}
+	}
+
+	megs_per_slot = osb->osb_clusters_at_boot / osb->max_slots;
+	megs_per_slot = ocfs2_clusters_to_megabytes(osb->sb, megs_per_slot);
+	/* Too many nodes, too few disk clusters. */
+	if (megs_per_slot < la_mb)
+		la_mb = megs_per_slot;
+
+	/* We can't store more bits than we can in a block. */
+	la_max_mb = ocfs2_clusters_to_megabytes(osb->sb,
+						ocfs2_local_alloc_size(sb) * 8);
+	if (la_mb > la_max_mb)
+		la_mb = la_max_mb;
+
+	return la_mb;
+}
+
+void ocfs2_la_set_sizes(struct ocfs2_super *osb, int requested_mb)
+{
+	struct super_block *sb = osb->sb;
+	unsigned int la_default_mb = ocfs2_la_default_mb(osb);
+	unsigned int la_max_mb;
+
+	la_max_mb = ocfs2_clusters_to_megabytes(sb,
+						ocfs2_local_alloc_size(sb) * 8);
+
+	mlog(0, "requested: %dM, max: %uM, default: %uM\n",
+	     requested_mb, la_max_mb, la_default_mb);
+
+	if (requested_mb == -1) {
+		/* No user request - use defaults */
+		osb->local_alloc_default_bits =
+			ocfs2_megabytes_to_clusters(sb, la_default_mb);
+	} else if (requested_mb > la_max_mb) {
+		/* Request is too big, we give the maximum available */
+		osb->local_alloc_default_bits =
+			ocfs2_megabytes_to_clusters(sb, la_max_mb);
+	} else {
+		osb->local_alloc_default_bits =
+			ocfs2_megabytes_to_clusters(sb, requested_mb);
+	}
+
+	osb->local_alloc_bits = osb->local_alloc_default_bits;
+}
+
 static inline int ocfs2_la_state_enabled(struct ocfs2_super *osb)
 {
 	return (osb->local_alloc_state == OCFS2_LA_THROTTLED ||
@@ -156,7 +302,7 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
 		     osb->local_alloc_bits, (osb->bitmap_cpg - 1));
 		osb->local_alloc_bits =
 			ocfs2_megabytes_to_clusters(osb->sb,
-						    OCFS2_DEFAULT_LOCAL_ALLOC_SIZE);
+						    ocfs2_la_default_mb(osb));
 	}
 
 	/* read the alloc off disk */
@@ -262,6 +408,8 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
 
 	osb->local_alloc_state = OCFS2_LA_DISABLED;
 
+	ocfs2_resmap_uninit(&osb->osb_la_resmap);
+
 	main_bm_inode = ocfs2_get_system_file_inode(osb,
 						    GLOBAL_BITMAP_SYSTEM_INODE,
 						    OCFS2_INVALID_SLOT);
@@ -305,12 +453,7 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
 	}
 
 	ocfs2_clear_local_alloc(alloc);
-
-	status = ocfs2_journal_dirty(handle, bh);
-	if (status < 0) {
-		mlog_errno(status);
-		goto out_commit;
-	}
+	ocfs2_journal_dirty(handle, bh);
 
 	brelse(bh);
 	osb->local_alloc_bh = NULL;
@@ -476,51 +619,11 @@ out_mutex:
 
 out:
 	if (!status)
-		ocfs2_init_inode_steal_slot(osb);
+		ocfs2_init_steal_slots(osb);
 	mlog_exit(status);
 	return status;
 }
 
-/* Check to see if the local alloc window is within ac->ac_max_block */
-static int ocfs2_local_alloc_in_range(struct inode *inode,
-				      struct ocfs2_alloc_context *ac,
-				      u32 bits_wanted)
-{
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-	struct ocfs2_dinode *alloc;
-	struct ocfs2_local_alloc *la;
-	int start;
-	u64 block_off;
-
-	if (!ac->ac_max_block)
-		return 1;
-
-	alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
-	la = OCFS2_LOCAL_ALLOC(alloc);
-
-	start = ocfs2_local_alloc_find_clear_bits(osb, alloc, bits_wanted);
-	if (start == -1) {
-		mlog_errno(-ENOSPC);
-		return 0;
-	}
-
-	/*
-	 * Converting (bm_off + start + bits_wanted) to blocks gives us
-	 * the blkno just past our actual allocation.  This is perfect
-	 * to compare with ac_max_block.
-	 */
-	block_off = ocfs2_clusters_to_blocks(inode->i_sb,
-					     le32_to_cpu(la->la_bm_off) +
-					     start + bits_wanted);
-	mlog(0, "Checking %llu against %llu\n",
-	     (unsigned long long)block_off,
-	     (unsigned long long)ac->ac_max_block);
-	if (block_off > ac->ac_max_block)
-		return 0;
-
-	return 1;
-}
-
 /*
  * make sure we've got at least bits_wanted contiguous bits in the
  * local alloc. You lose them when you drop i_mutex.
@@ -613,17 +716,6 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
 		mlog(0, "Calling in_range for max block %llu\n",
 		     (unsigned long long)ac->ac_max_block);
 
-	if (!ocfs2_local_alloc_in_range(local_alloc_inode, ac,
-					bits_wanted)) {
-		/*
-		 * The window is outside ac->ac_max_block.
-		 * This errno tells the caller to keep localalloc enabled
-		 * but to get the allocation from the main bitmap.
-		 */
-		status = -EFBIG;
-		goto bail;
-	}
-
 	ac->ac_inode = local_alloc_inode;
 	/* We should never use localalloc from another slot */
 	ac->ac_alloc_slot = osb->slot_num;
@@ -664,7 +756,8 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
 	alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
 	la = OCFS2_LOCAL_ALLOC(alloc);
 
-	start = ocfs2_local_alloc_find_clear_bits(osb, alloc, bits_wanted);
+	start = ocfs2_local_alloc_find_clear_bits(osb, alloc, &bits_wanted,
+						  ac->ac_resv);
 	if (start == -1) {
 		/* TODO: Shouldn't we just BUG here? */
 		status = -ENOSPC;
@@ -674,8 +767,6 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
 
 	bitmap = la->la_bitmap;
 	*bit_off = le32_to_cpu(la->la_bm_off) + start;
-	/* local alloc is always contiguous by nature -- we never
-	 * delete bits from it! */
 	*num_bits = bits_wanted;
 
 	status = ocfs2_journal_access_di(handle,
@@ -687,18 +778,15 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
 		goto bail;
 	}
 
+	ocfs2_resmap_claimed_bits(&osb->osb_la_resmap, ac->ac_resv, start,
+				  bits_wanted);
+
 	while(bits_wanted--)
 		ocfs2_set_bit(start++, bitmap);
 
 	le32_add_cpu(&alloc->id1.bitmap1.i_used, *num_bits);
+	ocfs2_journal_dirty(handle, osb->local_alloc_bh);
 
-	status = ocfs2_journal_dirty(handle, osb->local_alloc_bh);
-	if (status < 0) {
-		mlog_errno(status);
-		goto bail;
-	}
-
-	status = 0;
 bail:
 	mlog_exit(status);
 	return status;
@@ -722,13 +810,17 @@ static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc)
 }
 
 static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
-					     struct ocfs2_dinode *alloc,
-					     u32 numbits)
+				     struct ocfs2_dinode *alloc,
+				     u32 *numbits,
+				     struct ocfs2_alloc_reservation *resv)
 {
 	int numfound, bitoff, left, startoff, lastzero;
+	int local_resv = 0;
+	struct ocfs2_alloc_reservation r;
 	void *bitmap = NULL;
+	struct ocfs2_reservation_map *resmap = &osb->osb_la_resmap;
 
-	mlog_entry("(numbits wanted = %u)\n", numbits);
+	mlog_entry("(numbits wanted = %u)\n", *numbits);
 
 	if (!alloc->id1.bitmap1.i_total) {
 		mlog(0, "No bits in my window!\n");
@@ -736,6 +828,30 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
 		goto bail;
 	}
 
+	if (!resv) {
+		local_resv = 1;
+		ocfs2_resv_init_once(&r);
+		ocfs2_resv_set_type(&r, OCFS2_RESV_FLAG_TMP);
+		resv = &r;
+	}
+
+	numfound = *numbits;
+	if (ocfs2_resmap_resv_bits(resmap, resv, &bitoff, &numfound) == 0) {
+		if (numfound < *numbits)
+			*numbits = numfound;
+		goto bail;
+	}
+
+	/*
+	 * Code error. While reservations are enabled, local
+	 * allocation should _always_ go through them.
+	 */
+	BUG_ON(osb->osb_resv_level != 0);
+
+	/*
+	 * Reservations are disabled. Handle this the old way.
+	 */
+
 	bitmap = OCFS2_LOCAL_ALLOC(alloc)->la_bitmap;
 
 	numfound = bitoff = startoff = 0;
@@ -761,7 +877,7 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
 			startoff = bitoff+1;
 		}
 		/* we got everything we needed */
-		if (numfound == numbits) {
+		if (numfound == *numbits) {
 			/* mlog(0, "Found it all!\n"); */
 			break;
 		}
@@ -770,12 +886,15 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
 	mlog(0, "Exiting loop, bitoff = %d, numfound = %d\n", bitoff,
 	     numfound);
 
-	if (numfound == numbits)
+	if (numfound == *numbits)
 		bitoff = startoff - numfound;
 	else
 		bitoff = -1;
 
 bail:
+	if (local_resv)
+		ocfs2_resv_discard(resmap, resv);
+
 	mlog_exit(bitoff);
 	return bitoff;
 }
@@ -872,8 +991,10 @@ static int ocfs2_sync_local_to_main(struct ocfs2_super *osb,
 			     (unsigned long long)la_start_blk,
 			     (unsigned long long)blkno);
 
-			status = ocfs2_free_clusters(handle, main_bm_inode,
-						     main_bm_bh, blkno, count);
+			status = ocfs2_release_clusters(handle,
+							main_bm_inode,
+							main_bm_bh, blkno,
+							count);
 			if (status < 0) {
 				mlog_errno(status);
 				goto bail;
@@ -984,8 +1105,7 @@ static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb,
 	}
 
 retry_enospc:
-	(*ac)->ac_bits_wanted = osb->local_alloc_bits;
-
+	(*ac)->ac_bits_wanted = osb->local_alloc_default_bits;
 	status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
 	if (status == -ENOSPC) {
 		if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_ENOSPC) ==
@@ -1048,7 +1168,7 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
 	/* we used the generic suballoc reserve function, but we set
 	 * everything up nicely, so there's no reason why we can't use
 	 * the more specific cluster api to claim bits. */
-	status = ocfs2_claim_clusters(osb, handle, ac, osb->local_alloc_bits,
+	status = ocfs2_claim_clusters(handle, ac, osb->local_alloc_bits,
 				      &cluster_off, &cluster_count);
 	if (status == -ENOSPC) {
 retry_enospc:
@@ -1061,7 +1181,8 @@ retry_enospc:
 		    OCFS2_LA_DISABLED)
 			goto bail;
 
-		status = ocfs2_claim_clusters(osb, handle, ac,
+		ac->ac_bits_wanted = osb->local_alloc_default_bits;
+		status = ocfs2_claim_clusters(handle, ac,
 					      osb->local_alloc_bits,
 					      &cluster_off,
 					      &cluster_count);
@@ -1096,6 +1217,9 @@ retry_enospc:
 	memset(OCFS2_LOCAL_ALLOC(alloc)->la_bitmap, 0,
 	       le16_to_cpu(la->la_size));
 
+	ocfs2_resmap_restart(&osb->osb_la_resmap, cluster_count,
+			     OCFS2_LOCAL_ALLOC(alloc)->la_bitmap);
+
 	mlog(0, "New window allocated:\n");
 	mlog(0, "window la_bm_off = %u\n",
 	     OCFS2_LOCAL_ALLOC(alloc)->la_bm_off);
@@ -1167,12 +1291,7 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
 	}
 
 	ocfs2_clear_local_alloc(alloc);
-
-	status = ocfs2_journal_dirty(handle, osb->local_alloc_bh);
-	if (status < 0) {
-		mlog_errno(status);
-		goto bail;
-	}
+	ocfs2_journal_dirty(handle, osb->local_alloc_bh);
 
 	status = ocfs2_sync_local_to_main(osb, handle, alloc_copy,
 					  main_bm_inode, main_bm_bh);
@@ -1190,7 +1309,6 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
 
 	atomic_inc(&osb->alloc_stats.moves);
 
-	status = 0;
 bail:
 	if (handle)
 		ocfs2_commit_trans(osb, handle);
diff --git a/fs/ocfs2/localalloc.h b/fs/ocfs2/localalloc.h
index ac5ea9f86653..1be9b5864460 100644
--- a/fs/ocfs2/localalloc.h
+++ b/fs/ocfs2/localalloc.h
@@ -30,6 +30,9 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb);
 
 void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb);
 
+void ocfs2_la_set_sizes(struct ocfs2_super *osb, int requested_mb);
+unsigned int ocfs2_la_default_mb(struct ocfs2_super *osb);
+
 int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
 				     int node_num,
 				     struct ocfs2_dinode **alloc_copy);
diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c
index 544ac6245175..b5cb3ede9408 100644
--- a/fs/ocfs2/locks.c
+++ b/fs/ocfs2/locks.c
@@ -133,7 +133,7 @@ int ocfs2_lock(struct file *file, int cmd, struct file_lock *fl)
 
 	if (!(fl->fl_flags & FL_POSIX))
 		return -ENOLCK;
-	if (__mandatory_lock(inode))
+	if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
 		return -ENOLCK;
 
 	return ocfs2_plock(osb->cconn, OCFS2_I(inode)->ip_blkno, file, cmd, fl);
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 39737613424a..af2b8fe1f139 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -25,7 +25,6 @@
 
 #include <linux/fs.h>
 #include <linux/types.h>
-#include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
 #include <linux/uio.h>
@@ -42,44 +41,20 @@
 #include "file.h"
 #include "inode.h"
 #include "mmap.h"
+#include "super.h"
 
-static inline int ocfs2_vm_op_block_sigs(sigset_t *blocked, sigset_t *oldset)
-{
-	/* The best way to deal with signals in the vm path is
-	 * to block them upfront, rather than allowing the
-	 * locking paths to return -ERESTARTSYS. */
-	sigfillset(blocked);
-
-	/* We should technically never get a bad return value
-	 * from sigprocmask */
-	return sigprocmask(SIG_BLOCK, blocked, oldset);
-}
-
-static inline int ocfs2_vm_op_unblock_sigs(sigset_t *oldset)
-{
-	return sigprocmask(SIG_SETMASK, oldset, NULL);
-}
 
 static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf)
 {
-	sigset_t blocked, oldset;
-	int error, ret;
+	sigset_t oldset;
+	int ret;
 
 	mlog_entry("(area=%p, page offset=%lu)\n", area, vmf->pgoff);
 
-	error = ocfs2_vm_op_block_sigs(&blocked, &oldset);
-	if (error < 0) {
-		mlog_errno(error);
-		ret = VM_FAULT_SIGBUS;
-		goto out;
-	}
-
+	ocfs2_block_signals(&oldset);
 	ret = filemap_fault(area, vmf);
+	ocfs2_unblock_signals(&oldset);
 
-	error = ocfs2_vm_op_unblock_sigs(&oldset);
-	if (error < 0)
-		mlog_errno(error);
-out:
 	mlog_exit_ptr(vmf->page);
 	return ret;
 }
@@ -159,14 +134,10 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	struct page *page = vmf->page;
 	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
 	struct buffer_head *di_bh = NULL;
-	sigset_t blocked, oldset;
-	int ret, ret2;
+	sigset_t oldset;
+	int ret;
 
-	ret = ocfs2_vm_op_block_sigs(&blocked, &oldset);
-	if (ret < 0) {
-		mlog_errno(ret);
-		return ret;
-	}
+	ocfs2_block_signals(&oldset);
 
 	/*
 	 * The cluster locks taken will block a truncate from another
@@ -194,9 +165,7 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	ocfs2_inode_unlock(inode, 1);
 
 out:
-	ret2 = ocfs2_vm_op_unblock_sigs(&oldset);
-	if (ret2 < 0)
-		mlog_errno(ret2);
+	ocfs2_unblock_signals(&oldset);
 	if (ret)
 		ret = VM_FAULT_SIGBUS;
 	return ret;
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 50fb26a6a5f5..f171b51a74f7 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -84,7 +84,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
 static int ocfs2_orphan_add(struct ocfs2_super *osb,
 			    handle_t *handle,
 			    struct inode *inode,
-			    struct ocfs2_dinode *fe,
+			    struct buffer_head *fe_bh,
 			    char *name,
 			    struct ocfs2_dir_lookup_result *lookup,
 			    struct inode *orphan_dir_inode);
@@ -204,15 +204,8 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, int mode)
 		inode->i_nlink = 2;
 	else
 		inode->i_nlink = 1;
-	inode->i_uid = current_fsuid();
-	if (dir->i_mode & S_ISGID) {
-		inode->i_gid = dir->i_gid;
-		if (S_ISDIR(mode))
-			mode |= S_ISGID;
-	} else
-		inode->i_gid = current_fsgid();
-	inode->i_mode = mode;
-	vfs_dq_init(inode);
+	inode_init_owner(inode, dir, mode);
+	dquot_initialize(inode);
 	return inode;
 }
 
@@ -239,11 +232,15 @@ static int ocfs2_mknod(struct inode *dir,
 	};
 	int did_quota_inode = 0;
 	struct ocfs2_dir_lookup_result lookup = { NULL, };
+	sigset_t oldset;
+	int did_block_signals = 0;
 
 	mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode,
 		   (unsigned long)dev, dentry->d_name.len,
 		   dentry->d_name.name);
 
+	dquot_initialize(dir);
+
 	/* get our super block */
 	osb = OCFS2_SB(dir->i_sb);
 
@@ -348,13 +345,13 @@ static int ocfs2_mknod(struct inode *dir,
 		goto leave;
 	}
 
-	/* We don't use standard VFS wrapper because we don't want vfs_dq_init
-	 * to be called. */
-	if (sb_any_quota_active(osb->sb) &&
-	    osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) {
-		status = -EDQUOT;
+	/* Starting to change things, restart is no longer possible. */
+	ocfs2_block_signals(&oldset);
+	did_block_signals = 1;
+
+	status = dquot_alloc_inode(inode);
+	if (status)
 		goto leave;
-	}
 	did_quota_inode = 1;
 
 	mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry,
@@ -386,11 +383,7 @@ static int ocfs2_mknod(struct inode *dir,
 			goto leave;
 		}
 		ocfs2_add_links_count(dirfe, 1);
-		status = ocfs2_journal_dirty(handle, parent_fe_bh);
-		if (status < 0) {
-			mlog_errno(status);
-			goto leave;
-		}
+		ocfs2_journal_dirty(handle, parent_fe_bh);
 		inc_nlink(dir);
 	}
 
@@ -410,32 +403,39 @@ static int ocfs2_mknod(struct inode *dir,
 		}
 	}
 
-	status = ocfs2_add_entry(handle, dentry, inode,
-				 OCFS2_I(inode)->ip_blkno, parent_fe_bh,
-				 &lookup);
-	if (status < 0) {
+	/*
+	 * Do this before adding the entry to the directory. We add
+	 * also set d_op after success so that ->d_iput() will cleanup
+	 * the dentry lock even if ocfs2_add_entry() fails below.
+	 */
+	status = ocfs2_dentry_attach_lock(dentry, inode,
+					  OCFS2_I(dir)->ip_blkno);
+	if (status) {
 		mlog_errno(status);
 		goto leave;
 	}
+	dentry->d_op = &ocfs2_dentry_ops;
 
-	status = ocfs2_dentry_attach_lock(dentry, inode,
-					  OCFS2_I(dir)->ip_blkno);
-	if (status) {
+	status = ocfs2_add_entry(handle, dentry, inode,
+				 OCFS2_I(inode)->ip_blkno, parent_fe_bh,
+				 &lookup);
+	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
 	}
 
 	insert_inode_hash(inode);
-	dentry->d_op = &ocfs2_dentry_ops;
 	d_instantiate(dentry, inode);
 	status = 0;
 leave:
 	if (status < 0 && did_quota_inode)
-		vfs_dq_free_inode(inode);
+		dquot_free_inode(inode);
 	if (handle)
 		ocfs2_commit_trans(osb, handle);
 
 	ocfs2_inode_unlock(dir, 1);
+	if (did_block_signals)
+		ocfs2_unblock_signals(&oldset);
 
 	if (status == -ENOSPC)
 		mlog(0, "Disk is full\n");
@@ -447,11 +447,6 @@ leave:
 
 	ocfs2_free_dir_lookup_result(&lookup);
 
-	if ((status < 0) && inode) {
-		clear_nlink(inode);
-		iput(inode);
-	}
-
 	if (inode_ac)
 		ocfs2_free_alloc_context(inode_ac);
 
@@ -461,6 +456,17 @@ leave:
 	if (meta_ac)
 		ocfs2_free_alloc_context(meta_ac);
 
+	/*
+	 * We should call iput after the i_mutex of the bitmap been
+	 * unlocked in ocfs2_free_alloc_context, or the
+	 * ocfs2_delete_inode will mutex_lock again.
+	 */
+	if ((status < 0) && inode) {
+		OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SKIP_ORPHAN_DIR;
+		clear_nlink(inode);
+		iput(inode);
+	}
+
 	mlog_exit(status);
 
 	return status;
@@ -478,14 +484,15 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 	int status = 0;
 	struct ocfs2_dinode *fe = NULL;
 	struct ocfs2_extent_list *fel;
-	u64 fe_blkno = 0;
+	u64 suballoc_loc, fe_blkno = 0;
 	u16 suballoc_bit;
 	u16 feat;
 
 	*new_fe_bh = NULL;
 
-	status = ocfs2_claim_new_inode(osb, handle, dir, parent_fe_bh,
-				       inode_ac, &suballoc_bit, &fe_blkno);
+	status = ocfs2_claim_new_inode(handle, dir, parent_fe_bh,
+				       inode_ac, &suballoc_loc,
+				       &suballoc_bit, &fe_blkno);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
@@ -522,6 +529,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 	fe->i_generation = cpu_to_le32(inode->i_generation);
 	fe->i_fs_generation = cpu_to_le32(osb->fs_generation);
 	fe->i_blkno = cpu_to_le64(fe_blkno);
+	fe->i_suballoc_loc = cpu_to_le64(suballoc_loc);
 	fe->i_suballoc_bit = cpu_to_le16(suballoc_bit);
 	fe->i_suballoc_slot = cpu_to_le16(inode_ac->ac_alloc_slot);
 	fe->i_uid = cpu_to_le32(inode->i_uid);
@@ -558,11 +566,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 		fel->l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(osb->sb));
 	}
 
-	status = ocfs2_journal_dirty(handle, *new_fe_bh);
-	if (status < 0) {
-		mlog_errno(status);
-		goto leave;
-	}
+	ocfs2_journal_dirty(handle, *new_fe_bh);
 
 	ocfs2_populate_inode(inode, fe, 1);
 	ocfs2_ci_set_new(osb, INODE_CACHE(inode));
@@ -628,6 +632,7 @@ static int ocfs2_link(struct dentry *old_dentry,
 	struct ocfs2_dinode *fe = NULL;
 	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
 	struct ocfs2_dir_lookup_result lookup = { NULL, };
+	sigset_t oldset;
 
 	mlog_entry("(inode=%lu, old='%.*s' new='%.*s')\n", inode->i_ino,
 		   old_dentry->d_name.len, old_dentry->d_name.name,
@@ -636,6 +641,8 @@ static int ocfs2_link(struct dentry *old_dentry,
 	if (S_ISDIR(inode->i_mode))
 		return -EPERM;
 
+	dquot_initialize(dir);
+
 	err = ocfs2_inode_lock_nested(dir, &parent_fe_bh, 1, OI_LS_PARENT);
 	if (err < 0) {
 		if (err != -ENOENT)
@@ -682,6 +689,9 @@ static int ocfs2_link(struct dentry *old_dentry,
 		goto out_unlock_inode;
 	}
 
+	/* Starting to change things, restart is no longer possible. */
+	ocfs2_block_signals(&oldset);
+
 	err = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh,
 				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (err < 0) {
@@ -694,14 +704,7 @@ static int ocfs2_link(struct dentry *old_dentry,
 	ocfs2_set_links_count(fe, inode->i_nlink);
 	fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
 	fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
-
-	err = ocfs2_journal_dirty(handle, fe_bh);
-	if (err < 0) {
-		ocfs2_add_links_count(fe, -1);
-		drop_nlink(inode);
-		mlog_errno(err);
-		goto out_commit;
-	}
+	ocfs2_journal_dirty(handle, fe_bh);
 
 	err = ocfs2_add_entry(handle, dentry, inode,
 			      OCFS2_I(inode)->ip_blkno,
@@ -725,6 +728,7 @@ static int ocfs2_link(struct dentry *old_dentry,
 
 out_commit:
 	ocfs2_commit_trans(osb, handle);
+	ocfs2_unblock_signals(&oldset);
 out_unlock_inode:
 	ocfs2_inode_unlock(inode, 1);
 
@@ -791,6 +795,8 @@ static int ocfs2_unlink(struct inode *dir,
 	mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry,
 		   dentry->d_name.len, dentry->d_name.name);
 
+	dquot_initialize(dir);
+
 	BUG_ON(dentry->d_parent->d_inode != dir);
 
 	mlog(0, "ino = %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno);
@@ -877,7 +883,7 @@ static int ocfs2_unlink(struct inode *dir,
 	fe = (struct ocfs2_dinode *) fe_bh->b_data;
 
 	if (inode_is_unlinkable(inode)) {
-		status = ocfs2_orphan_add(osb, handle, inode, fe, orphan_name,
+		status = ocfs2_orphan_add(osb, handle, inode, fe_bh, orphan_name,
 					  &orphan_insert, orphan_dir);
 		if (status < 0) {
 			mlog_errno(status);
@@ -896,12 +902,7 @@ static int ocfs2_unlink(struct inode *dir,
 		drop_nlink(inode);
 	drop_nlink(inode);
 	ocfs2_set_links_count(fe, inode->i_nlink);
-
-	status = ocfs2_journal_dirty(handle, fe_bh);
-	if (status < 0) {
-		mlog_errno(status);
-		goto leave;
-	}
+	ocfs2_journal_dirty(handle, fe_bh);
 
 	dir->i_ctime = dir->i_mtime = CURRENT_TIME;
 	if (S_ISDIR(inode->i_mode))
@@ -1051,6 +1052,9 @@ static int ocfs2_rename(struct inode *old_dir,
 		   old_dentry->d_name.len, old_dentry->d_name.name,
 		   new_dentry->d_name.len, new_dentry->d_name.name);
 
+	dquot_initialize(old_dir);
+	dquot_initialize(new_dir);
+
 	osb = OCFS2_SB(old_dir->i_sb);
 
 	if (new_inode) {
@@ -1295,7 +1299,7 @@ static int ocfs2_rename(struct inode *old_dir,
 		if (S_ISDIR(new_inode->i_mode) ||
 		    (ocfs2_read_links_count(newfe) == 1)) {
 			status = ocfs2_orphan_add(osb, handle, new_inode,
-						  newfe, orphan_name,
+						  newfe_bh, orphan_name,
 						  &orphan_insert, orphan_dir);
 			if (status < 0) {
 				mlog_errno(status);
@@ -1316,12 +1320,7 @@ static int ocfs2_rename(struct inode *old_dir,
 			ocfs2_set_links_count(newfe, 0);
 		else
 			ocfs2_add_links_count(newfe, -1);
-
-		status = ocfs2_journal_dirty(handle, newfe_bh);
-		if (status < 0) {
-			mlog_errno(status);
-			goto bail;
-		}
+		ocfs2_journal_dirty(handle, newfe_bh);
 	} else {
 		/* if the name was not found in new_dir, add it now */
 		status = ocfs2_add_entry(handle, new_dentry, old_inode,
@@ -1340,10 +1339,7 @@ static int ocfs2_rename(struct inode *old_dir,
 
 		old_di->i_ctime = cpu_to_le64(old_inode->i_ctime.tv_sec);
 		old_di->i_ctime_nsec = cpu_to_le32(old_inode->i_ctime.tv_nsec);
-
-		status = ocfs2_journal_dirty(handle, old_inode_bh);
-		if (status < 0)
-			mlog_errno(status);
+		ocfs2_journal_dirty(handle, old_inode_bh);
 	} else
 		mlog_errno(status);
 
@@ -1415,7 +1411,7 @@ static int ocfs2_rename(struct inode *old_dir,
 							 OCFS2_JOURNAL_ACCESS_WRITE);
 			fe = (struct ocfs2_dinode *) old_dir_bh->b_data;
 			ocfs2_set_links_count(fe, old_dir->i_nlink);
-			status = ocfs2_journal_dirty(handle, old_dir_bh);
+			ocfs2_journal_dirty(handle, old_dir_bh);
 		}
 	}
 	ocfs2_dentry_move(old_dentry, new_dentry, old_dir, new_dir);
@@ -1547,11 +1543,7 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
 		       (bytes_left > sb->s_blocksize) ? sb->s_blocksize :
 		       bytes_left);
 
-		status = ocfs2_journal_dirty(handle, bhs[virtual]);
-		if (status < 0) {
-			mlog_errno(status);
-			goto bail;
-		}
+		ocfs2_journal_dirty(handle, bhs[virtual]);
 
 		virtual++;
 		p_blkno++;
@@ -1595,10 +1587,14 @@ static int ocfs2_symlink(struct inode *dir,
 	};
 	int did_quota = 0, did_quota_inode = 0;
 	struct ocfs2_dir_lookup_result lookup = { NULL, };
+	sigset_t oldset;
+	int did_block_signals = 0;
 
 	mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir,
 		   dentry, symname, dentry->d_name.len, dentry->d_name.name);
 
+	dquot_initialize(dir);
+
 	sb = dir->i_sb;
 	osb = OCFS2_SB(sb);
 
@@ -1688,13 +1684,13 @@ static int ocfs2_symlink(struct inode *dir,
 		goto bail;
 	}
 
-	/* We don't use standard VFS wrapper because we don't want vfs_dq_init
-	 * to be called. */
-	if (sb_any_quota_active(osb->sb) &&
-	    osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) {
-		status = -EDQUOT;
+	/* Starting to change things, restart is no longer possible. */
+	ocfs2_block_signals(&oldset);
+	did_block_signals = 1;
+
+	status = dquot_alloc_inode(inode);
+	if (status)
 		goto bail;
-	}
 	did_quota_inode = 1;
 
 	mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry,
@@ -1716,11 +1712,10 @@ static int ocfs2_symlink(struct inode *dir,
 		u32 offset = 0;
 
 		inode->i_op = &ocfs2_symlink_inode_operations;
-		if (vfs_dq_alloc_space_nodirty(inode,
-		    ocfs2_clusters_to_bytes(osb->sb, 1))) {
-			status = -EDQUOT;
+		status = dquot_alloc_space_nodirty(inode,
+		    ocfs2_clusters_to_bytes(osb->sb, 1));
+		if (status)
 			goto bail;
-		}
 		did_quota = 1;
 		status = ocfs2_add_inode_data(osb, inode, &offset, 1, 0,
 					      new_fe_bh,
@@ -1769,33 +1764,40 @@ static int ocfs2_symlink(struct inode *dir,
 		}
 	}
 
-	status = ocfs2_add_entry(handle, dentry, inode,
-				 le64_to_cpu(fe->i_blkno), parent_fe_bh,
-				 &lookup);
-	if (status < 0) {
+	/*
+	 * Do this before adding the entry to the directory. We add
+	 * also set d_op after success so that ->d_iput() will cleanup
+	 * the dentry lock even if ocfs2_add_entry() fails below.
+	 */
+	status = ocfs2_dentry_attach_lock(dentry, inode, OCFS2_I(dir)->ip_blkno);
+	if (status) {
 		mlog_errno(status);
 		goto bail;
 	}
+	dentry->d_op = &ocfs2_dentry_ops;
 
-	status = ocfs2_dentry_attach_lock(dentry, inode, OCFS2_I(dir)->ip_blkno);
-	if (status) {
+	status = ocfs2_add_entry(handle, dentry, inode,
+				 le64_to_cpu(fe->i_blkno), parent_fe_bh,
+				 &lookup);
+	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
 	}
 
 	insert_inode_hash(inode);
-	dentry->d_op = &ocfs2_dentry_ops;
 	d_instantiate(dentry, inode);
 bail:
 	if (status < 0 && did_quota)
-		vfs_dq_free_space_nodirty(inode,
+		dquot_free_space_nodirty(inode,
 					ocfs2_clusters_to_bytes(osb->sb, 1));
 	if (status < 0 && did_quota_inode)
-		vfs_dq_free_inode(inode);
+		dquot_free_inode(inode);
 	if (handle)
 		ocfs2_commit_trans(osb, handle);
 
 	ocfs2_inode_unlock(dir, 1);
+	if (did_block_signals)
+		ocfs2_unblock_signals(&oldset);
 
 	brelse(new_fe_bh);
 	brelse(parent_fe_bh);
@@ -1809,6 +1811,7 @@ bail:
 	if (xattr_ac)
 		ocfs2_free_alloc_context(xattr_ac);
 	if ((status < 0) && inode) {
+		OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SKIP_ORPHAN_DIR;
 		clear_nlink(inode);
 		iput(inode);
 	}
@@ -1909,7 +1912,7 @@ leave:
 static int ocfs2_orphan_add(struct ocfs2_super *osb,
 			    handle_t *handle,
 			    struct inode *inode,
-			    struct ocfs2_dinode *fe,
+			    struct buffer_head *fe_bh,
 			    char *name,
 			    struct ocfs2_dir_lookup_result *lookup,
 			    struct inode *orphan_dir_inode)
@@ -1917,6 +1920,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
 	struct buffer_head *orphan_dir_bh = NULL;
 	int status = 0;
 	struct ocfs2_dinode *orphan_fe;
+	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
 
 	mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
 
@@ -1941,29 +1945,42 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
 	if (S_ISDIR(inode->i_mode))
 		ocfs2_add_links_count(orphan_fe, 1);
 	orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe);
+	ocfs2_journal_dirty(handle, orphan_dir_bh);
 
-	status = ocfs2_journal_dirty(handle, orphan_dir_bh);
+	status = __ocfs2_add_entry(handle, orphan_dir_inode, name,
+				   OCFS2_ORPHAN_NAMELEN, inode,
+				   OCFS2_I(inode)->ip_blkno,
+				   orphan_dir_bh, lookup);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
 	}
 
-	status = __ocfs2_add_entry(handle, orphan_dir_inode, name,
-				   OCFS2_ORPHAN_NAMELEN, inode,
-				   OCFS2_I(inode)->ip_blkno,
-				   orphan_dir_bh, lookup);
+	/*
+	 * We're going to journal the change of i_flags and i_orphaned_slot.
+	 * It's safe anyway, though some callers may duplicate the journaling.
+	 * Journaling within the func just make the logic look more
+	 * straightforward.
+	 */
+	status = ocfs2_journal_access_di(handle,
+					 INODE_CACHE(inode),
+					 fe_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
 	}
 
 	le32_add_cpu(&fe->i_flags, OCFS2_ORPHANED_FL);
+	OCFS2_I(inode)->ip_flags &= ~OCFS2_INODE_SKIP_ORPHAN_DIR;
 
 	/* Record which orphan dir our inode now resides
 	 * in. delete_inode will use this to determine which orphan
 	 * dir to lock. */
 	fe->i_orphaned_slot = cpu_to_le16(osb->slot_num);
 
+	ocfs2_journal_dirty(handle, fe_bh);
+
 	mlog(0, "Inode %llu orphaned in slot %d\n",
 	     (unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num);
 
@@ -2027,12 +2044,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
 	if (S_ISDIR(inode->i_mode))
 		ocfs2_add_links_count(orphan_fe, -1);
 	orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe);
-
-	status = ocfs2_journal_dirty(handle, orphan_dir_bh);
-	if (status < 0) {
-		mlog_errno(status);
-		goto leave;
-	}
+	ocfs2_journal_dirty(handle, orphan_dir_bh);
 
 leave:
 	ocfs2_free_dir_lookup_result(&lookup);
@@ -2099,13 +2111,9 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
 		goto leave;
 	}
 
-	/* We don't use standard VFS wrapper because we don't want vfs_dq_init
-	 * to be called. */
-	if (sb_any_quota_active(osb->sb) &&
-	    osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) {
-		status = -EDQUOT;
+	status = dquot_alloc_inode(inode);
+	if (status)
 		goto leave;
-	}
 	did_quota_inode = 1;
 
 	inode->i_nlink = 0;
@@ -2125,7 +2133,7 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
 	}
 
 	di = (struct ocfs2_dinode *)new_di_bh->b_data;
-	status = ocfs2_orphan_add(osb, handle, inode, di, orphan_name,
+	status = ocfs2_orphan_add(osb, handle, inode, new_di_bh, orphan_name,
 				  &orphan_insert, orphan_dir);
 	if (status < 0) {
 		mlog_errno(status);
@@ -2140,7 +2148,7 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
 	insert_inode_hash(inode);
 leave:
 	if (status < 0 && did_quota_inode)
-		vfs_dq_free_inode(inode);
+		dquot_free_inode(inode);
 	if (handle)
 		ocfs2_commit_trans(osb, handle);
 
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 9362eea7424b..c67003b6b5a2 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -42,10 +42,12 @@
 
 #include "ocfs2_fs.h"
 #include "ocfs2_lockid.h"
+#include "ocfs2_ioctl.h"
 
 /* For struct ocfs2_blockcheck_stats */
 #include "blockcheck.h"
 
+#include "reservations.h"
 
 /* Caching of metadata buffers */
 
@@ -136,6 +138,10 @@ enum ocfs2_unlock_action {
 #define OCFS2_LOCK_PENDING       (0x00000400) /* This lockres is pending a
 						 call to dlm_lock.  Only
 						 exists with BUSY set. */
+#define OCFS2_LOCK_UPCONVERT_FINISHING (0x00000800) /* blocks the dc thread
+						     * from downconverting
+						     * before the upconvert
+						     * has completed */
 
 struct ocfs2_lock_res_ops;
 
@@ -155,7 +161,7 @@ struct ocfs2_lock_res {
 	int                      l_level;
 	unsigned int             l_ro_holders;
 	unsigned int             l_ex_holders;
-	union ocfs2_dlm_lksb     l_lksb;
+	struct ocfs2_dlm_lksb    l_lksb;
 
 	/* used from AST/BAST funcs. */
 	enum ocfs2_ast_action    l_action;
@@ -301,7 +307,9 @@ struct ocfs2_super
 	u32 s_next_generation;
 	unsigned long osb_flags;
 	s16 s_inode_steal_slot;
+	s16 s_meta_steal_slot;
 	atomic_t s_num_inodes_stolen;
+	atomic_t s_num_meta_stolen;
 
 	unsigned long s_mount_opt;
 	unsigned int s_atime_quantum;
@@ -334,6 +342,9 @@ struct ocfs2_super
 	 */
 	unsigned int local_alloc_bits;
 	unsigned int local_alloc_default_bits;
+	/* osb_clusters_at_boot can become stale! Do not trust it to
+	 * be up to date. */
+	unsigned int osb_clusters_at_boot;
 
 	enum ocfs2_local_alloc_state local_alloc_state; /* protected
 							 * by osb_lock */
@@ -342,6 +353,11 @@ struct ocfs2_super
 
 	u64 la_last_gd;
 
+	struct ocfs2_reservation_map	osb_la_resmap;
+
+	unsigned int	osb_resv_level;
+	unsigned int	osb_dir_resv_level;
+
 	/* Next three fields are for local node slot recovery during
 	 * mount. */
 	int dirty;
@@ -475,6 +491,13 @@ static inline int ocfs2_supports_indexed_dirs(struct ocfs2_super *osb)
 	return 0;
 }
 
+static inline int ocfs2_supports_discontig_bg(struct ocfs2_super *osb)
+{
+	if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG)
+		return 1;
+	return 0;
+}
+
 static inline unsigned int ocfs2_link_max(struct ocfs2_super *osb)
 {
 	if (ocfs2_supports_indexed_dirs(osb))
@@ -756,35 +779,24 @@ static inline unsigned int ocfs2_megabytes_to_clusters(struct super_block *sb,
 	return megs << (20 - OCFS2_SB(sb)->s_clustersize_bits);
 }
 
-static inline void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb)
+static inline unsigned int ocfs2_clusters_to_megabytes(struct super_block *sb,
+						       unsigned int clusters)
 {
-	spin_lock(&osb->osb_lock);
-	osb->s_inode_steal_slot = OCFS2_INVALID_SLOT;
-	spin_unlock(&osb->osb_lock);
-	atomic_set(&osb->s_num_inodes_stolen, 0);
+	return clusters >> (20 - OCFS2_SB(sb)->s_clustersize_bits);
 }
 
-static inline void ocfs2_set_inode_steal_slot(struct ocfs2_super *osb,
-					      s16 slot)
+static inline void _ocfs2_set_bit(unsigned int bit, unsigned long *bitmap)
 {
-	spin_lock(&osb->osb_lock);
-	osb->s_inode_steal_slot = slot;
-	spin_unlock(&osb->osb_lock);
+	ext2_set_bit(bit, bitmap);
 }
+#define ocfs2_set_bit(bit, addr) _ocfs2_set_bit((bit), (unsigned long *)(addr))
 
-static inline s16 ocfs2_get_inode_steal_slot(struct ocfs2_super *osb)
+static inline void _ocfs2_clear_bit(unsigned int bit, unsigned long *bitmap)
 {
-	s16 slot;
-
-	spin_lock(&osb->osb_lock);
-	slot = osb->s_inode_steal_slot;
-	spin_unlock(&osb->osb_lock);
-
-	return slot;
+	ext2_clear_bit(bit, bitmap);
 }
+#define ocfs2_clear_bit(bit, addr) _ocfs2_clear_bit((bit), (unsigned long *)(addr))
 
-#define ocfs2_set_bit ext2_set_bit
-#define ocfs2_clear_bit ext2_clear_bit
 #define ocfs2_test_bit ext2_test_bit
 #define ocfs2_find_next_zero_bit ext2_find_next_zero_bit
 #define ocfs2_find_next_bit ext2_find_next_bit
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 1a1a679e51b5..33f1c9a8258d 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -100,7 +100,8 @@
 					 | OCFS2_FEATURE_INCOMPAT_XATTR \
 					 | OCFS2_FEATURE_INCOMPAT_META_ECC \
 					 | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS \
-					 | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE)
+					 | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE \
+					 | OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG)
 #define OCFS2_FEATURE_RO_COMPAT_SUPP	(OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \
 					 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \
 					 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
@@ -165,6 +166,9 @@
 /* Refcount tree support */
 #define OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE	0x1000
 
+/* Discontigous block groups */
+#define OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG	0x2000
+
 /*
  * backup superblock flag is used to indicate that this volume
  * has backup superblocks.
@@ -254,63 +258,6 @@
 						 * refcount tree */
 
 /*
- * ioctl commands
- */
-#define OCFS2_IOC_GETFLAGS	_IOR('f', 1, long)
-#define OCFS2_IOC_SETFLAGS	_IOW('f', 2, long)
-#define OCFS2_IOC32_GETFLAGS	_IOR('f', 1, int)
-#define OCFS2_IOC32_SETFLAGS	_IOW('f', 2, int)
-
-/*
- * Space reservation / allocation / free ioctls and argument structure
- * are designed to be compatible with XFS.
- *
- * ALLOCSP* and FREESP* are not and will never be supported, but are
- * included here for completeness.
- */
-struct ocfs2_space_resv {
-	__s16		l_type;
-	__s16		l_whence;
-	__s64		l_start;
-	__s64		l_len;		/* len == 0 means until end of file */
-	__s32		l_sysid;
-	__u32		l_pid;
-	__s32		l_pad[4];	/* reserve area			    */
-};
-
-#define OCFS2_IOC_ALLOCSP		_IOW ('X', 10, struct ocfs2_space_resv)
-#define OCFS2_IOC_FREESP		_IOW ('X', 11, struct ocfs2_space_resv)
-#define OCFS2_IOC_RESVSP		_IOW ('X', 40, struct ocfs2_space_resv)
-#define OCFS2_IOC_UNRESVSP	_IOW ('X', 41, struct ocfs2_space_resv)
-#define OCFS2_IOC_ALLOCSP64	_IOW ('X', 36, struct ocfs2_space_resv)
-#define OCFS2_IOC_FREESP64	_IOW ('X', 37, struct ocfs2_space_resv)
-#define OCFS2_IOC_RESVSP64	_IOW ('X', 42, struct ocfs2_space_resv)
-#define OCFS2_IOC_UNRESVSP64	_IOW ('X', 43, struct ocfs2_space_resv)
-
-/* Used to pass group descriptor data when online resize is done */
-struct ocfs2_new_group_input {
-	__u64 group;		/* Group descriptor's blkno. */
-	__u32 clusters;		/* Total number of clusters in this group */
-	__u32 frees;		/* Total free clusters in this group */
-	__u16 chain;		/* Chain for this group */
-	__u16 reserved1;
-	__u32 reserved2;
-};
-
-#define OCFS2_IOC_GROUP_EXTEND	_IOW('o', 1, int)
-#define OCFS2_IOC_GROUP_ADD	_IOW('o', 2,struct ocfs2_new_group_input)
-#define OCFS2_IOC_GROUP_ADD64	_IOW('o', 3,struct ocfs2_new_group_input)
-
-/* Used to pass 2 file names to reflink. */
-struct reflink_arguments {
-	__u64 old_path;
-	__u64 new_path;
-	__u64 preserve;
-};
-#define OCFS2_IOC_REFLINK	_IOW('o', 4, struct reflink_arguments)
-
-
-/*
  * Journal Flags (ocfs2_dinode.id1.journal1.i_flags)
  */
 #define OCFS2_JOURNAL_DIRTY_FL	(0x00000001)	/* Journal needs recovery */
@@ -340,14 +287,6 @@ struct reflink_arguments {
 #define OCFS2_MIN_JOURNAL_SIZE		(4 * 1024 * 1024)
 
 /*
- * Default local alloc size (in megabytes)
- *
- * The value chosen should be such that most allocations, including new
- * block groups, use local alloc.
- */
-#define OCFS2_DEFAULT_LOCAL_ALLOC_SIZE	8
-
-/*
  * Inline extended attribute size (in bytes)
  * The value chosen should be aligned to 16 byte boundaries.
  */
@@ -569,7 +508,10 @@ struct ocfs2_extent_block
 					   block group */
 	__le32 h_fs_generation;		/* Must match super block */
 	__le64 h_blkno;			/* Offset on disk, in blocks */
-/*20*/	__le64 h_reserved3;
+/*20*/	__le64 h_suballoc_loc;		/* Suballocator block group this
+					   eb belongs to.  Only valid
+					   if allocated from a
+					   discontiguous block group */
 	__le64 h_next_leaf_blk;		/* Offset on disk, in blocks,
 					   of next leaf header pointing
 					   to data */
@@ -736,7 +678,11 @@ struct ocfs2_dinode {
 /*80*/	struct ocfs2_block_check i_check;	/* Error checking */
 /*88*/	__le64 i_dx_root;		/* Pointer to dir index root block */
 /*90*/	__le64 i_refcount_loc;
-	__le64 i_reserved2[4];
+	__le64 i_suballoc_loc;		/* Suballocator block group this
+					   inode belongs to.  Only valid
+					   if allocated from a
+					   discontiguous block group */
+/*A0*/	__le64 i_reserved2[3];
 /*B8*/	union {
 		__le64 i_pad1;		/* Generic way to refer to this
 					   64bit union */
@@ -871,7 +817,12 @@ struct ocfs2_dx_root_block {
 	__le32		dr_reserved2;
 	__le64		dr_free_blk;		/* Pointer to head of free
 						 * unindexed block list. */
-	__le64		dr_reserved3[15];
+	__le64		dr_suballoc_loc;	/* Suballocator block group
+						   this root belongs to.
+						   Only valid if allocated
+						   from a discontiguous
+						   block group */
+	__le64		dr_reserved3[14];
 	union {
 		struct ocfs2_extent_list dr_list; /* Keep this aligned to 128
 						   * bits for maximum space
@@ -897,6 +848,13 @@ struct ocfs2_dx_leaf {
 };
 
 /*
+ * Largest bitmap for a block (suballocator) group in bytes.  This limit
+ * does not affect cluster groups (global allocator).  Cluster group
+ * bitmaps run to the end of the block.
+ */
+#define OCFS2_MAX_BG_BITMAP_SIZE	256
+
+/*
  * On disk allocator group structure for OCFS2
  */
 struct ocfs2_group_desc
@@ -917,7 +875,29 @@ struct ocfs2_group_desc
 	__le64   bg_blkno;               /* Offset on disk, in blocks */
 /*30*/	struct ocfs2_block_check bg_check;	/* Error checking */
 	__le64   bg_reserved2;
-/*40*/	__u8    bg_bitmap[0];
+/*40*/	union {
+		__u8    bg_bitmap[0];
+		struct {
+			/*
+			 * Block groups may be discontiguous when
+			 * OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG is set.
+			 * The extents of a discontigous block group are
+			 * stored in bg_list.  It is a flat list.
+			 * l_tree_depth must always be zero.  A
+			 * discontiguous group is signified by a non-zero
+			 * bg_list->l_next_free_rec.  Only block groups
+			 * can be discontiguous; Cluster groups cannot.
+			 * We've never made a block group with more than
+			 * 2048 blocks (256 bytes of bg_bitmap).  This
+			 * codifies that limit so that we can fit bg_list.
+			 * bg_size of a discontiguous block group will
+			 * be 256 to match bg_bitmap_filler.
+			 */
+			__u8 bg_bitmap_filler[OCFS2_MAX_BG_BITMAP_SIZE];
+/*140*/			struct ocfs2_extent_list bg_list;
+		};
+	};
+/* Actual on-disk size is one block */
 };
 
 struct ocfs2_refcount_rec {
@@ -962,7 +942,11 @@ struct ocfs2_refcount_block {
 /*40*/	__le32 rf_generation;		/* generation number. all be the same
 					 * for the same refcount tree. */
 	__le32 rf_reserved0;
-	__le64 rf_reserved1[7];
+	__le64 rf_suballoc_loc;		/* Suballocator block group this
+					   refcount block belongs to. Only
+					   valid if allocated from a
+					   discontiguous block group */
+/*50*/	__le64 rf_reserved1[6];
 /*80*/	union {
 		struct ocfs2_refcount_list rf_records;  /* List of refcount
 							  records */
@@ -1074,7 +1058,10 @@ struct ocfs2_xattr_block {
 					real xattr or a xattr tree. */
 	__le16	xb_reserved0;
 	__le32  xb_reserved1;
-	__le64	xb_reserved2;
+	__le64	xb_suballoc_loc;	/* Suballocator block group this
+					   xattr block belongs to. Only
+					   valid if allocated from a
+					   discontiguous block group */
 /*30*/	union {
 		struct ocfs2_xattr_header xb_header; /* xattr header if this
 							block contains xattr */
@@ -1311,6 +1298,16 @@ static inline u16 ocfs2_extent_recs_per_eb(struct super_block *sb)
 	return size / sizeof(struct ocfs2_extent_rec);
 }
 
+static inline u16 ocfs2_extent_recs_per_gd(struct super_block *sb)
+{
+	int size;
+
+	size = sb->s_blocksize -
+		offsetof(struct ocfs2_group_desc, bg_list.l_recs);
+
+	return size / sizeof(struct ocfs2_extent_rec);
+}
+
 static inline int ocfs2_dx_entries_per_leaf(struct super_block *sb)
 {
 	int size;
@@ -1341,13 +1338,23 @@ static inline u16 ocfs2_local_alloc_size(struct super_block *sb)
 	return size;
 }
 
-static inline int ocfs2_group_bitmap_size(struct super_block *sb)
+static inline int ocfs2_group_bitmap_size(struct super_block *sb,
+					  int suballocator,
+					  u32 feature_incompat)
 {
-	int size;
-
-	size = sb->s_blocksize -
+	int size = sb->s_blocksize -
 		offsetof(struct ocfs2_group_desc, bg_bitmap);
 
+	/*
+	 * The cluster allocator uses the entire block.  Suballocators have
+	 * never used more than OCFS2_MAX_BG_BITMAP_SIZE.  Unfortunately, older
+	 * code expects bg_size set to the maximum.  Thus we must keep
+	 * bg_size as-is unless discontig_bg is enabled.
+	 */
+	if (suballocator &&
+	    (feature_incompat & OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG))
+		size = OCFS2_MAX_BG_BITMAP_SIZE;
+
 	return size;
 }
 
@@ -1417,9 +1424,16 @@ static inline int ocfs2_fast_symlink_chars(int blocksize)
 	return blocksize - offsetof(struct ocfs2_dinode, id2.i_symlink);
 }
 
-static inline int ocfs2_max_inline_data(int blocksize)
+static inline int ocfs2_max_inline_data_with_xattr(int blocksize,
+						   struct ocfs2_dinode *di)
 {
-	return blocksize - offsetof(struct ocfs2_dinode, id2.i_data.id_data);
+	if (di && (di->i_dyn_features & OCFS2_INLINE_XATTR_FL))
+		return blocksize -
+			offsetof(struct ocfs2_dinode, id2.i_data.id_data) -
+			di->i_xattr_inline_size;
+	else
+		return blocksize -
+			offsetof(struct ocfs2_dinode, id2.i_data.id_data);
 }
 
 static inline int ocfs2_extent_recs_per_inode(int blocksize)
@@ -1452,23 +1466,43 @@ static inline int ocfs2_extent_recs_per_eb(int blocksize)
 	return size / sizeof(struct ocfs2_extent_rec);
 }
 
-static inline int ocfs2_local_alloc_size(int blocksize)
+static inline int ocfs2_extent_recs_per_gd(int blocksize)
 {
 	int size;
 
 	size = blocksize -
-		offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap);
+		offsetof(struct ocfs2_group_desc, bg_list.l_recs);
 
-	return size;
+	return size / sizeof(struct ocfs2_extent_rec);
 }
 
-static inline int ocfs2_group_bitmap_size(int blocksize)
+static inline int ocfs2_local_alloc_size(int blocksize)
 {
 	int size;
 
 	size = blocksize -
+		offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap);
+
+	return size;
+}
+
+static inline int ocfs2_group_bitmap_size(int blocksize,
+					  int suballocator,
+					  uint32_t feature_incompat)
+{
+	int size = sb->s_blocksize -
 		offsetof(struct ocfs2_group_desc, bg_bitmap);
 
+	/*
+	 * The cluster allocator uses the entire block.  Suballocators have
+	 * never used more than OCFS2_MAX_BG_BITMAP_SIZE.  Unfortunately, older
+	 * code expects bg_size set to the maximum.  Thus we must keep
+	 * bg_size as-is unless discontig_bg is enabled.
+	 */
+	if (suballocator &&
+	    (feature_incompat & OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG))
+		size = OCFS2_MAX_BG_BITMAP_SIZE;
+
 	return size;
 }
 
@@ -1541,5 +1575,19 @@ static inline void ocfs2_set_de_type(struct ocfs2_dir_entry *de,
 	de->file_type = ocfs2_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
 }
 
+static inline int ocfs2_gd_is_discontig(struct ocfs2_group_desc *gd)
+{
+	if ((offsetof(struct ocfs2_group_desc, bg_bitmap) +
+	     le16_to_cpu(gd->bg_size)) !=
+	    offsetof(struct ocfs2_group_desc, bg_list))
+		return 0;
+	/*
+	 * Only valid to check l_next_free_rec if
+	 * bg_bitmap + bg_size == bg_list.
+	 */
+	if (!gd->bg_list.l_next_free_rec)
+		return 0;
+	return 1;
+}
 #endif  /* _OCFS2_FS_H */
 
diff --git a/fs/ocfs2/ocfs2_ioctl.h b/fs/ocfs2/ocfs2_ioctl.h
new file mode 100644
index 000000000000..2d3420af1a83
--- /dev/null
+++ b/fs/ocfs2/ocfs2_ioctl.h
@@ -0,0 +1,79 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * ocfs2_ioctl.h
+ *
+ * Defines OCFS2 ioctls.
+ *
+ * Copyright (C) 2010 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License, version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#ifndef OCFS2_IOCTL_H
+#define OCFS2_IOCTL_H
+
+/*
+ * ioctl commands
+ */
+#define OCFS2_IOC_GETFLAGS	_IOR('f', 1, long)
+#define OCFS2_IOC_SETFLAGS	_IOW('f', 2, long)
+#define OCFS2_IOC32_GETFLAGS	_IOR('f', 1, int)
+#define OCFS2_IOC32_SETFLAGS	_IOW('f', 2, int)
+
+/*
+ * Space reservation / allocation / free ioctls and argument structure
+ * are designed to be compatible with XFS.
+ *
+ * ALLOCSP* and FREESP* are not and will never be supported, but are
+ * included here for completeness.
+ */
+struct ocfs2_space_resv {
+	__s16		l_type;
+	__s16		l_whence;
+	__s64		l_start;
+	__s64		l_len;		/* len == 0 means until end of file */
+	__s32		l_sysid;
+	__u32		l_pid;
+	__s32		l_pad[4];	/* reserve area			    */
+};
+
+#define OCFS2_IOC_ALLOCSP		_IOW ('X', 10, struct ocfs2_space_resv)
+#define OCFS2_IOC_FREESP		_IOW ('X', 11, struct ocfs2_space_resv)
+#define OCFS2_IOC_RESVSP		_IOW ('X', 40, struct ocfs2_space_resv)
+#define OCFS2_IOC_UNRESVSP	_IOW ('X', 41, struct ocfs2_space_resv)
+#define OCFS2_IOC_ALLOCSP64	_IOW ('X', 36, struct ocfs2_space_resv)
+#define OCFS2_IOC_FREESP64	_IOW ('X', 37, struct ocfs2_space_resv)
+#define OCFS2_IOC_RESVSP64	_IOW ('X', 42, struct ocfs2_space_resv)
+#define OCFS2_IOC_UNRESVSP64	_IOW ('X', 43, struct ocfs2_space_resv)
+
+/* Used to pass group descriptor data when online resize is done */
+struct ocfs2_new_group_input {
+	__u64 group;		/* Group descriptor's blkno. */
+	__u32 clusters;		/* Total number of clusters in this group */
+	__u32 frees;		/* Total free clusters in this group */
+	__u16 chain;		/* Chain for this group */
+	__u16 reserved1;
+	__u32 reserved2;
+};
+
+#define OCFS2_IOC_GROUP_EXTEND	_IOW('o', 1, int)
+#define OCFS2_IOC_GROUP_ADD	_IOW('o', 2,struct ocfs2_new_group_input)
+#define OCFS2_IOC_GROUP_ADD64	_IOW('o', 3,struct ocfs2_new_group_input)
+
+/* Used to pass 2 file names to reflink. */
+struct reflink_arguments {
+	__u64 old_path;
+	__u64 new_path;
+	__u64 preserve;
+};
+#define OCFS2_IOC_REFLINK	_IOW('o', 4, struct reflink_arguments)
+
+#endif /* OCFS2_IOCTL_H */
diff --git a/fs/ocfs2/ocfs2_lockingver.h b/fs/ocfs2/ocfs2_lockingver.h
index 82d5eeac0fff..2e45c8d2ea7e 100644
--- a/fs/ocfs2/ocfs2_lockingver.h
+++ b/fs/ocfs2/ocfs2_lockingver.h
@@ -23,6 +23,8 @@
 /*
  * The protocol version for ocfs2 cluster locking.  See dlmglue.c for
  * more details.
+ *
+ * 1.0 - Initial locking version from ocfs2 1.4.
  */
 #define OCFS2_LOCKING_PROTOCOL_MAJOR 1
 #define OCFS2_LOCKING_PROTOCOL_MINOR 0
diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h
index 123bc520a2c0..196fcb52d95d 100644
--- a/fs/ocfs2/quota.h
+++ b/fs/ocfs2/quota.h
@@ -23,6 +23,7 @@
 struct ocfs2_dquot {
 	struct dquot dq_dquot;	/* Generic VFS dquot */
 	loff_t dq_local_off;	/* Offset in the local quota file */
+	u64 dq_local_phys_blk;	/* Physical block carrying quota structure */
 	struct ocfs2_quota_chunk *dq_chunk;	/* Chunk dquot is in */
 	unsigned int dq_use_count;	/* Number of nodes having reference to this entry in global quota file */
 	s64 dq_origspace;	/* Last globally synced space usage */
@@ -51,8 +52,9 @@ struct ocfs2_mem_dqinfo {
 	struct ocfs2_lock_res dqi_gqlock;	/* Lock protecting quota information structure */
 	struct buffer_head *dqi_gqi_bh;	/* Buffer head with global quota file inode - set only if inode lock is obtained */
 	int dqi_gqi_count;		/* Number of holders of dqi_gqi_bh */
+	u64 dqi_giblk;			/* Number of block with global information header */
 	struct buffer_head *dqi_lqi_bh;	/* Buffer head with local quota file inode */
-	struct buffer_head *dqi_ibh;	/* Buffer with information header */
+	struct buffer_head *dqi_libh;	/* Buffer with local information header */
 	struct qtree_mem_dqinfo dqi_gi;	/* Info about global file */
 	struct delayed_work dqi_sync_work;	/* Work for syncing dquots */
 	struct ocfs2_quota_recovery *dqi_rec;	/* Pointer to recovery
@@ -102,8 +104,12 @@ static inline int ocfs2_global_release_dquot(struct dquot *dquot)
 
 int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex);
 void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex);
-int ocfs2_read_quota_block(struct inode *inode, u64 v_block,
-			   struct buffer_head **bh);
+int ocfs2_validate_quota_block(struct super_block *sb, struct buffer_head *bh);
+int ocfs2_read_quota_phys_block(struct inode *inode, u64 p_block,
+				struct buffer_head **bh);
+int ocfs2_create_local_dquot(struct dquot *dquot);
+int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot);
+int ocfs2_local_write_dquot(struct dquot *dquot);
 
 extern const struct dquot_operations ocfs2_quota_operations;
 extern struct quota_format_type ocfs2_quota_format;
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index b437dc0c4cad..4607923eb24c 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -3,6 +3,7 @@
  */
 #include <linux/spinlock.h>
 #include <linux/fs.h>
+#include <linux/slab.h>
 #include <linux/quota.h>
 #include <linux/quotaops.h>
 #include <linux/dqblk_qtree.h>
@@ -24,8 +25,44 @@
 #include "dlmglue.h"
 #include "uptodate.h"
 #include "super.h"
+#include "buffer_head_io.h"
 #include "quota.h"
 
+/*
+ * Locking of quotas with OCFS2 is rather complex. Here are rules that
+ * should be obeyed by all the functions:
+ * - any write of quota structure (either to local or global file) is protected
+ *   by dqio_mutex or dquot->dq_lock.
+ * - any modification of global quota file holds inode cluster lock, i_mutex,
+ *   and ip_alloc_sem of the global quota file (achieved by
+ *   ocfs2_lock_global_qf). It also has to hold qinfo_lock.
+ * - an allocation of new blocks for local quota file is protected by
+ *   its ip_alloc_sem
+ *
+ * A rough sketch of locking dependencies (lf = local file, gf = global file):
+ * Normal filesystem operation:
+ *   start_trans -> dqio_mutex -> write to lf
+ * Syncing of local and global file:
+ *   ocfs2_lock_global_qf -> start_trans -> dqio_mutex -> qinfo_lock ->
+ *     write to gf
+ *						       -> write to lf
+ * Acquire dquot for the first time:
+ *   dq_lock -> ocfs2_lock_global_qf -> qinfo_lock -> read from gf
+ *				     -> alloc space for gf
+ *				     -> start_trans -> qinfo_lock -> write to gf
+ *	     -> ip_alloc_sem of lf -> alloc space for lf
+ *	     -> write to lf
+ * Release last reference to dquot:
+ *   dq_lock -> ocfs2_lock_global_qf -> start_trans -> qinfo_lock -> write to gf
+ *	     -> write to lf
+ * Note that all the above operations also hold the inode cluster lock of lf.
+ * Recovery:
+ *   inode cluster lock of recovered lf
+ *     -> read bitmaps -> ip_alloc_sem of lf
+ *     -> ocfs2_lock_global_qf -> start_trans -> dqio_mutex -> qinfo_lock ->
+ *        write to gf
+ */
+
 static struct workqueue_struct *ocfs2_quota_wq = NULL;
 
 static void qsync_work_fn(struct work_struct *work);
@@ -90,8 +127,7 @@ struct qtree_fmt_operations ocfs2_global_ops = {
 	.is_id = ocfs2_global_is_id,
 };
 
-static int ocfs2_validate_quota_block(struct super_block *sb,
-				      struct buffer_head *bh)
+int ocfs2_validate_quota_block(struct super_block *sb, struct buffer_head *bh)
 {
 	struct ocfs2_disk_dqtrailer *dqt =
 		ocfs2_block_dqtrailer(sb->s_blocksize, bh->b_data);
@@ -109,54 +145,19 @@ static int ocfs2_validate_quota_block(struct super_block *sb,
 	return ocfs2_validate_meta_ecc(sb, bh->b_data, &dqt->dq_check);
 }
 
-int ocfs2_read_quota_block(struct inode *inode, u64 v_block,
-			   struct buffer_head **bh)
+int ocfs2_read_quota_phys_block(struct inode *inode, u64 p_block,
+				struct buffer_head **bhp)
 {
-	int rc = 0;
-	struct buffer_head *tmp = *bh;
-
-	if (i_size_read(inode) >> inode->i_sb->s_blocksize_bits <= v_block) {
-		ocfs2_error(inode->i_sb,
-			    "Quota file %llu is probably corrupted! Requested "
-			    "to read block %Lu but file has size only %Lu\n",
-			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
-			    (unsigned long long)v_block,
-			    (unsigned long long)i_size_read(inode));
-		return -EIO;
-	}
-	rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, 0,
-				    ocfs2_validate_quota_block);
+	int rc;
+
+	*bhp = NULL;
+	rc = ocfs2_read_blocks(INODE_CACHE(inode), p_block, 1, bhp, 0,
+			       ocfs2_validate_quota_block);
 	if (rc)
 		mlog_errno(rc);
-
-	/* If ocfs2_read_virt_blocks() got us a new bh, pass it up. */
-	if (!rc && !*bh)
-		*bh = tmp;
-
 	return rc;
 }
 
-static int ocfs2_get_quota_block(struct inode *inode, int block,
-				 struct buffer_head **bh)
-{
-	u64 pblock, pcount;
-	int err;
-
-	down_read(&OCFS2_I(inode)->ip_alloc_sem);
-	err = ocfs2_extent_map_get_blocks(inode, block, &pblock, &pcount, NULL);
-	up_read(&OCFS2_I(inode)->ip_alloc_sem);
-	if (err) {
-		mlog_errno(err);
-		return err;
-	}
-	*bh = sb_getblk(inode->i_sb, pblock);
-	if (!*bh) {
-		err = -EIO;
-		mlog_errno(err);
-	}
-	return err;
-}
-
 /* Read data from global quotafile - avoid pagecache and such because we cannot
  * afford acquiring the locks... We use quota cluster lock to serialize
  * operations. Caller is responsible for acquiring it. */
@@ -171,6 +172,7 @@ ssize_t ocfs2_quota_read(struct super_block *sb, int type, char *data,
 	int err = 0;
 	struct buffer_head *bh;
 	size_t toread, tocopy;
+	u64 pblock = 0, pcount = 0;
 
 	if (off > i_size)
 		return 0;
@@ -179,8 +181,19 @@ ssize_t ocfs2_quota_read(struct super_block *sb, int type, char *data,
 	toread = len;
 	while (toread > 0) {
 		tocopy = min_t(size_t, (sb->s_blocksize - offset), toread);
+		if (!pcount) {
+			err = ocfs2_extent_map_get_blocks(gqinode, blk, &pblock,
+							  &pcount, NULL);
+			if (err) {
+				mlog_errno(err);
+				return err;
+			}
+		} else {
+			pcount--;
+			pblock++;
+		}
 		bh = NULL;
-		err = ocfs2_read_quota_block(gqinode, blk, &bh);
+		err = ocfs2_read_quota_phys_block(gqinode, pblock, &bh);
 		if (err) {
 			mlog_errno(err);
 			return err;
@@ -208,6 +221,7 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,
 	int err = 0, new = 0, ja_type;
 	struct buffer_head *bh = NULL;
 	handle_t *handle = journal_current_handle();
+	u64 pblock, pcount;
 
 	if (!handle) {
 		mlog(ML_ERROR, "Quota write (off=%llu, len=%llu) cancelled "
@@ -220,12 +234,11 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,
 		len = sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE - offset;
 	}
 
-	mutex_lock_nested(&gqinode->i_mutex, I_MUTEX_QUOTA);
 	if (gqinode->i_size < off + len) {
 		loff_t rounded_end =
 				ocfs2_align_bytes_to_blocks(sb, off + len);
 
-		/* Space is already allocated in ocfs2_global_read_dquot() */
+		/* Space is already allocated in ocfs2_acquire_dquot() */
 		err = ocfs2_simple_size_update(gqinode,
 					       oinfo->dqi_gqi_bh,
 					       rounded_end);
@@ -233,13 +246,20 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,
 			goto out;
 		new = 1;
 	}
+	err = ocfs2_extent_map_get_blocks(gqinode, blk, &pblock, &pcount, NULL);
+	if (err) {
+		mlog_errno(err);
+		goto out;
+	}
 	/* Not rewriting whole block? */
 	if ((offset || len < sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE) &&
 	    !new) {
-		err = ocfs2_read_quota_block(gqinode, blk, &bh);
+		err = ocfs2_read_quota_phys_block(gqinode, pblock, &bh);
 		ja_type = OCFS2_JOURNAL_ACCESS_WRITE;
 	} else {
-		err = ocfs2_get_quota_block(gqinode, blk, &bh);
+		bh = sb_getblk(sb, pblock);
+		if (!bh)
+			err = -ENOMEM;
 		ja_type = OCFS2_JOURNAL_ACCESS_CREATE;
 	}
 	if (err) {
@@ -260,19 +280,15 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,
 		brelse(bh);
 		goto out;
 	}
-	err = ocfs2_journal_dirty(handle, bh);
+	ocfs2_journal_dirty(handle, bh);
 	brelse(bh);
-	if (err < 0)
-		goto out;
 out:
 	if (err) {
-		mutex_unlock(&gqinode->i_mutex);
 		mlog_errno(err);
 		return err;
 	}
 	gqinode->i_version++;
 	ocfs2_mark_inode_dirty(handle, gqinode, oinfo->dqi_gqi_bh);
-	mutex_unlock(&gqinode->i_mutex);
 	return len;
 }
 
@@ -290,11 +306,23 @@ int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex)
 	else
 		WARN_ON(bh != oinfo->dqi_gqi_bh);
 	spin_unlock(&dq_data_lock);
+	if (ex) {
+		mutex_lock(&oinfo->dqi_gqinode->i_mutex);
+		down_write(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem);
+	} else {
+		down_read(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem);
+	}
 	return 0;
 }
 
 void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex)
 {
+	if (ex) {
+		up_write(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem);
+		mutex_unlock(&oinfo->dqi_gqinode->i_mutex);
+	} else {
+		up_read(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem);
+	}
 	ocfs2_inode_unlock(oinfo->dqi_gqinode, ex);
 	brelse(oinfo->dqi_gqi_bh);
 	spin_lock(&dq_data_lock);
@@ -312,6 +340,7 @@ int ocfs2_global_read_info(struct super_block *sb, int type)
 	struct ocfs2_global_disk_dqinfo dinfo;
 	struct mem_dqinfo *info = sb_dqinfo(sb, type);
 	struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
+	u64 pcount;
 	int status;
 
 	mlog_entry_void();
@@ -338,9 +367,19 @@ int ocfs2_global_read_info(struct super_block *sb, int type)
 		mlog_errno(status);
 		goto out_err;
 	}
+
+	status = ocfs2_extent_map_get_blocks(gqinode, 0, &oinfo->dqi_giblk,
+					     &pcount, NULL);
+	if (status < 0)
+		goto out_unlock;
+
+	status = ocfs2_qinfo_lock(oinfo, 0);
+	if (status < 0)
+		goto out_unlock;
 	status = sb->s_op->quota_read(sb, type, (char *)&dinfo,
 				      sizeof(struct ocfs2_global_disk_dqinfo),
 				      OCFS2_GLOBAL_INFO_OFF);
+	ocfs2_qinfo_unlock(oinfo, 0);
 	ocfs2_unlock_global_qf(oinfo, 0);
 	if (status != sizeof(struct ocfs2_global_disk_dqinfo)) {
 		mlog(ML_ERROR, "Cannot read global quota info (%d).\n",
@@ -367,6 +406,10 @@ int ocfs2_global_read_info(struct super_block *sb, int type)
 out_err:
 	mlog_exit(status);
 	return status;
+out_unlock:
+	ocfs2_unlock_global_qf(oinfo, 0);
+	mlog_errno(status);
+	goto out_err;
 }
 
 /* Write information to global quota file. Expects exlusive lock on quota
@@ -425,78 +468,10 @@ static int ocfs2_global_qinit_alloc(struct super_block *sb, int type)
 
 static int ocfs2_calc_global_qinit_credits(struct super_block *sb, int type)
 {
-	/* We modify all the allocated blocks, tree root, and info block */
+	/* We modify all the allocated blocks, tree root, info block and
+	 * the inode */
 	return (ocfs2_global_qinit_alloc(sb, type) + 2) *
-			OCFS2_QUOTA_BLOCK_UPDATE_CREDITS;
-}
-
-/* Read in information from global quota file and acquire a reference to it.
- * dquot_acquire() has already started the transaction and locked quota file */
-int ocfs2_global_read_dquot(struct dquot *dquot)
-{
-	int err, err2, ex = 0;
-	struct super_block *sb = dquot->dq_sb;
-	int type = dquot->dq_type;
-	struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv;
-	struct ocfs2_super *osb = OCFS2_SB(sb);
-	struct inode *gqinode = info->dqi_gqinode;
-	int need_alloc = ocfs2_global_qinit_alloc(sb, type);
-	handle_t *handle = NULL;
-
-	err = ocfs2_qinfo_lock(info, 0);
-	if (err < 0)
-		goto out;
-	err = qtree_read_dquot(&info->dqi_gi, dquot);
-	if (err < 0)
-		goto out_qlock;
-	OCFS2_DQUOT(dquot)->dq_use_count++;
-	OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace;
-	OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes;
-	ocfs2_qinfo_unlock(info, 0);
-
-	if (!dquot->dq_off) {	/* No real quota entry? */
-		ex = 1;
-		/*
-		 * Add blocks to quota file before we start a transaction since
-		 * locking allocators ranks above a transaction start
-		 */
-		WARN_ON(journal_current_handle());
-		down_write(&OCFS2_I(gqinode)->ip_alloc_sem);
-		err = ocfs2_extend_no_holes(gqinode,
-			gqinode->i_size + (need_alloc << sb->s_blocksize_bits),
-			gqinode->i_size);
-		up_write(&OCFS2_I(gqinode)->ip_alloc_sem);
-		if (err < 0)
-			goto out;
-	}
-
-	handle = ocfs2_start_trans(osb,
-				   ocfs2_calc_global_qinit_credits(sb, type));
-	if (IS_ERR(handle)) {
-		err = PTR_ERR(handle);
-		goto out;
-	}
-	err = ocfs2_qinfo_lock(info, ex);
-	if (err < 0)
-		goto out_trans;
-	err = qtree_write_dquot(&info->dqi_gi, dquot);
-	if (ex && info_dirty(sb_dqinfo(dquot->dq_sb, dquot->dq_type))) {
-		err2 = __ocfs2_global_write_info(dquot->dq_sb, dquot->dq_type);
-		if (!err)
-			err = err2;
-	}
-out_qlock:
-	if (ex)
-		ocfs2_qinfo_unlock(info, 1);
-	else
-		ocfs2_qinfo_unlock(info, 0);
-out_trans:
-	if (handle)
-		ocfs2_commit_trans(osb, handle);
-out:
-	if (err < 0)
-		mlog_errno(err);
-	return err;
+			OCFS2_QUOTA_BLOCK_UPDATE_CREDITS + 1;
 }
 
 /* Sync local information about quota modifications with global quota file.
@@ -637,14 +612,13 @@ static int ocfs2_sync_dquot_helper(struct dquot *dquot, unsigned long type)
 	}
 	mutex_lock(&sb_dqopt(sb)->dqio_mutex);
 	status = ocfs2_sync_dquot(dquot);
-	mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
 	if (status < 0)
 		mlog_errno(status);
 	/* We have to write local structure as well... */
-	dquot_mark_dquot_dirty(dquot);
-	status = dquot_commit(dquot);
+	status = ocfs2_local_write_dquot(dquot);
 	if (status < 0)
 		mlog_errno(status);
+	mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
 	ocfs2_commit_trans(osb, handle);
 out_ilock:
 	ocfs2_unlock_global_qf(oinfo, 1);
@@ -683,7 +657,9 @@ static int ocfs2_write_dquot(struct dquot *dquot)
 		mlog_errno(status);
 		goto out;
 	}
-	status = dquot_commit(dquot);
+	mutex_lock(&sb_dqopt(dquot->dq_sb)->dqio_mutex);
+	status = ocfs2_local_write_dquot(dquot);
+	mutex_unlock(&sb_dqopt(dquot->dq_sb)->dqio_mutex);
 	ocfs2_commit_trans(osb, handle);
 out:
 	mlog_exit(status);
@@ -714,6 +690,10 @@ static int ocfs2_release_dquot(struct dquot *dquot)
 
 	mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type);
 
+	mutex_lock(&dquot->dq_lock);
+	/* Check whether we are not racing with some other dqget() */
+	if (atomic_read(&dquot->dq_count) > 1)
+		goto out;
 	status = ocfs2_lock_global_qf(oinfo, 1);
 	if (status < 0)
 		goto out;
@@ -724,30 +704,113 @@ static int ocfs2_release_dquot(struct dquot *dquot)
 		mlog_errno(status);
 		goto out_ilock;
 	}
-	status = dquot_release(dquot);
+
+	status = ocfs2_global_release_dquot(dquot);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_trans;
+	}
+	status = ocfs2_local_release_dquot(handle, dquot);
+	/*
+	 * If we fail here, we cannot do much as global structure is
+	 * already released. So just complain...
+	 */
+	if (status < 0)
+		mlog_errno(status);
+	clear_bit(DQ_ACTIVE_B, &dquot->dq_flags);
+out_trans:
 	ocfs2_commit_trans(osb, handle);
 out_ilock:
 	ocfs2_unlock_global_qf(oinfo, 1);
 out:
+	mutex_unlock(&dquot->dq_lock);
 	mlog_exit(status);
 	return status;
 }
 
+/*
+ * Read global dquot structure from disk or create it if it does
+ * not exist. Also update use count of the global structure and
+ * create structure in node-local quota file.
+ */
 static int ocfs2_acquire_dquot(struct dquot *dquot)
 {
-	struct ocfs2_mem_dqinfo *oinfo =
-			sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
-	int status = 0;
+	int status = 0, err;
+	int ex = 0;
+	struct super_block *sb = dquot->dq_sb;
+	struct ocfs2_super *osb = OCFS2_SB(sb);
+	int type = dquot->dq_type;
+	struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv;
+	struct inode *gqinode = info->dqi_gqinode;
+	int need_alloc = ocfs2_global_qinit_alloc(sb, type);
+	handle_t *handle;
 
-	mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type);
-	/* We need an exclusive lock, because we're going to update use count
-	 * and instantiate possibly new dquot structure */
-	status = ocfs2_lock_global_qf(oinfo, 1);
+	mlog_entry("id=%u, type=%d", dquot->dq_id, type);
+	mutex_lock(&dquot->dq_lock);
+	/*
+	 * We need an exclusive lock, because we're going to update use count
+	 * and instantiate possibly new dquot structure
+	 */
+	status = ocfs2_lock_global_qf(info, 1);
 	if (status < 0)
 		goto out;
-	status = dquot_acquire(dquot);
-	ocfs2_unlock_global_qf(oinfo, 1);
+	if (!test_bit(DQ_READ_B, &dquot->dq_flags)) {
+		status = ocfs2_qinfo_lock(info, 0);
+		if (status < 0)
+			goto out_dq;
+		status = qtree_read_dquot(&info->dqi_gi, dquot);
+		ocfs2_qinfo_unlock(info, 0);
+		if (status < 0)
+			goto out_dq;
+	}
+	set_bit(DQ_READ_B, &dquot->dq_flags);
+
+	OCFS2_DQUOT(dquot)->dq_use_count++;
+	OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace;
+	OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes;
+	if (!dquot->dq_off) {	/* No real quota entry? */
+		ex = 1;
+		/*
+		 * Add blocks to quota file before we start a transaction since
+		 * locking allocators ranks above a transaction start
+		 */
+		WARN_ON(journal_current_handle());
+		status = ocfs2_extend_no_holes(gqinode, NULL,
+			gqinode->i_size + (need_alloc << sb->s_blocksize_bits),
+			gqinode->i_size);
+		if (status < 0)
+			goto out_dq;
+	}
+
+	handle = ocfs2_start_trans(osb,
+				   ocfs2_calc_global_qinit_credits(sb, type));
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		goto out_dq;
+	}
+	status = ocfs2_qinfo_lock(info, ex);
+	if (status < 0)
+		goto out_trans;
+	status = qtree_write_dquot(&info->dqi_gi, dquot);
+	if (ex && info_dirty(sb_dqinfo(sb, type))) {
+		err = __ocfs2_global_write_info(sb, type);
+		if (!status)
+			status = err;
+	}
+	ocfs2_qinfo_unlock(info, ex);
+out_trans:
+	ocfs2_commit_trans(osb, handle);
+out_dq:
+	ocfs2_unlock_global_qf(info, 1);
+	if (status < 0)
+		goto out;
+
+	status = ocfs2_create_local_dquot(dquot);
+	if (status < 0)
+		goto out;
+	set_bit(DQ_ACTIVE_B, &dquot->dq_flags);
 out:
+	mutex_unlock(&dquot->dq_lock);
 	mlog_exit(status);
 	return status;
 }
@@ -769,7 +832,6 @@ static int ocfs2_mark_dquot_dirty(struct dquot *dquot)
 	struct ocfs2_super *osb = OCFS2_SB(sb);
 
 	mlog_entry("id=%u, type=%d", dquot->dq_id, type);
-	dquot_mark_dquot_dirty(dquot);
 
 	/* In case user set some limits, sync dquot immediately to global
 	 * quota file so that information propagates quicker */
@@ -792,14 +854,16 @@ static int ocfs2_mark_dquot_dirty(struct dquot *dquot)
 		mlog_errno(status);
 		goto out_ilock;
 	}
+	mutex_lock(&sb_dqopt(sb)->dqio_mutex);
 	status = ocfs2_sync_dquot(dquot);
 	if (status < 0) {
 		mlog_errno(status);
-		goto out_trans;
+		goto out_dlock;
 	}
 	/* Now write updated local dquot structure */
-	status = dquot_commit(dquot);
-out_trans:
+	status = ocfs2_local_write_dquot(dquot);
+out_dlock:
+	mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
 	ocfs2_commit_trans(osb, handle);
 out_ilock:
 	ocfs2_unlock_global_qf(oinfo, 1);
@@ -851,14 +915,7 @@ static void ocfs2_destroy_dquot(struct dquot *dquot)
 }
 
 const struct dquot_operations ocfs2_quota_operations = {
-	.initialize	= dquot_initialize,
-	.drop		= dquot_drop,
-	.alloc_space	= dquot_alloc_space,
-	.alloc_inode	= dquot_alloc_inode,
-	.free_space	= dquot_free_space,
-	.free_inode	= dquot_free_inode,
-	.transfer	= dquot_transfer,
-	.write_dquot	= ocfs2_write_dquot,
+	/* We never make dquot dirty so .write_dquot is never called */
 	.acquire_dquot	= ocfs2_acquire_dquot,
 	.release_dquot	= ocfs2_release_dquot,
 	.mark_dirty	= ocfs2_mark_dquot_dirty,
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index 21f9e71223ca..dc78764ccc4c 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -3,6 +3,7 @@
  */
 
 #include <linux/fs.h>
+#include <linux/slab.h>
 #include <linux/quota.h>
 #include <linux/quotaops.h>
 #include <linux/module.h>
@@ -21,6 +22,7 @@
 #include "dlmglue.h"
 #include "quota.h"
 #include "uptodate.h"
+#include "super.h"
 
 /* Number of local quota structures per block */
 static inline unsigned int ol_quota_entries_per_block(struct super_block *sb)
@@ -118,12 +120,8 @@ static int ocfs2_modify_bh(struct inode *inode, struct buffer_head *bh,
 	lock_buffer(bh);
 	modify(bh, private);
 	unlock_buffer(bh);
-	status = ocfs2_journal_dirty(handle, bh);
-	if (status < 0) {
-		mlog_errno(status);
-		ocfs2_commit_trans(OCFS2_SB(sb), handle);
-		return status;
-	}
+	ocfs2_journal_dirty(handle, bh);
+
 	status = ocfs2_commit_trans(OCFS2_SB(sb), handle);
 	if (status < 0) {
 		mlog_errno(status);
@@ -132,6 +130,39 @@ static int ocfs2_modify_bh(struct inode *inode, struct buffer_head *bh,
 	return 0;
 }
 
+/*
+ * Read quota block from a given logical offset.
+ *
+ * This function acquires ip_alloc_sem and thus it must not be called with a
+ * transaction started.
+ */
+static int ocfs2_read_quota_block(struct inode *inode, u64 v_block,
+				  struct buffer_head **bh)
+{
+	int rc = 0;
+	struct buffer_head *tmp = *bh;
+
+	if (i_size_read(inode) >> inode->i_sb->s_blocksize_bits <= v_block) {
+		ocfs2_error(inode->i_sb,
+			    "Quota file %llu is probably corrupted! Requested "
+			    "to read block %Lu but file has size only %Lu\n",
+			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
+			    (unsigned long long)v_block,
+			    (unsigned long long)i_size_read(inode));
+		return -EIO;
+	}
+	rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, 0,
+				    ocfs2_validate_quota_block);
+	if (rc)
+		mlog_errno(rc);
+
+	/* If ocfs2_read_virt_blocks() got us a new bh, pass it up. */
+	if (!rc && !*bh)
+		*bh = tmp;
+
+	return rc;
+}
+
 /* Check whether we understand format of quota files */
 static int ocfs2_local_check_quota_file(struct super_block *sb, int type)
 {
@@ -457,7 +488,7 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
 			break;
 		}
 		dchunk = (struct ocfs2_local_disk_chunk *)hbh->b_data;
-		for_each_bit(bit, rchunk->rc_bitmap, ol_chunk_entries(sb)) {
+		for_each_set_bit(bit, rchunk->rc_bitmap, ol_chunk_entries(sb)) {
 			qbh = NULL;
 			status = ocfs2_read_quota_block(lqinode,
 						ol_dqblk_block(sb, chunk, bit),
@@ -522,9 +553,7 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
 			ocfs2_clear_bit(bit, dchunk->dqc_bitmap);
 			le32_add_cpu(&dchunk->dqc_free, 1);
 			unlock_buffer(qbh);
-			status = ocfs2_journal_dirty(handle, qbh);
-			if (status < 0)
-				mlog_errno(status);
+			ocfs2_journal_dirty(handle, qbh);
 out_commit:
 			mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
 			ocfs2_commit_trans(OCFS2_SB(sb), handle);
@@ -630,9 +659,7 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
 		lock_buffer(bh);
 		ldinfo->dqi_flags = cpu_to_le32(flags | OLQF_CLEAN);
 		unlock_buffer(bh);
-		status = ocfs2_journal_dirty(handle, bh);
-		if (status < 0)
-			mlog_errno(status);
+		ocfs2_journal_dirty(handle, bh);
 out_trans:
 		ocfs2_commit_trans(osb, handle);
 out_bh:
@@ -678,7 +705,7 @@ static int ocfs2_local_read_info(struct super_block *sb, int type)
 	INIT_LIST_HEAD(&oinfo->dqi_chunk);
 	oinfo->dqi_rec = NULL;
 	oinfo->dqi_lqi_bh = NULL;
-	oinfo->dqi_ibh = NULL;
+	oinfo->dqi_libh = NULL;
 
 	status = ocfs2_global_read_info(sb, type);
 	if (status < 0)
@@ -704,7 +731,7 @@ static int ocfs2_local_read_info(struct super_block *sb, int type)
 	info->dqi_flags = le32_to_cpu(ldinfo->dqi_flags);
 	oinfo->dqi_chunks = le32_to_cpu(ldinfo->dqi_chunks);
 	oinfo->dqi_blocks = le32_to_cpu(ldinfo->dqi_blocks);
-	oinfo->dqi_ibh = bh;
+	oinfo->dqi_libh = bh;
 
 	/* We crashed when using local quota file? */
 	if (!(info->dqi_flags & OLQF_CLEAN)) {
@@ -766,7 +793,7 @@ static int ocfs2_local_write_info(struct super_block *sb, int type)
 {
 	struct mem_dqinfo *info = sb_dqinfo(sb, type);
 	struct buffer_head *bh = ((struct ocfs2_mem_dqinfo *)info->dqi_priv)
-						->dqi_ibh;
+						->dqi_libh;
 	int status;
 
 	status = ocfs2_modify_bh(sb_dqopt(sb)->files[type], bh, olq_update_info,
@@ -789,10 +816,6 @@ static int ocfs2_local_free_info(struct super_block *sb, int type)
 	int mark_clean = 1, len;
 	int status;
 
-	/* At this point we know there are no more dquots and thus
-	 * even if there's some sync in the pdflush queue, it won't
-	 * find any dquots and return without doing anything */
-	cancel_delayed_work_sync(&oinfo->dqi_sync_work);
 	iput(oinfo->dqi_gqinode);
 	ocfs2_simple_drop_lockres(OCFS2_SB(sb), &oinfo->dqi_gqlock);
 	ocfs2_lock_res_free(&oinfo->dqi_gqlock);
@@ -827,7 +850,7 @@ static int ocfs2_local_free_info(struct super_block *sb, int type)
 	/* Mark local file as clean */
 	info->dqi_flags |= OLQF_CLEAN;
 	status = ocfs2_modify_bh(sb_dqopt(sb)->files[type],
-				 oinfo->dqi_ibh,
+				 oinfo->dqi_libh,
 				 olq_update_info,
 				 info);
 	if (status < 0) {
@@ -837,7 +860,7 @@ static int ocfs2_local_free_info(struct super_block *sb, int type)
 
 out:
 	ocfs2_inode_unlock(sb_dqopt(sb)->files[type], 1);
-	brelse(oinfo->dqi_ibh);
+	brelse(oinfo->dqi_libh);
 	brelse(oinfo->dqi_lqi_bh);
 	kfree(oinfo);
 	return 0;
@@ -865,22 +888,21 @@ static void olq_set_dquot(struct buffer_head *bh, void *private)
 }
 
 /* Write dquot to local quota file */
-static int ocfs2_local_write_dquot(struct dquot *dquot)
+int ocfs2_local_write_dquot(struct dquot *dquot)
 {
 	struct super_block *sb = dquot->dq_sb;
 	struct ocfs2_dquot *od = OCFS2_DQUOT(dquot);
-	struct buffer_head *bh = NULL;
+	struct buffer_head *bh;
+	struct inode *lqinode = sb_dqopt(sb)->files[dquot->dq_type];
 	int status;
 
-	status = ocfs2_read_quota_block(sb_dqopt(sb)->files[dquot->dq_type],
-				    ol_dqblk_file_block(sb, od->dq_local_off),
-				    &bh);
+	status = ocfs2_read_quota_phys_block(lqinode, od->dq_local_phys_blk,
+					     &bh);
 	if (status) {
 		mlog_errno(status);
 		goto out;
 	}
-	status = ocfs2_modify_bh(sb_dqopt(sb)->files[dquot->dq_type], bh,
-				 olq_set_dquot, od);
+	status = ocfs2_modify_bh(lqinode, bh, olq_set_dquot, od);
 	if (status < 0) {
 		mlog_errno(status);
 		goto out;
@@ -949,7 +971,7 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
 	u64 p_blkno;
 
 	/* We are protected by dqio_sem so no locking needed */
-	status = ocfs2_extend_no_holes(lqinode,
+	status = ocfs2_extend_no_holes(lqinode, NULL,
 				       lqinode->i_size + 2 * sb->s_blocksize,
 				       lqinode->i_size);
 	if (status < 0) {
@@ -980,10 +1002,8 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
 	}
 
 	/* Initialize chunk header */
-	down_read(&OCFS2_I(lqinode)->ip_alloc_sem);
 	status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks,
 					     &p_blkno, NULL, NULL);
-	up_read(&OCFS2_I(lqinode)->ip_alloc_sem);
 	if (status < 0) {
 		mlog_errno(status);
 		goto out_trans;
@@ -1008,17 +1028,11 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
 	       sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) -
 	       OCFS2_QBLK_RESERVED_SPACE);
 	unlock_buffer(bh);
-	status = ocfs2_journal_dirty(handle, bh);
-	if (status < 0) {
-		mlog_errno(status);
-		goto out_trans;
-	}
+	ocfs2_journal_dirty(handle, bh);
 
 	/* Initialize new block with structures */
-	down_read(&OCFS2_I(lqinode)->ip_alloc_sem);
 	status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks + 1,
 					     &p_blkno, NULL, NULL);
-	up_read(&OCFS2_I(lqinode)->ip_alloc_sem);
 	if (status < 0) {
 		mlog_errno(status);
 		goto out_trans;
@@ -1039,11 +1053,7 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
 	lock_buffer(dbh);
 	memset(dbh->b_data, 0, sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE);
 	unlock_buffer(dbh);
-	status = ocfs2_journal_dirty(handle, dbh);
-	if (status < 0) {
-		mlog_errno(status);
-		goto out_trans;
-	}
+	ocfs2_journal_dirty(handle, dbh);
 
 	/* Update local quotafile info */
 	oinfo->dqi_blocks += 2;
@@ -1104,7 +1114,7 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
 		return ocfs2_local_quota_add_chunk(sb, type, offset);
 
 	/* We are protected by dqio_sem so no locking needed */
-	status = ocfs2_extend_no_holes(lqinode,
+	status = ocfs2_extend_no_holes(lqinode, NULL,
 				       lqinode->i_size + sb->s_blocksize,
 				       lqinode->i_size);
 	if (status < 0) {
@@ -1119,10 +1129,8 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
 	}
 
 	/* Get buffer from the just added block */
-	down_read(&OCFS2_I(lqinode)->ip_alloc_sem);
 	status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks,
 					     &p_blkno, NULL, NULL);
-	up_read(&OCFS2_I(lqinode)->ip_alloc_sem);
 	if (status < 0) {
 		mlog_errno(status);
 		goto out;
@@ -1154,11 +1162,8 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
 	lock_buffer(bh);
 	memset(bh->b_data, 0, sb->s_blocksize);
 	unlock_buffer(bh);
-	status = ocfs2_journal_dirty(handle, bh);
-	if (status < 0) {
-		mlog_errno(status);
-		goto out_trans;
-	}
+	ocfs2_journal_dirty(handle, bh);
+
 	/* Update chunk header */
 	status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode),
 					 chunk->qc_headerbh,
@@ -1172,11 +1177,8 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
 	lock_buffer(chunk->qc_headerbh);
 	le32_add_cpu(&dchunk->dqc_free, ol_quota_entries_per_block(sb));
 	unlock_buffer(chunk->qc_headerbh);
-	status = ocfs2_journal_dirty(handle, chunk->qc_headerbh);
-	if (status < 0) {
-		mlog_errno(status);
-		goto out_trans;
-	}
+	ocfs2_journal_dirty(handle, chunk->qc_headerbh);
+
 	/* Update file header */
 	oinfo->dqi_blocks++;
 	status = ocfs2_local_write_info(sb, type);
@@ -1209,7 +1211,7 @@ static void olq_alloc_dquot(struct buffer_head *bh, void *private)
 }
 
 /* Create dquot in the local file for given id */
-static int ocfs2_create_local_dquot(struct dquot *dquot)
+int ocfs2_create_local_dquot(struct dquot *dquot)
 {
 	struct super_block *sb = dquot->dq_sb;
 	int type = dquot->dq_type;
@@ -1218,17 +1220,27 @@ static int ocfs2_create_local_dquot(struct dquot *dquot)
 	struct ocfs2_dquot *od = OCFS2_DQUOT(dquot);
 	int offset;
 	int status;
+	u64 pcount;
 
+	down_write(&OCFS2_I(lqinode)->ip_alloc_sem);
 	chunk = ocfs2_find_free_entry(sb, type, &offset);
 	if (!chunk) {
 		chunk = ocfs2_extend_local_quota_file(sb, type, &offset);
-		if (IS_ERR(chunk))
-			return PTR_ERR(chunk);
+		if (IS_ERR(chunk)) {
+			status = PTR_ERR(chunk);
+			goto out;
+		}
 	} else if (IS_ERR(chunk)) {
-		return PTR_ERR(chunk);
+		status = PTR_ERR(chunk);
+		goto out;
 	}
 	od->dq_local_off = ol_dqblk_off(sb, chunk->qc_num, offset);
 	od->dq_chunk = chunk;
+	status = ocfs2_extent_map_get_blocks(lqinode,
+				     ol_dqblk_block(sb, chunk->qc_num, offset),
+				     &od->dq_local_phys_blk,
+				     &pcount,
+				     NULL);
 
 	/* Initialize dquot structure on disk */
 	status = ocfs2_local_write_dquot(dquot);
@@ -1245,39 +1257,15 @@ static int ocfs2_create_local_dquot(struct dquot *dquot)
 		goto out;
 	}
 out:
+	up_write(&OCFS2_I(lqinode)->ip_alloc_sem);
 	return status;
 }
 
-/* Create entry in local file for dquot, load data from the global file */
-static int ocfs2_local_read_dquot(struct dquot *dquot)
-{
-	int status;
-
-	mlog_entry("id=%u, type=%d\n", dquot->dq_id, dquot->dq_type);
-
-	status = ocfs2_global_read_dquot(dquot);
-	if (status < 0) {
-		mlog_errno(status);
-		goto out_err;
-	}
-
-	/* Now create entry in the local quota file */
-	status = ocfs2_create_local_dquot(dquot);
-	if (status < 0) {
-		mlog_errno(status);
-		goto out_err;
-	}
-	mlog_exit(0);
-	return 0;
-out_err:
-	mlog_exit(status);
-	return status;
-}
-
-/* Release dquot structure from local quota file. ocfs2_release_dquot() has
- * already started a transaction and obtained exclusive lock for global
- * quota file. */
-static int ocfs2_local_release_dquot(struct dquot *dquot)
+/*
+ * Release dquot structure from local quota file. ocfs2_release_dquot() has
+ * already started a transaction and written all changes to global quota file
+ */
+int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot)
 {
 	int status;
 	int type = dquot->dq_type;
@@ -1285,15 +1273,6 @@ static int ocfs2_local_release_dquot(struct dquot *dquot)
 	struct super_block *sb = dquot->dq_sb;
 	struct ocfs2_local_disk_chunk *dchunk;
 	int offset;
-	handle_t *handle = journal_current_handle();
-
-	BUG_ON(!handle);
-	/* First write all local changes to global file */
-	status = ocfs2_global_release_dquot(dquot);
-	if (status < 0) {
-		mlog_errno(status);
-		goto out;
-	}
 
 	status = ocfs2_journal_access_dq(handle,
 			INODE_CACHE(sb_dqopt(sb)->files[type]),
@@ -1311,12 +1290,8 @@ static int ocfs2_local_release_dquot(struct dquot *dquot)
 	ocfs2_clear_bit(offset, dchunk->dqc_bitmap);
 	le32_add_cpu(&dchunk->dqc_free, 1);
 	unlock_buffer(od->dq_chunk->qc_headerbh);
-	status = ocfs2_journal_dirty(handle, od->dq_chunk->qc_headerbh);
-	if (status < 0) {
-		mlog_errno(status);
-		goto out;
-	}
-	status = 0;
+	ocfs2_journal_dirty(handle, od->dq_chunk->qc_headerbh);
+
 out:
 	/* Clear the read bit so that next time someone uses this
 	 * dquot he reads fresh info from disk and allocates local
@@ -1330,9 +1305,6 @@ static const struct quota_format_ops ocfs2_format_ops = {
 	.read_file_info		= ocfs2_local_read_info,
 	.write_file_info	= ocfs2_global_write_info,
 	.free_file_info		= ocfs2_local_free_info,
-	.read_dqblk		= ocfs2_local_read_dquot,
-	.commit_dqblk		= ocfs2_local_write_dquot,
-	.release_dqblk		= ocfs2_local_release_dquot,
 };
 
 struct quota_format_type ocfs2_quota_format = {
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 74db2be75dd6..3ac5aa733e9c 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -37,7 +37,6 @@
 
 #include <linux/bio.h>
 #include <linux/blkdev.h>
-#include <linux/gfp.h>
 #include <linux/slab.h>
 #include <linux/writeback.h>
 #include <linux/pagevec.h>
@@ -571,7 +570,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode,
 	struct ocfs2_refcount_tree *new_tree = NULL, *tree = NULL;
 	u16 suballoc_bit_start;
 	u32 num_got;
-	u64 first_blkno;
+	u64 suballoc_loc, first_blkno;
 
 	BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL);
 
@@ -597,7 +596,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode,
 		goto out_commit;
 	}
 
-	ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1,
+	ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
 				   &suballoc_bit_start, &num_got,
 				   &first_blkno);
 	if (ret) {
@@ -626,7 +625,8 @@ static int ocfs2_create_refcount_tree(struct inode *inode,
 	rb = (struct ocfs2_refcount_block *)new_bh->b_data;
 	memset(rb, 0, inode->i_sb->s_blocksize);
 	strcpy((void *)rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
-	rb->rf_suballoc_slot = cpu_to_le16(osb->slot_num);
+	rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
+	rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc);
 	rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
 	rb->rf_fs_generation = cpu_to_le32(osb->fs_generation);
 	rb->rf_blkno = cpu_to_le64(first_blkno);
@@ -791,7 +791,10 @@ int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh)
 	if (le32_to_cpu(rb->rf_count) == 1) {
 		blk = le64_to_cpu(rb->rf_blkno);
 		bit = le16_to_cpu(rb->rf_suballoc_bit);
-		bg_blkno = ocfs2_which_suballoc_group(blk, bit);
+		if (rb->rf_suballoc_loc)
+			bg_blkno = le64_to_cpu(rb->rf_suballoc_loc);
+		else
+			bg_blkno = ocfs2_which_suballoc_group(blk, bit);
 
 		alloc_inode = ocfs2_get_system_file_inode(osb,
 					EXTENT_ALLOC_SYSTEM_INODE,
@@ -1269,9 +1272,7 @@ static int ocfs2_change_refcount_rec(handle_t *handle,
 	} else if (merge)
 		ocfs2_refcount_rec_merge(rb, index);
 
-	ret = ocfs2_journal_dirty(handle, ref_leaf_bh);
-	if (ret)
-		mlog_errno(ret);
+	ocfs2_journal_dirty(handle, ref_leaf_bh);
 out:
 	return ret;
 }
@@ -1285,7 +1286,7 @@ static int ocfs2_expand_inline_ref_root(handle_t *handle,
 	int ret;
 	u16 suballoc_bit_start;
 	u32 num_got;
-	u64 blkno;
+	u64 suballoc_loc, blkno;
 	struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
 	struct buffer_head *new_bh = NULL;
 	struct ocfs2_refcount_block *new_rb;
@@ -1299,7 +1300,7 @@ static int ocfs2_expand_inline_ref_root(handle_t *handle,
 		goto out;
 	}
 
-	ret = ocfs2_claim_metadata(OCFS2_SB(sb), handle, meta_ac, 1,
+	ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
 				   &suballoc_bit_start, &num_got,
 				   &blkno);
 	if (ret) {
@@ -1330,7 +1331,8 @@ static int ocfs2_expand_inline_ref_root(handle_t *handle,
 	memcpy(new_bh->b_data, ref_root_bh->b_data, sb->s_blocksize);
 
 	new_rb = (struct ocfs2_refcount_block *)new_bh->b_data;
-	new_rb->rf_suballoc_slot = cpu_to_le16(OCFS2_SB(sb)->slot_num);
+	new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
+	new_rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc);
 	new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
 	new_rb->rf_blkno = cpu_to_le64(blkno);
 	new_rb->rf_cpos = cpu_to_le32(0);
@@ -1525,7 +1527,7 @@ static int ocfs2_new_leaf_refcount_block(handle_t *handle,
 	int ret;
 	u16 suballoc_bit_start;
 	u32 num_got, new_cpos;
-	u64 blkno;
+	u64 suballoc_loc, blkno;
 	struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
 	struct ocfs2_refcount_block *root_rb =
 			(struct ocfs2_refcount_block *)ref_root_bh->b_data;
@@ -1549,7 +1551,7 @@ static int ocfs2_new_leaf_refcount_block(handle_t *handle,
 		goto out;
 	}
 
-	ret = ocfs2_claim_metadata(OCFS2_SB(sb), handle, meta_ac, 1,
+	ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
 				   &suballoc_bit_start, &num_got,
 				   &blkno);
 	if (ret) {
@@ -1576,7 +1578,8 @@ static int ocfs2_new_leaf_refcount_block(handle_t *handle,
 	new_rb = (struct ocfs2_refcount_block *)new_bh->b_data;
 	memset(new_rb, 0, sb->s_blocksize);
 	strcpy((void *)new_rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
-	new_rb->rf_suballoc_slot = cpu_to_le16(OCFS2_SB(sb)->slot_num);
+	new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
+	new_rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc);
 	new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
 	new_rb->rf_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
 	new_rb->rf_blkno = cpu_to_le64(blkno);
@@ -1695,7 +1698,7 @@ static int ocfs2_adjust_refcount_rec(handle_t *handle,
 	 * 2 more credits, one for the leaf refcount block, one for
 	 * the extent block contains the extent rec.
 	 */
-	ret = ocfs2_extend_trans(handle, handle->h_buffer_credits + 2);
+	ret = ocfs2_extend_trans(handle, 2);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out;
@@ -1803,11 +1806,7 @@ static int ocfs2_insert_refcount_rec(handle_t *handle,
 	if (merge)
 		ocfs2_refcount_rec_merge(rb, index);
 
-	ret = ocfs2_journal_dirty(handle, ref_leaf_bh);
-	if (ret) {
-		mlog_errno(ret);
-		goto out;
-	}
+	ocfs2_journal_dirty(handle, ref_leaf_bh);
 
 	if (index == 0) {
 		ret = ocfs2_adjust_refcount_rec(handle, ci,
@@ -1978,9 +1977,7 @@ static int ocfs2_split_refcount_rec(handle_t *handle,
 			ocfs2_refcount_rec_merge(rb, index);
 	}
 
-	ret = ocfs2_journal_dirty(handle, ref_leaf_bh);
-	if (ret)
-		mlog_errno(ret);
+	ocfs2_journal_dirty(handle, ref_leaf_bh);
 
 out:
 	brelse(new_bh);
@@ -2113,6 +2110,7 @@ static int ocfs2_remove_refcount_extent(handle_t *handle,
 	 */
 	ret = ocfs2_cache_block_dealloc(dealloc, EXTENT_ALLOC_SYSTEM_INODE,
 					le16_to_cpu(rb->rf_suballoc_slot),
+					le64_to_cpu(rb->rf_suballoc_loc),
 					le64_to_cpu(rb->rf_blkno),
 					le16_to_cpu(rb->rf_suballoc_bit));
 	if (ret) {
@@ -2517,20 +2515,19 @@ out:
  *
  * Normally the refcount blocks store these refcount should be
  * contiguous also, so that we can get the number easily.
- * As for meta_ac, we will at most add split 2 refcount record and
- * 2 more refcount block, so just check it in a rough way.
+ * We will at most add split 2 refcount records and 2 more
+ * refcount blocks, so just check it in a rough way.
  *
  * Caller must hold refcount tree lock.
  */
 int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
-					  struct buffer_head *di_bh,
+					  u64 refcount_loc,
 					  u64 phys_blkno,
 					  u32 clusters,
 					  int *credits,
-					  struct ocfs2_alloc_context **meta_ac)
+					  int *ref_blocks)
 {
-	int ret, ref_blocks = 0;
-	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	int ret;
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 	struct buffer_head *ref_root_bh = NULL;
 	struct ocfs2_refcount_tree *tree;
@@ -2547,14 +2544,13 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
 	BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
 
 	ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb),
-				      le64_to_cpu(di->i_refcount_loc), &tree);
+				      refcount_loc, &tree);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
 	}
 
-	ret = ocfs2_read_refcount_block(&tree->rf_ci,
-					le64_to_cpu(di->i_refcount_loc),
+	ret = ocfs2_read_refcount_block(&tree->rf_ci, refcount_loc,
 					&ref_root_bh);
 	if (ret) {
 		mlog_errno(ret);
@@ -2565,21 +2561,14 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
 					       &tree->rf_ci,
 					       ref_root_bh,
 					       start_cpos, clusters,
-					       &ref_blocks, credits);
+					       ref_blocks, credits);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
 	}
 
-	mlog(0, "reserve new metadata %d, credits = %d\n",
-	     ref_blocks, *credits);
-
-	if (ref_blocks) {
-		ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb),
-							ref_blocks, meta_ac);
-		if (ret)
-			mlog_errno(ret);
-	}
+	mlog(0, "reserve new metadata %d blocks, credits = %d\n",
+	     *ref_blocks, *credits);
 
 out:
 	brelse(ref_root_bh);
@@ -2942,10 +2931,16 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
 
 	offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
 	end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits);
+	/*
+	 * We only duplicate pages until we reach the page contains i_size - 1.
+	 * So trim 'end' to i_size.
+	 */
+	if (end > i_size_read(context->inode))
+		end = i_size_read(context->inode);
 
 	while (offset < end) {
 		page_index = offset >> PAGE_CACHE_SHIFT;
-		map_end = (page_index + 1) << PAGE_CACHE_SHIFT;
+		map_end = ((loff_t)page_index + 1) << PAGE_CACHE_SHIFT;
 		if (map_end > end)
 			map_end = end;
 
@@ -2957,8 +2952,12 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
 
 		page = grab_cache_page(mapping, page_index);
 
-		/* This page can't be dirtied before we CoW it out. */
-		BUG_ON(PageDirty(page));
+		/*
+		 * In case PAGE_CACHE_SIZE <= CLUSTER_SIZE, This page
+		 * can't be dirtied before we CoW it out.
+		 */
+		if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize)
+			BUG_ON(PageDirty(page));
 
 		if (!PageUptodate(page)) {
 			ret = block_read_full_page(page, ocfs2_get_block);
@@ -3037,11 +3036,7 @@ static int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
 		}
 
 		memcpy(new_bh->b_data, old_bh->b_data, sb->s_blocksize);
-		ret = ocfs2_journal_dirty(handle, new_bh);
-		if (ret) {
-			mlog_errno(ret);
-			break;
-		}
+		ocfs2_journal_dirty(handle, new_bh);
 
 		brelse(new_bh);
 		brelse(old_bh);
@@ -3170,7 +3165,7 @@ static int ocfs2_cow_sync_writeback(struct super_block *sb,
 
 	while (offset < end) {
 		page_index = offset >> PAGE_CACHE_SHIFT;
-		map_end = (page_index + 1) << PAGE_CACHE_SHIFT;
+		map_end = ((loff_t)page_index + 1) << PAGE_CACHE_SHIFT;
 		if (map_end > end)
 			map_end = end;
 
@@ -3279,7 +3274,7 @@ static int ocfs2_make_clusters_writable(struct super_block *sb,
 		} else {
 			delete = 1;
 
-			ret = __ocfs2_claim_clusters(osb, handle,
+			ret = __ocfs2_claim_clusters(handle,
 						     context->data_ac,
 						     1, set_len,
 						     &new_bit, &new_len);
@@ -4071,6 +4066,7 @@ static int ocfs2_complete_reflink(struct inode *s_inode,
 	OCFS2_I(t_inode)->ip_dyn_features = OCFS2_I(s_inode)->ip_dyn_features;
 	spin_unlock(&OCFS2_I(t_inode)->ip_lock);
 	i_size_write(t_inode, size);
+	t_inode->i_blocks = s_inode->i_blocks;
 
 	di->i_xattr_inline_size = s_di->i_xattr_inline_size;
 	di->i_clusters = s_di->i_clusters;
@@ -4079,6 +4075,9 @@ static int ocfs2_complete_reflink(struct inode *s_inode,
 	di->i_attr = s_di->i_attr;
 
 	if (preserve) {
+		t_inode->i_uid = s_inode->i_uid;
+		t_inode->i_gid = s_inode->i_gid;
+		t_inode->i_mode = s_inode->i_mode;
 		di->i_uid = s_di->i_uid;
 		di->i_gid = s_di->i_gid;
 		di->i_mode = s_di->i_mode;
@@ -4173,6 +4172,12 @@ static int __ocfs2_reflink(struct dentry *old_dentry,
 	struct inode *inode = old_dentry->d_inode;
 	struct buffer_head *new_bh = NULL;
 
+	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE) {
+		ret = -EINVAL;
+		mlog_errno(ret);
+		goto out;
+	}
+
 	ret = filemap_fdatawrite(inode->i_mapping);
 	if (ret) {
 		mlog_errno(ret);
@@ -4386,7 +4391,7 @@ static int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir,
 	}
 
 	mutex_lock(&inode->i_mutex);
-	vfs_dq_init(dir);
+	dquot_initialize(dir);
 	error = ocfs2_reflink(old_dentry, dir, new_dentry, preserve);
 	mutex_unlock(&inode->i_mutex);
 	if (!error)
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index c1d19b1d3ecc..9983ba1570e2 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -47,11 +47,11 @@ int ocfs2_decrease_refcount(struct inode *inode,
 			    struct ocfs2_cached_dealloc_ctxt *dealloc,
 			    int delete);
 int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
-					  struct buffer_head *di_bh,
+					  u64 refcount_loc,
 					  u64 phys_blkno,
 					  u32 clusters,
 					  int *credits,
-					  struct ocfs2_alloc_context **meta_ac);
+					  int *ref_blocks);
 int ocfs2_refcount_cow(struct inode *inode, struct buffer_head *di_bh,
 		       u32 cpos, u32 write_len, u32 max_cpos);
 
diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c
new file mode 100644
index 000000000000..d8b6e4259b80
--- /dev/null
+++ b/fs/ocfs2/reservations.c
@@ -0,0 +1,846 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * reservations.c
+ *
+ * Allocation reservations implementation
+ *
+ * Some code borrowed from fs/ext3/balloc.c and is:
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ * The rest is copyright (C) 2010 Novell.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/highmem.h>
+#include <linux/bitops.h>
+#include <linux/list.h>
+
+#define MLOG_MASK_PREFIX ML_RESERVATIONS
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#ifdef CONFIG_OCFS2_DEBUG_FS
+#define OCFS2_CHECK_RESERVATIONS
+#endif
+
+DEFINE_SPINLOCK(resv_lock);
+
+#define	OCFS2_MIN_RESV_WINDOW_BITS	8
+#define	OCFS2_MAX_RESV_WINDOW_BITS	1024
+
+int ocfs2_dir_resv_allowed(struct ocfs2_super *osb)
+{
+	return (osb->osb_resv_level && osb->osb_dir_resv_level);
+}
+
+static unsigned int ocfs2_resv_window_bits(struct ocfs2_reservation_map *resmap,
+					   struct ocfs2_alloc_reservation *resv)
+{
+	struct ocfs2_super *osb = resmap->m_osb;
+	unsigned int bits;
+
+	if (!(resv->r_flags & OCFS2_RESV_FLAG_DIR)) {
+		/* 8, 16, 32, 64, 128, 256, 512, 1024 */
+		bits = 4 << osb->osb_resv_level;
+	} else {
+		bits = 4 << osb->osb_dir_resv_level;
+	}
+	return bits;
+}
+
+static inline unsigned int ocfs2_resv_end(struct ocfs2_alloc_reservation *resv)
+{
+	if (resv->r_len)
+		return resv->r_start + resv->r_len - 1;
+	return resv->r_start;
+}
+
+static inline int ocfs2_resv_empty(struct ocfs2_alloc_reservation *resv)
+{
+	return !!(resv->r_len == 0);
+}
+
+static inline int ocfs2_resmap_disabled(struct ocfs2_reservation_map *resmap)
+{
+	if (resmap->m_osb->osb_resv_level == 0)
+		return 1;
+	return 0;
+}
+
+static void ocfs2_dump_resv(struct ocfs2_reservation_map *resmap)
+{
+	struct ocfs2_super *osb = resmap->m_osb;
+	struct rb_node *node;
+	struct ocfs2_alloc_reservation *resv;
+	int i = 0;
+
+	mlog(ML_NOTICE, "Dumping resmap for device %s. Bitmap length: %u\n",
+	     osb->dev_str, resmap->m_bitmap_len);
+
+	node = rb_first(&resmap->m_reservations);
+	while (node) {
+		resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node);
+
+		mlog(ML_NOTICE, "start: %u\tend: %u\tlen: %u\tlast_start: %u"
+		     "\tlast_len: %u\n", resv->r_start,
+		     ocfs2_resv_end(resv), resv->r_len, resv->r_last_start,
+		     resv->r_last_len);
+
+		node = rb_next(node);
+		i++;
+	}
+
+	mlog(ML_NOTICE, "%d reservations found. LRU follows\n", i);
+
+	i = 0;
+	list_for_each_entry(resv, &resmap->m_lru, r_lru) {
+		mlog(ML_NOTICE, "LRU(%d) start: %u\tend: %u\tlen: %u\t"
+		     "last_start: %u\tlast_len: %u\n", i, resv->r_start,
+		     ocfs2_resv_end(resv), resv->r_len, resv->r_last_start,
+		     resv->r_last_len);
+
+		i++;
+	}
+}
+
+#ifdef OCFS2_CHECK_RESERVATIONS
+static int ocfs2_validate_resmap_bits(struct ocfs2_reservation_map *resmap,
+				      int i,
+				      struct ocfs2_alloc_reservation *resv)
+{
+	char *disk_bitmap = resmap->m_disk_bitmap;
+	unsigned int start = resv->r_start;
+	unsigned int end = ocfs2_resv_end(resv);
+
+	while (start <= end) {
+		if (ocfs2_test_bit(start, disk_bitmap)) {
+			mlog(ML_ERROR,
+			     "reservation %d covers an allocated area "
+			     "starting at bit %u!\n", i, start);
+			return 1;
+		}
+
+		start++;
+	}
+	return 0;
+}
+
+static void ocfs2_check_resmap(struct ocfs2_reservation_map *resmap)
+{
+	unsigned int off = 0;
+	int i = 0;
+	struct rb_node *node;
+	struct ocfs2_alloc_reservation *resv;
+
+	node = rb_first(&resmap->m_reservations);
+	while (node) {
+		resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node);
+
+		if (i > 0 && resv->r_start <= off) {
+			mlog(ML_ERROR, "reservation %d has bad start off!\n",
+			     i);
+			goto bad;
+		}
+
+		if (resv->r_len == 0) {
+			mlog(ML_ERROR, "reservation %d has no length!\n",
+			     i);
+			goto bad;
+		}
+
+		if (resv->r_start > ocfs2_resv_end(resv)) {
+			mlog(ML_ERROR, "reservation %d has invalid range!\n",
+			     i);
+			goto bad;
+		}
+
+		if (ocfs2_resv_end(resv) >= resmap->m_bitmap_len) {
+			mlog(ML_ERROR, "reservation %d extends past bitmap!\n",
+			     i);
+			goto bad;
+		}
+
+		if (ocfs2_validate_resmap_bits(resmap, i, resv))
+			goto bad;
+
+		off = ocfs2_resv_end(resv);
+		node = rb_next(node);
+
+		i++;
+	}
+	return;
+
+bad:
+	ocfs2_dump_resv(resmap);
+	BUG();
+}
+#else
+static inline void ocfs2_check_resmap(struct ocfs2_reservation_map *resmap)
+{
+
+}
+#endif
+
+void ocfs2_resv_init_once(struct ocfs2_alloc_reservation *resv)
+{
+	memset(resv, 0, sizeof(*resv));
+	INIT_LIST_HEAD(&resv->r_lru);
+}
+
+void ocfs2_resv_set_type(struct ocfs2_alloc_reservation *resv,
+			 unsigned int flags)
+{
+	BUG_ON(flags & ~OCFS2_RESV_TYPES);
+
+	resv->r_flags |= flags;
+}
+
+int ocfs2_resmap_init(struct ocfs2_super *osb,
+		      struct ocfs2_reservation_map *resmap)
+{
+	memset(resmap, 0, sizeof(*resmap));
+
+	resmap->m_osb = osb;
+	resmap->m_reservations = RB_ROOT;
+	/* m_bitmap_len is initialized to zero by the above memset. */
+	INIT_LIST_HEAD(&resmap->m_lru);
+
+	return 0;
+}
+
+static void ocfs2_resv_mark_lru(struct ocfs2_reservation_map *resmap,
+				struct ocfs2_alloc_reservation *resv)
+{
+	assert_spin_locked(&resv_lock);
+
+	if (!list_empty(&resv->r_lru))
+		list_del_init(&resv->r_lru);
+
+	list_add_tail(&resv->r_lru, &resmap->m_lru);
+}
+
+static void __ocfs2_resv_trunc(struct ocfs2_alloc_reservation *resv)
+{
+	resv->r_len = 0;
+	resv->r_start = 0;
+}
+
+static void ocfs2_resv_remove(struct ocfs2_reservation_map *resmap,
+			      struct ocfs2_alloc_reservation *resv)
+{
+	if (resv->r_flags & OCFS2_RESV_FLAG_INUSE) {
+		list_del_init(&resv->r_lru);
+		rb_erase(&resv->r_node, &resmap->m_reservations);
+		resv->r_flags &= ~OCFS2_RESV_FLAG_INUSE;
+	}
+}
+
+static void __ocfs2_resv_discard(struct ocfs2_reservation_map *resmap,
+				 struct ocfs2_alloc_reservation *resv)
+{
+	assert_spin_locked(&resv_lock);
+
+	__ocfs2_resv_trunc(resv);
+	/*
+	 * last_len and last_start no longer make sense if
+	 * we're changing the range of our allocations.
+	 */
+	resv->r_last_len = resv->r_last_start = 0;
+
+	ocfs2_resv_remove(resmap, resv);
+}
+
+/* does nothing if 'resv' is null */
+void ocfs2_resv_discard(struct ocfs2_reservation_map *resmap,
+			struct ocfs2_alloc_reservation *resv)
+{
+	if (resv) {
+		spin_lock(&resv_lock);
+		__ocfs2_resv_discard(resmap, resv);
+		spin_unlock(&resv_lock);
+	}
+}
+
+static void ocfs2_resmap_clear_all_resv(struct ocfs2_reservation_map *resmap)
+{
+	struct rb_node *node;
+	struct ocfs2_alloc_reservation *resv;
+
+	assert_spin_locked(&resv_lock);
+
+	while ((node = rb_last(&resmap->m_reservations)) != NULL) {
+		resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node);
+
+		__ocfs2_resv_discard(resmap, resv);
+	}
+}
+
+void ocfs2_resmap_restart(struct ocfs2_reservation_map *resmap,
+			  unsigned int clen, char *disk_bitmap)
+{
+	if (ocfs2_resmap_disabled(resmap))
+		return;
+
+	spin_lock(&resv_lock);
+
+	ocfs2_resmap_clear_all_resv(resmap);
+	resmap->m_bitmap_len = clen;
+	resmap->m_disk_bitmap = disk_bitmap;
+
+	spin_unlock(&resv_lock);
+}
+
+void ocfs2_resmap_uninit(struct ocfs2_reservation_map *resmap)
+{
+	/* Does nothing for now. Keep this around for API symmetry */
+}
+
+static void ocfs2_resv_insert(struct ocfs2_reservation_map *resmap,
+			      struct ocfs2_alloc_reservation *new)
+{
+	struct rb_root *root = &resmap->m_reservations;
+	struct rb_node *parent = NULL;
+	struct rb_node **p = &root->rb_node;
+	struct ocfs2_alloc_reservation *tmp;
+
+	assert_spin_locked(&resv_lock);
+
+	mlog(0, "Insert reservation start: %u len: %u\n", new->r_start,
+	     new->r_len);
+
+	while (*p) {
+		parent = *p;
+
+		tmp = rb_entry(parent, struct ocfs2_alloc_reservation, r_node);
+
+		if (new->r_start < tmp->r_start) {
+			p = &(*p)->rb_left;
+
+			/*
+			 * This is a good place to check for
+			 * overlapping reservations.
+			 */
+			BUG_ON(ocfs2_resv_end(new) >= tmp->r_start);
+		} else if (new->r_start > ocfs2_resv_end(tmp)) {
+			p = &(*p)->rb_right;
+		} else {
+			/* This should never happen! */
+			mlog(ML_ERROR, "Duplicate reservation window!\n");
+			BUG();
+		}
+	}
+
+	rb_link_node(&new->r_node, parent, p);
+	rb_insert_color(&new->r_node, root);
+	new->r_flags |= OCFS2_RESV_FLAG_INUSE;
+
+	ocfs2_resv_mark_lru(resmap, new);
+
+	ocfs2_check_resmap(resmap);
+}
+
+/**
+ * ocfs2_find_resv_lhs() - find the window which contains goal
+ * @resmap: reservation map to search
+ * @goal: which bit to search for
+ *
+ * If a window containing that goal is not found, we return the window
+ * which comes before goal. Returns NULL on empty rbtree or no window
+ * before goal.
+ */
+static struct ocfs2_alloc_reservation *
+ocfs2_find_resv_lhs(struct ocfs2_reservation_map *resmap, unsigned int goal)
+{
+	struct ocfs2_alloc_reservation *resv = NULL;
+	struct ocfs2_alloc_reservation *prev_resv = NULL;
+	struct rb_node *node = resmap->m_reservations.rb_node;
+
+	assert_spin_locked(&resv_lock);
+
+	if (!node)
+		return NULL;
+
+	node = rb_first(&resmap->m_reservations);
+	while (node) {
+		resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node);
+
+		if (resv->r_start <= goal && ocfs2_resv_end(resv) >= goal)
+			break;
+
+		/* Check if we overshot the reservation just before goal? */
+		if (resv->r_start > goal) {
+			resv = prev_resv;
+			break;
+		}
+
+		prev_resv = resv;
+		node = rb_next(node);
+	}
+
+	return resv;
+}
+
+/*
+ * We are given a range within the bitmap, which corresponds to a gap
+ * inside the reservations tree (search_start, search_len). The range
+ * can be anything from the whole bitmap, to a gap between
+ * reservations.
+ *
+ * The start value of *rstart is insignificant.
+ *
+ * This function searches the bitmap range starting at search_start
+ * with length search_len for a set of contiguous free bits. We try
+ * to find up to 'wanted' bits, but can sometimes return less.
+ *
+ * Returns the length of allocation, 0 if no free bits are found.
+ *
+ * *cstart and *clen will also be populated with the result.
+ */
+static int ocfs2_resmap_find_free_bits(struct ocfs2_reservation_map *resmap,
+				       unsigned int wanted,
+				       unsigned int search_start,
+				       unsigned int search_len,
+				       unsigned int *rstart,
+				       unsigned int *rlen)
+{
+	void *bitmap = resmap->m_disk_bitmap;
+	unsigned int best_start, best_len = 0;
+	int offset, start, found;
+
+	mlog(0, "Find %u bits within range (%u, len %u) resmap len: %u\n",
+	     wanted, search_start, search_len, resmap->m_bitmap_len);
+
+	found = best_start = best_len = 0;
+
+	start = search_start;
+	while ((offset = ocfs2_find_next_zero_bit(bitmap, resmap->m_bitmap_len,
+						 start)) != -1) {
+		/* Search reached end of the region */
+		if (offset >= (search_start + search_len))
+			break;
+
+		if (offset == start) {
+			/* we found a zero */
+			found++;
+			/* move start to the next bit to test */
+			start++;
+		} else {
+			/* got a zero after some ones */
+			found = 1;
+			start = offset + 1;
+		}
+		if (found > best_len) {
+			best_len = found;
+			best_start = start - found;
+		}
+
+		if (found >= wanted)
+			break;
+	}
+
+	if (best_len == 0)
+		return 0;
+
+	if (best_len >= wanted)
+		best_len = wanted;
+
+	*rlen = best_len;
+	*rstart = best_start;
+
+	mlog(0, "Found start: %u len: %u\n", best_start, best_len);
+
+	return *rlen;
+}
+
+static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
+				     struct ocfs2_alloc_reservation *resv,
+				     unsigned int goal, unsigned int wanted)
+{
+	struct rb_root *root = &resmap->m_reservations;
+	unsigned int gap_start, gap_end, gap_len;
+	struct ocfs2_alloc_reservation *prev_resv, *next_resv;
+	struct rb_node *prev, *next;
+	unsigned int cstart, clen;
+	unsigned int best_start = 0, best_len = 0;
+
+	/*
+	 * Nasty cases to consider:
+	 *
+	 * - rbtree is empty
+	 * - our window should be first in all reservations
+	 * - our window should be last in all reservations
+	 * - need to make sure we don't go past end of bitmap
+	 */
+
+	mlog(0, "resv start: %u resv end: %u goal: %u wanted: %u\n",
+	     resv->r_start, ocfs2_resv_end(resv), goal, wanted);
+
+	assert_spin_locked(&resv_lock);
+
+	if (RB_EMPTY_ROOT(root)) {
+		/*
+		 * Easiest case - empty tree. We can just take
+		 * whatever window of free bits we want.
+		 */
+
+		mlog(0, "Empty root\n");
+
+		clen = ocfs2_resmap_find_free_bits(resmap, wanted, goal,
+						   resmap->m_bitmap_len - goal,
+						   &cstart, &clen);
+
+		/*
+		 * This should never happen - the local alloc window
+		 * will always have free bits when we're called.
+		 */
+		BUG_ON(goal == 0 && clen == 0);
+
+		if (clen == 0)
+			return;
+
+		resv->r_start = cstart;
+		resv->r_len = clen;
+
+		ocfs2_resv_insert(resmap, resv);
+		return;
+	}
+
+	prev_resv = ocfs2_find_resv_lhs(resmap, goal);
+
+	if (prev_resv == NULL) {
+		mlog(0, "Goal on LHS of leftmost window\n");
+
+		/*
+		 * A NULL here means that the search code couldn't
+		 * find a window that starts before goal.
+		 *
+		 * However, we can take the first window after goal,
+		 * which is also by definition, the leftmost window in
+		 * the entire tree. If we can find free bits in the
+		 * gap between goal and the LHS window, then the
+		 * reservation can safely be placed there.
+		 *
+		 * Otherwise we fall back to a linear search, checking
+		 * the gaps in between windows for a place to
+		 * allocate.
+		 */
+
+		next = rb_first(root);
+		next_resv = rb_entry(next, struct ocfs2_alloc_reservation,
+				     r_node);
+
+		/*
+		 * The search should never return such a window. (see
+		 * comment above
+		 */
+		if (next_resv->r_start <= goal) {
+			mlog(ML_ERROR, "goal: %u next_resv: start %u len %u\n",
+			     goal, next_resv->r_start, next_resv->r_len);
+			ocfs2_dump_resv(resmap);
+			BUG();
+		}
+
+		clen = ocfs2_resmap_find_free_bits(resmap, wanted, goal,
+						   next_resv->r_start - goal,
+						   &cstart, &clen);
+		if (clen) {
+			best_len = clen;
+			best_start = cstart;
+			if (best_len == wanted)
+				goto out_insert;
+		}
+
+		prev_resv = next_resv;
+		next_resv = NULL;
+	}
+
+	prev = &prev_resv->r_node;
+
+	/* Now we do a linear search for a window, starting at 'prev_rsv' */
+	while (1) {
+		next = rb_next(prev);
+		if (next) {
+			mlog(0, "One more resv found in linear search\n");
+			next_resv = rb_entry(next,
+					     struct ocfs2_alloc_reservation,
+					     r_node);
+
+			gap_start = ocfs2_resv_end(prev_resv) + 1;
+			gap_end = next_resv->r_start - 1;
+			gap_len = gap_end - gap_start + 1;
+		} else {
+			mlog(0, "No next node\n");
+			/*
+			 * We're at the rightmost edge of the
+			 * tree. See if a reservation between this
+			 * window and the end of the bitmap will work.
+			 */
+			gap_start = ocfs2_resv_end(prev_resv) + 1;
+			gap_len = resmap->m_bitmap_len - gap_start;
+			gap_end = resmap->m_bitmap_len - 1;
+		}
+
+		/*
+		 * No need to check this gap if we have already found
+		 * a larger region of free bits.
+		 */
+		if (gap_len <= best_len)
+			goto next_resv;
+
+		clen = ocfs2_resmap_find_free_bits(resmap, wanted, gap_start,
+						   gap_len, &cstart, &clen);
+		if (clen == wanted) {
+			best_len = clen;
+			best_start = cstart;
+			goto out_insert;
+		} else if (clen > best_len) {
+			best_len = clen;
+			best_start = cstart;
+		}
+
+next_resv:
+		if (!next)
+			break;
+
+		prev = next;
+		prev_resv = rb_entry(prev, struct ocfs2_alloc_reservation,
+				     r_node);
+	}
+
+out_insert:
+	if (best_len) {
+		resv->r_start = best_start;
+		resv->r_len = best_len;
+		ocfs2_resv_insert(resmap, resv);
+	}
+}
+
+static void ocfs2_cannibalize_resv(struct ocfs2_reservation_map *resmap,
+				   struct ocfs2_alloc_reservation *resv,
+				   unsigned int wanted)
+{
+	struct ocfs2_alloc_reservation *lru_resv;
+	int tmpwindow = !!(resv->r_flags & OCFS2_RESV_FLAG_TMP);
+	unsigned int min_bits;
+
+	if (!tmpwindow)
+		min_bits = ocfs2_resv_window_bits(resmap, resv) >> 1;
+	else
+		min_bits = wanted; /* We at know the temp window will use all
+				    * of these bits */
+
+	/*
+	 * Take the first reservation off the LRU as our 'target'. We
+	 * don't try to be smart about it. There might be a case for
+	 * searching based on size but I don't have enough data to be
+	 * sure. --Mark (3/16/2010)
+	 */
+	lru_resv = list_first_entry(&resmap->m_lru,
+				    struct ocfs2_alloc_reservation, r_lru);
+
+	mlog(0, "lru resv: start: %u len: %u end: %u\n", lru_resv->r_start,
+	     lru_resv->r_len, ocfs2_resv_end(lru_resv));
+
+	/*
+	 * Cannibalize (some or all) of the target reservation and
+	 * feed it to the current window.
+	 */
+	if (lru_resv->r_len <= min_bits) {
+		/*
+		 * Discard completely if size is less than or equal to a
+		 * reasonable threshold - 50% of window bits for non temporary
+		 * windows.
+		 */
+		resv->r_start = lru_resv->r_start;
+		resv->r_len = lru_resv->r_len;
+
+		__ocfs2_resv_discard(resmap, lru_resv);
+	} else {
+		unsigned int shrink;
+		if (tmpwindow)
+			shrink = min_bits;
+		else
+			shrink = lru_resv->r_len / 2;
+
+		lru_resv->r_len -= shrink;
+
+		resv->r_start = ocfs2_resv_end(lru_resv) + 1;
+		resv->r_len = shrink;
+	}
+
+	mlog(0, "Reservation now looks like: r_start: %u r_end: %u "
+	     "r_len: %u r_last_start: %u r_last_len: %u\n",
+	     resv->r_start, ocfs2_resv_end(resv), resv->r_len,
+	     resv->r_last_start, resv->r_last_len);
+
+	ocfs2_resv_insert(resmap, resv);
+}
+
+static void ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
+				   struct ocfs2_alloc_reservation *resv,
+				   unsigned int wanted)
+{
+	unsigned int goal = 0;
+
+	BUG_ON(!ocfs2_resv_empty(resv));
+
+	/*
+	 * Begin by trying to get a window as close to the previous
+	 * one as possible. Using the most recent allocation as a
+	 * start goal makes sense.
+	 */
+	if (resv->r_last_len) {
+		goal = resv->r_last_start + resv->r_last_len;
+		if (goal >= resmap->m_bitmap_len)
+			goal = 0;
+	}
+
+	__ocfs2_resv_find_window(resmap, resv, goal, wanted);
+
+	/* Search from last alloc didn't work, try once more from beginning. */
+	if (ocfs2_resv_empty(resv) && goal != 0)
+		__ocfs2_resv_find_window(resmap, resv, 0, wanted);
+
+	if (ocfs2_resv_empty(resv)) {
+		/*
+		 * Still empty? Pull oldest one off the LRU, remove it from
+		 * tree, put this one in it's place.
+		 */
+		ocfs2_cannibalize_resv(resmap, resv, wanted);
+	}
+
+	BUG_ON(ocfs2_resv_empty(resv));
+}
+
+int ocfs2_resmap_resv_bits(struct ocfs2_reservation_map *resmap,
+			   struct ocfs2_alloc_reservation *resv,
+			   int *cstart, int *clen)
+{
+	unsigned int wanted = *clen;
+
+	if (resv == NULL || ocfs2_resmap_disabled(resmap))
+		return -ENOSPC;
+
+	spin_lock(&resv_lock);
+
+	/*
+	 * We don't want to over-allocate for temporary
+	 * windows. Otherwise, we run the risk of fragmenting the
+	 * allocation space.
+	 */
+	wanted = ocfs2_resv_window_bits(resmap, resv);
+	if ((resv->r_flags & OCFS2_RESV_FLAG_TMP) || wanted < *clen)
+		wanted = *clen;
+
+	if (ocfs2_resv_empty(resv)) {
+		mlog(0, "empty reservation, find new window\n");
+
+		/*
+		 * Try to get a window here. If it works, we must fall
+		 * through and test the bitmap . This avoids some
+		 * ping-ponging of windows due to non-reserved space
+		 * being allocation before we initialize a window for
+		 * that inode.
+		 */
+		ocfs2_resv_find_window(resmap, resv, wanted);
+	}
+
+	BUG_ON(ocfs2_resv_empty(resv));
+
+	*cstart = resv->r_start;
+	*clen = resv->r_len;
+
+	spin_unlock(&resv_lock);
+	return 0;
+}
+
+static void
+	ocfs2_adjust_resv_from_alloc(struct ocfs2_reservation_map *resmap,
+				     struct ocfs2_alloc_reservation *resv,
+				     unsigned int start, unsigned int end)
+{
+	unsigned int rhs = 0;
+	unsigned int old_end = ocfs2_resv_end(resv);
+
+	BUG_ON(start != resv->r_start || old_end < end);
+
+	/*
+	 * Completely used? We can remove it then.
+	 */
+	if (old_end == end) {
+		__ocfs2_resv_discard(resmap, resv);
+		return;
+	}
+
+	rhs = old_end - end;
+
+	/*
+	 * This should have been trapped above.
+	 */
+	BUG_ON(rhs == 0);
+
+	resv->r_start = end + 1;
+	resv->r_len = old_end - resv->r_start + 1;
+}
+
+void ocfs2_resmap_claimed_bits(struct ocfs2_reservation_map *resmap,
+			       struct ocfs2_alloc_reservation *resv,
+			       u32 cstart, u32 clen)
+{
+	unsigned int cend = cstart + clen - 1;
+
+	if (resmap == NULL || ocfs2_resmap_disabled(resmap))
+		return;
+
+	if (resv == NULL)
+		return;
+
+	BUG_ON(cstart != resv->r_start);
+
+	spin_lock(&resv_lock);
+
+	mlog(0, "claim bits: cstart: %u cend: %u clen: %u r_start: %u "
+	     "r_end: %u r_len: %u, r_last_start: %u r_last_len: %u\n",
+	     cstart, cend, clen, resv->r_start, ocfs2_resv_end(resv),
+	     resv->r_len, resv->r_last_start, resv->r_last_len);
+
+	BUG_ON(cstart < resv->r_start);
+	BUG_ON(cstart > ocfs2_resv_end(resv));
+	BUG_ON(cend > ocfs2_resv_end(resv));
+
+	ocfs2_adjust_resv_from_alloc(resmap, resv, cstart, cend);
+	resv->r_last_start = cstart;
+	resv->r_last_len = clen;
+
+	/*
+	 * May have been discarded above from
+	 * ocfs2_adjust_resv_from_alloc().
+	 */
+	if (!ocfs2_resv_empty(resv))
+		ocfs2_resv_mark_lru(resmap, resv);
+
+	mlog(0, "Reservation now looks like: r_start: %u r_end: %u "
+	     "r_len: %u r_last_start: %u r_last_len: %u\n",
+	     resv->r_start, ocfs2_resv_end(resv), resv->r_len,
+	     resv->r_last_start, resv->r_last_len);
+
+	ocfs2_check_resmap(resmap);
+
+	spin_unlock(&resv_lock);
+}
diff --git a/fs/ocfs2/reservations.h b/fs/ocfs2/reservations.h
new file mode 100644
index 000000000000..1e49cc29d06c
--- /dev/null
+++ b/fs/ocfs2/reservations.h
@@ -0,0 +1,159 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * reservations.h
+ *
+ * Allocation reservations function prototypes and structures.
+ *
+ * Copyright (C) 2010 Novell.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#ifndef	OCFS2_RESERVATIONS_H
+#define	OCFS2_RESERVATIONS_H
+
+#include <linux/rbtree.h>
+
+#define OCFS2_DEFAULT_RESV_LEVEL	2
+#define OCFS2_MAX_RESV_LEVEL	9
+#define OCFS2_MIN_RESV_LEVEL	0
+
+struct ocfs2_alloc_reservation {
+	struct rb_node	r_node;
+
+	unsigned int	r_start;	/* Begining of current window */
+	unsigned int	r_len;		/* Length of the window */
+
+	unsigned int	r_last_len;	/* Length of most recent alloc */
+	unsigned int	r_last_start;	/* Start of most recent alloc */
+	struct list_head	r_lru;	/* LRU list head */
+
+	unsigned int	r_flags;
+};
+
+#define	OCFS2_RESV_FLAG_INUSE	0x01	/* Set when r_node is part of a btree */
+#define	OCFS2_RESV_FLAG_TMP	0x02	/* Temporary reservation, will be
+					 * destroyed immedately after use */
+#define	OCFS2_RESV_FLAG_DIR	0x04	/* Reservation is for an unindexed
+					 * directory btree */
+
+struct ocfs2_reservation_map {
+	struct rb_root		m_reservations;
+	char			*m_disk_bitmap;
+
+	struct ocfs2_super	*m_osb;
+
+	/* The following are not initialized to meaningful values until a disk
+	 * bitmap is provided. */
+	u32			m_bitmap_len;	/* Number of valid
+						 * bits available */
+
+	struct list_head	m_lru;		/* LRU of reservations
+						 * structures. */
+
+};
+
+void ocfs2_resv_init_once(struct ocfs2_alloc_reservation *resv);
+
+#define OCFS2_RESV_TYPES	(OCFS2_RESV_FLAG_TMP|OCFS2_RESV_FLAG_DIR)
+void ocfs2_resv_set_type(struct ocfs2_alloc_reservation *resv,
+			 unsigned int flags);
+
+int ocfs2_dir_resv_allowed(struct ocfs2_super *osb);
+
+/**
+ * ocfs2_resv_discard() - truncate a reservation
+ * @resmap:
+ * @resv: the reservation to truncate.
+ *
+ * After this function is called, the reservation will be empty, and
+ * unlinked from the rbtree.
+ */
+void ocfs2_resv_discard(struct ocfs2_reservation_map *resmap,
+			struct ocfs2_alloc_reservation *resv);
+
+
+/**
+ * ocfs2_resmap_init() - Initialize fields of a reservations bitmap
+ * @resmap: struct ocfs2_reservation_map to initialize
+ * @obj: unused for now
+ * @ops: unused for now
+ * @max_bitmap_bytes: Maximum size of the bitmap (typically blocksize)
+ *
+ * Only possible return value other than '0' is -ENOMEM for failure to
+ * allocation mirror bitmap.
+ */
+int ocfs2_resmap_init(struct ocfs2_super *osb,
+		      struct ocfs2_reservation_map *resmap);
+
+/**
+ * ocfs2_resmap_restart() - "restart" a reservation bitmap
+ * @resmap: reservations bitmap
+ * @clen: Number of valid bits in the bitmap
+ * @disk_bitmap: the disk bitmap this resmap should refer to.
+ *
+ * Re-initialize the parameters of a reservation bitmap. This is
+ * useful for local alloc window slides.
+ *
+ * This function will call ocfs2_trunc_resv against all existing
+ * reservations. A future version will recalculate existing
+ * reservations based on the new bitmap.
+ */
+void ocfs2_resmap_restart(struct ocfs2_reservation_map *resmap,
+			  unsigned int clen, char *disk_bitmap);
+
+/**
+ * ocfs2_resmap_uninit() - uninitialize a reservation bitmap structure
+ * @resmap: the struct ocfs2_reservation_map to uninitialize
+ */
+void ocfs2_resmap_uninit(struct ocfs2_reservation_map *resmap);
+
+/**
+ * ocfs2_resmap_resv_bits() - Return still-valid reservation bits
+ * @resmap: reservations bitmap
+ * @resv: reservation to base search from
+ * @cstart: start of proposed allocation
+ * @clen: length (in clusters) of proposed allocation
+ *
+ * Using the reservation data from resv, this function will compare
+ * resmap and resmap->m_disk_bitmap to determine what part (if any) of
+ * the reservation window is still clear to use. If resv is empty,
+ * this function will try to allocate a window for it.
+ *
+ * On success, zero is returned and the valid allocation area is set in cstart
+ * and clen.
+ *
+ * Returns -ENOSPC if reservations are disabled.
+ */
+int ocfs2_resmap_resv_bits(struct ocfs2_reservation_map *resmap,
+			   struct ocfs2_alloc_reservation *resv,
+			   int *cstart, int *clen);
+
+/**
+ * ocfs2_resmap_claimed_bits() - Tell the reservation code that bits were used.
+ * @resmap: reservations bitmap
+ * @resv: optional reservation to recalulate based on new bitmap
+ * @cstart: start of allocation in clusters
+ * @clen: end of allocation in clusters.
+ *
+ * Tell the reservation code that bits were used to fulfill allocation in
+ * resmap. The bits don't have to have been part of any existing
+ * reservation. But we must always call this function when bits are claimed.
+ * Internally, the reservations code will use this information to mark the
+ * reservations bitmap. If resv is passed, it's next allocation window will be
+ * calculated. It also expects that 'cstart' is the same as we passed back
+ * from ocfs2_resmap_resv_bits().
+ */
+void ocfs2_resmap_claimed_bits(struct ocfs2_reservation_map *resmap,
+			       struct ocfs2_alloc_reservation *resv,
+			       u32 cstart, u32 clen);
+
+#endif	/* OCFS2_RESERVATIONS_H */
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index 3c3d673a4d20..dacd553d8617 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -134,11 +134,7 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle,
 		le16_add_cpu(&group->bg_free_bits_count, -1 * backups);
 	}
 
-	ret = ocfs2_journal_dirty(handle, group_bh);
-	if (ret < 0) {
-		mlog_errno(ret);
-		goto out_rollback;
-	}
+	ocfs2_journal_dirty(handle, group_bh);
 
 	/* update the inode accordingly. */
 	ret = ocfs2_journal_access_di(handle, INODE_CACHE(bm_inode), bm_bh,
@@ -319,7 +315,8 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters)
 	BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
 
 	if (le16_to_cpu(fe->id2.i_chain.cl_cpg) !=
-				 ocfs2_group_bitmap_size(osb->sb) * 8) {
+		ocfs2_group_bitmap_size(osb->sb, 0,
+					osb->s_feature_incompat) * 8) {
 		mlog(ML_ERROR, "The disk is too old and small. "
 		     "Force to do offline resize.");
 		ret = -EINVAL;
@@ -500,7 +497,8 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
 	fe = (struct ocfs2_dinode *)main_bm_bh->b_data;
 
 	if (le16_to_cpu(fe->id2.i_chain.cl_cpg) !=
-				 ocfs2_group_bitmap_size(osb->sb) * 8) {
+		ocfs2_group_bitmap_size(osb->sb, 0,
+					osb->s_feature_incompat) * 8) {
 		mlog(ML_ERROR, "The disk is too old and small."
 		     " Force to do offline resize.");
 		ret = -EINVAL;
@@ -545,12 +543,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
 
 	group = (struct ocfs2_group_desc *)group_bh->b_data;
 	group->bg_next_group = cr->c_blkno;
-
-	ret = ocfs2_journal_dirty(handle, group_bh);
-	if (ret < 0) {
-		mlog_errno(ret);
-		goto out_commit;
-	}
+	ocfs2_journal_dirty(handle, group_bh);
 
 	ret = ocfs2_journal_access_di(handle, INODE_CACHE(main_bm_inode),
 				      main_bm_bh, OCFS2_JOURNAL_ACCESS_WRITE);
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
index e49c41050264..0d3049f696c5 100644
--- a/fs/ocfs2/stack_o2cb.c
+++ b/fs/ocfs2/stack_o2cb.c
@@ -19,6 +19,7 @@
 
 #include <linux/kernel.h>
 #include <linux/crc32.h>
+#include <linux/slab.h>
 #include <linux/module.h>
 
 /* Needed for AOP_TRUNCATED_PAGE in mlog_errno() */
@@ -161,24 +162,23 @@ static int dlm_status_to_errno(enum dlm_status status)
 
 static void o2dlm_lock_ast_wrapper(void *astarg)
 {
-	BUG_ON(o2cb_stack.sp_proto == NULL);
+	struct ocfs2_dlm_lksb *lksb = astarg;
 
-	o2cb_stack.sp_proto->lp_lock_ast(astarg);
+	lksb->lksb_conn->cc_proto->lp_lock_ast(lksb);
 }
 
 static void o2dlm_blocking_ast_wrapper(void *astarg, int level)
 {
-	BUG_ON(o2cb_stack.sp_proto == NULL);
+	struct ocfs2_dlm_lksb *lksb = astarg;
 
-	o2cb_stack.sp_proto->lp_blocking_ast(astarg, level);
+	lksb->lksb_conn->cc_proto->lp_blocking_ast(lksb, level);
 }
 
 static void o2dlm_unlock_ast_wrapper(void *astarg, enum dlm_status status)
 {
+	struct ocfs2_dlm_lksb *lksb = astarg;
 	int error = dlm_status_to_errno(status);
 
-	BUG_ON(o2cb_stack.sp_proto == NULL);
-
 	/*
 	 * In o2dlm, you can get both the lock_ast() for the lock being
 	 * granted and the unlock_ast() for the CANCEL failing.  A
@@ -193,16 +193,15 @@ static void o2dlm_unlock_ast_wrapper(void *astarg, enum dlm_status status)
 	if (status == DLM_CANCELGRANT)
 		return;
 
-	o2cb_stack.sp_proto->lp_unlock_ast(astarg, error);
+	lksb->lksb_conn->cc_proto->lp_unlock_ast(lksb, error);
 }
 
 static int o2cb_dlm_lock(struct ocfs2_cluster_connection *conn,
 			 int mode,
-			 union ocfs2_dlm_lksb *lksb,
+			 struct ocfs2_dlm_lksb *lksb,
 			 u32 flags,
 			 void *name,
-			 unsigned int namelen,
-			 void *astarg)
+			 unsigned int namelen)
 {
 	enum dlm_status status;
 	int o2dlm_mode = mode_to_o2dlm(mode);
@@ -211,28 +210,27 @@ static int o2cb_dlm_lock(struct ocfs2_cluster_connection *conn,
 
 	status = dlmlock(conn->cc_lockspace, o2dlm_mode, &lksb->lksb_o2dlm,
 			 o2dlm_flags, name, namelen,
-			 o2dlm_lock_ast_wrapper, astarg,
+			 o2dlm_lock_ast_wrapper, lksb,
 			 o2dlm_blocking_ast_wrapper);
 	ret = dlm_status_to_errno(status);
 	return ret;
 }
 
 static int o2cb_dlm_unlock(struct ocfs2_cluster_connection *conn,
-			   union ocfs2_dlm_lksb *lksb,
-			   u32 flags,
-			   void *astarg)
+			   struct ocfs2_dlm_lksb *lksb,
+			   u32 flags)
 {
 	enum dlm_status status;
 	int o2dlm_flags = flags_to_o2dlm(flags);
 	int ret;
 
 	status = dlmunlock(conn->cc_lockspace, &lksb->lksb_o2dlm,
-			   o2dlm_flags, o2dlm_unlock_ast_wrapper, astarg);
+			   o2dlm_flags, o2dlm_unlock_ast_wrapper, lksb);
 	ret = dlm_status_to_errno(status);
 	return ret;
 }
 
-static int o2cb_dlm_lock_status(union ocfs2_dlm_lksb *lksb)
+static int o2cb_dlm_lock_status(struct ocfs2_dlm_lksb *lksb)
 {
 	return dlm_status_to_errno(lksb->lksb_o2dlm.status);
 }
@@ -242,17 +240,17 @@ static int o2cb_dlm_lock_status(union ocfs2_dlm_lksb *lksb)
  * contents, it will zero out the LVB.  Thus the caller can always trust
  * the contents.
  */
-static int o2cb_dlm_lvb_valid(union ocfs2_dlm_lksb *lksb)
+static int o2cb_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb)
 {
 	return 1;
 }
 
-static void *o2cb_dlm_lvb(union ocfs2_dlm_lksb *lksb)
+static void *o2cb_dlm_lvb(struct ocfs2_dlm_lksb *lksb)
 {
 	return (void *)(lksb->lksb_o2dlm.lvb);
 }
 
-static void o2cb_dump_lksb(union ocfs2_dlm_lksb *lksb)
+static void o2cb_dump_lksb(struct ocfs2_dlm_lksb *lksb)
 {
 	dlm_print_one_lock(lksb->lksb_o2dlm.lockid);
 }
@@ -277,10 +275,10 @@ static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn)
 	u32 dlm_key;
 	struct dlm_ctxt *dlm;
 	struct o2dlm_private *priv;
-	struct dlm_protocol_version dlm_version;
+	struct dlm_protocol_version fs_version;
 
 	BUG_ON(conn == NULL);
-	BUG_ON(o2cb_stack.sp_proto == NULL);
+	BUG_ON(conn->cc_proto == NULL);
 
 	/* for now we only have one cluster/node, make sure we see it
 	 * in the heartbeat universe */
@@ -304,18 +302,18 @@ static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn)
 	/* used by the dlm code to make message headers unique, each
 	 * node in this domain must agree on this. */
 	dlm_key = crc32_le(0, conn->cc_name, conn->cc_namelen);
-	dlm_version.pv_major = conn->cc_version.pv_major;
-	dlm_version.pv_minor = conn->cc_version.pv_minor;
+	fs_version.pv_major = conn->cc_version.pv_major;
+	fs_version.pv_minor = conn->cc_version.pv_minor;
 
-	dlm = dlm_register_domain(conn->cc_name, dlm_key, &dlm_version);
+	dlm = dlm_register_domain(conn->cc_name, dlm_key, &fs_version);
 	if (IS_ERR(dlm)) {
 		rc = PTR_ERR(dlm);
 		mlog_errno(rc);
 		goto out_free;
 	}
 
-	conn->cc_version.pv_major = dlm_version.pv_major;
-	conn->cc_version.pv_minor = dlm_version.pv_minor;
+	conn->cc_version.pv_major = fs_version.pv_major;
+	conn->cc_version.pv_minor = fs_version.pv_minor;
 	conn->cc_lockspace = dlm;
 
 	dlm_register_eviction_cb(dlm, &priv->op_eviction_cb);
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index da78a2a334fd..2dc57bca0688 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -21,11 +21,11 @@
 #include <linux/fs.h>
 #include <linux/miscdevice.h>
 #include <linux/mutex.h>
+#include <linux/slab.h>
 #include <linux/smp_lock.h>
 #include <linux/reboot.h>
 #include <asm/uaccess.h>
 
-#include "ocfs2.h"  /* For struct ocfs2_lock_res */
 #include "stackglue.h"
 
 #include <linux/dlm_plock.h>
@@ -63,8 +63,8 @@
  * negotiated by the client.  The client negotiates based on the maximum
  * version advertised in /sys/fs/ocfs2/max_locking_protocol.  The major
  * number from the "SETV" message must match
- * ocfs2_user_plugin.sp_proto->lp_max_version.pv_major, and the minor number
- * must be less than or equal to ...->lp_max_version.pv_minor.
+ * ocfs2_user_plugin.sp_max_proto.pv_major, and the minor number
+ * must be less than or equal to ...sp_max_version.pv_minor.
  *
  * Once this information has been set, mounts will be allowed.  From this
  * point on, the "DOWN" message can be sent for node down notification.
@@ -401,7 +401,7 @@ static int ocfs2_control_do_setversion_msg(struct file *file,
 	char *ptr = NULL;
 	struct ocfs2_control_private *p = file->private_data;
 	struct ocfs2_protocol_version *max =
-		&ocfs2_user_plugin.sp_proto->lp_max_version;
+		&ocfs2_user_plugin.sp_max_proto;
 
 	if (ocfs2_control_get_handshake_state(file) !=
 	    OCFS2_CONTROL_HANDSHAKE_PROTOCOL)
@@ -664,18 +664,10 @@ static void ocfs2_control_exit(void)
 		       -rc);
 }
 
-static struct dlm_lksb *fsdlm_astarg_to_lksb(void *astarg)
-{
-	struct ocfs2_lock_res *res = astarg;
-	return &res->l_lksb.lksb_fsdlm;
-}
-
 static void fsdlm_lock_ast_wrapper(void *astarg)
 {
-	struct dlm_lksb *lksb = fsdlm_astarg_to_lksb(astarg);
-	int status = lksb->sb_status;
-
-	BUG_ON(ocfs2_user_plugin.sp_proto == NULL);
+	struct ocfs2_dlm_lksb *lksb = astarg;
+	int status = lksb->lksb_fsdlm.sb_status;
 
 	/*
 	 * For now we're punting on the issue of other non-standard errors
@@ -688,25 +680,24 @@ static void fsdlm_lock_ast_wrapper(void *astarg)
 	 */
 
 	if (status == -DLM_EUNLOCK || status == -DLM_ECANCEL)
-		ocfs2_user_plugin.sp_proto->lp_unlock_ast(astarg, 0);
+		lksb->lksb_conn->cc_proto->lp_unlock_ast(lksb, 0);
 	else
-		ocfs2_user_plugin.sp_proto->lp_lock_ast(astarg);
+		lksb->lksb_conn->cc_proto->lp_lock_ast(lksb);
 }
 
 static void fsdlm_blocking_ast_wrapper(void *astarg, int level)
 {
-	BUG_ON(ocfs2_user_plugin.sp_proto == NULL);
+	struct ocfs2_dlm_lksb *lksb = astarg;
 
-	ocfs2_user_plugin.sp_proto->lp_blocking_ast(astarg, level);
+	lksb->lksb_conn->cc_proto->lp_blocking_ast(lksb, level);
 }
 
 static int user_dlm_lock(struct ocfs2_cluster_connection *conn,
 			 int mode,
-			 union ocfs2_dlm_lksb *lksb,
+			 struct ocfs2_dlm_lksb *lksb,
 			 u32 flags,
 			 void *name,
-			 unsigned int namelen,
-			 void *astarg)
+			 unsigned int namelen)
 {
 	int ret;
 
@@ -716,36 +707,35 @@ static int user_dlm_lock(struct ocfs2_cluster_connection *conn,
 
 	ret = dlm_lock(conn->cc_lockspace, mode, &lksb->lksb_fsdlm,
 		       flags|DLM_LKF_NODLCKWT, name, namelen, 0,
-		       fsdlm_lock_ast_wrapper, astarg,
+		       fsdlm_lock_ast_wrapper, lksb,
 		       fsdlm_blocking_ast_wrapper);
 	return ret;
 }
 
 static int user_dlm_unlock(struct ocfs2_cluster_connection *conn,
-			   union ocfs2_dlm_lksb *lksb,
-			   u32 flags,
-			   void *astarg)
+			   struct ocfs2_dlm_lksb *lksb,
+			   u32 flags)
 {
 	int ret;
 
 	ret = dlm_unlock(conn->cc_lockspace, lksb->lksb_fsdlm.sb_lkid,
-			 flags, &lksb->lksb_fsdlm, astarg);
+			 flags, &lksb->lksb_fsdlm, lksb);
 	return ret;
 }
 
-static int user_dlm_lock_status(union ocfs2_dlm_lksb *lksb)
+static int user_dlm_lock_status(struct ocfs2_dlm_lksb *lksb)
 {
 	return lksb->lksb_fsdlm.sb_status;
 }
 
-static int user_dlm_lvb_valid(union ocfs2_dlm_lksb *lksb)
+static int user_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb)
 {
 	int invalid = lksb->lksb_fsdlm.sb_flags & DLM_SBF_VALNOTVALID;
 
 	return !invalid;
 }
 
-static void *user_dlm_lvb(union ocfs2_dlm_lksb *lksb)
+static void *user_dlm_lvb(struct ocfs2_dlm_lksb *lksb)
 {
 	if (!lksb->lksb_fsdlm.sb_lvbptr)
 		lksb->lksb_fsdlm.sb_lvbptr = (char *)lksb +
@@ -753,7 +743,7 @@ static void *user_dlm_lvb(union ocfs2_dlm_lksb *lksb)
 	return (void *)(lksb->lksb_fsdlm.sb_lvbptr);
 }
 
-static void user_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb)
+static void user_dlm_dump_lksb(struct ocfs2_dlm_lksb *lksb)
 {
 }
 
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index f3df0baa9a48..39abf89697ed 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -36,7 +36,7 @@
 #define OCFS2_STACK_PLUGIN_USER		"user"
 #define OCFS2_MAX_HB_CTL_PATH		256
 
-static struct ocfs2_locking_protocol *lproto;
+static struct ocfs2_protocol_version locking_max_version;
 static DEFINE_SPINLOCK(ocfs2_stack_lock);
 static LIST_HEAD(ocfs2_stack_list);
 static char cluster_stack_name[OCFS2_STACK_LABEL_LEN + 1];
@@ -176,7 +176,7 @@ int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin)
 	spin_lock(&ocfs2_stack_lock);
 	if (!ocfs2_stack_lookup(plugin->sp_name)) {
 		plugin->sp_count = 0;
-		plugin->sp_proto = lproto;
+		plugin->sp_max_proto = locking_max_version;
 		list_add(&plugin->sp_list, &ocfs2_stack_list);
 		printk(KERN_INFO "ocfs2: Registered cluster interface %s\n",
 		       plugin->sp_name);
@@ -213,77 +213,76 @@ void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin)
 }
 EXPORT_SYMBOL_GPL(ocfs2_stack_glue_unregister);
 
-void ocfs2_stack_glue_set_locking_protocol(struct ocfs2_locking_protocol *proto)
+void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_proto)
 {
 	struct ocfs2_stack_plugin *p;
 
-	BUG_ON(proto == NULL);
-
 	spin_lock(&ocfs2_stack_lock);
-	BUG_ON(active_stack != NULL);
+	if (memcmp(max_proto, &locking_max_version,
+		   sizeof(struct ocfs2_protocol_version))) {
+		BUG_ON(locking_max_version.pv_major != 0);
 
-	lproto = proto;
-	list_for_each_entry(p, &ocfs2_stack_list, sp_list) {
-		p->sp_proto = lproto;
+		locking_max_version = *max_proto;
+		list_for_each_entry(p, &ocfs2_stack_list, sp_list) {
+			p->sp_max_proto = locking_max_version;
+		}
 	}
-
 	spin_unlock(&ocfs2_stack_lock);
 }
-EXPORT_SYMBOL_GPL(ocfs2_stack_glue_set_locking_protocol);
+EXPORT_SYMBOL_GPL(ocfs2_stack_glue_set_max_proto_version);
 
 
 /*
- * The ocfs2_dlm_lock() and ocfs2_dlm_unlock() functions take
- * "struct ocfs2_lock_res *astarg" instead of "void *astarg" because the
- * underlying stack plugins need to pilfer the lksb off of the lock_res.
- * If some other structure needs to be passed as an astarg, the plugins
- * will need to be given a different avenue to the lksb.
+ * The ocfs2_dlm_lock() and ocfs2_dlm_unlock() functions take no argument
+ * for the ast and bast functions.  They will pass the lksb to the ast
+ * and bast.  The caller can wrap the lksb with their own structure to
+ * get more information.
  */
 int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn,
 		   int mode,
-		   union ocfs2_dlm_lksb *lksb,
+		   struct ocfs2_dlm_lksb *lksb,
 		   u32 flags,
 		   void *name,
-		   unsigned int namelen,
-		   struct ocfs2_lock_res *astarg)
+		   unsigned int namelen)
 {
-	BUG_ON(lproto == NULL);
-
+	if (!lksb->lksb_conn)
+		lksb->lksb_conn = conn;
+	else
+		BUG_ON(lksb->lksb_conn != conn);
 	return active_stack->sp_ops->dlm_lock(conn, mode, lksb, flags,
-					      name, namelen, astarg);
+					      name, namelen);
 }
 EXPORT_SYMBOL_GPL(ocfs2_dlm_lock);
 
 int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn,
-		     union ocfs2_dlm_lksb *lksb,
-		     u32 flags,
-		     struct ocfs2_lock_res *astarg)
+		     struct ocfs2_dlm_lksb *lksb,
+		     u32 flags)
 {
-	BUG_ON(lproto == NULL);
+	BUG_ON(lksb->lksb_conn == NULL);
 
-	return active_stack->sp_ops->dlm_unlock(conn, lksb, flags, astarg);
+	return active_stack->sp_ops->dlm_unlock(conn, lksb, flags);
 }
 EXPORT_SYMBOL_GPL(ocfs2_dlm_unlock);
 
-int ocfs2_dlm_lock_status(union ocfs2_dlm_lksb *lksb)
+int ocfs2_dlm_lock_status(struct ocfs2_dlm_lksb *lksb)
 {
 	return active_stack->sp_ops->lock_status(lksb);
 }
 EXPORT_SYMBOL_GPL(ocfs2_dlm_lock_status);
 
-int ocfs2_dlm_lvb_valid(union ocfs2_dlm_lksb *lksb)
+int ocfs2_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb)
 {
 	return active_stack->sp_ops->lvb_valid(lksb);
 }
 EXPORT_SYMBOL_GPL(ocfs2_dlm_lvb_valid);
 
-void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb)
+void *ocfs2_dlm_lvb(struct ocfs2_dlm_lksb *lksb)
 {
 	return active_stack->sp_ops->lock_lvb(lksb);
 }
 EXPORT_SYMBOL_GPL(ocfs2_dlm_lvb);
 
-void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb)
+void ocfs2_dlm_dump_lksb(struct ocfs2_dlm_lksb *lksb)
 {
 	active_stack->sp_ops->dump_lksb(lksb);
 }
@@ -312,6 +311,7 @@ EXPORT_SYMBOL_GPL(ocfs2_plock);
 int ocfs2_cluster_connect(const char *stack_name,
 			  const char *group,
 			  int grouplen,
+			  struct ocfs2_locking_protocol *lproto,
 			  void (*recovery_handler)(int node_num,
 						   void *recovery_data),
 			  void *recovery_data,
@@ -329,6 +329,12 @@ int ocfs2_cluster_connect(const char *stack_name,
 		goto out;
 	}
 
+	if (memcmp(&lproto->lp_max_version, &locking_max_version,
+		   sizeof(struct ocfs2_protocol_version))) {
+		rc = -EINVAL;
+		goto out;
+	}
+
 	new_conn = kzalloc(sizeof(struct ocfs2_cluster_connection),
 			   GFP_KERNEL);
 	if (!new_conn) {
@@ -341,6 +347,7 @@ int ocfs2_cluster_connect(const char *stack_name,
 	new_conn->cc_recovery_handler = recovery_handler;
 	new_conn->cc_recovery_data = recovery_data;
 
+	new_conn->cc_proto = lproto;
 	/* Start the new connection at our maximum compatibility level */
 	new_conn->cc_version = lproto->lp_max_version;
 
@@ -366,6 +373,24 @@ out:
 }
 EXPORT_SYMBOL_GPL(ocfs2_cluster_connect);
 
+/* The caller will ensure all nodes have the same cluster stack */
+int ocfs2_cluster_connect_agnostic(const char *group,
+				   int grouplen,
+				   struct ocfs2_locking_protocol *lproto,
+				   void (*recovery_handler)(int node_num,
+							    void *recovery_data),
+				   void *recovery_data,
+				   struct ocfs2_cluster_connection **conn)
+{
+	char *stack_name = NULL;
+
+	if (cluster_stack_name[0])
+		stack_name = cluster_stack_name;
+	return ocfs2_cluster_connect(stack_name, group, grouplen, lproto,
+				     recovery_handler, recovery_data, conn);
+}
+EXPORT_SYMBOL_GPL(ocfs2_cluster_connect_agnostic);
+
 /* If hangup_pending is 0, the stack driver will be dropped */
 int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn,
 			     int hangup_pending)
@@ -453,10 +478,10 @@ static ssize_t ocfs2_max_locking_protocol_show(struct kobject *kobj,
 	ssize_t ret = 0;
 
 	spin_lock(&ocfs2_stack_lock);
-	if (lproto)
+	if (locking_max_version.pv_major)
 		ret = snprintf(buf, PAGE_SIZE, "%u.%u\n",
-			       lproto->lp_max_version.pv_major,
-			       lproto->lp_max_version.pv_minor);
+			       locking_max_version.pv_major,
+			       locking_max_version.pv_minor);
 	spin_unlock(&ocfs2_stack_lock);
 
 	return ret;
@@ -685,7 +710,10 @@ static int __init ocfs2_stack_glue_init(void)
 
 static void __exit ocfs2_stack_glue_exit(void)
 {
-	lproto = NULL;
+	memset(&locking_max_version, 0,
+	       sizeof(struct ocfs2_protocol_version));
+	locking_max_version.pv_major = 0;
+	locking_max_version.pv_minor = 0;
 	ocfs2_sysfs_exit();
 	if (ocfs2_table_header)
 		unregister_sysctl_table(ocfs2_table_header);
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
index 03a44d60eac9..8ce7398ae1d2 100644
--- a/fs/ocfs2/stackglue.h
+++ b/fs/ocfs2/stackglue.h
@@ -56,17 +56,6 @@ struct ocfs2_protocol_version {
 };
 
 /*
- * The ocfs2_locking_protocol defines the handlers called on ocfs2's behalf.
- */
-struct ocfs2_locking_protocol {
-	struct ocfs2_protocol_version lp_max_version;
-	void (*lp_lock_ast)(void *astarg);
-	void (*lp_blocking_ast)(void *astarg, int level);
-	void (*lp_unlock_ast)(void *astarg, int error);
-};
-
-
-/*
  * The dlm_lockstatus struct includes lvb space, but the dlm_lksb struct only
  * has a pointer to separately allocated lvb space.  This struct exists only to
  * include in the lksb union to make space for a combined dlm_lksb and lvb.
@@ -81,12 +70,27 @@ struct fsdlm_lksb_plus_lvb {
  * size of the union is known.  Lock status structures are embedded in
  * ocfs2 inodes.
  */
-union ocfs2_dlm_lksb {
-	struct dlm_lockstatus lksb_o2dlm;
-	struct dlm_lksb lksb_fsdlm;
-	struct fsdlm_lksb_plus_lvb padding;
+struct ocfs2_cluster_connection;
+struct ocfs2_dlm_lksb {
+	 union {
+		 struct dlm_lockstatus lksb_o2dlm;
+		 struct dlm_lksb lksb_fsdlm;
+		 struct fsdlm_lksb_plus_lvb padding;
+	 };
+	 struct ocfs2_cluster_connection *lksb_conn;
+};
+
+/*
+ * The ocfs2_locking_protocol defines the handlers called on ocfs2's behalf.
+ */
+struct ocfs2_locking_protocol {
+	struct ocfs2_protocol_version lp_max_version;
+	void (*lp_lock_ast)(struct ocfs2_dlm_lksb *lksb);
+	void (*lp_blocking_ast)(struct ocfs2_dlm_lksb *lksb, int level);
+	void (*lp_unlock_ast)(struct ocfs2_dlm_lksb *lksb, int error);
 };
 
+
 /*
  * A cluster connection.  Mostly opaque to ocfs2, the connection holds
  * state for the underlying stack.  ocfs2 does use cc_version to determine
@@ -96,6 +100,7 @@ struct ocfs2_cluster_connection {
 	char cc_name[GROUP_NAME_MAX];
 	int cc_namelen;
 	struct ocfs2_protocol_version cc_version;
+	struct ocfs2_locking_protocol *cc_proto;
 	void (*cc_recovery_handler)(int node_num, void *recovery_data);
 	void *cc_recovery_data;
 	void *cc_lockspace;
@@ -155,27 +160,29 @@ struct ocfs2_stack_operations {
 	 *
 	 * ast and bast functions are not part of the call because the
 	 * stack will likely want to wrap ast and bast calls before passing
-	 * them to stack->sp_proto.
+	 * them to stack->sp_proto.  There is no astarg.  The lksb will
+	 * be passed back to the ast and bast functions.  The caller can
+	 * use this to find their object.
 	 */
 	int (*dlm_lock)(struct ocfs2_cluster_connection *conn,
 			int mode,
-			union ocfs2_dlm_lksb *lksb,
+			struct ocfs2_dlm_lksb *lksb,
 			u32 flags,
 			void *name,
-			unsigned int namelen,
-			void *astarg);
+			unsigned int namelen);
 
 	/*
 	 * Call the underlying dlm unlock function.  The ->dlm_unlock()
 	 * function should convert the flags as appropriate.
 	 *
 	 * The unlock ast is not passed, as the stack will want to wrap
-	 * it before calling stack->sp_proto->lp_unlock_ast().
+	 * it before calling stack->sp_proto->lp_unlock_ast().  There is
+	 * no astarg.  The lksb will be passed back to the unlock ast
+	 * function.  The caller can use this to find their object.
 	 */
 	int (*dlm_unlock)(struct ocfs2_cluster_connection *conn,
-			  union ocfs2_dlm_lksb *lksb,
-			  u32 flags,
-			  void *astarg);
+			  struct ocfs2_dlm_lksb *lksb,
+			  u32 flags);
 
 	/*
 	 * Return the status of the current lock status block.  The fs
@@ -183,17 +190,17 @@ struct ocfs2_stack_operations {
 	 * callback pulls out the stack-specific lksb, converts the status
 	 * to a proper errno, and returns it.
 	 */
-	int (*lock_status)(union ocfs2_dlm_lksb *lksb);
+	int (*lock_status)(struct ocfs2_dlm_lksb *lksb);
 
 	/*
 	 * Return non-zero if the LVB is valid.
 	 */
-	int (*lvb_valid)(union ocfs2_dlm_lksb *lksb);
+	int (*lvb_valid)(struct ocfs2_dlm_lksb *lksb);
 
 	/*
 	 * Pull the lvb pointer off of the stack-specific lksb.
 	 */
-	void *(*lock_lvb)(union ocfs2_dlm_lksb *lksb);
+	void *(*lock_lvb)(struct ocfs2_dlm_lksb *lksb);
 
 	/*
 	 * Cluster-aware posix locks
@@ -210,7 +217,7 @@ struct ocfs2_stack_operations {
 	 * This is an optoinal debugging hook.  If provided, the
 	 * stack can dump debugging information about this lock.
 	 */
-	void (*dump_lksb)(union ocfs2_dlm_lksb *lksb);
+	void (*dump_lksb)(struct ocfs2_dlm_lksb *lksb);
 };
 
 /*
@@ -226,7 +233,7 @@ struct ocfs2_stack_plugin {
 	/* These are managed by the stackglue code. */
 	struct list_head sp_list;
 	unsigned int sp_count;
-	struct ocfs2_locking_protocol *sp_proto;
+	struct ocfs2_protocol_version sp_max_proto;
 };
 
 
@@ -234,10 +241,22 @@ struct ocfs2_stack_plugin {
 int ocfs2_cluster_connect(const char *stack_name,
 			  const char *group,
 			  int grouplen,
+			  struct ocfs2_locking_protocol *lproto,
 			  void (*recovery_handler)(int node_num,
 						   void *recovery_data),
 			  void *recovery_data,
 			  struct ocfs2_cluster_connection **conn);
+/*
+ * Used by callers that don't store their stack name.  They must ensure
+ * all nodes have the same stack.
+ */
+int ocfs2_cluster_connect_agnostic(const char *group,
+				   int grouplen,
+				   struct ocfs2_locking_protocol *lproto,
+				   void (*recovery_handler)(int node_num,
+							    void *recovery_data),
+				   void *recovery_data,
+				   struct ocfs2_cluster_connection **conn);
 int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn,
 			     int hangup_pending);
 void ocfs2_cluster_hangup(const char *group, int grouplen);
@@ -246,26 +265,24 @@ int ocfs2_cluster_this_node(unsigned int *node);
 struct ocfs2_lock_res;
 int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn,
 		   int mode,
-		   union ocfs2_dlm_lksb *lksb,
+		   struct ocfs2_dlm_lksb *lksb,
 		   u32 flags,
 		   void *name,
-		   unsigned int namelen,
-		   struct ocfs2_lock_res *astarg);
+		   unsigned int namelen);
 int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn,
-		     union ocfs2_dlm_lksb *lksb,
-		     u32 flags,
-		     struct ocfs2_lock_res *astarg);
+		     struct ocfs2_dlm_lksb *lksb,
+		     u32 flags);
 
-int ocfs2_dlm_lock_status(union ocfs2_dlm_lksb *lksb);
-int ocfs2_dlm_lvb_valid(union ocfs2_dlm_lksb *lksb);
-void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb);
-void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb);
+int ocfs2_dlm_lock_status(struct ocfs2_dlm_lksb *lksb);
+int ocfs2_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb);
+void *ocfs2_dlm_lvb(struct ocfs2_dlm_lksb *lksb);
+void ocfs2_dlm_dump_lksb(struct ocfs2_dlm_lksb *lksb);
 
 int ocfs2_stack_supports_plocks(void);
 int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino,
 		struct file *file, int cmd, struct file_lock *fl);
 
-void ocfs2_stack_glue_set_locking_protocol(struct ocfs2_locking_protocol *proto);
+void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_proto);
 
 
 /* Used by stack plugins */
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index c30b644d9572..a8e6a95a353f 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -51,7 +51,16 @@
 #define ALLOC_NEW_GROUP			0x1
 #define ALLOC_GROUPS_FROM_GLOBAL	0x2
 
-#define OCFS2_MAX_INODES_TO_STEAL	1024
+#define OCFS2_MAX_TO_STEAL		1024
+
+struct ocfs2_suballoc_result {
+	u64		sr_bg_blkno;	/* The bg we allocated from.  Set
+					   to 0 when a block group is
+					   contiguous. */
+	u64		sr_blkno;	/* The first allocated block */
+	unsigned int	sr_bit_offset;	/* The bit in the bg */
+	unsigned int	sr_bits;	/* How many bits we claimed */
+};
 
 static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg);
 static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe);
@@ -60,6 +69,7 @@ static int ocfs2_block_group_fill(handle_t *handle,
 				  struct inode *alloc_inode,
 				  struct buffer_head *bg_bh,
 				  u64 group_blkno,
+				  unsigned int group_clusters,
 				  u16 my_chain,
 				  struct ocfs2_chain_list *cl);
 static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
@@ -73,20 +83,17 @@ static int ocfs2_cluster_group_search(struct inode *inode,
 				      struct buffer_head *group_bh,
 				      u32 bits_wanted, u32 min_bits,
 				      u64 max_block,
-				      u16 *bit_off, u16 *bits_found);
+				      struct ocfs2_suballoc_result *res);
 static int ocfs2_block_group_search(struct inode *inode,
 				    struct buffer_head *group_bh,
 				    u32 bits_wanted, u32 min_bits,
 				    u64 max_block,
-				    u16 *bit_off, u16 *bits_found);
-static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
-				     struct ocfs2_alloc_context *ac,
+				    struct ocfs2_suballoc_result *res);
+static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
 				     handle_t *handle,
 				     u32 bits_wanted,
 				     u32 min_bits,
-				     u16 *bit_off,
-				     unsigned int *num_bits,
-				     u64 *bg_blkno);
+				     struct ocfs2_suballoc_result *res);
 static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
 					 int nr);
 static inline int ocfs2_block_group_set_bits(handle_t *handle,
@@ -95,13 +102,6 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
 					     struct buffer_head *group_bh,
 					     unsigned int bit_off,
 					     unsigned int num_bits);
-static inline int ocfs2_block_group_clear_bits(handle_t *handle,
-					       struct inode *alloc_inode,
-					       struct ocfs2_group_desc *bg,
-					       struct buffer_head *group_bh,
-					       unsigned int bit_off,
-					       unsigned int num_bits);
-
 static int ocfs2_relink_block_group(handle_t *handle,
 				    struct inode *alloc_inode,
 				    struct buffer_head *fe_bh,
@@ -137,6 +137,7 @@ void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
 	}
 	brelse(ac->ac_bh);
 	ac->ac_bh = NULL;
+	ac->ac_resv = NULL;
 }
 
 void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
@@ -152,7 +153,7 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
 
 #define do_error(fmt, ...)						\
 	do{								\
-		if (clean_error)					\
+		if (resize)					\
 			mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__);	\
 		else							\
 			ocfs2_error(sb, fmt, ##__VA_ARGS__);		\
@@ -160,7 +161,7 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
 
 static int ocfs2_validate_gd_self(struct super_block *sb,
 				  struct buffer_head *bh,
-				  int clean_error)
+				  int resize)
 {
 	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
 
@@ -211,7 +212,7 @@ static int ocfs2_validate_gd_self(struct super_block *sb,
 static int ocfs2_validate_gd_parent(struct super_block *sb,
 				    struct ocfs2_dinode *di,
 				    struct buffer_head *bh,
-				    int clean_error)
+				    int resize)
 {
 	unsigned int max_bits;
 	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
@@ -233,8 +234,11 @@ static int ocfs2_validate_gd_parent(struct super_block *sb,
 		return -EINVAL;
 	}
 
-	if (le16_to_cpu(gd->bg_chain) >=
-	    le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) {
+	/* In resize, we may meet the case bg_chain == cl_next_free_rec. */
+	if ((le16_to_cpu(gd->bg_chain) >
+	     le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) ||
+	    ((le16_to_cpu(gd->bg_chain) ==
+	     le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) && !resize)) {
 		do_error("Group descriptor #%llu has bad chain %u",
 			 (unsigned long long)bh->b_blocknr,
 			 le16_to_cpu(gd->bg_chain));
@@ -329,14 +333,38 @@ out:
 	return rc;
 }
 
+static void ocfs2_bg_discontig_add_extent(struct ocfs2_super *osb,
+					  struct ocfs2_group_desc *bg,
+					  struct ocfs2_chain_list *cl,
+					  u64 p_blkno, u32 clusters)
+{
+	struct ocfs2_extent_list *el = &bg->bg_list;
+	struct ocfs2_extent_rec *rec;
+
+	BUG_ON(!ocfs2_supports_discontig_bg(osb));
+	if (!el->l_next_free_rec)
+		el->l_count = cpu_to_le16(ocfs2_extent_recs_per_gd(osb->sb));
+	rec = &el->l_recs[le16_to_cpu(el->l_next_free_rec)];
+	rec->e_blkno = cpu_to_le64(p_blkno);
+	rec->e_cpos = cpu_to_le32(le16_to_cpu(bg->bg_bits) /
+				  le16_to_cpu(cl->cl_bpc));
+	rec->e_leaf_clusters = cpu_to_le32(clusters);
+	le16_add_cpu(&bg->bg_bits, clusters * le16_to_cpu(cl->cl_bpc));
+	le16_add_cpu(&bg->bg_free_bits_count,
+		     clusters * le16_to_cpu(cl->cl_bpc));
+	le16_add_cpu(&el->l_next_free_rec, 1);
+}
+
 static int ocfs2_block_group_fill(handle_t *handle,
 				  struct inode *alloc_inode,
 				  struct buffer_head *bg_bh,
 				  u64 group_blkno,
+				  unsigned int group_clusters,
 				  u16 my_chain,
 				  struct ocfs2_chain_list *cl)
 {
 	int status = 0;
+	struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
 	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
 	struct super_block * sb = alloc_inode->i_sb;
 
@@ -363,19 +391,23 @@ static int ocfs2_block_group_fill(handle_t *handle,
 	memset(bg, 0, sb->s_blocksize);
 	strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE);
 	bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
-	bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb));
-	bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl));
+	bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb, 1,
+						osb->s_feature_incompat));
 	bg->bg_chain = cpu_to_le16(my_chain);
 	bg->bg_next_group = cl->cl_recs[my_chain].c_blkno;
 	bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno);
 	bg->bg_blkno = cpu_to_le64(group_blkno);
+	if (group_clusters == le16_to_cpu(cl->cl_cpg))
+		bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl));
+	else
+		ocfs2_bg_discontig_add_extent(osb, bg, cl, group_blkno,
+					      group_clusters);
+
 	/* set the 1st bit in the bitmap to account for the descriptor block */
 	ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap);
 	bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1);
 
-	status = ocfs2_journal_dirty(handle, bg_bh);
-	if (status < 0)
-		mlog_errno(status);
+	ocfs2_journal_dirty(handle, bg_bh);
 
 	/* There is no need to zero out or otherwise initialize the
 	 * other blocks in a group - All valid FS metadata in a block
@@ -401,6 +433,238 @@ static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
 	return best;
 }
 
+static struct buffer_head *
+ocfs2_block_group_alloc_contig(struct ocfs2_super *osb, handle_t *handle,
+			       struct inode *alloc_inode,
+			       struct ocfs2_alloc_context *ac,
+			       struct ocfs2_chain_list *cl)
+{
+	int status;
+	u32 bit_off, num_bits;
+	u64 bg_blkno;
+	struct buffer_head *bg_bh;
+	unsigned int alloc_rec = ocfs2_find_smallest_chain(cl);
+
+	status = ocfs2_claim_clusters(handle, ac,
+				      le16_to_cpu(cl->cl_cpg), &bit_off,
+				      &num_bits);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	/* setup the group */
+	bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
+	mlog(0, "new descriptor, record %u, at block %llu\n",
+	     alloc_rec, (unsigned long long)bg_blkno);
+
+	bg_bh = sb_getblk(osb->sb, bg_blkno);
+	if (!bg_bh) {
+		status = -EIO;
+		mlog_errno(status);
+		goto bail;
+	}
+	ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);
+
+	status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh,
+					bg_blkno, num_bits, alloc_rec, cl);
+	if (status < 0) {
+		brelse(bg_bh);
+		mlog_errno(status);
+	}
+
+bail:
+	return status ? ERR_PTR(status) : bg_bh;
+}
+
+static int ocfs2_block_group_claim_bits(struct ocfs2_super *osb,
+					handle_t *handle,
+					struct ocfs2_alloc_context *ac,
+					unsigned int min_bits,
+					u32 *bit_off, u32 *num_bits)
+{
+	int status = 0;
+
+	while (min_bits) {
+		status = ocfs2_claim_clusters(handle, ac, min_bits,
+					      bit_off, num_bits);
+		if (status != -ENOSPC)
+			break;
+
+		min_bits >>= 1;
+	}
+
+	return status;
+}
+
+static int ocfs2_block_group_grow_discontig(handle_t *handle,
+					    struct inode *alloc_inode,
+					    struct buffer_head *bg_bh,
+					    struct ocfs2_alloc_context *ac,
+					    struct ocfs2_chain_list *cl,
+					    unsigned int min_bits)
+{
+	int status;
+	struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
+	struct ocfs2_group_desc *bg =
+		(struct ocfs2_group_desc *)bg_bh->b_data;
+	unsigned int needed = le16_to_cpu(cl->cl_cpg) -
+			 le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc);
+	u32 p_cpos, clusters;
+	u64 p_blkno;
+	struct ocfs2_extent_list *el = &bg->bg_list;
+
+	status = ocfs2_journal_access_gd(handle,
+					 INODE_CACHE(alloc_inode),
+					 bg_bh,
+					 OCFS2_JOURNAL_ACCESS_CREATE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	while ((needed > 0) && (le16_to_cpu(el->l_next_free_rec) <
+				le16_to_cpu(el->l_count))) {
+		if (min_bits > needed)
+			min_bits = needed;
+		status = ocfs2_block_group_claim_bits(osb, handle, ac,
+						      min_bits, &p_cpos,
+						      &clusters);
+		if (status < 0) {
+			if (status != -ENOSPC)
+				mlog_errno(status);
+			goto bail;
+		}
+		p_blkno = ocfs2_clusters_to_blocks(osb->sb, p_cpos);
+		ocfs2_bg_discontig_add_extent(osb, bg, cl, p_blkno,
+					      clusters);
+
+		min_bits = clusters;
+		needed = le16_to_cpu(cl->cl_cpg) -
+			 le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc);
+	}
+
+	if (needed > 0) {
+		/*
+		 * We have used up all the extent rec but can't fill up
+		 * the cpg. So bail out.
+		 */
+		status = -ENOSPC;
+		goto bail;
+	}
+
+	ocfs2_journal_dirty(handle, bg_bh);
+
+bail:
+	return status;
+}
+
+static void ocfs2_bg_alloc_cleanup(handle_t *handle,
+				   struct ocfs2_alloc_context *cluster_ac,
+				   struct inode *alloc_inode,
+				   struct buffer_head *bg_bh)
+{
+	int i, ret;
+	struct ocfs2_group_desc *bg;
+	struct ocfs2_extent_list *el;
+	struct ocfs2_extent_rec *rec;
+
+	if (!bg_bh)
+		return;
+
+	bg = (struct ocfs2_group_desc *)bg_bh->b_data;
+	el = &bg->bg_list;
+	for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
+		rec = &el->l_recs[i];
+		ret = ocfs2_free_clusters(handle, cluster_ac->ac_inode,
+					  cluster_ac->ac_bh,
+					  le64_to_cpu(rec->e_blkno),
+					  le32_to_cpu(rec->e_leaf_clusters));
+		if (ret)
+			mlog_errno(ret);
+		/* Try all the clusters to free */
+	}
+
+	ocfs2_remove_from_cache(INODE_CACHE(alloc_inode), bg_bh);
+	brelse(bg_bh);
+}
+
+static struct buffer_head *
+ocfs2_block_group_alloc_discontig(handle_t *handle,
+				  struct inode *alloc_inode,
+				  struct ocfs2_alloc_context *ac,
+				  struct ocfs2_chain_list *cl)
+{
+	int status;
+	u32 bit_off, num_bits;
+	u64 bg_blkno;
+	unsigned int min_bits = le16_to_cpu(cl->cl_cpg) >> 1;
+	struct buffer_head *bg_bh = NULL;
+	unsigned int alloc_rec = ocfs2_find_smallest_chain(cl);
+	struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
+
+	if (!ocfs2_supports_discontig_bg(osb)) {
+		status = -ENOSPC;
+		goto bail;
+	}
+
+	status = ocfs2_extend_trans(handle,
+				    ocfs2_calc_bg_discontig_credits(osb->sb));
+	if (status) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/*
+	 * We're going to be grabbing from multiple cluster groups.
+	 * We don't have enough credits to relink them all, and the
+	 * cluster groups will be staying in cache for the duration of
+	 * this operation.
+	 */
+	ac->ac_allow_chain_relink = 0;
+
+	/* Claim the first region */
+	status = ocfs2_block_group_claim_bits(osb, handle, ac, min_bits,
+					      &bit_off, &num_bits);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto bail;
+	}
+	min_bits = num_bits;
+
+	/* setup the group */
+	bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
+	mlog(0, "new descriptor, record %u, at block %llu\n",
+	     alloc_rec, (unsigned long long)bg_blkno);
+
+	bg_bh = sb_getblk(osb->sb, bg_blkno);
+	if (!bg_bh) {
+		status = -EIO;
+		mlog_errno(status);
+		goto bail;
+	}
+	ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);
+
+	status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh,
+					bg_blkno, num_bits, alloc_rec, cl);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_block_group_grow_discontig(handle, alloc_inode,
+						  bg_bh, ac, cl, min_bits);
+	if (status)
+		mlog_errno(status);
+
+bail:
+	if (status)
+		ocfs2_bg_alloc_cleanup(handle, ac, alloc_inode, bg_bh);
+	return status ? ERR_PTR(status) : bg_bh;
+}
+
 /*
  * We expect the block group allocator to already be locked.
  */
@@ -416,9 +680,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
 	struct ocfs2_chain_list *cl;
 	struct ocfs2_alloc_context *ac = NULL;
 	handle_t *handle = NULL;
-	u32 bit_off, num_bits;
 	u16 alloc_rec;
-	u64 bg_blkno;
 	struct buffer_head *bg_bh = NULL;
 	struct ocfs2_group_desc *bg;
 
@@ -451,44 +713,20 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
 		     (unsigned long long)*last_alloc_group);
 		ac->ac_last_group = *last_alloc_group;
 	}
-	status = ocfs2_claim_clusters(osb,
-				      handle,
-				      ac,
-				      le16_to_cpu(cl->cl_cpg),
-				      &bit_off,
-				      &num_bits);
-	if (status < 0) {
+
+	bg_bh = ocfs2_block_group_alloc_contig(osb, handle, alloc_inode,
+					       ac, cl);
+	if (IS_ERR(bg_bh) && (PTR_ERR(bg_bh) == -ENOSPC))
+		bg_bh = ocfs2_block_group_alloc_discontig(handle,
+							  alloc_inode,
+							  ac, cl);
+	if (IS_ERR(bg_bh)) {
+		status = PTR_ERR(bg_bh);
+		bg_bh = NULL;
 		if (status != -ENOSPC)
 			mlog_errno(status);
 		goto bail;
 	}
-
-	alloc_rec = ocfs2_find_smallest_chain(cl);
-
-	/* setup the group */
-	bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
-	mlog(0, "new descriptor, record %u, at block %llu\n",
-	     alloc_rec, (unsigned long long)bg_blkno);
-
-	bg_bh = sb_getblk(osb->sb, bg_blkno);
-	if (!bg_bh) {
-		status = -EIO;
-		mlog_errno(status);
-		goto bail;
-	}
-	ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);
-
-	status = ocfs2_block_group_fill(handle,
-					alloc_inode,
-					bg_bh,
-					bg_blkno,
-					alloc_rec,
-					cl);
-	if (status < 0) {
-		mlog_errno(status);
-		goto bail;
-	}
-
 	bg = (struct ocfs2_group_desc *) bg_bh->b_data;
 
 	status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
@@ -498,10 +736,12 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
 		goto bail;
 	}
 
+	alloc_rec = le16_to_cpu(bg->bg_chain);
 	le32_add_cpu(&cl->cl_recs[alloc_rec].c_free,
 		     le16_to_cpu(bg->bg_free_bits_count));
-	le32_add_cpu(&cl->cl_recs[alloc_rec].c_total, le16_to_cpu(bg->bg_bits));
-	cl->cl_recs[alloc_rec].c_blkno  = cpu_to_le64(bg_blkno);
+	le32_add_cpu(&cl->cl_recs[alloc_rec].c_total,
+		     le16_to_cpu(bg->bg_bits));
+	cl->cl_recs[alloc_rec].c_blkno = bg->bg_blkno;
 	if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count))
 		le16_add_cpu(&cl->cl_next_free_rec, 1);
 
@@ -510,11 +750,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
 	le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits));
 	le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg));
 
-	status = ocfs2_journal_dirty(handle, bh);
-	if (status < 0) {
-		mlog_errno(status);
-		goto bail;
-	}
+	ocfs2_journal_dirty(handle, bh);
 
 	spin_lock(&OCFS2_I(alloc_inode)->ip_lock);
 	OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
@@ -637,12 +873,113 @@ bail:
 	return status;
 }
 
+static void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb)
+{
+	spin_lock(&osb->osb_lock);
+	osb->s_inode_steal_slot = OCFS2_INVALID_SLOT;
+	spin_unlock(&osb->osb_lock);
+	atomic_set(&osb->s_num_inodes_stolen, 0);
+}
+
+static void ocfs2_init_meta_steal_slot(struct ocfs2_super *osb)
+{
+	spin_lock(&osb->osb_lock);
+	osb->s_meta_steal_slot = OCFS2_INVALID_SLOT;
+	spin_unlock(&osb->osb_lock);
+	atomic_set(&osb->s_num_meta_stolen, 0);
+}
+
+void ocfs2_init_steal_slots(struct ocfs2_super *osb)
+{
+	ocfs2_init_inode_steal_slot(osb);
+	ocfs2_init_meta_steal_slot(osb);
+}
+
+static void __ocfs2_set_steal_slot(struct ocfs2_super *osb, int slot, int type)
+{
+	spin_lock(&osb->osb_lock);
+	if (type == INODE_ALLOC_SYSTEM_INODE)
+		osb->s_inode_steal_slot = slot;
+	else if (type == EXTENT_ALLOC_SYSTEM_INODE)
+		osb->s_meta_steal_slot = slot;
+	spin_unlock(&osb->osb_lock);
+}
+
+static int __ocfs2_get_steal_slot(struct ocfs2_super *osb, int type)
+{
+	int slot = OCFS2_INVALID_SLOT;
+
+	spin_lock(&osb->osb_lock);
+	if (type == INODE_ALLOC_SYSTEM_INODE)
+		slot = osb->s_inode_steal_slot;
+	else if (type == EXTENT_ALLOC_SYSTEM_INODE)
+		slot = osb->s_meta_steal_slot;
+	spin_unlock(&osb->osb_lock);
+
+	return slot;
+}
+
+static int ocfs2_get_inode_steal_slot(struct ocfs2_super *osb)
+{
+	return __ocfs2_get_steal_slot(osb, INODE_ALLOC_SYSTEM_INODE);
+}
+
+static int ocfs2_get_meta_steal_slot(struct ocfs2_super *osb)
+{
+	return __ocfs2_get_steal_slot(osb, EXTENT_ALLOC_SYSTEM_INODE);
+}
+
+static int ocfs2_steal_resource(struct ocfs2_super *osb,
+				struct ocfs2_alloc_context *ac,
+				int type)
+{
+	int i, status = -ENOSPC;
+	int slot = __ocfs2_get_steal_slot(osb, type);
+
+	/* Start to steal resource from the first slot after ours. */
+	if (slot == OCFS2_INVALID_SLOT)
+		slot = osb->slot_num + 1;
+
+	for (i = 0; i < osb->max_slots; i++, slot++) {
+		if (slot == osb->max_slots)
+			slot = 0;
+
+		if (slot == osb->slot_num)
+			continue;
+
+		status = ocfs2_reserve_suballoc_bits(osb, ac,
+						     type,
+						     (u32)slot, NULL,
+						     NOT_ALLOC_NEW_GROUP);
+		if (status >= 0) {
+			__ocfs2_set_steal_slot(osb, slot, type);
+			break;
+		}
+
+		ocfs2_free_ac_resource(ac);
+	}
+
+	return status;
+}
+
+static int ocfs2_steal_inode(struct ocfs2_super *osb,
+			     struct ocfs2_alloc_context *ac)
+{
+	return ocfs2_steal_resource(osb, ac, INODE_ALLOC_SYSTEM_INODE);
+}
+
+static int ocfs2_steal_meta(struct ocfs2_super *osb,
+			    struct ocfs2_alloc_context *ac)
+{
+	return ocfs2_steal_resource(osb, ac, EXTENT_ALLOC_SYSTEM_INODE);
+}
+
 int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
 				      int blocks,
 				      struct ocfs2_alloc_context **ac)
 {
 	int status;
-	u32 slot;
+	int slot = ocfs2_get_meta_steal_slot(osb);
 
 	*ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
 	if (!(*ac)) {
@@ -653,12 +990,34 @@ int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
 
 	(*ac)->ac_bits_wanted = blocks;
 	(*ac)->ac_which = OCFS2_AC_USE_META;
-	slot = osb->slot_num;
 	(*ac)->ac_group_search = ocfs2_block_group_search;
 
+	if (slot != OCFS2_INVALID_SLOT &&
+		atomic_read(&osb->s_num_meta_stolen) < OCFS2_MAX_TO_STEAL)
+		goto extent_steal;
+
+	atomic_set(&osb->s_num_meta_stolen, 0);
 	status = ocfs2_reserve_suballoc_bits(osb, (*ac),
 					     EXTENT_ALLOC_SYSTEM_INODE,
-					     slot, NULL, ALLOC_NEW_GROUP);
+					     (u32)osb->slot_num, NULL,
+					     ALLOC_GROUPS_FROM_GLOBAL|ALLOC_NEW_GROUP);
+
+
+	if (status >= 0) {
+		status = 0;
+		if (slot != OCFS2_INVALID_SLOT)
+			ocfs2_init_meta_steal_slot(osb);
+		goto bail;
+	} else if (status < 0 && status != -ENOSPC) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	ocfs2_free_ac_resource(*ac);
+
+extent_steal:
+	status = ocfs2_steal_meta(osb, *ac);
+	atomic_inc(&osb->s_num_meta_stolen);
 	if (status < 0) {
 		if (status != -ENOSPC)
 			mlog_errno(status);
@@ -685,43 +1044,11 @@ int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
 					ac);
 }
 
-static int ocfs2_steal_inode_from_other_nodes(struct ocfs2_super *osb,
-					      struct ocfs2_alloc_context *ac)
-{
-	int i, status = -ENOSPC;
-	s16 slot = ocfs2_get_inode_steal_slot(osb);
-
-	/* Start to steal inodes from the first slot after ours. */
-	if (slot == OCFS2_INVALID_SLOT)
-		slot = osb->slot_num + 1;
-
-	for (i = 0; i < osb->max_slots; i++, slot++) {
-		if (slot == osb->max_slots)
-			slot = 0;
-
-		if (slot == osb->slot_num)
-			continue;
-
-		status = ocfs2_reserve_suballoc_bits(osb, ac,
-						     INODE_ALLOC_SYSTEM_INODE,
-						     slot, NULL,
-						     NOT_ALLOC_NEW_GROUP);
-		if (status >= 0) {
-			ocfs2_set_inode_steal_slot(osb, slot);
-			break;
-		}
-
-		ocfs2_free_ac_resource(ac);
-	}
-
-	return status;
-}
-
 int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
 			    struct ocfs2_alloc_context **ac)
 {
 	int status;
-	s16 slot = ocfs2_get_inode_steal_slot(osb);
+	int slot = ocfs2_get_inode_steal_slot(osb);
 	u64 alloc_group;
 
 	*ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
@@ -754,14 +1081,14 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
 	 * need to check our slots to see whether there is some space for us.
 	 */
 	if (slot != OCFS2_INVALID_SLOT &&
-	    atomic_read(&osb->s_num_inodes_stolen) < OCFS2_MAX_INODES_TO_STEAL)
+	    atomic_read(&osb->s_num_inodes_stolen) < OCFS2_MAX_TO_STEAL)
 		goto inode_steal;
 
 	atomic_set(&osb->s_num_inodes_stolen, 0);
 	alloc_group = osb->osb_inode_alloc_group;
 	status = ocfs2_reserve_suballoc_bits(osb, *ac,
 					     INODE_ALLOC_SYSTEM_INODE,
-					     osb->slot_num,
+					     (u32)osb->slot_num,
 					     &alloc_group,
 					     ALLOC_NEW_GROUP |
 					     ALLOC_GROUPS_FROM_GLOBAL);
@@ -789,7 +1116,7 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
 	ocfs2_free_ac_resource(*ac);
 
 inode_steal:
-	status = ocfs2_steal_inode_from_other_nodes(osb, *ac);
+	status = ocfs2_steal_inode(osb, *ac);
 	atomic_inc(&osb->s_num_inodes_stolen);
 	if (status < 0) {
 		if (status != -ENOSPC)
@@ -859,11 +1186,7 @@ static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
 		status = ocfs2_reserve_local_alloc_bits(osb,
 							bits_wanted,
 							*ac);
-		if (status == -EFBIG) {
-			/* The local alloc window is outside ac_max_block.
-			 * use the main bitmap. */
-			status = -ENOSPC;
-		} else if ((status < 0) && (status != -ENOSPC)) {
+		if ((status < 0) && (status != -ENOSPC)) {
 			mlog_errno(status);
 			goto bail;
 		}
@@ -946,8 +1269,7 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
 					     struct buffer_head *bg_bh,
 					     unsigned int bits_wanted,
 					     unsigned int total_bits,
-					     u16 *bit_off,
-					     u16 *bits_found)
+					     struct ocfs2_suballoc_result *res)
 {
 	void *bitmap;
 	u16 best_offset, best_size;
@@ -991,14 +1313,9 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
 		}
 	}
 
-	/* XXX: I think the first clause is equivalent to the second
-	 * 	- jlbec */
-	if (found == bits_wanted) {
-		*bit_off = start - found;
-		*bits_found = found;
-	} else if (best_size) {
-		*bit_off = best_offset;
-		*bits_found = best_size;
+	if (best_size) {
+		res->sr_bit_offset = best_offset;
+		res->sr_bits = best_size;
 	} else {
 		status = -ENOSPC;
 		/* No error log here -- see the comment above
@@ -1042,16 +1359,10 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
 	}
 
 	le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
-
 	while(num_bits--)
 		ocfs2_set_bit(bit_off++, bitmap);
 
-	status = ocfs2_journal_dirty(handle,
-				     group_bh);
-	if (status < 0) {
-		mlog_errno(status);
-		goto bail;
-	}
+	ocfs2_journal_dirty(handle, group_bh);
 
 bail:
 	mlog_exit(status);
@@ -1115,12 +1426,7 @@ static int ocfs2_relink_block_group(handle_t *handle,
 	}
 
 	prev_bg->bg_next_group = bg->bg_next_group;
-
-	status = ocfs2_journal_dirty(handle, prev_bg_bh);
-	if (status < 0) {
-		mlog_errno(status);
-		goto out_rollback;
-	}
+	ocfs2_journal_dirty(handle, prev_bg_bh);
 
 	status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
 					 bg_bh, OCFS2_JOURNAL_ACCESS_WRITE);
@@ -1130,12 +1436,7 @@ static int ocfs2_relink_block_group(handle_t *handle,
 	}
 
 	bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno;
-
-	status = ocfs2_journal_dirty(handle, bg_bh);
-	if (status < 0) {
-		mlog_errno(status);
-		goto out_rollback;
-	}
+	ocfs2_journal_dirty(handle, bg_bh);
 
 	status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
 					 fe_bh, OCFS2_JOURNAL_ACCESS_WRITE);
@@ -1145,14 +1446,8 @@ static int ocfs2_relink_block_group(handle_t *handle,
 	}
 
 	fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno;
+	ocfs2_journal_dirty(handle, fe_bh);
 
-	status = ocfs2_journal_dirty(handle, fe_bh);
-	if (status < 0) {
-		mlog_errno(status);
-		goto out_rollback;
-	}
-
-	status = 0;
 out_rollback:
 	if (status < 0) {
 		fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr);
@@ -1176,14 +1471,13 @@ static int ocfs2_cluster_group_search(struct inode *inode,
 				      struct buffer_head *group_bh,
 				      u32 bits_wanted, u32 min_bits,
 				      u64 max_block,
-				      u16 *bit_off, u16 *bits_found)
+				      struct ocfs2_suballoc_result *res)
 {
 	int search = -ENOSPC;
 	int ret;
 	u64 blkoff;
 	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-	u16 tmp_off, tmp_found;
 	unsigned int max_bits, gd_cluster_off;
 
 	BUG_ON(!ocfs2_is_cluster_bitmap(inode));
@@ -1210,15 +1504,15 @@ static int ocfs2_cluster_group_search(struct inode *inode,
 
 		ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
 							group_bh, bits_wanted,
-							max_bits,
-							&tmp_off, &tmp_found);
+							max_bits, res);
 		if (ret)
 			return ret;
 
 		if (max_block) {
 			blkoff = ocfs2_clusters_to_blocks(inode->i_sb,
 							  gd_cluster_off +
-							  tmp_off + tmp_found);
+							  res->sr_bit_offset +
+							  res->sr_bits);
 			mlog(0, "Checking %llu against %llu\n",
 			     (unsigned long long)blkoff,
 			     (unsigned long long)max_block);
@@ -1230,16 +1524,14 @@ static int ocfs2_cluster_group_search(struct inode *inode,
 		 * return success, but we still want to return
 		 * -ENOSPC unless it found the minimum number
 		 * of bits. */
-		if (min_bits <= tmp_found) {
-			*bit_off = tmp_off;
-			*bits_found = tmp_found;
+		if (min_bits <= res->sr_bits)
 			search = 0; /* success */
-		} else if (tmp_found) {
+		else if (res->sr_bits) {
 			/*
 			 * Don't show bits which we'll be returning
 			 * for allocation to the local alloc bitmap.
 			 */
-			ocfs2_local_alloc_seen_free_bits(osb, tmp_found);
+			ocfs2_local_alloc_seen_free_bits(osb, res->sr_bits);
 		}
 	}
 
@@ -1250,7 +1542,7 @@ static int ocfs2_block_group_search(struct inode *inode,
 				    struct buffer_head *group_bh,
 				    u32 bits_wanted, u32 min_bits,
 				    u64 max_block,
-				    u16 *bit_off, u16 *bits_found)
+				    struct ocfs2_suballoc_result *res)
 {
 	int ret = -ENOSPC;
 	u64 blkoff;
@@ -1263,10 +1555,10 @@ static int ocfs2_block_group_search(struct inode *inode,
 		ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
 							group_bh, bits_wanted,
 							le16_to_cpu(bg->bg_bits),
-							bit_off, bits_found);
+							res);
 		if (!ret && max_block) {
-			blkoff = le64_to_cpu(bg->bg_blkno) + *bit_off +
-				*bits_found;
+			blkoff = le64_to_cpu(bg->bg_blkno) +
+				res->sr_bit_offset + res->sr_bits;
 			mlog(0, "Checking %llu against %llu\n",
 			     (unsigned long long)blkoff,
 			     (unsigned long long)max_block);
@@ -1299,33 +1591,76 @@ static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
 	tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
 	di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used);
 	le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits);
-
-	ret = ocfs2_journal_dirty(handle, di_bh);
-	if (ret < 0)
-		mlog_errno(ret);
+	ocfs2_journal_dirty(handle, di_bh);
 
 out:
 	return ret;
 }
 
+static int ocfs2_bg_discontig_fix_by_rec(struct ocfs2_suballoc_result *res,
+					 struct ocfs2_extent_rec *rec,
+					 struct ocfs2_chain_list *cl)
+{
+	unsigned int bpc = le16_to_cpu(cl->cl_bpc);
+	unsigned int bitoff = le32_to_cpu(rec->e_cpos) * bpc;
+	unsigned int bitcount = le32_to_cpu(rec->e_leaf_clusters) * bpc;
+
+	if (res->sr_bit_offset < bitoff)
+		return 0;
+	if (res->sr_bit_offset >= (bitoff + bitcount))
+		return 0;
+	res->sr_blkno = le64_to_cpu(rec->e_blkno) +
+		(res->sr_bit_offset - bitoff);
+	if ((res->sr_bit_offset + res->sr_bits) > (bitoff + bitcount))
+		res->sr_bits = (bitoff + bitcount) - res->sr_bit_offset;
+	return 1;
+}
+
+static void ocfs2_bg_discontig_fix_result(struct ocfs2_alloc_context *ac,
+					  struct ocfs2_group_desc *bg,
+					  struct ocfs2_suballoc_result *res)
+{
+	int i;
+	u64 bg_blkno = res->sr_bg_blkno;  /* Save off */
+	struct ocfs2_extent_rec *rec;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
+	struct ocfs2_chain_list *cl = &di->id2.i_chain;
+
+	if (ocfs2_is_cluster_bitmap(ac->ac_inode)) {
+		res->sr_blkno = 0;
+		return;
+	}
+
+	res->sr_blkno = res->sr_bg_blkno + res->sr_bit_offset;
+	res->sr_bg_blkno = 0;  /* Clear it for contig block groups */
+	if (!ocfs2_supports_discontig_bg(OCFS2_SB(ac->ac_inode->i_sb)) ||
+	    !bg->bg_list.l_next_free_rec)
+		return;
+
+	for (i = 0; i < le16_to_cpu(bg->bg_list.l_next_free_rec); i++) {
+		rec = &bg->bg_list.l_recs[i];
+		if (ocfs2_bg_discontig_fix_by_rec(res, rec, cl)) {
+			res->sr_bg_blkno = bg_blkno;  /* Restore */
+			break;
+		}
+	}
+}
+
 static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
 				  handle_t *handle,
 				  u32 bits_wanted,
 				  u32 min_bits,
-				  u16 *bit_off,
-				  unsigned int *num_bits,
-				  u64 gd_blkno,
+				  struct ocfs2_suballoc_result *res,
 				  u16 *bits_left)
 {
 	int ret;
-	u16 found;
 	struct buffer_head *group_bh = NULL;
 	struct ocfs2_group_desc *gd;
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
 	struct inode *alloc_inode = ac->ac_inode;
 
-	ret = ocfs2_read_group_descriptor(alloc_inode, di, gd_blkno,
-					  &group_bh);
+	ret = ocfs2_read_group_descriptor(alloc_inode, di,
+					  res->sr_bg_blkno, &group_bh);
 	if (ret < 0) {
 		mlog_errno(ret);
 		return ret;
@@ -1333,17 +1668,18 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
 
 	gd = (struct ocfs2_group_desc *) group_bh->b_data;
 	ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits,
-				  ac->ac_max_block, bit_off, &found);
+				  ac->ac_max_block, res);
 	if (ret < 0) {
 		if (ret != -ENOSPC)
 			mlog_errno(ret);
 		goto out;
 	}
 
-	*num_bits = found;
+	if (!ret)
+		ocfs2_bg_discontig_fix_result(ac, gd, res);
 
 	ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh,
-					       *num_bits,
+					       res->sr_bits,
 					       le16_to_cpu(gd->bg_chain));
 	if (ret < 0) {
 		mlog_errno(ret);
@@ -1351,7 +1687,7 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
 	}
 
 	ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh,
-					 *bit_off, *num_bits);
+					 res->sr_bit_offset, res->sr_bits);
 	if (ret < 0)
 		mlog_errno(ret);
 
@@ -1367,13 +1703,11 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
 			      handle_t *handle,
 			      u32 bits_wanted,
 			      u32 min_bits,
-			      u16 *bit_off,
-			      unsigned int *num_bits,
-			      u64 *bg_blkno,
+			      struct ocfs2_suballoc_result *res,
 			      u16 *bits_left)
 {
 	int status;
-	u16 chain, tmp_bits;
+	u16 chain;
 	u32 tmp_used;
 	u64 next_group;
 	struct inode *alloc_inode = ac->ac_inode;
@@ -1402,8 +1736,8 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
 	 * the 1st group with any empty bits. */
 	while ((status = ac->ac_group_search(alloc_inode, group_bh,
 					     bits_wanted, min_bits,
-					     ac->ac_max_block, bit_off,
-					     &tmp_bits)) == -ENOSPC) {
+					     ac->ac_max_block,
+					     res)) == -ENOSPC) {
 		if (!bg->bg_next_group)
 			break;
 
@@ -1428,11 +1762,14 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
 	}
 
 	mlog(0, "alloc succeeds: we give %u bits from block group %llu\n",
-	     tmp_bits, (unsigned long long)le64_to_cpu(bg->bg_blkno));
+	     res->sr_bits, (unsigned long long)le64_to_cpu(bg->bg_blkno));
 
-	*num_bits = tmp_bits;
+	res->sr_bg_blkno = le64_to_cpu(bg->bg_blkno);
+
+	BUG_ON(res->sr_bits == 0);
+	if (!status)
+		ocfs2_bg_discontig_fix_result(ac, bg, res);
 
-	BUG_ON(*num_bits == 0);
 
 	/*
 	 * Keep track of previous block descriptor read. When
@@ -1449,7 +1786,7 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
 	 */
 	if (ac->ac_allow_chain_relink &&
 	    (prev_group_bh) &&
-	    (ocfs2_block_group_reasonably_empty(bg, *num_bits))) {
+	    (ocfs2_block_group_reasonably_empty(bg, res->sr_bits))) {
 		status = ocfs2_relink_block_group(handle, alloc_inode,
 						  ac->ac_bh, group_bh,
 						  prev_group_bh, chain);
@@ -1471,31 +1808,24 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
 	}
 
 	tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
-	fe->id1.bitmap1.i_used = cpu_to_le32(*num_bits + tmp_used);
-	le32_add_cpu(&cl->cl_recs[chain].c_free, -(*num_bits));
-
-	status = ocfs2_journal_dirty(handle,
-				     ac->ac_bh);
-	if (status < 0) {
-		mlog_errno(status);
-		goto bail;
-	}
+	fe->id1.bitmap1.i_used = cpu_to_le32(res->sr_bits + tmp_used);
+	le32_add_cpu(&cl->cl_recs[chain].c_free, -res->sr_bits);
+	ocfs2_journal_dirty(handle, ac->ac_bh);
 
 	status = ocfs2_block_group_set_bits(handle,
 					    alloc_inode,
 					    bg,
 					    group_bh,
-					    *bit_off,
-					    *num_bits);
+					    res->sr_bit_offset,
+					    res->sr_bits);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
 	}
 
-	mlog(0, "Allocated %u bits from suballocator %llu\n", *num_bits,
+	mlog(0, "Allocated %u bits from suballocator %llu\n", res->sr_bits,
 	     (unsigned long long)le64_to_cpu(fe->i_blkno));
 
-	*bg_blkno = le64_to_cpu(bg->bg_blkno);
 	*bits_left = le16_to_cpu(bg->bg_free_bits_count);
 bail:
 	brelse(group_bh);
@@ -1506,19 +1836,15 @@ bail:
 }
 
 /* will give out up to bits_wanted contiguous bits. */
-static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
-				     struct ocfs2_alloc_context *ac,
+static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
 				     handle_t *handle,
 				     u32 bits_wanted,
 				     u32 min_bits,
-				     u16 *bit_off,
-				     unsigned int *num_bits,
-				     u64 *bg_blkno)
+				     struct ocfs2_suballoc_result *res)
 {
 	int status;
 	u16 victim, i;
 	u16 bits_left = 0;
-	u64 hint_blkno = ac->ac_last_group;
 	struct ocfs2_chain_list *cl;
 	struct ocfs2_dinode *fe;
 
@@ -1536,7 +1862,8 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
 
 	if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
 	    le32_to_cpu(fe->id1.bitmap1.i_total)) {
-		ocfs2_error(osb->sb, "Chain allocator dinode %llu has %u used "
+		ocfs2_error(ac->ac_inode->i_sb,
+			    "Chain allocator dinode %llu has %u used "
 			    "bits but only %u total.",
 			    (unsigned long long)le64_to_cpu(fe->i_blkno),
 			    le32_to_cpu(fe->id1.bitmap1.i_used),
@@ -1545,22 +1872,16 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
 		goto bail;
 	}
 
-	if (hint_blkno) {
+	res->sr_bg_blkno = ac->ac_last_group;
+	if (res->sr_bg_blkno) {
 		/* Attempt to short-circuit the usual search mechanism
 		 * by jumping straight to the most recently used
 		 * allocation group. This helps us mantain some
 		 * contiguousness across allocations. */
 		status = ocfs2_search_one_group(ac, handle, bits_wanted,
-						min_bits, bit_off, num_bits,
-						hint_blkno, &bits_left);
-		if (!status) {
-			/* Be careful to update *bg_blkno here as the
-			 * caller is expecting it to be filled in, and
-			 * ocfs2_search_one_group() won't do that for
-			 * us. */
-			*bg_blkno = hint_blkno;
+						min_bits, res, &bits_left);
+		if (!status)
 			goto set_hint;
-		}
 		if (status < 0 && status != -ENOSPC) {
 			mlog_errno(status);
 			goto bail;
@@ -1573,8 +1894,8 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
 	ac->ac_chain = victim;
 	ac->ac_allow_chain_relink = 1;
 
-	status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, bit_off,
-				    num_bits, bg_blkno, &bits_left);
+	status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
+				    res, &bits_left);
 	if (!status)
 		goto set_hint;
 	if (status < 0 && status != -ENOSPC) {
@@ -1598,8 +1919,7 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
 
 		ac->ac_chain = i;
 		status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
-					    bit_off, num_bits, bg_blkno,
-					    &bits_left);
+					    res, &bits_left);
 		if (!status)
 			break;
 		if (status < 0 && status != -ENOSPC) {
@@ -1616,7 +1936,7 @@ set_hint:
 		if (bits_left < min_bits)
 			ac->ac_last_group = 0;
 		else
-			ac->ac_last_group = *bg_blkno;
+			ac->ac_last_group = res->sr_bg_blkno;
 	}
 
 bail:
@@ -1624,37 +1944,37 @@ bail:
 	return status;
 }
 
-int ocfs2_claim_metadata(struct ocfs2_super *osb,
-			 handle_t *handle,
+int ocfs2_claim_metadata(handle_t *handle,
 			 struct ocfs2_alloc_context *ac,
 			 u32 bits_wanted,
+			 u64 *suballoc_loc,
 			 u16 *suballoc_bit_start,
 			 unsigned int *num_bits,
 			 u64 *blkno_start)
 {
 	int status;
-	u64 bg_blkno;
+	struct ocfs2_suballoc_result res = { .sr_blkno = 0, };
 
 	BUG_ON(!ac);
 	BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted));
 	BUG_ON(ac->ac_which != OCFS2_AC_USE_META);
 
-	status = ocfs2_claim_suballoc_bits(osb,
-					   ac,
+	status = ocfs2_claim_suballoc_bits(ac,
 					   handle,
 					   bits_wanted,
 					   1,
-					   suballoc_bit_start,
-					   num_bits,
-					   &bg_blkno);
+					   &res);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
 	}
-	atomic_inc(&osb->alloc_stats.bg_allocs);
+	atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
 
-	*blkno_start = bg_blkno + (u64) *suballoc_bit_start;
-	ac->ac_bits_given += (*num_bits);
+	*suballoc_loc = res.sr_bg_blkno;
+	*suballoc_bit_start = res.sr_bit_offset;
+	*blkno_start = res.sr_blkno;
+	ac->ac_bits_given += res.sr_bits;
+	*num_bits = res.sr_bits;
 	status = 0;
 bail:
 	mlog_exit(status);
@@ -1662,10 +1982,10 @@ bail:
 }
 
 static void ocfs2_init_inode_ac_group(struct inode *dir,
-				      struct buffer_head *parent_fe_bh,
+				      struct buffer_head *parent_di_bh,
 				      struct ocfs2_alloc_context *ac)
 {
-	struct ocfs2_dinode *fe = (struct ocfs2_dinode *)parent_fe_bh->b_data;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)parent_di_bh->b_data;
 	/*
 	 * Try to allocate inodes from some specific group.
 	 *
@@ -1679,10 +1999,14 @@ static void ocfs2_init_inode_ac_group(struct inode *dir,
 	if (OCFS2_I(dir)->ip_last_used_group &&
 	    OCFS2_I(dir)->ip_last_used_slot == ac->ac_alloc_slot)
 		ac->ac_last_group = OCFS2_I(dir)->ip_last_used_group;
-	else if (le16_to_cpu(fe->i_suballoc_slot) == ac->ac_alloc_slot)
-		ac->ac_last_group = ocfs2_which_suballoc_group(
-					le64_to_cpu(fe->i_blkno),
-					le16_to_cpu(fe->i_suballoc_bit));
+	else if (le16_to_cpu(di->i_suballoc_slot) == ac->ac_alloc_slot) {
+		if (di->i_suballoc_loc)
+			ac->ac_last_group = le64_to_cpu(di->i_suballoc_loc);
+		else
+			ac->ac_last_group = ocfs2_which_suballoc_group(
+					le64_to_cpu(di->i_blkno),
+					le16_to_cpu(di->i_suballoc_bit));
+	}
 }
 
 static inline void ocfs2_save_inode_ac_group(struct inode *dir,
@@ -1692,17 +2016,16 @@ static inline void ocfs2_save_inode_ac_group(struct inode *dir,
 	OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot;
 }
 
-int ocfs2_claim_new_inode(struct ocfs2_super *osb,
-			  handle_t *handle,
+int ocfs2_claim_new_inode(handle_t *handle,
 			  struct inode *dir,
 			  struct buffer_head *parent_fe_bh,
 			  struct ocfs2_alloc_context *ac,
+			  u64 *suballoc_loc,
 			  u16 *suballoc_bit,
 			  u64 *fe_blkno)
 {
 	int status;
-	unsigned int num_bits;
-	u64 bg_blkno;
+	struct ocfs2_suballoc_result res;
 
 	mlog_entry_void();
 
@@ -1713,23 +2036,22 @@ int ocfs2_claim_new_inode(struct ocfs2_super *osb,
 
 	ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);
 
-	status = ocfs2_claim_suballoc_bits(osb,
-					   ac,
+	status = ocfs2_claim_suballoc_bits(ac,
 					   handle,
 					   1,
 					   1,
-					   suballoc_bit,
-					   &num_bits,
-					   &bg_blkno);
+					   &res);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
 	}
-	atomic_inc(&osb->alloc_stats.bg_allocs);
+	atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
 
-	BUG_ON(num_bits != 1);
+	BUG_ON(res.sr_bits != 1);
 
-	*fe_blkno = bg_blkno + (u64) (*suballoc_bit);
+	*suballoc_loc = res.sr_bg_blkno;
+	*suballoc_bit = res.sr_bit_offset;
+	*fe_blkno = res.sr_blkno;
 	ac->ac_bits_given++;
 	ocfs2_save_inode_ac_group(dir, ac);
 	status = 0;
@@ -1799,8 +2121,7 @@ static inline void ocfs2_block_to_cluster_group(struct inode *inode,
  * contig. allocation, set to '1' to indicate we can deal with extents
  * of any size.
  */
-int __ocfs2_claim_clusters(struct ocfs2_super *osb,
-			   handle_t *handle,
+int __ocfs2_claim_clusters(handle_t *handle,
 			   struct ocfs2_alloc_context *ac,
 			   u32 min_clusters,
 			   u32 max_clusters,
@@ -1809,8 +2130,8 @@ int __ocfs2_claim_clusters(struct ocfs2_super *osb,
 {
 	int status;
 	unsigned int bits_wanted = max_clusters;
-	u64 bg_blkno = 0;
-	u16 bg_bit_off;
+	struct ocfs2_suballoc_result res = { .sr_blkno = 0, };
+	struct ocfs2_super *osb = OCFS2_SB(ac->ac_inode->i_sb);
 
 	mlog_entry_void();
 
@@ -1820,6 +2141,8 @@ int __ocfs2_claim_clusters(struct ocfs2_super *osb,
 	       && ac->ac_which != OCFS2_AC_USE_MAIN);
 
 	if (ac->ac_which == OCFS2_AC_USE_LOCAL) {
+		WARN_ON(min_clusters > 1);
+
 		status = ocfs2_claim_local_alloc_bits(osb,
 						      handle,
 						      ac,
@@ -1842,20 +2165,19 @@ int __ocfs2_claim_clusters(struct ocfs2_super *osb,
 		if (bits_wanted > (osb->bitmap_cpg - 1))
 			bits_wanted = osb->bitmap_cpg - 1;
 
-		status = ocfs2_claim_suballoc_bits(osb,
-						   ac,
+		status = ocfs2_claim_suballoc_bits(ac,
 						   handle,
 						   bits_wanted,
 						   min_clusters,
-						   &bg_bit_off,
-						   num_clusters,
-						   &bg_blkno);
+						   &res);
 		if (!status) {
+			BUG_ON(res.sr_blkno); /* cluster alloc can't set */
 			*cluster_start =
 				ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode,
-								 bg_blkno,
-								 bg_bit_off);
+								 res.sr_bg_blkno,
+								 res.sr_bit_offset);
 			atomic_inc(&osb->alloc_stats.bitmap_data);
+			*num_clusters = res.sr_bits;
 		}
 	}
 	if (status < 0) {
@@ -1871,8 +2193,7 @@ bail:
 	return status;
 }
 
-int ocfs2_claim_clusters(struct ocfs2_super *osb,
-			 handle_t *handle,
+int ocfs2_claim_clusters(handle_t *handle,
 			 struct ocfs2_alloc_context *ac,
 			 u32 min_clusters,
 			 u32 *cluster_start,
@@ -1880,22 +2201,22 @@ int ocfs2_claim_clusters(struct ocfs2_super *osb,
 {
 	unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
 
-	return __ocfs2_claim_clusters(osb, handle, ac, min_clusters,
+	return __ocfs2_claim_clusters(handle, ac, min_clusters,
 				      bits_wanted, cluster_start, num_clusters);
 }
 
-static inline int ocfs2_block_group_clear_bits(handle_t *handle,
-					       struct inode *alloc_inode,
-					       struct ocfs2_group_desc *bg,
-					       struct buffer_head *group_bh,
-					       unsigned int bit_off,
-					       unsigned int num_bits)
+static int ocfs2_block_group_clear_bits(handle_t *handle,
+					struct inode *alloc_inode,
+					struct ocfs2_group_desc *bg,
+					struct buffer_head *group_bh,
+					unsigned int bit_off,
+					unsigned int num_bits,
+					void (*undo_fn)(unsigned int bit,
+							unsigned long *bmap))
 {
 	int status;
 	unsigned int tmp;
-	int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
 	struct ocfs2_group_desc *undo_bg = NULL;
-	int cluster_bitmap = 0;
 
 	mlog_entry_void();
 
@@ -1905,20 +2226,18 @@ static inline int ocfs2_block_group_clear_bits(handle_t *handle,
 
 	mlog(0, "off = %u, num = %u\n", bit_off, num_bits);
 
-	if (ocfs2_is_cluster_bitmap(alloc_inode))
-		journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
-
+	BUG_ON(undo_fn && !ocfs2_is_cluster_bitmap(alloc_inode));
 	status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
-					 group_bh, journal_type);
+					 group_bh,
+					 undo_fn ?
+					 OCFS2_JOURNAL_ACCESS_UNDO :
+					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
 	}
 
-	if (ocfs2_is_cluster_bitmap(alloc_inode))
-		cluster_bitmap = 1;
-
-	if (cluster_bitmap) {
+	if (undo_fn) {
 		jbd_lock_bh_state(group_bh);
 		undo_bg = (struct ocfs2_group_desc *)
 					bh2jh(group_bh)->b_committed_data;
@@ -1929,18 +2248,16 @@ static inline int ocfs2_block_group_clear_bits(handle_t *handle,
 	while(tmp--) {
 		ocfs2_clear_bit((bit_off + tmp),
 				(unsigned long *) bg->bg_bitmap);
-		if (cluster_bitmap)
-			ocfs2_set_bit(bit_off + tmp,
-				      (unsigned long *) undo_bg->bg_bitmap);
+		if (undo_fn)
+			undo_fn(bit_off + tmp,
+				(unsigned long *) undo_bg->bg_bitmap);
 	}
 	le16_add_cpu(&bg->bg_free_bits_count, num_bits);
 
-	if (cluster_bitmap)
+	if (undo_fn)
 		jbd_unlock_bh_state(group_bh);
 
-	status = ocfs2_journal_dirty(handle, group_bh);
-	if (status < 0)
-		mlog_errno(status);
+	ocfs2_journal_dirty(handle, group_bh);
 bail:
 	return status;
 }
@@ -1948,12 +2265,14 @@ bail:
 /*
  * expects the suballoc inode to already be locked.
  */
-int ocfs2_free_suballoc_bits(handle_t *handle,
-			     struct inode *alloc_inode,
-			     struct buffer_head *alloc_bh,
-			     unsigned int start_bit,
-			     u64 bg_blkno,
-			     unsigned int count)
+static int _ocfs2_free_suballoc_bits(handle_t *handle,
+				     struct inode *alloc_inode,
+				     struct buffer_head *alloc_bh,
+				     unsigned int start_bit,
+				     u64 bg_blkno,
+				     unsigned int count,
+				     void (*undo_fn)(unsigned int bit,
+						     unsigned long *bitmap))
 {
 	int status = 0;
 	u32 tmp_used;
@@ -1988,7 +2307,7 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
 
 	status = ocfs2_block_group_clear_bits(handle, alloc_inode,
 					      group, group_bh,
-					      start_bit, count);
+					      start_bit, count, undo_fn);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -2005,12 +2324,7 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
 		     count);
 	tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
 	fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count);
-
-	status = ocfs2_journal_dirty(handle, alloc_bh);
-	if (status < 0) {
-		mlog_errno(status);
-		goto bail;
-	}
+	ocfs2_journal_dirty(handle, alloc_bh);
 
 bail:
 	brelse(group_bh);
@@ -2019,6 +2333,17 @@ bail:
 	return status;
 }
 
+int ocfs2_free_suballoc_bits(handle_t *handle,
+			     struct inode *alloc_inode,
+			     struct buffer_head *alloc_bh,
+			     unsigned int start_bit,
+			     u64 bg_blkno,
+			     unsigned int count)
+{
+	return _ocfs2_free_suballoc_bits(handle, alloc_inode, alloc_bh,
+					 start_bit, bg_blkno, count, NULL);
+}
+
 int ocfs2_free_dinode(handle_t *handle,
 		      struct inode *inode_alloc_inode,
 		      struct buffer_head *inode_alloc_bh,
@@ -2028,15 +2353,19 @@ int ocfs2_free_dinode(handle_t *handle,
 	u16 bit = le16_to_cpu(di->i_suballoc_bit);
 	u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
 
+	if (di->i_suballoc_loc)
+		bg_blkno = le64_to_cpu(di->i_suballoc_loc);
 	return ocfs2_free_suballoc_bits(handle, inode_alloc_inode,
 					inode_alloc_bh, bit, bg_blkno, 1);
 }
 
-int ocfs2_free_clusters(handle_t *handle,
-		       struct inode *bitmap_inode,
-		       struct buffer_head *bitmap_bh,
-		       u64 start_blk,
-		       unsigned int num_clusters)
+static int _ocfs2_free_clusters(handle_t *handle,
+				struct inode *bitmap_inode,
+				struct buffer_head *bitmap_bh,
+				u64 start_blk,
+				unsigned int num_clusters,
+				void (*undo_fn)(unsigned int bit,
+						unsigned long *bitmap))
 {
 	int status;
 	u16 bg_start_bit;
@@ -2063,9 +2392,9 @@ int ocfs2_free_clusters(handle_t *handle,
 	mlog(0, "bg_blkno = %llu, bg_start_bit = %u\n",
 	     (unsigned long long)bg_blkno, bg_start_bit);
 
-	status = ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
-					  bg_start_bit, bg_blkno,
-					  num_clusters);
+	status = _ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
+					   bg_start_bit, bg_blkno,
+					   num_clusters, undo_fn);
 	if (status < 0) {
 		mlog_errno(status);
 		goto out;
@@ -2079,6 +2408,32 @@ out:
 	return status;
 }
 
+int ocfs2_free_clusters(handle_t *handle,
+			struct inode *bitmap_inode,
+			struct buffer_head *bitmap_bh,
+			u64 start_blk,
+			unsigned int num_clusters)
+{
+	return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
+				    start_blk, num_clusters,
+				    _ocfs2_set_bit);
+}
+
+/*
+ * Give never-used clusters back to the global bitmap.  We don't need
+ * to protect these bits in the undo buffer.
+ */
+int ocfs2_release_clusters(handle_t *handle,
+			   struct inode *bitmap_inode,
+			   struct buffer_head *bitmap_bh,
+			   u64 start_blk,
+			   unsigned int num_clusters)
+{
+	return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
+				    start_blk, num_clusters,
+				    _ocfs2_clear_bit);
+}
+
 static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg)
 {
 	printk("Block Group:\n");
@@ -2269,7 +2624,7 @@ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
 				   struct buffer_head *alloc_bh, u64 blkno,
 				   u16 bit, int *res)
 {
-	struct ocfs2_dinode *alloc_fe;
+	struct ocfs2_dinode *alloc_di;
 	struct ocfs2_group_desc *group;
 	struct buffer_head *group_bh = NULL;
 	u64 bg_blkno;
@@ -2278,17 +2633,20 @@ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
 	mlog_entry("blkno: %llu bit: %u\n", (unsigned long long)blkno,
 		   (unsigned int)bit);
 
-	alloc_fe = (struct ocfs2_dinode *)alloc_bh->b_data;
-	if ((bit + 1) > ocfs2_bits_per_group(&alloc_fe->id2.i_chain)) {
+	alloc_di = (struct ocfs2_dinode *)alloc_bh->b_data;
+	if ((bit + 1) > ocfs2_bits_per_group(&alloc_di->id2.i_chain)) {
 		mlog(ML_ERROR, "suballoc bit %u out of range of %u\n",
 		     (unsigned int)bit,
-		     ocfs2_bits_per_group(&alloc_fe->id2.i_chain));
+		     ocfs2_bits_per_group(&alloc_di->id2.i_chain));
 		status = -EINVAL;
 		goto bail;
 	}
 
-	bg_blkno = ocfs2_which_suballoc_group(blkno, bit);
-	status = ocfs2_read_group_descriptor(suballoc, alloc_fe, bg_blkno,
+	if (alloc_di->i_suballoc_loc)
+		bg_blkno = le64_to_cpu(alloc_di->i_suballoc_loc);
+	else
+		bg_blkno = ocfs2_which_suballoc_group(blkno, bit);
+	status = ocfs2_read_group_descriptor(suballoc, alloc_di, bg_blkno,
 					     &group_bh);
 	if (status < 0) {
 		mlog(ML_ERROR, "read group %llu failed %d\n",
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index 8c9a78a43164..a017dd3ee7d9 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -26,13 +26,14 @@
 #ifndef _CHAINALLOC_H_
 #define _CHAINALLOC_H_
 
+struct ocfs2_suballoc_result;
 typedef int (group_search_t)(struct inode *,
 			     struct buffer_head *,
 			     u32,			/* bits_wanted */
 			     u32,			/* min_bits */
 			     u64,			/* max_block */
-			     u16 *,			/* *bit_off */
-			     u16 *);			/* *bits_found */
+			     struct ocfs2_suballoc_result *);
+							/* found bits */
 
 struct ocfs2_alloc_context {
 	struct inode *ac_inode;    /* which bitmap are we allocating from? */
@@ -54,8 +55,11 @@ struct ocfs2_alloc_context {
 	u64    ac_last_group;
 	u64    ac_max_block;  /* Highest block number to allocate. 0 is
 				 is the same as ~0 - unlimited */
+
+	struct ocfs2_alloc_reservation	*ac_resv;
 };
 
+void ocfs2_init_steal_slots(struct ocfs2_super *osb);
 void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac);
 static inline int ocfs2_alloc_context_bits_left(struct ocfs2_alloc_context *ac)
 {
@@ -79,22 +83,21 @@ int ocfs2_reserve_clusters(struct ocfs2_super *osb,
 			   u32 bits_wanted,
 			   struct ocfs2_alloc_context **ac);
 
-int ocfs2_claim_metadata(struct ocfs2_super *osb,
-			 handle_t *handle,
+int ocfs2_claim_metadata(handle_t *handle,
 			 struct ocfs2_alloc_context *ac,
 			 u32 bits_wanted,
+			 u64 *suballoc_loc,
 			 u16 *suballoc_bit_start,
 			 u32 *num_bits,
 			 u64 *blkno_start);
-int ocfs2_claim_new_inode(struct ocfs2_super *osb,
-			  handle_t *handle,
+int ocfs2_claim_new_inode(handle_t *handle,
 			  struct inode *dir,
 			  struct buffer_head *parent_fe_bh,
 			  struct ocfs2_alloc_context *ac,
+			  u64 *suballoc_loc,
 			  u16 *suballoc_bit,
 			  u64 *fe_blkno);
-int ocfs2_claim_clusters(struct ocfs2_super *osb,
-			 handle_t *handle,
+int ocfs2_claim_clusters(handle_t *handle,
 			 struct ocfs2_alloc_context *ac,
 			 u32 min_clusters,
 			 u32 *cluster_start,
@@ -103,8 +106,7 @@ int ocfs2_claim_clusters(struct ocfs2_super *osb,
  * Use this variant of ocfs2_claim_clusters to specify a maxiumum
  * number of clusters smaller than the allocation reserved.
  */
-int __ocfs2_claim_clusters(struct ocfs2_super *osb,
-			   handle_t *handle,
+int __ocfs2_claim_clusters(handle_t *handle,
 			   struct ocfs2_alloc_context *ac,
 			   u32 min_clusters,
 			   u32 max_clusters,
@@ -126,6 +128,11 @@ int ocfs2_free_clusters(handle_t *handle,
 			struct buffer_head *bitmap_bh,
 			u64 start_blk,
 			unsigned int num_clusters);
+int ocfs2_release_clusters(handle_t *handle,
+			   struct inode *bitmap_inode,
+			   struct buffer_head *bitmap_bh,
+			   u64 start_blk,
+			   unsigned int num_clusters);
 
 static inline u64 ocfs2_which_suballoc_group(u64 block, unsigned int bit)
 {
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 26069917a9f5..0eaa929a4dbf 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -69,6 +69,7 @@
 #include "xattr.h"
 #include "quota.h"
 #include "refcounttree.h"
+#include "suballoc.h"
 
 #include "buffer_head_io.h"
 
@@ -93,7 +94,9 @@ struct mount_options
 	unsigned long	mount_opt;
 	unsigned int	atime_quantum;
 	signed short	slot;
-	unsigned int	localalloc_opt;
+	int		localalloc_opt;
+	unsigned int	resv_level;
+	int		dir_resv_level;
 	char		cluster_stack[OCFS2_STACK_LABEL_LEN + 1];
 };
 
@@ -175,6 +178,8 @@ enum {
 	Opt_noacl,
 	Opt_usrquota,
 	Opt_grpquota,
+	Opt_resv_level,
+	Opt_dir_resv_level,
 	Opt_err,
 };
 
@@ -201,6 +206,8 @@ static const match_table_t tokens = {
 	{Opt_noacl, "noacl"},
 	{Opt_usrquota, "usrquota"},
 	{Opt_grpquota, "grpquota"},
+	{Opt_resv_level, "resv_level=%u"},
+	{Opt_dir_resv_level, "dir_resv_level=%u"},
 	{Opt_err, NULL}
 };
 
@@ -301,9 +308,12 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
 
 	spin_lock(&osb->osb_lock);
 	out += snprintf(buf + out, len - out,
-			"%10s => Slot: %d  NumStolen: %d\n", "Steal",
+			"%10s => InodeSlot: %d  StolenInodes: %d, "
+			"MetaSlot: %d  StolenMeta: %d\n", "Steal",
 			osb->s_inode_steal_slot,
-			atomic_read(&osb->s_num_inodes_stolen));
+			atomic_read(&osb->s_num_inodes_stolen),
+			osb->s_meta_steal_slot,
+			atomic_read(&osb->s_num_meta_stolen));
 	spin_unlock(&osb->osb_lock);
 
 	out += snprintf(buf + out, len - out, "OrphanScan => ");
@@ -869,13 +879,15 @@ static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend)
 		if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
 			continue;
 		if (unsuspend)
-			status = vfs_quota_enable(
-					sb_dqopt(sb)->files[type],
-					type, QFMT_OCFS2,
-					DQUOT_SUSPENDED);
-		else
-			status = vfs_quota_disable(sb, type,
-						   DQUOT_SUSPENDED);
+			status = dquot_resume(sb, type);
+		else {
+			struct ocfs2_mem_dqinfo *oinfo;
+
+			/* Cancel periodic syncing before suspending */
+			oinfo = sb_dqinfo(sb, type)->dqi_priv;
+			cancel_delayed_work_sync(&oinfo->dqi_sync_work);
+			status = dquot_suspend(sb, type);
+		}
 		if (status < 0)
 			break;
 	}
@@ -906,8 +918,8 @@ static int ocfs2_enable_quotas(struct ocfs2_super *osb)
 			status = -ENOENT;
 			goto out_quota_off;
 		}
-		status = vfs_quota_enable(inode[type], type, QFMT_OCFS2,
-						DQUOT_USAGE_ENABLED);
+		status = dquot_enable(inode[type], type, QFMT_OCFS2,
+				      DQUOT_USAGE_ENABLED);
 		if (status < 0)
 			goto out_quota_off;
 	}
@@ -928,18 +940,22 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb)
 	int type;
 	struct inode *inode;
 	struct super_block *sb = osb->sb;
+	struct ocfs2_mem_dqinfo *oinfo;
 
 	/* We mostly ignore errors in this function because there's not much
 	 * we can do when we see them */
 	for (type = 0; type < MAXQUOTAS; type++) {
 		if (!sb_has_quota_loaded(sb, type))
 			continue;
+		/* Cancel periodic syncing before we grab dqonoff_mutex */
+		oinfo = sb_dqinfo(sb, type)->dqi_priv;
+		cancel_delayed_work_sync(&oinfo->dqi_sync_work);
 		inode = igrab(sb->s_dquot.files[type]);
 		/* Turn off quotas. This will remove all dquot structures from
 		 * memory and so they will be automatically synced to global
 		 * quota files */
-		vfs_quota_disable(sb, type, DQUOT_USAGE_ENABLED |
-					    DQUOT_LIMITS_ENABLED);
+		dquot_disable(sb, type, DQUOT_USAGE_ENABLED |
+					DQUOT_LIMITS_ENABLED);
 		if (!inode)
 			continue;
 		iput(inode);
@@ -948,7 +964,7 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb)
 
 /* Handle quota on quotactl */
 static int ocfs2_quota_on(struct super_block *sb, int type, int format_id,
-			  char *path, int remount)
+			  char *path)
 {
 	unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
 					     OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
@@ -956,30 +972,24 @@ static int ocfs2_quota_on(struct super_block *sb, int type, int format_id,
 	if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
 		return -EINVAL;
 
-	if (remount)
-		return 0;	/* Just ignore it has been handled in
-				 * ocfs2_remount() */
-	return vfs_quota_enable(sb_dqopt(sb)->files[type], type,
-				    format_id, DQUOT_LIMITS_ENABLED);
+	return dquot_enable(sb_dqopt(sb)->files[type], type,
+			    format_id, DQUOT_LIMITS_ENABLED);
 }
 
 /* Handle quota off quotactl */
-static int ocfs2_quota_off(struct super_block *sb, int type, int remount)
+static int ocfs2_quota_off(struct super_block *sb, int type)
 {
-	if (remount)
-		return 0;	/* Ignore now and handle later in
-				 * ocfs2_remount() */
-	return vfs_quota_disable(sb, type, DQUOT_LIMITS_ENABLED);
+	return dquot_disable(sb, type, DQUOT_LIMITS_ENABLED);
 }
 
 static const struct quotactl_ops ocfs2_quotactl_ops = {
 	.quota_on	= ocfs2_quota_on,
 	.quota_off	= ocfs2_quota_off,
-	.quota_sync	= vfs_quota_sync,
-	.get_info	= vfs_get_dqinfo,
-	.set_info	= vfs_set_dqinfo,
-	.get_dqblk	= vfs_get_dqblk,
-	.set_dqblk	= vfs_set_dqblk,
+	.quota_sync	= dquot_quota_sync,
+	.get_info	= dquot_get_dqinfo,
+	.set_info	= dquot_set_dqinfo,
+	.get_dqblk	= dquot_get_dqblk,
+	.set_dqblk	= dquot_set_dqblk,
 };
 
 static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
@@ -1024,8 +1034,14 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
 	osb->s_atime_quantum = parsed_options.atime_quantum;
 	osb->preferred_slot = parsed_options.slot;
 	osb->osb_commit_interval = parsed_options.commit_interval;
-	osb->local_alloc_default_bits = ocfs2_megabytes_to_clusters(sb, parsed_options.localalloc_opt);
-	osb->local_alloc_bits = osb->local_alloc_default_bits;
+
+	ocfs2_la_set_sizes(osb, parsed_options.localalloc_opt);
+	osb->osb_resv_level = parsed_options.resv_level;
+	osb->osb_dir_resv_level = parsed_options.resv_level;
+	if (parsed_options.dir_resv_level == -1)
+		osb->osb_dir_resv_level = parsed_options.resv_level;
+	else
+		osb->osb_dir_resv_level = parsed_options.dir_resv_level;
 
 	status = ocfs2_verify_userspace_stack(osb, &parsed_options);
 	if (status)
@@ -1062,7 +1078,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
 				     "file system, but write access is "
 				     "unavailable.\n");
 			else
-				mlog_errno(status);			
+				mlog_errno(status);
 			goto read_super_error;
 		}
 
@@ -1281,11 +1297,13 @@ static int ocfs2_parse_options(struct super_block *sb,
 		   options ? options : "(none)");
 
 	mopt->commit_interval = 0;
-	mopt->mount_opt = 0;
+	mopt->mount_opt = OCFS2_MOUNT_NOINTR;
 	mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM;
 	mopt->slot = OCFS2_INVALID_SLOT;
-	mopt->localalloc_opt = OCFS2_DEFAULT_LOCAL_ALLOC_SIZE;
+	mopt->localalloc_opt = -1;
 	mopt->cluster_stack[0] = '\0';
+	mopt->resv_level = OCFS2_DEFAULT_RESV_LEVEL;
+	mopt->dir_resv_level = -1;
 
 	if (!options) {
 		status = 1;
@@ -1376,7 +1394,7 @@ static int ocfs2_parse_options(struct super_block *sb,
 				status = 0;
 				goto bail;
 			}
-			if (option >= 0 && (option <= ocfs2_local_alloc_size(sb) * 8))
+			if (option >= 0)
 				mopt->localalloc_opt = option;
 			break;
 		case Opt_localflocks:
@@ -1429,6 +1447,28 @@ static int ocfs2_parse_options(struct super_block *sb,
 			mopt->mount_opt |= OCFS2_MOUNT_NO_POSIX_ACL;
 			mopt->mount_opt &= ~OCFS2_MOUNT_POSIX_ACL;
 			break;
+		case Opt_resv_level:
+			if (is_remount)
+				break;
+			if (match_int(&args[0], &option)) {
+				status = 0;
+				goto bail;
+			}
+			if (option >= OCFS2_MIN_RESV_LEVEL &&
+			    option < OCFS2_MAX_RESV_LEVEL)
+				mopt->resv_level = option;
+			break;
+		case Opt_dir_resv_level:
+			if (is_remount)
+				break;
+			if (match_int(&args[0], &option)) {
+				status = 0;
+				goto bail;
+			}
+			if (option >= OCFS2_MIN_RESV_LEVEL &&
+			    option < OCFS2_MAX_RESV_LEVEL)
+				mopt->dir_resv_level = option;
+			break;
 		default:
 			mlog(ML_ERROR,
 			     "Unrecognized mount option \"%s\" "
@@ -1483,7 +1523,7 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
 			   (unsigned) (osb->osb_commit_interval / HZ));
 
 	local_alloc_megs = osb->local_alloc_bits >> (20 - osb->s_clustersize_bits);
-	if (local_alloc_megs != OCFS2_DEFAULT_LOCAL_ALLOC_SIZE)
+	if (local_alloc_megs != ocfs2_la_default_mb(osb))
 		seq_printf(s, ",localalloc=%d", local_alloc_megs);
 
 	if (opts & OCFS2_MOUNT_LOCALFLOCKS)
@@ -1510,6 +1550,12 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
 	else
 		seq_printf(s, ",noacl");
 
+	if (osb->osb_resv_level != OCFS2_DEFAULT_RESV_LEVEL)
+		seq_printf(s, ",resv_level=%d", osb->osb_resv_level);
+
+	if (osb->osb_dir_resv_level != osb->osb_resv_level)
+		seq_printf(s, ",dir_resv_level=%d", osb->osb_resv_level);
+
 	return 0;
 }
 
@@ -1684,6 +1730,8 @@ static void ocfs2_inode_init_once(void *data)
 	oi->ip_blkno = 0ULL;
 	oi->ip_clusters = 0;
 
+	ocfs2_resv_init_once(&oi->ip_la_data_resv);
+
 	ocfs2_lock_res_init_once(&oi->ip_rw_lockres);
 	ocfs2_lock_res_init_once(&oi->ip_inode_lockres);
 	ocfs2_lock_res_init_once(&oi->ip_open_lockres);
@@ -1997,7 +2045,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
 	osb->blocked_lock_count = 0;
 	spin_lock_init(&osb->osb_lock);
 	spin_lock_init(&osb->osb_xattr_lock);
-	ocfs2_init_inode_steal_slot(osb);
+	ocfs2_init_steal_slots(osb);
 
 	atomic_set(&osb->alloc_stats.moves, 0);
 	atomic_set(&osb->alloc_stats.local_data, 0);
@@ -2038,6 +2086,12 @@ static int ocfs2_initialize_super(struct super_block *sb,
 
 	init_waitqueue_head(&osb->osb_mount_event);
 
+	status = ocfs2_resmap_init(osb, &osb->osb_la_resmap);
+	if (status) {
+		mlog_errno(status);
+		goto bail;
+	}
+
 	osb->vol_label = kmalloc(OCFS2_MAX_VOL_LABEL_LEN, GFP_KERNEL);
 	if (!osb->vol_label) {
 		mlog(ML_ERROR, "unable to alloc vol label\n");
@@ -2220,9 +2274,11 @@ static int ocfs2_initialize_super(struct super_block *sb,
 	}
 
 	osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno;
+	osb->osb_clusters_at_boot = OCFS2_I(inode)->ip_clusters;
 	iput(inode);
 
-	osb->bitmap_cpg = ocfs2_group_bitmap_size(sb) * 8;
+	osb->bitmap_cpg = ocfs2_group_bitmap_size(sb, 0,
+				 osb->s_feature_incompat) * 8;
 
 	status = ocfs2_init_slot_info(osb);
 	if (status < 0) {
@@ -2505,5 +2561,25 @@ void __ocfs2_abort(struct super_block* sb,
 	ocfs2_handle_error(sb);
 }
 
+/*
+ * Void signal blockers, because in-kernel sigprocmask() only fails
+ * when SIG_* is wrong.
+ */
+void ocfs2_block_signals(sigset_t *oldset)
+{
+	int rc;
+	sigset_t blocked;
+
+	sigfillset(&blocked);
+	rc = sigprocmask(SIG_BLOCK, &blocked, oldset);
+	BUG_ON(rc);
+}
+
+void ocfs2_unblock_signals(sigset_t *oldset)
+{
+	int rc = sigprocmask(SIG_SETMASK, oldset, NULL);
+	BUG_ON(rc);
+}
+
 module_init(ocfs2_init);
 module_exit(ocfs2_exit);
diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h
index 783f5270f2a1..40c7de084c10 100644
--- a/fs/ocfs2/super.h
+++ b/fs/ocfs2/super.h
@@ -45,4 +45,11 @@ void __ocfs2_abort(struct super_block *sb,
 
 #define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args)
 
+/*
+ * Void signal blockers, because in-kernel sigprocmask() only fails
+ * when SIG_* is wrong.
+ */
+void ocfs2_block_signals(sigset_t *oldset);
+void ocfs2_unblock_signals(sigset_t *oldset);
+
 #endif /* OCFS2_SUPER_H */
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index 49b133ccbf11..32499d213fc4 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -137,20 +137,20 @@ static void *ocfs2_fast_follow_link(struct dentry *dentry,
 	}
 
 	memcpy(link, target, len);
-	nd_set_link(nd, link);
 
 bail:
+	nd_set_link(nd, status ? ERR_PTR(status) : link);
 	brelse(bh);
 
 	mlog_exit(status);
-	return status ? ERR_PTR(status) : link;
+	return NULL;
 }
 
 static void ocfs2_fast_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
 {
-	char *link = cookie;
-
-	kfree(link);
+	char *link = nd_get_link(nd);
+	if (!IS_ERR(link))
+		kfree(link);
 }
 
 const struct inode_operations ocfs2_symlink_inode_operations = {
diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c
index 40e53702948c..bfe7190cdbf1 100644
--- a/fs/ocfs2/sysfile.c
+++ b/fs/ocfs2/sysfile.c
@@ -25,7 +25,6 @@
 
 #include <linux/fs.h>
 #include <linux/types.h>
-#include <linux/slab.h>
 #include <linux/highmem.h>
 
 #define MLOG_MASK_PREFIX ML_INODE
diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c
index c61369342a27..a0a120e82b97 100644
--- a/fs/ocfs2/uptodate.c
+++ b/fs/ocfs2/uptodate.c
@@ -267,8 +267,8 @@ static int ocfs2_buffer_cached(struct ocfs2_caching_info *ci,
 }
 
 /* Warning: even if it returns true, this does *not* guarantee that
- * the block is stored in our inode metadata cache. 
- * 
+ * the block is stored in our inode metadata cache.
+ *
  * This can be called under lock_buffer()
  */
 int ocfs2_buffer_uptodate(struct ocfs2_caching_info *ci,
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 8fc6fb071c6d..d03469f61801 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -79,6 +79,7 @@ struct ocfs2_xattr_set_ctxt {
 	struct ocfs2_alloc_context *meta_ac;
 	struct ocfs2_alloc_context *data_ac;
 	struct ocfs2_cached_dealloc_ctxt dealloc;
+	int set_abort;
 };
 
 #define OCFS2_XATTR_ROOT_SIZE	(sizeof(struct ocfs2_xattr_def_value_root))
@@ -96,7 +97,7 @@ static struct ocfs2_xattr_def_value_root def_xv = {
 	.xv.xr_list.l_count = cpu_to_le16(1),
 };
 
-struct xattr_handler *ocfs2_xattr_handlers[] = {
+const struct xattr_handler *ocfs2_xattr_handlers[] = {
 	&ocfs2_xattr_user_handler,
 	&ocfs2_xattr_acl_access_handler,
 	&ocfs2_xattr_acl_default_handler,
@@ -105,7 +106,7 @@ struct xattr_handler *ocfs2_xattr_handlers[] = {
 	NULL
 };
 
-static struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = {
+static const struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = {
 	[OCFS2_XATTR_INDEX_USER]	= &ocfs2_xattr_user_handler,
 	[OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS]
 					= &ocfs2_xattr_acl_access_handler,
@@ -116,10 +117,11 @@ static struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = {
 };
 
 struct ocfs2_xattr_info {
-	int name_index;
-	const char *name;
-	const void *value;
-	size_t value_len;
+	int		xi_name_index;
+	const char	*xi_name;
+	int		xi_name_len;
+	const void	*xi_value;
+	size_t		xi_value_len;
 };
 
 struct ocfs2_xattr_search {
@@ -137,6 +139,115 @@ struct ocfs2_xattr_search {
 	int not_found;
 };
 
+/* Operations on struct ocfs2_xa_entry */
+struct ocfs2_xa_loc;
+struct ocfs2_xa_loc_operations {
+	/*
+	 * Journal functions
+	 */
+	int (*xlo_journal_access)(handle_t *handle, struct ocfs2_xa_loc *loc,
+				  int type);
+	void (*xlo_journal_dirty)(handle_t *handle, struct ocfs2_xa_loc *loc);
+
+	/*
+	 * Return a pointer to the appropriate buffer in loc->xl_storage
+	 * at the given offset from loc->xl_header.
+	 */
+	void *(*xlo_offset_pointer)(struct ocfs2_xa_loc *loc, int offset);
+
+	/* Can we reuse the existing entry for the new value? */
+	int (*xlo_can_reuse)(struct ocfs2_xa_loc *loc,
+			     struct ocfs2_xattr_info *xi);
+
+	/* How much space is needed for the new value? */
+	int (*xlo_check_space)(struct ocfs2_xa_loc *loc,
+			       struct ocfs2_xattr_info *xi);
+
+	/*
+	 * Return the offset of the first name+value pair.  This is
+	 * the start of our downward-filling free space.
+	 */
+	int (*xlo_get_free_start)(struct ocfs2_xa_loc *loc);
+
+	/*
+	 * Remove the name+value at this location.  Do whatever is
+	 * appropriate with the remaining name+value pairs.
+	 */
+	void (*xlo_wipe_namevalue)(struct ocfs2_xa_loc *loc);
+
+	/* Fill xl_entry with a new entry */
+	void (*xlo_add_entry)(struct ocfs2_xa_loc *loc, u32 name_hash);
+
+	/* Add name+value storage to an entry */
+	void (*xlo_add_namevalue)(struct ocfs2_xa_loc *loc, int size);
+
+	/*
+	 * Initialize the value buf's access and bh fields for this entry.
+	 * ocfs2_xa_fill_value_buf() will handle the xv pointer.
+	 */
+	void (*xlo_fill_value_buf)(struct ocfs2_xa_loc *loc,
+				   struct ocfs2_xattr_value_buf *vb);
+};
+
+/*
+ * Describes an xattr entry location.  This is a memory structure
+ * tracking the on-disk structure.
+ */
+struct ocfs2_xa_loc {
+	/* This xattr belongs to this inode */
+	struct inode *xl_inode;
+
+	/* The ocfs2_xattr_header inside the on-disk storage. Not NULL. */
+	struct ocfs2_xattr_header *xl_header;
+
+	/* Bytes from xl_header to the end of the storage */
+	int xl_size;
+
+	/*
+	 * The ocfs2_xattr_entry this location describes.  If this is
+	 * NULL, this location describes the on-disk structure where it
+	 * would have been.
+	 */
+	struct ocfs2_xattr_entry *xl_entry;
+
+	/*
+	 * Internal housekeeping
+	 */
+
+	/* Buffer(s) containing this entry */
+	void *xl_storage;
+
+	/* Operations on the storage backing this location */
+	const struct ocfs2_xa_loc_operations *xl_ops;
+};
+
+/*
+ * Convenience functions to calculate how much space is needed for a
+ * given name+value pair
+ */
+static int namevalue_size(int name_len, uint64_t value_len)
+{
+	if (value_len > OCFS2_XATTR_INLINE_SIZE)
+		return OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
+	else
+		return OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_SIZE(value_len);
+}
+
+static int namevalue_size_xi(struct ocfs2_xattr_info *xi)
+{
+	return namevalue_size(xi->xi_name_len, xi->xi_value_len);
+}
+
+static int namevalue_size_xe(struct ocfs2_xattr_entry *xe)
+{
+	u64 value_len = le64_to_cpu(xe->xe_value_size);
+
+	BUG_ON((value_len > OCFS2_XATTR_INLINE_SIZE) &&
+	       ocfs2_xattr_is_local(xe));
+	return namevalue_size(xe->xe_name_len, value_len);
+}
+
+
 static int ocfs2_xattr_bucket_get_name_value(struct super_block *sb,
 					     struct ocfs2_xattr_header *xh,
 					     int index,
@@ -212,14 +323,6 @@ static inline u16 ocfs2_blocks_per_xattr_bucket(struct super_block *sb)
 	return OCFS2_XATTR_BUCKET_SIZE / (1 << sb->s_blocksize_bits);
 }
 
-static inline u16 ocfs2_xattr_max_xe_in_bucket(struct super_block *sb)
-{
-	u16 len = sb->s_blocksize -
-		 offsetof(struct ocfs2_xattr_header, xh_entries);
-
-	return len / sizeof(struct ocfs2_xattr_entry);
-}
-
 #define bucket_blkno(_b) ((_b)->bu_bhs[0]->b_blocknr)
 #define bucket_block(_b, _n) ((_b)->bu_bhs[(_n)]->b_data)
 #define bucket_xh(_b) ((struct ocfs2_xattr_header *)bucket_block((_b), 0))
@@ -437,7 +540,7 @@ static int ocfs2_read_xattr_block(struct inode *inode, u64 xb_blkno,
 
 static inline const char *ocfs2_xattr_prefix(int name_index)
 {
-	struct xattr_handler *handler = NULL;
+	const struct xattr_handler *handler = NULL;
 
 	if (name_index > 0 && name_index < OCFS2_XATTR_MAX)
 		handler = ocfs2_xattr_handler_map[name_index];
@@ -463,35 +566,22 @@ static u32 ocfs2_xattr_name_hash(struct inode *inode,
 	return hash;
 }
 
-/*
- * ocfs2_xattr_hash_entry()
- *
- * Compute the hash of an extended attribute.
- */
-static void ocfs2_xattr_hash_entry(struct inode *inode,
-				   struct ocfs2_xattr_header *header,
-				   struct ocfs2_xattr_entry *entry)
+static int ocfs2_xattr_entry_real_size(int name_len, size_t value_len)
 {
-	u32 hash = 0;
-	char *name = (char *)header + le16_to_cpu(entry->xe_name_offset);
-
-	hash = ocfs2_xattr_name_hash(inode, name, entry->xe_name_len);
-	entry->xe_name_hash = cpu_to_le32(hash);
-
-	return;
+	return namevalue_size(name_len, value_len) +
+		sizeof(struct ocfs2_xattr_entry);
 }
 
-static int ocfs2_xattr_entry_real_size(int name_len, size_t value_len)
+static int ocfs2_xi_entry_usage(struct ocfs2_xattr_info *xi)
 {
-	int size = 0;
-
-	if (value_len <= OCFS2_XATTR_INLINE_SIZE)
-		size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_SIZE(value_len);
-	else
-		size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
-	size += sizeof(struct ocfs2_xattr_entry);
+	return namevalue_size_xi(xi) +
+		sizeof(struct ocfs2_xattr_entry);
+}
 
-	return size;
+static int ocfs2_xe_entry_usage(struct ocfs2_xattr_entry *xe)
+{
+	return namevalue_size_xe(xe) +
+		sizeof(struct ocfs2_xattr_entry);
 }
 
 int ocfs2_calc_security_init(struct inode *dir,
@@ -619,7 +709,7 @@ static int ocfs2_xattr_extend_allocation(struct inode *inode,
 					 struct ocfs2_xattr_value_buf *vb,
 					 struct ocfs2_xattr_set_ctxt *ctxt)
 {
-	int status = 0;
+	int status = 0, credits;
 	handle_t *handle = ctxt->handle;
 	enum ocfs2_alloc_restarted why;
 	u32 prev_clusters, logical_start = le32_to_cpu(vb->vb_xv->xr_clusters);
@@ -629,42 +719,54 @@ static int ocfs2_xattr_extend_allocation(struct inode *inode,
 
 	ocfs2_init_xattr_value_extent_tree(&et, INODE_CACHE(inode), vb);
 
-	status = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh,
-			      OCFS2_JOURNAL_ACCESS_WRITE);
-	if (status < 0) {
-		mlog_errno(status);
-		goto leave;
-	}
-
-	prev_clusters = le32_to_cpu(vb->vb_xv->xr_clusters);
-	status = ocfs2_add_clusters_in_btree(handle,
-					     &et,
-					     &logical_start,
-					     clusters_to_add,
-					     0,
-					     ctxt->data_ac,
-					     ctxt->meta_ac,
-					     &why);
-	if (status < 0) {
-		mlog_errno(status);
-		goto leave;
-	}
+	while (clusters_to_add) {
+		status = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh,
+				       OCFS2_JOURNAL_ACCESS_WRITE);
+		if (status < 0) {
+			mlog_errno(status);
+			break;
+		}
 
-	status = ocfs2_journal_dirty(handle, vb->vb_bh);
-	if (status < 0) {
-		mlog_errno(status);
-		goto leave;
-	}
+		prev_clusters = le32_to_cpu(vb->vb_xv->xr_clusters);
+		status = ocfs2_add_clusters_in_btree(handle,
+						     &et,
+						     &logical_start,
+						     clusters_to_add,
+						     0,
+						     ctxt->data_ac,
+						     ctxt->meta_ac,
+						     &why);
+		if ((status < 0) && (status != -EAGAIN)) {
+			if (status != -ENOSPC)
+				mlog_errno(status);
+			break;
+		}
 
-	clusters_to_add -= le32_to_cpu(vb->vb_xv->xr_clusters) - prev_clusters;
+		ocfs2_journal_dirty(handle, vb->vb_bh);
 
-	/*
-	 * We should have already allocated enough space before the transaction,
-	 * so no need to restart.
-	 */
-	BUG_ON(why != RESTART_NONE || clusters_to_add);
+		clusters_to_add -= le32_to_cpu(vb->vb_xv->xr_clusters) -
+					 prev_clusters;
 
-leave:
+		if (why != RESTART_NONE && clusters_to_add) {
+			/*
+			 * We can only fail in case the alloc file doesn't give
+			 * up enough clusters.
+			 */
+			BUG_ON(why == RESTART_META);
+
+			mlog(0, "restarting xattr value extension for %u"
+			     " clusters,.\n", clusters_to_add);
+			credits = ocfs2_calc_extend_credits(inode->i_sb,
+							    &vb->vb_xv->xr_list,
+							    clusters_to_add);
+			status = ocfs2_extend_trans(handle, credits);
+			if (status < 0) {
+				status = -ENOMEM;
+				mlog_errno(status);
+				break;
+			}
+		}
+	}
 
 	return status;
 }
@@ -697,12 +799,7 @@ static int __ocfs2_remove_xattr_range(struct inode *inode,
 	}
 
 	le32_add_cpu(&vb->vb_xv->xr_clusters, -len);
-
-	ret = ocfs2_journal_dirty(handle, vb->vb_bh);
-	if (ret) {
-		mlog_errno(ret);
-		goto out;
-	}
+	ocfs2_journal_dirty(handle, vb->vb_bh);
 
 	if (ext_flags & OCFS2_EXT_REFCOUNTED)
 		ret = ocfs2_decrease_refcount(inode, handle,
@@ -1285,11 +1382,7 @@ static int __ocfs2_xattr_set_value_outside(struct inode *inode,
 				memset(bh->b_data + cp_len, 0,
 				       blocksize - cp_len);
 
-			ret = ocfs2_journal_dirty(handle, bh);
-			if (ret < 0) {
-				mlog_errno(ret);
-				goto out;
-			}
+			ocfs2_journal_dirty(handle, bh);
 			brelse(bh);
 			bh = NULL;
 
@@ -1308,452 +1401,901 @@ out:
 	return ret;
 }
 
-static int ocfs2_xattr_cleanup(struct inode *inode,
-			       handle_t *handle,
-			       struct ocfs2_xattr_info *xi,
-			       struct ocfs2_xattr_search *xs,
-			       struct ocfs2_xattr_value_buf *vb,
-			       size_t offs)
+static int ocfs2_xa_check_space_helper(int needed_space, int free_start,
+				       int num_entries)
 {
-	int ret = 0;
-	size_t name_len = strlen(xi->name);
-	void *val = xs->base + offs;
-	size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
+	int free_space;
 
-	ret = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh,
-			    OCFS2_JOURNAL_ACCESS_WRITE);
-	if (ret) {
-		mlog_errno(ret);
-		goto out;
-	}
-	/* Decrease xattr count */
-	le16_add_cpu(&xs->header->xh_count, -1);
-	/* Remove the xattr entry and tree root which has already be set*/
-	memset((void *)xs->here, 0, sizeof(struct ocfs2_xattr_entry));
-	memset(val, 0, size);
+	if (!needed_space)
+		return 0;
 
-	ret = ocfs2_journal_dirty(handle, vb->vb_bh);
-	if (ret < 0)
-		mlog_errno(ret);
-out:
-	return ret;
+	free_space = free_start -
+		sizeof(struct ocfs2_xattr_header) -
+		(num_entries * sizeof(struct ocfs2_xattr_entry)) -
+		OCFS2_XATTR_HEADER_GAP;
+	if (free_space < 0)
+		return -EIO;
+	if (free_space < needed_space)
+		return -ENOSPC;
+
+	return 0;
 }
 
-static int ocfs2_xattr_update_entry(struct inode *inode,
-				    handle_t *handle,
-				    struct ocfs2_xattr_info *xi,
-				    struct ocfs2_xattr_search *xs,
-				    struct ocfs2_xattr_value_buf *vb,
-				    size_t offs)
+static int ocfs2_xa_journal_access(handle_t *handle, struct ocfs2_xa_loc *loc,
+				   int type)
 {
-	int ret;
+	return loc->xl_ops->xlo_journal_access(handle, loc, type);
+}
 
-	ret = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh,
-			    OCFS2_JOURNAL_ACCESS_WRITE);
-	if (ret) {
-		mlog_errno(ret);
-		goto out;
-	}
+static void ocfs2_xa_journal_dirty(handle_t *handle, struct ocfs2_xa_loc *loc)
+{
+	loc->xl_ops->xlo_journal_dirty(handle, loc);
+}
 
-	xs->here->xe_name_offset = cpu_to_le16(offs);
-	xs->here->xe_value_size = cpu_to_le64(xi->value_len);
-	if (xi->value_len <= OCFS2_XATTR_INLINE_SIZE)
-		ocfs2_xattr_set_local(xs->here, 1);
-	else
-		ocfs2_xattr_set_local(xs->here, 0);
-	ocfs2_xattr_hash_entry(inode, xs->header, xs->here);
+/* Give a pointer into the storage for the given offset */
+static void *ocfs2_xa_offset_pointer(struct ocfs2_xa_loc *loc, int offset)
+{
+	BUG_ON(offset >= loc->xl_size);
+	return loc->xl_ops->xlo_offset_pointer(loc, offset);
+}
 
-	ret = ocfs2_journal_dirty(handle, vb->vb_bh);
-	if (ret < 0)
-		mlog_errno(ret);
-out:
-	return ret;
+/*
+ * Wipe the name+value pair and allow the storage to reclaim it.  This
+ * must be followed by either removal of the entry or a call to
+ * ocfs2_xa_add_namevalue().
+ */
+static void ocfs2_xa_wipe_namevalue(struct ocfs2_xa_loc *loc)
+{
+	loc->xl_ops->xlo_wipe_namevalue(loc);
 }
 
 /*
- * ocfs2_xattr_set_value_outside()
- *
- * Set large size value in B tree.
+ * Find lowest offset to a name+value pair.  This is the start of our
+ * downward-growing free space.
  */
-static int ocfs2_xattr_set_value_outside(struct inode *inode,
-					 struct ocfs2_xattr_info *xi,
-					 struct ocfs2_xattr_search *xs,
-					 struct ocfs2_xattr_set_ctxt *ctxt,
-					 struct ocfs2_xattr_value_buf *vb,
-					 size_t offs)
+static int ocfs2_xa_get_free_start(struct ocfs2_xa_loc *loc)
 {
-	size_t name_len = strlen(xi->name);
-	void *val = xs->base + offs;
-	struct ocfs2_xattr_value_root *xv = NULL;
-	size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
-	int ret = 0;
+	return loc->xl_ops->xlo_get_free_start(loc);
+}
 
-	memset(val, 0, size);
-	memcpy(val, xi->name, name_len);
-	xv = (struct ocfs2_xattr_value_root *)
-		(val + OCFS2_XATTR_SIZE(name_len));
-	xv->xr_clusters = 0;
-	xv->xr_last_eb_blk = 0;
-	xv->xr_list.l_tree_depth = 0;
-	xv->xr_list.l_count = cpu_to_le16(1);
-	xv->xr_list.l_next_free_rec = 0;
-	vb->vb_xv = xv;
-
-	ret = ocfs2_xattr_value_truncate(inode, vb, xi->value_len, ctxt);
-	if (ret < 0) {
-		mlog_errno(ret);
-		return ret;
+/* Can we reuse loc->xl_entry for xi? */
+static int ocfs2_xa_can_reuse_entry(struct ocfs2_xa_loc *loc,
+				    struct ocfs2_xattr_info *xi)
+{
+	return loc->xl_ops->xlo_can_reuse(loc, xi);
+}
+
+/* How much free space is needed to set the new value */
+static int ocfs2_xa_check_space(struct ocfs2_xa_loc *loc,
+				struct ocfs2_xattr_info *xi)
+{
+	return loc->xl_ops->xlo_check_space(loc, xi);
+}
+
+static void ocfs2_xa_add_entry(struct ocfs2_xa_loc *loc, u32 name_hash)
+{
+	loc->xl_ops->xlo_add_entry(loc, name_hash);
+	loc->xl_entry->xe_name_hash = cpu_to_le32(name_hash);
+	/*
+	 * We can't leave the new entry's xe_name_offset at zero or
+	 * add_namevalue() will go nuts.  We set it to the size of our
+	 * storage so that it can never be less than any other entry.
+	 */
+	loc->xl_entry->xe_name_offset = cpu_to_le16(loc->xl_size);
+}
+
+static void ocfs2_xa_add_namevalue(struct ocfs2_xa_loc *loc,
+				   struct ocfs2_xattr_info *xi)
+{
+	int size = namevalue_size_xi(xi);
+	int nameval_offset;
+	char *nameval_buf;
+
+	loc->xl_ops->xlo_add_namevalue(loc, size);
+	loc->xl_entry->xe_value_size = cpu_to_le64(xi->xi_value_len);
+	loc->xl_entry->xe_name_len = xi->xi_name_len;
+	ocfs2_xattr_set_type(loc->xl_entry, xi->xi_name_index);
+	ocfs2_xattr_set_local(loc->xl_entry,
+			      xi->xi_value_len <= OCFS2_XATTR_INLINE_SIZE);
+
+	nameval_offset = le16_to_cpu(loc->xl_entry->xe_name_offset);
+	nameval_buf = ocfs2_xa_offset_pointer(loc, nameval_offset);
+	memset(nameval_buf, 0, size);
+	memcpy(nameval_buf, xi->xi_name, xi->xi_name_len);
+}
+
+static void ocfs2_xa_fill_value_buf(struct ocfs2_xa_loc *loc,
+				    struct ocfs2_xattr_value_buf *vb)
+{
+	int nameval_offset = le16_to_cpu(loc->xl_entry->xe_name_offset);
+	int name_size = OCFS2_XATTR_SIZE(loc->xl_entry->xe_name_len);
+
+	/* Value bufs are for value trees */
+	BUG_ON(ocfs2_xattr_is_local(loc->xl_entry));
+	BUG_ON(namevalue_size_xe(loc->xl_entry) !=
+	       (name_size + OCFS2_XATTR_ROOT_SIZE));
+
+	loc->xl_ops->xlo_fill_value_buf(loc, vb);
+	vb->vb_xv =
+		(struct ocfs2_xattr_value_root *)ocfs2_xa_offset_pointer(loc,
+							nameval_offset +
+							name_size);
+}
+
+static int ocfs2_xa_block_journal_access(handle_t *handle,
+					 struct ocfs2_xa_loc *loc, int type)
+{
+	struct buffer_head *bh = loc->xl_storage;
+	ocfs2_journal_access_func access;
+
+	if (loc->xl_size == (bh->b_size -
+			     offsetof(struct ocfs2_xattr_block,
+				      xb_attrs.xb_header)))
+		access = ocfs2_journal_access_xb;
+	else
+		access = ocfs2_journal_access_di;
+	return access(handle, INODE_CACHE(loc->xl_inode), bh, type);
+}
+
+static void ocfs2_xa_block_journal_dirty(handle_t *handle,
+					 struct ocfs2_xa_loc *loc)
+{
+	struct buffer_head *bh = loc->xl_storage;
+
+	ocfs2_journal_dirty(handle, bh);
+}
+
+static void *ocfs2_xa_block_offset_pointer(struct ocfs2_xa_loc *loc,
+					   int offset)
+{
+	return (char *)loc->xl_header + offset;
+}
+
+static int ocfs2_xa_block_can_reuse(struct ocfs2_xa_loc *loc,
+				    struct ocfs2_xattr_info *xi)
+{
+	/*
+	 * Block storage is strict.  If the sizes aren't exact, we will
+	 * remove the old one and reinsert the new.
+	 */
+	return namevalue_size_xe(loc->xl_entry) ==
+		namevalue_size_xi(xi);
+}
+
+static int ocfs2_xa_block_get_free_start(struct ocfs2_xa_loc *loc)
+{
+	struct ocfs2_xattr_header *xh = loc->xl_header;
+	int i, count = le16_to_cpu(xh->xh_count);
+	int offset, free_start = loc->xl_size;
+
+	for (i = 0; i < count; i++) {
+		offset = le16_to_cpu(xh->xh_entries[i].xe_name_offset);
+		if (offset < free_start)
+			free_start = offset;
 	}
-	ret = ocfs2_xattr_update_entry(inode, ctxt->handle, xi, xs, vb, offs);
-	if (ret < 0) {
-		mlog_errno(ret);
-		return ret;
+
+	return free_start;
+}
+
+static int ocfs2_xa_block_check_space(struct ocfs2_xa_loc *loc,
+				      struct ocfs2_xattr_info *xi)
+{
+	int count = le16_to_cpu(loc->xl_header->xh_count);
+	int free_start = ocfs2_xa_get_free_start(loc);
+	int needed_space = ocfs2_xi_entry_usage(xi);
+
+	/*
+	 * Block storage will reclaim the original entry before inserting
+	 * the new value, so we only need the difference.  If the new
+	 * entry is smaller than the old one, we don't need anything.
+	 */
+	if (loc->xl_entry) {
+		/* Don't need space if we're reusing! */
+		if (ocfs2_xa_can_reuse_entry(loc, xi))
+			needed_space = 0;
+		else
+			needed_space -= ocfs2_xe_entry_usage(loc->xl_entry);
 	}
-	ret = __ocfs2_xattr_set_value_outside(inode, ctxt->handle, vb,
-					      xi->value, xi->value_len);
-	if (ret < 0)
-		mlog_errno(ret);
+	if (needed_space < 0)
+		needed_space = 0;
+	return ocfs2_xa_check_space_helper(needed_space, free_start, count);
+}
 
-	return ret;
+/*
+ * Block storage for xattrs keeps the name+value pairs compacted.  When
+ * we remove one, we have to shift any that preceded it towards the end.
+ */
+static void ocfs2_xa_block_wipe_namevalue(struct ocfs2_xa_loc *loc)
+{
+	int i, offset;
+	int namevalue_offset, first_namevalue_offset, namevalue_size;
+	struct ocfs2_xattr_entry *entry = loc->xl_entry;
+	struct ocfs2_xattr_header *xh = loc->xl_header;
+	int count = le16_to_cpu(xh->xh_count);
+
+	namevalue_offset = le16_to_cpu(entry->xe_name_offset);
+	namevalue_size = namevalue_size_xe(entry);
+	first_namevalue_offset = ocfs2_xa_get_free_start(loc);
+
+	/* Shift the name+value pairs */
+	memmove((char *)xh + first_namevalue_offset + namevalue_size,
+		(char *)xh + first_namevalue_offset,
+		namevalue_offset - first_namevalue_offset);
+	memset((char *)xh + first_namevalue_offset, 0, namevalue_size);
+
+	/* Now tell xh->xh_entries about it */
+	for (i = 0; i < count; i++) {
+		offset = le16_to_cpu(xh->xh_entries[i].xe_name_offset);
+		if (offset <= namevalue_offset)
+			le16_add_cpu(&xh->xh_entries[i].xe_name_offset,
+				     namevalue_size);
+	}
+
+	/*
+	 * Note that we don't update xh_free_start or xh_name_value_len
+	 * because they're not used in block-stored xattrs.
+	 */
+}
+
+static void ocfs2_xa_block_add_entry(struct ocfs2_xa_loc *loc, u32 name_hash)
+{
+	int count = le16_to_cpu(loc->xl_header->xh_count);
+	loc->xl_entry = &(loc->xl_header->xh_entries[count]);
+	le16_add_cpu(&loc->xl_header->xh_count, 1);
+	memset(loc->xl_entry, 0, sizeof(struct ocfs2_xattr_entry));
+}
+
+static void ocfs2_xa_block_add_namevalue(struct ocfs2_xa_loc *loc, int size)
+{
+	int free_start = ocfs2_xa_get_free_start(loc);
+
+	loc->xl_entry->xe_name_offset = cpu_to_le16(free_start - size);
+}
+
+static void ocfs2_xa_block_fill_value_buf(struct ocfs2_xa_loc *loc,
+					  struct ocfs2_xattr_value_buf *vb)
+{
+	struct buffer_head *bh = loc->xl_storage;
+
+	if (loc->xl_size == (bh->b_size -
+			     offsetof(struct ocfs2_xattr_block,
+				      xb_attrs.xb_header)))
+		vb->vb_access = ocfs2_journal_access_xb;
+	else
+		vb->vb_access = ocfs2_journal_access_di;
+	vb->vb_bh = bh;
 }
 
 /*
- * ocfs2_xattr_set_entry_local()
- *
- * Set, replace or remove extended attribute in local.
+ * Operations for xattrs stored in blocks.  This includes inline inode
+ * storage and unindexed ocfs2_xattr_blocks.
  */
-static void ocfs2_xattr_set_entry_local(struct inode *inode,
-					struct ocfs2_xattr_info *xi,
-					struct ocfs2_xattr_search *xs,
-					struct ocfs2_xattr_entry *last,
-					size_t min_offs)
+static const struct ocfs2_xa_loc_operations ocfs2_xa_block_loc_ops = {
+	.xlo_journal_access	= ocfs2_xa_block_journal_access,
+	.xlo_journal_dirty	= ocfs2_xa_block_journal_dirty,
+	.xlo_offset_pointer	= ocfs2_xa_block_offset_pointer,
+	.xlo_check_space	= ocfs2_xa_block_check_space,
+	.xlo_can_reuse		= ocfs2_xa_block_can_reuse,
+	.xlo_get_free_start	= ocfs2_xa_block_get_free_start,
+	.xlo_wipe_namevalue	= ocfs2_xa_block_wipe_namevalue,
+	.xlo_add_entry		= ocfs2_xa_block_add_entry,
+	.xlo_add_namevalue	= ocfs2_xa_block_add_namevalue,
+	.xlo_fill_value_buf	= ocfs2_xa_block_fill_value_buf,
+};
+
+static int ocfs2_xa_bucket_journal_access(handle_t *handle,
+					  struct ocfs2_xa_loc *loc, int type)
 {
-	size_t name_len = strlen(xi->name);
-	int i;
+	struct ocfs2_xattr_bucket *bucket = loc->xl_storage;
 
-	if (xi->value && xs->not_found) {
-		/* Insert the new xattr entry. */
-		le16_add_cpu(&xs->header->xh_count, 1);
-		ocfs2_xattr_set_type(last, xi->name_index);
-		ocfs2_xattr_set_local(last, 1);
-		last->xe_name_len = name_len;
-	} else {
-		void *first_val;
-		void *val;
-		size_t offs, size;
-
-		first_val = xs->base + min_offs;
-		offs = le16_to_cpu(xs->here->xe_name_offset);
-		val = xs->base + offs;
-
-		if (le64_to_cpu(xs->here->xe_value_size) >
-		    OCFS2_XATTR_INLINE_SIZE)
-			size = OCFS2_XATTR_SIZE(name_len) +
-				OCFS2_XATTR_ROOT_SIZE;
+	return ocfs2_xattr_bucket_journal_access(handle, bucket, type);
+}
+
+static void ocfs2_xa_bucket_journal_dirty(handle_t *handle,
+					  struct ocfs2_xa_loc *loc)
+{
+	struct ocfs2_xattr_bucket *bucket = loc->xl_storage;
+
+	ocfs2_xattr_bucket_journal_dirty(handle, bucket);
+}
+
+static void *ocfs2_xa_bucket_offset_pointer(struct ocfs2_xa_loc *loc,
+					    int offset)
+{
+	struct ocfs2_xattr_bucket *bucket = loc->xl_storage;
+	int block, block_offset;
+
+	/* The header is at the front of the bucket */
+	block = offset >> loc->xl_inode->i_sb->s_blocksize_bits;
+	block_offset = offset % loc->xl_inode->i_sb->s_blocksize;
+
+	return bucket_block(bucket, block) + block_offset;
+}
+
+static int ocfs2_xa_bucket_can_reuse(struct ocfs2_xa_loc *loc,
+				     struct ocfs2_xattr_info *xi)
+{
+	return namevalue_size_xe(loc->xl_entry) >=
+		namevalue_size_xi(xi);
+}
+
+static int ocfs2_xa_bucket_get_free_start(struct ocfs2_xa_loc *loc)
+{
+	struct ocfs2_xattr_bucket *bucket = loc->xl_storage;
+	return le16_to_cpu(bucket_xh(bucket)->xh_free_start);
+}
+
+static int ocfs2_bucket_align_free_start(struct super_block *sb,
+					 int free_start, int size)
+{
+	/*
+	 * We need to make sure that the name+value pair fits within
+	 * one block.
+	 */
+	if (((free_start - size) >> sb->s_blocksize_bits) !=
+	    ((free_start - 1) >> sb->s_blocksize_bits))
+		free_start -= free_start % sb->s_blocksize;
+
+	return free_start;
+}
+
+static int ocfs2_xa_bucket_check_space(struct ocfs2_xa_loc *loc,
+				       struct ocfs2_xattr_info *xi)
+{
+	int rc;
+	int count = le16_to_cpu(loc->xl_header->xh_count);
+	int free_start = ocfs2_xa_get_free_start(loc);
+	int needed_space = ocfs2_xi_entry_usage(xi);
+	int size = namevalue_size_xi(xi);
+	struct super_block *sb = loc->xl_inode->i_sb;
+
+	/*
+	 * Bucket storage does not reclaim name+value pairs it cannot
+	 * reuse.  They live as holes until the bucket fills, and then
+	 * the bucket is defragmented.  However, the bucket can reclaim
+	 * the ocfs2_xattr_entry.
+	 */
+	if (loc->xl_entry) {
+		/* Don't need space if we're reusing! */
+		if (ocfs2_xa_can_reuse_entry(loc, xi))
+			needed_space = 0;
 		else
-			size = OCFS2_XATTR_SIZE(name_len) +
-			OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size));
-
-		if (xi->value && size == OCFS2_XATTR_SIZE(name_len) +
-				OCFS2_XATTR_SIZE(xi->value_len)) {
-			/* The old and the new value have the
-			   same size. Just replace the value. */
-			ocfs2_xattr_set_local(xs->here, 1);
-			xs->here->xe_value_size = cpu_to_le64(xi->value_len);
-			/* Clear value bytes. */
-			memset(val + OCFS2_XATTR_SIZE(name_len),
-			       0,
-			       OCFS2_XATTR_SIZE(xi->value_len));
-			memcpy(val + OCFS2_XATTR_SIZE(name_len),
-			       xi->value,
-			       xi->value_len);
-			return;
-		}
-		/* Remove the old name+value. */
-		memmove(first_val + size, first_val, val - first_val);
-		memset(first_val, 0, size);
-		xs->here->xe_name_hash = 0;
-		xs->here->xe_name_offset = 0;
-		ocfs2_xattr_set_local(xs->here, 1);
-		xs->here->xe_value_size = 0;
-
-		min_offs += size;
-
-		/* Adjust all value offsets. */
-		last = xs->header->xh_entries;
-		for (i = 0 ; i < le16_to_cpu(xs->header->xh_count); i++) {
-			size_t o = le16_to_cpu(last->xe_name_offset);
-
-			if (o < offs)
-				last->xe_name_offset = cpu_to_le16(o + size);
-			last += 1;
-		}
+			needed_space -= sizeof(struct ocfs2_xattr_entry);
+	}
+	BUG_ON(needed_space < 0);
 
-		if (!xi->value) {
-			/* Remove the old entry. */
-			last -= 1;
-			memmove(xs->here, xs->here + 1,
-				(void *)last - (void *)xs->here);
-			memset(last, 0, sizeof(struct ocfs2_xattr_entry));
-			le16_add_cpu(&xs->header->xh_count, -1);
-		}
+	if (free_start < size) {
+		if (needed_space)
+			return -ENOSPC;
+	} else {
+		/*
+		 * First we check if it would fit in the first place.
+		 * Below, we align the free start to a block.  This may
+		 * slide us below the minimum gap.  By checking unaligned
+		 * first, we avoid that error.
+		 */
+		rc = ocfs2_xa_check_space_helper(needed_space, free_start,
+						 count);
+		if (rc)
+			return rc;
+		free_start = ocfs2_bucket_align_free_start(sb, free_start,
+							   size);
 	}
-	if (xi->value) {
-		/* Insert the new name+value. */
-		size_t size = OCFS2_XATTR_SIZE(name_len) +
-				OCFS2_XATTR_SIZE(xi->value_len);
-		void *val = xs->base + min_offs - size;
+	return ocfs2_xa_check_space_helper(needed_space, free_start, count);
+}
 
-		xs->here->xe_name_offset = cpu_to_le16(min_offs - size);
-		memset(val, 0, size);
-		memcpy(val, xi->name, name_len);
-		memcpy(val + OCFS2_XATTR_SIZE(name_len),
-		       xi->value,
-		       xi->value_len);
-		xs->here->xe_value_size = cpu_to_le64(xi->value_len);
-		ocfs2_xattr_set_local(xs->here, 1);
-		ocfs2_xattr_hash_entry(inode, xs->header, xs->here);
+static void ocfs2_xa_bucket_wipe_namevalue(struct ocfs2_xa_loc *loc)
+{
+	le16_add_cpu(&loc->xl_header->xh_name_value_len,
+		     -namevalue_size_xe(loc->xl_entry));
+}
+
+static void ocfs2_xa_bucket_add_entry(struct ocfs2_xa_loc *loc, u32 name_hash)
+{
+	struct ocfs2_xattr_header *xh = loc->xl_header;
+	int count = le16_to_cpu(xh->xh_count);
+	int low = 0, high = count - 1, tmp;
+	struct ocfs2_xattr_entry *tmp_xe;
+
+	/*
+	 * We keep buckets sorted by name_hash, so we need to find
+	 * our insert place.
+	 */
+	while (low <= high && count) {
+		tmp = (low + high) / 2;
+		tmp_xe = &xh->xh_entries[tmp];
+
+		if (name_hash > le32_to_cpu(tmp_xe->xe_name_hash))
+			low = tmp + 1;
+		else if (name_hash < le32_to_cpu(tmp_xe->xe_name_hash))
+			high = tmp - 1;
+		else {
+			low = tmp;
+			break;
+		}
 	}
 
-	return;
+	if (low != count)
+		memmove(&xh->xh_entries[low + 1],
+			&xh->xh_entries[low],
+			((count - low) * sizeof(struct ocfs2_xattr_entry)));
+
+	le16_add_cpu(&xh->xh_count, 1);
+	loc->xl_entry = &xh->xh_entries[low];
+	memset(loc->xl_entry, 0, sizeof(struct ocfs2_xattr_entry));
+}
+
+static void ocfs2_xa_bucket_add_namevalue(struct ocfs2_xa_loc *loc, int size)
+{
+	int free_start = ocfs2_xa_get_free_start(loc);
+	struct ocfs2_xattr_header *xh = loc->xl_header;
+	struct super_block *sb = loc->xl_inode->i_sb;
+	int nameval_offset;
+
+	free_start = ocfs2_bucket_align_free_start(sb, free_start, size);
+	nameval_offset = free_start - size;
+	loc->xl_entry->xe_name_offset = cpu_to_le16(nameval_offset);
+	xh->xh_free_start = cpu_to_le16(nameval_offset);
+	le16_add_cpu(&xh->xh_name_value_len, size);
+
+}
+
+static void ocfs2_xa_bucket_fill_value_buf(struct ocfs2_xa_loc *loc,
+					   struct ocfs2_xattr_value_buf *vb)
+{
+	struct ocfs2_xattr_bucket *bucket = loc->xl_storage;
+	struct super_block *sb = loc->xl_inode->i_sb;
+	int nameval_offset = le16_to_cpu(loc->xl_entry->xe_name_offset);
+	int size = namevalue_size_xe(loc->xl_entry);
+	int block_offset = nameval_offset >> sb->s_blocksize_bits;
+
+	/* Values are not allowed to straddle block boundaries */
+	BUG_ON(block_offset !=
+	       ((nameval_offset + size - 1) >> sb->s_blocksize_bits));
+	/* We expect the bucket to be filled in */
+	BUG_ON(!bucket->bu_bhs[block_offset]);
+
+	vb->vb_access = ocfs2_journal_access;
+	vb->vb_bh = bucket->bu_bhs[block_offset];
+}
+
+/* Operations for xattrs stored in buckets. */
+static const struct ocfs2_xa_loc_operations ocfs2_xa_bucket_loc_ops = {
+	.xlo_journal_access	= ocfs2_xa_bucket_journal_access,
+	.xlo_journal_dirty	= ocfs2_xa_bucket_journal_dirty,
+	.xlo_offset_pointer	= ocfs2_xa_bucket_offset_pointer,
+	.xlo_check_space	= ocfs2_xa_bucket_check_space,
+	.xlo_can_reuse		= ocfs2_xa_bucket_can_reuse,
+	.xlo_get_free_start	= ocfs2_xa_bucket_get_free_start,
+	.xlo_wipe_namevalue	= ocfs2_xa_bucket_wipe_namevalue,
+	.xlo_add_entry		= ocfs2_xa_bucket_add_entry,
+	.xlo_add_namevalue	= ocfs2_xa_bucket_add_namevalue,
+	.xlo_fill_value_buf	= ocfs2_xa_bucket_fill_value_buf,
+};
+
+static unsigned int ocfs2_xa_value_clusters(struct ocfs2_xa_loc *loc)
+{
+	struct ocfs2_xattr_value_buf vb;
+
+	if (ocfs2_xattr_is_local(loc->xl_entry))
+		return 0;
+
+	ocfs2_xa_fill_value_buf(loc, &vb);
+	return le32_to_cpu(vb.vb_xv->xr_clusters);
+}
+
+static int ocfs2_xa_value_truncate(struct ocfs2_xa_loc *loc, u64 bytes,
+				   struct ocfs2_xattr_set_ctxt *ctxt)
+{
+	int trunc_rc, access_rc;
+	struct ocfs2_xattr_value_buf vb;
+
+	ocfs2_xa_fill_value_buf(loc, &vb);
+	trunc_rc = ocfs2_xattr_value_truncate(loc->xl_inode, &vb, bytes,
+					      ctxt);
+
+	/*
+	 * The caller of ocfs2_xa_value_truncate() has already called
+	 * ocfs2_xa_journal_access on the loc.  However, The truncate code
+	 * calls ocfs2_extend_trans().  This may commit the previous
+	 * transaction and open a new one.  If this is a bucket, truncate
+	 * could leave only vb->vb_bh set up for journaling.  Meanwhile,
+	 * the caller is expecting to dirty the entire bucket.  So we must
+	 * reset the journal work.  We do this even if truncate has failed,
+	 * as it could have failed after committing the extend.
+	 */
+	access_rc = ocfs2_xa_journal_access(ctxt->handle, loc,
+					    OCFS2_JOURNAL_ACCESS_WRITE);
+
+	/* Errors in truncate take precedence */
+	return trunc_rc ? trunc_rc : access_rc;
+}
+
+static void ocfs2_xa_remove_entry(struct ocfs2_xa_loc *loc)
+{
+	int index, count;
+	struct ocfs2_xattr_header *xh = loc->xl_header;
+	struct ocfs2_xattr_entry *entry = loc->xl_entry;
+
+	ocfs2_xa_wipe_namevalue(loc);
+	loc->xl_entry = NULL;
+
+	le16_add_cpu(&xh->xh_count, -1);
+	count = le16_to_cpu(xh->xh_count);
+
+	/*
+	 * Only zero out the entry if there are more remaining.  This is
+	 * important for an empty bucket, as it keeps track of the
+	 * bucket's hash value.  It doesn't hurt empty block storage.
+	 */
+	if (count) {
+		index = ((char *)entry - (char *)&xh->xh_entries) /
+			sizeof(struct ocfs2_xattr_entry);
+		memmove(&xh->xh_entries[index], &xh->xh_entries[index + 1],
+			(count - index) * sizeof(struct ocfs2_xattr_entry));
+		memset(&xh->xh_entries[count], 0,
+		       sizeof(struct ocfs2_xattr_entry));
+	}
 }
 
 /*
- * ocfs2_xattr_set_entry()
+ * If we have a problem adjusting the size of an external value during
+ * ocfs2_xa_prepare_entry() or ocfs2_xa_remove(), we may have an xattr
+ * in an intermediate state.  For example, the value may be partially
+ * truncated.
+ *
+ * If the value tree hasn't changed, the extend/truncate went nowhere.
+ * We have nothing to do.  The caller can treat it as a straight error.
  *
- * Set extended attribute entry into inode or block.
+ * If the value tree got partially truncated, we now have a corrupted
+ * extended attribute.  We're going to wipe its entry and leak the
+ * clusters.  Better to leak some storage than leave a corrupt entry.
  *
- * If extended attribute value size > OCFS2_XATTR_INLINE_SIZE,
- * We first insert tree root(ocfs2_xattr_value_root) with set_entry_local(),
- * then set value in B tree with set_value_outside().
+ * If the value tree grew, it obviously didn't grow enough for the
+ * new entry.  We're not going to try and reclaim those clusters either.
+ * If there was already an external value there (orig_clusters != 0),
+ * the new clusters are attached safely and we can just leave the old
+ * value in place.  If there was no external value there, we remove
+ * the entry.
+ *
+ * This way, the xattr block we store in the journal will be consistent.
+ * If the size change broke because of the journal, no changes will hit
+ * disk anyway.
  */
-static int ocfs2_xattr_set_entry(struct inode *inode,
-				 struct ocfs2_xattr_info *xi,
-				 struct ocfs2_xattr_search *xs,
-				 struct ocfs2_xattr_set_ctxt *ctxt,
-				 int flag)
-{
-	struct ocfs2_xattr_entry *last;
-	struct ocfs2_inode_info *oi = OCFS2_I(inode);
-	struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
-	size_t min_offs = xs->end - xs->base, name_len = strlen(xi->name);
-	size_t size_l = 0;
-	handle_t *handle = ctxt->handle;
-	int free, i, ret;
-	struct ocfs2_xattr_info xi_l = {
-		.name_index = xi->name_index,
-		.name = xi->name,
-		.value = xi->value,
-		.value_len = xi->value_len,
-	};
-	struct ocfs2_xattr_value_buf vb = {
-		.vb_bh = xs->xattr_bh,
-		.vb_access = ocfs2_journal_access_di,
-	};
+static void ocfs2_xa_cleanup_value_truncate(struct ocfs2_xa_loc *loc,
+					    const char *what,
+					    unsigned int orig_clusters)
+{
+	unsigned int new_clusters = ocfs2_xa_value_clusters(loc);
+	char *nameval_buf = ocfs2_xa_offset_pointer(loc,
+				le16_to_cpu(loc->xl_entry->xe_name_offset));
+
+	if (new_clusters < orig_clusters) {
+		mlog(ML_ERROR,
+		     "Partial truncate while %s xattr %.*s.  Leaking "
+		     "%u clusters and removing the entry\n",
+		     what, loc->xl_entry->xe_name_len, nameval_buf,
+		     orig_clusters - new_clusters);
+		ocfs2_xa_remove_entry(loc);
+	} else if (!orig_clusters) {
+		mlog(ML_ERROR,
+		     "Unable to allocate an external value for xattr "
+		     "%.*s safely.  Leaking %u clusters and removing the "
+		     "entry\n",
+		     loc->xl_entry->xe_name_len, nameval_buf,
+		     new_clusters - orig_clusters);
+		ocfs2_xa_remove_entry(loc);
+	} else if (new_clusters > orig_clusters)
+		mlog(ML_ERROR,
+		     "Unable to grow xattr %.*s safely.  %u new clusters "
+		     "have been added, but the value will not be "
+		     "modified\n",
+		     loc->xl_entry->xe_name_len, nameval_buf,
+		     new_clusters - orig_clusters);
+}
+
+static int ocfs2_xa_remove(struct ocfs2_xa_loc *loc,
+			   struct ocfs2_xattr_set_ctxt *ctxt)
+{
+	int rc = 0;
+	unsigned int orig_clusters;
+
+	if (!ocfs2_xattr_is_local(loc->xl_entry)) {
+		orig_clusters = ocfs2_xa_value_clusters(loc);
+		rc = ocfs2_xa_value_truncate(loc, 0, ctxt);
+		if (rc) {
+			mlog_errno(rc);
+			/*
+			 * Since this is remove, we can return 0 if
+			 * ocfs2_xa_cleanup_value_truncate() is going to
+			 * wipe the entry anyway.  So we check the
+			 * cluster count as well.
+			 */
+			if (orig_clusters != ocfs2_xa_value_clusters(loc))
+				rc = 0;
+			ocfs2_xa_cleanup_value_truncate(loc, "removing",
+							orig_clusters);
+			if (rc)
+				goto out;
+		}
+	}
 
-	if (!(flag & OCFS2_INLINE_XATTR_FL)) {
-		BUG_ON(xs->xattr_bh == xs->inode_bh);
-		vb.vb_access = ocfs2_journal_access_xb;
-	} else
-		BUG_ON(xs->xattr_bh != xs->inode_bh);
+	ocfs2_xa_remove_entry(loc);
 
-	/* Compute min_offs, last and free space. */
-	last = xs->header->xh_entries;
+out:
+	return rc;
+}
 
-	for (i = 0 ; i < le16_to_cpu(xs->header->xh_count); i++) {
-		size_t offs = le16_to_cpu(last->xe_name_offset);
-		if (offs < min_offs)
-			min_offs = offs;
-		last += 1;
-	}
+static void ocfs2_xa_install_value_root(struct ocfs2_xa_loc *loc)
+{
+	int name_size = OCFS2_XATTR_SIZE(loc->xl_entry->xe_name_len);
+	char *nameval_buf;
 
-	free = min_offs - ((void *)last - xs->base) - OCFS2_XATTR_HEADER_GAP;
-	if (free < 0)
-		return -EIO;
+	nameval_buf = ocfs2_xa_offset_pointer(loc,
+				le16_to_cpu(loc->xl_entry->xe_name_offset));
+	memcpy(nameval_buf + name_size, &def_xv, OCFS2_XATTR_ROOT_SIZE);
+}
 
-	if (!xs->not_found) {
-		size_t size = 0;
-		if (ocfs2_xattr_is_local(xs->here))
-			size = OCFS2_XATTR_SIZE(name_len) +
-			OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size));
-		else
-			size = OCFS2_XATTR_SIZE(name_len) +
-				OCFS2_XATTR_ROOT_SIZE;
-		free += (size + sizeof(struct ocfs2_xattr_entry));
-	}
-	/* Check free space in inode or block */
-	if (xi->value && xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
-		if (free < sizeof(struct ocfs2_xattr_entry) +
-			   OCFS2_XATTR_SIZE(name_len) +
-			   OCFS2_XATTR_ROOT_SIZE) {
-			ret = -ENOSPC;
-			goto out;
+/*
+ * Take an existing entry and make it ready for the new value.  This
+ * won't allocate space, but it may free space.  It should be ready for
+ * ocfs2_xa_prepare_entry() to finish the work.
+ */
+static int ocfs2_xa_reuse_entry(struct ocfs2_xa_loc *loc,
+				struct ocfs2_xattr_info *xi,
+				struct ocfs2_xattr_set_ctxt *ctxt)
+{
+	int rc = 0;
+	int name_size = OCFS2_XATTR_SIZE(xi->xi_name_len);
+	unsigned int orig_clusters;
+	char *nameval_buf;
+	int xe_local = ocfs2_xattr_is_local(loc->xl_entry);
+	int xi_local = xi->xi_value_len <= OCFS2_XATTR_INLINE_SIZE;
+
+	BUG_ON(OCFS2_XATTR_SIZE(loc->xl_entry->xe_name_len) !=
+	       name_size);
+
+	nameval_buf = ocfs2_xa_offset_pointer(loc,
+				le16_to_cpu(loc->xl_entry->xe_name_offset));
+	if (xe_local) {
+		memset(nameval_buf + name_size, 0,
+		       namevalue_size_xe(loc->xl_entry) - name_size);
+		if (!xi_local)
+			ocfs2_xa_install_value_root(loc);
+	} else {
+		orig_clusters = ocfs2_xa_value_clusters(loc);
+		if (xi_local) {
+			rc = ocfs2_xa_value_truncate(loc, 0, ctxt);
+			if (rc < 0)
+				mlog_errno(rc);
+			else
+				memset(nameval_buf + name_size, 0,
+				       namevalue_size_xe(loc->xl_entry) -
+				       name_size);
+		} else if (le64_to_cpu(loc->xl_entry->xe_value_size) >
+			   xi->xi_value_len) {
+			rc = ocfs2_xa_value_truncate(loc, xi->xi_value_len,
+						     ctxt);
+			if (rc < 0)
+				mlog_errno(rc);
 		}
-		size_l = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
-		xi_l.value = (void *)&def_xv;
-		xi_l.value_len = OCFS2_XATTR_ROOT_SIZE;
-	} else if (xi->value) {
-		if (free < sizeof(struct ocfs2_xattr_entry) +
-			   OCFS2_XATTR_SIZE(name_len) +
-			   OCFS2_XATTR_SIZE(xi->value_len)) {
-			ret = -ENOSPC;
+
+		if (rc) {
+			ocfs2_xa_cleanup_value_truncate(loc, "reusing",
+							orig_clusters);
 			goto out;
 		}
 	}
 
-	if (!xs->not_found) {
-		/* For existing extended attribute */
-		size_t size = OCFS2_XATTR_SIZE(name_len) +
-			OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size));
-		size_t offs = le16_to_cpu(xs->here->xe_name_offset);
-		void *val = xs->base + offs;
+	loc->xl_entry->xe_value_size = cpu_to_le64(xi->xi_value_len);
+	ocfs2_xattr_set_local(loc->xl_entry, xi_local);
 
-		if (ocfs2_xattr_is_local(xs->here) && size == size_l) {
-			/* Replace existing local xattr with tree root */
-			ret = ocfs2_xattr_set_value_outside(inode, xi, xs,
-							    ctxt, &vb, offs);
-			if (ret < 0)
-				mlog_errno(ret);
-			goto out;
-		} else if (!ocfs2_xattr_is_local(xs->here)) {
-			/* For existing xattr which has value outside */
-			vb.vb_xv = (struct ocfs2_xattr_value_root *)
-				(val + OCFS2_XATTR_SIZE(name_len));
+out:
+	return rc;
+}
 
-			if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
-				/*
-				 * If new value need set outside also,
-				 * first truncate old value to new value,
-				 * then set new value with set_value_outside().
-				 */
-				ret = ocfs2_xattr_value_truncate(inode,
-								 &vb,
-								 xi->value_len,
-								 ctxt);
-				if (ret < 0) {
-					mlog_errno(ret);
-					goto out;
-				}
+/*
+ * Prepares loc->xl_entry to receive the new xattr.  This includes
+ * properly setting up the name+value pair region.  If loc->xl_entry
+ * already exists, it will take care of modifying it appropriately.
+ *
+ * Note that this modifies the data.  You did journal_access already,
+ * right?
+ */
+static int ocfs2_xa_prepare_entry(struct ocfs2_xa_loc *loc,
+				  struct ocfs2_xattr_info *xi,
+				  u32 name_hash,
+				  struct ocfs2_xattr_set_ctxt *ctxt)
+{
+	int rc = 0;
+	unsigned int orig_clusters;
+	__le64 orig_value_size = 0;
 
-				ret = ocfs2_xattr_update_entry(inode,
-							       handle,
-							       xi,
-							       xs,
-							       &vb,
-							       offs);
-				if (ret < 0) {
-					mlog_errno(ret);
-					goto out;
-				}
+	rc = ocfs2_xa_check_space(loc, xi);
+	if (rc)
+		goto out;
 
-				ret = __ocfs2_xattr_set_value_outside(inode,
-								handle,
-								&vb,
-								xi->value,
-								xi->value_len);
-				if (ret < 0)
-					mlog_errno(ret);
+	if (loc->xl_entry) {
+		if (ocfs2_xa_can_reuse_entry(loc, xi)) {
+			orig_value_size = loc->xl_entry->xe_value_size;
+			rc = ocfs2_xa_reuse_entry(loc, xi, ctxt);
+			if (rc)
+				goto out;
+			goto alloc_value;
+		}
+
+		if (!ocfs2_xattr_is_local(loc->xl_entry)) {
+			orig_clusters = ocfs2_xa_value_clusters(loc);
+			rc = ocfs2_xa_value_truncate(loc, 0, ctxt);
+			if (rc) {
+				mlog_errno(rc);
+				ocfs2_xa_cleanup_value_truncate(loc,
+								"overwriting",
+								orig_clusters);
 				goto out;
-			} else {
-				/*
-				 * If new value need set in local,
-				 * just trucate old value to zero.
-				 */
-				 ret = ocfs2_xattr_value_truncate(inode,
-								  &vb,
-								  0,
-								  ctxt);
-				if (ret < 0)
-					mlog_errno(ret);
 			}
 		}
+		ocfs2_xa_wipe_namevalue(loc);
+	} else
+		ocfs2_xa_add_entry(loc, name_hash);
+
+	/*
+	 * If we get here, we have a blank entry.  Fill it.  We grow our
+	 * name+value pair back from the end.
+	 */
+	ocfs2_xa_add_namevalue(loc, xi);
+	if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE)
+		ocfs2_xa_install_value_root(loc);
+
+alloc_value:
+	if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
+		orig_clusters = ocfs2_xa_value_clusters(loc);
+		rc = ocfs2_xa_value_truncate(loc, xi->xi_value_len, ctxt);
+		if (rc < 0) {
+			ctxt->set_abort = 1;
+			ocfs2_xa_cleanup_value_truncate(loc, "growing",
+							orig_clusters);
+			/*
+			 * If we were growing an existing value,
+			 * ocfs2_xa_cleanup_value_truncate() won't remove
+			 * the entry. We need to restore the original value
+			 * size.
+			 */
+			if (loc->xl_entry) {
+				BUG_ON(!orig_value_size);
+				loc->xl_entry->xe_value_size = orig_value_size;
+			}
+			mlog_errno(rc);
+		}
 	}
 
-	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), xs->inode_bh,
+out:
+	return rc;
+}
+
+/*
+ * Store the value portion of the name+value pair.  This will skip
+ * values that are stored externally.  Their tree roots were set up
+ * by ocfs2_xa_prepare_entry().
+ */
+static int ocfs2_xa_store_value(struct ocfs2_xa_loc *loc,
+				struct ocfs2_xattr_info *xi,
+				struct ocfs2_xattr_set_ctxt *ctxt)
+{
+	int rc = 0;
+	int nameval_offset = le16_to_cpu(loc->xl_entry->xe_name_offset);
+	int name_size = OCFS2_XATTR_SIZE(xi->xi_name_len);
+	char *nameval_buf;
+	struct ocfs2_xattr_value_buf vb;
+
+	nameval_buf = ocfs2_xa_offset_pointer(loc, nameval_offset);
+	if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
+		ocfs2_xa_fill_value_buf(loc, &vb);
+		rc = __ocfs2_xattr_set_value_outside(loc->xl_inode,
+						     ctxt->handle, &vb,
+						     xi->xi_value,
+						     xi->xi_value_len);
+	} else
+		memcpy(nameval_buf + name_size, xi->xi_value, xi->xi_value_len);
+
+	return rc;
+}
+
+static int ocfs2_xa_set(struct ocfs2_xa_loc *loc,
+			struct ocfs2_xattr_info *xi,
+			struct ocfs2_xattr_set_ctxt *ctxt)
+{
+	int ret;
+	u32 name_hash = ocfs2_xattr_name_hash(loc->xl_inode, xi->xi_name,
+					      xi->xi_name_len);
+
+	ret = ocfs2_xa_journal_access(ctxt->handle, loc,
 				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
 	}
 
-	if (!(flag & OCFS2_INLINE_XATTR_FL)) {
-		ret = vb.vb_access(handle, INODE_CACHE(inode), vb.vb_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
-		if (ret) {
-			mlog_errno(ret);
-			goto out;
-		}
-	}
-
 	/*
-	 * Set value in local, include set tree root in local.
-	 * This is the first step for value size >INLINE_SIZE.
+	 * From here on out, everything is going to modify the buffer a
+	 * little.  Errors are going to leave the xattr header in a
+	 * sane state.  Thus, even with errors we dirty the sucker.
 	 */
-	ocfs2_xattr_set_entry_local(inode, &xi_l, xs, last, min_offs);
 
-	if (!(flag & OCFS2_INLINE_XATTR_FL)) {
-		ret = ocfs2_journal_dirty(handle, xs->xattr_bh);
-		if (ret < 0) {
-			mlog_errno(ret);
-			goto out;
-		}
+	/* Don't worry, we are never called with !xi_value and !xl_entry */
+	if (!xi->xi_value) {
+		ret = ocfs2_xa_remove(loc, ctxt);
+		goto out_dirty;
 	}
 
-	if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) &&
-	    (flag & OCFS2_INLINE_XATTR_FL)) {
-		struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-		unsigned int xattrsize = osb->s_xattr_inline_size;
-
-		/*
-		 * Adjust extent record count or inline data size
-		 * to reserve space for extended attribute.
-		 */
-		if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
-			struct ocfs2_inline_data *idata = &di->id2.i_data;
-			le16_add_cpu(&idata->id_count, -xattrsize);
-		} else if (!(ocfs2_inode_is_fast_symlink(inode))) {
-			struct ocfs2_extent_list *el = &di->id2.i_list;
-			le16_add_cpu(&el->l_count, -(xattrsize /
-					sizeof(struct ocfs2_extent_rec)));
-		}
-		di->i_xattr_inline_size = cpu_to_le16(xattrsize);
+	ret = ocfs2_xa_prepare_entry(loc, xi, name_hash, ctxt);
+	if (ret) {
+		if (ret != -ENOSPC)
+			mlog_errno(ret);
+		goto out_dirty;
 	}
-	/* Update xattr flag */
-	spin_lock(&oi->ip_lock);
-	oi->ip_dyn_features |= flag;
-	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
-	spin_unlock(&oi->ip_lock);
 
-	ret = ocfs2_journal_dirty(handle, xs->inode_bh);
-	if (ret < 0)
+	ret = ocfs2_xa_store_value(loc, xi, ctxt);
+	if (ret)
 		mlog_errno(ret);
 
-	if (!ret && xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
-		/*
-		 * Set value outside in B tree.
-		 * This is the second step for value size > INLINE_SIZE.
-		 */
-		size_t offs = le16_to_cpu(xs->here->xe_name_offset);
-		ret = ocfs2_xattr_set_value_outside(inode, xi, xs, ctxt,
-						    &vb, offs);
-		if (ret < 0) {
-			int ret2;
+out_dirty:
+	ocfs2_xa_journal_dirty(ctxt->handle, loc);
 
-			mlog_errno(ret);
-			/*
-			 * If set value outside failed, we have to clean
-			 * the junk tree root we have already set in local.
-			 */
-			ret2 = ocfs2_xattr_cleanup(inode, ctxt->handle,
-						   xi, xs, &vb, offs);
-			if (ret2 < 0)
-				mlog_errno(ret2);
-		}
-	}
 out:
 	return ret;
 }
 
+static void ocfs2_init_dinode_xa_loc(struct ocfs2_xa_loc *loc,
+				     struct inode *inode,
+				     struct buffer_head *bh,
+				     struct ocfs2_xattr_entry *entry)
+{
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
+
+	BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_XATTR_FL));
+
+	loc->xl_inode = inode;
+	loc->xl_ops = &ocfs2_xa_block_loc_ops;
+	loc->xl_storage = bh;
+	loc->xl_entry = entry;
+	loc->xl_size = le16_to_cpu(di->i_xattr_inline_size);
+	loc->xl_header =
+		(struct ocfs2_xattr_header *)(bh->b_data + bh->b_size -
+					      loc->xl_size);
+}
+
+static void ocfs2_init_xattr_block_xa_loc(struct ocfs2_xa_loc *loc,
+					  struct inode *inode,
+					  struct buffer_head *bh,
+					  struct ocfs2_xattr_entry *entry)
+{
+	struct ocfs2_xattr_block *xb =
+		(struct ocfs2_xattr_block *)bh->b_data;
+
+	BUG_ON(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED);
+
+	loc->xl_inode = inode;
+	loc->xl_ops = &ocfs2_xa_block_loc_ops;
+	loc->xl_storage = bh;
+	loc->xl_header = &(xb->xb_attrs.xb_header);
+	loc->xl_entry = entry;
+	loc->xl_size = bh->b_size - offsetof(struct ocfs2_xattr_block,
+					     xb_attrs.xb_header);
+}
+
+static void ocfs2_init_xattr_bucket_xa_loc(struct ocfs2_xa_loc *loc,
+					   struct ocfs2_xattr_bucket *bucket,
+					   struct ocfs2_xattr_entry *entry)
+{
+	loc->xl_inode = bucket->bu_inode;
+	loc->xl_ops = &ocfs2_xa_bucket_loc_ops;
+	loc->xl_storage = bucket;
+	loc->xl_header = bucket_xh(bucket);
+	loc->xl_entry = entry;
+	loc->xl_size = OCFS2_XATTR_BUCKET_SIZE;
+}
+
 /*
  * In xattr remove, if it is stored outside and refcounted, we may have
  * the chance to split the refcount tree. So need the allocators.
@@ -1945,7 +2487,10 @@ static int ocfs2_xattr_free_block(struct inode *inode,
 	xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
 	blk = le64_to_cpu(xb->xb_blkno);
 	bit = le16_to_cpu(xb->xb_suballoc_bit);
-	bg_blkno = ocfs2_which_suballoc_group(blk, bit);
+	if (xb->xb_suballoc_loc)
+		bg_blkno = le64_to_cpu(xb->xb_suballoc_loc);
+	else
+		bg_blkno = ocfs2_which_suballoc_group(blk, bit);
 
 	xb_alloc_inode = ocfs2_get_system_file_inode(osb,
 				EXTENT_ALLOC_SYSTEM_INODE,
@@ -2060,9 +2605,7 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)
 	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
 	spin_unlock(&oi->ip_lock);
 
-	ret = ocfs2_journal_dirty(handle, di_bh);
-	if (ret < 0)
-		mlog_errno(ret);
+	ocfs2_journal_dirty(handle, di_bh);
 out_commit:
 	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
 out:
@@ -2149,6 +2692,53 @@ static int ocfs2_xattr_ibody_find(struct inode *inode,
 	return 0;
 }
 
+static int ocfs2_xattr_ibody_init(struct inode *inode,
+				  struct buffer_head *di_bh,
+				  struct ocfs2_xattr_set_ctxt *ctxt)
+{
+	int ret;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	unsigned int xattrsize = osb->s_xattr_inline_size;
+
+	if (!ocfs2_xattr_has_space_inline(inode, di)) {
+		ret = -ENOSPC;
+		goto out;
+	}
+
+	ret = ocfs2_journal_access_di(ctxt->handle, INODE_CACHE(inode), di_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * Adjust extent record count or inline data size
+	 * to reserve space for extended attribute.
+	 */
+	if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+		struct ocfs2_inline_data *idata = &di->id2.i_data;
+		le16_add_cpu(&idata->id_count, -xattrsize);
+	} else if (!(ocfs2_inode_is_fast_symlink(inode))) {
+		struct ocfs2_extent_list *el = &di->id2.i_list;
+		le16_add_cpu(&el->l_count, -(xattrsize /
+					     sizeof(struct ocfs2_extent_rec)));
+	}
+	di->i_xattr_inline_size = cpu_to_le16(xattrsize);
+
+	spin_lock(&oi->ip_lock);
+	oi->ip_dyn_features |= OCFS2_INLINE_XATTR_FL|OCFS2_HAS_XATTR_FL;
+	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
+	spin_unlock(&oi->ip_lock);
+
+	ocfs2_journal_dirty(ctxt->handle, di_bh);
+
+out:
+	return ret;
+}
+
 /*
  * ocfs2_xattr_ibody_set()
  *
@@ -2160,9 +2750,10 @@ static int ocfs2_xattr_ibody_set(struct inode *inode,
 				 struct ocfs2_xattr_search *xs,
 				 struct ocfs2_xattr_set_ctxt *ctxt)
 {
+	int ret;
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
-	int ret;
+	struct ocfs2_xa_loc loc;
 
 	if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE)
 		return -ENOSPC;
@@ -2175,8 +2766,25 @@ static int ocfs2_xattr_ibody_set(struct inode *inode,
 		}
 	}
 
-	ret = ocfs2_xattr_set_entry(inode, xi, xs, ctxt,
-				(OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL));
+	if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) {
+		ret = ocfs2_xattr_ibody_init(inode, xs->inode_bh, ctxt);
+		if (ret) {
+			if (ret != -ENOSPC)
+				mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	ocfs2_init_dinode_xa_loc(&loc, inode, xs->inode_bh,
+				 xs->not_found ? NULL : xs->here);
+	ret = ocfs2_xa_set(&loc, xi, ctxt);
+	if (ret) {
+		if (ret != -ENOSPC)
+			mlog_errno(ret);
+		goto out;
+	}
+	xs->here = loc.xl_entry;
+
 out:
 	up_write(&oi->ip_alloc_sem);
 
@@ -2236,32 +2844,30 @@ cleanup:
 	return ret;
 }
 
-static int ocfs2_create_xattr_block(handle_t *handle,
-				    struct inode *inode,
+static int ocfs2_create_xattr_block(struct inode *inode,
 				    struct buffer_head *inode_bh,
-				    struct ocfs2_alloc_context *meta_ac,
-				    struct buffer_head **ret_bh,
-				    int indexed)
+				    struct ocfs2_xattr_set_ctxt *ctxt,
+				    int indexed,
+				    struct buffer_head **ret_bh)
 {
 	int ret;
 	u16 suballoc_bit_start;
 	u32 num_got;
-	u64 first_blkno;
+	u64 suballoc_loc, first_blkno;
 	struct ocfs2_dinode *di =  (struct ocfs2_dinode *)inode_bh->b_data;
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct buffer_head *new_bh = NULL;
 	struct ocfs2_xattr_block *xblk;
 
-	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), inode_bh,
-				      OCFS2_JOURNAL_ACCESS_CREATE);
+	ret = ocfs2_journal_access_di(ctxt->handle, INODE_CACHE(inode),
+				      inode_bh, OCFS2_JOURNAL_ACCESS_CREATE);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto end;
 	}
 
-	ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1,
-				   &suballoc_bit_start, &num_got,
-				   &first_blkno);
+	ret = ocfs2_claim_metadata(ctxt->handle, ctxt->meta_ac, 1,
+				   &suballoc_loc, &suballoc_bit_start,
+				   &num_got, &first_blkno);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto end;
@@ -2270,7 +2876,7 @@ static int ocfs2_create_xattr_block(handle_t *handle,
 	new_bh = sb_getblk(inode->i_sb, first_blkno);
 	ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), new_bh);
 
-	ret = ocfs2_journal_access_xb(handle, INODE_CACHE(inode),
+	ret = ocfs2_journal_access_xb(ctxt->handle, INODE_CACHE(inode),
 				      new_bh,
 				      OCFS2_JOURNAL_ACCESS_CREATE);
 	if (ret < 0) {
@@ -2282,11 +2888,12 @@ static int ocfs2_create_xattr_block(handle_t *handle,
 	xblk = (struct ocfs2_xattr_block *)new_bh->b_data;
 	memset(xblk, 0, inode->i_sb->s_blocksize);
 	strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE);
-	xblk->xb_suballoc_slot = cpu_to_le16(osb->slot_num);
+	xblk->xb_suballoc_slot = cpu_to_le16(ctxt->meta_ac->ac_alloc_slot);
+	xblk->xb_suballoc_loc = cpu_to_le64(suballoc_loc);
 	xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start);
-	xblk->xb_fs_generation = cpu_to_le32(osb->fs_generation);
+	xblk->xb_fs_generation =
+		cpu_to_le32(OCFS2_SB(inode->i_sb)->fs_generation);
 	xblk->xb_blkno = cpu_to_le64(first_blkno);
-
 	if (indexed) {
 		struct ocfs2_xattr_tree_root *xr = &xblk->xb_attrs.xb_root;
 		xr->xt_clusters = cpu_to_le32(1);
@@ -2297,14 +2904,17 @@ static int ocfs2_create_xattr_block(handle_t *handle,
 		xr->xt_list.l_next_free_rec = cpu_to_le16(1);
 		xblk->xb_flags = cpu_to_le16(OCFS2_XATTR_INDEXED);
 	}
+	ocfs2_journal_dirty(ctxt->handle, new_bh);
 
-	ret = ocfs2_journal_dirty(handle, new_bh);
-	if (ret < 0) {
-		mlog_errno(ret);
-		goto end;
-	}
+	/* Add it to the inode */
 	di->i_xattr_loc = cpu_to_le64(first_blkno);
-	ocfs2_journal_dirty(handle, inode_bh);
+
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+	OCFS2_I(inode)->ip_dyn_features |= OCFS2_HAS_XATTR_FL;
+	di->i_dyn_features = cpu_to_le16(OCFS2_I(inode)->ip_dyn_features);
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+
+	ocfs2_journal_dirty(ctxt->handle, inode_bh);
 
 	*ret_bh = new_bh;
 	new_bh = NULL;
@@ -2326,13 +2936,13 @@ static int ocfs2_xattr_block_set(struct inode *inode,
 				 struct ocfs2_xattr_set_ctxt *ctxt)
 {
 	struct buffer_head *new_bh = NULL;
-	handle_t *handle = ctxt->handle;
 	struct ocfs2_xattr_block *xblk = NULL;
 	int ret;
+	struct ocfs2_xa_loc loc;
 
 	if (!xs->xattr_bh) {
-		ret = ocfs2_create_xattr_block(handle, inode, xs->inode_bh,
-					       ctxt->meta_ac, &new_bh, 0);
+		ret = ocfs2_create_xattr_block(inode, xs->inode_bh, ctxt,
+					       0, &new_bh);
 		if (ret) {
 			mlog_errno(ret);
 			goto end;
@@ -2348,21 +2958,25 @@ static int ocfs2_xattr_block_set(struct inode *inode,
 		xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data;
 
 	if (!(le16_to_cpu(xblk->xb_flags) & OCFS2_XATTR_INDEXED)) {
-		/* Set extended attribute into external block */
-		ret = ocfs2_xattr_set_entry(inode, xi, xs, ctxt,
-					    OCFS2_HAS_XATTR_FL);
-		if (!ret || ret != -ENOSPC)
-			goto end;
+		ocfs2_init_xattr_block_xa_loc(&loc, inode, xs->xattr_bh,
+					      xs->not_found ? NULL : xs->here);
 
-		ret = ocfs2_xattr_create_index_block(inode, xs, ctxt);
-		if (ret)
+		ret = ocfs2_xa_set(&loc, xi, ctxt);
+		if (!ret)
+			xs->here = loc.xl_entry;
+		else if ((ret != -ENOSPC) || ctxt->set_abort)
 			goto end;
+		else {
+			ret = ocfs2_xattr_create_index_block(inode, xs, ctxt);
+			if (ret)
+				goto end;
+		}
 	}
 
-	ret = ocfs2_xattr_set_entry_index_block(inode, xi, xs, ctxt);
+	if (le16_to_cpu(xblk->xb_flags) & OCFS2_XATTR_INDEXED)
+		ret = ocfs2_xattr_set_entry_index_block(inode, xi, xs, ctxt);
 
 end:
-
 	return ret;
 }
 
@@ -2371,7 +2985,6 @@ static int ocfs2_xattr_can_be_in_inode(struct inode *inode,
 				       struct ocfs2_xattr_info *xi,
 				       struct ocfs2_xattr_search *xs)
 {
-	u64 value_size;
 	struct ocfs2_xattr_entry *last;
 	int free, i;
 	size_t min_offs = xs->end - xs->base;
@@ -2394,13 +3007,7 @@ static int ocfs2_xattr_can_be_in_inode(struct inode *inode,
 
 	BUG_ON(!xs->not_found);
 
-	if (xi->value_len > OCFS2_XATTR_INLINE_SIZE)
-		value_size = OCFS2_XATTR_ROOT_SIZE;
-	else
-		value_size = OCFS2_XATTR_SIZE(xi->value_len);
-
-	if (free >= sizeof(struct ocfs2_xattr_entry) +
-		   OCFS2_XATTR_SIZE(strlen(xi->name)) + value_size)
+	if (free >= (sizeof(struct ocfs2_xattr_entry) + namevalue_size_xi(xi)))
 		return 1;
 
 	return 0;
@@ -2424,7 +3031,7 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
 	char *base = NULL;
 	int name_offset, name_len = 0;
 	u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb,
-						    xi->value_len);
+						    xi->xi_value_len);
 	u64 value_size;
 
 	/*
@@ -2432,14 +3039,14 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
 	 * No matter whether we replace an old one or add a new one,
 	 * we need this for writing.
 	 */
-	if (xi->value_len > OCFS2_XATTR_INLINE_SIZE)
+	if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE)
 		credits += new_clusters *
 			   ocfs2_clusters_to_blocks(inode->i_sb, 1);
 
 	if (xis->not_found && xbs->not_found) {
 		credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
 
-		if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
+		if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
 			clusters_add += new_clusters;
 			credits += ocfs2_calc_extend_credits(inode->i_sb,
 							&def_xv.xv.xr_list,
@@ -2484,7 +3091,7 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
 	 * The credits for removing the value tree will be extended
 	 * by ocfs2_remove_extent itself.
 	 */
-	if (!xi->value) {
+	if (!xi->xi_value) {
 		if (!ocfs2_xattr_is_local(xe))
 			credits += ocfs2_remove_extent_credits(inode->i_sb);
 
@@ -2514,7 +3121,7 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
 		}
 	}
 
-	if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
+	if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
 		/* the new values will be stored outside. */
 		u32 old_clusters = 0;
 
@@ -2547,9 +3154,10 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
 		 * value, we don't need any allocation, otherwise we have
 		 * to guess metadata allocation.
 		 */
-		if ((ocfs2_xattr_is_local(xe) && value_size >= xi->value_len) ||
+		if ((ocfs2_xattr_is_local(xe) &&
+		     (value_size >= xi->xi_value_len)) ||
 		    (!ocfs2_xattr_is_local(xe) &&
-		     OCFS2_XATTR_ROOT_SIZE >= xi->value_len))
+		     OCFS2_XATTR_ROOT_SIZE >= xi->xi_value_len))
 			goto out;
 	}
 
@@ -2639,7 +3247,7 @@ static int ocfs2_init_xattr_set_ctxt(struct inode *inode,
 
 	meta_add += extra_meta;
 	mlog(0, "Set xattr %s, reserve meta blocks = %d, clusters = %d, "
-	     "credits = %d\n", xi->name, meta_add, clusters_add, *credits);
+	     "credits = %d\n", xi->xi_name, meta_add, clusters_add, *credits);
 
 	if (meta_add) {
 		ret = ocfs2_reserve_new_metadata_blocks(osb, meta_add,
@@ -2679,7 +3287,7 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
 {
 	int ret = 0, credits, old_found;
 
-	if (!xi->value) {
+	if (!xi->xi_value) {
 		/* Remove existing extended attribute */
 		if (!xis->not_found)
 			ret = ocfs2_xattr_ibody_set(inode, xi, xis, ctxt);
@@ -2693,8 +3301,8 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
 			 * If succeed and that extended attribute existing in
 			 * external block, then we will remove it.
 			 */
-			xi->value = NULL;
-			xi->value_len = 0;
+			xi->xi_value = NULL;
+			xi->xi_value_len = 0;
 
 			old_found = xis->not_found;
 			xis->not_found = -ENODATA;
@@ -2712,18 +3320,17 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
 				goto out;
 			}
 
-			ret = ocfs2_extend_trans(ctxt->handle, credits +
-					ctxt->handle->h_buffer_credits);
+			ret = ocfs2_extend_trans(ctxt->handle, credits);
 			if (ret) {
 				mlog_errno(ret);
 				goto out;
 			}
 			ret = ocfs2_xattr_block_set(inode, xi, xbs, ctxt);
-		} else if (ret == -ENOSPC) {
+		} else if ((ret == -ENOSPC) && !ctxt->set_abort) {
 			if (di->i_xattr_loc && !xbs->xattr_bh) {
 				ret = ocfs2_xattr_block_find(inode,
-							     xi->name_index,
-							     xi->name, xbs);
+							     xi->xi_name_index,
+							     xi->xi_name, xbs);
 				if (ret)
 					goto out;
 
@@ -2743,8 +3350,7 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
 					goto out;
 				}
 
-				ret = ocfs2_extend_trans(ctxt->handle, credits +
-					ctxt->handle->h_buffer_credits);
+				ret = ocfs2_extend_trans(ctxt->handle, credits);
 				if (ret) {
 					mlog_errno(ret);
 					goto out;
@@ -2762,8 +3368,8 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
 				 * If succeed and that extended attribute
 				 * existing in inode, we will remove it.
 				 */
-				xi->value = NULL;
-				xi->value_len = 0;
+				xi->xi_value = NULL;
+				xi->xi_value_len = 0;
 				xbs->not_found = -ENODATA;
 				ret = ocfs2_calc_xattr_set_need(inode,
 								di,
@@ -2778,8 +3384,7 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
 					goto out;
 				}
 
-				ret = ocfs2_extend_trans(ctxt->handle, credits +
-						ctxt->handle->h_buffer_credits);
+				ret = ocfs2_extend_trans(ctxt->handle, credits);
 				if (ret) {
 					mlog_errno(ret);
 					goto out;
@@ -2829,10 +3434,11 @@ int ocfs2_xattr_set_handle(handle_t *handle,
 	int ret;
 
 	struct ocfs2_xattr_info xi = {
-		.name_index = name_index,
-		.name = name,
-		.value = value,
-		.value_len = value_len,
+		.xi_name_index = name_index,
+		.xi_name = name,
+		.xi_name_len = strlen(name),
+		.xi_value = value,
+		.xi_value_len = value_len,
 	};
 
 	struct ocfs2_xattr_search xis = {
@@ -2912,10 +3518,11 @@ int ocfs2_xattr_set(struct inode *inode,
 	struct ocfs2_refcount_tree *ref_tree = NULL;
 
 	struct ocfs2_xattr_info xi = {
-		.name_index = name_index,
-		.name = name,
-		.value = value,
-		.value_len = value_len,
+		.xi_name_index = name_index,
+		.xi_name = name,
+		.xi_name_len = strlen(name),
+		.xi_value = value,
+		.xi_value_len = value_len,
 	};
 
 	struct ocfs2_xattr_search xis = {
@@ -3647,7 +4254,6 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
 	u32 bit_off, len;
 	u64 blkno;
 	handle_t *handle = ctxt->handle;
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 	struct buffer_head *xb_bh = xs->xattr_bh;
 	struct ocfs2_xattr_block *xb =
@@ -3675,7 +4281,7 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
 		goto out;
 	}
 
-	ret = __ocfs2_claim_clusters(osb, handle, ctxt->data_ac,
+	ret = __ocfs2_claim_clusters(handle, ctxt->data_ac,
 				     1, 1, &bit_off, &len);
 	if (ret) {
 		mlog_errno(ret);
@@ -3759,7 +4365,7 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
 				     struct ocfs2_xattr_bucket *bucket)
 {
 	int ret, i;
-	size_t end, offset, len, value_len;
+	size_t end, offset, len;
 	struct ocfs2_xattr_header *xh;
 	char *entries, *buf, *bucket_buf = NULL;
 	u64 blkno = bucket_blkno(bucket);
@@ -3813,12 +4419,7 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
 	end = OCFS2_XATTR_BUCKET_SIZE;
 	for (i = 0; i < le16_to_cpu(xh->xh_count); i++, xe++) {
 		offset = le16_to_cpu(xe->xe_name_offset);
-		if (ocfs2_xattr_is_local(xe))
-			value_len = OCFS2_XATTR_SIZE(
-					le64_to_cpu(xe->xe_value_size));
-		else
-			value_len = OCFS2_XATTR_ROOT_SIZE;
-		len = OCFS2_XATTR_SIZE(xe->xe_name_len) + value_len;
+		len = namevalue_size_xe(xe);
 
 		/*
 		 * We must make sure that the name/value pair
@@ -4007,7 +4608,7 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
 				    int new_bucket_head)
 {
 	int ret, i;
-	int count, start, len, name_value_len = 0, xe_len, name_offset = 0;
+	int count, start, len, name_value_len = 0, name_offset = 0;
 	struct ocfs2_xattr_bucket *s_bucket = NULL, *t_bucket = NULL;
 	struct ocfs2_xattr_header *xh;
 	struct ocfs2_xattr_entry *xe;
@@ -4098,13 +4699,7 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
 	name_value_len = 0;
 	for (i = 0; i < start; i++) {
 		xe = &xh->xh_entries[i];
-		xe_len = OCFS2_XATTR_SIZE(xe->xe_name_len);
-		if (ocfs2_xattr_is_local(xe))
-			xe_len +=
-			   OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
-		else
-			xe_len += OCFS2_XATTR_ROOT_SIZE;
-		name_value_len += xe_len;
+		name_value_len += namevalue_size_xe(xe);
 		if (le16_to_cpu(xe->xe_name_offset) < name_offset)
 			name_offset = le16_to_cpu(xe->xe_name_offset);
 	}
@@ -4134,12 +4729,6 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
 	xh->xh_free_start = cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE);
 	for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
 		xe = &xh->xh_entries[i];
-		xe_len = OCFS2_XATTR_SIZE(xe->xe_name_len);
-		if (ocfs2_xattr_is_local(xe))
-			xe_len +=
-			   OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
-		else
-			xe_len += OCFS2_XATTR_ROOT_SIZE;
 		if (le16_to_cpu(xe->xe_name_offset) <
 		    le16_to_cpu(xh->xh_free_start))
 			xh->xh_free_start = xe->xe_name_offset;
@@ -4302,8 +4891,7 @@ static int ocfs2_mv_xattr_buckets(struct inode *inode, handle_t *handle,
 	 * We need to update the first bucket of the old extent and all
 	 * the buckets going to the new extent.
 	 */
-	credits = ((num_buckets + 1) * blks_per_bucket) +
-		handle->h_buffer_credits;
+	credits = ((num_buckets + 1) * blks_per_bucket);
 	ret = ocfs2_extend_trans(handle, credits);
 	if (ret) {
 		mlog_errno(ret);
@@ -4373,7 +4961,7 @@ static int ocfs2_divide_xattr_cluster(struct inode *inode,
 				      u32 *first_hash)
 {
 	u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
-	int ret, credits = 2 * blk_per_bucket + handle->h_buffer_credits;
+	int ret, credits = 2 * blk_per_bucket;
 
 	BUG_ON(OCFS2_XATTR_BUCKET_SIZE < OCFS2_SB(inode->i_sb)->s_clustersize);
 
@@ -4514,7 +5102,7 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
 		goto leave;
 	}
 
-	ret = __ocfs2_claim_clusters(osb, handle, ctxt->data_ac, 1,
+	ret = __ocfs2_claim_clusters(handle, ctxt->data_ac, 1,
 				     clusters_to_add, &bit_off, &num_bits);
 	if (ret < 0) {
 		if (ret != -ENOSPC)
@@ -4568,9 +5156,7 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
 		goto leave;
 	}
 
-	ret = ocfs2_journal_dirty(handle, root_bh);
-	if (ret < 0)
-		mlog_errno(ret);
+	ocfs2_journal_dirty(handle, root_bh);
 
 leave:
 	return ret;
@@ -4615,8 +5201,7 @@ static int ocfs2_extend_xattr_bucket(struct inode *inode,
 	 * existing bucket.  Then we add the last existing bucket, the
 	 * new bucket, and the first bucket (3 * blk_per_bucket).
 	 */
-	credits = (end_blk - target_blk) + (3 * blk_per_bucket) +
-		  handle->h_buffer_credits;
+	credits = (end_blk - target_blk) + (3 * blk_per_bucket);
 	ret = ocfs2_extend_trans(handle, credits);
 	if (ret) {
 		mlog_errno(ret);
@@ -4751,195 +5336,6 @@ static inline char *ocfs2_xattr_bucket_get_val(struct inode *inode,
 }
 
 /*
- * Handle the normal xattr set, including replace, delete and new.
- *
- * Note: "local" indicates the real data's locality. So we can't
- * just its bucket locality by its length.
- */
-static void ocfs2_xattr_set_entry_normal(struct inode *inode,
-					 struct ocfs2_xattr_info *xi,
-					 struct ocfs2_xattr_search *xs,
-					 u32 name_hash,
-					 int local)
-{
-	struct ocfs2_xattr_entry *last, *xe;
-	int name_len = strlen(xi->name);
-	struct ocfs2_xattr_header *xh = xs->header;
-	u16 count = le16_to_cpu(xh->xh_count), start;
-	size_t blocksize = inode->i_sb->s_blocksize;
-	char *val;
-	size_t offs, size, new_size;
-
-	last = &xh->xh_entries[count];
-	if (!xs->not_found) {
-		xe = xs->here;
-		offs = le16_to_cpu(xe->xe_name_offset);
-		if (ocfs2_xattr_is_local(xe))
-			size = OCFS2_XATTR_SIZE(name_len) +
-			OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
-		else
-			size = OCFS2_XATTR_SIZE(name_len) +
-			OCFS2_XATTR_SIZE(OCFS2_XATTR_ROOT_SIZE);
-
-		/*
-		 * If the new value will be stored outside, xi->value has been
-		 * initalized as an empty ocfs2_xattr_value_root, and the same
-		 * goes with xi->value_len, so we can set new_size safely here.
-		 * See ocfs2_xattr_set_in_bucket.
-		 */
-		new_size = OCFS2_XATTR_SIZE(name_len) +
-			   OCFS2_XATTR_SIZE(xi->value_len);
-
-		le16_add_cpu(&xh->xh_name_value_len, -size);
-		if (xi->value) {
-			if (new_size > size)
-				goto set_new_name_value;
-
-			/* Now replace the old value with new one. */
-			if (local)
-				xe->xe_value_size = cpu_to_le64(xi->value_len);
-			else
-				xe->xe_value_size = 0;
-
-			val = ocfs2_xattr_bucket_get_val(inode,
-							 xs->bucket, offs);
-			memset(val + OCFS2_XATTR_SIZE(name_len), 0,
-			       size - OCFS2_XATTR_SIZE(name_len));
-			if (OCFS2_XATTR_SIZE(xi->value_len) > 0)
-				memcpy(val + OCFS2_XATTR_SIZE(name_len),
-				       xi->value, xi->value_len);
-
-			le16_add_cpu(&xh->xh_name_value_len, new_size);
-			ocfs2_xattr_set_local(xe, local);
-			return;
-		} else {
-			/*
-			 * Remove the old entry if there is more than one.
-			 * We don't remove the last entry so that we can
-			 * use it to indicate the hash value of the empty
-			 * bucket.
-			 */
-			last -= 1;
-			le16_add_cpu(&xh->xh_count, -1);
-			if (xh->xh_count) {
-				memmove(xe, xe + 1,
-					(void *)last - (void *)xe);
-				memset(last, 0,
-				       sizeof(struct ocfs2_xattr_entry));
-			} else
-				xh->xh_free_start =
-					cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE);
-
-			return;
-		}
-	} else {
-		/* find a new entry for insert. */
-		int low = 0, high = count - 1, tmp;
-		struct ocfs2_xattr_entry *tmp_xe;
-
-		while (low <= high && count) {
-			tmp = (low + high) / 2;
-			tmp_xe = &xh->xh_entries[tmp];
-
-			if (name_hash > le32_to_cpu(tmp_xe->xe_name_hash))
-				low = tmp + 1;
-			else if (name_hash <
-				 le32_to_cpu(tmp_xe->xe_name_hash))
-				high = tmp - 1;
-			else {
-				low = tmp;
-				break;
-			}
-		}
-
-		xe = &xh->xh_entries[low];
-		if (low != count)
-			memmove(xe + 1, xe, (void *)last - (void *)xe);
-
-		le16_add_cpu(&xh->xh_count, 1);
-		memset(xe, 0, sizeof(struct ocfs2_xattr_entry));
-		xe->xe_name_hash = cpu_to_le32(name_hash);
-		xe->xe_name_len = name_len;
-		ocfs2_xattr_set_type(xe, xi->name_index);
-	}
-
-set_new_name_value:
-	/* Insert the new name+value. */
-	size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_SIZE(xi->value_len);
-
-	/*
-	 * We must make sure that the name/value pair
-	 * exists in the same block.
-	 */
-	offs = le16_to_cpu(xh->xh_free_start);
-	start = offs - size;
-
-	if (start >> inode->i_sb->s_blocksize_bits !=
-	    (offs - 1) >> inode->i_sb->s_blocksize_bits) {
-		offs = offs - offs % blocksize;
-		xh->xh_free_start = cpu_to_le16(offs);
-	}
-
-	val = ocfs2_xattr_bucket_get_val(inode, xs->bucket, offs - size);
-	xe->xe_name_offset = cpu_to_le16(offs - size);
-
-	memset(val, 0, size);
-	memcpy(val, xi->name, name_len);
-	memcpy(val + OCFS2_XATTR_SIZE(name_len), xi->value, xi->value_len);
-
-	xe->xe_value_size = cpu_to_le64(xi->value_len);
-	ocfs2_xattr_set_local(xe, local);
-	xs->here = xe;
-	le16_add_cpu(&xh->xh_free_start, -size);
-	le16_add_cpu(&xh->xh_name_value_len, size);
-
-	return;
-}
-
-/*
- * Set the xattr entry in the specified bucket.
- * The bucket is indicated by xs->bucket and it should have the enough
- * space for the xattr insertion.
- */
-static int ocfs2_xattr_set_entry_in_bucket(struct inode *inode,
-					   handle_t *handle,
-					   struct ocfs2_xattr_info *xi,
-					   struct ocfs2_xattr_search *xs,
-					   u32 name_hash,
-					   int local)
-{
-	int ret;
-	u64 blkno;
-
-	mlog(0, "Set xattr entry len = %lu index = %d in bucket %llu\n",
-	     (unsigned long)xi->value_len, xi->name_index,
-	     (unsigned long long)bucket_blkno(xs->bucket));
-
-	if (!xs->bucket->bu_bhs[1]) {
-		blkno = bucket_blkno(xs->bucket);
-		ocfs2_xattr_bucket_relse(xs->bucket);
-		ret = ocfs2_read_xattr_bucket(xs->bucket, blkno);
-		if (ret) {
-			mlog_errno(ret);
-			goto out;
-		}
-	}
-
-	ret = ocfs2_xattr_bucket_journal_access(handle, xs->bucket,
-						OCFS2_JOURNAL_ACCESS_WRITE);
-	if (ret < 0) {
-		mlog_errno(ret);
-		goto out;
-	}
-
-	ocfs2_xattr_set_entry_normal(inode, xi, xs, name_hash, local);
-	ocfs2_xattr_bucket_journal_dirty(handle, xs->bucket);
-
-out:
-	return ret;
-}
-
-/*
  * Truncate the specified xe_off entry in xattr bucket.
  * bucket is indicated by header_bh and len is the new length.
  * Both the ocfs2_xattr_value_root and the entry will be updated here.
@@ -5009,66 +5405,6 @@ out:
 	return ret;
 }
 
-static int ocfs2_xattr_bucket_value_truncate_xs(struct inode *inode,
-					struct ocfs2_xattr_search *xs,
-					int len,
-					struct ocfs2_xattr_set_ctxt *ctxt)
-{
-	int ret, offset;
-	struct ocfs2_xattr_entry *xe = xs->here;
-	struct ocfs2_xattr_header *xh = (struct ocfs2_xattr_header *)xs->base;
-
-	BUG_ON(!xs->bucket->bu_bhs[0] || !xe || ocfs2_xattr_is_local(xe));
-
-	offset = xe - xh->xh_entries;
-	ret = ocfs2_xattr_bucket_value_truncate(inode, xs->bucket,
-						offset, len, ctxt);
-	if (ret)
-		mlog_errno(ret);
-
-	return ret;
-}
-
-static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode,
-						handle_t *handle,
-						struct ocfs2_xattr_search *xs,
-						char *val,
-						int value_len)
-{
-	int ret, offset, block_off;
-	struct ocfs2_xattr_value_root *xv;
-	struct ocfs2_xattr_entry *xe = xs->here;
-	struct ocfs2_xattr_header *xh = bucket_xh(xs->bucket);
-	void *base;
-	struct ocfs2_xattr_value_buf vb = {
-		.vb_access = ocfs2_journal_access,
-	};
-
-	BUG_ON(!xs->base || !xe || ocfs2_xattr_is_local(xe));
-
-	ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb, xh,
-						xe - xh->xh_entries,
-						&block_off,
-						&offset);
-	if (ret) {
-		mlog_errno(ret);
-		goto out;
-	}
-
-	base = bucket_block(xs->bucket, block_off);
-	xv = (struct ocfs2_xattr_value_root *)(base + offset +
-		 OCFS2_XATTR_SIZE(xe->xe_name_len));
-
-	vb.vb_xv = xv;
-	vb.vb_bh = xs->bucket->bu_bhs[block_off];
-	ret = __ocfs2_xattr_set_value_outside(inode, handle,
-					      &vb, val, value_len);
-	if (ret)
-		mlog_errno(ret);
-out:
-	return ret;
-}
-
 static int ocfs2_rm_xattr_cluster(struct inode *inode,
 				  struct buffer_head *root_bh,
 				  u64 blkno,
@@ -5141,12 +5477,7 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode,
 	}
 
 	le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, -len);
-
-	ret = ocfs2_journal_dirty(handle, root_bh);
-	if (ret) {
-		mlog_errno(ret);
-		goto out_commit;
-	}
+	ocfs2_journal_dirty(handle, root_bh);
 
 	ret = ocfs2_truncate_log_append(osb, handle, blkno, len);
 	if (ret)
@@ -5167,128 +5498,6 @@ out:
 	return ret;
 }
 
-static void ocfs2_xattr_bucket_remove_xs(struct inode *inode,
-					 handle_t *handle,
-					 struct ocfs2_xattr_search *xs)
-{
-	struct ocfs2_xattr_header *xh = bucket_xh(xs->bucket);
-	struct ocfs2_xattr_entry *last = &xh->xh_entries[
-						le16_to_cpu(xh->xh_count) - 1];
-	int ret = 0;
-
-	ret = ocfs2_xattr_bucket_journal_access(handle, xs->bucket,
-						OCFS2_JOURNAL_ACCESS_WRITE);
-	if (ret) {
-		mlog_errno(ret);
-		return;
-	}
-
-	/* Remove the old entry. */
-	memmove(xs->here, xs->here + 1,
-		(void *)last - (void *)xs->here);
-	memset(last, 0, sizeof(struct ocfs2_xattr_entry));
-	le16_add_cpu(&xh->xh_count, -1);
-
-	ocfs2_xattr_bucket_journal_dirty(handle, xs->bucket);
-}
-
-/*
- * Set the xattr name/value in the bucket specified in xs.
- *
- * As the new value in xi may be stored in the bucket or in an outside cluster,
- * we divide the whole process into 3 steps:
- * 1. insert name/value in the bucket(ocfs2_xattr_set_entry_in_bucket)
- * 2. truncate of the outside cluster(ocfs2_xattr_bucket_value_truncate_xs)
- * 3. Set the value to the outside cluster(ocfs2_xattr_bucket_set_value_outside)
- * 4. If the clusters for the new outside value can't be allocated, we need
- *    to free the xattr we allocated in set.
- */
-static int ocfs2_xattr_set_in_bucket(struct inode *inode,
-				     struct ocfs2_xattr_info *xi,
-				     struct ocfs2_xattr_search *xs,
-				     struct ocfs2_xattr_set_ctxt *ctxt)
-{
-	int ret, local = 1;
-	size_t value_len;
-	char *val = (char *)xi->value;
-	struct ocfs2_xattr_entry *xe = xs->here;
-	u32 name_hash = ocfs2_xattr_name_hash(inode, xi->name,
-					      strlen(xi->name));
-
-	if (!xs->not_found && !ocfs2_xattr_is_local(xe)) {
-		/*
-		 * We need to truncate the xattr storage first.
-		 *
-		 * If both the old and new value are stored to
-		 * outside block, we only need to truncate
-		 * the storage and then set the value outside.
-		 *
-		 * If the new value should be stored within block,
-		 * we should free all the outside block first and
-		 * the modification to the xattr block will be done
-		 * by following steps.
-		 */
-		if (xi->value_len > OCFS2_XATTR_INLINE_SIZE)
-			value_len = xi->value_len;
-		else
-			value_len = 0;
-
-		ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs,
-							   value_len,
-							   ctxt);
-		if (ret)
-			goto out;
-
-		if (value_len)
-			goto set_value_outside;
-	}
-
-	value_len = xi->value_len;
-	/* So we have to handle the inside block change now. */
-	if (value_len > OCFS2_XATTR_INLINE_SIZE) {
-		/*
-		 * If the new value will be stored outside of block,
-		 * initalize a new empty value root and insert it first.
-		 */
-		local = 0;
-		xi->value = &def_xv;
-		xi->value_len = OCFS2_XATTR_ROOT_SIZE;
-	}
-
-	ret = ocfs2_xattr_set_entry_in_bucket(inode, ctxt->handle, xi, xs,
-					      name_hash, local);
-	if (ret) {
-		mlog_errno(ret);
-		goto out;
-	}
-
-	if (value_len <= OCFS2_XATTR_INLINE_SIZE)
-		goto out;
-
-	/* allocate the space now for the outside block storage. */
-	ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs,
-						   value_len, ctxt);
-	if (ret) {
-		mlog_errno(ret);
-
-		if (xs->not_found) {
-			/*
-			 * We can't allocate enough clusters for outside
-			 * storage and we have allocated xattr already,
-			 * so need to remove it.
-			 */
-			ocfs2_xattr_bucket_remove_xs(inode, ctxt->handle, xs);
-		}
-		goto out;
-	}
-
-set_value_outside:
-	ret = ocfs2_xattr_bucket_set_value_outside(inode, ctxt->handle,
-						   xs, val, value_len);
-out:
-	return ret;
-}
-
 /*
  * check whether the xattr bucket is filled up with the same hash value.
  * If we want to insert the xattr with the same hash, return -ENOSPC.
@@ -5317,156 +5526,116 @@ static int ocfs2_check_xattr_bucket_collision(struct inode *inode,
 	return 0;
 }
 
-static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
-					     struct ocfs2_xattr_info *xi,
-					     struct ocfs2_xattr_search *xs,
-					     struct ocfs2_xattr_set_ctxt *ctxt)
+/*
+ * Try to set the entry in the current bucket.  If we fail, the caller
+ * will handle getting us another bucket.
+ */
+static int ocfs2_xattr_set_entry_bucket(struct inode *inode,
+					struct ocfs2_xattr_info *xi,
+					struct ocfs2_xattr_search *xs,
+					struct ocfs2_xattr_set_ctxt *ctxt)
 {
-	struct ocfs2_xattr_header *xh;
-	struct ocfs2_xattr_entry *xe;
-	u16 count, header_size, xh_free_start;
-	int free, max_free, need, old;
-	size_t value_size = 0, name_len = strlen(xi->name);
-	size_t blocksize = inode->i_sb->s_blocksize;
-	int ret, allocation = 0;
-
-	mlog_entry("Set xattr %s in xattr index block\n", xi->name);
-
-try_again:
-	xh = xs->header;
-	count = le16_to_cpu(xh->xh_count);
-	xh_free_start = le16_to_cpu(xh->xh_free_start);
-	header_size = sizeof(struct ocfs2_xattr_header) +
-			count * sizeof(struct ocfs2_xattr_entry);
-	max_free = OCFS2_XATTR_BUCKET_SIZE - header_size -
-		le16_to_cpu(xh->xh_name_value_len) - OCFS2_XATTR_HEADER_GAP;
-
-	mlog_bug_on_msg(header_size > blocksize, "bucket %llu has header size "
-			"of %u which exceed block size\n",
-			(unsigned long long)bucket_blkno(xs->bucket),
-			header_size);
+	int ret;
+	struct ocfs2_xa_loc loc;
 
-	if (xi->value && xi->value_len > OCFS2_XATTR_INLINE_SIZE)
-		value_size = OCFS2_XATTR_ROOT_SIZE;
-	else if (xi->value)
-		value_size = OCFS2_XATTR_SIZE(xi->value_len);
+	mlog_entry("Set xattr %s in xattr bucket\n", xi->xi_name);
 
-	if (xs->not_found)
-		need = sizeof(struct ocfs2_xattr_entry) +
-			OCFS2_XATTR_SIZE(name_len) + value_size;
-	else {
-		need = value_size + OCFS2_XATTR_SIZE(name_len);
+	ocfs2_init_xattr_bucket_xa_loc(&loc, xs->bucket,
+				       xs->not_found ? NULL : xs->here);
+	ret = ocfs2_xa_set(&loc, xi, ctxt);
+	if (!ret) {
+		xs->here = loc.xl_entry;
+		goto out;
+	}
+	if (ret != -ENOSPC) {
+		mlog_errno(ret);
+		goto out;
+	}
 
-		/*
-		 * We only replace the old value if the new length is smaller
-		 * than the old one. Otherwise we will allocate new space in the
-		 * bucket to store it.
-		 */
-		xe = xs->here;
-		if (ocfs2_xattr_is_local(xe))
-			old = OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
-		else
-			old = OCFS2_XATTR_SIZE(OCFS2_XATTR_ROOT_SIZE);
+	/* Ok, we need space.  Let's try defragmenting the bucket. */
+	ret = ocfs2_defrag_xattr_bucket(inode, ctxt->handle,
+					xs->bucket);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
 
-		if (old >= value_size)
-			need = 0;
+	ret = ocfs2_xa_set(&loc, xi, ctxt);
+	if (!ret) {
+		xs->here = loc.xl_entry;
+		goto out;
 	}
+	if (ret != -ENOSPC)
+		mlog_errno(ret);
 
-	free = xh_free_start - header_size - OCFS2_XATTR_HEADER_GAP;
-	/*
-	 * We need to make sure the new name/value pair
-	 * can exist in the same block.
-	 */
-	if (xh_free_start % blocksize < need)
-		free -= xh_free_start % blocksize;
-
-	mlog(0, "xs->not_found = %d, in xattr bucket %llu: free = %d, "
-	     "need = %d, max_free = %d, xh_free_start = %u, xh_name_value_len ="
-	     " %u\n", xs->not_found,
-	     (unsigned long long)bucket_blkno(xs->bucket),
-	     free, need, max_free, le16_to_cpu(xh->xh_free_start),
-	     le16_to_cpu(xh->xh_name_value_len));
-
-	if (free < need ||
-	    (xs->not_found &&
-	     count == ocfs2_xattr_max_xe_in_bucket(inode->i_sb))) {
-		if (need <= max_free &&
-		    count < ocfs2_xattr_max_xe_in_bucket(inode->i_sb)) {
-			/*
-			 * We can create the space by defragment. Since only the
-			 * name/value will be moved, the xe shouldn't be changed
-			 * in xs.
-			 */
-			ret = ocfs2_defrag_xattr_bucket(inode, ctxt->handle,
-							xs->bucket);
-			if (ret) {
-				mlog_errno(ret);
-				goto out;
-			}
 
-			xh_free_start = le16_to_cpu(xh->xh_free_start);
-			free = xh_free_start - header_size
-				- OCFS2_XATTR_HEADER_GAP;
-			if (xh_free_start % blocksize < need)
-				free -= xh_free_start % blocksize;
+out:
+	mlog_exit(ret);
+	return ret;
+}
 
-			if (free >= need)
-				goto xattr_set;
+static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
+					     struct ocfs2_xattr_info *xi,
+					     struct ocfs2_xattr_search *xs,
+					     struct ocfs2_xattr_set_ctxt *ctxt)
+{
+	int ret;
 
-			mlog(0, "Can't get enough space for xattr insert by "
-			     "defragment. Need %u bytes, but we have %d, so "
-			     "allocate new bucket for it.\n", need, free);
-		}
+	mlog_entry("Set xattr %s in xattr index block\n", xi->xi_name);
 
-		/*
-		 * We have to add new buckets or clusters and one
-		 * allocation should leave us enough space for insert.
-		 */
-		BUG_ON(allocation);
+	ret = ocfs2_xattr_set_entry_bucket(inode, xi, xs, ctxt);
+	if (!ret)
+		goto out;
+	if (ret != -ENOSPC) {
+		mlog_errno(ret);
+		goto out;
+	}
 
-		/*
-		 * We do not allow for overlapping ranges between buckets. And
-		 * the maximum number of collisions we will allow for then is
-		 * one bucket's worth, so check it here whether we need to
-		 * add a new bucket for the insert.
-		 */
-		ret = ocfs2_check_xattr_bucket_collision(inode,
-							 xs->bucket,
-							 xi->name);
-		if (ret) {
-			mlog_errno(ret);
-			goto out;
-		}
+	/* Ack, need more space.  Let's try to get another bucket! */
 
-		ret = ocfs2_add_new_xattr_bucket(inode,
-						 xs->xattr_bh,
+	/*
+	 * We do not allow for overlapping ranges between buckets. And
+	 * the maximum number of collisions we will allow for then is
+	 * one bucket's worth, so check it here whether we need to
+	 * add a new bucket for the insert.
+	 */
+	ret = ocfs2_check_xattr_bucket_collision(inode,
 						 xs->bucket,
-						 ctxt);
-		if (ret) {
-			mlog_errno(ret);
-			goto out;
-		}
+						 xi->xi_name);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
 
-		/*
-		 * ocfs2_add_new_xattr_bucket() will have updated
-		 * xs->bucket if it moved, but it will not have updated
-		 * any of the other search fields.  Thus, we drop it and
-		 * re-search.  Everything should be cached, so it'll be
-		 * quick.
-		 */
-		ocfs2_xattr_bucket_relse(xs->bucket);
-		ret = ocfs2_xattr_index_block_find(inode, xs->xattr_bh,
-						   xi->name_index,
-						   xi->name, xs);
-		if (ret && ret != -ENODATA)
-			goto out;
-		xs->not_found = ret;
-		allocation = 1;
-		goto try_again;
+	ret = ocfs2_add_new_xattr_bucket(inode,
+					 xs->xattr_bh,
+					 xs->bucket,
+					 ctxt);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
 	}
 
-xattr_set:
-	ret = ocfs2_xattr_set_in_bucket(inode, xi, xs, ctxt);
+	/*
+	 * ocfs2_add_new_xattr_bucket() will have updated
+	 * xs->bucket if it moved, but it will not have updated
+	 * any of the other search fields.  Thus, we drop it and
+	 * re-search.  Everything should be cached, so it'll be
+	 * quick.
+	 */
+	ocfs2_xattr_bucket_relse(xs->bucket);
+	ret = ocfs2_xattr_index_block_find(inode, xs->xattr_bh,
+					   xi->xi_name_index,
+					   xi->xi_name, xs);
+	if (ret && ret != -ENODATA)
+		goto out;
+	xs->not_found = ret;
+
+	/* Ok, we have a new bucket, let's try again */
+	ret = ocfs2_xattr_set_entry_bucket(inode, xi, xs, ctxt);
+	if (ret && (ret != -ENOSPC))
+		mlog_errno(ret);
+
 out:
 	mlog_exit(ret);
 	return ret;
@@ -5678,7 +5847,7 @@ static int ocfs2_prepare_refcount_xattr(struct inode *inode,
 	 * refcount tree, and make the original extent become 3. So we will need
 	 * 2 * cluster more extent recs at most.
 	 */
-	if (!xi->value || xi->value_len <= OCFS2_XATTR_INLINE_SIZE) {
+	if (!xi->xi_value || xi->xi_value_len <= OCFS2_XATTR_INLINE_SIZE) {
 
 		ret = ocfs2_refcounted_xattr_delete_need(inode,
 							 &(*ref_tree)->rf_ci,
@@ -6354,33 +6523,33 @@ static int ocfs2_create_empty_xattr_block(struct inode *inode,
 					  int indexed)
 {
 	int ret;
-	handle_t *handle;
-	struct ocfs2_alloc_context *meta_ac;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_xattr_set_ctxt ctxt;
 
-	ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
+	memset(&ctxt, 0, sizeof(ctxt));
+	ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &ctxt.meta_ac);
 	if (ret < 0) {
 		mlog_errno(ret);
 		return ret;
 	}
 
-	handle = ocfs2_start_trans(osb, OCFS2_XATTR_BLOCK_CREATE_CREDITS);
-	if (IS_ERR(handle)) {
-		ret = PTR_ERR(handle);
+	ctxt.handle = ocfs2_start_trans(osb, OCFS2_XATTR_BLOCK_CREATE_CREDITS);
+	if (IS_ERR(ctxt.handle)) {
+		ret = PTR_ERR(ctxt.handle);
 		mlog_errno(ret);
 		goto out;
 	}
 
 	mlog(0, "create new xattr block for inode %llu, index = %d\n",
 	     (unsigned long long)fe_bh->b_blocknr, indexed);
-	ret = ocfs2_create_xattr_block(handle, inode, fe_bh,
-				       meta_ac, ret_bh, indexed);
+	ret = ocfs2_create_xattr_block(inode, fe_bh, &ctxt, indexed,
+				       ret_bh);
 	if (ret)
 		mlog_errno(ret);
 
-	ocfs2_commit_trans(osb, handle);
+	ocfs2_commit_trans(osb, ctxt.handle);
 out:
-	ocfs2_free_alloc_context(meta_ac);
+	ocfs2_free_alloc_context(ctxt.meta_ac);
 	return ret;
 }
 
@@ -6635,16 +6804,15 @@ out:
 	return ret;
 }
 
-static int ocfs2_reflink_xattr_buckets(handle_t *handle,
+static int ocfs2_reflink_xattr_bucket(handle_t *handle,
 				u64 blkno, u64 new_blkno, u32 clusters,
+				u32 *cpos, int num_buckets,
 				struct ocfs2_alloc_context *meta_ac,
 				struct ocfs2_alloc_context *data_ac,
 				struct ocfs2_reflink_xattr_tree_args *args)
 {
 	int i, j, ret = 0;
 	struct super_block *sb = args->reflink->old_inode->i_sb;
-	u32 bpc = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(sb));
-	u32 num_buckets = clusters * bpc;
 	int bpb = args->old_bucket->bu_blocks;
 	struct ocfs2_xattr_value_buf vb = {
 		.vb_access = ocfs2_journal_access,
@@ -6663,14 +6831,6 @@ static int ocfs2_reflink_xattr_buckets(handle_t *handle,
 			break;
 		}
 
-		/*
-		 * The real bucket num in this series of blocks is stored
-		 * in the 1st bucket.
-		 */
-		if (i == 0)
-			num_buckets = le16_to_cpu(
-				bucket_xh(args->old_bucket)->xh_num_buckets);
-
 		ret = ocfs2_xattr_bucket_journal_access(handle,
 						args->new_bucket,
 						OCFS2_JOURNAL_ACCESS_CREATE);
@@ -6684,6 +6844,18 @@ static int ocfs2_reflink_xattr_buckets(handle_t *handle,
 			       bucket_block(args->old_bucket, j),
 			       sb->s_blocksize);
 
+		/*
+		 * Record the start cpos so that we can use it to initialize
+		 * our xattr tree we also set the xh_num_bucket for the new
+		 * bucket.
+		 */
+		if (i == 0) {
+			*cpos = le32_to_cpu(bucket_xh(args->new_bucket)->
+					    xh_entries[0].xe_name_hash);
+			bucket_xh(args->new_bucket)->xh_num_buckets =
+				cpu_to_le16(num_buckets);
+		}
+
 		ocfs2_xattr_bucket_journal_dirty(handle, args->new_bucket);
 
 		ret = ocfs2_reflink_xattr_header(handle, args->reflink,
@@ -6713,6 +6885,7 @@ static int ocfs2_reflink_xattr_buckets(handle_t *handle,
 		}
 
 		ocfs2_xattr_bucket_journal_dirty(handle, args->new_bucket);
+
 		ocfs2_xattr_bucket_relse(args->old_bucket);
 		ocfs2_xattr_bucket_relse(args->new_bucket);
 	}
@@ -6721,6 +6894,75 @@ static int ocfs2_reflink_xattr_buckets(handle_t *handle,
 	ocfs2_xattr_bucket_relse(args->new_bucket);
 	return ret;
 }
+
+static int ocfs2_reflink_xattr_buckets(handle_t *handle,
+				struct inode *inode,
+				struct ocfs2_reflink_xattr_tree_args *args,
+				struct ocfs2_extent_tree *et,
+				struct ocfs2_alloc_context *meta_ac,
+				struct ocfs2_alloc_context *data_ac,
+				u64 blkno, u32 cpos, u32 len)
+{
+	int ret, first_inserted = 0;
+	u32 p_cluster, num_clusters, reflink_cpos = 0;
+	u64 new_blkno;
+	unsigned int num_buckets, reflink_buckets;
+	unsigned int bpc =
+		ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb));
+
+	ret = ocfs2_read_xattr_bucket(args->old_bucket, blkno);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+	num_buckets = le16_to_cpu(bucket_xh(args->old_bucket)->xh_num_buckets);
+	ocfs2_xattr_bucket_relse(args->old_bucket);
+
+	while (len && num_buckets) {
+		ret = ocfs2_claim_clusters(handle, data_ac,
+					   1, &p_cluster, &num_clusters);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		new_blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
+		reflink_buckets = min(num_buckets, bpc * num_clusters);
+
+		ret = ocfs2_reflink_xattr_bucket(handle, blkno,
+						 new_blkno, num_clusters,
+						 &reflink_cpos, reflink_buckets,
+						 meta_ac, data_ac, args);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		/*
+		 * For the 1st allocated cluster, we make it use the same cpos
+		 * so that the xattr tree looks the same as the original one
+		 * in the most case.
+		 */
+		if (!first_inserted) {
+			reflink_cpos = cpos;
+			first_inserted = 1;
+		}
+		ret = ocfs2_insert_extent(handle, et, reflink_cpos, new_blkno,
+					  num_clusters, 0, meta_ac);
+		if (ret)
+			mlog_errno(ret);
+
+		mlog(0, "insert new xattr extent rec start %llu len %u to %u\n",
+		     (unsigned long long)new_blkno, num_clusters, reflink_cpos);
+
+		len -= num_clusters;
+		blkno += ocfs2_clusters_to_blocks(inode->i_sb, num_clusters);
+		num_buckets -= reflink_buckets;
+	}
+out:
+	return ret;
+}
+
 /*
  * Create the same xattr extent record in the new inode's xattr tree.
  */
@@ -6732,8 +6974,6 @@ static int ocfs2_reflink_xattr_rec(struct inode *inode,
 				   void *para)
 {
 	int ret, credits = 0;
-	u32 p_cluster, num_clusters;
-	u64 new_blkno;
 	handle_t *handle;
 	struct ocfs2_reflink_xattr_tree_args *args =
 			(struct ocfs2_reflink_xattr_tree_args *)para;
@@ -6742,6 +6982,9 @@ static int ocfs2_reflink_xattr_rec(struct inode *inode,
 	struct ocfs2_alloc_context *data_ac = NULL;
 	struct ocfs2_extent_tree et;
 
+	mlog(0, "reflink xattr buckets %llu len %u\n",
+	     (unsigned long long)blkno, len);
+
 	ocfs2_init_xattr_tree_extent_tree(&et,
 					  INODE_CACHE(args->reflink->new_inode),
 					  args->new_blk_bh);
@@ -6761,32 +7004,12 @@ static int ocfs2_reflink_xattr_rec(struct inode *inode,
 		goto out;
 	}
 
-	ret = ocfs2_claim_clusters(osb, handle, data_ac,
-				   len, &p_cluster, &num_clusters);
-	if (ret) {
-		mlog_errno(ret);
-		goto out_commit;
-	}
-
-	new_blkno = ocfs2_clusters_to_blocks(osb->sb, p_cluster);
-
-	mlog(0, "reflink xattr buckets %llu to %llu, len %u\n",
-	     (unsigned long long)blkno, (unsigned long long)new_blkno, len);
-	ret = ocfs2_reflink_xattr_buckets(handle, blkno, new_blkno, len,
-					  meta_ac, data_ac, args);
-	if (ret) {
-		mlog_errno(ret);
-		goto out_commit;
-	}
-
-	mlog(0, "insert new xattr extent rec start %llu len %u to %u\n",
-	     (unsigned long long)new_blkno, len, cpos);
-	ret = ocfs2_insert_extent(handle, &et, cpos, new_blkno,
-				  len, 0, meta_ac);
+	ret = ocfs2_reflink_xattr_buckets(handle, inode, args, &et,
+					  meta_ac, data_ac,
+					  blkno, cpos, len);
 	if (ret)
 		mlog_errno(ret);
 
-out_commit:
 	ocfs2_commit_trans(osb, handle);
 
 out:
@@ -7060,7 +7283,7 @@ int ocfs2_init_security_set(handle_t *handle,
 				     xattr_ac, data_ac);
 }
 
-struct xattr_handler ocfs2_xattr_security_handler = {
+const struct xattr_handler ocfs2_xattr_security_handler = {
 	.prefix	= XATTR_SECURITY_PREFIX,
 	.list	= ocfs2_xattr_security_list,
 	.get	= ocfs2_xattr_security_get,
@@ -7104,7 +7327,7 @@ static int ocfs2_xattr_trusted_set(struct dentry *dentry, const char *name,
 			       name, value, size, flags);
 }
 
-struct xattr_handler ocfs2_xattr_trusted_handler = {
+const struct xattr_handler ocfs2_xattr_trusted_handler = {
 	.prefix	= XATTR_TRUSTED_PREFIX,
 	.list	= ocfs2_xattr_trusted_list,
 	.get	= ocfs2_xattr_trusted_get,
@@ -7160,7 +7383,7 @@ static int ocfs2_xattr_user_set(struct dentry *dentry, const char *name,
 			       name, value, size, flags);
 }
 
-struct xattr_handler ocfs2_xattr_user_handler = {
+const struct xattr_handler ocfs2_xattr_user_handler = {
 	.prefix	= XATTR_USER_PREFIX,
 	.list	= ocfs2_xattr_user_list,
 	.get	= ocfs2_xattr_user_get,
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index abd72a47f520..aa64bb37a65b 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -37,12 +37,12 @@ struct ocfs2_security_xattr_info {
 	size_t value_len;
 };
 
-extern struct xattr_handler ocfs2_xattr_user_handler;
-extern struct xattr_handler ocfs2_xattr_trusted_handler;
-extern struct xattr_handler ocfs2_xattr_security_handler;
-extern struct xattr_handler ocfs2_xattr_acl_access_handler;
-extern struct xattr_handler ocfs2_xattr_acl_default_handler;
-extern struct xattr_handler *ocfs2_xattr_handlers[];
+extern const struct xattr_handler ocfs2_xattr_user_handler;
+extern const struct xattr_handler ocfs2_xattr_trusted_handler;
+extern const struct xattr_handler ocfs2_xattr_security_handler;
+extern const struct xattr_handler ocfs2_xattr_acl_access_handler;
+extern const struct xattr_handler ocfs2_xattr_acl_default_handler;
+extern const struct xattr_handler *ocfs2_xattr_handlers[];
 
 ssize_t ocfs2_listxattr(struct dentry *, char *, size_t);
 int ocfs2_xattr_get_nolock(struct inode *, struct buffer_head *, int,
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index 399487c09364..6e7a3291bbe8 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -329,7 +329,7 @@ const struct file_operations omfs_file_operations = {
 	.aio_read = generic_file_aio_read,
 	.aio_write = generic_file_aio_write,
 	.mmap = generic_file_mmap,
-	.fsync = simple_fsync,
+	.fsync = generic_file_fsync,
 	.splice_read = generic_file_splice_read,
 };
 
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index f3b7c1541f3a..089839a6cc64 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -3,14 +3,15 @@
  * Copyright (C) 2006 Bob Copeland <me@bobcopeland.com>
  * Released under GPL v2.
  */
-#include <linux/version.h>
 #include <linux/module.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/vfs.h>
 #include <linux/parser.h>
 #include <linux/buffer_head.h>
 #include <linux/vmalloc.h>
+#include <linux/writeback.h>
 #include <linux/crc-itu-t.h>
 #include "omfs.h"
 
@@ -36,9 +37,7 @@ struct inode *omfs_new_inode(struct inode *dir, int mode)
 		goto fail;
 
 	inode->i_ino = new_block;
-	inode->i_mode = mode;
-	inode->i_uid = current_fsuid();
-	inode->i_gid = current_fsgid();
+	inode_init_owner(inode, NULL, mode);
 	inode->i_mapping->a_ops = &omfs_aops;
 
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
@@ -89,7 +88,7 @@ static void omfs_update_checksums(struct omfs_inode *oi)
 	oi->i_head.h_check_xor = xor;
 }
 
-static int omfs_write_inode(struct inode *inode, int wait)
+static int __omfs_write_inode(struct inode *inode, int wait)
 {
 	struct omfs_inode *oi;
 	struct omfs_sb_info *sbi = OMFS_SB(inode->i_sb);
@@ -162,9 +161,14 @@ out:
 	return ret;
 }
 
+static int omfs_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+	return __omfs_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
+}
+
 int omfs_sync_inode(struct inode *inode)
 {
-	return omfs_write_inode(inode, 1);
+	return __omfs_write_inode(inode, 1);
 }
 
 /*
diff --git a/fs/open.c b/fs/open.c
index 040cef72bc00..5463266db9e6 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -8,10 +8,8 @@
 #include <linux/mm.h>
 #include <linux/file.h>
 #include <linux/fdtable.h>
-#include <linux/quotaops.h>
 #include <linux/fsnotify.h>
 #include <linux/module.h>
-#include <linux/slab.h>
 #include <linux/tty.h>
 #include <linux/namei.h>
 #include <linux/backing-dev.h>
@@ -19,8 +17,8 @@
 #include <linux/securebits.h>
 #include <linux/security.h>
 #include <linux/mount.h>
-#include <linux/vfs.h>
 #include <linux/fcntl.h>
+#include <linux/slab.h>
 #include <asm/uaccess.h>
 #include <linux/fs.h>
 #include <linux/personality.h>
@@ -34,171 +32,6 @@
 
 #include "internal.h"
 
-int vfs_statfs(struct dentry *dentry, struct kstatfs *buf)
-{
-	int retval = -ENODEV;
-
-	if (dentry) {
-		retval = -ENOSYS;
-		if (dentry->d_sb->s_op->statfs) {
-			memset(buf, 0, sizeof(*buf));
-			retval = security_sb_statfs(dentry);
-			if (retval)
-				return retval;
-			retval = dentry->d_sb->s_op->statfs(dentry, buf);
-			if (retval == 0 && buf->f_frsize == 0)
-				buf->f_frsize = buf->f_bsize;
-		}
-	}
-	return retval;
-}
-
-EXPORT_SYMBOL(vfs_statfs);
-
-static int vfs_statfs_native(struct dentry *dentry, struct statfs *buf)
-{
-	struct kstatfs st;
-	int retval;
-
-	retval = vfs_statfs(dentry, &st);
-	if (retval)
-		return retval;
-
-	if (sizeof(*buf) == sizeof(st))
-		memcpy(buf, &st, sizeof(st));
-	else {
-		if (sizeof buf->f_blocks == 4) {
-			if ((st.f_blocks | st.f_bfree | st.f_bavail |
-			     st.f_bsize | st.f_frsize) &
-			    0xffffffff00000000ULL)
-				return -EOVERFLOW;
-			/*
-			 * f_files and f_ffree may be -1; it's okay to stuff
-			 * that into 32 bits
-			 */
-			if (st.f_files != -1 &&
-			    (st.f_files & 0xffffffff00000000ULL))
-				return -EOVERFLOW;
-			if (st.f_ffree != -1 &&
-			    (st.f_ffree & 0xffffffff00000000ULL))
-				return -EOVERFLOW;
-		}
-
-		buf->f_type = st.f_type;
-		buf->f_bsize = st.f_bsize;
-		buf->f_blocks = st.f_blocks;
-		buf->f_bfree = st.f_bfree;
-		buf->f_bavail = st.f_bavail;
-		buf->f_files = st.f_files;
-		buf->f_ffree = st.f_ffree;
-		buf->f_fsid = st.f_fsid;
-		buf->f_namelen = st.f_namelen;
-		buf->f_frsize = st.f_frsize;
-		memset(buf->f_spare, 0, sizeof(buf->f_spare));
-	}
-	return 0;
-}
-
-static int vfs_statfs64(struct dentry *dentry, struct statfs64 *buf)
-{
-	struct kstatfs st;
-	int retval;
-
-	retval = vfs_statfs(dentry, &st);
-	if (retval)
-		return retval;
-
-	if (sizeof(*buf) == sizeof(st))
-		memcpy(buf, &st, sizeof(st));
-	else {
-		buf->f_type = st.f_type;
-		buf->f_bsize = st.f_bsize;
-		buf->f_blocks = st.f_blocks;
-		buf->f_bfree = st.f_bfree;
-		buf->f_bavail = st.f_bavail;
-		buf->f_files = st.f_files;
-		buf->f_ffree = st.f_ffree;
-		buf->f_fsid = st.f_fsid;
-		buf->f_namelen = st.f_namelen;
-		buf->f_frsize = st.f_frsize;
-		memset(buf->f_spare, 0, sizeof(buf->f_spare));
-	}
-	return 0;
-}
-
-SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct statfs __user *, buf)
-{
-	struct path path;
-	int error;
-
-	error = user_path(pathname, &path);
-	if (!error) {
-		struct statfs tmp;
-		error = vfs_statfs_native(path.dentry, &tmp);
-		if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
-			error = -EFAULT;
-		path_put(&path);
-	}
-	return error;
-}
-
-SYSCALL_DEFINE3(statfs64, const char __user *, pathname, size_t, sz, struct statfs64 __user *, buf)
-{
-	struct path path;
-	long error;
-
-	if (sz != sizeof(*buf))
-		return -EINVAL;
-	error = user_path(pathname, &path);
-	if (!error) {
-		struct statfs64 tmp;
-		error = vfs_statfs64(path.dentry, &tmp);
-		if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
-			error = -EFAULT;
-		path_put(&path);
-	}
-	return error;
-}
-
-SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct statfs __user *, buf)
-{
-	struct file * file;
-	struct statfs tmp;
-	int error;
-
-	error = -EBADF;
-	file = fget(fd);
-	if (!file)
-		goto out;
-	error = vfs_statfs_native(file->f_path.dentry, &tmp);
-	if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
-		error = -EFAULT;
-	fput(file);
-out:
-	return error;
-}
-
-SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user *, buf)
-{
-	struct file * file;
-	struct statfs64 tmp;
-	int error;
-
-	if (sz != sizeof(*buf))
-		return -EINVAL;
-
-	error = -EBADF;
-	file = fget(fd);
-	if (!file)
-		goto out;
-	error = vfs_statfs64(file->f_path.dentry, &tmp);
-	if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
-		error = -EFAULT;
-	fput(file);
-out:
-	return error;
-}
-
 int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
 	struct file *filp)
 {
@@ -271,17 +104,15 @@ static long do_sys_truncate(const char __user *pathname, loff_t length)
 	 * Make sure that there are no leases.  get_write_access() protects
 	 * against the truncate racing with a lease-granting setlease().
 	 */
-	error = break_lease(inode, FMODE_WRITE);
+	error = break_lease(inode, O_WRONLY);
 	if (error)
 		goto put_write_and_out;
 
 	error = locks_verify_truncate(inode, NULL, length);
 	if (!error)
 		error = security_path_truncate(&path, length, 0);
-	if (!error) {
-		vfs_dq_init(inode);
+	if (!error)
 		error = do_truncate(path.dentry, length, 0, NULL);
-	}
 
 put_write_and_out:
 	put_write_access(inode);
diff --git a/fs/partitions/acorn.c b/fs/partitions/acorn.c
index a97b477ac0fc..6921e7890be6 100644
--- a/fs/partitions/acorn.c
+++ b/fs/partitions/acorn.c
@@ -70,14 +70,14 @@ struct riscix_record {
 
 #if defined(CONFIG_ACORN_PARTITION_CUMANA) || \
 	defined(CONFIG_ACORN_PARTITION_ADFS)
-static int
-riscix_partition(struct parsed_partitions *state, struct block_device *bdev,
-		unsigned long first_sect, int slot, unsigned long nr_sects)
+static int riscix_partition(struct parsed_partitions *state,
+			    unsigned long first_sect, int slot,
+			    unsigned long nr_sects)
 {
 	Sector sect;
 	struct riscix_record *rr;
 	
-	rr = (struct riscix_record *)read_dev_sector(bdev, first_sect, &sect);
+	rr = read_part_sector(state, first_sect, &sect);
 	if (!rr)
 		return -1;
 
@@ -123,9 +123,9 @@ struct linux_part {
 
 #if defined(CONFIG_ACORN_PARTITION_CUMANA) || \
 	defined(CONFIG_ACORN_PARTITION_ADFS)
-static int
-linux_partition(struct parsed_partitions *state, struct block_device *bdev,
-		unsigned long first_sect, int slot, unsigned long nr_sects)
+static int linux_partition(struct parsed_partitions *state,
+			   unsigned long first_sect, int slot,
+			   unsigned long nr_sects)
 {
 	Sector sect;
 	struct linux_part *linuxp;
@@ -135,7 +135,7 @@ linux_partition(struct parsed_partitions *state, struct block_device *bdev,
 
 	put_partition(state, slot++, first_sect, size);
 
-	linuxp = (struct linux_part *)read_dev_sector(bdev, first_sect, &sect);
+	linuxp = read_part_sector(state, first_sect, &sect);
 	if (!linuxp)
 		return -1;
 
@@ -157,8 +157,7 @@ linux_partition(struct parsed_partitions *state, struct block_device *bdev,
 #endif
 
 #ifdef CONFIG_ACORN_PARTITION_CUMANA
-int
-adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev)
+int adfspart_check_CUMANA(struct parsed_partitions *state)
 {
 	unsigned long first_sector = 0;
 	unsigned int start_blk = 0;
@@ -185,7 +184,7 @@ adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev
 		struct adfs_discrecord *dr;
 		unsigned int nr_sects;
 
-		data = read_dev_sector(bdev, start_blk * 2 + 6, &sect);
+		data = read_part_sector(state, start_blk * 2 + 6, &sect);
 		if (!data)
 			return -1;
 
@@ -217,14 +216,14 @@ adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev
 #ifdef CONFIG_ACORN_PARTITION_RISCIX
 		case PARTITION_RISCIX_SCSI:
 			/* RISCiX - we don't know how to find the next one. */
-			slot = riscix_partition(state, bdev, first_sector,
-						 slot, nr_sects);
+			slot = riscix_partition(state, first_sector, slot,
+						nr_sects);
 			break;
 #endif
 
 		case PARTITION_LINUX:
-			slot = linux_partition(state, bdev, first_sector,
-						slot, nr_sects);
+			slot = linux_partition(state, first_sector, slot,
+					       nr_sects);
 			break;
 		}
 		put_dev_sector(sect);
@@ -249,8 +248,7 @@ adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev
  *	    hda1 = ADFS partition on first drive.
  *	    hda2 = non-ADFS partition.
  */
-int
-adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev)
+int adfspart_check_ADFS(struct parsed_partitions *state)
 {
 	unsigned long start_sect, nr_sects, sectscyl, heads;
 	Sector sect;
@@ -259,7 +257,7 @@ adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev)
 	unsigned char id;
 	int slot = 1;
 
-	data = read_dev_sector(bdev, 6, &sect);
+	data = read_part_sector(state, 6, &sect);
 	if (!data)
 		return -1;
 
@@ -278,21 +276,21 @@ adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev)
 	/*
 	 * Work out start of non-adfs partition.
 	 */
-	nr_sects = (bdev->bd_inode->i_size >> 9) - start_sect;
+	nr_sects = (state->bdev->bd_inode->i_size >> 9) - start_sect;
 
 	if (start_sect) {
 		switch (id) {
 #ifdef CONFIG_ACORN_PARTITION_RISCIX
 		case PARTITION_RISCIX_SCSI:
 		case PARTITION_RISCIX_MFM:
-			slot = riscix_partition(state, bdev, start_sect,
-						 slot, nr_sects);
+			slot = riscix_partition(state, start_sect, slot,
+						nr_sects);
 			break;
 #endif
 
 		case PARTITION_LINUX:
-			slot = linux_partition(state, bdev, start_sect,
-						slot, nr_sects);
+			slot = linux_partition(state, start_sect, slot,
+					       nr_sects);
 			break;
 		}
 	}
@@ -308,10 +306,11 @@ struct ics_part {
 	__le32 size;
 };
 
-static int adfspart_check_ICSLinux(struct block_device *bdev, unsigned long block)
+static int adfspart_check_ICSLinux(struct parsed_partitions *state,
+				   unsigned long block)
 {
 	Sector sect;
-	unsigned char *data = read_dev_sector(bdev, block, &sect);
+	unsigned char *data = read_part_sector(state, block, &sect);
 	int result = 0;
 
 	if (data) {
@@ -349,8 +348,7 @@ static inline int valid_ics_sector(const unsigned char *data)
  *	    hda2 = ADFS partition 1 on first drive.
  *		..etc..
  */
-int
-adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev)
+int adfspart_check_ICS(struct parsed_partitions *state)
 {
 	const unsigned char *data;
 	const struct ics_part *p;
@@ -360,7 +358,7 @@ adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev)
 	/*
 	 * Try ICS style partitions - sector 0 contains partition info.
 	 */
-	data = read_dev_sector(bdev, 0, &sect);
+	data = read_part_sector(state, 0, &sect);
 	if (!data)
 	    	return -1;
 
@@ -392,7 +390,7 @@ adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev)
 			 * partition is.  We must not make this visible
 			 * to the filesystem.
 			 */
-			if (size > 1 && adfspart_check_ICSLinux(bdev, start)) {
+			if (size > 1 && adfspart_check_ICSLinux(state, start)) {
 				start += 1;
 				size -= 1;
 			}
@@ -446,8 +444,7 @@ static inline int valid_ptec_sector(const unsigned char *data)
  *	    hda2 = ADFS partition 1 on first drive.
  *		..etc..
  */
-int
-adfspart_check_POWERTEC(struct parsed_partitions *state, struct block_device *bdev)
+int adfspart_check_POWERTEC(struct parsed_partitions *state)
 {
 	Sector sect;
 	const unsigned char *data;
@@ -455,7 +452,7 @@ adfspart_check_POWERTEC(struct parsed_partitions *state, struct block_device *bd
 	int slot = 1;
 	int i;
 
-	data = read_dev_sector(bdev, 0, &sect);
+	data = read_part_sector(state, 0, &sect);
 	if (!data)
 		return -1;
 
@@ -508,8 +505,7 @@ static const char eesox_name[] = {
  *  1. The individual ADFS boot block entries that are placed on the disk.
  *  2. The start address of the next entry.
  */
-int
-adfspart_check_EESOX(struct parsed_partitions *state, struct block_device *bdev)
+int adfspart_check_EESOX(struct parsed_partitions *state)
 {
 	Sector sect;
 	const unsigned char *data;
@@ -518,7 +514,7 @@ adfspart_check_EESOX(struct parsed_partitions *state, struct block_device *bdev)
 	sector_t start = 0;
 	int i, slot = 1;
 
-	data = read_dev_sector(bdev, 7, &sect);
+	data = read_part_sector(state, 7, &sect);
 	if (!data)
 		return -1;
 
@@ -545,7 +541,7 @@ adfspart_check_EESOX(struct parsed_partitions *state, struct block_device *bdev)
 	if (i != 0) {
 		sector_t size;
 
-		size = get_capacity(bdev->bd_disk);
+		size = get_capacity(state->bdev->bd_disk);
 		put_partition(state, slot++, start, size - start);
 		printk("\n");
 	}
diff --git a/fs/partitions/acorn.h b/fs/partitions/acorn.h
index 81fd50ecc080..ede828529692 100644
--- a/fs/partitions/acorn.h
+++ b/fs/partitions/acorn.h
@@ -7,8 +7,8 @@
  *  format, and everyone stick to it?
  */
 
-int adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev);
-int adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev);
-int adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev);
-int adfspart_check_POWERTEC(struct parsed_partitions *state, struct block_device *bdev);
-int adfspart_check_EESOX(struct parsed_partitions *state, struct block_device *bdev);
+int adfspart_check_CUMANA(struct parsed_partitions *state);
+int adfspart_check_ADFS(struct parsed_partitions *state);
+int adfspart_check_ICS(struct parsed_partitions *state);
+int adfspart_check_POWERTEC(struct parsed_partitions *state);
+int adfspart_check_EESOX(struct parsed_partitions *state);
diff --git a/fs/partitions/amiga.c b/fs/partitions/amiga.c
index 9917a8c360f2..ba443d4229f8 100644
--- a/fs/partitions/amiga.c
+++ b/fs/partitions/amiga.c
@@ -23,8 +23,7 @@ checksum_block(__be32 *m, int size)
 	return sum;
 }
 
-int
-amiga_partition(struct parsed_partitions *state, struct block_device *bdev)
+int amiga_partition(struct parsed_partitions *state)
 {
 	Sector sect;
 	unsigned char *data;
@@ -38,11 +37,11 @@ amiga_partition(struct parsed_partitions *state, struct block_device *bdev)
 	for (blk = 0; ; blk++, put_dev_sector(sect)) {
 		if (blk == RDB_ALLOCATION_LIMIT)
 			goto rdb_done;
-		data = read_dev_sector(bdev, blk, &sect);
+		data = read_part_sector(state, blk, &sect);
 		if (!data) {
 			if (warn_no_part)
 				printk("Dev %s: unable to read RDB block %d\n",
-				       bdevname(bdev, b), blk);
+				       bdevname(state->bdev, b), blk);
 			res = -1;
 			goto rdb_done;
 		}
@@ -64,7 +63,7 @@ amiga_partition(struct parsed_partitions *state, struct block_device *bdev)
 		}
 
 		printk("Dev %s: RDB in block %d has bad checksum\n",
-			       bdevname(bdev, b), blk);
+		       bdevname(state->bdev, b), blk);
 	}
 
 	/* blksize is blocks per 512 byte standard block */
@@ -75,11 +74,11 @@ amiga_partition(struct parsed_partitions *state, struct block_device *bdev)
 	put_dev_sector(sect);
 	for (part = 1; blk>0 && part<=16; part++, put_dev_sector(sect)) {
 		blk *= blksize;	/* Read in terms partition table understands */
-		data = read_dev_sector(bdev, blk, &sect);
+		data = read_part_sector(state, blk, &sect);
 		if (!data) {
 			if (warn_no_part)
 				printk("Dev %s: unable to read partition block %d\n",
-				       bdevname(bdev, b), blk);
+				       bdevname(state->bdev, b), blk);
 			res = -1;
 			goto rdb_done;
 		}
diff --git a/fs/partitions/amiga.h b/fs/partitions/amiga.h
index 2f3e9ce22d53..d094585cadaa 100644
--- a/fs/partitions/amiga.h
+++ b/fs/partitions/amiga.h
@@ -2,5 +2,5 @@
  *  fs/partitions/amiga.h
  */
 
-int amiga_partition(struct parsed_partitions *state, struct block_device *bdev);
+int amiga_partition(struct parsed_partitions *state);
 
diff --git a/fs/partitions/atari.c b/fs/partitions/atari.c
index 1f3572d5b755..4439ff1b6cec 100644
--- a/fs/partitions/atari.c
+++ b/fs/partitions/atari.c
@@ -30,7 +30,7 @@ static inline int OK_id(char *s)
 		memcmp (s, "RAW", 3) == 0 ;
 }
 
-int atari_partition(struct parsed_partitions *state, struct block_device *bdev)
+int atari_partition(struct parsed_partitions *state)
 {
 	Sector sect;
 	struct rootsector *rs;
@@ -42,12 +42,12 @@ int atari_partition(struct parsed_partitions *state, struct block_device *bdev)
 	int part_fmt = 0; /* 0:unknown, 1:AHDI, 2:ICD/Supra */
 #endif
 
-	rs = (struct rootsector *) read_dev_sector(bdev, 0, &sect);
+	rs = read_part_sector(state, 0, &sect);
 	if (!rs)
 		return -1;
 
 	/* Verify this is an Atari rootsector: */
-	hd_size = bdev->bd_inode->i_size >> 9;
+	hd_size = state->bdev->bd_inode->i_size >> 9;
 	if (!VALID_PARTITION(&rs->part[0], hd_size) &&
 	    !VALID_PARTITION(&rs->part[1], hd_size) &&
 	    !VALID_PARTITION(&rs->part[2], hd_size) &&
@@ -84,7 +84,7 @@ int atari_partition(struct parsed_partitions *state, struct block_device *bdev)
 		printk(" XGM<");
 		partsect = extensect = be32_to_cpu(pi->st);
 		while (1) {
-			xrs = (struct rootsector *)read_dev_sector(bdev, partsect, &sect2);
+			xrs = read_part_sector(state, partsect, &sect2);
 			if (!xrs) {
 				printk (" block %ld read failed\n", partsect);
 				put_dev_sector(sect);
diff --git a/fs/partitions/atari.h b/fs/partitions/atari.h
index 63186b00e135..fe2d32a89f36 100644
--- a/fs/partitions/atari.h
+++ b/fs/partitions/atari.h
@@ -31,4 +31,4 @@ struct rootsector
   u16 checksum;			/* checksum for bootable disks */
 } __attribute__((__packed__));
 
-int atari_partition(struct parsed_partitions *state, struct block_device *bdev);
+int atari_partition(struct parsed_partitions *state);
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 64bc8998ac9a..5dcd4b0c5533 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -16,6 +16,7 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/fs.h>
+#include <linux/slab.h>
 #include <linux/kmod.h>
 #include <linux/ctype.h>
 #include <linux/genhd.h>
@@ -44,7 +45,7 @@ extern void md_autodetect_dev(dev_t dev);
 
 int warn_no_part = 1; /*This is ugly: should make genhd removable media aware*/
 
-static int (*check_part[])(struct parsed_partitions *, struct block_device *) = {
+static int (*check_part[])(struct parsed_partitions *) = {
 	/*
 	 * Probe partition formats with tables at disk address 0
 	 * that also have an ADFS boot block at 0xdc0.
@@ -160,10 +161,11 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
 	struct parsed_partitions *state;
 	int i, res, err;
 
-	state = kmalloc(sizeof(struct parsed_partitions), GFP_KERNEL);
+	state = kzalloc(sizeof(struct parsed_partitions), GFP_KERNEL);
 	if (!state)
 		return NULL;
 
+	state->bdev = bdev;
 	disk_name(hd, 0, state->name);
 	printk(KERN_INFO " %s:", state->name);
 	if (isdigit(state->name[strlen(state->name)-1]))
@@ -173,7 +175,7 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
 	i = res = err = 0;
 	while (!res && check_part[i]) {
 		memset(&state->parts, 0, sizeof(state->parts));
-		res = check_part[i++](state, bdev);
+		res = check_part[i++](state);
 		if (res < 0) {
 			/* We have hit an I/O error which we don't report now.
 		 	* But record it, and let the others do their job.
@@ -185,6 +187,8 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
 	}
 	if (res > 0)
 		return state;
+	if (state->access_beyond_eod)
+		err = -ENOSPC;
 	if (err)
 	/* The partition is unrecognized. So report I/O errors if there were any */
 		res = err;
@@ -412,9 +416,10 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
 	pdev = part_to_dev(p);
 
 	p->start_sect = start;
-	p->alignment_offset = queue_sector_alignment_offset(disk->queue, start);
-	p->discard_alignment = queue_sector_discard_alignment(disk->queue,
-							      start);
+	p->alignment_offset =
+		queue_limit_alignment_offset(&disk->queue->limits, start);
+	p->discard_alignment =
+		queue_limit_discard_alignment(&disk->queue->limits, start);
 	p->nr_sects = len;
 	p->partno = partno;
 	p->policy = get_disk_ro(disk);
@@ -536,12 +541,33 @@ exit:
 	disk_part_iter_exit(&piter);
 }
 
+static bool disk_unlock_native_capacity(struct gendisk *disk)
+{
+	const struct block_device_operations *bdops = disk->fops;
+
+	if (bdops->unlock_native_capacity &&
+	    !(disk->flags & GENHD_FL_NATIVE_CAPACITY)) {
+		printk(KERN_CONT "enabling native capacity\n");
+		bdops->unlock_native_capacity(disk);
+		disk->flags |= GENHD_FL_NATIVE_CAPACITY;
+		return true;
+	} else {
+		printk(KERN_CONT "truncated\n");
+		return false;
+	}
+}
+
 int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
 {
+	struct parsed_partitions *state = NULL;
 	struct disk_part_iter piter;
 	struct hd_struct *part;
-	struct parsed_partitions *state;
 	int p, highest, res;
+rescan:
+	if (state && !IS_ERR(state)) {
+		kfree(state);
+		state = NULL;
+	}
 
 	if (bdev->bd_part_count)
 		return -EBUSY;
@@ -560,8 +586,32 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
 	bdev->bd_invalidated = 0;
 	if (!get_capacity(disk) || !(state = check_partition(disk, bdev)))
 		return 0;
-	if (IS_ERR(state))	/* I/O error reading the partition table */
+	if (IS_ERR(state)) {
+		/*
+		 * I/O error reading the partition table.  If any
+		 * partition code tried to read beyond EOD, retry
+		 * after unlocking native capacity.
+		 */
+		if (PTR_ERR(state) == -ENOSPC) {
+			printk(KERN_WARNING "%s: partition table beyond EOD, ",
+			       disk->disk_name);
+			if (disk_unlock_native_capacity(disk))
+				goto rescan;
+		}
 		return -EIO;
+	}
+	/*
+	 * If any partition code tried to read beyond EOD, try
+	 * unlocking native capacity even if partition table is
+	 * sucessfully read as we could be missing some partitions.
+	 */
+	if (state->access_beyond_eod) {
+		printk(KERN_WARNING
+		       "%s: partition table partially beyond EOD, ",
+		       disk->disk_name);
+		if (disk_unlock_native_capacity(disk))
+			goto rescan;
+	}
 
 	/* tell userspace that the media / partition table may have changed */
 	kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE);
@@ -579,7 +629,7 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
 	/* add partitions */
 	for (p = 1; p < state->limit; p++) {
 		sector_t size, from;
-try_scan:
+
 		size = state->parts[p].size;
 		if (!size)
 			continue;
@@ -587,30 +637,21 @@ try_scan:
 		from = state->parts[p].from;
 		if (from >= get_capacity(disk)) {
 			printk(KERN_WARNING
-			       "%s: p%d ignored, start %llu is behind the end of the disk\n",
+			       "%s: p%d start %llu is beyond EOD, ",
 			       disk->disk_name, p, (unsigned long long) from);
+			if (disk_unlock_native_capacity(disk))
+				goto rescan;
 			continue;
 		}
 
 		if (from + size > get_capacity(disk)) {
-			const struct block_device_operations *bdops = disk->fops;
-			unsigned long long capacity;
-
 			printk(KERN_WARNING
-			       "%s: p%d size %llu exceeds device capacity, ",
+			       "%s: p%d size %llu extends beyond EOD, ",
 			       disk->disk_name, p, (unsigned long long) size);
 
-			if (bdops->set_capacity &&
-			    (disk->flags & GENHD_FL_NATIVE_CAPACITY) == 0) {
-				printk(KERN_CONT "enabling native capacity\n");
-				capacity = bdops->set_capacity(disk, ~0ULL);
-				disk->flags |= GENHD_FL_NATIVE_CAPACITY;
-				if (capacity > get_capacity(disk)) {
-					set_capacity(disk, capacity);
-					check_disk_size_change(disk, bdev);
-					bdev->bd_invalidated = 0;
-				}
-				goto try_scan;
+			if (disk_unlock_native_capacity(disk)) {
+				/* free state and restart */
+				goto rescan;
 			} else {
 				/*
 				 * we can not ignore partitions of broken tables
@@ -618,7 +659,6 @@ try_scan:
 				 * we limit them to the end of the disk to avoid
 				 * creating invalid block devices
 				 */
-				printk(KERN_CONT "limited to end of disk\n");
 				size = get_capacity(disk) - from;
 			}
 		}
diff --git a/fs/partitions/check.h b/fs/partitions/check.h
index 98dbe1a84528..52f8bd399396 100644
--- a/fs/partitions/check.h
+++ b/fs/partitions/check.h
@@ -6,6 +6,7 @@
  * description.
  */
 struct parsed_partitions {
+	struct block_device *bdev;
 	char name[BDEVNAME_SIZE];
 	struct {
 		sector_t from;
@@ -14,8 +15,19 @@ struct parsed_partitions {
 	} parts[DISK_MAX_PARTS];
 	int next;
 	int limit;
+	bool access_beyond_eod;
 };
 
+static inline void *read_part_sector(struct parsed_partitions *state,
+				     sector_t n, Sector *p)
+{
+	if (n >= get_capacity(state->bdev->bd_disk)) {
+		state->access_beyond_eod = true;
+		return NULL;
+	}
+	return read_dev_sector(state->bdev, n, p);
+}
+
 static inline void
 put_partition(struct parsed_partitions *p, int n, sector_t from, sector_t size)
 {
diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c
index 49cfd5f54238..9efb2cfe2410 100644
--- a/fs/partitions/efi.c
+++ b/fs/partitions/efi.c
@@ -95,6 +95,7 @@
  ************************************************************/
 #include <linux/crc32.h>
 #include <linux/math64.h>
+#include <linux/slab.h>
 #include "check.h"
 #include "efi.h"
 
@@ -139,8 +140,7 @@ efi_crc32(const void *buf, unsigned long len)
  *  the part[0] entry for this disk, and is the number of
  *  physical sectors available on the disk.
  */
-static u64
-last_lba(struct block_device *bdev)
+static u64 last_lba(struct block_device *bdev)
 {
 	if (!bdev || !bdev->bd_inode)
 		return 0;
@@ -180,27 +180,28 @@ is_pmbr_valid(legacy_mbr *mbr)
 
 /**
  * read_lba(): Read bytes from disk, starting at given LBA
- * @bdev
+ * @state
  * @lba
  * @buffer
  * @size_t
  *
- * Description:  Reads @count bytes from @bdev into @buffer.
+ * Description: Reads @count bytes from @state->bdev into @buffer.
  * Returns number of bytes read on success, 0 on error.
  */
-static size_t
-read_lba(struct block_device *bdev, u64 lba, u8 * buffer, size_t count)
+static size_t read_lba(struct parsed_partitions *state,
+		       u64 lba, u8 *buffer, size_t count)
 {
 	size_t totalreadcount = 0;
+	struct block_device *bdev = state->bdev;
 	sector_t n = lba * (bdev_logical_block_size(bdev) / 512);
 
-	if (!bdev || !buffer || lba > last_lba(bdev))
+	if (!buffer || lba > last_lba(bdev))
                 return 0;
 
 	while (count) {
 		int copied = 512;
 		Sector sect;
-		unsigned char *data = read_dev_sector(bdev, n++, &sect);
+		unsigned char *data = read_part_sector(state, n++, &sect);
 		if (!data)
 			break;
 		if (copied > count)
@@ -216,19 +217,20 @@ read_lba(struct block_device *bdev, u64 lba, u8 * buffer, size_t count)
 
 /**
  * alloc_read_gpt_entries(): reads partition entries from disk
- * @bdev
+ * @state
  * @gpt - GPT header
  * 
  * Description: Returns ptes on success,  NULL on error.
  * Allocates space for PTEs based on information found in @gpt.
  * Notes: remember to free pte when you're done!
  */
-static gpt_entry *
-alloc_read_gpt_entries(struct block_device *bdev, gpt_header *gpt)
+static gpt_entry *alloc_read_gpt_entries(struct parsed_partitions *state,
+					 gpt_header *gpt)
 {
 	size_t count;
 	gpt_entry *pte;
-	if (!bdev || !gpt)
+
+	if (!gpt)
 		return NULL;
 
 	count = le32_to_cpu(gpt->num_partition_entries) *
@@ -239,7 +241,7 @@ alloc_read_gpt_entries(struct block_device *bdev, gpt_header *gpt)
 	if (!pte)
 		return NULL;
 
-	if (read_lba(bdev, le64_to_cpu(gpt->partition_entry_lba),
+	if (read_lba(state, le64_to_cpu(gpt->partition_entry_lba),
                      (u8 *) pte,
 		     count) < count) {
 		kfree(pte);
@@ -251,27 +253,24 @@ alloc_read_gpt_entries(struct block_device *bdev, gpt_header *gpt)
 
 /**
  * alloc_read_gpt_header(): Allocates GPT header, reads into it from disk
- * @bdev
+ * @state
  * @lba is the Logical Block Address of the partition table
  * 
  * Description: returns GPT header on success, NULL on error.   Allocates
- * and fills a GPT header starting at @ from @bdev.
+ * and fills a GPT header starting at @ from @state->bdev.
  * Note: remember to free gpt when finished with it.
  */
-static gpt_header *
-alloc_read_gpt_header(struct block_device *bdev, u64 lba)
+static gpt_header *alloc_read_gpt_header(struct parsed_partitions *state,
+					 u64 lba)
 {
 	gpt_header *gpt;
-	unsigned ssz = bdev_logical_block_size(bdev);
-
-	if (!bdev)
-		return NULL;
+	unsigned ssz = bdev_logical_block_size(state->bdev);
 
 	gpt = kzalloc(ssz, GFP_KERNEL);
 	if (!gpt)
 		return NULL;
 
-	if (read_lba(bdev, lba, (u8 *) gpt, ssz) < ssz) {
+	if (read_lba(state, lba, (u8 *) gpt, ssz) < ssz) {
 		kfree(gpt);
                 gpt=NULL;
 		return NULL;
@@ -282,7 +281,7 @@ alloc_read_gpt_header(struct block_device *bdev, u64 lba)
 
 /**
  * is_gpt_valid() - tests one GPT header and PTEs for validity
- * @bdev
+ * @state
  * @lba is the logical block address of the GPT header to test
  * @gpt is a GPT header ptr, filled on return.
  * @ptes is a PTEs ptr, filled on return.
@@ -290,16 +289,15 @@ alloc_read_gpt_header(struct block_device *bdev, u64 lba)
  * Description: returns 1 if valid,  0 on error.
  * If valid, returns pointers to newly allocated GPT header and PTEs.
  */
-static int
-is_gpt_valid(struct block_device *bdev, u64 lba,
-	     gpt_header **gpt, gpt_entry **ptes)
+static int is_gpt_valid(struct parsed_partitions *state, u64 lba,
+			gpt_header **gpt, gpt_entry **ptes)
 {
 	u32 crc, origcrc;
 	u64 lastlba;
 
-	if (!bdev || !gpt || !ptes)
+	if (!ptes)
 		return 0;
-	if (!(*gpt = alloc_read_gpt_header(bdev, lba)))
+	if (!(*gpt = alloc_read_gpt_header(state, lba)))
 		return 0;
 
 	/* Check the GUID Partition Table signature */
@@ -335,7 +333,7 @@ is_gpt_valid(struct block_device *bdev, u64 lba,
 	/* Check the first_usable_lba and last_usable_lba are
 	 * within the disk.
 	 */
-	lastlba = last_lba(bdev);
+	lastlba = last_lba(state->bdev);
 	if (le64_to_cpu((*gpt)->first_usable_lba) > lastlba) {
 		pr_debug("GPT: first_usable_lba incorrect: %lld > %lld\n",
 			 (unsigned long long)le64_to_cpu((*gpt)->first_usable_lba),
@@ -349,7 +347,7 @@ is_gpt_valid(struct block_device *bdev, u64 lba,
 		goto fail;
 	}
 
-	if (!(*ptes = alloc_read_gpt_entries(bdev, *gpt)))
+	if (!(*ptes = alloc_read_gpt_entries(state, *gpt)))
 		goto fail;
 
 	/* Check the GUID Partition Entry Array CRC */
@@ -494,7 +492,7 @@ compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba)
 
 /**
  * find_valid_gpt() - Search disk for valid GPT headers and PTEs
- * @bdev
+ * @state
  * @gpt is a GPT header ptr, filled on return.
  * @ptes is a PTEs ptr, filled on return.
  * Description: Returns 1 if valid, 0 on error.
@@ -507,24 +505,25 @@ compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba)
  * This protects against devices which misreport their size, and forces
  * the user to decide to use the Alternate GPT.
  */
-static int
-find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes)
+static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt,
+			  gpt_entry **ptes)
 {
 	int good_pgpt = 0, good_agpt = 0, good_pmbr = 0;
 	gpt_header *pgpt = NULL, *agpt = NULL;
 	gpt_entry *pptes = NULL, *aptes = NULL;
 	legacy_mbr *legacymbr;
 	u64 lastlba;
-	if (!bdev || !gpt || !ptes)
+
+	if (!ptes)
 		return 0;
 
-	lastlba = last_lba(bdev);
+	lastlba = last_lba(state->bdev);
         if (!force_gpt) {
                 /* This will be added to the EFI Spec. per Intel after v1.02. */
                 legacymbr = kzalloc(sizeof (*legacymbr), GFP_KERNEL);
                 if (legacymbr) {
-                        read_lba(bdev, 0, (u8 *) legacymbr,
-                                 sizeof (*legacymbr));
+                        read_lba(state, 0, (u8 *) legacymbr,
+				 sizeof (*legacymbr));
                         good_pmbr = is_pmbr_valid(legacymbr);
                         kfree(legacymbr);
                 }
@@ -532,15 +531,14 @@ find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes)
                         goto fail;
         }
 
-	good_pgpt = is_gpt_valid(bdev, GPT_PRIMARY_PARTITION_TABLE_LBA,
+	good_pgpt = is_gpt_valid(state, GPT_PRIMARY_PARTITION_TABLE_LBA,
 				 &pgpt, &pptes);
         if (good_pgpt)
-		good_agpt = is_gpt_valid(bdev,
+		good_agpt = is_gpt_valid(state,
 					 le64_to_cpu(pgpt->alternate_lba),
 					 &agpt, &aptes);
         if (!good_agpt && force_gpt)
-                good_agpt = is_gpt_valid(bdev, lastlba,
-                                         &agpt, &aptes);
+                good_agpt = is_gpt_valid(state, lastlba, &agpt, &aptes);
 
         /* The obviously unsuccessful case */
         if (!good_pgpt && !good_agpt)
@@ -582,9 +580,8 @@ find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes)
 }
 
 /**
- * efi_partition(struct parsed_partitions *state, struct block_device *bdev)
+ * efi_partition(struct parsed_partitions *state)
  * @state
- * @bdev
  *
  * Description: called from check.c, if the disk contains GPT
  * partitions, sets up partition entries in the kernel.
@@ -601,15 +598,14 @@ find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes)
  *  1 if successful
  *
  */
-int
-efi_partition(struct parsed_partitions *state, struct block_device *bdev)
+int efi_partition(struct parsed_partitions *state)
 {
 	gpt_header *gpt = NULL;
 	gpt_entry *ptes = NULL;
 	u32 i;
-	unsigned ssz = bdev_logical_block_size(bdev) / 512;
+	unsigned ssz = bdev_logical_block_size(state->bdev) / 512;
 
-	if (!find_valid_gpt(bdev, &gpt, &ptes) || !gpt || !ptes) {
+	if (!find_valid_gpt(state, &gpt, &ptes) || !gpt || !ptes) {
 		kfree(gpt);
 		kfree(ptes);
 		return 0;
@@ -622,7 +618,7 @@ efi_partition(struct parsed_partitions *state, struct block_device *bdev)
 		u64 size = le64_to_cpu(ptes[i].ending_lba) -
 			   le64_to_cpu(ptes[i].starting_lba) + 1ULL;
 
-		if (!is_pte_valid(&ptes[i], last_lba(bdev)))
+		if (!is_pte_valid(&ptes[i], last_lba(state->bdev)))
 			continue;
 
 		put_partition(state, i+1, start * ssz, size * ssz);
@@ -630,7 +626,7 @@ efi_partition(struct parsed_partitions *state, struct block_device *bdev)
 		/* If this is a RAID volume, tell md */
 		if (!efi_guidcmp(ptes[i].partition_type_guid,
 				 PARTITION_LINUX_RAID_GUID))
-			state->parts[i+1].flags = 1;
+			state->parts[i + 1].flags = ADDPART_FLAG_RAID;
 	}
 	kfree(ptes);
 	kfree(gpt);
diff --git a/fs/partitions/efi.h b/fs/partitions/efi.h
index 6998b589abf9..b69ab729558f 100644
--- a/fs/partitions/efi.h
+++ b/fs/partitions/efi.h
@@ -110,7 +110,7 @@ typedef struct _legacy_mbr {
 } __attribute__ ((packed)) legacy_mbr;
 
 /* Functions */
-extern int efi_partition(struct parsed_partitions *state, struct block_device *bdev);
+extern int efi_partition(struct parsed_partitions *state);
 
 #endif
 
diff --git a/fs/partitions/ibm.c b/fs/partitions/ibm.c
index fc71aab08460..fc8497643fd0 100644
--- a/fs/partitions/ibm.c
+++ b/fs/partitions/ibm.c
@@ -58,9 +58,9 @@ cchhb2blk (struct vtoc_cchhb *ptr, struct hd_geometry *geo) {
 
 /*
  */
-int
-ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
+int ibm_partition(struct parsed_partitions *state)
 {
+	struct block_device *bdev = state->bdev;
 	int blocksize, res;
 	loff_t i_size, offset, size, fmt_size;
 	dasd_information2_t *info;
@@ -74,6 +74,7 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
 	} *label;
 	unsigned char *data;
 	Sector sect;
+	sector_t labelsect;
 
 	res = 0;
 	blocksize = bdev_logical_block_size(bdev);
@@ -98,9 +99,19 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
 		goto out_freeall;
 
 	/*
+	 * Special case for FBA disks: label sector does not depend on
+	 * blocksize.
+	 */
+	if ((info->cu_type == 0x6310 && info->dev_type == 0x9336) ||
+	    (info->cu_type == 0x3880 && info->dev_type == 0x3370))
+		labelsect = info->label_block;
+	else
+		labelsect = info->label_block * (blocksize >> 9);
+
+	/*
 	 * Get volume label, extract name and type.
 	 */
-	data = read_dev_sector(bdev, info->label_block*(blocksize/512), &sect);
+	data = read_part_sector(state, labelsect, &sect);
 	if (data == NULL)
 		goto out_readerr;
 
@@ -193,8 +204,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
 			 */
 			blk = cchhb2blk(&label->vol.vtoc, geo) + 1;
 			counter = 0;
-			data = read_dev_sector(bdev, blk * (blocksize/512),
-					       &sect);
+			data = read_part_sector(state, blk * (blocksize/512),
+						&sect);
 			while (data != NULL) {
 				struct vtoc_format1_label f1;
 
@@ -208,9 +219,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
 				    || f1.DS1FMTID == _ascebc['7']
 				    || f1.DS1FMTID == _ascebc['9']) {
 					blk++;
-					data = read_dev_sector(bdev, blk *
-							       (blocksize/512),
-								&sect);
+					data = read_part_sector(state,
+						blk * (blocksize/512), &sect);
 					continue;
 				}
 
@@ -230,9 +240,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
 					      size * (blocksize >> 9));
 				counter++;
 				blk++;
-				data = read_dev_sector(bdev,
-						       blk * (blocksize/512),
-						       &sect);
+				data = read_part_sector(state,
+						blk * (blocksize/512), &sect);
 			}
 
 			if (!data)
diff --git a/fs/partitions/ibm.h b/fs/partitions/ibm.h
index 31f85a6ac459..08fb0804a812 100644
--- a/fs/partitions/ibm.h
+++ b/fs/partitions/ibm.h
@@ -1 +1 @@
-int ibm_partition(struct parsed_partitions *, struct block_device *);
+int ibm_partition(struct parsed_partitions *);
diff --git a/fs/partitions/karma.c b/fs/partitions/karma.c
index 176d89bcf123..1cc928bb762f 100644
--- a/fs/partitions/karma.c
+++ b/fs/partitions/karma.c
@@ -9,7 +9,7 @@
 #include "check.h"
 #include "karma.h"
 
-int karma_partition(struct parsed_partitions *state, struct block_device *bdev)
+int karma_partition(struct parsed_partitions *state)
 {
 	int i;
 	int slot = 1;
@@ -29,7 +29,7 @@ int karma_partition(struct parsed_partitions *state, struct block_device *bdev)
 	} __attribute__((packed)) *label;
 	struct d_partition *p;
 
-	data = read_dev_sector(bdev, 0, &sect);
+	data = read_part_sector(state, 0, &sect);
 	if (!data)
 		return -1;
 
diff --git a/fs/partitions/karma.h b/fs/partitions/karma.h
index ecf7d3f2a3d8..c764b2e9df21 100644
--- a/fs/partitions/karma.h
+++ b/fs/partitions/karma.h
@@ -4,5 +4,5 @@
 
 #define KARMA_LABEL_MAGIC		0xAB56
 
-int karma_partition(struct parsed_partitions *state, struct block_device *bdev);
+int karma_partition(struct parsed_partitions *state);
 
diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c
index 8652fb99e962..648c9d8f3357 100644
--- a/fs/partitions/ldm.c
+++ b/fs/partitions/ldm.c
@@ -26,6 +26,7 @@
 #include <linux/slab.h>
 #include <linux/pagemap.h>
 #include <linux/stringify.h>
+#include <linux/kernel.h>
 #include "ldm.h"
 #include "check.h"
 #include "msdos.h"
@@ -77,17 +78,16 @@ static int ldm_parse_hexbyte (const u8 *src)
 	int h;
 
 	/* high part */
-	if      ((x = src[0] - '0') <= '9'-'0') h = x;
-	else if ((x = src[0] - 'a') <= 'f'-'a') h = x+10;
-	else if ((x = src[0] - 'A') <= 'F'-'A') h = x+10;
-	else return -1;
-	h <<= 4;
+	x = h = hex_to_bin(src[0]);
+	if (h < 0)
+		return -1;
 
 	/* low part */
-	if ((x = src[1] - '0') <= '9'-'0') return h | x;
-	if ((x = src[1] - 'a') <= 'f'-'a') return h | (x+10);
-	if ((x = src[1] - 'A') <= 'F'-'A') return h | (x+10);
-	return -1;
+	h = hex_to_bin(src[1]);
+	if (h < 0)
+		return -1;
+
+	return (x << 4) + h;
 }
 
 /**
@@ -309,7 +309,7 @@ static bool ldm_compare_tocblocks (const struct tocblock *toc1,
 
 /**
  * ldm_validate_privheads - Compare the primary privhead with its backups
- * @bdev:  Device holding the LDM Database
+ * @state: Partition check state including device holding the LDM Database
  * @ph1:   Memory struct to fill with ph contents
  *
  * Read and compare all three privheads from disk.
@@ -321,8 +321,8 @@ static bool ldm_compare_tocblocks (const struct tocblock *toc1,
  * Return:  'true'   Success
  *          'false'  Error
  */
-static bool ldm_validate_privheads (struct block_device *bdev,
-				    struct privhead *ph1)
+static bool ldm_validate_privheads(struct parsed_partitions *state,
+				   struct privhead *ph1)
 {
 	static const int off[3] = { OFF_PRIV1, OFF_PRIV2, OFF_PRIV3 };
 	struct privhead *ph[3] = { ph1 };
@@ -332,7 +332,7 @@ static bool ldm_validate_privheads (struct block_device *bdev,
 	long num_sects;
 	int i;
 
-	BUG_ON (!bdev || !ph1);
+	BUG_ON (!state || !ph1);
 
 	ph[1] = kmalloc (sizeof (*ph[1]), GFP_KERNEL);
 	ph[2] = kmalloc (sizeof (*ph[2]), GFP_KERNEL);
@@ -346,8 +346,8 @@ static bool ldm_validate_privheads (struct block_device *bdev,
 
 	/* Read and parse privheads */
 	for (i = 0; i < 3; i++) {
-		data = read_dev_sector (bdev,
-			ph[0]->config_start + off[i], &sect);
+		data = read_part_sector(state, ph[0]->config_start + off[i],
+					&sect);
 		if (!data) {
 			ldm_crit ("Disk read failed.");
 			goto out;
@@ -363,7 +363,7 @@ static bool ldm_validate_privheads (struct block_device *bdev,
 		}
 	}
 
-	num_sects = bdev->bd_inode->i_size >> 9;
+	num_sects = state->bdev->bd_inode->i_size >> 9;
 
 	if ((ph[0]->config_start > num_sects) ||
 	   ((ph[0]->config_start + ph[0]->config_size) > num_sects)) {
@@ -397,20 +397,20 @@ out:
 
 /**
  * ldm_validate_tocblocks - Validate the table of contents and its backups
- * @bdev:  Device holding the LDM Database
- * @base:  Offset, into @bdev, of the database
+ * @state: Partition check state including device holding the LDM Database
+ * @base:  Offset, into @state->bdev, of the database
  * @ldb:   Cache of the database structures
  *
  * Find and compare the four tables of contents of the LDM Database stored on
- * @bdev and return the parsed information into @toc1.
+ * @state->bdev and return the parsed information into @toc1.
  *
  * The offsets and sizes of the configs are range-checked against a privhead.
  *
  * Return:  'true'   @toc1 contains validated TOCBLOCK info
  *          'false'  @toc1 contents are undefined
  */
-static bool ldm_validate_tocblocks(struct block_device *bdev,
-	unsigned long base, struct ldmdb *ldb)
+static bool ldm_validate_tocblocks(struct parsed_partitions *state,
+				   unsigned long base, struct ldmdb *ldb)
 {
 	static const int off[4] = { OFF_TOCB1, OFF_TOCB2, OFF_TOCB3, OFF_TOCB4};
 	struct tocblock *tb[4];
@@ -420,7 +420,7 @@ static bool ldm_validate_tocblocks(struct block_device *bdev,
 	int i, nr_tbs;
 	bool result = false;
 
-	BUG_ON(!bdev || !ldb);
+	BUG_ON(!state || !ldb);
 	ph = &ldb->ph;
 	tb[0] = &ldb->toc;
 	tb[1] = kmalloc(sizeof(*tb[1]) * 3, GFP_KERNEL);
@@ -437,7 +437,7 @@ static bool ldm_validate_tocblocks(struct block_device *bdev,
 	 * skip any that fail as long as we get at least one valid TOCBLOCK.
 	 */
 	for (nr_tbs = i = 0; i < 4; i++) {
-		data = read_dev_sector(bdev, base + off[i], &sect);
+		data = read_part_sector(state, base + off[i], &sect);
 		if (!data) {
 			ldm_error("Disk read failed for TOCBLOCK %d.", i);
 			continue;
@@ -473,7 +473,7 @@ err:
 
 /**
  * ldm_validate_vmdb - Read the VMDB and validate it
- * @bdev:  Device holding the LDM Database
+ * @state: Partition check state including device holding the LDM Database
  * @base:  Offset, into @bdev, of the database
  * @ldb:   Cache of the database structures
  *
@@ -483,8 +483,8 @@ err:
  * Return:  'true'   @ldb contains validated VBDB info
  *          'false'  @ldb contents are undefined
  */
-static bool ldm_validate_vmdb (struct block_device *bdev, unsigned long base,
-			       struct ldmdb *ldb)
+static bool ldm_validate_vmdb(struct parsed_partitions *state,
+			      unsigned long base, struct ldmdb *ldb)
 {
 	Sector sect;
 	u8 *data;
@@ -492,12 +492,12 @@ static bool ldm_validate_vmdb (struct block_device *bdev, unsigned long base,
 	struct vmdb *vm;
 	struct tocblock *toc;
 
-	BUG_ON (!bdev || !ldb);
+	BUG_ON (!state || !ldb);
 
 	vm  = &ldb->vm;
 	toc = &ldb->toc;
 
-	data = read_dev_sector (bdev, base + OFF_VMDB, &sect);
+	data = read_part_sector(state, base + OFF_VMDB, &sect);
 	if (!data) {
 		ldm_crit ("Disk read failed.");
 		return false;
@@ -534,21 +534,21 @@ out:
 
 /**
  * ldm_validate_partition_table - Determine whether bdev might be a dynamic disk
- * @bdev:  Device holding the LDM Database
+ * @state: Partition check state including device holding the LDM Database
  *
  * This function provides a weak test to decide whether the device is a dynamic
  * disk or not.  It looks for an MS-DOS-style partition table containing at
  * least one partition of type 0x42 (formerly SFS, now used by Windows for
  * dynamic disks).
  *
- * N.B.  The only possible error can come from the read_dev_sector and that is
+ * N.B.  The only possible error can come from the read_part_sector and that is
  *       only likely to happen if the underlying device is strange.  If that IS
  *       the case we should return zero to let someone else try.
  *
- * Return:  'true'   @bdev is a dynamic disk
- *          'false'  @bdev is not a dynamic disk, or an error occurred
+ * Return:  'true'   @state->bdev is a dynamic disk
+ *          'false'  @state->bdev is not a dynamic disk, or an error occurred
  */
-static bool ldm_validate_partition_table (struct block_device *bdev)
+static bool ldm_validate_partition_table(struct parsed_partitions *state)
 {
 	Sector sect;
 	u8 *data;
@@ -556,9 +556,9 @@ static bool ldm_validate_partition_table (struct block_device *bdev)
 	int i;
 	bool result = false;
 
-	BUG_ON (!bdev);
+	BUG_ON(!state);
 
-	data = read_dev_sector (bdev, 0, &sect);
+	data = read_part_sector(state, 0, &sect);
 	if (!data) {
 		ldm_crit ("Disk read failed.");
 		return false;
@@ -1391,8 +1391,8 @@ static bool ldm_frag_commit (struct list_head *frags, struct ldmdb *ldb)
 
 /**
  * ldm_get_vblks - Read the on-disk database of VBLKs into memory
- * @bdev:  Device holding the LDM Database
- * @base:  Offset, into @bdev, of the database
+ * @state: Partition check state including device holding the LDM Database
+ * @base:  Offset, into @state->bdev, of the database
  * @ldb:   Cache of the database structures
  *
  * To use the information from the VBLKs, they need to be read from the disk,
@@ -1401,8 +1401,8 @@ static bool ldm_frag_commit (struct list_head *frags, struct ldmdb *ldb)
  * Return:  'true'   All the VBLKs were read successfully
  *          'false'  An error occurred
  */
-static bool ldm_get_vblks (struct block_device *bdev, unsigned long base,
-			   struct ldmdb *ldb)
+static bool ldm_get_vblks(struct parsed_partitions *state, unsigned long base,
+			  struct ldmdb *ldb)
 {
 	int size, perbuf, skip, finish, s, v, recs;
 	u8 *data = NULL;
@@ -1410,7 +1410,7 @@ static bool ldm_get_vblks (struct block_device *bdev, unsigned long base,
 	bool result = false;
 	LIST_HEAD (frags);
 
-	BUG_ON (!bdev || !ldb);
+	BUG_ON(!state || !ldb);
 
 	size   = ldb->vm.vblk_size;
 	perbuf = 512 / size;
@@ -1418,7 +1418,7 @@ static bool ldm_get_vblks (struct block_device *bdev, unsigned long base,
 	finish = (size * ldb->vm.last_vblk_seq) >> 9;
 
 	for (s = skip; s < finish; s++) {		/* For each sector */
-		data = read_dev_sector (bdev, base + OFF_VMDB + s, &sect);
+		data = read_part_sector(state, base + OFF_VMDB + s, &sect);
 		if (!data) {
 			ldm_crit ("Disk read failed.");
 			goto out;
@@ -1474,8 +1474,7 @@ static void ldm_free_vblks (struct list_head *lh)
 
 /**
  * ldm_partition - Find out whether a device is a dynamic disk and handle it
- * @pp:    List of the partitions parsed so far
- * @bdev:  Device holding the LDM Database
+ * @state: Partition check state including device holding the LDM Database
  *
  * This determines whether the device @bdev is a dynamic disk and if so creates
  * the partitions necessary in the gendisk structure pointed to by @hd.
@@ -1485,21 +1484,21 @@ static void ldm_free_vblks (struct list_head *lh)
  * example, if the device is hda, we would have: hda1: LDM database, hda2, hda3,
  * and so on: the actual data containing partitions.
  *
- * Return:  1 Success, @bdev is a dynamic disk and we handled it
- *          0 Success, @bdev is not a dynamic disk
+ * Return:  1 Success, @state->bdev is a dynamic disk and we handled it
+ *          0 Success, @state->bdev is not a dynamic disk
  *         -1 An error occurred before enough information had been read
- *            Or @bdev is a dynamic disk, but it may be corrupted
+ *            Or @state->bdev is a dynamic disk, but it may be corrupted
  */
-int ldm_partition (struct parsed_partitions *pp, struct block_device *bdev)
+int ldm_partition(struct parsed_partitions *state)
 {
 	struct ldmdb  *ldb;
 	unsigned long base;
 	int result = -1;
 
-	BUG_ON (!pp || !bdev);
+	BUG_ON(!state);
 
 	/* Look for signs of a Dynamic Disk */
-	if (!ldm_validate_partition_table (bdev))
+	if (!ldm_validate_partition_table(state))
 		return 0;
 
 	ldb = kmalloc (sizeof (*ldb), GFP_KERNEL);
@@ -1509,15 +1508,15 @@ int ldm_partition (struct parsed_partitions *pp, struct block_device *bdev)
 	}
 
 	/* Parse and check privheads. */
-	if (!ldm_validate_privheads (bdev, &ldb->ph))
+	if (!ldm_validate_privheads(state, &ldb->ph))
 		goto out;		/* Already logged */
 
 	/* All further references are relative to base (database start). */
 	base = ldb->ph.config_start;
 
 	/* Parse and check tocs and vmdb. */
-	if (!ldm_validate_tocblocks (bdev, base, ldb) ||
-	    !ldm_validate_vmdb      (bdev, base, ldb))
+	if (!ldm_validate_tocblocks(state, base, ldb) ||
+	    !ldm_validate_vmdb(state, base, ldb))
 	    	goto out;		/* Already logged */
 
 	/* Initialize vblk lists in ldmdb struct */
@@ -1527,13 +1526,13 @@ int ldm_partition (struct parsed_partitions *pp, struct block_device *bdev)
 	INIT_LIST_HEAD (&ldb->v_comp);
 	INIT_LIST_HEAD (&ldb->v_part);
 
-	if (!ldm_get_vblks (bdev, base, ldb)) {
+	if (!ldm_get_vblks(state, base, ldb)) {
 		ldm_crit ("Failed to read the VBLKs from the database.");
 		goto cleanup;
 	}
 
 	/* Finally, create the data partition devices. */
-	if (ldm_create_data_partitions (pp, ldb)) {
+	if (ldm_create_data_partitions(state, ldb)) {
 		ldm_debug ("Parsed LDM database successfully.");
 		result = 1;
 	}
diff --git a/fs/partitions/ldm.h b/fs/partitions/ldm.h
index 30e08e809c1d..d1fb50b28d86 100644
--- a/fs/partitions/ldm.h
+++ b/fs/partitions/ldm.h
@@ -209,7 +209,7 @@ struct ldmdb {				/* Cache of the database */
 	struct list_head v_part;
 };
 
-int ldm_partition (struct parsed_partitions *state, struct block_device *bdev);
+int ldm_partition(struct parsed_partitions *state);
 
 #endif /* _FS_PT_LDM_H_ */
 
diff --git a/fs/partitions/mac.c b/fs/partitions/mac.c
index d4a0fad3563b..74465ff7c263 100644
--- a/fs/partitions/mac.c
+++ b/fs/partitions/mac.c
@@ -27,7 +27,7 @@ static inline void mac_fix_string(char *stg, int len)
 		stg[i] = 0;
 }
 
-int mac_partition(struct parsed_partitions *state, struct block_device *bdev)
+int mac_partition(struct parsed_partitions *state)
 {
 	int slot = 1;
 	Sector sect;
@@ -42,7 +42,7 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev)
 	struct mac_driver_desc *md;
 
 	/* Get 0th block and look at the first partition map entry. */
-	md = (struct mac_driver_desc *) read_dev_sector(bdev, 0, &sect);
+	md = read_part_sector(state, 0, &sect);
 	if (!md)
 		return -1;
 	if (be16_to_cpu(md->signature) != MAC_DRIVER_MAGIC) {
@@ -51,7 +51,7 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev)
 	}
 	secsize = be16_to_cpu(md->block_size);
 	put_dev_sector(sect);
-	data = read_dev_sector(bdev, secsize/512, &sect);
+	data = read_part_sector(state, secsize/512, &sect);
 	if (!data)
 		return -1;
 	part = (struct mac_partition *) (data + secsize%512);
@@ -64,7 +64,7 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev)
 	for (blk = 1; blk <= blocks_in_map; ++blk) {
 		int pos = blk * secsize;
 		put_dev_sector(sect);
-		data = read_dev_sector(bdev, pos/512, &sect);
+		data = read_part_sector(state, pos/512, &sect);
 		if (!data)
 			return -1;
 		part = (struct mac_partition *) (data + pos%512);
@@ -75,7 +75,7 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev)
 			be32_to_cpu(part->block_count) * (secsize/512));
 
 		if (!strnicmp(part->type, "Linux_RAID", 10))
-			state->parts[slot].flags = 1;
+			state->parts[slot].flags = ADDPART_FLAG_RAID;
 #ifdef CONFIG_PPC_PMAC
 		/*
 		 * If this is the first bootable partition, tell the
@@ -123,7 +123,8 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev)
 	}
 #ifdef CONFIG_PPC_PMAC
 	if (found_root_goodness)
-		note_bootable_part(bdev->bd_dev, found_root, found_root_goodness);
+		note_bootable_part(state->bdev->bd_dev, found_root,
+				   found_root_goodness);
 #endif
 
 	put_dev_sector(sect);
diff --git a/fs/partitions/mac.h b/fs/partitions/mac.h
index bbf26e1386fa..3c7d98436380 100644
--- a/fs/partitions/mac.h
+++ b/fs/partitions/mac.h
@@ -41,4 +41,4 @@ struct mac_driver_desc {
     /* ... more stuff */
 };
 
-int mac_partition(struct parsed_partitions *state, struct block_device *bdev);
+int mac_partition(struct parsed_partitions *state);
diff --git a/fs/partitions/msdos.c b/fs/partitions/msdos.c
index 0028d2ef0662..15bfb7b1e044 100644
--- a/fs/partitions/msdos.c
+++ b/fs/partitions/msdos.c
@@ -31,14 +31,17 @@
  */
 #include <asm/unaligned.h>
 
-#define SYS_IND(p)	(get_unaligned(&p->sys_ind))
-#define NR_SECTS(p)	({ __le32 __a =	get_unaligned(&p->nr_sects);	\
-				le32_to_cpu(__a); \
-			})
+#define SYS_IND(p)	get_unaligned(&p->sys_ind)
 
-#define START_SECT(p)	({ __le32 __a =	get_unaligned(&p->start_sect);	\
-				le32_to_cpu(__a); \
-			})
+static inline sector_t nr_sects(struct partition *p)
+{
+	return (sector_t)get_unaligned_le32(&p->nr_sects);
+}
+
+static inline sector_t start_sect(struct partition *p)
+{
+	return (sector_t)get_unaligned_le32(&p->start_sect);
+}
 
 static inline int is_extended_partition(struct partition *p)
 {
@@ -61,7 +64,7 @@ msdos_magic_present(unsigned char *p)
 #define AIX_LABEL_MAGIC2	0xC2
 #define AIX_LABEL_MAGIC3	0xD4
 #define AIX_LABEL_MAGIC4	0xC1
-static int aix_magic_present(unsigned char *p, struct block_device *bdev)
+static int aix_magic_present(struct parsed_partitions *state, unsigned char *p)
 {
 	struct partition *pt = (struct partition *) (p + 0x1be);
 	Sector sect;
@@ -82,7 +85,7 @@ static int aix_magic_present(unsigned char *p, struct block_device *bdev)
 			is_extended_partition(pt))
 			return 0;
 	}
-	d = read_dev_sector(bdev, 7, &sect);
+	d = read_part_sector(state, 7, &sect);
 	if (d) {
 		if (d[0] == '_' && d[1] == 'L' && d[2] == 'V' && d[3] == 'M')
 			ret = 1;
@@ -102,15 +105,14 @@ static int aix_magic_present(unsigned char *p, struct block_device *bdev)
  * only for the actual data partitions.
  */
 
-static void
-parse_extended(struct parsed_partitions *state, struct block_device *bdev,
-			u32 first_sector, u32 first_size)
+static void parse_extended(struct parsed_partitions *state,
+			   sector_t first_sector, sector_t first_size)
 {
 	struct partition *p;
 	Sector sect;
 	unsigned char *data;
-	u32 this_sector, this_size;
-	int sector_size = bdev_logical_block_size(bdev) / 512;
+	sector_t this_sector, this_size;
+	sector_t sector_size = bdev_logical_block_size(state->bdev) / 512;
 	int loopct = 0;		/* number of links followed
 				   without finding a data partition */
 	int i;
@@ -123,7 +125,7 @@ parse_extended(struct parsed_partitions *state, struct block_device *bdev,
 			return;
 		if (state->next == state->limit)
 			return;
-		data = read_dev_sector(bdev, this_sector, &sect);
+		data = read_part_sector(state, this_sector, &sect);
 		if (!data)
 			return;
 
@@ -145,14 +147,14 @@ parse_extended(struct parsed_partitions *state, struct block_device *bdev,
 		 * First process the data partition(s)
 		 */
 		for (i=0; i<4; i++, p++) {
-			u32 offs, size, next;
-			if (!NR_SECTS(p) || is_extended_partition(p))
+			sector_t offs, size, next;
+			if (!nr_sects(p) || is_extended_partition(p))
 				continue;
 
 			/* Check the 3rd and 4th entries -
 			   these sometimes contain random garbage */
-			offs = START_SECT(p)*sector_size;
-			size = NR_SECTS(p)*sector_size;
+			offs = start_sect(p)*sector_size;
+			size = nr_sects(p)*sector_size;
 			next = this_sector + offs;
 			if (i >= 2) {
 				if (offs + size > this_size)
@@ -179,13 +181,13 @@ parse_extended(struct parsed_partitions *state, struct block_device *bdev,
 		 */
 		p -= 4;
 		for (i=0; i<4; i++, p++)
-			if (NR_SECTS(p) && is_extended_partition(p))
+			if (nr_sects(p) && is_extended_partition(p))
 				break;
 		if (i == 4)
 			goto done;	 /* nothing left to do */
 
-		this_sector = first_sector + START_SECT(p) * sector_size;
-		this_size = NR_SECTS(p) * sector_size;
+		this_sector = first_sector + start_sect(p) * sector_size;
+		this_size = nr_sects(p) * sector_size;
 		put_dev_sector(sect);
 	}
 done:
@@ -195,9 +197,8 @@ done:
 /* james@bpgc.com: Solaris has a nasty indicator: 0x82 which also
    indicates linux swap.  Be careful before believing this is Solaris. */
 
-static void
-parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev,
-			u32 offset, u32 size, int origin)
+static void parse_solaris_x86(struct parsed_partitions *state,
+			      sector_t offset, sector_t size, int origin)
 {
 #ifdef CONFIG_SOLARIS_X86_PARTITION
 	Sector sect;
@@ -205,7 +206,7 @@ parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev,
 	int i;
 	short max_nparts;
 
-	v = (struct solaris_x86_vtoc *)read_dev_sector(bdev, offset+1, &sect);
+	v = read_part_sector(state, offset + 1, &sect);
 	if (!v)
 		return;
 	if (le32_to_cpu(v->v_sanity) != SOLARIS_X86_VTOC_SANE) {
@@ -242,16 +243,15 @@ parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev,
  * Create devices for BSD partitions listed in a disklabel, under a
  * dos-like partition. See parse_extended() for more information.
  */
-static void
-parse_bsd(struct parsed_partitions *state, struct block_device *bdev,
-		u32 offset, u32 size, int origin, char *flavour,
-		int max_partitions)
+static void parse_bsd(struct parsed_partitions *state,
+		      sector_t offset, sector_t size, int origin, char *flavour,
+		      int max_partitions)
 {
 	Sector sect;
 	struct bsd_disklabel *l;
 	struct bsd_partition *p;
 
-	l = (struct bsd_disklabel *)read_dev_sector(bdev, offset+1, &sect);
+	l = read_part_sector(state, offset + 1, &sect);
 	if (!l)
 		return;
 	if (le32_to_cpu(l->d_magic) != BSD_DISKMAGIC) {
@@ -263,7 +263,7 @@ parse_bsd(struct parsed_partitions *state, struct block_device *bdev,
 	if (le16_to_cpu(l->d_npartitions) < max_partitions)
 		max_partitions = le16_to_cpu(l->d_npartitions);
 	for (p = l->d_partitions; p - l->d_partitions < max_partitions; p++) {
-		u32 bsd_start, bsd_size;
+		sector_t bsd_start, bsd_size;
 
 		if (state->next == state->limit)
 			break;
@@ -288,33 +288,28 @@ parse_bsd(struct parsed_partitions *state, struct block_device *bdev,
 }
 #endif
 
-static void
-parse_freebsd(struct parsed_partitions *state, struct block_device *bdev,
-		u32 offset, u32 size, int origin)
+static void parse_freebsd(struct parsed_partitions *state,
+			  sector_t offset, sector_t size, int origin)
 {
 #ifdef CONFIG_BSD_DISKLABEL
-	parse_bsd(state, bdev, offset, size, origin,
-			"bsd", BSD_MAXPARTITIONS);
+	parse_bsd(state, offset, size, origin, "bsd", BSD_MAXPARTITIONS);
 #endif
 }
 
-static void
-parse_netbsd(struct parsed_partitions *state, struct block_device *bdev,
-		u32 offset, u32 size, int origin)
+static void parse_netbsd(struct parsed_partitions *state,
+			 sector_t offset, sector_t size, int origin)
 {
 #ifdef CONFIG_BSD_DISKLABEL
-	parse_bsd(state, bdev, offset, size, origin,
-			"netbsd", BSD_MAXPARTITIONS);
+	parse_bsd(state, offset, size, origin, "netbsd", BSD_MAXPARTITIONS);
 #endif
 }
 
-static void
-parse_openbsd(struct parsed_partitions *state, struct block_device *bdev,
-		u32 offset, u32 size, int origin)
+static void parse_openbsd(struct parsed_partitions *state,
+			  sector_t offset, sector_t size, int origin)
 {
 #ifdef CONFIG_BSD_DISKLABEL
-	parse_bsd(state, bdev, offset, size, origin,
-			"openbsd", OPENBSD_MAXPARTITIONS);
+	parse_bsd(state, offset, size, origin, "openbsd",
+		  OPENBSD_MAXPARTITIONS);
 #endif
 }
 
@@ -322,16 +317,15 @@ parse_openbsd(struct parsed_partitions *state, struct block_device *bdev,
  * Create devices for Unixware partitions listed in a disklabel, under a
  * dos-like partition. See parse_extended() for more information.
  */
-static void
-parse_unixware(struct parsed_partitions *state, struct block_device *bdev,
-		u32 offset, u32 size, int origin)
+static void parse_unixware(struct parsed_partitions *state,
+			   sector_t offset, sector_t size, int origin)
 {
 #ifdef CONFIG_UNIXWARE_DISKLABEL
 	Sector sect;
 	struct unixware_disklabel *l;
 	struct unixware_slice *p;
 
-	l = (struct unixware_disklabel *)read_dev_sector(bdev, offset+29, &sect);
+	l = read_part_sector(state, offset + 29, &sect);
 	if (!l)
 		return;
 	if (le32_to_cpu(l->d_magic) != UNIXWARE_DISKMAGIC ||
@@ -348,7 +342,8 @@ parse_unixware(struct parsed_partitions *state, struct block_device *bdev,
 
 		if (p->s_label != UNIXWARE_FS_UNUSED)
 			put_partition(state, state->next++,
-						START_SECT(p), NR_SECTS(p));
+				      le32_to_cpu(p->start_sect),
+				      le32_to_cpu(p->nr_sects));
 		p++;
 	}
 	put_dev_sector(sect);
@@ -361,9 +356,8 @@ parse_unixware(struct parsed_partitions *state, struct block_device *bdev,
  * Anand Krishnamurthy <anandk@wiproge.med.ge.com>
  * Rajeev V. Pillai    <rajeevvp@yahoo.com>
  */
-static void
-parse_minix(struct parsed_partitions *state, struct block_device *bdev,
-		u32 offset, u32 size, int origin)
+static void parse_minix(struct parsed_partitions *state,
+			sector_t offset, sector_t size, int origin)
 {
 #ifdef CONFIG_MINIX_SUBPARTITION
 	Sector sect;
@@ -371,7 +365,7 @@ parse_minix(struct parsed_partitions *state, struct block_device *bdev,
 	struct partition *p;
 	int i;
 
-	data = read_dev_sector(bdev, offset, &sect);
+	data = read_part_sector(state, offset, &sect);
 	if (!data)
 		return;
 
@@ -390,7 +384,7 @@ parse_minix(struct parsed_partitions *state, struct block_device *bdev,
 			/* add each partition in use */
 			if (SYS_IND(p) == MINIX_PARTITION)
 				put_partition(state, state->next++,
-					      START_SECT(p), NR_SECTS(p));
+					      start_sect(p), nr_sects(p));
 		}
 		printk(" >\n");
 	}
@@ -400,8 +394,7 @@ parse_minix(struct parsed_partitions *state, struct block_device *bdev,
 
 static struct {
 	unsigned char id;
-	void (*parse)(struct parsed_partitions *, struct block_device *,
-			u32, u32, int);
+	void (*parse)(struct parsed_partitions *, sector_t, sector_t, int);
 } subtypes[] = {
 	{FREEBSD_PARTITION, parse_freebsd},
 	{NETBSD_PARTITION, parse_netbsd},
@@ -413,16 +406,16 @@ static struct {
 	{0, NULL},
 };
  
-int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
+int msdos_partition(struct parsed_partitions *state)
 {
-	int sector_size = bdev_logical_block_size(bdev) / 512;
+	sector_t sector_size = bdev_logical_block_size(state->bdev) / 512;
 	Sector sect;
 	unsigned char *data;
 	struct partition *p;
 	struct fat_boot_sector *fb;
 	int slot;
 
-	data = read_dev_sector(bdev, 0, &sect);
+	data = read_part_sector(state, 0, &sect);
 	if (!data)
 		return -1;
 	if (!msdos_magic_present(data + 510)) {
@@ -430,7 +423,7 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
 		return 0;
 	}
 
-	if (aix_magic_present(data, bdev)) {
+	if (aix_magic_present(state, data)) {
 		put_dev_sector(sect);
 		printk( " [AIX]");
 		return 0;
@@ -483,22 +476,29 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
 
 	state->next = 5;
 	for (slot = 1 ; slot <= 4 ; slot++, p++) {
-		u32 start = START_SECT(p)*sector_size;
-		u32 size = NR_SECTS(p)*sector_size;
+		sector_t start = start_sect(p)*sector_size;
+		sector_t size = nr_sects(p)*sector_size;
 		if (!size)
 			continue;
 		if (is_extended_partition(p)) {
-			/* prevent someone doing mkfs or mkswap on an
-			   extended partition, but leave room for LILO */
-			put_partition(state, slot, start, size == 1 ? 1 : 2);
+			/*
+			 * prevent someone doing mkfs or mkswap on an
+			 * extended partition, but leave room for LILO
+			 * FIXME: this uses one logical sector for > 512b
+			 * sector, although it may not be enough/proper.
+			 */
+			sector_t n = 2;
+			n = min(size, max(sector_size, n));
+			put_partition(state, slot, start, n);
+
 			printk(" <");
-			parse_extended(state, bdev, start, size);
+			parse_extended(state, start, size);
 			printk(" >");
 			continue;
 		}
 		put_partition(state, slot, start, size);
 		if (SYS_IND(p) == LINUX_RAID_PARTITION)
-			state->parts[slot].flags = 1;
+			state->parts[slot].flags = ADDPART_FLAG_RAID;
 		if (SYS_IND(p) == DM6_PARTITION)
 			printk("[DM]");
 		if (SYS_IND(p) == EZD_PARTITION)
@@ -513,7 +513,7 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
 		unsigned char id = SYS_IND(p);
 		int n;
 
-		if (!NR_SECTS(p))
+		if (!nr_sects(p))
 			continue;
 
 		for (n = 0; subtypes[n].parse && id != subtypes[n].id; n++)
@@ -521,8 +521,8 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
 
 		if (!subtypes[n].parse)
 			continue;
-		subtypes[n].parse(state, bdev, START_SECT(p)*sector_size,
-						NR_SECTS(p)*sector_size, slot);
+		subtypes[n].parse(state, start_sect(p) * sector_size,
+				  nr_sects(p) * sector_size, slot);
 	}
 	put_dev_sector(sect);
 	return 1;
diff --git a/fs/partitions/msdos.h b/fs/partitions/msdos.h
index 01e5e0b6902d..38c781c490b3 100644
--- a/fs/partitions/msdos.h
+++ b/fs/partitions/msdos.h
@@ -4,5 +4,5 @@
 
 #define MSDOS_LABEL_MAGIC		0xAA55
 
-int msdos_partition(struct parsed_partitions *state, struct block_device *bdev);
+int msdos_partition(struct parsed_partitions *state);
 
diff --git a/fs/partitions/osf.c b/fs/partitions/osf.c
index c05c17bc5df3..fc22b85d436a 100644
--- a/fs/partitions/osf.c
+++ b/fs/partitions/osf.c
@@ -10,7 +10,7 @@
 #include "check.h"
 #include "osf.h"
 
-int osf_partition(struct parsed_partitions *state, struct block_device *bdev)
+int osf_partition(struct parsed_partitions *state)
 {
 	int i;
 	int slot = 1;
@@ -49,7 +49,7 @@ int osf_partition(struct parsed_partitions *state, struct block_device *bdev)
 	} * label;
 	struct d_partition * partition;
 
-	data = read_dev_sector(bdev, 0, &sect);
+	data = read_part_sector(state, 0, &sect);
 	if (!data)
 		return -1;
 
diff --git a/fs/partitions/osf.h b/fs/partitions/osf.h
index 427b8eab314b..20ed2315ec16 100644
--- a/fs/partitions/osf.h
+++ b/fs/partitions/osf.h
@@ -4,4 +4,4 @@
 
 #define DISKLABELMAGIC (0x82564557UL)
 
-int osf_partition(struct parsed_partitions *state, struct block_device *bdev);
+int osf_partition(struct parsed_partitions *state);
diff --git a/fs/partitions/sgi.c b/fs/partitions/sgi.c
index ed5ac83fe83a..43b1df9aa16c 100644
--- a/fs/partitions/sgi.c
+++ b/fs/partitions/sgi.c
@@ -27,7 +27,7 @@ struct sgi_disklabel {
 	__be32 _unused1;			/* Padding */
 };
 
-int sgi_partition(struct parsed_partitions *state, struct block_device *bdev)
+int sgi_partition(struct parsed_partitions *state)
 {
 	int i, csum;
 	__be32 magic;
@@ -39,7 +39,7 @@ int sgi_partition(struct parsed_partitions *state, struct block_device *bdev)
 	struct sgi_partition *p;
 	char b[BDEVNAME_SIZE];
 
-	label = (struct sgi_disklabel *) read_dev_sector(bdev, 0, &sect);
+	label = read_part_sector(state, 0, &sect);
 	if (!label)
 		return -1;
 	p = &label->partitions[0];
@@ -57,7 +57,7 @@ int sgi_partition(struct parsed_partitions *state, struct block_device *bdev)
 	}
 	if(csum) {
 		printk(KERN_WARNING "Dev %s SGI disklabel: csum bad, label corrupted\n",
-		       bdevname(bdev, b));
+		       bdevname(state->bdev, b));
 		put_dev_sector(sect);
 		return 0;
 	}
diff --git a/fs/partitions/sgi.h b/fs/partitions/sgi.h
index 5d5595c09928..b9553ebdd5a9 100644
--- a/fs/partitions/sgi.h
+++ b/fs/partitions/sgi.h
@@ -2,7 +2,7 @@
  *  fs/partitions/sgi.h
  */
 
-extern int sgi_partition(struct parsed_partitions *state, struct block_device *bdev);
+extern int sgi_partition(struct parsed_partitions *state);
 
 #define SGI_LABEL_MAGIC 0x0be5a941
 
diff --git a/fs/partitions/sun.c b/fs/partitions/sun.c
index c95e6a62c01d..a32660e25f7f 100644
--- a/fs/partitions/sun.c
+++ b/fs/partitions/sun.c
@@ -10,7 +10,7 @@
 #include "check.h"
 #include "sun.h"
 
-int sun_partition(struct parsed_partitions *state, struct block_device *bdev)
+int sun_partition(struct parsed_partitions *state)
 {
 	int i;
 	__be16 csum;
@@ -61,7 +61,7 @@ int sun_partition(struct parsed_partitions *state, struct block_device *bdev)
 	int use_vtoc;
 	int nparts;
 
-	label = (struct sun_disklabel *)read_dev_sector(bdev, 0, &sect);
+	label = read_part_sector(state, 0, &sect);
 	if (!label)
 		return -1;
 
@@ -78,7 +78,7 @@ int sun_partition(struct parsed_partitions *state, struct block_device *bdev)
 		csum ^= *ush--;
 	if (csum) {
 		printk("Dev %s Sun disklabel: Csum bad, label corrupted\n",
-		       bdevname(bdev, b));
+		       bdevname(state->bdev, b));
 		put_dev_sector(sect);
 		return 0;
 	}
diff --git a/fs/partitions/sun.h b/fs/partitions/sun.h
index 7f864d1f86d4..2424baa8319f 100644
--- a/fs/partitions/sun.h
+++ b/fs/partitions/sun.h
@@ -5,4 +5,4 @@
 #define SUN_LABEL_MAGIC          0xDABE
 #define SUN_VTOC_SANITY          0x600DDEEE
 
-int sun_partition(struct parsed_partitions *state, struct block_device *bdev);
+int sun_partition(struct parsed_partitions *state);
diff --git a/fs/partitions/sysv68.c b/fs/partitions/sysv68.c
index 4eba27b78643..9030c864428e 100644
--- a/fs/partitions/sysv68.c
+++ b/fs/partitions/sysv68.c
@@ -46,7 +46,7 @@ struct slice {
 };
 
 
-int sysv68_partition(struct parsed_partitions *state, struct block_device *bdev)
+int sysv68_partition(struct parsed_partitions *state)
 {
 	int i, slices;
 	int slot = 1;
@@ -55,7 +55,7 @@ int sysv68_partition(struct parsed_partitions *state, struct block_device *bdev)
 	struct dkblk0 *b;
 	struct slice *slice;
 
-	data = read_dev_sector(bdev, 0, &sect);
+	data = read_part_sector(state, 0, &sect);
 	if (!data)
 		return -1;
 
@@ -68,7 +68,7 @@ int sysv68_partition(struct parsed_partitions *state, struct block_device *bdev)
 	i = be32_to_cpu(b->dk_ios.ios_slcblk);
 	put_dev_sector(sect);
 
-	data = read_dev_sector(bdev, i, &sect);
+	data = read_part_sector(state, i, &sect);
 	if (!data)
 		return -1;
 
diff --git a/fs/partitions/sysv68.h b/fs/partitions/sysv68.h
index fa733f68431b..bf2f5ffa97ac 100644
--- a/fs/partitions/sysv68.h
+++ b/fs/partitions/sysv68.h
@@ -1 +1 @@
-extern int sysv68_partition(struct parsed_partitions *state, struct block_device *bdev);
+extern int sysv68_partition(struct parsed_partitions *state);
diff --git a/fs/partitions/ultrix.c b/fs/partitions/ultrix.c
index ec852c11dce4..db9eef260364 100644
--- a/fs/partitions/ultrix.c
+++ b/fs/partitions/ultrix.c
@@ -9,7 +9,7 @@
 #include "check.h"
 #include "ultrix.h"
 
-int ultrix_partition(struct parsed_partitions *state, struct block_device *bdev)
+int ultrix_partition(struct parsed_partitions *state)
 {
 	int i;
 	Sector sect;
@@ -26,7 +26,7 @@ int ultrix_partition(struct parsed_partitions *state, struct block_device *bdev)
 #define PT_MAGIC	0x032957	/* Partition magic number */
 #define PT_VALID	1		/* Indicates if struct is valid */
 
-	data = read_dev_sector(bdev, (16384 - sizeof(*label))/512, &sect);
+	data = read_part_sector(state, (16384 - sizeof(*label))/512, &sect);
 	if (!data)
 		return -1;
 	
diff --git a/fs/partitions/ultrix.h b/fs/partitions/ultrix.h
index a74bf8e2d370..a3cc00b2bded 100644
--- a/fs/partitions/ultrix.h
+++ b/fs/partitions/ultrix.h
@@ -2,4 +2,4 @@
  *  fs/partitions/ultrix.h
  */
 
-int ultrix_partition(struct parsed_partitions *state, struct block_device *bdev);
+int ultrix_partition(struct parsed_partitions *state);
diff --git a/fs/pipe.c b/fs/pipe.c
index 37ba29ff3158..279eef96c51c 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -11,6 +11,7 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/fs.h>
+#include <linux/log2.h>
 #include <linux/mount.h>
 #include <linux/pipe_fs_i.h>
 #include <linux/uio.h>
@@ -18,11 +19,23 @@
 #include <linux/pagemap.h>
 #include <linux/audit.h>
 #include <linux/syscalls.h>
+#include <linux/fcntl.h>
 
 #include <asm/uaccess.h>
 #include <asm/ioctls.h>
 
 /*
+ * The max size that a non-root user is allowed to grow the pipe. Can
+ * be set by root in /proc/sys/fs/pipe-max-size
+ */
+unsigned int pipe_max_size = 1048576;
+
+/*
+ * Minimum pipe size, as required by POSIX
+ */
+unsigned int pipe_min_size = PAGE_SIZE;
+
+/*
  * We use a start+len construction, which provides full use of the 
  * allocated memory.
  * -- Florian Coosmann (FGC)
@@ -222,6 +235,7 @@ void *generic_pipe_buf_map(struct pipe_inode_info *pipe,
 
 	return kmap(buf->page);
 }
+EXPORT_SYMBOL(generic_pipe_buf_map);
 
 /**
  * generic_pipe_buf_unmap - unmap a previously mapped pipe buffer
@@ -241,6 +255,7 @@ void generic_pipe_buf_unmap(struct pipe_inode_info *pipe,
 	} else
 		kunmap(buf->page);
 }
+EXPORT_SYMBOL(generic_pipe_buf_unmap);
 
 /**
  * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer
@@ -271,6 +286,7 @@ int generic_pipe_buf_steal(struct pipe_inode_info *pipe,
 
 	return 1;
 }
+EXPORT_SYMBOL(generic_pipe_buf_steal);
 
 /**
  * generic_pipe_buf_get - get a reference to a &struct pipe_buffer
@@ -286,6 +302,7 @@ void generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
 {
 	page_cache_get(buf->page);
 }
+EXPORT_SYMBOL(generic_pipe_buf_get);
 
 /**
  * generic_pipe_buf_confirm - verify contents of the pipe buffer
@@ -301,6 +318,7 @@ int generic_pipe_buf_confirm(struct pipe_inode_info *info,
 {
 	return 0;
 }
+EXPORT_SYMBOL(generic_pipe_buf_confirm);
 
 /**
  * generic_pipe_buf_release - put a reference to a &struct pipe_buffer
@@ -315,6 +333,7 @@ void generic_pipe_buf_release(struct pipe_inode_info *pipe,
 {
 	page_cache_release(buf->page);
 }
+EXPORT_SYMBOL(generic_pipe_buf_release);
 
 static const struct pipe_buf_operations anon_pipe_buf_ops = {
 	.can_merge = 1,
@@ -390,7 +409,7 @@ redo:
 			if (!buf->len) {
 				buf->ops = NULL;
 				ops->release(pipe, buf);
-				curbuf = (curbuf + 1) & (PIPE_BUFFERS-1);
+				curbuf = (curbuf + 1) & (pipe->buffers - 1);
 				pipe->curbuf = curbuf;
 				pipe->nrbufs = --bufs;
 				do_wakeup = 1;
@@ -472,7 +491,7 @@ pipe_write(struct kiocb *iocb, const struct iovec *_iov,
 	chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */
 	if (pipe->nrbufs && chars != 0) {
 		int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) &
-							(PIPE_BUFFERS-1);
+							(pipe->buffers - 1);
 		struct pipe_buffer *buf = pipe->bufs + lastbuf;
 		const struct pipe_buf_operations *ops = buf->ops;
 		int offset = buf->offset + buf->len;
@@ -518,8 +537,8 @@ redo1:
 			break;
 		}
 		bufs = pipe->nrbufs;
-		if (bufs < PIPE_BUFFERS) {
-			int newbuf = (pipe->curbuf + bufs) & (PIPE_BUFFERS-1);
+		if (bufs < pipe->buffers) {
+			int newbuf = (pipe->curbuf + bufs) & (pipe->buffers-1);
 			struct pipe_buffer *buf = pipe->bufs + newbuf;
 			struct page *page = pipe->tmp_page;
 			char *src;
@@ -580,7 +599,7 @@ redo2:
 			if (!total_len)
 				break;
 		}
-		if (bufs < PIPE_BUFFERS)
+		if (bufs < pipe->buffers)
 			continue;
 		if (filp->f_flags & O_NONBLOCK) {
 			if (!ret)
@@ -640,7 +659,7 @@ static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 			nrbufs = pipe->nrbufs;
 			while (--nrbufs >= 0) {
 				count += pipe->bufs[buf].len;
-				buf = (buf+1) & (PIPE_BUFFERS-1);
+				buf = (buf+1) & (pipe->buffers - 1);
 			}
 			mutex_unlock(&inode->i_mutex);
 
@@ -671,7 +690,7 @@ pipe_poll(struct file *filp, poll_table *wait)
 	}
 
 	if (filp->f_mode & FMODE_WRITE) {
-		mask |= (nrbufs < PIPE_BUFFERS) ? POLLOUT | POLLWRNORM : 0;
+		mask |= (nrbufs < pipe->buffers) ? POLLOUT | POLLWRNORM : 0;
 		/*
 		 * Most Unices do not set POLLERR for FIFOs but on Linux they
 		 * behave exactly like pipes for poll().
@@ -877,25 +896,32 @@ struct pipe_inode_info * alloc_pipe_info(struct inode *inode)
 
 	pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL);
 	if (pipe) {
-		init_waitqueue_head(&pipe->wait);
-		pipe->r_counter = pipe->w_counter = 1;
-		pipe->inode = inode;
+		pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * PIPE_DEF_BUFFERS, GFP_KERNEL);
+		if (pipe->bufs) {
+			init_waitqueue_head(&pipe->wait);
+			pipe->r_counter = pipe->w_counter = 1;
+			pipe->inode = inode;
+			pipe->buffers = PIPE_DEF_BUFFERS;
+			return pipe;
+		}
+		kfree(pipe);
 	}
 
-	return pipe;
+	return NULL;
 }
 
 void __free_pipe_info(struct pipe_inode_info *pipe)
 {
 	int i;
 
-	for (i = 0; i < PIPE_BUFFERS; i++) {
+	for (i = 0; i < pipe->buffers; i++) {
 		struct pipe_buffer *buf = pipe->bufs + i;
 		if (buf->ops)
 			buf->ops->release(pipe, buf);
 	}
 	if (pipe->tmp_page)
 		__free_page(pipe->tmp_page);
+	kfree(pipe->bufs);
 	kfree(pipe);
 }
 
@@ -1094,6 +1120,126 @@ SYSCALL_DEFINE1(pipe, int __user *, fildes)
 }
 
 /*
+ * Allocate a new array of pipe buffers and copy the info over. Returns the
+ * pipe size if successful, or return -ERROR on error.
+ */
+static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages)
+{
+	struct pipe_buffer *bufs;
+
+	/*
+	 * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't
+	 * expect a lot of shrink+grow operations, just free and allocate
+	 * again like we would do for growing. If the pipe currently
+	 * contains more buffers than arg, then return busy.
+	 */
+	if (nr_pages < pipe->nrbufs)
+		return -EBUSY;
+
+	bufs = kcalloc(nr_pages, sizeof(struct pipe_buffer), GFP_KERNEL);
+	if (unlikely(!bufs))
+		return -ENOMEM;
+
+	/*
+	 * The pipe array wraps around, so just start the new one at zero
+	 * and adjust the indexes.
+	 */
+	if (pipe->nrbufs) {
+		unsigned int tail;
+		unsigned int head;
+
+		tail = pipe->curbuf + pipe->nrbufs;
+		if (tail < pipe->buffers)
+			tail = 0;
+		else
+			tail &= (pipe->buffers - 1);
+
+		head = pipe->nrbufs - tail;
+		if (head)
+			memcpy(bufs, pipe->bufs + pipe->curbuf, head * sizeof(struct pipe_buffer));
+		if (tail)
+			memcpy(bufs + head, pipe->bufs, tail * sizeof(struct pipe_buffer));
+	}
+
+	pipe->curbuf = 0;
+	kfree(pipe->bufs);
+	pipe->bufs = bufs;
+	pipe->buffers = nr_pages;
+	return nr_pages * PAGE_SIZE;
+}
+
+/*
+ * Currently we rely on the pipe array holding a power-of-2 number
+ * of pages.
+ */
+static inline unsigned int round_pipe_size(unsigned int size)
+{
+	unsigned long nr_pages;
+
+	nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	return roundup_pow_of_two(nr_pages) << PAGE_SHIFT;
+}
+
+/*
+ * This should work even if CONFIG_PROC_FS isn't set, as proc_dointvec_minmax
+ * will return an error.
+ */
+int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf,
+		 size_t *lenp, loff_t *ppos)
+{
+	int ret;
+
+	ret = proc_dointvec_minmax(table, write, buf, lenp, ppos);
+	if (ret < 0 || !write)
+		return ret;
+
+	pipe_max_size = round_pipe_size(pipe_max_size);
+	return ret;
+}
+
+long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	struct pipe_inode_info *pipe;
+	long ret;
+
+	pipe = file->f_path.dentry->d_inode->i_pipe;
+	if (!pipe)
+		return -EBADF;
+
+	mutex_lock(&pipe->inode->i_mutex);
+
+	switch (cmd) {
+	case F_SETPIPE_SZ: {
+		unsigned int size, nr_pages;
+
+		size = round_pipe_size(arg);
+		nr_pages = size >> PAGE_SHIFT;
+
+		ret = -EINVAL;
+		if (!nr_pages)
+			goto out;
+
+		if (!capable(CAP_SYS_RESOURCE) && size > pipe_max_size) {
+			ret = -EPERM;
+			goto out;
+		}
+		ret = pipe_set_size(pipe, nr_pages);
+		break;
+		}
+	case F_GETPIPE_SZ:
+		ret = pipe->buffers * PAGE_SIZE;
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+out:
+	mutex_unlock(&pipe->inode->i_mutex);
+	return ret;
+}
+
+/*
  * pipefs should _never_ be mounted by userland - too much of security hassle,
  * no real gain from having the whole whorehouse mounted. So we don't need
  * any operations on the root directory. However, we need a non-trivial
diff --git a/fs/pnode.c b/fs/pnode.c
index 8d5f392ec3d3..5cc564a83149 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -86,7 +86,7 @@ static int do_make_slave(struct vfsmount *mnt)
 
 	/*
 	 * slave 'mnt' to a peer mount that has the
-	 * same root dentry. If none is available than
+	 * same root dentry. If none is available then
 	 * slave it to anything that is available.
 	 */
 	while ((peer_mnt = next_peer(peer_mnt)) != mnt &&
@@ -147,6 +147,11 @@ void change_mnt_propagation(struct vfsmount *mnt, int type)
  * get the next mount in the propagation tree.
  * @m: the mount seen last
  * @origin: the original mount from where the tree walk initiated
+ *
+ * Note that peer groups form contiguous segments of slave lists.
+ * We rely on that in get_source() to be able to find out if
+ * vfsmount found while iterating with propagation_next() is
+ * a peer of one we'd found earlier.
  */
 static struct vfsmount *propagation_next(struct vfsmount *m,
 					 struct vfsmount *origin)
@@ -186,10 +191,6 @@ static struct vfsmount *get_source(struct vfsmount *dest,
 {
 	struct vfsmount *p_last_src = NULL;
 	struct vfsmount *p_last_dest = NULL;
-	*type = CL_PROPAGATION;
-
-	if (IS_MNT_SHARED(dest))
-		*type |= CL_MAKE_SHARED;
 
 	while (last_dest != dest->mnt_master) {
 		p_last_dest = last_dest;
@@ -202,13 +203,18 @@ static struct vfsmount *get_source(struct vfsmount *dest,
 		do {
 			p_last_dest = next_peer(p_last_dest);
 		} while (IS_MNT_NEW(p_last_dest));
+		/* is that a peer of the earlier? */
+		if (dest == p_last_dest) {
+			*type = CL_MAKE_SHARED;
+			return p_last_src;
+		}
 	}
-
-	if (dest != p_last_dest) {
-		*type |= CL_SLAVE;
-		return last_src;
-	} else
-		return p_last_src;
+	/* slave of the earlier, then */
+	*type = CL_SLAVE;
+	/* beginning of peer group among the slaves? */
+	if (IS_MNT_SHARED(dest))
+		*type |= CL_MAKE_SHARED;
+	return last_src;
 }
 
 /*
diff --git a/fs/pnode.h b/fs/pnode.h
index 958665d662af..1ea4ae1efcd3 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -21,12 +21,11 @@
 #define CL_SLAVE     		0x02
 #define CL_COPY_ALL 		0x04
 #define CL_MAKE_SHARED 		0x08
-#define CL_PROPAGATION 		0x10
-#define CL_PRIVATE 		0x20
+#define CL_PRIVATE 		0x10
 
 static inline void set_mnt_shared(struct vfsmount *mnt)
 {
-	mnt->mnt_flags &= ~MNT_PNODE_MASK;
+	mnt->mnt_flags &= ~MNT_SHARED_MASK;
 	mnt->mnt_flags |= MNT_SHARED;
 }
 
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 13b5d0708175..fff6572676ae 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -68,7 +68,6 @@
 #include <linux/hugetlb.h>
 #include <linux/pagemap.h>
 #include <linux/swap.h>
-#include <linux/slab.h>
 #include <linux/smp.h>
 #include <linux/signal.h>
 #include <linux/highmem.h>
@@ -82,7 +81,6 @@
 #include <linux/pid_namespace.h>
 #include <linux/ptrace.h>
 #include <linux/tracehook.h>
-#include <linux/swapops.h>
 
 #include <asm/pgtable.h>
 #include <asm/processor.h>
@@ -178,7 +176,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
 		if (tracer)
 			tpid = task_pid_nr_ns(tracer, ns);
 	}
-	cred = get_cred((struct cred *) __task_cred(p));
+	cred = get_task_cred(p);
 	seq_printf(m,
 		"State:\t%s\n"
 		"Tgid:\t%d\n"
@@ -269,9 +267,11 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p)
 		shpending = p->signal->shared_pending.signal;
 		blocked = p->blocked;
 		collect_sigign_sigcatch(p, &ignored, &caught);
-		num_threads = atomic_read(&p->signal->count);
+		num_threads = get_nr_threads(p);
+		rcu_read_lock();  /* FIXME: is this correct? */
 		qsize = atomic_read(&__task_cred(p)->user->sigpending);
-		qlim = p->signal->rlim[RLIMIT_SIGPENDING].rlim_cur;
+		rcu_read_unlock();
+		qlim = task_rlimit(p, RLIMIT_SIGPENDING);
 		unlock_task_sighand(p, &flags);
 	}
 
@@ -410,7 +410,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 			tty_nr = new_encode_dev(tty_devnum(sig->tty));
 		}
 
-		num_threads = atomic_read(&sig->count);
+		num_threads = get_nr_threads(task);
 		collect_sigign_sigcatch(task, &sigign, &sigcatch);
 
 		cmin_flt = sig->cmin_flt;
@@ -418,7 +418,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 		cutime = sig->cutime;
 		cstime = sig->cstime;
 		cgtime = sig->cgtime;
-		rsslim = sig->rlim[RLIMIT_RSS].rlim_cur;
+		rsslim = ACCESS_ONCE(sig->rlim[RLIMIT_RSS].rlim_cur);
 
 		/* add up live thread stats at the group level */
 		if (whole) {
@@ -494,7 +494,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 		rsslim,
 		mm ? mm->start_code : 0,
 		mm ? mm->end_code : 0,
-		(permitted && mm) ? task->stack_start : 0,
+		(permitted && mm) ? mm->start_stack : 0,
 		esp,
 		eip,
 		/* The signal information here is obsolete.
diff --git a/fs/proc/base.c b/fs/proc/base.c
index e42bbd843ed1..acb7ef80ea4f 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -81,6 +81,7 @@
 #include <linux/elf.h>
 #include <linux/pid_namespace.h>
 #include <linux/fs_struct.h>
+#include <linux/slab.h>
 #include "internal.h"
 
 /* NOTE:
@@ -165,18 +166,6 @@ static int get_fs_path(struct task_struct *task, struct path *path, bool root)
 	return result;
 }
 
-static int get_nr_threads(struct task_struct *tsk)
-{
-	unsigned long flags;
-	int count = 0;
-
-	if (lock_task_sighand(tsk, &flags)) {
-		count = atomic_read(&tsk->signal->count);
-		unlock_task_sighand(tsk, &flags);
-	}
-	return count;
-}
-
 static int proc_cwd_link(struct inode *inode, struct path *path)
 {
 	struct task_struct *task = get_proc_task(inode);
@@ -442,12 +431,13 @@ static const struct file_operations proc_lstats_operations = {
 unsigned long badness(struct task_struct *p, unsigned long uptime);
 static int proc_oom_score(struct task_struct *task, char *buffer)
 {
-	unsigned long points;
+	unsigned long points = 0;
 	struct timespec uptime;
 
 	do_posix_clock_monotonic_gettime(&uptime);
 	read_lock(&tasklist_lock);
-	points = badness(task->group_leader, uptime.tv_sec);
+	if (pid_alive(task))
+		points = badness(task, uptime.tv_sec);
 	read_unlock(&tasklist_lock);
 	return sprintf(buffer, "%lu\n", points);
 }
@@ -647,17 +637,11 @@ static int mounts_release(struct inode *inode, struct file *file)
 static unsigned mounts_poll(struct file *file, poll_table *wait)
 {
 	struct proc_mounts *p = file->private_data;
-	struct mnt_namespace *ns = p->ns;
 	unsigned res = POLLIN | POLLRDNORM;
 
-	poll_wait(file, &ns->poll, wait);
-
-	spin_lock(&vfsmount_lock);
-	if (p->event != ns->event) {
-		p->event = ns->event;
+	poll_wait(file, &p->ns->poll, wait);
+	if (mnt_had_events(p))
 		res |= POLLERR | POLLPRI;
-	}
-	spin_unlock(&vfsmount_lock);
 
 	return res;
 }
@@ -734,6 +718,7 @@ out_no_task:
 
 static const struct file_operations proc_info_file_operations = {
 	.read		= proc_info_read,
+	.llseek		= generic_file_llseek,
 };
 
 static int proc_single_show(struct seq_file *m, void *v)
@@ -991,6 +976,7 @@ out_no_task:
 
 static const struct file_operations proc_environ_operations = {
 	.read		= environ_read,
+	.llseek		= generic_file_llseek,
 };
 
 static ssize_t oom_adjust_read(struct file *file, char __user *buf,
@@ -1064,6 +1050,7 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
 static const struct file_operations proc_oom_adjust_operations = {
 	.read		= oom_adjust_read,
 	.write		= oom_adjust_write,
+	.llseek		= generic_file_llseek,
 };
 
 #ifdef CONFIG_AUDITSYSCALL
@@ -1095,8 +1082,12 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
 	if (!capable(CAP_AUDIT_CONTROL))
 		return -EPERM;
 
-	if (current != pid_task(proc_pid(inode), PIDTYPE_PID))
+	rcu_read_lock();
+	if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) {
+		rcu_read_unlock();
 		return -EPERM;
+	}
+	rcu_read_unlock();
 
 	if (count >= PAGE_SIZE)
 		count = PAGE_SIZE - 1;
@@ -1131,6 +1122,7 @@ out_free_page:
 static const struct file_operations proc_loginuid_operations = {
 	.read		= proc_loginuid_read,
 	.write		= proc_loginuid_write,
+	.llseek		= generic_file_llseek,
 };
 
 static ssize_t proc_sessionid_read(struct file * file, char __user * buf,
@@ -1151,6 +1143,7 @@ static ssize_t proc_sessionid_read(struct file * file, char __user * buf,
 
 static const struct file_operations proc_sessionid_operations = {
 	.read		= proc_sessionid_read,
+	.llseek		= generic_file_llseek,
 };
 #endif
 
@@ -1202,6 +1195,7 @@ static ssize_t proc_fault_inject_write(struct file * file,
 static const struct file_operations proc_fault_inject_operations = {
 	.read		= proc_fault_inject_read,
 	.write		= proc_fault_inject_write,
+	.llseek		= generic_file_llseek,
 };
 #endif
 
@@ -1943,7 +1937,7 @@ static ssize_t proc_fdinfo_read(struct file *file, char __user *buf,
 }
 
 static const struct file_operations proc_fdinfo_file_operations = {
-	.open		= nonseekable_open,
+	.open           = nonseekable_open,
 	.read		= proc_fdinfo_read,
 };
 
@@ -2227,6 +2221,7 @@ out_no_task:
 static const struct file_operations proc_pid_attr_operations = {
 	.read		= proc_pid_attr_read,
 	.write		= proc_pid_attr_write,
+	.llseek		= generic_file_llseek,
 };
 
 static const struct pid_entry attr_dir_stuff[] = {
@@ -2347,6 +2342,7 @@ static ssize_t proc_coredump_filter_write(struct file *file,
 static const struct file_operations proc_coredump_filter_operations = {
 	.read		= proc_coredump_filter_read,
 	.write		= proc_coredump_filter_write,
+	.llseek		= generic_file_llseek,
 };
 #endif
 
@@ -2369,16 +2365,30 @@ static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd)
 {
 	struct pid_namespace *ns = dentry->d_sb->s_fs_info;
 	pid_t tgid = task_tgid_nr_ns(current, ns);
-	char tmp[PROC_NUMBUF];
-	if (!tgid)
-		return ERR_PTR(-ENOENT);
-	sprintf(tmp, "%d", task_tgid_nr_ns(current, ns));
-	return ERR_PTR(vfs_follow_link(nd,tmp));
+	char *name = ERR_PTR(-ENOENT);
+	if (tgid) {
+		name = __getname();
+		if (!name)
+			name = ERR_PTR(-ENOMEM);
+		else
+			sprintf(name, "%d", tgid);
+	}
+	nd_set_link(nd, name);
+	return NULL;
+}
+
+static void proc_self_put_link(struct dentry *dentry, struct nameidata *nd,
+				void *cookie)
+{
+	char *s = nd_get_link(nd);
+	if (!IS_ERR(s))
+		__putname(s);
 }
 
 static const struct inode_operations proc_self_inode_operations = {
 	.readlink	= proc_self_readlink,
 	.follow_link	= proc_self_follow_link,
+	.put_link	= proc_self_put_link,
 };
 
 /*
@@ -2422,7 +2432,7 @@ static struct dentry *proc_base_instantiate(struct inode *dir,
 	const struct pid_entry *p = ptr;
 	struct inode *inode;
 	struct proc_inode *ei;
-	struct dentry *error = ERR_PTR(-EINVAL);
+	struct dentry *error;
 
 	/* Allocate the inode */
 	error = ERR_PTR(-ENOMEM);
@@ -2772,7 +2782,7 @@ out:
 
 struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
 {
-	struct dentry *result = ERR_PTR(-ENOENT);
+	struct dentry *result;
 	struct task_struct *task;
 	unsigned tgid;
 	struct pid_namespace *ns;
@@ -2895,7 +2905,7 @@ out_no_task:
  */
 static const struct pid_entry tid_base_stuff[] = {
 	DIR("fd",        S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
-	DIR("fdinfo",    S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fd_operations),
+	DIR("fdinfo",    S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
 	REG("environ",   S_IRUSR, proc_environ_operations),
 	INF("auxv",      S_IRUSR, proc_pid_auxv),
 	ONE("status",    S_IRUGO, proc_pid_status),
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 480cb1065eec..2791907744ed 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -13,6 +13,7 @@
 #include <linux/proc_fs.h>
 #include <linux/stat.h>
 #include <linux/module.h>
+#include <linux/slab.h>
 #include <linux/mount.h>
 #include <linux/init.h>
 #include <linux/idr.h>
@@ -291,19 +292,17 @@ static const struct inode_operations proc_file_inode_operations = {
  * returns the struct proc_dir_entry for "/proc/tty/driver", and
  * returns "serial" in residual.
  */
-static int xlate_proc_name(const char *name,
-			   struct proc_dir_entry **ret, const char **residual)
+static int __xlate_proc_name(const char *name, struct proc_dir_entry **ret,
+			     const char **residual)
 {
 	const char     		*cp = name, *next;
 	struct proc_dir_entry	*de;
 	int			len;
-	int 			rtn = 0;
 
 	de = *ret;
 	if (!de)
 		de = &proc_root;
 
-	spin_lock(&proc_subdir_lock);
 	while (1) {
 		next = strchr(cp, '/');
 		if (!next)
@@ -315,16 +314,25 @@ static int xlate_proc_name(const char *name,
 				break;
 		}
 		if (!de) {
-			rtn = -ENOENT;
-			goto out;
+			WARN(1, "name '%s'\n", name);
+			return -ENOENT;
 		}
 		cp += len + 1;
 	}
 	*residual = cp;
 	*ret = de;
-out:
+	return 0;
+}
+
+static int xlate_proc_name(const char *name, struct proc_dir_entry **ret,
+			   const char **residual)
+{
+	int rv;
+
+	spin_lock(&proc_subdir_lock);
+	rv = __xlate_proc_name(name, ret, residual);
 	spin_unlock(&proc_subdir_lock);
-	return rtn;
+	return rv;
 }
 
 static DEFINE_IDA(proc_inum_ida);
@@ -335,21 +343,6 @@ static DEFINE_SPINLOCK(proc_inum_lock); /* protects the above */
 /*
  * Return an inode number between PROC_DYNAMIC_FIRST and
  * 0xffffffff, or zero on failure.
- *
- * Current inode allocations in the proc-fs (hex-numbers):
- *
- * 00000000		reserved
- * 00000001-00000fff	static entries	(goners)
- *      001		root-ino
- *
- * 00001000-00001fff	unused
- * 0001xxxx-7fffxxxx	pid-dir entries for pid 1-7fff
- * 80000000-efffffff	unused
- * f0000000-ffffffff	dynamic entries
- *
- * Goal:
- *	Once we split the thing into several virtual filesystems,
- *	we will get rid of magical ranges (and this comment, BTW).
  */
 static unsigned int get_inode_number(void)
 {
@@ -662,6 +655,7 @@ struct proc_dir_entry *proc_symlink(const char *name,
 	}
 	return ent;
 }
+EXPORT_SYMBOL(proc_symlink);
 
 struct proc_dir_entry *proc_mkdir_mode(const char *name, mode_t mode,
 		struct proc_dir_entry *parent)
@@ -700,6 +694,7 @@ struct proc_dir_entry *proc_mkdir(const char *name,
 {
 	return proc_mkdir_mode(name, S_IRUGO | S_IXUGO, parent);
 }
+EXPORT_SYMBOL(proc_mkdir);
 
 struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode,
 					 struct proc_dir_entry *parent)
@@ -728,6 +723,7 @@ struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode,
 	}
 	return ent;
 }
+EXPORT_SYMBOL(create_proc_entry);
 
 struct proc_dir_entry *proc_create_data(const char *name, mode_t mode,
 					struct proc_dir_entry *parent,
@@ -762,6 +758,7 @@ out_free:
 out:
 	return NULL;
 }
+EXPORT_SYMBOL(proc_create_data);
 
 static void free_proc_entry(struct proc_dir_entry *de)
 {
@@ -793,11 +790,13 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
 	const char *fn = name;
 	int len;
 
-	if (xlate_proc_name(name, &parent, &fn) != 0)
+	spin_lock(&proc_subdir_lock);
+	if (__xlate_proc_name(name, &parent, &fn) != 0) {
+		spin_unlock(&proc_subdir_lock);
 		return;
+	}
 	len = strlen(fn);
 
-	spin_lock(&proc_subdir_lock);
 	for (p = &parent->subdir; *p; p=&(*p)->next ) {
 		if (proc_match(len, fn, *p)) {
 			de = *p;
@@ -807,8 +806,10 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
 		}
 	}
 	spin_unlock(&proc_subdir_lock);
-	if (!de)
+	if (!de) {
+		WARN(1, "name '%s'\n", name);
 		return;
+	}
 
 	spin_lock(&de->pde_unload_lock);
 	/*
@@ -853,3 +854,4 @@ continue_removing:
 			de->parent->name, de->name, de->subdir->name);
 	pde_put(de);
 }
+EXPORT_SYMBOL(remove_proc_entry);
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 445a02bcaab3..aea8502e58a3 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -18,6 +18,7 @@
 #include <linux/module.h>
 #include <linux/smp_lock.h>
 #include <linux/sysctl.h>
+#include <linux/slab.h>
 
 #include <asm/system.h>
 #include <asm/uaccess.h>
@@ -231,9 +232,9 @@ static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigne
 		if (rv == -ENOIOCTLCMD)
 			rv = -EINVAL;
 	} else if (ioctl) {
-		lock_kernel();
+		WARN_ONCE(1, "Procfs ioctl handlers must use unlocked_ioctl, "
+			  "%pf will be called without the Bkl held\n", ioctl);
 		rv = ioctl(file->f_path.dentry->d_inode, file, cmd, arg);
-		unlock_kernel();
 	}
 
 	pde_users_dec(pde);
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index a44a7897fd4d..6f37c391468d 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -19,6 +19,7 @@
 #include <linux/highmem.h>
 #include <linux/bootmem.h>
 #include <linux/init.h>
+#include <linux/slab.h>
 #include <asm/uaccess.h>
 #include <asm/io.h>
 #include <linux/list.h>
@@ -490,7 +491,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
 		}
 		read_unlock(&kclist_lock);
 
-		if (m == NULL) {
+		if (&m->list == &kclist_head) {
 			if (clear_user(buffer, tsz))
 				return -EFAULT;
 		} else if (is_vmalloc_or_module_addr((void *)start)) {
@@ -557,6 +558,7 @@ static int open_kcore(struct inode *inode, struct file *filp)
 static const struct file_operations proc_kcore_operations = {
 	.read		= read_kcore,
 	.open		= open_kcore,
+	.llseek		= generic_file_llseek,
 };
 
 #ifdef CONFIG_MEMORY_HOTPLUG
@@ -586,7 +588,7 @@ static struct kcore_list kcore_text;
  */
 static void __init proc_kcore_text_init(void)
 {
-	kclist_add(&kcore_text, _stext, _end - _stext, KCORE_TEXT);
+	kclist_add(&kcore_text, _text, _end - _text, KCORE_TEXT);
 }
 #else
 static void __init proc_kcore_text_init(void)
diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c
index 7ca78346d3f0..bd4b5a740ff1 100644
--- a/fs/proc/kmsg.c
+++ b/fs/proc/kmsg.c
@@ -12,37 +12,37 @@
 #include <linux/poll.h>
 #include <linux/proc_fs.h>
 #include <linux/fs.h>
+#include <linux/syslog.h>
 
 #include <asm/uaccess.h>
 #include <asm/io.h>
 
 extern wait_queue_head_t log_wait;
 
-extern int do_syslog(int type, char __user *bug, int count);
-
 static int kmsg_open(struct inode * inode, struct file * file)
 {
-	return do_syslog(1,NULL,0);
+	return do_syslog(SYSLOG_ACTION_OPEN, NULL, 0, SYSLOG_FROM_FILE);
 }
 
 static int kmsg_release(struct inode * inode, struct file * file)
 {
-	(void) do_syslog(0,NULL,0);
+	(void) do_syslog(SYSLOG_ACTION_CLOSE, NULL, 0, SYSLOG_FROM_FILE);
 	return 0;
 }
 
 static ssize_t kmsg_read(struct file *file, char __user *buf,
 			 size_t count, loff_t *ppos)
 {
-	if ((file->f_flags & O_NONBLOCK) && !do_syslog(9, NULL, 0))
+	if ((file->f_flags & O_NONBLOCK) &&
+	    !do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_FILE))
 		return -EAGAIN;
-	return do_syslog(2, buf, count);
+	return do_syslog(SYSLOG_ACTION_READ, buf, count, SYSLOG_FROM_FILE);
 }
 
 static unsigned int kmsg_poll(struct file *file, poll_table *wait)
 {
 	poll_wait(file, &log_wait, wait);
-	if (do_syslog(9, NULL, 0))
+	if (do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_FILE))
 		return POLLIN | POLLRDNORM;
 	return 0;
 }
@@ -53,6 +53,7 @@ static const struct file_operations proc_kmsg_operations = {
 	.poll		= kmsg_poll,
 	.open		= kmsg_open,
 	.release	= kmsg_release,
+	.llseek		= generic_file_llseek,
 };
 
 static int __init proc_kmsg_init(void)
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index 9fe7d7ebe115..b1822dde55c2 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -21,7 +21,6 @@
 #include <linux/mmzone.h>
 #include <linux/pagemap.h>
 #include <linux/swap.h>
-#include <linux/slab.h>
 #include <linux/smp.h>
 #include <linux/seq_file.h>
 #include <linux/hugetlb.h>
diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c
index 123257bb356b..d9396a4fc7ff 100644
--- a/fs/proc/proc_devtree.c
+++ b/fs/proc/proc_devtree.c
@@ -10,16 +10,20 @@
 #include <linux/seq_file.h>
 #include <linux/stat.h>
 #include <linux/string.h>
+#include <linux/of.h>
+#include <linux/module.h>
+#include <linux/slab.h>
 #include <asm/prom.h>
 #include <asm/uaccess.h>
 #include "internal.h"
 
-#ifndef HAVE_ARCH_DEVTREE_FIXUPS
 static inline void set_node_proc_entry(struct device_node *np,
 				       struct proc_dir_entry *de)
 {
-}
+#ifdef HAVE_ARCH_DEVTREE_FIXUPS
+	np->pde = de;
 #endif
+}
 
 static struct proc_dir_entry *proc_device_tree;
 
@@ -205,6 +209,9 @@ void proc_device_tree_add_node(struct device_node *np,
 	for (pp = np->properties; pp != NULL; pp = pp->next) {
 		p = pp->name;
 
+		if (strchr(p, '/'))
+			continue;
+
 		if (duplicate_name(de, p))
 			p = fixup_name(np, de, p);
 
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 04d1270f1c38..9020ac15baaa 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -14,6 +14,7 @@
 #include <linux/time.h>
 #include <linux/proc_fs.h>
 #include <linux/stat.h>
+#include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/sched.h>
 #include <linux/module.h>
diff --git a/fs/proc/root.c b/fs/proc/root.c
index b080b791d9e3..4258384ed22d 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -110,7 +110,6 @@ void __init proc_root_init(void)
 	if (err)
 		return;
 	proc_mnt = kern_mount_data(&proc_fs_type, &init_pid_ns);
-	err = PTR_ERR(proc_mnt);
 	if (IS_ERR(proc_mnt)) {
 		unregister_filesystem(&proc_fs_type);
 		return;
@@ -220,9 +219,3 @@ void pid_ns_release_proc(struct pid_namespace *ns)
 {
 	mntput(ns->proc_mnt);
 }
-
-EXPORT_SYMBOL(proc_symlink);
-EXPORT_SYMBOL(proc_mkdir);
-EXPORT_SYMBOL(create_proc_entry);
-EXPORT_SYMBOL(proc_create_data);
-EXPORT_SYMBOL(remove_proc_entry);
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index b9b7aad2003d..bf31b03fc275 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -1,6 +1,5 @@
 #include <linux/cpumask.h>
 #include <linux/fs.h>
-#include <linux/gfp.h>
 #include <linux/init.h>
 #include <linux/interrupt.h>
 #include <linux/kernel_stat.h>
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index f277c4a111cb..aea1d3f1ffb5 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -4,6 +4,7 @@
 #include <linux/seq_file.h>
 #include <linux/highmem.h>
 #include <linux/ptrace.h>
+#include <linux/slab.h>
 #include <linux/pagemap.h>
 #include <linux/mempolicy.h>
 #include <linux/swap.h>
@@ -16,7 +17,7 @@
 
 void task_mem(struct seq_file *m, struct mm_struct *mm)
 {
-	unsigned long data, text, lib;
+	unsigned long data, text, lib, swap;
 	unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
 
 	/*
@@ -36,6 +37,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
 	data = mm->total_vm - mm->shared_vm - mm->stack_vm;
 	text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
 	lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
+	swap = get_mm_counter(mm, MM_SWAPENTS);
 	seq_printf(m,
 		"VmPeak:\t%8lu kB\n"
 		"VmSize:\t%8lu kB\n"
@@ -46,7 +48,8 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
 		"VmStk:\t%8lu kB\n"
 		"VmExe:\t%8lu kB\n"
 		"VmLib:\t%8lu kB\n"
-		"VmPTE:\t%8lu kB\n",
+		"VmPTE:\t%8lu kB\n"
+		"VmSwap:\t%8lu kB\n",
 		hiwater_vm << (PAGE_SHIFT-10),
 		(total_vm - mm->reserved_vm) << (PAGE_SHIFT-10),
 		mm->locked_vm << (PAGE_SHIFT-10),
@@ -54,7 +57,8 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
 		total_rss << (PAGE_SHIFT-10),
 		data << (PAGE_SHIFT-10),
 		mm->stack_vm << (PAGE_SHIFT-10), text, lib,
-		(PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10);
+		(PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10,
+		swap << (PAGE_SHIFT-10));
 }
 
 unsigned long task_vsize(struct mm_struct *mm)
@@ -65,11 +69,11 @@ unsigned long task_vsize(struct mm_struct *mm)
 int task_statm(struct mm_struct *mm, int *shared, int *text,
 	       int *data, int *resident)
 {
-	*shared = get_mm_counter(mm, file_rss);
+	*shared = get_mm_counter(mm, MM_FILEPAGES);
 	*text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
 								>> PAGE_SHIFT;
 	*data = mm->total_vm - mm->shared_vm;
-	*resident = *shared + get_mm_counter(mm, anon_rss);
+	*resident = *shared + get_mm_counter(mm, MM_ANONPAGES);
 	return mm->total_vm;
 }
 
@@ -243,25 +247,6 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
 				} else if (vma->vm_start <= mm->start_stack &&
 					   vma->vm_end >= mm->start_stack) {
 					name = "[stack]";
-				} else {
-					unsigned long stack_start;
-					struct proc_maps_private *pmp;
-
-					pmp = m->private;
-					stack_start = pmp->task->stack_start;
-
-					if (vma->vm_start <= stack_start &&
-					    vma->vm_end >= stack_start) {
-						pad_len_spaces(m, len);
-						seq_printf(m,
-						 "[threadstack:%08lx]",
-#ifdef CONFIG_STACK_GROWSUP
-						 vma->vm_end - stack_start
-#else
-						 stack_start - vma->vm_start
-#endif
-						);
-					}
 				}
 			} else {
 				name = "[vdso]";
@@ -403,6 +388,7 @@ static int show_smap(struct seq_file *m, void *v)
 
 	memset(&mss, 0, sizeof mss);
 	mss.vma = vma;
+	/* mmap_sem is held in m_start */
 	if (vma->vm_mm && !is_vm_hugetlb_page(vma))
 		walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk);
 
@@ -549,7 +535,8 @@ const struct file_operations proc_clear_refs_operations = {
 };
 
 struct pagemapread {
-	u64 __user *out, *end;
+	int pos, len;
+	u64 *buffer;
 };
 
 #define PM_ENTRY_BYTES      sizeof(u64)
@@ -572,10 +559,8 @@ struct pagemapread {
 static int add_to_pagemap(unsigned long addr, u64 pfn,
 			  struct pagemapread *pm)
 {
-	if (put_user(pfn, pm->out))
-		return -EFAULT;
-	pm->out++;
-	if (pm->out >= pm->end)
+	pm->buffer[pm->pos++] = pfn;
+	if (pm->pos >= pm->len)
 		return PM_END_OF_BUFFER;
 	return 0;
 }
@@ -649,6 +634,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 	return err;
 }
 
+#ifdef CONFIG_HUGETLB_PAGE
 static u64 huge_pte_to_pagemap_entry(pte_t pte, int offset)
 {
 	u64 pme = 0;
@@ -658,31 +644,18 @@ static u64 huge_pte_to_pagemap_entry(pte_t pte, int offset)
 	return pme;
 }
 
-static int pagemap_hugetlb_range(pte_t *pte, unsigned long addr,
-				 unsigned long end, struct mm_walk *walk)
+/* This function walks within one hugetlb entry in the single call */
+static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
+				 unsigned long addr, unsigned long end,
+				 struct mm_walk *walk)
 {
-	struct vm_area_struct *vma;
 	struct pagemapread *pm = walk->private;
-	struct hstate *hs = NULL;
 	int err = 0;
+	u64 pfn;
 
-	vma = find_vma(walk->mm, addr);
-	if (vma)
-		hs = hstate_vma(vma);
 	for (; addr != end; addr += PAGE_SIZE) {
-		u64 pfn = PM_NOT_PRESENT;
-
-		if (vma && (addr >= vma->vm_end)) {
-			vma = find_vma(walk->mm, addr);
-			if (vma)
-				hs = hstate_vma(vma);
-		}
-
-		if (vma && (vma->vm_start <= addr) && is_vm_hugetlb_page(vma)) {
-			/* calculate pfn of the "raw" page in the hugepage. */
-			int offset = (addr & ~huge_page_mask(hs)) >> PAGE_SHIFT;
-			pfn = huge_pte_to_pagemap_entry(*pte, offset);
-		}
+		int offset = (addr & ~hmask) >> PAGE_SHIFT;
+		pfn = huge_pte_to_pagemap_entry(*pte, offset);
 		err = add_to_pagemap(addr, pfn, pm);
 		if (err)
 			return err;
@@ -692,6 +665,7 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long addr,
 
 	return err;
 }
+#endif /* HUGETLB_PAGE */
 
 /*
  * /proc/pid/pagemap - an array mapping virtual pages to pfns
@@ -717,21 +691,20 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long addr,
  * determine which areas of memory are actually mapped and llseek to
  * skip over unmapped regions.
  */
+#define PAGEMAP_WALK_SIZE	(PMD_SIZE)
 static ssize_t pagemap_read(struct file *file, char __user *buf,
 			    size_t count, loff_t *ppos)
 {
 	struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
-	struct page **pages, *page;
-	unsigned long uaddr, uend;
 	struct mm_struct *mm;
 	struct pagemapread pm;
-	int pagecount;
 	int ret = -ESRCH;
 	struct mm_walk pagemap_walk = {};
 	unsigned long src;
 	unsigned long svpfn;
 	unsigned long start_vaddr;
 	unsigned long end_vaddr;
+	int copied = 0;
 
 	if (!task)
 		goto out;
@@ -754,38 +727,17 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
 	if (!mm)
 		goto out_task;
 
-
-	uaddr = (unsigned long)buf & PAGE_MASK;
-	uend = (unsigned long)(buf + count);
-	pagecount = (PAGE_ALIGN(uend) - uaddr) / PAGE_SIZE;
-	ret = 0;
-	if (pagecount == 0)
-		goto out_mm;
-	pages = kcalloc(pagecount, sizeof(struct page *), GFP_KERNEL);
+	pm.len = PM_ENTRY_BYTES * (PAGEMAP_WALK_SIZE >> PAGE_SHIFT);
+	pm.buffer = kmalloc(pm.len, GFP_TEMPORARY);
 	ret = -ENOMEM;
-	if (!pages)
+	if (!pm.buffer)
 		goto out_mm;
 
-	down_read(&current->mm->mmap_sem);
-	ret = get_user_pages(current, current->mm, uaddr, pagecount,
-			     1, 0, pages, NULL);
-	up_read(&current->mm->mmap_sem);
-
-	if (ret < 0)
-		goto out_free;
-
-	if (ret != pagecount) {
-		pagecount = ret;
-		ret = -EFAULT;
-		goto out_pages;
-	}
-
-	pm.out = (u64 __user *)buf;
-	pm.end = (u64 __user *)(buf + count);
-
 	pagemap_walk.pmd_entry = pagemap_pte_range;
 	pagemap_walk.pte_hole = pagemap_pte_hole;
+#ifdef CONFIG_HUGETLB_PAGE
 	pagemap_walk.hugetlb_entry = pagemap_hugetlb_range;
+#endif
 	pagemap_walk.mm = mm;
 	pagemap_walk.private = &pm;
 
@@ -804,23 +756,36 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
 	 * user buffer is tracked in "pm", and the walk
 	 * will stop when we hit the end of the buffer.
 	 */
-	ret = walk_page_range(start_vaddr, end_vaddr, &pagemap_walk);
-	if (ret == PM_END_OF_BUFFER)
-		ret = 0;
-	/* don't need mmap_sem for these, but this looks cleaner */
-	*ppos += (char __user *)pm.out - buf;
-	if (!ret)
-		ret = (char __user *)pm.out - buf;
-
-out_pages:
-	for (; pagecount; pagecount--) {
-		page = pages[pagecount-1];
-		if (!PageReserved(page))
-			SetPageDirty(page);
-		page_cache_release(page);
+	ret = 0;
+	while (count && (start_vaddr < end_vaddr)) {
+		int len;
+		unsigned long end;
+
+		pm.pos = 0;
+		end = start_vaddr + PAGEMAP_WALK_SIZE;
+		/* overflow ? */
+		if (end < start_vaddr || end > end_vaddr)
+			end = end_vaddr;
+		down_read(&mm->mmap_sem);
+		ret = walk_page_range(start_vaddr, end, &pagemap_walk);
+		up_read(&mm->mmap_sem);
+		start_vaddr = end;
+
+		len = min(count, PM_ENTRY_BYTES * pm.pos);
+		if (copy_to_user(buf, pm.buffer, len)) {
+			ret = -EFAULT;
+			goto out_free;
+		}
+		copied += len;
+		buf += len;
+		count -= len;
 	}
+	*ppos += copied;
+	if (!ret || ret == PM_END_OF_BUFFER)
+		ret = copied;
+
 out_free:
-	kfree(pages);
+	kfree(pm.buffer);
 out_mm:
 	mmput(mm);
 out_task:
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 5d9fd64ef81a..cb6306e63843 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -5,6 +5,7 @@
 #include <linux/fs_struct.h>
 #include <linux/mount.h>
 #include <linux/ptrace.h>
+#include <linux/slab.h>
 #include <linux/seq_file.h>
 #include "internal.h"
 
@@ -121,11 +122,20 @@ int task_statm(struct mm_struct *mm, int *shared, int *text,
 	return size;
 }
 
+static void pad_len_spaces(struct seq_file *m, int len)
+{
+	len = 25 + sizeof(void*) * 6 - len;
+	if (len < 1)
+		len = 1;
+	seq_printf(m, "%*c", len, ' ');
+}
+
 /*
  * display a single VMA to a sequenced file
  */
 static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
 {
+	struct mm_struct *mm = vma->vm_mm;
 	unsigned long ino = 0;
 	struct file *file;
 	dev_t dev = 0;
@@ -154,11 +164,14 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
 		   MAJOR(dev), MINOR(dev), ino, &len);
 
 	if (file) {
-		len = 25 + sizeof(void *) * 6 - len;
-		if (len < 1)
-			len = 1;
-		seq_printf(m, "%*c", len, ' ');
+		pad_len_spaces(m, len);
 		seq_path(m, &file->f_path, "");
+	} else if (mm) {
+		if (vma->vm_start <= mm->start_stack &&
+			vma->vm_end >= mm->start_stack) {
+			pad_len_spaces(m, len);
+			seq_puts(m, "[stack]");
+		}
 	}
 
 	seq_putc(m, '\n');
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 0872afa58d39..91c817ff02c3 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -12,6 +12,7 @@
 #include <linux/user.h>
 #include <linux/elf.h>
 #include <linux/elfcore.h>
+#include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/bootmem.h>
 #include <linux/init.h>
@@ -162,6 +163,7 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer,
 
 static const struct file_operations proc_vmcore_operations = {
 	.read		= read_vmcore,
+	.llseek		= generic_file_llseek,
 };
 
 static struct vmcore* __init get_new_element(void)
diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c
index 6f30c3d5bcbf..6e8fc62b40a8 100644
--- a/fs/qnx4/dir.c
+++ b/fs/qnx4/dir.c
@@ -77,9 +77,10 @@ out:
 
 const struct file_operations qnx4_dir_operations =
 {
+	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
 	.readdir	= qnx4_readdir,
-	.fsync		= simple_fsync,
+	.fsync		= generic_file_fsync,
 };
 
 const struct inode_operations qnx4_dir_inode_operations =
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index ebf3440d28ca..277575ddc05c 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -201,7 +201,8 @@ static const char *qnx4_checkroot(struct super_block *sb)
 				rootdir = (struct qnx4_inode_entry *) (bh->b_data + i * QNX4_DIR_ENTRY_SIZE);
 				if (rootdir->di_fname != NULL) {
 					QNX4DEBUG((KERN_INFO "rootdir entry found : [%s]\n", rootdir->di_fname));
-					if (!strncmp(rootdir->di_fname, QNX4_BMNAME, sizeof QNX4_BMNAME)) {
+					if (!strcmp(rootdir->di_fname,
+						    QNX4_BMNAME)) {
 						found = 1;
 						qnx4_sb(sb)->BitMap = kmalloc( sizeof( struct qnx4_inode_entry ), GFP_KERNEL );
 						if (!qnx4_sb(sb)->BitMap) {
diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig
index efc02ebb8c70..3e21b1e2ad3a 100644
--- a/fs/quota/Kconfig
+++ b/fs/quota/Kconfig
@@ -33,6 +33,14 @@ config PRINT_QUOTA_WARNING
 	  Note that this behavior is currently deprecated and may go away in
 	  future. Please use notification via netlink socket instead.
 
+config QUOTA_DEBUG
+	bool "Additional quota sanity checks"
+	depends on QUOTA
+	default n
+	help
+	  If you say Y here, quota subsystem will perform some additional
+	  sanity checks of quota internal structures. If unsure, say N.
+
 # Generic support for tree structured quota files. Selected when needed.
 config QUOTA_TREE
 	 tristate
@@ -59,3 +67,8 @@ config QUOTACTL
 	bool
 	depends on XFS_QUOTA || QUOTA
 	default y
+
+config QUOTACTL_COMPAT
+	bool
+	depends on QUOTACTL && COMPAT_FOR_U64_ALIGNMENT
+	default y
diff --git a/fs/quota/Makefile b/fs/quota/Makefile
index 68d4f6dc0578..5f9e9e276af0 100644
--- a/fs/quota/Makefile
+++ b/fs/quota/Makefile
@@ -3,3 +3,5 @@ obj-$(CONFIG_QFMT_V1)		+= quota_v1.o
 obj-$(CONFIG_QFMT_V2)		+= quota_v2.o
 obj-$(CONFIG_QUOTA_TREE)	+= quota_tree.o
 obj-$(CONFIG_QUOTACTL)		+= quota.o
+obj-$(CONFIG_QUOTACTL_COMPAT)	+= compat.o
+obj-$(CONFIG_QUOTA_NETLINK_INTERFACE)	+= netlink.o
diff --git a/fs/quota/compat.c b/fs/quota/compat.c
new file mode 100644
index 000000000000..fb1892fe3e56
--- /dev/null
+++ b/fs/quota/compat.c
@@ -0,0 +1,118 @@
+
+#include <linux/syscalls.h>
+#include <linux/compat.h>
+#include <linux/quotaops.h>
+
+/*
+ * This code works only for 32 bit quota tools over 64 bit OS (x86_64, ia64)
+ * and is necessary due to alignment problems.
+ */
+struct compat_if_dqblk {
+	compat_u64 dqb_bhardlimit;
+	compat_u64 dqb_bsoftlimit;
+	compat_u64 dqb_curspace;
+	compat_u64 dqb_ihardlimit;
+	compat_u64 dqb_isoftlimit;
+	compat_u64 dqb_curinodes;
+	compat_u64 dqb_btime;
+	compat_u64 dqb_itime;
+	compat_uint_t dqb_valid;
+};
+
+/* XFS structures */
+struct compat_fs_qfilestat {
+	compat_u64 dqb_bhardlimit;
+	compat_u64 qfs_nblks;
+	compat_uint_t qfs_nextents;
+};
+
+struct compat_fs_quota_stat {
+	__s8		qs_version;
+	__u16		qs_flags;
+	__s8		qs_pad;
+	struct compat_fs_qfilestat	qs_uquota;
+	struct compat_fs_qfilestat	qs_gquota;
+	compat_uint_t	qs_incoredqs;
+	compat_int_t	qs_btimelimit;
+	compat_int_t	qs_itimelimit;
+	compat_int_t	qs_rtbtimelimit;
+	__u16		qs_bwarnlimit;
+	__u16		qs_iwarnlimit;
+};
+
+asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special,
+						qid_t id, void __user *addr)
+{
+	unsigned int cmds;
+	struct if_dqblk __user *dqblk;
+	struct compat_if_dqblk __user *compat_dqblk;
+	struct fs_quota_stat __user *fsqstat;
+	struct compat_fs_quota_stat __user *compat_fsqstat;
+	compat_uint_t data;
+	u16 xdata;
+	long ret;
+
+	cmds = cmd >> SUBCMDSHIFT;
+
+	switch (cmds) {
+	case Q_GETQUOTA:
+		dqblk = compat_alloc_user_space(sizeof(struct if_dqblk));
+		compat_dqblk = addr;
+		ret = sys_quotactl(cmd, special, id, dqblk);
+		if (ret)
+			break;
+		if (copy_in_user(compat_dqblk, dqblk, sizeof(*compat_dqblk)) ||
+			get_user(data, &dqblk->dqb_valid) ||
+			put_user(data, &compat_dqblk->dqb_valid))
+			ret = -EFAULT;
+		break;
+	case Q_SETQUOTA:
+		dqblk = compat_alloc_user_space(sizeof(struct if_dqblk));
+		compat_dqblk = addr;
+		ret = -EFAULT;
+		if (copy_in_user(dqblk, compat_dqblk, sizeof(*compat_dqblk)) ||
+			get_user(data, &compat_dqblk->dqb_valid) ||
+			put_user(data, &dqblk->dqb_valid))
+			break;
+		ret = sys_quotactl(cmd, special, id, dqblk);
+		break;
+	case Q_XGETQSTAT:
+		fsqstat = compat_alloc_user_space(sizeof(struct fs_quota_stat));
+		compat_fsqstat = addr;
+		ret = sys_quotactl(cmd, special, id, fsqstat);
+		if (ret)
+			break;
+		ret = -EFAULT;
+		/* Copying qs_version, qs_flags, qs_pad */
+		if (copy_in_user(compat_fsqstat, fsqstat,
+			offsetof(struct compat_fs_quota_stat, qs_uquota)))
+			break;
+		/* Copying qs_uquota */
+		if (copy_in_user(&compat_fsqstat->qs_uquota,
+			&fsqstat->qs_uquota,
+			sizeof(compat_fsqstat->qs_uquota)) ||
+			get_user(data, &fsqstat->qs_uquota.qfs_nextents) ||
+			put_user(data, &compat_fsqstat->qs_uquota.qfs_nextents))
+			break;
+		/* Copying qs_gquota */
+		if (copy_in_user(&compat_fsqstat->qs_gquota,
+			&fsqstat->qs_gquota,
+			sizeof(compat_fsqstat->qs_gquota)) ||
+			get_user(data, &fsqstat->qs_gquota.qfs_nextents) ||
+			put_user(data, &compat_fsqstat->qs_gquota.qfs_nextents))
+			break;
+		/* Copying the rest */
+		if (copy_in_user(&compat_fsqstat->qs_incoredqs,
+			&fsqstat->qs_incoredqs,
+			sizeof(struct compat_fs_quota_stat) -
+			offsetof(struct compat_fs_quota_stat, qs_incoredqs)) ||
+			get_user(xdata, &fsqstat->qs_iwarnlimit) ||
+			put_user(xdata, &compat_fsqstat->qs_iwarnlimit))
+			break;
+		ret = 0;
+		break;
+	default:
+		ret = sys_quotactl(cmd, special, id, addr);
+	}
+	return ret;
+}
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 3fc62b097bed..437d2ca2de97 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -80,11 +80,9 @@
 
 #include <asm/uaccess.h>
 
-#define __DQUOT_PARANOIA
-
 /*
  * There are three quota SMP locks. dq_list_lock protects all lists with quotas
- * and quota formats, dqstats structure containing statistics about the lists
+ * and quota formats.
  * dq_data_lock protects data from dq_dqb and also mem_dqinfo structures and
  * also guards consistency of dquot->dq_dqb with inode->i_blocks, i_bytes.
  * i_blocks and i_bytes updates itself are guarded by i_lock acquired directly
@@ -100,9 +98,13 @@
  *
  * Any operation working on dquots via inode pointers must hold dqptr_sem.  If
  * operation is just reading pointers from inode (or not using them at all) the
- * read lock is enough. If pointers are altered function must hold write lock
- * (these locking rules also apply for S_NOQUOTA flag in the inode - note that
- * for altering the flag i_mutex is also needed).
+ * read lock is enough. If pointers are altered function must hold write lock.
+ * Special care needs to be taken about S_NOQUOTA inode flag (marking that
+ * inode is a quota file). Functions adding pointers from inode to dquots have
+ * to check this flag under dqptr_sem and then (if S_NOQUOTA is not set) they
+ * have to do all pointer modifications before dropping dqptr_sem. This makes
+ * sure they cannot race with quotaon which first sets S_NOQUOTA flag and
+ * then drops all pointers to dquots from an inode.
  *
  * Each dquot has its dq_lock mutex. Locked dquots might not be referenced
  * from inodes (dquot_alloc_space() and such don't check the dq_lock).
@@ -130,7 +132,9 @@ static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_state_lock);
 __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_data_lock);
 EXPORT_SYMBOL(dq_data_lock);
 
+#if defined(CONFIG_QUOTA_DEBUG) || defined(CONFIG_PRINT_QUOTA_WARNING)
 static char *quotatypes[] = INITQFNAMES;
+#endif
 static struct quota_format_type *quota_formats;	/* List of registered formats */
 static struct quota_module_name module_names[] = INIT_QUOTA_MODULE_NAMES;
 
@@ -225,6 +229,9 @@ static struct hlist_head *dquot_hash;
 struct dqstats dqstats;
 EXPORT_SYMBOL(dqstats);
 
+static qsize_t inode_get_rsv_space(struct inode *inode);
+static void __dquot_initialize(struct inode *inode, int type);
+
 static inline unsigned int
 hashfn(const struct super_block *sb, unsigned int id, int type)
 {
@@ -268,7 +275,7 @@ static struct dquot *find_dquot(unsigned int hashent, struct super_block *sb,
 static inline void put_dquot_last(struct dquot *dquot)
 {
 	list_add_tail(&dquot->dq_free, &free_dquots);
-	dqstats.free_dquots++;
+	dqstats_inc(DQST_FREE_DQUOTS);
 }
 
 static inline void remove_free_dquot(struct dquot *dquot)
@@ -276,7 +283,7 @@ static inline void remove_free_dquot(struct dquot *dquot)
 	if (list_empty(&dquot->dq_free))
 		return;
 	list_del_init(&dquot->dq_free);
-	dqstats.free_dquots--;
+	dqstats_dec(DQST_FREE_DQUOTS);
 }
 
 static inline void put_inuse(struct dquot *dquot)
@@ -284,12 +291,12 @@ static inline void put_inuse(struct dquot *dquot)
 	/* We add to the back of inuse list so we don't have to restart
 	 * when traversing this list and we block */
 	list_add_tail(&dquot->dq_inuse, &inuse_list);
-	dqstats.allocated_dquots++;
+	dqstats_inc(DQST_ALLOC_DQUOTS);
 }
 
 static inline void remove_inuse(struct dquot *dquot)
 {
-	dqstats.allocated_dquots--;
+	dqstats_dec(DQST_ALLOC_DQUOTS);
 	list_del(&dquot->dq_inuse);
 }
 /*
@@ -312,14 +319,23 @@ static inline int mark_dquot_dirty(struct dquot *dquot)
 	return dquot->dq_sb->dq_op->mark_dirty(dquot);
 }
 
+/* Mark dquot dirty in atomic manner, and return it's old dirty flag state */
 int dquot_mark_dquot_dirty(struct dquot *dquot)
 {
+	int ret = 1;
+
+	/* If quota is dirty already, we don't have to acquire dq_list_lock */
+	if (test_bit(DQ_MOD_B, &dquot->dq_flags))
+		return 1;
+
 	spin_lock(&dq_list_lock);
-	if (!test_and_set_bit(DQ_MOD_B, &dquot->dq_flags))
+	if (!test_and_set_bit(DQ_MOD_B, &dquot->dq_flags)) {
 		list_add(&dquot->dq_dirty, &sb_dqopt(dquot->dq_sb)->
 				info[dquot->dq_type].dqi_dirty_list);
+		ret = 0;
+	}
 	spin_unlock(&dq_list_lock);
-	return 0;
+	return ret;
 }
 EXPORT_SYMBOL(dquot_mark_dquot_dirty);
 
@@ -545,8 +561,8 @@ int dquot_scan_active(struct super_block *sb,
 			continue;
 		/* Now we have active dquot so we can just increase use count */
 		atomic_inc(&dquot->dq_count);
-		dqstats.lookups++;
 		spin_unlock(&dq_list_lock);
+		dqstats_inc(DQST_LOOKUPS);
 		dqput(old_dquot);
 		old_dquot = dquot;
 		ret = fn(dquot, priv);
@@ -564,7 +580,7 @@ out:
 }
 EXPORT_SYMBOL(dquot_scan_active);
 
-int vfs_quota_sync(struct super_block *sb, int type)
+int dquot_quota_sync(struct super_block *sb, int type, int wait)
 {
 	struct list_head *dirty;
 	struct dquot *dquot;
@@ -591,8 +607,8 @@ int vfs_quota_sync(struct super_block *sb, int type)
  			 * holding reference so we can safely just increase
 			 * use count */
 			atomic_inc(&dquot->dq_count);
-			dqstats.lookups++;
 			spin_unlock(&dq_list_lock);
+			dqstats_inc(DQST_LOOKUPS);
 			sb->dq_op->write_dquot(dquot);
 			dqput(dquot);
 			spin_lock(&dq_list_lock);
@@ -604,14 +620,39 @@ int vfs_quota_sync(struct super_block *sb, int type)
 		if ((cnt == type || type == -1) && sb_has_quota_active(sb, cnt)
 		    && info_dirty(&dqopt->info[cnt]))
 			sb->dq_op->write_info(sb, cnt);
-	spin_lock(&dq_list_lock);
-	dqstats.syncs++;
-	spin_unlock(&dq_list_lock);
+	dqstats_inc(DQST_SYNCS);
 	mutex_unlock(&dqopt->dqonoff_mutex);
 
+	if (!wait || (sb_dqopt(sb)->flags & DQUOT_QUOTA_SYS_FILE))
+		return 0;
+
+	/* This is not very clever (and fast) but currently I don't know about
+	 * any other simple way of getting quota data to disk and we must get
+	 * them there for userspace to be visible... */
+	if (sb->s_op->sync_fs)
+		sb->s_op->sync_fs(sb, 1);
+	sync_blockdev(sb->s_bdev);
+
+	/*
+	 * Now when everything is written we can discard the pagecache so
+	 * that userspace sees the changes.
+	 */
+	mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+		if (type != -1 && cnt != type)
+			continue;
+		if (!sb_has_quota_active(sb, cnt))
+			continue;
+		mutex_lock_nested(&sb_dqopt(sb)->files[cnt]->i_mutex,
+				  I_MUTEX_QUOTA);
+		truncate_inode_pages(&sb_dqopt(sb)->files[cnt]->i_data, 0);
+		mutex_unlock(&sb_dqopt(sb)->files[cnt]->i_mutex);
+	}
+	mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
+
 	return 0;
 }
-EXPORT_SYMBOL(vfs_quota_sync);
+EXPORT_SYMBOL(dquot_quota_sync);
 
 /* Free unused dquots from cache */
 static void prune_dqcache(int count)
@@ -635,15 +676,16 @@ static void prune_dqcache(int count)
  * This is called from kswapd when we think we need some
  * more memory
  */
-
-static int shrink_dqcache_memory(int nr, gfp_t gfp_mask)
+static int shrink_dqcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
 {
 	if (nr) {
 		spin_lock(&dq_list_lock);
 		prune_dqcache(nr);
 		spin_unlock(&dq_list_lock);
 	}
-	return (dqstats.free_dquots / 100) * sysctl_vfs_cache_pressure;
+	return ((unsigned)
+		percpu_counter_read_positive(&dqstats.counter[DQST_FREE_DQUOTS])
+		/100) * sysctl_vfs_cache_pressure;
 }
 
 static struct shrinker dqcache_shrinker = {
@@ -661,7 +703,7 @@ void dqput(struct dquot *dquot)
 
 	if (!dquot)
 		return;
-#ifdef __DQUOT_PARANOIA
+#ifdef CONFIG_QUOTA_DEBUG
 	if (!atomic_read(&dquot->dq_count)) {
 		printk("VFS: dqput: trying to free free dquot\n");
 		printk("VFS: device %s, dquot of %s %d\n",
@@ -671,10 +713,7 @@ void dqput(struct dquot *dquot)
 		BUG();
 	}
 #endif
-	
-	spin_lock(&dq_list_lock);
-	dqstats.drops++;
-	spin_unlock(&dq_list_lock);
+	dqstats_inc(DQST_DROPS);
 we_slept:
 	spin_lock(&dq_list_lock);
 	if (atomic_read(&dquot->dq_count) > 1) {
@@ -714,7 +753,7 @@ we_slept:
 		goto we_slept;
 	}
 	atomic_dec(&dquot->dq_count);
-#ifdef __DQUOT_PARANOIA
+#ifdef CONFIG_QUOTA_DEBUG
 	/* sanity check */
 	BUG_ON(!list_empty(&dquot->dq_free));
 #endif
@@ -791,15 +830,15 @@ we_slept:
 		put_inuse(dquot);
 		/* hash it first so it can be found */
 		insert_dquot_hash(dquot);
-		dqstats.lookups++;
 		spin_unlock(&dq_list_lock);
+		dqstats_inc(DQST_LOOKUPS);
 	} else {
 		if (!atomic_read(&dquot->dq_count))
 			remove_free_dquot(dquot);
 		atomic_inc(&dquot->dq_count);
-		dqstats.cache_hits++;
-		dqstats.lookups++;
 		spin_unlock(&dq_list_lock);
+		dqstats_inc(DQST_CACHE_HITS);
+		dqstats_inc(DQST_LOOKUPS);
 	}
 	/* Wait for dq_lock - after this we know that either dquot_release() is
 	 * already finished or it will be canceled due to dq_count > 1 test */
@@ -811,7 +850,7 @@ we_slept:
 		dquot = NULL;
 		goto out;
 	}
-#ifdef __DQUOT_PARANOIA
+#ifdef CONFIG_QUOTA_DEBUG
 	BUG_ON(!dquot->dq_sb);	/* Has somebody invalidated entry under us? */
 #endif
 out:
@@ -840,11 +879,18 @@ static int dqinit_needed(struct inode *inode, int type)
 static void add_dquot_ref(struct super_block *sb, int type)
 {
 	struct inode *inode, *old_inode = NULL;
+#ifdef CONFIG_QUOTA_DEBUG
+	int reserved = 0;
+#endif
 
 	spin_lock(&inode_lock);
 	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
 		if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
 			continue;
+#ifdef CONFIG_QUOTA_DEBUG
+		if (unlikely(inode_get_rsv_space(inode) > 0))
+			reserved = 1;
+#endif
 		if (!atomic_read(&inode->i_writecount))
 			continue;
 		if (!dqinit_needed(inode, type))
@@ -854,7 +900,7 @@ static void add_dquot_ref(struct super_block *sb, int type)
 		spin_unlock(&inode_lock);
 
 		iput(old_inode);
-		sb->dq_op->initialize(inode, type);
+		__dquot_initialize(inode, type);
 		/* We hold a reference to 'inode' so it couldn't have been
 		 * removed from s_inodes list while we dropped the inode_lock.
 		 * We cannot iput the inode now as we can be holding the last
@@ -865,6 +911,14 @@ static void add_dquot_ref(struct super_block *sb, int type)
 	}
 	spin_unlock(&inode_lock);
 	iput(old_inode);
+
+#ifdef CONFIG_QUOTA_DEBUG
+	if (reserved) {
+		printk(KERN_WARNING "VFS (%s): Writes happened before quota"
+			" was turned on thus quota information is probably "
+			"inconsistent. Please run quotacheck(8).\n", sb->s_id);
+	}
+#endif
 }
 
 /*
@@ -891,7 +945,7 @@ static int remove_inode_dquot_ref(struct inode *inode, int type,
 	inode->i_dquot[type] = NULL;
 	if (dquot) {
 		if (dqput_blocks(dquot)) {
-#ifdef __DQUOT_PARANOIA
+#ifdef CONFIG_QUOTA_DEBUG
 			if (atomic_read(&dquot->dq_count) != 1)
 				printk(KERN_WARNING "VFS: Adding dquot with dq_count %d to dispose list.\n", atomic_read(&dquot->dq_count));
 #endif
@@ -978,10 +1032,12 @@ static inline void dquot_resv_space(struct dquot *dquot, qsize_t number)
 /*
  * Claim reserved quota space
  */
-static void dquot_claim_reserved_space(struct dquot *dquot,
-						qsize_t number)
+static void dquot_claim_reserved_space(struct dquot *dquot, qsize_t number)
 {
-	WARN_ON(dquot->dq_dqb.dqb_rsvspace < number);
+	if (dquot->dq_dqb.dqb_rsvspace < number) {
+		WARN_ON_ONCE(1);
+		number = dquot->dq_dqb.dqb_rsvspace;
+	}
 	dquot->dq_dqb.dqb_curspace += number;
 	dquot->dq_dqb.dqb_rsvspace -= number;
 }
@@ -989,7 +1045,12 @@ static void dquot_claim_reserved_space(struct dquot *dquot,
 static inline
 void dquot_free_reserved_space(struct dquot *dquot, qsize_t number)
 {
-	dquot->dq_dqb.dqb_rsvspace -= number;
+	if (dquot->dq_dqb.dqb_rsvspace >= number)
+		dquot->dq_dqb.dqb_rsvspace -= number;
+	else {
+		WARN_ON_ONCE(1);
+		dquot->dq_dqb.dqb_rsvspace = 0;
+	}
 }
 
 static void dquot_decr_inodes(struct dquot *dquot, qsize_t number)
@@ -1131,13 +1192,13 @@ static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype)
 	*warntype = QUOTA_NL_NOWARN;
 	if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type) ||
 	    test_bit(DQ_FAKE_B, &dquot->dq_flags))
-		return QUOTA_OK;
+		return 0;
 
 	if (dquot->dq_dqb.dqb_ihardlimit &&
 	    newinodes > dquot->dq_dqb.dqb_ihardlimit &&
             !ignore_hardlimit(dquot)) {
 		*warntype = QUOTA_NL_IHARDWARN;
-		return NO_QUOTA;
+		return -EDQUOT;
 	}
 
 	if (dquot->dq_dqb.dqb_isoftlimit &&
@@ -1146,7 +1207,7 @@ static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype)
 	    get_seconds() >= dquot->dq_dqb.dqb_itime &&
             !ignore_hardlimit(dquot)) {
 		*warntype = QUOTA_NL_ISOFTLONGWARN;
-		return NO_QUOTA;
+		return -EDQUOT;
 	}
 
 	if (dquot->dq_dqb.dqb_isoftlimit &&
@@ -1157,7 +1218,7 @@ static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype)
 		    sb_dqopt(dquot->dq_sb)->info[dquot->dq_type].dqi_igrace;
 	}
 
-	return QUOTA_OK;
+	return 0;
 }
 
 /* needs dq_data_lock */
@@ -1169,7 +1230,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
 	*warntype = QUOTA_NL_NOWARN;
 	if (!sb_has_quota_limits_enabled(sb, dquot->dq_type) ||
 	    test_bit(DQ_FAKE_B, &dquot->dq_flags))
-		return QUOTA_OK;
+		return 0;
 
 	tspace = dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace
 		+ space;
@@ -1179,7 +1240,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
             !ignore_hardlimit(dquot)) {
 		if (!prealloc)
 			*warntype = QUOTA_NL_BHARDWARN;
-		return NO_QUOTA;
+		return -EDQUOT;
 	}
 
 	if (dquot->dq_dqb.dqb_bsoftlimit &&
@@ -1189,7 +1250,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
             !ignore_hardlimit(dquot)) {
 		if (!prealloc)
 			*warntype = QUOTA_NL_BSOFTLONGWARN;
-		return NO_QUOTA;
+		return -EDQUOT;
 	}
 
 	if (dquot->dq_dqb.dqb_bsoftlimit &&
@@ -1205,10 +1266,10 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
 			 * We don't allow preallocation to exceed softlimit so exceeding will
 			 * be always printed
 			 */
-			return NO_QUOTA;
+			return -EDQUOT;
 	}
 
-	return QUOTA_OK;
+	return 0;
 }
 
 static int info_idq_free(struct dquot *dquot, qsize_t inodes)
@@ -1242,25 +1303,32 @@ static int info_bdq_free(struct dquot *dquot, qsize_t space)
 		return QUOTA_NL_BHARDBELOW;
 	return QUOTA_NL_NOWARN;
 }
+
 /*
- *	Initialize quota pointers in inode
- *	We do things in a bit complicated way but by that we avoid calling
- *	dqget() and thus filesystem callbacks under dqptr_sem.
+ * Initialize quota pointers in inode
+ *
+ * We do things in a bit complicated way but by that we avoid calling
+ * dqget() and thus filesystem callbacks under dqptr_sem.
+ *
+ * It is better to call this function outside of any transaction as it
+ * might need a lot of space in journal for dquot structure allocation.
  */
-int dquot_initialize(struct inode *inode, int type)
+static void __dquot_initialize(struct inode *inode, int type)
 {
 	unsigned int id = 0;
-	int cnt, ret = 0;
-	struct dquot *got[MAXQUOTAS] = { NULL, NULL };
+	int cnt;
+	struct dquot *got[MAXQUOTAS];
 	struct super_block *sb = inode->i_sb;
+	qsize_t rsv;
 
 	/* First test before acquiring mutex - solves deadlocks when we
          * re-enter the quota code and are already holding the mutex */
-	if (IS_NOQUOTA(inode))
-		return 0;
+	if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode))
+		return;
 
 	/* First get references to structures we might need. */
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+		got[cnt] = NULL;
 		if (type != -1 && cnt != type)
 			continue;
 		switch (cnt) {
@@ -1275,7 +1343,6 @@ int dquot_initialize(struct inode *inode, int type)
 	}
 
 	down_write(&sb_dqopt(sb)->dqptr_sem);
-	/* Having dqptr_sem we know NOQUOTA flags can't be altered... */
 	if (IS_NOQUOTA(inode))
 		goto out_err;
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
@@ -1287,20 +1354,31 @@ int dquot_initialize(struct inode *inode, int type)
 		if (!inode->i_dquot[cnt]) {
 			inode->i_dquot[cnt] = got[cnt];
 			got[cnt] = NULL;
+			/*
+			 * Make quota reservation system happy if someone
+			 * did a write before quota was turned on
+			 */
+			rsv = inode_get_rsv_space(inode);
+			if (unlikely(rsv))
+				dquot_resv_space(inode->i_dquot[cnt], rsv);
 		}
 	}
 out_err:
 	up_write(&sb_dqopt(sb)->dqptr_sem);
 	/* Drop unused references */
 	dqput_all(got);
-	return ret;
+}
+
+void dquot_initialize(struct inode *inode)
+{
+	__dquot_initialize(inode, -1);
 }
 EXPORT_SYMBOL(dquot_initialize);
 
 /*
  * 	Release all quotas referenced by inode
  */
-int dquot_drop(struct inode *inode)
+static void __dquot_drop(struct inode *inode)
 {
 	int cnt;
 	struct dquot *put[MAXQUOTAS];
@@ -1312,32 +1390,31 @@ int dquot_drop(struct inode *inode)
 	}
 	up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
 	dqput_all(put);
-	return 0;
 }
-EXPORT_SYMBOL(dquot_drop);
 
-/* Wrapper to remove references to quota structures from inode */
-void vfs_dq_drop(struct inode *inode)
-{
-	/* Here we can get arbitrary inode from clear_inode() so we have
-	 * to be careful. OTOH we don't need locking as quota operations
-	 * are allowed to change only at mount time */
-	if (!IS_NOQUOTA(inode) && inode->i_sb && inode->i_sb->dq_op
-	    && inode->i_sb->dq_op->drop) {
-		int cnt;
-		/* Test before calling to rule out calls from proc and such
-                 * where we are not allowed to block. Note that this is
-		 * actually reliable test even without the lock - the caller
-		 * must assure that nobody can come after the DQUOT_DROP and
-		 * add quota pointers back anyway */
-		for (cnt = 0; cnt < MAXQUOTAS; cnt++)
-			if (inode->i_dquot[cnt])
-				break;
-		if (cnt < MAXQUOTAS)
-			inode->i_sb->dq_op->drop(inode);
-	}
-}
-EXPORT_SYMBOL(vfs_dq_drop);
+void dquot_drop(struct inode *inode)
+{
+	int cnt;
+
+	if (IS_NOQUOTA(inode))
+		return;
+
+	/*
+	 * Test before calling to rule out calls from proc and such
+	 * where we are not allowed to block. Note that this is
+	 * actually reliable test even without the lock - the caller
+	 * must assure that nobody can come after the DQUOT_DROP and
+	 * add quota pointers back anyway.
+	 */
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+		if (inode->i_dquot[cnt])
+			break;
+	}
+
+	if (cnt < MAXQUOTAS)
+		__dquot_drop(inode);
+}
+EXPORT_SYMBOL(dquot_drop);
 
 /*
  * inode_reserved_space is managed internally by quota, and protected by
@@ -1351,28 +1428,30 @@ static qsize_t *inode_reserved_space(struct inode * inode)
 	return inode->i_sb->dq_op->get_reserved_space(inode);
 }
 
-static void inode_add_rsv_space(struct inode *inode, qsize_t number)
+void inode_add_rsv_space(struct inode *inode, qsize_t number)
 {
 	spin_lock(&inode->i_lock);
 	*inode_reserved_space(inode) += number;
 	spin_unlock(&inode->i_lock);
 }
+EXPORT_SYMBOL(inode_add_rsv_space);
 
-
-static void inode_claim_rsv_space(struct inode *inode, qsize_t number)
+void inode_claim_rsv_space(struct inode *inode, qsize_t number)
 {
 	spin_lock(&inode->i_lock);
 	*inode_reserved_space(inode) -= number;
 	__inode_add_bytes(inode, number);
 	spin_unlock(&inode->i_lock);
 }
+EXPORT_SYMBOL(inode_claim_rsv_space);
 
-static void inode_sub_rsv_space(struct inode *inode, qsize_t number)
+void inode_sub_rsv_space(struct inode *inode, qsize_t number)
 {
 	spin_lock(&inode->i_lock);
 	*inode_reserved_space(inode) -= number;
 	spin_unlock(&inode->i_lock);
 }
+EXPORT_SYMBOL(inode_sub_rsv_space);
 
 static qsize_t inode_get_rsv_space(struct inode *inode)
 {
@@ -1404,38 +1483,36 @@ static void inode_decr_space(struct inode *inode, qsize_t number, int reserve)
 }
 
 /*
- * Following four functions update i_blocks+i_bytes fields and
- * quota information (together with appropriate checks)
- * NOTE: We absolutely rely on the fact that caller dirties
- * the inode (usually macros in quotaops.h care about this) and
- * holds a handle for the current transaction so that dquot write and
- * inode write go into the same transaction.
+ * This functions updates i_blocks+i_bytes fields and quota information
+ * (together with appropriate checks).
+ *
+ * NOTE: We absolutely rely on the fact that caller dirties the inode
+ * (usually helpers in quotaops.h care about this) and holds a handle for
+ * the current transaction so that dquot write and inode write go into the
+ * same transaction.
  */
 
 /*
  * This operation can block, but only after everything is updated
  */
-int __dquot_alloc_space(struct inode *inode, qsize_t number,
-			int warn, int reserve)
+int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags)
 {
-	int cnt, ret = QUOTA_OK;
+	int cnt, ret = 0;
 	char warntype[MAXQUOTAS];
+	int warn = flags & DQUOT_SPACE_WARN;
+	int reserve = flags & DQUOT_SPACE_RESERVE;
+	int nofail = flags & DQUOT_SPACE_NOFAIL;
 
 	/*
 	 * First test before acquiring mutex - solves deadlocks when we
 	 * re-enter the quota code and are already holding the mutex
 	 */
-	if (IS_NOQUOTA(inode)) {
+	if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) {
 		inode_incr_space(inode, number, reserve);
 		goto out;
 	}
 
 	down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
-	if (IS_NOQUOTA(inode)) {
-		inode_incr_space(inode, number, reserve);
-		goto out_unlock;
-	}
-
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++)
 		warntype[cnt] = QUOTA_NL_NOWARN;
 
@@ -1443,9 +1520,9 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number,
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
 		if (!inode->i_dquot[cnt])
 			continue;
-		if (check_bdq(inode->i_dquot[cnt], number, warn, warntype+cnt)
-		    == NO_QUOTA) {
-			ret = NO_QUOTA;
+		ret = check_bdq(inode->i_dquot[cnt], number, !warn,
+				warntype+cnt);
+		if (ret && !nofail) {
 			spin_unlock(&dq_data_lock);
 			goto out_flush_warn;
 		}
@@ -1466,61 +1543,45 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number,
 	mark_all_dquot_dirty(inode->i_dquot);
 out_flush_warn:
 	flush_warnings(inode->i_dquot, warntype);
-out_unlock:
 	up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
 out:
 	return ret;
 }
-
-int dquot_alloc_space(struct inode *inode, qsize_t number, int warn)
-{
-	return __dquot_alloc_space(inode, number, warn, 0);
-}
-EXPORT_SYMBOL(dquot_alloc_space);
-
-int dquot_reserve_space(struct inode *inode, qsize_t number, int warn)
-{
-	return __dquot_alloc_space(inode, number, warn, 1);
-}
-EXPORT_SYMBOL(dquot_reserve_space);
+EXPORT_SYMBOL(__dquot_alloc_space);
 
 /*
  * This operation can block, but only after everything is updated
  */
-int dquot_alloc_inode(const struct inode *inode, qsize_t number)
+int dquot_alloc_inode(const struct inode *inode)
 {
-	int cnt, ret = NO_QUOTA;
+	int cnt, ret = 0;
 	char warntype[MAXQUOTAS];
 
 	/* First test before acquiring mutex - solves deadlocks when we
          * re-enter the quota code and are already holding the mutex */
-	if (IS_NOQUOTA(inode))
-		return QUOTA_OK;
+	if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode))
+		return 0;
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++)
 		warntype[cnt] = QUOTA_NL_NOWARN;
 	down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
-	if (IS_NOQUOTA(inode)) {
-		up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
-		return QUOTA_OK;
-	}
 	spin_lock(&dq_data_lock);
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
 		if (!inode->i_dquot[cnt])
 			continue;
-		if (check_idq(inode->i_dquot[cnt], number, warntype+cnt)
-		    == NO_QUOTA)
+		ret = check_idq(inode->i_dquot[cnt], 1, warntype + cnt);
+		if (ret)
 			goto warn_put_all;
 	}
 
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
 		if (!inode->i_dquot[cnt])
 			continue;
-		dquot_incr_inodes(inode->i_dquot[cnt], number);
+		dquot_incr_inodes(inode->i_dquot[cnt], 1);
 	}
-	ret = QUOTA_OK;
+
 warn_put_all:
 	spin_unlock(&dq_data_lock);
-	if (ret == QUOTA_OK)
+	if (ret == 0)
 		mark_all_dquot_dirty(inode->i_dquot);
 	flush_warnings(inode->i_dquot, warntype);
 	up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
@@ -1528,23 +1589,19 @@ warn_put_all:
 }
 EXPORT_SYMBOL(dquot_alloc_inode);
 
-int dquot_claim_space(struct inode *inode, qsize_t number)
+/*
+ * Convert in-memory reserved quotas to real consumed quotas
+ */
+int dquot_claim_space_nodirty(struct inode *inode, qsize_t number)
 {
 	int cnt;
-	int ret = QUOTA_OK;
 
-	if (IS_NOQUOTA(inode)) {
+	if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) {
 		inode_claim_rsv_space(inode, number);
-		goto out;
+		return 0;
 	}
 
 	down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
-	if (IS_NOQUOTA(inode))	{
-		up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
-		inode_claim_rsv_space(inode, number);
-		goto out;
-	}
-
 	spin_lock(&dq_data_lock);
 	/* Claim reserved quotas to allocated quotas */
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
@@ -1557,33 +1614,27 @@ int dquot_claim_space(struct inode *inode, qsize_t number)
 	spin_unlock(&dq_data_lock);
 	mark_all_dquot_dirty(inode->i_dquot);
 	up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
-out:
-	return ret;
+	return 0;
 }
-EXPORT_SYMBOL(dquot_claim_space);
+EXPORT_SYMBOL(dquot_claim_space_nodirty);
 
 /*
  * This operation can block, but only after everything is updated
  */
-int __dquot_free_space(struct inode *inode, qsize_t number, int reserve)
+void __dquot_free_space(struct inode *inode, qsize_t number, int flags)
 {
 	unsigned int cnt;
 	char warntype[MAXQUOTAS];
+	int reserve = flags & DQUOT_SPACE_RESERVE;
 
 	/* First test before acquiring mutex - solves deadlocks when we
          * re-enter the quota code and are already holding the mutex */
-	if (IS_NOQUOTA(inode)) {
-out_sub:
+	if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) {
 		inode_decr_space(inode, number, reserve);
-		return QUOTA_OK;
+		return;
 	}
 
 	down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
-	/* Now recheck reliably when holding dqptr_sem */
-	if (IS_NOQUOTA(inode)) {
-		up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
-		goto out_sub;
-	}
 	spin_lock(&dq_data_lock);
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
 		if (!inode->i_dquot[cnt])
@@ -1603,99 +1654,67 @@ out_sub:
 out_unlock:
 	flush_warnings(inode->i_dquot, warntype);
 	up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
-	return QUOTA_OK;
 }
-
-int dquot_free_space(struct inode *inode, qsize_t number)
-{
-	return  __dquot_free_space(inode, number, 0);
-}
-EXPORT_SYMBOL(dquot_free_space);
-
-/*
- * Release reserved quota space
- */
-void dquot_release_reserved_space(struct inode *inode, qsize_t number)
-{
-	__dquot_free_space(inode, number, 1);
-
-}
-EXPORT_SYMBOL(dquot_release_reserved_space);
+EXPORT_SYMBOL(__dquot_free_space);
 
 /*
  * This operation can block, but only after everything is updated
  */
-int dquot_free_inode(const struct inode *inode, qsize_t number)
+void dquot_free_inode(const struct inode *inode)
 {
 	unsigned int cnt;
 	char warntype[MAXQUOTAS];
 
 	/* First test before acquiring mutex - solves deadlocks when we
          * re-enter the quota code and are already holding the mutex */
-	if (IS_NOQUOTA(inode))
-		return QUOTA_OK;
+	if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode))
+		return;
 
 	down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
-	/* Now recheck reliably when holding dqptr_sem */
-	if (IS_NOQUOTA(inode)) {
-		up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
-		return QUOTA_OK;
-	}
 	spin_lock(&dq_data_lock);
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
 		if (!inode->i_dquot[cnt])
 			continue;
-		warntype[cnt] = info_idq_free(inode->i_dquot[cnt], number);
-		dquot_decr_inodes(inode->i_dquot[cnt], number);
+		warntype[cnt] = info_idq_free(inode->i_dquot[cnt], 1);
+		dquot_decr_inodes(inode->i_dquot[cnt], 1);
 	}
 	spin_unlock(&dq_data_lock);
 	mark_all_dquot_dirty(inode->i_dquot);
 	flush_warnings(inode->i_dquot, warntype);
 	up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
-	return QUOTA_OK;
 }
 EXPORT_SYMBOL(dquot_free_inode);
 
 /*
  * Transfer the number of inode and blocks from one diskquota to an other.
+ * On success, dquot references in transfer_to are consumed and references
+ * to original dquots that need to be released are placed there. On failure,
+ * references are kept untouched.
  *
  * This operation can block, but only after everything is updated
  * A transaction must be started when entering this function.
+ *
  */
-int dquot_transfer(struct inode *inode, struct iattr *iattr)
+int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
 {
 	qsize_t space, cur_space;
 	qsize_t rsv_space = 0;
-	struct dquot *transfer_from[MAXQUOTAS];
-	struct dquot *transfer_to[MAXQUOTAS];
-	int cnt, ret = QUOTA_OK;
-	int chuid = iattr->ia_valid & ATTR_UID && inode->i_uid != iattr->ia_uid,
-	    chgid = iattr->ia_valid & ATTR_GID && inode->i_gid != iattr->ia_gid;
+	struct dquot *transfer_from[MAXQUOTAS] = {};
+	int cnt, ret = 0;
 	char warntype_to[MAXQUOTAS];
 	char warntype_from_inodes[MAXQUOTAS], warntype_from_space[MAXQUOTAS];
 
 	/* First test before acquiring mutex - solves deadlocks when we
          * re-enter the quota code and are already holding the mutex */
 	if (IS_NOQUOTA(inode))
-		return QUOTA_OK;
+		return 0;
 	/* Initialize the arrays */
-	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
-		transfer_from[cnt] = NULL;
-		transfer_to[cnt] = NULL;
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++)
 		warntype_to[cnt] = QUOTA_NL_NOWARN;
-	}
-	if (chuid)
-		transfer_to[USRQUOTA] = dqget(inode->i_sb, iattr->ia_uid,
-					      USRQUOTA);
-	if (chgid)
-		transfer_to[GRPQUOTA] = dqget(inode->i_sb, iattr->ia_gid,
-					      GRPQUOTA);
-
 	down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
-	/* Now recheck reliably when holding dqptr_sem */
 	if (IS_NOQUOTA(inode)) {	/* File without quota accounting? */
 		up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
-		goto put_all;
+		return 0;
 	}
 	spin_lock(&dq_data_lock);
 	cur_space = inode_get_bytes(inode);
@@ -1706,9 +1725,11 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
 		if (!transfer_to[cnt])
 			continue;
 		transfer_from[cnt] = inode->i_dquot[cnt];
-		if (check_idq(transfer_to[cnt], 1, warntype_to + cnt) ==
-		    NO_QUOTA || check_bdq(transfer_to[cnt], space, 0,
-		    warntype_to + cnt) == NO_QUOTA)
+		ret = check_idq(transfer_to[cnt], 1, warntype_to + cnt);
+		if (ret)
+			goto over_quota;
+		ret = check_bdq(transfer_to[cnt], space, 0, warntype_to + cnt);
+		if (ret)
 			goto over_quota;
 	}
 
@@ -1745,39 +1766,43 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
 
 	mark_all_dquot_dirty(transfer_from);
 	mark_all_dquot_dirty(transfer_to);
-	/* The reference we got is transferred to the inode */
+	/* Pass back references to put */
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++)
-		transfer_to[cnt] = NULL;
-warn_put_all:
+		transfer_to[cnt] = transfer_from[cnt];
+warn:
 	flush_warnings(transfer_to, warntype_to);
 	flush_warnings(transfer_from, warntype_from_inodes);
 	flush_warnings(transfer_from, warntype_from_space);
-put_all:
-	dqput_all(transfer_from);
-	dqput_all(transfer_to);
 	return ret;
 over_quota:
 	spin_unlock(&dq_data_lock);
 	up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
-	/* Clear dquot pointers we don't want to dqput() */
-	for (cnt = 0; cnt < MAXQUOTAS; cnt++)
-		transfer_from[cnt] = NULL;
-	ret = NO_QUOTA;
-	goto warn_put_all;
+	goto warn;
 }
-EXPORT_SYMBOL(dquot_transfer);
+EXPORT_SYMBOL(__dquot_transfer);
 
-/* Wrapper for transferring ownership of an inode */
-int vfs_dq_transfer(struct inode *inode, struct iattr *iattr)
+/* Wrapper for transferring ownership of an inode for uid/gid only
+ * Called from FSXXX_setattr()
+ */
+int dquot_transfer(struct inode *inode, struct iattr *iattr)
 {
-	if (sb_any_quota_active(inode->i_sb) && !IS_NOQUOTA(inode)) {
-		vfs_dq_init(inode);
-		if (inode->i_sb->dq_op->transfer(inode, iattr) == NO_QUOTA)
-			return 1;
-	}
-	return 0;
+	struct dquot *transfer_to[MAXQUOTAS] = {};
+	struct super_block *sb = inode->i_sb;
+	int ret;
+
+	if (!sb_any_quota_active(sb) || IS_NOQUOTA(inode))
+		return 0;
+
+	if (iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid)
+		transfer_to[USRQUOTA] = dqget(sb, iattr->ia_uid, USRQUOTA);
+	if (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)
+		transfer_to[GRPQUOTA] = dqget(sb, iattr->ia_gid, GRPQUOTA);
+
+	ret = __dquot_transfer(inode, transfer_to);
+	dqput_all(transfer_to);
+	return ret;
 }
-EXPORT_SYMBOL(vfs_dq_transfer);
+EXPORT_SYMBOL(dquot_transfer);
 
 /*
  * Write info of quota file to disk
@@ -1798,13 +1823,6 @@ EXPORT_SYMBOL(dquot_commit_info);
  * Definitions of diskquota operations.
  */
 const struct dquot_operations dquot_operations = {
-	.initialize	= dquot_initialize,
-	.drop		= dquot_drop,
-	.alloc_space	= dquot_alloc_space,
-	.alloc_inode	= dquot_alloc_inode,
-	.free_space	= dquot_free_space,
-	.free_inode	= dquot_free_inode,
-	.transfer	= dquot_transfer,
 	.write_dquot	= dquot_commit,
 	.acquire_dquot	= dquot_acquire,
 	.release_dquot	= dquot_release,
@@ -1813,11 +1831,26 @@ const struct dquot_operations dquot_operations = {
 	.alloc_dquot	= dquot_alloc,
 	.destroy_dquot	= dquot_destroy,
 };
+EXPORT_SYMBOL(dquot_operations);
+
+/*
+ * Generic helper for ->open on filesystems supporting disk quotas.
+ */
+int dquot_file_open(struct inode *inode, struct file *file)
+{
+	int error;
+
+	error = generic_file_open(inode, file);
+	if (!error && (file->f_mode & FMODE_WRITE))
+		dquot_initialize(inode);
+	return error;
+}
+EXPORT_SYMBOL(dquot_file_open);
 
 /*
  * Turn quota off on a device. type == -1 ==> quotaoff for all types (umount)
  */
-int vfs_quota_disable(struct super_block *sb, int type, unsigned int flags)
+int dquot_disable(struct super_block *sb, int type, unsigned int flags)
 {
 	int cnt, ret = 0;
 	struct quota_info *dqopt = sb_dqopt(sb);
@@ -1947,14 +1980,15 @@ put_inodes:
 		}
 	return ret;
 }
-EXPORT_SYMBOL(vfs_quota_disable);
+EXPORT_SYMBOL(dquot_disable);
 
-int vfs_quota_off(struct super_block *sb, int type, int remount)
+int dquot_quota_off(struct super_block *sb, int type)
 {
-	return vfs_quota_disable(sb, type, remount ? DQUOT_SUSPENDED :
-				 (DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED));
+	return dquot_disable(sb, type,
+			     DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
 }
-EXPORT_SYMBOL(vfs_quota_off);
+EXPORT_SYMBOL(dquot_quota_off);
+
 /*
  *	Turn quotas on on a device
  */
@@ -1993,11 +2027,13 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
 	}
 
 	if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) {
-		/* As we bypass the pagecache we must now flush the inode so
-		 * that we see all the changes from userspace... */
-		write_inode_now(inode, 1);
-		/* And now flush the block cache so that kernel sees the
-		 * changes */
+		/* As we bypass the pagecache we must now flush all the
+		 * dirty data and invalidate caches so that kernel sees
+		 * changes from userspace. It is not enough to just flush
+		 * the quota file since if blocksize < pagesize, invalidation
+		 * of the cache could fail because of other unrelated dirty
+		 * data */
+		sync_filesystem(sb);
 		invalidate_bdev(sb->s_bdev);
 	}
 	mutex_lock(&dqopt->dqonoff_mutex);
@@ -2010,14 +2046,16 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
 		/* We don't want quota and atime on quota files (deadlocks
 		 * possible) Also nobody should write to the file - we use
 		 * special IO operations which ignore the immutable bit. */
-		down_write(&dqopt->dqptr_sem);
 		mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
 		oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE |
 					     S_NOQUOTA);
 		inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE;
 		mutex_unlock(&inode->i_mutex);
-		up_write(&dqopt->dqptr_sem);
-		sb->dq_op->drop(inode);
+		/*
+		 * When S_NOQUOTA is set, remove dquot references as no more
+		 * references can be added
+		 */
+		__dquot_drop(inode);
 	}
 
 	error = -EIO;
@@ -2053,14 +2091,12 @@ out_file_init:
 	iput(inode);
 out_lock:
 	if (oldflags != -1) {
-		down_write(&dqopt->dqptr_sem);
 		mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
 		/* Set the flags back (in the case of accidental quotaon()
 		 * on a wrong file we don't want to mess up the flags) */
 		inode->i_flags &= ~(S_NOATIME | S_NOQUOTA | S_IMMUTABLE);
 		inode->i_flags |= oldflags;
 		mutex_unlock(&inode->i_mutex);
-		up_write(&dqopt->dqptr_sem);
 	}
 	mutex_unlock(&dqopt->dqonoff_mutex);
 out_fmt:
@@ -2070,36 +2106,43 @@ out_fmt:
 }
 
 /* Reenable quotas on remount RW */
-static int vfs_quota_on_remount(struct super_block *sb, int type)
+int dquot_resume(struct super_block *sb, int type)
 {
 	struct quota_info *dqopt = sb_dqopt(sb);
 	struct inode *inode;
-	int ret;
+	int ret = 0, cnt;
 	unsigned int flags;
 
-	mutex_lock(&dqopt->dqonoff_mutex);
-	if (!sb_has_quota_suspended(sb, type)) {
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+		if (type != -1 && cnt != type)
+			continue;
+
+		mutex_lock(&dqopt->dqonoff_mutex);
+		if (!sb_has_quota_suspended(sb, cnt)) {
+			mutex_unlock(&dqopt->dqonoff_mutex);
+			continue;
+		}
+		inode = dqopt->files[cnt];
+		dqopt->files[cnt] = NULL;
+		spin_lock(&dq_state_lock);
+		flags = dqopt->flags & dquot_state_flag(DQUOT_USAGE_ENABLED |
+							DQUOT_LIMITS_ENABLED,
+							cnt);
+		dqopt->flags &= ~dquot_state_flag(DQUOT_STATE_FLAGS, cnt);
+		spin_unlock(&dq_state_lock);
 		mutex_unlock(&dqopt->dqonoff_mutex);
-		return 0;
-	}
-	inode = dqopt->files[type];
-	dqopt->files[type] = NULL;
-	spin_lock(&dq_state_lock);
-	flags = dqopt->flags & dquot_state_flag(DQUOT_USAGE_ENABLED |
-						DQUOT_LIMITS_ENABLED, type);
-	dqopt->flags &= ~dquot_state_flag(DQUOT_STATE_FLAGS, type);
-	spin_unlock(&dq_state_lock);
-	mutex_unlock(&dqopt->dqonoff_mutex);
 
-	flags = dquot_generic_flag(flags, type);
-	ret = vfs_load_quota_inode(inode, type, dqopt->info[type].dqi_fmt_id,
-				   flags);
-	iput(inode);
+		flags = dquot_generic_flag(flags, cnt);
+		ret = vfs_load_quota_inode(inode, cnt,
+				dqopt->info[cnt].dqi_fmt_id, flags);
+		iput(inode);
+	}
 
 	return ret;
 }
+EXPORT_SYMBOL(dquot_resume);
 
-int vfs_quota_on_path(struct super_block *sb, int type, int format_id,
+int dquot_quota_on_path(struct super_block *sb, int type, int format_id,
 		      struct path *path)
 {
 	int error = security_quota_on(path->dentry);
@@ -2114,40 +2157,36 @@ int vfs_quota_on_path(struct super_block *sb, int type, int format_id,
 					     DQUOT_LIMITS_ENABLED);
 	return error;
 }
-EXPORT_SYMBOL(vfs_quota_on_path);
+EXPORT_SYMBOL(dquot_quota_on_path);
 
-int vfs_quota_on(struct super_block *sb, int type, int format_id, char *name,
-		 int remount)
+int dquot_quota_on(struct super_block *sb, int type, int format_id, char *name)
 {
 	struct path path;
 	int error;
 
-	if (remount)
-		return vfs_quota_on_remount(sb, type);
-
 	error = kern_path(name, LOOKUP_FOLLOW, &path);
 	if (!error) {
-		error = vfs_quota_on_path(sb, type, format_id, &path);
+		error = dquot_quota_on_path(sb, type, format_id, &path);
 		path_put(&path);
 	}
 	return error;
 }
-EXPORT_SYMBOL(vfs_quota_on);
+EXPORT_SYMBOL(dquot_quota_on);
 
 /*
  * More powerful function for turning on quotas allowing setting
  * of individual quota flags
  */
-int vfs_quota_enable(struct inode *inode, int type, int format_id,
-		unsigned int flags)
+int dquot_enable(struct inode *inode, int type, int format_id,
+		 unsigned int flags)
 {
 	int ret = 0;
 	struct super_block *sb = inode->i_sb;
 	struct quota_info *dqopt = sb_dqopt(sb);
 
 	/* Just unsuspend quotas? */
-	if (flags & DQUOT_SUSPENDED)
-		return vfs_quota_on_remount(sb, type);
+	BUG_ON(flags & DQUOT_SUSPENDED);
+
 	if (!flags)
 		return 0;
 	/* Just updating flags needed? */
@@ -2179,13 +2218,13 @@ out_lock:
 load_quota:
 	return vfs_load_quota_inode(inode, type, format_id, flags);
 }
-EXPORT_SYMBOL(vfs_quota_enable);
+EXPORT_SYMBOL(dquot_enable);
 
 /*
  * This function is used when filesystem needs to initialize quotas
  * during mount time.
  */
-int vfs_quota_on_mount(struct super_block *sb, char *qf_name,
+int dquot_quota_on_mount(struct super_block *sb, char *qf_name,
 		int format_id, int type)
 {
 	struct dentry *dentry;
@@ -2211,24 +2250,7 @@ out:
 	dput(dentry);
 	return error;
 }
-EXPORT_SYMBOL(vfs_quota_on_mount);
-
-/* Wrapper to turn on quotas when remounting rw */
-int vfs_dq_quota_on_remount(struct super_block *sb)
-{
-	int cnt;
-	int ret = 0, err;
-
-	if (!sb->s_qcop || !sb->s_qcop->quota_on)
-		return -ENOSYS;
-	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
-		err = sb->s_qcop->quota_on(sb, cnt, 0, NULL, 1);
-		if (err < 0 && !ret)
-			ret = err;
-	}
-	return ret;
-}
-EXPORT_SYMBOL(vfs_dq_quota_on_remount);
+EXPORT_SYMBOL(dquot_quota_on_mount);
 
 static inline qsize_t qbtos(qsize_t blocks)
 {
@@ -2241,25 +2263,30 @@ static inline qsize_t stoqb(qsize_t space)
 }
 
 /* Generic routine for getting common part of quota structure */
-static void do_get_dqblk(struct dquot *dquot, struct if_dqblk *di)
+static void do_get_dqblk(struct dquot *dquot, struct fs_disk_quota *di)
 {
 	struct mem_dqblk *dm = &dquot->dq_dqb;
 
+	memset(di, 0, sizeof(*di));
+	di->d_version = FS_DQUOT_VERSION;
+	di->d_flags = dquot->dq_type == USRQUOTA ?
+			XFS_USER_QUOTA : XFS_GROUP_QUOTA;
+	di->d_id = dquot->dq_id;
+
 	spin_lock(&dq_data_lock);
-	di->dqb_bhardlimit = stoqb(dm->dqb_bhardlimit);
-	di->dqb_bsoftlimit = stoqb(dm->dqb_bsoftlimit);
-	di->dqb_curspace = dm->dqb_curspace + dm->dqb_rsvspace;
-	di->dqb_ihardlimit = dm->dqb_ihardlimit;
-	di->dqb_isoftlimit = dm->dqb_isoftlimit;
-	di->dqb_curinodes = dm->dqb_curinodes;
-	di->dqb_btime = dm->dqb_btime;
-	di->dqb_itime = dm->dqb_itime;
-	di->dqb_valid = QIF_ALL;
+	di->d_blk_hardlimit = stoqb(dm->dqb_bhardlimit);
+	di->d_blk_softlimit = stoqb(dm->dqb_bsoftlimit);
+	di->d_ino_hardlimit = dm->dqb_ihardlimit;
+	di->d_ino_softlimit = dm->dqb_isoftlimit;
+	di->d_bcount = dm->dqb_curspace + dm->dqb_rsvspace;
+	di->d_icount = dm->dqb_curinodes;
+	di->d_btimer = dm->dqb_btime;
+	di->d_itimer = dm->dqb_itime;
 	spin_unlock(&dq_data_lock);
 }
 
-int vfs_get_dqblk(struct super_block *sb, int type, qid_t id,
-		  struct if_dqblk *di)
+int dquot_get_dqblk(struct super_block *sb, int type, qid_t id,
+		    struct fs_disk_quota *di)
 {
 	struct dquot *dquot;
 
@@ -2271,55 +2298,74 @@ int vfs_get_dqblk(struct super_block *sb, int type, qid_t id,
 
 	return 0;
 }
-EXPORT_SYMBOL(vfs_get_dqblk);
+EXPORT_SYMBOL(dquot_get_dqblk);
+
+#define VFS_FS_DQ_MASK \
+	(FS_DQ_BCOUNT | FS_DQ_BSOFT | FS_DQ_BHARD | \
+	 FS_DQ_ICOUNT | FS_DQ_ISOFT | FS_DQ_IHARD | \
+	 FS_DQ_BTIMER | FS_DQ_ITIMER)
 
 /* Generic routine for setting common part of quota structure */
-static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di)
+static int do_set_dqblk(struct dquot *dquot, struct fs_disk_quota *di)
 {
 	struct mem_dqblk *dm = &dquot->dq_dqb;
 	int check_blim = 0, check_ilim = 0;
 	struct mem_dqinfo *dqi = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_type];
 
-	if ((di->dqb_valid & QIF_BLIMITS &&
-	     (di->dqb_bhardlimit > dqi->dqi_maxblimit ||
-	      di->dqb_bsoftlimit > dqi->dqi_maxblimit)) ||
-	    (di->dqb_valid & QIF_ILIMITS &&
-	     (di->dqb_ihardlimit > dqi->dqi_maxilimit ||
-	      di->dqb_isoftlimit > dqi->dqi_maxilimit)))
+	if (di->d_fieldmask & ~VFS_FS_DQ_MASK)
+		return -EINVAL;
+
+	if (((di->d_fieldmask & FS_DQ_BSOFT) &&
+	     (di->d_blk_softlimit > dqi->dqi_maxblimit)) ||
+	    ((di->d_fieldmask & FS_DQ_BHARD) &&
+	     (di->d_blk_hardlimit > dqi->dqi_maxblimit)) ||
+	    ((di->d_fieldmask & FS_DQ_ISOFT) &&
+	     (di->d_ino_softlimit > dqi->dqi_maxilimit)) ||
+	    ((di->d_fieldmask & FS_DQ_IHARD) &&
+	     (di->d_ino_hardlimit > dqi->dqi_maxilimit)))
 		return -ERANGE;
 
 	spin_lock(&dq_data_lock);
-	if (di->dqb_valid & QIF_SPACE) {
-		dm->dqb_curspace = di->dqb_curspace - dm->dqb_rsvspace;
+	if (di->d_fieldmask & FS_DQ_BCOUNT) {
+		dm->dqb_curspace = di->d_bcount - dm->dqb_rsvspace;
 		check_blim = 1;
-		__set_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags);
+		set_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags);
 	}
-	if (di->dqb_valid & QIF_BLIMITS) {
-		dm->dqb_bsoftlimit = qbtos(di->dqb_bsoftlimit);
-		dm->dqb_bhardlimit = qbtos(di->dqb_bhardlimit);
+
+	if (di->d_fieldmask & FS_DQ_BSOFT)
+		dm->dqb_bsoftlimit = qbtos(di->d_blk_softlimit);
+	if (di->d_fieldmask & FS_DQ_BHARD)
+		dm->dqb_bhardlimit = qbtos(di->d_blk_hardlimit);
+	if (di->d_fieldmask & (FS_DQ_BSOFT | FS_DQ_BHARD)) {
 		check_blim = 1;
-		__set_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags);
+		set_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags);
 	}
-	if (di->dqb_valid & QIF_INODES) {
-		dm->dqb_curinodes = di->dqb_curinodes;
+
+	if (di->d_fieldmask & FS_DQ_ICOUNT) {
+		dm->dqb_curinodes = di->d_icount;
 		check_ilim = 1;
-		__set_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags);
+		set_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags);
 	}
-	if (di->dqb_valid & QIF_ILIMITS) {
-		dm->dqb_isoftlimit = di->dqb_isoftlimit;
-		dm->dqb_ihardlimit = di->dqb_ihardlimit;
+
+	if (di->d_fieldmask & FS_DQ_ISOFT)
+		dm->dqb_isoftlimit = di->d_ino_softlimit;
+	if (di->d_fieldmask & FS_DQ_IHARD)
+		dm->dqb_ihardlimit = di->d_ino_hardlimit;
+	if (di->d_fieldmask & (FS_DQ_ISOFT | FS_DQ_IHARD)) {
 		check_ilim = 1;
-		__set_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags);
+		set_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags);
 	}
-	if (di->dqb_valid & QIF_BTIME) {
-		dm->dqb_btime = di->dqb_btime;
+
+	if (di->d_fieldmask & FS_DQ_BTIMER) {
+		dm->dqb_btime = di->d_btimer;
 		check_blim = 1;
-		__set_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags);
+		set_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags);
 	}
-	if (di->dqb_valid & QIF_ITIME) {
-		dm->dqb_itime = di->dqb_itime;
+
+	if (di->d_fieldmask & FS_DQ_ITIMER) {
+		dm->dqb_itime = di->d_itimer;
 		check_ilim = 1;
-		__set_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags);
+		set_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags);
 	}
 
 	if (check_blim) {
@@ -2327,7 +2373,7 @@ static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di)
 		    dm->dqb_curspace < dm->dqb_bsoftlimit) {
 			dm->dqb_btime = 0;
 			clear_bit(DQ_BLKS_B, &dquot->dq_flags);
-		} else if (!(di->dqb_valid & QIF_BTIME))
+		} else if (!(di->d_fieldmask & FS_DQ_BTIMER))
 			/* Set grace only if user hasn't provided his own... */
 			dm->dqb_btime = get_seconds() + dqi->dqi_bgrace;
 	}
@@ -2336,7 +2382,7 @@ static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di)
 		    dm->dqb_curinodes < dm->dqb_isoftlimit) {
 			dm->dqb_itime = 0;
 			clear_bit(DQ_INODES_B, &dquot->dq_flags);
-		} else if (!(di->dqb_valid & QIF_ITIME))
+		} else if (!(di->d_fieldmask & FS_DQ_ITIMER))
 			/* Set grace only if user hasn't provided his own... */
 			dm->dqb_itime = get_seconds() + dqi->dqi_igrace;
 	}
@@ -2351,8 +2397,8 @@ static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di)
 	return 0;
 }
 
-int vfs_set_dqblk(struct super_block *sb, int type, qid_t id,
-		  struct if_dqblk *di)
+int dquot_set_dqblk(struct super_block *sb, int type, qid_t id,
+		  struct fs_disk_quota *di)
 {
 	struct dquot *dquot;
 	int rc;
@@ -2367,10 +2413,10 @@ int vfs_set_dqblk(struct super_block *sb, int type, qid_t id,
 out:
 	return rc;
 }
-EXPORT_SYMBOL(vfs_set_dqblk);
+EXPORT_SYMBOL(dquot_set_dqblk);
 
 /* Generic routine for getting common part of quota file information */
-int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
+int dquot_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
 {
 	struct mem_dqinfo *mi;
   
@@ -2389,10 +2435,10 @@ int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
 	mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
 	return 0;
 }
-EXPORT_SYMBOL(vfs_get_dqinfo);
+EXPORT_SYMBOL(dquot_get_dqinfo);
 
 /* Generic routine for setting common part of quota file information */
-int vfs_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
+int dquot_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
 {
 	struct mem_dqinfo *mi;
 	int err = 0;
@@ -2419,74 +2465,86 @@ out:
 	mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
 	return err;
 }
-EXPORT_SYMBOL(vfs_set_dqinfo);
+EXPORT_SYMBOL(dquot_set_dqinfo);
 
-const struct quotactl_ops vfs_quotactl_ops = {
-	.quota_on	= vfs_quota_on,
-	.quota_off	= vfs_quota_off,
-	.quota_sync	= vfs_quota_sync,
-	.get_info	= vfs_get_dqinfo,
-	.set_info	= vfs_set_dqinfo,
-	.get_dqblk	= vfs_get_dqblk,
-	.set_dqblk	= vfs_set_dqblk
+const struct quotactl_ops dquot_quotactl_ops = {
+	.quota_on	= dquot_quota_on,
+	.quota_off	= dquot_quota_off,
+	.quota_sync	= dquot_quota_sync,
+	.get_info	= dquot_get_dqinfo,
+	.set_info	= dquot_set_dqinfo,
+	.get_dqblk	= dquot_get_dqblk,
+	.set_dqblk	= dquot_set_dqblk
 };
+EXPORT_SYMBOL(dquot_quotactl_ops);
+
+static int do_proc_dqstats(struct ctl_table *table, int write,
+		     void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	unsigned int type = (int *)table->data - dqstats.stat;
+
+	/* Update global table */
+	dqstats.stat[type] =
+			percpu_counter_sum_positive(&dqstats.counter[type]);
+	return proc_dointvec(table, write, buffer, lenp, ppos);
+}
 
 static ctl_table fs_dqstats_table[] = {
 	{
 		.procname	= "lookups",
-		.data		= &dqstats.lookups,
+		.data		= &dqstats.stat[DQST_LOOKUPS],
 		.maxlen		= sizeof(int),
 		.mode		= 0444,
-		.proc_handler	= proc_dointvec,
+		.proc_handler	= do_proc_dqstats,
 	},
 	{
 		.procname	= "drops",
-		.data		= &dqstats.drops,
+		.data		= &dqstats.stat[DQST_DROPS],
 		.maxlen		= sizeof(int),
 		.mode		= 0444,
-		.proc_handler	= proc_dointvec,
+		.proc_handler	= do_proc_dqstats,
 	},
 	{
 		.procname	= "reads",
-		.data		= &dqstats.reads,
+		.data		= &dqstats.stat[DQST_READS],
 		.maxlen		= sizeof(int),
 		.mode		= 0444,
-		.proc_handler	= proc_dointvec,
+		.proc_handler	= do_proc_dqstats,
 	},
 	{
 		.procname	= "writes",
-		.data		= &dqstats.writes,
+		.data		= &dqstats.stat[DQST_WRITES],
 		.maxlen		= sizeof(int),
 		.mode		= 0444,
-		.proc_handler	= proc_dointvec,
+		.proc_handler	= do_proc_dqstats,
 	},
 	{
 		.procname	= "cache_hits",
-		.data		= &dqstats.cache_hits,
+		.data		= &dqstats.stat[DQST_CACHE_HITS],
 		.maxlen		= sizeof(int),
 		.mode		= 0444,
-		.proc_handler	= proc_dointvec,
+		.proc_handler	= do_proc_dqstats,
 	},
 	{
 		.procname	= "allocated_dquots",
-		.data		= &dqstats.allocated_dquots,
+		.data		= &dqstats.stat[DQST_ALLOC_DQUOTS],
 		.maxlen		= sizeof(int),
 		.mode		= 0444,
-		.proc_handler	= proc_dointvec,
+		.proc_handler	= do_proc_dqstats,
 	},
 	{
 		.procname	= "free_dquots",
-		.data		= &dqstats.free_dquots,
+		.data		= &dqstats.stat[DQST_FREE_DQUOTS],
 		.maxlen		= sizeof(int),
 		.mode		= 0444,
-		.proc_handler	= proc_dointvec,
+		.proc_handler	= do_proc_dqstats,
 	},
 	{
 		.procname	= "syncs",
-		.data		= &dqstats.syncs,
+		.data		= &dqstats.stat[DQST_SYNCS],
 		.maxlen		= sizeof(int),
 		.mode		= 0444,
-		.proc_handler	= proc_dointvec,
+		.proc_handler	= do_proc_dqstats,
 	},
 #ifdef CONFIG_PRINT_QUOTA_WARNING
 	{
@@ -2520,7 +2578,7 @@ static ctl_table sys_table[] = {
 
 static int __init dquot_init(void)
 {
-	int i;
+	int i, ret;
 	unsigned long nr_hash, order;
 
 	printk(KERN_NOTICE "VFS: Disk quotas %s\n", __DQUOT_VERSION__);
@@ -2538,6 +2596,12 @@ static int __init dquot_init(void)
 	if (!dquot_hash)
 		panic("Cannot create dquot hash table");
 
+	for (i = 0; i < _DQST_DQSTAT_LAST; i++) {
+		ret = percpu_counter_init(&dqstats.counter[i], 0);
+		if (ret)
+			panic("Cannot create dquot stat counters");
+	}
+
 	/* Find power-of-two hlist_heads which can fit into allocation */
 	nr_hash = (1UL << order) * PAGE_SIZE / sizeof(struct hlist_head);
 	dq_hash_bits = 0;
diff --git a/fs/quota/netlink.c b/fs/quota/netlink.c
new file mode 100644
index 000000000000..d67908b407d9
--- /dev/null
+++ b/fs/quota/netlink.c
@@ -0,0 +1,96 @@
+
+#include <linux/cred.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/quotaops.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <net/netlink.h>
+#include <net/genetlink.h>
+
+/* Netlink family structure for quota */
+static struct genl_family quota_genl_family = {
+	.id = GENL_ID_GENERATE,
+	.hdrsize = 0,
+	.name = "VFS_DQUOT",
+	.version = 1,
+	.maxattr = QUOTA_NL_A_MAX,
+};
+
+/**
+ * quota_send_warning - Send warning to userspace about exceeded quota
+ * @type: The quota type: USRQQUOTA, GRPQUOTA,...
+ * @id: The user or group id of the quota that was exceeded
+ * @dev: The device on which the fs is mounted (sb->s_dev)
+ * @warntype: The type of the warning: QUOTA_NL_...
+ *
+ * This can be used by filesystems (including those which don't use
+ * dquot) to send a message to userspace relating to quota limits.
+ *
+ */
+
+void quota_send_warning(short type, unsigned int id, dev_t dev,
+			const char warntype)
+{
+	static atomic_t seq;
+	struct sk_buff *skb;
+	void *msg_head;
+	int ret;
+	int msg_size = 4 * nla_total_size(sizeof(u32)) +
+		       2 * nla_total_size(sizeof(u64));
+
+	/* We have to allocate using GFP_NOFS as we are called from a
+	 * filesystem performing write and thus further recursion into
+	 * the fs to free some data could cause deadlocks. */
+	skb = genlmsg_new(msg_size, GFP_NOFS);
+	if (!skb) {
+		printk(KERN_ERR
+		  "VFS: Not enough memory to send quota warning.\n");
+		return;
+	}
+	msg_head = genlmsg_put(skb, 0, atomic_add_return(1, &seq),
+			&quota_genl_family, 0, QUOTA_NL_C_WARNING);
+	if (!msg_head) {
+		printk(KERN_ERR
+		  "VFS: Cannot store netlink header in quota warning.\n");
+		goto err_out;
+	}
+	ret = nla_put_u32(skb, QUOTA_NL_A_QTYPE, type);
+	if (ret)
+		goto attr_err_out;
+	ret = nla_put_u64(skb, QUOTA_NL_A_EXCESS_ID, id);
+	if (ret)
+		goto attr_err_out;
+	ret = nla_put_u32(skb, QUOTA_NL_A_WARNING, warntype);
+	if (ret)
+		goto attr_err_out;
+	ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MAJOR, MAJOR(dev));
+	if (ret)
+		goto attr_err_out;
+	ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MINOR, MINOR(dev));
+	if (ret)
+		goto attr_err_out;
+	ret = nla_put_u64(skb, QUOTA_NL_A_CAUSED_ID, current_uid());
+	if (ret)
+		goto attr_err_out;
+	genlmsg_end(skb, msg_head);
+
+	genlmsg_multicast(skb, 0, quota_genl_family.id, GFP_NOFS);
+	return;
+attr_err_out:
+	printk(KERN_ERR "VFS: Not enough space to compose quota message!\n");
+err_out:
+	kfree_skb(skb);
+}
+EXPORT_SYMBOL(quota_send_warning);
+
+static int __init quota_init(void)
+{
+	if (genl_register_family(&quota_genl_family) != 0)
+		printk(KERN_ERR
+		       "VFS: Failed to create quota netlink interface.\n");
+	return 0;
+};
+
+module_init(quota_init);
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index ee91e2756950..b299961e1edb 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -10,7 +10,6 @@
 #include <linux/slab.h>
 #include <asm/current.h>
 #include <asm/uaccess.h>
-#include <linux/compat.h>
 #include <linux/kernel.h>
 #include <linux/security.h>
 #include <linux/syscalls.h>
@@ -18,220 +17,226 @@
 #include <linux/capability.h>
 #include <linux/quotaops.h>
 #include <linux/types.h>
-#include <net/netlink.h>
-#include <net/genetlink.h>
+#include <linux/writeback.h>
 
-/* Check validity of generic quotactl commands */
-static int generic_quotactl_valid(struct super_block *sb, int type, int cmd,
-				  qid_t id)
+static int check_quotactl_permission(struct super_block *sb, int type, int cmd,
+				     qid_t id)
 {
-	if (type >= MAXQUOTAS)
-		return -EINVAL;
-	if (!sb && cmd != Q_SYNC)
-		return -ENODEV;
-	/* Is operation supported? */
-	if (sb && !sb->s_qcop)
-		return -ENOSYS;
-
 	switch (cmd) {
-		case Q_GETFMT:
-			break;
-		case Q_QUOTAON:
-			if (!sb->s_qcop->quota_on)
-				return -ENOSYS;
-			break;
-		case Q_QUOTAOFF:
-			if (!sb->s_qcop->quota_off)
-				return -ENOSYS;
-			break;
-		case Q_SETINFO:
-			if (!sb->s_qcop->set_info)
-				return -ENOSYS;
-			break;
-		case Q_GETINFO:
-			if (!sb->s_qcop->get_info)
-				return -ENOSYS;
-			break;
-		case Q_SETQUOTA:
-			if (!sb->s_qcop->set_dqblk)
-				return -ENOSYS;
-			break;
-		case Q_GETQUOTA:
-			if (!sb->s_qcop->get_dqblk)
-				return -ENOSYS;
-			break;
-		case Q_SYNC:
-			if (sb && !sb->s_qcop->quota_sync)
-				return -ENOSYS;
+	/* these commands do not require any special privilegues */
+	case Q_GETFMT:
+	case Q_SYNC:
+	case Q_GETINFO:
+	case Q_XGETQSTAT:
+	case Q_XQUOTASYNC:
+		break;
+	/* allow to query information for dquots we "own" */
+	case Q_GETQUOTA:
+	case Q_XGETQUOTA:
+		if ((type == USRQUOTA && current_euid() == id) ||
+		    (type == GRPQUOTA && in_egroup_p(id)))
 			break;
-		default:
-			return -EINVAL;
+		/*FALLTHROUGH*/
+	default:
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
 	}
 
-	/* Is quota turned on for commands which need it? */
-	switch (cmd) {
-		case Q_GETFMT:
-		case Q_GETINFO:
-		case Q_SETINFO:
-		case Q_SETQUOTA:
-		case Q_GETQUOTA:
-			/* This is just an informative test so we are satisfied
-			 * without the lock */
-			if (!sb_has_quota_active(sb, type))
-				return -ESRCH;
-	}
+	return security_quotactl(cmd, type, id, sb);
+}
 
-	/* Check privileges */
-	if (cmd == Q_GETQUOTA) {
-		if (((type == USRQUOTA && current_euid() != id) ||
-		     (type == GRPQUOTA && !in_egroup_p(id))) &&
-		    !capable(CAP_SYS_ADMIN))
-			return -EPERM;
-	}
-	else if (cmd != Q_GETFMT && cmd != Q_SYNC && cmd != Q_GETINFO)
-		if (!capable(CAP_SYS_ADMIN))
-			return -EPERM;
+static void quota_sync_one(struct super_block *sb, void *arg)
+{
+	if (sb->s_qcop && sb->s_qcop->quota_sync)
+		sb->s_qcop->quota_sync(sb, *(int *)arg, 1);
+}
+
+static int quota_sync_all(int type)
+{
+	int ret;
 
+	if (type >= MAXQUOTAS)
+		return -EINVAL;
+	ret = security_quotactl(Q_SYNC, type, 0, NULL);
+	if (!ret)
+		iterate_supers(quota_sync_one, &type);
+	return ret;
+}
+
+static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id,
+		         void __user *addr)
+{
+	char *pathname;
+	int ret = -ENOSYS;
+
+	pathname = getname(addr);
+	if (IS_ERR(pathname))
+		return PTR_ERR(pathname);
+	if (sb->s_qcop->quota_on)
+		ret = sb->s_qcop->quota_on(sb, type, id, pathname);
+	putname(pathname);
+	return ret;
+}
+
+static int quota_getfmt(struct super_block *sb, int type, void __user *addr)
+{
+	__u32 fmt;
+
+	down_read(&sb_dqopt(sb)->dqptr_sem);
+	if (!sb_has_quota_active(sb, type)) {
+		up_read(&sb_dqopt(sb)->dqptr_sem);
+		return -ESRCH;
+	}
+	fmt = sb_dqopt(sb)->info[type].dqi_format->qf_fmt_id;
+	up_read(&sb_dqopt(sb)->dqptr_sem);
+	if (copy_to_user(addr, &fmt, sizeof(fmt)))
+		return -EFAULT;
 	return 0;
 }
 
-/* Check validity of XFS Quota Manager commands */
-static int xqm_quotactl_valid(struct super_block *sb, int type, int cmd,
-			      qid_t id)
+static int quota_getinfo(struct super_block *sb, int type, void __user *addr)
 {
-	if (type >= XQM_MAXQUOTAS)
-		return -EINVAL;
-	if (!sb)
-		return -ENODEV;
-	if (!sb->s_qcop)
+	struct if_dqinfo info;
+	int ret;
+
+	if (!sb->s_qcop->get_info)
 		return -ENOSYS;
+	ret = sb->s_qcop->get_info(sb, type, &info);
+	if (!ret && copy_to_user(addr, &info, sizeof(info)))
+		return -EFAULT;
+	return ret;
+}
 
-	switch (cmd) {
-		case Q_XQUOTAON:
-		case Q_XQUOTAOFF:
-		case Q_XQUOTARM:
-			if (!sb->s_qcop->set_xstate)
-				return -ENOSYS;
-			break;
-		case Q_XGETQSTAT:
-			if (!sb->s_qcop->get_xstate)
-				return -ENOSYS;
-			break;
-		case Q_XSETQLIM:
-			if (!sb->s_qcop->set_xquota)
-				return -ENOSYS;
-			break;
-		case Q_XGETQUOTA:
-			if (!sb->s_qcop->get_xquota)
-				return -ENOSYS;
-			break;
-		case Q_XQUOTASYNC:
-			if (!sb->s_qcop->quota_sync)
-				return -ENOSYS;
-			break;
-		default:
-			return -EINVAL;
-	}
+static int quota_setinfo(struct super_block *sb, int type, void __user *addr)
+{
+	struct if_dqinfo info;
 
-	/* Check privileges */
-	if (cmd == Q_XGETQUOTA) {
-		if (((type == XQM_USRQUOTA && current_euid() != id) ||
-		     (type == XQM_GRPQUOTA && !in_egroup_p(id))) &&
-		     !capable(CAP_SYS_ADMIN))
-			return -EPERM;
-	} else if (cmd != Q_XGETQSTAT && cmd != Q_XQUOTASYNC) {
-		if (!capable(CAP_SYS_ADMIN))
-			return -EPERM;
-	}
+	if (copy_from_user(&info, addr, sizeof(info)))
+		return -EFAULT;
+	if (!sb->s_qcop->set_info)
+		return -ENOSYS;
+	return sb->s_qcop->set_info(sb, type, &info);
+}
+
+static void copy_to_if_dqblk(struct if_dqblk *dst, struct fs_disk_quota *src)
+{
+	dst->dqb_bhardlimit = src->d_blk_hardlimit;
+	dst->dqb_bsoftlimit = src->d_blk_softlimit;
+	dst->dqb_curspace = src->d_bcount;
+	dst->dqb_ihardlimit = src->d_ino_hardlimit;
+	dst->dqb_isoftlimit = src->d_ino_softlimit;
+	dst->dqb_curinodes = src->d_icount;
+	dst->dqb_btime = src->d_btimer;
+	dst->dqb_itime = src->d_itimer;
+	dst->dqb_valid = QIF_ALL;
+}
+
+static int quota_getquota(struct super_block *sb, int type, qid_t id,
+			  void __user *addr)
+{
+	struct fs_disk_quota fdq;
+	struct if_dqblk idq;
+	int ret;
 
+	if (!sb->s_qcop->get_dqblk)
+		return -ENOSYS;
+	ret = sb->s_qcop->get_dqblk(sb, type, id, &fdq);
+	if (ret)
+		return ret;
+	copy_to_if_dqblk(&idq, &fdq);
+	if (copy_to_user(addr, &idq, sizeof(idq)))
+		return -EFAULT;
 	return 0;
 }
 
-static int check_quotactl_valid(struct super_block *sb, int type, int cmd,
-				qid_t id)
+static void copy_from_if_dqblk(struct fs_disk_quota *dst, struct if_dqblk *src)
 {
-	int error;
-
-	if (XQM_COMMAND(cmd))
-		error = xqm_quotactl_valid(sb, type, cmd, id);
-	else
-		error = generic_quotactl_valid(sb, type, cmd, id);
-	if (!error)
-		error = security_quotactl(cmd, type, id, sb);
-	return error;
+	dst->d_blk_hardlimit = src->dqb_bhardlimit;
+	dst->d_blk_softlimit  = src->dqb_bsoftlimit;
+	dst->d_bcount = src->dqb_curspace;
+	dst->d_ino_hardlimit = src->dqb_ihardlimit;
+	dst->d_ino_softlimit = src->dqb_isoftlimit;
+	dst->d_icount = src->dqb_curinodes;
+	dst->d_btimer = src->dqb_btime;
+	dst->d_itimer = src->dqb_itime;
+
+	dst->d_fieldmask = 0;
+	if (src->dqb_valid & QIF_BLIMITS)
+		dst->d_fieldmask |= FS_DQ_BSOFT | FS_DQ_BHARD;
+	if (src->dqb_valid & QIF_SPACE)
+		dst->d_fieldmask |= FS_DQ_BCOUNT;
+	if (src->dqb_valid & QIF_ILIMITS)
+		dst->d_fieldmask |= FS_DQ_ISOFT | FS_DQ_IHARD;
+	if (src->dqb_valid & QIF_INODES)
+		dst->d_fieldmask |= FS_DQ_ICOUNT;
+	if (src->dqb_valid & QIF_BTIME)
+		dst->d_fieldmask |= FS_DQ_BTIMER;
+	if (src->dqb_valid & QIF_ITIME)
+		dst->d_fieldmask |= FS_DQ_ITIMER;
 }
 
-#ifdef CONFIG_QUOTA
-void sync_quota_sb(struct super_block *sb, int type)
+static int quota_setquota(struct super_block *sb, int type, qid_t id,
+			  void __user *addr)
 {
-	int cnt;
+	struct fs_disk_quota fdq;
+	struct if_dqblk idq;
 
-	if (!sb->s_qcop->quota_sync)
-		return;
+	if (copy_from_user(&idq, addr, sizeof(idq)))
+		return -EFAULT;
+	if (!sb->s_qcop->set_dqblk)
+		return -ENOSYS;
+	copy_from_if_dqblk(&fdq, &idq);
+	return sb->s_qcop->set_dqblk(sb, type, id, &fdq);
+}
 
-	sb->s_qcop->quota_sync(sb, type);
+static int quota_setxstate(struct super_block *sb, int cmd, void __user *addr)
+{
+	__u32 flags;
 
-	if (sb_dqopt(sb)->flags & DQUOT_QUOTA_SYS_FILE)
-		return;
-	/* This is not very clever (and fast) but currently I don't know about
-	 * any other simple way of getting quota data to disk and we must get
-	 * them there for userspace to be visible... */
-	if (sb->s_op->sync_fs)
-		sb->s_op->sync_fs(sb, 1);
-	sync_blockdev(sb->s_bdev);
+	if (copy_from_user(&flags, addr, sizeof(flags)))
+		return -EFAULT;
+	if (!sb->s_qcop->set_xstate)
+		return -ENOSYS;
+	return sb->s_qcop->set_xstate(sb, flags, cmd);
+}
 
-	/*
-	 * Now when everything is written we can discard the pagecache so
-	 * that userspace sees the changes.
-	 */
-	mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
-	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
-		if (type != -1 && cnt != type)
-			continue;
-		if (!sb_has_quota_active(sb, cnt))
-			continue;
-		mutex_lock_nested(&sb_dqopt(sb)->files[cnt]->i_mutex,
-				  I_MUTEX_QUOTA);
-		truncate_inode_pages(&sb_dqopt(sb)->files[cnt]->i_data, 0);
-		mutex_unlock(&sb_dqopt(sb)->files[cnt]->i_mutex);
-	}
-	mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
+static int quota_getxstate(struct super_block *sb, void __user *addr)
+{
+	struct fs_quota_stat fqs;
+	int ret;
+
+	if (!sb->s_qcop->get_xstate)
+		return -ENOSYS;
+	ret = sb->s_qcop->get_xstate(sb, &fqs);
+	if (!ret && copy_to_user(addr, &fqs, sizeof(fqs)))
+		return -EFAULT;
+	return ret;
 }
-#endif
 
-static void sync_dquots(int type)
+static int quota_setxquota(struct super_block *sb, int type, qid_t id,
+			   void __user *addr)
 {
-	struct super_block *sb;
-	int cnt;
-
-	spin_lock(&sb_lock);
-restart:
-	list_for_each_entry(sb, &super_blocks, s_list) {
-		/* This test just improves performance so it needn't be
-		 * reliable... */
-		for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
-			if (type != -1 && type != cnt)
-				continue;
-			if (!sb_has_quota_active(sb, cnt))
-				continue;
-			if (!info_dirty(&sb_dqopt(sb)->info[cnt]) &&
-			   list_empty(&sb_dqopt(sb)->info[cnt].dqi_dirty_list))
-				continue;
-			break;
-		}
-		if (cnt == MAXQUOTAS)
-			continue;
-		sb->s_count++;
-		spin_unlock(&sb_lock);
-		down_read(&sb->s_umount);
-		if (sb->s_root)
-			sync_quota_sb(sb, type);
-		up_read(&sb->s_umount);
-		spin_lock(&sb_lock);
-		if (__put_super_and_need_restart(sb))
-			goto restart;
-	}
-	spin_unlock(&sb_lock);
+	struct fs_disk_quota fdq;
+
+	if (copy_from_user(&fdq, addr, sizeof(fdq)))
+		return -EFAULT;
+	if (!sb->s_qcop->set_dqblk)
+		return -ENOSYS;
+	return sb->s_qcop->set_dqblk(sb, type, id, &fdq);
+}
+
+static int quota_getxquota(struct super_block *sb, int type, qid_t id,
+			   void __user *addr)
+{
+	struct fs_disk_quota fdq;
+	int ret;
+
+	if (!sb->s_qcop->get_dqblk)
+		return -ENOSYS;
+	ret = sb->s_qcop->get_dqblk(sb, type, id, &fdq);
+	if (!ret && copy_to_user(addr, &fdq, sizeof(fdq)))
+		return -EFAULT;
+	return ret;
 }
 
 /* Copy parameters and call proper function */
@@ -240,117 +245,55 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
 {
 	int ret;
 
+	if (type >= (XQM_COMMAND(cmd) ? XQM_MAXQUOTAS : MAXQUOTAS))
+		return -EINVAL;
+	if (!sb->s_qcop)
+		return -ENOSYS;
+
+	ret = check_quotactl_permission(sb, type, cmd, id);
+	if (ret < 0)
+		return ret;
+
 	switch (cmd) {
-		case Q_QUOTAON: {
-			char *pathname;
-
-			pathname = getname(addr);
-			if (IS_ERR(pathname))
-				return PTR_ERR(pathname);
-			ret = sb->s_qcop->quota_on(sb, type, id, pathname, 0);
-			putname(pathname);
-			return ret;
-		}
-		case Q_QUOTAOFF:
-			return sb->s_qcop->quota_off(sb, type, 0);
-
-		case Q_GETFMT: {
-			__u32 fmt;
-
-			down_read(&sb_dqopt(sb)->dqptr_sem);
-			if (!sb_has_quota_active(sb, type)) {
-				up_read(&sb_dqopt(sb)->dqptr_sem);
-				return -ESRCH;
-			}
-			fmt = sb_dqopt(sb)->info[type].dqi_format->qf_fmt_id;
-			up_read(&sb_dqopt(sb)->dqptr_sem);
-			if (copy_to_user(addr, &fmt, sizeof(fmt)))
-				return -EFAULT;
-			return 0;
-		}
-		case Q_GETINFO: {
-			struct if_dqinfo info;
-
-			ret = sb->s_qcop->get_info(sb, type, &info);
-			if (ret)
-				return ret;
-			if (copy_to_user(addr, &info, sizeof(info)))
-				return -EFAULT;
-			return 0;
-		}
-		case Q_SETINFO: {
-			struct if_dqinfo info;
-
-			if (copy_from_user(&info, addr, sizeof(info)))
-				return -EFAULT;
-			return sb->s_qcop->set_info(sb, type, &info);
-		}
-		case Q_GETQUOTA: {
-			struct if_dqblk idq;
-
-			ret = sb->s_qcop->get_dqblk(sb, type, id, &idq);
-			if (ret)
-				return ret;
-			if (copy_to_user(addr, &idq, sizeof(idq)))
-				return -EFAULT;
-			return 0;
-		}
-		case Q_SETQUOTA: {
-			struct if_dqblk idq;
-
-			if (copy_from_user(&idq, addr, sizeof(idq)))
-				return -EFAULT;
-			return sb->s_qcop->set_dqblk(sb, type, id, &idq);
-		}
-		case Q_SYNC:
-			if (sb)
-				sync_quota_sb(sb, type);
-			else
-				sync_dquots(type);
-			return 0;
-
-		case Q_XQUOTAON:
-		case Q_XQUOTAOFF:
-		case Q_XQUOTARM: {
-			__u32 flags;
-
-			if (copy_from_user(&flags, addr, sizeof(flags)))
-				return -EFAULT;
-			return sb->s_qcop->set_xstate(sb, flags, cmd);
-		}
-		case Q_XGETQSTAT: {
-			struct fs_quota_stat fqs;
-		
-			if ((ret = sb->s_qcop->get_xstate(sb, &fqs)))
-				return ret;
-			if (copy_to_user(addr, &fqs, sizeof(fqs)))
-				return -EFAULT;
-			return 0;
-		}
-		case Q_XSETQLIM: {
-			struct fs_disk_quota fdq;
-
-			if (copy_from_user(&fdq, addr, sizeof(fdq)))
-				return -EFAULT;
-		       return sb->s_qcop->set_xquota(sb, type, id, &fdq);
-		}
-		case Q_XGETQUOTA: {
-			struct fs_disk_quota fdq;
-
-			ret = sb->s_qcop->get_xquota(sb, type, id, &fdq);
-			if (ret)
-				return ret;
-			if (copy_to_user(addr, &fdq, sizeof(fdq)))
-				return -EFAULT;
-			return 0;
-		}
-		case Q_XQUOTASYNC:
-			return sb->s_qcop->quota_sync(sb, type);
-		/* We never reach here unless validity check is broken */
-		default:
-			BUG();
+	case Q_QUOTAON:
+		return quota_quotaon(sb, type, cmd, id, addr);
+	case Q_QUOTAOFF:
+		if (!sb->s_qcop->quota_off)
+			return -ENOSYS;
+		return sb->s_qcop->quota_off(sb, type);
+	case Q_GETFMT:
+		return quota_getfmt(sb, type, addr);
+	case Q_GETINFO:
+		return quota_getinfo(sb, type, addr);
+	case Q_SETINFO:
+		return quota_setinfo(sb, type, addr);
+	case Q_GETQUOTA:
+		return quota_getquota(sb, type, id, addr);
+	case Q_SETQUOTA:
+		return quota_setquota(sb, type, id, addr);
+	case Q_SYNC:
+		if (!sb->s_qcop->quota_sync)
+			return -ENOSYS;
+		return sb->s_qcop->quota_sync(sb, type, 1);
+	case Q_XQUOTAON:
+	case Q_XQUOTAOFF:
+	case Q_XQUOTARM:
+		return quota_setxstate(sb, cmd, addr);
+	case Q_XGETQSTAT:
+		return quota_getxstate(sb, addr);
+	case Q_XSETQLIM:
+		return quota_setxquota(sb, type, id, addr);
+	case Q_XGETQUOTA:
+		return quota_getxquota(sb, type, id, addr);
+	case Q_XQUOTASYNC:
+		/* caller already holds s_umount */
+		if (sb->s_flags & MS_RDONLY)
+			return -EROFS;
+		writeback_inodes_sb(sb);
+		return 0;
+	default:
+		return -EINVAL;
 	}
-	return 0;
 }
 
 /*
@@ -397,224 +340,23 @@ SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special,
 	cmds = cmd >> SUBCMDSHIFT;
 	type = cmd & SUBCMDMASK;
 
-	if (cmds != Q_SYNC || special) {
-		sb = quotactl_block(special);
-		if (IS_ERR(sb))
-			return PTR_ERR(sb);
+	/*
+	 * As a special case Q_SYNC can be called without a specific device.
+	 * It will iterate all superblocks that have quota enabled and call
+	 * the sync action on each of them.
+	 */
+	if (!special) {
+		if (cmds == Q_SYNC)
+			return quota_sync_all(type);
+		return -ENODEV;
 	}
 
-	ret = check_quotactl_valid(sb, type, cmds, id);
-	if (ret >= 0)
-		ret = do_quotactl(sb, type, cmds, id, addr);
-	if (sb)
-		drop_super(sb);
-
-	return ret;
-}
-
-#if defined(CONFIG_COMPAT_FOR_U64_ALIGNMENT)
-/*
- * This code works only for 32 bit quota tools over 64 bit OS (x86_64, ia64)
- * and is necessary due to alignment problems.
- */
-struct compat_if_dqblk {
-	compat_u64 dqb_bhardlimit;
-	compat_u64 dqb_bsoftlimit;
-	compat_u64 dqb_curspace;
-	compat_u64 dqb_ihardlimit;
-	compat_u64 dqb_isoftlimit;
-	compat_u64 dqb_curinodes;
-	compat_u64 dqb_btime;
-	compat_u64 dqb_itime;
-	compat_uint_t dqb_valid;
-};
-
-/* XFS structures */
-struct compat_fs_qfilestat {
-	compat_u64 dqb_bhardlimit;
-	compat_u64 qfs_nblks;
-	compat_uint_t qfs_nextents;
-};
-
-struct compat_fs_quota_stat {
-	__s8		qs_version;
-	__u16		qs_flags;
-	__s8		qs_pad;
-	struct compat_fs_qfilestat	qs_uquota;
-	struct compat_fs_qfilestat	qs_gquota;
-	compat_uint_t	qs_incoredqs;
-	compat_int_t	qs_btimelimit;
-	compat_int_t	qs_itimelimit;
-	compat_int_t	qs_rtbtimelimit;
-	__u16		qs_bwarnlimit;
-	__u16		qs_iwarnlimit;
-};
-
-asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special,
-						qid_t id, void __user *addr)
-{
-	unsigned int cmds;
-	struct if_dqblk __user *dqblk;
-	struct compat_if_dqblk __user *compat_dqblk;
-	struct fs_quota_stat __user *fsqstat;
-	struct compat_fs_quota_stat __user *compat_fsqstat;
-	compat_uint_t data;
-	u16 xdata;
-	long ret;
+	sb = quotactl_block(special);
+	if (IS_ERR(sb))
+		return PTR_ERR(sb);
 
-	cmds = cmd >> SUBCMDSHIFT;
+	ret = do_quotactl(sb, type, cmds, id, addr);
 
-	switch (cmds) {
-	case Q_GETQUOTA:
-		dqblk = compat_alloc_user_space(sizeof(struct if_dqblk));
-		compat_dqblk = addr;
-		ret = sys_quotactl(cmd, special, id, dqblk);
-		if (ret)
-			break;
-		if (copy_in_user(compat_dqblk, dqblk, sizeof(*compat_dqblk)) ||
-			get_user(data, &dqblk->dqb_valid) ||
-			put_user(data, &compat_dqblk->dqb_valid))
-			ret = -EFAULT;
-		break;
-	case Q_SETQUOTA:
-		dqblk = compat_alloc_user_space(sizeof(struct if_dqblk));
-		compat_dqblk = addr;
-		ret = -EFAULT;
-		if (copy_in_user(dqblk, compat_dqblk, sizeof(*compat_dqblk)) ||
-			get_user(data, &compat_dqblk->dqb_valid) ||
-			put_user(data, &dqblk->dqb_valid))
-			break;
-		ret = sys_quotactl(cmd, special, id, dqblk);
-		break;
-	case Q_XGETQSTAT:
-		fsqstat = compat_alloc_user_space(sizeof(struct fs_quota_stat));
-		compat_fsqstat = addr;
-		ret = sys_quotactl(cmd, special, id, fsqstat);
-		if (ret)
-			break;
-		ret = -EFAULT;
-		/* Copying qs_version, qs_flags, qs_pad */
-		if (copy_in_user(compat_fsqstat, fsqstat,
-			offsetof(struct compat_fs_quota_stat, qs_uquota)))
-			break;
-		/* Copying qs_uquota */
-		if (copy_in_user(&compat_fsqstat->qs_uquota,
-			&fsqstat->qs_uquota,
-			sizeof(compat_fsqstat->qs_uquota)) ||
-			get_user(data, &fsqstat->qs_uquota.qfs_nextents) ||
-			put_user(data, &compat_fsqstat->qs_uquota.qfs_nextents))
-			break;
-		/* Copying qs_gquota */
-		if (copy_in_user(&compat_fsqstat->qs_gquota,
-			&fsqstat->qs_gquota,
-			sizeof(compat_fsqstat->qs_gquota)) ||
-			get_user(data, &fsqstat->qs_gquota.qfs_nextents) ||
-			put_user(data, &compat_fsqstat->qs_gquota.qfs_nextents))
-			break;
-		/* Copying the rest */
-		if (copy_in_user(&compat_fsqstat->qs_incoredqs,
-			&fsqstat->qs_incoredqs,
-			sizeof(struct compat_fs_quota_stat) -
-			offsetof(struct compat_fs_quota_stat, qs_incoredqs)) ||
-			get_user(xdata, &fsqstat->qs_iwarnlimit) ||
-			put_user(xdata, &compat_fsqstat->qs_iwarnlimit))
-			break;
-		ret = 0;
-		break;
-	default:
-		ret = sys_quotactl(cmd, special, id, addr);
-	}
+	drop_super(sb);
 	return ret;
 }
-#endif
-
-
-#ifdef CONFIG_QUOTA_NETLINK_INTERFACE
-
-/* Netlink family structure for quota */
-static struct genl_family quota_genl_family = {
-	.id = GENL_ID_GENERATE,
-	.hdrsize = 0,
-	.name = "VFS_DQUOT",
-	.version = 1,
-	.maxattr = QUOTA_NL_A_MAX,
-};
-
-/**
- * quota_send_warning - Send warning to userspace about exceeded quota
- * @type: The quota type: USRQQUOTA, GRPQUOTA,...
- * @id: The user or group id of the quota that was exceeded
- * @dev: The device on which the fs is mounted (sb->s_dev)
- * @warntype: The type of the warning: QUOTA_NL_...
- *
- * This can be used by filesystems (including those which don't use
- * dquot) to send a message to userspace relating to quota limits.
- *
- */
-
-void quota_send_warning(short type, unsigned int id, dev_t dev,
-			const char warntype)
-{
-	static atomic_t seq;
-	struct sk_buff *skb;
-	void *msg_head;
-	int ret;
-	int msg_size = 4 * nla_total_size(sizeof(u32)) +
-		       2 * nla_total_size(sizeof(u64));
-
-	/* We have to allocate using GFP_NOFS as we are called from a
-	 * filesystem performing write and thus further recursion into
-	 * the fs to free some data could cause deadlocks. */
-	skb = genlmsg_new(msg_size, GFP_NOFS);
-	if (!skb) {
-		printk(KERN_ERR
-		  "VFS: Not enough memory to send quota warning.\n");
-		return;
-	}
-	msg_head = genlmsg_put(skb, 0, atomic_add_return(1, &seq),
-			&quota_genl_family, 0, QUOTA_NL_C_WARNING);
-	if (!msg_head) {
-		printk(KERN_ERR
-		  "VFS: Cannot store netlink header in quota warning.\n");
-		goto err_out;
-	}
-	ret = nla_put_u32(skb, QUOTA_NL_A_QTYPE, type);
-	if (ret)
-		goto attr_err_out;
-	ret = nla_put_u64(skb, QUOTA_NL_A_EXCESS_ID, id);
-	if (ret)
-		goto attr_err_out;
-	ret = nla_put_u32(skb, QUOTA_NL_A_WARNING, warntype);
-	if (ret)
-		goto attr_err_out;
-	ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MAJOR, MAJOR(dev));
-	if (ret)
-		goto attr_err_out;
-	ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MINOR, MINOR(dev));
-	if (ret)
-		goto attr_err_out;
-	ret = nla_put_u64(skb, QUOTA_NL_A_CAUSED_ID, current_uid());
-	if (ret)
-		goto attr_err_out;
-	genlmsg_end(skb, msg_head);
-
-	genlmsg_multicast(skb, 0, quota_genl_family.id, GFP_NOFS);
-	return;
-attr_err_out:
-	printk(KERN_ERR "VFS: Not enough space to compose quota message!\n");
-err_out:
-	kfree_skb(skb);
-}
-EXPORT_SYMBOL(quota_send_warning);
-
-static int __init quota_init(void)
-{
-	if (genl_register_family(&quota_genl_family) != 0)
-		printk(KERN_ERR
-		       "VFS: Failed to create quota netlink interface.\n");
-	return 0;
-};
-
-module_init(quota_init);
-#endif
-
diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c
index f81f4bcfb178..24f03407eeb5 100644
--- a/fs/quota/quota_tree.c
+++ b/fs/quota/quota_tree.c
@@ -60,9 +60,17 @@ static ssize_t read_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf)
 static ssize_t write_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf)
 {
 	struct super_block *sb = info->dqi_sb;
+	ssize_t ret;
 
-	return sb->s_op->quota_write(sb, info->dqi_type, buf,
+	ret = sb->s_op->quota_write(sb, info->dqi_type, buf,
 	       info->dqi_usable_bs, blk << info->dqi_blocksize_bits);
+	if (ret != info->dqi_usable_bs) {
+		q_warn(KERN_WARNING "VFS: dquota write failed on "
+			"dev %s\n", sb->s_id);
+		if (ret >= 0)
+			ret = -EIO;
+	}
+	return ret;
 }
 
 /* Remove empty block from list and return it */
@@ -152,7 +160,7 @@ static int remove_free_dqentry(struct qtree_mem_dqinfo *info, char *buf,
 	dh->dqdh_next_free = dh->dqdh_prev_free = cpu_to_le32(0);
 	/* No matter whether write succeeds block is out of list */
 	if (write_blk(info, blk, buf) < 0)
-		printk(KERN_ERR
+		q_warn(KERN_ERR
 		       "VFS: Can't write block (%u) with free entries.\n",
 		       blk);
 	return 0;
@@ -244,7 +252,7 @@ static uint find_free_dqentry(struct qtree_mem_dqinfo *info,
 	if (le16_to_cpu(dh->dqdh_entries) + 1 >= qtree_dqstr_in_blk(info)) {
 		*err = remove_free_dqentry(info, buf, blk);
 		if (*err < 0) {
-			printk(KERN_ERR "VFS: find_free_dqentry(): Can't "
+			q_warn(KERN_ERR "VFS: find_free_dqentry(): Can't "
 			       "remove block (%u) from entry free list.\n",
 			       blk);
 			goto out_buf;
@@ -268,7 +276,7 @@ static uint find_free_dqentry(struct qtree_mem_dqinfo *info,
 #endif
 	*err = write_blk(info, blk, buf);
 	if (*err < 0) {
-		printk(KERN_ERR "VFS: find_free_dqentry(): Can't write quota "
+		q_warn(KERN_ERR "VFS: find_free_dqentry(): Can't write quota "
 				"data block %u.\n", blk);
 		goto out_buf;
 	}
@@ -303,7 +311,7 @@ static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
 	} else {
 		ret = read_blk(info, *treeblk, buf);
 		if (ret < 0) {
-			printk(KERN_ERR "VFS: Can't read tree quota block "
+			q_warn(KERN_ERR "VFS: Can't read tree quota block "
 					"%u.\n", *treeblk);
 			goto out_buf;
 		}
@@ -365,7 +373,7 @@ int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
 	if (!dquot->dq_off) {
 		ret = dq_insert_tree(info, dquot);
 		if (ret < 0) {
-			printk(KERN_ERR "VFS: Error %zd occurred while "
+			q_warn(KERN_ERR "VFS: Error %zd occurred while "
 					"creating quota.\n", ret);
 			kfree(ddquot);
 			return ret;
@@ -377,14 +385,14 @@ int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
 	ret = sb->s_op->quota_write(sb, type, ddquot, info->dqi_entry_size,
 				    dquot->dq_off);
 	if (ret != info->dqi_entry_size) {
-		printk(KERN_WARNING "VFS: dquota write failed on dev %s\n",
+		q_warn(KERN_WARNING "VFS: dquota write failed on dev %s\n",
 		       sb->s_id);
 		if (ret >= 0)
 			ret = -ENOSPC;
 	} else {
 		ret = 0;
 	}
-	dqstats.writes++;
+	dqstats_inc(DQST_WRITES);
 	kfree(ddquot);
 
 	return ret;
@@ -402,14 +410,14 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot,
 	if (!buf)
 		return -ENOMEM;
 	if (dquot->dq_off >> info->dqi_blocksize_bits != blk) {
-		printk(KERN_ERR "VFS: Quota structure has offset to other "
+		q_warn(KERN_ERR "VFS: Quota structure has offset to other "
 		  "block (%u) than it should (%u).\n", blk,
 		  (uint)(dquot->dq_off >> info->dqi_blocksize_bits));
 		goto out_buf;
 	}
 	ret = read_blk(info, blk, buf);
 	if (ret < 0) {
-		printk(KERN_ERR "VFS: Can't read quota data block %u\n", blk);
+		q_warn(KERN_ERR "VFS: Can't read quota data block %u\n", blk);
 		goto out_buf;
 	}
 	dh = (struct qt_disk_dqdbheader *)buf;
@@ -419,7 +427,7 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot,
 		if (ret >= 0)
 			ret = put_free_dqblk(info, buf, blk);
 		if (ret < 0) {
-			printk(KERN_ERR "VFS: Can't move quota data block (%u) "
+			q_warn(KERN_ERR "VFS: Can't move quota data block (%u) "
 			  "to free list.\n", blk);
 			goto out_buf;
 		}
@@ -432,14 +440,14 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot,
 			/* Insert will write block itself */
 			ret = insert_free_dqentry(info, buf, blk);
 			if (ret < 0) {
-				printk(KERN_ERR "VFS: Can't insert quota data "
+				q_warn(KERN_ERR "VFS: Can't insert quota data "
 				       "block (%u) to free entry list.\n", blk);
 				goto out_buf;
 			}
 		} else {
 			ret = write_blk(info, blk, buf);
 			if (ret < 0) {
-				printk(KERN_ERR "VFS: Can't write quota data "
+				q_warn(KERN_ERR "VFS: Can't write quota data "
 				  "block %u\n", blk);
 				goto out_buf;
 			}
@@ -464,7 +472,7 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
 		return -ENOMEM;
 	ret = read_blk(info, *blk, buf);
 	if (ret < 0) {
-		printk(KERN_ERR "VFS: Can't read quota data block %u\n", *blk);
+		q_warn(KERN_ERR "VFS: Can't read quota data block %u\n", *blk);
 		goto out_buf;
 	}
 	newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
@@ -488,7 +496,7 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
 		} else {
 			ret = write_blk(info, *blk, buf);
 			if (ret < 0)
-				printk(KERN_ERR "VFS: Can't write quota tree "
+				q_warn(KERN_ERR "VFS: Can't write quota tree "
 				  "block %u.\n", *blk);
 		}
 	}
@@ -521,7 +529,7 @@ static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info,
 		return -ENOMEM;
 	ret = read_blk(info, blk, buf);
 	if (ret < 0) {
-		printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk);
+		q_warn(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk);
 		goto out_buf;
 	}
 	ddquot = buf + sizeof(struct qt_disk_dqdbheader);
@@ -531,7 +539,7 @@ static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info,
 		ddquot += info->dqi_entry_size;
 	}
 	if (i == qtree_dqstr_in_blk(info)) {
-		printk(KERN_ERR "VFS: Quota for id %u referenced "
+		q_warn(KERN_ERR "VFS: Quota for id %u referenced "
 		  "but not present.\n", dquot->dq_id);
 		ret = -EIO;
 		goto out_buf;
@@ -556,7 +564,7 @@ static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info,
 		return -ENOMEM;
 	ret = read_blk(info, blk, buf);
 	if (ret < 0) {
-		printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk);
+		q_warn(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk);
 		goto out_buf;
 	}
 	ret = 0;
@@ -599,7 +607,7 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
 		offset = find_dqentry(info, dquot);
 		if (offset <= 0) {	/* Entry not present? */
 			if (offset < 0)
-				printk(KERN_ERR "VFS: Can't read quota "
+				q_warn(KERN_ERR "VFS: Can't read quota "
 				  "structure for id %u.\n", dquot->dq_id);
 			dquot->dq_off = 0;
 			set_bit(DQ_FAKE_B, &dquot->dq_flags);
@@ -617,7 +625,7 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
 	if (ret != info->dqi_entry_size) {
 		if (ret >= 0)
 			ret = -EIO;
-		printk(KERN_ERR "VFS: Error while reading quota "
+		q_warn(KERN_ERR "VFS: Error while reading quota "
 				"structure for id %u.\n", dquot->dq_id);
 		set_bit(DQ_FAKE_B, &dquot->dq_flags);
 		memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk));
@@ -634,7 +642,7 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
 	spin_unlock(&dq_data_lock);
 	kfree(ddquot);
 out:
-	dqstats.reads++;
+	dqstats_inc(DQST_READS);
 	return ret;
 }
 EXPORT_SYMBOL(qtree_read_dquot);
diff --git a/fs/quota/quota_tree.h b/fs/quota/quota_tree.h
index a1ab8db81a51..ccc3e71fb1d8 100644
--- a/fs/quota/quota_tree.h
+++ b/fs/quota/quota_tree.h
@@ -22,4 +22,10 @@ struct qt_disk_dqdbheader {
 
 #define QT_TREEOFF	1		/* Offset of tree in file in blocks */
 
+#define q_warn(fmt, args...) \
+do { \
+	if (printk_ratelimit()) \
+		printk(fmt, ## args); \
+} while(0)
+
 #endif /* _LINUX_QUOTAIO_TREE_H */
diff --git a/fs/quota/quota_v1.c b/fs/quota/quota_v1.c
index 2ae757e9c008..4af344c5852a 100644
--- a/fs/quota/quota_v1.c
+++ b/fs/quota/quota_v1.c
@@ -71,7 +71,7 @@ static int v1_read_dqblk(struct dquot *dquot)
 	    dquot->dq_dqb.dqb_ihardlimit == 0 &&
 	    dquot->dq_dqb.dqb_isoftlimit == 0)
 		set_bit(DQ_FAKE_B, &dquot->dq_flags);
-	dqstats.reads++;
+	dqstats_inc(DQST_READS);
 
 	return 0;
 }
@@ -104,7 +104,7 @@ static int v1_commit_dqblk(struct dquot *dquot)
 	ret = 0;
 
 out:
-	dqstats.writes++;
+	dqstats_inc(DQST_WRITES);
 
 	return ret;
 }
diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c
index e3da02f4986f..135206af1458 100644
--- a/fs/quota/quota_v2.c
+++ b/fs/quota/quota_v2.c
@@ -63,7 +63,7 @@ static int v2_read_header(struct super_block *sb, int type,
 	size = sb->s_op->quota_read(sb, type, (char *)dqhead,
 				    sizeof(struct v2_disk_dqheader), 0);
 	if (size != sizeof(struct v2_disk_dqheader)) {
-		printk(KERN_WARNING "quota_v2: Failed header read:"
+		q_warn(KERN_WARNING "quota_v2: Failed header read:"
 		       " expected=%zd got=%zd\n",
 			sizeof(struct v2_disk_dqheader), size);
 		return 0;
@@ -106,7 +106,7 @@ static int v2_read_file_info(struct super_block *sb, int type)
 	size = sb->s_op->quota_read(sb, type, (char *)&dinfo,
 	       sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF);
 	if (size != sizeof(struct v2_disk_dqinfo)) {
-		printk(KERN_WARNING "quota_v2: Can't read info structure on device %s.\n",
+		q_warn(KERN_WARNING "quota_v2: Can't read info structure on device %s.\n",
 			sb->s_id);
 		return -1;
 	}
@@ -167,7 +167,7 @@ static int v2_write_file_info(struct super_block *sb, int type)
 	size = sb->s_op->quota_write(sb, type, (char *)&dinfo,
 	       sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF);
 	if (size != sizeof(struct v2_disk_dqinfo)) {
-		printk(KERN_WARNING "Can't write info structure on device %s.\n",
+		q_warn(KERN_WARNING "Can't write info structure on device %s.\n",
 			sb->s_id);
 		return -1;
 	}
diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c
index 78f613cb9c76..4884ac5ae9be 100644
--- a/fs/ramfs/file-mmu.c
+++ b/fs/ramfs/file-mmu.c
@@ -43,12 +43,13 @@ const struct file_operations ramfs_file_operations = {
 	.write		= do_sync_write,
 	.aio_write	= generic_file_aio_write,
 	.mmap		= generic_file_mmap,
-	.fsync		= simple_sync_file,
+	.fsync		= noop_fsync,
 	.splice_read	= generic_file_splice_read,
 	.splice_write	= generic_file_splice_write,
 	.llseek		= generic_file_llseek,
 };
 
 const struct inode_operations ramfs_file_inode_operations = {
+	.setattr	= simple_setattr,
 	.getattr	= simple_getattr,
 };
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 1739a4aba25f..d532c20fc179 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -21,6 +21,7 @@
 #include <linux/pagevec.h>
 #include <linux/mman.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 
 #include <asm/uaccess.h>
 #include "internal.h"
@@ -41,7 +42,7 @@ const struct file_operations ramfs_file_operations = {
 	.aio_read		= generic_file_aio_read,
 	.write			= do_sync_write,
 	.aio_write		= generic_file_aio_write,
-	.fsync			= simple_sync_file,
+	.fsync			= noop_fsync,
 	.splice_read		= generic_file_splice_read,
 	.splice_write		= generic_file_splice_write,
 	.llseek			= generic_file_llseek,
@@ -145,7 +146,7 @@ static int ramfs_nommu_resize(struct inode *inode, loff_t newsize, loff_t size)
 			return ret;
 	}
 
-	ret = vmtruncate(inode, newsize);
+	ret = simple_setsize(inode, newsize);
 
 	return ret;
 }
@@ -168,7 +169,8 @@ static int ramfs_nommu_setattr(struct dentry *dentry, struct iattr *ia)
 
 	/* pick out size-changing events */
 	if (ia->ia_valid & ATTR_SIZE) {
-		loff_t size = i_size_read(inode);
+		loff_t size = inode->i_size;
+
 		if (ia->ia_size != size) {
 			ret = ramfs_nommu_resize(inode, ia->ia_size, size);
 			if (ret < 0 || ia->ia_valid == ATTR_SIZE)
@@ -181,7 +183,7 @@ static int ramfs_nommu_setattr(struct dentry *dentry, struct iattr *ia)
 		}
 	}
 
-	ret = inode_setattr(inode, ia);
+	generic_setattr(inode, ia);
  out:
 	ia->ia_valid = old_ia_valid;
 	return ret;
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index a6090aa1a7c1..a5ebae70dc6d 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -35,6 +35,7 @@
 #include <linux/sched.h>
 #include <linux/parser.h>
 #include <linux/magic.h>
+#include <linux/slab.h>
 #include <asm/uaccess.h>
 #include "internal.h"
 
@@ -51,14 +52,13 @@ static struct backing_dev_info ramfs_backing_dev_info = {
 			  BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP,
 };
 
-struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev)
+struct inode *ramfs_get_inode(struct super_block *sb,
+				const struct inode *dir, int mode, dev_t dev)
 {
 	struct inode * inode = new_inode(sb);
 
 	if (inode) {
-		inode->i_mode = mode;
-		inode->i_uid = current_fsuid();
-		inode->i_gid = current_fsgid();
+		inode_init_owner(inode, dir, mode);
 		inode->i_mapping->a_ops = &ramfs_aops;
 		inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info;
 		mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
@@ -94,15 +94,10 @@ struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev)
 static int
 ramfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
 {
-	struct inode * inode = ramfs_get_inode(dir->i_sb, mode, dev);
+	struct inode * inode = ramfs_get_inode(dir->i_sb, dir, mode, dev);
 	int error = -ENOSPC;
 
 	if (inode) {
-		if (dir->i_mode & S_ISGID) {
-			inode->i_gid = dir->i_gid;
-			if (S_ISDIR(mode))
-				inode->i_mode |= S_ISGID;
-		}
 		d_instantiate(dentry, inode);
 		dget(dentry);	/* Extra count - pin the dentry in core */
 		error = 0;
@@ -129,13 +124,11 @@ static int ramfs_symlink(struct inode * dir, struct dentry *dentry, const char *
 	struct inode *inode;
 	int error = -ENOSPC;
 
-	inode = ramfs_get_inode(dir->i_sb, S_IFLNK|S_IRWXUGO, 0);
+	inode = ramfs_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0);
 	if (inode) {
 		int l = strlen(symname)+1;
 		error = page_symlink(inode, symname, l);
 		if (!error) {
-			if (dir->i_mode & S_ISGID)
-				inode->i_gid = dir->i_gid;
 			d_instantiate(dentry, inode);
 			dget(dentry);
 			dir->i_mtime = dir->i_ctime = CURRENT_TIME;
@@ -213,7 +206,7 @@ static int ramfs_parse_options(char *data, struct ramfs_mount_opts *opts)
 	return 0;
 }
 
-static int ramfs_fill_super(struct super_block * sb, void * data, int silent)
+int ramfs_fill_super(struct super_block *sb, void *data, int silent)
 {
 	struct ramfs_fs_info *fsi;
 	struct inode *inode = NULL;
@@ -240,7 +233,7 @@ static int ramfs_fill_super(struct super_block * sb, void * data, int silent)
 	sb->s_op		= &ramfs_ops;
 	sb->s_time_gran		= 1;
 
-	inode = ramfs_get_inode(sb, S_IFDIR | fsi->mount_opts.mode, 0);
+	inode = ramfs_get_inode(sb, NULL, S_IFDIR | fsi->mount_opts.mode, 0);
 	if (!inode) {
 		err = -ENOMEM;
 		goto fail;
diff --git a/fs/read_write.c b/fs/read_write.c
index b7f4a1f94d48..9c0485236e68 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -97,6 +97,23 @@ loff_t generic_file_llseek(struct file *file, loff_t offset, int origin)
 }
 EXPORT_SYMBOL(generic_file_llseek);
 
+/**
+ * noop_llseek - No Operation Performed llseek implementation
+ * @file:	file structure to seek on
+ * @offset:	file offset to seek to
+ * @origin:	type of seek
+ *
+ * This is an implementation of ->llseek useable for the rare special case when
+ * userspace expects the seek to succeed but the (device) file is actually not
+ * able to perform the seek. In this case you use noop_llseek() instead of
+ * falling back to the default implementation of ->llseek.
+ */
+loff_t noop_llseek(struct file *file, loff_t offset, int origin)
+{
+	return file->f_pos;
+}
+EXPORT_SYMBOL(noop_llseek);
+
 loff_t no_llseek(struct file *file, loff_t offset, int origin)
 {
 	return -ESPIPE;
@@ -258,6 +275,7 @@ ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *pp
 	init_sync_kiocb(&kiocb, filp);
 	kiocb.ki_pos = *ppos;
 	kiocb.ki_left = len;
+	kiocb.ki_nbytes = len;
 
 	for (;;) {
 		ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
@@ -313,6 +331,7 @@ ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, lof
 	init_sync_kiocb(&kiocb, filp);
 	kiocb.ki_pos = *ppos;
 	kiocb.ki_left = len;
+	kiocb.ki_nbytes = len;
 
 	for (;;) {
 		ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c
index 65c872761177..483442e66ed6 100644
--- a/fs/reiserfs/bitmap.c
+++ b/fs/reiserfs/bitmap.c
@@ -169,7 +169,7 @@ static int scan_bitmap_block(struct reiserfs_transaction_handle *th,
 			return 0;	// No free blocks in this bitmap
 		}
 
-		/* search for a first zero bit -- beggining of a window */
+		/* search for a first zero bit -- beginning of a window */
 		*beg = reiserfs_find_next_zero_le_bit
 		    ((unsigned long *)(bh->b_data), boundary, *beg);
 
@@ -425,7 +425,7 @@ static void _reiserfs_free_block(struct reiserfs_transaction_handle *th,
 
 	journal_mark_dirty(th, s, sbh);
 	if (for_unformatted)
-		vfs_dq_free_block_nodirty(inode, 1);
+		dquot_free_block_nodirty(inode, 1);
 }
 
 void reiserfs_free_block(struct reiserfs_transaction_handle *th,
@@ -1049,7 +1049,7 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start
 			       amount_needed, hint->inode->i_uid);
 #endif
 		quota_ret =
-		    vfs_dq_alloc_block_nodirty(hint->inode, amount_needed);
+		    dquot_alloc_block_nodirty(hint->inode, amount_needed);
 		if (quota_ret)	/* Quota exceeded? */
 			return QUOTA_EXCEEDED;
 		if (hint->preallocate && hint->prealloc_size) {
@@ -1058,7 +1058,7 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start
 				       "reiserquota: allocating (prealloc) %d blocks id=%u",
 				       hint->prealloc_size, hint->inode->i_uid);
 #endif
-			quota_ret = vfs_dq_prealloc_block_nodirty(hint->inode,
+			quota_ret = dquot_prealloc_block_nodirty(hint->inode,
 							 hint->prealloc_size);
 			if (quota_ret)
 				hint->preallocate = hint->prealloc_size = 0;
@@ -1092,7 +1092,7 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start
 					       hint->inode->i_uid);
 #endif
 				/* Free not allocated blocks */
-				vfs_dq_free_block_nodirty(hint->inode,
+				dquot_free_block_nodirty(hint->inode,
 					amount_needed + hint->prealloc_size -
 					nr_allocated);
 			}
@@ -1125,7 +1125,7 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start
 			       REISERFS_I(hint->inode)->i_prealloc_count,
 			       hint->inode->i_uid);
 #endif
-		vfs_dq_free_block_nodirty(hint->inode, amount_needed +
+		dquot_free_block_nodirty(hint->inode, amount_needed +
 					 hint->prealloc_size - nr_allocated -
 					 REISERFS_I(hint->inode)->
 					 i_prealloc_count);
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index c094f58c7448..198dabf1b2bb 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -8,15 +8,16 @@
 #include <linux/reiserfs_fs.h>
 #include <linux/stat.h>
 #include <linux/buffer_head.h>
+#include <linux/slab.h>
 #include <asm/uaccess.h>
 
 extern const struct reiserfs_key MIN_KEY;
 
 static int reiserfs_readdir(struct file *, void *, filldir_t);
-static int reiserfs_dir_fsync(struct file *filp, struct dentry *dentry,
-			      int datasync);
+static int reiserfs_dir_fsync(struct file *filp, int datasync);
 
 const struct file_operations reiserfs_dir_operations = {
+	.llseek = generic_file_llseek,
 	.read = generic_read_dir,
 	.readdir = reiserfs_readdir,
 	.fsync = reiserfs_dir_fsync,
@@ -26,10 +27,9 @@ const struct file_operations reiserfs_dir_operations = {
 #endif
 };
 
-static int reiserfs_dir_fsync(struct file *filp, struct dentry *dentry,
-			      int datasync)
+static int reiserfs_dir_fsync(struct file *filp, int datasync)
 {
-	struct inode *inode = dentry->d_inode;
+	struct inode *inode = filp->f_mapping->host;
 	int err;
 	reiserfs_write_lock(inode->i_sb);
 	err = reiserfs_commit_for_inode(inode);
@@ -45,8 +45,6 @@ static inline bool is_privroot_deh(struct dentry *dir,
 				   struct reiserfs_de_head *deh)
 {
 	struct dentry *privroot = REISERFS_SB(dir->d_sb)->priv_root;
-	if (reiserfs_expose_privroot(dir->d_sb))
-		return 0;
 	return (dir == dir->d_parent && privroot->d_inode &&
 	        deh->deh_objectid == INODE_PKEY(privroot->d_inode)->k_objectid);
 }
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index da2dba082e2d..b82cdd8a45dd 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -134,10 +134,9 @@ static void reiserfs_vfs_truncate_file(struct inode *inode)
  * be removed...
  */
 
-static int reiserfs_sync_file(struct file *filp,
-			      struct dentry *dentry, int datasync)
+static int reiserfs_sync_file(struct file *filp, int datasync)
 {
-	struct inode *inode = dentry->d_inode;
+	struct inode *inode = filp->f_mapping->host;
 	int err;
 	int barrier_done;
 
@@ -147,7 +146,8 @@ static int reiserfs_sync_file(struct file *filp,
 	barrier_done = reiserfs_commit_for_inode(inode);
 	reiserfs_write_unlock(inode->i_sb);
 	if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb))
-		blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
+		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL, 
+			BLKDEV_IFL_WAIT);
 	if (barrier_done < 0)
 		return barrier_done;
 	return (err < 0) ? -EIO : 0;
@@ -289,7 +289,7 @@ const struct file_operations reiserfs_file_operations = {
 	.compat_ioctl = reiserfs_compat_ioctl,
 #endif
 	.mmap = reiserfs_file_mmap,
-	.open = generic_file_open,
+	.open = dquot_file_open,
 	.release = reiserfs_file_release,
 	.fsync = reiserfs_sync_file,
 	.aio_read = generic_file_aio_read,
diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c
index 6591cb21edf6..1e4250bc3a6f 100644
--- a/fs/reiserfs/fix_node.c
+++ b/fs/reiserfs/fix_node.c
@@ -35,6 +35,7 @@
  **/
 
 #include <linux/time.h>
+#include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/reiserfs_fs.h>
 #include <linux/buffer_head.h>
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 9087b10209e6..0f22fdaf54ac 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -11,6 +11,7 @@
 #include <linux/smp_lock.h>
 #include <linux/pagemap.h>
 #include <linux/highmem.h>
+#include <linux/slab.h>
 #include <asm/uaccess.h>
 #include <asm/unaligned.h>
 #include <linux/buffer_head.h>
@@ -34,6 +35,9 @@ void reiserfs_delete_inode(struct inode *inode)
 	int depth;
 	int err;
 
+	if (!is_bad_inode(inode))
+		dquot_initialize(inode);
+
 	truncate_inode_pages(&inode->i_data, 0);
 
 	depth = reiserfs_write_lock_once(inode->i_sb);
@@ -54,7 +58,7 @@ void reiserfs_delete_inode(struct inode *inode)
 		 * after delete_object so that quota updates go into the same transaction as
 		 * stat data deletion */
 		if (!err) 
-			vfs_dq_free_inode(inode);
+			dquot_free_inode(inode);
 
 		if (journal_end(&th, inode->i_sb, jbegin_count))
 			goto out;
@@ -1497,9 +1501,11 @@ struct inode *reiserfs_iget(struct super_block *s, const struct cpu_key *key)
 
 	args.objectid = key->on_disk_key.k_objectid;
 	args.dirid = key->on_disk_key.k_dir_id;
+	reiserfs_write_unlock(s);
 	inode = iget5_locked(s, key->on_disk_key.k_objectid,
 			     reiserfs_find_actor, reiserfs_init_locked_inode,
 			     (void *)(&args));
+	reiserfs_write_lock(s);
 	if (!inode)
 		return ERR_PTR(-ENOMEM);
 
@@ -1613,7 +1619,7 @@ int reiserfs_encode_fh(struct dentry *dentry, __u32 * data, int *lenp,
 ** to properly mark inodes for datasync and such, but only actually
 ** does something when called for a synchronous update.
 */
-int reiserfs_write_inode(struct inode *inode, int do_sync)
+int reiserfs_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
 	struct reiserfs_transaction_handle th;
 	int jbegin_count = 1;
@@ -1625,7 +1631,7 @@ int reiserfs_write_inode(struct inode *inode, int do_sync)
 	 ** inode needs to reach disk for safety, and they can safely be
 	 ** ignored because the altered inode has already been logged.
 	 */
-	if (do_sync && !(current->flags & PF_MEMALLOC)) {
+	if (wbc->sync_mode == WB_SYNC_ALL && !(current->flags & PF_MEMALLOC)) {
 		reiserfs_write_lock(inode->i_sb);
 		if (!journal_begin(&th, inode->i_sb, jbegin_count)) {
 			reiserfs_update_sd(&th, inode);
@@ -1763,10 +1769,10 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
 
 	BUG_ON(!th->t_trans_id);
 
-	if (vfs_dq_alloc_inode(inode)) {
-		err = -EDQUOT;
+	dquot_initialize(inode);
+	err = dquot_alloc_inode(inode);
+	if (err)
 		goto out_end_trans;
-	}
 	if (!dir->i_nlink) {
 		err = -EPERM;
 		goto out_bad_inode;
@@ -1957,12 +1963,12 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
 	INODE_PKEY(inode)->k_objectid = 0;
 
 	/* Quota change must be inside a transaction for journaling */
-	vfs_dq_free_inode(inode);
+	dquot_free_inode(inode);
 
       out_end_trans:
 	journal_end(th, th->t_super, th->t_blocks_allocated);
 	/* Drop can be outside and it needs more credits so it's better to have it outside */
-	vfs_dq_drop(inode);
+	dquot_drop(inode);
 	inode->i_flags |= S_NOQUOTA;
 	make_bad_inode(inode);
 
@@ -3070,6 +3076,9 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
 	ia_valid = attr->ia_valid &= ~(ATTR_KILL_SUID|ATTR_KILL_SGID);
 
 	depth = reiserfs_write_lock_once(inode->i_sb);
+	if (is_quota_modification(inode, attr))
+		dquot_initialize(inode);
+
 	if (attr->ia_valid & ATTR_SIZE) {
 		/* version 2 items will be caught by the s_maxbytes check
 		 ** done for us in vmtruncate
@@ -3132,8 +3141,7 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
 						  jbegin_count);
 				if (error)
 					goto out;
-				error =
-				    vfs_dq_transfer(inode, attr) ? -EDQUOT : 0;
+				error = dquot_transfer(inode, attr);
 				if (error) {
 					journal_end(&th, inode->i_sb,
 						    jbegin_count);
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 83ac4d3b3cb0..19fbc810e8e7 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -50,6 +50,7 @@
 #include <linux/blkdev.h>
 #include <linux/backing-dev.h>
 #include <linux/uaccess.h>
+#include <linux/slab.h>
 
 #include <asm/system.h>
 
@@ -2217,6 +2218,15 @@ static int journal_read_transaction(struct super_block *sb,
 		brelse(d_bh);
 		return 1;
 	}
+
+	if (bdev_read_only(sb->s_bdev)) {
+		reiserfs_warning(sb, "clm-2076",
+				 "device is readonly, unable to replay log");
+		brelse(c_bh);
+		brelse(d_bh);
+		return -EROFS;
+	}
+
 	trans_id = get_desc_trans_id(desc);
 	/* now we know we've got a good transaction, and it was inside the valid time ranges */
 	log_blocks = kmalloc(get_desc_trans_len(desc) *
@@ -2459,12 +2469,6 @@ static int journal_read(struct super_block *sb)
 		goto start_log_replay;
 	}
 
-	if (continue_replay && bdev_read_only(sb->s_bdev)) {
-		reiserfs_warning(sb, "clm-2076",
-				 "device is readonly, unable to replay log");
-		return -1;
-	}
-
 	/* ok, there are transactions that need to be replayed.  start with the first log block, find
 	 ** all the valid transactions, and pick out the oldest.
 	 */
@@ -2913,7 +2917,9 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
 	journal->j_mount_id = 10;
 	journal->j_state = 0;
 	atomic_set(&(journal->j_jlock), 0);
+	reiserfs_write_unlock(sb);
 	journal->j_cnode_free_list = allocate_cnodes(num_cnodes);
+	reiserfs_write_lock(sb);
 	journal->j_cnode_free_orig = journal->j_cnode_free_list;
 	journal->j_cnode_free = journal->j_cnode_free_list ? num_cnodes : 0;
 	journal->j_cnode_used = 0;
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 9d4dcf0b07cb..ee78d4a0086a 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -13,6 +13,7 @@
 
 #include <linux/time.h>
 #include <linux/bitops.h>
+#include <linux/slab.h>
 #include <linux/reiserfs_fs.h>
 #include <linux/reiserfs_acl.h>
 #include <linux/reiserfs_xattr.h>
@@ -546,7 +547,7 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th,
 */
 static int drop_new_inode(struct inode *inode)
 {
-	vfs_dq_drop(inode);
+	dquot_drop(inode);
 	make_bad_inode(inode);
 	inode->i_flags |= S_NOQUOTA;
 	iput(inode);
@@ -554,30 +555,20 @@ static int drop_new_inode(struct inode *inode)
 }
 
 /* utility function that does setup for reiserfs_new_inode.
-** vfs_dq_init needs lots of credits so it's better to have it
+** dquot_initialize needs lots of credits so it's better to have it
 ** outside of a transaction, so we had to pull some bits of
 ** reiserfs_new_inode out into this func.
 */
 static int new_inode_init(struct inode *inode, struct inode *dir, int mode)
 {
-
-	/* the quota init calls have to know who to charge the quota to, so
-	 ** we have to set uid and gid here
-	 */
-	inode->i_uid = current_fsuid();
-	inode->i_mode = mode;
 	/* Make inode invalid - just in case we are going to drop it before
 	 * the initialization happens */
 	INODE_PKEY(inode)->k_objectid = 0;
-
-	if (dir->i_mode & S_ISGID) {
-		inode->i_gid = dir->i_gid;
-		if (S_ISDIR(mode))
-			inode->i_mode |= S_ISGID;
-	} else {
-		inode->i_gid = current_fsgid();
-	}
-	vfs_dq_init(inode);
+	/* the quota init calls have to know who to charge the quota to, so
+	 ** we have to set uid and gid here
+	 */
+	inode_init_owner(inode, dir, mode);
+	dquot_initialize(inode);
 	return 0;
 }
 
@@ -594,6 +585,8 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, int mode,
 	struct reiserfs_transaction_handle th;
 	struct reiserfs_security_handle security;
 
+	dquot_initialize(dir);
+
 	if (!(inode = new_inode(dir->i_sb))) {
 		return -ENOMEM;
 	}
@@ -666,6 +659,8 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, int mode,
 	if (!new_valid_dev(rdev))
 		return -EINVAL;
 
+	dquot_initialize(dir);
+
 	if (!(inode = new_inode(dir->i_sb))) {
 		return -ENOMEM;
 	}
@@ -739,6 +734,8 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	    2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) +
 		 REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb));
 
+	dquot_initialize(dir);
+
 #ifdef DISPLACE_NEW_PACKING_LOCALITIES
 	/* set flag that new packing locality created and new blocks for the content     * of that directory are not displaced yet */
 	REISERFS_I(dir)->new_packing_locality = 1;
@@ -842,6 +839,8 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry)
 	    JOURNAL_PER_BALANCE_CNT * 2 + 2 +
 	    4 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb);
 
+	dquot_initialize(dir);
+
 	reiserfs_write_lock(dir->i_sb);
 	retval = journal_begin(&th, dir->i_sb, jbegin_count);
 	if (retval)
@@ -923,6 +922,8 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry)
 	unsigned long savelink;
 	int depth;
 
+	dquot_initialize(dir);
+
 	inode = dentry->d_inode;
 
 	/* in this transaction we can be doing at max two balancings and update
@@ -1024,6 +1025,8 @@ static int reiserfs_symlink(struct inode *parent_dir,
 	    2 * (REISERFS_QUOTA_INIT_BLOCKS(parent_dir->i_sb) +
 		 REISERFS_QUOTA_TRANS_BLOCKS(parent_dir->i_sb));
 
+	dquot_initialize(parent_dir);
+
 	if (!(inode = new_inode(parent_dir->i_sb))) {
 		return -ENOMEM;
 	}
@@ -1111,6 +1114,8 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir,
 	    JOURNAL_PER_BALANCE_CNT * 3 +
 	    2 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb);
 
+	dquot_initialize(dir);
+
 	reiserfs_write_lock(dir->i_sb);
 	if (inode->i_nlink >= REISERFS_LINK_MAX) {
 		//FIXME: sd_nlink is 32 bit for new files
@@ -1235,6 +1240,9 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	    JOURNAL_PER_BALANCE_CNT * 3 + 5 +
 	    4 * REISERFS_QUOTA_TRANS_BLOCKS(old_dir->i_sb);
 
+	dquot_initialize(old_dir);
+	dquot_initialize(new_dir);
+
 	old_inode = old_dentry->d_inode;
 	new_dentry_inode = new_dentry->d_inode;
 
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
index 5fa7118f04e1..313d39d639eb 100644
--- a/fs/reiserfs/stree.c
+++ b/fs/reiserfs/stree.c
@@ -1299,7 +1299,7 @@ int reiserfs_delete_item(struct reiserfs_transaction_handle *th,
 		       "reiserquota delete_item(): freeing %u, id=%u type=%c",
 		       quota_cut_bytes, inode->i_uid, head2type(&s_ih));
 #endif
-	vfs_dq_free_space_nodirty(inode, quota_cut_bytes);
+	dquot_free_space_nodirty(inode, quota_cut_bytes);
 
 	/* Return deleted body length */
 	return ret_value;
@@ -1383,7 +1383,7 @@ void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th,
 					       quota_cut_bytes, inode->i_uid,
 					       key2type(key));
 #endif
-				vfs_dq_free_space_nodirty(inode,
+				dquot_free_space_nodirty(inode,
 							 quota_cut_bytes);
 			}
 			break;
@@ -1733,7 +1733,7 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th,
 		       "reiserquota cut_from_item(): freeing %u id=%u type=%c",
 		       quota_cut_bytes, inode->i_uid, '?');
 #endif
-	vfs_dq_free_space_nodirty(inode, quota_cut_bytes);
+	dquot_free_space_nodirty(inode, quota_cut_bytes);
 	return ret_value;
 }
 
@@ -1968,9 +1968,10 @@ int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct tree
 		       key2type(&(key->on_disk_key)));
 #endif
 
-	if (vfs_dq_alloc_space_nodirty(inode, pasted_size)) {
+	retval = dquot_alloc_space_nodirty(inode, pasted_size);
+	if (retval) {
 		pathrelse(search_path);
-		return -EDQUOT;
+		return retval;
 	}
 	init_tb_struct(th, &s_paste_balance, th->t_super, search_path,
 		       pasted_size);
@@ -2024,7 +2025,7 @@ int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct tree
 		       pasted_size, inode->i_uid,
 		       key2type(&(key->on_disk_key)));
 #endif
-	vfs_dq_free_space_nodirty(inode, pasted_size);
+	dquot_free_space_nodirty(inode, pasted_size);
 	return retval;
 }
 
@@ -2062,9 +2063,10 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th,
 #endif
 		/* We can't dirty inode here. It would be immediately written but
 		 * appropriate stat item isn't inserted yet... */
-		if (vfs_dq_alloc_space_nodirty(inode, quota_bytes)) {
+		retval = dquot_alloc_space_nodirty(inode, quota_bytes);
+		if (retval) {
 			pathrelse(path);
-			return -EDQUOT;
+			return retval;
 		}
 	}
 	init_tb_struct(th, &s_ins_balance, th->t_super, path,
@@ -2113,6 +2115,6 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th,
 		       quota_bytes, inode->i_uid, head2type(ih));
 #endif
 	if (inode)
-		vfs_dq_free_space_nodirty(inode, quota_bytes);
+		dquot_free_space_nodirty(inode, quota_bytes);
 	return retval;
 }
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index b4a7dd03bdb9..9822fa15118b 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -12,6 +12,7 @@
  */
 
 #include <linux/module.h>
+#include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/time.h>
 #include <asm/uaccess.h>
@@ -157,6 +158,7 @@ static int finish_unfinished(struct super_block *s)
 #ifdef CONFIG_QUOTA
 	int i;
 	int ms_active_set;
+	int quota_enabled[MAXQUOTAS];
 #endif
 
 	/* compose key to look for "save" links */
@@ -178,8 +180,15 @@ static int finish_unfinished(struct super_block *s)
 	}
 	/* Turn on quotas so that they are updated correctly */
 	for (i = 0; i < MAXQUOTAS; i++) {
+		quota_enabled[i] = 1;
 		if (REISERFS_SB(s)->s_qf_names[i]) {
-			int ret = reiserfs_quota_on_mount(s, i);
+			int ret;
+
+			if (sb_has_quota_active(s, i)) {
+				quota_enabled[i] = 0;
+				continue;
+			}
+			ret = reiserfs_quota_on_mount(s, i);
 			if (ret < 0)
 				reiserfs_warning(s, "reiserfs-2500",
 						 "cannot turn on journaled "
@@ -246,7 +255,7 @@ static int finish_unfinished(struct super_block *s)
 			retval = remove_save_link_only(s, &save_link_key, 0);
 			continue;
 		}
-		vfs_dq_init(inode);
+		dquot_initialize(inode);
 
 		if (truncate && S_ISDIR(inode->i_mode)) {
 			/* We got a truncate request for a dir which is impossible.
@@ -303,8 +312,8 @@ static int finish_unfinished(struct super_block *s)
 #ifdef CONFIG_QUOTA
 	/* Turn quotas off */
 	for (i = 0; i < MAXQUOTAS; i++) {
-		if (sb_dqopt(s)->files[i])
-			vfs_quota_off(s, i, 0);
+		if (sb_dqopt(s)->files[i] && quota_enabled[i])
+			dquot_quota_off(s, i);
 	}
 	if (ms_active_set)
 		/* Restore the flag back */
@@ -465,6 +474,8 @@ static void reiserfs_put_super(struct super_block *s)
 	struct reiserfs_transaction_handle th;
 	th.t_trans_id = 0;
 
+	dquot_disable(s, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
+
 	reiserfs_write_lock(s);
 
 	if (s->s_dirt)
@@ -578,6 +589,11 @@ out:
 	reiserfs_write_unlock_once(inode->i_sb, lock_depth);
 }
 
+static void reiserfs_clear_inode(struct inode *inode)
+{
+	dquot_drop(inode);
+}
+
 #ifdef CONFIG_QUOTA
 static ssize_t reiserfs_quota_write(struct super_block *, int, const char *,
 				    size_t, loff_t);
@@ -590,6 +606,7 @@ static const struct super_operations reiserfs_sops = {
 	.destroy_inode = reiserfs_destroy_inode,
 	.write_inode = reiserfs_write_inode,
 	.dirty_inode = reiserfs_dirty_inode,
+	.clear_inode = reiserfs_clear_inode,
 	.delete_inode = reiserfs_delete_inode,
 	.put_super = reiserfs_put_super,
 	.write_super = reiserfs_write_super,
@@ -613,16 +630,9 @@ static int reiserfs_acquire_dquot(struct dquot *);
 static int reiserfs_release_dquot(struct dquot *);
 static int reiserfs_mark_dquot_dirty(struct dquot *);
 static int reiserfs_write_info(struct super_block *, int);
-static int reiserfs_quota_on(struct super_block *, int, int, char *, int);
+static int reiserfs_quota_on(struct super_block *, int, int, char *);
 
 static const struct dquot_operations reiserfs_quota_operations = {
-	.initialize = dquot_initialize,
-	.drop = dquot_drop,
-	.alloc_space = dquot_alloc_space,
-	.alloc_inode = dquot_alloc_inode,
-	.free_space = dquot_free_space,
-	.free_inode = dquot_free_inode,
-	.transfer = dquot_transfer,
 	.write_dquot = reiserfs_write_dquot,
 	.acquire_dquot = reiserfs_acquire_dquot,
 	.release_dquot = reiserfs_release_dquot,
@@ -634,12 +644,12 @@ static const struct dquot_operations reiserfs_quota_operations = {
 
 static const struct quotactl_ops reiserfs_qctl_operations = {
 	.quota_on = reiserfs_quota_on,
-	.quota_off = vfs_quota_off,
-	.quota_sync = vfs_quota_sync,
-	.get_info = vfs_get_dqinfo,
-	.set_info = vfs_set_dqinfo,
-	.get_dqblk = vfs_get_dqblk,
-	.set_dqblk = vfs_set_dqblk,
+	.quota_off = dquot_quota_off,
+	.quota_sync = dquot_quota_sync,
+	.get_info = dquot_get_dqinfo,
+	.set_info = dquot_set_dqinfo,
+	.get_dqblk = dquot_get_dqblk,
+	.set_dqblk = dquot_set_dqblk,
 };
 #endif
 
@@ -1242,6 +1252,11 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
 		if (s->s_flags & MS_RDONLY)
 			/* it is read-only already */
 			goto out_ok;
+
+		err = dquot_suspend(s, -1);
+		if (err < 0)
+			goto out_err;
+
 		/* try to remount file system with read-only permissions */
 		if (sb_umount_state(rs) == REISERFS_VALID_FS
 		    || REISERFS_SB(s)->s_mount_state != REISERFS_VALID_FS) {
@@ -1295,6 +1310,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
 	s->s_dirt = 0;
 
 	if (!(*mount_flags & MS_RDONLY)) {
+		dquot_resume(s, -1);
 		finish_unfinished(s);
 		reiserfs_xattr_init(s, *mount_flags);
 	}
@@ -1619,10 +1635,8 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
 	save_mount_options(s, data);
 
 	sbi = kzalloc(sizeof(struct reiserfs_sb_info), GFP_KERNEL);
-	if (!sbi) {
-		errval = -ENOMEM;
-		goto error_alloc;
-	}
+	if (!sbi)
+		return -ENOMEM;
 	s->s_fs_info = sbi;
 	/* Set default values for options: non-aggressive tails, RO on errors */
 	REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_SMALLTAIL);
@@ -1879,12 +1893,12 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
 	return (0);
 
 error:
-	reiserfs_write_unlock(s);
-error_alloc:
 	if (jinit_done) {	/* kill the commit thread, free journal ram */
 		journal_release_error(NULL, s);
 	}
 
+	reiserfs_write_unlock(s);
+
 	reiserfs_free_bitmap_cache(s);
 	if (SB_BUFFER_WITH_SB(s))
 		brelse(SB_BUFFER_WITH_SB(s));
@@ -2024,15 +2038,15 @@ static int reiserfs_write_info(struct super_block *sb, int type)
  */
 static int reiserfs_quota_on_mount(struct super_block *sb, int type)
 {
-	return vfs_quota_on_mount(sb, REISERFS_SB(sb)->s_qf_names[type],
-				  REISERFS_SB(sb)->s_jquota_fmt, type);
+	return dquot_quota_on_mount(sb, REISERFS_SB(sb)->s_qf_names[type],
+					REISERFS_SB(sb)->s_jquota_fmt, type);
 }
 
 /*
  * Standard function to be called on quota_on
  */
 static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
-			     char *name, int remount)
+			     char *name)
 {
 	int err;
 	struct path path;
@@ -2041,9 +2055,7 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
 
 	if (!(REISERFS_SB(sb)->s_mount_opt & (1 << REISERFS_QUOTA)))
 		return -EINVAL;
-	/* No more checks needed? Path and format_id are bogus anyway... */
-	if (remount)
-		return vfs_quota_on(sb, type, format_id, name, 1);
+
 	err = kern_path(name, LOOKUP_FOLLOW, &path);
 	if (err)
 		return err;
@@ -2087,7 +2099,7 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
 		if (err)
 			goto out;
 	}
-	err = vfs_quota_on_path(sb, type, format_id, &path);
+	err = dquot_quota_on_path(sb, type, format_id, &path);
 out:
 	path_put(&path);
 	return err;
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 81f09fab8ae4..8c4cf273c672 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -38,6 +38,7 @@
 #include <linux/dcache.h>
 #include <linux/namei.h>
 #include <linux/errno.h>
+#include <linux/gfp.h>
 #include <linux/fs.h>
 #include <linux/file.h>
 #include <linux/pagemap.h>
@@ -61,7 +62,6 @@
 static int xattr_create(struct inode *dir, struct dentry *dentry, int mode)
 {
 	BUG_ON(!mutex_is_locked(&dir->i_mutex));
-	vfs_dq_init(dir);
 	return dir->i_op->create(dir, dentry, mode, NULL);
 }
 #endif
@@ -69,7 +69,6 @@ static int xattr_create(struct inode *dir, struct dentry *dentry, int mode)
 static int xattr_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 {
 	BUG_ON(!mutex_is_locked(&dir->i_mutex));
-	vfs_dq_init(dir);
 	return dir->i_op->mkdir(dir, dentry, mode);
 }
 
@@ -81,7 +80,6 @@ static int xattr_unlink(struct inode *dir, struct dentry *dentry)
 {
 	int error;
 	BUG_ON(!mutex_is_locked(&dir->i_mutex));
-	vfs_dq_init(dir);
 
 	reiserfs_mutex_lock_nested_safe(&dentry->d_inode->i_mutex,
 					I_MUTEX_CHILD, dir->i_sb);
@@ -97,7 +95,6 @@ static int xattr_rmdir(struct inode *dir, struct dentry *dentry)
 {
 	int error;
 	BUG_ON(!mutex_is_locked(&dir->i_mutex));
-	vfs_dq_init(dir);
 
 	reiserfs_mutex_lock_nested_safe(&dentry->d_inode->i_mutex,
 					I_MUTEX_CHILD, dir->i_sb);
@@ -557,7 +554,7 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th,
 	if (!err && new_size < i_size_read(dentry->d_inode)) {
 		struct iattr newattrs = {
 			.ia_ctime = current_fs_time(inode->i_sb),
-			.ia_size = buffer_size,
+			.ia_size = new_size,
 			.ia_valid = ATTR_SIZE | ATTR_CTIME,
 		};
 
@@ -726,11 +723,11 @@ out:
 			(handler) = *(handlers)++)
 
 /* This is the implementation for the xattr plugin infrastructure */
-static inline struct xattr_handler *
-find_xattr_handler_prefix(struct xattr_handler **handlers,
+static inline const struct xattr_handler *
+find_xattr_handler_prefix(const struct xattr_handler **handlers,
 			   const char *name)
 {
-	struct xattr_handler *xah;
+	const struct xattr_handler *xah;
 
 	if (!handlers)
 		return NULL;
@@ -751,7 +748,7 @@ ssize_t
 reiserfs_getxattr(struct dentry * dentry, const char *name, void *buffer,
 		  size_t size)
 {
-	struct xattr_handler *handler;
+	const struct xattr_handler *handler;
 
 	handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name);
 
@@ -770,7 +767,7 @@ int
 reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value,
 		  size_t size, int flags)
 {
-	struct xattr_handler *handler;
+	const struct xattr_handler *handler;
 
 	handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name);
 
@@ -787,7 +784,7 @@ reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value,
  */
 int reiserfs_removexattr(struct dentry *dentry, const char *name)
 {
-	struct xattr_handler *handler;
+	const struct xattr_handler *handler;
 	handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name);
 
 	if (!handler || get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1)
@@ -810,7 +807,7 @@ static int listxattr_filler(void *buf, const char *name, int namelen,
 	size_t size;
 	if (name[0] != '.' ||
 	    (namelen != 1 && (name[1] != '.' || namelen != 2))) {
-		struct xattr_handler *handler;
+		const struct xattr_handler *handler;
 		handler = find_xattr_handler_prefix(b->dentry->d_sb->s_xattr,
 						    name);
 		if (!handler)	/* Unsupported xattr name */
@@ -923,7 +920,7 @@ static int create_privroot(struct dentry *dentry) { return 0; }
 #endif
 
 /* Actual operations that are exported to VFS-land */
-struct xattr_handler *reiserfs_xattr_handlers[] = {
+const struct xattr_handler *reiserfs_xattr_handlers[] = {
 #ifdef CONFIG_REISERFS_FS_XATTR
 	&reiserfs_xattr_user_handler,
 	&reiserfs_xattr_trusted_handler,
@@ -976,21 +973,13 @@ int reiserfs_permission(struct inode *inode, int mask)
 	return generic_permission(inode, mask, NULL);
 }
 
-/* This will catch lookups from the fs root to .reiserfs_priv */
-static int
-xattr_lookup_poison(struct dentry *dentry, struct qstr *q1, struct qstr *name)
+static int xattr_hide_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
-	struct dentry *priv_root = REISERFS_SB(dentry->d_sb)->priv_root;
-	if (container_of(q1, struct dentry, d_name) == priv_root)
-		return -ENOENT;
-	if (q1->len == name->len &&
-		   !memcmp(q1->name, name->name, name->len))
-		return 0;
-	return 1;
+	return -EPERM;
 }
 
 static const struct dentry_operations xattr_lookup_poison_ops = {
-	.d_compare = xattr_lookup_poison,
+	.d_revalidate = xattr_hide_revalidate,
 };
 
 int reiserfs_lookup_privroot(struct super_block *s)
@@ -1004,8 +993,7 @@ int reiserfs_lookup_privroot(struct super_block *s)
 				strlen(PRIVROOT_NAME));
 	if (!IS_ERR(dentry)) {
 		REISERFS_SB(s)->priv_root = dentry;
-		if (!reiserfs_expose_privroot(s))
-			s->s_root->d_op = &xattr_lookup_poison_ops;
+		dentry->d_op = &xattr_lookup_poison_ops;
 		if (dentry->d_inode)
 			dentry->d_inode->i_flags |= S_PRIVATE;
 	} else
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index dd20a7883f0f..536d697a8a28 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -5,6 +5,7 @@
 #include <linux/errno.h>
 #include <linux/pagemap.h>
 #include <linux/xattr.h>
+#include <linux/slab.h>
 #include <linux/posix_acl_xattr.h>
 #include <linux/reiserfs_xattr.h>
 #include <linux/reiserfs_acl.h>
@@ -499,7 +500,7 @@ static size_t posix_acl_access_list(struct dentry *dentry, char *list,
 	return size;
 }
 
-struct xattr_handler reiserfs_posix_acl_access_handler = {
+const struct xattr_handler reiserfs_posix_acl_access_handler = {
 	.prefix = POSIX_ACL_XATTR_ACCESS,
 	.flags = ACL_TYPE_ACCESS,
 	.get = posix_acl_get,
@@ -519,7 +520,7 @@ static size_t posix_acl_default_list(struct dentry *dentry, char *list,
 	return size;
 }
 
-struct xattr_handler reiserfs_posix_acl_default_handler = {
+const struct xattr_handler reiserfs_posix_acl_default_handler = {
 	.prefix = POSIX_ACL_XATTR_DEFAULT,
 	.flags = ACL_TYPE_DEFAULT,
 	.get = posix_acl_get,
diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c
index d8b5bfcbdd30..237c6928d3c6 100644
--- a/fs/reiserfs/xattr_security.c
+++ b/fs/reiserfs/xattr_security.c
@@ -3,6 +3,7 @@
 #include <linux/fs.h>
 #include <linux/pagemap.h>
 #include <linux/xattr.h>
+#include <linux/slab.h>
 #include <linux/reiserfs_xattr.h>
 #include <linux/security.h>
 #include <asm/uaccess.h>
@@ -76,7 +77,7 @@ int reiserfs_security_init(struct inode *dir, struct inode *inode,
 		return error;
 	}
 
-	if (sec->length) {
+	if (sec->length && reiserfs_xattrs_initialized(inode->i_sb)) {
 		blocks = reiserfs_xattr_jcreate_nblocks(inode) +
 			 reiserfs_xattr_nblocks(inode, sec->length);
 		/* We don't want to count the directories twice if we have
@@ -110,7 +111,7 @@ void reiserfs_security_free(struct reiserfs_security_handle *sec)
 	sec->value = NULL;
 }
 
-struct xattr_handler reiserfs_xattr_security_handler = {
+const struct xattr_handler reiserfs_xattr_security_handler = {
 	.prefix = XATTR_SECURITY_PREFIX,
 	.get = security_get,
 	.set = security_set,
diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c
index 5b08aaca3daf..9883736ce3ec 100644
--- a/fs/reiserfs/xattr_trusted.c
+++ b/fs/reiserfs/xattr_trusted.c
@@ -48,7 +48,7 @@ static size_t trusted_list(struct dentry *dentry, char *list, size_t list_size,
 	return len;
 }
 
-struct xattr_handler reiserfs_xattr_trusted_handler = {
+const struct xattr_handler reiserfs_xattr_trusted_handler = {
 	.prefix = XATTR_TRUSTED_PREFIX,
 	.get = trusted_get,
 	.set = trusted_set,
diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c
index 75d59c49b911..45ae1a00013a 100644
--- a/fs/reiserfs/xattr_user.c
+++ b/fs/reiserfs/xattr_user.c
@@ -44,7 +44,7 @@ static size_t user_list(struct dentry *dentry, char *list, size_t list_size,
 	return len;
 }
 
-struct xattr_handler reiserfs_xattr_user_handler = {
+const struct xattr_handler reiserfs_xattr_user_handler = {
 	.prefix = XATTR_USER_PREFIX,
 	.get = user_get,
 	.set = user_set,
diff --git a/fs/select.c b/fs/select.c
index fd38ce2e32e3..500a669f7790 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -691,6 +691,23 @@ SYSCALL_DEFINE6(pselect6, int, n, fd_set __user *, inp, fd_set __user *, outp,
 }
 #endif /* HAVE_SET_RESTORE_SIGMASK */
 
+#ifdef __ARCH_WANT_SYS_OLD_SELECT
+struct sel_arg_struct {
+	unsigned long n;
+	fd_set __user *inp, *outp, *exp;
+	struct timeval __user *tvp;
+};
+
+SYSCALL_DEFINE1(old_select, struct sel_arg_struct __user *, arg)
+{
+	struct sel_arg_struct a;
+
+	if (copy_from_user(&a, arg, sizeof(a)))
+		return -EFAULT;
+	return sys_select(a.n, a.inp, a.outp, a.exp, a.tvp);
+}
+#endif
+
 struct poll_list {
 	struct poll_list *next;
 	int len;
@@ -821,7 +838,7 @@ int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
  	struct poll_list *walk = head;
  	unsigned long todo = nfds;
 
-	if (nfds > current->signal->rlim[RLIMIT_NOFILE].rlim_cur)
+	if (nfds > rlimit(RLIMIT_NOFILE))
 		return -EINVAL;
 
 	len = min_t(unsigned int, nfds, N_STACK_PPS);
diff --git a/fs/seq_file.c b/fs/seq_file.c
index eae7d9dbf3ff..e1f437be6c3c 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -674,7 +674,6 @@ struct list_head *seq_list_start(struct list_head *head, loff_t pos)
 
 	return NULL;
 }
-
 EXPORT_SYMBOL(seq_list_start);
 
 struct list_head *seq_list_start_head(struct list_head *head, loff_t pos)
@@ -684,7 +683,6 @@ struct list_head *seq_list_start_head(struct list_head *head, loff_t pos)
 
 	return seq_list_start(head, pos - 1);
 }
-
 EXPORT_SYMBOL(seq_list_start_head);
 
 struct list_head *seq_list_next(void *v, struct list_head *head, loff_t *ppos)
@@ -695,5 +693,131 @@ struct list_head *seq_list_next(void *v, struct list_head *head, loff_t *ppos)
 	++*ppos;
 	return lh == head ? NULL : lh;
 }
-
 EXPORT_SYMBOL(seq_list_next);
+
+/**
+ * seq_hlist_start - start an iteration of a hlist
+ * @head: the head of the hlist
+ * @pos:  the start position of the sequence
+ *
+ * Called at seq_file->op->start().
+ */
+struct hlist_node *seq_hlist_start(struct hlist_head *head, loff_t pos)
+{
+	struct hlist_node *node;
+
+	hlist_for_each(node, head)
+		if (pos-- == 0)
+			return node;
+	return NULL;
+}
+EXPORT_SYMBOL(seq_hlist_start);
+
+/**
+ * seq_hlist_start_head - start an iteration of a hlist
+ * @head: the head of the hlist
+ * @pos:  the start position of the sequence
+ *
+ * Called at seq_file->op->start(). Call this function if you want to
+ * print a header at the top of the output.
+ */
+struct hlist_node *seq_hlist_start_head(struct hlist_head *head, loff_t pos)
+{
+	if (!pos)
+		return SEQ_START_TOKEN;
+
+	return seq_hlist_start(head, pos - 1);
+}
+EXPORT_SYMBOL(seq_hlist_start_head);
+
+/**
+ * seq_hlist_next - move to the next position of the hlist
+ * @v:    the current iterator
+ * @head: the head of the hlist
+ * @ppos: the current position
+ *
+ * Called at seq_file->op->next().
+ */
+struct hlist_node *seq_hlist_next(void *v, struct hlist_head *head,
+				  loff_t *ppos)
+{
+	struct hlist_node *node = v;
+
+	++*ppos;
+	if (v == SEQ_START_TOKEN)
+		return head->first;
+	else
+		return node->next;
+}
+EXPORT_SYMBOL(seq_hlist_next);
+
+/**
+ * seq_hlist_start_rcu - start an iteration of a hlist protected by RCU
+ * @head: the head of the hlist
+ * @pos:  the start position of the sequence
+ *
+ * Called at seq_file->op->start().
+ *
+ * This list-traversal primitive may safely run concurrently with
+ * the _rcu list-mutation primitives such as hlist_add_head_rcu()
+ * as long as the traversal is guarded by rcu_read_lock().
+ */
+struct hlist_node *seq_hlist_start_rcu(struct hlist_head *head,
+				       loff_t pos)
+{
+	struct hlist_node *node;
+
+	__hlist_for_each_rcu(node, head)
+		if (pos-- == 0)
+			return node;
+	return NULL;
+}
+EXPORT_SYMBOL(seq_hlist_start_rcu);
+
+/**
+ * seq_hlist_start_head_rcu - start an iteration of a hlist protected by RCU
+ * @head: the head of the hlist
+ * @pos:  the start position of the sequence
+ *
+ * Called at seq_file->op->start(). Call this function if you want to
+ * print a header at the top of the output.
+ *
+ * This list-traversal primitive may safely run concurrently with
+ * the _rcu list-mutation primitives such as hlist_add_head_rcu()
+ * as long as the traversal is guarded by rcu_read_lock().
+ */
+struct hlist_node *seq_hlist_start_head_rcu(struct hlist_head *head,
+					    loff_t pos)
+{
+	if (!pos)
+		return SEQ_START_TOKEN;
+
+	return seq_hlist_start_rcu(head, pos - 1);
+}
+EXPORT_SYMBOL(seq_hlist_start_head_rcu);
+
+/**
+ * seq_hlist_next_rcu - move to the next position of the hlist protected by RCU
+ * @v:    the current iterator
+ * @head: the head of the hlist
+ * @ppos: the current position
+ *
+ * Called at seq_file->op->next().
+ *
+ * This list-traversal primitive may safely run concurrently with
+ * the _rcu list-mutation primitives such as hlist_add_head_rcu()
+ * as long as the traversal is guarded by rcu_read_lock().
+ */
+struct hlist_node *seq_hlist_next_rcu(void *v,
+				      struct hlist_head *head,
+				      loff_t *ppos)
+{
+	struct hlist_node *node = v;
+
+	++*ppos;
+	if (v == SEQ_START_TOKEN)
+		return rcu_dereference(head->first);
+	else
+		return rcu_dereference(node->next);
+}
+EXPORT_SYMBOL(seq_hlist_next_rcu);
diff --git a/fs/signalfd.c b/fs/signalfd.c
index 1dabe4ee02fe..f329849ce3c0 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -22,6 +22,7 @@
 #include <linux/init.h>
 #include <linux/fs.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/kernel.h>
 #include <linux/signal.h>
 #include <linux/list.h>
diff --git a/fs/smbfs/dir.c b/fs/smbfs/dir.c
index 3e4803b4427e..00a70cab1f36 100644
--- a/fs/smbfs/dir.c
+++ b/fs/smbfs/dir.c
@@ -37,9 +37,10 @@ static int smb_link(struct dentry *, struct inode *, struct dentry *);
 
 const struct file_operations smb_dir_operations =
 {
+	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
 	.readdir	= smb_readdir,
-	.ioctl		= smb_ioctl,
+	.unlocked_ioctl	= smb_ioctl,
 	.open		= smb_dir_open,
 };
 
diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c
index 92d5e8ffb639..8e187a0f94bb 100644
--- a/fs/smbfs/file.c
+++ b/fs/smbfs/file.c
@@ -13,7 +13,6 @@
 #include <linux/fcntl.h>
 #include <linux/stat.h>
 #include <linux/mm.h>
-#include <linux/slab.h>
 #include <linux/pagemap.h>
 #include <linux/smp_lock.h>
 #include <linux/net.h>
@@ -29,8 +28,9 @@
 #include "proto.h"
 
 static int
-smb_fsync(struct file *file, struct dentry * dentry, int datasync)
+smb_fsync(struct file *file, int datasync)
 {
+	struct dentry *dentry = file->f_path.dentry;
 	struct smb_sb_info *server = server_from_dentry(dentry);
 	int result;
 
@@ -438,7 +438,7 @@ const struct file_operations smb_file_operations =
 	.aio_read	= smb_file_aio_read,
 	.write		= do_sync_write,
 	.aio_write	= smb_file_aio_write,
-	.ioctl		= smb_ioctl,
+	.unlocked_ioctl	= smb_ioctl,
 	.mmap		= smb_file_mmap,
 	.open		= smb_file_open,
 	.release	= smb_file_release,
diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c
index 1c4c8f089970..9551cb6f7fe4 100644
--- a/fs/smbfs/inode.c
+++ b/fs/smbfs/inode.c
@@ -479,6 +479,7 @@ smb_put_super(struct super_block *sb)
 	if (server->conn_pid)
 		kill_pid(server->conn_pid, SIGTERM, 1);
 
+	bdi_destroy(&server->bdi);
 	kfree(server->ops);
 	smb_unload_nls(server);
 	sb->s_fs_info = NULL;
@@ -525,6 +526,11 @@ static int smb_fill_super(struct super_block *sb, void *raw_data, int silent)
 	if (!server)
 		goto out_no_server;
 	sb->s_fs_info = server;
+	
+	if (bdi_setup_and_register(&server->bdi, "smbfs", BDI_CAP_MAP_COPY))
+		goto out_bdi;
+
+	sb->s_bdi = &server->bdi;
 
 	server->super_block = sb;
 	server->mnt = NULL;
@@ -624,6 +630,8 @@ out_no_smbiod:
 out_bad_option:
 	kfree(mem);
 out_no_mem:
+	bdi_destroy(&server->bdi);
+out_bdi:
 	if (!server->mnt)
 		printk(KERN_ERR "smb_fill_super: allocation failure\n");
 	sb->s_fs_info = NULL;
@@ -706,7 +714,7 @@ smb_notify_change(struct dentry *dentry, struct iattr *attr)
 		error = server->ops->truncate(inode, attr->ia_size);
 		if (error)
 			goto out;
-		error = vmtruncate(inode, attr->ia_size);
+		error = simple_setsize(inode, attr->ia_size);
 		if (error)
 			goto out;
 		refresh = 1;
diff --git a/fs/smbfs/ioctl.c b/fs/smbfs/ioctl.c
index dbae1f8ea26f..07215312ad39 100644
--- a/fs/smbfs/ioctl.c
+++ b/fs/smbfs/ioctl.c
@@ -13,6 +13,7 @@
 #include <linux/time.h>
 #include <linux/mm.h>
 #include <linux/highuid.h>
+#include <linux/smp_lock.h>
 #include <linux/net.h>
 
 #include <linux/smb_fs.h>
@@ -22,14 +23,14 @@
 
 #include "proto.h"
 
-int
-smb_ioctl(struct inode *inode, struct file *filp,
-	  unsigned int cmd, unsigned long arg)
+long
+smb_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
-	struct smb_sb_info *server = server_from_inode(inode);
+	struct smb_sb_info *server = server_from_inode(filp->f_path.dentry->d_inode);
 	struct smb_conn_opt opt;
 	int result = -EINVAL;
 
+	lock_kernel();
 	switch (cmd) {
 		uid16_t uid16;
 		uid_t uid32;
@@ -62,6 +63,7 @@ smb_ioctl(struct inode *inode, struct file *filp,
 	default:
 		break;
 	}
+	unlock_kernel();
 
 	return result;
 }
diff --git a/fs/smbfs/proto.h b/fs/smbfs/proto.h
index 03f456c1b7d4..05939a6f43e6 100644
--- a/fs/smbfs/proto.h
+++ b/fs/smbfs/proto.h
@@ -67,7 +67,7 @@ extern const struct address_space_operations smb_file_aops;
 extern const struct file_operations smb_file_operations;
 extern const struct inode_operations smb_file_inode_operations;
 /* ioctl.c */
-extern int smb_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg);
+extern long smb_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
 /* smbiod.c */
 extern void smbiod_wake_up(void);
 extern int smbiod_register_server(struct smb_sb_info *server);
diff --git a/fs/smbfs/smbiod.c b/fs/smbfs/smbiod.c
index 6bd9b691a463..0e39a924f10a 100644
--- a/fs/smbfs/smbiod.c
+++ b/fs/smbfs/smbiod.c
@@ -12,7 +12,6 @@
 #include <linux/string.h>
 #include <linux/stat.h>
 #include <linux/errno.h>
-#include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/file.h>
 #include <linux/dcache.h>
diff --git a/fs/splice.c b/fs/splice.c
index 39208663aaf1..efdbfece9932 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -30,6 +30,7 @@
 #include <linux/syscalls.h>
 #include <linux/uio.h>
 #include <linux/security.h>
+#include <linux/gfp.h>
 
 /*
  * Attempt to steal a page from a pipe buffer. This should perhaps go into
@@ -192,8 +193,8 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
 			break;
 		}
 
-		if (pipe->nrbufs < PIPE_BUFFERS) {
-			int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1);
+		if (pipe->nrbufs < pipe->buffers) {
+			int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
 			struct pipe_buffer *buf = pipe->bufs + newbuf;
 
 			buf->page = spd->pages[page_nr];
@@ -213,7 +214,7 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
 
 			if (!--spd->nr_pages)
 				break;
-			if (pipe->nrbufs < PIPE_BUFFERS)
+			if (pipe->nrbufs < pipe->buffers)
 				continue;
 
 			break;
@@ -264,6 +265,36 @@ static void spd_release_page(struct splice_pipe_desc *spd, unsigned int i)
 	page_cache_release(spd->pages[i]);
 }
 
+/*
+ * Check if we need to grow the arrays holding pages and partial page
+ * descriptions.
+ */
+int splice_grow_spd(struct pipe_inode_info *pipe, struct splice_pipe_desc *spd)
+{
+	if (pipe->buffers <= PIPE_DEF_BUFFERS)
+		return 0;
+
+	spd->pages = kmalloc(pipe->buffers * sizeof(struct page *), GFP_KERNEL);
+	spd->partial = kmalloc(pipe->buffers * sizeof(struct partial_page), GFP_KERNEL);
+
+	if (spd->pages && spd->partial)
+		return 0;
+
+	kfree(spd->pages);
+	kfree(spd->partial);
+	return -ENOMEM;
+}
+
+void splice_shrink_spd(struct pipe_inode_info *pipe,
+		       struct splice_pipe_desc *spd)
+{
+	if (pipe->buffers <= PIPE_DEF_BUFFERS)
+		return;
+
+	kfree(spd->pages);
+	kfree(spd->partial);
+}
+
 static int
 __generic_file_splice_read(struct file *in, loff_t *ppos,
 			   struct pipe_inode_info *pipe, size_t len,
@@ -271,8 +302,8 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 {
 	struct address_space *mapping = in->f_mapping;
 	unsigned int loff, nr_pages, req_pages;
-	struct page *pages[PIPE_BUFFERS];
-	struct partial_page partial[PIPE_BUFFERS];
+	struct page *pages[PIPE_DEF_BUFFERS];
+	struct partial_page partial[PIPE_DEF_BUFFERS];
 	struct page *page;
 	pgoff_t index, end_index;
 	loff_t isize;
@@ -285,15 +316,18 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 		.spd_release = spd_release_page,
 	};
 
+	if (splice_grow_spd(pipe, &spd))
+		return -ENOMEM;
+
 	index = *ppos >> PAGE_CACHE_SHIFT;
 	loff = *ppos & ~PAGE_CACHE_MASK;
 	req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-	nr_pages = min(req_pages, (unsigned)PIPE_BUFFERS);
+	nr_pages = min(req_pages, pipe->buffers);
 
 	/*
 	 * Lookup the (hopefully) full range of pages we need.
 	 */
-	spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages);
+	spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, spd.pages);
 	index += spd.nr_pages;
 
 	/*
@@ -320,7 +354,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 				break;
 
 			error = add_to_page_cache_lru(page, mapping, index,
-						mapping_gfp_mask(mapping));
+						GFP_KERNEL);
 			if (unlikely(error)) {
 				page_cache_release(page);
 				if (error == -EEXIST)
@@ -334,7 +368,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 			unlock_page(page);
 		}
 
-		pages[spd.nr_pages++] = page;
+		spd.pages[spd.nr_pages++] = page;
 		index++;
 	}
 
@@ -355,7 +389,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 		 * this_len is the max we'll use from this page
 		 */
 		this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
-		page = pages[page_nr];
+		page = spd.pages[page_nr];
 
 		if (PageReadahead(page))
 			page_cache_async_readahead(mapping, &in->f_ra, in,
@@ -392,8 +426,8 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 					error = -ENOMEM;
 					break;
 				}
-				page_cache_release(pages[page_nr]);
-				pages[page_nr] = page;
+				page_cache_release(spd.pages[page_nr]);
+				spd.pages[page_nr] = page;
 			}
 			/*
 			 * page was already under io and is now done, great
@@ -450,8 +484,8 @@ fill_it:
 			len = this_len;
 		}
 
-		partial[page_nr].offset = loff;
-		partial[page_nr].len = this_len;
+		spd.partial[page_nr].offset = loff;
+		spd.partial[page_nr].len = this_len;
 		len -= this_len;
 		loff = 0;
 		spd.nr_pages++;
@@ -463,12 +497,13 @@ fill_it:
 	 * we got, 'nr_pages' is how many pages are in the map.
 	 */
 	while (page_nr < nr_pages)
-		page_cache_release(pages[page_nr++]);
+		page_cache_release(spd.pages[page_nr++]);
 	in->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
 
 	if (spd.nr_pages)
-		return splice_to_pipe(pipe, &spd);
+		error = splice_to_pipe(pipe, &spd);
 
+	splice_shrink_spd(pipe, &spd);
 	return error;
 }
 
@@ -559,9 +594,9 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
 	unsigned int nr_pages;
 	unsigned int nr_freed;
 	size_t offset;
-	struct page *pages[PIPE_BUFFERS];
-	struct partial_page partial[PIPE_BUFFERS];
-	struct iovec vec[PIPE_BUFFERS];
+	struct page *pages[PIPE_DEF_BUFFERS];
+	struct partial_page partial[PIPE_DEF_BUFFERS];
+	struct iovec *vec, __vec[PIPE_DEF_BUFFERS];
 	pgoff_t index;
 	ssize_t res;
 	size_t this_len;
@@ -575,11 +610,22 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
 		.spd_release = spd_release_page,
 	};
 
+	if (splice_grow_spd(pipe, &spd))
+		return -ENOMEM;
+
+	res = -ENOMEM;
+	vec = __vec;
+	if (pipe->buffers > PIPE_DEF_BUFFERS) {
+		vec = kmalloc(pipe->buffers * sizeof(struct iovec), GFP_KERNEL);
+		if (!vec)
+			goto shrink_ret;
+	}
+
 	index = *ppos >> PAGE_CACHE_SHIFT;
 	offset = *ppos & ~PAGE_CACHE_MASK;
 	nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 
-	for (i = 0; i < nr_pages && i < PIPE_BUFFERS && len; i++) {
+	for (i = 0; i < nr_pages && i < pipe->buffers && len; i++) {
 		struct page *page;
 
 		page = alloc_page(GFP_USER);
@@ -590,7 +636,7 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
 		this_len = min_t(size_t, len, PAGE_CACHE_SIZE - offset);
 		vec[i].iov_base = (void __user *) page_address(page);
 		vec[i].iov_len = this_len;
-		pages[i] = page;
+		spd.pages[i] = page;
 		spd.nr_pages++;
 		len -= this_len;
 		offset = 0;
@@ -609,11 +655,11 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
 	nr_freed = 0;
 	for (i = 0; i < spd.nr_pages; i++) {
 		this_len = min_t(size_t, vec[i].iov_len, res);
-		partial[i].offset = 0;
-		partial[i].len = this_len;
+		spd.partial[i].offset = 0;
+		spd.partial[i].len = this_len;
 		if (!this_len) {
-			__free_page(pages[i]);
-			pages[i] = NULL;
+			__free_page(spd.pages[i]);
+			spd.pages[i] = NULL;
 			nr_freed++;
 		}
 		res -= this_len;
@@ -624,13 +670,18 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
 	if (res > 0)
 		*ppos += res;
 
+shrink_ret:
+	if (vec != __vec)
+		kfree(vec);
+	splice_shrink_spd(pipe, &spd);
 	return res;
 
 err:
 	for (i = 0; i < spd.nr_pages; i++)
-		__free_page(pages[i]);
+		__free_page(spd.pages[i]);
 
-	return error;
+	res = error;
+	goto shrink_ret;
 }
 EXPORT_SYMBOL(default_file_splice_read);
 
@@ -783,7 +834,7 @@ int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
 		if (!buf->len) {
 			buf->ops = NULL;
 			ops->release(pipe, buf);
-			pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1);
+			pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
 			pipe->nrbufs--;
 			if (pipe->inode)
 				sd->need_wakeup = true;
@@ -1210,7 +1261,7 @@ out_release:
 	 * If we did an incomplete transfer we must release
 	 * the pipe buffers in question:
 	 */
-	for (i = 0; i < PIPE_BUFFERS; i++) {
+	for (i = 0; i < pipe->buffers; i++) {
 		struct pipe_buffer *buf = pipe->bufs + i;
 
 		if (buf->ops) {
@@ -1231,7 +1282,8 @@ static int direct_splice_actor(struct pipe_inode_info *pipe,
 {
 	struct file *file = sd->u.file;
 
-	return do_splice_from(pipe, file, &sd->pos, sd->total_len, sd->flags);
+	return do_splice_from(pipe, file, &file->f_pos, sd->total_len,
+			      sd->flags);
 }
 
 /**
@@ -1320,8 +1372,7 @@ static long do_splice(struct file *in, loff_t __user *off_in,
 		if (off_in)
 			return -ESPIPE;
 		if (off_out) {
-			if (!out->f_op || !out->f_op->llseek ||
-			    out->f_op->llseek == no_llseek)
+			if (!(out->f_mode & FMODE_PWRITE))
 				return -EINVAL;
 			if (copy_from_user(&offset, off_out, sizeof(loff_t)))
 				return -EFAULT;
@@ -1341,8 +1392,7 @@ static long do_splice(struct file *in, loff_t __user *off_in,
 		if (off_out)
 			return -ESPIPE;
 		if (off_in) {
-			if (!in->f_op || !in->f_op->llseek ||
-			    in->f_op->llseek == no_llseek)
+			if (!(in->f_mode & FMODE_PREAD))
 				return -EINVAL;
 			if (copy_from_user(&offset, off_in, sizeof(loff_t)))
 				return -EFAULT;
@@ -1370,7 +1420,8 @@ static long do_splice(struct file *in, loff_t __user *off_in,
  */
 static int get_iovec_page_array(const struct iovec __user *iov,
 				unsigned int nr_vecs, struct page **pages,
-				struct partial_page *partial, int aligned)
+				struct partial_page *partial, int aligned,
+				unsigned int pipe_buffers)
 {
 	int buffers = 0, error = 0;
 
@@ -1413,8 +1464,8 @@ static int get_iovec_page_array(const struct iovec __user *iov,
 			break;
 
 		npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
-		if (npages > PIPE_BUFFERS - buffers)
-			npages = PIPE_BUFFERS - buffers;
+		if (npages > pipe_buffers - buffers)
+			npages = pipe_buffers - buffers;
 
 		error = get_user_pages_fast((unsigned long)base, npages,
 					0, &pages[buffers]);
@@ -1449,7 +1500,7 @@ static int get_iovec_page_array(const struct iovec __user *iov,
 		 * or if we mapped the max number of pages that we have
 		 * room for.
 		 */
-		if (error < npages || buffers == PIPE_BUFFERS)
+		if (error < npages || buffers == pipe_buffers)
 			break;
 
 		nr_vecs--;
@@ -1592,8 +1643,8 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
 			     unsigned long nr_segs, unsigned int flags)
 {
 	struct pipe_inode_info *pipe;
-	struct page *pages[PIPE_BUFFERS];
-	struct partial_page partial[PIPE_BUFFERS];
+	struct page *pages[PIPE_DEF_BUFFERS];
+	struct partial_page partial[PIPE_DEF_BUFFERS];
 	struct splice_pipe_desc spd = {
 		.pages = pages,
 		.partial = partial,
@@ -1601,17 +1652,25 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
 		.ops = &user_page_pipe_buf_ops,
 		.spd_release = spd_release_page,
 	};
+	long ret;
 
 	pipe = pipe_info(file->f_path.dentry->d_inode);
 	if (!pipe)
 		return -EBADF;
 
-	spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial,
-					    flags & SPLICE_F_GIFT);
+	if (splice_grow_spd(pipe, &spd))
+		return -ENOMEM;
+
+	spd.nr_pages = get_iovec_page_array(iov, nr_segs, spd.pages,
+					    spd.partial, flags & SPLICE_F_GIFT,
+					    pipe->buffers);
 	if (spd.nr_pages <= 0)
-		return spd.nr_pages;
+		ret = spd.nr_pages;
+	else
+		ret = splice_to_pipe(pipe, &spd);
 
-	return splice_to_pipe(pipe, &spd);
+	splice_shrink_spd(pipe, &spd);
+	return ret;
 }
 
 /*
@@ -1737,13 +1796,13 @@ static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
 	 * Check ->nrbufs without the inode lock first. This function
 	 * is speculative anyways, so missing one is ok.
 	 */
-	if (pipe->nrbufs < PIPE_BUFFERS)
+	if (pipe->nrbufs < pipe->buffers)
 		return 0;
 
 	ret = 0;
 	pipe_lock(pipe);
 
-	while (pipe->nrbufs >= PIPE_BUFFERS) {
+	while (pipe->nrbufs >= pipe->buffers) {
 		if (!pipe->readers) {
 			send_sig(SIGPIPE, current, 0);
 			ret = -EPIPE;
@@ -1809,7 +1868,7 @@ retry:
 		 * Cannot make any progress, because either the input
 		 * pipe is empty or the output pipe is full.
 		 */
-		if (!ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS) {
+		if (!ipipe->nrbufs || opipe->nrbufs >= opipe->buffers) {
 			/* Already processed some buffers, break */
 			if (ret)
 				break;
@@ -1830,7 +1889,7 @@ retry:
 		}
 
 		ibuf = ipipe->bufs + ipipe->curbuf;
-		nbuf = (opipe->curbuf + opipe->nrbufs) % PIPE_BUFFERS;
+		nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1);
 		obuf = opipe->bufs + nbuf;
 
 		if (len >= ibuf->len) {
@@ -1840,7 +1899,7 @@ retry:
 			*obuf = *ibuf;
 			ibuf->ops = NULL;
 			opipe->nrbufs++;
-			ipipe->curbuf = (ipipe->curbuf + 1) % PIPE_BUFFERS;
+			ipipe->curbuf = (ipipe->curbuf + 1) & (ipipe->buffers - 1);
 			ipipe->nrbufs--;
 			input_wakeup = true;
 		} else {
@@ -1913,11 +1972,11 @@ static int link_pipe(struct pipe_inode_info *ipipe,
 		 * If we have iterated all input buffers or ran out of
 		 * output room, break.
 		 */
-		if (i >= ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS)
+		if (i >= ipipe->nrbufs || opipe->nrbufs >= opipe->buffers)
 			break;
 
-		ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1));
-		nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1);
+		ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (ipipe->buffers-1));
+		nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1);
 
 		/*
 		 * Get a reference to this pipe buffer,
diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig
index 25a00d19d686..cc6ce8a84c21 100644
--- a/fs/squashfs/Kconfig
+++ b/fs/squashfs/Kconfig
@@ -26,6 +26,17 @@ config SQUASHFS
 
 	  If unsure, say N.
 
+config SQUASHFS_XATTRS
+	bool "Squashfs XATTR support"
+	depends on SQUASHFS
+	default n
+	help
+	  Saying Y here includes support for extended attributes (xattrs).
+	  Xattrs are name:value pairs associated with inodes by
+	  the kernel or by users (see the attr(5) manual page).
+
+	  If unsure, say N.
+
 config SQUASHFS_EMBEDDED
 
 	bool "Additional option for memory-constrained systems" 
diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile
index 70e3244fa30f..2cee3e9fa452 100644
--- a/fs/squashfs/Makefile
+++ b/fs/squashfs/Makefile
@@ -4,4 +4,6 @@
 
 obj-$(CONFIG_SQUASHFS) += squashfs.o
 squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o
-squashfs-y += namei.o super.o symlink.o
+squashfs-y += namei.o super.o symlink.o zlib_wrapper.o decompressor.o
+squashfs-$(CONFIG_SQUASHFS_XATTRS) += xattr.o xattr_id.o
+
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index 2a7960310349..653c030eb840 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -29,15 +29,14 @@
 #include <linux/fs.h>
 #include <linux/vfs.h>
 #include <linux/slab.h>
-#include <linux/mutex.h>
 #include <linux/string.h>
 #include <linux/buffer_head.h>
-#include <linux/zlib.h>
 
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
 #include "squashfs_fs_i.h"
 #include "squashfs.h"
+#include "decompressor.h"
 
 /*
  * Read the metadata block length, this is stored in the first two
@@ -88,9 +87,8 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
 	u64 cur_index = index >> msblk->devblksize_log2;
 	int bytes, compressed, b = 0, k = 0, page = 0, avail;
 
-
-	bh = kcalloc((msblk->block_size >> msblk->devblksize_log2) + 1,
-				sizeof(*bh), GFP_KERNEL);
+	bh = kcalloc(((srclength + msblk->devblksize - 1)
+		>> msblk->devblksize_log2) + 1, sizeof(*bh), GFP_KERNEL);
 	if (bh == NULL)
 		return -ENOMEM;
 
@@ -153,72 +151,10 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
 	}
 
 	if (compressed) {
-		int zlib_err = 0, zlib_init = 0;
-
-		/*
-		 * Uncompress block.
-		 */
-
-		mutex_lock(&msblk->read_data_mutex);
-
-		msblk->stream.avail_out = 0;
-		msblk->stream.avail_in = 0;
-
-		bytes = length;
-		do {
-			if (msblk->stream.avail_in == 0 && k < b) {
-				avail = min(bytes, msblk->devblksize - offset);
-				bytes -= avail;
-				wait_on_buffer(bh[k]);
-				if (!buffer_uptodate(bh[k]))
-					goto release_mutex;
-
-				if (avail == 0) {
-					offset = 0;
-					put_bh(bh[k++]);
-					continue;
-				}
-
-				msblk->stream.next_in = bh[k]->b_data + offset;
-				msblk->stream.avail_in = avail;
-				offset = 0;
-			}
-
-			if (msblk->stream.avail_out == 0 && page < pages) {
-				msblk->stream.next_out = buffer[page++];
-				msblk->stream.avail_out = PAGE_CACHE_SIZE;
-			}
-
-			if (!zlib_init) {
-				zlib_err = zlib_inflateInit(&msblk->stream);
-				if (zlib_err != Z_OK) {
-					ERROR("zlib_inflateInit returned"
-						" unexpected result 0x%x,"
-						" srclength %d\n", zlib_err,
-						srclength);
-					goto release_mutex;
-				}
-				zlib_init = 1;
-			}
-
-			zlib_err = zlib_inflate(&msblk->stream, Z_SYNC_FLUSH);
-
-			if (msblk->stream.avail_in == 0 && k < b)
-				put_bh(bh[k++]);
-		} while (zlib_err == Z_OK);
-
-		if (zlib_err != Z_STREAM_END) {
-			ERROR("zlib_inflate error, data probably corrupt\n");
-			goto release_mutex;
-		}
-
-		zlib_err = zlib_inflateEnd(&msblk->stream);
-		if (zlib_err != Z_OK) {
-			ERROR("zlib_inflate error, data probably corrupt\n");
-			goto release_mutex;
-		}
-		length = msblk->stream.total_out;
-		mutex_unlock(&msblk->read_data_mutex);
+		length = squashfs_decompress(msblk, buffer, bh, b, offset,
+			 length, srclength, pages);
+		if (length < 0)
+			goto read_failure;
 	} else {
 		/*
 		 * Block is uncompressed.
@@ -255,9 +191,6 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
 	kfree(bh);
 	return length;
 
-release_mutex:
-	mutex_unlock(&msblk->read_data_mutex);
-
 block_release:
 	for (; k < b; k++)
 		put_bh(bh[k]);
diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c
index 40c98fa6b5d6..57314bee9059 100644
--- a/fs/squashfs/cache.c
+++ b/fs/squashfs/cache.c
@@ -51,7 +51,6 @@
 #include <linux/sched.h>
 #include <linux/spinlock.h>
 #include <linux/wait.h>
-#include <linux/zlib.h>
 #include <linux/pagemap.h>
 
 #include "squashfs_fs.h"
diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c
new file mode 100644
index 000000000000..157478da6ac9
--- /dev/null
+++ b/fs/squashfs/decompressor.c
@@ -0,0 +1,68 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * decompressor.c
+ */
+
+#include <linux/types.h>
+#include <linux/mutex.h>
+#include <linux/buffer_head.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "decompressor.h"
+#include "squashfs.h"
+
+/*
+ * This file (and decompressor.h) implements a decompressor framework for
+ * Squashfs, allowing multiple decompressors to be easily supported
+ */
+
+static const struct squashfs_decompressor squashfs_lzma_unsupported_comp_ops = {
+	NULL, NULL, NULL, LZMA_COMPRESSION, "lzma", 0
+};
+
+static const struct squashfs_decompressor squashfs_lzo_unsupported_comp_ops = {
+	NULL, NULL, NULL, LZO_COMPRESSION, "lzo", 0
+};
+
+static const struct squashfs_decompressor squashfs_unknown_comp_ops = {
+	NULL, NULL, NULL, 0, "unknown", 0
+};
+
+static const struct squashfs_decompressor *decompressor[] = {
+	&squashfs_zlib_comp_ops,
+	&squashfs_lzma_unsupported_comp_ops,
+	&squashfs_lzo_unsupported_comp_ops,
+	&squashfs_unknown_comp_ops
+};
+
+
+const struct squashfs_decompressor *squashfs_lookup_decompressor(int id)
+{
+	int i;
+
+	for (i = 0; decompressor[i]->id; i++)
+		if (id == decompressor[i]->id)
+			break;
+
+	return decompressor[i];
+}
diff --git a/fs/squashfs/decompressor.h b/fs/squashfs/decompressor.h
new file mode 100644
index 000000000000..7425f80783f6
--- /dev/null
+++ b/fs/squashfs/decompressor.h
@@ -0,0 +1,55 @@
+#ifndef DECOMPRESSOR_H
+#define DECOMPRESSOR_H
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * decompressor.h
+ */
+
+struct squashfs_decompressor {
+	void	*(*init)(struct squashfs_sb_info *);
+	void	(*free)(void *);
+	int	(*decompress)(struct squashfs_sb_info *, void **,
+		struct buffer_head **, int, int, int, int, int);
+	int	id;
+	char	*name;
+	int	supported;
+};
+
+static inline void *squashfs_decompressor_init(struct squashfs_sb_info *msblk)
+{
+	return msblk->decompressor->init(msblk);
+}
+
+static inline void squashfs_decompressor_free(struct squashfs_sb_info *msblk,
+	void *s)
+{
+	if (msblk->decompressor)
+		msblk->decompressor->free(s);
+}
+
+static inline int squashfs_decompress(struct squashfs_sb_info *msblk,
+	void **buffer, struct buffer_head **bh, int b, int offset, int length,
+	int srclength, int pages)
+{
+	return msblk->decompressor->decompress(msblk, buffer, bh, b, offset,
+		length, srclength, pages);
+}
+#endif
diff --git a/fs/squashfs/dir.c b/fs/squashfs/dir.c
index 566b0eaed868..12b933ac6585 100644
--- a/fs/squashfs/dir.c
+++ b/fs/squashfs/dir.c
@@ -30,7 +30,6 @@
 #include <linux/fs.h>
 #include <linux/vfs.h>
 #include <linux/slab.h>
-#include <linux/zlib.h>
 
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
diff --git a/fs/squashfs/export.c b/fs/squashfs/export.c
index 2b1b8fe5e037..7f93d5a9ee05 100644
--- a/fs/squashfs/export.c
+++ b/fs/squashfs/export.c
@@ -39,7 +39,6 @@
 #include <linux/vfs.h>
 #include <linux/dcache.h>
 #include <linux/exportfs.h>
-#include <linux/zlib.h>
 #include <linux/slab.h>
 
 #include "squashfs_fs.h"
diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c
index 717767d831df..a25c5060bdcb 100644
--- a/fs/squashfs/file.c
+++ b/fs/squashfs/file.c
@@ -47,7 +47,6 @@
 #include <linux/string.h>
 #include <linux/pagemap.h>
 #include <linux/mutex.h>
-#include <linux/zlib.h>
 
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
diff --git a/fs/squashfs/fragment.c b/fs/squashfs/fragment.c
index b5a2c15bbbc7..7c90bbd6879d 100644
--- a/fs/squashfs/fragment.c
+++ b/fs/squashfs/fragment.c
@@ -36,7 +36,6 @@
 #include <linux/fs.h>
 #include <linux/vfs.h>
 #include <linux/slab.h>
-#include <linux/zlib.h>
 
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
diff --git a/fs/squashfs/id.c b/fs/squashfs/id.c
index 3795b837ba28..b7f64bcd2b70 100644
--- a/fs/squashfs/id.c
+++ b/fs/squashfs/id.c
@@ -34,7 +34,6 @@
 #include <linux/fs.h>
 #include <linux/vfs.h>
 #include <linux/slab.h>
-#include <linux/zlib.h>
 
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c
index 9101dbde39ec..62e63ad25075 100644
--- a/fs/squashfs/inode.c
+++ b/fs/squashfs/inode.c
@@ -40,12 +40,13 @@
 
 #include <linux/fs.h>
 #include <linux/vfs.h>
-#include <linux/zlib.h>
+#include <linux/xattr.h>
 
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
 #include "squashfs_fs_i.h"
 #include "squashfs.h"
+#include "xattr.h"
 
 /*
  * Initialise VFS inode with the base inode information common to all
@@ -112,6 +113,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
 	int err, type, offset = SQUASHFS_INODE_OFFSET(ino);
 	union squashfs_inode squashfs_ino;
 	struct squashfs_base_inode *sqshb_ino = &squashfs_ino.base;
+	int xattr_id = SQUASHFS_INVALID_XATTR;
 
 	TRACE("Entered squashfs_read_inode\n");
 
@@ -200,8 +202,10 @@ int squashfs_read_inode(struct inode *inode, long long ino)
 			frag_offset = 0;
 		}
 
+		xattr_id = le32_to_cpu(sqsh_ino->xattr);
 		inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
 		inode->i_size = le64_to_cpu(sqsh_ino->file_size);
+		inode->i_op = &squashfs_inode_ops;
 		inode->i_fop = &generic_ro_fops;
 		inode->i_mode |= S_IFREG;
 		inode->i_blocks = ((inode->i_size -
@@ -252,6 +256,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
 		if (err < 0)
 			goto failed_read;
 
+		xattr_id = le32_to_cpu(sqsh_ino->xattr);
 		inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
 		inode->i_size = le32_to_cpu(sqsh_ino->file_size);
 		inode->i_op = &squashfs_dir_inode_ops;
@@ -281,21 +286,33 @@ int squashfs_read_inode(struct inode *inode, long long ino)
 
 		inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
 		inode->i_size = le32_to_cpu(sqsh_ino->symlink_size);
-		inode->i_op = &page_symlink_inode_operations;
+		inode->i_op = &squashfs_symlink_inode_ops;
 		inode->i_data.a_ops = &squashfs_symlink_aops;
 		inode->i_mode |= S_IFLNK;
 		squashfs_i(inode)->start = block;
 		squashfs_i(inode)->offset = offset;
 
+		if (type == SQUASHFS_LSYMLINK_TYPE) {
+			__le32 xattr;
+
+			err = squashfs_read_metadata(sb, NULL, &block,
+						&offset, inode->i_size);
+			if (err < 0)
+				goto failed_read;
+			err = squashfs_read_metadata(sb, &xattr, &block,
+						&offset, sizeof(xattr));
+			if (err < 0)
+				goto failed_read;
+			xattr_id = le32_to_cpu(xattr);
+		}
+
 		TRACE("Symbolic link inode %x:%x, start_block %llx, offset "
 				"%x\n", SQUASHFS_INODE_BLK(ino), offset,
 				block, offset);
 		break;
 	}
 	case SQUASHFS_BLKDEV_TYPE:
-	case SQUASHFS_CHRDEV_TYPE:
-	case SQUASHFS_LBLKDEV_TYPE:
-	case SQUASHFS_LCHRDEV_TYPE: {
+	case SQUASHFS_CHRDEV_TYPE: {
 		struct squashfs_dev_inode *sqsh_ino = &squashfs_ino.dev;
 		unsigned int rdev;
 
@@ -316,10 +333,32 @@ int squashfs_read_inode(struct inode *inode, long long ino)
 				SQUASHFS_INODE_BLK(ino), offset, rdev);
 		break;
 	}
+	case SQUASHFS_LBLKDEV_TYPE:
+	case SQUASHFS_LCHRDEV_TYPE: {
+		struct squashfs_ldev_inode *sqsh_ino = &squashfs_ino.ldev;
+		unsigned int rdev;
+
+		err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
+				sizeof(*sqsh_ino));
+		if (err < 0)
+			goto failed_read;
+
+		if (type == SQUASHFS_LCHRDEV_TYPE)
+			inode->i_mode |= S_IFCHR;
+		else
+			inode->i_mode |= S_IFBLK;
+		xattr_id = le32_to_cpu(sqsh_ino->xattr);
+		inode->i_op = &squashfs_inode_ops;
+		inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
+		rdev = le32_to_cpu(sqsh_ino->rdev);
+		init_special_inode(inode, inode->i_mode, new_decode_dev(rdev));
+
+		TRACE("Device inode %x:%x, rdev %x\n",
+				SQUASHFS_INODE_BLK(ino), offset, rdev);
+		break;
+	}
 	case SQUASHFS_FIFO_TYPE:
-	case SQUASHFS_SOCKET_TYPE:
-	case SQUASHFS_LFIFO_TYPE:
-	case SQUASHFS_LSOCKET_TYPE: {
+	case SQUASHFS_SOCKET_TYPE: {
 		struct squashfs_ipc_inode *sqsh_ino = &squashfs_ino.ipc;
 
 		err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
@@ -335,14 +374,52 @@ int squashfs_read_inode(struct inode *inode, long long ino)
 		init_special_inode(inode, inode->i_mode, 0);
 		break;
 	}
+	case SQUASHFS_LFIFO_TYPE:
+	case SQUASHFS_LSOCKET_TYPE: {
+		struct squashfs_lipc_inode *sqsh_ino = &squashfs_ino.lipc;
+
+		err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
+				sizeof(*sqsh_ino));
+		if (err < 0)
+			goto failed_read;
+
+		if (type == SQUASHFS_LFIFO_TYPE)
+			inode->i_mode |= S_IFIFO;
+		else
+			inode->i_mode |= S_IFSOCK;
+		xattr_id = le32_to_cpu(sqsh_ino->xattr);
+		inode->i_op = &squashfs_inode_ops;
+		inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
+		init_special_inode(inode, inode->i_mode, 0);
+		break;
+	}
 	default:
 		ERROR("Unknown inode type %d in squashfs_iget!\n", type);
 		return -EINVAL;
 	}
 
+	if (xattr_id != SQUASHFS_INVALID_XATTR && msblk->xattr_id_table) {
+		err = squashfs_xattr_lookup(sb, xattr_id,
+					&squashfs_i(inode)->xattr_count,
+					&squashfs_i(inode)->xattr_size,
+					&squashfs_i(inode)->xattr);
+		if (err < 0)
+			goto failed_read;
+		inode->i_blocks += ((squashfs_i(inode)->xattr_size - 1) >> 9)
+				+ 1;
+	} else
+		squashfs_i(inode)->xattr_count = 0;
+
 	return 0;
 
 failed_read:
 	ERROR("Unable to read inode 0x%llx\n", ino);
 	return err;
 }
+
+
+const struct inode_operations squashfs_inode_ops = {
+	.getxattr = generic_getxattr,
+	.listxattr = squashfs_listxattr
+};
+
diff --git a/fs/squashfs/namei.c b/fs/squashfs/namei.c
index 9e398653b22b..7a9464d08cf6 100644
--- a/fs/squashfs/namei.c
+++ b/fs/squashfs/namei.c
@@ -57,12 +57,13 @@
 #include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/dcache.h>
-#include <linux/zlib.h>
+#include <linux/xattr.h>
 
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
 #include "squashfs_fs_i.h"
 #include "squashfs.h"
+#include "xattr.h"
 
 /*
  * Lookup name in the directory index, returning the location of the metadata
@@ -238,5 +239,7 @@ failed:
 
 
 const struct inode_operations squashfs_dir_inode_ops = {
-	.lookup = squashfs_lookup
+	.lookup = squashfs_lookup,
+	.getxattr = generic_getxattr,
+	.listxattr = squashfs_listxattr
 };
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
index 0e9feb6adf7e..733a17c42945 100644
--- a/fs/squashfs/squashfs.h
+++ b/fs/squashfs/squashfs.h
@@ -51,6 +51,9 @@ extern struct squashfs_cache_entry *squashfs_get_datablock(struct super_block *,
 				u64, int);
 extern int squashfs_read_table(struct super_block *, void *, u64, int);
 
+/* decompressor.c */
+extern const struct squashfs_decompressor *squashfs_lookup_decompressor(int);
+
 /* export.c */
 extern __le64 *squashfs_read_inode_lookup_table(struct super_block *, u64,
 				unsigned int);
@@ -70,8 +73,11 @@ extern struct inode *squashfs_iget(struct super_block *, long long,
 				unsigned int);
 extern int squashfs_read_inode(struct inode *, long long);
 
+/* xattr.c */
+extern ssize_t squashfs_listxattr(struct dentry *, char *, size_t);
+
 /*
- * Inodes and files operations
+ * Inodes, files,  decompressor and xattr operations
  */
 
 /* dir.c */
@@ -83,8 +89,18 @@ extern const struct export_operations squashfs_export_ops;
 /* file.c */
 extern const struct address_space_operations squashfs_aops;
 
+/* inode.c */
+extern const struct inode_operations squashfs_inode_ops;
+
 /* namei.c */
 extern const struct inode_operations squashfs_dir_inode_ops;
 
 /* symlink.c */
 extern const struct address_space_operations squashfs_symlink_aops;
+extern const struct inode_operations squashfs_symlink_inode_ops;
+
+/* xattr.c */
+extern const struct xattr_handler *squashfs_xattr_handlers[];
+
+/* zlib_wrapper.c */
+extern const struct squashfs_decompressor squashfs_zlib_comp_ops;
diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
index 283daafc568e..8eabb808b78d 100644
--- a/fs/squashfs/squashfs_fs.h
+++ b/fs/squashfs/squashfs_fs.h
@@ -46,6 +46,7 @@
 #define SQUASHFS_NAME_LEN		256
 
 #define SQUASHFS_INVALID_FRAG		(0xffffffffU)
+#define SQUASHFS_INVALID_XATTR		(0xffffffffU)
 #define SQUASHFS_INVALID_BLK		(-1LL)
 
 /* Filesystem flags */
@@ -96,6 +97,13 @@
 #define SQUASHFS_LFIFO_TYPE		13
 #define SQUASHFS_LSOCKET_TYPE		14
 
+/* Xattr types */
+#define SQUASHFS_XATTR_USER             0
+#define SQUASHFS_XATTR_TRUSTED          1
+#define SQUASHFS_XATTR_SECURITY         2
+#define SQUASHFS_XATTR_VALUE_OOL        256
+#define SQUASHFS_XATTR_PREFIX_MASK      0xff
+
 /* Flag whether block is compressed or uncompressed, bit is set if block is
  * uncompressed */
 #define SQUASHFS_COMPRESSED_BIT		(1 << 15)
@@ -174,6 +182,24 @@
 
 #define SQUASHFS_ID_BLOCK_BYTES(A)	(SQUASHFS_ID_BLOCKS(A) *\
 					sizeof(u64))
+/* xattr id lookup table defines */
+#define SQUASHFS_XATTR_BYTES(A)		((A) * sizeof(struct squashfs_xattr_id))
+
+#define SQUASHFS_XATTR_BLOCK(A)		(SQUASHFS_XATTR_BYTES(A) / \
+					SQUASHFS_METADATA_SIZE)
+
+#define SQUASHFS_XATTR_BLOCK_OFFSET(A)	(SQUASHFS_XATTR_BYTES(A) % \
+					SQUASHFS_METADATA_SIZE)
+
+#define SQUASHFS_XATTR_BLOCKS(A)	((SQUASHFS_XATTR_BYTES(A) + \
+					SQUASHFS_METADATA_SIZE - 1) / \
+					SQUASHFS_METADATA_SIZE)
+
+#define SQUASHFS_XATTR_BLOCK_BYTES(A)	(SQUASHFS_XATTR_BLOCKS(A) *\
+					sizeof(u64))
+#define SQUASHFS_XATTR_BLK(A)		((unsigned int) ((A) >> 16))
+
+#define SQUASHFS_XATTR_OFFSET(A)	((unsigned int) ((A) & 0xffff))
 
 /* cached data constants for filesystem */
 #define SQUASHFS_CACHED_BLKS		8
@@ -183,8 +209,6 @@
 #define SQUASHFS_MAX_FILE_SIZE		(1LL << \
 					(SQUASHFS_MAX_FILE_SIZE_LOG - 2))
 
-#define SQUASHFS_MARKER_BYTE		0xff
-
 /* meta index cache */
 #define SQUASHFS_META_INDEXES	(SQUASHFS_METADATA_SIZE / sizeof(unsigned int))
 #define SQUASHFS_META_ENTRIES	127
@@ -211,7 +235,9 @@ struct meta_index {
 /*
  * definitions for structures on disk
  */
-#define ZLIB_COMPRESSION	 1
+#define ZLIB_COMPRESSION	1
+#define LZMA_COMPRESSION	2
+#define LZO_COMPRESSION		3
 
 struct squashfs_super_block {
 	__le32			s_magic;
@@ -228,7 +254,7 @@ struct squashfs_super_block {
 	__le64			root_inode;
 	__le64			bytes_used;
 	__le64			id_table_start;
-	__le64			xattr_table_start;
+	__le64			xattr_id_table_start;
 	__le64			inode_table_start;
 	__le64			directory_table_start;
 	__le64			fragment_table_start;
@@ -261,6 +287,17 @@ struct squashfs_ipc_inode {
 	__le32			nlink;
 };
 
+struct squashfs_lipc_inode {
+	__le16			inode_type;
+	__le16			mode;
+	__le16			uid;
+	__le16			guid;
+	__le32			mtime;
+	__le32	 		inode_number;
+	__le32			nlink;
+	__le32			xattr;
+};
+
 struct squashfs_dev_inode {
 	__le16			inode_type;
 	__le16			mode;
@@ -272,6 +309,18 @@ struct squashfs_dev_inode {
 	__le32			rdev;
 };
 
+struct squashfs_ldev_inode {
+	__le16			inode_type;
+	__le16			mode;
+	__le16			uid;
+	__le16			guid;
+	__le32			mtime;
+	__le32	 		inode_number;
+	__le32			nlink;
+	__le32			rdev;
+	__le32			xattr;
+};
+
 struct squashfs_symlink_inode {
 	__le16			inode_type;
 	__le16			mode;
@@ -349,12 +398,14 @@ struct squashfs_ldir_inode {
 union squashfs_inode {
 	struct squashfs_base_inode		base;
 	struct squashfs_dev_inode		dev;
+	struct squashfs_ldev_inode		ldev;
 	struct squashfs_symlink_inode		symlink;
 	struct squashfs_reg_inode		reg;
 	struct squashfs_lreg_inode		lreg;
 	struct squashfs_dir_inode		dir;
 	struct squashfs_ldir_inode		ldir;
 	struct squashfs_ipc_inode		ipc;
+	struct squashfs_lipc_inode		lipc;
 };
 
 struct squashfs_dir_entry {
@@ -377,4 +428,27 @@ struct squashfs_fragment_entry {
 	unsigned int		unused;
 };
 
+struct squashfs_xattr_entry {
+	__le16			type;
+	__le16			size;
+	char			data[0];
+};
+
+struct squashfs_xattr_val {
+	__le32			vsize;
+	char			value[0];
+};
+
+struct squashfs_xattr_id {
+	__le64			xattr;
+	__le32			count;
+	__le32			size;
+};
+
+struct squashfs_xattr_id_table {
+	__le64			xattr_table_start;
+	__le32			xattr_ids;
+	__le32			unused;
+};
+
 #endif
diff --git a/fs/squashfs/squashfs_fs_i.h b/fs/squashfs/squashfs_fs_i.h
index fbfca30c0c68..d3e3a37f28a1 100644
--- a/fs/squashfs/squashfs_fs_i.h
+++ b/fs/squashfs/squashfs_fs_i.h
@@ -26,6 +26,9 @@
 struct squashfs_inode_info {
 	u64		start;
 	int		offset;
+	u64		xattr;
+	unsigned int	xattr_size;
+	int		xattr_count;
 	union {
 		struct {
 			u64		fragment_block;
diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h
index c8c65614dd1c..d9037a5215f0 100644
--- a/fs/squashfs/squashfs_fs_sb.h
+++ b/fs/squashfs/squashfs_fs_sb.h
@@ -52,25 +52,28 @@ struct squashfs_cache_entry {
 };
 
 struct squashfs_sb_info {
-	int			devblksize;
-	int			devblksize_log2;
-	struct squashfs_cache	*block_cache;
-	struct squashfs_cache	*fragment_cache;
-	struct squashfs_cache	*read_page;
-	int			next_meta_index;
-	__le64			*id_table;
-	__le64			*fragment_index;
-	unsigned int		*fragment_index_2;
-	struct mutex		read_data_mutex;
-	struct mutex		meta_index_mutex;
-	struct meta_index	*meta_index;
-	z_stream		stream;
-	__le64			*inode_lookup_table;
-	u64			inode_table;
-	u64			directory_table;
-	unsigned int		block_size;
-	unsigned short		block_log;
-	long long		bytes_used;
-	unsigned int		inodes;
+	const struct squashfs_decompressor	*decompressor;
+	int					devblksize;
+	int					devblksize_log2;
+	struct squashfs_cache			*block_cache;
+	struct squashfs_cache			*fragment_cache;
+	struct squashfs_cache			*read_page;
+	int					next_meta_index;
+	__le64					*id_table;
+	__le64					*fragment_index;
+	__le64					*xattr_id_table;
+	struct mutex				read_data_mutex;
+	struct mutex				meta_index_mutex;
+	struct meta_index			*meta_index;
+	void					*stream;
+	__le64					*inode_lookup_table;
+	u64					inode_table;
+	u64					directory_table;
+	u64					xattr_table;
+	unsigned int				block_size;
+	unsigned short				block_log;
+	long long				bytes_used;
+	unsigned int				inodes;
+	int					xattr_ids;
 };
 #endif
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 6c197ef53add..88b4f8606652 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -35,34 +35,43 @@
 #include <linux/pagemap.h>
 #include <linux/init.h>
 #include <linux/module.h>
-#include <linux/zlib.h>
 #include <linux/magic.h>
+#include <linux/xattr.h>
 
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
 #include "squashfs_fs_i.h"
 #include "squashfs.h"
+#include "decompressor.h"
+#include "xattr.h"
 
 static struct file_system_type squashfs_fs_type;
 static const struct super_operations squashfs_super_ops;
 
-static int supported_squashfs_filesystem(short major, short minor, short comp)
+static const struct squashfs_decompressor *supported_squashfs_filesystem(short
+	major, short minor, short id)
 {
+	const struct squashfs_decompressor *decompressor;
+
 	if (major < SQUASHFS_MAJOR) {
 		ERROR("Major/Minor mismatch, older Squashfs %d.%d "
 			"filesystems are unsupported\n", major, minor);
-		return -EINVAL;
+		return NULL;
 	} else if (major > SQUASHFS_MAJOR || minor > SQUASHFS_MINOR) {
 		ERROR("Major/Minor mismatch, trying to mount newer "
 			"%d.%d filesystem\n", major, minor);
 		ERROR("Please update your kernel\n");
-		return -EINVAL;
+		return NULL;
 	}
 
-	if (comp != ZLIB_COMPRESSION)
-		return -EINVAL;
+	decompressor = squashfs_lookup_decompressor(id);
+	if (!decompressor->supported) {
+		ERROR("Filesystem uses \"%s\" compression. This is not "
+			"supported\n", decompressor->name);
+		return NULL;
+	}
 
-	return 0;
+	return decompressor;
 }
 
 
@@ -75,7 +84,7 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
 	long long root_inode;
 	unsigned short flags;
 	unsigned int fragments;
-	u64 lookup_table_start;
+	u64 lookup_table_start, xattr_id_table_start;
 	int err;
 
 	TRACE("Entered squashfs_fill_superblock\n");
@@ -87,13 +96,6 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
 	}
 	msblk = sb->s_fs_info;
 
-	msblk->stream.workspace = kmalloc(zlib_inflate_workspacesize(),
-		GFP_KERNEL);
-	if (msblk->stream.workspace == NULL) {
-		ERROR("Failed to allocate zlib workspace\n");
-		goto failure;
-	}
-
 	sblk = kzalloc(sizeof(*sblk), GFP_KERNEL);
 	if (sblk == NULL) {
 		ERROR("Failed to allocate squashfs_super_block\n");
@@ -120,32 +122,25 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
 		goto failed_mount;
 	}
 
+	err = -EINVAL;
+
 	/* Check it is a SQUASHFS superblock */
 	sb->s_magic = le32_to_cpu(sblk->s_magic);
 	if (sb->s_magic != SQUASHFS_MAGIC) {
 		if (!silent)
 			ERROR("Can't find a SQUASHFS superblock on %s\n",
 						bdevname(sb->s_bdev, b));
-		err = -EINVAL;
 		goto failed_mount;
 	}
 
-	/* Check the MAJOR & MINOR versions and compression type */
-	err = supported_squashfs_filesystem(le16_to_cpu(sblk->s_major),
+	/* Check the MAJOR & MINOR versions and lookup compression type */
+	msblk->decompressor = supported_squashfs_filesystem(
+			le16_to_cpu(sblk->s_major),
 			le16_to_cpu(sblk->s_minor),
 			le16_to_cpu(sblk->compression));
-	if (err < 0)
+	if (msblk->decompressor == NULL)
 		goto failed_mount;
 
-	err = -EINVAL;
-
-	/*
-	 * Check if there's xattrs in the filesystem.  These are not
-	 * supported in this version, so warn that they will be ignored.
-	 */
-	if (le64_to_cpu(sblk->xattr_table_start) != SQUASHFS_INVALID_BLK)
-		ERROR("Xattrs in filesystem, these will be ignored\n");
-
 	/* Check the filesystem does not extend beyond the end of the
 	   block device */
 	msblk->bytes_used = le64_to_cpu(sblk->bytes_used);
@@ -205,6 +200,10 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
 
 	err = -ENOMEM;
 
+	msblk->stream = squashfs_decompressor_init(msblk);
+	if (msblk->stream == NULL)
+		goto failed_mount;
+
 	msblk->block_cache = squashfs_cache_init("metadata",
 			SQUASHFS_CACHED_BLKS, SQUASHFS_METADATA_SIZE);
 	if (msblk->block_cache == NULL)
@@ -249,7 +248,7 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
 allocate_lookup_table:
 	lookup_table_start = le64_to_cpu(sblk->lookup_table_start);
 	if (lookup_table_start == SQUASHFS_INVALID_BLK)
-		goto allocate_root;
+		goto allocate_xattr_table;
 
 	/* Allocate and read inode lookup table */
 	msblk->inode_lookup_table = squashfs_read_inode_lookup_table(sb,
@@ -262,6 +261,21 @@ allocate_lookup_table:
 
 	sb->s_export_op = &squashfs_export_ops;
 
+allocate_xattr_table:
+	sb->s_xattr = squashfs_xattr_handlers;
+	xattr_id_table_start = le64_to_cpu(sblk->xattr_id_table_start);
+	if (xattr_id_table_start == SQUASHFS_INVALID_BLK)
+		goto allocate_root;
+
+	/* Allocate and read xattr id lookup table */
+	msblk->xattr_id_table = squashfs_read_xattr_id_table(sb,
+		xattr_id_table_start, &msblk->xattr_table, &msblk->xattr_ids);
+	if (IS_ERR(msblk->xattr_id_table)) {
+		err = PTR_ERR(msblk->xattr_id_table);
+		msblk->xattr_id_table = NULL;
+		if (err != -ENOTSUPP)
+			goto failed_mount;
+	}
 allocate_root:
 	root = new_inode(sb);
 	if (!root) {
@@ -271,7 +285,8 @@ allocate_root:
 
 	err = squashfs_read_inode(root, root_inode);
 	if (err) {
-		iget_failed(root);
+		make_bad_inode(root);
+		iput(root);
 		goto failed_mount;
 	}
 	insert_inode_hash(root);
@@ -292,17 +307,17 @@ failed_mount:
 	squashfs_cache_delete(msblk->block_cache);
 	squashfs_cache_delete(msblk->fragment_cache);
 	squashfs_cache_delete(msblk->read_page);
+	squashfs_decompressor_free(msblk, msblk->stream);
 	kfree(msblk->inode_lookup_table);
 	kfree(msblk->fragment_index);
 	kfree(msblk->id_table);
-	kfree(msblk->stream.workspace);
+	kfree(msblk->xattr_id_table);
 	kfree(sb->s_fs_info);
 	sb->s_fs_info = NULL;
 	kfree(sblk);
 	return err;
 
 failure:
-	kfree(msblk->stream.workspace);
 	kfree(sb->s_fs_info);
 	sb->s_fs_info = NULL;
 	return -ENOMEM;
@@ -346,10 +361,12 @@ static void squashfs_put_super(struct super_block *sb)
 		squashfs_cache_delete(sbi->block_cache);
 		squashfs_cache_delete(sbi->fragment_cache);
 		squashfs_cache_delete(sbi->read_page);
+		squashfs_decompressor_free(sbi, sbi->stream);
 		kfree(sbi->id_table);
 		kfree(sbi->fragment_index);
 		kfree(sbi->meta_index);
-		kfree(sbi->stream.workspace);
+		kfree(sbi->inode_lookup_table);
+		kfree(sbi->xattr_id_table);
 		kfree(sb->s_fs_info);
 		sb->s_fs_info = NULL;
 	}
diff --git a/fs/squashfs/symlink.c b/fs/squashfs/symlink.c
index 83d87880aac8..ec86434921e1 100644
--- a/fs/squashfs/symlink.c
+++ b/fs/squashfs/symlink.c
@@ -33,15 +33,15 @@
 #include <linux/fs.h>
 #include <linux/vfs.h>
 #include <linux/kernel.h>
-#include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/pagemap.h>
-#include <linux/zlib.h>
+#include <linux/xattr.h>
 
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
 #include "squashfs_fs_i.h"
 #include "squashfs.h"
+#include "xattr.h"
 
 static int squashfs_symlink_readpage(struct file *file, struct page *page)
 {
@@ -116,3 +116,12 @@ error_out:
 const struct address_space_operations squashfs_symlink_aops = {
 	.readpage = squashfs_symlink_readpage
 };
+
+const struct inode_operations squashfs_symlink_inode_ops = {
+	.readlink = generic_readlink,
+	.follow_link = page_follow_link_light,
+	.put_link = page_put_link,
+	.getxattr = generic_getxattr,
+	.listxattr = squashfs_listxattr
+};
+
diff --git a/fs/squashfs/xattr.c b/fs/squashfs/xattr.c
new file mode 100644
index 000000000000..c7655e8b31cd
--- /dev/null
+++ b/fs/squashfs/xattr.c
@@ -0,0 +1,323 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2010
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * xattr_id.c
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/xattr.h>
+#include <linux/slab.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "squashfs.h"
+
+static const struct xattr_handler *squashfs_xattr_handler(int);
+
+ssize_t squashfs_listxattr(struct dentry *d, char *buffer,
+	size_t buffer_size)
+{
+	struct inode *inode = d->d_inode;
+	struct super_block *sb = inode->i_sb;
+	struct squashfs_sb_info *msblk = sb->s_fs_info;
+	u64 start = SQUASHFS_XATTR_BLK(squashfs_i(inode)->xattr)
+						 + msblk->xattr_table;
+	int offset = SQUASHFS_XATTR_OFFSET(squashfs_i(inode)->xattr);
+	int count = squashfs_i(inode)->xattr_count;
+	size_t rest = buffer_size;
+	int err;
+
+	/* check that the file system has xattrs */
+	if (msblk->xattr_id_table == NULL)
+		return -EOPNOTSUPP;
+
+	/* loop reading each xattr name */
+	while (count--) {
+		struct squashfs_xattr_entry entry;
+		struct squashfs_xattr_val val;
+		const struct xattr_handler *handler;
+		int name_size, prefix_size = 0;
+
+		err = squashfs_read_metadata(sb, &entry, &start, &offset,
+							sizeof(entry));
+		if (err < 0)
+			goto failed;
+
+		name_size = le16_to_cpu(entry.size);
+		handler = squashfs_xattr_handler(le16_to_cpu(entry.type));
+		if (handler)
+			prefix_size = handler->list(d, buffer, rest, NULL,
+				name_size, handler->flags);
+		if (prefix_size) {
+			if (buffer) {
+				if (prefix_size + name_size + 1 > rest) {
+					err = -ERANGE;
+					goto failed;
+				}
+				buffer += prefix_size;
+			}
+			err = squashfs_read_metadata(sb, buffer, &start,
+				&offset, name_size);
+			if (err < 0)
+				goto failed;
+			if (buffer) {
+				buffer[name_size] = '\0';
+				buffer += name_size + 1;
+			}
+			rest -= prefix_size + name_size + 1;
+		} else  {
+			/* no handler or insuffficient privileges, so skip */
+			err = squashfs_read_metadata(sb, NULL, &start,
+				&offset, name_size);
+			if (err < 0)
+				goto failed;
+		}
+
+
+		/* skip remaining xattr entry */
+		err = squashfs_read_metadata(sb, &val, &start, &offset,
+						sizeof(val));
+		if (err < 0)
+			goto failed;
+
+		err = squashfs_read_metadata(sb, NULL, &start, &offset,
+						le32_to_cpu(val.vsize));
+		if (err < 0)
+			goto failed;
+	}
+	err = buffer_size - rest;
+
+failed:
+	return err;
+}
+
+
+static int squashfs_xattr_get(struct inode *inode, int name_index,
+	const char *name, void *buffer, size_t buffer_size)
+{
+	struct super_block *sb = inode->i_sb;
+	struct squashfs_sb_info *msblk = sb->s_fs_info;
+	u64 start = SQUASHFS_XATTR_BLK(squashfs_i(inode)->xattr)
+						 + msblk->xattr_table;
+	int offset = SQUASHFS_XATTR_OFFSET(squashfs_i(inode)->xattr);
+	int count = squashfs_i(inode)->xattr_count;
+	int name_len = strlen(name);
+	int err, vsize;
+	char *target = kmalloc(name_len, GFP_KERNEL);
+
+	if (target == NULL)
+		return  -ENOMEM;
+
+	/* loop reading each xattr name */
+	for (; count; count--) {
+		struct squashfs_xattr_entry entry;
+		struct squashfs_xattr_val val;
+		int type, prefix, name_size;
+
+		err = squashfs_read_metadata(sb, &entry, &start, &offset,
+							sizeof(entry));
+		if (err < 0)
+			goto failed;
+
+		name_size = le16_to_cpu(entry.size);
+		type = le16_to_cpu(entry.type);
+		prefix = type & SQUASHFS_XATTR_PREFIX_MASK;
+
+		if (prefix == name_index && name_size == name_len)
+			err = squashfs_read_metadata(sb, target, &start,
+						&offset, name_size);
+		else
+			err = squashfs_read_metadata(sb, NULL, &start,
+						&offset, name_size);
+		if (err < 0)
+			goto failed;
+
+		if (prefix == name_index && name_size == name_len &&
+					strncmp(target, name, name_size) == 0) {
+			/* found xattr */
+			if (type & SQUASHFS_XATTR_VALUE_OOL) {
+				__le64 xattr;
+				/* val is a reference to the real location */
+				err = squashfs_read_metadata(sb, &val, &start,
+						&offset, sizeof(val));
+				if (err < 0)
+					goto failed;
+				err = squashfs_read_metadata(sb, &xattr, &start,
+					 &offset, sizeof(xattr));
+				if (err < 0)
+					goto failed;
+				xattr = le64_to_cpu(xattr);
+				start = SQUASHFS_XATTR_BLK(xattr) +
+							msblk->xattr_table;
+				offset = SQUASHFS_XATTR_OFFSET(xattr);
+			}
+			/* read xattr value */
+			err = squashfs_read_metadata(sb, &val, &start, &offset,
+							sizeof(val));
+			if (err < 0)
+				goto failed;
+
+			vsize = le32_to_cpu(val.vsize);
+			if (buffer) {
+				if (vsize > buffer_size) {
+					err = -ERANGE;
+					goto failed;
+				}
+				err = squashfs_read_metadata(sb, buffer, &start,
+					 &offset, vsize);
+				if (err < 0)
+					goto failed;
+			}
+			break;
+		}
+
+		/* no match, skip remaining xattr entry */
+		err = squashfs_read_metadata(sb, &val, &start, &offset,
+							sizeof(val));
+		if (err < 0)
+			goto failed;
+		err = squashfs_read_metadata(sb, NULL, &start, &offset,
+						le32_to_cpu(val.vsize));
+		if (err < 0)
+			goto failed;
+	}
+	err = count ? vsize : -ENODATA;
+
+failed:
+	kfree(target);
+	return err;
+}
+
+
+/*
+ * User namespace support
+ */
+static size_t squashfs_user_list(struct dentry *d, char *list, size_t list_size,
+	const char *name, size_t name_len, int type)
+{
+	if (list && XATTR_USER_PREFIX_LEN <= list_size)
+		memcpy(list, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
+	return XATTR_USER_PREFIX_LEN;
+}
+
+static int squashfs_user_get(struct dentry *d, const char *name, void *buffer,
+	size_t size, int type)
+{
+	if (name[0] == '\0')
+		return  -EINVAL;
+
+	return squashfs_xattr_get(d->d_inode, SQUASHFS_XATTR_USER, name,
+		buffer, size);
+}
+
+static const struct xattr_handler squashfs_xattr_user_handler = {
+	.prefix	= XATTR_USER_PREFIX,
+	.list	= squashfs_user_list,
+	.get	= squashfs_user_get
+};
+
+/*
+ * Trusted namespace support
+ */
+static size_t squashfs_trusted_list(struct dentry *d, char *list,
+	size_t list_size, const char *name, size_t name_len, int type)
+{
+	if (!capable(CAP_SYS_ADMIN))
+		return 0;
+
+	if (list && XATTR_TRUSTED_PREFIX_LEN <= list_size)
+		memcpy(list, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN);
+	return XATTR_TRUSTED_PREFIX_LEN;
+}
+
+static int squashfs_trusted_get(struct dentry *d, const char *name,
+	void *buffer, size_t size, int type)
+{
+	if (name[0] == '\0')
+		return  -EINVAL;
+
+	return squashfs_xattr_get(d->d_inode, SQUASHFS_XATTR_TRUSTED, name,
+		buffer, size);
+}
+
+static const struct xattr_handler squashfs_xattr_trusted_handler = {
+	.prefix	= XATTR_TRUSTED_PREFIX,
+	.list	= squashfs_trusted_list,
+	.get	= squashfs_trusted_get
+};
+
+/*
+ * Security namespace support
+ */
+static size_t squashfs_security_list(struct dentry *d, char *list,
+	size_t list_size, const char *name, size_t name_len, int type)
+{
+	if (list && XATTR_SECURITY_PREFIX_LEN <= list_size)
+		memcpy(list, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN);
+	return XATTR_SECURITY_PREFIX_LEN;
+}
+
+static int squashfs_security_get(struct dentry *d, const char *name,
+	void *buffer, size_t size, int type)
+{
+	if (name[0] == '\0')
+		return  -EINVAL;
+
+	return squashfs_xattr_get(d->d_inode, SQUASHFS_XATTR_SECURITY, name,
+		buffer, size);
+}
+
+static const struct xattr_handler squashfs_xattr_security_handler = {
+	.prefix	= XATTR_SECURITY_PREFIX,
+	.list	= squashfs_security_list,
+	.get	= squashfs_security_get
+};
+
+static inline const struct xattr_handler *squashfs_xattr_handler(int type)
+{
+	if (type & ~(SQUASHFS_XATTR_PREFIX_MASK | SQUASHFS_XATTR_VALUE_OOL))
+		/* ignore unrecognised type */
+		return NULL;
+
+	switch (type & SQUASHFS_XATTR_PREFIX_MASK) {
+	case SQUASHFS_XATTR_USER:
+		return &squashfs_xattr_user_handler;
+	case SQUASHFS_XATTR_TRUSTED:
+		return &squashfs_xattr_trusted_handler;
+	case SQUASHFS_XATTR_SECURITY:
+		return &squashfs_xattr_security_handler;
+	default:
+		/* ignore unrecognised type */
+		return NULL;
+	}
+}
+
+const struct xattr_handler *squashfs_xattr_handlers[] = {
+	&squashfs_xattr_user_handler,
+	&squashfs_xattr_trusted_handler,
+	&squashfs_xattr_security_handler,
+	NULL
+};
+
diff --git a/fs/squashfs/xattr.h b/fs/squashfs/xattr.h
new file mode 100644
index 000000000000..9da071ae181c
--- /dev/null
+++ b/fs/squashfs/xattr.h
@@ -0,0 +1,46 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2010
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * xattr.h
+ */
+
+#ifdef CONFIG_SQUASHFS_XATTRS
+extern __le64 *squashfs_read_xattr_id_table(struct super_block *, u64,
+		u64 *, int *);
+extern int squashfs_xattr_lookup(struct super_block *, unsigned int, int *,
+		int *, unsigned long long *);
+#else
+static inline __le64 *squashfs_read_xattr_id_table(struct super_block *sb,
+		u64 start, u64 *xattr_table_start, int *xattr_ids)
+{
+	ERROR("Xattrs in filesystem, these will be ignored\n");
+	return ERR_PTR(-ENOTSUPP);
+}
+
+static inline int squashfs_xattr_lookup(struct super_block *sb,
+		unsigned int index, int *count, int *size,
+		unsigned long long *xattr)
+{
+	return 0;
+}
+#define squashfs_listxattr NULL
+#define generic_getxattr NULL
+#define squashfs_xattr_handlers NULL
+#endif
diff --git a/fs/squashfs/xattr_id.c b/fs/squashfs/xattr_id.c
new file mode 100644
index 000000000000..cfb41106098f
--- /dev/null
+++ b/fs/squashfs/xattr_id.c
@@ -0,0 +1,100 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2010
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * xattr_id.c
+ */
+
+/*
+ * This file implements code to map the 32-bit xattr id stored in the inode
+ * into the on disk location of the xattr data.
+ */
+
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/slab.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "squashfs.h"
+
+/*
+ * Map xattr id using the xattr id look up table
+ */
+int squashfs_xattr_lookup(struct super_block *sb, unsigned int index,
+		int *count, unsigned int *size, unsigned long long *xattr)
+{
+	struct squashfs_sb_info *msblk = sb->s_fs_info;
+	int block = SQUASHFS_XATTR_BLOCK(index);
+	int offset = SQUASHFS_XATTR_BLOCK_OFFSET(index);
+	u64 start_block = le64_to_cpu(msblk->xattr_id_table[block]);
+	struct squashfs_xattr_id id;
+	int err;
+
+	err = squashfs_read_metadata(sb, &id, &start_block, &offset,
+							sizeof(id));
+	if (err < 0)
+		return err;
+
+	*xattr = le64_to_cpu(id.xattr);
+	*size = le32_to_cpu(id.size);
+	*count = le32_to_cpu(id.count);
+	return 0;
+}
+
+
+/*
+ * Read uncompressed xattr id lookup table indexes from disk into memory
+ */
+__le64 *squashfs_read_xattr_id_table(struct super_block *sb, u64 start,
+		u64 *xattr_table_start, int *xattr_ids)
+{
+	unsigned int len;
+	__le64 *xid_table;
+	struct squashfs_xattr_id_table id_table;
+	int err;
+
+	err = squashfs_read_table(sb, &id_table, start, sizeof(id_table));
+	if (err < 0) {
+		ERROR("unable to read xattr id table\n");
+		return ERR_PTR(err);
+	}
+	*xattr_table_start = le64_to_cpu(id_table.xattr_table_start);
+	*xattr_ids = le32_to_cpu(id_table.xattr_ids);
+	len = SQUASHFS_XATTR_BLOCK_BYTES(*xattr_ids);
+
+	TRACE("In read_xattr_index_table, length %d\n", len);
+
+	/* Allocate xattr id lookup table indexes */
+	xid_table = kmalloc(len, GFP_KERNEL);
+	if (xid_table == NULL) {
+		ERROR("Failed to allocate xattr id index table\n");
+		return ERR_PTR(-ENOMEM);
+	}
+
+	err = squashfs_read_table(sb, xid_table, start + sizeof(id_table), len);
+	if (err < 0) {
+		ERROR("unable to read xattr id index table\n");
+		kfree(xid_table);
+		return ERR_PTR(err);
+	}
+
+	return xid_table;
+}
diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c
new file mode 100644
index 000000000000..7a603874e483
--- /dev/null
+++ b/fs/squashfs/zlib_wrapper.c
@@ -0,0 +1,152 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * zlib_wrapper.c
+ */
+
+
+#include <linux/mutex.h>
+#include <linux/buffer_head.h>
+#include <linux/slab.h>
+#include <linux/zlib.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "squashfs.h"
+#include "decompressor.h"
+
+static void *zlib_init(struct squashfs_sb_info *dummy)
+{
+	z_stream *stream = kmalloc(sizeof(z_stream), GFP_KERNEL);
+	if (stream == NULL)
+		goto failed;
+	stream->workspace = kmalloc(zlib_inflate_workspacesize(),
+		GFP_KERNEL);
+	if (stream->workspace == NULL)
+		goto failed;
+
+	return stream;
+
+failed:
+	ERROR("Failed to allocate zlib workspace\n");
+	kfree(stream);
+	return NULL;
+}
+
+
+static void zlib_free(void *strm)
+{
+	z_stream *stream = strm;
+
+	if (stream)
+		kfree(stream->workspace);
+	kfree(stream);
+}
+
+
+static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer,
+	struct buffer_head **bh, int b, int offset, int length, int srclength,
+	int pages)
+{
+	int zlib_err = 0, zlib_init = 0;
+	int avail, bytes, k = 0, page = 0;
+	z_stream *stream = msblk->stream;
+
+	mutex_lock(&msblk->read_data_mutex);
+
+	stream->avail_out = 0;
+	stream->avail_in = 0;
+
+	bytes = length;
+	do {
+		if (stream->avail_in == 0 && k < b) {
+			avail = min(bytes, msblk->devblksize - offset);
+			bytes -= avail;
+			wait_on_buffer(bh[k]);
+			if (!buffer_uptodate(bh[k]))
+				goto release_mutex;
+
+			if (avail == 0) {
+				offset = 0;
+				put_bh(bh[k++]);
+				continue;
+			}
+
+			stream->next_in = bh[k]->b_data + offset;
+			stream->avail_in = avail;
+			offset = 0;
+		}
+
+		if (stream->avail_out == 0 && page < pages) {
+			stream->next_out = buffer[page++];
+			stream->avail_out = PAGE_CACHE_SIZE;
+		}
+
+		if (!zlib_init) {
+			zlib_err = zlib_inflateInit(stream);
+			if (zlib_err != Z_OK) {
+				ERROR("zlib_inflateInit returned unexpected "
+					"result 0x%x, srclength %d\n",
+					zlib_err, srclength);
+				goto release_mutex;
+			}
+			zlib_init = 1;
+		}
+
+		zlib_err = zlib_inflate(stream, Z_SYNC_FLUSH);
+
+		if (stream->avail_in == 0 && k < b)
+			put_bh(bh[k++]);
+	} while (zlib_err == Z_OK);
+
+	if (zlib_err != Z_STREAM_END) {
+		ERROR("zlib_inflate error, data probably corrupt\n");
+		goto release_mutex;
+	}
+
+	zlib_err = zlib_inflateEnd(stream);
+	if (zlib_err != Z_OK) {
+		ERROR("zlib_inflate error, data probably corrupt\n");
+		goto release_mutex;
+	}
+
+	length = stream->total_out;
+	mutex_unlock(&msblk->read_data_mutex);
+	return length;
+
+release_mutex:
+	mutex_unlock(&msblk->read_data_mutex);
+
+	for (; k < b; k++)
+		put_bh(bh[k]);
+
+	return -EIO;
+}
+
+const struct squashfs_decompressor squashfs_zlib_comp_ops = {
+	.init = zlib_init,
+	.free = zlib_free,
+	.decompress = zlib_uncompress,
+	.id = ZLIB_COMPRESSION,
+	.name = "zlib",
+	.supported = 1
+};
+
diff --git a/fs/statfs.c b/fs/statfs.c
new file mode 100644
index 000000000000..4ef021f3b612
--- /dev/null
+++ b/fs/statfs.c
@@ -0,0 +1,196 @@
+#include <linux/syscalls.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/namei.h>
+#include <linux/statfs.h>
+#include <linux/security.h>
+#include <linux/uaccess.h>
+
+int vfs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	int retval = -ENODEV;
+
+	if (dentry) {
+		retval = -ENOSYS;
+		if (dentry->d_sb->s_op->statfs) {
+			memset(buf, 0, sizeof(*buf));
+			retval = security_sb_statfs(dentry);
+			if (retval)
+				return retval;
+			retval = dentry->d_sb->s_op->statfs(dentry, buf);
+			if (retval == 0 && buf->f_frsize == 0)
+				buf->f_frsize = buf->f_bsize;
+		}
+	}
+	return retval;
+}
+
+EXPORT_SYMBOL(vfs_statfs);
+
+static int vfs_statfs_native(struct dentry *dentry, struct statfs *buf)
+{
+	struct kstatfs st;
+	int retval;
+
+	retval = vfs_statfs(dentry, &st);
+	if (retval)
+		return retval;
+
+	if (sizeof(*buf) == sizeof(st))
+		memcpy(buf, &st, sizeof(st));
+	else {
+		if (sizeof buf->f_blocks == 4) {
+			if ((st.f_blocks | st.f_bfree | st.f_bavail |
+			     st.f_bsize | st.f_frsize) &
+			    0xffffffff00000000ULL)
+				return -EOVERFLOW;
+			/*
+			 * f_files and f_ffree may be -1; it's okay to stuff
+			 * that into 32 bits
+			 */
+			if (st.f_files != -1 &&
+			    (st.f_files & 0xffffffff00000000ULL))
+				return -EOVERFLOW;
+			if (st.f_ffree != -1 &&
+			    (st.f_ffree & 0xffffffff00000000ULL))
+				return -EOVERFLOW;
+		}
+
+		buf->f_type = st.f_type;
+		buf->f_bsize = st.f_bsize;
+		buf->f_blocks = st.f_blocks;
+		buf->f_bfree = st.f_bfree;
+		buf->f_bavail = st.f_bavail;
+		buf->f_files = st.f_files;
+		buf->f_ffree = st.f_ffree;
+		buf->f_fsid = st.f_fsid;
+		buf->f_namelen = st.f_namelen;
+		buf->f_frsize = st.f_frsize;
+		memset(buf->f_spare, 0, sizeof(buf->f_spare));
+	}
+	return 0;
+}
+
+static int vfs_statfs64(struct dentry *dentry, struct statfs64 *buf)
+{
+	struct kstatfs st;
+	int retval;
+
+	retval = vfs_statfs(dentry, &st);
+	if (retval)
+		return retval;
+
+	if (sizeof(*buf) == sizeof(st))
+		memcpy(buf, &st, sizeof(st));
+	else {
+		buf->f_type = st.f_type;
+		buf->f_bsize = st.f_bsize;
+		buf->f_blocks = st.f_blocks;
+		buf->f_bfree = st.f_bfree;
+		buf->f_bavail = st.f_bavail;
+		buf->f_files = st.f_files;
+		buf->f_ffree = st.f_ffree;
+		buf->f_fsid = st.f_fsid;
+		buf->f_namelen = st.f_namelen;
+		buf->f_frsize = st.f_frsize;
+		memset(buf->f_spare, 0, sizeof(buf->f_spare));
+	}
+	return 0;
+}
+
+SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct statfs __user *, buf)
+{
+	struct path path;
+	int error;
+
+	error = user_path(pathname, &path);
+	if (!error) {
+		struct statfs tmp;
+		error = vfs_statfs_native(path.dentry, &tmp);
+		if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
+			error = -EFAULT;
+		path_put(&path);
+	}
+	return error;
+}
+
+SYSCALL_DEFINE3(statfs64, const char __user *, pathname, size_t, sz, struct statfs64 __user *, buf)
+{
+	struct path path;
+	long error;
+
+	if (sz != sizeof(*buf))
+		return -EINVAL;
+	error = user_path(pathname, &path);
+	if (!error) {
+		struct statfs64 tmp;
+		error = vfs_statfs64(path.dentry, &tmp);
+		if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
+			error = -EFAULT;
+		path_put(&path);
+	}
+	return error;
+}
+
+SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct statfs __user *, buf)
+{
+	struct file *file;
+	struct statfs tmp;
+	int error;
+
+	error = -EBADF;
+	file = fget(fd);
+	if (!file)
+		goto out;
+	error = vfs_statfs_native(file->f_path.dentry, &tmp);
+	if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
+		error = -EFAULT;
+	fput(file);
+out:
+	return error;
+}
+
+SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user *, buf)
+{
+	struct file *file;
+	struct statfs64 tmp;
+	int error;
+
+	if (sz != sizeof(*buf))
+		return -EINVAL;
+
+	error = -EBADF;
+	file = fget(fd);
+	if (!file)
+		goto out;
+	error = vfs_statfs64(file->f_path.dentry, &tmp);
+	if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
+		error = -EFAULT;
+	fput(file);
+out:
+	return error;
+}
+
+SYSCALL_DEFINE2(ustat, unsigned, dev, struct ustat __user *, ubuf)
+{
+	struct super_block *s;
+	struct ustat tmp;
+	struct kstatfs sbuf;
+	int err;
+
+	s = user_get_super(new_decode_dev(dev));
+	if (!s)
+		return -EINVAL;
+
+	err = vfs_statfs(s->s_root, &sbuf);
+	drop_super(s);
+	if (err)
+		return err;
+
+	memset(&tmp,0,sizeof(struct ustat));
+	tmp.f_tfree = sbuf.f_bfree;
+	tmp.f_tinode = sbuf.f_ffree;
+
+	return copy_to_user(ubuf, &tmp, sizeof(struct ustat)) ? -EFAULT : 0;
+}
diff --git a/fs/super.c b/fs/super.c
index aff046b0fe78..938119ab8dcb 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -22,22 +22,14 @@
 
 #include <linux/module.h>
 #include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/smp_lock.h>
 #include <linux/acct.h>
 #include <linux/blkdev.h>
-#include <linux/quotaops.h>
-#include <linux/namei.h>
 #include <linux/mount.h>
 #include <linux/security.h>
-#include <linux/syscalls.h>
-#include <linux/vfs.h>
 #include <linux/writeback.h>		/* for the emergency remount stuff */
 #include <linux/idr.h>
-#include <linux/kobject.h>
 #include <linux/mutex.h>
-#include <linux/file.h>
-#include <asm/uaccess.h>
+#include <linux/backing-dev.h>
 #include "internal.h"
 
 
@@ -92,16 +84,15 @@ static struct super_block *alloc_super(struct file_system_type *type)
 		 * subclass.
 		 */
 		down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING);
-		s->s_count = S_BIAS;
+		s->s_count = 1;
 		atomic_set(&s->s_active, 1);
 		mutex_init(&s->s_vfs_rename_mutex);
+		lockdep_set_class(&s->s_vfs_rename_mutex, &type->s_vfs_rename_key);
 		mutex_init(&s->s_dquot.dqio_mutex);
 		mutex_init(&s->s_dquot.dqonoff_mutex);
 		init_rwsem(&s->s_dquot.dqptr_sem);
 		init_waitqueue_head(&s->s_wait_unfrozen);
 		s->s_maxbytes = MAX_NON_LFS;
-		s->dq_op = sb_dquot_ops;
-		s->s_qcop = sb_quotactl_ops;
 		s->s_op = &default_op;
 		s->s_time_gran = 1000000000;
 	}
@@ -126,39 +117,14 @@ static inline void destroy_super(struct super_block *s)
 /* Superblock refcounting  */
 
 /*
- * Drop a superblock's refcount.  Returns non-zero if the superblock was
- * destroyed.  The caller must hold sb_lock.
+ * Drop a superblock's refcount.  The caller must hold sb_lock.
  */
-static int __put_super(struct super_block *sb)
+void __put_super(struct super_block *sb)
 {
-	int ret = 0;
-
 	if (!--sb->s_count) {
+		list_del_init(&sb->s_list);
 		destroy_super(sb);
-		ret = 1;
-	}
-	return ret;
-}
-
-/*
- * Drop a superblock's refcount.
- * Returns non-zero if the superblock is about to be destroyed and
- * at least is already removed from super_blocks list, so if we are
- * making a loop through super blocks then we need to restart.
- * The caller must hold sb_lock.
- */
-int __put_super_and_need_restart(struct super_block *sb)
-{
-	/* check for race with generic_shutdown_super() */
-	if (list_empty(&sb->s_list)) {
-		/* super block is removed, need to restart... */
-		__put_super(sb);
-		return 1;
 	}
-	/* can't be the last, since s_list is still in use */
-	sb->s_count--;
-	BUG_ON(sb->s_count == 0);
-	return 0;
 }
 
 /**
@@ -177,57 +143,47 @@ void put_super(struct super_block *sb)
 
 
 /**
- *	deactivate_super	-	drop an active reference to superblock
+ *	deactivate_locked_super	-	drop an active reference to superblock
  *	@s: superblock to deactivate
  *
- *	Drops an active reference to superblock, acquiring a temprory one if
- *	there is no active references left.  In that case we lock superblock,
+ *	Drops an active reference to superblock, converting it into a temprory
+ *	one if there is no other active references left.  In that case we
  *	tell fs driver to shut it down and drop the temporary reference we
  *	had just acquired.
+ *
+ *	Caller holds exclusive lock on superblock; that lock is released.
  */
-void deactivate_super(struct super_block *s)
+void deactivate_locked_super(struct super_block *s)
 {
 	struct file_system_type *fs = s->s_type;
-	if (atomic_dec_and_lock(&s->s_active, &sb_lock)) {
-		s->s_count -= S_BIAS-1;
-		spin_unlock(&sb_lock);
-		vfs_dq_off(s, 0);
-		down_write(&s->s_umount);
+	if (atomic_dec_and_test(&s->s_active)) {
 		fs->kill_sb(s);
 		put_filesystem(fs);
 		put_super(s);
+	} else {
+		up_write(&s->s_umount);
 	}
 }
 
-EXPORT_SYMBOL(deactivate_super);
+EXPORT_SYMBOL(deactivate_locked_super);
 
 /**
- *	deactivate_locked_super	-	drop an active reference to superblock
+ *	deactivate_super	-	drop an active reference to superblock
  *	@s: superblock to deactivate
  *
- *	Equivalent of up_write(&s->s_umount); deactivate_super(s);, except that
- *	it does not unlock it until it's all over.  As the result, it's safe to
- *	use to dispose of new superblock on ->get_sb() failure exits - nobody
- *	will see the sucker until it's all over.  Equivalent using up_write +
- *	deactivate_super is safe for that purpose only if superblock is either
- *	safe to use or has NULL ->s_root when we unlock.
+ *	Variant of deactivate_locked_super(), except that superblock is *not*
+ *	locked by caller.  If we are going to drop the final active reference,
+ *	lock will be acquired prior to that.
  */
-void deactivate_locked_super(struct super_block *s)
+void deactivate_super(struct super_block *s)
 {
-	struct file_system_type *fs = s->s_type;
-	if (atomic_dec_and_lock(&s->s_active, &sb_lock)) {
-		s->s_count -= S_BIAS-1;
-		spin_unlock(&sb_lock);
-		vfs_dq_off(s, 0);
-		fs->kill_sb(s);
-		put_filesystem(fs);
-		put_super(s);
-	} else {
-		up_write(&s->s_umount);
+        if (!atomic_add_unless(&s->s_active, -1, 1)) {
+		down_write(&s->s_umount);
+		deactivate_locked_super(s);
 	}
 }
 
-EXPORT_SYMBOL(deactivate_locked_super);
+EXPORT_SYMBOL(deactivate_super);
 
 /**
  *	grab_super - acquire an active reference
@@ -242,22 +198,17 @@ EXPORT_SYMBOL(deactivate_locked_super);
  */
 static int grab_super(struct super_block *s) __releases(sb_lock)
 {
+	if (atomic_inc_not_zero(&s->s_active)) {
+		spin_unlock(&sb_lock);
+		return 1;
+	}
+	/* it's going away */
 	s->s_count++;
 	spin_unlock(&sb_lock);
+	/* wait for it to die */
 	down_write(&s->s_umount);
-	if (s->s_root) {
-		spin_lock(&sb_lock);
-		if (s->s_count > S_BIAS) {
-			atomic_inc(&s->s_active);
-			s->s_count--;
-			spin_unlock(&sb_lock);
-			return 1;
-		}
-		spin_unlock(&sb_lock);
-	}
 	up_write(&s->s_umount);
 	put_super(s);
-	yield();
 	return 0;
 }
 
@@ -320,8 +271,7 @@ void generic_shutdown_super(struct super_block *sb)
 	}
 	spin_lock(&sb_lock);
 	/* should be initialized for __put_super_and_need_restart() */
-	list_del_init(&sb->s_list);
-	list_del(&sb->s_instances);
+	list_del_init(&sb->s_instances);
 	spin_unlock(&sb_lock);
 	up_write(&sb->s_umount);
 }
@@ -356,6 +306,7 @@ retry:
 				up_write(&s->s_umount);
 				destroy_super(s);
 			}
+			down_write(&old->s_umount);
 			return old;
 		}
 	}
@@ -407,11 +358,12 @@ EXPORT_SYMBOL(drop_super);
  */
 void sync_supers(void)
 {
-	struct super_block *sb;
+	struct super_block *sb, *n;
 
 	spin_lock(&sb_lock);
-restart:
-	list_for_each_entry(sb, &super_blocks, s_list) {
+	list_for_each_entry_safe(sb, n, &super_blocks, s_list) {
+		if (list_empty(&sb->s_instances))
+			continue;
 		if (sb->s_op->write_super && sb->s_dirt) {
 			sb->s_count++;
 			spin_unlock(&sb_lock);
@@ -422,14 +374,47 @@ restart:
 			up_read(&sb->s_umount);
 
 			spin_lock(&sb_lock);
-			if (__put_super_and_need_restart(sb))
-				goto restart;
+			/* lock was dropped, must reset next */
+			list_safe_reset_next(sb, n, s_list);
+			__put_super(sb);
 		}
 	}
 	spin_unlock(&sb_lock);
 }
 
 /**
+ *	iterate_supers - call function for all active superblocks
+ *	@f: function to call
+ *	@arg: argument to pass to it
+ *
+ *	Scans the superblock list and calls given function, passing it
+ *	locked superblock and given argument.
+ */
+void iterate_supers(void (*f)(struct super_block *, void *), void *arg)
+{
+	struct super_block *sb, *n;
+
+	spin_lock(&sb_lock);
+	list_for_each_entry_safe(sb, n, &super_blocks, s_list) {
+		if (list_empty(&sb->s_instances))
+			continue;
+		sb->s_count++;
+		spin_unlock(&sb_lock);
+
+		down_read(&sb->s_umount);
+		if (sb->s_root)
+			f(sb, arg);
+		up_read(&sb->s_umount);
+
+		spin_lock(&sb_lock);
+		/* lock was dropped, must reset next */
+		list_safe_reset_next(sb, n, s_list);
+		__put_super(sb);
+	}
+	spin_unlock(&sb_lock);
+}
+
+/**
  *	get_super - get the superblock of a device
  *	@bdev: device to get the superblock for
  *	
@@ -437,7 +422,7 @@ restart:
  *	mounted on the device given. %NULL is returned if no match is found.
  */
 
-struct super_block * get_super(struct block_device *bdev)
+struct super_block *get_super(struct block_device *bdev)
 {
 	struct super_block *sb;
 
@@ -447,17 +432,20 @@ struct super_block * get_super(struct block_device *bdev)
 	spin_lock(&sb_lock);
 rescan:
 	list_for_each_entry(sb, &super_blocks, s_list) {
+		if (list_empty(&sb->s_instances))
+			continue;
 		if (sb->s_bdev == bdev) {
 			sb->s_count++;
 			spin_unlock(&sb_lock);
 			down_read(&sb->s_umount);
+			/* still alive? */
 			if (sb->s_root)
 				return sb;
 			up_read(&sb->s_umount);
-			/* restart only when sb is no longer on the list */
+			/* nope, got unmounted */
 			spin_lock(&sb_lock);
-			if (__put_super_and_need_restart(sb))
-				goto rescan;
+			__put_super(sb);
+			goto rescan;
 		}
 	}
 	spin_unlock(&sb_lock);
@@ -472,7 +460,7 @@ EXPORT_SYMBOL(get_super);
  *
  * Scans the superblock list and finds the superblock of the file system
  * mounted on the device given.  Returns the superblock with an active
- * reference and s_umount held exclusively or %NULL if none was found.
+ * reference or %NULL if none was found.
  */
 struct super_block *get_active_super(struct block_device *bdev)
 {
@@ -481,81 +469,49 @@ struct super_block *get_active_super(struct block_device *bdev)
 	if (!bdev)
 		return NULL;
 
+restart:
 	spin_lock(&sb_lock);
 	list_for_each_entry(sb, &super_blocks, s_list) {
-		if (sb->s_bdev != bdev)
+		if (list_empty(&sb->s_instances))
 			continue;
-
-		sb->s_count++;
-		spin_unlock(&sb_lock);
-		down_write(&sb->s_umount);
-		if (sb->s_root) {
-			spin_lock(&sb_lock);
-			if (sb->s_count > S_BIAS) {
-				atomic_inc(&sb->s_active);
-				sb->s_count--;
-				spin_unlock(&sb_lock);
+		if (sb->s_bdev == bdev) {
+			if (grab_super(sb)) /* drops sb_lock */
 				return sb;
-			}
-			spin_unlock(&sb_lock);
+			else
+				goto restart;
 		}
-		up_write(&sb->s_umount);
-		put_super(sb);
-		yield();
-		spin_lock(&sb_lock);
 	}
 	spin_unlock(&sb_lock);
 	return NULL;
 }
  
-struct super_block * user_get_super(dev_t dev)
+struct super_block *user_get_super(dev_t dev)
 {
 	struct super_block *sb;
 
 	spin_lock(&sb_lock);
 rescan:
 	list_for_each_entry(sb, &super_blocks, s_list) {
+		if (list_empty(&sb->s_instances))
+			continue;
 		if (sb->s_dev ==  dev) {
 			sb->s_count++;
 			spin_unlock(&sb_lock);
 			down_read(&sb->s_umount);
+			/* still alive? */
 			if (sb->s_root)
 				return sb;
 			up_read(&sb->s_umount);
-			/* restart only when sb is no longer on the list */
+			/* nope, got unmounted */
 			spin_lock(&sb_lock);
-			if (__put_super_and_need_restart(sb))
-				goto rescan;
+			__put_super(sb);
+			goto rescan;
 		}
 	}
 	spin_unlock(&sb_lock);
 	return NULL;
 }
 
-SYSCALL_DEFINE2(ustat, unsigned, dev, struct ustat __user *, ubuf)
-{
-        struct super_block *s;
-        struct ustat tmp;
-        struct kstatfs sbuf;
-	int err = -EINVAL;
-
-        s = user_get_super(new_decode_dev(dev));
-        if (s == NULL)
-                goto out;
-	err = vfs_statfs(s->s_root, &sbuf);
-	drop_super(s);
-	if (err)
-		goto out;
-
-        memset(&tmp,0,sizeof(struct ustat));
-        tmp.f_tfree = sbuf.f_bfree;
-        tmp.f_tinode = sbuf.f_ffree;
-
-        err = copy_to_user(ubuf,&tmp,sizeof(struct ustat)) ? -EFAULT : 0;
-out:
-	return err;
-}
-
 /**
  *	do_remount_sb - asks filesystem to change mount options.
  *	@sb:	superblock in question
@@ -568,7 +524,7 @@ out:
 int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
 {
 	int retval;
-	int remount_rw;
+	int remount_ro;
 
 	if (sb->s_frozen != SB_UNFROZEN)
 		return -EBUSY;
@@ -583,18 +539,16 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
 	shrink_dcache_sb(sb);
 	sync_filesystem(sb);
 
+	remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY);
+
 	/* If we are remounting RDONLY and current sb is read/write,
 	   make sure there are no rw files opened */
-	if ((flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY)) {
+	if (remount_ro) {
 		if (force)
 			mark_files_ro(sb);
 		else if (!fs_may_remount_ro(sb))
 			return -EBUSY;
-		retval = vfs_dq_off(sb, 1);
-		if (retval < 0 && retval != -ENOSYS)
-			return -EBUSY;
 	}
-	remount_rw = !(flags & MS_RDONLY) && (sb->s_flags & MS_RDONLY);
 
 	if (sb->s_op->remount_fs) {
 		retval = sb->s_op->remount_fs(sb, &flags, data);
@@ -602,31 +556,42 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
 			return retval;
 	}
 	sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK);
-	if (remount_rw)
-		vfs_dq_quota_on_remount(sb);
+
+	/*
+	 * Some filesystems modify their metadata via some other path than the
+	 * bdev buffer cache (eg. use a private mapping, or directories in
+	 * pagecache, etc). Also file data modifications go via their own
+	 * mappings. So If we try to mount readonly then copy the filesystem
+	 * from bdev, we could get stale data, so invalidate it to give a best
+	 * effort at coherency.
+	 */
+	if (remount_ro && sb->s_bdev)
+		invalidate_bdev(sb->s_bdev);
 	return 0;
 }
 
 static void do_emergency_remount(struct work_struct *work)
 {
-	struct super_block *sb;
+	struct super_block *sb, *n;
 
 	spin_lock(&sb_lock);
-	list_for_each_entry(sb, &super_blocks, s_list) {
+	list_for_each_entry_safe(sb, n, &super_blocks, s_list) {
+		if (list_empty(&sb->s_instances))
+			continue;
 		sb->s_count++;
 		spin_unlock(&sb_lock);
 		down_write(&sb->s_umount);
 		if (sb->s_root && sb->s_bdev && !(sb->s_flags & MS_RDONLY)) {
 			/*
-			 * ->remount_fs needs lock_kernel().
-			 *
 			 * What lock protects sb->s_flags??
 			 */
 			do_remount_sb(sb, MS_RDONLY, NULL, 1);
 		}
 		up_write(&sb->s_umount);
-		put_super(sb);
 		spin_lock(&sb_lock);
+		/* lock was dropped, must reset next */
+		list_safe_reset_next(sb, n, s_list);
+		__put_super(sb);
 	}
 	spin_unlock(&sb_lock);
 	kfree(work);
@@ -681,6 +646,7 @@ int set_anon_super(struct super_block *s, void *data)
 		return -EMFILE;
 	}
 	s->s_dev = MKDEV(0, dev & MINORMASK);
+	s->s_bdi = &noop_backing_dev_info;
 	return 0;
 }
 
@@ -925,6 +891,9 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
 	if (!mnt)
 		goto out;
 
+	if (flags & MS_KERNMOUNT)
+		mnt->mnt_flags = MNT_INTERNAL;
+
 	if (data && !(type->fs_flags & FS_BINARY_MOUNTDATA)) {
 		secdata = alloc_secdata();
 		if (!secdata)
@@ -939,10 +908,11 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
 	if (error < 0)
 		goto out_free_secdata;
 	BUG_ON(!mnt->mnt_sb);
+	WARN_ON(!mnt->mnt_sb->s_bdi);
 
- 	error = security_sb_kern_mount(mnt->mnt_sb, flags, secdata);
- 	if (error)
- 		goto out_sb;
+	error = security_sb_kern_mount(mnt->mnt_sb, flags, secdata);
+	if (error)
+		goto out_sb;
 
 	/*
 	 * filesystems should never set s_maxbytes larger than MAX_LFS_FILESIZE
@@ -972,6 +942,96 @@ out:
 
 EXPORT_SYMBOL_GPL(vfs_kern_mount);
 
+/**
+ * freeze_super - lock the filesystem and force it into a consistent state
+ * @sb: the super to lock
+ *
+ * Syncs the super to make sure the filesystem is consistent and calls the fs's
+ * freeze_fs.  Subsequent calls to this without first thawing the fs will return
+ * -EBUSY.
+ */
+int freeze_super(struct super_block *sb)
+{
+	int ret;
+
+	atomic_inc(&sb->s_active);
+	down_write(&sb->s_umount);
+	if (sb->s_frozen) {
+		deactivate_locked_super(sb);
+		return -EBUSY;
+	}
+
+	if (sb->s_flags & MS_RDONLY) {
+		sb->s_frozen = SB_FREEZE_TRANS;
+		smp_wmb();
+		up_write(&sb->s_umount);
+		return 0;
+	}
+
+	sb->s_frozen = SB_FREEZE_WRITE;
+	smp_wmb();
+
+	sync_filesystem(sb);
+
+	sb->s_frozen = SB_FREEZE_TRANS;
+	smp_wmb();
+
+	sync_blockdev(sb->s_bdev);
+	if (sb->s_op->freeze_fs) {
+		ret = sb->s_op->freeze_fs(sb);
+		if (ret) {
+			printk(KERN_ERR
+				"VFS:Filesystem freeze failed\n");
+			sb->s_frozen = SB_UNFROZEN;
+			deactivate_locked_super(sb);
+			return ret;
+		}
+	}
+	up_write(&sb->s_umount);
+	return 0;
+}
+EXPORT_SYMBOL(freeze_super);
+
+/**
+ * thaw_super -- unlock filesystem
+ * @sb: the super to thaw
+ *
+ * Unlocks the filesystem and marks it writeable again after freeze_super().
+ */
+int thaw_super(struct super_block *sb)
+{
+	int error;
+
+	down_write(&sb->s_umount);
+	if (sb->s_frozen == SB_UNFROZEN) {
+		up_write(&sb->s_umount);
+		return -EINVAL;
+	}
+
+	if (sb->s_flags & MS_RDONLY)
+		goto out;
+
+	if (sb->s_op->unfreeze_fs) {
+		error = sb->s_op->unfreeze_fs(sb);
+		if (error) {
+			printk(KERN_ERR
+				"VFS:Filesystem thaw failed\n");
+			sb->s_frozen = SB_FREEZE_TRANS;
+			up_write(&sb->s_umount);
+			return error;
+		}
+	}
+
+out:
+	sb->s_frozen = SB_UNFROZEN;
+	smp_wmb();
+	wake_up(&sb->s_wait_unfrozen);
+	deactivate_locked_super(sb);
+
+	return 0;
+}
+EXPORT_SYMBOL(thaw_super);
+
 static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype)
 {
 	int err;
diff --git a/fs/sync.c b/fs/sync.c
index 418727a2a239..15aa6f03b2da 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -5,6 +5,7 @@
 #include <linux/kernel.h>
 #include <linux/file.h>
 #include <linux/fs.h>
+#include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/sched.h>
 #include <linux/writeback.h>
@@ -13,6 +14,7 @@
 #include <linux/pagemap.h>
 #include <linux/quotaops.h>
 #include <linux/buffer_head.h>
+#include <linux/backing-dev.h>
 #include "internal.h"
 
 #define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \
@@ -31,17 +33,17 @@ static int __sync_filesystem(struct super_block *sb, int wait)
 	 * This should be safe, as we require bdi backing to actually
 	 * write out data in the first place
 	 */
-	if (!sb->s_bdi)
+	if (!sb->s_bdi || sb->s_bdi == &noop_backing_dev_info)
 		return 0;
 
-	/* Avoid doing twice syncing and cache pruning for quota sync */
-	if (!wait) {
-		writeout_quota_sb(sb, -1);
-		writeback_inodes_sb(sb);
-	} else {
-		sync_quota_sb(sb, -1);
+	if (sb->s_qcop && sb->s_qcop->quota_sync)
+		sb->s_qcop->quota_sync(sb, -1, wait);
+
+	if (wait)
 		sync_inodes_sb(sb);
-	}
+	else
+		writeback_inodes_sb(sb);
+
 	if (sb->s_op->sync_fs)
 		sb->s_op->sync_fs(sb, wait);
 	return __sync_blockdev(sb->s_bdev, wait);
@@ -75,50 +77,18 @@ int sync_filesystem(struct super_block *sb)
 }
 EXPORT_SYMBOL_GPL(sync_filesystem);
 
+static void sync_one_sb(struct super_block *sb, void *arg)
+{
+	if (!(sb->s_flags & MS_RDONLY) && sb->s_bdi)
+		__sync_filesystem(sb, *(int *)arg);
+}
 /*
  * Sync all the data for all the filesystems (called by sys_sync() and
  * emergency sync)
- *
- * This operation is careful to avoid the livelock which could easily happen
- * if two or more filesystems are being continuously dirtied.  s_need_sync
- * is used only here.  We set it against all filesystems and then clear it as
- * we sync them.  So redirtied filesystems are skipped.
- *
- * But if process A is currently running sync_filesystems and then process B
- * calls sync_filesystems as well, process B will set all the s_need_sync
- * flags again, which will cause process A to resync everything.  Fix that with
- * a local mutex.
  */
 static void sync_filesystems(int wait)
 {
-	struct super_block *sb;
-	static DEFINE_MUTEX(mutex);
-
-	mutex_lock(&mutex);		/* Could be down_interruptible */
-	spin_lock(&sb_lock);
-	list_for_each_entry(sb, &super_blocks, s_list)
-		sb->s_need_sync = 1;
-
-restart:
-	list_for_each_entry(sb, &super_blocks, s_list) {
-		if (!sb->s_need_sync)
-			continue;
-		sb->s_need_sync = 0;
-		sb->s_count++;
-		spin_unlock(&sb_lock);
-
-		down_read(&sb->s_umount);
-		if (!(sb->s_flags & MS_RDONLY) && sb->s_root && sb->s_bdi)
-			__sync_filesystem(sb, wait);
-		up_read(&sb->s_umount);
-
-		/* restart only when sb is no longer on the list */
-		spin_lock(&sb_lock);
-		if (__put_super_and_need_restart(sb))
-			goto restart;
-	}
-	spin_unlock(&sb_lock);
-	mutex_unlock(&mutex);
+	iterate_supers(sync_one_sb, &wait);
 }
 
 /*
@@ -160,12 +130,10 @@ void emergency_sync(void)
 
 /*
  * Generic function to fsync a file.
- *
- * filp may be NULL if called via the msync of a vma.
  */
-int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
+int file_fsync(struct file *filp, int datasync)
 {
-	struct inode * inode = dentry->d_inode;
+	struct inode *inode = filp->f_mapping->host;
 	struct super_block * sb;
 	int ret, err;
 
@@ -188,7 +156,6 @@ EXPORT_SYMBOL(file_fsync);
 /**
  * vfs_fsync_range - helper to sync a range of data & metadata to disk
  * @file:		file to sync
- * @dentry:		dentry of @file
  * @start:		offset in bytes of the beginning of data range to sync
  * @end:		offset in bytes of the end of data range (inclusive)
  * @datasync:		perform only datasync
@@ -196,32 +163,13 @@ EXPORT_SYMBOL(file_fsync);
  * Write back data in range @start..@end and metadata for @file to disk.  If
  * @datasync is set only metadata needed to access modified file data is
  * written.
- *
- * In case this function is called from nfsd @file may be %NULL and
- * only @dentry is set.  This can only happen when the filesystem
- * implements the export_operations API.
  */
-int vfs_fsync_range(struct file *file, struct dentry *dentry, loff_t start,
-		    loff_t end, int datasync)
+int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync)
 {
-	const struct file_operations *fop;
-	struct address_space *mapping;
+	struct address_space *mapping = file->f_mapping;
 	int err, ret;
 
-	/*
-	 * Get mapping and operations from the file in case we have
-	 * as file, or get the default values for them in case we
-	 * don't have a struct file available.  Damn nfsd..
-	 */
-	if (file) {
-		mapping = file->f_mapping;
-		fop = file->f_op;
-	} else {
-		mapping = dentry->d_inode->i_mapping;
-		fop = dentry->d_inode->i_fop;
-	}
-
-	if (!fop || !fop->fsync) {
+	if (!file->f_op || !file->f_op->fsync) {
 		ret = -EINVAL;
 		goto out;
 	}
@@ -233,7 +181,7 @@ int vfs_fsync_range(struct file *file, struct dentry *dentry, loff_t start,
 	 * livelocks in fsync_buffers_list().
 	 */
 	mutex_lock(&mapping->host->i_mutex);
-	err = fop->fsync(file, dentry, datasync);
+	err = file->f_op->fsync(file, datasync);
 	if (!ret)
 		ret = err;
 	mutex_unlock(&mapping->host->i_mutex);
@@ -246,19 +194,14 @@ EXPORT_SYMBOL(vfs_fsync_range);
 /**
  * vfs_fsync - perform a fsync or fdatasync on a file
  * @file:		file to sync
- * @dentry:		dentry of @file
  * @datasync:		only perform a fdatasync operation
  *
  * Write back data and metadata for @file to disk.  If @datasync is
  * set only metadata needed to access modified file data is written.
- *
- * In case this function is called from nfsd @file may be %NULL and
- * only @dentry is set.  This can only happen when the filesystem
- * implements the export_operations API.
  */
-int vfs_fsync(struct file *file, struct dentry *dentry, int datasync)
+int vfs_fsync(struct file *file, int datasync)
 {
-	return vfs_fsync_range(file, dentry, 0, LLONG_MAX, datasync);
+	return vfs_fsync_range(file, 0, LLONG_MAX, datasync);
 }
 EXPORT_SYMBOL(vfs_fsync);
 
@@ -269,7 +212,7 @@ static int do_fsync(unsigned int fd, int datasync)
 
 	file = fget(fd);
 	if (file) {
-		ret = vfs_fsync(file, file->f_path.dentry, datasync);
+		ret = vfs_fsync(file, datasync);
 		fput(file);
 	}
 	return ret;
@@ -297,8 +240,7 @@ int generic_write_sync(struct file *file, loff_t pos, loff_t count)
 {
 	if (!(file->f_flags & O_DSYNC) && !IS_SYNC(file->f_mapping->host))
 		return 0;
-	return vfs_fsync_range(file, file->f_path.dentry, pos,
-			       pos + count - 1,
+	return vfs_fsync_range(file, pos, pos + count - 1,
 			       (file->f_flags & __O_SYNC) ? 0 : 1);
 }
 EXPORT_SYMBOL(generic_write_sync);
diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index a0a500af24a1..4e321f7353fa 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -46,22 +46,22 @@ struct bin_buffer {
 };
 
 static int
-fill_read(struct dentry *dentry, char *buffer, loff_t off, size_t count)
+fill_read(struct file *file, char *buffer, loff_t off, size_t count)
 {
-	struct sysfs_dirent *attr_sd = dentry->d_fsdata;
+	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
 	struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr;
 	struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
 	int rc;
 
 	/* need attr_sd for attr, its parent for kobj */
-	if (!sysfs_get_active_two(attr_sd))
+	if (!sysfs_get_active(attr_sd))
 		return -ENODEV;
 
 	rc = -EIO;
 	if (attr->read)
-		rc = attr->read(kobj, attr, buffer, off, count);
+		rc = attr->read(file, kobj, attr, buffer, off, count);
 
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 
 	return rc;
 }
@@ -70,8 +70,7 @@ static ssize_t
 read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off)
 {
 	struct bin_buffer *bb = file->private_data;
-	struct dentry *dentry = file->f_path.dentry;
-	int size = dentry->d_inode->i_size;
+	int size = file->f_path.dentry->d_inode->i_size;
 	loff_t offs = *off;
 	int count = min_t(size_t, bytes, PAGE_SIZE);
 	char *temp;
@@ -92,7 +91,7 @@ read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off)
 
 	mutex_lock(&bb->mutex);
 
-	count = fill_read(dentry, bb->buffer, offs, count);
+	count = fill_read(file, bb->buffer, offs, count);
 	if (count < 0) {
 		mutex_unlock(&bb->mutex);
 		goto out_free;
@@ -117,22 +116,22 @@ read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off)
 }
 
 static int
-flush_write(struct dentry *dentry, char *buffer, loff_t offset, size_t count)
+flush_write(struct file *file, char *buffer, loff_t offset, size_t count)
 {
-	struct sysfs_dirent *attr_sd = dentry->d_fsdata;
+	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
 	struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr;
 	struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
 	int rc;
 
 	/* need attr_sd for attr, its parent for kobj */
-	if (!sysfs_get_active_two(attr_sd))
+	if (!sysfs_get_active(attr_sd))
 		return -ENODEV;
 
 	rc = -EIO;
 	if (attr->write)
-		rc = attr->write(kobj, attr, buffer, offset, count);
+		rc = attr->write(file, kobj, attr, buffer, offset, count);
 
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 
 	return rc;
 }
@@ -141,8 +140,7 @@ static ssize_t write(struct file *file, const char __user *userbuf,
 		     size_t bytes, loff_t *off)
 {
 	struct bin_buffer *bb = file->private_data;
-	struct dentry *dentry = file->f_path.dentry;
-	int size = dentry->d_inode->i_size;
+	int size = file->f_path.dentry->d_inode->i_size;
 	loff_t offs = *off;
 	int count = min_t(size_t, bytes, PAGE_SIZE);
 	char *temp;
@@ -165,7 +163,7 @@ static ssize_t write(struct file *file, const char __user *userbuf,
 
 	memcpy(bb->buffer, temp, count);
 
-	count = flush_write(dentry, bb->buffer, offs, count);
+	count = flush_write(file, bb->buffer, offs, count);
 	mutex_unlock(&bb->mutex);
 
 	if (count > 0)
@@ -184,12 +182,12 @@ static void bin_vma_open(struct vm_area_struct *vma)
 	if (!bb->vm_ops || !bb->vm_ops->open)
 		return;
 
-	if (!sysfs_get_active_two(attr_sd))
+	if (!sysfs_get_active(attr_sd))
 		return;
 
 	bb->vm_ops->open(vma);
 
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 }
 
 static void bin_vma_close(struct vm_area_struct *vma)
@@ -201,12 +199,12 @@ static void bin_vma_close(struct vm_area_struct *vma)
 	if (!bb->vm_ops || !bb->vm_ops->close)
 		return;
 
-	if (!sysfs_get_active_two(attr_sd))
+	if (!sysfs_get_active(attr_sd))
 		return;
 
 	bb->vm_ops->close(vma);
 
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 }
 
 static int bin_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
@@ -219,12 +217,12 @@ static int bin_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	if (!bb->vm_ops || !bb->vm_ops->fault)
 		return VM_FAULT_SIGBUS;
 
-	if (!sysfs_get_active_two(attr_sd))
+	if (!sysfs_get_active(attr_sd))
 		return VM_FAULT_SIGBUS;
 
 	ret = bb->vm_ops->fault(vma, vmf);
 
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 	return ret;
 }
 
@@ -241,12 +239,12 @@ static int bin_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	if (!bb->vm_ops->page_mkwrite)
 		return 0;
 
-	if (!sysfs_get_active_two(attr_sd))
+	if (!sysfs_get_active(attr_sd))
 		return VM_FAULT_SIGBUS;
 
 	ret = bb->vm_ops->page_mkwrite(vma, vmf);
 
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 	return ret;
 }
 
@@ -261,12 +259,12 @@ static int bin_access(struct vm_area_struct *vma, unsigned long addr,
 	if (!bb->vm_ops || !bb->vm_ops->access)
 		return -EINVAL;
 
-	if (!sysfs_get_active_two(attr_sd))
+	if (!sysfs_get_active(attr_sd))
 		return -EINVAL;
 
 	ret = bb->vm_ops->access(vma, addr, buf, len, write);
 
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 	return ret;
 }
 
@@ -281,12 +279,12 @@ static int bin_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
 	if (!bb->vm_ops || !bb->vm_ops->set_policy)
 		return 0;
 
-	if (!sysfs_get_active_two(attr_sd))
+	if (!sysfs_get_active(attr_sd))
 		return -EINVAL;
 
 	ret = bb->vm_ops->set_policy(vma, new);
 
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 	return ret;
 }
 
@@ -301,12 +299,12 @@ static struct mempolicy *bin_get_policy(struct vm_area_struct *vma,
 	if (!bb->vm_ops || !bb->vm_ops->get_policy)
 		return vma->vm_policy;
 
-	if (!sysfs_get_active_two(attr_sd))
+	if (!sysfs_get_active(attr_sd))
 		return vma->vm_policy;
 
 	pol = bb->vm_ops->get_policy(vma, addr);
 
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 	return pol;
 }
 
@@ -321,12 +319,12 @@ static int bin_migrate(struct vm_area_struct *vma, const nodemask_t *from,
 	if (!bb->vm_ops || !bb->vm_ops->migrate)
 		return 0;
 
-	if (!sysfs_get_active_two(attr_sd))
+	if (!sysfs_get_active(attr_sd))
 		return 0;
 
 	ret = bb->vm_ops->migrate(vma, from, to, flags);
 
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 	return ret;
 }
 #endif
@@ -356,14 +354,14 @@ static int mmap(struct file *file, struct vm_area_struct *vma)
 
 	/* need attr_sd for attr, its parent for kobj */
 	rc = -ENODEV;
-	if (!sysfs_get_active_two(attr_sd))
+	if (!sysfs_get_active(attr_sd))
 		goto out_unlock;
 
 	rc = -EINVAL;
 	if (!attr->mmap)
 		goto out_put;
 
-	rc = attr->mmap(kobj, attr, vma);
+	rc = attr->mmap(file, kobj, attr, vma);
 	if (rc)
 		goto out_put;
 
@@ -384,7 +382,7 @@ static int mmap(struct file *file, struct vm_area_struct *vma)
 	bb->vm_ops = vma->vm_ops;
 	vma->vm_ops = &bin_vm_ops;
 out_put:
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 out_unlock:
 	mutex_unlock(&bb->mutex);
 
@@ -399,7 +397,7 @@ static int open(struct inode * inode, struct file * file)
 	int error;
 
 	/* binary file operations requires both @sd and its parent */
-	if (!sysfs_get_active_two(attr_sd))
+	if (!sysfs_get_active(attr_sd))
 		return -ENODEV;
 
 	error = -EACCES;
@@ -426,11 +424,11 @@ static int open(struct inode * inode, struct file * file)
 	mutex_unlock(&sysfs_bin_lock);
 
 	/* open succeeded, put active references */
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 	return 0;
 
  err_out:
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 	kfree(bb);
 	return error;
 }
@@ -501,7 +499,7 @@ int sysfs_create_bin_file(struct kobject *kobj,
 void sysfs_remove_bin_file(struct kobject *kobj,
 			   const struct bin_attribute *attr)
 {
-	sysfs_hash_and_remove(kobj->sd, attr->attr.name);
+	sysfs_hash_and_remove(kobj->sd, NULL, attr->attr.name);
 }
 
 EXPORT_SYMBOL_GPL(sysfs_create_bin_file);
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 699f371b9f12..7e54bac8c4b0 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -93,7 +93,7 @@ static void sysfs_unlink_sibling(struct sysfs_dirent *sd)
  *	RETURNS:
  *	Pointer to @sd on success, NULL on failure.
  */
-static struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd)
+struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd)
 {
 	if (unlikely(!sd))
 		return NULL;
@@ -124,7 +124,7 @@ static struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd)
  *	Put an active reference to @sd.  This function is noop if @sd
  *	is NULL.
  */
-static void sysfs_put_active(struct sysfs_dirent *sd)
+void sysfs_put_active(struct sysfs_dirent *sd)
 {
 	struct completion *cmpl;
 	int v;
@@ -145,45 +145,6 @@ static void sysfs_put_active(struct sysfs_dirent *sd)
 }
 
 /**
- *	sysfs_get_active_two - get active references to sysfs_dirent and parent
- *	@sd: sysfs_dirent of interest
- *
- *	Get active reference to @sd and its parent.  Parent's active
- *	reference is grabbed first.  This function is noop if @sd is
- *	NULL.
- *
- *	RETURNS:
- *	Pointer to @sd on success, NULL on failure.
- */
-struct sysfs_dirent *sysfs_get_active_two(struct sysfs_dirent *sd)
-{
-	if (sd) {
-		if (sd->s_parent && unlikely(!sysfs_get_active(sd->s_parent)))
-			return NULL;
-		if (unlikely(!sysfs_get_active(sd))) {
-			sysfs_put_active(sd->s_parent);
-			return NULL;
-		}
-	}
-	return sd;
-}
-
-/**
- *	sysfs_put_active_two - put active references to sysfs_dirent and parent
- *	@sd: sysfs_dirent of interest
- *
- *	Put active references to @sd and its parent.  This function is
- *	noop if @sd is NULL.
- */
-void sysfs_put_active_two(struct sysfs_dirent *sd)
-{
-	if (sd) {
-		sysfs_put_active(sd);
-		sysfs_put_active(sd->s_parent);
-	}
-}
-
-/**
  *	sysfs_deactivate - deactivate sysfs_dirent
  *	@sd: sysfs_dirent to deactivate
  *
@@ -195,6 +156,10 @@ static void sysfs_deactivate(struct sysfs_dirent *sd)
 	int v;
 
 	BUG_ON(sd->s_sibling || !(sd->s_flags & SYSFS_FLAG_REMOVED));
+
+	if (!(sysfs_type(sd) & SYSFS_ACTIVE_REF))
+		return;
+
 	sd->s_sibling = (void *)&wait;
 
 	rwsem_acquire(&sd->dep_map, 0, 0, _RET_IP_);
@@ -354,7 +319,6 @@ struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type)
 
 	atomic_set(&sd->s_count, 1);
 	atomic_set(&sd->s_active, 0);
-	sysfs_dirent_init_lockdep(sd);
 
 	sd->s_name = name;
 	sd->s_mode = mode;
@@ -416,7 +380,7 @@ int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
 {
 	struct sysfs_inode_attrs *ps_iattr;
 
-	if (sysfs_find_dirent(acxt->parent_sd, sd->s_name))
+	if (sysfs_find_dirent(acxt->parent_sd, sd->s_ns, sd->s_name))
 		return -EEXIST;
 
 	sd->s_parent = sysfs_get(acxt->parent_sd);
@@ -569,13 +533,17 @@ void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt)
  *	Pointer to sysfs_dirent if found, NULL if not.
  */
 struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
+				       const void *ns,
 				       const unsigned char *name)
 {
 	struct sysfs_dirent *sd;
 
-	for (sd = parent_sd->s_dir.children; sd; sd = sd->s_sibling)
+	for (sd = parent_sd->s_dir.children; sd; sd = sd->s_sibling) {
+		if (ns && sd->s_ns && (sd->s_ns != ns))
+			continue;
 		if (!strcmp(sd->s_name, name))
 			return sd;
+	}
 	return NULL;
 }
 
@@ -594,12 +562,13 @@ struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
  *	Pointer to sysfs_dirent if found, NULL if not.
  */
 struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd,
+				      const void *ns,
 				      const unsigned char *name)
 {
 	struct sysfs_dirent *sd;
 
 	mutex_lock(&sysfs_mutex);
-	sd = sysfs_find_dirent(parent_sd, name);
+	sd = sysfs_find_dirent(parent_sd, ns, name);
 	sysfs_get(sd);
 	mutex_unlock(&sysfs_mutex);
 
@@ -608,7 +577,8 @@ struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd,
 EXPORT_SYMBOL_GPL(sysfs_get_dirent);
 
 static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd,
-		      const char *name, struct sysfs_dirent **p_sd)
+	enum kobj_ns_type type, const void *ns, const char *name,
+	struct sysfs_dirent **p_sd)
 {
 	umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO;
 	struct sysfs_addrm_cxt acxt;
@@ -619,6 +589,9 @@ static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd,
 	sd = sysfs_new_dirent(name, mode, SYSFS_DIR);
 	if (!sd)
 		return -ENOMEM;
+
+	sd->s_flags |= (type << SYSFS_NS_TYPE_SHIFT);
+	sd->s_ns = ns;
 	sd->s_dir.kobj = kobj;
 
 	/* link in */
@@ -637,7 +610,33 @@ static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd,
 int sysfs_create_subdir(struct kobject *kobj, const char *name,
 			struct sysfs_dirent **p_sd)
 {
-	return create_dir(kobj, kobj->sd, name, p_sd);
+	return create_dir(kobj, kobj->sd,
+			  KOBJ_NS_TYPE_NONE, NULL, name, p_sd);
+}
+
+/**
+ *	sysfs_read_ns_type: return associated ns_type
+ *	@kobj: the kobject being queried
+ *
+ *	Each kobject can be tagged with exactly one namespace type
+ *	(i.e. network or user).  Return the ns_type associated with
+ *	this object if any
+ */
+static enum kobj_ns_type sysfs_read_ns_type(struct kobject *kobj)
+{
+	const struct kobj_ns_type_operations *ops;
+	enum kobj_ns_type type;
+
+	ops = kobj_child_ns_ops(kobj);
+	if (!ops)
+		return KOBJ_NS_TYPE_NONE;
+
+	type = ops->type;
+	BUG_ON(type <= KOBJ_NS_TYPE_NONE);
+	BUG_ON(type >= KOBJ_NS_TYPES);
+	BUG_ON(!kobj_ns_type_registered(type));
+
+	return type;
 }
 
 /**
@@ -646,7 +645,9 @@ int sysfs_create_subdir(struct kobject *kobj, const char *name,
  */
 int sysfs_create_dir(struct kobject * kobj)
 {
+	enum kobj_ns_type type;
 	struct sysfs_dirent *parent_sd, *sd;
+	const void *ns = NULL;
 	int error = 0;
 
 	BUG_ON(!kobj);
@@ -656,7 +657,11 @@ int sysfs_create_dir(struct kobject * kobj)
 	else
 		parent_sd = &sysfs_root;
 
-	error = create_dir(kobj, parent_sd, kobject_name(kobj), &sd);
+	if (sysfs_ns_type(parent_sd))
+		ns = kobj->ktype->namespace(kobj);
+	type = sysfs_read_ns_type(kobj);
+
+	error = create_dir(kobj, parent_sd, type, ns, kobject_name(kobj), &sd);
 	if (!error)
 		kobj->sd = sd;
 	return error;
@@ -666,13 +671,19 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry,
 				struct nameidata *nd)
 {
 	struct dentry *ret = NULL;
-	struct sysfs_dirent *parent_sd = dentry->d_parent->d_fsdata;
+	struct dentry *parent = dentry->d_parent;
+	struct sysfs_dirent *parent_sd = parent->d_fsdata;
 	struct sysfs_dirent *sd;
 	struct inode *inode;
+	enum kobj_ns_type type;
+	const void *ns;
 
 	mutex_lock(&sysfs_mutex);
 
-	sd = sysfs_find_dirent(parent_sd, dentry->d_name.name);
+	type = sysfs_ns_type(parent_sd);
+	ns = sysfs_info(dir->i_sb)->ns[type];
+
+	sd = sysfs_find_dirent(parent_sd, ns, dentry->d_name.name);
 
 	/* no such entry */
 	if (!sd) {
@@ -681,7 +692,7 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry,
 	}
 
 	/* attach dentry and inode */
-	inode = sysfs_get_inode(sd);
+	inode = sysfs_get_inode(dir->i_sb, sd);
 	if (!inode) {
 		ret = ERR_PTR(-ENOMEM);
 		goto out_unlock;
@@ -771,7 +782,8 @@ void sysfs_remove_dir(struct kobject * kobj)
 }
 
 int sysfs_rename(struct sysfs_dirent *sd,
-	struct sysfs_dirent *new_parent_sd, const char *new_name)
+	struct sysfs_dirent *new_parent_sd, const void *new_ns,
+	const char *new_name)
 {
 	const char *dup_name = NULL;
 	int error;
@@ -779,12 +791,12 @@ int sysfs_rename(struct sysfs_dirent *sd,
 	mutex_lock(&sysfs_mutex);
 
 	error = 0;
-	if ((sd->s_parent == new_parent_sd) &&
+	if ((sd->s_parent == new_parent_sd) && (sd->s_ns == new_ns) &&
 	    (strcmp(sd->s_name, new_name) == 0))
 		goto out;	/* nothing to rename */
 
 	error = -EEXIST;
-	if (sysfs_find_dirent(new_parent_sd, new_name))
+	if (sysfs_find_dirent(new_parent_sd, new_ns, new_name))
 		goto out;
 
 	/* rename sysfs_dirent */
@@ -806,6 +818,7 @@ int sysfs_rename(struct sysfs_dirent *sd,
 		sd->s_parent = new_parent_sd;
 		sysfs_link_sibling(sd);
 	}
+	sd->s_ns = new_ns;
 
 	error = 0;
  out:
@@ -816,19 +829,28 @@ int sysfs_rename(struct sysfs_dirent *sd,
 
 int sysfs_rename_dir(struct kobject *kobj, const char *new_name)
 {
-	return sysfs_rename(kobj->sd, kobj->sd->s_parent, new_name);
+	struct sysfs_dirent *parent_sd = kobj->sd->s_parent;
+	const void *new_ns = NULL;
+
+	if (sysfs_ns_type(parent_sd))
+		new_ns = kobj->ktype->namespace(kobj);
+
+	return sysfs_rename(kobj->sd, parent_sd, new_ns, new_name);
 }
 
 int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent_kobj)
 {
 	struct sysfs_dirent *sd = kobj->sd;
 	struct sysfs_dirent *new_parent_sd;
+	const void *new_ns = NULL;
 
 	BUG_ON(!sd->s_parent);
+	if (sysfs_ns_type(sd->s_parent))
+		new_ns = kobj->ktype->namespace(kobj);
 	new_parent_sd = new_parent_kobj && new_parent_kobj->sd ?
 		new_parent_kobj->sd : &sysfs_root;
 
-	return sysfs_rename(sd, new_parent_sd, sd->s_name);
+	return sysfs_rename(sd, new_parent_sd, new_ns, sd->s_name);
 }
 
 /* Relationship between s_mode and the DT_xxx types */
@@ -837,13 +859,56 @@ static inline unsigned char dt_type(struct sysfs_dirent *sd)
 	return (sd->s_mode >> 12) & 15;
 }
 
+static int sysfs_dir_release(struct inode *inode, struct file *filp)
+{
+	sysfs_put(filp->private_data);
+	return 0;
+}
+
+static struct sysfs_dirent *sysfs_dir_pos(const void *ns,
+	struct sysfs_dirent *parent_sd,	ino_t ino, struct sysfs_dirent *pos)
+{
+	if (pos) {
+		int valid = !(pos->s_flags & SYSFS_FLAG_REMOVED) &&
+			pos->s_parent == parent_sd &&
+			ino == pos->s_ino;
+		sysfs_put(pos);
+		if (!valid)
+			pos = NULL;
+	}
+	if (!pos && (ino > 1) && (ino < INT_MAX)) {
+		pos = parent_sd->s_dir.children;
+		while (pos && (ino > pos->s_ino))
+			pos = pos->s_sibling;
+	}
+	while (pos && pos->s_ns && pos->s_ns != ns)
+		pos = pos->s_sibling;
+	return pos;
+}
+
+static struct sysfs_dirent *sysfs_dir_next_pos(const void *ns,
+	struct sysfs_dirent *parent_sd,	ino_t ino, struct sysfs_dirent *pos)
+{
+	pos = sysfs_dir_pos(ns, parent_sd, ino, pos);
+	if (pos)
+		pos = pos->s_sibling;
+	while (pos && pos->s_ns && pos->s_ns != ns)
+		pos = pos->s_sibling;
+	return pos;
+}
+
 static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
 {
 	struct dentry *dentry = filp->f_path.dentry;
 	struct sysfs_dirent * parent_sd = dentry->d_fsdata;
-	struct sysfs_dirent *pos;
+	struct sysfs_dirent *pos = filp->private_data;
+	enum kobj_ns_type type;
+	const void *ns;
 	ino_t ino;
 
+	type = sysfs_ns_type(parent_sd);
+	ns = sysfs_info(dentry->d_sb)->ns[type];
+
 	if (filp->f_pos == 0) {
 		ino = parent_sd->s_ino;
 		if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) == 0)
@@ -857,29 +922,31 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
 		if (filldir(dirent, "..", 2, filp->f_pos, ino, DT_DIR) == 0)
 			filp->f_pos++;
 	}
-	if ((filp->f_pos > 1) && (filp->f_pos < INT_MAX)) {
-		mutex_lock(&sysfs_mutex);
-
-		/* Skip the dentries we have already reported */
-		pos = parent_sd->s_dir.children;
-		while (pos && (filp->f_pos > pos->s_ino))
-			pos = pos->s_sibling;
-
-		for ( ; pos; pos = pos->s_sibling) {
-			const char * name;
-			int len;
-
-			name = pos->s_name;
-			len = strlen(name);
-			filp->f_pos = ino = pos->s_ino;
+	mutex_lock(&sysfs_mutex);
+	for (pos = sysfs_dir_pos(ns, parent_sd, filp->f_pos, pos);
+	     pos;
+	     pos = sysfs_dir_next_pos(ns, parent_sd, filp->f_pos, pos)) {
+		const char * name;
+		unsigned int type;
+		int len, ret;
+
+		name = pos->s_name;
+		len = strlen(name);
+		ino = pos->s_ino;
+		type = dt_type(pos);
+		filp->f_pos = ino;
+		filp->private_data = sysfs_get(pos);
 
-			if (filldir(dirent, name, len, filp->f_pos, ino,
-					 dt_type(pos)) < 0)
-				break;
-		}
-		if (!pos)
-			filp->f_pos = INT_MAX;
 		mutex_unlock(&sysfs_mutex);
+		ret = filldir(dirent, name, len, filp->f_pos, ino, type);
+		mutex_lock(&sysfs_mutex);
+		if (ret < 0)
+			break;
+	}
+	mutex_unlock(&sysfs_mutex);
+	if ((filp->f_pos > 1) && !pos) { /* EOF */
+		filp->f_pos = INT_MAX;
+		filp->private_data = NULL;
 	}
 	return 0;
 }
@@ -888,5 +955,6 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
 const struct file_operations sysfs_dir_operations = {
 	.read		= generic_read_dir,
 	.readdir	= sysfs_readdir,
+	.release	= sysfs_dir_release,
 	.llseek		= generic_file_llseek,
 };
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index dc30d9e31683..1beaa739d0a6 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -53,7 +53,7 @@ struct sysfs_buffer {
 	size_t			count;
 	loff_t			pos;
 	char			* page;
-	struct sysfs_ops	* ops;
+	const struct sysfs_ops	* ops;
 	struct mutex		mutex;
 	int			needs_read_fill;
 	int			event;
@@ -75,7 +75,7 @@ static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer
 {
 	struct sysfs_dirent *attr_sd = dentry->d_fsdata;
 	struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
-	struct sysfs_ops * ops = buffer->ops;
+	const struct sysfs_ops * ops = buffer->ops;
 	int ret = 0;
 	ssize_t count;
 
@@ -85,13 +85,13 @@ static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer
 		return -ENOMEM;
 
 	/* need attr_sd for attr and ops, its parent for kobj */
-	if (!sysfs_get_active_two(attr_sd))
+	if (!sysfs_get_active(attr_sd))
 		return -ENODEV;
 
 	buffer->event = atomic_read(&attr_sd->s_attr.open->event);
 	count = ops->show(kobj, attr_sd->s_attr.attr, buffer->page);
 
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 
 	/*
 	 * The code works fine with PAGE_SIZE return but it's likely to
@@ -199,16 +199,16 @@ flush_write_buffer(struct dentry * dentry, struct sysfs_buffer * buffer, size_t
 {
 	struct sysfs_dirent *attr_sd = dentry->d_fsdata;
 	struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
-	struct sysfs_ops * ops = buffer->ops;
+	const struct sysfs_ops * ops = buffer->ops;
 	int rc;
 
 	/* need attr_sd for attr and ops, its parent for kobj */
-	if (!sysfs_get_active_two(attr_sd))
+	if (!sysfs_get_active(attr_sd))
 		return -ENODEV;
 
 	rc = ops->store(kobj, attr_sd->s_attr.attr, buffer->page, count);
 
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 
 	return rc;
 }
@@ -335,7 +335,7 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
 	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
 	struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
 	struct sysfs_buffer *buffer;
-	struct sysfs_ops *ops;
+	const struct sysfs_ops *ops;
 	int error = -EACCES;
 	char *p;
 
@@ -344,7 +344,7 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
 		memmove(last_sysfs_file, p, strlen(p) + 1);
 
 	/* need attr_sd for attr and ops, its parent for kobj */
-	if (!sysfs_get_active_two(attr_sd))
+	if (!sysfs_get_active(attr_sd))
 		return -ENODEV;
 
 	/* every kobject with an attribute needs a ktype assigned */
@@ -393,13 +393,13 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
 		goto err_free;
 
 	/* open succeeded, put active references */
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 	return 0;
 
  err_free:
 	kfree(buffer);
  err_out:
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 	return error;
 }
 
@@ -437,12 +437,12 @@ static unsigned int sysfs_poll(struct file *filp, poll_table *wait)
 	struct sysfs_open_dirent *od = attr_sd->s_attr.open;
 
 	/* need parent for the kobj, grab both */
-	if (!sysfs_get_active_two(attr_sd))
+	if (!sysfs_get_active(attr_sd))
 		goto trigger;
 
 	poll_wait(filp, &od->poll, wait);
 
-	sysfs_put_active_two(attr_sd);
+	sysfs_put_active(attr_sd);
 
 	if (buffer->event != atomic_read(&od->event))
 		goto trigger;
@@ -478,9 +478,12 @@ void sysfs_notify(struct kobject *k, const char *dir, const char *attr)
 	mutex_lock(&sysfs_mutex);
 
 	if (sd && dir)
-		sd = sysfs_find_dirent(sd, dir);
+		/* Only directories are tagged, so no need to pass
+		 * a tag explicitly.
+		 */
+		sd = sysfs_find_dirent(sd, NULL, dir);
 	if (sd && attr)
-		sd = sysfs_find_dirent(sd, attr);
+		sd = sysfs_find_dirent(sd, NULL, attr);
 	if (sd)
 		sysfs_notify_dirent(sd);
 
@@ -509,6 +512,7 @@ int sysfs_add_file_mode(struct sysfs_dirent *dir_sd,
 	if (!sd)
 		return -ENOMEM;
 	sd->s_attr.attr = (void *)attr;
+	sysfs_dirent_init_lockdep(sd);
 
 	sysfs_addrm_start(&acxt, dir_sd);
 	rc = sysfs_add_one(&acxt, sd);
@@ -542,6 +546,18 @@ int sysfs_create_file(struct kobject * kobj, const struct attribute * attr)
 
 }
 
+int sysfs_create_files(struct kobject *kobj, const struct attribute **ptr)
+{
+	int err = 0;
+	int i;
+
+	for (i = 0; ptr[i] && !err; i++)
+		err = sysfs_create_file(kobj, ptr[i]);
+	if (err)
+		while (--i >= 0)
+			sysfs_remove_file(kobj, ptr[i]);
+	return err;
+}
 
 /**
  * sysfs_add_file_to_group - add an attribute file to a pre-existing group.
@@ -556,7 +572,7 @@ int sysfs_add_file_to_group(struct kobject *kobj,
 	int error;
 
 	if (group)
-		dir_sd = sysfs_get_dirent(kobj->sd, group);
+		dir_sd = sysfs_get_dirent(kobj->sd, NULL, group);
 	else
 		dir_sd = sysfs_get(kobj->sd);
 
@@ -586,7 +602,7 @@ int sysfs_chmod_file(struct kobject *kobj, struct attribute *attr, mode_t mode)
 	mutex_lock(&sysfs_mutex);
 
 	rc = -ENOENT;
-	sd = sysfs_find_dirent(kobj->sd, attr->name);
+	sd = sysfs_find_dirent(kobj->sd, NULL, attr->name);
 	if (!sd)
 		goto out;
 
@@ -611,9 +627,15 @@ EXPORT_SYMBOL_GPL(sysfs_chmod_file);
 
 void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr)
 {
-	sysfs_hash_and_remove(kobj->sd, attr->name);
+	sysfs_hash_and_remove(kobj->sd, NULL, attr->name);
 }
 
+void sysfs_remove_files(struct kobject * kobj, const struct attribute **ptr)
+{
+	int i;
+	for (i = 0; ptr[i]; i++)
+		sysfs_remove_file(kobj, ptr[i]);
+}
 
 /**
  * sysfs_remove_file_from_group - remove an attribute file from a group.
@@ -627,11 +649,11 @@ void sysfs_remove_file_from_group(struct kobject *kobj,
 	struct sysfs_dirent *dir_sd;
 
 	if (group)
-		dir_sd = sysfs_get_dirent(kobj->sd, group);
+		dir_sd = sysfs_get_dirent(kobj->sd, NULL, group);
 	else
 		dir_sd = sysfs_get(kobj->sd);
 	if (dir_sd) {
-		sysfs_hash_and_remove(dir_sd, attr->name);
+		sysfs_hash_and_remove(dir_sd, NULL, attr->name);
 		sysfs_put(dir_sd);
 	}
 }
@@ -732,3 +754,5 @@ EXPORT_SYMBOL_GPL(sysfs_schedule_callback);
 
 EXPORT_SYMBOL_GPL(sysfs_create_file);
 EXPORT_SYMBOL_GPL(sysfs_remove_file);
+EXPORT_SYMBOL_GPL(sysfs_remove_files);
+EXPORT_SYMBOL_GPL(sysfs_create_files);
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index fe611949a7f7..23c1e598792a 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -23,7 +23,7 @@ static void remove_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
 	int i;
 
 	for (i = 0, attr = grp->attrs; *attr; i++, attr++)
-		sysfs_hash_and_remove(dir_sd, (*attr)->name);
+		sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name);
 }
 
 static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
@@ -39,7 +39,7 @@ static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
 		 * visibility.  Do this by first removing then
 		 * re-adding (if required) the file */
 		if (update)
-			sysfs_hash_and_remove(dir_sd, (*attr)->name);
+			sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name);
 		if (grp->is_visible) {
 			mode = grp->is_visible(kobj, *attr, i);
 			if (!mode)
@@ -132,7 +132,7 @@ void sysfs_remove_group(struct kobject * kobj,
 	struct sysfs_dirent *sd;
 
 	if (grp->name) {
-		sd = sysfs_get_dirent(dir_sd, grp->name);
+		sd = sysfs_get_dirent(dir_sd, NULL, grp->name);
 		if (!sd) {
 			WARN(!sd, KERN_WARNING "sysfs group %p not found for "
 				"kobject '%s'\n", grp, kobject_name(kobj));
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index 220b758523ae..0835a3b70e03 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -18,6 +18,7 @@
 #include <linux/capability.h>
 #include <linux/errno.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/xattr.h>
 #include <linux/security.h>
 #include "sysfs.h"
@@ -81,24 +82,23 @@ int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr * iattr)
 		if (!sd_attrs)
 			return -ENOMEM;
 		sd->s_iattr = sd_attrs;
-	} else {
-		/* attributes were changed at least once in past */
-		iattrs = &sd_attrs->ia_iattr;
-
-		if (ia_valid & ATTR_UID)
-			iattrs->ia_uid = iattr->ia_uid;
-		if (ia_valid & ATTR_GID)
-			iattrs->ia_gid = iattr->ia_gid;
-		if (ia_valid & ATTR_ATIME)
-			iattrs->ia_atime = iattr->ia_atime;
-		if (ia_valid & ATTR_MTIME)
-			iattrs->ia_mtime = iattr->ia_mtime;
-		if (ia_valid & ATTR_CTIME)
-			iattrs->ia_ctime = iattr->ia_ctime;
-		if (ia_valid & ATTR_MODE) {
-			umode_t mode = iattr->ia_mode;
-			iattrs->ia_mode = sd->s_mode = mode;
-		}
+	}
+	/* attributes were changed at least once in past */
+	iattrs = &sd_attrs->ia_iattr;
+
+	if (ia_valid & ATTR_UID)
+		iattrs->ia_uid = iattr->ia_uid;
+	if (ia_valid & ATTR_GID)
+		iattrs->ia_gid = iattr->ia_gid;
+	if (ia_valid & ATTR_ATIME)
+		iattrs->ia_atime = iattr->ia_atime;
+	if (ia_valid & ATTR_MTIME)
+		iattrs->ia_mtime = iattr->ia_mtime;
+	if (ia_valid & ATTR_CTIME)
+		iattrs->ia_ctime = iattr->ia_ctime;
+	if (ia_valid & ATTR_MODE) {
+		umode_t mode = iattr->ia_mode;
+		iattrs->ia_mode = sd->s_mode = mode;
 	}
 	return 0;
 }
@@ -112,20 +112,20 @@ int sysfs_setattr(struct dentry *dentry, struct iattr *iattr)
 	if (!sd)
 		return -EINVAL;
 
+	mutex_lock(&sysfs_mutex);
 	error = inode_change_ok(inode, iattr);
 	if (error)
-		return error;
-
-	iattr->ia_valid &= ~ATTR_SIZE; /* ignore size changes */
+		goto out;
 
-	error = inode_setattr(inode, iattr);
+	error = sysfs_sd_setattr(sd, iattr);
 	if (error)
-		return error;
+		goto out;
 
-	mutex_lock(&sysfs_mutex);
-	error = sysfs_sd_setattr(sd, iattr);
-	mutex_unlock(&sysfs_mutex);
+	/* this ignores size changes */
+	generic_setattr(inode, iattr);
 
+out:
+	mutex_unlock(&sysfs_mutex);
 	return error;
 }
 
@@ -284,6 +284,7 @@ static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode)
 
 /**
  *	sysfs_get_inode - get inode for sysfs_dirent
+ *	@sb: super block
  *	@sd: sysfs_dirent to allocate inode for
  *
  *	Get inode for @sd.  If such inode doesn't exist, a new inode
@@ -296,11 +297,11 @@ static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode)
  *	RETURNS:
  *	Pointer to allocated inode on success, NULL on failure.
  */
-struct inode * sysfs_get_inode(struct sysfs_dirent *sd)
+struct inode * sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd)
 {
 	struct inode *inode;
 
-	inode = iget_locked(sysfs_sb, sd->s_ino);
+	inode = iget_locked(sb, sd->s_ino);
 	if (inode && (inode->i_state & I_NEW))
 		sysfs_init_inode(sd, inode);
 
@@ -323,7 +324,7 @@ void sysfs_delete_inode(struct inode *inode)
 	sysfs_put(sd);
 }
 
-int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name)
+int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, const char *name)
 {
 	struct sysfs_addrm_cxt acxt;
 	struct sysfs_dirent *sd;
@@ -333,7 +334,9 @@ int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name)
 
 	sysfs_addrm_start(&acxt, dir_sd);
 
-	sd = sysfs_find_dirent(dir_sd, name);
+	sd = sysfs_find_dirent(dir_sd, ns, name);
+	if (sd && (sd->s_ns != ns))
+		sd = NULL;
 	if (sd)
 		sysfs_remove_one(&acxt, sd);
 
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 49749955ccaf..281c0c9bc39f 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -18,12 +18,12 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/magic.h>
+#include <linux/slab.h>
 
 #include "sysfs.h"
 
 
 static struct vfsmount *sysfs_mount;
-struct super_block * sysfs_sb = NULL;
 struct kmem_cache *sysfs_dir_cachep;
 
 static const struct super_operations sysfs_ops = {
@@ -35,7 +35,7 @@ static const struct super_operations sysfs_ops = {
 struct sysfs_dirent sysfs_root = {
 	.s_name		= "",
 	.s_count	= ATOMIC_INIT(1),
-	.s_flags	= SYSFS_DIR,
+	.s_flags	= SYSFS_DIR | (KOBJ_NS_TYPE_NONE << SYSFS_NS_TYPE_SHIFT),
 	.s_mode		= S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO,
 	.s_ino		= 1,
 };
@@ -50,11 +50,10 @@ static int sysfs_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_magic = SYSFS_MAGIC;
 	sb->s_op = &sysfs_ops;
 	sb->s_time_gran = 1;
-	sysfs_sb = sb;
 
 	/* get root inode, initialize and unlock it */
 	mutex_lock(&sysfs_mutex);
-	inode = sysfs_get_inode(&sysfs_root);
+	inode = sysfs_get_inode(sb, &sysfs_root);
 	mutex_unlock(&sysfs_mutex);
 	if (!inode) {
 		pr_debug("sysfs: could not get root inode\n");
@@ -73,18 +72,107 @@ static int sysfs_fill_super(struct super_block *sb, void *data, int silent)
 	return 0;
 }
 
+static int sysfs_test_super(struct super_block *sb, void *data)
+{
+	struct sysfs_super_info *sb_info = sysfs_info(sb);
+	struct sysfs_super_info *info = data;
+	enum kobj_ns_type type;
+	int found = 1;
+
+	for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) {
+		if (sb_info->ns[type] != info->ns[type])
+			found = 0;
+	}
+	return found;
+}
+
+static int sysfs_set_super(struct super_block *sb, void *data)
+{
+	int error;
+	error = set_anon_super(sb, data);
+	if (!error)
+		sb->s_fs_info = data;
+	return error;
+}
+
 static int sysfs_get_sb(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_single(fs_type, flags, data, sysfs_fill_super, mnt);
+	struct sysfs_super_info *info;
+	enum kobj_ns_type type;
+	struct super_block *sb;
+	int error;
+
+	error = -ENOMEM;
+	info = kzalloc(sizeof(*info), GFP_KERNEL);
+	if (!info)
+		goto out;
+
+	for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++)
+		info->ns[type] = kobj_ns_current(type);
+
+	sb = sget(fs_type, sysfs_test_super, sysfs_set_super, info);
+	if (IS_ERR(sb) || sb->s_fs_info != info)
+		kfree(info);
+	if (IS_ERR(sb)) {
+		error = PTR_ERR(sb);
+		goto out;
+	}
+	if (!sb->s_root) {
+		sb->s_flags = flags;
+		error = sysfs_fill_super(sb, data, flags & MS_SILENT ? 1 : 0);
+		if (error) {
+			deactivate_locked_super(sb);
+			goto out;
+		}
+		sb->s_flags |= MS_ACTIVE;
+	}
+
+	simple_set_mnt(mnt, sb);
+	error = 0;
+out:
+	return error;
+}
+
+static void sysfs_kill_sb(struct super_block *sb)
+{
+	struct sysfs_super_info *info = sysfs_info(sb);
+
+	/* Remove the superblock from fs_supers/s_instances
+	 * so we can't find it, before freeing sysfs_super_info.
+	 */
+	kill_anon_super(sb);
+	kfree(info);
 }
 
 static struct file_system_type sysfs_fs_type = {
 	.name		= "sysfs",
 	.get_sb		= sysfs_get_sb,
-	.kill_sb	= kill_anon_super,
+	.kill_sb	= sysfs_kill_sb,
 };
 
+void sysfs_exit_ns(enum kobj_ns_type type, const void *ns)
+{
+	struct super_block *sb;
+
+	mutex_lock(&sysfs_mutex);
+	spin_lock(&sb_lock);
+	list_for_each_entry(sb, &sysfs_fs_type.fs_supers, s_instances) {
+		struct sysfs_super_info *info = sysfs_info(sb);
+		/*
+		 * If we see a superblock on the fs_supers/s_instances
+		 * list the unmount has not completed and sb->s_fs_info
+		 * points to a valid struct sysfs_super_info.
+		 */
+		/* Ignore superblocks with the wrong ns */
+		if (info->ns[type] != ns)
+			continue;
+		info->ns[type] = NULL;
+	}
+	spin_unlock(&sb_lock);
+	mutex_unlock(&sysfs_mutex);
+}
+
 int __init sysfs_init(void)
 {
 	int err = -ENOMEM;
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index c5eff49fa41b..a7ac78f8e67a 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -11,6 +11,7 @@
  */
 
 #include <linux/fs.h>
+#include <linux/gfp.h>
 #include <linux/mount.h>
 #include <linux/module.h>
 #include <linux/kobject.h>
@@ -27,6 +28,7 @@ static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target,
 	struct sysfs_dirent *target_sd = NULL;
 	struct sysfs_dirent *sd = NULL;
 	struct sysfs_addrm_cxt acxt;
+	enum kobj_ns_type ns_type;
 	int error;
 
 	BUG_ON(!name);
@@ -57,14 +59,29 @@ static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target,
 	if (!sd)
 		goto out_put;
 
+	ns_type = sysfs_ns_type(parent_sd);
+	if (ns_type)
+		sd->s_ns = target->ktype->namespace(target);
 	sd->s_symlink.target_sd = target_sd;
 	target_sd = NULL;	/* reference is now owned by the symlink */
 
 	sysfs_addrm_start(&acxt, parent_sd);
-	if (warn)
-		error = sysfs_add_one(&acxt, sd);
-	else
-		error = __sysfs_add_one(&acxt, sd);
+	/* Symlinks must be between directories with the same ns_type */
+	if (!ns_type ||
+	    (ns_type == sysfs_ns_type(sd->s_symlink.target_sd->s_parent))) {
+		if (warn)
+			error = sysfs_add_one(&acxt, sd);
+		else
+			error = __sysfs_add_one(&acxt, sd);
+	} else {
+		error = -EINVAL;
+		WARN(1, KERN_WARNING
+			"sysfs: symlink across ns_types %s/%s -> %s/%s\n",
+			parent_sd->s_name,
+			sd->s_name,
+			sd->s_symlink.target_sd->s_parent->s_name,
+			sd->s_symlink.target_sd->s_name);
+	}
 	sysfs_addrm_finish(&acxt);
 
 	if (error)
@@ -106,6 +123,26 @@ int sysfs_create_link_nowarn(struct kobject *kobj, struct kobject *target,
 }
 
 /**
+ *	sysfs_delete_link - remove symlink in object's directory.
+ *	@kobj:	object we're acting for.
+ *	@targ:	object we're pointing to.
+ *	@name:	name of the symlink to remove.
+ *
+ *	Unlike sysfs_remove_link sysfs_delete_link has enough information
+ *	to successfully delete symlinks in tagged directories.
+ */
+void sysfs_delete_link(struct kobject *kobj, struct kobject *targ,
+			const char *name)
+{
+	const void *ns = NULL;
+	spin_lock(&sysfs_assoc_lock);
+	if (targ->sd && sysfs_ns_type(kobj->sd))
+		ns = targ->sd->s_ns;
+	spin_unlock(&sysfs_assoc_lock);
+	sysfs_hash_and_remove(kobj->sd, ns, name);
+}
+
+/**
  *	sysfs_remove_link - remove symlink in object's directory.
  *	@kobj:	object we're acting for.
  *	@name:	name of the symlink to remove.
@@ -120,7 +157,52 @@ void sysfs_remove_link(struct kobject * kobj, const char * name)
 	else
 		parent_sd = kobj->sd;
 
-	sysfs_hash_and_remove(parent_sd, name);
+	sysfs_hash_and_remove(parent_sd, NULL, name);
+}
+
+/**
+ *	sysfs_rename_link - rename symlink in object's directory.
+ *	@kobj:	object we're acting for.
+ *	@targ:	object we're pointing to.
+ *	@old:	previous name of the symlink.
+ *	@new:	new name of the symlink.
+ *
+ *	A helper function for the common rename symlink idiom.
+ */
+int sysfs_rename_link(struct kobject *kobj, struct kobject *targ,
+			const char *old, const char *new)
+{
+	struct sysfs_dirent *parent_sd, *sd = NULL;
+	const void *old_ns = NULL, *new_ns = NULL;
+	int result;
+
+	if (!kobj)
+		parent_sd = &sysfs_root;
+	else
+		parent_sd = kobj->sd;
+
+	if (targ->sd)
+		old_ns = targ->sd->s_ns;
+
+	result = -ENOENT;
+	sd = sysfs_get_dirent(parent_sd, old_ns, old);
+	if (!sd)
+		goto out;
+
+	result = -EINVAL;
+	if (sysfs_type(sd) != SYSFS_KOBJ_LINK)
+		goto out;
+	if (sd->s_symlink.target_sd->s_dir.kobj != targ)
+		goto out;
+
+	if (sysfs_ns_type(parent_sd))
+		new_ns = targ->ktype->namespace(targ);
+
+	result = sysfs_rename(sd, parent_sd, new_ns, new);
+
+out:
+	sysfs_put(sd);
+	return result;
 }
 
 static int sysfs_get_target_path(struct sysfs_dirent *parent_sd,
@@ -222,3 +304,4 @@ const struct inode_operations sysfs_symlink_inode_operations = {
 
 EXPORT_SYMBOL_GPL(sysfs_create_link);
 EXPORT_SYMBOL_GPL(sysfs_remove_link);
+EXPORT_SYMBOL_GPL(sysfs_rename_link);
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index cdd9377a6e06..6a13105b5594 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -58,6 +58,7 @@ struct sysfs_dirent {
 	struct sysfs_dirent	*s_sibling;
 	const char		*s_name;
 
+	const void		*s_ns; /* namespace tag */
 	union {
 		struct sysfs_elem_dir		s_dir;
 		struct sysfs_elem_symlink	s_symlink;
@@ -66,8 +67,8 @@ struct sysfs_dirent {
 	};
 
 	unsigned int		s_flags;
+	unsigned short		s_mode;
 	ino_t			s_ino;
-	umode_t			s_mode;
 	struct sysfs_inode_attrs *s_iattr;
 };
 
@@ -79,21 +80,38 @@ struct sysfs_dirent {
 #define SYSFS_KOBJ_BIN_ATTR		0x0004
 #define SYSFS_KOBJ_LINK			0x0008
 #define SYSFS_COPY_NAME			(SYSFS_DIR | SYSFS_KOBJ_LINK)
+#define SYSFS_ACTIVE_REF		(SYSFS_KOBJ_ATTR | SYSFS_KOBJ_BIN_ATTR)
 
-#define SYSFS_FLAG_MASK			~SYSFS_TYPE_MASK
-#define SYSFS_FLAG_REMOVED		0x0200
+/* identify any namespace tag on sysfs_dirents */
+#define SYSFS_NS_TYPE_MASK		0xff00
+#define SYSFS_NS_TYPE_SHIFT		8
+
+#define SYSFS_FLAG_MASK			~(SYSFS_NS_TYPE_MASK|SYSFS_TYPE_MASK)
+#define SYSFS_FLAG_REMOVED		0x020000
 
 static inline unsigned int sysfs_type(struct sysfs_dirent *sd)
 {
 	return sd->s_flags & SYSFS_TYPE_MASK;
 }
 
+/*
+ * Return any namespace tags on this dirent.
+ * enum kobj_ns_type is defined in linux/kobject.h
+ */
+static inline enum kobj_ns_type sysfs_ns_type(struct sysfs_dirent *sd)
+{
+	return (sd->s_flags & SYSFS_NS_TYPE_MASK) >> SYSFS_NS_TYPE_SHIFT;
+}
+
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 #define sysfs_dirent_init_lockdep(sd)				\
 do {								\
-	static struct lock_class_key __key;			\
+	struct attribute *attr = sd->s_attr.attr;		\
+	struct lock_class_key *key = attr->key;			\
+	if (!key)						\
+		key = &attr->skey;				\
 								\
-	lockdep_init_map(&sd->dep_map, "s_active", &__key, 0);	\
+	lockdep_init_map(&sd->dep_map, "s_active", key, 0);	\
 } while(0)
 #else
 #define sysfs_dirent_init_lockdep(sd) do {} while(0)
@@ -110,8 +128,17 @@ struct sysfs_addrm_cxt {
 /*
  * mount.c
  */
+
+/*
+ * Each sb is associated with a set of namespace tags (i.e.
+ * the network namespace of the task which mounted this sysfs
+ * instance).
+ */
+struct sysfs_super_info {
+	const void *ns[KOBJ_NS_TYPES];
+};
+#define sysfs_info(SB) ((struct sysfs_super_info *)(SB->s_fs_info))
 extern struct sysfs_dirent sysfs_root;
-extern struct super_block *sysfs_sb;
 extern struct kmem_cache *sysfs_dir_cachep;
 
 /*
@@ -124,8 +151,8 @@ extern const struct file_operations sysfs_dir_operations;
 extern const struct inode_operations sysfs_dir_inode_operations;
 
 struct dentry *sysfs_get_dentry(struct sysfs_dirent *sd);
-struct sysfs_dirent *sysfs_get_active_two(struct sysfs_dirent *sd);
-void sysfs_put_active_two(struct sysfs_dirent *sd);
+struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd);
+void sysfs_put_active(struct sysfs_dirent *sd);
 void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt,
 		       struct sysfs_dirent *parent_sd);
 int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd);
@@ -134,8 +161,10 @@ void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd);
 void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt);
 
 struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
+				       const void *ns,
 				       const unsigned char *name);
 struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd,
+				      const void *ns,
 				      const unsigned char *name);
 struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type);
 
@@ -146,7 +175,7 @@ int sysfs_create_subdir(struct kobject *kobj, const char *name,
 void sysfs_remove_subdir(struct sysfs_dirent *sd);
 
 int sysfs_rename(struct sysfs_dirent *sd,
-	struct sysfs_dirent *new_parent_sd, const char *new_name);
+	struct sysfs_dirent *new_parent_sd, const void *ns, const char *new_name);
 
 static inline struct sysfs_dirent *__sysfs_get(struct sysfs_dirent *sd)
 {
@@ -168,7 +197,7 @@ static inline void __sysfs_put(struct sysfs_dirent *sd)
 /*
  * inode.c
  */
-struct inode *sysfs_get_inode(struct sysfs_dirent *sd);
+struct inode *sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd);
 void sysfs_delete_inode(struct inode *inode);
 int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr *iattr);
 int sysfs_permission(struct inode *inode, int mask);
@@ -176,7 +205,7 @@ int sysfs_setattr(struct dentry *dentry, struct iattr *iattr);
 int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat);
 int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value,
 		size_t size, int flags);
-int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name);
+int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, const char *name);
 int sysfs_inode_init(void);
 
 /*
diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c
index 4e50286a4cc3..79941e4964a4 100644
--- a/fs/sysv/dir.c
+++ b/fs/sysv/dir.c
@@ -24,7 +24,7 @@ const struct file_operations sysv_dir_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
 	.readdir	= sysv_readdir,
-	.fsync		= simple_fsync,
+	.fsync		= generic_file_fsync,
 };
 
 static inline void dir_put_page(struct page *page)
@@ -164,8 +164,8 @@ struct sysv_dir_entry *sysv_find_entry(struct dentry *dentry, struct page **res_
 							name, de->name))
 					goto found;
 			}
+			dir_put_page(page);
 		}
-		dir_put_page(page);
 
 		if (++n >= npages)
 			n = 0;
diff --git a/fs/sysv/file.c b/fs/sysv/file.c
index 96340c01f4a7..750cc22349bd 100644
--- a/fs/sysv/file.c
+++ b/fs/sysv/file.c
@@ -26,7 +26,7 @@ const struct file_operations sysv_file_operations = {
 	.write		= do_sync_write,
 	.aio_write	= generic_file_aio_write,
 	.mmap		= generic_file_mmap,
-	.fsync		= simple_fsync,
+	.fsync		= generic_file_fsync,
 	.splice_read	= generic_file_splice_read,
 };
 
diff --git a/fs/sysv/ialloc.c b/fs/sysv/ialloc.c
index 241e9765cfad..fcc498ec9b33 100644
--- a/fs/sysv/ialloc.c
+++ b/fs/sysv/ialloc.c
@@ -25,6 +25,7 @@
 #include <linux/stat.h>
 #include <linux/string.h>
 #include <linux/buffer_head.h>
+#include <linux/writeback.h>
 #include "sysv.h"
 
 /* We don't trust the value of
@@ -139,6 +140,9 @@ struct inode * sysv_new_inode(const struct inode * dir, mode_t mode)
 	struct inode *inode;
 	sysv_ino_t ino;
 	unsigned count;
+	struct writeback_control wbc = {
+		.sync_mode = WB_SYNC_NONE
+	};
 
 	inode = new_inode(sb);
 	if (!inode)
@@ -159,15 +163,7 @@ struct inode * sysv_new_inode(const struct inode * dir, mode_t mode)
 	*sbi->s_sb_fic_count = cpu_to_fs16(sbi, count);
 	fs16_add(sbi, sbi->s_sb_total_free_inodes, -1);
 	dirty_sb(sb);
-	
-	if (dir->i_mode & S_ISGID) {
-		inode->i_gid = dir->i_gid;
-		if (S_ISDIR(mode))
-			mode |= S_ISGID;
-	} else
-		inode->i_gid = current_fsgid();
-
-	inode->i_uid = current_fsuid();
+	inode_init_owner(inode, dir, mode);
 	inode->i_ino = fs16_to_cpu(sbi, ino);
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
 	inode->i_blocks = 0;
@@ -176,8 +172,7 @@ struct inode * sysv_new_inode(const struct inode * dir, mode_t mode)
 	insert_inode_hash(inode);
 	mark_inode_dirty(inode);
 
-	inode->i_mode = mode;		/* for sysv_write_inode() */
-	sysv_write_inode(inode, 0);	/* ensure inode not allocated again */
+	sysv_write_inode(inode, &wbc);	/* ensure inode not allocated again */
 	mark_inode_dirty(inode);	/* cleared by sysv_write_inode() */
 	/* That's it. */
 	unlock_super(sb);
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 9824743832a7..d4a5380b5669 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -26,6 +26,7 @@
 #include <linux/init.h>
 #include <linux/buffer_head.h>
 #include <linux/vfs.h>
+#include <linux/writeback.h>
 #include <linux/namei.h>
 #include <asm/byteorder.h>
 #include "sysv.h"
@@ -42,6 +43,7 @@ static int sysv_sync_fs(struct super_block *sb, int wait)
 	 * then attach current time stamp.
 	 * But if the filesystem was marked clean, keep it clean.
 	 */
+	sb->s_dirt = 0;
 	old_time = fs32_to_cpu(sbi, *sbi->s_sb_time);
 	if (sbi->s_type == FSTYPE_SYSV4) {
 		if (*sbi->s_sb_state == cpu_to_fs32(sbi, 0x7c269d38 - old_time))
@@ -246,7 +248,7 @@ bad_inode:
 	return ERR_PTR(-EIO);
 }
 
-int sysv_write_inode(struct inode *inode, int wait)
+static int __sysv_write_inode(struct inode *inode, int wait)
 {
 	struct super_block * sb = inode->i_sb;
 	struct sysv_sb_info * sbi = SYSV_SB(sb);
@@ -296,9 +298,14 @@ int sysv_write_inode(struct inode *inode, int wait)
 	return 0;
 }
 
+int sysv_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+	return __sysv_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
+}
+
 int sysv_sync_inode(struct inode *inode)
 {
-	return sysv_write_inode(inode, 1);
+	return __sysv_write_inode(inode, 1);
 }
 
 static void sysv_delete_inode(struct inode *inode)
diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h
index 53786eb5cf60..94cb9b4d76c2 100644
--- a/fs/sysv/sysv.h
+++ b/fs/sysv/sysv.h
@@ -142,7 +142,7 @@ extern int __sysv_write_begin(struct file *file, struct address_space *mapping,
 
 /* inode.c */
 extern struct inode *sysv_iget(struct super_block *, unsigned int);
-extern int sysv_write_inode(struct inode *, int);
+extern int sysv_write_inode(struct inode *, struct writeback_control *wbc);
 extern int sysv_sync_inode(struct inode *);
 extern void sysv_set_inode(struct inode *, dev_t);
 extern int sysv_getattr(struct vfsmount *, struct dentry *, struct kstat *);
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 1bfc95ad5f71..b86ab8eff79a 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -14,6 +14,7 @@
 #include <linux/fs.h>
 #include <linux/sched.h>
 #include <linux/kernel.h>
+#include <linux/slab.h>
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/time.h>
@@ -109,31 +110,14 @@ static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count,
 	struct timerfd_ctx *ctx = file->private_data;
 	ssize_t res;
 	u64 ticks = 0;
-	DECLARE_WAITQUEUE(wait, current);
 
 	if (count < sizeof(ticks))
 		return -EINVAL;
 	spin_lock_irq(&ctx->wqh.lock);
-	res = -EAGAIN;
-	if (!ctx->ticks && !(file->f_flags & O_NONBLOCK)) {
-		__add_wait_queue(&ctx->wqh, &wait);
-		for (res = 0;;) {
-			set_current_state(TASK_INTERRUPTIBLE);
-			if (ctx->ticks) {
-				res = 0;
-				break;
-			}
-			if (signal_pending(current)) {
-				res = -ERESTARTSYS;
-				break;
-			}
-			spin_unlock_irq(&ctx->wqh.lock);
-			schedule();
-			spin_lock_irq(&ctx->wqh.lock);
-		}
-		__remove_wait_queue(&ctx->wqh, &wait);
-		__set_current_state(TASK_RUNNING);
-	}
+	if (file->f_flags & O_NONBLOCK)
+		res = -EAGAIN;
+	else
+		res = wait_event_interruptible_locked_irq(ctx->wqh, ctx->ticks);
 	if (ctx->ticks) {
 		ticks = ctx->ticks;
 		if (ctx->expired && ctx->tintv.tv64) {
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index 076ca50e9933..c8ff0d1ae5d3 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -62,7 +62,9 @@
  */
 static void shrink_liability(struct ubifs_info *c, int nr_to_write)
 {
+	down_read(&c->vfs_sb->s_umount);
 	writeback_inodes_sb(c->vfs_sb);
+	up_read(&c->vfs_sb->s_umount);
 }
 
 /**
diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c
index 4775af401167..37fa7ed062d8 100644
--- a/fs/ubifs/commit.c
+++ b/fs/ubifs/commit.c
@@ -45,6 +45,7 @@
 
 #include <linux/freezer.h>
 #include <linux/kthread.h>
+#include <linux/slab.h>
 #include "ubifs.h"
 
 /**
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 90492327b383..c2a68baa782f 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -34,6 +34,7 @@
 #include <linux/moduleparam.h>
 #include <linux/debugfs.h>
 #include <linux/math64.h>
+#include <linux/slab.h>
 
 #ifdef CONFIG_UBIFS_FS_DEBUG
 
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 552fb0111fff..87ebcce72213 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -104,14 +104,7 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir,
 	 */
 	inode->i_flags |= (S_NOCMTIME);
 
-	inode->i_uid = current_fsuid();
-	if (dir->i_mode & S_ISGID) {
-		inode->i_gid = dir->i_gid;
-		if (S_ISDIR(mode))
-			mode |= S_ISGID;
-	} else
-		inode->i_gid = current_fsgid();
-	inode->i_mode = mode;
+	inode_init_owner(inode, dir, mode);
 	inode->i_mtime = inode->i_atime = inode->i_ctime =
 			 ubifs_current_time(inode);
 	inode->i_mapping->nrpages = 0;
@@ -1120,7 +1113,7 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	if (release)
 		ubifs_release_budget(c, &ino_req);
 	if (IS_SYNC(old_inode))
-		err = old_inode->i_sb->s_op->write_inode(old_inode, 1);
+		err = old_inode->i_sb->s_op->write_inode(old_inode, NULL);
 	return err;
 
 out_cancel:
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 16a6444330ec..12f445cee9f7 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -52,6 +52,7 @@
 #include "ubifs.h"
 #include <linux/mount.h>
 #include <linux/namei.h>
+#include <linux/slab.h>
 
 static int read_block(struct inode *inode, void *addr, unsigned int block,
 		      struct ubifs_data_node *dn)
@@ -966,12 +967,15 @@ static int do_writepage(struct page *page, int len)
  * the page locked, and it locks @ui_mutex. However, write-back does take inode
  * @i_mutex, which means other VFS operations may be run on this inode at the
  * same time. And the problematic one is truncation to smaller size, from where
- * we have to call 'vmtruncate()', which first changes @inode->i_size, then
+ * we have to call 'simple_setsize()', which first changes @inode->i_size, then
  * drops the truncated pages. And while dropping the pages, it takes the page
- * lock. This means that 'do_truncation()' cannot call 'vmtruncate()' with
+ * lock. This means that 'do_truncation()' cannot call 'simple_setsize()' with
  * @ui_mutex locked, because it would deadlock with 'ubifs_writepage()'. This
  * means that @inode->i_size is changed while @ui_mutex is unlocked.
  *
+ * XXX: with the new truncate the above is not true anymore, the simple_setsize
+ * calls can be replaced with the individual components.
+ *
  * But in 'ubifs_writepage()' we have to guarantee that we do not write beyond
  * inode size. How do we do this if @inode->i_size may became smaller while we
  * are in the middle of 'ubifs_writepage()'? The UBIFS solution is the
@@ -1011,7 +1015,7 @@ static int ubifs_writepage(struct page *page, struct writeback_control *wbc)
 	/* Is the page fully inside @i_size? */
 	if (page->index < end_index) {
 		if (page->index >= synced_i_size >> PAGE_CACHE_SHIFT) {
-			err = inode->i_sb->s_op->write_inode(inode, 1);
+			err = inode->i_sb->s_op->write_inode(inode, NULL);
 			if (err)
 				goto out_unlock;
 			/*
@@ -1039,7 +1043,7 @@ static int ubifs_writepage(struct page *page, struct writeback_control *wbc)
 	kunmap_atomic(kaddr, KM_USER0);
 
 	if (i_size > synced_i_size) {
-		err = inode->i_sb->s_op->write_inode(inode, 1);
+		err = inode->i_sb->s_op->write_inode(inode, NULL);
 		if (err)
 			goto out_unlock;
 	}
@@ -1124,7 +1128,7 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode,
 		budgeted = 0;
 	}
 
-	err = vmtruncate(inode, new_size);
+	err = simple_setsize(inode, new_size);
 	if (err)
 		goto out_budg;
 
@@ -1213,7 +1217,7 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode,
 
 	if (attr->ia_valid & ATTR_SIZE) {
 		dbg_gen("size %lld -> %lld", inode->i_size, new_size);
-		err = vmtruncate(inode, new_size);
+		err = simple_setsize(inode, new_size);
 		if (err)
 			goto out;
 	}
@@ -1222,7 +1226,7 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode,
 	if (attr->ia_valid & ATTR_SIZE) {
 		/* Truncation changes inode [mc]time */
 		inode->i_mtime = inode->i_ctime = ubifs_current_time(inode);
-		/* 'vmtruncate()' changed @i_size, update @ui_size */
+		/* 'simple_setsize()' changed @i_size, update @ui_size */
 		ui->ui_size = inode->i_size;
 	}
 
@@ -1242,7 +1246,7 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode,
 	if (release)
 		ubifs_release_budget(c, &req);
 	if (IS_SYNC(inode))
-		err = inode->i_sb->s_op->write_inode(inode, 1);
+		err = inode->i_sb->s_op->write_inode(inode, NULL);
 	return err;
 
 out:
@@ -1303,9 +1307,9 @@ static void *ubifs_follow_link(struct dentry *dentry, struct nameidata *nd)
 	return NULL;
 }
 
-int ubifs_fsync(struct file *file, struct dentry *dentry, int datasync)
+int ubifs_fsync(struct file *file, int datasync)
 {
-	struct inode *inode = dentry->d_inode;
+	struct inode *inode = file->f_mapping->host;
 	struct ubifs_info *c = inode->i_sb->s_fs_info;
 	int err;
 
@@ -1316,7 +1320,7 @@ int ubifs_fsync(struct file *file, struct dentry *dentry, int datasync)
 	 * the inode unless this is a 'datasync()' call.
 	 */
 	if (!datasync || (inode->i_state & I_DIRTY_DATASYNC)) {
-		err = inode->i_sb->s_op->write_inode(inode, 1);
+		err = inode->i_sb->s_op->write_inode(inode, NULL);
 		if (err)
 			return err;
 	}
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index e5a3d8e96bb7..918d1582ca05 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -53,6 +53,7 @@
  * good, and GC takes extra care when moving them.
  */
 
+#include <linux/slab.h>
 #include <linux/pagemap.h>
 #include <linux/list_sort.h>
 #include "ubifs.h"
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index e589fedaf1ef..bcf5a16f30bb 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -51,6 +51,7 @@
  */
 
 #include <linux/crc32.h>
+#include <linux/slab.h>
 #include "ubifs.h"
 
 /**
@@ -63,6 +64,7 @@ void ubifs_ro_mode(struct ubifs_info *c, int err)
 	if (!c->ro_media) {
 		c->ro_media = 1;
 		c->no_chk_data_crc = 0;
+		c->vfs_sb->s_flags |= MS_RDONLY;
 		ubifs_warn("switched to read-only mode, error %d", err);
 		dbg_dump_stack();
 	}
diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c
index b2792e84d245..ad7f67b827ea 100644
--- a/fs/ubifs/lpt.c
+++ b/fs/ubifs/lpt.c
@@ -46,6 +46,7 @@
 #include "ubifs.h"
 #include <linux/crc16.h>
 #include <linux/math64.h>
+#include <linux/slab.h>
 
 /**
  * do_calc_lpt_geom - calculate sizes for the LPT area.
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index 8cbfb8248025..13cb7a4237bf 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -26,6 +26,7 @@
  */
 
 #include <linux/crc16.h>
+#include <linux/slab.h>
 #include "ubifs.h"
 
 /**
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index 868a55ee080f..109c6ea03bb5 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -31,6 +31,7 @@
  */
 
 #include <linux/crc32.h>
+#include <linux/slab.h>
 #include "ubifs.h"
 
 /**
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index 57085e43320f..96cb62c8a9dd 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -27,6 +27,7 @@
  */
 
 #include "ubifs.h"
+#include <linux/slab.h>
 #include <linux/random.h>
 #include <linux/math64.h>
 
diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c
index 02feb59cefca..0b201114a5ad 100644
--- a/fs/ubifs/shrinker.c
+++ b/fs/ubifs/shrinker.c
@@ -277,7 +277,7 @@ static int kick_a_thread(void)
 	return 0;
 }
 
-int ubifs_shrinker(int nr, gfp_t gfp_mask)
+int ubifs_shrinker(struct shrinker *shrink, int nr, gfp_t gfp_mask)
 {
 	int freed, contention = 0;
 	long clean_zn_cnt = atomic_long_read(&ubifs_clean_zn_cnt);
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 43f9d19a6f33..4d2f2157dd3f 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -283,7 +283,7 @@ static void ubifs_destroy_inode(struct inode *inode)
 /*
  * Note, Linux write-back code calls this without 'i_mutex'.
  */
-static int ubifs_write_inode(struct inode *inode, int wait)
+static int ubifs_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
 	int err = 0;
 	struct ubifs_info *c = inode->i_sb->s_fs_info;
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index e5b1a7d00fa0..2194915220e5 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -31,6 +31,7 @@
  */
 
 #include <linux/crc32.h>
+#include <linux/slab.h>
 #include "ubifs.h"
 
 /*
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index b2d976366a46..04310878f449 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -28,6 +28,7 @@
 #include <linux/fs.h>
 #include <linux/err.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/spinlock.h>
 #include <linux/mutex.h>
@@ -378,7 +379,7 @@ struct ubifs_gced_idx_leb {
  * The @ui_size is a "shadow" variable for @inode->i_size and UBIFS uses
  * @ui_size instead of @inode->i_size. The reason for this is that UBIFS cannot
  * make sure @inode->i_size is always changed under @ui_mutex, because it
- * cannot call 'vmtruncate()' with @ui_mutex locked, because it would deadlock
+ * cannot call 'simple_setsize()' with @ui_mutex locked, because it would deadlock
  * with 'ubifs_writepage()' (see file.c). All the other inode fields are
  * changed under @ui_mutex, so they do not need "shadow" fields. Note, one
  * could consider to rework locking and base it on "shadow" fields.
@@ -1574,7 +1575,7 @@ int ubifs_tnc_start_commit(struct ubifs_info *c, struct ubifs_zbranch *zroot);
 int ubifs_tnc_end_commit(struct ubifs_info *c);
 
 /* shrinker.c */
-int ubifs_shrinker(int nr_to_scan, gfp_t gfp_mask);
+int ubifs_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask);
 
 /* commit.c */
 int ubifs_bg_thread(void *info);
@@ -1677,7 +1678,7 @@ const struct ubifs_lprops *ubifs_fast_find_frdi_idx(struct ubifs_info *c);
 int ubifs_calc_dark(const struct ubifs_info *c, int spc);
 
 /* file.c */
-int ubifs_fsync(struct file *file, struct dentry *dentry, int datasync);
+int ubifs_fsync(struct file *file, int datasync);
 int ubifs_setattr(struct dentry *dentry, struct iattr *attr);
 
 /* dir.c */
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index 195830f47569..c74400f88fe0 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -56,6 +56,7 @@
  */
 
 #include "ubifs.h"
+#include <linux/slab.h>
 #include <linux/xattr.h>
 #include <linux/posix_acl_xattr.h>
 
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index 82372e332f08..b608efaa4cee 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -21,7 +21,6 @@
 
 #include "udfdecl.h"
 
-#include <linux/quotaops.h>
 #include <linux/buffer_head.h>
 #include <linux/bitops.h>
 
@@ -31,55 +30,8 @@
 #define udf_clear_bit(nr, addr) ext2_clear_bit(nr, addr)
 #define udf_set_bit(nr, addr) ext2_set_bit(nr, addr)
 #define udf_test_bit(nr, addr) ext2_test_bit(nr, addr)
-#define udf_find_first_one_bit(addr, size) find_first_one_bit(addr, size)
 #define udf_find_next_one_bit(addr, size, offset) \
-		find_next_one_bit(addr, size, offset)
-
-#define leBPL_to_cpup(x) leNUM_to_cpup(BITS_PER_LONG, x)
-#define leNUM_to_cpup(x, y) xleNUM_to_cpup(x, y)
-#define xleNUM_to_cpup(x, y) (le ## x ## _to_cpup(y))
-#define uintBPL_t uint(BITS_PER_LONG)
-#define uint(x) xuint(x)
-#define xuint(x) __le ## x
-
-static inline int find_next_one_bit(void *addr, int size, int offset)
-{
-	uintBPL_t *p = ((uintBPL_t *) addr) + (offset / BITS_PER_LONG);
-	int result = offset & ~(BITS_PER_LONG - 1);
-	unsigned long tmp;
-
-	if (offset >= size)
-		return size;
-	size -= result;
-	offset &= (BITS_PER_LONG - 1);
-	if (offset) {
-		tmp = leBPL_to_cpup(p++);
-		tmp &= ~0UL << offset;
-		if (size < BITS_PER_LONG)
-			goto found_first;
-		if (tmp)
-			goto found_middle;
-		size -= BITS_PER_LONG;
-		result += BITS_PER_LONG;
-	}
-	while (size & ~(BITS_PER_LONG - 1)) {
-		tmp = leBPL_to_cpup(p++);
-		if (tmp)
-			goto found_middle;
-		result += BITS_PER_LONG;
-		size -= BITS_PER_LONG;
-	}
-	if (!size)
-		return result;
-	tmp = leBPL_to_cpup(p);
-found_first:
-	tmp &= ~0UL >> (BITS_PER_LONG - size);
-found_middle:
-	return result + ffz(~tmp);
-}
-
-#define find_first_one_bit(addr, size)\
-	find_next_one_bit((addr), (size), 0)
+		ext2_find_next_bit(addr, size, offset)
 
 static int read_block_bitmap(struct super_block *sb,
 			     struct udf_bitmap *bitmap, unsigned int block,
@@ -172,9 +124,8 @@ static void udf_bitmap_free_blocks(struct super_block *sb,
 
 	mutex_lock(&sbi->s_alloc_mutex);
 	partmap = &sbi->s_partmaps[bloc->partitionReferenceNum];
-	if (bloc->logicalBlockNum < 0 ||
-	    (bloc->logicalBlockNum + count) >
-		partmap->s_partition_len) {
+	if (bloc->logicalBlockNum + count < count ||
+	    (bloc->logicalBlockNum + count) > partmap->s_partition_len) {
 		udf_debug("%d < %d || %d + %d > %d\n",
 			  bloc->logicalBlockNum, 0, bloc->logicalBlockNum,
 			  count, partmap->s_partition_len);
@@ -207,8 +158,6 @@ static void udf_bitmap_free_blocks(struct super_block *sb,
 				udf_debug("byte=%2x\n",
 					((char *)bh->b_data)[(bit + i) >> 3]);
 			} else {
-				if (inode)
-					vfs_dq_free_block(inode, 1);
 				udf_add_free_space(sb, sbi->s_partition, 1);
 			}
 		}
@@ -258,15 +207,8 @@ static int udf_bitmap_prealloc_blocks(struct super_block *sb,
 		bit = block % (sb->s_blocksize << 3);
 
 		while (bit < (sb->s_blocksize << 3) && block_count > 0) {
-			if (!udf_test_bit(bit, bh->b_data))
-				goto out;
-			else if (vfs_dq_prealloc_block(inode, 1))
-				goto out;
-			else if (!udf_clear_bit(bit, bh->b_data)) {
-				udf_debug("bit already cleared for block %d\n", bit);
-				vfs_dq_free_block(inode, 1);
+			if (!udf_clear_bit(bit, bh->b_data))
 				goto out;
-			}
 			block_count--;
 			alloc_count++;
 			bit++;
@@ -386,16 +328,6 @@ search_back:
 	}
 
 got_block:
-
-	/*
-	 * Check quota for allocation of this block.
-	 */
-	if (inode && vfs_dq_alloc_block(inode, 1)) {
-		mutex_unlock(&sbi->s_alloc_mutex);
-		*err = -EDQUOT;
-		return 0;
-	}
-
 	newblock = bit + (block_group << (sb->s_blocksize_bits + 3)) -
 		(sizeof(struct spaceBitmapDesc) << 3);
 
@@ -436,9 +368,8 @@ static void udf_table_free_blocks(struct super_block *sb,
 
 	mutex_lock(&sbi->s_alloc_mutex);
 	partmap = &sbi->s_partmaps[bloc->partitionReferenceNum];
-	if (bloc->logicalBlockNum < 0 ||
-	    (bloc->logicalBlockNum + count) >
-		partmap->s_partition_len) {
+	if (bloc->logicalBlockNum + count < count ||
+	    (bloc->logicalBlockNum + count) > partmap->s_partition_len) {
 		udf_debug("%d < %d || %d + %d > %d\n",
 			  bloc->logicalBlockNum, 0, bloc->logicalBlockNum, count,
 			  partmap->s_partition_len);
@@ -446,10 +377,6 @@ static void udf_table_free_blocks(struct super_block *sb,
 	}
 
 	iinfo = UDF_I(table);
-	/* We do this up front - There are some error conditions that
-	   could occure, but.. oh well */
-	if (inode)
-		vfs_dq_free_block(inode, count);
 	udf_add_free_space(sb, sbi->s_partition, count);
 
 	start = bloc->logicalBlockNum + offset;
@@ -547,7 +474,7 @@ static void udf_table_free_blocks(struct super_block *sb,
 		}
 
 		if (epos.offset + (2 * adsize) > sb->s_blocksize) {
-			char *sptr, *dptr;
+			unsigned char *sptr, *dptr;
 			int loffset;
 
 			brelse(oepos.bh);
@@ -694,10 +621,7 @@ static int udf_table_prealloc_blocks(struct super_block *sb,
 		epos.offset -= adsize;
 
 		alloc_count = (elen >> sb->s_blocksize_bits);
-		if (inode && vfs_dq_prealloc_block(inode,
-			alloc_count > block_count ? block_count : alloc_count))
-			alloc_count = 0;
-		else if (alloc_count > block_count) {
+		if (alloc_count > block_count) {
 			alloc_count = block_count;
 			eloc.logicalBlockNum += alloc_count;
 			elen -= (alloc_count << sb->s_blocksize_bits);
@@ -798,13 +722,6 @@ static int udf_table_new_block(struct super_block *sb,
 	goal_eloc.logicalBlockNum++;
 	goal_elen -= sb->s_blocksize;
 
-	if (inode && vfs_dq_alloc_block(inode, 1)) {
-		brelse(goal_epos.bh);
-		mutex_unlock(&sbi->s_alloc_mutex);
-		*err = -EDQUOT;
-		return 0;
-	}
-
 	if (goal_elen)
 		udf_write_aext(table, &goal_epos, &goal_eloc, goal_elen, 1);
 	else
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index 61d9a76a3a69..51552bf50225 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -45,8 +45,8 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
 	int block, iblock;
 	loff_t nf_pos = (filp->f_pos - 1) << 2;
 	int flen;
-	char *fname = NULL;
-	char *nameptr;
+	unsigned char *fname = NULL;
+	unsigned char *nameptr;
 	uint16_t liu;
 	uint8_t lfi;
 	loff_t size = udf_ext0_offset(dir) + dir->i_size;
@@ -207,8 +207,9 @@ static int udf_readdir(struct file *filp, void *dirent, filldir_t filldir)
 
 /* readdir and lookup functions */
 const struct file_operations udf_dir_operations = {
+	.llseek			= generic_file_llseek,
 	.read			= generic_read_dir,
 	.readdir		= udf_readdir,
-	.ioctl			= udf_ioctl,
-	.fsync			= simple_fsync,
+	.unlocked_ioctl		= udf_ioctl,
+	.fsync			= generic_file_fsync,
 };
diff --git a/fs/udf/file.c b/fs/udf/file.c
index f311d509b6a3..94e06d6bddbd 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -36,6 +36,7 @@
 #include <linux/pagemap.h>
 #include <linux/buffer_head.h>
 #include <linux/aio.h>
+#include <linux/smp_lock.h>
 
 #include "udf_i.h"
 #include "udf_sb.h"
@@ -143,50 +144,60 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 	return retval;
 }
 
-int udf_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
-	      unsigned long arg)
+long udf_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
+	struct inode *inode = filp->f_dentry->d_inode;
 	long old_block, new_block;
 	int result = -EINVAL;
 
+	lock_kernel();
+
 	if (file_permission(filp, MAY_READ) != 0) {
-		udf_debug("no permission to access inode %lu\n",
-			  inode->i_ino);
-		return -EPERM;
+		udf_debug("no permission to access inode %lu\n", inode->i_ino);
+		result = -EPERM;
+		goto out;
 	}
 
 	if (!arg) {
 		udf_debug("invalid argument to udf_ioctl\n");
-		return -EINVAL;
+		result = -EINVAL;
+		goto out;
 	}
 
 	switch (cmd) {
 	case UDF_GETVOLIDENT:
 		if (copy_to_user((char __user *)arg,
 				 UDF_SB(inode->i_sb)->s_volume_ident, 32))
-			return -EFAULT;
+			result = -EFAULT;
 		else
-			return 0;
+			result = 0;
+		goto out;
 	case UDF_RELOCATE_BLOCKS:
-		if (!capable(CAP_SYS_ADMIN))
-			return -EACCES;
-		if (get_user(old_block, (long __user *)arg))
-			return -EFAULT;
+		if (!capable(CAP_SYS_ADMIN)) {
+			result = -EACCES;
+			goto out;
+		}
+		if (get_user(old_block, (long __user *)arg)) {
+			result = -EFAULT;
+			goto out;
+		}
 		result = udf_relocate_blocks(inode->i_sb,
 						old_block, &new_block);
 		if (result == 0)
 			result = put_user(new_block, (long __user *)arg);
-		return result;
+		goto out;
 	case UDF_GETEASIZE:
 		result = put_user(UDF_I(inode)->i_lenEAttr, (int __user *)arg);
-		break;
+		goto out;
 	case UDF_GETEABLOCK:
 		result = copy_to_user((char __user *)arg,
 				      UDF_I(inode)->i_ext.i_data,
 				      UDF_I(inode)->i_lenEAttr) ? -EFAULT : 0;
-		break;
+		goto out;
 	}
 
+out:
+	unlock_kernel();
 	return result;
 }
 
@@ -206,17 +217,17 @@ static int udf_release_file(struct inode *inode, struct file *filp)
 const struct file_operations udf_file_operations = {
 	.read			= do_sync_read,
 	.aio_read		= generic_file_aio_read,
-	.ioctl			= udf_ioctl,
+	.unlocked_ioctl		= udf_ioctl,
 	.open			= generic_file_open,
 	.mmap			= generic_file_mmap,
 	.write			= do_sync_write,
 	.aio_write		= udf_file_aio_write,
 	.release		= udf_release_file,
-	.fsync			= simple_fsync,
+	.fsync			= generic_file_fsync,
 	.splice_read		= generic_file_splice_read,
 	.llseek			= generic_file_llseek,
 };
 
 const struct inode_operations udf_file_inode_operations = {
-	.truncate = udf_truncate,
+	.truncate		= udf_truncate,
 };
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index c10fa39f97e2..18cd7111185d 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -20,7 +20,6 @@
 
 #include "udfdecl.h"
 #include <linux/fs.h>
-#include <linux/quotaops.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
 
@@ -32,13 +31,6 @@ void udf_free_inode(struct inode *inode)
 	struct super_block *sb = inode->i_sb;
 	struct udf_sb_info *sbi = UDF_SB(sb);
 
-	/*
-	 * Note: we must free any quota before locking the superblock,
-	 * as writing the quota to disk may need the lock as well.
-	 */
-	vfs_dq_free_inode(inode);
-	vfs_dq_drop(inode);
-
 	clear_inode(inode);
 
 	mutex_lock(&sbi->s_alloc_mutex);
@@ -124,15 +116,8 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
 		udf_updated_lvid(sb);
 	}
 	mutex_unlock(&sbi->s_alloc_mutex);
-	inode->i_mode = mode;
-	inode->i_uid = current_fsuid();
-	if (dir->i_mode & S_ISGID) {
-		inode->i_gid = dir->i_gid;
-		if (S_ISDIR(mode))
-			mode |= S_ISGID;
-	} else {
-		inode->i_gid = current_fsgid();
-	}
+
+	inode_init_owner(inode, dir, mode);
 
 	iinfo->i_location.logicalBlockNum = block;
 	iinfo->i_location.partitionReferenceNum =
@@ -153,15 +138,6 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
 	insert_inode_hash(inode);
 	mark_inode_dirty(inode);
 
-	if (vfs_dq_alloc_inode(inode)) {
-		vfs_dq_drop(inode);
-		inode->i_flags |= S_NOQUOTA;
-		inode->i_nlink = 0;
-		iput(inode);
-		*err = -EDQUOT;
-		return NULL;
-	}
-
 	*err = 0;
 	return inode;
 }
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index f90231eb2916..124852bcf6fe 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -102,12 +102,13 @@ void udf_clear_inode(struct inode *inode)
 	if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB &&
 	    inode->i_size != iinfo->i_lenExtents) {
 		printk(KERN_WARNING "UDF-fs (%s): Inode %lu (mode %o) has "
-			"inode size %llu different from extent lenght %llu. "
+			"inode size %llu different from extent length %llu. "
 			"Filesystem need not be standards compliant.\n",
 			inode->i_sb->s_id, inode->i_ino, inode->i_mode,
 			(unsigned long long)inode->i_size,
 			(unsigned long long)iinfo->i_lenExtents);
 	}
+
 	kfree(iinfo->i_ext.i_data);
 	iinfo->i_ext.i_data = NULL;
 }
@@ -1308,7 +1309,7 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
 		break;
 	case ICBTAG_FILE_TYPE_SYMLINK:
 		inode->i_data.a_ops = &udf_symlink_aops;
-		inode->i_op = &page_symlink_inode_operations;
+		inode->i_op = &udf_symlink_inode_operations;
 		inode->i_mode = S_IFLNK | S_IRWXUGO;
 		break;
 	case ICBTAG_FILE_TYPE_MAIN:
@@ -1373,12 +1374,12 @@ static mode_t udf_convert_permissions(struct fileEntry *fe)
 	return mode;
 }
 
-int udf_write_inode(struct inode *inode, int sync)
+int udf_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
 	int ret;
 
 	lock_kernel();
-	ret = udf_update_inode(inode, sync);
+	ret = udf_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
 	unlock_kernel();
 
 	return ret;
@@ -1402,20 +1403,19 @@ static int udf_update_inode(struct inode *inode, int do_sync)
 	unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
 	struct udf_inode_info *iinfo = UDF_I(inode);
 
-	bh = udf_tread(inode->i_sb,
-			udf_get_lb_pblock(inode->i_sb,
-					  &iinfo->i_location, 0));
+	bh = udf_tgetblk(inode->i_sb,
+			udf_get_lb_pblock(inode->i_sb, &iinfo->i_location, 0));
 	if (!bh) {
-		udf_debug("bread failure\n");
-		return -EIO;
+		udf_debug("getblk failure\n");
+		return -ENOMEM;
 	}
 
-	memset(bh->b_data, 0x00, inode->i_sb->s_blocksize);
-
+	lock_buffer(bh);
+	memset(bh->b_data, 0, inode->i_sb->s_blocksize);
 	fe = (struct fileEntry *)bh->b_data;
 	efe = (struct extendedFileEntry *)bh->b_data;
 
-	if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_USE)) {
+	if (iinfo->i_use) {
 		struct unallocSpaceEntry *use =
 			(struct unallocSpaceEntry *)bh->b_data;
 
@@ -1423,20 +1423,18 @@ static int udf_update_inode(struct inode *inode, int do_sync)
 		memcpy(bh->b_data + sizeof(struct unallocSpaceEntry),
 		       iinfo->i_ext.i_data, inode->i_sb->s_blocksize -
 					sizeof(struct unallocSpaceEntry));
+		use->descTag.tagIdent = cpu_to_le16(TAG_IDENT_USE);
+		use->descTag.tagLocation =
+				cpu_to_le32(iinfo->i_location.logicalBlockNum);
 		crclen = sizeof(struct unallocSpaceEntry) +
 				iinfo->i_lenAlloc - sizeof(struct tag);
-		use->descTag.tagLocation = cpu_to_le32(
-						iinfo->i_location.
-							logicalBlockNum);
 		use->descTag.descCRCLength = cpu_to_le16(crclen);
 		use->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)use +
 							   sizeof(struct tag),
 							   crclen));
 		use->descTag.tagChecksum = udf_tag_checksum(&use->descTag);
 
-		mark_buffer_dirty(bh);
-		brelse(bh);
-		return err;
+		goto out;
 	}
 
 	if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_FORGET))
@@ -1591,18 +1589,21 @@ static int udf_update_inode(struct inode *inode, int do_sync)
 	fe->descTag.tagSerialNum = cpu_to_le16(sbi->s_serial_number);
 	fe->descTag.tagLocation = cpu_to_le32(
 					iinfo->i_location.logicalBlockNum);
-	crclen += iinfo->i_lenEAttr + iinfo->i_lenAlloc -
-								sizeof(struct tag);
+	crclen += iinfo->i_lenEAttr + iinfo->i_lenAlloc - sizeof(struct tag);
 	fe->descTag.descCRCLength = cpu_to_le16(crclen);
 	fe->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)fe + sizeof(struct tag),
 						  crclen));
 	fe->descTag.tagChecksum = udf_tag_checksum(&fe->descTag);
 
+out:
+	set_buffer_uptodate(bh);
+	unlock_buffer(bh);
+
 	/* write the data blocks */
 	mark_buffer_dirty(bh);
 	if (do_sync) {
 		sync_dirty_buffer(bh);
-		if (buffer_req(bh) && !buffer_uptodate(bh)) {
+		if (buffer_write_io_error(bh)) {
 			printk(KERN_WARNING "IO error syncing udf inode "
 				"[%s:%08lx]\n", inode->i_sb->s_id,
 				inode->i_ino);
@@ -1672,7 +1673,7 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
 		return -1;
 
 	if (epos->offset + (2 * adsize) > inode->i_sb->s_blocksize) {
-		char *sptr, *dptr;
+		unsigned char *sptr, *dptr;
 		struct buffer_head *nbh;
 		int err, loffset;
 		struct kernel_lb_addr obloc = epos->block;
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index cd2115060fdc..bf5fc674193c 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -27,15 +27,14 @@
 #include <linux/errno.h>
 #include <linux/mm.h>
 #include <linux/slab.h>
-#include <linux/quotaops.h>
 #include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include <linux/sched.h>
 #include <linux/crc-itu-t.h>
 #include <linux/exportfs.h>
 
-static inline int udf_match(int len1, const char *name1, int len2,
-			    const char *name2)
+static inline int udf_match(int len1, const unsigned char *name1, int len2,
+			    const unsigned char *name2)
 {
 	if (len1 != len2)
 		return 0;
@@ -142,15 +141,15 @@ int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi,
 }
 
 static struct fileIdentDesc *udf_find_entry(struct inode *dir,
-					    struct qstr *child,
+					    const struct qstr *child,
 					    struct udf_fileident_bh *fibh,
 					    struct fileIdentDesc *cfi)
 {
 	struct fileIdentDesc *fi = NULL;
 	loff_t f_pos;
 	int block, flen;
-	char *fname = NULL;
-	char *nameptr;
+	unsigned char *fname = NULL;
+	unsigned char *nameptr;
 	uint8_t lfi;
 	uint16_t liu;
 	loff_t size;
@@ -308,7 +307,7 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir,
 {
 	struct super_block *sb = dir->i_sb;
 	struct fileIdentDesc *fi = NULL;
-	char *name = NULL;
+	unsigned char *name = NULL;
 	int namelen;
 	loff_t f_pos;
 	loff_t size = udf_ext0_offset(dir) + dir->i_size;
@@ -577,7 +576,6 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode,
 		inode->i_data.a_ops = &udf_aops;
 	inode->i_op = &udf_file_inode_operations;
 	inode->i_fop = &udf_file_operations;
-	inode->i_mode = mode;
 	mark_inode_dirty(inode);
 
 	fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
@@ -623,7 +621,6 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode,
 		goto out;
 
 	iinfo = UDF_I(inode);
-	inode->i_uid = current_fsuid();
 	init_special_inode(inode, mode, rdev);
 	fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
 	if (!fi) {
@@ -668,7 +665,7 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 		goto out;
 
 	err = -EIO;
-	inode = udf_new_inode(dir, S_IFDIR, &err);
+	inode = udf_new_inode(dir, S_IFDIR | mode, &err);
 	if (!inode)
 		goto out;
 
@@ -691,9 +688,6 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 			FID_FILE_CHAR_DIRECTORY | FID_FILE_CHAR_PARENT;
 	udf_write_fi(inode, &cfi, fi, &fibh, NULL, NULL);
 	brelse(fibh.sbh);
-	inode->i_mode = S_IFDIR | mode;
-	if (dir->i_mode & S_ISGID)
-		inode->i_mode |= S_ISGID;
 	mark_inode_dirty(inode);
 
 	fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
@@ -885,22 +879,22 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
 {
 	struct inode *inode;
 	struct pathComponent *pc;
-	char *compstart;
+	const char *compstart;
 	struct udf_fileident_bh fibh;
 	struct extent_position epos = {};
 	int eoffset, elen = 0;
 	struct fileIdentDesc *fi;
 	struct fileIdentDesc cfi;
-	char *ea;
+	uint8_t *ea;
 	int err;
 	int block;
-	char *name = NULL;
+	unsigned char *name = NULL;
 	int namelen;
 	struct buffer_head *bh;
 	struct udf_inode_info *iinfo;
 
 	lock_kernel();
-	inode = udf_new_inode(dir, S_IFLNK, &err);
+	inode = udf_new_inode(dir, S_IFLNK | S_IRWXUGO, &err);
 	if (!inode)
 		goto out;
 
@@ -911,9 +905,8 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
 	}
 
 	iinfo = UDF_I(inode);
-	inode->i_mode = S_IFLNK | S_IRWXUGO;
 	inode->i_data.a_ops = &udf_symlink_aops;
-	inode->i_op = &page_symlink_inode_operations;
+	inode->i_op = &udf_symlink_inode_operations;
 
 	if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
 		struct kernel_lb_addr eloc;
@@ -970,7 +963,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
 
 		pc = (struct pathComponent *)(ea + elen);
 
-		compstart = (char *)symname;
+		compstart = symname;
 
 		do {
 			symname++;
@@ -1384,3 +1377,8 @@ const struct inode_operations udf_dir_inode_operations = {
 	.mknod				= udf_mknod,
 	.rename				= udf_rename,
 };
+const struct inode_operations udf_symlink_inode_operations = {
+	.readlink	= generic_readlink,
+	.follow_link	= page_follow_link_light,
+	.put_link	= page_put_link,
+};
diff --git a/fs/udf/partition.c b/fs/udf/partition.c
index 4b540ee632d5..745eb209be0c 100644
--- a/fs/udf/partition.c
+++ b/fs/udf/partition.c
@@ -24,7 +24,6 @@
 
 #include <linux/fs.h>
 #include <linux/string.h>
-#include <linux/slab.h>
 #include <linux/buffer_head.h>
 
 uint32_t udf_get_pblock(struct super_block *sb, uint32_t block,
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 1e4543cbcd27..612d1e2e285a 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -557,6 +557,7 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
 {
 	struct udf_options uopt;
 	struct udf_sb_info *sbi = UDF_SB(sb);
+	int error = 0;
 
 	uopt.flags = sbi->s_flags;
 	uopt.uid   = sbi->s_uid;
@@ -582,17 +583,17 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
 			*flags |= MS_RDONLY;
 	}
 
-	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
-		unlock_kernel();
-		return 0;
-	}
+	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
+		goto out_unlock;
+
 	if (*flags & MS_RDONLY)
 		udf_close_lvid(sb);
 	else
 		udf_open_lvid(sb);
 
+out_unlock:
 	unlock_kernel();
-	return 0;
+	return error;
 }
 
 /* Check Volume Structure Descriptors (ECMA 167 2/9.1) */
@@ -1939,7 +1940,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
 	/* Fill in the rest of the superblock */
 	sb->s_op = &udf_sb_ops;
 	sb->s_export_op = &udf_export_ops;
-	sb->dq_op = NULL;
+
 	sb->s_dirt = 0;
 	sb->s_magic = UDF_SUPER_MAGIC;
 	sb->s_time_gran = 1000;
diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c
index c3265e1385d4..16064787d2b7 100644
--- a/fs/udf/symlink.c
+++ b/fs/udf/symlink.c
@@ -26,18 +26,17 @@
 #include <linux/time.h>
 #include <linux/mm.h>
 #include <linux/stat.h>
-#include <linux/slab.h>
 #include <linux/pagemap.h>
 #include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include "udf_i.h"
 
-static void udf_pc_to_char(struct super_block *sb, char *from, int fromlen,
-			   char *to)
+static void udf_pc_to_char(struct super_block *sb, unsigned char *from,
+			   int fromlen, unsigned char *to)
 {
 	struct pathComponent *pc;
 	int elen = 0;
-	char *p = to;
+	unsigned char *p = to;
 
 	while (elen < fromlen) {
 		pc = (struct pathComponent *)(from + elen);
@@ -75,9 +74,9 @@ static int udf_symlink_filler(struct file *file, struct page *page)
 {
 	struct inode *inode = page->mapping->host;
 	struct buffer_head *bh = NULL;
-	char *symlink;
+	unsigned char *symlink;
 	int err = -EIO;
-	char *p = kmap(page);
+	unsigned char *p = kmap(page);
 	struct udf_inode_info *iinfo;
 
 	lock_kernel();
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index 8d46f4294ee7..2bac0354891f 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -76,6 +76,7 @@ extern const struct inode_operations udf_dir_inode_operations;
 extern const struct file_operations udf_dir_operations;
 extern const struct inode_operations udf_file_inode_operations;
 extern const struct file_operations udf_file_operations;
+extern const struct inode_operations udf_symlink_inode_operations;
 extern const struct address_space_operations udf_aops;
 extern const struct address_space_operations udf_adinicb_aops;
 extern const struct address_space_operations udf_symlink_aops;
@@ -129,9 +130,7 @@ extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *,
 			uint8_t *, uint8_t *);
 
 /* file.c */
-extern int udf_ioctl(struct inode *, struct file *, unsigned int,
-		     unsigned long);
-
+extern long udf_ioctl(struct file *, unsigned int, unsigned long);
 /* inode.c */
 extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *);
 extern int udf_sync_inode(struct inode *);
@@ -142,7 +141,7 @@ extern void udf_truncate(struct inode *);
 extern void udf_read_inode(struct inode *);
 extern void udf_delete_inode(struct inode *);
 extern void udf_clear_inode(struct inode *);
-extern int udf_write_inode(struct inode *, int);
+extern int udf_write_inode(struct inode *, struct writeback_control *wbc);
 extern long udf_block_map(struct inode *, sector_t);
 extern int udf_extend_file(struct inode *, struct extent_position *,
 			   struct kernel_long_ad *, sector_t);
diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c
index cefa8c8913e6..d03a90b6ad69 100644
--- a/fs/udf/unicode.c
+++ b/fs/udf/unicode.c
@@ -24,6 +24,7 @@
 #include <linux/string.h>	/* for memset */
 #include <linux/nls.h>
 #include <linux/crc-itu-t.h>
+#include <linux/slab.h>
 
 #include "udf_sb.h"
 
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index 54c16ec95dff..048484fb10d2 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -12,7 +12,6 @@
 #include <linux/stat.h>
 #include <linux/time.h>
 #include <linux/string.h>
-#include <linux/quotaops.h>
 #include <linux/buffer_head.h>
 #include <linux/capability.h>
 #include <linux/bitops.h>
@@ -85,9 +84,6 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count)
 				   "bit already cleared for fragment %u", i);
 	}
 	
-	vfs_dq_free_block(inode, count);
-
-	
 	fs32_add(sb, &ucg->cg_cs.cs_nffree, count);
 	uspi->cs_total.cs_nffree += count;
 	fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count);
@@ -195,7 +191,6 @@ do_more:
 		ubh_setblock(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno);
 		if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD)
 			ufs_clusteracct (sb, ucpi, blkno, 1);
-		vfs_dq_free_block(inode, uspi->s_fpb);
 
 		fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1);
 		uspi->cs_total.cs_nbfree++;
@@ -556,10 +551,6 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment,
 		fs32_add(sb, &ucg->cg_frsum[fragsize - count], 1);
 	for (i = oldcount; i < newcount; i++)
 		ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i);
-	if (vfs_dq_alloc_block(inode, count)) {
-		*err = -EDQUOT;
-		return 0;
-	}
 
 	fs32_sub(sb, &ucg->cg_cs.cs_nffree, count);
 	fs32_sub(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count);
@@ -664,7 +655,6 @@ cg_found:
 		for (i = count; i < uspi->s_fpb; i++)
 			ubh_setbit (UCPI_UBH(ucpi), ucpi->c_freeoff, goal + i);
 		i = uspi->s_fpb - count;
-		vfs_dq_free_block(inode, i);
 
 		fs32_add(sb, &ucg->cg_cs.cs_nffree, i);
 		uspi->cs_total.cs_nffree += i;
@@ -676,10 +666,6 @@ cg_found:
 	result = ufs_bitmap_search (sb, ucpi, goal, allocsize);
 	if (result == INVBLOCK)
 		return 0;
-	if (vfs_dq_alloc_block(inode, count)) {
-		*err = -EDQUOT;
-		return 0;
-	}
 	for (i = 0; i < count; i++)
 		ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, result + i);
 	
@@ -747,10 +733,6 @@ gotit:
 	ubh_clrblock (UCPI_UBH(ucpi), ucpi->c_freeoff, blkno);
 	if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD)
 		ufs_clusteracct (sb, ucpi, blkno, -1);
-	if (vfs_dq_alloc_block(inode, uspi->s_fpb)) {
-		*err = -EDQUOT;
-		return INVBLOCK;
-	}
 
 	fs32_sub(sb, &ucg->cg_cs.cs_nbfree, 1);
 	uspi->cs_total.cs_nbfree--;
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index 22af68f8b682..ec784756dc65 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -31,7 +31,7 @@
  * len <= UFS_MAXNAMLEN and de != NULL are guaranteed by caller.
  */
 static inline int ufs_match(struct super_block *sb, int len,
-		const char * const name, struct ufs_dir_entry * de)
+		const unsigned char *name, struct ufs_dir_entry *de)
 {
 	if (len != ufs_get_de_namlen(sb, de))
 		return 0;
@@ -70,7 +70,7 @@ static inline unsigned long ufs_dir_pages(struct inode *inode)
 	return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT;
 }
 
-ino_t ufs_inode_by_name(struct inode *dir, struct qstr *qstr)
+ino_t ufs_inode_by_name(struct inode *dir, const struct qstr *qstr)
 {
 	ino_t res = 0;
 	struct ufs_dir_entry *de;
@@ -249,11 +249,11 @@ struct ufs_dir_entry *ufs_dotdot(struct inode *dir, struct page **p)
  * (as a parameter - res_dir). Page is returned mapped and unlocked.
  * Entry is guaranteed to be valid.
  */
-struct ufs_dir_entry *ufs_find_entry(struct inode *dir, struct qstr *qstr,
+struct ufs_dir_entry *ufs_find_entry(struct inode *dir, const struct qstr *qstr,
 				     struct page **res_page)
 {
 	struct super_block *sb = dir->i_sb;
-	const char *name = qstr->name;
+	const unsigned char *name = qstr->name;
 	int namelen = qstr->len;
 	unsigned reclen = UFS_DIR_REC_LEN(namelen);
 	unsigned long start, n;
@@ -313,7 +313,7 @@ found:
 int ufs_add_link(struct dentry *dentry, struct inode *inode)
 {
 	struct inode *dir = dentry->d_parent->d_inode;
-	const char *name = dentry->d_name.name;
+	const unsigned char *name = dentry->d_name.name;
 	int namelen = dentry->d_name.len;
 	struct super_block *sb = dir->i_sb;
 	unsigned reclen = UFS_DIR_REC_LEN(namelen);
@@ -666,6 +666,6 @@ not_empty:
 const struct file_operations ufs_dir_operations = {
 	.read		= generic_read_dir,
 	.readdir	= ufs_readdir,
-	.fsync		= simple_fsync,
+	.fsync		= generic_file_fsync,
 	.llseek		= generic_file_llseek,
 };
diff --git a/fs/ufs/file.c b/fs/ufs/file.c
index 73655c61240a..33afa20d4509 100644
--- a/fs/ufs/file.c
+++ b/fs/ufs/file.c
@@ -41,6 +41,6 @@ const struct file_operations ufs_file_operations = {
 	.aio_write	= generic_file_aio_write,
 	.mmap		= generic_file_mmap,
 	.open           = generic_file_open,
-	.fsync		= simple_fsync,
+	.fsync		= generic_file_fsync,
 	.splice_read	= generic_file_splice_read,
 };
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index 3527c00fef0d..594480e537d2 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -27,7 +27,6 @@
 #include <linux/time.h>
 #include <linux/stat.h>
 #include <linux/string.h>
-#include <linux/quotaops.h>
 #include <linux/buffer_head.h>
 #include <linux/sched.h>
 #include <linux/bitops.h>
@@ -95,9 +94,6 @@ void ufs_free_inode (struct inode * inode)
 
 	is_directory = S_ISDIR(inode->i_mode);
 
-	vfs_dq_free_inode(inode);
-	vfs_dq_drop(inode);
-
 	clear_inode (inode);
 
 	if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_iusedoff, bit))
@@ -303,15 +299,7 @@ cg_found:
 	sb->s_dirt = 1;
 
 	inode->i_ino = cg * uspi->s_ipg + bit;
-	inode->i_mode = mode;
-	inode->i_uid = current_fsuid();
-	if (dir->i_mode & S_ISGID) {
-		inode->i_gid = dir->i_gid;
-		if (S_ISDIR(mode))
-			inode->i_mode |= S_ISGID;
-	} else
-		inode->i_gid = current_fsgid();
-
+	inode_init_owner(inode, dir, mode);
 	inode->i_blocks = 0;
 	inode->i_generation = 0;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
@@ -355,20 +343,12 @@ cg_found:
 
 	unlock_super (sb);
 
-	if (vfs_dq_alloc_inode(inode)) {
-		vfs_dq_drop(inode);
-		err = -EDQUOT;
-		goto fail_without_unlock;
-	}
-
 	UFSD("allocating inode %lu\n", inode->i_ino);
 	UFSD("EXIT\n");
 	return inode;
 
 fail_remove_inode:
 	unlock_super(sb);
-fail_without_unlock:
-	inode->i_flags |= S_NOQUOTA;
 	inode->i_nlink = 0;
 	iput(inode);
 	UFSD("EXIT (FAILED): err %d\n", err);
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 7cf33379fd46..73fe773aa034 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -36,6 +36,7 @@
 #include <linux/mm.h>
 #include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
+#include <linux/writeback.h>
 
 #include "ufs_fs.h"
 #include "ufs.h"
@@ -601,7 +602,7 @@ static void ufs_set_inode_ops(struct inode *inode)
 		if (!inode->i_blocks)
 			inode->i_op = &ufs_fast_symlink_inode_operations;
 		else {
-			inode->i_op = &page_symlink_inode_operations;
+			inode->i_op = &ufs_symlink_inode_operations;
 			inode->i_mapping->a_ops = &ufs_aops;
 		}
 	} else
@@ -890,11 +891,11 @@ static int ufs_update_inode(struct inode * inode, int do_sync)
 	return 0;
 }
 
-int ufs_write_inode (struct inode * inode, int wait)
+int ufs_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
 	int ret;
 	lock_kernel();
-	ret = ufs_update_inode (inode, wait);
+	ret = ufs_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
 	unlock_kernel();
 	return ret;
 }
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index 4c26d9e8bc94..b056f02b1fb3 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -84,6 +84,7 @@ static int ufs_create (struct inode * dir, struct dentry * dentry, int mode,
 	int err;
 
 	UFSD("BEGIN\n");
+
 	inode = ufs_new_inode(dir, mode);
 	err = PTR_ERR(inode);
 
@@ -107,6 +108,7 @@ static int ufs_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_t
 
 	if (!old_valid_dev(rdev))
 		return -EINVAL;
+
 	inode = ufs_new_inode(dir, mode);
 	err = PTR_ERR(inode);
 	if (!IS_ERR(inode)) {
@@ -139,7 +141,7 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry,
 
 	if (l > UFS_SB(sb)->s_uspi->s_maxsymlinklen) {
 		/* slow symlink */
-		inode->i_op = &page_symlink_inode_operations;
+		inode->i_op = &ufs_symlink_inode_operations;
 		inode->i_mapping->a_ops = &ufs_aops;
 		err = page_symlink(inode, symname, l);
 		if (err)
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 143c20bfb04b..3ec5a9eb6efb 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -77,7 +77,6 @@
 
 #include <linux/errno.h>
 #include <linux/fs.h>
-#include <linux/quotaops.h>
 #include <linux/slab.h>
 #include <linux/time.h>
 #include <linux/stat.h>
@@ -918,6 +917,7 @@ again:
 	sbi->s_bytesex = BYTESEX_LE;
 	switch ((uspi->fs_magic = fs32_to_cpu(sb, usb3->fs_magic))) {
 		case UFS_MAGIC:
+		case UFS_MAGIC_BW:
 		case UFS2_MAGIC:
 		case UFS_MAGIC_LFN:
 	        case UFS_MAGIC_FEA:
@@ -927,6 +927,7 @@ again:
 	sbi->s_bytesex = BYTESEX_BE;
 	switch ((uspi->fs_magic = fs32_to_cpu(sb, usb3->fs_magic))) {
 		case UFS_MAGIC:
+		case UFS_MAGIC_BW:
 		case UFS2_MAGIC:
 		case UFS_MAGIC_LFN:
 	        case UFS_MAGIC_FEA:
@@ -1016,6 +1017,9 @@ magic_found:
 		case UFS_FSSTABLE:
 			UFSD("fs is stable\n");
 			break;
+		case UFS_FSLOG:
+			UFSD("fs is logging fs\n");
+			break;
 		case UFS_FSOSF1:
 			UFSD("fs is DEC OSF/1\n");
 			break;
@@ -1042,7 +1046,7 @@ magic_found:
 	 */
 	sb->s_op = &ufs_super_ops;
 	sb->s_export_op = &ufs_export_ops;
-	sb->dq_op = NULL; /***/
+
 	sb->s_magic = fs32_to_cpu(sb, usb3->fs_magic);
 
 	uspi->s_sblkno = fs32_to_cpu(sb, usb1->fs_sblkno);
@@ -1432,11 +1436,6 @@ static void destroy_inodecache(void)
 	kmem_cache_destroy(ufs_inode_cachep);
 }
 
-#ifdef CONFIG_QUOTA
-static ssize_t ufs_quota_read(struct super_block *, int, char *,size_t, loff_t);
-static ssize_t ufs_quota_write(struct super_block *, int, const char *, size_t, loff_t);
-#endif
-
 static const struct super_operations ufs_super_ops = {
 	.alloc_inode	= ufs_alloc_inode,
 	.destroy_inode	= ufs_destroy_inode,
@@ -1448,104 +1447,8 @@ static const struct super_operations ufs_super_ops = {
 	.statfs		= ufs_statfs,
 	.remount_fs	= ufs_remount,
 	.show_options   = ufs_show_options,
-#ifdef CONFIG_QUOTA
-	.quota_read	= ufs_quota_read,
-	.quota_write	= ufs_quota_write,
-#endif
 };
 
-#ifdef CONFIG_QUOTA
-
-/* Read data from quotafile - avoid pagecache and such because we cannot afford
- * acquiring the locks... As quota files are never truncated and quota code
- * itself serializes the operations (and noone else should touch the files)
- * we don't have to be afraid of races */
-static ssize_t ufs_quota_read(struct super_block *sb, int type, char *data,
-			       size_t len, loff_t off)
-{
-	struct inode *inode = sb_dqopt(sb)->files[type];
-	sector_t blk = off >> sb->s_blocksize_bits;
-	int err = 0;
-	int offset = off & (sb->s_blocksize - 1);
-	int tocopy;
-	size_t toread;
-	struct buffer_head *bh;
-	loff_t i_size = i_size_read(inode);
-
-	if (off > i_size)
-		return 0;
-	if (off+len > i_size)
-		len = i_size-off;
-	toread = len;
-	while (toread > 0) {
-		tocopy = sb->s_blocksize - offset < toread ?
-				sb->s_blocksize - offset : toread;
-
-		bh = ufs_bread(inode, blk, 0, &err);
-		if (err)
-			return err;
-		if (!bh)	/* A hole? */
-			memset(data, 0, tocopy);
-		else {
-			memcpy(data, bh->b_data+offset, tocopy);
-			brelse(bh);
-		}
-		offset = 0;
-		toread -= tocopy;
-		data += tocopy;
-		blk++;
-	}
-	return len;
-}
-
-/* Write to quotafile */
-static ssize_t ufs_quota_write(struct super_block *sb, int type,
-				const char *data, size_t len, loff_t off)
-{
-	struct inode *inode = sb_dqopt(sb)->files[type];
-	sector_t blk = off >> sb->s_blocksize_bits;
-	int err = 0;
-	int offset = off & (sb->s_blocksize - 1);
-	int tocopy;
-	size_t towrite = len;
-	struct buffer_head *bh;
-
-	mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
-	while (towrite > 0) {
-		tocopy = sb->s_blocksize - offset < towrite ?
-				sb->s_blocksize - offset : towrite;
-
-		bh = ufs_bread(inode, blk, 1, &err);
-		if (!bh)
-			goto out;
-		lock_buffer(bh);
-		memcpy(bh->b_data+offset, data, tocopy);
-		flush_dcache_page(bh->b_page);
-		set_buffer_uptodate(bh);
-		mark_buffer_dirty(bh);
-		unlock_buffer(bh);
-		brelse(bh);
-		offset = 0;
-		towrite -= tocopy;
-		data += tocopy;
-		blk++;
-	}
-out:
-	if (len == towrite) {
-		mutex_unlock(&inode->i_mutex);
-		return err;
-	}
-	if (inode->i_size < off+len-towrite)
-		i_size_write(inode, off+len-towrite);
-	inode->i_version++;
-	inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
-	mark_inode_dirty(inode);
-	mutex_unlock(&inode->i_mutex);
-	return len - towrite;
-}
-
-#endif
-
 static int ufs_get_sb(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
diff --git a/fs/ufs/symlink.c b/fs/ufs/symlink.c
index c0156eda44bc..d283628b4778 100644
--- a/fs/ufs/symlink.c
+++ b/fs/ufs/symlink.c
@@ -42,4 +42,12 @@ static void *ufs_follow_link(struct dentry *dentry, struct nameidata *nd)
 const struct inode_operations ufs_fast_symlink_inode_operations = {
 	.readlink	= generic_readlink,
 	.follow_link	= ufs_follow_link,
+	.setattr	= ufs_setattr,
+};
+
+const struct inode_operations ufs_symlink_inode_operations = {
+	.readlink	= generic_readlink,
+	.follow_link	= page_follow_link_light,
+	.put_link	= page_put_link,
+	.setattr	= ufs_setattr,
 };
diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c
index 41dd431ce228..589e01a465ba 100644
--- a/fs/ufs/truncate.c
+++ b/fs/ufs/truncate.c
@@ -500,14 +500,12 @@ out:
 	return err;
 }
 
-
 /*
- * We don't define our `inode->i_op->truncate', and call it here,
- * because of:
- * - there is no way to know old size
- * - there is no way inform user about error, if it happens in `truncate'
+ * TODO:
+ *	- truncate case should use proper ordering instead of using
+ *	  simple_setsize
  */
-static int ufs_setattr(struct dentry *dentry, struct iattr *attr)
+int ufs_setattr(struct dentry *dentry, struct iattr *attr)
 {
 	struct inode *inode = dentry->d_inode;
 	unsigned int ia_valid = attr->ia_valid;
@@ -517,10 +515,10 @@ static int ufs_setattr(struct dentry *dentry, struct iattr *attr)
 	if (error)
 		return error;
 
-	if (ia_valid & ATTR_SIZE &&
-	    attr->ia_size != i_size_read(inode)) {
+	if (ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) {
 		loff_t old_i_size = inode->i_size;
-		error = vmtruncate(inode, attr->ia_size);
+
+		error = simple_setsize(inode, attr->ia_size);
 		if (error)
 			return error;
 		error = ufs_truncate(inode, old_i_size);
diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h
index 0b4c39bc0d9e..179ae6b3180a 100644
--- a/fs/ufs/ufs.h
+++ b/fs/ufs/ufs.h
@@ -86,9 +86,9 @@ extern void ufs_put_cylinder (struct super_block *, unsigned);
 /* dir.c */
 extern const struct inode_operations ufs_dir_inode_operations;
 extern int ufs_add_link (struct dentry *, struct inode *);
-extern ino_t ufs_inode_by_name(struct inode *, struct qstr *);
+extern ino_t ufs_inode_by_name(struct inode *, const struct qstr *);
 extern int ufs_make_empty(struct inode *, struct inode *);
-extern struct ufs_dir_entry *ufs_find_entry(struct inode *, struct qstr *, struct page **);
+extern struct ufs_dir_entry *ufs_find_entry(struct inode *, const struct qstr *, struct page **);
 extern int ufs_delete_entry(struct inode *, struct ufs_dir_entry *, struct page *);
 extern int ufs_empty_dir (struct inode *);
 extern struct ufs_dir_entry *ufs_dotdot(struct inode *, struct page **);
@@ -106,7 +106,7 @@ extern struct inode * ufs_new_inode (struct inode *, int);
 
 /* inode.c */
 extern struct inode *ufs_iget(struct super_block *, unsigned long);
-extern int ufs_write_inode (struct inode *, int);
+extern int ufs_write_inode (struct inode *, struct writeback_control *);
 extern int ufs_sync_inode (struct inode *);
 extern void ufs_delete_inode (struct inode *);
 extern struct buffer_head * ufs_bread (struct inode *, unsigned, int, int *);
@@ -122,9 +122,11 @@ extern void ufs_panic (struct super_block *, const char *, const char *, ...) __
 
 /* symlink.c */
 extern const struct inode_operations ufs_fast_symlink_inode_operations;
+extern const struct inode_operations ufs_symlink_inode_operations;
 
 /* truncate.c */
 extern int ufs_truncate (struct inode *, loff_t);
+extern int ufs_setattr(struct dentry *dentry, struct iattr *attr);
 
 static inline struct ufs_sb_info *UFS_SB(struct super_block *sb)
 {
diff --git a/fs/ufs/ufs_fs.h b/fs/ufs/ufs_fs.h
index 54bde1895a80..8aba544f9fad 100644
--- a/fs/ufs/ufs_fs.h
+++ b/fs/ufs/ufs_fs.h
@@ -48,6 +48,7 @@ typedef __u16 __bitwise __fs16;
 #define UFS_SECTOR_SIZE 512
 #define UFS_SECTOR_BITS 9
 #define UFS_MAGIC  0x00011954
+#define UFS_MAGIC_BW 0x0f242697
 #define UFS2_MAGIC 0x19540119
 #define UFS_CIGAM  0x54190100 /* byteswapped MAGIC */
 
@@ -138,6 +139,7 @@ typedef __u16 __bitwise __fs16;
 
 #define UFS_USEEFT  ((__u16)65535)
 
+/* fs_clean values */
 #define UFS_FSOK      0x7c269d38
 #define UFS_FSACTIVE  ((__s8)0x00)
 #define UFS_FSCLEAN   ((__s8)0x01)
@@ -145,6 +147,11 @@ typedef __u16 __bitwise __fs16;
 #define UFS_FSOSF1    ((__s8)0x03)	/* is this correct for DEC OSF/1? */
 #define UFS_FSBAD     ((__s8)0xff)
 
+/* Solaris-specific fs_clean values */
+#define UFS_FSSUSPEND ((__s8)0xfe)	/* temporarily suspended */
+#define UFS_FSLOG     ((__s8)0xfd)	/* logging fs */
+#define UFS_FSFIX     ((__s8)0xfc)	/* being repaired while mounted */
+
 /* From here to next blank line, s_flags for ufs_sb_info */
 /* directory entry encoding */
 #define UFS_DE_MASK		0x00000010	/* mask for the following */
@@ -227,11 +234,16 @@ typedef __u16 __bitwise __fs16;
  */
 #define ufs_cbtocylno(bno) \
 	((bno) * uspi->s_nspf / uspi->s_spc)
-#define ufs_cbtorpos(bno) \
+#define ufs_cbtorpos(bno)				      \
+	((UFS_SB(sb)->s_flags & UFS_CG_SUN) ?		      \
+	 (((((bno) * uspi->s_nspf % uspi->s_spc) %	      \
+	    uspi->s_nsect) *				      \
+	   uspi->s_nrpos) / uspi->s_nsect)		      \
+	 :						      \
 	((((bno) * uspi->s_nspf % uspi->s_spc / uspi->s_nsect \
 	* uspi->s_trackskew + (bno) * uspi->s_nspf % uspi->s_spc \
 	% uspi->s_nsect * uspi->s_interleave) % uspi->s_nsect \
-	* uspi->s_nrpos) / uspi->s_npsect)
+	  * uspi->s_nrpos) / uspi->s_npsect))
 
 /*
  * The following macros optimize certain frequently calculated
diff --git a/fs/xattr.c b/fs/xattr.c
index 46f87e828b48..01bb8135e14a 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -590,10 +590,10 @@ strcmp_prefix(const char *a, const char *a_prefix)
 /*
  * Find the xattr_handler with the matching prefix.
  */
-static struct xattr_handler *
-xattr_resolve_name(struct xattr_handler **handlers, const char **name)
+static const struct xattr_handler *
+xattr_resolve_name(const struct xattr_handler **handlers, const char **name)
 {
-	struct xattr_handler *handler;
+	const struct xattr_handler *handler;
 
 	if (!*name)
 		return NULL;
@@ -614,7 +614,7 @@ xattr_resolve_name(struct xattr_handler **handlers, const char **name)
 ssize_t
 generic_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t size)
 {
-	struct xattr_handler *handler;
+	const struct xattr_handler *handler;
 
 	handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
 	if (!handler)
@@ -629,7 +629,7 @@ generic_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t s
 ssize_t
 generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
 {
-	struct xattr_handler *handler, **handlers = dentry->d_sb->s_xattr;
+	const struct xattr_handler *handler, **handlers = dentry->d_sb->s_xattr;
 	unsigned int size = 0;
 
 	if (!buffer) {
@@ -659,7 +659,7 @@ generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
 int
 generic_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags)
 {
-	struct xattr_handler *handler;
+	const struct xattr_handler *handler;
 
 	if (size == 0)
 		value = "";  /* empty EA, do not remove */
@@ -676,7 +676,7 @@ generic_setxattr(struct dentry *dentry, const char *name, const void *value, siz
 int
 generic_removexattr(struct dentry *dentry, const char *name)
 {
-	struct xattr_handler *handler;
+	const struct xattr_handler *handler;
 
 	handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
 	if (!handler)
diff --git a/fs/xattr_acl.c b/fs/xattr_acl.c
index 05ac0fe9c4d3..8d5a506c82eb 100644
--- a/fs/xattr_acl.c
+++ b/fs/xattr_acl.c
@@ -6,9 +6,9 @@
  */
 
 #include <linux/module.h>
-#include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/posix_acl_xattr.h>
+#include <linux/gfp.h>
 
 
 /*
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 56641fe52a23..c8fb13f83b3f 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -16,7 +16,7 @@
 # Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 #
 
-EXTRA_CFLAGS +=	 -I$(src) -I$(src)/linux-2.6 -funsigned-char
+EXTRA_CFLAGS +=	 -I$(src) -I$(src)/linux-2.6
 
 XFS_LINUX := linux-2.6
 
@@ -77,6 +77,7 @@ xfs-y				+= xfs_alloc.o \
 				   xfs_itable.o \
 				   xfs_dfrag.o \
 				   xfs_log.o \
+				   xfs_log_cil.o \
 				   xfs_log_recover.o \
 				   xfs_mount.o \
 				   xfs_mru_cache.o \
@@ -105,7 +106,6 @@ xfs-y				+= $(addprefix $(XFS_LINUX)/, \
 				   xfs_globals.o \
 				   xfs_ioctl.o \
 				   xfs_iops.o \
-				   xfs_lrw.o \
 				   xfs_super.o \
 				   xfs_sync.o \
 				   xfs_xattr.o)
diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c
index 2d3f90afe5f1..666c9db48eb6 100644
--- a/fs/xfs/linux-2.6/kmem.c
+++ b/fs/xfs/linux-2.6/kmem.c
@@ -16,16 +16,33 @@
  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 #include <linux/mm.h>
-#include <linux/vmalloc.h>
 #include <linux/highmem.h>
+#include <linux/slab.h>
 #include <linux/swap.h>
 #include <linux/blkdev.h>
 #include <linux/backing-dev.h>
 #include "time.h"
 #include "kmem.h"
 
-#define MAX_VMALLOCS	6
-#define MAX_SLAB_SIZE	0x20000
+/*
+ * Greedy allocation.  May fail and may return vmalloced memory.
+ *
+ * Must be freed using kmem_free_large.
+ */
+void *
+kmem_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize)
+{
+	void		*ptr;
+	size_t		kmsize = maxsize;
+
+	while (!(ptr = kmem_zalloc_large(kmsize))) {
+		if ((kmsize >>= 1) <= minsize)
+			kmsize = minsize;
+	}
+	if (ptr)
+		*size = kmsize;
+	return ptr;
+}
 
 void *
 kmem_alloc(size_t size, unsigned int __nocast flags)
@@ -34,19 +51,8 @@ kmem_alloc(size_t size, unsigned int __nocast flags)
 	gfp_t	lflags = kmem_flags_convert(flags);
 	void	*ptr;
 
-#ifdef DEBUG
-	if (unlikely(!(flags & KM_LARGE) && (size > PAGE_SIZE))) {
-		printk(KERN_WARNING "Large %s attempt, size=%ld\n",
-			__func__, (long)size);
-		dump_stack();
-	}
-#endif
-
 	do {
-		if (size < MAX_SLAB_SIZE || retries > MAX_VMALLOCS)
-			ptr = kmalloc(size, lflags);
-		else
-			ptr = __vmalloc(size, lflags, PAGE_KERNEL);
+		ptr = kmalloc(size, lflags);
 		if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
 			return ptr;
 		if (!(++retries % 100))
@@ -68,27 +74,6 @@ kmem_zalloc(size_t size, unsigned int __nocast flags)
 	return ptr;
 }
 
-void *
-kmem_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize,
-		   unsigned int __nocast flags)
-{
-	void		*ptr;
-	size_t		kmsize = maxsize;
-	unsigned int	kmflags = (flags & ~KM_SLEEP) | KM_NOSLEEP;
-
-	while (!(ptr = kmem_zalloc(kmsize, kmflags))) {
-		if ((kmsize <= minsize) && (flags & KM_NOSLEEP))
-			break;
-		if ((kmsize >>= 1) <= minsize) {
-			kmsize = minsize;
-			kmflags = flags;
-		}
-	}
-	if (ptr)
-		*size = kmsize;
-	return ptr;
-}
-
 void
 kmem_free(const void *ptr)
 {
diff --git a/fs/xfs/linux-2.6/kmem.h b/fs/xfs/linux-2.6/kmem.h
index 179cbd630f69..f7c8f7a9ea6d 100644
--- a/fs/xfs/linux-2.6/kmem.h
+++ b/fs/xfs/linux-2.6/kmem.h
@@ -21,6 +21,7 @@
 #include <linux/slab.h>
 #include <linux/sched.h>
 #include <linux/mm.h>
+#include <linux/vmalloc.h>
 
 /*
  * General memory allocation interfaces
@@ -30,7 +31,6 @@
 #define KM_NOSLEEP	0x0002u
 #define KM_NOFS		0x0004u
 #define KM_MAYFAIL	0x0008u
-#define KM_LARGE	0x0010u
 
 /*
  * We use a special process flag to avoid recursive callbacks into
@@ -42,7 +42,7 @@ kmem_flags_convert(unsigned int __nocast flags)
 {
 	gfp_t	lflags;
 
-	BUG_ON(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL|KM_LARGE));
+	BUG_ON(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL));
 
 	if (flags & KM_NOSLEEP) {
 		lflags = GFP_ATOMIC | __GFP_NOWARN;
@@ -56,10 +56,25 @@ kmem_flags_convert(unsigned int __nocast flags)
 
 extern void *kmem_alloc(size_t, unsigned int __nocast);
 extern void *kmem_zalloc(size_t, unsigned int __nocast);
-extern void *kmem_zalloc_greedy(size_t *, size_t, size_t, unsigned int __nocast);
 extern void *kmem_realloc(const void *, size_t, size_t, unsigned int __nocast);
 extern void  kmem_free(const void *);
 
+static inline void *kmem_zalloc_large(size_t size)
+{
+	void *ptr;
+
+	ptr = vmalloc(size);
+	if (ptr)
+		memset(ptr, 0, size);
+	return ptr;
+}
+static inline void kmem_free_large(void *ptr)
+{
+	vfree(ptr);
+}
+
+extern void *kmem_zalloc_greedy(size_t *, size_t, size_t);
+
 /*
  * Zone interfaces
  */
diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c
index 883ca5ab8af5..9f769b5b38fc 100644
--- a/fs/xfs/linux-2.6/xfs_acl.c
+++ b/fs/xfs/linux-2.6/xfs_acl.c
@@ -22,6 +22,7 @@
 #include "xfs_inode.h"
 #include "xfs_vnodeops.h"
 #include "xfs_trace.h"
+#include <linux/slab.h>
 #include <linux/xattr.h>
 #include <linux/posix_acl_xattr.h>
 
@@ -106,7 +107,7 @@ xfs_get_acl(struct inode *inode, int type)
 	struct posix_acl *acl;
 	struct xfs_acl *xfs_acl;
 	int len = sizeof(struct xfs_acl);
-	char *ea_name;
+	unsigned char *ea_name;
 	int error;
 
 	acl = get_cached_acl(inode, type);
@@ -133,7 +134,8 @@ xfs_get_acl(struct inode *inode, int type)
 	if (!xfs_acl)
 		return ERR_PTR(-ENOMEM);
 
-	error = -xfs_attr_get(ip, ea_name, (char *)xfs_acl, &len, ATTR_ROOT);
+	error = -xfs_attr_get(ip, ea_name, (unsigned char *)xfs_acl,
+							&len, ATTR_ROOT);
 	if (error) {
 		/*
 		 * If the attribute doesn't exist make sure we have a negative
@@ -162,7 +164,7 @@ STATIC int
 xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
 {
 	struct xfs_inode *ip = XFS_I(inode);
-	char *ea_name;
+	unsigned char *ea_name;
 	int error;
 
 	if (S_ISLNK(inode->i_mode))
@@ -194,7 +196,7 @@ xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
 			(sizeof(struct xfs_acl_entry) *
 			 (XFS_ACL_MAX_ENTRIES - acl->a_count));
 
-		error = -xfs_attr_set(ip, ea_name, (char *)xfs_acl,
+		error = -xfs_attr_set(ip, ea_name, (unsigned char *)xfs_acl,
 				len, ATTR_ROOT);
 
 		kfree(xfs_acl);
@@ -262,7 +264,7 @@ xfs_set_mode(struct inode *inode, mode_t mode)
 }
 
 static int
-xfs_acl_exists(struct inode *inode, char *name)
+xfs_acl_exists(struct inode *inode, unsigned char *name)
 {
 	int len = sizeof(struct xfs_acl);
 
@@ -438,14 +440,14 @@ xfs_xattr_acl_set(struct dentry *dentry, const char *name,
 	return error;
 }
 
-struct xattr_handler xfs_xattr_acl_access_handler = {
+const struct xattr_handler xfs_xattr_acl_access_handler = {
 	.prefix	= POSIX_ACL_XATTR_ACCESS,
 	.flags	= ACL_TYPE_ACCESS,
 	.get	= xfs_xattr_acl_get,
 	.set	= xfs_xattr_acl_set,
 };
 
-struct xattr_handler xfs_xattr_acl_default_handler = {
+const struct xattr_handler xfs_xattr_acl_default_handler = {
 	.prefix	= POSIX_ACL_XATTR_DEFAULT,
 	.flags	= ACL_TYPE_DEFAULT,
 	.get	= xfs_xattr_acl_get,
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 66abe36c1213..34640d6dbdcb 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -39,10 +39,21 @@
 #include "xfs_iomap.h"
 #include "xfs_vnodeops.h"
 #include "xfs_trace.h"
+#include "xfs_bmap.h"
+#include <linux/gfp.h>
 #include <linux/mpage.h>
 #include <linux/pagevec.h>
 #include <linux/writeback.h>
 
+/*
+ * Types of I/O for bmap clustering and I/O completion tracking.
+ */
+enum {
+	IO_READ,	/* mapping for a read */
+	IO_DELAY,	/* mapping covers delalloc region */
+	IO_UNWRITTEN,	/* mapping covers allocated but uninitialized data */
+	IO_NEW		/* just allocated */
+};
 
 /*
  * Prime number of hash buckets since address is used as the key.
@@ -101,8 +112,9 @@ xfs_count_page_state(
 
 STATIC struct block_device *
 xfs_find_bdev_for_inode(
-	struct xfs_inode	*ip)
+	struct inode		*inode)
 {
+	struct xfs_inode	*ip = XFS_I(inode);
 	struct xfs_mount	*mp = ip->i_mount;
 
 	if (XFS_IS_REALTIME_INODE(ip))
@@ -163,14 +175,17 @@ xfs_ioend_new_eof(
 }
 
 /*
- * Update on-disk file size now that data has been written to disk.
- * The current in-memory file size is i_size.  If a write is beyond
- * eof i_new_size will be the intended file size until i_size is
- * updated.  If this write does not extend all the way to the valid
- * file size then restrict this update to the end of the write.
+ * Update on-disk file size now that data has been written to disk.  The
+ * current in-memory file size is i_size.  If a write is beyond eof i_new_size
+ * will be the intended file size until i_size is updated.  If this write does
+ * not extend all the way to the valid file size then restrict this update to
+ * the end of the write.
+ *
+ * This function does not block as blocking on the inode lock in IO completion
+ * can lead to IO completion order dependency deadlocks.. If it can't get the
+ * inode ilock it will return EAGAIN. Callers must handle this.
  */
-
-STATIC void
+STATIC int
 xfs_setfilesize(
 	xfs_ioend_t		*ioend)
 {
@@ -178,19 +193,43 @@ xfs_setfilesize(
 	xfs_fsize_t		isize;
 
 	ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
-	ASSERT(ioend->io_type != IOMAP_READ);
+	ASSERT(ioend->io_type != IO_READ);
 
 	if (unlikely(ioend->io_error))
-		return;
+		return 0;
+
+	if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
+		return EAGAIN;
 
-	xfs_ilock(ip, XFS_ILOCK_EXCL);
 	isize = xfs_ioend_new_eof(ioend);
 	if (isize) {
 		ip->i_d.di_size = isize;
-		xfs_mark_inode_dirty_sync(ip);
+		xfs_mark_inode_dirty(ip);
 	}
 
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	return 0;
+}
+
+/*
+ * Schedule IO completion handling on a xfsdatad if this was
+ * the final hold on this ioend. If we are asked to wait,
+ * flush the workqueue.
+ */
+STATIC void
+xfs_finish_ioend(
+	xfs_ioend_t	*ioend,
+	int		wait)
+{
+	if (atomic_dec_and_test(&ioend->io_remaining)) {
+		struct workqueue_struct *wq;
+
+		wq = (ioend->io_type == IO_UNWRITTEN) ?
+			xfsconvertd_workqueue : xfsdatad_workqueue;
+		queue_work(wq, &ioend->io_work);
+		if (wait)
+			flush_workqueue(wq);
+	}
 }
 
 /*
@@ -198,19 +237,18 @@ xfs_setfilesize(
  */
 STATIC void
 xfs_end_io(
-	struct work_struct	*work)
+	struct work_struct *work)
 {
-	xfs_ioend_t		*ioend =
-		container_of(work, xfs_ioend_t, io_work);
-	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
+	xfs_ioend_t	*ioend = container_of(work, xfs_ioend_t, io_work);
+	struct xfs_inode *ip = XFS_I(ioend->io_inode);
+	int		error = 0;
 
 	/*
 	 * For unwritten extents we need to issue transactions to convert a
 	 * range to normal written extens after the data I/O has finished.
 	 */
-	if (ioend->io_type == IOMAP_UNWRITTEN &&
+	if (ioend->io_type == IO_UNWRITTEN &&
 	    likely(!ioend->io_error && !XFS_FORCED_SHUTDOWN(ip->i_mount))) {
-		int error;
 
 		error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
 						 ioend->io_size);
@@ -222,30 +260,23 @@ xfs_end_io(
 	 * We might have to update the on-disk file size after extending
 	 * writes.
 	 */
-	if (ioend->io_type != IOMAP_READ)
-		xfs_setfilesize(ioend);
-	xfs_destroy_ioend(ioend);
-}
-
-/*
- * Schedule IO completion handling on a xfsdatad if this was
- * the final hold on this ioend. If we are asked to wait,
- * flush the workqueue.
- */
-STATIC void
-xfs_finish_ioend(
-	xfs_ioend_t	*ioend,
-	int		wait)
-{
-	if (atomic_dec_and_test(&ioend->io_remaining)) {
-		struct workqueue_struct *wq;
-
-		wq = (ioend->io_type == IOMAP_UNWRITTEN) ?
-			xfsconvertd_workqueue : xfsdatad_workqueue;
-		queue_work(wq, &ioend->io_work);
-		if (wait)
-			flush_workqueue(wq);
+	if (ioend->io_type != IO_READ) {
+		error = xfs_setfilesize(ioend);
+		ASSERT(!error || error == EAGAIN);
 	}
+
+	/*
+	 * If we didn't complete processing of the ioend, requeue it to the
+	 * tail of the workqueue for another attempt later. Otherwise destroy
+	 * it.
+	 */
+	if (error == EAGAIN) {
+		atomic_inc(&ioend->io_remaining);
+		xfs_finish_ioend(ioend, 0);
+		/* ensure we don't spin on blocked ioends */
+		delay(1);
+	} else
+		xfs_destroy_ioend(ioend);
 }
 
 /*
@@ -288,21 +319,25 @@ xfs_map_blocks(
 	struct inode		*inode,
 	loff_t			offset,
 	ssize_t			count,
-	xfs_iomap_t		*mapp,
+	struct xfs_bmbt_irec	*imap,
 	int			flags)
 {
 	int			nmaps = 1;
+	int			new = 0;
 
-	return -xfs_iomap(XFS_I(inode), offset, count, flags, mapp, &nmaps);
+	return -xfs_iomap(XFS_I(inode), offset, count, flags, imap, &nmaps, &new);
 }
 
 STATIC int
-xfs_iomap_valid(
-	xfs_iomap_t		*iomapp,
-	loff_t			offset)
+xfs_imap_valid(
+	struct inode		*inode,
+	struct xfs_bmbt_irec	*imap,
+	xfs_off_t		offset)
 {
-	return offset >= iomapp->iomap_offset &&
-		offset < iomapp->iomap_offset + iomapp->iomap_bsize;
+	offset >>= inode->i_blkbits;
+
+	return offset >= imap->br_startoff &&
+		offset < imap->br_startoff + imap->br_blockcount;
 }
 
 /*
@@ -341,7 +376,7 @@ xfs_submit_ioend_bio(
 	 * but don't update the inode size until I/O completion.
 	 */
 	if (xfs_ioend_new_eof(ioend))
-		xfs_mark_inode_dirty_sync(XFS_I(ioend->io_inode));
+		xfs_mark_inode_dirty(XFS_I(ioend->io_inode));
 
 	submit_bio(wbc->sync_mode == WB_SYNC_ALL ?
 		   WRITE_SYNC_PLUG : WRITE, bio);
@@ -533,19 +568,23 @@ xfs_add_to_ioend(
 
 STATIC void
 xfs_map_buffer(
+	struct inode		*inode,
 	struct buffer_head	*bh,
-	xfs_iomap_t		*mp,
-	xfs_off_t		offset,
-	uint			block_bits)
+	struct xfs_bmbt_irec	*imap,
+	xfs_off_t		offset)
 {
 	sector_t		bn;
+	struct xfs_mount	*m = XFS_I(inode)->i_mount;
+	xfs_off_t		iomap_offset = XFS_FSB_TO_B(m, imap->br_startoff);
+	xfs_daddr_t		iomap_bn = xfs_fsb_to_db(XFS_I(inode), imap->br_startblock);
 
-	ASSERT(mp->iomap_bn != IOMAP_DADDR_NULL);
+	ASSERT(imap->br_startblock != HOLESTARTBLOCK);
+	ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
 
-	bn = (mp->iomap_bn >> (block_bits - BBSHIFT)) +
-	      ((offset - mp->iomap_offset) >> block_bits);
+	bn = (iomap_bn >> (inode->i_blkbits - BBSHIFT)) +
+	      ((offset - iomap_offset) >> inode->i_blkbits);
 
-	ASSERT(bn || (mp->iomap_flags & IOMAP_REALTIME));
+	ASSERT(bn || XFS_IS_REALTIME_INODE(XFS_I(inode)));
 
 	bh->b_blocknr = bn;
 	set_buffer_mapped(bh);
@@ -553,17 +592,17 @@ xfs_map_buffer(
 
 STATIC void
 xfs_map_at_offset(
+	struct inode		*inode,
 	struct buffer_head	*bh,
-	loff_t			offset,
-	int			block_bits,
-	xfs_iomap_t		*iomapp)
+	struct xfs_bmbt_irec	*imap,
+	xfs_off_t		offset)
 {
-	ASSERT(!(iomapp->iomap_flags & IOMAP_HOLE));
-	ASSERT(!(iomapp->iomap_flags & IOMAP_DELAY));
+	ASSERT(imap->br_startblock != HOLESTARTBLOCK);
+	ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
 
 	lock_buffer(bh);
-	xfs_map_buffer(bh, iomapp, offset, block_bits);
-	bh->b_bdev = iomapp->iomap_target->bt_bdev;
+	xfs_map_buffer(inode, bh, imap, offset);
+	bh->b_bdev = xfs_find_bdev_for_inode(inode);
 	set_buffer_mapped(bh);
 	clear_buffer_delay(bh);
 	clear_buffer_unwritten(bh);
@@ -692,11 +731,11 @@ xfs_is_delayed_page(
 		bh = head = page_buffers(page);
 		do {
 			if (buffer_unwritten(bh))
-				acceptable = (type == IOMAP_UNWRITTEN);
+				acceptable = (type == IO_UNWRITTEN);
 			else if (buffer_delay(bh))
-				acceptable = (type == IOMAP_DELAY);
+				acceptable = (type == IO_DELAY);
 			else if (buffer_dirty(bh) && buffer_mapped(bh))
-				acceptable = (type == IOMAP_NEW);
+				acceptable = (type == IO_NEW);
 			else
 				break;
 		} while ((bh = bh->b_this_page) != head);
@@ -719,7 +758,7 @@ xfs_convert_page(
 	struct inode		*inode,
 	struct page		*page,
 	loff_t			tindex,
-	xfs_iomap_t		*mp,
+	struct xfs_bmbt_irec	*imap,
 	xfs_ioend_t		**ioendp,
 	struct writeback_control *wbc,
 	int			startio,
@@ -729,7 +768,6 @@ xfs_convert_page(
 	xfs_off_t		end_offset;
 	unsigned long		p_offset;
 	unsigned int		type;
-	int			bbits = inode->i_blkbits;
 	int			len, page_dirty;
 	int			count = 0, done = 0, uptodate = 1;
  	xfs_off_t		offset = page_offset(page);
@@ -781,19 +819,19 @@ xfs_convert_page(
 
 		if (buffer_unwritten(bh) || buffer_delay(bh)) {
 			if (buffer_unwritten(bh))
-				type = IOMAP_UNWRITTEN;
+				type = IO_UNWRITTEN;
 			else
-				type = IOMAP_DELAY;
+				type = IO_DELAY;
 
-			if (!xfs_iomap_valid(mp, offset)) {
+			if (!xfs_imap_valid(inode, imap, offset)) {
 				done = 1;
 				continue;
 			}
 
-			ASSERT(!(mp->iomap_flags & IOMAP_HOLE));
-			ASSERT(!(mp->iomap_flags & IOMAP_DELAY));
+			ASSERT(imap->br_startblock != HOLESTARTBLOCK);
+			ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
 
-			xfs_map_at_offset(bh, offset, bbits, mp);
+			xfs_map_at_offset(inode, bh, imap, offset);
 			if (startio) {
 				xfs_add_to_ioend(inode, bh, offset,
 						type, ioendp, done);
@@ -805,7 +843,7 @@ xfs_convert_page(
 			page_dirty--;
 			count++;
 		} else {
-			type = IOMAP_NEW;
+			type = IO_NEW;
 			if (buffer_mapped(bh) && all_bh && startio) {
 				lock_buffer(bh);
 				xfs_add_to_ioend(inode, bh, offset,
@@ -845,7 +883,7 @@ STATIC void
 xfs_cluster_write(
 	struct inode		*inode,
 	pgoff_t			tindex,
-	xfs_iomap_t		*iomapp,
+	struct xfs_bmbt_irec	*imap,
 	xfs_ioend_t		**ioendp,
 	struct writeback_control *wbc,
 	int			startio,
@@ -864,7 +902,7 @@ xfs_cluster_write(
 
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			done = xfs_convert_page(inode, pvec.pages[i], tindex++,
-					iomapp, ioendp, wbc, startio, all_bh);
+					imap, ioendp, wbc, startio, all_bh);
 			if (done)
 				break;
 		}
@@ -874,6 +912,125 @@ xfs_cluster_write(
 	}
 }
 
+STATIC void
+xfs_vm_invalidatepage(
+	struct page		*page,
+	unsigned long		offset)
+{
+	trace_xfs_invalidatepage(page->mapping->host, page, offset);
+	block_invalidatepage(page, offset);
+}
+
+/*
+ * If the page has delalloc buffers on it, we need to punch them out before we
+ * invalidate the page. If we don't, we leave a stale delalloc mapping on the
+ * inode that can trip a BUG() in xfs_get_blocks() later on if a direct IO read
+ * is done on that same region - the delalloc extent is returned when none is
+ * supposed to be there.
+ *
+ * We prevent this by truncating away the delalloc regions on the page before
+ * invalidating it. Because they are delalloc, we can do this without needing a
+ * transaction. Indeed - if we get ENOSPC errors, we have to be able to do this
+ * truncation without a transaction as there is no space left for block
+ * reservation (typically why we see a ENOSPC in writeback).
+ *
+ * This is not a performance critical path, so for now just do the punching a
+ * buffer head at a time.
+ */
+STATIC void
+xfs_aops_discard_page(
+	struct page		*page)
+{
+	struct inode		*inode = page->mapping->host;
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct buffer_head	*bh, *head;
+	loff_t			offset = page_offset(page);
+	ssize_t			len = 1 << inode->i_blkbits;
+
+	if (!xfs_is_delayed_page(page, IO_DELAY))
+		goto out_invalidate;
+
+	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+		goto out_invalidate;
+
+	xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
+		"page discard on page %p, inode 0x%llx, offset %llu.",
+			page, ip->i_ino, offset);
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	bh = head = page_buffers(page);
+	do {
+		int		done;
+		xfs_fileoff_t	offset_fsb;
+		xfs_bmbt_irec_t	imap;
+		int		nimaps = 1;
+		int		error;
+		xfs_fsblock_t	firstblock;
+		xfs_bmap_free_t flist;
+
+		if (!buffer_delay(bh))
+			goto next_buffer;
+
+		offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
+
+		/*
+		 * Map the range first and check that it is a delalloc extent
+		 * before trying to unmap the range. Otherwise we will be
+		 * trying to remove a real extent (which requires a
+		 * transaction) or a hole, which is probably a bad idea...
+		 */
+		error = xfs_bmapi(NULL, ip, offset_fsb, 1,
+				XFS_BMAPI_ENTIRE,  NULL, 0, &imap,
+				&nimaps, NULL, NULL);
+
+		if (error) {
+			/* something screwed, just bail */
+			if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+				xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
+				"page discard failed delalloc mapping lookup.");
+			}
+			break;
+		}
+		if (!nimaps) {
+			/* nothing there */
+			goto next_buffer;
+		}
+		if (imap.br_startblock != DELAYSTARTBLOCK) {
+			/* been converted, ignore */
+			goto next_buffer;
+		}
+		WARN_ON(imap.br_blockcount == 0);
+
+		/*
+		 * Note: while we initialise the firstblock/flist pair, they
+		 * should never be used because blocks should never be
+		 * allocated or freed for a delalloc extent and hence we need
+		 * don't cancel or finish them after the xfs_bunmapi() call.
+		 */
+		xfs_bmap_init(&flist, &firstblock);
+		error = xfs_bunmapi(NULL, ip, offset_fsb, 1, 0, 1, &firstblock,
+					&flist, NULL, &done);
+
+		ASSERT(!flist.xbf_count && !flist.xbf_first);
+		if (error) {
+			/* something screwed, just bail */
+			if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+				xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
+			"page discard unable to remove delalloc mapping.");
+			}
+			break;
+		}
+next_buffer:
+		offset += len;
+
+	} while ((bh = bh->b_this_page) != head);
+
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+out_invalidate:
+	xfs_vm_invalidatepage(page, 0);
+	return;
+}
+
 /*
  * Calling this without startio set means we are being asked to make a dirty
  * page ready for freeing it's buffers.  When called with startio set then
@@ -902,15 +1059,15 @@ xfs_page_state_convert(
 	int		unmapped) /* also implies page uptodate */
 {
 	struct buffer_head	*bh, *head;
-	xfs_iomap_t		iomap;
+	struct xfs_bmbt_irec	imap;
 	xfs_ioend_t		*ioend = NULL, *iohead = NULL;
 	loff_t			offset;
 	unsigned long           p_offset = 0;
 	unsigned int		type;
 	__uint64_t              end_offset;
-	pgoff_t                 end_index, last_index, tlast;
+	pgoff_t                 end_index, last_index;
 	ssize_t			size, len;
-	int			flags, err, iomap_valid = 0, uptodate = 1;
+	int			flags, err, imap_valid = 0, uptodate = 1;
 	int			page_dirty, count = 0;
 	int			trylock = 0;
 	int			all_bh = unmapped;
@@ -957,7 +1114,7 @@ xfs_page_state_convert(
 	bh = head = page_buffers(page);
 	offset = page_offset(page);
 	flags = BMAPI_READ;
-	type = IOMAP_NEW;
+	type = IO_NEW;
 
 	/* TODO: cleanup count and page_dirty */
 
@@ -971,12 +1128,12 @@ xfs_page_state_convert(
 			 * the iomap is actually still valid, but the ioend
 			 * isn't.  shouldn't happen too often.
 			 */
-			iomap_valid = 0;
+			imap_valid = 0;
 			continue;
 		}
 
-		if (iomap_valid)
-			iomap_valid = xfs_iomap_valid(&iomap, offset);
+		if (imap_valid)
+			imap_valid = xfs_imap_valid(inode, &imap, offset);
 
 		/*
 		 * First case, map an unwritten extent and prepare for
@@ -997,20 +1154,20 @@ xfs_page_state_convert(
 			 * Make sure we don't use a read-only iomap
 			 */
 			if (flags == BMAPI_READ)
-				iomap_valid = 0;
+				imap_valid = 0;
 
 			if (buffer_unwritten(bh)) {
-				type = IOMAP_UNWRITTEN;
+				type = IO_UNWRITTEN;
 				flags = BMAPI_WRITE | BMAPI_IGNSTATE;
 			} else if (buffer_delay(bh)) {
-				type = IOMAP_DELAY;
+				type = IO_DELAY;
 				flags = BMAPI_ALLOCATE | trylock;
 			} else {
-				type = IOMAP_NEW;
+				type = IO_NEW;
 				flags = BMAPI_WRITE | BMAPI_MMAP;
 			}
 
-			if (!iomap_valid) {
+			if (!imap_valid) {
 				/*
 				 * if we didn't have a valid mapping then we
 				 * need to ensure that we put the new mapping
@@ -1020,7 +1177,7 @@ xfs_page_state_convert(
 				 * for unwritten extent conversion.
 				 */
 				new_ioend = 1;
-				if (type == IOMAP_NEW) {
+				if (type == IO_NEW) {
 					size = xfs_probe_cluster(inode,
 							page, bh, head, 0);
 				} else {
@@ -1028,14 +1185,14 @@ xfs_page_state_convert(
 				}
 
 				err = xfs_map_blocks(inode, offset, size,
-						&iomap, flags);
+						&imap, flags);
 				if (err)
 					goto error;
-				iomap_valid = xfs_iomap_valid(&iomap, offset);
+				imap_valid = xfs_imap_valid(inode, &imap,
+							    offset);
 			}
-			if (iomap_valid) {
-				xfs_map_at_offset(bh, offset,
-						inode->i_blkbits, &iomap);
+			if (imap_valid) {
+				xfs_map_at_offset(inode, bh, &imap, offset);
 				if (startio) {
 					xfs_add_to_ioend(inode, bh, offset,
 							type, &ioend,
@@ -1054,40 +1211,41 @@ xfs_page_state_convert(
 			 * That means it must already have extents allocated
 			 * underneath it. Map the extent by reading it.
 			 */
-			if (!iomap_valid || flags != BMAPI_READ) {
+			if (!imap_valid || flags != BMAPI_READ) {
 				flags = BMAPI_READ;
 				size = xfs_probe_cluster(inode, page, bh,
 								head, 1);
 				err = xfs_map_blocks(inode, offset, size,
-						&iomap, flags);
+						&imap, flags);
 				if (err)
 					goto error;
-				iomap_valid = xfs_iomap_valid(&iomap, offset);
+				imap_valid = xfs_imap_valid(inode, &imap,
+							    offset);
 			}
 
 			/*
-			 * We set the type to IOMAP_NEW in case we are doing a
+			 * We set the type to IO_NEW in case we are doing a
 			 * small write at EOF that is extending the file but
 			 * without needing an allocation. We need to update the
 			 * file size on I/O completion in this case so it is
 			 * the same case as having just allocated a new extent
 			 * that we are writing into for the first time.
 			 */
-			type = IOMAP_NEW;
+			type = IO_NEW;
 			if (trylock_buffer(bh)) {
 				ASSERT(buffer_mapped(bh));
-				if (iomap_valid)
+				if (imap_valid)
 					all_bh = 1;
 				xfs_add_to_ioend(inode, bh, offset, type,
-						&ioend, !iomap_valid);
+						&ioend, !imap_valid);
 				page_dirty--;
 				count++;
 			} else {
-				iomap_valid = 0;
+				imap_valid = 0;
 			}
 		} else if ((buffer_uptodate(bh) || PageUptodate(page)) &&
 			   (unmapped || startio)) {
-			iomap_valid = 0;
+			imap_valid = 0;
 		}
 
 		if (!iohead)
@@ -1101,12 +1259,23 @@ xfs_page_state_convert(
 	if (startio)
 		xfs_start_page_writeback(page, 1, count);
 
-	if (ioend && iomap_valid) {
-		offset = (iomap.iomap_offset + iomap.iomap_bsize - 1) >>
-					PAGE_CACHE_SHIFT;
-		tlast = min_t(pgoff_t, offset, last_index);
-		xfs_cluster_write(inode, page->index + 1, &iomap, &ioend,
-					wbc, startio, all_bh, tlast);
+	if (ioend && imap_valid) {
+		xfs_off_t		end_index;
+
+		end_index = imap.br_startoff + imap.br_blockcount;
+
+		/* to bytes */
+		end_index <<= inode->i_blkbits;
+
+		/* to pages */
+		end_index = (end_index - 1) >> PAGE_CACHE_SHIFT;
+
+		/* check against file size */
+		if (end_index > last_index)
+			end_index = last_index;
+
+		xfs_cluster_write(inode, page->index + 1, &imap, &ioend,
+					wbc, startio, all_bh, end_index);
 	}
 
 	if (iohead)
@@ -1125,7 +1294,7 @@ error:
 	 */
 	if (err != -EAGAIN) {
 		if (!unmapped)
-			block_invalidatepage(page, 0);
+			xfs_aops_discard_page(page);
 		ClearPageUptodate(page);
 	}
 	return err;
@@ -1164,6 +1333,21 @@ xfs_vm_writepage(
 	trace_xfs_writepage(inode, page, 0);
 
 	/*
+	 * Refuse to write the page out if we are called from reclaim context.
+	 *
+	 * This is primarily to avoid stack overflows when called from deep
+	 * used stacks in random callers for direct reclaim, but disabling
+	 * reclaim for kswap is a nice side-effect as kswapd causes rather
+	 * suboptimal I/O patters, too.
+	 *
+	 * This should really be done by the core VM, but until that happens
+	 * filesystems like XFS, btrfs and ext4 have to take care of this
+	 * by themselves.
+	 */
+	if (current->flags & PF_MEMALLOC)
+		goto out_fail;
+
+	/*
 	 * We need a transaction if:
 	 *  1. There are delalloc buffers on the page
 	 *  2. The page is uptodate and we have unmapped buffers
@@ -1197,14 +1381,6 @@ xfs_vm_writepage(
 	if (!page_has_buffers(page))
 		create_empty_buffers(page, 1 << inode->i_blkbits, 0);
 
-
-	/*
-	 *  VM calculation for nr_to_write seems off.  Bump it way
-	 *  up, this gets simple streaming writes zippy again.
-	 *  To be reviewed again after Jens' writeback changes.
-	 */
-	wbc->nr_to_write *= 4;
-
 	/*
 	 * Convert delayed allocate, unwritten or unmapped space
 	 * to real space and flush out to disk.
@@ -1308,10 +1484,11 @@ __xfs_get_blocks(
 	int			direct,
 	bmapi_flags_t		flags)
 {
-	xfs_iomap_t		iomap;
+	struct xfs_bmbt_irec	imap;
 	xfs_off_t		offset;
 	ssize_t			size;
-	int			niomap = 1;
+	int			nimap = 1;
+	int			new = 0;
 	int			error;
 
 	offset = (xfs_off_t)iblock << inode->i_blkbits;
@@ -1322,22 +1499,21 @@ __xfs_get_blocks(
 		return 0;
 
 	error = xfs_iomap(XFS_I(inode), offset, size,
-			     create ? flags : BMAPI_READ, &iomap, &niomap);
+			     create ? flags : BMAPI_READ, &imap, &nimap, &new);
 	if (error)
 		return -error;
-	if (niomap == 0)
+	if (nimap == 0)
 		return 0;
 
-	if (iomap.iomap_bn != IOMAP_DADDR_NULL) {
+	if (imap.br_startblock != HOLESTARTBLOCK &&
+	    imap.br_startblock != DELAYSTARTBLOCK) {
 		/*
 		 * For unwritten extents do not report a disk address on
 		 * the read case (treat as if we're reading into a hole).
 		 */
-		if (create || !(iomap.iomap_flags & IOMAP_UNWRITTEN)) {
-			xfs_map_buffer(bh_result, &iomap, offset,
-				       inode->i_blkbits);
-		}
-		if (create && (iomap.iomap_flags & IOMAP_UNWRITTEN)) {
+		if (create || !ISUNWRITTEN(&imap))
+			xfs_map_buffer(inode, bh_result, &imap, offset);
+		if (create && ISUNWRITTEN(&imap)) {
 			if (direct)
 				bh_result->b_private = inode;
 			set_buffer_unwritten(bh_result);
@@ -1348,7 +1524,7 @@ __xfs_get_blocks(
 	 * If this is a realtime file, data may be on a different device.
 	 * to that pointed to from the buffer_head b_bdev currently.
 	 */
-	bh_result->b_bdev = iomap.iomap_target->bt_bdev;
+	bh_result->b_bdev = xfs_find_bdev_for_inode(inode);
 
 	/*
 	 * If we previously allocated a block out beyond eof and we are now
@@ -1362,10 +1538,10 @@ __xfs_get_blocks(
 	if (create &&
 	    ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) ||
 	     (offset >= i_size_read(inode)) ||
-	     (iomap.iomap_flags & (IOMAP_NEW|IOMAP_UNWRITTEN))))
+	     (new || ISUNWRITTEN(&imap))))
 		set_buffer_new(bh_result);
 
-	if (iomap.iomap_flags & IOMAP_DELAY) {
+	if (imap.br_startblock == DELAYSTARTBLOCK) {
 		BUG_ON(direct);
 		if (create) {
 			set_buffer_uptodate(bh_result);
@@ -1374,11 +1550,23 @@ __xfs_get_blocks(
 		}
 	}
 
+	/*
+	 * If this is O_DIRECT or the mpage code calling tell them how large
+	 * the mapping is, so that we can avoid repeated get_blocks calls.
+	 */
 	if (direct || size > (1 << inode->i_blkbits)) {
-		ASSERT(iomap.iomap_bsize - iomap.iomap_delta > 0);
-		offset = min_t(xfs_off_t,
-				iomap.iomap_bsize - iomap.iomap_delta, size);
-		bh_result->b_size = (ssize_t)min_t(xfs_off_t, LONG_MAX, offset);
+		xfs_off_t		mapping_size;
+
+		mapping_size = imap.br_startoff + imap.br_blockcount - iblock;
+		mapping_size <<= inode->i_blkbits;
+
+		ASSERT(mapping_size > 0);
+		if (mapping_size > size)
+			mapping_size = size;
+		if (mapping_size > LONG_MAX)
+			mapping_size = LONG_MAX;
+
+		bh_result->b_size = mapping_size;
 	}
 
 	return 0;
@@ -1436,7 +1624,7 @@ xfs_end_io_direct(
 	 */
 	ioend->io_offset = offset;
 	ioend->io_size = size;
-	if (ioend->io_type == IOMAP_READ) {
+	if (ioend->io_type == IO_READ) {
 		xfs_finish_ioend(ioend, 0);
 	} else if (private && size > 0) {
 		xfs_finish_ioend(ioend, is_sync_kiocb(iocb));
@@ -1447,7 +1635,7 @@ xfs_end_io_direct(
 		 * didn't map an unwritten extent so switch it's completion
 		 * handler.
 		 */
-		ioend->io_type = IOMAP_NEW;
+		ioend->io_type = IO_NEW;
 		xfs_finish_ioend(ioend, 0);
 	}
 
@@ -1472,10 +1660,10 @@ xfs_vm_direct_IO(
 	struct block_device *bdev;
 	ssize_t		ret;
 
-	bdev = xfs_find_bdev_for_inode(XFS_I(inode));
+	bdev = xfs_find_bdev_for_inode(inode);
 
 	iocb->private = xfs_alloc_ioend(inode, rw == WRITE ?
-					IOMAP_UNWRITTEN : IOMAP_READ);
+					IO_UNWRITTEN : IO_READ);
 
 	ret = blockdev_direct_IO_no_locking(rw, iocb, inode, bdev, iov,
 					    offset, nr_segs,
@@ -1535,15 +1723,6 @@ xfs_vm_readpages(
 	return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks);
 }
 
-STATIC void
-xfs_vm_invalidatepage(
-	struct page		*page,
-	unsigned long		offset)
-{
-	trace_xfs_invalidatepage(page->mapping->host, page, offset);
-	block_invalidatepage(page, offset);
-}
-
 const struct address_space_operations xfs_address_space_operations = {
 	.readpage		= xfs_vm_readpage,
 	.readpages		= xfs_vm_readpages,
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 77b8be81c769..2ee3f7a60163 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -18,7 +18,7 @@
 #include "xfs.h"
 #include <linux/stddef.h>
 #include <linux/errno.h>
-#include <linux/slab.h>
+#include <linux/gfp.h>
 #include <linux/pagemap.h>
 #include <linux/init.h>
 #include <linux/vmalloc.h>
@@ -33,9 +33,11 @@
 #include <linux/migrate.h>
 #include <linux/backing-dev.h>
 #include <linux/freezer.h>
+#include <linux/list_sort.h>
 
 #include "xfs_sb.h"
 #include "xfs_inum.h"
+#include "xfs_log.h"
 #include "xfs_ag.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
@@ -43,7 +45,7 @@
 
 static kmem_zone_t *xfs_buf_zone;
 STATIC int xfsbufd(void *);
-STATIC int xfsbufd_wakeup(int, gfp_t);
+STATIC int xfsbufd_wakeup(struct shrinker *, int, gfp_t);
 STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int);
 static struct shrinker xfs_buf_shake = {
 	.shrink = xfsbufd_wakeup,
@@ -76,6 +78,27 @@ struct workqueue_struct *xfsconvertd_workqueue;
 #define xfs_buf_deallocate(bp) \
 	kmem_zone_free(xfs_buf_zone, (bp));
 
+static inline int
+xfs_buf_is_vmapped(
+	struct xfs_buf	*bp)
+{
+	/*
+	 * Return true if the buffer is vmapped.
+	 *
+	 * The XBF_MAPPED flag is set if the buffer should be mapped, but the
+	 * code is clever enough to know it doesn't have to map a single page,
+	 * so the check has to be both for XBF_MAPPED and bp->b_page_count > 1.
+	 */
+	return (bp->b_flags & XBF_MAPPED) && bp->b_page_count > 1;
+}
+
+static inline int
+xfs_buf_vmap_len(
+	struct xfs_buf	*bp)
+{
+	return (bp->b_page_count * PAGE_SIZE) - bp->b_offset;
+}
+
 /*
  *	Page Region interfaces.
  *
@@ -146,75 +169,6 @@ test_page_region(
 }
 
 /*
- *	Mapping of multi-page buffers into contiguous virtual space
- */
-
-typedef struct a_list {
-	void		*vm_addr;
-	struct a_list	*next;
-} a_list_t;
-
-static a_list_t		*as_free_head;
-static int		as_list_len;
-static DEFINE_SPINLOCK(as_lock);
-
-/*
- *	Try to batch vunmaps because they are costly.
- */
-STATIC void
-free_address(
-	void		*addr)
-{
-	a_list_t	*aentry;
-
-#ifdef CONFIG_XEN
-	/*
-	 * Xen needs to be able to make sure it can get an exclusive
-	 * RO mapping of pages it wants to turn into a pagetable.  If
-	 * a newly allocated page is also still being vmap()ed by xfs,
-	 * it will cause pagetable construction to fail.  This is a
-	 * quick workaround to always eagerly unmap pages so that Xen
-	 * is happy.
-	 */
-	vunmap(addr);
-	return;
-#endif
-
-	aentry = kmalloc(sizeof(a_list_t), GFP_NOWAIT);
-	if (likely(aentry)) {
-		spin_lock(&as_lock);
-		aentry->next = as_free_head;
-		aentry->vm_addr = addr;
-		as_free_head = aentry;
-		as_list_len++;
-		spin_unlock(&as_lock);
-	} else {
-		vunmap(addr);
-	}
-}
-
-STATIC void
-purge_addresses(void)
-{
-	a_list_t	*aentry, *old;
-
-	if (as_free_head == NULL)
-		return;
-
-	spin_lock(&as_lock);
-	aentry = as_free_head;
-	as_free_head = NULL;
-	as_list_len = 0;
-	spin_unlock(&as_lock);
-
-	while ((old = aentry) != NULL) {
-		vunmap(aentry->vm_addr);
-		aentry = aentry->next;
-		kfree(old);
-	}
-}
-
-/*
  *	Internal xfs_buf_t object manipulation
  */
 
@@ -314,8 +268,9 @@ xfs_buf_free(
 	if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) {
 		uint		i;
 
-		if ((bp->b_flags & XBF_MAPPED) && (bp->b_page_count > 1))
-			free_address(bp->b_addr - bp->b_offset);
+		if (xfs_buf_is_vmapped(bp))
+			vm_unmap_ram(bp->b_addr - bp->b_offset,
+					bp->b_page_count);
 
 		for (i = 0; i < bp->b_page_count; i++) {
 			struct page	*page = bp->b_pages[i];
@@ -385,7 +340,7 @@ _xfs_buf_lookup_pages(
 					__func__, gfp_mask);
 
 			XFS_STATS_INC(xb_page_retries);
-			xfsbufd_wakeup(0, gfp_mask);
+			xfsbufd_wakeup(NULL, 0, gfp_mask);
 			congestion_wait(BLK_RW_ASYNC, HZ/50);
 			goto retry;
 		}
@@ -435,10 +390,8 @@ _xfs_buf_map_pages(
 		bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
 		bp->b_flags |= XBF_MAPPED;
 	} else if (flags & XBF_MAPPED) {
-		if (as_list_len > 64)
-			purge_addresses();
-		bp->b_addr = vmap(bp->b_pages, bp->b_page_count,
-					VM_MAP, PAGE_KERNEL);
+		bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
+					-1, PAGE_KERNEL);
 		if (unlikely(bp->b_addr == NULL))
 			return -ENOMEM;
 		bp->b_addr += bp->b_offset;
@@ -898,6 +851,12 @@ xfs_buf_lock_value(
  *	Note that this in no way locks the underlying pages, so it is only
  *	useful for synchronizing concurrent use of buffer objects, not for
  *	synchronizing independent access to the underlying pages.
+ *
+ *	If we come across a stale, pinned, locked buffer, we know that we
+ *	are being asked to lock a buffer that has been reallocated. Because
+ *	it is pinned, we know that the log has not been pushed to disk and
+ *	hence it will still be locked. Rather than sleeping until someone
+ *	else pushes the log, push it ourselves before trying to get the lock.
  */
 void
 xfs_buf_lock(
@@ -905,6 +864,8 @@ xfs_buf_lock(
 {
 	trace_xfs_buf_lock(bp, _RET_IP_);
 
+	if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
+		xfs_log_force(bp->b_mount, 0);
 	if (atomic_read(&bp->b_io_remaining))
 		blk_run_address_space(bp->b_target->bt_mapping);
 	down(&bp->b_sema);
@@ -1051,22 +1012,25 @@ xfs_buf_ioerror(
 }
 
 int
-xfs_bawrite(
-	void			*mp,
+xfs_bwrite(
+	struct xfs_mount	*mp,
 	struct xfs_buf		*bp)
 {
-	trace_xfs_buf_bawrite(bp, _RET_IP_);
+	int			error;
 
-	ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL);
+	bp->b_strat = xfs_bdstrat_cb;
+	bp->b_mount = mp;
+	bp->b_flags |= XBF_WRITE;
+	bp->b_flags &= ~(XBF_ASYNC | XBF_READ);
 
 	xfs_buf_delwri_dequeue(bp);
+	xfs_buf_iostrategy(bp);
 
-	bp->b_flags &= ~(XBF_READ | XBF_DELWRI | XBF_READ_AHEAD);
-	bp->b_flags |= (XBF_WRITE | XBF_ASYNC | _XBF_RUN_QUEUES);
-
-	bp->b_mount = mp;
-	bp->b_strat = xfs_bdstrat_cb;
-	return xfs_bdstrat_cb(bp);
+	error = xfs_buf_iowait(bp);
+	if (error)
+		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+	xfs_buf_relse(bp);
+	return error;
 }
 
 void
@@ -1085,6 +1049,126 @@ xfs_bdwrite(
 	xfs_buf_delwri_queue(bp, 1);
 }
 
+/*
+ * Called when we want to stop a buffer from getting written or read.
+ * We attach the EIO error, muck with its flags, and call biodone
+ * so that the proper iodone callbacks get called.
+ */
+STATIC int
+xfs_bioerror(
+	xfs_buf_t *bp)
+{
+#ifdef XFSERRORDEBUG
+	ASSERT(XFS_BUF_ISREAD(bp) || bp->b_iodone);
+#endif
+
+	/*
+	 * No need to wait until the buffer is unpinned, we aren't flushing it.
+	 */
+	XFS_BUF_ERROR(bp, EIO);
+
+	/*
+	 * We're calling biodone, so delete XBF_DONE flag.
+	 */
+	XFS_BUF_UNREAD(bp);
+	XFS_BUF_UNDELAYWRITE(bp);
+	XFS_BUF_UNDONE(bp);
+	XFS_BUF_STALE(bp);
+
+	XFS_BUF_CLR_BDSTRAT_FUNC(bp);
+	xfs_biodone(bp);
+
+	return EIO;
+}
+
+/*
+ * Same as xfs_bioerror, except that we are releasing the buffer
+ * here ourselves, and avoiding the biodone call.
+ * This is meant for userdata errors; metadata bufs come with
+ * iodone functions attached, so that we can track down errors.
+ */
+STATIC int
+xfs_bioerror_relse(
+	struct xfs_buf	*bp)
+{
+	int64_t		fl = XFS_BUF_BFLAGS(bp);
+	/*
+	 * No need to wait until the buffer is unpinned.
+	 * We aren't flushing it.
+	 *
+	 * chunkhold expects B_DONE to be set, whether
+	 * we actually finish the I/O or not. We don't want to
+	 * change that interface.
+	 */
+	XFS_BUF_UNREAD(bp);
+	XFS_BUF_UNDELAYWRITE(bp);
+	XFS_BUF_DONE(bp);
+	XFS_BUF_STALE(bp);
+	XFS_BUF_CLR_IODONE_FUNC(bp);
+	XFS_BUF_CLR_BDSTRAT_FUNC(bp);
+	if (!(fl & XBF_ASYNC)) {
+		/*
+		 * Mark b_error and B_ERROR _both_.
+		 * Lot's of chunkcache code assumes that.
+		 * There's no reason to mark error for
+		 * ASYNC buffers.
+		 */
+		XFS_BUF_ERROR(bp, EIO);
+		XFS_BUF_FINISH_IOWAIT(bp);
+	} else {
+		xfs_buf_relse(bp);
+	}
+
+	return EIO;
+}
+
+
+/*
+ * All xfs metadata buffers except log state machine buffers
+ * get this attached as their b_bdstrat callback function.
+ * This is so that we can catch a buffer
+ * after prematurely unpinning it to forcibly shutdown the filesystem.
+ */
+int
+xfs_bdstrat_cb(
+	struct xfs_buf	*bp)
+{
+	if (XFS_FORCED_SHUTDOWN(bp->b_mount)) {
+		trace_xfs_bdstrat_shut(bp, _RET_IP_);
+		/*
+		 * Metadata write that didn't get logged but
+		 * written delayed anyway. These aren't associated
+		 * with a transaction, and can be ignored.
+		 */
+		if (!bp->b_iodone && !XFS_BUF_ISREAD(bp))
+			return xfs_bioerror_relse(bp);
+		else
+			return xfs_bioerror(bp);
+	}
+
+	xfs_buf_iorequest(bp);
+	return 0;
+}
+
+/*
+ * Wrapper around bdstrat so that we can stop data from going to disk in case
+ * we are shutting down the filesystem.  Typically user data goes thru this
+ * path; one of the exceptions is the superblock.
+ */
+void
+xfsbdstrat(
+	struct xfs_mount	*mp,
+	struct xfs_buf		*bp)
+{
+	if (XFS_FORCED_SHUTDOWN(mp)) {
+		trace_xfs_bdstrat_shut(bp, _RET_IP_);
+		xfs_bioerror_relse(bp);
+		return;
+	}
+
+	xfs_buf_iorequest(bp);
+}
+
 STATIC void
 _xfs_buf_ioend(
 	xfs_buf_t		*bp,
@@ -1107,6 +1191,9 @@ xfs_buf_bio_end_io(
 
 	xfs_buf_ioerror(bp, -error);
 
+	if (!error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
+		invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
+
 	do {
 		struct page	*page = bvec->bv_page;
 
@@ -1216,6 +1303,10 @@ next_chunk:
 
 submit_io:
 	if (likely(bio->bi_size)) {
+		if (xfs_buf_is_vmapped(bp)) {
+			flush_kernel_vmap_range(bp->b_addr,
+						xfs_buf_vmap_len(bp));
+		}
 		submit_bio(rw, bio);
 		if (size)
 			goto next_chunk;
@@ -1296,7 +1387,7 @@ xfs_buf_iomove(
 	xfs_buf_t		*bp,	/* buffer to process		*/
 	size_t			boff,	/* starting buffer offset	*/
 	size_t			bsize,	/* length to copy		*/
-	caddr_t			data,	/* data address			*/
+	void			*data,	/* data address			*/
 	xfs_buf_rw_t		mode)	/* read/write/zero flag		*/
 {
 	size_t			bend, cpoff, csize;
@@ -1378,8 +1469,8 @@ xfs_alloc_bufhash(
 
 	btp->bt_hashshift = external ? 3 : 8;	/* 8 or 256 buckets */
 	btp->bt_hashmask = (1 << btp->bt_hashshift) - 1;
-	btp->bt_hash = kmem_zalloc((1 << btp->bt_hashshift) *
-					sizeof(xfs_bufhash_t), KM_SLEEP | KM_LARGE);
+	btp->bt_hash = kmem_zalloc_large((1 << btp->bt_hashshift) *
+					 sizeof(xfs_bufhash_t));
 	for (i = 0; i < (1 << btp->bt_hashshift); i++) {
 		spin_lock_init(&btp->bt_hash[i].bh_lock);
 		INIT_LIST_HEAD(&btp->bt_hash[i].bh_list);
@@ -1390,7 +1481,7 @@ STATIC void
 xfs_free_bufhash(
 	xfs_buftarg_t		*btp)
 {
-	kmem_free(btp->bt_hash);
+	kmem_free_large(btp->bt_hash);
 	btp->bt_hash = NULL;
 }
 
@@ -1527,7 +1618,8 @@ xfs_mapping_buftarg(
 
 STATIC int
 xfs_alloc_delwrite_queue(
-	xfs_buftarg_t		*btp)
+	xfs_buftarg_t		*btp,
+	const char		*fsname)
 {
 	int	error = 0;
 
@@ -1535,7 +1627,7 @@ xfs_alloc_delwrite_queue(
 	INIT_LIST_HEAD(&btp->bt_delwrite_queue);
 	spin_lock_init(&btp->bt_delwrite_lock);
 	btp->bt_flags = 0;
-	btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd");
+	btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname);
 	if (IS_ERR(btp->bt_task)) {
 		error = PTR_ERR(btp->bt_task);
 		goto out_error;
@@ -1548,7 +1640,8 @@ out_error:
 xfs_buftarg_t *
 xfs_alloc_buftarg(
 	struct block_device	*bdev,
-	int			external)
+	int			external,
+	const char		*fsname)
 {
 	xfs_buftarg_t		*btp;
 
@@ -1560,7 +1653,7 @@ xfs_alloc_buftarg(
 		goto error;
 	if (xfs_mapping_buftarg(btp, bdev))
 		goto error;
-	if (xfs_alloc_delwrite_queue(btp))
+	if (xfs_alloc_delwrite_queue(btp, fsname))
 		goto error;
 	xfs_alloc_bufhash(btp, external);
 	return btp;
@@ -1595,6 +1688,11 @@ xfs_buf_delwri_queue(
 		list_del(&bp->b_list);
 	}
 
+	if (list_empty(dwq)) {
+		/* start xfsbufd as it is about to have something to do */
+		wake_up_process(bp->b_target->bt_task);
+	}
+
 	bp->b_flags |= _XBF_DELWRI_Q;
 	list_add_tail(&bp->b_list, dwq);
 	bp->b_queuetime = jiffies;
@@ -1626,6 +1724,35 @@ xfs_buf_delwri_dequeue(
 	trace_xfs_buf_delwri_dequeue(bp, _RET_IP_);
 }
 
+/*
+ * If a delwri buffer needs to be pushed before it has aged out, then promote
+ * it to the head of the delwri queue so that it will be flushed on the next
+ * xfsbufd run. We do this by resetting the queuetime of the buffer to be older
+ * than the age currently needed to flush the buffer. Hence the next time the
+ * xfsbufd sees it is guaranteed to be considered old enough to flush.
+ */
+void
+xfs_buf_delwri_promote(
+	struct xfs_buf	*bp)
+{
+	struct xfs_buftarg *btp = bp->b_target;
+	long		age = xfs_buf_age_centisecs * msecs_to_jiffies(10) + 1;
+
+	ASSERT(bp->b_flags & XBF_DELWRI);
+	ASSERT(bp->b_flags & _XBF_DELWRI_Q);
+
+	/*
+	 * Check the buffer age before locking the delayed write queue as we
+	 * don't need to promote buffers that are already past the flush age.
+	 */
+	if (bp->b_queuetime < jiffies - age)
+		return;
+	bp->b_queuetime = jiffies - age;
+	spin_lock(&btp->bt_delwrite_lock);
+	list_move(&bp->b_list, &btp->bt_delwrite_queue);
+	spin_unlock(&btp->bt_delwrite_lock);
+}
+
 STATIC void
 xfs_buf_runall_queues(
 	struct workqueue_struct	*queue)
@@ -1635,6 +1762,7 @@ xfs_buf_runall_queues(
 
 STATIC int
 xfsbufd_wakeup(
+	struct shrinker		*shrink,
 	int			priority,
 	gfp_t			mask)
 {
@@ -1644,6 +1772,8 @@ xfsbufd_wakeup(
 	list_for_each_entry(btp, &xfs_buftarg_list, bt_list) {
 		if (test_bit(XBT_FORCE_SLEEP, &btp->bt_flags))
 			continue;
+		if (list_empty(&btp->bt_delwrite_queue))
+			continue;
 		set_bit(XBT_FORCE_FLUSH, &btp->bt_flags);
 		wake_up_process(btp->bt_task);
 	}
@@ -1694,20 +1824,53 @@ xfs_buf_delwri_split(
 
 }
 
+/*
+ * Compare function is more complex than it needs to be because
+ * the return value is only 32 bits and we are doing comparisons
+ * on 64 bit values
+ */
+static int
+xfs_buf_cmp(
+	void		*priv,
+	struct list_head *a,
+	struct list_head *b)
+{
+	struct xfs_buf	*ap = container_of(a, struct xfs_buf, b_list);
+	struct xfs_buf	*bp = container_of(b, struct xfs_buf, b_list);
+	xfs_daddr_t		diff;
+
+	diff = ap->b_bn - bp->b_bn;
+	if (diff < 0)
+		return -1;
+	if (diff > 0)
+		return 1;
+	return 0;
+}
+
+void
+xfs_buf_delwri_sort(
+	xfs_buftarg_t	*target,
+	struct list_head *list)
+{
+	list_sort(NULL, list, xfs_buf_cmp);
+}
+
 STATIC int
 xfsbufd(
 	void		*data)
 {
-	struct list_head tmp;
-	xfs_buftarg_t	*target = (xfs_buftarg_t *)data;
-	int		count;
-	xfs_buf_t	*bp;
+	xfs_buftarg_t   *target = (xfs_buftarg_t *)data;
 
 	current->flags |= PF_MEMALLOC;
 
 	set_freezable();
 
 	do {
+		long	age = xfs_buf_age_centisecs * msecs_to_jiffies(10);
+		long	tout = xfs_buf_timer_centisecs * msecs_to_jiffies(10);
+		int	count = 0;
+		struct list_head tmp;
+
 		if (unlikely(freezing(current))) {
 			set_bit(XBT_FORCE_SLEEP, &target->bt_flags);
 			refrigerator();
@@ -1715,24 +1878,20 @@ xfsbufd(
 			clear_bit(XBT_FORCE_SLEEP, &target->bt_flags);
 		}
 
-		schedule_timeout_interruptible(
-			xfs_buf_timer_centisecs * msecs_to_jiffies(10));
+		/* sleep for a long time if there is nothing to do. */
+		if (list_empty(&target->bt_delwrite_queue))
+			tout = MAX_SCHEDULE_TIMEOUT;
+		schedule_timeout_interruptible(tout);
 
-		xfs_buf_delwri_split(target, &tmp,
-				xfs_buf_age_centisecs * msecs_to_jiffies(10));
-
-		count = 0;
+		xfs_buf_delwri_split(target, &tmp, age);
+		list_sort(NULL, &tmp, xfs_buf_cmp);
 		while (!list_empty(&tmp)) {
-			bp = list_entry(tmp.next, xfs_buf_t, b_list);
-			ASSERT(target == bp->b_target);
-
+			struct xfs_buf *bp;
+			bp = list_first_entry(&tmp, struct xfs_buf, b_list);
 			list_del_init(&bp->b_list);
 			xfs_buf_iostrategy(bp);
 			count++;
 		}
-
-		if (as_list_len > 0)
-			purge_addresses();
 		if (count)
 			blk_run_address_space(target->bt_mapping);
 
@@ -1751,42 +1910,45 @@ xfs_flush_buftarg(
 	xfs_buftarg_t	*target,
 	int		wait)
 {
-	struct list_head tmp;
-	xfs_buf_t	*bp, *n;
+	xfs_buf_t	*bp;
 	int		pincount = 0;
+	LIST_HEAD(tmp_list);
+	LIST_HEAD(wait_list);
 
 	xfs_buf_runall_queues(xfsconvertd_workqueue);
 	xfs_buf_runall_queues(xfsdatad_workqueue);
 	xfs_buf_runall_queues(xfslogd_workqueue);
 
 	set_bit(XBT_FORCE_FLUSH, &target->bt_flags);
-	pincount = xfs_buf_delwri_split(target, &tmp, 0);
+	pincount = xfs_buf_delwri_split(target, &tmp_list, 0);
 
 	/*
-	 * Dropped the delayed write list lock, now walk the temporary list
+	 * Dropped the delayed write list lock, now walk the temporary list.
+	 * All I/O is issued async and then if we need to wait for completion
+	 * we do that after issuing all the IO.
 	 */
-	list_for_each_entry_safe(bp, n, &tmp, b_list) {
+	list_sort(NULL, &tmp_list, xfs_buf_cmp);
+	while (!list_empty(&tmp_list)) {
+		bp = list_first_entry(&tmp_list, struct xfs_buf, b_list);
 		ASSERT(target == bp->b_target);
-		if (wait)
+		list_del_init(&bp->b_list);
+		if (wait) {
 			bp->b_flags &= ~XBF_ASYNC;
-		else
-			list_del_init(&bp->b_list);
-
+			list_add(&bp->b_list, &wait_list);
+		}
 		xfs_buf_iostrategy(bp);
 	}
 
-	if (wait)
+	if (wait) {
+		/* Expedite and wait for IO to complete. */
 		blk_run_address_space(target->bt_mapping);
+		while (!list_empty(&wait_list)) {
+			bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
 
-	/*
-	 * Remaining list items must be flushed before returning
-	 */
-	while (!list_empty(&tmp)) {
-		bp = list_entry(tmp.next, xfs_buf_t, b_list);
-
-		list_del_init(&bp->b_list);
-		xfs_iowait(bp);
-		xfs_buf_relse(bp);
+			list_del_init(&bp->b_list);
+			xfs_iowait(bp);
+			xfs_buf_relse(bp);
+		}
 	}
 
 	return pincount;
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index a34c7b54822d..5fbecefa5dfd 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -232,13 +232,17 @@ extern void xfs_buf_lock(xfs_buf_t *);
 extern void xfs_buf_unlock(xfs_buf_t *);
 
 /* Buffer Read and Write Routines */
-extern int xfs_bawrite(void *mp, xfs_buf_t *bp);
+extern int xfs_bwrite(struct xfs_mount *mp, struct xfs_buf *bp);
 extern void xfs_bdwrite(void *mp, xfs_buf_t *bp);
+
+extern void xfsbdstrat(struct xfs_mount *, struct xfs_buf *);
+extern int xfs_bdstrat_cb(struct xfs_buf *);
+
 extern void xfs_buf_ioend(xfs_buf_t *,	int);
 extern void xfs_buf_ioerror(xfs_buf_t *, int);
 extern int xfs_buf_iorequest(xfs_buf_t *);
 extern int xfs_buf_iowait(xfs_buf_t *);
-extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, xfs_caddr_t,
+extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *,
 				xfs_buf_rw_t);
 
 static inline int xfs_buf_iostrategy(xfs_buf_t *bp)
@@ -261,6 +265,7 @@ extern int xfs_buf_ispin(xfs_buf_t *);
 
 /* Delayed Write Buffer Routines */
 extern void xfs_buf_delwri_dequeue(xfs_buf_t *);
+extern void xfs_buf_delwri_promote(xfs_buf_t *);
 
 /* Buffer Daemon Setup Routines */
 extern int xfs_buf_init(void);
@@ -270,33 +275,19 @@ extern void xfs_buf_terminate(void);
 	({ char __b[BDEVNAME_SIZE]; bdevname((target)->bt_bdev, __b); __b; })
 
 
-#define XFS_B_ASYNC		XBF_ASYNC
-#define XFS_B_DELWRI		XBF_DELWRI
-#define XFS_B_READ		XBF_READ
-#define XFS_B_WRITE		XBF_WRITE
-#define XFS_B_STALE		XBF_STALE
-
-#define XFS_BUF_TRYLOCK		XBF_TRYLOCK
-#define XFS_INCORE_TRYLOCK	XBF_TRYLOCK
-#define XFS_BUF_LOCK		XBF_LOCK
-#define XFS_BUF_MAPPED		XBF_MAPPED
-
-#define BUF_BUSY		XBF_DONT_BLOCK
-
 #define XFS_BUF_BFLAGS(bp)	((bp)->b_flags)
 #define XFS_BUF_ZEROFLAGS(bp)	((bp)->b_flags &= \
 		~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI|XBF_ORDERED))
 
-#define XFS_BUF_STALE(bp)	((bp)->b_flags |= XFS_B_STALE)
-#define XFS_BUF_UNSTALE(bp)	((bp)->b_flags &= ~XFS_B_STALE)
-#define XFS_BUF_ISSTALE(bp)	((bp)->b_flags & XFS_B_STALE)
+#define XFS_BUF_STALE(bp)	((bp)->b_flags |= XBF_STALE)
+#define XFS_BUF_UNSTALE(bp)	((bp)->b_flags &= ~XBF_STALE)
+#define XFS_BUF_ISSTALE(bp)	((bp)->b_flags & XBF_STALE)
 #define XFS_BUF_SUPER_STALE(bp)	do {				\
 					XFS_BUF_STALE(bp);	\
 					xfs_buf_delwri_dequeue(bp);	\
 					XFS_BUF_DONE(bp);	\
 				} while (0)
 
-#define XFS_BUF_MANAGE		XBF_FS_MANAGED
 #define XFS_BUF_UNMANAGE(bp)	((bp)->b_flags &= ~XBF_FS_MANAGED)
 
 #define XFS_BUF_DELAYWRITE(bp)		((bp)->b_flags |= XBF_DELWRI)
@@ -385,31 +376,11 @@ static inline void xfs_buf_relse(xfs_buf_t *bp)
 
 #define xfs_biomove(bp, off, len, data, rw) \
 	    xfs_buf_iomove((bp), (off), (len), (data), \
-		((rw) == XFS_B_WRITE) ? XBRW_WRITE : XBRW_READ)
+		((rw) == XBF_WRITE) ? XBRW_WRITE : XBRW_READ)
 
 #define xfs_biozero(bp, off, len) \
 	    xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO)
 
-
-static inline int XFS_bwrite(xfs_buf_t *bp)
-{
-	int	iowait = (bp->b_flags & XBF_ASYNC) == 0;
-	int	error = 0;
-
-	if (!iowait)
-		bp->b_flags |= _XBF_RUN_QUEUES;
-
-	xfs_buf_delwri_dequeue(bp);
-	xfs_buf_iostrategy(bp);
-	if (iowait) {
-		error = xfs_buf_iowait(bp);
-		xfs_buf_relse(bp);
-	}
-	return error;
-}
-
-#define XFS_bdstrat(bp) xfs_buf_iorequest(bp)
-
 #define xfs_iowait(bp)	xfs_buf_iowait(bp)
 
 #define xfs_baread(target, rablkno, ralen)  \
@@ -419,11 +390,12 @@ static inline int XFS_bwrite(xfs_buf_t *bp)
 /*
  *	Handling of buftargs.
  */
-extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int);
+extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int, const char *);
 extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *);
 extern void xfs_wait_buftarg(xfs_buftarg_t *);
 extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int);
 extern int xfs_flush_buftarg(xfs_buftarg_t *, int);
+
 #ifdef CONFIG_KDB_MODULES
 extern struct list_head *xfs_get_buftarg_list(void);
 #endif
diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
index 87b8cbd23d4b..e7839ee49e43 100644
--- a/fs/xfs/linux-2.6/xfs_export.c
+++ b/fs/xfs/linux-2.6/xfs_export.c
@@ -29,6 +29,7 @@
 #include "xfs_vnodeops.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_inode.h"
+#include "xfs_inode_item.h"
 
 /*
  * Note that we only accept fileids which are long enough rather than allow
@@ -127,13 +128,12 @@ xfs_nfs_get_inode(
 		return ERR_PTR(-ESTALE);
 
 	/*
-	 * The XFS_IGET_BULKSTAT means that an invalid inode number is just
-	 * fine and not an indication of a corrupted filesystem.  Because
-	 * clients can send any kind of invalid file handle, e.g. after
-	 * a restore on the server we have to deal with this case gracefully.
+	 * The XFS_IGET_UNTRUSTED means that an invalid inode number is just
+	 * fine and not an indication of a corrupted filesystem as clients can
+	 * send invalid file handles and we have to handle it gracefully..
 	 */
-	error = xfs_iget(mp, NULL, ino, XFS_IGET_BULKSTAT,
-			 XFS_ILOCK_SHARED, &ip, 0);
+	error = xfs_iget(mp, NULL, ino, XFS_IGET_UNTRUSTED,
+			 XFS_ILOCK_SHARED, &ip);
 	if (error) {
 		/*
 		 * EINVAL means the inode cluster doesn't exist anymore.
@@ -215,9 +215,28 @@ xfs_fs_get_parent(
 	return d_obtain_alias(VFS_I(cip));
 }
 
+STATIC int
+xfs_fs_nfs_commit_metadata(
+	struct inode		*inode)
+{
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_mount	*mp = ip->i_mount;
+	int			error = 0;
+
+	xfs_ilock(ip, XFS_ILOCK_SHARED);
+	if (xfs_ipincount(ip)) {
+		error = _xfs_log_force_lsn(mp, ip->i_itemp->ili_last_lsn,
+				XFS_LOG_SYNC, NULL);
+	}
+	xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+	return error;
+}
+
 const struct export_operations xfs_export_operations = {
 	.encode_fh		= xfs_fs_encode_fh,
 	.fh_to_dentry		= xfs_fs_fh_to_dentry,
 	.fh_to_parent		= xfs_fs_fh_to_parent,
 	.get_parent		= xfs_fs_get_parent,
+	.commit_metadata	= xfs_fs_nfs_commit_metadata,
 };
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index e4caeb28ce2e..257a56b127cf 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -16,6 +16,7 @@
  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 #include "xfs.h"
+#include "xfs_fs.h"
 #include "xfs_bit.h"
 #include "xfs_log.h"
 #include "xfs_inum.h"
@@ -34,52 +35,281 @@
 #include "xfs_dir2_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
+#include "xfs_inode_item.h"
+#include "xfs_bmap.h"
 #include "xfs_error.h"
 #include "xfs_rw.h"
 #include "xfs_vnodeops.h"
 #include "xfs_da_btree.h"
 #include "xfs_ioctl.h"
+#include "xfs_trace.h"
 
 #include <linux/dcache.h>
 
 static const struct vm_operations_struct xfs_file_vm_ops;
 
-STATIC ssize_t
-xfs_file_aio_read(
-	struct kiocb		*iocb,
-	const struct iovec	*iov,
-	unsigned long		nr_segs,
-	loff_t			pos)
+/*
+ *	xfs_iozero
+ *
+ *	xfs_iozero clears the specified range of buffer supplied,
+ *	and marks all the affected blocks as valid and modified.  If
+ *	an affected block is not allocated, it will be allocated.  If
+ *	an affected block is not completely overwritten, and is not
+ *	valid before the operation, it will be read from disk before
+ *	being partially zeroed.
+ */
+STATIC int
+xfs_iozero(
+	struct xfs_inode	*ip,	/* inode			*/
+	loff_t			pos,	/* offset in file		*/
+	size_t			count)	/* size of data to zero		*/
 {
-	struct file		*file = iocb->ki_filp;
-	int			ioflags = 0;
+	struct page		*page;
+	struct address_space	*mapping;
+	int			status;
 
-	BUG_ON(iocb->ki_pos != pos);
-	if (unlikely(file->f_flags & O_DIRECT))
-		ioflags |= IO_ISDIRECT;
-	if (file->f_mode & FMODE_NOCMTIME)
-		ioflags |= IO_INVIS;
-	return xfs_read(XFS_I(file->f_path.dentry->d_inode), iocb, iov,
-				nr_segs, &iocb->ki_pos, ioflags);
+	mapping = VFS_I(ip)->i_mapping;
+	do {
+		unsigned offset, bytes;
+		void *fsdata;
+
+		offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
+		bytes = PAGE_CACHE_SIZE - offset;
+		if (bytes > count)
+			bytes = count;
+
+		status = pagecache_write_begin(NULL, mapping, pos, bytes,
+					AOP_FLAG_UNINTERRUPTIBLE,
+					&page, &fsdata);
+		if (status)
+			break;
+
+		zero_user(page, offset, bytes);
+
+		status = pagecache_write_end(NULL, mapping, pos, bytes, bytes,
+					page, fsdata);
+		WARN_ON(status <= 0); /* can't return less than zero! */
+		pos += bytes;
+		count -= bytes;
+		status = 0;
+	} while (count);
+
+	return (-status);
+}
+
+STATIC int
+xfs_file_fsync(
+	struct file		*file,
+	int			datasync)
+{
+	struct inode		*inode = file->f_mapping->host;
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_trans	*tp;
+	int			error = 0;
+	int			log_flushed = 0;
+
+	xfs_itrace_entry(ip);
+
+	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+		return -XFS_ERROR(EIO);
+
+	xfs_iflags_clear(ip, XFS_ITRUNCATED);
+
+	xfs_ioend_wait(ip);
+
+	/*
+	 * We always need to make sure that the required inode state is safe on
+	 * disk.  The inode might be clean but we still might need to force the
+	 * log because of committed transactions that haven't hit the disk yet.
+	 * Likewise, there could be unflushed non-transactional changes to the
+	 * inode core that have to go to disk and this requires us to issue
+	 * a synchronous transaction to capture these changes correctly.
+	 *
+	 * This code relies on the assumption that if the i_update_core field
+	 * of the inode is clear and the inode is unpinned then it is clean
+	 * and no action is required.
+	 */
+	xfs_ilock(ip, XFS_ILOCK_SHARED);
+
+	/*
+	 * First check if the VFS inode is marked dirty.  All the dirtying
+	 * of non-transactional updates no goes through mark_inode_dirty*,
+	 * which allows us to distinguish beteeen pure timestamp updates
+	 * and i_size updates which need to be caught for fdatasync.
+	 * After that also theck for the dirty state in the XFS inode, which
+	 * might gets cleared when the inode gets written out via the AIL
+	 * or xfs_iflush_cluster.
+	 */
+	if (((inode->i_state & I_DIRTY_DATASYNC) ||
+	    ((inode->i_state & I_DIRTY_SYNC) && !datasync)) &&
+	    ip->i_update_core) {
+		/*
+		 * Kick off a transaction to log the inode core to get the
+		 * updates.  The sync transaction will also force the log.
+		 */
+		xfs_iunlock(ip, XFS_ILOCK_SHARED);
+		tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS);
+		error = xfs_trans_reserve(tp, 0,
+				XFS_FSYNC_TS_LOG_RES(ip->i_mount), 0, 0, 0);
+		if (error) {
+			xfs_trans_cancel(tp, 0);
+			return -error;
+		}
+		xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+		/*
+		 * Note - it's possible that we might have pushed ourselves out
+		 * of the way during trans_reserve which would flush the inode.
+		 * But there's no guarantee that the inode buffer has actually
+		 * gone out yet (it's delwri).	Plus the buffer could be pinned
+		 * anyway if it's part of an inode in another recent
+		 * transaction.	 So we play it safe and fire off the
+		 * transaction anyway.
+		 */
+		xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+		xfs_trans_ihold(tp, ip);
+		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+		xfs_trans_set_sync(tp);
+		error = _xfs_trans_commit(tp, 0, &log_flushed);
+
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	} else {
+		/*
+		 * Timestamps/size haven't changed since last inode flush or
+		 * inode transaction commit.  That means either nothing got
+		 * written or a transaction committed which caught the updates.
+		 * If the latter happened and the transaction hasn't hit the
+		 * disk yet, the inode will be still be pinned.  If it is,
+		 * force the log.
+		 */
+		if (xfs_ipincount(ip)) {
+			error = _xfs_log_force_lsn(ip->i_mount,
+					ip->i_itemp->ili_last_lsn,
+					XFS_LOG_SYNC, &log_flushed);
+		}
+		xfs_iunlock(ip, XFS_ILOCK_SHARED);
+	}
+
+	if (ip->i_mount->m_flags & XFS_MOUNT_BARRIER) {
+		/*
+		 * If the log write didn't issue an ordered tag we need
+		 * to flush the disk cache for the data device now.
+		 */
+		if (!log_flushed)
+			xfs_blkdev_issue_flush(ip->i_mount->m_ddev_targp);
+
+		/*
+		 * If this inode is on the RT dev we need to flush that
+		 * cache as well.
+		 */
+		if (XFS_IS_REALTIME_INODE(ip))
+			xfs_blkdev_issue_flush(ip->i_mount->m_rtdev_targp);
+	}
+
+	return -error;
 }
 
 STATIC ssize_t
-xfs_file_aio_write(
+xfs_file_aio_read(
 	struct kiocb		*iocb,
-	const struct iovec	*iov,
+	const struct iovec	*iovp,
 	unsigned long		nr_segs,
 	loff_t			pos)
 {
 	struct file		*file = iocb->ki_filp;
+	struct inode		*inode = file->f_mapping->host;
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_mount	*mp = ip->i_mount;
+	size_t			size = 0;
+	ssize_t			ret = 0;
 	int			ioflags = 0;
+	xfs_fsize_t		n;
+	unsigned long		seg;
+
+	XFS_STATS_INC(xs_read_calls);
 
 	BUG_ON(iocb->ki_pos != pos);
+
 	if (unlikely(file->f_flags & O_DIRECT))
 		ioflags |= IO_ISDIRECT;
 	if (file->f_mode & FMODE_NOCMTIME)
 		ioflags |= IO_INVIS;
-	return xfs_write(XFS_I(file->f_mapping->host), iocb, iov, nr_segs,
-				&iocb->ki_pos, ioflags);
+
+	/* START copy & waste from filemap.c */
+	for (seg = 0; seg < nr_segs; seg++) {
+		const struct iovec *iv = &iovp[seg];
+
+		/*
+		 * If any segment has a negative length, or the cumulative
+		 * length ever wraps negative then return -EINVAL.
+		 */
+		size += iv->iov_len;
+		if (unlikely((ssize_t)(size|iv->iov_len) < 0))
+			return XFS_ERROR(-EINVAL);
+	}
+	/* END copy & waste from filemap.c */
+
+	if (unlikely(ioflags & IO_ISDIRECT)) {
+		xfs_buftarg_t	*target =
+			XFS_IS_REALTIME_INODE(ip) ?
+				mp->m_rtdev_targp : mp->m_ddev_targp;
+		if ((iocb->ki_pos & target->bt_smask) ||
+		    (size & target->bt_smask)) {
+			if (iocb->ki_pos == ip->i_size)
+				return 0;
+			return -XFS_ERROR(EINVAL);
+		}
+	}
+
+	n = XFS_MAXIOFFSET(mp) - iocb->ki_pos;
+	if (n <= 0 || size == 0)
+		return 0;
+
+	if (n < size)
+		size = n;
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return -EIO;
+
+	if (unlikely(ioflags & IO_ISDIRECT))
+		mutex_lock(&inode->i_mutex);
+	xfs_ilock(ip, XFS_IOLOCK_SHARED);
+
+	if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) {
+		int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags);
+		int iolock = XFS_IOLOCK_SHARED;
+
+		ret = -XFS_SEND_DATA(mp, DM_EVENT_READ, ip, iocb->ki_pos, size,
+					dmflags, &iolock);
+		if (ret) {
+			xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+			if (unlikely(ioflags & IO_ISDIRECT))
+				mutex_unlock(&inode->i_mutex);
+			return ret;
+		}
+	}
+
+	if (unlikely(ioflags & IO_ISDIRECT)) {
+		if (inode->i_mapping->nrpages) {
+			ret = -xfs_flushinval_pages(ip,
+					(iocb->ki_pos & PAGE_CACHE_MASK),
+					-1, FI_REMAPF_LOCKED);
+		}
+		mutex_unlock(&inode->i_mutex);
+		if (ret) {
+			xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+			return ret;
+		}
+	}
+
+	trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags);
+
+	ret = generic_file_aio_read(iocb, iovp, nr_segs, iocb->ki_pos);
+	if (ret > 0)
+		XFS_STATS_ADD(xs_read_bytes, ret);
+
+	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+	return ret;
 }
 
 STATIC ssize_t
@@ -87,16 +317,44 @@ xfs_file_splice_read(
 	struct file		*infilp,
 	loff_t			*ppos,
 	struct pipe_inode_info	*pipe,
-	size_t			len,
+	size_t			count,
 	unsigned int		flags)
 {
+	struct xfs_inode	*ip = XFS_I(infilp->f_mapping->host);
+	struct xfs_mount	*mp = ip->i_mount;
 	int			ioflags = 0;
+	ssize_t			ret;
+
+	XFS_STATS_INC(xs_read_calls);
 
 	if (infilp->f_mode & FMODE_NOCMTIME)
 		ioflags |= IO_INVIS;
 
-	return xfs_splice_read(XFS_I(infilp->f_path.dentry->d_inode),
-				   infilp, ppos, pipe, len, flags, ioflags);
+	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+		return -EIO;
+
+	xfs_ilock(ip, XFS_IOLOCK_SHARED);
+
+	if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) {
+		int iolock = XFS_IOLOCK_SHARED;
+		int error;
+
+		error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip, *ppos, count,
+					FILP_DELAY_FLAG(infilp), &iolock);
+		if (error) {
+			xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+			return -error;
+		}
+	}
+
+	trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
+
+	ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
+	if (ret > 0)
+		XFS_STATS_ADD(xs_read_bytes, ret);
+
+	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+	return ret;
 }
 
 STATIC ssize_t
@@ -104,16 +362,538 @@ xfs_file_splice_write(
 	struct pipe_inode_info	*pipe,
 	struct file		*outfilp,
 	loff_t			*ppos,
-	size_t			len,
+	size_t			count,
 	unsigned int		flags)
 {
+	struct inode		*inode = outfilp->f_mapping->host;
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_mount	*mp = ip->i_mount;
+	xfs_fsize_t		isize, new_size;
 	int			ioflags = 0;
+	ssize_t			ret;
+
+	XFS_STATS_INC(xs_write_calls);
 
 	if (outfilp->f_mode & FMODE_NOCMTIME)
 		ioflags |= IO_INVIS;
 
-	return xfs_splice_write(XFS_I(outfilp->f_path.dentry->d_inode),
-				    pipe, outfilp, ppos, len, flags, ioflags);
+	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+		return -EIO;
+
+	xfs_ilock(ip, XFS_IOLOCK_EXCL);
+
+	if (DM_EVENT_ENABLED(ip, DM_EVENT_WRITE) && !(ioflags & IO_INVIS)) {
+		int iolock = XFS_IOLOCK_EXCL;
+		int error;
+
+		error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip, *ppos, count,
+					FILP_DELAY_FLAG(outfilp), &iolock);
+		if (error) {
+			xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+			return -error;
+		}
+	}
+
+	new_size = *ppos + count;
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	if (new_size > ip->i_size)
+		ip->i_new_size = new_size;
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+	trace_xfs_file_splice_write(ip, count, *ppos, ioflags);
+
+	ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
+	if (ret > 0)
+		XFS_STATS_ADD(xs_write_bytes, ret);
+
+	isize = i_size_read(inode);
+	if (unlikely(ret < 0 && ret != -EFAULT && *ppos > isize))
+		*ppos = isize;
+
+	if (*ppos > ip->i_size) {
+		xfs_ilock(ip, XFS_ILOCK_EXCL);
+		if (*ppos > ip->i_size)
+			ip->i_size = *ppos;
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	}
+
+	if (ip->i_new_size) {
+		xfs_ilock(ip, XFS_ILOCK_EXCL);
+		ip->i_new_size = 0;
+		if (ip->i_d.di_size > ip->i_size)
+			ip->i_d.di_size = ip->i_size;
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	}
+	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+	return ret;
+}
+
+/*
+ * This routine is called to handle zeroing any space in the last
+ * block of the file that is beyond the EOF.  We do this since the
+ * size is being increased without writing anything to that block
+ * and we don't want anyone to read the garbage on the disk.
+ */
+STATIC int				/* error (positive) */
+xfs_zero_last_block(
+	xfs_inode_t	*ip,
+	xfs_fsize_t	offset,
+	xfs_fsize_t	isize)
+{
+	xfs_fileoff_t	last_fsb;
+	xfs_mount_t	*mp = ip->i_mount;
+	int		nimaps;
+	int		zero_offset;
+	int		zero_len;
+	int		error = 0;
+	xfs_bmbt_irec_t	imap;
+
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+	zero_offset = XFS_B_FSB_OFFSET(mp, isize);
+	if (zero_offset == 0) {
+		/*
+		 * There are no extra bytes in the last block on disk to
+		 * zero, so return.
+		 */
+		return 0;
+	}
+
+	last_fsb = XFS_B_TO_FSBT(mp, isize);
+	nimaps = 1;
+	error = xfs_bmapi(NULL, ip, last_fsb, 1, 0, NULL, 0, &imap,
+			  &nimaps, NULL, NULL);
+	if (error) {
+		return error;
+	}
+	ASSERT(nimaps > 0);
+	/*
+	 * If the block underlying isize is just a hole, then there
+	 * is nothing to zero.
+	 */
+	if (imap.br_startblock == HOLESTARTBLOCK) {
+		return 0;
+	}
+	/*
+	 * Zero the part of the last block beyond the EOF, and write it
+	 * out sync.  We need to drop the ilock while we do this so we
+	 * don't deadlock when the buffer cache calls back to us.
+	 */
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+	zero_len = mp->m_sb.sb_blocksize - zero_offset;
+	if (isize + zero_len > offset)
+		zero_len = offset - isize;
+	error = xfs_iozero(ip, isize, zero_len);
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	ASSERT(error >= 0);
+	return error;
+}
+
+/*
+ * Zero any on disk space between the current EOF and the new,
+ * larger EOF.  This handles the normal case of zeroing the remainder
+ * of the last block in the file and the unusual case of zeroing blocks
+ * out beyond the size of the file.  This second case only happens
+ * with fixed size extents and when the system crashes before the inode
+ * size was updated but after blocks were allocated.  If fill is set,
+ * then any holes in the range are filled and zeroed.  If not, the holes
+ * are left alone as holes.
+ */
+
+int					/* error (positive) */
+xfs_zero_eof(
+	xfs_inode_t	*ip,
+	xfs_off_t	offset,		/* starting I/O offset */
+	xfs_fsize_t	isize)		/* current inode size */
+{
+	xfs_mount_t	*mp = ip->i_mount;
+	xfs_fileoff_t	start_zero_fsb;
+	xfs_fileoff_t	end_zero_fsb;
+	xfs_fileoff_t	zero_count_fsb;
+	xfs_fileoff_t	last_fsb;
+	xfs_fileoff_t	zero_off;
+	xfs_fsize_t	zero_len;
+	int		nimaps;
+	int		error = 0;
+	xfs_bmbt_irec_t	imap;
+
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
+	ASSERT(offset > isize);
+
+	/*
+	 * First handle zeroing the block on which isize resides.
+	 * We only zero a part of that block so it is handled specially.
+	 */
+	error = xfs_zero_last_block(ip, offset, isize);
+	if (error) {
+		ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
+		return error;
+	}
+
+	/*
+	 * Calculate the range between the new size and the old
+	 * where blocks needing to be zeroed may exist.  To get the
+	 * block where the last byte in the file currently resides,
+	 * we need to subtract one from the size and truncate back
+	 * to a block boundary.  We subtract 1 in case the size is
+	 * exactly on a block boundary.
+	 */
+	last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1;
+	start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
+	end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1);
+	ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb);
+	if (last_fsb == end_zero_fsb) {
+		/*
+		 * The size was only incremented on its last block.
+		 * We took care of that above, so just return.
+		 */
+		return 0;
+	}
+
+	ASSERT(start_zero_fsb <= end_zero_fsb);
+	while (start_zero_fsb <= end_zero_fsb) {
+		nimaps = 1;
+		zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
+		error = xfs_bmapi(NULL, ip, start_zero_fsb, zero_count_fsb,
+				  0, NULL, 0, &imap, &nimaps, NULL, NULL);
+		if (error) {
+			ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
+			return error;
+		}
+		ASSERT(nimaps > 0);
+
+		if (imap.br_state == XFS_EXT_UNWRITTEN ||
+		    imap.br_startblock == HOLESTARTBLOCK) {
+			/*
+			 * This loop handles initializing pages that were
+			 * partially initialized by the code below this
+			 * loop. It basically zeroes the part of the page
+			 * that sits on a hole and sets the page as P_HOLE
+			 * and calls remapf if it is a mapped file.
+			 */
+			start_zero_fsb = imap.br_startoff + imap.br_blockcount;
+			ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
+			continue;
+		}
+
+		/*
+		 * There are blocks we need to zero.
+		 * Drop the inode lock while we're doing the I/O.
+		 * We'll still have the iolock to protect us.
+		 */
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+		zero_off = XFS_FSB_TO_B(mp, start_zero_fsb);
+		zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount);
+
+		if ((zero_off + zero_len) > offset)
+			zero_len = offset - zero_off;
+
+		error = xfs_iozero(ip, zero_off, zero_len);
+		if (error) {
+			goto out_lock;
+		}
+
+		start_zero_fsb = imap.br_startoff + imap.br_blockcount;
+		ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
+
+		xfs_ilock(ip, XFS_ILOCK_EXCL);
+	}
+
+	return 0;
+
+out_lock:
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	ASSERT(error >= 0);
+	return error;
+}
+
+STATIC ssize_t
+xfs_file_aio_write(
+	struct kiocb		*iocb,
+	const struct iovec	*iovp,
+	unsigned long		nr_segs,
+	loff_t			pos)
+{
+	struct file		*file = iocb->ki_filp;
+	struct address_space	*mapping = file->f_mapping;
+	struct inode		*inode = mapping->host;
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_mount	*mp = ip->i_mount;
+	ssize_t			ret = 0, error = 0;
+	int			ioflags = 0;
+	xfs_fsize_t		isize, new_size;
+	int			iolock;
+	int			eventsent = 0;
+	size_t			ocount = 0, count;
+	int			need_i_mutex;
+
+	XFS_STATS_INC(xs_write_calls);
+
+	BUG_ON(iocb->ki_pos != pos);
+
+	if (unlikely(file->f_flags & O_DIRECT))
+		ioflags |= IO_ISDIRECT;
+	if (file->f_mode & FMODE_NOCMTIME)
+		ioflags |= IO_INVIS;
+
+	error = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ);
+	if (error)
+		return error;
+
+	count = ocount;
+	if (count == 0)
+		return 0;
+
+	xfs_wait_for_freeze(mp, SB_FREEZE_WRITE);
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return -EIO;
+
+relock:
+	if (ioflags & IO_ISDIRECT) {
+		iolock = XFS_IOLOCK_SHARED;
+		need_i_mutex = 0;
+	} else {
+		iolock = XFS_IOLOCK_EXCL;
+		need_i_mutex = 1;
+		mutex_lock(&inode->i_mutex);
+	}
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL|iolock);
+
+start:
+	error = -generic_write_checks(file, &pos, &count,
+					S_ISBLK(inode->i_mode));
+	if (error) {
+		xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock);
+		goto out_unlock_mutex;
+	}
+
+	if ((DM_EVENT_ENABLED(ip, DM_EVENT_WRITE) &&
+	    !(ioflags & IO_INVIS) && !eventsent)) {
+		int		dmflags = FILP_DELAY_FLAG(file);
+
+		if (need_i_mutex)
+			dmflags |= DM_FLAGS_IMUX;
+
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		error = XFS_SEND_DATA(ip->i_mount, DM_EVENT_WRITE, ip,
+				      pos, count, dmflags, &iolock);
+		if (error) {
+			goto out_unlock_internal;
+		}
+		xfs_ilock(ip, XFS_ILOCK_EXCL);
+		eventsent = 1;
+
+		/*
+		 * The iolock was dropped and reacquired in XFS_SEND_DATA
+		 * so we have to recheck the size when appending.
+		 * We will only "goto start;" once, since having sent the
+		 * event prevents another call to XFS_SEND_DATA, which is
+		 * what allows the size to change in the first place.
+		 */
+		if ((file->f_flags & O_APPEND) && pos != ip->i_size)
+			goto start;
+	}
+
+	if (ioflags & IO_ISDIRECT) {
+		xfs_buftarg_t	*target =
+			XFS_IS_REALTIME_INODE(ip) ?
+				mp->m_rtdev_targp : mp->m_ddev_targp;
+
+		if ((pos & target->bt_smask) || (count & target->bt_smask)) {
+			xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock);
+			return XFS_ERROR(-EINVAL);
+		}
+
+		if (!need_i_mutex && (mapping->nrpages || pos > ip->i_size)) {
+			xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock);
+			iolock = XFS_IOLOCK_EXCL;
+			need_i_mutex = 1;
+			mutex_lock(&inode->i_mutex);
+			xfs_ilock(ip, XFS_ILOCK_EXCL|iolock);
+			goto start;
+		}
+	}
+
+	new_size = pos + count;
+	if (new_size > ip->i_size)
+		ip->i_new_size = new_size;
+
+	if (likely(!(ioflags & IO_INVIS)))
+		file_update_time(file);
+
+	/*
+	 * If the offset is beyond the size of the file, we have a couple
+	 * of things to do. First, if there is already space allocated
+	 * we need to either create holes or zero the disk or ...
+	 *
+	 * If there is a page where the previous size lands, we need
+	 * to zero it out up to the new size.
+	 */
+
+	if (pos > ip->i_size) {
+		error = xfs_zero_eof(ip, pos, ip->i_size);
+		if (error) {
+			xfs_iunlock(ip, XFS_ILOCK_EXCL);
+			goto out_unlock_internal;
+		}
+	}
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+	/*
+	 * If we're writing the file then make sure to clear the
+	 * setuid and setgid bits if the process is not being run
+	 * by root.  This keeps people from modifying setuid and
+	 * setgid binaries.
+	 */
+	error = -file_remove_suid(file);
+	if (unlikely(error))
+		goto out_unlock_internal;
+
+	/* We can write back this queue in page reclaim */
+	current->backing_dev_info = mapping->backing_dev_info;
+
+	if ((ioflags & IO_ISDIRECT)) {
+		if (mapping->nrpages) {
+			WARN_ON(need_i_mutex == 0);
+			error = xfs_flushinval_pages(ip,
+					(pos & PAGE_CACHE_MASK),
+					-1, FI_REMAPF_LOCKED);
+			if (error)
+				goto out_unlock_internal;
+		}
+
+		if (need_i_mutex) {
+			/* demote the lock now the cached pages are gone */
+			xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
+			mutex_unlock(&inode->i_mutex);
+
+			iolock = XFS_IOLOCK_SHARED;
+			need_i_mutex = 0;
+		}
+
+		trace_xfs_file_direct_write(ip, count, iocb->ki_pos, ioflags);
+		ret = generic_file_direct_write(iocb, iovp,
+				&nr_segs, pos, &iocb->ki_pos, count, ocount);
+
+		/*
+		 * direct-io write to a hole: fall through to buffered I/O
+		 * for completing the rest of the request.
+		 */
+		if (ret >= 0 && ret != count) {
+			XFS_STATS_ADD(xs_write_bytes, ret);
+
+			pos += ret;
+			count -= ret;
+
+			ioflags &= ~IO_ISDIRECT;
+			xfs_iunlock(ip, iolock);
+			goto relock;
+		}
+	} else {
+		int enospc = 0;
+		ssize_t ret2 = 0;
+
+write_retry:
+		trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, ioflags);
+		ret2 = generic_file_buffered_write(iocb, iovp, nr_segs,
+				pos, &iocb->ki_pos, count, ret);
+		/*
+		 * if we just got an ENOSPC, flush the inode now we
+		 * aren't holding any page locks and retry *once*
+		 */
+		if (ret2 == -ENOSPC && !enospc) {
+			error = xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
+			if (error)
+				goto out_unlock_internal;
+			enospc = 1;
+			goto write_retry;
+		}
+		ret = ret2;
+	}
+
+	current->backing_dev_info = NULL;
+
+	isize = i_size_read(inode);
+	if (unlikely(ret < 0 && ret != -EFAULT && iocb->ki_pos > isize))
+		iocb->ki_pos = isize;
+
+	if (iocb->ki_pos > ip->i_size) {
+		xfs_ilock(ip, XFS_ILOCK_EXCL);
+		if (iocb->ki_pos > ip->i_size)
+			ip->i_size = iocb->ki_pos;
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	}
+
+	if (ret == -ENOSPC &&
+	    DM_EVENT_ENABLED(ip, DM_EVENT_NOSPACE) && !(ioflags & IO_INVIS)) {
+		xfs_iunlock(ip, iolock);
+		if (need_i_mutex)
+			mutex_unlock(&inode->i_mutex);
+		error = XFS_SEND_NAMESP(ip->i_mount, DM_EVENT_NOSPACE, ip,
+				DM_RIGHT_NULL, ip, DM_RIGHT_NULL, NULL, NULL,
+				0, 0, 0); /* Delay flag intentionally  unused */
+		if (need_i_mutex)
+			mutex_lock(&inode->i_mutex);
+		xfs_ilock(ip, iolock);
+		if (error)
+			goto out_unlock_internal;
+		goto start;
+	}
+
+	error = -ret;
+	if (ret <= 0)
+		goto out_unlock_internal;
+
+	XFS_STATS_ADD(xs_write_bytes, ret);
+
+	/* Handle various SYNC-type writes */
+	if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
+		loff_t end = pos + ret - 1;
+		int error2;
+
+		xfs_iunlock(ip, iolock);
+		if (need_i_mutex)
+			mutex_unlock(&inode->i_mutex);
+
+		error2 = filemap_write_and_wait_range(mapping, pos, end);
+		if (!error)
+			error = error2;
+		if (need_i_mutex)
+			mutex_lock(&inode->i_mutex);
+		xfs_ilock(ip, iolock);
+
+		error2 = -xfs_file_fsync(file,
+					 (file->f_flags & __O_SYNC) ? 0 : 1);
+		if (!error)
+			error = error2;
+	}
+
+ out_unlock_internal:
+	if (ip->i_new_size) {
+		xfs_ilock(ip, XFS_ILOCK_EXCL);
+		ip->i_new_size = 0;
+		/*
+		 * If this was a direct or synchronous I/O that failed (such
+		 * as ENOSPC) then part of the I/O may have been written to
+		 * disk before the error occured.  In this case the on-disk
+		 * file size may have been adjusted beyond the in-memory file
+		 * size and now needs to be truncated back.
+		 */
+		if (ip->i_d.di_size > ip->i_size)
+			ip->i_d.di_size = ip->i_size;
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	}
+	xfs_iunlock(ip, iolock);
+ out_unlock_mutex:
+	if (need_i_mutex)
+		mutex_unlock(&inode->i_mutex);
+	return -error;
 }
 
 STATIC int
@@ -160,28 +940,6 @@ xfs_file_release(
 	return -xfs_release(XFS_I(inode));
 }
 
-/*
- * We ignore the datasync flag here because a datasync is effectively
- * identical to an fsync. That is, datasync implies that we need to write
- * only the metadata needed to be able to access the data that is written
- * if we crash after the call completes. Hence if we are writing beyond
- * EOF we have to log the inode size change as well, which makes it a
- * full fsync. If we don't write beyond EOF, the inode core will be
- * clean in memory and so we don't need to log the inode, just like
- * fsync.
- */
-STATIC int
-xfs_file_fsync(
-	struct file		*file,
-	struct dentry		*dentry,
-	int			datasync)
-{
-	struct xfs_inode	*ip = XFS_I(dentry->d_inode);
-
-	xfs_iflags_clear(ip, XFS_ITRUNCATED);
-	return -xfs_fsync(ip);
-}
-
 STATIC int
 xfs_file_readdir(
 	struct file	*filp,
@@ -203,9 +961,9 @@ xfs_file_readdir(
 	 *
 	 * Try to give it an estimate that's good enough, maybe at some
 	 * point we can change the ->readdir prototype to include the
-	 * buffer size.
+	 * buffer size.  For now we use the current glibc buffer size.
 	 */
-	bufsize = (size_t)min_t(loff_t, PAGE_SIZE, ip->i_d.di_size);
+	bufsize = (size_t)min_t(loff_t, 32768, ip->i_d.di_size);
 
 	error = xfs_readdir(ip, dirent, bufsize,
 				(xfs_off_t *)&filp->f_pos, filldir);
diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c
index 7501b85fd860..b6918d76bc7b 100644
--- a/fs/xfs/linux-2.6/xfs_fs_subr.c
+++ b/fs/xfs/linux-2.6/xfs_fs_subr.c
@@ -79,7 +79,7 @@ xfs_flush_pages(
 		xfs_iflags_clear(ip, XFS_ITRUNCATED);
 		ret = -filemap_fdatawrite(mapping);
 	}
-	if (flags & XFS_B_ASYNC)
+	if (flags & XBF_ASYNC)
 		return ret;
 	ret2 = xfs_wait_on_pages(ip, first, last);
 	if (!ret)
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index a034cf624437..e59a81062830 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -58,6 +58,7 @@
 #include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/pagemap.h>
+#include <linux/slab.h>
 #include <linux/exportfs.h>
 
 /*
@@ -447,12 +448,12 @@ xfs_attrlist_by_handle(
 int
 xfs_attrmulti_attr_get(
 	struct inode		*inode,
-	char			*name,
-	char			__user *ubuf,
+	unsigned char		*name,
+	unsigned char		__user *ubuf,
 	__uint32_t		*len,
 	__uint32_t		flags)
 {
-	char			*kbuf;
+	unsigned char		*kbuf;
 	int			error = EFAULT;
 
 	if (*len > XATTR_SIZE_MAX)
@@ -476,12 +477,12 @@ xfs_attrmulti_attr_get(
 int
 xfs_attrmulti_attr_set(
 	struct inode		*inode,
-	char			*name,
-	const char		__user *ubuf,
+	unsigned char		*name,
+	const unsigned char	__user *ubuf,
 	__uint32_t		len,
 	__uint32_t		flags)
 {
-	char			*kbuf;
+	unsigned char		*kbuf;
 	int			error = EFAULT;
 
 	if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
@@ -501,7 +502,7 @@ xfs_attrmulti_attr_set(
 int
 xfs_attrmulti_attr_remove(
 	struct inode		*inode,
-	char			*name,
+	unsigned char		*name,
 	__uint32_t		flags)
 {
 	if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
@@ -519,13 +520,17 @@ xfs_attrmulti_by_handle(
 	xfs_fsop_attrmulti_handlereq_t am_hreq;
 	struct dentry		*dentry;
 	unsigned int		i, size;
-	char			*attr_name;
+	unsigned char		*attr_name;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -XFS_ERROR(EPERM);
 	if (copy_from_user(&am_hreq, arg, sizeof(xfs_fsop_attrmulti_handlereq_t)))
 		return -XFS_ERROR(EFAULT);
 
+	/* overflow check */
+	if (am_hreq.opcount >= INT_MAX / sizeof(xfs_attr_multiop_t))
+		return -E2BIG;
+
 	dentry = xfs_handlereq_to_dentry(parfilp, &am_hreq.hreq);
 	if (IS_ERR(dentry))
 		return PTR_ERR(dentry);
@@ -547,7 +552,7 @@ xfs_attrmulti_by_handle(
 
 	error = 0;
 	for (i = 0; i < am_hreq.opcount; i++) {
-		ops[i].am_error = strncpy_from_user(attr_name,
+		ops[i].am_error = strncpy_from_user((char *)attr_name,
 				ops[i].am_attrname, MAXNAMELEN);
 		if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN)
 			error = -ERANGE;
@@ -674,10 +679,9 @@ xfs_ioc_bulkstat(
 		error = xfs_bulkstat_single(mp, &inlast,
 						bulkreq.ubuffer, &done);
 	else	/* XFS_IOC_FSBULKSTAT */
-		error = xfs_bulkstat(mp, &inlast, &count,
-			(bulkstat_one_pf)xfs_bulkstat_one, NULL,
-			sizeof(xfs_bstat_t), bulkreq.ubuffer,
-			BULKSTAT_FG_QUICK, &done);
+		error = xfs_bulkstat(mp, &inlast, &count, xfs_bulkstat_one,
+				     sizeof(xfs_bstat_t), bulkreq.ubuffer,
+				     &done);
 
 	if (error)
 		return -error;
@@ -1431,6 +1435,9 @@ xfs_file_ioctl(
 		if (!capable(CAP_SYS_ADMIN))
 			return -EPERM;
 
+		if (mp->m_flags & XFS_MOUNT_RDONLY)
+			return -XFS_ERROR(EROFS);
+
 		if (copy_from_user(&inout, arg, sizeof(inout)))
 			return -XFS_ERROR(EFAULT);
 
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.h b/fs/xfs/linux-2.6/xfs_ioctl.h
index 7bd7c6afc1eb..d56173b34a2a 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.h
+++ b/fs/xfs/linux-2.6/xfs_ioctl.h
@@ -45,23 +45,23 @@ xfs_readlink_by_handle(
 extern int
 xfs_attrmulti_attr_get(
 	struct inode		*inode,
-	char			*name,
-	char			__user *ubuf,
+	unsigned char		*name,
+	unsigned char		__user *ubuf,
 	__uint32_t		*len,
 	__uint32_t		flags);
 
 extern int
-	xfs_attrmulti_attr_set(
+xfs_attrmulti_attr_set(
 	struct inode		*inode,
-	char			*name,
-	const char		__user *ubuf,
+	unsigned char		*name,
+	const unsigned char	__user *ubuf,
 	__uint32_t		len,
 	__uint32_t		flags);
 
 extern int
 xfs_attrmulti_attr_remove(
 	struct inode		*inode,
-	char			*name,
+	unsigned char		*name,
 	__uint32_t		flags);
 
 extern struct dentry *
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index be1527b1670c..52ed49e6465c 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -18,6 +18,7 @@
 #include <linux/compat.h>
 #include <linux/ioctl.h>
 #include <linux/mount.h>
+#include <linux/slab.h>
 #include <asm/uaccess.h>
 #include "xfs.h"
 #include "xfs_fs.h"
@@ -236,15 +237,12 @@ xfs_bulkstat_one_compat(
 	xfs_ino_t	ino,		/* inode number to get data for */
 	void		__user *buffer,	/* buffer to place output in */
 	int		ubsize,		/* size of buffer */
-	void		*private_data,	/* my private data */
-	xfs_daddr_t	bno,		/* starting bno of inode cluster */
 	int		*ubused,	/* bytes used by me */
-	void		*dibuff,	/* on-disk inode buffer */
 	int		*stat)		/* BULKSTAT_RV_... */
 {
 	return xfs_bulkstat_one_int(mp, ino, buffer, ubsize,
-				    xfs_bulkstat_one_fmt_compat, bno,
-				    ubused, dibuff, stat);
+				    xfs_bulkstat_one_fmt_compat,
+				    ubused, stat);
 }
 
 /* copied from xfs_ioctl.c */
@@ -297,13 +295,11 @@ xfs_compat_ioc_bulkstat(
 		int res;
 
 		error = xfs_bulkstat_one_compat(mp, inlast, bulkreq.ubuffer,
-				sizeof(compat_xfs_bstat_t),
-				NULL, 0, NULL, NULL, &res);
+				sizeof(compat_xfs_bstat_t), 0, &res);
 	} else if (cmd == XFS_IOC_FSBULKSTAT_32) {
 		error = xfs_bulkstat(mp, &inlast, &count,
-			xfs_bulkstat_one_compat, NULL,
-			sizeof(compat_xfs_bstat_t), bulkreq.ubuffer,
-			BULKSTAT_FG_QUICK, &done);
+			xfs_bulkstat_one_compat, sizeof(compat_xfs_bstat_t),
+			bulkreq.ubuffer, &done);
 	} else
 		error = XFS_ERROR(EINVAL);
 	if (error)
@@ -411,7 +407,7 @@ xfs_compat_attrmulti_by_handle(
 	compat_xfs_fsop_attrmulti_handlereq_t	am_hreq;
 	struct dentry				*dentry;
 	unsigned int				i, size;
-	char					*attr_name;
+	unsigned char				*attr_name;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -XFS_ERROR(EPERM);
@@ -419,6 +415,10 @@ xfs_compat_attrmulti_by_handle(
 			   sizeof(compat_xfs_fsop_attrmulti_handlereq_t)))
 		return -XFS_ERROR(EFAULT);
 
+	/* overflow check */
+	if (am_hreq.opcount >= INT_MAX / sizeof(compat_xfs_attr_multiop_t))
+		return -E2BIG;
+
 	dentry = xfs_compat_handlereq_to_dentry(parfilp, &am_hreq.hreq);
 	if (IS_ERR(dentry))
 		return PTR_ERR(dentry);
@@ -440,7 +440,7 @@ xfs_compat_attrmulti_by_handle(
 
 	error = 0;
 	for (i = 0; i < am_hreq.opcount; i++) {
-		ops[i].am_error = strncpy_from_user(attr_name,
+		ops[i].am_error = strncpy_from_user((char *)attr_name,
 				compat_ptr(ops[i].am_attrname),
 				MAXNAMELEN);
 		if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN)
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 225946012d0b..44f0b2de153e 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -56,6 +56,7 @@
 #include <linux/security.h>
 #include <linux/falloc.h>
 #include <linux/fiemap.h>
+#include <linux/slab.h>
 
 /*
  * Bring the timestamps in the XFS inode uptodate.
@@ -91,6 +92,16 @@ xfs_mark_inode_dirty_sync(
 		mark_inode_dirty_sync(inode);
 }
 
+void
+xfs_mark_inode_dirty(
+	xfs_inode_t	*ip)
+{
+	struct inode	*inode = VFS_I(ip);
+
+	if (!(inode->i_state & (I_WILL_FREE|I_FREEING|I_CLEAR)))
+		mark_inode_dirty(inode);
+}
+
 /*
  * Change the requested timestamp in the given inode.
  * We don't lock across timestamp updates, and we don't log them but
@@ -140,10 +151,10 @@ xfs_init_security(
 	struct xfs_inode *ip = XFS_I(inode);
 	size_t		length;
 	void		*value;
-	char		*name;
+	unsigned char	*name;
 	int		error;
 
-	error = security_inode_init_security(inode, dir, &name,
+	error = security_inode_init_security(inode, dir, (char **)&name,
 					     &value, &length);
 	if (error) {
 		if (error == -EOPNOTSUPP)
@@ -574,11 +585,20 @@ xfs_vn_fallocate(
 	bf.l_len = len;
 
 	xfs_ilock(ip, XFS_IOLOCK_EXCL);
+
+	/* check the new inode size is valid before allocating */
+	if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+	    offset + len > i_size_read(inode)) {
+		new_size = offset + len;
+		error = inode_newsize_ok(inode, new_size);
+		if (error)
+			goto out_unlock;
+	}
+
 	error = -xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf,
 				       0, XFS_ATTR_NOLOCK);
-	if (!error && !(mode & FALLOC_FL_KEEP_SIZE) &&
-	    offset + len > i_size_read(inode))
-		new_size = offset + len;
+	if (error)
+		goto out_unlock;
 
 	/* Change file size if needed */
 	if (new_size) {
@@ -589,6 +609,7 @@ xfs_vn_fallocate(
 		error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK);
 	}
 
+out_unlock:
 	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
 out_error:
 	return error;
@@ -662,7 +683,10 @@ xfs_vn_fiemap(
 		bm.bmv_length = BTOBB(length);
 
 	/* We add one because in getbmap world count includes the header */
-	bm.bmv_count = fieinfo->fi_extents_max + 1;
+	bm.bmv_count = !fieinfo->fi_extents_max ? MAXEXTNUM :
+					fieinfo->fi_extents_max + 1;
+	bm.bmv_count = min_t(__s32, bm.bmv_count,
+			     (PAGE_SIZE * 16 / sizeof(struct getbmapx)));
 	bm.bmv_iflags = BMV_IF_PREALLOC;
 	if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR)
 		bm.bmv_iflags |= BMV_IF_ATTRFORK;
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index 5af0c81ca1ae..facfb323a706 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -88,7 +88,6 @@
 #include <xfs_super.h>
 #include <xfs_globals.h>
 #include <xfs_fs_subr.h>
-#include <xfs_lrw.h>
 #include <xfs_buf.h>
 
 /*
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
deleted file mode 100644
index 0d32457abef1..000000000000
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ /dev/null
@@ -1,852 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_alloc.h"
-#include "xfs_dmapi.h"
-#include "xfs_quota.h"
-#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_bmap.h"
-#include "xfs_btree.h"
-#include "xfs_ialloc.h"
-#include "xfs_rtalloc.h"
-#include "xfs_error.h"
-#include "xfs_itable.h"
-#include "xfs_rw.h"
-#include "xfs_attr.h"
-#include "xfs_inode_item.h"
-#include "xfs_buf_item.h"
-#include "xfs_utils.h"
-#include "xfs_iomap.h"
-#include "xfs_vnodeops.h"
-#include "xfs_trace.h"
-
-#include <linux/capability.h>
-#include <linux/writeback.h>
-
-
-/*
- *	xfs_iozero
- *
- *	xfs_iozero clears the specified range of buffer supplied,
- *	and marks all the affected blocks as valid and modified.  If
- *	an affected block is not allocated, it will be allocated.  If
- *	an affected block is not completely overwritten, and is not
- *	valid before the operation, it will be read from disk before
- *	being partially zeroed.
- */
-STATIC int
-xfs_iozero(
-	struct xfs_inode	*ip,	/* inode			*/
-	loff_t			pos,	/* offset in file		*/
-	size_t			count)	/* size of data to zero		*/
-{
-	struct page		*page;
-	struct address_space	*mapping;
-	int			status;
-
-	mapping = VFS_I(ip)->i_mapping;
-	do {
-		unsigned offset, bytes;
-		void *fsdata;
-
-		offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
-		bytes = PAGE_CACHE_SIZE - offset;
-		if (bytes > count)
-			bytes = count;
-
-		status = pagecache_write_begin(NULL, mapping, pos, bytes,
-					AOP_FLAG_UNINTERRUPTIBLE,
-					&page, &fsdata);
-		if (status)
-			break;
-
-		zero_user(page, offset, bytes);
-
-		status = pagecache_write_end(NULL, mapping, pos, bytes, bytes,
-					page, fsdata);
-		WARN_ON(status <= 0); /* can't return less than zero! */
-		pos += bytes;
-		count -= bytes;
-		status = 0;
-	} while (count);
-
-	return (-status);
-}
-
-ssize_t			/* bytes read, or (-)  error */
-xfs_read(
-	xfs_inode_t		*ip,
-	struct kiocb		*iocb,
-	const struct iovec	*iovp,
-	unsigned int		segs,
-	loff_t			*offset,
-	int			ioflags)
-{
-	struct file		*file = iocb->ki_filp;
-	struct inode		*inode = file->f_mapping->host;
-	xfs_mount_t		*mp = ip->i_mount;
-	size_t			size = 0;
-	ssize_t			ret = 0;
-	xfs_fsize_t		n;
-	unsigned long		seg;
-
-
-	XFS_STATS_INC(xs_read_calls);
-
-	/* START copy & waste from filemap.c */
-	for (seg = 0; seg < segs; seg++) {
-		const struct iovec *iv = &iovp[seg];
-
-		/*
-		 * If any segment has a negative length, or the cumulative
-		 * length ever wraps negative then return -EINVAL.
-		 */
-		size += iv->iov_len;
-		if (unlikely((ssize_t)(size|iv->iov_len) < 0))
-			return XFS_ERROR(-EINVAL);
-	}
-	/* END copy & waste from filemap.c */
-
-	if (unlikely(ioflags & IO_ISDIRECT)) {
-		xfs_buftarg_t	*target =
-			XFS_IS_REALTIME_INODE(ip) ?
-				mp->m_rtdev_targp : mp->m_ddev_targp;
-		if ((*offset & target->bt_smask) ||
-		    (size & target->bt_smask)) {
-			if (*offset == ip->i_size) {
-				return (0);
-			}
-			return -XFS_ERROR(EINVAL);
-		}
-	}
-
-	n = XFS_MAXIOFFSET(mp) - *offset;
-	if ((n <= 0) || (size == 0))
-		return 0;
-
-	if (n < size)
-		size = n;
-
-	if (XFS_FORCED_SHUTDOWN(mp))
-		return -EIO;
-
-	if (unlikely(ioflags & IO_ISDIRECT))
-		mutex_lock(&inode->i_mutex);
-	xfs_ilock(ip, XFS_IOLOCK_SHARED);
-
-	if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) {
-		int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags);
-		int iolock = XFS_IOLOCK_SHARED;
-
-		ret = -XFS_SEND_DATA(mp, DM_EVENT_READ, ip, *offset, size,
-					dmflags, &iolock);
-		if (ret) {
-			xfs_iunlock(ip, XFS_IOLOCK_SHARED);
-			if (unlikely(ioflags & IO_ISDIRECT))
-				mutex_unlock(&inode->i_mutex);
-			return ret;
-		}
-	}
-
-	if (unlikely(ioflags & IO_ISDIRECT)) {
-		if (inode->i_mapping->nrpages)
-			ret = -xfs_flushinval_pages(ip, (*offset & PAGE_CACHE_MASK),
-						    -1, FI_REMAPF_LOCKED);
-		mutex_unlock(&inode->i_mutex);
-		if (ret) {
-			xfs_iunlock(ip, XFS_IOLOCK_SHARED);
-			return ret;
-		}
-	}
-
-	trace_xfs_file_read(ip, size, *offset, ioflags);
-
-	iocb->ki_pos = *offset;
-	ret = generic_file_aio_read(iocb, iovp, segs, *offset);
-	if (ret > 0)
-		XFS_STATS_ADD(xs_read_bytes, ret);
-
-	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
-	return ret;
-}
-
-ssize_t
-xfs_splice_read(
-	xfs_inode_t		*ip,
-	struct file		*infilp,
-	loff_t			*ppos,
-	struct pipe_inode_info	*pipe,
-	size_t			count,
-	int			flags,
-	int			ioflags)
-{
-	xfs_mount_t		*mp = ip->i_mount;
-	ssize_t			ret;
-
-	XFS_STATS_INC(xs_read_calls);
-	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
-		return -EIO;
-
-	xfs_ilock(ip, XFS_IOLOCK_SHARED);
-
-	if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) {
-		int iolock = XFS_IOLOCK_SHARED;
-		int error;
-
-		error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip, *ppos, count,
-					FILP_DELAY_FLAG(infilp), &iolock);
-		if (error) {
-			xfs_iunlock(ip, XFS_IOLOCK_SHARED);
-			return -error;
-		}
-	}
-
-	trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
-
-	ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
-	if (ret > 0)
-		XFS_STATS_ADD(xs_read_bytes, ret);
-
-	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
-	return ret;
-}
-
-ssize_t
-xfs_splice_write(
-	xfs_inode_t		*ip,
-	struct pipe_inode_info	*pipe,
-	struct file		*outfilp,
-	loff_t			*ppos,
-	size_t			count,
-	int			flags,
-	int			ioflags)
-{
-	xfs_mount_t		*mp = ip->i_mount;
-	ssize_t			ret;
-	struct inode		*inode = outfilp->f_mapping->host;
-	xfs_fsize_t		isize, new_size;
-
-	XFS_STATS_INC(xs_write_calls);
-	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
-		return -EIO;
-
-	xfs_ilock(ip, XFS_IOLOCK_EXCL);
-
-	if (DM_EVENT_ENABLED(ip, DM_EVENT_WRITE) && !(ioflags & IO_INVIS)) {
-		int iolock = XFS_IOLOCK_EXCL;
-		int error;
-
-		error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip, *ppos, count,
-					FILP_DELAY_FLAG(outfilp), &iolock);
-		if (error) {
-			xfs_iunlock(ip, XFS_IOLOCK_EXCL);
-			return -error;
-		}
-	}
-
-	new_size = *ppos + count;
-
-	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	if (new_size > ip->i_size)
-		ip->i_new_size = new_size;
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
-
-	trace_xfs_file_splice_write(ip, count, *ppos, ioflags);
-
-	ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
-	if (ret > 0)
-		XFS_STATS_ADD(xs_write_bytes, ret);
-
-	isize = i_size_read(inode);
-	if (unlikely(ret < 0 && ret != -EFAULT && *ppos > isize))
-		*ppos = isize;
-
-	if (*ppos > ip->i_size) {
-		xfs_ilock(ip, XFS_ILOCK_EXCL);
-		if (*ppos > ip->i_size)
-			ip->i_size = *ppos;
-		xfs_iunlock(ip, XFS_ILOCK_EXCL);
-	}
-
-	if (ip->i_new_size) {
-		xfs_ilock(ip, XFS_ILOCK_EXCL);
-		ip->i_new_size = 0;
-		if (ip->i_d.di_size > ip->i_size)
-			ip->i_d.di_size = ip->i_size;
-		xfs_iunlock(ip, XFS_ILOCK_EXCL);
-	}
-	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
-	return ret;
-}
-
-/*
- * This routine is called to handle zeroing any space in the last
- * block of the file that is beyond the EOF.  We do this since the
- * size is being increased without writing anything to that block
- * and we don't want anyone to read the garbage on the disk.
- */
-STATIC int				/* error (positive) */
-xfs_zero_last_block(
-	xfs_inode_t	*ip,
-	xfs_fsize_t	offset,
-	xfs_fsize_t	isize)
-{
-	xfs_fileoff_t	last_fsb;
-	xfs_mount_t	*mp = ip->i_mount;
-	int		nimaps;
-	int		zero_offset;
-	int		zero_len;
-	int		error = 0;
-	xfs_bmbt_irec_t	imap;
-
-	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-
-	zero_offset = XFS_B_FSB_OFFSET(mp, isize);
-	if (zero_offset == 0) {
-		/*
-		 * There are no extra bytes in the last block on disk to
-		 * zero, so return.
-		 */
-		return 0;
-	}
-
-	last_fsb = XFS_B_TO_FSBT(mp, isize);
-	nimaps = 1;
-	error = xfs_bmapi(NULL, ip, last_fsb, 1, 0, NULL, 0, &imap,
-			  &nimaps, NULL, NULL);
-	if (error) {
-		return error;
-	}
-	ASSERT(nimaps > 0);
-	/*
-	 * If the block underlying isize is just a hole, then there
-	 * is nothing to zero.
-	 */
-	if (imap.br_startblock == HOLESTARTBLOCK) {
-		return 0;
-	}
-	/*
-	 * Zero the part of the last block beyond the EOF, and write it
-	 * out sync.  We need to drop the ilock while we do this so we
-	 * don't deadlock when the buffer cache calls back to us.
-	 */
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
-
-	zero_len = mp->m_sb.sb_blocksize - zero_offset;
-	if (isize + zero_len > offset)
-		zero_len = offset - isize;
-	error = xfs_iozero(ip, isize, zero_len);
-
-	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	ASSERT(error >= 0);
-	return error;
-}
-
-/*
- * Zero any on disk space between the current EOF and the new,
- * larger EOF.  This handles the normal case of zeroing the remainder
- * of the last block in the file and the unusual case of zeroing blocks
- * out beyond the size of the file.  This second case only happens
- * with fixed size extents and when the system crashes before the inode
- * size was updated but after blocks were allocated.  If fill is set,
- * then any holes in the range are filled and zeroed.  If not, the holes
- * are left alone as holes.
- */
-
-int					/* error (positive) */
-xfs_zero_eof(
-	xfs_inode_t	*ip,
-	xfs_off_t	offset,		/* starting I/O offset */
-	xfs_fsize_t	isize)		/* current inode size */
-{
-	xfs_mount_t	*mp = ip->i_mount;
-	xfs_fileoff_t	start_zero_fsb;
-	xfs_fileoff_t	end_zero_fsb;
-	xfs_fileoff_t	zero_count_fsb;
-	xfs_fileoff_t	last_fsb;
-	xfs_fileoff_t	zero_off;
-	xfs_fsize_t	zero_len;
-	int		nimaps;
-	int		error = 0;
-	xfs_bmbt_irec_t	imap;
-
-	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
-	ASSERT(offset > isize);
-
-	/*
-	 * First handle zeroing the block on which isize resides.
-	 * We only zero a part of that block so it is handled specially.
-	 */
-	error = xfs_zero_last_block(ip, offset, isize);
-	if (error) {
-		ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
-		return error;
-	}
-
-	/*
-	 * Calculate the range between the new size and the old
-	 * where blocks needing to be zeroed may exist.  To get the
-	 * block where the last byte in the file currently resides,
-	 * we need to subtract one from the size and truncate back
-	 * to a block boundary.  We subtract 1 in case the size is
-	 * exactly on a block boundary.
-	 */
-	last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1;
-	start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
-	end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1);
-	ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb);
-	if (last_fsb == end_zero_fsb) {
-		/*
-		 * The size was only incremented on its last block.
-		 * We took care of that above, so just return.
-		 */
-		return 0;
-	}
-
-	ASSERT(start_zero_fsb <= end_zero_fsb);
-	while (start_zero_fsb <= end_zero_fsb) {
-		nimaps = 1;
-		zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
-		error = xfs_bmapi(NULL, ip, start_zero_fsb, zero_count_fsb,
-				  0, NULL, 0, &imap, &nimaps, NULL, NULL);
-		if (error) {
-			ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
-			return error;
-		}
-		ASSERT(nimaps > 0);
-
-		if (imap.br_state == XFS_EXT_UNWRITTEN ||
-		    imap.br_startblock == HOLESTARTBLOCK) {
-			/*
-			 * This loop handles initializing pages that were
-			 * partially initialized by the code below this
-			 * loop. It basically zeroes the part of the page
-			 * that sits on a hole and sets the page as P_HOLE
-			 * and calls remapf if it is a mapped file.
-			 */
-			start_zero_fsb = imap.br_startoff + imap.br_blockcount;
-			ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
-			continue;
-		}
-
-		/*
-		 * There are blocks we need to zero.
-		 * Drop the inode lock while we're doing the I/O.
-		 * We'll still have the iolock to protect us.
-		 */
-		xfs_iunlock(ip, XFS_ILOCK_EXCL);
-
-		zero_off = XFS_FSB_TO_B(mp, start_zero_fsb);
-		zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount);
-
-		if ((zero_off + zero_len) > offset)
-			zero_len = offset - zero_off;
-
-		error = xfs_iozero(ip, zero_off, zero_len);
-		if (error) {
-			goto out_lock;
-		}
-
-		start_zero_fsb = imap.br_startoff + imap.br_blockcount;
-		ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
-
-		xfs_ilock(ip, XFS_ILOCK_EXCL);
-	}
-
-	return 0;
-
-out_lock:
-	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	ASSERT(error >= 0);
-	return error;
-}
-
-ssize_t				/* bytes written, or (-) error */
-xfs_write(
-	struct xfs_inode	*xip,
-	struct kiocb		*iocb,
-	const struct iovec	*iovp,
-	unsigned int		nsegs,
-	loff_t			*offset,
-	int			ioflags)
-{
-	struct file		*file = iocb->ki_filp;
-	struct address_space	*mapping = file->f_mapping;
-	struct inode		*inode = mapping->host;
-	unsigned long		segs = nsegs;
-	xfs_mount_t		*mp;
-	ssize_t			ret = 0, error = 0;
-	xfs_fsize_t		isize, new_size;
-	int			iolock;
-	int			eventsent = 0;
-	size_t			ocount = 0, count;
-	loff_t			pos;
-	int			need_i_mutex;
-
-	XFS_STATS_INC(xs_write_calls);
-
-	error = generic_segment_checks(iovp, &segs, &ocount, VERIFY_READ);
-	if (error)
-		return error;
-
-	count = ocount;
-	pos = *offset;
-
-	if (count == 0)
-		return 0;
-
-	mp = xip->i_mount;
-
-	xfs_wait_for_freeze(mp, SB_FREEZE_WRITE);
-
-	if (XFS_FORCED_SHUTDOWN(mp))
-		return -EIO;
-
-relock:
-	if (ioflags & IO_ISDIRECT) {
-		iolock = XFS_IOLOCK_SHARED;
-		need_i_mutex = 0;
-	} else {
-		iolock = XFS_IOLOCK_EXCL;
-		need_i_mutex = 1;
-		mutex_lock(&inode->i_mutex);
-	}
-
-	xfs_ilock(xip, XFS_ILOCK_EXCL|iolock);
-
-start:
-	error = -generic_write_checks(file, &pos, &count,
-					S_ISBLK(inode->i_mode));
-	if (error) {
-		xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
-		goto out_unlock_mutex;
-	}
-
-	if ((DM_EVENT_ENABLED(xip, DM_EVENT_WRITE) &&
-	    !(ioflags & IO_INVIS) && !eventsent)) {
-		int		dmflags = FILP_DELAY_FLAG(file);
-
-		if (need_i_mutex)
-			dmflags |= DM_FLAGS_IMUX;
-
-		xfs_iunlock(xip, XFS_ILOCK_EXCL);
-		error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, xip,
-				      pos, count, dmflags, &iolock);
-		if (error) {
-			goto out_unlock_internal;
-		}
-		xfs_ilock(xip, XFS_ILOCK_EXCL);
-		eventsent = 1;
-
-		/*
-		 * The iolock was dropped and reacquired in XFS_SEND_DATA
-		 * so we have to recheck the size when appending.
-		 * We will only "goto start;" once, since having sent the
-		 * event prevents another call to XFS_SEND_DATA, which is
-		 * what allows the size to change in the first place.
-		 */
-		if ((file->f_flags & O_APPEND) && pos != xip->i_size)
-			goto start;
-	}
-
-	if (ioflags & IO_ISDIRECT) {
-		xfs_buftarg_t	*target =
-			XFS_IS_REALTIME_INODE(xip) ?
-				mp->m_rtdev_targp : mp->m_ddev_targp;
-
-		if ((pos & target->bt_smask) || (count & target->bt_smask)) {
-			xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
-			return XFS_ERROR(-EINVAL);
-		}
-
-		if (!need_i_mutex && (mapping->nrpages || pos > xip->i_size)) {
-			xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
-			iolock = XFS_IOLOCK_EXCL;
-			need_i_mutex = 1;
-			mutex_lock(&inode->i_mutex);
-			xfs_ilock(xip, XFS_ILOCK_EXCL|iolock);
-			goto start;
-		}
-	}
-
-	new_size = pos + count;
-	if (new_size > xip->i_size)
-		xip->i_new_size = new_size;
-
-	if (likely(!(ioflags & IO_INVIS)))
-		file_update_time(file);
-
-	/*
-	 * If the offset is beyond the size of the file, we have a couple
-	 * of things to do. First, if there is already space allocated
-	 * we need to either create holes or zero the disk or ...
-	 *
-	 * If there is a page where the previous size lands, we need
-	 * to zero it out up to the new size.
-	 */
-
-	if (pos > xip->i_size) {
-		error = xfs_zero_eof(xip, pos, xip->i_size);
-		if (error) {
-			xfs_iunlock(xip, XFS_ILOCK_EXCL);
-			goto out_unlock_internal;
-		}
-	}
-	xfs_iunlock(xip, XFS_ILOCK_EXCL);
-
-	/*
-	 * If we're writing the file then make sure to clear the
-	 * setuid and setgid bits if the process is not being run
-	 * by root.  This keeps people from modifying setuid and
-	 * setgid binaries.
-	 */
-
-	if (((xip->i_d.di_mode & S_ISUID) ||
-	    ((xip->i_d.di_mode & (S_ISGID | S_IXGRP)) ==
-		(S_ISGID | S_IXGRP))) &&
-	     !capable(CAP_FSETID)) {
-		error = xfs_write_clear_setuid(xip);
-		if (likely(!error))
-			error = -file_remove_suid(file);
-		if (unlikely(error)) {
-			goto out_unlock_internal;
-		}
-	}
-
-	/* We can write back this queue in page reclaim */
-	current->backing_dev_info = mapping->backing_dev_info;
-
-	if ((ioflags & IO_ISDIRECT)) {
-		if (mapping->nrpages) {
-			WARN_ON(need_i_mutex == 0);
-			error = xfs_flushinval_pages(xip,
-					(pos & PAGE_CACHE_MASK),
-					-1, FI_REMAPF_LOCKED);
-			if (error)
-				goto out_unlock_internal;
-		}
-
-		if (need_i_mutex) {
-			/* demote the lock now the cached pages are gone */
-			xfs_ilock_demote(xip, XFS_IOLOCK_EXCL);
-			mutex_unlock(&inode->i_mutex);
-
-			iolock = XFS_IOLOCK_SHARED;
-			need_i_mutex = 0;
-		}
-
-		trace_xfs_file_direct_write(xip, count, *offset, ioflags);
-		ret = generic_file_direct_write(iocb, iovp,
-				&segs, pos, offset, count, ocount);
-
-		/*
-		 * direct-io write to a hole: fall through to buffered I/O
-		 * for completing the rest of the request.
-		 */
-		if (ret >= 0 && ret != count) {
-			XFS_STATS_ADD(xs_write_bytes, ret);
-
-			pos += ret;
-			count -= ret;
-
-			ioflags &= ~IO_ISDIRECT;
-			xfs_iunlock(xip, iolock);
-			goto relock;
-		}
-	} else {
-		int enospc = 0;
-		ssize_t ret2 = 0;
-
-write_retry:
-		trace_xfs_file_buffered_write(xip, count, *offset, ioflags);
-		ret2 = generic_file_buffered_write(iocb, iovp, segs,
-				pos, offset, count, ret);
-		/*
-		 * if we just got an ENOSPC, flush the inode now we
-		 * aren't holding any page locks and retry *once*
-		 */
-		if (ret2 == -ENOSPC && !enospc) {
-			error = xfs_flush_pages(xip, 0, -1, 0, FI_NONE);
-			if (error)
-				goto out_unlock_internal;
-			enospc = 1;
-			goto write_retry;
-		}
-		ret = ret2;
-	}
-
-	current->backing_dev_info = NULL;
-
-	isize = i_size_read(inode);
-	if (unlikely(ret < 0 && ret != -EFAULT && *offset > isize))
-		*offset = isize;
-
-	if (*offset > xip->i_size) {
-		xfs_ilock(xip, XFS_ILOCK_EXCL);
-		if (*offset > xip->i_size)
-			xip->i_size = *offset;
-		xfs_iunlock(xip, XFS_ILOCK_EXCL);
-	}
-
-	if (ret == -ENOSPC &&
-	    DM_EVENT_ENABLED(xip, DM_EVENT_NOSPACE) && !(ioflags & IO_INVIS)) {
-		xfs_iunlock(xip, iolock);
-		if (need_i_mutex)
-			mutex_unlock(&inode->i_mutex);
-		error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, xip,
-				DM_RIGHT_NULL, xip, DM_RIGHT_NULL, NULL, NULL,
-				0, 0, 0); /* Delay flag intentionally  unused */
-		if (need_i_mutex)
-			mutex_lock(&inode->i_mutex);
-		xfs_ilock(xip, iolock);
-		if (error)
-			goto out_unlock_internal;
-		goto start;
-	}
-
-	error = -ret;
-	if (ret <= 0)
-		goto out_unlock_internal;
-
-	XFS_STATS_ADD(xs_write_bytes, ret);
-
-	/* Handle various SYNC-type writes */
-	if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
-		loff_t end = pos + ret - 1;
-		int error2;
-
-		xfs_iunlock(xip, iolock);
-		if (need_i_mutex)
-			mutex_unlock(&inode->i_mutex);
-
-		error2 = filemap_write_and_wait_range(mapping, pos, end);
-		if (!error)
-			error = error2;
-		if (need_i_mutex)
-			mutex_lock(&inode->i_mutex);
-		xfs_ilock(xip, iolock);
-
-		error2 = xfs_fsync(xip);
-		if (!error)
-			error = error2;
-	}
-
- out_unlock_internal:
-	if (xip->i_new_size) {
-		xfs_ilock(xip, XFS_ILOCK_EXCL);
-		xip->i_new_size = 0;
-		/*
-		 * If this was a direct or synchronous I/O that failed (such
-		 * as ENOSPC) then part of the I/O may have been written to
-		 * disk before the error occured.  In this case the on-disk
-		 * file size may have been adjusted beyond the in-memory file
-		 * size and now needs to be truncated back.
-		 */
-		if (xip->i_d.di_size > xip->i_size)
-			xip->i_d.di_size = xip->i_size;
-		xfs_iunlock(xip, XFS_ILOCK_EXCL);
-	}
-	xfs_iunlock(xip, iolock);
- out_unlock_mutex:
-	if (need_i_mutex)
-		mutex_unlock(&inode->i_mutex);
-	return -error;
-}
-
-/*
- * All xfs metadata buffers except log state machine buffers
- * get this attached as their b_bdstrat callback function.
- * This is so that we can catch a buffer
- * after prematurely unpinning it to forcibly shutdown the filesystem.
- */
-int
-xfs_bdstrat_cb(struct xfs_buf *bp)
-{
-	if (XFS_FORCED_SHUTDOWN(bp->b_mount)) {
-		trace_xfs_bdstrat_shut(bp, _RET_IP_);
-		/*
-		 * Metadata write that didn't get logged but
-		 * written delayed anyway. These aren't associated
-		 * with a transaction, and can be ignored.
-		 */
-		if (XFS_BUF_IODONE_FUNC(bp) == NULL &&
-		    (XFS_BUF_ISREAD(bp)) == 0)
-			return (xfs_bioerror_relse(bp));
-		else
-			return (xfs_bioerror(bp));
-	}
-
-	xfs_buf_iorequest(bp);
-	return 0;
-}
-
-/*
- * Wrapper around bdstrat so that we can stop data from going to disk in case
- * we are shutting down the filesystem.  Typically user data goes thru this
- * path; one of the exceptions is the superblock.
- */
-void
-xfsbdstrat(
-	struct xfs_mount	*mp,
-	struct xfs_buf		*bp)
-{
-	ASSERT(mp);
-	if (!XFS_FORCED_SHUTDOWN(mp)) {
-		xfs_buf_iorequest(bp);
-		return;
-	}
-
-	trace_xfs_bdstrat_shut(bp, _RET_IP_);
-	xfs_bioerror_relse(bp);
-}
-
-/*
- * If the underlying (data/log/rt) device is readonly, there are some
- * operations that cannot proceed.
- */
-int
-xfs_dev_is_read_only(
-	xfs_mount_t		*mp,
-	char			*message)
-{
-	if (xfs_readonly_buftarg(mp->m_ddev_targp) ||
-	    xfs_readonly_buftarg(mp->m_logdev_targp) ||
-	    (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) {
-		cmn_err(CE_NOTE,
-			"XFS: %s required on read-only device.", message);
-		cmn_err(CE_NOTE,
-			"XFS: write access unavailable, cannot proceed.");
-		return EROFS;
-	}
-	return 0;
-}
diff --git a/fs/xfs/linux-2.6/xfs_lrw.h b/fs/xfs/linux-2.6/xfs_lrw.h
deleted file mode 100644
index d1f7789c7ffb..000000000000
--- a/fs/xfs/linux-2.6/xfs_lrw.h
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_LRW_H__
-#define __XFS_LRW_H__
-
-struct xfs_mount;
-struct xfs_inode;
-struct xfs_buf;
-
-/* errors from xfsbdstrat() must be extracted from the buffer */
-extern void xfsbdstrat(struct xfs_mount *, struct xfs_buf *);
-extern int xfs_bdstrat_cb(struct xfs_buf *);
-extern int xfs_dev_is_read_only(struct xfs_mount *, char *);
-
-extern int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t);
-
-#endif	/* __XFS_LRW_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c
index 3d4a0c84d634..067cafbfc635 100644
--- a/fs/xfs/linux-2.6/xfs_quotaops.c
+++ b/fs/xfs/linux-2.6/xfs_quotaops.c
@@ -19,10 +19,10 @@
 #include "xfs_dmapi.h"
 #include "xfs_sb.h"
 #include "xfs_inum.h"
+#include "xfs_log.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
 #include "xfs_quota.h"
-#include "xfs_log.h"
 #include "xfs_trans.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_inode.h"
@@ -44,20 +44,6 @@ xfs_quota_type(int type)
 }
 
 STATIC int
-xfs_fs_quota_sync(
-	struct super_block	*sb,
-	int			type)
-{
-	struct xfs_mount	*mp = XFS_M(sb);
-
-	if (sb->s_flags & MS_RDONLY)
-		return -EROFS;
-	if (!XFS_IS_QUOTA_RUNNING(mp))
-		return -ENOSYS;
-	return -xfs_sync_data(mp, 0);
-}
-
-STATIC int
 xfs_fs_get_xstate(
 	struct super_block	*sb,
 	struct fs_quota_stat	*fqs)
@@ -82,8 +68,6 @@ xfs_fs_set_xstate(
 		return -EROFS;
 	if (op != Q_XQUOTARM && !XFS_IS_QUOTA_RUNNING(mp))
 		return -ENOSYS;
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
 
 	if (uflags & XFS_QUOTA_UDQ_ACCT)
 		flags |= XFS_UQUOTA_ACCT;
@@ -113,7 +97,7 @@ xfs_fs_set_xstate(
 }
 
 STATIC int
-xfs_fs_get_xquota(
+xfs_fs_get_dqblk(
 	struct super_block	*sb,
 	int			type,
 	qid_t			id,
@@ -130,7 +114,7 @@ xfs_fs_get_xquota(
 }
 
 STATIC int
-xfs_fs_set_xquota(
+xfs_fs_set_dqblk(
 	struct super_block	*sb,
 	int			type,
 	qid_t			id,
@@ -144,16 +128,13 @@ xfs_fs_set_xquota(
 		return -ENOSYS;
 	if (!XFS_IS_QUOTA_ON(mp))
 		return -ESRCH;
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
 
 	return -xfs_qm_scall_setqlim(mp, id, xfs_quota_type(type), fdq);
 }
 
 const struct quotactl_ops xfs_quotactl_operations = {
-	.quota_sync		= xfs_fs_quota_sync,
 	.get_xstate		= xfs_fs_get_xstate,
 	.set_xstate		= xfs_fs_set_xstate,
-	.get_xquota		= xfs_fs_get_xquota,
-	.set_xquota		= xfs_fs_set_xquota,
+	.get_dqblk		= xfs_fs_get_dqblk,
+	.set_dqblk		= xfs_fs_set_dqblk,
 };
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 77414db10dc2..80938c736c27 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -61,6 +61,7 @@
 
 #include <linux/namei.h>
 #include <linux/init.h>
+#include <linux/slab.h>
 #include <linux/mount.h>
 #include <linux/mempool.h>
 #include <linux/writeback.h>
@@ -118,6 +119,8 @@ mempool_t *xfs_ioend_pool;
 #define MNTOPT_DMAPI	"dmapi"		/* DMI enabled (DMAPI / XDSM) */
 #define MNTOPT_XDSM	"xdsm"		/* DMI enabled (DMAPI / XDSM) */
 #define MNTOPT_DMI	"dmi"		/* DMI enabled (DMAPI / XDSM) */
+#define MNTOPT_DELAYLOG   "delaylog"	/* Delayed loging enabled */
+#define MNTOPT_NODELAYLOG "nodelaylog"	/* Delayed loging disabled */
 
 /*
  * Table driven mount option parser.
@@ -373,6 +376,13 @@ xfs_parseargs(
 			mp->m_flags |= XFS_MOUNT_DMAPI;
 		} else if (!strcmp(this_char, MNTOPT_DMI)) {
 			mp->m_flags |= XFS_MOUNT_DMAPI;
+		} else if (!strcmp(this_char, MNTOPT_DELAYLOG)) {
+			mp->m_flags |= XFS_MOUNT_DELAYLOG;
+			cmn_err(CE_WARN,
+				"Enabling EXPERIMENTAL delayed logging feature "
+				"- use at your own risk.\n");
+		} else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) {
+			mp->m_flags &= ~XFS_MOUNT_DELAYLOG;
 		} else if (!strcmp(this_char, "ihashsize")) {
 			cmn_err(CE_WARN,
 	"XFS: ihashsize no longer used, option is deprecated.");
@@ -534,6 +544,7 @@ xfs_showargs(
 		{ XFS_MOUNT_FILESTREAMS,	"," MNTOPT_FILESTREAM },
 		{ XFS_MOUNT_DMAPI,		"," MNTOPT_DMAPI },
 		{ XFS_MOUNT_GRPID,		"," MNTOPT_GRPID },
+		{ XFS_MOUNT_DELAYLOG,		"," MNTOPT_DELAYLOG },
 		{ 0, NULL }
 	};
 	static struct proc_xfs_info xfs_info_unset[] = {
@@ -724,7 +735,8 @@ void
 xfs_blkdev_issue_flush(
 	xfs_buftarg_t		*buftarg)
 {
-	blkdev_issue_flush(buftarg->bt_bdev, NULL);
+	blkdev_issue_flush(buftarg->bt_bdev, GFP_KERNEL, NULL,
+			BLKDEV_IFL_WAIT);
 }
 
 STATIC void
@@ -788,18 +800,18 @@ xfs_open_devices(
 	 * Setup xfs_mount buffer target pointers
 	 */
 	error = ENOMEM;
-	mp->m_ddev_targp = xfs_alloc_buftarg(ddev, 0);
+	mp->m_ddev_targp = xfs_alloc_buftarg(ddev, 0, mp->m_fsname);
 	if (!mp->m_ddev_targp)
 		goto out_close_rtdev;
 
 	if (rtdev) {
-		mp->m_rtdev_targp = xfs_alloc_buftarg(rtdev, 1);
+		mp->m_rtdev_targp = xfs_alloc_buftarg(rtdev, 1, mp->m_fsname);
 		if (!mp->m_rtdev_targp)
 			goto out_free_ddev_targ;
 	}
 
 	if (logdev && logdev != ddev) {
-		mp->m_logdev_targp = xfs_alloc_buftarg(logdev, 1);
+		mp->m_logdev_targp = xfs_alloc_buftarg(logdev, 1, mp->m_fsname);
 		if (!mp->m_logdev_targp)
 			goto out_free_rtdev_targ;
 	} else {
@@ -877,12 +889,11 @@ xfsaild(
 {
 	struct xfs_ail	*ailp = data;
 	xfs_lsn_t	last_pushed_lsn = 0;
-	long		tout = 0;
+	long		tout = 0; /* milliseconds */
 
 	while (!kthread_should_stop()) {
-		if (tout)
-			schedule_timeout_interruptible(msecs_to_jiffies(tout));
-		tout = 1000;
+		schedule_timeout_interruptible(tout ?
+				msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT);
 
 		/* swsusp */
 		try_to_freeze();
@@ -902,7 +913,8 @@ xfsaild_start(
 	struct xfs_ail	*ailp)
 {
 	ailp->xa_target = 0;
-	ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild");
+	ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild/%s",
+				    ailp->xa_mount->m_fsname);
 	if (IS_ERR(ailp->xa_task))
 		return -PTR_ERR(ailp->xa_task);
 	return 0;
@@ -1022,59 +1034,109 @@ xfs_fs_dirty_inode(
 	XFS_I(inode)->i_update_core = 1;
 }
 
-/*
- * Attempt to flush the inode, this will actually fail
- * if the inode is pinned, but we dirty the inode again
- * at the point when it is unpinned after a log write,
- * since this is when the inode itself becomes flushable.
- */
+STATIC int
+xfs_log_inode(
+	struct xfs_inode	*ip)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_trans	*tp;
+	int			error;
+
+	xfs_iunlock(ip, XFS_ILOCK_SHARED);
+	tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
+	error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
+
+	if (error) {
+		xfs_trans_cancel(tp, 0);
+		/* we need to return with the lock hold shared */
+		xfs_ilock(ip, XFS_ILOCK_SHARED);
+		return error;
+	}
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+	/*
+	 * Note - it's possible that we might have pushed ourselves out of the
+	 * way during trans_reserve which would flush the inode.  But there's
+	 * no guarantee that the inode buffer has actually gone out yet (it's
+	 * delwri).  Plus the buffer could be pinned anyway if it's part of
+	 * an inode in another recent transaction.  So we play it safe and
+	 * fire off the transaction anyway.
+	 */
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+	xfs_trans_ihold(tp, ip);
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+	xfs_trans_set_sync(tp);
+	error = xfs_trans_commit(tp, 0);
+	xfs_ilock_demote(ip, XFS_ILOCK_EXCL);
+
+	return error;
+}
+
 STATIC int
 xfs_fs_write_inode(
 	struct inode		*inode,
-	int			sync)
+	struct writeback_control *wbc)
 {
 	struct xfs_inode	*ip = XFS_I(inode);
 	struct xfs_mount	*mp = ip->i_mount;
-	int			error = 0;
+	int			error = EAGAIN;
 
 	xfs_itrace_entry(ip);
 
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return XFS_ERROR(EIO);
 
-	if (sync) {
-		error = xfs_wait_on_pages(ip, 0, -1);
-		if (error)
+	if (wbc->sync_mode == WB_SYNC_ALL) {
+		/*
+		 * Make sure the inode has hit stable storage.  By using the
+		 * log and the fsync transactions we reduce the IOs we have
+		 * to do here from two (log and inode) to just the log.
+		 *
+		 * Note: We still need to do a delwri write of the inode after
+		 * this to flush it to the backing buffer so that bulkstat
+		 * works properly if this is the first time the inode has been
+		 * written.  Because we hold the ilock atomically over the
+		 * transaction commit and the inode flush we are guaranteed
+		 * that the inode is not pinned when it returns. If the flush
+		 * lock is already held, then the inode has already been
+		 * flushed once and we don't need to flush it again.  Hence
+		 * the code will only flush the inode if it isn't already
+		 * being flushed.
+		 */
+		xfs_ioend_wait(ip);
+		xfs_ilock(ip, XFS_ILOCK_SHARED);
+		if (ip->i_update_core) {
+			error = xfs_log_inode(ip);
+			if (error)
+				goto out_unlock;
+		}
+	} else {
+		/*
+		 * We make this non-blocking if the inode is contended, return
+		 * EAGAIN to indicate to the caller that they did not succeed.
+		 * This prevents the flush path from blocking on inodes inside
+		 * another operation right now, they get caught later by xfs_sync.
+		 */
+		if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))
 			goto out;
 	}
 
-	/*
-	 * Bypass inodes which have already been cleaned by
-	 * the inode flush clustering code inside xfs_iflush
-	 */
-	if (xfs_inode_clean(ip))
-		goto out;
+	if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip))
+		goto out_unlock;
 
 	/*
-	 * We make this non-blocking if the inode is contended, return
-	 * EAGAIN to indicate to the caller that they did not succeed.
-	 * This prevents the flush path from blocking on inodes inside
-	 * another operation right now, they get caught later by xfs_sync.
+	 * Now we have the flush lock and the inode is not pinned, we can check
+	 * if the inode is really clean as we know that there are no pending
+	 * transaction completions, it is not waiting on the delayed write
+	 * queue and there is no IO in progress.
 	 */
-	if (sync) {
-		xfs_ilock(ip, XFS_ILOCK_SHARED);
-		xfs_iflock(ip);
-
-		error = xfs_iflush(ip, XFS_IFLUSH_SYNC);
-	} else {
-		error = EAGAIN;
-		if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))
-			goto out;
-		if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip))
-			goto out_unlock;
-
-		error = xfs_iflush(ip, XFS_IFLUSH_ASYNC_NOBLOCK);
+	if (xfs_inode_clean(ip)) {
+		xfs_ifunlock(ip);
+		error = 0;
+		goto out_unlock;
 	}
+	error = xfs_iflush(ip, 0);
 
  out_unlock:
 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
@@ -1160,6 +1222,7 @@ xfs_fs_put_super(
 
 	xfs_unmountfs(mp);
 	xfs_freesb(mp);
+	xfs_inode_shrinker_unregister(mp);
 	xfs_icsb_destroy_counters(mp);
 	xfs_close_devices(mp);
 	xfs_dmops_put(mp);
@@ -1257,6 +1320,29 @@ xfs_fs_statfs(
 	return 0;
 }
 
+STATIC void
+xfs_save_resvblks(struct xfs_mount *mp)
+{
+	__uint64_t resblks = 0;
+
+	mp->m_resblks_save = mp->m_resblks;
+	xfs_reserve_blocks(mp, &resblks, NULL);
+}
+
+STATIC void
+xfs_restore_resvblks(struct xfs_mount *mp)
+{
+	__uint64_t resblks;
+
+	if (mp->m_resblks_save) {
+		resblks = mp->m_resblks_save;
+		mp->m_resblks_save = 0;
+	} else
+		resblks = xfs_default_resblks(mp);
+
+	xfs_reserve_blocks(mp, &resblks, NULL);
+}
+
 STATIC int
 xfs_fs_remount(
 	struct super_block	*sb,
@@ -1336,11 +1422,27 @@ xfs_fs_remount(
 			}
 			mp->m_update_flags = 0;
 		}
+
+		/*
+		 * Fill out the reserve pool if it is empty. Use the stashed
+		 * value if it is non-zero, otherwise go with the default.
+		 */
+		xfs_restore_resvblks(mp);
 	}
 
 	/* rw -> ro */
 	if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) {
+		/*
+		 * After we have synced the data but before we sync the
+		 * metadata, we need to free up the reserve block pool so that
+		 * the used block count in the superblock on disk is correct at
+		 * the end of the remount. Stash the current reserve pool size
+		 * so that if we get remounted rw, we can return it to the same
+		 * size.
+		 */
+
 		xfs_quiesce_data(mp);
+		xfs_save_resvblks(mp);
 		xfs_quiesce_attr(mp);
 		mp->m_flags |= XFS_MOUNT_RDONLY;
 	}
@@ -1359,11 +1461,22 @@ xfs_fs_freeze(
 {
 	struct xfs_mount	*mp = XFS_M(sb);
 
+	xfs_save_resvblks(mp);
 	xfs_quiesce_attr(mp);
 	return -xfs_fs_log_dummy(mp);
 }
 
 STATIC int
+xfs_fs_unfreeze(
+	struct super_block	*sb)
+{
+	struct xfs_mount	*mp = XFS_M(sb);
+
+	xfs_restore_resvblks(mp);
+	return 0;
+}
+
+STATIC int
 xfs_fs_show_options(
 	struct seq_file		*m,
 	struct vfsmount		*mnt)
@@ -1523,6 +1636,8 @@ xfs_fs_fill_super(
 	if (error)
 		goto fail_vnrele;
 
+	xfs_inode_shrinker_register(mp);
+
 	kfree(mtpt);
 	return 0;
 
@@ -1585,6 +1700,7 @@ static const struct super_operations xfs_super_operations = {
 	.put_super		= xfs_fs_put_super,
 	.sync_fs		= xfs_fs_sync_fs,
 	.freeze_fs		= xfs_fs_freeze,
+	.unfreeze_fs		= xfs_fs_unfreeze,
 	.statfs			= xfs_fs_statfs,
 	.remount_fs		= xfs_fs_remount,
 	.show_options		= xfs_fs_show_options,
@@ -1649,7 +1765,7 @@ xfs_init_zones(void)
 	 * but it is much faster.
 	 */
 	xfs_buf_item_zone = kmem_zone_init((sizeof(xfs_buf_log_item_t) +
-				(((XFS_MAX_BLOCKSIZE / XFS_BLI_CHUNK) /
+				(((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) /
 				  NBWORD) * sizeof(int))), "xfs_buf_item");
 	if (!xfs_buf_item_zone)
 		goto out_destroy_trans_zone;
diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h
index 233d4b9881b1..519618e9279e 100644
--- a/fs/xfs/linux-2.6/xfs_super.h
+++ b/fs/xfs/linux-2.6/xfs_super.h
@@ -85,7 +85,7 @@ extern __uint64_t xfs_max_file_offset(unsigned int);
 extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
 
 extern const struct export_operations xfs_export_operations;
-extern struct xattr_handler *xfs_xattr_handlers[];
+extern const struct xattr_handler *xfs_xattr_handlers[];
 extern const struct quotactl_ops xfs_quotactl_operations;
 
 #define XFS_M(sb)		((struct xfs_mount *)((sb)->s_fs_info))
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 1f5e4bb5e970..a51a07c3a70c 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -90,14 +90,14 @@ xfs_inode_ag_lookup(
 STATIC int
 xfs_inode_ag_walk(
 	struct xfs_mount	*mp,
-	xfs_agnumber_t		ag,
+	struct xfs_perag	*pag,
 	int			(*execute)(struct xfs_inode *ip,
 					   struct xfs_perag *pag, int flags),
 	int			flags,
 	int			tag,
-	int			exclusive)
+	int			exclusive,
+	int			*nr_to_scan)
 {
-	struct xfs_perag	*pag = &mp->m_perag[ag];
 	uint32_t		first_index;
 	int			last_error = 0;
 	int			skipped;
@@ -135,17 +135,50 @@ restart:
 		if (error == EFSCORRUPTED)
 			break;
 
-	} while (1);
+	} while ((*nr_to_scan)--);
 
 	if (skipped) {
 		delay(1);
 		goto restart;
 	}
-
-	xfs_put_perag(mp, pag);
 	return last_error;
 }
 
+/*
+ * Select the next per-ag structure to iterate during the walk. The reclaim
+ * walk is optimised only to walk AGs with reclaimable inodes in them.
+ */
+static struct xfs_perag *
+xfs_inode_ag_iter_next_pag(
+	struct xfs_mount	*mp,
+	xfs_agnumber_t		*first,
+	int			tag)
+{
+	struct xfs_perag	*pag = NULL;
+
+	if (tag == XFS_ICI_RECLAIM_TAG) {
+		int found;
+		int ref;
+
+		spin_lock(&mp->m_perag_lock);
+		found = radix_tree_gang_lookup_tag(&mp->m_perag_tree,
+				(void **)&pag, *first, 1, tag);
+		if (found <= 0) {
+			spin_unlock(&mp->m_perag_lock);
+			return NULL;
+		}
+		*first = pag->pag_agno + 1;
+		/* open coded pag reference increment */
+		ref = atomic_inc_return(&pag->pag_ref);
+		spin_unlock(&mp->m_perag_lock);
+		trace_xfs_perag_get_reclaim(mp, pag->pag_agno, ref, _RET_IP_);
+	} else {
+		pag = xfs_perag_get(mp, *first);
+		(*first)++;
+	}
+	return pag;
+}
+
 int
 xfs_inode_ag_iterator(
 	struct xfs_mount	*mp,
@@ -153,23 +186,31 @@ xfs_inode_ag_iterator(
 					   struct xfs_perag *pag, int flags),
 	int			flags,
 	int			tag,
-	int			exclusive)
+	int			exclusive,
+	int			*nr_to_scan)
 {
+	struct xfs_perag	*pag;
 	int			error = 0;
 	int			last_error = 0;
 	xfs_agnumber_t		ag;
-
-	for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
-		if (!mp->m_perag[ag].pag_ici_init)
-			continue;
-		error = xfs_inode_ag_walk(mp, ag, execute, flags, tag,
-						exclusive);
+	int			nr;
+
+	nr = nr_to_scan ? *nr_to_scan : INT_MAX;
+	ag = 0;
+	while ((pag = xfs_inode_ag_iter_next_pag(mp, &ag, tag))) {
+		error = xfs_inode_ag_walk(mp, pag, execute, flags, tag,
+						exclusive, &nr);
+		xfs_perag_put(pag);
 		if (error) {
 			last_error = error;
 			if (error == EFSCORRUPTED)
 				break;
 		}
+		if (nr <= 0)
+			break;
 	}
+	if (nr_to_scan)
+		*nr_to_scan = nr;
 	return XFS_ERROR(last_error);
 }
 
@@ -231,7 +272,7 @@ xfs_sync_inode_data(
 	}
 
 	error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ?
-				0 : XFS_B_ASYNC, FI_NONE);
+				0 : XBF_ASYNC, FI_NONE);
 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 
  out_wait:
@@ -267,8 +308,7 @@ xfs_sync_inode_attr(
 		goto out_unlock;
 	}
 
-	error = xfs_iflush(ip, (flags & SYNC_WAIT) ?
-			   XFS_IFLUSH_SYNC : XFS_IFLUSH_DELWRI);
+	error = xfs_iflush(ip, flags);
 
  out_unlock:
 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
@@ -289,14 +329,11 @@ xfs_sync_data(
 	ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0);
 
 	error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags,
-				      XFS_ICI_NO_TAG, 0);
+				      XFS_ICI_NO_TAG, 0, NULL);
 	if (error)
 		return XFS_ERROR(error);
 
-	xfs_log_force(mp, 0,
-		      (flags & SYNC_WAIT) ?
-		       XFS_LOG_FORCE | XFS_LOG_SYNC :
-		       XFS_LOG_FORCE);
+	xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0);
 	return 0;
 }
 
@@ -311,7 +348,7 @@ xfs_sync_attr(
 	ASSERT((flags & ~SYNC_WAIT) == 0);
 
 	return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags,
-				     XFS_ICI_NO_TAG, 0);
+				     XFS_ICI_NO_TAG, 0, NULL);
 }
 
 STATIC int
@@ -322,10 +359,6 @@ xfs_commit_dummy_trans(
 	struct xfs_inode	*ip = mp->m_rootip;
 	struct xfs_trans	*tp;
 	int			error;
-	int			log_flags = XFS_LOG_FORCE;
-
-	if (flags & SYNC_WAIT)
-		log_flags |= XFS_LOG_SYNC;
 
 	/*
 	 * Put a dummy transaction in the log to tell recovery
@@ -347,74 +380,29 @@ xfs_commit_dummy_trans(
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 
 	/* the log force ensures this transaction is pushed to disk */
-	xfs_log_force(mp, 0, log_flags);
+	xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0);
 	return error;
 }
 
-int
+STATIC int
 xfs_sync_fsdata(
-	struct xfs_mount	*mp,
-	int			flags)
+	struct xfs_mount	*mp)
 {
 	struct xfs_buf		*bp;
-	struct xfs_buf_log_item	*bip;
-	int			error = 0;
 
 	/*
-	 * If this is xfssyncd() then only sync the superblock if we can
-	 * lock it without sleeping and it is not pinned.
+	 * If the buffer is pinned then push on the log so we won't get stuck
+	 * waiting in the write for someone, maybe ourselves, to flush the log.
+	 *
+	 * Even though we just pushed the log above, we did not have the
+	 * superblock buffer locked at that point so it can become pinned in
+	 * between there and here.
 	 */
-	if (flags & SYNC_TRYLOCK) {
-		ASSERT(!(flags & SYNC_WAIT));
-
-		bp = xfs_getsb(mp, XFS_BUF_TRYLOCK);
-		if (!bp)
-			goto out;
-
-		bip = XFS_BUF_FSPRIVATE(bp, struct xfs_buf_log_item *);
-		if (!bip || !xfs_buf_item_dirty(bip) || XFS_BUF_ISPINNED(bp))
-			goto out_brelse;
-	} else {
-		bp = xfs_getsb(mp, 0);
+	bp = xfs_getsb(mp, 0);
+	if (XFS_BUF_ISPINNED(bp))
+		xfs_log_force(mp, 0);
 
-		/*
-		 * If the buffer is pinned then push on the log so we won't
-		 * get stuck waiting in the write for someone, maybe
-		 * ourselves, to flush the log.
-		 *
-		 * Even though we just pushed the log above, we did not have
-		 * the superblock buffer locked at that point so it can
-		 * become pinned in between there and here.
-		 */
-		if (XFS_BUF_ISPINNED(bp))
-			xfs_log_force(mp, 0, XFS_LOG_FORCE);
-	}
-
-
-	if (flags & SYNC_WAIT)
-		XFS_BUF_UNASYNC(bp);
-	else
-		XFS_BUF_ASYNC(bp);
-
-	error = xfs_bwrite(mp, bp);
-	if (error)
-		return error;
-
-	/*
-	 * If this is a data integrity sync make sure all pending buffers
-	 * are flushed out for the log coverage check below.
-	 */
-	if (flags & SYNC_WAIT)
-		xfs_flush_buftarg(mp->m_ddev_targp, 1);
-
-	if (xfs_log_need_covered(mp))
-		error = xfs_commit_dummy_trans(mp, flags);
-	return error;
-
- out_brelse:
-	xfs_buf_relse(bp);
- out:
-	return error;
+	return xfs_bwrite(mp, bp);
 }
 
 /*
@@ -438,7 +426,7 @@ int
 xfs_quiesce_data(
 	struct xfs_mount	*mp)
 {
-	int error;
+	int			error, error2 = 0;
 
 	/* push non-blocking */
 	xfs_sync_data(mp, 0);
@@ -448,17 +436,21 @@ xfs_quiesce_data(
 	xfs_sync_data(mp, SYNC_WAIT);
 	xfs_qm_sync(mp, SYNC_WAIT);
 
-	/* drop inode references pinned by filestreams */
-	xfs_filestream_flush(mp);
-
 	/* write superblock and hoover up shutdown errors */
-	error = xfs_sync_fsdata(mp, SYNC_WAIT);
+	error = xfs_sync_fsdata(mp);
+
+	/* make sure all delwri buffers are written out */
+	xfs_flush_buftarg(mp->m_ddev_targp, 1);
+
+	/* mark the log as covered if needed */
+	if (xfs_log_need_covered(mp))
+		error2 = xfs_commit_dummy_trans(mp, SYNC_WAIT);
 
 	/* flush data-only devices */
 	if (mp->m_rtdev_targp)
 		XFS_bflush(mp->m_rtdev_targp);
 
-	return error;
+	return error ? error : error2;
 }
 
 STATIC void
@@ -467,16 +459,18 @@ xfs_quiesce_fs(
 {
 	int	count = 0, pincount;
 
+	xfs_reclaim_inodes(mp, 0);
 	xfs_flush_buftarg(mp->m_ddev_targp, 0);
-	xfs_reclaim_inodes(mp, XFS_IFLUSH_DELWRI_ELSE_ASYNC);
 
 	/*
 	 * This loop must run at least twice.  The first instance of the loop
 	 * will flush most meta data but that will generate more meta data
 	 * (typically directory updates).  Which then must be flushed and
-	 * logged before we can write the unmount record.
+	 * logged before we can write the unmount record. We also so sync
+	 * reclaim of inodes to catch any that the above delwri flush skipped.
 	 */
 	do {
+		xfs_reclaim_inodes(mp, SYNC_WAIT);
 		xfs_sync_attr(mp, SYNC_WAIT);
 		pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1);
 		if (!pincount) {
@@ -575,13 +569,13 @@ xfs_flush_inodes(
 	igrab(inode);
 	xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inodes_work, &completion);
 	wait_for_completion(&completion);
-	xfs_log_force(ip->i_mount, (xfs_lsn_t)0, XFS_LOG_FORCE|XFS_LOG_SYNC);
+	xfs_log_force(ip->i_mount, XFS_LOG_SYNC);
 }
 
 /*
- * Every sync period we need to unpin all items, reclaim inodes, sync
- * quota and write out the superblock. We might need to cover the log
- * to indicate it is idle.
+ * Every sync period we need to unpin all items, reclaim inodes and sync
+ * disk quotas.  We might need to cover the log to indicate that the
+ * filesystem is idle.
  */
 STATIC void
 xfs_sync_worker(
@@ -591,11 +585,12 @@ xfs_sync_worker(
 	int		error;
 
 	if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
-		xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
-		xfs_reclaim_inodes(mp, XFS_IFLUSH_DELWRI_ELSE_ASYNC);
+		xfs_log_force(mp, 0);
+		xfs_reclaim_inodes(mp, 0);
 		/* dgc: errors ignored here */
 		error = xfs_qm_sync(mp, SYNC_TRYLOCK);
-		error = xfs_sync_fsdata(mp, SYNC_TRYLOCK);
+		if (xfs_log_need_covered(mp))
+			error = xfs_commit_dummy_trans(mp, 0);
 	}
 	mp->m_sync_seq++;
 	wake_up(&mp->m_wait_single_sync_task);
@@ -613,7 +608,8 @@ xfssyncd(
 	set_freezable();
 	timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10);
 	for (;;) {
-		timeleft = schedule_timeout_interruptible(timeleft);
+		if (list_empty(&mp->m_sync_list))
+			timeleft = schedule_timeout_interruptible(timeleft);
 		/* swsusp */
 		try_to_freeze();
 		if (kthread_should_stop() && list_empty(&mp->m_sync_list))
@@ -633,8 +629,7 @@ xfssyncd(
 			list_add_tail(&mp->m_sync_work.w_list,
 					&mp->m_sync_list);
 		}
-		list_for_each_entry_safe(work, n, &mp->m_sync_list, w_list)
-			list_move(&work->w_list, &tmp);
+		list_splice_init(&mp->m_sync_list, &tmp);
 		spin_unlock(&mp->m_sync_lock);
 
 		list_for_each_entry_safe(work, n, &tmp, w_list) {
@@ -658,7 +653,7 @@ xfs_syncd_init(
 	mp->m_sync_work.w_syncer = xfs_sync_worker;
 	mp->m_sync_work.w_mount = mp;
 	mp->m_sync_work.w_completion = NULL;
-	mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd");
+	mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd/%s", mp->m_fsname);
 	if (IS_ERR(mp->m_sync_task))
 		return -PTR_ERR(mp->m_sync_task);
 	return 0;
@@ -679,6 +674,18 @@ __xfs_inode_set_reclaim_tag(
 	radix_tree_tag_set(&pag->pag_ici_root,
 			   XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
 			   XFS_ICI_RECLAIM_TAG);
+
+	if (!pag->pag_ici_reclaimable) {
+		/* propagate the reclaim tag up into the perag radix tree */
+		spin_lock(&ip->i_mount->m_perag_lock);
+		radix_tree_tag_set(&ip->i_mount->m_perag_tree,
+				XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
+				XFS_ICI_RECLAIM_TAG);
+		spin_unlock(&ip->i_mount->m_perag_lock);
+		trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno,
+							-1, _RET_IP_);
+	}
+	pag->pag_ici_reclaimable++;
 }
 
 /*
@@ -690,16 +697,17 @@ void
 xfs_inode_set_reclaim_tag(
 	xfs_inode_t	*ip)
 {
-	xfs_mount_t	*mp = ip->i_mount;
-	xfs_perag_t	*pag = xfs_get_perag(mp, ip->i_ino);
+	struct xfs_mount *mp = ip->i_mount;
+	struct xfs_perag *pag;
 
-	read_lock(&pag->pag_ici_lock);
+	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
+	write_lock(&pag->pag_ici_lock);
 	spin_lock(&ip->i_flags_lock);
 	__xfs_inode_set_reclaim_tag(pag, ip);
 	__xfs_iflags_set(ip, XFS_IRECLAIMABLE);
 	spin_unlock(&ip->i_flags_lock);
-	read_unlock(&pag->pag_ici_lock);
-	xfs_put_perag(mp, pag);
+	write_unlock(&pag->pag_ici_lock);
+	xfs_perag_put(pag);
 }
 
 void
@@ -710,14 +718,77 @@ __xfs_inode_clear_reclaim_tag(
 {
 	radix_tree_tag_clear(&pag->pag_ici_root,
 			XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
+	pag->pag_ici_reclaimable--;
+	if (!pag->pag_ici_reclaimable) {
+		/* clear the reclaim tag from the perag radix tree */
+		spin_lock(&ip->i_mount->m_perag_lock);
+		radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
+				XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
+				XFS_ICI_RECLAIM_TAG);
+		spin_unlock(&ip->i_mount->m_perag_lock);
+		trace_xfs_perag_clear_reclaim(ip->i_mount, pag->pag_agno,
+							-1, _RET_IP_);
+	}
 }
 
+/*
+ * Inodes in different states need to be treated differently, and the return
+ * value of xfs_iflush is not sufficient to get this right. The following table
+ * lists the inode states and the reclaim actions necessary for non-blocking
+ * reclaim:
+ *
+ *
+ *	inode state	     iflush ret		required action
+ *      ---------------      ----------         ---------------
+ *	bad			-		reclaim
+ *	shutdown		EIO		unpin and reclaim
+ *	clean, unpinned		0		reclaim
+ *	stale, unpinned		0		reclaim
+ *	clean, pinned(*)	0		requeue
+ *	stale, pinned		EAGAIN		requeue
+ *	dirty, delwri ok	0		requeue
+ *	dirty, delwri blocked	EAGAIN		requeue
+ *	dirty, sync flush	0		reclaim
+ *
+ * (*) dgc: I don't think the clean, pinned state is possible but it gets
+ * handled anyway given the order of checks implemented.
+ *
+ * As can be seen from the table, the return value of xfs_iflush() is not
+ * sufficient to correctly decide the reclaim action here. The checks in
+ * xfs_iflush() might look like duplicates, but they are not.
+ *
+ * Also, because we get the flush lock first, we know that any inode that has
+ * been flushed delwri has had the flush completed by the time we check that
+ * the inode is clean. The clean inode check needs to be done before flushing
+ * the inode delwri otherwise we would loop forever requeuing clean inodes as
+ * we cannot tell apart a successful delwri flush and a clean inode from the
+ * return value of xfs_iflush().
+ *
+ * Note that because the inode is flushed delayed write by background
+ * writeback, the flush lock may already be held here and waiting on it can
+ * result in very long latencies. Hence for sync reclaims, where we wait on the
+ * flush lock, the caller should push out delayed write inodes first before
+ * trying to reclaim them to minimise the amount of time spent waiting. For
+ * background relaim, we just requeue the inode for the next pass.
+ *
+ * Hence the order of actions after gaining the locks should be:
+ *	bad		=> reclaim
+ *	shutdown	=> unpin and reclaim
+ *	pinned, delwri	=> requeue
+ *	pinned, sync	=> unpin
+ *	stale		=> reclaim
+ *	clean		=> reclaim
+ *	dirty, delwri	=> flush and requeue
+ *	dirty, sync	=> flush, wait and reclaim
+ */
 STATIC int
 xfs_reclaim_inode(
 	struct xfs_inode	*ip,
 	struct xfs_perag	*pag,
 	int			sync_mode)
 {
+	int	error = 0;
+
 	/*
 	 * The radix tree lock here protects a thread in xfs_iget from racing
 	 * with us starting reclaim on the inode.  Once we have the
@@ -735,33 +806,70 @@ xfs_reclaim_inode(
 	spin_unlock(&ip->i_flags_lock);
 	write_unlock(&pag->pag_ici_lock);
 
-	/*
-	 * If the inode is still dirty, then flush it out.  If the inode
-	 * is not in the AIL, then it will be OK to flush it delwri as
-	 * long as xfs_iflush() does not keep any references to the inode.
-	 * We leave that decision up to xfs_iflush() since it has the
-	 * knowledge of whether it's OK to simply do a delwri flush of
-	 * the inode or whether we need to wait until the inode is
-	 * pulled from the AIL.
-	 * We get the flush lock regardless, though, just to make sure
-	 * we don't free it while it is being flushed.
-	 */
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	xfs_iflock(ip);
+	if (!xfs_iflock_nowait(ip)) {
+		if (!(sync_mode & SYNC_WAIT))
+			goto out;
+		xfs_iflock(ip);
+	}
+
+	if (is_bad_inode(VFS_I(ip)))
+		goto reclaim;
+	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+		xfs_iunpin_wait(ip);
+		goto reclaim;
+	}
+	if (xfs_ipincount(ip)) {
+		if (!(sync_mode & SYNC_WAIT)) {
+			xfs_ifunlock(ip);
+			goto out;
+		}
+		xfs_iunpin_wait(ip);
+	}
+	if (xfs_iflags_test(ip, XFS_ISTALE))
+		goto reclaim;
+	if (xfs_inode_clean(ip))
+		goto reclaim;
+
+	/* Now we have an inode that needs flushing */
+	error = xfs_iflush(ip, sync_mode);
+	if (sync_mode & SYNC_WAIT) {
+		xfs_iflock(ip);
+		goto reclaim;
+	}
 
 	/*
-	 * In the case of a forced shutdown we rely on xfs_iflush() to
-	 * wait for the inode to be unpinned before returning an error.
+	 * When we have to flush an inode but don't have SYNC_WAIT set, we
+	 * flush the inode out using a delwri buffer and wait for the next
+	 * call into reclaim to find it in a clean state instead of waiting for
+	 * it now. We also don't return errors here - if the error is transient
+	 * then the next reclaim pass will flush the inode, and if the error
+	 * is permanent then the next sync reclaim will reclaim the inode and
+	 * pass on the error.
 	 */
-	if (!is_bad_inode(VFS_I(ip)) && xfs_iflush(ip, sync_mode) == 0) {
-		/* synchronize with xfs_iflush_done */
-		xfs_iflock(ip);
-		xfs_ifunlock(ip);
+	if (error && error != EAGAIN && !XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+		xfs_fs_cmn_err(CE_WARN, ip->i_mount,
+			"inode 0x%llx background reclaim flush failed with %d",
+			(long long)ip->i_ino, error);
 	}
+out:
+	xfs_iflags_clear(ip, XFS_IRECLAIM);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	/*
+	 * We could return EAGAIN here to make reclaim rescan the inode tree in
+	 * a short while. However, this just burns CPU time scanning the tree
+	 * waiting for IO to complete and xfssyncd never goes back to the idle
+	 * state. Instead, return 0 to let the next scheduled background reclaim
+	 * attempt to reclaim the inode again.
+	 */
+	return 0;
 
+reclaim:
+	xfs_ifunlock(ip);
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	xfs_ireclaim(ip);
-	return 0;
+	return error;
+
 }
 
 int
@@ -770,5 +878,57 @@ xfs_reclaim_inodes(
 	int		mode)
 {
 	return xfs_inode_ag_iterator(mp, xfs_reclaim_inode, mode,
-					XFS_ICI_RECLAIM_TAG, 1);
+					XFS_ICI_RECLAIM_TAG, 1, NULL);
+}
+
+/*
+ * Shrinker infrastructure.
+ */
+static int
+xfs_reclaim_inode_shrink(
+	struct shrinker	*shrink,
+	int		nr_to_scan,
+	gfp_t		gfp_mask)
+{
+	struct xfs_mount *mp;
+	struct xfs_perag *pag;
+	xfs_agnumber_t	ag;
+	int		reclaimable;
+
+	mp = container_of(shrink, struct xfs_mount, m_inode_shrink);
+	if (nr_to_scan) {
+		if (!(gfp_mask & __GFP_FS))
+			return -1;
+
+		xfs_inode_ag_iterator(mp, xfs_reclaim_inode, 0,
+					XFS_ICI_RECLAIM_TAG, 1, &nr_to_scan);
+		/* if we don't exhaust the scan, don't bother coming back */
+		if (nr_to_scan > 0)
+			return -1;
+       }
+
+	reclaimable = 0;
+	ag = 0;
+	while ((pag = xfs_inode_ag_iter_next_pag(mp, &ag,
+					XFS_ICI_RECLAIM_TAG))) {
+		reclaimable += pag->pag_ici_reclaimable;
+		xfs_perag_put(pag);
+	}
+	return reclaimable;
+}
+
+void
+xfs_inode_shrinker_register(
+	struct xfs_mount	*mp)
+{
+	mp->m_inode_shrink.shrink = xfs_reclaim_inode_shrink;
+	mp->m_inode_shrink.seeks = DEFAULT_SEEKS;
+	register_shrinker(&mp->m_inode_shrink);
+}
+
+void
+xfs_inode_shrinker_unregister(
+	struct xfs_mount	*mp)
+{
+	unregister_shrinker(&mp->m_inode_shrink);
 }
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index ea932b43335d..e28139aaa4aa 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -37,7 +37,6 @@ void xfs_syncd_stop(struct xfs_mount *mp);
 
 int xfs_sync_attr(struct xfs_mount *mp, int flags);
 int xfs_sync_data(struct xfs_mount *mp, int flags);
-int xfs_sync_fsdata(struct xfs_mount *mp, int flags);
 
 int xfs_quiesce_data(struct xfs_mount *mp);
 void xfs_quiesce_attr(struct xfs_mount *mp);
@@ -54,6 +53,9 @@ void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
 int xfs_sync_inode_valid(struct xfs_inode *ip, struct xfs_perag *pag);
 int xfs_inode_ag_iterator(struct xfs_mount *mp,
 	int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags),
-	int flags, int tag, int write_lock);
+	int flags, int tag, int write_lock, int *nr_to_scan);
+
+void xfs_inode_shrinker_register(struct xfs_mount *mp);
+void xfs_inode_shrinker_unregister(struct xfs_mount *mp);
 
 #endif
diff --git a/fs/xfs/linux-2.6/xfs_trace.c b/fs/xfs/linux-2.6/xfs_trace.c
index 856eb3c8d605..d12be8470cba 100644
--- a/fs/xfs/linux-2.6/xfs_trace.c
+++ b/fs/xfs/linux-2.6/xfs_trace.c
@@ -41,7 +41,6 @@
 #include "xfs_alloc.h"
 #include "xfs_bmap.h"
 #include "xfs_attr.h"
-#include "xfs_attr_sf.h"
 #include "xfs_attr_leaf.h"
 #include "xfs_log_priv.h"
 #include "xfs_buf_item.h"
@@ -50,22 +49,8 @@
 #include "xfs_aops.h"
 #include "quota/xfs_dquot_item.h"
 #include "quota/xfs_dquot.h"
-
-/*
- * Format fsblock number into a static buffer & return it.
- */
-STATIC char *xfs_fmtfsblock(xfs_fsblock_t bno)
-{
-	static char rval[50];
-
-	if (bno == NULLFSBLOCK)
-		sprintf(rval, "NULLFSBLOCK");
-	else if (isnullstartblock(bno))
-		sprintf(rval, "NULLSTARTBLOCK(%lld)", startblockval(bno));
-	else
-		sprintf(rval, "%lld", (xfs_dfsbno_t)bno);
-	return rval;
-}
+#include "xfs_log_recover.h"
+#include "xfs_inode_item.h"
 
 /*
  * We include this last to have the helpers above available for the trace
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index c22a608321a3..302820690904 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -32,6 +32,10 @@ struct xfs_da_node_entry;
 struct xfs_dquot;
 struct xlog_ticket;
 struct log;
+struct xlog_recover;
+struct xlog_recover_item;
+struct xfs_buf_log_format;
+struct xfs_inode_log_format;
 
 DECLARE_EVENT_CLASS(xfs_attr_list_class,
 	TP_PROTO(struct xfs_attr_list_context *ctx),
@@ -91,6 +95,40 @@ DEFINE_ATTR_LIST_EVENT(xfs_attr_list_add);
 DEFINE_ATTR_LIST_EVENT(xfs_attr_list_wrong_blk);
 DEFINE_ATTR_LIST_EVENT(xfs_attr_list_notfound);
 
+DECLARE_EVENT_CLASS(xfs_perag_class,
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount,
+		 unsigned long caller_ip),
+	TP_ARGS(mp, agno, refcount, caller_ip),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(int, refcount)
+		__field(unsigned long, caller_ip)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->agno = agno;
+		__entry->refcount = refcount;
+		__entry->caller_ip = caller_ip;
+	),
+	TP_printk("dev %d:%d agno %u refcount %d caller %pf",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->agno,
+		  __entry->refcount,
+		  (char *)__entry->caller_ip)
+);
+
+#define DEFINE_PERAG_REF_EVENT(name)	\
+DEFINE_EVENT(xfs_perag_class, name,	\
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount,	\
+		 unsigned long caller_ip),					\
+	TP_ARGS(mp, agno, refcount, caller_ip))
+DEFINE_PERAG_REF_EVENT(xfs_perag_get);
+DEFINE_PERAG_REF_EVENT(xfs_perag_get_reclaim);
+DEFINE_PERAG_REF_EVENT(xfs_perag_put);
+DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim);
+DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim);
+
 TRACE_EVENT(xfs_attr_list_node_descend,
 	TP_PROTO(struct xfs_attr_list_context *ctx,
 		 struct xfs_da_node_entry *btree),
@@ -170,13 +208,13 @@ TRACE_EVENT(xfs_iext_insert,
 		__entry->caller_ip = caller_ip;
 	),
 	TP_printk("dev %d:%d ino 0x%llx state %s idx %ld "
-		  "offset %lld block %s count %lld flag %d caller %pf",
+		  "offset %lld block %lld count %lld flag %d caller %pf",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->ino,
 		  __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS),
 		  (long)__entry->idx,
 		  __entry->startoff,
-		  xfs_fmtfsblock(__entry->startblock),
+		  (__int64_t)__entry->startblock,
 		  __entry->blockcount,
 		  __entry->state,
 		  (char *)__entry->caller_ip)
@@ -214,13 +252,13 @@ DECLARE_EVENT_CLASS(xfs_bmap_class,
 		__entry->caller_ip = caller_ip;
 	),
 	TP_printk("dev %d:%d ino 0x%llx state %s idx %ld "
-		  "offset %lld block %s count %lld flag %d caller %pf",
+		  "offset %lld block %lld count %lld flag %d caller %pf",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->ino,
 		  __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS),
 		  (long)__entry->idx,
 		  __entry->startoff,
-		  xfs_fmtfsblock(__entry->startblock),
+		  (__int64_t)__entry->startblock,
 		  __entry->blockcount,
 		  __entry->state,
 		  (char *)__entry->caller_ip)
@@ -456,6 +494,7 @@ DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock);
 DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock_stale);
 DEFINE_BUF_ITEM_EVENT(xfs_buf_item_committed);
 DEFINE_BUF_ITEM_EVENT(xfs_buf_item_push);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pushbuf);
 DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf);
 DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf_recur);
 DEFINE_BUF_ITEM_EVENT(xfs_trans_getsb);
@@ -534,18 +573,21 @@ DECLARE_EVENT_CLASS(xfs_inode_class,
 		__field(dev_t, dev)
 		__field(xfs_ino_t, ino)
 		__field(int, count)
+		__field(int, pincount)
 		__field(unsigned long, caller_ip)
 	),
 	TP_fast_assign(
 		__entry->dev = VFS_I(ip)->i_sb->s_dev;
 		__entry->ino = ip->i_ino;
 		__entry->count = atomic_read(&VFS_I(ip)->i_count);
+		__entry->pincount = atomic_read(&ip->i_pincount);
 		__entry->caller_ip = caller_ip;
 	),
-	TP_printk("dev %d:%d ino 0x%llx count %d caller %pf",
+	TP_printk("dev %d:%d ino 0x%llx count %d pincount %d caller %pf",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->ino,
 		  __entry->count,
+		  __entry->pincount,
 		  (char *)__entry->caller_ip)
 )
 
@@ -555,6 +597,10 @@ DEFINE_EVENT(xfs_inode_class, name, \
 	TP_ARGS(ip, caller_ip))
 DEFINE_INODE_EVENT(xfs_ihold);
 DEFINE_INODE_EVENT(xfs_irele);
+DEFINE_INODE_EVENT(xfs_inode_pin);
+DEFINE_INODE_EVENT(xfs_inode_unpin);
+DEFINE_INODE_EVENT(xfs_inode_unpin_nowait);
+
 /* the old xfs_itrace_entry tracer - to be replaced by s.th. in the VFS */
 DEFINE_INODE_EVENT(xfs_inode);
 #define xfs_itrace_entry(ip)    \
@@ -565,7 +611,7 @@ DECLARE_EVENT_CLASS(xfs_dquot_class,
 	TP_ARGS(dqp),
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
-		__field(__be32, id)
+		__field(u32, id)
 		__field(unsigned, flags)
 		__field(unsigned, nrefs)
 		__field(unsigned long long, res_bcount)
@@ -578,7 +624,7 @@ DECLARE_EVENT_CLASS(xfs_dquot_class,
 	), \
 	TP_fast_assign(
 		__entry->dev = dqp->q_mount->m_super->s_dev;
-		__entry->id = dqp->q_core.d_id;
+		__entry->id = be32_to_cpu(dqp->q_core.d_id);
 		__entry->flags = dqp->dq_flags;
 		__entry->nrefs = dqp->q_nrefs;
 		__entry->res_bcount = dqp->q_res_bcount;
@@ -594,10 +640,10 @@ DECLARE_EVENT_CLASS(xfs_dquot_class,
 			be64_to_cpu(dqp->q_core.d_ino_softlimit);
 	),
 	TP_printk("dev %d:%d id 0x%x flags %s nrefs %u res_bc 0x%llx "
-		  "bcnt 0x%llx [hard 0x%llx | soft 0x%llx] "
-		  "icnt 0x%llx [hard 0x%llx | soft 0x%llx]",
+		  "bcnt 0x%llx bhardlimit 0x%llx bsoftlimit 0x%llx "
+		  "icnt 0x%llx ihardlimit 0x%llx isoftlimit 0x%llx]",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  be32_to_cpu(__entry->id),
+		  __entry->id,
 		  __print_flags(__entry->flags, "|", XFS_DQ_FLAGS),
 		  __entry->nrefs,
 		  __entry->res_bcount,
@@ -614,8 +660,6 @@ DEFINE_EVENT(xfs_dquot_class, name, \
 	TP_PROTO(struct xfs_dquot *dqp), \
 	TP_ARGS(dqp))
 DEFINE_DQUOT_EVENT(xfs_dqadjust);
-DEFINE_DQUOT_EVENT(xfs_dqshake_dirty);
-DEFINE_DQUOT_EVENT(xfs_dqshake_unlink);
 DEFINE_DQUOT_EVENT(xfs_dqreclaim_want);
 DEFINE_DQUOT_EVENT(xfs_dqreclaim_dirty);
 DEFINE_DQUOT_EVENT(xfs_dqreclaim_unlink);
@@ -630,7 +674,6 @@ DEFINE_DQUOT_EVENT(xfs_dqread_fail);
 DEFINE_DQUOT_EVENT(xfs_dqlookup_found);
 DEFINE_DQUOT_EVENT(xfs_dqlookup_want);
 DEFINE_DQUOT_EVENT(xfs_dqlookup_freelist);
-DEFINE_DQUOT_EVENT(xfs_dqlookup_move);
 DEFINE_DQUOT_EVENT(xfs_dqlookup_done);
 DEFINE_DQUOT_EVENT(xfs_dqget_hit);
 DEFINE_DQUOT_EVENT(xfs_dqget_miss);
@@ -739,165 +782,181 @@ DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_enter);
 DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_exit);
 DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_sub);
 
-#define DEFINE_RW_EVENT(name) \
-TRACE_EVENT(name, \
-	TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), \
-	TP_ARGS(ip, count, offset, flags), \
-	TP_STRUCT__entry( \
-		__field(dev_t, dev) \
-		__field(xfs_ino_t, ino) \
-		__field(xfs_fsize_t, size) \
-		__field(xfs_fsize_t, new_size) \
-		__field(loff_t, offset) \
-		__field(size_t, count) \
-		__field(int, flags) \
-	), \
-	TP_fast_assign( \
-		__entry->dev = VFS_I(ip)->i_sb->s_dev; \
-		__entry->ino = ip->i_ino; \
-		__entry->size = ip->i_d.di_size; \
-		__entry->new_size = ip->i_new_size; \
-		__entry->offset = offset; \
-		__entry->count = count; \
-		__entry->flags = flags; \
-	), \
-	TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " \
-		  "offset 0x%llx count 0x%zx ioflags %s", \
-		  MAJOR(__entry->dev), MINOR(__entry->dev), \
-		  __entry->ino, \
-		  __entry->size, \
-		  __entry->new_size, \
-		  __entry->offset, \
-		  __entry->count, \
-		  __print_flags(__entry->flags, "|", XFS_IO_FLAGS)) \
+DECLARE_EVENT_CLASS(xfs_file_class,
+	TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags),
+	TP_ARGS(ip, count, offset, flags),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(xfs_fsize_t, size)
+		__field(xfs_fsize_t, new_size)
+		__field(loff_t, offset)
+		__field(size_t, count)
+		__field(int, flags)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(ip)->i_sb->s_dev;
+		__entry->ino = ip->i_ino;
+		__entry->size = ip->i_d.di_size;
+		__entry->new_size = ip->i_new_size;
+		__entry->offset = offset;
+		__entry->count = count;
+		__entry->flags = flags;
+	),
+	TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx "
+		  "offset 0x%llx count 0x%zx ioflags %s",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->size,
+		  __entry->new_size,
+		  __entry->offset,
+		  __entry->count,
+		  __print_flags(__entry->flags, "|", XFS_IO_FLAGS))
 )
+
+#define DEFINE_RW_EVENT(name)		\
+DEFINE_EVENT(xfs_file_class, name,	\
+	TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags),	\
+	TP_ARGS(ip, count, offset, flags))
 DEFINE_RW_EVENT(xfs_file_read);
 DEFINE_RW_EVENT(xfs_file_buffered_write);
 DEFINE_RW_EVENT(xfs_file_direct_write);
 DEFINE_RW_EVENT(xfs_file_splice_read);
 DEFINE_RW_EVENT(xfs_file_splice_write);
 
+DECLARE_EVENT_CLASS(xfs_page_class,
+	TP_PROTO(struct inode *inode, struct page *page, unsigned long off),
+	TP_ARGS(inode, page, off),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(pgoff_t, pgoff)
+		__field(loff_t, size)
+		__field(unsigned long, offset)
+		__field(int, delalloc)
+		__field(int, unmapped)
+		__field(int, unwritten)
+	),
+	TP_fast_assign(
+		int delalloc = -1, unmapped = -1, unwritten = -1;
 
-#define DEFINE_PAGE_EVENT(name) \
-TRACE_EVENT(name, \
-	TP_PROTO(struct inode *inode, struct page *page, unsigned long off), \
-	TP_ARGS(inode, page, off), \
-	TP_STRUCT__entry( \
-		__field(dev_t, dev) \
-		__field(xfs_ino_t, ino) \
-		__field(pgoff_t, pgoff) \
-		__field(loff_t, size) \
-		__field(unsigned long, offset) \
-		__field(int, delalloc) \
-		__field(int, unmapped) \
-		__field(int, unwritten) \
-	), \
-	TP_fast_assign( \
-		int delalloc = -1, unmapped = -1, unwritten = -1; \
-	\
-		if (page_has_buffers(page)) \
-			xfs_count_page_state(page, &delalloc, \
-					     &unmapped, &unwritten); \
-		__entry->dev = inode->i_sb->s_dev; \
-		__entry->ino = XFS_I(inode)->i_ino; \
-		__entry->pgoff = page_offset(page); \
-		__entry->size = i_size_read(inode); \
-		__entry->offset = off; \
-		__entry->delalloc = delalloc; \
-		__entry->unmapped = unmapped; \
-		__entry->unwritten = unwritten; \
-	), \
-	TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx " \
-		  "delalloc %d unmapped %d unwritten %d", \
-		  MAJOR(__entry->dev), MINOR(__entry->dev), \
-		  __entry->ino, \
-		  __entry->pgoff, \
-		  __entry->size, \
-		  __entry->offset, \
-		  __entry->delalloc, \
-		  __entry->unmapped, \
-		  __entry->unwritten) \
+		if (page_has_buffers(page))
+			xfs_count_page_state(page, &delalloc,
+					     &unmapped, &unwritten);
+		__entry->dev = inode->i_sb->s_dev;
+		__entry->ino = XFS_I(inode)->i_ino;
+		__entry->pgoff = page_offset(page);
+		__entry->size = i_size_read(inode);
+		__entry->offset = off;
+		__entry->delalloc = delalloc;
+		__entry->unmapped = unmapped;
+		__entry->unwritten = unwritten;
+	),
+	TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx "
+		  "delalloc %d unmapped %d unwritten %d",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->pgoff,
+		  __entry->size,
+		  __entry->offset,
+		  __entry->delalloc,
+		  __entry->unmapped,
+		  __entry->unwritten)
 )
+
+#define DEFINE_PAGE_EVENT(name)		\
+DEFINE_EVENT(xfs_page_class, name,	\
+	TP_PROTO(struct inode *inode, struct page *page, unsigned long off),	\
+	TP_ARGS(inode, page, off))
 DEFINE_PAGE_EVENT(xfs_writepage);
 DEFINE_PAGE_EVENT(xfs_releasepage);
 DEFINE_PAGE_EVENT(xfs_invalidatepage);
 
-#define DEFINE_IOMAP_EVENT(name) \
-TRACE_EVENT(name, \
-	TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \
-		 int flags, struct xfs_bmbt_irec *irec), \
-	TP_ARGS(ip, offset, count, flags, irec), \
-	TP_STRUCT__entry( \
-		__field(dev_t, dev) \
-		__field(xfs_ino_t, ino) \
-		__field(loff_t, size) \
-		__field(loff_t, new_size) \
-		__field(loff_t, offset) \
-		__field(size_t, count) \
-		__field(int, flags) \
-		__field(xfs_fileoff_t, startoff) \
-		__field(xfs_fsblock_t, startblock) \
-		__field(xfs_filblks_t, blockcount) \
-	), \
-	TP_fast_assign( \
-		__entry->dev = VFS_I(ip)->i_sb->s_dev; \
-		__entry->ino = ip->i_ino; \
-		__entry->size = ip->i_d.di_size; \
-		__entry->new_size = ip->i_new_size; \
-		__entry->offset = offset; \
-		__entry->count = count; \
-		__entry->flags = flags; \
-		__entry->startoff = irec ? irec->br_startoff : 0; \
-		__entry->startblock = irec ? irec->br_startblock : 0; \
-		__entry->blockcount = irec ? irec->br_blockcount : 0; \
-	), \
-	TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " \
-		  "offset 0x%llx count %zd flags %s " \
-		  "startoff 0x%llx startblock %s blockcount 0x%llx", \
-		  MAJOR(__entry->dev), MINOR(__entry->dev), \
-		  __entry->ino, \
-		  __entry->size, \
-		  __entry->new_size, \
-		  __entry->offset, \
-		  __entry->count, \
-		  __print_flags(__entry->flags, "|", BMAPI_FLAGS), \
-		  __entry->startoff, \
-		  xfs_fmtfsblock(__entry->startblock), \
-		  __entry->blockcount) \
+DECLARE_EVENT_CLASS(xfs_iomap_class,
+	TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
+		 int flags, struct xfs_bmbt_irec *irec),
+	TP_ARGS(ip, offset, count, flags, irec),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(loff_t, size)
+		__field(loff_t, new_size)
+		__field(loff_t, offset)
+		__field(size_t, count)
+		__field(int, flags)
+		__field(xfs_fileoff_t, startoff)
+		__field(xfs_fsblock_t, startblock)
+		__field(xfs_filblks_t, blockcount)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(ip)->i_sb->s_dev;
+		__entry->ino = ip->i_ino;
+		__entry->size = ip->i_d.di_size;
+		__entry->new_size = ip->i_new_size;
+		__entry->offset = offset;
+		__entry->count = count;
+		__entry->flags = flags;
+		__entry->startoff = irec ? irec->br_startoff : 0;
+		__entry->startblock = irec ? irec->br_startblock : 0;
+		__entry->blockcount = irec ? irec->br_blockcount : 0;
+	),
+	TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx "
+		  "offset 0x%llx count %zd flags %s "
+		  "startoff 0x%llx startblock %lld blockcount 0x%llx",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->size,
+		  __entry->new_size,
+		  __entry->offset,
+		  __entry->count,
+		  __print_flags(__entry->flags, "|", BMAPI_FLAGS),
+		  __entry->startoff,
+		  (__int64_t)__entry->startblock,
+		  __entry->blockcount)
 )
+
+#define DEFINE_IOMAP_EVENT(name)	\
+DEFINE_EVENT(xfs_iomap_class, name,	\
+	TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,	\
+		 int flags, struct xfs_bmbt_irec *irec),		\
+	TP_ARGS(ip, offset, count, flags, irec))
 DEFINE_IOMAP_EVENT(xfs_iomap_enter);
 DEFINE_IOMAP_EVENT(xfs_iomap_found);
 DEFINE_IOMAP_EVENT(xfs_iomap_alloc);
 
-#define DEFINE_SIMPLE_IO_EVENT(name) \
-TRACE_EVENT(name, \
-	TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), \
-	TP_ARGS(ip, offset, count), \
-	TP_STRUCT__entry( \
-		__field(dev_t, dev) \
-		__field(xfs_ino_t, ino) \
-		__field(loff_t, size) \
-		__field(loff_t, new_size) \
-		__field(loff_t, offset) \
-		__field(size_t, count) \
-	), \
-	TP_fast_assign( \
-		__entry->dev = VFS_I(ip)->i_sb->s_dev; \
-		__entry->ino = ip->i_ino; \
-		__entry->size = ip->i_d.di_size; \
-		__entry->new_size = ip->i_new_size; \
-		__entry->offset = offset; \
-		__entry->count = count; \
-	), \
-	TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " \
-		  "offset 0x%llx count %zd", \
-		  MAJOR(__entry->dev), MINOR(__entry->dev), \
-		  __entry->ino, \
-		  __entry->size, \
-		  __entry->new_size, \
-		  __entry->offset, \
-		  __entry->count) \
+DECLARE_EVENT_CLASS(xfs_simple_io_class,
+	TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
+	TP_ARGS(ip, offset, count),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(loff_t, size)
+		__field(loff_t, new_size)
+		__field(loff_t, offset)
+		__field(size_t, count)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(ip)->i_sb->s_dev;
+		__entry->ino = ip->i_ino;
+		__entry->size = ip->i_d.di_size;
+		__entry->new_size = ip->i_new_size;
+		__entry->offset = offset;
+		__entry->count = count;
+	),
+	TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx "
+		  "offset 0x%llx count %zd",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->size,
+		  __entry->new_size,
+		  __entry->offset,
+		  __entry->count)
 );
+
+#define DEFINE_SIMPLE_IO_EVENT(name)	\
+DEFINE_EVENT(xfs_simple_io_class, name,	\
+	TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),	\
+	TP_ARGS(ip, offset, count))
 DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc);
 DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert);
 
@@ -1023,83 +1082,112 @@ TRACE_EVENT(xfs_bunmap,
 
 );
 
+#define XFS_BUSY_SYNC \
+	{ 0,	"async" }, \
+	{ 1,	"sync" }
+
 TRACE_EVENT(xfs_alloc_busy,
-	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno,
-		 xfs_extlen_t len, int slot),
-	TP_ARGS(mp, agno, agbno, len, slot),
+	TP_PROTO(struct xfs_trans *trans, xfs_agnumber_t agno,
+		 xfs_agblock_t agbno, xfs_extlen_t len, int sync),
+	TP_ARGS(trans, agno, agbno, len, sync),
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
+		__field(struct xfs_trans *, tp)
+		__field(int, tid)
 		__field(xfs_agnumber_t, agno)
 		__field(xfs_agblock_t, agbno)
 		__field(xfs_extlen_t, len)
-		__field(int, slot)
+		__field(int, sync)
 	),
 	TP_fast_assign(
-		__entry->dev = mp->m_super->s_dev;
+		__entry->dev = trans->t_mountp->m_super->s_dev;
+		__entry->tp = trans;
+		__entry->tid = trans->t_ticket->t_tid;
 		__entry->agno = agno;
 		__entry->agbno = agbno;
 		__entry->len = len;
-		__entry->slot = slot;
+		__entry->sync = sync;
 	),
-	TP_printk("dev %d:%d agno %u agbno %u len %u slot %d",
+	TP_printk("dev %d:%d trans 0x%p tid 0x%x agno %u agbno %u len %u %s",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->tp,
+		  __entry->tid,
 		  __entry->agno,
 		  __entry->agbno,
 		  __entry->len,
-		  __entry->slot)
+		  __print_symbolic(__entry->sync, XFS_BUSY_SYNC))
 
 );
 
-#define XFS_BUSY_STATES \
-	{ 0,	"found" }, \
-	{ 1,	"missing" }
-
 TRACE_EVENT(xfs_alloc_unbusy,
 	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
-		 int slot, int found),
-	TP_ARGS(mp, agno, slot, found),
+		 xfs_agblock_t agbno, xfs_extlen_t len),
+	TP_ARGS(mp, agno, agbno, len),
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
 		__field(xfs_agnumber_t, agno)
-		__field(int, slot)
-		__field(int, found)
+		__field(xfs_agblock_t, agbno)
+		__field(xfs_extlen_t, len)
 	),
 	TP_fast_assign(
 		__entry->dev = mp->m_super->s_dev;
 		__entry->agno = agno;
-		__entry->slot = slot;
-		__entry->found = found;
+		__entry->agbno = agbno;
+		__entry->len = len;
 	),
-	TP_printk("dev %d:%d agno %u slot %d %s",
+	TP_printk("dev %d:%d agno %u agbno %u len %u",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->agno,
-		  __entry->slot,
-		  __print_symbolic(__entry->found, XFS_BUSY_STATES))
+		  __entry->agbno,
+		  __entry->len)
 );
 
+#define XFS_BUSY_STATES \
+	{ 0,	"missing" }, \
+	{ 1,	"found" }
+
 TRACE_EVENT(xfs_alloc_busysearch,
-	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno,
-		 xfs_extlen_t len, xfs_lsn_t lsn),
-	TP_ARGS(mp, agno, agbno, len, lsn),
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+		 xfs_agblock_t agbno, xfs_extlen_t len, int found),
+	TP_ARGS(mp, agno, agbno, len, found),
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
 		__field(xfs_agnumber_t, agno)
 		__field(xfs_agblock_t, agbno)
 		__field(xfs_extlen_t, len)
-		__field(xfs_lsn_t, lsn)
+		__field(int, found)
 	),
 	TP_fast_assign(
 		__entry->dev = mp->m_super->s_dev;
 		__entry->agno = agno;
 		__entry->agbno = agbno;
 		__entry->len = len;
-		__entry->lsn = lsn;
+		__entry->found = found;
 	),
-	TP_printk("dev %d:%d agno %u agbno %u len %u force lsn 0x%llx",
+	TP_printk("dev %d:%d agno %u agbno %u len %u %s",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->agno,
 		  __entry->agbno,
 		  __entry->len,
+		  __print_symbolic(__entry->found, XFS_BUSY_STATES))
+);
+
+TRACE_EVENT(xfs_trans_commit_lsn,
+	TP_PROTO(struct xfs_trans *trans),
+	TP_ARGS(trans),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(struct xfs_trans *, tp)
+		__field(xfs_lsn_t, lsn)
+	),
+	TP_fast_assign(
+		__entry->dev = trans->t_mountp->m_super->s_dev;
+		__entry->tp = trans;
+		__entry->lsn = trans->t_commit_lsn;
+	),
+	TP_printk("dev %d:%d trans 0x%p commit_lsn 0x%llx",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->tp,
 		  __entry->lsn)
 );
 
@@ -1414,6 +1502,193 @@ TRACE_EVENT(xfs_dir2_leafn_moveents,
 		  __entry->count)
 );
 
+#define XFS_SWAPEXT_INODES \
+	{ 0,	"target" }, \
+	{ 1,	"temp" }
+
+#define XFS_INODE_FORMAT_STR \
+	{ 0,	"invalid" }, \
+	{ 1,	"local" }, \
+	{ 2,	"extent" }, \
+	{ 3,	"btree" }
+
+DECLARE_EVENT_CLASS(xfs_swap_extent_class,
+	TP_PROTO(struct xfs_inode *ip, int which),
+	TP_ARGS(ip, which),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(int, which)
+		__field(xfs_ino_t, ino)
+		__field(int, format)
+		__field(int, nex)
+		__field(int, max_nex)
+		__field(int, broot_size)
+		__field(int, fork_off)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(ip)->i_sb->s_dev;
+		__entry->which = which;
+		__entry->ino = ip->i_ino;
+		__entry->format = ip->i_d.di_format;
+		__entry->nex = ip->i_d.di_nextents;
+		__entry->max_nex = ip->i_df.if_ext_max;
+		__entry->broot_size = ip->i_df.if_broot_bytes;
+		__entry->fork_off = XFS_IFORK_BOFF(ip);
+	),
+	TP_printk("dev %d:%d ino 0x%llx (%s), %s format, num_extents %d, "
+		  "Max in-fork extents %d, broot size %d, fork offset %d",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __print_symbolic(__entry->which, XFS_SWAPEXT_INODES),
+		  __print_symbolic(__entry->format, XFS_INODE_FORMAT_STR),
+		  __entry->nex,
+		  __entry->max_nex,
+		  __entry->broot_size,
+		  __entry->fork_off)
+)
+
+#define DEFINE_SWAPEXT_EVENT(name) \
+DEFINE_EVENT(xfs_swap_extent_class, name, \
+	TP_PROTO(struct xfs_inode *ip, int which), \
+	TP_ARGS(ip, which))
+
+DEFINE_SWAPEXT_EVENT(xfs_swap_extent_before);
+DEFINE_SWAPEXT_EVENT(xfs_swap_extent_after);
+
+DECLARE_EVENT_CLASS(xfs_log_recover_item_class,
+	TP_PROTO(struct log *log, struct xlog_recover *trans,
+		struct xlog_recover_item *item, int pass),
+	TP_ARGS(log, trans, item, pass),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(unsigned long, item)
+		__field(xlog_tid_t, tid)
+		__field(int, type)
+		__field(int, pass)
+		__field(int, count)
+		__field(int, total)
+	),
+	TP_fast_assign(
+		__entry->dev = log->l_mp->m_super->s_dev;
+		__entry->item = (unsigned long)item;
+		__entry->tid = trans->r_log_tid;
+		__entry->type = ITEM_TYPE(item);
+		__entry->pass = pass;
+		__entry->count = item->ri_cnt;
+		__entry->total = item->ri_total;
+	),
+	TP_printk("dev %d:%d trans 0x%x, pass %d, item 0x%p, item type %s "
+		  "item region count/total %d/%d",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->tid,
+		  __entry->pass,
+		  (void *)__entry->item,
+		  __print_symbolic(__entry->type, XFS_LI_TYPE_DESC),
+		  __entry->count,
+		  __entry->total)
+)
+
+#define DEFINE_LOG_RECOVER_ITEM(name) \
+DEFINE_EVENT(xfs_log_recover_item_class, name, \
+	TP_PROTO(struct log *log, struct xlog_recover *trans, \
+		struct xlog_recover_item *item, int pass), \
+	TP_ARGS(log, trans, item, pass))
+
+DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_add);
+DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_add_cont);
+DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_reorder_head);
+DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_reorder_tail);
+DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_recover);
+
+DECLARE_EVENT_CLASS(xfs_log_recover_buf_item_class,
+	TP_PROTO(struct log *log, struct xfs_buf_log_format *buf_f),
+	TP_ARGS(log, buf_f),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(__int64_t, blkno)
+		__field(unsigned short, len)
+		__field(unsigned short, flags)
+		__field(unsigned short, size)
+		__field(unsigned int, map_size)
+	),
+	TP_fast_assign(
+		__entry->dev = log->l_mp->m_super->s_dev;
+		__entry->blkno = buf_f->blf_blkno;
+		__entry->len = buf_f->blf_len;
+		__entry->flags = buf_f->blf_flags;
+		__entry->size = buf_f->blf_size;
+		__entry->map_size = buf_f->blf_map_size;
+	),
+	TP_printk("dev %d:%d blkno 0x%llx, len %u, flags 0x%x, size %d, "
+			"map_size %d",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->blkno,
+		  __entry->len,
+		  __entry->flags,
+		  __entry->size,
+		  __entry->map_size)
+)
+
+#define DEFINE_LOG_RECOVER_BUF_ITEM(name) \
+DEFINE_EVENT(xfs_log_recover_buf_item_class, name, \
+	TP_PROTO(struct log *log, struct xfs_buf_log_format *buf_f), \
+	TP_ARGS(log, buf_f))
+
+DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_not_cancel);
+DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel);
+DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel_add);
+DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel_ref_inc);
+DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_recover);
+DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_inode_buf);
+DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_reg_buf);
+DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_dquot_buf);
+
+DECLARE_EVENT_CLASS(xfs_log_recover_ino_item_class,
+	TP_PROTO(struct log *log, struct xfs_inode_log_format *in_f),
+	TP_ARGS(log, in_f),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(unsigned short, size)
+		__field(int, fields)
+		__field(unsigned short, asize)
+		__field(unsigned short, dsize)
+		__field(__int64_t, blkno)
+		__field(int, len)
+		__field(int, boffset)
+	),
+	TP_fast_assign(
+		__entry->dev = log->l_mp->m_super->s_dev;
+		__entry->ino = in_f->ilf_ino;
+		__entry->size = in_f->ilf_size;
+		__entry->fields = in_f->ilf_fields;
+		__entry->asize = in_f->ilf_asize;
+		__entry->dsize = in_f->ilf_dsize;
+		__entry->blkno = in_f->ilf_blkno;
+		__entry->len = in_f->ilf_len;
+		__entry->boffset = in_f->ilf_boffset;
+	),
+	TP_printk("dev %d:%d ino 0x%llx, size %u, fields 0x%x, asize %d, "
+			"dsize %d, blkno 0x%llx, len %d, boffset %d",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->size,
+		  __entry->fields,
+		  __entry->asize,
+		  __entry->dsize,
+		  __entry->blkno,
+		  __entry->len,
+		  __entry->boffset)
+)
+#define DEFINE_LOG_RECOVER_INO_ITEM(name) \
+DEFINE_EVENT(xfs_log_recover_ino_item_class, name, \
+	TP_PROTO(struct log *log, struct xfs_inode_log_format *in_f), \
+	TP_ARGS(log, in_f))
+
+DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_recover);
+DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_cancel);
+DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_skip);
+
 #endif /* _TRACE_XFS_H */
 
 #undef TRACE_INCLUDE_PATH
diff --git a/fs/xfs/linux-2.6/xfs_xattr.c b/fs/xfs/linux-2.6/xfs_xattr.c
index 0b1878857fc3..87d3e03878c8 100644
--- a/fs/xfs/linux-2.6/xfs_xattr.c
+++ b/fs/xfs/linux-2.6/xfs_xattr.c
@@ -45,7 +45,7 @@ xfs_xattr_get(struct dentry *dentry, const char *name,
 		value = NULL;
 	}
 
-	error = -xfs_attr_get(ip, name, value, &asize, xflags);
+	error = -xfs_attr_get(ip, (unsigned char *)name, value, &asize, xflags);
 	if (error)
 		return error;
 	return asize;
@@ -67,32 +67,33 @@ xfs_xattr_set(struct dentry *dentry, const char *name, const void *value,
 		xflags |= ATTR_REPLACE;
 
 	if (!value)
-		return -xfs_attr_remove(ip, name, xflags);
-	return -xfs_attr_set(ip, name, (void *)value, size, xflags);
+		return -xfs_attr_remove(ip, (unsigned char *)name, xflags);
+	return -xfs_attr_set(ip, (unsigned char *)name,
+				(void *)value, size, xflags);
 }
 
-static struct xattr_handler xfs_xattr_user_handler = {
+static const struct xattr_handler xfs_xattr_user_handler = {
 	.prefix	= XATTR_USER_PREFIX,
 	.flags	= 0, /* no flags implies user namespace */
 	.get	= xfs_xattr_get,
 	.set	= xfs_xattr_set,
 };
 
-static struct xattr_handler xfs_xattr_trusted_handler = {
+static const struct xattr_handler xfs_xattr_trusted_handler = {
 	.prefix	= XATTR_TRUSTED_PREFIX,
 	.flags	= ATTR_ROOT,
 	.get	= xfs_xattr_get,
 	.set	= xfs_xattr_set,
 };
 
-static struct xattr_handler xfs_xattr_security_handler = {
+static const struct xattr_handler xfs_xattr_security_handler = {
 	.prefix	= XATTR_SECURITY_PREFIX,
 	.flags	= ATTR_SECURE,
 	.get	= xfs_xattr_get,
 	.set	= xfs_xattr_set,
 };
 
-struct xattr_handler *xfs_xattr_handlers[] = {
+const struct xattr_handler *xfs_xattr_handlers[] = {
 	&xfs_xattr_user_handler,
 	&xfs_xattr_trusted_handler,
 	&xfs_xattr_security_handler,
@@ -124,8 +125,13 @@ static const char *xfs_xattr_prefix(int flags)
 }
 
 static int
-xfs_xattr_put_listent(struct xfs_attr_list_context *context, int flags,
-		char *name, int namelen, int valuelen, char *value)
+xfs_xattr_put_listent(
+	struct xfs_attr_list_context *context,
+	int		flags,
+	unsigned char	*name,
+	int		namelen,
+	int		valuelen,
+	unsigned char	*value)
 {
 	unsigned int prefix_len = xfs_xattr_prefix_len(flags);
 	char *offset;
@@ -148,7 +154,7 @@ xfs_xattr_put_listent(struct xfs_attr_list_context *context, int flags,
 	offset = (char *)context->alist + context->count;
 	strncpy(offset, xfs_xattr_prefix(flags), prefix_len);
 	offset += prefix_len;
-	strncpy(offset, name, namelen);			/* real name */
+	strncpy(offset, (char *)name, namelen);			/* real name */
 	offset += namelen;
 	*offset = '\0';
 	context->count += prefix_len + namelen + 1;
@@ -156,8 +162,13 @@ xfs_xattr_put_listent(struct xfs_attr_list_context *context, int flags,
 }
 
 static int
-xfs_xattr_put_listent_sizes(struct xfs_attr_list_context *context, int flags,
-		char *name, int namelen, int valuelen, char *value)
+xfs_xattr_put_listent_sizes(
+	struct xfs_attr_list_context *context,
+	int		flags,
+	unsigned char	*name,
+	int		namelen,
+	int		valuelen,
+	unsigned char	*value)
 {
 	context->count += xfs_xattr_prefix_len(flags) + namelen + 1;
 	return 0;
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index d7c7eea09fc2..585e7633dfc7 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -101,7 +101,7 @@ xfs_qm_dqinit(
 	 * No need to re-initialize these if this is a reclaimed dquot.
 	 */
 	if (brandnewdquot) {
-		dqp->dq_flnext = dqp->dq_flprev = dqp;
+		INIT_LIST_HEAD(&dqp->q_freelist);
 		mutex_init(&dqp->q_qlock);
 		init_waitqueue_head(&dqp->q_pinwait);
 
@@ -119,20 +119,20 @@ xfs_qm_dqinit(
 		 * Only the q_core portion was zeroed in dqreclaim_one().
 		 * So, we need to reset others.
 		 */
-		 dqp->q_nrefs = 0;
-		 dqp->q_blkno = 0;
-		 dqp->MPL_NEXT = dqp->HL_NEXT = NULL;
-		 dqp->HL_PREVP = dqp->MPL_PREVP = NULL;
-		 dqp->q_bufoffset = 0;
-		 dqp->q_fileoffset = 0;
-		 dqp->q_transp = NULL;
-		 dqp->q_gdquot = NULL;
-		 dqp->q_res_bcount = 0;
-		 dqp->q_res_icount = 0;
-		 dqp->q_res_rtbcount = 0;
-		 atomic_set(&dqp->q_pincount, 0);
-		 dqp->q_hash = NULL;
-		 ASSERT(dqp->dq_flnext == dqp->dq_flprev);
+		dqp->q_nrefs = 0;
+		dqp->q_blkno = 0;
+		INIT_LIST_HEAD(&dqp->q_mplist);
+		INIT_LIST_HEAD(&dqp->q_hashlist);
+		dqp->q_bufoffset = 0;
+		dqp->q_fileoffset = 0;
+		dqp->q_transp = NULL;
+		dqp->q_gdquot = NULL;
+		dqp->q_res_bcount = 0;
+		dqp->q_res_icount = 0;
+		dqp->q_res_rtbcount = 0;
+		atomic_set(&dqp->q_pincount, 0);
+		dqp->q_hash = NULL;
+		ASSERT(list_empty(&dqp->q_freelist));
 
 		trace_xfs_dqreuse(dqp);
 	}
@@ -158,7 +158,7 @@ void
 xfs_qm_dqdestroy(
 	xfs_dquot_t	*dqp)
 {
-	ASSERT(! XFS_DQ_IS_ON_FREELIST(dqp));
+	ASSERT(list_empty(&dqp->q_freelist));
 
 	mutex_destroy(&dqp->q_qlock);
 	sv_destroy(&dqp->q_pinwait);
@@ -252,7 +252,7 @@ xfs_qm_adjust_dqtimers(
 		     (be64_to_cpu(d->d_bcount) >=
 		      be64_to_cpu(d->d_blk_hardlimit)))) {
 			d->d_btimer = cpu_to_be32(get_seconds() +
-					XFS_QI_BTIMELIMIT(mp));
+					mp->m_quotainfo->qi_btimelimit);
 		} else {
 			d->d_bwarns = 0;
 		}
@@ -275,7 +275,7 @@ xfs_qm_adjust_dqtimers(
 		     (be64_to_cpu(d->d_icount) >=
 		      be64_to_cpu(d->d_ino_hardlimit)))) {
 			d->d_itimer = cpu_to_be32(get_seconds() +
-					XFS_QI_ITIMELIMIT(mp));
+					mp->m_quotainfo->qi_itimelimit);
 		} else {
 			d->d_iwarns = 0;
 		}
@@ -298,7 +298,7 @@ xfs_qm_adjust_dqtimers(
 		     (be64_to_cpu(d->d_rtbcount) >=
 		      be64_to_cpu(d->d_rtb_hardlimit)))) {
 			d->d_rtbtimer = cpu_to_be32(get_seconds() +
-					XFS_QI_RTBTIMELIMIT(mp));
+					mp->m_quotainfo->qi_rtbtimelimit);
 		} else {
 			d->d_rtbwarns = 0;
 		}
@@ -325,6 +325,7 @@ xfs_qm_init_dquot_blk(
 	uint		type,
 	xfs_buf_t	*bp)
 {
+	struct xfs_quotainfo	*q = mp->m_quotainfo;
 	xfs_dqblk_t	*d;
 	int		curid, i;
 
@@ -337,16 +338,16 @@ xfs_qm_init_dquot_blk(
 	/*
 	 * ID of the first dquot in the block - id's are zero based.
 	 */
-	curid = id - (id % XFS_QM_DQPERBLK(mp));
+	curid = id - (id % q->qi_dqperchunk);
 	ASSERT(curid >= 0);
-	memset(d, 0, BBTOB(XFS_QI_DQCHUNKLEN(mp)));
-	for (i = 0; i < XFS_QM_DQPERBLK(mp); i++, d++, curid++)
+	memset(d, 0, BBTOB(q->qi_dqchunklen));
+	for (i = 0; i < q->qi_dqperchunk; i++, d++, curid++)
 		xfs_qm_dqinit_core(curid, type, d);
 	xfs_trans_dquot_buf(tp, bp,
-			    (type & XFS_DQ_USER ? XFS_BLI_UDQUOT_BUF :
-			    ((type & XFS_DQ_PROJ) ? XFS_BLI_PDQUOT_BUF :
-			     XFS_BLI_GDQUOT_BUF)));
-	xfs_trans_log_buf(tp, bp, 0, BBTOB(XFS_QI_DQCHUNKLEN(mp)) - 1);
+			    (type & XFS_DQ_USER ? XFS_BLF_UDQUOT_BUF :
+			    ((type & XFS_DQ_PROJ) ? XFS_BLF_PDQUOT_BUF :
+			     XFS_BLF_GDQUOT_BUF)));
+	xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1);
 }
 
 
@@ -419,7 +420,7 @@ xfs_qm_dqalloc(
 	/* now we can just get the buffer (there's nothing to read yet) */
 	bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
 			       dqp->q_blkno,
-			       XFS_QI_DQCHUNKLEN(mp),
+			       mp->m_quotainfo->qi_dqchunklen,
 			       0);
 	if (!bp || (error = XFS_BUF_GETERROR(bp)))
 		goto error1;
@@ -500,7 +501,8 @@ xfs_qm_dqtobp(
 	 */
 	if (dqp->q_blkno == (xfs_daddr_t) 0) {
 		/* We use the id as an index */
-		dqp->q_fileoffset = (xfs_fileoff_t)id / XFS_QM_DQPERBLK(mp);
+		dqp->q_fileoffset = (xfs_fileoff_t)id /
+					mp->m_quotainfo->qi_dqperchunk;
 		nmaps = 1;
 		quotip = XFS_DQ_TO_QIP(dqp);
 		xfs_ilock(quotip, XFS_ILOCK_SHARED);
@@ -529,7 +531,7 @@ xfs_qm_dqtobp(
 		/*
 		 * offset of dquot in the (fixed sized) dquot chunk.
 		 */
-		dqp->q_bufoffset = (id % XFS_QM_DQPERBLK(mp)) *
+		dqp->q_bufoffset = (id % mp->m_quotainfo->qi_dqperchunk) *
 			sizeof(xfs_dqblk_t);
 		if (map.br_startblock == HOLESTARTBLOCK) {
 			/*
@@ -559,15 +561,13 @@ xfs_qm_dqtobp(
 	 * Read in the buffer, unless we've just done the allocation
 	 * (in which case we already have the buf).
 	 */
-	if (! newdquot) {
+	if (!newdquot) {
 		trace_xfs_dqtobp_read(dqp);
 
-		if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
-					       dqp->q_blkno,
-					       XFS_QI_DQCHUNKLEN(mp),
-					       0, &bp))) {
-			return (error);
-		}
+		error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
+					   dqp->q_blkno,
+					   mp->m_quotainfo->qi_dqchunklen,
+					   0, &bp);
 		if (error || !bp)
 			return XFS_ERROR(error);
 	}
@@ -689,14 +689,14 @@ xfs_qm_idtodq(
 	tp = NULL;
 	if (flags & XFS_QMOPT_DQALLOC) {
 		tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC);
-		if ((error = xfs_trans_reserve(tp,
-				       XFS_QM_DQALLOC_SPACE_RES(mp),
-				       XFS_WRITE_LOG_RES(mp) +
-					      BBTOB(XFS_QI_DQCHUNKLEN(mp)) - 1 +
-					      128,
-				       0,
-				       XFS_TRANS_PERM_LOG_RES,
-				       XFS_WRITE_LOG_COUNT))) {
+		error = xfs_trans_reserve(tp, XFS_QM_DQALLOC_SPACE_RES(mp),
+				XFS_WRITE_LOG_RES(mp) +
+				BBTOB(mp->m_quotainfo->qi_dqchunklen) - 1 +
+				128,
+				0,
+				XFS_TRANS_PERM_LOG_RES,
+				XFS_WRITE_LOG_COUNT);
+		if (error) {
 			cancelflags = 0;
 			goto error0;
 		}
@@ -751,7 +751,6 @@ xfs_qm_dqlookup(
 {
 	xfs_dquot_t		*dqp;
 	uint			flist_locked;
-	xfs_dquot_t		*d;
 
 	ASSERT(mutex_is_locked(&qh->qh_lock));
 
@@ -760,7 +759,7 @@ xfs_qm_dqlookup(
 	/*
 	 * Traverse the hashchain looking for a match
 	 */
-	for (dqp = qh->qh_next; dqp != NULL; dqp = dqp->HL_NEXT) {
+	list_for_each_entry(dqp, &qh->qh_list, q_hashlist) {
 		/*
 		 * We already have the hashlock. We don't need the
 		 * dqlock to look at the id field of the dquot, since the
@@ -772,12 +771,12 @@ xfs_qm_dqlookup(
 			/*
 			 * All in core dquots must be on the dqlist of mp
 			 */
-			ASSERT(dqp->MPL_PREVP != NULL);
+			ASSERT(!list_empty(&dqp->q_mplist));
 
 			xfs_dqlock(dqp);
 			if (dqp->q_nrefs == 0) {
-				ASSERT (XFS_DQ_IS_ON_FREELIST(dqp));
-				if (! xfs_qm_freelist_lock_nowait(xfs_Gqm)) {
+				ASSERT(!list_empty(&dqp->q_freelist));
+				if (!mutex_trylock(&xfs_Gqm->qm_dqfrlist_lock)) {
 					trace_xfs_dqlookup_want(dqp);
 
 					/*
@@ -787,7 +786,7 @@ xfs_qm_dqlookup(
 					 */
 					dqp->dq_flags |= XFS_DQ_WANT;
 					xfs_dqunlock(dqp);
-					xfs_qm_freelist_lock(xfs_Gqm);
+					mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
 					xfs_dqlock(dqp);
 					dqp->dq_flags &= ~(XFS_DQ_WANT);
 				}
@@ -802,46 +801,28 @@ xfs_qm_dqlookup(
 
 			if (flist_locked) {
 				if (dqp->q_nrefs != 0) {
-					xfs_qm_freelist_unlock(xfs_Gqm);
+					mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
 					flist_locked = B_FALSE;
 				} else {
-					/*
-					 * take it off the freelist
-					 */
+					/* take it off the freelist */
 					trace_xfs_dqlookup_freelist(dqp);
-					XQM_FREELIST_REMOVE(dqp);
-					/* xfs_qm_freelist_print(&(xfs_Gqm->
-							qm_dqfreelist),
-							"after removal"); */
+					list_del_init(&dqp->q_freelist);
+					xfs_Gqm->qm_dqfrlist_cnt--;
 				}
 			}
 
-			/*
-			 * grab a reference
-			 */
 			XFS_DQHOLD(dqp);
 
 			if (flist_locked)
-				xfs_qm_freelist_unlock(xfs_Gqm);
+				mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
 			/*
 			 * move the dquot to the front of the hashchain
 			 */
 			ASSERT(mutex_is_locked(&qh->qh_lock));
-			if (dqp->HL_PREVP != &qh->qh_next) {
-				trace_xfs_dqlookup_move(dqp);
-				if ((d = dqp->HL_NEXT))
-					d->HL_PREVP = dqp->HL_PREVP;
-				*(dqp->HL_PREVP) = d;
-				d = qh->qh_next;
-				d->HL_PREVP = &dqp->HL_NEXT;
-				dqp->HL_NEXT = d;
-				dqp->HL_PREVP = &qh->qh_next;
-				qh->qh_next = dqp;
-			}
+			list_move(&dqp->q_hashlist, &qh->qh_list);
 			trace_xfs_dqlookup_done(dqp);
 			*O_dqpp = dqp;
-			ASSERT(mutex_is_locked(&qh->qh_lock));
-			return (0);
+			return 0;
 		}
 	}
 
@@ -975,16 +956,17 @@ xfs_qm_dqget(
 	 */
 	if (ip) {
 		xfs_ilock(ip, XFS_ILOCK_EXCL);
-		if (! XFS_IS_DQTYPE_ON(mp, type)) {
-			/* inode stays locked on return */
-			xfs_qm_dqdestroy(dqp);
-			return XFS_ERROR(ESRCH);
-		}
+
 		/*
 		 * A dquot could be attached to this inode by now, since
 		 * we had dropped the ilock.
 		 */
 		if (type == XFS_DQ_USER) {
+			if (!XFS_IS_UQUOTA_ON(mp)) {
+				/* inode stays locked on return */
+				xfs_qm_dqdestroy(dqp);
+				return XFS_ERROR(ESRCH);
+			}
 			if (ip->i_udquot) {
 				xfs_qm_dqdestroy(dqp);
 				dqp = ip->i_udquot;
@@ -992,6 +974,11 @@ xfs_qm_dqget(
 				goto dqret;
 			}
 		} else {
+			if (!XFS_IS_OQUOTA_ON(mp)) {
+				/* inode stays locked on return */
+				xfs_qm_dqdestroy(dqp);
+				return XFS_ERROR(ESRCH);
+			}
 			if (ip->i_gdquot) {
 				xfs_qm_dqdestroy(dqp);
 				dqp = ip->i_gdquot;
@@ -1033,13 +1020,14 @@ xfs_qm_dqget(
 	 */
 	ASSERT(mutex_is_locked(&h->qh_lock));
 	dqp->q_hash = h;
-	XQM_HASHLIST_INSERT(h, dqp);
+	list_add(&dqp->q_hashlist, &h->qh_list);
+	h->qh_version++;
 
 	/*
 	 * Attach this dquot to this filesystem's list of all dquots,
 	 * kept inside the mount structure in m_quotainfo field
 	 */
-	xfs_qm_mplist_lock(mp);
+	mutex_lock(&mp->m_quotainfo->qi_dqlist_lock);
 
 	/*
 	 * We return a locked dquot to the caller, with a reference taken
@@ -1047,9 +1035,9 @@ xfs_qm_dqget(
 	xfs_dqlock(dqp);
 	dqp->q_nrefs = 1;
 
-	XQM_MPLIST_INSERT(&(XFS_QI_MPL_LIST(mp)), dqp);
-
-	xfs_qm_mplist_unlock(mp);
+	list_add(&dqp->q_mplist, &mp->m_quotainfo->qi_dqlist);
+	mp->m_quotainfo->qi_dquots++;
+	mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock);
 	mutex_unlock(&h->qh_lock);
  dqret:
 	ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL));
@@ -1086,10 +1074,10 @@ xfs_qm_dqput(
 	 * drop the dqlock and acquire the freelist and dqlock
 	 * in the right order; but try to get it out-of-order first
 	 */
-	if (! xfs_qm_freelist_lock_nowait(xfs_Gqm)) {
+	if (!mutex_trylock(&xfs_Gqm->qm_dqfrlist_lock)) {
 		trace_xfs_dqput_wait(dqp);
 		xfs_dqunlock(dqp);
-		xfs_qm_freelist_lock(xfs_Gqm);
+		mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
 		xfs_dqlock(dqp);
 	}
 
@@ -1100,10 +1088,8 @@ xfs_qm_dqput(
 		if (--dqp->q_nrefs == 0) {
 			trace_xfs_dqput_free(dqp);
 
-			/*
-			 * insert at end of the freelist.
-			 */
-			XQM_FREELIST_INSERT(&(xfs_Gqm->qm_dqfreelist), dqp);
+			list_add_tail(&dqp->q_freelist, &xfs_Gqm->qm_dqfrlist);
+			xfs_Gqm->qm_dqfrlist_cnt++;
 
 			/*
 			 * If we just added a udquot to the freelist, then
@@ -1118,10 +1104,6 @@ xfs_qm_dqput(
 				xfs_dqlock(gdqp);
 				dqp->q_gdquot = NULL;
 			}
-
-			/* xfs_qm_freelist_print(&(xfs_Gqm->qm_dqfreelist),
-			   "@@@@@++ Free list (after append) @@@@@+");
-			   */
 		}
 		xfs_dqunlock(dqp);
 
@@ -1133,7 +1115,7 @@ xfs_qm_dqput(
 			break;
 		dqp = gdqp;
 	}
-	xfs_qm_freelist_unlock(xfs_Gqm);
+	mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
 }
 
 /*
@@ -1187,7 +1169,7 @@ xfs_qm_dqflush(
 	 * block, nada.
 	 */
 	if (!XFS_DQ_IS_DIRTY(dqp) ||
-	    (!(flags & XFS_QMOPT_SYNC) && atomic_read(&dqp->q_pincount) > 0)) {
+	    (!(flags & SYNC_WAIT) && atomic_read(&dqp->q_pincount) > 0)) {
 		xfs_dqfunlock(dqp);
 		return 0;
 	}
@@ -1248,23 +1230,20 @@ xfs_qm_dqflush(
 	 */
 	if (XFS_BUF_ISPINNED(bp)) {
 		trace_xfs_dqflush_force(dqp);
-		xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
+		xfs_log_force(mp, 0);
 	}
 
-	if (flags & XFS_QMOPT_DELWRI) {
-		xfs_bdwrite(mp, bp);
-	} else if (flags & XFS_QMOPT_ASYNC) {
-		error = xfs_bawrite(mp, bp);
-	} else {
+	if (flags & SYNC_WAIT)
 		error = xfs_bwrite(mp, bp);
-	}
+	else
+		xfs_bdwrite(mp, bp);
 
 	trace_xfs_dqflush_done(dqp);
 
 	/*
 	 * dqp is still locked, but caller is free to unlock it now.
 	 */
-	return (error);
+	return error;
 
 }
 
@@ -1389,10 +1368,10 @@ int
 xfs_qm_dqpurge(
 	xfs_dquot_t	*dqp)
 {
-	xfs_dqhash_t	*thishash;
+	xfs_dqhash_t	*qh = dqp->q_hash;
 	xfs_mount_t	*mp = dqp->q_mount;
 
-	ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp));
+	ASSERT(mutex_is_locked(&mp->m_quotainfo->qi_dqlist_lock));
 	ASSERT(mutex_is_locked(&dqp->q_hash->qh_lock));
 
 	xfs_dqlock(dqp);
@@ -1410,7 +1389,7 @@ xfs_qm_dqpurge(
 		return (1);
 	}
 
-	ASSERT(XFS_DQ_IS_ON_FREELIST(dqp));
+	ASSERT(!list_empty(&dqp->q_freelist));
 
 	/*
 	 * If we're turning off quotas, we have to make sure that, for
@@ -1445,7 +1424,7 @@ xfs_qm_dqpurge(
 		 * We don't care about getting disk errors here. We need
 		 * to purge this dquot anyway, so we go ahead regardless.
 		 */
-		error = xfs_qm_dqflush(dqp, XFS_QMOPT_SYNC);
+		error = xfs_qm_dqflush(dqp, SYNC_WAIT);
 		if (error)
 			xfs_fs_cmn_err(CE_WARN, mp,
 				"xfs_qm_dqpurge: dquot %p flush failed", dqp);
@@ -1455,14 +1434,16 @@ xfs_qm_dqpurge(
 	ASSERT(XFS_FORCED_SHUTDOWN(mp) ||
 	       !(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL));
 
-	thishash = dqp->q_hash;
-	XQM_HASHLIST_REMOVE(thishash, dqp);
-	XQM_MPLIST_REMOVE(&(XFS_QI_MPL_LIST(mp)), dqp);
+	list_del_init(&dqp->q_hashlist);
+	qh->qh_version++;
+	list_del_init(&dqp->q_mplist);
+	mp->m_quotainfo->qi_dqreclaims++;
+	mp->m_quotainfo->qi_dquots--;
 	/*
 	 * XXX Move this to the front of the freelist, if we can get the
 	 * freelist lock.
 	 */
-	ASSERT(XFS_DQ_IS_ON_FREELIST(dqp));
+	ASSERT(!list_empty(&dqp->q_freelist));
 
 	dqp->q_mount = NULL;
 	dqp->q_hash = NULL;
@@ -1470,7 +1451,7 @@ xfs_qm_dqpurge(
 	memset(&dqp->q_core, 0, sizeof(dqp->q_core));
 	xfs_dqfunlock(dqp);
 	xfs_dqunlock(dqp);
-	mutex_unlock(&thishash->qh_lock);
+	mutex_unlock(&qh->qh_lock);
 	return (0);
 }
 
@@ -1520,6 +1501,7 @@ void
 xfs_qm_dqflock_pushbuf_wait(
 	xfs_dquot_t	*dqp)
 {
+	xfs_mount_t	*mp = dqp->q_mount;
 	xfs_buf_t	*bp;
 
 	/*
@@ -1528,26 +1510,18 @@ xfs_qm_dqflock_pushbuf_wait(
 	 * out immediately.  We'll be able to acquire
 	 * the flush lock when the I/O completes.
 	 */
-	bp = xfs_incore(dqp->q_mount->m_ddev_targp, dqp->q_blkno,
-		    XFS_QI_DQCHUNKLEN(dqp->q_mount),
-		    XFS_INCORE_TRYLOCK);
-	if (bp != NULL) {
-		if (XFS_BUF_ISDELAYWRITE(bp)) {
-			int	error;
-			if (XFS_BUF_ISPINNED(bp)) {
-				xfs_log_force(dqp->q_mount,
-					      (xfs_lsn_t)0,
-					      XFS_LOG_FORCE);
-			}
-			error = xfs_bawrite(dqp->q_mount, bp);
-			if (error)
-				xfs_fs_cmn_err(CE_WARN, dqp->q_mount,
-					"xfs_qm_dqflock_pushbuf_wait: "
-					"pushbuf error %d on dqp %p, bp %p",
-					error, dqp, bp);
-		} else {
-			xfs_buf_relse(bp);
-		}
+	bp = xfs_incore(mp->m_ddev_targp, dqp->q_blkno,
+			mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK);
+	if (!bp)
+		goto out_lock;
+
+	if (XFS_BUF_ISDELAYWRITE(bp)) {
+		if (XFS_BUF_ISPINNED(bp))
+			xfs_log_force(mp, 0);
+		xfs_buf_delwri_promote(bp);
+		wake_up_process(bp->b_target->bt_task);
 	}
+	xfs_buf_relse(bp);
+out_lock:
 	xfs_dqflock(dqp);
 }
diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/quota/xfs_dquot.h
index a0f7da586d1b..5da3a23b820d 100644
--- a/fs/xfs/quota/xfs_dquot.h
+++ b/fs/xfs/quota/xfs_dquot.h
@@ -33,40 +33,23 @@
  * The hash chain headers (hash buckets)
  */
 typedef struct xfs_dqhash {
-	struct xfs_dquot *qh_next;
+	struct list_head  qh_list;
 	struct mutex	  qh_lock;
 	uint		  qh_version;	/* ever increasing version */
 	uint		  qh_nelems;	/* number of dquots on the list */
 } xfs_dqhash_t;
 
-typedef struct xfs_dqlink {
-	struct xfs_dquot  *ql_next;	/* forward link */
-	struct xfs_dquot **ql_prevp;	/* pointer to prev ql_next */
-} xfs_dqlink_t;
-
 struct xfs_mount;
 struct xfs_trans;
 
 /*
- * This is the marker which is designed to occupy the first few
- * bytes of the xfs_dquot_t structure. Even inside this, the freelist pointers
- * must come first.
- * This serves as the marker ("sentinel") when we have to restart list
- * iterations because of locking considerations.
- */
-typedef struct xfs_dqmarker {
-	struct xfs_dquot*dqm_flnext;	/* link to freelist: must be first */
-	struct xfs_dquot*dqm_flprev;
-	xfs_dqlink_t	 dqm_mplist;	/* link to mount's list of dquots */
-	xfs_dqlink_t	 dqm_hashlist;	/* link to the hash chain */
-	uint		 dqm_flags;	/* various flags (XFS_DQ_*) */
-} xfs_dqmarker_t;
-
-/*
  * The incore dquot structure
  */
 typedef struct xfs_dquot {
-	xfs_dqmarker_t	 q_lists;	/* list ptrs, q_flags (marker) */
+	uint		 dq_flags;	/* various flags (XFS_DQ_*) */
+	struct list_head q_freelist;	/* global free list of dquots */
+	struct list_head q_mplist;	/* mount's list of dquots */
+	struct list_head q_hashlist;	/* gloabl hash list of dquots */
 	xfs_dqhash_t	*q_hash;	/* the hashchain header */
 	struct xfs_mount*q_mount;	/* filesystem this relates to */
 	struct xfs_trans*q_transp;	/* trans this belongs to currently */
@@ -87,13 +70,6 @@ typedef struct xfs_dquot {
 	wait_queue_head_t q_pinwait;	/* dquot pinning wait queue */
 } xfs_dquot_t;
 
-
-#define dq_flnext	q_lists.dqm_flnext
-#define dq_flprev	q_lists.dqm_flprev
-#define dq_mplist	q_lists.dqm_mplist
-#define dq_hashlist	q_lists.dqm_hashlist
-#define dq_flags	q_lists.dqm_flags
-
 /*
  * Lock hierarchy for q_qlock:
  *	XFS_QLOCK_NORMAL is the implicit default,
@@ -127,7 +103,6 @@ static inline void xfs_dqfunlock(xfs_dquot_t *dqp)
 }
 
 #define XFS_DQ_IS_LOCKED(dqp)	(mutex_is_locked(&((dqp)->q_qlock)))
-#define XFS_DQ_IS_ON_FREELIST(dqp)  ((dqp)->dq_flnext != (dqp))
 #define XFS_DQ_IS_DIRTY(dqp)	((dqp)->dq_flags & XFS_DQ_DIRTY)
 #define XFS_QM_ISUDQ(dqp)	((dqp)->dq_flags & XFS_DQ_USER)
 #define XFS_QM_ISPDQ(dqp)	((dqp)->dq_flags & XFS_DQ_PROJ)
diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c
index d0d4a9a0bbd7..8d89a24ae324 100644
--- a/fs/xfs/quota/xfs_dquot_item.c
+++ b/fs/xfs/quota/xfs_dquot_item.c
@@ -74,11 +74,11 @@ xfs_qm_dquot_logitem_format(
 
 	logvec->i_addr = (xfs_caddr_t)&logitem->qli_format;
 	logvec->i_len  = sizeof(xfs_dq_logformat_t);
-	XLOG_VEC_SET_TYPE(logvec, XLOG_REG_TYPE_QFORMAT);
+	logvec->i_type = XLOG_REG_TYPE_QFORMAT;
 	logvec++;
 	logvec->i_addr = (xfs_caddr_t)&logitem->qli_dquot->q_core;
 	logvec->i_len  = sizeof(xfs_disk_dquot_t);
-	XLOG_VEC_SET_TYPE(logvec, XLOG_REG_TYPE_DQUOT);
+	logvec->i_type = XLOG_REG_TYPE_DQUOT;
 
 	ASSERT(2 == logitem->qli_item.li_desc->lid_size);
 	logitem->qli_format.qlf_size = 2;
@@ -107,8 +107,7 @@ xfs_qm_dquot_logitem_pin(
 /* ARGSUSED */
 STATIC void
 xfs_qm_dquot_logitem_unpin(
-	xfs_dq_logitem_t *logitem,
-	int		  stale)
+	xfs_dq_logitem_t *logitem)
 {
 	xfs_dquot_t *dqp = logitem->qli_dquot;
 
@@ -123,7 +122,7 @@ xfs_qm_dquot_logitem_unpin_remove(
 	xfs_dq_logitem_t *logitem,
 	xfs_trans_t	 *tp)
 {
-	xfs_qm_dquot_logitem_unpin(logitem, 0);
+	xfs_qm_dquot_logitem_unpin(logitem);
 }
 
 /*
@@ -153,7 +152,7 @@ xfs_qm_dquot_logitem_push(
 	 * lock without sleeping, then there must not have been
 	 * anyone in the process of flushing the dquot.
 	 */
-	error = xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI);
+	error = xfs_qm_dqflush(dqp, 0);
 	if (error)
 		xfs_fs_cmn_err(CE_WARN, dqp->q_mount,
 			"xfs_qm_dquot_logitem_push: push error %d on dqp %p",
@@ -190,7 +189,7 @@ xfs_qm_dqunpin_wait(
 	/*
 	 * Give the log a push so we don't wait here too long.
 	 */
-	xfs_log_force(dqp->q_mount, (xfs_lsn_t)0, XFS_LOG_FORCE);
+	xfs_log_force(dqp->q_mount, 0);
 	wait_event(dqp->q_pinwait, (atomic_read(&dqp->q_pincount) == 0));
 }
 
@@ -212,68 +211,31 @@ xfs_qm_dquot_logitem_pushbuf(
 	xfs_dquot_t	*dqp;
 	xfs_mount_t	*mp;
 	xfs_buf_t	*bp;
-	uint		dopush;
 
 	dqp = qip->qli_dquot;
 	ASSERT(XFS_DQ_IS_LOCKED(dqp));
 
 	/*
-	 * The qli_pushbuf_flag keeps others from
-	 * trying to duplicate our effort.
-	 */
-	ASSERT(qip->qli_pushbuf_flag != 0);
-	ASSERT(qip->qli_push_owner == current_pid());
-
-	/*
 	 * If flushlock isn't locked anymore, chances are that the
 	 * inode flush completed and the inode was taken off the AIL.
 	 * So, just get out.
 	 */
 	if (completion_done(&dqp->q_flush)  ||
 	    ((qip->qli_item.li_flags & XFS_LI_IN_AIL) == 0)) {
-		qip->qli_pushbuf_flag = 0;
 		xfs_dqunlock(dqp);
 		return;
 	}
 	mp = dqp->q_mount;
 	bp = xfs_incore(mp->m_ddev_targp, qip->qli_format.qlf_blkno,
-		    XFS_QI_DQCHUNKLEN(mp),
-		    XFS_INCORE_TRYLOCK);
-	if (bp != NULL) {
-		if (XFS_BUF_ISDELAYWRITE(bp)) {
-			dopush = ((qip->qli_item.li_flags & XFS_LI_IN_AIL) &&
-				  !completion_done(&dqp->q_flush));
-			qip->qli_pushbuf_flag = 0;
-			xfs_dqunlock(dqp);
-
-			if (XFS_BUF_ISPINNED(bp)) {
-				xfs_log_force(mp, (xfs_lsn_t)0,
-					      XFS_LOG_FORCE);
-			}
-			if (dopush) {
-				int	error;
-#ifdef XFSRACEDEBUG
-				delay_for_intr();
-				delay(300);
-#endif
-				error = xfs_bawrite(mp, bp);
-				if (error)
-					xfs_fs_cmn_err(CE_WARN, mp,
-	"xfs_qm_dquot_logitem_pushbuf: pushbuf error %d on qip %p, bp %p",
-							error, qip, bp);
-			} else {
-				xfs_buf_relse(bp);
-			}
-		} else {
-			qip->qli_pushbuf_flag = 0;
-			xfs_dqunlock(dqp);
-			xfs_buf_relse(bp);
-		}
+			mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK);
+	xfs_dqunlock(dqp);
+	if (!bp)
 		return;
-	}
+	if (XFS_BUF_ISDELAYWRITE(bp))
+		xfs_buf_delwri_promote(bp);
+	xfs_buf_relse(bp);
+	return;
 
-	qip->qli_pushbuf_flag = 0;
-	xfs_dqunlock(dqp);
 }
 
 /*
@@ -291,50 +253,24 @@ xfs_qm_dquot_logitem_trylock(
 	xfs_dq_logitem_t	*qip)
 {
 	xfs_dquot_t		*dqp;
-	uint			retval;
 
 	dqp = qip->qli_dquot;
 	if (atomic_read(&dqp->q_pincount) > 0)
-		return (XFS_ITEM_PINNED);
+		return XFS_ITEM_PINNED;
 
 	if (! xfs_qm_dqlock_nowait(dqp))
-		return (XFS_ITEM_LOCKED);
+		return XFS_ITEM_LOCKED;
 
-	retval = XFS_ITEM_SUCCESS;
 	if (!xfs_dqflock_nowait(dqp)) {
 		/*
-		 * The dquot is already being flushed.	It may have been
-		 * flushed delayed write, however, and we don't want to
-		 * get stuck waiting for that to complete.  So, we want to check
-		 * to see if we can lock the dquot's buffer without sleeping.
-		 * If we can and it is marked for delayed write, then we
-		 * hold it and send it out from the push routine.  We don't
-		 * want to do that now since we might sleep in the device
-		 * strategy routine.  We also don't want to grab the buffer lock
-		 * here because we'd like not to call into the buffer cache
-		 * while holding the AIL lock.
-		 * Make sure to only return PUSHBUF if we set pushbuf_flag
-		 * ourselves.  If someone else is doing it then we don't
-		 * want to go to the push routine and duplicate their efforts.
+		 * dquot has already been flushed to the backing buffer,
+		 * leave it locked, pushbuf routine will unlock it.
 		 */
-		if (qip->qli_pushbuf_flag == 0) {
-			qip->qli_pushbuf_flag = 1;
-			ASSERT(qip->qli_format.qlf_blkno == dqp->q_blkno);
-#ifdef DEBUG
-			qip->qli_push_owner = current_pid();
-#endif
-			/*
-			 * The dquot is left locked.
-			 */
-			retval = XFS_ITEM_PUSHBUF;
-		} else {
-			retval = XFS_ITEM_FLUSHING;
-			xfs_dqunlock_nonotify(dqp);
-		}
+		return XFS_ITEM_PUSHBUF;
 	}
 
 	ASSERT(qip->qli_item.li_flags & XFS_LI_IN_AIL);
-	return (retval);
+	return XFS_ITEM_SUCCESS;
 }
 
 
@@ -392,8 +328,7 @@ static struct xfs_item_ops xfs_dquot_item_ops = {
 	.iop_format	= (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
 					xfs_qm_dquot_logitem_format,
 	.iop_pin	= (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_pin,
-	.iop_unpin	= (void(*)(xfs_log_item_t*, int))
-					xfs_qm_dquot_logitem_unpin,
+	.iop_unpin	= (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_unpin,
 	.iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*))
 					xfs_qm_dquot_logitem_unpin_remove,
 	.iop_trylock	= (uint(*)(xfs_log_item_t*))
@@ -420,9 +355,8 @@ xfs_qm_dquot_logitem_init(
 	xfs_dq_logitem_t  *lp;
 	lp = &dqp->q_logitem;
 
-	lp->qli_item.li_type = XFS_LI_DQUOT;
-	lp->qli_item.li_ops = &xfs_dquot_item_ops;
-	lp->qli_item.li_mountp = dqp->q_mount;
+	xfs_log_item_init(dqp->q_mount, &lp->qli_item, XFS_LI_DQUOT,
+					&xfs_dquot_item_ops);
 	lp->qli_dquot = dqp;
 	lp->qli_format.qlf_type = XFS_LI_DQUOT;
 	lp->qli_format.qlf_id = be32_to_cpu(dqp->q_core.d_id);
@@ -467,7 +401,7 @@ xfs_qm_qoff_logitem_format(xfs_qoff_logitem_t	*qf,
 
 	log_vector->i_addr = (xfs_caddr_t)&(qf->qql_format);
 	log_vector->i_len = sizeof(xfs_qoff_logitem_t);
-	XLOG_VEC_SET_TYPE(log_vector, XLOG_REG_TYPE_QUOTAOFF);
+	log_vector->i_type = XLOG_REG_TYPE_QUOTAOFF;
 	qf->qql_format.qf_size = 1;
 }
 
@@ -489,7 +423,7 @@ xfs_qm_qoff_logitem_pin(xfs_qoff_logitem_t *qf)
  */
 /*ARGSUSED*/
 STATIC void
-xfs_qm_qoff_logitem_unpin(xfs_qoff_logitem_t *qf, int stale)
+xfs_qm_qoff_logitem_unpin(xfs_qoff_logitem_t *qf)
 {
 	return;
 }
@@ -600,8 +534,7 @@ static struct xfs_item_ops xfs_qm_qoffend_logitem_ops = {
 	.iop_format	= (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
 					xfs_qm_qoff_logitem_format,
 	.iop_pin	= (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_pin,
-	.iop_unpin	= (void(*)(xfs_log_item_t* ,int))
-					xfs_qm_qoff_logitem_unpin,
+	.iop_unpin	= (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_unpin,
 	.iop_unpin_remove = (void(*)(xfs_log_item_t*,xfs_trans_t*))
 					xfs_qm_qoff_logitem_unpin_remove,
 	.iop_trylock	= (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_trylock,
@@ -622,8 +555,7 @@ static struct xfs_item_ops xfs_qm_qoff_logitem_ops = {
 	.iop_format	= (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
 					xfs_qm_qoff_logitem_format,
 	.iop_pin	= (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_pin,
-	.iop_unpin	= (void(*)(xfs_log_item_t*, int))
-					xfs_qm_qoff_logitem_unpin,
+	.iop_unpin	= (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_unpin,
 	.iop_unpin_remove = (void(*)(xfs_log_item_t*,xfs_trans_t*))
 					xfs_qm_qoff_logitem_unpin_remove,
 	.iop_trylock	= (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_trylock,
@@ -649,11 +581,8 @@ xfs_qm_qoff_logitem_init(
 
 	qf = (xfs_qoff_logitem_t*) kmem_zalloc(sizeof(xfs_qoff_logitem_t), KM_SLEEP);
 
-	qf->qql_item.li_type = XFS_LI_QUOTAOFF;
-	if (start)
-		qf->qql_item.li_ops = &xfs_qm_qoffend_logitem_ops;
-	else
-		qf->qql_item.li_ops = &xfs_qm_qoff_logitem_ops;
+	xfs_log_item_init(mp, &qf->qql_item, XFS_LI_QUOTAOFF, start ?
+			&xfs_qm_qoffend_logitem_ops : &xfs_qm_qoff_logitem_ops);
 	qf->qql_item.li_mountp = mp;
 	qf->qql_format.qf_type = XFS_LI_QUOTAOFF;
 	qf->qql_format.qf_flags = flags;
diff --git a/fs/xfs/quota/xfs_dquot_item.h b/fs/xfs/quota/xfs_dquot_item.h
index 5a632531f843..5acae2ada70b 100644
--- a/fs/xfs/quota/xfs_dquot_item.h
+++ b/fs/xfs/quota/xfs_dquot_item.h
@@ -27,10 +27,6 @@ typedef struct xfs_dq_logitem {
 	xfs_log_item_t		 qli_item;	   /* common portion */
 	struct xfs_dquot	*qli_dquot;	   /* dquot ptr */
 	xfs_lsn_t		 qli_flush_lsn;	   /* lsn at last flush */
-	unsigned short		 qli_pushbuf_flag; /* 1 bit used in push_ail */
-#ifdef DEBUG
-	uint64_t		 qli_push_owner;
-#endif
 	xfs_dq_logformat_t	 qli_format;	   /* logged structure */
 } xfs_dq_logitem_t;
 
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 9e627a8b5b0e..67c018392d62 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -67,12 +67,9 @@ static cred_t	xfs_zerocr;
 STATIC void	xfs_qm_list_init(xfs_dqlist_t *, char *, int);
 STATIC void	xfs_qm_list_destroy(xfs_dqlist_t *);
 
-STATIC void	xfs_qm_freelist_init(xfs_frlist_t *);
-STATIC void	xfs_qm_freelist_destroy(xfs_frlist_t *);
-
 STATIC int	xfs_qm_init_quotainos(xfs_mount_t *);
 STATIC int	xfs_qm_init_quotainfo(xfs_mount_t *);
-STATIC int	xfs_qm_shake(int, gfp_t);
+STATIC int	xfs_qm_shake(struct shrinker *, int, gfp_t);
 
 static struct shrinker xfs_qm_shaker = {
 	.shrink = xfs_qm_shake,
@@ -84,21 +81,25 @@ extern struct mutex	qcheck_lock;
 #endif
 
 #ifdef QUOTADEBUG
-#define XQM_LIST_PRINT(l, NXT, title) \
-{ \
-	xfs_dquot_t	*dqp; int i = 0; \
-	cmn_err(CE_DEBUG, "%s (#%d)", title, (int) (l)->qh_nelems); \
-	for (dqp = (l)->qh_next; dqp != NULL; dqp = dqp->NXT) { \
-		cmn_err(CE_DEBUG, "   %d.  \"%d (%s)\"   " \
-				  "bcnt = %d, icnt = %d, refs = %d", \
-			++i, (int) be32_to_cpu(dqp->q_core.d_id), \
-			DQFLAGTO_TYPESTR(dqp),	     \
-			(int) be64_to_cpu(dqp->q_core.d_bcount), \
-			(int) be64_to_cpu(dqp->q_core.d_icount), \
-			(int) dqp->q_nrefs);  } \
+static void
+xfs_qm_dquot_list_print(
+	struct xfs_mount *mp)
+{
+	xfs_dquot_t	*dqp;
+	int		i = 0;
+
+	list_for_each_entry(dqp, &mp->m_quotainfo->qi_dqlist_lock, qi_mplist) {
+		cmn_err(CE_DEBUG, "   %d. \"%d (%s)\"   "
+				  "bcnt = %lld, icnt = %lld, refs = %d",
+			i++, be32_to_cpu(dqp->q_core.d_id),
+			DQFLAGTO_TYPESTR(dqp),
+			(long long)be64_to_cpu(dqp->q_core.d_bcount),
+			(long long)be64_to_cpu(dqp->q_core.d_icount),
+			dqp->q_nrefs);
+	}
 }
 #else
-#define XQM_LIST_PRINT(l, NXT, title) do { } while (0)
+static void xfs_qm_dquot_list_print(struct xfs_mount *mp) { }
 #endif
 
 /*
@@ -118,9 +119,14 @@ xfs_Gqm_init(void)
 	 */
 	udqhash = kmem_zalloc_greedy(&hsize,
 				     XFS_QM_HASHSIZE_LOW * sizeof(xfs_dqhash_t),
-				     XFS_QM_HASHSIZE_HIGH * sizeof(xfs_dqhash_t),
-				     KM_SLEEP | KM_MAYFAIL | KM_LARGE);
-	gdqhash = kmem_zalloc(hsize, KM_SLEEP | KM_LARGE);
+				     XFS_QM_HASHSIZE_HIGH * sizeof(xfs_dqhash_t));
+	if (!udqhash)
+		goto out;
+
+	gdqhash = kmem_zalloc_large(hsize);
+	if (!gdqhash)
+		goto out_free_udqhash;
+
 	hsize /= sizeof(xfs_dqhash_t);
 	ndquot = hsize << 8;
 
@@ -139,7 +145,9 @@ xfs_Gqm_init(void)
 	/*
 	 * Freelist of all dquots of all file systems
 	 */
-	xfs_qm_freelist_init(&(xqm->qm_dqfreelist));
+	INIT_LIST_HEAD(&xqm->qm_dqfrlist);
+	xqm->qm_dqfrlist_cnt = 0;
+	mutex_init(&xqm->qm_dqfrlist_lock);
 
 	/*
 	 * dquot zone. we register our own low-memory callback.
@@ -170,6 +178,11 @@ xfs_Gqm_init(void)
 	mutex_init(&qcheck_lock);
 #endif
 	return xqm;
+
+ out_free_udqhash:
+	kmem_free_large(udqhash);
+ out:
+	return NULL;
 }
 
 /*
@@ -179,6 +192,7 @@ STATIC void
 xfs_qm_destroy(
 	struct xfs_qm	*xqm)
 {
+	struct xfs_dquot *dqp, *n;
 	int		hsize, i;
 
 	ASSERT(xqm != NULL);
@@ -189,12 +203,26 @@ xfs_qm_destroy(
 		xfs_qm_list_destroy(&(xqm->qm_usr_dqhtable[i]));
 		xfs_qm_list_destroy(&(xqm->qm_grp_dqhtable[i]));
 	}
-	kmem_free(xqm->qm_usr_dqhtable);
-	kmem_free(xqm->qm_grp_dqhtable);
+	kmem_free_large(xqm->qm_usr_dqhtable);
+	kmem_free_large(xqm->qm_grp_dqhtable);
 	xqm->qm_usr_dqhtable = NULL;
 	xqm->qm_grp_dqhtable = NULL;
 	xqm->qm_dqhashmask = 0;
-	xfs_qm_freelist_destroy(&(xqm->qm_dqfreelist));
+
+	/* frlist cleanup */
+	mutex_lock(&xqm->qm_dqfrlist_lock);
+	list_for_each_entry_safe(dqp, n, &xqm->qm_dqfrlist, q_freelist) {
+		xfs_dqlock(dqp);
+#ifdef QUOTADEBUG
+		cmn_err(CE_DEBUG, "FREELIST destroy 0x%p", dqp);
+#endif
+		list_del_init(&dqp->q_freelist);
+		xfs_Gqm->qm_dqfrlist_cnt--;
+		xfs_dqunlock(dqp);
+		xfs_qm_dqdestroy(dqp);
+	}
+	mutex_unlock(&xqm->qm_dqfrlist_lock);
+	mutex_destroy(&xqm->qm_dqfrlist_lock);
 #ifdef DEBUG
 	mutex_destroy(&qcheck_lock);
 #endif
@@ -219,8 +247,14 @@ xfs_qm_hold_quotafs_ref(
 	 */
 	mutex_lock(&xfs_Gqm_lock);
 
-	if (xfs_Gqm == NULL)
+	if (!xfs_Gqm) {
 		xfs_Gqm = xfs_Gqm_init();
+		if (!xfs_Gqm) {
+			mutex_unlock(&xfs_Gqm_lock);
+			return ENOMEM;
+		}
+	}
+
 	/*
 	 * We can keep a list of all filesystems with quotas mounted for
 	 * debugging and statistical purposes, but ...
@@ -242,7 +276,7 @@ STATIC void
 xfs_qm_rele_quotafs_ref(
 	struct xfs_mount *mp)
 {
-	xfs_dquot_t	*dqp, *nextdqp;
+	xfs_dquot_t	*dqp, *n;
 
 	ASSERT(xfs_Gqm);
 	ASSERT(xfs_Gqm->qm_nrefs > 0);
@@ -250,26 +284,24 @@ xfs_qm_rele_quotafs_ref(
 	/*
 	 * Go thru the freelist and destroy all inactive dquots.
 	 */
-	xfs_qm_freelist_lock(xfs_Gqm);
+	mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
 
-	for (dqp = xfs_Gqm->qm_dqfreelist.qh_next;
-	     dqp != (xfs_dquot_t *)&(xfs_Gqm->qm_dqfreelist); ) {
+	list_for_each_entry_safe(dqp, n, &xfs_Gqm->qm_dqfrlist, q_freelist) {
 		xfs_dqlock(dqp);
-		nextdqp = dqp->dq_flnext;
 		if (dqp->dq_flags & XFS_DQ_INACTIVE) {
 			ASSERT(dqp->q_mount == NULL);
 			ASSERT(! XFS_DQ_IS_DIRTY(dqp));
-			ASSERT(dqp->HL_PREVP == NULL);
-			ASSERT(dqp->MPL_PREVP == NULL);
-			XQM_FREELIST_REMOVE(dqp);
+			ASSERT(list_empty(&dqp->q_hashlist));
+			ASSERT(list_empty(&dqp->q_mplist));
+			list_del_init(&dqp->q_freelist);
+			xfs_Gqm->qm_dqfrlist_cnt--;
 			xfs_dqunlock(dqp);
 			xfs_qm_dqdestroy(dqp);
 		} else {
 			xfs_dqunlock(dqp);
 		}
-		dqp = nextdqp;
 	}
-	xfs_qm_freelist_unlock(xfs_Gqm);
+	mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
 
 	/*
 	 * Destroy the entire XQM. If somebody mounts with quotaon, this'll
@@ -291,7 +323,7 @@ xfs_qm_unmount(
 	struct xfs_mount	*mp)
 {
 	if (mp->m_quotainfo) {
-		xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_UMOUNTING);
+		xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL);
 		xfs_qm_destroy_quotainfo(mp);
 	}
 }
@@ -435,20 +467,21 @@ xfs_qm_unmount_quotas(
  */
 STATIC int
 xfs_qm_dqflush_all(
-	xfs_mount_t	*mp,
-	int		flags)
+	struct xfs_mount	*mp,
+	int			sync_mode)
 {
-	int		recl;
-	xfs_dquot_t	*dqp;
-	int		niters;
-	int		error;
+	struct xfs_quotainfo	*q = mp->m_quotainfo;
+	int			recl;
+	struct xfs_dquot	*dqp;
+	int			niters;
+	int			error;
 
-	if (mp->m_quotainfo == NULL)
+	if (!q)
 		return 0;
 	niters = 0;
 again:
-	xfs_qm_mplist_lock(mp);
-	FOREACH_DQUOT_IN_MP(dqp, mp) {
+	mutex_lock(&q->qi_dqlist_lock);
+	list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) {
 		xfs_dqlock(dqp);
 		if (! XFS_DQ_IS_DIRTY(dqp)) {
 			xfs_dqunlock(dqp);
@@ -456,7 +489,7 @@ again:
 		}
 
 		/* XXX a sentinel would be better */
-		recl = XFS_QI_MPLRECLAIMS(mp);
+		recl = q->qi_dqreclaims;
 		if (!xfs_dqflock_nowait(dqp)) {
 			/*
 			 * If we can't grab the flush lock then check
@@ -471,21 +504,21 @@ again:
 		 * Let go of the mplist lock. We don't want to hold it
 		 * across a disk write.
 		 */
-		xfs_qm_mplist_unlock(mp);
-		error = xfs_qm_dqflush(dqp, flags);
+		mutex_unlock(&q->qi_dqlist_lock);
+		error = xfs_qm_dqflush(dqp, sync_mode);
 		xfs_dqunlock(dqp);
 		if (error)
 			return error;
 
-		xfs_qm_mplist_lock(mp);
-		if (recl != XFS_QI_MPLRECLAIMS(mp)) {
-			xfs_qm_mplist_unlock(mp);
+		mutex_lock(&q->qi_dqlist_lock);
+		if (recl != q->qi_dqreclaims) {
+			mutex_unlock(&q->qi_dqlist_lock);
 			/* XXX restart limit */
 			goto again;
 		}
 	}
 
-	xfs_qm_mplist_unlock(mp);
+	mutex_unlock(&q->qi_dqlist_lock);
 	/* return ! busy */
 	return 0;
 }
@@ -495,15 +528,15 @@ again:
  */
 STATIC void
 xfs_qm_detach_gdquots(
-	xfs_mount_t	*mp)
+	struct xfs_mount	*mp)
 {
-	xfs_dquot_t	*dqp, *gdqp;
-	int		nrecl;
+	struct xfs_quotainfo	*q = mp->m_quotainfo;
+	struct xfs_dquot	*dqp, *gdqp;
+	int			nrecl;
 
  again:
-	ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp));
-	dqp = XFS_QI_MPLNEXT(mp);
-	while (dqp) {
+	ASSERT(mutex_is_locked(&q->qi_dqlist_lock));
+	list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) {
 		xfs_dqlock(dqp);
 		if ((gdqp = dqp->q_gdquot)) {
 			xfs_dqlock(gdqp);
@@ -516,15 +549,14 @@ xfs_qm_detach_gdquots(
 			 * Can't hold the mplist lock across a dqput.
 			 * XXXmust convert to marker based iterations here.
 			 */
-			nrecl = XFS_QI_MPLRECLAIMS(mp);
-			xfs_qm_mplist_unlock(mp);
+			nrecl = q->qi_dqreclaims;
+			mutex_unlock(&q->qi_dqlist_lock);
 			xfs_qm_dqput(gdqp);
 
-			xfs_qm_mplist_lock(mp);
-			if (nrecl != XFS_QI_MPLRECLAIMS(mp))
+			mutex_lock(&q->qi_dqlist_lock);
+			if (nrecl != q->qi_dqreclaims)
 				goto again;
 		}
-		dqp = dqp->MPL_NEXT;
 	}
 }
 
@@ -536,23 +568,23 @@ xfs_qm_detach_gdquots(
  */
 STATIC int
 xfs_qm_dqpurge_int(
-	xfs_mount_t	*mp,
-	uint		flags) /* QUOTAOFF/UMOUNTING/UQUOTA/PQUOTA/GQUOTA */
+	struct xfs_mount	*mp,
+	uint			flags)
 {
-	xfs_dquot_t	*dqp;
-	uint		dqtype;
-	int		nrecl;
-	xfs_dquot_t	*nextdqp;
-	int		nmisses;
+	struct xfs_quotainfo	*q = mp->m_quotainfo;
+	struct xfs_dquot	*dqp, *n;
+	uint			dqtype;
+	int			nrecl;
+	int			nmisses;
 
-	if (mp->m_quotainfo == NULL)
+	if (!q)
 		return 0;
 
 	dqtype = (flags & XFS_QMOPT_UQUOTA) ? XFS_DQ_USER : 0;
 	dqtype |= (flags & XFS_QMOPT_PQUOTA) ? XFS_DQ_PROJ : 0;
 	dqtype |= (flags & XFS_QMOPT_GQUOTA) ? XFS_DQ_GROUP : 0;
 
-	xfs_qm_mplist_lock(mp);
+	mutex_lock(&q->qi_dqlist_lock);
 
 	/*
 	 * In the first pass through all incore dquots of this filesystem,
@@ -564,28 +596,25 @@ xfs_qm_dqpurge_int(
 
       again:
 	nmisses = 0;
-	ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp));
+	ASSERT(mutex_is_locked(&q->qi_dqlist_lock));
 	/*
 	 * Try to get rid of all of the unwanted dquots. The idea is to
 	 * get them off mplist and hashlist, but leave them on freelist.
 	 */
-	dqp = XFS_QI_MPLNEXT(mp);
-	while (dqp) {
+	list_for_each_entry_safe(dqp, n, &q->qi_dqlist, q_mplist) {
 		/*
 		 * It's OK to look at the type without taking dqlock here.
 		 * We're holding the mplist lock here, and that's needed for
 		 * a dqreclaim.
 		 */
-		if ((dqp->dq_flags & dqtype) == 0) {
-			dqp = dqp->MPL_NEXT;
+		if ((dqp->dq_flags & dqtype) == 0)
 			continue;
-		}
 
 		if (!mutex_trylock(&dqp->q_hash->qh_lock)) {
-			nrecl = XFS_QI_MPLRECLAIMS(mp);
-			xfs_qm_mplist_unlock(mp);
+			nrecl = q->qi_dqreclaims;
+			mutex_unlock(&q->qi_dqlist_lock);
 			mutex_lock(&dqp->q_hash->qh_lock);
-			xfs_qm_mplist_lock(mp);
+			mutex_lock(&q->qi_dqlist_lock);
 
 			/*
 			 * XXXTheoretically, we can get into a very long
@@ -593,7 +622,7 @@ xfs_qm_dqpurge_int(
 			 * No one can be adding dquots to the mplist at
 			 * this point, but somebody might be taking things off.
 			 */
-			if (nrecl != XFS_QI_MPLRECLAIMS(mp)) {
+			if (nrecl != q->qi_dqreclaims) {
 				mutex_unlock(&dqp->q_hash->qh_lock);
 				goto again;
 			}
@@ -603,11 +632,9 @@ xfs_qm_dqpurge_int(
 		 * Take the dquot off the mplist and hashlist. It may remain on
 		 * freelist in INACTIVE state.
 		 */
-		nextdqp = dqp->MPL_NEXT;
 		nmisses += xfs_qm_dqpurge(dqp);
-		dqp = nextdqp;
 	}
-	xfs_qm_mplist_unlock(mp);
+	mutex_unlock(&q->qi_dqlist_lock);
 	return nmisses;
 }
 
@@ -907,33 +934,33 @@ xfs_qm_dqdetach(
 
 int
 xfs_qm_sync(
-	xfs_mount_t	*mp,
-	int		flags)
+	struct xfs_mount	*mp,
+	int			flags)
 {
-	int		recl, restarts;
-	xfs_dquot_t	*dqp;
-	uint		flush_flags;
-	int		error;
+	struct xfs_quotainfo	*q = mp->m_quotainfo;
+	int			recl, restarts;
+	struct xfs_dquot	*dqp;
+	int			error;
 
 	if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
 		return 0;
 
-	flush_flags = (flags & SYNC_WAIT) ? XFS_QMOPT_SYNC : XFS_QMOPT_DELWRI;
 	restarts = 0;
 
   again:
-	xfs_qm_mplist_lock(mp);
+	mutex_lock(&q->qi_dqlist_lock);
 	/*
 	 * dqpurge_all() also takes the mplist lock and iterate thru all dquots
 	 * in quotaoff. However, if the QUOTA_ACTIVE bits are not cleared
 	 * when we have the mplist lock, we know that dquots will be consistent
 	 * as long as we have it locked.
 	 */
-	if (! XFS_IS_QUOTA_ON(mp)) {
-		xfs_qm_mplist_unlock(mp);
+	if (!XFS_IS_QUOTA_ON(mp)) {
+		mutex_unlock(&q->qi_dqlist_lock);
 		return 0;
 	}
-	FOREACH_DQUOT_IN_MP(dqp, mp) {
+	ASSERT(mutex_is_locked(&q->qi_dqlist_lock));
+	list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) {
 		/*
 		 * If this is vfs_sync calling, then skip the dquots that
 		 * don't 'seem' to be dirty. ie. don't acquire dqlock.
@@ -957,7 +984,7 @@ xfs_qm_sync(
 		}
 
 		/* XXX a sentinel would be better */
-		recl = XFS_QI_MPLRECLAIMS(mp);
+		recl = q->qi_dqreclaims;
 		if (!xfs_dqflock_nowait(dqp)) {
 			if (flags & SYNC_TRYLOCK) {
 				xfs_dqunlock(dqp);
@@ -977,25 +1004,25 @@ xfs_qm_sync(
 		 * Let go of the mplist lock. We don't want to hold it
 		 * across a disk write
 		 */
-		xfs_qm_mplist_unlock(mp);
-		error = xfs_qm_dqflush(dqp, flush_flags);
+		mutex_unlock(&q->qi_dqlist_lock);
+		error = xfs_qm_dqflush(dqp, flags);
 		xfs_dqunlock(dqp);
 		if (error && XFS_FORCED_SHUTDOWN(mp))
 			return 0;	/* Need to prevent umount failure */
 		else if (error)
 			return error;
 
-		xfs_qm_mplist_lock(mp);
-		if (recl != XFS_QI_MPLRECLAIMS(mp)) {
+		mutex_lock(&q->qi_dqlist_lock);
+		if (recl != q->qi_dqreclaims) {
 			if (++restarts >= XFS_QM_SYNC_MAX_RESTARTS)
 				break;
 
-			xfs_qm_mplist_unlock(mp);
+			mutex_unlock(&q->qi_dqlist_lock);
 			goto again;
 		}
 	}
 
-	xfs_qm_mplist_unlock(mp);
+	mutex_unlock(&q->qi_dqlist_lock);
 	return 0;
 }
 
@@ -1040,8 +1067,9 @@ xfs_qm_init_quotainfo(
 		return error;
 	}
 
-	xfs_qm_list_init(&qinf->qi_dqlist, "mpdqlist", 0);
-	lockdep_set_class(&qinf->qi_dqlist.qh_lock, &xfs_quota_mplist_class);
+	INIT_LIST_HEAD(&qinf->qi_dqlist);
+	mutex_init(&qinf->qi_dqlist_lock);
+	lockdep_set_class(&qinf->qi_dqlist_lock, &xfs_quota_mplist_class);
 
 	qinf->qi_dqreclaims = 0;
 
@@ -1138,7 +1166,8 @@ xfs_qm_destroy_quotainfo(
 	 */
 	xfs_qm_rele_quotafs_ref(mp);
 
-	xfs_qm_list_destroy(&qi->qi_dqlist);
+	ASSERT(list_empty(&qi->qi_dqlist));
+	mutex_destroy(&qi->qi_dqlist_lock);
 
 	if (qi->qi_uquotaip) {
 		IRELE(qi->qi_uquotaip);
@@ -1165,7 +1194,7 @@ xfs_qm_list_init(
 	int		n)
 {
 	mutex_init(&list->qh_lock);
-	list->qh_next = NULL;
+	INIT_LIST_HEAD(&list->qh_list);
 	list->qh_version = 0;
 	list->qh_nelems = 0;
 }
@@ -1304,9 +1333,6 @@ xfs_qm_qino_alloc(
 	 */
 	spin_lock(&mp->m_sb_lock);
 	if (flags & XFS_QMOPT_SBVERSION) {
-#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
-		unsigned oldv = mp->m_sb.sb_versionnum;
-#endif
 		ASSERT(!xfs_sb_version_hasquota(&mp->m_sb));
 		ASSERT((sbfields & (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
 				   XFS_SB_GQUOTINO | XFS_SB_QFLAGS)) ==
@@ -1319,11 +1345,6 @@ xfs_qm_qino_alloc(
 
 		/* qflags will get updated _after_ quotacheck */
 		mp->m_sb.sb_qflags = 0;
-#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
-		cmn_err(CE_NOTE,
-			"Old superblock version %x, converting to %x.",
-			oldv, mp->m_sb.sb_versionnum);
-#endif
 	}
 	if (flags & XFS_QMOPT_UQUOTA)
 		mp->m_sb.sb_uquotino = (*ip)->i_ino;
@@ -1359,10 +1380,10 @@ xfs_qm_reset_dqcounts(
 #ifdef DEBUG
 	j = XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB);
 	do_div(j, sizeof(xfs_dqblk_t));
-	ASSERT(XFS_QM_DQPERBLK(mp) == j);
+	ASSERT(mp->m_quotainfo->qi_dqperchunk == j);
 #endif
 	ddq = (xfs_disk_dquot_t *)XFS_BUF_PTR(bp);
-	for (j = 0; j < XFS_QM_DQPERBLK(mp); j++) {
+	for (j = 0; j < mp->m_quotainfo->qi_dqperchunk; j++) {
 		/*
 		 * Do a sanity check, and if needed, repair the dqblk. Don't
 		 * output any warnings because it's perfectly possible to
@@ -1417,7 +1438,7 @@ xfs_qm_dqiter_bufs(
 	while (blkcnt--) {
 		error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
 			      XFS_FSB_TO_DADDR(mp, bno),
-			      (int)XFS_QI_DQCHUNKLEN(mp), 0, &bp);
+			      mp->m_quotainfo->qi_dqchunklen, 0, &bp);
 		if (error)
 			break;
 
@@ -1427,7 +1448,7 @@ xfs_qm_dqiter_bufs(
 		 * goto the next block.
 		 */
 		bno++;
-		firstid += XFS_QM_DQPERBLK(mp);
+		firstid += mp->m_quotainfo->qi_dqperchunk;
 	}
 	return error;
 }
@@ -1493,7 +1514,7 @@ xfs_qm_dqiterate(
 				continue;
 
 			firstid = (xfs_dqid_t) map[i].br_startoff *
-				XFS_QM_DQPERBLK(mp);
+				mp->m_quotainfo->qi_dqperchunk;
 			/*
 			 * Do a read-ahead on the next extent.
 			 */
@@ -1504,7 +1525,7 @@ xfs_qm_dqiterate(
 				while (rablkcnt--) {
 					xfs_baread(mp->m_ddev_targp,
 					       XFS_FSB_TO_DADDR(mp, rablkno),
-					       (int)XFS_QI_DQCHUNKLEN(mp));
+					       mp->m_quotainfo->qi_dqchunklen);
 					rablkno++;
 				}
 			}
@@ -1564,8 +1585,10 @@ xfs_qm_quotacheck_dqadjust(
 
 	/*
 	 * Set default limits, adjust timers (since we changed usages)
+	 *
+	 * There are no timers for the default values set in the root dquot.
 	 */
-	if (! XFS_IS_SUSER_DQUOT(dqp)) {
+	if (dqp->q_core.d_id) {
 		xfs_qm_adjust_dqlimits(dqp->q_mount, &dqp->q_core);
 		xfs_qm_adjust_dqtimers(dqp->q_mount, &dqp->q_core);
 	}
@@ -1609,10 +1632,7 @@ xfs_qm_dqusage_adjust(
 	xfs_ino_t	ino,		/* inode number to get data for */
 	void		__user *buffer,	/* not used */
 	int		ubsize,		/* not used */
-	void		*private_data,	/* not used */
-	xfs_daddr_t	bno,		/* starting block of inode cluster */
 	int		*ubused,	/* not used */
-	void		*dip,		/* on-disk inode pointer (not used) */
 	int		*res)		/* result code value */
 {
 	xfs_inode_t	*ip;
@@ -1637,7 +1657,7 @@ xfs_qm_dqusage_adjust(
 	 * the case in all other instances. It's OK that we do this because
 	 * quotacheck is done only at mount time.
 	 */
-	if ((error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_EXCL, &ip, bno))) {
+	if ((error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_EXCL, &ip))) {
 		*res = BULKSTAT_RV_NOTHING;
 		return error;
 	}
@@ -1735,14 +1755,14 @@ xfs_qm_quotacheck(
 	lastino = 0;
 	flags = 0;
 
-	ASSERT(XFS_QI_UQIP(mp) || XFS_QI_GQIP(mp));
+	ASSERT(mp->m_quotainfo->qi_uquotaip || mp->m_quotainfo->qi_gquotaip);
 	ASSERT(XFS_IS_QUOTA_RUNNING(mp));
 
 	/*
 	 * There should be no cached dquots. The (simplistic) quotacheck
 	 * algorithm doesn't like that.
 	 */
-	ASSERT(XFS_QI_MPLNDQUOTS(mp) == 0);
+	ASSERT(list_empty(&mp->m_quotainfo->qi_dqlist));
 
 	cmn_err(CE_NOTE, "XFS quotacheck %s: Please wait.", mp->m_fsname);
 
@@ -1751,15 +1771,19 @@ xfs_qm_quotacheck(
 	 * their counters to zero. We need a clean slate.
 	 * We don't log our changes till later.
 	 */
-	if ((uip = XFS_QI_UQIP(mp))) {
-		if ((error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA)))
+	uip = mp->m_quotainfo->qi_uquotaip;
+	if (uip) {
+		error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA);
+		if (error)
 			goto error_return;
 		flags |= XFS_UQUOTA_CHKD;
 	}
 
-	if ((gip = XFS_QI_GQIP(mp))) {
-		if ((error = xfs_qm_dqiterate(mp, gip, XFS_IS_GQUOTA_ON(mp) ?
-					XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA)))
+	gip = mp->m_quotainfo->qi_gquotaip;
+	if (gip) {
+		error = xfs_qm_dqiterate(mp, gip, XFS_IS_GQUOTA_ON(mp) ?
+					XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA);
+		if (error)
 			goto error_return;
 		flags |= XFS_OQUOTA_CHKD;
 	}
@@ -1769,12 +1793,13 @@ xfs_qm_quotacheck(
 		 * Iterate thru all the inodes in the file system,
 		 * adjusting the corresponding dquot counters in core.
 		 */
-		if ((error = xfs_bulkstat(mp, &lastino, &count,
-				     xfs_qm_dqusage_adjust, NULL,
-				     structsz, NULL, BULKSTAT_FG_IGET, &done)))
+		error = xfs_bulkstat(mp, &lastino, &count,
+				     xfs_qm_dqusage_adjust,
+				     structsz, NULL, &done);
+		if (error)
 			break;
 
-	} while (! done);
+	} while (!done);
 
 	/*
 	 * We've made all the changes that we need to make incore.
@@ -1782,7 +1807,7 @@ xfs_qm_quotacheck(
 	 * successfully.
 	 */
 	if (!error)
-		error = xfs_qm_dqflush_all(mp, XFS_QMOPT_DELWRI);
+		error = xfs_qm_dqflush_all(mp, 0);
 
 	/*
 	 * We can get this error if we couldn't do a dquot allocation inside
@@ -1792,7 +1817,7 @@ xfs_qm_quotacheck(
 	 * at this point (because we intentionally didn't in dqget_noattach).
 	 */
 	if (error) {
-		xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_QUOTAOFF);
+		xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL);
 		goto error_return;
 	}
 
@@ -1813,7 +1838,7 @@ xfs_qm_quotacheck(
 	mp->m_qflags &= ~(XFS_OQUOTA_CHKD | XFS_UQUOTA_CHKD);
 	mp->m_qflags |= flags;
 
-	XQM_LIST_PRINT(&(XFS_QI_MPL_LIST(mp)), MPL_NEXT, "++++ Mp list +++");
+	xfs_qm_dquot_list_print(mp);
 
  error_return:
 	if (error) {
@@ -1862,14 +1887,14 @@ xfs_qm_init_quotainos(
 		    mp->m_sb.sb_uquotino != NULLFSINO) {
 			ASSERT(mp->m_sb.sb_uquotino > 0);
 			if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
-					     0, 0, &uip, 0)))
+					     0, 0, &uip)))
 				return XFS_ERROR(error);
 		}
 		if (XFS_IS_OQUOTA_ON(mp) &&
 		    mp->m_sb.sb_gquotino != NULLFSINO) {
 			ASSERT(mp->m_sb.sb_gquotino > 0);
 			if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino,
-					     0, 0, &gip, 0))) {
+					     0, 0, &gip))) {
 				if (uip)
 					IRELE(uip);
 				return XFS_ERROR(error);
@@ -1908,59 +1933,53 @@ xfs_qm_init_quotainos(
 		}
 	}
 
-	XFS_QI_UQIP(mp) = uip;
-	XFS_QI_GQIP(mp) = gip;
+	mp->m_quotainfo->qi_uquotaip = uip;
+	mp->m_quotainfo->qi_gquotaip = gip;
 
 	return 0;
 }
 
 
+
 /*
- * Traverse the freelist of dquots and attempt to reclaim a maximum of
- * 'howmany' dquots. This operation races with dqlookup(), and attempts to
- * favor the lookup function ...
- * XXXsup merge this with qm_reclaim_one().
+ * Just pop the least recently used dquot off the freelist and
+ * recycle it. The returned dquot is locked.
  */
-STATIC int
-xfs_qm_shake_freelist(
-	int howmany)
+STATIC xfs_dquot_t *
+xfs_qm_dqreclaim_one(void)
 {
-	int		nreclaimed;
-	xfs_dqhash_t	*hash;
-	xfs_dquot_t	*dqp, *nextdqp;
+	xfs_dquot_t	*dqpout;
+	xfs_dquot_t	*dqp;
 	int		restarts;
-	int		nflushes;
 
-	if (howmany <= 0)
-		return 0;
-
-	nreclaimed = 0;
 	restarts = 0;
-	nflushes = 0;
+	dqpout = NULL;
 
-#ifdef QUOTADEBUG
-	cmn_err(CE_DEBUG, "Shake free 0x%x", howmany);
-#endif
-	/* lock order is : hashchainlock, freelistlock, mplistlock */
- tryagain:
-	xfs_qm_freelist_lock(xfs_Gqm);
+	/* lockorder: hashchainlock, freelistlock, mplistlock, dqlock, dqflock */
+startagain:
+	mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
 
-	for (dqp = xfs_Gqm->qm_dqfreelist.qh_next;
-	     ((dqp != (xfs_dquot_t *) &xfs_Gqm->qm_dqfreelist) &&
-	      nreclaimed < howmany); ) {
+	list_for_each_entry(dqp, &xfs_Gqm->qm_dqfrlist, q_freelist) {
+		struct xfs_mount *mp = dqp->q_mount;
 		xfs_dqlock(dqp);
 
 		/*
 		 * We are racing with dqlookup here. Naturally we don't
-		 * want to reclaim a dquot that lookup wants.
+		 * want to reclaim a dquot that lookup wants. We release the
+		 * freelist lock and start over, so that lookup will grab
+		 * both the dquot and the freelistlock.
 		 */
 		if (dqp->dq_flags & XFS_DQ_WANT) {
+			ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE));
+
+			trace_xfs_dqreclaim_want(dqp);
+
 			xfs_dqunlock(dqp);
-			xfs_qm_freelist_unlock(xfs_Gqm);
+			mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
 			if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
-				return nreclaimed;
+				return NULL;
 			XQM_STATS_INC(xqmstats.xs_qm_dqwants);
-			goto tryagain;
+			goto startagain;
 		}
 
 		/*
@@ -1969,23 +1988,27 @@ xfs_qm_shake_freelist(
 		 * life easier.
 		 */
 		if (dqp->dq_flags & XFS_DQ_INACTIVE) {
-			ASSERT(dqp->q_mount == NULL);
+			ASSERT(mp == NULL);
 			ASSERT(! XFS_DQ_IS_DIRTY(dqp));
-			ASSERT(dqp->HL_PREVP == NULL);
-			ASSERT(dqp->MPL_PREVP == NULL);
+			ASSERT(list_empty(&dqp->q_hashlist));
+			ASSERT(list_empty(&dqp->q_mplist));
+			list_del_init(&dqp->q_freelist);
+			xfs_Gqm->qm_dqfrlist_cnt--;
+			xfs_dqunlock(dqp);
+			dqpout = dqp;
 			XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims);
-			nextdqp = dqp->dq_flnext;
-			goto off_freelist;
+			break;
 		}
 
-		ASSERT(dqp->MPL_PREVP);
+		ASSERT(dqp->q_hash);
+		ASSERT(!list_empty(&dqp->q_mplist));
+
 		/*
 		 * Try to grab the flush lock. If this dquot is in the process of
 		 * getting flushed to disk, we don't want to reclaim it.
 		 */
 		if (!xfs_dqflock_nowait(dqp)) {
 			xfs_dqunlock(dqp);
-			dqp = dqp->dq_flnext;
 			continue;
 		}
 
@@ -1998,21 +2021,21 @@ xfs_qm_shake_freelist(
 		if (XFS_DQ_IS_DIRTY(dqp)) {
 			int	error;
 
-			trace_xfs_dqshake_dirty(dqp);
+			trace_xfs_dqreclaim_dirty(dqp);
 
 			/*
 			 * We flush it delayed write, so don't bother
-			 * releasing the mplock.
+			 * releasing the freelist lock.
 			 */
-			error = xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI);
+			error = xfs_qm_dqflush(dqp, 0);
 			if (error) {
-				xfs_fs_cmn_err(CE_WARN, dqp->q_mount,
-			"xfs_qm_dqflush_all: dquot %p flush failed", dqp);
+				xfs_fs_cmn_err(CE_WARN, mp,
+			"xfs_qm_dqreclaim: dquot %p flush failed", dqp);
 			}
 			xfs_dqunlock(dqp); /* dqflush unlocks dqflock */
-			dqp = dqp->dq_flnext;
 			continue;
 		}
+
 		/*
 		 * We're trying to get the hashlock out of order. This races
 		 * with dqlookup; so, we giveup and goto the next dquot if
@@ -2021,62 +2044,83 @@ xfs_qm_shake_freelist(
 		 * waiting for the freelist lock.
 		 */
 		if (!mutex_trylock(&dqp->q_hash->qh_lock)) {
-			xfs_dqfunlock(dqp);
-			xfs_dqunlock(dqp);
-			dqp = dqp->dq_flnext;
-			continue;
+			restarts++;
+			goto dqfunlock;
 		}
+
 		/*
 		 * This races with dquot allocation code as well as dqflush_all
 		 * and reclaim code. So, if we failed to grab the mplist lock,
 		 * giveup everything and start over.
 		 */
-		hash = dqp->q_hash;
-		ASSERT(hash);
-		if (! xfs_qm_mplist_nowait(dqp->q_mount)) {
-			/* XXX put a sentinel so that we can come back here */
+		if (!mutex_trylock(&mp->m_quotainfo->qi_dqlist_lock)) {
+			restarts++;
+			mutex_unlock(&dqp->q_hash->qh_lock);
 			xfs_dqfunlock(dqp);
 			xfs_dqunlock(dqp);
-			mutex_unlock(&hash->qh_lock);
-			xfs_qm_freelist_unlock(xfs_Gqm);
-			if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
-				return nreclaimed;
-			goto tryagain;
+			mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
+			if (restarts++ >= XFS_QM_RECLAIM_MAX_RESTARTS)
+				return NULL;
+			goto startagain;
 		}
 
-		trace_xfs_dqshake_unlink(dqp);
-
-#ifdef QUOTADEBUG
-		cmn_err(CE_DEBUG, "Shake 0x%p, ID 0x%x\n",
-			dqp, be32_to_cpu(dqp->q_core.d_id));
-#endif
 		ASSERT(dqp->q_nrefs == 0);
-		nextdqp = dqp->dq_flnext;
-		XQM_MPLIST_REMOVE(&(XFS_QI_MPL_LIST(dqp->q_mount)), dqp);
-		XQM_HASHLIST_REMOVE(hash, dqp);
+		list_del_init(&dqp->q_mplist);
+		mp->m_quotainfo->qi_dquots--;
+		mp->m_quotainfo->qi_dqreclaims++;
+		list_del_init(&dqp->q_hashlist);
+		dqp->q_hash->qh_version++;
+		list_del_init(&dqp->q_freelist);
+		xfs_Gqm->qm_dqfrlist_cnt--;
+		dqpout = dqp;
+		mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock);
+		mutex_unlock(&dqp->q_hash->qh_lock);
+dqfunlock:
 		xfs_dqfunlock(dqp);
-		xfs_qm_mplist_unlock(dqp->q_mount);
-		mutex_unlock(&hash->qh_lock);
-
- off_freelist:
-		XQM_FREELIST_REMOVE(dqp);
 		xfs_dqunlock(dqp);
-		nreclaimed++;
-		XQM_STATS_INC(xqmstats.xs_qm_dqshake_reclaims);
+		if (dqpout)
+			break;
+		if (restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
+			return NULL;
+	}
+	mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
+	return dqpout;
+}
+
+/*
+ * Traverse the freelist of dquots and attempt to reclaim a maximum of
+ * 'howmany' dquots. This operation races with dqlookup(), and attempts to
+ * favor the lookup function ...
+ */
+STATIC int
+xfs_qm_shake_freelist(
+	int	howmany)
+{
+	int		nreclaimed = 0;
+	xfs_dquot_t	*dqp;
+
+	if (howmany <= 0)
+		return 0;
+
+	while (nreclaimed < howmany) {
+		dqp = xfs_qm_dqreclaim_one();
+		if (!dqp)
+			return nreclaimed;
 		xfs_qm_dqdestroy(dqp);
-		dqp = nextdqp;
+		nreclaimed++;
 	}
-	xfs_qm_freelist_unlock(xfs_Gqm);
 	return nreclaimed;
 }
 
-
 /*
  * The kmem_shake interface is invoked when memory is running low.
  */
 /* ARGSUSED */
 STATIC int
-xfs_qm_shake(int nr_to_scan, gfp_t gfp_mask)
+xfs_qm_shake(
+	struct shrinker	*shrink,
+	int		nr_to_scan,
+	gfp_t		gfp_mask)
 {
 	int	ndqused, nfree, n;
 
@@ -2085,7 +2129,7 @@ xfs_qm_shake(int nr_to_scan, gfp_t gfp_mask)
 	if (!xfs_Gqm)
 		return 0;
 
-	nfree = xfs_Gqm->qm_dqfreelist.qh_nelems; /* free dquots */
+	nfree = xfs_Gqm->qm_dqfrlist_cnt; /* free dquots */
 	/* incore dquots in all f/s's */
 	ndqused = atomic_read(&xfs_Gqm->qm_totaldquots) - nfree;
 
@@ -2101,131 +2145,6 @@ xfs_qm_shake(int nr_to_scan, gfp_t gfp_mask)
 }
 
 
-/*
- * Just pop the least recently used dquot off the freelist and
- * recycle it. The returned dquot is locked.
- */
-STATIC xfs_dquot_t *
-xfs_qm_dqreclaim_one(void)
-{
-	xfs_dquot_t	*dqpout;
-	xfs_dquot_t	*dqp;
-	int		restarts;
-	int		nflushes;
-
-	restarts = 0;
-	dqpout = NULL;
-	nflushes = 0;
-
-	/* lockorder: hashchainlock, freelistlock, mplistlock, dqlock, dqflock */
- startagain:
-	xfs_qm_freelist_lock(xfs_Gqm);
-
-	FOREACH_DQUOT_IN_FREELIST(dqp, &(xfs_Gqm->qm_dqfreelist)) {
-		xfs_dqlock(dqp);
-
-		/*
-		 * We are racing with dqlookup here. Naturally we don't
-		 * want to reclaim a dquot that lookup wants. We release the
-		 * freelist lock and start over, so that lookup will grab
-		 * both the dquot and the freelistlock.
-		 */
-		if (dqp->dq_flags & XFS_DQ_WANT) {
-			ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE));
-
-			trace_xfs_dqreclaim_want(dqp);
-
-			xfs_dqunlock(dqp);
-			xfs_qm_freelist_unlock(xfs_Gqm);
-			if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
-				return NULL;
-			XQM_STATS_INC(xqmstats.xs_qm_dqwants);
-			goto startagain;
-		}
-
-		/*
-		 * If the dquot is inactive, we are assured that it is
-		 * not on the mplist or the hashlist, and that makes our
-		 * life easier.
-		 */
-		if (dqp->dq_flags & XFS_DQ_INACTIVE) {
-			ASSERT(dqp->q_mount == NULL);
-			ASSERT(! XFS_DQ_IS_DIRTY(dqp));
-			ASSERT(dqp->HL_PREVP == NULL);
-			ASSERT(dqp->MPL_PREVP == NULL);
-			XQM_FREELIST_REMOVE(dqp);
-			xfs_dqunlock(dqp);
-			dqpout = dqp;
-			XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims);
-			break;
-		}
-
-		ASSERT(dqp->q_hash);
-		ASSERT(dqp->MPL_PREVP);
-
-		/*
-		 * Try to grab the flush lock. If this dquot is in the process of
-		 * getting flushed to disk, we don't want to reclaim it.
-		 */
-		if (!xfs_dqflock_nowait(dqp)) {
-			xfs_dqunlock(dqp);
-			continue;
-		}
-
-		/*
-		 * We have the flush lock so we know that this is not in the
-		 * process of being flushed. So, if this is dirty, flush it
-		 * DELWRI so that we don't get a freelist infested with
-		 * dirty dquots.
-		 */
-		if (XFS_DQ_IS_DIRTY(dqp)) {
-			int	error;
-
-			trace_xfs_dqreclaim_dirty(dqp);
-
-			/*
-			 * We flush it delayed write, so don't bother
-			 * releasing the freelist lock.
-			 */
-			error = xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI);
-			if (error) {
-				xfs_fs_cmn_err(CE_WARN, dqp->q_mount,
-			"xfs_qm_dqreclaim: dquot %p flush failed", dqp);
-			}
-			xfs_dqunlock(dqp); /* dqflush unlocks dqflock */
-			continue;
-		}
-
-		if (! xfs_qm_mplist_nowait(dqp->q_mount)) {
-			xfs_dqfunlock(dqp);
-			xfs_dqunlock(dqp);
-			continue;
-		}
-
-		if (!mutex_trylock(&dqp->q_hash->qh_lock))
-			goto mplistunlock;
-
-		trace_xfs_dqreclaim_unlink(dqp);
-
-		ASSERT(dqp->q_nrefs == 0);
-		XQM_MPLIST_REMOVE(&(XFS_QI_MPL_LIST(dqp->q_mount)), dqp);
-		XQM_HASHLIST_REMOVE(dqp->q_hash, dqp);
-		XQM_FREELIST_REMOVE(dqp);
-		dqpout = dqp;
-		mutex_unlock(&dqp->q_hash->qh_lock);
- mplistunlock:
-		xfs_qm_mplist_unlock(dqp->q_mount);
-		xfs_dqfunlock(dqp);
-		xfs_dqunlock(dqp);
-		if (dqpout)
-			break;
-	}
-
-	xfs_qm_freelist_unlock(xfs_Gqm);
-	return dqpout;
-}
-
-
 /*------------------------------------------------------------------*/
 
 /*
@@ -2650,66 +2569,3 @@ xfs_qm_vop_create_dqattach(
 	}
 }
 
-/* ------------- list stuff -----------------*/
-STATIC void
-xfs_qm_freelist_init(xfs_frlist_t *ql)
-{
-	ql->qh_next = ql->qh_prev = (xfs_dquot_t *) ql;
-	mutex_init(&ql->qh_lock);
-	ql->qh_version = 0;
-	ql->qh_nelems = 0;
-}
-
-STATIC void
-xfs_qm_freelist_destroy(xfs_frlist_t *ql)
-{
-	xfs_dquot_t	*dqp, *nextdqp;
-
-	mutex_lock(&ql->qh_lock);
-	for (dqp = ql->qh_next;
-	     dqp != (xfs_dquot_t *)ql; ) {
-		xfs_dqlock(dqp);
-		nextdqp = dqp->dq_flnext;
-#ifdef QUOTADEBUG
-		cmn_err(CE_DEBUG, "FREELIST destroy 0x%p", dqp);
-#endif
-		XQM_FREELIST_REMOVE(dqp);
-		xfs_dqunlock(dqp);
-		xfs_qm_dqdestroy(dqp);
-		dqp = nextdqp;
-	}
-	mutex_unlock(&ql->qh_lock);
-	mutex_destroy(&ql->qh_lock);
-
-	ASSERT(ql->qh_nelems == 0);
-}
-
-STATIC void
-xfs_qm_freelist_insert(xfs_frlist_t *ql, xfs_dquot_t *dq)
-{
-	dq->dq_flnext = ql->qh_next;
-	dq->dq_flprev = (xfs_dquot_t *)ql;
-	ql->qh_next = dq;
-	dq->dq_flnext->dq_flprev = dq;
-	xfs_Gqm->qm_dqfreelist.qh_nelems++;
-	xfs_Gqm->qm_dqfreelist.qh_version++;
-}
-
-void
-xfs_qm_freelist_unlink(xfs_dquot_t *dq)
-{
-	xfs_dquot_t *next = dq->dq_flnext;
-	xfs_dquot_t *prev = dq->dq_flprev;
-
-	next->dq_flprev = prev;
-	prev->dq_flnext = next;
-	dq->dq_flnext = dq->dq_flprev = dq;
-	xfs_Gqm->qm_dqfreelist.qh_nelems--;
-	xfs_Gqm->qm_dqfreelist.qh_version++;
-}
-
-void
-xfs_qm_freelist_append(xfs_frlist_t *ql, xfs_dquot_t *dq)
-{
-	xfs_qm_freelist_insert((xfs_frlist_t *)ql->qh_prev, dq);
-}
diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h
index 495564b8af38..c9446f1c726d 100644
--- a/fs/xfs/quota/xfs_qm.h
+++ b/fs/xfs/quota/xfs_qm.h
@@ -72,17 +72,6 @@ extern kmem_zone_t	*qm_dqtrxzone;
 #define XFS_QM_MAX_DQCLUSTER_LOGSZ	3
 
 typedef xfs_dqhash_t	xfs_dqlist_t;
-/*
- * The freelist head. The first two fields match the first two in the
- * xfs_dquot_t structure (in xfs_dqmarker_t)
- */
-typedef struct xfs_frlist {
-       struct xfs_dquot *qh_next;
-       struct xfs_dquot *qh_prev;
-       struct mutex	 qh_lock;
-       uint		 qh_version;
-       uint		 qh_nelems;
-} xfs_frlist_t;
 
 /*
  * Quota Manager (global) structure. Lives only in core.
@@ -91,7 +80,9 @@ typedef struct xfs_qm {
 	xfs_dqlist_t	*qm_usr_dqhtable;/* udquot hash table */
 	xfs_dqlist_t	*qm_grp_dqhtable;/* gdquot hash table */
 	uint		 qm_dqhashmask;	 /* # buckets in dq hashtab - 1 */
-	xfs_frlist_t	 qm_dqfreelist;	 /* freelist of dquots */
+	struct list_head qm_dqfrlist;	 /* freelist of dquots */
+	struct mutex	 qm_dqfrlist_lock;
+	int		 qm_dqfrlist_cnt;
 	atomic_t	 qm_totaldquots; /* total incore dquots */
 	uint		 qm_nrefs;	 /* file systems with quota on */
 	int		 qm_dqfree_ratio;/* ratio of free to inuse dquots */
@@ -106,7 +97,9 @@ typedef struct xfs_qm {
 typedef struct xfs_quotainfo {
 	xfs_inode_t	*qi_uquotaip;	 /* user quota inode */
 	xfs_inode_t	*qi_gquotaip;	 /* group quota inode */
-	xfs_dqlist_t	 qi_dqlist;	 /* all dquots in filesys */
+	struct list_head qi_dqlist;	 /* all dquots in filesys */
+	struct mutex	 qi_dqlist_lock;
+	int		 qi_dquots;
 	int		 qi_dqreclaims;	 /* a change here indicates
 					    a removal in the dqlist */
 	time_t		 qi_btimelimit;	 /* limit for blks timer */
@@ -175,10 +168,6 @@ extern int		xfs_qm_scall_getqstat(xfs_mount_t *, fs_quota_stat_t *);
 extern int		xfs_qm_scall_quotaon(xfs_mount_t *, uint);
 extern int		xfs_qm_scall_quotaoff(xfs_mount_t *, uint);
 
-/* list stuff */
-extern void		xfs_qm_freelist_append(xfs_frlist_t *, xfs_dquot_t *);
-extern void		xfs_qm_freelist_unlink(xfs_dquot_t *);
-
 #ifdef DEBUG
 extern int		xfs_qm_internalqcheck(xfs_mount_t *);
 #else
diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c
index a5346630dfae..97b410c12794 100644
--- a/fs/xfs/quota/xfs_qm_bhv.c
+++ b/fs/xfs/quota/xfs_qm_bhv.c
@@ -59,7 +59,7 @@ xfs_fill_statvfs_from_dquot(
 		be64_to_cpu(dp->d_blk_hardlimit);
 	if (limit && statp->f_blocks > limit) {
 		statp->f_blocks = limit;
-		statp->f_bfree =
+		statp->f_bfree = statp->f_bavail =
 			(statp->f_blocks > be64_to_cpu(dp->d_bcount)) ?
 			 (statp->f_blocks - be64_to_cpu(dp->d_bcount)) : 0;
 	}
diff --git a/fs/xfs/quota/xfs_qm_stats.c b/fs/xfs/quota/xfs_qm_stats.c
index 83e7ea3e25fa..3d1fc79532e2 100644
--- a/fs/xfs/quota/xfs_qm_stats.c
+++ b/fs/xfs/quota/xfs_qm_stats.c
@@ -55,7 +55,7 @@ static int xqm_proc_show(struct seq_file *m, void *v)
 			ndquot,
 			xfs_Gqm? atomic_read(&xfs_Gqm->qm_totaldquots) : 0,
 			xfs_Gqm? xfs_Gqm->qm_dqfree_ratio : 0,
-			xfs_Gqm? xfs_Gqm->qm_dqfreelist.qh_nelems : 0);
+			xfs_Gqm? xfs_Gqm->qm_dqfrlist_cnt : 0);
 	return 0;
 }
 
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index 873e07e29074..b4487764e923 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -79,6 +79,7 @@ xfs_qm_scall_quotaoff(
 	xfs_mount_t		*mp,
 	uint			flags)
 {
+	struct xfs_quotainfo	*q = mp->m_quotainfo;
 	uint			dqtype;
 	int			error;
 	uint			inactivate_flags;
@@ -102,11 +103,8 @@ xfs_qm_scall_quotaoff(
 	 * critical thing.
 	 * If quotaoff, then we must be dealing with the root filesystem.
 	 */
-	ASSERT(mp->m_quotainfo);
-	if (mp->m_quotainfo)
-		mutex_lock(&(XFS_QI_QOFFLOCK(mp)));
-
-	ASSERT(mp->m_quotainfo);
+	ASSERT(q);
+	mutex_lock(&q->qi_quotaofflock);
 
 	/*
 	 * If we're just turning off quota enforcement, change mp and go.
@@ -117,7 +115,7 @@ xfs_qm_scall_quotaoff(
 		spin_lock(&mp->m_sb_lock);
 		mp->m_sb.sb_qflags = mp->m_qflags;
 		spin_unlock(&mp->m_sb_lock);
-		mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
+		mutex_unlock(&q->qi_quotaofflock);
 
 		/* XXX what to do if error ? Revert back to old vals incore ? */
 		error = xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS);
@@ -150,10 +148,8 @@ xfs_qm_scall_quotaoff(
 	 * Nothing to do?  Don't complain. This happens when we're just
 	 * turning off quota enforcement.
 	 */
-	if ((mp->m_qflags & flags) == 0) {
-		mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
-		return (0);
-	}
+	if ((mp->m_qflags & flags) == 0)
+		goto out_unlock;
 
 	/*
 	 * Write the LI_QUOTAOFF log record, and do SB changes atomically,
@@ -162,7 +158,7 @@ xfs_qm_scall_quotaoff(
 	 */
 	error = xfs_qm_log_quotaoff(mp, &qoffstart, flags);
 	if (error)
-		goto out_error;
+		goto out_unlock;
 
 	/*
 	 * Next we clear the XFS_MOUNT_*DQ_ACTIVE bit(s) in the mount struct
@@ -204,7 +200,7 @@ xfs_qm_scall_quotaoff(
 	 * So, if we couldn't purge all the dquots from the filesystem,
 	 * we can't get rid of the incore data structures.
 	 */
-	while ((nculprits = xfs_qm_dqpurge_all(mp, dqtype|XFS_QMOPT_QUOTAOFF)))
+	while ((nculprits = xfs_qm_dqpurge_all(mp, dqtype)))
 		delay(10 * nculprits);
 
 	/*
@@ -222,7 +218,7 @@ xfs_qm_scall_quotaoff(
 	if (error) {
 		/* We're screwed now. Shutdown is the only option. */
 		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
-		goto out_error;
+		goto out_unlock;
 	}
 
 	/*
@@ -230,27 +226,26 @@ xfs_qm_scall_quotaoff(
 	 */
 	if (((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET1) ||
 	    ((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET2)) {
-		mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
+		mutex_unlock(&q->qi_quotaofflock);
 		xfs_qm_destroy_quotainfo(mp);
 		return (0);
 	}
 
 	/*
-	 * Release our quotainode references, and vn_purge them,
-	 * if we don't need them anymore.
+	 * Release our quotainode references if we don't need them anymore.
 	 */
-	if ((dqtype & XFS_QMOPT_UQUOTA) && XFS_QI_UQIP(mp)) {
-		IRELE(XFS_QI_UQIP(mp));
-		XFS_QI_UQIP(mp) = NULL;
+	if ((dqtype & XFS_QMOPT_UQUOTA) && q->qi_uquotaip) {
+		IRELE(q->qi_uquotaip);
+		q->qi_uquotaip = NULL;
 	}
-	if ((dqtype & (XFS_QMOPT_GQUOTA|XFS_QMOPT_PQUOTA)) && XFS_QI_GQIP(mp)) {
-		IRELE(XFS_QI_GQIP(mp));
-		XFS_QI_GQIP(mp) = NULL;
+	if ((dqtype & (XFS_QMOPT_GQUOTA|XFS_QMOPT_PQUOTA)) && q->qi_gquotaip) {
+		IRELE(q->qi_gquotaip);
+		q->qi_gquotaip = NULL;
 	}
-out_error:
-	mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
 
-	return (error);
+out_unlock:
+	mutex_unlock(&q->qi_quotaofflock);
+	return error;
 }
 
 int
@@ -267,7 +262,7 @@ xfs_qm_scall_trunc_qfiles(
 	}
 
 	if ((flags & XFS_DQ_USER) && mp->m_sb.sb_uquotino != NULLFSINO) {
-		error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, 0, 0, &qip, 0);
+		error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, 0, 0, &qip);
 		if (!error) {
 			error = xfs_truncate_file(mp, qip);
 			IRELE(qip);
@@ -276,7 +271,7 @@ xfs_qm_scall_trunc_qfiles(
 
 	if ((flags & (XFS_DQ_GROUP|XFS_DQ_PROJ)) &&
 	    mp->m_sb.sb_gquotino != NULLFSINO) {
-		error2 = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, 0, 0, &qip, 0);
+		error2 = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, 0, 0, &qip);
 		if (!error2) {
 			error2 = xfs_truncate_file(mp, qip);
 			IRELE(qip);
@@ -379,9 +374,9 @@ xfs_qm_scall_quotaon(
 	/*
 	 * Switch on quota enforcement in core.
 	 */
-	mutex_lock(&(XFS_QI_QOFFLOCK(mp)));
+	mutex_lock(&mp->m_quotainfo->qi_quotaofflock);
 	mp->m_qflags |= (flags & XFS_ALL_QUOTA_ENFD);
-	mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
+	mutex_unlock(&mp->m_quotainfo->qi_quotaofflock);
 
 	return (0);
 }
@@ -392,11 +387,12 @@ xfs_qm_scall_quotaon(
  */
 int
 xfs_qm_scall_getqstat(
-	xfs_mount_t	*mp,
-	fs_quota_stat_t *out)
+	struct xfs_mount	*mp,
+	struct fs_quota_stat	*out)
 {
-	xfs_inode_t	*uip, *gip;
-	boolean_t	tempuqip, tempgqip;
+	struct xfs_quotainfo	*q = mp->m_quotainfo;
+	struct xfs_inode	*uip, *gip;
+	boolean_t		tempuqip, tempgqip;
 
 	uip = gip = NULL;
 	tempuqip = tempgqip = B_FALSE;
@@ -415,18 +411,18 @@ xfs_qm_scall_getqstat(
 	out->qs_uquota.qfs_ino = mp->m_sb.sb_uquotino;
 	out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino;
 
-	if (mp->m_quotainfo) {
-		uip = mp->m_quotainfo->qi_uquotaip;
-		gip = mp->m_quotainfo->qi_gquotaip;
+	if (q) {
+		uip = q->qi_uquotaip;
+		gip = q->qi_gquotaip;
 	}
 	if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) {
 		if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
-					0, 0, &uip, 0) == 0)
+					0, 0, &uip) == 0)
 			tempuqip = B_TRUE;
 	}
 	if (!gip && mp->m_sb.sb_gquotino != NULLFSINO) {
 		if (xfs_iget(mp, NULL, mp->m_sb.sb_gquotino,
-					0, 0, &gip, 0) == 0)
+					0, 0, &gip) == 0)
 			tempgqip = B_TRUE;
 	}
 	if (uip) {
@@ -441,17 +437,20 @@ xfs_qm_scall_getqstat(
 		if (tempgqip)
 			IRELE(gip);
 	}
-	if (mp->m_quotainfo) {
-		out->qs_incoredqs = XFS_QI_MPLNDQUOTS(mp);
-		out->qs_btimelimit = XFS_QI_BTIMELIMIT(mp);
-		out->qs_itimelimit = XFS_QI_ITIMELIMIT(mp);
-		out->qs_rtbtimelimit = XFS_QI_RTBTIMELIMIT(mp);
-		out->qs_bwarnlimit = XFS_QI_BWARNLIMIT(mp);
-		out->qs_iwarnlimit = XFS_QI_IWARNLIMIT(mp);
+	if (q) {
+		out->qs_incoredqs = q->qi_dquots;
+		out->qs_btimelimit = q->qi_btimelimit;
+		out->qs_itimelimit = q->qi_itimelimit;
+		out->qs_rtbtimelimit = q->qi_rtbtimelimit;
+		out->qs_bwarnlimit = q->qi_bwarnlimit;
+		out->qs_iwarnlimit = q->qi_iwarnlimit;
 	}
-	return (0);
+	return 0;
 }
 
+#define XFS_DQ_MASK \
+	(FS_DQ_LIMIT_MASK | FS_DQ_TIMER_MASK | FS_DQ_WARNS_MASK)
+
 /*
  * Adjust quota limits, and start/stop timers accordingly.
  */
@@ -462,15 +461,17 @@ xfs_qm_scall_setqlim(
 	uint			type,
 	fs_disk_quota_t		*newlim)
 {
+	struct xfs_quotainfo	*q = mp->m_quotainfo;
 	xfs_disk_dquot_t	*ddq;
 	xfs_dquot_t		*dqp;
 	xfs_trans_t		*tp;
 	int			error;
 	xfs_qcnt_t		hard, soft;
 
-	if ((newlim->d_fieldmask &
-	    (FS_DQ_LIMIT_MASK|FS_DQ_TIMER_MASK|FS_DQ_WARNS_MASK)) == 0)
-		return (0);
+	if (newlim->d_fieldmask & ~XFS_DQ_MASK)
+		return EINVAL;
+	if ((newlim->d_fieldmask & XFS_DQ_MASK) == 0)
+		return 0;
 
 	tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM);
 	if ((error = xfs_trans_reserve(tp, 0, sizeof(xfs_disk_dquot_t) + 128,
@@ -485,7 +486,7 @@ xfs_qm_scall_setqlim(
 	 * a quotaoff from happening). (XXXThis doesn't currently happen
 	 * because we take the vfslock before calling xfs_qm_sysent).
 	 */
-	mutex_lock(&(XFS_QI_QOFFLOCK(mp)));
+	mutex_lock(&q->qi_quotaofflock);
 
 	/*
 	 * Get the dquot (locked), and join it to the transaction.
@@ -493,9 +494,8 @@ xfs_qm_scall_setqlim(
 	 */
 	if ((error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp))) {
 		xfs_trans_cancel(tp, XFS_TRANS_ABORT);
-		mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
 		ASSERT(error != ENOENT);
-		return (error);
+		goto out_unlock;
 	}
 	xfs_trans_dqjoin(tp, dqp);
 	ddq = &dqp->q_core;
@@ -513,8 +513,8 @@ xfs_qm_scall_setqlim(
 		ddq->d_blk_hardlimit = cpu_to_be64(hard);
 		ddq->d_blk_softlimit = cpu_to_be64(soft);
 		if (id == 0) {
-			mp->m_quotainfo->qi_bhardlimit = hard;
-			mp->m_quotainfo->qi_bsoftlimit = soft;
+			q->qi_bhardlimit = hard;
+			q->qi_bsoftlimit = soft;
 		}
 	} else {
 		qdprintk("blkhard %Ld < blksoft %Ld\n", hard, soft);
@@ -529,8 +529,8 @@ xfs_qm_scall_setqlim(
 		ddq->d_rtb_hardlimit = cpu_to_be64(hard);
 		ddq->d_rtb_softlimit = cpu_to_be64(soft);
 		if (id == 0) {
-			mp->m_quotainfo->qi_rtbhardlimit = hard;
-			mp->m_quotainfo->qi_rtbsoftlimit = soft;
+			q->qi_rtbhardlimit = hard;
+			q->qi_rtbsoftlimit = soft;
 		}
 	} else {
 		qdprintk("rtbhard %Ld < rtbsoft %Ld\n", hard, soft);
@@ -546,8 +546,8 @@ xfs_qm_scall_setqlim(
 		ddq->d_ino_hardlimit = cpu_to_be64(hard);
 		ddq->d_ino_softlimit = cpu_to_be64(soft);
 		if (id == 0) {
-			mp->m_quotainfo->qi_ihardlimit = hard;
-			mp->m_quotainfo->qi_isoftlimit = soft;
+			q->qi_ihardlimit = hard;
+			q->qi_isoftlimit = soft;
 		}
 	} else {
 		qdprintk("ihard %Ld < isoft %Ld\n", hard, soft);
@@ -572,23 +572,23 @@ xfs_qm_scall_setqlim(
 		 * for warnings.
 		 */
 		if (newlim->d_fieldmask & FS_DQ_BTIMER) {
-			mp->m_quotainfo->qi_btimelimit = newlim->d_btimer;
+			q->qi_btimelimit = newlim->d_btimer;
 			ddq->d_btimer = cpu_to_be32(newlim->d_btimer);
 		}
 		if (newlim->d_fieldmask & FS_DQ_ITIMER) {
-			mp->m_quotainfo->qi_itimelimit = newlim->d_itimer;
+			q->qi_itimelimit = newlim->d_itimer;
 			ddq->d_itimer = cpu_to_be32(newlim->d_itimer);
 		}
 		if (newlim->d_fieldmask & FS_DQ_RTBTIMER) {
-			mp->m_quotainfo->qi_rtbtimelimit = newlim->d_rtbtimer;
+			q->qi_rtbtimelimit = newlim->d_rtbtimer;
 			ddq->d_rtbtimer = cpu_to_be32(newlim->d_rtbtimer);
 		}
 		if (newlim->d_fieldmask & FS_DQ_BWARNS)
-			mp->m_quotainfo->qi_bwarnlimit = newlim->d_bwarns;
+			q->qi_bwarnlimit = newlim->d_bwarns;
 		if (newlim->d_fieldmask & FS_DQ_IWARNS)
-			mp->m_quotainfo->qi_iwarnlimit = newlim->d_iwarns;
+			q->qi_iwarnlimit = newlim->d_iwarns;
 		if (newlim->d_fieldmask & FS_DQ_RTBWARNS)
-			mp->m_quotainfo->qi_rtbwarnlimit = newlim->d_rtbwarns;
+			q->qi_rtbwarnlimit = newlim->d_rtbwarns;
 	} else {
 		/*
 		 * If the user is now over quota, start the timelimit.
@@ -605,8 +605,9 @@ xfs_qm_scall_setqlim(
 	error = xfs_trans_commit(tp, 0);
 	xfs_qm_dqprint(dqp);
 	xfs_qm_dqrele(dqp);
-	mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
 
+ out_unlock:
+	mutex_unlock(&q->qi_quotaofflock);
 	return error;
 }
 
@@ -853,7 +854,8 @@ xfs_dqrele_inode(
 	int			error;
 
 	/* skip quota inodes */
-	if (ip == XFS_QI_UQIP(ip->i_mount) || ip == XFS_QI_GQIP(ip->i_mount)) {
+	if (ip == ip->i_mount->m_quotainfo->qi_uquotaip ||
+	    ip == ip->i_mount->m_quotainfo->qi_gquotaip) {
 		ASSERT(ip->i_udquot == NULL);
 		ASSERT(ip->i_gdquot == NULL);
 		read_unlock(&pag->pag_ici_lock);
@@ -891,7 +893,8 @@ xfs_qm_dqrele_all_inodes(
 	uint		 flags)
 {
 	ASSERT(mp->m_quotainfo);
-	xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, XFS_ICI_NO_TAG, 0);
+	xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags,
+				XFS_ICI_NO_TAG, 0, NULL);
 }
 
 /*------------------------------------------------------------------------*/
@@ -930,7 +933,8 @@ struct mutex  qcheck_lock;
 }
 
 typedef struct dqtest {
-	xfs_dqmarker_t	q_lists;
+	uint		 dq_flags;	/* various flags (XFS_DQ_*) */
+	struct list_head q_hashlist;
 	xfs_dqhash_t	*q_hash;	/* the hashchain header */
 	xfs_mount_t	*q_mount;	/* filesystem this relates to */
 	xfs_dqid_t	d_id;		/* user id or group id */
@@ -941,14 +945,9 @@ typedef struct dqtest {
 STATIC void
 xfs_qm_hashinsert(xfs_dqhash_t *h, xfs_dqtest_t *dqp)
 {
-	xfs_dquot_t *d;
-	if (((d) = (h)->qh_next))
-		(d)->HL_PREVP = &((dqp)->HL_NEXT);
-	(dqp)->HL_NEXT = d;
-	(dqp)->HL_PREVP = &((h)->qh_next);
-	(h)->qh_next = (xfs_dquot_t *)dqp;
-	(h)->qh_version++;
-	(h)->qh_nelems++;
+	list_add(&dqp->q_hashlist, &h->qh_list);
+	h->qh_version++;
+	h->qh_nelems++;
 }
 STATIC void
 xfs_qm_dqtest_print(
@@ -1060,9 +1059,7 @@ xfs_qm_internalqcheck_dqget(
 	xfs_dqhash_t	*h;
 
 	h = DQTEST_HASH(mp, id, type);
-	for (d = (xfs_dqtest_t *) h->qh_next; d != NULL;
-	     d = (xfs_dqtest_t *) d->HL_NEXT) {
-		/* DQTEST_LIST_PRINT(h, HL_NEXT, "@@@@@ dqtestlist @@@@@"); */
+	list_for_each_entry(d, &h->qh_list, q_hashlist) {
 		if (d->d_id == id && mp == d->q_mount) {
 			*O_dq = d;
 			return (0);
@@ -1073,6 +1070,7 @@ xfs_qm_internalqcheck_dqget(
 	d->d_id = id;
 	d->q_mount = mp;
 	d->q_hash = h;
+	INIT_LIST_HEAD(&d->q_hashlist);
 	xfs_qm_hashinsert(h, d);
 	*O_dq = d;
 	return (0);
@@ -1111,10 +1109,7 @@ xfs_qm_internalqcheck_adjust(
 	xfs_ino_t	ino,		/* inode number to get data for */
 	void		__user *buffer,	/* not used */
 	int		ubsize,		/* not used */
-	void		*private_data,	/* not used */
-	xfs_daddr_t	bno,		/* starting block of inode cluster */
 	int		*ubused,	/* not used */
-	void		*dip,		/* not used */
 	int		*res)		/* bulkstat result code */
 {
 	xfs_inode_t		*ip;
@@ -1136,7 +1131,7 @@ xfs_qm_internalqcheck_adjust(
 	ipreleased = B_FALSE;
  again:
 	lock_flags = XFS_ILOCK_SHARED;
-	if ((error = xfs_iget(mp, NULL, ino, 0, lock_flags, &ip, bno))) {
+	if ((error = xfs_iget(mp, NULL, ino, 0, lock_flags, &ip))) {
 		*res = BULKSTAT_RV_NOTHING;
 		return (error);
 	}
@@ -1179,8 +1174,6 @@ xfs_qm_internalqcheck(
 	xfs_ino_t	lastino;
 	int		done, count;
 	int		i;
-	xfs_dqtest_t	*d, *e;
-	xfs_dqhash_t	*h1;
 	int		error;
 
 	lastino = 0;
@@ -1192,9 +1185,9 @@ xfs_qm_internalqcheck(
 	if (! XFS_IS_QUOTA_ON(mp))
 		return XFS_ERROR(ESRCH);
 
-	xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC);
+	xfs_log_force(mp, XFS_LOG_SYNC);
 	XFS_bflush(mp->m_ddev_targp);
-	xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC);
+	xfs_log_force(mp, XFS_LOG_SYNC);
 	XFS_bflush(mp->m_ddev_targp);
 
 	mutex_lock(&qcheck_lock);
@@ -1209,30 +1202,29 @@ xfs_qm_internalqcheck(
 		 * Iterate thru all the inodes in the file system,
 		 * adjusting the corresponding dquot counters
 		 */
-		if ((error = xfs_bulkstat(mp, &lastino, &count,
-				 xfs_qm_internalqcheck_adjust, NULL,
-				 0, NULL, BULKSTAT_FG_IGET, &done))) {
+		error = xfs_bulkstat(mp, &lastino, &count,
+				 xfs_qm_internalqcheck_adjust,
+				 0, NULL, &done);
+		if (error) {
+			cmn_err(CE_DEBUG, "Bulkstat returned error 0x%x", error);
 			break;
 		}
-	} while (! done);
-	if (error) {
-		cmn_err(CE_DEBUG, "Bulkstat returned error 0x%x", error);
-	}
+	} while (!done);
+
 	cmn_err(CE_DEBUG, "Checking results against system dquots");
 	for (i = 0; i < qmtest_hashmask; i++) {
-		h1 = &qmtest_udqtab[i];
-		for (d = (xfs_dqtest_t *) h1->qh_next; d != NULL; ) {
+		xfs_dqtest_t	*d, *n;
+		xfs_dqhash_t	*h;
+
+		h = &qmtest_udqtab[i];
+		list_for_each_entry_safe(d, n, &h->qh_list, q_hashlist) {
 			xfs_dqtest_cmp(d);
-			e = (xfs_dqtest_t *) d->HL_NEXT;
 			kmem_free(d);
-			d = e;
 		}
-		h1 = &qmtest_gdqtab[i];
-		for (d = (xfs_dqtest_t *) h1->qh_next; d != NULL; ) {
+		h = &qmtest_gdqtab[i];
+		list_for_each_entry_safe(d, n, &h->qh_list, q_hashlist) {
 			xfs_dqtest_cmp(d);
-			e = (xfs_dqtest_t *) d->HL_NEXT;
 			kmem_free(d);
-			d = e;
 		}
 	}
 
diff --git a/fs/xfs/quota/xfs_quota_priv.h b/fs/xfs/quota/xfs_quota_priv.h
index 8286b2842b6b..94a3d927d716 100644
--- a/fs/xfs/quota/xfs_quota_priv.h
+++ b/fs/xfs/quota/xfs_quota_priv.h
@@ -24,43 +24,6 @@
  */
 #define XFS_DQITER_MAP_SIZE	10
 
-/* Number of dquots that fit in to a dquot block */
-#define XFS_QM_DQPERBLK(mp)	((mp)->m_quotainfo->qi_dqperchunk)
-
-#define XFS_DQ_IS_ADDEDTO_TRX(t, d)	((d)->q_transp == (t))
-
-#define XFS_QI_MPLRECLAIMS(mp)	((mp)->m_quotainfo->qi_dqreclaims)
-#define XFS_QI_UQIP(mp)		((mp)->m_quotainfo->qi_uquotaip)
-#define XFS_QI_GQIP(mp)		((mp)->m_quotainfo->qi_gquotaip)
-#define XFS_QI_DQCHUNKLEN(mp)	((mp)->m_quotainfo->qi_dqchunklen)
-#define XFS_QI_BTIMELIMIT(mp)	((mp)->m_quotainfo->qi_btimelimit)
-#define XFS_QI_RTBTIMELIMIT(mp) ((mp)->m_quotainfo->qi_rtbtimelimit)
-#define XFS_QI_ITIMELIMIT(mp)	((mp)->m_quotainfo->qi_itimelimit)
-#define XFS_QI_BWARNLIMIT(mp)	((mp)->m_quotainfo->qi_bwarnlimit)
-#define XFS_QI_RTBWARNLIMIT(mp)	((mp)->m_quotainfo->qi_rtbwarnlimit)
-#define XFS_QI_IWARNLIMIT(mp)	((mp)->m_quotainfo->qi_iwarnlimit)
-#define XFS_QI_QOFFLOCK(mp)	((mp)->m_quotainfo->qi_quotaofflock)
-
-#define XFS_QI_MPL_LIST(mp)	((mp)->m_quotainfo->qi_dqlist)
-#define XFS_QI_MPLNEXT(mp)	((mp)->m_quotainfo->qi_dqlist.qh_next)
-#define XFS_QI_MPLNDQUOTS(mp)	((mp)->m_quotainfo->qi_dqlist.qh_nelems)
-
-#define xfs_qm_mplist_lock(mp) \
-	mutex_lock(&(XFS_QI_MPL_LIST(mp).qh_lock))
-#define xfs_qm_mplist_nowait(mp) \
-	mutex_trylock(&(XFS_QI_MPL_LIST(mp).qh_lock))
-#define xfs_qm_mplist_unlock(mp) \
-	mutex_unlock(&(XFS_QI_MPL_LIST(mp).qh_lock))
-#define XFS_QM_IS_MPLIST_LOCKED(mp) \
-	mutex_is_locked(&(XFS_QI_MPL_LIST(mp).qh_lock))
-
-#define xfs_qm_freelist_lock(qm) \
-	mutex_lock(&((qm)->qm_dqfreelist.qh_lock))
-#define xfs_qm_freelist_lock_nowait(qm) \
-	mutex_trylock(&((qm)->qm_dqfreelist.qh_lock))
-#define xfs_qm_freelist_unlock(qm) \
-	mutex_unlock(&((qm)->qm_dqfreelist.qh_lock))
-
 /*
  * Hash into a bucket in the dquot hash table, based on <mp, id>.
  */
@@ -72,9 +35,6 @@
 				      XFS_DQ_HASHVAL(mp, id)) : \
 				     (xfs_Gqm->qm_grp_dqhtable + \
 				      XFS_DQ_HASHVAL(mp, id)))
-#define XFS_IS_DQTYPE_ON(mp, type)   (type == XFS_DQ_USER ? \
-					XFS_IS_UQUOTA_ON(mp) : \
-					XFS_IS_OQUOTA_ON(mp))
 #define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \
 	!dqp->q_core.d_blk_hardlimit && \
 	!dqp->q_core.d_blk_softlimit && \
@@ -86,68 +46,6 @@
 	!dqp->q_core.d_rtbcount && \
 	!dqp->q_core.d_icount)
 
-#define HL_PREVP	dq_hashlist.ql_prevp
-#define HL_NEXT		dq_hashlist.ql_next
-#define MPL_PREVP	dq_mplist.ql_prevp
-#define MPL_NEXT	dq_mplist.ql_next
-
-
-#define _LIST_REMOVE(h, dqp, PVP, NXT)				\
-	{							\
-		 xfs_dquot_t *d;				\
-		 if (((d) = (dqp)->NXT))				\
-			 (d)->PVP = (dqp)->PVP;			\
-		 *((dqp)->PVP) = d;				\
-		 (dqp)->NXT = NULL;				\
-		 (dqp)->PVP = NULL;				\
-		 (h)->qh_version++;				\
-		 (h)->qh_nelems--;				\
-	}
-
-#define _LIST_INSERT(h, dqp, PVP, NXT)				\
-	{							\
-		 xfs_dquot_t *d;				\
-		 if (((d) = (h)->qh_next))			\
-			 (d)->PVP = &((dqp)->NXT);		\
-		 (dqp)->NXT = d;				\
-		 (dqp)->PVP = &((h)->qh_next);			\
-		 (h)->qh_next = dqp;				\
-		 (h)->qh_version++;				\
-		 (h)->qh_nelems++;				\
-	 }
-
-#define FOREACH_DQUOT_IN_MP(dqp, mp) \
-	for ((dqp) = XFS_QI_MPLNEXT(mp); (dqp) != NULL; (dqp) = (dqp)->MPL_NEXT)
-
-#define FOREACH_DQUOT_IN_FREELIST(dqp, qlist)	\
-for ((dqp) = (qlist)->qh_next; (dqp) != (xfs_dquot_t *)(qlist); \
-     (dqp) = (dqp)->dq_flnext)
-
-#define XQM_HASHLIST_INSERT(h, dqp)	\
-	 _LIST_INSERT(h, dqp, HL_PREVP, HL_NEXT)
-
-#define XQM_FREELIST_INSERT(h, dqp)	\
-	 xfs_qm_freelist_append(h, dqp)
-
-#define XQM_MPLIST_INSERT(h, dqp)	\
-	 _LIST_INSERT(h, dqp, MPL_PREVP, MPL_NEXT)
-
-#define XQM_HASHLIST_REMOVE(h, dqp)	\
-	 _LIST_REMOVE(h, dqp, HL_PREVP, HL_NEXT)
-#define XQM_FREELIST_REMOVE(dqp)	\
-	 xfs_qm_freelist_unlink(dqp)
-#define XQM_MPLIST_REMOVE(h, dqp)	\
-	{ _LIST_REMOVE(h, dqp, MPL_PREVP, MPL_NEXT); \
-	  XFS_QI_MPLRECLAIMS((dqp)->q_mount)++; }
-
-#define XFS_DQ_IS_LOGITEM_INITD(dqp)	((dqp)->q_logitem.qli_dquot == (dqp))
-
-#define XFS_QM_DQP_TO_DQACCT(tp, dqp)	(XFS_QM_ISUDQ(dqp) ? \
-					 (tp)->t_dqinfo->dqa_usrdquots : \
-					 (tp)->t_dqinfo->dqa_grpdquots)
-#define XFS_IS_SUSER_DQUOT(dqp)		\
-	(!((dqp)->q_core.d_id))
-
 #define DQFLAGTO_TYPESTR(d)	(((d)->dq_flags & XFS_DQ_USER) ? "USR" : \
 				 (((d)->dq_flags & XFS_DQ_GROUP) ? "GRP" : \
 				 (((d)->dq_flags & XFS_DQ_PROJ) ? "PRJ":"???")))
diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/quota/xfs_trans_dquot.c
index 97ac9640be98..061d827da33c 100644
--- a/fs/xfs/quota/xfs_trans_dquot.c
+++ b/fs/xfs/quota/xfs_trans_dquot.c
@@ -59,12 +59,11 @@ xfs_trans_dqjoin(
 	xfs_trans_t	*tp,
 	xfs_dquot_t	*dqp)
 {
-	xfs_dq_logitem_t    *lp;
+	xfs_dq_logitem_t    *lp = &dqp->q_logitem;
 
-	ASSERT(! XFS_DQ_IS_ADDEDTO_TRX(tp, dqp));
+	ASSERT(dqp->q_transp != tp);
 	ASSERT(XFS_DQ_IS_LOCKED(dqp));
-	ASSERT(XFS_DQ_IS_LOGITEM_INITD(dqp));
-	lp = &dqp->q_logitem;
+	ASSERT(lp->qli_dquot == dqp);
 
 	/*
 	 * Get a log_item_desc to point at the new item.
@@ -96,7 +95,7 @@ xfs_trans_log_dquot(
 {
 	xfs_log_item_desc_t	*lidp;
 
-	ASSERT(XFS_DQ_IS_ADDEDTO_TRX(tp, dqp));
+	ASSERT(dqp->q_transp == tp);
 	ASSERT(XFS_DQ_IS_LOCKED(dqp));
 
 	lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)(&dqp->q_logitem));
@@ -198,16 +197,16 @@ xfs_trans_get_dqtrx(
 	int		i;
 	xfs_dqtrx_t	*qa;
 
-	for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
-		qa = XFS_QM_DQP_TO_DQACCT(tp, dqp);
+	qa = XFS_QM_ISUDQ(dqp) ?
+		tp->t_dqinfo->dqa_usrdquots : tp->t_dqinfo->dqa_grpdquots;
 
+	for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
 		if (qa[i].qt_dquot == NULL ||
-		    qa[i].qt_dquot == dqp) {
-			return (&qa[i]);
-		}
+		    qa[i].qt_dquot == dqp)
+			return &qa[i];
 	}
 
-	return (NULL);
+	return NULL;
 }
 
 /*
@@ -381,7 +380,7 @@ xfs_trans_apply_dquot_deltas(
 				break;
 
 			ASSERT(XFS_DQ_IS_LOCKED(dqp));
-			ASSERT(XFS_DQ_IS_ADDEDTO_TRX(tp, dqp));
+			ASSERT(dqp->q_transp == tp);
 
 			/*
 			 * adjust the actual number of blocks used
@@ -589,12 +588,18 @@ xfs_trans_unreserve_and_mod_dquots(
 	}
 }
 
-STATIC int
-xfs_quota_error(uint flags)
+STATIC void
+xfs_quota_warn(
+	struct xfs_mount	*mp,
+	struct xfs_dquot	*dqp,
+	int			type)
 {
-	if (flags & XFS_QMOPT_ENOSPC)
-		return ENOSPC;
-	return EDQUOT;
+	/* no warnings for project quotas - we just return ENOSPC later */
+	if (dqp->dq_flags & XFS_DQ_PROJ)
+		return;
+	quota_send_warning((dqp->dq_flags & XFS_DQ_USER) ? USRQUOTA : GRPQUOTA,
+			   be32_to_cpu(dqp->q_core.d_id), mp->m_super->s_dev,
+			   type);
 }
 
 /*
@@ -612,7 +617,6 @@ xfs_trans_dqresv(
 	long		ninos,
 	uint		flags)
 {
-	int		error;
 	xfs_qcnt_t	hardlimit;
 	xfs_qcnt_t	softlimit;
 	time_t		timer;
@@ -634,7 +638,7 @@ xfs_trans_dqresv(
 			softlimit = q->qi_bsoftlimit;
 		timer = be32_to_cpu(dqp->q_core.d_btimer);
 		warns = be16_to_cpu(dqp->q_core.d_bwarns);
-		warnlimit = XFS_QI_BWARNLIMIT(dqp->q_mount);
+		warnlimit = dqp->q_mount->m_quotainfo->qi_bwarnlimit;
 		resbcountp = &dqp->q_res_bcount;
 	} else {
 		ASSERT(flags & XFS_TRANS_DQ_RES_RTBLKS);
@@ -646,10 +650,9 @@ xfs_trans_dqresv(
 			softlimit = q->qi_rtbsoftlimit;
 		timer = be32_to_cpu(dqp->q_core.d_rtbtimer);
 		warns = be16_to_cpu(dqp->q_core.d_rtbwarns);
-		warnlimit = XFS_QI_RTBWARNLIMIT(dqp->q_mount);
+		warnlimit = dqp->q_mount->m_quotainfo->qi_rtbwarnlimit;
 		resbcountp = &dqp->q_res_rtbcount;
 	}
-	error = 0;
 
 	if ((flags & XFS_QMOPT_FORCE_RES) == 0 &&
 	    dqp->q_core.d_id &&
@@ -667,40 +670,46 @@ xfs_trans_dqresv(
 			 * nblks.
 			 */
 			if (hardlimit > 0ULL &&
-			     (hardlimit <= nblks + *resbcountp)) {
-				error = xfs_quota_error(flags);
+			    hardlimit <= nblks + *resbcountp) {
+				xfs_quota_warn(mp, dqp, QUOTA_NL_BHARDWARN);
 				goto error_return;
 			}
-
 			if (softlimit > 0ULL &&
-			     (softlimit <= nblks + *resbcountp)) {
+			    softlimit <= nblks + *resbcountp) {
 				if ((timer != 0 && get_seconds() > timer) ||
 				    (warns != 0 && warns >= warnlimit)) {
-					error = xfs_quota_error(flags);
+					xfs_quota_warn(mp, dqp,
+						       QUOTA_NL_BSOFTLONGWARN);
 					goto error_return;
 				}
+
+				xfs_quota_warn(mp, dqp, QUOTA_NL_BSOFTWARN);
 			}
 		}
 		if (ninos > 0) {
 			count = be64_to_cpu(dqp->q_core.d_icount);
 			timer = be32_to_cpu(dqp->q_core.d_itimer);
 			warns = be16_to_cpu(dqp->q_core.d_iwarns);
-			warnlimit = XFS_QI_IWARNLIMIT(dqp->q_mount);
+			warnlimit = dqp->q_mount->m_quotainfo->qi_iwarnlimit;
 			hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit);
 			if (!hardlimit)
 				hardlimit = q->qi_ihardlimit;
 			softlimit = be64_to_cpu(dqp->q_core.d_ino_softlimit);
 			if (!softlimit)
 				softlimit = q->qi_isoftlimit;
+
 			if (hardlimit > 0ULL && count >= hardlimit) {
-				error = xfs_quota_error(flags);
+				xfs_quota_warn(mp, dqp, QUOTA_NL_IHARDWARN);
 				goto error_return;
-			} else if (softlimit > 0ULL && count >= softlimit) {
-				if ((timer != 0 && get_seconds() > timer) ||
+			}
+			if (softlimit > 0ULL && count >= softlimit) {
+				if  ((timer != 0 && get_seconds() > timer) ||
 				     (warns != 0 && warns >= warnlimit)) {
-					error = xfs_quota_error(flags);
+					xfs_quota_warn(mp, dqp,
+						       QUOTA_NL_ISOFTLONGWARN);
 					goto error_return;
 				}
+				xfs_quota_warn(mp, dqp, QUOTA_NL_ISOFTWARN);
 			}
 		}
 	}
@@ -736,9 +745,14 @@ xfs_trans_dqresv(
 	ASSERT(dqp->q_res_rtbcount >= be64_to_cpu(dqp->q_core.d_rtbcount));
 	ASSERT(dqp->q_res_icount >= be64_to_cpu(dqp->q_core.d_icount));
 
+	xfs_dqunlock(dqp);
+	return 0;
+
 error_return:
 	xfs_dqunlock(dqp);
-	return error;
+	if (flags & XFS_QMOPT_ENOSPC)
+		return ENOSPC;
+	return EDQUOT;
 }
 
 
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index 00fd357c3e46..0135e2a669d7 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -36,8 +36,8 @@ struct xfs_acl {
 };
 
 /* On-disk XFS extended attribute names */
-#define SGI_ACL_FILE		"SGI_ACL_FILE"
-#define SGI_ACL_DEFAULT		"SGI_ACL_DEFAULT"
+#define SGI_ACL_FILE		(unsigned char *)"SGI_ACL_FILE"
+#define SGI_ACL_DEFAULT		(unsigned char *)"SGI_ACL_DEFAULT"
 #define SGI_ACL_FILE_SIZE	(sizeof(SGI_ACL_FILE)-1)
 #define SGI_ACL_DEFAULT_SIZE	(sizeof(SGI_ACL_DEFAULT)-1)
 
@@ -49,8 +49,8 @@ extern int xfs_acl_chmod(struct inode *inode);
 extern int posix_acl_access_exists(struct inode *inode);
 extern int posix_acl_default_exists(struct inode *inode);
 
-extern struct xattr_handler xfs_xattr_acl_access_handler;
-extern struct xattr_handler xfs_xattr_acl_default_handler;
+extern const struct xattr_handler xfs_xattr_acl_access_handler;
+extern const struct xattr_handler xfs_xattr_acl_default_handler;
 #else
 # define xfs_check_acl					NULL
 # define xfs_get_acl(inode, type)			NULL
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 6702bd865811..4917d4eed4ed 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -175,29 +175,31 @@ typedef struct xfs_agfl {
 } xfs_agfl_t;
 
 /*
- * Busy block/extent entry.  Used in perag to mark blocks that have been freed
- * but whose transactions aren't committed to disk yet.
+ * Busy block/extent entry.  Indexed by a rbtree in perag to mark blocks that
+ * have been freed but whose transactions aren't committed to disk yet.
+ *
+ * Note that we use the transaction ID to record the transaction, not the
+ * transaction structure itself. See xfs_alloc_busy_insert() for details.
  */
-typedef struct xfs_perag_busy {
-	xfs_agblock_t	busy_start;
-	xfs_extlen_t	busy_length;
-	struct xfs_trans *busy_tp;	/* transaction that did the free */
-} xfs_perag_busy_t;
+struct xfs_busy_extent {
+	struct rb_node	rb_node;	/* ag by-bno indexed search tree */
+	struct list_head list;		/* transaction busy extent list */
+	xfs_agnumber_t	agno;
+	xfs_agblock_t	bno;
+	xfs_extlen_t	length;
+	xlog_tid_t	tid;		/* transaction that created this */
+};
 
 /*
  * Per-ag incore structure, copies of information in agf and agi,
  * to improve the performance of allocation group selection.
- *
- * pick sizes which fit in allocation buckets well
  */
-#if (BITS_PER_LONG == 32)
-#define XFS_PAGB_NUM_SLOTS	84
-#elif (BITS_PER_LONG == 64)
 #define XFS_PAGB_NUM_SLOTS	128
-#endif
 
-typedef struct xfs_perag
-{
+typedef struct xfs_perag {
+	struct xfs_mount *pag_mount;	/* owner filesystem */
+	xfs_agnumber_t	pag_agno;	/* AG this structure belongs to */
+	atomic_t	pag_ref;	/* perag reference count */
 	char		pagf_init;	/* this agf's entry is initialized */
 	char		pagi_init;	/* this agi's entry is initialized */
 	char		pagf_metadata;	/* the agf is preferred to be metadata */
@@ -210,8 +212,6 @@ typedef struct xfs_perag
 	__uint32_t	pagf_btreeblks;	/* # of blocks held in AGF btrees */
 	xfs_agino_t	pagi_freecount;	/* number of free inodes */
 	xfs_agino_t	pagi_count;	/* number of allocated inodes */
-	int		pagb_count;	/* pagb slots in use */
-	xfs_perag_busy_t *pagb_list;	/* unstable blocks */
 
 	/*
 	 * Inode allocation search lookup optimisation.
@@ -222,14 +222,16 @@ typedef struct xfs_perag
 	xfs_agino_t	pagl_leftrec;
 	xfs_agino_t	pagl_rightrec;
 #ifdef __KERNEL__
-	spinlock_t	pagb_lock;	/* lock for pagb_list */
+	spinlock_t	pagb_lock;	/* lock for pagb_tree */
+	struct rb_root	pagb_tree;	/* ordered tree of busy extents */
 
 	atomic_t        pagf_fstrms;    /* # of filestreams active in this AG */
 
-	int		pag_ici_init;	/* incore inode cache initialised */
 	rwlock_t	pag_ici_lock;	/* incore inode lock */
 	struct radix_tree_root pag_ici_root;	/* incore inode cache root */
+	int		pag_ici_reclaimable;	/* reclaimable inodes */
 #endif
+	int		pagb_count;	/* pagb slots in use */
 } xfs_perag_t;
 
 /*
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 275b1f4f9430..a7fbe8a99b12 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -46,11 +46,9 @@
 #define	XFSA_FIXUP_BNO_OK	1
 #define	XFSA_FIXUP_CNT_OK	2
 
-STATIC void
-xfs_alloc_search_busy(xfs_trans_t *tp,
-		    xfs_agnumber_t agno,
-		    xfs_agblock_t bno,
-		    xfs_extlen_t len);
+static int
+xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno,
+		    xfs_agblock_t bno, xfs_extlen_t len);
 
 /*
  * Prototypes for per-ag allocation routines
@@ -540,9 +538,16 @@ xfs_alloc_ag_vextent(
 				be32_to_cpu(agf->agf_length));
 			xfs_alloc_log_agf(args->tp, args->agbp,
 						XFS_AGF_FREEBLKS);
-			/* search the busylist for these blocks */
-			xfs_alloc_search_busy(args->tp, args->agno,
-					args->agbno, args->len);
+			/*
+			 * Search the busylist for these blocks and mark the
+			 * transaction as synchronous if blocks are found. This
+			 * avoids the need to block due to a synchronous log
+			 * force to ensure correct ordering as the synchronous
+			 * transaction will guarantee that for us.
+			 */
+			if (xfs_alloc_busy_search(args->mp, args->agno,
+						args->agbno, args->len))
+				xfs_trans_set_sync(args->tp);
 		}
 		if (!args->isfl)
 			xfs_trans_mod_sb(args->tp,
@@ -1662,11 +1667,13 @@ xfs_free_ag_extent(
 		xfs_agf_t	*agf;
 		xfs_perag_t	*pag;		/* per allocation group data */
 
+		pag = xfs_perag_get(mp, agno);
+		pag->pagf_freeblks += len;
+		xfs_perag_put(pag);
+
 		agf = XFS_BUF_TO_AGF(agbp);
-		pag = &mp->m_perag[agno];
 		be32_add_cpu(&agf->agf_freeblks, len);
 		xfs_trans_agblocks_delta(tp, len);
-		pag->pagf_freeblks += len;
 		XFS_WANT_CORRUPTED_GOTO(
 			be32_to_cpu(agf->agf_freeblks) <=
 			be32_to_cpu(agf->agf_length),
@@ -1691,7 +1698,7 @@ xfs_free_ag_extent(
 	 * when the iclog commits to disk.  If a busy block is allocated,
 	 * the iclog is pushed up to the LSN that freed the block.
 	 */
-	xfs_alloc_mark_busy(tp, agno, bno, len);
+	xfs_alloc_busy_insert(tp, agno, bno, len);
 	return 0;
 
  error0:
@@ -1969,10 +1976,12 @@ xfs_alloc_get_freelist(
 	xfs_trans_brelse(tp, agflbp);
 	if (be32_to_cpu(agf->agf_flfirst) == XFS_AGFL_SIZE(mp))
 		agf->agf_flfirst = 0;
-	pag = &mp->m_perag[be32_to_cpu(agf->agf_seqno)];
+
+	pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno));
 	be32_add_cpu(&agf->agf_flcount, -1);
 	xfs_trans_agflist_delta(tp, -1);
 	pag->pagf_flcount--;
+	xfs_perag_put(pag);
 
 	logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT;
 	if (btreeblk) {
@@ -1985,14 +1994,20 @@ xfs_alloc_get_freelist(
 	*bnop = bno;
 
 	/*
-	 * As blocks are freed, they are added to the per-ag busy list
-	 * and remain there until the freeing transaction is committed to
-	 * disk.  Now that we have allocated blocks, this list must be
-	 * searched to see if a block is being reused.  If one is, then
-	 * the freeing transaction must be pushed to disk NOW by forcing
-	 * to disk all iclogs up that transaction's LSN.
+	 * As blocks are freed, they are added to the per-ag busy list and
+	 * remain there until the freeing transaction is committed to disk.
+	 * Now that we have allocated blocks, this list must be searched to see
+	 * if a block is being reused.  If one is, then the freeing transaction
+	 * must be pushed to disk before this transaction.
+	 *
+	 * We do this by setting the current transaction to a sync transaction
+	 * which guarantees that the freeing transaction is on disk before this
+	 * transaction. This is done instead of a synchronous log force here so
+	 * that we don't sit and wait with the AGF locked in the transaction
+	 * during the log force.
 	 */
-	xfs_alloc_search_busy(tp, be32_to_cpu(agf->agf_seqno), bno, 1);
+	if (xfs_alloc_busy_search(mp, be32_to_cpu(agf->agf_seqno), bno, 1))
+		xfs_trans_set_sync(tp);
 	return 0;
 }
 
@@ -2078,7 +2093,8 @@ xfs_alloc_put_freelist(
 	be32_add_cpu(&agf->agf_fllast, 1);
 	if (be32_to_cpu(agf->agf_fllast) == XFS_AGFL_SIZE(mp))
 		agf->agf_fllast = 0;
-	pag = &mp->m_perag[be32_to_cpu(agf->agf_seqno)];
+
+	pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno));
 	be32_add_cpu(&agf->agf_flcount, 1);
 	xfs_trans_agflist_delta(tp, 1);
 	pag->pagf_flcount++;
@@ -2089,6 +2105,7 @@ xfs_alloc_put_freelist(
 		pag->pagf_btreeblks--;
 		logflags |= XFS_AGF_BTREEBLKS;
 	}
+	xfs_perag_put(pag);
 
 	xfs_alloc_log_agf(tp, agbp, logflags);
 
@@ -2152,7 +2169,6 @@ xfs_read_agf(
 		xfs_trans_brelse(tp, *bpp);
 		return XFS_ERROR(EFSCORRUPTED);
 	}
-
 	XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_AGF, XFS_AGF_REF);
 	return 0;
 }
@@ -2175,7 +2191,7 @@ xfs_alloc_read_agf(
 	ASSERT(agno != NULLAGNUMBER);
 
 	error = xfs_read_agf(mp, tp, agno,
-			(flags & XFS_ALLOC_FLAG_TRYLOCK) ? XFS_BUF_TRYLOCK : 0,
+			(flags & XFS_ALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0,
 			bpp);
 	if (error)
 		return error;
@@ -2184,7 +2200,7 @@ xfs_alloc_read_agf(
 	ASSERT(!XFS_BUF_GETERROR(*bpp));
 
 	agf = XFS_BUF_TO_AGF(*bpp);
-	pag = &mp->m_perag[agno];
+	pag = xfs_perag_get(mp, agno);
 	if (!pag->pagf_init) {
 		pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks);
 		pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks);
@@ -2195,8 +2211,8 @@ xfs_alloc_read_agf(
 		pag->pagf_levels[XFS_BTNUM_CNTi] =
 			be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
 		spin_lock_init(&pag->pagb_lock);
-		pag->pagb_list = kmem_zalloc(XFS_PAGB_NUM_SLOTS *
-					sizeof(xfs_perag_busy_t), KM_SLEEP);
+		pag->pagb_count = 0;
+		pag->pagb_tree = RB_ROOT;
 		pag->pagf_init = 1;
 	}
 #ifdef DEBUG
@@ -2211,6 +2227,7 @@ xfs_alloc_read_agf(
 		       be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]));
 	}
 #endif
+	xfs_perag_put(pag);
 	return 0;
 }
 
@@ -2270,8 +2287,7 @@ xfs_alloc_vextent(
 		 * These three force us into a single a.g.
 		 */
 		args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
-		down_read(&mp->m_peraglock);
-		args->pag = &mp->m_perag[args->agno];
+		args->pag = xfs_perag_get(mp, args->agno);
 		args->minleft = 0;
 		error = xfs_alloc_fix_freelist(args, 0);
 		args->minleft = minleft;
@@ -2280,14 +2296,12 @@ xfs_alloc_vextent(
 			goto error0;
 		}
 		if (!args->agbp) {
-			up_read(&mp->m_peraglock);
 			trace_xfs_alloc_vextent_noagbp(args);
 			break;
 		}
 		args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
 		if ((error = xfs_alloc_ag_vextent(args)))
 			goto error0;
-		up_read(&mp->m_peraglock);
 		break;
 	case XFS_ALLOCTYPE_START_BNO:
 		/*
@@ -2339,9 +2353,8 @@ xfs_alloc_vextent(
 		 * Loop over allocation groups twice; first time with
 		 * trylock set, second time without.
 		 */
-		down_read(&mp->m_peraglock);
 		for (;;) {
-			args->pag = &mp->m_perag[args->agno];
+			args->pag = xfs_perag_get(mp, args->agno);
 			if (no_min) args->minleft = 0;
 			error = xfs_alloc_fix_freelist(args, flags);
 			args->minleft = minleft;
@@ -2400,8 +2413,8 @@ xfs_alloc_vextent(
 					}
 				}
 			}
+			xfs_perag_put(args->pag);
 		}
-		up_read(&mp->m_peraglock);
 		if (bump_rotor || (type == XFS_ALLOCTYPE_ANY_AG)) {
 			if (args->agno == sagno)
 				mp->m_agfrotor = (mp->m_agfrotor + 1) %
@@ -2427,9 +2440,10 @@ xfs_alloc_vextent(
 			args->len);
 #endif
 	}
+	xfs_perag_put(args->pag);
 	return 0;
 error0:
-	up_read(&mp->m_peraglock);
+	xfs_perag_put(args->pag);
 	return error;
 }
 
@@ -2454,8 +2468,7 @@ xfs_free_extent(
 	args.agno = XFS_FSB_TO_AGNO(args.mp, bno);
 	ASSERT(args.agno < args.mp->m_sb.sb_agcount);
 	args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno);
-	down_read(&args.mp->m_peraglock);
-	args.pag = &args.mp->m_perag[args.agno];
+	args.pag = xfs_perag_get(args.mp, args.agno);
 	if ((error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING)))
 		goto error0;
 #ifdef DEBUG
@@ -2465,7 +2478,7 @@ xfs_free_extent(
 #endif
 	error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0);
 error0:
-	up_read(&args.mp->m_peraglock);
+	xfs_perag_put(args.pag);
 	return error;
 }
 
@@ -2477,127 +2490,263 @@ error0:
  * list is reused, the transaction that freed it must be forced to disk
  * before continuing to use the block.
  *
- * xfs_alloc_mark_busy - add to the per-ag busy list
- * xfs_alloc_clear_busy - remove an item from the per-ag busy list
+ * xfs_alloc_busy_insert - add to the per-ag busy list
+ * xfs_alloc_busy_clear - remove an item from the per-ag busy list
+ * xfs_alloc_busy_search - search for a busy extent
+ */
+
+/*
+ * Insert a new extent into the busy tree.
+ *
+ * The busy extent tree is indexed by the start block of the busy extent.
+ * there can be multiple overlapping ranges in the busy extent tree but only
+ * ever one entry at a given start block. The reason for this is that
+ * multi-block extents can be freed, then smaller chunks of that extent
+ * allocated and freed again before the first transaction commit is on disk.
+ * If the exact same start block is freed a second time, we have to wait for
+ * that busy extent to pass out of the tree before the new extent is inserted.
+ * There are two main cases we have to handle here.
+ *
+ * The first case is a transaction that triggers a "free - allocate - free"
+ * cycle. This can occur during btree manipulations as a btree block is freed
+ * to the freelist, then allocated from the free list, then freed again. In
+ * this case, the second extxpnet free is what triggers the duplicate and as
+ * such the transaction IDs should match. Because the extent was allocated in
+ * this transaction, the transaction must be marked as synchronous. This is
+ * true for all cases where the free/alloc/free occurs in the one transaction,
+ * hence the addition of the ASSERT(tp->t_flags & XFS_TRANS_SYNC) to this case.
+ * This serves to catch violations of the second case quite effectively.
+ *
+ * The second case is where the free/alloc/free occur in different
+ * transactions. In this case, the thread freeing the extent the second time
+ * can't mark the extent busy immediately because it is already tracked in a
+ * transaction that may be committing.  When the log commit for the existing
+ * busy extent completes, the busy extent will be removed from the tree. If we
+ * allow the second busy insert to continue using that busy extent structure,
+ * it can be freed before this transaction is safely in the log.  Hence our
+ * only option in this case is to force the log to remove the existing busy
+ * extent from the list before we insert the new one with the current
+ * transaction ID.
+ *
+ * The problem we are trying to avoid in the free-alloc-free in separate
+ * transactions is most easily described with a timeline:
+ *
+ *      Thread 1	Thread 2	Thread 3	xfslogd
+ *	xact alloc
+ *	free X
+ *	mark busy
+ *	commit xact
+ *	free xact
+ *			xact alloc
+ *			alloc X
+ *			busy search
+ *			mark xact sync
+ *			commit xact
+ *			free xact
+ *			force log
+ *			checkpoint starts
+ *			....
+ *					xact alloc
+ *					free X
+ *					mark busy
+ *					finds match
+ *					*** KABOOM! ***
+ *					....
+ *							log IO completes
+ *							unbusy X
+ *			checkpoint completes
+ *
+ * By issuing a log force in thread 3 @ "KABOOM", the thread will block until
+ * the checkpoint completes, and the busy extent it matched will have been
+ * removed from the tree when it is woken. Hence it can then continue safely.
+ *
+ * However, to ensure this matching process is robust, we need to use the
+ * transaction ID for identifying transaction, as delayed logging results in
+ * the busy extent and transaction lifecycles being different. i.e. the busy
+ * extent is active for a lot longer than the transaction.  Hence the
+ * transaction structure can be freed and reallocated, then mark the same
+ * extent busy again in the new transaction. In this case the new transaction
+ * will have a different tid but can have the same address, and hence we need
+ * to check against the tid.
+ *
+ * Future: for delayed logging, we could avoid the log force if the extent was
+ * first freed in the current checkpoint sequence. This, however, requires the
+ * ability to pin the current checkpoint in memory until this transaction
+ * commits to ensure that both the original free and the current one combine
+ * logically into the one checkpoint. If the checkpoint sequences are
+ * different, however, we still need to wait on a log force.
  */
 void
-xfs_alloc_mark_busy(xfs_trans_t *tp,
-		    xfs_agnumber_t agno,
-		    xfs_agblock_t bno,
-		    xfs_extlen_t len)
+xfs_alloc_busy_insert(
+	struct xfs_trans	*tp,
+	xfs_agnumber_t		agno,
+	xfs_agblock_t		bno,
+	xfs_extlen_t		len)
 {
-	xfs_mount_t		*mp;
-	xfs_perag_busy_t	*bsy;
-	int			n;
+	struct xfs_busy_extent	*new;
+	struct xfs_busy_extent	*busyp;
+	struct xfs_perag	*pag;
+	struct rb_node		**rbp;
+	struct rb_node		*parent;
+	int			match;
 
-	mp = tp->t_mountp;
-	spin_lock(&mp->m_perag[agno].pagb_lock);
-
-	/* search pagb_list for an open slot */
-	for (bsy = mp->m_perag[agno].pagb_list, n = 0;
-	     n < XFS_PAGB_NUM_SLOTS;
-	     bsy++, n++) {
-		if (bsy->busy_tp == NULL) {
-			break;
-		}
-	}
 
-	trace_xfs_alloc_busy(mp, agno, bno, len, n);
-
-	if (n < XFS_PAGB_NUM_SLOTS) {
-		bsy = &mp->m_perag[agno].pagb_list[n];
-		mp->m_perag[agno].pagb_count++;
-		bsy->busy_start = bno;
-		bsy->busy_length = len;
-		bsy->busy_tp = tp;
-		xfs_trans_add_busy(tp, agno, n);
-	} else {
+	new = kmem_zalloc(sizeof(struct xfs_busy_extent), KM_MAYFAIL);
+	if (!new) {
 		/*
-		 * The busy list is full!  Since it is now not possible to
-		 * track the free block, make this a synchronous transaction
-		 * to insure that the block is not reused before this
-		 * transaction commits.
+		 * No Memory!  Since it is now not possible to track the free
+		 * block, make this a synchronous transaction to insure that
+		 * the block is not reused before this transaction commits.
 		 */
+		trace_xfs_alloc_busy(tp, agno, bno, len, 1);
 		xfs_trans_set_sync(tp);
+		return;
 	}
 
-	spin_unlock(&mp->m_perag[agno].pagb_lock);
-}
-
-void
-xfs_alloc_clear_busy(xfs_trans_t *tp,
-		     xfs_agnumber_t agno,
-		     int idx)
-{
-	xfs_mount_t		*mp;
-	xfs_perag_busy_t	*list;
-
-	mp = tp->t_mountp;
-
-	spin_lock(&mp->m_perag[agno].pagb_lock);
-	list = mp->m_perag[agno].pagb_list;
-
-	ASSERT(idx < XFS_PAGB_NUM_SLOTS);
-
-	trace_xfs_alloc_unbusy(mp, agno, idx, list[idx].busy_tp == tp);
-
-	if (list[idx].busy_tp == tp) {
-		list[idx].busy_tp = NULL;
-		mp->m_perag[agno].pagb_count--;
+	new->agno = agno;
+	new->bno = bno;
+	new->length = len;
+	new->tid = xfs_log_get_trans_ident(tp);
+
+	INIT_LIST_HEAD(&new->list);
+
+	/* trace before insert to be able to see failed inserts */
+	trace_xfs_alloc_busy(tp, agno, bno, len, 0);
+
+	pag = xfs_perag_get(tp->t_mountp, new->agno);
+restart:
+	spin_lock(&pag->pagb_lock);
+	rbp = &pag->pagb_tree.rb_node;
+	parent = NULL;
+	busyp = NULL;
+	match = 0;
+	while (*rbp && match >= 0) {
+		parent = *rbp;
+		busyp = rb_entry(parent, struct xfs_busy_extent, rb_node);
+
+		if (new->bno < busyp->bno) {
+			/* may overlap, but exact start block is lower */
+			rbp = &(*rbp)->rb_left;
+			if (new->bno + new->length > busyp->bno)
+				match = busyp->tid == new->tid ? 1 : -1;
+		} else if (new->bno > busyp->bno) {
+			/* may overlap, but exact start block is higher */
+			rbp = &(*rbp)->rb_right;
+			if (bno < busyp->bno + busyp->length)
+				match = busyp->tid == new->tid ? 1 : -1;
+		} else {
+			match = busyp->tid == new->tid ? 1 : -1;
+			break;
+		}
+	}
+	if (match < 0) {
+		/* overlap marked busy in different transaction */
+		spin_unlock(&pag->pagb_lock);
+		xfs_log_force(tp->t_mountp, XFS_LOG_SYNC);
+		goto restart;
 	}
+	if (match > 0) {
+		/*
+		 * overlap marked busy in same transaction. Update if exact
+		 * start block match, otherwise combine the busy extents into
+		 * a single range.
+		 */
+		if (busyp->bno == new->bno) {
+			busyp->length = max(busyp->length, new->length);
+			spin_unlock(&pag->pagb_lock);
+			ASSERT(tp->t_flags & XFS_TRANS_SYNC);
+			xfs_perag_put(pag);
+			kmem_free(new);
+			return;
+		}
+		rb_erase(&busyp->rb_node, &pag->pagb_tree);
+		new->length = max(busyp->bno + busyp->length,
+					new->bno + new->length) -
+				min(busyp->bno, new->bno);
+		new->bno = min(busyp->bno, new->bno);
+	} else
+		busyp = NULL;
 
-	spin_unlock(&mp->m_perag[agno].pagb_lock);
-}
+	rb_link_node(&new->rb_node, parent, rbp);
+	rb_insert_color(&new->rb_node, &pag->pagb_tree);
 
+	list_add(&new->list, &tp->t_busy);
+	spin_unlock(&pag->pagb_lock);
+	xfs_perag_put(pag);
+	kmem_free(busyp);
+}
 
 /*
- * If we find the extent in the busy list, force the log out to get the
- * extent out of the busy list so the caller can use it straight away.
+ * Search for a busy extent within the range of the extent we are about to
+ * allocate.  You need to be holding the busy extent tree lock when calling
+ * xfs_alloc_busy_search(). This function returns 0 for no overlapping busy
+ * extent, -1 for an overlapping but not exact busy extent, and 1 for an exact
+ * match. This is done so that a non-zero return indicates an overlap that
+ * will require a synchronous transaction, but it can still be
+ * used to distinguish between a partial or exact match.
  */
-STATIC void
-xfs_alloc_search_busy(xfs_trans_t *tp,
-		    xfs_agnumber_t agno,
-		    xfs_agblock_t bno,
-		    xfs_extlen_t len)
+static int
+xfs_alloc_busy_search(
+	struct xfs_mount	*mp,
+	xfs_agnumber_t		agno,
+	xfs_agblock_t		bno,
+	xfs_extlen_t		len)
 {
-	xfs_mount_t		*mp;
-	xfs_perag_busy_t	*bsy;
-	xfs_agblock_t		uend, bend;
-	xfs_lsn_t		lsn = 0;
-	int			cnt;
-
-	mp = tp->t_mountp;
+	struct xfs_perag	*pag;
+	struct rb_node		*rbp;
+	struct xfs_busy_extent	*busyp;
+	int			match = 0;
+
+	pag = xfs_perag_get(mp, agno);
+	spin_lock(&pag->pagb_lock);
+
+	rbp = pag->pagb_tree.rb_node;
+
+	/* find closest start bno overlap */
+	while (rbp) {
+		busyp = rb_entry(rbp, struct xfs_busy_extent, rb_node);
+		if (bno < busyp->bno) {
+			/* may overlap, but exact start block is lower */
+			if (bno + len > busyp->bno)
+				match = -1;
+			rbp = rbp->rb_left;
+		} else if (bno > busyp->bno) {
+			/* may overlap, but exact start block is higher */
+			if (bno < busyp->bno + busyp->length)
+				match = -1;
+			rbp = rbp->rb_right;
+		} else {
+			/* bno matches busyp, length determines exact match */
+			match = (busyp->length == len) ? 1 : -1;
+			break;
+		}
+	}
+	spin_unlock(&pag->pagb_lock);
+	trace_xfs_alloc_busysearch(mp, agno, bno, len, !!match);
+	xfs_perag_put(pag);
+	return match;
+}
 
-	spin_lock(&mp->m_perag[agno].pagb_lock);
+void
+xfs_alloc_busy_clear(
+	struct xfs_mount	*mp,
+	struct xfs_busy_extent	*busyp)
+{
+	struct xfs_perag	*pag;
 
-	uend = bno + len - 1;
+	trace_xfs_alloc_unbusy(mp, busyp->agno, busyp->bno,
+						busyp->length);
 
-	/*
-	 * search pagb_list for this slot, skipping open slots. We have to
-	 * search the entire array as there may be multiple overlaps and
-	 * we have to get the most recent LSN for the log force to push out
-	 * all the transactions that span the range.
-	 */
-	for (cnt = 0; cnt < mp->m_perag[agno].pagb_count; cnt++) {
-		bsy = &mp->m_perag[agno].pagb_list[cnt];
-		if (!bsy->busy_tp)
-			continue;
+	ASSERT(xfs_alloc_busy_search(mp, busyp->agno, busyp->bno,
+						busyp->length) == 1);
 
-		bend = bsy->busy_start + bsy->busy_length - 1;
-		if (bno > bend || uend < bsy->busy_start)
-			continue;
+	list_del_init(&busyp->list);
 
-		/* (start1,length1) within (start2, length2) */
-		if (XFS_LSN_CMP(bsy->busy_tp->t_commit_lsn, lsn) > 0)
-			lsn = bsy->busy_tp->t_commit_lsn;
-	}
-	spin_unlock(&mp->m_perag[agno].pagb_lock);
-	trace_xfs_alloc_busysearch(tp->t_mountp, agno, bno, len, lsn);
+	pag = xfs_perag_get(mp, busyp->agno);
+	spin_lock(&pag->pagb_lock);
+	rb_erase(&busyp->rb_node, &pag->pagb_tree);
+	spin_unlock(&pag->pagb_lock);
+	xfs_perag_put(pag);
 
-	/*
-	 * If a block was found, force the log through the LSN of the
-	 * transaction that freed the block
-	 */
-	if (lsn)
-		xfs_log_force(mp, lsn, XFS_LOG_FORCE|XFS_LOG_SYNC);
+	kmem_free(busyp);
 }
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 599bffa39784..6d05199b667c 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -22,6 +22,7 @@ struct xfs_buf;
 struct xfs_mount;
 struct xfs_perag;
 struct xfs_trans;
+struct xfs_busy_extent;
 
 /*
  * Freespace allocation types.  Argument to xfs_alloc_[v]extent.
@@ -119,15 +120,13 @@ xfs_alloc_longest_free_extent(struct xfs_mount *mp,
 #ifdef __KERNEL__
 
 void
-xfs_alloc_mark_busy(xfs_trans_t *tp,
+xfs_alloc_busy_insert(xfs_trans_t *tp,
 		xfs_agnumber_t agno,
 		xfs_agblock_t bno,
 		xfs_extlen_t len);
 
 void
-xfs_alloc_clear_busy(xfs_trans_t *tp,
-		xfs_agnumber_t ag,
-		int idx);
+xfs_alloc_busy_clear(struct xfs_mount *mp, struct xfs_busy_extent *busyp);
 
 #endif	/* __KERNEL__ */
 
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index adbd9141aea1..83f494218759 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -61,12 +61,14 @@ xfs_allocbt_set_root(
 	struct xfs_agf		*agf = XFS_BUF_TO_AGF(agbp);
 	xfs_agnumber_t		seqno = be32_to_cpu(agf->agf_seqno);
 	int			btnum = cur->bc_btnum;
+	struct xfs_perag	*pag = xfs_perag_get(cur->bc_mp, seqno);
 
 	ASSERT(ptr->s != 0);
 
 	agf->agf_roots[btnum] = ptr->s;
 	be32_add_cpu(&agf->agf_levels[btnum], inc);
-	cur->bc_mp->m_perag[seqno].pagf_levels[btnum] += inc;
+	pag->pagf_levels[btnum] += inc;
+	xfs_perag_put(pag);
 
 	xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
 }
@@ -132,7 +134,7 @@ xfs_allocbt_free_block(
 	 * disk. If a busy block is allocated, the iclog is pushed up to the
 	 * LSN that freed the block.
 	 */
-	xfs_alloc_mark_busy(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1);
+	xfs_alloc_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1);
 	xfs_trans_agbtree_delta(cur->bc_tp, -1);
 	return 0;
 }
@@ -150,6 +152,7 @@ xfs_allocbt_update_lastrec(
 {
 	struct xfs_agf		*agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
 	xfs_agnumber_t		seqno = be32_to_cpu(agf->agf_seqno);
+	struct xfs_perag	*pag;
 	__be32			len;
 	int			numrecs;
 
@@ -193,7 +196,9 @@ xfs_allocbt_update_lastrec(
 	}
 
 	agf->agf_longest = len;
-	cur->bc_mp->m_perag[seqno].pagf_longest = be32_to_cpu(len);
+	pag = xfs_perag_get(cur->bc_mp, seqno);
+	pag->pagf_longest = be32_to_cpu(len);
+	xfs_perag_put(pag);
 	xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp, XFS_AGF_LONGEST);
 }
 
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index e953b6cfb2a8..b9c196a53c42 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -93,12 +93,12 @@ STATIC int xfs_attr_rmtval_remove(xfs_da_args_t *args);
 STATIC int
 xfs_attr_name_to_xname(
 	struct xfs_name	*xname,
-	const char	*aname)
+	const unsigned char *aname)
 {
 	if (!aname)
 		return EINVAL;
 	xname->name = aname;
-	xname->len = strlen(aname);
+	xname->len = strlen((char *)aname);
 	if (xname->len >= MAXNAMELEN)
 		return EFAULT;		/* match IRIX behaviour */
 
@@ -124,7 +124,7 @@ STATIC int
 xfs_attr_get_int(
 	struct xfs_inode	*ip,
 	struct xfs_name		*name,
-	char			*value,
+	unsigned char		*value,
 	int			*valuelenp,
 	int			flags)
 {
@@ -171,8 +171,8 @@ xfs_attr_get_int(
 int
 xfs_attr_get(
 	xfs_inode_t	*ip,
-	const char	*name,
-	char		*value,
+	const unsigned char *name,
+	unsigned char	*value,
 	int		*valuelenp,
 	int		flags)
 {
@@ -197,7 +197,7 @@ xfs_attr_get(
 /*
  * Calculate how many blocks we need for the new attribute,
  */
-int
+STATIC int
 xfs_attr_calc_size(
 	struct xfs_inode 	*ip,
 	int			namelen,
@@ -235,8 +235,12 @@ xfs_attr_calc_size(
 }
 
 STATIC int
-xfs_attr_set_int(xfs_inode_t *dp, struct xfs_name *name,
-		char *value, int valuelen, int flags)
+xfs_attr_set_int(
+	struct xfs_inode *dp,
+	struct xfs_name	*name,
+	unsigned char	*value,
+	int		valuelen,
+	int		flags)
 {
 	xfs_da_args_t	args;
 	xfs_fsblock_t	firstblock;
@@ -452,8 +456,8 @@ out:
 int
 xfs_attr_set(
 	xfs_inode_t	*dp,
-	const char	*name,
-	char		*value,
+	const unsigned char *name,
+	unsigned char	*value,
 	int		valuelen,
 	int		flags)
 {
@@ -600,7 +604,7 @@ out:
 int
 xfs_attr_remove(
 	xfs_inode_t	*dp,
-	const char	*name,
+	const unsigned char *name,
 	int		flags)
 {
 	int		error;
@@ -669,9 +673,13 @@ xfs_attr_list_int(xfs_attr_list_context_t *context)
  */
 /*ARGSUSED*/
 STATIC int
-xfs_attr_put_listent(xfs_attr_list_context_t *context, int flags,
-		     char *name, int namelen,
-		     int valuelen, char *value)
+xfs_attr_put_listent(
+	xfs_attr_list_context_t *context,
+	int		flags,
+	unsigned char	*name,
+	int		namelen,
+	int		valuelen,
+	unsigned char	*value)
 {
 	struct attrlist *alist = (struct attrlist *)context->alist;
 	attrlist_ent_t *aep;
@@ -1980,7 +1988,7 @@ xfs_attr_rmtval_get(xfs_da_args_t *args)
 	xfs_bmbt_irec_t map[ATTR_RMTVALUE_MAPSIZE];
 	xfs_mount_t *mp;
 	xfs_daddr_t dblkno;
-	xfs_caddr_t dst;
+	void *dst;
 	xfs_buf_t *bp;
 	int nmap, error, tmp, valuelen, blkcnt, i;
 	xfs_dablk_t lblkno;
@@ -2007,15 +2015,14 @@ xfs_attr_rmtval_get(xfs_da_args_t *args)
 			dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock);
 			blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
 			error = xfs_read_buf(mp, mp->m_ddev_targp, dblkno,
-					     blkcnt,
-					     XFS_BUF_LOCK | XBF_DONT_BLOCK,
+					     blkcnt, XBF_LOCK | XBF_DONT_BLOCK,
 					     &bp);
 			if (error)
 				return(error);
 
 			tmp = (valuelen < XFS_BUF_SIZE(bp))
 				? valuelen : XFS_BUF_SIZE(bp);
-			xfs_biomove(bp, 0, tmp, dst, XFS_B_READ);
+			xfs_biomove(bp, 0, tmp, dst, XBF_READ);
 			xfs_buf_relse(bp);
 			dst += tmp;
 			valuelen -= tmp;
@@ -2039,7 +2046,7 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
 	xfs_inode_t *dp;
 	xfs_bmbt_irec_t map;
 	xfs_daddr_t dblkno;
-	xfs_caddr_t src;
+	void *src;
 	xfs_buf_t *bp;
 	xfs_dablk_t lblkno;
 	int blkcnt, valuelen, nmap, error, tmp, committed;
@@ -2141,13 +2148,13 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
 		blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
 
 		bp = xfs_buf_get(mp->m_ddev_targp, dblkno, blkcnt,
-				 XFS_BUF_LOCK | XBF_DONT_BLOCK);
+				 XBF_LOCK | XBF_DONT_BLOCK);
 		ASSERT(bp);
 		ASSERT(!XFS_BUF_GETERROR(bp));
 
 		tmp = (valuelen < XFS_BUF_SIZE(bp)) ? valuelen :
 							XFS_BUF_SIZE(bp);
-		xfs_biomove(bp, 0, tmp, src, XFS_B_WRITE);
+		xfs_biomove(bp, 0, tmp, src, XBF_WRITE);
 		if (tmp < XFS_BUF_SIZE(bp))
 			xfs_biozero(bp, tmp, XFS_BUF_SIZE(bp) - tmp);
 		if ((error = xfs_bwrite(mp, bp))) {/* GROT: NOTE: synchronous write */
@@ -2208,8 +2215,7 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
 		/*
 		 * If the "remote" value is in the cache, remove it.
 		 */
-		bp = xfs_incore(mp->m_ddev_targp, dblkno, blkcnt,
-				XFS_INCORE_TRYLOCK);
+		bp = xfs_incore(mp->m_ddev_targp, dblkno, blkcnt, XBF_TRYLOCK);
 		if (bp) {
 			XFS_BUF_STALE(bp);
 			XFS_BUF_UNDELAYWRITE(bp);
diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h
index 59b410ce69a1..e920d68ef509 100644
--- a/fs/xfs/xfs_attr.h
+++ b/fs/xfs/xfs_attr.h
@@ -113,7 +113,7 @@ typedef struct attrlist_cursor_kern {
 
 
 typedef int (*put_listent_func_t)(struct xfs_attr_list_context *, int,
-				      char *, int, int, char *);
+			      unsigned char *, int, int, unsigned char *);
 
 typedef struct xfs_attr_list_context {
 	struct xfs_inode		*dp;		/* inode */
@@ -139,7 +139,6 @@ typedef struct xfs_attr_list_context {
 /*
  * Overall external interface routines.
  */
-int xfs_attr_calc_size(struct xfs_inode *, int, int, int *);
 int xfs_attr_inactive(struct xfs_inode *dp);
 int xfs_attr_rmtval_get(struct xfs_da_args *args);
 int xfs_attr_list_int(struct xfs_attr_list_context *);
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index baf41b5af756..a90ce74fc256 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -521,11 +521,11 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
 
 	sfe = &sf->list[0];
 	for (i = 0; i < sf->hdr.count; i++) {
-		nargs.name = (char *)sfe->nameval;
+		nargs.name = sfe->nameval;
 		nargs.namelen = sfe->namelen;
-		nargs.value = (char *)&sfe->nameval[nargs.namelen];
+		nargs.value = &sfe->nameval[nargs.namelen];
 		nargs.valuelen = sfe->valuelen;
-		nargs.hashval = xfs_da_hashname((char *)sfe->nameval,
+		nargs.hashval = xfs_da_hashname(sfe->nameval,
 						sfe->namelen);
 		nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(sfe->flags);
 		error = xfs_attr_leaf_lookup_int(bp, &nargs); /* set a->index */
@@ -612,10 +612,10 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
 		for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) {
 			error = context->put_listent(context,
 					   sfe->flags,
-					   (char *)sfe->nameval,
+					   sfe->nameval,
 					   (int)sfe->namelen,
 					   (int)sfe->valuelen,
-					   (char*)&sfe->nameval[sfe->namelen]);
+					   &sfe->nameval[sfe->namelen]);
 
 			/*
 			 * Either search callback finished early or
@@ -659,8 +659,8 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
 		}
 
 		sbp->entno = i;
-		sbp->hash = xfs_da_hashname((char *)sfe->nameval, sfe->namelen);
-		sbp->name = (char *)sfe->nameval;
+		sbp->hash = xfs_da_hashname(sfe->nameval, sfe->namelen);
+		sbp->name = sfe->nameval;
 		sbp->namelen = sfe->namelen;
 		/* These are bytes, and both on-disk, don't endian-flip */
 		sbp->valuelen = sfe->valuelen;
@@ -818,9 +818,9 @@ xfs_attr_leaf_to_shortform(xfs_dabuf_t *bp, xfs_da_args_t *args, int forkoff)
 			continue;
 		ASSERT(entry->flags & XFS_ATTR_LOCAL);
 		name_loc = xfs_attr_leaf_name_local(leaf, i);
-		nargs.name = (char *)name_loc->nameval;
+		nargs.name = name_loc->nameval;
 		nargs.namelen = name_loc->namelen;
-		nargs.value = (char *)&name_loc->nameval[nargs.namelen];
+		nargs.value = &name_loc->nameval[nargs.namelen];
 		nargs.valuelen = be16_to_cpu(name_loc->valuelen);
 		nargs.hashval = be32_to_cpu(entry->hashval);
 		nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(entry->flags);
@@ -2370,10 +2370,10 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context)
 
 			retval = context->put_listent(context,
 						entry->flags,
-						(char *)name_loc->nameval,
+						name_loc->nameval,
 						(int)name_loc->namelen,
 						be16_to_cpu(name_loc->valuelen),
-						(char *)&name_loc->nameval[name_loc->namelen]);
+						&name_loc->nameval[name_loc->namelen]);
 			if (retval)
 				return retval;
 		} else {
@@ -2397,15 +2397,15 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context)
 					return retval;
 				retval = context->put_listent(context,
 						entry->flags,
-						(char *)name_rmt->name,
+						name_rmt->name,
 						(int)name_rmt->namelen,
 						valuelen,
-						(char*)args.value);
+						args.value);
 				kmem_free(args.value);
 			} else {
 				retval = context->put_listent(context,
 						entry->flags,
-						(char *)name_rmt->name,
+						name_rmt->name,
 						(int)name_rmt->namelen,
 						valuelen,
 						NULL);
@@ -2950,7 +2950,7 @@ xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp,
 						map.br_blockcount);
 			bp = xfs_trans_get_buf(*trans,
 					dp->i_mount->m_ddev_targp,
-					dblkno, dblkcnt, XFS_BUF_LOCK);
+					dblkno, dblkcnt, XBF_LOCK);
 			xfs_trans_binval(*trans, bp);
 			/*
 			 * Roll to next transaction.
diff --git a/fs/xfs/xfs_attr_sf.h b/fs/xfs/xfs_attr_sf.h
index 76ab7b0cbb3a..919756e3ba53 100644
--- a/fs/xfs/xfs_attr_sf.h
+++ b/fs/xfs/xfs_attr_sf.h
@@ -52,7 +52,7 @@ typedef struct xfs_attr_sf_sort {
 	__uint8_t	valuelen;	/* length of value */
 	__uint8_t	flags;		/* flags bits (see xfs_attr_leaf.h) */
 	xfs_dahash_t	hash;		/* this entry's hash value */
-	char		*name;		/* name value, pointer into buffer */
+	unsigned char	*name;		/* name value, pointer into buffer */
 } xfs_attr_sf_sort_t;
 
 #define XFS_ATTR_SF_ENTSIZE_BYNAME(nlen,vlen)	/* space name/value uses */ \
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 98251cdc52aa..99587ded043f 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -2550,22 +2550,134 @@ xfs_bmap_rtalloc(
 }
 
 STATIC int
+xfs_bmap_btalloc_nullfb(
+	struct xfs_bmalloca	*ap,
+	struct xfs_alloc_arg	*args,
+	xfs_extlen_t		*blen)
+{
+	struct xfs_mount	*mp = ap->ip->i_mount;
+	struct xfs_perag	*pag;
+	xfs_agnumber_t		ag, startag;
+	int			notinit = 0;
+	int			error;
+
+	if (ap->userdata && xfs_inode_is_filestream(ap->ip))
+		args->type = XFS_ALLOCTYPE_NEAR_BNO;
+	else
+		args->type = XFS_ALLOCTYPE_START_BNO;
+	args->total = ap->total;
+
+	/*
+	 * Search for an allocation group with a single extent large enough
+	 * for the request.  If one isn't found, then adjust the minimum
+	 * allocation size to the largest space found.
+	 */
+	startag = ag = XFS_FSB_TO_AGNO(mp, args->fsbno);
+	if (startag == NULLAGNUMBER)
+		startag = ag = 0;
+
+	pag = xfs_perag_get(mp, ag);
+	while (*blen < ap->alen) {
+		if (!pag->pagf_init) {
+			error = xfs_alloc_pagf_init(mp, args->tp, ag,
+						    XFS_ALLOC_FLAG_TRYLOCK);
+			if (error) {
+				xfs_perag_put(pag);
+				return error;
+			}
+		}
+
+		/*
+		 * See xfs_alloc_fix_freelist...
+		 */
+		if (pag->pagf_init) {
+			xfs_extlen_t	longest;
+			longest = xfs_alloc_longest_free_extent(mp, pag);
+			if (*blen < longest)
+				*blen = longest;
+		} else
+			notinit = 1;
+
+		if (xfs_inode_is_filestream(ap->ip)) {
+			if (*blen >= ap->alen)
+				break;
+
+			if (ap->userdata) {
+				/*
+				 * If startag is an invalid AG, we've
+				 * come here once before and
+				 * xfs_filestream_new_ag picked the
+				 * best currently available.
+				 *
+				 * Don't continue looping, since we
+				 * could loop forever.
+				 */
+				if (startag == NULLAGNUMBER)
+					break;
+
+				error = xfs_filestream_new_ag(ap, &ag);
+				xfs_perag_put(pag);
+				if (error)
+					return error;
+
+				/* loop again to set 'blen'*/
+				startag = NULLAGNUMBER;
+				pag = xfs_perag_get(mp, ag);
+				continue;
+			}
+		}
+		if (++ag == mp->m_sb.sb_agcount)
+			ag = 0;
+		if (ag == startag)
+			break;
+		xfs_perag_put(pag);
+		pag = xfs_perag_get(mp, ag);
+	}
+	xfs_perag_put(pag);
+
+	/*
+	 * Since the above loop did a BUF_TRYLOCK, it is
+	 * possible that there is space for this request.
+	 */
+	if (notinit || *blen < ap->minlen)
+		args->minlen = ap->minlen;
+	/*
+	 * If the best seen length is less than the request
+	 * length, use the best as the minimum.
+	 */
+	else if (*blen < ap->alen)
+		args->minlen = *blen;
+	/*
+	 * Otherwise we've seen an extent as big as alen,
+	 * use that as the minimum.
+	 */
+	else
+		args->minlen = ap->alen;
+
+	/*
+	 * set the failure fallback case to look in the selected
+	 * AG as the stream may have moved.
+	 */
+	if (xfs_inode_is_filestream(ap->ip))
+		ap->rval = args->fsbno = XFS_AGB_TO_FSB(mp, ag, 0);
+
+	return 0;
+}
+
+STATIC int
 xfs_bmap_btalloc(
 	xfs_bmalloca_t	*ap)		/* bmap alloc argument struct */
 {
 	xfs_mount_t	*mp;		/* mount point structure */
 	xfs_alloctype_t	atype = 0;	/* type for allocation routines */
 	xfs_extlen_t	align;		/* minimum allocation alignment */
-	xfs_agnumber_t	ag;
 	xfs_agnumber_t	fb_agno;	/* ag number of ap->firstblock */
-	xfs_agnumber_t	startag;
+	xfs_agnumber_t	ag;
 	xfs_alloc_arg_t	args;
 	xfs_extlen_t	blen;
 	xfs_extlen_t	nextminlen = 0;
-	xfs_perag_t	*pag;
 	int		nullfb;		/* true if ap->firstblock isn't set */
 	int		isaligned;
-	int		notinit;
 	int		tryagain;
 	int		error;
 
@@ -2612,102 +2724,9 @@ xfs_bmap_btalloc(
 	args.firstblock = ap->firstblock;
 	blen = 0;
 	if (nullfb) {
-		if (ap->userdata && xfs_inode_is_filestream(ap->ip))
-			args.type = XFS_ALLOCTYPE_NEAR_BNO;
-		else
-			args.type = XFS_ALLOCTYPE_START_BNO;
-		args.total = ap->total;
-
-		/*
-		 * Search for an allocation group with a single extent
-		 * large enough for the request.
-		 *
-		 * If one isn't found, then adjust the minimum allocation
-		 * size to the largest space found.
-		 */
-		startag = ag = XFS_FSB_TO_AGNO(mp, args.fsbno);
-		if (startag == NULLAGNUMBER)
-			startag = ag = 0;
-		notinit = 0;
-		down_read(&mp->m_peraglock);
-		while (blen < ap->alen) {
-			pag = &mp->m_perag[ag];
-			if (!pag->pagf_init &&
-			    (error = xfs_alloc_pagf_init(mp, args.tp,
-				    ag, XFS_ALLOC_FLAG_TRYLOCK))) {
-				up_read(&mp->m_peraglock);
-				return error;
-			}
-			/*
-			 * See xfs_alloc_fix_freelist...
-			 */
-			if (pag->pagf_init) {
-				xfs_extlen_t	longest;
-				longest = xfs_alloc_longest_free_extent(mp, pag);
-				if (blen < longest)
-					blen = longest;
-			} else
-				notinit = 1;
-
-			if (xfs_inode_is_filestream(ap->ip)) {
-				if (blen >= ap->alen)
-					break;
-
-				if (ap->userdata) {
-					/*
-					 * If startag is an invalid AG, we've
-					 * come here once before and
-					 * xfs_filestream_new_ag picked the
-					 * best currently available.
-					 *
-					 * Don't continue looping, since we
-					 * could loop forever.
-					 */
-					if (startag == NULLAGNUMBER)
-						break;
-
-					error = xfs_filestream_new_ag(ap, &ag);
-					if (error) {
-						up_read(&mp->m_peraglock);
-						return error;
-					}
-
-					/* loop again to set 'blen'*/
-					startag = NULLAGNUMBER;
-					continue;
-				}
-			}
-			if (++ag == mp->m_sb.sb_agcount)
-				ag = 0;
-			if (ag == startag)
-				break;
-		}
-		up_read(&mp->m_peraglock);
-		/*
-		 * Since the above loop did a BUF_TRYLOCK, it is
-		 * possible that there is space for this request.
-		 */
-		if (notinit || blen < ap->minlen)
-			args.minlen = ap->minlen;
-		/*
-		 * If the best seen length is less than the request
-		 * length, use the best as the minimum.
-		 */
-		else if (blen < ap->alen)
-			args.minlen = blen;
-		/*
-		 * Otherwise we've seen an extent as big as alen,
-		 * use that as the minimum.
-		 */
-		else
-			args.minlen = ap->alen;
-
-		/*
-		 * set the failure fallback case to look in the selected
-		 * AG as the stream may have moved.
-		 */
-		if (xfs_inode_is_filestream(ap->ip))
-			ap->rval = args.fsbno = XFS_AGB_TO_FSB(mp, ag, 0);
+		error = xfs_bmap_btalloc_nullfb(ap, &args, &blen);
+		if (error)
+			return error;
 	} else if (ap->low) {
 		if (xfs_inode_is_filestream(ap->ip))
 			args.type = XFS_ALLOCTYPE_FIRST_AG;
@@ -3810,7 +3829,7 @@ xfs_bmap_add_attrfork(
 	}
 	if ((error = xfs_bmap_finish(&tp, &flist, &committed)))
 		goto error2;
-	error = xfs_trans_commit(tp, XFS_TRANS_PERM_LOG_RES);
+	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
 	ASSERT(ip->i_df.if_ext_max ==
 	       XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
 	return error;
@@ -4470,7 +4489,7 @@ xfs_bmapi(
 	xfs_fsblock_t	abno;		/* allocated block number */
 	xfs_extlen_t	alen;		/* allocated extent length */
 	xfs_fileoff_t	aoff;		/* allocated file offset */
-	xfs_bmalloca_t	bma;		/* args for xfs_bmap_alloc */
+	xfs_bmalloca_t	bma = { 0 };	/* args for xfs_bmap_alloc */
 	xfs_btree_cur_t	*cur;		/* bmap btree cursor */
 	xfs_fileoff_t	end;		/* end of mapped file region */
 	int		eof;		/* we've hit the end of extents */
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index 38751d5fac6f..416e47e54b83 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -334,7 +334,7 @@ xfs_bmbt_disk_set_allf(
 /*
  * Set all the fields in a bmap extent record from the uncompressed form.
  */
-void
+STATIC void
 xfs_bmbt_disk_set_all(
 	xfs_bmbt_rec_t	*r,
 	xfs_bmbt_irec_t *s)
diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h
index cf07ca7c22e7..0e66c4ea0f85 100644
--- a/fs/xfs/xfs_bmap_btree.h
+++ b/fs/xfs/xfs_bmap_btree.h
@@ -223,7 +223,6 @@ extern void xfs_bmbt_set_startblock(xfs_bmbt_rec_host_t *r, xfs_fsblock_t v);
 extern void xfs_bmbt_set_startoff(xfs_bmbt_rec_host_t *r, xfs_fileoff_t v);
 extern void xfs_bmbt_set_state(xfs_bmbt_rec_host_t *r, xfs_exntst_t v);
 
-extern void xfs_bmbt_disk_set_all(xfs_bmbt_rec_t *r, xfs_bmbt_irec_t *s);
 extern void xfs_bmbt_disk_set_allf(xfs_bmbt_rec_t *r, xfs_fileoff_t o,
 			xfs_fsblock_t b, xfs_filblks_t c, xfs_exntst_t v);
 
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 36a0992dd669..96be4b0f2496 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -977,7 +977,7 @@ xfs_btree_get_buf_block(
 	xfs_daddr_t		d;
 
 	/* need to sort out how callers deal with failures first */
-	ASSERT(!(flags & XFS_BUF_TRYLOCK));
+	ASSERT(!(flags & XBF_TRYLOCK));
 
 	d = xfs_btree_ptr_to_daddr(cur, ptr);
 	*bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d,
@@ -1008,7 +1008,7 @@ xfs_btree_read_buf_block(
 	int			error;
 
 	/* need to sort out how callers deal with failures first */
-	ASSERT(!(flags & XFS_BUF_TRYLOCK));
+	ASSERT(!(flags & XBF_TRYLOCK));
 
 	d = xfs_btree_ptr_to_daddr(cur, ptr);
 	error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d,
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index a30f7e9eb2b9..02a80984aa05 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -64,7 +64,7 @@ xfs_buf_item_log_debug(
 	nbytes = last - first + 1;
 	bfset(bip->bli_logged, first, nbytes);
 	for (x = 0; x < nbytes; x++) {
-		chunk_num = byte >> XFS_BLI_SHIFT;
+		chunk_num = byte >> XFS_BLF_SHIFT;
 		word_num = chunk_num >> BIT_TO_WORD_SHIFT;
 		bit_num = chunk_num & (NBWORD - 1);
 		wordp = &(bip->bli_format.blf_data_map[word_num]);
@@ -166,7 +166,7 @@ xfs_buf_item_size(
 		 * cancel flag in it.
 		 */
 		trace_xfs_buf_item_size_stale(bip);
-		ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
+		ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
 		return 1;
 	}
 
@@ -197,9 +197,9 @@ xfs_buf_item_size(
 		} else if (next_bit != last_bit + 1) {
 			last_bit = next_bit;
 			nvecs++;
-		} else if (xfs_buf_offset(bp, next_bit * XFS_BLI_CHUNK) !=
-			   (xfs_buf_offset(bp, last_bit * XFS_BLI_CHUNK) +
-			    XFS_BLI_CHUNK)) {
+		} else if (xfs_buf_offset(bp, next_bit * XFS_BLF_CHUNK) !=
+			   (xfs_buf_offset(bp, last_bit * XFS_BLF_CHUNK) +
+			    XFS_BLF_CHUNK)) {
 			last_bit = next_bit;
 			nvecs++;
 		} else {
@@ -250,10 +250,24 @@ xfs_buf_item_format(
 		       ((bip->bli_format.blf_map_size - 1) * sizeof(uint)));
 	vecp->i_addr = (xfs_caddr_t)&bip->bli_format;
 	vecp->i_len = base_size;
-	XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BFORMAT);
+	vecp->i_type = XLOG_REG_TYPE_BFORMAT;
 	vecp++;
 	nvecs = 1;
 
+	/*
+	 * If it is an inode buffer, transfer the in-memory state to the
+	 * format flags and clear the in-memory state. We do not transfer
+	 * this state if the inode buffer allocation has not yet been committed
+	 * to the log as setting the XFS_BLI_INODE_BUF flag will prevent
+	 * correct replay of the inode allocation.
+	 */
+	if (bip->bli_flags & XFS_BLI_INODE_BUF) {
+		if (!((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
+		      xfs_log_item_in_current_chkpt(&bip->bli_item)))
+			bip->bli_format.blf_flags |= XFS_BLF_INODE_BUF;
+		bip->bli_flags &= ~XFS_BLI_INODE_BUF;
+	}
+
 	if (bip->bli_flags & XFS_BLI_STALE) {
 		/*
 		 * The buffer is stale, so all we need to log
@@ -261,7 +275,7 @@ xfs_buf_item_format(
 		 * cancel flag in it.
 		 */
 		trace_xfs_buf_item_format_stale(bip);
-		ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
+		ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
 		bip->bli_format.blf_size = nvecs;
 		return;
 	}
@@ -294,29 +308,29 @@ xfs_buf_item_format(
 		 * keep counting and scanning.
 		 */
 		if (next_bit == -1) {
-			buffer_offset = first_bit * XFS_BLI_CHUNK;
+			buffer_offset = first_bit * XFS_BLF_CHUNK;
 			vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
-			vecp->i_len = nbits * XFS_BLI_CHUNK;
-			XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BCHUNK);
+			vecp->i_len = nbits * XFS_BLF_CHUNK;
+			vecp->i_type = XLOG_REG_TYPE_BCHUNK;
 			nvecs++;
 			break;
 		} else if (next_bit != last_bit + 1) {
-			buffer_offset = first_bit * XFS_BLI_CHUNK;
+			buffer_offset = first_bit * XFS_BLF_CHUNK;
 			vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
-			vecp->i_len = nbits * XFS_BLI_CHUNK;
-			XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BCHUNK);
+			vecp->i_len = nbits * XFS_BLF_CHUNK;
+			vecp->i_type = XLOG_REG_TYPE_BCHUNK;
 			nvecs++;
 			vecp++;
 			first_bit = next_bit;
 			last_bit = next_bit;
 			nbits = 1;
-		} else if (xfs_buf_offset(bp, next_bit << XFS_BLI_SHIFT) !=
-			   (xfs_buf_offset(bp, last_bit << XFS_BLI_SHIFT) +
-			    XFS_BLI_CHUNK)) {
-			buffer_offset = first_bit * XFS_BLI_CHUNK;
+		} else if (xfs_buf_offset(bp, next_bit << XFS_BLF_SHIFT) !=
+			   (xfs_buf_offset(bp, last_bit << XFS_BLF_SHIFT) +
+			    XFS_BLF_CHUNK)) {
+			buffer_offset = first_bit * XFS_BLF_CHUNK;
 			vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
-			vecp->i_len = nbits * XFS_BLI_CHUNK;
-			XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BCHUNK);
+			vecp->i_len = nbits * XFS_BLF_CHUNK;
+			vecp->i_type = XLOG_REG_TYPE_BCHUNK;
 /* You would think we need to bump the nvecs here too, but we do not
  * this number is used by recovery, and it gets confused by the boundary
  * split here
@@ -341,10 +355,15 @@ xfs_buf_item_format(
 }
 
 /*
- * This is called to pin the buffer associated with the buf log
- * item in memory so it cannot be written out.  Simply call bpin()
- * on the buffer to do this.
+ * This is called to pin the buffer associated with the buf log item in memory
+ * so it cannot be written out.  Simply call bpin() on the buffer to do this.
+ *
+ * We also always take a reference to the buffer log item here so that the bli
+ * is held while the item is pinned in memory. This means that we can
+ * unconditionally drop the reference count a transaction holds when the
+ * transaction is completed.
  */
+
 STATIC void
 xfs_buf_item_pin(
 	xfs_buf_log_item_t	*bip)
@@ -356,6 +375,7 @@ xfs_buf_item_pin(
 	ASSERT(atomic_read(&bip->bli_refcount) > 0);
 	ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
 	       (bip->bli_flags & XFS_BLI_STALE));
+	atomic_inc(&bip->bli_refcount);
 	trace_xfs_buf_item_pin(bip);
 	xfs_bpin(bp);
 }
@@ -372,12 +392,12 @@ xfs_buf_item_pin(
  */
 STATIC void
 xfs_buf_item_unpin(
-	xfs_buf_log_item_t	*bip,
-	int			stale)
+	xfs_buf_log_item_t	*bip)
 {
 	struct xfs_ail	*ailp;
 	xfs_buf_t	*bp;
 	int		freed;
+	int		stale = bip->bli_flags & XFS_BLI_STALE;
 
 	bp = bip->bli_buf;
 	ASSERT(bp != NULL);
@@ -393,7 +413,7 @@ xfs_buf_item_unpin(
 		ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
 		ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
 		ASSERT(XFS_BUF_ISSTALE(bp));
-		ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
+		ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
 		trace_xfs_buf_item_unpin_stale(bip);
 
 		/*
@@ -428,47 +448,43 @@ xfs_buf_item_unpin_remove(
 	xfs_buf_log_item_t	*bip,
 	xfs_trans_t		*tp)
 {
-	xfs_buf_t		*bp;
-	xfs_log_item_desc_t	*lidp;
-	int			stale = 0;
-
-	bp = bip->bli_buf;
-	/*
-	 * will xfs_buf_item_unpin() call xfs_buf_item_relse()?
-	 */
+	/* will xfs_buf_item_unpin() call xfs_buf_item_relse()? */
 	if ((atomic_read(&bip->bli_refcount) == 1) &&
 	    (bip->bli_flags & XFS_BLI_STALE)) {
+		/*
+		 * yes -- We can safely do some work here and then call
+		 * buf_item_unpin to do the rest because we are
+		 * are holding the buffer locked so no one else will be
+		 * able to bump up the refcount. We have to remove the
+		 * log item from the transaction as we are about to release
+		 * our reference to the buffer. If we don't, the unlock that
+		 * occurs later in the xfs_trans_uncommit() will try to
+		 * reference the buffer which we no longer have a hold on.
+		 */
+		struct xfs_log_item_desc *lidp;
+
 		ASSERT(XFS_BUF_VALUSEMA(bip->bli_buf) <= 0);
 		trace_xfs_buf_item_unpin_stale(bip);
 
-		/*
-		 * yes -- clear the xaction descriptor in-use flag
-		 * and free the chunk if required.  We can safely
-		 * do some work here and then call buf_item_unpin
-		 * to do the rest because if the if is true, then
-		 * we are holding the buffer locked so no one else
-		 * will be able to bump up the refcount.
-		 */
-		lidp = xfs_trans_find_item(tp, (xfs_log_item_t *) bip);
-		stale = lidp->lid_flags & XFS_LID_BUF_STALE;
+		lidp = xfs_trans_find_item(tp, (xfs_log_item_t *)bip);
 		xfs_trans_free_item(tp, lidp);
+
 		/*
-		 * Since the transaction no longer refers to the buffer,
-		 * the buffer should no longer refer to the transaction.
+		 * Since the transaction no longer refers to the buffer, the
+		 * buffer should no longer refer to the transaction.
 		 */
-		XFS_BUF_SET_FSPRIVATE2(bp, NULL);
+		XFS_BUF_SET_FSPRIVATE2(bip->bli_buf, NULL);
 	}
-
-	xfs_buf_item_unpin(bip, stale);
-
-	return;
+	xfs_buf_item_unpin(bip);
 }
 
 /*
  * This is called to attempt to lock the buffer associated with this
  * buf log item.  Don't sleep on the buffer lock.  If we can't get
- * the lock right away, return 0.  If we can get the lock, pull the
- * buffer from the free list, mark it busy, and return 1.
+ * the lock right away, return 0.  If we can get the lock, take a
+ * reference to the buffer. If this is a delayed write buffer that
+ * needs AIL help to be written back, invoke the pushbuf routine
+ * rather than the normal success path.
  */
 STATIC uint
 xfs_buf_item_trylock(
@@ -477,42 +493,39 @@ xfs_buf_item_trylock(
 	xfs_buf_t	*bp;
 
 	bp = bip->bli_buf;
-
-	if (XFS_BUF_ISPINNED(bp)) {
+	if (XFS_BUF_ISPINNED(bp))
 		return XFS_ITEM_PINNED;
-	}
-
-	if (!XFS_BUF_CPSEMA(bp)) {
+	if (!XFS_BUF_CPSEMA(bp))
 		return XFS_ITEM_LOCKED;
-	}
 
-	/*
-	 * Remove the buffer from the free list.  Only do this
-	 * if it's on the free list.  Private buffers like the
-	 * superblock buffer are not.
-	 */
+	/* take a reference to the buffer.  */
 	XFS_BUF_HOLD(bp);
 
 	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
 	trace_xfs_buf_item_trylock(bip);
+	if (XFS_BUF_ISDELAYWRITE(bp))
+		return XFS_ITEM_PUSHBUF;
 	return XFS_ITEM_SUCCESS;
 }
 
 /*
- * Release the buffer associated with the buf log item.
- * If there is no dirty logged data associated with the
- * buffer recorded in the buf log item, then free the
- * buf log item and remove the reference to it in the
- * buffer.
+ * Release the buffer associated with the buf log item.  If there is no dirty
+ * logged data associated with the buffer recorded in the buf log item, then
+ * free the buf log item and remove the reference to it in the buffer.
+ *
+ * This call ignores the recursion count.  It is only called when the buffer
+ * should REALLY be unlocked, regardless of the recursion count.
  *
- * This call ignores the recursion count.  It is only called
- * when the buffer should REALLY be unlocked, regardless
- * of the recursion count.
+ * We unconditionally drop the transaction's reference to the log item. If the
+ * item was logged, then another reference was taken when it was pinned, so we
+ * can safely drop the transaction reference now.  This also allows us to avoid
+ * potential races with the unpin code freeing the bli by not referencing the
+ * bli after we've dropped the reference count.
  *
- * If the XFS_BLI_HOLD flag is set in the buf log item, then
- * free the log item if necessary but do not unlock the buffer.
- * This is for support of xfs_trans_bhold(). Make sure the
- * XFS_BLI_HOLD field is cleared if we don't free the item.
+ * If the XFS_BLI_HOLD flag is set in the buf log item, then free the log item
+ * if necessary but do not unlock the buffer.  This is for support of
+ * xfs_trans_bhold(). Make sure the XFS_BLI_HOLD field is cleared if we don't
+ * free the item.
  */
 STATIC void
 xfs_buf_item_unlock(
@@ -524,73 +537,54 @@ xfs_buf_item_unlock(
 
 	bp = bip->bli_buf;
 
-	/*
-	 * Clear the buffer's association with this transaction.
-	 */
+	/* Clear the buffer's association with this transaction. */
 	XFS_BUF_SET_FSPRIVATE2(bp, NULL);
 
 	/*
-	 * If this is a transaction abort, don't return early.
-	 * Instead, allow the brelse to happen.
-	 * Normally it would be done for stale (cancelled) buffers
-	 * at unpin time, but we'll never go through the pin/unpin
-	 * cycle if we abort inside commit.
+	 * If this is a transaction abort, don't return early.  Instead, allow
+	 * the brelse to happen.  Normally it would be done for stale
+	 * (cancelled) buffers at unpin time, but we'll never go through the
+	 * pin/unpin cycle if we abort inside commit.
 	 */
 	aborted = (bip->bli_item.li_flags & XFS_LI_ABORTED) != 0;
 
 	/*
-	 * If the buf item is marked stale, then don't do anything.
-	 * We'll unlock the buffer and free the buf item when the
-	 * buffer is unpinned for the last time.
+	 * Before possibly freeing the buf item, determine if we should
+	 * release the buffer at the end of this routine.
 	 */
-	if (bip->bli_flags & XFS_BLI_STALE) {
-		bip->bli_flags &= ~XFS_BLI_LOGGED;
-		trace_xfs_buf_item_unlock_stale(bip);
-		ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
-		if (!aborted)
-			return;
-	}
+	hold = bip->bli_flags & XFS_BLI_HOLD;
+
+	/* Clear the per transaction state. */
+	bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD);
 
 	/*
-	 * Drop the transaction's reference to the log item if
-	 * it was not logged as part of the transaction.  Otherwise
-	 * we'll drop the reference in xfs_buf_item_unpin() when
-	 * the transaction is really through with the buffer.
+	 * If the buf item is marked stale, then don't do anything.  We'll
+	 * unlock the buffer and free the buf item when the buffer is unpinned
+	 * for the last time.
 	 */
-	if (!(bip->bli_flags & XFS_BLI_LOGGED)) {
-		atomic_dec(&bip->bli_refcount);
-	} else {
-		/*
-		 * Clear the logged flag since this is per
-		 * transaction state.
-		 */
-		bip->bli_flags &= ~XFS_BLI_LOGGED;
+	if (bip->bli_flags & XFS_BLI_STALE) {
+		trace_xfs_buf_item_unlock_stale(bip);
+		ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
+		if (!aborted) {
+			atomic_dec(&bip->bli_refcount);
+			return;
+		}
 	}
 
-	/*
-	 * Before possibly freeing the buf item, determine if we should
-	 * release the buffer at the end of this routine.
-	 */
-	hold = bip->bli_flags & XFS_BLI_HOLD;
 	trace_xfs_buf_item_unlock(bip);
 
 	/*
-	 * If the buf item isn't tracking any data, free it.
-	 * Otherwise, if XFS_BLI_HOLD is set clear it.
+	 * If the buf item isn't tracking any data, free it, otherwise drop the
+	 * reference we hold to it.
 	 */
 	if (xfs_bitmap_empty(bip->bli_format.blf_data_map,
-			     bip->bli_format.blf_map_size)) {
+			     bip->bli_format.blf_map_size))
 		xfs_buf_item_relse(bp);
-	} else if (hold) {
-		bip->bli_flags &= ~XFS_BLI_HOLD;
-	}
+	else
+		atomic_dec(&bip->bli_refcount);
 
-	/*
-	 * Release the buffer if XFS_BLI_HOLD was not set.
-	 */
-	if (!hold) {
+	if (!hold)
 		xfs_buf_relse(bp);
-	}
 }
 
 /*
@@ -626,11 +620,9 @@ xfs_buf_item_committed(
 }
 
 /*
- * This is called to asynchronously write the buffer associated with this
- * buf log item out to disk. The buffer will already have been locked by
- * a successful call to xfs_buf_item_trylock().  If the buffer still has
- * B_DELWRI set, then get it going out to disk with a call to bawrite().
- * If not, then just release the buffer.
+ * The buffer is locked, but is not a delayed write buffer. This happens
+ * if we race with IO completion and hence we don't want to try to write it
+ * again. Just release the buffer.
  */
 STATIC void
 xfs_buf_item_push(
@@ -642,17 +634,29 @@ xfs_buf_item_push(
 	trace_xfs_buf_item_push(bip);
 
 	bp = bip->bli_buf;
+	ASSERT(!XFS_BUF_ISDELAYWRITE(bp));
+	xfs_buf_relse(bp);
+}
 
-	if (XFS_BUF_ISDELAYWRITE(bp)) {
-		int	error;
-		error = xfs_bawrite(bip->bli_item.li_mountp, bp);
-		if (error)
-			xfs_fs_cmn_err(CE_WARN, bip->bli_item.li_mountp,
-			"xfs_buf_item_push: pushbuf error %d on bip %p, bp %p",
-					error, bip, bp);
-	} else {
-		xfs_buf_relse(bp);
-	}
+/*
+ * The buffer is locked and is a delayed write buffer. Promote the buffer
+ * in the delayed write queue as the caller knows that they must invoke
+ * the xfsbufd to get this buffer written. We have to unlock the buffer
+ * to allow the xfsbufd to write it, too.
+ */
+STATIC void
+xfs_buf_item_pushbuf(
+	xfs_buf_log_item_t	*bip)
+{
+	xfs_buf_t	*bp;
+
+	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
+	trace_xfs_buf_item_pushbuf(bip);
+
+	bp = bip->bli_buf;
+	ASSERT(XFS_BUF_ISDELAYWRITE(bp));
+	xfs_buf_delwri_promote(bp);
+	xfs_buf_relse(bp);
 }
 
 /* ARGSUSED */
@@ -669,7 +673,7 @@ static struct xfs_item_ops xfs_buf_item_ops = {
 	.iop_format	= (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
 					xfs_buf_item_format,
 	.iop_pin	= (void(*)(xfs_log_item_t*))xfs_buf_item_pin,
-	.iop_unpin	= (void(*)(xfs_log_item_t*, int))xfs_buf_item_unpin,
+	.iop_unpin	= (void(*)(xfs_log_item_t*))xfs_buf_item_unpin,
 	.iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t *))
 					xfs_buf_item_unpin_remove,
 	.iop_trylock	= (uint(*)(xfs_log_item_t*))xfs_buf_item_trylock,
@@ -677,7 +681,7 @@ static struct xfs_item_ops xfs_buf_item_ops = {
 	.iop_committed	= (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
 					xfs_buf_item_committed,
 	.iop_push	= (void(*)(xfs_log_item_t*))xfs_buf_item_push,
-	.iop_pushbuf	= NULL,
+	.iop_pushbuf	= (void(*)(xfs_log_item_t*))xfs_buf_item_pushbuf,
 	.iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
 					xfs_buf_item_committing
 };
@@ -717,20 +721,17 @@ xfs_buf_item_init(
 	}
 
 	/*
-	 * chunks is the number of XFS_BLI_CHUNK size pieces
+	 * chunks is the number of XFS_BLF_CHUNK size pieces
 	 * the buffer can be divided into. Make sure not to
 	 * truncate any pieces.  map_size is the size of the
 	 * bitmap needed to describe the chunks of the buffer.
 	 */
-	chunks = (int)((XFS_BUF_COUNT(bp) + (XFS_BLI_CHUNK - 1)) >> XFS_BLI_SHIFT);
+	chunks = (int)((XFS_BUF_COUNT(bp) + (XFS_BLF_CHUNK - 1)) >> XFS_BLF_SHIFT);
 	map_size = (int)((chunks + NBWORD) >> BIT_TO_WORD_SHIFT);
 
 	bip = (xfs_buf_log_item_t*)kmem_zone_zalloc(xfs_buf_item_zone,
 						    KM_SLEEP);
-	bip->bli_item.li_type = XFS_LI_BUF;
-	bip->bli_item.li_ops = &xfs_buf_item_ops;
-	bip->bli_item.li_mountp = mp;
-	bip->bli_item.li_ailp = mp->m_ail;
+	xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops);
 	bip->bli_buf = bp;
 	xfs_buf_hold(bp);
 	bip->bli_format.blf_type = XFS_LI_BUF;
@@ -793,8 +794,8 @@ xfs_buf_item_log(
 	/*
 	 * Convert byte offsets to bit numbers.
 	 */
-	first_bit = first >> XFS_BLI_SHIFT;
-	last_bit = last >> XFS_BLI_SHIFT;
+	first_bit = first >> XFS_BLF_SHIFT;
+	last_bit = last >> XFS_BLF_SHIFT;
 
 	/*
 	 * Calculate the total number of bits to be set.
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index 217f34af00cb..f20bb472d582 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -26,7 +26,7 @@ extern kmem_zone_t	*xfs_buf_item_zone;
  * have been logged.
  * For 6.2 and beyond, this is XFS_LI_BUF.  We use this to log everything.
  */
-typedef struct xfs_buf_log_format_t {
+typedef struct xfs_buf_log_format {
 	unsigned short	blf_type;	/* buf log item type indicator */
 	unsigned short	blf_size;	/* size of this item */
 	ushort		blf_flags;	/* misc state */
@@ -41,22 +41,22 @@ typedef struct xfs_buf_log_format_t {
  * This flag indicates that the buffer contains on disk inodes
  * and requires special recovery handling.
  */
-#define	XFS_BLI_INODE_BUF	0x1
+#define	XFS_BLF_INODE_BUF	0x1
 /*
  * This flag indicates that the buffer should not be replayed
  * during recovery because its blocks are being freed.
  */
-#define	XFS_BLI_CANCEL		0x2
+#define	XFS_BLF_CANCEL		0x2
 /*
  * This flag indicates that the buffer contains on disk
  * user or group dquots and may require special recovery handling.
  */
-#define	XFS_BLI_UDQUOT_BUF	0x4
-#define XFS_BLI_PDQUOT_BUF	0x8
-#define	XFS_BLI_GDQUOT_BUF	0x10
+#define	XFS_BLF_UDQUOT_BUF	0x4
+#define XFS_BLF_PDQUOT_BUF	0x8
+#define	XFS_BLF_GDQUOT_BUF	0x10
 
-#define	XFS_BLI_CHUNK		128
-#define	XFS_BLI_SHIFT		7
+#define	XFS_BLF_CHUNK		128
+#define	XFS_BLF_SHIFT		7
 #define	BIT_TO_WORD_SHIFT	5
 #define	NBWORD			(NBBY * sizeof(unsigned int))
 
@@ -69,6 +69,7 @@ typedef struct xfs_buf_log_format_t {
 #define	XFS_BLI_LOGGED		0x08
 #define	XFS_BLI_INODE_ALLOC_BUF	0x10
 #define XFS_BLI_STALE_INODE	0x20
+#define	XFS_BLI_INODE_BUF	0x40
 
 #define XFS_BLI_FLAGS \
 	{ XFS_BLI_HOLD,		"HOLD" }, \
@@ -76,7 +77,8 @@ typedef struct xfs_buf_log_format_t {
 	{ XFS_BLI_STALE,	"STALE" }, \
 	{ XFS_BLI_LOGGED,	"LOGGED" }, \
 	{ XFS_BLI_INODE_ALLOC_BUF, "INODE_ALLOC" }, \
-	{ XFS_BLI_STALE_INODE,	"STALE_INODE" }
+	{ XFS_BLI_STALE_INODE,	"STALE_INODE" }, \
+	{ XFS_BLI_INODE_BUF,	"INODE_BUF" }
 
 
 #ifdef __KERNEL__
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index c0c8869115b1..0ca556b4bf31 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -1534,8 +1534,8 @@ xfs_da_hashname(const __uint8_t *name, int namelen)
 enum xfs_dacmp
 xfs_da_compname(
 	struct xfs_da_args *args,
-	const char 	*name,
-	int 		len)
+	const unsigned char *name,
+	int		len)
 {
 	return (args->namelen == len && memcmp(args->name, name, len) == 0) ?
 					XFS_CMP_EXACT : XFS_CMP_DIFFERENT;
diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h
index 30cd08f56a3a..fe9f5a8c1d2a 100644
--- a/fs/xfs/xfs_da_btree.h
+++ b/fs/xfs/xfs_da_btree.h
@@ -209,7 +209,8 @@ typedef struct xfs_da_state {
  */
 struct xfs_nameops {
 	xfs_dahash_t	(*hashname)(struct xfs_name *);
-	enum xfs_dacmp	(*compname)(struct xfs_da_args *, const char *, int);
+	enum xfs_dacmp	(*compname)(struct xfs_da_args *,
+					const unsigned char *, int);
 };
 
 
@@ -260,7 +261,7 @@ int	xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
 
 uint xfs_da_hashname(const __uint8_t *name_string, int name_length);
 enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args,
-				const char *name, int len);
+				const unsigned char *name, int len);
 
 
 xfs_da_state_t *xfs_da_state_alloc(void);
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index 84ca1cf16a1e..7f159d2a429a 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -45,15 +45,21 @@
 #include "xfs_vnodeops.h"
 #include "xfs_trace.h"
 
+
+static int xfs_swap_extents(
+	xfs_inode_t	*ip,	/* target inode */
+	xfs_inode_t	*tip,	/* tmp inode */
+	xfs_swapext_t	*sxp);
+
 /*
- * Syssgi interface for swapext
+ * ioctl interface for swapext
  */
 int
 xfs_swapext(
 	xfs_swapext_t	*sxp)
 {
 	xfs_inode_t     *ip, *tip;
-	struct file	*file, *target_file;
+	struct file	*file, *tmp_file;
 	int		error = 0;
 
 	/* Pull information for the target fd */
@@ -63,51 +69,54 @@ xfs_swapext(
 		goto out;
 	}
 
-	if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND)) {
+	if (!(file->f_mode & FMODE_WRITE) ||
+	    !(file->f_mode & FMODE_READ) ||
+	    (file->f_flags & O_APPEND)) {
 		error = XFS_ERROR(EBADF);
 		goto out_put_file;
 	}
 
-	target_file = fget((int)sxp->sx_fdtmp);
-	if (!target_file) {
+	tmp_file = fget((int)sxp->sx_fdtmp);
+	if (!tmp_file) {
 		error = XFS_ERROR(EINVAL);
 		goto out_put_file;
 	}
 
-	if (!(target_file->f_mode & FMODE_WRITE) ||
-	    (target_file->f_flags & O_APPEND)) {
+	if (!(tmp_file->f_mode & FMODE_WRITE) ||
+	    !(tmp_file->f_mode & FMODE_READ) ||
+	    (tmp_file->f_flags & O_APPEND)) {
 		error = XFS_ERROR(EBADF);
-		goto out_put_target_file;
+		goto out_put_tmp_file;
 	}
 
 	if (IS_SWAPFILE(file->f_path.dentry->d_inode) ||
-	    IS_SWAPFILE(target_file->f_path.dentry->d_inode)) {
+	    IS_SWAPFILE(tmp_file->f_path.dentry->d_inode)) {
 		error = XFS_ERROR(EINVAL);
-		goto out_put_target_file;
+		goto out_put_tmp_file;
 	}
 
 	ip = XFS_I(file->f_path.dentry->d_inode);
-	tip = XFS_I(target_file->f_path.dentry->d_inode);
+	tip = XFS_I(tmp_file->f_path.dentry->d_inode);
 
 	if (ip->i_mount != tip->i_mount) {
 		error = XFS_ERROR(EINVAL);
-		goto out_put_target_file;
+		goto out_put_tmp_file;
 	}
 
 	if (ip->i_ino == tip->i_ino) {
 		error = XFS_ERROR(EINVAL);
-		goto out_put_target_file;
+		goto out_put_tmp_file;
 	}
 
 	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
 		error = XFS_ERROR(EIO);
-		goto out_put_target_file;
+		goto out_put_tmp_file;
 	}
 
 	error = xfs_swap_extents(ip, tip, sxp);
 
- out_put_target_file:
-	fput(target_file);
+ out_put_tmp_file:
+	fput(tmp_file);
  out_put_file:
 	fput(file);
  out:
@@ -171,22 +180,32 @@ xfs_swap_extents_check_format(
 	    XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > tip->i_df.if_ext_max)
 		return EINVAL;
 
-	/* Check root block of temp in btree form to max in target */
+	/*
+	 * If we are in a btree format, check that the temp root block will fit
+	 * in the target and that it has enough extents to be in btree format
+	 * in the target.
+	 *
+	 * Note that we have to be careful to allow btree->extent conversions
+	 * (a common defrag case) which will occur when the temp inode is in
+	 * extent format...
+	 */
 	if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
-	    XFS_IFORK_BOFF(ip) &&
-	    tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip))
+	    ((XFS_IFORK_BOFF(ip) &&
+	      tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip)) ||
+	     XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <= ip->i_df.if_ext_max))
 		return EINVAL;
 
-	/* Check root block of target in btree form to max in temp */
+	/* Reciprocal target->temp btree format checks */
 	if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
-	    XFS_IFORK_BOFF(tip) &&
-	    ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip))
+	    ((XFS_IFORK_BOFF(tip) &&
+	      ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip)) ||
+	     XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <= tip->i_df.if_ext_max))
 		return EINVAL;
 
 	return 0;
 }
 
-int
+static int
 xfs_swap_extents(
 	xfs_inode_t	*ip,	/* target inode */
 	xfs_inode_t	*tip,	/* tmp inode */
@@ -254,6 +273,9 @@ xfs_swap_extents(
 		goto out_unlock;
 	}
 
+	trace_xfs_swap_extent_before(ip, 0);
+	trace_xfs_swap_extent_before(tip, 1);
+
 	/* check inode formats now that data is flushed */
 	error = xfs_swap_extents_check_format(ip, tip);
 	if (error) {
@@ -421,6 +443,8 @@ xfs_swap_extents(
 
 	error = xfs_trans_commit(tp, XFS_TRANS_SWAPEXT);
 
+	trace_xfs_swap_extent_after(ip, 0);
+	trace_xfs_swap_extent_after(tip, 1);
 out:
 	kmem_free(tempifp);
 	return error;
diff --git a/fs/xfs/xfs_dfrag.h b/fs/xfs/xfs_dfrag.h
index 4f55a6306558..20bdd935c121 100644
--- a/fs/xfs/xfs_dfrag.h
+++ b/fs/xfs/xfs_dfrag.h
@@ -48,9 +48,6 @@ typedef struct xfs_swapext
  */
 int	xfs_swapext(struct xfs_swapext *sx);
 
-int	xfs_swap_extents(struct xfs_inode *ip, struct xfs_inode *tip,
-		struct xfs_swapext *sxp);
-
 #endif	/* __KERNEL__ */
 
 #endif	/* __XFS_DFRAG_H__ */
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index 93634a7e90e9..42520f041265 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -44,7 +44,7 @@
 #include "xfs_vnodeops.h"
 #include "xfs_trace.h"
 
-struct xfs_name xfs_name_dotdot = {"..", 2};
+struct xfs_name xfs_name_dotdot = { (unsigned char *)"..", 2};
 
 /*
  * ASCII case-insensitive (ie. A-Z) support for directories that was
@@ -66,8 +66,8 @@ xfs_ascii_ci_hashname(
 STATIC enum xfs_dacmp
 xfs_ascii_ci_compname(
 	struct xfs_da_args *args,
-	const char	*name,
-	int 		len)
+	const unsigned char *name,
+	int		len)
 {
 	enum xfs_dacmp	result;
 	int		i;
@@ -247,7 +247,7 @@ xfs_dir_createname(
 int
 xfs_dir_cilookup_result(
 	struct xfs_da_args *args,
-	const char	*name,
+	const unsigned char *name,
 	int		len)
 {
 	if (args->cmpresult == XFS_CMP_DIFFERENT)
diff --git a/fs/xfs/xfs_dir2.h b/fs/xfs/xfs_dir2.h
index 1d9ef96f33aa..74a3b1057685 100644
--- a/fs/xfs/xfs_dir2.h
+++ b/fs/xfs/xfs_dir2.h
@@ -100,7 +100,7 @@ extern int xfs_dir2_isleaf(struct xfs_trans *tp, struct xfs_inode *dp,
 extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db,
 				struct xfs_dabuf *bp);
 
-extern int xfs_dir_cilookup_result(struct xfs_da_args *args, const char *name,
-				int len);
+extern int xfs_dir_cilookup_result(struct xfs_da_args *args,
+				const unsigned char *name, int len);
 
 #endif	/* __XFS_DIR2_H__ */
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index ddc4ecc7807f..779a267b0a84 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -57,8 +57,8 @@ static xfs_dahash_t xfs_dir_hash_dot, xfs_dir_hash_dotdot;
 void
 xfs_dir_startup(void)
 {
-	xfs_dir_hash_dot = xfs_da_hashname(".", 1);
-	xfs_dir_hash_dotdot = xfs_da_hashname("..", 2);
+	xfs_dir_hash_dot = xfs_da_hashname((unsigned char *)".", 1);
+	xfs_dir_hash_dotdot = xfs_da_hashname((unsigned char *)"..", 2);
 }
 
 /*
@@ -513,8 +513,9 @@ xfs_dir2_block_getdents(
 		/*
 		 * If it didn't fit, set the final offset to here & return.
 		 */
-		if (filldir(dirent, dep->name, dep->namelen, cook & 0x7fffffff,
-			    be64_to_cpu(dep->inumber), DT_UNKNOWN)) {
+		if (filldir(dirent, (char *)dep->name, dep->namelen,
+			    cook & 0x7fffffff, be64_to_cpu(dep->inumber),
+			    DT_UNKNOWN)) {
 			*offset = cook & 0x7fffffff;
 			xfs_da_brelse(NULL, bp);
 			return 0;
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index 29f484c11b3a..e2d89854ec9e 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -1081,7 +1081,7 @@ xfs_dir2_leaf_getdents(
 		dep = (xfs_dir2_data_entry_t *)ptr;
 		length = xfs_dir2_data_entsize(dep->namelen);
 
-		if (filldir(dirent, dep->name, dep->namelen,
+		if (filldir(dirent, (char *)dep->name, dep->namelen,
 			    xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff,
 			    be64_to_cpu(dep->inumber), DT_UNKNOWN))
 			break;
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index ce6e355199b5..78fc4d9ae756 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -65,7 +65,7 @@ static int xfs_dir2_node_addname_int(xfs_da_args_t *args,
 /*
  * Log entries from a freespace block.
  */
-void
+STATIC void
 xfs_dir2_free_log_bests(
 	xfs_trans_t		*tp,		/* transaction pointer */
 	xfs_dabuf_t		*bp,		/* freespace buffer */
diff --git a/fs/xfs/xfs_dir2_node.h b/fs/xfs/xfs_dir2_node.h
index dde72db3d695..82dfe7147195 100644
--- a/fs/xfs/xfs_dir2_node.h
+++ b/fs/xfs/xfs_dir2_node.h
@@ -75,8 +75,6 @@ xfs_dir2_db_to_fdindex(struct xfs_mount *mp, xfs_dir2_db_t db)
 	return ((db) % XFS_DIR2_MAX_FREE_BESTS(mp));
 }
 
-extern void xfs_dir2_free_log_bests(struct xfs_trans *tp, struct xfs_dabuf *bp,
-				    int first, int last);
 extern int xfs_dir2_leaf_to_node(struct xfs_da_args *args,
 				 struct xfs_dabuf *lbp);
 extern xfs_dahash_t xfs_dir2_leafn_lasthash(struct xfs_dabuf *bp, int *count);
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c
index 9d4f17a69676..c1a5945d463a 100644
--- a/fs/xfs/xfs_dir2_sf.c
+++ b/fs/xfs/xfs_dir2_sf.c
@@ -782,7 +782,7 @@ xfs_dir2_sf_getdents(
 		}
 
 		ino = xfs_dir2_sf_get_inumber(sfp, xfs_dir2_sf_inumberp(sfep));
-		if (filldir(dirent, sfep->name, sfep->namelen,
+		if (filldir(dirent, (char *)sfep->name, sfep->namelen,
 			    off & 0x7fffffff, ino, DT_UNKNOWN)) {
 			*offset = off & 0x7fffffff;
 			return 0;
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 92d5cd5bf4f2..047b8a8e5c29 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -170,7 +170,7 @@ xfs_cmn_err(int panic_tag, int level, xfs_mount_t *mp, char *fmt, ...)
 	va_list ap;
 
 #ifdef DEBUG
-	xfs_panic_mask |= XFS_PTAG_SHUTDOWN_CORRUPT;
+	xfs_panic_mask |= (XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_LOGRES);
 #endif
 
 	if (xfs_panic_mask && (xfs_panic_mask & panic_tag)
@@ -186,18 +186,18 @@ xfs_cmn_err(int panic_tag, int level, xfs_mount_t *mp, char *fmt, ...)
 
 void
 xfs_error_report(
-	char		*tag,
-	int		level,
-	xfs_mount_t	*mp,
-	char		*fname,
-	int		linenum,
-	inst_t		*ra)
+	const char		*tag,
+	int			level,
+	struct xfs_mount	*mp,
+	const char		*filename,
+	int			linenum,
+	inst_t			*ra)
 {
 	if (level <= xfs_error_level) {
 		xfs_cmn_err(XFS_PTAG_ERROR_REPORT,
 			    CE_ALERT, mp,
 		"XFS internal error %s at line %d of file %s.  Caller 0x%p\n",
-			    tag, linenum, fname, ra);
+			    tag, linenum, filename, ra);
 
 		xfs_stack_trace();
 	}
@@ -205,15 +205,15 @@ xfs_error_report(
 
 void
 xfs_corruption_error(
-	char		*tag,
-	int		level,
-	xfs_mount_t	*mp,
-	void		*p,
-	char		*fname,
-	int		linenum,
-	inst_t		*ra)
+	const char		*tag,
+	int			level,
+	struct xfs_mount	*mp,
+	void			*p,
+	const char		*filename,
+	int			linenum,
+	inst_t			*ra)
 {
 	if (level <= xfs_error_level)
 		xfs_hex_dump(p, 16);
-	xfs_error_report(tag, level, mp, fname, linenum, ra);
+	xfs_error_report(tag, level, mp, filename, linenum, ra);
 }
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 0c93051c4651..c2c1a072bb82 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -29,10 +29,11 @@ extern int	xfs_error_trap(int);
 
 struct xfs_mount;
 
-extern void xfs_error_report(char *tag, int level, struct xfs_mount *mp,
-				char *fname, int linenum, inst_t *ra);
-extern void xfs_corruption_error(char *tag, int level, struct xfs_mount *mp,
-				void *p, char *fname, int linenum, inst_t *ra);
+extern void xfs_error_report(const char *tag, int level, struct xfs_mount *mp,
+			const char *filename, int linenum, inst_t *ra);
+extern void xfs_corruption_error(const char *tag, int level,
+			struct xfs_mount *mp, void *p, const char *filename,
+			int linenum, inst_t *ra);
 
 #define	XFS_ERROR_REPORT(e, lvl, mp)	\
 	xfs_error_report(e, lvl, mp, __FILE__, __LINE__, __return_address)
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 05a4bdd4be39..409fe81585fd 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -82,7 +82,7 @@ xfs_efi_item_format(xfs_efi_log_item_t	*efip,
 
 	log_vector->i_addr = (xfs_caddr_t)&(efip->efi_format);
 	log_vector->i_len = size;
-	XLOG_VEC_SET_TYPE(log_vector, XLOG_REG_TYPE_EFI_FORMAT);
+	log_vector->i_type = XLOG_REG_TYPE_EFI_FORMAT;
 	ASSERT(size >= sizeof(xfs_efi_log_format_t));
 }
 
@@ -106,7 +106,7 @@ xfs_efi_item_pin(xfs_efi_log_item_t *efip)
  */
 /*ARGSUSED*/
 STATIC void
-xfs_efi_item_unpin(xfs_efi_log_item_t *efip, int stale)
+xfs_efi_item_unpin(xfs_efi_log_item_t *efip)
 {
 	struct xfs_ail		*ailp = efip->efi_item.li_ailp;
 
@@ -224,7 +224,7 @@ static struct xfs_item_ops xfs_efi_item_ops = {
 	.iop_format	= (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
 					xfs_efi_item_format,
 	.iop_pin	= (void(*)(xfs_log_item_t*))xfs_efi_item_pin,
-	.iop_unpin	= (void(*)(xfs_log_item_t*, int))xfs_efi_item_unpin,
+	.iop_unpin	= (void(*)(xfs_log_item_t*))xfs_efi_item_unpin,
 	.iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t *))
 					xfs_efi_item_unpin_remove,
 	.iop_trylock	= (uint(*)(xfs_log_item_t*))xfs_efi_item_trylock,
@@ -259,10 +259,7 @@ xfs_efi_init(xfs_mount_t	*mp,
 							     KM_SLEEP);
 	}
 
-	efip->efi_item.li_type = XFS_LI_EFI;
-	efip->efi_item.li_ops = &xfs_efi_item_ops;
-	efip->efi_item.li_mountp = mp;
-	efip->efi_item.li_ailp = mp->m_ail;
+	xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops);
 	efip->efi_format.efi_nextents = nextents;
 	efip->efi_format.efi_id = (__psint_t)(void*)efip;
 
@@ -406,7 +403,7 @@ xfs_efd_item_format(xfs_efd_log_item_t	*efdp,
 
 	log_vector->i_addr = (xfs_caddr_t)&(efdp->efd_format);
 	log_vector->i_len = size;
-	XLOG_VEC_SET_TYPE(log_vector, XLOG_REG_TYPE_EFD_FORMAT);
+	log_vector->i_type = XLOG_REG_TYPE_EFD_FORMAT;
 	ASSERT(size >= sizeof(xfs_efd_log_format_t));
 }
 
@@ -428,7 +425,7 @@ xfs_efd_item_pin(xfs_efd_log_item_t *efdp)
  */
 /*ARGSUSED*/
 STATIC void
-xfs_efd_item_unpin(xfs_efd_log_item_t *efdp, int stale)
+xfs_efd_item_unpin(xfs_efd_log_item_t *efdp)
 {
 	return;
 }
@@ -518,7 +515,7 @@ static struct xfs_item_ops xfs_efd_item_ops = {
 	.iop_format	= (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
 					xfs_efd_item_format,
 	.iop_pin	= (void(*)(xfs_log_item_t*))xfs_efd_item_pin,
-	.iop_unpin	= (void(*)(xfs_log_item_t*, int))xfs_efd_item_unpin,
+	.iop_unpin	= (void(*)(xfs_log_item_t*))xfs_efd_item_unpin,
 	.iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*))
 					xfs_efd_item_unpin_remove,
 	.iop_trylock	= (uint(*)(xfs_log_item_t*))xfs_efd_item_trylock,
@@ -554,10 +551,7 @@ xfs_efd_init(xfs_mount_t	*mp,
 							     KM_SLEEP);
 	}
 
-	efdp->efd_item.li_type = XFS_LI_EFD;
-	efdp->efd_item.li_ops = &xfs_efd_item_ops;
-	efdp->efd_item.li_mountp = mp;
-	efdp->efd_item.li_ailp = mp->m_ail;
+	xfs_log_item_init(mp, &efdp->efd_item, XFS_LI_EFD, &xfs_efd_item_ops);
 	efdp->efd_efip = efip;
 	efdp->efd_format.efd_nextents = nextents;
 	efdp->efd_format.efd_efi_id = efip->efi_format.efi_id;
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index a631e1451abb..390850ee6603 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -140,6 +140,7 @@ _xfs_filestream_pick_ag(
 	int		flags,
 	xfs_extlen_t	minlen)
 {
+	int		streams, max_streams;
 	int		err, trylock, nscan;
 	xfs_extlen_t	longest, free, minfree, maxfree = 0;
 	xfs_agnumber_t	ag, max_ag = NULLAGNUMBER;
@@ -155,15 +156,15 @@ _xfs_filestream_pick_ag(
 	trylock = XFS_ALLOC_FLAG_TRYLOCK;
 
 	for (nscan = 0; 1; nscan++) {
-
-		TRACE_AG_SCAN(mp, ag, xfs_filestream_peek_ag(mp, ag));
-
-		pag = mp->m_perag + ag;
+		pag = xfs_perag_get(mp, ag);
+		TRACE_AG_SCAN(mp, ag, atomic_read(&pag->pagf_fstrms));
 
 		if (!pag->pagf_init) {
 			err = xfs_alloc_pagf_init(mp, NULL, ag, trylock);
-			if (err && !trylock)
+			if (err && !trylock) {
+				xfs_perag_put(pag);
 				return err;
+			}
 		}
 
 		/* Might fail sometimes during the 1st pass with trylock set. */
@@ -173,6 +174,7 @@ _xfs_filestream_pick_ag(
 		/* Keep track of the AG with the most free blocks. */
 		if (pag->pagf_freeblks > maxfree) {
 			maxfree = pag->pagf_freeblks;
+			max_streams = atomic_read(&pag->pagf_fstrms);
 			max_ag = ag;
 		}
 
@@ -195,6 +197,8 @@ _xfs_filestream_pick_ag(
 
 			/* Break out, retaining the reference on the AG. */
 			free = pag->pagf_freeblks;
+			streams = atomic_read(&pag->pagf_fstrms);
+			xfs_perag_put(pag);
 			*agp = ag;
 			break;
 		}
@@ -202,6 +206,7 @@ _xfs_filestream_pick_ag(
 		/* Drop the reference on this AG, it's not usable. */
 		xfs_filestream_put_ag(mp, ag);
 next_ag:
+		xfs_perag_put(pag);
 		/* Move to the next AG, wrapping to AG 0 if necessary. */
 		if (++ag >= mp->m_sb.sb_agcount)
 			ag = 0;
@@ -229,6 +234,7 @@ next_ag:
 		if (max_ag != NULLAGNUMBER) {
 			xfs_filestream_get_ag(mp, max_ag);
 			TRACE_AG_PICK1(mp, max_ag, maxfree);
+			streams = max_streams;
 			free = maxfree;
 			*agp = max_ag;
 			break;
@@ -240,16 +246,14 @@ next_ag:
 		return 0;
 	}
 
-	TRACE_AG_PICK2(mp, startag, *agp, xfs_filestream_peek_ag(mp, *agp),
-			free, nscan, flags);
+	TRACE_AG_PICK2(mp, startag, *agp, streams, free, nscan, flags);
 
 	return 0;
 }
 
 /*
  * Set the allocation group number for a file or a directory, updating inode
- * references and per-AG references as appropriate.  Must be called with the
- * m_peraglock held in read mode.
+ * references and per-AG references as appropriate.
  */
 static int
 _xfs_filestream_update_ag(
@@ -451,20 +455,6 @@ xfs_filestream_unmount(
 }
 
 /*
- * If the mount point's m_perag array is going to be reallocated, all
- * outstanding cache entries must be flushed to avoid accessing reference count
- * addresses that have been freed.  The call to xfs_filestream_flush() must be
- * made inside the block that holds the m_peraglock in write mode to do the
- * reallocation.
- */
-void
-xfs_filestream_flush(
-	xfs_mount_t	*mp)
-{
-	xfs_mru_cache_flush(mp->m_filestream);
-}
-
-/*
  * Return the AG of the filestream the file or directory belongs to, or
  * NULLAGNUMBER otherwise.
  */
@@ -526,7 +516,6 @@ xfs_filestream_associate(
 
 	mp = pip->i_mount;
 	cache = mp->m_filestream;
-	down_read(&mp->m_peraglock);
 
 	/*
 	 * We have a problem, Houston.
@@ -543,10 +532,8 @@ xfs_filestream_associate(
 	 *
 	 * So, if we can't get the iolock without sleeping then just give up
 	 */
-	if (!xfs_ilock_nowait(pip, XFS_IOLOCK_EXCL)) {
-		up_read(&mp->m_peraglock);
+	if (!xfs_ilock_nowait(pip, XFS_IOLOCK_EXCL))
 		return 1;
-	}
 
 	/* If the parent directory is already in the cache, use its AG. */
 	item = xfs_mru_cache_lookup(cache, pip->i_ino);
@@ -601,7 +588,6 @@ exit_did_pick:
 
 exit:
 	xfs_iunlock(pip, XFS_IOLOCK_EXCL);
-	up_read(&mp->m_peraglock);
 	return -err;
 }
 
diff --git a/fs/xfs/xfs_filestream.h b/fs/xfs/xfs_filestream.h
index 4aba67c5f64f..260f757bbc5d 100644
--- a/fs/xfs/xfs_filestream.h
+++ b/fs/xfs/xfs_filestream.h
@@ -79,12 +79,21 @@ extern ktrace_t *xfs_filestreams_trace_buf;
  * the cache that reference per-ag array elements that have since been
  * reallocated.
  */
+/*
+ * xfs_filestream_peek_ag is only used in tracing code
+ */
 static inline int
 xfs_filestream_peek_ag(
 	xfs_mount_t	*mp,
 	xfs_agnumber_t	agno)
 {
-	return atomic_read(&mp->m_perag[agno].pagf_fstrms);
+	struct xfs_perag *pag;
+	int		ret;
+
+	pag = xfs_perag_get(mp, agno);
+	ret = atomic_read(&pag->pagf_fstrms);
+	xfs_perag_put(pag);
+	return ret;
 }
 
 static inline int
@@ -92,7 +101,13 @@ xfs_filestream_get_ag(
 	xfs_mount_t	*mp,
 	xfs_agnumber_t	agno)
 {
-	return atomic_inc_return(&mp->m_perag[agno].pagf_fstrms);
+	struct xfs_perag *pag;
+	int		ret;
+
+	pag = xfs_perag_get(mp, agno);
+	ret = atomic_inc_return(&pag->pagf_fstrms);
+	xfs_perag_put(pag);
+	return ret;
 }
 
 static inline int
@@ -100,7 +115,13 @@ xfs_filestream_put_ag(
 	xfs_mount_t	*mp,
 	xfs_agnumber_t	agno)
 {
-	return atomic_dec_return(&mp->m_perag[agno].pagf_fstrms);
+	struct xfs_perag *pag;
+	int		ret;
+
+	pag = xfs_perag_get(mp, agno);
+	ret = atomic_dec_return(&pag->pagf_fstrms);
+	xfs_perag_put(pag);
+	return ret;
 }
 
 /* allocation selection flags */
@@ -114,7 +135,6 @@ int xfs_filestream_init(void);
 void xfs_filestream_uninit(void);
 int xfs_filestream_mount(struct xfs_mount *mp);
 void xfs_filestream_unmount(struct xfs_mount *mp);
-void xfs_filestream_flush(struct xfs_mount *mp);
 xfs_agnumber_t xfs_filestream_lookup_ag(struct xfs_inode *ip);
 int xfs_filestream_associate(struct xfs_inode *dip, struct xfs_inode *ip);
 void xfs_filestream_deassociate(struct xfs_inode *ip);
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index f52ac276277e..7cf7220e7d5f 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -292,7 +292,8 @@ typedef struct xfs_bstat {
 	__s32		bs_extents;	/* number of extents		*/
 	__u32		bs_gen;		/* generation count		*/
 	__u16		bs_projid;	/* project id			*/
-	unsigned char	bs_pad[14];	/* pad space, unused		*/
+	__u16		bs_forkoff;	/* inode fork offset in bytes	*/
+	unsigned char	bs_pad[12];	/* pad space, unused		*/
 	__u32		bs_dmevmask;	/* DMIG event mask		*/
 	__u16		bs_dmstate;	/* DMIG state info		*/
 	__u16		bs_aextents;	/* attribute number of extents	*/
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index a13919a6a364..37a6f62c57b6 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -167,27 +167,14 @@ xfs_growfs_data_private(
 	}
 	new = nb - mp->m_sb.sb_dblocks;
 	oagcount = mp->m_sb.sb_agcount;
-	if (nagcount > oagcount) {
-		void *new_perag, *old_perag;
-
-		xfs_filestream_flush(mp);
-
-		new_perag = kmem_zalloc(sizeof(xfs_perag_t) * nagcount,
-					KM_MAYFAIL);
-		if (!new_perag)
-			return XFS_ERROR(ENOMEM);
-
-		down_write(&mp->m_peraglock);
-		memcpy(new_perag, mp->m_perag, sizeof(xfs_perag_t) * oagcount);
-		old_perag = mp->m_perag;
-		mp->m_perag = new_perag;
-
-		mp->m_flags |= XFS_MOUNT_32BITINODES;
-		nagimax = xfs_initialize_perag(mp, nagcount);
-		up_write(&mp->m_peraglock);
 
-		kmem_free(old_perag);
+	/* allocate the new per-ag structures */
+	if (nagcount > oagcount) {
+		error = xfs_initialize_perag(mp, nagcount, &nagimax);
+		if (error)
+			return error;
 	}
+
 	tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFS);
 	tp->t_flags |= XFS_TRANS_RESERVE;
 	if ((error = xfs_trans_reserve(tp, XFS_GROWFS_SPACE_RES(mp),
@@ -196,6 +183,11 @@ xfs_growfs_data_private(
 		return error;
 	}
 
+	/*
+	 * Write new AG headers to disk. Non-transactional, but written
+	 * synchronously so they are completed prior to the growfs transaction
+	 * being logged.
+	 */
 	nfree = 0;
 	for (agno = nagcount - 1; agno >= oagcount; agno--, new -= agsize) {
 		/*
@@ -359,6 +351,12 @@ xfs_growfs_data_private(
 			goto error0;
 		}
 	}
+
+	/*
+	 * Update changed superblock fields transactionally. These are not
+	 * seen by the rest of the world until the transaction commit applies
+	 * them atomically to the superblock.
+	 */
 	if (nagcount > oagcount)
 		xfs_trans_mod_sb(tp, XFS_TRANS_SB_AGCOUNT, nagcount - oagcount);
 	if (nb > mp->m_sb.sb_dblocks)
@@ -369,9 +367,9 @@ xfs_growfs_data_private(
 	if (dpct)
 		xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct);
 	error = xfs_trans_commit(tp, 0);
-	if (error) {
+	if (error)
 		return error;
-	}
+
 	/* New allocation groups fully initialized, so update mount struct */
 	if (nagimax)
 		mp->m_maxagi = nagimax;
@@ -381,6 +379,8 @@ xfs_growfs_data_private(
 		mp->m_maxicount = icount << mp->m_sb.sb_inopblog;
 	} else
 		mp->m_maxicount = 0;
+
+	/* update secondary superblocks. */
 	for (agno = 1; agno < nagcount; agno++) {
 		error = xfs_read_buf(mp, mp->m_ddev_targp,
 				  XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index cb907ba69c4c..c7142a064c48 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -205,7 +205,7 @@ xfs_ialloc_inode_init(
 		d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster));
 		fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
 					 mp->m_bsize * blks_per_cluster,
-					 XFS_BUF_LOCK);
+					 XBF_LOCK);
 		ASSERT(fbuf);
 		ASSERT(!XFS_BUF_GETERROR(fbuf));
 
@@ -253,6 +253,7 @@ xfs_ialloc_ag_alloc(
 	xfs_agino_t	thisino;	/* current inode number, for loop */
 	int		isaligned = 0;	/* inode allocation at stripe unit */
 					/* boundary */
+	struct xfs_perag *pag;
 
 	args.tp = tp;
 	args.mp = tp->t_mountp;
@@ -382,9 +383,9 @@ xfs_ialloc_ag_alloc(
 	newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0);
 	be32_add_cpu(&agi->agi_count, newlen);
 	be32_add_cpu(&agi->agi_freecount, newlen);
-	down_read(&args.mp->m_peraglock);
-	args.mp->m_perag[agno].pagi_freecount += newlen;
-	up_read(&args.mp->m_peraglock);
+	pag = xfs_perag_get(args.mp, agno);
+	pag->pagi_freecount += newlen;
+	xfs_perag_put(pag);
 	agi->agi_newino = cpu_to_be32(newino);
 
 	/*
@@ -486,9 +487,8 @@ xfs_ialloc_ag_select(
 	 */
 	agno = pagno;
 	flags = XFS_ALLOC_FLAG_TRYLOCK;
-	down_read(&mp->m_peraglock);
 	for (;;) {
-		pag = &mp->m_perag[agno];
+		pag = xfs_perag_get(mp, agno);
 		if (!pag->pagi_init) {
 			if (xfs_ialloc_read_agi(mp, tp, agno, &agbp)) {
 				agbp = NULL;
@@ -527,7 +527,7 @@ xfs_ialloc_ag_select(
 					agbp = NULL;
 					goto nextag;
 				}
-				up_read(&mp->m_peraglock);
+				xfs_perag_put(pag);
 				return agbp;
 			}
 		}
@@ -535,22 +535,19 @@ unlock_nextag:
 		if (agbp)
 			xfs_trans_brelse(tp, agbp);
 nextag:
+		xfs_perag_put(pag);
 		/*
 		 * No point in iterating over the rest, if we're shutting
 		 * down.
 		 */
-		if (XFS_FORCED_SHUTDOWN(mp)) {
-			up_read(&mp->m_peraglock);
+		if (XFS_FORCED_SHUTDOWN(mp))
 			return NULL;
-		}
 		agno++;
 		if (agno >= agcount)
 			agno = 0;
 		if (agno == pagno) {
-			if (flags == 0) {
-				up_read(&mp->m_peraglock);
+			if (flags == 0)
 				return NULL;
-			}
 			flags = 0;
 		}
 	}
@@ -672,6 +669,7 @@ xfs_dialloc(
 	xfs_agnumber_t	tagno;		/* testing allocation group number */
 	xfs_btree_cur_t	*tcur;		/* temp cursor */
 	xfs_inobt_rec_incore_t trec;	/* temp inode allocation record */
+	struct xfs_perag *pag;
 
 
 	if (*IO_agbp == NULL) {
@@ -771,13 +769,13 @@ nextag:
 			*inop = NULLFSINO;
 			return noroom ? ENOSPC : 0;
 		}
-		down_read(&mp->m_peraglock);
-		if (mp->m_perag[tagno].pagi_inodeok == 0) {
-			up_read(&mp->m_peraglock);
+		pag = xfs_perag_get(mp, tagno);
+		if (pag->pagi_inodeok == 0) {
+			xfs_perag_put(pag);
 			goto nextag;
 		}
 		error = xfs_ialloc_read_agi(mp, tp, tagno, &agbp);
-		up_read(&mp->m_peraglock);
+		xfs_perag_put(pag);
 		if (error)
 			goto nextag;
 		agi = XFS_BUF_TO_AGI(agbp);
@@ -790,6 +788,7 @@ nextag:
 	 */
 	agno = tagno;
 	*IO_agbp = NULL;
+	pag = xfs_perag_get(mp, agno);
 
  restart_pagno:
 	cur = xfs_inobt_init_cursor(mp, tp, agbp, be32_to_cpu(agi->agi_seqno));
@@ -808,7 +807,6 @@ nextag:
 	 * If in the same AG as the parent, try to get near the parent.
 	 */
 	if (pagno == agno) {
-		xfs_perag_t	*pag = &mp->m_perag[agno];
 		int		doneleft;	/* done, to the left */
 		int		doneright;	/* done, to the right */
 		int		searchdistance = 10;
@@ -1006,9 +1004,7 @@ alloc_inode:
 		goto error0;
 	be32_add_cpu(&agi->agi_freecount, -1);
 	xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
-	down_read(&mp->m_peraglock);
-	mp->m_perag[tagno].pagi_freecount--;
-	up_read(&mp->m_peraglock);
+	pag->pagi_freecount--;
 
 	error = xfs_check_agi_freecount(cur, agi);
 	if (error)
@@ -1016,12 +1012,14 @@ alloc_inode:
 
 	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
 	xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1);
+	xfs_perag_put(pag);
 	*inop = ino;
 	return 0;
 error1:
 	xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
 error0:
 	xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+	xfs_perag_put(pag);
 	return error;
 }
 
@@ -1052,6 +1050,7 @@ xfs_difree(
 	xfs_mount_t	*mp;	/* mount structure for filesystem */
 	int		off;	/* offset of inode in inode chunk */
 	xfs_inobt_rec_incore_t rec;	/* btree record */
+	struct xfs_perag *pag;
 
 	mp = tp->t_mountp;
 
@@ -1088,9 +1087,7 @@ xfs_difree(
 	/*
 	 * Get the allocation group header.
 	 */
-	down_read(&mp->m_peraglock);
 	error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
-	up_read(&mp->m_peraglock);
 	if (error) {
 		cmn_err(CE_WARN,
 			"xfs_difree: xfs_ialloc_read_agi() returned an error %d on %s.  Returning error.",
@@ -1157,9 +1154,9 @@ xfs_difree(
 		be32_add_cpu(&agi->agi_count, -ilen);
 		be32_add_cpu(&agi->agi_freecount, -(ilen - 1));
 		xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT);
-		down_read(&mp->m_peraglock);
-		mp->m_perag[agno].pagi_freecount -= ilen - 1;
-		up_read(&mp->m_peraglock);
+		pag = xfs_perag_get(mp, agno);
+		pag->pagi_freecount -= ilen - 1;
+		xfs_perag_put(pag);
 		xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, -ilen);
 		xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1));
 
@@ -1188,9 +1185,9 @@ xfs_difree(
 		 */
 		be32_add_cpu(&agi->agi_freecount, 1);
 		xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
-		down_read(&mp->m_peraglock);
-		mp->m_perag[agno].pagi_freecount++;
-		up_read(&mp->m_peraglock);
+		pag = xfs_perag_get(mp, agno);
+		pag->pagi_freecount++;
+		xfs_perag_put(pag);
 		xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1);
 	}
 
@@ -1206,6 +1203,63 @@ error0:
 	return error;
 }
 
+STATIC int
+xfs_imap_lookup(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	xfs_agnumber_t		agno,
+	xfs_agino_t		agino,
+	xfs_agblock_t		agbno,
+	xfs_agblock_t		*chunk_agbno,
+	xfs_agblock_t		*offset_agbno,
+	int			flags)
+{
+	struct xfs_inobt_rec_incore rec;
+	struct xfs_btree_cur	*cur;
+	struct xfs_buf		*agbp;
+	xfs_agino_t		startino;
+	int			error;
+	int			i;
+
+	error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
+	if (error) {
+		xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
+				"xfs_ialloc_read_agi() returned "
+				"error %d, agno %d",
+				error, agno);
+		return error;
+	}
+
+	/*
+	 * derive and lookup the exact inode record for the given agino. If the
+	 * record cannot be found, then it's an invalid inode number and we
+	 * should abort.
+	 */
+	cur = xfs_inobt_init_cursor(mp, tp, agbp, agno);
+	startino = agino & ~(XFS_IALLOC_INODES(mp) - 1);
+	error = xfs_inobt_lookup(cur, startino, XFS_LOOKUP_EQ, &i);
+	if (!error) {
+		if (i)
+			error = xfs_inobt_get_rec(cur, &rec, &i);
+		if (!error && i == 0)
+			error = EINVAL;
+	}
+
+	xfs_trans_brelse(tp, agbp);
+	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+	if (error)
+		return error;
+
+	/* for untrusted inodes check it is allocated first */
+	if ((flags & XFS_IGET_UNTRUSTED) &&
+	    (rec.ir_free & XFS_INOBT_MASK(agino - rec.ir_startino)))
+		return EINVAL;
+
+	*chunk_agbno = XFS_AGINO_TO_AGBNO(mp, rec.ir_startino);
+	*offset_agbno = agbno - *chunk_agbno;
+	return 0;
+}
+
 /*
  * Return the location of the inode in imap, for mapping it into a buffer.
  */
@@ -1238,8 +1292,11 @@ xfs_imap(
 	if (agno >= mp->m_sb.sb_agcount || agbno >= mp->m_sb.sb_agblocks ||
 	    ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
 #ifdef DEBUG
-		/* no diagnostics for bulkstat, ino comes from userspace */
-		if (flags & XFS_IGET_BULKSTAT)
+		/*
+		 * Don't output diagnostic information for untrusted inodes
+		 * as they can be invalid without implying corruption.
+		 */
+		if (flags & XFS_IGET_UNTRUSTED)
 			return XFS_ERROR(EINVAL);
 		if (agno >= mp->m_sb.sb_agcount) {
 			xfs_fs_cmn_err(CE_ALERT, mp,
@@ -1266,6 +1323,23 @@ xfs_imap(
 		return XFS_ERROR(EINVAL);
 	}
 
+	blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_blocklog;
+
+	/*
+	 * For bulkstat and handle lookups, we have an untrusted inode number
+	 * that we have to verify is valid. We cannot do this just by reading
+	 * the inode buffer as it may have been unlinked and removed leaving
+	 * inodes in stale state on disk. Hence we have to do a btree lookup
+	 * in all cases where an untrusted inode number is passed.
+	 */
+	if (flags & XFS_IGET_UNTRUSTED) {
+		error = xfs_imap_lookup(mp, tp, agno, agino, agbno,
+					&chunk_agbno, &offset_agbno, flags);
+		if (error)
+			return error;
+		goto out_map;
+	}
+
 	/*
 	 * If the inode cluster size is the same as the blocksize or
 	 * smaller we get to the buffer by simple arithmetics.
@@ -1280,24 +1354,6 @@ xfs_imap(
 		return 0;
 	}
 
-	blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_blocklog;
-
-	/*
-	 * If we get a block number passed from bulkstat we can use it to
-	 * find the buffer easily.
-	 */
-	if (imap->im_blkno) {
-		offset = XFS_INO_TO_OFFSET(mp, ino);
-		ASSERT(offset < mp->m_sb.sb_inopblock);
-
-		cluster_agbno = xfs_daddr_to_agbno(mp, imap->im_blkno);
-		offset += (agbno - cluster_agbno) * mp->m_sb.sb_inopblock;
-
-		imap->im_len = XFS_FSB_TO_BB(mp, blks_per_cluster);
-		imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog);
-		return 0;
-	}
-
 	/*
 	 * If the inode chunks are aligned then use simple maths to
 	 * find the location. Otherwise we have to do a btree
@@ -1307,52 +1363,13 @@ xfs_imap(
 		offset_agbno = agbno & mp->m_inoalign_mask;
 		chunk_agbno = agbno - offset_agbno;
 	} else {
-		xfs_btree_cur_t	*cur;	/* inode btree cursor */
-		xfs_inobt_rec_incore_t chunk_rec;
-		xfs_buf_t	*agbp;	/* agi buffer */
-		int		i;	/* temp state */
-
-		down_read(&mp->m_peraglock);
-		error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
-		up_read(&mp->m_peraglock);
-		if (error) {
-			xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
-					"xfs_ialloc_read_agi() returned "
-					"error %d, agno %d",
-					error, agno);
-			return error;
-		}
-
-		cur = xfs_inobt_init_cursor(mp, tp, agbp, agno);
-		error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i);
-		if (error) {
-			xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
-					"xfs_inobt_lookup() failed");
-			goto error0;
-		}
-
-		error = xfs_inobt_get_rec(cur, &chunk_rec, &i);
-		if (error) {
-			xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
-					"xfs_inobt_get_rec() failed");
-			goto error0;
-		}
-		if (i == 0) {
-#ifdef DEBUG
-			xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
-					"xfs_inobt_get_rec() failed");
-#endif /* DEBUG */
-			error = XFS_ERROR(EINVAL);
-		}
- error0:
-		xfs_trans_brelse(tp, agbp);
-		xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+		error = xfs_imap_lookup(mp, tp, agno, agino, agbno,
+					&chunk_agbno, &offset_agbno, flags);
 		if (error)
 			return error;
-		chunk_agbno = XFS_AGINO_TO_AGBNO(mp, chunk_rec.ir_startino);
-		offset_agbno = agbno - chunk_agbno;
 	}
 
+out_map:
 	ASSERT(agbno >= chunk_agbno);
 	cluster_agbno = chunk_agbno +
 		((offset_agbno / blks_per_cluster) * blks_per_cluster);
@@ -1379,7 +1396,6 @@ xfs_imap(
 			XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
 		return XFS_ERROR(EINVAL);
 	}
-
 	return 0;
 }
 
@@ -1523,8 +1539,7 @@ xfs_ialloc_read_agi(
 		return error;
 
 	agi = XFS_BUF_TO_AGI(*bpp);
-	pag = &mp->m_perag[agno];
-
+	pag = xfs_perag_get(mp, agno);
 	if (!pag->pagi_init) {
 		pag->pagi_freecount = be32_to_cpu(agi->agi_freecount);
 		pag->pagi_count = be32_to_cpu(agi->agi_count);
@@ -1537,6 +1552,7 @@ xfs_ialloc_read_agi(
 	 */
 	ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) ||
 		XFS_FORCED_SHUTDOWN(mp));
+	xfs_perag_put(pag);
 	return 0;
 }
 
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 155e798f30a1..8f8b91be2c99 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -190,13 +190,12 @@ xfs_iget_cache_hit(
 		trace_xfs_iget_reclaim(ip);
 
 		/*
-		 * We need to set XFS_INEW atomically with clearing the
-		 * reclaimable tag so that we do have an indicator of the
-		 * inode still being initialized.
+		 * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode
+		 * from stomping over us while we recycle the inode.  We can't
+		 * clear the radix tree reclaimable tag yet as it requires
+		 * pag_ici_lock to be held exclusive.
 		 */
-		ip->i_flags |= XFS_INEW;
-		ip->i_flags &= ~XFS_IRECLAIMABLE;
-		__xfs_inode_clear_reclaim_tag(mp, pag, ip);
+		ip->i_flags |= XFS_IRECLAIM;
 
 		spin_unlock(&ip->i_flags_lock);
 		read_unlock(&pag->pag_ici_lock);
@@ -216,7 +215,15 @@ xfs_iget_cache_hit(
 			trace_xfs_iget_reclaim(ip);
 			goto out_error;
 		}
+
+		write_lock(&pag->pag_ici_lock);
+		spin_lock(&ip->i_flags_lock);
+		ip->i_flags &= ~(XFS_IRECLAIMABLE | XFS_IRECLAIM);
+		ip->i_flags |= XFS_INEW;
+		__xfs_inode_clear_reclaim_tag(mp, pag, ip);
 		inode->i_state = I_NEW;
+		spin_unlock(&ip->i_flags_lock);
+		write_unlock(&pag->pag_ici_lock);
 	} else {
 		/* If the VFS inode is being torn down, pause and try again. */
 		if (!igrab(inode)) {
@@ -252,7 +259,6 @@ xfs_iget_cache_miss(
 	xfs_trans_t		*tp,
 	xfs_ino_t		ino,
 	struct xfs_inode	**ipp,
-	xfs_daddr_t		bno,
 	int			flags,
 	int			lock_flags)
 {
@@ -265,7 +271,7 @@ xfs_iget_cache_miss(
 	if (!ip)
 		return ENOMEM;
 
-	error = xfs_iread(mp, tp, ip, bno, flags);
+	error = xfs_iread(mp, tp, ip, flags);
 	if (error)
 		goto out_destroy;
 
@@ -351,8 +357,6 @@ out_destroy:
  *        within the file system for the inode being requested.
  * lock_flags -- flags indicating how to lock the inode.  See the comment
  *		 for xfs_ilock() for a list of valid values.
- * bno -- the block number starting the buffer containing the inode,
- *	  if known (as by bulkstat), else 0.
  */
 int
 xfs_iget(
@@ -361,8 +365,7 @@ xfs_iget(
 	xfs_ino_t	ino,
 	uint		flags,
 	uint		lock_flags,
-	xfs_inode_t	**ipp,
-	xfs_daddr_t	bno)
+	xfs_inode_t	**ipp)
 {
 	xfs_inode_t	*ip;
 	int		error;
@@ -374,10 +377,7 @@ xfs_iget(
 		return EINVAL;
 
 	/* get the perag structure and ensure that it's inode capable */
-	pag = xfs_get_perag(mp, ino);
-	if (!pag->pagi_inodeok)
-		return EINVAL;
-	ASSERT(pag->pag_ici_init);
+	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
 	agino = XFS_INO_TO_AGINO(mp, ino);
 
 again:
@@ -393,12 +393,12 @@ again:
 		read_unlock(&pag->pag_ici_lock);
 		XFS_STATS_INC(xs_ig_missed);
 
-		error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip, bno,
+		error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
 							flags, lock_flags);
 		if (error)
 			goto out_error_or_again;
 	}
-	xfs_put_perag(mp, pag);
+	xfs_perag_put(pag);
 
 	*ipp = ip;
 
@@ -417,7 +417,7 @@ out_error_or_again:
 		delay(1);
 		goto again;
 	}
-	xfs_put_perag(mp, pag);
+	xfs_perag_put(pag);
 	return error;
 }
 
@@ -488,12 +488,12 @@ xfs_ireclaim(
 	 * added to the tree assert that it's been there before to catch
 	 * problems with the inode life time early on.
 	 */
-	pag = xfs_get_perag(mp, ip->i_ino);
+	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
 	write_lock(&pag->pag_ici_lock);
 	if (!radix_tree_delete(&pag->pag_ici_root, agino))
 		ASSERT(0);
 	write_unlock(&pag->pag_ici_lock);
-	xfs_put_perag(mp, pag);
+	xfs_perag_put(pag);
 
 	/*
 	 * Here we do an (almost) spurious inode lock in order to coordinate
@@ -737,30 +737,24 @@ xfs_ilock_demote(
 }
 
 #ifdef DEBUG
-/*
- * Debug-only routine, without additional rw_semaphore APIs, we can
- * now only answer requests regarding whether we hold the lock for write
- * (reader state is outside our visibility, we only track writer state).
- *
- * Note: this means !xfs_isilocked would give false positives, so don't do that.
- */
 int
 xfs_isilocked(
 	xfs_inode_t		*ip,
 	uint			lock_flags)
 {
-	if ((lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) ==
-			XFS_ILOCK_EXCL) {
-		if (!ip->i_lock.mr_writer)
-			return 0;
+	if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) {
+		if (!(lock_flags & XFS_ILOCK_SHARED))
+			return !!ip->i_lock.mr_writer;
+		return rwsem_is_locked(&ip->i_lock.mr_lock);
 	}
 
-	if ((lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) ==
-			XFS_IOLOCK_EXCL) {
-		if (!ip->i_iolock.mr_writer)
-			return 0;
+	if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) {
+		if (!(lock_flags & XFS_IOLOCK_SHARED))
+			return !!ip->i_iolock.mr_writer;
+		return rwsem_is_locked(&ip->i_iolock.mr_lock);
 	}
 
-	return 1;
+	ASSERT(0);
+	return 0;
 }
 #endif
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index ef77fd88c8e3..b76a829d7e20 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -151,7 +151,7 @@ xfs_imap_to_bp(
 				"an error %d on %s.  Returning error.",
 				error, mp->m_fsname);
 		} else {
-			ASSERT(buf_flags & XFS_BUF_TRYLOCK);
+			ASSERT(buf_flags & XBF_TRYLOCK);
 		}
 		return error;
 	}
@@ -177,7 +177,7 @@ xfs_imap_to_bp(
 		if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
 						XFS_ERRTAG_ITOBP_INOTOBP,
 						XFS_RANDOM_ITOBP_INOTOBP))) {
-			if (iget_flags & XFS_IGET_BULKSTAT) {
+			if (iget_flags & XFS_IGET_UNTRUSTED) {
 				xfs_trans_brelse(tp, bp);
 				return XFS_ERROR(EINVAL);
 			}
@@ -239,7 +239,7 @@ xfs_inotobp(
 	if (error)
 		return error;
 
-	error = xfs_imap_to_bp(mp, tp, &imap, &bp, XFS_BUF_LOCK, imap_flags);
+	error = xfs_imap_to_bp(mp, tp, &imap, &bp, XBF_LOCK, imap_flags);
 	if (error)
 		return error;
 
@@ -285,7 +285,7 @@ xfs_itobp(
 		return error;
 
 	if (!bp) {
-		ASSERT(buf_flags & XFS_BUF_TRYLOCK);
+		ASSERT(buf_flags & XBF_TRYLOCK);
 		ASSERT(tp == NULL);
 		*bpp = NULL;
 		return EAGAIN;
@@ -787,7 +787,6 @@ xfs_iread(
 	xfs_mount_t	*mp,
 	xfs_trans_t	*tp,
 	xfs_inode_t	*ip,
-	xfs_daddr_t	bno,
 	uint		iget_flags)
 {
 	xfs_buf_t	*bp;
@@ -797,17 +796,15 @@ xfs_iread(
 	/*
 	 * Fill in the location information in the in-core inode.
 	 */
-	ip->i_imap.im_blkno = bno;
 	error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, iget_flags);
 	if (error)
 		return error;
-	ASSERT(bno == 0 || bno == ip->i_imap.im_blkno);
 
 	/*
 	 * Get pointers to the on-disk inode and the buffer containing it.
 	 */
 	error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp,
-			       XFS_BUF_LOCK, iget_flags);
+			       XBF_LOCK, iget_flags);
 	if (error)
 		return error;
 	dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
@@ -1751,7 +1748,7 @@ xfs_iunlink(
 		 * Here we put the head pointer into our next pointer,
 		 * and then we fall through to point the head at us.
 		 */
-		error = xfs_itobp(mp, tp, ip, &dip, &ibp, XFS_BUF_LOCK);
+		error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK);
 		if (error)
 			return error;
 
@@ -1833,7 +1830,7 @@ xfs_iunlink_remove(
 		 * of dealing with the buffer when there is no need to
 		 * change it.
 		 */
-		error = xfs_itobp(mp, tp, ip, &dip, &ibp, XFS_BUF_LOCK);
+		error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK);
 		if (error) {
 			cmn_err(CE_WARN,
 				"xfs_iunlink_remove: xfs_itobp()  returned an error %d on %s.  Returning error.",
@@ -1895,7 +1892,7 @@ xfs_iunlink_remove(
 		 * Now last_ibp points to the buffer previous to us on
 		 * the unlinked list.  Pull us from the list.
 		 */
-		error = xfs_itobp(mp, tp, ip, &dip, &ibp, XFS_BUF_LOCK);
+		error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK);
 		if (error) {
 			cmn_err(CE_WARN,
 				"xfs_iunlink_remove: xfs_itobp()  returned an error %d on %s.  Returning error.",
@@ -1940,14 +1937,15 @@ xfs_ifree_cluster(
 	int			blks_per_cluster;
 	int			nbufs;
 	int			ninodes;
-	int			i, j, found, pre_flushed;
+	int			i, j;
 	xfs_daddr_t		blkno;
 	xfs_buf_t		*bp;
-	xfs_inode_t		*ip, **ip_found;
+	xfs_inode_t		*ip;
 	xfs_inode_log_item_t	*iip;
 	xfs_log_item_t		*lip;
-	xfs_perag_t		*pag = xfs_get_perag(mp, inum);
+	struct xfs_perag	*pag;
 
+	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
 	if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) {
 		blks_per_cluster = 1;
 		ninodes = mp->m_sb.sb_inopblock;
@@ -1959,114 +1957,97 @@ xfs_ifree_cluster(
 		nbufs = XFS_IALLOC_BLOCKS(mp) / blks_per_cluster;
 	}
 
-	ip_found = kmem_alloc(ninodes * sizeof(xfs_inode_t *), KM_NOFS);
-
 	for (j = 0; j < nbufs; j++, inum += ninodes) {
+		int	found = 0;
+
 		blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
 					 XFS_INO_TO_AGBNO(mp, inum));
 
+		/*
+		 * We obtain and lock the backing buffer first in the process
+		 * here, as we have to ensure that any dirty inode that we
+		 * can't get the flush lock on is attached to the buffer.
+		 * If we scan the in-memory inodes first, then buffer IO can
+		 * complete before we get a lock on it, and hence we may fail
+		 * to mark all the active inodes on the buffer stale.
+		 */
+		bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
+					mp->m_bsize * blks_per_cluster,
+					XBF_LOCK);
 
 		/*
-		 * Look for each inode in memory and attempt to lock it,
-		 * we can be racing with flush and tail pushing here.
-		 * any inode we get the locks on, add to an array of
-		 * inode items to process later.
+		 * Walk the inodes already attached to the buffer and mark them
+		 * stale. These will all have the flush locks held, so an
+		 * in-memory inode walk can't lock them.
+		 */
+		lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
+		while (lip) {
+			if (lip->li_type == XFS_LI_INODE) {
+				iip = (xfs_inode_log_item_t *)lip;
+				ASSERT(iip->ili_logged == 1);
+				lip->li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*)) xfs_istale_done;
+				xfs_trans_ail_copy_lsn(mp->m_ail,
+							&iip->ili_flush_lsn,
+							&iip->ili_item.li_lsn);
+				xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
+				found++;
+			}
+			lip = lip->li_bio_list;
+		}
+
+		/*
+		 * For each inode in memory attempt to add it to the inode
+		 * buffer and set it up for being staled on buffer IO
+		 * completion.  This is safe as we've locked out tail pushing
+		 * and flushing by locking the buffer.
 		 *
-		 * The get the buffer lock, we could beat a flush
-		 * or tail pushing thread to the lock here, in which
-		 * case they will go looking for the inode buffer
-		 * and fail, we need some other form of interlock
-		 * here.
+		 * We have already marked every inode that was part of a
+		 * transaction stale above, which means there is no point in
+		 * even trying to lock them.
 		 */
-		found = 0;
 		for (i = 0; i < ninodes; i++) {
 			read_lock(&pag->pag_ici_lock);
 			ip = radix_tree_lookup(&pag->pag_ici_root,
 					XFS_INO_TO_AGINO(mp, (inum + i)));
 
-			/* Inode not in memory or we found it already,
-			 * nothing to do
-			 */
+			/* Inode not in memory or stale, nothing to do */
 			if (!ip || xfs_iflags_test(ip, XFS_ISTALE)) {
 				read_unlock(&pag->pag_ici_lock);
 				continue;
 			}
 
-			if (xfs_inode_clean(ip)) {
-				read_unlock(&pag->pag_ici_lock);
-				continue;
-			}
-
-			/* If we can get the locks then add it to the
-			 * list, otherwise by the time we get the bp lock
-			 * below it will already be attached to the
-			 * inode buffer.
-			 */
-
-			/* This inode will already be locked - by us, lets
-			 * keep it that way.
-			 */
-
-			if (ip == free_ip) {
-				if (xfs_iflock_nowait(ip)) {
-					xfs_iflags_set(ip, XFS_ISTALE);
-					if (xfs_inode_clean(ip)) {
-						xfs_ifunlock(ip);
-					} else {
-						ip_found[found++] = ip;
-					}
-				}
+			/* don't try to lock/unlock the current inode */
+			if (ip != free_ip &&
+			    !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
 				read_unlock(&pag->pag_ici_lock);
 				continue;
 			}
+			read_unlock(&pag->pag_ici_lock);
 
-			if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
-				if (xfs_iflock_nowait(ip)) {
-					xfs_iflags_set(ip, XFS_ISTALE);
-
-					if (xfs_inode_clean(ip)) {
-						xfs_ifunlock(ip);
-						xfs_iunlock(ip, XFS_ILOCK_EXCL);
-					} else {
-						ip_found[found++] = ip;
-					}
-				} else {
+			if (!xfs_iflock_nowait(ip)) {
+				if (ip != free_ip)
 					xfs_iunlock(ip, XFS_ILOCK_EXCL);
-				}
+				continue;
 			}
-			read_unlock(&pag->pag_ici_lock);
-		}
 
-		bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, 
-					mp->m_bsize * blks_per_cluster,
-					XFS_BUF_LOCK);
-
-		pre_flushed = 0;
-		lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
-		while (lip) {
-			if (lip->li_type == XFS_LI_INODE) {
-				iip = (xfs_inode_log_item_t *)lip;
-				ASSERT(iip->ili_logged == 1);
-				lip->li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*)) xfs_istale_done;
-				xfs_trans_ail_copy_lsn(mp->m_ail,
-							&iip->ili_flush_lsn,
-							&iip->ili_item.li_lsn);
-				xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
-				pre_flushed++;
+			xfs_iflags_set(ip, XFS_ISTALE);
+			if (xfs_inode_clean(ip)) {
+				ASSERT(ip != free_ip);
+				xfs_ifunlock(ip);
+				xfs_iunlock(ip, XFS_ILOCK_EXCL);
+				continue;
 			}
-			lip = lip->li_bio_list;
-		}
 
-		for (i = 0; i < found; i++) {
-			ip = ip_found[i];
 			iip = ip->i_itemp;
-
 			if (!iip) {
+				/* inode with unlogged changes only */
+				ASSERT(ip != free_ip);
 				ip->i_update_core = 0;
 				xfs_ifunlock(ip);
 				xfs_iunlock(ip, XFS_ILOCK_EXCL);
 				continue;
 			}
+			found++;
 
 			iip->ili_last_fields = iip->ili_format.ilf_fields;
 			iip->ili_format.ilf_fields = 0;
@@ -2077,18 +2058,17 @@ xfs_ifree_cluster(
 			xfs_buf_attach_iodone(bp,
 				(void(*)(xfs_buf_t*,xfs_log_item_t*))
 				xfs_istale_done, (xfs_log_item_t *)iip);
-			if (ip != free_ip) {
+
+			if (ip != free_ip)
 				xfs_iunlock(ip, XFS_ILOCK_EXCL);
-			}
 		}
 
-		if (found || pre_flushed)
+		if (found)
 			xfs_trans_stale_inode_buf(tp, bp);
 		xfs_trans_binval(tp, bp);
 	}
 
-	kmem_free(ip_found);
-	xfs_put_perag(mp, pag);
+	xfs_perag_put(pag);
 }
 
 /*
@@ -2150,7 +2130,7 @@ xfs_ifree(
 
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 
-	error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, XFS_BUF_LOCK);
+	error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, XBF_LOCK);
 	if (error)
 		return error;
 
@@ -2438,72 +2418,33 @@ xfs_idestroy_fork(
 }
 
 /*
- * Increment the pin count of the given buffer.
- * This value is protected by ipinlock spinlock in the mount structure.
+ * This is called to unpin an inode.  The caller must have the inode locked
+ * in at least shared mode so that the buffer cannot be subsequently pinned
+ * once someone is waiting for it to be unpinned.
  */
-void
-xfs_ipin(
-	xfs_inode_t	*ip)
-{
-	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-
-	atomic_inc(&ip->i_pincount);
-}
-
-/*
- * Decrement the pin count of the given inode, and wake up
- * anyone in xfs_iwait_unpin() if the count goes to 0.  The
- * inode must have been previously pinned with a call to xfs_ipin().
- */
-void
-xfs_iunpin(
-	xfs_inode_t	*ip)
-{
-	ASSERT(atomic_read(&ip->i_pincount) > 0);
-
-	if (atomic_dec_and_test(&ip->i_pincount))
-		wake_up(&ip->i_ipin_wait);
-}
-
-/*
- * This is called to unpin an inode. It can be directed to wait or to return
- * immediately without waiting for the inode to be unpinned.  The caller must
- * have the inode locked in at least shared mode so that the buffer cannot be
- * subsequently pinned once someone is waiting for it to be unpinned.
- */
-STATIC void
-__xfs_iunpin_wait(
-	xfs_inode_t	*ip,
-	int		wait)
+static void
+xfs_iunpin_nowait(
+	struct xfs_inode	*ip)
 {
-	xfs_inode_log_item_t	*iip = ip->i_itemp;
-
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
-	if (atomic_read(&ip->i_pincount) == 0)
-		return;
+
+	trace_xfs_inode_unpin_nowait(ip, _RET_IP_);
 
 	/* Give the log a push to start the unpinning I/O */
-	xfs_log_force(ip->i_mount, (iip && iip->ili_last_lsn) ?
-				iip->ili_last_lsn : 0, XFS_LOG_FORCE);
-	if (wait)
-		wait_event(ip->i_ipin_wait, (atomic_read(&ip->i_pincount) == 0));
-}
+	xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, 0);
 
-static inline void
-xfs_iunpin_wait(
-	xfs_inode_t	*ip)
-{
-	__xfs_iunpin_wait(ip, 1);
 }
 
-static inline void
-xfs_iunpin_nowait(
-	xfs_inode_t	*ip)
+void
+xfs_iunpin_wait(
+	struct xfs_inode	*ip)
 {
-	__xfs_iunpin_wait(ip, 0);
+	if (xfs_ipincount(ip)) {
+		xfs_iunpin_nowait(ip);
+		wait_event(ip->i_ipin_wait, (xfs_ipincount(ip) == 0));
+	}
 }
 
-
 /*
  * xfs_iextents_copy()
  *
@@ -2675,7 +2616,7 @@ xfs_iflush_cluster(
 	xfs_buf_t	*bp)
 {
 	xfs_mount_t		*mp = ip->i_mount;
-	xfs_perag_t		*pag = xfs_get_perag(mp, ip->i_ino);
+	struct xfs_perag	*pag;
 	unsigned long		first_index, mask;
 	unsigned long		inodes_per_cluster;
 	int			ilist_size;
@@ -2686,14 +2627,13 @@ xfs_iflush_cluster(
 	int			bufwasdelwri;
 	int			i;
 
-	ASSERT(pag->pagi_inodeok);
-	ASSERT(pag->pag_ici_init);
+	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
 
 	inodes_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog;
 	ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *);
 	ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS);
 	if (!ilist)
-		return 0;
+		goto out_put;
 
 	mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
 	first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
@@ -2762,6 +2702,8 @@ xfs_iflush_cluster(
 out_free:
 	read_unlock(&pag->pag_ici_lock);
 	kmem_free(ilist);
+out_put:
+	xfs_perag_put(pag);
 	return 0;
 
 
@@ -2805,6 +2747,7 @@ cluster_corrupt_out:
 	 */
 	xfs_iflush_abort(iq);
 	kmem_free(ilist);
+	xfs_perag_put(pag);
 	return XFS_ERROR(EFSCORRUPTED);
 }
 
@@ -2827,8 +2770,6 @@ xfs_iflush(
 	xfs_dinode_t		*dip;
 	xfs_mount_t		*mp;
 	int			error;
-	int			noblock = (flags == XFS_IFLUSH_ASYNC_NOBLOCK);
-	enum { INT_DELWRI = (1 << 0), INT_ASYNC = (1 << 1) };
 
 	XFS_STATS_INC(xs_iflush_count);
 
@@ -2841,15 +2782,6 @@ xfs_iflush(
 	mp = ip->i_mount;
 
 	/*
-	 * If the inode isn't dirty, then just release the inode flush lock and
-	 * do nothing.
-	 */
-	if (xfs_inode_clean(ip)) {
-		xfs_ifunlock(ip);
-		return 0;
-	}
-
-	/*
 	 * We can't flush the inode until it is unpinned, so wait for it if we
 	 * are allowed to block.  We know noone new can pin it, because we are
 	 * holding the inode lock shared and you need to hold it exclusively to
@@ -2860,7 +2792,7 @@ xfs_iflush(
 	 * in the same cluster are dirty, they will probably write the inode
 	 * out for us if they occur after the log force completes.
 	 */
-	if (noblock && xfs_ipincount(ip)) {
+	if (!(flags & SYNC_WAIT) && xfs_ipincount(ip)) {
 		xfs_iunpin_nowait(ip);
 		xfs_ifunlock(ip);
 		return EAGAIN;
@@ -2894,60 +2826,10 @@ xfs_iflush(
 	}
 
 	/*
-	 * Decide how buffer will be flushed out.  This is done before
-	 * the call to xfs_iflush_int because this field is zeroed by it.
-	 */
-	if (iip != NULL && iip->ili_format.ilf_fields != 0) {
-		/*
-		 * Flush out the inode buffer according to the directions
-		 * of the caller.  In the cases where the caller has given
-		 * us a choice choose the non-delwri case.  This is because
-		 * the inode is in the AIL and we need to get it out soon.
-		 */
-		switch (flags) {
-		case XFS_IFLUSH_SYNC:
-		case XFS_IFLUSH_DELWRI_ELSE_SYNC:
-			flags = 0;
-			break;
-		case XFS_IFLUSH_ASYNC_NOBLOCK:
-		case XFS_IFLUSH_ASYNC:
-		case XFS_IFLUSH_DELWRI_ELSE_ASYNC:
-			flags = INT_ASYNC;
-			break;
-		case XFS_IFLUSH_DELWRI:
-			flags = INT_DELWRI;
-			break;
-		default:
-			ASSERT(0);
-			flags = 0;
-			break;
-		}
-	} else {
-		switch (flags) {
-		case XFS_IFLUSH_DELWRI_ELSE_SYNC:
-		case XFS_IFLUSH_DELWRI_ELSE_ASYNC:
-		case XFS_IFLUSH_DELWRI:
-			flags = INT_DELWRI;
-			break;
-		case XFS_IFLUSH_ASYNC_NOBLOCK:
-		case XFS_IFLUSH_ASYNC:
-			flags = INT_ASYNC;
-			break;
-		case XFS_IFLUSH_SYNC:
-			flags = 0;
-			break;
-		default:
-			ASSERT(0);
-			flags = 0;
-			break;
-		}
-	}
-
-	/*
 	 * Get the buffer containing the on-disk inode.
 	 */
 	error = xfs_itobp(mp, NULL, ip, &dip, &bp,
-				noblock ? XFS_BUF_TRYLOCK : XFS_BUF_LOCK);
+				(flags & SYNC_WAIT) ? XBF_LOCK : XBF_TRYLOCK);
 	if (error || !bp) {
 		xfs_ifunlock(ip);
 		return error;
@@ -2965,7 +2847,7 @@ xfs_iflush(
 	 * get stuck waiting in the write for too long.
 	 */
 	if (XFS_BUF_ISPINNED(bp))
-		xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
+		xfs_log_force(mp, 0);
 
 	/*
 	 * inode clustering:
@@ -2975,13 +2857,10 @@ xfs_iflush(
 	if (error)
 		goto cluster_corrupt_out;
 
-	if (flags & INT_DELWRI) {
-		xfs_bdwrite(mp, bp);
-	} else if (flags & INT_ASYNC) {
-		error = xfs_bawrite(mp, bp);
-	} else {
+	if (flags & SYNC_WAIT)
 		error = xfs_bwrite(mp, bp);
-	}
+	else
+		xfs_bdwrite(mp, bp);
 	return error;
 
 corrupt_out:
@@ -3016,16 +2895,6 @@ xfs_iflush_int(
 	iip = ip->i_itemp;
 	mp = ip->i_mount;
 
-
-	/*
-	 * If the inode isn't dirty, then just release the inode
-	 * flush lock and do nothing.
-	 */
-	if (xfs_inode_clean(ip)) {
-		xfs_ifunlock(ip);
-		return 0;
-	}
-
 	/* set *dip = inode's place in the buffer */
 	dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
 
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index ec1f28c4fc4f..78550df13cd6 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -420,16 +420,6 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
 #define XFS_ILOCK_DEP(flags)	(((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT)
 
 /*
- * Flags for xfs_iflush()
- */
-#define	XFS_IFLUSH_DELWRI_ELSE_SYNC	1
-#define	XFS_IFLUSH_DELWRI_ELSE_ASYNC	2
-#define	XFS_IFLUSH_SYNC			3
-#define	XFS_IFLUSH_ASYNC		4
-#define	XFS_IFLUSH_DELWRI		5
-#define	XFS_IFLUSH_ASYNC_NOBLOCK	6
-
-/*
  * Flags for xfs_itruncate_start().
  */
 #define	XFS_ITRUNC_DEFINITE	0x1
@@ -452,7 +442,7 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
  * xfs_iget.c prototypes.
  */
 int		xfs_iget(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
-			 uint, uint, xfs_inode_t **, xfs_daddr_t);
+			 uint, uint, xfs_inode_t **);
 void		xfs_iput(xfs_inode_t *, uint);
 void		xfs_iput_new(xfs_inode_t *, uint);
 void		xfs_ilock(xfs_inode_t *, uint);
@@ -481,14 +471,14 @@ int		xfs_itruncate_finish(struct xfs_trans **, xfs_inode_t *,
 int		xfs_iunlink(struct xfs_trans *, xfs_inode_t *);
 
 void		xfs_iext_realloc(xfs_inode_t *, int, int);
-void		xfs_ipin(xfs_inode_t *);
-void		xfs_iunpin(xfs_inode_t *);
+void		xfs_iunpin_wait(xfs_inode_t *);
 int		xfs_iflush(xfs_inode_t *, uint);
 void		xfs_ichgtime(xfs_inode_t *, int);
 void		xfs_lock_inodes(xfs_inode_t **, int, uint);
 void		xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
 
 void		xfs_synchronize_times(xfs_inode_t *);
+void		xfs_mark_inode_dirty(xfs_inode_t *);
 void		xfs_mark_inode_dirty_sync(xfs_inode_t *);
 
 #define IHOLD(ip) \
@@ -510,7 +500,7 @@ do { \
  * Flags for xfs_iget()
  */
 #define XFS_IGET_CREATE		0x1
-#define XFS_IGET_BULKSTAT	0x2
+#define XFS_IGET_UNTRUSTED	0x2
 
 int		xfs_inotobp(struct xfs_mount *, struct xfs_trans *,
 			    xfs_ino_t, struct xfs_dinode **,
@@ -519,7 +509,7 @@ int		xfs_itobp(struct xfs_mount *, struct xfs_trans *,
 			  struct xfs_inode *, struct xfs_dinode **,
 			  struct xfs_buf **, uint);
 int		xfs_iread(struct xfs_mount *, struct xfs_trans *,
-			  struct xfs_inode *, xfs_daddr_t, uint);
+			  struct xfs_inode *, uint);
 void		xfs_dinode_to_disk(struct xfs_dinode *,
 				   struct xfs_icdinode *);
 void		xfs_idestroy_fork(struct xfs_inode *, int);
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index f38855d21ea5..cf8249a60004 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -228,7 +228,7 @@ xfs_inode_item_format(
 
 	vecp->i_addr = (xfs_caddr_t)&iip->ili_format;
 	vecp->i_len  = sizeof(xfs_inode_log_format_t);
-	XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IFORMAT);
+	vecp->i_type = XLOG_REG_TYPE_IFORMAT;
 	vecp++;
 	nvecs	     = 1;
 
@@ -279,7 +279,7 @@ xfs_inode_item_format(
 
 	vecp->i_addr = (xfs_caddr_t)&ip->i_d;
 	vecp->i_len  = sizeof(struct xfs_icdinode);
-	XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_ICORE);
+	vecp->i_type = XLOG_REG_TYPE_ICORE;
 	vecp++;
 	nvecs++;
 	iip->ili_format.ilf_fields |= XFS_ILOG_CORE;
@@ -336,7 +336,7 @@ xfs_inode_item_format(
 				vecp->i_addr =
 					(char *)(ip->i_df.if_u1.if_extents);
 				vecp->i_len = ip->i_df.if_bytes;
-				XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IEXT);
+				vecp->i_type = XLOG_REG_TYPE_IEXT;
 			} else
 #endif
 			{
@@ -355,7 +355,7 @@ xfs_inode_item_format(
 				vecp->i_addr = (xfs_caddr_t)ext_buffer;
 				vecp->i_len = xfs_iextents_copy(ip, ext_buffer,
 						XFS_DATA_FORK);
-				XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IEXT);
+				vecp->i_type = XLOG_REG_TYPE_IEXT;
 			}
 			ASSERT(vecp->i_len <= ip->i_df.if_bytes);
 			iip->ili_format.ilf_dsize = vecp->i_len;
@@ -373,7 +373,7 @@ xfs_inode_item_format(
 			ASSERT(ip->i_df.if_broot != NULL);
 			vecp->i_addr = (xfs_caddr_t)ip->i_df.if_broot;
 			vecp->i_len = ip->i_df.if_broot_bytes;
-			XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IBROOT);
+			vecp->i_type = XLOG_REG_TYPE_IBROOT;
 			vecp++;
 			nvecs++;
 			iip->ili_format.ilf_dsize = ip->i_df.if_broot_bytes;
@@ -399,7 +399,7 @@ xfs_inode_item_format(
 			ASSERT((ip->i_df.if_real_bytes == 0) ||
 			       (ip->i_df.if_real_bytes == data_bytes));
 			vecp->i_len = (int)data_bytes;
-			XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_ILOCAL);
+			vecp->i_type = XLOG_REG_TYPE_ILOCAL;
 			vecp++;
 			nvecs++;
 			iip->ili_format.ilf_dsize = (unsigned)data_bytes;
@@ -477,7 +477,7 @@ xfs_inode_item_format(
 			vecp->i_len = xfs_iextents_copy(ip, ext_buffer,
 					XFS_ATTR_FORK);
 #endif
-			XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IATTR_EXT);
+			vecp->i_type = XLOG_REG_TYPE_IATTR_EXT;
 			iip->ili_format.ilf_asize = vecp->i_len;
 			vecp++;
 			nvecs++;
@@ -492,7 +492,7 @@ xfs_inode_item_format(
 			ASSERT(ip->i_afp->if_broot != NULL);
 			vecp->i_addr = (xfs_caddr_t)ip->i_afp->if_broot;
 			vecp->i_len = ip->i_afp->if_broot_bytes;
-			XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IATTR_BROOT);
+			vecp->i_type = XLOG_REG_TYPE_IATTR_BROOT;
 			vecp++;
 			nvecs++;
 			iip->ili_format.ilf_asize = ip->i_afp->if_broot_bytes;
@@ -516,7 +516,7 @@ xfs_inode_item_format(
 			ASSERT((ip->i_afp->if_real_bytes == 0) ||
 			       (ip->i_afp->if_real_bytes == data_bytes));
 			vecp->i_len = (int)data_bytes;
-			XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IATTR_LOCAL);
+			vecp->i_type = XLOG_REG_TYPE_IATTR_LOCAL;
 			vecp++;
 			nvecs++;
 			iip->ili_format.ilf_asize = (unsigned)data_bytes;
@@ -535,31 +535,36 @@ xfs_inode_item_format(
 
 /*
  * This is called to pin the inode associated with the inode log
- * item in memory so it cannot be written out.  Do this by calling
- * xfs_ipin() to bump the pin count in the inode while holding the
- * inode pin lock.
+ * item in memory so it cannot be written out.
  */
 STATIC void
 xfs_inode_item_pin(
 	xfs_inode_log_item_t	*iip)
 {
 	ASSERT(xfs_isilocked(iip->ili_inode, XFS_ILOCK_EXCL));
-	xfs_ipin(iip->ili_inode);
+
+	trace_xfs_inode_pin(iip->ili_inode, _RET_IP_);
+	atomic_inc(&iip->ili_inode->i_pincount);
 }
 
 
 /*
  * This is called to unpin the inode associated with the inode log
  * item which was previously pinned with a call to xfs_inode_item_pin().
- * Just call xfs_iunpin() on the inode to do this.
+ *
+ * Also wake up anyone in xfs_iunpin_wait() if the count goes to 0.
  */
 /* ARGSUSED */
 STATIC void
 xfs_inode_item_unpin(
-	xfs_inode_log_item_t	*iip,
-	int			stale)
+	xfs_inode_log_item_t	*iip)
 {
-	xfs_iunpin(iip->ili_inode);
+	struct xfs_inode	*ip = iip->ili_inode;
+
+	trace_xfs_inode_unpin(ip, _RET_IP_);
+	ASSERT(atomic_read(&ip->i_pincount) > 0);
+	if (atomic_dec_and_test(&ip->i_pincount))
+		wake_up(&ip->i_ipin_wait);
 }
 
 /* ARGSUSED */
@@ -568,7 +573,7 @@ xfs_inode_item_unpin_remove(
 	xfs_inode_log_item_t	*iip,
 	xfs_trans_t		*tp)
 {
-	xfs_iunpin(iip->ili_inode);
+	xfs_inode_item_unpin(iip);
 }
 
 /*
@@ -602,33 +607,20 @@ xfs_inode_item_trylock(
 
 	if (!xfs_iflock_nowait(ip)) {
 		/*
-		 * If someone else isn't already trying to push the inode
-		 * buffer, we get to do it.
+		 * inode has already been flushed to the backing buffer,
+		 * leave it locked in shared mode, pushbuf routine will
+		 * unlock it.
 		 */
-		if (iip->ili_pushbuf_flag == 0) {
-			iip->ili_pushbuf_flag = 1;
-#ifdef DEBUG
-			iip->ili_push_owner = current_pid();
-#endif
-			/*
-			 * Inode is left locked in shared mode.
-			 * Pushbuf routine gets to unlock it.
-			 */
-			return XFS_ITEM_PUSHBUF;
-		} else {
-			/*
-			 * We hold the AIL lock, so we must specify the
-			 * NONOTIFY flag so that we won't double trip.
-			 */
-			xfs_iunlock(ip, XFS_ILOCK_SHARED|XFS_IUNLOCK_NONOTIFY);
-			return XFS_ITEM_FLUSHING;
-		}
-		/* NOTREACHED */
+		return XFS_ITEM_PUSHBUF;
 	}
 
 	/* Stale items should force out the iclog */
 	if (ip->i_flags & XFS_ISTALE) {
 		xfs_ifunlock(ip);
+		/*
+		 * we hold the AIL lock - notify the unlock routine of this
+		 * so it doesn't try to get the lock again.
+		 */
 		xfs_iunlock(ip, XFS_ILOCK_SHARED|XFS_IUNLOCK_NONOTIFY);
 		return XFS_ITEM_PINNED;
 	}
@@ -746,11 +738,8 @@ xfs_inode_item_committed(
  * This gets called by xfs_trans_push_ail(), when IOP_TRYLOCK
  * failed to get the inode flush lock but did get the inode locked SHARED.
  * Here we're trying to see if the inode buffer is incore, and if so whether it's
- * marked delayed write. If that's the case, we'll initiate a bawrite on that
- * buffer to expedite the process.
- *
- * We aren't holding the AIL lock (or the flush lock) when this gets called,
- * so it is inherently race-y.
+ * marked delayed write. If that's the case, we'll promote it and that will
+ * allow the caller to write the buffer by triggering the xfsbufd to run.
  */
 STATIC void
 xfs_inode_item_pushbuf(
@@ -759,82 +748,30 @@ xfs_inode_item_pushbuf(
 	xfs_inode_t	*ip;
 	xfs_mount_t	*mp;
 	xfs_buf_t	*bp;
-	uint		dopush;
 
 	ip = iip->ili_inode;
-
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED));
 
 	/*
-	 * The ili_pushbuf_flag keeps others from
-	 * trying to duplicate our effort.
-	 */
-	ASSERT(iip->ili_pushbuf_flag != 0);
-	ASSERT(iip->ili_push_owner == current_pid());
-
-	/*
 	 * If a flush is not in progress anymore, chances are that the
 	 * inode was taken off the AIL. So, just get out.
 	 */
 	if (completion_done(&ip->i_flush) ||
 	    ((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0)) {
-		iip->ili_pushbuf_flag = 0;
 		xfs_iunlock(ip, XFS_ILOCK_SHARED);
 		return;
 	}
 
 	mp = ip->i_mount;
 	bp = xfs_incore(mp->m_ddev_targp, iip->ili_format.ilf_blkno,
-		    iip->ili_format.ilf_len, XFS_INCORE_TRYLOCK);
-
-	if (bp != NULL) {
-		if (XFS_BUF_ISDELAYWRITE(bp)) {
-			/*
-			 * We were racing with iflush because we don't hold
-			 * the AIL lock or the flush lock. However, at this point,
-			 * we have the buffer, and we know that it's dirty.
-			 * So, it's possible that iflush raced with us, and
-			 * this item is already taken off the AIL.
-			 * If not, we can flush it async.
-			 */
-			dopush = ((iip->ili_item.li_flags & XFS_LI_IN_AIL) &&
-				  !completion_done(&ip->i_flush));
-			iip->ili_pushbuf_flag = 0;
-			xfs_iunlock(ip, XFS_ILOCK_SHARED);
-
-			trace_xfs_inode_item_push(bp, _RET_IP_);
+		    iip->ili_format.ilf_len, XBF_TRYLOCK);
 
-			if (XFS_BUF_ISPINNED(bp)) {
-				xfs_log_force(mp, (xfs_lsn_t)0,
-					      XFS_LOG_FORCE);
-			}
-			if (dopush) {
-				int	error;
-				error = xfs_bawrite(mp, bp);
-				if (error)
-					xfs_fs_cmn_err(CE_WARN, mp,
-		"xfs_inode_item_pushbuf: pushbuf error %d on iip %p, bp %p",
-							error, iip, bp);
-			} else {
-				xfs_buf_relse(bp);
-			}
-		} else {
-			iip->ili_pushbuf_flag = 0;
-			xfs_iunlock(ip, XFS_ILOCK_SHARED);
-			xfs_buf_relse(bp);
-		}
-		return;
-	}
-	/*
-	 * We have to be careful about resetting pushbuf flag too early (above).
-	 * Even though in theory we can do it as soon as we have the buflock,
-	 * we don't want others to be doing work needlessly. They'll come to
-	 * this function thinking that pushing the buffer is their
-	 * responsibility only to find that the buffer is still locked by
-	 * another doing the same thing
-	 */
-	iip->ili_pushbuf_flag = 0;
 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
+	if (!bp)
+		return;
+	if (XFS_BUF_ISDELAYWRITE(bp))
+		xfs_buf_delwri_promote(bp);
+	xfs_buf_relse(bp);
 	return;
 }
 
@@ -867,10 +804,14 @@ xfs_inode_item_push(
 	       iip->ili_format.ilf_fields != 0);
 
 	/*
-	 * Write out the inode.  The completion routine ('iflush_done') will
-	 * pull it from the AIL, mark it clean, unlock the flush lock.
+	 * Push the inode to it's backing buffer. This will not remove the
+	 * inode from the AIL - a further push will be required to trigger a
+	 * buffer push. However, this allows all the dirty inodes to be pushed
+	 * to the buffer before it is pushed to disk. THe buffer IO completion
+	 * will pull th einode from the AIL, mark it clean and unlock the flush
+	 * lock.
 	 */
-	(void) xfs_iflush(ip, XFS_IFLUSH_ASYNC);
+	(void) xfs_iflush(ip, 0);
 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
 
 	return;
@@ -898,7 +839,7 @@ static struct xfs_item_ops xfs_inode_item_ops = {
 	.iop_format	= (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
 					xfs_inode_item_format,
 	.iop_pin	= (void(*)(xfs_log_item_t*))xfs_inode_item_pin,
-	.iop_unpin	= (void(*)(xfs_log_item_t*, int))xfs_inode_item_unpin,
+	.iop_unpin	= (void(*)(xfs_log_item_t*))xfs_inode_item_unpin,
 	.iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*))
 					xfs_inode_item_unpin_remove,
 	.iop_trylock	= (uint(*)(xfs_log_item_t*))xfs_inode_item_trylock,
@@ -925,18 +866,9 @@ xfs_inode_item_init(
 	ASSERT(ip->i_itemp == NULL);
 	iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, KM_SLEEP);
 
-	iip->ili_item.li_type = XFS_LI_INODE;
-	iip->ili_item.li_ops = &xfs_inode_item_ops;
-	iip->ili_item.li_mountp = mp;
-	iip->ili_item.li_ailp = mp->m_ail;
 	iip->ili_inode = ip;
-
-	/*
-	   We have zeroed memory. No need ...
-	   iip->ili_extents_buf = NULL;
-	   iip->ili_pushbuf_flag = 0;
-	 */
-
+	xfs_log_item_init(mp, &iip->ili_item, XFS_LI_INODE,
+						&xfs_inode_item_ops);
 	iip->ili_format.ilf_type = XFS_LI_INODE;
 	iip->ili_format.ilf_ino = ip->i_ino;
 	iip->ili_format.ilf_blkno = ip->i_imap.im_blkno;
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index cc8df1ac7783..9a467958ecdd 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -144,12 +144,6 @@ typedef struct xfs_inode_log_item {
 						      data exts */
 	struct xfs_bmbt_rec	*ili_aextents_buf; /* array of logged
 						      attr exts */
-	unsigned int            ili_pushbuf_flag;  /* one bit used in push_ail */
-
-#ifdef DEBUG
-	uint64_t                ili_push_owner;    /* one who sets pushbuf_flag
-						      above gets to push the buf */
-#endif
 #ifdef XFS_TRANS_DEBUG
 	int			ili_root_size;
 	char			*ili_orig_root;
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 0b65039951a0..ef14943829da 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -55,71 +55,33 @@
 #define XFS_STRAT_WRITE_IMAPS	2
 #define XFS_WRITE_IMAPS		XFS_BMAP_MAX_NMAP
 
-STATIC int
-xfs_imap_to_bmap(
-	xfs_inode_t	*ip,
-	xfs_off_t	offset,
-	xfs_bmbt_irec_t *imap,
-	xfs_iomap_t	*iomapp,
-	int		imaps,			/* Number of imap entries */
-	int		iomaps,			/* Number of iomap entries */
-	int		flags)
-{
-	xfs_mount_t	*mp = ip->i_mount;
-	int		pbm;
-	xfs_fsblock_t	start_block;
-
-
-	for (pbm = 0; imaps && pbm < iomaps; imaps--, iomapp++, imap++, pbm++) {
-		iomapp->iomap_offset = XFS_FSB_TO_B(mp, imap->br_startoff);
-		iomapp->iomap_delta = offset - iomapp->iomap_offset;
-		iomapp->iomap_bsize = XFS_FSB_TO_B(mp, imap->br_blockcount);
-		iomapp->iomap_flags = flags;
-
-		if (XFS_IS_REALTIME_INODE(ip)) {
-			iomapp->iomap_flags |= IOMAP_REALTIME;
-			iomapp->iomap_target = mp->m_rtdev_targp;
-		} else {
-			iomapp->iomap_target = mp->m_ddev_targp;
-		}
-		start_block = imap->br_startblock;
-		if (start_block == HOLESTARTBLOCK) {
-			iomapp->iomap_bn = IOMAP_DADDR_NULL;
-			iomapp->iomap_flags |= IOMAP_HOLE;
-		} else if (start_block == DELAYSTARTBLOCK) {
-			iomapp->iomap_bn = IOMAP_DADDR_NULL;
-			iomapp->iomap_flags |= IOMAP_DELAY;
-		} else {
-			iomapp->iomap_bn = xfs_fsb_to_db(ip, start_block);
-			if (ISUNWRITTEN(imap))
-				iomapp->iomap_flags |= IOMAP_UNWRITTEN;
-		}
-
-		offset += iomapp->iomap_bsize - iomapp->iomap_delta;
-	}
-	return pbm;	/* Return the number filled */
-}
+STATIC int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
+				  int, struct xfs_bmbt_irec *, int *);
+STATIC int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t, int,
+				 struct xfs_bmbt_irec *, int *);
+STATIC int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t,
+				struct xfs_bmbt_irec *, int *);
 
 int
 xfs_iomap(
-	xfs_inode_t	*ip,
-	xfs_off_t	offset,
-	ssize_t		count,
-	int		flags,
-	xfs_iomap_t	*iomapp,
-	int		*niomaps)
+	struct xfs_inode	*ip,
+	xfs_off_t		offset,
+	ssize_t			count,
+	int			flags,
+	struct xfs_bmbt_irec	*imap,
+	int			*nimaps,
+	int			*new)
 {
-	xfs_mount_t	*mp = ip->i_mount;
-	xfs_fileoff_t	offset_fsb, end_fsb;
-	int		error = 0;
-	int		lockmode = 0;
-	xfs_bmbt_irec_t	imap;
-	int		nimaps = 1;
-	int		bmapi_flags = 0;
-	int		iomap_flags = 0;
+	struct xfs_mount	*mp = ip->i_mount;
+	xfs_fileoff_t		offset_fsb, end_fsb;
+	int			error = 0;
+	int			lockmode = 0;
+	int			bmapi_flags = 0;
 
 	ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
 
+	*new = 0;
+
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return XFS_ERROR(EIO);
 
@@ -160,8 +122,8 @@ xfs_iomap(
 
 	error = xfs_bmapi(NULL, ip, offset_fsb,
 			(xfs_filblks_t)(end_fsb - offset_fsb),
-			bmapi_flags,  NULL, 0, &imap,
-			&nimaps, NULL, NULL);
+			bmapi_flags,  NULL, 0, imap,
+			nimaps, NULL, NULL);
 
 	if (error)
 		goto out;
@@ -169,46 +131,41 @@ xfs_iomap(
 	switch (flags & (BMAPI_WRITE|BMAPI_ALLOCATE)) {
 	case BMAPI_WRITE:
 		/* If we found an extent, return it */
-		if (nimaps &&
-		    (imap.br_startblock != HOLESTARTBLOCK) &&
-		    (imap.br_startblock != DELAYSTARTBLOCK)) {
-			trace_xfs_iomap_found(ip, offset, count, flags, &imap);
+		if (*nimaps &&
+		    (imap->br_startblock != HOLESTARTBLOCK) &&
+		    (imap->br_startblock != DELAYSTARTBLOCK)) {
+			trace_xfs_iomap_found(ip, offset, count, flags, imap);
 			break;
 		}
 
 		if (flags & (BMAPI_DIRECT|BMAPI_MMAP)) {
 			error = xfs_iomap_write_direct(ip, offset, count, flags,
-						       &imap, &nimaps, nimaps);
+						       imap, nimaps);
 		} else {
 			error = xfs_iomap_write_delay(ip, offset, count, flags,
-						      &imap, &nimaps);
+						      imap, nimaps);
 		}
 		if (!error) {
-			trace_xfs_iomap_alloc(ip, offset, count, flags, &imap);
+			trace_xfs_iomap_alloc(ip, offset, count, flags, imap);
 		}
-		iomap_flags = IOMAP_NEW;
+		*new = 1;
 		break;
 	case BMAPI_ALLOCATE:
 		/* If we found an extent, return it */
 		xfs_iunlock(ip, lockmode);
 		lockmode = 0;
 
-		if (nimaps && !isnullstartblock(imap.br_startblock)) {
-			trace_xfs_iomap_found(ip, offset, count, flags, &imap);
+		if (*nimaps && !isnullstartblock(imap->br_startblock)) {
+			trace_xfs_iomap_found(ip, offset, count, flags, imap);
 			break;
 		}
 
 		error = xfs_iomap_write_allocate(ip, offset, count,
-						 &imap, &nimaps);
+						 imap, nimaps);
 		break;
 	}
 
-	if (nimaps) {
-		*niomaps = xfs_imap_to_bmap(ip, offset, &imap,
-					    iomapp, nimaps, *niomaps, iomap_flags);
-	} else if (niomaps) {
-		*niomaps = 0;
-	}
+	ASSERT(*nimaps <= 1);
 
 out:
 	if (lockmode)
@@ -216,7 +173,6 @@ out:
 	return XFS_ERROR(error);
 }
 
-
 STATIC int
 xfs_iomap_eof_align_last_fsb(
 	xfs_mount_t	*mp,
@@ -285,15 +241,14 @@ xfs_cmn_err_fsblock_zero(
 	return EFSCORRUPTED;
 }
 
-int
+STATIC int
 xfs_iomap_write_direct(
 	xfs_inode_t	*ip,
 	xfs_off_t	offset,
 	size_t		count,
 	int		flags,
 	xfs_bmbt_irec_t *ret_imap,
-	int		*nmaps,
-	int		found)
+	int		*nmaps)
 {
 	xfs_mount_t	*mp = ip->i_mount;
 	xfs_fileoff_t	offset_fsb;
@@ -330,7 +285,7 @@ xfs_iomap_write_direct(
 		if (error)
 			goto error_out;
 	} else {
-		if (found && (ret_imap->br_startblock == HOLESTARTBLOCK))
+		if (*nmaps && (ret_imap->br_startblock == HOLESTARTBLOCK))
 			last_fsb = MIN(last_fsb, (xfs_fileoff_t)
 					ret_imap->br_blockcount +
 					ret_imap->br_startoff);
@@ -485,7 +440,7 @@ xfs_iomap_eof_want_preallocate(
 	return 0;
 }
 
-int
+STATIC int
 xfs_iomap_write_delay(
 	xfs_inode_t	*ip,
 	xfs_off_t	offset,
@@ -588,7 +543,7 @@ retry:
  * We no longer bother to look at the incoming map - all we have to
  * guarantee is that whatever we allocate fills the required range.
  */
-int
+STATIC int
 xfs_iomap_write_allocate(
 	xfs_inode_t	*ip,
 	xfs_off_t	offset,
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 174f29990991..81ac4afd45b3 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -18,19 +18,6 @@
 #ifndef __XFS_IOMAP_H__
 #define __XFS_IOMAP_H__
 
-#define IOMAP_DADDR_NULL ((xfs_daddr_t) (-1LL))
-
-
-typedef enum {				/* iomap_flags values */
-	IOMAP_READ =		0,	/* mapping for a read */
-	IOMAP_HOLE =		0x02,	/* mapping covers a hole  */
-	IOMAP_DELAY =		0x04,	/* mapping covers delalloc region  */
-	IOMAP_REALTIME =	0x10,	/* mapping on the realtime device  */
-	IOMAP_UNWRITTEN =	0x20,	/* mapping covers allocated */
-					/* but uninitialized file data  */
-	IOMAP_NEW =		0x40	/* just allocate */
-} iomap_flags_t;
-
 typedef enum {
 	/* base extent manipulation calls */
 	BMAPI_READ = (1 << 0),		/* read extents */
@@ -52,43 +39,11 @@ typedef enum {
 	{ BMAPI_MMAP,		"MMAP" }, \
 	{ BMAPI_TRYLOCK,	"TRYLOCK" }
 
-/*
- * xfs_iomap_t:  File system I/O map
- *
- * The iomap_bn field is expressed in 512-byte blocks, and is where the
- * mapping starts on disk.
- *
- * The iomap_offset, iomap_bsize and iomap_delta fields are in bytes.
- * iomap_offset is the offset of the mapping in the file itself.
- * iomap_bsize is the size of the mapping,  iomap_delta is the
- * desired data's offset into the mapping, given the offset supplied
- * to the file I/O map routine.
- *
- * When a request is made to read beyond the logical end of the object,
- * iomap_size may be set to 0, but iomap_offset and iomap_length should be set
- * to the actual amount of underlying storage that has been allocated, if any.
- */
-
-typedef struct xfs_iomap {
-	xfs_daddr_t		iomap_bn;	/* first 512B blk of mapping */
-	xfs_buftarg_t		*iomap_target;
-	xfs_off_t		iomap_offset;	/* offset of mapping, bytes */
-	xfs_off_t		iomap_bsize;	/* size of mapping, bytes */
-	xfs_off_t		iomap_delta;	/* offset into mapping, bytes */
-	iomap_flags_t		iomap_flags;
-} xfs_iomap_t;
-
 struct xfs_inode;
 struct xfs_bmbt_irec;
 
 extern int xfs_iomap(struct xfs_inode *, xfs_off_t, ssize_t, int,
-		     struct xfs_iomap *, int *);
-extern int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
-				  int, struct xfs_bmbt_irec *, int *, int);
-extern int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t, int,
-				 struct xfs_bmbt_irec *, int *);
-extern int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t,
-				struct xfs_bmbt_irec *, int *);
+		     struct xfs_bmbt_irec *, int *, int *);
 extern int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, size_t);
 
 #endif /* __XFS_IOMAP_H__*/
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 62efab2f3839..2b86f8610512 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -49,24 +49,40 @@ xfs_internal_inum(
 		 (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino)));
 }
 
-STATIC int
-xfs_bulkstat_one_iget(
-	xfs_mount_t	*mp,		/* mount point for filesystem */
-	xfs_ino_t	ino,		/* inode number to get data for */
-	xfs_daddr_t	bno,		/* starting bno of inode cluster */
-	xfs_bstat_t	*buf,		/* return buffer */
-	int		*stat)		/* BULKSTAT_RV_... */
+/*
+ * Return stat information for one inode.
+ * Return 0 if ok, else errno.
+ */
+int
+xfs_bulkstat_one_int(
+	struct xfs_mount	*mp,		/* mount point for filesystem */
+	xfs_ino_t		ino,		/* inode to get data for */
+	void __user		*buffer,	/* buffer to place output in */
+	int			ubsize,		/* size of buffer */
+	bulkstat_one_fmt_pf	formatter,	/* formatter, copy to user */
+	int			*ubused,	/* bytes used by me */
+	int			*stat)		/* BULKSTAT_RV_... */
 {
-	xfs_icdinode_t	*dic;	/* dinode core info pointer */
-	xfs_inode_t	*ip;		/* incore inode pointer */
-	struct inode	*inode;
-	int		error;
+	struct xfs_icdinode	*dic;		/* dinode core info pointer */
+	struct xfs_inode	*ip;		/* incore inode pointer */
+	struct inode		*inode;
+	struct xfs_bstat	*buf;		/* return buffer */
+	int			error = 0;	/* error value */
+
+	*stat = BULKSTAT_RV_NOTHING;
+
+	if (!buffer || xfs_internal_inum(mp, ino))
+		return XFS_ERROR(EINVAL);
+
+	buf = kmem_alloc(sizeof(*buf), KM_SLEEP | KM_MAYFAIL);
+	if (!buf)
+		return XFS_ERROR(ENOMEM);
 
 	error = xfs_iget(mp, NULL, ino,
-			 XFS_IGET_BULKSTAT, XFS_ILOCK_SHARED, &ip, bno);
+			 XFS_IGET_UNTRUSTED, XFS_ILOCK_SHARED, &ip);
 	if (error) {
 		*stat = BULKSTAT_RV_NOTHING;
-		return error;
+		goto out_free;
 	}
 
 	ASSERT(ip != NULL);
@@ -106,6 +122,7 @@ xfs_bulkstat_one_iget(
 	buf->bs_dmevmask = dic->di_dmevmask;
 	buf->bs_dmstate = dic->di_dmstate;
 	buf->bs_aextents = dic->di_anextents;
+	buf->bs_forkoff = XFS_IFORK_BOFF(ip);
 
 	switch (dic->di_format) {
 	case XFS_DINODE_FMT_DEV:
@@ -126,76 +143,16 @@ xfs_bulkstat_one_iget(
 		buf->bs_blocks = dic->di_nblocks + ip->i_delayed_blks;
 		break;
 	}
-
 	xfs_iput(ip, XFS_ILOCK_SHARED);
-	return error;
-}
 
-STATIC void
-xfs_bulkstat_one_dinode(
-	xfs_mount_t	*mp,		/* mount point for filesystem */
-	xfs_ino_t	ino,		/* inode number to get data for */
-	xfs_dinode_t	*dic,		/* dinode inode pointer */
-	xfs_bstat_t	*buf)		/* return buffer */
-{
-	/*
-	 * The inode format changed when we moved the link count and
-	 * made it 32 bits long.  If this is an old format inode,
-	 * convert it in memory to look like a new one.  If it gets
-	 * flushed to disk we will convert back before flushing or
-	 * logging it.  We zero out the new projid field and the old link
-	 * count field.  We'll handle clearing the pad field (the remains
-	 * of the old uuid field) when we actually convert the inode to
-	 * the new format. We don't change the version number so that we
-	 * can distinguish this from a real new format inode.
-	 */
-	if (dic->di_version == 1) {
-		buf->bs_nlink = be16_to_cpu(dic->di_onlink);
-		buf->bs_projid = 0;
-	} else {
-		buf->bs_nlink = be32_to_cpu(dic->di_nlink);
-		buf->bs_projid = be16_to_cpu(dic->di_projid);
-	}
+	error = formatter(buffer, ubsize, ubused, buf);
 
-	buf->bs_ino = ino;
-	buf->bs_mode = be16_to_cpu(dic->di_mode);
-	buf->bs_uid = be32_to_cpu(dic->di_uid);
-	buf->bs_gid = be32_to_cpu(dic->di_gid);
-	buf->bs_size = be64_to_cpu(dic->di_size);
-	buf->bs_atime.tv_sec = be32_to_cpu(dic->di_atime.t_sec);
-	buf->bs_atime.tv_nsec = be32_to_cpu(dic->di_atime.t_nsec);
-	buf->bs_mtime.tv_sec = be32_to_cpu(dic->di_mtime.t_sec);
-	buf->bs_mtime.tv_nsec = be32_to_cpu(dic->di_mtime.t_nsec);
-	buf->bs_ctime.tv_sec = be32_to_cpu(dic->di_ctime.t_sec);
-	buf->bs_ctime.tv_nsec = be32_to_cpu(dic->di_ctime.t_nsec);
-	buf->bs_xflags = xfs_dic2xflags(dic);
-	buf->bs_extsize = be32_to_cpu(dic->di_extsize) << mp->m_sb.sb_blocklog;
-	buf->bs_extents = be32_to_cpu(dic->di_nextents);
-	buf->bs_gen = be32_to_cpu(dic->di_gen);
-	memset(buf->bs_pad, 0, sizeof(buf->bs_pad));
-	buf->bs_dmevmask = be32_to_cpu(dic->di_dmevmask);
-	buf->bs_dmstate = be16_to_cpu(dic->di_dmstate);
-	buf->bs_aextents = be16_to_cpu(dic->di_anextents);
+	if (!error)
+		*stat = BULKSTAT_RV_DIDONE;
 
-	switch (dic->di_format) {
-	case XFS_DINODE_FMT_DEV:
-		buf->bs_rdev = xfs_dinode_get_rdev(dic);
-		buf->bs_blksize = BLKDEV_IOSIZE;
-		buf->bs_blocks = 0;
-		break;
-	case XFS_DINODE_FMT_LOCAL:
-	case XFS_DINODE_FMT_UUID:
-		buf->bs_rdev = 0;
-		buf->bs_blksize = mp->m_sb.sb_blocksize;
-		buf->bs_blocks = 0;
-		break;
-	case XFS_DINODE_FMT_EXTENTS:
-	case XFS_DINODE_FMT_BTREE:
-		buf->bs_rdev = 0;
-		buf->bs_blksize = mp->m_sb.sb_blocksize;
-		buf->bs_blocks = be64_to_cpu(dic->di_nblocks);
-		break;
-	}
+ out_free:
+	kmem_free(buf);
+	return error;
 }
 
 /* Return 0 on success or positive error */
@@ -215,118 +172,17 @@ xfs_bulkstat_one_fmt(
 	return 0;
 }
 
-/*
- * Return stat information for one inode.
- * Return 0 if ok, else errno.
- */
-int		   	    		/* error status */
-xfs_bulkstat_one_int(
-	xfs_mount_t	*mp,		/* mount point for filesystem */
-	xfs_ino_t	ino,		/* inode number to get data for */
-	void		__user *buffer,	/* buffer to place output in */
-	int		ubsize,		/* size of buffer */
-	bulkstat_one_fmt_pf formatter,	/* formatter, copy to user */
-	xfs_daddr_t	bno,		/* starting bno of inode cluster */
-	int		*ubused,	/* bytes used by me */
-	void		*dibuff,	/* on-disk inode buffer */
-	int		*stat)		/* BULKSTAT_RV_... */
-{
-	xfs_bstat_t	*buf;		/* return buffer */
-	int		error = 0;	/* error value */
-	xfs_dinode_t	*dip;		/* dinode inode pointer */
-
-	dip = (xfs_dinode_t *)dibuff;
-	*stat = BULKSTAT_RV_NOTHING;
-
-	if (!buffer || xfs_internal_inum(mp, ino))
-		return XFS_ERROR(EINVAL);
-
-	buf = kmem_alloc(sizeof(*buf), KM_SLEEP);
-
-	if (dip == NULL) {
-		/* We're not being passed a pointer to a dinode.  This happens
-		 * if BULKSTAT_FG_IGET is selected.  Do the iget.
-		 */
-		error = xfs_bulkstat_one_iget(mp, ino, bno, buf, stat);
-		if (error)
-			goto out_free;
-	} else {
-		xfs_bulkstat_one_dinode(mp, ino, dip, buf);
-	}
-
-	error = formatter(buffer, ubsize, ubused, buf);
-	if (error)
-		goto out_free;
-
-	*stat = BULKSTAT_RV_DIDONE;
-
- out_free:
-	kmem_free(buf);
-	return error;
-}
-
 int
 xfs_bulkstat_one(
 	xfs_mount_t	*mp,		/* mount point for filesystem */
 	xfs_ino_t	ino,		/* inode number to get data for */
 	void		__user *buffer,	/* buffer to place output in */
 	int		ubsize,		/* size of buffer */
-	void		*private_data,	/* my private data */
-	xfs_daddr_t	bno,		/* starting bno of inode cluster */
 	int		*ubused,	/* bytes used by me */
-	void		*dibuff,	/* on-disk inode buffer */
 	int		*stat)		/* BULKSTAT_RV_... */
 {
 	return xfs_bulkstat_one_int(mp, ino, buffer, ubsize,
-				    xfs_bulkstat_one_fmt, bno,
-				    ubused, dibuff, stat);
-}
-
-/*
- * Test to see whether we can use the ondisk inode directly, based
- * on the given bulkstat flags, filling in dipp accordingly.
- * Returns zero if the inode is dodgey.
- */
-STATIC int
-xfs_bulkstat_use_dinode(
-	xfs_mount_t	*mp,
-	int		flags,
-	xfs_buf_t	*bp,
-	int		clustidx,
-	xfs_dinode_t	**dipp)
-{
-	xfs_dinode_t	*dip;
-	unsigned int	aformat;
-
-	*dipp = NULL;
-	if (!bp || (flags & BULKSTAT_FG_IGET))
-		return 1;
-	dip = (xfs_dinode_t *)
-			xfs_buf_offset(bp, clustidx << mp->m_sb.sb_inodelog);
-	/*
-	 * Check the buffer containing the on-disk inode for di_mode == 0.
-	 * This is to prevent xfs_bulkstat from picking up just reclaimed
-	 * inodes that have their in-core state initialized but not flushed
-	 * to disk yet. This is a temporary hack that would require a proper
-	 * fix in the future.
-	 */
-	if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC ||
-	    !XFS_DINODE_GOOD_VERSION(dip->di_version) ||
-	    !dip->di_mode)
-		return 0;
-	if (flags & BULKSTAT_FG_QUICK) {
-		*dipp = dip;
-		return 1;
-	}
-	/* BULKSTAT_FG_INLINE: if attr fork is local, or not there, use it */
-	aformat = dip->di_aformat;
-	if ((XFS_DFORK_Q(dip) == 0) ||
-	    (aformat == XFS_DINODE_FMT_LOCAL) ||
-	    (aformat == XFS_DINODE_FMT_EXTENTS && !dip->di_anextents)) {
-		*dipp = dip;
-		return 1;
-	}
-	return 1;
+				    xfs_bulkstat_one_fmt, ubused, stat);
 }
 
 #define XFS_BULKSTAT_UBLEFT(ubleft)	((ubleft) >= statstruct_size)
@@ -340,10 +196,8 @@ xfs_bulkstat(
 	xfs_ino_t		*lastinop, /* last inode returned */
 	int			*ubcountp, /* size of buffer/count returned */
 	bulkstat_one_pf		formatter, /* func that'd fill a single buf */
-	void			*private_data,/* private data for formatter */
 	size_t			statstruct_size, /* sizeof struct filling */
 	char			__user *ubuffer, /* buffer with inode stats */
-	int			flags,	/* defined in xfs_itable.h */
 	int			*done)	/* 1 if there are more stats to get */
 {
 	xfs_agblock_t		agbno=0;/* allocation group block number */
@@ -378,14 +232,12 @@ xfs_bulkstat(
 	int			ubelem;	/* spaces used in user's buffer */
 	int			ubused;	/* bytes used by formatter */
 	xfs_buf_t		*bp;	/* ptr to on-disk inode cluster buf */
-	xfs_dinode_t		*dip;	/* ptr into bp for specific inode */
 
 	/*
 	 * Get the last inode value, see if there's nothing to do.
 	 */
 	ino = (xfs_ino_t)*lastinop;
 	lastino = ino;
-	dip = NULL;
 	agno = XFS_INO_TO_AGNO(mp, ino);
 	agino = XFS_INO_TO_AGINO(mp, ino);
 	if (agno >= mp->m_sb.sb_agcount ||
@@ -408,8 +260,10 @@ xfs_bulkstat(
 		(XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog);
 	nimask = ~(nicluster - 1);
 	nbcluster = nicluster >> mp->m_sb.sb_inopblog;
-	irbuf = kmem_zalloc_greedy(&irbsize, PAGE_SIZE, PAGE_SIZE * 4,
-				   KM_SLEEP | KM_MAYFAIL | KM_LARGE);
+	irbuf = kmem_zalloc_greedy(&irbsize, PAGE_SIZE, PAGE_SIZE * 4);
+	if (!irbuf)
+		return ENOMEM;
+
 	nirbuf = irbsize / sizeof(*irbuf);
 
 	/*
@@ -420,9 +274,7 @@ xfs_bulkstat(
 	while (XFS_BULKSTAT_UBLEFT(ubleft) && agno < mp->m_sb.sb_agcount) {
 		cond_resched();
 		bp = NULL;
-		down_read(&mp->m_peraglock);
 		error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp);
-		up_read(&mp->m_peraglock);
 		if (error) {
 			/*
 			 * Skip this allocation group and go to the next one.
@@ -610,37 +462,6 @@ xfs_bulkstat(
 							irbp->ir_startino) +
 						((chunkidx & nimask) >>
 						 mp->m_sb.sb_inopblog);
-
-					if (flags & (BULKSTAT_FG_QUICK |
-						     BULKSTAT_FG_INLINE)) {
-						int offset;
-
-						ino = XFS_AGINO_TO_INO(mp, agno,
-								       agino);
-						bno = XFS_AGB_TO_DADDR(mp, agno,
-								       agbno);
-
-						/*
-						 * Get the inode cluster buffer
-						 */
-						if (bp)
-							xfs_buf_relse(bp);
-
-						error = xfs_inotobp(mp, NULL, ino, &dip,
-								    &bp, &offset,
-								    XFS_IGET_BULKSTAT);
-
-						if (!error)
-							clustidx = offset / mp->m_sb.sb_inodesize;
-						if (XFS_TEST_ERROR(error != 0,
-								   mp, XFS_ERRTAG_BULKSTAT_READ_CHUNK,
-								   XFS_RANDOM_BULKSTAT_READ_CHUNK)) {
-							bp = NULL;
-							ubleft = 0;
-							rval = error;
-							break;
-						}
-					}
 				}
 				ino = XFS_AGINO_TO_INO(mp, agno, agino);
 				bno = XFS_AGB_TO_DADDR(mp, agno, agbno);
@@ -656,35 +477,13 @@ xfs_bulkstat(
 				 * when the chunk is used up.
 				 */
 				irbp->ir_freecount++;
-				if (!xfs_bulkstat_use_dinode(mp, flags, bp,
-							     clustidx, &dip)) {
-					lastino = ino;
-					continue;
-				}
-				/*
-				 * If we need to do an iget, cannot hold bp.
-				 * Drop it, until starting the next cluster.
-				 */
-				if ((flags & BULKSTAT_FG_INLINE) && !dip) {
-					if (bp)
-						xfs_buf_relse(bp);
-					bp = NULL;
-				}
 
 				/*
 				 * Get the inode and fill in a single buffer.
-				 * BULKSTAT_FG_QUICK uses dip to fill it in.
-				 * BULKSTAT_FG_IGET uses igets.
-				 * BULKSTAT_FG_INLINE uses dip if we have an
-				 * inline attr fork, else igets.
-				 * See: xfs_bulkstat_one & xfs_dm_bulkstat_one.
-				 * This is also used to count inodes/blks, etc
-				 * in xfs_qm_quotacheck.
 				 */
 				ubused = statstruct_size;
-				error = formatter(mp, ino, ubufp,
-						ubleft, private_data,
-						bno, &ubused, dip, &fmterror);
+				error = formatter(mp, ino, ubufp, ubleft,
+						  &ubused, &fmterror);
 				if (fmterror == BULKSTAT_RV_NOTHING) {
 					if (error && error != ENOENT &&
 						error != EINVAL) {
@@ -729,7 +528,7 @@ xfs_bulkstat(
 	/*
 	 * Done, we're either out of filesystem or space to put the data.
 	 */
-	kmem_free(irbuf);
+	kmem_free_large(irbuf);
 	*ubcountp = ubelem;
 	/*
 	 * Found some inodes, return them now and return the error next time.
@@ -776,8 +575,7 @@ xfs_bulkstat_single(
 	 */
 
 	ino = (xfs_ino_t)*lastinop;
-	error = xfs_bulkstat_one(mp, ino, buffer, sizeof(xfs_bstat_t),
-				 NULL, 0, NULL, NULL, &res);
+	error = xfs_bulkstat_one(mp, ino, buffer, sizeof(xfs_bstat_t), 0, &res);
 	if (error) {
 		/*
 		 * Special case way failed, do it the "long" way
@@ -786,8 +584,7 @@ xfs_bulkstat_single(
 		(*lastinop)--;
 		count = 1;
 		if (xfs_bulkstat(mp, lastinop, &count, xfs_bulkstat_one,
-				NULL, sizeof(xfs_bstat_t), buffer,
-				BULKSTAT_FG_IGET, done))
+				sizeof(xfs_bstat_t), buffer, done))
 			return error;
 		if (count == 0 || (xfs_ino_t)*lastinop != ino)
 			return error == EFSCORRUPTED ?
@@ -849,9 +646,7 @@ xfs_inumbers(
 	agbp = NULL;
 	while (left > 0 && agno < mp->m_sb.sb_agcount) {
 		if (agbp == NULL) {
-			down_read(&mp->m_peraglock);
 			error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp);
-			up_read(&mp->m_peraglock);
 			if (error) {
 				/*
 				 * If we can't read the AGI of this ag,
diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h
index 20792bf45946..97295d91d170 100644
--- a/fs/xfs/xfs_itable.h
+++ b/fs/xfs/xfs_itable.h
@@ -27,10 +27,7 @@ typedef int (*bulkstat_one_pf)(struct xfs_mount	*mp,
 			       xfs_ino_t	ino,
 			       void		__user *buffer,
 			       int		ubsize,
-			       void		*private_data,
-			       xfs_daddr_t	bno,
 			       int		*ubused,
-			       void		*dip,
 			       int		*stat);
 
 /*
@@ -41,13 +38,6 @@ typedef int (*bulkstat_one_pf)(struct xfs_mount	*mp,
 #define BULKSTAT_RV_GIVEUP	2
 
 /*
- * Values for bulkstat flag argument.
- */
-#define BULKSTAT_FG_IGET	0x1	/* Go through the buffer cache */
-#define BULKSTAT_FG_QUICK	0x2	/* No iget, walk the dinode cluster */
-#define BULKSTAT_FG_INLINE	0x4	/* No iget if inline attrs */
-
-/*
  * Return stat information in bulk (by-inode) for the filesystem.
  */
 int					/* error status */
@@ -56,10 +46,8 @@ xfs_bulkstat(
 	xfs_ino_t	*lastino,	/* last inode returned */
 	int		*count,		/* size of buffer/count returned */
 	bulkstat_one_pf formatter,	/* func that'd fill a single buf */
-	void		*private_data,	/* private data for formatter */
 	size_t		statstruct_size,/* sizeof struct that we're filling */
 	char		__user *ubuffer,/* buffer with inode stats */
-	int		flags,		/* flag to control access method */
 	int		*done);		/* 1 if there are more stats to get */
 
 int
@@ -82,9 +70,7 @@ xfs_bulkstat_one_int(
 	void			__user *buffer,
 	int			ubsize,
 	bulkstat_one_fmt_pf	formatter,
-	xfs_daddr_t		bno,
 	int			*ubused,
-	void			*dibuff,
 	int			*stat);
 
 int
@@ -93,10 +79,7 @@ xfs_bulkstat_one(
 	xfs_ino_t		ino,
 	void			__user *buffer,
 	int			ubsize,
-	void			*private_data,
-	xfs_daddr_t		bno,
 	int			*ubused,
-	void			*dibuff,
 	int			*stat);
 
 typedef int (*inumbers_fmt_pf)(
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 600b5b06aaeb..5215abc8023a 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -44,14 +44,8 @@
 
 kmem_zone_t	*xfs_log_ticket_zone;
 
-#define xlog_write_adv_cnt(ptr, len, off, bytes) \
-	{ (ptr) += (bytes); \
-	  (len) -= (bytes); \
-	  (off) += (bytes);}
-
 /* Local miscellaneous function prototypes */
-STATIC int	 xlog_bdstrat_cb(struct xfs_buf *);
-STATIC int	 xlog_commit_record(xfs_mount_t *mp, xlog_ticket_t *ticket,
+STATIC int	 xlog_commit_record(struct log *log, struct xlog_ticket *ticket,
 				    xlog_in_core_t **, xfs_lsn_t *);
 STATIC xlog_t *  xlog_alloc_log(xfs_mount_t	*mp,
 				xfs_buftarg_t	*log_target,
@@ -60,11 +54,6 @@ STATIC xlog_t *  xlog_alloc_log(xfs_mount_t	*mp,
 STATIC int	 xlog_space_left(xlog_t *log, int cycle, int bytes);
 STATIC int	 xlog_sync(xlog_t *log, xlog_in_core_t *iclog);
 STATIC void	 xlog_dealloc_log(xlog_t *log);
-STATIC int	 xlog_write(xfs_mount_t *mp, xfs_log_iovec_t region[],
-			    int nentries, xfs_log_ticket_t tic,
-			    xfs_lsn_t *start_lsn,
-			    xlog_in_core_t **commit_iclog,
-			    uint flags);
 
 /* local state machine functions */
 STATIC void xlog_state_done_syncing(xlog_in_core_t *iclog, int);
@@ -80,11 +69,6 @@ STATIC int  xlog_state_release_iclog(xlog_t		*log,
 STATIC void xlog_state_switch_iclogs(xlog_t		*log,
 				     xlog_in_core_t *iclog,
 				     int		eventual_size);
-STATIC int  xlog_state_sync(xlog_t			*log,
-			    xfs_lsn_t 			lsn,
-			    uint			flags,
-			    int				*log_flushed);
-STATIC int  xlog_state_sync_all(xlog_t *log, uint flags, int *log_flushed);
 STATIC void xlog_state_want_sync(xlog_t	*log, xlog_in_core_t *iclog);
 
 /* local functions to manipulate grant head */
@@ -99,16 +83,8 @@ STATIC int xlog_regrant_write_log_space(xlog_t		*log,
 STATIC void xlog_ungrant_log_space(xlog_t	 *log,
 				   xlog_ticket_t *ticket);
 
-
-/* local ticket functions */
-STATIC xlog_ticket_t	*xlog_ticket_alloc(xlog_t *log,
-					 int	unit_bytes,
-					 int	count,
-					 char	clientid,
-					 uint	flags);
-
 #if defined(DEBUG)
-STATIC void	xlog_verify_dest_ptr(xlog_t *log, __psint_t ptr);
+STATIC void	xlog_verify_dest_ptr(xlog_t *log, char *ptr);
 STATIC void	xlog_verify_grant_head(xlog_t *log, int equals);
 STATIC void	xlog_verify_iclog(xlog_t *log, xlog_in_core_t *iclog,
 				  int count, boolean_t syncing);
@@ -249,14 +225,14 @@ xlog_tic_add_region(xlog_ticket_t *tic, uint len, uint type)
  * out when the next write occurs.
  */
 xfs_lsn_t
-xfs_log_done(xfs_mount_t	*mp,
-	     xfs_log_ticket_t	xtic,
-	     void		**iclog,
-	     uint		flags)
+xfs_log_done(
+	struct xfs_mount	*mp,
+	struct xlog_ticket	*ticket,
+	struct xlog_in_core	**iclog,
+	uint			flags)
 {
-	xlog_t		*log    = mp->m_log;
-	xlog_ticket_t	*ticket = (xfs_log_ticket_t) xtic;
-	xfs_lsn_t	lsn	= 0;
+	struct log		*log = mp->m_log;
+	xfs_lsn_t		lsn = 0;
 
 	if (XLOG_FORCED_SHUTDOWN(log) ||
 	    /*
@@ -264,8 +240,7 @@ xfs_log_done(xfs_mount_t	*mp,
 	     * If we get an error, just continue and give back the log ticket.
 	     */
 	    (((ticket->t_flags & XLOG_TIC_INITED) == 0) &&
-	     (xlog_commit_record(mp, ticket,
-				 (xlog_in_core_t **)iclog, &lsn)))) {
+	     (xlog_commit_record(log, ticket, iclog, &lsn)))) {
 		lsn = (xfs_lsn_t) -1;
 		if (ticket->t_flags & XLOG_TIC_PERM_RESERV) {
 			flags |= XFS_LOG_REL_PERM_RESERV;
@@ -295,67 +270,8 @@ xfs_log_done(xfs_mount_t	*mp,
 	}
 
 	return lsn;
-}	/* xfs_log_done */
-
-
-/*
- * Force the in-core log to disk.  If flags == XFS_LOG_SYNC,
- *	the force is done synchronously.
- *
- * Asynchronous forces are implemented by setting the WANT_SYNC
- * bit in the appropriate in-core log and then returning.
- *
- * Synchronous forces are implemented with a signal variable. All callers
- * to force a given lsn to disk will wait on a the sv attached to the
- * specific in-core log.  When given in-core log finally completes its
- * write to disk, that thread will wake up all threads waiting on the
- * sv.
- */
-int
-_xfs_log_force(
-	xfs_mount_t	*mp,
-	xfs_lsn_t	lsn,
-	uint		flags,
-	int		*log_flushed)
-{
-	xlog_t		*log = mp->m_log;
-	int		dummy;
-
-	if (!log_flushed)
-		log_flushed = &dummy;
-
-	ASSERT(flags & XFS_LOG_FORCE);
-
-	XFS_STATS_INC(xs_log_force);
-
-	if (log->l_flags & XLOG_IO_ERROR)
-		return XFS_ERROR(EIO);
-	if (lsn == 0)
-		return xlog_state_sync_all(log, flags, log_flushed);
-	else
-		return xlog_state_sync(log, lsn, flags, log_flushed);
-}	/* _xfs_log_force */
-
-/*
- * Wrapper for _xfs_log_force(), to be used when caller doesn't care
- * about errors or whether the log was flushed or not. This is the normal
- * interface to use when trying to unpin items or move the log forward.
- */
-void
-xfs_log_force(
-	xfs_mount_t	*mp,
-	xfs_lsn_t	lsn,
-	uint		flags)
-{
-	int	error;
-	error = _xfs_log_force(mp, lsn, flags, NULL);
-	if (error) {
-		xfs_fs_cmn_err(CE_WARN, mp, "xfs_log_force: "
-			"error %d returned.", error);
-	}
 }
 
-
 /*
  * Attaches a new iclog I/O completion callback routine during
  * transaction commit.  If the log is in error state, a non-zero
@@ -363,11 +279,11 @@ xfs_log_force(
  * executing the callback at an appropriate time.
  */
 int
-xfs_log_notify(xfs_mount_t	  *mp,		/* mount of partition */
-	       void		  *iclog_hndl,	/* iclog to hang callback off */
-	       xfs_log_callback_t *cb)
+xfs_log_notify(
+	struct xfs_mount	*mp,
+	struct xlog_in_core	*iclog,
+	xfs_log_callback_t	*cb)
 {
-	xlog_in_core_t	  *iclog = (xlog_in_core_t *)iclog_hndl;
 	int	abortflg;
 
 	spin_lock(&iclog->ic_callback_lock);
@@ -381,16 +297,14 @@ xfs_log_notify(xfs_mount_t	  *mp,		/* mount of partition */
 	}
 	spin_unlock(&iclog->ic_callback_lock);
 	return abortflg;
-}	/* xfs_log_notify */
+}
 
 int
-xfs_log_release_iclog(xfs_mount_t *mp,
-		      void	  *iclog_hndl)
+xfs_log_release_iclog(
+	struct xfs_mount	*mp,
+	struct xlog_in_core	*iclog)
 {
-	xlog_t *log = mp->m_log;
-	xlog_in_core_t	  *iclog = (xlog_in_core_t *)iclog_hndl;
-
-	if (xlog_state_release_iclog(log, iclog)) {
+	if (xlog_state_release_iclog(mp->m_log, iclog)) {
 		xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
 		return EIO;
 	}
@@ -409,17 +323,18 @@ xfs_log_release_iclog(xfs_mount_t *mp,
  * reservation, we prevent over allocation problems.
  */
 int
-xfs_log_reserve(xfs_mount_t	 *mp,
-		int		 unit_bytes,
-		int		 cnt,
-		xfs_log_ticket_t *ticket,
-		__uint8_t	 client,
-		uint		 flags,
-		uint		 t_type)
+xfs_log_reserve(
+	struct xfs_mount	*mp,
+	int		 	unit_bytes,
+	int		 	cnt,
+	struct xlog_ticket	**ticket,
+	__uint8_t	 	client,
+	uint		 	flags,
+	uint		 	t_type)
 {
-	xlog_t		*log = mp->m_log;
-	xlog_ticket_t	*internal_ticket;
-	int		retval = 0;
+	struct log		*log = mp->m_log;
+	struct xlog_ticket	*internal_ticket;
+	int			retval = 0;
 
 	ASSERT(client == XFS_TRANSACTION || client == XFS_LOG);
 	ASSERT((flags & XFS_LOG_NOSLEEP) == 0);
@@ -432,7 +347,16 @@ xfs_log_reserve(xfs_mount_t	 *mp,
 
 	if (*ticket != NULL) {
 		ASSERT(flags & XFS_LOG_PERM_RESERV);
-		internal_ticket = (xlog_ticket_t *)*ticket;
+		internal_ticket = *ticket;
+
+		/*
+		 * this is a new transaction on the ticket, so we need to
+		 * change the transaction ID so that the next transaction has a
+		 * different TID in the log. Just add one to the existing tid
+		 * so that we can see chains of rolling transactions in the log
+		 * easily.
+		 */
+		internal_ticket->t_tid++;
 
 		trace_xfs_log_reserve(log, internal_ticket);
 
@@ -441,7 +365,8 @@ xfs_log_reserve(xfs_mount_t	 *mp,
 	} else {
 		/* may sleep if need to allocate more tickets */
 		internal_ticket = xlog_ticket_alloc(log, unit_bytes, cnt,
-						  client, flags);
+						  client, flags,
+						  KM_SLEEP|KM_MAYFAIL);
 		if (!internal_ticket)
 			return XFS_ERROR(ENOMEM);
 		internal_ticket->t_trans_type = t_type;
@@ -526,6 +451,13 @@ xfs_log_mount(
 	/* Normal transactions can now occur */
 	mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY;
 
+	/*
+	 * Now the log has been fully initialised and we know were our
+	 * space grant counters are, we can initialise the permanent ticket
+	 * needed for delayed logging to work.
+	 */
+	xlog_cil_init_post_recovery(mp->m_log);
+
 	return 0;
 
 out_destroy_ail:
@@ -583,18 +515,10 @@ xfs_log_unmount_write(xfs_mount_t *mp)
 #ifdef DEBUG
 	xlog_in_core_t	 *first_iclog;
 #endif
-	xfs_log_iovec_t  reg[1];
-	xfs_log_ticket_t tic = NULL;
+	xlog_ticket_t	*tic = NULL;
 	xfs_lsn_t	 lsn;
 	int		 error;
 
-	/* the data section must be 32 bit size aligned */
-	struct {
-	    __uint16_t magic;
-	    __uint16_t pad1;
-	    __uint32_t pad2; /* may as well make it 64 bits */
-	} magic = { XLOG_UNMOUNT_TYPE, 0, 0 };
-
 	/*
 	 * Don't write out unmount record on read-only mounts.
 	 * Or, if we are doing a forced umount (typically because of IO errors).
@@ -602,7 +526,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
 	if (mp->m_flags & XFS_MOUNT_RDONLY)
 		return 0;
 
-	error = _xfs_log_force(mp, 0, XFS_LOG_FORCE|XFS_LOG_SYNC, NULL);
+	error = _xfs_log_force(mp, XFS_LOG_SYNC, NULL);
 	ASSERT(error || !(XLOG_FORCED_SHUTDOWN(log)));
 
 #ifdef DEBUG
@@ -616,16 +540,30 @@ xfs_log_unmount_write(xfs_mount_t *mp)
 	} while (iclog != first_iclog);
 #endif
 	if (! (XLOG_FORCED_SHUTDOWN(log))) {
-		reg[0].i_addr = (void*)&magic;
-		reg[0].i_len  = sizeof(magic);
-		XLOG_VEC_SET_TYPE(&reg[0], XLOG_REG_TYPE_UNMOUNT);
-
 		error = xfs_log_reserve(mp, 600, 1, &tic,
 					XFS_LOG, 0, XLOG_UNMOUNT_REC_TYPE);
 		if (!error) {
+			/* the data section must be 32 bit size aligned */
+			struct {
+			    __uint16_t magic;
+			    __uint16_t pad1;
+			    __uint32_t pad2; /* may as well make it 64 bits */
+			} magic = {
+				.magic = XLOG_UNMOUNT_TYPE,
+			};
+			struct xfs_log_iovec reg = {
+				.i_addr = (void *)&magic,
+				.i_len = sizeof(magic),
+				.i_type = XLOG_REG_TYPE_UNMOUNT,
+			};
+			struct xfs_log_vec vec = {
+				.lv_niovecs = 1,
+				.lv_iovecp = &reg,
+			};
+
 			/* remove inited flag */
-			((xlog_ticket_t *)tic)->t_flags = 0;
-			error = xlog_write(mp, reg, 1, tic, &lsn,
+			tic->t_flags = 0;
+			error = xlog_write(log, &vec, tic, &lsn,
 					   NULL, XLOG_UNMOUNT_TRANS);
 			/*
 			 * At this point, we're umounting anyway,
@@ -715,30 +653,54 @@ xfs_log_unmount(xfs_mount_t *mp)
 	xlog_dealloc_log(mp->m_log);
 }
 
+void
+xfs_log_item_init(
+	struct xfs_mount	*mp,
+	struct xfs_log_item	*item,
+	int			type,
+	struct xfs_item_ops	*ops)
+{
+	item->li_mountp = mp;
+	item->li_ailp = mp->m_ail;
+	item->li_type = type;
+	item->li_ops = ops;
+	item->li_lv = NULL;
+
+	INIT_LIST_HEAD(&item->li_ail);
+	INIT_LIST_HEAD(&item->li_cil);
+}
+
 /*
  * Write region vectors to log.  The write happens using the space reservation
  * of the ticket (tic).  It is not a requirement that all writes for a given
- * transaction occur with one call to xfs_log_write().
+ * transaction occur with one call to xfs_log_write(). However, it is important
+ * to note that the transaction reservation code makes an assumption about the
+ * number of log headers a transaction requires that may be violated if you
+ * don't pass all the transaction vectors in one call....
  */
 int
-xfs_log_write(xfs_mount_t *	mp,
-	      xfs_log_iovec_t	reg[],
-	      int		nentries,
-	      xfs_log_ticket_t	tic,
-	      xfs_lsn_t		*start_lsn)
+xfs_log_write(
+	struct xfs_mount	*mp,
+	struct xfs_log_iovec	reg[],
+	int			nentries,
+	struct xlog_ticket	*tic,
+	xfs_lsn_t		*start_lsn)
 {
-	int	error;
-	xlog_t *log = mp->m_log;
+	struct log		*log = mp->m_log;
+	int			error;
+	struct xfs_log_vec	vec = {
+		.lv_niovecs = nentries,
+		.lv_iovecp = reg,
+	};
 
 	if (XLOG_FORCED_SHUTDOWN(log))
 		return XFS_ERROR(EIO);
 
-	if ((error = xlog_write(mp, reg, nentries, tic, start_lsn, NULL, 0))) {
+	error = xlog_write(log, &vec, tic, start_lsn, NULL, 0);
+	if (error)
 		xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
-	}
 	return error;
-}	/* xfs_log_write */
-
+}
 
 void
 xfs_log_move_tail(xfs_mount_t	*mp,
@@ -812,9 +774,16 @@ xfs_log_move_tail(xfs_mount_t	*mp,
 
 /*
  * Determine if we have a transaction that has gone to disk
- * that needs to be covered. Log activity needs to be idle (no AIL and
- * nothing in the iclogs). And, we need to be in the right state indicating
- * something has gone out.
+ * that needs to be covered. To begin the transition to the idle state
+ * firstly the log needs to be idle (no AIL and nothing in the iclogs).
+ * If we are then in a state where covering is needed, the caller is informed
+ * that dummy transactions are required to move the log into the idle state.
+ *
+ * Because this is called as part of the sync process, we should also indicate
+ * that dummy transactions should be issued in anything but the covered or
+ * idle states. This ensures that the log tail is accurately reflected in
+ * the log at the end of the sync, hence if a crash occurrs avoids replay
+ * of transactions where the metadata is already on disk.
  */
 int
 xfs_log_need_covered(xfs_mount_t *mp)
@@ -826,17 +795,24 @@ xfs_log_need_covered(xfs_mount_t *mp)
 		return 0;
 
 	spin_lock(&log->l_icloglock);
-	if (((log->l_covered_state == XLOG_STATE_COVER_NEED) ||
-		(log->l_covered_state == XLOG_STATE_COVER_NEED2))
-			&& !xfs_trans_ail_tail(log->l_ailp)
-			&& xlog_iclogs_empty(log)) {
-		if (log->l_covered_state == XLOG_STATE_COVER_NEED)
-			log->l_covered_state = XLOG_STATE_COVER_DONE;
-		else {
-			ASSERT(log->l_covered_state == XLOG_STATE_COVER_NEED2);
-			log->l_covered_state = XLOG_STATE_COVER_DONE2;
+	switch (log->l_covered_state) {
+	case XLOG_STATE_COVER_DONE:
+	case XLOG_STATE_COVER_DONE2:
+	case XLOG_STATE_COVER_IDLE:
+		break;
+	case XLOG_STATE_COVER_NEED:
+	case XLOG_STATE_COVER_NEED2:
+		if (!xfs_trans_ail_tail(log->l_ailp) &&
+		    xlog_iclogs_empty(log)) {
+			if (log->l_covered_state == XLOG_STATE_COVER_NEED)
+				log->l_covered_state = XLOG_STATE_COVER_DONE;
+			else
+				log->l_covered_state = XLOG_STATE_COVER_DONE2;
 		}
+		/* FALLTHRU */
+	default:
 		needed = 1;
+		break;
 	}
 	spin_unlock(&log->l_icloglock);
 	return needed;
@@ -988,35 +964,6 @@ xlog_iodone(xfs_buf_t *bp)
 }	/* xlog_iodone */
 
 /*
- * The bdstrat callback function for log bufs. This gives us a central
- * place to trap bufs in case we get hit by a log I/O error and need to
- * shutdown. Actually, in practice, even when we didn't get a log error,
- * we transition the iclogs to IOERROR state *after* flushing all existing
- * iclogs to disk. This is because we don't want anymore new transactions to be
- * started or completed afterwards.
- */
-STATIC int
-xlog_bdstrat_cb(struct xfs_buf *bp)
-{
-	xlog_in_core_t *iclog;
-
-	iclog = XFS_BUF_FSPRIVATE(bp, xlog_in_core_t *);
-
-	if ((iclog->ic_state & XLOG_STATE_IOERROR) == 0) {
-	  /* note for irix bstrat will need  struct bdevsw passed
-	   * Fix the following macro if the code ever is merged
-	   */
-	    XFS_bdstrat(bp);
-		return 0;
-	}
-
-	XFS_BUF_ERROR(bp, EIO);
-	XFS_BUF_STALE(bp);
-	xfs_biodone(bp);
-	return XFS_ERROR(EIO);
-}
-
-/*
  * Return size of each in-core log record buffer.
  *
  * All machines get 8 x 32kB buffers by default, unless tuned otherwise.
@@ -1102,6 +1049,7 @@ xlog_alloc_log(xfs_mount_t	*mp,
 	int			i;
 	int			iclogsize;
 	int			error = ENOMEM;
+	uint			log2_size = 0;
 
 	log = kmem_zalloc(sizeof(xlog_t), KM_MAYFAIL);
 	if (!log) {
@@ -1127,29 +1075,30 @@ xlog_alloc_log(xfs_mount_t	*mp,
 
 	error = EFSCORRUPTED;
 	if (xfs_sb_version_hassector(&mp->m_sb)) {
-		log->l_sectbb_log = mp->m_sb.sb_logsectlog - BBSHIFT;
-		if (log->l_sectbb_log < 0 ||
-		    log->l_sectbb_log > mp->m_sectbb_log) {
-			xlog_warn("XFS: Log sector size (0x%x) out of range.",
-						log->l_sectbb_log);
+	        log2_size = mp->m_sb.sb_logsectlog;
+		if (log2_size < BBSHIFT) {
+			xlog_warn("XFS: Log sector size too small "
+				"(0x%x < 0x%x)", log2_size, BBSHIFT);
 			goto out_free_log;
 		}
 
-		/* for larger sector sizes, must have v2 or external log */
-		if (log->l_sectbb_log != 0 &&
-		    (log->l_logBBstart != 0 &&
-		     !xfs_sb_version_haslogv2(&mp->m_sb))) {
-			xlog_warn("XFS: log sector size (0x%x) invalid "
-				  "for configuration.", log->l_sectbb_log);
+	        log2_size -= BBSHIFT;
+		if (log2_size > mp->m_sectbb_log) {
+			xlog_warn("XFS: Log sector size too large "
+				"(0x%x > 0x%x)", log2_size, mp->m_sectbb_log);
 			goto out_free_log;
 		}
-		if (mp->m_sb.sb_logsectlog < BBSHIFT) {
-			xlog_warn("XFS: Log sector log (0x%x) too small.",
-						mp->m_sb.sb_logsectlog);
+
+		/* for larger sector sizes, must have v2 or external log */
+		if (log2_size && log->l_logBBstart > 0 &&
+			    !xfs_sb_version_haslogv2(&mp->m_sb)) {
+
+			xlog_warn("XFS: log sector size (0x%x) invalid "
+				  "for configuration.", log2_size);
 			goto out_free_log;
 		}
 	}
-	log->l_sectbb_mask = (1 << log->l_sectbb_log) - 1;
+	log->l_sectBBsize = 1 << log2_size;
 
 	xlog_get_iclog_buffer_size(mp, log);
 
@@ -1158,7 +1107,6 @@ xlog_alloc_log(xfs_mount_t	*mp,
 	if (!bp)
 		goto out_free_log;
 	XFS_BUF_SET_IODONE_FUNC(bp, xlog_iodone);
-	XFS_BUF_SET_BDSTRAT_FUNC(bp, xlog_bdstrat_cb);
 	XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1);
 	ASSERT(XFS_BUF_ISBUSY(bp));
 	ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
@@ -1196,7 +1144,6 @@ xlog_alloc_log(xfs_mount_t	*mp,
 		if (!XFS_BUF_CPSEMA(bp))
 			ASSERT(0);
 		XFS_BUF_SET_IODONE_FUNC(bp, xlog_iodone);
-		XFS_BUF_SET_BDSTRAT_FUNC(bp, xlog_bdstrat_cb);
 		XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1);
 		iclog->ic_bp = bp;
 		iclog->ic_data = bp->b_addr;
@@ -1231,6 +1178,9 @@ xlog_alloc_log(xfs_mount_t	*mp,
 	*iclogp = log->l_iclog;			/* complete ring */
 	log->l_iclog->ic_prev = prev_iclog;	/* re-write 1st prev ptr */
 
+	error = xlog_cil_init(log);
+	if (error)
+		goto out_free_iclog;
 	return log;
 
 out_free_iclog:
@@ -1258,26 +1208,31 @@ out:
  * ticket.  Return the lsn of the commit record.
  */
 STATIC int
-xlog_commit_record(xfs_mount_t  *mp,
-		   xlog_ticket_t *ticket,
-		   xlog_in_core_t **iclog,
-		   xfs_lsn_t	*commitlsnp)
+xlog_commit_record(
+	struct log		*log,
+	struct xlog_ticket	*ticket,
+	struct xlog_in_core	**iclog,
+	xfs_lsn_t		*commitlsnp)
 {
-	int		error;
-	xfs_log_iovec_t	reg[1];
-
-	reg[0].i_addr = NULL;
-	reg[0].i_len = 0;
-	XLOG_VEC_SET_TYPE(&reg[0], XLOG_REG_TYPE_COMMIT);
+	struct xfs_mount *mp = log->l_mp;
+	int	error;
+	struct xfs_log_iovec reg = {
+		.i_addr = NULL,
+		.i_len = 0,
+		.i_type = XLOG_REG_TYPE_COMMIT,
+	};
+	struct xfs_log_vec vec = {
+		.lv_niovecs = 1,
+		.lv_iovecp = &reg,
+	};
 
 	ASSERT_ALWAYS(iclog);
-	if ((error = xlog_write(mp, reg, 1, ticket, commitlsnp,
-			       iclog, XLOG_COMMIT_TRANS))) {
+	error = xlog_write(log, &vec, ticket, commitlsnp, iclog,
+					XLOG_COMMIT_TRANS);
+	if (error)
 		xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
-	}
 	return error;
-}	/* xlog_commit_record */
-
+}
 
 /*
  * Push on the buffer cache code if we ever use more than 75% of the on-disk
@@ -1343,6 +1298,37 @@ xlog_grant_push_ail(xfs_mount_t	*mp,
 	    xfs_trans_ail_push(log->l_ailp, threshold_lsn);
 }	/* xlog_grant_push_ail */
 
+/*
+ * The bdstrat callback function for log bufs. This gives us a central
+ * place to trap bufs in case we get hit by a log I/O error and need to
+ * shutdown. Actually, in practice, even when we didn't get a log error,
+ * we transition the iclogs to IOERROR state *after* flushing all existing
+ * iclogs to disk. This is because we don't want anymore new transactions to be
+ * started or completed afterwards.
+ */
+STATIC int
+xlog_bdstrat(
+	struct xfs_buf		*bp)
+{
+	struct xlog_in_core	*iclog;
+
+	iclog = XFS_BUF_FSPRIVATE(bp, xlog_in_core_t *);
+	if (iclog->ic_state & XLOG_STATE_IOERROR) {
+		XFS_BUF_ERROR(bp, EIO);
+		XFS_BUF_STALE(bp);
+		xfs_biodone(bp);
+		/*
+		 * It would seem logical to return EIO here, but we rely on
+		 * the log state machine to propagate I/O errors instead of
+		 * doing it here.
+		 */
+		return 0;
+	}
+
+	bp->b_flags |= _XBF_RUN_QUEUES;
+	xfs_buf_iorequest(bp);
+	return 0;
+}
 
 /*
  * Flush out the in-core log (iclog) to the on-disk log in an asynchronous 
@@ -1462,7 +1448,7 @@ xlog_sync(xlog_t		*log,
 	 */
 	XFS_BUF_WRITE(bp);
 
-	if ((error = XFS_bwrite(bp))) {
+	if ((error = xlog_bdstrat(bp))) {
 		xfs_ioerror_alert("xlog_sync", log->l_mp, bp,
 				  XFS_BUF_ADDR(bp));
 		return error;
@@ -1502,7 +1488,7 @@ xlog_sync(xlog_t		*log,
 		/* account for internal log which doesn't start at block #0 */
 		XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart);
 		XFS_BUF_WRITE(bp);
-		if ((error = XFS_bwrite(bp))) {
+		if ((error = xlog_bdstrat(bp))) {
 			xfs_ioerror_alert("xlog_sync (split)", log->l_mp,
 					  bp, XFS_BUF_ADDR(bp));
 			return error;
@@ -1521,6 +1507,8 @@ xlog_dealloc_log(xlog_t *log)
 	xlog_in_core_t	*iclog, *next_iclog;
 	int		i;
 
+	xlog_cil_destroy(log);
+
 	iclog = log->l_iclog;
 	for (i=0; i<log->l_iclog_bufs; i++) {
 		sv_destroy(&iclog->ic_force_wait);
@@ -1563,8 +1551,10 @@ xlog_state_finish_copy(xlog_t		*log,
  * print out info relating to regions written which consume
  * the reservation
  */
-STATIC void
-xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket)
+void
+xlog_print_tic_res(
+	struct xfs_mount	*mp,
+	struct xlog_ticket	*ticket)
 {
 	uint i;
 	uint ophdr_spc = ticket->t_res_num_ophdrs * (uint)sizeof(xlog_op_header_t);
@@ -1664,6 +1654,196 @@ xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket)
 			    "bad-rtype" : res_type_str[r_type-1]),
 			    ticket->t_res_arr[i].r_len);
 	}
+
+	xfs_cmn_err(XFS_PTAG_LOGRES, CE_ALERT, mp,
+		"xfs_log_write: reservation ran out. Need to up reservation");
+	xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+}
+
+/*
+ * Calculate the potential space needed by the log vector.  Each region gets
+ * its own xlog_op_header_t and may need to be double word aligned.
+ */
+static int
+xlog_write_calc_vec_length(
+	struct xlog_ticket	*ticket,
+	struct xfs_log_vec	*log_vector)
+{
+	struct xfs_log_vec	*lv;
+	int			headers = 0;
+	int			len = 0;
+	int			i;
+
+	/* acct for start rec of xact */
+	if (ticket->t_flags & XLOG_TIC_INITED)
+		headers++;
+
+	for (lv = log_vector; lv; lv = lv->lv_next) {
+		headers += lv->lv_niovecs;
+
+		for (i = 0; i < lv->lv_niovecs; i++) {
+			struct xfs_log_iovec	*vecp = &lv->lv_iovecp[i];
+
+			len += vecp->i_len;
+			xlog_tic_add_region(ticket, vecp->i_len, vecp->i_type);
+		}
+	}
+
+	ticket->t_res_num_ophdrs += headers;
+	len += headers * sizeof(struct xlog_op_header);
+
+	return len;
+}
+
+/*
+ * If first write for transaction, insert start record  We can't be trying to
+ * commit if we are inited.  We can't have any "partial_copy" if we are inited.
+ */
+static int
+xlog_write_start_rec(
+	struct xlog_op_header	*ophdr,
+	struct xlog_ticket	*ticket)
+{
+	if (!(ticket->t_flags & XLOG_TIC_INITED))
+		return 0;
+
+	ophdr->oh_tid	= cpu_to_be32(ticket->t_tid);
+	ophdr->oh_clientid = ticket->t_clientid;
+	ophdr->oh_len = 0;
+	ophdr->oh_flags = XLOG_START_TRANS;
+	ophdr->oh_res2 = 0;
+
+	ticket->t_flags &= ~XLOG_TIC_INITED;
+
+	return sizeof(struct xlog_op_header);
+}
+
+static xlog_op_header_t *
+xlog_write_setup_ophdr(
+	struct log		*log,
+	struct xlog_op_header	*ophdr,
+	struct xlog_ticket	*ticket,
+	uint			flags)
+{
+	ophdr->oh_tid = cpu_to_be32(ticket->t_tid);
+	ophdr->oh_clientid = ticket->t_clientid;
+	ophdr->oh_res2 = 0;
+
+	/* are we copying a commit or unmount record? */
+	ophdr->oh_flags = flags;
+
+	/*
+	 * We've seen logs corrupted with bad transaction client ids.  This
+	 * makes sure that XFS doesn't generate them on.  Turn this into an EIO
+	 * and shut down the filesystem.
+	 */
+	switch (ophdr->oh_clientid)  {
+	case XFS_TRANSACTION:
+	case XFS_VOLUME:
+	case XFS_LOG:
+		break;
+	default:
+		xfs_fs_cmn_err(CE_WARN, log->l_mp,
+			"Bad XFS transaction clientid 0x%x in ticket 0x%p",
+			ophdr->oh_clientid, ticket);
+		return NULL;
+	}
+
+	return ophdr;
+}
+
+/*
+ * Set up the parameters of the region copy into the log. This has
+ * to handle region write split across multiple log buffers - this
+ * state is kept external to this function so that this code can
+ * can be written in an obvious, self documenting manner.
+ */
+static int
+xlog_write_setup_copy(
+	struct xlog_ticket	*ticket,
+	struct xlog_op_header	*ophdr,
+	int			space_available,
+	int			space_required,
+	int			*copy_off,
+	int			*copy_len,
+	int			*last_was_partial_copy,
+	int			*bytes_consumed)
+{
+	int			still_to_copy;
+
+	still_to_copy = space_required - *bytes_consumed;
+	*copy_off = *bytes_consumed;
+
+	if (still_to_copy <= space_available) {
+		/* write of region completes here */
+		*copy_len = still_to_copy;
+		ophdr->oh_len = cpu_to_be32(*copy_len);
+		if (*last_was_partial_copy)
+			ophdr->oh_flags |= (XLOG_END_TRANS|XLOG_WAS_CONT_TRANS);
+		*last_was_partial_copy = 0;
+		*bytes_consumed = 0;
+		return 0;
+	}
+
+	/* partial write of region, needs extra log op header reservation */
+	*copy_len = space_available;
+	ophdr->oh_len = cpu_to_be32(*copy_len);
+	ophdr->oh_flags |= XLOG_CONTINUE_TRANS;
+	if (*last_was_partial_copy)
+		ophdr->oh_flags |= XLOG_WAS_CONT_TRANS;
+	*bytes_consumed += *copy_len;
+	(*last_was_partial_copy)++;
+
+	/* account for new log op header */
+	ticket->t_curr_res -= sizeof(struct xlog_op_header);
+	ticket->t_res_num_ophdrs++;
+
+	return sizeof(struct xlog_op_header);
+}
+
+static int
+xlog_write_copy_finish(
+	struct log		*log,
+	struct xlog_in_core	*iclog,
+	uint			flags,
+	int			*record_cnt,
+	int			*data_cnt,
+	int			*partial_copy,
+	int			*partial_copy_len,
+	int			log_offset,
+	struct xlog_in_core	**commit_iclog)
+{
+	if (*partial_copy) {
+		/*
+		 * This iclog has already been marked WANT_SYNC by
+		 * xlog_state_get_iclog_space.
+		 */
+		xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt);
+		*record_cnt = 0;
+		*data_cnt = 0;
+		return xlog_state_release_iclog(log, iclog);
+	}
+
+	*partial_copy = 0;
+	*partial_copy_len = 0;
+
+	if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) {
+		/* no more space in this iclog - push it. */
+		xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt);
+		*record_cnt = 0;
+		*data_cnt = 0;
+
+		spin_lock(&log->l_icloglock);
+		xlog_state_want_sync(log, iclog);
+		spin_unlock(&log->l_icloglock);
+
+		if (!commit_iclog)
+			return xlog_state_release_iclog(log, iclog);
+		ASSERT(flags & XLOG_COMMIT_TRANS);
+		*commit_iclog = iclog;
+	}
+
+	return 0;
 }
 
 /*
@@ -1706,211 +1886,163 @@ xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket)
  *	we don't update ic_offset until the end when we know exactly how many
  *	bytes have been written out.
  */
-STATIC int
-xlog_write(xfs_mount_t *	mp,
-	   xfs_log_iovec_t	reg[],
-	   int			nentries,
-	   xfs_log_ticket_t	tic,
-	   xfs_lsn_t		*start_lsn,
-	   xlog_in_core_t	**commit_iclog,
-	   uint			flags)
-{
-    xlog_t	     *log = mp->m_log;
-    xlog_ticket_t    *ticket = (xlog_ticket_t *)tic;
-    xlog_in_core_t   *iclog = NULL;  /* ptr to current in-core log */
-    xlog_op_header_t *logop_head;    /* ptr to log operation header */
-    __psint_t	     ptr;	     /* copy address into data region */
-    int		     len;	     /* # xlog_write() bytes 2 still copy */
-    int		     index;	     /* region index currently copying */
-    int		     log_offset;     /* offset (from 0) into data region */
-    int		     start_rec_copy; /* # bytes to copy for start record */
-    int		     partial_copy;   /* did we split a region? */
-    int		     partial_copy_len;/* # bytes copied if split region */
-    int		     need_copy;	     /* # bytes need to memcpy this region */
-    int		     copy_len;	     /* # bytes actually memcpy'ing */
-    int		     copy_off;	     /* # bytes from entry start */
-    int		     contwr;	     /* continued write of in-core log? */
-    int		     error;
-    int		     record_cnt = 0, data_cnt = 0;
-
-    partial_copy_len = partial_copy = 0;
-
-    /* Calculate potential maximum space.  Each region gets its own
-     * xlog_op_header_t and may need to be double word aligned.
-     */
-    len = 0;
-    if (ticket->t_flags & XLOG_TIC_INITED) {    /* acct for start rec of xact */
-	len += sizeof(xlog_op_header_t);
-	ticket->t_res_num_ophdrs++;
-    }
-
-    for (index = 0; index < nentries; index++) {
-	len += sizeof(xlog_op_header_t);	    /* each region gets >= 1 */
-	ticket->t_res_num_ophdrs++;
-	len += reg[index].i_len;
-	xlog_tic_add_region(ticket, reg[index].i_len, reg[index].i_type);
-    }
-    contwr = *start_lsn = 0;
+int
+xlog_write(
+	struct log		*log,
+	struct xfs_log_vec	*log_vector,
+	struct xlog_ticket	*ticket,
+	xfs_lsn_t		*start_lsn,
+	struct xlog_in_core	**commit_iclog,
+	uint			flags)
+{
+	struct xlog_in_core	*iclog = NULL;
+	struct xfs_log_iovec	*vecp;
+	struct xfs_log_vec	*lv;
+	int			len;
+	int			index;
+	int			partial_copy = 0;
+	int			partial_copy_len = 0;
+	int			contwr = 0;
+	int			record_cnt = 0;
+	int			data_cnt = 0;
+	int			error;
+
+	*start_lsn = 0;
+
+	len = xlog_write_calc_vec_length(ticket, log_vector);
+	if (log->l_cilp) {
+		/*
+		 * Region headers and bytes are already accounted for.
+		 * We only need to take into account start records and
+		 * split regions in this function.
+		 */
+		if (ticket->t_flags & XLOG_TIC_INITED)
+			ticket->t_curr_res -= sizeof(xlog_op_header_t);
 
-    if (ticket->t_curr_res < len) {
-	xlog_print_tic_res(mp, ticket);
-#ifdef DEBUG
-	xlog_panic(
-		"xfs_log_write: reservation ran out. Need to up reservation");
-#else
-	/* Customer configurable panic */
-	xfs_cmn_err(XFS_PTAG_LOGRES, CE_ALERT, mp,
-		"xfs_log_write: reservation ran out. Need to up reservation");
-	/* If we did not panic, shutdown the filesystem */
-	xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
-#endif
-    } else
-	ticket->t_curr_res -= len;
+		/*
+		 * Commit record headers need to be accounted for. These
+		 * come in as separate writes so are easy to detect.
+		 */
+		if (flags & (XLOG_COMMIT_TRANS | XLOG_UNMOUNT_TRANS))
+			ticket->t_curr_res -= sizeof(xlog_op_header_t);
+	} else
+		ticket->t_curr_res -= len;
+
+	if (ticket->t_curr_res < 0)
+		xlog_print_tic_res(log->l_mp, ticket);
+
+	index = 0;
+	lv = log_vector;
+	vecp = lv->lv_iovecp;
+	while (lv && index < lv->lv_niovecs) {
+		void		*ptr;
+		int		log_offset;
+
+		error = xlog_state_get_iclog_space(log, len, &iclog, ticket,
+						   &contwr, &log_offset);
+		if (error)
+			return error;
 
-    for (index = 0; index < nentries; ) {
-	if ((error = xlog_state_get_iclog_space(log, len, &iclog, ticket,
-					       &contwr, &log_offset)))
-		return error;
+		ASSERT(log_offset <= iclog->ic_size - 1);
+		ptr = iclog->ic_datap + log_offset;
 
-	ASSERT(log_offset <= iclog->ic_size - 1);
-	ptr = (__psint_t) ((char *)iclog->ic_datap+log_offset);
+		/* start_lsn is the first lsn written to. That's all we need. */
+		if (!*start_lsn)
+			*start_lsn = be64_to_cpu(iclog->ic_header.h_lsn);
 
-	/* start_lsn is the first lsn written to. That's all we need. */
-	if (! *start_lsn)
-	    *start_lsn = be64_to_cpu(iclog->ic_header.h_lsn);
+		/*
+		 * This loop writes out as many regions as can fit in the amount
+		 * of space which was allocated by xlog_state_get_iclog_space().
+		 */
+		while (lv && index < lv->lv_niovecs) {
+			struct xfs_log_iovec	*reg = &vecp[index];
+			struct xlog_op_header	*ophdr;
+			int			start_rec_copy;
+			int			copy_len;
+			int			copy_off;
+
+			ASSERT(reg->i_len % sizeof(__int32_t) == 0);
+			ASSERT((unsigned long)ptr % sizeof(__int32_t) == 0);
+
+			start_rec_copy = xlog_write_start_rec(ptr, ticket);
+			if (start_rec_copy) {
+				record_cnt++;
+				xlog_write_adv_cnt(&ptr, &len, &log_offset,
+						   start_rec_copy);
+			}
 
-	/* This loop writes out as many regions as can fit in the amount
-	 * of space which was allocated by xlog_state_get_iclog_space().
-	 */
-	while (index < nentries) {
-	    ASSERT(reg[index].i_len % sizeof(__int32_t) == 0);
-	    ASSERT((__psint_t)ptr % sizeof(__int32_t) == 0);
-	    start_rec_copy = 0;
-
-	    /* If first write for transaction, insert start record.
-	     * We can't be trying to commit if we are inited.  We can't
-	     * have any "partial_copy" if we are inited.
-	     */
-	    if (ticket->t_flags & XLOG_TIC_INITED) {
-		logop_head		= (xlog_op_header_t *)ptr;
-		logop_head->oh_tid	= cpu_to_be32(ticket->t_tid);
-		logop_head->oh_clientid = ticket->t_clientid;
-		logop_head->oh_len	= 0;
-		logop_head->oh_flags    = XLOG_START_TRANS;
-		logop_head->oh_res2	= 0;
-		ticket->t_flags		&= ~XLOG_TIC_INITED;	/* clear bit */
-		record_cnt++;
-
-		start_rec_copy = sizeof(xlog_op_header_t);
-		xlog_write_adv_cnt(ptr, len, log_offset, start_rec_copy);
-	    }
+			ophdr = xlog_write_setup_ophdr(log, ptr, ticket, flags);
+			if (!ophdr)
+				return XFS_ERROR(EIO);
 
-	    /* Copy log operation header directly into data section */
-	    logop_head			= (xlog_op_header_t *)ptr;
-	    logop_head->oh_tid		= cpu_to_be32(ticket->t_tid);
-	    logop_head->oh_clientid	= ticket->t_clientid;
-	    logop_head->oh_res2		= 0;
+			xlog_write_adv_cnt(&ptr, &len, &log_offset,
+					   sizeof(struct xlog_op_header));
+
+			len += xlog_write_setup_copy(ticket, ophdr,
+						     iclog->ic_size-log_offset,
+						     reg->i_len,
+						     &copy_off, &copy_len,
+						     &partial_copy,
+						     &partial_copy_len);
+			xlog_verify_dest_ptr(log, ptr);
+
+			/* copy region */
+			ASSERT(copy_len >= 0);
+			memcpy(ptr, reg->i_addr + copy_off, copy_len);
+			xlog_write_adv_cnt(&ptr, &len, &log_offset, copy_len);
+
+			copy_len += start_rec_copy + sizeof(xlog_op_header_t);
+			record_cnt++;
+			data_cnt += contwr ? copy_len : 0;
+
+			error = xlog_write_copy_finish(log, iclog, flags,
+						       &record_cnt, &data_cnt,
+						       &partial_copy,
+						       &partial_copy_len,
+						       log_offset,
+						       commit_iclog);
+			if (error)
+				return error;
 
-	    /* header copied directly */
-	    xlog_write_adv_cnt(ptr, len, log_offset, sizeof(xlog_op_header_t));
+			/*
+			 * if we had a partial copy, we need to get more iclog
+			 * space but we don't want to increment the region
+			 * index because there is still more is this region to
+			 * write.
+			 *
+			 * If we completed writing this region, and we flushed
+			 * the iclog (indicated by resetting of the record
+			 * count), then we also need to get more log space. If
+			 * this was the last record, though, we are done and
+			 * can just return.
+			 */
+			if (partial_copy)
+				break;
 
-	    /* are we copying a commit or unmount record? */
-	    logop_head->oh_flags = flags;
+			if (++index == lv->lv_niovecs) {
+				lv = lv->lv_next;
+				index = 0;
+				if (lv)
+					vecp = lv->lv_iovecp;
+			}
+			if (record_cnt == 0) {
+				if (!lv)
+					return 0;
+				break;
+			}
+		}
+	}
 
-	    /*
-	     * We've seen logs corrupted with bad transaction client
-	     * ids.  This makes sure that XFS doesn't generate them on.
-	     * Turn this into an EIO and shut down the filesystem.
-	     */
-	    switch (logop_head->oh_clientid)  {
-	    case XFS_TRANSACTION:
-	    case XFS_VOLUME:
-	    case XFS_LOG:
-		break;
-	    default:
-		xfs_fs_cmn_err(CE_WARN, mp,
-		    "Bad XFS transaction clientid 0x%x in ticket 0x%p",
-		    logop_head->oh_clientid, tic);
-		return XFS_ERROR(EIO);
-	    }
+	ASSERT(len == 0);
 
-	    /* Partial write last time? => (partial_copy != 0)
-	     * need_copy is the amount we'd like to copy if everything could
-	     * fit in the current memcpy.
-	     */
-	    need_copy =	reg[index].i_len - partial_copy_len;
-
-	    copy_off = partial_copy_len;
-	    if (need_copy <= iclog->ic_size - log_offset) { /*complete write */
-	        copy_len = need_copy;
-		logop_head->oh_len = cpu_to_be32(copy_len);
-		if (partial_copy)
-		    logop_head->oh_flags|= (XLOG_END_TRANS|XLOG_WAS_CONT_TRANS);
-		partial_copy_len = partial_copy = 0;
-	    } else {					    /* partial write */
-		copy_len = iclog->ic_size - log_offset;
-		logop_head->oh_len = cpu_to_be32(copy_len);
-		logop_head->oh_flags |= XLOG_CONTINUE_TRANS;
-		if (partial_copy)
-			logop_head->oh_flags |= XLOG_WAS_CONT_TRANS;
-		partial_copy_len += copy_len;
-		partial_copy++;
-		len += sizeof(xlog_op_header_t); /* from splitting of region */
-		/* account for new log op header */
-		ticket->t_curr_res -= sizeof(xlog_op_header_t);
-		ticket->t_res_num_ophdrs++;
-	    }
-	    xlog_verify_dest_ptr(log, ptr);
-
-	    /* copy region */
-	    ASSERT(copy_len >= 0);
-	    memcpy((xfs_caddr_t)ptr, reg[index].i_addr + copy_off, copy_len);
-	    xlog_write_adv_cnt(ptr, len, log_offset, copy_len);
-
-	    /* make copy_len total bytes copied, including headers */
-	    copy_len += start_rec_copy + sizeof(xlog_op_header_t);
-	    record_cnt++;
-	    data_cnt += contwr ? copy_len : 0;
-	    if (partial_copy) {			/* copied partial region */
-		    /* already marked WANT_SYNC by xlog_state_get_iclog_space */
-		    xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
-		    record_cnt = data_cnt = 0;
-		    if ((error = xlog_state_release_iclog(log, iclog)))
-			    return error;
-		    break;			/* don't increment index */
-	    } else {				/* copied entire region */
-		index++;
-		partial_copy_len = partial_copy = 0;
-
-		if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) {
-		    xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
-		    record_cnt = data_cnt = 0;
-		    spin_lock(&log->l_icloglock);
-		    xlog_state_want_sync(log, iclog);
-		    spin_unlock(&log->l_icloglock);
-		    if (commit_iclog) {
-			ASSERT(flags & XLOG_COMMIT_TRANS);
-			*commit_iclog = iclog;
-		    } else if ((error = xlog_state_release_iclog(log, iclog)))
-			   return error;
-		    if (index == nentries)
-			    return 0;		/* we are done */
-		    else
-			    break;
-		}
-	    } /* if (partial_copy) */
-	} /* while (index < nentries) */
-    } /* for (index = 0; index < nentries; ) */
-    ASSERT(len == 0);
+	xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
+	if (!commit_iclog)
+		return xlog_state_release_iclog(log, iclog);
 
-    xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
-    if (commit_iclog) {
 	ASSERT(flags & XLOG_COMMIT_TRANS);
 	*commit_iclog = iclog;
 	return 0;
-    }
-    return xlog_state_release_iclog(log, iclog);
-}	/* xlog_write */
+}
 
 
 /*****************************************************************************
@@ -2854,7 +2986,6 @@ xlog_state_switch_iclogs(xlog_t		*log,
 	log->l_iclog = iclog->ic_next;
 }	/* xlog_state_switch_iclogs */
 
-
 /*
  * Write out all data in the in-core log as of this exact moment in time.
  *
@@ -2882,11 +3013,19 @@ xlog_state_switch_iclogs(xlog_t		*log,
  *	   b) when we return from flushing out this iclog, it is still
  *		not in the active nor dirty state.
  */
-STATIC int
-xlog_state_sync_all(xlog_t *log, uint flags, int *log_flushed)
+int
+_xfs_log_force(
+	struct xfs_mount	*mp,
+	uint			flags,
+	int			*log_flushed)
 {
-	xlog_in_core_t	*iclog;
-	xfs_lsn_t	lsn;
+	struct log		*log = mp->m_log;
+	struct xlog_in_core	*iclog;
+	xfs_lsn_t		lsn;
+
+	XFS_STATS_INC(xs_log_force);
+
+	xlog_cil_push(log, 1);
 
 	spin_lock(&log->l_icloglock);
 
@@ -2932,7 +3071,9 @@ xlog_state_sync_all(xlog_t *log, uint flags, int *log_flushed)
 
 				if (xlog_state_release_iclog(log, iclog))
 					return XFS_ERROR(EIO);
-				*log_flushed = 1;
+
+				if (log_flushed)
+					*log_flushed = 1;
 				spin_lock(&log->l_icloglock);
 				if (be64_to_cpu(iclog->ic_header.h_lsn) == lsn &&
 				    iclog->ic_state != XLOG_STATE_DIRTY)
@@ -2976,19 +3117,37 @@ maybe_sleep:
 		 */
 		if (iclog->ic_state & XLOG_STATE_IOERROR)
 			return XFS_ERROR(EIO);
-		*log_flushed = 1;
-
+		if (log_flushed)
+			*log_flushed = 1;
 	} else {
 
 no_sleep:
 		spin_unlock(&log->l_icloglock);
 	}
 	return 0;
-}	/* xlog_state_sync_all */
+}
 
+/*
+ * Wrapper for _xfs_log_force(), to be used when caller doesn't care
+ * about errors or whether the log was flushed or not. This is the normal
+ * interface to use when trying to unpin items or move the log forward.
+ */
+void
+xfs_log_force(
+	xfs_mount_t	*mp,
+	uint		flags)
+{
+	int	error;
+
+	error = _xfs_log_force(mp, flags, NULL);
+	if (error) {
+		xfs_fs_cmn_err(CE_WARN, mp, "xfs_log_force: "
+			"error %d returned.", error);
+	}
+}
 
 /*
- * Used by code which implements synchronous log forces.
+ * Force the in-core log to disk for a specific LSN.
  *
  * Find in-core log with lsn.
  *	If it is in the DIRTY state, just return.
@@ -2996,109 +3155,148 @@ no_sleep:
  *		state and go to sleep or return.
  *	If it is in any other state, go to sleep or return.
  *
- * If filesystem activity goes to zero, the iclog will get flushed only by
- * bdflush().
+ * Synchronous forces are implemented with a signal variable. All callers
+ * to force a given lsn to disk will wait on a the sv attached to the
+ * specific in-core log.  When given in-core log finally completes its
+ * write to disk, that thread will wake up all threads waiting on the
+ * sv.
  */
-STATIC int
-xlog_state_sync(xlog_t	  *log,
-		xfs_lsn_t lsn,
-		uint	  flags,
-		int	  *log_flushed)
+int
+_xfs_log_force_lsn(
+	struct xfs_mount	*mp,
+	xfs_lsn_t		lsn,
+	uint			flags,
+	int			*log_flushed)
 {
-    xlog_in_core_t	*iclog;
-    int			already_slept = 0;
+	struct log		*log = mp->m_log;
+	struct xlog_in_core	*iclog;
+	int			already_slept = 0;
 
-try_again:
-    spin_lock(&log->l_icloglock);
-    iclog = log->l_iclog;
+	ASSERT(lsn != 0);
 
-    if (iclog->ic_state & XLOG_STATE_IOERROR) {
-	    spin_unlock(&log->l_icloglock);
-	    return XFS_ERROR(EIO);
-    }
+	XFS_STATS_INC(xs_log_force);
 
-    do {
-	if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) {
-		iclog = iclog->ic_next;
-		continue;
+	if (log->l_cilp) {
+		lsn = xlog_cil_push_lsn(log, lsn);
+		if (lsn == NULLCOMMITLSN)
+			return 0;
 	}
 
-	if (iclog->ic_state == XLOG_STATE_DIRTY) {
+try_again:
+	spin_lock(&log->l_icloglock);
+	iclog = log->l_iclog;
+	if (iclog->ic_state & XLOG_STATE_IOERROR) {
 		spin_unlock(&log->l_icloglock);
-		return 0;
+		return XFS_ERROR(EIO);
 	}
 
-	if (iclog->ic_state == XLOG_STATE_ACTIVE) {
-		/*
-		 * We sleep here if we haven't already slept (e.g.
-		 * this is the first time we've looked at the correct
-		 * iclog buf) and the buffer before us is going to
-		 * be sync'ed. The reason for this is that if we
-		 * are doing sync transactions here, by waiting for
-		 * the previous I/O to complete, we can allow a few
-		 * more transactions into this iclog before we close
-		 * it down.
-		 *
-		 * Otherwise, we mark the buffer WANT_SYNC, and bump
-		 * up the refcnt so we can release the log (which drops
-		 * the ref count).  The state switch keeps new transaction
-		 * commits from using this buffer.  When the current commits
-		 * finish writing into the buffer, the refcount will drop to
-		 * zero and the buffer will go out then.
-		 */
-		if (!already_slept &&
-		    (iclog->ic_prev->ic_state & (XLOG_STATE_WANT_SYNC |
-						 XLOG_STATE_SYNCING))) {
-			ASSERT(!(iclog->ic_state & XLOG_STATE_IOERROR));
-			XFS_STATS_INC(xs_log_force_sleep);
-			sv_wait(&iclog->ic_prev->ic_write_wait, PSWP,
-				&log->l_icloglock, s);
-			*log_flushed = 1;
-			already_slept = 1;
-			goto try_again;
-		} else {
+	do {
+		if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) {
+			iclog = iclog->ic_next;
+			continue;
+		}
+
+		if (iclog->ic_state == XLOG_STATE_DIRTY) {
+			spin_unlock(&log->l_icloglock);
+			return 0;
+		}
+
+		if (iclog->ic_state == XLOG_STATE_ACTIVE) {
+			/*
+			 * We sleep here if we haven't already slept (e.g.
+			 * this is the first time we've looked at the correct
+			 * iclog buf) and the buffer before us is going to
+			 * be sync'ed. The reason for this is that if we
+			 * are doing sync transactions here, by waiting for
+			 * the previous I/O to complete, we can allow a few
+			 * more transactions into this iclog before we close
+			 * it down.
+			 *
+			 * Otherwise, we mark the buffer WANT_SYNC, and bump
+			 * up the refcnt so we can release the log (which
+			 * drops the ref count).  The state switch keeps new
+			 * transaction commits from using this buffer.  When
+			 * the current commits finish writing into the buffer,
+			 * the refcount will drop to zero and the buffer will
+			 * go out then.
+			 */
+			if (!already_slept &&
+			    (iclog->ic_prev->ic_state &
+			     (XLOG_STATE_WANT_SYNC | XLOG_STATE_SYNCING))) {
+				ASSERT(!(iclog->ic_state & XLOG_STATE_IOERROR));
+
+				XFS_STATS_INC(xs_log_force_sleep);
+
+				sv_wait(&iclog->ic_prev->ic_write_wait,
+					PSWP, &log->l_icloglock, s);
+				if (log_flushed)
+					*log_flushed = 1;
+				already_slept = 1;
+				goto try_again;
+			}
 			atomic_inc(&iclog->ic_refcnt);
 			xlog_state_switch_iclogs(log, iclog, 0);
 			spin_unlock(&log->l_icloglock);
 			if (xlog_state_release_iclog(log, iclog))
 				return XFS_ERROR(EIO);
-			*log_flushed = 1;
+			if (log_flushed)
+				*log_flushed = 1;
 			spin_lock(&log->l_icloglock);
 		}
-	}
 
-	if ((flags & XFS_LOG_SYNC) && /* sleep */
-	    !(iclog->ic_state & (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY))) {
+		if ((flags & XFS_LOG_SYNC) && /* sleep */
+		    !(iclog->ic_state &
+		      (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY))) {
+			/*
+			 * Don't wait on completion if we know that we've
+			 * gotten a log write error.
+			 */
+			if (iclog->ic_state & XLOG_STATE_IOERROR) {
+				spin_unlock(&log->l_icloglock);
+				return XFS_ERROR(EIO);
+			}
+			XFS_STATS_INC(xs_log_force_sleep);
+			sv_wait(&iclog->ic_force_wait, PSWP, &log->l_icloglock, s);
+			/*
+			 * No need to grab the log lock here since we're
+			 * only deciding whether or not to return EIO
+			 * and the memory read should be atomic.
+			 */
+			if (iclog->ic_state & XLOG_STATE_IOERROR)
+				return XFS_ERROR(EIO);
 
-		/*
-		 * Don't wait on completion if we know that we've
-		 * gotten a log write error.
-		 */
-		if (iclog->ic_state & XLOG_STATE_IOERROR) {
+			if (log_flushed)
+				*log_flushed = 1;
+		} else {		/* just return */
 			spin_unlock(&log->l_icloglock);
-			return XFS_ERROR(EIO);
 		}
-		XFS_STATS_INC(xs_log_force_sleep);
-		sv_wait(&iclog->ic_force_wait, PSWP, &log->l_icloglock, s);
-		/*
-		 * No need to grab the log lock here since we're
-		 * only deciding whether or not to return EIO
-		 * and the memory read should be atomic.
-		 */
-		if (iclog->ic_state & XLOG_STATE_IOERROR)
-			return XFS_ERROR(EIO);
-		*log_flushed = 1;
-	} else {		/* just return */
-		spin_unlock(&log->l_icloglock);
-	}
-	return 0;
 
-    } while (iclog != log->l_iclog);
+		return 0;
+	} while (iclog != log->l_iclog);
 
-    spin_unlock(&log->l_icloglock);
-    return 0;
-}	/* xlog_state_sync */
+	spin_unlock(&log->l_icloglock);
+	return 0;
+}
 
+/*
+ * Wrapper for _xfs_log_force_lsn(), to be used when caller doesn't care
+ * about errors or whether the log was flushed or not. This is the normal
+ * interface to use when trying to unpin items or move the log forward.
+ */
+void
+xfs_log_force_lsn(
+	xfs_mount_t	*mp,
+	xfs_lsn_t	lsn,
+	uint		flags)
+{
+	int	error;
+
+	error = _xfs_log_force_lsn(mp, lsn, flags, NULL);
+	if (error) {
+		xfs_fs_cmn_err(CE_WARN, mp, "xfs_log_force: "
+			"error %d returned.", error);
+	}
+}
 
 /*
  * Called when we want to mark the current iclog as being ready to sync to
@@ -3148,20 +3346,30 @@ xfs_log_ticket_get(
 	return ticket;
 }
 
+xlog_tid_t
+xfs_log_get_trans_ident(
+	struct xfs_trans	*tp)
+{
+	return tp->t_ticket->t_tid;
+}
+
 /*
  * Allocate and initialise a new log ticket.
  */
-STATIC xlog_ticket_t *
-xlog_ticket_alloc(xlog_t		*log,
-		int		unit_bytes,
-		int		cnt,
-		char		client,
-		uint		xflags)
+xlog_ticket_t *
+xlog_ticket_alloc(
+	struct log	*log,
+	int		unit_bytes,
+	int		cnt,
+	char		client,
+	uint		xflags,
+	int		alloc_flags)
 {
-	xlog_ticket_t	*tic;
+	struct xlog_ticket *tic;
 	uint		num_headers;
+	int		iclog_space;
 
-	tic = kmem_zone_zalloc(xfs_log_ticket_zone, KM_SLEEP|KM_MAYFAIL);
+	tic = kmem_zone_zalloc(xfs_log_ticket_zone, alloc_flags);
 	if (!tic)
 		return NULL;
 
@@ -3203,16 +3411,40 @@ xlog_ticket_alloc(xlog_t		*log,
 	/* for start-rec */
 	unit_bytes += sizeof(xlog_op_header_t);
 
-	/* for LR headers */
-	num_headers = ((unit_bytes + log->l_iclog_size-1) >> log->l_iclog_size_log);
+	/*
+	 * for LR headers - the space for data in an iclog is the size minus
+	 * the space used for the headers. If we use the iclog size, then we
+	 * undercalculate the number of headers required.
+	 *
+	 * Furthermore - the addition of op headers for split-recs might
+	 * increase the space required enough to require more log and op
+	 * headers, so take that into account too.
+	 *
+	 * IMPORTANT: This reservation makes the assumption that if this
+	 * transaction is the first in an iclog and hence has the LR headers
+	 * accounted to it, then the remaining space in the iclog is
+	 * exclusively for this transaction.  i.e. if the transaction is larger
+	 * than the iclog, it will be the only thing in that iclog.
+	 * Fundamentally, this means we must pass the entire log vector to
+	 * xlog_write to guarantee this.
+	 */
+	iclog_space = log->l_iclog_size - log->l_iclog_hsize;
+	num_headers = howmany(unit_bytes, iclog_space);
+
+	/* for split-recs - ophdrs added when data split over LRs */
+	unit_bytes += sizeof(xlog_op_header_t) * num_headers;
+
+	/* add extra header reservations if we overrun */
+	while (!num_headers ||
+	       howmany(unit_bytes, iclog_space) > num_headers) {
+		unit_bytes += sizeof(xlog_op_header_t);
+		num_headers++;
+	}
 	unit_bytes += log->l_iclog_hsize * num_headers;
 
 	/* for commit-rec LR header - note: padding will subsume the ophdr */
 	unit_bytes += log->l_iclog_hsize;
 
-	/* for split-recs - ophdrs added when data split over LRs */
-	unit_bytes += sizeof(xlog_op_header_t) * num_headers;
-
 	/* for roundoff padding for transaction data and one for commit record */
 	if (xfs_sb_version_haslogv2(&log->l_mp->m_sb) &&
 	    log->l_mp->m_sb.sb_logsunit > 1) {
@@ -3228,13 +3460,13 @@ xlog_ticket_alloc(xlog_t		*log,
 	tic->t_curr_res		= unit_bytes;
 	tic->t_cnt		= cnt;
 	tic->t_ocnt		= cnt;
-	tic->t_tid		= (xlog_tid_t)((__psint_t)tic & 0xffffffff);
+	tic->t_tid		= random32();
 	tic->t_clientid		= client;
 	tic->t_flags		= XLOG_TIC_INITED;
 	tic->t_trans_type	= 0;
 	if (xflags & XFS_LOG_PERM_RESERV)
 		tic->t_flags |= XLOG_TIC_PERM_RESERV;
-	sv_init(&(tic->t_wait), SV_DEFAULT, "logtick");
+	sv_init(&tic->t_wait, SV_DEFAULT, "logtick");
 
 	xlog_tic_reset_res(tic);
 
@@ -3255,20 +3487,22 @@ xlog_ticket_alloc(xlog_t		*log,
  * part of the log in case we trash the log structure.
  */
 void
-xlog_verify_dest_ptr(xlog_t     *log,
-		     __psint_t  ptr)
+xlog_verify_dest_ptr(
+	struct log	*log,
+	char		*ptr)
 {
 	int i;
 	int good_ptr = 0;
 
-	for (i=0; i < log->l_iclog_bufs; i++) {
-		if (ptr >= (__psint_t)log->l_iclog_bak[i] &&
-		    ptr <= (__psint_t)log->l_iclog_bak[i]+log->l_iclog_size)
+	for (i = 0; i < log->l_iclog_bufs; i++) {
+		if (ptr >= log->l_iclog_bak[i] &&
+		    ptr <= log->l_iclog_bak[i] + log->l_iclog_size)
 			good_ptr++;
 	}
-	if (! good_ptr)
+
+	if (!good_ptr)
 		xlog_panic("xlog_verify_dest_ptr: invalid ptr");
-}	/* xlog_verify_dest_ptr */
+}
 
 STATIC void
 xlog_verify_grant_head(xlog_t *log, int equals)
@@ -3454,6 +3688,11 @@ xlog_state_ioerror(
  *	c. nothing new gets queued up after (a) and (b) are done.
  *	d. if !logerror, flush the iclogs to disk, then seal them off
  *	   for business.
+ *
+ * Note: for delayed logging the !logerror case needs to flush the regions
+ * held in memory out to the iclogs before flushing them to disk. This needs
+ * to be done before the log is marked as shutdown, otherwise the flush to the
+ * iclogs will fail.
  */
 int
 xfs_log_force_umount(
@@ -3463,7 +3702,6 @@ xfs_log_force_umount(
 	xlog_ticket_t	*tic;
 	xlog_t		*log;
 	int		retval;
-	int		dummy;
 
 	log = mp->m_log;
 
@@ -3488,6 +3726,16 @@ xfs_log_force_umount(
 		return 1;
 	}
 	retval = 0;
+
+	/*
+	 * Flush the in memory commit item list before marking the log as
+	 * being shut down. We need to do it in this order to ensure all the
+	 * completed transactions are flushed to disk with the xfs_log_force()
+	 * call below.
+	 */
+	if (!logerror && (mp->m_flags & XFS_MOUNT_DELAYLOG))
+		xlog_cil_push(log, 1);
+
 	/*
 	 * We must hold both the GRANT lock and the LOG lock,
 	 * before we mark the filesystem SHUTDOWN and wake
@@ -3537,13 +3785,14 @@ xfs_log_force_umount(
 	}
 	spin_unlock(&log->l_grant_lock);
 
-	if (! (log->l_iclog->ic_state & XLOG_STATE_IOERROR)) {
+	if (!(log->l_iclog->ic_state & XLOG_STATE_IOERROR)) {
 		ASSERT(!logerror);
 		/*
 		 * Force the incore logs to disk before shutting the
 		 * log down completely.
 		 */
-		xlog_state_sync_all(log, XFS_LOG_FORCE|XFS_LOG_SYNC, &dummy);
+		_xfs_log_force(mp, XFS_LOG_SYNC, NULL);
+
 		spin_lock(&log->l_icloglock);
 		retval = xlog_state_ioerror(log);
 		spin_unlock(&log->l_icloglock);
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index d0c9baa50b1a..04c78e642cc8 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -19,7 +19,6 @@
 #define __XFS_LOG_H__
 
 /* get lsn fields */
-
 #define CYCLE_LSN(lsn) ((uint)((lsn)>>32))
 #define BLOCK_LSN(lsn) ((uint)(lsn))
 
@@ -70,14 +69,8 @@ static inline xfs_lsn_t	_lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2)
  * Flags to xfs_log_force()
  *
  *	XFS_LOG_SYNC:	Synchronous force in-core log to disk
- *	XFS_LOG_FORCE:	Start in-core log write now.
- *	XFS_LOG_URGE:	Start write within some window of time.
- *
- * Note: Either XFS_LOG_FORCE or XFS_LOG_URGE must be set.
  */
 #define XFS_LOG_SYNC		0x1
-#define XFS_LOG_FORCE		0x2
-#define XFS_LOG_URGE		0x4
 
 #endif	/* __KERNEL__ */
 
@@ -110,15 +103,20 @@ static inline xfs_lsn_t	_lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2)
 #define XLOG_REG_TYPE_TRANSHDR		19
 #define XLOG_REG_TYPE_MAX		19
 
-#define XLOG_VEC_SET_TYPE(vecp, t) ((vecp)->i_type = (t))
-
 typedef struct xfs_log_iovec {
-	xfs_caddr_t		i_addr;		/* beginning address of region */
+	xfs_caddr_t	i_addr;		/* beginning address of region */
 	int		i_len;		/* length in bytes of region */
 	uint		i_type;		/* type of region */
 } xfs_log_iovec_t;
 
-typedef void* xfs_log_ticket_t;
+struct xfs_log_vec {
+	struct xfs_log_vec	*lv_next;	/* next lv in build list */
+	int			lv_niovecs;	/* number of iovecs in lv */
+	struct xfs_log_iovec	*lv_iovecp;	/* iovec array */
+	struct xfs_log_item	*lv_item;	/* owner */
+	char			*lv_buf;	/* formatted buffer */
+	int			lv_buf_len;	/* size of formatted buffer */
+};
 
 /*
  * Structure used to pass callback function and the function's argument
@@ -134,18 +132,33 @@ typedef struct xfs_log_callback {
 #ifdef __KERNEL__
 /* Log manager interfaces */
 struct xfs_mount;
+struct xlog_in_core;
 struct xlog_ticket;
+struct xfs_log_item;
+struct xfs_item_ops;
+struct xfs_trans;
+
+void	xfs_log_item_init(struct xfs_mount	*mp,
+			struct xfs_log_item	*item,
+			int			type,
+			struct xfs_item_ops	*ops);
+
 xfs_lsn_t xfs_log_done(struct xfs_mount *mp,
-		       xfs_log_ticket_t ticket,
-		       void		**iclog,
+		       struct xlog_ticket *ticket,
+		       struct xlog_in_core **iclog,
 		       uint		flags);
 int	  _xfs_log_force(struct xfs_mount *mp,
-			 xfs_lsn_t	lsn,
 			 uint		flags,
 			 int		*log_forced);
 void	  xfs_log_force(struct xfs_mount	*mp,
-			xfs_lsn_t		lsn,
 			uint			flags);
+int	  _xfs_log_force_lsn(struct xfs_mount *mp,
+			     xfs_lsn_t		lsn,
+			     uint		flags,
+			     int		*log_forced);
+void	  xfs_log_force_lsn(struct xfs_mount	*mp,
+			    xfs_lsn_t		lsn,
+			    uint		flags);
 int	  xfs_log_mount(struct xfs_mount	*mp,
 			struct xfs_buftarg	*log_target,
 			xfs_daddr_t		start_block,
@@ -154,21 +167,21 @@ int	  xfs_log_mount_finish(struct xfs_mount *mp);
 void	  xfs_log_move_tail(struct xfs_mount	*mp,
 			    xfs_lsn_t		tail_lsn);
 int	  xfs_log_notify(struct xfs_mount	*mp,
-			 void			*iclog,
+			 struct xlog_in_core	*iclog,
 			 xfs_log_callback_t	*callback_entry);
 int	  xfs_log_release_iclog(struct xfs_mount *mp,
-			 void			 *iclog_hndl);
+			 struct xlog_in_core	 *iclog);
 int	  xfs_log_reserve(struct xfs_mount *mp,
 			  int		   length,
 			  int		   count,
-			  xfs_log_ticket_t *ticket,
+			  struct xlog_ticket **ticket,
 			  __uint8_t	   clientid,
 			  uint		   flags,
 			  uint		   t_type);
 int	  xfs_log_write(struct xfs_mount *mp,
 			xfs_log_iovec_t  region[],
 			int		 nentries,
-			xfs_log_ticket_t ticket,
+			struct xlog_ticket *ticket,
 			xfs_lsn_t	 *start_lsn);
 int	  xfs_log_unmount_write(struct xfs_mount *mp);
 void      xfs_log_unmount(struct xfs_mount *mp);
@@ -177,9 +190,16 @@ int	  xfs_log_need_covered(struct xfs_mount *mp);
 
 void	  xlog_iodone(struct xfs_buf *);
 
-struct xlog_ticket * xfs_log_ticket_get(struct xlog_ticket *ticket);
+struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket);
 void	  xfs_log_ticket_put(struct xlog_ticket *ticket);
 
+xlog_tid_t xfs_log_get_trans_ident(struct xfs_trans *tp);
+
+int	xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
+				struct xfs_log_vec *log_vector,
+				xfs_lsn_t *commit_lsn, int flags);
+bool	xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
+
 #endif
 
 
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
new file mode 100644
index 000000000000..bb17cc044bf3
--- /dev/null
+++ b/fs/xfs/xfs_log_cil.c
@@ -0,0 +1,725 @@
+/*
+ * Copyright (c) 2010 Red Hat, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_log_priv.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_dir2.h"
+#include "xfs_dmapi.h"
+#include "xfs_mount.h"
+#include "xfs_error.h"
+#include "xfs_alloc.h"
+
+/*
+ * Perform initial CIL structure initialisation. If the CIL is not
+ * enabled in this filesystem, ensure the log->l_cilp is null so
+ * we can check this conditional to determine if we are doing delayed
+ * logging or not.
+ */
+int
+xlog_cil_init(
+	struct log	*log)
+{
+	struct xfs_cil	*cil;
+	struct xfs_cil_ctx *ctx;
+
+	log->l_cilp = NULL;
+	if (!(log->l_mp->m_flags & XFS_MOUNT_DELAYLOG))
+		return 0;
+
+	cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL);
+	if (!cil)
+		return ENOMEM;
+
+	ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL);
+	if (!ctx) {
+		kmem_free(cil);
+		return ENOMEM;
+	}
+
+	INIT_LIST_HEAD(&cil->xc_cil);
+	INIT_LIST_HEAD(&cil->xc_committing);
+	spin_lock_init(&cil->xc_cil_lock);
+	init_rwsem(&cil->xc_ctx_lock);
+	sv_init(&cil->xc_commit_wait, SV_DEFAULT, "cilwait");
+
+	INIT_LIST_HEAD(&ctx->committing);
+	INIT_LIST_HEAD(&ctx->busy_extents);
+	ctx->sequence = 1;
+	ctx->cil = cil;
+	cil->xc_ctx = ctx;
+
+	cil->xc_log = log;
+	log->l_cilp = cil;
+	return 0;
+}
+
+void
+xlog_cil_destroy(
+	struct log	*log)
+{
+	if (!log->l_cilp)
+		return;
+
+	if (log->l_cilp->xc_ctx) {
+		if (log->l_cilp->xc_ctx->ticket)
+			xfs_log_ticket_put(log->l_cilp->xc_ctx->ticket);
+		kmem_free(log->l_cilp->xc_ctx);
+	}
+
+	ASSERT(list_empty(&log->l_cilp->xc_cil));
+	kmem_free(log->l_cilp);
+}
+
+/*
+ * Allocate a new ticket. Failing to get a new ticket makes it really hard to
+ * recover, so we don't allow failure here. Also, we allocate in a context that
+ * we don't want to be issuing transactions from, so we need to tell the
+ * allocation code this as well.
+ *
+ * We don't reserve any space for the ticket - we are going to steal whatever
+ * space we require from transactions as they commit. To ensure we reserve all
+ * the space required, we need to set the current reservation of the ticket to
+ * zero so that we know to steal the initial transaction overhead from the
+ * first transaction commit.
+ */
+static struct xlog_ticket *
+xlog_cil_ticket_alloc(
+	struct log	*log)
+{
+	struct xlog_ticket *tic;
+
+	tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0,
+				KM_SLEEP|KM_NOFS);
+	tic->t_trans_type = XFS_TRANS_CHECKPOINT;
+
+	/*
+	 * set the current reservation to zero so we know to steal the basic
+	 * transaction overhead reservation from the first transaction commit.
+	 */
+	tic->t_curr_res = 0;
+	return tic;
+}
+
+/*
+ * After the first stage of log recovery is done, we know where the head and
+ * tail of the log are. We need this log initialisation done before we can
+ * initialise the first CIL checkpoint context.
+ *
+ * Here we allocate a log ticket to track space usage during a CIL push.  This
+ * ticket is passed to xlog_write() directly so that we don't slowly leak log
+ * space by failing to account for space used by log headers and additional
+ * region headers for split regions.
+ */
+void
+xlog_cil_init_post_recovery(
+	struct log	*log)
+{
+	if (!log->l_cilp)
+		return;
+
+	log->l_cilp->xc_ctx->ticket = xlog_cil_ticket_alloc(log);
+	log->l_cilp->xc_ctx->sequence = 1;
+	log->l_cilp->xc_ctx->commit_lsn = xlog_assign_lsn(log->l_curr_cycle,
+								log->l_curr_block);
+}
+
+/*
+ * Insert the log item into the CIL and calculate the difference in space
+ * consumed by the item. Add the space to the checkpoint ticket and calculate
+ * if the change requires additional log metadata. If it does, take that space
+ * as well. Remove the amount of space we addded to the checkpoint ticket from
+ * the current transaction ticket so that the accounting works out correctly.
+ *
+ * If this is the first time the item is being placed into the CIL in this
+ * context, pin it so it can't be written to disk until the CIL is flushed to
+ * the iclog and the iclog written to disk.
+ */
+static void
+xlog_cil_insert(
+	struct log		*log,
+	struct xlog_ticket	*ticket,
+	struct xfs_log_item	*item,
+	struct xfs_log_vec	*lv)
+{
+	struct xfs_cil		*cil = log->l_cilp;
+	struct xfs_log_vec	*old = lv->lv_item->li_lv;
+	struct xfs_cil_ctx	*ctx = cil->xc_ctx;
+	int			len;
+	int			diff_iovecs;
+	int			iclog_space;
+
+	if (old) {
+		/* existing lv on log item, space used is a delta */
+		ASSERT(!list_empty(&item->li_cil));
+		ASSERT(old->lv_buf && old->lv_buf_len && old->lv_niovecs);
+
+		len = lv->lv_buf_len - old->lv_buf_len;
+		diff_iovecs = lv->lv_niovecs - old->lv_niovecs;
+		kmem_free(old->lv_buf);
+		kmem_free(old);
+	} else {
+		/* new lv, must pin the log item */
+		ASSERT(!lv->lv_item->li_lv);
+		ASSERT(list_empty(&item->li_cil));
+
+		len = lv->lv_buf_len;
+		diff_iovecs = lv->lv_niovecs;
+		IOP_PIN(lv->lv_item);
+
+	}
+	len += diff_iovecs * sizeof(xlog_op_header_t);
+
+	/* attach new log vector to log item */
+	lv->lv_item->li_lv = lv;
+
+	spin_lock(&cil->xc_cil_lock);
+	list_move_tail(&item->li_cil, &cil->xc_cil);
+	ctx->nvecs += diff_iovecs;
+
+	/*
+	 * If this is the first time the item is being committed to the CIL,
+	 * store the sequence number on the log item so we can tell
+	 * in future commits whether this is the first checkpoint the item is
+	 * being committed into.
+	 */
+	if (!item->li_seq)
+		item->li_seq = ctx->sequence;
+
+	/*
+	 * Now transfer enough transaction reservation to the context ticket
+	 * for the checkpoint. The context ticket is special - the unit
+	 * reservation has to grow as well as the current reservation as we
+	 * steal from tickets so we can correctly determine the space used
+	 * during the transaction commit.
+	 */
+	if (ctx->ticket->t_curr_res == 0) {
+		/* first commit in checkpoint, steal the header reservation */
+		ASSERT(ticket->t_curr_res >= ctx->ticket->t_unit_res + len);
+		ctx->ticket->t_curr_res = ctx->ticket->t_unit_res;
+		ticket->t_curr_res -= ctx->ticket->t_unit_res;
+	}
+
+	/* do we need space for more log record headers? */
+	iclog_space = log->l_iclog_size - log->l_iclog_hsize;
+	if (len > 0 && (ctx->space_used / iclog_space !=
+				(ctx->space_used + len) / iclog_space)) {
+		int hdrs;
+
+		hdrs = (len + iclog_space - 1) / iclog_space;
+		/* need to take into account split region headers, too */
+		hdrs *= log->l_iclog_hsize + sizeof(struct xlog_op_header);
+		ctx->ticket->t_unit_res += hdrs;
+		ctx->ticket->t_curr_res += hdrs;
+		ticket->t_curr_res -= hdrs;
+		ASSERT(ticket->t_curr_res >= len);
+	}
+	ticket->t_curr_res -= len;
+	ctx->space_used += len;
+
+	spin_unlock(&cil->xc_cil_lock);
+}
+
+/*
+ * Format log item into a flat buffers
+ *
+ * For delayed logging, we need to hold a formatted buffer containing all the
+ * changes on the log item. This enables us to relog the item in memory and
+ * write it out asynchronously without needing to relock the object that was
+ * modified at the time it gets written into the iclog.
+ *
+ * This function builds a vector for the changes in each log item in the
+ * transaction. It then works out the length of the buffer needed for each log
+ * item, allocates them and formats the vector for the item into the buffer.
+ * The buffer is then attached to the log item are then inserted into the
+ * Committed Item List for tracking until the next checkpoint is written out.
+ *
+ * We don't set up region headers during this process; we simply copy the
+ * regions into the flat buffer. We can do this because we still have to do a
+ * formatting step to write the regions into the iclog buffer.  Writing the
+ * ophdrs during the iclog write means that we can support splitting large
+ * regions across iclog boundares without needing a change in the format of the
+ * item/region encapsulation.
+ *
+ * Hence what we need to do now is change the rewrite the vector array to point
+ * to the copied region inside the buffer we just allocated. This allows us to
+ * format the regions into the iclog as though they are being formatted
+ * directly out of the objects themselves.
+ */
+static void
+xlog_cil_format_items(
+	struct log		*log,
+	struct xfs_log_vec	*log_vector,
+	struct xlog_ticket	*ticket,
+	xfs_lsn_t		*start_lsn)
+{
+	struct xfs_log_vec *lv;
+
+	if (start_lsn)
+		*start_lsn = log->l_cilp->xc_ctx->sequence;
+
+	ASSERT(log_vector);
+	for (lv = log_vector; lv; lv = lv->lv_next) {
+		void	*ptr;
+		int	index;
+		int	len = 0;
+
+		/* build the vector array and calculate it's length */
+		IOP_FORMAT(lv->lv_item, lv->lv_iovecp);
+		for (index = 0; index < lv->lv_niovecs; index++)
+			len += lv->lv_iovecp[index].i_len;
+
+		lv->lv_buf_len = len;
+		lv->lv_buf = kmem_zalloc(lv->lv_buf_len, KM_SLEEP|KM_NOFS);
+		ptr = lv->lv_buf;
+
+		for (index = 0; index < lv->lv_niovecs; index++) {
+			struct xfs_log_iovec *vec = &lv->lv_iovecp[index];
+
+			memcpy(ptr, vec->i_addr, vec->i_len);
+			vec->i_addr = ptr;
+			ptr += vec->i_len;
+		}
+		ASSERT(ptr == lv->lv_buf + lv->lv_buf_len);
+
+		xlog_cil_insert(log, ticket, lv->lv_item, lv);
+	}
+}
+
+static void
+xlog_cil_free_logvec(
+	struct xfs_log_vec	*log_vector)
+{
+	struct xfs_log_vec	*lv;
+
+	for (lv = log_vector; lv; ) {
+		struct xfs_log_vec *next = lv->lv_next;
+		kmem_free(lv->lv_buf);
+		kmem_free(lv);
+		lv = next;
+	}
+}
+
+/*
+ * Commit a transaction with the given vector to the Committed Item List.
+ *
+ * To do this, we need to format the item, pin it in memory if required and
+ * account for the space used by the transaction. Once we have done that we
+ * need to release the unused reservation for the transaction, attach the
+ * transaction to the checkpoint context so we carry the busy extents through
+ * to checkpoint completion, and then unlock all the items in the transaction.
+ *
+ * For more specific information about the order of operations in
+ * xfs_log_commit_cil() please refer to the comments in
+ * xfs_trans_commit_iclog().
+ *
+ * Called with the context lock already held in read mode to lock out
+ * background commit, returns without it held once background commits are
+ * allowed again.
+ */
+int
+xfs_log_commit_cil(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	struct xfs_log_vec	*log_vector,
+	xfs_lsn_t		*commit_lsn,
+	int			flags)
+{
+	struct log		*log = mp->m_log;
+	int			log_flags = 0;
+	int			push = 0;
+
+	if (flags & XFS_TRANS_RELEASE_LOG_RES)
+		log_flags = XFS_LOG_REL_PERM_RESERV;
+
+	if (XLOG_FORCED_SHUTDOWN(log)) {
+		xlog_cil_free_logvec(log_vector);
+		return XFS_ERROR(EIO);
+	}
+
+	/* lock out background commit */
+	down_read(&log->l_cilp->xc_ctx_lock);
+	xlog_cil_format_items(log, log_vector, tp->t_ticket, commit_lsn);
+
+	/* check we didn't blow the reservation */
+	if (tp->t_ticket->t_curr_res < 0)
+		xlog_print_tic_res(log->l_mp, tp->t_ticket);
+
+	/* attach the transaction to the CIL if it has any busy extents */
+	if (!list_empty(&tp->t_busy)) {
+		spin_lock(&log->l_cilp->xc_cil_lock);
+		list_splice_init(&tp->t_busy,
+					&log->l_cilp->xc_ctx->busy_extents);
+		spin_unlock(&log->l_cilp->xc_cil_lock);
+	}
+
+	tp->t_commit_lsn = *commit_lsn;
+	xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
+	xfs_trans_unreserve_and_mod_sb(tp);
+
+	/* check for background commit before unlock */
+	if (log->l_cilp->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log))
+		push = 1;
+	up_read(&log->l_cilp->xc_ctx_lock);
+
+	/*
+	 * We need to push CIL every so often so we don't cache more than we
+	 * can fit in the log. The limit really is that a checkpoint can't be
+	 * more than half the log (the current checkpoint is not allowed to
+	 * overwrite the previous checkpoint), but commit latency and memory
+	 * usage limit this to a smaller size in most cases.
+	 */
+	if (push)
+		xlog_cil_push(log, 0);
+	return 0;
+}
+
+/*
+ * Mark all items committed and clear busy extents. We free the log vector
+ * chains in a separate pass so that we unpin the log items as quickly as
+ * possible.
+ */
+static void
+xlog_cil_committed(
+	void	*args,
+	int	abort)
+{
+	struct xfs_cil_ctx	*ctx = args;
+	struct xfs_log_vec	*lv;
+	int			abortflag = abort ? XFS_LI_ABORTED : 0;
+	struct xfs_busy_extent	*busyp, *n;
+
+	/* unpin all the log items */
+	for (lv = ctx->lv_chain; lv; lv = lv->lv_next ) {
+		xfs_trans_item_committed(lv->lv_item, ctx->start_lsn,
+							abortflag);
+	}
+
+	list_for_each_entry_safe(busyp, n, &ctx->busy_extents, list)
+		xfs_alloc_busy_clear(ctx->cil->xc_log->l_mp, busyp);
+
+	spin_lock(&ctx->cil->xc_cil_lock);
+	list_del(&ctx->committing);
+	spin_unlock(&ctx->cil->xc_cil_lock);
+
+	xlog_cil_free_logvec(ctx->lv_chain);
+	kmem_free(ctx);
+}
+
+/*
+ * Push the Committed Item List to the log. If the push_now flag is not set,
+ * then it is a background flush and so we can chose to ignore it.
+ */
+int
+xlog_cil_push(
+	struct log		*log,
+	int			push_now)
+{
+	struct xfs_cil		*cil = log->l_cilp;
+	struct xfs_log_vec	*lv;
+	struct xfs_cil_ctx	*ctx;
+	struct xfs_cil_ctx	*new_ctx;
+	struct xlog_in_core	*commit_iclog;
+	struct xlog_ticket	*tic;
+	int			num_lv;
+	int			num_iovecs;
+	int			len;
+	int			error = 0;
+	struct xfs_trans_header thdr;
+	struct xfs_log_iovec	lhdr;
+	struct xfs_log_vec	lvhdr = { NULL };
+	xfs_lsn_t		commit_lsn;
+
+	if (!cil)
+		return 0;
+
+	new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS);
+	new_ctx->ticket = xlog_cil_ticket_alloc(log);
+
+	/* lock out transaction commit, but don't block on background push */
+	if (!down_write_trylock(&cil->xc_ctx_lock)) {
+		if (!push_now)
+			goto out_free_ticket;
+		down_write(&cil->xc_ctx_lock);
+	}
+	ctx = cil->xc_ctx;
+
+	/* check if we've anything to push */
+	if (list_empty(&cil->xc_cil))
+		goto out_skip;
+
+	/* check for spurious background flush */
+	if (!push_now && cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
+		goto out_skip;
+
+	/*
+	 * pull all the log vectors off the items in the CIL, and
+	 * remove the items from the CIL. We don't need the CIL lock
+	 * here because it's only needed on the transaction commit
+	 * side which is currently locked out by the flush lock.
+	 */
+	lv = NULL;
+	num_lv = 0;
+	num_iovecs = 0;
+	len = 0;
+	while (!list_empty(&cil->xc_cil)) {
+		struct xfs_log_item	*item;
+		int			i;
+
+		item = list_first_entry(&cil->xc_cil,
+					struct xfs_log_item, li_cil);
+		list_del_init(&item->li_cil);
+		if (!ctx->lv_chain)
+			ctx->lv_chain = item->li_lv;
+		else
+			lv->lv_next = item->li_lv;
+		lv = item->li_lv;
+		item->li_lv = NULL;
+
+		num_lv++;
+		num_iovecs += lv->lv_niovecs;
+		for (i = 0; i < lv->lv_niovecs; i++)
+			len += lv->lv_iovecp[i].i_len;
+	}
+
+	/*
+	 * initialise the new context and attach it to the CIL. Then attach
+	 * the current context to the CIL committing lsit so it can be found
+	 * during log forces to extract the commit lsn of the sequence that
+	 * needs to be forced.
+	 */
+	INIT_LIST_HEAD(&new_ctx->committing);
+	INIT_LIST_HEAD(&new_ctx->busy_extents);
+	new_ctx->sequence = ctx->sequence + 1;
+	new_ctx->cil = cil;
+	cil->xc_ctx = new_ctx;
+
+	/*
+	 * The switch is now done, so we can drop the context lock and move out
+	 * of a shared context. We can't just go straight to the commit record,
+	 * though - we need to synchronise with previous and future commits so
+	 * that the commit records are correctly ordered in the log to ensure
+	 * that we process items during log IO completion in the correct order.
+	 *
+	 * For example, if we get an EFI in one checkpoint and the EFD in the
+	 * next (e.g. due to log forces), we do not want the checkpoint with
+	 * the EFD to be committed before the checkpoint with the EFI.  Hence
+	 * we must strictly order the commit records of the checkpoints so
+	 * that: a) the checkpoint callbacks are attached to the iclogs in the
+	 * correct order; and b) the checkpoints are replayed in correct order
+	 * in log recovery.
+	 *
+	 * Hence we need to add this context to the committing context list so
+	 * that higher sequences will wait for us to write out a commit record
+	 * before they do.
+	 */
+	spin_lock(&cil->xc_cil_lock);
+	list_add(&ctx->committing, &cil->xc_committing);
+	spin_unlock(&cil->xc_cil_lock);
+	up_write(&cil->xc_ctx_lock);
+
+	/*
+	 * Build a checkpoint transaction header and write it to the log to
+	 * begin the transaction. We need to account for the space used by the
+	 * transaction header here as it is not accounted for in xlog_write().
+	 *
+	 * The LSN we need to pass to the log items on transaction commit is
+	 * the LSN reported by the first log vector write. If we use the commit
+	 * record lsn then we can move the tail beyond the grant write head.
+	 */
+	tic = ctx->ticket;
+	thdr.th_magic = XFS_TRANS_HEADER_MAGIC;
+	thdr.th_type = XFS_TRANS_CHECKPOINT;
+	thdr.th_tid = tic->t_tid;
+	thdr.th_num_items = num_iovecs;
+	lhdr.i_addr = (xfs_caddr_t)&thdr;
+	lhdr.i_len = sizeof(xfs_trans_header_t);
+	lhdr.i_type = XLOG_REG_TYPE_TRANSHDR;
+	tic->t_curr_res -= lhdr.i_len + sizeof(xlog_op_header_t);
+
+	lvhdr.lv_niovecs = 1;
+	lvhdr.lv_iovecp = &lhdr;
+	lvhdr.lv_next = ctx->lv_chain;
+
+	error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0);
+	if (error)
+		goto out_abort;
+
+	/*
+	 * now that we've written the checkpoint into the log, strictly
+	 * order the commit records so replay will get them in the right order.
+	 */
+restart:
+	spin_lock(&cil->xc_cil_lock);
+	list_for_each_entry(new_ctx, &cil->xc_committing, committing) {
+		/*
+		 * Higher sequences will wait for this one so skip them.
+		 * Don't wait for own own sequence, either.
+		 */
+		if (new_ctx->sequence >= ctx->sequence)
+			continue;
+		if (!new_ctx->commit_lsn) {
+			/*
+			 * It is still being pushed! Wait for the push to
+			 * complete, then start again from the beginning.
+			 */
+			sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0);
+			goto restart;
+		}
+	}
+	spin_unlock(&cil->xc_cil_lock);
+
+	commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0);
+	if (error || commit_lsn == -1)
+		goto out_abort;
+
+	/* attach all the transactions w/ busy extents to iclog */
+	ctx->log_cb.cb_func = xlog_cil_committed;
+	ctx->log_cb.cb_arg = ctx;
+	error = xfs_log_notify(log->l_mp, commit_iclog, &ctx->log_cb);
+	if (error)
+		goto out_abort;
+
+	/*
+	 * now the checkpoint commit is complete and we've attached the
+	 * callbacks to the iclog we can assign the commit LSN to the context
+	 * and wake up anyone who is waiting for the commit to complete.
+	 */
+	spin_lock(&cil->xc_cil_lock);
+	ctx->commit_lsn = commit_lsn;
+	sv_broadcast(&cil->xc_commit_wait);
+	spin_unlock(&cil->xc_cil_lock);
+
+	/* release the hounds! */
+	return xfs_log_release_iclog(log->l_mp, commit_iclog);
+
+out_skip:
+	up_write(&cil->xc_ctx_lock);
+out_free_ticket:
+	xfs_log_ticket_put(new_ctx->ticket);
+	kmem_free(new_ctx);
+	return 0;
+
+out_abort:
+	xlog_cil_committed(ctx, XFS_LI_ABORTED);
+	return XFS_ERROR(EIO);
+}
+
+/*
+ * Conditionally push the CIL based on the sequence passed in.
+ *
+ * We only need to push if we haven't already pushed the sequence
+ * number given. Hence the only time we will trigger a push here is
+ * if the push sequence is the same as the current context.
+ *
+ * We return the current commit lsn to allow the callers to determine if a
+ * iclog flush is necessary following this call.
+ *
+ * XXX: Initially, just push the CIL unconditionally and return whatever
+ * commit lsn is there. It'll be empty, so this is broken for now.
+ */
+xfs_lsn_t
+xlog_cil_push_lsn(
+	struct log	*log,
+	xfs_lsn_t	push_seq)
+{
+	struct xfs_cil		*cil = log->l_cilp;
+	struct xfs_cil_ctx	*ctx;
+	xfs_lsn_t		commit_lsn = NULLCOMMITLSN;
+
+restart:
+	down_write(&cil->xc_ctx_lock);
+	ASSERT(push_seq <= cil->xc_ctx->sequence);
+
+	/* check to see if we need to force out the current context */
+	if (push_seq == cil->xc_ctx->sequence) {
+		up_write(&cil->xc_ctx_lock);
+		xlog_cil_push(log, 1);
+		goto restart;
+	}
+
+	/*
+	 * See if we can find a previous sequence still committing.
+	 * We can drop the flush lock as soon as we have the cil lock
+	 * because we are now only comparing contexts protected by
+	 * the cil lock.
+	 *
+	 * We need to wait for all previous sequence commits to complete
+	 * before allowing the force of push_seq to go ahead. Hence block
+	 * on commits for those as well.
+	 */
+	spin_lock(&cil->xc_cil_lock);
+	up_write(&cil->xc_ctx_lock);
+	list_for_each_entry(ctx, &cil->xc_committing, committing) {
+		if (ctx->sequence > push_seq)
+			continue;
+		if (!ctx->commit_lsn) {
+			/*
+			 * It is still being pushed! Wait for the push to
+			 * complete, then start again from the beginning.
+			 */
+			sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0);
+			goto restart;
+		}
+		if (ctx->sequence != push_seq)
+			continue;
+		/* found it! */
+		commit_lsn = ctx->commit_lsn;
+	}
+	spin_unlock(&cil->xc_cil_lock);
+	return commit_lsn;
+}
+
+/*
+ * Check if the current log item was first committed in this sequence.
+ * We can't rely on just the log item being in the CIL, we have to check
+ * the recorded commit sequence number.
+ *
+ * Note: for this to be used in a non-racy manner, it has to be called with
+ * CIL flushing locked out. As a result, it should only be used during the
+ * transaction commit process when deciding what to format into the item.
+ */
+bool
+xfs_log_item_in_current_chkpt(
+	struct xfs_log_item *lip)
+{
+	struct xfs_cil_ctx *ctx;
+
+	if (!(lip->li_mountp->m_flags & XFS_MOUNT_DELAYLOG))
+		return false;
+	if (list_empty(&lip->li_cil))
+		return false;
+
+	ctx = lip->li_mountp->m_log->l_cilp->xc_ctx;
+
+	/*
+	 * li_seq is written on the first commit of a log item to record the
+	 * first checkpoint it is written to. Hence if it is different to the
+	 * current sequence, we're in a new checkpoint.
+	 */
+	if (XFS_LSN_CMP(lip->li_seq, ctx->sequence) != 0)
+		return false;
+	return true;
+}
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index d55662db7077..8c072618965c 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -152,8 +152,6 @@ static inline uint xlog_get_client_id(__be32 i)
 #define	XLOG_RECOVERY_NEEDED	0x4	/* log was recovered */
 #define XLOG_IO_ERROR		0x8	/* log hit an I/O error, and being
 					   shutdown */
-typedef __uint32_t xlog_tid_t;
-
 
 #ifdef __KERNEL__
 /*
@@ -379,6 +377,99 @@ typedef struct xlog_in_core {
 } xlog_in_core_t;
 
 /*
+ * The CIL context is used to aggregate per-transaction details as well be
+ * passed to the iclog for checkpoint post-commit processing.  After being
+ * passed to the iclog, another context needs to be allocated for tracking the
+ * next set of transactions to be aggregated into a checkpoint.
+ */
+struct xfs_cil;
+
+struct xfs_cil_ctx {
+	struct xfs_cil		*cil;
+	xfs_lsn_t		sequence;	/* chkpt sequence # */
+	xfs_lsn_t		start_lsn;	/* first LSN of chkpt commit */
+	xfs_lsn_t		commit_lsn;	/* chkpt commit record lsn */
+	struct xlog_ticket	*ticket;	/* chkpt ticket */
+	int			nvecs;		/* number of regions */
+	int			space_used;	/* aggregate size of regions */
+	struct list_head	busy_extents;	/* busy extents in chkpt */
+	struct xfs_log_vec	*lv_chain;	/* logvecs being pushed */
+	xfs_log_callback_t	log_cb;		/* completion callback hook. */
+	struct list_head	committing;	/* ctx committing list */
+};
+
+/*
+ * Committed Item List structure
+ *
+ * This structure is used to track log items that have been committed but not
+ * yet written into the log. It is used only when the delayed logging mount
+ * option is enabled.
+ *
+ * This structure tracks the list of committing checkpoint contexts so
+ * we can avoid the problem of having to hold out new transactions during a
+ * flush until we have a the commit record LSN of the checkpoint. We can
+ * traverse the list of committing contexts in xlog_cil_push_lsn() to find a
+ * sequence match and extract the commit LSN directly from there. If the
+ * checkpoint is still in the process of committing, we can block waiting for
+ * the commit LSN to be determined as well. This should make synchronous
+ * operations almost as efficient as the old logging methods.
+ */
+struct xfs_cil {
+	struct log		*xc_log;
+	struct list_head	xc_cil;
+	spinlock_t		xc_cil_lock;
+	struct xfs_cil_ctx	*xc_ctx;
+	struct rw_semaphore	xc_ctx_lock;
+	struct list_head	xc_committing;
+	sv_t			xc_commit_wait;
+};
+
+/*
+ * The amount of log space we should the CIL to aggregate is difficult to size.
+ * Whatever we chose we have to make we can get a reservation for the log space
+ * effectively, that it is large enough to capture sufficient relogging to
+ * reduce log buffer IO significantly, but it is not too large for the log or
+ * induces too much latency when writing out through the iclogs. We track both
+ * space consumed and the number of vectors in the checkpoint context, so we
+ * need to decide which to use for limiting.
+ *
+ * Every log buffer we write out during a push needs a header reserved, which
+ * is at least one sector and more for v2 logs. Hence we need a reservation of
+ * at least 512 bytes per 32k of log space just for the LR headers. That means
+ * 16KB of reservation per megabyte of delayed logging space we will consume,
+ * plus various headers.  The number of headers will vary based on the num of
+ * io vectors, so limiting on a specific number of vectors is going to result
+ * in transactions of varying size. IOWs, it is more consistent to track and
+ * limit space consumed in the log rather than by the number of objects being
+ * logged in order to prevent checkpoint ticket overruns.
+ *
+ * Further, use of static reservations through the log grant mechanism is
+ * problematic. It introduces a lot of complexity (e.g. reserve grant vs write
+ * grant) and a significant deadlock potential because regranting write space
+ * can block on log pushes. Hence if we have to regrant log space during a log
+ * push, we can deadlock.
+ *
+ * However, we can avoid this by use of a dynamic "reservation stealing"
+ * technique during transaction commit whereby unused reservation space in the
+ * transaction ticket is transferred to the CIL ctx commit ticket to cover the
+ * space needed by the checkpoint transaction. This means that we never need to
+ * specifically reserve space for the CIL checkpoint transaction, nor do we
+ * need to regrant space once the checkpoint completes. This also means the
+ * checkpoint transaction ticket is specific to the checkpoint context, rather
+ * than the CIL itself.
+ *
+ * With dynamic reservations, we can basically make up arbitrary limits for the
+ * checkpoint size so long as they don't violate any other size rules.  Hence
+ * the initial maximum size for the checkpoint transaction will be set to a
+ * quarter of the log or 8MB, which ever is smaller. 8MB is an arbitrary limit
+ * right now based on the latency of writing out a large amount of data through
+ * the circular iclog buffers.
+ */
+
+#define XLOG_CIL_SPACE_LIMIT(log)	\
+	(min((log->l_logsize >> 2), (8 * 1024 * 1024)))
+
+/*
  * The reservation head lsn is not made up of a cycle number and block number.
  * Instead, it uses a cycle number and byte number.  Logs don't expect to
  * overflow 31 bits worth of byte offset, so using a byte number will mean
@@ -388,6 +479,7 @@ typedef struct log {
 	/* The following fields don't need locking */
 	struct xfs_mount	*l_mp;	        /* mount point */
 	struct xfs_ail		*l_ailp;	/* AIL log is working with */
+	struct xfs_cil		*l_cilp;	/* CIL log is working with */
 	struct xfs_buf		*l_xbuf;        /* extra buffer for log
 						 * wrapping */
 	struct xfs_buftarg	*l_targ;        /* buftarg of log */
@@ -396,9 +488,7 @@ typedef struct log {
 	struct xfs_buf_cancel	**l_buf_cancel_table;
 	int			l_iclog_hsize;  /* size of iclog header */
 	int			l_iclog_heads;  /* # of iclog header sectors */
-	uint			l_sectbb_log;   /* log2 of sector size in BBs */
-	uint			l_sectbb_mask;  /* sector size (in BBs)
-						 * alignment mask */
+	uint			l_sectBBsize;   /* sector size in BBs (2^n) */
 	int			l_iclog_size;	/* size of log in bytes */
 	int			l_iclog_size_log; /* log power size of log */
 	int			l_iclog_bufs;	/* number of iclog buffers */
@@ -440,19 +530,40 @@ typedef struct log {
 
 #define XLOG_FORCED_SHUTDOWN(log)	((log)->l_flags & XLOG_IO_ERROR)
 
-
 /* common routines */
 extern xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp);
-extern int	 xlog_find_tail(xlog_t	*log,
-				xfs_daddr_t *head_blk,
-				xfs_daddr_t *tail_blk);
 extern int	 xlog_recover(xlog_t *log);
 extern int	 xlog_recover_finish(xlog_t *log);
 extern void	 xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int);
-extern struct xfs_buf *xlog_get_bp(xlog_t *, int);
-extern void	 xlog_put_bp(struct xfs_buf *);
 
-extern kmem_zone_t	*xfs_log_ticket_zone;
+extern kmem_zone_t *xfs_log_ticket_zone;
+struct xlog_ticket *xlog_ticket_alloc(struct log *log, int unit_bytes,
+				int count, char client, uint xflags,
+				int alloc_flags);
+
+
+static inline void
+xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes)
+{
+	*ptr += bytes;
+	*len -= bytes;
+	*off += bytes;
+}
+
+void	xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket);
+int	xlog_write(struct log *log, struct xfs_log_vec *log_vector,
+				struct xlog_ticket *tic, xfs_lsn_t *start_lsn,
+				xlog_in_core_t **commit_iclog, uint flags);
+
+/*
+ * Committed Item List interfaces
+ */
+int	xlog_cil_init(struct log *log);
+void	xlog_cil_init_post_recovery(struct log *log);
+void	xlog_cil_destroy(struct log *log);
+
+int	xlog_cil_push(struct log *log, int push_now);
+xfs_lsn_t xlog_cil_push_lsn(struct log *log, xfs_lsn_t push_sequence);
 
 /*
  * Unmount record type is used as a pseudo transaction type for the ticket.
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 69ac2e5ef20c..9ac5cfab27b9 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -50,51 +50,81 @@
 
 STATIC int	xlog_find_zeroed(xlog_t *, xfs_daddr_t *);
 STATIC int	xlog_clear_stale_blocks(xlog_t *, xfs_lsn_t);
-STATIC void	xlog_recover_insert_item_backq(xlog_recover_item_t **q,
-					       xlog_recover_item_t *item);
 #if defined(DEBUG)
 STATIC void	xlog_recover_check_summary(xlog_t *);
 #else
 #define	xlog_recover_check_summary(log)
 #endif
 
-
 /*
  * Sector aligned buffer routines for buffer create/read/write/access
  */
 
-#define XLOG_SECTOR_ROUNDUP_BBCOUNT(log, bbs)	\
-	( ((log)->l_sectbb_mask && (bbs & (log)->l_sectbb_mask)) ? \
-	((bbs + (log)->l_sectbb_mask + 1) & ~(log)->l_sectbb_mask) : (bbs) )
-#define XLOG_SECTOR_ROUNDDOWN_BLKNO(log, bno)	((bno) & ~(log)->l_sectbb_mask)
+/*
+ * Verify the given count of basic blocks is valid number of blocks
+ * to specify for an operation involving the given XFS log buffer.
+ * Returns nonzero if the count is valid, 0 otherwise.
+ */
 
-xfs_buf_t *
+static inline int
+xlog_buf_bbcount_valid(
+	xlog_t		*log,
+	int		bbcount)
+{
+	return bbcount > 0 && bbcount <= log->l_logBBsize;
+}
+
+/*
+ * Allocate a buffer to hold log data.  The buffer needs to be able
+ * to map to a range of nbblks basic blocks at any valid (basic
+ * block) offset within the log.
+ */
+STATIC xfs_buf_t *
 xlog_get_bp(
 	xlog_t		*log,
 	int		nbblks)
 {
-	if (nbblks <= 0 || nbblks > log->l_logBBsize) {
-		xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks);
-		XFS_ERROR_REPORT("xlog_get_bp(1)",
-				 XFS_ERRLEVEL_HIGH, log->l_mp);
+	if (!xlog_buf_bbcount_valid(log, nbblks)) {
+		xlog_warn("XFS: Invalid block length (0x%x) given for buffer",
+			nbblks);
+		XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
 		return NULL;
 	}
 
-	if (log->l_sectbb_log) {
-		if (nbblks > 1)
-			nbblks += XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1);
-		nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks);
-	}
+	/*
+	 * We do log I/O in units of log sectors (a power-of-2
+	 * multiple of the basic block size), so we round up the
+	 * requested size to acommodate the basic blocks required
+	 * for complete log sectors.
+	 *
+	 * In addition, the buffer may be used for a non-sector-
+	 * aligned block offset, in which case an I/O of the
+	 * requested size could extend beyond the end of the
+	 * buffer.  If the requested size is only 1 basic block it
+	 * will never straddle a sector boundary, so this won't be
+	 * an issue.  Nor will this be a problem if the log I/O is
+	 * done in basic blocks (sector size 1).  But otherwise we
+	 * extend the buffer by one extra log sector to ensure
+	 * there's space to accomodate this possiblility.
+	 */
+	if (nbblks > 1 && log->l_sectBBsize > 1)
+		nbblks += log->l_sectBBsize;
+	nbblks = round_up(nbblks, log->l_sectBBsize);
+
 	return xfs_buf_get_noaddr(BBTOB(nbblks), log->l_mp->m_logdev_targp);
 }
 
-void
+STATIC void
 xlog_put_bp(
 	xfs_buf_t	*bp)
 {
 	xfs_buf_free(bp);
 }
 
+/*
+ * Return the address of the start of the given block number's data
+ * in a log buffer.  The buffer covers a log sector-aligned region.
+ */
 STATIC xfs_caddr_t
 xlog_align(
 	xlog_t		*log,
@@ -102,15 +132,10 @@ xlog_align(
 	int		nbblks,
 	xfs_buf_t	*bp)
 {
-	xfs_caddr_t	ptr;
-
-	if (!log->l_sectbb_log)
-		return XFS_BUF_PTR(bp);
+	xfs_daddr_t	offset = blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1);
 
-	ptr = XFS_BUF_PTR(bp) + BBTOB((int)blk_no & log->l_sectbb_mask);
-	ASSERT(XFS_BUF_SIZE(bp) >=
-		BBTOB(nbblks + (blk_no & log->l_sectbb_mask)));
-	return ptr;
+	ASSERT(BBTOB(offset + nbblks) <= XFS_BUF_SIZE(bp));
+	return XFS_BUF_PTR(bp) + BBTOB(offset);
 }
 
 
@@ -126,21 +151,18 @@ xlog_bread_noalign(
 {
 	int		error;
 
-	if (nbblks <= 0 || nbblks > log->l_logBBsize) {
-		xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks);
-		XFS_ERROR_REPORT("xlog_bread(1)",
-				 XFS_ERRLEVEL_HIGH, log->l_mp);
+	if (!xlog_buf_bbcount_valid(log, nbblks)) {
+		xlog_warn("XFS: Invalid block length (0x%x) given for buffer",
+			nbblks);
+		XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
 		return EFSCORRUPTED;
 	}
 
-	if (log->l_sectbb_log) {
-		blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no);
-		nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks);
-	}
+	blk_no = round_down(blk_no, log->l_sectBBsize);
+	nbblks = round_up(nbblks, log->l_sectBBsize);
 
 	ASSERT(nbblks > 0);
 	ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp));
-	ASSERT(bp);
 
 	XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
 	XFS_BUF_READ(bp);
@@ -188,17 +210,15 @@ xlog_bwrite(
 {
 	int		error;
 
-	if (nbblks <= 0 || nbblks > log->l_logBBsize) {
-		xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks);
-		XFS_ERROR_REPORT("xlog_bwrite(1)",
-				 XFS_ERRLEVEL_HIGH, log->l_mp);
+	if (!xlog_buf_bbcount_valid(log, nbblks)) {
+		xlog_warn("XFS: Invalid block length (0x%x) given for buffer",
+			nbblks);
+		XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
 		return EFSCORRUPTED;
 	}
 
-	if (log->l_sectbb_log) {
-		blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no);
-		nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks);
-	}
+	blk_no = round_down(blk_no, log->l_sectBBsize);
+	nbblks = round_up(nbblks, log->l_sectBBsize);
 
 	ASSERT(nbblks > 0);
 	ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp));
@@ -329,39 +349,38 @@ xlog_find_cycle_start(
 {
 	xfs_caddr_t	offset;
 	xfs_daddr_t	mid_blk;
+	xfs_daddr_t	end_blk;
 	uint		mid_cycle;
 	int		error;
 
-	mid_blk = BLK_AVG(first_blk, *last_blk);
-	while (mid_blk != first_blk && mid_blk != *last_blk) {
+	end_blk = *last_blk;
+	mid_blk = BLK_AVG(first_blk, end_blk);
+	while (mid_blk != first_blk && mid_blk != end_blk) {
 		error = xlog_bread(log, mid_blk, 1, bp, &offset);
 		if (error)
 			return error;
 		mid_cycle = xlog_get_cycle(offset);
-		if (mid_cycle == cycle) {
-			*last_blk = mid_blk;
-			/* last_half_cycle == mid_cycle */
-		} else {
-			first_blk = mid_blk;
-			/* first_half_cycle == mid_cycle */
-		}
-		mid_blk = BLK_AVG(first_blk, *last_blk);
+		if (mid_cycle == cycle)
+			end_blk = mid_blk;   /* last_half_cycle == mid_cycle */
+		else
+			first_blk = mid_blk; /* first_half_cycle == mid_cycle */
+		mid_blk = BLK_AVG(first_blk, end_blk);
 	}
-	ASSERT((mid_blk == first_blk && mid_blk+1 == *last_blk) ||
-	       (mid_blk == *last_blk && mid_blk-1 == first_blk));
+	ASSERT((mid_blk == first_blk && mid_blk+1 == end_blk) ||
+	       (mid_blk == end_blk && mid_blk-1 == first_blk));
+
+	*last_blk = end_blk;
 
 	return 0;
 }
 
 /*
- * Check that the range of blocks does not contain the cycle number
- * given.  The scan needs to occur from front to back and the ptr into the
- * region must be updated since a later routine will need to perform another
- * test.  If the region is completely good, we end up returning the same
- * last block number.
- *
- * Set blkno to -1 if we encounter no errors.  This is an invalid block number
- * since we don't ever expect logs to get this large.
+ * Check that a range of blocks does not contain stop_on_cycle_no.
+ * Fill in *new_blk with the block offset where such a block is
+ * found, or with -1 (an invalid block number) if there is no such
+ * block in the range.  The scan needs to occur from front to back
+ * and the pointer into the region must be updated since a later
+ * routine will need to perform another test.
  */
 STATIC int
 xlog_find_verify_cycle(
@@ -378,12 +397,16 @@ xlog_find_verify_cycle(
 	xfs_caddr_t	buf = NULL;
 	int		error = 0;
 
+	/*
+	 * Greedily allocate a buffer big enough to handle the full
+	 * range of basic blocks we'll be examining.  If that fails,
+	 * try a smaller size.  We need to be able to read at least
+	 * a log sector, or we're out of luck.
+	 */
 	bufblks = 1 << ffs(nbblks);
-
 	while (!(bp = xlog_get_bp(log, bufblks))) {
-		/* can't get enough memory to do everything in one big buffer */
 		bufblks >>= 1;
-		if (bufblks <= log->l_sectbb_log)
+		if (bufblks < log->l_sectBBsize)
 			return ENOMEM;
 	}
 
@@ -631,7 +654,7 @@ xlog_find_head(
 		 * In this case we want to find the first block with cycle
 		 * number matching last_half_cycle.  We expect the log to be
 		 * some variation on
-		 *        x + 1 ... | x ...
+		 *        x + 1 ... | x ... | x
 		 * The first block with cycle number x (last_half_cycle) will
 		 * be where the new head belongs.  First we do a binary search
 		 * for the first occurrence of last_half_cycle.  The binary
@@ -641,11 +664,13 @@ xlog_find_head(
 		 * the log, then we look for occurrences of last_half_cycle - 1
 		 * at the end of the log.  The cases we're looking for look
 		 * like
-		 *        x + 1 ... | x | x + 1 | x ...
-		 *                               ^ binary search stopped here
+		 *                               v binary search stopped here
+		 *        x + 1 ... | x | x + 1 | x ... | x
+		 *                   ^ but we want to locate this spot
 		 * or
-		 *        x + 1 ... | x ... | x - 1 | x
 		 *        <---------> less than scan distance
+		 *        x + 1 ... | x ... | x - 1 | x
+		 *                           ^ we want to locate this spot
 		 */
 		stop_on_cycle = last_half_cycle;
 		if ((error = xlog_find_cycle_start(log, bp, first_blk,
@@ -701,16 +726,16 @@ xlog_find_head(
 		 * certainly not the head of the log.  By searching for
 		 * last_half_cycle-1 we accomplish that.
 		 */
-		start_blk = log_bbnum - num_scan_bblks + head_blk;
 		ASSERT(head_blk <= INT_MAX &&
-			(xfs_daddr_t) num_scan_bblks - head_blk >= 0);
+			(xfs_daddr_t) num_scan_bblks >= head_blk);
+		start_blk = log_bbnum - (num_scan_bblks - head_blk);
 		if ((error = xlog_find_verify_cycle(log, start_blk,
 					num_scan_bblks - (int)head_blk,
 					(stop_on_cycle - 1), &new_blk)))
 			goto bp_err;
 		if (new_blk != -1) {
 			head_blk = new_blk;
-			goto bad_blk;
+			goto validate_head;
 		}
 
 		/*
@@ -728,7 +753,7 @@ xlog_find_head(
 			head_blk = new_blk;
 	}
 
- bad_blk:
+validate_head:
 	/*
 	 * Now we need to make sure head_blk is not pointing to a block in
 	 * the middle of a log record.
@@ -750,7 +775,7 @@ xlog_find_head(
 		if ((error = xlog_find_verify_log_record(log, start_blk,
 							&head_blk, 0)) == -1) {
 			/* We hit the beginning of the log during our search */
-			start_blk = log_bbnum - num_scan_bblks + head_blk;
+			start_blk = log_bbnum - (num_scan_bblks - head_blk);
 			new_blk = log_bbnum;
 			ASSERT(start_blk <= INT_MAX &&
 				(xfs_daddr_t) log_bbnum-start_blk >= 0);
@@ -805,7 +830,7 @@ xlog_find_head(
  * We could speed up search by using current head_blk buffer, but it is not
  * available.
  */
-int
+STATIC int
 xlog_find_tail(
 	xlog_t			*log,
 	xfs_daddr_t		*head_blk,
@@ -835,12 +860,12 @@ xlog_find_tail(
 	if (*head_blk == 0) {				/* special case */
 		error = xlog_bread(log, 0, 1, bp, &offset);
 		if (error)
-			goto bread_err;
+			goto done;
 
 		if (xlog_get_cycle(offset) == 0) {
 			*tail_blk = 0;
 			/* leave all other log inited values alone */
-			goto exit;
+			goto done;
 		}
 	}
 
@@ -851,7 +876,7 @@ xlog_find_tail(
 	for (i = (int)(*head_blk) - 1; i >= 0; i--) {
 		error = xlog_bread(log, i, 1, bp, &offset);
 		if (error)
-			goto bread_err;
+			goto done;
 
 		if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(*(__be32 *)offset)) {
 			found = 1;
@@ -868,7 +893,7 @@ xlog_find_tail(
 		for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) {
 			error = xlog_bread(log, i, 1, bp, &offset);
 			if (error)
-				goto bread_err;
+				goto done;
 
 			if (XLOG_HEADER_MAGIC_NUM ==
 			    be32_to_cpu(*(__be32 *)offset)) {
@@ -943,7 +968,7 @@ xlog_find_tail(
 		umount_data_blk = (i + hblks) % log->l_logBBsize;
 		error = xlog_bread(log, umount_data_blk, 1, bp, &offset);
 		if (error)
-			goto bread_err;
+			goto done;
 
 		op_head = (xlog_op_header_t *)offset;
 		if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) {
@@ -989,12 +1014,10 @@ xlog_find_tail(
 	 * But... if the -device- itself is readonly, just skip this.
 	 * We can't recover this device anyway, so it won't matter.
 	 */
-	if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp)) {
+	if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp))
 		error = xlog_clear_stale_blocks(log, tail_lsn);
-	}
 
-bread_err:
-exit:
+done:
 	xlog_put_bp(bp);
 
 	if (error)
@@ -1154,16 +1177,22 @@ xlog_write_log_records(
 	xfs_caddr_t	offset;
 	xfs_buf_t	*bp;
 	int		balign, ealign;
-	int		sectbb = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1);
+	int		sectbb = log->l_sectBBsize;
 	int		end_block = start_block + blocks;
 	int		bufblks;
 	int		error = 0;
 	int		i, j = 0;
 
+	/*
+	 * Greedily allocate a buffer big enough to handle the full
+	 * range of basic blocks to be written.  If that fails, try
+	 * a smaller size.  We need to be able to write at least a
+	 * log sector, or we're out of luck.
+	 */
 	bufblks = 1 << ffs(blocks);
 	while (!(bp = xlog_get_bp(log, bufblks))) {
 		bufblks >>= 1;
-		if (bufblks <= log->l_sectbb_log)
+		if (bufblks < sectbb)
 			return ENOMEM;
 	}
 
@@ -1171,7 +1200,7 @@ xlog_write_log_records(
 	 * the buffer in the starting sector not covered by the first
 	 * write below.
 	 */
-	balign = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, start_block);
+	balign = round_down(start_block, sectbb);
 	if (balign != start_block) {
 		error = xlog_bread_noalign(log, start_block, 1, bp);
 		if (error)
@@ -1190,7 +1219,7 @@ xlog_write_log_records(
 		 * the buffer in the final sector not covered by the write.
 		 * If this is the same sector as the above read, skip it.
 		 */
-		ealign = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, end_block);
+		ealign = round_down(end_block, sectbb);
 		if (j == 0 && (start_block + endcount > ealign)) {
 			offset = XFS_BUF_PTR(bp);
 			balign = BBTOB(ealign - start_block);
@@ -1367,40 +1396,50 @@ xlog_clear_stale_blocks(
 
 STATIC xlog_recover_t *
 xlog_recover_find_tid(
-	xlog_recover_t		*q,
+	struct hlist_head	*head,
 	xlog_tid_t		tid)
 {
-	xlog_recover_t		*p = q;
+	xlog_recover_t		*trans;
+	struct hlist_node	*n;
 
-	while (p != NULL) {
-		if (p->r_log_tid == tid)
-		    break;
-		p = p->r_next;
+	hlist_for_each_entry(trans, n, head, r_list) {
+		if (trans->r_log_tid == tid)
+			return trans;
 	}
-	return p;
+	return NULL;
 }
 
 STATIC void
-xlog_recover_put_hashq(
-	xlog_recover_t		**q,
-	xlog_recover_t		*trans)
+xlog_recover_new_tid(
+	struct hlist_head	*head,
+	xlog_tid_t		tid,
+	xfs_lsn_t		lsn)
 {
-	trans->r_next = *q;
-	*q = trans;
+	xlog_recover_t		*trans;
+
+	trans = kmem_zalloc(sizeof(xlog_recover_t), KM_SLEEP);
+	trans->r_log_tid   = tid;
+	trans->r_lsn	   = lsn;
+	INIT_LIST_HEAD(&trans->r_itemq);
+
+	INIT_HLIST_NODE(&trans->r_list);
+	hlist_add_head(&trans->r_list, head);
 }
 
 STATIC void
 xlog_recover_add_item(
-	xlog_recover_item_t	**itemq)
+	struct list_head	*head)
 {
 	xlog_recover_item_t	*item;
 
 	item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP);
-	xlog_recover_insert_item_backq(itemq, item);
+	INIT_LIST_HEAD(&item->ri_list);
+	list_add_tail(&item->ri_list, head);
 }
 
 STATIC int
 xlog_recover_add_to_cont_trans(
+	struct log		*log,
 	xlog_recover_t		*trans,
 	xfs_caddr_t		dp,
 	int			len)
@@ -1409,8 +1448,7 @@ xlog_recover_add_to_cont_trans(
 	xfs_caddr_t		ptr, old_ptr;
 	int			old_len;
 
-	item = trans->r_itemq;
-	if (item == NULL) {
+	if (list_empty(&trans->r_itemq)) {
 		/* finish copying rest of trans header */
 		xlog_recover_add_item(&trans->r_itemq);
 		ptr = (xfs_caddr_t) &trans->r_theader +
@@ -1418,7 +1456,8 @@ xlog_recover_add_to_cont_trans(
 		memcpy(ptr, dp, len); /* d, s, l */
 		return 0;
 	}
-	item = item->ri_prev;
+	/* take the tail entry */
+	item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
 
 	old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
 	old_len = item->ri_buf[item->ri_cnt-1].i_len;
@@ -1427,6 +1466,7 @@ xlog_recover_add_to_cont_trans(
 	memcpy(&ptr[old_len], dp, len); /* d, s, l */
 	item->ri_buf[item->ri_cnt-1].i_len += len;
 	item->ri_buf[item->ri_cnt-1].i_addr = ptr;
+	trace_xfs_log_recover_item_add_cont(log, trans, item, 0);
 	return 0;
 }
 
@@ -1445,6 +1485,7 @@ xlog_recover_add_to_cont_trans(
  */
 STATIC int
 xlog_recover_add_to_trans(
+	struct log		*log,
 	xlog_recover_t		*trans,
 	xfs_caddr_t		dp,
 	int			len)
@@ -1455,8 +1496,7 @@ xlog_recover_add_to_trans(
 
 	if (!len)
 		return 0;
-	item = trans->r_itemq;
-	if (item == NULL) {
+	if (list_empty(&trans->r_itemq)) {
 		/* we need to catch log corruptions here */
 		if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) {
 			xlog_warn("XFS: xlog_recover_add_to_trans: "
@@ -1474,12 +1514,15 @@ xlog_recover_add_to_trans(
 	memcpy(ptr, dp, len);
 	in_f = (xfs_inode_log_format_t *)ptr;
 
-	if (item->ri_prev->ri_total != 0 &&
-	     item->ri_prev->ri_total == item->ri_prev->ri_cnt) {
+	/* take the tail entry */
+	item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
+	if (item->ri_total != 0 &&
+	     item->ri_total == item->ri_cnt) {
+		/* tail item is in use, get a new one */
 		xlog_recover_add_item(&trans->r_itemq);
+		item = list_entry(trans->r_itemq.prev,
+					xlog_recover_item_t, ri_list);
 	}
-	item = trans->r_itemq;
-	item = item->ri_prev;
 
 	if (item->ri_total == 0) {		/* first region to be added */
 		if (in_f->ilf_size == 0 ||
@@ -1501,99 +1544,37 @@ xlog_recover_add_to_trans(
 	item->ri_buf[item->ri_cnt].i_addr = ptr;
 	item->ri_buf[item->ri_cnt].i_len  = len;
 	item->ri_cnt++;
+	trace_xfs_log_recover_item_add(log, trans, item, 0);
 	return 0;
 }
 
-STATIC void
-xlog_recover_new_tid(
-	xlog_recover_t		**q,
-	xlog_tid_t		tid,
-	xfs_lsn_t		lsn)
-{
-	xlog_recover_t		*trans;
-
-	trans = kmem_zalloc(sizeof(xlog_recover_t), KM_SLEEP);
-	trans->r_log_tid   = tid;
-	trans->r_lsn	   = lsn;
-	xlog_recover_put_hashq(q, trans);
-}
-
-STATIC int
-xlog_recover_unlink_tid(
-	xlog_recover_t		**q,
-	xlog_recover_t		*trans)
-{
-	xlog_recover_t		*tp;
-	int			found = 0;
-
-	ASSERT(trans != NULL);
-	if (trans == *q) {
-		*q = (*q)->r_next;
-	} else {
-		tp = *q;
-		while (tp) {
-			if (tp->r_next == trans) {
-				found = 1;
-				break;
-			}
-			tp = tp->r_next;
-		}
-		if (!found) {
-			xlog_warn(
-			     "XFS: xlog_recover_unlink_tid: trans not found");
-			ASSERT(0);
-			return XFS_ERROR(EIO);
-		}
-		tp->r_next = tp->r_next->r_next;
-	}
-	return 0;
-}
-
-STATIC void
-xlog_recover_insert_item_backq(
-	xlog_recover_item_t	**q,
-	xlog_recover_item_t	*item)
-{
-	if (*q == NULL) {
-		item->ri_prev = item->ri_next = item;
-		*q = item;
-	} else {
-		item->ri_next		= *q;
-		item->ri_prev		= (*q)->ri_prev;
-		(*q)->ri_prev		= item;
-		item->ri_prev->ri_next	= item;
-	}
-}
-
-STATIC void
-xlog_recover_insert_item_frontq(
-	xlog_recover_item_t	**q,
-	xlog_recover_item_t	*item)
-{
-	xlog_recover_insert_item_backq(q, item);
-	*q = item;
-}
-
+/*
+ * Sort the log items in the transaction. Cancelled buffers need
+ * to be put first so they are processed before any items that might
+ * modify the buffers. If they are cancelled, then the modifications
+ * don't need to be replayed.
+ */
 STATIC int
 xlog_recover_reorder_trans(
-	xlog_recover_t		*trans)
+	struct log		*log,
+	xlog_recover_t		*trans,
+	int			pass)
 {
-	xlog_recover_item_t	*first_item, *itemq, *itemq_next;
-	xfs_buf_log_format_t	*buf_f;
-	ushort			flags = 0;
+	xlog_recover_item_t	*item, *n;
+	LIST_HEAD(sort_list);
+
+	list_splice_init(&trans->r_itemq, &sort_list);
+	list_for_each_entry_safe(item, n, &sort_list, ri_list) {
+		xfs_buf_log_format_t	*buf_f;
 
-	first_item = itemq = trans->r_itemq;
-	trans->r_itemq = NULL;
-	do {
-		itemq_next = itemq->ri_next;
-		buf_f = (xfs_buf_log_format_t *)itemq->ri_buf[0].i_addr;
+		buf_f = (xfs_buf_log_format_t *)item->ri_buf[0].i_addr;
 
-		switch (ITEM_TYPE(itemq)) {
+		switch (ITEM_TYPE(item)) {
 		case XFS_LI_BUF:
-			flags = buf_f->blf_flags;
-			if (!(flags & XFS_BLI_CANCEL)) {
-				xlog_recover_insert_item_frontq(&trans->r_itemq,
-								itemq);
+			if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) {
+				trace_xfs_log_recover_item_reorder_head(log,
+							trans, item, pass);
+				list_move(&item->ri_list, &trans->r_itemq);
 				break;
 			}
 		case XFS_LI_INODE:
@@ -1601,7 +1582,9 @@ xlog_recover_reorder_trans(
 		case XFS_LI_QUOTAOFF:
 		case XFS_LI_EFD:
 		case XFS_LI_EFI:
-			xlog_recover_insert_item_backq(&trans->r_itemq, itemq);
+			trace_xfs_log_recover_item_reorder_tail(log,
+							trans, item, pass);
+			list_move_tail(&item->ri_list, &trans->r_itemq);
 			break;
 		default:
 			xlog_warn(
@@ -1609,8 +1592,8 @@ xlog_recover_reorder_trans(
 			ASSERT(0);
 			return XFS_ERROR(EIO);
 		}
-		itemq = itemq_next;
-	} while (first_item != itemq);
+	}
+	ASSERT(list_empty(&sort_list));
 	return 0;
 }
 
@@ -1650,8 +1633,10 @@ xlog_recover_do_buffer_pass1(
 	/*
 	 * If this isn't a cancel buffer item, then just return.
 	 */
-	if (!(flags & XFS_BLI_CANCEL))
+	if (!(flags & XFS_BLF_CANCEL)) {
+		trace_xfs_log_recover_buf_not_cancel(log, buf_f);
 		return;
+	}
 
 	/*
 	 * Insert an xfs_buf_cancel record into the hash table of
@@ -1685,6 +1670,7 @@ xlog_recover_do_buffer_pass1(
 	while (nextp != NULL) {
 		if (nextp->bc_blkno == blkno && nextp->bc_len == len) {
 			nextp->bc_refcount++;
+			trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f);
 			return;
 		}
 		prevp = nextp;
@@ -1698,13 +1684,14 @@ xlog_recover_do_buffer_pass1(
 	bcp->bc_refcount = 1;
 	bcp->bc_next = NULL;
 	prevp->bc_next = bcp;
+	trace_xfs_log_recover_buf_cancel_add(log, buf_f);
 }
 
 /*
  * Check to see whether the buffer being recovered has a corresponding
  * entry in the buffer cancel record table.  If it does then return 1
  * so that it will be cancelled, otherwise return 0.  If the buffer is
- * actually a buffer cancel item (XFS_BLI_CANCEL is set), then decrement
+ * actually a buffer cancel item (XFS_BLF_CANCEL is set), then decrement
  * the refcount on the entry in the table and remove it from the table
  * if this is the last reference.
  *
@@ -1729,7 +1716,7 @@ xlog_check_buffer_cancelled(
 		 * There is nothing in the table built in pass one,
 		 * so this buffer must not be cancelled.
 		 */
-		ASSERT(!(flags & XFS_BLI_CANCEL));
+		ASSERT(!(flags & XFS_BLF_CANCEL));
 		return 0;
 	}
 
@@ -1741,7 +1728,7 @@ xlog_check_buffer_cancelled(
 		 * There is no corresponding entry in the table built
 		 * in pass one, so this buffer has not been cancelled.
 		 */
-		ASSERT(!(flags & XFS_BLI_CANCEL));
+		ASSERT(!(flags & XFS_BLF_CANCEL));
 		return 0;
 	}
 
@@ -1760,7 +1747,7 @@ xlog_check_buffer_cancelled(
 			 * one in the table and remove it if this is the
 			 * last reference.
 			 */
-			if (flags & XFS_BLI_CANCEL) {
+			if (flags & XFS_BLF_CANCEL) {
 				bcp->bc_refcount--;
 				if (bcp->bc_refcount == 0) {
 					if (prevp == NULL) {
@@ -1780,7 +1767,7 @@ xlog_check_buffer_cancelled(
 	 * We didn't find a corresponding entry in the table, so
 	 * return 0 so that the buffer is NOT cancelled.
 	 */
-	ASSERT(!(flags & XFS_BLI_CANCEL));
+	ASSERT(!(flags & XFS_BLF_CANCEL));
 	return 0;
 }
 
@@ -1837,6 +1824,8 @@ xlog_recover_do_inode_buffer(
 	unsigned int		*data_map = NULL;
 	unsigned int		map_size = 0;
 
+	trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f);
+
 	switch (buf_f->blf_type) {
 	case XFS_LI_BUF:
 		data_map = buf_f->blf_data_map;
@@ -1880,8 +1869,8 @@ xlog_recover_do_inode_buffer(
 			nbits = xfs_contig_bits(data_map, map_size,
 							 bit);
 			ASSERT(nbits > 0);
-			reg_buf_offset = bit << XFS_BLI_SHIFT;
-			reg_buf_bytes = nbits << XFS_BLI_SHIFT;
+			reg_buf_offset = bit << XFS_BLF_SHIFT;
+			reg_buf_bytes = nbits << XFS_BLF_SHIFT;
 			item_index++;
 		}
 
@@ -1895,7 +1884,7 @@ xlog_recover_do_inode_buffer(
 		}
 
 		ASSERT(item->ri_buf[item_index].i_addr != NULL);
-		ASSERT((item->ri_buf[item_index].i_len % XFS_BLI_CHUNK) == 0);
+		ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0);
 		ASSERT((reg_buf_offset + reg_buf_bytes) <= XFS_BUF_COUNT(bp));
 
 		/*
@@ -1932,6 +1921,7 @@ xlog_recover_do_inode_buffer(
 /*ARGSUSED*/
 STATIC void
 xlog_recover_do_reg_buffer(
+	struct xfs_mount	*mp,
 	xlog_recover_item_t	*item,
 	xfs_buf_t		*bp,
 	xfs_buf_log_format_t	*buf_f)
@@ -1943,6 +1933,8 @@ xlog_recover_do_reg_buffer(
 	unsigned int		map_size = 0;
 	int                     error;
 
+	trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f);
+
 	switch (buf_f->blf_type) {
 	case XFS_LI_BUF:
 		data_map = buf_f->blf_data_map;
@@ -1958,9 +1950,9 @@ xlog_recover_do_reg_buffer(
 		nbits = xfs_contig_bits(data_map, map_size, bit);
 		ASSERT(nbits > 0);
 		ASSERT(item->ri_buf[i].i_addr != NULL);
-		ASSERT(item->ri_buf[i].i_len % XFS_BLI_CHUNK == 0);
+		ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0);
 		ASSERT(XFS_BUF_COUNT(bp) >=
-		       ((uint)bit << XFS_BLI_SHIFT)+(nbits<<XFS_BLI_SHIFT));
+		       ((uint)bit << XFS_BLF_SHIFT)+(nbits<<XFS_BLF_SHIFT));
 
 		/*
 		 * Do a sanity check if this is a dquot buffer. Just checking
@@ -1969,7 +1961,7 @@ xlog_recover_do_reg_buffer(
 		 */
 		error = 0;
 		if (buf_f->blf_flags &
-		   (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) {
+		   (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
 			if (item->ri_buf[i].i_addr == NULL) {
 				cmn_err(CE_ALERT,
 					"XFS: NULL dquot in %s.", __func__);
@@ -1990,9 +1982,9 @@ xlog_recover_do_reg_buffer(
 		}
 
 		memcpy(xfs_buf_offset(bp,
-			(uint)bit << XFS_BLI_SHIFT),	/* dest */
+			(uint)bit << XFS_BLF_SHIFT),	/* dest */
 			item->ri_buf[i].i_addr,		/* source */
-			nbits<<XFS_BLI_SHIFT);		/* length */
+			nbits<<XFS_BLF_SHIFT);		/* length */
  next:
 		i++;
 		bit += nbits;
@@ -2141,6 +2133,8 @@ xlog_recover_do_dquot_buffer(
 {
 	uint			type;
 
+	trace_xfs_log_recover_buf_dquot_buf(log, buf_f);
+
 	/*
 	 * Filesystems are required to send in quota flags at mount time.
 	 */
@@ -2149,11 +2143,11 @@ xlog_recover_do_dquot_buffer(
 	}
 
 	type = 0;
-	if (buf_f->blf_flags & XFS_BLI_UDQUOT_BUF)
+	if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF)
 		type |= XFS_DQ_USER;
-	if (buf_f->blf_flags & XFS_BLI_PDQUOT_BUF)
+	if (buf_f->blf_flags & XFS_BLF_PDQUOT_BUF)
 		type |= XFS_DQ_PROJ;
-	if (buf_f->blf_flags & XFS_BLI_GDQUOT_BUF)
+	if (buf_f->blf_flags & XFS_BLF_GDQUOT_BUF)
 		type |= XFS_DQ_GROUP;
 	/*
 	 * This type of quotas was turned off, so ignore this buffer
@@ -2161,7 +2155,7 @@ xlog_recover_do_dquot_buffer(
 	if (log->l_quotaoffs_flag & type)
 		return;
 
-	xlog_recover_do_reg_buffer(item, bp, buf_f);
+	xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
 }
 
 /*
@@ -2174,7 +2168,7 @@ xlog_recover_do_dquot_buffer(
  * here which overlaps that may be stale.
  *
  * When meta-data buffers are freed at run time we log a buffer item
- * with the XFS_BLI_CANCEL bit set to indicate that previous copies
+ * with the XFS_BLF_CANCEL bit set to indicate that previous copies
  * of the buffer in the log should not be replayed at recovery time.
  * This is so that if the blocks covered by the buffer are reused for
  * file data before we crash we don't end up replaying old, freed
@@ -2208,7 +2202,7 @@ xlog_recover_do_buffer_trans(
 	if (pass == XLOG_RECOVER_PASS1) {
 		/*
 		 * In this pass we're only looking for buf items
-		 * with the XFS_BLI_CANCEL bit set.
+		 * with the XFS_BLF_CANCEL bit set.
 		 */
 		xlog_recover_do_buffer_pass1(log, buf_f);
 		return 0;
@@ -2222,9 +2216,11 @@ xlog_recover_do_buffer_trans(
 		 */
 		cancel = xlog_recover_do_buffer_pass2(log, buf_f);
 		if (cancel) {
+			trace_xfs_log_recover_buf_cancel(log, buf_f);
 			return 0;
 		}
 	}
+	trace_xfs_log_recover_buf_recover(log, buf_f);
 	switch (buf_f->blf_type) {
 	case XFS_LI_BUF:
 		blkno = buf_f->blf_blkno;
@@ -2242,9 +2238,9 @@ xlog_recover_do_buffer_trans(
 	}
 
 	mp = log->l_mp;
-	buf_flags = XFS_BUF_LOCK;
-	if (!(flags & XFS_BLI_INODE_BUF))
-		buf_flags |= XFS_BUF_MAPPED;
+	buf_flags = XBF_LOCK;
+	if (!(flags & XFS_BLF_INODE_BUF))
+		buf_flags |= XBF_MAPPED;
 
 	bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, buf_flags);
 	if (XFS_BUF_ISERROR(bp)) {
@@ -2256,13 +2252,13 @@ xlog_recover_do_buffer_trans(
 	}
 
 	error = 0;
-	if (flags & XFS_BLI_INODE_BUF) {
+	if (flags & XFS_BLF_INODE_BUF) {
 		error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
 	} else if (flags &
-		  (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) {
+		  (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
 		xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
 	} else {
-		xlog_recover_do_reg_buffer(item, bp, buf_f);
+		xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
 	}
 	if (error)
 		return XFS_ERROR(error);
@@ -2342,11 +2338,13 @@ xlog_recover_do_inode_trans(
 	if (xlog_check_buffer_cancelled(log, in_f->ilf_blkno,
 					in_f->ilf_len, 0)) {
 		error = 0;
+		trace_xfs_log_recover_inode_cancel(log, in_f);
 		goto error;
 	}
+	trace_xfs_log_recover_inode_recover(log, in_f);
 
 	bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len,
-			  XFS_BUF_LOCK);
+			  XBF_LOCK);
 	if (XFS_BUF_ISERROR(bp)) {
 		xfs_ioerror_alert("xlog_recover_do..(read#2)", mp,
 				  bp, in_f->ilf_blkno);
@@ -2395,6 +2393,7 @@ xlog_recover_do_inode_trans(
 			/* do nothing */
 		} else {
 			xfs_buf_relse(bp);
+			trace_xfs_log_recover_inode_skip(log, in_f);
 			error = 0;
 			goto error;
 		}
@@ -2814,14 +2813,14 @@ xlog_recover_do_trans(
 	int			pass)
 {
 	int			error = 0;
-	xlog_recover_item_t	*item, *first_item;
+	xlog_recover_item_t	*item;
 
-	error = xlog_recover_reorder_trans(trans);
+	error = xlog_recover_reorder_trans(log, trans, pass);
 	if (error)
 		return error;
 
-	first_item = item = trans->r_itemq;
-	do {
+	list_for_each_entry(item, &trans->r_itemq, ri_list) {
+		trace_xfs_log_recover_item_recover(log, trans, item, pass);
 		switch (ITEM_TYPE(item)) {
 		case XFS_LI_BUF:
 			error = xlog_recover_do_buffer_trans(log, item, pass);
@@ -2854,8 +2853,7 @@ xlog_recover_do_trans(
 
 		if (error)
 			return error;
-		item = item->ri_next;
-	} while (first_item != item);
+	}
 
 	return 0;
 }
@@ -2869,21 +2867,18 @@ STATIC void
 xlog_recover_free_trans(
 	xlog_recover_t		*trans)
 {
-	xlog_recover_item_t	*first_item, *item, *free_item;
+	xlog_recover_item_t	*item, *n;
 	int			i;
 
-	item = first_item = trans->r_itemq;
-	do {
-		free_item = item;
-		item = item->ri_next;
-		 /* Free the regions in the item. */
-		for (i = 0; i < free_item->ri_cnt; i++) {
-			kmem_free(free_item->ri_buf[i].i_addr);
-		}
+	list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) {
+		/* Free the regions in the item. */
+		list_del(&item->ri_list);
+		for (i = 0; i < item->ri_cnt; i++)
+			kmem_free(item->ri_buf[i].i_addr);
 		/* Free the item itself */
-		kmem_free(free_item->ri_buf);
-		kmem_free(free_item);
-	} while (first_item != item);
+		kmem_free(item->ri_buf);
+		kmem_free(item);
+	}
 	/* Free the transaction recover structure */
 	kmem_free(trans);
 }
@@ -2891,14 +2886,12 @@ xlog_recover_free_trans(
 STATIC int
 xlog_recover_commit_trans(
 	xlog_t			*log,
-	xlog_recover_t		**q,
 	xlog_recover_t		*trans,
 	int			pass)
 {
 	int			error;
 
-	if ((error = xlog_recover_unlink_tid(q, trans)))
-		return error;
+	hlist_del(&trans->r_list);
 	if ((error = xlog_recover_do_trans(log, trans, pass)))
 		return error;
 	xlog_recover_free_trans(trans);			/* no error */
@@ -2926,7 +2919,7 @@ xlog_recover_unmount_trans(
 STATIC int
 xlog_recover_process_data(
 	xlog_t			*log,
-	xlog_recover_t		*rhash[],
+	struct hlist_head	rhash[],
 	xlog_rec_header_t	*rhead,
 	xfs_caddr_t		dp,
 	int			pass)
@@ -2960,7 +2953,7 @@ xlog_recover_process_data(
 		}
 		tid = be32_to_cpu(ohead->oh_tid);
 		hash = XLOG_RHASH(tid);
-		trans = xlog_recover_find_tid(rhash[hash], tid);
+		trans = xlog_recover_find_tid(&rhash[hash], tid);
 		if (trans == NULL) {		   /* not found; add new tid */
 			if (ohead->oh_flags & XLOG_START_TRANS)
 				xlog_recover_new_tid(&rhash[hash], tid,
@@ -2978,14 +2971,15 @@ xlog_recover_process_data(
 			switch (flags) {
 			case XLOG_COMMIT_TRANS:
 				error = xlog_recover_commit_trans(log,
-						&rhash[hash], trans, pass);
+								trans, pass);
 				break;
 			case XLOG_UNMOUNT_TRANS:
 				error = xlog_recover_unmount_trans(trans);
 				break;
 			case XLOG_WAS_CONT_TRANS:
-				error = xlog_recover_add_to_cont_trans(trans,
-						dp, be32_to_cpu(ohead->oh_len));
+				error = xlog_recover_add_to_cont_trans(log,
+						trans, dp,
+						be32_to_cpu(ohead->oh_len));
 				break;
 			case XLOG_START_TRANS:
 				xlog_warn(
@@ -2995,7 +2989,7 @@ xlog_recover_process_data(
 				break;
 			case 0:
 			case XLOG_CONTINUE_TRANS:
-				error = xlog_recover_add_to_trans(trans,
+				error = xlog_recover_add_to_trans(log, trans,
 						dp, be32_to_cpu(ohead->oh_len));
 				break;
 			default:
@@ -3204,14 +3198,14 @@ xlog_recover_process_one_iunlink(
 	int				error;
 
 	ino = XFS_AGINO_TO_INO(mp, agno, agino);
-	error = xfs_iget(mp, NULL, ino, 0, 0, &ip, 0);
+	error = xfs_iget(mp, NULL, ino, 0, 0, &ip);
 	if (error)
 		goto fail;
 
 	/*
 	 * Get the on disk inode to find the next inode in the bucket.
 	 */
-	error = xfs_itobp(mp, NULL, ip, &dip, &ibp, XFS_BUF_LOCK);
+	error = xfs_itobp(mp, NULL, ip, &dip, &ibp, XBF_LOCK);
 	if (error)
 		goto fail_iput;
 
@@ -3396,42 +3390,6 @@ xlog_pack_data(
 	}
 }
 
-#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
-STATIC void
-xlog_unpack_data_checksum(
-	xlog_rec_header_t	*rhead,
-	xfs_caddr_t		dp,
-	xlog_t			*log)
-{
-	__be32			*up = (__be32 *)dp;
-	uint			chksum = 0;
-	int			i;
-
-	/* divide length by 4 to get # words */
-	for (i=0; i < be32_to_cpu(rhead->h_len) >> 2; i++) {
-		chksum ^= be32_to_cpu(*up);
-		up++;
-	}
-	if (chksum != be32_to_cpu(rhead->h_chksum)) {
-	    if (rhead->h_chksum ||
-		((log->l_flags & XLOG_CHKSUM_MISMATCH) == 0)) {
-		    cmn_err(CE_DEBUG,
-			"XFS: LogR chksum mismatch: was (0x%x) is (0x%x)\n",
-			    be32_to_cpu(rhead->h_chksum), chksum);
-		    cmn_err(CE_DEBUG,
-"XFS: Disregard message if filesystem was created with non-DEBUG kernel");
-		    if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
-			    cmn_err(CE_DEBUG,
-				"XFS: LogR this is a LogV2 filesystem\n");
-		    }
-		    log->l_flags |= XLOG_CHKSUM_MISMATCH;
-	    }
-	}
-}
-#else
-#define xlog_unpack_data_checksum(rhead, dp, log)
-#endif
-
 STATIC void
 xlog_unpack_data(
 	xlog_rec_header_t	*rhead,
@@ -3455,8 +3413,6 @@ xlog_unpack_data(
 			dp += BBSIZE;
 		}
 	}
-
-	xlog_unpack_data_checksum(rhead, dp, log);
 }
 
 STATIC int
@@ -3517,7 +3473,7 @@ xlog_do_recovery_pass(
 	int			error = 0, h_size;
 	int			bblks, split_bblks;
 	int			hblks, split_hblks, wrapped_hblks;
-	xlog_recover_t		*rhash[XLOG_RHASH_SIZE];
+	struct hlist_head	rhash[XLOG_RHASH_SIZE];
 
 	ASSERT(head_blk != tail_blk);
 
@@ -3555,7 +3511,7 @@ xlog_do_recovery_pass(
 			hblks = 1;
 		}
 	} else {
-		ASSERT(log->l_sectbb_log == 0);
+		ASSERT(log->l_sectBBsize == 1);
 		hblks = 1;
 		hbp = xlog_get_bp(log, 1);
 		h_size = XLOG_BIG_RECORD_BSIZE;
@@ -3978,8 +3934,7 @@ xlog_recover_finish(
 		 * case the unlink transactions would have problems
 		 * pushing the EFIs out of the way.
 		 */
-		xfs_log_force(log->l_mp, (xfs_lsn_t)0,
-			      (XFS_LOG_FORCE | XFS_LOG_SYNC));
+		xfs_log_force(log->l_mp, XFS_LOG_SYNC);
 
 		xlog_recover_process_iunlinks(log);
 
@@ -4012,10 +3967,6 @@ xlog_recover_check_summary(
 	xfs_agf_t	*agfp;
 	xfs_buf_t	*agfbp;
 	xfs_buf_t	*agibp;
-	xfs_buf_t	*sbbp;
-#ifdef XFS_LOUD_RECOVERY
-	xfs_sb_t	*sbp;
-#endif
 	xfs_agnumber_t	agno;
 	__uint64_t	freeblks;
 	__uint64_t	itotal;
@@ -4050,30 +4001,5 @@ xlog_recover_check_summary(
 			xfs_buf_relse(agibp);
 		}
 	}
-
-	sbbp = xfs_getsb(mp, 0);
-#ifdef XFS_LOUD_RECOVERY
-	sbp = &mp->m_sb;
-	xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(sbbp));
-	cmn_err(CE_NOTE,
-		"xlog_recover_check_summary: sb_icount %Lu itotal %Lu",
-		sbp->sb_icount, itotal);
-	cmn_err(CE_NOTE,
-		"xlog_recover_check_summary: sb_ifree %Lu itotal %Lu",
-		sbp->sb_ifree, ifree);
-	cmn_err(CE_NOTE,
-		"xlog_recover_check_summary: sb_fdblocks %Lu freeblks %Lu",
-		sbp->sb_fdblocks, freeblks);
-#if 0
-	/*
-	 * This is turned off until I account for the allocation
-	 * btree blocks which live in free space.
-	 */
-	ASSERT(sbp->sb_icount == itotal);
-	ASSERT(sbp->sb_ifree == ifree);
-	ASSERT(sbp->sb_fdblocks == freeblks);
-#endif
-#endif
-	xfs_buf_relse(sbbp);
 }
 #endif /* DEBUG */
diff --git a/fs/xfs/xfs_log_recover.h b/fs/xfs/xfs_log_recover.h
index b22545555301..1c55ccbb379d 100644
--- a/fs/xfs/xfs_log_recover.h
+++ b/fs/xfs/xfs_log_recover.h
@@ -28,29 +28,28 @@
 #define XLOG_RHASH(tid)	\
 	((((__uint32_t)tid)>>XLOG_RHASH_SHIFT) & (XLOG_RHASH_SIZE-1))
 
-#define XLOG_MAX_REGIONS_IN_ITEM   (XFS_MAX_BLOCKSIZE / XFS_BLI_CHUNK / 2 + 1)
+#define XLOG_MAX_REGIONS_IN_ITEM   (XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK / 2 + 1)
 
 
 /*
  * item headers are in ri_buf[0].  Additional buffers follow.
  */
 typedef struct xlog_recover_item {
-	struct xlog_recover_item *ri_next;
-	struct xlog_recover_item *ri_prev;
-	int			 ri_type;
-	int			 ri_cnt;	/* count of regions found */
-	int			 ri_total;	/* total regions */
-	xfs_log_iovec_t		 *ri_buf;	/* ptr to regions buffer */
+	struct list_head	ri_list;
+	int			ri_type;
+	int			ri_cnt;	/* count of regions found */
+	int			ri_total;	/* total regions */
+	xfs_log_iovec_t		*ri_buf;	/* ptr to regions buffer */
 } xlog_recover_item_t;
 
 struct xlog_tid;
 typedef struct xlog_recover {
-	struct xlog_recover *r_next;
-	xlog_tid_t	    r_log_tid;		/* log's transaction id */
-	xfs_trans_header_t  r_theader;		/* trans header for partial */
-	int		    r_state;		/* not needed */
-	xfs_lsn_t	    r_lsn;		/* xact lsn */
-	xlog_recover_item_t *r_itemq;		/* q for items */
+	struct hlist_node	r_list;
+	xlog_tid_t		r_log_tid;	/* log's transaction id */
+	xfs_trans_header_t	r_theader;	/* trans header for partial */
+	int			r_state;	/* not needed */
+	xfs_lsn_t		r_lsn;		/* xact lsn */
+	struct list_head	r_itemq;	/* q for items */
 } xlog_recover_t;
 
 #define ITEM_TYPE(i)	(*(ushort *)(i)->ri_buf[0].i_addr)
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index eb403b40e120..69f62d8b2816 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -201,6 +201,38 @@ xfs_uuid_unmount(
 
 
 /*
+ * Reference counting access wrappers to the perag structures.
+ */
+struct xfs_perag *
+xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno)
+{
+	struct xfs_perag	*pag;
+	int			ref = 0;
+
+	spin_lock(&mp->m_perag_lock);
+	pag = radix_tree_lookup(&mp->m_perag_tree, agno);
+	if (pag) {
+		ASSERT(atomic_read(&pag->pag_ref) >= 0);
+		/* catch leaks in the positive direction during testing */
+		ASSERT(atomic_read(&pag->pag_ref) < 1000);
+		ref = atomic_inc_return(&pag->pag_ref);
+	}
+	spin_unlock(&mp->m_perag_lock);
+	trace_xfs_perag_get(mp, agno, ref, _RET_IP_);
+	return pag;
+}
+
+void
+xfs_perag_put(struct xfs_perag *pag)
+{
+	int	ref;
+
+	ASSERT(atomic_read(&pag->pag_ref) > 0);
+	ref = atomic_dec_return(&pag->pag_ref);
+	trace_xfs_perag_put(pag->pag_mount, pag->pag_agno, ref, _RET_IP_);
+}
+
+/*
  * Free up the resources associated with a mount structure.  Assume that
  * the structure was initially zeroed, so we can tell which fields got
  * initialized.
@@ -209,13 +241,16 @@ STATIC void
 xfs_free_perag(
 	xfs_mount_t	*mp)
 {
-	if (mp->m_perag) {
-		int	agno;
+	xfs_agnumber_t	agno;
+	struct xfs_perag *pag;
 
-		for (agno = 0; agno < mp->m_maxagi; agno++)
-			if (mp->m_perag[agno].pagb_list)
-				kmem_free(mp->m_perag[agno].pagb_list);
-		kmem_free(mp->m_perag);
+	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
+		spin_lock(&mp->m_perag_lock);
+		pag = radix_tree_delete(&mp->m_perag_tree, agno);
+		ASSERT(pag);
+		ASSERT(atomic_read(&pag->pag_ref) == 0);
+		spin_unlock(&mp->m_perag_lock);
+		kmem_free(pag);
 	}
 }
 
@@ -233,10 +268,10 @@ xfs_sb_validate_fsb_count(
 
 #if XFS_BIG_BLKNOS     /* Limited by ULONG_MAX of page cache index */
 	if (nblocks >> (PAGE_CACHE_SHIFT - sbp->sb_blocklog) > ULONG_MAX)
-		return E2BIG;
+		return EFBIG;
 #else                  /* Limited by UINT_MAX of sectors */
 	if (nblocks << (sbp->sb_blocklog - BBSHIFT) > UINT_MAX)
-		return E2BIG;
+		return EFBIG;
 #endif
 	return 0;
 }
@@ -358,7 +393,7 @@ xfs_mount_validate_sb(
 	    xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) {
 		xfs_fs_mount_cmn_err(flags,
 			"file system too large to be mounted on this system.");
-		return XFS_ERROR(E2BIG);
+		return XFS_ERROR(EFBIG);
 	}
 
 	if (unlikely(sbp->sb_inprogress)) {
@@ -378,46 +413,73 @@ xfs_mount_validate_sb(
 	return 0;
 }
 
-STATIC void
-xfs_initialize_perag_icache(
-	xfs_perag_t	*pag)
-{
-	if (!pag->pag_ici_init) {
-		rwlock_init(&pag->pag_ici_lock);
-		INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
-		pag->pag_ici_init = 1;
-	}
-}
-
-xfs_agnumber_t
+int
 xfs_initialize_perag(
 	xfs_mount_t	*mp,
-	xfs_agnumber_t	agcount)
+	xfs_agnumber_t	agcount,
+	xfs_agnumber_t	*maxagi)
 {
 	xfs_agnumber_t	index, max_metadata;
+	xfs_agnumber_t	first_initialised = 0;
 	xfs_perag_t	*pag;
 	xfs_agino_t	agino;
 	xfs_ino_t	ino;
 	xfs_sb_t	*sbp = &mp->m_sb;
-	xfs_ino_t	max_inum = XFS_MAXINUMBER_32;
+	int		error = -ENOMEM;
 
-	/* Check to see if the filesystem can overflow 32 bit inodes */
+	/*
+	 * Walk the current per-ag tree so we don't try to initialise AGs
+	 * that already exist (growfs case). Allocate and insert all the
+	 * AGs we don't find ready for initialisation.
+	 */
+	for (index = 0; index < agcount; index++) {
+		pag = xfs_perag_get(mp, index);
+		if (pag) {
+			xfs_perag_put(pag);
+			continue;
+		}
+		if (!first_initialised)
+			first_initialised = index;
+
+		pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL);
+		if (!pag)
+			goto out_unwind;
+		pag->pag_agno = index;
+		pag->pag_mount = mp;
+		rwlock_init(&pag->pag_ici_lock);
+		INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
+
+		if (radix_tree_preload(GFP_NOFS))
+			goto out_unwind;
+
+		spin_lock(&mp->m_perag_lock);
+		if (radix_tree_insert(&mp->m_perag_tree, index, pag)) {
+			BUG();
+			spin_unlock(&mp->m_perag_lock);
+			radix_tree_preload_end();
+			error = -EEXIST;
+			goto out_unwind;
+		}
+		spin_unlock(&mp->m_perag_lock);
+		radix_tree_preload_end();
+	}
+
+	/*
+	 * If we mount with the inode64 option, or no inode overflows
+	 * the legacy 32-bit address space clear the inode32 option.
+	 */
 	agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0);
 	ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino);
 
-	/* Clear the mount flag if no inode can overflow 32 bits
-	 * on this filesystem, or if specifically requested..
-	 */
-	if ((mp->m_flags & XFS_MOUNT_SMALL_INUMS) && ino > max_inum) {
+	if ((mp->m_flags & XFS_MOUNT_SMALL_INUMS) && ino > XFS_MAXINUMBER_32)
 		mp->m_flags |= XFS_MOUNT_32BITINODES;
-	} else {
+	else
 		mp->m_flags &= ~XFS_MOUNT_32BITINODES;
-	}
 
-	/* If we can overflow then setup the ag headers accordingly */
 	if (mp->m_flags & XFS_MOUNT_32BITINODES) {
-		/* Calculate how much should be reserved for inodes to
-		 * meet the max inode percentage.
+		/*
+		 * Calculate how much should be reserved for inodes to meet
+		 * the max inode percentage.
 		 */
 		if (mp->m_maxicount) {
 			__uint64_t	icount;
@@ -430,29 +492,39 @@ xfs_initialize_perag(
 		} else {
 			max_metadata = agcount;
 		}
+
 		for (index = 0; index < agcount; index++) {
 			ino = XFS_AGINO_TO_INO(mp, index, agino);
-			if (ino > max_inum) {
+			if (ino > XFS_MAXINUMBER_32) {
 				index++;
 				break;
 			}
 
-			/* This ag is preferred for inodes */
-			pag = &mp->m_perag[index];
+			pag = xfs_perag_get(mp, index);
 			pag->pagi_inodeok = 1;
 			if (index < max_metadata)
 				pag->pagf_metadata = 1;
-			xfs_initialize_perag_icache(pag);
+			xfs_perag_put(pag);
 		}
 	} else {
-		/* Setup default behavior for smaller filesystems */
 		for (index = 0; index < agcount; index++) {
-			pag = &mp->m_perag[index];
+			pag = xfs_perag_get(mp, index);
 			pag->pagi_inodeok = 1;
-			xfs_initialize_perag_icache(pag);
+			xfs_perag_put(pag);
 		}
 	}
-	return index;
+
+	if (maxagi)
+		*maxagi = index;
+	return 0;
+
+out_unwind:
+	kmem_free(pag);
+	for (; index > first_initialised; index--) {
+		pag = radix_tree_delete(&mp->m_perag_tree, index);
+		kmem_free(pag);
+	}
+	return error;
 }
 
 void
@@ -583,7 +655,7 @@ xfs_readsb(xfs_mount_t *mp, int flags)
 	 * access to the superblock.
 	 */
 	sector_size = xfs_getsize_buftarg(mp->m_ddev_targp);
-	extra_flags = XFS_BUF_LOCK | XFS_BUF_MANAGE | XFS_BUF_MAPPED;
+	extra_flags = XBF_LOCK | XBF_FS_MANAGED | XBF_MAPPED;
 
 	bp = xfs_buf_read(mp->m_ddev_targp, XFS_SB_DADDR, BTOBB(sector_size),
 			  extra_flags);
@@ -731,12 +803,13 @@ xfs_initialize_perag_data(xfs_mount_t *mp, xfs_agnumber_t agcount)
 		error = xfs_ialloc_pagi_init(mp, NULL, index);
 		if (error)
 			return error;
-		pag = &mp->m_perag[index];
+		pag = xfs_perag_get(mp, index);
 		ifree += pag->pagi_freecount;
 		ialloc += pag->pagi_count;
 		bfree += pag->pagf_freeblks;
 		bfreelst += pag->pagf_flcount;
 		btree += pag->pagf_btreeblks;
+		xfs_perag_put(pag);
 	}
 	/*
 	 * Overwrite incore superblock counters with just-read data
@@ -926,7 +999,7 @@ xfs_check_sizes(xfs_mount_t *mp)
 	d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
 	if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) {
 		cmn_err(CE_WARN, "XFS: size check 1 failed");
-		return XFS_ERROR(E2BIG);
+		return XFS_ERROR(EFBIG);
 	}
 	error = xfs_read_buf(mp, mp->m_ddev_targp,
 			     d - XFS_FSS_TO_BB(mp, 1),
@@ -936,7 +1009,7 @@ xfs_check_sizes(xfs_mount_t *mp)
 	} else {
 		cmn_err(CE_WARN, "XFS: size check 2 failed");
 		if (error == ENOSPC)
-			error = XFS_ERROR(E2BIG);
+			error = XFS_ERROR(EFBIG);
 		return error;
 	}
 
@@ -944,7 +1017,7 @@ xfs_check_sizes(xfs_mount_t *mp)
 		d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks);
 		if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) {
 			cmn_err(CE_WARN, "XFS: size check 3 failed");
-			return XFS_ERROR(E2BIG);
+			return XFS_ERROR(EFBIG);
 		}
 		error = xfs_read_buf(mp, mp->m_logdev_targp,
 				     d - XFS_FSB_TO_BB(mp, 1),
@@ -954,7 +1027,7 @@ xfs_check_sizes(xfs_mount_t *mp)
 		} else {
 			cmn_err(CE_WARN, "XFS: size check 3 failed");
 			if (error == ENOSPC)
-				error = XFS_ERROR(E2BIG);
+				error = XFS_ERROR(EFBIG);
 			return error;
 		}
 	}
@@ -1008,6 +1081,24 @@ xfs_mount_reset_sbqflags(
 	return xfs_trans_commit(tp, 0);
 }
 
+__uint64_t
+xfs_default_resblks(xfs_mount_t *mp)
+{
+	__uint64_t resblks;
+
+	/*
+	 * We default to 5% or 8192 fsbs of space reserved, whichever is
+	 * smaller.  This is intended to cover concurrent allocation
+	 * transactions when we initially hit enospc. These each require a 4
+	 * block reservation. Hence by default we cover roughly 2000 concurrent
+	 * allocation reservations.
+	 */
+	resblks = mp->m_sb.sb_dblocks;
+	do_div(resblks, 20);
+	resblks = min_t(__uint64_t, resblks, 8192);
+	return resblks;
+}
+
 /*
  * This function does the following on an initial mount of a file system:
  *	- reads the superblock from disk and init the mount struct
@@ -1152,13 +1243,13 @@ xfs_mountfs(
 	/*
 	 * Allocate and initialize the per-ag data.
 	 */
-	init_rwsem(&mp->m_peraglock);
-	mp->m_perag = kmem_zalloc(sbp->sb_agcount * sizeof(xfs_perag_t),
-				  KM_MAYFAIL);
-	if (!mp->m_perag)
+	spin_lock_init(&mp->m_perag_lock);
+	INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC);
+	error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi);
+	if (error) {
+		cmn_err(CE_WARN, "XFS: Failed per-ag init: %d", error);
 		goto out_remove_uuid;
-
-	mp->m_maxagi = xfs_initialize_perag(mp, sbp->sb_agcount);
+	}
 
 	if (!sbp->sb_logblocks) {
 		cmn_err(CE_WARN, "XFS: no log defined");
@@ -1209,7 +1300,7 @@ xfs_mountfs(
 	 * Get and sanity-check the root inode.
 	 * Save the pointer to it in the mount structure.
 	 */
-	error = xfs_iget(mp, NULL, sbp->sb_rootino, 0, XFS_ILOCK_EXCL, &rip, 0);
+	error = xfs_iget(mp, NULL, sbp->sb_rootino, 0, XFS_ILOCK_EXCL, &rip);
 	if (error) {
 		cmn_err(CE_WARN, "XFS: failed to read root inode");
 		goto out_log_dealloc;
@@ -1304,13 +1395,6 @@ xfs_mountfs(
 		xfs_qm_mount_quotas(mp);
 	}
 
-#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
-	if (XFS_IS_QUOTA_ON(mp))
-		xfs_fs_cmn_err(CE_NOTE, mp, "Disk quotas turned on");
-	else
-		xfs_fs_cmn_err(CE_NOTE, mp, "Disk quotas not turned on");
-#endif
-
 	/*
 	 * Now we are mounted, reserve a small amount of unused space for
 	 * privileged transactions. This is needed so that transaction
@@ -1319,17 +1403,16 @@ xfs_mountfs(
 	 * attr, unwritten extent conversion at ENOSPC, etc. Data allocations
 	 * are not allowed to use this reserved space.
 	 *
-	 * We default to 5% or 1024 fsbs of space reserved, whichever is smaller.
 	 * This may drive us straight to ENOSPC on mount, but that implies
 	 * we were already there on the last unmount. Warn if this occurs.
 	 */
-	resblks = mp->m_sb.sb_dblocks;
-	do_div(resblks, 20);
-	resblks = min_t(__uint64_t, resblks, 1024);
-	error = xfs_reserve_blocks(mp, &resblks, NULL);
-	if (error)
-		cmn_err(CE_WARN, "XFS: Unable to allocate reserve blocks. "
-				"Continuing without a reserve pool.");
+	if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
+		resblks = xfs_default_resblks(mp);
+		error = xfs_reserve_blocks(mp, &resblks, NULL);
+		if (error)
+			cmn_err(CE_WARN, "XFS: Unable to allocate reserve "
+				"blocks. Continuing without a reserve pool.");
+	}
 
 	return 0;
 
@@ -1372,8 +1455,19 @@ xfs_unmountfs(
 	 * push out the iclog we will never get that unlocked. hence we
 	 * need to force the log first.
 	 */
-	xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC);
-	xfs_reclaim_inodes(mp, XFS_IFLUSH_ASYNC);
+	xfs_log_force(mp, XFS_LOG_SYNC);
+
+	/*
+	 * Do a delwri reclaim pass first so that as many dirty inodes are
+	 * queued up for IO as possible. Then flush the buffers before making
+	 * a synchronous path to catch all the remaining inodes are reclaimed.
+	 * This makes the reclaim process as quick as possible by avoiding
+	 * synchronous writeout and blocking on inodes already in the delwri
+	 * state as much as possible.
+	 */
+	xfs_reclaim_inodes(mp, 0);
+	XFS_bflush(mp->m_ddev_targp);
+	xfs_reclaim_inodes(mp, SYNC_WAIT);
 
 	xfs_qm_unmount(mp);
 
@@ -1382,7 +1476,7 @@ xfs_unmountfs(
 	 * that nothing is pinned.  This is important because bflush()
 	 * will skip pinned buffers.
 	 */
-	xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC);
+	xfs_log_force(mp, XFS_LOG_SYNC);
 
 	xfs_binval(mp->m_ddev_targp);
 	if (mp->m_rtdev_targp) {
@@ -1548,15 +1642,14 @@ xfs_mod_sb(xfs_trans_t *tp, __int64_t fields)
 	xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, fields);
 
 	/* find modified range */
+	f = (xfs_sb_field_t)xfs_highbit64((__uint64_t)fields);
+	ASSERT((1LL << f) & XFS_SB_MOD_BITS);
+	last = xfs_sb_info[f + 1].offset - 1;
 
 	f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields);
 	ASSERT((1LL << f) & XFS_SB_MOD_BITS);
 	first = xfs_sb_info[f].offset;
 
-	f = (xfs_sb_field_t)xfs_highbit64((__uint64_t)fields);
-	ASSERT((1LL << f) & XFS_SB_MOD_BITS);
-	last = xfs_sb_info[f + 1].offset - 1;
-
 	xfs_trans_log_buf(tp, bp, first, last);
 }
 
@@ -1620,26 +1713,30 @@ xfs_mod_incore_sb_unlocked(
 				lcounter += rem;
 			}
 		} else {				/* Taking blocks away */
-
 			lcounter += delta;
+			if (lcounter >= 0) {
+				mp->m_sb.sb_fdblocks = lcounter +
+							XFS_ALLOC_SET_ASIDE(mp);
+				return 0;
+			}
 
-		/*
-		 * If were out of blocks, use any available reserved blocks if
-		 * were allowed to.
-		 */
+			/*
+			 * We are out of blocks, use any available reserved
+			 * blocks if were allowed to.
+			 */
+			if (!rsvd)
+				return XFS_ERROR(ENOSPC);
 
-			if (lcounter < 0) {
-				if (rsvd) {
-					lcounter = (long long)mp->m_resblks_avail + delta;
-					if (lcounter < 0) {
-						return XFS_ERROR(ENOSPC);
-					}
-					mp->m_resblks_avail = lcounter;
-					return 0;
-				} else {	/* not reserved */
-					return XFS_ERROR(ENOSPC);
-				}
+			lcounter = (long long)mp->m_resblks_avail + delta;
+			if (lcounter >= 0) {
+				mp->m_resblks_avail = lcounter;
+				return 0;
 			}
+			printk_once(KERN_WARNING
+				"Filesystem \"%s\": reserve blocks depleted! "
+				"Consider increasing reserve pool size.",
+				mp->m_fsname);
+			return XFS_ERROR(ENOSPC);
 		}
 
 		mp->m_sb.sb_fdblocks = lcounter + XFS_ALLOC_SET_ASIDE(mp);
@@ -1887,7 +1984,7 @@ xfs_getsb(
 
 	ASSERT(mp->m_sb_bp != NULL);
 	bp = mp->m_sb_bp;
-	if (flags & XFS_BUF_TRYLOCK) {
+	if (flags & XBF_TRYLOCK) {
 		if (!XFS_BUF_CPSEMA(bp)) {
 			return NULL;
 		}
@@ -1947,6 +2044,26 @@ xfs_mount_log_sb(
 	return error;
 }
 
+/*
+ * If the underlying (data/log/rt) device is readonly, there are some
+ * operations that cannot proceed.
+ */
+int
+xfs_dev_is_read_only(
+	struct xfs_mount	*mp,
+	char			*message)
+{
+	if (xfs_readonly_buftarg(mp->m_ddev_targp) ||
+	    xfs_readonly_buftarg(mp->m_logdev_targp) ||
+	    (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) {
+		cmn_err(CE_NOTE,
+			"XFS: %s required on read-only device.", message);
+		cmn_err(CE_NOTE,
+			"XFS: write access unavailable, cannot proceed.");
+		return EROFS;
+	}
+	return 0;
+}
 
 #ifdef HAVE_PERCPU_SB
 /*
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 1df7e4502967..5761087ee8ea 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -78,7 +78,8 @@ typedef int	(*xfs_send_destroy_t)(struct xfs_inode *, dm_right_t);
 typedef int	(*xfs_send_namesp_t)(dm_eventtype_t, struct xfs_mount *,
 			struct xfs_inode *, dm_right_t,
 			struct xfs_inode *, dm_right_t,
-			const char *, const char *, mode_t, int, int);
+			const unsigned char *, const unsigned char *,
+			mode_t, int, int);
 typedef int	(*xfs_send_mount_t)(struct xfs_mount *, dm_right_t,
 			char *, char *);
 typedef void	(*xfs_send_unmount_t)(struct xfs_mount *, struct xfs_inode *,
@@ -207,8 +208,8 @@ typedef struct xfs_mount {
 	uint			m_ag_maxlevels;	/* XFS_AG_MAXLEVELS */
 	uint			m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */
 	uint			m_in_maxlevels;	/* max inobt btree levels. */
-	struct xfs_perag	*m_perag;	/* per-ag accounting info */
-	struct rw_semaphore	m_peraglock;	/* lock for m_perag (pointer) */
+	struct radix_tree_root	m_perag_tree;	/* per-ag accounting info */
+	spinlock_t		m_perag_lock;	/* lock for m_perag_tree */
 	struct mutex		m_growlock;	/* growfs mutex */
 	int			m_fixedfsid[2];	/* unchanged for life of FS */
 	uint			m_dmevmask;	/* DMI events for this FS */
@@ -224,6 +225,7 @@ typedef struct xfs_mount {
 	__uint64_t		m_maxioffset;	/* maximum inode offset */
 	__uint64_t		m_resblks;	/* total reserved blocks */
 	__uint64_t		m_resblks_avail;/* available reserved blocks */
+	__uint64_t		m_resblks_save;	/* reserved blks @ remount,ro */
 	int			m_dalign;	/* stripe unit */
 	int			m_swidth;	/* stripe width */
 	int			m_sinoalign;	/* stripe unit inode alignment */
@@ -243,7 +245,7 @@ typedef struct xfs_mount {
 	struct xfs_qmops	*m_qm_ops;	/* vector of XQM ops */
 	atomic_t		m_active_trans;	/* number trans frozen */
 #ifdef HAVE_PERCPU_SB
-	xfs_icsb_cnts_t		*m_sb_cnts;	/* per-cpu superblock counters */
+	xfs_icsb_cnts_t __percpu *m_sb_cnts;	/* per-cpu superblock counters */
 	unsigned long		m_icsb_counters; /* disabled per-cpu counters */
 	struct notifier_block	m_icsb_notifier; /* hotplug cpu notifier */
 	struct mutex		m_icsb_mutex;	/* balancer sync lock */
@@ -257,6 +259,7 @@ typedef struct xfs_mount {
 	wait_queue_head_t	m_wait_single_sync_task;
 	__int64_t		m_update_flags;	/* sb flags we need to update
 						   on the next remount,rw */
+	struct shrinker		m_inode_shrink;	/* inode reclaim shrinker */
 } xfs_mount_t;
 
 /*
@@ -265,6 +268,7 @@ typedef struct xfs_mount {
 #define XFS_MOUNT_WSYNC		(1ULL << 0)	/* for nfs - all metadata ops
 						   must be synchronous except
 						   for space allocations */
+#define XFS_MOUNT_DELAYLOG	(1ULL << 1)	/* delayed logging is enabled */
 #define XFS_MOUNT_DMAPI		(1ULL << 2)	/* dmapi is enabled */
 #define XFS_MOUNT_WAS_CLEAN	(1ULL << 3)
 #define XFS_MOUNT_FS_SHUTDOWN	(1ULL << 4)	/* atomic stop of all filesystem
@@ -384,19 +388,10 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
 }
 
 /*
- * perag get/put wrappers for eventual ref counting
+ * perag get/put wrappers for ref counting
  */
-static inline xfs_perag_t *
-xfs_get_perag(struct xfs_mount *mp, xfs_ino_t ino)
-{
-	return &mp->m_perag[XFS_INO_TO_AGNO(mp, ino)];
-}
-
-static inline void
-xfs_put_perag(struct xfs_mount *mp, xfs_perag_t *pag)
-{
-	/* nothing to see here, move along */
-}
+struct xfs_perag *xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno);
+void	xfs_perag_put(struct xfs_perag *pag);
 
 /*
  * Per-cpu superblock locking functions
@@ -428,6 +423,7 @@ typedef struct xfs_mod_sb {
 } xfs_mod_sb_t;
 
 extern int	xfs_log_sbcount(xfs_mount_t *, uint);
+extern __uint64_t xfs_default_resblks(xfs_mount_t *mp);
 extern int	xfs_mountfs(xfs_mount_t *mp);
 
 extern void	xfs_unmountfs(xfs_mount_t *);
@@ -442,6 +438,8 @@ extern void	xfs_freesb(xfs_mount_t *);
 extern int	xfs_fs_writable(xfs_mount_t *);
 extern int	xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t);
 
+extern int	xfs_dev_is_read_only(struct xfs_mount *, char *);
+
 extern int	xfs_dmops_get(struct xfs_mount *);
 extern void	xfs_dmops_put(struct xfs_mount *);
 
@@ -450,7 +448,8 @@ extern struct xfs_dmops xfs_dmcore_xfs;
 #endif	/* __KERNEL__ */
 
 extern void	xfs_mod_sb(struct xfs_trans *, __int64_t);
-extern xfs_agnumber_t	xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t);
+extern int	xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t,
+					xfs_agnumber_t *);
 extern void	xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *);
 extern void	xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t);
 
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index 4b0613d99faa..45ce15dc5b2b 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -398,7 +398,7 @@ exit:
  * guaranteed that all the free functions for all the elements have finished
  * executing and the reaper is not running.
  */
-void
+static void
 xfs_mru_cache_flush(
 	xfs_mru_cache_t		*mru)
 {
diff --git a/fs/xfs/xfs_mru_cache.h b/fs/xfs/xfs_mru_cache.h
index 5d439f34b0c9..36dd3ec8b4eb 100644
--- a/fs/xfs/xfs_mru_cache.h
+++ b/fs/xfs/xfs_mru_cache.h
@@ -42,7 +42,6 @@ void xfs_mru_cache_uninit(void);
 int xfs_mru_cache_create(struct xfs_mru_cache **mrup, unsigned int lifetime_ms,
 			     unsigned int grp_count,
 			     xfs_mru_cache_free_func_t free_func);
-void xfs_mru_cache_flush(xfs_mru_cache_t *mru);
 void xfs_mru_cache_destroy(struct xfs_mru_cache *mru);
 int xfs_mru_cache_insert(struct xfs_mru_cache *mru, unsigned long key,
 				void *value);
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index 91bfd60f4c74..e0e64b113bd6 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -201,9 +201,6 @@ typedef struct xfs_qoff_logformat {
 #define XFS_QMOPT_FORCE_RES	0x0000010 /* ignore quota limits */
 #define XFS_QMOPT_DQSUSER	0x0000020 /* don't cache super users dquot */
 #define XFS_QMOPT_SBVERSION	0x0000040 /* change superblock version num */
-#define XFS_QMOPT_QUOTAOFF	0x0000080 /* quotas are being turned off */
-#define XFS_QMOPT_UMOUNTING	0x0000100 /* filesys is being unmounted */
-#define XFS_QMOPT_DOLOG		0x0000200 /* log buf changes (in quotacheck) */
 #define XFS_QMOPT_DOWARN        0x0000400 /* increase warning cnt if needed */
 #define XFS_QMOPT_DQREPAIR	0x0001000 /* repair dquot if damaged */
 #define XFS_QMOPT_GQUOTA	0x0002000 /* group dquot requested */
@@ -223,16 +220,9 @@ typedef struct xfs_qoff_logformat {
 #define XFS_QMOPT_RES_INOS	0x0800000
 
 /*
- * flags for dqflush and dqflush_all.
- */
-#define XFS_QMOPT_SYNC		0x1000000
-#define XFS_QMOPT_ASYNC		0x2000000
-#define XFS_QMOPT_DELWRI	0x4000000
-
-/*
  * flags for dqalloc.
  */
-#define XFS_QMOPT_INHERIT	0x8000000
+#define XFS_QMOPT_INHERIT	0x1000000
 
 /*
  * flags to xfs_trans_mod_dquot.
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 6be05f756d59..a2d32ce335aa 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -2247,7 +2247,7 @@ xfs_rtmount_init(
 		cmn_err(CE_WARN, "XFS: realtime mount -- %llu != %llu",
 			(unsigned long long) XFS_BB_TO_FSB(mp, d),
 			(unsigned long long) mp->m_sb.sb_rblocks);
-		return XFS_ERROR(E2BIG);
+		return XFS_ERROR(EFBIG);
 	}
 	error = xfs_read_buf(mp, mp->m_rtdev_targp,
 				d - XFS_FSB_TO_BB(mp, 1),
@@ -2256,7 +2256,7 @@ xfs_rtmount_init(
 		cmn_err(CE_WARN,
 	"XFS: realtime mount -- xfs_read_buf failed, returned %d", error);
 		if (error == ENOSPC)
-			return XFS_ERROR(E2BIG);
+			return XFS_ERROR(EFBIG);
 		return error;
 	}
 	xfs_buf_relse(bp);
@@ -2277,12 +2277,12 @@ xfs_rtmount_inodes(
 	sbp = &mp->m_sb;
 	if (sbp->sb_rbmino == NULLFSINO)
 		return 0;
-	error = xfs_iget(mp, NULL, sbp->sb_rbmino, 0, 0, &mp->m_rbmip, 0);
+	error = xfs_iget(mp, NULL, sbp->sb_rbmino, 0, 0, &mp->m_rbmip);
 	if (error)
 		return error;
 	ASSERT(mp->m_rbmip != NULL);
 	ASSERT(sbp->sb_rsumino != NULLFSINO);
-	error = xfs_iget(mp, NULL, sbp->sb_rsumino, 0, 0, &mp->m_rsumip, 0);
+	error = xfs_iget(mp, NULL, sbp->sb_rsumino, 0, 0, &mp->m_rsumip);
 	if (error) {
 		IRELE(mp->m_rbmip);
 		return error;
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index b2d67adb6a08..ff614c29b441 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -147,7 +147,16 @@ xfs_growfs_rt(
 # define xfs_rtfree_extent(t,b,l)                       (ENOSYS)
 # define xfs_rtpick_extent(m,t,l,rb)                    (ENOSYS)
 # define xfs_growfs_rt(mp,in)                           (ENOSYS)
-# define xfs_rtmount_init(m)    (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS))
+static inline int		/* error */
+xfs_rtmount_init(
+	xfs_mount_t	*mp)	/* file system mount structure */
+{
+	if (mp->m_sb.sb_rblocks == 0)
+		return 0;
+
+	cmn_err(CE_WARN, "XFS: Not built with CONFIG_XFS_RT");
+	return ENOSYS;
+}
 # define xfs_rtmount_inodes(m)  (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS))
 # define xfs_rtunmount_inodes(m)
 #endif	/* CONFIG_XFS_RT */
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
index 5aa07caea5f1..e336742a58a4 100644
--- a/fs/xfs/xfs_rw.c
+++ b/fs/xfs/xfs_rw.c
@@ -47,48 +47,6 @@
 #include "xfs_trace.h"
 
 /*
- * This is a subroutine for xfs_write() and other writers (xfs_ioctl)
- * which clears the setuid and setgid bits when a file is written.
- */
-int
-xfs_write_clear_setuid(
-	xfs_inode_t	*ip)
-{
-	xfs_mount_t	*mp;
-	xfs_trans_t	*tp;
-	int		error;
-
-	mp = ip->i_mount;
-	tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID);
-	if ((error = xfs_trans_reserve(tp, 0,
-				      XFS_WRITEID_LOG_RES(mp),
-				      0, 0, 0))) {
-		xfs_trans_cancel(tp, 0);
-		return error;
-	}
-	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-	xfs_trans_ihold(tp, ip);
-	ip->i_d.di_mode &= ~S_ISUID;
-
-	/*
-	 * Note that we don't have to worry about mandatory
-	 * file locking being disabled here because we only
-	 * clear the S_ISGID bit if the Group execute bit is
-	 * on, but if it was on then mandatory locking wouldn't
-	 * have been enabled.
-	 */
-	if (ip->i_d.di_mode & S_IXGRP) {
-		ip->i_d.di_mode &= ~S_ISGID;
-	}
-	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-	xfs_trans_set_sync(tp);
-	error = xfs_trans_commit(tp, 0);
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
-	return 0;
-}
-
-/*
  * Force a shutdown of the filesystem instantly while keeping
  * the filesystem consistent. We don't do an unmount here; just shutdown
  * the shop, make sure that absolutely nothing persistent happens to
@@ -153,88 +111,6 @@ xfs_do_force_shutdown(
 	}
 }
 
-
-/*
- * Called when we want to stop a buffer from getting written or read.
- * We attach the EIO error, muck with its flags, and call biodone
- * so that the proper iodone callbacks get called.
- */
-int
-xfs_bioerror(
-	xfs_buf_t *bp)
-{
-
-#ifdef XFSERRORDEBUG
-	ASSERT(XFS_BUF_ISREAD(bp) || bp->b_iodone);
-#endif
-
-	/*
-	 * No need to wait until the buffer is unpinned.
-	 * We aren't flushing it.
-	 */
-	XFS_BUF_ERROR(bp, EIO);
-	/*
-	 * We're calling biodone, so delete B_DONE flag. Either way
-	 * we have to call the iodone callback, and calling biodone
-	 * probably is the best way since it takes care of
-	 * GRIO as well.
-	 */
-	XFS_BUF_UNREAD(bp);
-	XFS_BUF_UNDELAYWRITE(bp);
-	XFS_BUF_UNDONE(bp);
-	XFS_BUF_STALE(bp);
-
-	XFS_BUF_CLR_BDSTRAT_FUNC(bp);
-	xfs_biodone(bp);
-
-	return (EIO);
-}
-
-/*
- * Same as xfs_bioerror, except that we are releasing the buffer
- * here ourselves, and avoiding the biodone call.
- * This is meant for userdata errors; metadata bufs come with
- * iodone functions attached, so that we can track down errors.
- */
-int
-xfs_bioerror_relse(
-	xfs_buf_t *bp)
-{
-	int64_t fl;
-
-	ASSERT(XFS_BUF_IODONE_FUNC(bp) != xfs_buf_iodone_callbacks);
-	ASSERT(XFS_BUF_IODONE_FUNC(bp) != xlog_iodone);
-
-	fl = XFS_BUF_BFLAGS(bp);
-	/*
-	 * No need to wait until the buffer is unpinned.
-	 * We aren't flushing it.
-	 *
-	 * chunkhold expects B_DONE to be set, whether
-	 * we actually finish the I/O or not. We don't want to
-	 * change that interface.
-	 */
-	XFS_BUF_UNREAD(bp);
-	XFS_BUF_UNDELAYWRITE(bp);
-	XFS_BUF_DONE(bp);
-	XFS_BUF_STALE(bp);
-	XFS_BUF_CLR_IODONE_FUNC(bp);
-	XFS_BUF_CLR_BDSTRAT_FUNC(bp);
-	if (!(fl & XFS_B_ASYNC)) {
-		/*
-		 * Mark b_error and B_ERROR _both_.
-		 * Lot's of chunkcache code assumes that.
-		 * There's no reason to mark error for
-		 * ASYNC buffers.
-		 */
-		XFS_BUF_ERROR(bp, EIO);
-		XFS_BUF_FINISH_IOWAIT(bp);
-	} else {
-		xfs_buf_relse(bp);
-	}
-	return (EIO);
-}
-
 /*
  * Prints out an ALERT message about I/O error.
  */
@@ -306,37 +182,6 @@ xfs_read_buf(
 }
 
 /*
- * Wrapper around bwrite() so that we can trap
- * write errors, and act accordingly.
- */
-int
-xfs_bwrite(
-	struct xfs_mount *mp,
-	struct xfs_buf	 *bp)
-{
-	int	error;
-
-	/*
-	 * XXXsup how does this work for quotas.
-	 */
-	XFS_BUF_SET_BDSTRAT_FUNC(bp, xfs_bdstrat_cb);
-	bp->b_mount = mp;
-	XFS_BUF_WRITE(bp);
-
-	if ((error = XFS_bwrite(bp))) {
-		ASSERT(mp);
-		/*
-		 * Cannot put a buftrace here since if the buffer is not
-		 * B_HOLD then we will brelse() the buffer before returning
-		 * from bwrite and we could be tracing a buffer that has
-		 * been reused.
-		 */
-		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
-	}
-	return (error);
-}
-
-/*
  * helper function to extract extent size hint from inode
  */
 xfs_extlen_t
diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h
index 571f2174435c..11c41ec6ed75 100644
--- a/fs/xfs/xfs_rw.h
+++ b/fs/xfs/xfs_rw.h
@@ -39,10 +39,6 @@ xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
 /*
  * Prototypes for functions in xfs_rw.c.
  */
-extern int xfs_write_clear_setuid(struct xfs_inode *ip);
-extern int xfs_bwrite(struct xfs_mount *mp, struct xfs_buf *bp);
-extern int xfs_bioerror(struct xfs_buf *bp);
-extern int xfs_bioerror_relse(struct xfs_buf *bp);
 extern int xfs_read_buf(struct xfs_mount *mp, xfs_buftarg_t *btp,
 			xfs_daddr_t blkno, int len, uint flags,
 			struct xfs_buf **bpp);
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 237badcbac3b..28547dfce037 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -44,148 +44,493 @@
 #include "xfs_trans_priv.h"
 #include "xfs_trans_space.h"
 #include "xfs_inode_item.h"
-
-
-STATIC void	xfs_trans_apply_sb_deltas(xfs_trans_t *);
-STATIC uint	xfs_trans_count_vecs(xfs_trans_t *);
-STATIC void	xfs_trans_fill_vecs(xfs_trans_t *, xfs_log_iovec_t *);
-STATIC void	xfs_trans_uncommit(xfs_trans_t *, uint);
-STATIC void	xfs_trans_committed(xfs_trans_t *, int);
-STATIC void	xfs_trans_chunk_committed(xfs_log_item_chunk_t *, xfs_lsn_t, int);
-STATIC void	xfs_trans_free(xfs_trans_t *);
+#include "xfs_trace.h"
 
 kmem_zone_t	*xfs_trans_zone;
 
 
 /*
- * Reservation functions here avoid a huge stack in xfs_trans_init
- * due to register overflow from temporaries in the calculations.
+ * Various log reservation values.
+ *
+ * These are based on the size of the file system block because that is what
+ * most transactions manipulate.  Each adds in an additional 128 bytes per
+ * item logged to try to account for the overhead of the transaction mechanism.
+ *
+ * Note:  Most of the reservations underestimate the number of allocation
+ * groups into which they could free extents in the xfs_bmap_finish() call.
+ * This is because the number in the worst case is quite high and quite
+ * unusual.  In order to fix this we need to change xfs_bmap_finish() to free
+ * extents in only a single AG at a time.  This will require changes to the
+ * EFI code as well, however, so that the EFI for the extents not freed is
+ * logged again in each transaction.  See SGI PV #261917.
+ *
+ * Reservation functions here avoid a huge stack in xfs_trans_init due to
+ * register overflow from temporaries in the calculations.
  */
 
+
+/*
+ * In a write transaction we can allocate a maximum of 2
+ * extents.  This gives:
+ *    the inode getting the new extents: inode size
+ *    the inode's bmap btree: max depth * block size
+ *    the agfs of the ags from which the extents are allocated: 2 * sector
+ *    the superblock free block counter: sector size
+ *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
+ * And the bmap_finish transaction can free bmap blocks in a join:
+ *    the agfs of the ags containing the blocks: 2 * sector size
+ *    the agfls of the ags containing the blocks: 2 * sector size
+ *    the super block free block counter: sector size
+ *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
+ */
 STATIC uint
-xfs_calc_write_reservation(xfs_mount_t *mp)
+xfs_calc_write_reservation(
+	struct xfs_mount	*mp)
 {
-	return XFS_CALC_WRITE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp);
+	return XFS_DQUOT_LOGRES(mp) +
+		MAX((mp->m_sb.sb_inodesize +
+		     XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) +
+		     2 * mp->m_sb.sb_sectsize +
+		     mp->m_sb.sb_sectsize +
+		     XFS_ALLOCFREE_LOG_RES(mp, 2) +
+		     128 * (4 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) +
+			    XFS_ALLOCFREE_LOG_COUNT(mp, 2))),
+		    (2 * mp->m_sb.sb_sectsize +
+		     2 * mp->m_sb.sb_sectsize +
+		     mp->m_sb.sb_sectsize +
+		     XFS_ALLOCFREE_LOG_RES(mp, 2) +
+		     128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))));
 }
 
+/*
+ * In truncating a file we free up to two extents at once.  We can modify:
+ *    the inode being truncated: inode size
+ *    the inode's bmap btree: (max depth + 1) * block size
+ * And the bmap_finish transaction can free the blocks and bmap blocks:
+ *    the agf for each of the ags: 4 * sector size
+ *    the agfl for each of the ags: 4 * sector size
+ *    the super block to reflect the freed blocks: sector size
+ *    worst case split in allocation btrees per extent assuming 4 extents:
+ *		4 exts * 2 trees * (2 * max depth - 1) * block size
+ *    the inode btree: max depth * blocksize
+ *    the allocation btrees: 2 trees * (max depth - 1) * block size
+ */
 STATIC uint
-xfs_calc_itruncate_reservation(xfs_mount_t *mp)
+xfs_calc_itruncate_reservation(
+	struct xfs_mount	*mp)
 {
-	return XFS_CALC_ITRUNCATE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp);
+	return XFS_DQUOT_LOGRES(mp) +
+		MAX((mp->m_sb.sb_inodesize +
+		     XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1) +
+		     128 * (2 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK))),
+		    (4 * mp->m_sb.sb_sectsize +
+		     4 * mp->m_sb.sb_sectsize +
+		     mp->m_sb.sb_sectsize +
+		     XFS_ALLOCFREE_LOG_RES(mp, 4) +
+		     128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4)) +
+		     128 * 5 +
+		     XFS_ALLOCFREE_LOG_RES(mp, 1) +
+		     128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels +
+			    XFS_ALLOCFREE_LOG_COUNT(mp, 1))));
 }
 
+/*
+ * In renaming a files we can modify:
+ *    the four inodes involved: 4 * inode size
+ *    the two directory btrees: 2 * (max depth + v2) * dir block size
+ *    the two directory bmap btrees: 2 * max depth * block size
+ * And the bmap_finish transaction can free dir and bmap blocks (two sets
+ *	of bmap blocks) giving:
+ *    the agf for the ags in which the blocks live: 3 * sector size
+ *    the agfl for the ags in which the blocks live: 3 * sector size
+ *    the superblock for the free block count: sector size
+ *    the allocation btrees: 3 exts * 2 trees * (2 * max depth - 1) * block size
+ */
 STATIC uint
-xfs_calc_rename_reservation(xfs_mount_t *mp)
+xfs_calc_rename_reservation(
+	struct xfs_mount	*mp)
 {
-	return XFS_CALC_RENAME_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp);
+	return XFS_DQUOT_LOGRES(mp) +
+		MAX((4 * mp->m_sb.sb_inodesize +
+		     2 * XFS_DIROP_LOG_RES(mp) +
+		     128 * (4 + 2 * XFS_DIROP_LOG_COUNT(mp))),
+		    (3 * mp->m_sb.sb_sectsize +
+		     3 * mp->m_sb.sb_sectsize +
+		     mp->m_sb.sb_sectsize +
+		     XFS_ALLOCFREE_LOG_RES(mp, 3) +
+		     128 * (7 + XFS_ALLOCFREE_LOG_COUNT(mp, 3))));
 }
 
+/*
+ * For creating a link to an inode:
+ *    the parent directory inode: inode size
+ *    the linked inode: inode size
+ *    the directory btree could split: (max depth + v2) * dir block size
+ *    the directory bmap btree could join or split: (max depth + v2) * blocksize
+ * And the bmap_finish transaction can free some bmap blocks giving:
+ *    the agf for the ag in which the blocks live: sector size
+ *    the agfl for the ag in which the blocks live: sector size
+ *    the superblock for the free block count: sector size
+ *    the allocation btrees: 2 trees * (2 * max depth - 1) * block size
+ */
 STATIC uint
-xfs_calc_link_reservation(xfs_mount_t *mp)
+xfs_calc_link_reservation(
+	struct xfs_mount	*mp)
 {
-	return XFS_CALC_LINK_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp);
+	return XFS_DQUOT_LOGRES(mp) +
+		MAX((mp->m_sb.sb_inodesize +
+		     mp->m_sb.sb_inodesize +
+		     XFS_DIROP_LOG_RES(mp) +
+		     128 * (2 + XFS_DIROP_LOG_COUNT(mp))),
+		    (mp->m_sb.sb_sectsize +
+		     mp->m_sb.sb_sectsize +
+		     mp->m_sb.sb_sectsize +
+		     XFS_ALLOCFREE_LOG_RES(mp, 1) +
+		     128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1))));
 }
 
+/*
+ * For removing a directory entry we can modify:
+ *    the parent directory inode: inode size
+ *    the removed inode: inode size
+ *    the directory btree could join: (max depth + v2) * dir block size
+ *    the directory bmap btree could join or split: (max depth + v2) * blocksize
+ * And the bmap_finish transaction can free the dir and bmap blocks giving:
+ *    the agf for the ag in which the blocks live: 2 * sector size
+ *    the agfl for the ag in which the blocks live: 2 * sector size
+ *    the superblock for the free block count: sector size
+ *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
+ */
 STATIC uint
-xfs_calc_remove_reservation(xfs_mount_t *mp)
+xfs_calc_remove_reservation(
+	struct xfs_mount	*mp)
 {
-	return XFS_CALC_REMOVE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp);
+	return XFS_DQUOT_LOGRES(mp) +
+		MAX((mp->m_sb.sb_inodesize +
+		     mp->m_sb.sb_inodesize +
+		     XFS_DIROP_LOG_RES(mp) +
+		     128 * (2 + XFS_DIROP_LOG_COUNT(mp))),
+		    (2 * mp->m_sb.sb_sectsize +
+		     2 * mp->m_sb.sb_sectsize +
+		     mp->m_sb.sb_sectsize +
+		     XFS_ALLOCFREE_LOG_RES(mp, 2) +
+		     128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))));
 }
 
+/*
+ * For symlink we can modify:
+ *    the parent directory inode: inode size
+ *    the new inode: inode size
+ *    the inode btree entry: 1 block
+ *    the directory btree: (max depth + v2) * dir block size
+ *    the directory inode's bmap btree: (max depth + v2) * block size
+ *    the blocks for the symlink: 1 kB
+ * Or in the first xact we allocate some inodes giving:
+ *    the agi and agf of the ag getting the new inodes: 2 * sectorsize
+ *    the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
+ *    the inode btree: max depth * blocksize
+ *    the allocation btrees: 2 trees * (2 * max depth - 1) * block size
+ */
 STATIC uint
-xfs_calc_symlink_reservation(xfs_mount_t *mp)
+xfs_calc_symlink_reservation(
+	struct xfs_mount	*mp)
 {
-	return XFS_CALC_SYMLINK_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp);
+	return XFS_DQUOT_LOGRES(mp) +
+		MAX((mp->m_sb.sb_inodesize +
+		     mp->m_sb.sb_inodesize +
+		     XFS_FSB_TO_B(mp, 1) +
+		     XFS_DIROP_LOG_RES(mp) +
+		     1024 +
+		     128 * (4 + XFS_DIROP_LOG_COUNT(mp))),
+		    (2 * mp->m_sb.sb_sectsize +
+		     XFS_FSB_TO_B(mp, XFS_IALLOC_BLOCKS(mp)) +
+		     XFS_FSB_TO_B(mp, mp->m_in_maxlevels) +
+		     XFS_ALLOCFREE_LOG_RES(mp, 1) +
+		     128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels +
+			    XFS_ALLOCFREE_LOG_COUNT(mp, 1))));
 }
 
+/*
+ * For create we can modify:
+ *    the parent directory inode: inode size
+ *    the new inode: inode size
+ *    the inode btree entry: block size
+ *    the superblock for the nlink flag: sector size
+ *    the directory btree: (max depth + v2) * dir block size
+ *    the directory inode's bmap btree: (max depth + v2) * block size
+ * Or in the first xact we allocate some inodes giving:
+ *    the agi and agf of the ag getting the new inodes: 2 * sectorsize
+ *    the superblock for the nlink flag: sector size
+ *    the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
+ *    the inode btree: max depth * blocksize
+ *    the allocation btrees: 2 trees * (max depth - 1) * block size
+ */
 STATIC uint
-xfs_calc_create_reservation(xfs_mount_t *mp)
+xfs_calc_create_reservation(
+	struct xfs_mount	*mp)
 {
-	return XFS_CALC_CREATE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp);
+	return XFS_DQUOT_LOGRES(mp) +
+		MAX((mp->m_sb.sb_inodesize +
+		     mp->m_sb.sb_inodesize +
+		     mp->m_sb.sb_sectsize +
+		     XFS_FSB_TO_B(mp, 1) +
+		     XFS_DIROP_LOG_RES(mp) +
+		     128 * (3 + XFS_DIROP_LOG_COUNT(mp))),
+		    (3 * mp->m_sb.sb_sectsize +
+		     XFS_FSB_TO_B(mp, XFS_IALLOC_BLOCKS(mp)) +
+		     XFS_FSB_TO_B(mp, mp->m_in_maxlevels) +
+		     XFS_ALLOCFREE_LOG_RES(mp, 1) +
+		     128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels +
+			    XFS_ALLOCFREE_LOG_COUNT(mp, 1))));
 }
 
+/*
+ * Making a new directory is the same as creating a new file.
+ */
 STATIC uint
-xfs_calc_mkdir_reservation(xfs_mount_t *mp)
+xfs_calc_mkdir_reservation(
+	struct xfs_mount	*mp)
 {
-	return XFS_CALC_MKDIR_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp);
+	return xfs_calc_create_reservation(mp);
 }
 
+/*
+ * In freeing an inode we can modify:
+ *    the inode being freed: inode size
+ *    the super block free inode counter: sector size
+ *    the agi hash list and counters: sector size
+ *    the inode btree entry: block size
+ *    the on disk inode before ours in the agi hash list: inode cluster size
+ *    the inode btree: max depth * blocksize
+ *    the allocation btrees: 2 trees * (max depth - 1) * block size
+ */
 STATIC uint
-xfs_calc_ifree_reservation(xfs_mount_t *mp)
+xfs_calc_ifree_reservation(
+	struct xfs_mount	*mp)
 {
-	return XFS_CALC_IFREE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp);
+	return XFS_DQUOT_LOGRES(mp) +
+		mp->m_sb.sb_inodesize +
+		mp->m_sb.sb_sectsize +
+		mp->m_sb.sb_sectsize +
+		XFS_FSB_TO_B(mp, 1) +
+		MAX((__uint16_t)XFS_FSB_TO_B(mp, 1),
+		    XFS_INODE_CLUSTER_SIZE(mp)) +
+		128 * 5 +
+		XFS_ALLOCFREE_LOG_RES(mp, 1) +
+		128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels +
+		       XFS_ALLOCFREE_LOG_COUNT(mp, 1));
 }
 
+/*
+ * When only changing the inode we log the inode and possibly the superblock
+ * We also add a bit of slop for the transaction stuff.
+ */
 STATIC uint
-xfs_calc_ichange_reservation(xfs_mount_t *mp)
+xfs_calc_ichange_reservation(
+	struct xfs_mount	*mp)
 {
-	return XFS_CALC_ICHANGE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp);
+	return XFS_DQUOT_LOGRES(mp) +
+		mp->m_sb.sb_inodesize +
+		mp->m_sb.sb_sectsize +
+		512;
+
 }
 
+/*
+ * Growing the data section of the filesystem.
+ *	superblock
+ *	agi and agf
+ *	allocation btrees
+ */
 STATIC uint
-xfs_calc_growdata_reservation(xfs_mount_t *mp)
+xfs_calc_growdata_reservation(
+	struct xfs_mount	*mp)
 {
-	return XFS_CALC_GROWDATA_LOG_RES(mp);
+	return mp->m_sb.sb_sectsize * 3 +
+		XFS_ALLOCFREE_LOG_RES(mp, 1) +
+		128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1));
 }
 
+/*
+ * Growing the rt section of the filesystem.
+ * In the first set of transactions (ALLOC) we allocate space to the
+ * bitmap or summary files.
+ *	superblock: sector size
+ *	agf of the ag from which the extent is allocated: sector size
+ *	bmap btree for bitmap/summary inode: max depth * blocksize
+ *	bitmap/summary inode: inode size
+ *	allocation btrees for 1 block alloc: 2 * (2 * maxdepth - 1) * blocksize
+ */
 STATIC uint
-xfs_calc_growrtalloc_reservation(xfs_mount_t *mp)
+xfs_calc_growrtalloc_reservation(
+	struct xfs_mount	*mp)
 {
-	return XFS_CALC_GROWRTALLOC_LOG_RES(mp);
+	return 2 * mp->m_sb.sb_sectsize +
+		XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) +
+		mp->m_sb.sb_inodesize +
+		XFS_ALLOCFREE_LOG_RES(mp, 1) +
+		128 * (3 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) +
+		       XFS_ALLOCFREE_LOG_COUNT(mp, 1));
 }
 
+/*
+ * Growing the rt section of the filesystem.
+ * In the second set of transactions (ZERO) we zero the new metadata blocks.
+ *	one bitmap/summary block: blocksize
+ */
 STATIC uint
-xfs_calc_growrtzero_reservation(xfs_mount_t *mp)
+xfs_calc_growrtzero_reservation(
+	struct xfs_mount	*mp)
 {
-	return XFS_CALC_GROWRTZERO_LOG_RES(mp);
+	return mp->m_sb.sb_blocksize + 128;
 }
 
+/*
+ * Growing the rt section of the filesystem.
+ * In the third set of transactions (FREE) we update metadata without
+ * allocating any new blocks.
+ *	superblock: sector size
+ *	bitmap inode: inode size
+ *	summary inode: inode size
+ *	one bitmap block: blocksize
+ *	summary blocks: new summary size
+ */
 STATIC uint
-xfs_calc_growrtfree_reservation(xfs_mount_t *mp)
+xfs_calc_growrtfree_reservation(
+	struct xfs_mount	*mp)
 {
-	return XFS_CALC_GROWRTFREE_LOG_RES(mp);
+	return mp->m_sb.sb_sectsize +
+		2 * mp->m_sb.sb_inodesize +
+		mp->m_sb.sb_blocksize +
+		mp->m_rsumsize +
+		128 * 5;
 }
 
+/*
+ * Logging the inode modification timestamp on a synchronous write.
+ *	inode
+ */
 STATIC uint
-xfs_calc_swrite_reservation(xfs_mount_t *mp)
+xfs_calc_swrite_reservation(
+	struct xfs_mount	*mp)
 {
-	return XFS_CALC_SWRITE_LOG_RES(mp);
+	return mp->m_sb.sb_inodesize + 128;
 }
 
+/*
+ * Logging the inode mode bits when writing a setuid/setgid file
+ *	inode
+ */
 STATIC uint
 xfs_calc_writeid_reservation(xfs_mount_t *mp)
 {
-	return XFS_CALC_WRITEID_LOG_RES(mp);
+	return mp->m_sb.sb_inodesize + 128;
 }
 
+/*
+ * Converting the inode from non-attributed to attributed.
+ *	the inode being converted: inode size
+ *	agf block and superblock (for block allocation)
+ *	the new block (directory sized)
+ *	bmap blocks for the new directory block
+ *	allocation btrees
+ */
 STATIC uint
-xfs_calc_addafork_reservation(xfs_mount_t *mp)
+xfs_calc_addafork_reservation(
+	struct xfs_mount	*mp)
 {
-	return XFS_CALC_ADDAFORK_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp);
+	return XFS_DQUOT_LOGRES(mp) +
+		mp->m_sb.sb_inodesize +
+		mp->m_sb.sb_sectsize * 2 +
+		mp->m_dirblksize +
+		XFS_FSB_TO_B(mp, XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1) +
+		XFS_ALLOCFREE_LOG_RES(mp, 1) +
+		128 * (4 + XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1 +
+		       XFS_ALLOCFREE_LOG_COUNT(mp, 1));
 }
 
+/*
+ * Removing the attribute fork of a file
+ *    the inode being truncated: inode size
+ *    the inode's bmap btree: max depth * block size
+ * And the bmap_finish transaction can free the blocks and bmap blocks:
+ *    the agf for each of the ags: 4 * sector size
+ *    the agfl for each of the ags: 4 * sector size
+ *    the super block to reflect the freed blocks: sector size
+ *    worst case split in allocation btrees per extent assuming 4 extents:
+ *		4 exts * 2 trees * (2 * max depth - 1) * block size
+ */
 STATIC uint
-xfs_calc_attrinval_reservation(xfs_mount_t *mp)
+xfs_calc_attrinval_reservation(
+	struct xfs_mount	*mp)
 {
-	return XFS_CALC_ATTRINVAL_LOG_RES(mp);
+	return MAX((mp->m_sb.sb_inodesize +
+		    XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) +
+		    128 * (1 + XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))),
+		   (4 * mp->m_sb.sb_sectsize +
+		    4 * mp->m_sb.sb_sectsize +
+		    mp->m_sb.sb_sectsize +
+		    XFS_ALLOCFREE_LOG_RES(mp, 4) +
+		    128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4))));
 }
 
+/*
+ * Setting an attribute.
+ *	the inode getting the attribute
+ *	the superblock for allocations
+ *	the agfs extents are allocated from
+ *	the attribute btree * max depth
+ *	the inode allocation btree
+ * Since attribute transaction space is dependent on the size of the attribute,
+ * the calculation is done partially at mount time and partially at runtime.
+ */
 STATIC uint
-xfs_calc_attrset_reservation(xfs_mount_t *mp)
+xfs_calc_attrset_reservation(
+	struct xfs_mount	*mp)
 {
-	return XFS_CALC_ATTRSET_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp);
+	return XFS_DQUOT_LOGRES(mp) +
+		mp->m_sb.sb_inodesize +
+		mp->m_sb.sb_sectsize +
+		XFS_FSB_TO_B(mp, XFS_DA_NODE_MAXDEPTH) +
+		128 * (2 + XFS_DA_NODE_MAXDEPTH);
 }
 
+/*
+ * Removing an attribute.
+ *    the inode: inode size
+ *    the attribute btree could join: max depth * block size
+ *    the inode bmap btree could join or split: max depth * block size
+ * And the bmap_finish transaction can free the attr blocks freed giving:
+ *    the agf for the ag in which the blocks live: 2 * sector size
+ *    the agfl for the ag in which the blocks live: 2 * sector size
+ *    the superblock for the free block count: sector size
+ *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
+ */
 STATIC uint
-xfs_calc_attrrm_reservation(xfs_mount_t *mp)
+xfs_calc_attrrm_reservation(
+	struct xfs_mount	*mp)
 {
-	return XFS_CALC_ATTRRM_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp);
+	return XFS_DQUOT_LOGRES(mp) +
+		MAX((mp->m_sb.sb_inodesize +
+		     XFS_FSB_TO_B(mp, XFS_DA_NODE_MAXDEPTH) +
+		     XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) +
+		     128 * (1 + XFS_DA_NODE_MAXDEPTH +
+			    XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK))),
+		    (2 * mp->m_sb.sb_sectsize +
+		     2 * mp->m_sb.sb_sectsize +
+		     mp->m_sb.sb_sectsize +
+		     XFS_ALLOCFREE_LOG_RES(mp, 2) +
+		     128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))));
 }
 
+/*
+ * Clearing a bad agino number in an agi hash bucket.
+ */
 STATIC uint
-xfs_calc_clear_agi_bucket_reservation(xfs_mount_t *mp)
+xfs_calc_clear_agi_bucket_reservation(
+	struct xfs_mount	*mp)
 {
-	return XFS_CALC_CLEAR_AGI_BUCKET_LOG_RES(mp);
+	return mp->m_sb.sb_sectsize + 128;
 }
 
 /*
@@ -194,11 +539,10 @@ xfs_calc_clear_agi_bucket_reservation(xfs_mount_t *mp)
  */
 void
 xfs_trans_init(
-	xfs_mount_t	*mp)
+	struct xfs_mount	*mp)
 {
-	xfs_trans_reservations_t	*resp;
+	struct xfs_trans_reservations *resp = &mp->m_reservations;
 
-	resp = &(mp->m_reservations);
 	resp->tr_write = xfs_calc_write_reservation(mp);
 	resp->tr_itruncate = xfs_calc_itruncate_reservation(mp);
 	resp->tr_rename = xfs_calc_rename_reservation(mp);
@@ -254,13 +598,30 @@ _xfs_trans_alloc(
 	tp->t_type = type;
 	tp->t_mountp = mp;
 	tp->t_items_free = XFS_LIC_NUM_SLOTS;
-	tp->t_busy_free = XFS_LBC_NUM_SLOTS;
 	xfs_lic_init(&(tp->t_items));
-	XFS_LBC_INIT(&(tp->t_busy));
+	INIT_LIST_HEAD(&tp->t_busy);
 	return tp;
 }
 
 /*
+ * Free the transaction structure.  If there is more clean up
+ * to do when the structure is freed, add it here.
+ */
+STATIC void
+xfs_trans_free(
+	struct xfs_trans	*tp)
+{
+	struct xfs_busy_extent	*busyp, *n;
+
+	list_for_each_entry_safe(busyp, n, &tp->t_busy, list)
+		xfs_alloc_busy_clear(tp->t_mountp, busyp);
+
+	atomic_dec(&tp->t_mountp->m_active_trans);
+	xfs_trans_free_dqinfo(tp);
+	kmem_zone_free(xfs_trans_zone, tp);
+}
+
+/*
  * This is called to create a new transaction which will share the
  * permanent log reservation of the given transaction.  The remaining
  * unused block and rt extent reservations are also inherited.  This
@@ -283,9 +644,8 @@ xfs_trans_dup(
 	ntp->t_type = tp->t_type;
 	ntp->t_mountp = tp->t_mountp;
 	ntp->t_items_free = XFS_LIC_NUM_SLOTS;
-	ntp->t_busy_free = XFS_LBC_NUM_SLOTS;
 	xfs_lic_init(&(ntp->t_items));
-	XFS_LBC_INIT(&(ntp->t_busy));
+	INIT_LIST_HEAD(&ntp->t_busy);
 
 	ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
 	ASSERT(tp->t_ticket != NULL);
@@ -421,7 +781,6 @@ undo_blocks:
 	return error;
 }
 
-
 /*
  * Record the indicated change to the given field for application
  * to the file system's superblock when the transaction commits.
@@ -650,7 +1009,7 @@ xfs_trans_apply_sb_deltas(
  * XFS_TRANS_SB_DIRTY will not be set when the transaction is updated but we
  * still need to update the incore superblock with the changes.
  */
-STATIC void
+void
 xfs_trans_unreserve_and_mod_sb(
 	xfs_trans_t	*tp)
 {
@@ -764,94 +1123,256 @@ xfs_trans_unreserve_and_mod_sb(
 	}
 }
 
+/*
+ * Total up the number of log iovecs needed to commit this
+ * transaction.  The transaction itself needs one for the
+ * transaction header.  Ask each dirty item in turn how many
+ * it needs to get the total.
+ */
+static uint
+xfs_trans_count_vecs(
+	struct xfs_trans	*tp)
+{
+	int			nvecs;
+	xfs_log_item_desc_t	*lidp;
+
+	nvecs = 1;
+	lidp = xfs_trans_first_item(tp);
+	ASSERT(lidp != NULL);
+
+	/* In the non-debug case we need to start bailing out if we
+	 * didn't find a log_item here, return zero and let trans_commit
+	 * deal with it.
+	 */
+	if (lidp == NULL)
+		return 0;
+
+	while (lidp != NULL) {
+		/*
+		 * Skip items which aren't dirty in this transaction.
+		 */
+		if (!(lidp->lid_flags & XFS_LID_DIRTY)) {
+			lidp = xfs_trans_next_item(tp, lidp);
+			continue;
+		}
+		lidp->lid_size = IOP_SIZE(lidp->lid_item);
+		nvecs += lidp->lid_size;
+		lidp = xfs_trans_next_item(tp, lidp);
+	}
+
+	return nvecs;
+}
 
 /*
- * xfs_trans_commit
+ * Fill in the vector with pointers to data to be logged
+ * by this transaction.  The transaction header takes
+ * the first vector, and then each dirty item takes the
+ * number of vectors it indicated it needed in xfs_trans_count_vecs().
  *
- * Commit the given transaction to the log a/synchronously.
+ * As each item fills in the entries it needs, also pin the item
+ * so that it cannot be flushed out until the log write completes.
+ */
+static void
+xfs_trans_fill_vecs(
+	struct xfs_trans	*tp,
+	struct xfs_log_iovec	*log_vector)
+{
+	xfs_log_item_desc_t	*lidp;
+	struct xfs_log_iovec	*vecp;
+	uint			nitems;
+
+	/*
+	 * Skip over the entry for the transaction header, we'll
+	 * fill that in at the end.
+	 */
+	vecp = log_vector + 1;
+
+	nitems = 0;
+	lidp = xfs_trans_first_item(tp);
+	ASSERT(lidp);
+	while (lidp) {
+		/* Skip items which aren't dirty in this transaction. */
+		if (!(lidp->lid_flags & XFS_LID_DIRTY)) {
+			lidp = xfs_trans_next_item(tp, lidp);
+			continue;
+		}
+
+		/*
+		 * The item may be marked dirty but not log anything.  This can
+		 * be used to get called when a transaction is committed.
+		 */
+		if (lidp->lid_size)
+			nitems++;
+		IOP_FORMAT(lidp->lid_item, vecp);
+		vecp += lidp->lid_size;
+		IOP_PIN(lidp->lid_item);
+		lidp = xfs_trans_next_item(tp, lidp);
+	}
+
+	/*
+	 * Now that we've counted the number of items in this transaction, fill
+	 * in the transaction header. Note that the transaction header does not
+	 * have a log item.
+	 */
+	tp->t_header.th_magic = XFS_TRANS_HEADER_MAGIC;
+	tp->t_header.th_type = tp->t_type;
+	tp->t_header.th_num_items = nitems;
+	log_vector->i_addr = (xfs_caddr_t)&tp->t_header;
+	log_vector->i_len = sizeof(xfs_trans_header_t);
+	log_vector->i_type = XLOG_REG_TYPE_TRANSHDR;
+}
+
+/*
+ * The committed item processing consists of calling the committed routine of
+ * each logged item, updating the item's position in the AIL if necessary, and
+ * unpinning each item.  If the committed routine returns -1, then do nothing
+ * further with the item because it may have been freed.
  *
- * XFS disk error handling mechanism is not based on a typical
- * transaction abort mechanism. Logically after the filesystem
- * gets marked 'SHUTDOWN', we can't let any new transactions
- * be durable - ie. committed to disk - because some metadata might
- * be inconsistent. In such cases, this returns an error, and the
- * caller may assume that all locked objects joined to the transaction
- * have already been unlocked as if the commit had succeeded.
- * Do not reference the transaction structure after this call.
+ * Since items are unlocked when they are copied to the incore log, it is
+ * possible for two transactions to be completing and manipulating the same
+ * item simultaneously.  The AIL lock will protect the lsn field of each item.
+ * The value of this field can never go backwards.
+ *
+ * We unpin the items after repositioning them in the AIL, because otherwise
+ * they could be immediately flushed and we'd have to race with the flusher
+ * trying to pull the item from the AIL as we add it.
  */
- /*ARGSUSED*/
-int
-_xfs_trans_commit(
-	xfs_trans_t	*tp,
-	uint		flags,
-	int		*log_flushed)
+void
+xfs_trans_item_committed(
+	struct xfs_log_item	*lip,
+	xfs_lsn_t		commit_lsn,
+	int			aborted)
 {
-	xfs_log_iovec_t		*log_vector;
-	int			nvec;
-	xfs_mount_t		*mp;
-	xfs_lsn_t		commit_lsn;
-	/* REFERENCED */
-	int			error;
-	int			log_flags;
-	int			sync;
-#define	XFS_TRANS_LOGVEC_COUNT	16
-	xfs_log_iovec_t		log_vector_fast[XFS_TRANS_LOGVEC_COUNT];
-	void			*commit_iclog;
-	int			shutdown;
+	xfs_lsn_t		item_lsn;
+	struct xfs_ail		*ailp;
+
+	if (aborted)
+		lip->li_flags |= XFS_LI_ABORTED;
+	item_lsn = IOP_COMMITTED(lip, commit_lsn);
 
-	commit_lsn = -1;
+	/* If the committed routine returns -1, item has been freed. */
+	if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0)
+		return;
 
 	/*
-	 * Determine whether this commit is releasing a permanent
-	 * log reservation or not.
+	 * If the returned lsn is greater than what it contained before, update
+	 * the location of the item in the AIL.  If it is not, then do nothing.
+	 * Items can never move backwards in the AIL.
+	 *
+	 * While the new lsn should usually be greater, it is possible that a
+	 * later transaction completing simultaneously with an earlier one
+	 * using the same item could complete first with a higher lsn.  This
+	 * would cause the earlier transaction to fail the test below.
 	 */
-	if (flags & XFS_TRANS_RELEASE_LOG_RES) {
-		ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
-		log_flags = XFS_LOG_REL_PERM_RESERV;
+	ailp = lip->li_ailp;
+	spin_lock(&ailp->xa_lock);
+	if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0) {
+		/*
+		 * This will set the item's lsn to item_lsn and update the
+		 * position of the item in the AIL.
+		 *
+		 * xfs_trans_ail_update() drops the AIL lock.
+		 */
+		xfs_trans_ail_update(ailp, lip, item_lsn);
 	} else {
-		log_flags = 0;
+		spin_unlock(&ailp->xa_lock);
 	}
-	mp = tp->t_mountp;
 
 	/*
-	 * If there is nothing to be logged by the transaction,
-	 * then unlock all of the items associated with the
-	 * transaction and free the transaction structure.
-	 * Also make sure to return any reserved blocks to
-	 * the free pool.
+	 * Now that we've repositioned the item in the AIL, unpin it so it can
+	 * be flushed. Pass information about buffer stale state down from the
+	 * log item flags, if anyone else stales the buffer we do not want to
+	 * pay any attention to it.
 	 */
-shut_us_down:
-	shutdown = XFS_FORCED_SHUTDOWN(mp) ? EIO : 0;
-	if (!(tp->t_flags & XFS_TRANS_DIRTY) || shutdown) {
-		xfs_trans_unreserve_and_mod_sb(tp);
+	IOP_UNPIN(lip);
+}
+
+/*
+ * This is typically called by the LM when a transaction has been fully
+ * committed to disk.  It needs to unpin the items which have
+ * been logged by the transaction and update their positions
+ * in the AIL if necessary.
+ *
+ * This also gets called when the transactions didn't get written out
+ * because of an I/O error. Abortflag & XFS_LI_ABORTED is set then.
+ */
+STATIC void
+xfs_trans_committed(
+	struct xfs_trans	*tp,
+	int			abortflag)
+{
+	xfs_log_item_desc_t	*lidp;
+	xfs_log_item_chunk_t	*licp;
+	xfs_log_item_chunk_t	*next_licp;
+
+	/* Call the transaction's completion callback if there is one. */
+	if (tp->t_callback != NULL)
+		tp->t_callback(tp, tp->t_callarg);
+
+	for (lidp = xfs_trans_first_item(tp);
+	     lidp != NULL;
+	     lidp = xfs_trans_next_item(tp, lidp)) {
+		xfs_trans_item_committed(lidp->lid_item, tp->t_lsn, abortflag);
+	}
+
+	/* free the item chunks, ignoring the embedded chunk */
+	for (licp = tp->t_items.lic_next; licp != NULL; licp = next_licp) {
+		next_licp = licp->lic_next;
+		kmem_free(licp);
+	}
+
+	xfs_trans_free(tp);
+}
+
+/*
+ * Called from the trans_commit code when we notice that
+ * the filesystem is in the middle of a forced shutdown.
+ */
+STATIC void
+xfs_trans_uncommit(
+	struct xfs_trans	*tp,
+	uint			flags)
+{
+	xfs_log_item_desc_t	*lidp;
+
+	for (lidp = xfs_trans_first_item(tp);
+	     lidp != NULL;
+	     lidp = xfs_trans_next_item(tp, lidp)) {
 		/*
-		 * It is indeed possible for the transaction to be
-		 * not dirty but the dqinfo portion to be. All that
-		 * means is that we have some (non-persistent) quota
-		 * reservations that need to be unreserved.
+		 * Unpin all but those that aren't dirty.
 		 */
-		xfs_trans_unreserve_and_mod_dquots(tp);
-		if (tp->t_ticket) {
-			commit_lsn = xfs_log_done(mp, tp->t_ticket,
-							NULL, log_flags);
-			if (commit_lsn == -1 && !shutdown)
-				shutdown = XFS_ERROR(EIO);
-		}
-		current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
-		xfs_trans_free_items(tp, shutdown? XFS_TRANS_ABORT : 0);
-		xfs_trans_free_busy(tp);
-		xfs_trans_free(tp);
-		XFS_STATS_INC(xs_trans_empty);
-		return (shutdown);
+		if (lidp->lid_flags & XFS_LID_DIRTY)
+			IOP_UNPIN_REMOVE(lidp->lid_item, tp);
 	}
-	ASSERT(tp->t_ticket != NULL);
 
-	/*
-	 * If we need to update the superblock, then do it now.
-	 */
-	if (tp->t_flags & XFS_TRANS_SB_DIRTY)
-		xfs_trans_apply_sb_deltas(tp);
-	xfs_trans_apply_dquot_deltas(tp);
+	xfs_trans_unreserve_and_mod_sb(tp);
+	xfs_trans_unreserve_and_mod_dquots(tp);
+
+	xfs_trans_free_items(tp, NULLCOMMITLSN, flags);
+	xfs_trans_free(tp);
+}
+
+/*
+ * Format the transaction direct to the iclog. This isolates the physical
+ * transaction commit operation from the logical operation and hence allows
+ * other methods to be introduced without affecting the existing commit path.
+ */
+static int
+xfs_trans_commit_iclog(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	xfs_lsn_t		*commit_lsn,
+	int			flags)
+{
+	int			shutdown;
+	int			error;
+	int			log_flags = 0;
+	struct xlog_in_core	*commit_iclog;
+#define XFS_TRANS_LOGVEC_COUNT  16
+	struct xfs_log_iovec	log_vector_fast[XFS_TRANS_LOGVEC_COUNT];
+	struct xfs_log_iovec	*log_vector;
+	uint			nvec;
+
 
 	/*
 	 * Ask each log item how many log_vector entries it will
@@ -861,8 +1382,7 @@ shut_us_down:
 	 */
 	nvec = xfs_trans_count_vecs(tp);
 	if (nvec == 0) {
-		xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
-		goto shut_us_down;
+		return ENOMEM;	/* triggers a shutdown! */
 	} else if (nvec <= XFS_TRANS_LOGVEC_COUNT) {
 		log_vector = log_vector_fast;
 	} else {
@@ -877,6 +1397,9 @@ shut_us_down:
 	 */
 	xfs_trans_fill_vecs(tp, log_vector);
 
+	if (flags & XFS_TRANS_RELEASE_LOG_RES)
+		log_flags = XFS_LOG_REL_PERM_RESERV;
+
 	error = xfs_log_write(mp, log_vector, nvec, tp->t_ticket, &(tp->t_lsn));
 
 	/*
@@ -884,18 +1407,19 @@ shut_us_down:
 	 * at any time after this call.  However, all the items associated
 	 * with the transaction are still locked and pinned in memory.
 	 */
-	commit_lsn = xfs_log_done(mp, tp->t_ticket, &commit_iclog, log_flags);
+	*commit_lsn = xfs_log_done(mp, tp->t_ticket, &commit_iclog, log_flags);
 
-	tp->t_commit_lsn = commit_lsn;
-	if (nvec > XFS_TRANS_LOGVEC_COUNT) {
+	tp->t_commit_lsn = *commit_lsn;
+	trace_xfs_trans_commit_lsn(tp);
+
+	if (nvec > XFS_TRANS_LOGVEC_COUNT)
 		kmem_free(log_vector);
-	}
 
 	/*
 	 * If we got a log write error. Unpin the logitems that we
 	 * had pinned, clean up, free trans structure, and return error.
 	 */
-	if (error || commit_lsn == -1) {
+	if (error || *commit_lsn == -1) {
 		current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
 		xfs_trans_uncommit(tp, flags|XFS_TRANS_ABORT);
 		return XFS_ERROR(EIO);
@@ -909,8 +1433,6 @@ shut_us_down:
 	 */
 	xfs_trans_unreserve_and_mod_sb(tp);
 
-	sync = tp->t_flags & XFS_TRANS_SYNC;
-
 	/*
 	 * Tell the LM to call the transaction completion routine
 	 * when the log write with LSN commit_lsn completes (e.g.
@@ -953,7 +1475,7 @@ shut_us_down:
 	 * the commit lsn of this transaction for dependency tracking
 	 * purposes.
 	 */
-	xfs_trans_unlock_items(tp, commit_lsn);
+	xfs_trans_unlock_items(tp, *commit_lsn);
 
 	/*
 	 * If we detected a log error earlier, finish committing
@@ -973,157 +1495,204 @@ shut_us_down:
 	 * and the items are released we can finally allow the iclog to
 	 * go to disk.
 	 */
-	error = xfs_log_release_iclog(mp, commit_iclog);
-
-	/*
-	 * If the transaction needs to be synchronous, then force the
-	 * log out now and wait for it.
-	 */
-	if (sync) {
-		if (!error) {
-			error = _xfs_log_force(mp, commit_lsn,
-				      XFS_LOG_FORCE | XFS_LOG_SYNC,
-				      log_flushed);
-		}
-		XFS_STATS_INC(xs_trans_sync);
-	} else {
-		XFS_STATS_INC(xs_trans_async);
-	}
-
-	return (error);
+	return xfs_log_release_iclog(mp, commit_iclog);
 }
 
-
 /*
- * Total up the number of log iovecs needed to commit this
- * transaction.  The transaction itself needs one for the
- * transaction header.  Ask each dirty item in turn how many
- * it needs to get the total.
+ * Walk the log items and allocate log vector structures for
+ * each item large enough to fit all the vectors they require.
+ * Note that this format differs from the old log vector format in
+ * that there is no transaction header in these log vectors.
  */
-STATIC uint
-xfs_trans_count_vecs(
+STATIC struct xfs_log_vec *
+xfs_trans_alloc_log_vecs(
 	xfs_trans_t	*tp)
 {
-	int			nvecs;
 	xfs_log_item_desc_t	*lidp;
+	struct xfs_log_vec	*lv = NULL;
+	struct xfs_log_vec	*ret_lv = NULL;
 
-	nvecs = 1;
 	lidp = xfs_trans_first_item(tp);
-	ASSERT(lidp != NULL);
 
-	/* In the non-debug case we need to start bailing out if we
-	 * didn't find a log_item here, return zero and let trans_commit
-	 * deal with it.
-	 */
-	if (lidp == NULL)
-		return 0;
+	/* Bail out if we didn't find a log item.  */
+	if (!lidp) {
+		ASSERT(0);
+		return NULL;
+	}
 
 	while (lidp != NULL) {
-		/*
-		 * Skip items which aren't dirty in this transaction.
-		 */
+		struct xfs_log_vec *new_lv;
+
+		/* Skip items which aren't dirty in this transaction. */
 		if (!(lidp->lid_flags & XFS_LID_DIRTY)) {
 			lidp = xfs_trans_next_item(tp, lidp);
 			continue;
 		}
+
+		/* Skip items that do not have any vectors for writing */
 		lidp->lid_size = IOP_SIZE(lidp->lid_item);
-		nvecs += lidp->lid_size;
+		if (!lidp->lid_size) {
+			lidp = xfs_trans_next_item(tp, lidp);
+			continue;
+		}
+
+		new_lv = kmem_zalloc(sizeof(*new_lv) +
+				lidp->lid_size * sizeof(struct xfs_log_iovec),
+				KM_SLEEP);
+
+		/* The allocated iovec region lies beyond the log vector. */
+		new_lv->lv_iovecp = (struct xfs_log_iovec *)&new_lv[1];
+		new_lv->lv_niovecs = lidp->lid_size;
+		new_lv->lv_item = lidp->lid_item;
+		if (!ret_lv)
+			ret_lv = new_lv;
+		else
+			lv->lv_next = new_lv;
+		lv = new_lv;
 		lidp = xfs_trans_next_item(tp, lidp);
 	}
 
-	return nvecs;
+	return ret_lv;
 }
 
-/*
- * Called from the trans_commit code when we notice that
- * the filesystem is in the middle of a forced shutdown.
- */
-STATIC void
-xfs_trans_uncommit(
-	xfs_trans_t	*tp,
-	uint		flags)
+static int
+xfs_trans_commit_cil(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	xfs_lsn_t		*commit_lsn,
+	int			flags)
 {
-	xfs_log_item_desc_t	*lidp;
+	struct xfs_log_vec	*log_vector;
+	int			error;
 
-	for (lidp = xfs_trans_first_item(tp);
-	     lidp != NULL;
-	     lidp = xfs_trans_next_item(tp, lidp)) {
-		/*
-		 * Unpin all but those that aren't dirty.
-		 */
-		if (lidp->lid_flags & XFS_LID_DIRTY)
-			IOP_UNPIN_REMOVE(lidp->lid_item, tp);
-	}
+	/*
+	 * Get each log item to allocate a vector structure for
+	 * the log item to to pass to the log write code. The
+	 * CIL commit code will format the vector and save it away.
+	 */
+	log_vector = xfs_trans_alloc_log_vecs(tp);
+	if (!log_vector)
+		return ENOMEM;
 
-	xfs_trans_unreserve_and_mod_sb(tp);
-	xfs_trans_unreserve_and_mod_dquots(tp);
+	error = xfs_log_commit_cil(mp, tp, log_vector, commit_lsn, flags);
+	if (error)
+		return error;
+
+	current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
 
-	xfs_trans_free_items(tp, flags);
-	xfs_trans_free_busy(tp);
+	/* xfs_trans_free_items() unlocks them first */
+	xfs_trans_free_items(tp, *commit_lsn, 0);
 	xfs_trans_free(tp);
+	return 0;
 }
 
 /*
- * Fill in the vector with pointers to data to be logged
- * by this transaction.  The transaction header takes
- * the first vector, and then each dirty item takes the
- * number of vectors it indicated it needed in xfs_trans_count_vecs().
+ * xfs_trans_commit
  *
- * As each item fills in the entries it needs, also pin the item
- * so that it cannot be flushed out until the log write completes.
+ * Commit the given transaction to the log a/synchronously.
+ *
+ * XFS disk error handling mechanism is not based on a typical
+ * transaction abort mechanism. Logically after the filesystem
+ * gets marked 'SHUTDOWN', we can't let any new transactions
+ * be durable - ie. committed to disk - because some metadata might
+ * be inconsistent. In such cases, this returns an error, and the
+ * caller may assume that all locked objects joined to the transaction
+ * have already been unlocked as if the commit had succeeded.
+ * Do not reference the transaction structure after this call.
  */
-STATIC void
-xfs_trans_fill_vecs(
-	xfs_trans_t		*tp,
-	xfs_log_iovec_t		*log_vector)
+int
+_xfs_trans_commit(
+	struct xfs_trans	*tp,
+	uint			flags,
+	int			*log_flushed)
 {
-	xfs_log_item_desc_t	*lidp;
-	xfs_log_iovec_t		*vecp;
-	uint			nitems;
+	struct xfs_mount	*mp = tp->t_mountp;
+	xfs_lsn_t		commit_lsn = -1;
+	int			error = 0;
+	int			log_flags = 0;
+	int			sync = tp->t_flags & XFS_TRANS_SYNC;
 
 	/*
-	 * Skip over the entry for the transaction header, we'll
-	 * fill that in at the end.
+	 * Determine whether this commit is releasing a permanent
+	 * log reservation or not.
 	 */
-	vecp = log_vector + 1;		/* pointer arithmetic */
+	if (flags & XFS_TRANS_RELEASE_LOG_RES) {
+		ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
+		log_flags = XFS_LOG_REL_PERM_RESERV;
+	}
 
-	nitems = 0;
-	lidp = xfs_trans_first_item(tp);
-	ASSERT(lidp != NULL);
-	while (lidp != NULL) {
-		/*
-		 * Skip items which aren't dirty in this transaction.
-		 */
-		if (!(lidp->lid_flags & XFS_LID_DIRTY)) {
-			lidp = xfs_trans_next_item(tp, lidp);
-			continue;
-		}
-		/*
-		 * The item may be marked dirty but not log anything.
-		 * This can be used to get called when a transaction
-		 * is committed.
-		 */
-		if (lidp->lid_size) {
-			nitems++;
+	/*
+	 * If there is nothing to be logged by the transaction,
+	 * then unlock all of the items associated with the
+	 * transaction and free the transaction structure.
+	 * Also make sure to return any reserved blocks to
+	 * the free pool.
+	 */
+	if (!(tp->t_flags & XFS_TRANS_DIRTY))
+		goto out_unreserve;
+
+	if (XFS_FORCED_SHUTDOWN(mp)) {
+		error = XFS_ERROR(EIO);
+		goto out_unreserve;
+	}
+
+	ASSERT(tp->t_ticket != NULL);
+
+	/*
+	 * If we need to update the superblock, then do it now.
+	 */
+	if (tp->t_flags & XFS_TRANS_SB_DIRTY)
+		xfs_trans_apply_sb_deltas(tp);
+	xfs_trans_apply_dquot_deltas(tp);
+
+	if (mp->m_flags & XFS_MOUNT_DELAYLOG)
+		error = xfs_trans_commit_cil(mp, tp, &commit_lsn, flags);
+	else
+		error = xfs_trans_commit_iclog(mp, tp, &commit_lsn, flags);
+
+	if (error == ENOMEM) {
+		xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
+		error = XFS_ERROR(EIO);
+		goto out_unreserve;
+	}
+
+	/*
+	 * If the transaction needs to be synchronous, then force the
+	 * log out now and wait for it.
+	 */
+	if (sync) {
+		if (!error) {
+			error = _xfs_log_force_lsn(mp, commit_lsn,
+				      XFS_LOG_SYNC, log_flushed);
 		}
-		IOP_FORMAT(lidp->lid_item, vecp);
-		vecp += lidp->lid_size;		/* pointer arithmetic */
-		IOP_PIN(lidp->lid_item);
-		lidp = xfs_trans_next_item(tp, lidp);
+		XFS_STATS_INC(xs_trans_sync);
+	} else {
+		XFS_STATS_INC(xs_trans_async);
 	}
 
+	return error;
+
+out_unreserve:
+	xfs_trans_unreserve_and_mod_sb(tp);
+
 	/*
-	 * Now that we've counted the number of items in this
-	 * transaction, fill in the transaction header.
+	 * It is indeed possible for the transaction to be not dirty but
+	 * the dqinfo portion to be.  All that means is that we have some
+	 * (non-persistent) quota reservations that need to be unreserved.
 	 */
-	tp->t_header.th_magic = XFS_TRANS_HEADER_MAGIC;
-	tp->t_header.th_type = tp->t_type;
-	tp->t_header.th_num_items = nitems;
-	log_vector->i_addr = (xfs_caddr_t)&tp->t_header;
-	log_vector->i_len = sizeof(xfs_trans_header_t);
-	XLOG_VEC_SET_TYPE(log_vector, XLOG_REG_TYPE_TRANSHDR);
-}
+	xfs_trans_unreserve_and_mod_dquots(tp);
+	if (tp->t_ticket) {
+		commit_lsn = xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
+		if (commit_lsn == -1 && !error)
+			error = XFS_ERROR(EIO);
+	}
+	current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+	xfs_trans_free_items(tp, NULLCOMMITLSN, error ? XFS_TRANS_ABORT : 0);
+	xfs_trans_free(tp);
 
+	XFS_STATS_INC(xs_trans_empty);
+	return error;
+}
 
 /*
  * Unlock all of the transaction's items and free the transaction.
@@ -1196,25 +1765,10 @@ xfs_trans_cancel(
 	/* mark this thread as no longer being in a transaction */
 	current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
 
-	xfs_trans_free_items(tp, flags);
-	xfs_trans_free_busy(tp);
+	xfs_trans_free_items(tp, NULLCOMMITLSN, flags);
 	xfs_trans_free(tp);
 }
 
-
-/*
- * Free the transaction structure.  If there is more clean up
- * to do when the structure is freed, add it here.
- */
-STATIC void
-xfs_trans_free(
-	xfs_trans_t	*tp)
-{
-	atomic_dec(&tp->t_mountp->m_active_trans);
-	xfs_trans_free_dqinfo(tp);
-	kmem_zone_free(xfs_trans_zone, tp);
-}
-
 /*
  * Roll from one trans in the sequence of PERMANENT transactions to
  * the next: permanent transactions are only flushed out when
@@ -1284,174 +1838,3 @@ xfs_trans_roll(
 	xfs_trans_ihold(trans, dp);
 	return 0;
 }
-
-/*
- * THIS SHOULD BE REWRITTEN TO USE xfs_trans_next_item().
- *
- * This is typically called by the LM when a transaction has been fully
- * committed to disk.  It needs to unpin the items which have
- * been logged by the transaction and update their positions
- * in the AIL if necessary.
- * This also gets called when the transactions didn't get written out
- * because of an I/O error. Abortflag & XFS_LI_ABORTED is set then.
- *
- * Call xfs_trans_chunk_committed() to process the items in
- * each chunk.
- */
-STATIC void
-xfs_trans_committed(
-	xfs_trans_t	*tp,
-	int		abortflag)
-{
-	xfs_log_item_chunk_t	*licp;
-	xfs_log_item_chunk_t	*next_licp;
-	xfs_log_busy_chunk_t	*lbcp;
-	xfs_log_busy_slot_t	*lbsp;
-	int			i;
-
-	/*
-	 * Call the transaction's completion callback if there
-	 * is one.
-	 */
-	if (tp->t_callback != NULL) {
-		tp->t_callback(tp, tp->t_callarg);
-	}
-
-	/*
-	 * Special case the chunk embedded in the transaction.
-	 */
-	licp = &(tp->t_items);
-	if (!(xfs_lic_are_all_free(licp))) {
-		xfs_trans_chunk_committed(licp, tp->t_lsn, abortflag);
-	}
-
-	/*
-	 * Process the items in each chunk in turn.
-	 */
-	licp = licp->lic_next;
-	while (licp != NULL) {
-		ASSERT(!xfs_lic_are_all_free(licp));
-		xfs_trans_chunk_committed(licp, tp->t_lsn, abortflag);
-		next_licp = licp->lic_next;
-		kmem_free(licp);
-		licp = next_licp;
-	}
-
-	/*
-	 * Clear all the per-AG busy list items listed in this transaction
-	 */
-	lbcp = &tp->t_busy;
-	while (lbcp != NULL) {
-		for (i = 0, lbsp = lbcp->lbc_busy; i < lbcp->lbc_unused; i++, lbsp++) {
-			if (!XFS_LBC_ISFREE(lbcp, i)) {
-				xfs_alloc_clear_busy(tp, lbsp->lbc_ag,
-						     lbsp->lbc_idx);
-			}
-		}
-		lbcp = lbcp->lbc_next;
-	}
-	xfs_trans_free_busy(tp);
-
-	/*
-	 * That's it for the transaction structure.  Free it.
-	 */
-	xfs_trans_free(tp);
-}
-
-/*
- * This is called to perform the commit processing for each
- * item described by the given chunk.
- *
- * The commit processing consists of unlocking items which were
- * held locked with the SYNC_UNLOCK attribute, calling the committed
- * routine of each logged item, updating the item's position in the AIL
- * if necessary, and unpinning each item.  If the committed routine
- * returns -1, then do nothing further with the item because it
- * may have been freed.
- *
- * Since items are unlocked when they are copied to the incore
- * log, it is possible for two transactions to be completing
- * and manipulating the same item simultaneously.  The AIL lock
- * will protect the lsn field of each item.  The value of this
- * field can never go backwards.
- *
- * We unpin the items after repositioning them in the AIL, because
- * otherwise they could be immediately flushed and we'd have to race
- * with the flusher trying to pull the item from the AIL as we add it.
- */
-STATIC void
-xfs_trans_chunk_committed(
-	xfs_log_item_chunk_t	*licp,
-	xfs_lsn_t		lsn,
-	int			aborted)
-{
-	xfs_log_item_desc_t	*lidp;
-	xfs_log_item_t		*lip;
-	xfs_lsn_t		item_lsn;
-	int			i;
-
-	lidp = licp->lic_descs;
-	for (i = 0; i < licp->lic_unused; i++, lidp++) {
-		struct xfs_ail		*ailp;
-
-		if (xfs_lic_isfree(licp, i)) {
-			continue;
-		}
-
-		lip = lidp->lid_item;
-		if (aborted)
-			lip->li_flags |= XFS_LI_ABORTED;
-
-		/*
-		 * Send in the ABORTED flag to the COMMITTED routine
-		 * so that it knows whether the transaction was aborted
-		 * or not.
-		 */
-		item_lsn = IOP_COMMITTED(lip, lsn);
-
-		/*
-		 * If the committed routine returns -1, make
-		 * no more references to the item.
-		 */
-		if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0) {
-			continue;
-		}
-
-		/*
-		 * If the returned lsn is greater than what it
-		 * contained before, update the location of the
-		 * item in the AIL.  If it is not, then do nothing.
-		 * Items can never move backwards in the AIL.
-		 *
-		 * While the new lsn should usually be greater, it
-		 * is possible that a later transaction completing
-		 * simultaneously with an earlier one using the
-		 * same item could complete first with a higher lsn.
-		 * This would cause the earlier transaction to fail
-		 * the test below.
-		 */
-		ailp = lip->li_ailp;
-		spin_lock(&ailp->xa_lock);
-		if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0) {
-			/*
-			 * This will set the item's lsn to item_lsn
-			 * and update the position of the item in
-			 * the AIL.
-			 *
-			 * xfs_trans_ail_update() drops the AIL lock.
-			 */
-			xfs_trans_ail_update(ailp, lip, item_lsn);
-		} else {
-			spin_unlock(&ailp->xa_lock);
-		}
-
-		/*
-		 * Now that we've repositioned the item in the AIL,
-		 * unpin it so it can be flushed. Pass information
-		 * about buffer stale state down from the log item
-		 * flags, if anyone else stales the buffer we do not
-		 * want to pay any attention to it.
-		 */
-		IOP_UNPIN(lip, lidp->lid_flags & XFS_LID_BUF_STALE);
-	}
-}
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index ca64f33c63a3..e639e8e9a2a9 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -49,6 +49,15 @@ typedef struct xfs_trans_header {
 #define	XFS_LI_DQUOT		0x123d
 #define	XFS_LI_QUOTAOFF		0x123e
 
+#define XFS_LI_TYPE_DESC \
+	{ XFS_LI_EFI,		"XFS_LI_EFI" }, \
+	{ XFS_LI_EFD,		"XFS_LI_EFD" }, \
+	{ XFS_LI_IUNLINK,	"XFS_LI_IUNLINK" }, \
+	{ XFS_LI_INODE,		"XFS_LI_INODE" }, \
+	{ XFS_LI_BUF,		"XFS_LI_BUF" }, \
+	{ XFS_LI_DQUOT,		"XFS_LI_DQUOT" }, \
+	{ XFS_LI_QUOTAOFF,	"XFS_LI_QUOTAOFF" }
+
 /*
  * Transaction types.  Used to distinguish types of buffers.
  */
@@ -97,7 +106,8 @@ typedef struct xfs_trans_header {
 #define	XFS_TRANS_GROWFSRT_FREE		39
 #define	XFS_TRANS_SWAPEXT		40
 #define	XFS_TRANS_SB_COUNT		41
-#define	XFS_TRANS_TYPE_MAX		41
+#define	XFS_TRANS_CHECKPOINT		42
+#define	XFS_TRANS_TYPE_MAX		42
 /* new transaction types need to be reflected in xfs_logprint(8) */
 
 #define XFS_TRANS_TYPES \
@@ -139,6 +149,7 @@ typedef struct xfs_trans_header {
 	{ XFS_TRANS_GROWFSRT_FREE,	"GROWFSRT_FREE" }, \
 	{ XFS_TRANS_SWAPEXT,		"SWAPEXT" }, \
 	{ XFS_TRANS_SB_COUNT,		"SB_COUNT" }, \
+	{ XFS_TRANS_CHECKPOINT,		"CHECKPOINT" }, \
 	{ XFS_TRANS_DUMMY1,		"DUMMY1" }, \
 	{ XFS_TRANS_DUMMY2,		"DUMMY2" }, \
 	{ XLOG_UNMOUNT_REC_TYPE,	"UNMOUNT" }
@@ -159,7 +170,6 @@ typedef struct xfs_log_item_desc {
 
 #define XFS_LID_DIRTY		0x1
 #define XFS_LID_PINNED		0x2
-#define XFS_LID_BUF_STALE	0x8
 
 /*
  * This structure is used to maintain a chunk list of log_item_desc
@@ -290,24 +300,6 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
 
 
 /*
- * Various log reservation values.
- * These are based on the size of the file system block
- * because that is what most transactions manipulate.
- * Each adds in an additional 128 bytes per item logged to
- * try to account for the overhead of the transaction mechanism.
- *
- * Note:
- * Most of the reservations underestimate the number of allocation
- * groups into which they could free extents in the xfs_bmap_finish()
- * call.  This is because the number in the worst case is quite high
- * and quite unusual.  In order to fix this we need to change
- * xfs_bmap_finish() to free extents in only a single AG at a time.
- * This will require changes to the EFI code as well, however, so that
- * the EFI for the extents not freed is logged again in each transaction.
- * See bug 261917.
- */
-
-/*
  * Per-extent log reservation for the allocation btree changes
  * involved in freeing or allocating an extent.
  * 2 trees * (2 blocks/level * max depth - 1) * block size
@@ -331,429 +323,36 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
 	(XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK) + \
 	 XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1)
 
-/*
- * In a write transaction we can allocate a maximum of 2
- * extents.  This gives:
- *    the inode getting the new extents: inode size
- *    the inode's bmap btree: max depth * block size
- *    the agfs of the ags from which the extents are allocated: 2 * sector
- *    the superblock free block counter: sector size
- *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
- * And the bmap_finish transaction can free bmap blocks in a join:
- *    the agfs of the ags containing the blocks: 2 * sector size
- *    the agfls of the ags containing the blocks: 2 * sector size
- *    the super block free block counter: sector size
- *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
- */
-#define XFS_CALC_WRITE_LOG_RES(mp) \
-	(MAX( \
-	 ((mp)->m_sb.sb_inodesize + \
-	  XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) + \
-	  (2 * (mp)->m_sb.sb_sectsize) + \
-	  (mp)->m_sb.sb_sectsize + \
-	  XFS_ALLOCFREE_LOG_RES(mp, 2) + \
-	  (128 * (4 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + XFS_ALLOCFREE_LOG_COUNT(mp, 2)))),\
-	 ((2 * (mp)->m_sb.sb_sectsize) + \
-	  (2 * (mp)->m_sb.sb_sectsize) + \
-	  (mp)->m_sb.sb_sectsize + \
-	  XFS_ALLOCFREE_LOG_RES(mp, 2) + \
-	  (128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))))))
 
 #define	XFS_WRITE_LOG_RES(mp)	((mp)->m_reservations.tr_write)
-
-/*
- * In truncating a file we free up to two extents at once.  We can modify:
- *    the inode being truncated: inode size
- *    the inode's bmap btree: (max depth + 1) * block size
- * And the bmap_finish transaction can free the blocks and bmap blocks:
- *    the agf for each of the ags: 4 * sector size
- *    the agfl for each of the ags: 4 * sector size
- *    the super block to reflect the freed blocks: sector size
- *    worst case split in allocation btrees per extent assuming 4 extents:
- *		4 exts * 2 trees * (2 * max depth - 1) * block size
- *    the inode btree: max depth * blocksize
- *    the allocation btrees: 2 trees * (max depth - 1) * block size
- */
-#define	XFS_CALC_ITRUNCATE_LOG_RES(mp) \
-	(MAX( \
-	 ((mp)->m_sb.sb_inodesize + \
-	  XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1) + \
-	  (128 * (2 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)))), \
-	 ((4 * (mp)->m_sb.sb_sectsize) + \
-	  (4 * (mp)->m_sb.sb_sectsize) + \
-	  (mp)->m_sb.sb_sectsize + \
-	  XFS_ALLOCFREE_LOG_RES(mp, 4) + \
-	  (128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4))) + \
-	  (128 * 5) + \
-	  XFS_ALLOCFREE_LOG_RES(mp, 1) + \
-	   (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \
-	    XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
-
 #define	XFS_ITRUNCATE_LOG_RES(mp)   ((mp)->m_reservations.tr_itruncate)
-
-/*
- * In renaming a files we can modify:
- *    the four inodes involved: 4 * inode size
- *    the two directory btrees: 2 * (max depth + v2) * dir block size
- *    the two directory bmap btrees: 2 * max depth * block size
- * And the bmap_finish transaction can free dir and bmap blocks (two sets
- *	of bmap blocks) giving:
- *    the agf for the ags in which the blocks live: 3 * sector size
- *    the agfl for the ags in which the blocks live: 3 * sector size
- *    the superblock for the free block count: sector size
- *    the allocation btrees: 3 exts * 2 trees * (2 * max depth - 1) * block size
- */
-#define	XFS_CALC_RENAME_LOG_RES(mp) \
-	(MAX( \
-	 ((4 * (mp)->m_sb.sb_inodesize) + \
-	  (2 * XFS_DIROP_LOG_RES(mp)) + \
-	  (128 * (4 + 2 * XFS_DIROP_LOG_COUNT(mp)))), \
-	 ((3 * (mp)->m_sb.sb_sectsize) + \
-	  (3 * (mp)->m_sb.sb_sectsize) + \
-	  (mp)->m_sb.sb_sectsize + \
-	  XFS_ALLOCFREE_LOG_RES(mp, 3) + \
-	  (128 * (7 + XFS_ALLOCFREE_LOG_COUNT(mp, 3))))))
-
 #define	XFS_RENAME_LOG_RES(mp)	((mp)->m_reservations.tr_rename)
-
-/*
- * For creating a link to an inode:
- *    the parent directory inode: inode size
- *    the linked inode: inode size
- *    the directory btree could split: (max depth + v2) * dir block size
- *    the directory bmap btree could join or split: (max depth + v2) * blocksize
- * And the bmap_finish transaction can free some bmap blocks giving:
- *    the agf for the ag in which the blocks live: sector size
- *    the agfl for the ag in which the blocks live: sector size
- *    the superblock for the free block count: sector size
- *    the allocation btrees: 2 trees * (2 * max depth - 1) * block size
- */
-#define	XFS_CALC_LINK_LOG_RES(mp) \
-	(MAX( \
-	 ((mp)->m_sb.sb_inodesize + \
-	  (mp)->m_sb.sb_inodesize + \
-	  XFS_DIROP_LOG_RES(mp) + \
-	  (128 * (2 + XFS_DIROP_LOG_COUNT(mp)))), \
-	 ((mp)->m_sb.sb_sectsize + \
-	  (mp)->m_sb.sb_sectsize + \
-	  (mp)->m_sb.sb_sectsize + \
-	  XFS_ALLOCFREE_LOG_RES(mp, 1) + \
-	  (128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
-
 #define	XFS_LINK_LOG_RES(mp)	((mp)->m_reservations.tr_link)
-
-/*
- * For removing a directory entry we can modify:
- *    the parent directory inode: inode size
- *    the removed inode: inode size
- *    the directory btree could join: (max depth + v2) * dir block size
- *    the directory bmap btree could join or split: (max depth + v2) * blocksize
- * And the bmap_finish transaction can free the dir and bmap blocks giving:
- *    the agf for the ag in which the blocks live: 2 * sector size
- *    the agfl for the ag in which the blocks live: 2 * sector size
- *    the superblock for the free block count: sector size
- *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
- */
-#define	XFS_CALC_REMOVE_LOG_RES(mp)	\
-	(MAX( \
-	 ((mp)->m_sb.sb_inodesize + \
-	  (mp)->m_sb.sb_inodesize + \
-	  XFS_DIROP_LOG_RES(mp) + \
-	  (128 * (2 + XFS_DIROP_LOG_COUNT(mp)))), \
-	 ((2 * (mp)->m_sb.sb_sectsize) + \
-	  (2 * (mp)->m_sb.sb_sectsize) + \
-	  (mp)->m_sb.sb_sectsize + \
-	  XFS_ALLOCFREE_LOG_RES(mp, 2) + \
-	  (128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))))))
-
 #define	XFS_REMOVE_LOG_RES(mp)	((mp)->m_reservations.tr_remove)
-
-/*
- * For symlink we can modify:
- *    the parent directory inode: inode size
- *    the new inode: inode size
- *    the inode btree entry: 1 block
- *    the directory btree: (max depth + v2) * dir block size
- *    the directory inode's bmap btree: (max depth + v2) * block size
- *    the blocks for the symlink: 1 kB
- * Or in the first xact we allocate some inodes giving:
- *    the agi and agf of the ag getting the new inodes: 2 * sectorsize
- *    the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
- *    the inode btree: max depth * blocksize
- *    the allocation btrees: 2 trees * (2 * max depth - 1) * block size
- */
-#define	XFS_CALC_SYMLINK_LOG_RES(mp)		\
-	(MAX( \
-	 ((mp)->m_sb.sb_inodesize + \
-	  (mp)->m_sb.sb_inodesize + \
-	  XFS_FSB_TO_B(mp, 1) + \
-	  XFS_DIROP_LOG_RES(mp) + \
-	  1024 + \
-	  (128 * (4 + XFS_DIROP_LOG_COUNT(mp)))), \
-	 (2 * (mp)->m_sb.sb_sectsize + \
-	  XFS_FSB_TO_B((mp), XFS_IALLOC_BLOCKS((mp))) + \
-	  XFS_FSB_TO_B((mp), (mp)->m_in_maxlevels) + \
-	  XFS_ALLOCFREE_LOG_RES(mp, 1) + \
-	  (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \
-	   XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
-
 #define	XFS_SYMLINK_LOG_RES(mp)	((mp)->m_reservations.tr_symlink)
-
-/*
- * For create we can modify:
- *    the parent directory inode: inode size
- *    the new inode: inode size
- *    the inode btree entry: block size
- *    the superblock for the nlink flag: sector size
- *    the directory btree: (max depth + v2) * dir block size
- *    the directory inode's bmap btree: (max depth + v2) * block size
- * Or in the first xact we allocate some inodes giving:
- *    the agi and agf of the ag getting the new inodes: 2 * sectorsize
- *    the superblock for the nlink flag: sector size
- *    the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
- *    the inode btree: max depth * blocksize
- *    the allocation btrees: 2 trees * (max depth - 1) * block size
- */
-#define	XFS_CALC_CREATE_LOG_RES(mp)		\
-	(MAX( \
-	 ((mp)->m_sb.sb_inodesize + \
-	  (mp)->m_sb.sb_inodesize + \
-	  (mp)->m_sb.sb_sectsize + \
-	  XFS_FSB_TO_B(mp, 1) + \
-	  XFS_DIROP_LOG_RES(mp) + \
-	  (128 * (3 + XFS_DIROP_LOG_COUNT(mp)))), \
-	 (3 * (mp)->m_sb.sb_sectsize + \
-	  XFS_FSB_TO_B((mp), XFS_IALLOC_BLOCKS((mp))) + \
-	  XFS_FSB_TO_B((mp), (mp)->m_in_maxlevels) + \
-	  XFS_ALLOCFREE_LOG_RES(mp, 1) + \
-	  (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \
-	   XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
-
 #define	XFS_CREATE_LOG_RES(mp)	((mp)->m_reservations.tr_create)
-
-/*
- * Making a new directory is the same as creating a new file.
- */
-#define	XFS_CALC_MKDIR_LOG_RES(mp)	XFS_CALC_CREATE_LOG_RES(mp)
-
 #define	XFS_MKDIR_LOG_RES(mp)	((mp)->m_reservations.tr_mkdir)
-
-/*
- * In freeing an inode we can modify:
- *    the inode being freed: inode size
- *    the super block free inode counter: sector size
- *    the agi hash list and counters: sector size
- *    the inode btree entry: block size
- *    the on disk inode before ours in the agi hash list: inode cluster size
- *    the inode btree: max depth * blocksize
- *    the allocation btrees: 2 trees * (max depth - 1) * block size
- */
-#define	XFS_CALC_IFREE_LOG_RES(mp) \
-	((mp)->m_sb.sb_inodesize + \
-	 (mp)->m_sb.sb_sectsize + \
-	 (mp)->m_sb.sb_sectsize + \
-	 XFS_FSB_TO_B((mp), 1) + \
-	 MAX((__uint16_t)XFS_FSB_TO_B((mp), 1), XFS_INODE_CLUSTER_SIZE(mp)) + \
-	 (128 * 5) + \
-	  XFS_ALLOCFREE_LOG_RES(mp, 1) + \
-	  (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \
-	   XFS_ALLOCFREE_LOG_COUNT(mp, 1))))
-
-
 #define	XFS_IFREE_LOG_RES(mp)	((mp)->m_reservations.tr_ifree)
-
-/*
- * When only changing the inode we log the inode and possibly the superblock
- * We also add a bit of slop for the transaction stuff.
- */
-#define	XFS_CALC_ICHANGE_LOG_RES(mp)	((mp)->m_sb.sb_inodesize + \
-					 (mp)->m_sb.sb_sectsize + 512)
-
 #define	XFS_ICHANGE_LOG_RES(mp)	((mp)->m_reservations.tr_ichange)
-
-/*
- * Growing the data section of the filesystem.
- *	superblock
- *	agi and agf
- *	allocation btrees
- */
-#define	XFS_CALC_GROWDATA_LOG_RES(mp) \
-	((mp)->m_sb.sb_sectsize * 3 + \
-	 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
-	 (128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1))))
-
 #define	XFS_GROWDATA_LOG_RES(mp)    ((mp)->m_reservations.tr_growdata)
-
-/*
- * Growing the rt section of the filesystem.
- * In the first set of transactions (ALLOC) we allocate space to the
- * bitmap or summary files.
- *	superblock: sector size
- *	agf of the ag from which the extent is allocated: sector size
- *	bmap btree for bitmap/summary inode: max depth * blocksize
- *	bitmap/summary inode: inode size
- *	allocation btrees for 1 block alloc: 2 * (2 * maxdepth - 1) * blocksize
- */
-#define	XFS_CALC_GROWRTALLOC_LOG_RES(mp) \
-	(2 * (mp)->m_sb.sb_sectsize + \
-	 XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) + \
-	 (mp)->m_sb.sb_inodesize + \
-	 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
-	 (128 * \
-	  (3 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + \
-	   XFS_ALLOCFREE_LOG_COUNT(mp, 1))))
-
 #define	XFS_GROWRTALLOC_LOG_RES(mp)	((mp)->m_reservations.tr_growrtalloc)
-
-/*
- * Growing the rt section of the filesystem.
- * In the second set of transactions (ZERO) we zero the new metadata blocks.
- *	one bitmap/summary block: blocksize
- */
-#define	XFS_CALC_GROWRTZERO_LOG_RES(mp) \
-	((mp)->m_sb.sb_blocksize + 128)
-
 #define	XFS_GROWRTZERO_LOG_RES(mp)	((mp)->m_reservations.tr_growrtzero)
-
-/*
- * Growing the rt section of the filesystem.
- * In the third set of transactions (FREE) we update metadata without
- * allocating any new blocks.
- *	superblock: sector size
- *	bitmap inode: inode size
- *	summary inode: inode size
- *	one bitmap block: blocksize
- *	summary blocks: new summary size
- */
-#define	XFS_CALC_GROWRTFREE_LOG_RES(mp) \
-	((mp)->m_sb.sb_sectsize + \
-	 2 * (mp)->m_sb.sb_inodesize + \
-	 (mp)->m_sb.sb_blocksize + \
-	 (mp)->m_rsumsize + \
-	 (128 * 5))
-
 #define	XFS_GROWRTFREE_LOG_RES(mp)	((mp)->m_reservations.tr_growrtfree)
-
-/*
- * Logging the inode modification timestamp on a synchronous write.
- *	inode
- */
-#define	XFS_CALC_SWRITE_LOG_RES(mp) \
-	((mp)->m_sb.sb_inodesize + 128)
-
 #define	XFS_SWRITE_LOG_RES(mp)	((mp)->m_reservations.tr_swrite)
-
 /*
  * Logging the inode timestamps on an fsync -- same as SWRITE
  * as long as SWRITE logs the entire inode core
  */
 #define XFS_FSYNC_TS_LOG_RES(mp)        ((mp)->m_reservations.tr_swrite)
-
-/*
- * Logging the inode mode bits when writing a setuid/setgid file
- *	inode
- */
-#define	XFS_CALC_WRITEID_LOG_RES(mp) \
-	((mp)->m_sb.sb_inodesize + 128)
-
 #define	XFS_WRITEID_LOG_RES(mp)	((mp)->m_reservations.tr_swrite)
-
-/*
- * Converting the inode from non-attributed to attributed.
- *	the inode being converted: inode size
- *	agf block and superblock (for block allocation)
- *	the new block (directory sized)
- *	bmap blocks for the new directory block
- *	allocation btrees
- */
-#define	XFS_CALC_ADDAFORK_LOG_RES(mp)	\
-	((mp)->m_sb.sb_inodesize + \
-	 (mp)->m_sb.sb_sectsize * 2 + \
-	 (mp)->m_dirblksize + \
-	 XFS_FSB_TO_B(mp, (XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1)) + \
-	 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
-	 (128 * (4 + (XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1) + \
-		 XFS_ALLOCFREE_LOG_COUNT(mp, 1))))
-
 #define	XFS_ADDAFORK_LOG_RES(mp)	((mp)->m_reservations.tr_addafork)
-
-/*
- * Removing the attribute fork of a file
- *    the inode being truncated: inode size
- *    the inode's bmap btree: max depth * block size
- * And the bmap_finish transaction can free the blocks and bmap blocks:
- *    the agf for each of the ags: 4 * sector size
- *    the agfl for each of the ags: 4 * sector size
- *    the super block to reflect the freed blocks: sector size
- *    worst case split in allocation btrees per extent assuming 4 extents:
- *		4 exts * 2 trees * (2 * max depth - 1) * block size
- */
-#define	XFS_CALC_ATTRINVAL_LOG_RES(mp)	\
-	(MAX( \
-	 ((mp)->m_sb.sb_inodesize + \
-	  XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + \
-	  (128 * (1 + XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)))), \
-	 ((4 * (mp)->m_sb.sb_sectsize) + \
-	  (4 * (mp)->m_sb.sb_sectsize) + \
-	  (mp)->m_sb.sb_sectsize + \
-	  XFS_ALLOCFREE_LOG_RES(mp, 4) + \
-	  (128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4))))))
-
 #define	XFS_ATTRINVAL_LOG_RES(mp)	((mp)->m_reservations.tr_attrinval)
-
-/*
- * Setting an attribute.
- *	the inode getting the attribute
- *	the superblock for allocations
- *	the agfs extents are allocated from
- *	the attribute btree * max depth
- *	the inode allocation btree
- * Since attribute transaction space is dependent on the size of the attribute,
- * the calculation is done partially at mount time and partially at runtime.
- */
-#define	XFS_CALC_ATTRSET_LOG_RES(mp)	\
-	((mp)->m_sb.sb_inodesize + \
-	 (mp)->m_sb.sb_sectsize + \
-	  XFS_FSB_TO_B((mp), XFS_DA_NODE_MAXDEPTH) + \
-	  (128 * (2 + XFS_DA_NODE_MAXDEPTH)))
-
 #define	XFS_ATTRSET_LOG_RES(mp, ext)	\
 	((mp)->m_reservations.tr_attrset + \
 	 (ext * (mp)->m_sb.sb_sectsize) + \
 	 (ext * XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))) + \
 	 (128 * (ext + (ext * XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)))))
-
-/*
- * Removing an attribute.
- *    the inode: inode size
- *    the attribute btree could join: max depth * block size
- *    the inode bmap btree could join or split: max depth * block size
- * And the bmap_finish transaction can free the attr blocks freed giving:
- *    the agf for the ag in which the blocks live: 2 * sector size
- *    the agfl for the ag in which the blocks live: 2 * sector size
- *    the superblock for the free block count: sector size
- *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
- */
-#define	XFS_CALC_ATTRRM_LOG_RES(mp)	\
-	(MAX( \
-	  ((mp)->m_sb.sb_inodesize + \
-	  XFS_FSB_TO_B((mp), XFS_DA_NODE_MAXDEPTH) + \
-	  XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + \
-	  (128 * (1 + XFS_DA_NODE_MAXDEPTH + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)))), \
-	 ((2 * (mp)->m_sb.sb_sectsize) + \
-	  (2 * (mp)->m_sb.sb_sectsize) + \
-	  (mp)->m_sb.sb_sectsize + \
-	  XFS_ALLOCFREE_LOG_RES(mp, 2) + \
-	  (128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))))))
-
 #define	XFS_ATTRRM_LOG_RES(mp)	((mp)->m_reservations.tr_attrrm)
-
-/*
- * Clearing a bad agino number in an agi hash bucket.
- */
-#define	XFS_CALC_CLEAR_AGI_BUCKET_LOG_RES(mp) \
-	((mp)->m_sb.sb_sectsize + 128)
-
 #define	XFS_CLEAR_AGI_BUCKET_LOG_RES(mp)  ((mp)->m_reservations.tr_clearagi)
 
 
@@ -805,6 +404,7 @@ struct xfs_log_item_desc;
 struct xfs_mount;
 struct xfs_trans;
 struct xfs_dquot_acct;
+struct xfs_busy_extent;
 
 typedef struct xfs_log_item {
 	struct list_head		li_ail;		/* AIL pointers */
@@ -820,6 +420,11 @@ typedef struct xfs_log_item {
 							/* buffer item iodone */
 							/* callback func */
 	struct xfs_item_ops		*li_ops;	/* function list */
+
+	/* delayed logging */
+	struct list_head		li_cil;		/* CIL pointers */
+	struct xfs_log_vec		*li_lv;		/* active log vector */
+	xfs_lsn_t			li_seq;		/* CIL commit seq */
 } xfs_log_item_t;
 
 #define	XFS_LI_IN_AIL	0x1
@@ -833,7 +438,7 @@ typedef struct xfs_item_ops {
 	uint (*iop_size)(xfs_log_item_t *);
 	void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *);
 	void (*iop_pin)(xfs_log_item_t *);
-	void (*iop_unpin)(xfs_log_item_t *, int);
+	void (*iop_unpin)(xfs_log_item_t *);
 	void (*iop_unpin_remove)(xfs_log_item_t *, struct xfs_trans *);
 	uint (*iop_trylock)(xfs_log_item_t *);
 	void (*iop_unlock)(xfs_log_item_t *);
@@ -846,7 +451,7 @@ typedef struct xfs_item_ops {
 #define IOP_SIZE(ip)		(*(ip)->li_ops->iop_size)(ip)
 #define IOP_FORMAT(ip,vp)	(*(ip)->li_ops->iop_format)(ip, vp)
 #define IOP_PIN(ip)		(*(ip)->li_ops->iop_pin)(ip)
-#define IOP_UNPIN(ip, flags)	(*(ip)->li_ops->iop_unpin)(ip, flags)
+#define IOP_UNPIN(ip)		(*(ip)->li_ops->iop_unpin)(ip)
 #define IOP_UNPIN_REMOVE(ip,tp) (*(ip)->li_ops->iop_unpin_remove)(ip, tp)
 #define IOP_TRYLOCK(ip)		(*(ip)->li_ops->iop_trylock)(ip)
 #define IOP_UNLOCK(ip)		(*(ip)->li_ops->iop_unlock)(ip)
@@ -861,36 +466,7 @@ typedef struct xfs_item_ops {
 #define	XFS_ITEM_SUCCESS	0
 #define	XFS_ITEM_PINNED		1
 #define	XFS_ITEM_LOCKED		2
-#define	XFS_ITEM_FLUSHING	3
-#define XFS_ITEM_PUSHBUF	4
-
-/*
- * This structure is used to maintain a list of block ranges that have been
- * freed in the transaction.  The ranges are listed in the perag[] busy list
- * between when they're freed and the transaction is committed to disk.
- */
-
-typedef struct xfs_log_busy_slot {
-	xfs_agnumber_t		lbc_ag;
-	ushort			lbc_idx;	/* index in perag.busy[] */
-} xfs_log_busy_slot_t;
-
-#define XFS_LBC_NUM_SLOTS	31
-typedef struct xfs_log_busy_chunk {
-	struct xfs_log_busy_chunk	*lbc_next;
-	uint				lbc_free;	/* free slots bitmask */
-	ushort				lbc_unused;	/* first unused */
-	xfs_log_busy_slot_t		lbc_busy[XFS_LBC_NUM_SLOTS];
-} xfs_log_busy_chunk_t;
-
-#define	XFS_LBC_MAX_SLOT	(XFS_LBC_NUM_SLOTS - 1)
-#define	XFS_LBC_FREEMASK	((1U << XFS_LBC_NUM_SLOTS) - 1)
-
-#define	XFS_LBC_INIT(cp)	((cp)->lbc_free = XFS_LBC_FREEMASK)
-#define	XFS_LBC_CLAIM(cp, slot)	((cp)->lbc_free &= ~(1 << (slot)))
-#define	XFS_LBC_SLOT(cp, slot)	(&((cp)->lbc_busy[(slot)]))
-#define	XFS_LBC_VACANCY(cp)	(((cp)->lbc_free) & XFS_LBC_FREEMASK)
-#define	XFS_LBC_ISFREE(cp, slot) ((cp)->lbc_free & (1 << (slot)))
+#define XFS_ITEM_PUSHBUF	3
 
 /*
  * This is the type of function which can be given to xfs_trans_callback()
@@ -911,7 +487,7 @@ typedef struct xfs_trans {
 	unsigned int		t_blk_res_used;	/* # of resvd blocks used */
 	unsigned int		t_rtx_res;	/* # of rt extents resvd */
 	unsigned int		t_rtx_res_used;	/* # of resvd rt extents used */
-	xfs_log_ticket_t	t_ticket;	/* log mgr ticket */
+	struct xlog_ticket	*t_ticket;	/* log mgr ticket */
 	xfs_lsn_t		t_lsn;		/* log seq num of start of
 						 * transaction. */
 	xfs_lsn_t		t_commit_lsn;	/* log seq num of end of
@@ -943,8 +519,7 @@ typedef struct xfs_trans {
 	unsigned int		t_items_free;	/* log item descs free */
 	xfs_log_item_chunk_t	t_items;	/* first log item desc chunk */
 	xfs_trans_header_t	t_header;	/* header for in-log trans */
-	unsigned int		t_busy_free;	/* busy descs free */
-	xfs_log_busy_chunk_t	t_busy;		/* busy/async free blocks */
+	struct list_head	t_busy;		/* list of busy extents */
 	unsigned long		t_pflags;	/* saved process flags state */
 } xfs_trans_t;
 
@@ -1018,9 +593,6 @@ int		_xfs_trans_commit(xfs_trans_t *,
 void		xfs_trans_cancel(xfs_trans_t *, int);
 int		xfs_trans_ail_init(struct xfs_mount *);
 void		xfs_trans_ail_destroy(struct xfs_mount *);
-xfs_log_busy_slot_t *xfs_trans_add_busy(xfs_trans_t *tp,
-					xfs_agnumber_t ag,
-					xfs_extlen_t idx);
 
 extern kmem_zone_t	*xfs_trans_zone;
 
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 2ffc570679be..e799824f7245 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -237,14 +237,15 @@ out:
 }
 
 /*
- * Function that does the work of pushing on the AIL
+ * xfsaild_push does the work of pushing on the AIL.  Returning a timeout of
+ * zero indicates that the caller should sleep until woken.
  */
 long
 xfsaild_push(
 	struct xfs_ail	*ailp,
 	xfs_lsn_t	*last_lsn)
 {
-	long		tout = 1000; /* milliseconds */
+	long		tout = 0;
 	xfs_lsn_t	last_pushed_lsn = *last_lsn;
 	xfs_lsn_t	target =  ailp->xa_target;
 	xfs_lsn_t	lsn;
@@ -252,6 +253,7 @@ xfsaild_push(
 	int		flush_log, count, stuck;
 	xfs_mount_t	*mp = ailp->xa_mount;
 	struct xfs_ail_cursor	*cur = &ailp->xa_cursors;
+	int		push_xfsbufd = 0;
 
 	spin_lock(&ailp->xa_lock);
 	xfs_trans_ail_cursor_init(ailp, cur);
@@ -262,7 +264,7 @@ xfsaild_push(
 		 */
 		xfs_trans_ail_cursor_done(ailp, cur);
 		spin_unlock(&ailp->xa_lock);
-		last_pushed_lsn = 0;
+		*last_lsn = 0;
 		return tout;
 	}
 
@@ -279,7 +281,6 @@ xfsaild_push(
 	 * prevents use from spinning when we can't do anything or there is
 	 * lots of contention on the AIL lists.
 	 */
-	tout = 10;
 	lsn = lip->li_lsn;
 	flush_log = stuck = count = 0;
 	while ((XFS_LSN_CMP(lip->li_lsn, target) < 0)) {
@@ -308,6 +309,7 @@ xfsaild_push(
 			XFS_STATS_INC(xs_push_ail_pushbuf);
 			IOP_PUSHBUF(lip);
 			last_pushed_lsn = lsn;
+			push_xfsbufd = 1;
 			break;
 
 		case XFS_ITEM_PINNED:
@@ -322,12 +324,6 @@ xfsaild_push(
 			stuck++;
 			break;
 
-		case XFS_ITEM_FLUSHING:
-			XFS_STATS_INC(xs_push_ail_flushing);
-			last_pushed_lsn = lsn;
-			stuck++;
-			break;
-
 		default:
 			ASSERT(0);
 			break;
@@ -371,19 +367,24 @@ xfsaild_push(
 		 * move forward in the AIL.
 		 */
 		XFS_STATS_INC(xs_push_ail_flush);
-		xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
+		xfs_log_force(mp, 0);
+	}
+
+	if (push_xfsbufd) {
+		/* we've got delayed write buffers to flush */
+		wake_up_process(mp->m_ddev_targp->bt_task);
 	}
 
 	if (!count) {
 		/* We're past our target or empty, so idle */
-		tout = 1000;
+		last_pushed_lsn = 0;
 	} else if (XFS_LSN_CMP(lsn, target) >= 0) {
 		/*
 		 * We reached the target so wait a bit longer for I/O to
 		 * complete and remove pushed items from the AIL before we
 		 * start the next scan from the start of the AIL.
 		 */
-		tout += 20;
+		tout = 50;
 		last_pushed_lsn = 0;
 	} else if ((stuck * 100) / count > 90) {
 		/*
@@ -395,11 +396,14 @@ xfsaild_push(
 		 * Backoff a bit more to allow some I/O to complete before
 		 * continuing from where we were.
 		 */
-		tout += 10;
+		tout = 20;
+	} else {
+		/* more to do, but wait a short while before continuing */
+		tout = 10;
 	}
 	*last_lsn = last_pushed_lsn;
 	return tout;
-}	/* xfsaild_push */
+}
 
 
 /*
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 49130628d5ef..63d81a22f4fd 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -40,12 +40,111 @@
 #include "xfs_rw.h"
 #include "xfs_trace.h"
 
+/*
+ * Check to see if a buffer matching the given parameters is already
+ * a part of the given transaction.
+ */
+STATIC struct xfs_buf *
+xfs_trans_buf_item_match(
+	struct xfs_trans	*tp,
+	struct xfs_buftarg	*target,
+	xfs_daddr_t		blkno,
+	int			len)
+{
+	xfs_log_item_chunk_t	*licp;
+	xfs_log_item_desc_t	*lidp;
+	xfs_buf_log_item_t	*blip;
+	int			i;
+
+	len = BBTOB(len);
+	for (licp = &tp->t_items; licp != NULL; licp = licp->lic_next) {
+		if (xfs_lic_are_all_free(licp)) {
+			ASSERT(licp == &tp->t_items);
+			ASSERT(licp->lic_next == NULL);
+			return NULL;
+		}
+
+		for (i = 0; i < licp->lic_unused; i++) {
+			/*
+			 * Skip unoccupied slots.
+			 */
+			if (xfs_lic_isfree(licp, i))
+				continue;
+
+			lidp = xfs_lic_slot(licp, i);
+			blip = (xfs_buf_log_item_t *)lidp->lid_item;
+			if (blip->bli_item.li_type != XFS_LI_BUF)
+				continue;
 
-STATIC xfs_buf_t *xfs_trans_buf_item_match(xfs_trans_t *, xfs_buftarg_t *,
-		xfs_daddr_t, int);
-STATIC xfs_buf_t *xfs_trans_buf_item_match_all(xfs_trans_t *, xfs_buftarg_t *,
-		xfs_daddr_t, int);
+			if (XFS_BUF_TARGET(blip->bli_buf) == target &&
+			    XFS_BUF_ADDR(blip->bli_buf) == blkno &&
+			    XFS_BUF_COUNT(blip->bli_buf) == len)
+				return blip->bli_buf;
+		}
+	}
 
+	return NULL;
+}
+
+/*
+ * Add the locked buffer to the transaction.
+ *
+ * The buffer must be locked, and it cannot be associated with any
+ * transaction.
+ *
+ * If the buffer does not yet have a buf log item associated with it,
+ * then allocate one for it.  Then add the buf item to the transaction.
+ */
+STATIC void
+_xfs_trans_bjoin(
+	struct xfs_trans	*tp,
+	struct xfs_buf		*bp,
+	int			reset_recur)
+{
+	struct xfs_buf_log_item	*bip;
+
+	ASSERT(XFS_BUF_ISBUSY(bp));
+	ASSERT(XFS_BUF_FSPRIVATE2(bp, void *) == NULL);
+
+	/*
+	 * The xfs_buf_log_item pointer is stored in b_fsprivate.  If
+	 * it doesn't have one yet, then allocate one and initialize it.
+	 * The checks to see if one is there are in xfs_buf_item_init().
+	 */
+	xfs_buf_item_init(bp, tp->t_mountp);
+	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
+	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
+	ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL));
+	ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
+	if (reset_recur)
+		bip->bli_recur = 0;
+
+	/*
+	 * Take a reference for this transaction on the buf item.
+	 */
+	atomic_inc(&bip->bli_refcount);
+
+	/*
+	 * Get a log_item_desc to point at the new item.
+	 */
+	(void) xfs_trans_add_item(tp, (xfs_log_item_t *)bip);
+
+	/*
+	 * Initialize b_fsprivate2 so we can find it with incore_match()
+	 * in xfs_trans_get_buf() and friends above.
+	 */
+	XFS_BUF_SET_FSPRIVATE2(bp, tp);
+
+}
+
+void
+xfs_trans_bjoin(
+	struct xfs_trans	*tp,
+	struct xfs_buf		*bp)
+{
+	_xfs_trans_bjoin(tp, bp, 0);
+	trace_xfs_trans_bjoin(bp->b_fspriv);
+}
 
 /*
  * Get and lock the buffer for the caller if it is not already
@@ -53,14 +152,6 @@ STATIC xfs_buf_t *xfs_trans_buf_item_match_all(xfs_trans_t *, xfs_buftarg_t *,
  * within the transaction, just increment its lock recursion count
  * and return a pointer to it.
  *
- * Use the fast path function xfs_trans_buf_item_match() or the buffer
- * cache routine incore_match() to find the buffer
- * if it is already owned by this transaction.
- *
- * If we don't already own the buffer, use get_buf() to get it.
- * If it doesn't yet have an associated xfs_buf_log_item structure,
- * then allocate one and add the item to this transaction.
- *
  * If the transaction pointer is NULL, make this just a normal
  * get_buf() call.
  */
@@ -75,13 +166,14 @@ xfs_trans_get_buf(xfs_trans_t	*tp,
 	xfs_buf_log_item_t	*bip;
 
 	if (flags == 0)
-		flags = XFS_BUF_LOCK | XFS_BUF_MAPPED;
+		flags = XBF_LOCK | XBF_MAPPED;
 
 	/*
 	 * Default to a normal get_buf() call if the tp is NULL.
 	 */
 	if (tp == NULL)
-		return xfs_buf_get(target_dev, blkno, len, flags | BUF_BUSY);
+		return xfs_buf_get(target_dev, blkno, len,
+				   flags | XBF_DONT_BLOCK);
 
 	/*
 	 * If we find the buffer in the cache with this transaction
@@ -89,11 +181,7 @@ xfs_trans_get_buf(xfs_trans_t	*tp,
 	 * have it locked.  In this case we just increment the lock
 	 * recursion count and return the buffer to the caller.
 	 */
-	if (tp->t_items.lic_next == NULL) {
-		bp = xfs_trans_buf_item_match(tp, target_dev, blkno, len);
-	} else {
-		bp  = xfs_trans_buf_item_match_all(tp, target_dev, blkno, len);
-	}
+	bp = xfs_trans_buf_item_match(tp, target_dev, blkno, len);
 	if (bp != NULL) {
 		ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
 		if (XFS_FORCED_SHUTDOWN(tp->t_mountp))
@@ -117,54 +205,22 @@ xfs_trans_get_buf(xfs_trans_t	*tp,
 	}
 
 	/*
-	 * We always specify the BUF_BUSY flag within a transaction so
-	 * that get_buf does not try to push out a delayed write buffer
+	 * We always specify the XBF_DONT_BLOCK flag within a transaction
+	 * so that get_buf does not try to push out a delayed write buffer
 	 * which might cause another transaction to take place (if the
 	 * buffer was delayed alloc).  Such recursive transactions can
 	 * easily deadlock with our current transaction as well as cause
 	 * us to run out of stack space.
 	 */
-	bp = xfs_buf_get(target_dev, blkno, len, flags | BUF_BUSY);
+	bp = xfs_buf_get(target_dev, blkno, len, flags | XBF_DONT_BLOCK);
 	if (bp == NULL) {
 		return NULL;
 	}
 
 	ASSERT(!XFS_BUF_GETERROR(bp));
 
-	/*
-	 * The xfs_buf_log_item pointer is stored in b_fsprivate.  If
-	 * it doesn't have one yet, then allocate one and initialize it.
-	 * The checks to see if one is there are in xfs_buf_item_init().
-	 */
-	xfs_buf_item_init(bp, tp->t_mountp);
-
-	/*
-	 * Set the recursion count for the buffer within this transaction
-	 * to 0.
-	 */
-	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*);
-	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
-	ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
-	ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
-	bip->bli_recur = 0;
-
-	/*
-	 * Take a reference for this transaction on the buf item.
-	 */
-	atomic_inc(&bip->bli_refcount);
-
-	/*
-	 * Get a log_item_desc to point at the new item.
-	 */
-	(void) xfs_trans_add_item(tp, (xfs_log_item_t*)bip);
-
-	/*
-	 * Initialize b_fsprivate2 so we can find it with incore_match()
-	 * above.
-	 */
-	XFS_BUF_SET_FSPRIVATE2(bp, tp);
-
-	trace_xfs_trans_get_buf(bip);
+	_xfs_trans_bjoin(tp, bp, 1);
+	trace_xfs_trans_get_buf(bp->b_fspriv);
 	return (bp);
 }
 
@@ -209,44 +265,11 @@ xfs_trans_getsb(xfs_trans_t	*tp,
 	}
 
 	bp = xfs_getsb(mp, flags);
-	if (bp == NULL) {
+	if (bp == NULL)
 		return NULL;
-	}
-
-	/*
-	 * The xfs_buf_log_item pointer is stored in b_fsprivate.  If
-	 * it doesn't have one yet, then allocate one and initialize it.
-	 * The checks to see if one is there are in xfs_buf_item_init().
-	 */
-	xfs_buf_item_init(bp, mp);
-
-	/*
-	 * Set the recursion count for the buffer within this transaction
-	 * to 0.
-	 */
-	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*);
-	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
-	ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
-	ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
-	bip->bli_recur = 0;
-
-	/*
-	 * Take a reference for this transaction on the buf item.
-	 */
-	atomic_inc(&bip->bli_refcount);
 
-	/*
-	 * Get a log_item_desc to point at the new item.
-	 */
-	(void) xfs_trans_add_item(tp, (xfs_log_item_t*)bip);
-
-	/*
-	 * Initialize b_fsprivate2 so we can find it with incore_match()
-	 * above.
-	 */
-	XFS_BUF_SET_FSPRIVATE2(bp, tp);
-
-	trace_xfs_trans_getsb(bip);
+	_xfs_trans_bjoin(tp, bp, 1);
+	trace_xfs_trans_getsb(bp->b_fspriv);
 	return (bp);
 }
 
@@ -264,14 +287,6 @@ int	xfs_error_mod = 33;
  * within the transaction and already read in, just increment its
  * lock recursion count and return a pointer to it.
  *
- * Use the fast path function xfs_trans_buf_item_match() or the buffer
- * cache routine incore_match() to find the buffer
- * if it is already owned by this transaction.
- *
- * If we don't already own the buffer, use read_buf() to get it.
- * If it doesn't yet have an associated xfs_buf_log_item structure,
- * then allocate one and add the item to this transaction.
- *
  * If the transaction pointer is NULL, make this just a normal
  * read_buf() call.
  */
@@ -290,15 +305,15 @@ xfs_trans_read_buf(
 	int			error;
 
 	if (flags == 0)
-		flags = XFS_BUF_LOCK | XFS_BUF_MAPPED;
+		flags = XBF_LOCK | XBF_MAPPED;
 
 	/*
 	 * Default to a normal get_buf() call if the tp is NULL.
 	 */
 	if (tp == NULL) {
-		bp = xfs_buf_read(target, blkno, len, flags | BUF_BUSY);
+		bp = xfs_buf_read(target, blkno, len, flags | XBF_DONT_BLOCK);
 		if (!bp)
-			return (flags & XFS_BUF_TRYLOCK) ?
+			return (flags & XBF_TRYLOCK) ?
 					EAGAIN : XFS_ERROR(ENOMEM);
 
 		if (XFS_BUF_GETERROR(bp) != 0) {
@@ -333,11 +348,7 @@ xfs_trans_read_buf(
 	 * If the buffer is not yet read in, then we read it in, increment
 	 * the lock recursion count, and return it to the caller.
 	 */
-	if (tp->t_items.lic_next == NULL) {
-		bp = xfs_trans_buf_item_match(tp, target, blkno, len);
-	} else {
-		bp = xfs_trans_buf_item_match_all(tp, target, blkno, len);
-	}
+	bp = xfs_trans_buf_item_match(tp, target, blkno, len);
 	if (bp != NULL) {
 		ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
 		ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
@@ -385,14 +396,14 @@ xfs_trans_read_buf(
 	}
 
 	/*
-	 * We always specify the BUF_BUSY flag within a transaction so
-	 * that get_buf does not try to push out a delayed write buffer
+	 * We always specify the XBF_DONT_BLOCK flag within a transaction
+	 * so that get_buf does not try to push out a delayed write buffer
 	 * which might cause another transaction to take place (if the
 	 * buffer was delayed alloc).  Such recursive transactions can
 	 * easily deadlock with our current transaction as well as cause
 	 * us to run out of stack space.
 	 */
-	bp = xfs_buf_read(target, blkno, len, flags | BUF_BUSY);
+	bp = xfs_buf_read(target, blkno, len, flags | XBF_DONT_BLOCK);
 	if (bp == NULL) {
 		*bpp = NULL;
 		return 0;
@@ -424,40 +435,9 @@ xfs_trans_read_buf(
 	if (XFS_FORCED_SHUTDOWN(mp))
 		goto shutdown_abort;
 
-	/*
-	 * The xfs_buf_log_item pointer is stored in b_fsprivate.  If
-	 * it doesn't have one yet, then allocate one and initialize it.
-	 * The checks to see if one is there are in xfs_buf_item_init().
-	 */
-	xfs_buf_item_init(bp, tp->t_mountp);
-
-	/*
-	 * Set the recursion count for the buffer within this transaction
-	 * to 0.
-	 */
-	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*);
-	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
-	ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
-	ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
-	bip->bli_recur = 0;
-
-	/*
-	 * Take a reference for this transaction on the buf item.
-	 */
-	atomic_inc(&bip->bli_refcount);
+	_xfs_trans_bjoin(tp, bp, 1);
+	trace_xfs_trans_read_buf(bp->b_fspriv);
 
-	/*
-	 * Get a log_item_desc to point at the new item.
-	 */
-	(void) xfs_trans_add_item(tp, (xfs_log_item_t*)bip);
-
-	/*
-	 * Initialize b_fsprivate2 so we can find it with incore_match()
-	 * above.
-	 */
-	XFS_BUF_SET_FSPRIVATE2(bp, tp);
-
-	trace_xfs_trans_read_buf(bip);
 	*bpp = bp;
 	return 0;
 
@@ -472,8 +452,8 @@ shutdown_abort:
 	if (XFS_BUF_ISSTALE(bp) && XFS_BUF_ISDELAYWRITE(bp))
 		cmn_err(CE_NOTE, "about to pop assert, bp == 0x%p", bp);
 #endif
-	ASSERT((XFS_BUF_BFLAGS(bp) & (XFS_B_STALE|XFS_B_DELWRI)) !=
-						(XFS_B_STALE|XFS_B_DELWRI));
+	ASSERT((XFS_BUF_BFLAGS(bp) & (XBF_STALE|XBF_DELWRI)) !=
+				     (XBF_STALE|XBF_DELWRI));
 
 	trace_xfs_trans_read_buf_shut(bp, _RET_IP_);
 	xfs_buf_relse(bp);
@@ -531,7 +511,7 @@ xfs_trans_brelse(xfs_trans_t	*tp,
 	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
 	ASSERT(bip->bli_item.li_type == XFS_LI_BUF);
 	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
-	ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
+	ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL));
 	ASSERT(atomic_read(&bip->bli_refcount) > 0);
 
 	/*
@@ -622,53 +602,6 @@ xfs_trans_brelse(xfs_trans_t	*tp,
 }
 
 /*
- * Add the locked buffer to the transaction.
- * The buffer must be locked, and it cannot be associated with any
- * transaction.
- *
- * If the buffer does not yet have a buf log item associated with it,
- * then allocate one for it.  Then add the buf item to the transaction.
- */
-void
-xfs_trans_bjoin(xfs_trans_t	*tp,
-		xfs_buf_t	*bp)
-{
-	xfs_buf_log_item_t	*bip;
-
-	ASSERT(XFS_BUF_ISBUSY(bp));
-	ASSERT(XFS_BUF_FSPRIVATE2(bp, void *) == NULL);
-
-	/*
-	 * The xfs_buf_log_item pointer is stored in b_fsprivate.  If
-	 * it doesn't have one yet, then allocate one and initialize it.
-	 * The checks to see if one is there are in xfs_buf_item_init().
-	 */
-	xfs_buf_item_init(bp, tp->t_mountp);
-	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
-	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
-	ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
-	ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
-
-	/*
-	 * Take a reference for this transaction on the buf item.
-	 */
-	atomic_inc(&bip->bli_refcount);
-
-	/*
-	 * Get a log_item_desc to point at the new item.
-	 */
-	(void) xfs_trans_add_item(tp, (xfs_log_item_t *)bip);
-
-	/*
-	 * Initialize b_fsprivate2 so we can find it with incore_match()
-	 * in xfs_trans_get_buf() and friends above.
-	 */
-	XFS_BUF_SET_FSPRIVATE2(bp, tp);
-
-	trace_xfs_trans_bjoin(bip);
-}
-
-/*
  * Mark the buffer as not needing to be unlocked when the buf item's
  * IOP_UNLOCK() routine is called.  The buffer must already be locked
  * and associated with the given transaction.
@@ -686,7 +619,7 @@ xfs_trans_bhold(xfs_trans_t	*tp,
 
 	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
 	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
-	ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
+	ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL));
 	ASSERT(atomic_read(&bip->bli_refcount) > 0);
 	bip->bli_flags |= XFS_BLI_HOLD;
 	trace_xfs_trans_bhold(bip);
@@ -708,7 +641,7 @@ xfs_trans_bhold_release(xfs_trans_t	*tp,
 
 	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
 	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
-	ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
+	ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL));
 	ASSERT(atomic_read(&bip->bli_refcount) > 0);
 	ASSERT(bip->bli_flags & XFS_BLI_HOLD);
 	bip->bli_flags &= ~XFS_BLI_HOLD;
@@ -771,7 +704,7 @@ xfs_trans_log_buf(xfs_trans_t	*tp,
 		bip->bli_flags &= ~XFS_BLI_STALE;
 		ASSERT(XFS_BUF_ISSTALE(bp));
 		XFS_BUF_UNSTALE(bp);
-		bip->bli_format.blf_flags &= ~XFS_BLI_CANCEL;
+		bip->bli_format.blf_flags &= ~XFS_BLF_CANCEL;
 	}
 
 	lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)bip);
@@ -779,7 +712,6 @@ xfs_trans_log_buf(xfs_trans_t	*tp,
 
 	tp->t_flags |= XFS_TRANS_DIRTY;
 	lidp->lid_flags |= XFS_LID_DIRTY;
-	lidp->lid_flags &= ~XFS_LID_BUF_STALE;
 	bip->bli_flags |= XFS_BLI_LOGGED;
 	xfs_buf_item_log(bip, first, last);
 }
@@ -830,8 +762,8 @@ xfs_trans_binval(
 		ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
 		ASSERT(XFS_BUF_ISSTALE(bp));
 		ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY)));
-		ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_INODE_BUF));
-		ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
+		ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_INODE_BUF));
+		ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
 		ASSERT(lidp->lid_flags & XFS_LID_DIRTY);
 		ASSERT(tp->t_flags & XFS_TRANS_DIRTY);
 		return;
@@ -842,7 +774,7 @@ xfs_trans_binval(
 	 * in the buf log item.  The STALE flag will be used in
 	 * xfs_buf_item_unpin() to determine if it should clean up
 	 * when the last reference to the buf item is given up.
-	 * We set the XFS_BLI_CANCEL flag in the buf log format structure
+	 * We set the XFS_BLF_CANCEL flag in the buf log format structure
 	 * and log the buf item.  This will be used at recovery time
 	 * to determine that copies of the buffer in the log before
 	 * this should not be replayed.
@@ -860,26 +792,26 @@ xfs_trans_binval(
 	XFS_BUF_UNDELAYWRITE(bp);
 	XFS_BUF_STALE(bp);
 	bip->bli_flags |= XFS_BLI_STALE;
-	bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_DIRTY);
-	bip->bli_format.blf_flags &= ~XFS_BLI_INODE_BUF;
-	bip->bli_format.blf_flags |= XFS_BLI_CANCEL;
+	bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY);
+	bip->bli_format.blf_flags &= ~XFS_BLF_INODE_BUF;
+	bip->bli_format.blf_flags |= XFS_BLF_CANCEL;
 	memset((char *)(bip->bli_format.blf_data_map), 0,
 	      (bip->bli_format.blf_map_size * sizeof(uint)));
-	lidp->lid_flags |= XFS_LID_DIRTY|XFS_LID_BUF_STALE;
+	lidp->lid_flags |= XFS_LID_DIRTY;
 	tp->t_flags |= XFS_TRANS_DIRTY;
 }
 
 /*
- * This call is used to indicate that the buffer contains on-disk
- * inodes which must be handled specially during recovery.  They
- * require special handling because only the di_next_unlinked from
- * the inodes in the buffer should be recovered.  The rest of the
- * data in the buffer is logged via the inodes themselves.
+ * This call is used to indicate that the buffer contains on-disk inodes which
+ * must be handled specially during recovery.  They require special handling
+ * because only the di_next_unlinked from the inodes in the buffer should be
+ * recovered.  The rest of the data in the buffer is logged via the inodes
+ * themselves.
  *
- * All we do is set the XFS_BLI_INODE_BUF flag in the buffer's log
- * format structure so that we'll know what to do at recovery time.
+ * All we do is set the XFS_BLI_INODE_BUF flag in the items flags so it can be
+ * transferred to the buffer's log format structure so that we'll know what to
+ * do at recovery time.
  */
-/* ARGSUSED */
 void
 xfs_trans_inode_buf(
 	xfs_trans_t	*tp,
@@ -894,7 +826,7 @@ xfs_trans_inode_buf(
 	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
 	ASSERT(atomic_read(&bip->bli_refcount) > 0);
 
-	bip->bli_format.blf_flags |= XFS_BLI_INODE_BUF;
+	bip->bli_flags |= XFS_BLI_INODE_BUF;
 }
 
 /*
@@ -976,120 +908,12 @@ xfs_trans_dquot_buf(
 	ASSERT(XFS_BUF_ISBUSY(bp));
 	ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
 	ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
-	ASSERT(type == XFS_BLI_UDQUOT_BUF ||
-	       type == XFS_BLI_PDQUOT_BUF ||
-	       type == XFS_BLI_GDQUOT_BUF);
+	ASSERT(type == XFS_BLF_UDQUOT_BUF ||
+	       type == XFS_BLF_PDQUOT_BUF ||
+	       type == XFS_BLF_GDQUOT_BUF);
 
 	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
 	ASSERT(atomic_read(&bip->bli_refcount) > 0);
 
 	bip->bli_format.blf_flags |= type;
 }
-
-/*
- * Check to see if a buffer matching the given parameters is already
- * a part of the given transaction.  Only check the first, embedded
- * chunk, since we don't want to spend all day scanning large transactions.
- */
-STATIC xfs_buf_t *
-xfs_trans_buf_item_match(
-	xfs_trans_t	*tp,
-	xfs_buftarg_t	*target,
-	xfs_daddr_t	blkno,
-	int		len)
-{
-	xfs_log_item_chunk_t	*licp;
-	xfs_log_item_desc_t	*lidp;
-	xfs_buf_log_item_t	*blip;
-	xfs_buf_t		*bp;
-	int			i;
-
-	bp = NULL;
-	len = BBTOB(len);
-	licp = &tp->t_items;
-	if (!xfs_lic_are_all_free(licp)) {
-		for (i = 0; i < licp->lic_unused; i++) {
-			/*
-			 * Skip unoccupied slots.
-			 */
-			if (xfs_lic_isfree(licp, i)) {
-				continue;
-			}
-
-			lidp = xfs_lic_slot(licp, i);
-			blip = (xfs_buf_log_item_t *)lidp->lid_item;
-			if (blip->bli_item.li_type != XFS_LI_BUF) {
-				continue;
-			}
-
-			bp = blip->bli_buf;
-			if ((XFS_BUF_TARGET(bp) == target) &&
-			    (XFS_BUF_ADDR(bp) == blkno) &&
-			    (XFS_BUF_COUNT(bp) == len)) {
-				/*
-				 * We found it.  Break out and
-				 * return the pointer to the buffer.
-				 */
-				break;
-			} else {
-				bp = NULL;
-			}
-		}
-	}
-	return bp;
-}
-
-/*
- * Check to see if a buffer matching the given parameters is already
- * a part of the given transaction.  Check all the chunks, we
- * want to be thorough.
- */
-STATIC xfs_buf_t *
-xfs_trans_buf_item_match_all(
-	xfs_trans_t	*tp,
-	xfs_buftarg_t	*target,
-	xfs_daddr_t	blkno,
-	int		len)
-{
-	xfs_log_item_chunk_t	*licp;
-	xfs_log_item_desc_t	*lidp;
-	xfs_buf_log_item_t	*blip;
-	xfs_buf_t		*bp;
-	int			i;
-
-	bp = NULL;
-	len = BBTOB(len);
-	for (licp = &tp->t_items; licp != NULL; licp = licp->lic_next) {
-		if (xfs_lic_are_all_free(licp)) {
-			ASSERT(licp == &tp->t_items);
-			ASSERT(licp->lic_next == NULL);
-			return NULL;
-		}
-		for (i = 0; i < licp->lic_unused; i++) {
-			/*
-			 * Skip unoccupied slots.
-			 */
-			if (xfs_lic_isfree(licp, i)) {
-				continue;
-			}
-
-			lidp = xfs_lic_slot(licp, i);
-			blip = (xfs_buf_log_item_t *)lidp->lid_item;
-			if (blip->bli_item.li_type != XFS_LI_BUF) {
-				continue;
-			}
-
-			bp = blip->bli_buf;
-			if ((XFS_BUF_TARGET(bp) == target) &&
-			    (XFS_BUF_ADDR(bp) == blkno) &&
-			    (XFS_BUF_COUNT(bp) == len)) {
-				/*
-				 * We found it.  Break out and
-				 * return the pointer to the buffer.
-				 */
-				return bp;
-			}
-		}
-	}
-	return NULL;
-}
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index 785ff101da0a..2559dfec946b 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -62,7 +62,7 @@ xfs_trans_iget(
 {
 	int			error;
 
-	error = xfs_iget(mp, tp, ino, flags, lock_flags, ipp, 0);
+	error = xfs_iget(mp, tp, ino, flags, lock_flags, ipp);
 	if (!error && tp)
 		xfs_trans_ijoin(tp, *ipp, lock_flags);
 	return error;
diff --git a/fs/xfs/xfs_trans_item.c b/fs/xfs/xfs_trans_item.c
index eb3fc57f9eef..f11d37d06dcc 100644
--- a/fs/xfs/xfs_trans_item.c
+++ b/fs/xfs/xfs_trans_item.c
@@ -299,6 +299,7 @@ xfs_trans_next_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp)
 void
 xfs_trans_free_items(
 	xfs_trans_t	*tp,
+	xfs_lsn_t	commit_lsn,
 	int		flags)
 {
 	xfs_log_item_chunk_t	*licp;
@@ -311,7 +312,7 @@ xfs_trans_free_items(
 	 * Special case the embedded chunk so we don't free it below.
 	 */
 	if (!xfs_lic_are_all_free(licp)) {
-		(void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN);
+		(void) xfs_trans_unlock_chunk(licp, 1, abort, commit_lsn);
 		xfs_lic_all_free(licp);
 		licp->lic_unused = 0;
 	}
@@ -322,7 +323,7 @@ xfs_trans_free_items(
 	 */
 	while (licp != NULL) {
 		ASSERT(!xfs_lic_are_all_free(licp));
-		(void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN);
+		(void) xfs_trans_unlock_chunk(licp, 1, abort, commit_lsn);
 		next_licp = licp->lic_next;
 		kmem_free(licp);
 		licp = next_licp;
@@ -438,112 +439,3 @@ xfs_trans_unlock_chunk(
 
 	return freed;
 }
-
-
-/*
- * This is called to add the given busy item to the transaction's
- * list of busy items.  It must find a free busy item descriptor
- * or allocate a new one and add the item to that descriptor.
- * The function returns a pointer to busy descriptor used to point
- * to the new busy entry.  The log busy entry will now point to its new
- * descriptor with its ???? field.
- */
-xfs_log_busy_slot_t *
-xfs_trans_add_busy(xfs_trans_t *tp, xfs_agnumber_t ag, xfs_extlen_t idx)
-{
-	xfs_log_busy_chunk_t	*lbcp;
-	xfs_log_busy_slot_t	*lbsp;
-	int			i=0;
-
-	/*
-	 * If there are no free descriptors, allocate a new chunk
-	 * of them and put it at the front of the chunk list.
-	 */
-	if (tp->t_busy_free == 0) {
-		lbcp = (xfs_log_busy_chunk_t*)
-		       kmem_alloc(sizeof(xfs_log_busy_chunk_t), KM_SLEEP);
-		ASSERT(lbcp != NULL);
-		/*
-		 * Initialize the chunk, and then
-		 * claim the first slot in the newly allocated chunk.
-		 */
-		XFS_LBC_INIT(lbcp);
-		XFS_LBC_CLAIM(lbcp, 0);
-		lbcp->lbc_unused = 1;
-		lbsp = XFS_LBC_SLOT(lbcp, 0);
-
-		/*
-		 * Link in the new chunk and update the free count.
-		 */
-		lbcp->lbc_next = tp->t_busy.lbc_next;
-		tp->t_busy.lbc_next = lbcp;
-		tp->t_busy_free = XFS_LIC_NUM_SLOTS - 1;
-
-		/*
-		 * Initialize the descriptor and the generic portion
-		 * of the log item.
-		 *
-		 * Point the new slot at this item and return it.
-		 * Also point the log item at its currently active
-		 * descriptor and set the item's mount pointer.
-		 */
-		lbsp->lbc_ag = ag;
-		lbsp->lbc_idx = idx;
-		return lbsp;
-	}
-
-	/*
-	 * Find the free descriptor. It is somewhere in the chunklist
-	 * of descriptors.
-	 */
-	lbcp = &tp->t_busy;
-	while (lbcp != NULL) {
-		if (XFS_LBC_VACANCY(lbcp)) {
-			if (lbcp->lbc_unused <= XFS_LBC_MAX_SLOT) {
-				i = lbcp->lbc_unused;
-				break;
-			} else {
-				/* out-of-order vacancy */
-				cmn_err(CE_DEBUG, "OOO vacancy lbcp 0x%p\n", lbcp);
-				ASSERT(0);
-			}
-		}
-		lbcp = lbcp->lbc_next;
-	}
-	ASSERT(lbcp != NULL);
-	/*
-	 * If we find a free descriptor, claim it,
-	 * initialize it, and return it.
-	 */
-	XFS_LBC_CLAIM(lbcp, i);
-	if (lbcp->lbc_unused <= i) {
-		lbcp->lbc_unused = i + 1;
-	}
-	lbsp = XFS_LBC_SLOT(lbcp, i);
-	tp->t_busy_free--;
-	lbsp->lbc_ag = ag;
-	lbsp->lbc_idx = idx;
-	return lbsp;
-}
-
-
-/*
- * xfs_trans_free_busy
- * Free all of the busy lists from a transaction
- */
-void
-xfs_trans_free_busy(xfs_trans_t *tp)
-{
-	xfs_log_busy_chunk_t	*lbcp;
-	xfs_log_busy_chunk_t	*lbcq;
-
-	lbcp = tp->t_busy.lbc_next;
-	while (lbcp != NULL) {
-		lbcq = lbcp->lbc_next;
-		kmem_free(lbcp);
-		lbcp = lbcq;
-	}
-
-	XFS_LBC_INIT(&tp->t_busy);
-	tp->t_busy.lbc_unused = 0;
-}
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 73e2ad397432..c6e4f2c8de6e 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -35,13 +35,14 @@ struct xfs_log_item_desc	*xfs_trans_find_item(struct xfs_trans *,
 struct xfs_log_item_desc	*xfs_trans_first_item(struct xfs_trans *);
 struct xfs_log_item_desc	*xfs_trans_next_item(struct xfs_trans *,
 					     struct xfs_log_item_desc *);
-void				xfs_trans_free_items(struct xfs_trans *, int);
-void				xfs_trans_unlock_items(struct xfs_trans *,
-							xfs_lsn_t);
-void				xfs_trans_free_busy(xfs_trans_t *tp);
-xfs_log_busy_slot_t		*xfs_trans_add_busy(xfs_trans_t *tp,
-						    xfs_agnumber_t ag,
-						    xfs_extlen_t idx);
+
+void	xfs_trans_unlock_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn);
+void	xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn,
+				int flags);
+
+void	xfs_trans_item_committed(struct xfs_log_item *lip,
+				xfs_lsn_t commit_lsn, int aborted);
+void	xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp);
 
 /*
  * AIL traversal cursor.
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h
index d725428c9df6..320775295e32 100644
--- a/fs/xfs/xfs_types.h
+++ b/fs/xfs/xfs_types.h
@@ -75,6 +75,8 @@ typedef	__uint32_t	xfs_dahash_t;	/* dir/attr hash value */
 
 typedef __uint16_t	xfs_prid_t;	/* prid_t truncated to 16bits in XFS */
 
+typedef __uint32_t	xlog_tid_t;	/* transaction ID type */
+
 /*
  * These types are 64 bits on disk but are either 32 or 64 bits in memory.
  * Disk based types:
@@ -151,8 +153,8 @@ typedef enum {
 } xfs_btnum_t;
 
 struct xfs_name {
-	const char	*name;
-	int		len;
+	const unsigned char	*name;
+	int			len;
 };
 
 #endif	/* __XFS_TYPES_H__ */
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 6f268756bf36..c1646838898f 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -256,7 +256,7 @@ xfs_setattr(
 		    iattr->ia_size > ip->i_d.di_size) {
 			code = xfs_flush_pages(ip,
 					ip->i_d.di_size, iattr->ia_size,
-					XFS_B_ASYNC, FI_NONE);
+					XBF_ASYNC, FI_NONE);
 		}
 
 		/* wait for all I/O to complete */
@@ -267,7 +267,7 @@ xfs_setattr(
 		if (code) {
 			ASSERT(tp == NULL);
 			lock_flags &= ~XFS_ILOCK_EXCL;
-			ASSERT(lock_flags == XFS_IOLOCK_EXCL);
+			ASSERT(lock_flags == XFS_IOLOCK_EXCL || !need_iolock);
 			goto error_return;
 		}
 		tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);
@@ -584,116 +584,6 @@ xfs_readlink(
 }
 
 /*
- * xfs_fsync
- *
- * This is called to sync the inode and its data out to disk.  We need to hold
- * the I/O lock while flushing the data, and the inode lock while flushing the
- * inode.  The inode lock CANNOT be held while flushing the data, so acquire
- * after we're done with that.
- */
-int
-xfs_fsync(
-	xfs_inode_t	*ip)
-{
-	xfs_trans_t	*tp;
-	int		error = 0;
-	int		log_flushed = 0, changed = 1;
-
-	xfs_itrace_entry(ip);
-
-	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
-		return XFS_ERROR(EIO);
-
-	/*
-	 * We always need to make sure that the required inode state is safe on
-	 * disk.  The inode might be clean but we still might need to force the
-	 * log because of committed transactions that haven't hit the disk yet.
-	 * Likewise, there could be unflushed non-transactional changes to the
-	 * inode core that have to go to disk and this requires us to issue
-	 * a synchronous transaction to capture these changes correctly.
-	 *
-	 * This code relies on the assumption that if the update_* fields
-	 * of the inode are clear and the inode is unpinned then it is clean
-	 * and no action is required.
-	 */
-	xfs_ilock(ip, XFS_ILOCK_SHARED);
-
-	if (!ip->i_update_core) {
-		/*
-		 * Timestamps/size haven't changed since last inode flush or
-		 * inode transaction commit.  That means either nothing got
-		 * written or a transaction committed which caught the updates.
-		 * If the latter happened and the transaction hasn't hit the
-		 * disk yet, the inode will be still be pinned.  If it is,
-		 * force the log.
-		 */
-
-		xfs_iunlock(ip, XFS_ILOCK_SHARED);
-
-		if (xfs_ipincount(ip)) {
-			error = _xfs_log_force(ip->i_mount, (xfs_lsn_t)0,
-				      XFS_LOG_FORCE | XFS_LOG_SYNC,
-				      &log_flushed);
-		} else {
-			/*
-			 * If the inode is not pinned and nothing has changed
-			 * we don't need to flush the cache.
-			 */
-			changed = 0;
-		}
-	} else	{
-		/*
-		 * Kick off a transaction to log the inode core to get the
-		 * updates.  The sync transaction will also force the log.
-		 */
-		xfs_iunlock(ip, XFS_ILOCK_SHARED);
-		tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS);
-		error = xfs_trans_reserve(tp, 0,
-				XFS_FSYNC_TS_LOG_RES(ip->i_mount), 0, 0, 0);
-		if (error) {
-			xfs_trans_cancel(tp, 0);
-			return error;
-		}
-		xfs_ilock(ip, XFS_ILOCK_EXCL);
-
-		/*
-		 * Note - it's possible that we might have pushed ourselves out
-		 * of the way during trans_reserve which would flush the inode.
-		 * But there's no guarantee that the inode buffer has actually
-		 * gone out yet (it's delwri).	Plus the buffer could be pinned
-		 * anyway if it's part of an inode in another recent
-		 * transaction.	 So we play it safe and fire off the
-		 * transaction anyway.
-		 */
-		xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-		xfs_trans_ihold(tp, ip);
-		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-		xfs_trans_set_sync(tp);
-		error = _xfs_trans_commit(tp, 0, &log_flushed);
-
-		xfs_iunlock(ip, XFS_ILOCK_EXCL);
-	}
-
-	if ((ip->i_mount->m_flags & XFS_MOUNT_BARRIER) && changed) {
-		/*
-		 * If the log write didn't issue an ordered tag we need
-		 * to flush the disk cache for the data device now.
-		 */
-		if (!log_flushed)
-			xfs_blkdev_issue_flush(ip->i_mount->m_ddev_targp);
-
-		/*
-		 * If this inode is on the RT dev we need to flush that
-		 * cache as well.
-		 */
-		if (XFS_IS_REALTIME_INODE(ip))
-			xfs_blkdev_issue_flush(ip->i_mount->m_rtdev_targp);
-	}
-
-	return error;
-}
-
-/*
  * Flags for xfs_free_eofblocks
  */
 #define XFS_FREE_EOF_TRYLOCK	(1<<0)
@@ -1096,7 +986,7 @@ xfs_release(
 		 */
 		truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
 		if (truncated && VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0)
-			xfs_flush_pages(ip, 0, -1, XFS_B_ASYNC, FI_NONE);
+			xfs_flush_pages(ip, 0, -1, XBF_ASYNC, FI_NONE);
 	}
 
 	if (ip->i_d.di_nlink != 0) {
@@ -1379,7 +1269,7 @@ xfs_lookup(
 	if (error)
 		goto out;
 
-	error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp, 0);
+	error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp);
 	if (error)
 		goto out_free_name;
 
@@ -2199,7 +2089,8 @@ xfs_symlink(
 	if (DM_EVENT_ENABLED(dp, DM_EVENT_SYMLINK)) {
 		error = XFS_SEND_NAMESP(mp, DM_EVENT_SYMLINK, dp,
 					DM_RIGHT_NULL, NULL, DM_RIGHT_NULL,
-					link_name->name, target_path, 0, 0, 0);
+					link_name->name,
+					(unsigned char *)target_path, 0, 0, 0);
 		if (error)
 			return error;
 	}
@@ -2395,7 +2286,8 @@ std_return:
 					dp, DM_RIGHT_NULL,
 					error ? NULL : ip,
 					DM_RIGHT_NULL, link_name->name,
-					target_path, 0, error, 0);
+					(unsigned char *)target_path,
+					0, error, 0);
 	}
 
 	if (!error)
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index 167a467403a5..d8dfa8d0dadd 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -21,7 +21,6 @@ int xfs_setattr(struct xfs_inode *ip, struct iattr *vap, int flags);
 #define XFS_ATTR_NOACL		0x08	/* Don't call xfs_acl_chmod */
 
 int xfs_readlink(struct xfs_inode *ip, char *link);
-int xfs_fsync(struct xfs_inode *ip);
 int xfs_release(struct xfs_inode *ip);
 int xfs_inactive(struct xfs_inode *ip);
 int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,
@@ -43,25 +42,13 @@ int xfs_change_file_space(struct xfs_inode *ip, int cmd,
 int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name,
 		struct xfs_inode *src_ip, struct xfs_inode *target_dp,
 		struct xfs_name *target_name, struct xfs_inode *target_ip);
-int xfs_attr_get(struct xfs_inode *ip, const char *name, char *value,
-		int *valuelenp, int flags);
-int xfs_attr_set(struct xfs_inode *dp, const char *name, char *value,
-		int valuelen, int flags);
-int xfs_attr_remove(struct xfs_inode *dp, const char *name, int flags);
+int xfs_attr_get(struct xfs_inode *ip, const unsigned char *name,
+		unsigned char *value, int *valuelenp, int flags);
+int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name,
+		unsigned char *value, int valuelen, int flags);
+int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags);
 int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize,
 		int flags, struct attrlist_cursor_kern *cursor);
-ssize_t xfs_read(struct xfs_inode *ip, struct kiocb *iocb,
-		const struct iovec *iovp, unsigned int segs,
-		loff_t *offset, int ioflags);
-ssize_t xfs_splice_read(struct xfs_inode *ip, struct file *infilp,
-		loff_t *ppos, struct pipe_inode_info *pipe, size_t count,
-		int flags, int ioflags);
-ssize_t xfs_splice_write(struct xfs_inode *ip,
-		struct pipe_inode_info *pipe, struct file *outfilp,
-		loff_t *ppos, size_t count, int flags, int ioflags);
-ssize_t xfs_write(struct xfs_inode *xip, struct kiocb *iocb,
-		const struct iovec *iovp, unsigned int nsegs,
-		loff_t *offset, int ioflags);
 int xfs_bmap(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
 		int flags, struct xfs_iomap *iomapp, int *niomaps);
 void xfs_tosspages(struct xfs_inode *inode, xfs_off_t first,
@@ -72,4 +59,6 @@ int xfs_flush_pages(struct xfs_inode *ip, xfs_off_t first,
 		xfs_off_t last, uint64_t flags, int fiopt);
 int xfs_wait_on_pages(struct xfs_inode *ip, xfs_off_t first, xfs_off_t last);
 
+int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t);
+
 #endif /* _XFS_VNODEOPS_H */
author	Michal Marek <mmarek@suse.cz>	2010-08-04 13:59:13 +0200
committer	Michal Marek <mmarek@suse.cz>	2010-08-04 13:59:13 +0200
commit	772320e84588dcbe1600ffb83e5f328f2209ac2a (patch)
tree	a7de21b79340aeaa17c58126f6b801b82c77b53a /fs
parent	1ce53adf13a54375d2a5c7cdbe341b2558389615 (diff)
parent	9fe6206f400646a2322096b56c59891d530e8d51 (diff)