summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/Kconfig66
-rw-r--r--fs/Kconfig.binfmt2
-rw-r--r--fs/Makefile8
-rw-r--r--fs/afs/file.c4
-rw-r--r--fs/afs/internal.h8
-rw-r--r--fs/afs/write.c131
-rw-r--r--fs/autofs4/Makefile2
-rw-r--r--fs/autofs4/autofs_i.h44
-rw-r--r--fs/autofs4/dev-ioctl.c863
-rw-r--r--fs/autofs4/expire.c18
-rw-r--r--fs/autofs4/init.c11
-rw-r--r--fs/autofs4/inode.c8
-rw-r--r--fs/autofs4/waitq.c42
-rw-r--r--fs/befs/befs_fs_types.h4
-rw-r--r--fs/befs/linuxvfs.c4
-rw-r--r--fs/befs/super.c6
-rw-r--r--fs/binfmt_elf.c6
-rw-r--r--fs/binfmt_elf_fdpic.c85
-rw-r--r--fs/binfmt_em86.c2
-rw-r--r--fs/binfmt_flat.c6
-rw-r--r--fs/binfmt_misc.c4
-rw-r--r--fs/binfmt_script.c5
-rw-r--r--fs/binfmt_som.c2
-rw-r--r--fs/block_dev.c2
-rw-r--r--fs/char_dev.c3
-rw-r--r--fs/coda/psdev.c5
-rw-r--r--fs/compat.c41
-rw-r--r--fs/direct-io.c4
-rw-r--r--fs/dquot.c2
-rw-r--r--fs/ecryptfs/Makefile2
-rw-r--r--fs/ecryptfs/ecryptfs_kernel.h28
-rw-r--r--fs/ecryptfs/file.c17
-rw-r--r--fs/ecryptfs/keystore.c32
-rw-r--r--fs/ecryptfs/main.c19
-rw-r--r--fs/ecryptfs/messaging.c118
-rw-r--r--fs/ecryptfs/mmap.c81
-rw-r--r--fs/ecryptfs/netlink.c249
-rw-r--r--fs/eventpoll.c9
-rw-r--r--fs/exec.c15
-rw-r--r--fs/ext2/balloc.c3
-rw-r--r--fs/ext2/dir.c60
-rw-r--r--fs/ext4/balloc.c12
-rw-r--r--fs/ext4/ext4.h1
-rw-r--r--fs/ext4/ext4_sb.h3
-rw-r--r--fs/ext4/inode.c143
-rw-r--r--fs/ext4/mballoc.c263
-rw-r--r--fs/ext4/mballoc.h31
-rw-r--r--fs/ext4/super.c132
-rw-r--r--fs/fs-writeback.c2
-rw-r--r--fs/hfs/catalog.c4
-rw-r--r--fs/hfsplus/bitmap.c12
-rw-r--r--fs/hfsplus/catalog.c5
-rw-r--r--fs/hfsplus/super.c2
-rw-r--r--fs/jbd2/commit.c3
-rw-r--r--fs/jbd2/transaction.c1
-rw-r--r--fs/lockd/Makefile2
-rw-r--r--fs/lockd/clntlock.c13
-rw-r--r--fs/lockd/grace.c59
-rw-r--r--fs/lockd/host.c350
-rw-r--r--fs/lockd/mon.c2
-rw-r--r--fs/lockd/svc.c88
-rw-r--r--fs/lockd/svc4proc.c31
-rw-r--r--fs/lockd/svclock.c18
-rw-r--r--fs/lockd/svcproc.c31
-rw-r--r--fs/lockd/svcsubs.c2
-rw-r--r--fs/lockd/xdr.c2
-rw-r--r--fs/lockd/xdr4.c2
-rw-r--r--fs/mpage.c2
-rw-r--r--fs/nfs/callback.c3
-rw-r--r--fs/nfs/client.c5
-rw-r--r--fs/nfs/dir.c20
-rw-r--r--fs/nfs/file.c18
-rw-r--r--fs/nfs/inode.c183
-rw-r--r--fs/nfs/internal.h25
-rw-r--r--fs/nfs/mount_clnt.c3
-rw-r--r--fs/nfs/namespace.c7
-rw-r--r--fs/nfs/nfs3acl.c2
-rw-r--r--fs/nfs/nfs3proc.c20
-rw-r--r--fs/nfs/nfs4namespace.c105
-rw-r--r--fs/nfs/proc.c10
-rw-r--r--fs/nfs/super.c126
-rw-r--r--fs/nfs/unlink.c5
-rw-r--r--fs/nfs/write.c3
-rw-r--r--fs/nfsd/lockd.c1
-rw-r--r--fs/nfsd/nfs3proc.c8
-rw-r--r--fs/nfsd/nfs4callback.c7
-rw-r--r--fs/nfsd/nfs4proc.c8
-rw-r--r--fs/nfsd/nfs4state.c34
-rw-r--r--fs/nfsd/nfs4xdr.c171
-rw-r--r--fs/nfsd/nfsctl.c5
-rw-r--r--fs/nfsd/nfsfh.c30
-rw-r--r--fs/nfsd/nfsproc.c6
-rw-r--r--fs/nfsd/nfssvc.c20
-rw-r--r--fs/nfsd/vfs.c63
-rw-r--r--fs/nls/nls_base.c21
-rw-r--r--fs/ocfs2/Makefile3
-rw-r--r--fs/ocfs2/alloc.c913
-rw-r--r--fs/ocfs2/alloc.h86
-rw-r--r--fs/ocfs2/aops.c60
-rw-r--r--fs/ocfs2/buffer_head_io.c134
-rw-r--r--fs/ocfs2/buffer_head_io.h23
-rw-r--r--fs/ocfs2/cluster/masklog.c1
-rw-r--r--fs/ocfs2/cluster/masklog.h1
-rw-r--r--fs/ocfs2/dir.c109
-rw-r--r--fs/ocfs2/dlmglue.c9
-rw-r--r--fs/ocfs2/extent_map.c70
-rw-r--r--fs/ocfs2/extent_map.h4
-rw-r--r--fs/ocfs2/file.c333
-rw-r--r--fs/ocfs2/file.h32
-rw-r--r--fs/ocfs2/inode.c87
-rw-r--r--fs/ocfs2/inode.h6
-rw-r--r--fs/ocfs2/ioctl.c3
-rw-r--r--fs/ocfs2/journal.c89
-rw-r--r--fs/ocfs2/journal.h52
-rw-r--r--fs/ocfs2/localalloc.c384
-rw-r--r--fs/ocfs2/localalloc.h4
-rw-r--r--fs/ocfs2/locks.c15
-rw-r--r--fs/ocfs2/locks.h1
-rw-r--r--fs/ocfs2/namei.c101
-rw-r--r--fs/ocfs2/ocfs2.h56
-rw-r--r--fs/ocfs2/ocfs2_fs.h220
-rw-r--r--fs/ocfs2/ocfs2_jbd_compat.h82
-rw-r--r--fs/ocfs2/resize.c11
-rw-r--r--fs/ocfs2/slot_map.c7
-rw-r--r--fs/ocfs2/stack_user.c33
-rw-r--r--fs/ocfs2/stackglue.c20
-rw-r--r--fs/ocfs2/stackglue.h19
-rw-r--r--fs/ocfs2/suballoc.c248
-rw-r--r--fs/ocfs2/suballoc.h26
-rw-r--r--fs/ocfs2/super.c62
-rw-r--r--fs/ocfs2/symlink.c18
-rw-r--r--fs/ocfs2/uptodate.c38
-rw-r--r--fs/ocfs2/uptodate.h3
-rw-r--r--fs/ocfs2/xattr.c4832
-rw-r--r--fs/ocfs2/xattr.h68
-rw-r--r--fs/partitions/acorn.c10
-rw-r--r--fs/partitions/check.c27
-rw-r--r--fs/proc/proc_misc.c30
-rw-r--r--fs/reiserfs/procfs.c3
-rw-r--r--fs/reiserfs/xattr.c2
-rw-r--r--fs/sysfs/bin.c42
-rw-r--r--fs/sysfs/dir.c24
-rw-r--r--fs/sysfs/file.c46
-rw-r--r--fs/sysfs/mount.c15
-rw-r--r--fs/sysfs/sysfs.h6
-rw-r--r--fs/xfs/linux-2.6/xfs_super.c2
146 files changed, 10028 insertions, 2588 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index f54a157a0296..d0a1174fb516 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -160,7 +160,7 @@ config EXT4_FS
filesystem initially.
To compile this file system support as a module, choose M here. The
- module will be called ext4dev.
+ module will be called ext4.
If unsure, say N.
@@ -220,17 +220,16 @@ config JBD
tristate
help
This is a generic journalling layer for block devices. It is
- currently used by the ext3 and OCFS2 file systems, but it could
- also be used to add journal support to other file systems or block
+ currently used by the ext3 file system, but it could also be
+ used to add journal support to other file systems or block
devices such as RAID or LVM.
- If you are using the ext3 or OCFS2 file systems, you need to
- say Y here. If you are not using ext3 OCFS2 then you will probably
- want to say N.
+ If you are using the ext3 file system, you need to say Y here.
+ If you are not using ext3 then you will probably want to say N.
To compile this device as a module, choose M here: the module will be
- called jbd. If you are compiling ext3 or OCFS2 into the kernel,
- you cannot compile this code as a module.
+ called jbd. If you are compiling ext3 into the kernel, you
+ cannot compile this code as a module.
config JBD_DEBUG
bool "JBD (ext3) debugging support"
@@ -254,15 +253,16 @@ config JBD2
help
This is a generic journaling layer for block devices that support
both 32-bit and 64-bit block numbers. It is currently used by
- the ext4 filesystem, but it could also be used to add
+ the ext4 and OCFS2 filesystems, but it could also be used to add
journal support to other file systems or block devices such
as RAID or LVM.
- If you are using ext4, you need to say Y here. If you are not
- using ext4 then you will probably want to say N.
+ If you are using ext4 or OCFS2, you need to say Y here.
+ If you are not using ext4 or OCFS2 then you will
+ probably want to say N.
To compile this device as a module, choose M here. The module will be
- called jbd2. If you are compiling ext4 into the kernel,
+ called jbd2. If you are compiling ext4 or OCFS2 into the kernel,
you cannot compile this code as a module.
config JBD2_DEBUG
@@ -433,6 +433,14 @@ config FS_POSIX_ACL
bool
default n
+config FILE_LOCKING
+ bool "Enable POSIX file locking API" if EMBEDDED
+ default y
+ help
+ This option enables standard file locking support, required
+ for filesystems like NFS and for the flock() system
+ call. Disabling this option saves about 11k.
+
source "fs/xfs/Kconfig"
source "fs/gfs2/Kconfig"
@@ -440,7 +448,7 @@ config OCFS2_FS
tristate "OCFS2 file system support"
depends on NET && SYSFS
select CONFIGFS_FS
- select JBD
+ select JBD2
select CRC32
help
OCFS2 is a general purpose extent based shared disk cluster file
@@ -511,6 +519,16 @@ config OCFS2_DEBUG_FS
this option for debugging only as it is likely to decrease
performance of the filesystem.
+config OCFS2_COMPAT_JBD
+ bool "Use JBD for compatibility"
+ depends on OCFS2_FS
+ default n
+ select JBD
+ help
+ The ocfs2 filesystem now uses JBD2 for its journalling. JBD2
+ is backwards compatible with JBD. It is safe to say N here.
+ However, if you really want to use the original JBD, say Y here.
+
endif # BLOCK
config DNOTIFY
@@ -1779,6 +1797,28 @@ config SUNRPC_XPRT_RDMA
If unsure, say N.
+config SUNRPC_REGISTER_V4
+ bool "Register local RPC services via rpcbind v4 (EXPERIMENTAL)"
+ depends on SUNRPC && EXPERIMENTAL
+ default n
+ help
+ Sun added support for registering RPC services at an IPv6
+ address by creating two new versions of the rpcbind protocol
+ (RFC 1833).
+
+ This option enables support in the kernel RPC server for
+ registering kernel RPC services via version 4 of the rpcbind
+ protocol. If you enable this option, you must run a portmapper
+ daemon that supports rpcbind protocol version 4.
+
+ Serving NFS over IPv6 from knfsd (the kernel's NFS server)
+ requires that you enable this option and use a portmapper that
+ supports rpcbind version 4.
+
+ If unsure, say N to get traditional behavior (register kernel
+ RPC services using only rpcbind version 2). Distributions
+ using the legacy Linux portmapper daemon must say N here.
+
config RPCSEC_GSS_KRB5
tristate "Secure RPC: Kerberos V mechanism (EXPERIMENTAL)"
depends on SUNRPC && EXPERIMENTAL
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index 17c9c5ec14c5..801db1341811 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -25,7 +25,7 @@ config BINFMT_ELF
config COMPAT_BINFMT_ELF
bool
- depends on COMPAT && MMU
+ depends on COMPAT && BINFMT_ELF
config BINFMT_ELF_FDPIC
bool "Kernel support for FDPIC ELF binaries"
diff --git a/fs/Makefile b/fs/Makefile
index de404b00eb0c..2168c902d5ca 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -7,8 +7,8 @@
obj-y := open.o read_write.o file_table.o super.o \
char_dev.o stat.o exec.o pipe.o namei.o fcntl.o \
- ioctl.o readdir.o select.o fifo.o locks.o dcache.o inode.o \
- attr.o bad_inode.o file.o filesystems.o namespace.o aio.o \
+ ioctl.o readdir.o select.o fifo.o dcache.o inode.o \
+ attr.o bad_inode.o file.o filesystems.o namespace.o \
seq_file.o xattr.o libfs.o fs-writeback.o \
pnode.o drop_caches.o splice.o sync.o utimes.o \
stack.o
@@ -27,6 +27,8 @@ obj-$(CONFIG_ANON_INODES) += anon_inodes.o
obj-$(CONFIG_SIGNALFD) += signalfd.o
obj-$(CONFIG_TIMERFD) += timerfd.o
obj-$(CONFIG_EVENTFD) += eventfd.o
+obj-$(CONFIG_AIO) += aio.o
+obj-$(CONFIG_FILE_LOCKING) += locks.o
obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o
nfsd-$(CONFIG_NFSD) := nfsctl.o
@@ -69,7 +71,7 @@ obj-$(CONFIG_DLM) += dlm/
# Do not add any filesystems before this line
obj-$(CONFIG_REISERFS_FS) += reiserfs/
obj-$(CONFIG_EXT3_FS) += ext3/ # Before ext2 so root fs can be ext3
-obj-$(CONFIG_EXT4_FS) += ext4/ # Before ext2 so root fs can be ext4dev
+obj-$(CONFIG_EXT4_FS) += ext4/ # Before ext2 so root fs can be ext4
obj-$(CONFIG_JBD) += jbd/
obj-$(CONFIG_JBD2) += jbd2/
obj-$(CONFIG_EXT2_FS) += ext2/
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 525f7c56e068..a3901769a96c 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -50,8 +50,8 @@ const struct address_space_operations afs_fs_aops = {
.launder_page = afs_launder_page,
.releasepage = afs_releasepage,
.invalidatepage = afs_invalidatepage,
- .prepare_write = afs_prepare_write,
- .commit_write = afs_commit_write,
+ .write_begin = afs_write_begin,
+ .write_end = afs_write_end,
.writepage = afs_writepage,
.writepages = afs_writepages,
};
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 3cb6920ff30b..67f259d99cd6 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -728,8 +728,12 @@ extern int afs_volume_release_fileserver(struct afs_vnode *,
*/
extern int afs_set_page_dirty(struct page *);
extern void afs_put_writeback(struct afs_writeback *);
-extern int afs_prepare_write(struct file *, struct page *, unsigned, unsigned);
-extern int afs_commit_write(struct file *, struct page *, unsigned, unsigned);
+extern int afs_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, void **fsdata);
+extern int afs_write_end(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata);
extern int afs_writepage(struct page *, struct writeback_control *);
extern int afs_writepages(struct address_space *, struct writeback_control *);
extern int afs_write_inode(struct inode *, int);
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 065b4e10681a..d6b85dab35fc 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -84,15 +84,23 @@ void afs_put_writeback(struct afs_writeback *wb)
* partly or wholly fill a page that's under preparation for writing
*/
static int afs_fill_page(struct afs_vnode *vnode, struct key *key,
- unsigned start, unsigned len, struct page *page)
+ loff_t pos, unsigned len, struct page *page)
{
+ loff_t i_size;
+ unsigned eof;
int ret;
- _enter(",,%u,%u", start, len);
+ _enter(",,%llu,%u", (unsigned long long)pos, len);
- ASSERTCMP(start + len, <=, PAGE_SIZE);
+ ASSERTCMP(len, <=, PAGE_CACHE_SIZE);
- ret = afs_vnode_fetch_data(vnode, key, start, len, page);
+ i_size = i_size_read(&vnode->vfs_inode);
+ if (pos + len > i_size)
+ eof = i_size;
+ else
+ eof = PAGE_CACHE_SIZE;
+
+ ret = afs_vnode_fetch_data(vnode, key, 0, eof, page);
if (ret < 0) {
if (ret == -ENOENT) {
_debug("got NOENT from server"
@@ -107,109 +115,55 @@ static int afs_fill_page(struct afs_vnode *vnode, struct key *key,
}
/*
- * prepare a page for being written to
- */
-static int afs_prepare_page(struct afs_vnode *vnode, struct page *page,
- struct key *key, unsigned offset, unsigned to)
-{
- unsigned eof, tail, start, stop, len;
- loff_t i_size, pos;
- void *p;
- int ret;
-
- _enter("");
-
- if (offset == 0 && to == PAGE_SIZE)
- return 0;
-
- p = kmap_atomic(page, KM_USER0);
-
- i_size = i_size_read(&vnode->vfs_inode);
- pos = (loff_t) page->index << PAGE_SHIFT;
- if (pos >= i_size) {
- /* partial write, page beyond EOF */
- _debug("beyond");
- if (offset > 0)
- memset(p, 0, offset);
- if (to < PAGE_SIZE)
- memset(p + to, 0, PAGE_SIZE - to);
- kunmap_atomic(p, KM_USER0);
- return 0;
- }
-
- if (i_size - pos >= PAGE_SIZE) {
- /* partial write, page entirely before EOF */
- _debug("before");
- tail = eof = PAGE_SIZE;
- } else {
- /* partial write, page overlaps EOF */
- eof = i_size - pos;
- _debug("overlap %u", eof);
- tail = max(eof, to);
- if (tail < PAGE_SIZE)
- memset(p + tail, 0, PAGE_SIZE - tail);
- if (offset > eof)
- memset(p + eof, 0, PAGE_SIZE - eof);
- }
-
- kunmap_atomic(p, KM_USER0);
-
- ret = 0;
- if (offset > 0 || eof > to) {
- /* need to fill one or two bits that aren't going to be written
- * (cover both fillers in one read if there are two) */
- start = (offset > 0) ? 0 : to;
- stop = (eof > to) ? eof : offset;
- len = stop - start;
- _debug("wr=%u-%u av=0-%u rd=%u@%u",
- offset, to, eof, start, len);
- ret = afs_fill_page(vnode, key, start, len, page);
- }
-
- _leave(" = %d", ret);
- return ret;
-}
-
-/*
* prepare to perform part of a write to a page
- * - the caller holds the page locked, preventing it from being written out or
- * modified by anyone else
*/
-int afs_prepare_write(struct file *file, struct page *page,
- unsigned offset, unsigned to)
+int afs_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, void **fsdata)
{
struct afs_writeback *candidate, *wb;
struct afs_vnode *vnode = AFS_FS_I(file->f_dentry->d_inode);
+ struct page *page;
struct key *key = file->private_data;
- pgoff_t index;
+ unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+ unsigned to = from + len;
+ pgoff_t index = pos >> PAGE_CACHE_SHIFT;
int ret;
_enter("{%x:%u},{%lx},%u,%u",
- vnode->fid.vid, vnode->fid.vnode, page->index, offset, to);
+ vnode->fid.vid, vnode->fid.vnode, index, from, to);
candidate = kzalloc(sizeof(*candidate), GFP_KERNEL);
if (!candidate)
return -ENOMEM;
candidate->vnode = vnode;
- candidate->first = candidate->last = page->index;
- candidate->offset_first = offset;
+ candidate->first = candidate->last = index;
+ candidate->offset_first = from;
candidate->to_last = to;
candidate->usage = 1;
candidate->state = AFS_WBACK_PENDING;
init_waitqueue_head(&candidate->waitq);
+ page = __grab_cache_page(mapping, index);
+ if (!page) {
+ kfree(candidate);
+ return -ENOMEM;
+ }
+ *pagep = page;
+ /* page won't leak in error case: it eventually gets cleaned off LRU */
+
if (!PageUptodate(page)) {
_debug("not up to date");
- ret = afs_prepare_page(vnode, page, key, offset, to);
+ ret = afs_fill_page(vnode, key, pos, len, page);
if (ret < 0) {
kfree(candidate);
_leave(" = %d [prep]", ret);
return ret;
}
+ SetPageUptodate(page);
}
try_again:
- index = page->index;
spin_lock(&vnode->writeback_lock);
/* see if this page is already pending a writeback under a suitable key
@@ -242,8 +196,8 @@ try_again:
subsume_in_current_wb:
_debug("subsume");
ASSERTRANGE(wb->first, <=, index, <=, wb->last);
- if (index == wb->first && offset < wb->offset_first)
- wb->offset_first = offset;
+ if (index == wb->first && from < wb->offset_first)
+ wb->offset_first = from;
if (index == wb->last && to > wb->to_last)
wb->to_last = to;
spin_unlock(&vnode->writeback_lock);
@@ -289,17 +243,17 @@ flush_conflicting_wb:
/*
* finalise part of a write to a page
*/
-int afs_commit_write(struct file *file, struct page *page,
- unsigned offset, unsigned to)
+int afs_write_end(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
{
struct afs_vnode *vnode = AFS_FS_I(file->f_dentry->d_inode);
loff_t i_size, maybe_i_size;
- _enter("{%x:%u},{%lx},%u,%u",
- vnode->fid.vid, vnode->fid.vnode, page->index, offset, to);
+ _enter("{%x:%u},{%lx}",
+ vnode->fid.vid, vnode->fid.vnode, page->index);
- maybe_i_size = (loff_t) page->index << PAGE_SHIFT;
- maybe_i_size += to;
+ maybe_i_size = pos + copied;
i_size = i_size_read(&vnode->vfs_inode);
if (maybe_i_size > i_size) {
@@ -310,12 +264,13 @@ int afs_commit_write(struct file *file, struct page *page,
spin_unlock(&vnode->writeback_lock);
}
- SetPageUptodate(page);
set_page_dirty(page);
if (PageDirty(page))
_debug("dirtied");
+ unlock_page(page);
+ page_cache_release(page);
- return 0;
+ return copied;
}
/*
diff --git a/fs/autofs4/Makefile b/fs/autofs4/Makefile
index f2c3b79e94d2..a811c1f7d9ab 100644
--- a/fs/autofs4/Makefile
+++ b/fs/autofs4/Makefile
@@ -4,4 +4,4 @@
obj-$(CONFIG_AUTOFS4_FS) += autofs4.o
-autofs4-objs := init.o inode.o root.o symlink.o waitq.o expire.o
+autofs4-objs := init.o inode.o root.o symlink.o waitq.o expire.o dev-ioctl.o
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 69a2f5c92319..e0f16da00e54 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -14,6 +14,7 @@
/* Internal header file for autofs */
#include <linux/auto_fs4.h>
+#include <linux/auto_dev-ioctl.h>
#include <linux/mutex.h>
#include <linux/list.h>
@@ -21,6 +22,11 @@
#define AUTOFS_IOC_FIRST AUTOFS_IOC_READY
#define AUTOFS_IOC_COUNT 32
+#define AUTOFS_DEV_IOCTL_IOC_FIRST (AUTOFS_DEV_IOCTL_VERSION)
+#define AUTOFS_DEV_IOCTL_IOC_COUNT (AUTOFS_IOC_COUNT - 11)
+
+#define AUTOFS_TYPE_TRIGGER (AUTOFS_TYPE_DIRECT|AUTOFS_TYPE_OFFSET)
+
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/time.h>
@@ -35,11 +41,27 @@
/* #define DEBUG */
#ifdef DEBUG
-#define DPRINTK(fmt,args...) do { printk(KERN_DEBUG "pid %d: %s: " fmt "\n" , current->pid , __func__ , ##args); } while(0)
+#define DPRINTK(fmt, args...) \
+do { \
+ printk(KERN_DEBUG "pid %d: %s: " fmt "\n", \
+ current->pid, __func__, ##args); \
+} while (0)
#else
-#define DPRINTK(fmt,args...) do {} while(0)
+#define DPRINTK(fmt, args...) do {} while (0)
#endif
+#define AUTOFS_WARN(fmt, args...) \
+do { \
+ printk(KERN_WARNING "pid %d: %s: " fmt "\n", \
+ current->pid, __func__, ##args); \
+} while (0)
+
+#define AUTOFS_ERROR(fmt, args...) \
+do { \
+ printk(KERN_ERR "pid %d: %s: " fmt "\n", \
+ current->pid, __func__, ##args); \
+} while (0)
+
/* Unified info structure. This is pointed to by both the dentry and
inode structures. Each file in the filesystem has an instance of this
structure. It holds a reference to the dentry, so dentries are never
@@ -61,6 +83,9 @@ struct autofs_info {
unsigned long last_used;
atomic_t count;
+ uid_t uid;
+ gid_t gid;
+
mode_t mode;
size_t size;
@@ -92,10 +117,6 @@ struct autofs_wait_queue {
#define AUTOFS_SBI_MAGIC 0x6d4a556d
-#define AUTOFS_TYPE_INDIRECT 0x0001
-#define AUTOFS_TYPE_DIRECT 0x0002
-#define AUTOFS_TYPE_OFFSET 0x0004
-
struct autofs_sb_info {
u32 magic;
int pipefd;
@@ -169,6 +190,17 @@ int autofs4_expire_run(struct super_block *, struct vfsmount *,
struct autofs_packet_expire __user *);
int autofs4_expire_multi(struct super_block *, struct vfsmount *,
struct autofs_sb_info *, int __user *);
+struct dentry *autofs4_expire_direct(struct super_block *sb,
+ struct vfsmount *mnt,
+ struct autofs_sb_info *sbi, int how);
+struct dentry *autofs4_expire_indirect(struct super_block *sb,
+ struct vfsmount *mnt,
+ struct autofs_sb_info *sbi, int how);
+
+/* Device node initialization */
+
+int autofs_dev_ioctl_init(void);
+void autofs_dev_ioctl_exit(void);
/* Operations structures */
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
new file mode 100644
index 000000000000..625abf5422e2
--- /dev/null
+++ b/fs/autofs4/dev-ioctl.c
@@ -0,0 +1,863 @@
+/*
+ * Copyright 2008 Red Hat, Inc. All rights reserved.
+ * Copyright 2008 Ian Kent <raven@themaw.net>
+ *
+ * This file is part of the Linux kernel and is made available under
+ * the terms of the GNU General Public License, version 2, or at your
+ * option, any later version, incorporated herein by reference.
+ */
+
+#include <linux/module.h>
+#include <linux/vmalloc.h>
+#include <linux/miscdevice.h>
+#include <linux/init.h>
+#include <linux/wait.h>
+#include <linux/namei.h>
+#include <linux/fcntl.h>
+#include <linux/file.h>
+#include <linux/fdtable.h>
+#include <linux/sched.h>
+#include <linux/compat.h>
+#include <linux/syscalls.h>
+#include <linux/smp_lock.h>
+#include <linux/magic.h>
+#include <linux/dcache.h>
+#include <linux/uaccess.h>
+
+#include "autofs_i.h"
+
+/*
+ * This module implements an interface for routing autofs ioctl control
+ * commands via a miscellaneous device file.
+ *
+ * The alternate interface is needed because we need to be able open
+ * an ioctl file descriptor on an autofs mount that may be covered by
+ * another mount. This situation arises when starting automount(8)
+ * or other user space daemon which uses direct mounts or offset
+ * mounts (used for autofs lazy mount/umount of nested mount trees),
+ * which have been left busy at at service shutdown.
+ */
+
+#define AUTOFS_DEV_IOCTL_SIZE sizeof(struct autofs_dev_ioctl)
+
+typedef int (*ioctl_fn)(struct file *, struct autofs_sb_info *,
+ struct autofs_dev_ioctl *);
+
+static int check_name(const char *name)
+{
+ if (!strchr(name, '/'))
+ return -EINVAL;
+ return 0;
+}
+
+/*
+ * Check a string doesn't overrun the chunk of
+ * memory we copied from user land.
+ */
+static int invalid_str(char *str, void *end)
+{
+ while ((void *) str <= end)
+ if (!*str++)
+ return 0;
+ return -EINVAL;
+}
+
+/*
+ * Check that the user compiled against correct version of autofs
+ * misc device code.
+ *
+ * As well as checking the version compatibility this always copies
+ * the kernel interface version out.
+ */
+static int check_dev_ioctl_version(int cmd, struct autofs_dev_ioctl *param)
+{
+ int err = 0;
+
+ if ((AUTOFS_DEV_IOCTL_VERSION_MAJOR != param->ver_major) ||
+ (AUTOFS_DEV_IOCTL_VERSION_MINOR < param->ver_minor)) {
+ AUTOFS_WARN("ioctl control interface version mismatch: "
+ "kernel(%u.%u), user(%u.%u), cmd(%d)",
+ AUTOFS_DEV_IOCTL_VERSION_MAJOR,
+ AUTOFS_DEV_IOCTL_VERSION_MINOR,
+ param->ver_major, param->ver_minor, cmd);
+ err = -EINVAL;
+ }
+
+ /* Fill in the kernel version. */
+ param->ver_major = AUTOFS_DEV_IOCTL_VERSION_MAJOR;
+ param->ver_minor = AUTOFS_DEV_IOCTL_VERSION_MINOR;
+
+ return err;
+}
+
+/*
+ * Copy parameter control struct, including a possible path allocated
+ * at the end of the struct.
+ */
+static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *in)
+{
+ struct autofs_dev_ioctl tmp, *ads;
+
+ if (copy_from_user(&tmp, in, sizeof(tmp)))
+ return ERR_PTR(-EFAULT);
+
+ if (tmp.size < sizeof(tmp))
+ return ERR_PTR(-EINVAL);
+
+ ads = kmalloc(tmp.size, GFP_KERNEL);
+ if (!ads)
+ return ERR_PTR(-ENOMEM);
+
+ if (copy_from_user(ads, in, tmp.size)) {
+ kfree(ads);
+ return ERR_PTR(-EFAULT);
+ }
+
+ return ads;
+}
+
+static inline void free_dev_ioctl(struct autofs_dev_ioctl *param)
+{
+ kfree(param);
+ return;
+}
+
+/*
+ * Check sanity of parameter control fields and if a path is present
+ * check that it has a "/" and is terminated.
+ */
+static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param)
+{
+ int err = -EINVAL;
+
+ if (check_dev_ioctl_version(cmd, param)) {
+ AUTOFS_WARN("invalid device control module version "
+ "supplied for cmd(0x%08x)", cmd);
+ goto out;
+ }
+
+ if (param->size > sizeof(*param)) {
+ err = check_name(param->path);
+ if (err) {
+ AUTOFS_WARN("invalid path supplied for cmd(0x%08x)",
+ cmd);
+ goto out;
+ }
+
+ err = invalid_str(param->path,
+ (void *) ((size_t) param + param->size));
+ if (err) {
+ AUTOFS_WARN("invalid path supplied for cmd(0x%08x)",
+ cmd);
+ goto out;
+ }
+ }
+
+ err = 0;
+out:
+ return err;
+}
+
+/*
+ * Get the autofs super block info struct from the file opened on
+ * the autofs mount point.
+ */
+static struct autofs_sb_info *autofs_dev_ioctl_sbi(struct file *f)
+{
+ struct autofs_sb_info *sbi = NULL;
+ struct inode *inode;
+
+ if (f) {
+ inode = f->f_path.dentry->d_inode;
+ sbi = autofs4_sbi(inode->i_sb);
+ }
+ return sbi;
+}
+
+/* Return autofs module protocol version */
+static int autofs_dev_ioctl_protover(struct file *fp,
+ struct autofs_sb_info *sbi,
+ struct autofs_dev_ioctl *param)
+{
+ param->arg1 = sbi->version;
+ return 0;
+}
+
+/* Return autofs module protocol sub version */
+static int autofs_dev_ioctl_protosubver(struct file *fp,
+ struct autofs_sb_info *sbi,
+ struct autofs_dev_ioctl *param)
+{
+ param->arg1 = sbi->sub_version;
+ return 0;
+}
+
+/*
+ * Walk down the mount stack looking for an autofs mount that
+ * has the requested device number (aka. new_encode_dev(sb->s_dev).
+ */
+static int autofs_dev_ioctl_find_super(struct nameidata *nd, dev_t devno)
+{
+ struct dentry *dentry;
+ struct inode *inode;
+ struct super_block *sb;
+ dev_t s_dev;
+ unsigned int err;
+
+ err = -ENOENT;
+
+ /* Lookup the dentry name at the base of our mount point */
+ dentry = d_lookup(nd->path.dentry, &nd->last);
+ if (!dentry)
+ goto out;
+
+ dput(nd->path.dentry);
+ nd->path.dentry = dentry;
+
+ /* And follow the mount stack looking for our autofs mount */
+ while (follow_down(&nd->path.mnt, &nd->path.dentry)) {
+ inode = nd->path.dentry->d_inode;
+ if (!inode)
+ break;
+
+ sb = inode->i_sb;
+ s_dev = new_encode_dev(sb->s_dev);
+ if (devno == s_dev) {
+ if (sb->s_magic == AUTOFS_SUPER_MAGIC) {
+ err = 0;
+ break;
+ }
+ }
+ }
+out:
+ return err;
+}
+
+/*
+ * Walk down the mount stack looking for an autofs mount that
+ * has the requested mount type (ie. indirect, direct or offset).
+ */
+static int autofs_dev_ioctl_find_sbi_type(struct nameidata *nd, unsigned int type)
+{
+ struct dentry *dentry;
+ struct autofs_info *ino;
+ unsigned int err;
+
+ err = -ENOENT;
+
+ /* Lookup the dentry name at the base of our mount point */
+ dentry = d_lookup(nd->path.dentry, &nd->last);
+ if (!dentry)
+ goto out;
+
+ dput(nd->path.dentry);
+ nd->path.dentry = dentry;
+
+ /* And follow the mount stack looking for our autofs mount */
+ while (follow_down(&nd->path.mnt, &nd->path.dentry)) {
+ ino = autofs4_dentry_ino(nd->path.dentry);
+ if (ino && ino->sbi->type & type) {
+ err = 0;
+ break;
+ }
+ }
+out:
+ return err;
+}
+
+static void autofs_dev_ioctl_fd_install(unsigned int fd, struct file *file)
+{
+ struct files_struct *files = current->files;
+ struct fdtable *fdt;
+
+ spin_lock(&files->file_lock);
+ fdt = files_fdtable(files);
+ BUG_ON(fdt->fd[fd] != NULL);
+ rcu_assign_pointer(fdt->fd[fd], file);
+ FD_SET(fd, fdt->close_on_exec);
+ spin_unlock(&files->file_lock);
+}
+
+
+/*
+ * Open a file descriptor on the autofs mount point corresponding
+ * to the given path and device number (aka. new_encode_dev(sb->s_dev)).
+ */
+static int autofs_dev_ioctl_open_mountpoint(const char *path, dev_t devid)
+{
+ struct file *filp;
+ struct nameidata nd;
+ int err, fd;
+
+ fd = get_unused_fd();
+ if (likely(fd >= 0)) {
+ /* Get nameidata of the parent directory */
+ err = path_lookup(path, LOOKUP_PARENT, &nd);
+ if (err)
+ goto out;
+
+ /*
+ * Search down, within the parent, looking for an
+ * autofs super block that has the device number
+ * corresponding to the autofs fs we want to open.
+ */
+ err = autofs_dev_ioctl_find_super(&nd, devid);
+ if (err) {
+ path_put(&nd.path);
+ goto out;
+ }
+
+ filp = dentry_open(nd.path.dentry, nd.path.mnt, O_RDONLY);
+ if (IS_ERR(filp)) {
+ err = PTR_ERR(filp);
+ goto out;
+ }
+
+ autofs_dev_ioctl_fd_install(fd, filp);
+ }
+
+ return fd;
+
+out:
+ put_unused_fd(fd);
+ return err;
+}
+
+/* Open a file descriptor on an autofs mount point */
+static int autofs_dev_ioctl_openmount(struct file *fp,
+ struct autofs_sb_info *sbi,
+ struct autofs_dev_ioctl *param)
+{
+ const char *path;
+ dev_t devid;
+ int err, fd;
+
+ /* param->path has already been checked */
+ if (!param->arg1)
+ return -EINVAL;
+
+ param->ioctlfd = -1;
+
+ path = param->path;
+ devid = param->arg1;
+
+ err = 0;
+ fd = autofs_dev_ioctl_open_mountpoint(path, devid);
+ if (unlikely(fd < 0)) {
+ err = fd;
+ goto out;
+ }
+
+ param->ioctlfd = fd;
+out:
+ return err;
+}
+
+/* Close file descriptor allocated above (user can also use close(2)). */
+static int autofs_dev_ioctl_closemount(struct file *fp,
+ struct autofs_sb_info *sbi,
+ struct autofs_dev_ioctl *param)
+{
+ return sys_close(param->ioctlfd);
+}
+
+/*
+ * Send "ready" status for an existing wait (either a mount or an expire
+ * request).
+ */
+static int autofs_dev_ioctl_ready(struct file *fp,
+ struct autofs_sb_info *sbi,
+ struct autofs_dev_ioctl *param)
+{
+ autofs_wqt_t token;
+
+ token = (autofs_wqt_t) param->arg1;
+ return autofs4_wait_release(sbi, token, 0);
+}
+
+/*
+ * Send "fail" status for an existing wait (either a mount or an expire
+ * request).
+ */
+static int autofs_dev_ioctl_fail(struct file *fp,
+ struct autofs_sb_info *sbi,
+ struct autofs_dev_ioctl *param)
+{
+ autofs_wqt_t token;
+ int status;
+
+ token = (autofs_wqt_t) param->arg1;
+ status = param->arg2 ? param->arg2 : -ENOENT;
+ return autofs4_wait_release(sbi, token, status);
+}
+
+/*
+ * Set the pipe fd for kernel communication to the daemon.
+ *
+ * Normally this is set at mount using an option but if we
+ * are reconnecting to a busy mount then we need to use this
+ * to tell the autofs mount about the new kernel pipe fd. In
+ * order to protect mounts against incorrectly setting the
+ * pipefd we also require that the autofs mount be catatonic.
+ *
+ * This also sets the process group id used to identify the
+ * controlling process (eg. the owning automount(8) daemon).
+ */
+static int autofs_dev_ioctl_setpipefd(struct file *fp,
+ struct autofs_sb_info *sbi,
+ struct autofs_dev_ioctl *param)
+{
+ int pipefd;
+ int err = 0;
+
+ if (param->arg1 == -1)
+ return -EINVAL;
+
+ pipefd = param->arg1;
+
+ mutex_lock(&sbi->wq_mutex);
+ if (!sbi->catatonic) {
+ mutex_unlock(&sbi->wq_mutex);
+ return -EBUSY;
+ } else {
+ struct file *pipe = fget(pipefd);
+ if (!pipe->f_op || !pipe->f_op->write) {
+ err = -EPIPE;
+ fput(pipe);
+ goto out;
+ }
+ sbi->oz_pgrp = task_pgrp_nr(current);
+ sbi->pipefd = pipefd;
+ sbi->pipe = pipe;
+ sbi->catatonic = 0;
+ }
+out:
+ mutex_unlock(&sbi->wq_mutex);
+ return err;
+}
+
+/*
+ * Make the autofs mount point catatonic, no longer responsive to
+ * mount requests. Also closes the kernel pipe file descriptor.
+ */
+static int autofs_dev_ioctl_catatonic(struct file *fp,
+ struct autofs_sb_info *sbi,
+ struct autofs_dev_ioctl *param)
+{
+ autofs4_catatonic_mode(sbi);
+ return 0;
+}
+
+/* Set the autofs mount timeout */
+static int autofs_dev_ioctl_timeout(struct file *fp,
+ struct autofs_sb_info *sbi,
+ struct autofs_dev_ioctl *param)
+{
+ unsigned long timeout;
+
+ timeout = param->arg1;
+ param->arg1 = sbi->exp_timeout / HZ;
+ sbi->exp_timeout = timeout * HZ;
+ return 0;
+}
+
+/*
+ * Return the uid and gid of the last request for the mount
+ *
+ * When reconstructing an autofs mount tree with active mounts
+ * we need to re-connect to mounts that may have used the original
+ * process uid and gid (or string variations of them) for mount
+ * lookups within the map entry.
+ */
+static int autofs_dev_ioctl_requester(struct file *fp,
+ struct autofs_sb_info *sbi,
+ struct autofs_dev_ioctl *param)
+{
+ struct autofs_info *ino;
+ struct nameidata nd;
+ const char *path;
+ dev_t devid;
+ int err = -ENOENT;
+
+ if (param->size <= sizeof(*param)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ path = param->path;
+ devid = sbi->sb->s_dev;
+
+ param->arg1 = param->arg2 = -1;
+
+ /* Get nameidata of the parent directory */
+ err = path_lookup(path, LOOKUP_PARENT, &nd);
+ if (err)
+ goto out;
+
+ err = autofs_dev_ioctl_find_super(&nd, devid);
+ if (err)
+ goto out_release;
+
+ ino = autofs4_dentry_ino(nd.path.dentry);
+ if (ino) {
+ err = 0;
+ autofs4_expire_wait(nd.path.dentry);
+ spin_lock(&sbi->fs_lock);
+ param->arg1 = ino->uid;
+ param->arg2 = ino->gid;
+ spin_unlock(&sbi->fs_lock);
+ }
+
+out_release:
+ path_put(&nd.path);
+out:
+ return err;
+}
+
+/*
+ * Call repeatedly until it returns -EAGAIN, meaning there's nothing
+ * more that can be done.
+ */
+static int autofs_dev_ioctl_expire(struct file *fp,
+ struct autofs_sb_info *sbi,
+ struct autofs_dev_ioctl *param)
+{
+ struct dentry *dentry;
+ struct vfsmount *mnt;
+ int err = -EAGAIN;
+ int how;
+
+ how = param->arg1;
+ mnt = fp->f_path.mnt;
+
+ if (sbi->type & AUTOFS_TYPE_TRIGGER)
+ dentry = autofs4_expire_direct(sbi->sb, mnt, sbi, how);
+ else
+ dentry = autofs4_expire_indirect(sbi->sb, mnt, sbi, how);
+
+ if (dentry) {
+ struct autofs_info *ino = autofs4_dentry_ino(dentry);
+
+ /*
+ * This is synchronous because it makes the daemon a
+ * little easier
+ */
+ err = autofs4_wait(sbi, dentry, NFY_EXPIRE);
+
+ spin_lock(&sbi->fs_lock);
+ if (ino->flags & AUTOFS_INF_MOUNTPOINT) {
+ ino->flags &= ~AUTOFS_INF_MOUNTPOINT;
+ sbi->sb->s_root->d_mounted++;
+ }
+ ino->flags &= ~AUTOFS_INF_EXPIRING;
+ complete_all(&ino->expire_complete);
+ spin_unlock(&sbi->fs_lock);
+ dput(dentry);
+ }
+
+ return err;
+}
+
+/* Check if autofs mount point is in use */
+static int autofs_dev_ioctl_askumount(struct file *fp,
+ struct autofs_sb_info *sbi,
+ struct autofs_dev_ioctl *param)
+{
+ param->arg1 = 0;
+ if (may_umount(fp->f_path.mnt))
+ param->arg1 = 1;
+ return 0;
+}
+
+/*
+ * Check if the given path is a mountpoint.
+ *
+ * If we are supplied with the file descriptor of an autofs
+ * mount we're looking for a specific mount. In this case
+ * the path is considered a mountpoint if it is itself a
+ * mountpoint or contains a mount, such as a multi-mount
+ * without a root mount. In this case we return 1 if the
+ * path is a mount point and the super magic of the covering
+ * mount if there is one or 0 if it isn't a mountpoint.
+ *
+ * If we aren't supplied with a file descriptor then we
+ * lookup the nameidata of the path and check if it is the
+ * root of a mount. If a type is given we are looking for
+ * a particular autofs mount and if we don't find a match
+ * we return fail. If the located nameidata path is the
+ * root of a mount we return 1 along with the super magic
+ * of the mount or 0 otherwise.
+ *
+ * In both cases the the device number (as returned by
+ * new_encode_dev()) is also returned.
+ */
+static int autofs_dev_ioctl_ismountpoint(struct file *fp,
+ struct autofs_sb_info *sbi,
+ struct autofs_dev_ioctl *param)
+{
+ struct nameidata nd;
+ const char *path;
+ unsigned int type;
+ int err = -ENOENT;
+
+ if (param->size <= sizeof(*param)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ path = param->path;
+ type = param->arg1;
+
+ param->arg1 = 0;
+ param->arg2 = 0;
+
+ if (!fp || param->ioctlfd == -1) {
+ if (type == AUTOFS_TYPE_ANY) {
+ struct super_block *sb;
+
+ err = path_lookup(path, LOOKUP_FOLLOW, &nd);
+ if (err)
+ goto out;
+
+ sb = nd.path.dentry->d_sb;
+ param->arg1 = new_encode_dev(sb->s_dev);
+ } else {
+ struct autofs_info *ino;
+
+ err = path_lookup(path, LOOKUP_PARENT, &nd);
+ if (err)
+ goto out;
+
+ err = autofs_dev_ioctl_find_sbi_type(&nd, type);
+ if (err)
+ goto out_release;
+
+ ino = autofs4_dentry_ino(nd.path.dentry);
+ param->arg1 = autofs4_get_dev(ino->sbi);
+ }
+
+ err = 0;
+ if (nd.path.dentry->d_inode &&
+ nd.path.mnt->mnt_root == nd.path.dentry) {
+ err = 1;
+ param->arg2 = nd.path.dentry->d_inode->i_sb->s_magic;
+ }
+ } else {
+ dev_t devid = new_encode_dev(sbi->sb->s_dev);
+
+ err = path_lookup(path, LOOKUP_PARENT, &nd);
+ if (err)
+ goto out;
+
+ err = autofs_dev_ioctl_find_super(&nd, devid);
+ if (err)
+ goto out_release;
+
+ param->arg1 = autofs4_get_dev(sbi);
+
+ err = have_submounts(nd.path.dentry);
+
+ if (nd.path.mnt->mnt_mountpoint != nd.path.mnt->mnt_root) {
+ if (follow_down(&nd.path.mnt, &nd.path.dentry)) {
+ struct inode *inode = nd.path.dentry->d_inode;
+ param->arg2 = inode->i_sb->s_magic;
+ }
+ }
+ }
+
+out_release:
+ path_put(&nd.path);
+out:
+ return err;
+}
+
+/*
+ * Our range of ioctl numbers isn't 0 based so we need to shift
+ * the array index by _IOC_NR(AUTOFS_CTL_IOC_FIRST) for the table
+ * lookup.
+ */
+#define cmd_idx(cmd) (cmd - _IOC_NR(AUTOFS_DEV_IOCTL_IOC_FIRST))
+
+static ioctl_fn lookup_dev_ioctl(unsigned int cmd)
+{
+ static struct {
+ int cmd;
+ ioctl_fn fn;
+ } _ioctls[] = {
+ {cmd_idx(AUTOFS_DEV_IOCTL_VERSION_CMD), NULL},
+ {cmd_idx(AUTOFS_DEV_IOCTL_PROTOVER_CMD),
+ autofs_dev_ioctl_protover},
+ {cmd_idx(AUTOFS_DEV_IOCTL_PROTOSUBVER_CMD),
+ autofs_dev_ioctl_protosubver},
+ {cmd_idx(AUTOFS_DEV_IOCTL_OPENMOUNT_CMD),
+ autofs_dev_ioctl_openmount},
+ {cmd_idx(AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD),
+ autofs_dev_ioctl_closemount},
+ {cmd_idx(AUTOFS_DEV_IOCTL_READY_CMD),
+ autofs_dev_ioctl_ready},
+ {cmd_idx(AUTOFS_DEV_IOCTL_FAIL_CMD),
+ autofs_dev_ioctl_fail},
+ {cmd_idx(AUTOFS_DEV_IOCTL_SETPIPEFD_CMD),
+ autofs_dev_ioctl_setpipefd},
+ {cmd_idx(AUTOFS_DEV_IOCTL_CATATONIC_CMD),
+ autofs_dev_ioctl_catatonic},
+ {cmd_idx(AUTOFS_DEV_IOCTL_TIMEOUT_CMD),
+ autofs_dev_ioctl_timeout},
+ {cmd_idx(AUTOFS_DEV_IOCTL_REQUESTER_CMD),
+ autofs_dev_ioctl_requester},
+ {cmd_idx(AUTOFS_DEV_IOCTL_EXPIRE_CMD),
+ autofs_dev_ioctl_expire},
+ {cmd_idx(AUTOFS_DEV_IOCTL_ASKUMOUNT_CMD),
+ autofs_dev_ioctl_askumount},
+ {cmd_idx(AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD),
+ autofs_dev_ioctl_ismountpoint}
+ };
+ unsigned int idx = cmd_idx(cmd);
+
+ return (idx >= ARRAY_SIZE(_ioctls)) ? NULL : _ioctls[idx].fn;
+}
+
+/* ioctl dispatcher */
+static int _autofs_dev_ioctl(unsigned int command, struct autofs_dev_ioctl __user *user)
+{
+ struct autofs_dev_ioctl *param;
+ struct file *fp;
+ struct autofs_sb_info *sbi;
+ unsigned int cmd_first, cmd;
+ ioctl_fn fn = NULL;
+ int err = 0;
+
+ /* only root can play with this */
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ cmd_first = _IOC_NR(AUTOFS_DEV_IOCTL_IOC_FIRST);
+ cmd = _IOC_NR(command);
+
+ if (_IOC_TYPE(command) != _IOC_TYPE(AUTOFS_DEV_IOCTL_IOC_FIRST) ||
+ cmd - cmd_first >= AUTOFS_DEV_IOCTL_IOC_COUNT) {
+ return -ENOTTY;
+ }
+
+ /* Copy the parameters into kernel space. */
+ param = copy_dev_ioctl(user);
+ if (IS_ERR(param))
+ return PTR_ERR(param);
+
+ err = validate_dev_ioctl(command, param);
+ if (err)
+ goto out;
+
+ /* The validate routine above always sets the version */
+ if (cmd == AUTOFS_DEV_IOCTL_VERSION_CMD)
+ goto done;
+
+ fn = lookup_dev_ioctl(cmd);
+ if (!fn) {
+ AUTOFS_WARN("unknown command 0x%08x", command);
+ return -ENOTTY;
+ }
+
+ fp = NULL;
+ sbi = NULL;
+
+ /*
+ * For obvious reasons the openmount can't have a file
+ * descriptor yet. We don't take a reference to the
+ * file during close to allow for immediate release.
+ */
+ if (cmd != AUTOFS_DEV_IOCTL_OPENMOUNT_CMD &&
+ cmd != AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD) {
+ fp = fget(param->ioctlfd);
+ if (!fp) {
+ if (cmd == AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD)
+ goto cont;
+ err = -EBADF;
+ goto out;
+ }
+
+ if (!fp->f_op) {
+ err = -ENOTTY;
+ fput(fp);
+ goto out;
+ }
+
+ sbi = autofs_dev_ioctl_sbi(fp);
+ if (!sbi || sbi->magic != AUTOFS_SBI_MAGIC) {
+ err = -EINVAL;
+ fput(fp);
+ goto out;
+ }
+
+ /*
+ * Admin needs to be able to set the mount catatonic in
+ * order to be able to perform the re-open.
+ */
+ if (!autofs4_oz_mode(sbi) &&
+ cmd != AUTOFS_DEV_IOCTL_CATATONIC_CMD) {
+ err = -EACCES;
+ fput(fp);
+ goto out;
+ }
+ }
+cont:
+ err = fn(fp, sbi, param);
+
+ if (fp)
+ fput(fp);
+done:
+ if (err >= 0 && copy_to_user(user, param, AUTOFS_DEV_IOCTL_SIZE))
+ err = -EFAULT;
+out:
+ free_dev_ioctl(param);
+ return err;
+}
+
+static long autofs_dev_ioctl(struct file *file, uint command, ulong u)
+{
+ int err;
+ err = _autofs_dev_ioctl(command, (struct autofs_dev_ioctl __user *) u);
+ return (long) err;
+}
+
+#ifdef CONFIG_COMPAT
+static long autofs_dev_ioctl_compat(struct file *file, uint command, ulong u)
+{
+ return (long) autofs_dev_ioctl(file, command, (ulong) compat_ptr(u));
+}
+#else
+#define autofs_dev_ioctl_compat NULL
+#endif
+
+static const struct file_operations _dev_ioctl_fops = {
+ .unlocked_ioctl = autofs_dev_ioctl,
+ .compat_ioctl = autofs_dev_ioctl_compat,
+ .owner = THIS_MODULE,
+};
+
+static struct miscdevice _autofs_dev_ioctl_misc = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = AUTOFS_DEVICE_NAME,
+ .fops = &_dev_ioctl_fops
+};
+
+/* Register/deregister misc character device */
+int autofs_dev_ioctl_init(void)
+{
+ int r;
+
+ r = misc_register(&_autofs_dev_ioctl_misc);
+ if (r) {
+ AUTOFS_ERROR("misc_register failed for control device");
+ return r;
+ }
+
+ return 0;
+}
+
+void autofs_dev_ioctl_exit(void)
+{
+ misc_deregister(&_autofs_dev_ioctl_misc);
+ return;
+}
+
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index cdabb796ff01..cde2f8e8935a 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -244,10 +244,10 @@ cont:
}
/* Check if we can expire a direct mount (possibly a tree) */
-static struct dentry *autofs4_expire_direct(struct super_block *sb,
- struct vfsmount *mnt,
- struct autofs_sb_info *sbi,
- int how)
+struct dentry *autofs4_expire_direct(struct super_block *sb,
+ struct vfsmount *mnt,
+ struct autofs_sb_info *sbi,
+ int how)
{
unsigned long timeout;
struct dentry *root = dget(sb->s_root);
@@ -283,10 +283,10 @@ static struct dentry *autofs4_expire_direct(struct super_block *sb,
* - it is unused by any user process
* - it has been unused for exp_timeout time
*/
-static struct dentry *autofs4_expire_indirect(struct super_block *sb,
- struct vfsmount *mnt,
- struct autofs_sb_info *sbi,
- int how)
+struct dentry *autofs4_expire_indirect(struct super_block *sb,
+ struct vfsmount *mnt,
+ struct autofs_sb_info *sbi,
+ int how)
{
unsigned long timeout;
struct dentry *root = sb->s_root;
@@ -479,7 +479,7 @@ int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt,
if (arg && get_user(do_now, arg))
return -EFAULT;
- if (sbi->type & AUTOFS_TYPE_DIRECT)
+ if (sbi->type & AUTOFS_TYPE_TRIGGER)
dentry = autofs4_expire_direct(sb, mnt, sbi, do_now);
else
dentry = autofs4_expire_indirect(sb, mnt, sbi, do_now);
diff --git a/fs/autofs4/init.c b/fs/autofs4/init.c
index 723a1c5e361b..9722e4bd8957 100644
--- a/fs/autofs4/init.c
+++ b/fs/autofs4/init.c
@@ -29,11 +29,20 @@ static struct file_system_type autofs_fs_type = {
static int __init init_autofs4_fs(void)
{
- return register_filesystem(&autofs_fs_type);
+ int err;
+
+ err = register_filesystem(&autofs_fs_type);
+ if (err)
+ return err;
+
+ autofs_dev_ioctl_init();
+
+ return err;
}
static void __exit exit_autofs4_fs(void)
{
+ autofs_dev_ioctl_exit();
unregister_filesystem(&autofs_fs_type);
}
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 45d55819203d..c7e65bb30ba0 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -53,6 +53,8 @@ struct autofs_info *autofs4_init_ino(struct autofs_info *ino,
atomic_set(&ino->count, 0);
}
+ ino->uid = 0;
+ ino->gid = 0;
ino->mode = mode;
ino->last_used = jiffies;
@@ -288,7 +290,7 @@ static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid,
*type = AUTOFS_TYPE_DIRECT;
break;
case Opt_offset:
- *type = AUTOFS_TYPE_DIRECT | AUTOFS_TYPE_OFFSET;
+ *type = AUTOFS_TYPE_OFFSET;
break;
default:
return 1;
@@ -336,7 +338,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
sbi->sb = s;
sbi->version = 0;
sbi->sub_version = 0;
- sbi->type = 0;
+ sbi->type = AUTOFS_TYPE_INDIRECT;
sbi->min_proto = 0;
sbi->max_proto = 0;
mutex_init(&sbi->wq_mutex);
@@ -378,7 +380,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
}
root_inode->i_fop = &autofs4_root_operations;
- root_inode->i_op = sbi->type & AUTOFS_TYPE_DIRECT ?
+ root_inode->i_op = sbi->type & AUTOFS_TYPE_TRIGGER ?
&autofs4_direct_root_inode_operations :
&autofs4_indirect_root_inode_operations;
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 35216d18d8b5..4b67c2a2d77c 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -337,7 +337,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
* is very similar for indirect mounts except only dentrys
* in the root of the autofs file system may be negative.
*/
- if (sbi->type & (AUTOFS_TYPE_DIRECT|AUTOFS_TYPE_OFFSET))
+ if (sbi->type & AUTOFS_TYPE_TRIGGER)
return -ENOENT;
else if (!IS_ROOT(dentry->d_parent))
return -ENOENT;
@@ -348,7 +348,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
return -ENOMEM;
/* If this is a direct mount request create a dummy name */
- if (IS_ROOT(dentry) && (sbi->type & AUTOFS_TYPE_DIRECT))
+ if (IS_ROOT(dentry) && sbi->type & AUTOFS_TYPE_TRIGGER)
qstr.len = sprintf(name, "%p", dentry);
else {
qstr.len = autofs4_getpath(sbi, dentry, &name);
@@ -406,11 +406,11 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
type = autofs_ptype_expire_multi;
} else {
if (notify == NFY_MOUNT)
- type = (sbi->type & AUTOFS_TYPE_DIRECT) ?
+ type = (sbi->type & AUTOFS_TYPE_TRIGGER) ?
autofs_ptype_missing_direct :
autofs_ptype_missing_indirect;
else
- type = (sbi->type & AUTOFS_TYPE_DIRECT) ?
+ type = (sbi->type & AUTOFS_TYPE_TRIGGER) ?
autofs_ptype_expire_direct :
autofs_ptype_expire_indirect;
}
@@ -457,6 +457,40 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
status = wq->status;
+ /*
+ * For direct and offset mounts we need to track the requester's
+ * uid and gid in the dentry info struct. This is so it can be
+ * supplied, on request, by the misc device ioctl interface.
+ * This is needed during daemon resatart when reconnecting
+ * to existing, active, autofs mounts. The uid and gid (and
+ * related string values) may be used for macro substitution
+ * in autofs mount maps.
+ */
+ if (!status) {
+ struct autofs_info *ino;
+ struct dentry *de = NULL;
+
+ /* direct mount or browsable map */
+ ino = autofs4_dentry_ino(dentry);
+ if (!ino) {
+ /* If not lookup actual dentry used */
+ de = d_lookup(dentry->d_parent, &dentry->d_name);
+ if (de)
+ ino = autofs4_dentry_ino(de);
+ }
+
+ /* Set mount requester */
+ if (ino) {
+ spin_lock(&sbi->fs_lock);
+ ino->uid = wq->uid;
+ ino->gid = wq->gid;
+ spin_unlock(&sbi->fs_lock);
+ }
+
+ if (de)
+ dput(de);
+ }
+
/* Are we the last process to need status? */
mutex_lock(&sbi->wq_mutex);
if (!--wq->wait_ctr)
diff --git a/fs/befs/befs_fs_types.h b/fs/befs/befs_fs_types.h
index e2595c2c403a..7893eaa1e58c 100644
--- a/fs/befs/befs_fs_types.h
+++ b/fs/befs/befs_fs_types.h
@@ -55,8 +55,12 @@ enum super_flags {
};
#define BEFS_BYTEORDER_NATIVE 0x42494745
+#define BEFS_BYTEORDER_NATIVE_LE (__force fs32)cpu_to_le32(BEFS_BYTEORDER_NATIVE)
+#define BEFS_BYTEORDER_NATIVE_BE (__force fs32)cpu_to_be32(BEFS_BYTEORDER_NATIVE)
#define BEFS_SUPER_MAGIC BEFS_SUPER_MAGIC1
+#define BEFS_SUPER_MAGIC1_LE (__force fs32)cpu_to_le32(BEFS_SUPER_MAGIC1)
+#define BEFS_SUPER_MAGIC1_BE (__force fs32)cpu_to_be32(BEFS_SUPER_MAGIC1)
/*
* Flags of inode
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 9286b2af893a..b6dfee37c7b7 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -809,8 +809,8 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
/* account for offset of super block on x86 */
disk_sb = (befs_super_block *) bh->b_data;
- if ((le32_to_cpu(disk_sb->magic1) == BEFS_SUPER_MAGIC1) ||
- (be32_to_cpu(disk_sb->magic1) == BEFS_SUPER_MAGIC1)) {
+ if ((disk_sb->magic1 == BEFS_SUPER_MAGIC1_LE) ||
+ (disk_sb->magic1 == BEFS_SUPER_MAGIC1_BE)) {
befs_debug(sb, "Using PPC superblock location");
} else {
befs_debug(sb, "Using x86 superblock location");
diff --git a/fs/befs/super.c b/fs/befs/super.c
index 8c3401ff6d6a..41f2b4d0093e 100644
--- a/fs/befs/super.c
+++ b/fs/befs/super.c
@@ -26,10 +26,10 @@ befs_load_sb(struct super_block *sb, befs_super_block * disk_sb)
befs_sb_info *befs_sb = BEFS_SB(sb);
/* Check the byte order of the filesystem */
- if (le32_to_cpu(disk_sb->fs_byte_order) == BEFS_BYTEORDER_NATIVE)
+ if (disk_sb->fs_byte_order == BEFS_BYTEORDER_NATIVE_LE)
befs_sb->byte_order = BEFS_BYTESEX_LE;
- else if (be32_to_cpu(disk_sb->fs_byte_order) == BEFS_BYTEORDER_NATIVE)
- befs_sb->byte_order = BEFS_BYTESEX_BE;
+ else if (disk_sb->fs_byte_order == BEFS_BYTEORDER_NATIVE_BE)
+ befs_sb->byte_order = BEFS_BYTESEX_BE;
befs_sb->magic1 = fs32_to_cpu(sb, disk_sb->magic1);
befs_sb->magic2 = fs32_to_cpu(sb, disk_sb->magic2);
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index a8635f637038..83d72006e29d 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -683,7 +683,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
* switch really is going to happen - do this in
* flush_thread(). - akpm
*/
- SET_PERSONALITY(loc->elf_ex, 0);
+ SET_PERSONALITY(loc->elf_ex);
interpreter = open_exec(elf_interpreter);
retval = PTR_ERR(interpreter);
@@ -734,7 +734,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
goto out_free_dentry;
} else {
/* Executables without an interpreter also need a personality */
- SET_PERSONALITY(loc->elf_ex, 0);
+ SET_PERSONALITY(loc->elf_ex);
}
/* Flush all traces of the currently running executable */
@@ -748,7 +748,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
/* Do this immediately, since STACK_TOP as used in setup_arg_pages
may depend on the personality. */
- SET_PERSONALITY(loc->elf_ex, 0);
+ SET_PERSONALITY(loc->elf_ex);
if (elf_read_implies_exec(loc->elf_ex, executable_stack))
current->personality |= READ_IMPLIES_EXEC;
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 80c1f952ef78..0e8367c54624 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -25,6 +25,7 @@
#include <linux/fcntl.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
+#include <linux/security.h>
#include <linux/highmem.h>
#include <linux/highuid.h>
#include <linux/personality.h>
@@ -455,8 +456,19 @@ error_kill:
}
/*****************************************************************************/
+
+#ifndef ELF_BASE_PLATFORM
/*
- * present useful information to the program
+ * AT_BASE_PLATFORM indicates the "real" hardware/microarchitecture.
+ * If the arch defines ELF_BASE_PLATFORM (in asm/elf.h), the value
+ * will be copied to the user stack in the same manner as AT_PLATFORM.
+ */
+#define ELF_BASE_PLATFORM NULL
+#endif
+
+/*
+ * present useful information to the program by shovelling it onto the new
+ * process's stack
*/
static int create_elf_fdpic_tables(struct linux_binprm *bprm,
struct mm_struct *mm,
@@ -466,15 +478,19 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
unsigned long sp, csp, nitems;
elf_caddr_t __user *argv, *envp;
size_t platform_len = 0, len;
- char *k_platform;
- char __user *u_platform, *p;
+ char *k_platform, *k_base_platform;
+ char __user *u_platform, *u_base_platform, *p;
long hwcap;
int loop;
int nr; /* reset for each csp adjustment */
- /* we're going to shovel a whole load of stuff onto the stack */
#ifdef CONFIG_MMU
- sp = bprm->p;
+ /* In some cases (e.g. Hyper-Threading), we want to avoid L1 evictions
+ * by the processes running on the same package. One thing we can do is
+ * to shuffle the initial stack for them, so we give the architecture
+ * an opportunity to do so here.
+ */
+ sp = arch_align_stack(bprm->p);
#else
sp = mm->start_stack;
@@ -483,11 +499,14 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
return -EFAULT;
#endif
- /* get hold of platform and hardware capabilities masks for the machine
- * we are running on. In some cases (Sparc), this info is impossible
- * to get, in others (i386) it is merely difficult.
- */
hwcap = ELF_HWCAP;
+
+ /*
+ * If this architecture has a platform capability string, copy it
+ * to userspace. In some cases (Sparc), this info is impossible
+ * for userspace to get any other way, in others (i386) it is
+ * merely difficult.
+ */
k_platform = ELF_PLATFORM;
u_platform = NULL;
@@ -499,19 +518,20 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
return -EFAULT;
}
-#if defined(__i386__) && defined(CONFIG_SMP)
- /* in some cases (e.g. Hyper-Threading), we want to avoid L1 evictions
- * by the processes running on the same package. One thing we can do is
- * to shuffle the initial stack for them.
- *
- * the conditionals here are unneeded, but kept in to make the code
- * behaviour the same as pre change unless we have hyperthreaded
- * processors. This keeps Mr Marcelo Person happier but should be
- * removed for 2.5
+ /*
+ * If this architecture has a "base" platform capability
+ * string, copy it to userspace.
*/
- if (smp_num_siblings > 1)
- sp = sp - ((current->pid % 64) << 7);
-#endif
+ k_base_platform = ELF_BASE_PLATFORM;
+ u_base_platform = NULL;
+
+ if (k_base_platform) {
+ platform_len = strlen(k_base_platform) + 1;
+ sp -= platform_len;
+ u_base_platform = (char __user *) sp;
+ if (__copy_to_user(u_base_platform, k_base_platform, platform_len) != 0)
+ return -EFAULT;
+ }
sp &= ~7UL;
@@ -541,9 +561,13 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
}
/* force 16 byte _final_ alignment here for generality */
-#define DLINFO_ITEMS 13
+#define DLINFO_ITEMS 15
+
+ nitems = 1 + DLINFO_ITEMS + (k_platform ? 1 : 0) +
+ (k_base_platform ? 1 : 0) + AT_VECTOR_SIZE_ARCH;
- nitems = 1 + DLINFO_ITEMS + (k_platform ? 1 : 0) + AT_VECTOR_SIZE_ARCH;
+ if (bprm->interp_flags & BINPRM_FLAGS_EXECFD)
+ nitems++;
csp = sp;
sp -= nitems * 2 * sizeof(unsigned long);
@@ -575,6 +599,19 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
(elf_addr_t) (unsigned long) u_platform);
}
+ if (k_base_platform) {
+ nr = 0;
+ csp -= 2 * sizeof(unsigned long);
+ NEW_AUX_ENT(AT_BASE_PLATFORM,
+ (elf_addr_t) (unsigned long) u_base_platform);
+ }
+
+ if (bprm->interp_flags & BINPRM_FLAGS_EXECFD) {
+ nr = 0;
+ csp -= 2 * sizeof(unsigned long);
+ NEW_AUX_ENT(AT_EXECFD, bprm->interp_data);
+ }
+
nr = 0;
csp -= DLINFO_ITEMS * 2 * sizeof(unsigned long);
NEW_AUX_ENT(AT_HWCAP, hwcap);
@@ -590,6 +627,8 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
NEW_AUX_ENT(AT_EUID, (elf_addr_t) current->euid);
NEW_AUX_ENT(AT_GID, (elf_addr_t) current->gid);
NEW_AUX_ENT(AT_EGID, (elf_addr_t) current->egid);
+ NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm));
+ NEW_AUX_ENT(AT_EXECFN, bprm->exec);
#ifdef ARCH_DLINFO
nr = 0;
diff --git a/fs/binfmt_em86.c b/fs/binfmt_em86.c
index f9c88d0c8ced..32fb00b52cd0 100644
--- a/fs/binfmt_em86.c
+++ b/fs/binfmt_em86.c
@@ -43,7 +43,7 @@ static int load_em86(struct linux_binprm *bprm,struct pt_regs *regs)
return -ENOEXEC;
}
- bprm->sh_bang = 1; /* Well, the bang-shell is implicit... */
+ bprm->recursion_depth++; /* Well, the bang-shell is implicit... */
allow_write_access(bprm->file);
fput(bprm->file);
bprm->file = NULL;
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index dfc0197905ca..ccb781a6a804 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -229,13 +229,13 @@ static int decompress_exec(
ret = 10;
if (buf[3] & EXTRA_FIELD) {
ret += 2 + buf[10] + (buf[11] << 8);
- if (unlikely(LBUFSIZE == ret)) {
+ if (unlikely(LBUFSIZE <= ret)) {
DBG_FLT("binfmt_flat: buffer overflow (EXTRA)?\n");
goto out_free_buf;
}
}
if (buf[3] & ORIG_NAME) {
- for (; ret < LBUFSIZE && (buf[ret] != 0); ret++)
+ while (ret < LBUFSIZE && buf[ret++] != 0)
;
if (unlikely(LBUFSIZE == ret)) {
DBG_FLT("binfmt_flat: buffer overflow (ORIG_NAME)?\n");
@@ -243,7 +243,7 @@ static int decompress_exec(
}
}
if (buf[3] & COMMENT) {
- for (; ret < LBUFSIZE && (buf[ret] != 0); ret++)
+ while (ret < LBUFSIZE && buf[ret++] != 0)
;
if (unlikely(LBUFSIZE == ret)) {
DBG_FLT("binfmt_flat: buffer overflow (COMMENT)?\n");
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 8d7e88e02e0f..f2744ab4e5b3 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -117,7 +117,7 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs)
goto _ret;
retval = -ENOEXEC;
- if (bprm->misc_bang)
+ if (bprm->recursion_depth > BINPRM_MAX_RECURSION)
goto _ret;
/* to keep locking time low, we copy the interpreter string */
@@ -197,7 +197,7 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs)
if (retval < 0)
goto _error;
- bprm->misc_bang = 1;
+ bprm->recursion_depth++;
retval = search_binary_handler (bprm, regs);
if (retval < 0)
diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c
index 9e3963f7ebf1..08343505e184 100644
--- a/fs/binfmt_script.c
+++ b/fs/binfmt_script.c
@@ -22,14 +22,15 @@ static int load_script(struct linux_binprm *bprm,struct pt_regs *regs)
char interp[BINPRM_BUF_SIZE];
int retval;
- if ((bprm->buf[0] != '#') || (bprm->buf[1] != '!') || (bprm->sh_bang))
+ if ((bprm->buf[0] != '#') || (bprm->buf[1] != '!') ||
+ (bprm->recursion_depth > BINPRM_MAX_RECURSION))
return -ENOEXEC;
/*
* This section does the #! interpretation.
* Sorta complicated, but hopefully it will work. -TYT
*/
- bprm->sh_bang = 1;
+ bprm->recursion_depth++;
allow_write_access(bprm->file);
fput(bprm->file);
bprm->file = NULL;
diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c
index 68be580ba289..74e587a52796 100644
--- a/fs/binfmt_som.c
+++ b/fs/binfmt_som.c
@@ -306,3 +306,5 @@ static void __exit exit_som_binfmt(void)
core_initcall(init_som_binfmt);
module_exit(exit_som_binfmt);
+
+MODULE_LICENSE("GPL");
diff --git a/fs/block_dev.c b/fs/block_dev.c
index d84f0469a016..218408eed1bb 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1262,7 +1262,7 @@ EXPORT_SYMBOL(ioctl_by_bdev);
/**
* lookup_bdev - lookup a struct block_device by name
- * @pathname: special file representing the block device
+ * @path: special file representing the block device
*
* Get a reference to the blockdevice at @pathname in the current
* namespace if possible and return it. Return ERR_PTR(error)
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 3cb7cda3d780..262fa10e213d 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -22,9 +22,6 @@
#include <linux/mutex.h>
#include <linux/backing-dev.h>
-#ifdef CONFIG_KMOD
-#include <linux/kmod.h>
-#endif
#include "internal.h"
/*
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index 0d9b80ec689c..cfd29da714d1 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -362,9 +362,8 @@ static int init_coda_psdev(void)
goto out_chrdev;
}
for (i = 0; i < MAX_CODADEVS; i++)
- device_create_drvdata(coda_psdev_class, NULL,
- MKDEV(CODA_PSDEV_MAJOR, i),
- NULL, "cfs%d", i);
+ device_create(coda_psdev_class, NULL,
+ MKDEV(CODA_PSDEV_MAJOR, i), NULL, "cfs%d", i);
coda_sysctl_init();
goto out;
diff --git a/fs/compat.c b/fs/compat.c
index 133ed7f5d681..3b58c32be526 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -137,6 +137,45 @@ asmlinkage long compat_sys_utimes(char __user *filename, struct compat_timeval _
return compat_sys_futimesat(AT_FDCWD, filename, t);
}
+static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf)
+{
+ compat_ino_t ino = stat->ino;
+ typeof(ubuf->st_uid) uid = 0;
+ typeof(ubuf->st_gid) gid = 0;
+ int err;
+
+ SET_UID(uid, stat->uid);
+ SET_GID(gid, stat->gid);
+
+ if ((u64) stat->size > MAX_NON_LFS ||
+ !old_valid_dev(stat->dev) ||
+ !old_valid_dev(stat->rdev))
+ return -EOVERFLOW;
+ if (sizeof(ino) < sizeof(stat->ino) && ino != stat->ino)
+ return -EOVERFLOW;
+
+ if (clear_user(ubuf, sizeof(*ubuf)))
+ return -EFAULT;
+
+ err = __put_user(old_encode_dev(stat->dev), &ubuf->st_dev);
+ err |= __put_user(ino, &ubuf->st_ino);
+ err |= __put_user(stat->mode, &ubuf->st_mode);
+ err |= __put_user(stat->nlink, &ubuf->st_nlink);
+ err |= __put_user(uid, &ubuf->st_uid);
+ err |= __put_user(gid, &ubuf->st_gid);
+ err |= __put_user(old_encode_dev(stat->rdev), &ubuf->st_rdev);
+ err |= __put_user(stat->size, &ubuf->st_size);
+ err |= __put_user(stat->atime.tv_sec, &ubuf->st_atime);
+ err |= __put_user(stat->atime.tv_nsec, &ubuf->st_atime_nsec);
+ err |= __put_user(stat->mtime.tv_sec, &ubuf->st_mtime);
+ err |= __put_user(stat->mtime.tv_nsec, &ubuf->st_mtime_nsec);
+ err |= __put_user(stat->ctime.tv_sec, &ubuf->st_ctime);
+ err |= __put_user(stat->ctime.tv_nsec, &ubuf->st_ctime_nsec);
+ err |= __put_user(stat->blksize, &ubuf->st_blksize);
+ err |= __put_user(stat->blocks, &ubuf->st_blocks);
+ return err;
+}
+
asmlinkage long compat_sys_newstat(char __user * filename,
struct compat_stat __user *statbuf)
{
@@ -1239,7 +1278,7 @@ static int compat_count(compat_uptr_t __user *argv, int max)
if (!p)
break;
argv++;
- if(++i > max)
+ if (i++ >= max)
return -E2BIG;
}
}
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 9606ee848fd8..af0558dbe8b7 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -5,11 +5,11 @@
*
* O_DIRECT
*
- * 04Jul2002 akpm@zip.com.au
+ * 04Jul2002 Andrew Morton
* Initial version
* 11Sep2002 janetinc@us.ibm.com
* added readv/writev support.
- * 29Oct2002 akpm@zip.com.au
+ * 29Oct2002 Andrew Morton
* rewrote bio_add_page() support.
* 30Oct2002 pbadari@us.ibm.com
* added support for non-aligned IO.
diff --git a/fs/dquot.c b/fs/dquot.c
index ad7e59003e04..da30a27f2242 100644
--- a/fs/dquot.c
+++ b/fs/dquot.c
@@ -9,8 +9,6 @@
* implementation is based on one of the several variants of the LINUX
* inode-subsystem with added complexity of the diskquota system.
*
- * Version: $Id: dquot.c,v 6.3 1996/11/17 18:35:34 mvw Exp mvw $
- *
* Author: Marco van Wieringen <mvw@planets.elm.net>
*
* Fixes: Dmitry Gorodchanin <pgmdsg@ibi.com>, 11 Feb 96
diff --git a/fs/ecryptfs/Makefile b/fs/ecryptfs/Makefile
index b4755a85996e..2cc9ee4ad2eb 100644
--- a/fs/ecryptfs/Makefile
+++ b/fs/ecryptfs/Makefile
@@ -4,4 +4,4 @@
obj-$(CONFIG_ECRYPT_FS) += ecryptfs.o
-ecryptfs-objs := dentry.o file.o inode.o main.o super.o mmap.o read_write.o crypto.o keystore.o messaging.o netlink.o miscdev.o kthread.o debug.o
+ecryptfs-objs := dentry.o file.o inode.o main.o super.o mmap.o read_write.o crypto.o keystore.o messaging.o miscdev.o kthread.o debug.o
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index b73fb752c5f8..3504cf9df358 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -79,11 +79,6 @@
#define ECRYPTFS_MAX_PKI_NAME_BYTES 16
#define ECRYPTFS_DEFAULT_NUM_USERS 4
#define ECRYPTFS_MAX_NUM_USERS 32768
-#define ECRYPTFS_TRANSPORT_NETLINK 0
-#define ECRYPTFS_TRANSPORT_CONNECTOR 1
-#define ECRYPTFS_TRANSPORT_RELAYFS 2
-#define ECRYPTFS_TRANSPORT_MISCDEV 3
-#define ECRYPTFS_DEFAULT_TRANSPORT ECRYPTFS_TRANSPORT_MISCDEV
#define ECRYPTFS_XATTR_NAME "user.ecryptfs"
#define RFC2440_CIPHER_DES3_EDE 0x02
@@ -400,8 +395,6 @@ struct ecryptfs_msg_ctx {
struct mutex mux;
};
-extern unsigned int ecryptfs_transport;
-
struct ecryptfs_daemon;
struct ecryptfs_daemon {
@@ -627,31 +620,20 @@ int
ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value,
size_t size, int flags);
int ecryptfs_read_xattr_region(char *page_virt, struct inode *ecryptfs_inode);
-int ecryptfs_process_helo(unsigned int transport, uid_t euid,
- struct user_namespace *user_ns, struct pid *pid);
+int ecryptfs_process_helo(uid_t euid, struct user_namespace *user_ns,
+ struct pid *pid);
int ecryptfs_process_quit(uid_t euid, struct user_namespace *user_ns,
struct pid *pid);
int ecryptfs_process_response(struct ecryptfs_message *msg, uid_t euid,
struct user_namespace *user_ns, struct pid *pid,
u32 seq);
-int ecryptfs_send_message(unsigned int transport, char *data, int data_len,
+int ecryptfs_send_message(char *data, int data_len,
struct ecryptfs_msg_ctx **msg_ctx);
int ecryptfs_wait_for_response(struct ecryptfs_msg_ctx *msg_ctx,
struct ecryptfs_message **emsg);
-int ecryptfs_init_messaging(unsigned int transport);
-void ecryptfs_release_messaging(unsigned int transport);
+int ecryptfs_init_messaging(void);
+void ecryptfs_release_messaging(void);
-int ecryptfs_send_netlink(char *data, int data_len,
- struct ecryptfs_msg_ctx *msg_ctx, u8 msg_type,
- u16 msg_flags, struct pid *daemon_pid);
-int ecryptfs_init_netlink(void);
-void ecryptfs_release_netlink(void);
-
-int ecryptfs_send_connector(char *data, int data_len,
- struct ecryptfs_msg_ctx *msg_ctx, u8 msg_type,
- u16 msg_flags, struct pid *daemon_pid);
-int ecryptfs_init_connector(void);
-void ecryptfs_release_connector(void);
void
ecryptfs_write_header_metadata(char *virt,
struct ecryptfs_crypt_stat *crypt_stat,
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 9244d653743e..eb3dc4c7ac06 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -71,12 +71,11 @@ struct ecryptfs_getdents_callback {
void *dirent;
struct dentry *dentry;
filldir_t filldir;
- int err;
int filldir_called;
int entries_written;
};
-/* Inspired by generic filldir in fs/readir.c */
+/* Inspired by generic filldir in fs/readdir.c */
static int
ecryptfs_filldir(void *dirent, const char *name, int namelen, loff_t offset,
u64 ino, unsigned int d_type)
@@ -125,18 +124,18 @@ static int ecryptfs_readdir(struct file *file, void *dirent, filldir_t filldir)
buf.dirent = dirent;
buf.dentry = file->f_path.dentry;
buf.filldir = filldir;
-retry:
buf.filldir_called = 0;
buf.entries_written = 0;
- buf.err = 0;
rc = vfs_readdir(lower_file, ecryptfs_filldir, (void *)&buf);
- if (buf.err)
- rc = buf.err;
- if (buf.filldir_called && !buf.entries_written)
- goto retry;
file->f_pos = lower_file->f_pos;
+ if (rc < 0)
+ goto out;
+ if (buf.filldir_called && !buf.entries_written)
+ goto out;
if (rc >= 0)
- fsstack_copy_attr_atime(inode, lower_file->f_path.dentry->d_inode);
+ fsstack_copy_attr_atime(inode,
+ lower_file->f_path.dentry->d_inode);
+out:
return rc;
}
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index f5b76a331b9c..e22bc3961345 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -234,8 +234,8 @@ parse_tag_65_packet(struct ecryptfs_session_key *session_key, u8 *cipher_code,
}
i += data_len;
if (message_len < (i + m_size)) {
- ecryptfs_printk(KERN_ERR, "The received netlink message is "
- "shorter than expected\n");
+ ecryptfs_printk(KERN_ERR, "The message received from ecryptfsd "
+ "is shorter than expected\n");
rc = -EIO;
goto out;
}
@@ -438,8 +438,8 @@ decrypt_pki_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok,
struct ecryptfs_msg_ctx *msg_ctx;
struct ecryptfs_message *msg = NULL;
char *auth_tok_sig;
- char *netlink_message;
- size_t netlink_message_length;
+ char *payload;
+ size_t payload_len;
int rc;
rc = ecryptfs_get_auth_tok_sig(&auth_tok_sig, auth_tok);
@@ -449,15 +449,15 @@ decrypt_pki_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok,
goto out;
}
rc = write_tag_64_packet(auth_tok_sig, &(auth_tok->session_key),
- &netlink_message, &netlink_message_length);
+ &payload, &payload_len);
if (rc) {
ecryptfs_printk(KERN_ERR, "Failed to write tag 64 packet\n");
goto out;
}
- rc = ecryptfs_send_message(ecryptfs_transport, netlink_message,
- netlink_message_length, &msg_ctx);
+ rc = ecryptfs_send_message(payload, payload_len, &msg_ctx);
if (rc) {
- ecryptfs_printk(KERN_ERR, "Error sending netlink message\n");
+ ecryptfs_printk(KERN_ERR, "Error sending message to "
+ "ecryptfsd\n");
goto out;
}
rc = ecryptfs_wait_for_response(msg_ctx, &msg);
@@ -1333,23 +1333,22 @@ pki_encrypt_session_key(struct ecryptfs_auth_tok *auth_tok,
struct ecryptfs_key_record *key_rec)
{
struct ecryptfs_msg_ctx *msg_ctx = NULL;
- char *netlink_payload;
- size_t netlink_payload_length;
+ char *payload = NULL;
+ size_t payload_len;
struct ecryptfs_message *msg;
int rc;
rc = write_tag_66_packet(auth_tok->token.private_key.signature,
ecryptfs_code_for_cipher_string(crypt_stat),
- crypt_stat, &netlink_payload,
- &netlink_payload_length);
+ crypt_stat, &payload, &payload_len);
if (rc) {
ecryptfs_printk(KERN_ERR, "Error generating tag 66 packet\n");
goto out;
}
- rc = ecryptfs_send_message(ecryptfs_transport, netlink_payload,
- netlink_payload_length, &msg_ctx);
+ rc = ecryptfs_send_message(payload, payload_len, &msg_ctx);
if (rc) {
- ecryptfs_printk(KERN_ERR, "Error sending netlink message\n");
+ ecryptfs_printk(KERN_ERR, "Error sending message to "
+ "ecryptfsd\n");
goto out;
}
rc = ecryptfs_wait_for_response(msg_ctx, &msg);
@@ -1364,8 +1363,7 @@ pki_encrypt_session_key(struct ecryptfs_auth_tok *auth_tok,
ecryptfs_printk(KERN_ERR, "Error parsing tag 67 packet\n");
kfree(msg);
out:
- if (netlink_payload)
- kfree(netlink_payload);
+ kfree(payload);
return rc;
}
/**
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 8ebe9a5d1d99..046e027a4cb1 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -30,7 +30,6 @@
#include <linux/namei.h>
#include <linux/skbuff.h>
#include <linux/crypto.h>
-#include <linux/netlink.h>
#include <linux/mount.h>
#include <linux/pagemap.h>
#include <linux/key.h>
@@ -49,8 +48,7 @@ MODULE_PARM_DESC(ecryptfs_verbosity,
"0, which is Quiet)");
/**
- * Module parameter that defines the number of netlink message buffer
- * elements
+ * Module parameter that defines the number of message buffer elements
*/
unsigned int ecryptfs_message_buf_len = ECRYPTFS_DEFAULT_MSG_CTX_ELEMS;
@@ -60,9 +58,9 @@ MODULE_PARM_DESC(ecryptfs_message_buf_len,
/**
* Module parameter that defines the maximum guaranteed amount of time to wait
- * for a response through netlink. The actual sleep time will be, more than
+ * for a response from ecryptfsd. The actual sleep time will be, more than
* likely, a small amount greater than this specified value, but only less if
- * the netlink message successfully arrives.
+ * the message successfully arrives.
*/
signed long ecryptfs_message_wait_timeout = ECRYPTFS_MAX_MSG_CTX_TTL / HZ;
@@ -83,8 +81,6 @@ module_param(ecryptfs_number_of_users, uint, 0);
MODULE_PARM_DESC(ecryptfs_number_of_users, "An estimate of the number of "
"concurrent users of eCryptfs");
-unsigned int ecryptfs_transport = ECRYPTFS_DEFAULT_TRANSPORT;
-
void __ecryptfs_printk(const char *fmt, ...)
{
va_list args;
@@ -779,10 +775,11 @@ static int __init ecryptfs_init(void)
"rc = [%d]\n", __func__, rc);
goto out_do_sysfs_unregistration;
}
- rc = ecryptfs_init_messaging(ecryptfs_transport);
+ rc = ecryptfs_init_messaging();
if (rc) {
printk(KERN_ERR "Failure occured while attempting to "
- "initialize the eCryptfs netlink socket\n");
+ "initialize the communications channel to "
+ "ecryptfsd\n");
goto out_destroy_kthread;
}
rc = ecryptfs_init_crypto();
@@ -797,7 +794,7 @@ static int __init ecryptfs_init(void)
goto out;
out_release_messaging:
- ecryptfs_release_messaging(ecryptfs_transport);
+ ecryptfs_release_messaging();
out_destroy_kthread:
ecryptfs_destroy_kthread();
out_do_sysfs_unregistration:
@@ -818,7 +815,7 @@ static void __exit ecryptfs_exit(void)
if (rc)
printk(KERN_ERR "Failure whilst attempting to destroy crypto; "
"rc = [%d]\n", rc);
- ecryptfs_release_messaging(ecryptfs_transport);
+ ecryptfs_release_messaging();
ecryptfs_destroy_kthread();
do_sysfs_unregistration();
unregister_filesystem(&ecryptfs_fs_type);
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index 1b5c20058acb..c6983978a31e 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -134,12 +134,11 @@ out:
}
static int
-ecryptfs_send_message_locked(unsigned int transport, char *data, int data_len,
- u8 msg_type, struct ecryptfs_msg_ctx **msg_ctx);
+ecryptfs_send_message_locked(char *data, int data_len, u8 msg_type,
+ struct ecryptfs_msg_ctx **msg_ctx);
/**
* ecryptfs_send_raw_message
- * @transport: Transport type
* @msg_type: Message type
* @daemon: Daemon struct for recipient of message
*
@@ -150,38 +149,25 @@ ecryptfs_send_message_locked(unsigned int transport, char *data, int data_len,
*
* Returns zero on success; non-zero otherwise
*/
-static int ecryptfs_send_raw_message(unsigned int transport, u8 msg_type,
+static int ecryptfs_send_raw_message(u8 msg_type,
struct ecryptfs_daemon *daemon)
{
struct ecryptfs_msg_ctx *msg_ctx;
int rc;
- switch(transport) {
- case ECRYPTFS_TRANSPORT_NETLINK:
- rc = ecryptfs_send_netlink(NULL, 0, NULL, msg_type, 0,
- daemon->pid);
- break;
- case ECRYPTFS_TRANSPORT_MISCDEV:
- rc = ecryptfs_send_message_locked(transport, NULL, 0, msg_type,
- &msg_ctx);
- if (rc) {
- printk(KERN_ERR "%s: Error whilst attempting to send "
- "message via procfs; rc = [%d]\n", __func__, rc);
- goto out;
- }
- /* Raw messages are logically context-free (e.g., no
- * reply is expected), so we set the state of the
- * ecryptfs_msg_ctx object to indicate that it should
- * be freed as soon as the transport sends out the message. */
- mutex_lock(&msg_ctx->mux);
- msg_ctx->state = ECRYPTFS_MSG_CTX_STATE_NO_REPLY;
- mutex_unlock(&msg_ctx->mux);
- break;
- case ECRYPTFS_TRANSPORT_CONNECTOR:
- case ECRYPTFS_TRANSPORT_RELAYFS:
- default:
- rc = -ENOSYS;
+ rc = ecryptfs_send_message_locked(NULL, 0, msg_type, &msg_ctx);
+ if (rc) {
+ printk(KERN_ERR "%s: Error whilst attempting to send "
+ "message to ecryptfsd; rc = [%d]\n", __func__, rc);
+ goto out;
}
+ /* Raw messages are logically context-free (e.g., no
+ * reply is expected), so we set the state of the
+ * ecryptfs_msg_ctx object to indicate that it should
+ * be freed as soon as the message is sent. */
+ mutex_lock(&msg_ctx->mux);
+ msg_ctx->state = ECRYPTFS_MSG_CTX_STATE_NO_REPLY;
+ mutex_unlock(&msg_ctx->mux);
out:
return rc;
}
@@ -227,7 +213,6 @@ out:
/**
* ecryptfs_process_helo
- * @transport: The underlying transport (netlink, etc.)
* @euid: The user ID owner of the message
* @user_ns: The namespace in which @euid applies
* @pid: The process ID for the userspace program that sent the
@@ -239,8 +224,8 @@ out:
* Returns zero after adding a new daemon to the hash list;
* non-zero otherwise.
*/
-int ecryptfs_process_helo(unsigned int transport, uid_t euid,
- struct user_namespace *user_ns, struct pid *pid)
+int ecryptfs_process_helo(uid_t euid, struct user_namespace *user_ns,
+ struct pid *pid)
{
struct ecryptfs_daemon *new_daemon;
struct ecryptfs_daemon *old_daemon;
@@ -252,8 +237,7 @@ int ecryptfs_process_helo(unsigned int transport, uid_t euid,
printk(KERN_WARNING "Received request from user [%d] "
"to register daemon [0x%p]; unregistering daemon "
"[0x%p]\n", euid, pid, old_daemon->pid);
- rc = ecryptfs_send_raw_message(transport, ECRYPTFS_MSG_QUIT,
- old_daemon);
+ rc = ecryptfs_send_raw_message(ECRYPTFS_MSG_QUIT, old_daemon);
if (rc)
printk(KERN_WARNING "Failed to send QUIT "
"message to daemon [0x%p]; rc = [%d]\n",
@@ -467,8 +451,6 @@ out:
/**
* ecryptfs_send_message_locked
- * @transport: The transport over which to send the message (i.e.,
- * netlink)
* @data: The data to send
* @data_len: The length of data
* @msg_ctx: The message context allocated for the send
@@ -478,8 +460,8 @@ out:
* Returns zero on success; non-zero otherwise
*/
static int
-ecryptfs_send_message_locked(unsigned int transport, char *data, int data_len,
- u8 msg_type, struct ecryptfs_msg_ctx **msg_ctx)
+ecryptfs_send_message_locked(char *data, int data_len, u8 msg_type,
+ struct ecryptfs_msg_ctx **msg_ctx)
{
struct ecryptfs_daemon *daemon;
int rc;
@@ -503,20 +485,8 @@ ecryptfs_send_message_locked(unsigned int transport, char *data, int data_len,
ecryptfs_msg_ctx_free_to_alloc(*msg_ctx);
mutex_unlock(&(*msg_ctx)->mux);
mutex_unlock(&ecryptfs_msg_ctx_lists_mux);
- switch (transport) {
- case ECRYPTFS_TRANSPORT_NETLINK:
- rc = ecryptfs_send_netlink(data, data_len, *msg_ctx, msg_type,
- 0, daemon->pid);
- break;
- case ECRYPTFS_TRANSPORT_MISCDEV:
- rc = ecryptfs_send_miscdev(data, data_len, *msg_ctx, msg_type,
- 0, daemon);
- break;
- case ECRYPTFS_TRANSPORT_CONNECTOR:
- case ECRYPTFS_TRANSPORT_RELAYFS:
- default:
- rc = -ENOSYS;
- }
+ rc = ecryptfs_send_miscdev(data, data_len, *msg_ctx, msg_type, 0,
+ daemon);
if (rc)
printk(KERN_ERR "%s: Error attempting to send message to "
"userspace daemon; rc = [%d]\n", __func__, rc);
@@ -526,8 +496,6 @@ out:
/**
* ecryptfs_send_message
- * @transport: The transport over which to send the message (i.e.,
- * netlink)
* @data: The data to send
* @data_len: The length of data
* @msg_ctx: The message context allocated for the send
@@ -536,14 +504,14 @@ out:
*
* Returns zero on success; non-zero otherwise
*/
-int ecryptfs_send_message(unsigned int transport, char *data, int data_len,
+int ecryptfs_send_message(char *data, int data_len,
struct ecryptfs_msg_ctx **msg_ctx)
{
int rc;
mutex_lock(&ecryptfs_daemon_hash_mux);
- rc = ecryptfs_send_message_locked(transport, data, data_len,
- ECRYPTFS_MSG_REQUEST, msg_ctx);
+ rc = ecryptfs_send_message_locked(data, data_len, ECRYPTFS_MSG_REQUEST,
+ msg_ctx);
mutex_unlock(&ecryptfs_daemon_hash_mux);
return rc;
}
@@ -586,7 +554,7 @@ sleep:
return rc;
}
-int ecryptfs_init_messaging(unsigned int transport)
+int ecryptfs_init_messaging(void)
{
int i;
int rc = 0;
@@ -639,27 +607,14 @@ int ecryptfs_init_messaging(unsigned int transport)
mutex_unlock(&ecryptfs_msg_ctx_arr[i].mux);
}
mutex_unlock(&ecryptfs_msg_ctx_lists_mux);
- switch(transport) {
- case ECRYPTFS_TRANSPORT_NETLINK:
- rc = ecryptfs_init_netlink();
- if (rc)
- ecryptfs_release_messaging(transport);
- break;
- case ECRYPTFS_TRANSPORT_MISCDEV:
- rc = ecryptfs_init_ecryptfs_miscdev();
- if (rc)
- ecryptfs_release_messaging(transport);
- break;
- case ECRYPTFS_TRANSPORT_CONNECTOR:
- case ECRYPTFS_TRANSPORT_RELAYFS:
- default:
- rc = -ENOSYS;
- }
+ rc = ecryptfs_init_ecryptfs_miscdev();
+ if (rc)
+ ecryptfs_release_messaging();
out:
return rc;
}
-void ecryptfs_release_messaging(unsigned int transport)
+void ecryptfs_release_messaging(void)
{
if (ecryptfs_msg_ctx_arr) {
int i;
@@ -698,17 +653,6 @@ void ecryptfs_release_messaging(unsigned int transport)
kfree(ecryptfs_daemon_hash);
mutex_unlock(&ecryptfs_daemon_hash_mux);
}
- switch(transport) {
- case ECRYPTFS_TRANSPORT_NETLINK:
- ecryptfs_release_netlink();
- break;
- case ECRYPTFS_TRANSPORT_MISCDEV:
- ecryptfs_destroy_ecryptfs_miscdev();
- break;
- case ECRYPTFS_TRANSPORT_CONNECTOR:
- case ECRYPTFS_TRANSPORT_RELAYFS:
- default:
- break;
- }
+ ecryptfs_destroy_ecryptfs_miscdev();
return;
}
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index 245c2dc02d5c..04d7b3fa1ac6 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -265,22 +265,34 @@ out:
}
/**
- * ecryptfs_prepare_write
+ * ecryptfs_write_begin
* @file: The eCryptfs file
- * @page: The eCryptfs page
- * @from: The start byte from which we will write
- * @to: The end byte to which we will write
+ * @mapping: The eCryptfs object
+ * @pos: The file offset at which to start writing
+ * @len: Length of the write
+ * @flags: Various flags
+ * @pagep: Pointer to return the page
+ * @fsdata: Pointer to return fs data (unused)
*
* This function must zero any hole we create
*
* Returns zero on success; non-zero otherwise
*/
-static int ecryptfs_prepare_write(struct file *file, struct page *page,
- unsigned from, unsigned to)
+static int ecryptfs_write_begin(struct file *file,
+ struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, void **fsdata)
{
+ pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+ struct page *page;
loff_t prev_page_end_size;
int rc = 0;
+ page = __grab_cache_page(mapping, index);
+ if (!page)
+ return -ENOMEM;
+ *pagep = page;
+
if (!PageUptodate(page)) {
struct ecryptfs_crypt_stat *crypt_stat =
&ecryptfs_inode_to_private(
@@ -289,8 +301,7 @@ static int ecryptfs_prepare_write(struct file *file, struct page *page,
if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)
|| (crypt_stat->flags & ECRYPTFS_NEW_FILE)) {
rc = ecryptfs_read_lower_page_segment(
- page, page->index, 0, PAGE_CACHE_SIZE,
- page->mapping->host);
+ page, index, 0, PAGE_CACHE_SIZE, mapping->host);
if (rc) {
printk(KERN_ERR "%s: Error attemping to read "
"lower page segment; rc = [%d]\n",
@@ -316,8 +327,8 @@ static int ecryptfs_prepare_write(struct file *file, struct page *page,
SetPageUptodate(page);
} else {
rc = ecryptfs_read_lower_page_segment(
- page, page->index, 0, PAGE_CACHE_SIZE,
- page->mapping->host);
+ page, index, 0, PAGE_CACHE_SIZE,
+ mapping->host);
if (rc) {
printk(KERN_ERR "%s: Error reading "
"page; rc = [%d]\n",
@@ -339,10 +350,10 @@ static int ecryptfs_prepare_write(struct file *file, struct page *page,
SetPageUptodate(page);
}
}
- prev_page_end_size = ((loff_t)page->index << PAGE_CACHE_SHIFT);
+ prev_page_end_size = ((loff_t)index << PAGE_CACHE_SHIFT);
/* If creating a page or more of holes, zero them out via truncate.
* Note, this will increase i_size. */
- if (page->index != 0) {
+ if (index != 0) {
if (prev_page_end_size > i_size_read(page->mapping->host)) {
rc = ecryptfs_truncate(file->f_path.dentry,
prev_page_end_size);
@@ -357,8 +368,8 @@ static int ecryptfs_prepare_write(struct file *file, struct page *page,
}
/* Writing to a new page, and creating a small hole from start
* of page? Zero it out. */
- if ((i_size_read(page->mapping->host) == prev_page_end_size)
- && (from != 0))
+ if ((i_size_read(mapping->host) == prev_page_end_size)
+ && (pos != 0))
zero_user(page, 0, PAGE_CACHE_SIZE);
out:
return rc;
@@ -445,21 +456,28 @@ int ecryptfs_write_inode_size_to_metadata(struct inode *ecryptfs_inode)
}
/**
- * ecryptfs_commit_write
+ * ecryptfs_write_end
* @file: The eCryptfs file object
+ * @mapping: The eCryptfs object
+ * @pos: The file position
+ * @len: The length of the data (unused)
+ * @copied: The amount of data copied
* @page: The eCryptfs page
- * @from: Ignored (we rotate the page IV on each write)
- * @to: Ignored
+ * @fsdata: The fsdata (unused)
*
* This is where we encrypt the data and pass the encrypted data to
* the lower filesystem. In OpenPGP-compatible mode, we operate on
* entire underlying packets.
*/
-static int ecryptfs_commit_write(struct file *file, struct page *page,
- unsigned from, unsigned to)
+static int ecryptfs_write_end(struct file *file,
+ struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
{
- loff_t pos;
- struct inode *ecryptfs_inode = page->mapping->host;
+ pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+ unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+ unsigned to = from + copied;
+ struct inode *ecryptfs_inode = mapping->host;
struct ecryptfs_crypt_stat *crypt_stat =
&ecryptfs_inode_to_private(file->f_path.dentry->d_inode)->crypt_stat;
int rc;
@@ -471,25 +489,22 @@ static int ecryptfs_commit_write(struct file *file, struct page *page,
} else
ecryptfs_printk(KERN_DEBUG, "Not a new file\n");
ecryptfs_printk(KERN_DEBUG, "Calling fill_zeros_to_end_of_page"
- "(page w/ index = [0x%.16x], to = [%d])\n", page->index,
- to);
+ "(page w/ index = [0x%.16x], to = [%d])\n", index, to);
/* Fills in zeros if 'to' goes beyond inode size */
rc = fill_zeros_to_end_of_page(page, to);
if (rc) {
ecryptfs_printk(KERN_WARNING, "Error attempting to fill "
- "zeros in page with index = [0x%.16x]\n",
- page->index);
+ "zeros in page with index = [0x%.16x]\n", index);
goto out;
}
rc = ecryptfs_encrypt_page(page);
if (rc) {
ecryptfs_printk(KERN_WARNING, "Error encrypting page (upper "
- "index [0x%.16x])\n", page->index);
+ "index [0x%.16x])\n", index);
goto out;
}
- pos = (((loff_t)page->index) << PAGE_CACHE_SHIFT) + to;
- if (pos > i_size_read(ecryptfs_inode)) {
- i_size_write(ecryptfs_inode, pos);
+ if (pos + copied > i_size_read(ecryptfs_inode)) {
+ i_size_write(ecryptfs_inode, pos + copied);
ecryptfs_printk(KERN_DEBUG, "Expanded file size to "
"[0x%.16x]\n", i_size_read(ecryptfs_inode));
}
@@ -497,7 +512,11 @@ static int ecryptfs_commit_write(struct file *file, struct page *page,
if (rc)
printk(KERN_ERR "Error writing inode size to metadata; "
"rc = [%d]\n", rc);
+ else
+ rc = copied;
out:
+ unlock_page(page);
+ page_cache_release(page);
return rc;
}
@@ -518,7 +537,7 @@ static sector_t ecryptfs_bmap(struct address_space *mapping, sector_t block)
struct address_space_operations ecryptfs_aops = {
.writepage = ecryptfs_writepage,
.readpage = ecryptfs_readpage,
- .prepare_write = ecryptfs_prepare_write,
- .commit_write = ecryptfs_commit_write,
+ .write_begin = ecryptfs_write_begin,
+ .write_end = ecryptfs_write_end,
.bmap = ecryptfs_bmap,
};
diff --git a/fs/ecryptfs/netlink.c b/fs/ecryptfs/netlink.c
deleted file mode 100644
index e0abad62b395..000000000000
--- a/fs/ecryptfs/netlink.c
+++ /dev/null
@@ -1,249 +0,0 @@
-/**
- * eCryptfs: Linux filesystem encryption layer
- *
- * Copyright (C) 2004-2006 International Business Machines Corp.
- * Author(s): Michael A. Halcrow <mhalcrow@us.ibm.com>
- * Tyler Hicks <tyhicks@ou.edu>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License version
- * 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
- */
-
-#include <net/sock.h>
-#include <linux/hash.h>
-#include <linux/random.h>
-#include "ecryptfs_kernel.h"
-
-static struct sock *ecryptfs_nl_sock;
-
-/**
- * ecryptfs_send_netlink
- * @data: The data to include as the payload
- * @data_len: The byte count of the data
- * @msg_ctx: The netlink context that will be used to handle the
- * response message
- * @msg_type: The type of netlink message to send
- * @msg_flags: The flags to include in the netlink header
- * @daemon_pid: The process id of the daemon to send the message to
- *
- * Sends the data to the specified daemon pid and uses the netlink
- * context element to store the data needed for validation upon
- * receiving the response. The data and the netlink context can be
- * null if just sending a netlink header is sufficient. Returns zero
- * upon sending the message; non-zero upon error.
- */
-int ecryptfs_send_netlink(char *data, int data_len,
- struct ecryptfs_msg_ctx *msg_ctx, u8 msg_type,
- u16 msg_flags, struct pid *daemon_pid)
-{
- struct sk_buff *skb;
- struct nlmsghdr *nlh;
- struct ecryptfs_message *msg;
- size_t payload_len;
- int rc;
-
- payload_len = ((data && data_len) ? (sizeof(*msg) + data_len) : 0);
- skb = alloc_skb(NLMSG_SPACE(payload_len), GFP_KERNEL);
- if (!skb) {
- rc = -ENOMEM;
- ecryptfs_printk(KERN_ERR, "Failed to allocate socket buffer\n");
- goto out;
- }
- nlh = NLMSG_PUT(skb, pid_nr(daemon_pid), msg_ctx ? msg_ctx->counter : 0,
- msg_type, payload_len);
- nlh->nlmsg_flags = msg_flags;
- if (msg_ctx && payload_len) {
- msg = (struct ecryptfs_message *)NLMSG_DATA(nlh);
- msg->index = msg_ctx->index;
- msg->data_len = data_len;
- memcpy(msg->data, data, data_len);
- }
- rc = netlink_unicast(ecryptfs_nl_sock, skb, pid_nr(daemon_pid), 0);
- if (rc < 0) {
- ecryptfs_printk(KERN_ERR, "Failed to send eCryptfs netlink "
- "message; rc = [%d]\n", rc);
- goto out;
- }
- rc = 0;
- goto out;
-nlmsg_failure:
- rc = -EMSGSIZE;
- kfree_skb(skb);
-out:
- return rc;
-}
-
-/**
- * ecryptfs_process_nl_reponse
- * @skb: The socket buffer containing the netlink message of state
- * RESPONSE
- *
- * Processes a response message after sending a operation request to
- * userspace. Attempts to assign the msg to a netlink context element
- * at the index specified in the msg. The sk_buff and nlmsghdr must
- * be validated before this function. Returns zero upon delivery to
- * desired context element; non-zero upon delivery failure or error.
- */
-static int ecryptfs_process_nl_response(struct sk_buff *skb)
-{
- struct nlmsghdr *nlh = nlmsg_hdr(skb);
- struct ecryptfs_message *msg = NLMSG_DATA(nlh);
- struct pid *pid;
- int rc;
-
- if (skb->len - NLMSG_HDRLEN - sizeof(*msg) != msg->data_len) {
- rc = -EINVAL;
- ecryptfs_printk(KERN_ERR, "Received netlink message with "
- "incorrectly specified data length\n");
- goto out;
- }
- pid = find_get_pid(NETLINK_CREDS(skb)->pid);
- rc = ecryptfs_process_response(msg, NETLINK_CREDS(skb)->uid, NULL,
- pid, nlh->nlmsg_seq);
- put_pid(pid);
- if (rc)
- printk(KERN_ERR
- "Error processing response message; rc = [%d]\n", rc);
-out:
- return rc;
-}
-
-/**
- * ecryptfs_process_nl_helo
- * @skb: The socket buffer containing the nlmsghdr in HELO state
- *
- * Gets uid and pid of the skb and adds the values to the daemon id
- * hash. Returns zero after adding a new daemon id to the hash list;
- * non-zero otherwise.
- */
-static int ecryptfs_process_nl_helo(struct sk_buff *skb)
-{
- struct pid *pid;
- int rc;
-
- pid = find_get_pid(NETLINK_CREDS(skb)->pid);
- rc = ecryptfs_process_helo(ECRYPTFS_TRANSPORT_NETLINK,
- NETLINK_CREDS(skb)->uid, NULL, pid);
- put_pid(pid);
- if (rc)
- printk(KERN_WARNING "Error processing HELO; rc = [%d]\n", rc);
- return rc;
-}
-
-/**
- * ecryptfs_process_nl_quit
- * @skb: The socket buffer containing the nlmsghdr in QUIT state
- *
- * Gets uid and pid of the skb and deletes the corresponding daemon
- * id, if it is the registered that is requesting the
- * deletion. Returns zero after deleting the desired daemon id;
- * non-zero otherwise.
- */
-static int ecryptfs_process_nl_quit(struct sk_buff *skb)
-{
- struct pid *pid;
- int rc;
-
- pid = find_get_pid(NETLINK_CREDS(skb)->pid);
- rc = ecryptfs_process_quit(NETLINK_CREDS(skb)->uid, NULL, pid);
- put_pid(pid);
- if (rc)
- printk(KERN_WARNING
- "Error processing QUIT message; rc = [%d]\n", rc);
- return rc;
-}
-
-/**
- * ecryptfs_receive_nl_message
- *
- * Callback function called by netlink system when a message arrives.
- * If the message looks to be valid, then an attempt is made to assign
- * it to its desired netlink context element and wake up the process
- * that is waiting for a response.
- */
-static void ecryptfs_receive_nl_message(struct sk_buff *skb)
-{
- struct nlmsghdr *nlh;
-
- nlh = nlmsg_hdr(skb);
- if (!NLMSG_OK(nlh, skb->len)) {
- ecryptfs_printk(KERN_ERR, "Received corrupt netlink "
- "message\n");
- goto free;
- }
- switch (nlh->nlmsg_type) {
- case ECRYPTFS_MSG_RESPONSE:
- if (ecryptfs_process_nl_response(skb)) {
- ecryptfs_printk(KERN_WARNING, "Failed to "
- "deliver netlink response to "
- "requesting operation\n");
- }
- break;
- case ECRYPTFS_MSG_HELO:
- if (ecryptfs_process_nl_helo(skb)) {
- ecryptfs_printk(KERN_WARNING, "Failed to "
- "fulfill HELO request\n");
- }
- break;
- case ECRYPTFS_MSG_QUIT:
- if (ecryptfs_process_nl_quit(skb)) {
- ecryptfs_printk(KERN_WARNING, "Failed to "
- "fulfill QUIT request\n");
- }
- break;
- default:
- ecryptfs_printk(KERN_WARNING, "Dropping netlink "
- "message of unrecognized type [%d]\n",
- nlh->nlmsg_type);
- break;
- }
-free:
- kfree_skb(skb);
-}
-
-/**
- * ecryptfs_init_netlink
- *
- * Initializes the daemon id hash list, netlink context array, and
- * necessary locks. Returns zero upon success; non-zero upon error.
- */
-int ecryptfs_init_netlink(void)
-{
- int rc;
-
- ecryptfs_nl_sock = netlink_kernel_create(&init_net, NETLINK_ECRYPTFS, 0,
- ecryptfs_receive_nl_message,
- NULL, THIS_MODULE);
- if (!ecryptfs_nl_sock) {
- rc = -EIO;
- ecryptfs_printk(KERN_ERR, "Failed to create netlink socket\n");
- goto out;
- }
- ecryptfs_nl_sock->sk_sndtimeo = ECRYPTFS_DEFAULT_SEND_TIMEOUT;
- rc = 0;
-out:
- return rc;
-}
-
-/**
- * ecryptfs_release_netlink
- *
- * Frees all memory used by the netlink context array and releases the
- * netlink socket.
- */
-void ecryptfs_release_netlink(void)
-{
- netlink_kernel_release(ecryptfs_nl_sock);
- ecryptfs_nl_sock = NULL;
-}
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 7cc0eb756b55..99368bda0261 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -927,14 +927,11 @@ errxit:
/*
* During the time we spent in the loop above, some other events
* might have been queued by the poll callback. We re-insert them
- * here (in case they are not already queued, or they're one-shot).
+ * inside the main ready-list here.
*/
for (nepi = ep->ovflist; (epi = nepi) != NULL;
- nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
- if (!ep_is_linked(&epi->rdllink) &&
- (epi->event.events & ~EP_PRIVATE_BITS))
- list_add_tail(&epi->rdllink, &ep->rdllist);
- }
+ nepi = epi->next, epi->next = EP_UNACTIVE_PTR)
+ list_add_tail(&epi->rdllink, &ep->rdllist);
/*
* We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after
* releasing the lock, events will be queued in the normal way inside
diff --git a/fs/exec.c b/fs/exec.c
index cecee501ce78..a41e7902ed0b 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -50,15 +50,12 @@
#include <linux/cn_proc.h>
#include <linux/audit.h>
#include <linux/tracehook.h>
+#include <linux/kmod.h>
#include <asm/uaccess.h>
#include <asm/mmu_context.h>
#include <asm/tlb.h>
-#ifdef CONFIG_KMOD
-#include <linux/kmod.h>
-#endif
-
#ifdef __alpha__
/* for /sbin/loader handling in search_binary_handler() */
#include <linux/a.out.h>
@@ -391,7 +388,7 @@ static int count(char __user * __user * argv, int max)
if (!p)
break;
argv++;
- if(++i > max)
+ if (i++ >= max)
return -E2BIG;
cond_resched();
}
@@ -825,8 +822,6 @@ static int de_thread(struct task_struct *tsk)
schedule();
}
- if (unlikely(task_child_reaper(tsk) == leader))
- task_active_pid_ns(tsk)->child_reaper = tsk;
/*
* The only record we have of the real-time age of a
* process, regardless of execs it's done, is start_time.
@@ -1189,7 +1184,7 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
return retval;
/* Remember if the application is TASO. */
- bprm->sh_bang = eh->ah.entry < 0x100000000UL;
+ bprm->taso = eh->ah.entry < 0x100000000UL;
bprm->file = file;
bprm->loader = loader;
@@ -1247,8 +1242,8 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
read_unlock(&binfmt_lock);
if (retval != -ENOEXEC || bprm->mm == NULL) {
break;
-#ifdef CONFIG_KMOD
- }else{
+#ifdef CONFIG_MODULES
+ } else {
#define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e))
if (printable(bprm->buf[0]) &&
printable(bprm->buf[1]) &&
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index 10bb02c3f25c..6dac7ba2d22d 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -1295,6 +1295,7 @@ retry_alloc:
* turn off reservation for this allocation
*/
if (my_rsv && (free_blocks < windowsz)
+ && (free_blocks > 0)
&& (rsv_is_empty(&my_rsv->rsv_window)))
my_rsv = NULL;
@@ -1332,7 +1333,7 @@ retry_alloc:
* free blocks is less than half of the reservation
* window size.
*/
- if (free_blocks <= (windowsz/2))
+ if (my_rsv && (free_blocks <= (windowsz/2)))
continue;
brelse(bitmap_bh);
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index a78c6b4af060..11a49ce84392 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -103,7 +103,7 @@ static int ext2_commit_chunk(struct page *page, loff_t pos, unsigned len)
return err;
}
-static void ext2_check_page(struct page *page)
+static void ext2_check_page(struct page *page, int quiet)
{
struct inode *dir = page->mapping->host;
struct super_block *sb = dir->i_sb;
@@ -146,10 +146,10 @@ out:
/* Too bad, we had an error */
Ebadsize:
- ext2_error(sb, "ext2_check_page",
- "size of directory #%lu is not a multiple of chunk size",
- dir->i_ino
- );
+ if (!quiet)
+ ext2_error(sb, __func__,
+ "size of directory #%lu is not a multiple "
+ "of chunk size", dir->i_ino);
goto fail;
Eshort:
error = "rec_len is smaller than minimal";
@@ -166,32 +166,36 @@ Espan:
Einumber:
error = "inode out of bounds";
bad_entry:
- ext2_error (sb, "ext2_check_page", "bad entry in directory #%lu: %s - "
- "offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
- dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs,
- (unsigned long) le32_to_cpu(p->inode),
- rec_len, p->name_len);
+ if (!quiet)
+ ext2_error(sb, __func__, "bad entry in directory #%lu: : %s - "
+ "offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
+ dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs,
+ (unsigned long) le32_to_cpu(p->inode),
+ rec_len, p->name_len);
goto fail;
Eend:
- p = (ext2_dirent *)(kaddr + offs);
- ext2_error (sb, "ext2_check_page",
- "entry in directory #%lu spans the page boundary"
- "offset=%lu, inode=%lu",
- dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs,
- (unsigned long) le32_to_cpu(p->inode));
+ if (!quiet) {
+ p = (ext2_dirent *)(kaddr + offs);
+ ext2_error(sb, "ext2_check_page",
+ "entry in directory #%lu spans the page boundary"
+ "offset=%lu, inode=%lu",
+ dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs,
+ (unsigned long) le32_to_cpu(p->inode));
+ }
fail:
SetPageChecked(page);
SetPageError(page);
}
-static struct page * ext2_get_page(struct inode *dir, unsigned long n)
+static struct page * ext2_get_page(struct inode *dir, unsigned long n,
+ int quiet)
{
struct address_space *mapping = dir->i_mapping;
struct page *page = read_mapping_page(mapping, n, NULL);
if (!IS_ERR(page)) {
kmap(page);
if (!PageChecked(page))
- ext2_check_page(page);
+ ext2_check_page(page, quiet);
if (PageError(page))
goto fail;
}
@@ -292,7 +296,7 @@ ext2_readdir (struct file * filp, void * dirent, filldir_t filldir)
for ( ; n < npages; n++, offset = 0) {
char *kaddr, *limit;
ext2_dirent *de;
- struct page *page = ext2_get_page(inode, n);
+ struct page *page = ext2_get_page(inode, n, 0);
if (IS_ERR(page)) {
ext2_error(sb, __func__,
@@ -361,6 +365,7 @@ struct ext2_dir_entry_2 * ext2_find_entry (struct inode * dir,
struct page *page = NULL;
struct ext2_inode_info *ei = EXT2_I(dir);
ext2_dirent * de;
+ int dir_has_error = 0;
if (npages == 0)
goto out;
@@ -374,7 +379,7 @@ struct ext2_dir_entry_2 * ext2_find_entry (struct inode * dir,
n = start;
do {
char *kaddr;
- page = ext2_get_page(dir, n);
+ page = ext2_get_page(dir, n, dir_has_error);
if (!IS_ERR(page)) {
kaddr = page_address(page);
de = (ext2_dirent *) kaddr;
@@ -391,7 +396,9 @@ struct ext2_dir_entry_2 * ext2_find_entry (struct inode * dir,
de = ext2_next_entry(de);
}
ext2_put_page(page);
- }
+ } else
+ dir_has_error = 1;
+
if (++n >= npages)
n = 0;
/* next page is past the blocks we've got */
@@ -414,7 +421,7 @@ found:
struct ext2_dir_entry_2 * ext2_dotdot (struct inode *dir, struct page **p)
{
- struct page *page = ext2_get_page(dir, 0);
+ struct page *page = ext2_get_page(dir, 0, 0);
ext2_dirent *de = NULL;
if (!IS_ERR(page)) {
@@ -487,7 +494,7 @@ int ext2_add_link (struct dentry *dentry, struct inode *inode)
for (n = 0; n <= npages; n++) {
char *dir_end;
- page = ext2_get_page(dir, n);
+ page = ext2_get_page(dir, n, 0);
err = PTR_ERR(page);
if (IS_ERR(page))
goto out;
@@ -655,14 +662,17 @@ int ext2_empty_dir (struct inode * inode)
{
struct page *page = NULL;
unsigned long i, npages = dir_pages(inode);
+ int dir_has_error = 0;
for (i = 0; i < npages; i++) {
char *kaddr;
ext2_dirent * de;
- page = ext2_get_page(inode, i);
+ page = ext2_get_page(inode, i, dir_has_error);
- if (IS_ERR(page))
+ if (IS_ERR(page)) {
+ dir_has_error = 1;
continue;
+ }
kaddr = page_address(page);
de = (ext2_dirent *)kaddr;
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index bd2ece228827..b9821be709bd 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -568,8 +568,16 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
/* this isn't the right place to decide whether block is metadata
* inode.c/extents.c knows better, but for safety ... */
- if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) ||
- ext4_should_journal_data(inode))
+ if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
+ metadata = 1;
+
+ /* We need to make sure we don't reuse
+ * block released untill the transaction commit.
+ * writeback mode have weak data consistency so
+ * don't force data as metadata when freeing block
+ * for writeback mode.
+ */
+ if (metadata == 0 && !ext4_should_writeback_data(inode))
metadata = 1;
sb = inode->i_sb;
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 6690a41cdd9f..4880cc3e6727 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -511,7 +511,6 @@ do { \
/*
* Mount flags
*/
-#define EXT4_MOUNT_CHECK 0x00001 /* Do mount-time checks */
#define EXT4_MOUNT_OLDALLOC 0x00002 /* Don't use the new Orlov allocator */
#define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */
#define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
index 6a0b40d43264..445fde603df8 100644
--- a/fs/ext4/ext4_sb.h
+++ b/fs/ext4/ext4_sb.h
@@ -99,9 +99,6 @@ struct ext4_sb_info {
struct inode *s_buddy_cache;
long s_blocks_reserved;
spinlock_t s_reserve_lock;
- struct list_head s_active_transaction;
- struct list_head s_closed_transaction;
- struct list_head s_committed_transaction;
spinlock_t s_md_lock;
tid_t s_last_transaction;
unsigned short *s_mb_offsets, *s_mb_maxs;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 9b4ec9decfd1..8dbf6953845b 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1648,6 +1648,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
int ret = 0, err, nr_pages, i;
unsigned long index, end;
struct pagevec pvec;
+ long pages_skipped;
BUG_ON(mpd->next_page <= mpd->first_page);
pagevec_init(&pvec, 0);
@@ -1655,20 +1656,30 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
end = mpd->next_page - 1;
while (index <= end) {
- /* XXX: optimize tail */
- nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
+ /*
+ * We can use PAGECACHE_TAG_DIRTY lookup here because
+ * even though we have cleared the dirty flag on the page
+ * We still keep the page in the radix tree with tag
+ * PAGECACHE_TAG_DIRTY. See clear_page_dirty_for_io.
+ * The PAGECACHE_TAG_DIRTY is cleared in set_page_writeback
+ * which is called via the below writepage callback.
+ */
+ nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+ PAGECACHE_TAG_DIRTY,
+ min(end - index,
+ (pgoff_t)PAGEVEC_SIZE-1) + 1);
if (nr_pages == 0)
break;
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
- index = page->index;
- if (index > end)
- break;
- index++;
-
+ pages_skipped = mpd->wbc->pages_skipped;
err = mapping->a_ops->writepage(page, mpd->wbc);
- if (!err)
+ if (!err && (pages_skipped == mpd->wbc->pages_skipped))
+ /*
+ * have successfully written the page
+ * without skipping the same
+ */
mpd->pages_written++;
/*
* In error case, we have to continue because
@@ -2104,7 +2115,6 @@ static int mpage_da_writepages(struct address_space *mapping,
struct writeback_control *wbc,
struct mpage_da_data *mpd)
{
- long to_write;
int ret;
if (!mpd->get_block)
@@ -2119,19 +2129,18 @@ static int mpage_da_writepages(struct address_space *mapping,
mpd->pages_written = 0;
mpd->retval = 0;
- to_write = wbc->nr_to_write;
-
ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, mpd);
-
/*
* Handle last extent of pages
*/
if (!mpd->io_done && mpd->next_page != mpd->first_page) {
if (mpage_da_map_blocks(mpd) == 0)
mpage_da_submit_io(mpd);
- }
- wbc->nr_to_write = to_write - mpd->pages_written;
+ mpd->io_done = 1;
+ ret = MPAGE_DA_EXTENT_TAIL;
+ }
+ wbc->nr_to_write -= mpd->pages_written;
return ret;
}
@@ -2360,12 +2369,14 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
static int ext4_da_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
+ pgoff_t index;
+ int range_whole = 0;
handle_t *handle = NULL;
- loff_t range_start = 0;
struct mpage_da_data mpd;
struct inode *inode = mapping->host;
+ int no_nrwrite_index_update;
+ long pages_written = 0, pages_skipped;
int needed_blocks, ret = 0, nr_to_writebump = 0;
- long to_write, pages_skipped = 0;
struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
/*
@@ -2385,23 +2396,26 @@ static int ext4_da_writepages(struct address_space *mapping,
nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write;
wbc->nr_to_write = sbi->s_mb_stream_request;
}
+ if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+ range_whole = 1;
- if (!wbc->range_cyclic)
- /*
- * If range_cyclic is not set force range_cont
- * and save the old writeback_index
- */
- wbc->range_cont = 1;
-
- range_start = wbc->range_start;
- pages_skipped = wbc->pages_skipped;
+ if (wbc->range_cyclic)
+ index = mapping->writeback_index;
+ else
+ index = wbc->range_start >> PAGE_CACHE_SHIFT;
mpd.wbc = wbc;
mpd.inode = mapping->host;
-restart_loop:
- to_write = wbc->nr_to_write;
- while (!ret && to_write > 0) {
+ /*
+ * we don't want write_cache_pages to update
+ * nr_to_write and writeback_index
+ */
+ no_nrwrite_index_update = wbc->no_nrwrite_index_update;
+ wbc->no_nrwrite_index_update = 1;
+ pages_skipped = wbc->pages_skipped;
+
+ while (!ret && wbc->nr_to_write > 0) {
/*
* we insert one extent at a time. So we need
@@ -2422,48 +2436,53 @@ restart_loop:
dump_stack();
goto out_writepages;
}
- to_write -= wbc->nr_to_write;
-
mpd.get_block = ext4_da_get_block_write;
ret = mpage_da_writepages(mapping, wbc, &mpd);
ext4_journal_stop(handle);
- if (mpd.retval == -ENOSPC)
+ if (mpd.retval == -ENOSPC) {
+ /* commit the transaction which would
+ * free blocks released in the transaction
+ * and try again
+ */
jbd2_journal_force_commit_nested(sbi->s_journal);
-
- /* reset the retry count */
- if (ret == MPAGE_DA_EXTENT_TAIL) {
+ wbc->pages_skipped = pages_skipped;
+ ret = 0;
+ } else if (ret == MPAGE_DA_EXTENT_TAIL) {
/*
* got one extent now try with
* rest of the pages
*/
- to_write += wbc->nr_to_write;
+ pages_written += mpd.pages_written;
+ wbc->pages_skipped = pages_skipped;
ret = 0;
- } else if (wbc->nr_to_write) {
+ } else if (wbc->nr_to_write)
/*
* There is no more writeout needed
* or we requested for a noblocking writeout
* and we found the device congested
*/
- to_write += wbc->nr_to_write;
break;
- }
- wbc->nr_to_write = to_write;
- }
-
- if (wbc->range_cont && (pages_skipped != wbc->pages_skipped)) {
- /* We skipped pages in this loop */
- wbc->range_start = range_start;
- wbc->nr_to_write = to_write +
- wbc->pages_skipped - pages_skipped;
- wbc->pages_skipped = pages_skipped;
- goto restart_loop;
}
+ if (pages_skipped != wbc->pages_skipped)
+ printk(KERN_EMERG "This should not happen leaving %s "
+ "with nr_to_write = %ld ret = %d\n",
+ __func__, wbc->nr_to_write, ret);
+
+ /* Update index */
+ index += pages_written;
+ if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
+ /*
+ * set the writeback_index so that range_cyclic
+ * mode will write it back later
+ */
+ mapping->writeback_index = index;
out_writepages:
- wbc->nr_to_write = to_write - nr_to_writebump;
- wbc->range_start = range_start;
+ if (!no_nrwrite_index_update)
+ wbc->no_nrwrite_index_update = 0;
+ wbc->nr_to_write -= nr_to_writebump;
return ret;
}
@@ -4175,7 +4194,6 @@ static int ext4_inode_blocks_set(handle_t *handle,
struct inode *inode = &(ei->vfs_inode);
u64 i_blocks = inode->i_blocks;
struct super_block *sb = inode->i_sb;
- int err = 0;
if (i_blocks <= ~0U) {
/*
@@ -4185,36 +4203,27 @@ static int ext4_inode_blocks_set(handle_t *handle,
raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
raw_inode->i_blocks_high = 0;
ei->i_flags &= ~EXT4_HUGE_FILE_FL;
- } else if (i_blocks <= 0xffffffffffffULL) {
+ return 0;
+ }
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE))
+ return -EFBIG;
+
+ if (i_blocks <= 0xffffffffffffULL) {
/*
* i_blocks can be represented in a 48 bit variable
* as multiple of 512 bytes
*/
- err = ext4_update_rocompat_feature(handle, sb,
- EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
- if (err)
- goto err_out;
- /* i_block is stored in the split 48 bit fields */
raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
ei->i_flags &= ~EXT4_HUGE_FILE_FL;
} else {
- /*
- * i_blocks should be represented in a 48 bit variable
- * as multiple of file system block size
- */
- err = ext4_update_rocompat_feature(handle, sb,
- EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
- if (err)
- goto err_out;
ei->i_flags |= EXT4_HUGE_FILE_FL;
/* i_block is stored in file system block size */
i_blocks = i_blocks >> (inode->i_blkbits - 9);
raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
}
-err_out:
- return err;
+ return 0;
}
/*
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index b580714f0d85..dfe17a134052 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2300,6 +2300,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
}
INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
+ meta_group_info[i]->bb_free_root.rb_node = NULL;;
#ifdef DOUBLE_CHECK
{
@@ -2522,9 +2523,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
}
spin_lock_init(&sbi->s_md_lock);
- INIT_LIST_HEAD(&sbi->s_active_transaction);
- INIT_LIST_HEAD(&sbi->s_closed_transaction);
- INIT_LIST_HEAD(&sbi->s_committed_transaction);
spin_lock_init(&sbi->s_bal_lock);
sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN;
@@ -2553,6 +2551,8 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
ext4_mb_init_per_dev_proc(sb);
ext4_mb_history_init(sb);
+ sbi->s_journal->j_commit_callback = release_blocks_on_commit;
+
printk(KERN_INFO "EXT4-fs: mballoc enabled\n");
return 0;
}
@@ -2568,7 +2568,7 @@ static void ext4_mb_cleanup_pa(struct ext4_group_info *grp)
pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
list_del(&pa->pa_group_list);
count++;
- kfree(pa);
+ kmem_cache_free(ext4_pspace_cachep, pa);
}
if (count)
mb_debug("mballoc: %u PAs left\n", count);
@@ -2582,15 +2582,6 @@ int ext4_mb_release(struct super_block *sb)
struct ext4_group_info *grinfo;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- /* release freed, non-committed blocks */
- spin_lock(&sbi->s_md_lock);
- list_splice_init(&sbi->s_closed_transaction,
- &sbi->s_committed_transaction);
- list_splice_init(&sbi->s_active_transaction,
- &sbi->s_committed_transaction);
- spin_unlock(&sbi->s_md_lock);
- ext4_mb_free_committed_blocks(sb);
-
if (sbi->s_group_info) {
for (i = 0; i < sbi->s_groups_count; i++) {
grinfo = ext4_get_group_info(sb, i);
@@ -2644,61 +2635,57 @@ int ext4_mb_release(struct super_block *sb)
return 0;
}
-static noinline_for_stack void
-ext4_mb_free_committed_blocks(struct super_block *sb)
+/*
+ * This function is called by the jbd2 layer once the commit has finished,
+ * so we know we can free the blocks that were released with that commit.
+ */
+static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
{
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- int err;
- int i;
- int count = 0;
- int count2 = 0;
- struct ext4_free_metadata *md;
+ struct super_block *sb = journal->j_private;
struct ext4_buddy e4b;
+ struct ext4_group_info *db;
+ int err, count = 0, count2 = 0;
+ struct ext4_free_data *entry;
+ ext4_fsblk_t discard_block;
+ struct list_head *l, *ltmp;
- if (list_empty(&sbi->s_committed_transaction))
- return;
-
- /* there is committed blocks to be freed yet */
- do {
- /* get next array of blocks */
- md = NULL;
- spin_lock(&sbi->s_md_lock);
- if (!list_empty(&sbi->s_committed_transaction)) {
- md = list_entry(sbi->s_committed_transaction.next,
- struct ext4_free_metadata, list);
- list_del(&md->list);
- }
- spin_unlock(&sbi->s_md_lock);
-
- if (md == NULL)
- break;
+ list_for_each_safe(l, ltmp, &txn->t_private_list) {
+ entry = list_entry(l, struct ext4_free_data, list);
mb_debug("gonna free %u blocks in group %lu (0x%p):",
- md->num, md->group, md);
+ entry->count, entry->group, entry);
- err = ext4_mb_load_buddy(sb, md->group, &e4b);
+ err = ext4_mb_load_buddy(sb, entry->group, &e4b);
/* we expect to find existing buddy because it's pinned */
BUG_ON(err != 0);
+ db = e4b.bd_info;
/* there are blocks to put in buddy to make them really free */
- count += md->num;
+ count += entry->count;
count2++;
- ext4_lock_group(sb, md->group);
- for (i = 0; i < md->num; i++) {
- mb_debug(" %u", md->blocks[i]);
- mb_free_blocks(NULL, &e4b, md->blocks[i], 1);
+ ext4_lock_group(sb, entry->group);
+ /* Take it out of per group rb tree */
+ rb_erase(&entry->node, &(db->bb_free_root));
+ mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count);
+
+ if (!db->bb_free_root.rb_node) {
+ /* No more items in the per group rb tree
+ * balance refcounts from ext4_mb_free_metadata()
+ */
+ page_cache_release(e4b.bd_buddy_page);
+ page_cache_release(e4b.bd_bitmap_page);
}
- mb_debug("\n");
- ext4_unlock_group(sb, md->group);
-
- /* balance refcounts from ext4_mb_free_metadata() */
- page_cache_release(e4b.bd_buddy_page);
- page_cache_release(e4b.bd_bitmap_page);
-
- kfree(md);
+ ext4_unlock_group(sb, entry->group);
+ discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb)
+ + entry->start_blk
+ + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
+ trace_mark(ext4_discard_blocks, "dev %s blk %llu count %u", sb->s_id,
+ (unsigned long long) discard_block, entry->count);
+ sb_issue_discard(sb, discard_block, entry->count);
+
+ kmem_cache_free(ext4_free_ext_cachep, entry);
ext4_mb_release_desc(&e4b);
-
- } while (md);
+ }
mb_debug("freed %u blocks in %u structures\n", count, count2);
}
@@ -2712,6 +2699,7 @@ ext4_mb_free_committed_blocks(struct super_block *sb)
static int ext4_mb_init_per_dev_proc(struct super_block *sb)
{
+#ifdef CONFIG_PROC_FS
mode_t mode = S_IFREG | S_IRUGO | S_IWUSR;
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct proc_dir_entry *proc;
@@ -2735,10 +2723,14 @@ err_out:
remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc);
remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc);
return -ENOMEM;
+#else
+ return 0;
+#endif
}
static int ext4_mb_destroy_per_dev_proc(struct super_block *sb)
{
+#ifdef CONFIG_PROC_FS
struct ext4_sb_info *sbi = EXT4_SB(sb);
if (sbi->s_proc == NULL)
@@ -2750,7 +2742,7 @@ static int ext4_mb_destroy_per_dev_proc(struct super_block *sb)
remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc);
remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc);
remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc);
-
+#endif
return 0;
}
@@ -2771,6 +2763,16 @@ int __init init_ext4_mballoc(void)
kmem_cache_destroy(ext4_pspace_cachep);
return -ENOMEM;
}
+
+ ext4_free_ext_cachep =
+ kmem_cache_create("ext4_free_block_extents",
+ sizeof(struct ext4_free_data),
+ 0, SLAB_RECLAIM_ACCOUNT, NULL);
+ if (ext4_free_ext_cachep == NULL) {
+ kmem_cache_destroy(ext4_pspace_cachep);
+ kmem_cache_destroy(ext4_ac_cachep);
+ return -ENOMEM;
+ }
return 0;
}
@@ -2779,6 +2781,7 @@ void exit_ext4_mballoc(void)
/* XXX: synchronize_rcu(); */
kmem_cache_destroy(ext4_pspace_cachep);
kmem_cache_destroy(ext4_ac_cachep);
+ kmem_cache_destroy(ext4_free_ext_cachep);
}
@@ -4324,8 +4327,6 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
goto out1;
}
- ext4_mb_poll_new_transaction(sb, handle);
-
*errp = ext4_mb_initialize_context(ac, ar);
if (*errp) {
ar->len = 0;
@@ -4384,35 +4385,20 @@ out1:
return block;
}
-static void ext4_mb_poll_new_transaction(struct super_block *sb,
- handle_t *handle)
-{
- struct ext4_sb_info *sbi = EXT4_SB(sb);
-
- if (sbi->s_last_transaction == handle->h_transaction->t_tid)
- return;
-
- /* new transaction! time to close last one and free blocks for
- * committed transaction. we know that only transaction can be
- * active, so previos transaction can be being logged and we
- * know that transaction before previous is known to be already
- * logged. this means that now we may free blocks freed in all
- * transactions before previous one. hope I'm clear enough ... */
- spin_lock(&sbi->s_md_lock);
- if (sbi->s_last_transaction != handle->h_transaction->t_tid) {
- mb_debug("new transaction %lu, old %lu\n",
- (unsigned long) handle->h_transaction->t_tid,
- (unsigned long) sbi->s_last_transaction);
- list_splice_init(&sbi->s_closed_transaction,
- &sbi->s_committed_transaction);
- list_splice_init(&sbi->s_active_transaction,
- &sbi->s_closed_transaction);
- sbi->s_last_transaction = handle->h_transaction->t_tid;
- }
- spin_unlock(&sbi->s_md_lock);
-
- ext4_mb_free_committed_blocks(sb);
+/*
+ * We can merge two free data extents only if the physical blocks
+ * are contiguous, AND the extents were freed by the same transaction,
+ * AND the blocks are associated with the same group.
+ */
+static int can_merge(struct ext4_free_data *entry1,
+ struct ext4_free_data *entry2)
+{
+ if ((entry1->t_tid == entry2->t_tid) &&
+ (entry1->group == entry2->group) &&
+ ((entry1->start_blk + entry1->count) == entry2->start_blk))
+ return 1;
+ return 0;
}
static noinline_for_stack int
@@ -4422,57 +4408,80 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
struct ext4_group_info *db = e4b->bd_info;
struct super_block *sb = e4b->bd_sb;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct ext4_free_metadata *md;
- int i;
+ struct ext4_free_data *entry, *new_entry;
+ struct rb_node **n = &db->bb_free_root.rb_node, *node;
+ struct rb_node *parent = NULL, *new_node;
+
BUG_ON(e4b->bd_bitmap_page == NULL);
BUG_ON(e4b->bd_buddy_page == NULL);
+ new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
+ new_entry->start_blk = block;
+ new_entry->group = group;
+ new_entry->count = count;
+ new_entry->t_tid = handle->h_transaction->t_tid;
+ new_node = &new_entry->node;
+
ext4_lock_group(sb, group);
- for (i = 0; i < count; i++) {
- md = db->bb_md_cur;
- if (md && db->bb_tid != handle->h_transaction->t_tid) {
- db->bb_md_cur = NULL;
- md = NULL;
+ if (!*n) {
+ /* first free block exent. We need to
+ protect buddy cache from being freed,
+ * otherwise we'll refresh it from
+ * on-disk bitmap and lose not-yet-available
+ * blocks */
+ page_cache_get(e4b->bd_buddy_page);
+ page_cache_get(e4b->bd_bitmap_page);
+ }
+ while (*n) {
+ parent = *n;
+ entry = rb_entry(parent, struct ext4_free_data, node);
+ if (block < entry->start_blk)
+ n = &(*n)->rb_left;
+ else if (block >= (entry->start_blk + entry->count))
+ n = &(*n)->rb_right;
+ else {
+ ext4_error(sb, __func__,
+ "Double free of blocks %d (%d %d)\n",
+ block, entry->start_blk, entry->count);
+ return 0;
}
+ }
- if (md == NULL) {
- ext4_unlock_group(sb, group);
- md = kmalloc(sizeof(*md), GFP_NOFS);
- if (md == NULL)
- return -ENOMEM;
- md->num = 0;
- md->group = group;
-
- ext4_lock_group(sb, group);
- if (db->bb_md_cur == NULL) {
- spin_lock(&sbi->s_md_lock);
- list_add(&md->list, &sbi->s_active_transaction);
- spin_unlock(&sbi->s_md_lock);
- /* protect buddy cache from being freed,
- * otherwise we'll refresh it from
- * on-disk bitmap and lose not-yet-available
- * blocks */
- page_cache_get(e4b->bd_buddy_page);
- page_cache_get(e4b->bd_bitmap_page);
- db->bb_md_cur = md;
- db->bb_tid = handle->h_transaction->t_tid;
- mb_debug("new md 0x%p for group %lu\n",
- md, md->group);
- } else {
- kfree(md);
- md = db->bb_md_cur;
- }
+ rb_link_node(new_node, parent, n);
+ rb_insert_color(new_node, &db->bb_free_root);
+
+ /* Now try to see the extent can be merged to left and right */
+ node = rb_prev(new_node);
+ if (node) {
+ entry = rb_entry(node, struct ext4_free_data, node);
+ if (can_merge(entry, new_entry)) {
+ new_entry->start_blk = entry->start_blk;
+ new_entry->count += entry->count;
+ rb_erase(node, &(db->bb_free_root));
+ spin_lock(&sbi->s_md_lock);
+ list_del(&entry->list);
+ spin_unlock(&sbi->s_md_lock);
+ kmem_cache_free(ext4_free_ext_cachep, entry);
}
+ }
- BUG_ON(md->num >= EXT4_BB_MAX_BLOCKS);
- md->blocks[md->num] = block + i;
- md->num++;
- if (md->num == EXT4_BB_MAX_BLOCKS) {
- /* no more space, put full container on a sb's list */
- db->bb_md_cur = NULL;
+ node = rb_next(new_node);
+ if (node) {
+ entry = rb_entry(node, struct ext4_free_data, node);
+ if (can_merge(new_entry, entry)) {
+ new_entry->count += entry->count;
+ rb_erase(node, &(db->bb_free_root));
+ spin_lock(&sbi->s_md_lock);
+ list_del(&entry->list);
+ spin_unlock(&sbi->s_md_lock);
+ kmem_cache_free(ext4_free_ext_cachep, entry);
}
}
+ /* Add the extent to transaction's private list */
+ spin_lock(&sbi->s_md_lock);
+ list_add(&new_entry->list, &handle->h_transaction->t_private_list);
+ spin_unlock(&sbi->s_md_lock);
ext4_unlock_group(sb, group);
return 0;
}
@@ -4500,8 +4509,6 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
*freed = 0;
- ext4_mb_poll_new_transaction(sb, handle);
-
sbi = EXT4_SB(sb);
es = EXT4_SB(sb)->s_es;
if (block < le32_to_cpu(es->s_first_data_block) ||
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index b3b4828f8b89..b5dff1fff1e5 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -18,6 +18,8 @@
#include <linux/pagemap.h>
#include <linux/seq_file.h>
#include <linux/version.h>
+#include <linux/blkdev.h>
+#include <linux/marker.h>
#include "ext4_jbd2.h"
#include "ext4.h"
#include "group.h"
@@ -98,23 +100,29 @@
static struct kmem_cache *ext4_pspace_cachep;
static struct kmem_cache *ext4_ac_cachep;
+static struct kmem_cache *ext4_free_ext_cachep;
-#ifdef EXT4_BB_MAX_BLOCKS
-#undef EXT4_BB_MAX_BLOCKS
-#endif
-#define EXT4_BB_MAX_BLOCKS 30
+struct ext4_free_data {
+ /* this links the free block information from group_info */
+ struct rb_node node;
-struct ext4_free_metadata {
- ext4_group_t group;
- unsigned short num;
- ext4_grpblk_t blocks[EXT4_BB_MAX_BLOCKS];
+ /* this links the free block information from ext4_sb_info */
struct list_head list;
+
+ /* group which free block extent belongs */
+ ext4_group_t group;
+
+ /* free block extent */
+ ext4_grpblk_t start_blk;
+ ext4_grpblk_t count;
+
+ /* transaction which freed this extent */
+ tid_t t_tid;
};
struct ext4_group_info {
unsigned long bb_state;
- unsigned long bb_tid;
- struct ext4_free_metadata *bb_md_cur;
+ struct rb_root bb_free_root;
unsigned short bb_first_free;
unsigned short bb_free;
unsigned short bb_fragments;
@@ -261,8 +269,6 @@ struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t);
static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
ext4_group_t group);
-static void ext4_mb_poll_new_transaction(struct super_block *, handle_t *);
-static void ext4_mb_free_committed_blocks(struct super_block *);
static void ext4_mb_return_to_preallocation(struct inode *inode,
struct ext4_buddy *e4b, sector_t block,
int count);
@@ -270,6 +276,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *,
struct super_block *, struct ext4_prealloc_space *pa);
static int ext4_mb_init_per_dev_proc(struct super_block *sb);
static int ext4_mb_destroy_per_dev_proc(struct super_block *sb);
+static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);
static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index dea8f13c2fd9..9b2b2bc4ec17 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -374,66 +374,6 @@ void ext4_update_dynamic_rev(struct super_block *sb)
*/
}
-int ext4_update_compat_feature(handle_t *handle,
- struct super_block *sb, __u32 compat)
-{
- int err = 0;
- if (!EXT4_HAS_COMPAT_FEATURE(sb, compat)) {
- err = ext4_journal_get_write_access(handle,
- EXT4_SB(sb)->s_sbh);
- if (err)
- return err;
- EXT4_SET_COMPAT_FEATURE(sb, compat);
- sb->s_dirt = 1;
- handle->h_sync = 1;
- BUFFER_TRACE(EXT4_SB(sb)->s_sbh,
- "call ext4_journal_dirty_met adata");
- err = ext4_journal_dirty_metadata(handle,
- EXT4_SB(sb)->s_sbh);
- }
- return err;
-}
-
-int ext4_update_rocompat_feature(handle_t *handle,
- struct super_block *sb, __u32 rocompat)
-{
- int err = 0;
- if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, rocompat)) {
- err = ext4_journal_get_write_access(handle,
- EXT4_SB(sb)->s_sbh);
- if (err)
- return err;
- EXT4_SET_RO_COMPAT_FEATURE(sb, rocompat);
- sb->s_dirt = 1;
- handle->h_sync = 1;
- BUFFER_TRACE(EXT4_SB(sb)->s_sbh,
- "call ext4_journal_dirty_met adata");
- err = ext4_journal_dirty_metadata(handle,
- EXT4_SB(sb)->s_sbh);
- }
- return err;
-}
-
-int ext4_update_incompat_feature(handle_t *handle,
- struct super_block *sb, __u32 incompat)
-{
- int err = 0;
- if (!EXT4_HAS_INCOMPAT_FEATURE(sb, incompat)) {
- err = ext4_journal_get_write_access(handle,
- EXT4_SB(sb)->s_sbh);
- if (err)
- return err;
- EXT4_SET_INCOMPAT_FEATURE(sb, incompat);
- sb->s_dirt = 1;
- handle->h_sync = 1;
- BUFFER_TRACE(EXT4_SB(sb)->s_sbh,
- "call ext4_journal_dirty_met adata");
- err = ext4_journal_dirty_metadata(handle,
- EXT4_SB(sb)->s_sbh);
- }
- return err;
-}
-
/*
* Open the external journal device
*/
@@ -904,7 +844,7 @@ static const struct export_operations ext4_export_ops = {
enum {
Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid,
Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro,
- Opt_nouid32, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov,
+ Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov,
Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh,
Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
@@ -915,7 +855,7 @@ enum {
Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version,
- Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_delalloc, Opt_nodelalloc,
+ Opt_stripe, Opt_delalloc, Opt_nodelalloc,
Opt_inode_readahead_blks
};
@@ -933,8 +873,6 @@ static const match_table_t tokens = {
{Opt_err_panic, "errors=panic"},
{Opt_err_ro, "errors=remount-ro"},
{Opt_nouid32, "nouid32"},
- {Opt_nocheck, "nocheck"},
- {Opt_nocheck, "check=none"},
{Opt_debug, "debug"},
{Opt_oldalloc, "oldalloc"},
{Opt_orlov, "orlov"},
@@ -973,8 +911,6 @@ static const match_table_t tokens = {
{Opt_extents, "extents"},
{Opt_noextents, "noextents"},
{Opt_i_version, "i_version"},
- {Opt_mballoc, "mballoc"},
- {Opt_nomballoc, "nomballoc"},
{Opt_stripe, "stripe=%u"},
{Opt_resize, "resize"},
{Opt_delalloc, "delalloc"},
@@ -1073,9 +1009,6 @@ static int parse_options(char *options, struct super_block *sb,
case Opt_nouid32:
set_opt(sbi->s_mount_opt, NO_UID32);
break;
- case Opt_nocheck:
- clear_opt(sbi->s_mount_opt, CHECK);
- break;
case Opt_debug:
set_opt(sbi->s_mount_opt, DEBUG);
break;
@@ -1618,14 +1551,14 @@ static int ext4_check_descriptors(struct super_block *sb)
if (block_bitmap < first_block || block_bitmap > last_block) {
printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
"Block bitmap for group %lu not in group "
- "(block %llu)!", i, block_bitmap);
+ "(block %llu)!\n", i, block_bitmap);
return 0;
}
inode_bitmap = ext4_inode_bitmap(sb, gdp);
if (inode_bitmap < first_block || inode_bitmap > last_block) {
printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
"Inode bitmap for group %lu not in group "
- "(block %llu)!", i, inode_bitmap);
+ "(block %llu)!\n", i, inode_bitmap);
return 0;
}
inode_table = ext4_inode_table(sb, gdp);
@@ -1633,7 +1566,7 @@ static int ext4_check_descriptors(struct super_block *sb)
inode_table + sbi->s_itb_per_group - 1 > last_block) {
printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
"Inode table for group %lu not in group "
- "(block %llu)!", i, inode_table);
+ "(block %llu)!\n", i, inode_table);
return 0;
}
spin_lock(sb_bgl_lock(sbi, i));
@@ -1778,13 +1711,13 @@ static void ext4_orphan_cleanup(struct super_block *sb,
*
* Note, this does *not* consider any metadata overhead for vfs i_blocks.
*/
-static loff_t ext4_max_size(int blkbits)
+static loff_t ext4_max_size(int blkbits, int has_huge_files)
{
loff_t res;
loff_t upper_limit = MAX_LFS_FILESIZE;
/* small i_blocks in vfs inode? */
- if (sizeof(blkcnt_t) < sizeof(u64)) {
+ if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) {
/*
* CONFIG_LSF is not enabled implies the inode
* i_block represent total blocks in 512 bytes
@@ -1814,7 +1747,7 @@ static loff_t ext4_max_size(int blkbits)
* block limit, and also a limit of (2^48 - 1) 512-byte sectors in i_blocks.
* We need to be 1 filesystem block less than the 2^48 sector limit.
*/
-static loff_t ext4_max_bitmap_size(int bits)
+static loff_t ext4_max_bitmap_size(int bits, int has_huge_files)
{
loff_t res = EXT4_NDIR_BLOCKS;
int meta_blocks;
@@ -1827,11 +1760,11 @@ static loff_t ext4_max_bitmap_size(int bits)
* total number of 512 bytes blocks of the file
*/
- if (sizeof(blkcnt_t) < sizeof(u64)) {
+ if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) {
/*
- * CONFIG_LSF is not enabled implies the inode
- * i_block represent total blocks in 512 bytes
- * 32 == size of vfs inode i_blocks * 8
+ * !has_huge_files or CONFIG_LSF is not enabled
+ * implies the inode i_block represent total blocks in
+ * 512 bytes 32 == size of vfs inode i_blocks * 8
*/
upper_limit = (1LL << 32) - 1;
@@ -1940,7 +1873,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
int blocksize;
int db_count;
int i;
- int needs_recovery;
+ int needs_recovery, has_huge_files;
__le32 features;
__u64 blocks_count;
int err;
@@ -2081,7 +2014,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sb->s_id, le32_to_cpu(features));
goto failed_mount;
}
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {
+ has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
+ if (has_huge_files) {
/*
* Large file size enabled file system can only be
* mount if kernel is build with CONFIG_LSF
@@ -2131,8 +2066,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
}
}
- sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits);
- sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits);
+ sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits,
+ has_huge_files);
+ sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files);
if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) {
sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE;
@@ -2456,6 +2392,21 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
"available.\n");
}
+ if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
+ printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - "
+ "requested data journaling mode\n");
+ clear_opt(sbi->s_mount_opt, DELALLOC);
+ } else if (test_opt(sb, DELALLOC))
+ printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n");
+
+ ext4_ext_init(sb);
+ err = ext4_mb_init(sb, needs_recovery);
+ if (err) {
+ printk(KERN_ERR "EXT4-fs: failed to initalize mballoc (%d)\n",
+ err);
+ goto failed_mount4;
+ }
+
/*
* akpm: core read_super() calls in here with the superblock locked.
* That deadlocks, because orphan cleanup needs to lock the superblock
@@ -2475,21 +2426,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered":
"writeback");
- if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
- printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - "
- "requested data journaling mode\n");
- clear_opt(sbi->s_mount_opt, DELALLOC);
- } else if (test_opt(sb, DELALLOC))
- printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n");
-
- ext4_ext_init(sb);
- err = ext4_mb_init(sb, needs_recovery);
- if (err) {
- printk(KERN_ERR "EXT4-fs: failed to initalize mballoc (%d)\n",
- err);
- goto failed_mount4;
- }
-
lock_kernel();
return 0;
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 25adfc3c693a..d0ff0b8cf309 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -8,7 +8,7 @@
* pages against inodes. ie: data writeback. Writeout of the
* inode itself is not handled here.
*
- * 10Apr2002 akpm@zip.com.au
+ * 10Apr2002 Andrew Morton
* Split out of fs/inode.c
* Additions for address_space-based writeback
*/
diff --git a/fs/hfs/catalog.c b/fs/hfs/catalog.c
index ba851576ebb1..6d98f116ca03 100644
--- a/fs/hfs/catalog.c
+++ b/fs/hfs/catalog.c
@@ -190,6 +190,10 @@ int hfs_cat_find_brec(struct super_block *sb, u32 cnid,
fd->search_key->cat.ParID = rec.thread.ParID;
len = fd->search_key->cat.CName.len = rec.thread.CName.len;
+ if (len > HFS_NAMELEN) {
+ printk(KERN_ERR "hfs: bad catalog namelength\n");
+ return -EIO;
+ }
memcpy(fd->search_key->cat.CName.name, rec.thread.CName.name, len);
return hfs_brec_find(fd);
}
diff --git a/fs/hfsplus/bitmap.c b/fs/hfsplus/bitmap.c
index d128a25b74d2..ea30afc2a03c 100644
--- a/fs/hfsplus/bitmap.c
+++ b/fs/hfsplus/bitmap.c
@@ -32,6 +32,10 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset, u32 *ma
mutex_lock(&HFSPLUS_SB(sb).alloc_file->i_mutex);
mapping = HFSPLUS_SB(sb).alloc_file->i_mapping;
page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS, NULL);
+ if (IS_ERR(page)) {
+ start = size;
+ goto out;
+ }
pptr = kmap(page);
curr = pptr + (offset & (PAGE_CACHE_BITS - 1)) / 32;
i = offset % 32;
@@ -73,6 +77,10 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset, u32 *ma
break;
page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS,
NULL);
+ if (IS_ERR(page)) {
+ start = size;
+ goto out;
+ }
curr = pptr = kmap(page);
if ((size ^ offset) / PAGE_CACHE_BITS)
end = pptr + PAGE_CACHE_BITS / 32;
@@ -120,6 +128,10 @@ found:
offset += PAGE_CACHE_BITS;
page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS,
NULL);
+ if (IS_ERR(page)) {
+ start = size;
+ goto out;
+ }
pptr = kmap(page);
curr = pptr;
end = pptr + PAGE_CACHE_BITS / 32;
diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c
index ba117c445e78..f6874acb2cf2 100644
--- a/fs/hfsplus/catalog.c
+++ b/fs/hfsplus/catalog.c
@@ -168,6 +168,11 @@ int hfsplus_find_cat(struct super_block *sb, u32 cnid,
return -EIO;
}
+ if (be16_to_cpu(tmp.thread.nodeName.length) > 255) {
+ printk(KERN_ERR "hfs: catalog name length corrupted\n");
+ return -EIO;
+ }
+
hfsplus_cat_build_key_uni(fd->search_key, be32_to_cpu(tmp.thread.parentID),
&tmp.thread.nodeName);
return hfs_brec_find(fd);
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index e834e578c93f..eb74531a0a8e 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -356,7 +356,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
} else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) {
printk(KERN_WARNING "hfs: Filesystem is marked locked, mounting read-only.\n");
sb->s_flags |= MS_RDONLY;
- } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_JOURNALED)) {
+ } else if ((vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_JOURNALED)) && !(sb->s_flags & MS_RDONLY)) {
printk(KERN_WARNING "hfs: write access to a journaled filesystem is not supported, "
"use the force option at your own risk, mounting read-only.\n");
sb->s_flags |= MS_RDONLY;
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 0abe02c4242a..8b119e16aa36 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -995,6 +995,9 @@ restart_loop:
}
spin_unlock(&journal->j_list_lock);
+ if (journal->j_commit_callback)
+ journal->j_commit_callback(journal, commit_transaction);
+
trace_mark(jbd2_end_commit, "dev %s transaction %d head %d",
journal->j_devname, commit_transaction->t_tid,
journal->j_tail_sequence);
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index e5d540588fa9..39b7805a599a 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -52,6 +52,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
transaction->t_expires = jiffies + journal->j_commit_interval;
spin_lock_init(&transaction->t_handle_lock);
INIT_LIST_HEAD(&transaction->t_inode_list);
+ INIT_LIST_HEAD(&transaction->t_private_list);
/* Set up the commit timer for the new transaction. */
journal->j_commit_timer.expires = round_jiffies(transaction->t_expires);
diff --git a/fs/lockd/Makefile b/fs/lockd/Makefile
index 7725a0a9a555..97f6073ab339 100644
--- a/fs/lockd/Makefile
+++ b/fs/lockd/Makefile
@@ -5,6 +5,6 @@
obj-$(CONFIG_LOCKD) += lockd.o
lockd-objs-y := clntlock.o clntproc.o host.o svc.o svclock.o svcshare.o \
- svcproc.o svcsubs.o mon.o xdr.o
+ svcproc.o svcsubs.o mon.o xdr.o grace.o
lockd-objs-$(CONFIG_LOCKD_V4) += xdr4.o svc4proc.o
lockd-objs := $(lockd-objs-y)
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index 0b45fd3a4bfd..8307dd64bf46 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -54,14 +54,13 @@ struct nlm_host *nlmclnt_init(const struct nlmclnt_initdata *nlm_init)
u32 nlm_version = (nlm_init->nfs_version == 2) ? 1 : 4;
int status;
- status = lockd_up(nlm_init->protocol);
+ status = lockd_up();
if (status < 0)
return ERR_PTR(status);
- host = nlmclnt_lookup_host((struct sockaddr_in *)nlm_init->address,
+ host = nlmclnt_lookup_host(nlm_init->address, nlm_init->addrlen,
nlm_init->protocol, nlm_version,
- nlm_init->hostname,
- strlen(nlm_init->hostname));
+ nlm_init->hostname);
if (host == NULL) {
lockd_down();
return ERR_PTR(-ENOLCK);
@@ -142,7 +141,7 @@ int nlmclnt_block(struct nlm_wait *block, struct nlm_rqst *req, long timeout)
/*
* The server lockd has called us back to tell us the lock was granted
*/
-__be32 nlmclnt_grant(const struct sockaddr_in *addr, const struct nlm_lock *lock)
+__be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock)
{
const struct file_lock *fl = &lock->fl;
const struct nfs_fh *fh = &lock->fh;
@@ -166,7 +165,7 @@ __be32 nlmclnt_grant(const struct sockaddr_in *addr, const struct nlm_lock *lock
*/
if (fl_blocked->fl_u.nfs_fl.owner->pid != lock->svid)
continue;
- if (!nlm_cmp_addr(&block->b_host->h_addr, addr))
+ if (!nlm_cmp_addr(nlm_addr(block->b_host), addr))
continue;
if (nfs_compare_fh(NFS_FH(fl_blocked->fl_file->f_path.dentry->d_inode) ,fh) != 0)
continue;
@@ -216,7 +215,7 @@ reclaimer(void *ptr)
/* This one ensures that our parent doesn't terminate while the
* reclaim is in progress */
lock_kernel();
- lockd_up(0); /* note: this cannot fail as lockd is already running */
+ lockd_up(); /* note: this cannot fail as lockd is already running */
dprintk("lockd: reclaiming locks for host %s\n", host->h_name);
diff --git a/fs/lockd/grace.c b/fs/lockd/grace.c
new file mode 100644
index 000000000000..183cc1f0af1c
--- /dev/null
+++ b/fs/lockd/grace.c
@@ -0,0 +1,59 @@
+/*
+ * Common code for control of lockd and nfsv4 grace periods.
+ */
+
+#include <linux/module.h>
+#include <linux/lockd/bind.h>
+
+static LIST_HEAD(grace_list);
+static DEFINE_SPINLOCK(grace_lock);
+
+/**
+ * locks_start_grace
+ * @lm: who this grace period is for
+ *
+ * A grace period is a period during which locks should not be given
+ * out. Currently grace periods are only enforced by the two lock
+ * managers (lockd and nfsd), using the locks_in_grace() function to
+ * check when they are in a grace period.
+ *
+ * This function is called to start a grace period.
+ */
+void locks_start_grace(struct lock_manager *lm)
+{
+ spin_lock(&grace_lock);
+ list_add(&lm->list, &grace_list);
+ spin_unlock(&grace_lock);
+}
+EXPORT_SYMBOL_GPL(locks_start_grace);
+
+/**
+ * locks_end_grace
+ * @lm: who this grace period is for
+ *
+ * Call this function to state that the given lock manager is ready to
+ * resume regular locking. The grace period will not end until all lock
+ * managers that called locks_start_grace() also call locks_end_grace().
+ * Note that callers count on it being safe to call this more than once,
+ * and the second call should be a no-op.
+ */
+void locks_end_grace(struct lock_manager *lm)
+{
+ spin_lock(&grace_lock);
+ list_del_init(&lm->list);
+ spin_unlock(&grace_lock);
+}
+EXPORT_SYMBOL_GPL(locks_end_grace);
+
+/**
+ * locks_in_grace
+ *
+ * Lock managers call this function to determine when it is OK for them
+ * to answer ordinary lock requests, and when they should accept only
+ * lock reclaims.
+ */
+int locks_in_grace(void)
+{
+ return !list_empty(&grace_list);
+}
+EXPORT_SYMBOL_GPL(locks_in_grace);
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index a17664c7eacc..9fd8889097b7 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -11,16 +11,17 @@
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/in.h>
+#include <linux/in6.h>
#include <linux/sunrpc/clnt.h>
#include <linux/sunrpc/svc.h>
#include <linux/lockd/lockd.h>
#include <linux/lockd/sm_inter.h>
#include <linux/mutex.h>
+#include <net/ipv6.h>
#define NLMDBG_FACILITY NLMDBG_HOSTCACHE
#define NLM_HOST_NRHASH 32
-#define NLM_ADDRHASH(addr) (ntohl(addr) & (NLM_HOST_NRHASH-1))
#define NLM_HOST_REBIND (60 * HZ)
#define NLM_HOST_EXPIRE (300 * HZ)
#define NLM_HOST_COLLECT (120 * HZ)
@@ -30,42 +31,115 @@ static unsigned long next_gc;
static int nrhosts;
static DEFINE_MUTEX(nlm_host_mutex);
-
static void nlm_gc_hosts(void);
-static struct nsm_handle * __nsm_find(const struct sockaddr_in *,
- const char *, unsigned int, int);
-static struct nsm_handle * nsm_find(const struct sockaddr_in *sin,
- const char *hostname,
- unsigned int hostname_len);
+static struct nsm_handle *nsm_find(const struct sockaddr *sap,
+ const size_t salen,
+ const char *hostname,
+ const size_t hostname_len,
+ const int create);
+
+struct nlm_lookup_host_info {
+ const int server; /* search for server|client */
+ const struct sockaddr *sap; /* address to search for */
+ const size_t salen; /* it's length */
+ const unsigned short protocol; /* transport to search for*/
+ const u32 version; /* NLM version to search for */
+ const char *hostname; /* remote's hostname */
+ const size_t hostname_len; /* it's length */
+ const struct sockaddr *src_sap; /* our address (optional) */
+ const size_t src_len; /* it's length */
+};
+
+/*
+ * Hash function must work well on big- and little-endian platforms
+ */
+static unsigned int __nlm_hash32(const __be32 n)
+{
+ unsigned int hash = (__force u32)n ^ ((__force u32)n >> 16);
+ return hash ^ (hash >> 8);
+}
+
+static unsigned int __nlm_hash_addr4(const struct sockaddr *sap)
+{
+ const struct sockaddr_in *sin = (struct sockaddr_in *)sap;
+ return __nlm_hash32(sin->sin_addr.s_addr);
+}
+
+static unsigned int __nlm_hash_addr6(const struct sockaddr *sap)
+{
+ const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
+ const struct in6_addr addr = sin6->sin6_addr;
+ return __nlm_hash32(addr.s6_addr32[0]) ^
+ __nlm_hash32(addr.s6_addr32[1]) ^
+ __nlm_hash32(addr.s6_addr32[2]) ^
+ __nlm_hash32(addr.s6_addr32[3]);
+}
+
+static unsigned int nlm_hash_address(const struct sockaddr *sap)
+{
+ unsigned int hash;
+
+ switch (sap->sa_family) {
+ case AF_INET:
+ hash = __nlm_hash_addr4(sap);
+ break;
+ case AF_INET6:
+ hash = __nlm_hash_addr6(sap);
+ break;
+ default:
+ hash = 0;
+ }
+ return hash & (NLM_HOST_NRHASH - 1);
+}
+
+static void nlm_clear_port(struct sockaddr *sap)
+{
+ switch (sap->sa_family) {
+ case AF_INET:
+ ((struct sockaddr_in *)sap)->sin_port = 0;
+ break;
+ case AF_INET6:
+ ((struct sockaddr_in6 *)sap)->sin6_port = 0;
+ break;
+ }
+}
+
+static void nlm_display_address(const struct sockaddr *sap,
+ char *buf, const size_t len)
+{
+ const struct sockaddr_in *sin = (struct sockaddr_in *)sap;
+ const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
+
+ switch (sap->sa_family) {
+ case AF_UNSPEC:
+ snprintf(buf, len, "unspecified");
+ break;
+ case AF_INET:
+ snprintf(buf, len, NIPQUAD_FMT, NIPQUAD(sin->sin_addr.s_addr));
+ break;
+ case AF_INET6:
+ if (ipv6_addr_v4mapped(&sin6->sin6_addr))
+ snprintf(buf, len, NIPQUAD_FMT,
+ NIPQUAD(sin6->sin6_addr.s6_addr32[3]));
+ else
+ snprintf(buf, len, NIP6_FMT, NIP6(sin6->sin6_addr));
+ break;
+ default:
+ snprintf(buf, len, "unsupported address family");
+ break;
+ }
+}
/*
* Common host lookup routine for server & client
*/
-static struct nlm_host *nlm_lookup_host(int server,
- const struct sockaddr_in *sin,
- int proto, u32 version,
- const char *hostname,
- unsigned int hostname_len,
- const struct sockaddr_in *ssin)
+static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
{
struct hlist_head *chain;
struct hlist_node *pos;
struct nlm_host *host;
struct nsm_handle *nsm = NULL;
- int hash;
-
- dprintk("lockd: nlm_lookup_host("NIPQUAD_FMT"->"NIPQUAD_FMT
- ", p=%d, v=%u, my role=%s, name=%.*s)\n",
- NIPQUAD(ssin->sin_addr.s_addr),
- NIPQUAD(sin->sin_addr.s_addr), proto, version,
- server? "server" : "client",
- hostname_len,
- hostname? hostname : "<none>");
-
- hash = NLM_ADDRHASH(sin->sin_addr.s_addr);
-
- /* Lock hash table */
mutex_lock(&nlm_host_mutex);
if (time_after_eq(jiffies, next_gc))
@@ -78,22 +152,22 @@ static struct nlm_host *nlm_lookup_host(int server,
* different NLM rpc_clients into one single nlm_host object.
* This would allow us to have one nlm_host per address.
*/
- chain = &nlm_hosts[hash];
+ chain = &nlm_hosts[nlm_hash_address(ni->sap)];
hlist_for_each_entry(host, pos, chain, h_hash) {
- if (!nlm_cmp_addr(&host->h_addr, sin))
+ if (!nlm_cmp_addr(nlm_addr(host), ni->sap))
continue;
/* See if we have an NSM handle for this client */
if (!nsm)
nsm = host->h_nsmhandle;
- if (host->h_proto != proto)
+ if (host->h_proto != ni->protocol)
continue;
- if (host->h_version != version)
+ if (host->h_version != ni->version)
continue;
- if (host->h_server != server)
+ if (host->h_server != ni->server)
continue;
- if (!nlm_cmp_addr(&host->h_saddr, ssin))
+ if (!nlm_cmp_addr(nlm_srcaddr(host), ni->src_sap))
continue;
/* Move to head of hash chain. */
@@ -101,30 +175,41 @@ static struct nlm_host *nlm_lookup_host(int server,
hlist_add_head(&host->h_hash, chain);
nlm_get_host(host);
+ dprintk("lockd: nlm_lookup_host found host %s (%s)\n",
+ host->h_name, host->h_addrbuf);
goto out;
}
- if (nsm)
- atomic_inc(&nsm->sm_count);
-
- host = NULL;
- /* Sadly, the host isn't in our hash table yet. See if
- * we have an NSM handle for it. If not, create one.
+ /*
+ * The host wasn't in our hash table. If we don't
+ * have an NSM handle for it yet, create one.
*/
- if (!nsm && !(nsm = nsm_find(sin, hostname, hostname_len)))
- goto out;
+ if (nsm)
+ atomic_inc(&nsm->sm_count);
+ else {
+ host = NULL;
+ nsm = nsm_find(ni->sap, ni->salen,
+ ni->hostname, ni->hostname_len, 1);
+ if (!nsm) {
+ dprintk("lockd: nlm_lookup_host failed; "
+ "no nsm handle\n");
+ goto out;
+ }
+ }
host = kzalloc(sizeof(*host), GFP_KERNEL);
if (!host) {
nsm_release(nsm);
+ dprintk("lockd: nlm_lookup_host failed; no memory\n");
goto out;
}
host->h_name = nsm->sm_name;
- host->h_addr = *sin;
- host->h_addr.sin_port = 0; /* ouch! */
- host->h_saddr = *ssin;
- host->h_version = version;
- host->h_proto = proto;
+ memcpy(nlm_addr(host), ni->sap, ni->salen);
+ host->h_addrlen = ni->salen;
+ nlm_clear_port(nlm_addr(host));
+ memcpy(nlm_srcaddr(host), ni->src_sap, ni->src_len);
+ host->h_version = ni->version;
+ host->h_proto = ni->protocol;
host->h_rpcclnt = NULL;
mutex_init(&host->h_mutex);
host->h_nextrebind = jiffies + NLM_HOST_REBIND;
@@ -135,7 +220,7 @@ static struct nlm_host *nlm_lookup_host(int server,
host->h_state = 0; /* pseudo NSM state */
host->h_nsmstate = 0; /* real NSM state */
host->h_nsmhandle = nsm;
- host->h_server = server;
+ host->h_server = ni->server;
hlist_add_head(&host->h_hash, chain);
INIT_LIST_HEAD(&host->h_lockowners);
spin_lock_init(&host->h_lock);
@@ -143,6 +228,15 @@ static struct nlm_host *nlm_lookup_host(int server,
INIT_LIST_HEAD(&host->h_reclaim);
nrhosts++;
+
+ nlm_display_address((struct sockaddr *)&host->h_addr,
+ host->h_addrbuf, sizeof(host->h_addrbuf));
+ nlm_display_address((struct sockaddr *)&host->h_srcaddr,
+ host->h_srcaddrbuf, sizeof(host->h_srcaddrbuf));
+
+ dprintk("lockd: nlm_lookup_host created host %s\n",
+ host->h_name);
+
out:
mutex_unlock(&nlm_host_mutex);
return host;
@@ -170,33 +264,103 @@ nlm_destroy_host(struct nlm_host *host)
kfree(host);
}
-/*
- * Find an NLM server handle in the cache. If there is none, create it.
+/**
+ * nlmclnt_lookup_host - Find an NLM host handle matching a remote server
+ * @sap: network address of server
+ * @salen: length of server address
+ * @protocol: transport protocol to use
+ * @version: NLM protocol version
+ * @hostname: '\0'-terminated hostname of server
+ *
+ * Returns an nlm_host structure that matches the passed-in
+ * [server address, transport protocol, NLM version, server hostname].
+ * If one doesn't already exist in the host cache, a new handle is
+ * created and returned.
*/
-struct nlm_host *nlmclnt_lookup_host(const struct sockaddr_in *sin,
- int proto, u32 version,
- const char *hostname,
- unsigned int hostname_len)
+struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap,
+ const size_t salen,
+ const unsigned short protocol,
+ const u32 version, const char *hostname)
{
- struct sockaddr_in ssin = {0};
-
- return nlm_lookup_host(0, sin, proto, version,
- hostname, hostname_len, &ssin);
+ const struct sockaddr source = {
+ .sa_family = AF_UNSPEC,
+ };
+ struct nlm_lookup_host_info ni = {
+ .server = 0,
+ .sap = sap,
+ .salen = salen,
+ .protocol = protocol,
+ .version = version,
+ .hostname = hostname,
+ .hostname_len = strlen(hostname),
+ .src_sap = &source,
+ .src_len = sizeof(source),
+ };
+
+ dprintk("lockd: %s(host='%s', vers=%u, proto=%s)\n", __func__,
+ (hostname ? hostname : "<none>"), version,
+ (protocol == IPPROTO_UDP ? "udp" : "tcp"));
+
+ return nlm_lookup_host(&ni);
}
-/*
- * Find an NLM client handle in the cache. If there is none, create it.
+/**
+ * nlmsvc_lookup_host - Find an NLM host handle matching a remote client
+ * @rqstp: incoming NLM request
+ * @hostname: name of client host
+ * @hostname_len: length of client hostname
+ *
+ * Returns an nlm_host structure that matches the [client address,
+ * transport protocol, NLM version, client hostname] of the passed-in
+ * NLM request. If one doesn't already exist in the host cache, a
+ * new handle is created and returned.
+ *
+ * Before possibly creating a new nlm_host, construct a sockaddr
+ * for a specific source address in case the local system has
+ * multiple network addresses. The family of the address in
+ * rq_daddr is guaranteed to be the same as the family of the
+ * address in rq_addr, so it's safe to use the same family for
+ * the source address.
*/
-struct nlm_host *
-nlmsvc_lookup_host(struct svc_rqst *rqstp,
- const char *hostname, unsigned int hostname_len)
+struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp,
+ const char *hostname,
+ const size_t hostname_len)
{
- struct sockaddr_in ssin = {0};
+ struct sockaddr_in sin = {
+ .sin_family = AF_INET,
+ };
+ struct sockaddr_in6 sin6 = {
+ .sin6_family = AF_INET6,
+ };
+ struct nlm_lookup_host_info ni = {
+ .server = 1,
+ .sap = svc_addr(rqstp),
+ .salen = rqstp->rq_addrlen,
+ .protocol = rqstp->rq_prot,
+ .version = rqstp->rq_vers,
+ .hostname = hostname,
+ .hostname_len = hostname_len,
+ .src_len = rqstp->rq_addrlen,
+ };
+
+ dprintk("lockd: %s(host='%*s', vers=%u, proto=%s)\n", __func__,
+ (int)hostname_len, hostname, rqstp->rq_vers,
+ (rqstp->rq_prot == IPPROTO_UDP ? "udp" : "tcp"));
+
+ switch (ni.sap->sa_family) {
+ case AF_INET:
+ sin.sin_addr.s_addr = rqstp->rq_daddr.addr.s_addr;
+ ni.src_sap = (struct sockaddr *)&sin;
+ break;
+ case AF_INET6:
+ ipv6_addr_copy(&sin6.sin6_addr, &rqstp->rq_daddr.addr6);
+ ni.src_sap = (struct sockaddr *)&sin6;
+ break;
+ default:
+ return NULL;
+ }
- ssin.sin_addr = rqstp->rq_daddr.addr;
- return nlm_lookup_host(1, svc_addr_in(rqstp),
- rqstp->rq_prot, rqstp->rq_vers,
- hostname, hostname_len, &ssin);
+ return nlm_lookup_host(&ni);
}
/*
@@ -207,9 +371,8 @@ nlm_bind_host(struct nlm_host *host)
{
struct rpc_clnt *clnt;
- dprintk("lockd: nlm_bind_host("NIPQUAD_FMT"->"NIPQUAD_FMT")\n",
- NIPQUAD(host->h_saddr.sin_addr),
- NIPQUAD(host->h_addr.sin_addr));
+ dprintk("lockd: nlm_bind_host %s (%s), my addr=%s\n",
+ host->h_name, host->h_addrbuf, host->h_srcaddrbuf);
/* Lock host handle */
mutex_lock(&host->h_mutex);
@@ -221,7 +384,7 @@ nlm_bind_host(struct nlm_host *host)
if (time_after_eq(jiffies, host->h_nextrebind)) {
rpc_force_rebind(clnt);
host->h_nextrebind = jiffies + NLM_HOST_REBIND;
- dprintk("lockd: next rebind in %ld jiffies\n",
+ dprintk("lockd: next rebind in %lu jiffies\n",
host->h_nextrebind - jiffies);
}
} else {
@@ -234,9 +397,9 @@ nlm_bind_host(struct nlm_host *host)
};
struct rpc_create_args args = {
.protocol = host->h_proto,
- .address = (struct sockaddr *)&host->h_addr,
- .addrsize = sizeof(host->h_addr),
- .saddress = (struct sockaddr *)&host->h_saddr,
+ .address = nlm_addr(host),
+ .addrsize = host->h_addrlen,
+ .saddress = nlm_srcaddr(host),
.timeout = &timeparms,
.servername = host->h_name,
.program = &nlm_program,
@@ -324,12 +487,16 @@ void nlm_host_rebooted(const struct sockaddr_in *sin,
struct nsm_handle *nsm;
struct nlm_host *host;
- dprintk("lockd: nlm_host_rebooted(%s, %u.%u.%u.%u)\n",
- hostname, NIPQUAD(sin->sin_addr));
-
- /* Find the NSM handle for this peer */
- if (!(nsm = __nsm_find(sin, hostname, hostname_len, 0)))
+ nsm = nsm_find((struct sockaddr *)sin, sizeof(*sin),
+ hostname, hostname_len, 0);
+ if (nsm == NULL) {
+ dprintk("lockd: never saw rebooted peer '%.*s' before\n",
+ hostname_len, hostname);
return;
+ }
+
+ dprintk("lockd: nlm_host_rebooted(%.*s, %s)\n",
+ hostname_len, hostname, nsm->sm_addrbuf);
/* When reclaiming locks on this peer, make sure that
* we set up a new notification */
@@ -461,22 +628,23 @@ nlm_gc_hosts(void)
static LIST_HEAD(nsm_handles);
static DEFINE_SPINLOCK(nsm_lock);
-static struct nsm_handle *
-__nsm_find(const struct sockaddr_in *sin,
- const char *hostname, unsigned int hostname_len,
- int create)
+static struct nsm_handle *nsm_find(const struct sockaddr *sap,
+ const size_t salen,
+ const char *hostname,
+ const size_t hostname_len,
+ const int create)
{
struct nsm_handle *nsm = NULL;
struct nsm_handle *pos;
- if (!sin)
+ if (!sap)
return NULL;
if (hostname && memchr(hostname, '/', hostname_len) != NULL) {
if (printk_ratelimit()) {
printk(KERN_WARNING "Invalid hostname \"%.*s\" "
"in NFS lock request\n",
- hostname_len, hostname);
+ (int)hostname_len, hostname);
}
return NULL;
}
@@ -489,7 +657,7 @@ retry:
if (strlen(pos->sm_name) != hostname_len
|| memcmp(pos->sm_name, hostname, hostname_len))
continue;
- } else if (!nlm_cmp_addr(&pos->sm_addr, sin))
+ } else if (!nlm_cmp_addr(nsm_addr(pos), sap))
continue;
atomic_inc(&pos->sm_count);
kfree(nsm);
@@ -509,10 +677,13 @@ retry:
if (nsm == NULL)
return NULL;
- nsm->sm_addr = *sin;
+ memcpy(nsm_addr(nsm), sap, salen);
+ nsm->sm_addrlen = salen;
nsm->sm_name = (char *) (nsm + 1);
memcpy(nsm->sm_name, hostname, hostname_len);
nsm->sm_name[hostname_len] = '\0';
+ nlm_display_address((struct sockaddr *)&nsm->sm_addr,
+ nsm->sm_addrbuf, sizeof(nsm->sm_addrbuf));
atomic_set(&nsm->sm_count, 1);
goto retry;
@@ -521,13 +692,6 @@ found:
return nsm;
}
-static struct nsm_handle *
-nsm_find(const struct sockaddr_in *sin, const char *hostname,
- unsigned int hostname_len)
-{
- return __nsm_find(sin, hostname, hostname_len, 1);
-}
-
/*
* Release an NSM handle
*/
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index e4d563543b11..4e7e958e8f67 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -51,7 +51,7 @@ nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res)
memset(&args, 0, sizeof(args));
args.mon_name = nsm->sm_name;
- args.addr = nsm->sm_addr.sin_addr.s_addr;
+ args.addr = nsm_addr_in(nsm)->sin_addr.s_addr;
args.prog = NLM_PROGRAM;
args.vers = 3;
args.proc = NLMPROC_NSM_NOTIFY;
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 5bd9bf0fa9df..c631a83931ce 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -51,7 +51,6 @@ static DEFINE_MUTEX(nlmsvc_mutex);
static unsigned int nlmsvc_users;
static struct task_struct *nlmsvc_task;
static struct svc_rqst *nlmsvc_rqst;
-int nlmsvc_grace_period;
unsigned long nlmsvc_timeout;
/*
@@ -85,27 +84,23 @@ static unsigned long get_lockd_grace_period(void)
return nlm_timeout * 5 * HZ;
}
-unsigned long get_nfs_grace_period(void)
-{
- unsigned long lockdgrace = get_lockd_grace_period();
- unsigned long nfsdgrace = 0;
-
- if (nlmsvc_ops)
- nfsdgrace = nlmsvc_ops->get_grace_period();
-
- return max(lockdgrace, nfsdgrace);
-}
-EXPORT_SYMBOL(get_nfs_grace_period);
+static struct lock_manager lockd_manager = {
+};
-static unsigned long set_grace_period(void)
+static void grace_ender(struct work_struct *not_used)
{
- nlmsvc_grace_period = 1;
- return get_nfs_grace_period() + jiffies;
+ locks_end_grace(&lockd_manager);
}
-static inline void clear_grace_period(void)
+static DECLARE_DELAYED_WORK(grace_period_end, grace_ender);
+
+static void set_grace_period(void)
{
- nlmsvc_grace_period = 0;
+ unsigned long grace_period = get_lockd_grace_period();
+
+ locks_start_grace(&lockd_manager);
+ cancel_delayed_work_sync(&grace_period_end);
+ schedule_delayed_work(&grace_period_end, grace_period);
}
/*
@@ -116,7 +111,6 @@ lockd(void *vrqstp)
{
int err = 0, preverr = 0;
struct svc_rqst *rqstp = vrqstp;
- unsigned long grace_period_expire;
/* try_to_freeze() is called from svc_recv() */
set_freezable();
@@ -139,7 +133,7 @@ lockd(void *vrqstp)
nlm_timeout = LOCKD_DFLT_TIMEO;
nlmsvc_timeout = nlm_timeout * HZ;
- grace_period_expire = set_grace_period();
+ set_grace_period();
/*
* The main request loop. We don't terminate until the last
@@ -153,21 +147,12 @@ lockd(void *vrqstp)
flush_signals(current);
if (nlmsvc_ops) {
nlmsvc_invalidate_all();
- grace_period_expire = set_grace_period();
+ set_grace_period();
}
continue;
}
- /*
- * Retry any blocked locks that have been notified by
- * the VFS. Don't do this during grace period.
- * (Theoretically, there shouldn't even be blocked locks
- * during grace period).
- */
- if (!nlmsvc_grace_period) {
- timeout = nlmsvc_retry_blocked();
- } else if (time_before(grace_period_expire, jiffies))
- clear_grace_period();
+ timeout = nlmsvc_retry_blocked();
/*
* Find a socket with data available and call its
@@ -195,6 +180,7 @@ lockd(void *vrqstp)
svc_process(rqstp);
}
flush_signals(current);
+ cancel_delayed_work_sync(&grace_period_end);
if (nlmsvc_ops)
nlmsvc_invalidate_all();
nlm_shutdown_hosts();
@@ -203,25 +189,28 @@ lockd(void *vrqstp)
}
/*
- * Make any sockets that are needed but not present.
- * If nlm_udpport or nlm_tcpport were set as module
- * options, make those sockets unconditionally
+ * Ensure there are active UDP and TCP listeners for lockd.
+ *
+ * Even if we have only TCP NFS mounts and/or TCP NFSDs, some
+ * local services (such as rpc.statd) still require UDP, and
+ * some NFS servers do not yet support NLM over TCP.
+ *
+ * Returns zero if all listeners are available; otherwise a
+ * negative errno value is returned.
*/
-static int make_socks(struct svc_serv *serv, int proto)
+static int make_socks(struct svc_serv *serv)
{
static int warned;
struct svc_xprt *xprt;
int err = 0;
- if (proto == IPPROTO_UDP || nlm_udpport) {
- xprt = svc_find_xprt(serv, "udp", 0, 0);
- if (!xprt)
- err = svc_create_xprt(serv, "udp", nlm_udpport,
- SVC_SOCK_DEFAULTS);
- else
- svc_xprt_put(xprt);
- }
- if (err >= 0 && (proto == IPPROTO_TCP || nlm_tcpport)) {
+ xprt = svc_find_xprt(serv, "udp", 0, 0);
+ if (!xprt)
+ err = svc_create_xprt(serv, "udp", nlm_udpport,
+ SVC_SOCK_DEFAULTS);
+ else
+ svc_xprt_put(xprt);
+ if (err >= 0) {
xprt = svc_find_xprt(serv, "tcp", 0, 0);
if (!xprt)
err = svc_create_xprt(serv, "tcp", nlm_tcpport,
@@ -241,8 +230,7 @@ static int make_socks(struct svc_serv *serv, int proto)
/*
* Bring up the lockd process if it's not already up.
*/
-int
-lockd_up(int proto) /* Maybe add a 'family' option when IPv6 is supported ?? */
+int lockd_up(void)
{
struct svc_serv *serv;
int error = 0;
@@ -251,11 +239,8 @@ lockd_up(int proto) /* Maybe add a 'family' option when IPv6 is supported ?? */
/*
* Check whether we're already up and running.
*/
- if (nlmsvc_rqst) {
- if (proto)
- error = make_socks(nlmsvc_rqst->rq_server, proto);
+ if (nlmsvc_rqst)
goto out;
- }
/*
* Sanity check: if there's no pid,
@@ -266,13 +251,14 @@ lockd_up(int proto) /* Maybe add a 'family' option when IPv6 is supported ?? */
"lockd_up: no pid, %d users??\n", nlmsvc_users);
error = -ENOMEM;
- serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, NULL);
+ serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, AF_INET, NULL);
if (!serv) {
printk(KERN_WARNING "lockd_up: create service failed\n");
goto out;
}
- if ((error = make_socks(serv, proto)) < 0)
+ error = make_socks(serv);
+ if (error < 0)
goto destroy_and_out;
/*
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index 4a714f64515b..014f6ce48172 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -88,12 +88,6 @@ nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
dprintk("lockd: TEST4 called\n");
resp->cookie = argp->cookie;
- /* Don't accept test requests during grace period */
- if (nlmsvc_grace_period) {
- resp->status = nlm_lck_denied_grace_period;
- return rc;
- }
-
/* Obtain client and file */
if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file)))
return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
@@ -122,12 +116,6 @@ nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
resp->cookie = argp->cookie;
- /* Don't accept new lock requests during grace period */
- if (nlmsvc_grace_period && !argp->reclaim) {
- resp->status = nlm_lck_denied_grace_period;
- return rc;
- }
-
/* Obtain client and file */
if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file)))
return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
@@ -146,7 +134,8 @@ nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
/* Now try to lock the file */
resp->status = nlmsvc_lock(rqstp, file, host, &argp->lock,
- argp->block, &argp->cookie);
+ argp->block, &argp->cookie,
+ argp->reclaim);
if (resp->status == nlm_drop_reply)
rc = rpc_drop_reply;
else
@@ -169,7 +158,7 @@ nlm4svc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp,
resp->cookie = argp->cookie;
/* Don't accept requests during grace period */
- if (nlmsvc_grace_period) {
+ if (locks_in_grace()) {
resp->status = nlm_lck_denied_grace_period;
return rpc_success;
}
@@ -202,7 +191,7 @@ nlm4svc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp,
resp->cookie = argp->cookie;
/* Don't accept new lock requests during grace period */
- if (nlmsvc_grace_period) {
+ if (locks_in_grace()) {
resp->status = nlm_lck_denied_grace_period;
return rpc_success;
}
@@ -231,7 +220,7 @@ nlm4svc_proc_granted(struct svc_rqst *rqstp, struct nlm_args *argp,
resp->cookie = argp->cookie;
dprintk("lockd: GRANTED called\n");
- resp->status = nlmclnt_grant(svc_addr_in(rqstp), &argp->lock);
+ resp->status = nlmclnt_grant(svc_addr(rqstp), &argp->lock);
dprintk("lockd: GRANTED status %d\n", ntohl(resp->status));
return rpc_success;
}
@@ -341,7 +330,7 @@ nlm4svc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp,
resp->cookie = argp->cookie;
/* Don't accept new lock requests during grace period */
- if (nlmsvc_grace_period && !argp->reclaim) {
+ if (locks_in_grace() && !argp->reclaim) {
resp->status = nlm_lck_denied_grace_period;
return rpc_success;
}
@@ -374,7 +363,7 @@ nlm4svc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp,
resp->cookie = argp->cookie;
/* Don't accept requests during grace period */
- if (nlmsvc_grace_period) {
+ if (locks_in_grace()) {
resp->status = nlm_lck_denied_grace_period;
return rpc_success;
}
@@ -432,11 +421,9 @@ nlm4svc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
{
struct sockaddr_in saddr;
- memcpy(&saddr, svc_addr_in(rqstp), sizeof(saddr));
-
dprintk("lockd: SM_NOTIFY called\n");
- if (saddr.sin_addr.s_addr != htonl(INADDR_LOOPBACK)
- || ntohs(saddr.sin_port) >= 1024) {
+
+ if (!nlm_privileged_requester(rqstp)) {
char buf[RPC_MAX_ADDRBUFLEN];
printk(KERN_WARNING "lockd: rejected NSM callback from %s\n",
svc_print_addr(rqstp, buf, sizeof(buf)));
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index cf0d5c2c318d..6063a8e4b9f3 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -360,7 +360,7 @@ nlmsvc_defer_lock_rqst(struct svc_rqst *rqstp, struct nlm_block *block)
__be32
nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
struct nlm_host *host, struct nlm_lock *lock, int wait,
- struct nlm_cookie *cookie)
+ struct nlm_cookie *cookie, int reclaim)
{
struct nlm_block *block = NULL;
int error;
@@ -406,6 +406,15 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
goto out;
}
+ if (locks_in_grace() && !reclaim) {
+ ret = nlm_lck_denied_grace_period;
+ goto out;
+ }
+ if (reclaim && !locks_in_grace()) {
+ ret = nlm_lck_denied_grace_period;
+ goto out;
+ }
+
if (!wait)
lock->fl.fl_flags &= ~FL_SLEEP;
error = vfs_lock_file(file->f_file, F_SETLK, &lock->fl, NULL);
@@ -502,6 +511,10 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file,
goto out;
}
+ if (locks_in_grace()) {
+ ret = nlm_lck_denied_grace_period;
+ goto out;
+ }
error = vfs_test_lock(file->f_file, &lock->fl);
if (error == FILE_LOCK_DEFERRED) {
ret = nlmsvc_defer_lock_rqst(rqstp, block);
@@ -582,6 +595,9 @@ nlmsvc_cancel_blocked(struct nlm_file *file, struct nlm_lock *lock)
(long long)lock->fl.fl_start,
(long long)lock->fl.fl_end);
+ if (locks_in_grace())
+ return nlm_lck_denied_grace_period;
+
mutex_lock(&file->f_mutex);
block = nlmsvc_lookup_block(file, lock);
mutex_unlock(&file->f_mutex);
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 76262c1986f2..548b0bb2b84d 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -117,12 +117,6 @@ nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
dprintk("lockd: TEST called\n");
resp->cookie = argp->cookie;
- /* Don't accept test requests during grace period */
- if (nlmsvc_grace_period) {
- resp->status = nlm_lck_denied_grace_period;
- return rc;
- }
-
/* Obtain client and file */
if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file)))
return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
@@ -152,12 +146,6 @@ nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
resp->cookie = argp->cookie;
- /* Don't accept new lock requests during grace period */
- if (nlmsvc_grace_period && !argp->reclaim) {
- resp->status = nlm_lck_denied_grace_period;
- return rc;
- }
-
/* Obtain client and file */
if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file)))
return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
@@ -176,7 +164,8 @@ nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
/* Now try to lock the file */
resp->status = cast_status(nlmsvc_lock(rqstp, file, host, &argp->lock,
- argp->block, &argp->cookie));
+ argp->block, &argp->cookie,
+ argp->reclaim));
if (resp->status == nlm_drop_reply)
rc = rpc_drop_reply;
else
@@ -199,7 +188,7 @@ nlmsvc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp,
resp->cookie = argp->cookie;
/* Don't accept requests during grace period */
- if (nlmsvc_grace_period) {
+ if (locks_in_grace()) {
resp->status = nlm_lck_denied_grace_period;
return rpc_success;
}
@@ -232,7 +221,7 @@ nlmsvc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp,
resp->cookie = argp->cookie;
/* Don't accept new lock requests during grace period */
- if (nlmsvc_grace_period) {
+ if (locks_in_grace()) {
resp->status = nlm_lck_denied_grace_period;
return rpc_success;
}
@@ -261,7 +250,7 @@ nlmsvc_proc_granted(struct svc_rqst *rqstp, struct nlm_args *argp,
resp->cookie = argp->cookie;
dprintk("lockd: GRANTED called\n");
- resp->status = nlmclnt_grant(svc_addr_in(rqstp), &argp->lock);
+ resp->status = nlmclnt_grant(svc_addr(rqstp), &argp->lock);
dprintk("lockd: GRANTED status %d\n", ntohl(resp->status));
return rpc_success;
}
@@ -373,7 +362,7 @@ nlmsvc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp,
resp->cookie = argp->cookie;
/* Don't accept new lock requests during grace period */
- if (nlmsvc_grace_period && !argp->reclaim) {
+ if (locks_in_grace() && !argp->reclaim) {
resp->status = nlm_lck_denied_grace_period;
return rpc_success;
}
@@ -406,7 +395,7 @@ nlmsvc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp,
resp->cookie = argp->cookie;
/* Don't accept requests during grace period */
- if (nlmsvc_grace_period) {
+ if (locks_in_grace()) {
resp->status = nlm_lck_denied_grace_period;
return rpc_success;
}
@@ -464,11 +453,9 @@ nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
{
struct sockaddr_in saddr;
- memcpy(&saddr, svc_addr_in(rqstp), sizeof(saddr));
-
dprintk("lockd: SM_NOTIFY called\n");
- if (saddr.sin_addr.s_addr != htonl(INADDR_LOOPBACK)
- || ntohs(saddr.sin_port) >= 1024) {
+
+ if (!nlm_privileged_requester(rqstp)) {
char buf[RPC_MAX_ADDRBUFLEN];
printk(KERN_WARNING "lockd: rejected NSM callback from %s\n",
svc_print_addr(rqstp, buf, sizeof(buf)));
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index 198b4e55b373..34c2766e27c7 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -418,7 +418,7 @@ EXPORT_SYMBOL_GPL(nlmsvc_unlock_all_by_sb);
static int
nlmsvc_match_ip(void *datap, struct nlm_host *host)
{
- return nlm_cmp_addr(&host->h_saddr, datap);
+ return nlm_cmp_addr(nlm_srcaddr(host), datap);
}
/**
diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c
index 3e459e18cc31..1f226290c67c 100644
--- a/fs/lockd/xdr.c
+++ b/fs/lockd/xdr.c
@@ -351,8 +351,6 @@ nlmsvc_decode_reboot(struct svc_rqst *rqstp, __be32 *p, struct nlm_reboot *argp)
argp->state = ntohl(*p++);
/* Preserve the address in network byte order */
argp->addr = *p++;
- argp->vers = *p++;
- argp->proto = *p++;
return xdr_argsize_check(rqstp, p);
}
diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c
index 43ff9397e6c6..50c493a8ad8e 100644
--- a/fs/lockd/xdr4.c
+++ b/fs/lockd/xdr4.c
@@ -358,8 +358,6 @@ nlm4svc_decode_reboot(struct svc_rqst *rqstp, __be32 *p, struct nlm_reboot *argp
argp->state = ntohl(*p++);
/* Preserve the address in network byte order */
argp->addr = *p++;
- argp->vers = *p++;
- argp->proto = *p++;
return xdr_argsize_check(rqstp, p);
}
diff --git a/fs/mpage.c b/fs/mpage.c
index dbcc7af76a15..552b80b3facc 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -6,7 +6,7 @@
* Contains functions related to preparing and submitting BIOs which contain
* multiple pagecache pages.
*
- * 15May2002 akpm@zip.com.au
+ * 15May2002 Andrew Morton
* Initial version
* 27Jun2002 axboe@suse.de
* use bio_add_page() to build bio's just the right size
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index f447f4b4476c..6a09760c5960 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -105,7 +105,8 @@ int nfs_callback_up(void)
mutex_lock(&nfs_callback_mutex);
if (nfs_callback_info.users++ || nfs_callback_info.task != NULL)
goto out;
- serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, NULL);
+ serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE,
+ AF_INET, NULL);
ret = -ENOMEM;
if (!serv)
goto out_err;
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 5ee23e7058b3..7547600b6174 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -675,7 +675,7 @@ static int nfs_init_server(struct nfs_server *server,
server->nfs_client = clp;
/* Initialise the client representation from the mount data */
- server->flags = data->flags & NFS_MOUNT_FLAGMASK;
+ server->flags = data->flags;
if (data->rsize)
server->rsize = nfs_block_size(data->rsize, NULL);
@@ -850,7 +850,6 @@ static struct nfs_server *nfs_alloc_server(void)
INIT_LIST_HEAD(&server->client_link);
INIT_LIST_HEAD(&server->master_link);
- init_waitqueue_head(&server->active_wq);
atomic_set(&server->active, 0);
server->io_stats = nfs_alloc_iostats();
@@ -1073,7 +1072,7 @@ static int nfs4_init_server(struct nfs_server *server,
goto error;
/* Initialise the client representation from the mount data */
- server->flags = data->flags & NFS_MOUNT_FLAGMASK;
+ server->flags = data->flags;
server->caps |= NFS_CAP_ATOMIC_OPEN;
if (data->rsize)
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 74f92b717f78..2ab70d46ecbc 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -156,6 +156,7 @@ typedef struct {
decode_dirent_t decode;
int plus;
unsigned long timestamp;
+ unsigned long gencount;
int timestamp_valid;
} nfs_readdir_descriptor_t;
@@ -177,7 +178,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page)
struct file *file = desc->file;
struct inode *inode = file->f_path.dentry->d_inode;
struct rpc_cred *cred = nfs_file_cred(file);
- unsigned long timestamp;
+ unsigned long timestamp, gencount;
int error;
dfprintk(DIRCACHE, "NFS: %s: reading cookie %Lu into page %lu\n",
@@ -186,6 +187,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page)
again:
timestamp = jiffies;
+ gencount = nfs_inc_attr_generation_counter();
error = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, desc->entry->cookie, page,
NFS_SERVER(inode)->dtsize, desc->plus);
if (error < 0) {
@@ -199,6 +201,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page)
goto error;
}
desc->timestamp = timestamp;
+ desc->gencount = gencount;
desc->timestamp_valid = 1;
SetPageUptodate(page);
/* Ensure consistent page alignment of the data.
@@ -224,9 +227,10 @@ int dir_decode(nfs_readdir_descriptor_t *desc)
if (IS_ERR(p))
return PTR_ERR(p);
desc->ptr = p;
- if (desc->timestamp_valid)
+ if (desc->timestamp_valid) {
desc->entry->fattr->time_start = desc->timestamp;
- else
+ desc->entry->fattr->gencount = desc->gencount;
+ } else
desc->entry->fattr->valid &= ~NFS_ATTR_FATTR;
return 0;
}
@@ -471,7 +475,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
struct rpc_cred *cred = nfs_file_cred(file);
struct page *page = NULL;
int status;
- unsigned long timestamp;
+ unsigned long timestamp, gencount;
dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n",
(unsigned long long)*desc->dir_cookie);
@@ -482,6 +486,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
goto out;
}
timestamp = jiffies;
+ gencount = nfs_inc_attr_generation_counter();
status = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred,
*desc->dir_cookie, page,
NFS_SERVER(inode)->dtsize,
@@ -490,6 +495,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
desc->ptr = kmap(page); /* matching kunmap in nfs_do_filldir */
if (status >= 0) {
desc->timestamp = timestamp;
+ desc->gencount = gencount;
desc->timestamp_valid = 1;
if ((status = dir_decode(desc)) == 0)
desc->entry->prev_cookie = *desc->dir_cookie;
@@ -655,7 +661,7 @@ static int nfs_fsync_dir(struct file *filp, struct dentry *dentry, int datasync)
*/
void nfs_force_lookup_revalidate(struct inode *dir)
{
- NFS_I(dir)->cache_change_attribute = jiffies;
+ NFS_I(dir)->cache_change_attribute++;
}
/*
@@ -667,6 +673,8 @@ static int nfs_check_verifier(struct inode *dir, struct dentry *dentry)
{
if (IS_ROOT(dentry))
return 1;
+ if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONE)
+ return 0;
if (!nfs_verify_change_attribute(dir, dentry->d_time))
return 0;
/* Revalidate nfsi->cache_change_attribute before we declare a match */
@@ -750,6 +758,8 @@ int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry,
/* Don't revalidate a negative dentry if we're creating a new file */
if (nd != NULL && nfs_lookup_check_intent(nd, LOOKUP_CREATE) != 0)
return 0;
+ if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONEG)
+ return 1;
return !nfs_check_verifier(dir, dentry);
}
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 78460657f5cb..d319b49f8f06 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -188,13 +188,16 @@ static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin)
/* origin == SEEK_END => we must revalidate the cached file length */
if (origin == SEEK_END) {
struct inode *inode = filp->f_mapping->host;
+
int retval = nfs_revalidate_file_size(inode, filp);
if (retval < 0)
return (loff_t)retval;
- }
- lock_kernel(); /* BKL needed? */
- loff = generic_file_llseek_unlocked(filp, offset, origin);
- unlock_kernel();
+
+ spin_lock(&inode->i_lock);
+ loff = generic_file_llseek_unlocked(filp, offset, origin);
+ spin_unlock(&inode->i_lock);
+ } else
+ loff = generic_file_llseek_unlocked(filp, offset, origin);
return loff;
}
@@ -699,13 +702,6 @@ static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl)
filp->f_path.dentry->d_name.name,
fl->fl_type, fl->fl_flags);
- /*
- * No BSD flocks over NFS allowed.
- * Note: we could try to fake a POSIX lock request here by
- * using ((u32) filp | 0x80000000) or some such as the pid.
- * Not sure whether that would be unique, though, or whether
- * that would break in other places.
- */
if (!(fl->fl_flags & FL_FLOCK))
return -ENOLCK;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 52daefa2f521..b9195c02a863 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -305,8 +305,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
init_special_inode(inode, inode->i_mode, fattr->rdev);
nfsi->read_cache_jiffies = fattr->time_start;
- nfsi->last_updated = now;
- nfsi->cache_change_attribute = now;
+ nfsi->attr_gencount = fattr->gencount;
inode->i_atime = fattr->atime;
inode->i_mtime = fattr->mtime;
inode->i_ctime = fattr->ctime;
@@ -453,6 +452,7 @@ out_big:
void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr)
{
if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) {
+ spin_lock(&inode->i_lock);
if ((attr->ia_valid & ATTR_MODE) != 0) {
int mode = attr->ia_mode & S_IALLUGO;
mode |= inode->i_mode & ~S_IALLUGO;
@@ -462,7 +462,6 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr)
inode->i_uid = attr->ia_uid;
if ((attr->ia_valid & ATTR_GID) != 0)
inode->i_gid = attr->ia_gid;
- spin_lock(&inode->i_lock);
NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
spin_unlock(&inode->i_lock);
}
@@ -472,37 +471,6 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr)
}
}
-static int nfs_wait_schedule(void *word)
-{
- if (signal_pending(current))
- return -ERESTARTSYS;
- schedule();
- return 0;
-}
-
-/*
- * Wait for the inode to get unlocked.
- */
-static int nfs_wait_on_inode(struct inode *inode)
-{
- struct nfs_inode *nfsi = NFS_I(inode);
- int error;
-
- error = wait_on_bit_lock(&nfsi->flags, NFS_INO_REVALIDATING,
- nfs_wait_schedule, TASK_KILLABLE);
-
- return error;
-}
-
-static void nfs_wake_up_inode(struct inode *inode)
-{
- struct nfs_inode *nfsi = NFS_I(inode);
-
- clear_bit(NFS_INO_REVALIDATING, &nfsi->flags);
- smp_mb__after_clear_bit();
- wake_up_bit(&nfsi->flags, NFS_INO_REVALIDATING);
-}
-
int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
{
struct inode *inode = dentry->d_inode;
@@ -697,20 +665,15 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
dfprintk(PAGECACHE, "NFS: revalidating (%s/%Ld)\n",
inode->i_sb->s_id, (long long)NFS_FILEID(inode));
- nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE);
if (is_bad_inode(inode))
- goto out_nowait;
+ goto out;
if (NFS_STALE(inode))
- goto out_nowait;
-
- status = nfs_wait_on_inode(inode);
- if (status < 0)
goto out;
- status = -ESTALE;
if (NFS_STALE(inode))
goto out;
+ nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE);
status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), &fattr);
if (status != 0) {
dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) getattr failed, error=%d\n",
@@ -724,16 +687,13 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
goto out;
}
- spin_lock(&inode->i_lock);
- status = nfs_update_inode(inode, &fattr);
+ status = nfs_refresh_inode(inode, &fattr);
if (status) {
- spin_unlock(&inode->i_lock);
dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) refresh failed, error=%d\n",
inode->i_sb->s_id,
(long long)NFS_FILEID(inode), status);
goto out;
}
- spin_unlock(&inode->i_lock);
if (nfsi->cache_validity & NFS_INO_INVALID_ACL)
nfs_zap_acl_cache(inode);
@@ -743,9 +703,6 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
(long long)NFS_FILEID(inode));
out:
- nfs_wake_up_inode(inode);
-
- out_nowait:
return status;
}
@@ -908,9 +865,6 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
return -EIO;
}
- /* Do atomic weak cache consistency updates */
- nfs_wcc_update_inode(inode, fattr);
-
if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 &&
nfsi->change_attr != fattr->change_attr)
invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
@@ -939,15 +893,81 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
if (invalid != 0)
nfsi->cache_validity |= invalid;
- else
- nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR
- | NFS_INO_INVALID_ATIME
- | NFS_INO_REVAL_PAGECACHE);
nfsi->read_cache_jiffies = fattr->time_start;
return 0;
}
+static int nfs_ctime_need_update(const struct inode *inode, const struct nfs_fattr *fattr)
+{
+ return timespec_compare(&fattr->ctime, &inode->i_ctime) > 0;
+}
+
+static int nfs_size_need_update(const struct inode *inode, const struct nfs_fattr *fattr)
+{
+ return nfs_size_to_loff_t(fattr->size) > i_size_read(inode);
+}
+
+static unsigned long nfs_attr_generation_counter;
+
+static unsigned long nfs_read_attr_generation_counter(void)
+{
+ smp_rmb();
+ return nfs_attr_generation_counter;
+}
+
+unsigned long nfs_inc_attr_generation_counter(void)
+{
+ unsigned long ret;
+ smp_rmb();
+ ret = ++nfs_attr_generation_counter;
+ smp_wmb();
+ return ret;
+}
+
+void nfs_fattr_init(struct nfs_fattr *fattr)
+{
+ fattr->valid = 0;
+ fattr->time_start = jiffies;
+ fattr->gencount = nfs_inc_attr_generation_counter();
+}
+
+/**
+ * nfs_inode_attrs_need_update - check if the inode attributes need updating
+ * @inode - pointer to inode
+ * @fattr - attributes
+ *
+ * Attempt to divine whether or not an RPC call reply carrying stale
+ * attributes got scheduled after another call carrying updated ones.
+ *
+ * To do so, the function first assumes that a more recent ctime means
+ * that the attributes in fattr are newer, however it also attempt to
+ * catch the case where ctime either didn't change, or went backwards
+ * (if someone reset the clock on the server) by looking at whether
+ * or not this RPC call was started after the inode was last updated.
+ * Note also the check for wraparound of 'attr_gencount'
+ *
+ * The function returns 'true' if it thinks the attributes in 'fattr' are
+ * more recent than the ones cached in the inode.
+ *
+ */
+static int nfs_inode_attrs_need_update(const struct inode *inode, const struct nfs_fattr *fattr)
+{
+ const struct nfs_inode *nfsi = NFS_I(inode);
+
+ return ((long)fattr->gencount - (long)nfsi->attr_gencount) > 0 ||
+ nfs_ctime_need_update(inode, fattr) ||
+ nfs_size_need_update(inode, fattr) ||
+ ((long)nfsi->attr_gencount - (long)nfs_read_attr_generation_counter() > 0);
+}
+
+static int nfs_refresh_inode_locked(struct inode *inode, struct nfs_fattr *fattr)
+{
+ if (nfs_inode_attrs_need_update(inode, fattr))
+ return nfs_update_inode(inode, fattr);
+ return nfs_check_inode_attributes(inode, fattr);
+}
+
/**
* nfs_refresh_inode - try to update the inode attribute cache
* @inode - pointer to inode
@@ -960,21 +980,28 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
*/
int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr)
{
- struct nfs_inode *nfsi = NFS_I(inode);
int status;
if ((fattr->valid & NFS_ATTR_FATTR) == 0)
return 0;
spin_lock(&inode->i_lock);
- if (time_after(fattr->time_start, nfsi->last_updated))
- status = nfs_update_inode(inode, fattr);
- else
- status = nfs_check_inode_attributes(inode, fattr);
-
+ status = nfs_refresh_inode_locked(inode, fattr);
spin_unlock(&inode->i_lock);
return status;
}
+static int nfs_post_op_update_inode_locked(struct inode *inode, struct nfs_fattr *fattr)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+
+ nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
+ if (S_ISDIR(inode->i_mode))
+ nfsi->cache_validity |= NFS_INO_INVALID_DATA;
+ if ((fattr->valid & NFS_ATTR_FATTR) == 0)
+ return 0;
+ return nfs_refresh_inode_locked(inode, fattr);
+}
+
/**
* nfs_post_op_update_inode - try to update the inode attribute cache
* @inode - pointer to inode
@@ -991,14 +1018,12 @@ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr)
*/
int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr)
{
- struct nfs_inode *nfsi = NFS_I(inode);
+ int status;
spin_lock(&inode->i_lock);
- nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
- if (S_ISDIR(inode->i_mode))
- nfsi->cache_validity |= NFS_INO_INVALID_DATA;
+ status = nfs_post_op_update_inode_locked(inode, fattr);
spin_unlock(&inode->i_lock);
- return nfs_refresh_inode(inode, fattr);
+ return status;
}
/**
@@ -1014,6 +1039,15 @@ int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr)
*/
int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fattr)
{
+ int status;
+
+ spin_lock(&inode->i_lock);
+ /* Don't do a WCC update if these attributes are already stale */
+ if ((fattr->valid & NFS_ATTR_FATTR) == 0 ||
+ !nfs_inode_attrs_need_update(inode, fattr)) {
+ fattr->valid &= ~(NFS_ATTR_WCC_V4|NFS_ATTR_WCC);
+ goto out_noforce;
+ }
if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 &&
(fattr->valid & NFS_ATTR_WCC_V4) == 0) {
fattr->pre_change_attr = NFS_I(inode)->change_attr;
@@ -1026,7 +1060,10 @@ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fa
fattr->pre_size = i_size_read(inode);
fattr->valid |= NFS_ATTR_WCC;
}
- return nfs_post_op_update_inode(inode, fattr);
+out_noforce:
+ status = nfs_post_op_update_inode_locked(inode, fattr);
+ spin_unlock(&inode->i_lock);
+ return status;
}
/*
@@ -1092,7 +1129,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
}
/* If ctime has changed we should definitely clear access+acl caches */
if (!timespec_equal(&inode->i_ctime, &fattr->ctime))
- invalid |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
+ invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
} else if (nfsi->change_attr != fattr->change_attr) {
dprintk("NFS: change_attr change on server for file %s/%ld\n",
inode->i_sb->s_id, inode->i_ino);
@@ -1126,6 +1163,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
inode->i_gid != fattr->gid)
invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
+ if (inode->i_nlink != fattr->nlink)
+ invalid |= NFS_INO_INVALID_ATTR;
+
inode->i_mode = fattr->mode;
inode->i_nlink = fattr->nlink;
inode->i_uid = fattr->uid;
@@ -1145,18 +1185,13 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE);
nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
nfsi->attrtimeo_timestamp = now;
- nfsi->last_updated = now;
+ nfsi->attr_gencount = nfs_inc_attr_generation_counter();
} else {
if (!time_in_range(now, nfsi->attrtimeo_timestamp, nfsi->attrtimeo_timestamp + nfsi->attrtimeo)) {
if ((nfsi->attrtimeo <<= 1) > NFS_MAXATTRTIMEO(inode))
nfsi->attrtimeo = NFS_MAXATTRTIMEO(inode);
nfsi->attrtimeo_timestamp = now;
}
- /*
- * Avoid jiffy wraparound issues with nfsi->last_updated
- */
- if (!time_in_range(nfsi->last_updated, nfsi->read_cache_jiffies, now))
- nfsi->last_updated = nfsi->read_cache_jiffies;
}
invalid &= ~NFS_INO_INVALID_ATTR;
/* Don't invalidate the data if we were to blame */
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 24241fcbb98d..d212ee41caf2 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -153,6 +153,7 @@ extern void nfs4_clear_inode(struct inode *);
void nfs_zap_acl_cache(struct inode *inode);
/* super.c */
+void nfs_parse_ip_address(char *, size_t, struct sockaddr *, size_t *);
extern struct file_system_type nfs_xdev_fs_type;
#ifdef CONFIG_NFS_V4
extern struct file_system_type nfs4_xdev_fs_type;
@@ -163,8 +164,8 @@ extern struct rpc_stat nfs_rpcstat;
extern int __init register_nfs_fs(void);
extern void __exit unregister_nfs_fs(void);
-extern void nfs_sb_active(struct nfs_server *server);
-extern void nfs_sb_deactive(struct nfs_server *server);
+extern void nfs_sb_active(struct super_block *sb);
+extern void nfs_sb_deactive(struct super_block *sb);
/* namespace.c */
extern char *nfs_path(const char *base,
@@ -276,3 +277,23 @@ unsigned int nfs_page_array_len(unsigned int base, size_t len)
PAGE_SIZE - 1) >> PAGE_SHIFT;
}
+#define IPV6_SCOPE_DELIMITER '%'
+
+/*
+ * Set the port number in an address. Be agnostic about the address
+ * family.
+ */
+static inline void nfs_set_port(struct sockaddr *sap, unsigned short port)
+{
+ struct sockaddr_in *ap = (struct sockaddr_in *)sap;
+ struct sockaddr_in6 *ap6 = (struct sockaddr_in6 *)sap;
+
+ switch (sap->sa_family) {
+ case AF_INET:
+ ap->sin_port = htons(port);
+ break;
+ case AF_INET6:
+ ap6->sin6_port = htons(port);
+ break;
+ }
+}
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index 779d2eb649c5..086a6830d785 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -14,6 +14,7 @@
#include <linux/sunrpc/clnt.h>
#include <linux/sunrpc/sched.h>
#include <linux/nfs_fs.h>
+#include "internal.h"
#ifdef RPC_DEBUG
# define NFSDBG_FACILITY NFSDBG_MOUNT
@@ -98,7 +99,7 @@ out_call_err:
out_mnt_err:
dprintk("NFS: MNT server returned result %d\n", result.status);
- status = -EACCES;
+ status = nfs_stat_to_errno(result.status);
goto out;
}
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 66df08dd1caf..64a288ee046d 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -105,7 +105,10 @@ static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
dprintk("--> nfs_follow_mountpoint()\n");
- BUG_ON(IS_ROOT(dentry));
+ err = -ESTALE;
+ if (IS_ROOT(dentry))
+ goto out_err;
+
dprintk("%s: enter\n", __func__);
dput(nd->path.dentry);
nd->path.dentry = dget(dentry);
@@ -189,7 +192,7 @@ static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server,
struct nfs_clone_mount *mountdata)
{
#ifdef CONFIG_NFS_V4
- struct vfsmount *mnt = NULL;
+ struct vfsmount *mnt = ERR_PTR(-EINVAL);
switch (server->nfs_client->rpc_ops->version) {
case 2:
case 3:
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index 423842f51ac9..cef62557c87d 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -229,6 +229,7 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
dprintk("NFS call getacl\n");
msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_GETACL];
+ nfs_fattr_init(&fattr);
status = rpc_call_sync(server->client_acl, &msg, 0);
dprintk("NFS reply getacl: %d\n", status);
@@ -322,6 +323,7 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
dprintk("NFS call setacl\n");
msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_SETACL];
+ nfs_fattr_init(&fattr);
status = rpc_call_sync(server->client_acl, &msg, 0);
nfs_access_zap_cache(inode);
nfs_zap_acl_cache(inode);
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 1e750e4574a9..c55be7a7679e 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -699,7 +699,7 @@ nfs3_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle,
}
static int
-nfs3_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle,
+do_proc_fsinfo(struct rpc_clnt *client, struct nfs_fh *fhandle,
struct nfs_fsinfo *info)
{
struct rpc_message msg = {
@@ -711,11 +711,27 @@ nfs3_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle,
dprintk("NFS call fsinfo\n");
nfs_fattr_init(info->fattr);
- status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0);
+ status = rpc_call_sync(client, &msg, 0);
dprintk("NFS reply fsinfo: %d\n", status);
return status;
}
+/*
+ * Bare-bones access to fsinfo: this is for nfs_get_root/nfs_get_sb via
+ * nfs_create_server
+ */
+static int
+nfs3_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle,
+ struct nfs_fsinfo *info)
+{
+ int status;
+
+ status = do_proc_fsinfo(server->client, fhandle, info);
+ if (status && server->nfs_client->cl_rpcclient != server->client)
+ status = do_proc_fsinfo(server->nfs_client->cl_rpcclient, fhandle, info);
+ return status;
+}
+
static int
nfs3_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
struct nfs_pathconf *info)
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index b112857301f7..30befc39b3c6 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -93,21 +93,52 @@ static int nfs4_validate_fspath(const struct vfsmount *mnt_parent,
return 0;
}
-/*
- * Check if the string represents a "valid" IPv4 address
- */
-static inline int valid_ipaddr4(const char *buf)
+static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
+ char *page, char *page2,
+ const struct nfs4_fs_location *location)
{
- int rc, count, in[4];
-
- rc = sscanf(buf, "%d.%d.%d.%d", &in[0], &in[1], &in[2], &in[3]);
- if (rc != 4)
- return -EINVAL;
- for (count = 0; count < 4; count++) {
- if (in[count] > 255)
- return -EINVAL;
+ struct vfsmount *mnt = ERR_PTR(-ENOENT);
+ char *mnt_path;
+ int page2len;
+ unsigned int s;
+
+ mnt_path = nfs4_pathname_string(&location->rootpath, page2, PAGE_SIZE);
+ if (IS_ERR(mnt_path))
+ return mnt;
+ mountdata->mnt_path = mnt_path;
+ page2 += strlen(mnt_path) + 1;
+ page2len = PAGE_SIZE - strlen(mnt_path) - 1;
+
+ for (s = 0; s < location->nservers; s++) {
+ const struct nfs4_string *buf = &location->servers[s];
+ struct sockaddr_storage addr;
+
+ if (buf->len <= 0 || buf->len >= PAGE_SIZE)
+ continue;
+
+ mountdata->addr = (struct sockaddr *)&addr;
+
+ if (memchr(buf->data, IPV6_SCOPE_DELIMITER, buf->len))
+ continue;
+ nfs_parse_ip_address(buf->data, buf->len,
+ mountdata->addr, &mountdata->addrlen);
+ if (mountdata->addr->sa_family == AF_UNSPEC)
+ continue;
+ nfs_set_port(mountdata->addr, NFS_PORT);
+
+ strncpy(page2, buf->data, page2len);
+ page2[page2len] = '\0';
+ mountdata->hostname = page2;
+
+ snprintf(page, PAGE_SIZE, "%s:%s",
+ mountdata->hostname,
+ mountdata->mnt_path);
+
+ mnt = vfs_kern_mount(&nfs4_referral_fs_type, 0, page, mountdata);
+ if (!IS_ERR(mnt))
+ break;
}
- return 0;
+ return mnt;
}
/**
@@ -128,7 +159,6 @@ static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent,
.authflavor = NFS_SB(mnt_parent->mnt_sb)->client->cl_auth->au_flavor,
};
char *page = NULL, *page2 = NULL;
- unsigned int s;
int loc, error;
if (locations == NULL || locations->nlocations <= 0)
@@ -152,53 +182,16 @@ static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent,
goto out;
}
- loc = 0;
- while (loc < locations->nlocations && IS_ERR(mnt)) {
+ for (loc = 0; loc < locations->nlocations; loc++) {
const struct nfs4_fs_location *location = &locations->locations[loc];
- char *mnt_path;
if (location == NULL || location->nservers <= 0 ||
- location->rootpath.ncomponents == 0) {
- loc++;
+ location->rootpath.ncomponents == 0)
continue;
- }
- mnt_path = nfs4_pathname_string(&location->rootpath, page2, PAGE_SIZE);
- if (IS_ERR(mnt_path)) {
- loc++;
- continue;
- }
- mountdata.mnt_path = mnt_path;
-
- s = 0;
- while (s < location->nservers) {
- struct sockaddr_in addr = {
- .sin_family = AF_INET,
- .sin_port = htons(NFS_PORT),
- };
-
- if (location->servers[s].len <= 0 ||
- valid_ipaddr4(location->servers[s].data) < 0) {
- s++;
- continue;
- }
-
- mountdata.hostname = location->servers[s].data;
- addr.sin_addr.s_addr = in_aton(mountdata.hostname),
- mountdata.addr = (struct sockaddr *)&addr;
- mountdata.addrlen = sizeof(addr);
-
- snprintf(page, PAGE_SIZE, "%s:%s",
- mountdata.hostname,
- mountdata.mnt_path);
-
- mnt = vfs_kern_mount(&nfs4_referral_fs_type, 0, page, &mountdata);
- if (!IS_ERR(mnt)) {
- break;
- }
- s++;
- }
- loc++;
+ mnt = try_location(&mountdata, page, page2, location);
+ if (!IS_ERR(mnt))
+ break;
}
out:
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 4dbb84df1b68..193465210d7c 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -65,14 +65,20 @@ nfs_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
dprintk("%s: call getattr\n", __func__);
nfs_fattr_init(fattr);
- status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0);
+ status = rpc_call_sync(server->client, &msg, 0);
+ /* Retry with default authentication if different */
+ if (status && server->nfs_client->cl_rpcclient != server->client)
+ status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0);
dprintk("%s: reply getattr: %d\n", __func__, status);
if (status)
return status;
dprintk("%s: call statfs\n", __func__);
msg.rpc_proc = &nfs_procedures[NFSPROC_STATFS];
msg.rpc_resp = &fsinfo;
- status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0);
+ status = rpc_call_sync(server->client, &msg, 0);
+ /* Retry with default authentication if different */
+ if (status && server->nfs_client->cl_rpcclient != server->client)
+ status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0);
dprintk("%s: reply statfs: %d\n", __func__, status);
if (status)
return status;
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index ffb697416cb1..8b28b95c9e44 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -91,6 +91,7 @@ enum {
/* Mount options that take string arguments */
Opt_sec, Opt_proto, Opt_mountproto, Opt_mounthost,
Opt_addr, Opt_mountaddr, Opt_clientaddr,
+ Opt_lookupcache,
/* Special mount options */
Opt_userspace, Opt_deprecated, Opt_sloppy,
@@ -154,6 +155,8 @@ static const match_table_t nfs_mount_option_tokens = {
{ Opt_mounthost, "mounthost=%s" },
{ Opt_mountaddr, "mountaddr=%s" },
+ { Opt_lookupcache, "lookupcache=%s" },
+
{ Opt_err, NULL }
};
@@ -200,6 +203,22 @@ static const match_table_t nfs_secflavor_tokens = {
{ Opt_sec_err, NULL }
};
+enum {
+ Opt_lookupcache_all, Opt_lookupcache_positive,
+ Opt_lookupcache_none,
+
+ Opt_lookupcache_err
+};
+
+static match_table_t nfs_lookupcache_tokens = {
+ { Opt_lookupcache_all, "all" },
+ { Opt_lookupcache_positive, "pos" },
+ { Opt_lookupcache_positive, "positive" },
+ { Opt_lookupcache_none, "none" },
+
+ { Opt_lookupcache_err, NULL }
+};
+
static void nfs_umount_begin(struct super_block *);
static int nfs_statfs(struct dentry *, struct kstatfs *);
@@ -209,7 +228,6 @@ static int nfs_get_sb(struct file_system_type *, int, const char *, void *, stru
static int nfs_xdev_get_sb(struct file_system_type *fs_type,
int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
static void nfs_kill_super(struct super_block *);
-static void nfs_put_super(struct super_block *);
static int nfs_remount(struct super_block *sb, int *flags, char *raw_data);
static struct file_system_type nfs_fs_type = {
@@ -232,7 +250,6 @@ static const struct super_operations nfs_sops = {
.alloc_inode = nfs_alloc_inode,
.destroy_inode = nfs_destroy_inode,
.write_inode = nfs_write_inode,
- .put_super = nfs_put_super,
.statfs = nfs_statfs,
.clear_inode = nfs_clear_inode,
.umount_begin = nfs_umount_begin,
@@ -337,26 +354,20 @@ void __exit unregister_nfs_fs(void)
unregister_filesystem(&nfs_fs_type);
}
-void nfs_sb_active(struct nfs_server *server)
+void nfs_sb_active(struct super_block *sb)
{
- atomic_inc(&server->active);
-}
+ struct nfs_server *server = NFS_SB(sb);
-void nfs_sb_deactive(struct nfs_server *server)
-{
- if (atomic_dec_and_test(&server->active))
- wake_up(&server->active_wq);
+ if (atomic_inc_return(&server->active) == 1)
+ atomic_inc(&sb->s_active);
}
-static void nfs_put_super(struct super_block *sb)
+void nfs_sb_deactive(struct super_block *sb)
{
struct nfs_server *server = NFS_SB(sb);
- /*
- * Make sure there are no outstanding ops to this server.
- * If so, wait for them to finish before allowing the
- * unmount to continue.
- */
- wait_event(server->active_wq, atomic_read(&server->active) == 0);
+
+ if (atomic_dec_and_test(&server->active))
+ deactivate_super(sb);
}
/*
@@ -664,25 +675,6 @@ static void nfs_umount_begin(struct super_block *sb)
}
/*
- * Set the port number in an address. Be agnostic about the address family.
- */
-static void nfs_set_port(struct sockaddr *sap, unsigned short port)
-{
- switch (sap->sa_family) {
- case AF_INET: {
- struct sockaddr_in *ap = (struct sockaddr_in *)sap;
- ap->sin_port = htons(port);
- break;
- }
- case AF_INET6: {
- struct sockaddr_in6 *ap = (struct sockaddr_in6 *)sap;
- ap->sin6_port = htons(port);
- break;
- }
- }
-}
-
-/*
* Sanity-check a server address provided by the mount command.
*
* Address family must be initialized, and address must not be
@@ -724,20 +716,22 @@ static void nfs_parse_ipv4_address(char *string, size_t str_len,
*addr_len = 0;
}
-#define IPV6_SCOPE_DELIMITER '%'
-
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-static void nfs_parse_ipv6_scope_id(const char *string, const size_t str_len,
- const char *delim,
- struct sockaddr_in6 *sin6)
+static int nfs_parse_ipv6_scope_id(const char *string, const size_t str_len,
+ const char *delim,
+ struct sockaddr_in6 *sin6)
{
char *p;
size_t len;
- if (!(ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL))
- return ;
+ if ((string + str_len) == delim)
+ return 1;
+
if (*delim != IPV6_SCOPE_DELIMITER)
- return;
+ return 0;
+
+ if (!(ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL))
+ return 0;
len = (string + str_len) - delim - 1;
p = kstrndup(delim + 1, len, GFP_KERNEL);
@@ -750,14 +744,20 @@ static void nfs_parse_ipv6_scope_id(const char *string, const size_t str_len,
scope_id = dev->ifindex;
dev_put(dev);
} else {
- /* scope_id is set to zero on error */
- strict_strtoul(p, 10, &scope_id);
+ if (strict_strtoul(p, 10, &scope_id) == 0) {
+ kfree(p);
+ return 0;
+ }
}
kfree(p);
+
sin6->sin6_scope_id = scope_id;
dfprintk(MOUNT, "NFS: IPv6 scope ID = %lu\n", scope_id);
+ return 1;
}
+
+ return 0;
}
static void nfs_parse_ipv6_address(char *string, size_t str_len,
@@ -773,9 +773,11 @@ static void nfs_parse_ipv6_address(char *string, size_t str_len,
sin6->sin6_family = AF_INET6;
*addr_len = sizeof(*sin6);
- if (in6_pton(string, str_len, addr, IPV6_SCOPE_DELIMITER, &delim)) {
- nfs_parse_ipv6_scope_id(string, str_len, delim, sin6);
- return;
+ if (in6_pton(string, str_len, addr,
+ IPV6_SCOPE_DELIMITER, &delim) != 0) {
+ if (nfs_parse_ipv6_scope_id(string, str_len,
+ delim, sin6) != 0)
+ return;
}
}
@@ -798,7 +800,7 @@ static void nfs_parse_ipv6_address(char *string, size_t str_len,
* If there is a problem constructing the new sockaddr, set the address
* family to AF_UNSPEC.
*/
-static void nfs_parse_ip_address(char *string, size_t str_len,
+void nfs_parse_ip_address(char *string, size_t str_len,
struct sockaddr *sap, size_t *addr_len)
{
unsigned int i, colons;
@@ -1258,6 +1260,30 @@ static int nfs_parse_mount_options(char *raw,
&mnt->mount_server.addrlen);
kfree(string);
break;
+ case Opt_lookupcache:
+ string = match_strdup(args);
+ if (string == NULL)
+ goto out_nomem;
+ token = match_token(string,
+ nfs_lookupcache_tokens, args);
+ kfree(string);
+ switch (token) {
+ case Opt_lookupcache_all:
+ mnt->flags &= ~(NFS_MOUNT_LOOKUP_CACHE_NONEG|NFS_MOUNT_LOOKUP_CACHE_NONE);
+ break;
+ case Opt_lookupcache_positive:
+ mnt->flags &= ~NFS_MOUNT_LOOKUP_CACHE_NONE;
+ mnt->flags |= NFS_MOUNT_LOOKUP_CACHE_NONEG;
+ break;
+ case Opt_lookupcache_none:
+ mnt->flags |= NFS_MOUNT_LOOKUP_CACHE_NONEG|NFS_MOUNT_LOOKUP_CACHE_NONE;
+ break;
+ default:
+ errors++;
+ dfprintk(MOUNT, "NFS: invalid "
+ "lookupcache argument\n");
+ };
+ break;
/*
* Special options
@@ -1558,7 +1584,7 @@ static int nfs_validate_mount_data(void *options,
* Translate to nfs_parsed_mount_data, which nfs_fill_super
* can deal with.
*/
- args->flags = data->flags;
+ args->flags = data->flags & NFS_MOUNT_FLAGMASK;
args->rsize = data->rsize;
args->wsize = data->wsize;
args->timeo = data->timeo;
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index f089e5839d7d..ecc295347775 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -99,7 +99,7 @@ static void nfs_async_unlink_release(void *calldata)
nfs_dec_sillycount(data->dir);
nfs_free_unlinkdata(data);
- nfs_sb_deactive(NFS_SB(sb));
+ nfs_sb_deactive(sb);
}
static const struct rpc_call_ops nfs_unlink_ops = {
@@ -118,6 +118,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
.rpc_message = &msg,
.callback_ops = &nfs_unlink_ops,
.callback_data = data,
+ .workqueue = nfsiod_workqueue,
.flags = RPC_TASK_ASYNC,
};
struct rpc_task *task;
@@ -149,7 +150,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
nfs_dec_sillycount(dir);
return 0;
}
- nfs_sb_active(NFS_SERVER(dir));
+ nfs_sb_active(dir->i_sb);
data->args.fh = NFS_FH(dir);
nfs_fattr_init(&data->res.dir_attr);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 3229e217c773..9f9845859fc1 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1427,8 +1427,9 @@ static int nfs_write_mapping(struct address_space *mapping, int how)
.bdi = mapping->backing_dev_info,
.sync_mode = WB_SYNC_NONE,
.nr_to_write = LONG_MAX,
+ .range_start = 0,
+ .range_end = LLONG_MAX,
.for_writepages = 1,
- .range_cyclic = 1,
};
int ret;
diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c
index 15c6faeec77c..b2786a5f9afe 100644
--- a/fs/nfsd/lockd.c
+++ b/fs/nfsd/lockd.c
@@ -70,7 +70,6 @@ nlm_fclose(struct file *filp)
static struct nlmsvc_binding nfsd_nlm_ops = {
.fopen = nlm_fopen, /* open file for locking */
.fclose = nlm_fclose, /* close file */
- .get_grace_period = get_nfs4_grace_period,
};
void
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 4d617ea28cfc..9dbd2eb91281 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -63,7 +63,8 @@ nfsd3_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle *argp,
SVCFH_fmt(&argp->fh));
fh_copy(&resp->fh, &argp->fh);
- nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP);
+ nfserr = fh_verify(rqstp, &resp->fh, 0,
+ NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT);
if (nfserr)
RETURN_STATUS(nfserr);
@@ -530,7 +531,7 @@ nfsd3_proc_fsstat(struct svc_rqst * rqstp, struct nfsd_fhandle *argp,
dprintk("nfsd: FSSTAT(3) %s\n",
SVCFH_fmt(&argp->fh));
- nfserr = nfsd_statfs(rqstp, &argp->fh, &resp->stats);
+ nfserr = nfsd_statfs(rqstp, &argp->fh, &resp->stats, 0);
fh_put(&argp->fh);
RETURN_STATUS(nfserr);
}
@@ -558,7 +559,8 @@ nfsd3_proc_fsinfo(struct svc_rqst * rqstp, struct nfsd_fhandle *argp,
resp->f_maxfilesize = ~(u32) 0;
resp->f_properties = NFS3_FSF_DEFAULT;
- nfserr = fh_verify(rqstp, &argp->fh, 0, NFSD_MAY_NOP);
+ nfserr = fh_verify(rqstp, &argp->fh, 0,
+ NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT);
/* Check special features of the file system. May request
* different read/write sizes for file systems known to have
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 702fa577aa6e..094747a1227c 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -225,7 +225,8 @@ encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec)
RESERVE_SPACE(12+sizeof(cb_rec->cbr_stateid) + len);
WRITE32(OP_CB_RECALL);
- WRITEMEM(&cb_rec->cbr_stateid, sizeof(stateid_t));
+ WRITE32(cb_rec->cbr_stateid.si_generation);
+ WRITEMEM(&cb_rec->cbr_stateid.si_opaque, sizeof(stateid_opaque_t));
WRITE32(cb_rec->cbr_trunc);
WRITE32(len);
WRITEMEM(cb_rec->cbr_fhval, len);
@@ -379,6 +380,7 @@ static int do_probe_callback(void *data)
.addrsize = sizeof(addr),
.timeout = &timeparms,
.program = &cb_program,
+ .prognumber = cb->cb_prog,
.version = nfs_cb_version[1]->number,
.authflavor = RPC_AUTH_UNIX, /* XXX: need AUTH_GSS... */
.flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET),
@@ -396,9 +398,6 @@ static int do_probe_callback(void *data)
addr.sin_port = htons(cb->cb_port);
addr.sin_addr.s_addr = htonl(cb->cb_addr);
- /* Initialize rpc_stat */
- memset(args.program->stats, 0, sizeof(struct rpc_stat));
-
/* Create RPC client */
client = rpc_create(&args);
if (IS_ERR(client)) {
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index e5b51ffafc6c..669461e291ae 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -201,10 +201,10 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
/* Openowner is now set, so sequence id will get bumped. Now we need
* these checks before we do any creates: */
status = nfserr_grace;
- if (nfs4_in_grace() && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS)
+ if (locks_in_grace() && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS)
goto out;
status = nfserr_no_grace;
- if (!nfs4_in_grace() && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS)
+ if (!locks_in_grace() && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS)
goto out;
switch (open->op_claim_type) {
@@ -575,7 +575,7 @@ nfsd4_remove(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
{
__be32 status;
- if (nfs4_in_grace())
+ if (locks_in_grace())
return nfserr_grace;
status = nfsd_unlink(rqstp, &cstate->current_fh, 0,
remove->rm_name, remove->rm_namelen);
@@ -596,7 +596,7 @@ nfsd4_rename(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (!cstate->save_fh.fh_dentry)
return status;
- if (nfs4_in_grace() && !(cstate->save_fh.fh_export->ex_flags
+ if (locks_in_grace() && !(cstate->save_fh.fh_export->ex_flags
& NFSEXP_NOSUBTREECHECK))
return nfserr_grace;
status = nfsd_rename(rqstp, &cstate->save_fh, rename->rn_sname,
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 1578d7a2667e..0cc7ff5d5ab5 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -61,7 +61,6 @@
static time_t lease_time = 90; /* default lease time */
static time_t user_lease_time = 90;
static time_t boot_time;
-static int in_grace = 1;
static u32 current_ownerid = 1;
static u32 current_fileid = 1;
static u32 current_delegid = 1;
@@ -1640,7 +1639,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
case NFS4_OPEN_CLAIM_NULL:
/* Let's not give out any delegations till everyone's
* had the chance to reclaim theirs.... */
- if (nfs4_in_grace())
+ if (locks_in_grace())
goto out;
if (!atomic_read(&cb->cb_set) || !sop->so_confirmed)
goto out;
@@ -1816,12 +1815,15 @@ out:
return status;
}
+struct lock_manager nfsd4_manager = {
+};
+
static void
-end_grace(void)
+nfsd4_end_grace(void)
{
dprintk("NFSD: end of grace period\n");
nfsd4_recdir_purge_old();
- in_grace = 0;
+ locks_end_grace(&nfsd4_manager);
}
static time_t
@@ -1838,8 +1840,8 @@ nfs4_laundromat(void)
nfs4_lock_state();
dprintk("NFSD: laundromat service - starting\n");
- if (in_grace)
- end_grace();
+ if (locks_in_grace())
+ nfsd4_end_grace();
list_for_each_safe(pos, next, &client_lru) {
clp = list_entry(pos, struct nfs4_client, cl_lru);
if (time_after((unsigned long)clp->cl_time, (unsigned long)cutoff)) {
@@ -1974,7 +1976,7 @@ check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags)
return nfserr_bad_stateid;
else if (ONE_STATEID(stateid) && (flags & RD_STATE))
return nfs_ok;
- else if (nfs4_in_grace()) {
+ else if (locks_in_grace()) {
/* Answer in remaining cases depends on existance of
* conflicting state; so we must wait out the grace period. */
return nfserr_grace;
@@ -1993,7 +1995,7 @@ check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags)
static inline int
io_during_grace_disallowed(struct inode *inode, int flags)
{
- return nfs4_in_grace() && (flags & (RD_STATE | WR_STATE))
+ return locks_in_grace() && (flags & (RD_STATE | WR_STATE))
&& mandatory_lock(inode);
}
@@ -2693,10 +2695,10 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
filp = lock_stp->st_vfs_file;
status = nfserr_grace;
- if (nfs4_in_grace() && !lock->lk_reclaim)
+ if (locks_in_grace() && !lock->lk_reclaim)
goto out;
status = nfserr_no_grace;
- if (!nfs4_in_grace() && lock->lk_reclaim)
+ if (!locks_in_grace() && lock->lk_reclaim)
goto out;
locks_init_lock(&file_lock);
@@ -2779,7 +2781,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
int error;
__be32 status;
- if (nfs4_in_grace())
+ if (locks_in_grace())
return nfserr_grace;
if (check_lock_length(lockt->lt_offset, lockt->lt_length))
@@ -3192,9 +3194,9 @@ __nfs4_state_start(void)
unsigned long grace_time;
boot_time = get_seconds();
- grace_time = get_nfs_grace_period();
+ grace_time = get_nfs4_grace_period();
lease_time = user_lease_time;
- in_grace = 1;
+ locks_start_grace(&nfsd4_manager);
printk(KERN_INFO "NFSD: starting %ld-second grace period\n",
grace_time/HZ);
laundry_wq = create_singlethread_workqueue("nfsd4");
@@ -3213,12 +3215,6 @@ nfs4_state_start(void)
return;
}
-int
-nfs4_in_grace(void)
-{
- return in_grace;
-}
-
time_t
nfs4_lease_time(void)
{
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 14ba4d9b2859..afcdf4b76843 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -413,6 +413,18 @@ out_nfserr:
}
static __be32
+nfsd4_decode_stateid(struct nfsd4_compoundargs *argp, stateid_t *sid)
+{
+ DECODE_HEAD;
+
+ READ_BUF(sizeof(stateid_t));
+ READ32(sid->si_generation);
+ COPYMEM(&sid->si_opaque, sizeof(stateid_opaque_t));
+
+ DECODE_TAIL;
+}
+
+static __be32
nfsd4_decode_access(struct nfsd4_compoundargs *argp, struct nfsd4_access *access)
{
DECODE_HEAD;
@@ -429,10 +441,9 @@ nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close)
DECODE_HEAD;
close->cl_stateowner = NULL;
- READ_BUF(4 + sizeof(stateid_t));
+ READ_BUF(4);
READ32(close->cl_seqid);
- READ32(close->cl_stateid.si_generation);
- COPYMEM(&close->cl_stateid.si_opaque, sizeof(stateid_opaque_t));
+ return nfsd4_decode_stateid(argp, &close->cl_stateid);
DECODE_TAIL;
}
@@ -493,13 +504,7 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create
static inline __be32
nfsd4_decode_delegreturn(struct nfsd4_compoundargs *argp, struct nfsd4_delegreturn *dr)
{
- DECODE_HEAD;
-
- READ_BUF(sizeof(stateid_t));
- READ32(dr->dr_stateid.si_generation);
- COPYMEM(&dr->dr_stateid.si_opaque, sizeof(stateid_opaque_t));
-
- DECODE_TAIL;
+ return nfsd4_decode_stateid(argp, &dr->dr_stateid);
}
static inline __be32
@@ -542,20 +547,22 @@ nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock)
READ32(lock->lk_is_new);
if (lock->lk_is_new) {
- READ_BUF(36);
+ READ_BUF(4);
READ32(lock->lk_new_open_seqid);
- READ32(lock->lk_new_open_stateid.si_generation);
-
- COPYMEM(&lock->lk_new_open_stateid.si_opaque, sizeof(stateid_opaque_t));
+ status = nfsd4_decode_stateid(argp, &lock->lk_new_open_stateid);
+ if (status)
+ return status;
+ READ_BUF(8 + sizeof(clientid_t));
READ32(lock->lk_new_lock_seqid);
COPYMEM(&lock->lk_new_clientid, sizeof(clientid_t));
READ32(lock->lk_new_owner.len);
READ_BUF(lock->lk_new_owner.len);
READMEM(lock->lk_new_owner.data, lock->lk_new_owner.len);
} else {
- READ_BUF(20);
- READ32(lock->lk_old_lock_stateid.si_generation);
- COPYMEM(&lock->lk_old_lock_stateid.si_opaque, sizeof(stateid_opaque_t));
+ status = nfsd4_decode_stateid(argp, &lock->lk_old_lock_stateid);
+ if (status)
+ return status;
+ READ_BUF(4);
READ32(lock->lk_old_lock_seqid);
}
@@ -587,13 +594,15 @@ nfsd4_decode_locku(struct nfsd4_compoundargs *argp, struct nfsd4_locku *locku)
DECODE_HEAD;
locku->lu_stateowner = NULL;
- READ_BUF(24 + sizeof(stateid_t));
+ READ_BUF(8);
READ32(locku->lu_type);
if ((locku->lu_type < NFS4_READ_LT) || (locku->lu_type > NFS4_WRITEW_LT))
goto xdr_error;
READ32(locku->lu_seqid);
- READ32(locku->lu_stateid.si_generation);
- COPYMEM(&locku->lu_stateid.si_opaque, sizeof(stateid_opaque_t));
+ status = nfsd4_decode_stateid(argp, &locku->lu_stateid);
+ if (status)
+ return status;
+ READ_BUF(16);
READ64(locku->lu_offset);
READ64(locku->lu_length);
@@ -678,8 +687,10 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
READ32(open->op_delegate_type);
break;
case NFS4_OPEN_CLAIM_DELEGATE_CUR:
- READ_BUF(sizeof(stateid_t) + 4);
- COPYMEM(&open->op_delegate_stateid, sizeof(stateid_t));
+ status = nfsd4_decode_stateid(argp, &open->op_delegate_stateid);
+ if (status)
+ return status;
+ READ_BUF(4);
READ32(open->op_fname.len);
READ_BUF(open->op_fname.len);
SAVEMEM(open->op_fname.data, open->op_fname.len);
@@ -699,9 +710,10 @@ nfsd4_decode_open_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_open_con
DECODE_HEAD;
open_conf->oc_stateowner = NULL;
- READ_BUF(4 + sizeof(stateid_t));
- READ32(open_conf->oc_req_stateid.si_generation);
- COPYMEM(&open_conf->oc_req_stateid.si_opaque, sizeof(stateid_opaque_t));
+ status = nfsd4_decode_stateid(argp, &open_conf->oc_req_stateid);
+ if (status)
+ return status;
+ READ_BUF(4);
READ32(open_conf->oc_seqid);
DECODE_TAIL;
@@ -713,9 +725,10 @@ nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp, struct nfsd4_open_d
DECODE_HEAD;
open_down->od_stateowner = NULL;
- READ_BUF(12 + sizeof(stateid_t));
- READ32(open_down->od_stateid.si_generation);
- COPYMEM(&open_down->od_stateid.si_opaque, sizeof(stateid_opaque_t));
+ status = nfsd4_decode_stateid(argp, &open_down->od_stateid);
+ if (status)
+ return status;
+ READ_BUF(12);
READ32(open_down->od_seqid);
READ32(open_down->od_share_access);
READ32(open_down->od_share_deny);
@@ -743,9 +756,10 @@ nfsd4_decode_read(struct nfsd4_compoundargs *argp, struct nfsd4_read *read)
{
DECODE_HEAD;
- READ_BUF(sizeof(stateid_t) + 12);
- READ32(read->rd_stateid.si_generation);
- COPYMEM(&read->rd_stateid.si_opaque, sizeof(stateid_opaque_t));
+ status = nfsd4_decode_stateid(argp, &read->rd_stateid);
+ if (status)
+ return status;
+ READ_BUF(12);
READ64(read->rd_offset);
READ32(read->rd_length);
@@ -834,15 +848,13 @@ nfsd4_decode_secinfo(struct nfsd4_compoundargs *argp,
static __be32
nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *setattr)
{
- DECODE_HEAD;
-
- READ_BUF(sizeof(stateid_t));
- READ32(setattr->sa_stateid.si_generation);
- COPYMEM(&setattr->sa_stateid.si_opaque, sizeof(stateid_opaque_t));
- if ((status = nfsd4_decode_fattr(argp, setattr->sa_bmval, &setattr->sa_iattr, &setattr->sa_acl)))
- goto out;
+ __be32 status;
- DECODE_TAIL;
+ status = nfsd4_decode_stateid(argp, &setattr->sa_stateid);
+ if (status)
+ return status;
+ return nfsd4_decode_fattr(argp, setattr->sa_bmval,
+ &setattr->sa_iattr, &setattr->sa_acl);
}
static __be32
@@ -927,9 +939,10 @@ nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write)
int len;
DECODE_HEAD;
- READ_BUF(sizeof(stateid_opaque_t) + 20);
- READ32(write->wr_stateid.si_generation);
- COPYMEM(&write->wr_stateid.si_opaque, sizeof(stateid_opaque_t));
+ status = nfsd4_decode_stateid(argp, &write->wr_stateid);
+ if (status)
+ return status;
+ READ_BUF(16);
READ64(write->wr_offset);
READ32(write->wr_stable_how);
if (write->wr_stable_how > 2)
@@ -1183,7 +1196,6 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
* Header routine to setup seqid operation replay cache
*/
#define ENCODE_SEQID_OP_HEAD \
- __be32 *p; \
__be32 *save; \
\
save = resp->p;
@@ -1950,6 +1962,17 @@ fail:
return -EINVAL;
}
+static void
+nfsd4_encode_stateid(struct nfsd4_compoundres *resp, stateid_t *sid)
+{
+ ENCODE_HEAD;
+
+ RESERVE_SPACE(sizeof(stateid_t));
+ WRITE32(sid->si_generation);
+ WRITEMEM(&sid->si_opaque, sizeof(stateid_opaque_t));
+ ADJUST_ARGS();
+}
+
static __be32
nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_access *access)
{
@@ -1969,12 +1992,9 @@ nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_c
{
ENCODE_SEQID_OP_HEAD;
- if (!nfserr) {
- RESERVE_SPACE(sizeof(stateid_t));
- WRITE32(close->cl_stateid.si_generation);
- WRITEMEM(&close->cl_stateid.si_opaque, sizeof(stateid_opaque_t));
- ADJUST_ARGS();
- }
+ if (!nfserr)
+ nfsd4_encode_stateid(resp, &close->cl_stateid);
+
ENCODE_SEQID_OP_TAIL(close->cl_stateowner);
return nfserr;
}
@@ -2074,12 +2094,9 @@ nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lo
{
ENCODE_SEQID_OP_HEAD;
- if (!nfserr) {
- RESERVE_SPACE(4 + sizeof(stateid_t));
- WRITE32(lock->lk_resp_stateid.si_generation);
- WRITEMEM(&lock->lk_resp_stateid.si_opaque, sizeof(stateid_opaque_t));
- ADJUST_ARGS();
- } else if (nfserr == nfserr_denied)
+ if (!nfserr)
+ nfsd4_encode_stateid(resp, &lock->lk_resp_stateid);
+ else if (nfserr == nfserr_denied)
nfsd4_encode_lock_denied(resp, &lock->lk_denied);
ENCODE_SEQID_OP_TAIL(lock->lk_replay_owner);
@@ -2099,13 +2116,9 @@ nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_l
{
ENCODE_SEQID_OP_HEAD;
- if (!nfserr) {
- RESERVE_SPACE(sizeof(stateid_t));
- WRITE32(locku->lu_stateid.si_generation);
- WRITEMEM(&locku->lu_stateid.si_opaque, sizeof(stateid_opaque_t));
- ADJUST_ARGS();
- }
-
+ if (!nfserr)
+ nfsd4_encode_stateid(resp, &locku->lu_stateid);
+
ENCODE_SEQID_OP_TAIL(locku->lu_stateowner);
return nfserr;
}
@@ -2128,14 +2141,14 @@ nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_li
static __be32
nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open *open)
{
+ ENCODE_HEAD;
ENCODE_SEQID_OP_HEAD;
if (nfserr)
goto out;
- RESERVE_SPACE(36 + sizeof(stateid_t));
- WRITE32(open->op_stateid.si_generation);
- WRITEMEM(&open->op_stateid.si_opaque, sizeof(stateid_opaque_t));
+ nfsd4_encode_stateid(resp, &open->op_stateid);
+ RESERVE_SPACE(40);
WRITECINFO(open->op_cinfo);
WRITE32(open->op_rflags);
WRITE32(2);
@@ -2148,8 +2161,8 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op
case NFS4_OPEN_DELEGATE_NONE:
break;
case NFS4_OPEN_DELEGATE_READ:
- RESERVE_SPACE(20 + sizeof(stateid_t));
- WRITEMEM(&open->op_delegate_stateid, sizeof(stateid_t));
+ nfsd4_encode_stateid(resp, &open->op_delegate_stateid);
+ RESERVE_SPACE(20);
WRITE32(open->op_recall);
/*
@@ -2162,8 +2175,8 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op
ADJUST_ARGS();
break;
case NFS4_OPEN_DELEGATE_WRITE:
- RESERVE_SPACE(32 + sizeof(stateid_t));
- WRITEMEM(&open->op_delegate_stateid, sizeof(stateid_t));
+ nfsd4_encode_stateid(resp, &open->op_delegate_stateid);
+ RESERVE_SPACE(32);
WRITE32(0);
/*
@@ -2195,13 +2208,9 @@ static __be32
nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_confirm *oc)
{
ENCODE_SEQID_OP_HEAD;
-
- if (!nfserr) {
- RESERVE_SPACE(sizeof(stateid_t));
- WRITE32(oc->oc_resp_stateid.si_generation);
- WRITEMEM(&oc->oc_resp_stateid.si_opaque, sizeof(stateid_opaque_t));
- ADJUST_ARGS();
- }
+
+ if (!nfserr)
+ nfsd4_encode_stateid(resp, &oc->oc_resp_stateid);
ENCODE_SEQID_OP_TAIL(oc->oc_stateowner);
return nfserr;
@@ -2211,13 +2220,9 @@ static __be32
nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_downgrade *od)
{
ENCODE_SEQID_OP_HEAD;
-
- if (!nfserr) {
- RESERVE_SPACE(sizeof(stateid_t));
- WRITE32(od->od_stateid.si_generation);
- WRITEMEM(&od->od_stateid.si_opaque, sizeof(stateid_opaque_t));
- ADJUST_ARGS();
- }
+
+ if (!nfserr)
+ nfsd4_encode_stateid(resp, &od->od_stateid);
ENCODE_SEQID_OP_TAIL(od->od_stateowner);
return nfserr;
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index c53e65f8f3a2..97543df58242 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -614,10 +614,9 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size)
return -EINVAL;
err = nfsd_create_serv();
if (!err) {
- int proto = 0;
- err = svc_addsock(nfsd_serv, fd, buf, &proto);
+ err = svc_addsock(nfsd_serv, fd, buf);
if (err >= 0) {
- err = lockd_up(proto);
+ err = lockd_up();
if (err < 0)
svc_sock_names(buf+strlen(buf)+1, nfsd_serv, buf);
}
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index ea37c96f0445..cd25d91895a1 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -302,17 +302,27 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access)
if (error)
goto out;
- if (!(access & NFSD_MAY_LOCK)) {
- /*
- * pseudoflavor restrictions are not enforced on NLM,
- * which clients virtually always use auth_sys for,
- * even while using RPCSEC_GSS for NFS.
- */
- error = check_nfsd_access(exp, rqstp);
- if (error)
- goto out;
- }
+ /*
+ * pseudoflavor restrictions are not enforced on NLM,
+ * which clients virtually always use auth_sys for,
+ * even while using RPCSEC_GSS for NFS.
+ */
+ if (access & NFSD_MAY_LOCK)
+ goto skip_pseudoflavor_check;
+ /*
+ * Clients may expect to be able to use auth_sys during mount,
+ * even if they use gss for everything else; see section 2.3.2
+ * of rfc 2623.
+ */
+ if (access & NFSD_MAY_BYPASS_GSS_ON_ROOT
+ && exp->ex_path.dentry == dentry)
+ goto skip_pseudoflavor_check;
+
+ error = check_nfsd_access(exp, rqstp);
+ if (error)
+ goto out;
+skip_pseudoflavor_check:
/* Finally, check access permissions. */
error = nfsd_permission(rqstp, exp, dentry, access);
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 0766f95d236a..5cffeca7acef 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -65,7 +65,8 @@ nfsd_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle *argp,
dprintk("nfsd: GETATTR %s\n", SVCFH_fmt(&argp->fh));
fh_copy(&resp->fh, &argp->fh);
- nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP);
+ nfserr = fh_verify(rqstp, &resp->fh, 0,
+ NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT);
return nfsd_return_attrs(nfserr, resp);
}
@@ -521,7 +522,8 @@ nfsd_proc_statfs(struct svc_rqst * rqstp, struct nfsd_fhandle *argp,
dprintk("nfsd: STATFS %s\n", SVCFH_fmt(&argp->fh));
- nfserr = nfsd_statfs(rqstp, &argp->fh, &resp->stats);
+ nfserr = nfsd_statfs(rqstp, &argp->fh, &resp->stats,
+ NFSD_MAY_BYPASS_GSS_ON_ROOT);
fh_put(&argp->fh);
return nfserr;
}
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 80292ff5e924..59eeb46f82c5 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -229,6 +229,7 @@ int nfsd_create_serv(void)
atomic_set(&nfsd_busy, 0);
nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize,
+ AF_INET,
nfsd_last_thread, nfsd, THIS_MODULE);
if (nfsd_serv == NULL)
err = -ENOMEM;
@@ -243,25 +244,20 @@ static int nfsd_init_socks(int port)
if (!list_empty(&nfsd_serv->sv_permsocks))
return 0;
- error = lockd_up(IPPROTO_UDP);
- if (error >= 0) {
- error = svc_create_xprt(nfsd_serv, "udp", port,
+ error = svc_create_xprt(nfsd_serv, "udp", port,
SVC_SOCK_DEFAULTS);
- if (error < 0)
- lockd_down();
- }
if (error < 0)
return error;
- error = lockd_up(IPPROTO_TCP);
- if (error >= 0) {
- error = svc_create_xprt(nfsd_serv, "tcp", port,
+ error = svc_create_xprt(nfsd_serv, "tcp", port,
SVC_SOCK_DEFAULTS);
- if (error < 0)
- lockd_down();
- }
if (error < 0)
return error;
+
+ error = lockd_up();
+ if (error < 0)
+ return error;
+
return 0;
}
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 18060bed5267..aa1d0d6489a1 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -83,7 +83,6 @@ struct raparm_hbucket {
spinlock_t pb_lock;
} ____cacheline_aligned_in_smp;
-static struct raparms * raparml;
#define RAPARM_HASH_BITS 4
#define RAPARM_HASH_SIZE (1<<RAPARM_HASH_BITS)
#define RAPARM_HASH_MASK (RAPARM_HASH_SIZE-1)
@@ -1866,9 +1865,9 @@ out:
* N.B. After this call fhp needs an fh_put
*/
__be32
-nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat)
+nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat, int access)
{
- __be32 err = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP);
+ __be32 err = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP | access);
if (!err && vfs_statfs(fhp->fh_dentry,stat))
err = nfserr_io;
return err;
@@ -1966,11 +1965,20 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
void
nfsd_racache_shutdown(void)
{
- if (!raparml)
- return;
+ struct raparms *raparm, *last_raparm;
+ unsigned int i;
+
dprintk("nfsd: freeing readahead buffers.\n");
- kfree(raparml);
- raparml = NULL;
+
+ for (i = 0; i < RAPARM_HASH_SIZE; i++) {
+ raparm = raparm_hash[i].pb_head;
+ while(raparm) {
+ last_raparm = raparm;
+ raparm = raparm->p_next;
+ kfree(last_raparm);
+ }
+ raparm_hash[i].pb_head = NULL;
+ }
}
/*
* Initialize readahead param cache
@@ -1981,35 +1989,38 @@ nfsd_racache_init(int cache_size)
int i;
int j = 0;
int nperbucket;
+ struct raparms **raparm = NULL;
- if (raparml)
+ if (raparm_hash[0].pb_head)
return 0;
- if (cache_size < 2*RAPARM_HASH_SIZE)
- cache_size = 2*RAPARM_HASH_SIZE;
- raparml = kcalloc(cache_size, sizeof(struct raparms), GFP_KERNEL);
-
- if (!raparml) {
- printk(KERN_WARNING
- "nfsd: Could not allocate memory read-ahead cache.\n");
- return -ENOMEM;
- }
+ nperbucket = DIV_ROUND_UP(cache_size, RAPARM_HASH_SIZE);
+ if (nperbucket < 2)
+ nperbucket = 2;
+ cache_size = nperbucket * RAPARM_HASH_SIZE;
dprintk("nfsd: allocating %d readahead buffers.\n", cache_size);
- for (i = 0 ; i < RAPARM_HASH_SIZE ; i++) {
- raparm_hash[i].pb_head = NULL;
+
+ for (i = 0; i < RAPARM_HASH_SIZE; i++) {
spin_lock_init(&raparm_hash[i].pb_lock);
- }
- nperbucket = DIV_ROUND_UP(cache_size, RAPARM_HASH_SIZE);
- for (i = 0; i < cache_size - 1; i++) {
- if (i % nperbucket == 0)
- raparm_hash[j++].pb_head = raparml + i;
- if (i % nperbucket < nperbucket-1)
- raparml[i].p_next = raparml + i + 1;
+
+ raparm = &raparm_hash[i].pb_head;
+ for (j = 0; j < nperbucket; j++) {
+ *raparm = kzalloc(sizeof(struct raparms), GFP_KERNEL);
+ if (!*raparm)
+ goto out_nomem;
+ raparm = &(*raparm)->p_next;
+ }
+ *raparm = NULL;
}
nfsdstats.ra_size = cache_size;
return 0;
+
+out_nomem:
+ dprintk("nfsd: kmalloc failed, freeing readahead buffers\n");
+ nfsd_racache_shutdown();
+ return -ENOMEM;
}
#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
diff --git a/fs/nls/nls_base.c b/fs/nls/nls_base.c
index 64965e1c21c4..9b0efdad8910 100644
--- a/fs/nls/nls_base.c
+++ b/fs/nls/nls_base.c
@@ -13,9 +13,7 @@
#include <linux/nls.h>
#include <linux/kernel.h>
#include <linux/errno.h>
-#ifdef CONFIG_KMOD
#include <linux/kmod.h>
-#endif
#include <linux/spinlock.h>
static struct nls_table default_table;
@@ -215,24 +213,7 @@ static struct nls_table *find_nls(char *charset)
struct nls_table *load_nls(char *charset)
{
- struct nls_table *nls;
-#ifdef CONFIG_KMOD
- int ret;
-#endif
-
- nls = find_nls(charset);
- if (nls)
- return nls;
-
-#ifdef CONFIG_KMOD
- ret = request_module("nls_%s", charset);
- if (ret != 0) {
- printk("Unable to load NLS charset %s\n", charset);
- return NULL;
- }
- nls = find_nls(charset);
-#endif
- return nls;
+ return try_then_request_module(find_nls(charset), "nls_%s", charset);
}
void unload_nls(struct nls_table *nls)
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index f6956de56fdb..589dcdfdfe3c 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -34,7 +34,8 @@ ocfs2-objs := \
symlink.o \
sysfile.o \
uptodate.o \
- ver.o
+ ver.o \
+ xattr.o
ocfs2_stackglue-objs := stackglue.o
ocfs2_stack_o2cb-objs := stack_o2cb.o
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 29ff57ec5d1f..0cc2deb9394c 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -49,6 +49,340 @@
#include "buffer_head_io.h"
+
+/*
+ * Operations for a specific extent tree type.
+ *
+ * To implement an on-disk btree (extent tree) type in ocfs2, add
+ * an ocfs2_extent_tree_operations structure and the matching
+ * ocfs2_init_<thingy>_extent_tree() function. That's pretty much it
+ * for the allocation portion of the extent tree.
+ */
+struct ocfs2_extent_tree_operations {
+ /*
+ * last_eb_blk is the block number of the right most leaf extent
+ * block. Most on-disk structures containing an extent tree store
+ * this value for fast access. The ->eo_set_last_eb_blk() and
+ * ->eo_get_last_eb_blk() operations access this value. They are
+ * both required.
+ */
+ void (*eo_set_last_eb_blk)(struct ocfs2_extent_tree *et,
+ u64 blkno);
+ u64 (*eo_get_last_eb_blk)(struct ocfs2_extent_tree *et);
+
+ /*
+ * The on-disk structure usually keeps track of how many total
+ * clusters are stored in this extent tree. This function updates
+ * that value. new_clusters is the delta, and must be
+ * added to the total. Required.
+ */
+ void (*eo_update_clusters)(struct inode *inode,
+ struct ocfs2_extent_tree *et,
+ u32 new_clusters);
+
+ /*
+ * If ->eo_insert_check() exists, it is called before rec is
+ * inserted into the extent tree. It is optional.
+ */
+ int (*eo_insert_check)(struct inode *inode,
+ struct ocfs2_extent_tree *et,
+ struct ocfs2_extent_rec *rec);
+ int (*eo_sanity_check)(struct inode *inode, struct ocfs2_extent_tree *et);
+
+ /*
+ * --------------------------------------------------------------
+ * The remaining are internal to ocfs2_extent_tree and don't have
+ * accessor functions
+ */
+
+ /*
+ * ->eo_fill_root_el() takes et->et_object and sets et->et_root_el.
+ * It is required.
+ */
+ void (*eo_fill_root_el)(struct ocfs2_extent_tree *et);
+
+ /*
+ * ->eo_fill_max_leaf_clusters sets et->et_max_leaf_clusters if
+ * it exists. If it does not, et->et_max_leaf_clusters is set
+ * to 0 (unlimited). Optional.
+ */
+ void (*eo_fill_max_leaf_clusters)(struct inode *inode,
+ struct ocfs2_extent_tree *et);
+};
+
+
+/*
+ * Pre-declare ocfs2_dinode_et_ops so we can use it as a sanity check
+ * in the methods.
+ */
+static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et);
+static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
+ u64 blkno);
+static void ocfs2_dinode_update_clusters(struct inode *inode,
+ struct ocfs2_extent_tree *et,
+ u32 clusters);
+static int ocfs2_dinode_insert_check(struct inode *inode,
+ struct ocfs2_extent_tree *et,
+ struct ocfs2_extent_rec *rec);
+static int ocfs2_dinode_sanity_check(struct inode *inode,
+ struct ocfs2_extent_tree *et);
+static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et);
+static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
+ .eo_set_last_eb_blk = ocfs2_dinode_set_last_eb_blk,
+ .eo_get_last_eb_blk = ocfs2_dinode_get_last_eb_blk,
+ .eo_update_clusters = ocfs2_dinode_update_clusters,
+ .eo_insert_check = ocfs2_dinode_insert_check,
+ .eo_sanity_check = ocfs2_dinode_sanity_check,
+ .eo_fill_root_el = ocfs2_dinode_fill_root_el,
+};
+
+static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
+ u64 blkno)
+{
+ struct ocfs2_dinode *di = et->et_object;
+
+ BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
+ di->i_last_eb_blk = cpu_to_le64(blkno);
+}
+
+static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et)
+{
+ struct ocfs2_dinode *di = et->et_object;
+
+ BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
+ return le64_to_cpu(di->i_last_eb_blk);
+}
+
+static void ocfs2_dinode_update_clusters(struct inode *inode,
+ struct ocfs2_extent_tree *et,
+ u32 clusters)
+{
+ struct ocfs2_dinode *di = et->et_object;
+
+ le32_add_cpu(&di->i_clusters, clusters);
+ spin_lock(&OCFS2_I(inode)->ip_lock);
+ OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters);
+ spin_unlock(&OCFS2_I(inode)->ip_lock);
+}
+
+static int ocfs2_dinode_insert_check(struct inode *inode,
+ struct ocfs2_extent_tree *et,
+ struct ocfs2_extent_rec *rec)
+{
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL);
+ mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
+ (OCFS2_I(inode)->ip_clusters != rec->e_cpos),
+ "Device %s, asking for sparse allocation: inode %llu, "
+ "cpos %u, clusters %u\n",
+ osb->dev_str,
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ rec->e_cpos,
+ OCFS2_I(inode)->ip_clusters);
+
+ return 0;
+}
+
+static int ocfs2_dinode_sanity_check(struct inode *inode,
+ struct ocfs2_extent_tree *et)
+{
+ int ret = 0;
+ struct ocfs2_dinode *di;
+
+ BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
+
+ di = et->et_object;
+ if (!OCFS2_IS_VALID_DINODE(di)) {
+ ret = -EIO;
+ ocfs2_error(inode->i_sb,
+ "Inode %llu has invalid path root",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno);
+ }
+
+ return ret;
+}
+
+static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et)
+{
+ struct ocfs2_dinode *di = et->et_object;
+
+ et->et_root_el = &di->id2.i_list;
+}
+
+
+static void ocfs2_xattr_value_fill_root_el(struct ocfs2_extent_tree *et)
+{
+ struct ocfs2_xattr_value_root *xv = et->et_object;
+
+ et->et_root_el = &xv->xr_list;
+}
+
+static void ocfs2_xattr_value_set_last_eb_blk(struct ocfs2_extent_tree *et,
+ u64 blkno)
+{
+ struct ocfs2_xattr_value_root *xv =
+ (struct ocfs2_xattr_value_root *)et->et_object;
+
+ xv->xr_last_eb_blk = cpu_to_le64(blkno);
+}
+
+static u64 ocfs2_xattr_value_get_last_eb_blk(struct ocfs2_extent_tree *et)
+{
+ struct ocfs2_xattr_value_root *xv =
+ (struct ocfs2_xattr_value_root *) et->et_object;
+
+ return le64_to_cpu(xv->xr_last_eb_blk);
+}
+
+static void ocfs2_xattr_value_update_clusters(struct inode *inode,
+ struct ocfs2_extent_tree *et,
+ u32 clusters)
+{
+ struct ocfs2_xattr_value_root *xv =
+ (struct ocfs2_xattr_value_root *)et->et_object;
+
+ le32_add_cpu(&xv->xr_clusters, clusters);
+}
+
+static struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = {
+ .eo_set_last_eb_blk = ocfs2_xattr_value_set_last_eb_blk,
+ .eo_get_last_eb_blk = ocfs2_xattr_value_get_last_eb_blk,
+ .eo_update_clusters = ocfs2_xattr_value_update_clusters,
+ .eo_fill_root_el = ocfs2_xattr_value_fill_root_el,
+};
+
+static void ocfs2_xattr_tree_fill_root_el(struct ocfs2_extent_tree *et)
+{
+ struct ocfs2_xattr_block *xb = et->et_object;
+
+ et->et_root_el = &xb->xb_attrs.xb_root.xt_list;
+}
+
+static void ocfs2_xattr_tree_fill_max_leaf_clusters(struct inode *inode,
+ struct ocfs2_extent_tree *et)
+{
+ et->et_max_leaf_clusters =
+ ocfs2_clusters_for_bytes(inode->i_sb,
+ OCFS2_MAX_XATTR_TREE_LEAF_SIZE);
+}
+
+static void ocfs2_xattr_tree_set_last_eb_blk(struct ocfs2_extent_tree *et,
+ u64 blkno)
+{
+ struct ocfs2_xattr_block *xb = et->et_object;
+ struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
+
+ xt->xt_last_eb_blk = cpu_to_le64(blkno);
+}
+
+static u64 ocfs2_xattr_tree_get_last_eb_blk(struct ocfs2_extent_tree *et)
+{
+ struct ocfs2_xattr_block *xb = et->et_object;
+ struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
+
+ return le64_to_cpu(xt->xt_last_eb_blk);
+}
+
+static void ocfs2_xattr_tree_update_clusters(struct inode *inode,
+ struct ocfs2_extent_tree *et,
+ u32 clusters)
+{
+ struct ocfs2_xattr_block *xb = et->et_object;
+
+ le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, clusters);
+}
+
+static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
+ .eo_set_last_eb_blk = ocfs2_xattr_tree_set_last_eb_blk,
+ .eo_get_last_eb_blk = ocfs2_xattr_tree_get_last_eb_blk,
+ .eo_update_clusters = ocfs2_xattr_tree_update_clusters,
+ .eo_fill_root_el = ocfs2_xattr_tree_fill_root_el,
+ .eo_fill_max_leaf_clusters = ocfs2_xattr_tree_fill_max_leaf_clusters,
+};
+
+static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
+ struct inode *inode,
+ struct buffer_head *bh,
+ void *obj,
+ struct ocfs2_extent_tree_operations *ops)
+{
+ et->et_ops = ops;
+ et->et_root_bh = bh;
+ if (!obj)
+ obj = (void *)bh->b_data;
+ et->et_object = obj;
+
+ et->et_ops->eo_fill_root_el(et);
+ if (!et->et_ops->eo_fill_max_leaf_clusters)
+ et->et_max_leaf_clusters = 0;
+ else
+ et->et_ops->eo_fill_max_leaf_clusters(inode, et);
+}
+
+void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et,
+ struct inode *inode,
+ struct buffer_head *bh)
+{
+ __ocfs2_init_extent_tree(et, inode, bh, NULL, &ocfs2_dinode_et_ops);
+}
+
+void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
+ struct inode *inode,
+ struct buffer_head *bh)
+{
+ __ocfs2_init_extent_tree(et, inode, bh, NULL,
+ &ocfs2_xattr_tree_et_ops);
+}
+
+void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
+ struct inode *inode,
+ struct buffer_head *bh,
+ struct ocfs2_xattr_value_root *xv)
+{
+ __ocfs2_init_extent_tree(et, inode, bh, xv,
+ &ocfs2_xattr_value_et_ops);
+}
+
+static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et,
+ u64 new_last_eb_blk)
+{
+ et->et_ops->eo_set_last_eb_blk(et, new_last_eb_blk);
+}
+
+static inline u64 ocfs2_et_get_last_eb_blk(struct ocfs2_extent_tree *et)
+{
+ return et->et_ops->eo_get_last_eb_blk(et);
+}
+
+static inline void ocfs2_et_update_clusters(struct inode *inode,
+ struct ocfs2_extent_tree *et,
+ u32 clusters)
+{
+ et->et_ops->eo_update_clusters(inode, et, clusters);
+}
+
+static inline int ocfs2_et_insert_check(struct inode *inode,
+ struct ocfs2_extent_tree *et,
+ struct ocfs2_extent_rec *rec)
+{
+ int ret = 0;
+
+ if (et->et_ops->eo_insert_check)
+ ret = et->et_ops->eo_insert_check(inode, et, rec);
+ return ret;
+}
+
+static inline int ocfs2_et_sanity_check(struct inode *inode,
+ struct ocfs2_extent_tree *et)
+{
+ int ret = 0;
+
+ if (et->et_ops->eo_sanity_check)
+ ret = et->et_ops->eo_sanity_check(inode, et);
+ return ret;
+}
+
static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
struct ocfs2_extent_block *eb);
@@ -205,17 +539,6 @@ static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
}
/*
- * Allocate and initialize a new path based on a disk inode tree.
- */
-static struct ocfs2_path *ocfs2_new_inode_path(struct buffer_head *di_bh)
-{
- struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
- struct ocfs2_extent_list *el = &di->id2.i_list;
-
- return ocfs2_new_path(di_bh, el);
-}
-
-/*
* Convenience function to journal all components in a path.
*/
static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle,
@@ -368,39 +691,35 @@ struct ocfs2_merge_ctxt {
*/
int ocfs2_num_free_extents(struct ocfs2_super *osb,
struct inode *inode,
- struct ocfs2_dinode *fe)
+ struct ocfs2_extent_tree *et)
{
int retval;
- struct ocfs2_extent_list *el;
+ struct ocfs2_extent_list *el = NULL;
struct ocfs2_extent_block *eb;
struct buffer_head *eb_bh = NULL;
+ u64 last_eb_blk = 0;
mlog_entry_void();
- if (!OCFS2_IS_VALID_DINODE(fe)) {
- OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
- retval = -EIO;
- goto bail;
- }
+ el = et->et_root_el;
+ last_eb_blk = ocfs2_et_get_last_eb_blk(et);
- if (fe->i_last_eb_blk) {
- retval = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
- &eb_bh, OCFS2_BH_CACHED, inode);
+ if (last_eb_blk) {
+ retval = ocfs2_read_block(inode, last_eb_blk,
+ &eb_bh);
if (retval < 0) {
mlog_errno(retval);
goto bail;
}
eb = (struct ocfs2_extent_block *) eb_bh->b_data;
el = &eb->h_list;
- } else
- el = &fe->id2.i_list;
+ }
BUG_ON(el->l_tree_depth != 0);
retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec);
bail:
- if (eb_bh)
- brelse(eb_bh);
+ brelse(eb_bh);
mlog_exit(retval);
return retval;
@@ -486,8 +805,7 @@ static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
bail:
if (status < 0) {
for(i = 0; i < wanted; i++) {
- if (bhs[i])
- brelse(bhs[i]);
+ brelse(bhs[i]);
bhs[i] = NULL;
}
}
@@ -531,7 +849,7 @@ static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list *el)
static int ocfs2_add_branch(struct ocfs2_super *osb,
handle_t *handle,
struct inode *inode,
- struct buffer_head *fe_bh,
+ struct ocfs2_extent_tree *et,
struct buffer_head *eb_bh,
struct buffer_head **last_eb_bh,
struct ocfs2_alloc_context *meta_ac)
@@ -540,7 +858,6 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
u64 next_blkno, new_last_eb_blk;
struct buffer_head *bh;
struct buffer_head **new_eb_bhs = NULL;
- struct ocfs2_dinode *fe;
struct ocfs2_extent_block *eb;
struct ocfs2_extent_list *eb_el;
struct ocfs2_extent_list *el;
@@ -550,13 +867,11 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
BUG_ON(!last_eb_bh || !*last_eb_bh);
- fe = (struct ocfs2_dinode *) fe_bh->b_data;
-
if (eb_bh) {
eb = (struct ocfs2_extent_block *) eb_bh->b_data;
el = &eb->h_list;
} else
- el = &fe->id2.i_list;
+ el = et->et_root_el;
/* we never add a branch to a leaf. */
BUG_ON(!el->l_tree_depth);
@@ -646,7 +961,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
mlog_errno(status);
goto bail;
}
- status = ocfs2_journal_access(handle, inode, fe_bh,
+ status = ocfs2_journal_access(handle, inode, et->et_root_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
@@ -662,7 +977,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
}
/* Link the new branch into the rest of the tree (el will
- * either be on the fe, or the extent block passed in. */
+ * either be on the root_bh, or the extent block passed in. */
i = le16_to_cpu(el->l_next_free_rec);
el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
@@ -671,7 +986,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
/* fe needs a new last extent block pointer, as does the
* next_leaf on the previously last-extent-block. */
- fe->i_last_eb_blk = cpu_to_le64(new_last_eb_blk);
+ ocfs2_et_set_last_eb_blk(et, new_last_eb_blk);
eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
@@ -679,7 +994,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
status = ocfs2_journal_dirty(handle, *last_eb_bh);
if (status < 0)
mlog_errno(status);
- status = ocfs2_journal_dirty(handle, fe_bh);
+ status = ocfs2_journal_dirty(handle, et->et_root_bh);
if (status < 0)
mlog_errno(status);
if (eb_bh) {
@@ -700,8 +1015,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
bail:
if (new_eb_bhs) {
for (i = 0; i < new_blocks; i++)
- if (new_eb_bhs[i])
- brelse(new_eb_bhs[i]);
+ brelse(new_eb_bhs[i]);
kfree(new_eb_bhs);
}
@@ -717,16 +1031,15 @@ bail:
static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
handle_t *handle,
struct inode *inode,
- struct buffer_head *fe_bh,
+ struct ocfs2_extent_tree *et,
struct ocfs2_alloc_context *meta_ac,
struct buffer_head **ret_new_eb_bh)
{
int status, i;
u32 new_clusters;
struct buffer_head *new_eb_bh = NULL;
- struct ocfs2_dinode *fe;
struct ocfs2_extent_block *eb;
- struct ocfs2_extent_list *fe_el;
+ struct ocfs2_extent_list *root_el;
struct ocfs2_extent_list *eb_el;
mlog_entry_void();
@@ -746,8 +1059,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
}
eb_el = &eb->h_list;
- fe = (struct ocfs2_dinode *) fe_bh->b_data;
- fe_el = &fe->id2.i_list;
+ root_el = et->et_root_el;
status = ocfs2_journal_access(handle, inode, new_eb_bh,
OCFS2_JOURNAL_ACCESS_CREATE);
@@ -756,11 +1068,11 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
goto bail;
}
- /* copy the fe data into the new extent block */
- eb_el->l_tree_depth = fe_el->l_tree_depth;
- eb_el->l_next_free_rec = fe_el->l_next_free_rec;
- for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++)
- eb_el->l_recs[i] = fe_el->l_recs[i];
+ /* copy the root extent list data into the new extent block */
+ eb_el->l_tree_depth = root_el->l_tree_depth;
+ eb_el->l_next_free_rec = root_el->l_next_free_rec;
+ for (i = 0; i < le16_to_cpu(root_el->l_next_free_rec); i++)
+ eb_el->l_recs[i] = root_el->l_recs[i];
status = ocfs2_journal_dirty(handle, new_eb_bh);
if (status < 0) {
@@ -768,7 +1080,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
goto bail;
}
- status = ocfs2_journal_access(handle, inode, fe_bh,
+ status = ocfs2_journal_access(handle, inode, et->et_root_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
@@ -777,21 +1089,21 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
new_clusters = ocfs2_sum_rightmost_rec(eb_el);
- /* update fe now */
- le16_add_cpu(&fe_el->l_tree_depth, 1);
- fe_el->l_recs[0].e_cpos = 0;
- fe_el->l_recs[0].e_blkno = eb->h_blkno;
- fe_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters);
- for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++)
- memset(&fe_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
- fe_el->l_next_free_rec = cpu_to_le16(1);
+ /* update root_bh now */
+ le16_add_cpu(&root_el->l_tree_depth, 1);
+ root_el->l_recs[0].e_cpos = 0;
+ root_el->l_recs[0].e_blkno = eb->h_blkno;
+ root_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters);
+ for (i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
+ memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
+ root_el->l_next_free_rec = cpu_to_le16(1);
/* If this is our 1st tree depth shift, then last_eb_blk
* becomes the allocated extent block */
- if (fe_el->l_tree_depth == cpu_to_le16(1))
- fe->i_last_eb_blk = eb->h_blkno;
+ if (root_el->l_tree_depth == cpu_to_le16(1))
+ ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
- status = ocfs2_journal_dirty(handle, fe_bh);
+ status = ocfs2_journal_dirty(handle, et->et_root_bh);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -801,8 +1113,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
new_eb_bh = NULL;
status = 0;
bail:
- if (new_eb_bh)
- brelse(new_eb_bh);
+ brelse(new_eb_bh);
mlog_exit(status);
return status;
@@ -817,22 +1128,21 @@ bail:
* 1) a lowest extent block is found, then we pass it back in
* *lowest_eb_bh and return '0'
*
- * 2) the search fails to find anything, but the dinode has room. We
+ * 2) the search fails to find anything, but the root_el has room. We
* pass NULL back in *lowest_eb_bh, but still return '0'
*
- * 3) the search fails to find anything AND the dinode is full, in
+ * 3) the search fails to find anything AND the root_el is full, in
* which case we return > 0
*
* return status < 0 indicates an error.
*/
static int ocfs2_find_branch_target(struct ocfs2_super *osb,
struct inode *inode,
- struct buffer_head *fe_bh,
+ struct ocfs2_extent_tree *et,
struct buffer_head **target_bh)
{
int status = 0, i;
u64 blkno;
- struct ocfs2_dinode *fe;
struct ocfs2_extent_block *eb;
struct ocfs2_extent_list *el;
struct buffer_head *bh = NULL;
@@ -842,8 +1152,7 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb,
*target_bh = NULL;
- fe = (struct ocfs2_dinode *) fe_bh->b_data;
- el = &fe->id2.i_list;
+ el = et->et_root_el;
while(le16_to_cpu(el->l_tree_depth) > 1) {
if (le16_to_cpu(el->l_next_free_rec) == 0) {
@@ -864,13 +1173,10 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb,
goto bail;
}
- if (bh) {
- brelse(bh);
- bh = NULL;
- }
+ brelse(bh);
+ bh = NULL;
- status = ocfs2_read_block(osb, blkno, &bh, OCFS2_BH_CACHED,
- inode);
+ status = ocfs2_read_block(inode, blkno, &bh);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -886,8 +1192,7 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb,
if (le16_to_cpu(el->l_next_free_rec) <
le16_to_cpu(el->l_count)) {
- if (lowest_bh)
- brelse(lowest_bh);
+ brelse(lowest_bh);
lowest_bh = bh;
get_bh(lowest_bh);
}
@@ -895,14 +1200,13 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb,
/* If we didn't find one and the fe doesn't have any room,
* then return '1' */
- if (!lowest_bh
- && (fe->id2.i_list.l_next_free_rec == fe->id2.i_list.l_count))
+ el = et->et_root_el;
+ if (!lowest_bh && (el->l_next_free_rec == el->l_count))
status = 1;
*target_bh = lowest_bh;
bail:
- if (bh)
- brelse(bh);
+ brelse(bh);
mlog_exit(status);
return status;
@@ -919,19 +1223,19 @@ bail:
* *last_eb_bh will be updated by ocfs2_add_branch().
*/
static int ocfs2_grow_tree(struct inode *inode, handle_t *handle,
- struct buffer_head *di_bh, int *final_depth,
+ struct ocfs2_extent_tree *et, int *final_depth,
struct buffer_head **last_eb_bh,
struct ocfs2_alloc_context *meta_ac)
{
int ret, shift;
- struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
- int depth = le16_to_cpu(di->id2.i_list.l_tree_depth);
+ struct ocfs2_extent_list *el = et->et_root_el;
+ int depth = le16_to_cpu(el->l_tree_depth);
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct buffer_head *bh = NULL;
BUG_ON(meta_ac == NULL);
- shift = ocfs2_find_branch_target(osb, inode, di_bh, &bh);
+ shift = ocfs2_find_branch_target(osb, inode, et, &bh);
if (shift < 0) {
ret = shift;
mlog_errno(ret);
@@ -948,7 +1252,7 @@ static int ocfs2_grow_tree(struct inode *inode, handle_t *handle,
/* ocfs2_shift_tree_depth will return us a buffer with
* the new extent block (so we can pass that to
* ocfs2_add_branch). */
- ret = ocfs2_shift_tree_depth(osb, handle, inode, di_bh,
+ ret = ocfs2_shift_tree_depth(osb, handle, inode, et,
meta_ac, &bh);
if (ret < 0) {
mlog_errno(ret);
@@ -975,7 +1279,7 @@ static int ocfs2_grow_tree(struct inode *inode, handle_t *handle,
/* call ocfs2_add_branch to add the final part of the tree with
* the new data. */
mlog(0, "add branch. bh = %p\n", bh);
- ret = ocfs2_add_branch(osb, handle, inode, di_bh, bh, last_eb_bh,
+ ret = ocfs2_add_branch(osb, handle, inode, et, bh, last_eb_bh,
meta_ac);
if (ret < 0) {
mlog_errno(ret);
@@ -1236,8 +1540,7 @@ static int __ocfs2_find_path(struct inode *inode,
brelse(bh);
bh = NULL;
- ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno,
- &bh, OCFS2_BH_CACHED, inode);
+ ret = ocfs2_read_block(inode, blkno, &bh);
if (ret) {
mlog_errno(ret);
goto out;
@@ -2058,11 +2361,11 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
struct ocfs2_path *right_path,
int subtree_index,
struct ocfs2_cached_dealloc_ctxt *dealloc,
- int *deleted)
+ int *deleted,
+ struct ocfs2_extent_tree *et)
{
int ret, i, del_right_subtree = 0, right_has_empty = 0;
- struct buffer_head *root_bh, *di_bh = path_root_bh(right_path);
- struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+ struct buffer_head *root_bh, *et_root_bh = path_root_bh(right_path);
struct ocfs2_extent_list *right_leaf_el, *left_leaf_el;
struct ocfs2_extent_block *eb;
@@ -2114,7 +2417,7 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
* We have to update i_last_eb_blk during the meta
* data delete.
*/
- ret = ocfs2_journal_access(handle, inode, di_bh,
+ ret = ocfs2_journal_access(handle, inode, et_root_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
@@ -2189,7 +2492,7 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
ocfs2_update_edge_lengths(inode, handle, left_path);
eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
- di->i_last_eb_blk = eb->h_blkno;
+ ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
/*
* Removal of the extent in the left leaf was skipped
@@ -2199,7 +2502,7 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
if (right_has_empty)
ocfs2_remove_empty_extent(left_leaf_el);
- ret = ocfs2_journal_dirty(handle, di_bh);
+ ret = ocfs2_journal_dirty(handle, et_root_bh);
if (ret)
mlog_errno(ret);
@@ -2322,7 +2625,8 @@ static int __ocfs2_rotate_tree_left(struct inode *inode,
handle_t *handle, int orig_credits,
struct ocfs2_path *path,
struct ocfs2_cached_dealloc_ctxt *dealloc,
- struct ocfs2_path **empty_extent_path)
+ struct ocfs2_path **empty_extent_path,
+ struct ocfs2_extent_tree *et)
{
int ret, subtree_root, deleted;
u32 right_cpos;
@@ -2395,7 +2699,7 @@ static int __ocfs2_rotate_tree_left(struct inode *inode,
ret = ocfs2_rotate_subtree_left(inode, handle, left_path,
right_path, subtree_root,
- dealloc, &deleted);
+ dealloc, &deleted, et);
if (ret == -EAGAIN) {
/*
* The rotation has to temporarily stop due to
@@ -2438,29 +2742,20 @@ out:
}
static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
- struct ocfs2_path *path,
- struct ocfs2_cached_dealloc_ctxt *dealloc)
+ struct ocfs2_path *path,
+ struct ocfs2_cached_dealloc_ctxt *dealloc,
+ struct ocfs2_extent_tree *et)
{
int ret, subtree_index;
u32 cpos;
struct ocfs2_path *left_path = NULL;
- struct ocfs2_dinode *di;
struct ocfs2_extent_block *eb;
struct ocfs2_extent_list *el;
- /*
- * XXX: This code assumes that the root is an inode, which is
- * true for now but may change as tree code gets generic.
- */
- di = (struct ocfs2_dinode *)path_root_bh(path)->b_data;
- if (!OCFS2_IS_VALID_DINODE(di)) {
- ret = -EIO;
- ocfs2_error(inode->i_sb,
- "Inode %llu has invalid path root",
- (unsigned long long)OCFS2_I(inode)->ip_blkno);
- goto out;
- }
+ ret = ocfs2_et_sanity_check(inode, et);
+ if (ret)
+ goto out;
/*
* There's two ways we handle this depending on
* whether path is the only existing one.
@@ -2517,7 +2812,7 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
ocfs2_update_edge_lengths(inode, handle, left_path);
eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
- di->i_last_eb_blk = eb->h_blkno;
+ ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
} else {
/*
* 'path' is also the leftmost path which
@@ -2528,12 +2823,12 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
*/
ocfs2_unlink_path(inode, handle, dealloc, path, 1);
- el = &di->id2.i_list;
+ el = et->et_root_el;
el->l_tree_depth = 0;
el->l_next_free_rec = 0;
memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
- di->i_last_eb_blk = 0;
+ ocfs2_et_set_last_eb_blk(et, 0);
}
ocfs2_journal_dirty(handle, path_root_bh(path));
@@ -2561,7 +2856,8 @@ out:
*/
static int ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle,
struct ocfs2_path *path,
- struct ocfs2_cached_dealloc_ctxt *dealloc)
+ struct ocfs2_cached_dealloc_ctxt *dealloc,
+ struct ocfs2_extent_tree *et)
{
int ret, orig_credits = handle->h_buffer_credits;
struct ocfs2_path *tmp_path = NULL, *restart_path = NULL;
@@ -2575,7 +2871,7 @@ static int ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle,
if (path->p_tree_depth == 0) {
rightmost_no_delete:
/*
- * In-inode extents. This is trivially handled, so do
+ * Inline extents. This is trivially handled, so do
* it up front.
*/
ret = ocfs2_rotate_rightmost_leaf_left(inode, handle,
@@ -2629,7 +2925,7 @@ rightmost_no_delete:
*/
ret = ocfs2_remove_rightmost_path(inode, handle, path,
- dealloc);
+ dealloc, et);
if (ret)
mlog_errno(ret);
goto out;
@@ -2641,7 +2937,7 @@ rightmost_no_delete:
*/
try_rotate:
ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, path,
- dealloc, &restart_path);
+ dealloc, &restart_path, et);
if (ret && ret != -EAGAIN) {
mlog_errno(ret);
goto out;
@@ -2653,7 +2949,7 @@ try_rotate:
ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits,
tmp_path, dealloc,
- &restart_path);
+ &restart_path, et);
if (ret && ret != -EAGAIN) {
mlog_errno(ret);
goto out;
@@ -2939,6 +3235,7 @@ static int ocfs2_merge_rec_left(struct inode *inode,
handle_t *handle,
struct ocfs2_extent_rec *split_rec,
struct ocfs2_cached_dealloc_ctxt *dealloc,
+ struct ocfs2_extent_tree *et,
int index)
{
int ret, i, subtree_index = 0, has_empty_extent = 0;
@@ -3059,7 +3356,8 @@ static int ocfs2_merge_rec_left(struct inode *inode,
le16_to_cpu(el->l_next_free_rec) == 1) {
ret = ocfs2_remove_rightmost_path(inode, handle,
- right_path, dealloc);
+ right_path,
+ dealloc, et);
if (ret) {
mlog_errno(ret);
goto out;
@@ -3086,7 +3384,8 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
int split_index,
struct ocfs2_extent_rec *split_rec,
struct ocfs2_cached_dealloc_ctxt *dealloc,
- struct ocfs2_merge_ctxt *ctxt)
+ struct ocfs2_merge_ctxt *ctxt,
+ struct ocfs2_extent_tree *et)
{
int ret = 0;
@@ -3104,7 +3403,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
* illegal.
*/
ret = ocfs2_rotate_tree_left(inode, handle, path,
- dealloc);
+ dealloc, et);
if (ret) {
mlog_errno(ret);
goto out;
@@ -3147,7 +3446,8 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
/* The merge left us with an empty extent, remove it. */
- ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc);
+ ret = ocfs2_rotate_tree_left(inode, handle, path,
+ dealloc, et);
if (ret) {
mlog_errno(ret);
goto out;
@@ -3161,7 +3461,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
*/
ret = ocfs2_merge_rec_left(inode, path,
handle, rec,
- dealloc,
+ dealloc, et,
split_index);
if (ret) {
@@ -3170,7 +3470,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
}
ret = ocfs2_rotate_tree_left(inode, handle, path,
- dealloc);
+ dealloc, et);
/*
* Error from this last rotate is not critical, so
* print but don't bubble it up.
@@ -3190,7 +3490,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
ret = ocfs2_merge_rec_left(inode,
path,
handle, split_rec,
- dealloc,
+ dealloc, et,
split_index);
if (ret) {
mlog_errno(ret);
@@ -3213,7 +3513,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
* our leaf. Try to rotate it away.
*/
ret = ocfs2_rotate_tree_left(inode, handle, path,
- dealloc);
+ dealloc, et);
if (ret)
mlog_errno(ret);
ret = 0;
@@ -3347,16 +3647,6 @@ rotate:
ocfs2_rotate_leaf(el, insert_rec);
}
-static inline void ocfs2_update_dinode_clusters(struct inode *inode,
- struct ocfs2_dinode *di,
- u32 clusters)
-{
- le32_add_cpu(&di->i_clusters, clusters);
- spin_lock(&OCFS2_I(inode)->ip_lock);
- OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters);
- spin_unlock(&OCFS2_I(inode)->ip_lock);
-}
-
static void ocfs2_adjust_rightmost_records(struct inode *inode,
handle_t *handle,
struct ocfs2_path *path,
@@ -3558,8 +3848,8 @@ static void ocfs2_split_record(struct inode *inode,
}
/*
- * This function only does inserts on an allocation b-tree. For dinode
- * lists, ocfs2_insert_at_leaf() is called directly.
+ * This function only does inserts on an allocation b-tree. For tree
+ * depth = 0, ocfs2_insert_at_leaf() is called directly.
*
* right_path is the path we want to do the actual insert
* in. left_path should only be passed in if we need to update that
@@ -3656,7 +3946,7 @@ out:
static int ocfs2_do_insert_extent(struct inode *inode,
handle_t *handle,
- struct buffer_head *di_bh,
+ struct ocfs2_extent_tree *et,
struct ocfs2_extent_rec *insert_rec,
struct ocfs2_insert_type *type)
{
@@ -3664,13 +3954,11 @@ static int ocfs2_do_insert_extent(struct inode *inode,
u32 cpos;
struct ocfs2_path *right_path = NULL;
struct ocfs2_path *left_path = NULL;
- struct ocfs2_dinode *di;
struct ocfs2_extent_list *el;
- di = (struct ocfs2_dinode *) di_bh->b_data;
- el = &di->id2.i_list;
+ el = et->et_root_el;
- ret = ocfs2_journal_access(handle, inode, di_bh,
+ ret = ocfs2_journal_access(handle, inode, et->et_root_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
@@ -3682,7 +3970,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
goto out_update_clusters;
}
- right_path = ocfs2_new_inode_path(di_bh);
+ right_path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
if (!right_path) {
ret = -ENOMEM;
mlog_errno(ret);
@@ -3732,7 +4020,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
* ocfs2_rotate_tree_right() might have extended the
* transaction without re-journaling our tree root.
*/
- ret = ocfs2_journal_access(handle, inode, di_bh,
+ ret = ocfs2_journal_access(handle, inode, et->et_root_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
@@ -3757,10 +4045,10 @@ static int ocfs2_do_insert_extent(struct inode *inode,
out_update_clusters:
if (type->ins_split == SPLIT_NONE)
- ocfs2_update_dinode_clusters(inode, di,
- le16_to_cpu(insert_rec->e_leaf_clusters));
+ ocfs2_et_update_clusters(inode, et,
+ le16_to_cpu(insert_rec->e_leaf_clusters));
- ret = ocfs2_journal_dirty(handle, di_bh);
+ ret = ocfs2_journal_dirty(handle, et->et_root_bh);
if (ret)
mlog_errno(ret);
@@ -3890,7 +4178,8 @@ out:
static void ocfs2_figure_contig_type(struct inode *inode,
struct ocfs2_insert_type *insert,
struct ocfs2_extent_list *el,
- struct ocfs2_extent_rec *insert_rec)
+ struct ocfs2_extent_rec *insert_rec,
+ struct ocfs2_extent_tree *et)
{
int i;
enum ocfs2_contig_type contig_type = CONTIG_NONE;
@@ -3906,6 +4195,21 @@ static void ocfs2_figure_contig_type(struct inode *inode,
}
}
insert->ins_contig = contig_type;
+
+ if (insert->ins_contig != CONTIG_NONE) {
+ struct ocfs2_extent_rec *rec =
+ &el->l_recs[insert->ins_contig_index];
+ unsigned int len = le16_to_cpu(rec->e_leaf_clusters) +
+ le16_to_cpu(insert_rec->e_leaf_clusters);
+
+ /*
+ * Caller might want us to limit the size of extents, don't
+ * calculate contiguousness if we might exceed that limit.
+ */
+ if (et->et_max_leaf_clusters &&
+ (len > et->et_max_leaf_clusters))
+ insert->ins_contig = CONTIG_NONE;
+ }
}
/*
@@ -3914,8 +4218,8 @@ static void ocfs2_figure_contig_type(struct inode *inode,
* ocfs2_figure_appending_type() will figure out whether we'll have to
* insert at the tail of the rightmost leaf.
*
- * This should also work against the dinode list for tree's with 0
- * depth. If we consider the dinode list to be the rightmost leaf node
+ * This should also work against the root extent list for tree's with 0
+ * depth. If we consider the root extent list to be the rightmost leaf node
* then the logic here makes sense.
*/
static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert,
@@ -3966,14 +4270,13 @@ set_tail_append:
* structure.
*/
static int ocfs2_figure_insert_type(struct inode *inode,
- struct buffer_head *di_bh,
+ struct ocfs2_extent_tree *et,
struct buffer_head **last_eb_bh,
struct ocfs2_extent_rec *insert_rec,
int *free_records,
struct ocfs2_insert_type *insert)
{
int ret;
- struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
struct ocfs2_extent_block *eb;
struct ocfs2_extent_list *el;
struct ocfs2_path *path = NULL;
@@ -3981,7 +4284,7 @@ static int ocfs2_figure_insert_type(struct inode *inode,
insert->ins_split = SPLIT_NONE;
- el = &di->id2.i_list;
+ el = et->et_root_el;
insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth);
if (el->l_tree_depth) {
@@ -3991,9 +4294,7 @@ static int ocfs2_figure_insert_type(struct inode *inode,
* ocfs2_figure_insert_type() and ocfs2_add_branch()
* may want it later.
*/
- ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
- le64_to_cpu(di->i_last_eb_blk), &bh,
- OCFS2_BH_CACHED, inode);
+ ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et), &bh);
if (ret) {
mlog_exit(ret);
goto out;
@@ -4014,12 +4315,12 @@ static int ocfs2_figure_insert_type(struct inode *inode,
le16_to_cpu(el->l_next_free_rec);
if (!insert->ins_tree_depth) {
- ocfs2_figure_contig_type(inode, insert, el, insert_rec);
+ ocfs2_figure_contig_type(inode, insert, el, insert_rec, et);
ocfs2_figure_appending_type(insert, el, insert_rec);
return 0;
}
- path = ocfs2_new_inode_path(di_bh);
+ path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
if (!path) {
ret = -ENOMEM;
mlog_errno(ret);
@@ -4048,7 +4349,7 @@ static int ocfs2_figure_insert_type(struct inode *inode,
* into two types of appends: simple record append, or a
* rotate inside the tail leaf.
*/
- ocfs2_figure_contig_type(inode, insert, el, insert_rec);
+ ocfs2_figure_contig_type(inode, insert, el, insert_rec, et);
/*
* The insert code isn't quite ready to deal with all cases of
@@ -4069,7 +4370,8 @@ static int ocfs2_figure_insert_type(struct inode *inode,
* the case that we're doing a tail append, so maybe we can
* take advantage of that information somehow.
*/
- if (le64_to_cpu(di->i_last_eb_blk) == path_leaf_bh(path)->b_blocknr) {
+ if (ocfs2_et_get_last_eb_blk(et) ==
+ path_leaf_bh(path)->b_blocknr) {
/*
* Ok, ocfs2_find_path() returned us the rightmost
* tree path. This might be an appending insert. There are
@@ -4099,7 +4401,7 @@ out:
int ocfs2_insert_extent(struct ocfs2_super *osb,
handle_t *handle,
struct inode *inode,
- struct buffer_head *fe_bh,
+ struct ocfs2_extent_tree *et,
u32 cpos,
u64 start_blk,
u32 new_clusters,
@@ -4112,26 +4414,21 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
struct ocfs2_insert_type insert = {0, };
struct ocfs2_extent_rec rec;
- BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL);
-
mlog(0, "add %u clusters at position %u to inode %llu\n",
new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno);
- mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
- (OCFS2_I(inode)->ip_clusters != cpos),
- "Device %s, asking for sparse allocation: inode %llu, "
- "cpos %u, clusters %u\n",
- osb->dev_str,
- (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos,
- OCFS2_I(inode)->ip_clusters);
-
memset(&rec, 0, sizeof(rec));
rec.e_cpos = cpu_to_le32(cpos);
rec.e_blkno = cpu_to_le64(start_blk);
rec.e_leaf_clusters = cpu_to_le16(new_clusters);
rec.e_flags = flags;
+ status = ocfs2_et_insert_check(inode, et, &rec);
+ if (status) {
+ mlog_errno(status);
+ goto bail;
+ }
- status = ocfs2_figure_insert_type(inode, fe_bh, &last_eb_bh, &rec,
+ status = ocfs2_figure_insert_type(inode, et, &last_eb_bh, &rec,
&free_records, &insert);
if (status < 0) {
mlog_errno(status);
@@ -4145,7 +4442,7 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
free_records, insert.ins_tree_depth);
if (insert.ins_contig == CONTIG_NONE && free_records == 0) {
- status = ocfs2_grow_tree(inode, handle, fe_bh,
+ status = ocfs2_grow_tree(inode, handle, et,
&insert.ins_tree_depth, &last_eb_bh,
meta_ac);
if (status) {
@@ -4155,17 +4452,124 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
}
/* Finally, we can add clusters. This might rotate the tree for us. */
- status = ocfs2_do_insert_extent(inode, handle, fe_bh, &rec, &insert);
+ status = ocfs2_do_insert_extent(inode, handle, et, &rec, &insert);
if (status < 0)
mlog_errno(status);
- else
+ else if (et->et_ops == &ocfs2_dinode_et_ops)
ocfs2_extent_map_insert_rec(inode, &rec);
bail:
- if (last_eb_bh)
- brelse(last_eb_bh);
+ brelse(last_eb_bh);
+
+ mlog_exit(status);
+ return status;
+}
+
+/*
+ * Allcate and add clusters into the extent b-tree.
+ * The new clusters(clusters_to_add) will be inserted at logical_offset.
+ * The extent b-tree's root is specified by et, and
+ * it is not limited to the file storage. Any extent tree can use this
+ * function if it implements the proper ocfs2_extent_tree.
+ */
+int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb,
+ struct inode *inode,
+ u32 *logical_offset,
+ u32 clusters_to_add,
+ int mark_unwritten,
+ struct ocfs2_extent_tree *et,
+ handle_t *handle,
+ struct ocfs2_alloc_context *data_ac,
+ struct ocfs2_alloc_context *meta_ac,
+ enum ocfs2_alloc_restarted *reason_ret)
+{
+ int status = 0;
+ int free_extents;
+ enum ocfs2_alloc_restarted reason = RESTART_NONE;
+ u32 bit_off, num_bits;
+ u64 block;
+ u8 flags = 0;
+
+ BUG_ON(!clusters_to_add);
+
+ if (mark_unwritten)
+ flags = OCFS2_EXT_UNWRITTEN;
+
+ free_extents = ocfs2_num_free_extents(osb, inode, et);
+ if (free_extents < 0) {
+ status = free_extents;
+ mlog_errno(status);
+ goto leave;
+ }
+
+ /* there are two cases which could cause us to EAGAIN in the
+ * we-need-more-metadata case:
+ * 1) we haven't reserved *any*
+ * 2) we are so fragmented, we've needed to add metadata too
+ * many times. */
+ if (!free_extents && !meta_ac) {
+ mlog(0, "we haven't reserved any metadata!\n");
+ status = -EAGAIN;
+ reason = RESTART_META;
+ goto leave;
+ } else if ((!free_extents)
+ && (ocfs2_alloc_context_bits_left(meta_ac)
+ < ocfs2_extend_meta_needed(et->et_root_el))) {
+ mlog(0, "filesystem is really fragmented...\n");
+ status = -EAGAIN;
+ reason = RESTART_META;
+ goto leave;
+ }
+
+ status = __ocfs2_claim_clusters(osb, handle, data_ac, 1,
+ clusters_to_add, &bit_off, &num_bits);
+ if (status < 0) {
+ if (status != -ENOSPC)
+ mlog_errno(status);
+ goto leave;
+ }
+
+ BUG_ON(num_bits > clusters_to_add);
+ /* reserve our write early -- insert_extent may update the inode */
+ status = ocfs2_journal_access(handle, inode, et->et_root_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto leave;
+ }
+
+ block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
+ mlog(0, "Allocating %u clusters at block %u for inode %llu\n",
+ num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
+ status = ocfs2_insert_extent(osb, handle, inode, et,
+ *logical_offset, block,
+ num_bits, flags, meta_ac);
+ if (status < 0) {
+ mlog_errno(status);
+ goto leave;
+ }
+
+ status = ocfs2_journal_dirty(handle, et->et_root_bh);
+ if (status < 0) {
+ mlog_errno(status);
+ goto leave;
+ }
+
+ clusters_to_add -= num_bits;
+ *logical_offset += num_bits;
+
+ if (clusters_to_add) {
+ mlog(0, "need to alloc once more, wanted = %u\n",
+ clusters_to_add);
+ status = -EAGAIN;
+ reason = RESTART_TRANS;
+ }
+
+leave:
mlog_exit(status);
+ if (reason_ret)
+ *reason_ret = reason;
return status;
}
@@ -4192,7 +4596,7 @@ static void ocfs2_make_right_split_rec(struct super_block *sb,
static int ocfs2_split_and_insert(struct inode *inode,
handle_t *handle,
struct ocfs2_path *path,
- struct buffer_head *di_bh,
+ struct ocfs2_extent_tree *et,
struct buffer_head **last_eb_bh,
int split_index,
struct ocfs2_extent_rec *orig_split_rec,
@@ -4206,7 +4610,6 @@ static int ocfs2_split_and_insert(struct inode *inode,
struct ocfs2_extent_rec split_rec = *orig_split_rec;
struct ocfs2_insert_type insert;
struct ocfs2_extent_block *eb;
- struct ocfs2_dinode *di;
leftright:
/*
@@ -4215,8 +4618,7 @@ leftright:
*/
rec = path_leaf_el(path)->l_recs[split_index];
- di = (struct ocfs2_dinode *)di_bh->b_data;
- rightmost_el = &di->id2.i_list;
+ rightmost_el = et->et_root_el;
depth = le16_to_cpu(rightmost_el->l_tree_depth);
if (depth) {
@@ -4227,8 +4629,8 @@ leftright:
if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
le16_to_cpu(rightmost_el->l_count)) {
- ret = ocfs2_grow_tree(inode, handle, di_bh, &depth, last_eb_bh,
- meta_ac);
+ ret = ocfs2_grow_tree(inode, handle, et,
+ &depth, last_eb_bh, meta_ac);
if (ret) {
mlog_errno(ret);
goto out;
@@ -4265,8 +4667,7 @@ leftright:
do_leftright = 1;
}
- ret = ocfs2_do_insert_extent(inode, handle, di_bh, &split_rec,
- &insert);
+ ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert);
if (ret) {
mlog_errno(ret);
goto out;
@@ -4308,8 +4709,9 @@ out:
* of the tree is required. All other cases will degrade into a less
* optimal tree layout.
*
- * last_eb_bh should be the rightmost leaf block for any inode with a
- * btree. Since a split may grow the tree or a merge might shrink it, the caller cannot trust the contents of that buffer after this call.
+ * last_eb_bh should be the rightmost leaf block for any extent
+ * btree. Since a split may grow the tree or a merge might shrink it,
+ * the caller cannot trust the contents of that buffer after this call.
*
* This code is optimized for readability - several passes might be
* made over certain portions of the tree. All of those blocks will
@@ -4317,7 +4719,7 @@ out:
* extra overhead is not expressed in terms of disk reads.
*/
static int __ocfs2_mark_extent_written(struct inode *inode,
- struct buffer_head *di_bh,
+ struct ocfs2_extent_tree *et,
handle_t *handle,
struct ocfs2_path *path,
int split_index,
@@ -4357,11 +4759,9 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
*/
if (path->p_tree_depth) {
struct ocfs2_extent_block *eb;
- struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
- ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
- le64_to_cpu(di->i_last_eb_blk),
- &last_eb_bh, OCFS2_BH_CACHED, inode);
+ ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et),
+ &last_eb_bh);
if (ret) {
mlog_exit(ret);
goto out;
@@ -4394,7 +4794,7 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
if (ctxt.c_split_covers_rec)
el->l_recs[split_index] = *split_rec;
else
- ret = ocfs2_split_and_insert(inode, handle, path, di_bh,
+ ret = ocfs2_split_and_insert(inode, handle, path, et,
&last_eb_bh, split_index,
split_rec, meta_ac);
if (ret)
@@ -4402,7 +4802,7 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
} else {
ret = ocfs2_try_to_merge_extent(inode, handle, path,
split_index, split_rec,
- dealloc, &ctxt);
+ dealloc, &ctxt, et);
if (ret)
mlog_errno(ret);
}
@@ -4420,7 +4820,8 @@ out:
*
* The caller is responsible for passing down meta_ac if we'll need it.
*/
-int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh,
+int ocfs2_mark_extent_written(struct inode *inode,
+ struct ocfs2_extent_tree *et,
handle_t *handle, u32 cpos, u32 len, u32 phys,
struct ocfs2_alloc_context *meta_ac,
struct ocfs2_cached_dealloc_ctxt *dealloc)
@@ -4446,10 +4847,14 @@ int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh,
/*
* XXX: This should be fixed up so that we just re-insert the
* next extent records.
+ *
+ * XXX: This is a hack on the extent tree, maybe it should be
+ * an op?
*/
- ocfs2_extent_map_trunc(inode, 0);
+ if (et->et_ops == &ocfs2_dinode_et_ops)
+ ocfs2_extent_map_trunc(inode, 0);
- left_path = ocfs2_new_inode_path(di_bh);
+ left_path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
if (!left_path) {
ret = -ENOMEM;
mlog_errno(ret);
@@ -4480,8 +4885,9 @@ int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh,
split_rec.e_flags = path_leaf_el(left_path)->l_recs[index].e_flags;
split_rec.e_flags &= ~OCFS2_EXT_UNWRITTEN;
- ret = __ocfs2_mark_extent_written(inode, di_bh, handle, left_path,
- index, &split_rec, meta_ac, dealloc);
+ ret = __ocfs2_mark_extent_written(inode, et, handle, left_path,
+ index, &split_rec, meta_ac,
+ dealloc);
if (ret)
mlog_errno(ret);
@@ -4490,13 +4896,12 @@ out:
return ret;
}
-static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh,
+static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et,
handle_t *handle, struct ocfs2_path *path,
int index, u32 new_range,
struct ocfs2_alloc_context *meta_ac)
{
int ret, depth, credits = handle->h_buffer_credits;
- struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
struct buffer_head *last_eb_bh = NULL;
struct ocfs2_extent_block *eb;
struct ocfs2_extent_list *rightmost_el, *el;
@@ -4513,9 +4918,8 @@ static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh,
depth = path->p_tree_depth;
if (depth > 0) {
- ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
- le64_to_cpu(di->i_last_eb_blk),
- &last_eb_bh, OCFS2_BH_CACHED, inode);
+ ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et),
+ &last_eb_bh);
if (ret < 0) {
mlog_errno(ret);
goto out;
@@ -4526,7 +4930,8 @@ static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh,
} else
rightmost_el = path_leaf_el(path);
- credits += path->p_tree_depth + ocfs2_extend_meta_needed(di);
+ credits += path->p_tree_depth +
+ ocfs2_extend_meta_needed(et->et_root_el);
ret = ocfs2_extend_trans(handle, credits);
if (ret) {
mlog_errno(ret);
@@ -4535,7 +4940,7 @@ static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh,
if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
le16_to_cpu(rightmost_el->l_count)) {
- ret = ocfs2_grow_tree(inode, handle, di_bh, &depth, &last_eb_bh,
+ ret = ocfs2_grow_tree(inode, handle, et, &depth, &last_eb_bh,
meta_ac);
if (ret) {
mlog_errno(ret);
@@ -4549,7 +4954,7 @@ static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh,
insert.ins_split = SPLIT_RIGHT;
insert.ins_tree_depth = depth;
- ret = ocfs2_do_insert_extent(inode, handle, di_bh, &split_rec, &insert);
+ ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert);
if (ret)
mlog_errno(ret);
@@ -4561,7 +4966,8 @@ out:
static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
struct ocfs2_path *path, int index,
struct ocfs2_cached_dealloc_ctxt *dealloc,
- u32 cpos, u32 len)
+ u32 cpos, u32 len,
+ struct ocfs2_extent_tree *et)
{
int ret;
u32 left_cpos, rec_range, trunc_range;
@@ -4573,7 +4979,7 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
struct ocfs2_extent_block *eb;
if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
- ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc);
+ ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et);
if (ret) {
mlog_errno(ret);
goto out;
@@ -4704,7 +5110,7 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
ocfs2_journal_dirty(handle, path_leaf_bh(path));
- ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc);
+ ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et);
if (ret) {
mlog_errno(ret);
goto out;
@@ -4715,7 +5121,8 @@ out:
return ret;
}
-int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh,
+int ocfs2_remove_extent(struct inode *inode,
+ struct ocfs2_extent_tree *et,
u32 cpos, u32 len, handle_t *handle,
struct ocfs2_alloc_context *meta_ac,
struct ocfs2_cached_dealloc_ctxt *dealloc)
@@ -4724,11 +5131,11 @@ int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh,
u32 rec_range, trunc_range;
struct ocfs2_extent_rec *rec;
struct ocfs2_extent_list *el;
- struct ocfs2_path *path;
+ struct ocfs2_path *path = NULL;
ocfs2_extent_map_trunc(inode, 0);
- path = ocfs2_new_inode_path(di_bh);
+ path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
if (!path) {
ret = -ENOMEM;
mlog_errno(ret);
@@ -4781,13 +5188,13 @@ int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh,
if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) {
ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
- cpos, len);
+ cpos, len, et);
if (ret) {
mlog_errno(ret);
goto out;
}
} else {
- ret = ocfs2_split_tree(inode, di_bh, handle, path, index,
+ ret = ocfs2_split_tree(inode, et, handle, path, index,
trunc_range, meta_ac);
if (ret) {
mlog_errno(ret);
@@ -4836,7 +5243,7 @@ int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh,
}
ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
- cpos, len);
+ cpos, len, et);
if (ret) {
mlog_errno(ret);
goto out;
@@ -5179,8 +5586,7 @@ static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
goto bail;
}
- status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
- OCFS2_BH_CACHED, inode);
+ status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh);
if (status < 0) {
iput(inode);
mlog_errno(status);
@@ -5255,8 +5661,7 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
bail:
if (tl_inode)
iput(tl_inode);
- if (tl_bh)
- brelse(tl_bh);
+ brelse(tl_bh);
if (status < 0 && (*tl_copy)) {
kfree(*tl_copy);
@@ -5999,20 +6404,13 @@ bail:
return status;
}
-static int ocfs2_writeback_zero_func(handle_t *handle, struct buffer_head *bh)
+static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh)
{
set_buffer_uptodate(bh);
mark_buffer_dirty(bh);
return 0;
}
-static int ocfs2_ordered_zero_func(handle_t *handle, struct buffer_head *bh)
-{
- set_buffer_uptodate(bh);
- mark_buffer_dirty(bh);
- return ocfs2_journal_dirty_data(handle, bh);
-}
-
static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
unsigned int from, unsigned int to,
struct page *page, int zero, u64 *phys)
@@ -6031,17 +6429,18 @@ static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
* here if they aren't - ocfs2_map_page_blocks()
* might've skipped some
*/
- if (ocfs2_should_order_data(inode)) {
- ret = walk_page_buffers(handle,
- page_buffers(page),
- from, to, &partial,
- ocfs2_ordered_zero_func);
- if (ret < 0)
- mlog_errno(ret);
- } else {
+ ret = walk_page_buffers(handle, page_buffers(page),
+ from, to, &partial,
+ ocfs2_zero_func);
+ if (ret < 0)
+ mlog_errno(ret);
+ else if (ocfs2_should_order_data(inode)) {
+ ret = ocfs2_jbd2_file_inode(handle, inode);
+#ifdef CONFIG_OCFS2_COMPAT_JBD
ret = walk_page_buffers(handle, page_buffers(page),
from, to, &partial,
- ocfs2_writeback_zero_func);
+ ocfs2_journal_dirty_data);
+#endif
if (ret < 0)
mlog_errno(ret);
}
@@ -6206,20 +6605,29 @@ out:
return ret;
}
-static void ocfs2_zero_dinode_id2(struct inode *inode, struct ocfs2_dinode *di)
+static void ocfs2_zero_dinode_id2_with_xattr(struct inode *inode,
+ struct ocfs2_dinode *di)
{
unsigned int blocksize = 1 << inode->i_sb->s_blocksize_bits;
+ unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size);
- memset(&di->id2, 0, blocksize - offsetof(struct ocfs2_dinode, id2));
+ if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL)
+ memset(&di->id2, 0, blocksize -
+ offsetof(struct ocfs2_dinode, id2) -
+ xattrsize);
+ else
+ memset(&di->id2, 0, blocksize -
+ offsetof(struct ocfs2_dinode, id2));
}
void ocfs2_dinode_new_extent_list(struct inode *inode,
struct ocfs2_dinode *di)
{
- ocfs2_zero_dinode_id2(inode, di);
+ ocfs2_zero_dinode_id2_with_xattr(inode, di);
di->id2.i_list.l_tree_depth = 0;
di->id2.i_list.l_next_free_rec = 0;
- di->id2.i_list.l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(inode->i_sb));
+ di->id2.i_list.l_count = cpu_to_le16(
+ ocfs2_extent_recs_per_inode_with_xattr(inode->i_sb, di));
}
void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di)
@@ -6236,9 +6644,10 @@ void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di)
* We clear the entire i_data structure here so that all
* fields can be properly initialized.
*/
- ocfs2_zero_dinode_id2(inode, di);
+ ocfs2_zero_dinode_id2_with_xattr(inode, di);
- idata->id_count = cpu_to_le16(ocfs2_max_inline_data(inode->i_sb));
+ idata->id_count = cpu_to_le16(
+ ocfs2_max_inline_data_with_xattr(inode->i_sb, di));
}
int ocfs2_convert_inline_data_to_extents(struct inode *inode,
@@ -6253,6 +6662,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
struct ocfs2_alloc_context *data_ac = NULL;
struct page **pages = NULL;
loff_t end = osb->s_clustersize;
+ struct ocfs2_extent_tree et;
has_data = i_size_read(inode) ? 1 : 0;
@@ -6352,7 +6762,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
* this proves to be false, we could always re-build
* the in-inode data from our pages.
*/
- ret = ocfs2_insert_extent(osb, handle, inode, di_bh,
+ ocfs2_init_dinode_extent_tree(&et, inode, di_bh);
+ ret = ocfs2_insert_extent(osb, handle, inode, &et,
0, block, 1, 0, NULL);
if (ret) {
mlog_errno(ret);
@@ -6395,13 +6806,14 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
handle_t *handle = NULL;
struct inode *tl_inode = osb->osb_tl_inode;
struct ocfs2_path *path = NULL;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data;
mlog_entry_void();
new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
i_size_read(inode));
- path = ocfs2_new_inode_path(fe_bh);
+ path = ocfs2_new_path(fe_bh, &di->id2.i_list);
if (!path) {
status = -ENOMEM;
mlog_errno(status);
@@ -6572,8 +6984,8 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc);
if (fe->id2.i_list.l_tree_depth) {
- status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
- &last_eb_bh, OCFS2_BH_CACHED, inode);
+ status = ocfs2_read_block(inode, le64_to_cpu(fe->i_last_eb_blk),
+ &last_eb_bh);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -6686,8 +7098,7 @@ static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
mlog(ML_NOTICE,
"Truncate completion has non-empty dealloc context\n");
- if (tc->tc_last_eb_bh)
- brelse(tc->tc_last_eb_bh);
+ brelse(tc->tc_last_eb_bh);
kfree(tc);
}
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 60cd3d59230c..70257c84cfbe 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -26,30 +26,102 @@
#ifndef OCFS2_ALLOC_H
#define OCFS2_ALLOC_H
+
+/*
+ * For xattr tree leaf, we limit the leaf byte size to be 64K.
+ */
+#define OCFS2_MAX_XATTR_TREE_LEAF_SIZE 65536
+
+/*
+ * ocfs2_extent_tree and ocfs2_extent_tree_operations are used to abstract
+ * the b-tree operations in ocfs2. Now all the b-tree operations are not
+ * limited to ocfs2_dinode only. Any data which need to allocate clusters
+ * to store can use b-tree. And it only needs to implement its ocfs2_extent_tree
+ * and operation.
+ *
+ * ocfs2_extent_tree becomes the first-class object for extent tree
+ * manipulation. Callers of the alloc.c code need to fill it via one of
+ * the ocfs2_init_*_extent_tree() operations below.
+ *
+ * ocfs2_extent_tree contains info for the root of the b-tree, it must have a
+ * root ocfs2_extent_list and a root_bh so that they can be used in the b-tree
+ * functions.
+ * ocfs2_extent_tree_operations abstract the normal operations we do for
+ * the root of extent b-tree.
+ */
+struct ocfs2_extent_tree_operations;
+struct ocfs2_extent_tree {
+ struct ocfs2_extent_tree_operations *et_ops;
+ struct buffer_head *et_root_bh;
+ struct ocfs2_extent_list *et_root_el;
+ void *et_object;
+ unsigned int et_max_leaf_clusters;
+};
+
+/*
+ * ocfs2_init_*_extent_tree() will fill an ocfs2_extent_tree from the
+ * specified object buffer.
+ */
+void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et,
+ struct inode *inode,
+ struct buffer_head *bh);
+void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
+ struct inode *inode,
+ struct buffer_head *bh);
+void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
+ struct inode *inode,
+ struct buffer_head *bh,
+ struct ocfs2_xattr_value_root *xv);
+
struct ocfs2_alloc_context;
int ocfs2_insert_extent(struct ocfs2_super *osb,
handle_t *handle,
struct inode *inode,
- struct buffer_head *fe_bh,
+ struct ocfs2_extent_tree *et,
u32 cpos,
u64 start_blk,
u32 new_clusters,
u8 flags,
struct ocfs2_alloc_context *meta_ac);
+
+enum ocfs2_alloc_restarted {
+ RESTART_NONE = 0,
+ RESTART_TRANS,
+ RESTART_META
+};
+int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb,
+ struct inode *inode,
+ u32 *logical_offset,
+ u32 clusters_to_add,
+ int mark_unwritten,
+ struct ocfs2_extent_tree *et,
+ handle_t *handle,
+ struct ocfs2_alloc_context *data_ac,
+ struct ocfs2_alloc_context *meta_ac,
+ enum ocfs2_alloc_restarted *reason_ret);
struct ocfs2_cached_dealloc_ctxt;
-int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh,
+int ocfs2_mark_extent_written(struct inode *inode,
+ struct ocfs2_extent_tree *et,
handle_t *handle, u32 cpos, u32 len, u32 phys,
struct ocfs2_alloc_context *meta_ac,
struct ocfs2_cached_dealloc_ctxt *dealloc);
-int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh,
+int ocfs2_remove_extent(struct inode *inode,
+ struct ocfs2_extent_tree *et,
u32 cpos, u32 len, handle_t *handle,
struct ocfs2_alloc_context *meta_ac,
struct ocfs2_cached_dealloc_ctxt *dealloc);
int ocfs2_num_free_extents(struct ocfs2_super *osb,
struct inode *inode,
- struct ocfs2_dinode *fe);
-/* how many new metadata chunks would an allocation need at maximum? */
-static inline int ocfs2_extend_meta_needed(struct ocfs2_dinode *fe)
+ struct ocfs2_extent_tree *et);
+
+/*
+ * how many new metadata chunks would an allocation need at maximum?
+ *
+ * Please note that the caller must make sure that root_el is the root
+ * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise
+ * the result may be wrong.
+ */
+static inline int ocfs2_extend_meta_needed(struct ocfs2_extent_list *root_el)
{
/*
* Rather than do all the work of determining how much we need
@@ -59,7 +131,7 @@ static inline int ocfs2_extend_meta_needed(struct ocfs2_dinode *fe)
* new tree_depth==0 extent_block, and one block at the new
* top-of-the tree.
*/
- return le16_to_cpu(fe->id2.i_list.l_tree_depth) + 2;
+ return le16_to_cpu(root_el->l_tree_depth) + 2;
}
void ocfs2_dinode_new_extent_list(struct inode *inode, struct ocfs2_dinode *di);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index a53da1466277..c22543b33420 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -68,9 +68,7 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
goto bail;
}
- status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
- OCFS2_I(inode)->ip_blkno,
- &bh, OCFS2_BH_CACHED, inode);
+ status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -128,8 +126,7 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
err = 0;
bail:
- if (bh)
- brelse(bh);
+ brelse(bh);
mlog_exit(err);
return err;
@@ -261,13 +258,11 @@ static int ocfs2_readpage_inline(struct inode *inode, struct page *page)
{
int ret;
struct buffer_head *di_bh = NULL;
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
BUG_ON(!PageLocked(page));
BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL));
- ret = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &di_bh,
- OCFS2_BH_CACHED, inode);
+ ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh);
if (ret) {
mlog_errno(ret);
goto out;
@@ -485,11 +480,14 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
}
if (ocfs2_should_order_data(inode)) {
+ ret = ocfs2_jbd2_file_inode(handle, inode);
+#ifdef CONFIG_OCFS2_COMPAT_JBD
ret = walk_page_buffers(handle,
page_buffers(page),
from, to, NULL,
ocfs2_journal_dirty_data);
- if (ret < 0)
+#endif
+ if (ret < 0)
mlog_errno(ret);
}
out:
@@ -669,7 +667,7 @@ static void ocfs2_invalidatepage(struct page *page, unsigned long offset)
{
journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal;
- journal_invalidatepage(journal, page, offset);
+ jbd2_journal_invalidatepage(journal, page, offset);
}
static int ocfs2_releasepage(struct page *page, gfp_t wait)
@@ -678,7 +676,7 @@ static int ocfs2_releasepage(struct page *page, gfp_t wait)
if (!page_has_buffers(page))
return 0;
- return journal_try_to_free_buffers(journal, page, wait);
+ return jbd2_journal_try_to_free_buffers(journal, page, wait);
}
static ssize_t ocfs2_direct_IO(int rw,
@@ -1074,11 +1072,15 @@ static void ocfs2_write_failure(struct inode *inode,
tmppage = wc->w_pages[i];
if (page_has_buffers(tmppage)) {
- if (ocfs2_should_order_data(inode))
+ if (ocfs2_should_order_data(inode)) {
+ ocfs2_jbd2_file_inode(wc->w_handle, inode);
+#ifdef CONFIG_OCFS2_COMPAT_JBD
walk_page_buffers(wc->w_handle,
page_buffers(tmppage),
from, to, NULL,
ocfs2_journal_dirty_data);
+#endif
+ }
block_commit_write(tmppage, from, to);
}
@@ -1242,6 +1244,7 @@ static int ocfs2_write_cluster(struct address_space *mapping,
int ret, i, new, should_zero = 0;
u64 v_blkno, p_blkno;
struct inode *inode = mapping->host;
+ struct ocfs2_extent_tree et;
new = phys == 0 ? 1 : 0;
if (new || unwritten)
@@ -1255,10 +1258,10 @@ static int ocfs2_write_cluster(struct address_space *mapping,
* any additional semaphores or cluster locks.
*/
tmp_pos = cpos;
- ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode,
- &tmp_pos, 1, 0, wc->w_di_bh,
- wc->w_handle, data_ac,
- meta_ac, NULL);
+ ret = ocfs2_add_inode_data(OCFS2_SB(inode->i_sb), inode,
+ &tmp_pos, 1, 0, wc->w_di_bh,
+ wc->w_handle, data_ac,
+ meta_ac, NULL);
/*
* This shouldn't happen because we must have already
* calculated the correct meta data allocation required. The
@@ -1276,7 +1279,8 @@ static int ocfs2_write_cluster(struct address_space *mapping,
goto out;
}
} else if (unwritten) {
- ret = ocfs2_mark_extent_written(inode, wc->w_di_bh,
+ ocfs2_init_dinode_extent_tree(&et, inode, wc->w_di_bh);
+ ret = ocfs2_mark_extent_written(inode, &et,
wc->w_handle, cpos, 1, phys,
meta_ac, &wc->w_dealloc);
if (ret < 0) {
@@ -1665,6 +1669,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
struct ocfs2_alloc_context *data_ac = NULL;
struct ocfs2_alloc_context *meta_ac = NULL;
handle_t *handle;
+ struct ocfs2_extent_tree et;
ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh);
if (ret) {
@@ -1712,14 +1717,23 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
* ocfs2_lock_allocators(). It greatly over-estimates
* the work to be done.
*/
- ret = ocfs2_lock_allocators(inode, di, clusters_to_alloc,
- extents_to_split, &data_ac, &meta_ac);
+ mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u,"
+ " clusters_to_add = %u, extents_to_split = %u\n",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (long long)i_size_read(inode), le32_to_cpu(di->i_clusters),
+ clusters_to_alloc, extents_to_split);
+
+ ocfs2_init_dinode_extent_tree(&et, inode, wc->w_di_bh);
+ ret = ocfs2_lock_allocators(inode, &et,
+ clusters_to_alloc, extents_to_split,
+ &data_ac, &meta_ac);
if (ret) {
mlog_errno(ret);
goto out;
}
- credits = ocfs2_calc_extend_credits(inode->i_sb, di,
+ credits = ocfs2_calc_extend_credits(inode->i_sb,
+ &di->id2.i_list,
clusters_to_alloc);
}
@@ -1905,11 +1919,15 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
}
if (page_has_buffers(tmppage)) {
- if (ocfs2_should_order_data(inode))
+ if (ocfs2_should_order_data(inode)) {
+ ocfs2_jbd2_file_inode(wc->w_handle, inode);
+#ifdef CONFIG_OCFS2_COMPAT_JBD
walk_page_buffers(wc->w_handle,
page_buffers(tmppage),
from, to, NULL,
ocfs2_journal_dirty_data);
+#endif
+ }
block_commit_write(tmppage, from, to);
}
}
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index f136639f5b41..7e947c672469 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -66,7 +66,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
/* remove from dirty list before I/O. */
clear_buffer_dirty(bh);
- get_bh(bh); /* for end_buffer_write_sync() */
+ get_bh(bh); /* for end_buffer_write_sync() */
bh->b_end_io = end_buffer_write_sync;
submit_bh(WRITE, bh);
@@ -88,22 +88,103 @@ out:
return ret;
}
-int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
- struct buffer_head *bhs[], int flags,
- struct inode *inode)
+int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
+ unsigned int nr, struct buffer_head *bhs[])
+{
+ int status = 0;
+ unsigned int i;
+ struct buffer_head *bh;
+
+ if (!nr) {
+ mlog(ML_BH_IO, "No buffers will be read!\n");
+ goto bail;
+ }
+
+ for (i = 0 ; i < nr ; i++) {
+ if (bhs[i] == NULL) {
+ bhs[i] = sb_getblk(osb->sb, block++);
+ if (bhs[i] == NULL) {
+ status = -EIO;
+ mlog_errno(status);
+ goto bail;
+ }
+ }
+ bh = bhs[i];
+
+ if (buffer_jbd(bh)) {
+ mlog(ML_ERROR,
+ "trying to sync read a jbd "
+ "managed bh (blocknr = %llu), skipping\n",
+ (unsigned long long)bh->b_blocknr);
+ continue;
+ }
+
+ if (buffer_dirty(bh)) {
+ /* This should probably be a BUG, or
+ * at least return an error. */
+ mlog(ML_ERROR,
+ "trying to sync read a dirty "
+ "buffer! (blocknr = %llu), skipping\n",
+ (unsigned long long)bh->b_blocknr);
+ continue;
+ }
+
+ lock_buffer(bh);
+ if (buffer_jbd(bh)) {
+ mlog(ML_ERROR,
+ "block %llu had the JBD bit set "
+ "while I was in lock_buffer!",
+ (unsigned long long)bh->b_blocknr);
+ BUG();
+ }
+
+ clear_buffer_uptodate(bh);
+ get_bh(bh); /* for end_buffer_read_sync() */
+ bh->b_end_io = end_buffer_read_sync;
+ submit_bh(READ, bh);
+ }
+
+ for (i = nr; i > 0; i--) {
+ bh = bhs[i - 1];
+
+ if (buffer_jbd(bh)) {
+ mlog(ML_ERROR,
+ "the journal got the buffer while it was "
+ "locked for io! (blocknr = %llu)\n",
+ (unsigned long long)bh->b_blocknr);
+ BUG();
+ }
+
+ wait_on_buffer(bh);
+ if (!buffer_uptodate(bh)) {
+ /* Status won't be cleared from here on out,
+ * so we can safely record this and loop back
+ * to cleanup the other buffers. */
+ status = -EIO;
+ put_bh(bh);
+ bhs[i - 1] = NULL;
+ }
+ }
+
+bail:
+ return status;
+}
+
+int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
+ struct buffer_head *bhs[], int flags)
{
int status = 0;
- struct super_block *sb;
int i, ignore_cache = 0;
struct buffer_head *bh;
- mlog_entry("(block=(%llu), nr=(%d), flags=%d, inode=%p)\n",
- (unsigned long long)block, nr, flags, inode);
+ mlog_entry("(inode=%p, block=(%llu), nr=(%d), flags=%d)\n",
+ inode, (unsigned long long)block, nr, flags);
+ BUG_ON(!inode);
BUG_ON((flags & OCFS2_BH_READAHEAD) &&
- (!inode || !(flags & OCFS2_BH_CACHED)));
+ (flags & OCFS2_BH_IGNORE_CACHE));
- if (osb == NULL || osb->sb == NULL || bhs == NULL) {
+ if (bhs == NULL) {
status = -EINVAL;
mlog_errno(status);
goto bail;
@@ -122,26 +203,19 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
goto bail;
}
- sb = osb->sb;
-
- if (flags & OCFS2_BH_CACHED && !inode)
- flags &= ~OCFS2_BH_CACHED;
-
- if (inode)
- mutex_lock(&OCFS2_I(inode)->ip_io_mutex);
+ mutex_lock(&OCFS2_I(inode)->ip_io_mutex);
for (i = 0 ; i < nr ; i++) {
if (bhs[i] == NULL) {
- bhs[i] = sb_getblk(sb, block++);
+ bhs[i] = sb_getblk(inode->i_sb, block++);
if (bhs[i] == NULL) {
- if (inode)
- mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
+ mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
status = -EIO;
mlog_errno(status);
goto bail;
}
}
bh = bhs[i];
- ignore_cache = 0;
+ ignore_cache = (flags & OCFS2_BH_IGNORE_CACHE);
/* There are three read-ahead cases here which we need to
* be concerned with. All three assume a buffer has
@@ -167,26 +241,27 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
* before our is-it-in-flight check.
*/
- if (flags & OCFS2_BH_CACHED &&
- !ocfs2_buffer_uptodate(inode, bh)) {
+ if (!ignore_cache && !ocfs2_buffer_uptodate(inode, bh)) {
mlog(ML_UPTODATE,
"bh (%llu), inode %llu not uptodate\n",
(unsigned long long)bh->b_blocknr,
(unsigned long long)OCFS2_I(inode)->ip_blkno);
+ /* We're using ignore_cache here to say
+ * "go to disk" */
ignore_cache = 1;
}
/* XXX: Can we ever get this and *not* have the cached
* flag set? */
if (buffer_jbd(bh)) {
- if (!(flags & OCFS2_BH_CACHED) || ignore_cache)
+ if (ignore_cache)
mlog(ML_BH_IO, "trying to sync read a jbd "
"managed bh (blocknr = %llu)\n",
(unsigned long long)bh->b_blocknr);
continue;
}
- if (!(flags & OCFS2_BH_CACHED) || ignore_cache) {
+ if (ignore_cache) {
if (buffer_dirty(bh)) {
/* This should probably be a BUG, or
* at least return an error. */
@@ -221,7 +296,7 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
* previously read-ahead buffer may have
* completed I/O while we were waiting for the
* buffer lock. */
- if ((flags & OCFS2_BH_CACHED)
+ if (!(flags & OCFS2_BH_IGNORE_CACHE)
&& !(flags & OCFS2_BH_READAHEAD)
&& ocfs2_buffer_uptodate(inode, bh)) {
unlock_buffer(bh);
@@ -265,15 +340,14 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
/* Always set the buffer in the cache, even if it was
* a forced read, or read-ahead which hasn't yet
* completed. */
- if (inode)
- ocfs2_set_buffer_uptodate(inode, bh);
+ ocfs2_set_buffer_uptodate(inode, bh);
}
- if (inode)
- mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
+ mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
mlog(ML_BH_IO, "block=(%llu), nr=(%d), cached=%s, flags=0x%x\n",
(unsigned long long)block, nr,
- (!(flags & OCFS2_BH_CACHED) || ignore_cache) ? "no" : "yes", flags);
+ ((flags & OCFS2_BH_IGNORE_CACHE) || ignore_cache) ? "no" : "yes",
+ flags);
bail:
diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h
index c2e78614c3e5..75e1dcb1ade7 100644
--- a/fs/ocfs2/buffer_head_io.h
+++ b/fs/ocfs2/buffer_head_io.h
@@ -31,31 +31,29 @@
void ocfs2_end_buffer_io_sync(struct buffer_head *bh,
int uptodate);
-static inline int ocfs2_read_block(struct ocfs2_super *osb,
+static inline int ocfs2_read_block(struct inode *inode,
u64 off,
- struct buffer_head **bh,
- int flags,
- struct inode *inode);
+ struct buffer_head **bh);
int ocfs2_write_block(struct ocfs2_super *osb,
struct buffer_head *bh,
struct inode *inode);
-int ocfs2_read_blocks(struct ocfs2_super *osb,
+int ocfs2_read_blocks(struct inode *inode,
u64 block,
int nr,
struct buffer_head *bhs[],
- int flags,
- struct inode *inode);
+ int flags);
+int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
+ unsigned int nr, struct buffer_head *bhs[]);
int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
struct buffer_head *bh);
-#define OCFS2_BH_CACHED 1
+#define OCFS2_BH_IGNORE_CACHE 1
#define OCFS2_BH_READAHEAD 8
-static inline int ocfs2_read_block(struct ocfs2_super * osb, u64 off,
- struct buffer_head **bh, int flags,
- struct inode *inode)
+static inline int ocfs2_read_block(struct inode *inode, u64 off,
+ struct buffer_head **bh)
{
int status = 0;
@@ -65,8 +63,7 @@ static inline int ocfs2_read_block(struct ocfs2_super * osb, u64 off,
goto bail;
}
- status = ocfs2_read_blocks(osb, off, 1, bh,
- flags, inode);
+ status = ocfs2_read_blocks(inode, off, 1, bh, 0);
bail:
return status;
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index 23c732f27529..d8a0cb92cef6 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -109,6 +109,7 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
define_mask(CONN),
define_mask(QUORUM),
define_mask(EXPORT),
+ define_mask(XATTR),
define_mask(ERROR),
define_mask(NOTICE),
define_mask(KTHREAD),
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index 597e064bb94f..57670c680471 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -112,6 +112,7 @@
#define ML_CONN 0x0000000004000000ULL /* net connection management */
#define ML_QUORUM 0x0000000008000000ULL /* net connection quorum */
#define ML_EXPORT 0x0000000010000000ULL /* ocfs2 export operations */
+#define ML_XATTR 0x0000000020000000ULL /* ocfs2 extended attributes */
/* bits that are infrequently given and frequently matched in the high word */
#define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */
#define ML_NOTICE 0x0000000200000000ULL /* setn to KERN_NOTICE */
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 9cce563fd627..026e6eb85187 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -82,6 +82,49 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
struct ocfs2_alloc_context *meta_ac,
struct buffer_head **new_bh);
+static struct buffer_head *ocfs2_bread(struct inode *inode,
+ int block, int *err, int reada)
+{
+ struct buffer_head *bh = NULL;
+ int tmperr;
+ u64 p_blkno;
+ int readflags = 0;
+
+ if (reada)
+ readflags |= OCFS2_BH_READAHEAD;
+
+ if (((u64)block << inode->i_sb->s_blocksize_bits) >=
+ i_size_read(inode)) {
+ BUG_ON(!reada);
+ return NULL;
+ }
+
+ down_read(&OCFS2_I(inode)->ip_alloc_sem);
+ tmperr = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL,
+ NULL);
+ up_read(&OCFS2_I(inode)->ip_alloc_sem);
+ if (tmperr < 0) {
+ mlog_errno(tmperr);
+ goto fail;
+ }
+
+ tmperr = ocfs2_read_blocks(inode, p_blkno, 1, &bh, readflags);
+ if (tmperr < 0)
+ goto fail;
+
+ tmperr = 0;
+
+ *err = 0;
+ return bh;
+
+fail:
+ brelse(bh);
+ bh = NULL;
+
+ *err = -EIO;
+ return NULL;
+}
+
/*
* bh passed here can be an inode block or a dir data block, depending
* on the inode inline data flag.
@@ -188,8 +231,7 @@ static struct buffer_head *ocfs2_find_entry_id(const char *name,
struct ocfs2_dinode *di;
struct ocfs2_inline_data *data;
- ret = ocfs2_read_block(OCFS2_SB(dir->i_sb), OCFS2_I(dir)->ip_blkno,
- &di_bh, OCFS2_BH_CACHED, dir);
+ ret = ocfs2_read_block(dir, OCFS2_I(dir)->ip_blkno, &di_bh);
if (ret) {
mlog_errno(ret);
goto out;
@@ -260,14 +302,13 @@ restart:
}
if ((bh = bh_use[ra_ptr++]) == NULL)
goto next;
- wait_on_buffer(bh);
- if (!buffer_uptodate(bh)) {
- /* read error, skip block & hope for the best */
+ if (ocfs2_read_block(dir, block, &bh)) {
+ /* read error, skip block & hope for the best.
+ * ocfs2_read_block() has released the bh. */
ocfs2_error(dir->i_sb, "reading directory %llu, "
"offset %lu\n",
(unsigned long long)OCFS2_I(dir)->ip_blkno,
block);
- brelse(bh);
goto next;
}
i = ocfs2_search_dirblock(bh, dir, name, namelen,
@@ -417,8 +458,7 @@ static inline int ocfs2_delete_entry_id(handle_t *handle,
struct ocfs2_dinode *di;
struct ocfs2_inline_data *data;
- ret = ocfs2_read_block(OCFS2_SB(dir->i_sb), OCFS2_I(dir)->ip_blkno,
- &di_bh, OCFS2_BH_CACHED, dir);
+ ret = ocfs2_read_block(dir, OCFS2_I(dir)->ip_blkno, &di_bh);
if (ret) {
mlog_errno(ret);
goto out;
@@ -596,8 +636,7 @@ static int ocfs2_dir_foreach_blk_id(struct inode *inode,
struct ocfs2_inline_data *data;
struct ocfs2_dir_entry *de;
- ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), OCFS2_I(inode)->ip_blkno,
- &di_bh, OCFS2_BH_CACHED, inode);
+ ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh);
if (ret) {
mlog(ML_ERROR, "Unable to read inode block for dir %llu\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno);
@@ -716,8 +755,7 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode,
for (i = ra_sectors >> (sb->s_blocksize_bits - 9);
i > 0; i--) {
tmp = ocfs2_bread(inode, ++blk, &err, 1);
- if (tmp)
- brelse(tmp);
+ brelse(tmp);
}
last_ra_blk = blk;
ra_sectors = 8;
@@ -899,10 +937,8 @@ int ocfs2_find_files_on_disk(const char *name,
leave:
if (status < 0) {
*dirent = NULL;
- if (*dirent_bh) {
- brelse(*dirent_bh);
- *dirent_bh = NULL;
- }
+ brelse(*dirent_bh);
+ *dirent_bh = NULL;
}
mlog_exit(status);
@@ -951,8 +987,7 @@ int ocfs2_check_dir_for_entry(struct inode *dir,
ret = 0;
bail:
- if (dirent_bh)
- brelse(dirent_bh);
+ brelse(dirent_bh);
mlog_exit(ret);
return ret;
@@ -1127,8 +1162,7 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
status = 0;
bail:
- if (new_bh)
- brelse(new_bh);
+ brelse(new_bh);
mlog_exit(status);
return status;
@@ -1192,6 +1226,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
struct buffer_head *dirdata_bh = NULL;
struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
handle_t *handle;
+ struct ocfs2_extent_tree et;
+
+ ocfs2_init_dinode_extent_tree(&et, dir, di_bh);
alloc = ocfs2_clusters_for_bytes(sb, bytes);
@@ -1305,8 +1342,8 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
* This should never fail as our extent list is empty and all
* related blocks have been journaled already.
*/
- ret = ocfs2_insert_extent(osb, handle, dir, di_bh, 0, blkno, len, 0,
- NULL);
+ ret = ocfs2_insert_extent(osb, handle, dir, &et, 0, blkno, len,
+ 0, NULL);
if (ret) {
mlog_errno(ret);
goto out_commit;
@@ -1337,8 +1374,8 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
}
blkno = ocfs2_clusters_to_blocks(dir->i_sb, bit_off);
- ret = ocfs2_insert_extent(osb, handle, dir, di_bh, 1, blkno,
- len, 0, NULL);
+ ret = ocfs2_insert_extent(osb, handle, dir, &et, 1,
+ blkno, len, 0, NULL);
if (ret) {
mlog_errno(ret);
goto out_commit;
@@ -1383,9 +1420,9 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
if (extend) {
u32 offset = OCFS2_I(dir)->ip_clusters;
- status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, &offset,
- 1, 0, parent_fe_bh, handle,
- data_ac, meta_ac, NULL);
+ status = ocfs2_add_inode_data(OCFS2_SB(sb), dir, &offset,
+ 1, 0, parent_fe_bh, handle,
+ data_ac, meta_ac, NULL);
BUG_ON(status == -EAGAIN);
if (status < 0) {
mlog_errno(status);
@@ -1430,12 +1467,14 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
int credits, num_free_extents, drop_alloc_sem = 0;
loff_t dir_i_size;
struct ocfs2_dinode *fe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
+ struct ocfs2_extent_list *el = &fe->id2.i_list;
struct ocfs2_alloc_context *data_ac = NULL;
struct ocfs2_alloc_context *meta_ac = NULL;
handle_t *handle = NULL;
struct buffer_head *new_bh = NULL;
struct ocfs2_dir_entry * de;
struct super_block *sb = osb->sb;
+ struct ocfs2_extent_tree et;
mlog_entry_void();
@@ -1479,7 +1518,8 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
spin_lock(&OCFS2_I(dir)->ip_lock);
if (dir_i_size == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)) {
spin_unlock(&OCFS2_I(dir)->ip_lock);
- num_free_extents = ocfs2_num_free_extents(osb, dir, fe);
+ ocfs2_init_dinode_extent_tree(&et, dir, parent_fe_bh);
+ num_free_extents = ocfs2_num_free_extents(osb, dir, &et);
if (num_free_extents < 0) {
status = num_free_extents;
mlog_errno(status);
@@ -1487,7 +1527,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
}
if (!num_free_extents) {
- status = ocfs2_reserve_new_metadata(osb, fe, &meta_ac);
+ status = ocfs2_reserve_new_metadata(osb, el, &meta_ac);
if (status < 0) {
if (status != -ENOSPC)
mlog_errno(status);
@@ -1502,7 +1542,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
goto bail;
}
- credits = ocfs2_calc_extend_credits(sb, fe, 1);
+ credits = ocfs2_calc_extend_credits(sb, el, 1);
} else {
spin_unlock(&OCFS2_I(dir)->ip_lock);
credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS;
@@ -1568,8 +1608,7 @@ bail:
if (meta_ac)
ocfs2_free_alloc_context(meta_ac);
- if (new_bh)
- brelse(new_bh);
+ brelse(new_bh);
mlog_exit(status);
return status;
@@ -1696,8 +1735,7 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
status = 0;
bail:
- if (bh)
- brelse(bh);
+ brelse(bh);
mlog_exit(status);
return status;
@@ -1756,7 +1794,6 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
*ret_de_bh = bh;
bh = NULL;
out:
- if (bh)
- brelse(bh);
+ brelse(bh);
return ret;
}
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index eae3d643a5e4..ec684426034b 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2024,8 +2024,7 @@ static int ocfs2_inode_lock_update(struct inode *inode,
} else {
/* Boo, we have to go to disk. */
/* read bh, cast, ocfs2_refresh_inode */
- status = ocfs2_read_block(OCFS2_SB(inode->i_sb), oi->ip_blkno,
- bh, OCFS2_BH_CACHED, inode);
+ status = ocfs2_read_block(inode, oi->ip_blkno, bh);
if (status < 0) {
mlog_errno(status);
goto bail_refresh;
@@ -2086,11 +2085,7 @@ static int ocfs2_assign_bh(struct inode *inode,
return 0;
}
- status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
- OCFS2_I(inode)->ip_blkno,
- ret_bh,
- OCFS2_BH_CACHED,
- inode);
+ status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, ret_bh);
if (status < 0)
mlog_errno(status);
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index aed268e80b49..2baedac58234 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -293,8 +293,7 @@ static int ocfs2_last_eb_is_empty(struct inode *inode,
struct ocfs2_extent_block *eb;
struct ocfs2_extent_list *el;
- ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), last_eb_blk,
- &eb_bh, OCFS2_BH_CACHED, inode);
+ ret = ocfs2_read_block(inode, last_eb_blk, &eb_bh);
if (ret) {
mlog_errno(ret);
goto out;
@@ -382,9 +381,9 @@ static int ocfs2_figure_hole_clusters(struct inode *inode,
if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL)
goto no_more_extents;
- ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
+ ret = ocfs2_read_block(inode,
le64_to_cpu(eb->h_next_leaf_blk),
- &next_eb_bh, OCFS2_BH_CACHED, inode);
+ &next_eb_bh);
if (ret) {
mlog_errno(ret);
goto out;
@@ -551,6 +550,66 @@ static void ocfs2_relative_extent_offsets(struct super_block *sb,
*num_clusters = le16_to_cpu(rec->e_leaf_clusters) - coff;
}
+int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
+ u32 *p_cluster, u32 *num_clusters,
+ struct ocfs2_extent_list *el)
+{
+ int ret = 0, i;
+ struct buffer_head *eb_bh = NULL;
+ struct ocfs2_extent_block *eb;
+ struct ocfs2_extent_rec *rec;
+ u32 coff;
+
+ if (el->l_tree_depth) {
+ ret = ocfs2_find_leaf(inode, el, v_cluster, &eb_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ eb = (struct ocfs2_extent_block *) eb_bh->b_data;
+ el = &eb->h_list;
+
+ if (el->l_tree_depth) {
+ ocfs2_error(inode->i_sb,
+ "Inode %lu has non zero tree depth in "
+ "xattr leaf block %llu\n", inode->i_ino,
+ (unsigned long long)eb_bh->b_blocknr);
+ ret = -EROFS;
+ goto out;
+ }
+ }
+
+ i = ocfs2_search_extent_list(el, v_cluster);
+ if (i == -1) {
+ ret = -EROFS;
+ mlog_errno(ret);
+ goto out;
+ } else {
+ rec = &el->l_recs[i];
+ BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos));
+
+ if (!rec->e_blkno) {
+ ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
+ "record (%u, %u, 0) in xattr", inode->i_ino,
+ le32_to_cpu(rec->e_cpos),
+ ocfs2_rec_clusters(el, rec));
+ ret = -EROFS;
+ goto out;
+ }
+ coff = v_cluster - le32_to_cpu(rec->e_cpos);
+ *p_cluster = ocfs2_blocks_to_clusters(inode->i_sb,
+ le64_to_cpu(rec->e_blkno));
+ *p_cluster = *p_cluster + coff;
+ if (num_clusters)
+ *num_clusters = ocfs2_rec_clusters(el, rec) - coff;
+ }
+out:
+ if (eb_bh)
+ brelse(eb_bh);
+ return ret;
+}
+
int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
u32 *p_cluster, u32 *num_clusters,
unsigned int *extent_flags)
@@ -571,8 +630,7 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
if (ret == 0)
goto out;
- ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), OCFS2_I(inode)->ip_blkno,
- &di_bh, OCFS2_BH_CACHED, inode);
+ ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh);
if (ret) {
mlog_errno(ret);
goto out;
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h
index 1b97490e1ea8..1c4aa8b06f34 100644
--- a/fs/ocfs2/extent_map.h
+++ b/fs/ocfs2/extent_map.h
@@ -53,4 +53,8 @@ int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 map_start, u64 map_len);
+int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
+ u32 *p_cluster, u32 *num_clusters,
+ struct ocfs2_extent_list *el);
+
#endif /* _EXTENT_MAP_H */
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index ed38796052d2..8d3225a78073 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -55,6 +55,7 @@
#include "mmap.h"
#include "suballoc.h"
#include "super.h"
+#include "xattr.h"
#include "buffer_head_io.h"
@@ -184,7 +185,7 @@ static int ocfs2_sync_file(struct file *file,
goto bail;
journal = osb->journal->j_journal;
- err = journal_force_commit(journal);
+ err = jbd2_journal_force_commit(journal);
bail:
mlog_exit(err);
@@ -488,7 +489,7 @@ bail:
}
/*
- * extend allocation only here.
+ * extend file allocation only here.
* we'll update all the disk stuff, and oip->alloc_size
*
* expect stuff to be locked, a transaction started and enough data /
@@ -497,189 +498,25 @@ bail:
* Will return -EAGAIN, and a reason if a restart is needed.
* If passed in, *reason will always be set, even in error.
*/
-int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
- struct inode *inode,
- u32 *logical_offset,
- u32 clusters_to_add,
- int mark_unwritten,
- struct buffer_head *fe_bh,
- handle_t *handle,
- struct ocfs2_alloc_context *data_ac,
- struct ocfs2_alloc_context *meta_ac,
- enum ocfs2_alloc_restarted *reason_ret)
+int ocfs2_add_inode_data(struct ocfs2_super *osb,
+ struct inode *inode,
+ u32 *logical_offset,
+ u32 clusters_to_add,
+ int mark_unwritten,
+ struct buffer_head *fe_bh,
+ handle_t *handle,
+ struct ocfs2_alloc_context *data_ac,
+ struct ocfs2_alloc_context *meta_ac,
+ enum ocfs2_alloc_restarted *reason_ret)
{
- int status = 0;
- int free_extents;
- struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
- enum ocfs2_alloc_restarted reason = RESTART_NONE;
- u32 bit_off, num_bits;
- u64 block;
- u8 flags = 0;
-
- BUG_ON(!clusters_to_add);
-
- if (mark_unwritten)
- flags = OCFS2_EXT_UNWRITTEN;
-
- free_extents = ocfs2_num_free_extents(osb, inode, fe);
- if (free_extents < 0) {
- status = free_extents;
- mlog_errno(status);
- goto leave;
- }
-
- /* there are two cases which could cause us to EAGAIN in the
- * we-need-more-metadata case:
- * 1) we haven't reserved *any*
- * 2) we are so fragmented, we've needed to add metadata too
- * many times. */
- if (!free_extents && !meta_ac) {
- mlog(0, "we haven't reserved any metadata!\n");
- status = -EAGAIN;
- reason = RESTART_META;
- goto leave;
- } else if ((!free_extents)
- && (ocfs2_alloc_context_bits_left(meta_ac)
- < ocfs2_extend_meta_needed(fe))) {
- mlog(0, "filesystem is really fragmented...\n");
- status = -EAGAIN;
- reason = RESTART_META;
- goto leave;
- }
-
- status = __ocfs2_claim_clusters(osb, handle, data_ac, 1,
- clusters_to_add, &bit_off, &num_bits);
- if (status < 0) {
- if (status != -ENOSPC)
- mlog_errno(status);
- goto leave;
- }
-
- BUG_ON(num_bits > clusters_to_add);
-
- /* reserve our write early -- insert_extent may update the inode */
- status = ocfs2_journal_access(handle, inode, fe_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
-
- block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
- mlog(0, "Allocating %u clusters at block %u for inode %llu\n",
- num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
- status = ocfs2_insert_extent(osb, handle, inode, fe_bh,
- *logical_offset, block, num_bits,
- flags, meta_ac);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
-
- status = ocfs2_journal_dirty(handle, fe_bh);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
-
- clusters_to_add -= num_bits;
- *logical_offset += num_bits;
-
- if (clusters_to_add) {
- mlog(0, "need to alloc once more, clusters = %u, wanted = "
- "%u\n", fe->i_clusters, clusters_to_add);
- status = -EAGAIN;
- reason = RESTART_TRANS;
- }
-
-leave:
- mlog_exit(status);
- if (reason_ret)
- *reason_ret = reason;
- return status;
-}
-
-/*
- * For a given allocation, determine which allocators will need to be
- * accessed, and lock them, reserving the appropriate number of bits.
- *
- * Sparse file systems call this from ocfs2_write_begin_nolock()
- * and ocfs2_allocate_unwritten_extents().
- *
- * File systems which don't support holes call this from
- * ocfs2_extend_allocation().
- */
-int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
- u32 clusters_to_add, u32 extents_to_split,
- struct ocfs2_alloc_context **data_ac,
- struct ocfs2_alloc_context **meta_ac)
-{
- int ret = 0, num_free_extents;
- unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split;
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-
- *meta_ac = NULL;
- if (data_ac)
- *data_ac = NULL;
-
- BUG_ON(clusters_to_add != 0 && data_ac == NULL);
-
- mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, "
- "clusters_to_add = %u, extents_to_split = %u\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno, (long long)i_size_read(inode),
- le32_to_cpu(di->i_clusters), clusters_to_add, extents_to_split);
-
- num_free_extents = ocfs2_num_free_extents(osb, inode, di);
- if (num_free_extents < 0) {
- ret = num_free_extents;
- mlog_errno(ret);
- goto out;
- }
-
- /*
- * Sparse allocation file systems need to be more conservative
- * with reserving room for expansion - the actual allocation
- * happens while we've got a journal handle open so re-taking
- * a cluster lock (because we ran out of room for another
- * extent) will violate ordering rules.
- *
- * Most of the time we'll only be seeing this 1 cluster at a time
- * anyway.
- *
- * Always lock for any unwritten extents - we might want to
- * add blocks during a split.
- */
- if (!num_free_extents ||
- (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) {
- ret = ocfs2_reserve_new_metadata(osb, di, meta_ac);
- if (ret < 0) {
- if (ret != -ENOSPC)
- mlog_errno(ret);
- goto out;
- }
- }
-
- if (clusters_to_add == 0)
- goto out;
-
- ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac);
- if (ret < 0) {
- if (ret != -ENOSPC)
- mlog_errno(ret);
- goto out;
- }
-
-out:
- if (ret) {
- if (*meta_ac) {
- ocfs2_free_alloc_context(*meta_ac);
- *meta_ac = NULL;
- }
+ int ret;
+ struct ocfs2_extent_tree et;
- /*
- * We cannot have an error and a non null *data_ac.
- */
- }
+ ocfs2_init_dinode_extent_tree(&et, inode, fe_bh);
+ ret = ocfs2_add_clusters_in_btree(osb, inode, logical_offset,
+ clusters_to_add, mark_unwritten,
+ &et, handle,
+ data_ac, meta_ac, reason_ret);
return ret;
}
@@ -698,6 +535,7 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
struct ocfs2_alloc_context *meta_ac = NULL;
enum ocfs2_alloc_restarted why;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_extent_tree et;
mlog_entry("(clusters_to_add = %u)\n", clusters_to_add);
@@ -707,8 +545,7 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
*/
BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb));
- status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
- OCFS2_BH_CACHED, inode);
+ status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh);
if (status < 0) {
mlog_errno(status);
goto leave;
@@ -724,14 +561,21 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
restart_all:
BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
- status = ocfs2_lock_allocators(inode, fe, clusters_to_add, 0, &data_ac,
- &meta_ac);
+ mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, "
+ "clusters_to_add = %u\n",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (long long)i_size_read(inode), le32_to_cpu(fe->i_clusters),
+ clusters_to_add);
+ ocfs2_init_dinode_extent_tree(&et, inode, bh);
+ status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
+ &data_ac, &meta_ac);
if (status) {
mlog_errno(status);
goto leave;
}
- credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add);
+ credits = ocfs2_calc_extend_credits(osb->sb, &fe->id2.i_list,
+ clusters_to_add);
handle = ocfs2_start_trans(osb, credits);
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
@@ -753,16 +597,16 @@ restarted_transaction:
prev_clusters = OCFS2_I(inode)->ip_clusters;
- status = ocfs2_do_extend_allocation(osb,
- inode,
- &logical_start,
- clusters_to_add,
- mark_unwritten,
- bh,
- handle,
- data_ac,
- meta_ac,
- &why);
+ status = ocfs2_add_inode_data(osb,
+ inode,
+ &logical_start,
+ clusters_to_add,
+ mark_unwritten,
+ bh,
+ handle,
+ data_ac,
+ meta_ac,
+ &why);
if ((status < 0) && (status != -EAGAIN)) {
if (status != -ENOSPC)
mlog_errno(status);
@@ -789,7 +633,7 @@ restarted_transaction:
mlog(0, "restarting transaction.\n");
/* TODO: This can be more intelligent. */
credits = ocfs2_calc_extend_credits(osb->sb,
- fe,
+ &fe->id2.i_list,
clusters_to_add);
status = ocfs2_extend_trans(handle, credits);
if (status < 0) {
@@ -826,10 +670,8 @@ leave:
restart_func = 0;
goto restart_all;
}
- if (bh) {
- brelse(bh);
- bh = NULL;
- }
+ brelse(bh);
+ bh = NULL;
mlog_exit(status);
return status;
@@ -1096,9 +938,15 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
goto bail_unlock;
}
- if (i_size_read(inode) > attr->ia_size)
+ if (i_size_read(inode) > attr->ia_size) {
+ if (ocfs2_should_order_data(inode)) {
+ status = ocfs2_begin_ordered_truncate(inode,
+ attr->ia_size);
+ if (status)
+ goto bail_unlock;
+ }
status = ocfs2_truncate_file(inode, bh, attr->ia_size);
- else
+ } else
status = ocfs2_extend_file(inode, bh, attr->ia_size);
if (status < 0) {
if (status != -ENOSPC)
@@ -1140,8 +988,7 @@ bail_unlock_rw:
if (size_change)
ocfs2_rw_unlock(inode, 1);
bail:
- if (bh)
- brelse(bh);
+ brelse(bh);
mlog_exit(status);
return status;
@@ -1284,8 +1131,7 @@ static int ocfs2_write_remove_suid(struct inode *inode)
struct buffer_head *bh = NULL;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
- ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
- oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode);
+ ret = ocfs2_read_block(inode, oi->ip_blkno, &bh);
if (ret < 0) {
mlog_errno(ret);
goto out;
@@ -1311,9 +1157,8 @@ static int ocfs2_allocate_unwritten_extents(struct inode *inode,
struct buffer_head *di_bh = NULL;
if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
- ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
- OCFS2_I(inode)->ip_blkno, &di_bh,
- OCFS2_BH_CACHED, inode);
+ ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno,
+ &di_bh);
if (ret) {
mlog_errno(ret);
goto out;
@@ -1394,8 +1239,11 @@ static int __ocfs2_remove_inode_range(struct inode *inode,
handle_t *handle;
struct ocfs2_alloc_context *meta_ac = NULL;
struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+ struct ocfs2_extent_tree et;
- ret = ocfs2_lock_allocators(inode, di, 0, 1, NULL, &meta_ac);
+ ocfs2_init_dinode_extent_tree(&et, inode, di_bh);
+
+ ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac);
if (ret) {
mlog_errno(ret);
return ret;
@@ -1425,7 +1273,7 @@ static int __ocfs2_remove_inode_range(struct inode *inode,
goto out;
}
- ret = ocfs2_remove_extent(inode, di_bh, cpos, len, handle, meta_ac,
+ ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, meta_ac,
dealloc);
if (ret) {
mlog_errno(ret);
@@ -2040,7 +1888,7 @@ out_dio:
*/
if (old_size != i_size_read(inode) ||
old_clusters != OCFS2_I(inode)->ip_clusters) {
- ret = journal_force_commit(osb->journal->j_journal);
+ ret = jbd2_journal_force_commit(osb->journal->j_journal);
if (ret < 0)
written = ret;
}
@@ -2227,6 +2075,10 @@ const struct inode_operations ocfs2_file_iops = {
.setattr = ocfs2_setattr,
.getattr = ocfs2_getattr,
.permission = ocfs2_permission,
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .listxattr = ocfs2_listxattr,
+ .removexattr = generic_removexattr,
.fallocate = ocfs2_fallocate,
.fiemap = ocfs2_fiemap,
};
@@ -2237,6 +2089,10 @@ const struct inode_operations ocfs2_special_file_iops = {
.permission = ocfs2_permission,
};
+/*
+ * Other than ->lock, keep ocfs2_fops and ocfs2_dops in sync with
+ * ocfs2_fops_no_plocks and ocfs2_dops_no_plocks!
+ */
const struct file_operations ocfs2_fops = {
.llseek = generic_file_llseek,
.read = do_sync_read,
@@ -2251,6 +2107,7 @@ const struct file_operations ocfs2_fops = {
#ifdef CONFIG_COMPAT
.compat_ioctl = ocfs2_compat_ioctl,
#endif
+ .lock = ocfs2_lock,
.flock = ocfs2_flock,
.splice_read = ocfs2_file_splice_read,
.splice_write = ocfs2_file_splice_write,
@@ -2267,5 +2124,51 @@ const struct file_operations ocfs2_dops = {
#ifdef CONFIG_COMPAT
.compat_ioctl = ocfs2_compat_ioctl,
#endif
+ .lock = ocfs2_lock,
+ .flock = ocfs2_flock,
+};
+
+/*
+ * POSIX-lockless variants of our file_operations.
+ *
+ * These will be used if the underlying cluster stack does not support
+ * posix file locking, if the user passes the "localflocks" mount
+ * option, or if we have a local-only fs.
+ *
+ * ocfs2_flock is in here because all stacks handle UNIX file locks,
+ * so we still want it in the case of no stack support for
+ * plocks. Internally, it will do the right thing when asked to ignore
+ * the cluster.
+ */
+const struct file_operations ocfs2_fops_no_plocks = {
+ .llseek = generic_file_llseek,
+ .read = do_sync_read,
+ .write = do_sync_write,
+ .mmap = ocfs2_mmap,
+ .fsync = ocfs2_sync_file,
+ .release = ocfs2_file_release,
+ .open = ocfs2_file_open,
+ .aio_read = ocfs2_file_aio_read,
+ .aio_write = ocfs2_file_aio_write,
+ .unlocked_ioctl = ocfs2_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = ocfs2_compat_ioctl,
+#endif
+ .flock = ocfs2_flock,
+ .splice_read = ocfs2_file_splice_read,
+ .splice_write = ocfs2_file_splice_write,
+};
+
+const struct file_operations ocfs2_dops_no_plocks = {
+ .llseek = generic_file_llseek,
+ .read = generic_read_dir,
+ .readdir = ocfs2_readdir,
+ .fsync = ocfs2_sync_file,
+ .release = ocfs2_dir_release,
+ .open = ocfs2_dir_open,
+ .unlocked_ioctl = ocfs2_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = ocfs2_compat_ioctl,
+#endif
.flock = ocfs2_flock,
};
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index 1e27b4d017ea..e92382cbca5f 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -28,9 +28,12 @@
extern const struct file_operations ocfs2_fops;
extern const struct file_operations ocfs2_dops;
+extern const struct file_operations ocfs2_fops_no_plocks;
+extern const struct file_operations ocfs2_dops_no_plocks;
extern const struct inode_operations ocfs2_file_iops;
extern const struct inode_operations ocfs2_special_file_iops;
struct ocfs2_alloc_context;
+enum ocfs2_alloc_restarted;
struct ocfs2_file_private {
struct file *fp_file;
@@ -38,27 +41,18 @@ struct ocfs2_file_private {
struct ocfs2_lock_res fp_flock;
};
-enum ocfs2_alloc_restarted {
- RESTART_NONE = 0,
- RESTART_TRANS,
- RESTART_META
-};
-int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
- struct inode *inode,
- u32 *logical_offset,
- u32 clusters_to_add,
- int mark_unwritten,
- struct buffer_head *fe_bh,
- handle_t *handle,
- struct ocfs2_alloc_context *data_ac,
- struct ocfs2_alloc_context *meta_ac,
- enum ocfs2_alloc_restarted *reason_ret);
+int ocfs2_add_inode_data(struct ocfs2_super *osb,
+ struct inode *inode,
+ u32 *logical_offset,
+ u32 clusters_to_add,
+ int mark_unwritten,
+ struct buffer_head *fe_bh,
+ handle_t *handle,
+ struct ocfs2_alloc_context *data_ac,
+ struct ocfs2_alloc_context *meta_ac,
+ enum ocfs2_alloc_restarted *reason_ret);
int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size,
u64 zero_to);
-int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
- u32 clusters_to_add, u32 extents_to_split,
- struct ocfs2_alloc_context **data_ac,
- struct ocfs2_alloc_context **meta_ac);
int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat);
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 7e9e4c79aec7..4903688f72a9 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -49,6 +49,7 @@
#include "symlink.h"
#include "sysfile.h"
#include "uptodate.h"
+#include "xattr.h"
#include "buffer_head_io.h"
@@ -219,6 +220,7 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
struct super_block *sb;
struct ocfs2_super *osb;
int status = -EINVAL;
+ int use_plocks = 1;
mlog_entry("(0x%p, size:%llu)\n", inode,
(unsigned long long)le64_to_cpu(fe->i_size));
@@ -226,6 +228,10 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
sb = inode->i_sb;
osb = OCFS2_SB(sb);
+ if ((osb->s_mount_opt & OCFS2_MOUNT_LOCALFLOCKS) ||
+ ocfs2_mount_local(osb) || !ocfs2_stack_supports_plocks())
+ use_plocks = 0;
+
/* this means that read_inode cannot create a superblock inode
* today. change if needed. */
if (!OCFS2_IS_VALID_DINODE(fe) ||
@@ -295,13 +301,19 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
switch (inode->i_mode & S_IFMT) {
case S_IFREG:
- inode->i_fop = &ocfs2_fops;
+ if (use_plocks)
+ inode->i_fop = &ocfs2_fops;
+ else
+ inode->i_fop = &ocfs2_fops_no_plocks;
inode->i_op = &ocfs2_file_iops;
i_size_write(inode, le64_to_cpu(fe->i_size));
break;
case S_IFDIR:
inode->i_op = &ocfs2_dir_iops;
- inode->i_fop = &ocfs2_dops;
+ if (use_plocks)
+ inode->i_fop = &ocfs2_dops;
+ else
+ inode->i_fop = &ocfs2_dops_no_plocks;
i_size_write(inode, le64_to_cpu(fe->i_size));
break;
case S_IFLNK:
@@ -448,8 +460,11 @@ static int ocfs2_read_locked_inode(struct inode *inode,
}
}
- status = ocfs2_read_block(osb, args->fi_blkno, &bh, 0,
- can_lock ? inode : NULL);
+ if (can_lock)
+ status = ocfs2_read_blocks(inode, args->fi_blkno, 1, &bh,
+ OCFS2_BH_IGNORE_CACHE);
+ else
+ status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -522,6 +537,9 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
* data and fast symlinks.
*/
if (fe->i_clusters) {
+ if (ocfs2_should_order_data(inode))
+ ocfs2_begin_ordered_truncate(inode, 0);
+
handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
@@ -730,6 +748,13 @@ static int ocfs2_wipe_inode(struct inode *inode,
goto bail_unlock_dir;
}
+ /*Free extended attribute resources associated with this inode.*/
+ status = ocfs2_xattr_remove(inode, di_bh);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail_unlock_dir;
+ }
+
status = ocfs2_remove_inode(inode, di_bh, orphan_dir_inode,
orphan_dir_bh);
if (status < 0)
@@ -1081,6 +1106,8 @@ void ocfs2_clear_inode(struct inode *inode)
oi->ip_last_trans = 0;
oi->ip_dir_start_lookup = 0;
oi->ip_blkno = 0ULL;
+ jbd2_journal_release_jbd_inode(OCFS2_SB(inode->i_sb)->journal->j_journal,
+ &oi->ip_jinode);
bail:
mlog_exit_void();
@@ -1107,58 +1134,6 @@ void ocfs2_drop_inode(struct inode *inode)
}
/*
- * TODO: this should probably be merged into ocfs2_get_block
- *
- * However, you now need to pay attention to the cont_prepare_write()
- * stuff in ocfs2_get_block (that is, ocfs2_get_block pretty much
- * expects never to extend).
- */
-struct buffer_head *ocfs2_bread(struct inode *inode,
- int block, int *err, int reada)
-{
- struct buffer_head *bh = NULL;
- int tmperr;
- u64 p_blkno;
- int readflags = OCFS2_BH_CACHED;
-
- if (reada)
- readflags |= OCFS2_BH_READAHEAD;
-
- if (((u64)block << inode->i_sb->s_blocksize_bits) >=
- i_size_read(inode)) {
- BUG_ON(!reada);
- return NULL;
- }
-
- down_read(&OCFS2_I(inode)->ip_alloc_sem);
- tmperr = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL,
- NULL);
- up_read(&OCFS2_I(inode)->ip_alloc_sem);
- if (tmperr < 0) {
- mlog_errno(tmperr);
- goto fail;
- }
-
- tmperr = ocfs2_read_block(OCFS2_SB(inode->i_sb), p_blkno, &bh,
- readflags, inode);
- if (tmperr < 0)
- goto fail;
-
- tmperr = 0;
-
- *err = 0;
- return bh;
-
-fail:
- if (bh) {
- brelse(bh);
- bh = NULL;
- }
- *err = -EIO;
- return NULL;
-}
-
-/*
* This is called from our getattr.
*/
int ocfs2_inode_revalidate(struct dentry *dentry)
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 390a85596aa0..2f37af9bcc4a 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -40,6 +40,9 @@ struct ocfs2_inode_info
/* protects allocation changes on this inode. */
struct rw_semaphore ip_alloc_sem;
+ /* protects extended attribute changes on this inode */
+ struct rw_semaphore ip_xattr_sem;
+
/* These fields are protected by ip_lock */
spinlock_t ip_lock;
u32 ip_open_count;
@@ -68,6 +71,7 @@ struct ocfs2_inode_info
struct ocfs2_extent_map ip_extent_map;
struct inode vfs_inode;
+ struct jbd2_inode ip_jinode;
};
/*
@@ -113,8 +117,6 @@ extern struct kmem_cache *ocfs2_inode_cache;
extern const struct address_space_operations ocfs2_aops;
-struct buffer_head *ocfs2_bread(struct inode *inode, int block,
- int *err, int reada);
void ocfs2_clear_inode(struct inode *inode);
void ocfs2_delete_inode(struct inode *inode);
void ocfs2_drop_inode(struct inode *inode);
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 7b142f0ce995..9fcd36dcc9a0 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -102,8 +102,7 @@ bail_unlock:
bail:
mutex_unlock(&inode->i_mutex);
- if (bh)
- brelse(bh);
+ brelse(bh);
mlog_exit(status);
return status;
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index c47bc2a809c2..81e40677eecb 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -215,9 +215,9 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb)
goto finally;
}
- journal_lock_updates(journal->j_journal);
- status = journal_flush(journal->j_journal);
- journal_unlock_updates(journal->j_journal);
+ jbd2_journal_lock_updates(journal->j_journal);
+ status = jbd2_journal_flush(journal->j_journal);
+ jbd2_journal_unlock_updates(journal->j_journal);
if (status < 0) {
up_write(&journal->j_trans_barrier);
mlog_errno(status);
@@ -264,7 +264,7 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs)
down_read(&osb->journal->j_trans_barrier);
- handle = journal_start(journal, max_buffs);
+ handle = jbd2_journal_start(journal, max_buffs);
if (IS_ERR(handle)) {
up_read(&osb->journal->j_trans_barrier);
@@ -290,7 +290,7 @@ int ocfs2_commit_trans(struct ocfs2_super *osb,
BUG_ON(!handle);
- ret = journal_stop(handle);
+ ret = jbd2_journal_stop(handle);
if (ret < 0)
mlog_errno(ret);
@@ -304,7 +304,7 @@ int ocfs2_commit_trans(struct ocfs2_super *osb,
* transaction. extend_trans will either extend the current handle by
* nblocks, or commit it and start a new one with nblocks credits.
*
- * This might call journal_restart() which will commit dirty buffers
+ * This might call jbd2_journal_restart() which will commit dirty buffers
* and then restart the transaction. Before calling
* ocfs2_extend_trans(), any changed blocks should have been
* dirtied. After calling it, all blocks which need to be changed must
@@ -332,7 +332,7 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks)
#ifdef CONFIG_OCFS2_DEBUG_FS
status = 1;
#else
- status = journal_extend(handle, nblocks);
+ status = jbd2_journal_extend(handle, nblocks);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -340,8 +340,10 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks)
#endif
if (status > 0) {
- mlog(0, "journal_extend failed, trying journal_restart\n");
- status = journal_restart(handle, nblocks);
+ mlog(0,
+ "jbd2_journal_extend failed, trying "
+ "jbd2_journal_restart\n");
+ status = jbd2_journal_restart(handle, nblocks);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -393,11 +395,11 @@ int ocfs2_journal_access(handle_t *handle,
switch (type) {
case OCFS2_JOURNAL_ACCESS_CREATE:
case OCFS2_JOURNAL_ACCESS_WRITE:
- status = journal_get_write_access(handle, bh);
+ status = jbd2_journal_get_write_access(handle, bh);
break;
case OCFS2_JOURNAL_ACCESS_UNDO:
- status = journal_get_undo_access(handle, bh);
+ status = jbd2_journal_get_undo_access(handle, bh);
break;
default:
@@ -422,7 +424,7 @@ int ocfs2_journal_dirty(handle_t *handle,
mlog_entry("(bh->b_blocknr=%llu)\n",
(unsigned long long)bh->b_blocknr);
- status = journal_dirty_metadata(handle, bh);
+ status = jbd2_journal_dirty_metadata(handle, bh);
if (status < 0)
mlog(ML_ERROR, "Could not dirty metadata buffer. "
"(bh->b_blocknr=%llu)\n",
@@ -432,6 +434,7 @@ int ocfs2_journal_dirty(handle_t *handle,
return status;
}
+#ifdef CONFIG_OCFS2_COMPAT_JBD
int ocfs2_journal_dirty_data(handle_t *handle,
struct buffer_head *bh)
{
@@ -443,8 +446,9 @@ int ocfs2_journal_dirty_data(handle_t *handle,
return err;
}
+#endif
-#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD_DEFAULT_MAX_COMMIT_AGE)
+#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE)
void ocfs2_set_journal_params(struct ocfs2_super *osb)
{
@@ -457,9 +461,9 @@ void ocfs2_set_journal_params(struct ocfs2_super *osb)
spin_lock(&journal->j_state_lock);
journal->j_commit_interval = commit_interval;
if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
- journal->j_flags |= JFS_BARRIER;
+ journal->j_flags |= JBD2_BARRIER;
else
- journal->j_flags &= ~JFS_BARRIER;
+ journal->j_flags &= ~JBD2_BARRIER;
spin_unlock(&journal->j_state_lock);
}
@@ -524,14 +528,14 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
mlog(0, "inode->ip_clusters = %u\n", OCFS2_I(inode)->ip_clusters);
/* call the kernels journal init function now */
- j_journal = journal_init_inode(inode);
+ j_journal = jbd2_journal_init_inode(inode);
if (j_journal == NULL) {
mlog(ML_ERROR, "Linux journal layer error\n");
status = -EINVAL;
goto done;
}
- mlog(0, "Returned from journal_init_inode\n");
+ mlog(0, "Returned from jbd2_journal_init_inode\n");
mlog(0, "j_journal->j_maxlen = %u\n", j_journal->j_maxlen);
*dirty = (le32_to_cpu(di->id1.journal1.ij_flags) &
@@ -550,8 +554,7 @@ done:
if (status < 0) {
if (inode_lock)
ocfs2_inode_unlock(inode, 1);
- if (bh != NULL)
- brelse(bh);
+ brelse(bh);
if (inode) {
OCFS2_I(inode)->ip_open_count--;
iput(inode);
@@ -639,7 +642,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
if (journal->j_state != OCFS2_JOURNAL_LOADED)
goto done;
- /* need to inc inode use count as journal_destroy will iput. */
+ /* need to inc inode use count - jbd2_journal_destroy will iput. */
if (!igrab(inode))
BUG();
@@ -668,9 +671,9 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
BUG_ON(atomic_read(&(osb->journal->j_num_trans)) != 0);
if (ocfs2_mount_local(osb)) {
- journal_lock_updates(journal->j_journal);
- status = journal_flush(journal->j_journal);
- journal_unlock_updates(journal->j_journal);
+ jbd2_journal_lock_updates(journal->j_journal);
+ status = jbd2_journal_flush(journal->j_journal);
+ jbd2_journal_unlock_updates(journal->j_journal);
if (status < 0)
mlog_errno(status);
}
@@ -686,7 +689,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
}
/* Shutdown the kernel journal system */
- journal_destroy(journal->j_journal);
+ jbd2_journal_destroy(journal->j_journal);
OCFS2_I(inode)->ip_open_count--;
@@ -711,15 +714,15 @@ static void ocfs2_clear_journal_error(struct super_block *sb,
{
int olderr;
- olderr = journal_errno(journal);
+ olderr = jbd2_journal_errno(journal);
if (olderr) {
mlog(ML_ERROR, "File system error %d recorded in "
"journal %u.\n", olderr, slot);
mlog(ML_ERROR, "File system on device %s needs checking.\n",
sb->s_id);
- journal_ack_err(journal);
- journal_clear_err(journal);
+ jbd2_journal_ack_err(journal);
+ jbd2_journal_clear_err(journal);
}
}
@@ -734,7 +737,7 @@ int ocfs2_journal_load(struct ocfs2_journal *journal, int local, int replayed)
osb = journal->j_osb;
- status = journal_load(journal->j_journal);
+ status = jbd2_journal_load(journal->j_journal);
if (status < 0) {
mlog(ML_ERROR, "Failed to load journal!\n");
goto done;
@@ -778,7 +781,7 @@ int ocfs2_journal_wipe(struct ocfs2_journal *journal, int full)
BUG_ON(!journal);
- status = journal_wipe(journal->j_journal, full);
+ status = jbd2_journal_wipe(journal->j_journal, full);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -847,9 +850,8 @@ static int ocfs2_force_read_journal(struct inode *inode)
/* We are reading journal data which should not
* be put in the uptodate cache */
- status = ocfs2_read_blocks(OCFS2_SB(inode->i_sb),
- p_blkno, p_blocks, bhs, 0,
- NULL);
+ status = ocfs2_read_blocks_sync(OCFS2_SB(inode->i_sb),
+ p_blkno, p_blocks, bhs);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -865,8 +867,7 @@ static int ocfs2_force_read_journal(struct inode *inode)
bail:
for(i = 0; i < CONCURRENT_JOURNAL_FILL; i++)
- if (bhs[i])
- brelse(bhs[i]);
+ brelse(bhs[i]);
mlog_exit(status);
return status;
}
@@ -1133,7 +1134,8 @@ static int ocfs2_read_journal_inode(struct ocfs2_super *osb,
}
SET_INODE_JOURNAL(inode);
- status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, bh, 0, inode);
+ status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1, bh,
+ OCFS2_BH_IGNORE_CACHE);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -1229,19 +1231,19 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
}
mlog(0, "calling journal_init_inode\n");
- journal = journal_init_inode(inode);
+ journal = jbd2_journal_init_inode(inode);
if (journal == NULL) {
mlog(ML_ERROR, "Linux journal layer error\n");
status = -EIO;
goto done;
}
- status = journal_load(journal);
+ status = jbd2_journal_load(journal);
if (status < 0) {
mlog_errno(status);
if (!igrab(inode))
BUG();
- journal_destroy(journal);
+ jbd2_journal_destroy(journal);
goto done;
}
@@ -1249,9 +1251,9 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
/* wipe the journal */
mlog(0, "flushing the journal.\n");
- journal_lock_updates(journal);
- status = journal_flush(journal);
- journal_unlock_updates(journal);
+ jbd2_journal_lock_updates(journal);
+ status = jbd2_journal_flush(journal);
+ jbd2_journal_unlock_updates(journal);
if (status < 0)
mlog_errno(status);
@@ -1272,7 +1274,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
if (!igrab(inode))
BUG();
- journal_destroy(journal);
+ jbd2_journal_destroy(journal);
done:
/* drop the lock on this nodes journal */
@@ -1282,8 +1284,7 @@ done:
if (inode)
iput(inode);
- if (bh)
- brelse(bh);
+ brelse(bh);
mlog_exit(status);
return status;
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 2178ebffa05f..d4d14e9a3cea 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -27,7 +27,12 @@
#define OCFS2_JOURNAL_H
#include <linux/fs.h>
-#include <linux/jbd.h>
+#ifndef CONFIG_OCFS2_COMPAT_JBD
+# include <linux/jbd2.h>
+#else
+# include <linux/jbd.h>
+# include "ocfs2_jbd_compat.h"
+#endif
enum ocfs2_journal_state {
OCFS2_JOURNAL_FREE = 0,
@@ -215,8 +220,8 @@ static inline void ocfs2_checkpoint_inode(struct inode *inode)
* buffer. Will have to call ocfs2_journal_dirty once
* we've actually dirtied it. Type is one of . or .
* ocfs2_journal_dirty - Mark a journalled buffer as having dirty data.
- * ocfs2_journal_dirty_data - Indicate that a data buffer should go out before
- * the current handle commits.
+ * ocfs2_jbd2_file_inode - Mark an inode so that its data goes out before
+ * the current handle commits.
*/
/* You must always start_trans with a number of buffs > 0, but it's
@@ -268,8 +273,10 @@ int ocfs2_journal_access(handle_t *handle,
*/
int ocfs2_journal_dirty(handle_t *handle,
struct buffer_head *bh);
+#ifdef CONFIG_OCFS2_COMPAT_JBD
int ocfs2_journal_dirty_data(handle_t *handle,
struct buffer_head *bh);
+#endif
/*
* Credit Macros:
@@ -283,6 +290,9 @@ int ocfs2_journal_dirty_data(handle_t *handle,
/* simple file updates like chmod, etc. */
#define OCFS2_INODE_UPDATE_CREDITS 1
+/* extended attribute block update */
+#define OCFS2_XATTR_BLOCK_UPDATE_CREDITS 1
+
/* group extend. inode update and last group update. */
#define OCFS2_GROUP_EXTEND_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
@@ -340,11 +350,23 @@ int ocfs2_journal_dirty_data(handle_t *handle,
#define OCFS2_RENAME_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 3 \
+ OCFS2_UNLINK_CREDITS)
+/* global bitmap dinode, group desc., relinked group,
+ * suballocator dinode, group desc., relinked group,
+ * dinode, xattr block */
+#define OCFS2_XATTR_BLOCK_CREATE_CREDITS (OCFS2_SUBALLOC_ALLOC * 2 + \
+ + OCFS2_INODE_UPDATE_CREDITS \
+ + OCFS2_XATTR_BLOCK_UPDATE_CREDITS)
+
+/*
+ * Please note that the caller must make sure that root_el is the root
+ * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise
+ * the result may be wrong.
+ */
static inline int ocfs2_calc_extend_credits(struct super_block *sb,
- struct ocfs2_dinode *fe,
+ struct ocfs2_extent_list *root_el,
u32 bits_wanted)
{
- int bitmap_blocks, sysfile_bitmap_blocks, dinode_blocks;
+ int bitmap_blocks, sysfile_bitmap_blocks, extent_blocks;
/* bitmap dinode, group desc. + relinked group. */
bitmap_blocks = OCFS2_SUBALLOC_ALLOC;
@@ -355,16 +377,16 @@ static inline int ocfs2_calc_extend_credits(struct super_block *sb,
* however many metadata chunks needed * a remaining suballoc
* alloc. */
sysfile_bitmap_blocks = 1 +
- (OCFS2_SUBALLOC_ALLOC - 1) * ocfs2_extend_meta_needed(fe);
+ (OCFS2_SUBALLOC_ALLOC - 1) * ocfs2_extend_meta_needed(root_el);
/* this does not include *new* metadata blocks, which are
- * accounted for in sysfile_bitmap_blocks. fe +
+ * accounted for in sysfile_bitmap_blocks. root_el +
* prev. last_eb_blk + blocks along edge of tree.
* calc_symlink_credits passes because we just need 1
* credit for the dinode there. */
- dinode_blocks = 1 + 1 + le16_to_cpu(fe->id2.i_list.l_tree_depth);
+ extent_blocks = 1 + 1 + le16_to_cpu(root_el->l_tree_depth);
- return bitmap_blocks + sysfile_bitmap_blocks + dinode_blocks;
+ return bitmap_blocks + sysfile_bitmap_blocks + extent_blocks;
}
static inline int ocfs2_calc_symlink_credits(struct super_block *sb)
@@ -415,4 +437,16 @@ static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb,
return credits;
}
+static inline int ocfs2_jbd2_file_inode(handle_t *handle, struct inode *inode)
+{
+ return jbd2_journal_file_inode(handle, &OCFS2_I(inode)->ip_jinode);
+}
+
+static inline int ocfs2_begin_ordered_truncate(struct inode *inode,
+ loff_t new_size)
+{
+ return jbd2_journal_begin_ordered_truncate(&OCFS2_I(inode)->ip_jinode,
+ new_size);
+}
+
#endif /* OCFS2_JOURNAL_H */
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 28e492e4ec88..687b28713c32 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -28,6 +28,7 @@
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/bitops.h>
+#include <linux/debugfs.h>
#define MLOG_MASK_PREFIX ML_DISK_ALLOC
#include <cluster/masklog.h>
@@ -47,8 +48,6 @@
#define OCFS2_LOCAL_ALLOC(dinode) (&((dinode)->id2.i_lab))
-static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb);
-
static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc);
static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
@@ -75,24 +74,129 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
struct inode *local_alloc_inode);
-static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb)
+#ifdef CONFIG_OCFS2_FS_STATS
+
+static int ocfs2_la_debug_open(struct inode *inode, struct file *file)
+{
+ file->private_data = inode->i_private;
+ return 0;
+}
+
+#define LA_DEBUG_BUF_SZ PAGE_CACHE_SIZE
+#define LA_DEBUG_VER 1
+static ssize_t ocfs2_la_debug_read(struct file *file, char __user *userbuf,
+ size_t count, loff_t *ppos)
+{
+ static DEFINE_MUTEX(la_debug_mutex);
+ struct ocfs2_super *osb = file->private_data;
+ int written, ret;
+ char *buf = osb->local_alloc_debug_buf;
+
+ mutex_lock(&la_debug_mutex);
+ memset(buf, 0, LA_DEBUG_BUF_SZ);
+
+ written = snprintf(buf, LA_DEBUG_BUF_SZ,
+ "0x%x\t0x%llx\t%u\t%u\t0x%x\n",
+ LA_DEBUG_VER,
+ (unsigned long long)osb->la_last_gd,
+ osb->local_alloc_default_bits,
+ osb->local_alloc_bits, osb->local_alloc_state);
+
+ ret = simple_read_from_buffer(userbuf, count, ppos, buf, written);
+
+ mutex_unlock(&la_debug_mutex);
+ return ret;
+}
+
+static const struct file_operations ocfs2_la_debug_fops = {
+ .open = ocfs2_la_debug_open,
+ .read = ocfs2_la_debug_read,
+};
+
+static void ocfs2_init_la_debug(struct ocfs2_super *osb)
+{
+ osb->local_alloc_debug_buf = kmalloc(LA_DEBUG_BUF_SZ, GFP_NOFS);
+ if (!osb->local_alloc_debug_buf)
+ return;
+
+ osb->local_alloc_debug = debugfs_create_file("local_alloc_stats",
+ S_IFREG|S_IRUSR,
+ osb->osb_debug_root,
+ osb,
+ &ocfs2_la_debug_fops);
+ if (!osb->local_alloc_debug) {
+ kfree(osb->local_alloc_debug_buf);
+ osb->local_alloc_debug_buf = NULL;
+ }
+}
+
+static void ocfs2_shutdown_la_debug(struct ocfs2_super *osb)
+{
+ if (osb->local_alloc_debug)
+ debugfs_remove(osb->local_alloc_debug);
+
+ if (osb->local_alloc_debug_buf)
+ kfree(osb->local_alloc_debug_buf);
+
+ osb->local_alloc_debug_buf = NULL;
+ osb->local_alloc_debug = NULL;
+}
+#else /* CONFIG_OCFS2_FS_STATS */
+static void ocfs2_init_la_debug(struct ocfs2_super *osb)
+{
+ return;
+}
+static void ocfs2_shutdown_la_debug(struct ocfs2_super *osb)
+{
+ return;
+}
+#endif
+
+static inline int ocfs2_la_state_enabled(struct ocfs2_super *osb)
{
- BUG_ON(osb->s_clustersize_bits > 20);
+ return (osb->local_alloc_state == OCFS2_LA_THROTTLED ||
+ osb->local_alloc_state == OCFS2_LA_ENABLED);
+}
- /* Size local alloc windows by the megabyte */
- return osb->local_alloc_size << (20 - osb->s_clustersize_bits);
+void ocfs2_local_alloc_seen_free_bits(struct ocfs2_super *osb,
+ unsigned int num_clusters)
+{
+ spin_lock(&osb->osb_lock);
+ if (osb->local_alloc_state == OCFS2_LA_DISABLED ||
+ osb->local_alloc_state == OCFS2_LA_THROTTLED)
+ if (num_clusters >= osb->local_alloc_default_bits) {
+ cancel_delayed_work(&osb->la_enable_wq);
+ osb->local_alloc_state = OCFS2_LA_ENABLED;
+ }
+ spin_unlock(&osb->osb_lock);
+}
+
+void ocfs2_la_enable_worker(struct work_struct *work)
+{
+ struct ocfs2_super *osb =
+ container_of(work, struct ocfs2_super,
+ la_enable_wq.work);
+ spin_lock(&osb->osb_lock);
+ osb->local_alloc_state = OCFS2_LA_ENABLED;
+ spin_unlock(&osb->osb_lock);
}
/*
* Tell us whether a given allocation should use the local alloc
* file. Otherwise, it has to go to the main bitmap.
+ *
+ * This function does semi-dirty reads of local alloc size and state!
+ * This is ok however, as the values are re-checked once under mutex.
*/
int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits)
{
- int la_bits = ocfs2_local_alloc_window_bits(osb);
int ret = 0;
+ int la_bits;
+
+ spin_lock(&osb->osb_lock);
+ la_bits = osb->local_alloc_bits;
- if (osb->local_alloc_state != OCFS2_LA_ENABLED)
+ if (!ocfs2_la_state_enabled(osb))
goto bail;
/* la_bits should be at least twice the size (in clusters) of
@@ -106,6 +210,7 @@ int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits)
bail:
mlog(0, "state=%d, bits=%llu, la_bits=%d, ret=%d\n",
osb->local_alloc_state, (unsigned long long)bits, la_bits, ret);
+ spin_unlock(&osb->osb_lock);
return ret;
}
@@ -120,14 +225,18 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
mlog_entry_void();
- if (osb->local_alloc_size == 0)
+ ocfs2_init_la_debug(osb);
+
+ if (osb->local_alloc_bits == 0)
goto bail;
- if (ocfs2_local_alloc_window_bits(osb) >= osb->bitmap_cpg) {
+ if (osb->local_alloc_bits >= osb->bitmap_cpg) {
mlog(ML_NOTICE, "Requested local alloc window %d is larger "
"than max possible %u. Using defaults.\n",
- ocfs2_local_alloc_window_bits(osb), (osb->bitmap_cpg - 1));
- osb->local_alloc_size = OCFS2_DEFAULT_LOCAL_ALLOC_SIZE;
+ osb->local_alloc_bits, (osb->bitmap_cpg - 1));
+ osb->local_alloc_bits =
+ ocfs2_megabytes_to_clusters(osb->sb,
+ OCFS2_DEFAULT_LOCAL_ALLOC_SIZE);
}
/* read the alloc off disk */
@@ -139,8 +248,8 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
goto bail;
}
- status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno,
- &alloc_bh, 0, inode);
+ status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1,
+ &alloc_bh, OCFS2_BH_IGNORE_CACHE);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -185,13 +294,14 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
bail:
if (status < 0)
- if (alloc_bh)
- brelse(alloc_bh);
+ brelse(alloc_bh);
if (inode)
iput(inode);
- mlog(0, "Local alloc window bits = %d\n",
- ocfs2_local_alloc_window_bits(osb));
+ if (status < 0)
+ ocfs2_shutdown_la_debug(osb);
+
+ mlog(0, "Local alloc window bits = %d\n", osb->local_alloc_bits);
mlog_exit(status);
return status;
@@ -217,6 +327,11 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
mlog_entry_void();
+ cancel_delayed_work(&osb->la_enable_wq);
+ flush_workqueue(ocfs2_wq);
+
+ ocfs2_shutdown_la_debug(osb);
+
if (osb->local_alloc_state == OCFS2_LA_UNUSED)
goto out;
@@ -295,8 +410,7 @@ out_commit:
ocfs2_commit_trans(osb, handle);
out_unlock:
- if (main_bm_bh)
- brelse(main_bm_bh);
+ brelse(main_bm_bh);
ocfs2_inode_unlock(main_bm_inode, 1);
@@ -345,8 +459,8 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
mutex_lock(&inode->i_mutex);
- status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno,
- &alloc_bh, 0, inode);
+ status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1,
+ &alloc_bh, OCFS2_BH_IGNORE_CACHE);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -372,8 +486,7 @@ bail:
*alloc_copy = NULL;
}
- if (alloc_bh)
- brelse(alloc_bh);
+ brelse(alloc_bh);
if (inode) {
mutex_unlock(&inode->i_mutex);
@@ -441,8 +554,7 @@ out_unlock:
out_mutex:
mutex_unlock(&main_bm_inode->i_mutex);
- if (main_bm_bh)
- brelse(main_bm_bh);
+ brelse(main_bm_bh);
iput(main_bm_inode);
@@ -453,8 +565,48 @@ out:
return status;
}
+/* Check to see if the local alloc window is within ac->ac_max_block */
+static int ocfs2_local_alloc_in_range(struct inode *inode,
+ struct ocfs2_alloc_context *ac,
+ u32 bits_wanted)
+{
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_dinode *alloc;
+ struct ocfs2_local_alloc *la;
+ int start;
+ u64 block_off;
+
+ if (!ac->ac_max_block)
+ return 1;
+
+ alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
+ la = OCFS2_LOCAL_ALLOC(alloc);
+
+ start = ocfs2_local_alloc_find_clear_bits(osb, alloc, bits_wanted);
+ if (start == -1) {
+ mlog_errno(-ENOSPC);
+ return 0;
+ }
+
+ /*
+ * Converting (bm_off + start + bits_wanted) to blocks gives us
+ * the blkno just past our actual allocation. This is perfect
+ * to compare with ac_max_block.
+ */
+ block_off = ocfs2_clusters_to_blocks(inode->i_sb,
+ le32_to_cpu(la->la_bm_off) +
+ start + bits_wanted);
+ mlog(0, "Checking %llu against %llu\n",
+ (unsigned long long)block_off,
+ (unsigned long long)ac->ac_max_block);
+ if (block_off > ac->ac_max_block)
+ return 0;
+
+ return 1;
+}
+
/*
- * make sure we've got at least bitswanted contiguous bits in the
+ * make sure we've got at least bits_wanted contiguous bits in the
* local alloc. You lose them when you drop i_mutex.
*
* We will add ourselves to the transaction passed in, but may start
@@ -485,16 +637,18 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
mutex_lock(&local_alloc_inode->i_mutex);
- if (osb->local_alloc_state != OCFS2_LA_ENABLED) {
- status = -ENOSPC;
- goto bail;
- }
-
- if (bits_wanted > ocfs2_local_alloc_window_bits(osb)) {
- mlog(0, "Asking for more than my max window size!\n");
+ /*
+ * We must double check state and allocator bits because
+ * another process may have changed them while holding i_mutex.
+ */
+ spin_lock(&osb->osb_lock);
+ if (!ocfs2_la_state_enabled(osb) ||
+ (bits_wanted > osb->local_alloc_bits)) {
+ spin_unlock(&osb->osb_lock);
status = -ENOSPC;
goto bail;
}
+ spin_unlock(&osb->osb_lock);
alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
@@ -522,6 +676,36 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
mlog_errno(status);
goto bail;
}
+
+ /*
+ * Under certain conditions, the window slide code
+ * might have reduced the number of bits available or
+ * disabled the the local alloc entirely. Re-check
+ * here and return -ENOSPC if necessary.
+ */
+ status = -ENOSPC;
+ if (!ocfs2_la_state_enabled(osb))
+ goto bail;
+
+ free_bits = le32_to_cpu(alloc->id1.bitmap1.i_total) -
+ le32_to_cpu(alloc->id1.bitmap1.i_used);
+ if (bits_wanted > free_bits)
+ goto bail;
+ }
+
+ if (ac->ac_max_block)
+ mlog(0, "Calling in_range for max block %llu\n",
+ (unsigned long long)ac->ac_max_block);
+
+ if (!ocfs2_local_alloc_in_range(local_alloc_inode, ac,
+ bits_wanted)) {
+ /*
+ * The window is outside ac->ac_max_block.
+ * This errno tells the caller to keep localalloc enabled
+ * but to get the allocation from the main bitmap.
+ */
+ status = -EFBIG;
+ goto bail;
}
ac->ac_inode = local_alloc_inode;
@@ -789,6 +973,85 @@ bail:
return status;
}
+enum ocfs2_la_event {
+ OCFS2_LA_EVENT_SLIDE, /* Normal window slide. */
+ OCFS2_LA_EVENT_FRAGMENTED, /* The global bitmap has
+ * enough bits theoretically
+ * free, but a contiguous
+ * allocation could not be
+ * found. */
+ OCFS2_LA_EVENT_ENOSPC, /* Global bitmap doesn't have
+ * enough bits free to satisfy
+ * our request. */
+};
+#define OCFS2_LA_ENABLE_INTERVAL (30 * HZ)
+/*
+ * Given an event, calculate the size of our next local alloc window.
+ *
+ * This should always be called under i_mutex of the local alloc inode
+ * so that local alloc disabling doesn't race with processes trying to
+ * use the allocator.
+ *
+ * Returns the state which the local alloc was left in. This value can
+ * be ignored by some paths.
+ */
+static int ocfs2_recalc_la_window(struct ocfs2_super *osb,
+ enum ocfs2_la_event event)
+{
+ unsigned int bits;
+ int state;
+
+ spin_lock(&osb->osb_lock);
+ if (osb->local_alloc_state == OCFS2_LA_DISABLED) {
+ WARN_ON_ONCE(osb->local_alloc_state == OCFS2_LA_DISABLED);
+ goto out_unlock;
+ }
+
+ /*
+ * ENOSPC and fragmentation are treated similarly for now.
+ */
+ if (event == OCFS2_LA_EVENT_ENOSPC ||
+ event == OCFS2_LA_EVENT_FRAGMENTED) {
+ /*
+ * We ran out of contiguous space in the primary
+ * bitmap. Drastically reduce the number of bits used
+ * by local alloc until we have to disable it.
+ */
+ bits = osb->local_alloc_bits >> 1;
+ if (bits > ocfs2_megabytes_to_clusters(osb->sb, 1)) {
+ /*
+ * By setting state to THROTTLED, we'll keep
+ * the number of local alloc bits used down
+ * until an event occurs which would give us
+ * reason to assume the bitmap situation might
+ * have changed.
+ */
+ osb->local_alloc_state = OCFS2_LA_THROTTLED;
+ osb->local_alloc_bits = bits;
+ } else {
+ osb->local_alloc_state = OCFS2_LA_DISABLED;
+ }
+ queue_delayed_work(ocfs2_wq, &osb->la_enable_wq,
+ OCFS2_LA_ENABLE_INTERVAL);
+ goto out_unlock;
+ }
+
+ /*
+ * Don't increase the size of the local alloc window until we
+ * know we might be able to fulfill the request. Otherwise, we
+ * risk bouncing around the global bitmap during periods of
+ * low space.
+ */
+ if (osb->local_alloc_state != OCFS2_LA_THROTTLED)
+ osb->local_alloc_bits = osb->local_alloc_default_bits;
+
+out_unlock:
+ state = osb->local_alloc_state;
+ spin_unlock(&osb->osb_lock);
+
+ return state;
+}
+
static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb,
struct ocfs2_alloc_context **ac,
struct inode **bitmap_inode,
@@ -803,12 +1066,21 @@ static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb,
goto bail;
}
- (*ac)->ac_bits_wanted = ocfs2_local_alloc_window_bits(osb);
+retry_enospc:
+ (*ac)->ac_bits_wanted = osb->local_alloc_bits;
status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
+ if (status == -ENOSPC) {
+ if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_ENOSPC) ==
+ OCFS2_LA_DISABLED)
+ goto bail;
+
+ ocfs2_free_ac_resource(*ac);
+ memset(*ac, 0, sizeof(struct ocfs2_alloc_context));
+ goto retry_enospc;
+ }
if (status < 0) {
- if (status != -ENOSPC)
- mlog_errno(status);
+ mlog_errno(status);
goto bail;
}
@@ -849,7 +1121,7 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
"one\n");
mlog(0, "Allocating %u clusters for a new window.\n",
- ocfs2_local_alloc_window_bits(osb));
+ osb->local_alloc_bits);
/* Instruct the allocation code to try the most recently used
* cluster group. We'll re-record the group used this pass
@@ -859,9 +1131,36 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
/* we used the generic suballoc reserve function, but we set
* everything up nicely, so there's no reason why we can't use
* the more specific cluster api to claim bits. */
- status = ocfs2_claim_clusters(osb, handle, ac,
- ocfs2_local_alloc_window_bits(osb),
+ status = ocfs2_claim_clusters(osb, handle, ac, osb->local_alloc_bits,
&cluster_off, &cluster_count);
+ if (status == -ENOSPC) {
+retry_enospc:
+ /*
+ * Note: We could also try syncing the journal here to
+ * allow use of any free bits which the current
+ * transaction can't give us access to. --Mark
+ */
+ if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_FRAGMENTED) ==
+ OCFS2_LA_DISABLED)
+ goto bail;
+
+ status = ocfs2_claim_clusters(osb, handle, ac,
+ osb->local_alloc_bits,
+ &cluster_off,
+ &cluster_count);
+ if (status == -ENOSPC)
+ goto retry_enospc;
+ /*
+ * We only shrunk the *minimum* number of in our
+ * request - it's entirely possible that the allocator
+ * might give us more than we asked for.
+ */
+ if (status == 0) {
+ spin_lock(&osb->osb_lock);
+ osb->local_alloc_bits = cluster_count;
+ spin_unlock(&osb->osb_lock);
+ }
+ }
if (status < 0) {
if (status != -ENOSPC)
mlog_errno(status);
@@ -905,6 +1204,8 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
mlog_entry_void();
+ ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_SLIDE);
+
/* This will lock the main bitmap for us. */
status = ocfs2_local_alloc_reserve_for_window(osb,
&ac,
@@ -976,8 +1277,7 @@ bail:
if (handle)
ocfs2_commit_trans(osb, handle);
- if (main_bm_bh)
- brelse(main_bm_bh);
+ brelse(main_bm_bh);
if (main_bm_inode)
iput(main_bm_inode);
diff --git a/fs/ocfs2/localalloc.h b/fs/ocfs2/localalloc.h
index 3f76631e110c..ac5ea9f86653 100644
--- a/fs/ocfs2/localalloc.h
+++ b/fs/ocfs2/localalloc.h
@@ -52,4 +52,8 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
u32 *bit_off,
u32 *num_bits);
+void ocfs2_local_alloc_seen_free_bits(struct ocfs2_super *osb,
+ unsigned int num_clusters);
+void ocfs2_la_enable_worker(struct work_struct *work);
+
#endif /* OCFS2_LOCALALLOC_H */
diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c
index 203f87143877..544ac6245175 100644
--- a/fs/ocfs2/locks.c
+++ b/fs/ocfs2/locks.c
@@ -24,6 +24,7 @@
*/
#include <linux/fs.h>
+#include <linux/fcntl.h>
#define MLOG_MASK_PREFIX ML_INODE
#include <cluster/masklog.h>
@@ -32,6 +33,7 @@
#include "dlmglue.h"
#include "file.h"
+#include "inode.h"
#include "locks.h"
static int ocfs2_do_flock(struct file *file, struct inode *inode,
@@ -123,3 +125,16 @@ int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl)
else
return ocfs2_do_flock(file, inode, cmd, fl);
}
+
+int ocfs2_lock(struct file *file, int cmd, struct file_lock *fl)
+{
+ struct inode *inode = file->f_mapping->host;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ if (!(fl->fl_flags & FL_POSIX))
+ return -ENOLCK;
+ if (__mandatory_lock(inode))
+ return -ENOLCK;
+
+ return ocfs2_plock(osb->cconn, OCFS2_I(inode)->ip_blkno, file, cmd, fl);
+}
diff --git a/fs/ocfs2/locks.h b/fs/ocfs2/locks.h
index 9743ef2324ec..496d488b271f 100644
--- a/fs/ocfs2/locks.h
+++ b/fs/ocfs2/locks.h
@@ -27,5 +27,6 @@
#define OCFS2_LOCKS_H
int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl);
+int ocfs2_lock(struct file *file, int cmd, struct file_lock *fl);
#endif /* OCFS2_LOCKS_H */
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index d5d808fe0140..485a6aa0ad39 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -60,6 +60,7 @@
#include "symlink.h"
#include "sysfile.h"
#include "uptodate.h"
+#include "xattr.h"
#include "buffer_head_io.h"
@@ -327,14 +328,9 @@ leave:
if (status == -ENOSPC)
mlog(0, "Disk is full\n");
- if (new_fe_bh)
- brelse(new_fe_bh);
-
- if (de_bh)
- brelse(de_bh);
-
- if (parent_fe_bh)
- brelse(parent_fe_bh);
+ brelse(new_fe_bh);
+ brelse(de_bh);
+ brelse(parent_fe_bh);
if ((status < 0) && inode)
iput(inode);
@@ -647,12 +643,9 @@ out_unlock_inode:
out:
ocfs2_inode_unlock(dir, 1);
- if (de_bh)
- brelse(de_bh);
- if (fe_bh)
- brelse(fe_bh);
- if (parent_fe_bh)
- brelse(parent_fe_bh);
+ brelse(de_bh);
+ brelse(fe_bh);
+ brelse(parent_fe_bh);
mlog_exit(err);
@@ -851,17 +844,10 @@ leave:
iput(orphan_dir);
}
- if (fe_bh)
- brelse(fe_bh);
-
- if (dirent_bh)
- brelse(dirent_bh);
-
- if (parent_node_bh)
- brelse(parent_node_bh);
-
- if (orphan_entry_bh)
- brelse(orphan_entry_bh);
+ brelse(fe_bh);
+ brelse(dirent_bh);
+ brelse(parent_node_bh);
+ brelse(orphan_entry_bh);
mlog_exit(status);
@@ -1372,24 +1358,15 @@ bail:
if (new_inode)
iput(new_inode);
- if (newfe_bh)
- brelse(newfe_bh);
- if (old_inode_bh)
- brelse(old_inode_bh);
- if (old_dir_bh)
- brelse(old_dir_bh);
- if (new_dir_bh)
- brelse(new_dir_bh);
- if (new_de_bh)
- brelse(new_de_bh);
- if (old_de_bh)
- brelse(old_de_bh);
- if (old_inode_de_bh)
- brelse(old_inode_de_bh);
- if (orphan_entry_bh)
- brelse(orphan_entry_bh);
- if (insert_entry_bh)
- brelse(insert_entry_bh);
+ brelse(newfe_bh);
+ brelse(old_inode_bh);
+ brelse(old_dir_bh);
+ brelse(new_dir_bh);
+ brelse(new_de_bh);
+ brelse(old_de_bh);
+ brelse(old_inode_de_bh);
+ brelse(orphan_entry_bh);
+ brelse(insert_entry_bh);
mlog_exit(status);
@@ -1492,8 +1469,7 @@ bail:
if (bhs) {
for(i = 0; i < blocks; i++)
- if (bhs[i])
- brelse(bhs[i]);
+ brelse(bhs[i]);
kfree(bhs);
}
@@ -1598,10 +1574,10 @@ static int ocfs2_symlink(struct inode *dir,
u32 offset = 0;
inode->i_op = &ocfs2_symlink_inode_operations;
- status = ocfs2_do_extend_allocation(osb, inode, &offset, 1, 0,
- new_fe_bh,
- handle, data_ac, NULL,
- NULL);
+ status = ocfs2_add_inode_data(osb, inode, &offset, 1, 0,
+ new_fe_bh,
+ handle, data_ac, NULL,
+ NULL);
if (status < 0) {
if (status != -ENOSPC && status != -EINTR) {
mlog(ML_ERROR,
@@ -1659,12 +1635,9 @@ bail:
ocfs2_inode_unlock(dir, 1);
- if (new_fe_bh)
- brelse(new_fe_bh);
- if (parent_fe_bh)
- brelse(parent_fe_bh);
- if (de_bh)
- brelse(de_bh);
+ brelse(new_fe_bh);
+ brelse(parent_fe_bh);
+ brelse(de_bh);
if (inode_ac)
ocfs2_free_alloc_context(inode_ac);
if (data_ac)
@@ -1759,8 +1732,7 @@ leave:
iput(orphan_dir_inode);
}
- if (orphan_dir_bh)
- brelse(orphan_dir_bh);
+ brelse(orphan_dir_bh);
mlog_exit(status);
return status;
@@ -1780,10 +1752,9 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
- status = ocfs2_read_block(osb,
+ status = ocfs2_read_block(orphan_dir_inode,
OCFS2_I(orphan_dir_inode)->ip_blkno,
- &orphan_dir_bh, OCFS2_BH_CACHED,
- orphan_dir_inode);
+ &orphan_dir_bh);
if (status < 0) {
mlog_errno(status);
goto leave;
@@ -1829,8 +1800,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
(unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num);
leave:
- if (orphan_dir_bh)
- brelse(orphan_dir_bh);
+ brelse(orphan_dir_bh);
mlog_exit(status);
return status;
@@ -1898,8 +1868,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
}
leave:
- if (target_de_bh)
- brelse(target_de_bh);
+ brelse(target_de_bh);
mlog_exit(status);
return status;
@@ -1918,4 +1887,8 @@ const struct inode_operations ocfs2_dir_iops = {
.setattr = ocfs2_setattr,
.getattr = ocfs2_getattr,
.permission = ocfs2_permission,
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .listxattr = ocfs2_listxattr,
+ .removexattr = generic_removexattr,
};
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 7f625f2b1117..a21a465490c4 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -34,7 +34,12 @@
#include <linux/workqueue.h>
#include <linux/kref.h>
#include <linux/mutex.h>
-#include <linux/jbd.h>
+#ifndef CONFIG_OCFS2_COMPAT_JBD
+# include <linux/jbd2.h>
+#else
+# include <linux/jbd.h>
+# include "ocfs2_jbd_compat.h"
+#endif
/* For union ocfs2_dlm_lksb */
#include "stackglue.h"
@@ -171,9 +176,13 @@ struct ocfs2_alloc_stats
enum ocfs2_local_alloc_state
{
- OCFS2_LA_UNUSED = 0,
- OCFS2_LA_ENABLED,
- OCFS2_LA_DISABLED
+ OCFS2_LA_UNUSED = 0, /* Local alloc will never be used for
+ * this mountpoint. */
+ OCFS2_LA_ENABLED, /* Local alloc is in use. */
+ OCFS2_LA_THROTTLED, /* Local alloc is in use, but number
+ * of bits has been reduced. */
+ OCFS2_LA_DISABLED /* Local alloc has temporarily been
+ * disabled. */
};
enum ocfs2_mount_options
@@ -184,6 +193,8 @@ enum ocfs2_mount_options
OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */
OCFS2_MOUNT_DATA_WRITEBACK = 1 << 4, /* No data ordering */
OCFS2_MOUNT_LOCALFLOCKS = 1 << 5, /* No cluster aware user file locks */
+ OCFS2_MOUNT_NOUSERXATTR = 1 << 6, /* No user xattr */
+ OCFS2_MOUNT_INODE64 = 1 << 7, /* Allow inode numbers > 2^32 */
};
#define OCFS2_OSB_SOFT_RO 0x0001
@@ -214,6 +225,7 @@ struct ocfs2_super
u32 bitmap_cpg;
u8 *uuid;
char *uuid_str;
+ u32 uuid_hash;
u8 *vol_label;
u64 first_cluster_group_blkno;
u32 fs_generation;
@@ -241,6 +253,7 @@ struct ocfs2_super
int s_sectsize_bits;
int s_clustersize;
int s_clustersize_bits;
+ unsigned int s_xattr_inline_size;
atomic_t vol_state;
struct mutex recovery_lock;
@@ -252,11 +265,27 @@ struct ocfs2_super
struct ocfs2_journal *journal;
unsigned long osb_commit_interval;
- int local_alloc_size;
- enum ocfs2_local_alloc_state local_alloc_state;
+ struct delayed_work la_enable_wq;
+
+ /*
+ * Must hold local alloc i_mutex and osb->osb_lock to change
+ * local_alloc_bits. Reads can be done under either lock.
+ */
+ unsigned int local_alloc_bits;
+ unsigned int local_alloc_default_bits;
+
+ enum ocfs2_local_alloc_state local_alloc_state; /* protected
+ * by osb_lock */
+
struct buffer_head *local_alloc_bh;
+
u64 la_last_gd;
+#ifdef CONFIG_OCFS2_FS_STATS
+ struct dentry *local_alloc_debug;
+ char *local_alloc_debug_buf;
+#endif
+
/* Next two fields are for local node slot recovery during
* mount. */
int dirty;
@@ -340,6 +369,13 @@ static inline int ocfs2_supports_inline_data(struct ocfs2_super *osb)
return 0;
}
+static inline int ocfs2_supports_xattr(struct ocfs2_super *osb)
+{
+ if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_XATTR)
+ return 1;
+ return 0;
+}
+
/* set / clear functions because cluster events can make these happen
* in parallel so we want the transitions to be atomic. this also
* means that any future flags osb_flags must be protected by spinlock
@@ -554,6 +590,14 @@ static inline unsigned int ocfs2_pages_per_cluster(struct super_block *sb)
return pages_per_cluster;
}
+static inline unsigned int ocfs2_megabytes_to_clusters(struct super_block *sb,
+ unsigned int megs)
+{
+ BUILD_BUG_ON(OCFS2_MAX_CLUSTERSIZE > 1048576);
+
+ return megs << (20 - OCFS2_SB(sb)->s_clustersize_bits);
+}
+
static inline void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb)
{
spin_lock(&osb->osb_lock);
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 4f619850ccf7..f24ce3d3f956 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -64,6 +64,7 @@
#define OCFS2_INODE_SIGNATURE "INODE01"
#define OCFS2_EXTENT_BLOCK_SIGNATURE "EXBLK01"
#define OCFS2_GROUP_DESC_SIGNATURE "GROUP01"
+#define OCFS2_XATTR_BLOCK_SIGNATURE "XATTR01"
/* Compatibility flags */
#define OCFS2_HAS_COMPAT_FEATURE(sb,mask) \
@@ -90,7 +91,8 @@
| OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC \
| OCFS2_FEATURE_INCOMPAT_INLINE_DATA \
| OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \
- | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK)
+ | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK \
+ | OCFS2_FEATURE_INCOMPAT_XATTR)
#define OCFS2_FEATURE_RO_COMPAT_SUPP OCFS2_FEATURE_RO_COMPAT_UNWRITTEN
/*
@@ -127,10 +129,6 @@
/* Support for data packed into inode blocks */
#define OCFS2_FEATURE_INCOMPAT_INLINE_DATA 0x0040
-/* Support for the extended slot map */
-#define OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP 0x100
-
-
/*
* Support for alternate, userspace cluster stacks. If set, the superblock
* field s_cluster_info contains a tag for the alternate stack in use as
@@ -142,6 +140,12 @@
*/
#define OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK 0x0080
+/* Support for the extended slot map */
+#define OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP 0x100
+
+/* Support for extended attributes */
+#define OCFS2_FEATURE_INCOMPAT_XATTR 0x0200
+
/*
* backup superblock flag is used to indicate that this volume
* has backup superblocks.
@@ -299,6 +303,12 @@ struct ocfs2_new_group_input {
*/
#define OCFS2_DEFAULT_LOCAL_ALLOC_SIZE 8
+/*
+ * Inline extended attribute size (in bytes)
+ * The value chosen should be aligned to 16 byte boundaries.
+ */
+#define OCFS2_MIN_XATTR_INLINE_SIZE 256
+
struct ocfs2_system_inode_info {
char *si_name;
int si_iflags;
@@ -563,7 +573,7 @@ struct ocfs2_super_block {
/*40*/ __le16 s_max_slots; /* Max number of simultaneous mounts
before tunefs required */
__le16 s_tunefs_flag;
- __le32 s_reserved1;
+ __le32 s_uuid_hash; /* hash value of uuid */
__le64 s_first_cluster_group; /* Block offset of 1st cluster
* group header */
/*50*/ __u8 s_label[OCFS2_MAX_VOL_LABEL_LEN]; /* Label for mounting, etc. */
@@ -571,7 +581,11 @@ struct ocfs2_super_block {
/*A0*/ struct ocfs2_cluster_info s_cluster_info; /* Selected userspace
stack. Only valid
with INCOMPAT flag. */
-/*B8*/ __le64 s_reserved2[17]; /* Fill out superblock */
+/*B8*/ __le16 s_xattr_inline_size; /* extended attribute inline size
+ for this fs*/
+ __le16 s_reserved0;
+ __le32 s_reserved1;
+/*C0*/ __le64 s_reserved2[16]; /* Fill out superblock */
/*140*/
/*
@@ -621,7 +635,8 @@ struct ocfs2_dinode {
belongs to */
__le16 i_suballoc_bit; /* Bit offset in suballocator
block group */
-/*10*/ __le32 i_reserved0;
+/*10*/ __le16 i_reserved0;
+ __le16 i_xattr_inline_size;
__le32 i_clusters; /* Cluster count */
__le32 i_uid; /* Owner UID */
__le32 i_gid; /* Owning GID */
@@ -640,11 +655,12 @@ struct ocfs2_dinode {
__le32 i_atime_nsec;
__le32 i_ctime_nsec;
__le32 i_mtime_nsec;
- __le32 i_attr;
+/*70*/ __le32 i_attr;
__le16 i_orphaned_slot; /* Only valid when OCFS2_ORPHANED_FL
was set in i_flags */
__le16 i_dyn_features;
-/*70*/ __le64 i_reserved2[8];
+ __le64 i_xattr_loc;
+/*80*/ __le64 i_reserved2[7];
/*B8*/ union {
__le64 i_pad1; /* Generic way to refer to this
64bit union */
@@ -715,6 +731,136 @@ struct ocfs2_group_desc
/*40*/ __u8 bg_bitmap[0];
};
+/*
+ * On disk extended attribute structure for OCFS2.
+ */
+
+/*
+ * ocfs2_xattr_entry indicates one extend attribute.
+ *
+ * Note that it can be stored in inode, one block or one xattr bucket.
+ */
+struct ocfs2_xattr_entry {
+ __le32 xe_name_hash; /* hash value of xattr prefix+suffix. */
+ __le16 xe_name_offset; /* byte offset from the 1st etnry in the local
+ local xattr storage(inode, xattr block or
+ xattr bucket). */
+ __u8 xe_name_len; /* xattr name len, does't include prefix. */
+ __u8 xe_type; /* the low 7 bits indicates the name prefix's
+ * type and the highest 1 bits indicate whether
+ * the EA is stored in the local storage. */
+ __le64 xe_value_size; /* real xattr value length. */
+};
+
+/*
+ * On disk structure for xattr header.
+ *
+ * One ocfs2_xattr_header describes how many ocfs2_xattr_entry records in
+ * the local xattr storage.
+ */
+struct ocfs2_xattr_header {
+ __le16 xh_count; /* contains the count of how
+ many records are in the
+ local xattr storage. */
+ __le16 xh_free_start; /* current offset for storing
+ xattr. */
+ __le16 xh_name_value_len; /* total length of name/value
+ length in this bucket. */
+ __le16 xh_num_buckets; /* bucket nums in one extent
+ record, only valid in the
+ first bucket. */
+ __le64 xh_csum;
+ struct ocfs2_xattr_entry xh_entries[0]; /* xattr entry list. */
+};
+
+/*
+ * On disk structure for xattr value root.
+ *
+ * It is used when one extended attribute's size is larger, and we will save it
+ * in an outside cluster. It will stored in a b-tree like file content.
+ */
+struct ocfs2_xattr_value_root {
+/*00*/ __le32 xr_clusters; /* clusters covered by xattr value. */
+ __le32 xr_reserved0;
+ __le64 xr_last_eb_blk; /* Pointer to last extent block */
+/*10*/ struct ocfs2_extent_list xr_list; /* Extent record list */
+};
+
+/*
+ * On disk structure for xattr tree root.
+ *
+ * It is used when there are too many extended attributes for one file. These
+ * attributes will be organized and stored in an indexed-btree.
+ */
+struct ocfs2_xattr_tree_root {
+/*00*/ __le32 xt_clusters; /* clusters covered by xattr. */
+ __le32 xt_reserved0;
+ __le64 xt_last_eb_blk; /* Pointer to last extent block */
+/*10*/ struct ocfs2_extent_list xt_list; /* Extent record list */
+};
+
+#define OCFS2_XATTR_INDEXED 0x1
+#define OCFS2_HASH_SHIFT 5
+#define OCFS2_XATTR_ROUND 3
+#define OCFS2_XATTR_SIZE(size) (((size) + OCFS2_XATTR_ROUND) & \
+ ~(OCFS2_XATTR_ROUND))
+
+#define OCFS2_XATTR_BUCKET_SIZE 4096
+#define OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET (OCFS2_XATTR_BUCKET_SIZE \
+ / OCFS2_MIN_BLOCKSIZE)
+
+/*
+ * On disk structure for xattr block.
+ */
+struct ocfs2_xattr_block {
+/*00*/ __u8 xb_signature[8]; /* Signature for verification */
+ __le16 xb_suballoc_slot; /* Slot suballocator this
+ block belongs to. */
+ __le16 xb_suballoc_bit; /* Bit offset in suballocator
+ block group */
+ __le32 xb_fs_generation; /* Must match super block */
+/*10*/ __le64 xb_blkno; /* Offset on disk, in blocks */
+ __le64 xb_csum;
+/*20*/ __le16 xb_flags; /* Indicates whether this block contains
+ real xattr or a xattr tree. */
+ __le16 xb_reserved0;
+ __le32 xb_reserved1;
+ __le64 xb_reserved2;
+/*30*/ union {
+ struct ocfs2_xattr_header xb_header; /* xattr header if this
+ block contains xattr */
+ struct ocfs2_xattr_tree_root xb_root;/* xattr tree root if this
+ block cotains xattr
+ tree. */
+ } xb_attrs;
+};
+
+#define OCFS2_XATTR_ENTRY_LOCAL 0x80
+#define OCFS2_XATTR_TYPE_MASK 0x7F
+static inline void ocfs2_xattr_set_local(struct ocfs2_xattr_entry *xe,
+ int local)
+{
+ if (local)
+ xe->xe_type |= OCFS2_XATTR_ENTRY_LOCAL;
+ else
+ xe->xe_type &= ~OCFS2_XATTR_ENTRY_LOCAL;
+}
+
+static inline int ocfs2_xattr_is_local(struct ocfs2_xattr_entry *xe)
+{
+ return xe->xe_type & OCFS2_XATTR_ENTRY_LOCAL;
+}
+
+static inline void ocfs2_xattr_set_type(struct ocfs2_xattr_entry *xe, int type)
+{
+ xe->xe_type |= type & OCFS2_XATTR_TYPE_MASK;
+}
+
+static inline int ocfs2_xattr_get_type(struct ocfs2_xattr_entry *xe)
+{
+ return xe->xe_type & OCFS2_XATTR_TYPE_MASK;
+}
+
#ifdef __KERNEL__
static inline int ocfs2_fast_symlink_chars(struct super_block *sb)
{
@@ -728,6 +874,20 @@ static inline int ocfs2_max_inline_data(struct super_block *sb)
offsetof(struct ocfs2_dinode, id2.i_data.id_data);
}
+static inline int ocfs2_max_inline_data_with_xattr(struct super_block *sb,
+ struct ocfs2_dinode *di)
+{
+ unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size);
+
+ if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL)
+ return sb->s_blocksize -
+ offsetof(struct ocfs2_dinode, id2.i_data.id_data) -
+ xattrsize;
+ else
+ return sb->s_blocksize -
+ offsetof(struct ocfs2_dinode, id2.i_data.id_data);
+}
+
static inline int ocfs2_extent_recs_per_inode(struct super_block *sb)
{
int size;
@@ -738,6 +898,24 @@ static inline int ocfs2_extent_recs_per_inode(struct super_block *sb)
return size / sizeof(struct ocfs2_extent_rec);
}
+static inline int ocfs2_extent_recs_per_inode_with_xattr(
+ struct super_block *sb,
+ struct ocfs2_dinode *di)
+{
+ int size;
+ unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size);
+
+ if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL)
+ size = sb->s_blocksize -
+ offsetof(struct ocfs2_dinode, id2.i_list.l_recs) -
+ xattrsize;
+ else
+ size = sb->s_blocksize -
+ offsetof(struct ocfs2_dinode, id2.i_list.l_recs);
+
+ return size / sizeof(struct ocfs2_extent_rec);
+}
+
static inline int ocfs2_chain_recs_per_inode(struct super_block *sb)
{
int size;
@@ -801,6 +979,17 @@ static inline u64 ocfs2_backup_super_blkno(struct super_block *sb, int index)
return 0;
}
+
+static inline u16 ocfs2_xattr_recs_per_xb(struct super_block *sb)
+{
+ int size;
+
+ size = sb->s_blocksize -
+ offsetof(struct ocfs2_xattr_block,
+ xb_attrs.xb_root.xt_list.l_recs);
+
+ return size / sizeof(struct ocfs2_extent_rec);
+}
#else
static inline int ocfs2_fast_symlink_chars(int blocksize)
{
@@ -884,6 +1073,17 @@ static inline uint64_t ocfs2_backup_super_blkno(int blocksize, int index)
return 0;
}
+
+static inline int ocfs2_xattr_recs_per_xb(int blocksize)
+{
+ int size;
+
+ size = blocksize -
+ offsetof(struct ocfs2_xattr_block,
+ xb_attrs.xb_root.xt_list.l_recs);
+
+ return size / sizeof(struct ocfs2_extent_rec);
+}
#endif /* __KERNEL__ */
diff --git a/fs/ocfs2/ocfs2_jbd_compat.h b/fs/ocfs2/ocfs2_jbd_compat.h
new file mode 100644
index 000000000000..b91c78f8f558
--- /dev/null
+++ b/fs/ocfs2/ocfs2_jbd_compat.h
@@ -0,0 +1,82 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * ocfs2_jbd_compat.h
+ *
+ * Compatibility defines for JBD.
+ *
+ * Copyright (C) 2008 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#ifndef OCFS2_JBD_COMPAT_H
+#define OCFS2_JBD_COMPAT_H
+
+#ifndef CONFIG_OCFS2_COMPAT_JBD
+# error Should not have been included
+#endif
+
+struct jbd2_inode {
+ unsigned int dummy;
+};
+
+#define JBD2_BARRIER JFS_BARRIER
+#define JBD2_DEFAULT_MAX_COMMIT_AGE JBD_DEFAULT_MAX_COMMIT_AGE
+
+#define jbd2_journal_ack_err journal_ack_err
+#define jbd2_journal_clear_err journal_clear_err
+#define jbd2_journal_destroy journal_destroy
+#define jbd2_journal_dirty_metadata journal_dirty_metadata
+#define jbd2_journal_errno journal_errno
+#define jbd2_journal_extend journal_extend
+#define jbd2_journal_flush journal_flush
+#define jbd2_journal_force_commit journal_force_commit
+#define jbd2_journal_get_write_access journal_get_write_access
+#define jbd2_journal_get_undo_access journal_get_undo_access
+#define jbd2_journal_init_inode journal_init_inode
+#define jbd2_journal_invalidatepage journal_invalidatepage
+#define jbd2_journal_load journal_load
+#define jbd2_journal_lock_updates journal_lock_updates
+#define jbd2_journal_restart journal_restart
+#define jbd2_journal_start journal_start
+#define jbd2_journal_start_commit journal_start_commit
+#define jbd2_journal_stop journal_stop
+#define jbd2_journal_try_to_free_buffers journal_try_to_free_buffers
+#define jbd2_journal_unlock_updates journal_unlock_updates
+#define jbd2_journal_wipe journal_wipe
+#define jbd2_log_wait_commit log_wait_commit
+
+static inline int jbd2_journal_file_inode(handle_t *handle,
+ struct jbd2_inode *inode)
+{
+ return 0;
+}
+
+static inline int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode,
+ loff_t new_size)
+{
+ return 0;
+}
+
+static inline void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode,
+ struct inode *inode)
+{
+ return;
+}
+
+static inline void jbd2_journal_release_jbd_inode(journal_t *journal,
+ struct jbd2_inode *jinode)
+{
+ return;
+}
+
+
+#endif /* OCFS2_JBD_COMPAT_H */
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index 8166968e9015..ffd48db229a7 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -200,7 +200,7 @@ static int update_backups(struct inode * inode, u32 clusters, char *data)
if (cluster > clusters)
break;
- ret = ocfs2_read_block(osb, blkno, &backup, 0, NULL);
+ ret = ocfs2_read_blocks_sync(osb, blkno, 1, &backup);
if (ret < 0) {
mlog_errno(ret);
break;
@@ -236,8 +236,8 @@ static void ocfs2_update_super_and_backups(struct inode *inode,
* update the superblock last.
* It doesn't matter if the write failed.
*/
- ret = ocfs2_read_block(osb, OCFS2_SUPER_BLOCK_BLKNO,
- &super_bh, 0, NULL);
+ ret = ocfs2_read_blocks_sync(osb, OCFS2_SUPER_BLOCK_BLKNO, 1,
+ &super_bh);
if (ret < 0) {
mlog_errno(ret);
goto out;
@@ -332,8 +332,7 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters)
lgd_blkno = ocfs2_which_cluster_group(main_bm_inode,
first_new_cluster - 1);
- ret = ocfs2_read_block(osb, lgd_blkno, &group_bh, OCFS2_BH_CACHED,
- main_bm_inode);
+ ret = ocfs2_read_block(main_bm_inode, lgd_blkno, &group_bh);
if (ret < 0) {
mlog_errno(ret);
goto out_unlock;
@@ -540,7 +539,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
goto out_unlock;
}
- ret = ocfs2_read_block(osb, input->group, &group_bh, 0, NULL);
+ ret = ocfs2_read_blocks_sync(osb, input->group, 1, &group_bh);
if (ret < 0) {
mlog(ML_ERROR, "Can't read the group descriptor # %llu "
"from the device.", (unsigned long long)input->group);
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index bb5ff8939bf1..bdda2d8f8508 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -150,8 +150,8 @@ int ocfs2_refresh_slot_info(struct ocfs2_super *osb)
* be !NULL. Thus, ocfs2_read_blocks() will ignore blocknr. If
* this is not true, the read of -1 (UINT64_MAX) will fail.
*/
- ret = ocfs2_read_blocks(osb, -1, si->si_blocks, si->si_bh, 0,
- si->si_inode);
+ ret = ocfs2_read_blocks(si->si_inode, -1, si->si_blocks, si->si_bh,
+ OCFS2_BH_IGNORE_CACHE);
if (ret == 0) {
spin_lock(&osb->osb_lock);
ocfs2_update_slot_info(si);
@@ -404,7 +404,8 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb,
(unsigned long long)blkno);
bh = NULL; /* Acquire a fresh bh */
- status = ocfs2_read_block(osb, blkno, &bh, 0, si->si_inode);
+ status = ocfs2_read_blocks(si->si_inode, blkno, 1, &bh,
+ OCFS2_BH_IGNORE_CACHE);
if (status < 0) {
mlog_errno(status);
goto bail;
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 353fc35c6748..faec2d879357 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -28,6 +28,7 @@
#include "ocfs2.h" /* For struct ocfs2_lock_res */
#include "stackglue.h"
+#include <linux/dlm_plock.h>
/*
* The control protocol starts with a handshake. Until the handshake
@@ -746,6 +747,37 @@ static void user_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb)
{
}
+static int user_plock(struct ocfs2_cluster_connection *conn,
+ u64 ino,
+ struct file *file,
+ int cmd,
+ struct file_lock *fl)
+{
+ /*
+ * This more or less just demuxes the plock request into any
+ * one of three dlm calls.
+ *
+ * Internally, fs/dlm will pass these to a misc device, which
+ * a userspace daemon will read and write to.
+ *
+ * For now, cancel requests (which happen internally only),
+ * are turned into unlocks. Most of this function taken from
+ * gfs2_lock.
+ */
+
+ if (cmd == F_CANCELLK) {
+ cmd = F_SETLK;
+ fl->fl_type = F_UNLCK;
+ }
+
+ if (IS_GETLK(cmd))
+ return dlm_posix_get(conn->cc_lockspace, ino, file, fl);
+ else if (fl->fl_type == F_UNLCK)
+ return dlm_posix_unlock(conn->cc_lockspace, ino, file, fl);
+ else
+ return dlm_posix_lock(conn->cc_lockspace, ino, file, cmd, fl);
+}
+
/*
* Compare a requested locking protocol version against the current one.
*
@@ -839,6 +871,7 @@ static struct ocfs2_stack_operations ocfs2_user_plugin_ops = {
.dlm_unlock = user_dlm_unlock,
.lock_status = user_dlm_lock_status,
.lock_lvb = user_dlm_lvb,
+ .plock = user_plock,
.dump_lksb = user_dlm_dump_lksb,
};
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index 07f348b8d721..68b668b0e60a 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -288,6 +288,26 @@ void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb)
}
EXPORT_SYMBOL_GPL(ocfs2_dlm_dump_lksb);
+int ocfs2_stack_supports_plocks(void)
+{
+ return active_stack && active_stack->sp_ops->plock;
+}
+EXPORT_SYMBOL_GPL(ocfs2_stack_supports_plocks);
+
+/*
+ * ocfs2_plock() can only be safely called if
+ * ocfs2_stack_supports_plocks() returned true
+ */
+int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino,
+ struct file *file, int cmd, struct file_lock *fl)
+{
+ WARN_ON_ONCE(active_stack->sp_ops->plock == NULL);
+ if (active_stack->sp_ops->plock)
+ return active_stack->sp_ops->plock(conn, ino, file, cmd, fl);
+ return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL_GPL(ocfs2_plock);
+
int ocfs2_cluster_connect(const char *stack_name,
const char *group,
int grouplen,
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
index db56281dd1be..c571af375ef8 100644
--- a/fs/ocfs2/stackglue.h
+++ b/fs/ocfs2/stackglue.h
@@ -28,6 +28,10 @@
#include "dlm/dlmapi.h"
#include <linux/dlm.h>
+/* Needed for plock-related prototypes */
+struct file;
+struct file_lock;
+
/*
* dlmconstants.h does not have a LOCAL flag. We hope to remove it
* some day, but right now we need it. Let's fake it. This value is larger
@@ -187,6 +191,17 @@ struct ocfs2_stack_operations {
void *(*lock_lvb)(union ocfs2_dlm_lksb *lksb);
/*
+ * Cluster-aware posix locks
+ *
+ * This is NULL for stacks which do not support posix locks.
+ */
+ int (*plock)(struct ocfs2_cluster_connection *conn,
+ u64 ino,
+ struct file *file,
+ int cmd,
+ struct file_lock *fl);
+
+ /*
* This is an optoinal debugging hook. If provided, the
* stack can dump debugging information about this lock.
*/
@@ -240,6 +255,10 @@ int ocfs2_dlm_lock_status(union ocfs2_dlm_lksb *lksb);
void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb);
void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb);
+int ocfs2_stack_supports_plocks(void);
+int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino,
+ struct file *file, int cmd, struct file_lock *fl);
+
void ocfs2_stack_glue_set_locking_protocol(struct ocfs2_locking_protocol *proto);
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index d2d278fb9819..c5ff18b46b57 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -62,15 +62,18 @@ static int ocfs2_block_group_fill(handle_t *handle,
struct ocfs2_chain_list *cl);
static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
struct inode *alloc_inode,
- struct buffer_head *bh);
+ struct buffer_head *bh,
+ u64 max_block);
static int ocfs2_cluster_group_search(struct inode *inode,
struct buffer_head *group_bh,
u32 bits_wanted, u32 min_bits,
+ u64 max_block,
u16 *bit_off, u16 *bits_found);
static int ocfs2_block_group_search(struct inode *inode,
struct buffer_head *group_bh,
u32 bits_wanted, u32 min_bits,
+ u64 max_block,
u16 *bit_off, u16 *bits_found);
static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
struct ocfs2_alloc_context *ac,
@@ -110,8 +113,11 @@ static inline void ocfs2_block_to_cluster_group(struct inode *inode,
u64 data_blkno,
u64 *bg_blkno,
u16 *bg_bit_off);
+static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
+ u32 bits_wanted, u64 max_block,
+ struct ocfs2_alloc_context **ac);
-static void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
+void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
{
struct inode *inode = ac->ac_inode;
@@ -124,10 +130,8 @@ static void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
iput(inode);
ac->ac_inode = NULL;
}
- if (ac->ac_bh) {
- brelse(ac->ac_bh);
- ac->ac_bh = NULL;
- }
+ brelse(ac->ac_bh);
+ ac->ac_bh = NULL;
}
void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
@@ -276,7 +280,8 @@ static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
*/
static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
struct inode *alloc_inode,
- struct buffer_head *bh)
+ struct buffer_head *bh,
+ u64 max_block)
{
int status, credits;
struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
@@ -294,9 +299,9 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
mlog_entry_void();
cl = &fe->id2.i_chain;
- status = ocfs2_reserve_clusters(osb,
- le16_to_cpu(cl->cl_cpg),
- &ac);
+ status = ocfs2_reserve_clusters_with_limit(osb,
+ le16_to_cpu(cl->cl_cpg),
+ max_block, &ac);
if (status < 0) {
if (status != -ENOSPC)
mlog_errno(status);
@@ -394,8 +399,7 @@ bail:
if (ac)
ocfs2_free_alloc_context(ac);
- if (bg_bh)
- brelse(bg_bh);
+ brelse(bg_bh);
mlog_exit(status);
return status;
@@ -469,7 +473,8 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
goto bail;
}
- status = ocfs2_block_group_alloc(osb, alloc_inode, bh);
+ status = ocfs2_block_group_alloc(osb, alloc_inode, bh,
+ ac->ac_max_block);
if (status < 0) {
if (status != -ENOSPC)
mlog_errno(status);
@@ -486,16 +491,15 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
get_bh(bh);
ac->ac_bh = bh;
bail:
- if (bh)
- brelse(bh);
+ brelse(bh);
mlog_exit(status);
return status;
}
-int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
- struct ocfs2_dinode *fe,
- struct ocfs2_alloc_context **ac)
+int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
+ int blocks,
+ struct ocfs2_alloc_context **ac)
{
int status;
u32 slot;
@@ -507,7 +511,7 @@ int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
goto bail;
}
- (*ac)->ac_bits_wanted = ocfs2_extend_meta_needed(fe);
+ (*ac)->ac_bits_wanted = blocks;
(*ac)->ac_which = OCFS2_AC_USE_META;
slot = osb->slot_num;
(*ac)->ac_group_search = ocfs2_block_group_search;
@@ -532,6 +536,15 @@ bail:
return status;
}
+int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
+ struct ocfs2_extent_list *root_el,
+ struct ocfs2_alloc_context **ac)
+{
+ return ocfs2_reserve_new_metadata_blocks(osb,
+ ocfs2_extend_meta_needed(root_el),
+ ac);
+}
+
static int ocfs2_steal_inode_from_other_nodes(struct ocfs2_super *osb,
struct ocfs2_alloc_context *ac)
{
@@ -582,6 +595,14 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
(*ac)->ac_group_search = ocfs2_block_group_search;
/*
+ * stat(2) can't handle i_ino > 32bits, so we tell the
+ * lower levels not to allocate us a block group past that
+ * limit. The 'inode64' mount option avoids this behavior.
+ */
+ if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64))
+ (*ac)->ac_max_block = (u32)~0U;
+
+ /*
* slot is set when we successfully steal inode from other nodes.
* It is reset in 3 places:
* 1. when we flush the truncate log
@@ -661,9 +682,9 @@ bail:
/* Callers don't need to care which bitmap (local alloc or main) to
* use so we figure it out for them, but unfortunately this clutters
* things a bit. */
-int ocfs2_reserve_clusters(struct ocfs2_super *osb,
- u32 bits_wanted,
- struct ocfs2_alloc_context **ac)
+static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
+ u32 bits_wanted, u64 max_block,
+ struct ocfs2_alloc_context **ac)
{
int status;
@@ -677,24 +698,20 @@ int ocfs2_reserve_clusters(struct ocfs2_super *osb,
}
(*ac)->ac_bits_wanted = bits_wanted;
+ (*ac)->ac_max_block = max_block;
status = -ENOSPC;
if (ocfs2_alloc_should_use_local(osb, bits_wanted)) {
status = ocfs2_reserve_local_alloc_bits(osb,
bits_wanted,
*ac);
- if ((status < 0) && (status != -ENOSPC)) {
+ if (status == -EFBIG) {
+ /* The local alloc window is outside ac_max_block.
+ * use the main bitmap. */
+ status = -ENOSPC;
+ } else if ((status < 0) && (status != -ENOSPC)) {
mlog_errno(status);
goto bail;
- } else if (status == -ENOSPC) {
- /* reserve_local_bits will return enospc with
- * the local alloc inode still locked, so we
- * can change this safely here. */
- mlog(0, "Disabling local alloc\n");
- /* We set to OCFS2_LA_DISABLED so that umount
- * can clean up what's left of the local
- * allocation */
- osb->local_alloc_state = OCFS2_LA_DISABLED;
}
}
@@ -718,6 +735,13 @@ bail:
return status;
}
+int ocfs2_reserve_clusters(struct ocfs2_super *osb,
+ u32 bits_wanted,
+ struct ocfs2_alloc_context **ac)
+{
+ return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, 0, ac);
+}
+
/*
* More or less lifted from ext3. I'll leave their description below:
*
@@ -1000,11 +1024,14 @@ static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg
static int ocfs2_cluster_group_search(struct inode *inode,
struct buffer_head *group_bh,
u32 bits_wanted, u32 min_bits,
+ u64 max_block,
u16 *bit_off, u16 *bits_found)
{
int search = -ENOSPC;
int ret;
+ u64 blkoff;
struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
u16 tmp_off, tmp_found;
unsigned int max_bits, gd_cluster_off;
@@ -1037,6 +1064,17 @@ static int ocfs2_cluster_group_search(struct inode *inode,
if (ret)
return ret;
+ if (max_block) {
+ blkoff = ocfs2_clusters_to_blocks(inode->i_sb,
+ gd_cluster_off +
+ tmp_off + tmp_found);
+ mlog(0, "Checking %llu against %llu\n",
+ (unsigned long long)blkoff,
+ (unsigned long long)max_block);
+ if (blkoff > max_block)
+ return -ENOSPC;
+ }
+
/* ocfs2_block_group_find_clear_bits() might
* return success, but we still want to return
* -ENOSPC unless it found the minimum number
@@ -1045,6 +1083,12 @@ static int ocfs2_cluster_group_search(struct inode *inode,
*bit_off = tmp_off;
*bits_found = tmp_found;
search = 0; /* success */
+ } else if (tmp_found) {
+ /*
+ * Don't show bits which we'll be returning
+ * for allocation to the local alloc bitmap.
+ */
+ ocfs2_local_alloc_seen_free_bits(osb, tmp_found);
}
}
@@ -1054,19 +1098,31 @@ static int ocfs2_cluster_group_search(struct inode *inode,
static int ocfs2_block_group_search(struct inode *inode,
struct buffer_head *group_bh,
u32 bits_wanted, u32 min_bits,
+ u64 max_block,
u16 *bit_off, u16 *bits_found)
{
int ret = -ENOSPC;
+ u64 blkoff;
struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data;
BUG_ON(min_bits != 1);
BUG_ON(ocfs2_is_cluster_bitmap(inode));
- if (bg->bg_free_bits_count)
+ if (bg->bg_free_bits_count) {
ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
group_bh, bits_wanted,
le16_to_cpu(bg->bg_bits),
bit_off, bits_found);
+ if (!ret && max_block) {
+ blkoff = le64_to_cpu(bg->bg_blkno) + *bit_off +
+ *bits_found;
+ mlog(0, "Checking %llu against %llu\n",
+ (unsigned long long)blkoff,
+ (unsigned long long)max_block);
+ if (blkoff > max_block)
+ ret = -ENOSPC;
+ }
+ }
return ret;
}
@@ -1116,8 +1172,7 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
struct ocfs2_group_desc *gd;
struct inode *alloc_inode = ac->ac_inode;
- ret = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb), gd_blkno,
- &group_bh, OCFS2_BH_CACHED, alloc_inode);
+ ret = ocfs2_read_block(alloc_inode, gd_blkno, &group_bh);
if (ret < 0) {
mlog_errno(ret);
return ret;
@@ -1131,7 +1186,7 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
}
ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits,
- bit_off, &found);
+ ac->ac_max_block, bit_off, &found);
if (ret < 0) {
if (ret != -ENOSPC)
mlog_errno(ret);
@@ -1186,9 +1241,9 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
bits_wanted, chain,
(unsigned long long)OCFS2_I(alloc_inode)->ip_blkno);
- status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb),
+ status = ocfs2_read_block(alloc_inode,
le64_to_cpu(cl->cl_recs[chain].c_blkno),
- &group_bh, OCFS2_BH_CACHED, alloc_inode);
+ &group_bh);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -1204,21 +1259,20 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
/* for now, the chain search is a bit simplistic. We just use
* the 1st group with any empty bits. */
while ((status = ac->ac_group_search(alloc_inode, group_bh,
- bits_wanted, min_bits, bit_off,
+ bits_wanted, min_bits,
+ ac->ac_max_block, bit_off,
&tmp_bits)) == -ENOSPC) {
if (!bg->bg_next_group)
break;
- if (prev_group_bh) {
- brelse(prev_group_bh);
- prev_group_bh = NULL;
- }
+ brelse(prev_group_bh);
+ prev_group_bh = NULL;
+
next_group = le64_to_cpu(bg->bg_next_group);
prev_group_bh = group_bh;
group_bh = NULL;
- status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb),
- next_group, &group_bh,
- OCFS2_BH_CACHED, alloc_inode);
+ status = ocfs2_read_block(alloc_inode,
+ next_group, &group_bh);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -1307,10 +1361,8 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
*bg_blkno = le64_to_cpu(bg->bg_blkno);
*bits_left = le16_to_cpu(bg->bg_free_bits_count);
bail:
- if (group_bh)
- brelse(group_bh);
- if (prev_group_bh)
- brelse(prev_group_bh);
+ brelse(group_bh);
+ brelse(prev_group_bh);
mlog_exit(status);
return status;
@@ -1723,7 +1775,6 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
{
int status = 0;
u32 tmp_used;
- struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data;
struct ocfs2_chain_list *cl = &fe->id2.i_chain;
struct buffer_head *group_bh = NULL;
@@ -1742,8 +1793,7 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
(unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, count,
(unsigned long long)bg_blkno, start_bit);
- status = ocfs2_read_block(osb, bg_blkno, &group_bh, OCFS2_BH_CACHED,
- alloc_inode);
+ status = ocfs2_read_block(alloc_inode, bg_blkno, &group_bh);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -1784,8 +1834,7 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
}
bail:
- if (group_bh)
- brelse(group_bh);
+ brelse(group_bh);
mlog_exit(status);
return status;
@@ -1838,9 +1887,15 @@ int ocfs2_free_clusters(handle_t *handle,
status = ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
bg_start_bit, bg_blkno,
num_clusters);
- if (status < 0)
+ if (status < 0) {
mlog_errno(status);
+ goto out;
+ }
+ ocfs2_local_alloc_seen_free_bits(OCFS2_SB(bitmap_inode->i_sb),
+ num_clusters);
+
+out:
mlog_exit(status);
return status;
}
@@ -1891,3 +1946,84 @@ static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe)
(unsigned long long)fe->id2.i_chain.cl_recs[i].c_blkno);
}
}
+
+/*
+ * For a given allocation, determine which allocators will need to be
+ * accessed, and lock them, reserving the appropriate number of bits.
+ *
+ * Sparse file systems call this from ocfs2_write_begin_nolock()
+ * and ocfs2_allocate_unwritten_extents().
+ *
+ * File systems which don't support holes call this from
+ * ocfs2_extend_allocation().
+ */
+int ocfs2_lock_allocators(struct inode *inode,
+ struct ocfs2_extent_tree *et,
+ u32 clusters_to_add, u32 extents_to_split,
+ struct ocfs2_alloc_context **data_ac,
+ struct ocfs2_alloc_context **meta_ac)
+{
+ int ret = 0, num_free_extents;
+ unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ *meta_ac = NULL;
+ if (data_ac)
+ *data_ac = NULL;
+
+ BUG_ON(clusters_to_add != 0 && data_ac == NULL);
+
+ num_free_extents = ocfs2_num_free_extents(osb, inode, et);
+ if (num_free_extents < 0) {
+ ret = num_free_extents;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * Sparse allocation file systems need to be more conservative
+ * with reserving room for expansion - the actual allocation
+ * happens while we've got a journal handle open so re-taking
+ * a cluster lock (because we ran out of room for another
+ * extent) will violate ordering rules.
+ *
+ * Most of the time we'll only be seeing this 1 cluster at a time
+ * anyway.
+ *
+ * Always lock for any unwritten extents - we might want to
+ * add blocks during a split.
+ */
+ if (!num_free_extents ||
+ (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) {
+ ret = ocfs2_reserve_new_metadata(osb, et->et_root_el, meta_ac);
+ if (ret < 0) {
+ if (ret != -ENOSPC)
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ if (clusters_to_add == 0)
+ goto out;
+
+ ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac);
+ if (ret < 0) {
+ if (ret != -ENOSPC)
+ mlog_errno(ret);
+ goto out;
+ }
+
+out:
+ if (ret) {
+ if (*meta_ac) {
+ ocfs2_free_alloc_context(*meta_ac);
+ *meta_ac = NULL;
+ }
+
+ /*
+ * We cannot have an error and a non null *data_ac.
+ */
+ }
+
+ return ret;
+}
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index 544c600662bd..4df159d8f450 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -28,10 +28,11 @@
typedef int (group_search_t)(struct inode *,
struct buffer_head *,
- u32,
- u32,
- u16 *,
- u16 *);
+ u32, /* bits_wanted */
+ u32, /* min_bits */
+ u64, /* max_block */
+ u16 *, /* *bit_off */
+ u16 *); /* *bits_found */
struct ocfs2_alloc_context {
struct inode *ac_inode; /* which bitmap are we allocating from? */
@@ -51,6 +52,8 @@ struct ocfs2_alloc_context {
group_search_t *ac_group_search;
u64 ac_last_group;
+ u64 ac_max_block; /* Highest block number to allocate. 0 is
+ is the same as ~0 - unlimited */
};
void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac);
@@ -59,9 +62,17 @@ static inline int ocfs2_alloc_context_bits_left(struct ocfs2_alloc_context *ac)
return ac->ac_bits_wanted - ac->ac_bits_given;
}
+/*
+ * Please note that the caller must make sure that root_el is the root
+ * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise
+ * the result may be wrong.
+ */
int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
- struct ocfs2_dinode *fe,
+ struct ocfs2_extent_list *root_el,
struct ocfs2_alloc_context **ac);
+int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
+ int blocks,
+ struct ocfs2_alloc_context **ac);
int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
struct ocfs2_alloc_context **ac);
int ocfs2_reserve_clusters(struct ocfs2_super *osb,
@@ -147,6 +158,7 @@ static inline int ocfs2_is_cluster_bitmap(struct inode *inode)
* apis above. */
int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
struct ocfs2_alloc_context *ac);
+void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac);
/* given a cluster offset, calculate which block group it belongs to
* and return that block offset. */
@@ -156,4 +168,8 @@ u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster);
int ocfs2_check_group_descriptor(struct super_block *sb,
struct ocfs2_dinode *di,
struct ocfs2_group_desc *gd);
+int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_extent_tree *et,
+ u32 clusters_to_add, u32 extents_to_split,
+ struct ocfs2_alloc_context **data_ac,
+ struct ocfs2_alloc_context **meta_ac);
#endif /* _CHAINALLOC_H_ */
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 70334d85aff1..304b63ac78cf 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -64,6 +64,7 @@
#include "sysfile.h"
#include "uptodate.h"
#include "ver.h"
+#include "xattr.h"
#include "buffer_head_io.h"
@@ -154,6 +155,9 @@ enum {
Opt_localalloc,
Opt_localflocks,
Opt_stack,
+ Opt_user_xattr,
+ Opt_nouser_xattr,
+ Opt_inode64,
Opt_err,
};
@@ -173,6 +177,9 @@ static const match_table_t tokens = {
{Opt_localalloc, "localalloc=%d"},
{Opt_localflocks, "localflocks"},
{Opt_stack, "cluster_stack=%s"},
+ {Opt_user_xattr, "user_xattr"},
+ {Opt_nouser_xattr, "nouser_xattr"},
+ {Opt_inode64, "inode64"},
{Opt_err, NULL}
};
@@ -205,10 +212,11 @@ static int ocfs2_sync_fs(struct super_block *sb, int wait)
ocfs2_schedule_truncate_log_flush(osb, 0);
}
- if (journal_start_commit(OCFS2_SB(sb)->journal->j_journal, &target)) {
+ if (jbd2_journal_start_commit(OCFS2_SB(sb)->journal->j_journal,
+ &target)) {
if (wait)
- log_wait_commit(OCFS2_SB(sb)->journal->j_journal,
- target);
+ jbd2_log_wait_commit(OCFS2_SB(sb)->journal->j_journal,
+ target);
}
return 0;
}
@@ -325,6 +333,7 @@ static struct inode *ocfs2_alloc_inode(struct super_block *sb)
if (!oi)
return NULL;
+ jbd2_journal_init_jbd_inode(&oi->ip_jinode, &oi->vfs_inode);
return &oi->vfs_inode;
}
@@ -406,6 +415,15 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
goto out;
}
+ /* Probably don't want this on remount; it might
+ * mess with other nodes */
+ if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64) &&
+ (parsed_options.mount_opt & OCFS2_MOUNT_INODE64)) {
+ ret = -EINVAL;
+ mlog(ML_ERROR, "Cannot enable inode64 on remount\n");
+ goto out;
+ }
+
/* We're going to/from readonly mode. */
if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) {
/* Lock here so the check of HARD_RO and the potential
@@ -637,7 +655,8 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
osb->s_atime_quantum = parsed_options.atime_quantum;
osb->preferred_slot = parsed_options.slot;
osb->osb_commit_interval = parsed_options.commit_interval;
- osb->local_alloc_size = parsed_options.localalloc_opt;
+ osb->local_alloc_default_bits = ocfs2_megabytes_to_clusters(sb, parsed_options.localalloc_opt);
+ osb->local_alloc_bits = osb->local_alloc_default_bits;
status = ocfs2_verify_userspace_stack(osb, &parsed_options);
if (status)
@@ -743,8 +762,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
return status;
read_super_error:
- if (bh != NULL)
- brelse(bh);
+ brelse(bh);
if (inode)
iput(inode);
@@ -847,6 +865,12 @@ static int ocfs2_parse_options(struct super_block *sb,
case Opt_data_writeback:
mopt->mount_opt |= OCFS2_MOUNT_DATA_WRITEBACK;
break;
+ case Opt_user_xattr:
+ mopt->mount_opt &= ~OCFS2_MOUNT_NOUSERXATTR;
+ break;
+ case Opt_nouser_xattr:
+ mopt->mount_opt |= OCFS2_MOUNT_NOUSERXATTR;
+ break;
case Opt_atime_quantum:
if (match_int(&args[0], &option)) {
status = 0;
@@ -873,7 +897,7 @@ static int ocfs2_parse_options(struct super_block *sb,
if (option < 0)
return 0;
if (option == 0)
- option = JBD_DEFAULT_MAX_COMMIT_AGE;
+ option = JBD2_DEFAULT_MAX_COMMIT_AGE;
mopt->commit_interval = HZ * option;
break;
case Opt_localalloc:
@@ -918,6 +942,9 @@ static int ocfs2_parse_options(struct super_block *sb,
OCFS2_STACK_LABEL_LEN);
mopt->cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0';
break;
+ case Opt_inode64:
+ mopt->mount_opt |= OCFS2_MOUNT_INODE64;
+ break;
default:
mlog(ML_ERROR,
"Unrecognized mount option \"%s\" "
@@ -938,6 +965,7 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
{
struct ocfs2_super *osb = OCFS2_SB(mnt->mnt_sb);
unsigned long opts = osb->s_mount_opt;
+ unsigned int local_alloc_megs;
if (opts & OCFS2_MOUNT_HB_LOCAL)
seq_printf(s, ",_netdev,heartbeat=local");
@@ -970,8 +998,9 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
seq_printf(s, ",commit=%u",
(unsigned) (osb->osb_commit_interval / HZ));
- if (osb->local_alloc_size != OCFS2_DEFAULT_LOCAL_ALLOC_SIZE)
- seq_printf(s, ",localalloc=%d", osb->local_alloc_size);
+ local_alloc_megs = osb->local_alloc_bits >> (20 - osb->s_clustersize_bits);
+ if (local_alloc_megs != OCFS2_DEFAULT_LOCAL_ALLOC_SIZE)
+ seq_printf(s, ",localalloc=%d", local_alloc_megs);
if (opts & OCFS2_MOUNT_LOCALFLOCKS)
seq_printf(s, ",localflocks,");
@@ -980,6 +1009,14 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN,
osb->osb_cluster_stack);
+ if (opts & OCFS2_MOUNT_NOUSERXATTR)
+ seq_printf(s, ",nouser_xattr");
+ else
+ seq_printf(s, ",user_xattr");
+
+ if (opts & OCFS2_MOUNT_INODE64)
+ seq_printf(s, ",inode64");
+
return 0;
}
@@ -1132,6 +1169,7 @@ static void ocfs2_inode_init_once(void *data)
oi->ip_dir_start_lookup = 0;
init_rwsem(&oi->ip_alloc_sem);
+ init_rwsem(&oi->ip_xattr_sem);
mutex_init(&oi->ip_io_mutex);
oi->ip_blkno = 0ULL;
@@ -1375,6 +1413,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
sb->s_fs_info = osb;
sb->s_op = &ocfs2_sops;
sb->s_export_op = &ocfs2_export_ops;
+ sb->s_xattr = ocfs2_xattr_handlers;
sb->s_time_gran = 1;
sb->s_flags |= MS_NOATIME;
/* this is needed to support O_LARGEFILE */
@@ -1421,8 +1460,12 @@ static int ocfs2_initialize_super(struct super_block *sb,
osb->slot_num = OCFS2_INVALID_SLOT;
+ osb->s_xattr_inline_size = le16_to_cpu(
+ di->id2.i_super.s_xattr_inline_size);
+
osb->local_alloc_state = OCFS2_LA_UNUSED;
osb->local_alloc_bh = NULL;
+ INIT_DELAYED_WORK(&osb->la_enable_wq, ocfs2_la_enable_worker);
init_waitqueue_head(&osb->osb_mount_event);
@@ -1568,6 +1611,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
osb->first_cluster_group_blkno =
le64_to_cpu(di->id2.i_super.s_first_cluster_group);
osb->fs_generation = le32_to_cpu(di->i_fs_generation);
+ osb->uuid_hash = le32_to_cpu(di->id2.i_super.s_uuid_hash);
mlog(0, "vol_label: %s\n", osb->vol_label);
mlog(0, "uuid: %s\n", osb->uuid_str);
mlog(0, "root_blkno=%llu, system_dir_blkno=%llu\n",
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index ba9dbb51d25b..cbd03dfdc7b9 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -50,6 +50,7 @@
#include "inode.h"
#include "journal.h"
#include "symlink.h"
+#include "xattr.h"
#include "buffer_head_io.h"
@@ -83,11 +84,7 @@ static char *ocfs2_fast_symlink_getlink(struct inode *inode,
mlog_entry_void();
- status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
- OCFS2_I(inode)->ip_blkno,
- bh,
- OCFS2_BH_CACHED,
- inode);
+ status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, bh);
if (status < 0) {
mlog_errno(status);
link = ERR_PTR(status);
@@ -157,8 +154,7 @@ bail:
kunmap(page);
page_cache_release(page);
}
- if (bh)
- brelse(bh);
+ brelse(bh);
return ERR_PTR(status);
}
@@ -168,10 +164,18 @@ const struct inode_operations ocfs2_symlink_inode_operations = {
.follow_link = ocfs2_follow_link,
.getattr = ocfs2_getattr,
.setattr = ocfs2_setattr,
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .listxattr = ocfs2_listxattr,
+ .removexattr = generic_removexattr,
};
const struct inode_operations ocfs2_fast_symlink_inode_operations = {
.readlink = ocfs2_readlink,
.follow_link = ocfs2_follow_link,
.getattr = ocfs2_getattr,
.setattr = ocfs2_setattr,
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .listxattr = ocfs2_listxattr,
+ .removexattr = generic_removexattr,
};
diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c
index 4da8851f2b23..187b99ff0368 100644
--- a/fs/ocfs2/uptodate.c
+++ b/fs/ocfs2/uptodate.c
@@ -53,7 +53,11 @@
#include <linux/highmem.h>
#include <linux/buffer_head.h>
#include <linux/rbtree.h>
-#include <linux/jbd.h>
+#ifndef CONFIG_OCFS2_COMPAT_JBD
+# include <linux/jbd2.h>
+#else
+# include <linux/jbd.h>
+#endif
#define MLOG_MASK_PREFIX ML_UPTODATE
@@ -511,14 +515,10 @@ static void ocfs2_remove_metadata_tree(struct ocfs2_caching_info *ci,
ci->ci_num_cached--;
}
-/* Called when we remove a chunk of metadata from an inode. We don't
- * bother reverting things to an inlined array in the case of a remove
- * which moves us back under the limit. */
-void ocfs2_remove_from_cache(struct inode *inode,
- struct buffer_head *bh)
+static void ocfs2_remove_block_from_cache(struct inode *inode,
+ sector_t block)
{
int index;
- sector_t block = bh->b_blocknr;
struct ocfs2_meta_cache_item *item = NULL;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
@@ -544,6 +544,30 @@ void ocfs2_remove_from_cache(struct inode *inode,
kmem_cache_free(ocfs2_uptodate_cachep, item);
}
+/*
+ * Called when we remove a chunk of metadata from an inode. We don't
+ * bother reverting things to an inlined array in the case of a remove
+ * which moves us back under the limit.
+ */
+void ocfs2_remove_from_cache(struct inode *inode,
+ struct buffer_head *bh)
+{
+ sector_t block = bh->b_blocknr;
+
+ ocfs2_remove_block_from_cache(inode, block);
+}
+
+/* Called when we remove xattr clusters from an inode. */
+void ocfs2_remove_xattr_clusters_from_cache(struct inode *inode,
+ sector_t block,
+ u32 c_len)
+{
+ unsigned int i, b_len = ocfs2_clusters_to_blocks(inode->i_sb, 1) * c_len;
+
+ for (i = 0; i < b_len; i++, block++)
+ ocfs2_remove_block_from_cache(inode, block);
+}
+
int __init init_ocfs2_uptodate_cache(void)
{
ocfs2_uptodate_cachep = kmem_cache_create("ocfs2_uptodate",
diff --git a/fs/ocfs2/uptodate.h b/fs/ocfs2/uptodate.h
index 2e73206059a8..531b4b3a0c47 100644
--- a/fs/ocfs2/uptodate.h
+++ b/fs/ocfs2/uptodate.h
@@ -40,6 +40,9 @@ void ocfs2_set_new_buffer_uptodate(struct inode *inode,
struct buffer_head *bh);
void ocfs2_remove_from_cache(struct inode *inode,
struct buffer_head *bh);
+void ocfs2_remove_xattr_clusters_from_cache(struct inode *inode,
+ sector_t block,
+ u32 c_len);
int ocfs2_buffer_read_ahead(struct inode *inode,
struct buffer_head *bh);
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
new file mode 100644
index 000000000000..802c41492214
--- /dev/null
+++ b/fs/ocfs2/xattr.c
@@ -0,0 +1,4832 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * xattr.c
+ *
+ * Copyright (C) 2008 Oracle. All rights reserved.
+ *
+ * CREDITS:
+ * Lots of code in this file is taken from ext3.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/capability.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/uio.h>
+#include <linux/sched.h>
+#include <linux/splice.h>
+#include <linux/mount.h>
+#include <linux/writeback.h>
+#include <linux/falloc.h>
+#include <linux/sort.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/string.h>
+
+#define MLOG_MASK_PREFIX ML_XATTR
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+#include "alloc.h"
+#include "dlmglue.h"
+#include "file.h"
+#include "symlink.h"
+#include "sysfile.h"
+#include "inode.h"
+#include "journal.h"
+#include "ocfs2_fs.h"
+#include "suballoc.h"
+#include "uptodate.h"
+#include "buffer_head_io.h"
+#include "super.h"
+#include "xattr.h"
+
+
+struct ocfs2_xattr_def_value_root {
+ struct ocfs2_xattr_value_root xv;
+ struct ocfs2_extent_rec er;
+};
+
+struct ocfs2_xattr_bucket {
+ struct buffer_head *bhs[OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET];
+ struct ocfs2_xattr_header *xh;
+};
+
+#define OCFS2_XATTR_ROOT_SIZE (sizeof(struct ocfs2_xattr_def_value_root))
+#define OCFS2_XATTR_INLINE_SIZE 80
+
+static struct ocfs2_xattr_def_value_root def_xv = {
+ .xv.xr_list.l_count = cpu_to_le16(1),
+};
+
+struct xattr_handler *ocfs2_xattr_handlers[] = {
+ &ocfs2_xattr_user_handler,
+ &ocfs2_xattr_trusted_handler,
+ NULL
+};
+
+static struct xattr_handler *ocfs2_xattr_handler_map[] = {
+ [OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler,
+ [OCFS2_XATTR_INDEX_TRUSTED] = &ocfs2_xattr_trusted_handler,
+};
+
+struct ocfs2_xattr_info {
+ int name_index;
+ const char *name;
+ const void *value;
+ size_t value_len;
+};
+
+struct ocfs2_xattr_search {
+ struct buffer_head *inode_bh;
+ /*
+ * xattr_bh point to the block buffer head which has extended attribute
+ * when extended attribute in inode, xattr_bh is equal to inode_bh.
+ */
+ struct buffer_head *xattr_bh;
+ struct ocfs2_xattr_header *header;
+ struct ocfs2_xattr_bucket bucket;
+ void *base;
+ void *end;
+ struct ocfs2_xattr_entry *here;
+ int not_found;
+};
+
+static int ocfs2_xattr_bucket_get_name_value(struct inode *inode,
+ struct ocfs2_xattr_header *xh,
+ int index,
+ int *block_off,
+ int *new_offset);
+
+static int ocfs2_xattr_index_block_find(struct inode *inode,
+ struct buffer_head *root_bh,
+ int name_index,
+ const char *name,
+ struct ocfs2_xattr_search *xs);
+
+static int ocfs2_xattr_tree_list_index_block(struct inode *inode,
+ struct ocfs2_xattr_tree_root *xt,
+ char *buffer,
+ size_t buffer_size);
+
+static int ocfs2_xattr_create_index_block(struct inode *inode,
+ struct ocfs2_xattr_search *xs);
+
+static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
+ struct ocfs2_xattr_info *xi,
+ struct ocfs2_xattr_search *xs);
+
+static int ocfs2_delete_xattr_index_block(struct inode *inode,
+ struct buffer_head *xb_bh);
+
+static inline const char *ocfs2_xattr_prefix(int name_index)
+{
+ struct xattr_handler *handler = NULL;
+
+ if (name_index > 0 && name_index < OCFS2_XATTR_MAX)
+ handler = ocfs2_xattr_handler_map[name_index];
+
+ return handler ? handler->prefix : NULL;
+}
+
+static u32 ocfs2_xattr_name_hash(struct inode *inode,
+ const char *name,
+ int name_len)
+{
+ /* Get hash value of uuid from super block */
+ u32 hash = OCFS2_SB(inode->i_sb)->uuid_hash;
+ int i;
+
+ /* hash extended attribute name */
+ for (i = 0; i < name_len; i++) {
+ hash = (hash << OCFS2_HASH_SHIFT) ^
+ (hash >> (8*sizeof(hash) - OCFS2_HASH_SHIFT)) ^
+ *name++;
+ }
+
+ return hash;
+}
+
+/*
+ * ocfs2_xattr_hash_entry()
+ *
+ * Compute the hash of an extended attribute.
+ */
+static void ocfs2_xattr_hash_entry(struct inode *inode,
+ struct ocfs2_xattr_header *header,
+ struct ocfs2_xattr_entry *entry)
+{
+ u32 hash = 0;
+ char *name = (char *)header + le16_to_cpu(entry->xe_name_offset);
+
+ hash = ocfs2_xattr_name_hash(inode, name, entry->xe_name_len);
+ entry->xe_name_hash = cpu_to_le32(hash);
+
+ return;
+}
+
+static int ocfs2_xattr_extend_allocation(struct inode *inode,
+ u32 clusters_to_add,
+ struct buffer_head *xattr_bh,
+ struct ocfs2_xattr_value_root *xv)
+{
+ int status = 0;
+ int restart_func = 0;
+ int credits = 0;
+ handle_t *handle = NULL;
+ struct ocfs2_alloc_context *data_ac = NULL;
+ struct ocfs2_alloc_context *meta_ac = NULL;
+ enum ocfs2_alloc_restarted why;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ u32 prev_clusters, logical_start = le32_to_cpu(xv->xr_clusters);
+ struct ocfs2_extent_tree et;
+
+ mlog(0, "(clusters_to_add for xattr= %u)\n", clusters_to_add);
+
+ ocfs2_init_xattr_value_extent_tree(&et, inode, xattr_bh, xv);
+
+restart_all:
+
+ status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
+ &data_ac, &meta_ac);
+ if (status) {
+ mlog_errno(status);
+ goto leave;
+ }
+
+ credits = ocfs2_calc_extend_credits(osb->sb, et.et_root_el,
+ clusters_to_add);
+ handle = ocfs2_start_trans(osb, credits);
+ if (IS_ERR(handle)) {
+ status = PTR_ERR(handle);
+ handle = NULL;
+ mlog_errno(status);
+ goto leave;
+ }
+
+restarted_transaction:
+ status = ocfs2_journal_access(handle, inode, xattr_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto leave;
+ }
+
+ prev_clusters = le32_to_cpu(xv->xr_clusters);
+ status = ocfs2_add_clusters_in_btree(osb,
+ inode,
+ &logical_start,
+ clusters_to_add,
+ 0,
+ &et,
+ handle,
+ data_ac,
+ meta_ac,
+ &why);
+ if ((status < 0) && (status != -EAGAIN)) {
+ if (status != -ENOSPC)
+ mlog_errno(status);
+ goto leave;
+ }
+
+ status = ocfs2_journal_dirty(handle, xattr_bh);
+ if (status < 0) {
+ mlog_errno(status);
+ goto leave;
+ }
+
+ clusters_to_add -= le32_to_cpu(xv->xr_clusters) - prev_clusters;
+
+ if (why != RESTART_NONE && clusters_to_add) {
+ if (why == RESTART_META) {
+ mlog(0, "restarting function.\n");
+ restart_func = 1;
+ } else {
+ BUG_ON(why != RESTART_TRANS);
+
+ mlog(0, "restarting transaction.\n");
+ /* TODO: This can be more intelligent. */
+ credits = ocfs2_calc_extend_credits(osb->sb,
+ et.et_root_el,
+ clusters_to_add);
+ status = ocfs2_extend_trans(handle, credits);
+ if (status < 0) {
+ /* handle still has to be committed at
+ * this point. */
+ status = -ENOMEM;
+ mlog_errno(status);
+ goto leave;
+ }
+ goto restarted_transaction;
+ }
+ }
+
+leave:
+ if (handle) {
+ ocfs2_commit_trans(osb, handle);
+ handle = NULL;
+ }
+ if (data_ac) {
+ ocfs2_free_alloc_context(data_ac);
+ data_ac = NULL;
+ }
+ if (meta_ac) {
+ ocfs2_free_alloc_context(meta_ac);
+ meta_ac = NULL;
+ }
+ if ((!status) && restart_func) {
+ restart_func = 0;
+ goto restart_all;
+ }
+
+ return status;
+}
+
+static int __ocfs2_remove_xattr_range(struct inode *inode,
+ struct buffer_head *root_bh,
+ struct ocfs2_xattr_value_root *xv,
+ u32 cpos, u32 phys_cpos, u32 len,
+ struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+ int ret;
+ u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct inode *tl_inode = osb->osb_tl_inode;
+ handle_t *handle;
+ struct ocfs2_alloc_context *meta_ac = NULL;
+ struct ocfs2_extent_tree et;
+
+ ocfs2_init_xattr_value_extent_tree(&et, inode, root_bh, xv);
+
+ ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac);
+ if (ret) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ mutex_lock(&tl_inode->i_mutex);
+
+ if (ocfs2_truncate_log_needs_flush(osb)) {
+ ret = __ocfs2_flush_truncate_log(osb);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_journal_access(handle, inode, root_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, meta_ac,
+ dealloc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ le32_add_cpu(&xv->xr_clusters, -len);
+
+ ret = ocfs2_journal_dirty(handle, root_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len);
+ if (ret)
+ mlog_errno(ret);
+
+out_commit:
+ ocfs2_commit_trans(osb, handle);
+out:
+ mutex_unlock(&tl_inode->i_mutex);
+
+ if (meta_ac)
+ ocfs2_free_alloc_context(meta_ac);
+
+ return ret;
+}
+
+static int ocfs2_xattr_shrink_size(struct inode *inode,
+ u32 old_clusters,
+ u32 new_clusters,
+ struct buffer_head *root_bh,
+ struct ocfs2_xattr_value_root *xv)
+{
+ int ret = 0;
+ u32 trunc_len, cpos, phys_cpos, alloc_size;
+ u64 block;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_cached_dealloc_ctxt dealloc;
+
+ ocfs2_init_dealloc_ctxt(&dealloc);
+
+ if (old_clusters <= new_clusters)
+ return 0;
+
+ cpos = new_clusters;
+ trunc_len = old_clusters - new_clusters;
+ while (trunc_len) {
+ ret = ocfs2_xattr_get_clusters(inode, cpos, &phys_cpos,
+ &alloc_size, &xv->xr_list);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (alloc_size > trunc_len)
+ alloc_size = trunc_len;
+
+ ret = __ocfs2_remove_xattr_range(inode, root_bh, xv, cpos,
+ phys_cpos, alloc_size,
+ &dealloc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ block = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
+ ocfs2_remove_xattr_clusters_from_cache(inode, block,
+ alloc_size);
+ cpos += alloc_size;
+ trunc_len -= alloc_size;
+ }
+
+out:
+ ocfs2_schedule_truncate_log_flush(osb, 1);
+ ocfs2_run_deallocs(osb, &dealloc);
+
+ return ret;
+}
+
+static int ocfs2_xattr_value_truncate(struct inode *inode,
+ struct buffer_head *root_bh,
+ struct ocfs2_xattr_value_root *xv,
+ int len)
+{
+ int ret;
+ u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb, len);
+ u32 old_clusters = le32_to_cpu(xv->xr_clusters);
+
+ if (new_clusters == old_clusters)
+ return 0;
+
+ if (new_clusters > old_clusters)
+ ret = ocfs2_xattr_extend_allocation(inode,
+ new_clusters - old_clusters,
+ root_bh, xv);
+ else
+ ret = ocfs2_xattr_shrink_size(inode,
+ old_clusters, new_clusters,
+ root_bh, xv);
+
+ return ret;
+}
+
+static int ocfs2_xattr_list_entry(char *buffer, size_t size,
+ size_t *result, const char *prefix,
+ const char *name, int name_len)
+{
+ char *p = buffer + *result;
+ int prefix_len = strlen(prefix);
+ int total_len = prefix_len + name_len + 1;
+
+ *result += total_len;
+
+ /* we are just looking for how big our buffer needs to be */
+ if (!size)
+ return 0;
+
+ if (*result > size)
+ return -ERANGE;
+
+ memcpy(p, prefix, prefix_len);
+ memcpy(p + prefix_len, name, name_len);
+ p[prefix_len + name_len] = '\0';
+
+ return 0;
+}
+
+static int ocfs2_xattr_list_entries(struct inode *inode,
+ struct ocfs2_xattr_header *header,
+ char *buffer, size_t buffer_size)
+{
+ size_t result = 0;
+ int i, type, ret;
+ const char *prefix, *name;
+
+ for (i = 0 ; i < le16_to_cpu(header->xh_count); i++) {
+ struct ocfs2_xattr_entry *entry = &header->xh_entries[i];
+ type = ocfs2_xattr_get_type(entry);
+ prefix = ocfs2_xattr_prefix(type);
+
+ if (prefix) {
+ name = (const char *)header +
+ le16_to_cpu(entry->xe_name_offset);
+
+ ret = ocfs2_xattr_list_entry(buffer, buffer_size,
+ &result, prefix, name,
+ entry->xe_name_len);
+ if (ret)
+ return ret;
+ }
+ }
+
+ return result;
+}
+
+static int ocfs2_xattr_ibody_list(struct inode *inode,
+ struct ocfs2_dinode *di,
+ char *buffer,
+ size_t buffer_size)
+{
+ struct ocfs2_xattr_header *header = NULL;
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ int ret = 0;
+
+ if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL))
+ return ret;
+
+ header = (struct ocfs2_xattr_header *)
+ ((void *)di + inode->i_sb->s_blocksize -
+ le16_to_cpu(di->i_xattr_inline_size));
+
+ ret = ocfs2_xattr_list_entries(inode, header, buffer, buffer_size);
+
+ return ret;
+}
+
+static int ocfs2_xattr_block_list(struct inode *inode,
+ struct ocfs2_dinode *di,
+ char *buffer,
+ size_t buffer_size)
+{
+ struct buffer_head *blk_bh = NULL;
+ struct ocfs2_xattr_block *xb;
+ int ret = 0;
+
+ if (!di->i_xattr_loc)
+ return ret;
+
+ ret = ocfs2_read_block(inode, le64_to_cpu(di->i_xattr_loc), &blk_bh);
+ if (ret < 0) {
+ mlog_errno(ret);
+ return ret;
+ }
+ /*Verify the signature of xattr block*/
+ if (memcmp((void *)blk_bh->b_data, OCFS2_XATTR_BLOCK_SIGNATURE,
+ strlen(OCFS2_XATTR_BLOCK_SIGNATURE))) {
+ ret = -EFAULT;
+ goto cleanup;
+ }
+
+ xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
+
+ if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
+ struct ocfs2_xattr_header *header = &xb->xb_attrs.xb_header;
+ ret = ocfs2_xattr_list_entries(inode, header,
+ buffer, buffer_size);
+ } else {
+ struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
+ ret = ocfs2_xattr_tree_list_index_block(inode, xt,
+ buffer, buffer_size);
+ }
+cleanup:
+ brelse(blk_bh);
+
+ return ret;
+}
+
+ssize_t ocfs2_listxattr(struct dentry *dentry,
+ char *buffer,
+ size_t size)
+{
+ int ret = 0, i_ret = 0, b_ret = 0;
+ struct buffer_head *di_bh = NULL;
+ struct ocfs2_dinode *di = NULL;
+ struct ocfs2_inode_info *oi = OCFS2_I(dentry->d_inode);
+
+ if (!ocfs2_supports_xattr(OCFS2_SB(dentry->d_sb)))
+ return -EOPNOTSUPP;
+
+ if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL))
+ return ret;
+
+ ret = ocfs2_inode_lock(dentry->d_inode, &di_bh, 0);
+ if (ret < 0) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ di = (struct ocfs2_dinode *)di_bh->b_data;
+
+ down_read(&oi->ip_xattr_sem);
+ i_ret = ocfs2_xattr_ibody_list(dentry->d_inode, di, buffer, size);
+ if (i_ret < 0)
+ b_ret = 0;
+ else {
+ if (buffer) {
+ buffer += i_ret;
+ size -= i_ret;
+ }
+ b_ret = ocfs2_xattr_block_list(dentry->d_inode, di,
+ buffer, size);
+ if (b_ret < 0)
+ i_ret = 0;
+ }
+ up_read(&oi->ip_xattr_sem);
+ ocfs2_inode_unlock(dentry->d_inode, 0);
+
+ brelse(di_bh);
+
+ return i_ret + b_ret;
+}
+
+static int ocfs2_xattr_find_entry(int name_index,
+ const char *name,
+ struct ocfs2_xattr_search *xs)
+{
+ struct ocfs2_xattr_entry *entry;
+ size_t name_len;
+ int i, cmp = 1;
+
+ if (name == NULL)
+ return -EINVAL;
+
+ name_len = strlen(name);
+ entry = xs->here;
+ for (i = 0; i < le16_to_cpu(xs->header->xh_count); i++) {
+ cmp = name_index - ocfs2_xattr_get_type(entry);
+ if (!cmp)
+ cmp = name_len - entry->xe_name_len;
+ if (!cmp)
+ cmp = memcmp(name, (xs->base +
+ le16_to_cpu(entry->xe_name_offset)),
+ name_len);
+ if (cmp == 0)
+ break;
+ entry += 1;
+ }
+ xs->here = entry;
+
+ return cmp ? -ENODATA : 0;
+}
+
+static int ocfs2_xattr_get_value_outside(struct inode *inode,
+ struct ocfs2_xattr_value_root *xv,
+ void *buffer,
+ size_t len)
+{
+ u32 cpos, p_cluster, num_clusters, bpc, clusters;
+ u64 blkno;
+ int i, ret = 0;
+ size_t cplen, blocksize;
+ struct buffer_head *bh = NULL;
+ struct ocfs2_extent_list *el;
+
+ el = &xv->xr_list;
+ clusters = le32_to_cpu(xv->xr_clusters);
+ bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
+ blocksize = inode->i_sb->s_blocksize;
+
+ cpos = 0;
+ while (cpos < clusters) {
+ ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
+ &num_clusters, el);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
+ /* Copy ocfs2_xattr_value */
+ for (i = 0; i < num_clusters * bpc; i++, blkno++) {
+ ret = ocfs2_read_block(inode, blkno, &bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ cplen = len >= blocksize ? blocksize : len;
+ memcpy(buffer, bh->b_data, cplen);
+ len -= cplen;
+ buffer += cplen;
+
+ brelse(bh);
+ bh = NULL;
+ if (len == 0)
+ break;
+ }
+ cpos += num_clusters;
+ }
+out:
+ return ret;
+}
+
+static int ocfs2_xattr_ibody_get(struct inode *inode,
+ int name_index,
+ const char *name,
+ void *buffer,
+ size_t buffer_size,
+ struct ocfs2_xattr_search *xs)
+{
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
+ struct ocfs2_xattr_value_root *xv;
+ size_t size;
+ int ret = 0;
+
+ if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL))
+ return -ENODATA;
+
+ xs->end = (void *)di + inode->i_sb->s_blocksize;
+ xs->header = (struct ocfs2_xattr_header *)
+ (xs->end - le16_to_cpu(di->i_xattr_inline_size));
+ xs->base = (void *)xs->header;
+ xs->here = xs->header->xh_entries;
+
+ ret = ocfs2_xattr_find_entry(name_index, name, xs);
+ if (ret)
+ return ret;
+ size = le64_to_cpu(xs->here->xe_value_size);
+ if (buffer) {
+ if (size > buffer_size)
+ return -ERANGE;
+ if (ocfs2_xattr_is_local(xs->here)) {
+ memcpy(buffer, (void *)xs->base +
+ le16_to_cpu(xs->here->xe_name_offset) +
+ OCFS2_XATTR_SIZE(xs->here->xe_name_len), size);
+ } else {
+ xv = (struct ocfs2_xattr_value_root *)
+ (xs->base + le16_to_cpu(
+ xs->here->xe_name_offset) +
+ OCFS2_XATTR_SIZE(xs->here->xe_name_len));
+ ret = ocfs2_xattr_get_value_outside(inode, xv,
+ buffer, size);
+ if (ret < 0) {
+ mlog_errno(ret);
+ return ret;
+ }
+ }
+ }
+
+ return size;
+}
+
+static int ocfs2_xattr_block_get(struct inode *inode,
+ int name_index,
+ const char *name,
+ void *buffer,
+ size_t buffer_size,
+ struct ocfs2_xattr_search *xs)
+{
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
+ struct buffer_head *blk_bh = NULL;
+ struct ocfs2_xattr_block *xb;
+ struct ocfs2_xattr_value_root *xv;
+ size_t size;
+ int ret = -ENODATA, name_offset, name_len, block_off, i;
+
+ if (!di->i_xattr_loc)
+ return ret;
+
+ memset(&xs->bucket, 0, sizeof(xs->bucket));
+
+ ret = ocfs2_read_block(inode, le64_to_cpu(di->i_xattr_loc), &blk_bh);
+ if (ret < 0) {
+ mlog_errno(ret);
+ return ret;
+ }
+ /*Verify the signature of xattr block*/
+ if (memcmp((void *)blk_bh->b_data, OCFS2_XATTR_BLOCK_SIGNATURE,
+ strlen(OCFS2_XATTR_BLOCK_SIGNATURE))) {
+ ret = -EFAULT;
+ goto cleanup;
+ }
+
+ xs->xattr_bh = blk_bh;
+ xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
+
+ if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
+ xs->header = &xb->xb_attrs.xb_header;
+ xs->base = (void *)xs->header;
+ xs->end = (void *)(blk_bh->b_data) + blk_bh->b_size;
+ xs->here = xs->header->xh_entries;
+
+ ret = ocfs2_xattr_find_entry(name_index, name, xs);
+ } else
+ ret = ocfs2_xattr_index_block_find(inode, blk_bh,
+ name_index,
+ name, xs);
+
+ if (ret)
+ goto cleanup;
+ size = le64_to_cpu(xs->here->xe_value_size);
+ if (buffer) {
+ ret = -ERANGE;
+ if (size > buffer_size)
+ goto cleanup;
+
+ name_offset = le16_to_cpu(xs->here->xe_name_offset);
+ name_len = OCFS2_XATTR_SIZE(xs->here->xe_name_len);
+ i = xs->here - xs->header->xh_entries;
+
+ if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) {
+ ret = ocfs2_xattr_bucket_get_name_value(inode,
+ xs->bucket.xh,
+ i,
+ &block_off,
+ &name_offset);
+ xs->base = xs->bucket.bhs[block_off]->b_data;
+ }
+ if (ocfs2_xattr_is_local(xs->here)) {
+ memcpy(buffer, (void *)xs->base +
+ name_offset + name_len, size);
+ } else {
+ xv = (struct ocfs2_xattr_value_root *)
+ (xs->base + name_offset + name_len);
+ ret = ocfs2_xattr_get_value_outside(inode, xv,
+ buffer, size);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto cleanup;
+ }
+ }
+ }
+ ret = size;
+cleanup:
+ for (i = 0; i < OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET; i++)
+ brelse(xs->bucket.bhs[i]);
+ memset(&xs->bucket, 0, sizeof(xs->bucket));
+
+ brelse(blk_bh);
+ return ret;
+}
+
+/* ocfs2_xattr_get()
+ *
+ * Copy an extended attribute into the buffer provided.
+ * Buffer is NULL to compute the size of buffer required.
+ */
+int ocfs2_xattr_get(struct inode *inode,
+ int name_index,
+ const char *name,
+ void *buffer,
+ size_t buffer_size)
+{
+ int ret;
+ struct ocfs2_dinode *di = NULL;
+ struct buffer_head *di_bh = NULL;
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_xattr_search xis = {
+ .not_found = -ENODATA,
+ };
+ struct ocfs2_xattr_search xbs = {
+ .not_found = -ENODATA,
+ };
+
+ if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb)))
+ return -EOPNOTSUPP;
+
+ if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL))
+ ret = -ENODATA;
+
+ ret = ocfs2_inode_lock(inode, &di_bh, 0);
+ if (ret < 0) {
+ mlog_errno(ret);
+ return ret;
+ }
+ xis.inode_bh = xbs.inode_bh = di_bh;
+ di = (struct ocfs2_dinode *)di_bh->b_data;
+
+ down_read(&oi->ip_xattr_sem);
+ ret = ocfs2_xattr_ibody_get(inode, name_index, name, buffer,
+ buffer_size, &xis);
+ if (ret == -ENODATA)
+ ret = ocfs2_xattr_block_get(inode, name_index, name, buffer,
+ buffer_size, &xbs);
+ up_read(&oi->ip_xattr_sem);
+ ocfs2_inode_unlock(inode, 0);
+
+ brelse(di_bh);
+
+ return ret;
+}
+
+static int __ocfs2_xattr_set_value_outside(struct inode *inode,
+ struct ocfs2_xattr_value_root *xv,
+ const void *value,
+ int value_len)
+{
+ int ret = 0, i, cp_len, credits;
+ u16 blocksize = inode->i_sb->s_blocksize;
+ u32 p_cluster, num_clusters;
+ u32 cpos = 0, bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
+ u32 clusters = ocfs2_clusters_for_bytes(inode->i_sb, value_len);
+ u64 blkno;
+ struct buffer_head *bh = NULL;
+ handle_t *handle;
+
+ BUG_ON(clusters > le32_to_cpu(xv->xr_clusters));
+
+ credits = clusters * bpc;
+ handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb), credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out;
+ }
+
+ while (cpos < clusters) {
+ ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
+ &num_clusters, &xv->xr_list);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
+
+ for (i = 0; i < num_clusters * bpc; i++, blkno++) {
+ ret = ocfs2_read_block(inode, blkno, &bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ ret = ocfs2_journal_access(handle,
+ inode,
+ bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ cp_len = value_len > blocksize ? blocksize : value_len;
+ memcpy(bh->b_data, value, cp_len);
+ value_len -= cp_len;
+ value += cp_len;
+ if (cp_len < blocksize)
+ memset(bh->b_data + cp_len, 0,
+ blocksize - cp_len);
+
+ ret = ocfs2_journal_dirty(handle, bh);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+ brelse(bh);
+ bh = NULL;
+
+ /*
+ * XXX: do we need to empty all the following
+ * blocks in this cluster?
+ */
+ if (!value_len)
+ break;
+ }
+ cpos += num_clusters;
+ }
+out_commit:
+ ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
+out:
+ brelse(bh);
+
+ return ret;
+}
+
+static int ocfs2_xattr_cleanup(struct inode *inode,
+ struct ocfs2_xattr_info *xi,
+ struct ocfs2_xattr_search *xs,
+ size_t offs)
+{
+ handle_t *handle = NULL;
+ int ret = 0;
+ size_t name_len = strlen(xi->name);
+ void *val = xs->base + offs;
+ size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
+
+ handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)),
+ OCFS2_XATTR_BLOCK_UPDATE_CREDITS);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out;
+ }
+ ret = ocfs2_journal_access(handle, inode, xs->xattr_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+ /* Decrease xattr count */
+ le16_add_cpu(&xs->header->xh_count, -1);
+ /* Remove the xattr entry and tree root which has already be set*/
+ memset((void *)xs->here, 0, sizeof(struct ocfs2_xattr_entry));
+ memset(val, 0, size);
+
+ ret = ocfs2_journal_dirty(handle, xs->xattr_bh);
+ if (ret < 0)
+ mlog_errno(ret);
+out_commit:
+ ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
+out:
+ return ret;
+}
+
+static int ocfs2_xattr_update_entry(struct inode *inode,
+ struct ocfs2_xattr_info *xi,
+ struct ocfs2_xattr_search *xs,
+ size_t offs)
+{
+ handle_t *handle = NULL;
+ int ret = 0;
+
+ handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)),
+ OCFS2_XATTR_BLOCK_UPDATE_CREDITS);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out;
+ }
+ ret = ocfs2_journal_access(handle, inode, xs->xattr_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ xs->here->xe_name_offset = cpu_to_le16(offs);
+ xs->here->xe_value_size = cpu_to_le64(xi->value_len);
+ if (xi->value_len <= OCFS2_XATTR_INLINE_SIZE)
+ ocfs2_xattr_set_local(xs->here, 1);
+ else
+ ocfs2_xattr_set_local(xs->here, 0);
+ ocfs2_xattr_hash_entry(inode, xs->header, xs->here);
+
+ ret = ocfs2_journal_dirty(handle, xs->xattr_bh);
+ if (ret < 0)
+ mlog_errno(ret);
+out_commit:
+ ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
+out:
+ return ret;
+}
+
+/*
+ * ocfs2_xattr_set_value_outside()
+ *
+ * Set large size value in B tree.
+ */
+static int ocfs2_xattr_set_value_outside(struct inode *inode,
+ struct ocfs2_xattr_info *xi,
+ struct ocfs2_xattr_search *xs,
+ size_t offs)
+{
+ size_t name_len = strlen(xi->name);
+ void *val = xs->base + offs;
+ struct ocfs2_xattr_value_root *xv = NULL;
+ size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
+ int ret = 0;
+
+ memset(val, 0, size);
+ memcpy(val, xi->name, name_len);
+ xv = (struct ocfs2_xattr_value_root *)
+ (val + OCFS2_XATTR_SIZE(name_len));
+ xv->xr_clusters = 0;
+ xv->xr_last_eb_blk = 0;
+ xv->xr_list.l_tree_depth = 0;
+ xv->xr_list.l_count = cpu_to_le16(1);
+ xv->xr_list.l_next_free_rec = 0;
+
+ ret = ocfs2_xattr_value_truncate(inode, xs->xattr_bh, xv,
+ xi->value_len);
+ if (ret < 0) {
+ mlog_errno(ret);
+ return ret;
+ }
+ ret = __ocfs2_xattr_set_value_outside(inode, xv, xi->value,
+ xi->value_len);
+ if (ret < 0) {
+ mlog_errno(ret);
+ return ret;
+ }
+ ret = ocfs2_xattr_update_entry(inode, xi, xs, offs);
+ if (ret < 0)
+ mlog_errno(ret);
+
+ return ret;
+}
+
+/*
+ * ocfs2_xattr_set_entry_local()
+ *
+ * Set, replace or remove extended attribute in local.
+ */
+static void ocfs2_xattr_set_entry_local(struct inode *inode,
+ struct ocfs2_xattr_info *xi,
+ struct ocfs2_xattr_search *xs,
+ struct ocfs2_xattr_entry *last,
+ size_t min_offs)
+{
+ size_t name_len = strlen(xi->name);
+ int i;
+
+ if (xi->value && xs->not_found) {
+ /* Insert the new xattr entry. */
+ le16_add_cpu(&xs->header->xh_count, 1);
+ ocfs2_xattr_set_type(last, xi->name_index);
+ ocfs2_xattr_set_local(last, 1);
+ last->xe_name_len = name_len;
+ } else {
+ void *first_val;
+ void *val;
+ size_t offs, size;
+
+ first_val = xs->base + min_offs;
+ offs = le16_to_cpu(xs->here->xe_name_offset);
+ val = xs->base + offs;
+
+ if (le64_to_cpu(xs->here->xe_value_size) >
+ OCFS2_XATTR_INLINE_SIZE)
+ size = OCFS2_XATTR_SIZE(name_len) +
+ OCFS2_XATTR_ROOT_SIZE;
+ else
+ size = OCFS2_XATTR_SIZE(name_len) +
+ OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size));
+
+ if (xi->value && size == OCFS2_XATTR_SIZE(name_len) +
+ OCFS2_XATTR_SIZE(xi->value_len)) {
+ /* The old and the new value have the
+ same size. Just replace the value. */
+ ocfs2_xattr_set_local(xs->here, 1);
+ xs->here->xe_value_size = cpu_to_le64(xi->value_len);
+ /* Clear value bytes. */
+ memset(val + OCFS2_XATTR_SIZE(name_len),
+ 0,
+ OCFS2_XATTR_SIZE(xi->value_len));
+ memcpy(val + OCFS2_XATTR_SIZE(name_len),
+ xi->value,
+ xi->value_len);
+ return;
+ }
+ /* Remove the old name+value. */
+ memmove(first_val + size, first_val, val - first_val);
+ memset(first_val, 0, size);
+ xs->here->xe_name_hash = 0;
+ xs->here->xe_name_offset = 0;
+ ocfs2_xattr_set_local(xs->here, 1);
+ xs->here->xe_value_size = 0;
+
+ min_offs += size;
+
+ /* Adjust all value offsets. */
+ last = xs->header->xh_entries;
+ for (i = 0 ; i < le16_to_cpu(xs->header->xh_count); i++) {
+ size_t o = le16_to_cpu(last->xe_name_offset);
+
+ if (o < offs)
+ last->xe_name_offset = cpu_to_le16(o + size);
+ last += 1;
+ }
+
+ if (!xi->value) {
+ /* Remove the old entry. */
+ last -= 1;
+ memmove(xs->here, xs->here + 1,
+ (void *)last - (void *)xs->here);
+ memset(last, 0, sizeof(struct ocfs2_xattr_entry));
+ le16_add_cpu(&xs->header->xh_count, -1);
+ }
+ }
+ if (xi->value) {
+ /* Insert the new name+value. */
+ size_t size = OCFS2_XATTR_SIZE(name_len) +
+ OCFS2_XATTR_SIZE(xi->value_len);
+ void *val = xs->base + min_offs - size;
+
+ xs->here->xe_name_offset = cpu_to_le16(min_offs - size);
+ memset(val, 0, size);
+ memcpy(val, xi->name, name_len);
+ memcpy(val + OCFS2_XATTR_SIZE(name_len),
+ xi->value,
+ xi->value_len);
+ xs->here->xe_value_size = cpu_to_le64(xi->value_len);
+ ocfs2_xattr_set_local(xs->here, 1);
+ ocfs2_xattr_hash_entry(inode, xs->header, xs->here);
+ }
+
+ return;
+}
+
+/*
+ * ocfs2_xattr_set_entry()
+ *
+ * Set extended attribute entry into inode or block.
+ *
+ * If extended attribute value size > OCFS2_XATTR_INLINE_SIZE,
+ * We first insert tree root(ocfs2_xattr_value_root) with set_entry_local(),
+ * then set value in B tree with set_value_outside().
+ */
+static int ocfs2_xattr_set_entry(struct inode *inode,
+ struct ocfs2_xattr_info *xi,
+ struct ocfs2_xattr_search *xs,
+ int flag)
+{
+ struct ocfs2_xattr_entry *last;
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
+ size_t min_offs = xs->end - xs->base, name_len = strlen(xi->name);
+ size_t size_l = 0;
+ handle_t *handle = NULL;
+ int free, i, ret;
+ struct ocfs2_xattr_info xi_l = {
+ .name_index = xi->name_index,
+ .name = xi->name,
+ .value = xi->value,
+ .value_len = xi->value_len,
+ };
+
+ /* Compute min_offs, last and free space. */
+ last = xs->header->xh_entries;
+
+ for (i = 0 ; i < le16_to_cpu(xs->header->xh_count); i++) {
+ size_t offs = le16_to_cpu(last->xe_name_offset);
+ if (offs < min_offs)
+ min_offs = offs;
+ last += 1;
+ }
+
+ free = min_offs - ((void *)last - xs->base) - sizeof(__u32);
+ if (free < 0)
+ return -EFAULT;
+
+ if (!xs->not_found) {
+ size_t size = 0;
+ if (ocfs2_xattr_is_local(xs->here))
+ size = OCFS2_XATTR_SIZE(name_len) +
+ OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size));
+ else
+ size = OCFS2_XATTR_SIZE(name_len) +
+ OCFS2_XATTR_ROOT_SIZE;
+ free += (size + sizeof(struct ocfs2_xattr_entry));
+ }
+ /* Check free space in inode or block */
+ if (xi->value && xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
+ if (free < sizeof(struct ocfs2_xattr_entry) +
+ OCFS2_XATTR_SIZE(name_len) +
+ OCFS2_XATTR_ROOT_SIZE) {
+ ret = -ENOSPC;
+ goto out;
+ }
+ size_l = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
+ xi_l.value = (void *)&def_xv;
+ xi_l.value_len = OCFS2_XATTR_ROOT_SIZE;
+ } else if (xi->value) {
+ if (free < sizeof(struct ocfs2_xattr_entry) +
+ OCFS2_XATTR_SIZE(name_len) +
+ OCFS2_XATTR_SIZE(xi->value_len)) {
+ ret = -ENOSPC;
+ goto out;
+ }
+ }
+
+ if (!xs->not_found) {
+ /* For existing extended attribute */
+ size_t size = OCFS2_XATTR_SIZE(name_len) +
+ OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size));
+ size_t offs = le16_to_cpu(xs->here->xe_name_offset);
+ void *val = xs->base + offs;
+
+ if (ocfs2_xattr_is_local(xs->here) && size == size_l) {
+ /* Replace existing local xattr with tree root */
+ ret = ocfs2_xattr_set_value_outside(inode, xi, xs,
+ offs);
+ if (ret < 0)
+ mlog_errno(ret);
+ goto out;
+ } else if (!ocfs2_xattr_is_local(xs->here)) {
+ /* For existing xattr which has value outside */
+ struct ocfs2_xattr_value_root *xv = NULL;
+ xv = (struct ocfs2_xattr_value_root *)(val +
+ OCFS2_XATTR_SIZE(name_len));
+
+ if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
+ /*
+ * If new value need set outside also,
+ * first truncate old value to new value,
+ * then set new value with set_value_outside().
+ */
+ ret = ocfs2_xattr_value_truncate(inode,
+ xs->xattr_bh,
+ xv,
+ xi->value_len);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = __ocfs2_xattr_set_value_outside(inode,
+ xv,
+ xi->value,
+ xi->value_len);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_xattr_update_entry(inode,
+ xi,
+ xs,
+ offs);
+ if (ret < 0)
+ mlog_errno(ret);
+ goto out;
+ } else {
+ /*
+ * If new value need set in local,
+ * just trucate old value to zero.
+ */
+ ret = ocfs2_xattr_value_truncate(inode,
+ xs->xattr_bh,
+ xv,
+ 0);
+ if (ret < 0)
+ mlog_errno(ret);
+ }
+ }
+ }
+
+ handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)),
+ OCFS2_INODE_UPDATE_CREDITS);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_journal_access(handle, inode, xs->inode_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ if (!(flag & OCFS2_INLINE_XATTR_FL)) {
+ /* set extended attribute in external block. */
+ ret = ocfs2_extend_trans(handle,
+ OCFS2_INODE_UPDATE_CREDITS +
+ OCFS2_XATTR_BLOCK_UPDATE_CREDITS);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+ ret = ocfs2_journal_access(handle, inode, xs->xattr_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+ }
+
+ /*
+ * Set value in local, include set tree root in local.
+ * This is the first step for value size >INLINE_SIZE.
+ */
+ ocfs2_xattr_set_entry_local(inode, &xi_l, xs, last, min_offs);
+
+ if (!(flag & OCFS2_INLINE_XATTR_FL)) {
+ ret = ocfs2_journal_dirty(handle, xs->xattr_bh);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+ }
+
+ if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) &&
+ (flag & OCFS2_INLINE_XATTR_FL)) {
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ unsigned int xattrsize = osb->s_xattr_inline_size;
+
+ /*
+ * Adjust extent record count or inline data size
+ * to reserve space for extended attribute.
+ */
+ if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+ struct ocfs2_inline_data *idata = &di->id2.i_data;
+ le16_add_cpu(&idata->id_count, -xattrsize);
+ } else if (!(ocfs2_inode_is_fast_symlink(inode))) {
+ struct ocfs2_extent_list *el = &di->id2.i_list;
+ le16_add_cpu(&el->l_count, -(xattrsize /
+ sizeof(struct ocfs2_extent_rec)));
+ }
+ di->i_xattr_inline_size = cpu_to_le16(xattrsize);
+ }
+ /* Update xattr flag */
+ spin_lock(&oi->ip_lock);
+ oi->ip_dyn_features |= flag;
+ di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
+ spin_unlock(&oi->ip_lock);
+ /* Update inode ctime */
+ inode->i_ctime = CURRENT_TIME;
+ di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
+ di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+
+ ret = ocfs2_journal_dirty(handle, xs->inode_bh);
+ if (ret < 0)
+ mlog_errno(ret);
+
+out_commit:
+ ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
+
+ if (!ret && xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
+ /*
+ * Set value outside in B tree.
+ * This is the second step for value size > INLINE_SIZE.
+ */
+ size_t offs = le16_to_cpu(xs->here->xe_name_offset);
+ ret = ocfs2_xattr_set_value_outside(inode, xi, xs, offs);
+ if (ret < 0) {
+ int ret2;
+
+ mlog_errno(ret);
+ /*
+ * If set value outside failed, we have to clean
+ * the junk tree root we have already set in local.
+ */
+ ret2 = ocfs2_xattr_cleanup(inode, xi, xs, offs);
+ if (ret2 < 0)
+ mlog_errno(ret2);
+ }
+ }
+out:
+ return ret;
+
+}
+
+static int ocfs2_remove_value_outside(struct inode*inode,
+ struct buffer_head *bh,
+ struct ocfs2_xattr_header *header)
+{
+ int ret = 0, i;
+
+ for (i = 0; i < le16_to_cpu(header->xh_count); i++) {
+ struct ocfs2_xattr_entry *entry = &header->xh_entries[i];
+
+ if (!ocfs2_xattr_is_local(entry)) {
+ struct ocfs2_xattr_value_root *xv;
+ void *val;
+
+ val = (void *)header +
+ le16_to_cpu(entry->xe_name_offset);
+ xv = (struct ocfs2_xattr_value_root *)
+ (val + OCFS2_XATTR_SIZE(entry->xe_name_len));
+ ret = ocfs2_xattr_value_truncate(inode, bh, xv, 0);
+ if (ret < 0) {
+ mlog_errno(ret);
+ return ret;
+ }
+ }
+ }
+
+ return ret;
+}
+
+static int ocfs2_xattr_ibody_remove(struct inode *inode,
+ struct buffer_head *di_bh)
+{
+
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+ struct ocfs2_xattr_header *header;
+ int ret;
+
+ header = (struct ocfs2_xattr_header *)
+ ((void *)di + inode->i_sb->s_blocksize -
+ le16_to_cpu(di->i_xattr_inline_size));
+
+ ret = ocfs2_remove_value_outside(inode, di_bh, header);
+
+ return ret;
+}
+
+static int ocfs2_xattr_block_remove(struct inode *inode,
+ struct buffer_head *blk_bh)
+{
+ struct ocfs2_xattr_block *xb;
+ int ret = 0;
+
+ xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
+ if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
+ struct ocfs2_xattr_header *header = &(xb->xb_attrs.xb_header);
+ ret = ocfs2_remove_value_outside(inode, blk_bh, header);
+ } else
+ ret = ocfs2_delete_xattr_index_block(inode, blk_bh);
+
+ return ret;
+}
+
+static int ocfs2_xattr_free_block(struct inode *inode,
+ u64 block)
+{
+ struct inode *xb_alloc_inode;
+ struct buffer_head *xb_alloc_bh = NULL;
+ struct buffer_head *blk_bh = NULL;
+ struct ocfs2_xattr_block *xb;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ handle_t *handle;
+ int ret = 0;
+ u64 blk, bg_blkno;
+ u16 bit;
+
+ ret = ocfs2_read_block(inode, block, &blk_bh);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*Verify the signature of xattr block*/
+ if (memcmp((void *)blk_bh->b_data, OCFS2_XATTR_BLOCK_SIGNATURE,
+ strlen(OCFS2_XATTR_BLOCK_SIGNATURE))) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ ret = ocfs2_xattr_block_remove(inode, blk_bh);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
+ blk = le64_to_cpu(xb->xb_blkno);
+ bit = le16_to_cpu(xb->xb_suballoc_bit);
+ bg_blkno = ocfs2_which_suballoc_group(blk, bit);
+
+ xb_alloc_inode = ocfs2_get_system_file_inode(osb,
+ EXTENT_ALLOC_SYSTEM_INODE,
+ le16_to_cpu(xb->xb_suballoc_slot));
+ if (!xb_alloc_inode) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+ mutex_lock(&xb_alloc_inode->i_mutex);
+
+ ret = ocfs2_inode_lock(xb_alloc_inode, &xb_alloc_bh, 1);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out_mutex;
+ }
+
+ handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out_unlock;
+ }
+
+ ret = ocfs2_free_suballoc_bits(handle, xb_alloc_inode, xb_alloc_bh,
+ bit, bg_blkno, 1);
+ if (ret < 0)
+ mlog_errno(ret);
+
+ ocfs2_commit_trans(osb, handle);
+out_unlock:
+ ocfs2_inode_unlock(xb_alloc_inode, 1);
+ brelse(xb_alloc_bh);
+out_mutex:
+ mutex_unlock(&xb_alloc_inode->i_mutex);
+ iput(xb_alloc_inode);
+out:
+ brelse(blk_bh);
+ return ret;
+}
+
+/*
+ * ocfs2_xattr_remove()
+ *
+ * Free extended attribute resources associated with this inode.
+ */
+int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)
+{
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+ handle_t *handle;
+ int ret;
+
+ if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb)))
+ return 0;
+
+ if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL))
+ return 0;
+
+ if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) {
+ ret = ocfs2_xattr_ibody_remove(inode, di_bh);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ if (di->i_xattr_loc) {
+ ret = ocfs2_xattr_free_block(inode,
+ le64_to_cpu(di->i_xattr_loc));
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)),
+ OCFS2_INODE_UPDATE_CREDITS);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out;
+ }
+ ret = ocfs2_journal_access(handle, inode, di_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ di->i_xattr_loc = 0;
+
+ spin_lock(&oi->ip_lock);
+ oi->ip_dyn_features &= ~(OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL);
+ di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
+ spin_unlock(&oi->ip_lock);
+
+ ret = ocfs2_journal_dirty(handle, di_bh);
+ if (ret < 0)
+ mlog_errno(ret);
+out_commit:
+ ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
+out:
+ return ret;
+}
+
+static int ocfs2_xattr_has_space_inline(struct inode *inode,
+ struct ocfs2_dinode *di)
+{
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ unsigned int xattrsize = OCFS2_SB(inode->i_sb)->s_xattr_inline_size;
+ int free;
+
+ if (xattrsize < OCFS2_MIN_XATTR_INLINE_SIZE)
+ return 0;
+
+ if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+ struct ocfs2_inline_data *idata = &di->id2.i_data;
+ free = le16_to_cpu(idata->id_count) - le64_to_cpu(di->i_size);
+ } else if (ocfs2_inode_is_fast_symlink(inode)) {
+ free = ocfs2_fast_symlink_chars(inode->i_sb) -
+ le64_to_cpu(di->i_size);
+ } else {
+ struct ocfs2_extent_list *el = &di->id2.i_list;
+ free = (le16_to_cpu(el->l_count) -
+ le16_to_cpu(el->l_next_free_rec)) *
+ sizeof(struct ocfs2_extent_rec);
+ }
+ if (free >= xattrsize)
+ return 1;
+
+ return 0;
+}
+
+/*
+ * ocfs2_xattr_ibody_find()
+ *
+ * Find extended attribute in inode block and
+ * fill search info into struct ocfs2_xattr_search.
+ */
+static int ocfs2_xattr_ibody_find(struct inode *inode,
+ int name_index,
+ const char *name,
+ struct ocfs2_xattr_search *xs)
+{
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
+ int ret;
+ int has_space = 0;
+
+ if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE)
+ return 0;
+
+ if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) {
+ down_read(&oi->ip_alloc_sem);
+ has_space = ocfs2_xattr_has_space_inline(inode, di);
+ up_read(&oi->ip_alloc_sem);
+ if (!has_space)
+ return 0;
+ }
+
+ xs->xattr_bh = xs->inode_bh;
+ xs->end = (void *)di + inode->i_sb->s_blocksize;
+ if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)
+ xs->header = (struct ocfs2_xattr_header *)
+ (xs->end - le16_to_cpu(di->i_xattr_inline_size));
+ else
+ xs->header = (struct ocfs2_xattr_header *)
+ (xs->end - OCFS2_SB(inode->i_sb)->s_xattr_inline_size);
+ xs->base = (void *)xs->header;
+ xs->here = xs->header->xh_entries;
+
+ /* Find the named attribute. */
+ if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) {
+ ret = ocfs2_xattr_find_entry(name_index, name, xs);
+ if (ret && ret != -ENODATA)
+ return ret;
+ xs->not_found = ret;
+ }
+
+ return 0;
+}
+
+/*
+ * ocfs2_xattr_ibody_set()
+ *
+ * Set, replace or remove an extended attribute into inode block.
+ *
+ */
+static int ocfs2_xattr_ibody_set(struct inode *inode,
+ struct ocfs2_xattr_info *xi,
+ struct ocfs2_xattr_search *xs)
+{
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
+ int ret;
+
+ if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE)
+ return -ENOSPC;
+
+ down_write(&oi->ip_alloc_sem);
+ if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) {
+ if (!ocfs2_xattr_has_space_inline(inode, di)) {
+ ret = -ENOSPC;
+ goto out;
+ }
+ }
+
+ ret = ocfs2_xattr_set_entry(inode, xi, xs,
+ (OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL));
+out:
+ up_write(&oi->ip_alloc_sem);
+
+ return ret;
+}
+
+/*
+ * ocfs2_xattr_block_find()
+ *
+ * Find extended attribute in external block and
+ * fill search info into struct ocfs2_xattr_search.
+ */
+static int ocfs2_xattr_block_find(struct inode *inode,
+ int name_index,
+ const char *name,
+ struct ocfs2_xattr_search *xs)
+{
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
+ struct buffer_head *blk_bh = NULL;
+ struct ocfs2_xattr_block *xb;
+ int ret = 0;
+
+ if (!di->i_xattr_loc)
+ return ret;
+
+ ret = ocfs2_read_block(inode, le64_to_cpu(di->i_xattr_loc), &blk_bh);
+ if (ret < 0) {
+ mlog_errno(ret);
+ return ret;
+ }
+ /*Verify the signature of xattr block*/
+ if (memcmp((void *)blk_bh->b_data, OCFS2_XATTR_BLOCK_SIGNATURE,
+ strlen(OCFS2_XATTR_BLOCK_SIGNATURE))) {
+ ret = -EFAULT;
+ goto cleanup;
+ }
+
+ xs->xattr_bh = blk_bh;
+ xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
+
+ if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
+ xs->header = &xb->xb_attrs.xb_header;
+ xs->base = (void *)xs->header;
+ xs->end = (void *)(blk_bh->b_data) + blk_bh->b_size;
+ xs->here = xs->header->xh_entries;
+
+ ret = ocfs2_xattr_find_entry(name_index, name, xs);
+ } else
+ ret = ocfs2_xattr_index_block_find(inode, blk_bh,
+ name_index,
+ name, xs);
+
+ if (ret && ret != -ENODATA) {
+ xs->xattr_bh = NULL;
+ goto cleanup;
+ }
+ xs->not_found = ret;
+ return 0;
+cleanup:
+ brelse(blk_bh);
+
+ return ret;
+}
+
+/*
+ * When all the xattrs are deleted from index btree, the ocfs2_xattr_tree
+ * will be erased and ocfs2_xattr_block will have its ocfs2_xattr_header
+ * re-initialized.
+ */
+static int ocfs2_restore_xattr_block(struct inode *inode,
+ struct ocfs2_xattr_search *xs)
+{
+ int ret;
+ handle_t *handle;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_xattr_block *xb =
+ (struct ocfs2_xattr_block *)xs->xattr_bh->b_data;
+ struct ocfs2_extent_list *el = &xb->xb_attrs.xb_root.xt_list;
+ u16 xb_flags = le16_to_cpu(xb->xb_flags);
+
+ BUG_ON(!(xb_flags & OCFS2_XATTR_INDEXED) ||
+ le16_to_cpu(el->l_next_free_rec) != 0);
+
+ handle = ocfs2_start_trans(osb, OCFS2_XATTR_BLOCK_UPDATE_CREDITS);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ handle = NULL;
+ goto out;
+ }
+
+ ret = ocfs2_journal_access(handle, inode, xs->xattr_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ memset(&xb->xb_attrs, 0, inode->i_sb->s_blocksize -
+ offsetof(struct ocfs2_xattr_block, xb_attrs));
+
+ xb->xb_flags = cpu_to_le16(xb_flags & ~OCFS2_XATTR_INDEXED);
+
+ ocfs2_journal_dirty(handle, xs->xattr_bh);
+
+out_commit:
+ ocfs2_commit_trans(osb, handle);
+out:
+ return ret;
+}
+
+/*
+ * ocfs2_xattr_block_set()
+ *
+ * Set, replace or remove an extended attribute into external block.
+ *
+ */
+static int ocfs2_xattr_block_set(struct inode *inode,
+ struct ocfs2_xattr_info *xi,
+ struct ocfs2_xattr_search *xs)
+{
+ struct buffer_head *new_bh = NULL;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
+ struct ocfs2_alloc_context *meta_ac = NULL;
+ handle_t *handle = NULL;
+ struct ocfs2_xattr_block *xblk = NULL;
+ u16 suballoc_bit_start;
+ u32 num_got;
+ u64 first_blkno;
+ int ret;
+
+ if (!xs->xattr_bh) {
+ /*
+ * Alloc one external block for extended attribute
+ * outside of inode.
+ */
+ ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+ handle = ocfs2_start_trans(osb,
+ OCFS2_XATTR_BLOCK_CREATE_CREDITS);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out;
+ }
+ ret = ocfs2_journal_access(handle, inode, xs->inode_bh,
+ OCFS2_JOURNAL_ACCESS_CREATE);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1,
+ &suballoc_bit_start, &num_got,
+ &first_blkno);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ new_bh = sb_getblk(inode->i_sb, first_blkno);
+ ocfs2_set_new_buffer_uptodate(inode, new_bh);
+
+ ret = ocfs2_journal_access(handle, inode, new_bh,
+ OCFS2_JOURNAL_ACCESS_CREATE);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ /* Initialize ocfs2_xattr_block */
+ xs->xattr_bh = new_bh;
+ xblk = (struct ocfs2_xattr_block *)new_bh->b_data;
+ memset(xblk, 0, inode->i_sb->s_blocksize);
+ strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE);
+ xblk->xb_suballoc_slot = cpu_to_le16(osb->slot_num);
+ xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start);
+ xblk->xb_fs_generation = cpu_to_le32(osb->fs_generation);
+ xblk->xb_blkno = cpu_to_le64(first_blkno);
+
+ xs->header = &xblk->xb_attrs.xb_header;
+ xs->base = (void *)xs->header;
+ xs->end = (void *)xblk + inode->i_sb->s_blocksize;
+ xs->here = xs->header->xh_entries;
+
+
+ ret = ocfs2_journal_dirty(handle, new_bh);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+ di->i_xattr_loc = cpu_to_le64(first_blkno);
+ ret = ocfs2_journal_dirty(handle, xs->inode_bh);
+ if (ret < 0)
+ mlog_errno(ret);
+out_commit:
+ ocfs2_commit_trans(osb, handle);
+out:
+ if (meta_ac)
+ ocfs2_free_alloc_context(meta_ac);
+ if (ret < 0)
+ return ret;
+ } else
+ xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data;
+
+ if (!(le16_to_cpu(xblk->xb_flags) & OCFS2_XATTR_INDEXED)) {
+ /* Set extended attribute into external block */
+ ret = ocfs2_xattr_set_entry(inode, xi, xs, OCFS2_HAS_XATTR_FL);
+ if (!ret || ret != -ENOSPC)
+ goto end;
+
+ ret = ocfs2_xattr_create_index_block(inode, xs);
+ if (ret)
+ goto end;
+ }
+
+ ret = ocfs2_xattr_set_entry_index_block(inode, xi, xs);
+ if (!ret && xblk->xb_attrs.xb_root.xt_list.l_next_free_rec == 0)
+ ret = ocfs2_restore_xattr_block(inode, xs);
+
+end:
+
+ return ret;
+}
+
+/*
+ * ocfs2_xattr_set()
+ *
+ * Set, replace or remove an extended attribute for this inode.
+ * value is NULL to remove an existing extended attribute, else either
+ * create or replace an extended attribute.
+ */
+int ocfs2_xattr_set(struct inode *inode,
+ int name_index,
+ const char *name,
+ const void *value,
+ size_t value_len,
+ int flags)
+{
+ struct buffer_head *di_bh = NULL;
+ struct ocfs2_dinode *di;
+ int ret;
+ u16 i, blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+
+ struct ocfs2_xattr_info xi = {
+ .name_index = name_index,
+ .name = name,
+ .value = value,
+ .value_len = value_len,
+ };
+
+ struct ocfs2_xattr_search xis = {
+ .not_found = -ENODATA,
+ };
+
+ struct ocfs2_xattr_search xbs = {
+ .not_found = -ENODATA,
+ };
+
+ if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb)))
+ return -EOPNOTSUPP;
+
+ ret = ocfs2_inode_lock(inode, &di_bh, 1);
+ if (ret < 0) {
+ mlog_errno(ret);
+ return ret;
+ }
+ xis.inode_bh = xbs.inode_bh = di_bh;
+ di = (struct ocfs2_dinode *)di_bh->b_data;
+
+ down_write(&OCFS2_I(inode)->ip_xattr_sem);
+ /*
+ * Scan inode and external block to find the same name
+ * extended attribute and collect search infomation.
+ */
+ ret = ocfs2_xattr_ibody_find(inode, name_index, name, &xis);
+ if (ret)
+ goto cleanup;
+ if (xis.not_found) {
+ ret = ocfs2_xattr_block_find(inode, name_index, name, &xbs);
+ if (ret)
+ goto cleanup;
+ }
+
+ if (xis.not_found && xbs.not_found) {
+ ret = -ENODATA;
+ if (flags & XATTR_REPLACE)
+ goto cleanup;
+ ret = 0;
+ if (!value)
+ goto cleanup;
+ } else {
+ ret = -EEXIST;
+ if (flags & XATTR_CREATE)
+ goto cleanup;
+ }
+
+ if (!value) {
+ /* Remove existing extended attribute */
+ if (!xis.not_found)
+ ret = ocfs2_xattr_ibody_set(inode, &xi, &xis);
+ else if (!xbs.not_found)
+ ret = ocfs2_xattr_block_set(inode, &xi, &xbs);
+ } else {
+ /* We always try to set extended attribute into inode first*/
+ ret = ocfs2_xattr_ibody_set(inode, &xi, &xis);
+ if (!ret && !xbs.not_found) {
+ /*
+ * If succeed and that extended attribute existing in
+ * external block, then we will remove it.
+ */
+ xi.value = NULL;
+ xi.value_len = 0;
+ ret = ocfs2_xattr_block_set(inode, &xi, &xbs);
+ } else if (ret == -ENOSPC) {
+ if (di->i_xattr_loc && !xbs.xattr_bh) {
+ ret = ocfs2_xattr_block_find(inode, name_index,
+ name, &xbs);
+ if (ret)
+ goto cleanup;
+ }
+ /*
+ * If no space in inode, we will set extended attribute
+ * into external block.
+ */
+ ret = ocfs2_xattr_block_set(inode, &xi, &xbs);
+ if (ret)
+ goto cleanup;
+ if (!xis.not_found) {
+ /*
+ * If succeed and that extended attribute
+ * existing in inode, we will remove it.
+ */
+ xi.value = NULL;
+ xi.value_len = 0;
+ ret = ocfs2_xattr_ibody_set(inode, &xi, &xis);
+ }
+ }
+ }
+cleanup:
+ up_write(&OCFS2_I(inode)->ip_xattr_sem);
+ ocfs2_inode_unlock(inode, 1);
+ brelse(di_bh);
+ brelse(xbs.xattr_bh);
+ for (i = 0; i < blk_per_bucket; i++)
+ brelse(xbs.bucket.bhs[i]);
+
+ return ret;
+}
+
+/*
+ * Find the xattr extent rec which may contains name_hash.
+ * e_cpos will be the first name hash of the xattr rec.
+ * el must be the ocfs2_xattr_header.xb_attrs.xb_root.xt_list.
+ */
+static int ocfs2_xattr_get_rec(struct inode *inode,
+ u32 name_hash,
+ u64 *p_blkno,
+ u32 *e_cpos,
+ u32 *num_clusters,
+ struct ocfs2_extent_list *el)
+{
+ int ret = 0, i;
+ struct buffer_head *eb_bh = NULL;
+ struct ocfs2_extent_block *eb;
+ struct ocfs2_extent_rec *rec = NULL;
+ u64 e_blkno = 0;
+
+ if (el->l_tree_depth) {
+ ret = ocfs2_find_leaf(inode, el, name_hash, &eb_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ eb = (struct ocfs2_extent_block *) eb_bh->b_data;
+ el = &eb->h_list;
+
+ if (el->l_tree_depth) {
+ ocfs2_error(inode->i_sb,
+ "Inode %lu has non zero tree depth in "
+ "xattr tree block %llu\n", inode->i_ino,
+ (unsigned long long)eb_bh->b_blocknr);
+ ret = -EROFS;
+ goto out;
+ }
+ }
+
+ for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
+ rec = &el->l_recs[i];
+
+ if (le32_to_cpu(rec->e_cpos) <= name_hash) {
+ e_blkno = le64_to_cpu(rec->e_blkno);
+ break;
+ }
+ }
+
+ if (!e_blkno) {
+ ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
+ "record (%u, %u, 0) in xattr", inode->i_ino,
+ le32_to_cpu(rec->e_cpos),
+ ocfs2_rec_clusters(el, rec));
+ ret = -EROFS;
+ goto out;
+ }
+
+ *p_blkno = le64_to_cpu(rec->e_blkno);
+ *num_clusters = le16_to_cpu(rec->e_leaf_clusters);
+ if (e_cpos)
+ *e_cpos = le32_to_cpu(rec->e_cpos);
+out:
+ brelse(eb_bh);
+ return ret;
+}
+
+typedef int (xattr_bucket_func)(struct inode *inode,
+ struct ocfs2_xattr_bucket *bucket,
+ void *para);
+
+static int ocfs2_find_xe_in_bucket(struct inode *inode,
+ struct buffer_head *header_bh,
+ int name_index,
+ const char *name,
+ u32 name_hash,
+ u16 *xe_index,
+ int *found)
+{
+ int i, ret = 0, cmp = 1, block_off, new_offset;
+ struct ocfs2_xattr_header *xh =
+ (struct ocfs2_xattr_header *)header_bh->b_data;
+ size_t name_len = strlen(name);
+ struct ocfs2_xattr_entry *xe = NULL;
+ struct buffer_head *name_bh = NULL;
+ char *xe_name;
+
+ /*
+ * We don't use binary search in the bucket because there
+ * may be multiple entries with the same name hash.
+ */
+ for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
+ xe = &xh->xh_entries[i];
+
+ if (name_hash > le32_to_cpu(xe->xe_name_hash))
+ continue;
+ else if (name_hash < le32_to_cpu(xe->xe_name_hash))
+ break;
+
+ cmp = name_index - ocfs2_xattr_get_type(xe);
+ if (!cmp)
+ cmp = name_len - xe->xe_name_len;
+ if (cmp)
+ continue;
+
+ ret = ocfs2_xattr_bucket_get_name_value(inode,
+ xh,
+ i,
+ &block_off,
+ &new_offset);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+
+ ret = ocfs2_read_block(inode, header_bh->b_blocknr + block_off,
+ &name_bh);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+ xe_name = name_bh->b_data + new_offset;
+
+ cmp = memcmp(name, xe_name, name_len);
+ brelse(name_bh);
+ name_bh = NULL;
+
+ if (cmp == 0) {
+ *xe_index = i;
+ *found = 1;
+ ret = 0;
+ break;
+ }
+ }
+
+ return ret;
+}
+
+/*
+ * Find the specified xattr entry in a series of buckets.
+ * This series start from p_blkno and last for num_clusters.
+ * The ocfs2_xattr_header.xh_num_buckets of the first bucket contains
+ * the num of the valid buckets.
+ *
+ * Return the buffer_head this xattr should reside in. And if the xattr's
+ * hash is in the gap of 2 buckets, return the lower bucket.
+ */
+static int ocfs2_xattr_bucket_find(struct inode *inode,
+ int name_index,
+ const char *name,
+ u32 name_hash,
+ u64 p_blkno,
+ u32 first_hash,
+ u32 num_clusters,
+ struct ocfs2_xattr_search *xs)
+{
+ int ret, found = 0;
+ struct buffer_head *bh = NULL;
+ struct buffer_head *lower_bh = NULL;
+ struct ocfs2_xattr_header *xh = NULL;
+ struct ocfs2_xattr_entry *xe = NULL;
+ u16 index = 0;
+ u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+ int low_bucket = 0, bucket, high_bucket;
+ u32 last_hash;
+ u64 blkno;
+
+ ret = ocfs2_read_block(inode, p_blkno, &bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ xh = (struct ocfs2_xattr_header *)bh->b_data;
+ high_bucket = le16_to_cpu(xh->xh_num_buckets) - 1;
+
+ while (low_bucket <= high_bucket) {
+ brelse(bh);
+ bh = NULL;
+ bucket = (low_bucket + high_bucket) / 2;
+
+ blkno = p_blkno + bucket * blk_per_bucket;
+
+ ret = ocfs2_read_block(inode, blkno, &bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ xh = (struct ocfs2_xattr_header *)bh->b_data;
+ xe = &xh->xh_entries[0];
+ if (name_hash < le32_to_cpu(xe->xe_name_hash)) {
+ high_bucket = bucket - 1;
+ continue;
+ }
+
+ /*
+ * Check whether the hash of the last entry in our
+ * bucket is larger than the search one. for an empty
+ * bucket, the last one is also the first one.
+ */
+ if (xh->xh_count)
+ xe = &xh->xh_entries[le16_to_cpu(xh->xh_count) - 1];
+
+ last_hash = le32_to_cpu(xe->xe_name_hash);
+
+ /* record lower_bh which may be the insert place. */
+ brelse(lower_bh);
+ lower_bh = bh;
+ bh = NULL;
+
+ if (name_hash > le32_to_cpu(xe->xe_name_hash)) {
+ low_bucket = bucket + 1;
+ continue;
+ }
+
+ /* the searched xattr should reside in this bucket if exists. */
+ ret = ocfs2_find_xe_in_bucket(inode, lower_bh,
+ name_index, name, name_hash,
+ &index, &found);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ break;
+ }
+
+ /*
+ * Record the bucket we have found.
+ * When the xattr's hash value is in the gap of 2 buckets, we will
+ * always set it to the previous bucket.
+ */
+ if (!lower_bh) {
+ /*
+ * We can't find any bucket whose first name_hash is less
+ * than the find name_hash.
+ */
+ BUG_ON(bh->b_blocknr != p_blkno);
+ lower_bh = bh;
+ bh = NULL;
+ }
+ xs->bucket.bhs[0] = lower_bh;
+ xs->bucket.xh = (struct ocfs2_xattr_header *)
+ xs->bucket.bhs[0]->b_data;
+ lower_bh = NULL;
+
+ xs->header = xs->bucket.xh;
+ xs->base = xs->bucket.bhs[0]->b_data;
+ xs->end = xs->base + inode->i_sb->s_blocksize;
+
+ if (found) {
+ /*
+ * If we have found the xattr enty, read all the blocks in
+ * this bucket.
+ */
+ ret = ocfs2_read_blocks(inode, xs->bucket.bhs[0]->b_blocknr + 1,
+ blk_per_bucket - 1, &xs->bucket.bhs[1],
+ 0);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ xs->here = &xs->header->xh_entries[index];
+ mlog(0, "find xattr %s in bucket %llu, entry = %u\n", name,
+ (unsigned long long)xs->bucket.bhs[0]->b_blocknr, index);
+ } else
+ ret = -ENODATA;
+
+out:
+ brelse(bh);
+ brelse(lower_bh);
+ return ret;
+}
+
+static int ocfs2_xattr_index_block_find(struct inode *inode,
+ struct buffer_head *root_bh,
+ int name_index,
+ const char *name,
+ struct ocfs2_xattr_search *xs)
+{
+ int ret;
+ struct ocfs2_xattr_block *xb =
+ (struct ocfs2_xattr_block *)root_bh->b_data;
+ struct ocfs2_xattr_tree_root *xb_root = &xb->xb_attrs.xb_root;
+ struct ocfs2_extent_list *el = &xb_root->xt_list;
+ u64 p_blkno = 0;
+ u32 first_hash, num_clusters = 0;
+ u32 name_hash = ocfs2_xattr_name_hash(inode, name, strlen(name));
+
+ if (le16_to_cpu(el->l_next_free_rec) == 0)
+ return -ENODATA;
+
+ mlog(0, "find xattr %s, hash = %u, index = %d in xattr tree\n",
+ name, name_hash, name_index);
+
+ ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, &first_hash,
+ &num_clusters, el);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ BUG_ON(p_blkno == 0 || num_clusters == 0 || first_hash > name_hash);
+
+ mlog(0, "find xattr extent rec %u clusters from %llu, the first hash "
+ "in the rec is %u\n", num_clusters, p_blkno, first_hash);
+
+ ret = ocfs2_xattr_bucket_find(inode, name_index, name, name_hash,
+ p_blkno, first_hash, num_clusters, xs);
+
+out:
+ return ret;
+}
+
+static int ocfs2_iterate_xattr_buckets(struct inode *inode,
+ u64 blkno,
+ u32 clusters,
+ xattr_bucket_func *func,
+ void *para)
+{
+ int i, j, ret = 0;
+ int blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+ u32 bpc = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb));
+ u32 num_buckets = clusters * bpc;
+ struct ocfs2_xattr_bucket bucket;
+
+ memset(&bucket, 0, sizeof(bucket));
+
+ mlog(0, "iterating xattr buckets in %u clusters starting from %llu\n",
+ clusters, blkno);
+
+ for (i = 0; i < num_buckets; i++, blkno += blk_per_bucket) {
+ ret = ocfs2_read_blocks(inode, blkno, blk_per_bucket,
+ bucket.bhs, 0);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ bucket.xh = (struct ocfs2_xattr_header *)bucket.bhs[0]->b_data;
+ /*
+ * The real bucket num in this series of blocks is stored
+ * in the 1st bucket.
+ */
+ if (i == 0)
+ num_buckets = le16_to_cpu(bucket.xh->xh_num_buckets);
+
+ mlog(0, "iterating xattr bucket %llu, first hash %u\n", blkno,
+ le32_to_cpu(bucket.xh->xh_entries[0].xe_name_hash));
+ if (func) {
+ ret = func(inode, &bucket, para);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+ }
+
+ for (j = 0; j < blk_per_bucket; j++)
+ brelse(bucket.bhs[j]);
+ memset(&bucket, 0, sizeof(bucket));
+ }
+
+out:
+ for (j = 0; j < blk_per_bucket; j++)
+ brelse(bucket.bhs[j]);
+
+ return ret;
+}
+
+struct ocfs2_xattr_tree_list {
+ char *buffer;
+ size_t buffer_size;
+ size_t result;
+};
+
+static int ocfs2_xattr_bucket_get_name_value(struct inode *inode,
+ struct ocfs2_xattr_header *xh,
+ int index,
+ int *block_off,
+ int *new_offset)
+{
+ u16 name_offset;
+
+ if (index < 0 || index >= le16_to_cpu(xh->xh_count))
+ return -EINVAL;
+
+ name_offset = le16_to_cpu(xh->xh_entries[index].xe_name_offset);
+
+ *block_off = name_offset >> inode->i_sb->s_blocksize_bits;
+ *new_offset = name_offset % inode->i_sb->s_blocksize;
+
+ return 0;
+}
+
+static int ocfs2_list_xattr_bucket(struct inode *inode,
+ struct ocfs2_xattr_bucket *bucket,
+ void *para)
+{
+ int ret = 0, type;
+ struct ocfs2_xattr_tree_list *xl = (struct ocfs2_xattr_tree_list *)para;
+ int i, block_off, new_offset;
+ const char *prefix, *name;
+
+ for (i = 0 ; i < le16_to_cpu(bucket->xh->xh_count); i++) {
+ struct ocfs2_xattr_entry *entry = &bucket->xh->xh_entries[i];
+ type = ocfs2_xattr_get_type(entry);
+ prefix = ocfs2_xattr_prefix(type);
+
+ if (prefix) {
+ ret = ocfs2_xattr_bucket_get_name_value(inode,
+ bucket->xh,
+ i,
+ &block_off,
+ &new_offset);
+ if (ret)
+ break;
+
+ name = (const char *)bucket->bhs[block_off]->b_data +
+ new_offset;
+ ret = ocfs2_xattr_list_entry(xl->buffer,
+ xl->buffer_size,
+ &xl->result,
+ prefix, name,
+ entry->xe_name_len);
+ if (ret)
+ break;
+ }
+ }
+
+ return ret;
+}
+
+static int ocfs2_xattr_tree_list_index_block(struct inode *inode,
+ struct ocfs2_xattr_tree_root *xt,
+ char *buffer,
+ size_t buffer_size)
+{
+ struct ocfs2_extent_list *el = &xt->xt_list;
+ int ret = 0;
+ u32 name_hash = UINT_MAX, e_cpos = 0, num_clusters = 0;
+ u64 p_blkno = 0;
+ struct ocfs2_xattr_tree_list xl = {
+ .buffer = buffer,
+ .buffer_size = buffer_size,
+ .result = 0,
+ };
+
+ if (le16_to_cpu(el->l_next_free_rec) == 0)
+ return 0;
+
+ while (name_hash > 0) {
+ ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno,
+ &e_cpos, &num_clusters, el);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_iterate_xattr_buckets(inode, p_blkno, num_clusters,
+ ocfs2_list_xattr_bucket,
+ &xl);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (e_cpos == 0)
+ break;
+
+ name_hash = e_cpos - 1;
+ }
+
+ ret = xl.result;
+out:
+ return ret;
+}
+
+static int cmp_xe(const void *a, const void *b)
+{
+ const struct ocfs2_xattr_entry *l = a, *r = b;
+ u32 l_hash = le32_to_cpu(l->xe_name_hash);
+ u32 r_hash = le32_to_cpu(r->xe_name_hash);
+
+ if (l_hash > r_hash)
+ return 1;
+ if (l_hash < r_hash)
+ return -1;
+ return 0;
+}
+
+static void swap_xe(void *a, void *b, int size)
+{
+ struct ocfs2_xattr_entry *l = a, *r = b, tmp;
+
+ tmp = *l;
+ memcpy(l, r, sizeof(struct ocfs2_xattr_entry));
+ memcpy(r, &tmp, sizeof(struct ocfs2_xattr_entry));
+}
+
+/*
+ * When the ocfs2_xattr_block is filled up, new bucket will be created
+ * and all the xattr entries will be moved to the new bucket.
+ * Note: we need to sort the entries since they are not saved in order
+ * in the ocfs2_xattr_block.
+ */
+static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode,
+ struct buffer_head *xb_bh,
+ struct buffer_head *xh_bh,
+ struct buffer_head *data_bh)
+{
+ int i, blocksize = inode->i_sb->s_blocksize;
+ u16 offset, size, off_change;
+ struct ocfs2_xattr_entry *xe;
+ struct ocfs2_xattr_block *xb =
+ (struct ocfs2_xattr_block *)xb_bh->b_data;
+ struct ocfs2_xattr_header *xb_xh = &xb->xb_attrs.xb_header;
+ struct ocfs2_xattr_header *xh =
+ (struct ocfs2_xattr_header *)xh_bh->b_data;
+ u16 count = le16_to_cpu(xb_xh->xh_count);
+ char *target = xh_bh->b_data, *src = xb_bh->b_data;
+
+ mlog(0, "cp xattr from block %llu to bucket %llu\n",
+ (unsigned long long)xb_bh->b_blocknr,
+ (unsigned long long)xh_bh->b_blocknr);
+
+ memset(xh_bh->b_data, 0, blocksize);
+ if (data_bh)
+ memset(data_bh->b_data, 0, blocksize);
+ /*
+ * Since the xe_name_offset is based on ocfs2_xattr_header,
+ * there is a offset change corresponding to the change of
+ * ocfs2_xattr_header's position.
+ */
+ off_change = offsetof(struct ocfs2_xattr_block, xb_attrs.xb_header);
+ xe = &xb_xh->xh_entries[count - 1];
+ offset = le16_to_cpu(xe->xe_name_offset) + off_change;
+ size = blocksize - offset;
+
+ /* copy all the names and values. */
+ if (data_bh)
+ target = data_bh->b_data;
+ memcpy(target + offset, src + offset, size);
+
+ /* Init new header now. */
+ xh->xh_count = xb_xh->xh_count;
+ xh->xh_num_buckets = cpu_to_le16(1);
+ xh->xh_name_value_len = cpu_to_le16(size);
+ xh->xh_free_start = cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE - size);
+
+ /* copy all the entries. */
+ target = xh_bh->b_data;
+ offset = offsetof(struct ocfs2_xattr_header, xh_entries);
+ size = count * sizeof(struct ocfs2_xattr_entry);
+ memcpy(target + offset, (char *)xb_xh + offset, size);
+
+ /* Change the xe offset for all the xe because of the move. */
+ off_change = OCFS2_XATTR_BUCKET_SIZE - blocksize +
+ offsetof(struct ocfs2_xattr_block, xb_attrs.xb_header);
+ for (i = 0; i < count; i++)
+ le16_add_cpu(&xh->xh_entries[i].xe_name_offset, off_change);
+
+ mlog(0, "copy entry: start = %u, size = %u, offset_change = %u\n",
+ offset, size, off_change);
+
+ sort(target + offset, count, sizeof(struct ocfs2_xattr_entry),
+ cmp_xe, swap_xe);
+}
+
+/*
+ * After we move xattr from block to index btree, we have to
+ * update ocfs2_xattr_search to the new xe and base.
+ *
+ * When the entry is in xattr block, xattr_bh indicates the storage place.
+ * While if the entry is in index b-tree, "bucket" indicates the
+ * real place of the xattr.
+ */
+static int ocfs2_xattr_update_xattr_search(struct inode *inode,
+ struct ocfs2_xattr_search *xs,
+ struct buffer_head *old_bh,
+ struct buffer_head *new_bh)
+{
+ int ret = 0;
+ char *buf = old_bh->b_data;
+ struct ocfs2_xattr_block *old_xb = (struct ocfs2_xattr_block *)buf;
+ struct ocfs2_xattr_header *old_xh = &old_xb->xb_attrs.xb_header;
+ int i, blocksize = inode->i_sb->s_blocksize;
+ u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+
+ xs->bucket.bhs[0] = new_bh;
+ get_bh(new_bh);
+ xs->bucket.xh = (struct ocfs2_xattr_header *)xs->bucket.bhs[0]->b_data;
+ xs->header = xs->bucket.xh;
+
+ xs->base = new_bh->b_data;
+ xs->end = xs->base + inode->i_sb->s_blocksize;
+
+ if (!xs->not_found) {
+ if (OCFS2_XATTR_BUCKET_SIZE != blocksize) {
+ ret = ocfs2_read_blocks(inode,
+ xs->bucket.bhs[0]->b_blocknr + 1,
+ blk_per_bucket - 1, &xs->bucket.bhs[1],
+ 0);
+ if (ret) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ i = xs->here - old_xh->xh_entries;
+ xs->here = &xs->header->xh_entries[i];
+ }
+ }
+
+ return ret;
+}
+
+static int ocfs2_xattr_create_index_block(struct inode *inode,
+ struct ocfs2_xattr_search *xs)
+{
+ int ret, credits = OCFS2_SUBALLOC_ALLOC;
+ u32 bit_off, len;
+ u64 blkno;
+ handle_t *handle;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_alloc_context *data_ac;
+ struct buffer_head *xh_bh = NULL, *data_bh = NULL;
+ struct buffer_head *xb_bh = xs->xattr_bh;
+ struct ocfs2_xattr_block *xb =
+ (struct ocfs2_xattr_block *)xb_bh->b_data;
+ struct ocfs2_xattr_tree_root *xr;
+ u16 xb_flags = le16_to_cpu(xb->xb_flags);
+ u16 bpb = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+
+ mlog(0, "create xattr index block for %llu\n",
+ (unsigned long long)xb_bh->b_blocknr);
+
+ BUG_ON(xb_flags & OCFS2_XATTR_INDEXED);
+
+ ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * XXX:
+ * We can use this lock for now, and maybe move to a dedicated mutex
+ * if performance becomes a problem later.
+ */
+ down_write(&oi->ip_alloc_sem);
+
+ /*
+ * 3 more credits, one for xattr block update, one for the 1st block
+ * of the new xattr bucket and one for the value/data.
+ */
+ credits += 3;
+ handle = ocfs2_start_trans(osb, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out_sem;
+ }
+
+ ret = ocfs2_journal_access(handle, inode, xb_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, &len);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ /*
+ * The bucket may spread in many blocks, and
+ * we will only touch the 1st block and the last block
+ * in the whole bucket(one for entry and one for data).
+ */
+ blkno = ocfs2_clusters_to_blocks(inode->i_sb, bit_off);
+
+ mlog(0, "allocate 1 cluster from %llu to xattr block\n", blkno);
+
+ xh_bh = sb_getblk(inode->i_sb, blkno);
+ if (!xh_bh) {
+ ret = -EIO;
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ ocfs2_set_new_buffer_uptodate(inode, xh_bh);
+
+ ret = ocfs2_journal_access(handle, inode, xh_bh,
+ OCFS2_JOURNAL_ACCESS_CREATE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ if (bpb > 1) {
+ data_bh = sb_getblk(inode->i_sb, blkno + bpb - 1);
+ if (!data_bh) {
+ ret = -EIO;
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ ocfs2_set_new_buffer_uptodate(inode, data_bh);
+
+ ret = ocfs2_journal_access(handle, inode, data_bh,
+ OCFS2_JOURNAL_ACCESS_CREATE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+ }
+
+ ocfs2_cp_xattr_block_to_bucket(inode, xb_bh, xh_bh, data_bh);
+
+ ocfs2_journal_dirty(handle, xh_bh);
+ if (data_bh)
+ ocfs2_journal_dirty(handle, data_bh);
+
+ ocfs2_xattr_update_xattr_search(inode, xs, xb_bh, xh_bh);
+
+ /* Change from ocfs2_xattr_header to ocfs2_xattr_tree_root */
+ memset(&xb->xb_attrs, 0, inode->i_sb->s_blocksize -
+ offsetof(struct ocfs2_xattr_block, xb_attrs));
+
+ xr = &xb->xb_attrs.xb_root;
+ xr->xt_clusters = cpu_to_le32(1);
+ xr->xt_last_eb_blk = 0;
+ xr->xt_list.l_tree_depth = 0;
+ xr->xt_list.l_count = cpu_to_le16(ocfs2_xattr_recs_per_xb(inode->i_sb));
+ xr->xt_list.l_next_free_rec = cpu_to_le16(1);
+
+ xr->xt_list.l_recs[0].e_cpos = 0;
+ xr->xt_list.l_recs[0].e_blkno = cpu_to_le64(blkno);
+ xr->xt_list.l_recs[0].e_leaf_clusters = cpu_to_le16(1);
+
+ xb->xb_flags = cpu_to_le16(xb_flags | OCFS2_XATTR_INDEXED);
+
+ ret = ocfs2_journal_dirty(handle, xb_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+out_commit:
+ ocfs2_commit_trans(osb, handle);
+
+out_sem:
+ up_write(&oi->ip_alloc_sem);
+
+out:
+ if (data_ac)
+ ocfs2_free_alloc_context(data_ac);
+
+ brelse(xh_bh);
+ brelse(data_bh);
+
+ return ret;
+}
+
+static int cmp_xe_offset(const void *a, const void *b)
+{
+ const struct ocfs2_xattr_entry *l = a, *r = b;
+ u32 l_name_offset = le16_to_cpu(l->xe_name_offset);
+ u32 r_name_offset = le16_to_cpu(r->xe_name_offset);
+
+ if (l_name_offset < r_name_offset)
+ return 1;
+ if (l_name_offset > r_name_offset)
+ return -1;
+ return 0;
+}
+
+/*
+ * defrag a xattr bucket if we find that the bucket has some
+ * holes beteen name/value pairs.
+ * We will move all the name/value pairs to the end of the bucket
+ * so that we can spare some space for insertion.
+ */
+static int ocfs2_defrag_xattr_bucket(struct inode *inode,
+ struct ocfs2_xattr_bucket *bucket)
+{
+ int ret, i;
+ size_t end, offset, len, value_len;
+ struct ocfs2_xattr_header *xh;
+ char *entries, *buf, *bucket_buf = NULL;
+ u64 blkno = bucket->bhs[0]->b_blocknr;
+ u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+ u16 xh_free_start;
+ size_t blocksize = inode->i_sb->s_blocksize;
+ handle_t *handle;
+ struct buffer_head **bhs;
+ struct ocfs2_xattr_entry *xe;
+
+ bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket,
+ GFP_NOFS);
+ if (!bhs)
+ return -ENOMEM;
+
+ ret = ocfs2_read_blocks(inode, blkno, blk_per_bucket, bhs, 0);
+ if (ret)
+ goto out;
+
+ /*
+ * In order to make the operation more efficient and generic,
+ * we copy all the blocks into a contiguous memory and do the
+ * defragment there, so if anything is error, we will not touch
+ * the real block.
+ */
+ bucket_buf = kmalloc(OCFS2_XATTR_BUCKET_SIZE, GFP_NOFS);
+ if (!bucket_buf) {
+ ret = -EIO;
+ goto out;
+ }
+
+ buf = bucket_buf;
+ for (i = 0; i < blk_per_bucket; i++, buf += blocksize)
+ memcpy(buf, bhs[i]->b_data, blocksize);
+
+ handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), blk_per_bucket);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ handle = NULL;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ for (i = 0; i < blk_per_bucket; i++) {
+ ret = ocfs2_journal_access(handle, inode, bhs[i],
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto commit;
+ }
+ }
+
+ xh = (struct ocfs2_xattr_header *)bucket_buf;
+ entries = (char *)xh->xh_entries;
+ xh_free_start = le16_to_cpu(xh->xh_free_start);
+
+ mlog(0, "adjust xattr bucket in %llu, count = %u, "
+ "xh_free_start = %u, xh_name_value_len = %u.\n",
+ blkno, le16_to_cpu(xh->xh_count), xh_free_start,
+ le16_to_cpu(xh->xh_name_value_len));
+
+ /*
+ * sort all the entries by their offset.
+ * the largest will be the first, so that we can
+ * move them to the end one by one.
+ */
+ sort(entries, le16_to_cpu(xh->xh_count),
+ sizeof(struct ocfs2_xattr_entry),
+ cmp_xe_offset, swap_xe);
+
+ /* Move all name/values to the end of the bucket. */
+ xe = xh->xh_entries;
+ end = OCFS2_XATTR_BUCKET_SIZE;
+ for (i = 0; i < le16_to_cpu(xh->xh_count); i++, xe++) {
+ offset = le16_to_cpu(xe->xe_name_offset);
+ if (ocfs2_xattr_is_local(xe))
+ value_len = OCFS2_XATTR_SIZE(
+ le64_to_cpu(xe->xe_value_size));
+ else
+ value_len = OCFS2_XATTR_ROOT_SIZE;
+ len = OCFS2_XATTR_SIZE(xe->xe_name_len) + value_len;
+
+ /*
+ * We must make sure that the name/value pair
+ * exist in the same block. So adjust end to
+ * the previous block end if needed.
+ */
+ if (((end - len) / blocksize !=
+ (end - 1) / blocksize))
+ end = end - end % blocksize;
+
+ if (end > offset + len) {
+ memmove(bucket_buf + end - len,
+ bucket_buf + offset, len);
+ xe->xe_name_offset = cpu_to_le16(end - len);
+ }
+
+ mlog_bug_on_msg(end < offset + len, "Defrag check failed for "
+ "bucket %llu\n", (unsigned long long)blkno);
+
+ end -= len;
+ }
+
+ mlog_bug_on_msg(xh_free_start > end, "Defrag check failed for "
+ "bucket %llu\n", (unsigned long long)blkno);
+
+ if (xh_free_start == end)
+ goto commit;
+
+ memset(bucket_buf + xh_free_start, 0, end - xh_free_start);
+ xh->xh_free_start = cpu_to_le16(end);
+
+ /* sort the entries by their name_hash. */
+ sort(entries, le16_to_cpu(xh->xh_count),
+ sizeof(struct ocfs2_xattr_entry),
+ cmp_xe, swap_xe);
+
+ buf = bucket_buf;
+ for (i = 0; i < blk_per_bucket; i++, buf += blocksize) {
+ memcpy(bhs[i]->b_data, buf, blocksize);
+ ocfs2_journal_dirty(handle, bhs[i]);
+ }
+
+commit:
+ ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
+out:
+
+ if (bhs) {
+ for (i = 0; i < blk_per_bucket; i++)
+ brelse(bhs[i]);
+ }
+ kfree(bhs);
+
+ kfree(bucket_buf);
+ return ret;
+}
+
+/*
+ * Move half nums of the xattr bucket in the previous cluster to this new
+ * cluster. We only touch the last cluster of the previous extend record.
+ *
+ * first_bh is the first buffer_head of a series of bucket in the same
+ * extent rec and header_bh is the header of one bucket in this cluster.
+ * They will be updated if we move the data header_bh contains to the new
+ * cluster. first_hash will be set as the 1st xe's name_hash of the new cluster.
+ */
+static int ocfs2_mv_xattr_bucket_cross_cluster(struct inode *inode,
+ handle_t *handle,
+ struct buffer_head **first_bh,
+ struct buffer_head **header_bh,
+ u64 new_blkno,
+ u64 prev_blkno,
+ u32 num_clusters,
+ u32 *first_hash)
+{
+ int i, ret, credits;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
+ int num_buckets = ocfs2_xattr_buckets_per_cluster(osb);
+ int blocksize = inode->i_sb->s_blocksize;
+ struct buffer_head *old_bh, *new_bh, *prev_bh, *new_first_bh = NULL;
+ struct ocfs2_xattr_header *new_xh;
+ struct ocfs2_xattr_header *xh =
+ (struct ocfs2_xattr_header *)((*first_bh)->b_data);
+
+ BUG_ON(le16_to_cpu(xh->xh_num_buckets) < num_buckets);
+ BUG_ON(OCFS2_XATTR_BUCKET_SIZE == osb->s_clustersize);
+
+ prev_bh = *first_bh;
+ get_bh(prev_bh);
+ xh = (struct ocfs2_xattr_header *)prev_bh->b_data;
+
+ prev_blkno += (num_clusters - 1) * bpc + bpc / 2;
+
+ mlog(0, "move half of xattrs in cluster %llu to %llu\n",
+ prev_blkno, new_blkno);
+
+ /*
+ * We need to update the 1st half of the new cluster and
+ * 1 more for the update of the 1st bucket of the previous
+ * extent record.
+ */
+ credits = bpc / 2 + 1;
+ ret = ocfs2_extend_trans(handle, credits);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_journal_access(handle, inode, prev_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ for (i = 0; i < bpc / 2; i++, prev_blkno++, new_blkno++) {
+ old_bh = new_bh = NULL;
+ new_bh = sb_getblk(inode->i_sb, new_blkno);
+ if (!new_bh) {
+ ret = -EIO;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ocfs2_set_new_buffer_uptodate(inode, new_bh);
+
+ ret = ocfs2_journal_access(handle, inode, new_bh,
+ OCFS2_JOURNAL_ACCESS_CREATE);
+ if (ret < 0) {
+ mlog_errno(ret);
+ brelse(new_bh);
+ goto out;
+ }
+
+ ret = ocfs2_read_block(inode, prev_blkno, &old_bh);
+ if (ret < 0) {
+ mlog_errno(ret);
+ brelse(new_bh);
+ goto out;
+ }
+
+ memcpy(new_bh->b_data, old_bh->b_data, blocksize);
+
+ if (i == 0) {
+ new_xh = (struct ocfs2_xattr_header *)new_bh->b_data;
+ new_xh->xh_num_buckets = cpu_to_le16(num_buckets / 2);
+
+ if (first_hash)
+ *first_hash = le32_to_cpu(
+ new_xh->xh_entries[0].xe_name_hash);
+ new_first_bh = new_bh;
+ get_bh(new_first_bh);
+ }
+
+ ocfs2_journal_dirty(handle, new_bh);
+
+ if (*header_bh == old_bh) {
+ brelse(*header_bh);
+ *header_bh = new_bh;
+ get_bh(*header_bh);
+
+ brelse(*first_bh);
+ *first_bh = new_first_bh;
+ get_bh(*first_bh);
+ }
+ brelse(new_bh);
+ brelse(old_bh);
+ }
+
+ le16_add_cpu(&xh->xh_num_buckets, -(num_buckets / 2));
+
+ ocfs2_journal_dirty(handle, prev_bh);
+out:
+ brelse(prev_bh);
+ brelse(new_first_bh);
+ return ret;
+}
+
+static int ocfs2_read_xattr_bucket(struct inode *inode,
+ u64 blkno,
+ struct buffer_head **bhs,
+ int new)
+{
+ int ret = 0;
+ u16 i, blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+
+ if (!new)
+ return ocfs2_read_blocks(inode, blkno,
+ blk_per_bucket, bhs, 0);
+
+ for (i = 0; i < blk_per_bucket; i++) {
+ bhs[i] = sb_getblk(inode->i_sb, blkno + i);
+ if (bhs[i] == NULL) {
+ ret = -EIO;
+ mlog_errno(ret);
+ break;
+ }
+ ocfs2_set_new_buffer_uptodate(inode, bhs[i]);
+ }
+
+ return ret;
+}
+
+/*
+ * Move half num of the xattrs in old bucket(blk) to new bucket(new_blk).
+ * first_hash will record the 1st hash of the new bucket.
+ */
+static int ocfs2_half_xattr_bucket(struct inode *inode,
+ handle_t *handle,
+ u64 blk,
+ u64 new_blk,
+ u32 *first_hash,
+ int new_bucket_head)
+{
+ int ret, i;
+ u16 count, start, len, name_value_len, xe_len, name_offset;
+ u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+ struct buffer_head **s_bhs, **t_bhs = NULL;
+ struct ocfs2_xattr_header *xh;
+ struct ocfs2_xattr_entry *xe;
+ int blocksize = inode->i_sb->s_blocksize;
+
+ mlog(0, "move half of xattrs from bucket %llu to %llu\n",
+ blk, new_blk);
+
+ s_bhs = kcalloc(blk_per_bucket, sizeof(struct buffer_head *), GFP_NOFS);
+ if (!s_bhs)
+ return -ENOMEM;
+
+ ret = ocfs2_read_xattr_bucket(inode, blk, s_bhs, 0);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_journal_access(handle, inode, s_bhs[0],
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ t_bhs = kcalloc(blk_per_bucket, sizeof(struct buffer_head *), GFP_NOFS);
+ if (!t_bhs) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = ocfs2_read_xattr_bucket(inode, new_blk, t_bhs, new_bucket_head);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ for (i = 0; i < blk_per_bucket; i++) {
+ ret = ocfs2_journal_access(handle, inode, t_bhs[i],
+ OCFS2_JOURNAL_ACCESS_CREATE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ /* copy the whole bucket to the new first. */
+ for (i = 0; i < blk_per_bucket; i++)
+ memcpy(t_bhs[i]->b_data, s_bhs[i]->b_data, blocksize);
+
+ /* update the new bucket. */
+ xh = (struct ocfs2_xattr_header *)t_bhs[0]->b_data;
+ count = le16_to_cpu(xh->xh_count);
+ start = count / 2;
+
+ /*
+ * Calculate the total name/value len and xh_free_start for
+ * the old bucket first.
+ */
+ name_offset = OCFS2_XATTR_BUCKET_SIZE;
+ name_value_len = 0;
+ for (i = 0; i < start; i++) {
+ xe = &xh->xh_entries[i];
+ xe_len = OCFS2_XATTR_SIZE(xe->xe_name_len);
+ if (ocfs2_xattr_is_local(xe))
+ xe_len +=
+ OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
+ else
+ xe_len += OCFS2_XATTR_ROOT_SIZE;
+ name_value_len += xe_len;
+ if (le16_to_cpu(xe->xe_name_offset) < name_offset)
+ name_offset = le16_to_cpu(xe->xe_name_offset);
+ }
+
+ /*
+ * Now begin the modification to the new bucket.
+ *
+ * In the new bucket, We just move the xattr entry to the beginning
+ * and don't touch the name/value. So there will be some holes in the
+ * bucket, and they will be removed when ocfs2_defrag_xattr_bucket is
+ * called.
+ */
+ xe = &xh->xh_entries[start];
+ len = sizeof(struct ocfs2_xattr_entry) * (count - start);
+ mlog(0, "mv xattr entry len %d from %d to %d\n", len,
+ (int)((char *)xe - (char *)xh),
+ (int)((char *)xh->xh_entries - (char *)xh));
+ memmove((char *)xh->xh_entries, (char *)xe, len);
+ xe = &xh->xh_entries[count - start];
+ len = sizeof(struct ocfs2_xattr_entry) * start;
+ memset((char *)xe, 0, len);
+
+ le16_add_cpu(&xh->xh_count, -start);
+ le16_add_cpu(&xh->xh_name_value_len, -name_value_len);
+
+ /* Calculate xh_free_start for the new bucket. */
+ xh->xh_free_start = cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE);
+ for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
+ xe = &xh->xh_entries[i];
+ xe_len = OCFS2_XATTR_SIZE(xe->xe_name_len);
+ if (ocfs2_xattr_is_local(xe))
+ xe_len +=
+ OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
+ else
+ xe_len += OCFS2_XATTR_ROOT_SIZE;
+ if (le16_to_cpu(xe->xe_name_offset) <
+ le16_to_cpu(xh->xh_free_start))
+ xh->xh_free_start = xe->xe_name_offset;
+ }
+
+ /* set xh->xh_num_buckets for the new xh. */
+ if (new_bucket_head)
+ xh->xh_num_buckets = cpu_to_le16(1);
+ else
+ xh->xh_num_buckets = 0;
+
+ for (i = 0; i < blk_per_bucket; i++) {
+ ocfs2_journal_dirty(handle, t_bhs[i]);
+ if (ret)
+ mlog_errno(ret);
+ }
+
+ /* store the first_hash of the new bucket. */
+ if (first_hash)
+ *first_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash);
+
+ /*
+ * Now only update the 1st block of the old bucket.
+ * Please note that the entry has been sorted already above.
+ */
+ xh = (struct ocfs2_xattr_header *)s_bhs[0]->b_data;
+ memset(&xh->xh_entries[start], 0,
+ sizeof(struct ocfs2_xattr_entry) * (count - start));
+ xh->xh_count = cpu_to_le16(start);
+ xh->xh_free_start = cpu_to_le16(name_offset);
+ xh->xh_name_value_len = cpu_to_le16(name_value_len);
+
+ ocfs2_journal_dirty(handle, s_bhs[0]);
+ if (ret)
+ mlog_errno(ret);
+
+out:
+ if (s_bhs) {
+ for (i = 0; i < blk_per_bucket; i++)
+ brelse(s_bhs[i]);
+ }
+ kfree(s_bhs);
+
+ if (t_bhs) {
+ for (i = 0; i < blk_per_bucket; i++)
+ brelse(t_bhs[i]);
+ }
+ kfree(t_bhs);
+
+ return ret;
+}
+
+/*
+ * Copy xattr from one bucket to another bucket.
+ *
+ * The caller must make sure that the journal transaction
+ * has enough space for journaling.
+ */
+static int ocfs2_cp_xattr_bucket(struct inode *inode,
+ handle_t *handle,
+ u64 s_blkno,
+ u64 t_blkno,
+ int t_is_new)
+{
+ int ret, i;
+ int blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+ int blocksize = inode->i_sb->s_blocksize;
+ struct buffer_head **s_bhs, **t_bhs = NULL;
+
+ BUG_ON(s_blkno == t_blkno);
+
+ mlog(0, "cp bucket %llu to %llu, target is %d\n",
+ s_blkno, t_blkno, t_is_new);
+
+ s_bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket,
+ GFP_NOFS);
+ if (!s_bhs)
+ return -ENOMEM;
+
+ ret = ocfs2_read_xattr_bucket(inode, s_blkno, s_bhs, 0);
+ if (ret)
+ goto out;
+
+ t_bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket,
+ GFP_NOFS);
+ if (!t_bhs) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = ocfs2_read_xattr_bucket(inode, t_blkno, t_bhs, t_is_new);
+ if (ret)
+ goto out;
+
+ for (i = 0; i < blk_per_bucket; i++) {
+ ret = ocfs2_journal_access(handle, inode, t_bhs[i],
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret)
+ goto out;
+ }
+
+ for (i = 0; i < blk_per_bucket; i++) {
+ memcpy(t_bhs[i]->b_data, s_bhs[i]->b_data, blocksize);
+ ocfs2_journal_dirty(handle, t_bhs[i]);
+ }
+
+out:
+ if (s_bhs) {
+ for (i = 0; i < blk_per_bucket; i++)
+ brelse(s_bhs[i]);
+ }
+ kfree(s_bhs);
+
+ if (t_bhs) {
+ for (i = 0; i < blk_per_bucket; i++)
+ brelse(t_bhs[i]);
+ }
+ kfree(t_bhs);
+
+ return ret;
+}
+
+/*
+ * Copy one xattr cluster from src_blk to to_blk.
+ * The to_blk will become the first bucket header of the cluster, so its
+ * xh_num_buckets will be initialized as the bucket num in the cluster.
+ */
+static int ocfs2_cp_xattr_cluster(struct inode *inode,
+ handle_t *handle,
+ struct buffer_head *first_bh,
+ u64 src_blk,
+ u64 to_blk,
+ u32 *first_hash)
+{
+ int i, ret, credits;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
+ int num_buckets = ocfs2_xattr_buckets_per_cluster(osb);
+ struct buffer_head *bh = NULL;
+ struct ocfs2_xattr_header *xh;
+ u64 to_blk_start = to_blk;
+
+ mlog(0, "cp xattrs from cluster %llu to %llu\n", src_blk, to_blk);
+
+ /*
+ * We need to update the new cluster and 1 more for the update of
+ * the 1st bucket of the previous extent rec.
+ */
+ credits = bpc + 1;
+ ret = ocfs2_extend_trans(handle, credits);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_journal_access(handle, inode, first_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ for (i = 0; i < num_buckets; i++) {
+ ret = ocfs2_cp_xattr_bucket(inode, handle,
+ src_blk, to_blk, 1);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ src_blk += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+ to_blk += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+ }
+
+ /* update the old bucket header. */
+ xh = (struct ocfs2_xattr_header *)first_bh->b_data;
+ le16_add_cpu(&xh->xh_num_buckets, -num_buckets);
+
+ ocfs2_journal_dirty(handle, first_bh);
+
+ /* update the new bucket header. */
+ ret = ocfs2_read_block(inode, to_blk_start, &bh);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_journal_access(handle, inode, bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ xh = (struct ocfs2_xattr_header *)bh->b_data;
+ xh->xh_num_buckets = cpu_to_le16(num_buckets);
+
+ ocfs2_journal_dirty(handle, bh);
+
+ if (first_hash)
+ *first_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash);
+out:
+ brelse(bh);
+ return ret;
+}
+
+/*
+ * Move half of the xattrs in this cluster to the new cluster.
+ * This function should only be called when bucket size == cluster size.
+ * Otherwise ocfs2_mv_xattr_bucket_cross_cluster should be used instead.
+ */
+static int ocfs2_half_xattr_cluster(struct inode *inode,
+ handle_t *handle,
+ u64 prev_blk,
+ u64 new_blk,
+ u32 *first_hash)
+{
+ u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+ int ret, credits = 2 * blk_per_bucket;
+
+ BUG_ON(OCFS2_XATTR_BUCKET_SIZE < OCFS2_SB(inode->i_sb)->s_clustersize);
+
+ ret = ocfs2_extend_trans(handle, credits);
+ if (ret) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ /* Move half of the xattr in start_blk to the next bucket. */
+ return ocfs2_half_xattr_bucket(inode, handle, prev_blk,
+ new_blk, first_hash, 1);
+}
+
+/*
+ * Move some xattrs from the old cluster to the new one since they are not
+ * contiguous in ocfs2 xattr tree.
+ *
+ * new_blk starts a new separate cluster, and we will move some xattrs from
+ * prev_blk to it. v_start will be set as the first name hash value in this
+ * new cluster so that it can be used as e_cpos during tree insertion and
+ * don't collide with our original b-tree operations. first_bh and header_bh
+ * will also be updated since they will be used in ocfs2_extend_xattr_bucket
+ * to extend the insert bucket.
+ *
+ * The problem is how much xattr should we move to the new one and when should
+ * we update first_bh and header_bh?
+ * 1. If cluster size > bucket size, that means the previous cluster has more
+ * than 1 bucket, so just move half nums of bucket into the new cluster and
+ * update the first_bh and header_bh if the insert bucket has been moved
+ * to the new cluster.
+ * 2. If cluster_size == bucket_size:
+ * a) If the previous extent rec has more than one cluster and the insert
+ * place isn't in the last cluster, copy the entire last cluster to the
+ * new one. This time, we don't need to upate the first_bh and header_bh
+ * since they will not be moved into the new cluster.
+ * b) Otherwise, move the bottom half of the xattrs in the last cluster into
+ * the new one. And we set the extend flag to zero if the insert place is
+ * moved into the new allocated cluster since no extend is needed.
+ */
+static int ocfs2_adjust_xattr_cross_cluster(struct inode *inode,
+ handle_t *handle,
+ struct buffer_head **first_bh,
+ struct buffer_head **header_bh,
+ u64 new_blk,
+ u64 prev_blk,
+ u32 prev_clusters,
+ u32 *v_start,
+ int *extend)
+{
+ int ret = 0;
+ int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
+
+ mlog(0, "adjust xattrs from cluster %llu len %u to %llu\n",
+ prev_blk, prev_clusters, new_blk);
+
+ if (ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)) > 1)
+ ret = ocfs2_mv_xattr_bucket_cross_cluster(inode,
+ handle,
+ first_bh,
+ header_bh,
+ new_blk,
+ prev_blk,
+ prev_clusters,
+ v_start);
+ else {
+ u64 last_blk = prev_blk + bpc * (prev_clusters - 1);
+
+ if (prev_clusters > 1 && (*header_bh)->b_blocknr != last_blk)
+ ret = ocfs2_cp_xattr_cluster(inode, handle, *first_bh,
+ last_blk, new_blk,
+ v_start);
+ else {
+ ret = ocfs2_half_xattr_cluster(inode, handle,
+ last_blk, new_blk,
+ v_start);
+
+ if ((*header_bh)->b_blocknr == last_blk && extend)
+ *extend = 0;
+ }
+ }
+
+ return ret;
+}
+
+/*
+ * Add a new cluster for xattr storage.
+ *
+ * If the new cluster is contiguous with the previous one, it will be
+ * appended to the same extent record, and num_clusters will be updated.
+ * If not, we will insert a new extent for it and move some xattrs in
+ * the last cluster into the new allocated one.
+ * We also need to limit the maximum size of a btree leaf, otherwise we'll
+ * lose the benefits of hashing because we'll have to search large leaves.
+ * So now the maximum size is OCFS2_MAX_XATTR_TREE_LEAF_SIZE(or clustersize,
+ * if it's bigger).
+ *
+ * first_bh is the first block of the previous extent rec and header_bh
+ * indicates the bucket we will insert the new xattrs. They will be updated
+ * when the header_bh is moved into the new cluster.
+ */
+static int ocfs2_add_new_xattr_cluster(struct inode *inode,
+ struct buffer_head *root_bh,
+ struct buffer_head **first_bh,
+ struct buffer_head **header_bh,
+ u32 *num_clusters,
+ u32 prev_cpos,
+ u64 prev_blkno,
+ int *extend)
+{
+ int ret, credits;
+ u16 bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
+ u32 prev_clusters = *num_clusters;
+ u32 clusters_to_add = 1, bit_off, num_bits, v_start = 0;
+ u64 block;
+ handle_t *handle = NULL;
+ struct ocfs2_alloc_context *data_ac = NULL;
+ struct ocfs2_alloc_context *meta_ac = NULL;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_extent_tree et;
+
+ mlog(0, "Add new xattr cluster for %llu, previous xattr hash = %u, "
+ "previous xattr blkno = %llu\n",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ prev_cpos, prev_blkno);
+
+ ocfs2_init_xattr_tree_extent_tree(&et, inode, root_bh);
+
+ ret = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
+ &data_ac, &meta_ac);
+ if (ret) {
+ mlog_errno(ret);
+ goto leave;
+ }
+
+ credits = ocfs2_calc_extend_credits(osb->sb, et.et_root_el,
+ clusters_to_add);
+ handle = ocfs2_start_trans(osb, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ handle = NULL;
+ mlog_errno(ret);
+ goto leave;
+ }
+
+ ret = ocfs2_journal_access(handle, inode, root_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto leave;
+ }
+
+ ret = __ocfs2_claim_clusters(osb, handle, data_ac, 1,
+ clusters_to_add, &bit_off, &num_bits);
+ if (ret < 0) {
+ if (ret != -ENOSPC)
+ mlog_errno(ret);
+ goto leave;
+ }
+
+ BUG_ON(num_bits > clusters_to_add);
+
+ block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
+ mlog(0, "Allocating %u clusters at block %u for xattr in inode %llu\n",
+ num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
+
+ if (prev_blkno + prev_clusters * bpc == block &&
+ (prev_clusters + num_bits) << osb->s_clustersize_bits <=
+ OCFS2_MAX_XATTR_TREE_LEAF_SIZE) {
+ /*
+ * If this cluster is contiguous with the old one and
+ * adding this new cluster, we don't surpass the limit of
+ * OCFS2_MAX_XATTR_TREE_LEAF_SIZE, cool. We will let it be
+ * initialized and used like other buckets in the previous
+ * cluster.
+ * So add it as a contiguous one. The caller will handle
+ * its init process.
+ */
+ v_start = prev_cpos + prev_clusters;
+ *num_clusters = prev_clusters + num_bits;
+ mlog(0, "Add contiguous %u clusters to previous extent rec.\n",
+ num_bits);
+ } else {
+ ret = ocfs2_adjust_xattr_cross_cluster(inode,
+ handle,
+ first_bh,
+ header_bh,
+ block,
+ prev_blkno,
+ prev_clusters,
+ &v_start,
+ extend);
+ if (ret) {
+ mlog_errno(ret);
+ goto leave;
+ }
+ }
+
+ if (handle->h_buffer_credits < credits) {
+ /*
+ * The journal has been restarted before, and don't
+ * have enough space for the insertion, so extend it
+ * here.
+ */
+ ret = ocfs2_extend_trans(handle, credits);
+ if (ret) {
+ mlog_errno(ret);
+ goto leave;
+ }
+ }
+ mlog(0, "Insert %u clusters at block %llu for xattr at %u\n",
+ num_bits, block, v_start);
+ ret = ocfs2_insert_extent(osb, handle, inode, &et, v_start, block,
+ num_bits, 0, meta_ac);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto leave;
+ }
+
+ ret = ocfs2_journal_dirty(handle, root_bh);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto leave;
+ }
+
+leave:
+ if (handle)
+ ocfs2_commit_trans(osb, handle);
+ if (data_ac)
+ ocfs2_free_alloc_context(data_ac);
+ if (meta_ac)
+ ocfs2_free_alloc_context(meta_ac);
+
+ return ret;
+}
+
+/*
+ * Extend a new xattr bucket and move xattrs to the end one by one until
+ * We meet with start_bh. Only move half of the xattrs to the bucket after it.
+ */
+static int ocfs2_extend_xattr_bucket(struct inode *inode,
+ struct buffer_head *first_bh,
+ struct buffer_head *start_bh,
+ u32 num_clusters)
+{
+ int ret, credits;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+ u64 start_blk = start_bh->b_blocknr, end_blk;
+ u32 num_buckets = num_clusters * ocfs2_xattr_buckets_per_cluster(osb);
+ handle_t *handle;
+ struct ocfs2_xattr_header *first_xh =
+ (struct ocfs2_xattr_header *)first_bh->b_data;
+ u16 bucket = le16_to_cpu(first_xh->xh_num_buckets);
+
+ mlog(0, "extend xattr bucket in %llu, xattr extend rec starting "
+ "from %llu, len = %u\n", start_blk,
+ (unsigned long long)first_bh->b_blocknr, num_clusters);
+
+ BUG_ON(bucket >= num_buckets);
+
+ end_blk = first_bh->b_blocknr + (bucket - 1) * blk_per_bucket;
+
+ /*
+ * We will touch all the buckets after the start_bh(include it).
+ * Add one more bucket and modify the first_bh.
+ */
+ credits = end_blk - start_blk + 2 * blk_per_bucket + 1;
+ handle = ocfs2_start_trans(osb, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ handle = NULL;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_journal_access(handle, inode, first_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto commit;
+ }
+
+ while (end_blk != start_blk) {
+ ret = ocfs2_cp_xattr_bucket(inode, handle, end_blk,
+ end_blk + blk_per_bucket, 0);
+ if (ret)
+ goto commit;
+ end_blk -= blk_per_bucket;
+ }
+
+ /* Move half of the xattr in start_blk to the next bucket. */
+ ret = ocfs2_half_xattr_bucket(inode, handle, start_blk,
+ start_blk + blk_per_bucket, NULL, 0);
+
+ le16_add_cpu(&first_xh->xh_num_buckets, 1);
+ ocfs2_journal_dirty(handle, first_bh);
+
+commit:
+ ocfs2_commit_trans(osb, handle);
+out:
+ return ret;
+}
+
+/*
+ * Add new xattr bucket in an extent record and adjust the buckets accordingly.
+ * xb_bh is the ocfs2_xattr_block.
+ * We will move all the buckets starting from header_bh to the next place. As
+ * for this one, half num of its xattrs will be moved to the next one.
+ *
+ * We will allocate a new cluster if current cluster is full and adjust
+ * header_bh and first_bh if the insert place is moved to the new cluster.
+ */
+static int ocfs2_add_new_xattr_bucket(struct inode *inode,
+ struct buffer_head *xb_bh,
+ struct buffer_head *header_bh)
+{
+ struct ocfs2_xattr_header *first_xh = NULL;
+ struct buffer_head *first_bh = NULL;
+ struct ocfs2_xattr_block *xb =
+ (struct ocfs2_xattr_block *)xb_bh->b_data;
+ struct ocfs2_xattr_tree_root *xb_root = &xb->xb_attrs.xb_root;
+ struct ocfs2_extent_list *el = &xb_root->xt_list;
+ struct ocfs2_xattr_header *xh =
+ (struct ocfs2_xattr_header *)header_bh->b_data;
+ u32 name_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash);
+ struct super_block *sb = inode->i_sb;
+ struct ocfs2_super *osb = OCFS2_SB(sb);
+ int ret, num_buckets, extend = 1;
+ u64 p_blkno;
+ u32 e_cpos, num_clusters;
+
+ mlog(0, "Add new xattr bucket starting form %llu\n",
+ (unsigned long long)header_bh->b_blocknr);
+
+ /*
+ * Add refrence for header_bh here because it may be
+ * changed in ocfs2_add_new_xattr_cluster and we need
+ * to free it in the end.
+ */
+ get_bh(header_bh);
+
+ ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, &e_cpos,
+ &num_clusters, el);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_read_block(inode, p_blkno, &first_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ num_buckets = ocfs2_xattr_buckets_per_cluster(osb) * num_clusters;
+ first_xh = (struct ocfs2_xattr_header *)first_bh->b_data;
+
+ if (num_buckets == le16_to_cpu(first_xh->xh_num_buckets)) {
+ ret = ocfs2_add_new_xattr_cluster(inode,
+ xb_bh,
+ &first_bh,
+ &header_bh,
+ &num_clusters,
+ e_cpos,
+ p_blkno,
+ &extend);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ if (extend)
+ ret = ocfs2_extend_xattr_bucket(inode,
+ first_bh,
+ header_bh,
+ num_clusters);
+ if (ret)
+ mlog_errno(ret);
+out:
+ brelse(first_bh);
+ brelse(header_bh);
+ return ret;
+}
+
+static inline char *ocfs2_xattr_bucket_get_val(struct inode *inode,
+ struct ocfs2_xattr_bucket *bucket,
+ int offs)
+{
+ int block_off = offs >> inode->i_sb->s_blocksize_bits;
+
+ offs = offs % inode->i_sb->s_blocksize;
+ return bucket->bhs[block_off]->b_data + offs;
+}
+
+/*
+ * Handle the normal xattr set, including replace, delete and new.
+ *
+ * Note: "local" indicates the real data's locality. So we can't
+ * just its bucket locality by its length.
+ */
+static void ocfs2_xattr_set_entry_normal(struct inode *inode,
+ struct ocfs2_xattr_info *xi,
+ struct ocfs2_xattr_search *xs,
+ u32 name_hash,
+ int local)
+{
+ struct ocfs2_xattr_entry *last, *xe;
+ int name_len = strlen(xi->name);
+ struct ocfs2_xattr_header *xh = xs->header;
+ u16 count = le16_to_cpu(xh->xh_count), start;
+ size_t blocksize = inode->i_sb->s_blocksize;
+ char *val;
+ size_t offs, size, new_size;
+
+ last = &xh->xh_entries[count];
+ if (!xs->not_found) {
+ xe = xs->here;
+ offs = le16_to_cpu(xe->xe_name_offset);
+ if (ocfs2_xattr_is_local(xe))
+ size = OCFS2_XATTR_SIZE(name_len) +
+ OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
+ else
+ size = OCFS2_XATTR_SIZE(name_len) +
+ OCFS2_XATTR_SIZE(OCFS2_XATTR_ROOT_SIZE);
+
+ /*
+ * If the new value will be stored outside, xi->value has been
+ * initalized as an empty ocfs2_xattr_value_root, and the same
+ * goes with xi->value_len, so we can set new_size safely here.
+ * See ocfs2_xattr_set_in_bucket.
+ */
+ new_size = OCFS2_XATTR_SIZE(name_len) +
+ OCFS2_XATTR_SIZE(xi->value_len);
+
+ le16_add_cpu(&xh->xh_name_value_len, -size);
+ if (xi->value) {
+ if (new_size > size)
+ goto set_new_name_value;
+
+ /* Now replace the old value with new one. */
+ if (local)
+ xe->xe_value_size = cpu_to_le64(xi->value_len);
+ else
+ xe->xe_value_size = 0;
+
+ val = ocfs2_xattr_bucket_get_val(inode,
+ &xs->bucket, offs);
+ memset(val + OCFS2_XATTR_SIZE(name_len), 0,
+ size - OCFS2_XATTR_SIZE(name_len));
+ if (OCFS2_XATTR_SIZE(xi->value_len) > 0)
+ memcpy(val + OCFS2_XATTR_SIZE(name_len),
+ xi->value, xi->value_len);
+
+ le16_add_cpu(&xh->xh_name_value_len, new_size);
+ ocfs2_xattr_set_local(xe, local);
+ return;
+ } else {
+ /*
+ * Remove the old entry if there is more than one.
+ * We don't remove the last entry so that we can
+ * use it to indicate the hash value of the empty
+ * bucket.
+ */
+ last -= 1;
+ le16_add_cpu(&xh->xh_count, -1);
+ if (xh->xh_count) {
+ memmove(xe, xe + 1,
+ (void *)last - (void *)xe);
+ memset(last, 0,
+ sizeof(struct ocfs2_xattr_entry));
+ } else
+ xh->xh_free_start =
+ cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE);
+
+ return;
+ }
+ } else {
+ /* find a new entry for insert. */
+ int low = 0, high = count - 1, tmp;
+ struct ocfs2_xattr_entry *tmp_xe;
+
+ while (low <= high && count) {
+ tmp = (low + high) / 2;
+ tmp_xe = &xh->xh_entries[tmp];
+
+ if (name_hash > le32_to_cpu(tmp_xe->xe_name_hash))
+ low = tmp + 1;
+ else if (name_hash <
+ le32_to_cpu(tmp_xe->xe_name_hash))
+ high = tmp - 1;
+ else {
+ low = tmp;
+ break;
+ }
+ }
+
+ xe = &xh->xh_entries[low];
+ if (low != count)
+ memmove(xe + 1, xe, (void *)last - (void *)xe);
+
+ le16_add_cpu(&xh->xh_count, 1);
+ memset(xe, 0, sizeof(struct ocfs2_xattr_entry));
+ xe->xe_name_hash = cpu_to_le32(name_hash);
+ xe->xe_name_len = name_len;
+ ocfs2_xattr_set_type(xe, xi->name_index);
+ }
+
+set_new_name_value:
+ /* Insert the new name+value. */
+ size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_SIZE(xi->value_len);
+
+ /*
+ * We must make sure that the name/value pair
+ * exists in the same block.
+ */
+ offs = le16_to_cpu(xh->xh_free_start);
+ start = offs - size;
+
+ if (start >> inode->i_sb->s_blocksize_bits !=
+ (offs - 1) >> inode->i_sb->s_blocksize_bits) {
+ offs = offs - offs % blocksize;
+ xh->xh_free_start = cpu_to_le16(offs);
+ }
+
+ val = ocfs2_xattr_bucket_get_val(inode,
+ &xs->bucket, offs - size);
+ xe->xe_name_offset = cpu_to_le16(offs - size);
+
+ memset(val, 0, size);
+ memcpy(val, xi->name, name_len);
+ memcpy(val + OCFS2_XATTR_SIZE(name_len), xi->value, xi->value_len);
+
+ xe->xe_value_size = cpu_to_le64(xi->value_len);
+ ocfs2_xattr_set_local(xe, local);
+ xs->here = xe;
+ le16_add_cpu(&xh->xh_free_start, -size);
+ le16_add_cpu(&xh->xh_name_value_len, size);
+
+ return;
+}
+
+static int ocfs2_xattr_bucket_handle_journal(struct inode *inode,
+ handle_t *handle,
+ struct ocfs2_xattr_search *xs,
+ struct buffer_head **bhs,
+ u16 bh_num)
+{
+ int ret = 0, off, block_off;
+ struct ocfs2_xattr_entry *xe = xs->here;
+
+ /*
+ * First calculate all the blocks we should journal_access
+ * and journal_dirty. The first block should always be touched.
+ */
+ ret = ocfs2_journal_dirty(handle, bhs[0]);
+ if (ret)
+ mlog_errno(ret);
+
+ /* calc the data. */
+ off = le16_to_cpu(xe->xe_name_offset);
+ block_off = off >> inode->i_sb->s_blocksize_bits;
+ ret = ocfs2_journal_dirty(handle, bhs[block_off]);
+ if (ret)
+ mlog_errno(ret);
+
+ return ret;
+}
+
+/*
+ * Set the xattr entry in the specified bucket.
+ * The bucket is indicated by xs->bucket and it should have the enough
+ * space for the xattr insertion.
+ */
+static int ocfs2_xattr_set_entry_in_bucket(struct inode *inode,
+ struct ocfs2_xattr_info *xi,
+ struct ocfs2_xattr_search *xs,
+ u32 name_hash,
+ int local)
+{
+ int i, ret;
+ handle_t *handle = NULL;
+ u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ mlog(0, "Set xattr entry len = %lu index = %d in bucket %llu\n",
+ (unsigned long)xi->value_len, xi->name_index,
+ (unsigned long long)xs->bucket.bhs[0]->b_blocknr);
+
+ if (!xs->bucket.bhs[1]) {
+ ret = ocfs2_read_blocks(inode,
+ xs->bucket.bhs[0]->b_blocknr + 1,
+ blk_per_bucket - 1, &xs->bucket.bhs[1],
+ 0);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ handle = ocfs2_start_trans(osb, blk_per_bucket);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ handle = NULL;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ for (i = 0; i < blk_per_bucket; i++) {
+ ret = ocfs2_journal_access(handle, inode, xs->bucket.bhs[i],
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ ocfs2_xattr_set_entry_normal(inode, xi, xs, name_hash, local);
+
+ /*Only dirty the blocks we have touched in set xattr. */
+ ret = ocfs2_xattr_bucket_handle_journal(inode, handle, xs,
+ xs->bucket.bhs, blk_per_bucket);
+ if (ret)
+ mlog_errno(ret);
+out:
+ ocfs2_commit_trans(osb, handle);
+
+ return ret;
+}
+
+static int ocfs2_xattr_value_update_size(struct inode *inode,
+ struct buffer_head *xe_bh,
+ struct ocfs2_xattr_entry *xe,
+ u64 new_size)
+{
+ int ret;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ handle_t *handle = NULL;
+
+ handle = ocfs2_start_trans(osb, 1);
+ if (handle == NULL) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_journal_access(handle, inode, xe_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ xe->xe_value_size = cpu_to_le64(new_size);
+
+ ret = ocfs2_journal_dirty(handle, xe_bh);
+ if (ret < 0)
+ mlog_errno(ret);
+
+out_commit:
+ ocfs2_commit_trans(osb, handle);
+out:
+ return ret;
+}
+
+/*
+ * Truncate the specified xe_off entry in xattr bucket.
+ * bucket is indicated by header_bh and len is the new length.
+ * Both the ocfs2_xattr_value_root and the entry will be updated here.
+ *
+ * Copy the new updated xe and xe_value_root to new_xe and new_xv if needed.
+ */
+static int ocfs2_xattr_bucket_value_truncate(struct inode *inode,
+ struct buffer_head *header_bh,
+ int xe_off,
+ int len)
+{
+ int ret, offset;
+ u64 value_blk;
+ struct buffer_head *value_bh = NULL;
+ struct ocfs2_xattr_value_root *xv;
+ struct ocfs2_xattr_entry *xe;
+ struct ocfs2_xattr_header *xh =
+ (struct ocfs2_xattr_header *)header_bh->b_data;
+ size_t blocksize = inode->i_sb->s_blocksize;
+
+ xe = &xh->xh_entries[xe_off];
+
+ BUG_ON(!xe || ocfs2_xattr_is_local(xe));
+
+ offset = le16_to_cpu(xe->xe_name_offset) +
+ OCFS2_XATTR_SIZE(xe->xe_name_len);
+
+ value_blk = offset / blocksize;
+
+ /* We don't allow ocfs2_xattr_value to be stored in different block. */
+ BUG_ON(value_blk != (offset + OCFS2_XATTR_ROOT_SIZE - 1) / blocksize);
+ value_blk += header_bh->b_blocknr;
+
+ ret = ocfs2_read_block(inode, value_blk, &value_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ xv = (struct ocfs2_xattr_value_root *)
+ (value_bh->b_data + offset % blocksize);
+
+ mlog(0, "truncate %u in xattr bucket %llu to %d bytes.\n",
+ xe_off, (unsigned long long)header_bh->b_blocknr, len);
+ ret = ocfs2_xattr_value_truncate(inode, value_bh, xv, len);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_xattr_value_update_size(inode, header_bh, xe, len);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+out:
+ brelse(value_bh);
+ return ret;
+}
+
+static int ocfs2_xattr_bucket_value_truncate_xs(struct inode *inode,
+ struct ocfs2_xattr_search *xs,
+ int len)
+{
+ int ret, offset;
+ struct ocfs2_xattr_entry *xe = xs->here;
+ struct ocfs2_xattr_header *xh = (struct ocfs2_xattr_header *)xs->base;
+
+ BUG_ON(!xs->bucket.bhs[0] || !xe || ocfs2_xattr_is_local(xe));
+
+ offset = xe - xh->xh_entries;
+ ret = ocfs2_xattr_bucket_value_truncate(inode, xs->bucket.bhs[0],
+ offset, len);
+ if (ret)
+ mlog_errno(ret);
+
+ return ret;
+}
+
+static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode,
+ struct ocfs2_xattr_search *xs,
+ char *val,
+ int value_len)
+{
+ int offset;
+ struct ocfs2_xattr_value_root *xv;
+ struct ocfs2_xattr_entry *xe = xs->here;
+
+ BUG_ON(!xs->base || !xe || ocfs2_xattr_is_local(xe));
+
+ offset = le16_to_cpu(xe->xe_name_offset) +
+ OCFS2_XATTR_SIZE(xe->xe_name_len);
+
+ xv = (struct ocfs2_xattr_value_root *)(xs->base + offset);
+
+ return __ocfs2_xattr_set_value_outside(inode, xv, val, value_len);
+}
+
+static int ocfs2_rm_xattr_cluster(struct inode *inode,
+ struct buffer_head *root_bh,
+ u64 blkno,
+ u32 cpos,
+ u32 len)
+{
+ int ret;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct inode *tl_inode = osb->osb_tl_inode;
+ handle_t *handle;
+ struct ocfs2_xattr_block *xb =
+ (struct ocfs2_xattr_block *)root_bh->b_data;
+ struct ocfs2_alloc_context *meta_ac = NULL;
+ struct ocfs2_cached_dealloc_ctxt dealloc;
+ struct ocfs2_extent_tree et;
+
+ ocfs2_init_xattr_tree_extent_tree(&et, inode, root_bh);
+
+ ocfs2_init_dealloc_ctxt(&dealloc);
+
+ mlog(0, "rm xattr extent rec at %u len = %u, start from %llu\n",
+ cpos, len, (unsigned long long)blkno);
+
+ ocfs2_remove_xattr_clusters_from_cache(inode, blkno, len);
+
+ ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac);
+ if (ret) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ mutex_lock(&tl_inode->i_mutex);
+
+ if (ocfs2_truncate_log_needs_flush(osb)) {
+ ret = __ocfs2_flush_truncate_log(osb);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS);
+ if (handle == NULL) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_journal_access(handle, inode, root_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, meta_ac,
+ &dealloc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, -len);
+
+ ret = ocfs2_journal_dirty(handle, root_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ ret = ocfs2_truncate_log_append(osb, handle, blkno, len);
+ if (ret)
+ mlog_errno(ret);
+
+out_commit:
+ ocfs2_commit_trans(osb, handle);
+out:
+ ocfs2_schedule_truncate_log_flush(osb, 1);
+
+ mutex_unlock(&tl_inode->i_mutex);
+
+ if (meta_ac)
+ ocfs2_free_alloc_context(meta_ac);
+
+ ocfs2_run_deallocs(osb, &dealloc);
+
+ return ret;
+}
+
+static void ocfs2_xattr_bucket_remove_xs(struct inode *inode,
+ struct ocfs2_xattr_search *xs)
+{
+ handle_t *handle = NULL;
+ struct ocfs2_xattr_header *xh = xs->bucket.xh;
+ struct ocfs2_xattr_entry *last = &xh->xh_entries[
+ le16_to_cpu(xh->xh_count) - 1];
+ int ret = 0;
+
+ handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), 1);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ return;
+ }
+
+ ret = ocfs2_journal_access(handle, inode, xs->bucket.bhs[0],
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ /* Remove the old entry. */
+ memmove(xs->here, xs->here + 1,
+ (void *)last - (void *)xs->here);
+ memset(last, 0, sizeof(struct ocfs2_xattr_entry));
+ le16_add_cpu(&xh->xh_count, -1);
+
+ ret = ocfs2_journal_dirty(handle, xs->bucket.bhs[0]);
+ if (ret < 0)
+ mlog_errno(ret);
+out_commit:
+ ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
+}
+
+/*
+ * Set the xattr name/value in the bucket specified in xs.
+ *
+ * As the new value in xi may be stored in the bucket or in an outside cluster,
+ * we divide the whole process into 3 steps:
+ * 1. insert name/value in the bucket(ocfs2_xattr_set_entry_in_bucket)
+ * 2. truncate of the outside cluster(ocfs2_xattr_bucket_value_truncate_xs)
+ * 3. Set the value to the outside cluster(ocfs2_xattr_bucket_set_value_outside)
+ * 4. If the clusters for the new outside value can't be allocated, we need
+ * to free the xattr we allocated in set.
+ */
+static int ocfs2_xattr_set_in_bucket(struct inode *inode,
+ struct ocfs2_xattr_info *xi,
+ struct ocfs2_xattr_search *xs)
+{
+ int ret, local = 1;
+ size_t value_len;
+ char *val = (char *)xi->value;
+ struct ocfs2_xattr_entry *xe = xs->here;
+ u32 name_hash = ocfs2_xattr_name_hash(inode, xi->name,
+ strlen(xi->name));
+
+ if (!xs->not_found && !ocfs2_xattr_is_local(xe)) {
+ /*
+ * We need to truncate the xattr storage first.
+ *
+ * If both the old and new value are stored to
+ * outside block, we only need to truncate
+ * the storage and then set the value outside.
+ *
+ * If the new value should be stored within block,
+ * we should free all the outside block first and
+ * the modification to the xattr block will be done
+ * by following steps.
+ */
+ if (xi->value_len > OCFS2_XATTR_INLINE_SIZE)
+ value_len = xi->value_len;
+ else
+ value_len = 0;
+
+ ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs,
+ value_len);
+ if (ret)
+ goto out;
+
+ if (value_len)
+ goto set_value_outside;
+ }
+
+ value_len = xi->value_len;
+ /* So we have to handle the inside block change now. */
+ if (value_len > OCFS2_XATTR_INLINE_SIZE) {
+ /*
+ * If the new value will be stored outside of block,
+ * initalize a new empty value root and insert it first.
+ */
+ local = 0;
+ xi->value = &def_xv;
+ xi->value_len = OCFS2_XATTR_ROOT_SIZE;
+ }
+
+ ret = ocfs2_xattr_set_entry_in_bucket(inode, xi, xs, name_hash, local);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (value_len <= OCFS2_XATTR_INLINE_SIZE)
+ goto out;
+
+ /* allocate the space now for the outside block storage. */
+ ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs,
+ value_len);
+ if (ret) {
+ mlog_errno(ret);
+
+ if (xs->not_found) {
+ /*
+ * We can't allocate enough clusters for outside
+ * storage and we have allocated xattr already,
+ * so need to remove it.
+ */
+ ocfs2_xattr_bucket_remove_xs(inode, xs);
+ }
+ goto out;
+ }
+
+set_value_outside:
+ ret = ocfs2_xattr_bucket_set_value_outside(inode, xs, val, value_len);
+out:
+ return ret;
+}
+
+/* check whether the xattr bucket is filled up with the same hash value. */
+static int ocfs2_check_xattr_bucket_collision(struct inode *inode,
+ struct ocfs2_xattr_bucket *bucket)
+{
+ struct ocfs2_xattr_header *xh = bucket->xh;
+
+ if (xh->xh_entries[le16_to_cpu(xh->xh_count) - 1].xe_name_hash ==
+ xh->xh_entries[0].xe_name_hash) {
+ mlog(ML_ERROR, "Too much hash collision in xattr bucket %llu, "
+ "hash = %u\n",
+ (unsigned long long)bucket->bhs[0]->b_blocknr,
+ le32_to_cpu(xh->xh_entries[0].xe_name_hash));
+ return -ENOSPC;
+ }
+
+ return 0;
+}
+
+static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
+ struct ocfs2_xattr_info *xi,
+ struct ocfs2_xattr_search *xs)
+{
+ struct ocfs2_xattr_header *xh;
+ struct ocfs2_xattr_entry *xe;
+ u16 count, header_size, xh_free_start;
+ int i, free, max_free, need, old;
+ size_t value_size = 0, name_len = strlen(xi->name);
+ size_t blocksize = inode->i_sb->s_blocksize;
+ int ret, allocation = 0;
+ u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+
+ mlog_entry("Set xattr %s in xattr index block\n", xi->name);
+
+try_again:
+ xh = xs->header;
+ count = le16_to_cpu(xh->xh_count);
+ xh_free_start = le16_to_cpu(xh->xh_free_start);
+ header_size = sizeof(struct ocfs2_xattr_header) +
+ count * sizeof(struct ocfs2_xattr_entry);
+ max_free = OCFS2_XATTR_BUCKET_SIZE -
+ le16_to_cpu(xh->xh_name_value_len) - header_size;
+
+ mlog_bug_on_msg(header_size > blocksize, "bucket %llu has header size "
+ "of %u which exceed block size\n",
+ (unsigned long long)xs->bucket.bhs[0]->b_blocknr,
+ header_size);
+
+ if (xi->value && xi->value_len > OCFS2_XATTR_INLINE_SIZE)
+ value_size = OCFS2_XATTR_ROOT_SIZE;
+ else if (xi->value)
+ value_size = OCFS2_XATTR_SIZE(xi->value_len);
+
+ if (xs->not_found)
+ need = sizeof(struct ocfs2_xattr_entry) +
+ OCFS2_XATTR_SIZE(name_len) + value_size;
+ else {
+ need = value_size + OCFS2_XATTR_SIZE(name_len);
+
+ /*
+ * We only replace the old value if the new length is smaller
+ * than the old one. Otherwise we will allocate new space in the
+ * bucket to store it.
+ */
+ xe = xs->here;
+ if (ocfs2_xattr_is_local(xe))
+ old = OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
+ else
+ old = OCFS2_XATTR_SIZE(OCFS2_XATTR_ROOT_SIZE);
+
+ if (old >= value_size)
+ need = 0;
+ }
+
+ free = xh_free_start - header_size;
+ /*
+ * We need to make sure the new name/value pair
+ * can exist in the same block.
+ */
+ if (xh_free_start % blocksize < need)
+ free -= xh_free_start % blocksize;
+
+ mlog(0, "xs->not_found = %d, in xattr bucket %llu: free = %d, "
+ "need = %d, max_free = %d, xh_free_start = %u, xh_name_value_len ="
+ " %u\n", xs->not_found,
+ (unsigned long long)xs->bucket.bhs[0]->b_blocknr,
+ free, need, max_free, le16_to_cpu(xh->xh_free_start),
+ le16_to_cpu(xh->xh_name_value_len));
+
+ if (free < need || count == ocfs2_xattr_max_xe_in_bucket(inode->i_sb)) {
+ if (need <= max_free &&
+ count < ocfs2_xattr_max_xe_in_bucket(inode->i_sb)) {
+ /*
+ * We can create the space by defragment. Since only the
+ * name/value will be moved, the xe shouldn't be changed
+ * in xs.
+ */
+ ret = ocfs2_defrag_xattr_bucket(inode, &xs->bucket);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ xh_free_start = le16_to_cpu(xh->xh_free_start);
+ free = xh_free_start - header_size;
+ if (xh_free_start % blocksize < need)
+ free -= xh_free_start % blocksize;
+
+ if (free >= need)
+ goto xattr_set;
+
+ mlog(0, "Can't get enough space for xattr insert by "
+ "defragment. Need %u bytes, but we have %d, so "
+ "allocate new bucket for it.\n", need, free);
+ }
+
+ /*
+ * We have to add new buckets or clusters and one
+ * allocation should leave us enough space for insert.
+ */
+ BUG_ON(allocation);
+
+ /*
+ * We do not allow for overlapping ranges between buckets. And
+ * the maximum number of collisions we will allow for then is
+ * one bucket's worth, so check it here whether we need to
+ * add a new bucket for the insert.
+ */
+ ret = ocfs2_check_xattr_bucket_collision(inode, &xs->bucket);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_add_new_xattr_bucket(inode,
+ xs->xattr_bh,
+ xs->bucket.bhs[0]);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ for (i = 0; i < blk_per_bucket; i++)
+ brelse(xs->bucket.bhs[i]);
+
+ memset(&xs->bucket, 0, sizeof(xs->bucket));
+
+ ret = ocfs2_xattr_index_block_find(inode, xs->xattr_bh,
+ xi->name_index,
+ xi->name, xs);
+ if (ret && ret != -ENODATA)
+ goto out;
+ xs->not_found = ret;
+ allocation = 1;
+ goto try_again;
+ }
+
+xattr_set:
+ ret = ocfs2_xattr_set_in_bucket(inode, xi, xs);
+out:
+ mlog_exit(ret);
+ return ret;
+}
+
+static int ocfs2_delete_xattr_in_bucket(struct inode *inode,
+ struct ocfs2_xattr_bucket *bucket,
+ void *para)
+{
+ int ret = 0;
+ struct ocfs2_xattr_header *xh = bucket->xh;
+ u16 i;
+ struct ocfs2_xattr_entry *xe;
+
+ for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
+ xe = &xh->xh_entries[i];
+ if (ocfs2_xattr_is_local(xe))
+ continue;
+
+ ret = ocfs2_xattr_bucket_value_truncate(inode,
+ bucket->bhs[0],
+ i, 0);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+ }
+
+ return ret;
+}
+
+static int ocfs2_delete_xattr_index_block(struct inode *inode,
+ struct buffer_head *xb_bh)
+{
+ struct ocfs2_xattr_block *xb =
+ (struct ocfs2_xattr_block *)xb_bh->b_data;
+ struct ocfs2_extent_list *el = &xb->xb_attrs.xb_root.xt_list;
+ int ret = 0;
+ u32 name_hash = UINT_MAX, e_cpos, num_clusters;
+ u64 p_blkno;
+
+ if (le16_to_cpu(el->l_next_free_rec) == 0)
+ return 0;
+
+ while (name_hash > 0) {
+ ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno,
+ &e_cpos, &num_clusters, el);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_iterate_xattr_buckets(inode, p_blkno, num_clusters,
+ ocfs2_delete_xattr_in_bucket,
+ NULL);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_rm_xattr_cluster(inode, xb_bh,
+ p_blkno, e_cpos, num_clusters);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+
+ if (e_cpos == 0)
+ break;
+
+ name_hash = e_cpos - 1;
+ }
+
+out:
+ return ret;
+}
+
+/*
+ * 'trusted' attributes support
+ */
+
+#define XATTR_TRUSTED_PREFIX "trusted."
+
+static size_t ocfs2_xattr_trusted_list(struct inode *inode, char *list,
+ size_t list_size, const char *name,
+ size_t name_len)
+{
+ const size_t prefix_len = sizeof(XATTR_TRUSTED_PREFIX) - 1;
+ const size_t total_len = prefix_len + name_len + 1;
+
+ if (list && total_len <= list_size) {
+ memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len);
+ memcpy(list + prefix_len, name, name_len);
+ list[prefix_len + name_len] = '\0';
+ }
+ return total_len;
+}
+
+static int ocfs2_xattr_trusted_get(struct inode *inode, const char *name,
+ void *buffer, size_t size)
+{
+ if (strcmp(name, "") == 0)
+ return -EINVAL;
+ return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_TRUSTED, name,
+ buffer, size);
+}
+
+static int ocfs2_xattr_trusted_set(struct inode *inode, const char *name,
+ const void *value, size_t size, int flags)
+{
+ if (strcmp(name, "") == 0)
+ return -EINVAL;
+
+ return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_TRUSTED, name, value,
+ size, flags);
+}
+
+struct xattr_handler ocfs2_xattr_trusted_handler = {
+ .prefix = XATTR_TRUSTED_PREFIX,
+ .list = ocfs2_xattr_trusted_list,
+ .get = ocfs2_xattr_trusted_get,
+ .set = ocfs2_xattr_trusted_set,
+};
+
+
+/*
+ * 'user' attributes support
+ */
+
+#define XATTR_USER_PREFIX "user."
+
+static size_t ocfs2_xattr_user_list(struct inode *inode, char *list,
+ size_t list_size, const char *name,
+ size_t name_len)
+{
+ const size_t prefix_len = sizeof(XATTR_USER_PREFIX) - 1;
+ const size_t total_len = prefix_len + name_len + 1;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
+ return 0;
+
+ if (list && total_len <= list_size) {
+ memcpy(list, XATTR_USER_PREFIX, prefix_len);
+ memcpy(list + prefix_len, name, name_len);
+ list[prefix_len + name_len] = '\0';
+ }
+ return total_len;
+}
+
+static int ocfs2_xattr_user_get(struct inode *inode, const char *name,
+ void *buffer, size_t size)
+{
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ if (strcmp(name, "") == 0)
+ return -EINVAL;
+ if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
+ return -EOPNOTSUPP;
+ return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_USER, name,
+ buffer, size);
+}
+
+static int ocfs2_xattr_user_set(struct inode *inode, const char *name,
+ const void *value, size_t size, int flags)
+{
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ if (strcmp(name, "") == 0)
+ return -EINVAL;
+ if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
+ return -EOPNOTSUPP;
+
+ return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_USER, name, value,
+ size, flags);
+}
+
+struct xattr_handler ocfs2_xattr_user_handler = {
+ .prefix = XATTR_USER_PREFIX,
+ .list = ocfs2_xattr_user_list,
+ .get = ocfs2_xattr_user_get,
+ .set = ocfs2_xattr_user_set,
+};
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
new file mode 100644
index 000000000000..c25c7c62a059
--- /dev/null
+++ b/fs/ocfs2/xattr.h
@@ -0,0 +1,68 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * xattr.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2008 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_XATTR_H
+#define OCFS2_XATTR_H
+
+#include <linux/init.h>
+#include <linux/xattr.h>
+
+enum ocfs2_xattr_type {
+ OCFS2_XATTR_INDEX_USER = 1,
+ OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS,
+ OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT,
+ OCFS2_XATTR_INDEX_TRUSTED,
+ OCFS2_XATTR_INDEX_SECURITY,
+ OCFS2_XATTR_MAX
+};
+
+extern struct xattr_handler ocfs2_xattr_user_handler;
+extern struct xattr_handler ocfs2_xattr_trusted_handler;
+
+extern ssize_t ocfs2_listxattr(struct dentry *, char *, size_t);
+extern int ocfs2_xattr_get(struct inode *, int, const char *, void *, size_t);
+extern int ocfs2_xattr_set(struct inode *, int, const char *, const void *,
+ size_t, int);
+extern int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh);
+extern struct xattr_handler *ocfs2_xattr_handlers[];
+
+static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb)
+{
+ return (1 << osb->s_clustersize_bits) / OCFS2_XATTR_BUCKET_SIZE;
+}
+
+static inline u16 ocfs2_blocks_per_xattr_bucket(struct super_block *sb)
+{
+ return OCFS2_XATTR_BUCKET_SIZE / (1 << sb->s_blocksize_bits);
+}
+
+static inline u16 ocfs2_xattr_max_xe_in_bucket(struct super_block *sb)
+{
+ u16 len = sb->s_blocksize -
+ offsetof(struct ocfs2_xattr_header, xh_entries);
+
+ return len / sizeof(struct ocfs2_xattr_entry);
+}
+#endif /* OCFS2_XATTR_H */
diff --git a/fs/partitions/acorn.c b/fs/partitions/acorn.c
index 3d3e16631472..a97b477ac0fc 100644
--- a/fs/partitions/acorn.c
+++ b/fs/partitions/acorn.c
@@ -275,16 +275,6 @@ adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev)
id = data[0x1fc] & 15;
put_dev_sector(sect);
-#ifdef CONFIG_BLK_DEV_MFM
- if (MAJOR(bdev->bd_dev) == MFM_ACORN_MAJOR) {
- extern void xd_set_geometry(struct block_device *,
- unsigned char, unsigned char, unsigned int);
- xd_set_geometry(bdev, dr->secspertrack, heads, 1);
- invalidate_bh_lrus();
- truncate_inode_pages(bdev->bd_inode->i_mapping, 0);
- }
-#endif
-
/*
* Work out start of non-adfs partition.
*/
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 7408227c49c9..cfb0c80690aa 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -195,6 +195,14 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
return ERR_PTR(res);
}
+static ssize_t part_partition_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct hd_struct *p = dev_to_part(dev);
+
+ return sprintf(buf, "%d\n", p->partno);
+}
+
static ssize_t part_start_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
@@ -260,6 +268,7 @@ ssize_t part_fail_store(struct device *dev,
}
#endif
+static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL);
static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL);
static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
@@ -269,6 +278,7 @@ static struct device_attribute dev_attr_fail =
#endif
static struct attribute *part_attrs[] = {
+ &dev_attr_partition.attr,
&dev_attr_start.attr,
&dev_attr_size.attr,
&dev_attr_stat.attr,
@@ -538,10 +548,23 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
sector_t from = state->parts[p].from;
if (!size)
continue;
+ if (from >= get_capacity(disk)) {
+ printk(KERN_WARNING
+ "%s: p%d ignored, start %llu is behind the end of the disk\n",
+ disk->disk_name, p, (unsigned long long) from);
+ continue;
+ }
if (from + size > get_capacity(disk)) {
+ /*
+ * we can not ignore partitions of broken tables
+ * created by for example camera firmware, but we
+ * limit them to the end of the disk to avoid
+ * creating invalid block devices
+ */
printk(KERN_WARNING
- "%s: p%d exceeds device capacity\n",
- disk->disk_name, p);
+ "%s: p%d size %llu limited to end of disk\n",
+ disk->disk_name, p, (unsigned long long) size);
+ size = get_capacity(disk) - from;
}
res = add_partition(disk, p, from, size, state->parts[p].flags);
if (res) {
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 66c1ab87656c..59ea42e1ef03 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -45,7 +45,6 @@
#include <linux/blkdev.h>
#include <linux/hugetlb.h>
#include <linux/jiffies.h>
-#include <linux/sysrq.h>
#include <linux/vmalloc.h>
#include <linux/crash_dump.h>
#include <linux/pid_namespace.h>
@@ -683,6 +682,7 @@ static int cmdline_read_proc(char *page, char **start, off_t off,
return proc_calc_metrics(page, start, off, count, eof, len);
}
+#ifdef CONFIG_FILE_LOCKING
static int locks_open(struct inode *inode, struct file *filp)
{
return seq_open(filp, &locks_seq_operations);
@@ -694,6 +694,7 @@ static const struct file_operations proc_locks_operations = {
.llseek = seq_lseek,
.release = seq_release,
};
+#endif /* CONFIG_FILE_LOCKING */
static int execdomains_read_proc(char *page, char **start, off_t off,
int count, int *eof, void *data)
@@ -702,28 +703,6 @@ static int execdomains_read_proc(char *page, char **start, off_t off,
return proc_calc_metrics(page, start, off, count, eof, len);
}
-#ifdef CONFIG_MAGIC_SYSRQ
-/*
- * writing 'C' to /proc/sysrq-trigger is like sysrq-C
- */
-static ssize_t write_sysrq_trigger(struct file *file, const char __user *buf,
- size_t count, loff_t *ppos)
-{
- if (count) {
- char c;
-
- if (get_user(c, buf))
- return -EFAULT;
- __handle_sysrq(c, NULL, 0);
- }
- return count;
-}
-
-static const struct file_operations proc_sysrq_trigger_operations = {
- .write = write_sysrq_trigger,
-};
-#endif
-
#ifdef CONFIG_PROC_PAGE_MONITOR
#define KPMSIZE sizeof(u64)
#define KPMMASK (KPMSIZE - 1)
@@ -887,7 +866,9 @@ void __init proc_misc_init(void)
#ifdef CONFIG_PRINTK
proc_create("kmsg", S_IRUSR, NULL, &proc_kmsg_operations);
#endif
+#ifdef CONFIG_FILE_LOCKING
proc_create("locks", 0, NULL, &proc_locks_operations);
+#endif
proc_create("devices", 0, NULL, &proc_devinfo_operations);
proc_create("cpuinfo", 0, NULL, &proc_cpuinfo_operations);
#ifdef CONFIG_BLOCK
@@ -930,7 +911,4 @@ void __init proc_misc_init(void)
#ifdef CONFIG_PROC_VMCORE
proc_vmcore = proc_create("vmcore", S_IRUSR, NULL, &proc_vmcore_operations);
#endif
-#ifdef CONFIG_MAGIC_SYSRQ
- proc_create("sysrq-trigger", S_IWUSR, NULL, &proc_sysrq_trigger_operations);
-#endif
}
diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c
index b9dbeeca7049..37173fa07d15 100644
--- a/fs/reiserfs/procfs.c
+++ b/fs/reiserfs/procfs.c
@@ -8,8 +8,6 @@
/* proc info support a la one created by Sizif@Botik.RU for PGC */
-/* $Id: procfs.c,v 1.1.8.2 2001/07/15 17:08:42 god Exp $ */
-
#include <linux/module.h>
#include <linux/time.h>
#include <linux/seq_file.h>
@@ -621,7 +619,6 @@ int reiserfs_global_version_in_proc(char *buffer, char **start,
#endif
/*
- * $Log: procfs.c,v $
* Revision 1.1.8.2 2001/07/15 17:08:42 god
* . use get_super() in procfs.c
* . remove remove_save_link() from reiserfs_do_truncate()
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index bb3cb5b7cdb2..ad92461cbfc3 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -155,7 +155,7 @@ static struct dentry *get_xa_file_dentry(const struct inode *inode,
xadir = open_xa_dir(inode, flags);
if (IS_ERR(xadir)) {
return ERR_CAST(xadir);
- } else if (xadir && !xadir->d_inode) {
+ } else if (!xadir->d_inode) {
dput(xadir);
return ERR_PTR(-ENODATA);
}
diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index 006fc64227dd..66f6e58a7e4b 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -61,6 +61,7 @@ read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off)
int size = dentry->d_inode->i_size;
loff_t offs = *off;
int count = min_t(size_t, bytes, PAGE_SIZE);
+ char *temp;
if (size) {
if (offs > size)
@@ -69,23 +70,33 @@ read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off)
count = size - offs;
}
+ temp = kmalloc(count, GFP_KERNEL);
+ if (!temp)
+ return -ENOMEM;
+
mutex_lock(&bb->mutex);
count = fill_read(dentry, bb->buffer, offs, count);
- if (count < 0)
- goto out_unlock;
+ if (count < 0) {
+ mutex_unlock(&bb->mutex);
+ goto out_free;
+ }
- if (copy_to_user(userbuf, bb->buffer, count)) {
+ memcpy(temp, bb->buffer, count);
+
+ mutex_unlock(&bb->mutex);
+
+ if (copy_to_user(userbuf, temp, count)) {
count = -EFAULT;
- goto out_unlock;
+ goto out_free;
}
pr_debug("offs = %lld, *off = %lld, count = %d\n", offs, *off, count);
*off = offs + count;
- out_unlock:
- mutex_unlock(&bb->mutex);
+ out_free:
+ kfree(temp);
return count;
}
@@ -118,6 +129,7 @@ static ssize_t write(struct file *file, const char __user *userbuf,
int size = dentry->d_inode->i_size;
loff_t offs = *off;
int count = min_t(size_t, bytes, PAGE_SIZE);
+ char *temp;
if (size) {
if (offs > size)
@@ -126,19 +138,27 @@ static ssize_t write(struct file *file, const char __user *userbuf,
count = size - offs;
}
- mutex_lock(&bb->mutex);
+ temp = kmalloc(count, GFP_KERNEL);
+ if (!temp)
+ return -ENOMEM;
- if (copy_from_user(bb->buffer, userbuf, count)) {
+ if (copy_from_user(temp, userbuf, count)) {
count = -EFAULT;
- goto out_unlock;
+ goto out_free;
}
+ mutex_lock(&bb->mutex);
+
+ memcpy(bb->buffer, temp, count);
+
count = flush_write(dentry, bb->buffer, offs, count);
+ mutex_unlock(&bb->mutex);
+
if (count > 0)
*off = offs + count;
- out_unlock:
- mutex_unlock(&bb->mutex);
+out_free:
+ kfree(temp);
return count;
}
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index aedaeba82ae5..3a05a596e3b4 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -370,17 +370,17 @@ void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt,
memset(acxt, 0, sizeof(*acxt));
acxt->parent_sd = parent_sd;
- /* Lookup parent inode. inode initialization and I_NEW
- * clearing are protected by sysfs_mutex. By grabbing it and
- * looking up with _nowait variant, inode state can be
- * determined reliably.
+ /* Lookup parent inode. inode initialization is protected by
+ * sysfs_mutex, so inode existence can be determined by
+ * looking up inode while holding sysfs_mutex.
*/
mutex_lock(&sysfs_mutex);
- inode = ilookup5_nowait(sysfs_sb, parent_sd->s_ino, sysfs_ilookup_test,
- parent_sd);
+ inode = ilookup5(sysfs_sb, parent_sd->s_ino, sysfs_ilookup_test,
+ parent_sd);
+ if (inode) {
+ WARN_ON(inode->i_state & I_NEW);
- if (inode && !(inode->i_state & I_NEW)) {
/* parent inode available */
acxt->parent_inode = inode;
@@ -393,8 +393,7 @@ void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt,
mutex_lock(&inode->i_mutex);
mutex_lock(&sysfs_mutex);
}
- } else
- iput(inode);
+ }
}
/**
@@ -636,6 +635,7 @@ struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd,
return sd;
}
+EXPORT_SYMBOL_GPL(sysfs_get_dirent);
static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd,
const char *name, struct sysfs_dirent **p_sd)
@@ -829,16 +829,12 @@ int sysfs_rename_dir(struct kobject * kobj, const char *new_name)
if (!new_dentry)
goto out_unlock;
- /* rename kobject and sysfs_dirent */
+ /* rename sysfs_dirent */
error = -ENOMEM;
new_name = dup_name = kstrdup(new_name, GFP_KERNEL);
if (!new_name)
goto out_unlock;
- error = kobject_set_name(kobj, "%s", new_name);
- if (error)
- goto out_unlock;
-
dup_name = sd->s_name;
sd->s_name = new_name;
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index c9e4e5091da1..1f4a3f877262 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -19,10 +19,18 @@
#include <linux/poll.h>
#include <linux/list.h>
#include <linux/mutex.h>
+#include <linux/limits.h>
#include <asm/uaccess.h>
#include "sysfs.h"
+/* used in crash dumps to help with debugging */
+static char last_sysfs_file[PATH_MAX];
+void sysfs_printk_last_file(void)
+{
+ printk(KERN_EMERG "last sysfs file: %s\n", last_sysfs_file);
+}
+
/*
* There's one sysfs_buffer for each open file and one
* sysfs_open_dirent for each sysfs_dirent with one or more open
@@ -328,6 +336,11 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
struct sysfs_buffer *buffer;
struct sysfs_ops *ops;
int error = -EACCES;
+ char *p;
+
+ p = d_path(&file->f_path, last_sysfs_file, sizeof(last_sysfs_file));
+ if (p)
+ memmove(last_sysfs_file, p, strlen(p) + 1);
/* need attr_sd for attr and ops, its parent for kobj */
if (!sysfs_get_active_two(attr_sd))
@@ -440,7 +453,23 @@ static unsigned int sysfs_poll(struct file *filp, poll_table *wait)
return POLLERR|POLLPRI;
}
-void sysfs_notify(struct kobject *k, char *dir, char *attr)
+void sysfs_notify_dirent(struct sysfs_dirent *sd)
+{
+ struct sysfs_open_dirent *od;
+
+ spin_lock(&sysfs_open_dirent_lock);
+
+ od = sd->s_attr.open;
+ if (od) {
+ atomic_inc(&od->event);
+ wake_up_interruptible(&od->poll);
+ }
+
+ spin_unlock(&sysfs_open_dirent_lock);
+}
+EXPORT_SYMBOL_GPL(sysfs_notify_dirent);
+
+void sysfs_notify(struct kobject *k, const char *dir, const char *attr)
{
struct sysfs_dirent *sd = k->sd;
@@ -450,19 +479,8 @@ void sysfs_notify(struct kobject *k, char *dir, char *attr)
sd = sysfs_find_dirent(sd, dir);
if (sd && attr)
sd = sysfs_find_dirent(sd, attr);
- if (sd) {
- struct sysfs_open_dirent *od;
-
- spin_lock(&sysfs_open_dirent_lock);
-
- od = sd->s_attr.open;
- if (od) {
- atomic_inc(&od->event);
- wake_up_interruptible(&od->poll);
- }
-
- spin_unlock(&sysfs_open_dirent_lock);
- }
+ if (sd)
+ sysfs_notify_dirent(sd);
mutex_unlock(&sysfs_mutex);
}
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 14f0023984d7..ab343e371d64 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -16,6 +16,7 @@
#include <linux/mount.h>
#include <linux/pagemap.h>
#include <linux/init.h>
+#include <linux/module.h>
#include "sysfs.h"
@@ -115,3 +116,17 @@ out_err:
sysfs_dir_cachep = NULL;
goto out;
}
+
+#undef sysfs_get
+struct sysfs_dirent *sysfs_get(struct sysfs_dirent *sd)
+{
+ return __sysfs_get(sd);
+}
+EXPORT_SYMBOL_GPL(sysfs_get);
+
+#undef sysfs_put
+void sysfs_put(struct sysfs_dirent *sd)
+{
+ __sysfs_put(sd);
+}
+EXPORT_SYMBOL_GPL(sysfs_put);
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index a5db496f71c7..93c6d6b27c4d 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -124,7 +124,7 @@ int sysfs_create_subdir(struct kobject *kobj, const char *name,
struct sysfs_dirent **p_sd);
void sysfs_remove_subdir(struct sysfs_dirent *sd);
-static inline struct sysfs_dirent *sysfs_get(struct sysfs_dirent *sd)
+static inline struct sysfs_dirent *__sysfs_get(struct sysfs_dirent *sd)
{
if (sd) {
WARN_ON(!atomic_read(&sd->s_count));
@@ -132,12 +132,14 @@ static inline struct sysfs_dirent *sysfs_get(struct sysfs_dirent *sd)
}
return sd;
}
+#define sysfs_get(sd) __sysfs_get(sd)
-static inline void sysfs_put(struct sysfs_dirent *sd)
+static inline void __sysfs_put(struct sysfs_dirent *sd)
{
if (sd && atomic_dec_and_test(&sd->s_count))
release_sysfs_dirent(sd);
}
+#define sysfs_put(sd) __sysfs_put(sd)
/*
* inode.c
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 7227b2efef22..e39013619b26 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -1323,7 +1323,7 @@ xfs_fs_remount(
"XFS: mount option \"%s\" not supported for remount\n", p);
return -EINVAL;
#else
- return 0;
+ break;
#endif
}
}