From b2366265e83b20ea3264d75f0d14dab2a99636ca Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <bharrosh@panasas.com>
Date: Mon, 27 Oct 2008 18:27:55 +0200
Subject: exofs: osd Swiss army knife

In this patch are all the osd infrastructure that will be used later
by the file system.

Also the declarations of constants, on disk structures, and prototypes.

And the Kbuild+Kconfig files needed to build the exofs module.

Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
---
 fs/exofs/Kbuild   |  30 +++++
 fs/exofs/Kconfig  |  13 ++
 fs/exofs/common.h | 157 ++++++++++++++++++++++
 fs/exofs/exofs.h  | 168 +++++++++++++++++++++++
 fs/exofs/osd.c    | 395 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 763 insertions(+)
 create mode 100644 fs/exofs/Kbuild
 create mode 100644 fs/exofs/Kconfig
 create mode 100644 fs/exofs/common.h
 create mode 100644 fs/exofs/exofs.h
 create mode 100644 fs/exofs/osd.c

(limited to 'fs')

diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild
new file mode 100644
index 000000000000..c806738c69d9
--- /dev/null
+++ b/fs/exofs/Kbuild
@@ -0,0 +1,30 @@
+#
+# Kbuild for the EXOFS module
+#
+# Copyright (C) 2008 Panasas Inc.  All rights reserved.
+#
+# Authors:
+#   Boaz Harrosh <bharrosh@panasas.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2
+#
+# Kbuild - Gets included from the Kernels Makefile and build system
+#
+
+ifneq ($(OSD_INC),)
+# we are built out-of-tree Kconfigure everything as on
+
+CONFIG_EXOFS_FS=m
+ccflags-y += -DCONFIG_EXOFS_FS -DCONFIG_EXOFS_FS_MODULE
+# ccflags-y += -DCONFIG_EXOFS_DEBUG
+
+# if we are built out-of-tree and the hosting kernel has OSD headers
+# then "ccflags-y +=" will not pick the out-off-tree headers. Only by doing
+# this it will work. This might break in future kernels
+KBUILD_CPPFLAGS := -I$(OSD_INC) $(KBUILD_CPPFLAGS)
+
+endif
+
+exofs-objs := osd.o
+obj-$(CONFIG_EXOFS_FS) += exofs.o
diff --git a/fs/exofs/Kconfig b/fs/exofs/Kconfig
new file mode 100644
index 000000000000..86194b2f799d
--- /dev/null
+++ b/fs/exofs/Kconfig
@@ -0,0 +1,13 @@
+config EXOFS_FS
+	tristate "exofs: OSD based file system support"
+	depends on SCSI_OSD_ULD
+	help
+	  EXOFS is a file system that uses an OSD storage device,
+	  as its backing storage.
+
+# Debugging-related stuff
+config EXOFS_DEBUG
+	bool "Enable debugging"
+	depends on EXOFS_FS
+	help
+	  This option enables EXOFS debug prints.
diff --git a/fs/exofs/common.h b/fs/exofs/common.h
new file mode 100644
index 000000000000..4f3a246b9d32
--- /dev/null
+++ b/fs/exofs/common.h
@@ -0,0 +1,157 @@
+/*
+ * Copyright (C) 2005, 2006
+ * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
+ * Copyright (C) 2005, 2006
+ * International Business Machines
+ * Copyright (C) 2008, 2009
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * Copyrights for code taken from ext2:
+ *     Copyright (C) 1992, 1993, 1994, 1995
+ *     Remy Card (card@masi.ibp.fr)
+ *     Laboratoire MASI - Institut Blaise Pascal
+ *     Universite Pierre et Marie Curie (Paris VI)
+ *     from
+ *     linux/fs/minix/inode.c
+ *     Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ * This file is part of exofs.
+ *
+ * exofs is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.  Since it is based on ext2, and the only
+ * valid version of GPL for the Linux kernel is version 2, the only valid
+ * version of GPL for exofs is version 2.
+ *
+ * exofs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with exofs; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#ifndef __EXOFS_COM_H__
+#define __EXOFS_COM_H__
+
+#include <linux/types.h>
+#include <linux/timex.h>
+
+#include <scsi/osd_attributes.h>
+#include <scsi/osd_initiator.h>
+#include <scsi/osd_sec.h>
+
+/****************************************************************************
+ * Object ID related defines
+ * NOTE: inode# = object ID - EXOFS_OBJ_OFF
+ ****************************************************************************/
+#define EXOFS_OBJ_OFF	0x10000	/* offset for objects */
+#define EXOFS_SUPER_ID	0x10000	/* object ID for on-disk superblock */
+#define EXOFS_BM_ID	0x10001	/* object ID for ID bitmap */
+#define EXOFS_ROOT_ID	0x10002	/* object ID for root directory */
+#define EXOFS_TEST_ID	0x10003	/* object ID for test object */
+
+/* exofs Application specific page/attribute */
+#ifndef OSD_PAGE_NUM_IBM_UOBJ_FS_DATA
+# define OSD_PAGE_NUM_IBM_UOBJ_FS_DATA	   (OSD_APAGE_APP_DEFINED_FIRST + 3)
+# define OSD_ATTR_NUM_IBM_UOBJ_FS_DATA_INODE 1
+#endif
+
+/*
+ * The maximum number of files we can have is limited by the size of the
+ * inode number.  This is the largest object ID that the file system supports.
+ * Object IDs 0, 1, and 2 are always in use (see above defines).
+ */
+enum {
+	EXOFS_UINT64_MAX = (~0LL),
+	EXOFS_MAX_INO_ID = (sizeof(ino_t) * 8 == 64) ? EXOFS_UINT64_MAX :
+					(1LL << (sizeof(ino_t) * 8 - 1)),
+	EXOFS_MAX_ID	 = (EXOFS_MAX_INO_ID - 1 - EXOFS_OBJ_OFF),
+};
+
+/****************************************************************************
+ * Misc.
+ ****************************************************************************/
+#define EXOFS_BLKSHIFT	12
+#define EXOFS_BLKSIZE	(1UL << EXOFS_BLKSHIFT)
+
+/****************************************************************************
+ * superblock-related things
+ ****************************************************************************/
+#define EXOFS_SUPER_MAGIC	0x5DF5
+
+/*
+ * The file system control block - stored in an object's data (mainly, the one
+ * with ID EXOFS_SUPER_ID).  This is where the in-memory superblock is stored
+ * on disk.  Right now it just has a magic value, which is basically a sanity
+ * check on our ability to communicate with the object store.
+ */
+struct exofs_fscb {
+	__le64  s_nextid;	/* Highest object ID used */
+	__le32  s_numfiles;	/* Number of files on fs */
+	__le16  s_magic;	/* Magic signature */
+	__le16  s_newfs;	/* Non-zero if this is a new fs */
+};
+
+/****************************************************************************
+ * inode-related things
+ ****************************************************************************/
+#define EXOFS_IDATA		5
+
+/*
+ * The file control block - stored in an object's attributes.  This is where
+ * the in-memory inode is stored on disk.
+ */
+struct exofs_fcb {
+	__le64  i_size;			/* Size of the file */
+	__le16  i_mode;         	/* File mode */
+	__le16  i_links_count;  	/* Links count */
+	__le32  i_uid;          	/* Owner Uid */
+	__le32  i_gid;          	/* Group Id */
+	__le32  i_atime;        	/* Access time */
+	__le32  i_ctime;        	/* Creation time */
+	__le32  i_mtime;        	/* Modification time */
+	__le32  i_flags;        	/* File flags (unused for now)*/
+	__le32  i_generation;   	/* File version (for NFS) */
+	__le32  i_data[EXOFS_IDATA];	/* Short symlink names and device #s */
+};
+
+#define EXOFS_INO_ATTR_SIZE	sizeof(struct exofs_fcb)
+
+/****************************************************************************
+ * dentry-related things
+ ****************************************************************************/
+#define EXOFS_NAME_LEN	255
+
+/*
+ * The on-disk directory entry
+ */
+struct exofs_dir_entry {
+	__le64		inode_no;		/* inode number           */
+	__le16		rec_len;		/* directory entry length */
+	u8		name_len;		/* name length            */
+	u8		file_type;		/* umm...file type        */
+	char		name[EXOFS_NAME_LEN];	/* file name              */
+};
+
+enum {
+	EXOFS_FT_UNKNOWN,
+	EXOFS_FT_REG_FILE,
+	EXOFS_FT_DIR,
+	EXOFS_FT_CHRDEV,
+	EXOFS_FT_BLKDEV,
+	EXOFS_FT_FIFO,
+	EXOFS_FT_SOCK,
+	EXOFS_FT_SYMLINK,
+	EXOFS_FT_MAX
+};
+
+#define EXOFS_DIR_PAD			4
+#define EXOFS_DIR_ROUND			(EXOFS_DIR_PAD - 1)
+#define EXOFS_DIR_REC_LEN(name_len) \
+	(((name_len) + offsetof(struct exofs_dir_entry, name)  + \
+	  EXOFS_DIR_ROUND) & ~EXOFS_DIR_ROUND)
+
+#endif /*ifndef __EXOFS_COM_H__*/
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
new file mode 100644
index 000000000000..eddfed624c60
--- /dev/null
+++ b/fs/exofs/exofs.h
@@ -0,0 +1,168 @@
+/*
+ * Copyright (C) 2005, 2006
+ * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
+ * Copyright (C) 2005, 2006
+ * International Business Machines
+ * Copyright (C) 2008, 2009
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * Copyrights for code taken from ext2:
+ *     Copyright (C) 1992, 1993, 1994, 1995
+ *     Remy Card (card@masi.ibp.fr)
+ *     Laboratoire MASI - Institut Blaise Pascal
+ *     Universite Pierre et Marie Curie (Paris VI)
+ *     from
+ *     linux/fs/minix/inode.c
+ *     Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ * This file is part of exofs.
+ *
+ * exofs is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.  Since it is based on ext2, and the only
+ * valid version of GPL for the Linux kernel is version 2, the only valid
+ * version of GPL for exofs is version 2.
+ *
+ * exofs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with exofs; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#include <linux/fs.h>
+#include <linux/time.h>
+#include "common.h"
+
+#ifndef __EXOFS_H__
+#define __EXOFS_H__
+
+#define EXOFS_ERR(fmt, a...) printk(KERN_ERR "exofs: " fmt, ##a)
+
+#ifdef CONFIG_EXOFS_DEBUG
+#define EXOFS_DBGMSG(fmt, a...) \
+	printk(KERN_NOTICE "exofs @%s:%d: " fmt, __func__, __LINE__, ##a)
+#else
+#define EXOFS_DBGMSG(fmt, a...) \
+	do {} while (0)
+#endif
+
+/* u64 has problems with printk this will cast it to unsigned long long */
+#define _LLU(x) (unsigned long long)(x)
+
+/*
+ * our extension to the in-memory superblock
+ */
+struct exofs_sb_info {
+	struct osd_dev	*s_dev;			/* returned by get_osd_dev    */
+	osd_id		s_pid;			/* partition ID of file system*/
+	int		s_timeout;		/* timeout for OSD operations */
+	uint64_t	s_nextid;		/* highest object ID used     */
+	uint32_t	s_numfiles;		/* number of files on fs      */
+	spinlock_t	s_next_gen_lock;	/* spinlock for gen # update  */
+	u32		s_next_generation;	/* next gen # to use          */
+	atomic_t	s_curr_pending;		/* number of pending commands */
+	uint8_t		s_cred[OSD_CAP_LEN];	/* all-powerful credential    */
+};
+
+/*
+ * our extension to the in-memory inode
+ */
+struct exofs_i_info {
+	unsigned long  i_flags;            /* various atomic flags            */
+	uint32_t       i_data[EXOFS_IDATA];/*short symlink names and device #s*/
+	uint32_t       i_dir_start_lookup; /* which page to start lookup      */
+	wait_queue_head_t i_wq;            /* wait queue for inode            */
+	uint64_t       i_commit_size;      /* the object's written length     */
+	uint8_t        i_cred[OSD_CAP_LEN];/* all-powerful credential         */
+	struct inode   vfs_inode;          /* normal in-memory inode          */
+};
+
+/*
+ * our inode flags
+ */
+#define OBJ_2BCREATED	0	/* object will be created soon*/
+#define OBJ_CREATED	1	/* object has been created on the osd*/
+
+static inline int obj_2bcreated(struct exofs_i_info *oi)
+{
+	return test_bit(OBJ_2BCREATED, &(oi->i_flags));
+}
+
+static inline void set_obj_2bcreated(struct exofs_i_info *oi)
+{
+	set_bit(OBJ_2BCREATED, &(oi->i_flags));
+}
+
+static inline int obj_created(struct exofs_i_info *oi)
+{
+	return test_bit(OBJ_CREATED, &(oi->i_flags));
+}
+
+static inline void set_obj_created(struct exofs_i_info *oi)
+{
+	set_bit(OBJ_CREATED, &(oi->i_flags));
+}
+
+/*
+ * get to our inode from the vfs inode
+ */
+static inline struct exofs_i_info *exofs_i(struct inode *inode)
+{
+	return container_of(inode, struct exofs_i_info, vfs_inode);
+}
+
+/*************************
+ * function declarations *
+ *************************/
+/* osd.c                 */
+void exofs_make_credential(uint8_t[], uint64_t, uint64_t);
+int exofs_check_ok(struct osd_request *or);
+int exofs_sync_op(struct osd_request *or, int timeout, uint8_t *credential);
+int exofs_async_op(struct osd_request *or, osd_req_done_fn *async_done,
+		   void *caller_context, char *credential);
+int prepare_get_attr_list_add_entry(struct osd_request *or,
+				    uint32_t page_num, uint32_t attr_num,
+				    uint32_t attr_len);
+int prepare_set_attr_list_add_entry(struct osd_request *or,
+				    uint32_t page_num, uint32_t attr_num,
+				    uint16_t attr_len, const u8 *attr_val);
+int extract_next_attr_from_req(struct osd_request *or,
+			       uint32_t *page_num, uint32_t *attr_num,
+			       uint16_t *attr_len, uint8_t **attr_val);
+struct osd_request *prepare_osd_format_lun(struct osd_dev *od,
+					   uint64_t formatted_capacity);
+struct osd_request *prepare_osd_create_partition(struct osd_dev *od,
+						 uint64_t requested_id);
+struct osd_request *prepare_osd_remove_partition(struct osd_dev *od,
+						 uint64_t requested_id);
+struct osd_request *prepare_osd_create(struct osd_dev *od,
+				       uint64_t part_id, uint64_t requested_id);
+struct osd_request *prepare_osd_remove(struct osd_dev *od,
+				       uint64_t part_id, uint64_t obj_id);
+struct osd_request *prepare_osd_set_attr(struct osd_dev *od,
+					 uint64_t part_id, uint64_t obj_id);
+struct osd_request *prepare_osd_get_attr(struct osd_dev *od,
+					 uint64_t part_id, uint64_t obj_id);
+struct osd_request *prepare_osd_read_pages(struct osd_dev *od,
+				     uint64_t part_id, uint64_t obj_id,
+				     uint64_t length, uint64_t offset,
+				     struct page **pages, int page_count);
+struct osd_request *prepare_osd_read(struct osd_dev *od,
+				     uint64_t part_id, uint64_t obj_id,
+				     uint64_t length, uint64_t offset,
+				     void *data);
+struct osd_request *prepare_osd_write_pages(struct osd_dev *od,
+				      uint64_t part_id, uint64_t obj_id,
+				      uint64_t length, uint64_t offset,
+				      struct page **pages, int page_count);
+struct osd_request *prepare_osd_write(struct osd_dev *od,
+				      uint64_t part_id, uint64_t obj_id,
+				      uint64_t length, uint64_t offset,
+				      void *data);
+void free_osd_req(struct osd_request *or);
+
+#endif
diff --git a/fs/exofs/osd.c b/fs/exofs/osd.c
new file mode 100644
index 000000000000..99375382ae57
--- /dev/null
+++ b/fs/exofs/osd.c
@@ -0,0 +1,395 @@
+/*
+ * Copyright (C) 2005, 2006
+ * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
+ * Copyright (C) 2005, 2006
+ * International Business Machines
+ * Copyright (C) 2008, 2009
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * This file is part of exofs.
+ *
+ * exofs is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.  Since it is based on ext2, and the only
+ * valid version of GPL for the Linux kernel is version 2, the only valid
+ * version of GPL for exofs is version 2.
+ *
+ * exofs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with exofs; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#include <scsi/scsi_device.h>
+#include <scsi/osd_sense.h>
+
+#include "exofs.h"
+
+int exofs_check_ok(struct osd_request *or)
+{
+	struct osd_sense_info osi;
+	int ret = osd_req_decode_sense(or, &osi);
+
+	if (ret) { /* translate to Linux codes */
+		if (osi.additional_code == scsi_invalid_field_in_cdb) {
+			if (osi.cdb_field_offset == OSD_CFO_STARTING_BYTE)
+				ret = -EFAULT;
+			if (osi.cdb_field_offset == OSD_CFO_OBJECT_ID)
+				ret = -ENOENT;
+			else
+				ret = -EINVAL;
+		} else if (osi.additional_code == osd_quota_error)
+			ret = -ENOSPC;
+		else
+			ret = -EIO;
+	}
+
+	return ret;
+}
+
+void exofs_make_credential(uint8_t cred_a[OSD_CAP_LEN], uint64_t pid,
+			   uint64_t oid)
+{
+	struct osd_obj_id obj = {
+		.partition = pid,
+		.id = oid
+	};
+
+	osd_sec_init_nosec_doall_caps(cred_a, &obj, false, true);
+}
+
+/*
+ * Perform a synchronous OSD operation.
+ */
+int exofs_sync_op(struct osd_request *or, int timeout, uint8_t *credential)
+{
+	int ret;
+
+	or->timeout = timeout;
+	ret = osd_finalize_request(or, 0, credential, NULL);
+	if (ret) {
+		EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n", ret);
+		return ret;
+	}
+
+	ret = osd_execute_request(or);
+
+	if (ret)
+		EXOFS_DBGMSG("osd_execute_request() => %d\n", ret);
+	/* osd_req_decode_sense(or, ret); */
+	return ret;
+}
+
+/*
+ * Perform an asynchronous OSD operation.
+ */
+int exofs_async_op(struct osd_request *or, osd_req_done_fn *async_done,
+		   void *caller_context, char *credential)
+{
+	int ret;
+
+	ret = osd_finalize_request(or, 0, credential, NULL);
+	if (ret) {
+		EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n", ret);
+		return ret;
+	}
+
+	ret = osd_execute_request_async(or, async_done, caller_context);
+
+	if (ret)
+		EXOFS_DBGMSG("osd_execute_request_async() => %d\n", ret);
+	return ret;
+}
+
+int prepare_get_attr_list_add_entry(struct osd_request *or,
+				    uint32_t page_num, uint32_t attr_num,
+				    uint32_t attr_len)
+{
+	struct osd_attr attr = {
+		.attr_page = page_num,
+		.attr_id = attr_num,
+		.len = attr_len,
+	};
+
+	return osd_req_add_get_attr_list(or, &attr, 1);
+}
+
+int prepare_set_attr_list_add_entry(struct osd_request *or,
+				    uint32_t page_num, uint32_t attr_num,
+				    uint16_t attr_len, const u8 *attr_val)
+{
+	struct osd_attr attr = {
+		.attr_page = page_num,
+		.attr_id = attr_num,
+		.len = attr_len,
+		.val_ptr = (u8 *)attr_val,
+	};
+
+	return osd_req_add_set_attr_list(or, &attr, 1);
+}
+
+int extract_next_attr_from_req(struct osd_request *or,
+	uint32_t *page_num, uint32_t *attr_num,
+	uint16_t *attr_len, uint8_t **attr_val)
+{
+	struct osd_attr attr = {.attr_page = 0}; /* start with zeros */
+	void *iter = NULL;
+	int nelem;
+
+	do {
+		nelem = 1;
+		osd_req_decode_get_attr_list(or, &attr, &nelem, &iter);
+		if ((attr.attr_page == *page_num) &&
+		    (attr.attr_id == *attr_num)) {
+			*attr_len = attr.len;
+			*attr_val = attr.val_ptr;
+			return 0;
+		}
+	} while (iter);
+
+	return -EIO;
+}
+
+struct osd_request *prepare_osd_format_lun(struct osd_dev *od,
+					   uint64_t formatted_capacity)
+{
+	struct osd_request *or = osd_start_request(od, GFP_KERNEL);
+
+	if (!or)
+		return NULL;
+
+	osd_req_format(or, formatted_capacity);
+
+	return or;
+}
+
+struct osd_request *prepare_osd_create_partition(struct osd_dev *od,
+						 uint64_t requested_id)
+{
+	struct osd_request *or = osd_start_request(od, GFP_KERNEL);
+
+	if (!or)
+		return NULL;
+
+	osd_req_create_partition(or, requested_id);
+
+	return or;
+}
+
+struct osd_request *prepare_osd_remove_partition(struct osd_dev *od,
+						 uint64_t requested_id)
+{
+	struct osd_request *or = osd_start_request(od, GFP_KERNEL);
+
+	if (!or)
+		return NULL;
+
+	osd_req_remove_partition(or, requested_id);
+
+	return or;
+}
+
+struct osd_request *prepare_osd_create(struct osd_dev *od,
+				     uint64_t part_id,
+				     uint64_t requested_id)
+{
+	struct osd_obj_id obj = {
+		.partition = part_id,
+		.id = requested_id
+	};
+	struct osd_request *or = osd_start_request(od, GFP_KERNEL);
+
+	if (!or)
+		return NULL;
+
+	osd_req_create_object(or, &obj);
+
+	return or;
+}
+
+struct osd_request *prepare_osd_remove(struct osd_dev *od,
+				     uint64_t part_id,
+				     uint64_t obj_id)
+{
+	struct osd_obj_id obj = {
+		.partition = part_id,
+		.id = obj_id
+	};
+	struct osd_request *or = osd_start_request(od, GFP_KERNEL);
+
+	if (!or)
+		return NULL;
+
+	osd_req_remove_object(or, &obj);
+
+	return or;
+}
+
+struct osd_request *prepare_osd_set_attr(struct osd_dev *od,
+				       uint64_t part_id,
+				       uint64_t obj_id)
+{
+	struct osd_obj_id obj = {
+		.partition = part_id,
+		.id = obj_id
+	};
+	struct osd_request *or = osd_start_request(od, GFP_KERNEL);
+
+	if (!or)
+		return NULL;
+
+	osd_req_set_attributes(or, &obj);
+
+	return or;
+}
+
+struct osd_request *prepare_osd_get_attr(struct osd_dev *od,
+				       uint64_t part_id,
+				       uint64_t obj_id)
+{
+	struct osd_obj_id obj = {
+		.partition = part_id,
+		.id = obj_id
+	};
+	struct osd_request *or = osd_start_request(od, GFP_KERNEL);
+
+	if (!or)
+		return NULL;
+
+	osd_req_get_attributes(or, &obj);
+
+	return or;
+}
+
+static struct bio *_bio_map_pages(struct request_queue *req_q,
+				  struct page **pages, unsigned page_count,
+				  size_t length, gfp_t gfp_mask)
+{
+	struct bio *bio;
+	int i;
+
+	bio = bio_alloc(gfp_mask, page_count);
+	if (!bio) {
+		EXOFS_DBGMSG("Failed to bio_alloc page_count=%d\n", page_count);
+		return NULL;
+	}
+
+	for (i = 0; i < page_count && length; i++) {
+		size_t use_len = min(length, PAGE_SIZE);
+
+		if (use_len !=
+		    bio_add_pc_page(req_q, bio, pages[i], use_len, 0)) {
+			EXOFS_ERR("Failed bio_add_pc_page req_q=%p pages[i]=%p "
+				  "use_len=%Zd page_count=%d length=%Zd\n",
+				  req_q, pages[i], use_len, page_count, length);
+			bio_put(bio);
+			return NULL;
+		}
+
+		length -= use_len;
+	}
+
+	WARN_ON(length);
+	return bio;
+}
+
+static struct osd_request *_prepare_read(struct osd_dev *od,
+				   uint64_t part_id, uint64_t obj_id,
+				   uint64_t offset, struct bio *bio)
+{
+	struct osd_obj_id obj = {
+		.partition = part_id,
+		.id = obj_id
+	};
+	struct osd_request *or = osd_start_request(od, GFP_KERNEL);
+
+	if (!or)
+		return NULL;
+
+	osd_req_read(or, &obj, bio, offset);
+	EXOFS_DBGMSG("osd_req_read(p=%llX, ob=%llX, l=%llu, of=%llu)\n",
+		_LLU(part_id), _LLU(obj_id), _LLU(bio->bi_size), _LLU(offset));
+	return or;
+}
+
+struct osd_request *prepare_osd_read_pages(struct osd_dev *od,
+				   uint64_t part_id, uint64_t obj_id,
+				   uint64_t length, uint64_t offset,
+				   struct page **pages, int page_count)
+{
+	struct request_queue *req_q = od->scsi_device->request_queue;
+	struct bio *bio = _bio_map_pages(req_q, pages, page_count, length,
+					 GFP_KERNEL);
+
+	if (!bio)
+		return NULL;
+	return _prepare_read(od, part_id, obj_id, offset, bio);
+}
+
+struct osd_request *prepare_osd_read(struct osd_dev *od,
+				   uint64_t part_id, uint64_t obj_id,
+				   uint64_t length, uint64_t offset,
+				   void *data)
+{
+	struct request_queue *req_q = od->scsi_device->request_queue;
+	struct bio *bio = bio_map_kern(req_q, data, length, GFP_KERNEL);
+
+	if (!bio)
+		return NULL;
+	return _prepare_read(od, part_id, obj_id, offset, bio);
+}
+
+static struct osd_request *_prepare_write(struct osd_dev *od,
+				    uint64_t part_id, uint64_t obj_id,
+				    uint64_t offset, struct bio *bio)
+{
+	struct osd_obj_id obj = {
+		.partition = part_id,
+		.id = obj_id
+	};
+	struct osd_request *or = osd_start_request(od, GFP_KERNEL);
+
+	if (!or)
+		return NULL;
+
+	osd_req_write(or, &obj, bio, offset);
+	EXOFS_DBGMSG("osd_req_write(p=%llX, ob=%llX, l=%llu, of=%llu)\n",
+		_LLU(part_id), _LLU(obj_id), _LLU(bio->bi_size), _LLU(offset));
+	return or;
+}
+
+struct osd_request *prepare_osd_write_pages(struct osd_dev *od,
+				    uint64_t part_id, uint64_t obj_id,
+				    uint64_t length, uint64_t offset,
+				    struct page **pages, int page_count)
+{
+	struct request_queue *req_q = od->scsi_device->request_queue;
+	struct bio *bio = _bio_map_pages(req_q, pages, page_count, length,
+					 GFP_KERNEL);
+
+	if (!bio)
+		return NULL;
+	return _prepare_write(od, part_id, obj_id, offset, bio);
+}
+
+struct osd_request *prepare_osd_write(struct osd_dev *od,
+				    uint64_t part_id, uint64_t obj_id,
+				    uint64_t length, uint64_t offset,
+				    void *data)
+{
+	struct request_queue *req_q = od->scsi_device->request_queue;
+	struct bio *bio = bio_map_kern(req_q, data, length, GFP_KERNEL);
+
+	if (!bio)
+		return NULL;
+	return _prepare_write(od, part_id, obj_id, offset, bio);
+}
+
+void free_osd_req(struct osd_request *or)
+{
+	osd_end_request(or);
+}
-- 
cgit v1.2.3


From 5b6faf6aa0b9996b7e46fef888457c6717493980 Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <bharrosh@panasas.com>
Date: Mon, 27 Oct 2008 18:37:02 +0200
Subject: exofs: file and file_inode operations

implementation of the file_operations and inode_operations for
regular data files.

Most file_operations are generic vfs implementations except:
- exofs_truncate will truncate the OSD object as well
- Generic file_fsync is not good for none_bd devices so open code it
- The default for .flush in Linux is todo nothing so call exofs_fsync
  on the file.

Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
---
 fs/exofs/Kbuild  |   2 +-
 fs/exofs/exofs.h |  11 +++++
 fs/exofs/file.c  |  82 ++++++++++++++++++++++++++++++++
 fs/exofs/inode.c | 140 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 234 insertions(+), 1 deletion(-)
 create mode 100644 fs/exofs/file.c
 create mode 100644 fs/exofs/inode.c

(limited to 'fs')

diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild
index c806738c69d9..d8bd4d5ef5d0 100644
--- a/fs/exofs/Kbuild
+++ b/fs/exofs/Kbuild
@@ -26,5 +26,5 @@ KBUILD_CPPFLAGS := -I$(OSD_INC) $(KBUILD_CPPFLAGS)
 
 endif
 
-exofs-objs := osd.o
+exofs-objs := osd.o inode.o file.o
 obj-$(CONFIG_EXOFS_FS) += exofs.o
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index eddfed624c60..dbb93560120d 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -165,4 +165,15 @@ struct osd_request *prepare_osd_write(struct osd_dev *od,
 				      void *data);
 void free_osd_req(struct osd_request *or);
 
+/* inode.c               */
+void exofs_truncate(struct inode *inode);
+int exofs_setattr(struct dentry *, struct iattr *);
+
+/*********************
+ * operation vectors *
+ *********************/
+/* file.c            */
+extern const struct inode_operations exofs_file_inode_operations;
+extern const struct file_operations exofs_file_operations;
+
 #endif
diff --git a/fs/exofs/file.c b/fs/exofs/file.c
new file mode 100644
index 000000000000..4738c3fa7f29
--- /dev/null
+++ b/fs/exofs/file.c
@@ -0,0 +1,82 @@
+/*
+ * Copyright (C) 2005, 2006
+ * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
+ * Copyright (C) 2005, 2006
+ * International Business Machines
+ * Copyright (C) 2008, 2009
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * Copyrights for code taken from ext2:
+ *     Copyright (C) 1992, 1993, 1994, 1995
+ *     Remy Card (card@masi.ibp.fr)
+ *     Laboratoire MASI - Institut Blaise Pascal
+ *     Universite Pierre et Marie Curie (Paris VI)
+ *     from
+ *     linux/fs/minix/inode.c
+ *     Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ * This file is part of exofs.
+ *
+ * exofs is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.  Since it is based on ext2, and the only
+ * valid version of GPL for the Linux kernel is version 2, the only valid
+ * version of GPL for exofs is version 2.
+ *
+ * exofs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with exofs; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#include <linux/buffer_head.h>
+
+#include "exofs.h"
+
+static int exofs_release_file(struct inode *inode, struct file *filp)
+{
+	return 0;
+}
+
+static int exofs_file_fsync(struct file *filp, struct dentry *dentry,
+			    int datasync)
+{
+	int ret1, ret2;
+	struct address_space *mapping = filp->f_mapping;
+
+	ret1 = filemap_write_and_wait(mapping);
+	ret2 = file_fsync(filp, dentry, datasync);
+
+	return ret1 ? ret1 : ret2;
+}
+
+static int exofs_flush(struct file *file, fl_owner_t id)
+{
+	exofs_file_fsync(file, file->f_path.dentry, 1);
+	/* TODO: Flush the OSD target */
+	return 0;
+}
+
+const struct file_operations exofs_file_operations = {
+	.llseek		= generic_file_llseek,
+	.read		= do_sync_read,
+	.write		= do_sync_write,
+	.aio_read	= generic_file_aio_read,
+	.aio_write	= generic_file_aio_write,
+	.mmap		= generic_file_mmap,
+	.open		= generic_file_open,
+	.release	= exofs_release_file,
+	.fsync		= exofs_file_fsync,
+	.flush		= exofs_flush,
+	.splice_read	= generic_file_splice_read,
+	.splice_write	= generic_file_splice_write,
+};
+
+const struct inode_operations exofs_file_inode_operations = {
+	.truncate	= exofs_truncate,
+	.setattr	= exofs_setattr,
+};
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
new file mode 100644
index 000000000000..19f266ca8f06
--- /dev/null
+++ b/fs/exofs/inode.c
@@ -0,0 +1,140 @@
+/*
+ * Copyright (C) 2005, 2006
+ * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
+ * Copyright (C) 2005, 2006
+ * International Business Machines
+ * Copyright (C) 2008, 2009
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * Copyrights for code taken from ext2:
+ *     Copyright (C) 1992, 1993, 1994, 1995
+ *     Remy Card (card@masi.ibp.fr)
+ *     Laboratoire MASI - Institut Blaise Pascal
+ *     Universite Pierre et Marie Curie (Paris VI)
+ *     from
+ *     linux/fs/minix/inode.c
+ *     Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ * This file is part of exofs.
+ *
+ * exofs is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.  Since it is based on ext2, and the only
+ * valid version of GPL for the Linux kernel is version 2, the only valid
+ * version of GPL for exofs is version 2.
+ *
+ * exofs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with exofs; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#include <linux/writeback.h>
+#include <linux/buffer_head.h>
+
+#include "exofs.h"
+
+/******************************************************************************
+ * INODE OPERATIONS
+ *****************************************************************************/
+
+/*
+ * Test whether an inode is a fast symlink.
+ */
+static inline int exofs_inode_is_fast_symlink(struct inode *inode)
+{
+	struct exofs_i_info *oi = exofs_i(inode);
+
+	return S_ISLNK(inode->i_mode) && (oi->i_data[0] != 0);
+}
+
+/*
+ * get_block_t - Fill in a buffer_head
+ * An OSD takes care of block allocation so we just fake an allocation by
+ * putting in the inode's sector_t in the buffer_head.
+ * TODO: What about the case of create==0 and @iblock does not exist in the
+ * object?
+ */
+static int exofs_get_block(struct inode *inode, sector_t iblock,
+		    struct buffer_head *bh_result, int create)
+{
+	map_bh(bh_result, inode->i_sb, iblock);
+	return 0;
+}
+
+/*
+ * Truncate a file to the specified size - all we have to do is set the size
+ * attribute.  We make sure the object exists first.
+ */
+void exofs_truncate(struct inode *inode)
+{
+	struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
+	struct exofs_i_info *oi = exofs_i(inode);
+	struct osd_request *req = NULL;
+	loff_t isize = i_size_read(inode);
+	uint64_t newsize;
+	int ret;
+
+	if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)
+	     || S_ISLNK(inode->i_mode)))
+		return;
+	if (exofs_inode_is_fast_symlink(inode))
+		return;
+	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+		return;
+	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+
+	nobh_truncate_page(inode->i_mapping, isize, exofs_get_block);
+
+	req = prepare_osd_set_attr(sbi->s_dev, sbi->s_pid,
+				 inode->i_ino + EXOFS_OBJ_OFF);
+	if (!req) {
+		EXOFS_ERR("ERROR: prepare set_attr failed.\n");
+		goto fail;
+	}
+
+	newsize = cpu_to_le64((uint64_t) isize);
+	prepare_set_attr_list_add_entry(req, OSD_APAGE_OBJECT_INFORMATION,
+					OSD_ATTR_OI_LOGICAL_LENGTH, 8,
+					(unsigned char *)(&newsize));
+
+	/* if we are about to truncate an object, and it hasn't been
+	 * created yet, wait
+	 */
+	if (!obj_created(oi)) {
+		BUG_ON(!obj_2bcreated(oi));
+		wait_event(oi->i_wq, obj_created(oi));
+	}
+
+	ret = exofs_sync_op(req, sbi->s_timeout, oi->i_cred);
+	free_osd_req(req);
+	if (ret)
+		goto fail;
+
+out:
+	mark_inode_dirty(inode);
+	return;
+fail:
+	make_bad_inode(inode);
+	goto out;
+}
+
+/*
+ * Set inode attributes - just call generic functions.
+ */
+int exofs_setattr(struct dentry *dentry, struct iattr *iattr)
+{
+	struct inode *inode = dentry->d_inode;
+	int error;
+
+	error = inode_change_ok(inode, iattr);
+	if (error)
+		return error;
+
+	error = inode_setattr(inode, iattr);
+	return error;
+}
-- 
cgit v1.2.3


From 1b511bce7187f3aef21ec392b12509231cb864c8 Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <bharrosh@panasas.com>
Date: Mon, 27 Oct 2008 19:04:34 +0200
Subject: exofs: symlink_inode and fast_symlink_inode operations

Generic implementation of symlink ops.

Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
---
 fs/exofs/Kbuild    |  2 +-
 fs/exofs/exofs.h   |  4 ++++
 fs/exofs/symlink.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 62 insertions(+), 1 deletion(-)
 create mode 100644 fs/exofs/symlink.c

(limited to 'fs')

diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild
index d8bd4d5ef5d0..18c615879788 100644
--- a/fs/exofs/Kbuild
+++ b/fs/exofs/Kbuild
@@ -26,5 +26,5 @@ KBUILD_CPPFLAGS := -I$(OSD_INC) $(KBUILD_CPPFLAGS)
 
 endif
 
-exofs-objs := osd.o inode.o file.o
+exofs-objs := osd.o inode.o file.o symlink.o
 obj-$(CONFIG_EXOFS_FS) += exofs.o
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index dbb93560120d..2e80d6223b0b 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -176,4 +176,8 @@ int exofs_setattr(struct dentry *, struct iattr *);
 extern const struct inode_operations exofs_file_inode_operations;
 extern const struct file_operations exofs_file_operations;
 
+/* symlink.c         */
+extern const struct inode_operations exofs_symlink_inode_operations;
+extern const struct inode_operations exofs_fast_symlink_inode_operations;
+
 #endif
diff --git a/fs/exofs/symlink.c b/fs/exofs/symlink.c
new file mode 100644
index 000000000000..36e2d7bc7f7b
--- /dev/null
+++ b/fs/exofs/symlink.c
@@ -0,0 +1,57 @@
+/*
+ * Copyright (C) 2005, 2006
+ * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
+ * Copyright (C) 2005, 2006
+ * International Business Machines
+ * Copyright (C) 2008, 2009
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * Copyrights for code taken from ext2:
+ *     Copyright (C) 1992, 1993, 1994, 1995
+ *     Remy Card (card@masi.ibp.fr)
+ *     Laboratoire MASI - Institut Blaise Pascal
+ *     Universite Pierre et Marie Curie (Paris VI)
+ *     from
+ *     linux/fs/minix/inode.c
+ *     Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ * This file is part of exofs.
+ *
+ * exofs is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.  Since it is based on ext2, and the only
+ * valid version of GPL for the Linux kernel is version 2, the only valid
+ * version of GPL for exofs is version 2.
+ *
+ * exofs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with exofs; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#include <linux/namei.h>
+
+#include "exofs.h"
+
+static void *exofs_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	struct exofs_i_info *oi = exofs_i(dentry->d_inode);
+
+	nd_set_link(nd, (char *)oi->i_data);
+	return NULL;
+}
+
+const struct inode_operations exofs_symlink_inode_operations = {
+	.readlink	= generic_readlink,
+	.follow_link	= page_follow_link_light,
+	.put_link	= page_put_link,
+};
+
+const struct inode_operations exofs_fast_symlink_inode_operations = {
+	.readlink	= generic_readlink,
+	.follow_link	= exofs_follow_link,
+};
-- 
cgit v1.2.3


From 0863e2cc11b03d09233a9faff478865492f429d3 Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <bharrosh@panasas.com>
Date: Mon, 27 Oct 2008 19:31:34 +0200
Subject: exofs: address_space_operations

OK Now we start to read and write from osd-objects, page-by-page.
The page index is the object's offset.

Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
---
 fs/exofs/exofs.h |   6 ++
 fs/exofs/inode.c | 298 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 304 insertions(+)

(limited to 'fs')

diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index 2e80d6223b0b..ac44cb69e498 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -168,6 +168,9 @@ void free_osd_req(struct osd_request *or);
 /* inode.c               */
 void exofs_truncate(struct inode *inode);
 int exofs_setattr(struct dentry *, struct iattr *);
+int exofs_write_begin(struct file *file, struct address_space *mapping,
+		loff_t pos, unsigned len, unsigned flags,
+		struct page **pagep, void **fsdata);
 
 /*********************
  * operation vectors *
@@ -176,6 +179,9 @@ int exofs_setattr(struct dentry *, struct iattr *);
 extern const struct inode_operations exofs_file_inode_operations;
 extern const struct file_operations exofs_file_operations;
 
+/* inode.c           */
+extern const struct address_space_operations exofs_aops;
+
 /* symlink.c         */
 extern const struct inode_operations exofs_symlink_inode_operations;
 extern const struct inode_operations exofs_fast_symlink_inode_operations;
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 19f266ca8f06..4d19dcd64d61 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -38,6 +38,304 @@
 
 #include "exofs.h"
 
+static int __readpage_filler(struct page *page, bool is_async_unlock);
+
+int exofs_write_begin(struct file *file, struct address_space *mapping,
+		loff_t pos, unsigned len, unsigned flags,
+		struct page **pagep, void **fsdata)
+{
+	int ret = 0;
+	struct page *page;
+
+	page = *pagep;
+	if (page == NULL) {
+		ret = simple_write_begin(file, mapping, pos, len, flags, pagep,
+					 fsdata);
+		page = *pagep;
+	}
+
+	 /* read modify write */
+	if (!PageUptodate(page) && (len != PAGE_CACHE_SIZE))
+			ret = __readpage_filler(page, false);
+
+	return ret;
+}
+
+static int exofs_write_begin_export(struct file *file,
+		struct address_space *mapping,
+		loff_t pos, unsigned len, unsigned flags,
+		struct page **pagep, void **fsdata)
+{
+	*pagep = NULL;
+
+	return exofs_write_begin(file, mapping, pos, len, flags, pagep,
+					fsdata);
+}
+
+/*
+ * Callback function when writepage finishes.  Check for errors, unlock, clean
+ * up, etc.
+ */
+static void writepage_done(struct osd_request *req, void *p)
+{
+	int ret;
+	struct page *page = p;
+	struct inode *inode = page->mapping->host;
+	struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
+
+	ret = exofs_check_ok(req);
+	free_osd_req(req);
+	atomic_dec(&sbi->s_curr_pending);
+
+	if (ret) {
+		if (ret == -ENOSPC)
+			set_bit(AS_ENOSPC, &page->mapping->flags);
+		else
+			set_bit(AS_EIO, &page->mapping->flags);
+
+		SetPageError(page);
+	}
+
+	end_page_writeback(page);
+	unlock_page(page);
+}
+
+/*
+ * Write a page to disk.  page->index gives us the page number.  The page is
+ * locked before this function is called.  We write asynchronously and then the
+ * callback function (writepage_done) is called.  We signify that the operation
+ * has completed by unlocking the page and calling end_page_writeback().
+ */
+static int exofs_writepage(struct page *page, struct writeback_control *wbc)
+{
+	struct inode *inode = page->mapping->host;
+	struct exofs_i_info *oi = exofs_i(inode);
+	loff_t i_size = i_size_read(inode);
+	unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
+	unsigned offset = 0;
+	struct osd_request *req = NULL;
+	struct exofs_sb_info *sbi;
+	uint64_t start;
+	uint64_t len = PAGE_CACHE_SIZE;
+	int ret = 0;
+
+	BUG_ON(!PageLocked(page));
+
+	/* if the object has not been created, and we are not in sync mode,
+	 * just return.  otherwise, wait. */
+	if (!obj_created(oi)) {
+		BUG_ON(!obj_2bcreated(oi));
+
+		if (wbc->sync_mode == WB_SYNC_NONE) {
+			redirty_page_for_writepage(wbc, page);
+			unlock_page(page);
+			ret = 0;
+			goto out;
+		} else
+			wait_event(oi->i_wq, obj_created(oi));
+	}
+
+	/* in this case, the page is within the limits of the file */
+	if (page->index < end_index)
+		goto do_it;
+
+	offset = i_size & (PAGE_CACHE_SIZE - 1);
+	len = offset;
+
+	/*in this case, the page is outside the limits (truncate in progress)*/
+	if (page->index >= end_index + 1 || !offset) {
+		unlock_page(page);
+		goto out;
+	}
+
+do_it:
+	BUG_ON(PageWriteback(page));
+	set_page_writeback(page);
+	start = page->index << PAGE_CACHE_SHIFT;
+	sbi = inode->i_sb->s_fs_info;
+
+	req = prepare_osd_write_pages(sbi->s_dev, sbi->s_pid,
+				inode->i_ino + EXOFS_OBJ_OFF, len, start,
+				&page, 1);
+	if (!req) {
+		EXOFS_ERR("ERROR: writepage failed.\n");
+		ret = -ENOMEM;
+		goto fail;
+	}
+
+	oi->i_commit_size = min_t(uint64_t, oi->i_commit_size, len + start);
+
+	ret = exofs_async_op(req, writepage_done, page, oi->i_cred);
+	if (ret) {
+		free_osd_req(req);
+		goto fail;
+	}
+	atomic_inc(&sbi->s_curr_pending);
+out:
+	return ret;
+fail:
+	set_bit(AS_EIO, &page->mapping->flags);
+	end_page_writeback(page);
+	unlock_page(page);
+	goto out;
+}
+
+/*
+ * Callback for readpage
+ */
+static int __readpage_done(struct osd_request *req, void *p, int unlock)
+{
+	struct page *page = p;
+	struct inode *inode = page->mapping->host;
+	struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
+	int ret;
+
+	ret = exofs_check_ok(req);
+	free_osd_req(req);
+	atomic_dec(&sbi->s_curr_pending);
+
+	if (ret == 0) {
+		/* Everything is OK */
+		SetPageUptodate(page);
+		if (PageError(page))
+			ClearPageError(page);
+	} else if (ret == -EFAULT) {
+		/* In this case we were trying to read something that wasn't on
+		 * disk yet - return a page full of zeroes.  This should be OK,
+		 * because the object should be empty (if there was a write
+		 * before this read, the read would be waiting with the page
+		 * locked */
+		clear_highpage(page);
+
+		SetPageUptodate(page);
+		if (PageError(page))
+			ClearPageError(page);
+	} else /* Error */
+		SetPageError(page);
+
+	if (unlock)
+		unlock_page(page);
+
+	return ret;
+}
+
+static void readpage_done(struct osd_request *req, void *p)
+{
+	__readpage_done(req, p, true);
+}
+
+/*
+ * Read a page from the OSD
+ */
+static int __readpage_filler(struct page *page, bool is_async_unlock)
+{
+	struct osd_request *req = NULL;
+	struct inode *inode = page->mapping->host;
+	struct exofs_i_info *oi = exofs_i(inode);
+	ino_t ino = inode->i_ino;
+	loff_t i_size = i_size_read(inode);
+	loff_t i_start = page->index << PAGE_CACHE_SHIFT;
+	pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+	struct super_block *sb = inode->i_sb;
+	struct exofs_sb_info *sbi = sb->s_fs_info;
+	uint64_t amount;
+	int ret = 0;
+
+	BUG_ON(!PageLocked(page));
+
+	if (PageUptodate(page))
+		goto out;
+
+	if (page->index < end_index)
+		amount = PAGE_CACHE_SIZE;
+	else
+		amount = i_size & (PAGE_CACHE_SIZE - 1);
+
+	/* this will be out of bounds, or doesn't exist yet */
+	if ((page->index >= end_index + 1) || !obj_created(oi) || !amount
+	    /*|| (i_start >= oi->i_commit_size)*/) {
+		clear_highpage(page);
+
+		SetPageUptodate(page);
+		if (PageError(page))
+			ClearPageError(page);
+		if (is_async_unlock)
+			unlock_page(page);
+		goto out;
+	}
+
+	if (amount != PAGE_CACHE_SIZE)
+		zero_user(page, amount, PAGE_CACHE_SIZE - amount);
+
+
+	req = prepare_osd_read_pages(sbi->s_dev, sbi->s_pid,
+				     ino + EXOFS_OBJ_OFF, amount, i_start,
+				     &page, 1);
+	if (!req) {
+		EXOFS_ERR("ERROR: readpage failed.\n");
+		ret = -ENOMEM;
+		unlock_page(page);
+		goto out;
+	}
+
+	atomic_inc(&sbi->s_curr_pending);
+	if (!is_async_unlock) {
+		exofs_sync_op(req, sbi->s_timeout, oi->i_cred);
+		ret = __readpage_done(req, page, false);
+	} else {
+		ret = exofs_async_op(req, readpage_done, page, oi->i_cred);
+		if (ret) {
+			free_osd_req(req);
+			unlock_page(page);
+			atomic_dec(&sbi->s_curr_pending);
+		}
+	}
+
+out:
+	return ret;
+}
+
+static int readpage_filler(struct page *page)
+{
+	int ret = __readpage_filler(page, true);
+
+	return ret;
+}
+
+/*
+ * We don't need the file
+ */
+static int exofs_readpage(struct file *file, struct page *page)
+{
+	return readpage_filler(page);
+}
+
+/*
+ * We don't need the data
+ */
+static int readpage_strip(void *data, struct page *page)
+{
+	return readpage_filler(page);
+}
+
+/*
+ * read a bunch of pages - usually for readahead
+ */
+static int exofs_readpages(struct file *file, struct address_space *mapping,
+			   struct list_head *pages, unsigned nr_pages)
+{
+	return read_cache_pages(mapping, pages, readpage_strip, NULL);
+}
+
+const struct address_space_operations exofs_aops = {
+	.readpage	= exofs_readpage,
+	.readpages	= exofs_readpages,
+	.writepage	= exofs_writepage,
+	.write_begin	= exofs_write_begin_export,
+	.write_end	= simple_write_end,
+	.writepages	= generic_writepages,
+};
+
 /******************************************************************************
  * INODE OPERATIONS
  *****************************************************************************/
-- 
cgit v1.2.3


From 88c5e65896327d371b64a1baf4daee00207a8efd Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <bharrosh@panasas.com>
Date: Tue, 28 Oct 2008 15:38:12 +0200
Subject: exofs: dir_inode and directory operations

implementation of directory and inode operations.

* A directory is treated as a file, and essentially contains a list
  of <file name, inode #> pairs for files that are found in that
  directory. The object IDs correspond to the files' inode numbers
  and are allocated using a 64bit incrementing global counter.
* Each file's control block (AKA on-disk inode) is stored in its
  object's attributes. This applies to both regular files and other
  types (directories, device files, symlinks, etc.).

Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
---
 fs/exofs/Kbuild  |   2 +-
 fs/exofs/dir.c   | 643 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/exofs/exofs.h |  26 +++
 fs/exofs/inode.c | 270 +++++++++++++++++++++++
 fs/exofs/namei.c | 338 +++++++++++++++++++++++++++++
 5 files changed, 1278 insertions(+), 1 deletion(-)
 create mode 100644 fs/exofs/dir.c
 create mode 100644 fs/exofs/namei.c

(limited to 'fs')

diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild
index 18c615879788..61162c6eedfe 100644
--- a/fs/exofs/Kbuild
+++ b/fs/exofs/Kbuild
@@ -26,5 +26,5 @@ KBUILD_CPPFLAGS := -I$(OSD_INC) $(KBUILD_CPPFLAGS)
 
 endif
 
-exofs-objs := osd.o inode.o file.o symlink.o
+exofs-objs := osd.o inode.o file.o symlink.o namei.o dir.o
 obj-$(CONFIG_EXOFS_FS) += exofs.o
diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
new file mode 100644
index 000000000000..245fdeeb1e1e
--- /dev/null
+++ b/fs/exofs/dir.c
@@ -0,0 +1,643 @@
+/*
+ * Copyright (C) 2005, 2006
+ * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
+ * Copyright (C) 2005, 2006
+ * International Business Machines
+ * Copyright (C) 2008, 2009
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * Copyrights for code taken from ext2:
+ *     Copyright (C) 1992, 1993, 1994, 1995
+ *     Remy Card (card@masi.ibp.fr)
+ *     Laboratoire MASI - Institut Blaise Pascal
+ *     Universite Pierre et Marie Curie (Paris VI)
+ *     from
+ *     linux/fs/minix/inode.c
+ *     Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ * This file is part of exofs.
+ *
+ * exofs is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.  Since it is based on ext2, and the only
+ * valid version of GPL for the Linux kernel is version 2, the only valid
+ * version of GPL for exofs is version 2.
+ *
+ * exofs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with exofs; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#include "exofs.h"
+
+static inline unsigned exofs_chunk_size(struct inode *inode)
+{
+	return inode->i_sb->s_blocksize;
+}
+
+static inline void exofs_put_page(struct page *page)
+{
+	kunmap(page);
+	page_cache_release(page);
+}
+
+static inline unsigned long dir_pages(struct inode *inode)
+{
+	return (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+}
+
+static unsigned exofs_last_byte(struct inode *inode, unsigned long page_nr)
+{
+	unsigned last_byte = inode->i_size;
+
+	last_byte -= page_nr << PAGE_CACHE_SHIFT;
+	if (last_byte > PAGE_CACHE_SIZE)
+		last_byte = PAGE_CACHE_SIZE;
+	return last_byte;
+}
+
+static int exofs_commit_chunk(struct page *page, loff_t pos, unsigned len)
+{
+	struct address_space *mapping = page->mapping;
+	struct inode *dir = mapping->host;
+	int err = 0;
+
+	dir->i_version++;
+
+	if (!PageUptodate(page))
+		SetPageUptodate(page);
+
+	if (pos+len > dir->i_size) {
+		i_size_write(dir, pos+len);
+		mark_inode_dirty(dir);
+	}
+	set_page_dirty(page);
+
+	if (IS_DIRSYNC(dir))
+		err = write_one_page(page, 1);
+	else
+		unlock_page(page);
+
+	return err;
+}
+
+static void exofs_check_page(struct page *page)
+{
+	struct inode *dir = page->mapping->host;
+	unsigned chunk_size = exofs_chunk_size(dir);
+	char *kaddr = page_address(page);
+	unsigned offs, rec_len;
+	unsigned limit = PAGE_CACHE_SIZE;
+	struct exofs_dir_entry *p;
+	char *error;
+
+	/* if the page is the last one in the directory */
+	if ((dir->i_size >> PAGE_CACHE_SHIFT) == page->index) {
+		limit = dir->i_size & ~PAGE_CACHE_MASK;
+		if (limit & (chunk_size - 1))
+			goto Ebadsize;
+		if (!limit)
+			goto out;
+	}
+	for (offs = 0; offs <= limit - EXOFS_DIR_REC_LEN(1); offs += rec_len) {
+		p = (struct exofs_dir_entry *)(kaddr + offs);
+		rec_len = le16_to_cpu(p->rec_len);
+
+		if (rec_len < EXOFS_DIR_REC_LEN(1))
+			goto Eshort;
+		if (rec_len & 3)
+			goto Ealign;
+		if (rec_len < EXOFS_DIR_REC_LEN(p->name_len))
+			goto Enamelen;
+		if (((offs + rec_len - 1) ^ offs) & ~(chunk_size-1))
+			goto Espan;
+	}
+	if (offs != limit)
+		goto Eend;
+out:
+	SetPageChecked(page);
+	return;
+
+Ebadsize:
+	EXOFS_ERR("ERROR [exofs_check_page]: "
+		"size of directory #%lu is not a multiple of chunk size",
+		dir->i_ino
+	);
+	goto fail;
+Eshort:
+	error = "rec_len is smaller than minimal";
+	goto bad_entry;
+Ealign:
+	error = "unaligned directory entry";
+	goto bad_entry;
+Enamelen:
+	error = "rec_len is too small for name_len";
+	goto bad_entry;
+Espan:
+	error = "directory entry across blocks";
+	goto bad_entry;
+bad_entry:
+	EXOFS_ERR(
+		"ERROR [exofs_check_page]: bad entry in directory #%lu: %s - "
+		"offset=%lu, inode=%llu, rec_len=%d, name_len=%d",
+		dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs,
+		_LLU(le64_to_cpu(p->inode_no)),
+		rec_len, p->name_len);
+	goto fail;
+Eend:
+	p = (struct exofs_dir_entry *)(kaddr + offs);
+	EXOFS_ERR("ERROR [exofs_check_page]: "
+		"entry in directory #%lu spans the page boundary"
+		"offset=%lu, inode=%llu",
+		dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs,
+		_LLU(le64_to_cpu(p->inode_no)));
+fail:
+	SetPageChecked(page);
+	SetPageError(page);
+}
+
+static struct page *exofs_get_page(struct inode *dir, unsigned long n)
+{
+	struct address_space *mapping = dir->i_mapping;
+	struct page *page = read_mapping_page(mapping, n, NULL);
+
+	if (!IS_ERR(page)) {
+		kmap(page);
+		if (!PageChecked(page))
+			exofs_check_page(page);
+		if (PageError(page))
+			goto fail;
+	}
+	return page;
+
+fail:
+	exofs_put_page(page);
+	return ERR_PTR(-EIO);
+}
+
+static inline int exofs_match(int len, const unsigned char *name,
+					struct exofs_dir_entry *de)
+{
+	if (len != de->name_len)
+		return 0;
+	if (!de->inode_no)
+		return 0;
+	return !memcmp(name, de->name, len);
+}
+
+static inline
+struct exofs_dir_entry *exofs_next_entry(struct exofs_dir_entry *p)
+{
+	return (struct exofs_dir_entry *)((char *)p + le16_to_cpu(p->rec_len));
+}
+
+static inline unsigned
+exofs_validate_entry(char *base, unsigned offset, unsigned mask)
+{
+	struct exofs_dir_entry *de = (struct exofs_dir_entry *)(base + offset);
+	struct exofs_dir_entry *p =
+			(struct exofs_dir_entry *)(base + (offset&mask));
+	while ((char *)p < (char *)de) {
+		if (p->rec_len == 0)
+			break;
+		p = exofs_next_entry(p);
+	}
+	return (char *)p - base;
+}
+
+static unsigned char exofs_filetype_table[EXOFS_FT_MAX] = {
+	[EXOFS_FT_UNKNOWN]	= DT_UNKNOWN,
+	[EXOFS_FT_REG_FILE]	= DT_REG,
+	[EXOFS_FT_DIR]		= DT_DIR,
+	[EXOFS_FT_CHRDEV]	= DT_CHR,
+	[EXOFS_FT_BLKDEV]	= DT_BLK,
+	[EXOFS_FT_FIFO]		= DT_FIFO,
+	[EXOFS_FT_SOCK]		= DT_SOCK,
+	[EXOFS_FT_SYMLINK]	= DT_LNK,
+};
+
+#define S_SHIFT 12
+static unsigned char exofs_type_by_mode[S_IFMT >> S_SHIFT] = {
+	[S_IFREG >> S_SHIFT]	= EXOFS_FT_REG_FILE,
+	[S_IFDIR >> S_SHIFT]	= EXOFS_FT_DIR,
+	[S_IFCHR >> S_SHIFT]	= EXOFS_FT_CHRDEV,
+	[S_IFBLK >> S_SHIFT]	= EXOFS_FT_BLKDEV,
+	[S_IFIFO >> S_SHIFT]	= EXOFS_FT_FIFO,
+	[S_IFSOCK >> S_SHIFT]	= EXOFS_FT_SOCK,
+	[S_IFLNK >> S_SHIFT]	= EXOFS_FT_SYMLINK,
+};
+
+static inline
+void exofs_set_de_type(struct exofs_dir_entry *de, struct inode *inode)
+{
+	mode_t mode = inode->i_mode;
+	de->file_type = exofs_type_by_mode[(mode & S_IFMT) >> S_SHIFT];
+}
+
+static int
+exofs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+	loff_t pos = filp->f_pos;
+	struct inode *inode = filp->f_path.dentry->d_inode;
+	unsigned int offset = pos & ~PAGE_CACHE_MASK;
+	unsigned long n = pos >> PAGE_CACHE_SHIFT;
+	unsigned long npages = dir_pages(inode);
+	unsigned chunk_mask = ~(exofs_chunk_size(inode)-1);
+	unsigned char *types = NULL;
+	int need_revalidate = (filp->f_version != inode->i_version);
+
+	if (pos > inode->i_size - EXOFS_DIR_REC_LEN(1))
+		return 0;
+
+	types = exofs_filetype_table;
+
+	for ( ; n < npages; n++, offset = 0) {
+		char *kaddr, *limit;
+		struct exofs_dir_entry *de;
+		struct page *page = exofs_get_page(inode, n);
+
+		if (IS_ERR(page)) {
+			EXOFS_ERR("ERROR: "
+				   "bad page in #%lu",
+				   inode->i_ino);
+			filp->f_pos += PAGE_CACHE_SIZE - offset;
+			return PTR_ERR(page);
+		}
+		kaddr = page_address(page);
+		if (unlikely(need_revalidate)) {
+			if (offset) {
+				offset = exofs_validate_entry(kaddr, offset,
+								chunk_mask);
+				filp->f_pos = (n<<PAGE_CACHE_SHIFT) + offset;
+			}
+			filp->f_version = inode->i_version;
+			need_revalidate = 0;
+		}
+		de = (struct exofs_dir_entry *)(kaddr + offset);
+		limit = kaddr + exofs_last_byte(inode, n) -
+							EXOFS_DIR_REC_LEN(1);
+		for (; (char *)de <= limit; de = exofs_next_entry(de)) {
+			if (de->rec_len == 0) {
+				EXOFS_ERR("ERROR: "
+					"zero-length directory entry");
+				exofs_put_page(page);
+				return -EIO;
+			}
+			if (de->inode_no) {
+				int over;
+				unsigned char d_type = DT_UNKNOWN;
+
+				if (types && de->file_type < EXOFS_FT_MAX)
+					d_type = types[de->file_type];
+
+				offset = (char *)de - kaddr;
+				over = filldir(dirent, de->name, de->name_len,
+						(n<<PAGE_CACHE_SHIFT) | offset,
+						le64_to_cpu(de->inode_no),
+						d_type);
+				if (over) {
+					exofs_put_page(page);
+					return 0;
+				}
+			}
+			filp->f_pos += le16_to_cpu(de->rec_len);
+		}
+		exofs_put_page(page);
+	}
+
+	return 0;
+}
+
+struct exofs_dir_entry *exofs_find_entry(struct inode *dir,
+			struct dentry *dentry, struct page **res_page)
+{
+	const unsigned char *name = dentry->d_name.name;
+	int namelen = dentry->d_name.len;
+	unsigned reclen = EXOFS_DIR_REC_LEN(namelen);
+	unsigned long start, n;
+	unsigned long npages = dir_pages(dir);
+	struct page *page = NULL;
+	struct exofs_i_info *oi = exofs_i(dir);
+	struct exofs_dir_entry *de;
+
+	if (npages == 0)
+		goto out;
+
+	*res_page = NULL;
+
+	start = oi->i_dir_start_lookup;
+	if (start >= npages)
+		start = 0;
+	n = start;
+	do {
+		char *kaddr;
+		page = exofs_get_page(dir, n);
+		if (!IS_ERR(page)) {
+			kaddr = page_address(page);
+			de = (struct exofs_dir_entry *) kaddr;
+			kaddr += exofs_last_byte(dir, n) - reclen;
+			while ((char *) de <= kaddr) {
+				if (de->rec_len == 0) {
+					EXOFS_ERR(
+						"ERROR: exofs_find_entry: "
+						"zero-length directory entry");
+					exofs_put_page(page);
+					goto out;
+				}
+				if (exofs_match(namelen, name, de))
+					goto found;
+				de = exofs_next_entry(de);
+			}
+			exofs_put_page(page);
+		}
+		if (++n >= npages)
+			n = 0;
+	} while (n != start);
+out:
+	return NULL;
+
+found:
+	*res_page = page;
+	oi->i_dir_start_lookup = n;
+	return de;
+}
+
+struct exofs_dir_entry *exofs_dotdot(struct inode *dir, struct page **p)
+{
+	struct page *page = exofs_get_page(dir, 0);
+	struct exofs_dir_entry *de = NULL;
+
+	if (!IS_ERR(page)) {
+		de = exofs_next_entry(
+				(struct exofs_dir_entry *)page_address(page));
+		*p = page;
+	}
+	return de;
+}
+
+ino_t exofs_inode_by_name(struct inode *dir, struct dentry *dentry)
+{
+	ino_t res = 0;
+	struct exofs_dir_entry *de;
+	struct page *page;
+
+	de = exofs_find_entry(dir, dentry, &page);
+	if (de) {
+		res = le64_to_cpu(de->inode_no);
+		exofs_put_page(page);
+	}
+	return res;
+}
+
+void exofs_set_link(struct inode *dir, struct exofs_dir_entry *de,
+			struct page *page, struct inode *inode)
+{
+	loff_t pos = page_offset(page) +
+			(char *) de - (char *) page_address(page);
+	unsigned len = le16_to_cpu(de->rec_len);
+	int err;
+
+	lock_page(page);
+	err = exofs_write_begin(NULL, page->mapping, pos, len,
+				AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
+	BUG_ON(err);
+	de->inode_no = cpu_to_le64(inode->i_ino);
+	exofs_set_de_type(de, inode);
+	err = exofs_commit_chunk(page, pos, len);
+	exofs_put_page(page);
+	dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+	mark_inode_dirty(dir);
+}
+
+int exofs_add_link(struct dentry *dentry, struct inode *inode)
+{
+	struct inode *dir = dentry->d_parent->d_inode;
+	const unsigned char *name = dentry->d_name.name;
+	int namelen = dentry->d_name.len;
+	unsigned chunk_size = exofs_chunk_size(dir);
+	unsigned reclen = EXOFS_DIR_REC_LEN(namelen);
+	unsigned short rec_len, name_len;
+	struct page *page = NULL;
+	struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
+	struct exofs_dir_entry *de;
+	unsigned long npages = dir_pages(dir);
+	unsigned long n;
+	char *kaddr;
+	loff_t pos;
+	int err;
+
+	for (n = 0; n <= npages; n++) {
+		char *dir_end;
+
+		page = exofs_get_page(dir, n);
+		err = PTR_ERR(page);
+		if (IS_ERR(page))
+			goto out;
+		lock_page(page);
+		kaddr = page_address(page);
+		dir_end = kaddr + exofs_last_byte(dir, n);
+		de = (struct exofs_dir_entry *)kaddr;
+		kaddr += PAGE_CACHE_SIZE - reclen;
+		while ((char *)de <= kaddr) {
+			if ((char *)de == dir_end) {
+				name_len = 0;
+				rec_len = chunk_size;
+				de->rec_len = cpu_to_le16(chunk_size);
+				de->inode_no = 0;
+				goto got_it;
+			}
+			if (de->rec_len == 0) {
+				EXOFS_ERR("ERROR: exofs_add_link: "
+					"zero-length directory entry");
+				err = -EIO;
+				goto out_unlock;
+			}
+			err = -EEXIST;
+			if (exofs_match(namelen, name, de))
+				goto out_unlock;
+			name_len = EXOFS_DIR_REC_LEN(de->name_len);
+			rec_len = le16_to_cpu(de->rec_len);
+			if (!de->inode_no && rec_len >= reclen)
+				goto got_it;
+			if (rec_len >= name_len + reclen)
+				goto got_it;
+			de = (struct exofs_dir_entry *) ((char *) de + rec_len);
+		}
+		unlock_page(page);
+		exofs_put_page(page);
+	}
+	BUG();
+	return -EINVAL;
+
+got_it:
+	pos = page_offset(page) +
+		(char *)de - (char *)page_address(page);
+	err = exofs_write_begin(NULL, page->mapping, pos, rec_len, 0,
+							&page, NULL);
+	if (err)
+		goto out_unlock;
+	if (de->inode_no) {
+		struct exofs_dir_entry *de1 =
+			(struct exofs_dir_entry *)((char *)de + name_len);
+		de1->rec_len = cpu_to_le16(rec_len - name_len);
+		de->rec_len = cpu_to_le16(name_len);
+		de = de1;
+	}
+	de->name_len = namelen;
+	memcpy(de->name, name, namelen);
+	de->inode_no = cpu_to_le64(inode->i_ino);
+	exofs_set_de_type(de, inode);
+	err = exofs_commit_chunk(page, pos, rec_len);
+	dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+	mark_inode_dirty(dir);
+	sbi->s_numfiles++;
+
+out_put:
+	exofs_put_page(page);
+out:
+	return err;
+out_unlock:
+	unlock_page(page);
+	goto out_put;
+}
+
+int exofs_delete_entry(struct exofs_dir_entry *dir, struct page *page)
+{
+	struct address_space *mapping = page->mapping;
+	struct inode *inode = mapping->host;
+	struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
+	char *kaddr = page_address(page);
+	unsigned from = ((char *)dir - kaddr) & ~(exofs_chunk_size(inode)-1);
+	unsigned to = ((char *)dir - kaddr) + le16_to_cpu(dir->rec_len);
+	loff_t pos;
+	struct exofs_dir_entry *pde = NULL;
+	struct exofs_dir_entry *de = (struct exofs_dir_entry *) (kaddr + from);
+	int err;
+
+	while ((char *)de < (char *)dir) {
+		if (de->rec_len == 0) {
+			EXOFS_ERR("ERROR: exofs_delete_entry:"
+				"zero-length directory entry");
+			err = -EIO;
+			goto out;
+		}
+		pde = de;
+		de = exofs_next_entry(de);
+	}
+	if (pde)
+		from = (char *)pde - (char *)page_address(page);
+	pos = page_offset(page) + from;
+	lock_page(page);
+	err = exofs_write_begin(NULL, page->mapping, pos, to - from, 0,
+							&page, NULL);
+	BUG_ON(err);
+	if (pde)
+		pde->rec_len = cpu_to_le16(to - from);
+	dir->inode_no = 0;
+	err = exofs_commit_chunk(page, pos, to - from);
+	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+	mark_inode_dirty(inode);
+	sbi->s_numfiles--;
+out:
+	exofs_put_page(page);
+	return err;
+}
+
+int exofs_make_empty(struct inode *inode, struct inode *parent)
+{
+	struct address_space *mapping = inode->i_mapping;
+	struct page *page = grab_cache_page(mapping, 0);
+	unsigned chunk_size = exofs_chunk_size(inode);
+	struct exofs_dir_entry *de;
+	int err;
+	void *kaddr;
+
+	if (!page)
+		return -ENOMEM;
+
+	err = exofs_write_begin(NULL, page->mapping, 0, chunk_size, 0,
+							&page, NULL);
+	if (err) {
+		unlock_page(page);
+		goto fail;
+	}
+
+	kaddr = kmap_atomic(page, KM_USER0);
+	de = (struct exofs_dir_entry *)kaddr;
+	de->name_len = 1;
+	de->rec_len = cpu_to_le16(EXOFS_DIR_REC_LEN(1));
+	memcpy(de->name, ".\0\0", 4);
+	de->inode_no = cpu_to_le64(inode->i_ino);
+	exofs_set_de_type(de, inode);
+
+	de = (struct exofs_dir_entry *)(kaddr + EXOFS_DIR_REC_LEN(1));
+	de->name_len = 2;
+	de->rec_len = cpu_to_le16(chunk_size - EXOFS_DIR_REC_LEN(1));
+	de->inode_no = cpu_to_le64(parent->i_ino);
+	memcpy(de->name, "..\0", 4);
+	exofs_set_de_type(de, inode);
+	kunmap_atomic(page, KM_USER0);
+	err = exofs_commit_chunk(page, 0, chunk_size);
+fail:
+	page_cache_release(page);
+	return err;
+}
+
+int exofs_empty_dir(struct inode *inode)
+{
+	struct page *page = NULL;
+	unsigned long i, npages = dir_pages(inode);
+
+	for (i = 0; i < npages; i++) {
+		char *kaddr;
+		struct exofs_dir_entry *de;
+		page = exofs_get_page(inode, i);
+
+		if (IS_ERR(page))
+			continue;
+
+		kaddr = page_address(page);
+		de = (struct exofs_dir_entry *)kaddr;
+		kaddr += exofs_last_byte(inode, i) - EXOFS_DIR_REC_LEN(1);
+
+		while ((char *)de <= kaddr) {
+			if (de->rec_len == 0) {
+				EXOFS_ERR("ERROR: exofs_empty_dir: "
+					  "zero-length directory entry"
+					  "kaddr=%p, de=%p\n", kaddr, de);
+				goto not_empty;
+			}
+			if (de->inode_no != 0) {
+				/* check for . and .. */
+				if (de->name[0] != '.')
+					goto not_empty;
+				if (de->name_len > 2)
+					goto not_empty;
+				if (de->name_len < 2) {
+					if (le64_to_cpu(de->inode_no) !=
+					    inode->i_ino)
+						goto not_empty;
+				} else if (de->name[1] != '.')
+					goto not_empty;
+			}
+			de = exofs_next_entry(de);
+		}
+		exofs_put_page(page);
+	}
+	return 1;
+
+not_empty:
+	exofs_put_page(page);
+	return 0;
+}
+
+const struct file_operations exofs_dir_operations = {
+	.llseek		= generic_file_llseek,
+	.read		= generic_read_dir,
+	.readdir	= exofs_readdir,
+};
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index ac44cb69e498..c2fa63d47084 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -115,6 +115,11 @@ static inline struct exofs_i_info *exofs_i(struct inode *inode)
 	return container_of(inode, struct exofs_i_info, vfs_inode);
 }
 
+/*
+ * Maximum count of links to a file
+ */
+#define EXOFS_LINK_MAX           32000
+
 /*************************
  * function declarations *
  *************************/
@@ -171,10 +176,27 @@ int exofs_setattr(struct dentry *, struct iattr *);
 int exofs_write_begin(struct file *file, struct address_space *mapping,
 		loff_t pos, unsigned len, unsigned flags,
 		struct page **pagep, void **fsdata);
+extern struct inode *exofs_iget(struct super_block *, unsigned long);
+struct inode *exofs_new_inode(struct inode *, int);
+
+/* dir.c:                */
+int exofs_add_link(struct dentry *, struct inode *);
+ino_t exofs_inode_by_name(struct inode *, struct dentry *);
+int exofs_delete_entry(struct exofs_dir_entry *, struct page *);
+int exofs_make_empty(struct inode *, struct inode *);
+struct exofs_dir_entry *exofs_find_entry(struct inode *, struct dentry *,
+					 struct page **);
+int exofs_empty_dir(struct inode *);
+struct exofs_dir_entry *exofs_dotdot(struct inode *, struct page **);
+void exofs_set_link(struct inode *, struct exofs_dir_entry *, struct page *,
+		    struct inode *);
 
 /*********************
  * operation vectors *
  *********************/
+/* dir.c:            */
+extern const struct file_operations exofs_dir_operations;
+
 /* file.c            */
 extern const struct inode_operations exofs_file_inode_operations;
 extern const struct file_operations exofs_file_operations;
@@ -182,6 +204,10 @@ extern const struct file_operations exofs_file_operations;
 /* inode.c           */
 extern const struct address_space_operations exofs_aops;
 
+/* namei.c           */
+extern const struct inode_operations exofs_dir_inode_operations;
+extern const struct inode_operations exofs_special_inode_operations;
+
 /* symlink.c         */
 extern const struct inode_operations exofs_symlink_inode_operations;
 extern const struct inode_operations exofs_fast_symlink_inode_operations;
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 4d19dcd64d61..c33630b8e66b 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -436,3 +436,273 @@ int exofs_setattr(struct dentry *dentry, struct iattr *iattr)
 	error = inode_setattr(inode, iattr);
 	return error;
 }
+
+/*
+ * Read an inode from the OSD, and return it as is.  We also return the size
+ * attribute in the 'sanity' argument if we got compiled with debugging turned
+ * on.
+ */
+static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
+		    struct exofs_fcb *inode, uint64_t *sanity)
+{
+	struct exofs_sb_info *sbi = sb->s_fs_info;
+	struct osd_request *req = NULL;
+	uint32_t attr_page;
+	uint32_t attr_id;
+	uint16_t expected;
+	uint8_t *buf;
+	uint64_t o_id;
+	int ret;
+
+	o_id = oi->vfs_inode.i_ino + EXOFS_OBJ_OFF;
+
+	exofs_make_credential(oi->i_cred, sbi->s_pid, o_id);
+
+	req = prepare_osd_get_attr(sbi->s_dev, sbi->s_pid, o_id);
+	if (!req) {
+		EXOFS_ERR("ERROR: prepare get_attr failed.\n");
+		return -ENOMEM;
+	}
+
+	/* we need the inode attribute */
+	prepare_get_attr_list_add_entry(req,
+					OSD_PAGE_NUM_IBM_UOBJ_FS_DATA,
+					OSD_ATTR_NUM_IBM_UOBJ_FS_DATA_INODE,
+					EXOFS_INO_ATTR_SIZE);
+
+#ifdef EXOFS_DEBUG
+	/* we get the size attributes to do a sanity check */
+	prepare_get_attr_list_add_entry(req,
+					OSD_APAGE_OBJECT_INFORMATION,
+					OSD_ATTR_OI_LOGICAL_LENGTH, 8);
+#endif
+
+	ret = exofs_sync_op(req, sbi->s_timeout, oi->i_cred);
+	if (ret)
+		goto out;
+
+	attr_page = OSD_PAGE_NUM_IBM_UOBJ_FS_DATA;
+	attr_id = OSD_ATTR_NUM_IBM_UOBJ_FS_DATA_INODE;
+	expected = EXOFS_INO_ATTR_SIZE;
+	ret = extract_next_attr_from_req(req, &attr_page, &attr_id, &expected,
+					 &buf);
+	if (ret) {
+		EXOFS_ERR("ERROR: extract attr from req failed\n");
+		goto out;
+	}
+	memcpy(inode, buf, sizeof(struct exofs_fcb));
+
+#ifdef EXOFS_DEBUG
+	attr_page = OSD_APAGE_OBJECT_INFORMATION;
+	attr_id = OSD_ATTR_OI_LOGICAL_LENGTH;
+	expected = 8;
+	ret = extract_next_attr_from_req(req, &attr_page, &attr_id, &expected,
+					 &buf);
+	if (ret) {
+		EXOFS_ERR("ERROR: extract attr from req failed\n");
+		goto out;
+	}
+	*sanity = le64_to_cpu(*((uint64_t *) buf));
+#endif
+
+out:
+	free_osd_req(req);
+	return ret;
+}
+
+/*
+ * Fill in an inode read from the OSD and set it up for use
+ */
+struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
+{
+	struct exofs_i_info *oi;
+	struct exofs_fcb fcb;
+	struct inode *inode;
+	uint64_t sanity;
+	int ret;
+
+	inode = iget_locked(sb, ino);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+	if (!(inode->i_state & I_NEW))
+		return inode;
+	oi = exofs_i(inode);
+
+	/* read the inode from the osd */
+	ret = exofs_get_inode(sb, oi, &fcb, &sanity);
+	if (ret)
+		goto bad_inode;
+
+	init_waitqueue_head(&oi->i_wq);
+	set_obj_created(oi);
+
+	/* copy stuff from on-disk struct to in-memory struct */
+	inode->i_mode = le16_to_cpu(fcb.i_mode);
+	inode->i_uid = le32_to_cpu(fcb.i_uid);
+	inode->i_gid = le32_to_cpu(fcb.i_gid);
+	inode->i_nlink = le16_to_cpu(fcb.i_links_count);
+	inode->i_ctime.tv_sec = (signed)le32_to_cpu(fcb.i_ctime);
+	inode->i_atime.tv_sec = (signed)le32_to_cpu(fcb.i_atime);
+	inode->i_mtime.tv_sec = (signed)le32_to_cpu(fcb.i_mtime);
+	inode->i_ctime.tv_nsec =
+		inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec = 0;
+	oi->i_commit_size = le64_to_cpu(fcb.i_size);
+	i_size_write(inode, oi->i_commit_size);
+	inode->i_blkbits = EXOFS_BLKSHIFT;
+	inode->i_generation = le32_to_cpu(fcb.i_generation);
+
+#ifdef EXOFS_DEBUG
+	if ((inode->i_size != sanity) &&
+		(!exofs_inode_is_fast_symlink(inode))) {
+		EXOFS_ERR("WARNING: Size of object from inode and "
+			  "attributes differ (%lld != %llu)\n",
+			  inode->i_size, _LLU(sanity));
+	}
+#endif
+
+	oi->i_dir_start_lookup = 0;
+
+	if ((inode->i_nlink == 0) && (inode->i_mode == 0)) {
+		ret = -ESTALE;
+		goto bad_inode;
+	}
+
+	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
+		if (fcb.i_data[0])
+			inode->i_rdev =
+				old_decode_dev(le32_to_cpu(fcb.i_data[0]));
+		else
+			inode->i_rdev =
+				new_decode_dev(le32_to_cpu(fcb.i_data[1]));
+	} else {
+		memcpy(oi->i_data, fcb.i_data, sizeof(fcb.i_data));
+	}
+
+	if (S_ISREG(inode->i_mode)) {
+		inode->i_op = &exofs_file_inode_operations;
+		inode->i_fop = &exofs_file_operations;
+		inode->i_mapping->a_ops = &exofs_aops;
+	} else if (S_ISDIR(inode->i_mode)) {
+		inode->i_op = &exofs_dir_inode_operations;
+		inode->i_fop = &exofs_dir_operations;
+		inode->i_mapping->a_ops = &exofs_aops;
+	} else if (S_ISLNK(inode->i_mode)) {
+		if (exofs_inode_is_fast_symlink(inode))
+			inode->i_op = &exofs_fast_symlink_inode_operations;
+		else {
+			inode->i_op = &exofs_symlink_inode_operations;
+			inode->i_mapping->a_ops = &exofs_aops;
+		}
+	} else {
+		inode->i_op = &exofs_special_inode_operations;
+		if (fcb.i_data[0])
+			init_special_inode(inode, inode->i_mode,
+			   old_decode_dev(le32_to_cpu(fcb.i_data[0])));
+		else
+			init_special_inode(inode, inode->i_mode,
+			   new_decode_dev(le32_to_cpu(fcb.i_data[1])));
+	}
+
+	unlock_new_inode(inode);
+	return inode;
+
+bad_inode:
+	iget_failed(inode);
+	return ERR_PTR(ret);
+}
+
+/*
+ * Callback function from exofs_new_inode().  The important thing is that we
+ * set the obj_created flag so that other methods know that the object exists on
+ * the OSD.
+ */
+static void create_done(struct osd_request *req, void *p)
+{
+	struct inode *inode = p;
+	struct exofs_i_info *oi = exofs_i(inode);
+	struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
+	int ret;
+
+	ret = exofs_check_ok(req);
+	free_osd_req(req);
+	atomic_dec(&sbi->s_curr_pending);
+
+	if (ret)
+		make_bad_inode(inode);
+	else
+		set_obj_created(oi);
+
+	atomic_dec(&inode->i_count);
+}
+
+/*
+ * Set up a new inode and create an object for it on the OSD
+ */
+struct inode *exofs_new_inode(struct inode *dir, int mode)
+{
+	struct super_block *sb;
+	struct inode *inode;
+	struct exofs_i_info *oi;
+	struct exofs_sb_info *sbi;
+	struct osd_request *req = NULL;
+	int ret;
+
+	sb = dir->i_sb;
+	inode = new_inode(sb);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+
+	oi = exofs_i(inode);
+
+	init_waitqueue_head(&oi->i_wq);
+	set_obj_2bcreated(oi);
+
+	sbi = sb->s_fs_info;
+
+	sb->s_dirt = 1;
+	inode->i_uid = current->cred->fsuid;
+	if (dir->i_mode & S_ISGID) {
+		inode->i_gid = dir->i_gid;
+		if (S_ISDIR(mode))
+			mode |= S_ISGID;
+	} else {
+		inode->i_gid = current->cred->fsgid;
+	}
+	inode->i_mode = mode;
+
+	inode->i_ino = sbi->s_nextid++;
+	inode->i_blkbits = EXOFS_BLKSHIFT;
+	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+	oi->i_commit_size = inode->i_size = 0;
+	spin_lock(&sbi->s_next_gen_lock);
+	inode->i_generation = sbi->s_next_generation++;
+	spin_unlock(&sbi->s_next_gen_lock);
+	insert_inode_hash(inode);
+
+	mark_inode_dirty(inode);
+
+	req = prepare_osd_create(sbi->s_dev, sbi->s_pid,
+				 inode->i_ino + EXOFS_OBJ_OFF);
+	if (!req) {
+		EXOFS_ERR("ERROR: prepare_osd_create failed\n");
+		return ERR_PTR(-EIO);
+	}
+
+	exofs_make_credential(oi->i_cred, sbi->s_pid,
+			      inode->i_ino + EXOFS_OBJ_OFF);
+
+	/* increment the refcount so that the inode will still be around when we
+	 * reach the callback
+	 */
+	atomic_inc(&inode->i_count);
+
+	ret = exofs_async_op(req, create_done, inode, oi->i_cred);
+	if (ret) {
+		atomic_dec(&inode->i_count);
+		free_osd_req(req);
+		return ERR_PTR(-EIO);
+	}
+	atomic_inc(&sbi->s_curr_pending);
+
+	return inode;
+}
diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c
new file mode 100644
index 000000000000..4220d90f0fad
--- /dev/null
+++ b/fs/exofs/namei.c
@@ -0,0 +1,338 @@
+/*
+ * Copyright (C) 2005, 2006
+ * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
+ * Copyright (C) 2005, 2006
+ * International Business Machines
+ * Copyright (C) 2008, 2009
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * Copyrights for code taken from ext2:
+ *     Copyright (C) 1992, 1993, 1994, 1995
+ *     Remy Card (card@masi.ibp.fr)
+ *     Laboratoire MASI - Institut Blaise Pascal
+ *     Universite Pierre et Marie Curie (Paris VI)
+ *     from
+ *     linux/fs/minix/inode.c
+ *     Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ * This file is part of exofs.
+ *
+ * exofs is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.  Since it is based on ext2, and the only
+ * valid version of GPL for the Linux kernel is version 2, the only valid
+ * version of GPL for exofs is version 2.
+ *
+ * exofs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with exofs; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#include "exofs.h"
+
+static inline int exofs_add_nondir(struct dentry *dentry, struct inode *inode)
+{
+	int err = exofs_add_link(dentry, inode);
+	if (!err) {
+		d_instantiate(dentry, inode);
+		return 0;
+	}
+	inode_dec_link_count(inode);
+	iput(inode);
+	return err;
+}
+
+static struct dentry *exofs_lookup(struct inode *dir, struct dentry *dentry,
+				   struct nameidata *nd)
+{
+	struct inode *inode;
+	ino_t ino;
+
+	if (dentry->d_name.len > EXOFS_NAME_LEN)
+		return ERR_PTR(-ENAMETOOLONG);
+
+	ino = exofs_inode_by_name(dir, dentry);
+	inode = NULL;
+	if (ino) {
+		inode = exofs_iget(dir->i_sb, ino);
+		if (IS_ERR(inode))
+			return ERR_CAST(inode);
+	}
+	return d_splice_alias(inode, dentry);
+}
+
+static int exofs_create(struct inode *dir, struct dentry *dentry, int mode,
+			 struct nameidata *nd)
+{
+	struct inode *inode = exofs_new_inode(dir, mode);
+	int err = PTR_ERR(inode);
+	if (!IS_ERR(inode)) {
+		inode->i_op = &exofs_file_inode_operations;
+		inode->i_fop = &exofs_file_operations;
+		inode->i_mapping->a_ops = &exofs_aops;
+		mark_inode_dirty(inode);
+		err = exofs_add_nondir(dentry, inode);
+	}
+	return err;
+}
+
+static int exofs_mknod(struct inode *dir, struct dentry *dentry, int mode,
+		       dev_t rdev)
+{
+	struct inode *inode;
+	int err;
+
+	if (!new_valid_dev(rdev))
+		return -EINVAL;
+
+	inode = exofs_new_inode(dir, mode);
+	err = PTR_ERR(inode);
+	if (!IS_ERR(inode)) {
+		init_special_inode(inode, inode->i_mode, rdev);
+		mark_inode_dirty(inode);
+		err = exofs_add_nondir(dentry, inode);
+	}
+	return err;
+}
+
+static int exofs_symlink(struct inode *dir, struct dentry *dentry,
+			  const char *symname)
+{
+	struct super_block *sb = dir->i_sb;
+	int err = -ENAMETOOLONG;
+	unsigned l = strlen(symname)+1;
+	struct inode *inode;
+	struct exofs_i_info *oi;
+
+	if (l > sb->s_blocksize)
+		goto out;
+
+	inode = exofs_new_inode(dir, S_IFLNK | S_IRWXUGO);
+	err = PTR_ERR(inode);
+	if (IS_ERR(inode))
+		goto out;
+
+	oi = exofs_i(inode);
+	if (l > sizeof(oi->i_data)) {
+		/* slow symlink */
+		inode->i_op = &exofs_symlink_inode_operations;
+		inode->i_mapping->a_ops = &exofs_aops;
+		memset(oi->i_data, 0, sizeof(oi->i_data));
+
+		err = page_symlink(inode, symname, l);
+		if (err)
+			goto out_fail;
+	} else {
+		/* fast symlink */
+		inode->i_op = &exofs_fast_symlink_inode_operations;
+		memcpy(oi->i_data, symname, l);
+		inode->i_size = l-1;
+	}
+	mark_inode_dirty(inode);
+
+	err = exofs_add_nondir(dentry, inode);
+out:
+	return err;
+
+out_fail:
+	inode_dec_link_count(inode);
+	iput(inode);
+	goto out;
+}
+
+static int exofs_link(struct dentry *old_dentry, struct inode *dir,
+		struct dentry *dentry)
+{
+	struct inode *inode = old_dentry->d_inode;
+
+	if (inode->i_nlink >= EXOFS_LINK_MAX)
+		return -EMLINK;
+
+	inode->i_ctime = CURRENT_TIME;
+	inode_inc_link_count(inode);
+	atomic_inc(&inode->i_count);
+
+	return exofs_add_nondir(dentry, inode);
+}
+
+static int exofs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+	struct inode *inode;
+	int err = -EMLINK;
+
+	if (dir->i_nlink >= EXOFS_LINK_MAX)
+		goto out;
+
+	inode_inc_link_count(dir);
+
+	inode = exofs_new_inode(dir, S_IFDIR | mode);
+	err = PTR_ERR(inode);
+	if (IS_ERR(inode))
+		goto out_dir;
+
+	inode->i_op = &exofs_dir_inode_operations;
+	inode->i_fop = &exofs_dir_operations;
+	inode->i_mapping->a_ops = &exofs_aops;
+
+	inode_inc_link_count(inode);
+
+	err = exofs_make_empty(inode, dir);
+	if (err)
+		goto out_fail;
+
+	err = exofs_add_link(dentry, inode);
+	if (err)
+		goto out_fail;
+
+	d_instantiate(dentry, inode);
+out:
+	return err;
+
+out_fail:
+	inode_dec_link_count(inode);
+	inode_dec_link_count(inode);
+	iput(inode);
+out_dir:
+	inode_dec_link_count(dir);
+	goto out;
+}
+
+static int exofs_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct inode *inode = dentry->d_inode;
+	struct exofs_dir_entry *de;
+	struct page *page;
+	int err = -ENOENT;
+
+	de = exofs_find_entry(dir, dentry, &page);
+	if (!de)
+		goto out;
+
+	err = exofs_delete_entry(de, page);
+	if (err)
+		goto out;
+
+	inode->i_ctime = dir->i_ctime;
+	inode_dec_link_count(inode);
+	err = 0;
+out:
+	return err;
+}
+
+static int exofs_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	struct inode *inode = dentry->d_inode;
+	int err = -ENOTEMPTY;
+
+	if (exofs_empty_dir(inode)) {
+		err = exofs_unlink(dir, dentry);
+		if (!err) {
+			inode->i_size = 0;
+			inode_dec_link_count(inode);
+			inode_dec_link_count(dir);
+		}
+	}
+	return err;
+}
+
+static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry,
+		struct inode *new_dir, struct dentry *new_dentry)
+{
+	struct inode *old_inode = old_dentry->d_inode;
+	struct inode *new_inode = new_dentry->d_inode;
+	struct page *dir_page = NULL;
+	struct exofs_dir_entry *dir_de = NULL;
+	struct page *old_page;
+	struct exofs_dir_entry *old_de;
+	int err = -ENOENT;
+
+	old_de = exofs_find_entry(old_dir, old_dentry, &old_page);
+	if (!old_de)
+		goto out;
+
+	if (S_ISDIR(old_inode->i_mode)) {
+		err = -EIO;
+		dir_de = exofs_dotdot(old_inode, &dir_page);
+		if (!dir_de)
+			goto out_old;
+	}
+
+	if (new_inode) {
+		struct page *new_page;
+		struct exofs_dir_entry *new_de;
+
+		err = -ENOTEMPTY;
+		if (dir_de && !exofs_empty_dir(new_inode))
+			goto out_dir;
+
+		err = -ENOENT;
+		new_de = exofs_find_entry(new_dir, new_dentry, &new_page);
+		if (!new_de)
+			goto out_dir;
+		inode_inc_link_count(old_inode);
+		exofs_set_link(new_dir, new_de, new_page, old_inode);
+		new_inode->i_ctime = CURRENT_TIME;
+		if (dir_de)
+			drop_nlink(new_inode);
+		inode_dec_link_count(new_inode);
+	} else {
+		if (dir_de) {
+			err = -EMLINK;
+			if (new_dir->i_nlink >= EXOFS_LINK_MAX)
+				goto out_dir;
+		}
+		inode_inc_link_count(old_inode);
+		err = exofs_add_link(new_dentry, old_inode);
+		if (err) {
+			inode_dec_link_count(old_inode);
+			goto out_dir;
+		}
+		if (dir_de)
+			inode_inc_link_count(new_dir);
+	}
+
+	old_inode->i_ctime = CURRENT_TIME;
+
+	exofs_delete_entry(old_de, old_page);
+	inode_dec_link_count(old_inode);
+
+	if (dir_de) {
+		exofs_set_link(old_inode, dir_de, dir_page, new_dir);
+		inode_dec_link_count(old_dir);
+	}
+	return 0;
+
+
+out_dir:
+	if (dir_de) {
+		kunmap(dir_page);
+		page_cache_release(dir_page);
+	}
+out_old:
+	kunmap(old_page);
+	page_cache_release(old_page);
+out:
+	return err;
+}
+
+const struct inode_operations exofs_dir_inode_operations = {
+	.create 	= exofs_create,
+	.lookup 	= exofs_lookup,
+	.link   	= exofs_link,
+	.unlink 	= exofs_unlink,
+	.symlink	= exofs_symlink,
+	.mkdir  	= exofs_mkdir,
+	.rmdir  	= exofs_rmdir,
+	.mknod  	= exofs_mknod,
+	.rename 	= exofs_rename,
+	.setattr	= exofs_setattr,
+};
+
+const struct inode_operations exofs_special_inode_operations = {
+	.setattr	= exofs_setattr,
+};
-- 
cgit v1.2.3


From e4cc7e048e0470a110a9414493cb41a913b2081a Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <bharrosh@panasas.com>
Date: Tue, 28 Oct 2008 16:11:41 +0200
Subject: exofs: super_operations and file_system_type

This patch ties all operation vectors into a file system superblock
and registers the exofs file_system_type at module's load time.

* The file system control block (AKA on-disk superblock) resides in
  an object with a special ID (defined in common.h).
  Information included in the file system control block is used to
  fill the in-memory superblock structure at mount time. This object
  is created before the file system is used by mkexofs.c It contains
  information such as:
	- The file system's magic number
	- The next inode number to be allocated

Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
---
 fs/exofs/Kbuild  |   2 +-
 fs/exofs/exofs.h |  24 +++
 fs/exofs/inode.c | 181 ++++++++++++++++++++
 fs/exofs/super.c | 504 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 710 insertions(+), 1 deletion(-)
 create mode 100644 fs/exofs/super.c

(limited to 'fs')

diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild
index 61162c6eedfe..22685022d495 100644
--- a/fs/exofs/Kbuild
+++ b/fs/exofs/Kbuild
@@ -26,5 +26,5 @@ KBUILD_CPPFLAGS := -I$(OSD_INC) $(KBUILD_CPPFLAGS)
 
 endif
 
-exofs-objs := osd.o inode.o file.o symlink.o namei.o dir.o
+exofs-objs := osd.o inode.o file.o symlink.o namei.o dir.o super.o
 obj-$(CONFIG_EXOFS_FS) += exofs.o
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index c2fa63d47084..c1d0980ec359 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -53,6 +53,17 @@
 /* u64 has problems with printk this will cast it to unsigned long long */
 #define _LLU(x) (unsigned long long)(x)
 
+/*
+ * struct to hold what we get from mount options
+ */
+struct exofs_mountopt {
+	const char *dev_name;
+	uint64_t pid;
+	int timeout;
+	bool mkfs;
+	int format; /*in Mbyte*/
+};
+
 /*
  * our extension to the in-memory superblock
  */
@@ -115,6 +126,14 @@ static inline struct exofs_i_info *exofs_i(struct inode *inode)
 	return container_of(inode, struct exofs_i_info, vfs_inode);
 }
 
+/*
+ * ugly struct so that we can pass two arguments to update_inode's callback
+ */
+struct updatei_args {
+	struct exofs_sb_info	*sbi;
+	struct exofs_fcb	fcb;
+};
+
 /*
  * Maximum count of links to a file
  */
@@ -178,6 +197,8 @@ int exofs_write_begin(struct file *file, struct address_space *mapping,
 		struct page **pagep, void **fsdata);
 extern struct inode *exofs_iget(struct super_block *, unsigned long);
 struct inode *exofs_new_inode(struct inode *, int);
+extern int exofs_write_inode(struct inode *, int);
+extern void exofs_delete_inode(struct inode *);
 
 /* dir.c:                */
 int exofs_add_link(struct dentry *, struct inode *);
@@ -208,6 +229,9 @@ extern const struct address_space_operations exofs_aops;
 extern const struct inode_operations exofs_dir_inode_operations;
 extern const struct inode_operations exofs_special_inode_operations;
 
+/* super.c           */
+extern const struct super_operations exofs_sops;
+
 /* symlink.c         */
 extern const struct inode_operations exofs_symlink_inode_operations;
 extern const struct inode_operations exofs_fast_symlink_inode_operations;
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index c33630b8e66b..80314258007d 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -706,3 +706,184 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
 
 	return inode;
 }
+
+/*
+ * Callback function from exofs_update_inode().
+ */
+static void updatei_done(struct osd_request *req, void *p)
+{
+	struct updatei_args *args = p;
+
+	free_osd_req(req);
+
+	atomic_dec(&args->sbi->s_curr_pending);
+
+	kfree(args);
+}
+
+/*
+ * Write the inode to the OSD.  Just fill up the struct, and set the attribute
+ * synchronously or asynchronously depending on the do_sync flag.
+ */
+static int exofs_update_inode(struct inode *inode, int do_sync)
+{
+	struct exofs_i_info *oi = exofs_i(inode);
+	struct super_block *sb = inode->i_sb;
+	struct exofs_sb_info *sbi = sb->s_fs_info;
+	struct osd_request *req;
+	struct exofs_fcb *fcb;
+	struct updatei_args *args;
+	int ret;
+
+	args = kzalloc(sizeof(*args), GFP_KERNEL);
+	if (!args)
+		return -ENOMEM;
+
+	fcb = &args->fcb;
+
+	fcb->i_mode = cpu_to_le16(inode->i_mode);
+	fcb->i_uid = cpu_to_le32(inode->i_uid);
+	fcb->i_gid = cpu_to_le32(inode->i_gid);
+	fcb->i_links_count = cpu_to_le16(inode->i_nlink);
+	fcb->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
+	fcb->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
+	fcb->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
+	oi->i_commit_size = i_size_read(inode);
+	fcb->i_size = cpu_to_le64(oi->i_commit_size);
+	fcb->i_generation = cpu_to_le32(inode->i_generation);
+
+	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
+		if (old_valid_dev(inode->i_rdev)) {
+			fcb->i_data[0] =
+				cpu_to_le32(old_encode_dev(inode->i_rdev));
+			fcb->i_data[1] = 0;
+		} else {
+			fcb->i_data[0] = 0;
+			fcb->i_data[1] =
+				cpu_to_le32(new_encode_dev(inode->i_rdev));
+			fcb->i_data[2] = 0;
+		}
+	} else
+		memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data));
+
+	req = prepare_osd_set_attr(sbi->s_dev, sbi->s_pid,
+				 (uint64_t) (inode->i_ino + EXOFS_OBJ_OFF));
+	if (!req) {
+		EXOFS_ERR("ERROR: prepare set_attr failed.\n");
+		ret = -ENOMEM;
+		goto free_args;
+	}
+
+	prepare_set_attr_list_add_entry(req,
+					OSD_PAGE_NUM_IBM_UOBJ_FS_DATA,
+					OSD_ATTR_NUM_IBM_UOBJ_FS_DATA_INODE,
+					EXOFS_INO_ATTR_SIZE,
+					(unsigned char *)fcb);
+
+	if (!obj_created(oi)) {
+		BUG_ON(!obj_2bcreated(oi));
+		wait_event(oi->i_wq, obj_created(oi));
+	}
+
+	if (do_sync) {
+		ret = exofs_sync_op(req, sbi->s_timeout, oi->i_cred);
+		free_osd_req(req);
+		goto free_args;
+	} else {
+		args->sbi = sbi;
+
+		ret = exofs_async_op(req, updatei_done, args, oi->i_cred);
+		if (ret) {
+			free_osd_req(req);
+			goto free_args;
+		}
+		atomic_inc(&sbi->s_curr_pending);
+		goto out; /* deallocation in updatei_done */
+	}
+
+free_args:
+	kfree(args);
+out:
+	return ret;
+}
+
+int exofs_write_inode(struct inode *inode, int wait)
+{
+	return exofs_update_inode(inode, wait);
+}
+
+int exofs_sync_inode(struct inode *inode)
+{
+	struct writeback_control wbc = {
+		.sync_mode = WB_SYNC_ALL,
+		.nr_to_write = 0,	/* sys_fsync did this */
+	};
+
+	return sync_inode(inode, &wbc);
+}
+
+/*
+ * Callback function from exofs_delete_inode() - don't have much cleaning up to
+ * do.
+ */
+static void delete_done(struct osd_request *req, void *p)
+{
+	struct exofs_sb_info *sbi;
+	free_osd_req(req);
+	sbi = p;
+	atomic_dec(&sbi->s_curr_pending);
+}
+
+/*
+ * Called when the refcount of an inode reaches zero.  We remove the object
+ * from the OSD here.  We make sure the object was created before we try and
+ * delete it.
+ */
+void exofs_delete_inode(struct inode *inode)
+{
+	struct exofs_i_info *oi = exofs_i(inode);
+	struct osd_request *req = NULL;
+	struct super_block *sb = inode->i_sb;
+	struct exofs_sb_info *sbi = sb->s_fs_info;
+	int ret;
+
+	truncate_inode_pages(&inode->i_data, 0);
+
+	if (is_bad_inode(inode))
+		goto no_delete;
+	mark_inode_dirty(inode);
+	exofs_update_inode(inode, inode_needs_sync(inode));
+
+	inode->i_size = 0;
+	if (inode->i_blocks)
+		exofs_truncate(inode);
+
+	clear_inode(inode);
+
+	req = prepare_osd_remove(sbi->s_dev, sbi->s_pid,
+				 inode->i_ino + EXOFS_OBJ_OFF);
+	if (!req) {
+		EXOFS_ERR("ERROR: prepare_osd_remove failed\n");
+		return;
+	}
+
+	/* if we are deleting an obj that hasn't been created yet, wait */
+	if (!obj_created(oi)) {
+		BUG_ON(!obj_2bcreated(oi));
+		wait_event(oi->i_wq, obj_created(oi));
+	}
+
+	ret = exofs_async_op(req, delete_done, sbi, oi->i_cred);
+	if (ret) {
+		EXOFS_ERR(
+		       "ERROR: @exofs_delete_inode exofs_async_op failed\n");
+		free_osd_req(req);
+		return;
+	}
+	atomic_inc(&sbi->s_curr_pending);
+
+	return;
+
+no_delete:
+	clear_inode(inode);
+}
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
new file mode 100644
index 000000000000..53d8b52e9ae7
--- /dev/null
+++ b/fs/exofs/super.c
@@ -0,0 +1,504 @@
+/*
+ * Copyright (C) 2005, 2006
+ * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
+ * Copyright (C) 2005, 2006
+ * International Business Machines
+ * Copyright (C) 2008, 2009
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * Copyrights for code taken from ext2:
+ *     Copyright (C) 1992, 1993, 1994, 1995
+ *     Remy Card (card@masi.ibp.fr)
+ *     Laboratoire MASI - Institut Blaise Pascal
+ *     Universite Pierre et Marie Curie (Paris VI)
+ *     from
+ *     linux/fs/minix/inode.c
+ *     Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ * This file is part of exofs.
+ *
+ * exofs is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.  Since it is based on ext2, and the only
+ * valid version of GPL for the Linux kernel is version 2, the only valid
+ * version of GPL for exofs is version 2.
+ *
+ * exofs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with exofs; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#include <linux/string.h>
+#include <linux/parser.h>
+#include <linux/vfs.h>
+#include <linux/random.h>
+
+#include "exofs.h"
+
+/******************************************************************************
+ * MOUNT OPTIONS
+ *****************************************************************************/
+
+/*
+ * exofs-specific mount-time options.
+ */
+enum { Opt_pid, Opt_to, Opt_mkfs, Opt_format, Opt_err };
+
+/*
+ * Our mount-time options.  These should ideally be 64-bit unsigned, but the
+ * kernel's parsing functions do not currently support that.  32-bit should be
+ * sufficient for most applications now.
+ */
+static match_table_t tokens = {
+	{Opt_pid, "pid=%u"},
+	{Opt_to, "to=%u"},
+	{Opt_err, NULL}
+};
+
+/*
+ * The main option parsing method.  Also makes sure that all of the mandatory
+ * mount options were set.
+ */
+static int parse_options(char *options, struct exofs_mountopt *opts)
+{
+	char *p;
+	substring_t args[MAX_OPT_ARGS];
+	int option;
+	int s_pid = 0;
+
+	EXOFS_DBGMSG("parse_options %s\n", options);
+	/* defaults */
+	memset(opts, 0, sizeof(*opts));
+	opts->timeout = BLK_DEFAULT_SG_TIMEOUT;
+
+	while ((p = strsep(&options, ",")) != NULL) {
+		int token;
+
+		if (!*p)
+			continue;
+
+		token = match_token(p, tokens, args);
+		switch (token) {
+		case Opt_pid:
+			if (match_int(&args[0], &option))
+				return -EINVAL;
+			if (option < 65536) {
+				EXOFS_ERR("Partition ID must be >= 65536");
+				return -EINVAL;
+			}
+			opts->pid = option;
+			s_pid = 1;
+			break;
+		case Opt_to:
+			if (match_int(&args[0], &option))
+				return -EINVAL;
+			if (option <= 0) {
+				EXOFS_ERR("Timout must be > 0");
+				return -EINVAL;
+			}
+			opts->timeout = option * HZ;
+			break;
+		}
+	}
+
+	if (!s_pid) {
+		EXOFS_ERR("Need to specify the following options:\n");
+		EXOFS_ERR("    -o pid=pid_no_to_use\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/******************************************************************************
+ * INODE CACHE
+ *****************************************************************************/
+
+/*
+ * Our inode cache.  Isn't it pretty?
+ */
+static struct kmem_cache *exofs_inode_cachep;
+
+/*
+ * Allocate an inode in the cache
+ */
+static struct inode *exofs_alloc_inode(struct super_block *sb)
+{
+	struct exofs_i_info *oi;
+
+	oi = kmem_cache_alloc(exofs_inode_cachep, GFP_KERNEL);
+	if (!oi)
+		return NULL;
+
+	oi->vfs_inode.i_version = 1;
+	return &oi->vfs_inode;
+}
+
+/*
+ * Remove an inode from the cache
+ */
+static void exofs_destroy_inode(struct inode *inode)
+{
+	kmem_cache_free(exofs_inode_cachep, exofs_i(inode));
+}
+
+/*
+ * Initialize the inode
+ */
+static void exofs_init_once(void *foo)
+{
+	struct exofs_i_info *oi = foo;
+
+	inode_init_once(&oi->vfs_inode);
+}
+
+/*
+ * Create and initialize the inode cache
+ */
+static int init_inodecache(void)
+{
+	exofs_inode_cachep = kmem_cache_create("exofs_inode_cache",
+				sizeof(struct exofs_i_info), 0,
+				SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+				exofs_init_once);
+	if (exofs_inode_cachep == NULL)
+		return -ENOMEM;
+	return 0;
+}
+
+/*
+ * Destroy the inode cache
+ */
+static void destroy_inodecache(void)
+{
+	kmem_cache_destroy(exofs_inode_cachep);
+}
+
+/******************************************************************************
+ * SUPERBLOCK FUNCTIONS
+ *****************************************************************************/
+
+/*
+ * Write the superblock to the OSD
+ */
+static void exofs_write_super(struct super_block *sb)
+{
+	struct exofs_sb_info *sbi;
+	struct exofs_fscb *fscb = NULL;
+	struct osd_request *req = NULL;
+
+	fscb = kzalloc(sizeof(struct exofs_fscb), GFP_KERNEL);
+	if (!fscb)
+		return;
+
+	lock_kernel();
+	sbi = sb->s_fs_info;
+	fscb->s_nextid = cpu_to_le64(sbi->s_nextid);
+	fscb->s_numfiles = cpu_to_le32(sbi->s_numfiles);
+	fscb->s_magic = cpu_to_le16(sb->s_magic);
+	fscb->s_newfs = 0;
+
+	req = prepare_osd_write(sbi->s_dev, sbi->s_pid, EXOFS_SUPER_ID,
+				sizeof(struct exofs_fscb), 0, fscb);
+	if (!req) {
+		EXOFS_ERR("ERROR: write super failed.\n");
+		goto out;
+	}
+
+	exofs_sync_op(req, sbi->s_timeout, sbi->s_cred);
+	free_osd_req(req);
+	sb->s_dirt = 0;
+
+out:
+	unlock_kernel();
+	kfree(fscb);
+}
+
+/*
+ * This function is called when the vfs is freeing the superblock.  We just
+ * need to free our own part.
+ */
+static void exofs_put_super(struct super_block *sb)
+{
+	int num_pend;
+	struct exofs_sb_info *sbi = sb->s_fs_info;
+
+	/* make sure there are no pending commands */
+	for (num_pend = atomic_read(&sbi->s_curr_pending); num_pend > 0;
+	     num_pend = atomic_read(&sbi->s_curr_pending)) {
+		wait_queue_head_t wq;
+		init_waitqueue_head(&wq);
+		wait_event_timeout(wq,
+				  (atomic_read(&sbi->s_curr_pending) == 0),
+				  msecs_to_jiffies(100));
+	}
+
+	osduld_put_device(sbi->s_dev);
+	kfree(sb->s_fs_info);
+	sb->s_fs_info = NULL;
+}
+
+/*
+ * Read the superblock from the OSD and fill in the fields
+ */
+static int exofs_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct inode *root;
+	struct exofs_mountopt *opts = data;
+	struct exofs_sb_info *sbi = NULL;    /*extended info                  */
+	struct exofs_fscb fscb;		     /*on-disk superblock info        */
+	struct osd_request *req = NULL;
+	int ret;
+
+	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
+	if (!sbi)
+		return -ENOMEM;
+	sb->s_fs_info = sbi;
+
+	/* use mount options to fill superblock */
+	sbi->s_dev = osduld_path_lookup(opts->dev_name);
+	if (IS_ERR(sbi->s_dev)) {
+		ret = PTR_ERR(sbi->s_dev);
+		sbi->s_dev = NULL;
+		goto free_sbi;
+	}
+
+	sbi->s_pid = opts->pid;
+	sbi->s_timeout = opts->timeout;
+
+	/* fill in some other data by hand */
+	memset(sb->s_id, 0, sizeof(sb->s_id));
+	strcpy(sb->s_id, "exofs");
+	sb->s_blocksize = EXOFS_BLKSIZE;
+	sb->s_blocksize_bits = EXOFS_BLKSHIFT;
+	atomic_set(&sbi->s_curr_pending, 0);
+	sb->s_bdev = NULL;
+	sb->s_dev = 0;
+
+	/* read data from on-disk superblock object */
+	exofs_make_credential(sbi->s_cred, sbi->s_pid, EXOFS_SUPER_ID);
+
+	req = prepare_osd_read(sbi->s_dev, sbi->s_pid, EXOFS_SUPER_ID,
+			       sizeof(struct exofs_fscb), 0, &fscb);
+	if (!req) {
+		if (!silent)
+			EXOFS_ERR("ERROR: could not prepare read request.\n");
+		ret = -ENOMEM;
+		goto free_sbi;
+	}
+
+	ret = exofs_sync_op(req, sbi->s_timeout, sbi->s_cred);
+	if (ret != 0) {
+		if (!silent)
+			EXOFS_ERR("ERROR: read super failed.\n");
+		ret = -EIO;
+		goto free_sbi;
+	}
+
+	sb->s_magic = le16_to_cpu(fscb.s_magic);
+	sbi->s_nextid = le64_to_cpu(fscb.s_nextid);
+	sbi->s_numfiles = le32_to_cpu(fscb.s_numfiles);
+
+	/* make sure what we read from the object store is correct */
+	if (sb->s_magic != EXOFS_SUPER_MAGIC) {
+		if (!silent)
+			EXOFS_ERR("ERROR: Bad magic value\n");
+		ret = -EINVAL;
+		goto free_sbi;
+	}
+
+	/* start generation numbers from a random point */
+	get_random_bytes(&sbi->s_next_generation, sizeof(u32));
+	spin_lock_init(&sbi->s_next_gen_lock);
+
+	/* set up operation vectors */
+	sb->s_op = &exofs_sops;
+	root = exofs_iget(sb, EXOFS_ROOT_ID - EXOFS_OBJ_OFF);
+	if (IS_ERR(root)) {
+		EXOFS_ERR("ERROR: exofs_iget failed\n");
+		ret = PTR_ERR(root);
+		goto free_sbi;
+	}
+	sb->s_root = d_alloc_root(root);
+	if (!sb->s_root) {
+		iput(root);
+		EXOFS_ERR("ERROR: get root inode failed\n");
+		ret = -ENOMEM;
+		goto free_sbi;
+	}
+
+	if (!S_ISDIR(root->i_mode)) {
+		dput(sb->s_root);
+		sb->s_root = NULL;
+		EXOFS_ERR("ERROR: corrupt root inode (mode = %hd)\n",
+		       root->i_mode);
+		ret = -EINVAL;
+		goto free_sbi;
+	}
+
+	ret = 0;
+out:
+	if (req)
+		free_osd_req(req);
+	return ret;
+
+free_sbi:
+	osduld_put_device(sbi->s_dev); /* NULL safe */
+	kfree(sbi);
+	goto out;
+}
+
+/*
+ * Set up the superblock (calls exofs_fill_super eventually)
+ */
+static int exofs_get_sb(struct file_system_type *type,
+			  int flags, const char *dev_name,
+			  void *data, struct vfsmount *mnt)
+{
+	struct exofs_mountopt opts;
+	int ret;
+
+	ret = parse_options(data, &opts);
+	if (ret)
+		return ret;
+
+	opts.dev_name = dev_name;
+	return get_sb_nodev(type, flags, &opts, exofs_fill_super, mnt);
+}
+
+/*
+ * Return information about the file system state in the buffer.  This is used
+ * by the 'df' command, for example.
+ */
+static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct super_block *sb = dentry->d_sb;
+	struct exofs_sb_info *sbi = sb->s_fs_info;
+	uint8_t cred_a[OSD_CAP_LEN];
+	struct osd_request *req = NULL;
+	uint32_t attr_page;
+	uint32_t attr_id;
+	uint16_t expected;
+	uint64_t capacity;
+	uint64_t used;
+	uint8_t *data;
+	int ret;
+
+	/* get used/capacity attributes */
+	exofs_make_credential(cred_a, sbi->s_pid, 0);
+
+	req = prepare_osd_get_attr(sbi->s_dev, sbi->s_pid, 0);
+	if (!req) {
+		EXOFS_ERR("ERROR: prepare get_attr failed.\n");
+		return -1;
+	}
+
+	prepare_get_attr_list_add_entry(req,
+					OSD_APAGE_PARTITION_QUOTAS,
+					OSD_ATTR_PQ_CAPACITY_QUOTA, 8);
+
+	prepare_get_attr_list_add_entry(req,
+					OSD_APAGE_PARTITION_INFORMATION,
+					OSD_ATTR_PI_USED_CAPACITY, 8);
+
+	ret = exofs_sync_op(req, sbi->s_timeout, cred_a);
+	if (ret)
+		goto out;
+
+	attr_page = OSD_APAGE_PARTITION_QUOTAS;
+	attr_id = OSD_ATTR_PQ_CAPACITY_QUOTA;
+	expected = 8;
+	ret = extract_next_attr_from_req(req, &attr_page, &attr_id, &expected,
+					 &data);
+	if (ret) {
+		EXOFS_ERR("ERROR: extract attr from req failed\n");
+		goto out;
+	}
+	capacity = get_unaligned_le64(data);
+
+	attr_page = OSD_APAGE_PARTITION_INFORMATION;
+	attr_id = OSD_ATTR_PI_USED_CAPACITY;
+	expected = 8;
+	ret = extract_next_attr_from_req(req, &attr_page, &attr_id, &expected,
+					 &data);
+	if (ret) {
+		EXOFS_ERR("ERROR: extract attr from req failed\n");
+		goto out;
+	}
+	used = get_unaligned_le64(data);
+
+	/* fill in the stats buffer */
+	buf->f_type = EXOFS_SUPER_MAGIC;
+	buf->f_bsize = EXOFS_BLKSIZE;
+	buf->f_blocks = (capacity >> EXOFS_BLKSHIFT);
+	buf->f_bfree = ((capacity - used) >> EXOFS_BLKSHIFT);
+	buf->f_bavail = buf->f_bfree;
+	buf->f_files = sbi->s_numfiles;
+	buf->f_ffree = EXOFS_MAX_ID - sbi->s_numfiles;
+	buf->f_namelen = EXOFS_NAME_LEN;
+out:
+	free_osd_req(req);
+
+	return ret;
+}
+
+const struct super_operations exofs_sops = {
+	.alloc_inode    = exofs_alloc_inode,
+	.destroy_inode  = exofs_destroy_inode,
+	.write_inode    = exofs_write_inode,
+	.delete_inode   = exofs_delete_inode,
+	.put_super      = exofs_put_super,
+	.write_super    = exofs_write_super,
+	.statfs         = exofs_statfs,
+};
+
+/******************************************************************************
+ * INSMOD/RMMOD
+ *****************************************************************************/
+
+/*
+ * struct that describes this file system
+ */
+static struct file_system_type exofs_type = {
+	.owner          = THIS_MODULE,
+	.name           = "exofs",
+	.get_sb         = exofs_get_sb,
+	.kill_sb        = generic_shutdown_super,
+};
+
+static int __init init_exofs(void)
+{
+	int err;
+
+	err = init_inodecache();
+	if (err)
+		goto out;
+
+	err = register_filesystem(&exofs_type);
+	if (err)
+		goto out_d;
+
+	return 0;
+out_d:
+	destroy_inodecache();
+out:
+	return err;
+}
+
+static void __exit exit_exofs(void)
+{
+	unregister_filesystem(&exofs_type);
+	destroy_inodecache();
+}
+
+MODULE_AUTHOR("Avishay Traeger <avishay@gmail.com>");
+MODULE_DESCRIPTION("exofs");
+MODULE_LICENSE("GPL");
+
+module_init(init_exofs)
+module_exit(exit_exofs)
-- 
cgit v1.2.3


From a2023d27ee3bad93458e0f49d82dfd3591f27962 Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <bharrosh@panasas.com>
Date: Tue, 28 Oct 2008 17:22:01 +0200
Subject: exofs: Documentation

Added some documentation in exofs.txt, as well as a BUGS file.

For further reading, operation instructions, example scripts
and up to date infomation and code please see:
http://open-osd.org

Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
---
 Documentation/filesystems/exofs.txt | 181 ++++++++++++++++++++++++++++++++++++
 fs/exofs/BUGS                       |   6 ++
 2 files changed, 187 insertions(+)
 create mode 100644 Documentation/filesystems/exofs.txt
 create mode 100644 fs/exofs/BUGS

(limited to 'fs')

diff --git a/Documentation/filesystems/exofs.txt b/Documentation/filesystems/exofs.txt
new file mode 100644
index 000000000000..576c79f5143a
--- /dev/null
+++ b/Documentation/filesystems/exofs.txt
@@ -0,0 +1,181 @@
+===============================================================================
+WHAT IS EXOFS?
+===============================================================================
+
+exofs is a file system that uses an OSD and exports the API of a normal Linux
+file system. Users access exofs like any other local file system, and exofs
+will in turn issue commands to the local OSD initiator.
+
+OSD is a new T10 command set that views storage devices not as a large/flat
+array of sectors but as a container of objects, each having a length, quota,
+time attributes and more. Each object is addressed by a 64bit ID, and is
+contained in a 64bit ID partition. Each object has associated attributes
+attached to it, which are integral part of the object and provide metadata about
+the object. The standard defines some common obligatory attributes, but user
+attributes can be added as needed.
+
+===============================================================================
+ENVIRONMENT
+===============================================================================
+
+To use this file system, you need to have an object store to run it on.  You
+may download a target from:
+http://open-osd.org
+
+See drivers/scsi/osd/README for how to setup a working osd environment.
+
+===============================================================================
+USAGE
+===============================================================================
+
+1. Download and compile exofs and open-osd initiator:
+  You need an external Kernel source tree or kernel headers from your
+  distribution. (anything based on 2.6.26 or later).
+
+  a. download open-osd including exofs source using:
+     [parent-directory]$ git clone git://git.open-osd.org/open-osd.git
+
+  b. Build the library module like this:
+     [parent-directory]$ make -C KSRC=$(KER_DIR) open-osd
+
+     This will build both the open-osd initiator as well as the exofs kernel
+     module. Use whatever parameters you compiled your Kernel with and
+     $(KER_DIR) above pointing to the Kernel you compile against. See the file
+     open-osd/top-level-Makefile for an example.
+
+2. Get the OSD initiator and target set up properly, and login to the target.
+  See drivers/scsi/osd/README for farther instructions. Also see ./do-osd-test
+  for example script that does all these steps.
+
+3. Insmod the exofs.ko module:
+   [exofs]$ insmod exofs.ko
+
+4. Make sure the directory where you want to mount exists. If not, create it.
+   (For example, /mnt/exofs)
+
+5. At first run you will need to invoke the mkexofs.c routine
+
+   As an example, this will create the file system on:
+   /dev/osd0 partition ID 65540, max capacity 1024 Mg bytes
+
+   mount -t exofs -o pid=65540,mkfs=1,format=1024 /dev/osd0 /mnt/exofs/
+
+   The format=1024 is optional if not specified no OSD_FORMAT will be preformed
+   and a clean file system will be created in the specified pid, in the
+   available space of the target.
+   If pid already exist it will be deleted and a new one will be created in it's
+   place. Be careful.
+
+6. Mount the file system. The above command left the filesystem mounted,
+   but on subsequent runs the mkfs=1 should not be invoked.
+
+   For example, to mount /dev/osd0, partition ID 65540 on /mnt/exofs:
+
+	mount -t exofs -o pid=65540 /dev/osd0 /mnt/exofs/
+
+7. For reference (under fs/exofs/):
+	do-exofs start - an example of how to perform the above steps.
+	do-exofs stop -  an example of how to unmount the file system.
+
+8. Extra compilation flags (uncomment in fs/exofs/Kbuild):
+	EXOFS_DEBUG - for debug messages and extra checks.
+
+===============================================================================
+exofs mount options
+===============================================================================
+Similar to any mount command:
+	mount -t exofs -o exofs_options /dev/osdX mount_exofs_directory
+
+Where:
+    -t exofs: specifies the exofs file system
+
+    /dev/osdX: X is a decimal number. /dev/osdX was created after a successful
+               login into an OSD target.
+
+    mount_exofs_directory: The directory to mount the file system on
+
+    exofs_options: Options are separated by commas (,)
+		pid=<integer> - The partition number to mount/create as
+                                container of the filesystem.
+                                This option is mandatory
+		mkfs=<1/0>    - If mkfs=1 make a new filesystem before mount.
+                                Default is 0 - don't make. If mkfs=0 pid must
+                                exist and an mkfs=1 was previously preformed
+                                on it.
+               format=<integer>- If mkfs=1 is specified then the format=
+                                 parameter will also invoke an OSD_FORMAT
+                                 command prior to creation of the filesystem
+                                 partition (mkfs). The integer specified is in
+                                 Mega bytes. If not specified or set to 0 then
+                                 no format is executed, and a partition is
+                                 created in the available space.
+                                 If mkfs=0 this option is ignored.
+                to=<integer>  - Timeout in ticks for a single command
+                                default is (60 * HZ) [for debugging only]
+
+===============================================================================
+DESIGN
+===============================================================================
+
+* The file system control block (AKA on-disk superblock) resides in an object
+  with a special ID (defined in common.h).
+  Information included in the file system control block is used to fill the
+  in-memory superblock structure at mount time. This object is created before
+  the file system is used by mkexofs.c It contains information such as:
+	- The file system's magic number
+	- The next inode number to be allocated
+
+* Each file resides in its own object and contains the data (and it will be
+  possible to extend the file over multiple objects, though this has not been
+  implemented yet).
+
+* A directory is treated as a file, and essentially contains a list of <file
+  name, inode #> pairs for files that are found in that directory. The object
+  IDs correspond to the files' inode numbers and will be allocated according to
+  a bitmap (stored in a separate object). Now they are allocated using a
+  counter.
+
+* Each file's control block (AKA on-disk inode) is stored in its object's
+  attributes. This applies to both regular files and other types (directories,
+  device files, symlinks, etc.).
+
+* Credentials are generated per object (inode and superblock) when they is
+  created in memory (read off disk or created). The credential works for all
+  operations and is used as long as the object remains in memory.
+
+* Async OSD operations are used whenever possible, but the target may execute
+  them out of order. The operations that concern us are create, delete,
+  readpage, writepage, update_inode, and truncate. The following pairs of
+  operations should execute in the order written, and we need to prevent them
+  from executing in reverse order:
+	- The following are handled with the OBJ_CREATED and OBJ_2BCREATED
+	  flags. OBJ_CREATED is set when we know the object exists on the OSD -
+	  in create's callback function, and when we successfully do a read_inode.
+	  OBJ_2BCREATED is set in the beginning of the create function, so we
+	  know that we should wait.
+		- create/delete: delete should wait until the object is created
+		  on the OSD.
+		- create/readpage: readpage should be able to return a page
+		  full of zeroes in this case. If there was a write already
+		  en-route (i.e. create, writepage, readpage) then the page
+		  would be locked, and so it would really be the same as
+		  create/writepage.
+		- create/writepage: if writepage is called for a sync write, it
+		  should wait until the object is created on the OSD.
+		  Otherwise, it should just return.
+		- create/truncate: truncate should wait until the object is
+		  created on the OSD.
+		- create/update_inode: update_inode should wait until the
+		  object is created on the OSD.
+	- Handled by VFS locks:
+		- readpage/delete: shouldn't happen because of page lock.
+		- writepage/delete: shouldn't happen because of page lock.
+		- readpage/writepage: shouldn't happen because of page lock.
+
+===============================================================================
+LICENSE/COPYRIGHT
+===============================================================================
+The exofs file system is based on ext2 v0.5b (distributed with the Linux kernel
+version 2.6.10).  All files include the original copyrights, and the license
+is GPL version 2 (only version 2, as is true for the Linux kernel).  The
+Linux kernel can be downloaded from www.kernel.org.
diff --git a/fs/exofs/BUGS b/fs/exofs/BUGS
new file mode 100644
index 000000000000..6d6e1f945111
--- /dev/null
+++ b/fs/exofs/BUGS
@@ -0,0 +1,6 @@
+- Some mount time options should have been 64-bit, but are declared as 32-bit
+  because that's what the kernel's parsing methods support at this time.
+
+- Out-of-space may cause a severe problem if the object (and directory entry)
+  were written, but the inode attributes failed. Then if the filesystem was
+  unmounted and mounted the kernel can get into an endless loop doing a readdir.
-- 
cgit v1.2.3


From 9db4bc13c7fa9073a63a9b3c67f7148f07ef05ea Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <bharrosh@panasas.com>
Date: Tue, 28 Oct 2008 17:05:43 +0200
Subject: exofs: mkexofs

- Added two mount options mkfs=0/1,format=capacity_in_meg, so mkfs/format
  can be executed by kernel code just before mount. An mkexofs utility
  can now be implemented by means of a script that mounts and unmount the
  file system with proper options.

FIXME: We need a mechanism to prepare the file system (mkfs).
I chose to implement that by means of a couple of mount-options,
because there is not yet user-mode API for committing OSD commands.
Once submitted I will move this to a user-mode application.

Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
---
 fs/exofs/Kbuild    |   2 +-
 fs/exofs/exofs.h   |   3 +
 fs/exofs/mkexofs.c | 606 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/exofs/super.c   |  18 ++
 4 files changed, 628 insertions(+), 1 deletion(-)
 create mode 100644 fs/exofs/mkexofs.c

(limited to 'fs')

diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild
index 22685022d495..d7aca83aa667 100644
--- a/fs/exofs/Kbuild
+++ b/fs/exofs/Kbuild
@@ -26,5 +26,5 @@ KBUILD_CPPFLAGS := -I$(OSD_INC) $(KBUILD_CPPFLAGS)
 
 endif
 
-exofs-objs := osd.o inode.o file.o symlink.o namei.o dir.o super.o
+exofs-objs := osd.o inode.o file.o symlink.o namei.o dir.o super.o mkexofs.o
 obj-$(CONFIG_EXOFS_FS) += exofs.o
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index c1d0980ec359..7d1a965c200d 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -189,6 +189,9 @@ struct osd_request *prepare_osd_write(struct osd_dev *od,
 				      void *data);
 void free_osd_req(struct osd_request *or);
 
+/* mkexofs.c             */
+int exofs_mkfs(struct osd_dev *dev, uint64_t p_id, uint64_t format_size);
+
 /* inode.c               */
 void exofs_truncate(struct inode *inode);
 int exofs_setattr(struct dentry *, struct iattr *);
diff --git a/fs/exofs/mkexofs.c b/fs/exofs/mkexofs.c
new file mode 100644
index 000000000000..8a2b3085a403
--- /dev/null
+++ b/fs/exofs/mkexofs.c
@@ -0,0 +1,606 @@
+/*
+ * mkexofs.c - make an exofs file system.
+ *
+ * Copyright (C) 2005, 2006
+ * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
+ * Copyright (C) 2005, 2006
+ * International Business Machines
+ * Copyright (C) 2008, 2009
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * Copyrights from mke2fs.c:
+ *     Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
+ *     2003, 2004, 2005 by Theodore Ts'o.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#include "exofs.h"
+#include <linux/random.h>
+
+/* #define __MKEXOFS_DEBUG_CHECKS 1 */
+
+static int kick_it(struct osd_request *req, int timeout, uint8_t *cred_a,
+		   const char *op)
+{
+	return exofs_sync_op(req, timeout, cred_a);
+}
+
+/* Format the LUN to the specified size */
+static int format(uint64_t lun_capacity, struct osd_dev *dev, int timeout)
+{
+	struct osd_request *req = prepare_osd_format_lun(dev, lun_capacity);
+	uint8_t cred_a[OSD_CAP_LEN];
+	int ret;
+
+	exofs_make_credential(cred_a, 0, 0);
+
+	if (req == NULL) {
+		EXOFS_ERR("ERROR: Failed to allocate request.\n");
+		return -ENOMEM;
+	}
+
+	ret = kick_it(req, timeout, cred_a, "format");
+
+	free_osd_req(req);
+
+	return ret;
+}
+
+static int create_partition(struct osd_dev *dev, uint64_t p_id, int timeout)
+{
+	struct osd_request *req;
+	uint8_t cred_a[OSD_CAP_LEN];
+	bool try_remove = false;
+	int ret;
+
+	exofs_make_credential(cred_a, p_id, 0);
+
+create_part:
+	req = prepare_osd_create_partition(dev, p_id);
+	if (!req)
+		return -ENOMEM;
+	ret = kick_it(req, timeout, cred_a, "create partition");
+	free_osd_req(req);
+
+	if (ret && !try_remove) {
+		try_remove = true;
+		req = prepare_osd_remove_partition(dev, p_id);
+		if (!req)
+			return -ENOMEM;
+		ret = kick_it(req, timeout, cred_a, "remove partition");
+		free_osd_req(req);
+		if (!ret) /* Try again now */
+			goto create_part;
+	}
+
+	return ret;
+}
+
+#ifdef __MKEXOFS_DEBUG_CHECKS
+static int list(struct osd_dev *dev, uint64_t p_id, int timeout)
+{
+	struct osd_request *req;
+	uint8_t cred_a[OSD_CAP_LEN];
+	unsigned char *buf = NULL;
+	int ret;
+	uint64_t total_matches;
+	uint64_t total_ret;
+	uint64_t *id_list;
+	int is_part, is_utd;
+	uint64_t cont;
+	uint32_t more;
+	int i;
+
+	buf = kzalloc(1024, GFP_KERNEL);
+	if (!buf) {
+		EXOFS_ERR("ERROR: Failed to allocate memory.\n");
+		return -ENOMEM;
+	}
+
+	exofs_make_credential(cred_a, p_id, 0);
+
+	req = prepare_osd_list(dev, p_id, 0, 1024, 0, 0, buf);
+	if (req == NULL) {
+		EXOFS_ERR("ERROR: Failed to allocate request.\n");
+		return -ENOMEM;
+	}
+
+	ret = kick_it(req, timeout, cred_a, "list");
+	if (ret != 0)
+		goto out;
+
+	ret = extract_list_from_req(req, &total_matches, &total_ret, &id_list,
+				    &is_part, &is_utd, &cont, &more);
+
+	EXOFS_DBGMSG("created %llu objects:\n", _LLU(total_ret));
+	for (i = 0 ; i < total_ret ; i++)
+		EXOFS_DBGMSG("%llu\n", _LLU(id_list[i]));
+
+out:
+	free_osd_req(req);
+	kfree(buf);
+
+	return ret;
+}
+#endif
+
+static int create(struct osd_dev *dev, uint64_t p_id, uint64_t o_id,
+		  int timeout)
+{
+	struct osd_request *req;
+	uint8_t cred_a[OSD_CAP_LEN];
+	int ret;
+
+	exofs_make_credential(cred_a, p_id, o_id);
+	req = prepare_osd_create(dev, p_id, o_id);
+
+	if (req == NULL) {
+		EXOFS_ERR("ERROR: Failed to allocate request.\n");
+		return -ENOMEM;
+	}
+
+	ret = kick_it(req, timeout, cred_a, "create");
+
+	free_osd_req(req);
+
+	return ret;
+}
+
+static int write_super(struct osd_dev *dev, uint64_t p_id, int timeout,
+		       int newfile)
+{
+	struct osd_request *req;
+	uint8_t cred_a[OSD_CAP_LEN];
+	struct exofs_fscb data;
+	int ret;
+
+	exofs_make_credential(cred_a, p_id, EXOFS_SUPER_ID);
+
+	data.s_nextid = cpu_to_le64(4);
+	data.s_magic = cpu_to_le16(EXOFS_SUPER_MAGIC);
+	data.s_newfs = 1;
+	if (newfile)
+		data.s_numfiles = cpu_to_le32(1);
+	else
+		data.s_numfiles = 0;
+
+	req = prepare_osd_write(dev, p_id, EXOFS_SUPER_ID,
+				sizeof(struct exofs_fscb), 0, &data);
+
+	if (req == NULL) {
+		EXOFS_ERR("ERROR: Failed to allocate request.\n");
+		return -ENOMEM;
+	}
+
+	ret = kick_it(req, timeout, cred_a, "write super");
+
+	free_osd_req(req);
+
+	return ret;
+}
+
+#ifdef __MKEXOFS_DEBUG_CHECKS
+static int read_super(struct osd_dev *dev, uint64_t p_id, int timeout)
+{
+	struct osd_request *req;
+	uint8_t cred_a[OSD_CAP_LEN];
+	struct exofs_fscb data;
+	int ret;
+
+	exofs_make_credential(cred_a, p_id, EXOFS_SUPER_ID);
+
+	req = prepare_osd_read(dev, p_id, EXOFS_SUPER_ID,
+				sizeof(struct exofs_fscb), 0, 0,
+				(unsigned char *)(&data));
+
+	if (req == NULL) {
+		EXOFS_ERR("ERROR: Failed to allocate request.\n");
+		return -ENOMEM;
+	}
+
+	ret = kick_it(req, timeout, cred_a, "read super");
+	if (ret)
+		goto out;
+
+	EXOFS_DBGMSG("nextid:\t%u\n", le64_to_cpu(data.s_nextid));
+	EXOFS_DBGMSG("magic:\t%u\n", le16_to_cpu(data.s_magic));
+	EXOFS_DBGMSG("numfiles:\t%u\n", le32_to_cpu(data.s_numfiles));
+out:
+	free_osd_req(req);
+
+	return ret;
+}
+#endif
+
+static int write_bitmap(struct osd_dev *dev, uint64_t p_id, int timeout)
+{
+	struct osd_request *req;
+	uint8_t cred_a[OSD_CAP_LEN];
+	uint64_t off = 0;
+	unsigned int id = 3;
+	int ret;
+
+	/* XXX: For now just use counter - later make bitmap */
+	exofs_make_credential(cred_a, p_id, EXOFS_BM_ID);
+
+	req = prepare_osd_write(dev, p_id, EXOFS_BM_ID, sizeof(unsigned int),
+				off, &id);
+	if (req == NULL) {
+		EXOFS_ERR("ERROR: Failed to allocate request.\n");
+		return -ENOMEM;
+	}
+
+	ret = kick_it(req, timeout, cred_a, "write bitmap");
+
+	free_osd_req(req);
+
+	return ret;
+}
+
+#ifdef __MKEXOFS_DEBUG_CHECKS
+static int write_testfile(struct osd_dev *dev, uint64_t p_id, int timeout)
+{
+	struct osd_request *req;
+	uint8_t cred_a[OSD_CAP_LEN];
+	uint64_t off = 0;
+	unsigned char buf[64];
+	int ret;
+
+	strcpy((char *)buf, "This file is a test, it is only a test.");
+	exofs_make_credential(cred_a, p_id, EXOFS_TEST_ID);
+
+	req = prepare_osd_write(dev, p_id, EXOFS_TEST_ID, 64, off, 0, buf);
+
+	if (req == NULL) {
+		EXOFS_ERR("ERROR: Failed to allocate request.\n");
+		return -ENOMEM;
+	}
+
+	ret = kick_it(req, timeout, cred_a, "write bitmap");
+
+	free_osd_req(req);
+
+	return ret;
+}
+
+static int read_testfile(struct osd_dev *dev, uint64_t p_id, int timeout)
+{
+	struct osd_request *req;
+	uint8_t cred_a[OSD_CAP_LEN];
+	unsigned char data[64];
+	int ret;
+
+	exofs_make_credential(cred_a, p_id, EXOFS_TEST_ID);
+
+	req = prepare_osd_read(dev, p_id, EXOFS_TEST_ID, 64, 0, 0, data);
+
+	if (req == NULL) {
+		EXOFS_ERR("ERROR: Failed to allocate request.\n");
+		return -ENOMEM;
+	}
+
+	ret = kick_it(req, timeout, cred_a, "read test file");
+	if (ret)
+		goto out;
+
+	EXOFS_DBGMSG("test file: %s\n", data);
+
+out:
+	free_osd_req(req);
+
+	return ret;
+}
+#endif
+
+static int write_rootdir(struct osd_dev *dev, uint64_t p_id, int timeout,
+			 int newfile)
+{
+	struct osd_request *req;
+	uint8_t cred_a[OSD_CAP_LEN];
+	struct exofs_dir_entry *dir;
+	uint64_t off = 0;
+	unsigned char *buf = NULL;
+	int filetype = EXOFS_FT_DIR << 8;
+	int filetype2 = EXOFS_FT_REG_FILE << 8;
+	int rec_len;
+	int done;
+	int ret;
+
+	buf = kzalloc(EXOFS_BLKSIZE, GFP_KERNEL);
+	if (!buf) {
+		EXOFS_ERR("ERROR: Failed to allocate memory.\n");
+		return -ENOMEM;
+	}
+	dir = (struct exofs_dir_entry *)buf;
+
+	/* create entry for '.' */
+	dir->name[0] = '.';
+	dir->name_len = 1 | filetype;
+	dir->inode_no = cpu_to_le64(EXOFS_ROOT_ID - EXOFS_OBJ_OFF);
+	dir->rec_len = cpu_to_le16(EXOFS_DIR_REC_LEN(1));
+	rec_len = EXOFS_BLKSIZE - EXOFS_DIR_REC_LEN(1);
+
+	/* create entry for '..' */
+	dir = (struct exofs_dir_entry *) (buf + le16_to_cpu(dir->rec_len));
+	dir->name[0] = '.';
+	dir->name[1] = '.';
+	dir->name_len = 2 | filetype;
+	dir->inode_no = cpu_to_le64(EXOFS_ROOT_ID - EXOFS_OBJ_OFF);
+	if (newfile) {
+		rec_len -= EXOFS_DIR_REC_LEN(2);
+		dir->rec_len = cpu_to_le16(EXOFS_DIR_REC_LEN(2));
+	} else
+		dir->rec_len = cpu_to_le16(rec_len);
+	done = EXOFS_DIR_REC_LEN(1) + le16_to_cpu(dir->rec_len);
+
+	/* create entry for 'test', if specified */
+	if (newfile) {
+		dir = (struct exofs_dir_entry *) (buf + done);
+		dir->inode_no = cpu_to_le64(EXOFS_TEST_ID - EXOFS_OBJ_OFF);
+		dir->name_len = 4 | filetype2;
+		dir->name[0] = 't';
+		dir->name[1] = 'e';
+		dir->name[2] = 's';
+		dir->name[3] = 't';
+		dir->rec_len = cpu_to_le16(rec_len);
+	}
+
+	exofs_make_credential(cred_a, p_id, EXOFS_ROOT_ID);
+
+	req = prepare_osd_write(dev, p_id, EXOFS_ROOT_ID, EXOFS_BLKSIZE, off,
+				buf);
+
+	if (req == NULL) {
+		EXOFS_ERR("ERROR: Failed to allocate request.\n");
+		return -ENOMEM;
+	}
+
+	ret = kick_it(req, timeout, cred_a, "write rootdir");
+
+	free_osd_req(req);
+
+	return ret;
+}
+
+static int set_inode(struct osd_dev *dev, uint64_t p_id, int timeout,
+		     uint64_t o_id, uint16_t mode)
+{
+	struct osd_request *req;
+	uint8_t cred_a[OSD_CAP_LEN];
+	struct exofs_fcb in = {0};
+	struct exofs_fcb *inode = &in;
+	uint32_t i_generation;
+	int ret;
+
+	inode->i_mode = cpu_to_le16(mode);
+	inode->i_uid = inode->i_gid = 0;
+	inode->i_links_count = cpu_to_le16(2);
+	inode->i_ctime = inode->i_atime = inode->i_mtime =
+				       (signed)cpu_to_le32(CURRENT_TIME.tv_sec);
+	inode->i_size = cpu_to_le64(EXOFS_BLKSIZE);
+	if (o_id != EXOFS_ROOT_ID)
+		inode->i_size = cpu_to_le64(64);
+
+	get_random_bytes(&i_generation, sizeof(i_generation));
+	inode->i_generation = cpu_to_le32(i_generation);
+
+	exofs_make_credential(cred_a, p_id, o_id);
+
+	req = prepare_osd_set_attr(dev, p_id, o_id);
+
+	if (req == NULL) {
+		EXOFS_ERR("ERROR: Failed to allocate request.\n");
+		return -ENOMEM;
+	}
+
+	prepare_set_attr_list_add_entry(req,
+			OSD_PAGE_NUM_IBM_UOBJ_FS_DATA,
+			OSD_ATTR_NUM_IBM_UOBJ_FS_DATA_INODE,
+			EXOFS_INO_ATTR_SIZE, (u8 *)inode);
+
+	ret = kick_it(req, timeout, cred_a, "set inode");
+
+	free_osd_req(req);
+
+	return ret;
+}
+
+#ifdef __MKEXOFS_DEBUG_CHECKS
+static int get_root_attr(struct osd_dev *dev, uint64_t p_id, int timeout)
+{
+	struct osd_request *req;
+	uint8_t cred_a[OSD_CAP_LEN];
+	struct exofs_fcb in = {0};
+	struct exofs_fcb *inode = &in;
+	uint32_t attr_page = OSD_PAGE_NUM_IBM_UOBJ_FS_DATA;
+	uint32_t attr_id = OSD_ATTR_NUM_IBM_UOBJ_FS_DATA_INODE;
+	uint16_t expected = EXOFS_INO_ATTR_SIZE;
+	uint8_t *buf;
+	int ret;
+
+	exofs_make_credential(cred_a, p_id, EXOFS_ROOT_ID);
+
+	req = prepare_osd_get_attr(dev, p_id, EXOFS_ROOT_ID);
+	if (req == NULL) {
+		EXOFS_ERR("ERROR: Failed to allocate request.\n");
+		return -ENOMEM;
+	}
+
+	prepare_get_attr_list_add_entry(req,
+			OSD_PAGE_NUM_IBM_UOBJ_FS_DATA,
+			OSD_ATTR_NUM_IBM_UOBJ_FS_DATA_INODE,
+			EXOFS_INO_ATTR_SIZE);
+
+	ret = kick_it(req, timeout, cred_a, "get root inode");
+	if (ret)
+		goto out;
+
+	ret = extract_next_attr_from_req(req, &attr_page, &attr_id, &expected,
+					 &buf);
+	if (ret) {
+		EXOFS_ERR("ERROR: extract attr from req failed\n");
+		goto out;
+	}
+
+	memcpy(inode, buf, sizeof(struct exofs_fcb));
+
+	EXOFS_DBGMSG("mode: %u\n", le16_to_cpu(inode->i_mode));
+	EXOFS_DBGMSG("uid: %u\n", le32_to_cpu(inode->i_uid));
+	EXOFS_DBGMSG("gid: %u\n", le32_to_cpu(inode->i_gid));
+	EXOFS_DBGMSG("links: %u\n", le16_to_cpu(inode->i_links_count));
+	EXOFS_DBGMSG("ctime: %u\n", le32_to_cpu(inode->i_ctime));
+	EXOFS_DBGMSG("atime: %u\n", le32_to_cpu(inode->i_atime));
+	EXOFS_DBGMSG("mtime: %u\n", le32_to_cpu(inode->i_mtime));
+	EXOFS_DBGMSG("gen: %u\n", le32_to_cpu(inode->i_generation));
+	EXOFS_DBGMSG("size: %llu\n", _LLU(le64_to_cpu(inode->i_size)));
+
+out:
+	free_osd_req(req);
+
+	return ret;
+}
+#endif
+
+/*
+ * This function creates an exofs file system on the specified OSD partition.
+ */
+int exofs_mkfs(struct osd_dev *dev, uint64_t p_id, uint64_t format_size_meg)
+{
+	int err;
+	const int to_format = (4 * 60 * HZ);
+	const int to_gen = (60 * HZ);
+	bool newfile = false;
+
+	/* Get a handle */
+	EXOFS_DBGMSG("setting up exofs on partition %llu:\n", _LLU(p_id));
+
+	/* Format LUN if requested */
+	if (format_size_meg > 0) {
+		EXOFS_DBGMSG("formatting %llu Mgb...\n", _LLU(format_size_meg));
+		err = format(format_size_meg * 1024 * 1024, dev, to_format);
+		if (err)
+			goto out;
+		EXOFS_DBGMSG(" OK\n");
+	}
+
+	/* Create partition */
+	EXOFS_DBGMSG("creating partition...\n");
+	err = create_partition(dev, p_id, to_gen);
+	if (err)
+		goto out;
+	EXOFS_DBGMSG(" OK\n");
+
+	/* Create object with known ID for superblock info */
+	EXOFS_DBGMSG("creating superblock...\n");
+	err = create(dev, p_id, EXOFS_SUPER_ID, to_gen);
+	if (err)
+		goto out;
+	EXOFS_DBGMSG(" OK\n");
+
+	/* Create root directory object */
+	EXOFS_DBGMSG("creating root directory...\n");
+	err = create(dev, p_id, EXOFS_ROOT_ID, to_gen);
+	if (err)
+		goto out;
+	EXOFS_DBGMSG(" OK\n");
+
+	/* Create bitmap object */
+	EXOFS_DBGMSG("creating free ID bitmap...\n");
+	err = create(dev, p_id, EXOFS_BM_ID, to_gen);
+	if (err)
+		goto out;
+	EXOFS_DBGMSG(" OK\n");
+
+#ifdef __MKEXOFS_DEBUG_CHECKS
+	/* Create a test file, if specified by options */
+	if (newfile) {
+		EXOFS_DBGMSG("creating test file...\n");
+		err = create(dev, p_id, EXOFS_TEST_ID, to_gen);
+		if (err)
+			goto out;
+		EXOFS_DBGMSG(" OK\n");
+	}
+#endif
+
+	/* Write superblock */
+	EXOFS_DBGMSG("writing superblock...\n");
+	err = write_super(dev, p_id, to_gen, newfile);
+	if (err)
+		goto out;
+	EXOFS_DBGMSG(" OK\n");
+
+	/* Write root directory */
+	EXOFS_DBGMSG("writing root directory...\n");
+	err = write_rootdir(dev, p_id, to_gen, newfile);
+	if (err)
+		goto out;
+	EXOFS_DBGMSG(" OK\n");
+
+	/* Set root partition inode attribute */
+	EXOFS_DBGMSG("writing root inode...\n");
+	err = set_inode(dev, p_id, to_gen, EXOFS_ROOT_ID,
+			0040000 | (0777 & ~022));
+	if (err)
+		goto out;
+	EXOFS_DBGMSG(" OK\n");
+
+#ifdef __MKEXOFS_DEBUG_CHECKS
+	/* Set test file inode attribute */
+	if (newfile) {
+		EXOFS_DBGMSG("writing test inode...\n");
+		err = set_inode(dev, p_id, to_gen, EXOFS_TEST_ID,
+				0100000 | (0777 & ~022));
+		if (err)
+			goto out;
+		EXOFS_DBGMSG(" OK\n");
+	}
+#endif
+	/* Write bitmap */
+	EXOFS_DBGMSG("writing free ID bitmap...\n");
+	err = write_bitmap(dev, p_id, to_gen);
+	if (err)
+		goto out;
+	EXOFS_DBGMSG(" OK\n");
+
+#ifdef __MKEXOFS_DEBUG_CHECKS
+	/* Write test file */
+	if (newfile) {
+		EXOFS_DBGMSG("writing test file...\n");
+		err = write_testfile(dev, p_id, to_gen);
+		if (err)
+			goto out;
+		EXOFS_DBGMSG(" OK\n");
+	}
+
+	/* some debug info */
+	{
+		EXOFS_DBGMSG("listing:\n");
+		list(dev, p_id, to_gen);
+		EXOFS_DBGMSG("contents of superblock:\n");
+		read_super(dev, p_id, to_gen);
+		EXOFS_DBGMSG("contents of root inode:\n");
+		get_root_attr(dev, p_id, to_gen);
+		if (newfile) {
+			EXOFS_DBGMSG("contents of test file:\n");
+			read_testfile(dev, p_id, to_gen);
+		}
+	}
+#endif
+	EXOFS_DBGMSG("\nsetup complete: enjoy your shiny new exofs!\n");
+
+out:
+	return err;
+}
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 53d8b52e9ae7..af2ea9d2ca58 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -57,6 +57,8 @@ enum { Opt_pid, Opt_to, Opt_mkfs, Opt_format, Opt_err };
 static match_table_t tokens = {
 	{Opt_pid, "pid=%u"},
 	{Opt_to, "to=%u"},
+	{Opt_mkfs, "mkfs=%u"},
+	{Opt_format, "format=%u"},
 	{Opt_err, NULL}
 };
 
@@ -103,6 +105,16 @@ static int parse_options(char *options, struct exofs_mountopt *opts)
 			}
 			opts->timeout = option * HZ;
 			break;
+		case Opt_mkfs:
+			if (match_int(&args[0], &option))
+				return -EINVAL;
+			opts->mkfs = option != 0;
+			break;
+		case Opt_format:
+			if (match_int(&args[0], &option))
+				return -EINVAL;
+			opts->format = option;
+			break;
 		}
 	}
 
@@ -280,6 +292,12 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_bdev = NULL;
 	sb->s_dev = 0;
 
+	/* see if we need to make the file system on the obsd */
+	if (opts->mkfs) {
+		EXOFS_DBGMSG("exofs_mkfs %p\n", sbi->s_dev);
+		exofs_mkfs(sbi->s_dev, sbi->s_pid, opts->format);
+	}
+
 	/* read data from on-disk superblock object */
 	exofs_make_credential(sbi->s_cred, sbi->s_pid, EXOFS_SUPER_ID);
 
-- 
cgit v1.2.3


From d12079b4b806e3c1d7b657c30abf876bbb818557 Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <bharrosh@panasas.com>
Date: Mon, 15 Dec 2008 12:53:08 +0200
Subject: fs: Add exofs to Kernel build

- Add exofs to fs/Kconfig under "menu 'Miscellaneous filesystems'"
- Add exofs to fs/Makefile

Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
---
 fs/Kconfig  | 2 ++
 fs/Makefile | 1 +
 2 files changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index 51307b0fdf0f..37bbec255c79 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -1175,6 +1175,8 @@ config UFS_DEBUG
 	  Y here.  This will result in _many_ additional debugging messages to be
 	  written to the system log.
 
+source "fs/exofs/Kconfig"
+
 endif # MISC_FILESYSTEMS
 
 menuconfig NETWORK_FILESYSTEMS
diff --git a/fs/Makefile b/fs/Makefile
index 38bc735c67ad..c907161258cb 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -122,3 +122,4 @@ obj-$(CONFIG_DEBUG_FS)		+= debugfs/
 obj-$(CONFIG_OCFS2_FS)		+= ocfs2/
 obj-$(CONFIG_BTRFS_FS)		+= btrfs/
 obj-$(CONFIG_GFS2_FS)           += gfs2/
+obj-$(CONFIG_EXOFS_FS)          += exofs/
-- 
cgit v1.2.3