Merge remote-tracking branch 'driver-core/driver-core-next'

author: Stephen Rothwell <sfr@canb.auug.org.au> 2013-12-17 14:45:09 +1100
committer: Stephen Rothwell <sfr@canb.auug.org.au> 2013-12-17 14:45:09 +1100
commit: 4fa9f39b102e94073a4a3fbb52014e75772060d4 (patch)
tree: dd243d8a30e5791da58446d75ff78b1e72916059
parent: ca7a7723217f08a9f8b40d958a915bf8887013b8 (diff)
parent: c637b8acbe079edb477d887041755b489036f146 (diff)
35 files changed, 3631 insertions, 2837 deletions
diff --git a/Documentation/driver-model/design-patterns.txt b/Documentation/driver-model/design-patterns.txt
new file mode 100644
index 000000000000..9ef8c1684558
--- /dev/null
+++ b/Documentation/driver-model/design-patterns.txt
@@ -0,0 +1,116 @@
+
+Device Driver Design Patterns
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This document describes a few common design patterns found in device drivers.
+It is likely that subsystem maintainers will ask driver developers to
+conform to these design patterns.
+
+1. State Container
+2. container_of()
+
+
+1. State Container
+~~~~~~~~~~~~~~~~~~
+
+While the kernel contains a few device drivers that assume that they will
+only be probed() once on a certain system (singletons), it is customary to assume
+that the device the driver binds to will appear in several instances. This
+means that the probe() function and all callbacks need to be reentrant.
+
+The most common way to achieve this is to use the state container design
+pattern. It usually has this form:
+
+struct foo {
+    spinlock_t lock; /* Example member */
+    (...)
+};
+
+static int foo_probe(...)
+{
+    struct foo *foo;
+
+    foo = devm_kzalloc(dev, sizeof(*foo), GFP_KERNEL);
+    if (!foo)
+        return -ENOMEM;
+    spin_lock_init(&foo->lock);
+    (...)
+}
+
+This will create an instance of struct foo in memory every time probe() is
+called. This is our state container for this instance of the device driver.
+Of course it is then necessary to always pass this instance of the
+state around to all functions that need access to the state and its members.
+
+For example, if the driver is registering an interrupt handler, you would
+pass around a pointer to struct foo like this:
+
+static irqreturn_t foo_handler(int irq, void *arg)
+{
+    struct foo *foo = arg;
+    (...)
+}
+
+static int foo_probe(...)
+{
+    struct foo *foo;
+
+    (...)
+    ret = request_irq(irq, foo_handler, 0, "foo", foo);
+}
+
+This way you always get a pointer back to the correct instance of foo in
+your interrupt handler.
+
+
+2. container_of()
+~~~~~~~~~~~~~~~~~
+
+Continuing on the above example we add an offloaded work item:
+
+struct foo {
+    spinlock_t lock;
+    struct workqueue_struct *wq;
+    struct work_struct offload;
+    (...)
+};
+
+static void foo_work(struct work_struct *work)
+{
+    struct foo *foo = container_of(work, struct foo, offload);
+
+    (...)
+}
+
+static irqreturn_t foo_handler(int irq, void *arg)
+{
+    struct foo *foo = arg;
+
+    queue_work(foo->wq, &foo->offload);
+    (...)
+}
+
+static int foo_probe(...)
+{
+    struct foo *foo;
+
+    foo->wq = create_singlethread_workqueue("foo-wq");
+    INIT_WORK(&foo->offload, foo_work);
+    (...)
+}
+
+The design pattern is the same for an hrtimer or something similar that will
+return a single argument which is a pointer to a struct member in the
+callback.
+
+container_of() is a macro defined in <linux/kernel.h>.
+
+What container_of() does is to obtain a pointer to the containing struct from
+a pointer to a member by a simple subtraction using the offsetof() macro from
+standard C, which allows something similar to object oriented behaviours.
+Notice that the contained member must not be a pointer, but an actual member
+for this to work.
+
+We can see here that we avoid having global pointers to our struct foo *
+instance this way, while still keeping the number of parameters passed to the
+work function to a single pointer.
diff --git a/Documentation/kobject.txt b/Documentation/kobject.txt
index c5182bb2c16c..f87241dfed87 100644
--- a/Documentation/kobject.txt
+++ b/Documentation/kobject.txt
@@ -342,7 +342,10 @@ kset use:
 
 When you are finished with the kset, call:
   void kset_unregister(struct kset *kset);
-to destroy it.
+to destroy it.  This removes the kset from sysfs and decrements its reference
+count.  When the reference count goes to zero, the kset will be released.
+Because other references to the kset may still exist, the release may happen
+after kset_unregister() returns.
 
 An example of using a kset can be seen in the
 samples/kobject/kset-example.c file in the kernel tree.
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index c3d4cc972eca..22b3a1191ab3 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -430,7 +430,7 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device,
 	if (c->x86 >= 0x15)
 		snprintf(fw_name, sizeof(fw_name), "amd-ucode/microcode_amd_fam%.2xh.bin", c->x86);
 
-	if (request_firmware(&fw, (const char *)fw_name, device)) {
+	if (request_firmware_direct(&fw, (const char *)fw_name, device)) {
 		pr_debug("failed to load file %s\n", fw_name);
 		goto out;
 	}
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
index 5fb2cebf556b..a276fa75d9b5 100644
--- a/arch/x86/kernel/microcode_intel.c
+++ b/arch/x86/kernel/microcode_intel.c
@@ -278,7 +278,7 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device,
 	sprintf(name, "intel-ucode/%02x-%02x-%02x",
 		c->x86, c->x86_model, c->x86_mask);
 
-	if (request_firmware(&firmware, name, device)) {
+	if (request_firmware_direct(&firmware, name, device)) {
 		pr_debug("data file %s load failed\n", name);
 		return UCODE_NFOUND;
 	}
diff --git a/drivers/base/core.c b/drivers/base/core.c
index 67b180d855b2..aab43fbb8336 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -1603,6 +1603,7 @@ device_create_groups_vargs(struct class *class, struct device *parent,
 		goto error;
 	}
 
+	device_initialize(dev);
 	dev->devt = devt;
 	dev->class = class;
 	dev->parent = parent;
@@ -1614,7 +1615,7 @@ device_create_groups_vargs(struct class *class, struct device *parent,
 	if (retval)
 		goto error;
 
-	retval = device_register(dev);
+	retval = device_add(dev);
 	if (retval)
 		goto error;
 
diff --git a/drivers/base/firmware_class.c b/drivers/base/firmware_class.c
index eb8fb94ae2c5..33b87bf664ab 100644
--- a/drivers/base/firmware_class.c
+++ b/drivers/base/firmware_class.c
@@ -96,6 +96,15 @@ static inline long firmware_loading_timeout(void)
 	return loading_timeout > 0 ? loading_timeout * HZ : MAX_SCHEDULE_TIMEOUT;
 }
 
+/* firmware behavior options */
+#define FW_OPT_UEVENT	(1U << 0)
+#define FW_OPT_NOWAIT	(1U << 1)
+#ifdef CONFIG_FW_LOADER_USER_HELPER
+#define FW_OPT_FALLBACK	(1U << 2)
+#else
+#define FW_OPT_FALLBACK	0
+#endif
+
 struct firmware_cache {
 	/* firmware_buf instance will be added into the below list */
 	spinlock_t lock;
@@ -820,7 +829,7 @@ static void firmware_class_timeout_work(struct work_struct *work)
 
 static struct firmware_priv *
 fw_create_instance(struct firmware *firmware, const char *fw_name,
-		   struct device *device, bool uevent, bool nowait)
+		   struct device *device, unsigned int opt_flags)
 {
 	struct firmware_priv *fw_priv;
 	struct device *f_dev;
@@ -832,7 +841,7 @@ fw_create_instance(struct firmware *firmware, const char *fw_name,
 		goto exit;
 	}
 
-	fw_priv->nowait = nowait;
+	fw_priv->nowait = !!(opt_flags & FW_OPT_NOWAIT);
 	fw_priv->fw = firmware;
 	INIT_DELAYED_WORK(&fw_priv->timeout_work,
 		firmware_class_timeout_work);
@@ -848,8 +857,8 @@ exit:
 }
 
 /* load a firmware via user helper */
-static int _request_firmware_load(struct firmware_priv *fw_priv, bool uevent,
-				  long timeout)
+static int _request_firmware_load(struct firmware_priv *fw_priv,
+				  unsigned int opt_flags, long timeout)
 {
 	int retval = 0;
 	struct device *f_dev = &fw_priv->dev;
@@ -885,7 +894,7 @@ static int _request_firmware_load(struct firmware_priv *fw_priv, bool uevent,
 		goto err_del_bin_attr;
 	}
 
-	if (uevent) {
+	if (opt_flags & FW_OPT_UEVENT) {
 		buf->need_uevent = true;
 		dev_set_uevent_suppress(f_dev, false);
 		dev_dbg(f_dev, "firmware: requesting %s\n", buf->fw_id);
@@ -911,16 +920,16 @@ err_put_dev:
 
 static int fw_load_from_user_helper(struct firmware *firmware,
 				    const char *name, struct device *device,
-				    bool uevent, bool nowait, long timeout)
+				    unsigned int opt_flags, long timeout)
 {
 	struct firmware_priv *fw_priv;
 
-	fw_priv = fw_create_instance(firmware, name, device, uevent, nowait);
+	fw_priv = fw_create_instance(firmware, name, device, opt_flags);
 	if (IS_ERR(fw_priv))
 		return PTR_ERR(fw_priv);
 
 	fw_priv->buf = firmware->priv;
-	return _request_firmware_load(fw_priv, uevent, timeout);
+	return _request_firmware_load(fw_priv, opt_flags, timeout);
 }
 
 #ifdef CONFIG_PM_SLEEP
@@ -942,7 +951,7 @@ static void kill_requests_without_uevent(void)
 #else /* CONFIG_FW_LOADER_USER_HELPER */
 static inline int
 fw_load_from_user_helper(struct firmware *firmware, const char *name,
-			 struct device *device, bool uevent, bool nowait,
+			 struct device *device, unsigned int opt_flags,
 			 long timeout)
 {
 	return -ENOENT;
@@ -1023,7 +1032,7 @@ _request_firmware_prepare(struct firmware **firmware_p, const char *name,
 }
 
 static int assign_firmware_buf(struct firmware *fw, struct device *device,
-				bool skip_cache)
+			       unsigned int opt_flags)
 {
 	struct firmware_buf *buf = fw->priv;
 
@@ -1040,7 +1049,8 @@ static int assign_firmware_buf(struct firmware *fw, struct device *device,
 	 * device may has been deleted already, but the problem
 	 * should be fixed in devres or driver core.
 	 */
-	if (device && !skip_cache)
+	/* don't cache firmware handled without uevent */
+	if (device && (opt_flags & FW_OPT_UEVENT))
 		fw_add_devm_name(device, buf->fw_id);
 
 	/*
@@ -1061,7 +1071,7 @@ static int assign_firmware_buf(struct firmware *fw, struct device *device,
 /* called from request_firmware() and request_firmware_work_func() */
 static int
 _request_firmware(const struct firmware **firmware_p, const char *name,
-		  struct device *device, bool uevent, bool nowait)
+		  struct device *device, unsigned int opt_flags)
 {
 	struct firmware *fw;
 	long timeout;
@@ -1076,7 +1086,7 @@ _request_firmware(const struct firmware **firmware_p, const char *name,
 
 	ret = 0;
 	timeout = firmware_loading_timeout();
-	if (nowait) {
+	if (opt_flags & FW_OPT_NOWAIT) {
 		timeout = usermodehelper_read_lock_wait(timeout);
 		if (!timeout) {
 			dev_dbg(device, "firmware: %s loading timed out\n",
@@ -1095,16 +1105,18 @@ _request_firmware(const struct firmware **firmware_p, const char *name,
 
 	ret = fw_get_filesystem_firmware(device, fw->priv);
 	if (ret) {
-		dev_warn(device, "Direct firmware load failed with error %d\n",
-			 ret);
-		dev_warn(device, "Falling back to user helper\n");
-		ret = fw_load_from_user_helper(fw, name, device,
-					       uevent, nowait, timeout);
+		if (opt_flags & FW_OPT_FALLBACK) {
+			dev_warn(device,
+				 "Direct firmware load failed with error %d\n",
+				 ret);
+			dev_warn(device, "Falling back to user helper\n");
+			ret = fw_load_from_user_helper(fw, name, device,
+						       opt_flags, timeout);
+		}
 	}
 
-	/* don't cache firmware handled without uevent */
 	if (!ret)
-		ret = assign_firmware_buf(fw, device, !uevent);
+		ret = assign_firmware_buf(fw, device, opt_flags);
 
 	usermodehelper_read_unlock();
 
@@ -1146,12 +1158,37 @@ request_firmware(const struct firmware **firmware_p, const char *name,
 
 	/* Need to pin this module until return */
 	__module_get(THIS_MODULE);
-	ret = _request_firmware(firmware_p, name, device, true, false);
+	ret = _request_firmware(firmware_p, name, device,
+				FW_OPT_UEVENT | FW_OPT_FALLBACK);
 	module_put(THIS_MODULE);
 	return ret;
 }
 EXPORT_SYMBOL(request_firmware);
 
+#ifdef CONFIG_FW_LOADER_USER_HELPER
+/**
+ * request_firmware_direct: - load firmware directly without usermode helper
+ * @firmware_p: pointer to firmware image
+ * @name: name of firmware file
+ * @device: device for which firmware is being loaded
+ *
+ * This function works pretty much like request_firmware(), but this doesn't
+ * fall back to usermode helper even if the firmware couldn't be loaded
+ * directly from fs.  Hence it's useful for loading optional firmwares, which
+ * aren't always present, without extra long timeouts of udev.
+ **/
+int request_firmware_direct(const struct firmware **firmware_p,
+			    const char *name, struct device *device)
+{
+	int ret;
+	__module_get(THIS_MODULE);
+	ret = _request_firmware(firmware_p, name, device, FW_OPT_UEVENT);
+	module_put(THIS_MODULE);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(request_firmware_direct);
+#endif
+
 /**
  * release_firmware: - release the resource associated with a firmware image
  * @fw: firmware resource to release
@@ -1174,7 +1211,7 @@ struct firmware_work {
 	struct device *device;
 	void *context;
 	void (*cont)(const struct firmware *fw, void *context);
-	bool uevent;
+	unsigned int opt_flags;
 };
 
 static void request_firmware_work_func(struct work_struct *work)
@@ -1185,7 +1222,7 @@ static void request_firmware_work_func(struct work_struct *work)
 	fw_work = container_of(work, struct firmware_work, work);
 
 	_request_firmware(&fw, fw_work->name, fw_work->device,
-			  fw_work->uevent, true);
+			  fw_work->opt_flags);
 	fw_work->cont(fw, fw_work->context);
 	put_device(fw_work->device); /* taken in request_firmware_nowait() */
 
@@ -1233,7 +1270,8 @@ request_firmware_nowait(
 	fw_work->device = device;
 	fw_work->context = context;
 	fw_work->cont = cont;
-	fw_work->uevent = uevent;
+	fw_work->opt_flags = FW_OPT_NOWAIT | FW_OPT_FALLBACK |
+		(uevent ? FW_OPT_UEVENT : 0);
 
 	if (!try_module_get(module)) {
 		kfree(fw_work);
diff --git a/drivers/firmware/dmi-sysfs.c b/drivers/firmware/dmi-sysfs.c
index eb26d62e5188..e0f1cb3d3598 100644
--- a/drivers/firmware/dmi-sysfs.c
+++ b/drivers/firmware/dmi-sysfs.c
@@ -553,7 +553,7 @@ static const struct bin_attribute dmi_entry_raw_attr = {
 static void dmi_sysfs_entry_release(struct kobject *kobj)
 {
 	struct dmi_sysfs_entry *entry = to_entry(kobj);
-	sysfs_remove_bin_file(&entry->kobj, &dmi_entry_raw_attr);
+
 	spin_lock(&entry_list_lock);
 	list_del(&entry->list);
 	spin_unlock(&entry_list_lock);
@@ -685,6 +685,7 @@ static void __exit dmi_sysfs_exit(void)
 	pr_debug("dmi-sysfs: unloading.\n");
 	cleanup_entry_list();
 	kset_unregister(dmi_kset);
+	kobject_del(dmi_kobj);
 	kobject_put(dmi_kobj);
 }
 
diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index 85f772c0b26a..c8a7c810bade 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -393,7 +393,7 @@ static const DEVICE_ATTR(value, 0644,
 
 static irqreturn_t gpio_sysfs_irq(int irq, void *priv)
 {
-	struct sysfs_dirent	*value_sd = priv;
+	struct kernfs_node	*value_sd = priv;
 
 	sysfs_notify_dirent(value_sd);
 	return IRQ_HANDLED;
@@ -402,7 +402,7 @@ static irqreturn_t gpio_sysfs_irq(int irq, void *priv)
 static int gpio_setup_irq(struct gpio_desc *desc, struct device *dev,
 		unsigned long gpio_flags)
 {
-	struct sysfs_dirent	*value_sd;
+	struct kernfs_node	*value_sd;
 	unsigned long		irq_flags;
 	int			ret, irq, id;
 
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 12dc29ba7399..4195a01b1535 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -1635,7 +1635,7 @@ int bitmap_create(struct mddev *mddev)
 	sector_t blocks = mddev->resync_max_sectors;
 	struct file *file = mddev->bitmap_info.file;
 	int err;
-	struct sysfs_dirent *bm = NULL;
+	struct kernfs_node *bm = NULL;
 
 	BUILD_BUG_ON(sizeof(bitmap_super_t) != 256);
 
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
index df4aeb6ac6f0..30210b9c4ef9 100644
--- a/drivers/md/bitmap.h
+++ b/drivers/md/bitmap.h
@@ -225,7 +225,7 @@ struct bitmap {
 	wait_queue_head_t overflow_wait;
 	wait_queue_head_t behind_wait;
 
-	struct sysfs_dirent *sysfs_can_clear;
+	struct kernfs_node *sysfs_can_clear;
 };
 
 /* the bitmap API */
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 2f5cc8a7ef3e..389a3c93cdb7 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -106,7 +106,7 @@ struct md_rdev {
 					   */
 	struct work_struct del_work;	/* used for delayed sysfs removal */
 
-	struct sysfs_dirent *sysfs_state; /* handle for 'state'
+	struct kernfs_node *sysfs_state; /* handle for 'state'
 					   * sysfs entry */
 
 	struct badblocks {
@@ -376,10 +376,10 @@ struct mddev {
 	sector_t			resync_max;	/* resync should pause
 							 * when it gets here */
 
-	struct sysfs_dirent		*sysfs_state;	/* handle for 'array_state'
+	struct kernfs_node		*sysfs_state;	/* handle for 'array_state'
 							 * file in sysfs.
 							 */
-	struct sysfs_dirent		*sysfs_action;  /* handle for 'sync_action' */
+	struct kernfs_node		*sysfs_action;  /* handle for 'sync_action' */
 
 	struct work_struct del_work;	/* used for delayed sysfs removal */
 
@@ -498,13 +498,13 @@ struct md_sysfs_entry {
 };
 extern struct attribute_group md_bitmap_group;
 
-static inline struct sysfs_dirent *sysfs_get_dirent_safe(struct sysfs_dirent *sd, char *name)
+static inline struct kernfs_node *sysfs_get_dirent_safe(struct kernfs_node *sd, char *name)
 {
 	if (sd)
 		return sysfs_get_dirent(sd, name);
 	return sd;
 }
-static inline void sysfs_notify_dirent_safe(struct sysfs_dirent *sd)
+static inline void sysfs_notify_dirent_safe(struct kernfs_node *sd)
 {
 	if (sd)
 		sysfs_notify_dirent(sd);
diff --git a/drivers/misc/mic/host/mic_device.h b/drivers/misc/mic/host/mic_device.h
index 3574cc375bb9..538e3d3d3c8c 100644
--- a/drivers/misc/mic/host/mic_device.h
+++ b/drivers/misc/mic/host/mic_device.h
@@ -112,7 +112,7 @@ struct mic_device {
 	struct work_struct shutdown_work;
 	u8 state;
 	u8 shutdown_status;
-	struct sysfs_dirent *state_sysfs;
+	struct kernfs_node *state_sysfs;
 	struct completion reset_wait;
 	void *log_buf_addr;
 	int *log_buf_len;
diff --git a/fs/Makefile b/fs/Makefile
index 4fe6df3ec28f..39a824f44e7c 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -53,7 +53,7 @@ obj-$(CONFIG_FHANDLE)		+= fhandle.o
 obj-y				+= quota/
 
 obj-$(CONFIG_PROC_FS)		+= proc/
-obj-$(CONFIG_SYSFS)		+= sysfs/
+obj-$(CONFIG_SYSFS)		+= sysfs/ kernfs/
 obj-$(CONFIG_CONFIGFS_FS)	+= configfs/
 obj-y				+= devpts/
 
diff --git a/fs/kernfs/Makefile b/fs/kernfs/Makefile
new file mode 100644
index 000000000000..674337c76673
--- /dev/null
+++ b/fs/kernfs/Makefile
@@ -0,0 +1,5 @@
+#
+# Makefile for the kernfs pseudo filesystem
+#
+
+obj-y		:= mount.o inode.o dir.o file.o symlink.o
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
new file mode 100644
index 000000000000..6520066c49ea
--- /dev/null
+++ b/fs/kernfs/dir.c
@@ -0,0 +1,1018 @@
+/*
+ * fs/kernfs/dir.c - kernfs directory implementation
+ *
+ * Copyright (c) 2001-3 Patrick Mochel
+ * Copyright (c) 2007 SUSE Linux Products GmbH
+ * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org>
+ *
+ * This file is released under the GPLv2.
+ */
+
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/idr.h>
+#include <linux/slab.h>
+#include <linux/security.h>
+#include <linux/hash.h>
+
+#include "kernfs-internal.h"
+
+DEFINE_MUTEX(kernfs_mutex);
+
+#define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb)
+
+/**
+ *	kernfs_name_hash - hash a name together with its namespace tag
+ *	@name: Null terminated string to hash
+ *	@ns:   Namespace tag to hash
+ *
+ *	Returns 31 bit hash of ns + name (fits in an off_t); never 0, 1 or INT_MAX
+ */
+static unsigned int kernfs_name_hash(const char *name, const void *ns)
+{
+	unsigned long hash = init_name_hash();
+	unsigned int len = strlen(name);
+	while (len--)
+		hash = partial_name_hash(*name++, hash);
+	hash = (end_name_hash(hash) ^ hash_ptr((void *)ns, 31));
+	hash &= 0x7fffffffU;
+	/* Reserve hash numbers 0, 1 and INT_MAX for magic directory entries */
+	if (hash < 2)
+		hash += 2;
+	if (hash >= INT_MAX)
+		hash = INT_MAX - 1;
+	return hash;
+}
+
+static int kernfs_name_compare(unsigned int hash, const char *name,
+			       const void *ns, const struct kernfs_node *kn)
+{
+	if (hash != kn->hash)	/* order by hash first */
+		return hash < kn->hash ? -1 : 1;
+	if (ns != kn->ns)	/* a pointer difference may not fit in an int */
+		return ns < kn->ns ? -1 : 1;
+	return strcmp(name, kn->name);	/* same hash and ns: order by name */
+}
+
+static int kernfs_sd_compare(const struct kernfs_node *left,
+			     const struct kernfs_node *right)
+{
+	return kernfs_name_compare(left->hash, left->name, left->ns, right);
+}
+
+/**
+ *	kernfs_link_sibling - link kernfs_node into sibling rbtree
+ *	@kn: kernfs_node of interest
+ *
+ *	Link @kn into its sibling rbtree which starts from
+ *	@kn->parent->dir.children; a linked directory is counted in dir.subdirs.
+ *
+ *	Locking:
+ *	mutex_lock(kernfs_mutex)
+ *
+ *	RETURNS:
+ *	0 on success, -EEXIST if an equal node is already linked.
+ */
+static int kernfs_link_sibling(struct kernfs_node *kn)
+{
+	struct rb_node **node = &kn->parent->dir.children.rb_node;
+	struct rb_node *parent = NULL;
+
+	while (*node) {
+		struct kernfs_node *pos;
+		int result;
+
+		pos = rb_to_kn(*node);
+		parent = *node;
+		result = kernfs_sd_compare(kn, pos);
+		if (result < 0)
+			node = &pos->rb.rb_left;
+		else if (result > 0)
+			node = &pos->rb.rb_right;
+		else
+			return -EEXIST;
+	}
+	/* add new node and rebalance the tree */
+	rb_link_node(&kn->rb, parent, node);
+	rb_insert_color(&kn->rb, &kn->parent->dir.children);
+	/* linked for real: only now account the new subdirectory */
+	if (kernfs_type(kn) == KERNFS_DIR)
+		kn->parent->dir.subdirs++;
+	return 0;
+}
+
+/**
+ *	kernfs_unlink_sibling - unlink kernfs_node from sibling rbtree
+ *	@kn: kernfs_node of interest
+ *
+ *	Unlink @kn from its sibling rbtree which starts from
+ *	kn->parent->dir.children.
+ *
+ *	Locking:
+ *	mutex_lock(kernfs_mutex)
+ */
+static void kernfs_unlink_sibling(struct kernfs_node *kn)
+{
+	if (kernfs_type(kn) == KERNFS_DIR)
+		kn->parent->dir.subdirs--;
+
+	rb_erase(&kn->rb, &kn->parent->dir.children);
+}
+
+/**
+ *	kernfs_get_active - get an active reference to kernfs_node
+ *	@kn: kernfs_node to get an active reference to
+ *
+ *	Get an active reference of @kn.  This function is noop if @kn
+ *	is NULL.
+ *
+ *	RETURNS:
+ *	Pointer to @kn on success, NULL on failure.
+ */
+struct kernfs_node *kernfs_get_active(struct kernfs_node *kn)
+{
+	if (unlikely(!kn))
+		return NULL;
+
+	if (!atomic_inc_unless_negative(&kn->active))
+		return NULL;
+
+	if (kn->flags & KERNFS_LOCKDEP)
+		rwsem_acquire_read(&kn->dep_map, 0, 1, _RET_IP_);
+	return kn;
+}
+
+/**
+ *	kernfs_put_active - put an active reference to kernfs_node
+ *	@kn: kernfs_node to put an active reference to
+ *
+ *	Put an active reference to @kn.  This function is noop if @kn
+ *	is NULL.
+ */
+void kernfs_put_active(struct kernfs_node *kn)
+{
+	int v;
+
+	if (unlikely(!kn))
+		return;
+
+	if (kn->flags & KERNFS_LOCKDEP)
+		rwsem_release(&kn->dep_map, 1, _RET_IP_);
+	v = atomic_dec_return(&kn->active);
+	if (likely(v != KN_DEACTIVATED_BIAS))
+		return;
+
+	/*
+	 * atomic_dec_return() is a mb(), we'll always see the updated
+	 * kn->u.completion.
+	 */
+	complete(kn->u.completion);
+}
+
+/**
+ *	kernfs_deactivate - deactivate kernfs_node
+ *	@kn: kernfs_node to deactivate
+ *
+ *	Deny new active references and drain existing ones.
+ */
+static void kernfs_deactivate(struct kernfs_node *kn)
+{
+	DECLARE_COMPLETION_ONSTACK(wait);
+	int v;
+
+	BUG_ON(!(kn->flags & KERNFS_REMOVED));
+
+	if (!(kernfs_type(kn) & KERNFS_ACTIVE_REF))
+		return;
+
+	kn->u.completion = (void *)&wait;
+
+	rwsem_acquire(&kn->dep_map, 0, 0, _RET_IP_);
+	/* atomic_add_return() is a mb(), put_active() will always see
+	 * the updated kn->u.completion.
+	 */
+	v = atomic_add_return(KN_DEACTIVATED_BIAS, &kn->active);
+
+	if (v != KN_DEACTIVATED_BIAS) {
+		lock_contended(&kn->dep_map, _RET_IP_);
+		wait_for_completion(&wait);
+	}
+
+	lock_acquired(&kn->dep_map, _RET_IP_);
+	rwsem_release(&kn->dep_map, 1, _RET_IP_);
+}
+
+/**
+ * kernfs_get - get a reference count on a kernfs_node
+ * @kn: the target kernfs_node
+ */
+void kernfs_get(struct kernfs_node *kn)
+{
+	if (kn) {
+		WARN_ON(!atomic_read(&kn->count));
+		atomic_inc(&kn->count);
+	}
+}
+EXPORT_SYMBOL_GPL(kernfs_get);
+
+/**
+ * kernfs_put - put a reference count on a kernfs_node
+ * @kn: the target kernfs_node
+ *
+ * Put a reference count of @kn and destroy it if it reached zero.
+ */
+void kernfs_put(struct kernfs_node *kn)
+{
+	struct kernfs_node *parent;
+	struct kernfs_root *root;
+
+	if (!kn || !atomic_dec_and_test(&kn->count))
+		return;
+	root = kernfs_root(kn);
+ repeat:
+	/* Moving/renaming is always done while holding reference.
+	 * kn->parent won't change beneath us.
+	 */
+	parent = kn->parent;
+
+	WARN(!(kn->flags & KERNFS_REMOVED), "kernfs: free using entry: %s/%s\n",
+	     parent ? parent->name : "", kn->name);
+
+	if (kernfs_type(kn) == KERNFS_LINK)
+		kernfs_put(kn->symlink.target_kn);
+	if (kernfs_type(kn) & KERNFS_COPY_NAME)
+		kfree(kn->name);
+	if (kn->iattr) {
+		if (kn->iattr->ia_secdata)
+			security_release_secctx(kn->iattr->ia_secdata,
+						kn->iattr->ia_secdata_len);
+		simple_xattrs_free(&kn->iattr->xattrs);
+	}
+	kfree(kn->iattr);
+	ida_simple_remove(&root->ino_ida, kn->ino);
+	kmem_cache_free(kernfs_node_cache, kn);
+
+	kn = parent;
+	if (kn) {
+		if (atomic_dec_and_test(&kn->count))
+			goto repeat;
+	} else {
+		/* just released the root kn, free @root too */
+		ida_destroy(&root->ino_ida);
+		kfree(root);
+	}
+}
+EXPORT_SYMBOL_GPL(kernfs_put);
+
+static int kernfs_dop_delete(const struct dentry *dentry)
+{
+	struct kernfs_node *kn = dentry->d_fsdata;
+	return !(kn && !(kn->flags & KERNFS_REMOVED));
+}
+
+static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags)
+{
+	struct kernfs_node *kn;
+
+	if (flags & LOOKUP_RCU)
+		return -ECHILD;
+
+	kn = dentry->d_fsdata;
+	mutex_lock(&kernfs_mutex);
+
+	/* The kernfs node has been deleted */
+	if (kn->flags & KERNFS_REMOVED)
+		goto out_bad;
+
+	/* The kernfs node has been moved? */
+	if (dentry->d_parent->d_fsdata != kn->parent)
+		goto out_bad;
+
+	/* The kernfs node has been renamed */
+	if (strcmp(dentry->d_name.name, kn->name) != 0)
+		goto out_bad;
+
+	/* The kernfs node has been moved to a different namespace */
+	if (kn->parent && kernfs_ns_enabled(kn->parent) &&
+	    kernfs_info(dentry->d_sb)->ns != kn->ns)
+		goto out_bad;
+
+	mutex_unlock(&kernfs_mutex);
+out_valid:
+	return 1;
+out_bad:
+	/*
+	 * Remove the dentry from the dcache hashes.
+	 * If this is a deleted dentry we use d_drop instead of d_delete
+	 * so kernfs doesn't need to cope with negative dentries.
+	 *
+	 * If this is a dentry that has simply been renamed we
+	 * use d_drop to remove it from the dcache lookup on its
+	 * old parent.  If this dentry persists later when a lookup
+	 * is performed at its new name the dentry will be readded
+	 * to the dcache hashes.
+	 */
+	mutex_unlock(&kernfs_mutex);
+
+	/* If we have submounts we must allow the vfs caches
+	 * to lie about the state of the filesystem to prevent
+	 * leaks and other nasty things.
+	 */
+	if (check_submounts_and_drop(dentry) != 0)
+		goto out_valid;
+
+	return 0;
+}
+
+static void kernfs_dop_release(struct dentry *dentry)
+{
+	kernfs_put(dentry->d_fsdata);
+}
+
+const struct dentry_operations kernfs_dops = {
+	.d_revalidate	= kernfs_dop_revalidate,
+	.d_delete	= kernfs_dop_delete,
+	.d_release	= kernfs_dop_release,
+};
+
+struct kernfs_node *kernfs_new_node(struct kernfs_root *root, const char *name,
+				    umode_t mode, int type)
+{
+	char *dup_name = NULL;
+	struct kernfs_node *kn;
+	int ret;
+
+	if (type & KERNFS_COPY_NAME) {
+		name = dup_name = kstrdup(name, GFP_KERNEL);
+		if (!name)
+			return NULL;
+	}
+
+	kn = kmem_cache_zalloc(kernfs_node_cache, GFP_KERNEL);
+	if (!kn)
+		goto err_out1;
+
+	ret = ida_simple_get(&root->ino_ida, 1, 0, GFP_KERNEL);
+	if (ret < 0)
+		goto err_out2;
+	kn->ino = ret;
+
+	atomic_set(&kn->count, 1);
+	atomic_set(&kn->active, 0);
+
+	kn->name = name;
+	kn->mode = mode;
+	kn->flags = type | KERNFS_REMOVED;
+
+	return kn;
+
+ err_out2:
+	kmem_cache_free(kernfs_node_cache, kn);
+ err_out1:
+	kfree(dup_name);
+	return NULL;
+}
+
+/**
+ *	kernfs_addrm_start - prepare for kernfs_node add/remove
+ *	@acxt: pointer to kernfs_addrm_cxt to be used
+ *
+ *	This function is called when the caller is about to add or remove
+ *	kernfs_node.  This function acquires kernfs_mutex.  @acxt is used
+ *	to keep and pass context to other addrm functions.
+ *
+ *	LOCKING:
+ *	Kernel thread context (may sleep).  kernfs_mutex is locked on
+ *	return.
+ */
+void kernfs_addrm_start(struct kernfs_addrm_cxt *acxt)
+	__acquires(kernfs_mutex)
+{
+	memset(acxt, 0, sizeof(*acxt));
+
+	mutex_lock(&kernfs_mutex);
+}
+
+/**
+ *	kernfs_add_one - add kernfs_node to parent without warning
+ *	@acxt: addrm context to use
+ *	@kn: kernfs_node to be added
+ *	@parent: the parent kernfs_node to add @kn to
+ *
+ *	Get @parent and set @kn->parent to it and increment nlink of the
+ *	parent inode if @kn is a directory and link into the children list
+ *	of the parent.
+ *
+ *	This function should be called between calls to
+ *	kernfs_addrm_start() and kernfs_addrm_finish() and should be passed
+ *	the same @acxt as passed to kernfs_addrm_start().
+ *
+ *	LOCKING:
+ *	Determined by kernfs_addrm_start().
+ *
+ *	RETURNS:
+ *	0 on success, -EEXIST if entry with the given name already
+ *	exists.
+ */
+int kernfs_add_one(struct kernfs_addrm_cxt *acxt, struct kernfs_node *kn,
+		  struct kernfs_node *parent)
+{
+	bool has_ns = kernfs_ns_enabled(parent);
+	struct kernfs_iattrs *ps_iattr;
+	int ret;
+
+	if (has_ns != (bool)kn->ns) {
+		WARN(1, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n",
+		     has_ns ? "required" : "invalid", parent->name, kn->name);
+		return -EINVAL;
+	}
+
+	if (kernfs_type(parent) != KERNFS_DIR)
+		return -EINVAL;
+
+	kn->hash = kernfs_name_hash(kn->name, kn->ns);
+	kn->parent = parent;
+	kernfs_get(parent);
+
+	ret = kernfs_link_sibling(kn);
+	if (ret)
+		return ret;
+
+	/* Update timestamps on the parent */
+	ps_iattr = parent->iattr;
+	if (ps_iattr) {
+		struct iattr *ps_iattrs = &ps_iattr->ia_iattr;
+		ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME;
+	}
+
+	/* Mark the entry added into directory tree */
+	kn->flags &= ~KERNFS_REMOVED;
+
+	return 0;
+}
+
+/**
+ *	kernfs_remove_one - remove kernfs_node from parent
+ *	@acxt: addrm context to use
+ *	@kn: kernfs_node to be removed
+ *
+ *	Mark @kn removed and drop nlink of parent inode if @kn is a
+ *	directory.  @kn is unlinked from the children list.
+ *
+ *	This function should be called between calls to
+ *	kernfs_addrm_start() and kernfs_addrm_finish() and should be
+ *	passed the same @acxt as passed to kernfs_addrm_start().
+ *
+ *	LOCKING:
+ *	Determined by kernfs_addrm_start().
+ */
+static void kernfs_remove_one(struct kernfs_addrm_cxt *acxt,
+			      struct kernfs_node *kn)
+{
+	struct kernfs_iattrs *ps_iattr;
+
+	/*
+	 * Removal can be called multiple times on the same node.  Only the
+	 * first invocation is effective and puts the base ref.
+	 */
+	if (kn->flags & KERNFS_REMOVED)
+		return;
+
+	if (kn->parent) {
+		kernfs_unlink_sibling(kn);
+
+		/* Update timestamps on the parent */
+		ps_iattr = kn->parent->iattr;
+		if (ps_iattr) {
+			ps_iattr->ia_iattr.ia_ctime = CURRENT_TIME;
+			ps_iattr->ia_iattr.ia_mtime = CURRENT_TIME;
+		}
+	}
+
+	kn->flags |= KERNFS_REMOVED;
+	kn->u.removed_list = acxt->removed;
+	acxt->removed = kn;
+}
+
+/**
+ *	kernfs_addrm_finish - finish up kernfs_node add/remove
+ *	@acxt: addrm context to finish up
+ *
+ *	Drops kernfs_mutex and then walks the @acxt->removed list: each
+ *	node on it is deactivated, has its bin file mappings unmapped and
+ *	is put.
+ *
+ *	LOCKING:
+ *	kernfs_mutex is released.
+ */
+void kernfs_addrm_finish(struct kernfs_addrm_cxt *acxt)
+	__releases(kernfs_mutex)
+{
+	struct kernfs_node *victim;
+
+	/* the removed list is processed without kernfs_mutex */
+	mutex_unlock(&kernfs_mutex);
+
+	/* pop nodes off the removed list one at a time and dispose of them */
+	while ((victim = acxt->removed) != NULL) {
+		acxt->removed = victim->u.removed_list;
+
+		kernfs_deactivate(victim);
+		kernfs_unmap_bin_file(victim);
+		kernfs_put(victim);
+	}
+}
+
+/**
+ * kernfs_find_ns - find kernfs_node with the given name
+ * @parent: kernfs_node to search under
+ * @name: name to look for
+ * @ns: the namespace tag to use
+ *
+ * Search @parent's children rbtree for an entry matching @name and @ns.
+ * Warns and fails if @ns is given for a parent without namespace support
+ * or missing for a parent with it.  Must be called with kernfs_mutex
+ * held.  Returns the matching kernfs_node, or %NULL if there is none.
+ */
+static struct kernfs_node *kernfs_find_ns(struct kernfs_node *parent,
+					  const unsigned char *name,
+					  const void *ns)
+{
+	bool has_ns = kernfs_ns_enabled(parent);
+	struct rb_node *cur;
+	unsigned int hash;
+
+	lockdep_assert_held(&kernfs_mutex);
+
+	if (has_ns != (bool)ns) {
+		WARN(1, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n",
+		     has_ns ? "required" : "invalid", parent->name, name);
+		return NULL;
+	}
+
+	hash = kernfs_name_hash(name, ns);
+
+	/* plain binary search keyed by kernfs_name_compare() */
+	for (cur = parent->dir.children.rb_node; cur; ) {
+		struct kernfs_node *candidate = rb_to_kn(cur);
+		int cmp = kernfs_name_compare(hash, name, ns, candidate);
+
+		if (!cmp)
+			return candidate;
+
+		cur = cmp < 0 ? cur->rb_left : cur->rb_right;
+	}
+
+	return NULL;
+}
+
+/**
+ * kernfs_find_and_get_ns - find and get kernfs_node with the given name
+ * @parent: kernfs_node to search under
+ * @name: name to look for
+ * @ns: the namespace tag to use
+ *
+ * Locked wrapper around kernfs_find_ns(): the lookup and the reference
+ * acquisition both happen under kernfs_mutex, so this function may
+ * sleep.  Returns the found kernfs_node on success, %NULL on failure.
+ */
+struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent,
+					   const char *name, const void *ns)
+{
+	struct kernfs_node *found;
+
+	mutex_lock(&kernfs_mutex);
+
+	found = kernfs_find_ns(parent, name, ns);
+	/* called for a failed lookup too, i.e. with found == NULL */
+	kernfs_get(found);
+
+	mutex_unlock(&kernfs_mutex);
+
+	return found;
+}
+EXPORT_SYMBOL_GPL(kernfs_find_and_get_ns);
+
+/**
+ * kernfs_create_root - create a new kernfs hierarchy
+ * @priv: opaque data associated with the new directory
+ *
+ * Allocates a kernfs_root together with its root directory node (empty
+ * name, mode S_IFDIR | S_IRUGO | S_IXUGO) and ties the two together.
+ *
+ * Returns the root of the new hierarchy on success, ERR_PTR(-ENOMEM) if
+ * either allocation fails.
+ */
+struct kernfs_root *kernfs_create_root(void *priv)
+{
+	struct kernfs_root *root = kzalloc(sizeof(*root), GFP_KERNEL);
+	struct kernfs_node *root_kn;
+
+	if (!root)
+		return ERR_PTR(-ENOMEM);
+
+	ida_init(&root->ino_ida);
+
+	root_kn = kernfs_new_node(root, "", S_IFDIR | S_IRUGO | S_IXUGO,
+				  KERNFS_DIR);
+	if (!root_kn)
+		goto err_free_root;
+
+	/* clear KERNFS_REMOVED by hand, kernfs_add_one() is not used here */
+	root_kn->flags &= ~KERNFS_REMOVED;
+	root_kn->priv = priv;
+	root_kn->dir.root = root;
+	root->kn = root_kn;
+
+	return root;
+
+err_free_root:
+	ida_destroy(&root->ino_ida);
+	kfree(root);
+	return ERR_PTR(-ENOMEM);
+}
+
+/**
+ * kernfs_destroy_root - destroy a kernfs hierarchy
+ * @root: root of the hierarchy to destroy
+ *
+ * Destroy the hierarchy anchored at @root by removing all existing
+ * directories and destroying @root.  This is a thin wrapper which hands
+ * @root's root node to kernfs_remove().
+ */
+void kernfs_destroy_root(struct kernfs_root *root)
+{
+	kernfs_remove(root->kn);	/* will also free @root */
+}
+
+/**
+ * kernfs_create_dir_ns - create a directory
+ * @parent: parent in which to create a new directory
+ * @name: name of the new directory
+ * @priv: opaque data associated with the new directory
+ * @ns: optional namespace tag of the directory
+ *
+ * Allocates a directory node, fills it in and links it under @parent
+ * inside a kernfs_addrm_start()/kernfs_addrm_finish() section.
+ *
+ * Returns the created node on success, ERR_PTR() value on failure.
+ */
+struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent,
+					 const char *name, void *priv,
+					 const void *ns)
+{
+	umode_t mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
+	struct kernfs_addrm_cxt acxt;
+	struct kernfs_node *dir;
+	int err;
+
+	dir = kernfs_new_node(kernfs_root(parent), name, mode, KERNFS_DIR);
+	if (!dir)
+		return ERR_PTR(-ENOMEM);
+
+	dir->dir.root = parent->dir.root;
+	dir->ns = ns;
+	dir->priv = priv;
+
+	/* make the new directory visible under @parent */
+	kernfs_addrm_start(&acxt);
+	err = kernfs_add_one(&acxt, dir, parent);
+	kernfs_addrm_finish(&acxt);
+
+	if (err) {
+		/* linking failed, drop the node allocated above */
+		kernfs_put(dir);
+		return ERR_PTR(err);
+	}
+
+	return dir;
+}
+
+/*
+ * ->lookup() inode operation for kernfs directories.
+ *
+ * Looks up @dentry's name among the children of the kernfs_node attached
+ * to @dentry's parent, all under kernfs_mutex.  On a hit a reference on
+ * the node is taken and stored in @dentry->d_fsdata, an inode is obtained
+ * through kernfs_get_inode() and the pair is handed to
+ * d_materialise_unique(), whose result is returned.
+ *
+ * Returns ERR_PTR(-ENOENT) if no matching node exists and
+ * ERR_PTR(-ENOMEM) if no inode could be obtained.
+ */
+static struct dentry *kernfs_iop_lookup(struct inode *dir,
+					struct dentry *dentry,
+					unsigned int flags)
+{
+	struct dentry *ret = NULL;
+	struct kernfs_node *parent = dentry->d_parent->d_fsdata;
+	struct kernfs_node *kn;
+	struct inode *inode;
+	const void *ns = NULL;
+
+	mutex_lock(&kernfs_mutex);
+
+	/* namespaced parents are searched with the superblock's ns tag */
+	if (kernfs_ns_enabled(parent))
+		ns = kernfs_info(dir->i_sb)->ns;
+
+	kn = kernfs_find_ns(parent, dentry->d_name.name, ns);
+
+	/* no such entry */
+	if (!kn) {
+		ret = ERR_PTR(-ENOENT);
+		goto out_unlock;
+	}
+	/* the dentry keeps its own reference on the node via d_fsdata */
+	kernfs_get(kn);
+	dentry->d_fsdata = kn;
+
+	/* attach dentry and inode */
+	inode = kernfs_get_inode(dir->i_sb, kn);
+	if (!inode) {
+		ret = ERR_PTR(-ENOMEM);
+		goto out_unlock;
+	}
+
+	/* instantiate and hash dentry */
+	ret = d_materialise_unique(dentry, inode);
+ out_unlock:
+	mutex_unlock(&kernfs_mutex);
+	return ret;
+}
+
+/*
+ * Inode operations for kernfs directories: name lookup plus the common
+ * kernfs permission, attribute and xattr handlers.
+ */
+const struct inode_operations kernfs_dir_iops = {
+	.lookup		= kernfs_iop_lookup,
+	.permission	= kernfs_iop_permission,
+	.setattr	= kernfs_iop_setattr,
+	.getattr	= kernfs_iop_getattr,
+	.setxattr	= kernfs_iop_setxattr,
+	.removexattr	= kernfs_iop_removexattr,
+	.getxattr	= kernfs_iop_getxattr,
+	.listxattr	= kernfs_iop_listxattr,
+};
+
+/*
+ * Starting at @pos, keep descending into the first child for as long as
+ * the current node is a directory with at least one child.  Returns the
+ * node at which the descent stops, which is @pos itself when @pos is not
+ * a directory or has no children.
+ */
+static struct kernfs_node *kernfs_leftmost_descendant(struct kernfs_node *pos)
+{
+	for (;;) {
+		struct rb_node *first;
+
+		/* only directories have children to descend into */
+		if (kernfs_type(pos) != KERNFS_DIR)
+			return pos;
+
+		first = rb_first(&pos->dir.children);
+		if (!first)
+			return pos;
+
+		pos = rb_to_kn(first);
+	}
+}
+
+/**
+ * kernfs_next_descendant_post - find the next descendant for post-order walk
+ * @pos: the current position (%NULL to initiate traversal)
+ * @root: kernfs_node whose descendants to walk
+ *
+ * Returns the node to visit after @pos in a post-order traversal of the
+ * subtree rooted at @root, or %NULL once @root itself has been visited.
+ * @root is part of the iteration and is visited last.  Must be called
+ * with kernfs_mutex held.
+ */
+static struct kernfs_node *kernfs_next_descendant_post(struct kernfs_node *pos,
+						       struct kernfs_node *root)
+{
+	struct rb_node *sibling;
+
+	lockdep_assert_held(&kernfs_mutex);
+
+	/* first step: start from the leftmost descendant, maybe @root */
+	if (!pos)
+		return kernfs_leftmost_descendant(root);
+
+	/* @root is the final node of the walk */
+	if (pos == root)
+		return NULL;
+
+	/*
+	 * Continue with the next sibling's leftmost descendant if there
+	 * is a next sibling; otherwise all children of the parent are
+	 * done and the parent itself comes next.
+	 */
+	sibling = rb_next(&pos->rb);
+	if (!sibling)
+		return pos->parent;
+
+	return kernfs_leftmost_descendant(rb_to_kn(sibling));
+}
+
+/*
+ * Remove @kn and everything below it in post-order using @acxt.  Does
+ * nothing when @kn is %NULL.
+ */
+static void __kernfs_remove(struct kernfs_addrm_cxt *acxt,
+			    struct kernfs_node *kn)
+{
+	struct kernfs_node *cur, *following;
+
+	if (!kn)
+		return;
+
+	pr_debug("kernfs %s: removing\n", kn->name);
+
+	cur = kernfs_next_descendant_post(NULL, kn);
+	while (cur) {
+		/* work out the successor before @cur gets unlinked */
+		following = kernfs_next_descendant_post(cur, kn);
+		kernfs_remove_one(acxt, cur);
+		cur = following;
+	}
+}
+
+/**
+ * kernfs_remove - remove a kernfs_node recursively
+ * @kn: the kernfs_node to remove
+ *
+ * Remove @kn along with all its subdirectories and files.  The removal
+ * itself is done by __kernfs_remove() inside a
+ * kernfs_addrm_start()/kernfs_addrm_finish() section.
+ */
+void kernfs_remove(struct kernfs_node *kn)
+{
+	struct kernfs_addrm_cxt acxt;
+
+	kernfs_addrm_start(&acxt);
+	__kernfs_remove(&acxt, kn);
+	kernfs_addrm_finish(&acxt);
+}
+
+/**
+ * kernfs_remove_by_name_ns - find a kernfs_node by name and remove it
+ * @parent: parent of the target
+ * @name: name of the kernfs_node to remove
+ * @ns: namespace tag of the kernfs_node to remove
+ *
+ * Look up the child of @parent matching @name and @ns and, if it exists,
+ * remove it together with everything below it.  A %NULL @parent triggers
+ * a warning.
+ *
+ * Returns 0 on success, -ENOENT if @parent is %NULL or no such entry
+ * exists.
+ */
+int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name,
+			     const void *ns)
+{
+	struct kernfs_addrm_cxt acxt;
+	struct kernfs_node *target;
+
+	if (!parent) {
+		WARN(1, KERN_WARNING "kernfs: can not remove '%s', no directory\n",
+			name);
+		return -ENOENT;
+	}
+
+	/* lookup and removal share one addrm section */
+	kernfs_addrm_start(&acxt);
+	target = kernfs_find_ns(parent, name, ns);
+	if (target)
+		__kernfs_remove(&acxt, target);
+	kernfs_addrm_finish(&acxt);
+
+	return target ? 0 : -ENOENT;
+}
+
+/**
+ * kernfs_rename_ns - move and rename a kernfs_node
+ * @kn: target node
+ * @new_parent: new parent to put @kn under
+ * @new_name: new name
+ * @new_ns: new namespace tag
+ *
+ * Runs entirely under kernfs_mutex.  Returns 0 on success or when
+ * parent, namespace tag and name are all unchanged, -EEXIST if
+ * @new_parent already has an entry matching @new_name and @new_ns, and
+ * -ENOMEM if the new name could not be duplicated.
+ */
+int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
+		     const char *new_name, const void *new_ns)
+{
+	int error;
+
+	mutex_lock(&kernfs_mutex);
+
+	error = 0;
+	if ((kn->parent == new_parent) && (kn->ns == new_ns) &&
+	    (strcmp(kn->name, new_name) == 0))
+		goto out;	/* nothing to rename */
+
+	error = -EEXIST;
+	if (kernfs_find_ns(new_parent, new_name, new_ns))
+		goto out;
+
+	/* rename kernfs_node */
+	if (strcmp(kn->name, new_name) != 0) {
+		error = -ENOMEM;
+		/* @kn gets its own copy of the name; the old one is freed */
+		new_name = kstrdup(new_name, GFP_KERNEL);
+		if (!new_name)
+			goto out;
+
+		kfree(kn->name);
+		kn->name = new_name;
+	}
+
+	/*
+	 * Move to the appropriate place in the appropriate directories rbtree.
+	 * The reference on the new parent is taken before the one on the
+	 * old parent is dropped, and the hash is recomputed from the new
+	 * name and namespace before relinking.
+	 */
+	kernfs_unlink_sibling(kn);
+	kernfs_get(new_parent);
+	kernfs_put(kn->parent);
+	kn->ns = new_ns;
+	kn->hash = kernfs_name_hash(kn->name, kn->ns);
+	kn->parent = new_parent;
+	kernfs_link_sibling(kn);
+
+	error = 0;
+ out:
+	mutex_unlock(&kernfs_mutex);
+	return error;
+}
+
+/*
+ * Relationship between kn->mode and the DT_xxx types: returns bits 12-15
+ * of @kn->mode shifted down into a four bit value.
+ */
+static inline unsigned char dt_type(struct kernfs_node *kn)
+{
+	return (kn->mode >> 12) & 15;
+}
+
+/*
+ * ->release() for kernfs directories.  Puts the kernfs_node stored in
+ * @filp->private_data (kernfs_fop_readdir() stores its current position
+ * there) and always returns 0.
+ */
+static int kernfs_dir_fop_release(struct inode *inode, struct file *filp)
+{
+	kernfs_put(filp->private_data);
+	return 0;
+}
+
+/*
+ * Work out the node at which a directory read at position @hash should
+ * continue.
+ *
+ * @pos is the cached position of the previous read, if any.  It is put
+ * here and kept only if it is not flagged KERNFS_REMOVED, is still a
+ * child of @parent and its hash equals @hash.  Without a usable cached
+ * position, and for @hash strictly between 1 and INT_MAX, @parent's
+ * children rbtree is descended comparing hashes only.  Finally, entries
+ * whose namespace tag differs from @ns are skipped.
+ *
+ * Returns the node to continue with, or %NULL if there is none.
+ */
+static struct kernfs_node *kernfs_dir_pos(const void *ns,
+	struct kernfs_node *parent, loff_t hash, struct kernfs_node *pos)
+{
+	if (pos) {
+		int valid = !(pos->flags & KERNFS_REMOVED) &&
+			pos->parent == parent && hash == pos->hash;
+		kernfs_put(pos);
+		if (!valid)
+			pos = NULL;
+	}
+	if (!pos && (hash > 1) && (hash < INT_MAX)) {
+		struct rb_node *node = parent->dir.children.rb_node;
+		/*
+		 * On an exact hash match the loop breaks with @pos set to
+		 * that entry; otherwise @pos is left at the last node
+		 * visited before running off the tree.
+		 */
+		while (node) {
+			pos = rb_to_kn(node);
+
+			if (hash < pos->hash)
+				node = node->rb_left;
+			else if (hash > pos->hash)
+				node = node->rb_right;
+			else
+				break;
+		}
+	}
+	/* Skip over entries in the wrong namespace */
+	while (pos && pos->ns != ns) {
+		struct rb_node *node = rb_next(&pos->rb);
+		if (!node)
+			pos = NULL;
+		else
+			pos = rb_to_kn(node);
+	}
+	return pos;
+}
+
+/*
+ * Resolve the current position through kernfs_dir_pos() and step from
+ * there to the next sibling whose namespace tag equals @ns.  Returns
+ * that sibling, or %NULL if the current position could not be resolved
+ * or no such sibling is left.
+ */
+static struct kernfs_node *kernfs_dir_next_pos(const void *ns,
+	struct kernfs_node *parent, ino_t ino, struct kernfs_node *pos)
+{
+	struct rb_node *next;
+
+	pos = kernfs_dir_pos(ns, parent, ino, pos);
+	if (!pos)
+		return NULL;
+
+	/* always advance at least once, then skip foreign namespaces */
+	do {
+		next = rb_next(&pos->rb);
+		pos = next ? rb_to_kn(next) : NULL;
+	} while (pos && pos->ns != ns);
+
+	return pos;
+}
+
+/*
+ * ->iterate() for kernfs directories.
+ *
+ * Emits "." and ".." and then the children of the directory's
+ * kernfs_node which carry the matching namespace tag.  @ctx->pos holds
+ * the hash of the entry being emitted and @file->private_data a
+ * referenced pointer to it, so that a later call can resume from there.
+ * kernfs_mutex is dropped around each dir_emit() call.  Once the
+ * directory is exhausted the cached position is cleared and @ctx->pos is
+ * set to INT_MAX.  Always returns 0.
+ */
+static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx)
+{
+	struct dentry *dentry = file->f_path.dentry;
+	struct kernfs_node *parent = dentry->d_fsdata;
+	struct kernfs_node *pos = file->private_data;
+	const void *ns = NULL;
+
+	if (!dir_emit_dots(file, ctx))
+		return 0;
+	mutex_lock(&kernfs_mutex);
+
+	if (kernfs_ns_enabled(parent))
+		ns = kernfs_info(dentry->d_sb)->ns;
+
+	for (pos = kernfs_dir_pos(ns, parent, ctx->pos, pos);
+	     pos;
+	     pos = kernfs_dir_next_pos(ns, parent, ctx->pos, pos)) {
+		const char *name = pos->name;
+		unsigned int type = dt_type(pos);
+		int len = strlen(name);
+		ino_t ino = pos->ino;
+
+		/* remember where we are; kernfs_dir_pos() puts this ref */
+		ctx->pos = pos->hash;
+		file->private_data = pos;
+		kernfs_get(pos);
+
+		/* dir_emit() is called without kernfs_mutex held */
+		mutex_unlock(&kernfs_mutex);
+		if (!dir_emit(ctx, name, len, ino, type))
+			return 0;
+		mutex_lock(&kernfs_mutex);
+	}
+	mutex_unlock(&kernfs_mutex);
+	file->private_data = NULL;
+	ctx->pos = INT_MAX;
+	return 0;
+}
+
+/*
+ * ->llseek() for kernfs directories: generic_file_llseek() executed with
+ * the directory inode's i_mutex held.
+ */
+static loff_t kernfs_dir_fop_llseek(struct file *file, loff_t offset,
+				    int whence)
+{
+	struct mutex *dir_lock = &file_inode(file)->i_mutex;
+	loff_t new_pos;
+
+	mutex_lock(dir_lock);
+	new_pos = generic_file_llseek(file, offset, whence);
+	mutex_unlock(dir_lock);
+
+	return new_pos;
+}
+
+/* File operations for kernfs directories. */
+const struct file_operations kernfs_dir_fops = {
+	.read		= generic_read_dir,
+	.iterate	= kernfs_fop_readdir,
+	.release	= kernfs_dir_fop_release,
+	.llseek		= kernfs_dir_fop_llseek,
+};
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
new file mode 100644
index 000000000000..053cfd9a6a40
--- /dev/null
+++ b/fs/kernfs/file.c
@@ -0,0 +1,824 @@
+/*
+ * fs/kernfs/file.c - kernfs file implementation
+ *
+ * Copyright (c) 2001-3 Patrick Mochel
+ * Copyright (c) 2007 SUSE Linux Products GmbH
+ * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org>
+ *
+ * This file is released under the GPLv2.
+ */
+
+#include <linux/fs.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/poll.h>
+#include <linux/pagemap.h>
+#include <linux/sched.h>
+
+#include "kernfs-internal.h"
+
+/*
+ * There's one kernfs_open_file for each open file and one kernfs_open_node
+ * for each kernfs_node with one or more open files.
+ *
+ * kernfs_node->attr.open points to kernfs_open_node.  attr.open is
+ * protected by kernfs_open_node_lock.
+ *
+ * filp->private_data points to seq_file whose ->private points to
+ * kernfs_open_file.  kernfs_open_files are chained at
+ * kernfs_open_node->files, which is protected by kernfs_open_file_mutex.
+ */
+/* protects kernfs_node->attr.open */
+static DEFINE_SPINLOCK(kernfs_open_node_lock);
+/* protects the kernfs_open_node->files lists */
+static DEFINE_MUTEX(kernfs_open_file_mutex);
+
+/* State shared by all open files of one kernfs_node. */
+struct kernfs_open_node {
+	/* taken per open file and temporarily by kernfs_unmap_bin_file() */
+	atomic_t		refcnt;
+	/* bumped by kernfs_notify(), compared against of->event in poll */
+	atomic_t		event;
+	/* waitqueue used by kernfs_fop_poll() and kernfs_notify() */
+	wait_queue_head_t	poll;
+	struct list_head	files; /* goes through kernfs_open_file.list */
+};
+
+/*
+ * Return the kernfs_open_file of @file, i.e. the ->private pointer of the
+ * seq_file which @file->private_data points to.
+ */
+static struct kernfs_open_file *kernfs_of(struct file *file)
+{
+	return ((struct seq_file *)file->private_data)->private;
+}
+
+/*
+ * Determine the kernfs_ops for the given kernfs_node.  This function must
+ * be called while holding an active reference.  Returns @kn->attr.ops.
+ */
+static const struct kernfs_ops *kernfs_ops(struct kernfs_node *kn)
+{
+	/* nodes flagged KERNFS_LOCKDEP get the rule above checked by lockdep */
+	if (kn->flags & KERNFS_LOCKDEP)
+		lockdep_assert_held(kn);
+	return kn->attr.ops;
+}
+
+/*
+ * seq_file invokes ->stop() after ->start() even when ->start() returned
+ * an error.  kernfs_seq_start() returns ERR_PTR(-ENODEV) without an
+ * active reference when kernfs_get_active() fails, so kernfs_seq_stop()
+ * has to be able to tell whether there is an active reference to drop.
+ *
+ * ERR_PTR(-ENODEV) is used as the marker for "no active reference is
+ * held".  To keep that unambiguous, whenever a seq_start or seq_next
+ * callback itself returns ERR_PTR(-ENODEV), the active part is torn down
+ * right away through this helper so that kernfs_seq_stop() can skip it.
+ */
+static void kernfs_seq_stop_active(struct seq_file *sf, void *v)
+{
+	struct kernfs_open_file *of = sf->private;
+	const struct kernfs_ops *ops = kernfs_ops(of->kn);
+
+	if (ops->seq_stop)
+		ops->seq_stop(sf, v);
+	kernfs_put_active(of->kn);
+}
+
+/*
+ * seq_file ->start(): lock @of->mutex, grab an active reference and hand
+ * over to the seq_start callback if there is one.  Returns
+ * ERR_PTR(-ENODEV), with @of->mutex held but no active reference, if the
+ * active reference cannot be taken.
+ */
+static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos)
+{
+	struct kernfs_open_file *of = sf->private;
+	const struct kernfs_ops *ops;
+
+	/*
+	 * @of->mutex nests outside active ref and is just to ensure that
+	 * the ops aren't called concurrently for the same open file.
+	 */
+	mutex_lock(&of->mutex);
+	if (!kernfs_get_active(of->kn))
+		return ERR_PTR(-ENODEV);
+
+	ops = kernfs_ops(of->kn);
+	if (ops->seq_start) {
+		void *next = ops->seq_start(sf, ppos);
+
+		/* see the comment above kernfs_seq_stop_active() */
+		if (next == ERR_PTR(-ENODEV))
+			kernfs_seq_stop_active(sf, next);
+		return next;
+	} else {
+		/*
+		 * The same behavior and code as single_open().  Returns
+		 * !NULL if pos is at the beginning; otherwise, NULL.
+		 */
+		return NULL + !*ppos;
+	}
+}
+
+/*
+ * seq_file ->next(): forward to the seq_next callback if there is one;
+ * otherwise advance *@ppos and end the iteration.
+ */
+static void *kernfs_seq_next(struct seq_file *sf, void *v, loff_t *ppos)
+{
+	struct kernfs_open_file *of = sf->private;
+	const struct kernfs_ops *ops = kernfs_ops(of->kn);
+
+	if (ops->seq_next) {
+		void *next = ops->seq_next(sf, v, ppos);
+
+		/* see the comment above kernfs_seq_stop_active() */
+		if (next == ERR_PTR(-ENODEV))
+			kernfs_seq_stop_active(sf, next);
+		return next;
+	} else {
+		/*
+		 * The same behavior and code as single_open(), always
+		 * terminate after the initial read.
+		 */
+		++*ppos;
+		return NULL;
+	}
+}
+
+/*
+ * seq_file ->stop(): undo kernfs_seq_start().  The active reference is
+ * dropped only if one is held, which is the case unless @v is
+ * ERR_PTR(-ENODEV); @of->mutex is always released.
+ */
+static void kernfs_seq_stop(struct seq_file *sf, void *v)
+{
+	struct kernfs_open_file *of = sf->private;
+
+	if (v != ERR_PTR(-ENODEV))
+		kernfs_seq_stop_active(sf, v);
+	mutex_unlock(&of->mutex);
+}
+
+/*
+ * seq_file ->show(): record the open node's current event count in
+ * @of->event, where kernfs_fop_poll() compares against it, and then let
+ * the node's seq_show callback produce the output.
+ */
+static int kernfs_seq_show(struct seq_file *sf, void *v)
+{
+	struct kernfs_open_file *of = sf->private;
+	struct kernfs_node *kn = of->kn;
+
+	of->event = atomic_read(&kn->attr.open->event);
+
+	return kn->attr.ops->seq_show(sf, v);
+}
+
+/* seq_file iterator installed by kernfs_fop_open() for seq_show files */
+static const struct seq_operations kernfs_seq_ops = {
+	.start = kernfs_seq_start,
+	.next = kernfs_seq_next,
+	.stop = kernfs_seq_stop,
+	.show = kernfs_seq_show,
+};
+
+/*
+ * As reading a bin file can have side-effects, the exact offset and bytes
+ * specified in read(2) call should be passed to the read callback making
+ * it difficult to use seq_file.  Implement simplistic custom buffering for
+ * bin files.
+ *
+ * At most PAGE_SIZE bytes are handled per call.  Returns the number of
+ * bytes copied to @user_buf and advances *@ppos by it, or returns
+ * -ENOMEM, -ENODEV (no active reference), -EINVAL (no read callback),
+ * -EFAULT or whatever negative value the read callback returned.
+ */
+static ssize_t kernfs_file_direct_read(struct kernfs_open_file *of,
+				       char __user *user_buf, size_t count,
+				       loff_t *ppos)
+{
+	ssize_t ret = min_t(size_t, count, PAGE_SIZE);
+	char *kbuf;
+
+	kbuf = kmalloc(ret, GFP_KERNEL);
+	if (!kbuf)
+		return -ENOMEM;
+
+	/*
+	 * @of->mutex nests outside active ref and is just to ensure that
+	 * the ops aren't called concurrently for the same open file.
+	 */
+	mutex_lock(&of->mutex);
+	if (kernfs_get_active(of->kn)) {
+		const struct kernfs_ops *ops = kernfs_ops(of->kn);
+
+		if (ops->read)
+			ret = ops->read(of, kbuf, ret, *ppos);
+		else
+			ret = -EINVAL;
+
+		kernfs_put_active(of->kn);
+	} else {
+		ret = -ENODEV;
+	}
+	mutex_unlock(&of->mutex);
+
+	/* only touch userland and the file position on success */
+	if (ret >= 0) {
+		if (copy_to_user(user_buf, kbuf, ret))
+			ret = -EFAULT;
+		else
+			*ppos += ret;
+	}
+
+	kfree(kbuf);
+	return ret;
+}
+
+/**
+ * kernfs_fop_read - kernfs vfs read callback
+ * @file: file pointer
+ * @user_buf: userland buffer to read into
+ * @count: number of bytes
+ * @ppos: starting offset
+ *
+ * Files flagged KERNFS_HAS_SEQ_SHOW are read through seq_file; all
+ * others go through kernfs_file_direct_read().
+ */
+static ssize_t kernfs_fop_read(struct file *file, char __user *user_buf,
+			       size_t count, loff_t *ppos)
+{
+	struct kernfs_open_file *of = kernfs_of(file);
+
+	if (!(of->kn->flags & KERNFS_HAS_SEQ_SHOW))
+		return kernfs_file_direct_read(of, user_buf, count, ppos);
+
+	return seq_read(file, user_buf, count, ppos);
+}
+
+/**
+ * kernfs_fop_write - kernfs vfs write callback
+ * @file: file pointer
+ * @user_buf: data to write
+ * @count: number of bytes
+ * @ppos: starting offset
+ *
+ * Copy data in from userland and pass it to the matching kernfs write
+ * operation.  At most PAGE_SIZE bytes are taken and the kernel copy is
+ * always NUL terminated.
+ *
+ * There is no easy way for us to know if userspace is only doing a partial
+ * write, so we don't support them. We expect the entire buffer to come on
+ * the first write.  Hint: if you're writing a value, first read the file,
+ * modify only the value you're changing, then write entire buffer
+ * back.
+ */
+static ssize_t kernfs_fop_write(struct file *file, const char __user *user_buf,
+				size_t count, loff_t *ppos)
+{
+	struct kernfs_open_file *of = kernfs_of(file);
+	ssize_t ret = min_t(size_t, count, PAGE_SIZE);
+	char *kbuf;
+
+	/* one extra byte for the terminating NUL */
+	kbuf = kmalloc(ret + 1, GFP_KERNEL);
+	if (!kbuf)
+		return -ENOMEM;
+
+	if (copy_from_user(kbuf, user_buf, ret)) {
+		ret = -EFAULT;
+		goto out_free;
+	}
+	kbuf[ret] = '\0';	/* guarantee string termination */
+
+	/*
+	 * @of->mutex nests outside active ref and is just to ensure that
+	 * the ops aren't called concurrently for the same open file.
+	 */
+	mutex_lock(&of->mutex);
+	if (kernfs_get_active(of->kn)) {
+		const struct kernfs_ops *ops = kernfs_ops(of->kn);
+
+		if (ops->write)
+			ret = ops->write(of, kbuf, ret, *ppos);
+		else
+			ret = -EINVAL;
+
+		kernfs_put_active(of->kn);
+	} else {
+		ret = -ENODEV;
+	}
+	mutex_unlock(&of->mutex);
+
+	/* advance the file position by what was actually written */
+	if (ret > 0)
+		*ppos += ret;
+out_free:
+	kfree(kbuf);
+	return ret;
+}
+
+/*
+ * vm_ops->open wrapper: forward to the wrapped vm_ops' ->open(), if any,
+ * while holding an active reference on the node.
+ */
+static void kernfs_vma_open(struct vm_area_struct *vma)
+{
+	struct kernfs_open_file *of = kernfs_of(vma->vm_file);
+
+	/* nothing wrapped, or no active reference to be had */
+	if (!of->vm_ops || !kernfs_get_active(of->kn))
+		return;
+
+	if (of->vm_ops->open)
+		of->vm_ops->open(vma);
+
+	kernfs_put_active(of->kn);
+}
+
+/*
+ * vm_ops->fault wrapper: forward to the wrapped vm_ops' ->fault() while
+ * holding an active reference.  Yields VM_FAULT_SIGBUS if there are no
+ * wrapped vm_ops, no active reference can be taken or the wrapped vm_ops
+ * have no ->fault().
+ */
+static int kernfs_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct kernfs_open_file *of = kernfs_of(vma->vm_file);
+	int result = VM_FAULT_SIGBUS;
+
+	if (!of->vm_ops || !kernfs_get_active(of->kn))
+		return result;
+
+	if (of->vm_ops->fault)
+		result = of->vm_ops->fault(vma, vmf);
+
+	kernfs_put_active(of->kn);
+	return result;
+}
+
+/*
+ * vm_ops->page_mkwrite wrapper: forward to the wrapped vm_ops'
+ * ->page_mkwrite() while holding an active reference; if the wrapped
+ * vm_ops have none, update the file times and return 0.  Yields
+ * VM_FAULT_SIGBUS if there are no wrapped vm_ops or no active reference
+ * can be taken.
+ */
+static int kernfs_vma_page_mkwrite(struct vm_area_struct *vma,
+				   struct vm_fault *vmf)
+{
+	struct file *file = vma->vm_file;
+	struct kernfs_open_file *of = kernfs_of(file);
+	int result = 0;
+
+	if (!of->vm_ops || !kernfs_get_active(of->kn))
+		return VM_FAULT_SIGBUS;
+
+	if (!of->vm_ops->page_mkwrite)
+		file_update_time(file);
+	else
+		result = of->vm_ops->page_mkwrite(vma, vmf);
+
+	kernfs_put_active(of->kn);
+	return result;
+}
+
+/*
+ * vm_ops->access wrapper: forward to the wrapped vm_ops' ->access()
+ * while holding an active reference.  Yields -EINVAL if there are no
+ * wrapped vm_ops, no active reference can be taken or the wrapped vm_ops
+ * have no ->access().
+ */
+static int kernfs_vma_access(struct vm_area_struct *vma, unsigned long addr,
+			     void *buf, int len, int write)
+{
+	struct kernfs_open_file *of = kernfs_of(vma->vm_file);
+	int result = -EINVAL;
+
+	if (!of->vm_ops || !kernfs_get_active(of->kn))
+		return result;
+
+	if (of->vm_ops->access)
+		result = of->vm_ops->access(vma, addr, buf, len, write);
+
+	kernfs_put_active(of->kn);
+	return result;
+}
+
+#ifdef CONFIG_NUMA
+/*
+ * vm_ops->set_policy wrapper: forward to the wrapped vm_ops'
+ * ->set_policy() while holding an active reference.  Returns 0 if there
+ * are no wrapped vm_ops or they have no ->set_policy(), -EINVAL if no
+ * active reference can be taken.
+ */
+static int kernfs_vma_set_policy(struct vm_area_struct *vma,
+				 struct mempolicy *new)
+{
+	struct kernfs_open_file *of = kernfs_of(vma->vm_file);
+	int result = 0;
+
+	if (!of->vm_ops)
+		return 0;
+
+	if (!kernfs_get_active(of->kn))
+		return -EINVAL;
+
+	if (of->vm_ops->set_policy)
+		result = of->vm_ops->set_policy(vma, new);
+
+	kernfs_put_active(of->kn);
+	return result;
+}
+
+/*
+ * vm_ops->get_policy wrapper: forward to the wrapped vm_ops'
+ * ->get_policy() while holding an active reference.  Falls back to
+ * @vma->vm_policy if there are no wrapped vm_ops, no active reference
+ * can be taken or the wrapped vm_ops have no ->get_policy().
+ */
+static struct mempolicy *kernfs_vma_get_policy(struct vm_area_struct *vma,
+					       unsigned long addr)
+{
+	struct kernfs_open_file *of = kernfs_of(vma->vm_file);
+	struct mempolicy *policy;
+
+	if (!of->vm_ops || !kernfs_get_active(of->kn))
+		return vma->vm_policy;
+
+	policy = vma->vm_policy;
+	if (of->vm_ops->get_policy)
+		policy = of->vm_ops->get_policy(vma, addr);
+
+	kernfs_put_active(of->kn);
+	return policy;
+}
+
+/*
+ * vm_ops->migrate wrapper: forward to the wrapped vm_ops' ->migrate()
+ * while holding an active reference.  Returns 0 if there are no wrapped
+ * vm_ops, no active reference can be taken or the wrapped vm_ops have no
+ * ->migrate().
+ */
+static int kernfs_vma_migrate(struct vm_area_struct *vma,
+			      const nodemask_t *from, const nodemask_t *to,
+			      unsigned long flags)
+{
+	struct kernfs_open_file *of = kernfs_of(vma->vm_file);
+	int result = 0;
+
+	if (!of->vm_ops || !kernfs_get_active(of->kn))
+		return result;
+
+	if (of->vm_ops->migrate)
+		result = of->vm_ops->migrate(vma, from, to, flags);
+
+	kernfs_put_active(of->kn);
+	return result;
+}
+#endif
+
+/*
+ * vm_ops installed on the vma by kernfs_fop_mmap().  Each handler
+ * forwards to the vm_ops saved in kernfs_open_file->vm_ops.  There is
+ * deliberately no ->close, see kernfs_fop_mmap().
+ */
+static const struct vm_operations_struct kernfs_vm_ops = {
+	.open		= kernfs_vma_open,
+	.fault		= kernfs_vma_fault,
+	.page_mkwrite	= kernfs_vma_page_mkwrite,
+	.access		= kernfs_vma_access,
+#ifdef CONFIG_NUMA
+	.set_policy	= kernfs_vma_set_policy,
+	.get_policy	= kernfs_vma_get_policy,
+	.migrate	= kernfs_vma_migrate,
+#endif
+};
+
+/*
+ * ->mmap() for kernfs files.
+ *
+ * Calls the node's mmap callback and, on success, saves the vm_ops the
+ * callback installed in @of->vm_ops and replaces them on @vma with
+ * kernfs_vm_ops, which forward to the saved ones under an active
+ * reference.
+ *
+ * Returns 0 on success, -ENODEV if the node does not implement mmap or
+ * no active reference can be taken, the callback's error if it fails,
+ * and -EINVAL if the callback's vm_ops differ from those of an earlier
+ * mmap of the same open file or implement ->close.
+ */
+static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	struct kernfs_open_file *of = kernfs_of(file);
+	const struct kernfs_ops *ops;
+	int rc;
+
+	/*
+	 * mmap path and of->mutex are prone to triggering spurious lockdep
+	 * warnings and we don't want to add spurious locking dependency
+	 * between the two.  Check whether mmap is actually implemented
+	 * without grabbing @of->mutex by testing HAS_MMAP flag.  See the
+	 * comment in kernfs_fop_open() for more details.
+	 */
+	if (!(of->kn->flags & KERNFS_HAS_MMAP))
+		return -ENODEV;
+
+	mutex_lock(&of->mutex);
+
+	rc = -ENODEV;
+	if (!kernfs_get_active(of->kn))
+		goto out_unlock;
+
+	ops = kernfs_ops(of->kn);
+	rc = ops->mmap(of, vma);
+	/* a failed mmap must be reported as such and must not be wrapped */
+	if (rc)
+		goto out_put;
+
+	/*
+	 * PowerPC's pci_mmap of legacy_mem uses shmem_zero_setup()
+	 * to satisfy versions of X which crash if the mmap fails: that
+	 * substitutes a new vm_file, and we don't then want kernfs_vm_ops.
+	 */
+	if (vma->vm_file != file)
+		goto out_put;
+
+	/* all mmaps of one open file have to agree on the vm_ops */
+	rc = -EINVAL;
+	if (of->mmapped && of->vm_ops != vma->vm_ops)
+		goto out_put;
+
+	/*
+	 * It is not possible to successfully wrap close.
+	 * So error if someone is trying to use close.
+	 */
+	rc = -EINVAL;
+	if (vma->vm_ops && vma->vm_ops->close)
+		goto out_put;
+
+	rc = 0;
+	of->mmapped = 1;
+	of->vm_ops = vma->vm_ops;
+	vma->vm_ops = &kernfs_vm_ops;
+out_put:
+	kernfs_put_active(of->kn);
+out_unlock:
+	mutex_unlock(&of->mutex);
+
+	return rc;
+}
+
+/**
+ *	kernfs_get_open_node - get or create kernfs_open_node
+ *	@kn: target kernfs_node
+ *	@of: kernfs_open_file for this instance of open
+ *
+ *	If @kn->attr.open exists, increment its reference count; otherwise,
+ *	create one.  @of is chained to the files list.
+ *
+ *	A new kernfs_open_node is allocated without any lock held; the
+ *	function then retries from the top and installs it only if
+ *	@kn->attr.open is still unset, freeing it otherwise.
+ *
+ *	LOCKING:
+ *	Kernel thread context (may sleep).
+ *
+ *	RETURNS:
+ *	0 on success, -errno on failure.
+ */
+static int kernfs_get_open_node(struct kernfs_node *kn,
+				struct kernfs_open_file *of)
+{
+	struct kernfs_open_node *on, *new_on = NULL;
+
+ retry:
+	mutex_lock(&kernfs_open_file_mutex);
+	spin_lock_irq(&kernfs_open_node_lock);
+
+	/* second pass: install our allocation unless somebody beat us */
+	if (!kn->attr.open && new_on) {
+		kn->attr.open = new_on;
+		new_on = NULL;
+	}
+
+	on = kn->attr.open;
+	if (on) {
+		atomic_inc(&on->refcnt);
+		list_add_tail(&of->list, &on->files);
+	}
+
+	spin_unlock_irq(&kernfs_open_node_lock);
+	mutex_unlock(&kernfs_open_file_mutex);
+
+	if (on) {
+		/* new_on is non-NULL here only if it was not installed */
+		kfree(new_on);
+		return 0;
+	}
+
+	/* not there, initialize a new one and retry */
+	new_on = kmalloc(sizeof(*new_on), GFP_KERNEL);
+	if (!new_on)
+		return -ENOMEM;
+
+	atomic_set(&new_on->refcnt, 0);
+	atomic_set(&new_on->event, 1);
+	init_waitqueue_head(&new_on->poll);
+	INIT_LIST_HEAD(&new_on->files);
+	goto retry;
+}
+
+/**
+ *	kernfs_put_open_node - put kernfs_open_node
+ *	@kn: target kernfs_node
+ *	@of: associated kernfs_open_file, may be %NULL
+ *
+ *	Put @kn->attr.open and unlink @of from the files list.  If
+ *	reference count reaches zero, disassociate and free it.  With a
+ *	%NULL @of only the reference is dropped.
+ *
+ *	LOCKING:
+ *	None.
+ */
+static void kernfs_put_open_node(struct kernfs_node *kn,
+				 struct kernfs_open_file *of)
+{
+	struct kernfs_open_node *on = kn->attr.open;
+	unsigned long flags;
+
+	mutex_lock(&kernfs_open_file_mutex);
+	spin_lock_irqsave(&kernfs_open_node_lock, flags);
+
+	if (of)
+		list_del(&of->list);
+
+	/* @on stays set only if this was the last reference */
+	if (atomic_dec_and_test(&on->refcnt))
+		kn->attr.open = NULL;
+	else
+		on = NULL;
+
+	spin_unlock_irqrestore(&kernfs_open_node_lock, flags);
+	mutex_unlock(&kernfs_open_file_mutex);
+
+	/* kfree(NULL) is a no-op, so this frees only on the last put */
+	kfree(on);
+}
+
+/*
+ * ->open() for kernfs files.
+ *
+ * With an active reference held for the duration of the call: checks
+ * the requested access mode against the inode mode bits and the
+ * operations the node implements, allocates the kernfs_open_file, opens
+ * the backing seq_file and registers the open file with the node's
+ * kernfs_open_node.
+ *
+ * Returns 0 on success, -ENODEV if no active reference can be taken,
+ * -EACCES if the access mode is not permitted or not supported, -ENOMEM
+ * if the kernfs_open_file cannot be allocated, or the error returned by
+ * seq_open() or kernfs_get_open_node().
+ */
+static int kernfs_fop_open(struct inode *inode, struct file *file)
+{
+	struct kernfs_node *kn = file->f_path.dentry->d_fsdata;
+	const struct kernfs_ops *ops;
+	struct kernfs_open_file *of;
+	bool has_read, has_write, has_mmap;
+	int error = -EACCES;
+
+	if (!kernfs_get_active(kn))
+		return -ENODEV;
+
+	ops = kernfs_ops(kn);
+
+	/* an mmap callback counts as both read and write support */
+	has_read = ops->seq_show || ops->read || ops->mmap;
+	has_write = ops->write || ops->mmap;
+	has_mmap = ops->mmap;
+
+	/* check perms and supported operations */
+	if ((file->f_mode & FMODE_WRITE) &&
+	    (!(inode->i_mode & S_IWUGO) || !has_write))
+		goto err_out;
+
+	if ((file->f_mode & FMODE_READ) &&
+	    (!(inode->i_mode & S_IRUGO) || !has_read))
+		goto err_out;
+
+	/* allocate a kernfs_open_file for the file */
+	error = -ENOMEM;
+	of = kzalloc(sizeof(struct kernfs_open_file), GFP_KERNEL);
+	if (!of)
+		goto err_out;
+
+	/*
+	 * The following is done to give a different lockdep key to
+	 * @of->mutex for files which implement mmap.  This is a rather
+	 * crude way to avoid false positive lockdep warning around
+	 * mm->mmap_sem - mmap nests @of->mutex under mm->mmap_sem and
+	 * reading /sys/block/sda/trace/act_mask grabs sr_mutex, under
+	 * which mm->mmap_sem nests, while holding @of->mutex.  As each
+	 * open file has a separate mutex, it's okay as long as those don't
+	 * happen on the same file.  At this point, we can't easily give
+	 * each file a separate locking class.  Let's differentiate on
+	 * whether the file has mmap or not for now.
+	 *
+	 * Both paths of the branch look the same.  They're supposed to
+	 * look that way and give @of->mutex different static lockdep keys.
+	 */
+	if (has_mmap)
+		mutex_init(&of->mutex);
+	else
+		mutex_init(&of->mutex);
+
+	of->kn = kn;
+	of->file = file;
+
+	/*
+	 * Always instantiate seq_file even if read access doesn't use
+	 * seq_file or is not requested.  This unifies private data access
+	 * and readable regular files are the vast majority anyway.
+	 */
+	if (ops->seq_show)
+		error = seq_open(file, &kernfs_seq_ops);
+	else
+		error = seq_open(file, NULL);
+	if (error)
+		goto err_free;
+
+	/* this is the pointer kernfs_of() retrieves later on */
+	((struct seq_file *)file->private_data)->private = of;
+
+	/* seq_file clears PWRITE unconditionally, restore it if WRITE */
+	if (file->f_mode & FMODE_WRITE)
+		file->f_mode |= FMODE_PWRITE;
+
+	/* make sure we have open node struct */
+	error = kernfs_get_open_node(kn, of);
+	if (error)
+		goto err_close;
+
+	/* open succeeded, put active references */
+	kernfs_put_active(kn);
+	return 0;
+
+err_close:
+	seq_release(inode, file);
+err_free:
+	kfree(of);
+err_out:
+	kernfs_put_active(kn);
+	return error;
+}
+
+/*
+ * ->release() for kernfs files.  Undoes kernfs_fop_open(): unregisters
+ * the open file from the node's kernfs_open_node, releases the seq_file
+ * and frees the kernfs_open_file.  Always returns 0.
+ */
+static int kernfs_fop_release(struct inode *inode, struct file *filp)
+{
+	struct kernfs_node *kn = filp->f_path.dentry->d_fsdata;
+	struct kernfs_open_file *of = kernfs_of(filp);
+
+	kernfs_put_open_node(kn, of);
+	seq_release(inode, filp);
+	kfree(of);
+
+	return 0;
+}
+
+/*
+ * Zap the mappings of every open file of @kn by calling
+ * unmap_mapping_range() on each file's inode mapping.  Does nothing
+ * unless @kn is flagged KERNFS_HAS_MMAP and currently has a
+ * kernfs_open_node.
+ */
+void kernfs_unmap_bin_file(struct kernfs_node *kn)
+{
+	struct kernfs_open_node *open_node;
+	struct kernfs_open_file *of;
+
+	if (!(kn->flags & KERNFS_HAS_MMAP))
+		return;
+
+	/* take a reference on the open node for the duration of the walk */
+	spin_lock_irq(&kernfs_open_node_lock);
+	open_node = kn->attr.open;
+	if (open_node)
+		atomic_inc(&open_node->refcnt);
+	spin_unlock_irq(&kernfs_open_node_lock);
+
+	if (!open_node)
+		return;
+
+	mutex_lock(&kernfs_open_file_mutex);
+	list_for_each_entry(of, &open_node->files, list)
+		unmap_mapping_range(file_inode(of->file)->i_mapping, 0, 0, 1);
+	mutex_unlock(&kernfs_open_file_mutex);
+
+	/* drop the reference taken above, no file to unlink */
+	kernfs_put_open_node(kn, NULL);
+}
+
+/*
+ * Kernfs attribute files are pollable.  The idea is that you read
+ * the content and then you use 'poll' or 'select' to wait for
+ * the content to change.  When the content changes (assuming the
+ * manager for the kobject supports notification), poll will
+ * return POLLERR|POLLPRI, and select will return the fd whether
+ * it is waiting for read, write, or exceptions.
+ * Once poll/select indicates that the value has changed, you
+ * need to close and re-open the file, or seek to 0 and read again.
+ * Reminder: this only works for attributes which actively support
+ * it, and it is not possible to test an attribute from userspace
+ * to see if it supports poll (Neither 'poll' nor 'select' return
+ * an appropriate error code).  When in doubt, set a suitable timeout value.
+ *
+ * POLLERR|POLLPRI is also reported when no active reference on the node
+ * can be taken.
+ */
+static unsigned int kernfs_fop_poll(struct file *filp, poll_table *wait)
+{
+	struct kernfs_open_file *of = kernfs_of(filp);
+	struct kernfs_node *kn = filp->f_path.dentry->d_fsdata;
+	struct kernfs_open_node *on = kn->attr.open;
+	unsigned int mask = DEFAULT_POLLMASK;
+
+	/* the waitqueue is registered under an active reference */
+	if (!kernfs_get_active(kn))
+		return mask | POLLERR | POLLPRI;
+
+	poll_wait(filp, &on->poll, wait);
+
+	kernfs_put_active(kn);
+
+	/* the event count moved on since this file last sampled it */
+	if (of->event != atomic_read(&on->event))
+		mask |= POLLERR | POLLPRI;
+
+	return mask;
+}
+
+/**
+ * kernfs_notify - notify a kernfs file
+ * @kn: file to notify
+ *
+ * Notify @kn such that poll(2) on @kn wakes up: if @kn currently has a
+ * kernfs_open_node, its event count is bumped and its poll waiters are
+ * woken.  Warns and does nothing if @kn is not a KERNFS_FILE.
+ */
+void kernfs_notify(struct kernfs_node *kn)
+{
+	struct kernfs_open_node *open_node;
+	unsigned long flags;
+
+	spin_lock_irqsave(&kernfs_open_node_lock, flags);
+
+	if (WARN_ON(kernfs_type(kn) != KERNFS_FILE))
+		goto out_unlock;
+
+	open_node = kn->attr.open;
+	if (open_node) {
+		atomic_inc(&open_node->event);
+		wake_up_interruptible(&open_node->poll);
+	}
+
+out_unlock:
+	spin_unlock_irqrestore(&kernfs_open_node_lock, flags);
+}
+EXPORT_SYMBOL_GPL(kernfs_notify);
+
+/* File operations for kernfs regular files. */
+const struct file_operations kernfs_file_fops = {
+	.read		= kernfs_fop_read,
+	.write		= kernfs_fop_write,
+	.llseek		= generic_file_llseek,
+	.mmap		= kernfs_fop_mmap,
+	.open		= kernfs_fop_open,
+	.release	= kernfs_fop_release,
+	.poll		= kernfs_fop_poll,
+};
+
+/**
+ * kernfs_create_file_ns_key - create a file
+ * @parent: directory to create the file in
+ * @name: name of the file
+ * @mode: mode of the file
+ * @size: size of the file
+ * @ops: kernfs operations for the file
+ * @priv: private data for the file
+ * @ns: optional namespace tag of the file
+ * @key: lockdep key for the file's active_ref, %NULL to disable lockdep
+ *
+ * Only the S_IALLUGO bits of @mode are used; the node is always created
+ * as a regular file.
+ *
+ * Returns the created node on success, ERR_PTR() value on error.
+ */
+struct kernfs_node *kernfs_create_file_ns_key(struct kernfs_node *parent,
+					      const char *name,
+					      umode_t mode, loff_t size,
+					      const struct kernfs_ops *ops,
+					      void *priv, const void *ns,
+					      struct lock_class_key *key)
+{
+	struct kernfs_addrm_cxt acxt;
+	struct kernfs_node *file_kn;
+	int err;
+
+	file_kn = kernfs_new_node(kernfs_root(parent), name,
+				  (mode & S_IALLUGO) | S_IFREG, KERNFS_FILE);
+	if (!file_kn)
+		return ERR_PTR(-ENOMEM);
+
+	file_kn->attr.ops = ops;
+	file_kn->attr.size = size;
+	file_kn->ns = ns;
+	file_kn->priv = priv;
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	if (key) {
+		lockdep_init_map(&file_kn->dep_map, "s_active", key, 0);
+		file_kn->flags |= KERNFS_LOCKDEP;
+	}
+#endif
+
+	/*
+	 * kn->attr.ops is accessible only while holding active ref.  Some
+	 * paths have to know whether certain ops are implemented without
+	 * holding one, so record their presence in the node flags.
+	 */
+	if (ops->seq_show)
+		file_kn->flags |= KERNFS_HAS_SEQ_SHOW;
+	if (ops->mmap)
+		file_kn->flags |= KERNFS_HAS_MMAP;
+
+	/* make the new file visible under @parent */
+	kernfs_addrm_start(&acxt);
+	err = kernfs_add_one(&acxt, file_kn, parent);
+	kernfs_addrm_finish(&acxt);
+
+	if (!err)
+		return file_kn;
+
+	/* linking failed, drop the node allocated above */
+	kernfs_put(file_kn);
+	return ERR_PTR(err);
+}
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
new file mode 100644
index 000000000000..e55126f85bd2
--- /dev/null
+++ b/fs/kernfs/inode.c
@@ -0,0 +1,377 @@
+/*
+ * fs/kernfs/inode.c - kernfs inode implementation
+ *
+ * Copyright (c) 2001-3 Patrick Mochel
+ * Copyright (c) 2007 SUSE Linux Products GmbH
+ * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org>
+ *
+ * This file is released under the GPLv2.
+ */
+
+#include <linux/pagemap.h>
+#include <linux/backing-dev.h>
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/xattr.h>
+#include <linux/security.h>
+
+#include "kernfs-internal.h"
+
+/*
+ * Address space operations installed on every kernfs inode by
+ * kernfs_init_inode(); all handlers are the generic simple_*() helpers.
+ */
+static const struct address_space_operations kernfs_aops = {
+	.readpage	= simple_readpage,
+	.write_begin	= simple_write_begin,
+	.write_end	= simple_write_end,
+};
+
+/*
+ * Backing device info shared by all kernfs inodes.  It is initialized by
+ * kernfs_inode_init() and attached to each inode's mapping in
+ * kernfs_init_inode().
+ */
+static struct backing_dev_info kernfs_bdi = {
+	.name		= "kernfs",
+	.ra_pages	= 0,	/* No readahead */
+	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK,
+};
+
+/*
+ * Default inode operations, installed by kernfs_init_inode().  Directories
+ * and symlinks get their i_op replaced there by kernfs_dir_iops and
+ * kernfs_symlink_iops respectively, so this table ends up on regular files.
+ */
+static const struct inode_operations kernfs_iops = {
+	.permission	= kernfs_iop_permission,
+	.setattr	= kernfs_iop_setattr,
+	.getattr	= kernfs_iop_getattr,
+	.setxattr	= kernfs_iop_setxattr,
+	.removexattr	= kernfs_iop_removexattr,
+	.getxattr	= kernfs_iop_getxattr,
+	.listxattr	= kernfs_iop_listxattr,
+};
+
+/* Boot-time setup for inode.c: initialize kernfs_bdi, panic if that fails. */
+void __init kernfs_inode_init(void)
+{
+	int err = bdi_init(&kernfs_bdi);
+
+	if (err)
+		panic("failed to init kernfs_bdi");
+}
+
+/*
+ * Return the kernfs_iattrs of @kn, allocating it and filling in the
+ * default attributes on first use.  Returns NULL if the allocation fails.
+ */
+static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn)
+{
+	struct iattr *ia;
+
+	if (!kn->iattr) {
+		kn->iattr = kzalloc(sizeof(struct kernfs_iattrs), GFP_KERNEL);
+		if (!kn->iattr)
+			return NULL;
+
+		/* assign default attributes */
+		ia = &kn->iattr->ia_iattr;
+		ia->ia_mode = kn->mode;
+		ia->ia_uid = GLOBAL_ROOT_UID;
+		ia->ia_gid = GLOBAL_ROOT_GID;
+		ia->ia_atime = ia->ia_mtime = ia->ia_ctime = CURRENT_TIME;
+
+		simple_xattrs_init(&kn->iattr->xattrs);
+	}
+
+	return kn->iattr;
+}
+
+/*
+ * Copy the fields of @iattr selected by ia_valid (uid, gid, atime, mtime,
+ * ctime, mode) into @kn's persistent attributes, allocating them first if
+ * needed.  A mode change is stored in kn->mode as well.
+ *
+ * Returns 0 on success, -ENOMEM if the attributes cannot be allocated.
+ */
+static int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr)
+{
+	const unsigned int valid = iattr->ia_valid;
+	struct kernfs_iattrs *kattrs = kernfs_iattrs(kn);
+	struct iattr *dst;
+
+	if (!kattrs)
+		return -ENOMEM;
+	dst = &kattrs->ia_iattr;
+
+	if (valid & ATTR_UID)
+		dst->ia_uid = iattr->ia_uid;
+	if (valid & ATTR_GID)
+		dst->ia_gid = iattr->ia_gid;
+	if (valid & ATTR_ATIME)
+		dst->ia_atime = iattr->ia_atime;
+	if (valid & ATTR_MTIME)
+		dst->ia_mtime = iattr->ia_mtime;
+	if (valid & ATTR_CTIME)
+		dst->ia_ctime = iattr->ia_ctime;
+	if (valid & ATTR_MODE) {
+		/* the node itself carries the mode, too */
+		kn->mode = iattr->ia_mode;
+		dst->ia_mode = kn->mode;
+	}
+
+	return 0;
+}
+
+/**
+ * kernfs_setattr - set iattr on a node
+ * @kn: target node
+ * @iattr: iattr to set
+ *
+ * Applies @iattr to @kn through __kernfs_setattr() with kernfs_mutex held.
+ *
+ * Returns 0 on success, -errno on failure.
+ */
+int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr)
+{
+	int rc;
+
+	mutex_lock(&kernfs_mutex);
+	rc = __kernfs_setattr(kn, iattr);
+	mutex_unlock(&kernfs_mutex);
+
+	return rc;
+}
+
+/*
+ * ->setattr() for kernfs inodes: validate the change against the inode,
+ * record it in the kernfs_node and finally copy it into the inode, all
+ * under kernfs_mutex.  Returns 0 or -errno; -EINVAL if the dentry has no
+ * kernfs_node attached.
+ */
+int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr)
+{
+	struct kernfs_node *kn = dentry->d_fsdata;
+	struct inode *inode = dentry->d_inode;
+	int rc;
+
+	if (!kn)
+		return -EINVAL;
+
+	mutex_lock(&kernfs_mutex);
+
+	rc = inode_change_ok(inode, iattr);
+	if (!rc)
+		rc = __kernfs_setattr(kn, iattr);
+	if (!rc)
+		setattr_copy(inode, iattr);	/* this ignores size changes */
+
+	mutex_unlock(&kernfs_mutex);
+	return rc;
+}
+
+/*
+ * Exchange the security context stored in @kn with the one passed in.  On
+ * success *@secdata and *@secdata_len hold the previously stored context.
+ * Returns 0, or -ENOMEM if @kn's attributes cannot be allocated, in which
+ * case nothing is exchanged.
+ */
+static int kernfs_node_setsecdata(struct kernfs_node *kn, void **secdata,
+				  u32 *secdata_len)
+{
+	struct kernfs_iattrs *attrs = kernfs_iattrs(kn);
+	void *prev_data;
+	size_t prev_len;
+
+	if (!attrs)
+		return -ENOMEM;
+
+	/* remember what is stored right now ... */
+	prev_data = attrs->ia_secdata;
+	prev_len = attrs->ia_secdata_len;
+
+	/* ... install the caller's context ... */
+	attrs->ia_secdata = *secdata;
+	attrs->ia_secdata_len = *secdata_len;
+
+	/* ... and hand the old one back */
+	*secdata = prev_data;
+	*secdata_len = prev_len;
+
+	return 0;
+}
+
+/*
+ * ->setxattr() for kernfs inodes.  Only names starting with the security
+ * or trusted xattr prefix are accepted; anything else yields -EINVAL.
+ *
+ * security: the value is handed to the LSM, then the inode's resulting
+ * security context is fetched and swapped into @kn's attributes under
+ * kernfs_mutex.  kernfs_node_setsecdata() hands back the previously stored
+ * context, which is released here.
+ *
+ * trusted: the value is stored in the node's simple_xattrs list.
+ *
+ * Returns -ENOMEM if the node's attributes cannot be allocated, otherwise
+ * the result of the security or simple_xattr helper that handled the name.
+ */
+int kernfs_iop_setxattr(struct dentry *dentry, const char *name,
+			const void *value, size_t size, int flags)
+{
+	struct kernfs_node *kn = dentry->d_fsdata;
+	struct kernfs_iattrs *attrs;
+	void *secdata;
+	int error;
+	u32 secdata_len = 0;
+
+	attrs = kernfs_iattrs(kn);
+	if (!attrs)
+		return -ENOMEM;
+
+	if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) {
+		const char *suffix = name + XATTR_SECURITY_PREFIX_LEN;
+		error = security_inode_setsecurity(dentry->d_inode, suffix,
+						value, size, flags);
+		if (error)
+			return error;
+		error = security_inode_getsecctx(dentry->d_inode,
+						&secdata, &secdata_len);
+		if (error)
+			return error;
+
+		mutex_lock(&kernfs_mutex);
+		error = kernfs_node_setsecdata(kn, &secdata, &secdata_len);
+		mutex_unlock(&kernfs_mutex);
+
+		/* on success secdata now refers to the old context */
+		if (secdata)
+			security_release_secctx(secdata, secdata_len);
+		return error;
+	} else if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) {
+		return simple_xattr_set(&attrs->xattrs, name, value, size,
+					flags);
+	}
+
+	return -EINVAL;
+}
+
+/*
+ * ->removexattr(): drop @name from the node's simple_xattrs list.  Returns
+ * -ENOMEM if the node's attributes cannot be allocated.
+ */
+int kernfs_iop_removexattr(struct dentry *dentry, const char *name)
+{
+	struct kernfs_iattrs *attrs = kernfs_iattrs(dentry->d_fsdata);
+
+	if (!attrs)
+		return -ENOMEM;
+
+	return simple_xattr_remove(&attrs->xattrs, name);
+}
+
+/*
+ * ->getxattr(): look @name up in the node's simple_xattrs list.  Returns
+ * -ENOMEM if the node's attributes cannot be allocated.
+ */
+ssize_t kernfs_iop_getxattr(struct dentry *dentry, const char *name, void *buf,
+			    size_t size)
+{
+	struct kernfs_iattrs *attrs = kernfs_iattrs(dentry->d_fsdata);
+
+	if (!attrs)
+		return -ENOMEM;
+
+	return simple_xattr_get(&attrs->xattrs, name, buf, size);
+}
+
+/*
+ * ->listxattr(): list the names held in the node's simple_xattrs list.
+ * Returns -ENOMEM if the node's attributes cannot be allocated.
+ */
+ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size)
+{
+	struct kernfs_iattrs *attrs = kernfs_iattrs(dentry->d_fsdata);
+
+	if (!attrs)
+		return -ENOMEM;
+
+	return simple_xattr_list(&attrs->xattrs, buf, size);
+}
+
+/* Give @inode the mode @mode and stamp all three timestamps with "now". */
+static inline void set_default_inode_attr(struct inode *inode, umode_t mode)
+{
+	inode->i_mode = mode;
+
+	/* sample the clock once so that all timestamps are identical */
+	inode->i_ctime = CURRENT_TIME;
+	inode->i_mtime = inode->i_ctime;
+	inode->i_atime = inode->i_ctime;
+}
+
+/* Copy ownership and the three timestamps from @iattr into @inode. */
+static inline void set_inode_attr(struct inode *inode, struct iattr *iattr)
+{
+	/* ownership */
+	inode->i_uid = iattr->ia_uid;
+	inode->i_gid = iattr->ia_gid;
+
+	/* timestamps */
+	inode->i_atime = iattr->ia_atime;
+	inode->i_mtime = iattr->ia_mtime;
+	inode->i_ctime = iattr->ia_ctime;
+}
+
+/*
+ * Copy @kn's current attributes into @inode.  The mode always comes from
+ * kn->mode; ownership, timestamps and the security context are copied only
+ * if @kn has a kernfs_iattrs.  For directories the link count is set to
+ * the number of subdirectories plus two.
+ */
+static void kernfs_refresh_inode(struct kernfs_node *kn, struct inode *inode)
+{
+	struct kernfs_iattrs *attrs = kn->iattr;
+
+	inode->i_mode = kn->mode;
+	if (attrs) {
+		/*
+		 * kernfs_node has non-default attributes; get them from
+		 * the persistent copy in kernfs_node.
+		 */
+		set_inode_attr(inode, &attrs->ia_iattr);
+		security_inode_notifysecctx(inode, attrs->ia_secdata,
+					    attrs->ia_secdata_len);
+	}
+
+	if (kernfs_type(kn) == KERNFS_DIR)
+		set_nlink(inode, kn->dir.subdirs + 2);
+}
+
+/*
+ * ->getattr(): refresh the inode from its kernfs_node under kernfs_mutex,
+ * then let generic_fillattr() fill in @stat.  Always returns 0.
+ */
+int kernfs_iop_getattr(struct vfsmount *mnt, struct dentry *dentry,
+		       struct kstat *stat)
+{
+	struct inode *inode = dentry->d_inode;
+	struct kernfs_node *node = dentry->d_fsdata;
+
+	mutex_lock(&kernfs_mutex);
+	kernfs_refresh_inode(node, inode);
+	mutex_unlock(&kernfs_mutex);
+
+	generic_fillattr(inode, stat);
+
+	return 0;
+}
+
+/*
+ * Initialize the new @inode for @kn: take a node reference for i_private,
+ * install the default operations and attributes, then specialize by node
+ * type and unlock the inode.  The reference is dropped again in
+ * kernfs_evict_inode().
+ */
+static void kernfs_init_inode(struct kernfs_node *kn, struct inode *inode)
+{
+	kernfs_get(kn);
+	inode->i_private = kn;
+	inode->i_mapping->a_ops = &kernfs_aops;
+	inode->i_mapping->backing_dev_info = &kernfs_bdi;
+	inode->i_op = &kernfs_iops;	/* default, overridden for dirs and links */
+
+	set_default_inode_attr(inode, kn->mode);
+	kernfs_refresh_inode(kn, inode);
+
+	/* initialize inode according to type */
+	switch (kernfs_type(kn)) {
+	case KERNFS_DIR:
+		inode->i_op = &kernfs_dir_iops;
+		inode->i_fop = &kernfs_dir_fops;
+		break;
+	case KERNFS_FILE:
+		inode->i_size = kn->attr.size;
+		inode->i_fop = &kernfs_file_fops;
+		break;
+	case KERNFS_LINK:
+		inode->i_op = &kernfs_symlink_iops;
+		break;
+	default:
+		BUG();
+	}
+
+	unlock_new_inode(inode);
+}
+
+/**
+ *	kernfs_get_inode - get inode for kernfs_node
+ *	@sb: super block
+ *	@kn: kernfs_node to allocate inode for
+ *
+ *	Get inode for @kn.  If such inode doesn't exist, a new inode is
+ *	allocated and set up by kernfs_init_inode(), which also unlocks
+ *	it, so a newly created inode is returned unlocked.
+ *
+ *	LOCKING:
+ *	Kernel thread context (may sleep).
+ *
+ *	RETURNS:
+ *	Pointer to allocated inode on success, NULL on failure.
+ */
+struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn)
+{
+	struct inode *inode;
+
+	inode = iget_locked(sb, kn->ino);
+	if (inode && (inode->i_state & I_NEW))
+		kernfs_init_inode(kn, inode);
+
+	return inode;
+}
+
+/*
+ * The kernfs_node serves as both an inode and a directory entry for
+ * kernfs.  To prevent the kernfs inode numbers from being freed
+ * prematurely we take a reference to kernfs_node from the kernfs inode.  A
+ * super_operations.evict_inode() implementation is needed to drop that
+ * reference upon inode destruction.
+ */
+void kernfs_evict_inode(struct inode *inode)
+{
+	struct kernfs_node *kn = inode->i_private;
+
+	truncate_inode_pages(&inode->i_data, 0);
+	clear_inode(inode);
+	kernfs_put(kn);		/* pairs with kernfs_get() in kernfs_init_inode() */
+}
+
+/*
+ * ->permission(): refresh the inode from its kernfs_node under
+ * kernfs_mutex and defer to generic_permission().  Returns -ECHILD when
+ * called with MAY_NOT_BLOCK, as the refresh has to take a mutex.
+ */
+int kernfs_iop_permission(struct inode *inode, int mask)
+{
+	struct kernfs_node *node;
+
+	if (mask & MAY_NOT_BLOCK)
+		return -ECHILD;
+
+	node = inode->i_private;
+
+	mutex_lock(&kernfs_mutex);
+	kernfs_refresh_inode(node, inode);
+	mutex_unlock(&kernfs_mutex);
+
+	return generic_permission(inode, mask);
+}
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
new file mode 100644
index 000000000000..a4ff491fd59c
--- /dev/null
+++ b/fs/kernfs/kernfs-internal.h
@@ -0,0 +1,122 @@
+/*
+ * fs/kernfs/kernfs-internal.h - kernfs internal header file
+ *
+ * Copyright (c) 2001-3 Patrick Mochel
+ * Copyright (c) 2007 SUSE Linux Products GmbH
+ * Copyright (c) 2007, 2013 Tejun Heo <teheo@suse.de>
+ *
+ * This file is released under the GPLv2.
+ */
+
+#ifndef __KERNFS_INTERNAL_H
+#define __KERNFS_INTERNAL_H
+
+#include <linux/lockdep.h>
+#include <linux/fs.h>
+#include <linux/mutex.h>
+#include <linux/xattr.h>
+
+#include <linux/kernfs.h>
+
+/*
+ * Optional per-node attribute storage, hung off kernfs_node->iattr.  It is
+ * allocated on first use by kernfs_iattrs() in inode.c.
+ */
+struct kernfs_iattrs {
+	struct iattr		ia_iattr;	/* mode, owner and timestamps */
+	void			*ia_secdata;	/* security context blob */
+	u32			ia_secdata_len;	/* length of ia_secdata */
+
+	struct simple_xattrs	xattrs;		/* used by the xattr inode ops */
+};
+
+#define KN_DEACTIVATED_BIAS		INT_MIN
+
+/* KERNFS_TYPE_MASK and types are defined in include/linux/kernfs.h */
+
+/**
+ * kernfs_root - find out the kernfs_root a kernfs_node belongs to
+ * @kn: kernfs_node of interest
+ *
+ * Return the kernfs_root @kn belongs to.
+ */
+static inline struct kernfs_root *kernfs_root(struct kernfs_node *kn)
+{
+	/* if parent exists, it's always a dir; otherwise, @kn is a dir */
+	if (kn->parent)
+		kn = kn->parent;
+	return kn->dir.root;
+}
+
+/*
+ * Context structure to be used while adding/removing nodes.
+ */
+struct kernfs_addrm_cxt {
+	struct kernfs_node	*removed;
+};
+
+/*
+ * mount.c
+ */
+struct kernfs_super_info {
+	/*
+	 * The root associated with this super_block.  Each super_block is
+	 * identified by the root and ns it's associated with.
+	 */
+	struct kernfs_root	*root;
+
+	/*
+	 * Each sb is associated with one namespace tag, currently the
+	 * network namespace of the task which mounted this kernfs
+	 * instance.  If multiple tags become necessary, make the following
+	 * an array and compare kernfs_node tag against every entry.
+	 */
+	const void		*ns;
+};
+/* @SB is parenthesized so that any pointer expression can be passed */
+#define kernfs_info(SB) ((struct kernfs_super_info *)((SB)->s_fs_info))
+
+extern struct kmem_cache *kernfs_node_cache;
+
+/*
+ * inode.c
+ */
+struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn);
+void kernfs_evict_inode(struct inode *inode);
+int kernfs_iop_permission(struct inode *inode, int mask);
+int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr);
+int kernfs_iop_getattr(struct vfsmount *mnt, struct dentry *dentry,
+		       struct kstat *stat);
+int kernfs_iop_setxattr(struct dentry *dentry, const char *name, const void *value,
+			size_t size, int flags);
+int kernfs_iop_removexattr(struct dentry *dentry, const char *name);
+ssize_t kernfs_iop_getxattr(struct dentry *dentry, const char *name, void *buf,
+			    size_t size);
+ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size);
+void kernfs_inode_init(void);
+
+/*
+ * dir.c
+ */
+extern struct mutex kernfs_mutex;
+extern const struct dentry_operations kernfs_dops;
+extern const struct file_operations kernfs_dir_fops;
+extern const struct inode_operations kernfs_dir_iops;
+
+struct kernfs_node *kernfs_get_active(struct kernfs_node *kn);
+void kernfs_put_active(struct kernfs_node *kn);
+void kernfs_addrm_start(struct kernfs_addrm_cxt *acxt);
+int kernfs_add_one(struct kernfs_addrm_cxt *acxt, struct kernfs_node *kn,
+		   struct kernfs_node *parent);
+void kernfs_addrm_finish(struct kernfs_addrm_cxt *acxt);
+struct kernfs_node *kernfs_new_node(struct kernfs_root *root, const char *name,
+				    umode_t mode, int type);
+
+/*
+ * file.c
+ */
+extern const struct file_operations kernfs_file_fops;
+
+void kernfs_unmap_bin_file(struct kernfs_node *kn);
+
+/*
+ * symlink.c
+ */
+extern const struct inode_operations kernfs_symlink_iops;
+
+#endif	/* __KERNFS_INTERNAL_H */
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
new file mode 100644
index 000000000000..0d6ce895a9ee
--- /dev/null
+++ b/fs/kernfs/mount.c
@@ -0,0 +1,165 @@
+/*
+ * fs/kernfs/mount.c - kernfs mount implementation
+ *
+ * Copyright (c) 2001-3 Patrick Mochel
+ * Copyright (c) 2007 SUSE Linux Products GmbH
+ * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org>
+ *
+ * This file is released under the GPLv2.
+ */
+
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/init.h>
+#include <linux/magic.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+
+#include "kernfs-internal.h"
+
+struct kmem_cache *kernfs_node_cache;
+
+/*
+ * Super block operations installed by kernfs_fill_super().  ->evict_inode
+ * drops the kernfs_node reference held by each inode.
+ */
+static const struct super_operations kernfs_sops = {
+	.statfs		= simple_statfs,
+	.drop_inode	= generic_delete_inode,
+	.evict_inode	= kernfs_evict_inode,
+};
+
+/*
+ * Populate the new super_block @sb: set the basic parameters, get the
+ * root inode and instantiate the root dentry.  A reference on the root
+ * kernfs_node is taken and stored in the root dentry's d_fsdata;
+ * kernfs_kill_sb() drops it.
+ *
+ * Returns 0 on success, -ENOMEM if the root inode or dentry can't be set
+ * up, in which case sb->s_root is left unset.
+ *
+ * NOTE(review): s_magic is hard-coded to SYSFS_MAGIC although this helper
+ * is reached through the generic kernfs_mount_ns() - confirm that this is
+ * intended for kernfs users other than sysfs.
+ */
+static int kernfs_fill_super(struct super_block *sb)
+{
+	struct kernfs_super_info *info = kernfs_info(sb);
+	struct inode *inode;
+	struct dentry *root;
+
+	sb->s_blocksize = PAGE_CACHE_SIZE;
+	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+	sb->s_magic = SYSFS_MAGIC;
+	sb->s_op = &kernfs_sops;
+	sb->s_time_gran = 1;
+
+	/* get root inode, initialize and unlock it */
+	mutex_lock(&kernfs_mutex);
+	inode = kernfs_get_inode(sb, info->root->kn);
+	mutex_unlock(&kernfs_mutex);
+	if (!inode) {
+		pr_debug("kernfs: could not get root inode\n");
+		return -ENOMEM;
+	}
+
+	/* instantiate and link root dentry */
+	root = d_make_root(inode);
+	if (!root) {
+		pr_debug("%s: could not get root dentry!\n", __func__);
+		return -ENOMEM;
+	}
+	kernfs_get(info->root->kn);
+	root->d_fsdata = info->root->kn;
+	sb->s_root = root;
+	sb->s_d_op = &kernfs_dops;
+	return 0;
+}
+
+/*
+ * sget() test callback: @sb matches when it was mounted for the same root
+ * and the same namespace tag as the kernfs_super_info passed in @data.
+ */
+static int kernfs_test_super(struct super_block *sb, void *data)
+{
+	const struct kernfs_super_info *have = kernfs_info(sb);
+	const struct kernfs_super_info *want = data;
+
+	if (have->root != want->root)
+		return 0;
+
+	return have->ns == want->ns;
+}
+
+/*
+ * sget() set callback: set @sb up as an anonymous super_block and, if that
+ * worked, attach the kernfs_super_info passed in @data as s_fs_info.
+ */
+static int kernfs_set_super(struct super_block *sb, void *data)
+{
+	int rc = set_anon_super(sb, data);
+
+	if (rc)
+		return rc;
+
+	sb->s_fs_info = data;
+	return 0;
+}
+
+/**
+ * kernfs_super_ns - determine the namespace tag of a kernfs super_block
+ * @sb: super_block of interest
+ *
+ * Return the namespace tag recorded in the kernfs_super_info of @sb.
+ */
+const void *kernfs_super_ns(struct super_block *sb)
+{
+	return kernfs_info(sb)->ns;
+}
+
+/**
+ * kernfs_mount_ns - kernfs mount helper
+ * @fs_type: file_system_type of the fs being mounted
+ * @flags: mount flags specified for the mount
+ * @root: kernfs_root of the hierarchy being mounted
+ * @ns: optional namespace tag of the mount
+ *
+ * This is to be called from each kernfs user's file_system_type->mount()
+ * implementation, which should pass through the specified @fs_type and
+ * @flags, and specify the hierarchy and namespace tag to mount via @root
+ * and @ns, respectively.
+ *
+ * A kernfs_super_info carrying @root and @ns is allocated as the sget()
+ * key.  It is freed here if sget() fails or hands back a super_block with
+ * a different s_fs_info; otherwise it stays attached as sb->s_fs_info.  A
+ * super_block without a root dentry is populated by kernfs_fill_super().
+ *
+ * Returns the root dentry with an extra reference on success, ERR_PTR()
+ * on failure.  The return value can be passed to the vfs layer verbatim.
+ */
+struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags,
+			       struct kernfs_root *root, const void *ns)
+{
+	struct super_block *sb;
+	struct kernfs_super_info *info;
+	int error;
+
+	info = kzalloc(sizeof(*info), GFP_KERNEL);
+	if (!info)
+		return ERR_PTR(-ENOMEM);
+
+	info->root = root;
+	info->ns = ns;
+
+	sb = sget(fs_type, kernfs_test_super, kernfs_set_super, flags, info);
+	if (IS_ERR(sb) || sb->s_fs_info != info)
+		kfree(info);	/* our key was not attached to the sb */
+	if (IS_ERR(sb))
+		return ERR_CAST(sb);
+	if (!sb->s_root) {
+		error = kernfs_fill_super(sb);
+		if (error) {
+			deactivate_locked_super(sb);
+			return ERR_PTR(error);
+		}
+		sb->s_flags |= MS_ACTIVE;
+	}
+
+	return dget(sb->s_root);
+}
+
+/**
+ * kernfs_kill_sb - kill_sb for kernfs
+ * @sb: super_block being killed
+ *
+ * This can be used directly for file_system_type->kill_sb().  If a kernfs
+ * user needs extra cleanup, it can implement its own kill_sb() and call
+ * this function at the end.
+ *
+ * @sb may not have a root dentry: kernfs_mount_ns() calls
+ * deactivate_locked_super(), which invokes ->kill_sb(), when
+ * kernfs_fill_super() fails.  At that point sb->s_root is still %NULL and
+ * no reference on the root kernfs_node has been taken, so there is
+ * nothing to put.
+ */
+void kernfs_kill_sb(struct super_block *sb)
+{
+	struct kernfs_super_info *info = kernfs_info(sb);
+	struct kernfs_node *root_kn = NULL;
+
+	/* the ref taken by kernfs_fill_super() is held by the root dentry */
+	if (sb->s_root)
+		root_kn = sb->s_root->d_fsdata;
+
+	/*
+	 * Remove the superblock from fs_supers/s_instances
+	 * so we can't find it, before freeing kernfs_super_info.
+	 */
+	kill_anon_super(sb);
+	kfree(info);
+	if (root_kn)
+		kernfs_put(root_kn);
+}
+
+/*
+ * Boot-time kernfs setup: create the slab cache for kernfs_node objects
+ * (SLAB_PANIC, so no error return is checked) and initialize the inode
+ * code.
+ */
+void __init kernfs_init(void)
+{
+	kernfs_node_cache = kmem_cache_create("kernfs_node_cache",
+					      sizeof(struct kernfs_node),
+					      0, SLAB_PANIC, NULL);
+	kernfs_inode_init();
+}
diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c
new file mode 100644
index 000000000000..a03e26036ef9
--- /dev/null
+++ b/fs/kernfs/symlink.c
@@ -0,0 +1,152 @@
+/*
+ * fs/kernfs/symlink.c - kernfs symlink implementation
+ *
+ * Copyright (c) 2001-3 Patrick Mochel
+ * Copyright (c) 2007 SUSE Linux Products GmbH
+ * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org>
+ *
+ * This file is released under the GPLv2.
+ */
+
+#include <linux/fs.h>
+#include <linux/gfp.h>
+#include <linux/namei.h>
+
+#include "kernfs-internal.h"
+
+/**
+ * kernfs_create_link - create a symlink
+ * @parent: directory to create the symlink in
+ * @name: name of the symlink
+ * @target: target node for the symlink to point to
+ *
+ * The new symlink node holds a reference on @target and, if @parent has
+ * namespaces enabled, inherits the namespace tag of @target.
+ *
+ * Returns the created node on success, ERR_PTR() value on error.
+ */
+struct kernfs_node *kernfs_create_link(struct kernfs_node *parent,
+				       const char *name,
+				       struct kernfs_node *target)
+{
+	struct kernfs_addrm_cxt acxt;
+	struct kernfs_node *link;
+	int rc;
+
+	link = kernfs_new_node(kernfs_root(parent), name, S_IFLNK|S_IRWXUGO,
+			       KERNFS_LINK);
+	if (!link)
+		return ERR_PTR(-ENOMEM);
+
+	/* ref owned by symlink */
+	kernfs_get(target);
+	link->symlink.target_kn = target;
+	if (kernfs_ns_enabled(parent))
+		link->ns = target->ns;
+
+	kernfs_addrm_start(&acxt);
+	rc = kernfs_add_one(&acxt, link, parent);
+	kernfs_addrm_finish(&acxt);
+
+	if (rc) {
+		kernfs_put(link);
+		return ERR_PTR(rc);
+	}
+
+	return link;
+}
+
+/*
+ * Build the relative path from directory @parent to @target in @path.
+ *
+ * The result is a run of "../" components climbing from @parent up to the
+ * closest common ancestor, followed by the '/'-separated names leading
+ * from that ancestor down to @target.  The name part is written without a
+ * terminating NUL, so @path must come in zero-filled;
+ * kernfs_iop_follow_link() passes a zeroed page (presumably at least
+ * PATH_MAX bytes - confirm for the supported page sizes).
+ *
+ * Returns 0 on success, -EINVAL if the downward part is empty,
+ * -ENAMETOOLONG if the path plus its terminating NUL does not fit in
+ * PATH_MAX bytes.
+ */
+static int kernfs_get_target_path(struct kernfs_node *parent,
+				  struct kernfs_node *target, char *path)
+{
+	struct kernfs_node *base, *kn;
+	char *s = path;
+	int len = 0;
+
+	/* go up to the root, stop at the base */
+	base = parent;
+	while (base->parent) {
+		kn = target->parent;
+		while (kn->parent && base != kn)
+			kn = kn->parent;
+
+		if (base == kn)
+			break;
+
+		/* bound the "../" prefix before writing it, not afterwards */
+		if ((s - path) + 3 >= PATH_MAX)
+			return -ENAMETOOLONG;
+
+		strcpy(s, "../");
+		s += 3;
+		base = base->parent;
+	}
+
+	/* determine end of target string for reverse fillup */
+	kn = target;
+	while (kn->parent && kn != base) {
+		len += strlen(kn->name) + 1;
+		kn = kn->parent;
+	}
+
+	/* check limits */
+	if (len < 2)
+		return -EINVAL;
+	len--;
+	/* ">=" keeps one byte of the zeroed buffer as the terminating NUL */
+	if ((s - path) + len >= PATH_MAX)
+		return -ENAMETOOLONG;
+
+	/* reverse fillup of target string from target to base */
+	kn = target;
+	while (kn->parent && kn != base) {
+		int slen = strlen(kn->name);
+
+		len -= slen;
+		/* exact-length copy; the NUL comes from the zeroed buffer */
+		memcpy(s + len, kn->name, slen);
+		if (len)
+			s[--len] = '/';
+
+		kn = kn->parent;
+	}
+
+	return 0;
+}
+
+/*
+ * Write the relative path from the symlink behind @dentry to its target
+ * into @path.  The path is computed with kernfs_mutex held.  Returns the
+ * result of kernfs_get_target_path().
+ */
+static int kernfs_getlink(struct dentry *dentry, char *path)
+{
+	struct kernfs_node *link = dentry->d_fsdata;
+	struct kernfs_node *dir = link->parent;
+	struct kernfs_node *dest = link->symlink.target_kn;
+	int rc;
+
+	mutex_lock(&kernfs_mutex);
+	rc = kernfs_get_target_path(dir, dest, path);
+	mutex_unlock(&kernfs_mutex);
+
+	return rc;
+}
+
+/*
+ * ->follow_link(): build the link target in a freshly zeroed page and
+ * hand it to the nameidata.  On failure the page is freed and an ERR_PTR()
+ * is stored instead.  The page is released in kernfs_iop_put_link().
+ */
+static void *kernfs_iop_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	unsigned long page = get_zeroed_page(GFP_KERNEL);
+	int error;
+
+	if (!page) {
+		nd_set_link(nd, ERR_PTR(-ENOMEM));
+		return NULL;
+	}
+
+	error = kernfs_getlink(dentry, (char *) page);
+	if (error < 0)
+		free_page(page);
+
+	nd_set_link(nd, error ? ERR_PTR(error) : (char *)page);
+	return NULL;
+}
+
+/*
+ * ->put_link(): free the page holding the link target, unless the
+ * nameidata carries an ERR_PTR() instead of a page.
+ */
+static void kernfs_iop_put_link(struct dentry *dentry, struct nameidata *nd,
+				void *cookie)
+{
+	char *link = nd_get_link(nd);
+
+	if (IS_ERR(link))
+		return;
+
+	free_page((unsigned long)link);
+}
+
+/*
+ * Inode operations for symlinks, installed by kernfs_init_inode() on
+ * KERNFS_LINK inodes.  The attribute and xattr handlers are the common
+ * kernfs ones from inode.c; only the link handling is specific.
+ */
+const struct inode_operations kernfs_symlink_iops = {
+	.setxattr	= kernfs_iop_setxattr,
+	.removexattr	= kernfs_iop_removexattr,
+	.getxattr	= kernfs_iop_getxattr,
+	.listxattr	= kernfs_iop_listxattr,
+	.readlink	= generic_readlink,
+	.follow_link	= kernfs_iop_follow_link,
+	.put_link	= kernfs_iop_put_link,
+	.setattr	= kernfs_iop_setattr,
+	.getattr	= kernfs_iop_getattr,
+	.permission	= kernfs_iop_permission,
+};
diff --git a/fs/namespace.c b/fs/namespace.c
index ac2ce8a766e1..a511ea003f89 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2790,6 +2790,8 @@ void __init mnt_init(void)
 	for (u = 0; u < HASH_SIZE; u++)
 		INIT_LIST_HEAD(&mountpoint_hashtable[u]);
 
+	kernfs_init();
+
 	err = sysfs_init();
 	if (err)
 		printk(KERN_WARNING "%s: sysfs_init error: %d\n",
diff --git a/fs/sysfs/Makefile b/fs/sysfs/Makefile
index 8876ac183373..6eff6e1205a5 100644
--- a/fs/sysfs/Makefile
+++ b/fs/sysfs/Makefile
@@ -2,4 +2,4 @@
 # Makefile for the sysfs virtual filesystem
 #
 
-obj-y		:= inode.o file.o dir.o symlink.o mount.o group.o
+obj-y		:= file.o dir.o symlink.o mount.o group.o
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 5e73d6626e50..aa007401bfc9 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -13,465 +13,31 @@
 #undef DEBUG
 
 #include <linux/fs.h>
-#include <linux/mount.h>
-#include <linux/module.h>
 #include <linux/kobject.h>
-#include <linux/namei.h>
-#include <linux/idr.h>
-#include <linux/completion.h>
-#include <linux/mutex.h>
 #include <linux/slab.h>
-#include <linux/security.h>
-#include <linux/hash.h>
 #include "sysfs.h"
 
-DEFINE_MUTEX(sysfs_mutex);
 DEFINE_SPINLOCK(sysfs_symlink_target_lock);
 
-#define to_sysfs_dirent(X) rb_entry((X), struct sysfs_dirent, s_rb)
-
-static DEFINE_SPINLOCK(sysfs_ino_lock);
-static DEFINE_IDA(sysfs_ino_ida);
-
-/**
- *	sysfs_name_hash
- *	@name: Null terminated string to hash
- *	@ns:   Namespace tag to hash
- *
- *	Returns 31 bit hash of ns + name (so it fits in an off_t )
- */
-static unsigned int sysfs_name_hash(const char *name, const void *ns)
-{
-	unsigned long hash = init_name_hash();
-	unsigned int len = strlen(name);
-	while (len--)
-		hash = partial_name_hash(*name++, hash);
-	hash = (end_name_hash(hash) ^ hash_ptr((void *)ns, 31));
-	hash &= 0x7fffffffU;
-	/* Reserve hash numbers 0, 1 and INT_MAX for magic directory entries */
-	if (hash < 1)
-		hash += 2;
-	if (hash >= INT_MAX)
-		hash = INT_MAX - 1;
-	return hash;
-}
-
-static int sysfs_name_compare(unsigned int hash, const char *name,
-			      const void *ns, const struct sysfs_dirent *sd)
-{
-	if (hash != sd->s_hash)
-		return hash - sd->s_hash;
-	if (ns != sd->s_ns)
-		return ns - sd->s_ns;
-	return strcmp(name, sd->s_name);
-}
-
-static int sysfs_sd_compare(const struct sysfs_dirent *left,
-			    const struct sysfs_dirent *right)
-{
-	return sysfs_name_compare(left->s_hash, left->s_name, left->s_ns,
-				  right);
-}
-
-/**
- *	sysfs_link_sibling - link sysfs_dirent into sibling rbtree
- *	@sd: sysfs_dirent of interest
- *
- *	Link @sd into its sibling rbtree which starts from
- *	sd->s_parent->s_dir.children.
- *
- *	Locking:
- *	mutex_lock(sysfs_mutex)
- *
- *	RETURNS:
- *	0 on susccess -EEXIST on failure.
- */
-static int sysfs_link_sibling(struct sysfs_dirent *sd)
-{
-	struct rb_node **node = &sd->s_parent->s_dir.children.rb_node;
-	struct rb_node *parent = NULL;
-
-	if (sysfs_type(sd) == SYSFS_DIR)
-		sd->s_parent->s_dir.subdirs++;
-
-	while (*node) {
-		struct sysfs_dirent *pos;
-		int result;
-
-		pos = to_sysfs_dirent(*node);
-		parent = *node;
-		result = sysfs_sd_compare(sd, pos);
-		if (result < 0)
-			node = &pos->s_rb.rb_left;
-		else if (result > 0)
-			node = &pos->s_rb.rb_right;
-		else
-			return -EEXIST;
-	}
-	/* add new node and rebalance the tree */
-	rb_link_node(&sd->s_rb, parent, node);
-	rb_insert_color(&sd->s_rb, &sd->s_parent->s_dir.children);
-	return 0;
-}
-
-/**
- *	sysfs_unlink_sibling - unlink sysfs_dirent from sibling rbtree
- *	@sd: sysfs_dirent of interest
- *
- *	Unlink @sd from its sibling rbtree which starts from
- *	sd->s_parent->s_dir.children.
- *
- *	Locking:
- *	mutex_lock(sysfs_mutex)
- */
-static void sysfs_unlink_sibling(struct sysfs_dirent *sd)
-{
-	if (sysfs_type(sd) == SYSFS_DIR)
-		sd->s_parent->s_dir.subdirs--;
-
-	rb_erase(&sd->s_rb, &sd->s_parent->s_dir.children);
-}
-
-/**
- *	sysfs_get_active - get an active reference to sysfs_dirent
- *	@sd: sysfs_dirent to get an active reference to
- *
- *	Get an active reference of @sd.  This function is noop if @sd
- *	is NULL.
- *
- *	RETURNS:
- *	Pointer to @sd on success, NULL on failure.
- */
-struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd)
-{
-	if (unlikely(!sd))
-		return NULL;
-
-	if (!atomic_inc_unless_negative(&sd->s_active))
-		return NULL;
-
-	if (likely(!sysfs_ignore_lockdep(sd)))
-		rwsem_acquire_read(&sd->dep_map, 0, 1, _RET_IP_);
-	return sd;
-}
-
-/**
- *	sysfs_put_active - put an active reference to sysfs_dirent
- *	@sd: sysfs_dirent to put an active reference to
- *
- *	Put an active reference to @sd.  This function is noop if @sd
- *	is NULL.
- */
-void sysfs_put_active(struct sysfs_dirent *sd)
-{
-	int v;
-
-	if (unlikely(!sd))
-		return;
-
-	if (likely(!sysfs_ignore_lockdep(sd)))
-		rwsem_release(&sd->dep_map, 1, _RET_IP_);
-	v = atomic_dec_return(&sd->s_active);
-	if (likely(v != SD_DEACTIVATED_BIAS))
-		return;
-
-	/* atomic_dec_return() is a mb(), we'll always see the updated
-	 * sd->u.completion.
-	 */
-	complete(sd->u.completion);
-}
-
-/**
- *	sysfs_deactivate - deactivate sysfs_dirent
- *	@sd: sysfs_dirent to deactivate
- *
- *	Deny new active references and drain existing ones.
- */
-static void sysfs_deactivate(struct sysfs_dirent *sd)
-{
-	DECLARE_COMPLETION_ONSTACK(wait);
-	int v;
-
-	BUG_ON(!(sd->s_flags & SYSFS_FLAG_REMOVED));
-
-	if (!(sysfs_type(sd) & SYSFS_ACTIVE_REF))
-		return;
-
-	sd->u.completion = (void *)&wait;
-
-	rwsem_acquire(&sd->dep_map, 0, 0, _RET_IP_);
-	/* atomic_add_return() is a mb(), put_active() will always see
-	 * the updated sd->u.completion.
-	 */
-	v = atomic_add_return(SD_DEACTIVATED_BIAS, &sd->s_active);
-
-	if (v != SD_DEACTIVATED_BIAS) {
-		lock_contended(&sd->dep_map, _RET_IP_);
-		wait_for_completion(&wait);
-	}
-
-	lock_acquired(&sd->dep_map, _RET_IP_);
-	rwsem_release(&sd->dep_map, 1, _RET_IP_);
-}
-
-static int sysfs_alloc_ino(unsigned int *pino)
-{
-	int ino, rc;
-
- retry:
-	spin_lock(&sysfs_ino_lock);
-	rc = ida_get_new_above(&sysfs_ino_ida, 2, &ino);
-	spin_unlock(&sysfs_ino_lock);
-
-	if (rc == -EAGAIN) {
-		if (ida_pre_get(&sysfs_ino_ida, GFP_KERNEL))
-			goto retry;
-		rc = -ENOMEM;
-	}
-
-	*pino = ino;
-	return rc;
-}
-
-static void sysfs_free_ino(unsigned int ino)
-{
-	spin_lock(&sysfs_ino_lock);
-	ida_remove(&sysfs_ino_ida, ino);
-	spin_unlock(&sysfs_ino_lock);
-}
-
-void release_sysfs_dirent(struct sysfs_dirent *sd)
-{
-	struct sysfs_dirent *parent_sd;
-
- repeat:
-	/* Moving/renaming is always done while holding reference.
-	 * sd->s_parent won't change beneath us.
-	 */
-	parent_sd = sd->s_parent;
-
-	WARN(!(sd->s_flags & SYSFS_FLAG_REMOVED),
-		"sysfs: free using entry: %s/%s\n",
-		parent_sd ? parent_sd->s_name : "", sd->s_name);
-
-	if (sysfs_type(sd) == SYSFS_KOBJ_LINK)
-		sysfs_put(sd->s_symlink.target_sd);
-	if (sysfs_type(sd) & SYSFS_COPY_NAME)
-		kfree(sd->s_name);
-	if (sd->s_iattr && sd->s_iattr->ia_secdata)
-		security_release_secctx(sd->s_iattr->ia_secdata,
-					sd->s_iattr->ia_secdata_len);
-	kfree(sd->s_iattr);
-	sysfs_free_ino(sd->s_ino);
-	kmem_cache_free(sysfs_dir_cachep, sd);
-
-	sd = parent_sd;
-	if (sd && atomic_dec_and_test(&sd->s_count))
-		goto repeat;
-}
-
-static int sysfs_dentry_delete(const struct dentry *dentry)
-{
-	struct sysfs_dirent *sd = dentry->d_fsdata;
-	return !(sd && !(sd->s_flags & SYSFS_FLAG_REMOVED));
-}
-
-static int sysfs_dentry_revalidate(struct dentry *dentry, unsigned int flags)
-{
-	struct sysfs_dirent *sd;
-	int type;
-
-	if (flags & LOOKUP_RCU)
-		return -ECHILD;
-
-	sd = dentry->d_fsdata;
-	mutex_lock(&sysfs_mutex);
-
-	/* The sysfs dirent has been deleted */
-	if (sd->s_flags & SYSFS_FLAG_REMOVED)
-		goto out_bad;
-
-	/* The sysfs dirent has been moved? */
-	if (dentry->d_parent->d_fsdata != sd->s_parent)
-		goto out_bad;
-
-	/* The sysfs dirent has been renamed */
-	if (strcmp(dentry->d_name.name, sd->s_name) != 0)
-		goto out_bad;
-
-	/* The sysfs dirent has been moved to a different namespace */
-	type = KOBJ_NS_TYPE_NONE;
-	if (sd->s_parent) {
-		type = sysfs_ns_type(sd->s_parent);
-		if (type != KOBJ_NS_TYPE_NONE &&
-				sysfs_info(dentry->d_sb)->ns[type] != sd->s_ns)
-			goto out_bad;
-	}
-
-	mutex_unlock(&sysfs_mutex);
-out_valid:
-	return 1;
-out_bad:
-	/* Remove the dentry from the dcache hashes.
-	 * If this is a deleted dentry we use d_drop instead of d_delete
-	 * so sysfs doesn't need to cope with negative dentries.
-	 *
-	 * If this is a dentry that has simply been renamed we
-	 * use d_drop to remove it from the dcache lookup on its
-	 * old parent.  If this dentry persists later when a lookup
-	 * is performed at its new name the dentry will be readded
-	 * to the dcache hashes.
-	 */
-	mutex_unlock(&sysfs_mutex);
-
-	/* If we have submounts we must allow the vfs caches
-	 * to lie about the state of the filesystem to prevent
-	 * leaks and other nasty things.
-	 */
-	if (check_submounts_and_drop(dentry) != 0)
-		goto out_valid;
-
-	return 0;
-}
-
-static void sysfs_dentry_release(struct dentry *dentry)
-{
-	sysfs_put(dentry->d_fsdata);
-}
-
-const struct dentry_operations sysfs_dentry_ops = {
-	.d_revalidate	= sysfs_dentry_revalidate,
-	.d_delete	= sysfs_dentry_delete,
-	.d_release	= sysfs_dentry_release,
-};
-
-struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type)
-{
-	char *dup_name = NULL;
-	struct sysfs_dirent *sd;
-
-	if (type & SYSFS_COPY_NAME) {
-		name = dup_name = kstrdup(name, GFP_KERNEL);
-		if (!name)
-			return NULL;
-	}
-
-	sd = kmem_cache_zalloc(sysfs_dir_cachep, GFP_KERNEL);
-	if (!sd)
-		goto err_out1;
-
-	if (sysfs_alloc_ino(&sd->s_ino))
-		goto err_out2;
-
-	atomic_set(&sd->s_count, 1);
-	atomic_set(&sd->s_active, 0);
-
-	sd->s_name = name;
-	sd->s_mode = mode;
-	sd->s_flags = type | SYSFS_FLAG_REMOVED;
-
-	return sd;
-
- err_out2:
-	kmem_cache_free(sysfs_dir_cachep, sd);
- err_out1:
-	kfree(dup_name);
-	return NULL;
-}
-
-/**
- *	sysfs_addrm_start - prepare for sysfs_dirent add/remove
- *	@acxt: pointer to sysfs_addrm_cxt to be used
- *
- *	This function is called when the caller is about to add or remove
- *	sysfs_dirent.  This function acquires sysfs_mutex.  @acxt is used
- *	to keep and pass context to other addrm functions.
- *
- *	LOCKING:
- *	Kernel thread context (may sleep).  sysfs_mutex is locked on
- *	return.
- */
-void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt)
-	__acquires(sysfs_mutex)
-{
-	memset(acxt, 0, sizeof(*acxt));
-
-	mutex_lock(&sysfs_mutex);
-}
-
-/**
- *	__sysfs_add_one - add sysfs_dirent to parent without warning
- *	@acxt: addrm context to use
- *	@sd: sysfs_dirent to be added
- *	@parent_sd: the parent sysfs_dirent to add @sd to
- *
- *	Get @parent_sd and set @sd->s_parent to it and increment nlink of
- *	the parent inode if @sd is a directory and link into the children
- *	list of the parent.
- *
- *	This function should be called between calls to
- *	sysfs_addrm_start() and sysfs_addrm_finish() and should be
- *	passed the same @acxt as passed to sysfs_addrm_start().
- *
- *	LOCKING:
- *	Determined by sysfs_addrm_start().
- *
- *	RETURNS:
- *	0 on success, -EEXIST if entry with the given name already
- *	exists.
- */
-int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd,
-		    struct sysfs_dirent *parent_sd)
-{
-	struct sysfs_inode_attrs *ps_iattr;
-	int ret;
-
-	if (!!sysfs_ns_type(parent_sd) != !!sd->s_ns) {
-		WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n",
-			sysfs_ns_type(parent_sd) ? "required" : "invalid",
-			parent_sd->s_name, sd->s_name);
-		return -EINVAL;
-	}
-
-	sd->s_hash = sysfs_name_hash(sd->s_name, sd->s_ns);
-	sd->s_parent = sysfs_get(parent_sd);
-
-	ret = sysfs_link_sibling(sd);
-	if (ret)
-		return ret;
-
-	/* Update timestamps on the parent */
-	ps_iattr = parent_sd->s_iattr;
-	if (ps_iattr) {
-		struct iattr *ps_iattrs = &ps_iattr->ia_iattr;
-		ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME;
-	}
-
-	/* Mark the entry added into directory tree */
-	sd->s_flags &= ~SYSFS_FLAG_REMOVED;
-
-	return 0;
-}
-
 /**
  *	sysfs_pathname - return full path to sysfs dirent
- *	@sd: sysfs_dirent whose path we want
+ *	@kn: kernfs_node whose path we want
  *	@path: caller allocated buffer of size PATH_MAX
  *
  *	Gives the name "/" to the sysfs_root entry; any path returned
  *	is relative to wherever sysfs is mounted.
  */
-static char *sysfs_pathname(struct sysfs_dirent *sd, char *path)
+static char *sysfs_pathname(struct kernfs_node *kn, char *path)
 {
-	if (sd->s_parent) {
-		sysfs_pathname(sd->s_parent, path);
+	if (kn->parent) {
+		sysfs_pathname(kn->parent, path);
 		strlcat(path, "/", PATH_MAX);
 	}
-	strlcat(path, sd->s_name, PATH_MAX);
+	strlcat(path, kn->name, PATH_MAX);
 	return path;
 }
 
-void sysfs_warn_dup(struct sysfs_dirent *parent, const char *name)
+void sysfs_warn_dup(struct kernfs_node *parent, const char *name)
 {
 	char *path;
 
@@ -489,445 +55,33 @@ void sysfs_warn_dup(struct sysfs_dirent *parent, const char *name)
 }
 
 /**
- *	sysfs_add_one - add sysfs_dirent to parent
- *	@acxt: addrm context to use
- *	@sd: sysfs_dirent to be added
- *	@parent_sd: the parent sysfs_dirent to add @sd to
- *
- *	Get @parent_sd and set @sd->s_parent to it and increment nlink of
- *	the parent inode if @sd is a directory and link into the children
- *	list of the parent.
- *
- *	This function should be called between calls to
- *	sysfs_addrm_start() and sysfs_addrm_finish() and should be
- *	passed the same @acxt as passed to sysfs_addrm_start().
- *
- *	LOCKING:
- *	Determined by sysfs_addrm_start().
- *
- *	RETURNS:
- *	0 on success, -EEXIST if entry with the given name already
- *	exists.
- */
-int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd,
-		  struct sysfs_dirent *parent_sd)
-{
-	int ret;
-
-	ret = __sysfs_add_one(acxt, sd, parent_sd);
-
-	if (ret == -EEXIST)
-		sysfs_warn_dup(parent_sd, sd->s_name);
-	return ret;
-}
-
-/**
- *	sysfs_remove_one - remove sysfs_dirent from parent
- *	@acxt: addrm context to use
- *	@sd: sysfs_dirent to be removed
- *
- *	Mark @sd removed and drop nlink of parent inode if @sd is a
- *	directory.  @sd is unlinked from the children list.
- *
- *	This function should be called between calls to
- *	sysfs_addrm_start() and sysfs_addrm_finish() and should be
- *	passed the same @acxt as passed to sysfs_addrm_start().
- *
- *	LOCKING:
- *	Determined by sysfs_addrm_start().
- */
-static void sysfs_remove_one(struct sysfs_addrm_cxt *acxt,
-			     struct sysfs_dirent *sd)
-{
-	struct sysfs_inode_attrs *ps_iattr;
-
-	/*
-	 * Removal can be called multiple times on the same node.  Only the
-	 * first invocation is effective and puts the base ref.
-	 */
-	if (sd->s_flags & SYSFS_FLAG_REMOVED)
-		return;
-
-	sysfs_unlink_sibling(sd);
-
-	/* Update timestamps on the parent */
-	ps_iattr = sd->s_parent->s_iattr;
-	if (ps_iattr) {
-		struct iattr *ps_iattrs = &ps_iattr->ia_iattr;
-		ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME;
-	}
-
-	sd->s_flags |= SYSFS_FLAG_REMOVED;
-	sd->u.removed_list = acxt->removed;
-	acxt->removed = sd;
-}
-
-/**
- *	sysfs_addrm_finish - finish up sysfs_dirent add/remove
- *	@acxt: addrm context to finish up
- *
- *	Finish up sysfs_dirent add/remove.  Resources acquired by
- *	sysfs_addrm_start() are released and removed sysfs_dirents are
- *	cleaned up.
- *
- *	LOCKING:
- *	sysfs_mutex is released.
- */
-void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt)
-	__releases(sysfs_mutex)
-{
-	/* release resources acquired by sysfs_addrm_start() */
-	mutex_unlock(&sysfs_mutex);
-
-	/* kill removed sysfs_dirents */
-	while (acxt->removed) {
-		struct sysfs_dirent *sd = acxt->removed;
-
-		acxt->removed = sd->u.removed_list;
-
-		sysfs_deactivate(sd);
-		sysfs_unmap_bin_file(sd);
-		sysfs_put(sd);
-	}
-}
-
-/**
- *	sysfs_find_dirent - find sysfs_dirent with the given name
- *	@parent_sd: sysfs_dirent to search under
- *	@name: name to look for
- *	@ns: the namespace tag to use
- *
- *	Look for sysfs_dirent with name @name under @parent_sd.
- *
- *	LOCKING:
- *	mutex_lock(sysfs_mutex)
- *
- *	RETURNS:
- *	Pointer to sysfs_dirent if found, NULL if not.
- */
-struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
-				       const unsigned char *name,
-				       const void *ns)
-{
-	struct rb_node *node = parent_sd->s_dir.children.rb_node;
-	unsigned int hash;
-
-	if (!!sysfs_ns_type(parent_sd) != !!ns) {
-		WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n",
-			sysfs_ns_type(parent_sd) ? "required" : "invalid",
-			parent_sd->s_name, name);
-		return NULL;
-	}
-
-	hash = sysfs_name_hash(name, ns);
-	while (node) {
-		struct sysfs_dirent *sd;
-		int result;
-
-		sd = to_sysfs_dirent(node);
-		result = sysfs_name_compare(hash, name, ns, sd);
-		if (result < 0)
-			node = node->rb_left;
-		else if (result > 0)
-			node = node->rb_right;
-		else
-			return sd;
-	}
-	return NULL;
-}
-
-/**
- *	sysfs_get_dirent_ns - find and get sysfs_dirent with the given name
- *	@parent_sd: sysfs_dirent to search under
- *	@name: name to look for
- *	@ns: the namespace tag to use
- *
- *	Look for sysfs_dirent with name @name under @parent_sd and get
- *	it if found.
- *
- *	LOCKING:
- *	Kernel thread context (may sleep).  Grabs sysfs_mutex.
- *
- *	RETURNS:
- *	Pointer to sysfs_dirent if found, NULL if not.
- */
-struct sysfs_dirent *sysfs_get_dirent_ns(struct sysfs_dirent *parent_sd,
-					 const unsigned char *name,
-					 const void *ns)
-{
-	struct sysfs_dirent *sd;
-
-	mutex_lock(&sysfs_mutex);
-	sd = sysfs_find_dirent(parent_sd, name, ns);
-	sysfs_get(sd);
-	mutex_unlock(&sysfs_mutex);
-
-	return sd;
-}
-EXPORT_SYMBOL_GPL(sysfs_get_dirent_ns);
-
-static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd,
-		      enum kobj_ns_type type,
-		      const char *name, const void *ns,
-		      struct sysfs_dirent **p_sd)
-{
-	umode_t mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
-	struct sysfs_addrm_cxt acxt;
-	struct sysfs_dirent *sd;
-	int rc;
-
-	/* allocate */
-	sd = sysfs_new_dirent(name, mode, SYSFS_DIR);
-	if (!sd)
-		return -ENOMEM;
-
-	sd->s_flags |= (type << SYSFS_NS_TYPE_SHIFT);
-	sd->s_ns = ns;
-	sd->s_dir.kobj = kobj;
-
-	/* link in */
-	sysfs_addrm_start(&acxt);
-	rc = sysfs_add_one(&acxt, sd, parent_sd);
-	sysfs_addrm_finish(&acxt);
-
-	if (rc == 0)
-		*p_sd = sd;
-	else
-		sysfs_put(sd);
-
-	return rc;
-}
-
-int sysfs_create_subdir(struct kobject *kobj, const char *name,
-			struct sysfs_dirent **p_sd)
-{
-	return create_dir(kobj, kobj->sd,
-			  KOBJ_NS_TYPE_NONE, name, NULL, p_sd);
-}
-
-/**
- *	sysfs_read_ns_type: return associated ns_type
- *	@kobj: the kobject being queried
- *
- *	Each kobject can be tagged with exactly one namespace type
- *	(i.e. network or user).  Return the ns_type associated with
- *	this object if any
- */
-static enum kobj_ns_type sysfs_read_ns_type(struct kobject *kobj)
-{
-	const struct kobj_ns_type_operations *ops;
-	enum kobj_ns_type type;
-
-	ops = kobj_child_ns_ops(kobj);
-	if (!ops)
-		return KOBJ_NS_TYPE_NONE;
-
-	type = ops->type;
-	BUG_ON(type <= KOBJ_NS_TYPE_NONE);
-	BUG_ON(type >= KOBJ_NS_TYPES);
-	BUG_ON(!kobj_ns_type_registered(type));
-
-	return type;
-}
-
-/**
  * sysfs_create_dir_ns - create a directory for an object with a namespace tag
  * @kobj: object we're creating directory for
  * @ns: the namespace tag to use
  */
 int sysfs_create_dir_ns(struct kobject *kobj, const void *ns)
 {
-	enum kobj_ns_type type;
-	struct sysfs_dirent *parent_sd, *sd;
-	int error = 0;
+	struct kernfs_node *parent, *kn;
 
 	BUG_ON(!kobj);
 
 	if (kobj->parent)
-		parent_sd = kobj->parent->sd;
+		parent = kobj->parent->sd;
 	else
-		parent_sd = &sysfs_root;
+		parent = sysfs_root_kn;
 
-	if (!parent_sd)
+	if (!parent)
 		return -ENOENT;
 
-	type = sysfs_read_ns_type(kobj);
-
-	error = create_dir(kobj, parent_sd, type, kobject_name(kobj), ns, &sd);
-	if (!error)
-		kobj->sd = sd;
-	return error;
-}
-
-static struct dentry *sysfs_lookup(struct inode *dir, struct dentry *dentry,
-				   unsigned int flags)
-{
-	struct dentry *ret = NULL;
-	struct dentry *parent = dentry->d_parent;
-	struct sysfs_dirent *parent_sd = parent->d_fsdata;
-	struct sysfs_dirent *sd;
-	struct inode *inode;
-	enum kobj_ns_type type;
-	const void *ns;
-
-	mutex_lock(&sysfs_mutex);
-
-	type = sysfs_ns_type(parent_sd);
-	ns = sysfs_info(dir->i_sb)->ns[type];
-
-	sd = sysfs_find_dirent(parent_sd, dentry->d_name.name, ns);
-
-	/* no such entry */
-	if (!sd) {
-		ret = ERR_PTR(-ENOENT);
-		goto out_unlock;
-	}
-	dentry->d_fsdata = sysfs_get(sd);
-
-	/* attach dentry and inode */
-	inode = sysfs_get_inode(dir->i_sb, sd);
-	if (!inode) {
-		ret = ERR_PTR(-ENOMEM);
-		goto out_unlock;
-	}
-
-	/* instantiate and hash dentry */
-	ret = d_materialise_unique(dentry, inode);
- out_unlock:
-	mutex_unlock(&sysfs_mutex);
-	return ret;
-}
-
-const struct inode_operations sysfs_dir_inode_operations = {
-	.lookup		= sysfs_lookup,
-	.permission	= sysfs_permission,
-	.setattr	= sysfs_setattr,
-	.getattr	= sysfs_getattr,
-	.setxattr	= sysfs_setxattr,
-};
-
-static struct sysfs_dirent *sysfs_leftmost_descendant(struct sysfs_dirent *pos)
-{
-	struct sysfs_dirent *last;
-
-	while (true) {
-		struct rb_node *rbn;
-
-		last = pos;
-
-		if (sysfs_type(pos) != SYSFS_DIR)
-			break;
-
-		rbn = rb_first(&pos->s_dir.children);
-		if (!rbn)
-			break;
-
-		pos = to_sysfs_dirent(rbn);
-	}
-
-	return last;
-}
-
-/**
- * sysfs_next_descendant_post - find the next descendant for post-order walk
- * @pos: the current position (%NULL to initiate traversal)
- * @root: sysfs_dirent whose descendants to walk
- *
- * Find the next descendant to visit for post-order traversal of @root's
- * descendants.  @root is included in the iteration and the last node to be
- * visited.
- */
-static struct sysfs_dirent *sysfs_next_descendant_post(struct sysfs_dirent *pos,
-						       struct sysfs_dirent *root)
-{
-	struct rb_node *rbn;
-
-	lockdep_assert_held(&sysfs_mutex);
-
-	/* if first iteration, visit leftmost descendant which may be root */
-	if (!pos)
-		return sysfs_leftmost_descendant(root);
-
-	/* if we visited @root, we're done */
-	if (pos == root)
-		return NULL;
-
-	/* if there's an unvisited sibling, visit its leftmost descendant */
-	rbn = rb_next(&pos->s_rb);
-	if (rbn)
-		return sysfs_leftmost_descendant(to_sysfs_dirent(rbn));
-
-	/* no sibling left, visit parent */
-	return pos->s_parent;
-}
-
-static void __sysfs_remove(struct sysfs_addrm_cxt *acxt,
-			   struct sysfs_dirent *sd)
-{
-	struct sysfs_dirent *pos, *next;
-
-	if (!sd)
-		return;
-
-	pr_debug("sysfs %s: removing\n", sd->s_name);
-
-	next = NULL;
-	do {
-		pos = next;
-		next = sysfs_next_descendant_post(pos, sd);
-		if (pos)
-			sysfs_remove_one(acxt, pos);
-	} while (next);
-}
-
-/**
- * sysfs_remove - remove a sysfs_dirent recursively
- * @sd: the sysfs_dirent to remove
- *
- * Remove @sd along with all its subdirectories and files.
- */
-void sysfs_remove(struct sysfs_dirent *sd)
-{
-	struct sysfs_addrm_cxt acxt;
-
-	sysfs_addrm_start(&acxt);
-	__sysfs_remove(&acxt, sd);
-	sysfs_addrm_finish(&acxt);
-}
-
-/**
- * sysfs_hash_and_remove - find a sysfs_dirent by name and remove it
- * @dir_sd: parent of the target
- * @name: name of the sysfs_dirent to remove
- * @ns: namespace tag of the sysfs_dirent to remove
- *
- * Look for the sysfs_dirent with @name and @ns under @dir_sd and remove
- * it.  Returns 0 on success, -ENOENT if such entry doesn't exist.
- */
-int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name,
-			  const void *ns)
-{
-	struct sysfs_addrm_cxt acxt;
-	struct sysfs_dirent *sd;
-
-	if (!dir_sd) {
-		WARN(1, KERN_WARNING "sysfs: can not remove '%s', no directory\n",
-			name);
-		return -ENOENT;
+	kn = kernfs_create_dir_ns(parent, kobject_name(kobj), kobj, ns);
+	if (IS_ERR(kn)) {
+		if (PTR_ERR(kn) == -EEXIST)
+			sysfs_warn_dup(parent, kobject_name(kobj));
+		return PTR_ERR(kn);
 	}
 
-	sysfs_addrm_start(&acxt);
-
-	sd = sysfs_find_dirent(dir_sd, name, ns);
-	if (sd)
-		__sysfs_remove(&acxt, sd);
-
-	sysfs_addrm_finish(&acxt);
-
-	if (sd)
-		return 0;
-	else
-		return -ENOENT;
+	kobj->sd = kn;
+	return 0;
 }
 
 /**
@@ -940,207 +94,47 @@ int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name,
  */
 void sysfs_remove_dir(struct kobject *kobj)
 {
-	struct sysfs_dirent *sd = kobj->sd;
+	struct kernfs_node *kn = kobj->sd;
 
 	/*
 	 * In general, kboject owner is responsible for ensuring removal
 	 * doesn't race with other operations and sysfs doesn't provide any
 	 * protection; however, when @kobj is used as a symlink target, the
 	 * symlinking entity usually doesn't own @kobj and thus has no
-	 * control over removal.  @kobj->sd may be removed anytime and
-	 * symlink code may end up dereferencing an already freed sd.
+	 * control over removal.  @kobj->sd may be removed anytime
+	 * and symlink code may end up dereferencing an already freed node.
 	 *
-	 * sysfs_symlink_target_lock synchronizes @kobj->sd disassociation
-	 * against symlink operations so that symlink code can safely
-	 * dereference @kobj->sd.
+	 * sysfs_symlink_target_lock synchronizes @kobj->sd
+	 * disassociation against symlink operations so that symlink code
+	 * can safely dereference @kobj->sd.
 	 */
 	spin_lock(&sysfs_symlink_target_lock);
 	kobj->sd = NULL;
 	spin_unlock(&sysfs_symlink_target_lock);
 
-	if (sd) {
-		WARN_ON_ONCE(sysfs_type(sd) != SYSFS_DIR);
-		sysfs_remove(sd);
+	if (kn) {
+		WARN_ON_ONCE(kernfs_type(kn) != KERNFS_DIR);
+		kernfs_remove(kn);
 	}
 }
 
-int sysfs_rename(struct sysfs_dirent *sd, struct sysfs_dirent *new_parent_sd,
-		 const char *new_name, const void *new_ns)
-{
-	int error;
-
-	mutex_lock(&sysfs_mutex);
-
-	error = 0;
-	if ((sd->s_parent == new_parent_sd) && (sd->s_ns == new_ns) &&
-	    (strcmp(sd->s_name, new_name) == 0))
-		goto out;	/* nothing to rename */
-
-	error = -EEXIST;
-	if (sysfs_find_dirent(new_parent_sd, new_name, new_ns))
-		goto out;
-
-	/* rename sysfs_dirent */
-	if (strcmp(sd->s_name, new_name) != 0) {
-		error = -ENOMEM;
-		new_name = kstrdup(new_name, GFP_KERNEL);
-		if (!new_name)
-			goto out;
-
-		kfree(sd->s_name);
-		sd->s_name = new_name;
-	}
-
-	/*
-	 * Move to the appropriate place in the appropriate directories rbtree.
-	 */
-	sysfs_unlink_sibling(sd);
-	sysfs_get(new_parent_sd);
-	sysfs_put(sd->s_parent);
-	sd->s_ns = new_ns;
-	sd->s_hash = sysfs_name_hash(sd->s_name, sd->s_ns);
-	sd->s_parent = new_parent_sd;
-	sysfs_link_sibling(sd);
-
-	error = 0;
- out:
-	mutex_unlock(&sysfs_mutex);
-	return error;
-}
-
 int sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name,
 			const void *new_ns)
 {
-	struct sysfs_dirent *parent_sd = kobj->sd->s_parent;
+	struct kernfs_node *parent = kobj->sd->parent;
 
-	return sysfs_rename(kobj->sd, parent_sd, new_name, new_ns);
+	return kernfs_rename_ns(kobj->sd, parent, new_name, new_ns);
 }
 
 int sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj,
 		      const void *new_ns)
 {
-	struct sysfs_dirent *sd = kobj->sd;
-	struct sysfs_dirent *new_parent_sd;
+	struct kernfs_node *kn = kobj->sd;
+	struct kernfs_node *new_parent;
 
-	BUG_ON(!sd->s_parent);
-	new_parent_sd = new_parent_kobj && new_parent_kobj->sd ?
-		new_parent_kobj->sd : &sysfs_root;
+	BUG_ON(!kn->parent);
+	new_parent = new_parent_kobj && new_parent_kobj->sd ?
+		new_parent_kobj->sd : sysfs_root_kn;
 
-	return sysfs_rename(sd, new_parent_sd, sd->s_name, new_ns);
+	return kernfs_rename_ns(kn, new_parent, kn->name, new_ns);
 }
-
-/* Relationship between s_mode and the DT_xxx types */
-static inline unsigned char dt_type(struct sysfs_dirent *sd)
-{
-	return (sd->s_mode >> 12) & 15;
-}
-
-static int sysfs_dir_release(struct inode *inode, struct file *filp)
-{
-	sysfs_put(filp->private_data);
-	return 0;
-}
-
-static struct sysfs_dirent *sysfs_dir_pos(const void *ns,
-	struct sysfs_dirent *parent_sd,	loff_t hash, struct sysfs_dirent *pos)
-{
-	if (pos) {
-		int valid = !(pos->s_flags & SYSFS_FLAG_REMOVED) &&
-			pos->s_parent == parent_sd &&
-			hash == pos->s_hash;
-		sysfs_put(pos);
-		if (!valid)
-			pos = NULL;
-	}
-	if (!pos && (hash > 1) && (hash < INT_MAX)) {
-		struct rb_node *node = parent_sd->s_dir.children.rb_node;
-		while (node) {
-			pos = to_sysfs_dirent(node);
-
-			if (hash < pos->s_hash)
-				node = node->rb_left;
-			else if (hash > pos->s_hash)
-				node = node->rb_right;
-			else
-				break;
-		}
-	}
-	/* Skip over entries in the wrong namespace */
-	while (pos && pos->s_ns != ns) {
-		struct rb_node *node = rb_next(&pos->s_rb);
-		if (!node)
-			pos = NULL;
-		else
-			pos = to_sysfs_dirent(node);
-	}
-	return pos;
-}
-
-static struct sysfs_dirent *sysfs_dir_next_pos(const void *ns,
-	struct sysfs_dirent *parent_sd,	ino_t ino, struct sysfs_dirent *pos)
-{
-	pos = sysfs_dir_pos(ns, parent_sd, ino, pos);
-	if (pos)
-		do {
-			struct rb_node *node = rb_next(&pos->s_rb);
-			if (!node)
-				pos = NULL;
-			else
-				pos = to_sysfs_dirent(node);
-		} while (pos && pos->s_ns != ns);
-	return pos;
-}
-
-static int sysfs_readdir(struct file *file, struct dir_context *ctx)
-{
-	struct dentry *dentry = file->f_path.dentry;
-	struct sysfs_dirent *parent_sd = dentry->d_fsdata;
-	struct sysfs_dirent *pos = file->private_data;
-	enum kobj_ns_type type;
-	const void *ns;
-
-	type = sysfs_ns_type(parent_sd);
-	ns = sysfs_info(dentry->d_sb)->ns[type];
-
-	if (!dir_emit_dots(file, ctx))
-		return 0;
-	mutex_lock(&sysfs_mutex);
-	for (pos = sysfs_dir_pos(ns, parent_sd, ctx->pos, pos);
-	     pos;
-	     pos = sysfs_dir_next_pos(ns, parent_sd, ctx->pos, pos)) {
-		const char *name = pos->s_name;
-		unsigned int type = dt_type(pos);
-		int len = strlen(name);
-		ino_t ino = pos->s_ino;
-		ctx->pos = pos->s_hash;
-		file->private_data = sysfs_get(pos);
-
-		mutex_unlock(&sysfs_mutex);
-		if (!dir_emit(ctx, name, len, ino, type))
-			return 0;
-		mutex_lock(&sysfs_mutex);
-	}
-	mutex_unlock(&sysfs_mutex);
-	file->private_data = NULL;
-	ctx->pos = INT_MAX;
-	return 0;
-}
-
-static loff_t sysfs_dir_llseek(struct file *file, loff_t offset, int whence)
-{
-	struct inode *inode = file_inode(file);
-	loff_t ret;
-
-	mutex_lock(&inode->i_mutex);
-	ret = generic_file_llseek(file, offset, whence);
-	mutex_unlock(&inode->i_mutex);
-
-	return ret;
-}
-
-const struct file_operations sysfs_dir_operations = {
-	.read		= generic_read_dir,
-	.iterate	= sysfs_readdir,
-	.release	= sysfs_dir_release,
-	.llseek		= sysfs_dir_llseek,
-};
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 35e7d08fe629..fe6388fbd154 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -14,70 +14,23 @@
 #include <linux/kobject.h>
 #include <linux/kallsyms.h>
 #include <linux/slab.h>
-#include <linux/fsnotify.h>
-#include <linux/namei.h>
-#include <linux/poll.h>
 #include <linux/list.h>
 #include <linux/mutex.h>
-#include <linux/limits.h>
-#include <linux/uaccess.h>
 #include <linux/seq_file.h>
-#include <linux/mm.h>
 
 #include "sysfs.h"
+#include "../kernfs/kernfs-internal.h"
 
 /*
- * There's one sysfs_open_file for each open file and one sysfs_open_dirent
- * for each sysfs_dirent with one or more open files.
- *
- * sysfs_dirent->s_attr.open points to sysfs_open_dirent.  s_attr.open is
- * protected by sysfs_open_dirent_lock.
- *
- * filp->private_data points to seq_file whose ->private points to
- * sysfs_open_file.  sysfs_open_files are chained at
- * sysfs_open_dirent->files, which is protected by sysfs_open_file_mutex.
- */
-static DEFINE_SPINLOCK(sysfs_open_dirent_lock);
-static DEFINE_MUTEX(sysfs_open_file_mutex);
-
-struct sysfs_open_dirent {
-	atomic_t		refcnt;
-	atomic_t		event;
-	wait_queue_head_t	poll;
-	struct list_head	files; /* goes through sysfs_open_file.list */
-};
-
-struct sysfs_open_file {
-	struct sysfs_dirent	*sd;
-	struct file		*file;
-	struct mutex		mutex;
-	int			event;
-	struct list_head	list;
-
-	bool			mmapped;
-	const struct vm_operations_struct *vm_ops;
-};
-
-static bool sysfs_is_bin(struct sysfs_dirent *sd)
-{
-	return sysfs_type(sd) == SYSFS_KOBJ_BIN_ATTR;
-}
-
-static struct sysfs_open_file *sysfs_of(struct file *file)
-{
-	return ((struct seq_file *)file->private_data)->private;
-}
-
-/*
- * Determine ktype->sysfs_ops for the given sysfs_dirent.  This function
+ * Determine ktype->sysfs_ops for the given kernfs_node.  This function
  * must be called while holding an active reference.
  */
-static const struct sysfs_ops *sysfs_file_ops(struct sysfs_dirent *sd)
+static const struct sysfs_ops *sysfs_file_ops(struct kernfs_node *kn)
 {
-	struct kobject *kobj = sd->s_parent->s_dir.kobj;
+	struct kobject *kobj = kn->parent->priv;
 
-	if (!sysfs_ignore_lockdep(sd))
-		lockdep_assert_held(sd);
+	if (kn->flags & KERNFS_LOCKDEP)
+		lockdep_assert_held(kn);
 	return kobj->ktype ? kobj->ktype->sysfs_ops : NULL;
 }
 
@@ -86,13 +39,13 @@ static const struct sysfs_ops *sysfs_file_ops(struct sysfs_dirent *sd)
  * details like buffering and seeking.  The following function pipes
  * sysfs_ops->show() result through seq_file.
  */
-static int sysfs_seq_show(struct seq_file *sf, void *v)
+static int sysfs_kf_seq_show(struct seq_file *sf, void *v)
 {
-	struct sysfs_open_file *of = sf->private;
-	struct kobject *kobj = of->sd->s_parent->s_dir.kobj;
-	const struct sysfs_ops *ops;
-	char *buf;
+	struct kernfs_open_file *of = sf->private;
+	struct kobject *kobj = of->kn->parent->priv;
+	const struct sysfs_ops *ops = sysfs_file_ops(of->kn);
 	ssize_t count;
+	char *buf;
 
 	/* acquire buffer and ensure that it's >= PAGE_SIZE */
 	count = seq_get_buf(sf, &buf);
@@ -102,34 +55,15 @@ static int sysfs_seq_show(struct seq_file *sf, void *v)
 	}
 
 	/*
-	 * Need @of->sd for attr and ops, its parent for kobj.  @of->mutex
-	 * nests outside active ref and is just to ensure that the ops
-	 * aren't called concurrently for the same open file.
+	 * Invoke show().  Control may reach here via seq file lseek even
+	 * if @ops->show() isn't implemented.
 	 */
-	mutex_lock(&of->mutex);
-	if (!sysfs_get_active(of->sd)) {
-		mutex_unlock(&of->mutex);
-		return -ENODEV;
+	if (ops->show) {
+		count = ops->show(kobj, of->kn->priv, buf);
+		if (count < 0)
+			return count;
 	}
 
-	of->event = atomic_read(&of->sd->s_attr.open->event);
-
-	/*
-	 * Lookup @ops and invoke show().  Control may reach here via seq
-	 * file lseek even if @ops->show() isn't implemented.
-	 */
-	ops = sysfs_file_ops(of->sd);
-	if (ops->show)
-		count = ops->show(kobj, of->sd->s_attr.attr, buf);
-	else
-		count = 0;
-
-	sysfs_put_active(of->sd);
-	mutex_unlock(&of->mutex);
-
-	if (count < 0)
-		return count;
-
 	/*
 	 * The code works fine with PAGE_SIZE return but it's likely to
 	 * indicate truncated result or overflow in normal use cases.
@@ -144,726 +78,194 @@ static int sysfs_seq_show(struct seq_file *sf, void *v)
 	return 0;
 }
 
-/*
- * Read method for bin files.  As reading a bin file can have side-effects,
- * the exact offset and bytes specified in read(2) call should be passed to
- * the read callback making it difficult to use seq_file.  Implement
- * simplistic custom buffering for bin files.
- */
-static ssize_t sysfs_bin_read(struct file *file, char __user *userbuf,
-			      size_t bytes, loff_t *off)
+static ssize_t sysfs_kf_bin_read(struct kernfs_open_file *of, char *buf,
+				 size_t count, loff_t pos)
 {
-	struct sysfs_open_file *of = sysfs_of(file);
-	struct bin_attribute *battr = of->sd->s_attr.bin_attr;
-	struct kobject *kobj = of->sd->s_parent->s_dir.kobj;
-	loff_t size = file_inode(file)->i_size;
-	int count = min_t(size_t, bytes, PAGE_SIZE);
-	loff_t offs = *off;
-	char *buf;
+	struct bin_attribute *battr = of->kn->priv;
+	struct kobject *kobj = of->kn->parent->priv;
+	loff_t size = file_inode(of->file)->i_size;
 
-	if (!bytes)
+	if (!count)
 		return 0;
 
 	if (size) {
-		if (offs > size)
+		if (pos > size)
 			return 0;
-		if (offs + count > size)
-			count = size - offs;
-	}
-
-	buf = kmalloc(count, GFP_KERNEL);
-	if (!buf)
-		return -ENOMEM;
-
-	/* need of->sd for battr, its parent for kobj */
-	mutex_lock(&of->mutex);
-	if (!sysfs_get_active(of->sd)) {
-		count = -ENODEV;
-		mutex_unlock(&of->mutex);
-		goto out_free;
-	}
-
-	if (battr->read)
-		count = battr->read(file, kobj, battr, buf, offs, count);
-	else
-		count = -EIO;
-
-	sysfs_put_active(of->sd);
-	mutex_unlock(&of->mutex);
-
-	if (count < 0)
-		goto out_free;
-
-	if (copy_to_user(userbuf, buf, count)) {
-		count = -EFAULT;
-		goto out_free;
+		if (pos + count > size)
+			count = size - pos;
 	}
 
-	pr_debug("offs = %lld, *off = %lld, count = %d\n", offs, *off, count);
-
-	*off = offs + count;
+	if (!battr->read)
+		return -EIO;
 
- out_free:
-	kfree(buf);
-	return count;
+	return battr->read(of->file, kobj, battr, buf, pos, count);
 }
 
-/**
- * flush_write_buffer - push buffer to kobject
- * @of: open file
- * @buf: data buffer for file
- * @off: file offset to write to
- * @count: number of bytes
- *
- * Get the correct pointers for the kobject and the attribute we're dealing
- * with, then call the store() method for it with @buf.
- */
-static int flush_write_buffer(struct sysfs_open_file *of, char *buf, loff_t off,
-			      size_t count)
+/* kernfs write callback for regular sysfs files */
+static ssize_t sysfs_kf_write(struct kernfs_open_file *of, char *buf,
+			      size_t count, loff_t pos)
 {
-	struct kobject *kobj = of->sd->s_parent->s_dir.kobj;
-	int rc = 0;
-
-	/*
-	 * Need @of->sd for attr and ops, its parent for kobj.  @of->mutex
-	 * nests outside active ref and is just to ensure that the ops
-	 * aren't called concurrently for the same open file.
-	 */
-	mutex_lock(&of->mutex);
-	if (!sysfs_get_active(of->sd)) {
-		mutex_unlock(&of->mutex);
-		return -ENODEV;
-	}
+	const struct sysfs_ops *ops = sysfs_file_ops(of->kn);
+	struct kobject *kobj = of->kn->parent->priv;
 
-	if (sysfs_is_bin(of->sd)) {
-		struct bin_attribute *battr = of->sd->s_attr.bin_attr;
-
-		rc = -EIO;
-		if (battr->write)
-			rc = battr->write(of->file, kobj, battr, buf, off,
-					  count);
-	} else {
-		const struct sysfs_ops *ops = sysfs_file_ops(of->sd);
-
-		rc = ops->store(kobj, of->sd->s_attr.attr, buf, count);
-	}
-
-	sysfs_put_active(of->sd);
-	mutex_unlock(&of->mutex);
+	if (!count)
+		return 0;
 
-	return rc;
+	return ops->store(kobj, of->kn->priv, buf, count);
 }
 
-/**
- * sysfs_write_file - write an attribute
- * @file: file pointer
- * @user_buf: data to write
- * @count: number of bytes
- * @ppos: starting offset
- *
- * Copy data in from userland and pass it to the matching
- * sysfs_ops->store() by invoking flush_write_buffer().
- *
- * There is no easy way for us to know if userspace is only doing a partial
- * write, so we don't support them. We expect the entire buffer to come on
- * the first write.  Hint: if you're writing a value, first read the file,
- * modify only the the value you're changing, then write entire buffer
- * back.
- */
-static ssize_t sysfs_write_file(struct file *file, const char __user *user_buf,
-				size_t count, loff_t *ppos)
+/* kernfs write callback for bin sysfs files */
+static ssize_t sysfs_kf_bin_write(struct kernfs_open_file *of, char *buf,
+				  size_t count, loff_t pos)
 {
-	struct sysfs_open_file *of = sysfs_of(file);
-	ssize_t len = min_t(size_t, count, PAGE_SIZE);
-	loff_t size = file_inode(file)->i_size;
-	char *buf;
+	struct bin_attribute *battr = of->kn->priv;
+	struct kobject *kobj = of->kn->parent->priv;
+	loff_t size = file_inode(of->file)->i_size;
 
-	if (sysfs_is_bin(of->sd) && size) {
-		if (size <= *ppos)
+	if (size) {
+		if (size <= pos)
 			return 0;
-		len = min_t(ssize_t, len, size - *ppos);
+		count = min_t(ssize_t, count, size - pos);
 	}
-
-	if (!len)
+	if (!count)
 		return 0;
 
-	buf = kmalloc(len + 1, GFP_KERNEL);
-	if (!buf)
-		return -ENOMEM;
+	if (!battr->write)
+		return -EIO;
 
-	if (copy_from_user(buf, user_buf, len)) {
-		len = -EFAULT;
-		goto out_free;
-	}
-	buf[len] = '\0';	/* guarantee string termination */
-
-	len = flush_write_buffer(of, buf, *ppos, len);
-	if (len > 0)
-		*ppos += len;
-out_free:
-	kfree(buf);
-	return len;
-}
-
-static void sysfs_bin_vma_open(struct vm_area_struct *vma)
-{
-	struct file *file = vma->vm_file;
-	struct sysfs_open_file *of = sysfs_of(file);
-
-	if (!of->vm_ops)
-		return;
-
-	if (!sysfs_get_active(of->sd))
-		return;
-
-	if (of->vm_ops->open)
-		of->vm_ops->open(vma);
-
-	sysfs_put_active(of->sd);
+	return battr->write(of->file, kobj, battr, buf, pos, count);
 }
 
-static int sysfs_bin_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+static int sysfs_kf_bin_mmap(struct kernfs_open_file *of,
+			     struct vm_area_struct *vma)
 {
-	struct file *file = vma->vm_file;
-	struct sysfs_open_file *of = sysfs_of(file);
-	int ret;
+	struct bin_attribute *battr = of->kn->priv;
+	struct kobject *kobj = of->kn->parent->priv;
 
-	if (!of->vm_ops)
-		return VM_FAULT_SIGBUS;
-
-	if (!sysfs_get_active(of->sd))
-		return VM_FAULT_SIGBUS;
-
-	ret = VM_FAULT_SIGBUS;
-	if (of->vm_ops->fault)
-		ret = of->vm_ops->fault(vma, vmf);
-
-	sysfs_put_active(of->sd);
-	return ret;
+	return battr->mmap(of->file, kobj, battr, vma);
 }
 
-static int sysfs_bin_page_mkwrite(struct vm_area_struct *vma,
-				  struct vm_fault *vmf)
+void sysfs_notify(struct kobject *kobj, const char *dir, const char *attr)
 {
-	struct file *file = vma->vm_file;
-	struct sysfs_open_file *of = sysfs_of(file);
-	int ret;
-
-	if (!of->vm_ops)
-		return VM_FAULT_SIGBUS;
+	struct kernfs_node *kn = kobj->sd, *tmp;
 
-	if (!sysfs_get_active(of->sd))
-		return VM_FAULT_SIGBUS;
-
-	ret = 0;
-	if (of->vm_ops->page_mkwrite)
-		ret = of->vm_ops->page_mkwrite(vma, vmf);
+	if (kn && dir)
+		kn = kernfs_find_and_get(kn, dir);
 	else
-		file_update_time(file);
-
-	sysfs_put_active(of->sd);
-	return ret;
-}
-
-static int sysfs_bin_access(struct vm_area_struct *vma, unsigned long addr,
-			    void *buf, int len, int write)
-{
-	struct file *file = vma->vm_file;
-	struct sysfs_open_file *of = sysfs_of(file);
-	int ret;
-
-	if (!of->vm_ops)
-		return -EINVAL;
-
-	if (!sysfs_get_active(of->sd))
-		return -EINVAL;
-
-	ret = -EINVAL;
-	if (of->vm_ops->access)
-		ret = of->vm_ops->access(vma, addr, buf, len, write);
-
-	sysfs_put_active(of->sd);
-	return ret;
-}
-
-#ifdef CONFIG_NUMA
-static int sysfs_bin_set_policy(struct vm_area_struct *vma,
-				struct mempolicy *new)
-{
-	struct file *file = vma->vm_file;
-	struct sysfs_open_file *of = sysfs_of(file);
-	int ret;
-
-	if (!of->vm_ops)
-		return 0;
-
-	if (!sysfs_get_active(of->sd))
-		return -EINVAL;
-
-	ret = 0;
-	if (of->vm_ops->set_policy)
-		ret = of->vm_ops->set_policy(vma, new);
-
-	sysfs_put_active(of->sd);
-	return ret;
-}
-
-static struct mempolicy *sysfs_bin_get_policy(struct vm_area_struct *vma,
-					      unsigned long addr)
-{
-	struct file *file = vma->vm_file;
-	struct sysfs_open_file *of = sysfs_of(file);
-	struct mempolicy *pol;
-
-	if (!of->vm_ops)
-		return vma->vm_policy;
-
-	if (!sysfs_get_active(of->sd))
-		return vma->vm_policy;
-
-	pol = vma->vm_policy;
-	if (of->vm_ops->get_policy)
-		pol = of->vm_ops->get_policy(vma, addr);
-
-	sysfs_put_active(of->sd);
-	return pol;
-}
-
-static int sysfs_bin_migrate(struct vm_area_struct *vma, const nodemask_t *from,
-			     const nodemask_t *to, unsigned long flags)
-{
-	struct file *file = vma->vm_file;
-	struct sysfs_open_file *of = sysfs_of(file);
-	int ret;
-
-	if (!of->vm_ops)
-		return 0;
-
-	if (!sysfs_get_active(of->sd))
-		return 0;
-
-	ret = 0;
-	if (of->vm_ops->migrate)
-		ret = of->vm_ops->migrate(vma, from, to, flags);
-
-	sysfs_put_active(of->sd);
-	return ret;
-}
-#endif
-
-static const struct vm_operations_struct sysfs_bin_vm_ops = {
-	.open		= sysfs_bin_vma_open,
-	.fault		= sysfs_bin_fault,
-	.page_mkwrite	= sysfs_bin_page_mkwrite,
-	.access		= sysfs_bin_access,
-#ifdef CONFIG_NUMA
-	.set_policy	= sysfs_bin_set_policy,
-	.get_policy	= sysfs_bin_get_policy,
-	.migrate	= sysfs_bin_migrate,
-#endif
-};
-
-static int sysfs_bin_mmap(struct file *file, struct vm_area_struct *vma)
-{
-	struct sysfs_open_file *of = sysfs_of(file);
-	struct bin_attribute *battr = of->sd->s_attr.bin_attr;
-	struct kobject *kobj = of->sd->s_parent->s_dir.kobj;
-	int rc;
-
-	mutex_lock(&of->mutex);
-
-	/* need of->sd for battr, its parent for kobj */
-	rc = -ENODEV;
-	if (!sysfs_get_active(of->sd))
-		goto out_unlock;
-
-	if (!battr->mmap)
-		goto out_put;
-
-	rc = battr->mmap(file, kobj, battr, vma);
-	if (rc)
-		goto out_put;
-
-	/*
-	 * PowerPC's pci_mmap of legacy_mem uses shmem_zero_setup()
-	 * to satisfy versions of X which crash if the mmap fails: that
-	 * substitutes a new vm_file, and we don't then want bin_vm_ops.
-	 */
-	if (vma->vm_file != file)
-		goto out_put;
-
-	rc = -EINVAL;
-	if (of->mmapped && of->vm_ops != vma->vm_ops)
-		goto out_put;
+		kernfs_get(kn);
 
-	/*
-	 * It is not possible to successfully wrap close.
-	 * So error if someone is trying to use close.
-	 */
-	rc = -EINVAL;
-	if (vma->vm_ops && vma->vm_ops->close)
-		goto out_put;
-
-	rc = 0;
-	of->mmapped = 1;
-	of->vm_ops = vma->vm_ops;
-	vma->vm_ops = &sysfs_bin_vm_ops;
-out_put:
-	sysfs_put_active(of->sd);
-out_unlock:
-	mutex_unlock(&of->mutex);
-
-	return rc;
-}
-
-/**
- *	sysfs_get_open_dirent - get or create sysfs_open_dirent
- *	@sd: target sysfs_dirent
- *	@of: sysfs_open_file for this instance of open
- *
- *	If @sd->s_attr.open exists, increment its reference count;
- *	otherwise, create one.  @of is chained to the files list.
- *
- *	LOCKING:
- *	Kernel thread context (may sleep).
- *
- *	RETURNS:
- *	0 on success, -errno on failure.
- */
-static int sysfs_get_open_dirent(struct sysfs_dirent *sd,
-				 struct sysfs_open_file *of)
-{
-	struct sysfs_open_dirent *od, *new_od = NULL;
-
- retry:
-	mutex_lock(&sysfs_open_file_mutex);
-	spin_lock_irq(&sysfs_open_dirent_lock);
-
-	if (!sd->s_attr.open && new_od) {
-		sd->s_attr.open = new_od;
-		new_od = NULL;
+	if (kn && attr) {
+		tmp = kernfs_find_and_get(kn, attr);
+		kernfs_put(kn);
+		kn = tmp;
 	}
 
-	od = sd->s_attr.open;
-	if (od) {
-		atomic_inc(&od->refcnt);
-		list_add_tail(&of->list, &od->files);
-	}
-
-	spin_unlock_irq(&sysfs_open_dirent_lock);
-	mutex_unlock(&sysfs_open_file_mutex);
-
-	if (od) {
-		kfree(new_od);
-		return 0;
+	if (kn) {
+		kernfs_notify(kn);
+		kernfs_put(kn);
 	}
+}
+EXPORT_SYMBOL_GPL(sysfs_notify);
 
-	/* not there, initialize a new one and retry */
-	new_od = kmalloc(sizeof(*new_od), GFP_KERNEL);
-	if (!new_od)
-		return -ENOMEM;
+static const struct kernfs_ops sysfs_file_kfops_empty = {
+};
 
-	atomic_set(&new_od->refcnt, 0);
-	atomic_set(&new_od->event, 1);
-	init_waitqueue_head(&new_od->poll);
-	INIT_LIST_HEAD(&new_od->files);
-	goto retry;
-}
+static const struct kernfs_ops sysfs_file_kfops_ro = {
+	.seq_show	= sysfs_kf_seq_show,
+};
 
-/**
- *	sysfs_put_open_dirent - put sysfs_open_dirent
- *	@sd: target sysfs_dirent
- *	@of: associated sysfs_open_file
- *
- *	Put @sd->s_attr.open and unlink @of from the files list.  If
- *	reference count reaches zero, disassociate and free it.
- *
- *	LOCKING:
- *	None.
- */
-static void sysfs_put_open_dirent(struct sysfs_dirent *sd,
-				  struct sysfs_open_file *of)
-{
-	struct sysfs_open_dirent *od = sd->s_attr.open;
-	unsigned long flags;
+static const struct kernfs_ops sysfs_file_kfops_wo = {
+	.write		= sysfs_kf_write,
+};
 
-	mutex_lock(&sysfs_open_file_mutex);
-	spin_lock_irqsave(&sysfs_open_dirent_lock, flags);
+static const struct kernfs_ops sysfs_file_kfops_rw = {
+	.seq_show	= sysfs_kf_seq_show,
+	.write		= sysfs_kf_write,
+};
 
-	if (of)
-		list_del(&of->list);
+static const struct kernfs_ops sysfs_bin_kfops_ro = {
+	.read		= sysfs_kf_bin_read,
+};
 
-	if (atomic_dec_and_test(&od->refcnt))
-		sd->s_attr.open = NULL;
-	else
-		od = NULL;
+static const struct kernfs_ops sysfs_bin_kfops_wo = {
+	.write		= sysfs_kf_bin_write,
+};
 
-	spin_unlock_irqrestore(&sysfs_open_dirent_lock, flags);
-	mutex_unlock(&sysfs_open_file_mutex);
+static const struct kernfs_ops sysfs_bin_kfops_rw = {
+	.read		= sysfs_kf_bin_read,
+	.write		= sysfs_kf_bin_write,
+};
 
-	kfree(od);
-}
+static const struct kernfs_ops sysfs_bin_kfops_mmap = {
+	.read		= sysfs_kf_bin_read,
+	.write		= sysfs_kf_bin_write,
+	.mmap		= sysfs_kf_bin_mmap,
+};
 
-static int sysfs_open_file(struct inode *inode, struct file *file)
+int sysfs_add_file_mode_ns(struct kernfs_node *parent,
+			   const struct attribute *attr, bool is_bin,
+			   umode_t mode, const void *ns)
 {
-	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
-	struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
-	struct sysfs_open_file *of;
-	bool has_read, has_write;
-	int error = -EACCES;
-
-	/* need attr_sd for attr and ops, its parent for kobj */
-	if (!sysfs_get_active(attr_sd))
-		return -ENODEV;
+	struct lock_class_key *key = NULL;
+	const struct kernfs_ops *ops;
+	struct kernfs_node *kn;
+	loff_t size;
 
-	if (sysfs_is_bin(attr_sd)) {
-		struct bin_attribute *battr = attr_sd->s_attr.bin_attr;
-
-		has_read = battr->read || battr->mmap;
-		has_write = battr->write || battr->mmap;
-	} else {
-		const struct sysfs_ops *ops = sysfs_file_ops(attr_sd);
+	if (!is_bin) {
+		struct kobject *kobj = parent->priv;
+		const struct sysfs_ops *sysfs_ops = kobj->ktype->sysfs_ops;
 
 		/* every kobject with an attribute needs a ktype assigned */
-		if (WARN(!ops, KERN_ERR
+		if (WARN(!sysfs_ops, KERN_ERR
 			 "missing sysfs attribute operations for kobject: %s\n",
 			 kobject_name(kobj)))
-			goto err_out;
-
-		has_read = ops->show;
-		has_write = ops->store;
-	}
-
-	/* check perms and supported operations */
-	if ((file->f_mode & FMODE_WRITE) &&
-	    (!(inode->i_mode & S_IWUGO) || !has_write))
-		goto err_out;
-
-	if ((file->f_mode & FMODE_READ) &&
-	    (!(inode->i_mode & S_IRUGO) || !has_read))
-		goto err_out;
-
-	/* allocate a sysfs_open_file for the file */
-	error = -ENOMEM;
-	of = kzalloc(sizeof(struct sysfs_open_file), GFP_KERNEL);
-	if (!of)
-		goto err_out;
-
-	/*
-	 * The following is done to give a different lockdep key to
-	 * @of->mutex for files which implement mmap.  This is a rather
-	 * crude way to avoid false positive lockdep warning around
-	 * mm->mmap_sem - mmap nests @of->mutex under mm->mmap_sem and
-	 * reading /sys/block/sda/trace/act_mask grabs sr_mutex, under
-	 * which mm->mmap_sem nests, while holding @of->mutex.  As each
-	 * open file has a separate mutex, it's okay as long as those don't
-	 * happen on the same file.  At this point, we can't easily give
-	 * each file a separate locking class.  Let's differentiate on
-	 * whether the file is bin or not for now.
-	 */
-	if (sysfs_is_bin(attr_sd))
-		mutex_init(&of->mutex);
-	else
-		mutex_init(&of->mutex);
-
-	of->sd = attr_sd;
-	of->file = file;
-
-	/*
-	 * Always instantiate seq_file even if read access doesn't use
-	 * seq_file or is not requested.  This unifies private data access
-	 * and readable regular files are the vast majority anyway.
-	 */
-	if (sysfs_is_bin(attr_sd))
-		error = single_open(file, NULL, of);
-	else
-		error = single_open(file, sysfs_seq_show, of);
-	if (error)
-		goto err_free;
-
-	/* seq_file clears PWRITE unconditionally, restore it if WRITE */
-	if (file->f_mode & FMODE_WRITE)
-		file->f_mode |= FMODE_PWRITE;
-
-	/* make sure we have open dirent struct */
-	error = sysfs_get_open_dirent(attr_sd, of);
-	if (error)
-		goto err_close;
-
-	/* open succeeded, put active references */
-	sysfs_put_active(attr_sd);
-	return 0;
-
-err_close:
-	single_release(inode, file);
-err_free:
-	kfree(of);
-err_out:
-	sysfs_put_active(attr_sd);
-	return error;
-}
-
-static int sysfs_release(struct inode *inode, struct file *filp)
-{
-	struct sysfs_dirent *sd = filp->f_path.dentry->d_fsdata;
-	struct sysfs_open_file *of = sysfs_of(filp);
-
-	sysfs_put_open_dirent(sd, of);
-	single_release(inode, filp);
-	kfree(of);
-
-	return 0;
-}
-
-void sysfs_unmap_bin_file(struct sysfs_dirent *sd)
-{
-	struct sysfs_open_dirent *od;
-	struct sysfs_open_file *of;
-
-	if (!sysfs_is_bin(sd))
-		return;
-
-	spin_lock_irq(&sysfs_open_dirent_lock);
-	od = sd->s_attr.open;
-	if (od)
-		atomic_inc(&od->refcnt);
-	spin_unlock_irq(&sysfs_open_dirent_lock);
-	if (!od)
-		return;
-
-	mutex_lock(&sysfs_open_file_mutex);
-	list_for_each_entry(of, &od->files, list) {
-		struct inode *inode = file_inode(of->file);
-		unmap_mapping_range(inode->i_mapping, 0, 0, 1);
+			return -EINVAL;
+
+		if (sysfs_ops->show && sysfs_ops->store)
+			ops = &sysfs_file_kfops_rw;
+		else if (sysfs_ops->show)
+			ops = &sysfs_file_kfops_ro;
+		else if (sysfs_ops->store)
+			ops = &sysfs_file_kfops_wo;
+		else
+			ops = &sysfs_file_kfops_empty;
+
+		size = PAGE_SIZE;
+	} else {
+		struct bin_attribute *battr = (void *)attr;
+
+		if (battr->mmap)
+			ops = &sysfs_bin_kfops_mmap;
+		else if (battr->read && battr->write)
+			ops = &sysfs_bin_kfops_rw;
+		else if (battr->read)
+			ops = &sysfs_bin_kfops_ro;
+		else if (battr->write)
+			ops = &sysfs_bin_kfops_wo;
+		else
+			ops = &sysfs_file_kfops_empty;
+
+		size = battr->size;
 	}
-	mutex_unlock(&sysfs_open_file_mutex);
-
-	sysfs_put_open_dirent(sd, NULL);
-}
-
-/* Sysfs attribute files are pollable.  The idea is that you read
- * the content and then you use 'poll' or 'select' to wait for
- * the content to change.  When the content changes (assuming the
- * manager for the kobject supports notification), poll will
- * return POLLERR|POLLPRI, and select will return the fd whether
- * it is waiting for read, write, or exceptions.
- * Once poll/select indicates that the value has changed, you
- * need to close and re-open the file, or seek to 0 and read again.
- * Reminder: this only works for attributes which actively support
- * it, and it is not possible to test an attribute from userspace
- * to see if it supports poll (Neither 'poll' nor 'select' return
- * an appropriate error code).  When in doubt, set a suitable timeout value.
- */
-static unsigned int sysfs_poll(struct file *filp, poll_table *wait)
-{
-	struct sysfs_open_file *of = sysfs_of(filp);
-	struct sysfs_dirent *attr_sd = filp->f_path.dentry->d_fsdata;
-	struct sysfs_open_dirent *od = attr_sd->s_attr.open;
-
-	/* need parent for the kobj, grab both */
-	if (!sysfs_get_active(attr_sd))
-		goto trigger;
-
-	poll_wait(filp, &od->poll, wait);
 
-	sysfs_put_active(attr_sd);
-
-	if (of->event != atomic_read(&od->event))
-		goto trigger;
-
-	return DEFAULT_POLLMASK;
-
- trigger:
-	return DEFAULT_POLLMASK|POLLERR|POLLPRI;
-}
-
-void sysfs_notify_dirent(struct sysfs_dirent *sd)
-{
-	struct sysfs_open_dirent *od;
-	unsigned long flags;
-
-	spin_lock_irqsave(&sysfs_open_dirent_lock, flags);
-
-	if (!WARN_ON(sysfs_type(sd) != SYSFS_KOBJ_ATTR)) {
-		od = sd->s_attr.open;
-		if (od) {
-			atomic_inc(&od->event);
-			wake_up_interruptible(&od->poll);
-		}
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	if (!attr->ignore_lockdep)
+		key = attr->key ?: (struct lock_class_key *)&attr->skey;
+#endif
+	kn = kernfs_create_file_ns_key(parent, attr->name, mode, size,
+				       ops, (void *)attr, ns, key);
+	if (IS_ERR(kn)) {
+		if (PTR_ERR(kn) == -EEXIST)
+			sysfs_warn_dup(parent, attr->name);
+		return PTR_ERR(kn);
 	}
-
-	spin_unlock_irqrestore(&sysfs_open_dirent_lock, flags);
-}
-EXPORT_SYMBOL_GPL(sysfs_notify_dirent);
-
-void sysfs_notify(struct kobject *k, const char *dir, const char *attr)
-{
-	struct sysfs_dirent *sd = k->sd;
-
-	mutex_lock(&sysfs_mutex);
-
-	if (sd && dir)
-		sd = sysfs_find_dirent(sd, dir, NULL);
-	if (sd && attr)
-		sd = sysfs_find_dirent(sd, attr, NULL);
-	if (sd)
-		sysfs_notify_dirent(sd);
-
-	mutex_unlock(&sysfs_mutex);
-}
-EXPORT_SYMBOL_GPL(sysfs_notify);
-
-const struct file_operations sysfs_file_operations = {
-	.read		= seq_read,
-	.write		= sysfs_write_file,
-	.llseek		= generic_file_llseek,
-	.open		= sysfs_open_file,
-	.release	= sysfs_release,
-	.poll		= sysfs_poll,
-};
-
-const struct file_operations sysfs_bin_operations = {
-	.read		= sysfs_bin_read,
-	.write		= sysfs_write_file,
-	.llseek		= generic_file_llseek,
-	.mmap		= sysfs_bin_mmap,
-	.open		= sysfs_open_file,
-	.release	= sysfs_release,
-	.poll		= sysfs_poll,
-};
-
-int sysfs_add_file_mode_ns(struct sysfs_dirent *dir_sd,
-			   const struct attribute *attr, int type,
-			   umode_t amode, const void *ns)
-{
-	umode_t mode = (amode & S_IALLUGO) | S_IFREG;
-	struct sysfs_addrm_cxt acxt;
-	struct sysfs_dirent *sd;
-	int rc;
-
-	sd = sysfs_new_dirent(attr->name, mode, type);
-	if (!sd)
-		return -ENOMEM;
-
-	sd->s_ns = ns;
-	sd->s_attr.attr = (void *)attr;
-	sysfs_dirent_init_lockdep(sd);
-
-	sysfs_addrm_start(&acxt);
-	rc = sysfs_add_one(&acxt, sd, dir_sd);
-	sysfs_addrm_finish(&acxt);
-
-	if (rc)
-		sysfs_put(sd);
-
-	return rc;
+	return 0;
 }
 
-
-int sysfs_add_file(struct sysfs_dirent *dir_sd, const struct attribute *attr,
-		   int type)
+int sysfs_add_file(struct kernfs_node *parent, const struct attribute *attr,
+		   bool is_bin)
 {
-	return sysfs_add_file_mode_ns(dir_sd, attr, type, attr->mode, NULL);
+	return sysfs_add_file_mode_ns(parent, attr, is_bin, attr->mode, NULL);
 }
 
 /**
@@ -877,8 +279,7 @@ int sysfs_create_file_ns(struct kobject *kobj, const struct attribute *attr,
 {
 	BUG_ON(!kobj || !kobj->sd || !attr);
 
-	return sysfs_add_file_mode_ns(kobj->sd, attr, SYSFS_KOBJ_ATTR,
-				      attr->mode, ns);
+	return sysfs_add_file_mode_ns(kobj->sd, attr, false, attr->mode, ns);
 
 }
 EXPORT_SYMBOL_GPL(sysfs_create_file_ns);
@@ -906,19 +307,21 @@ EXPORT_SYMBOL_GPL(sysfs_create_files);
 int sysfs_add_file_to_group(struct kobject *kobj,
 		const struct attribute *attr, const char *group)
 {
-	struct sysfs_dirent *dir_sd;
+	struct kernfs_node *parent;
 	int error;
 
-	if (group)
-		dir_sd = sysfs_get_dirent(kobj->sd, group);
-	else
-		dir_sd = sysfs_get(kobj->sd);
+	if (group) {
+		parent = kernfs_find_and_get(kobj->sd, group);
+	} else {
+		parent = kobj->sd;
+		kernfs_get(parent);
+	}
 
-	if (!dir_sd)
+	if (!parent)
 		return -ENOENT;
 
-	error = sysfs_add_file(dir_sd, attr, SYSFS_KOBJ_ATTR);
-	sysfs_put(dir_sd);
+	error = sysfs_add_file(parent, attr, false);
+	kernfs_put(parent);
 
 	return error;
 }
@@ -934,23 +337,20 @@ EXPORT_SYMBOL_GPL(sysfs_add_file_to_group);
 int sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr,
 		     umode_t mode)
 {
-	struct sysfs_dirent *sd;
+	struct kernfs_node *kn;
 	struct iattr newattrs;
 	int rc;
 
-	mutex_lock(&sysfs_mutex);
-
-	rc = -ENOENT;
-	sd = sysfs_find_dirent(kobj->sd, attr->name, NULL);
-	if (!sd)
-		goto out;
+	kn = kernfs_find_and_get(kobj->sd, attr->name);
+	if (!kn)
+		return -ENOENT;
 
-	newattrs.ia_mode = (mode & S_IALLUGO) | (sd->s_mode & ~S_IALLUGO);
+	newattrs.ia_mode = (mode & S_IALLUGO) | (kn->mode & ~S_IALLUGO);
 	newattrs.ia_valid = ATTR_MODE;
-	rc = sysfs_sd_setattr(sd, &newattrs);
 
- out:
-	mutex_unlock(&sysfs_mutex);
+	rc = kernfs_setattr(kn, &newattrs);
+
+	kernfs_put(kn);
 	return rc;
 }
 EXPORT_SYMBOL_GPL(sysfs_chmod_file);
@@ -966,9 +366,9 @@ EXPORT_SYMBOL_GPL(sysfs_chmod_file);
 void sysfs_remove_file_ns(struct kobject *kobj, const struct attribute *attr,
 			  const void *ns)
 {
-	struct sysfs_dirent *dir_sd = kobj->sd;
+	struct kernfs_node *parent = kobj->sd;
 
-	sysfs_hash_and_remove(dir_sd, attr->name, ns);
+	kernfs_remove_by_name_ns(parent, attr->name, ns);
 }
 EXPORT_SYMBOL_GPL(sysfs_remove_file_ns);
 
@@ -989,15 +389,18 @@ EXPORT_SYMBOL_GPL(sysfs_remove_files);
 void sysfs_remove_file_from_group(struct kobject *kobj,
 		const struct attribute *attr, const char *group)
 {
-	struct sysfs_dirent *dir_sd;
+	struct kernfs_node *parent;
 
-	if (group)
-		dir_sd = sysfs_get_dirent(kobj->sd, group);
-	else
-		dir_sd = sysfs_get(kobj->sd);
-	if (dir_sd) {
-		sysfs_hash_and_remove(dir_sd, attr->name, NULL);
-		sysfs_put(dir_sd);
+	if (group) {
+		parent = kernfs_find_and_get(kobj->sd, group);
+	} else {
+		parent = kobj->sd;
+		kernfs_get(parent);
+	}
+
+	if (parent) {
+		kernfs_remove_by_name(parent, attr->name);
+		kernfs_put(parent);
 	}
 }
 EXPORT_SYMBOL_GPL(sysfs_remove_file_from_group);
@@ -1012,7 +415,7 @@ int sysfs_create_bin_file(struct kobject *kobj,
 {
 	BUG_ON(!kobj || !kobj->sd || !attr);
 
-	return sysfs_add_file(kobj->sd, &attr->attr, SYSFS_KOBJ_BIN_ATTR);
+	return sysfs_add_file(kobj->sd, &attr->attr, true);
 }
 EXPORT_SYMBOL_GPL(sysfs_create_bin_file);
 
@@ -1024,7 +427,7 @@ EXPORT_SYMBOL_GPL(sysfs_create_bin_file);
 void sysfs_remove_bin_file(struct kobject *kobj,
 			   const struct bin_attribute *attr)
 {
-	sysfs_hash_and_remove(kobj->sd, attr->attr.name, NULL);
+	kernfs_remove_by_name(kobj->sd, attr->attr.name);
 }
 EXPORT_SYMBOL_GPL(sysfs_remove_bin_file);
 
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index 1898a10e38ce..4d00d3996477 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -18,7 +18,7 @@
 #include "sysfs.h"
 
 
-static void remove_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
+static void remove_files(struct kernfs_node *parent, struct kobject *kobj,
 			 const struct attribute_group *grp)
 {
 	struct attribute *const *attr;
@@ -26,13 +26,13 @@ static void remove_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
 
 	if (grp->attrs)
 		for (attr = grp->attrs; *attr; attr++)
-			sysfs_hash_and_remove(dir_sd, (*attr)->name, NULL);
+			kernfs_remove_by_name(parent, (*attr)->name);
 	if (grp->bin_attrs)
 		for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++)
 			sysfs_remove_bin_file(kobj, *bin_attr);
 }
 
-static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
+static int create_files(struct kernfs_node *parent, struct kobject *kobj,
 			const struct attribute_group *grp, int update)
 {
 	struct attribute *const *attr;
@@ -49,22 +49,20 @@ static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
 			 * re-adding (if required) the file.
 			 */
 			if (update)
-				sysfs_hash_and_remove(dir_sd, (*attr)->name,
-						      NULL);
+				kernfs_remove_by_name(parent, (*attr)->name);
 			if (grp->is_visible) {
 				mode = grp->is_visible(kobj, *attr, i);
 				if (!mode)
 					continue;
 			}
-			error = sysfs_add_file_mode_ns(dir_sd, *attr,
-						       SYSFS_KOBJ_ATTR,
+			error = sysfs_add_file_mode_ns(parent, *attr, false,
 						       (*attr)->mode | mode,
 						       NULL);
 			if (unlikely(error))
 				break;
 		}
 		if (error) {
-			remove_files(dir_sd, kobj, grp);
+			remove_files(parent, kobj, grp);
 			goto exit;
 		}
 	}
@@ -78,7 +76,7 @@ static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
 				break;
 		}
 		if (error)
-			remove_files(dir_sd, kobj, grp);
+			remove_files(parent, kobj, grp);
 	}
 exit:
 	return error;
@@ -88,7 +86,7 @@ exit:
 static int internal_create_group(struct kobject *kobj, int update,
 				 const struct attribute_group *grp)
 {
-	struct sysfs_dirent *sd;
+	struct kernfs_node *kn;
 	int error;
 
 	BUG_ON(!kobj || (!update && !kobj->sd));
@@ -102,18 +100,21 @@ static int internal_create_group(struct kobject *kobj, int update,
 		return -EINVAL;
 	}
 	if (grp->name) {
-		error = sysfs_create_subdir(kobj, grp->name, &sd);
-		if (error)
-			return error;
+		kn = kernfs_create_dir(kobj->sd, grp->name, kobj);
+		if (IS_ERR(kn)) {
+			if (PTR_ERR(kn) == -EEXIST)
+				sysfs_warn_dup(kobj->sd, grp->name);
+			return PTR_ERR(kn);
+		}
 	} else
-		sd = kobj->sd;
-	sysfs_get(sd);
-	error = create_files(sd, kobj, grp, update);
+		kn = kobj->sd;
+	kernfs_get(kn);
+	error = create_files(kn, kobj, grp, update);
 	if (error) {
 		if (grp->name)
-			sysfs_remove(sd);
+			kernfs_remove(kn);
 	}
-	sysfs_put(sd);
+	kernfs_put(kn);
 	return error;
 }
 
@@ -203,25 +204,27 @@ EXPORT_SYMBOL_GPL(sysfs_update_group);
 void sysfs_remove_group(struct kobject *kobj,
 			const struct attribute_group *grp)
 {
-	struct sysfs_dirent *dir_sd = kobj->sd;
-	struct sysfs_dirent *sd;
+	struct kernfs_node *parent = kobj->sd;
+	struct kernfs_node *kn;
 
 	if (grp->name) {
-		sd = sysfs_get_dirent(dir_sd, grp->name);
-		if (!sd) {
-			WARN(!sd, KERN_WARNING
+		kn = kernfs_find_and_get(parent, grp->name);
+		if (!kn) {
+			WARN(!kn, KERN_WARNING
 			     "sysfs group %p not found for kobject '%s'\n",
 			     grp, kobject_name(kobj));
 			return;
 		}
-	} else
-		sd = sysfs_get(dir_sd);
+	} else {
+		kn = parent;
+		kernfs_get(kn);
+	}
 
-	remove_files(sd, kobj, grp);
+	remove_files(kn, kobj, grp);
 	if (grp->name)
-		sysfs_remove(sd);
+		kernfs_remove(kn);
 
-	sysfs_put(sd);
+	kernfs_put(kn);
 }
 EXPORT_SYMBOL_GPL(sysfs_remove_group);
 
@@ -257,22 +260,22 @@ EXPORT_SYMBOL_GPL(sysfs_remove_groups);
 int sysfs_merge_group(struct kobject *kobj,
 		       const struct attribute_group *grp)
 {
-	struct sysfs_dirent *dir_sd;
+	struct kernfs_node *parent;
 	int error = 0;
 	struct attribute *const *attr;
 	int i;
 
-	dir_sd = sysfs_get_dirent(kobj->sd, grp->name);
-	if (!dir_sd)
+	parent = kernfs_find_and_get(kobj->sd, grp->name);
+	if (!parent)
 		return -ENOENT;
 
 	for ((i = 0, attr = grp->attrs); *attr && !error; (++i, ++attr))
-		error = sysfs_add_file(dir_sd, *attr, SYSFS_KOBJ_ATTR);
+		error = sysfs_add_file(parent, *attr, false);
 	if (error) {
 		while (--i >= 0)
-			sysfs_hash_and_remove(dir_sd, (*--attr)->name, NULL);
+			kernfs_remove_by_name(parent, (*--attr)->name);
 	}
-	sysfs_put(dir_sd);
+	kernfs_put(parent);
 
 	return error;
 }
@@ -286,14 +289,14 @@ EXPORT_SYMBOL_GPL(sysfs_merge_group);
 void sysfs_unmerge_group(struct kobject *kobj,
 		       const struct attribute_group *grp)
 {
-	struct sysfs_dirent *dir_sd;
+	struct kernfs_node *parent;
 	struct attribute *const *attr;
 
-	dir_sd = sysfs_get_dirent(kobj->sd, grp->name);
-	if (dir_sd) {
+	parent = kernfs_find_and_get(kobj->sd, grp->name);
+	if (parent) {
 		for (attr = grp->attrs; *attr; ++attr)
-			sysfs_hash_and_remove(dir_sd, (*attr)->name, NULL);
-		sysfs_put(dir_sd);
+			kernfs_remove_by_name(parent, (*attr)->name);
+		kernfs_put(parent);
 	}
 }
 EXPORT_SYMBOL_GPL(sysfs_unmerge_group);
@@ -308,15 +311,15 @@ EXPORT_SYMBOL_GPL(sysfs_unmerge_group);
 int sysfs_add_link_to_group(struct kobject *kobj, const char *group_name,
 			    struct kobject *target, const char *link_name)
 {
-	struct sysfs_dirent *dir_sd;
+	struct kernfs_node *parent;
 	int error = 0;
 
-	dir_sd = sysfs_get_dirent(kobj->sd, group_name);
-	if (!dir_sd)
+	parent = kernfs_find_and_get(kobj->sd, group_name);
+	if (!parent)
 		return -ENOENT;
 
-	error = sysfs_create_link_sd(dir_sd, target, link_name);
-	sysfs_put(dir_sd);
+	error = sysfs_create_link_sd(parent, target, link_name);
+	kernfs_put(parent);
 
 	return error;
 }
@@ -331,12 +334,12 @@ EXPORT_SYMBOL_GPL(sysfs_add_link_to_group);
 void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name,
 				  const char *link_name)
 {
-	struct sysfs_dirent *dir_sd;
+	struct kernfs_node *parent;
 
-	dir_sd = sysfs_get_dirent(kobj->sd, group_name);
-	if (dir_sd) {
-		sysfs_hash_and_remove(dir_sd, link_name, NULL);
-		sysfs_put(dir_sd);
+	parent = kernfs_find_and_get(kobj->sd, group_name);
+	if (parent) {
+		kernfs_remove_by_name(parent, link_name);
+		kernfs_put(parent);
 	}
 }
 EXPORT_SYMBOL_GPL(sysfs_remove_link_from_group);
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
deleted file mode 100644
index 1750f790af3b..000000000000
--- a/fs/sysfs/inode.c
+++ /dev/null
@@ -1,331 +0,0 @@
-/*
- * fs/sysfs/inode.c - basic sysfs inode and dentry operations
- *
- * Copyright (c) 2001-3 Patrick Mochel
- * Copyright (c) 2007 SUSE Linux Products GmbH
- * Copyright (c) 2007 Tejun Heo <teheo@suse.de>
- *
- * This file is released under the GPLv2.
- *
- * Please see Documentation/filesystems/sysfs.txt for more information.
- */
-
-#undef DEBUG
-
-#include <linux/pagemap.h>
-#include <linux/namei.h>
-#include <linux/backing-dev.h>
-#include <linux/capability.h>
-#include <linux/errno.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/sysfs.h>
-#include <linux/xattr.h>
-#include <linux/security.h>
-#include "sysfs.h"
-
-static const struct address_space_operations sysfs_aops = {
-	.readpage	= simple_readpage,
-	.write_begin	= simple_write_begin,
-	.write_end	= simple_write_end,
-};
-
-static struct backing_dev_info sysfs_backing_dev_info = {
-	.name		= "sysfs",
-	.ra_pages	= 0,	/* No readahead */
-	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK,
-};
-
-static const struct inode_operations sysfs_inode_operations = {
-	.permission	= sysfs_permission,
-	.setattr	= sysfs_setattr,
-	.getattr	= sysfs_getattr,
-	.setxattr	= sysfs_setxattr,
-};
-
-int __init sysfs_inode_init(void)
-{
-	return bdi_init(&sysfs_backing_dev_info);
-}
-
-static struct sysfs_inode_attrs *sysfs_init_inode_attrs(struct sysfs_dirent *sd)
-{
-	struct sysfs_inode_attrs *attrs;
-	struct iattr *iattrs;
-
-	attrs = kzalloc(sizeof(struct sysfs_inode_attrs), GFP_KERNEL);
-	if (!attrs)
-		return NULL;
-	iattrs = &attrs->ia_iattr;
-
-	/* assign default attributes */
-	iattrs->ia_mode = sd->s_mode;
-	iattrs->ia_uid = GLOBAL_ROOT_UID;
-	iattrs->ia_gid = GLOBAL_ROOT_GID;
-	iattrs->ia_atime = iattrs->ia_mtime = iattrs->ia_ctime = CURRENT_TIME;
-
-	return attrs;
-}
-
-int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr *iattr)
-{
-	struct sysfs_inode_attrs *sd_attrs;
-	struct iattr *iattrs;
-	unsigned int ia_valid = iattr->ia_valid;
-
-	sd_attrs = sd->s_iattr;
-
-	if (!sd_attrs) {
-		/* setting attributes for the first time, allocate now */
-		sd_attrs = sysfs_init_inode_attrs(sd);
-		if (!sd_attrs)
-			return -ENOMEM;
-		sd->s_iattr = sd_attrs;
-	}
-	/* attributes were changed at least once in past */
-	iattrs = &sd_attrs->ia_iattr;
-
-	if (ia_valid & ATTR_UID)
-		iattrs->ia_uid = iattr->ia_uid;
-	if (ia_valid & ATTR_GID)
-		iattrs->ia_gid = iattr->ia_gid;
-	if (ia_valid & ATTR_ATIME)
-		iattrs->ia_atime = iattr->ia_atime;
-	if (ia_valid & ATTR_MTIME)
-		iattrs->ia_mtime = iattr->ia_mtime;
-	if (ia_valid & ATTR_CTIME)
-		iattrs->ia_ctime = iattr->ia_ctime;
-	if (ia_valid & ATTR_MODE) {
-		umode_t mode = iattr->ia_mode;
-		iattrs->ia_mode = sd->s_mode = mode;
-	}
-	return 0;
-}
-
-int sysfs_setattr(struct dentry *dentry, struct iattr *iattr)
-{
-	struct inode *inode = dentry->d_inode;
-	struct sysfs_dirent *sd = dentry->d_fsdata;
-	int error;
-
-	if (!sd)
-		return -EINVAL;
-
-	mutex_lock(&sysfs_mutex);
-	error = inode_change_ok(inode, iattr);
-	if (error)
-		goto out;
-
-	error = sysfs_sd_setattr(sd, iattr);
-	if (error)
-		goto out;
-
-	/* this ignores size changes */
-	setattr_copy(inode, iattr);
-
-out:
-	mutex_unlock(&sysfs_mutex);
-	return error;
-}
-
-static int sysfs_sd_setsecdata(struct sysfs_dirent *sd, void **secdata,
-			       u32 *secdata_len)
-{
-	struct sysfs_inode_attrs *iattrs;
-	void *old_secdata;
-	size_t old_secdata_len;
-
-	if (!sd->s_iattr) {
-		sd->s_iattr = sysfs_init_inode_attrs(sd);
-		if (!sd->s_iattr)
-			return -ENOMEM;
-	}
-
-	iattrs = sd->s_iattr;
-	old_secdata = iattrs->ia_secdata;
-	old_secdata_len = iattrs->ia_secdata_len;
-
-	iattrs->ia_secdata = *secdata;
-	iattrs->ia_secdata_len = *secdata_len;
-
-	*secdata = old_secdata;
-	*secdata_len = old_secdata_len;
-	return 0;
-}
-
-int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value,
-		size_t size, int flags)
-{
-	struct sysfs_dirent *sd = dentry->d_fsdata;
-	void *secdata;
-	int error;
-	u32 secdata_len = 0;
-
-	if (!sd)
-		return -EINVAL;
-
-	if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) {
-		const char *suffix = name + XATTR_SECURITY_PREFIX_LEN;
-		error = security_inode_setsecurity(dentry->d_inode, suffix,
-						value, size, flags);
-		if (error)
-			goto out;
-		error = security_inode_getsecctx(dentry->d_inode,
-						&secdata, &secdata_len);
-		if (error)
-			goto out;
-
-		mutex_lock(&sysfs_mutex);
-		error = sysfs_sd_setsecdata(sd, &secdata, &secdata_len);
-		mutex_unlock(&sysfs_mutex);
-
-		if (secdata)
-			security_release_secctx(secdata, secdata_len);
-	} else
-		return -EINVAL;
-out:
-	return error;
-}
-
-static inline void set_default_inode_attr(struct inode *inode, umode_t mode)
-{
-	inode->i_mode = mode;
-	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
-}
-
-static inline void set_inode_attr(struct inode *inode, struct iattr *iattr)
-{
-	inode->i_uid = iattr->ia_uid;
-	inode->i_gid = iattr->ia_gid;
-	inode->i_atime = iattr->ia_atime;
-	inode->i_mtime = iattr->ia_mtime;
-	inode->i_ctime = iattr->ia_ctime;
-}
-
-static void sysfs_refresh_inode(struct sysfs_dirent *sd, struct inode *inode)
-{
-	struct sysfs_inode_attrs *iattrs = sd->s_iattr;
-
-	inode->i_mode = sd->s_mode;
-	if (iattrs) {
-		/* sysfs_dirent has non-default attributes
-		 * get them from persistent copy in sysfs_dirent
-		 */
-		set_inode_attr(inode, &iattrs->ia_iattr);
-		security_inode_notifysecctx(inode,
-					    iattrs->ia_secdata,
-					    iattrs->ia_secdata_len);
-	}
-
-	if (sysfs_type(sd) == SYSFS_DIR)
-		set_nlink(inode, sd->s_dir.subdirs + 2);
-}
-
-int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
-		  struct kstat *stat)
-{
-	struct sysfs_dirent *sd = dentry->d_fsdata;
-	struct inode *inode = dentry->d_inode;
-
-	mutex_lock(&sysfs_mutex);
-	sysfs_refresh_inode(sd, inode);
-	mutex_unlock(&sysfs_mutex);
-
-	generic_fillattr(inode, stat);
-	return 0;
-}
-
-static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode)
-{
-	struct bin_attribute *bin_attr;
-
-	inode->i_private = sysfs_get(sd);
-	inode->i_mapping->a_ops = &sysfs_aops;
-	inode->i_mapping->backing_dev_info = &sysfs_backing_dev_info;
-	inode->i_op = &sysfs_inode_operations;
-
-	set_default_inode_attr(inode, sd->s_mode);
-	sysfs_refresh_inode(sd, inode);
-
-	/* initialize inode according to type */
-	switch (sysfs_type(sd)) {
-	case SYSFS_DIR:
-		inode->i_op = &sysfs_dir_inode_operations;
-		inode->i_fop = &sysfs_dir_operations;
-		break;
-	case SYSFS_KOBJ_ATTR:
-		inode->i_size = PAGE_SIZE;
-		inode->i_fop = &sysfs_file_operations;
-		break;
-	case SYSFS_KOBJ_BIN_ATTR:
-		bin_attr = sd->s_attr.bin_attr;
-		inode->i_size = bin_attr->size;
-		inode->i_fop = &sysfs_bin_operations;
-		break;
-	case SYSFS_KOBJ_LINK:
-		inode->i_op = &sysfs_symlink_inode_operations;
-		break;
-	default:
-		BUG();
-	}
-
-	unlock_new_inode(inode);
-}
-
-/**
- *	sysfs_get_inode - get inode for sysfs_dirent
- *	@sb: super block
- *	@sd: sysfs_dirent to allocate inode for
- *
- *	Get inode for @sd.  If such inode doesn't exist, a new inode
- *	is allocated and basics are initialized.  New inode is
- *	returned locked.
- *
- *	LOCKING:
- *	Kernel thread context (may sleep).
- *
- *	RETURNS:
- *	Pointer to allocated inode on success, NULL on failure.
- */
-struct inode *sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd)
-{
-	struct inode *inode;
-
-	inode = iget_locked(sb, sd->s_ino);
-	if (inode && (inode->i_state & I_NEW))
-		sysfs_init_inode(sd, inode);
-
-	return inode;
-}
-
-/*
- * The sysfs_dirent serves as both an inode and a directory entry for sysfs.
- * To prevent the sysfs inode numbers from being freed prematurely we take a
- * reference to sysfs_dirent from the sysfs inode.  A
- * super_operations.evict_inode() implementation is needed to drop that
- * reference upon inode destruction.
- */
-void sysfs_evict_inode(struct inode *inode)
-{
-	struct sysfs_dirent *sd  = inode->i_private;
-
-	truncate_inode_pages(&inode->i_data, 0);
-	clear_inode(inode);
-	sysfs_put(sd);
-}
-
-int sysfs_permission(struct inode *inode, int mask)
-{
-	struct sysfs_dirent *sd;
-
-	if (mask & MAY_NOT_BLOCK)
-		return -ECHILD;
-
-	sd = inode->i_private;
-
-	mutex_lock(&sysfs_mutex);
-	sysfs_refresh_inode(sd, inode);
-	mutex_unlock(&sysfs_mutex);
-
-	return generic_permission(inode, mask);
-}
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 834ec2cdb7a3..701a56f341c6 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -14,146 +14,41 @@
 
 #include <linux/fs.h>
 #include <linux/mount.h>
-#include <linux/pagemap.h>
 #include <linux/init.h>
-#include <linux/module.h>
-#include <linux/magic.h>
-#include <linux/slab.h>
 #include <linux/user_namespace.h>
 
 #include "sysfs.h"
 
-
-static struct vfsmount *sysfs_mnt;
-struct kmem_cache *sysfs_dir_cachep;
-
-static const struct super_operations sysfs_ops = {
-	.statfs		= simple_statfs,
-	.drop_inode	= generic_delete_inode,
-	.evict_inode	= sysfs_evict_inode,
-};
-
-struct sysfs_dirent sysfs_root = {
-	.s_name		= "",
-	.s_count	= ATOMIC_INIT(1),
-	.s_flags	= SYSFS_DIR | (KOBJ_NS_TYPE_NONE << SYSFS_NS_TYPE_SHIFT),
-	.s_mode		= S_IFDIR | S_IRUGO | S_IXUGO,
-	.s_ino		= 1,
-};
-
-static int sysfs_fill_super(struct super_block *sb, void *data, int silent)
-{
-	struct inode *inode;
-	struct dentry *root;
-
-	sb->s_blocksize = PAGE_CACHE_SIZE;
-	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
-	sb->s_magic = SYSFS_MAGIC;
-	sb->s_op = &sysfs_ops;
-	sb->s_time_gran = 1;
-
-	/* get root inode, initialize and unlock it */
-	mutex_lock(&sysfs_mutex);
-	inode = sysfs_get_inode(sb, &sysfs_root);
-	mutex_unlock(&sysfs_mutex);
-	if (!inode) {
-		pr_debug("sysfs: could not get root inode\n");
-		return -ENOMEM;
-	}
-
-	/* instantiate and link root dentry */
-	root = d_make_root(inode);
-	if (!root) {
-		pr_debug("%s: could not get root dentry!\n", __func__);
-		return -ENOMEM;
-	}
-	root->d_fsdata = &sysfs_root;
-	sb->s_root = root;
-	sb->s_d_op = &sysfs_dentry_ops;
-	return 0;
-}
-
-static int sysfs_test_super(struct super_block *sb, void *data)
-{
-	struct sysfs_super_info *sb_info = sysfs_info(sb);
-	struct sysfs_super_info *info = data;
-	enum kobj_ns_type type;
-	int found = 1;
-
-	for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) {
-		if (sb_info->ns[type] != info->ns[type])
-			found = 0;
-	}
-	return found;
-}
-
-static int sysfs_set_super(struct super_block *sb, void *data)
-{
-	int error;
-	error = set_anon_super(sb, data);
-	if (!error)
-		sb->s_fs_info = data;
-	return error;
-}
-
-static void free_sysfs_super_info(struct sysfs_super_info *info)
-{
-	int type;
-	for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++)
-		kobj_ns_drop(type, info->ns[type]);
-	kfree(info);
-}
+static struct kernfs_root *sysfs_root;
+struct kernfs_node *sysfs_root_kn;
 
 static struct dentry *sysfs_mount(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *data)
 {
-	struct sysfs_super_info *info;
-	enum kobj_ns_type type;
-	struct super_block *sb;
-	int error;
+	struct dentry *root;
+	void *ns;
 
 	if (!(flags & MS_KERNMOUNT)) {
 		if (!capable(CAP_SYS_ADMIN) && !fs_fully_visible(fs_type))
 			return ERR_PTR(-EPERM);
 
-		for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) {
-			if (!kobj_ns_current_may_mount(type))
-				return ERR_PTR(-EPERM);
-		}
-	}
-
-	info = kzalloc(sizeof(*info), GFP_KERNEL);
-	if (!info)
-		return ERR_PTR(-ENOMEM);
-
-	for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++)
-		info->ns[type] = kobj_ns_grab_current(type);
-
-	sb = sget(fs_type, sysfs_test_super, sysfs_set_super, flags, info);
-	if (IS_ERR(sb) || sb->s_fs_info != info)
-		free_sysfs_super_info(info);
-	if (IS_ERR(sb))
-		return ERR_CAST(sb);
-	if (!sb->s_root) {
-		error = sysfs_fill_super(sb, data, flags & MS_SILENT ? 1 : 0);
-		if (error) {
-			deactivate_locked_super(sb);
-			return ERR_PTR(error);
-		}
-		sb->s_flags |= MS_ACTIVE;
+		if (!kobj_ns_current_may_mount(KOBJ_NS_TYPE_NET))
+			return ERR_PTR(-EPERM);
 	}
 
-	return dget(sb->s_root);
+	ns = kobj_ns_grab_current(KOBJ_NS_TYPE_NET);
+	root = kernfs_mount_ns(fs_type, flags, sysfs_root, ns);
+	if (IS_ERR(root))
+		kobj_ns_drop(KOBJ_NS_TYPE_NET, ns);
+	return root;
 }
 
 static void sysfs_kill_sb(struct super_block *sb)
 {
-	struct sysfs_super_info *info = sysfs_info(sb);
-	/* Remove the superblock from fs_supers/s_instances
-	 * so we can't find it, before freeing sysfs_super_info.
-	 */
-	kill_anon_super(sb);
-	free_sysfs_super_info(info);
+	void *ns = (void *)kernfs_super_ns(sb);
+
+	kernfs_kill_sb(sb);
+	kobj_ns_drop(KOBJ_NS_TYPE_NET, ns);
 }
 
 static struct file_system_type sysfs_fs_type = {
@@ -165,48 +60,19 @@ static struct file_system_type sysfs_fs_type = {
 
 int __init sysfs_init(void)
 {
-	int err = -ENOMEM;
+	int err;
 
-	sysfs_dir_cachep = kmem_cache_create("sysfs_dir_cache",
-					      sizeof(struct sysfs_dirent),
-					      0, 0, NULL);
-	if (!sysfs_dir_cachep)
-		goto out;
+	sysfs_root = kernfs_create_root(NULL);
+	if (IS_ERR(sysfs_root))
+		return PTR_ERR(sysfs_root);
 
-	err = sysfs_inode_init();
-	if (err)
-		goto out_err;
+	sysfs_root_kn = sysfs_root->kn;
 
 	err = register_filesystem(&sysfs_fs_type);
-	if (!err) {
-		sysfs_mnt = kern_mount(&sysfs_fs_type);
-		if (IS_ERR(sysfs_mnt)) {
-			printk(KERN_ERR "sysfs: could not mount!\n");
-			err = PTR_ERR(sysfs_mnt);
-			sysfs_mnt = NULL;
-			unregister_filesystem(&sysfs_fs_type);
-			goto out_err;
-		}
-	} else
-		goto out_err;
-out:
-	return err;
-out_err:
-	kmem_cache_destroy(sysfs_dir_cachep);
-	sysfs_dir_cachep = NULL;
-	goto out;
-}
-
-#undef sysfs_get
-struct sysfs_dirent *sysfs_get(struct sysfs_dirent *sd)
-{
-	return __sysfs_get(sd);
-}
-EXPORT_SYMBOL_GPL(sysfs_get);
+	if (err) {
+		kernfs_destroy_root(sysfs_root);
+		return err;
+	}
 
-#undef sysfs_put
-void sysfs_put(struct sysfs_dirent *sd)
-{
-	__sysfs_put(sd);
+	return 0;
 }
-EXPORT_SYMBOL_GPL(sysfs_put);
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index 3ae3f1bf1a09..aecb15f84557 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -11,109 +11,73 @@
  */
 
 #include <linux/fs.h>
-#include <linux/gfp.h>
-#include <linux/mount.h>
 #include <linux/module.h>
 #include <linux/kobject.h>
-#include <linux/namei.h>
 #include <linux/mutex.h>
 #include <linux/security.h>
 
 #include "sysfs.h"
 
-static int sysfs_do_create_link_sd(struct sysfs_dirent *parent_sd,
-				   struct kobject *target,
+static int sysfs_do_create_link_sd(struct kernfs_node *parent,
+				   struct kobject *target_kobj,
 				   const char *name, int warn)
 {
-	struct sysfs_dirent *target_sd = NULL;
-	struct sysfs_dirent *sd = NULL;
-	struct sysfs_addrm_cxt acxt;
-	enum kobj_ns_type ns_type;
-	int error;
+	struct kernfs_node *kn, *target = NULL;
 
-	BUG_ON(!name || !parent_sd);
+	BUG_ON(!name || !parent);
 
 	/*
-	 * We don't own @target and it may be removed at any time.
+	 * We don't own @target_kobj and it may be removed at any time.
 	 * Synchronize using sysfs_symlink_target_lock.  See
 	 * sysfs_remove_dir() for details.
 	 */
 	spin_lock(&sysfs_symlink_target_lock);
-	if (target->sd)
-		target_sd = sysfs_get(target->sd);
+	if (target_kobj->sd) {
+		target = target_kobj->sd;
+		kernfs_get(target);
+	}
 	spin_unlock(&sysfs_symlink_target_lock);
 
-	error = -ENOENT;
-	if (!target_sd)
-		goto out_put;
-
-	error = -ENOMEM;
-	sd = sysfs_new_dirent(name, S_IFLNK|S_IRWXUGO, SYSFS_KOBJ_LINK);
-	if (!sd)
-		goto out_put;
+	if (!target)
+		return -ENOENT;
 
-	ns_type = sysfs_ns_type(parent_sd);
-	if (ns_type)
-		sd->s_ns = target_sd->s_ns;
-	sd->s_symlink.target_sd = target_sd;
-	target_sd = NULL;	/* reference is now owned by the symlink */
-
-	sysfs_addrm_start(&acxt);
-	/* Symlinks must be between directories with the same ns_type */
-	if (!ns_type ||
-	    (ns_type == sysfs_ns_type(sd->s_symlink.target_sd->s_parent))) {
-		if (warn)
-			error = sysfs_add_one(&acxt, sd, parent_sd);
-		else
-			error = __sysfs_add_one(&acxt, sd, parent_sd);
-	} else {
-		error = -EINVAL;
-		WARN(1, KERN_WARNING
-			"sysfs: symlink across ns_types %s/%s -> %s/%s\n",
-			parent_sd->s_name,
-			sd->s_name,
-			sd->s_symlink.target_sd->s_parent->s_name,
-			sd->s_symlink.target_sd->s_name);
-	}
-	sysfs_addrm_finish(&acxt);
+	kn = kernfs_create_link(parent, name, target);
+	kernfs_put(target);
 
-	if (error)
-		goto out_put;
+	if (!IS_ERR(kn))
+		return 0;
 
-	return 0;
-
- out_put:
-	sysfs_put(target_sd);
-	sysfs_put(sd);
-	return error;
+	if (warn && PTR_ERR(kn) == -EEXIST)
+		sysfs_warn_dup(parent, name);
+	return PTR_ERR(kn);
 }
 
 /**
  *	sysfs_create_link_sd - create symlink to a given object.
- *	@sd:		directory we're creating the link in.
+ *	@kn:		directory we're creating the link in.
  *	@target:	object we're pointing to.
  *	@name:		name of the symlink.
  */
-int sysfs_create_link_sd(struct sysfs_dirent *sd, struct kobject *target,
+int sysfs_create_link_sd(struct kernfs_node *kn, struct kobject *target,
 			 const char *name)
 {
-	return sysfs_do_create_link_sd(sd, target, name, 1);
+	return sysfs_do_create_link_sd(kn, target, name, 1);
 }
 
 static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target,
 				const char *name, int warn)
 {
-	struct sysfs_dirent *parent_sd = NULL;
+	struct kernfs_node *parent = NULL;
 
 	if (!kobj)
-		parent_sd = &sysfs_root;
+		parent = sysfs_root_kn;
 	else
-		parent_sd = kobj->sd;
+		parent = kobj->sd;
 
-	if (!parent_sd)
+	if (!parent)
 		return -EFAULT;
 
-	return sysfs_do_create_link_sd(parent_sd, target, name, warn);
+	return sysfs_do_create_link_sd(parent, target, name, warn);
 }
 
 /**
@@ -164,10 +128,10 @@ void sysfs_delete_link(struct kobject *kobj, struct kobject *targ,
 	 * sysfs_remove_dir() for details.
 	 */
 	spin_lock(&sysfs_symlink_target_lock);
-	if (targ->sd && sysfs_ns_type(kobj->sd))
-		ns = targ->sd->s_ns;
+	if (targ->sd && kernfs_ns_enabled(kobj->sd))
+		ns = targ->sd->ns;
 	spin_unlock(&sysfs_symlink_target_lock);
-	sysfs_hash_and_remove(kobj->sd, name, ns);
+	kernfs_remove_by_name_ns(kobj->sd, name, ns);
 }
 
 /**
@@ -177,14 +141,14 @@ void sysfs_delete_link(struct kobject *kobj, struct kobject *targ,
  */
 void sysfs_remove_link(struct kobject *kobj, const char *name)
 {
-	struct sysfs_dirent *parent_sd = NULL;
+	struct kernfs_node *parent = NULL;
 
 	if (!kobj)
-		parent_sd = &sysfs_root;
+		parent = sysfs_root_kn;
 	else
-		parent_sd = kobj->sd;
+		parent = kobj->sd;
 
-	sysfs_hash_and_remove(parent_sd, name, NULL);
+	kernfs_remove_by_name(parent, name);
 }
 EXPORT_SYMBOL_GPL(sysfs_remove_link);
 
@@ -201,130 +165,33 @@ EXPORT_SYMBOL_GPL(sysfs_remove_link);
 int sysfs_rename_link_ns(struct kobject *kobj, struct kobject *targ,
 			 const char *old, const char *new, const void *new_ns)
 {
-	struct sysfs_dirent *parent_sd, *sd = NULL;
+	struct kernfs_node *parent, *kn = NULL;
 	const void *old_ns = NULL;
 	int result;
 
 	if (!kobj)
-		parent_sd = &sysfs_root;
+		parent = sysfs_root_kn;
 	else
-		parent_sd = kobj->sd;
+		parent = kobj->sd;
 
 	if (targ->sd)
-		old_ns = targ->sd->s_ns;
+		old_ns = targ->sd->ns;
 
 	result = -ENOENT;
-	sd = sysfs_get_dirent_ns(parent_sd, old, old_ns);
-	if (!sd)
+	kn = kernfs_find_and_get_ns(parent, old, old_ns);
+	if (!kn)
 		goto out;
 
 	result = -EINVAL;
-	if (sysfs_type(sd) != SYSFS_KOBJ_LINK)
+	if (kernfs_type(kn) != KERNFS_LINK)
 		goto out;
-	if (sd->s_symlink.target_sd->s_dir.kobj != targ)
+	if (kn->symlink.target_kn->priv != targ)
 		goto out;
 
-	result = sysfs_rename(sd, parent_sd, new, new_ns);
+	result = kernfs_rename_ns(kn, parent, new, new_ns);
 
 out:
-	sysfs_put(sd);
+	kernfs_put(kn);
 	return result;
 }
 EXPORT_SYMBOL_GPL(sysfs_rename_link_ns);
-
-static int sysfs_get_target_path(struct sysfs_dirent *parent_sd,
-				 struct sysfs_dirent *target_sd, char *path)
-{
-	struct sysfs_dirent *base, *sd;
-	char *s = path;
-	int len = 0;
-
-	/* go up to the root, stop at the base */
-	base = parent_sd;
-	while (base->s_parent) {
-		sd = target_sd->s_parent;
-		while (sd->s_parent && base != sd)
-			sd = sd->s_parent;
-
-		if (base == sd)
-			break;
-
-		strcpy(s, "../");
-		s += 3;
-		base = base->s_parent;
-	}
-
-	/* determine end of target string for reverse fillup */
-	sd = target_sd;
-	while (sd->s_parent && sd != base) {
-		len += strlen(sd->s_name) + 1;
-		sd = sd->s_parent;
-	}
-
-	/* check limits */
-	if (len < 2)
-		return -EINVAL;
-	len--;
-	if ((s - path) + len > PATH_MAX)
-		return -ENAMETOOLONG;
-
-	/* reverse fillup of target string from target to base */
-	sd = target_sd;
-	while (sd->s_parent && sd != base) {
-		int slen = strlen(sd->s_name);
-
-		len -= slen;
-		strncpy(s + len, sd->s_name, slen);
-		if (len)
-			s[--len] = '/';
-
-		sd = sd->s_parent;
-	}
-
-	return 0;
-}
-
-static int sysfs_getlink(struct dentry *dentry, char *path)
-{
-	struct sysfs_dirent *sd = dentry->d_fsdata;
-	struct sysfs_dirent *parent_sd = sd->s_parent;
-	struct sysfs_dirent *target_sd = sd->s_symlink.target_sd;
-	int error;
-
-	mutex_lock(&sysfs_mutex);
-	error = sysfs_get_target_path(parent_sd, target_sd, path);
-	mutex_unlock(&sysfs_mutex);
-
-	return error;
-}
-
-static void *sysfs_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
-	int error = -ENOMEM;
-	unsigned long page = get_zeroed_page(GFP_KERNEL);
-	if (page) {
-		error = sysfs_getlink(dentry, (char *) page);
-		if (error < 0)
-			free_page((unsigned long)page);
-	}
-	nd_set_link(nd, error ? ERR_PTR(error) : (char *)page);
-	return NULL;
-}
-
-static void sysfs_put_link(struct dentry *dentry, struct nameidata *nd,
-			   void *cookie)
-{
-	char *page = nd_get_link(nd);
-	if (!IS_ERR(page))
-		free_page((unsigned long)page);
-}
-
-const struct inode_operations sysfs_symlink_inode_operations = {
-	.setxattr	= sysfs_setxattr,
-	.readlink	= generic_readlink,
-	.follow_link	= sysfs_follow_link,
-	.put_link	= sysfs_put_link,
-	.setattr	= sysfs_setattr,
-	.getattr	= sysfs_getattr,
-	.permission	= sysfs_permission,
-};
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index 0af09fbfb3f6..0e2f1cccb812 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -8,248 +8,36 @@
  * This file is released under the GPLv2.
  */
 
-#include <linux/lockdep.h>
-#include <linux/kobject_ns.h>
-#include <linux/fs.h>
-#include <linux/rbtree.h>
+#ifndef __SYSFS_INTERNAL_H
+#define __SYSFS_INTERNAL_H
 
-struct sysfs_open_dirent;
-
-/* type-specific structures for sysfs_dirent->s_* union members */
-struct sysfs_elem_dir {
-	struct kobject		*kobj;
-
-	unsigned long		subdirs;
-	/* children rbtree starts here and goes through sd->s_rb */
-	struct rb_root		children;
-};
-
-struct sysfs_elem_symlink {
-	struct sysfs_dirent	*target_sd;
-};
-
-struct sysfs_elem_attr {
-	union {
-		struct attribute	*attr;
-		struct bin_attribute	*bin_attr;
-	};
-	struct sysfs_open_dirent *open;
-};
-
-struct sysfs_inode_attrs {
-	struct iattr	ia_iattr;
-	void		*ia_secdata;
-	u32		ia_secdata_len;
-};
-
-/*
- * sysfs_dirent - the building block of sysfs hierarchy.  Each and
- * every sysfs node is represented by single sysfs_dirent.
- *
- * As long as s_count reference is held, the sysfs_dirent itself is
- * accessible.  Dereferencing s_elem or any other outer entity
- * requires s_active reference.
- */
-struct sysfs_dirent {
-	atomic_t		s_count;
-	atomic_t		s_active;
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-	struct lockdep_map	dep_map;
-#endif
-	struct sysfs_dirent	*s_parent;
-	const char		*s_name;
-
-	struct rb_node		s_rb;
-
-	union {
-		struct completion	*completion;
-		struct sysfs_dirent	*removed_list;
-	} u;
-
-	const void		*s_ns; /* namespace tag */
-	unsigned int		s_hash; /* ns + name hash */
-	union {
-		struct sysfs_elem_dir		s_dir;
-		struct sysfs_elem_symlink	s_symlink;
-		struct sysfs_elem_attr		s_attr;
-	};
-
-	unsigned short		s_flags;
-	umode_t			s_mode;
-	unsigned int		s_ino;
-	struct sysfs_inode_attrs *s_iattr;
-};
-
-#define SD_DEACTIVATED_BIAS		INT_MIN
-
-#define SYSFS_TYPE_MASK			0x00ff
-#define SYSFS_DIR			0x0001
-#define SYSFS_KOBJ_ATTR			0x0002
-#define SYSFS_KOBJ_BIN_ATTR		0x0004
-#define SYSFS_KOBJ_LINK			0x0008
-#define SYSFS_COPY_NAME			(SYSFS_DIR | SYSFS_KOBJ_LINK)
-#define SYSFS_ACTIVE_REF		(SYSFS_KOBJ_ATTR | SYSFS_KOBJ_BIN_ATTR)
-
-/* identify any namespace tag on sysfs_dirents */
-#define SYSFS_NS_TYPE_MASK		0xf00
-#define SYSFS_NS_TYPE_SHIFT		8
-
-#define SYSFS_FLAG_MASK			~(SYSFS_NS_TYPE_MASK|SYSFS_TYPE_MASK)
-#define SYSFS_FLAG_REMOVED		0x02000
-
-static inline unsigned int sysfs_type(struct sysfs_dirent *sd)
-{
-	return sd->s_flags & SYSFS_TYPE_MASK;
-}
-
-/*
- * Return any namespace tags on this dirent.
- * enum kobj_ns_type is defined in linux/kobject.h
- */
-static inline enum kobj_ns_type sysfs_ns_type(struct sysfs_dirent *sd)
-{
-	return (sd->s_flags & SYSFS_NS_TYPE_MASK) >> SYSFS_NS_TYPE_SHIFT;
-}
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-
-#define sysfs_dirent_init_lockdep(sd)				\
-do {								\
-	struct attribute *attr = sd->s_attr.attr;		\
-	struct lock_class_key *key = attr->key;			\
-	if (!key)						\
-		key = &attr->skey;				\
-								\
-	lockdep_init_map(&sd->dep_map, "s_active", key, 0);	\
-} while (0)
-
-/* Test for attributes that want to ignore lockdep for read-locking */
-static inline bool sysfs_ignore_lockdep(struct sysfs_dirent *sd)
-{
-	int type = sysfs_type(sd);
-
-	return (type == SYSFS_KOBJ_ATTR || type == SYSFS_KOBJ_BIN_ATTR) &&
-		sd->s_attr.attr->ignore_lockdep;
-}
-
-#else
-
-#define sysfs_dirent_init_lockdep(sd) do {} while (0)
-
-static inline bool sysfs_ignore_lockdep(struct sysfs_dirent *sd)
-{
-	return true;
-}
-
-#endif
-
-/*
- * Context structure to be used while adding/removing nodes.
- */
-struct sysfs_addrm_cxt {
-	struct sysfs_dirent	*removed;
-};
+#include <linux/sysfs.h>
 
 /*
  * mount.c
  */
-
-/*
- * Each sb is associated with a set of namespace tags (i.e.
- * the network namespace of the task which mounted this sysfs
- * instance).
- */
-struct sysfs_super_info {
-	void *ns[KOBJ_NS_TYPES];
-};
-#define sysfs_info(SB) ((struct sysfs_super_info *)(SB->s_fs_info))
-extern struct sysfs_dirent sysfs_root;
-extern struct kmem_cache *sysfs_dir_cachep;
+extern struct kernfs_node *sysfs_root_kn;
 
 /*
  * dir.c
  */
-extern struct mutex sysfs_mutex;
 extern spinlock_t sysfs_symlink_target_lock;
-extern const struct dentry_operations sysfs_dentry_ops;
-
-extern const struct file_operations sysfs_dir_operations;
-extern const struct inode_operations sysfs_dir_inode_operations;
 
-struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd);
-void sysfs_put_active(struct sysfs_dirent *sd);
-void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt);
-void sysfs_warn_dup(struct sysfs_dirent *parent, const char *name);
-int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd,
-		    struct sysfs_dirent *parent_sd);
-int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd,
-		  struct sysfs_dirent *parent_sd);
-void sysfs_remove(struct sysfs_dirent *sd);
-int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name,
-			  const void *ns);
-void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt);
-
-struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
-				       const unsigned char *name,
-				       const void *ns);
-struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type);
-
-void release_sysfs_dirent(struct sysfs_dirent *sd);
-
-int sysfs_create_subdir(struct kobject *kobj, const char *name,
-			struct sysfs_dirent **p_sd);
-
-int sysfs_rename(struct sysfs_dirent *sd, struct sysfs_dirent *new_parent_sd,
-		 const char *new_name, const void *new_ns);
-
-static inline struct sysfs_dirent *__sysfs_get(struct sysfs_dirent *sd)
-{
-	if (sd) {
-		WARN_ON(!atomic_read(&sd->s_count));
-		atomic_inc(&sd->s_count);
-	}
-	return sd;
-}
-#define sysfs_get(sd) __sysfs_get(sd)
-
-static inline void __sysfs_put(struct sysfs_dirent *sd)
-{
-	if (sd && atomic_dec_and_test(&sd->s_count))
-		release_sysfs_dirent(sd);
-}
-#define sysfs_put(sd) __sysfs_put(sd)
-
-/*
- * inode.c
- */
-struct inode *sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd);
-void sysfs_evict_inode(struct inode *inode);
-int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr *iattr);
-int sysfs_permission(struct inode *inode, int mask);
-int sysfs_setattr(struct dentry *dentry, struct iattr *iattr);
-int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
-		  struct kstat *stat);
-int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value,
-		   size_t size, int flags);
-int sysfs_inode_init(void);
+void sysfs_warn_dup(struct kernfs_node *parent, const char *name);
 
 /*
  * file.c
  */
-extern const struct file_operations sysfs_file_operations;
-extern const struct file_operations sysfs_bin_operations;
-
-int sysfs_add_file(struct sysfs_dirent *dir_sd,
-		   const struct attribute *attr, int type);
-
-int sysfs_add_file_mode_ns(struct sysfs_dirent *dir_sd,
-			   const struct attribute *attr, int type,
+int sysfs_add_file(struct kernfs_node *parent,
+		   const struct attribute *attr, bool is_bin);
+int sysfs_add_file_mode_ns(struct kernfs_node *parent,
+			   const struct attribute *attr, bool is_bin,
 			   umode_t amode, const void *ns);
-void sysfs_unmap_bin_file(struct sysfs_dirent *sd);
 
 /*
  * symlink.c
  */
-extern const struct inode_operations sysfs_symlink_inode_operations;
-int sysfs_create_link_sd(struct sysfs_dirent *sd, struct kobject *target,
+int sysfs_create_link_sd(struct kernfs_node *kn, struct kobject *target,
 			 const char *name);
+
+#endif	/* __SYSFS_INTERNAL_H */
diff --git a/include/linux/firmware.h b/include/linux/firmware.h
index e154c1005cd1..59529330efd6 100644
--- a/include/linux/firmware.h
+++ b/include/linux/firmware.h
@@ -68,4 +68,11 @@ static inline void release_firmware(const struct firmware *fw)
 
 #endif
 
+#ifdef CONFIG_FW_LOADER_USER_HELPER
+int request_firmware_direct(const struct firmware **fw, const char *name,
+			    struct device *device);
+#else
+#define request_firmware_direct	request_firmware
+#endif
+
 #endif
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
new file mode 100644
index 000000000000..e9c4e3a03960
--- /dev/null
+++ b/include/linux/kernfs.h
@@ -0,0 +1,356 @@
+/*
+ * kernfs.h - pseudo filesystem decoupled from vfs locking
+ *
+ * This file is released under the GPLv2.
+ */
+
+#ifndef __LINUX_KERNFS_H
+#define __LINUX_KERNFS_H
+
+#include <linux/kernel.h>
+#include <linux/err.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/idr.h>
+#include <linux/lockdep.h>
+#include <linux/rbtree.h>
+#include <linux/atomic.h>
+#include <linux/completion.h>
+
+struct file;
+struct iattr;
+struct seq_file;
+struct vm_area_struct;
+struct super_block;
+struct file_system_type;
+
+struct kernfs_open_node;
+struct kernfs_iattrs;
+
+enum kernfs_node_type {
+	KERNFS_DIR		= 0x0001,
+	KERNFS_FILE		= 0x0002,
+	KERNFS_LINK		= 0x0004,
+};
+
+#define KERNFS_TYPE_MASK	0x000f
+#define KERNFS_COPY_NAME	(KERNFS_DIR | KERNFS_LINK)
+#define KERNFS_ACTIVE_REF	KERNFS_FILE
+#define KERNFS_FLAG_MASK	~KERNFS_TYPE_MASK
+
+enum kernfs_node_flag {
+	KERNFS_REMOVED		= 0x0010,
+	KERNFS_NS		= 0x0020,
+	KERNFS_HAS_SEQ_SHOW	= 0x0040,
+	KERNFS_HAS_MMAP		= 0x0080,
+	KERNFS_LOCKDEP		= 0x0100,
+};
+
+/* type-specific structures for kernfs_node union members */
+struct kernfs_elem_dir {
+	unsigned long		subdirs;
+	/* children rbtree starts here and goes through kn->rb */
+	struct rb_root		children;
+
+	/*
+	 * The kernfs hierarchy this directory belongs to.  This fits
+	 * better directly in kernfs_node but is here to save space.
+	 */
+	struct kernfs_root	*root;
+};
+
+struct kernfs_elem_symlink {
+	struct kernfs_node	*target_kn;
+};
+
+struct kernfs_elem_attr {
+	const struct kernfs_ops	*ops;
+	struct kernfs_open_node	*open;
+	loff_t			size;
+};
+
+/*
+ * kernfs_node - the building block of kernfs hierarchy.  Each and every
+ * kernfs node is represented by single kernfs_node.  Most fields are
+ * private to kernfs and shouldn't be accessed directly by kernfs users.
+ *
+ * As long as a count reference is held, the kernfs_node itself is
+ * accessible.  Dereferencing elem or any other outer entity requires
+ * active reference.
+ */
+struct kernfs_node {
+	atomic_t		count;
+	atomic_t		active;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	struct lockdep_map	dep_map;
+#endif
+	/* the following two fields are published */
+	struct kernfs_node	*parent;
+	const char		*name;
+
+	struct rb_node		rb;
+
+	union {
+		struct completion	*completion;
+		struct kernfs_node	*removed_list;
+	} u;
+
+	const void		*ns;	/* namespace tag */
+	unsigned int		hash;	/* ns + name hash */
+	union {
+		struct kernfs_elem_dir		dir;
+		struct kernfs_elem_symlink	symlink;
+		struct kernfs_elem_attr		attr;
+	};
+
+	void			*priv;
+
+	unsigned short		flags;
+	umode_t			mode;
+	unsigned int		ino;
+	struct kernfs_iattrs	*iattr;
+};
+
+struct kernfs_root {
+	/* published fields */
+	struct kernfs_node	*kn;
+
+	/* private fields, do not use outside kernfs proper */
+	struct ida		ino_ida;
+};
+
+struct kernfs_open_file {
+	/* published fields */
+	struct kernfs_node	*kn;
+	struct file		*file;
+
+	/* private fields, do not use outside kernfs proper */
+	struct mutex		mutex;
+	int			event;
+	struct list_head	list;
+
+	bool			mmapped;
+	const struct vm_operations_struct *vm_ops;
+};
+
+struct kernfs_ops {
+	/*
+	 * Read is handled by either seq_file or raw read().
+	 *
+	 * If seq_show() is present, seq_file path is active.  Other seq
+	 * operations are optional and if not implemented, the behavior is
+	 * equivalent to single_open().  @sf->private points to the
+	 * associated kernfs_open_file.
+	 *
+	 * read() is bounced through kernel buffer and a read larger than
+	 * PAGE_SIZE results in partial operation of PAGE_SIZE.
+	 */
+	int (*seq_show)(struct seq_file *sf, void *v);
+
+	void *(*seq_start)(struct seq_file *sf, loff_t *ppos);
+	void *(*seq_next)(struct seq_file *sf, void *v, loff_t *ppos);
+	void (*seq_stop)(struct seq_file *sf, void *v);
+
+	ssize_t (*read)(struct kernfs_open_file *of, char *buf, size_t bytes,
+			loff_t off);
+
+	/*
+	 * write() is bounced through kernel buffer and a write larger than
+	 * PAGE_SIZE results in partial operation of PAGE_SIZE.
+	 */
+	ssize_t (*write)(struct kernfs_open_file *of, char *buf, size_t bytes,
+			 loff_t off);
+
+	int (*mmap)(struct kernfs_open_file *of, struct vm_area_struct *vma);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	struct lock_class_key	lockdep_key;
+#endif
+};
+
+#ifdef CONFIG_SYSFS
+
+static inline enum kernfs_node_type kernfs_type(struct kernfs_node *kn)
+{
+	return kn->flags & KERNFS_TYPE_MASK;
+}
+
+/**
+ * kernfs_enable_ns - enable namespace under a directory
+ * @kn: directory of interest, should be empty
+ *
+ * This is to be called right after @kn is created to enable namespace
+ * under it.  All children of @kn must have non-NULL namespace tags and
+ * only the ones which match the super_block's tag will be visible.
+ */
+static inline void kernfs_enable_ns(struct kernfs_node *kn)
+{
+	WARN_ON_ONCE(kernfs_type(kn) != KERNFS_DIR);
+	WARN_ON_ONCE(!RB_EMPTY_ROOT(&kn->dir.children));
+	kn->flags |= KERNFS_NS;
+}
+
+/**
+ * kernfs_ns_enabled - test whether namespace is enabled
+ * @kn: the node to test
+ *
+ * Test whether namespace filtering is enabled for the children of @kn.
+ */
+static inline bool kernfs_ns_enabled(struct kernfs_node *kn)
+{
+	return kn->flags & KERNFS_NS;
+}
+
+struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent,
+					   const char *name, const void *ns);
+void kernfs_get(struct kernfs_node *kn);
+void kernfs_put(struct kernfs_node *kn);
+
+struct kernfs_root *kernfs_create_root(void *priv);
+void kernfs_destroy_root(struct kernfs_root *root);
+
+struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent,
+					 const char *name, void *priv,
+					 const void *ns);
+struct kernfs_node *kernfs_create_file_ns_key(struct kernfs_node *parent,
+					      const char *name,
+					      umode_t mode, loff_t size,
+					      const struct kernfs_ops *ops,
+					      void *priv, const void *ns,
+					      struct lock_class_key *key);
+struct kernfs_node *kernfs_create_link(struct kernfs_node *parent,
+				       const char *name,
+				       struct kernfs_node *target);
+void kernfs_remove(struct kernfs_node *kn);
+int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name,
+			     const void *ns);
+int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
+		     const char *new_name, const void *new_ns);
+int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr);
+void kernfs_notify(struct kernfs_node *kn);
+
+const void *kernfs_super_ns(struct super_block *sb);
+struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags,
+			       struct kernfs_root *root, const void *ns);
+void kernfs_kill_sb(struct super_block *sb);
+
+void kernfs_init(void);
+
+#else	/* CONFIG_SYSFS */
+
+static inline enum kernfs_node_type kernfs_type(struct kernfs_node *kn)
+{ return 0; }	/* whatever */
+
+static inline void kernfs_enable_ns(struct kernfs_node *kn) { }
+
+static inline bool kernfs_ns_enabled(struct kernfs_node *kn)
+{ return false; }
+
+static inline struct kernfs_node *
+kernfs_find_and_get_ns(struct kernfs_node *parent, const char *name,
+		       const void *ns)
+{ return NULL; }
+
+static inline void kernfs_get(struct kernfs_node *kn) { }
+static inline void kernfs_put(struct kernfs_node *kn) { }
+
+static inline struct kernfs_root *kernfs_create_root(void *priv)
+{ return ERR_PTR(-ENOSYS); }
+
+static inline void kernfs_destroy_root(struct kernfs_root *root) { }
+
+static inline struct kernfs_node *
+kernfs_create_dir_ns(struct kernfs_node *parent, const char *name, void *priv,
+		     const void *ns)
+{ return ERR_PTR(-ENOSYS); }
+
+static inline struct kernfs_node *
+kernfs_create_file_ns_key(struct kernfs_node *parent, const char *name,
+			  umode_t mode, loff_t size,
+			  const struct kernfs_ops *ops, void *priv,
+			  const void *ns, struct lock_class_key *key)
+{ return ERR_PTR(-ENOSYS); }
+
+static inline struct kernfs_node *
+kernfs_create_link(struct kernfs_node *parent, const char *name,
+		   struct kernfs_node *target)
+{ return ERR_PTR(-ENOSYS); }
+
+static inline void kernfs_remove(struct kernfs_node *kn) { }
+
+static inline int kernfs_remove_by_name_ns(struct kernfs_node *kn,
+					   const char *name, const void *ns)
+{ return -ENOSYS; }
+
+static inline int kernfs_rename_ns(struct kernfs_node *kn,
+				   struct kernfs_node *new_parent,
+				   const char *new_name, const void *new_ns)
+{ return -ENOSYS; }
+
+static inline int kernfs_setattr(struct kernfs_node *kn,
+				 const struct iattr *iattr)
+{ return -ENOSYS; }
+
+static inline void kernfs_notify(struct kernfs_node *kn) { }
+
+static inline const void *kernfs_super_ns(struct super_block *sb)
+{ return NULL; }
+
+static inline struct dentry *
+kernfs_mount_ns(struct file_system_type *fs_type, int flags,
+		struct kernfs_root *root, const void *ns)
+{ return ERR_PTR(-ENOSYS); }
+
+static inline void kernfs_kill_sb(struct super_block *sb) { }
+
+static inline void kernfs_init(void) { }
+
+#endif	/* CONFIG_SYSFS */
+
+static inline struct kernfs_node *
+kernfs_find_and_get(struct kernfs_node *kn, const char *name)
+{
+	return kernfs_find_and_get_ns(kn, name, NULL);
+}
+
+static inline struct kernfs_node *
+kernfs_create_dir(struct kernfs_node *parent, const char *name, void *priv)
+{
+	return kernfs_create_dir_ns(parent, name, priv, NULL);
+}
+
+static inline struct kernfs_node *
+kernfs_create_file_ns(struct kernfs_node *parent, const char *name,
+		      umode_t mode, loff_t size, const struct kernfs_ops *ops,
+		      void *priv, const void *ns)
+{
+	struct lock_class_key *key = NULL;
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	key = (struct lock_class_key *)&ops->lockdep_key;
+#endif
+	return kernfs_create_file_ns_key(parent, name, mode, size, ops, priv,
+					 ns, key);
+}
+
+static inline struct kernfs_node *
+kernfs_create_file(struct kernfs_node *parent, const char *name, umode_t mode,
+		   loff_t size, const struct kernfs_ops *ops, void *priv)
+{
+	return kernfs_create_file_ns(parent, name, mode, size, ops, priv, NULL);
+}
+
+static inline int kernfs_remove_by_name(struct kernfs_node *parent,
+					const char *name)
+{
+	return kernfs_remove_by_name_ns(parent, name, NULL);
+}
+
+static inline struct dentry *
+kernfs_mount(struct file_system_type *fs_type, int flags,
+	     struct kernfs_root *root)
+{
+	return kernfs_mount_ns(fs_type, flags, root, NULL);
+}
+
+#endif	/* __LINUX_KERNFS_H */
diff --git a/include/linux/kobject.h b/include/linux/kobject.h
index e7ba650086ce..926afb6f6b5f 100644
--- a/include/linux/kobject.h
+++ b/include/linux/kobject.h
@@ -64,7 +64,7 @@ struct kobject {
 	struct kobject		*parent;
 	struct kset		*kset;
 	struct kobj_type	*ktype;
-	struct sysfs_dirent	*sd;
+	struct kernfs_node	*sd;
 	struct kref		kref;
 #ifdef CONFIG_DEBUG_KOBJECT_RELEASE
 	struct delayed_work	release;
diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h
index 6695040a0317..30b2ebee6439 100644
--- a/include/linux/sysfs.h
+++ b/include/linux/sysfs.h
@@ -12,6 +12,7 @@
 #ifndef _SYSFS_H_
 #define _SYSFS_H_
 
+#include <linux/kernfs.h>
 #include <linux/compiler.h>
 #include <linux/errno.h>
 #include <linux/list.h>
@@ -175,8 +176,6 @@ struct sysfs_ops {
 	ssize_t	(*store)(struct kobject *, struct attribute *, const char *, size_t);
 };
 
-struct sysfs_dirent;
-
 #ifdef CONFIG_SYSFS
 
 int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *),
@@ -244,12 +243,6 @@ void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name,
 				  const char *link_name);
 
 void sysfs_notify(struct kobject *kobj, const char *dir, const char *attr);
-void sysfs_notify_dirent(struct sysfs_dirent *sd);
-struct sysfs_dirent *sysfs_get_dirent_ns(struct sysfs_dirent *parent_sd,
-					 const unsigned char *name,
-					 const void *ns);
-struct sysfs_dirent *sysfs_get(struct sysfs_dirent *sd);
-void sysfs_put(struct sysfs_dirent *sd);
 
 int __must_check sysfs_init(void);
 
@@ -419,22 +412,6 @@ static inline void sysfs_notify(struct kobject *kobj, const char *dir,
 				const char *attr)
 {
 }
-static inline void sysfs_notify_dirent(struct sysfs_dirent *sd)
-{
-}
-static inline struct sysfs_dirent *
-sysfs_get_dirent_ns(struct sysfs_dirent *parent_sd, const unsigned char *name,
-		    const void *ns)
-{
-	return NULL;
-}
-static inline struct sysfs_dirent *sysfs_get(struct sysfs_dirent *sd)
-{
-	return NULL;
-}
-static inline void sysfs_put(struct sysfs_dirent *sd)
-{
-}
 
 static inline int __must_check sysfs_init(void)
 {
@@ -461,10 +438,26 @@ static inline int sysfs_rename_link(struct kobject *kobj, struct kobject *target
 	return sysfs_rename_link_ns(kobj, target, old_name, new_name, NULL);
 }
 
-static inline struct sysfs_dirent *
-sysfs_get_dirent(struct sysfs_dirent *parent_sd, const unsigned char *name)
+static inline void sysfs_notify_dirent(struct kernfs_node *kn)
+{
+	kernfs_notify(kn);
+}
+
+static inline struct kernfs_node *sysfs_get_dirent(struct kernfs_node *parent,
+						   const unsigned char *name)
+{
+	return kernfs_find_and_get(parent, name);
+}
+
+static inline struct kernfs_node *sysfs_get(struct kernfs_node *kn)
+{
+	kernfs_get(kn);
+	return kn;
+}
+
+static inline void sysfs_put(struct kernfs_node *kn)
 {
-	return sysfs_get_dirent_ns(parent_sd, name, NULL);
+	kernfs_put(kn);
 }
 
 #endif /* _SYSFS_H_ */
diff --git a/lib/kobject.c b/lib/kobject.c
index 5b4b8886435e..064451f2a6c3 100644
--- a/lib/kobject.c
+++ b/lib/kobject.c
@@ -18,6 +18,7 @@
 #include <linux/export.h>
 #include <linux/stat.h>
 #include <linux/slab.h>
+#include <linux/random.h>
 
 /**
  * kobject_namespace - return @kobj's namespace tag
@@ -65,13 +66,17 @@ static int populate_dir(struct kobject *kobj)
 
 static int create_dir(struct kobject *kobj)
 {
+	const struct kobj_ns_type_operations *ops;
 	int error;
 
 	error = sysfs_create_dir_ns(kobj, kobject_namespace(kobj));
-	if (!error) {
-		error = populate_dir(kobj);
-		if (error)
-			sysfs_remove_dir(kobj);
+	if (error)
+		return error;
+
+	error = populate_dir(kobj);
+	if (error) {
+		sysfs_remove_dir(kobj);
+		return error;
 	}
 
 	/*
@@ -80,7 +85,20 @@ static int create_dir(struct kobject *kobj)
 	 */
 	sysfs_get(kobj->sd);
 
-	return error;
+	/*
+	 * If @kobj has ns_ops, its children need to be filtered based on
+	 * their namespace tags.  Enable namespace support on @kobj->sd.
+	 */
+	ops = kobj_child_ns_ops(kobj);
+	if (ops) {
+		BUG_ON(ops->type <= KOBJ_NS_TYPE_NONE);
+		BUG_ON(ops->type >= KOBJ_NS_TYPES);
+		BUG_ON(!kobj_ns_type_registered(ops->type));
+
+		kernfs_enable_ns(kobj->sd);
+	}
+
+	return 0;
 }
 
 static int get_kobj_path_length(struct kobject *kobj)
@@ -247,8 +265,10 @@ int kobject_set_name_vargs(struct kobject *kobj, const char *fmt,
 		return 0;
 
 	kobj->name = kvasprintf(GFP_KERNEL, fmt, vargs);
-	if (!kobj->name)
+	if (!kobj->name) {
+		kobj->name = old_name;
 		return -ENOMEM;
+	}
 
 	/* ewww... some of these buggers have '/' in the name ... */
 	while ((s = strchr(kobj->name, '/')))
@@ -536,7 +556,7 @@ out:
  */
 void kobject_del(struct kobject *kobj)
 {
-	struct sysfs_dirent *sd;
+	struct kernfs_node *sd;
 
 	if (!kobj)
 		return;
@@ -625,10 +645,12 @@ static void kobject_release(struct kref *kref)
 {
 	struct kobject *kobj = container_of(kref, struct kobject, kref);
 #ifdef CONFIG_DEBUG_KOBJECT_RELEASE
-	pr_info("kobject: '%s' (%p): %s, parent %p (delayed)\n",
-		 kobject_name(kobj), kobj, __func__, kobj->parent);
+	unsigned long delay = HZ + HZ * (get_random_int() & 0x3);
+	pr_info("kobject: '%s' (%p): %s, parent %p (delayed %ld)\n",
+		 kobject_name(kobj), kobj, __func__, kobj->parent, delay);
 	INIT_DELAYED_WORK(&kobj->release, kobject_delayed_cleanup);
-	schedule_delayed_work(&kobj->release, HZ);
+
+	schedule_delayed_work(&kobj->release, delay);
 #else
 	kobject_cleanup(kobj);
 #endif
@@ -835,6 +857,7 @@ void kset_unregister(struct kset *k)
 {
 	if (!k)
 		return;
+	kobject_del(&k->kobj);
 	kobject_put(&k->kobj);
 }
 
diff --git a/samples/kobject/kset-example.c b/samples/kobject/kset-example.c
index d0c687fd9802..5dce351f131f 100644
--- a/samples/kobject/kset-example.c
+++ b/samples/kobject/kset-example.c
@@ -262,6 +262,7 @@ baz_error:
 bar_error:
 	destroy_foo_obj(foo_obj);
 foo_error:
+	kset_unregister(example_kset);
 	return -EINVAL;
 }
author	Stephen Rothwell <sfr@canb.auug.org.au>	2013-12-17 14:45:09 +1100
committer	Stephen Rothwell <sfr@canb.auug.org.au>	2013-12-17 14:45:09 +1100
commit	4fa9f39b102e94073a4a3fbb52014e75772060d4 (patch)
tree	dd243d8a30e5791da58446d75ff78b1e72916059
parent	ca7a7723217f08a9f8b40d958a915bf8887013b8 (diff)
parent	c637b8acbe079edb477d887041755b489036f146 (diff)