From ee86273062cbb310665fe49e1f1937d2cf85b0b9 Mon Sep 17 00:00:00 2001
From: Milan Broz <mbroz@redhat.com>
Date: Mon, 23 Aug 2010 15:16:00 +0200
Subject: loop: add some basic read-only sysfs attributes

Create /sys/block/loopX/loop directory and provide these attributes:
 - backing_file
 - autoclear
 - offset
 - sizelimit

This loop directory is present only if loop device is configured.

To be used in util-linux-ng (and possibly elsewhere like udev rules)
where code need to get loop attributes from kernel (and not store
duplicate info in userspace).

Moreover loop ioctls are not even able to provide full backing
file info because of buffer limits.

Signed-off-by: Milan Broz <mbroz@redhat.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 drivers/block/loop.c | 101 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 101 insertions(+)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index f3c636d23718..dc552308668e 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -74,6 +74,7 @@
 #include <linux/highmem.h>
 #include <linux/kthread.h>
 #include <linux/splice.h>
+#include <linux/sysfs.h>
 
 #include <asm/uaccess.h>
 
@@ -737,6 +738,103 @@ static inline int is_loop_device(struct file *file)
 	return i && S_ISBLK(i->i_mode) && MAJOR(i->i_rdev) == LOOP_MAJOR;
 }
 
+/* loop sysfs attributes */
+
+static ssize_t loop_attr_show(struct device *dev, char *page,
+			      ssize_t (*callback)(struct loop_device *, char *))
+{
+	struct loop_device *l, *lo = NULL;
+
+	mutex_lock(&loop_devices_mutex);
+	list_for_each_entry(l, &loop_devices, lo_list)
+		if (disk_to_dev(l->lo_disk) == dev) {
+			lo = l;
+			break;
+		}
+	mutex_unlock(&loop_devices_mutex);
+
+	return lo ? callback(lo, page) : -EIO;
+}
+
+#define LOOP_ATTR_RO(_name)						\
+static ssize_t loop_attr_##_name##_show(struct loop_device *, char *);	\
+static ssize_t loop_attr_do_show_##_name(struct device *d,		\
+				struct device_attribute *attr, char *b)	\
+{									\
+	return loop_attr_show(d, b, loop_attr_##_name##_show);		\
+}									\
+static struct device_attribute loop_attr_##_name =			\
+	__ATTR(_name, S_IRUGO, loop_attr_do_show_##_name, NULL);
+
+static ssize_t loop_attr_backing_file_show(struct loop_device *lo, char *buf)
+{
+	ssize_t ret;
+	char *p = NULL;
+
+	mutex_lock(&lo->lo_ctl_mutex);
+	if (lo->lo_backing_file)
+		p = d_path(&lo->lo_backing_file->f_path, buf, PAGE_SIZE - 1);
+	mutex_unlock(&lo->lo_ctl_mutex);
+
+	if (IS_ERR_OR_NULL(p))
+		ret = PTR_ERR(p);
+	else {
+		ret = strlen(p);
+		memmove(buf, p, ret);
+		buf[ret++] = '\n';
+		buf[ret] = 0;
+	}
+
+	return ret;
+}
+
+static ssize_t loop_attr_offset_show(struct loop_device *lo, char *buf)
+{
+	return sprintf(buf, "%llu\n", (unsigned long long)lo->lo_offset);
+}
+
+static ssize_t loop_attr_sizelimit_show(struct loop_device *lo, char *buf)
+{
+	return sprintf(buf, "%llu\n", (unsigned long long)lo->lo_sizelimit);
+}
+
+static ssize_t loop_attr_autoclear_show(struct loop_device *lo, char *buf)
+{
+	int autoclear = (lo->lo_flags & LO_FLAGS_AUTOCLEAR);
+
+	return sprintf(buf, "%s\n", autoclear ? "1" : "0");
+}
+
+LOOP_ATTR_RO(backing_file);
+LOOP_ATTR_RO(offset);
+LOOP_ATTR_RO(sizelimit);
+LOOP_ATTR_RO(autoclear);
+
+static struct attribute *loop_attrs[] = {
+	&loop_attr_backing_file.attr,
+	&loop_attr_offset.attr,
+	&loop_attr_sizelimit.attr,
+	&loop_attr_autoclear.attr,
+	NULL,
+};
+
+static struct attribute_group loop_attribute_group = {
+	.name = "loop",
+	.attrs= loop_attrs,
+};
+
+static int loop_sysfs_init(struct loop_device *lo)
+{
+	return sysfs_create_group(&disk_to_dev(lo->lo_disk)->kobj,
+				  &loop_attribute_group);
+}
+
+static void loop_sysfs_exit(struct loop_device *lo)
+{
+	sysfs_remove_group(&disk_to_dev(lo->lo_disk)->kobj,
+			   &loop_attribute_group);
+}
+
 static int loop_set_fd(struct loop_device *lo, fmode_t mode,
 		       struct block_device *bdev, unsigned int arg)
 {
@@ -836,6 +934,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
 
 	set_capacity(lo->lo_disk, size);
 	bd_set_size(bdev, size << 9);
+	loop_sysfs_init(lo);
 	/* let user-space know about the new size */
 	kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE);
 
@@ -854,6 +953,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
 	return 0;
 
 out_clr:
+	loop_sysfs_exit(lo);
 	lo->lo_thread = NULL;
 	lo->lo_device = NULL;
 	lo->lo_backing_file = NULL;
@@ -950,6 +1050,7 @@ static int loop_clr_fd(struct loop_device *lo, struct block_device *bdev)
 	set_capacity(lo->lo_disk, 0);
 	if (bdev) {
 		bd_set_size(bdev, 0);
+		loop_sysfs_exit(lo);
 		/* let user-space know about this change */
 		kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE);
 	}
-- 
cgit v1.2.3


From 0a25a5aee727c4a56c7d39e0e595947b02ee2696 Mon Sep 17 00:00:00 2001
From: "Stephen M. Cameron" <scameron@beardog.cce.hp.com>
Date: Thu, 26 Aug 2010 13:55:34 -0500
Subject: cciss: factor out cciss_getpciinfo

Signed-off-by: Stephen M. Cameron <scameron@beardog.cce.hp.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 drivers/block/cciss.c | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 31064df1370a..4fe5e427c480 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -1230,9 +1230,22 @@ static void check_ioctl_unit_attention(ctlr_info_t *h, CommandList_struct *c)
 			c->err_info->ScsiStatus != SAM_STAT_CHECK_CONDITION)
 		(void)check_for_unit_attention(h, c);
 }
-/*
- * ioctl
- */
+
+static int cciss_getpciinfo(ctlr_info_t *h, void __user *argp)
+{
+	cciss_pci_info_struct pciinfo;
+
+	if (!argp)
+		return -EINVAL;
+	pciinfo.domain = pci_domain_nr(h->pdev->bus);
+	pciinfo.bus = h->pdev->bus->number;
+	pciinfo.dev_fn = h->pdev->devfn;
+	pciinfo.board_id = h->board_id;
+	if (copy_to_user(argp, &pciinfo, sizeof(cciss_pci_info_struct)))
+		return -EFAULT;
+	return 0;
+}
+
 static int cciss_ioctl(struct block_device *bdev, fmode_t mode,
 		       unsigned int cmd, unsigned long arg)
 {
@@ -1245,20 +1258,7 @@ static int cciss_ioctl(struct block_device *bdev, fmode_t mode,
 		cmd, arg);
 	switch (cmd) {
 	case CCISS_GETPCIINFO:
-		{
-			cciss_pci_info_struct pciinfo;
-
-			if (!arg)
-				return -EINVAL;
-			pciinfo.domain = pci_domain_nr(h->pdev->bus);
-			pciinfo.bus = h->pdev->bus->number;
-			pciinfo.dev_fn = h->pdev->devfn;
-			pciinfo.board_id = h->board_id;
-			if (copy_to_user
-			    (argp, &pciinfo, sizeof(cciss_pci_info_struct)))
-				return -EFAULT;
-			return 0;
-		}
+		return cciss_getpciinfo(h, argp);
 	case CCISS_GETINTINFO:
 		{
 			cciss_coalint_struct intinfo;
-- 
cgit v1.2.3


From 576e661c658ab7d2a15cc12d5b8a1600db81ec0a Mon Sep 17 00:00:00 2001
From: "Stephen M. Cameron" <scameron@beardog.cce.hp.com>
Date: Thu, 26 Aug 2010 13:55:39 -0500
Subject: cciss: factor out cciss_getintinfo

Signed-off-by: Stephen M. Cameron <scameron@beardog.cce.hp.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 drivers/block/cciss.c | 28 +++++++++++++++-------------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 4fe5e427c480..fc761404cef7 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -1246,6 +1246,20 @@ static int cciss_getpciinfo(ctlr_info_t *h, void __user *argp)
 	return 0;
 }
 
+static int cciss_getintinfo(ctlr_info_t *h, void __user *argp)
+{
+	cciss_coalint_struct intinfo;
+
+	if (!argp)
+		return -EINVAL;
+	intinfo.delay = readl(&h->cfgtable->HostWrite.CoalIntDelay);
+	intinfo.count = readl(&h->cfgtable->HostWrite.CoalIntCount);
+	if (copy_to_user
+	    (argp, &intinfo, sizeof(cciss_coalint_struct)))
+		return -EFAULT;
+	return 0;
+}
+
 static int cciss_ioctl(struct block_device *bdev, fmode_t mode,
 		       unsigned int cmd, unsigned long arg)
 {
@@ -1260,19 +1274,7 @@ static int cciss_ioctl(struct block_device *bdev, fmode_t mode,
 	case CCISS_GETPCIINFO:
 		return cciss_getpciinfo(h, argp);
 	case CCISS_GETINTINFO:
-		{
-			cciss_coalint_struct intinfo;
-			if (!arg)
-				return -EINVAL;
-			intinfo.delay =
-			    readl(&h->cfgtable->HostWrite.CoalIntDelay);
-			intinfo.count =
-			    readl(&h->cfgtable->HostWrite.CoalIntCount);
-			if (copy_to_user
-			    (argp, &intinfo, sizeof(cciss_coalint_struct)))
-				return -EFAULT;
-			return 0;
-		}
+		return cciss_getintinfo(h, argp);
 	case CCISS_SETINTINFO:
 		{
 			cciss_coalint_struct intinfo;
-- 
cgit v1.2.3


From 4c800eed9a46f7b6a469d24e7e6051b23e62bb69 Mon Sep 17 00:00:00 2001
From: "Stephen M. Cameron" <scameron@beardog.cce.hp.com>
Date: Thu, 26 Aug 2010 13:55:44 -0500
Subject: cciss: factor out cciss_setintinfo

Signed-off-by: Stephen M. Cameron <scameron@beardog.cce.hp.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 drivers/block/cciss.c | 66 +++++++++++++++++++++++++--------------------------
 1 file changed, 32 insertions(+), 34 deletions(-)

diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index fc761404cef7..5119c8837cf5 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -1260,6 +1260,37 @@ static int cciss_getintinfo(ctlr_info_t *h, void __user *argp)
 	return 0;
 }
 
+static int cciss_setintinfo(ctlr_info_t *h, void __user *argp)
+{
+	cciss_coalint_struct intinfo;
+	unsigned long flags;
+	int i;
+
+	if (!argp)
+		return -EINVAL;
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	if (copy_from_user(&intinfo, argp, sizeof(intinfo)))
+		return -EFAULT;
+	if ((intinfo.delay == 0) && (intinfo.count == 0))
+		return -EINVAL;
+	spin_lock_irqsave(&h->lock, flags);
+	/* Update the field, and then ring the doorbell */
+	writel(intinfo.delay, &(h->cfgtable->HostWrite.CoalIntDelay));
+	writel(intinfo.count, &(h->cfgtable->HostWrite.CoalIntCount));
+	writel(CFGTBL_ChangeReq, h->vaddr + SA5_DOORBELL);
+
+	for (i = 0; i < MAX_IOCTL_CONFIG_WAIT; i++) {
+		if (!(readl(h->vaddr + SA5_DOORBELL) & CFGTBL_ChangeReq))
+			break;
+		udelay(1000); /* delay and try again */
+	}
+	spin_unlock_irqrestore(&h->lock, flags);
+	if (i >= MAX_IOCTL_CONFIG_WAIT)
+		return -EAGAIN;
+	return 0;
+}
+
 static int cciss_ioctl(struct block_device *bdev, fmode_t mode,
 		       unsigned int cmd, unsigned long arg)
 {
@@ -1276,40 +1307,7 @@ static int cciss_ioctl(struct block_device *bdev, fmode_t mode,
 	case CCISS_GETINTINFO:
 		return cciss_getintinfo(h, argp);
 	case CCISS_SETINTINFO:
-		{
-			cciss_coalint_struct intinfo;
-			unsigned long flags;
-			int i;
-
-			if (!arg)
-				return -EINVAL;
-			if (!capable(CAP_SYS_ADMIN))
-				return -EPERM;
-			if (copy_from_user
-			    (&intinfo, argp, sizeof(cciss_coalint_struct)))
-				return -EFAULT;
-			if ((intinfo.delay == 0) && (intinfo.count == 0))
-				return -EINVAL;
-			spin_lock_irqsave(&h->lock, flags);
-			/* Update the field, and then ring the doorbell */
-			writel(intinfo.delay,
-			       &(h->cfgtable->HostWrite.CoalIntDelay));
-			writel(intinfo.count,
-			       &(h->cfgtable->HostWrite.CoalIntCount));
-			writel(CFGTBL_ChangeReq, h->vaddr + SA5_DOORBELL);
-
-			for (i = 0; i < MAX_IOCTL_CONFIG_WAIT; i++) {
-				if (!(readl(h->vaddr + SA5_DOORBELL)
-				      & CFGTBL_ChangeReq))
-					break;
-				/* delay and try again */
-				udelay(1000);
-			}
-			spin_unlock_irqrestore(&h->lock, flags);
-			if (i >= MAX_IOCTL_CONFIG_WAIT)
-				return -EAGAIN;
-			return 0;
-		}
+		return cciss_setintinfo(h, argp);
 	case CCISS_GETNODENAME:
 		{
 			NodeName_type NodeName;
-- 
cgit v1.2.3


From 252161094255ffdc277b8a0e5e12b0a8ff67b10f Mon Sep 17 00:00:00 2001
From: "Stephen M. Cameron" <scameron@beardog.cce.hp.com>
Date: Thu, 26 Aug 2010 13:55:49 -0500
Subject: cciss: factor out cciss_getnodename

Signed-off-by: Stephen M. Cameron <scameron@beardog.cce.hp.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 drivers/block/cciss.c | 28 +++++++++++++++-------------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 5119c8837cf5..9a3dd20ea93a 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -1291,6 +1291,20 @@ static int cciss_setintinfo(ctlr_info_t *h, void __user *argp)
 	return 0;
 }
 
+static int cciss_getnodename(ctlr_info_t *h, void __user *argp)
+{
+	NodeName_type NodeName;
+	int i;
+
+	if (!argp)
+		return -EINVAL;
+	for (i = 0; i < 16; i++)
+		NodeName[i] = readb(&h->cfgtable->ServerName[i]);
+	if (copy_to_user(argp, NodeName, sizeof(NodeName_type)))
+		return -EFAULT;
+	return 0;
+}
+
 static int cciss_ioctl(struct block_device *bdev, fmode_t mode,
 		       unsigned int cmd, unsigned long arg)
 {
@@ -1309,19 +1323,7 @@ static int cciss_ioctl(struct block_device *bdev, fmode_t mode,
 	case CCISS_SETINTINFO:
 		return cciss_setintinfo(h, argp);
 	case CCISS_GETNODENAME:
-		{
-			NodeName_type NodeName;
-			int i;
-
-			if (!arg)
-				return -EINVAL;
-			for (i = 0; i < 16; i++)
-				NodeName[i] =
-				    readb(&h->cfgtable->ServerName[i]);
-			if (copy_to_user(argp, NodeName, sizeof(NodeName_type)))
-				return -EFAULT;
-			return 0;
-		}
+		return cciss_getnodename(h, argp);
 	case CCISS_SETNODENAME:
 		{
 			NodeName_type NodeName;
-- 
cgit v1.2.3


From 4f43f32cd3f65cfee5f30d7e6be55854cf33809b Mon Sep 17 00:00:00 2001
From: "Stephen M. Cameron" <scameron@beardog.cce.hp.com>
Date: Thu, 26 Aug 2010 13:55:54 -0500
Subject: cciss: factor out cciss_setnodename

Signed-off-by: Stephen M. Cameron <scameron@beardog.cce.hp.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 drivers/block/cciss.c | 65 +++++++++++++++++++++++----------------------------
 1 file changed, 29 insertions(+), 36 deletions(-)

diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 9a3dd20ea93a..8bbb1c81bf86 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -1305,6 +1305,34 @@ static int cciss_getnodename(ctlr_info_t *h, void __user *argp)
 	return 0;
 }
 
+static int cciss_setnodename(ctlr_info_t *h, void __user *argp)
+{
+	NodeName_type NodeName;
+	unsigned long flags;
+	int i;
+
+	if (!argp)
+		return -EINVAL;
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	if (copy_from_user(NodeName, argp, sizeof(NodeName_type)))
+		return -EFAULT;
+	spin_lock_irqsave(&h->lock, flags);
+	/* Update the field, and then ring the doorbell */
+	for (i = 0; i < 16; i++)
+		writeb(NodeName[i], &h->cfgtable->ServerName[i]);
+	writel(CFGTBL_ChangeReq, h->vaddr + SA5_DOORBELL);
+	for (i = 0; i < MAX_IOCTL_CONFIG_WAIT; i++) {
+		if (!(readl(h->vaddr + SA5_DOORBELL) & CFGTBL_ChangeReq))
+			break;
+		udelay(1000); /* delay and try again */
+	}
+	spin_unlock_irqrestore(&h->lock, flags);
+	if (i >= MAX_IOCTL_CONFIG_WAIT)
+		return -EAGAIN;
+	return 0;
+}
+
 static int cciss_ioctl(struct block_device *bdev, fmode_t mode,
 		       unsigned int cmd, unsigned long arg)
 {
@@ -1325,42 +1353,7 @@ static int cciss_ioctl(struct block_device *bdev, fmode_t mode,
 	case CCISS_GETNODENAME:
 		return cciss_getnodename(h, argp);
 	case CCISS_SETNODENAME:
-		{
-			NodeName_type NodeName;
-			unsigned long flags;
-			int i;
-
-			if (!arg)
-				return -EINVAL;
-			if (!capable(CAP_SYS_ADMIN))
-				return -EPERM;
-
-			if (copy_from_user
-			    (NodeName, argp, sizeof(NodeName_type)))
-				return -EFAULT;
-
-			spin_lock_irqsave(&h->lock, flags);
-
-			/* Update the field, and then ring the doorbell */
-			for (i = 0; i < 16; i++)
-				writeb(NodeName[i],
-				       &h->cfgtable->ServerName[i]);
-
-			writel(CFGTBL_ChangeReq, h->vaddr + SA5_DOORBELL);
-
-			for (i = 0; i < MAX_IOCTL_CONFIG_WAIT; i++) {
-				if (!(readl(h->vaddr + SA5_DOORBELL)
-				      & CFGTBL_ChangeReq))
-					break;
-				/* delay and try again */
-				udelay(1000);
-			}
-			spin_unlock_irqrestore(&h->lock, flags);
-			if (i >= MAX_IOCTL_CONFIG_WAIT)
-				return -EAGAIN;
-			return 0;
-		}
-
+		return cciss_setnodename(h, argp);
 	case CCISS_GETHEARTBEAT:
 		{
 			Heartbeat_type heartbeat;
-- 
cgit v1.2.3


From 93c74931131d3a7af3a6aa18aab0d39978b5d3b5 Mon Sep 17 00:00:00 2001
From: "Stephen M. Cameron" <scameron@beardog.cce.hp.com>
Date: Thu, 26 Aug 2010 13:55:59 -0500
Subject: cciss: factor out cciss_getheartbeat

Signed-off-by: Stephen M. Cameron <scameron@beardog.cce.hp.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 drivers/block/cciss.c | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 8bbb1c81bf86..e716fc9e4b3e 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -1333,6 +1333,18 @@ static int cciss_setnodename(ctlr_info_t *h, void __user *argp)
 	return 0;
 }
 
+static int cciss_getheartbeat(ctlr_info_t *h, void __user *argp)
+{
+	Heartbeat_type heartbeat;
+
+	if (!argp)
+		return -EINVAL;
+	heartbeat = readl(&h->cfgtable->HeartBeat);
+	if (copy_to_user(argp, &heartbeat, sizeof(Heartbeat_type)))
+		return -EFAULT;
+	return 0;
+}
+
 static int cciss_ioctl(struct block_device *bdev, fmode_t mode,
 		       unsigned int cmd, unsigned long arg)
 {
@@ -1355,17 +1367,7 @@ static int cciss_ioctl(struct block_device *bdev, fmode_t mode,
 	case CCISS_SETNODENAME:
 		return cciss_setnodename(h, argp);
 	case CCISS_GETHEARTBEAT:
-		{
-			Heartbeat_type heartbeat;
-
-			if (!arg)
-				return -EINVAL;
-			heartbeat = readl(&h->cfgtable->HeartBeat);
-			if (copy_to_user
-			    (argp, &heartbeat, sizeof(Heartbeat_type)))
-				return -EFAULT;
-			return 0;
-		}
+		return cciss_getheartbeat(h, argp);
 	case CCISS_GETBUSTYPES:
 		{
 			BusTypes_type BusTypes;
-- 
cgit v1.2.3


From d18dfad4e2bf6c2d2c5c104d882b14b9fee71b14 Mon Sep 17 00:00:00 2001
From: "Stephen M. Cameron" <scameron@beardog.cce.hp.com>
Date: Thu, 26 Aug 2010 13:56:05 -0500
Subject: cciss: factor out cciss_getbustypes

Signed-off-by: Stephen M. Cameron <scameron@beardog.cce.hp.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 drivers/block/cciss.c | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index e716fc9e4b3e..e7b650b7f548 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -1345,6 +1345,18 @@ static int cciss_getheartbeat(ctlr_info_t *h, void __user *argp)
 	return 0;
 }
 
+static int cciss_getbustypes(ctlr_info_t *h, void __user *argp)
+{
+	BusTypes_type BusTypes;
+
+	if (!argp)
+		return -EINVAL;
+	BusTypes = readl(&h->cfgtable->BusTypes);
+	if (copy_to_user(argp, &BusTypes, sizeof(BusTypes_type)))
+		return -EFAULT;
+	return 0;
+}
+
 static int cciss_ioctl(struct block_device *bdev, fmode_t mode,
 		       unsigned int cmd, unsigned long arg)
 {
@@ -1369,17 +1381,7 @@ static int cciss_ioctl(struct block_device *bdev, fmode_t mode,
 	case CCISS_GETHEARTBEAT:
 		return cciss_getheartbeat(h, argp);
 	case CCISS_GETBUSTYPES:
-		{
-			BusTypes_type BusTypes;
-
-			if (!arg)
-				return -EINVAL;
-			BusTypes = readl(&h->cfgtable->BusTypes);
-			if (copy_to_user
-			    (argp, &BusTypes, sizeof(BusTypes_type)))
-				return -EFAULT;
-			return 0;
-		}
+		return cciss_getbustypes(h, argp);
 	case CCISS_GETFIRMVER:
 		{
 			FirmwareVer_type firmware;
-- 
cgit v1.2.3


From 8a4f7fbfdd76a304c8f04dde6b8fd0e5c50bbe76 Mon Sep 17 00:00:00 2001
From: "Stephen M. Cameron" <scameron@beardog.cce.hp.com>
Date: Thu, 26 Aug 2010 13:56:10 -0500
Subject: cciss: factor out cciss_getfirmver

Signed-off-by: Stephen M. Cameron <scameron@beardog.cce.hp.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 drivers/block/cciss.c | 27 +++++++++++++++------------
 1 file changed, 15 insertions(+), 12 deletions(-)

diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index e7b650b7f548..c47e601f601c 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -1357,6 +1357,20 @@ static int cciss_getbustypes(ctlr_info_t *h, void __user *argp)
 	return 0;
 }
 
+static int cciss_getfirmver(ctlr_info_t *h, void __user *argp)
+{
+	FirmwareVer_type firmware;
+
+	if (!argp)
+		return -EINVAL;
+	memcpy(firmware, h->firm_ver, 4);
+
+	if (copy_to_user
+	    (argp, firmware, sizeof(FirmwareVer_type)))
+		return -EFAULT;
+	return 0;
+}
+
 static int cciss_ioctl(struct block_device *bdev, fmode_t mode,
 		       unsigned int cmd, unsigned long arg)
 {
@@ -1383,18 +1397,7 @@ static int cciss_ioctl(struct block_device *bdev, fmode_t mode,
 	case CCISS_GETBUSTYPES:
 		return cciss_getbustypes(h, argp);
 	case CCISS_GETFIRMVER:
-		{
-			FirmwareVer_type firmware;
-
-			if (!arg)
-				return -EINVAL;
-			memcpy(firmware, h->firm_ver, 4);
-
-			if (copy_to_user
-			    (argp, firmware, sizeof(FirmwareVer_type)))
-				return -EFAULT;
-			return 0;
-		}
+		return cciss_getfirmver(h, argp);
 	case CCISS_GETDRIVVER:
 		{
 			DriverVer_type DriverVer = DRIVER_VERSION;
-- 
cgit v1.2.3


From c525919ddf0b10ff0e1f528dd50d90edfdee4797 Mon Sep 17 00:00:00 2001
From: "Stephen M. Cameron" <scameron@beardog.cce.hp.com>
Date: Thu, 26 Aug 2010 13:56:15 -0500
Subject: cciss: factor out cciss_getdrivver

Signed-off-by: Stephen M. Cameron <scameron@beardog.cce.hp.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 drivers/block/cciss.c | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index c47e601f601c..71018b988595 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -1371,8 +1371,19 @@ static int cciss_getfirmver(ctlr_info_t *h, void __user *argp)
 	return 0;
 }
 
+static int cciss_getdrivver(ctlr_info_t *h, void __user *argp)
+{
+	DriverVer_type DriverVer = DRIVER_VERSION;
+
+	if (!argp)
+		return -EINVAL;
+	if (copy_to_user(argp, &DriverVer, sizeof(DriverVer_type)))
+		return -EFAULT;
+	return 0;
+}
+
 static int cciss_ioctl(struct block_device *bdev, fmode_t mode,
-		       unsigned int cmd, unsigned long arg)
+	unsigned int cmd, unsigned long arg)
 {
 	struct gendisk *disk = bdev->bd_disk;
 	ctlr_info_t *h = get_host(disk);
@@ -1399,18 +1410,7 @@ static int cciss_ioctl(struct block_device *bdev, fmode_t mode,
 	case CCISS_GETFIRMVER:
 		return cciss_getfirmver(h, argp);
 	case CCISS_GETDRIVVER:
-		{
-			DriverVer_type DriverVer = DRIVER_VERSION;
-
-			if (!arg)
-				return -EINVAL;
-
-			if (copy_to_user
-			    (argp, &DriverVer, sizeof(DriverVer_type)))
-				return -EFAULT;
-			return 0;
-		}
-
+		return cciss_getdrivver(h, argp);
 	case CCISS_DEREGDISK:
 	case CCISS_REGNEWD:
 	case CCISS_REVALIDVOLS:
-- 
cgit v1.2.3


From 0894b32c5c1444d25f3e988cf03415ce8dfc5142 Mon Sep 17 00:00:00 2001
From: "Stephen M. Cameron" <scameron@beardog.cce.hp.com>
Date: Thu, 26 Aug 2010 13:56:20 -0500
Subject: cciss: factor out cciss_getluninfo

Signed-off-by: Stephen M. Cameron <scameron@beardog.cce.hp.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 drivers/block/cciss.c | 32 ++++++++++++++++++--------------
 1 file changed, 18 insertions(+), 14 deletions(-)

diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 71018b988595..5abdce3ef70b 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -1382,12 +1382,27 @@ static int cciss_getdrivver(ctlr_info_t *h, void __user *argp)
 	return 0;
 }
 
+static int cciss_getluninfo(ctlr_info_t *h,
+	struct gendisk *disk, void __user *argp)
+{
+	LogvolInfo_struct luninfo;
+	drive_info_struct *drv = get_drv(disk);
+
+	if (!argp)
+		return -EINVAL;
+	memcpy(&luninfo.LunID, drv->LunID, sizeof(luninfo.LunID));
+	luninfo.num_opens = drv->usage_count;
+	luninfo.num_parts = 0;
+	if (copy_to_user(argp, &luninfo, sizeof(LogvolInfo_struct)))
+		return -EFAULT;
+	return 0;
+}
+
 static int cciss_ioctl(struct block_device *bdev, fmode_t mode,
 	unsigned int cmd, unsigned long arg)
 {
 	struct gendisk *disk = bdev->bd_disk;
 	ctlr_info_t *h = get_host(disk);
-	drive_info_struct *drv = get_drv(disk);
 	void __user *argp = (void __user *)arg;
 
 	dev_dbg(&h->pdev->dev, "cciss_ioctl: Called with cmd=%x %lx\n",
@@ -1415,19 +1430,8 @@ static int cciss_ioctl(struct block_device *bdev, fmode_t mode,
 	case CCISS_REGNEWD:
 	case CCISS_REVALIDVOLS:
 		return rebuild_lun_table(h, 0, 1);
-
-	case CCISS_GETLUNINFO:{
-			LogvolInfo_struct luninfo;
-
-			memcpy(&luninfo.LunID, drv->LunID,
-				sizeof(luninfo.LunID));
-			luninfo.num_opens = drv->usage_count;
-			luninfo.num_parts = 0;
-			if (copy_to_user(argp, &luninfo,
-					 sizeof(LogvolInfo_struct)))
-				return -EFAULT;
-			return 0;
-		}
+	case CCISS_GETLUNINFO:
+		return cciss_getluninfo(h, disk, argp);
 	case CCISS_PASSTHRU:
 		{
 			IOCTL_Command_struct iocommand;
-- 
cgit v1.2.3


From f32f125b1c14dcde49ec415ec941af750433251e Mon Sep 17 00:00:00 2001
From: "Stephen M. Cameron" <scameron@beardog.cce.hp.com>
Date: Thu, 26 Aug 2010 13:56:25 -0500
Subject: cciss: factor out cciss_passthru

Signed-off-by: Stephen M. Cameron <scameron@beardog.cce.hp.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 drivers/block/cciss.c | 212 ++++++++++++++++++++++++--------------------------
 1 file changed, 101 insertions(+), 111 deletions(-)

diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 5abdce3ef70b..076dbcfa9471 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -1398,6 +1398,106 @@ static int cciss_getluninfo(ctlr_info_t *h,
 	return 0;
 }
 
+static int cciss_passthru(ctlr_info_t *h, void __user *argp)
+{
+	IOCTL_Command_struct iocommand;
+	CommandList_struct *c;
+	char *buff = NULL;
+	u64bit temp64;
+	DECLARE_COMPLETION_ONSTACK(wait);
+
+	if (!argp)
+		return -EINVAL;
+
+	if (!capable(CAP_SYS_RAWIO))
+		return -EPERM;
+
+	if (copy_from_user
+	    (&iocommand, argp, sizeof(IOCTL_Command_struct)))
+		return -EFAULT;
+	if ((iocommand.buf_size < 1) &&
+	    (iocommand.Request.Type.Direction != XFER_NONE)) {
+		return -EINVAL;
+	}
+	if (iocommand.buf_size > 0) {
+		buff = kmalloc(iocommand.buf_size, GFP_KERNEL);
+		if (buff == NULL)
+			return -EFAULT;
+	}
+	if (iocommand.Request.Type.Direction == XFER_WRITE) {
+		/* Copy the data into the buffer we created */
+		if (copy_from_user(buff, iocommand.buf, iocommand.buf_size)) {
+			kfree(buff);
+			return -EFAULT;
+		}
+	} else {
+		memset(buff, 0, iocommand.buf_size);
+	}
+	c = cmd_special_alloc(h);
+	if (!c) {
+		kfree(buff);
+		return -ENOMEM;
+	}
+	/* Fill in the command type */
+	c->cmd_type = CMD_IOCTL_PEND;
+	/* Fill in Command Header */
+	c->Header.ReplyQueue = 0;   /* unused in simple mode */
+	if (iocommand.buf_size > 0) { /* buffer to fill */
+		c->Header.SGList = 1;
+		c->Header.SGTotal = 1;
+	} else { /* no buffers to fill */
+		c->Header.SGList = 0;
+		c->Header.SGTotal = 0;
+	}
+	c->Header.LUN = iocommand.LUN_info;
+	/* use the kernel address the cmd block for tag */
+	c->Header.Tag.lower = c->busaddr;
+
+	/* Fill in Request block */
+	c->Request = iocommand.Request;
+
+	/* Fill in the scatter gather information */
+	if (iocommand.buf_size > 0) {
+		temp64.val = pci_map_single(h->pdev, buff,
+			iocommand.buf_size, PCI_DMA_BIDIRECTIONAL);
+		c->SG[0].Addr.lower = temp64.val32.lower;
+		c->SG[0].Addr.upper = temp64.val32.upper;
+		c->SG[0].Len = iocommand.buf_size;
+		c->SG[0].Ext = 0;  /* we are not chaining */
+	}
+	c->waiting = &wait;
+
+	enqueue_cmd_and_start_io(h, c);
+	wait_for_completion(&wait);
+
+	/* unlock the buffers from DMA */
+	temp64.val32.lower = c->SG[0].Addr.lower;
+	temp64.val32.upper = c->SG[0].Addr.upper;
+	pci_unmap_single(h->pdev, (dma_addr_t) temp64.val, iocommand.buf_size,
+			 PCI_DMA_BIDIRECTIONAL);
+	check_ioctl_unit_attention(h, c);
+
+	/* Copy the error information out */
+	iocommand.error_info = *(c->err_info);
+	if (copy_to_user(argp, &iocommand, sizeof(IOCTL_Command_struct))) {
+		kfree(buff);
+		cmd_special_free(h, c);
+		return -EFAULT;
+	}
+
+	if (iocommand.Request.Type.Direction == XFER_READ) {
+		/* Copy the data out of the buffer we created */
+		if (copy_to_user(iocommand.buf, buff, iocommand.buf_size)) {
+			kfree(buff);
+			cmd_special_free(h, c);
+			return -EFAULT;
+		}
+	}
+	kfree(buff);
+	cmd_special_free(h, c);
+	return 0;
+}
+
 static int cciss_ioctl(struct block_device *bdev, fmode_t mode,
 	unsigned int cmd, unsigned long arg)
 {
@@ -1433,117 +1533,7 @@ static int cciss_ioctl(struct block_device *bdev, fmode_t mode,
 	case CCISS_GETLUNINFO:
 		return cciss_getluninfo(h, disk, argp);
 	case CCISS_PASSTHRU:
-		{
-			IOCTL_Command_struct iocommand;
-			CommandList_struct *c;
-			char *buff = NULL;
-			u64bit temp64;
-			DECLARE_COMPLETION_ONSTACK(wait);
-
-			if (!arg)
-				return -EINVAL;
-
-			if (!capable(CAP_SYS_RAWIO))
-				return -EPERM;
-
-			if (copy_from_user
-			    (&iocommand, argp, sizeof(IOCTL_Command_struct)))
-				return -EFAULT;
-			if ((iocommand.buf_size < 1) &&
-			    (iocommand.Request.Type.Direction != XFER_NONE)) {
-				return -EINVAL;
-			}
-#if 0				/* 'buf_size' member is 16-bits, and always smaller than kmalloc limit */
-			/* Check kmalloc limits */
-			if (iocommand.buf_size > 128000)
-				return -EINVAL;
-#endif
-			if (iocommand.buf_size > 0) {
-				buff = kmalloc(iocommand.buf_size, GFP_KERNEL);
-				if (buff == NULL)
-					return -EFAULT;
-			}
-			if (iocommand.Request.Type.Direction == XFER_WRITE) {
-				/* Copy the data into the buffer we created */
-				if (copy_from_user
-				    (buff, iocommand.buf, iocommand.buf_size)) {
-					kfree(buff);
-					return -EFAULT;
-				}
-			} else {
-				memset(buff, 0, iocommand.buf_size);
-			}
-			c = cmd_special_alloc(h);
-			if (!c) {
-				kfree(buff);
-				return -ENOMEM;
-			}
-			/* Fill in the command type */
-			c->cmd_type = CMD_IOCTL_PEND;
-			/* Fill in Command Header */
-			c->Header.ReplyQueue = 0;   /* unused in simple mode */
-			if (iocommand.buf_size > 0) /* buffer to fill */
-			{
-				c->Header.SGList = 1;
-				c->Header.SGTotal = 1;
-			} else /* no buffers to fill */
-			{
-				c->Header.SGList = 0;
-				c->Header.SGTotal = 0;
-			}
-			c->Header.LUN = iocommand.LUN_info;
-			/* use the kernel address the cmd block for tag */
-			c->Header.Tag.lower = c->busaddr;
-
-			/* Fill in Request block */
-			c->Request = iocommand.Request;
-
-			/* Fill in the scatter gather information */
-			if (iocommand.buf_size > 0) {
-				temp64.val = pci_map_single(h->pdev, buff,
-					iocommand.buf_size,
-					PCI_DMA_BIDIRECTIONAL);
-				c->SG[0].Addr.lower = temp64.val32.lower;
-				c->SG[0].Addr.upper = temp64.val32.upper;
-				c->SG[0].Len = iocommand.buf_size;
-				c->SG[0].Ext = 0;  /* we are not chaining */
-			}
-			c->waiting = &wait;
-
-			enqueue_cmd_and_start_io(h, c);
-			wait_for_completion(&wait);
-
-			/* unlock the buffers from DMA */
-			temp64.val32.lower = c->SG[0].Addr.lower;
-			temp64.val32.upper = c->SG[0].Addr.upper;
-			pci_unmap_single(h->pdev, (dma_addr_t) temp64.val,
-					 iocommand.buf_size,
-					 PCI_DMA_BIDIRECTIONAL);
-
-			check_ioctl_unit_attention(h, c);
-
-			/* Copy the error information out */
-			iocommand.error_info = *(c->err_info);
-			if (copy_to_user
-			    (argp, &iocommand, sizeof(IOCTL_Command_struct))) {
-				kfree(buff);
-				cmd_special_free(h, c);
-				return -EFAULT;
-			}
-
-			if (iocommand.Request.Type.Direction == XFER_READ) {
-				/* Copy the data out of the buffer we created */
-				if (copy_to_user
-				    (iocommand.buf, buff, iocommand.buf_size)) {
-					kfree(buff);
-					cmd_special_free(h, c);
-					return -EFAULT;
-				}
-			}
-			kfree(buff);
-			cmd_special_free(h, c);
-			return 0;
-		}
+		return cciss_passthru(h, argp);
 	case CCISS_BIG_PASSTHRU:{
 			BIG_IOCTL_Command_struct *ioc;
 			CommandList_struct *c;
-- 
cgit v1.2.3


From 0c9f5ba7cb7435ea4b99599de4af0729f0740647 Mon Sep 17 00:00:00 2001
From: "Stephen M. Cameron" <scameron@beardog.cce.hp.com>
Date: Thu, 26 Aug 2010 13:56:30 -0500
Subject: cciss: factor out cciss_big_passthru

Signed-off-by: Stephen M. Cameron <scameron@beardog.cce.hp.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 drivers/block/cciss.c | 307 +++++++++++++++++++++++++-------------------------
 1 file changed, 151 insertions(+), 156 deletions(-)

diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 076dbcfa9471..cff2fa1972cb 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -1498,6 +1498,155 @@ static int cciss_passthru(ctlr_info_t *h, void __user *argp)
 	return 0;
 }
 
+static int cciss_bigpassthru(ctlr_info_t *h, void __user *argp)
+{
+	BIG_IOCTL_Command_struct *ioc;
+	CommandList_struct *c;
+	unsigned char **buff = NULL;
+	int *buff_size = NULL;
+	u64bit temp64;
+	BYTE sg_used = 0;
+	int status = 0;
+	int i;
+	DECLARE_COMPLETION_ONSTACK(wait);
+	__u32 left;
+	__u32 sz;
+	BYTE __user *data_ptr;
+
+	if (!argp)
+		return -EINVAL;
+	if (!capable(CAP_SYS_RAWIO))
+		return -EPERM;
+	ioc = (BIG_IOCTL_Command_struct *)
+	    kmalloc(sizeof(*ioc), GFP_KERNEL);
+	if (!ioc) {
+		status = -ENOMEM;
+		goto cleanup1;
+	}
+	if (copy_from_user(ioc, argp, sizeof(*ioc))) {
+		status = -EFAULT;
+		goto cleanup1;
+	}
+	if ((ioc->buf_size < 1) &&
+	    (ioc->Request.Type.Direction != XFER_NONE)) {
+		status = -EINVAL;
+		goto cleanup1;
+	}
+	/* Check kmalloc limits  using all SGs */
+	if (ioc->malloc_size > MAX_KMALLOC_SIZE) {
+		status = -EINVAL;
+		goto cleanup1;
+	}
+	if (ioc->buf_size > ioc->malloc_size * MAXSGENTRIES) {
+		status = -EINVAL;
+		goto cleanup1;
+	}
+	buff = kzalloc(MAXSGENTRIES * sizeof(char *), GFP_KERNEL);
+	if (!buff) {
+		status = -ENOMEM;
+		goto cleanup1;
+	}
+	buff_size = kmalloc(MAXSGENTRIES * sizeof(int), GFP_KERNEL);
+	if (!buff_size) {
+		status = -ENOMEM;
+		goto cleanup1;
+	}
+	left = ioc->buf_size;
+	data_ptr = ioc->buf;
+	while (left) {
+		sz = (left > ioc->malloc_size) ? ioc->malloc_size : left;
+		buff_size[sg_used] = sz;
+		buff[sg_used] = kmalloc(sz, GFP_KERNEL);
+		if (buff[sg_used] == NULL) {
+			status = -ENOMEM;
+			goto cleanup1;
+		}
+		if (ioc->Request.Type.Direction == XFER_WRITE) {
+			if (copy_from_user(buff[sg_used], data_ptr, sz)) {
+				status = -EFAULT;
+				goto cleanup1;
+			}
+		} else {
+			memset(buff[sg_used], 0, sz);
+		}
+		left -= sz;
+		data_ptr += sz;
+		sg_used++;
+	}
+	c = cmd_special_alloc(h);
+	if (!c) {
+		status = -ENOMEM;
+		goto cleanup1;
+	}
+	c->cmd_type = CMD_IOCTL_PEND;
+	c->Header.ReplyQueue = 0;
+
+	if (ioc->buf_size > 0) {
+		c->Header.SGList = sg_used;
+		c->Header.SGTotal = sg_used;
+	} else {
+		c->Header.SGList = 0;
+		c->Header.SGTotal = 0;
+	}
+	c->Header.LUN = ioc->LUN_info;
+	c->Header.Tag.lower = c->busaddr;
+
+	c->Request = ioc->Request;
+	if (ioc->buf_size > 0) {
+		for (i = 0; i < sg_used; i++) {
+			temp64.val =
+			    pci_map_single(h->pdev, buff[i], buff_size[i],
+				    PCI_DMA_BIDIRECTIONAL);
+			c->SG[i].Addr.lower = temp64.val32.lower;
+			c->SG[i].Addr.upper = temp64.val32.upper;
+			c->SG[i].Len = buff_size[i];
+			c->SG[i].Ext = 0;	/* we are not chaining */
+		}
+	}
+	c->waiting = &wait;
+	enqueue_cmd_and_start_io(h, c);
+	wait_for_completion(&wait);
+	/* unlock the buffers from DMA */
+	for (i = 0; i < sg_used; i++) {
+		temp64.val32.lower = c->SG[i].Addr.lower;
+		temp64.val32.upper = c->SG[i].Addr.upper;
+		pci_unmap_single(h->pdev,
+			(dma_addr_t) temp64.val, buff_size[i],
+			PCI_DMA_BIDIRECTIONAL);
+	}
+	check_ioctl_unit_attention(h, c);
+	/* Copy the error information out */
+	ioc->error_info = *(c->err_info);
+	if (copy_to_user(argp, ioc, sizeof(*ioc))) {
+		cmd_special_free(h, c);
+		status = -EFAULT;
+		goto cleanup1;
+	}
+	if (ioc->Request.Type.Direction == XFER_READ) {
+		/* Copy the data out of the buffer we created */
+		BYTE __user *ptr = ioc->buf;
+		for (i = 0; i < sg_used; i++) {
+			if (copy_to_user(ptr, buff[i], buff_size[i])) {
+				cmd_special_free(h, c);
+				status = -EFAULT;
+				goto cleanup1;
+			}
+			ptr += buff_size[i];
+		}
+	}
+	cmd_special_free(h, c);
+	status = 0;
+cleanup1:
+	if (buff) {
+		for (i = 0; i < sg_used; i++)
+			kfree(buff[i]);
+		kfree(buff);
+	}
+	kfree(buff_size);
+	kfree(ioc);
+	return status;
+}
+
 static int cciss_ioctl(struct block_device *bdev, fmode_t mode,
 	unsigned int cmd, unsigned long arg)
 {
@@ -1534,162 +1683,8 @@ static int cciss_ioctl(struct block_device *bdev, fmode_t mode,
 		return cciss_getluninfo(h, disk, argp);
 	case CCISS_PASSTHRU:
 		return cciss_passthru(h, argp);
-	case CCISS_BIG_PASSTHRU:{
-			BIG_IOCTL_Command_struct *ioc;
-			CommandList_struct *c;
-			unsigned char **buff = NULL;
-			int *buff_size = NULL;
-			u64bit temp64;
-			BYTE sg_used = 0;
-			int status = 0;
-			int i;
-			DECLARE_COMPLETION_ONSTACK(wait);
-			__u32 left;
-			__u32 sz;
-			BYTE __user *data_ptr;
-
-			if (!arg)
-				return -EINVAL;
-			if (!capable(CAP_SYS_RAWIO))
-				return -EPERM;
-			ioc = (BIG_IOCTL_Command_struct *)
-			    kmalloc(sizeof(*ioc), GFP_KERNEL);
-			if (!ioc) {
-				status = -ENOMEM;
-				goto cleanup1;
-			}
-			if (copy_from_user(ioc, argp, sizeof(*ioc))) {
-				status = -EFAULT;
-				goto cleanup1;
-			}
-			if ((ioc->buf_size < 1) &&
-			    (ioc->Request.Type.Direction != XFER_NONE)) {
-				status = -EINVAL;
-				goto cleanup1;
-			}
-			/* Check kmalloc limits  using all SGs */
-			if (ioc->malloc_size > MAX_KMALLOC_SIZE) {
-				status = -EINVAL;
-				goto cleanup1;
-			}
-			if (ioc->buf_size > ioc->malloc_size * MAXSGENTRIES) {
-				status = -EINVAL;
-				goto cleanup1;
-			}
-			buff =
-			    kzalloc(MAXSGENTRIES * sizeof(char *), GFP_KERNEL);
-			if (!buff) {
-				status = -ENOMEM;
-				goto cleanup1;
-			}
-			buff_size = kmalloc(MAXSGENTRIES * sizeof(int),
-						   GFP_KERNEL);
-			if (!buff_size) {
-				status = -ENOMEM;
-				goto cleanup1;
-			}
-			left = ioc->buf_size;
-			data_ptr = ioc->buf;
-			while (left) {
-				sz = (left >
-				      ioc->malloc_size) ? ioc->
-				    malloc_size : left;
-				buff_size[sg_used] = sz;
-				buff[sg_used] = kmalloc(sz, GFP_KERNEL);
-				if (buff[sg_used] == NULL) {
-					status = -ENOMEM;
-					goto cleanup1;
-				}
-				if (ioc->Request.Type.Direction == XFER_WRITE) {
-					if (copy_from_user
-					    (buff[sg_used], data_ptr, sz)) {
-						status = -EFAULT;
-						goto cleanup1;
-					}
-				} else {
-					memset(buff[sg_used], 0, sz);
-				}
-				left -= sz;
-				data_ptr += sz;
-				sg_used++;
-			}
-			c = cmd_special_alloc(h);
-			if (!c) {
-				status = -ENOMEM;
-				goto cleanup1;
-			}
-			c->cmd_type = CMD_IOCTL_PEND;
-			c->Header.ReplyQueue = 0;
-
-			if (ioc->buf_size > 0) {
-				c->Header.SGList = sg_used;
-				c->Header.SGTotal = sg_used;
-			} else {
-				c->Header.SGList = 0;
-				c->Header.SGTotal = 0;
-			}
-			c->Header.LUN = ioc->LUN_info;
-			c->Header.Tag.lower = c->busaddr;
-
-			c->Request = ioc->Request;
-			if (ioc->buf_size > 0) {
-				for (i = 0; i < sg_used; i++) {
-					temp64.val =
-					    pci_map_single(h->pdev, buff[i],
-						    buff_size[i],
-						    PCI_DMA_BIDIRECTIONAL);
-					c->SG[i].Addr.lower =
-					    temp64.val32.lower;
-					c->SG[i].Addr.upper =
-					    temp64.val32.upper;
-					c->SG[i].Len = buff_size[i];
-					c->SG[i].Ext = 0;	/* we are not chaining */
-				}
-			}
-			c->waiting = &wait;
-			enqueue_cmd_and_start_io(h, c);
-			wait_for_completion(&wait);
-			/* unlock the buffers from DMA */
-			for (i = 0; i < sg_used; i++) {
-				temp64.val32.lower = c->SG[i].Addr.lower;
-				temp64.val32.upper = c->SG[i].Addr.upper;
-				pci_unmap_single(h->pdev,
-					(dma_addr_t) temp64.val, buff_size[i],
-					PCI_DMA_BIDIRECTIONAL);
-			}
-			check_ioctl_unit_attention(h, c);
-			/* Copy the error information out */
-			ioc->error_info = *(c->err_info);
-			if (copy_to_user(argp, ioc, sizeof(*ioc))) {
-				cmd_special_free(h, c);
-				status = -EFAULT;
-				goto cleanup1;
-			}
-			if (ioc->Request.Type.Direction == XFER_READ) {
-				/* Copy the data out of the buffer we created */
-				BYTE __user *ptr = ioc->buf;
-				for (i = 0; i < sg_used; i++) {
-					if (copy_to_user
-					    (ptr, buff[i], buff_size[i])) {
-						cmd_special_free(h, c);
-						status = -EFAULT;
-						goto cleanup1;
-					}
-					ptr += buff_size[i];
-				}
-			}
-			cmd_special_free(h, c);
-			status = 0;
-		      cleanup1:
-			if (buff) {
-				for (i = 0; i < sg_used; i++)
-					kfree(buff[i]);
-				kfree(buff);
-			}
-			kfree(buff_size);
-			kfree(ioc);
-			return status;
-		}
+	case CCISS_BIG_PASSTHRU:
+		return cciss_bigpassthru(h, argp);
 
 	/* scsi_cmd_ioctl handles these, below, though some are not */
 	/* very meaningful for cciss.  SG_IO is the main one people want. */
-- 
cgit v1.2.3


From fcfb5c0ce11fe474cd6b4f7e83e0cc25f220d911 Mon Sep 17 00:00:00 2001
From: "Stephen M. Cameron" <scameron@beardog.cce.hp.com>
Date: Thu, 26 Aug 2010 13:56:35 -0500
Subject: cciss: remove some superfluous tests from cciss_bigpassthru()

Signed-off-by: Stephen M. Cameron <scameron@beardog.cce.hp.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 drivers/block/cciss.c | 25 ++++++++-----------------
 1 file changed, 8 insertions(+), 17 deletions(-)

diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index cff2fa1972cb..df2ed4d6684d 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -1580,28 +1580,19 @@ static int cciss_bigpassthru(ctlr_info_t *h, void __user *argp)
 	}
 	c->cmd_type = CMD_IOCTL_PEND;
 	c->Header.ReplyQueue = 0;
-
-	if (ioc->buf_size > 0) {
-		c->Header.SGList = sg_used;
-		c->Header.SGTotal = sg_used;
-	} else {
-		c->Header.SGList = 0;
-		c->Header.SGTotal = 0;
-	}
+	c->Header.SGList = sg_used;
+	c->Header.SGTotal = sg_used;
 	c->Header.LUN = ioc->LUN_info;
 	c->Header.Tag.lower = c->busaddr;
 
 	c->Request = ioc->Request;
-	if (ioc->buf_size > 0) {
-		for (i = 0; i < sg_used; i++) {
-			temp64.val =
-			    pci_map_single(h->pdev, buff[i], buff_size[i],
+	for (i = 0; i < sg_used; i++) {
+		temp64.val = pci_map_single(h->pdev, buff[i], buff_size[i],
 				    PCI_DMA_BIDIRECTIONAL);
-			c->SG[i].Addr.lower = temp64.val32.lower;
-			c->SG[i].Addr.upper = temp64.val32.upper;
-			c->SG[i].Len = buff_size[i];
-			c->SG[i].Ext = 0;	/* we are not chaining */
-		}
+		c->SG[i].Addr.lower = temp64.val32.lower;
+		c->SG[i].Addr.upper = temp64.val32.upper;
+		c->SG[i].Len = buff_size[i];
+		c->SG[i].Ext = 0;	/* we are not chaining */
 	}
 	c->waiting = &wait;
 	enqueue_cmd_and_start_io(h, c);
-- 
cgit v1.2.3


From 0da2f50944976e890ccc9436ab88c0da87788d02 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 3 Sep 2010 11:56:16 +0200
Subject: ide: remove unnecessary blk_queue_flushing() test in do_ide_request()

Unplugging from a request function doesn't really help much (it's
already in the request_fn) and soon block layer will be updated to mix
barrier sequence with other commands, so there's no need to treat
queue flushing any differently.

ide was the only user of blk_queue_flushing().  Remove it.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Christoph Hellwig <hch@lst.de>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 drivers/ide/ide-io.c   | 13 -------------
 include/linux/blkdev.h |  1 -
 2 files changed, 14 deletions(-)

diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index a381be814070..999dac054bcc 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -441,19 +441,6 @@ void do_ide_request(struct request_queue *q)
 	struct request	*rq = NULL;
 	ide_startstop_t	startstop;
 
-	/*
-	 * drive is doing pre-flush, ordered write, post-flush sequence. even
-	 * though that is 3 requests, it must be seen as a single transaction.
-	 * we must not preempt this drive until that is complete
-	 */
-	if (blk_queue_flushing(q))
-		/*
-		 * small race where queue could get replugged during
-		 * the 3-request flush cycle, just yank the plug since
-		 * we want it to finish asap
-		 */
-		blk_remove_plug(q);
-
 	spin_unlock_irq(q->queue_lock);
 
 	/* HLD do_request() callback might sleep, make sure it's okay */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 2c54906f678f..015375c7d031 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -521,7 +521,6 @@ enum {
 #define blk_queue_nonrot(q)	test_bit(QUEUE_FLAG_NONROT, &(q)->queue_flags)
 #define blk_queue_io_stat(q)	test_bit(QUEUE_FLAG_IO_STAT, &(q)->queue_flags)
 #define blk_queue_add_random(q)	test_bit(QUEUE_FLAG_ADD_RANDOM, &(q)->queue_flags)
-#define blk_queue_flushing(q)	((q)->ordseq)
 #define blk_queue_stackable(q)	\
 	test_bit(QUEUE_FLAG_STACKABLE, &(q)->queue_flags)
 #define blk_queue_discard(q)	test_bit(QUEUE_FLAG_DISCARD, &(q)->queue_flags)
-- 
cgit v1.2.3


From 589d7ed02ade0d06a3510da2e15a7edfdb2ef3d8 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 3 Sep 2010 11:56:16 +0200
Subject: block/loop: queue ordered mode should be DRAIN_FLUSH

loop implements FLUSH using fsync but was incorrectly setting its
ordered mode to DRAIN.  Change it to DRAIN_FLUSH.  In practice, this
doesn't change anything as loop doesn't make use of the block layer
ordered implementation.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 drivers/block/loop.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index f3c636d23718..c3a4a2e176da 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -832,7 +832,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
 	lo->lo_queue->unplug_fn = loop_unplug;
 
 	if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync)
-		blk_queue_ordered(lo->lo_queue, QUEUE_ORDERED_DRAIN);
+		blk_queue_ordered(lo->lo_queue, QUEUE_ORDERED_DRAIN_FLUSH);
 
 	set_capacity(lo->lo_disk, size);
 	bd_set_size(bdev, size << 9);
-- 
cgit v1.2.3


From 6958f145459ca7ad9715024de97445addacb8510 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 3 Sep 2010 11:56:16 +0200
Subject: block: kill QUEUE_ORDERED_BY_TAG

Nobody is making meaningful use of ORDERED_BY_TAG now and queue
draining for barrier requests will be removed soon which will render
the advantage of tag ordering moot.  Kill ORDERED_BY_TAG.  The
following users are affected.

* brd: converted to ORDERED_DRAIN.
* virtio_blk: ORDERED_TAG path was already marked deprecated.  Removed.
* xen-blkfront: ORDERED_TAG case dropped.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Nick Piggin <npiggin@kernel.dk>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: Chris Wright <chrisw@sous-sol.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-barrier.c          | 35 +++++++----------------------------
 drivers/block/brd.c          |  2 +-
 drivers/block/virtio_blk.c   |  9 ---------
 drivers/block/xen-blkfront.c |  8 +++-----
 drivers/scsi/sd.c            |  4 +---
 include/linux/blkdev.h       | 17 +----------------
 6 files changed, 13 insertions(+), 62 deletions(-)

diff --git a/block/blk-barrier.c b/block/blk-barrier.c
index f0faefca032f..c807e9ca3a68 100644
--- a/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -26,10 +26,7 @@ int blk_queue_ordered(struct request_queue *q, unsigned ordered)
 	if (ordered != QUEUE_ORDERED_NONE &&
 	    ordered != QUEUE_ORDERED_DRAIN &&
 	    ordered != QUEUE_ORDERED_DRAIN_FLUSH &&
-	    ordered != QUEUE_ORDERED_DRAIN_FUA &&
-	    ordered != QUEUE_ORDERED_TAG &&
-	    ordered != QUEUE_ORDERED_TAG_FLUSH &&
-	    ordered != QUEUE_ORDERED_TAG_FUA) {
+	    ordered != QUEUE_ORDERED_DRAIN_FUA) {
 		printk(KERN_ERR "blk_queue_ordered: bad value %d\n", ordered);
 		return -EINVAL;
 	}
@@ -155,21 +152,9 @@ static inline bool start_ordered(struct request_queue *q, struct request **rqp)
 	 * For an empty barrier, there's no actual BAR request, which
 	 * in turn makes POSTFLUSH unnecessary.  Mask them off.
 	 */
-	if (!blk_rq_sectors(rq)) {
+	if (!blk_rq_sectors(rq))
 		q->ordered &= ~(QUEUE_ORDERED_DO_BAR |
 				QUEUE_ORDERED_DO_POSTFLUSH);
-		/*
-		 * Empty barrier on a write-through device w/ ordered
-		 * tag has no command to issue and without any command
-		 * to issue, ordering by tag can't be used.  Drain
-		 * instead.
-		 */
-		if ((q->ordered & QUEUE_ORDERED_BY_TAG) &&
-		    !(q->ordered & QUEUE_ORDERED_DO_PREFLUSH)) {
-			q->ordered &= ~QUEUE_ORDERED_BY_TAG;
-			q->ordered |= QUEUE_ORDERED_BY_DRAIN;
-		}
-	}
 
 	/* stash away the original request */
 	blk_dequeue_request(rq);
@@ -210,7 +195,7 @@ static inline bool start_ordered(struct request_queue *q, struct request **rqp)
 	} else
 		skip |= QUEUE_ORDSEQ_PREFLUSH;
 
-	if ((q->ordered & QUEUE_ORDERED_BY_DRAIN) && queue_in_flight(q))
+	if (queue_in_flight(q))
 		rq = NULL;
 	else
 		skip |= QUEUE_ORDSEQ_DRAIN;
@@ -257,16 +242,10 @@ bool blk_do_ordered(struct request_queue *q, struct request **rqp)
 	    rq != &q->pre_flush_rq && rq != &q->post_flush_rq)
 		return true;
 
-	if (q->ordered & QUEUE_ORDERED_BY_TAG) {
-		/* Ordered by tag.  Blocking the next barrier is enough. */
-		if (is_barrier && rq != &q->bar_rq)
-			*rqp = NULL;
-	} else {
-		/* Ordered by draining.  Wait for turn. */
-		WARN_ON(blk_ordered_req_seq(rq) < blk_ordered_cur_seq(q));
-		if (blk_ordered_req_seq(rq) > blk_ordered_cur_seq(q))
-			*rqp = NULL;
-	}
+	/* Ordered by draining.  Wait for turn. */
+	WARN_ON(blk_ordered_req_seq(rq) < blk_ordered_cur_seq(q));
+	if (blk_ordered_req_seq(rq) > blk_ordered_cur_seq(q))
+		*rqp = NULL;
 
 	return true;
 }
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 1c7f63792ff8..47a41272d26b 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -482,7 +482,7 @@ static struct brd_device *brd_alloc(int i)
 	if (!brd->brd_queue)
 		goto out_free_dev;
 	blk_queue_make_request(brd->brd_queue, brd_make_request);
-	blk_queue_ordered(brd->brd_queue, QUEUE_ORDERED_TAG);
+	blk_queue_ordered(brd->brd_queue, QUEUE_ORDERED_DRAIN);
 	blk_queue_max_hw_sectors(brd->brd_queue, 1024);
 	blk_queue_bounce_limit(brd->brd_queue, BLK_BOUNCE_ANY);
 
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 2aafafca2b13..79652809eee8 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -395,15 +395,6 @@ static int __devinit virtblk_probe(struct virtio_device *vdev)
 		 * to implement write barrier support.
 		 */
 		blk_queue_ordered(q, QUEUE_ORDERED_DRAIN_FLUSH);
-	} else if (virtio_has_feature(vdev, VIRTIO_BLK_F_BARRIER)) {
-		/*
-		 * If the BARRIER feature is supported the host expects us
-		 * to order request by tags.  This implies there is not
-		 * volatile write cache on the host, and that the host
-		 * never re-orders outstanding I/O.  This feature is not
-		 * useful for real life scenarious and deprecated.
-		 */
-		blk_queue_ordered(q, QUEUE_ORDERED_TAG);
 	} else {
 		/*
 		 * If the FLUSH feature is not supported we must assume that
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index ac1b682edecb..50ec6f834996 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -424,8 +424,7 @@ static int xlvbd_barrier(struct blkfront_info *info)
 	const char *barrier;
 
 	switch (info->feature_barrier) {
-	case QUEUE_ORDERED_DRAIN:	barrier = "enabled (drain)"; break;
-	case QUEUE_ORDERED_TAG:		barrier = "enabled (tag)"; break;
+	case QUEUE_ORDERED_DRAIN:	barrier = "enabled"; break;
 	case QUEUE_ORDERED_NONE:	barrier = "disabled"; break;
 	default:			return -EINVAL;
 	}
@@ -1078,8 +1077,7 @@ static void blkfront_connect(struct blkfront_info *info)
 	 * we're dealing with a very old backend which writes
 	 * synchronously; draining will do what needs to get done.
 	 *
-	 * If there are barriers, then we can do full queued writes
-	 * with tagged barriers.
+	 * If there are barriers, then we use flush.
 	 *
 	 * If barriers are not supported, then there's no much we can
 	 * do, so just set ordering to NONE.
@@ -1087,7 +1085,7 @@ static void blkfront_connect(struct blkfront_info *info)
 	if (err)
 		info->feature_barrier = QUEUE_ORDERED_DRAIN;
 	else if (barrier)
-		info->feature_barrier = QUEUE_ORDERED_TAG;
+		info->feature_barrier = QUEUE_ORDERED_DRAIN_FLUSH;
 	else
 		info->feature_barrier = QUEUE_ORDERED_NONE;
 
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 2714becc2eaf..cdfc51ab9cf2 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -2151,9 +2151,7 @@ static int sd_revalidate_disk(struct gendisk *disk)
 
 	/*
 	 * We now have all cache related info, determine how we deal
-	 * with ordered requests.  Note that as the current SCSI
-	 * dispatch function can alter request order, we cannot use
-	 * QUEUE_ORDERED_TAG_* even when ordered tag is supported.
+	 * with ordered requests.
 	 */
 	if (sdkp->WCE)
 		ordered = sdkp->DPOFUA
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 015375c7d031..7077bc0d6138 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -470,12 +470,7 @@ enum {
 	 * DRAIN	: ordering by draining is enough
 	 * DRAIN_FLUSH	: ordering by draining w/ pre and post flushes
 	 * DRAIN_FUA	: ordering by draining w/ pre flush and FUA write
-	 * TAG		: ordering by tag is enough
-	 * TAG_FLUSH	: ordering by tag w/ pre and post flushes
-	 * TAG_FUA	: ordering by tag w/ pre flush and FUA write
 	 */
-	QUEUE_ORDERED_BY_DRAIN		= 0x01,
-	QUEUE_ORDERED_BY_TAG		= 0x02,
 	QUEUE_ORDERED_DO_PREFLUSH	= 0x10,
 	QUEUE_ORDERED_DO_BAR		= 0x20,
 	QUEUE_ORDERED_DO_POSTFLUSH	= 0x40,
@@ -483,8 +478,7 @@ enum {
 
 	QUEUE_ORDERED_NONE		= 0x00,
 
-	QUEUE_ORDERED_DRAIN		= QUEUE_ORDERED_BY_DRAIN |
-					  QUEUE_ORDERED_DO_BAR,
+	QUEUE_ORDERED_DRAIN		= QUEUE_ORDERED_DO_BAR,
 	QUEUE_ORDERED_DRAIN_FLUSH	= QUEUE_ORDERED_DRAIN |
 					  QUEUE_ORDERED_DO_PREFLUSH |
 					  QUEUE_ORDERED_DO_POSTFLUSH,
@@ -492,15 +486,6 @@ enum {
 					  QUEUE_ORDERED_DO_PREFLUSH |
 					  QUEUE_ORDERED_DO_FUA,
 
-	QUEUE_ORDERED_TAG		= QUEUE_ORDERED_BY_TAG |
-					  QUEUE_ORDERED_DO_BAR,
-	QUEUE_ORDERED_TAG_FLUSH		= QUEUE_ORDERED_TAG |
-					  QUEUE_ORDERED_DO_PREFLUSH |
-					  QUEUE_ORDERED_DO_POSTFLUSH,
-	QUEUE_ORDERED_TAG_FUA		= QUEUE_ORDERED_TAG |
-					  QUEUE_ORDERED_DO_PREFLUSH |
-					  QUEUE_ORDERED_DO_FUA,
-
 	/*
 	 * Ordered operation sequence
 	 */
-- 
cgit v1.2.3


From 4913efe456c987057e5d36a3f0a55422a9072cae Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 3 Sep 2010 11:56:16 +0200
Subject: block: deprecate barrier and replace blk_queue_ordered() with
 blk_queue_flush()

Barrier is deemed too heavy and will soon be replaced by FLUSH/FUA
requests.  Deprecate barrier.  All REQ_HARDBARRIERs are failed with
-EOPNOTSUPP and blk_queue_ordered() is replaced with simpler
blk_queue_flush().

blk_queue_flush() takes combinations of REQ_FLUSH and FUA.  If a
device has write cache and can flush it, it should set REQ_FLUSH.  If
the device can handle FUA writes, it should also set REQ_FUA.

All blk_queue_ordered() users are converted.

* ORDERED_DRAIN is mapped to 0 which is the default value.
* ORDERED_DRAIN_FLUSH is mapped to REQ_FLUSH.
* ORDERED_DRAIN_FLUSH_FUA is mapped to REQ_FLUSH | REQ_FUA.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Boaz Harrosh <bharrosh@panasas.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Nick Piggin <npiggin@kernel.dk>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: Chris Wright <chrisw@sous-sol.org>
Cc: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Cc: Geert Uytterhoeven <Geert.Uytterhoeven@sonycom.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Alasdair G Kergon <agk@redhat.com>
Cc: Pierre Ossman <drzeus@drzeus.cx>
Cc: Stefan Weinhuber <wein@de.ibm.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-barrier.c          | 29 -----------------------------
 block/blk-core.c             |  6 ++++--
 block/blk-settings.c         | 20 ++++++++++++++++++++
 drivers/block/brd.c          |  1 -
 drivers/block/loop.c         |  2 +-
 drivers/block/osdblk.c       |  2 +-
 drivers/block/ps3disk.c      |  2 +-
 drivers/block/virtio_blk.c   | 25 +++++++++----------------
 drivers/block/xen-blkfront.c | 43 ++++++++++++-------------------------------
 drivers/ide/ide-disk.c       | 13 ++++++-------
 drivers/md/dm.c              |  2 +-
 drivers/mmc/card/queue.c     |  1 -
 drivers/s390/block/dasd.c    |  1 -
 drivers/scsi/sd.c            | 16 ++++++++--------
 include/linux/blkdev.h       |  6 ++++--
 15 files changed, 67 insertions(+), 102 deletions(-)

diff --git a/block/blk-barrier.c b/block/blk-barrier.c
index c807e9ca3a68..ed0aba5463ab 100644
--- a/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -9,35 +9,6 @@
 
 #include "blk.h"
 
-/**
- * blk_queue_ordered - does this queue support ordered writes
- * @q:        the request queue
- * @ordered:  one of QUEUE_ORDERED_*
- *
- * Description:
- *   For journalled file systems, doing ordered writes on a commit
- *   block instead of explicitly doing wait_on_buffer (which is bad
- *   for performance) can be a big win. Block drivers supporting this
- *   feature should call this function and indicate so.
- *
- **/
-int blk_queue_ordered(struct request_queue *q, unsigned ordered)
-{
-	if (ordered != QUEUE_ORDERED_NONE &&
-	    ordered != QUEUE_ORDERED_DRAIN &&
-	    ordered != QUEUE_ORDERED_DRAIN_FLUSH &&
-	    ordered != QUEUE_ORDERED_DRAIN_FUA) {
-		printk(KERN_ERR "blk_queue_ordered: bad value %d\n", ordered);
-		return -EINVAL;
-	}
-
-	q->ordered = ordered;
-	q->next_ordered = ordered;
-
-	return 0;
-}
-EXPORT_SYMBOL(blk_queue_ordered);
-
 /*
  * Cache flushing for ordered writes handling
  */
diff --git a/block/blk-core.c b/block/blk-core.c
index ee1a1e7e63cc..f06354183b29 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1203,11 +1203,13 @@ static int __make_request(struct request_queue *q, struct bio *bio)
 	const unsigned int ff = bio->bi_rw & REQ_FAILFAST_MASK;
 	int rw_flags;
 
-	if ((bio->bi_rw & REQ_HARDBARRIER) &&
-	    (q->next_ordered == QUEUE_ORDERED_NONE)) {
+	/* REQ_HARDBARRIER is no more */
+	if (WARN_ONCE(bio->bi_rw & REQ_HARDBARRIER,
+		"block: HARDBARRIER is deprecated, use FLUSH/FUA instead\n")) {
 		bio_endio(bio, -EOPNOTSUPP);
 		return 0;
 	}
+
 	/*
 	 * low level driver can indicate that it wants pages above a
 	 * certain limit bounced to low memory (ie for highmem, or even
diff --git a/block/blk-settings.c b/block/blk-settings.c
index a234f4bf1d6f..9b18afcfe925 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -794,6 +794,26 @@ void blk_queue_update_dma_alignment(struct request_queue *q, int mask)
 }
 EXPORT_SYMBOL(blk_queue_update_dma_alignment);
 
+/**
+ * blk_queue_flush - configure queue's cache flush capability
+ * @q:		the request queue for the device
+ * @flush:	0, REQ_FLUSH or REQ_FLUSH | REQ_FUA
+ *
+ * Tell block layer cache flush capability of @q.  If it supports
+ * flushing, REQ_FLUSH should be set.  If it supports bypassing
+ * write cache for individual writes, REQ_FUA should be set.
+ */
+void blk_queue_flush(struct request_queue *q, unsigned int flush)
+{
+	WARN_ON_ONCE(flush & ~(REQ_FLUSH | REQ_FUA));
+
+	if (WARN_ON_ONCE(!(flush & REQ_FLUSH) && (flush & REQ_FUA)))
+		flush &= ~REQ_FUA;
+
+	q->flush_flags = flush & (REQ_FLUSH | REQ_FUA);
+}
+EXPORT_SYMBOL_GPL(blk_queue_flush);
+
 static int __init blk_settings_init(void)
 {
 	blk_max_low_pfn = max_low_pfn - 1;
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 47a41272d26b..fa33f97722ba 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -482,7 +482,6 @@ static struct brd_device *brd_alloc(int i)
 	if (!brd->brd_queue)
 		goto out_free_dev;
 	blk_queue_make_request(brd->brd_queue, brd_make_request);
-	blk_queue_ordered(brd->brd_queue, QUEUE_ORDERED_DRAIN);
 	blk_queue_max_hw_sectors(brd->brd_queue, 1024);
 	blk_queue_bounce_limit(brd->brd_queue, BLK_BOUNCE_ANY);
 
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index c3a4a2e176da..953d1e12f4d4 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -832,7 +832,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
 	lo->lo_queue->unplug_fn = loop_unplug;
 
 	if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync)
-		blk_queue_ordered(lo->lo_queue, QUEUE_ORDERED_DRAIN_FLUSH);
+		blk_queue_flush(lo->lo_queue, REQ_FLUSH);
 
 	set_capacity(lo->lo_disk, size);
 	bd_set_size(bdev, size << 9);
diff --git a/drivers/block/osdblk.c b/drivers/block/osdblk.c
index 2284b4f05c62..72d62462433d 100644
--- a/drivers/block/osdblk.c
+++ b/drivers/block/osdblk.c
@@ -439,7 +439,7 @@ static int osdblk_init_disk(struct osdblk_device *osdev)
 	blk_queue_stack_limits(q, osd_request_queue(osdev->osd));
 
 	blk_queue_prep_rq(q, blk_queue_start_tag);
-	blk_queue_ordered(q, QUEUE_ORDERED_DRAIN_FLUSH);
+	blk_queue_flush(q, REQ_FLUSH);
 
 	disk->queue = q;
 
diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c
index e9da874d0419..4911f9e57bc7 100644
--- a/drivers/block/ps3disk.c
+++ b/drivers/block/ps3disk.c
@@ -468,7 +468,7 @@ static int __devinit ps3disk_probe(struct ps3_system_bus_device *_dev)
 	blk_queue_dma_alignment(queue, dev->blk_size-1);
 	blk_queue_logical_block_size(queue, dev->blk_size);
 
-	blk_queue_ordered(queue, QUEUE_ORDERED_DRAIN_FLUSH);
+	blk_queue_flush(queue, REQ_FLUSH);
 
 	blk_queue_max_segments(queue, -1);
 	blk_queue_max_segment_size(queue, dev->bounce_size);
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 79652809eee8..d10b635b3946 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -388,22 +388,15 @@ static int __devinit virtblk_probe(struct virtio_device *vdev)
 	vblk->disk->driverfs_dev = &vdev->dev;
 	index++;
 
-	if (virtio_has_feature(vdev, VIRTIO_BLK_F_FLUSH)) {
-		/*
-		 * If the FLUSH feature is supported we do have support for
-		 * flushing a volatile write cache on the host.  Use that
-		 * to implement write barrier support.
-		 */
-		blk_queue_ordered(q, QUEUE_ORDERED_DRAIN_FLUSH);
-	} else {
-		/*
-		 * If the FLUSH feature is not supported we must assume that
-		 * the host does not perform any kind of volatile write
-		 * caching. We still need to drain the queue to provider
-		 * proper barrier semantics.
-		 */
-		blk_queue_ordered(q, QUEUE_ORDERED_DRAIN);
-	}
+	/*
+	 * If the FLUSH feature is supported we do have support for
+	 * flushing a volatile write cache on the host.  Use that to
+	 * implement write barrier support; otherwise, we must assume
+	 * that the host does not perform any kind of volatile write
+	 * caching.
+	 */
+	if (virtio_has_feature(vdev, VIRTIO_BLK_F_FLUSH))
+		blk_queue_flush(q, REQ_FLUSH);
 
 	/* If disk is read-only in the host, the guest should obey */
 	if (virtio_has_feature(vdev, VIRTIO_BLK_F_RO))
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 50ec6f834996..0b1eea643262 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -95,7 +95,7 @@ struct blkfront_info
 	struct gnttab_free_callback callback;
 	struct blk_shadow shadow[BLK_RING_SIZE];
 	unsigned long shadow_free;
-	int feature_barrier;
+	unsigned int feature_flush;
 	int is_ready;
 };
 
@@ -418,25 +418,12 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
 }
 
 
-static int xlvbd_barrier(struct blkfront_info *info)
+static void xlvbd_flush(struct blkfront_info *info)
 {
-	int err;
-	const char *barrier;
-
-	switch (info->feature_barrier) {
-	case QUEUE_ORDERED_DRAIN:	barrier = "enabled"; break;
-	case QUEUE_ORDERED_NONE:	barrier = "disabled"; break;
-	default:			return -EINVAL;
-	}
-
-	err = blk_queue_ordered(info->rq, info->feature_barrier);
-
-	if (err)
-		return err;
-
+	blk_queue_flush(info->rq, info->feature_flush);
 	printk(KERN_INFO "blkfront: %s: barriers %s\n",
-	       info->gd->disk_name, barrier);
-	return 0;
+	       info->gd->disk_name,
+	       info->feature_flush ? "enabled" : "disabled");
 }
 
 
@@ -515,7 +502,7 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
 	info->rq = gd->queue;
 	info->gd = gd;
 
-	xlvbd_barrier(info);
+	xlvbd_flush(info);
 
 	if (vdisk_info & VDISK_READONLY)
 		set_disk_ro(gd, 1);
@@ -661,8 +648,8 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
 				printk(KERN_WARNING "blkfront: %s: write barrier op failed\n",
 				       info->gd->disk_name);
 				error = -EOPNOTSUPP;
-				info->feature_barrier = QUEUE_ORDERED_NONE;
-				xlvbd_barrier(info);
+				info->feature_flush = 0;
+				xlvbd_flush(info);
 			}
 			/* fall through */
 		case BLKIF_OP_READ:
@@ -1075,19 +1062,13 @@ static void blkfront_connect(struct blkfront_info *info)
 	/*
 	 * If there's no "feature-barrier" defined, then it means
 	 * we're dealing with a very old backend which writes
-	 * synchronously; draining will do what needs to get done.
+	 * synchronously; nothing to do.
 	 *
 	 * If there are barriers, then we use flush.
-	 *
-	 * If barriers are not supported, then there's no much we can
-	 * do, so just set ordering to NONE.
 	 */
-	if (err)
-		info->feature_barrier = QUEUE_ORDERED_DRAIN;
-	else if (barrier)
-		info->feature_barrier = QUEUE_ORDERED_DRAIN_FLUSH;
-	else
-		info->feature_barrier = QUEUE_ORDERED_NONE;
+	info->feature_flush = 0;
+	if (!err && barrier)
+		info->feature_flush = REQ_FLUSH;
 
 	err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size);
 	if (err) {
diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c
index 7433e07de30e..7c5b01ce51d2 100644
--- a/drivers/ide/ide-disk.c
+++ b/drivers/ide/ide-disk.c
@@ -516,10 +516,10 @@ static int ide_do_setfeature(ide_drive_t *drive, u8 feature, u8 nsect)
 	return ide_no_data_taskfile(drive, &cmd);
 }
 
-static void update_ordered(ide_drive_t *drive)
+static void update_flush(ide_drive_t *drive)
 {
 	u16 *id = drive->id;
-	unsigned ordered = QUEUE_ORDERED_NONE;
+	unsigned flush = 0;
 
 	if (drive->dev_flags & IDE_DFLAG_WCACHE) {
 		unsigned long long capacity;
@@ -543,13 +543,12 @@ static void update_ordered(ide_drive_t *drive)
 		       drive->name, barrier ? "" : "not ");
 
 		if (barrier) {
-			ordered = QUEUE_ORDERED_DRAIN_FLUSH;
+			flush = REQ_FLUSH;
 			blk_queue_prep_rq(drive->queue, idedisk_prep_fn);
 		}
-	} else
-		ordered = QUEUE_ORDERED_DRAIN;
+	}
 
-	blk_queue_ordered(drive->queue, ordered);
+	blk_queue_flush(drive->queue, flush);
 }
 
 ide_devset_get_flag(wcache, IDE_DFLAG_WCACHE);
@@ -572,7 +571,7 @@ static int set_wcache(ide_drive_t *drive, int arg)
 		}
 	}
 
-	update_ordered(drive);
+	update_flush(drive);
 
 	return err;
 }
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index ac384b2a6a33..b1d92be8f990 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -2245,7 +2245,7 @@ static int dm_init_request_based_queue(struct mapped_device *md)
 	blk_queue_softirq_done(md->queue, dm_softirq_done);
 	blk_queue_prep_rq(md->queue, dm_prep_fn);
 	blk_queue_lld_busy(md->queue, dm_lld_busy);
-	blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN_FLUSH);
+	blk_queue_flush(md->queue, REQ_FLUSH);
 
 	elv_register_queue(md->queue);
 
diff --git a/drivers/mmc/card/queue.c b/drivers/mmc/card/queue.c
index e876678176be..9c0b42bfe089 100644
--- a/drivers/mmc/card/queue.c
+++ b/drivers/mmc/card/queue.c
@@ -128,7 +128,6 @@ int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card, spinlock_t *lock
 	mq->req = NULL;
 
 	blk_queue_prep_rq(mq->queue, mmc_prep_request);
-	blk_queue_ordered(mq->queue, QUEUE_ORDERED_DRAIN);
 	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, mq->queue);
 	if (mmc_can_erase(card)) {
 		queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mq->queue);
diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c
index 8373ca0de8e0..9b106d83b0cd 100644
--- a/drivers/s390/block/dasd.c
+++ b/drivers/s390/block/dasd.c
@@ -2197,7 +2197,6 @@ static void dasd_setup_queue(struct dasd_block *block)
 	 */
 	blk_queue_max_segment_size(block->request_queue, PAGE_SIZE);
 	blk_queue_segment_boundary(block->request_queue, PAGE_SIZE - 1);
-	blk_queue_ordered(block->request_queue, QUEUE_ORDERED_DRAIN);
 }
 
 /*
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index cdfc51ab9cf2..63bd01ae534f 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -2109,7 +2109,7 @@ static int sd_revalidate_disk(struct gendisk *disk)
 	struct scsi_disk *sdkp = scsi_disk(disk);
 	struct scsi_device *sdp = sdkp->device;
 	unsigned char *buffer;
-	unsigned ordered;
+	unsigned flush = 0;
 
 	SCSI_LOG_HLQUEUE(3, sd_printk(KERN_INFO, sdkp,
 				      "sd_revalidate_disk\n"));
@@ -2151,15 +2151,15 @@ static int sd_revalidate_disk(struct gendisk *disk)
 
 	/*
 	 * We now have all cache related info, determine how we deal
-	 * with ordered requests.
+	 * with flush requests.
 	 */
-	if (sdkp->WCE)
-		ordered = sdkp->DPOFUA
-			? QUEUE_ORDERED_DRAIN_FUA : QUEUE_ORDERED_DRAIN_FLUSH;
-	else
-		ordered = QUEUE_ORDERED_DRAIN;
+	if (sdkp->WCE) {
+		flush |= REQ_FLUSH;
+		if (sdkp->DPOFUA)
+			flush |= REQ_FUA;
+	}
 
-	blk_queue_ordered(sdkp->disk->queue, ordered);
+	blk_queue_flush(sdkp->disk->queue, flush);
 
 	set_capacity(disk, sdkp->capacity);
 	kfree(buffer);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 7077bc0d6138..e97911d4dec3 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -355,8 +355,10 @@ struct request_queue
 	struct blk_trace	*blk_trace;
 #endif
 	/*
-	 * reserved for flush operations
+	 * for flush operations
 	 */
+	unsigned int		flush_flags;
+
 	unsigned int		ordered, next_ordered, ordseq;
 	int			orderr, ordcolor;
 	struct request		pre_flush_rq, bar_rq, post_flush_rq;
@@ -865,8 +867,8 @@ extern void blk_queue_update_dma_alignment(struct request_queue *, int);
 extern void blk_queue_softirq_done(struct request_queue *, softirq_done_fn *);
 extern void blk_queue_rq_timed_out(struct request_queue *, rq_timed_out_fn *);
 extern void blk_queue_rq_timeout(struct request_queue *, unsigned int);
+extern void blk_queue_flush(struct request_queue *q, unsigned int flush);
 extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev);
-extern int blk_queue_ordered(struct request_queue *, unsigned);
 extern bool blk_do_ordered(struct request_queue *, struct request **);
 extern unsigned blk_ordered_cur_seq(struct request_queue *);
 extern unsigned blk_ordered_req_seq(struct request *);
-- 
cgit v1.2.3


From 9cbbdca44ae1a6f512ea1e2be11ced8bbb9d430a Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 3 Sep 2010 11:56:16 +0200
Subject: block: remove spurious uses of REQ_HARDBARRIER

REQ_HARDBARRIER is deprecated.  Remove spurious uses in the following
users.  Please note that other than osdblk, all other uses were
already spurious before deprecation.

* osdblk: osdblk_rq_fn() won't receive any request with
  REQ_HARDBARRIER set.  Remove the test for it.

* pktcdvd: use of REQ_HARDBARRIER in pkt_generic_packet() doesn't mean
  anything.  Removed.

* aic7xxx_old: Setting MSG_ORDERED_Q_TAG on REQ_HARDBARRIER is
  spurious.  Removed.

* sas_scsi_host: Setting TASK_ATTR_ORDERED on REQ_HARDBARRIER is
  spurious.  Removed.

* scsi_tcq: The ordered tag path wasn't being used anyway.  Removed.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Boaz Harrosh <bharrosh@panasas.com>
Cc: James Bottomley <James.Bottomley@suse.de>
Cc: Peter Osterlund <petero2@telia.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 drivers/block/osdblk.c              |  3 +--
 drivers/block/pktcdvd.c             |  1 -
 drivers/scsi/aic7xxx_old.c          | 21 ++-------------------
 drivers/scsi/libsas/sas_scsi_host.c | 13 +------------
 include/scsi/scsi_tcq.h             |  6 +-----
 5 files changed, 5 insertions(+), 39 deletions(-)

diff --git a/drivers/block/osdblk.c b/drivers/block/osdblk.c
index 72d62462433d..87311ebac0db 100644
--- a/drivers/block/osdblk.c
+++ b/drivers/block/osdblk.c
@@ -310,8 +310,7 @@ static void osdblk_rq_fn(struct request_queue *q)
 			break;
 
 		/* filter out block requests we don't understand */
-		if (rq->cmd_type != REQ_TYPE_FS &&
-		    !(rq->cmd_flags & REQ_HARDBARRIER)) {
+		if (rq->cmd_type != REQ_TYPE_FS) {
 			blk_end_request_all(rq, 0);
 			continue;
 		}
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index b1cbeb59bb76..0166ea136045 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -753,7 +753,6 @@ static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command *
 
 	rq->timeout = 60*HZ;
 	rq->cmd_type = REQ_TYPE_BLOCK_PC;
-	rq->cmd_flags |= REQ_HARDBARRIER;
 	if (cgc->quiet)
 		rq->cmd_flags |= REQ_QUIET;
 
diff --git a/drivers/scsi/aic7xxx_old.c b/drivers/scsi/aic7xxx_old.c
index 93984c9dfe14..e1cd6062776c 100644
--- a/drivers/scsi/aic7xxx_old.c
+++ b/drivers/scsi/aic7xxx_old.c
@@ -2850,12 +2850,6 @@ aic7xxx_done(struct aic7xxx_host *p, struct aic7xxx_scb *scb)
       aic_dev->r_total++;
       ptr = aic_dev->r_bins;
     }
-    if(cmd->device->simple_tags && cmd->request->cmd_flags & REQ_HARDBARRIER)
-    {
-      aic_dev->barrier_total++;
-      if(scb->tag_action == MSG_ORDERED_Q_TAG)
-        aic_dev->ordered_total++;
-    }
     x = scb->sg_length;
     x >>= 10;
     for(i=0; i<6; i++)
@@ -10144,19 +10138,8 @@ static void aic7xxx_buildscb(struct aic7xxx_host *p, struct scsi_cmnd *cmd,
     /* We always force TEST_UNIT_READY to untagged */
     if (cmd->cmnd[0] != TEST_UNIT_READY && sdptr->simple_tags)
     {
-      if (req->cmd_flags & REQ_HARDBARRIER)
-      {
-	if(sdptr->ordered_tags)
-	{
-          hscb->control |= MSG_ORDERED_Q_TAG;
-          scb->tag_action = MSG_ORDERED_Q_TAG;
-	}
-      }
-      else
-      {
-        hscb->control |= MSG_SIMPLE_Q_TAG;
-        scb->tag_action = MSG_SIMPLE_Q_TAG;
-      }
+      hscb->control |= MSG_SIMPLE_Q_TAG;
+      scb->tag_action = MSG_SIMPLE_Q_TAG;
     }
   }
   if ( !(aic_dev->dtr_pending) &&
diff --git a/drivers/scsi/libsas/sas_scsi_host.c b/drivers/scsi/libsas/sas_scsi_host.c
index f0cfba9a1fc8..535085cd27ec 100644
--- a/drivers/scsi/libsas/sas_scsi_host.c
+++ b/drivers/scsi/libsas/sas_scsi_host.c
@@ -130,17 +130,6 @@ static void sas_scsi_task_done(struct sas_task *task)
 	sc->scsi_done(sc);
 }
 
-static enum task_attribute sas_scsi_get_task_attr(struct scsi_cmnd *cmd)
-{
-	enum task_attribute ta = TASK_ATTR_SIMPLE;
-	if (cmd->request && blk_rq_tagged(cmd->request)) {
-		if (cmd->device->ordered_tags &&
-		    (cmd->request->cmd_flags & REQ_HARDBARRIER))
-			ta = TASK_ATTR_ORDERED;
-	}
-	return ta;
-}
-
 static struct sas_task *sas_create_task(struct scsi_cmnd *cmd,
 					       struct domain_device *dev,
 					       gfp_t gfp_flags)
@@ -160,7 +149,7 @@ static struct sas_task *sas_create_task(struct scsi_cmnd *cmd,
 	task->ssp_task.retry_count = 1;
 	int_to_scsilun(cmd->device->lun, &lun);
 	memcpy(task->ssp_task.LUN, &lun.scsi_lun, 8);
-	task->ssp_task.task_attr = sas_scsi_get_task_attr(cmd);
+	task->ssp_task.task_attr = TASK_ATTR_SIMPLE;
 	memcpy(task->ssp_task.cdb, cmd->cmnd, 16);
 
 	task->scatter = scsi_sglist(cmd);
diff --git a/include/scsi/scsi_tcq.h b/include/scsi/scsi_tcq.h
index 17231385cb37..d6e7994aa634 100644
--- a/include/scsi/scsi_tcq.h
+++ b/include/scsi/scsi_tcq.h
@@ -97,13 +97,9 @@ static inline void scsi_deactivate_tcq(struct scsi_device *sdev, int depth)
 static inline int scsi_populate_tag_msg(struct scsi_cmnd *cmd, char *msg)
 {
         struct request *req = cmd->request;
-	struct scsi_device *sdev = cmd->device;
 
         if (blk_rq_tagged(req)) {
-		if (sdev->ordered_tags && req->cmd_flags & REQ_HARDBARRIER)
-        	        *msg++ = MSG_ORDERED_TAG;
-        	else
-        	        *msg++ = MSG_SIMPLE_TAG;
+		*msg++ = MSG_SIMPLE_TAG;
         	*msg++ = req->tag;
         	return 2;
 	}
-- 
cgit v1.2.3


From dd831006d5be7f74c3fe7aef82380c51c3637960 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 3 Sep 2010 11:56:16 +0200
Subject: block: misc cleanups in barrier code

Make the following cleanups in preparation of barrier/flush update.

* blk_do_ordered() declaration is moved from include/linux/blkdev.h to
  block/blk.h.

* blk_do_ordered() now returns pointer to struct request, with %NULL
  meaning "try the next request" and ERR_PTR(-EAGAIN) "try again
  later".  The third case will be dropped with further changes.

* In the initialization of proxy barrier request, data direction is
  already set by init_request_from_bio().  Drop unnecessary explicit
  REQ_WRITE setting and move init_request_from_bio() above REQ_FUA
  flag setting.

* add_request() is collapsed into __make_request().

These changes don't make any functional difference.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-barrier.c    | 32 ++++++++++++++------------------
 block/blk-core.c       | 21 ++++-----------------
 block/blk.h            |  7 +++++--
 include/linux/blkdev.h |  1 -
 4 files changed, 23 insertions(+), 38 deletions(-)

diff --git a/block/blk-barrier.c b/block/blk-barrier.c
index ed0aba5463ab..f1be85ba2bb5 100644
--- a/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -110,9 +110,9 @@ static void queue_flush(struct request_queue *q, unsigned which)
 	elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
 }
 
-static inline bool start_ordered(struct request_queue *q, struct request **rqp)
+static inline struct request *start_ordered(struct request_queue *q,
+					    struct request *rq)
 {
-	struct request *rq = *rqp;
 	unsigned skip = 0;
 
 	q->orderr = 0;
@@ -149,11 +149,9 @@ static inline bool start_ordered(struct request_queue *q, struct request **rqp)
 
 		/* initialize proxy request and queue it */
 		blk_rq_init(q, rq);
-		if (bio_data_dir(q->orig_bar_rq->bio) == WRITE)
-			rq->cmd_flags |= REQ_WRITE;
+		init_request_from_bio(rq, q->orig_bar_rq->bio);
 		if (q->ordered & QUEUE_ORDERED_DO_FUA)
 			rq->cmd_flags |= REQ_FUA;
-		init_request_from_bio(rq, q->orig_bar_rq->bio);
 		rq->end_io = bar_end_io;
 
 		elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
@@ -171,27 +169,26 @@ static inline bool start_ordered(struct request_queue *q, struct request **rqp)
 	else
 		skip |= QUEUE_ORDSEQ_DRAIN;
 
-	*rqp = rq;
-
 	/*
 	 * Complete skipped sequences.  If whole sequence is complete,
-	 * return false to tell elevator that this request is gone.
+	 * return %NULL to tell elevator that this request is gone.
 	 */
-	return !blk_ordered_complete_seq(q, skip, 0);
+	if (blk_ordered_complete_seq(q, skip, 0))
+		rq = NULL;
+	return rq;
 }
 
-bool blk_do_ordered(struct request_queue *q, struct request **rqp)
+struct request *blk_do_ordered(struct request_queue *q, struct request *rq)
 {
-	struct request *rq = *rqp;
 	const int is_barrier = rq->cmd_type == REQ_TYPE_FS &&
 				(rq->cmd_flags & REQ_HARDBARRIER);
 
 	if (!q->ordseq) {
 		if (!is_barrier)
-			return true;
+			return rq;
 
 		if (q->next_ordered != QUEUE_ORDERED_NONE)
-			return start_ordered(q, rqp);
+			return start_ordered(q, rq);
 		else {
 			/*
 			 * Queue ordering not supported.  Terminate
@@ -199,8 +196,7 @@ bool blk_do_ordered(struct request_queue *q, struct request **rqp)
 			 */
 			blk_dequeue_request(rq);
 			__blk_end_request_all(rq, -EOPNOTSUPP);
-			*rqp = NULL;
-			return false;
+			return NULL;
 		}
 	}
 
@@ -211,14 +207,14 @@ bool blk_do_ordered(struct request_queue *q, struct request **rqp)
 	/* Special requests are not subject to ordering rules. */
 	if (rq->cmd_type != REQ_TYPE_FS &&
 	    rq != &q->pre_flush_rq && rq != &q->post_flush_rq)
-		return true;
+		return rq;
 
 	/* Ordered by draining.  Wait for turn. */
 	WARN_ON(blk_ordered_req_seq(rq) < blk_ordered_cur_seq(q));
 	if (blk_ordered_req_seq(rq) > blk_ordered_cur_seq(q))
-		*rqp = NULL;
+		rq = ERR_PTR(-EAGAIN);
 
-	return true;
+	return rq;
 }
 
 static void bio_end_empty_barrier(struct bio *bio, int err)
diff --git a/block/blk-core.c b/block/blk-core.c
index f06354183b29..f8d37a8e2c55 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1037,22 +1037,6 @@ void blk_insert_request(struct request_queue *q, struct request *rq,
 }
 EXPORT_SYMBOL(blk_insert_request);
 
-/*
- * add-request adds a request to the linked list.
- * queue lock is held and interrupts disabled, as we muck with the
- * request queue list.
- */
-static inline void add_request(struct request_queue *q, struct request *req)
-{
-	drive_stat_acct(req, 1);
-
-	/*
-	 * elevator indicated where it wants this request to be
-	 * inserted at elevator_merge time
-	 */
-	__elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0);
-}
-
 static void part_round_stats_single(int cpu, struct hd_struct *part,
 				    unsigned long now)
 {
@@ -1316,7 +1300,10 @@ get_rq:
 		req->cpu = blk_cpu_to_group(smp_processor_id());
 	if (queue_should_plug(q) && elv_queue_empty(q))
 		blk_plug_device(q);
-	add_request(q, req);
+
+	/* insert the request into the elevator */
+	drive_stat_acct(req, 1);
+	__elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0);
 out:
 	if (unplug || !queue_should_plug(q))
 		__generic_unplug_device(q);
diff --git a/block/blk.h b/block/blk.h
index 6e7dc87141e4..874eb4ea8093 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -51,6 +51,8 @@ static inline void blk_clear_rq_complete(struct request *rq)
  */
 #define ELV_ON_HASH(rq)		(!hlist_unhashed(&(rq)->hash))
 
+struct request *blk_do_ordered(struct request_queue *q, struct request *rq);
+
 static inline struct request *__elv_next_request(struct request_queue *q)
 {
 	struct request *rq;
@@ -58,8 +60,9 @@ static inline struct request *__elv_next_request(struct request_queue *q)
 	while (1) {
 		while (!list_empty(&q->queue_head)) {
 			rq = list_entry_rq(q->queue_head.next);
-			if (blk_do_ordered(q, &rq))
-				return rq;
+			rq = blk_do_ordered(q, rq);
+			if (rq)
+				return !IS_ERR(rq) ? rq : NULL;
 		}
 
 		if (!q->elevator->ops->elevator_dispatch_fn(q, 0))
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index e97911d4dec3..996549d71923 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -869,7 +869,6 @@ extern void blk_queue_rq_timed_out(struct request_queue *, rq_timed_out_fn *);
 extern void blk_queue_rq_timeout(struct request_queue *, unsigned int);
 extern void blk_queue_flush(struct request_queue *q, unsigned int flush);
 extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev);
-extern bool blk_do_ordered(struct request_queue *, struct request **);
 extern unsigned blk_ordered_cur_seq(struct request_queue *);
 extern unsigned blk_ordered_req_seq(struct request *);
 extern bool blk_ordered_complete_seq(struct request_queue *, unsigned, int);
-- 
cgit v1.2.3


From 28e7d1845216538303bb95d679d8fd4de50e2f1a Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 3 Sep 2010 11:56:16 +0200
Subject: block: drop barrier ordering by queue draining

Filesystems will take all the responsibilities for ordering requests
around commit writes and will only indicate how the commit writes
themselves should be handled by block layers.  This patch drops
barrier ordering by queue draining from block layer.  Ordering by
draining implementation was somewhat invasive to request handling.
List of notable changes follow.

* Each queue has 1 bit color which is flipped on each barrier issue.
  This is used to track whether a given request is issued before the
  current barrier or not.  REQ_ORDERED_COLOR flag and coloring
  implementation in __elv_add_request() are removed.

* Requests which shouldn't be processed yet for draining were stalled
  by returning -EAGAIN from blk_do_ordered() according to the test
  result between blk_ordered_req_seq() and blk_blk_ordered_cur_seq().
  This logic is removed.

* Draining completion logic in elv_completed_request() removed.

* All barrier sequence requests were queued to request queue and then
  trckled to lower layer according to progress and thus maintaining
  request orders during requeue was necessary.  This is replaced by
  queueing the next request in the barrier sequence only after the
  current one is complete from blk_ordered_complete_seq(), which
  removes the need for multiple proxy requests in struct request_queue
  and the request sorting logic in the ELEVATOR_INSERT_REQUEUE path of
  elv_insert().

* As barriers no longer have ordering constraints, there's no need to
  dump the whole elevator onto the dispatch queue on each barrier.
  Insert barriers at the front instead.

* If other barrier requests come to the front of the dispatch queue
  while one is already in progress, they are stored in
  q->pending_barriers and restored to dispatch queue one-by-one after
  each barrier completion from blk_ordered_complete_seq().

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-barrier.c       | 220 ++++++++++++++++++----------------------------
 block/blk-core.c          |  11 ++-
 block/blk.h               |   2 +-
 block/elevator.c          |  79 ++---------------
 include/linux/blk_types.h |   2 -
 include/linux/blkdev.h    |  19 ++--
 6 files changed, 113 insertions(+), 220 deletions(-)

diff --git a/block/blk-barrier.c b/block/blk-barrier.c
index f1be85ba2bb5..e8b2e5c091b1 100644
--- a/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -9,6 +9,8 @@
 
 #include "blk.h"
 
+static struct request *queue_next_ordseq(struct request_queue *q);
+
 /*
  * Cache flushing for ordered writes handling
  */
@@ -19,38 +21,10 @@ unsigned blk_ordered_cur_seq(struct request_queue *q)
 	return 1 << ffz(q->ordseq);
 }
 
-unsigned blk_ordered_req_seq(struct request *rq)
-{
-	struct request_queue *q = rq->q;
-
-	BUG_ON(q->ordseq == 0);
-
-	if (rq == &q->pre_flush_rq)
-		return QUEUE_ORDSEQ_PREFLUSH;
-	if (rq == &q->bar_rq)
-		return QUEUE_ORDSEQ_BAR;
-	if (rq == &q->post_flush_rq)
-		return QUEUE_ORDSEQ_POSTFLUSH;
-
-	/*
-	 * !fs requests don't need to follow barrier ordering.  Always
-	 * put them at the front.  This fixes the following deadlock.
-	 *
-	 * http://thread.gmane.org/gmane.linux.kernel/537473
-	 */
-	if (rq->cmd_type != REQ_TYPE_FS)
-		return QUEUE_ORDSEQ_DRAIN;
-
-	if ((rq->cmd_flags & REQ_ORDERED_COLOR) ==
-	    (q->orig_bar_rq->cmd_flags & REQ_ORDERED_COLOR))
-		return QUEUE_ORDSEQ_DRAIN;
-	else
-		return QUEUE_ORDSEQ_DONE;
-}
-
-bool blk_ordered_complete_seq(struct request_queue *q, unsigned seq, int error)
+static struct request *blk_ordered_complete_seq(struct request_queue *q,
+						unsigned seq, int error)
 {
-	struct request *rq;
+	struct request *next_rq = NULL;
 
 	if (error && !q->orderr)
 		q->orderr = error;
@@ -58,16 +32,22 @@ bool blk_ordered_complete_seq(struct request_queue *q, unsigned seq, int error)
 	BUG_ON(q->ordseq & seq);
 	q->ordseq |= seq;
 
-	if (blk_ordered_cur_seq(q) != QUEUE_ORDSEQ_DONE)
-		return false;
-
-	/*
-	 * Okay, sequence complete.
-	 */
-	q->ordseq = 0;
-	rq = q->orig_bar_rq;
-	__blk_end_request_all(rq, q->orderr);
-	return true;
+	if (blk_ordered_cur_seq(q) != QUEUE_ORDSEQ_DONE) {
+		/* not complete yet, queue the next ordered sequence */
+		next_rq = queue_next_ordseq(q);
+	} else {
+		/* complete this barrier request */
+		__blk_end_request_all(q->orig_bar_rq, q->orderr);
+		q->orig_bar_rq = NULL;
+		q->ordseq = 0;
+
+		/* dispatch the next barrier if there's one */
+		if (!list_empty(&q->pending_barriers)) {
+			next_rq = list_entry_rq(q->pending_barriers.next);
+			list_move(&next_rq->queuelist, &q->queue_head);
+		}
+	}
+	return next_rq;
 }
 
 static void pre_flush_end_io(struct request *rq, int error)
@@ -88,133 +68,105 @@ static void post_flush_end_io(struct request *rq, int error)
 	blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_POSTFLUSH, error);
 }
 
-static void queue_flush(struct request_queue *q, unsigned which)
+static void queue_flush(struct request_queue *q, struct request *rq,
+			rq_end_io_fn *end_io)
 {
-	struct request *rq;
-	rq_end_io_fn *end_io;
-
-	if (which == QUEUE_ORDERED_DO_PREFLUSH) {
-		rq = &q->pre_flush_rq;
-		end_io = pre_flush_end_io;
-	} else {
-		rq = &q->post_flush_rq;
-		end_io = post_flush_end_io;
-	}
-
 	blk_rq_init(q, rq);
 	rq->cmd_type = REQ_TYPE_FS;
-	rq->cmd_flags = REQ_HARDBARRIER | REQ_FLUSH;
+	rq->cmd_flags = REQ_FLUSH;
 	rq->rq_disk = q->orig_bar_rq->rq_disk;
 	rq->end_io = end_io;
 
 	elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
 }
 
-static inline struct request *start_ordered(struct request_queue *q,
-					    struct request *rq)
+static struct request *queue_next_ordseq(struct request_queue *q)
 {
-	unsigned skip = 0;
-
-	q->orderr = 0;
-	q->ordered = q->next_ordered;
-	q->ordseq |= QUEUE_ORDSEQ_STARTED;
-
-	/*
-	 * For an empty barrier, there's no actual BAR request, which
-	 * in turn makes POSTFLUSH unnecessary.  Mask them off.
-	 */
-	if (!blk_rq_sectors(rq))
-		q->ordered &= ~(QUEUE_ORDERED_DO_BAR |
-				QUEUE_ORDERED_DO_POSTFLUSH);
-
-	/* stash away the original request */
-	blk_dequeue_request(rq);
-	q->orig_bar_rq = rq;
-	rq = NULL;
-
-	/*
-	 * Queue ordered sequence.  As we stack them at the head, we
-	 * need to queue in reverse order.  Note that we rely on that
-	 * no fs request uses ELEVATOR_INSERT_FRONT and thus no fs
-	 * request gets inbetween ordered sequence.
-	 */
-	if (q->ordered & QUEUE_ORDERED_DO_POSTFLUSH) {
-		queue_flush(q, QUEUE_ORDERED_DO_POSTFLUSH);
-		rq = &q->post_flush_rq;
-	} else
-		skip |= QUEUE_ORDSEQ_POSTFLUSH;
+	struct request *rq = &q->bar_rq;
 
-	if (q->ordered & QUEUE_ORDERED_DO_BAR) {
-		rq = &q->bar_rq;
+	switch (blk_ordered_cur_seq(q)) {
+	case QUEUE_ORDSEQ_PREFLUSH:
+		queue_flush(q, rq, pre_flush_end_io);
+		break;
 
+	case QUEUE_ORDSEQ_BAR:
 		/* initialize proxy request and queue it */
 		blk_rq_init(q, rq);
 		init_request_from_bio(rq, q->orig_bar_rq->bio);
+		rq->cmd_flags &= ~REQ_HARDBARRIER;
 		if (q->ordered & QUEUE_ORDERED_DO_FUA)
 			rq->cmd_flags |= REQ_FUA;
 		rq->end_io = bar_end_io;
 
 		elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
-	} else
-		skip |= QUEUE_ORDSEQ_BAR;
+		break;
 
-	if (q->ordered & QUEUE_ORDERED_DO_PREFLUSH) {
-		queue_flush(q, QUEUE_ORDERED_DO_PREFLUSH);
-		rq = &q->pre_flush_rq;
-	} else
-		skip |= QUEUE_ORDSEQ_PREFLUSH;
+	case QUEUE_ORDSEQ_POSTFLUSH:
+		queue_flush(q, rq, post_flush_end_io);
+		break;
 
-	if (queue_in_flight(q))
-		rq = NULL;
-	else
-		skip |= QUEUE_ORDSEQ_DRAIN;
-
-	/*
-	 * Complete skipped sequences.  If whole sequence is complete,
-	 * return %NULL to tell elevator that this request is gone.
-	 */
-	if (blk_ordered_complete_seq(q, skip, 0))
-		rq = NULL;
+	default:
+		BUG();
+	}
 	return rq;
 }
 
 struct request *blk_do_ordered(struct request_queue *q, struct request *rq)
 {
-	const int is_barrier = rq->cmd_type == REQ_TYPE_FS &&
-				(rq->cmd_flags & REQ_HARDBARRIER);
-
-	if (!q->ordseq) {
-		if (!is_barrier)
-			return rq;
-
-		if (q->next_ordered != QUEUE_ORDERED_NONE)
-			return start_ordered(q, rq);
-		else {
-			/*
-			 * Queue ordering not supported.  Terminate
-			 * with prejudice.
-			 */
-			blk_dequeue_request(rq);
-			__blk_end_request_all(rq, -EOPNOTSUPP);
-			return NULL;
-		}
+	unsigned skip = 0;
+
+	if (!(rq->cmd_flags & REQ_HARDBARRIER))
+		return rq;
+
+	if (q->ordseq) {
+		/*
+		 * Barrier is already in progress and they can't be
+		 * processed in parallel.  Queue for later processing.
+		 */
+		list_move_tail(&rq->queuelist, &q->pending_barriers);
+		return NULL;
+	}
+
+	if (unlikely(q->next_ordered == QUEUE_ORDERED_NONE)) {
+		/*
+		 * Queue ordering not supported.  Terminate
+		 * with prejudice.
+		 */
+		blk_dequeue_request(rq);
+		__blk_end_request_all(rq, -EOPNOTSUPP);
+		return NULL;
 	}
 
 	/*
-	 * Ordered sequence in progress
+	 * Start a new ordered sequence
 	 */
+	q->orderr = 0;
+	q->ordered = q->next_ordered;
+	q->ordseq |= QUEUE_ORDSEQ_STARTED;
 
-	/* Special requests are not subject to ordering rules. */
-	if (rq->cmd_type != REQ_TYPE_FS &&
-	    rq != &q->pre_flush_rq && rq != &q->post_flush_rq)
-		return rq;
+	/*
+	 * For an empty barrier, there's no actual BAR request, which
+	 * in turn makes POSTFLUSH unnecessary.  Mask them off.
+	 */
+	if (!blk_rq_sectors(rq))
+		q->ordered &= ~(QUEUE_ORDERED_DO_BAR |
+				QUEUE_ORDERED_DO_POSTFLUSH);
 
-	/* Ordered by draining.  Wait for turn. */
-	WARN_ON(blk_ordered_req_seq(rq) < blk_ordered_cur_seq(q));
-	if (blk_ordered_req_seq(rq) > blk_ordered_cur_seq(q))
-		rq = ERR_PTR(-EAGAIN);
+	/* stash away the original request */
+	blk_dequeue_request(rq);
+	q->orig_bar_rq = rq;
 
-	return rq;
+	if (!(q->ordered & QUEUE_ORDERED_DO_PREFLUSH))
+		skip |= QUEUE_ORDSEQ_PREFLUSH;
+
+	if (!(q->ordered & QUEUE_ORDERED_DO_BAR))
+		skip |= QUEUE_ORDSEQ_BAR;
+
+	if (!(q->ordered & QUEUE_ORDERED_DO_POSTFLUSH))
+		skip |= QUEUE_ORDSEQ_POSTFLUSH;
+
+	/* complete skipped sequences and return the first sequence */
+	return blk_ordered_complete_seq(q, skip, 0);
 }
 
 static void bio_end_empty_barrier(struct bio *bio, int err)
diff --git a/block/blk-core.c b/block/blk-core.c
index f8d37a8e2c55..d316662682c8 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -520,6 +520,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 	init_timer(&q->unplug_timer);
 	setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
 	INIT_LIST_HEAD(&q->timeout_list);
+	INIT_LIST_HEAD(&q->pending_barriers);
 	INIT_WORK(&q->unplug_work, blk_unplug_work);
 
 	kobject_init(&q->kobj, &blk_queue_ktype);
@@ -1185,6 +1186,7 @@ static int __make_request(struct request_queue *q, struct bio *bio)
 	const bool sync = (bio->bi_rw & REQ_SYNC);
 	const bool unplug = (bio->bi_rw & REQ_UNPLUG);
 	const unsigned int ff = bio->bi_rw & REQ_FAILFAST_MASK;
+	int where = ELEVATOR_INSERT_SORT;
 	int rw_flags;
 
 	/* REQ_HARDBARRIER is no more */
@@ -1203,7 +1205,12 @@ static int __make_request(struct request_queue *q, struct bio *bio)
 
 	spin_lock_irq(q->queue_lock);
 
-	if (unlikely((bio->bi_rw & REQ_HARDBARRIER)) || elv_queue_empty(q))
+	if (bio->bi_rw & REQ_HARDBARRIER) {
+		where = ELEVATOR_INSERT_FRONT;
+		goto get_rq;
+	}
+
+	if (elv_queue_empty(q))
 		goto get_rq;
 
 	el_ret = elv_merge(q, &req, bio);
@@ -1303,7 +1310,7 @@ get_rq:
 
 	/* insert the request into the elevator */
 	drive_stat_acct(req, 1);
-	__elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0);
+	__elv_add_request(q, req, where, 0);
 out:
 	if (unplug || !queue_should_plug(q))
 		__generic_unplug_device(q);
diff --git a/block/blk.h b/block/blk.h
index 874eb4ea8093..08081e4b294e 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -62,7 +62,7 @@ static inline struct request *__elv_next_request(struct request_queue *q)
 			rq = list_entry_rq(q->queue_head.next);
 			rq = blk_do_ordered(q, rq);
 			if (rq)
-				return !IS_ERR(rq) ? rq : NULL;
+				return rq;
 		}
 
 		if (!q->elevator->ops->elevator_dispatch_fn(q, 0))
diff --git a/block/elevator.c b/block/elevator.c
index ec585c9554d3..241c69c45c5f 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -617,8 +617,6 @@ void elv_quiesce_end(struct request_queue *q)
 
 void elv_insert(struct request_queue *q, struct request *rq, int where)
 {
-	struct list_head *pos;
-	unsigned ordseq;
 	int unplug_it = 1;
 
 	trace_block_rq_insert(q, rq);
@@ -626,9 +624,16 @@ void elv_insert(struct request_queue *q, struct request *rq, int where)
 	rq->q = q;
 
 	switch (where) {
+	case ELEVATOR_INSERT_REQUEUE:
+		/*
+		 * Most requeues happen because of a busy condition,
+		 * don't force unplug of the queue for that case.
+		 * Clear unplug_it and fall through.
+		 */
+		unplug_it = 0;
+
 	case ELEVATOR_INSERT_FRONT:
 		rq->cmd_flags |= REQ_SOFTBARRIER;
-
 		list_add(&rq->queuelist, &q->queue_head);
 		break;
 
@@ -668,36 +673,6 @@ void elv_insert(struct request_queue *q, struct request *rq, int where)
 		q->elevator->ops->elevator_add_req_fn(q, rq);
 		break;
 
-	case ELEVATOR_INSERT_REQUEUE:
-		/*
-		 * If ordered flush isn't in progress, we do front
-		 * insertion; otherwise, requests should be requeued
-		 * in ordseq order.
-		 */
-		rq->cmd_flags |= REQ_SOFTBARRIER;
-
-		/*
-		 * Most requeues happen because of a busy condition,
-		 * don't force unplug of the queue for that case.
-		 */
-		unplug_it = 0;
-
-		if (q->ordseq == 0) {
-			list_add(&rq->queuelist, &q->queue_head);
-			break;
-		}
-
-		ordseq = blk_ordered_req_seq(rq);
-
-		list_for_each(pos, &q->queue_head) {
-			struct request *pos_rq = list_entry_rq(pos);
-			if (ordseq <= blk_ordered_req_seq(pos_rq))
-				break;
-		}
-
-		list_add_tail(&rq->queuelist, pos);
-		break;
-
 	default:
 		printk(KERN_ERR "%s: bad insertion point %d\n",
 		       __func__, where);
@@ -716,26 +691,8 @@ void elv_insert(struct request_queue *q, struct request *rq, int where)
 void __elv_add_request(struct request_queue *q, struct request *rq, int where,
 		       int plug)
 {
-	if (q->ordcolor)
-		rq->cmd_flags |= REQ_ORDERED_COLOR;
-
 	if (rq->cmd_flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER)) {
-		/*
-		 * toggle ordered color
-		 */
-		if (rq->cmd_flags & REQ_HARDBARRIER)
-			q->ordcolor ^= 1;
-
-		/*
-		 * barriers implicitly indicate back insertion
-		 */
-		if (where == ELEVATOR_INSERT_SORT)
-			where = ELEVATOR_INSERT_BACK;
-
-		/*
-		 * this request is scheduling boundary, update
-		 * end_sector
-		 */
+		/* barriers are scheduling boundary, update end_sector */
 		if (rq->cmd_type == REQ_TYPE_FS ||
 		    (rq->cmd_flags & REQ_DISCARD)) {
 			q->end_sector = rq_end_sector(rq);
@@ -855,24 +812,6 @@ void elv_completed_request(struct request_queue *q, struct request *rq)
 		    e->ops->elevator_completed_req_fn)
 			e->ops->elevator_completed_req_fn(q, rq);
 	}
-
-	/*
-	 * Check if the queue is waiting for fs requests to be
-	 * drained for flush sequence.
-	 */
-	if (unlikely(q->ordseq)) {
-		struct request *next = NULL;
-
-		if (!list_empty(&q->queue_head))
-			next = list_entry_rq(q->queue_head.next);
-
-		if (!queue_in_flight(q) &&
-		    blk_ordered_cur_seq(q) == QUEUE_ORDSEQ_DRAIN &&
-		    (!next || blk_ordered_req_seq(next) > QUEUE_ORDSEQ_DRAIN)) {
-			blk_ordered_complete_seq(q, QUEUE_ORDSEQ_DRAIN, 0);
-			__blk_run_queue(q);
-		}
-	}
 }
 
 #define to_elv(atr) container_of((atr), struct elv_fs_entry, attr)
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index ca83a97c9715..9192282b4259 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -143,7 +143,6 @@ enum rq_flag_bits {
 	__REQ_FAILED,		/* set if the request failed */
 	__REQ_QUIET,		/* don't worry about errors */
 	__REQ_PREEMPT,		/* set for "ide_preempt" requests */
-	__REQ_ORDERED_COLOR,	/* is before or after barrier */
 	__REQ_ALLOCED,		/* request came from our alloc pool */
 	__REQ_COPY_USER,	/* contains copies of user pages */
 	__REQ_INTEGRITY,	/* integrity metadata has been remapped */
@@ -184,7 +183,6 @@ enum rq_flag_bits {
 #define REQ_FAILED		(1 << __REQ_FAILED)
 #define REQ_QUIET		(1 << __REQ_QUIET)
 #define REQ_PREEMPT		(1 << __REQ_PREEMPT)
-#define REQ_ORDERED_COLOR	(1 << __REQ_ORDERED_COLOR)
 #define REQ_ALLOCED		(1 << __REQ_ALLOCED)
 #define REQ_COPY_USER		(1 << __REQ_COPY_USER)
 #define REQ_INTEGRITY		(1 << __REQ_INTEGRITY)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 996549d71923..20a3710a481b 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -360,9 +360,10 @@ struct request_queue
 	unsigned int		flush_flags;
 
 	unsigned int		ordered, next_ordered, ordseq;
-	int			orderr, ordcolor;
-	struct request		pre_flush_rq, bar_rq, post_flush_rq;
+	int			orderr;
+	struct request		bar_rq;
 	struct request		*orig_bar_rq;
+	struct list_head	pending_barriers;
 
 	struct mutex		sysfs_lock;
 
@@ -491,12 +492,11 @@ enum {
 	/*
 	 * Ordered operation sequence
 	 */
-	QUEUE_ORDSEQ_STARTED	= 0x01,	/* flushing in progress */
-	QUEUE_ORDSEQ_DRAIN	= 0x02,	/* waiting for the queue to be drained */
-	QUEUE_ORDSEQ_PREFLUSH	= 0x04,	/* pre-flushing in progress */
-	QUEUE_ORDSEQ_BAR	= 0x08,	/* original barrier req in progress */
-	QUEUE_ORDSEQ_POSTFLUSH	= 0x10,	/* post-flushing in progress */
-	QUEUE_ORDSEQ_DONE	= 0x20,
+	QUEUE_ORDSEQ_STARTED	= (1 << 0), /* flushing in progress */
+	QUEUE_ORDSEQ_PREFLUSH	= (1 << 1), /* pre-flushing in progress */
+	QUEUE_ORDSEQ_BAR	= (1 << 2), /* barrier write in progress */
+	QUEUE_ORDSEQ_POSTFLUSH	= (1 << 3), /* post-flushing in progress */
+	QUEUE_ORDSEQ_DONE	= (1 << 4),
 };
 
 #define blk_queue_plugged(q)	test_bit(QUEUE_FLAG_PLUGGED, &(q)->queue_flags)
@@ -869,9 +869,6 @@ extern void blk_queue_rq_timed_out(struct request_queue *, rq_timed_out_fn *);
 extern void blk_queue_rq_timeout(struct request_queue *, unsigned int);
 extern void blk_queue_flush(struct request_queue *q, unsigned int flush);
 extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev);
-extern unsigned blk_ordered_cur_seq(struct request_queue *);
-extern unsigned blk_ordered_req_seq(struct request *);
-extern bool blk_ordered_complete_seq(struct request_queue *, unsigned, int);
 
 extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *);
 extern void blk_dump_rq_flags(struct request *, char *);
-- 
cgit v1.2.3


From 8839a0e055d9abd6c011d533373a8dd266cad011 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 3 Sep 2010 11:56:16 +0200
Subject: block: rename blk-barrier.c to blk-flush.c

Without ordering requirements, barrier and ordering are minomers.
Rename block/blk-barrier.c to block/blk-flush.c.  Rename of symbols
will follow.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/Makefile      |   2 +-
 block/blk-barrier.c | 248 ----------------------------------------------------
 block/blk-flush.c   | 248 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 249 insertions(+), 249 deletions(-)
 delete mode 100644 block/blk-barrier.c
 create mode 100644 block/blk-flush.c

diff --git a/block/Makefile b/block/Makefile
index 0bb499a739cd..f627e4b1a9da 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -3,7 +3,7 @@
 #
 
 obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \
-			blk-barrier.o blk-settings.o blk-ioc.o blk-map.o \
+			blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
 			blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
 			blk-iopoll.o blk-lib.o ioctl.o genhd.o scsi_ioctl.o
 
diff --git a/block/blk-barrier.c b/block/blk-barrier.c
deleted file mode 100644
index e8b2e5c091b1..000000000000
--- a/block/blk-barrier.c
+++ /dev/null
@@ -1,248 +0,0 @@
-/*
- * Functions related to barrier IO handling
- */
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/bio.h>
-#include <linux/blkdev.h>
-#include <linux/gfp.h>
-
-#include "blk.h"
-
-static struct request *queue_next_ordseq(struct request_queue *q);
-
-/*
- * Cache flushing for ordered writes handling
- */
-unsigned blk_ordered_cur_seq(struct request_queue *q)
-{
-	if (!q->ordseq)
-		return 0;
-	return 1 << ffz(q->ordseq);
-}
-
-static struct request *blk_ordered_complete_seq(struct request_queue *q,
-						unsigned seq, int error)
-{
-	struct request *next_rq = NULL;
-
-	if (error && !q->orderr)
-		q->orderr = error;
-
-	BUG_ON(q->ordseq & seq);
-	q->ordseq |= seq;
-
-	if (blk_ordered_cur_seq(q) != QUEUE_ORDSEQ_DONE) {
-		/* not complete yet, queue the next ordered sequence */
-		next_rq = queue_next_ordseq(q);
-	} else {
-		/* complete this barrier request */
-		__blk_end_request_all(q->orig_bar_rq, q->orderr);
-		q->orig_bar_rq = NULL;
-		q->ordseq = 0;
-
-		/* dispatch the next barrier if there's one */
-		if (!list_empty(&q->pending_barriers)) {
-			next_rq = list_entry_rq(q->pending_barriers.next);
-			list_move(&next_rq->queuelist, &q->queue_head);
-		}
-	}
-	return next_rq;
-}
-
-static void pre_flush_end_io(struct request *rq, int error)
-{
-	elv_completed_request(rq->q, rq);
-	blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_PREFLUSH, error);
-}
-
-static void bar_end_io(struct request *rq, int error)
-{
-	elv_completed_request(rq->q, rq);
-	blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_BAR, error);
-}
-
-static void post_flush_end_io(struct request *rq, int error)
-{
-	elv_completed_request(rq->q, rq);
-	blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_POSTFLUSH, error);
-}
-
-static void queue_flush(struct request_queue *q, struct request *rq,
-			rq_end_io_fn *end_io)
-{
-	blk_rq_init(q, rq);
-	rq->cmd_type = REQ_TYPE_FS;
-	rq->cmd_flags = REQ_FLUSH;
-	rq->rq_disk = q->orig_bar_rq->rq_disk;
-	rq->end_io = end_io;
-
-	elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
-}
-
-static struct request *queue_next_ordseq(struct request_queue *q)
-{
-	struct request *rq = &q->bar_rq;
-
-	switch (blk_ordered_cur_seq(q)) {
-	case QUEUE_ORDSEQ_PREFLUSH:
-		queue_flush(q, rq, pre_flush_end_io);
-		break;
-
-	case QUEUE_ORDSEQ_BAR:
-		/* initialize proxy request and queue it */
-		blk_rq_init(q, rq);
-		init_request_from_bio(rq, q->orig_bar_rq->bio);
-		rq->cmd_flags &= ~REQ_HARDBARRIER;
-		if (q->ordered & QUEUE_ORDERED_DO_FUA)
-			rq->cmd_flags |= REQ_FUA;
-		rq->end_io = bar_end_io;
-
-		elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
-		break;
-
-	case QUEUE_ORDSEQ_POSTFLUSH:
-		queue_flush(q, rq, post_flush_end_io);
-		break;
-
-	default:
-		BUG();
-	}
-	return rq;
-}
-
-struct request *blk_do_ordered(struct request_queue *q, struct request *rq)
-{
-	unsigned skip = 0;
-
-	if (!(rq->cmd_flags & REQ_HARDBARRIER))
-		return rq;
-
-	if (q->ordseq) {
-		/*
-		 * Barrier is already in progress and they can't be
-		 * processed in parallel.  Queue for later processing.
-		 */
-		list_move_tail(&rq->queuelist, &q->pending_barriers);
-		return NULL;
-	}
-
-	if (unlikely(q->next_ordered == QUEUE_ORDERED_NONE)) {
-		/*
-		 * Queue ordering not supported.  Terminate
-		 * with prejudice.
-		 */
-		blk_dequeue_request(rq);
-		__blk_end_request_all(rq, -EOPNOTSUPP);
-		return NULL;
-	}
-
-	/*
-	 * Start a new ordered sequence
-	 */
-	q->orderr = 0;
-	q->ordered = q->next_ordered;
-	q->ordseq |= QUEUE_ORDSEQ_STARTED;
-
-	/*
-	 * For an empty barrier, there's no actual BAR request, which
-	 * in turn makes POSTFLUSH unnecessary.  Mask them off.
-	 */
-	if (!blk_rq_sectors(rq))
-		q->ordered &= ~(QUEUE_ORDERED_DO_BAR |
-				QUEUE_ORDERED_DO_POSTFLUSH);
-
-	/* stash away the original request */
-	blk_dequeue_request(rq);
-	q->orig_bar_rq = rq;
-
-	if (!(q->ordered & QUEUE_ORDERED_DO_PREFLUSH))
-		skip |= QUEUE_ORDSEQ_PREFLUSH;
-
-	if (!(q->ordered & QUEUE_ORDERED_DO_BAR))
-		skip |= QUEUE_ORDSEQ_BAR;
-
-	if (!(q->ordered & QUEUE_ORDERED_DO_POSTFLUSH))
-		skip |= QUEUE_ORDSEQ_POSTFLUSH;
-
-	/* complete skipped sequences and return the first sequence */
-	return blk_ordered_complete_seq(q, skip, 0);
-}
-
-static void bio_end_empty_barrier(struct bio *bio, int err)
-{
-	if (err) {
-		if (err == -EOPNOTSUPP)
-			set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
-		clear_bit(BIO_UPTODATE, &bio->bi_flags);
-	}
-	if (bio->bi_private)
-		complete(bio->bi_private);
-	bio_put(bio);
-}
-
-/**
- * blkdev_issue_flush - queue a flush
- * @bdev:	blockdev to issue flush for
- * @gfp_mask:	memory allocation flags (for bio_alloc)
- * @error_sector:	error sector
- * @flags:	BLKDEV_IFL_* flags to control behaviour
- *
- * Description:
- *    Issue a flush for the block device in question. Caller can supply
- *    room for storing the error offset in case of a flush error, if they
- *    wish to. If WAIT flag is not passed then caller may check only what
- *    request was pushed in some internal queue for later handling.
- */
-int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
-		sector_t *error_sector, unsigned long flags)
-{
-	DECLARE_COMPLETION_ONSTACK(wait);
-	struct request_queue *q;
-	struct bio *bio;
-	int ret = 0;
-
-	if (bdev->bd_disk == NULL)
-		return -ENXIO;
-
-	q = bdev_get_queue(bdev);
-	if (!q)
-		return -ENXIO;
-
-	/*
-	 * some block devices may not have their queue correctly set up here
-	 * (e.g. loop device without a backing file) and so issuing a flush
-	 * here will panic. Ensure there is a request function before issuing
-	 * the barrier.
-	 */
-	if (!q->make_request_fn)
-		return -ENXIO;
-
-	bio = bio_alloc(gfp_mask, 0);
-	bio->bi_end_io = bio_end_empty_barrier;
-	bio->bi_bdev = bdev;
-	if (test_bit(BLKDEV_WAIT, &flags))
-		bio->bi_private = &wait;
-
-	bio_get(bio);
-	submit_bio(WRITE_BARRIER, bio);
-	if (test_bit(BLKDEV_WAIT, &flags)) {
-		wait_for_completion(&wait);
-		/*
-		 * The driver must store the error location in ->bi_sector, if
-		 * it supports it. For non-stacked drivers, this should be
-		 * copied from blk_rq_pos(rq).
-		 */
-		if (error_sector)
-			*error_sector = bio->bi_sector;
-	}
-
-	if (bio_flagged(bio, BIO_EOPNOTSUPP))
-		ret = -EOPNOTSUPP;
-	else if (!bio_flagged(bio, BIO_UPTODATE))
-		ret = -EIO;
-
-	bio_put(bio);
-	return ret;
-}
-EXPORT_SYMBOL(blkdev_issue_flush);
diff --git a/block/blk-flush.c b/block/blk-flush.c
new file mode 100644
index 000000000000..e8b2e5c091b1
--- /dev/null
+++ b/block/blk-flush.c
@@ -0,0 +1,248 @@
+/*
+ * Functions related to barrier IO handling
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/gfp.h>
+
+#include "blk.h"
+
+static struct request *queue_next_ordseq(struct request_queue *q);
+
+/*
+ * Cache flushing for ordered writes handling
+ */
+unsigned blk_ordered_cur_seq(struct request_queue *q)
+{
+	if (!q->ordseq)
+		return 0;
+	return 1 << ffz(q->ordseq);
+}
+
+static struct request *blk_ordered_complete_seq(struct request_queue *q,
+						unsigned seq, int error)
+{
+	struct request *next_rq = NULL;
+
+	if (error && !q->orderr)
+		q->orderr = error;
+
+	BUG_ON(q->ordseq & seq);
+	q->ordseq |= seq;
+
+	if (blk_ordered_cur_seq(q) != QUEUE_ORDSEQ_DONE) {
+		/* not complete yet, queue the next ordered sequence */
+		next_rq = queue_next_ordseq(q);
+	} else {
+		/* complete this barrier request */
+		__blk_end_request_all(q->orig_bar_rq, q->orderr);
+		q->orig_bar_rq = NULL;
+		q->ordseq = 0;
+
+		/* dispatch the next barrier if there's one */
+		if (!list_empty(&q->pending_barriers)) {
+			next_rq = list_entry_rq(q->pending_barriers.next);
+			list_move(&next_rq->queuelist, &q->queue_head);
+		}
+	}
+	return next_rq;
+}
+
+static void pre_flush_end_io(struct request *rq, int error)
+{
+	elv_completed_request(rq->q, rq);
+	blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_PREFLUSH, error);
+}
+
+static void bar_end_io(struct request *rq, int error)
+{
+	elv_completed_request(rq->q, rq);
+	blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_BAR, error);
+}
+
+static void post_flush_end_io(struct request *rq, int error)
+{
+	elv_completed_request(rq->q, rq);
+	blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_POSTFLUSH, error);
+}
+
+static void queue_flush(struct request_queue *q, struct request *rq,
+			rq_end_io_fn *end_io)
+{
+	blk_rq_init(q, rq);
+	rq->cmd_type = REQ_TYPE_FS;
+	rq->cmd_flags = REQ_FLUSH;
+	rq->rq_disk = q->orig_bar_rq->rq_disk;
+	rq->end_io = end_io;
+
+	elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
+}
+
+static struct request *queue_next_ordseq(struct request_queue *q)
+{
+	struct request *rq = &q->bar_rq;
+
+	switch (blk_ordered_cur_seq(q)) {
+	case QUEUE_ORDSEQ_PREFLUSH:
+		queue_flush(q, rq, pre_flush_end_io);
+		break;
+
+	case QUEUE_ORDSEQ_BAR:
+		/* initialize proxy request and queue it */
+		blk_rq_init(q, rq);
+		init_request_from_bio(rq, q->orig_bar_rq->bio);
+		rq->cmd_flags &= ~REQ_HARDBARRIER;
+		if (q->ordered & QUEUE_ORDERED_DO_FUA)
+			rq->cmd_flags |= REQ_FUA;
+		rq->end_io = bar_end_io;
+
+		elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
+		break;
+
+	case QUEUE_ORDSEQ_POSTFLUSH:
+		queue_flush(q, rq, post_flush_end_io);
+		break;
+
+	default:
+		BUG();
+	}
+	return rq;
+}
+
+struct request *blk_do_ordered(struct request_queue *q, struct request *rq)
+{
+	unsigned skip = 0;
+
+	if (!(rq->cmd_flags & REQ_HARDBARRIER))
+		return rq;
+
+	if (q->ordseq) {
+		/*
+		 * Barrier is already in progress and they can't be
+		 * processed in parallel.  Queue for later processing.
+		 */
+		list_move_tail(&rq->queuelist, &q->pending_barriers);
+		return NULL;
+	}
+
+	if (unlikely(q->next_ordered == QUEUE_ORDERED_NONE)) {
+		/*
+		 * Queue ordering not supported.  Terminate
+		 * with prejudice.
+		 */
+		blk_dequeue_request(rq);
+		__blk_end_request_all(rq, -EOPNOTSUPP);
+		return NULL;
+	}
+
+	/*
+	 * Start a new ordered sequence
+	 */
+	q->orderr = 0;
+	q->ordered = q->next_ordered;
+	q->ordseq |= QUEUE_ORDSEQ_STARTED;
+
+	/*
+	 * For an empty barrier, there's no actual BAR request, which
+	 * in turn makes POSTFLUSH unnecessary.  Mask them off.
+	 */
+	if (!blk_rq_sectors(rq))
+		q->ordered &= ~(QUEUE_ORDERED_DO_BAR |
+				QUEUE_ORDERED_DO_POSTFLUSH);
+
+	/* stash away the original request */
+	blk_dequeue_request(rq);
+	q->orig_bar_rq = rq;
+
+	if (!(q->ordered & QUEUE_ORDERED_DO_PREFLUSH))
+		skip |= QUEUE_ORDSEQ_PREFLUSH;
+
+	if (!(q->ordered & QUEUE_ORDERED_DO_BAR))
+		skip |= QUEUE_ORDSEQ_BAR;
+
+	if (!(q->ordered & QUEUE_ORDERED_DO_POSTFLUSH))
+		skip |= QUEUE_ORDSEQ_POSTFLUSH;
+
+	/* complete skipped sequences and return the first sequence */
+	return blk_ordered_complete_seq(q, skip, 0);
+}
+
+static void bio_end_empty_barrier(struct bio *bio, int err)
+{
+	if (err) {
+		if (err == -EOPNOTSUPP)
+			set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
+		clear_bit(BIO_UPTODATE, &bio->bi_flags);
+	}
+	if (bio->bi_private)
+		complete(bio->bi_private);
+	bio_put(bio);
+}
+
+/**
+ * blkdev_issue_flush - queue a flush
+ * @bdev:	blockdev to issue flush for
+ * @gfp_mask:	memory allocation flags (for bio_alloc)
+ * @error_sector:	error sector
+ * @flags:	BLKDEV_IFL_* flags to control behaviour
+ *
+ * Description:
+ *    Issue a flush for the block device in question. Caller can supply
+ *    room for storing the error offset in case of a flush error, if they
+ *    wish to. If WAIT flag is not passed then caller may check only what
+ *    request was pushed in some internal queue for later handling.
+ */
+int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
+		sector_t *error_sector, unsigned long flags)
+{
+	DECLARE_COMPLETION_ONSTACK(wait);
+	struct request_queue *q;
+	struct bio *bio;
+	int ret = 0;
+
+	if (bdev->bd_disk == NULL)
+		return -ENXIO;
+
+	q = bdev_get_queue(bdev);
+	if (!q)
+		return -ENXIO;
+
+	/*
+	 * some block devices may not have their queue correctly set up here
+	 * (e.g. loop device without a backing file) and so issuing a flush
+	 * here will panic. Ensure there is a request function before issuing
+	 * the barrier.
+	 */
+	if (!q->make_request_fn)
+		return -ENXIO;
+
+	bio = bio_alloc(gfp_mask, 0);
+	bio->bi_end_io = bio_end_empty_barrier;
+	bio->bi_bdev = bdev;
+	if (test_bit(BLKDEV_WAIT, &flags))
+		bio->bi_private = &wait;
+
+	bio_get(bio);
+	submit_bio(WRITE_BARRIER, bio);
+	if (test_bit(BLKDEV_WAIT, &flags)) {
+		wait_for_completion(&wait);
+		/*
+		 * The driver must store the error location in ->bi_sector, if
+		 * it supports it. For non-stacked drivers, this should be
+		 * copied from blk_rq_pos(rq).
+		 */
+		if (error_sector)
+			*error_sector = bio->bi_sector;
+	}
+
+	if (bio_flagged(bio, BIO_EOPNOTSUPP))
+		ret = -EOPNOTSUPP;
+	else if (!bio_flagged(bio, BIO_UPTODATE))
+		ret = -EIO;
+
+	bio_put(bio);
+	return ret;
+}
+EXPORT_SYMBOL(blkdev_issue_flush);
-- 
cgit v1.2.3


From dd4c133f387c48f526022860ad70354637a80f4c Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 3 Sep 2010 11:56:16 +0200
Subject: block: rename barrier/ordered to flush

With ordering requirements dropped, barrier and ordered are misnomers.
Now all block layer does is sequencing FLUSH and FUA.  Rename them to
flush.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-core.c       | 21 ++++++-----
 block/blk-flush.c      | 98 +++++++++++++++++++++++++-------------------------
 block/blk.h            |  4 +--
 include/linux/blkdev.h | 24 ++++++-------
 4 files changed, 72 insertions(+), 75 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index d316662682c8..8870ae40179d 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -136,7 +136,7 @@ static void req_bio_endio(struct request *rq, struct bio *bio,
 {
 	struct request_queue *q = rq->q;
 
-	if (&q->bar_rq != rq) {
+	if (&q->flush_rq != rq) {
 		if (error)
 			clear_bit(BIO_UPTODATE, &bio->bi_flags);
 		else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
@@ -160,13 +160,12 @@ static void req_bio_endio(struct request *rq, struct bio *bio,
 		if (bio->bi_size == 0)
 			bio_endio(bio, error);
 	} else {
-
 		/*
-		 * Okay, this is the barrier request in progress, just
-		 * record the error;
+		 * Okay, this is the sequenced flush request in
+		 * progress, just record the error;
 		 */
-		if (error && !q->orderr)
-			q->orderr = error;
+		if (error && !q->flush_err)
+			q->flush_err = error;
 	}
 }
 
@@ -520,7 +519,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 	init_timer(&q->unplug_timer);
 	setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
 	INIT_LIST_HEAD(&q->timeout_list);
-	INIT_LIST_HEAD(&q->pending_barriers);
+	INIT_LIST_HEAD(&q->pending_flushes);
 	INIT_WORK(&q->unplug_work, blk_unplug_work);
 
 	kobject_init(&q->kobj, &blk_queue_ktype);
@@ -1764,11 +1763,11 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes)
 static void blk_account_io_done(struct request *req)
 {
 	/*
-	 * Account IO completion.  bar_rq isn't accounted as a normal
-	 * IO on queueing nor completion.  Accounting the containing
-	 * request is enough.
+	 * Account IO completion.  flush_rq isn't accounted as a
+	 * normal IO on queueing nor completion.  Accounting the
+	 * containing request is enough.
 	 */
-	if (blk_do_io_stat(req) && req != &req->q->bar_rq) {
+	if (blk_do_io_stat(req) && req != &req->q->flush_rq) {
 		unsigned long duration = jiffies - req->start_time;
 		const int rw = rq_data_dir(req);
 		struct hd_struct *part;
diff --git a/block/blk-flush.c b/block/blk-flush.c
index e8b2e5c091b1..dd873225da97 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -9,41 +9,38 @@
 
 #include "blk.h"
 
-static struct request *queue_next_ordseq(struct request_queue *q);
+static struct request *queue_next_fseq(struct request_queue *q);
 
-/*
- * Cache flushing for ordered writes handling
- */
-unsigned blk_ordered_cur_seq(struct request_queue *q)
+unsigned blk_flush_cur_seq(struct request_queue *q)
 {
-	if (!q->ordseq)
+	if (!q->flush_seq)
 		return 0;
-	return 1 << ffz(q->ordseq);
+	return 1 << ffz(q->flush_seq);
 }
 
-static struct request *blk_ordered_complete_seq(struct request_queue *q,
-						unsigned seq, int error)
+static struct request *blk_flush_complete_seq(struct request_queue *q,
+					      unsigned seq, int error)
 {
 	struct request *next_rq = NULL;
 
-	if (error && !q->orderr)
-		q->orderr = error;
+	if (error && !q->flush_err)
+		q->flush_err = error;
 
-	BUG_ON(q->ordseq & seq);
-	q->ordseq |= seq;
+	BUG_ON(q->flush_seq & seq);
+	q->flush_seq |= seq;
 
-	if (blk_ordered_cur_seq(q) != QUEUE_ORDSEQ_DONE) {
-		/* not complete yet, queue the next ordered sequence */
-		next_rq = queue_next_ordseq(q);
+	if (blk_flush_cur_seq(q) != QUEUE_FSEQ_DONE) {
+		/* not complete yet, queue the next flush sequence */
+		next_rq = queue_next_fseq(q);
 	} else {
-		/* complete this barrier request */
-		__blk_end_request_all(q->orig_bar_rq, q->orderr);
-		q->orig_bar_rq = NULL;
-		q->ordseq = 0;
-
-		/* dispatch the next barrier if there's one */
-		if (!list_empty(&q->pending_barriers)) {
-			next_rq = list_entry_rq(q->pending_barriers.next);
+		/* complete this flush request */
+		__blk_end_request_all(q->orig_flush_rq, q->flush_err);
+		q->orig_flush_rq = NULL;
+		q->flush_seq = 0;
+
+		/* dispatch the next flush if there's one */
+		if (!list_empty(&q->pending_flushes)) {
+			next_rq = list_entry_rq(q->pending_flushes.next);
 			list_move(&next_rq->queuelist, &q->queue_head);
 		}
 	}
@@ -53,19 +50,19 @@ static struct request *blk_ordered_complete_seq(struct request_queue *q,
 static void pre_flush_end_io(struct request *rq, int error)
 {
 	elv_completed_request(rq->q, rq);
-	blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_PREFLUSH, error);
+	blk_flush_complete_seq(rq->q, QUEUE_FSEQ_PREFLUSH, error);
 }
 
-static void bar_end_io(struct request *rq, int error)
+static void flush_data_end_io(struct request *rq, int error)
 {
 	elv_completed_request(rq->q, rq);
-	blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_BAR, error);
+	blk_flush_complete_seq(rq->q, QUEUE_FSEQ_DATA, error);
 }
 
 static void post_flush_end_io(struct request *rq, int error)
 {
 	elv_completed_request(rq->q, rq);
-	blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_POSTFLUSH, error);
+	blk_flush_complete_seq(rq->q, QUEUE_FSEQ_POSTFLUSH, error);
 }
 
 static void queue_flush(struct request_queue *q, struct request *rq,
@@ -74,34 +71,34 @@ static void queue_flush(struct request_queue *q, struct request *rq,
 	blk_rq_init(q, rq);
 	rq->cmd_type = REQ_TYPE_FS;
 	rq->cmd_flags = REQ_FLUSH;
-	rq->rq_disk = q->orig_bar_rq->rq_disk;
+	rq->rq_disk = q->orig_flush_rq->rq_disk;
 	rq->end_io = end_io;
 
 	elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
 }
 
-static struct request *queue_next_ordseq(struct request_queue *q)
+static struct request *queue_next_fseq(struct request_queue *q)
 {
-	struct request *rq = &q->bar_rq;
+	struct request *rq = &q->flush_rq;
 
-	switch (blk_ordered_cur_seq(q)) {
-	case QUEUE_ORDSEQ_PREFLUSH:
+	switch (blk_flush_cur_seq(q)) {
+	case QUEUE_FSEQ_PREFLUSH:
 		queue_flush(q, rq, pre_flush_end_io);
 		break;
 
-	case QUEUE_ORDSEQ_BAR:
+	case QUEUE_FSEQ_DATA:
 		/* initialize proxy request and queue it */
 		blk_rq_init(q, rq);
-		init_request_from_bio(rq, q->orig_bar_rq->bio);
+		init_request_from_bio(rq, q->orig_flush_rq->bio);
 		rq->cmd_flags &= ~REQ_HARDBARRIER;
 		if (q->ordered & QUEUE_ORDERED_DO_FUA)
 			rq->cmd_flags |= REQ_FUA;
-		rq->end_io = bar_end_io;
+		rq->end_io = flush_data_end_io;
 
 		elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
 		break;
 
-	case QUEUE_ORDSEQ_POSTFLUSH:
+	case QUEUE_FSEQ_POSTFLUSH:
 		queue_flush(q, rq, post_flush_end_io);
 		break;
 
@@ -111,19 +108,20 @@ static struct request *queue_next_ordseq(struct request_queue *q)
 	return rq;
 }
 
-struct request *blk_do_ordered(struct request_queue *q, struct request *rq)
+struct request *blk_do_flush(struct request_queue *q, struct request *rq)
 {
 	unsigned skip = 0;
 
 	if (!(rq->cmd_flags & REQ_HARDBARRIER))
 		return rq;
 
-	if (q->ordseq) {
+	if (q->flush_seq) {
 		/*
-		 * Barrier is already in progress and they can't be
-		 * processed in parallel.  Queue for later processing.
+		 * Sequenced flush is already in progress and they
+		 * can't be processed in parallel.  Queue for later
+		 * processing.
 		 */
-		list_move_tail(&rq->queuelist, &q->pending_barriers);
+		list_move_tail(&rq->queuelist, &q->pending_flushes);
 		return NULL;
 	}
 
@@ -138,11 +136,11 @@ struct request *blk_do_ordered(struct request_queue *q, struct request *rq)
 	}
 
 	/*
-	 * Start a new ordered sequence
+	 * Start a new flush sequence
 	 */
-	q->orderr = 0;
+	q->flush_err = 0;
 	q->ordered = q->next_ordered;
-	q->ordseq |= QUEUE_ORDSEQ_STARTED;
+	q->flush_seq |= QUEUE_FSEQ_STARTED;
 
 	/*
 	 * For an empty barrier, there's no actual BAR request, which
@@ -154,19 +152,19 @@ struct request *blk_do_ordered(struct request_queue *q, struct request *rq)
 
 	/* stash away the original request */
 	blk_dequeue_request(rq);
-	q->orig_bar_rq = rq;
+	q->orig_flush_rq = rq;
 
 	if (!(q->ordered & QUEUE_ORDERED_DO_PREFLUSH))
-		skip |= QUEUE_ORDSEQ_PREFLUSH;
+		skip |= QUEUE_FSEQ_PREFLUSH;
 
 	if (!(q->ordered & QUEUE_ORDERED_DO_BAR))
-		skip |= QUEUE_ORDSEQ_BAR;
+		skip |= QUEUE_FSEQ_DATA;
 
 	if (!(q->ordered & QUEUE_ORDERED_DO_POSTFLUSH))
-		skip |= QUEUE_ORDSEQ_POSTFLUSH;
+		skip |= QUEUE_FSEQ_POSTFLUSH;
 
 	/* complete skipped sequences and return the first sequence */
-	return blk_ordered_complete_seq(q, skip, 0);
+	return blk_flush_complete_seq(q, skip, 0);
 }
 
 static void bio_end_empty_barrier(struct bio *bio, int err)
diff --git a/block/blk.h b/block/blk.h
index 08081e4b294e..24b92bd78f37 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -51,7 +51,7 @@ static inline void blk_clear_rq_complete(struct request *rq)
  */
 #define ELV_ON_HASH(rq)		(!hlist_unhashed(&(rq)->hash))
 
-struct request *blk_do_ordered(struct request_queue *q, struct request *rq);
+struct request *blk_do_flush(struct request_queue *q, struct request *rq);
 
 static inline struct request *__elv_next_request(struct request_queue *q)
 {
@@ -60,7 +60,7 @@ static inline struct request *__elv_next_request(struct request_queue *q)
 	while (1) {
 		while (!list_empty(&q->queue_head)) {
 			rq = list_entry_rq(q->queue_head.next);
-			rq = blk_do_ordered(q, rq);
+			rq = blk_do_flush(q, rq);
 			if (rq)
 				return rq;
 		}
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 20a3710a481b..1cd83ec077db 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -357,13 +357,13 @@ struct request_queue
 	/*
 	 * for flush operations
 	 */
+	unsigned int		ordered, next_ordered;
 	unsigned int		flush_flags;
-
-	unsigned int		ordered, next_ordered, ordseq;
-	int			orderr;
-	struct request		bar_rq;
-	struct request		*orig_bar_rq;
-	struct list_head	pending_barriers;
+	unsigned int		flush_seq;
+	int			flush_err;
+	struct request		flush_rq;
+	struct request		*orig_flush_rq;
+	struct list_head	pending_flushes;
 
 	struct mutex		sysfs_lock;
 
@@ -490,13 +490,13 @@ enum {
 					  QUEUE_ORDERED_DO_FUA,
 
 	/*
-	 * Ordered operation sequence
+	 * FLUSH/FUA sequences.
 	 */
-	QUEUE_ORDSEQ_STARTED	= (1 << 0), /* flushing in progress */
-	QUEUE_ORDSEQ_PREFLUSH	= (1 << 1), /* pre-flushing in progress */
-	QUEUE_ORDSEQ_BAR	= (1 << 2), /* barrier write in progress */
-	QUEUE_ORDSEQ_POSTFLUSH	= (1 << 3), /* post-flushing in progress */
-	QUEUE_ORDSEQ_DONE	= (1 << 4),
+	QUEUE_FSEQ_STARTED	= (1 << 0), /* flushing in progress */
+	QUEUE_FSEQ_PREFLUSH	= (1 << 1), /* pre-flushing in progress */
+	QUEUE_FSEQ_DATA		= (1 << 2), /* data write in progress */
+	QUEUE_FSEQ_POSTFLUSH	= (1 << 3), /* post-flushing in progress */
+	QUEUE_FSEQ_DONE		= (1 << 4),
 };
 
 #define blk_queue_plugged(q)	test_bit(QUEUE_FLAG_PLUGGED, &(q)->queue_flags)
-- 
cgit v1.2.3


From 4fed947cb311e5aa51781d316cefca836352f6ce Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 3 Sep 2010 11:56:17 +0200
Subject: block: implement REQ_FLUSH/FUA based interface for FLUSH/FUA requests

Now that the backend conversion is complete, export sequenced
FLUSH/FUA capability through REQ_FLUSH/FUA flags.  REQ_FLUSH means the
device cache should be flushed before executing the request.  REQ_FUA
means that the data in the request should be on non-volatile media on
completion.

Block layer will choose the correct way of implementing the semantics
and execute it.  The request may be passed to the device directly if
the device can handle it; otherwise, it will be sequenced using one or
more proxy requests.  Devices will never see REQ_FLUSH and/or FUA
which it doesn't support.

Also, unlike the original REQ_HARDBARRIER, REQ_FLUSH/FUA requests are
never failed with -EOPNOTSUPP.  If the underlying device doesn't
support FLUSH/FUA, the block layer simply make those noop.  IOW, it no
longer distinguishes between writeback cache which doesn't support
cache flush and writethrough/no cache.  Devices which have WB cache
w/o flush are very difficult to come by these days and there's nothing
much we can do anyway, so it doesn't make sense to require everyone to
implement -EOPNOTSUPP handling.  This will simplify filesystems and
block drivers as they can drop -EOPNOTSUPP retry logic for barriers.

* QUEUE_ORDERED_* are removed and QUEUE_FSEQ_* are moved into
  blk-flush.c.

* REQ_FLUSH w/o data can also be directly passed to drivers without
  sequencing but some drivers assume that zero length requests don't
  have rq->bio which isn't true for these requests requiring the use
  of proxy requests.

* REQ_COMMON_MASK now includes REQ_FLUSH | REQ_FUA so that they are
  copied from bio to request.

* WRITE_BARRIER is marked deprecated and WRITE_FLUSH, WRITE_FUA and
  WRITE_FLUSH_FUA are added.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-core.c            |  2 +-
 block/blk-flush.c           | 85 ++++++++++++++++++++++++---------------------
 block/blk.h                 |  3 ++
 include/linux/blk_types.h   |  2 +-
 include/linux/blkdev.h      | 38 ++------------------
 include/linux/buffer_head.h |  2 +-
 include/linux/fs.h          | 19 ++++++----
 7 files changed, 67 insertions(+), 84 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 8870ae40179d..18455c4f618a 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1204,7 +1204,7 @@ static int __make_request(struct request_queue *q, struct bio *bio)
 
 	spin_lock_irq(q->queue_lock);
 
-	if (bio->bi_rw & REQ_HARDBARRIER) {
+	if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
 		where = ELEVATOR_INSERT_FRONT;
 		goto get_rq;
 	}
diff --git a/block/blk-flush.c b/block/blk-flush.c
index dd873225da97..452c552e9ead 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -1,5 +1,5 @@
 /*
- * Functions related to barrier IO handling
+ * Functions to sequence FLUSH and FUA writes.
  */
 #include <linux/kernel.h>
 #include <linux/module.h>
@@ -9,6 +9,15 @@
 
 #include "blk.h"
 
+/* FLUSH/FUA sequences */
+enum {
+	QUEUE_FSEQ_STARTED	= (1 << 0), /* flushing in progress */
+	QUEUE_FSEQ_PREFLUSH	= (1 << 1), /* pre-flushing in progress */
+	QUEUE_FSEQ_DATA		= (1 << 2), /* data write in progress */
+	QUEUE_FSEQ_POSTFLUSH	= (1 << 3), /* post-flushing in progress */
+	QUEUE_FSEQ_DONE		= (1 << 4),
+};
+
 static struct request *queue_next_fseq(struct request_queue *q);
 
 unsigned blk_flush_cur_seq(struct request_queue *q)
@@ -79,6 +88,7 @@ static void queue_flush(struct request_queue *q, struct request *rq,
 
 static struct request *queue_next_fseq(struct request_queue *q)
 {
+	struct request *orig_rq = q->orig_flush_rq;
 	struct request *rq = &q->flush_rq;
 
 	switch (blk_flush_cur_seq(q)) {
@@ -87,12 +97,11 @@ static struct request *queue_next_fseq(struct request_queue *q)
 		break;
 
 	case QUEUE_FSEQ_DATA:
-		/* initialize proxy request and queue it */
+		/* initialize proxy request, inherit FLUSH/FUA and queue it */
 		blk_rq_init(q, rq);
-		init_request_from_bio(rq, q->orig_flush_rq->bio);
-		rq->cmd_flags &= ~REQ_HARDBARRIER;
-		if (q->ordered & QUEUE_ORDERED_DO_FUA)
-			rq->cmd_flags |= REQ_FUA;
+		init_request_from_bio(rq, orig_rq->bio);
+		rq->cmd_flags &= ~(REQ_FLUSH | REQ_FUA);
+		rq->cmd_flags |= orig_rq->cmd_flags & (REQ_FLUSH | REQ_FUA);
 		rq->end_io = flush_data_end_io;
 
 		elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
@@ -110,60 +119,58 @@ static struct request *queue_next_fseq(struct request_queue *q)
 
 struct request *blk_do_flush(struct request_queue *q, struct request *rq)
 {
+	unsigned int fflags = q->flush_flags; /* may change, cache it */
+	bool has_flush = fflags & REQ_FLUSH, has_fua = fflags & REQ_FUA;
+	bool do_preflush = has_flush && (rq->cmd_flags & REQ_FLUSH);
+	bool do_postflush = has_flush && !has_fua && (rq->cmd_flags & REQ_FUA);
 	unsigned skip = 0;
 
-	if (!(rq->cmd_flags & REQ_HARDBARRIER))
+	/*
+	 * Special case.  If there's data but flush is not necessary,
+	 * the request can be issued directly.
+	 *
+	 * Flush w/o data should be able to be issued directly too but
+	 * currently some drivers assume that rq->bio contains
+	 * non-zero data if it isn't NULL and empty FLUSH requests
+	 * getting here usually have bio's without data.
+	 */
+	if (blk_rq_sectors(rq) && !do_preflush && !do_postflush) {
+		rq->cmd_flags &= ~REQ_FLUSH;
+		if (!has_fua)
+			rq->cmd_flags &= ~REQ_FUA;
 		return rq;
+	}
 
+	/*
+	 * Sequenced flushes can't be processed in parallel.  If
+	 * another one is already in progress, queue for later
+	 * processing.
+	 */
 	if (q->flush_seq) {
-		/*
-		 * Sequenced flush is already in progress and they
-		 * can't be processed in parallel.  Queue for later
-		 * processing.
-		 */
 		list_move_tail(&rq->queuelist, &q->pending_flushes);
 		return NULL;
 	}
 
-	if (unlikely(q->next_ordered == QUEUE_ORDERED_NONE)) {
-		/*
-		 * Queue ordering not supported.  Terminate
-		 * with prejudice.
-		 */
-		blk_dequeue_request(rq);
-		__blk_end_request_all(rq, -EOPNOTSUPP);
-		return NULL;
-	}
-
 	/*
 	 * Start a new flush sequence
 	 */
 	q->flush_err = 0;
-	q->ordered = q->next_ordered;
 	q->flush_seq |= QUEUE_FSEQ_STARTED;
 
-	/*
-	 * For an empty barrier, there's no actual BAR request, which
-	 * in turn makes POSTFLUSH unnecessary.  Mask them off.
-	 */
-	if (!blk_rq_sectors(rq))
-		q->ordered &= ~(QUEUE_ORDERED_DO_BAR |
-				QUEUE_ORDERED_DO_POSTFLUSH);
-
-	/* stash away the original request */
+	/* adjust FLUSH/FUA of the original request and stash it away */
+	rq->cmd_flags &= ~REQ_FLUSH;
+	if (!has_fua)
+		rq->cmd_flags &= ~REQ_FUA;
 	blk_dequeue_request(rq);
 	q->orig_flush_rq = rq;
 
-	if (!(q->ordered & QUEUE_ORDERED_DO_PREFLUSH))
+	/* skip unneded sequences and return the first one */
+	if (!do_preflush)
 		skip |= QUEUE_FSEQ_PREFLUSH;
-
-	if (!(q->ordered & QUEUE_ORDERED_DO_BAR))
+	if (!blk_rq_sectors(rq))
 		skip |= QUEUE_FSEQ_DATA;
-
-	if (!(q->ordered & QUEUE_ORDERED_DO_POSTFLUSH))
+	if (!do_postflush)
 		skip |= QUEUE_FSEQ_POSTFLUSH;
-
-	/* complete skipped sequences and return the first sequence */
 	return blk_flush_complete_seq(q, skip, 0);
 }
 
diff --git a/block/blk.h b/block/blk.h
index 24b92bd78f37..a09c18b19116 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -60,6 +60,9 @@ static inline struct request *__elv_next_request(struct request_queue *q)
 	while (1) {
 		while (!list_empty(&q->queue_head)) {
 			rq = list_entry_rq(q->queue_head.next);
+			if (!(rq->cmd_flags & (REQ_FLUSH | REQ_FUA)) ||
+			    rq == &q->flush_rq)
+				return rq;
 			rq = blk_do_flush(q, rq);
 			if (rq)
 				return rq;
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 9192282b4259..179799479e6f 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -167,7 +167,7 @@ enum rq_flag_bits {
 	(REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER)
 #define REQ_COMMON_MASK \
 	(REQ_WRITE | REQ_FAILFAST_MASK | REQ_HARDBARRIER | REQ_SYNC | \
-	 REQ_META| REQ_DISCARD | REQ_NOIDLE)
+	 REQ_META | REQ_DISCARD | REQ_NOIDLE | REQ_FLUSH | REQ_FUA)
 
 #define REQ_UNPLUG		(1 << __REQ_UNPLUG)
 #define REQ_RAHEAD		(1 << __REQ_RAHEAD)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 1cd83ec077db..8ef705f800ab 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -357,7 +357,6 @@ struct request_queue
 	/*
 	 * for flush operations
 	 */
-	unsigned int		ordered, next_ordered;
 	unsigned int		flush_flags;
 	unsigned int		flush_seq;
 	int			flush_err;
@@ -465,40 +464,6 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q)
 	__clear_bit(flag, &q->queue_flags);
 }
 
-enum {
-	/*
-	 * Hardbarrier is supported with one of the following methods.
-	 *
-	 * NONE		: hardbarrier unsupported
-	 * DRAIN	: ordering by draining is enough
-	 * DRAIN_FLUSH	: ordering by draining w/ pre and post flushes
-	 * DRAIN_FUA	: ordering by draining w/ pre flush and FUA write
-	 */
-	QUEUE_ORDERED_DO_PREFLUSH	= 0x10,
-	QUEUE_ORDERED_DO_BAR		= 0x20,
-	QUEUE_ORDERED_DO_POSTFLUSH	= 0x40,
-	QUEUE_ORDERED_DO_FUA		= 0x80,
-
-	QUEUE_ORDERED_NONE		= 0x00,
-
-	QUEUE_ORDERED_DRAIN		= QUEUE_ORDERED_DO_BAR,
-	QUEUE_ORDERED_DRAIN_FLUSH	= QUEUE_ORDERED_DRAIN |
-					  QUEUE_ORDERED_DO_PREFLUSH |
-					  QUEUE_ORDERED_DO_POSTFLUSH,
-	QUEUE_ORDERED_DRAIN_FUA		= QUEUE_ORDERED_DRAIN |
-					  QUEUE_ORDERED_DO_PREFLUSH |
-					  QUEUE_ORDERED_DO_FUA,
-
-	/*
-	 * FLUSH/FUA sequences.
-	 */
-	QUEUE_FSEQ_STARTED	= (1 << 0), /* flushing in progress */
-	QUEUE_FSEQ_PREFLUSH	= (1 << 1), /* pre-flushing in progress */
-	QUEUE_FSEQ_DATA		= (1 << 2), /* data write in progress */
-	QUEUE_FSEQ_POSTFLUSH	= (1 << 3), /* post-flushing in progress */
-	QUEUE_FSEQ_DONE		= (1 << 4),
-};
-
 #define blk_queue_plugged(q)	test_bit(QUEUE_FLAG_PLUGGED, &(q)->queue_flags)
 #define blk_queue_tagged(q)	test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags)
 #define blk_queue_stopped(q)	test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags)
@@ -578,7 +543,8 @@ static inline void blk_clear_queue_full(struct request_queue *q, int sync)
  * it already be started by driver.
  */
 #define RQ_NOMERGE_FLAGS	\
-	(REQ_NOMERGE | REQ_STARTED | REQ_HARDBARRIER | REQ_SOFTBARRIER)
+	(REQ_NOMERGE | REQ_STARTED | REQ_HARDBARRIER | REQ_SOFTBARRIER | \
+	 REQ_FLUSH | REQ_FUA)
 #define rq_mergeable(rq)	\
 	(!((rq)->cmd_flags & RQ_NOMERGE_FLAGS) && \
 	 (((rq)->cmd_flags & REQ_DISCARD) || \
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index ec94c12f21da..fc999f583fda 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -32,7 +32,7 @@ enum bh_state_bits {
 	BH_Delay,	/* Buffer is not yet allocated on disk */
 	BH_Boundary,	/* Block is followed by a discontiguity */
 	BH_Write_EIO,	/* I/O error on write */
-	BH_Eopnotsupp,	/* operation not supported (barrier) */
+	BH_Eopnotsupp,	/* DEPRECATED: operation not supported (barrier) */
 	BH_Unwritten,	/* Buffer is allocated on disk but not written */
 	BH_Quiet,	/* Buffer Error Prinks to be quiet */
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 76041b614758..352c48627381 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -135,12 +135,13 @@ struct inodes_stat_t {
  *			immediately after submission. The write equivalent
  *			of READ_SYNC.
  * WRITE_ODIRECT_PLUG	Special case write for O_DIRECT only.
- * WRITE_BARRIER	Like WRITE_SYNC, but tells the block layer that all
- *			previously submitted writes must be safely on storage
- *			before this one is started. Also guarantees that when
- *			this write is complete, it itself is also safely on
- *			storage. Prevents reordering of writes on both sides
- *			of this IO.
+ * WRITE_BARRIER	DEPRECATED. Always fails. Use FLUSH/FUA instead.
+ * WRITE_FLUSH		Like WRITE_SYNC but with preceding cache flush.
+ * WRITE_FUA		Like WRITE_SYNC but data is guaranteed to be on
+ *			non-volatile media on completion.
+ * WRITE_FLUSH_FUA	Combination of WRITE_FLUSH and FUA. The IO is preceded
+ *			by a cache flush and data is guaranteed to be on
+ *			non-volatile media on completion.
  *
  */
 #define RW_MASK			REQ_WRITE
@@ -158,6 +159,12 @@ struct inodes_stat_t {
 #define WRITE_META		(WRITE | REQ_META)
 #define WRITE_BARRIER		(WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG | \
 				 REQ_HARDBARRIER)
+#define WRITE_FLUSH		(WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG | \
+				 REQ_FLUSH)
+#define WRITE_FUA		(WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG | \
+				 REQ_FUA)
+#define WRITE_FLUSH_FUA		(WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG | \
+				 REQ_FLUSH | REQ_FUA)
 
 /*
  * These aren't really reads or writes, they pass down information about
-- 
cgit v1.2.3


From 1e87901e189c8f01750d67485009fe3827c691bf Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 3 Sep 2010 11:56:17 +0200
Subject: block: filter flush bio's in __generic_make_request()

There are a number of make_request based drivers which don't support
cache flushes.  Filter out flush bio's in __generic_make_request() so
that they don't have to worry about them.  All FLUSH/FUA requests with
data are converted to regular IO requests and empty ones are completed
immediately.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-core.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/block/blk-core.c b/block/blk-core.c
index 18455c4f618a..495bdc4a23da 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1509,6 +1509,19 @@ static inline void __generic_make_request(struct bio *bio)
 		if (bio_check_eod(bio, nr_sectors))
 			goto end_io;
 
+		/*
+		 * Filter flush bio's early so that make_request based
+		 * drivers without flush support don't have to worry
+		 * about them.
+		 */
+		if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) {
+			bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA);
+			if (!nr_sectors) {
+				err = 0;
+				goto end_io;
+			}
+		}
+
 		if ((bio->bi_rw & REQ_DISCARD) &&
 		    (!blk_queue_discard(q) ||
 		     ((bio->bi_rw & REQ_SECURE) &&
-- 
cgit v1.2.3


From cde4c406d8fb051c5aafc917643adbb9dbd0abc2 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 3 Sep 2010 11:56:17 +0200
Subject: block: simplify queue_next_fseq

We need to call blk_rq_init and elv_insert for all cases in queue_next_fseq,
so take these calls into common code.  Also move the end_io initialization
from queue_flush into queue_next_fseq and rename queue_flush to
init_flush_request now that it's old name doesn't apply anymore.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-flush.c | 26 ++++++++++----------------
 1 file changed, 10 insertions(+), 16 deletions(-)

diff --git a/block/blk-flush.c b/block/blk-flush.c
index 452c552e9ead..72905f862d31 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -74,16 +74,11 @@ static void post_flush_end_io(struct request *rq, int error)
 	blk_flush_complete_seq(rq->q, QUEUE_FSEQ_POSTFLUSH, error);
 }
 
-static void queue_flush(struct request_queue *q, struct request *rq,
-			rq_end_io_fn *end_io)
+static void init_flush_request(struct request *rq, struct gendisk *disk)
 {
-	blk_rq_init(q, rq);
 	rq->cmd_type = REQ_TYPE_FS;
 	rq->cmd_flags = REQ_FLUSH;
-	rq->rq_disk = q->orig_flush_rq->rq_disk;
-	rq->end_io = end_io;
-
-	elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
+	rq->rq_disk = disk;
 }
 
 static struct request *queue_next_fseq(struct request_queue *q)
@@ -91,29 +86,28 @@ static struct request *queue_next_fseq(struct request_queue *q)
 	struct request *orig_rq = q->orig_flush_rq;
 	struct request *rq = &q->flush_rq;
 
+	blk_rq_init(q, rq);
+
 	switch (blk_flush_cur_seq(q)) {
 	case QUEUE_FSEQ_PREFLUSH:
-		queue_flush(q, rq, pre_flush_end_io);
+		init_flush_request(rq, orig_rq->rq_disk);
+		rq->end_io = pre_flush_end_io;
 		break;
-
 	case QUEUE_FSEQ_DATA:
-		/* initialize proxy request, inherit FLUSH/FUA and queue it */
-		blk_rq_init(q, rq);
 		init_request_from_bio(rq, orig_rq->bio);
 		rq->cmd_flags &= ~(REQ_FLUSH | REQ_FUA);
 		rq->cmd_flags |= orig_rq->cmd_flags & (REQ_FLUSH | REQ_FUA);
 		rq->end_io = flush_data_end_io;
-
-		elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
 		break;
-
 	case QUEUE_FSEQ_POSTFLUSH:
-		queue_flush(q, rq, post_flush_end_io);
+		init_flush_request(rq, orig_rq->rq_disk);
+		rq->end_io = post_flush_end_io;
 		break;
-
 	default:
 		BUG();
 	}
+
+	elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
 	return rq;
 }
 
-- 
cgit v1.2.3


From 337238be1bf52e1242f940fc6fe83fb395e55057 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 3 Sep 2010 11:56:17 +0200
Subject: block: initialize flush request with WRITE_FLUSH instead of REQ_FLUSH

init_flush_request() only set REQ_FLUSH when initializing flush
requests making them READ requests.  Use WRITE_FLUSH instead.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-flush.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/blk-flush.c b/block/blk-flush.c
index 72905f862d31..f357f1fc411c 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -77,7 +77,7 @@ static void post_flush_end_io(struct request *rq, int error)
 static void init_flush_request(struct request *rq, struct gendisk *disk)
 {
 	rq->cmd_type = REQ_TYPE_FS;
-	rq->cmd_flags = REQ_FLUSH;
+	rq->cmd_flags = WRITE_FLUSH;
 	rq->rq_disk = disk;
 }
 
-- 
cgit v1.2.3


From 47f70d5a6ca78c40a1c799d43506efbfed914f7b Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 3 Sep 2010 11:56:17 +0200
Subject: block: kick queue after sequencing REQ_FLUSH/FUA

While completing a request from a REQ_FLUSH/FUA sequence, another
request can be pushed to the request queue.  If a driver tests
elv_queue_empty() before completing a request and runs the queue again
only if the queue wasn't empty, this may lead to hang.  Please note
that most drivers either kick the queue unconditionally or test queue
emptiness after completing the current request and don't have this
problem.

This patch removes this possibility by making REQ_FLUSH/FUA sequence
code kick the queue if the queue was empty before completing a request
from REQ_FLUSH/FUA sequence.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-flush.c | 22 +++++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

diff --git a/block/blk-flush.c b/block/blk-flush.c
index f357f1fc411c..cb4c8440a1fc 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -56,22 +56,38 @@ static struct request *blk_flush_complete_seq(struct request_queue *q,
 	return next_rq;
 }
 
+static void blk_flush_complete_seq_end_io(struct request_queue *q,
+					  unsigned seq, int error)
+{
+	bool was_empty = elv_queue_empty(q);
+	struct request *next_rq;
+
+	next_rq = blk_flush_complete_seq(q, seq, error);
+
+	/*
+	 * Moving a request silently to empty queue_head may stall the
+	 * queue.  Kick the queue in those cases.
+	 */
+	if (was_empty && next_rq)
+		__blk_run_queue(q);
+}
+
 static void pre_flush_end_io(struct request *rq, int error)
 {
 	elv_completed_request(rq->q, rq);
-	blk_flush_complete_seq(rq->q, QUEUE_FSEQ_PREFLUSH, error);
+	blk_flush_complete_seq_end_io(rq->q, QUEUE_FSEQ_PREFLUSH, error);
 }
 
 static void flush_data_end_io(struct request *rq, int error)
 {
 	elv_completed_request(rq->q, rq);
-	blk_flush_complete_seq(rq->q, QUEUE_FSEQ_DATA, error);
+	blk_flush_complete_seq_end_io(rq->q, QUEUE_FSEQ_DATA, error);
 }
 
 static void post_flush_end_io(struct request *rq, int error)
 {
 	elv_completed_request(rq->q, rq);
-	blk_flush_complete_seq(rq->q, QUEUE_FSEQ_POSTFLUSH, error);
+	blk_flush_complete_seq_end_io(rq->q, QUEUE_FSEQ_POSTFLUSH, error);
 }
 
 static void init_flush_request(struct request *rq, struct gendisk *disk)
-- 
cgit v1.2.3


From 09d60c701b64b509f328cac72970eb894f485b9e Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 3 Sep 2010 11:56:17 +0200
Subject: block: make sure FSEQ_DATA request has the same rq_disk as the
 original

rq->rq_disk and bio->bi_bdev->bd_disk may differ if a request has
passed through remapping drivers.  FSEQ_DATA request incorrectly
followed bio->bi_bdev->bd_disk ending up being issued w/ mismatching
rq_disk.  Make it follow orig_rq->rq_disk.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Tested-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-flush.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/block/blk-flush.c b/block/blk-flush.c
index cb4c8440a1fc..7d1fc982e78f 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -111,6 +111,13 @@ static struct request *queue_next_fseq(struct request_queue *q)
 		break;
 	case QUEUE_FSEQ_DATA:
 		init_request_from_bio(rq, orig_rq->bio);
+		/*
+		 * orig_rq->rq_disk may be different from
+		 * bio->bi_bdev->bd_disk if orig_rq got here through
+		 * remapping drivers.  Make sure rq->rq_disk points
+		 * to the same one as orig_rq.
+		 */
+		rq->rq_disk = orig_rq->rq_disk;
 		rq->cmd_flags &= ~(REQ_FLUSH | REQ_FUA);
 		rq->cmd_flags |= orig_rq->cmd_flags & (REQ_FLUSH | REQ_FUA);
 		rq->end_io = flush_data_end_io;
-- 
cgit v1.2.3


From 04ccc65cd1f57aee861708e08cd2272c5a0d088c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 3 Sep 2010 11:56:17 +0200
Subject: block: update documentation for REQ_FLUSH / REQ_FUA

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 Documentation/block/00-INDEX                    |   4 +-
 Documentation/block/barrier.txt                 | 261 ------------------------
 Documentation/block/writeback_cache_control.txt |  86 ++++++++
 3 files changed, 88 insertions(+), 263 deletions(-)
 delete mode 100644 Documentation/block/barrier.txt
 create mode 100644 Documentation/block/writeback_cache_control.txt

diff --git a/Documentation/block/00-INDEX b/Documentation/block/00-INDEX
index a406286f6f3e..d111e3b23db0 100644
--- a/Documentation/block/00-INDEX
+++ b/Documentation/block/00-INDEX
@@ -1,7 +1,5 @@
 00-INDEX
 	- This file
-barrier.txt
-	- I/O Barriers
 biodoc.txt
 	- Notes on the Generic Block Layer Rewrite in Linux 2.5
 capability.txt
@@ -16,3 +14,5 @@ stat.txt
 	- Block layer statistics in /sys/block/<dev>/stat
 switching-sched.txt
 	- Switching I/O schedulers at runtime
+writeback_cache_control.txt
+	- Control of volatile write back caches
diff --git a/Documentation/block/barrier.txt b/Documentation/block/barrier.txt
deleted file mode 100644
index 2c2f24f634e4..000000000000
--- a/Documentation/block/barrier.txt
+++ /dev/null
@@ -1,261 +0,0 @@
-I/O Barriers
-============
-Tejun Heo <htejun@gmail.com>, July 22 2005
-
-I/O barrier requests are used to guarantee ordering around the barrier
-requests.  Unless you're crazy enough to use disk drives for
-implementing synchronization constructs (wow, sounds interesting...),
-the ordering is meaningful only for write requests for things like
-journal checkpoints.  All requests queued before a barrier request
-must be finished (made it to the physical medium) before the barrier
-request is started, and all requests queued after the barrier request
-must be started only after the barrier request is finished (again,
-made it to the physical medium).
-
-In other words, I/O barrier requests have the following two properties.
-
-1. Request ordering
-
-Requests cannot pass the barrier request.  Preceding requests are
-processed before the barrier and following requests after.
-
-Depending on what features a drive supports, this can be done in one
-of the following three ways.
-
-i. For devices which have queue depth greater than 1 (TCQ devices) and
-support ordered tags, block layer can just issue the barrier as an
-ordered request and the lower level driver, controller and drive
-itself are responsible for making sure that the ordering constraint is
-met.  Most modern SCSI controllers/drives should support this.
-
-NOTE: SCSI ordered tag isn't currently used due to limitation in the
-      SCSI midlayer, see the following random notes section.
-
-ii. For devices which have queue depth greater than 1 but don't
-support ordered tags, block layer ensures that the requests preceding
-a barrier request finishes before issuing the barrier request.  Also,
-it defers requests following the barrier until the barrier request is
-finished.  Older SCSI controllers/drives and SATA drives fall in this
-category.
-
-iii. Devices which have queue depth of 1.  This is a degenerate case
-of ii.  Just keeping issue order suffices.  Ancient SCSI
-controllers/drives and IDE drives are in this category.
-
-2. Forced flushing to physical medium
-
-Again, if you're not gonna do synchronization with disk drives (dang,
-it sounds even more appealing now!), the reason you use I/O barriers
-is mainly to protect filesystem integrity when power failure or some
-other events abruptly stop the drive from operating and possibly make
-the drive lose data in its cache.  So, I/O barriers need to guarantee
-that requests actually get written to non-volatile medium in order.
-
-There are four cases,
-
-i. No write-back cache.  Keeping requests ordered is enough.
-
-ii. Write-back cache but no flush operation.  There's no way to
-guarantee physical-medium commit order.  This kind of devices can't to
-I/O barriers.
-
-iii. Write-back cache and flush operation but no FUA (forced unit
-access).  We need two cache flushes - before and after the barrier
-request.
-
-iv. Write-back cache, flush operation and FUA.  We still need one
-flush to make sure requests preceding a barrier are written to medium,
-but post-barrier flush can be avoided by using FUA write on the
-barrier itself.
-
-
-How to support barrier requests in drivers
-------------------------------------------
-
-All barrier handling is done inside block layer proper.  All low level
-drivers have to are implementing its prepare_flush_fn and using one
-the following two functions to indicate what barrier type it supports
-and how to prepare flush requests.  Note that the term 'ordered' is
-used to indicate the whole sequence of performing barrier requests
-including draining and flushing.
-
-typedef void (prepare_flush_fn)(struct request_queue *q, struct request *rq);
-
-int blk_queue_ordered(struct request_queue *q, unsigned ordered,
-		      prepare_flush_fn *prepare_flush_fn);
-
-@q			: the queue in question
-@ordered		: the ordered mode the driver/device supports
-@prepare_flush_fn	: this function should prepare @rq such that it
-			  flushes cache to physical medium when executed
-
-For example, SCSI disk driver's prepare_flush_fn looks like the
-following.
-
-static void sd_prepare_flush(struct request_queue *q, struct request *rq)
-{
-	memset(rq->cmd, 0, sizeof(rq->cmd));
-	rq->cmd_type = REQ_TYPE_BLOCK_PC;
-	rq->timeout = SD_TIMEOUT;
-	rq->cmd[0] = SYNCHRONIZE_CACHE;
-	rq->cmd_len = 10;
-}
-
-The following seven ordered modes are supported.  The following table
-shows which mode should be used depending on what features a
-device/driver supports.  In the leftmost column of table,
-QUEUE_ORDERED_ prefix is omitted from the mode names to save space.
-
-The table is followed by description of each mode.  Note that in the
-descriptions of QUEUE_ORDERED_DRAIN*, '=>' is used whereas '->' is
-used for QUEUE_ORDERED_TAG* descriptions.  '=>' indicates that the
-preceding step must be complete before proceeding to the next step.
-'->' indicates that the next step can start as soon as the previous
-step is issued.
-
-	    write-back cache	ordered tag	flush		FUA
------------------------------------------------------------------------
-NONE		yes/no		N/A		no		N/A
-DRAIN		no		no		N/A		N/A
-DRAIN_FLUSH	yes		no		yes		no
-DRAIN_FUA	yes		no		yes		yes
-TAG		no		yes		N/A		N/A
-TAG_FLUSH	yes		yes		yes		no
-TAG_FUA		yes		yes		yes		yes
-
-
-QUEUE_ORDERED_NONE
-	I/O barriers are not needed and/or supported.
-
-	Sequence: N/A
-
-QUEUE_ORDERED_DRAIN
-	Requests are ordered by draining the request queue and cache
-	flushing isn't needed.
-
-	Sequence: drain => barrier
-
-QUEUE_ORDERED_DRAIN_FLUSH
-	Requests are ordered by draining the request queue and both
-	pre-barrier and post-barrier cache flushings are needed.
-
-	Sequence: drain => preflush => barrier => postflush
-
-QUEUE_ORDERED_DRAIN_FUA
-	Requests are ordered by draining the request queue and
-	pre-barrier cache flushing is needed.  By using FUA on barrier
-	request, post-barrier flushing can be skipped.
-
-	Sequence: drain => preflush => barrier
-
-QUEUE_ORDERED_TAG
-	Requests are ordered by ordered tag and cache flushing isn't
-	needed.
-
-	Sequence: barrier
-
-QUEUE_ORDERED_TAG_FLUSH
-	Requests are ordered by ordered tag and both pre-barrier and
-	post-barrier cache flushings are needed.
-
-	Sequence: preflush -> barrier -> postflush
-
-QUEUE_ORDERED_TAG_FUA
-	Requests are ordered by ordered tag and pre-barrier cache
-	flushing is needed.  By using FUA on barrier request,
-	post-barrier flushing can be skipped.
-
-	Sequence: preflush -> barrier
-
-
-Random notes/caveats
---------------------
-
-* SCSI layer currently can't use TAG ordering even if the drive,
-controller and driver support it.  The problem is that SCSI midlayer
-request dispatch function is not atomic.  It releases queue lock and
-switch to SCSI host lock during issue and it's possible and likely to
-happen in time that requests change their relative positions.  Once
-this problem is solved, TAG ordering can be enabled.
-
-* Currently, no matter which ordered mode is used, there can be only
-one barrier request in progress.  All I/O barriers are held off by
-block layer until the previous I/O barrier is complete.  This doesn't
-make any difference for DRAIN ordered devices, but, for TAG ordered
-devices with very high command latency, passing multiple I/O barriers
-to low level *might* be helpful if they are very frequent.  Well, this
-certainly is a non-issue.  I'm writing this just to make clear that no
-two I/O barrier is ever passed to low-level driver.
-
-* Completion order.  Requests in ordered sequence are issued in order
-but not required to finish in order.  Barrier implementation can
-handle out-of-order completion of ordered sequence.  IOW, the requests
-MUST be processed in order but the hardware/software completion paths
-are allowed to reorder completion notifications - eg. current SCSI
-midlayer doesn't preserve completion order during error handling.
-
-* Requeueing order.  Low-level drivers are free to requeue any request
-after they removed it from the request queue with
-blkdev_dequeue_request().  As barrier sequence should be kept in order
-when requeued, generic elevator code takes care of putting requests in
-order around barrier.  See blk_ordered_req_seq() and
-ELEVATOR_INSERT_REQUEUE handling in __elv_add_request() for details.
-
-Note that block drivers must not requeue preceding requests while
-completing latter requests in an ordered sequence.  Currently, no
-error checking is done against this.
-
-* Error handling.  Currently, block layer will report error to upper
-layer if any of requests in an ordered sequence fails.  Unfortunately,
-this doesn't seem to be enough.  Look at the following request flow.
-QUEUE_ORDERED_TAG_FLUSH is in use.
-
- [0] [1] [2] [3] [pre] [barrier] [post] < [4] [5] [6] ... >
-					  still in elevator
-
-Let's say request [2], [3] are write requests to update file system
-metadata (journal or whatever) and [barrier] is used to mark that
-those updates are valid.  Consider the following sequence.
-
- i.	Requests [0] ~ [post] leaves the request queue and enters
-	low-level driver.
- ii.	After a while, unfortunately, something goes wrong and the
-	drive fails [2].  Note that any of [0], [1] and [3] could have
-	completed by this time, but [pre] couldn't have been finished
-	as the drive must process it in order and it failed before
-	processing that command.
- iii.	Error handling kicks in and determines that the error is
-	unrecoverable and fails [2], and resumes operation.
- iv.	[pre] [barrier] [post] gets processed.
- v.	*BOOM* power fails
-
-The problem here is that the barrier request is *supposed* to indicate
-that filesystem update requests [2] and [3] made it safely to the
-physical medium and, if the machine crashes after the barrier is
-written, filesystem recovery code can depend on that.  Sadly, that
-isn't true in this case anymore.  IOW, the success of a I/O barrier
-should also be dependent on success of some of the preceding requests,
-where only upper layer (filesystem) knows what 'some' is.
-
-This can be solved by implementing a way to tell the block layer which
-requests affect the success of the following barrier request and
-making lower lever drivers to resume operation on error only after
-block layer tells it to do so.
-
-As the probability of this happening is very low and the drive should
-be faulty, implementing the fix is probably an overkill.  But, still,
-it's there.
-
-* In previous drafts of barrier implementation, there was fallback
-mechanism such that, if FUA or ordered TAG fails, less fancy ordered
-mode can be selected and the failed barrier request is retried
-automatically.  The rationale for this feature was that as FUA is
-pretty new in ATA world and ordered tag was never used widely, there
-could be devices which report to support those features but choke when
-actually given such requests.
-
- This was removed for two reasons 1. it's an overkill 2. it's
-impossible to implement properly when TAG ordering is used as low
-level drivers resume after an error automatically.  If it's ever
-needed adding it back and modifying low level drivers accordingly
-shouldn't be difficult.
diff --git a/Documentation/block/writeback_cache_control.txt b/Documentation/block/writeback_cache_control.txt
new file mode 100644
index 000000000000..83407d36630a
--- /dev/null
+++ b/Documentation/block/writeback_cache_control.txt
@@ -0,0 +1,86 @@
+
+Explicit volatile write back cache control
+=====================================
+
+Introduction
+------------
+
+Many storage devices, especially in the consumer market, come with volatile
+write back caches.  That means the devices signal I/O completion to the
+operating system before data actually has hit the non-volatile storage.  This
+behavior obviously speeds up various workloads, but it means the operating
+system needs to force data out to the non-volatile storage when it performs
+a data integrity operation like fsync, sync or an unmount.
+
+The Linux block layer provides two simple mechanisms that let filesystems
+control the caching behavior of the storage device.  These mechanisms are
+a forced cache flush, and the Force Unit Access (FUA) flag for requests.
+
+
+Explicit cache flushes
+----------------------
+
+The REQ_FLUSH flag can be OR ed into the r/w flags of a bio submitted from
+the filesystem and will make sure the volatile cache of the storage device
+has been flushed before the actual I/O operation is started.  This explicitly
+guarantees that previously completed write requests are on non-volatile
+storage before the flagged bio starts. In addition the REQ_FLUSH flag can be
+set on an otherwise empty bio structure, which causes only an explicit cache
+flush without any dependent I/O.  It is recommend to use
+the blkdev_issue_flush() helper for a pure cache flush.
+
+
+Forced Unit Access
+-----------------
+
+The REQ_FUA flag can be OR ed into the r/w flags of a bio submitted from the
+filesystem and will make sure that I/O completion for this request is only
+signaled after the data has been committed to non-volatile storage.
+
+
+Implementation details for filesystems
+--------------------------------------
+
+Filesystems can simply set the REQ_FLUSH and REQ_FUA bits and do not have to
+worry if the underlying devices need any explicit cache flushing and how
+the Forced Unit Access is implemented.  The REQ_FLUSH and REQ_FUA flags
+may both be set on a single bio.
+
+
+Implementation details for make_request_fn based block drivers
+--------------------------------------------------------------
+
+These drivers will always see the REQ_FLUSH and REQ_FUA bits as they sit
+directly below the submit_bio interface.  For remapping drivers the REQ_FUA
+bits need to be propagated to underlying devices, and a global flush needs
+to be implemented for bios with the REQ_FLUSH bit set.  For real device
+drivers that do not have a volatile cache the REQ_FLUSH and REQ_FUA bits
+on non-empty bios can simply be ignored, and REQ_FLUSH requests without
+data can be completed successfully without doing any work.  Drivers for
+devices with volatile caches need to implement the support for these
+flags themselves without any help from the block layer.
+
+
+Implementation details for request_fn based block drivers
+--------------------------------------------------------------
+
+For devices that do not support volatile write caches there is no driver
+support required, the block layer completes empty REQ_FLUSH requests before
+entering the driver and strips off the REQ_FLUSH and REQ_FUA bits from
+requests that have a payload.  For devices with volatile write caches the
+driver needs to tell the block layer that it supports flushing caches by
+doing:
+
+	blk_queue_flush(sdkp->disk->queue, REQ_FLUSH);
+
+and handle empty REQ_FLUSH requests in its prep_fn/request_fn.  Note that
+REQ_FLUSH requests with a payload are automatically turned into a sequence
+of an empty REQ_FLUSH request followed by the actual write by the block
+layer.  For devices that also support the FUA bit the block layer needs
+to be told to pass through the REQ_FUA bit using:
+
+	blk_queue_flush(sdkp->disk->queue, REQ_FLUSH | REQ_FUA);
+
+and the driver must handle write requests that have the REQ_FUA bit set
+in prep_fn/request_fn.  If the FUA bit is not natively supported the block
+layer turns it into an empty REQ_FLUSH request after the actual write.
-- 
cgit v1.2.3


From d391a2dda2f1c993f094bdb3a8a342c5e0546553 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 3 Sep 2010 11:56:17 +0200
Subject: block: use REQ_FLUSH in blkdev_issue_flush()

Update blkdev_issue_flush() to use new REQ_FLUSH interface.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-flush.c | 17 ++++++-----------
 1 file changed, 6 insertions(+), 11 deletions(-)

diff --git a/block/blk-flush.c b/block/blk-flush.c
index 7d1fc982e78f..62b7df9bca9d 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -191,13 +191,10 @@ struct request *blk_do_flush(struct request_queue *q, struct request *rq)
 	return blk_flush_complete_seq(q, skip, 0);
 }
 
-static void bio_end_empty_barrier(struct bio *bio, int err)
+static void bio_end_flush(struct bio *bio, int err)
 {
-	if (err) {
-		if (err == -EOPNOTSUPP)
-			set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
+	if (err)
 		clear_bit(BIO_UPTODATE, &bio->bi_flags);
-	}
 	if (bio->bi_private)
 		complete(bio->bi_private);
 	bio_put(bio);
@@ -235,19 +232,19 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
 	 * some block devices may not have their queue correctly set up here
 	 * (e.g. loop device without a backing file) and so issuing a flush
 	 * here will panic. Ensure there is a request function before issuing
-	 * the barrier.
+	 * the flush.
 	 */
 	if (!q->make_request_fn)
 		return -ENXIO;
 
 	bio = bio_alloc(gfp_mask, 0);
-	bio->bi_end_io = bio_end_empty_barrier;
+	bio->bi_end_io = bio_end_flush;
 	bio->bi_bdev = bdev;
 	if (test_bit(BLKDEV_WAIT, &flags))
 		bio->bi_private = &wait;
 
 	bio_get(bio);
-	submit_bio(WRITE_BARRIER, bio);
+	submit_bio(WRITE_FLUSH, bio);
 	if (test_bit(BLKDEV_WAIT, &flags)) {
 		wait_for_completion(&wait);
 		/*
@@ -259,9 +256,7 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
 			*error_sector = bio->bi_sector;
 	}
 
-	if (bio_flagged(bio, BIO_EOPNOTSUPP))
-		ret = -EOPNOTSUPP;
-	else if (!bio_flagged(bio, BIO_UPTODATE))
+	if (!bio_flagged(bio, BIO_UPTODATE))
 		ret = -EIO;
 
 	bio_put(bio);
-- 
cgit v1.2.3


From 6259f28459a9de820f47a9ece4ffa22d4596a9af Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 3 Sep 2010 11:56:17 +0200
Subject: block/loop: implement REQ_FLUSH/FUA support

Deprecate REQ_HARDBARRIER and implement REQ_FLUSH/FUA instead.  Also,
instead of checking file->f_op->fsync() directly, look at the value of
vfs_fsync() and ignore -EINVAL return.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 drivers/block/loop.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 953d1e12f4d4..5d27bc6596de 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -477,17 +477,17 @@ static int do_bio_filebacked(struct loop_device *lo, struct bio *bio)
 	pos = ((loff_t) bio->bi_sector << 9) + lo->lo_offset;
 
 	if (bio_rw(bio) == WRITE) {
-		bool barrier = (bio->bi_rw & REQ_HARDBARRIER);
 		struct file *file = lo->lo_backing_file;
 
-		if (barrier) {
-			if (unlikely(!file->f_op->fsync)) {
-				ret = -EOPNOTSUPP;
-				goto out;
-			}
+		/* REQ_HARDBARRIER is deprecated */
+		if (bio->bi_rw & REQ_HARDBARRIER) {
+			ret = -EOPNOTSUPP;
+			goto out;
+		}
 
+		if (bio->bi_rw & REQ_FLUSH) {
 			ret = vfs_fsync(file, 0);
-			if (unlikely(ret)) {
+			if (unlikely(ret && ret != -EINVAL)) {
 				ret = -EIO;
 				goto out;
 			}
@@ -495,9 +495,9 @@ static int do_bio_filebacked(struct loop_device *lo, struct bio *bio)
 
 		ret = lo_send(lo, bio, pos);
 
-		if (barrier && !ret) {
+		if ((bio->bi_rw & REQ_FUA) && !ret) {
 			ret = vfs_fsync(file, 0);
-			if (unlikely(ret))
+			if (unlikely(ret && ret != -EINVAL))
 				ret = -EIO;
 		}
 	} else
-- 
cgit v1.2.3


From 02c42b7a68695c2c3e3fecf9f6b9a4ea43abe52f Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 3 Sep 2010 11:56:18 +0200
Subject: virtio_blk: drop REQ_HARDBARRIER support

Remove now unused REQ_HARDBARRIER support.  virtio_blk already
supports REQ_FLUSH and the usefulness of REQ_FUA for virtio_blk is
questionable at this point, so there's nothing else to do to support
new REQ_FLUSH/FUA interface.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 drivers/block/virtio_blk.c | 17 ++++-------------
 1 file changed, 4 insertions(+), 13 deletions(-)

diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index d10b635b3946..126062802cb7 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -128,9 +128,6 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
 		}
 	}
 
-	if (vbr->req->cmd_flags & REQ_HARDBARRIER)
-		vbr->out_hdr.type |= VIRTIO_BLK_T_BARRIER;
-
 	sg_set_buf(&vblk->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr));
 
 	/*
@@ -388,13 +385,7 @@ static int __devinit virtblk_probe(struct virtio_device *vdev)
 	vblk->disk->driverfs_dev = &vdev->dev;
 	index++;
 
-	/*
-	 * If the FLUSH feature is supported we do have support for
-	 * flushing a volatile write cache on the host.  Use that to
-	 * implement write barrier support; otherwise, we must assume
-	 * that the host does not perform any kind of volatile write
-	 * caching.
-	 */
+	/* configure queue flush support */
 	if (virtio_has_feature(vdev, VIRTIO_BLK_F_FLUSH))
 		blk_queue_flush(q, REQ_FLUSH);
 
@@ -515,9 +506,9 @@ static const struct virtio_device_id id_table[] = {
 };
 
 static unsigned int features[] = {
-	VIRTIO_BLK_F_BARRIER, VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX,
-	VIRTIO_BLK_F_GEOMETRY, VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE,
-	VIRTIO_BLK_F_SCSI, VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_TOPOLOGY
+	VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_GEOMETRY,
+	VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE, VIRTIO_BLK_F_SCSI,
+	VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_TOPOLOGY
 };
 
 /*
-- 
cgit v1.2.3


From 7bc9fddab074d6bb630344e1969e28d20b140621 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 3 Sep 2010 11:56:18 +0200
Subject: lguest: replace VIRTIO_F_BARRIER support with VIRTIO_F_FLUSH support

VIRTIO_F_BARRIER is deprecated.  Replace it with VIRTIO_F_FLUSH
support.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 Documentation/lguest/lguest.c | 29 +++++++++--------------------
 1 file changed, 9 insertions(+), 20 deletions(-)

diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c
index e9ce3c554514..fbc64b331436 100644
--- a/Documentation/lguest/lguest.c
+++ b/Documentation/lguest/lguest.c
@@ -1638,15 +1638,6 @@ static void blk_request(struct virtqueue *vq)
 	 */
 	off = out->sector * 512;
 
-	/*
-	 * The block device implements "barriers", where the Guest indicates
-	 * that it wants all previous writes to occur before this write.  We
-	 * don't have a way of asking our kernel to do a barrier, so we just
-	 * synchronize all the data in the file.  Pretty poor, no?
-	 */
-	if (out->type & VIRTIO_BLK_T_BARRIER)
-		fdatasync(vblk->fd);
-
 	/*
 	 * In general the virtio block driver is allowed to try SCSI commands.
 	 * It'd be nice if we supported eject, for example, but we don't.
@@ -1679,6 +1670,13 @@ static void blk_request(struct virtqueue *vq)
 			/* Die, bad Guest, die. */
 			errx(1, "Write past end %llu+%u", off, ret);
 		}
+
+		wlen = sizeof(*in);
+		*in = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR);
+	} else if (out->type & VIRTIO_BLK_T_FLUSH) {
+		/* Flush */
+		ret = fdatasync(vblk->fd);
+		verbose("FLUSH fdatasync: %i\n", ret);
 		wlen = sizeof(*in);
 		*in = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR);
 	} else {
@@ -1702,15 +1700,6 @@ static void blk_request(struct virtqueue *vq)
 		}
 	}
 
-	/*
-	 * OK, so we noted that it was pretty poor to use an fdatasync as a
-	 * barrier.  But Christoph Hellwig points out that we need a sync
-	 * *afterwards* as well: "Barriers specify no reordering to the front
-	 * or the back."  And Jens Axboe confirmed it, so here we are:
-	 */
-	if (out->type & VIRTIO_BLK_T_BARRIER)
-		fdatasync(vblk->fd);
-
 	/* Finished that request. */
 	add_used(vq, head, wlen);
 }
@@ -1735,8 +1724,8 @@ static void setup_block_file(const char *filename)
 	vblk->fd = open_or_die(filename, O_RDWR|O_LARGEFILE);
 	vblk->len = lseek64(vblk->fd, 0, SEEK_END);
 
-	/* We support barriers. */
-	add_feature(dev, VIRTIO_BLK_F_BARRIER);
+	/* We support FLUSH. */
+	add_feature(dev, VIRTIO_BLK_F_FLUSH);
 
 	/* Tell Guest how many sectors this device has. */
 	conf.capacity = cpu_to_le64(vblk->len / 512);
-- 
cgit v1.2.3


From e9c7469bb4f502dafc092166201bea1ad5fc0fbf Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 3 Sep 2010 11:56:18 +0200
Subject: md: implment REQ_FLUSH/FUA support

This patch converts md to support REQ_FLUSH/FUA instead of now
deprecated REQ_HARDBARRIER.  In the core part (md.c), the following
changes are notable.

* Unlike REQ_HARDBARRIER, REQ_FLUSH/FUA don't interfere with
  processing of other requests and thus there is no reason to mark the
  queue congested while FLUSH/FUA is in progress.

* REQ_FLUSH/FUA failures are final and its users don't need retry
  logic.  Retry logic is removed.

* Preflush needs to be issued to all member devices but FUA writes can
  be handled the same way as other writes - their processing can be
  deferred to request_queue of member devices.  md_barrier_request()
  is renamed to md_flush_request() and simplified accordingly.

For linear, raid0 and multipath, the core changes are enough.  raid1,
5 and 10 need the following conversions.

* raid1: Handling of FLUSH/FUA bio's can simply be deferred to
  request_queues of member devices.  Barrier related logic removed.

* raid5: Queue draining logic dropped.  FUA bit is propagated through
  biodrain and stripe resconstruction such that all the updated parts
  of the stripe are written out with FUA writes if any of the dirtying
  writes was FUA.  preread_active_stripes handling in make_request()
  is updated as suggested by Neil Brown.

* raid10: FUA bit needs to be propagated to write clones.

linear, raid0, 1, 5 and 10 tested.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Neil Brown <neilb@suse.de>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 drivers/md/linear.c    |   4 +-
 drivers/md/md.c        | 117 +++++++-------------------------
 drivers/md/md.h        |  23 ++-----
 drivers/md/multipath.c |   4 +-
 drivers/md/raid0.c     |   4 +-
 drivers/md/raid1.c     | 176 +++++++++++++++++--------------------------------
 drivers/md/raid1.h     |   2 -
 drivers/md/raid10.c    |   7 +-
 drivers/md/raid5.c     |  43 ++++++------
 drivers/md/raid5.h     |   1 +
 10 files changed, 122 insertions(+), 259 deletions(-)

diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index ba19060bcf3f..8a2f767f26d8 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -294,8 +294,8 @@ static int linear_make_request (mddev_t *mddev, struct bio *bio)
 	dev_info_t *tmp_dev;
 	sector_t start_sector;
 
-	if (unlikely(bio->bi_rw & REQ_HARDBARRIER)) {
-		md_barrier_request(mddev, bio);
+	if (unlikely(bio->bi_rw & REQ_FLUSH)) {
+		md_flush_request(mddev, bio);
 		return 0;
 	}
 
diff --git a/drivers/md/md.c b/drivers/md/md.c
index c148b6302154..3640f025cb72 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -226,12 +226,12 @@ static int md_make_request(struct request_queue *q, struct bio *bio)
 		return 0;
 	}
 	rcu_read_lock();
-	if (mddev->suspended || mddev->barrier) {
+	if (mddev->suspended) {
 		DEFINE_WAIT(__wait);
 		for (;;) {
 			prepare_to_wait(&mddev->sb_wait, &__wait,
 					TASK_UNINTERRUPTIBLE);
-			if (!mddev->suspended && !mddev->barrier)
+			if (!mddev->suspended)
 				break;
 			rcu_read_unlock();
 			schedule();
@@ -282,40 +282,29 @@ EXPORT_SYMBOL_GPL(mddev_resume);
 
 int mddev_congested(mddev_t *mddev, int bits)
 {
-	if (mddev->barrier)
-		return 1;
 	return mddev->suspended;
 }
 EXPORT_SYMBOL(mddev_congested);
 
 /*
- * Generic barrier handling for md
+ * Generic flush handling for md
  */
 
-#define POST_REQUEST_BARRIER ((void*)1)
-
-static void md_end_barrier(struct bio *bio, int err)
+static void md_end_flush(struct bio *bio, int err)
 {
 	mdk_rdev_t *rdev = bio->bi_private;
 	mddev_t *mddev = rdev->mddev;
-	if (err == -EOPNOTSUPP && mddev->barrier != POST_REQUEST_BARRIER)
-		set_bit(BIO_EOPNOTSUPP, &mddev->barrier->bi_flags);
 
 	rdev_dec_pending(rdev, mddev);
 
 	if (atomic_dec_and_test(&mddev->flush_pending)) {
-		if (mddev->barrier == POST_REQUEST_BARRIER) {
-			/* This was a post-request barrier */
-			mddev->barrier = NULL;
-			wake_up(&mddev->sb_wait);
-		} else
-			/* The pre-request barrier has finished */
-			schedule_work(&mddev->barrier_work);
+		/* The pre-request flush has finished */
+		schedule_work(&mddev->flush_work);
 	}
 	bio_put(bio);
 }
 
-static void submit_barriers(mddev_t *mddev)
+static void submit_flushes(mddev_t *mddev)
 {
 	mdk_rdev_t *rdev;
 
@@ -332,60 +321,56 @@ static void submit_barriers(mddev_t *mddev)
 			atomic_inc(&rdev->nr_pending);
 			rcu_read_unlock();
 			bi = bio_alloc(GFP_KERNEL, 0);
-			bi->bi_end_io = md_end_barrier;
+			bi->bi_end_io = md_end_flush;
 			bi->bi_private = rdev;
 			bi->bi_bdev = rdev->bdev;
 			atomic_inc(&mddev->flush_pending);
-			submit_bio(WRITE_BARRIER, bi);
+			submit_bio(WRITE_FLUSH, bi);
 			rcu_read_lock();
 			rdev_dec_pending(rdev, mddev);
 		}
 	rcu_read_unlock();
 }
 
-static void md_submit_barrier(struct work_struct *ws)
+static void md_submit_flush_data(struct work_struct *ws)
 {
-	mddev_t *mddev = container_of(ws, mddev_t, barrier_work);
-	struct bio *bio = mddev->barrier;
+	mddev_t *mddev = container_of(ws, mddev_t, flush_work);
+	struct bio *bio = mddev->flush_bio;
 
 	atomic_set(&mddev->flush_pending, 1);
 
-	if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags))
-		bio_endio(bio, -EOPNOTSUPP);
-	else if (bio->bi_size == 0)
+	if (bio->bi_size == 0)
 		/* an empty barrier - all done */
 		bio_endio(bio, 0);
 	else {
-		bio->bi_rw &= ~REQ_HARDBARRIER;
+		bio->bi_rw &= ~REQ_FLUSH;
 		if (mddev->pers->make_request(mddev, bio))
 			generic_make_request(bio);
-		mddev->barrier = POST_REQUEST_BARRIER;
-		submit_barriers(mddev);
 	}
 	if (atomic_dec_and_test(&mddev->flush_pending)) {
-		mddev->barrier = NULL;
+		mddev->flush_bio = NULL;
 		wake_up(&mddev->sb_wait);
 	}
 }
 
-void md_barrier_request(mddev_t *mddev, struct bio *bio)
+void md_flush_request(mddev_t *mddev, struct bio *bio)
 {
 	spin_lock_irq(&mddev->write_lock);
 	wait_event_lock_irq(mddev->sb_wait,
-			    !mddev->barrier,
+			    !mddev->flush_bio,
 			    mddev->write_lock, /*nothing*/);
-	mddev->barrier = bio;
+	mddev->flush_bio = bio;
 	spin_unlock_irq(&mddev->write_lock);
 
 	atomic_set(&mddev->flush_pending, 1);
-	INIT_WORK(&mddev->barrier_work, md_submit_barrier);
+	INIT_WORK(&mddev->flush_work, md_submit_flush_data);
 
-	submit_barriers(mddev);
+	submit_flushes(mddev);
 
 	if (atomic_dec_and_test(&mddev->flush_pending))
-		schedule_work(&mddev->barrier_work);
+		schedule_work(&mddev->flush_work);
 }
-EXPORT_SYMBOL(md_barrier_request);
+EXPORT_SYMBOL(md_flush_request);
 
 /* Support for plugging.
  * This mirrors the plugging support in request_queue, but does not
@@ -696,31 +681,6 @@ static void super_written(struct bio *bio, int error)
 	bio_put(bio);
 }
 
-static void super_written_barrier(struct bio *bio, int error)
-{
-	struct bio *bio2 = bio->bi_private;
-	mdk_rdev_t *rdev = bio2->bi_private;
-	mddev_t *mddev = rdev->mddev;
-
-	if (!test_bit(BIO_UPTODATE, &bio->bi_flags) &&
-	    error == -EOPNOTSUPP) {
-		unsigned long flags;
-		/* barriers don't appear to be supported :-( */
-		set_bit(BarriersNotsupp, &rdev->flags);
-		mddev->barriers_work = 0;
-		spin_lock_irqsave(&mddev->write_lock, flags);
-		bio2->bi_next = mddev->biolist;
-		mddev->biolist = bio2;
-		spin_unlock_irqrestore(&mddev->write_lock, flags);
-		wake_up(&mddev->sb_wait);
-		bio_put(bio);
-	} else {
-		bio_put(bio2);
-		bio->bi_private = rdev;
-		super_written(bio, error);
-	}
-}
-
 void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
 		   sector_t sector, int size, struct page *page)
 {
@@ -729,51 +689,28 @@ void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
 	 * and decrement it on completion, waking up sb_wait
 	 * if zero is reached.
 	 * If an error occurred, call md_error
-	 *
-	 * As we might need to resubmit the request if REQ_HARDBARRIER
-	 * causes ENOTSUPP, we allocate a spare bio...
 	 */
 	struct bio *bio = bio_alloc(GFP_NOIO, 1);
-	int rw = REQ_WRITE | REQ_SYNC | REQ_UNPLUG;
 
 	bio->bi_bdev = rdev->bdev;
 	bio->bi_sector = sector;
 	bio_add_page(bio, page, size, 0);
 	bio->bi_private = rdev;
 	bio->bi_end_io = super_written;
-	bio->bi_rw = rw;
 
 	atomic_inc(&mddev->pending_writes);
-	if (!test_bit(BarriersNotsupp, &rdev->flags)) {
-		struct bio *rbio;
-		rw |= REQ_HARDBARRIER;
-		rbio = bio_clone(bio, GFP_NOIO);
-		rbio->bi_private = bio;
-		rbio->bi_end_io = super_written_barrier;
-		submit_bio(rw, rbio);
-	} else
-		submit_bio(rw, bio);
+	submit_bio(REQ_WRITE | REQ_SYNC | REQ_UNPLUG | REQ_FLUSH | REQ_FUA,
+		   bio);
 }
 
 void md_super_wait(mddev_t *mddev)
 {
-	/* wait for all superblock writes that were scheduled to complete.
-	 * if any had to be retried (due to BARRIER problems), retry them
-	 */
+	/* wait for all superblock writes that were scheduled to complete */
 	DEFINE_WAIT(wq);
 	for(;;) {
 		prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE);
 		if (atomic_read(&mddev->pending_writes)==0)
 			break;
-		while (mddev->biolist) {
-			struct bio *bio;
-			spin_lock_irq(&mddev->write_lock);
-			bio = mddev->biolist;
-			mddev->biolist = bio->bi_next ;
-			bio->bi_next = NULL;
-			spin_unlock_irq(&mddev->write_lock);
-			submit_bio(bio->bi_rw, bio);
-		}
 		schedule();
 	}
 	finish_wait(&mddev->sb_wait, &wq);
@@ -1070,7 +1007,6 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 	clear_bit(Faulty, &rdev->flags);
 	clear_bit(In_sync, &rdev->flags);
 	clear_bit(WriteMostly, &rdev->flags);
-	clear_bit(BarriersNotsupp, &rdev->flags);
 
 	if (mddev->raid_disks == 0) {
 		mddev->major_version = 0;
@@ -1485,7 +1421,6 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 	clear_bit(Faulty, &rdev->flags);
 	clear_bit(In_sync, &rdev->flags);
 	clear_bit(WriteMostly, &rdev->flags);
-	clear_bit(BarriersNotsupp, &rdev->flags);
 
 	if (mddev->raid_disks == 0) {
 		mddev->major_version = 1;
@@ -4506,7 +4441,6 @@ int md_run(mddev_t *mddev)
 	/* may be over-ridden by personality */
 	mddev->resync_max_sectors = mddev->dev_sectors;
 
-	mddev->barriers_work = 1;
 	mddev->ok_start_degraded = start_dirty_degraded;
 
 	if (start_readonly && mddev->ro == 0)
@@ -4685,7 +4619,6 @@ static void md_clean(mddev_t *mddev)
 	mddev->recovery = 0;
 	mddev->in_sync = 0;
 	mddev->degraded = 0;
-	mddev->barriers_work = 0;
 	mddev->safemode = 0;
 	mddev->bitmap_info.offset = 0;
 	mddev->bitmap_info.default_offset = 0;
diff --git a/drivers/md/md.h b/drivers/md/md.h
index a953fe2808ae..d8e2ab25103b 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -87,7 +87,6 @@ struct mdk_rdev_s
 #define	Faulty		1		/* device is known to have a fault */
 #define	In_sync		2		/* device is in_sync with rest of array */
 #define	WriteMostly	4		/* Avoid reading if at all possible */
-#define	BarriersNotsupp	5		/* REQ_HARDBARRIER is not supported */
 #define	AllReserved	6		/* If whole device is reserved for
 					 * one array */
 #define	AutoDetected	7		/* added by auto-detect */
@@ -273,13 +272,6 @@ struct mddev_s
 	int				degraded;	/* whether md should consider
 							 * adding a spare
 							 */
-	int				barriers_work;	/* initialised to true, cleared as soon
-							 * as a barrier request to slave
-							 * fails.  Only supported
-							 */
-	struct bio			*biolist; 	/* bios that need to be retried
-							 * because REQ_HARDBARRIER is not supported
-							 */
 
 	atomic_t			recovery_active; /* blocks scheduled, but not written */
 	wait_queue_head_t		recovery_wait;
@@ -339,16 +331,13 @@ struct mddev_s
 	struct attribute_group		*to_remove;
 	struct plug_handle		*plug; /* if used by personality */
 
-	/* Generic barrier handling.
-	 * If there is a pending barrier request, all other
-	 * writes are blocked while the devices are flushed.
-	 * The last to finish a flush schedules a worker to
-	 * submit the barrier request (without the barrier flag),
-	 * then submit more flush requests.
+	/* Generic flush handling.
+	 * The last to finish preflush schedules a worker to submit
+	 * the rest of the request (without the REQ_FLUSH flag).
 	 */
-	struct bio *barrier;
+	struct bio *flush_bio;
 	atomic_t flush_pending;
-	struct work_struct barrier_work;
+	struct work_struct flush_work;
 	struct work_struct event_work;	/* used by dm to report failure event */
 };
 
@@ -502,7 +491,7 @@ extern void md_done_sync(mddev_t *mddev, int blocks, int ok);
 extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev);
 
 extern int mddev_congested(mddev_t *mddev, int bits);
-extern void md_barrier_request(mddev_t *mddev, struct bio *bio);
+extern void md_flush_request(mddev_t *mddev, struct bio *bio);
 extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
 			   sector_t sector, int size, struct page *page);
 extern void md_super_wait(mddev_t *mddev);
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 0307d217e7a4..6d7ddf32ef2e 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -142,8 +142,8 @@ static int multipath_make_request(mddev_t *mddev, struct bio * bio)
 	struct multipath_bh * mp_bh;
 	struct multipath_info *multipath;
 
-	if (unlikely(bio->bi_rw & REQ_HARDBARRIER)) {
-		md_barrier_request(mddev, bio);
+	if (unlikely(bio->bi_rw & REQ_FLUSH)) {
+		md_flush_request(mddev, bio);
 		return 0;
 	}
 
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 6f7af46d623c..a39f4c355e55 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -483,8 +483,8 @@ static int raid0_make_request(mddev_t *mddev, struct bio *bio)
 	struct strip_zone *zone;
 	mdk_rdev_t *tmp_dev;
 
-	if (unlikely(bio->bi_rw & REQ_HARDBARRIER)) {
-		md_barrier_request(mddev, bio);
+	if (unlikely(bio->bi_rw & REQ_FLUSH)) {
+		md_flush_request(mddev, bio);
 		return 0;
 	}
 
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index ad83a4dcadc3..886a9d865488 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -319,83 +319,74 @@ static void raid1_end_write_request(struct bio *bio, int error)
 		if (r1_bio->bios[mirror] == bio)
 			break;
 
-	if (error == -EOPNOTSUPP && test_bit(R1BIO_Barrier, &r1_bio->state)) {
-		set_bit(BarriersNotsupp, &conf->mirrors[mirror].rdev->flags);
-		set_bit(R1BIO_BarrierRetry, &r1_bio->state);
-		r1_bio->mddev->barriers_work = 0;
-		/* Don't rdev_dec_pending in this branch - keep it for the retry */
-	} else {
+	/*
+	 * 'one mirror IO has finished' event handler:
+	 */
+	r1_bio->bios[mirror] = NULL;
+	to_put = bio;
+	if (!uptodate) {
+		md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);
+		/* an I/O failed, we can't clear the bitmap */
+		set_bit(R1BIO_Degraded, &r1_bio->state);
+	} else
 		/*
-		 * this branch is our 'one mirror IO has finished' event handler:
+		 * Set R1BIO_Uptodate in our master bio, so that we
+		 * will return a good error code for to the higher
+		 * levels even if IO on some other mirrored buffer
+		 * fails.
+		 *
+		 * The 'master' represents the composite IO operation
+		 * to user-side. So if something waits for IO, then it
+		 * will wait for the 'master' bio.
 		 */
-		r1_bio->bios[mirror] = NULL;
-		to_put = bio;
-		if (!uptodate) {
-			md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);
-			/* an I/O failed, we can't clear the bitmap */
-			set_bit(R1BIO_Degraded, &r1_bio->state);
-		} else
-			/*
-			 * Set R1BIO_Uptodate in our master bio, so that
-			 * we will return a good error code for to the higher
-			 * levels even if IO on some other mirrored buffer fails.
-			 *
-			 * The 'master' represents the composite IO operation to
-			 * user-side. So if something waits for IO, then it will
-			 * wait for the 'master' bio.
-			 */
-			set_bit(R1BIO_Uptodate, &r1_bio->state);
-
-		update_head_pos(mirror, r1_bio);
-
-		if (behind) {
-			if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags))
-				atomic_dec(&r1_bio->behind_remaining);
-
-			/* In behind mode, we ACK the master bio once the I/O has safely
-			 * reached all non-writemostly disks. Setting the Returned bit
-			 * ensures that this gets done only once -- we don't ever want to
-			 * return -EIO here, instead we'll wait */
-
-			if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) &&
-			    test_bit(R1BIO_Uptodate, &r1_bio->state)) {
-				/* Maybe we can return now */
-				if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
-					struct bio *mbio = r1_bio->master_bio;
-					PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n",
-					       (unsigned long long) mbio->bi_sector,
-					       (unsigned long long) mbio->bi_sector +
-					       (mbio->bi_size >> 9) - 1);
-					bio_endio(mbio, 0);
-				}
+		set_bit(R1BIO_Uptodate, &r1_bio->state);
+
+	update_head_pos(mirror, r1_bio);
+
+	if (behind) {
+		if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags))
+			atomic_dec(&r1_bio->behind_remaining);
+
+		/*
+		 * In behind mode, we ACK the master bio once the I/O
+		 * has safely reached all non-writemostly
+		 * disks. Setting the Returned bit ensures that this
+		 * gets done only once -- we don't ever want to return
+		 * -EIO here, instead we'll wait
+		 */
+		if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) &&
+		    test_bit(R1BIO_Uptodate, &r1_bio->state)) {
+			/* Maybe we can return now */
+			if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
+				struct bio *mbio = r1_bio->master_bio;
+				PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n",
+				       (unsigned long long) mbio->bi_sector,
+				       (unsigned long long) mbio->bi_sector +
+				       (mbio->bi_size >> 9) - 1);
+				bio_endio(mbio, 0);
 			}
 		}
-		rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
 	}
+	rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
+
 	/*
-	 *
 	 * Let's see if all mirrored write operations have finished
 	 * already.
 	 */
 	if (atomic_dec_and_test(&r1_bio->remaining)) {
-		if (test_bit(R1BIO_BarrierRetry, &r1_bio->state))
-			reschedule_retry(r1_bio);
-		else {
-			/* it really is the end of this request */
-			if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
-				/* free extra copy of the data pages */
-				int i = bio->bi_vcnt;
-				while (i--)
-					safe_put_page(bio->bi_io_vec[i].bv_page);
-			}
-			/* clear the bitmap if all writes complete successfully */
-			bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
-					r1_bio->sectors,
-					!test_bit(R1BIO_Degraded, &r1_bio->state),
-					behind);
-			md_write_end(r1_bio->mddev);
-			raid_end_bio_io(r1_bio);
+		if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
+			/* free extra copy of the data pages */
+			int i = bio->bi_vcnt;
+			while (i--)
+				safe_put_page(bio->bi_io_vec[i].bv_page);
 		}
+		/* clear the bitmap if all writes complete successfully */
+		bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
+				r1_bio->sectors,
+				!test_bit(R1BIO_Degraded, &r1_bio->state),
+				behind);
+		md_write_end(r1_bio->mddev);
+		raid_end_bio_io(r1_bio);
 	}
 
 	if (to_put)
@@ -788,16 +779,13 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 	struct page **behind_pages = NULL;
 	const int rw = bio_data_dir(bio);
 	const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
-	unsigned long do_barriers;
+	const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));
 	mdk_rdev_t *blocked_rdev;
 
 	/*
 	 * Register the new request and wait if the reconstruction
 	 * thread has put up a bar for new requests.
 	 * Continue immediately if no resync is active currently.
-	 * We test barriers_work *after* md_write_start as md_write_start
-	 * may cause the first superblock write, and that will check out
-	 * if barriers work.
 	 */
 
 	md_write_start(mddev, bio); /* wait on superblock update early */
@@ -821,13 +809,6 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 		}
 		finish_wait(&conf->wait_barrier, &w);
 	}
-	if (unlikely(!mddev->barriers_work &&
-		     (bio->bi_rw & REQ_HARDBARRIER))) {
-		if (rw == WRITE)
-			md_write_end(mddev);
-		bio_endio(bio, -EOPNOTSUPP);
-		return 0;
-	}
 
 	wait_barrier(conf);
 
@@ -959,10 +940,6 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 	atomic_set(&r1_bio->remaining, 0);
 	atomic_set(&r1_bio->behind_remaining, 0);
 
-	do_barriers = bio->bi_rw & REQ_HARDBARRIER;
-	if (do_barriers)
-		set_bit(R1BIO_Barrier, &r1_bio->state);
-
 	bio_list_init(&bl);
 	for (i = 0; i < disks; i++) {
 		struct bio *mbio;
@@ -975,7 +952,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 		mbio->bi_sector	= r1_bio->sector + conf->mirrors[i].rdev->data_offset;
 		mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
 		mbio->bi_end_io	= raid1_end_write_request;
-		mbio->bi_rw = WRITE | do_barriers | do_sync;
+		mbio->bi_rw = WRITE | do_flush_fua | do_sync;
 		mbio->bi_private = r1_bio;
 
 		if (behind_pages) {
@@ -1634,41 +1611,6 @@ static void raid1d(mddev_t *mddev)
 		if (test_bit(R1BIO_IsSync, &r1_bio->state)) {
 			sync_request_write(mddev, r1_bio);
 			unplug = 1;
-		} else if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) {
-			/* some requests in the r1bio were REQ_HARDBARRIER
-			 * requests which failed with -EOPNOTSUPP.  Hohumm..
-			 * Better resubmit without the barrier.
-			 * We know which devices to resubmit for, because
-			 * all others have had their bios[] entry cleared.
-			 * We already have a nr_pending reference on these rdevs.
-			 */
-			int i;
-			const unsigned long do_sync = (r1_bio->master_bio->bi_rw & REQ_SYNC);
-			clear_bit(R1BIO_BarrierRetry, &r1_bio->state);
-			clear_bit(R1BIO_Barrier, &r1_bio->state);
-			for (i=0; i < conf->raid_disks; i++)
-				if (r1_bio->bios[i])
-					atomic_inc(&r1_bio->remaining);
-			for (i=0; i < conf->raid_disks; i++)
-				if (r1_bio->bios[i]) {
-					struct bio_vec *bvec;
-					int j;
-
-					bio = bio_clone(r1_bio->master_bio, GFP_NOIO);
-					/* copy pages from the failed bio, as
-					 * this might be a write-behind device */
-					__bio_for_each_segment(bvec, bio, j, 0)
-						bvec->bv_page = bio_iovec_idx(r1_bio->bios[i], j)->bv_page;
-					bio_put(r1_bio->bios[i]);
-					bio->bi_sector = r1_bio->sector +
-						conf->mirrors[i].rdev->data_offset;
-					bio->bi_bdev = conf->mirrors[i].rdev->bdev;
-					bio->bi_end_io = raid1_end_write_request;
-					bio->bi_rw = WRITE | do_sync;
-					bio->bi_private = r1_bio;
-					r1_bio->bios[i] = bio;
-					generic_make_request(bio);
-				}
 		} else {
 			int disk;
 
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index 5f2d443ae28a..adf8cfd73313 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -117,8 +117,6 @@ struct r1bio_s {
 #define	R1BIO_IsSync	1
 #define	R1BIO_Degraded	2
 #define	R1BIO_BehindIO	3
-#define	R1BIO_Barrier	4
-#define R1BIO_BarrierRetry 5
 /* For write-behind requests, we call bi_end_io when
  * the last non-write-behind device completes, providing
  * any write was successful.  Otherwise we call when
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 84718383124d..f0d082f749be 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -800,12 +800,13 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 	int chunk_sects = conf->chunk_mask + 1;
 	const int rw = bio_data_dir(bio);
 	const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
+	const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
 	struct bio_list bl;
 	unsigned long flags;
 	mdk_rdev_t *blocked_rdev;
 
-	if (unlikely(bio->bi_rw & REQ_HARDBARRIER)) {
-		md_barrier_request(mddev, bio);
+	if (unlikely(bio->bi_rw & REQ_FLUSH)) {
+		md_flush_request(mddev, bio);
 		return 0;
 	}
 
@@ -965,7 +966,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 			conf->mirrors[d].rdev->data_offset;
 		mbio->bi_bdev = conf->mirrors[d].rdev->bdev;
 		mbio->bi_end_io	= raid10_end_write_request;
-		mbio->bi_rw = WRITE | do_sync;
+		mbio->bi_rw = WRITE | do_sync | do_fua;
 		mbio->bi_private = r10_bio;
 
 		atomic_inc(&r10_bio->remaining);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 69b0a169e43d..31140d1259dc 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -506,9 +506,12 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 		int rw;
 		struct bio *bi;
 		mdk_rdev_t *rdev;
-		if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags))
-			rw = WRITE;
-		else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
+		if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
+			if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
+				rw = WRITE_FUA;
+			else
+				rw = WRITE;
+		} else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
 			rw = READ;
 		else
 			continue;
@@ -1031,6 +1034,8 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 
 			while (wbi && wbi->bi_sector <
 				dev->sector + STRIPE_SECTORS) {
+				if (wbi->bi_rw & REQ_FUA)
+					set_bit(R5_WantFUA, &dev->flags);
 				tx = async_copy_data(1, wbi, dev->page,
 					dev->sector, tx);
 				wbi = r5_next_bio(wbi, dev->sector);
@@ -1048,15 +1053,22 @@ static void ops_complete_reconstruct(void *stripe_head_ref)
 	int pd_idx = sh->pd_idx;
 	int qd_idx = sh->qd_idx;
 	int i;
+	bool fua = false;
 
 	pr_debug("%s: stripe %llu\n", __func__,
 		(unsigned long long)sh->sector);
 
+	for (i = disks; i--; )
+		fua |= test_bit(R5_WantFUA, &sh->dev[i].flags);
+
 	for (i = disks; i--; ) {
 		struct r5dev *dev = &sh->dev[i];
 
-		if (dev->written || i == pd_idx || i == qd_idx)
+		if (dev->written || i == pd_idx || i == qd_idx) {
 			set_bit(R5_UPTODATE, &dev->flags);
+			if (fua)
+				set_bit(R5_WantFUA, &dev->flags);
+		}
 	}
 
 	if (sh->reconstruct_state == reconstruct_state_drain_run)
@@ -3281,7 +3293,7 @@ static void handle_stripe5(struct stripe_head *sh)
 
 	if (dec_preread_active) {
 		/* We delay this until after ops_run_io so that if make_request
-		 * is waiting on a barrier, it won't continue until the writes
+		 * is waiting on a flush, it won't continue until the writes
 		 * have actually been submitted.
 		 */
 		atomic_dec(&conf->preread_active_stripes);
@@ -3583,7 +3595,7 @@ static void handle_stripe6(struct stripe_head *sh)
 
 	if (dec_preread_active) {
 		/* We delay this until after ops_run_io so that if make_request
-		 * is waiting on a barrier, it won't continue until the writes
+		 * is waiting on a flush, it won't continue until the writes
 		 * have actually been submitted.
 		 */
 		atomic_dec(&conf->preread_active_stripes);
@@ -3978,14 +3990,8 @@ static int make_request(mddev_t *mddev, struct bio * bi)
 	const int rw = bio_data_dir(bi);
 	int remaining;
 
-	if (unlikely(bi->bi_rw & REQ_HARDBARRIER)) {
-		/* Drain all pending writes.  We only really need
-		 * to ensure they have been submitted, but this is
-		 * easier.
-		 */
-		mddev->pers->quiesce(mddev, 1);
-		mddev->pers->quiesce(mddev, 0);
-		md_barrier_request(mddev, bi);
+	if (unlikely(bi->bi_rw & REQ_FLUSH)) {
+		md_flush_request(mddev, bi);
 		return 0;
 	}
 
@@ -4103,7 +4109,7 @@ static int make_request(mddev_t *mddev, struct bio * bi)
 			finish_wait(&conf->wait_for_overlap, &w);
 			set_bit(STRIPE_HANDLE, &sh->state);
 			clear_bit(STRIPE_DELAYED, &sh->state);
-			if (mddev->barrier && 
+			if ((bi->bi_rw & REQ_SYNC) &&
 			    !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
 				atomic_inc(&conf->preread_active_stripes);
 			release_stripe(sh);
@@ -4126,13 +4132,6 @@ static int make_request(mddev_t *mddev, struct bio * bi)
 		bio_endio(bi, 0);
 	}
 
-	if (mddev->barrier) {
-		/* We need to wait for the stripes to all be handled.
-		 * So: wait for preread_active_stripes to drop to 0.
-		 */
-		wait_event(mddev->thread->wqueue,
-			   atomic_read(&conf->preread_active_stripes) == 0);
-	}
 	return 0;
 }
 
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 36eaed5dfd6e..2ace0582b409 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -275,6 +275,7 @@ struct r6_state {
 				    * filling
 				    */
 #define R5_Wantdrain	13 /* dev->towrite needs to be drained */
+#define R5_WantFUA	14	/* Write should be FUA */
 /*
  * Write method
  */
-- 
cgit v1.2.3


From 3a2edd0d6ddbd5fa3b389ea6db811285415ce6c8 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 3 Sep 2010 11:56:18 +0200
Subject: block: make __blk_rq_prep_clone() copy most command flags

Currently __blk_rq_prep_clone() copies only REQ_WRITE and REQ_DISCARD.
There's no reason to omit other command flags and REQ_FUA needs to be
copied to implement FUA support in request-based dm.

REQ_COMMON_MASK which specifies flags to be copied from bio to request
already identifies all the command flags.  Define REQ_CLONE_MASK to be
the same as REQ_COMMON_MASK for clarity and make __blk_rq_prep_clone()
copy all flags in the mask.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-core.c          | 4 +---
 include/linux/blk_types.h | 1 +
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 495bdc4a23da..2a5b19204546 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2505,9 +2505,7 @@ EXPORT_SYMBOL_GPL(blk_rq_unprep_clone);
 static void __blk_rq_prep_clone(struct request *dst, struct request *src)
 {
 	dst->cpu = src->cpu;
-	dst->cmd_flags = (rq_data_dir(src) | REQ_NOMERGE);
-	if (src->cmd_flags & REQ_DISCARD)
-		dst->cmd_flags |= REQ_DISCARD;
+	dst->cmd_flags = (src->cmd_flags & REQ_CLONE_MASK) | REQ_NOMERGE;
 	dst->cmd_type = src->cmd_type;
 	dst->__sector = blk_rq_pos(src);
 	dst->__data_len = blk_rq_bytes(src);
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 179799479e6f..36edadf5b41a 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -168,6 +168,7 @@ enum rq_flag_bits {
 #define REQ_COMMON_MASK \
 	(REQ_WRITE | REQ_FAILFAST_MASK | REQ_HARDBARRIER | REQ_SYNC | \
 	 REQ_META | REQ_DISCARD | REQ_NOIDLE | REQ_FLUSH | REQ_FUA)
+#define REQ_CLONE_MASK		REQ_COMMON_MASK
 
 #define REQ_UNPLUG		(1 << __REQ_UNPLUG)
 #define REQ_RAHEAD		(1 << __REQ_RAHEAD)
-- 
cgit v1.2.3


From d87f4c14f27dc82d215108d8392a7d26687148a1 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 3 Sep 2010 11:56:19 +0200
Subject: dm: implement REQ_FLUSH/FUA support for bio-based dm

This patch converts bio-based dm to support REQ_FLUSH/FUA instead of
now deprecated REQ_HARDBARRIER.

* -EOPNOTSUPP handling logic dropped.

* Preflush is handled as before but postflush is dropped and replaced
  with passing down REQ_FUA to member request_queues.  This replaces
  one array wide cache flush w/ member specific FUA writes.

* __split_and_process_bio() now calls __clone_and_map_flush() directly
  for flushes and guarantees all FLUSH bio's going to targets are zero
`  length.

* It's now guaranteed that all FLUSH bio's which are passed onto dm
  targets are zero length.  bio_empty_barrier() tests are replaced
  with REQ_FLUSH tests.

* Empty WRITE_BARRIERs are replaced with WRITE_FLUSHes.

* Dropped unlikely() around REQ_FLUSH tests.  Flushes are not unlikely
  enough to be marked with unlikely().

* Block layer now filters out REQ_FLUSH/FUA bio's if the request_queue
  doesn't support cache flushing.  Advertise REQ_FLUSH | REQ_FUA
  capability.

* Request based dm isn't converted yet.  dm_init_request_based_queue()
  resets flush support to 0 for now.  To avoid disturbing request
  based dm code, dm->flush_error is added for bio based dm while
  requested based dm continues to use dm->barrier_error.

Lightly tested linear, stripe, raid1, snap and crypt targets.  Please
proceed with caution as I'm not familiar with the code base.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: dm-devel@redhat.com
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 drivers/md/dm-crypt.c           |   2 +-
 drivers/md/dm-io.c              |  20 ++-----
 drivers/md/dm-log.c             |   2 +-
 drivers/md/dm-raid1.c           |   8 +--
 drivers/md/dm-region-hash.c     |  16 +++---
 drivers/md/dm-snap-persistent.c |   2 +-
 drivers/md/dm-snap.c            |   6 +-
 drivers/md/dm-stripe.c          |   2 +-
 drivers/md/dm.c                 | 119 +++++++++++++++++++---------------------
 9 files changed, 80 insertions(+), 97 deletions(-)

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 368e8e98f705..d5b0e4c0e702 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1278,7 +1278,7 @@ static int crypt_map(struct dm_target *ti, struct bio *bio,
 	struct dm_crypt_io *io;
 	struct crypt_config *cc;
 
-	if (unlikely(bio_empty_barrier(bio))) {
+	if (bio->bi_rw & REQ_FLUSH) {
 		cc = ti->private;
 		bio->bi_bdev = cc->dev->bdev;
 		return DM_MAPIO_REMAPPED;
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 0590c75b0ab6..136d4f71a116 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -31,7 +31,6 @@ struct dm_io_client {
  */
 struct io {
 	unsigned long error_bits;
-	unsigned long eopnotsupp_bits;
 	atomic_t count;
 	struct task_struct *sleeper;
 	struct dm_io_client *client;
@@ -130,11 +129,8 @@ static void retrieve_io_and_region_from_bio(struct bio *bio, struct io **io,
  *---------------------------------------------------------------*/
 static void dec_count(struct io *io, unsigned int region, int error)
 {
-	if (error) {
+	if (error)
 		set_bit(region, &io->error_bits);
-		if (error == -EOPNOTSUPP)
-			set_bit(region, &io->eopnotsupp_bits);
-	}
 
 	if (atomic_dec_and_test(&io->count)) {
 		if (io->sleeper)
@@ -310,8 +306,8 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where,
 	sector_t remaining = where->count;
 
 	/*
-	 * where->count may be zero if rw holds a write barrier and we
-	 * need to send a zero-sized barrier.
+	 * where->count may be zero if rw holds a flush and we need to
+	 * send a zero-sized flush.
 	 */
 	do {
 		/*
@@ -364,7 +360,7 @@ static void dispatch_io(int rw, unsigned int num_regions,
 	 */
 	for (i = 0; i < num_regions; i++) {
 		*dp = old_pages;
-		if (where[i].count || (rw & REQ_HARDBARRIER))
+		if (where[i].count || (rw & REQ_FLUSH))
 			do_region(rw, i, where + i, dp, io);
 	}
 
@@ -393,9 +389,7 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,
 		return -EIO;
 	}
 
-retry:
 	io->error_bits = 0;
-	io->eopnotsupp_bits = 0;
 	atomic_set(&io->count, 1); /* see dispatch_io() */
 	io->sleeper = current;
 	io->client = client;
@@ -412,11 +406,6 @@ retry:
 	}
 	set_current_state(TASK_RUNNING);
 
-	if (io->eopnotsupp_bits && (rw & REQ_HARDBARRIER)) {
-		rw &= ~REQ_HARDBARRIER;
-		goto retry;
-	}
-
 	if (error_bits)
 		*error_bits = io->error_bits;
 
@@ -437,7 +426,6 @@ static int async_io(struct dm_io_client *client, unsigned int num_regions,
 
 	io = mempool_alloc(client->pool, GFP_NOIO);
 	io->error_bits = 0;
-	io->eopnotsupp_bits = 0;
 	atomic_set(&io->count, 1); /* see dispatch_io() */
 	io->sleeper = NULL;
 	io->client = client;
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index 5a08be0222db..33420e68d153 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -300,7 +300,7 @@ static int flush_header(struct log_c *lc)
 		.count = 0,
 	};
 
-	lc->io_req.bi_rw = WRITE_BARRIER;
+	lc->io_req.bi_rw = WRITE_FLUSH;
 
 	return dm_io(&lc->io_req, 1, &null_location, NULL);
 }
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 7c081bcbc3cf..19a59b041c27 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -259,7 +259,7 @@ static int mirror_flush(struct dm_target *ti)
 	struct dm_io_region io[ms->nr_mirrors];
 	struct mirror *m;
 	struct dm_io_request io_req = {
-		.bi_rw = WRITE_BARRIER,
+		.bi_rw = WRITE_FLUSH,
 		.mem.type = DM_IO_KMEM,
 		.mem.ptr.bvec = NULL,
 		.client = ms->io_client,
@@ -629,7 +629,7 @@ static void do_write(struct mirror_set *ms, struct bio *bio)
 	struct dm_io_region io[ms->nr_mirrors], *dest = io;
 	struct mirror *m;
 	struct dm_io_request io_req = {
-		.bi_rw = WRITE | (bio->bi_rw & WRITE_BARRIER),
+		.bi_rw = WRITE | (bio->bi_rw & WRITE_FLUSH_FUA),
 		.mem.type = DM_IO_BVEC,
 		.mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx,
 		.notify.fn = write_callback,
@@ -670,7 +670,7 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
 	bio_list_init(&requeue);
 
 	while ((bio = bio_list_pop(writes))) {
-		if (unlikely(bio_empty_barrier(bio))) {
+		if (bio->bi_rw & REQ_FLUSH) {
 			bio_list_add(&sync, bio);
 			continue;
 		}
@@ -1203,7 +1203,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio,
 	 * We need to dec pending if this was a write.
 	 */
 	if (rw == WRITE) {
-		if (likely(!bio_empty_barrier(bio)))
+		if (!(bio->bi_rw & REQ_FLUSH))
 			dm_rh_dec(ms->rh, map_context->ll);
 		return error;
 	}
diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c
index bd5c58b28868..dad011aed0c9 100644
--- a/drivers/md/dm-region-hash.c
+++ b/drivers/md/dm-region-hash.c
@@ -81,9 +81,9 @@ struct dm_region_hash {
 	struct list_head failed_recovered_regions;
 
 	/*
-	 * If there was a barrier failure no regions can be marked clean.
+	 * If there was a flush failure no regions can be marked clean.
 	 */
-	int barrier_failure;
+	int flush_failure;
 
 	void *context;
 	sector_t target_begin;
@@ -217,7 +217,7 @@ struct dm_region_hash *dm_region_hash_create(
 	INIT_LIST_HEAD(&rh->quiesced_regions);
 	INIT_LIST_HEAD(&rh->recovered_regions);
 	INIT_LIST_HEAD(&rh->failed_recovered_regions);
-	rh->barrier_failure = 0;
+	rh->flush_failure = 0;
 
 	rh->region_pool = mempool_create_kmalloc_pool(MIN_REGIONS,
 						      sizeof(struct dm_region));
@@ -399,8 +399,8 @@ void dm_rh_mark_nosync(struct dm_region_hash *rh, struct bio *bio)
 	region_t region = dm_rh_bio_to_region(rh, bio);
 	int recovering = 0;
 
-	if (bio_empty_barrier(bio)) {
-		rh->barrier_failure = 1;
+	if (bio->bi_rw & REQ_FLUSH) {
+		rh->flush_failure = 1;
 		return;
 	}
 
@@ -524,7 +524,7 @@ void dm_rh_inc_pending(struct dm_region_hash *rh, struct bio_list *bios)
 	struct bio *bio;
 
 	for (bio = bios->head; bio; bio = bio->bi_next) {
-		if (bio_empty_barrier(bio))
+		if (bio->bi_rw & REQ_FLUSH)
 			continue;
 		rh_inc(rh, dm_rh_bio_to_region(rh, bio));
 	}
@@ -555,9 +555,9 @@ void dm_rh_dec(struct dm_region_hash *rh, region_t region)
 		 */
 
 		/* do nothing for DM_RH_NOSYNC */
-		if (unlikely(rh->barrier_failure)) {
+		if (unlikely(rh->flush_failure)) {
 			/*
-			 * If a write barrier failed some time ago, we
+			 * If a write flush failed some time ago, we
 			 * don't know whether or not this write made it
 			 * to the disk, so we must resync the device.
 			 */
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index cc2bdb83f9ad..0b61792a2780 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -687,7 +687,7 @@ static void persistent_commit_exception(struct dm_exception_store *store,
 	/*
 	 * Commit exceptions to disk.
 	 */
-	if (ps->valid && area_io(ps, WRITE_BARRIER))
+	if (ps->valid && area_io(ps, WRITE_FLUSH_FUA))
 		ps->valid = 0;
 
 	/*
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 5974d3094d97..eed210152b75 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -1587,7 +1587,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
 	chunk_t chunk;
 	struct dm_snap_pending_exception *pe = NULL;
 
-	if (unlikely(bio_empty_barrier(bio))) {
+	if (bio->bi_rw & REQ_FLUSH) {
 		bio->bi_bdev = s->cow->bdev;
 		return DM_MAPIO_REMAPPED;
 	}
@@ -1691,7 +1691,7 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio,
 	int r = DM_MAPIO_REMAPPED;
 	chunk_t chunk;
 
-	if (unlikely(bio_empty_barrier(bio))) {
+	if (bio->bi_rw & REQ_FLUSH) {
 		if (!map_context->target_request_nr)
 			bio->bi_bdev = s->origin->bdev;
 		else
@@ -2135,7 +2135,7 @@ static int origin_map(struct dm_target *ti, struct bio *bio,
 	struct dm_dev *dev = ti->private;
 	bio->bi_bdev = dev->bdev;
 
-	if (unlikely(bio_empty_barrier(bio)))
+	if (bio->bi_rw & REQ_FLUSH)
 		return DM_MAPIO_REMAPPED;
 
 	/* Only tell snapshots if this is a write */
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index c297f6da91ea..f0371b4c4fbf 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -271,7 +271,7 @@ static int stripe_map(struct dm_target *ti, struct bio *bio,
 	uint32_t stripe;
 	unsigned target_request_nr;
 
-	if (unlikely(bio_empty_barrier(bio))) {
+	if (bio->bi_rw & REQ_FLUSH) {
 		target_request_nr = map_context->target_request_nr;
 		BUG_ON(target_request_nr >= sc->stripes);
 		bio->bi_bdev = sc->stripe[target_request_nr].dev->bdev;
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index b1d92be8f990..32e6622767ad 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -144,15 +144,16 @@ struct mapped_device {
 	spinlock_t deferred_lock;
 
 	/*
-	 * An error from the barrier request currently being processed.
+	 * An error from the flush request currently being processed.
 	 */
-	int barrier_error;
+	int flush_error;
 
 	/*
 	 * Protect barrier_error from concurrent endio processing
 	 * in request-based dm.
 	 */
 	spinlock_t barrier_error_lock;
+	int barrier_error;
 
 	/*
 	 * Processing queue (flush/barriers)
@@ -200,8 +201,8 @@ struct mapped_device {
 	/* sysfs handle */
 	struct kobject kobj;
 
-	/* zero-length barrier that will be cloned and submitted to targets */
-	struct bio barrier_bio;
+	/* zero-length flush that will be cloned and submitted to targets */
+	struct bio flush_bio;
 };
 
 /*
@@ -512,7 +513,7 @@ static void end_io_acct(struct dm_io *io)
 
 	/*
 	 * After this is decremented the bio must not be touched if it is
-	 * a barrier.
+	 * a flush.
 	 */
 	dm_disk(md)->part0.in_flight[rw] = pending =
 		atomic_dec_return(&md->pending[rw]);
@@ -626,7 +627,7 @@ static void dec_pending(struct dm_io *io, int error)
 			 */
 			spin_lock_irqsave(&md->deferred_lock, flags);
 			if (__noflush_suspending(md)) {
-				if (!(io->bio->bi_rw & REQ_HARDBARRIER))
+				if (!(io->bio->bi_rw & REQ_FLUSH))
 					bio_list_add_head(&md->deferred,
 							  io->bio);
 			} else
@@ -638,20 +639,14 @@ static void dec_pending(struct dm_io *io, int error)
 		io_error = io->error;
 		bio = io->bio;
 
-		if (bio->bi_rw & REQ_HARDBARRIER) {
+		if (bio->bi_rw & REQ_FLUSH) {
 			/*
-			 * There can be just one barrier request so we use
+			 * There can be just one flush request so we use
 			 * a per-device variable for error reporting.
 			 * Note that you can't touch the bio after end_io_acct
-			 *
-			 * We ignore -EOPNOTSUPP for empty flush reported by
-			 * underlying devices. We assume that if the device
-			 * doesn't support empty barriers, it doesn't need
-			 * cache flushing commands.
 			 */
-			if (!md->barrier_error &&
-			    !(bio_empty_barrier(bio) && io_error == -EOPNOTSUPP))
-				md->barrier_error = io_error;
+			if (!md->flush_error)
+				md->flush_error = io_error;
 			end_io_acct(io);
 			free_io(md, io);
 		} else {
@@ -1119,7 +1114,7 @@ static void dm_bio_destructor(struct bio *bio)
 }
 
 /*
- * Creates a little bio that is just does part of a bvec.
+ * Creates a little bio that just does part of a bvec.
  */
 static struct bio *split_bvec(struct bio *bio, sector_t sector,
 			      unsigned short idx, unsigned int offset,
@@ -1134,7 +1129,7 @@ static struct bio *split_bvec(struct bio *bio, sector_t sector,
 
 	clone->bi_sector = sector;
 	clone->bi_bdev = bio->bi_bdev;
-	clone->bi_rw = bio->bi_rw & ~REQ_HARDBARRIER;
+	clone->bi_rw = bio->bi_rw;
 	clone->bi_vcnt = 1;
 	clone->bi_size = to_bytes(len);
 	clone->bi_io_vec->bv_offset = offset;
@@ -1161,7 +1156,6 @@ static struct bio *clone_bio(struct bio *bio, sector_t sector,
 
 	clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs);
 	__bio_clone(clone, bio);
-	clone->bi_rw &= ~REQ_HARDBARRIER;
 	clone->bi_destructor = dm_bio_destructor;
 	clone->bi_sector = sector;
 	clone->bi_idx = idx;
@@ -1225,7 +1219,7 @@ static void __issue_target_requests(struct clone_info *ci, struct dm_target *ti,
 		__issue_target_request(ci, ti, request_nr, len);
 }
 
-static int __clone_and_map_empty_barrier(struct clone_info *ci)
+static int __clone_and_map_flush(struct clone_info *ci)
 {
 	unsigned target_nr = 0;
 	struct dm_target *ti;
@@ -1289,9 +1283,6 @@ static int __clone_and_map(struct clone_info *ci)
 	sector_t len = 0, max;
 	struct dm_target_io *tio;
 
-	if (unlikely(bio_empty_barrier(bio)))
-		return __clone_and_map_empty_barrier(ci);
-
 	if (unlikely(bio->bi_rw & REQ_DISCARD))
 		return __clone_and_map_discard(ci);
 
@@ -1383,11 +1374,11 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
 
 	ci.map = dm_get_live_table(md);
 	if (unlikely(!ci.map)) {
-		if (!(bio->bi_rw & REQ_HARDBARRIER))
+		if (!(bio->bi_rw & REQ_FLUSH))
 			bio_io_error(bio);
 		else
-			if (!md->barrier_error)
-				md->barrier_error = -EIO;
+			if (!md->flush_error)
+				md->flush_error = -EIO;
 		return;
 	}
 
@@ -1400,14 +1391,22 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
 	ci.io->md = md;
 	spin_lock_init(&ci.io->endio_lock);
 	ci.sector = bio->bi_sector;
-	ci.sector_count = bio_sectors(bio);
-	if (unlikely(bio_empty_barrier(bio)))
+	if (!(bio->bi_rw & REQ_FLUSH))
+		ci.sector_count = bio_sectors(bio);
+	else {
+		/* all FLUSH bio's reaching here should be empty */
+		WARN_ON_ONCE(bio_has_data(bio));
 		ci.sector_count = 1;
+	}
 	ci.idx = bio->bi_idx;
 
 	start_io_acct(ci.io);
-	while (ci.sector_count && !error)
-		error = __clone_and_map(&ci);
+	while (ci.sector_count && !error) {
+		if (!(bio->bi_rw & REQ_FLUSH))
+			error = __clone_and_map(&ci);
+		else
+			error = __clone_and_map_flush(&ci);
+	}
 
 	/* drop the extra reference count */
 	dec_pending(ci.io, error);
@@ -1492,11 +1491,11 @@ static int _dm_request(struct request_queue *q, struct bio *bio)
 	part_stat_unlock();
 
 	/*
-	 * If we're suspended or the thread is processing barriers
+	 * If we're suspended or the thread is processing flushes
 	 * we have to queue this io for later.
 	 */
 	if (unlikely(test_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) ||
-	    unlikely(bio->bi_rw & REQ_HARDBARRIER)) {
+	    (bio->bi_rw & REQ_FLUSH)) {
 		up_read(&md->io_lock);
 
 		if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) &&
@@ -1940,6 +1939,7 @@ static void dm_init_md_queue(struct mapped_device *md)
 	blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
 	md->queue->unplug_fn = dm_unplug_all;
 	blk_queue_merge_bvec(md->queue, dm_merge_bvec);
+	blk_queue_flush(md->queue, REQ_FLUSH | REQ_FUA);
 }
 
 /*
@@ -2245,7 +2245,8 @@ static int dm_init_request_based_queue(struct mapped_device *md)
 	blk_queue_softirq_done(md->queue, dm_softirq_done);
 	blk_queue_prep_rq(md->queue, dm_prep_fn);
 	blk_queue_lld_busy(md->queue, dm_lld_busy);
-	blk_queue_flush(md->queue, REQ_FLUSH);
+	/* no flush support for request based dm yet */
+	blk_queue_flush(md->queue, 0);
 
 	elv_register_queue(md->queue);
 
@@ -2406,41 +2407,35 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
 	return r;
 }
 
-static void dm_flush(struct mapped_device *md)
+static void process_flush(struct mapped_device *md, struct bio *bio)
 {
-	dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
-
-	bio_init(&md->barrier_bio);
-	md->barrier_bio.bi_bdev = md->bdev;
-	md->barrier_bio.bi_rw = WRITE_BARRIER;
-	__split_and_process_bio(md, &md->barrier_bio);
+	md->flush_error = 0;
 
+	/* handle REQ_FLUSH */
 	dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
-}
 
-static void process_barrier(struct mapped_device *md, struct bio *bio)
-{
-	md->barrier_error = 0;
+	bio_init(&md->flush_bio);
+	md->flush_bio.bi_bdev = md->bdev;
+	md->flush_bio.bi_rw = WRITE_FLUSH;
+	__split_and_process_bio(md, &md->flush_bio);
 
-	dm_flush(md);
+	dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
 
-	if (!bio_empty_barrier(bio)) {
-		__split_and_process_bio(md, bio);
-		/*
-		 * If the request isn't supported, don't waste time with
-		 * the second flush.
-		 */
-		if (md->barrier_error != -EOPNOTSUPP)
-			dm_flush(md);
+	/* if it's an empty flush or the preflush failed, we're done */
+	if (!bio_has_data(bio) || md->flush_error) {
+		if (md->flush_error != DM_ENDIO_REQUEUE)
+			bio_endio(bio, md->flush_error);
+		else {
+			spin_lock_irq(&md->deferred_lock);
+			bio_list_add_head(&md->deferred, bio);
+			spin_unlock_irq(&md->deferred_lock);
+		}
+		return;
 	}
 
-	if (md->barrier_error != DM_ENDIO_REQUEUE)
-		bio_endio(bio, md->barrier_error);
-	else {
-		spin_lock_irq(&md->deferred_lock);
-		bio_list_add_head(&md->deferred, bio);
-		spin_unlock_irq(&md->deferred_lock);
-	}
+	/* issue data + REQ_FUA */
+	bio->bi_rw &= ~REQ_FLUSH;
+	__split_and_process_bio(md, bio);
 }
 
 /*
@@ -2469,8 +2464,8 @@ static void dm_wq_work(struct work_struct *work)
 		if (dm_request_based(md))
 			generic_make_request(c);
 		else {
-			if (c->bi_rw & REQ_HARDBARRIER)
-				process_barrier(md, c);
+			if (c->bi_rw & REQ_FLUSH)
+				process_flush(md, c);
 			else
 				__split_and_process_bio(md, c);
 		}
-- 
cgit v1.2.3


From 29e4013de7ad950280e4b220894986866697d419 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 8 Sep 2010 18:07:00 +0200
Subject: dm: implement REQ_FLUSH/FUA support for request-based dm

This patch converts request-based dm to support the new REQ_FLUSH/FUA.

The original request-based flush implementation depended on
request_queue blocking other requests while a barrier sequence is in
progress, which is no longer true for the new REQ_FLUSH/FUA.

In general, request-based dm doesn't have infrastructure for cloning
one source request to multiple targets, but the original flush
implementation had a special mostly independent path which can issue
flushes to multiple targets and sequence them.  However, the
capability isn't currently in use and adds a lot of complexity.
Moreoever, it's unlikely to be useful in its current form as it
doesn't make sense to be able to send out flushes to multiple targets
when write requests can't be.

This patch rips out special flush code path and deals handles
REQ_FLUSH/FUA requests the same way as other requests.  The only
special treatment is that REQ_FLUSH requests use the block address 0
when finding target, which is enough for now.

* added BUG_ON(!dm_target_is_valid(ti)) in dm_request_fn() as
  suggested by Mike Snitzer

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Mike Snitzer <snitzer@redhat.com>
Tested-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 drivers/md/dm.c | 206 ++++++--------------------------------------------------
 1 file changed, 22 insertions(+), 184 deletions(-)

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 32e6622767ad..65114e4d9f65 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -149,20 +149,9 @@ struct mapped_device {
 	int flush_error;
 
 	/*
-	 * Protect barrier_error from concurrent endio processing
-	 * in request-based dm.
-	 */
-	spinlock_t barrier_error_lock;
-	int barrier_error;
-
-	/*
-	 * Processing queue (flush/barriers)
+	 * Processing queue (flush)
 	 */
 	struct workqueue_struct *wq;
-	struct work_struct barrier_work;
-
-	/* A pointer to the currently processing pre/post flush request */
-	struct request *flush_request;
 
 	/*
 	 * The current mapping.
@@ -750,23 +739,6 @@ static void end_clone_bio(struct bio *clone, int error)
 	blk_update_request(tio->orig, 0, nr_bytes);
 }
 
-static void store_barrier_error(struct mapped_device *md, int error)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&md->barrier_error_lock, flags);
-	/*
-	 * Basically, the first error is taken, but:
-	 *   -EOPNOTSUPP supersedes any I/O error.
-	 *   Requeue request supersedes any I/O error but -EOPNOTSUPP.
-	 */
-	if (!md->barrier_error || error == -EOPNOTSUPP ||
-	    (md->barrier_error != -EOPNOTSUPP &&
-	     error == DM_ENDIO_REQUEUE))
-		md->barrier_error = error;
-	spin_unlock_irqrestore(&md->barrier_error_lock, flags);
-}
-
 /*
  * Don't touch any member of the md after calling this function because
  * the md may be freed in dm_put() at the end of this function.
@@ -804,13 +776,11 @@ static void free_rq_clone(struct request *clone)
 static void dm_end_request(struct request *clone, int error)
 {
 	int rw = rq_data_dir(clone);
-	int run_queue = 1;
-	bool is_barrier = clone->cmd_flags & REQ_HARDBARRIER;
 	struct dm_rq_target_io *tio = clone->end_io_data;
 	struct mapped_device *md = tio->md;
 	struct request *rq = tio->orig;
 
-	if (rq->cmd_type == REQ_TYPE_BLOCK_PC && !is_barrier) {
+	if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
 		rq->errors = clone->errors;
 		rq->resid_len = clone->resid_len;
 
@@ -824,15 +794,8 @@ static void dm_end_request(struct request *clone, int error)
 	}
 
 	free_rq_clone(clone);
-
-	if (unlikely(is_barrier)) {
-		if (unlikely(error))
-			store_barrier_error(md, error);
-		run_queue = 0;
-	} else
-		blk_end_request_all(rq, error);
-
-	rq_completed(md, rw, run_queue);
+	blk_end_request_all(rq, error);
+	rq_completed(md, rw, true);
 }
 
 static void dm_unprep_request(struct request *rq)
@@ -857,16 +820,6 @@ void dm_requeue_unmapped_request(struct request *clone)
 	struct request_queue *q = rq->q;
 	unsigned long flags;
 
-	if (unlikely(clone->cmd_flags & REQ_HARDBARRIER)) {
-		/*
-		 * Barrier clones share an original request.
-		 * Leave it to dm_end_request(), which handles this special
-		 * case.
-		 */
-		dm_end_request(clone, DM_ENDIO_REQUEUE);
-		return;
-	}
-
 	dm_unprep_request(rq);
 
 	spin_lock_irqsave(q->queue_lock, flags);
@@ -956,19 +909,6 @@ static void dm_complete_request(struct request *clone, int error)
 	struct dm_rq_target_io *tio = clone->end_io_data;
 	struct request *rq = tio->orig;
 
-	if (unlikely(clone->cmd_flags & REQ_HARDBARRIER)) {
-		/*
-		 * Barrier clones share an original request.  So can't use
-		 * softirq_done with the original.
-		 * Pass the clone to dm_done() directly in this special case.
-		 * It is safe (even if clone->q->queue_lock is held here)
-		 * because there is no I/O dispatching during the completion
-		 * of barrier clone.
-		 */
-		dm_done(clone, error, true);
-		return;
-	}
-
 	tio->error = error;
 	rq->completion_data = clone;
 	blk_complete_request(rq);
@@ -985,17 +925,6 @@ void dm_kill_unmapped_request(struct request *clone, int error)
 	struct dm_rq_target_io *tio = clone->end_io_data;
 	struct request *rq = tio->orig;
 
-	if (unlikely(clone->cmd_flags & REQ_HARDBARRIER)) {
-		/*
-		 * Barrier clones share an original request.
-		 * Leave it to dm_end_request(), which handles this special
-		 * case.
-		 */
-		BUG_ON(error > 0);
-		dm_end_request(clone, error);
-		return;
-	}
-
 	rq->cmd_flags |= REQ_FAILED;
 	dm_complete_request(clone, error);
 }
@@ -1536,14 +1465,6 @@ static int dm_request(struct request_queue *q, struct bio *bio)
 	return _dm_request(q, bio);
 }
 
-static bool dm_rq_is_flush_request(struct request *rq)
-{
-	if (rq->cmd_flags & REQ_FLUSH)
-		return true;
-	else
-		return false;
-}
-
 void dm_dispatch_request(struct request *rq)
 {
 	int r;
@@ -1591,22 +1512,15 @@ static int setup_clone(struct request *clone, struct request *rq,
 {
 	int r;
 
-	if (dm_rq_is_flush_request(rq)) {
-		blk_rq_init(NULL, clone);
-		clone->cmd_type = REQ_TYPE_FS;
-		clone->cmd_flags |= (REQ_HARDBARRIER | WRITE);
-	} else {
-		r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC,
-				      dm_rq_bio_constructor, tio);
-		if (r)
-			return r;
-
-		clone->cmd = rq->cmd;
-		clone->cmd_len = rq->cmd_len;
-		clone->sense = rq->sense;
-		clone->buffer = rq->buffer;
-	}
+	r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC,
+			      dm_rq_bio_constructor, tio);
+	if (r)
+		return r;
 
+	clone->cmd = rq->cmd;
+	clone->cmd_len = rq->cmd_len;
+	clone->sense = rq->sense;
+	clone->buffer = rq->buffer;
 	clone->end_io = end_clone_request;
 	clone->end_io_data = tio;
 
@@ -1647,9 +1561,6 @@ static int dm_prep_fn(struct request_queue *q, struct request *rq)
 	struct mapped_device *md = q->queuedata;
 	struct request *clone;
 
-	if (unlikely(dm_rq_is_flush_request(rq)))
-		return BLKPREP_OK;
-
 	if (unlikely(rq->special)) {
 		DMWARN("Already has something in rq->special.");
 		return BLKPREP_KILL;
@@ -1726,6 +1637,7 @@ static void dm_request_fn(struct request_queue *q)
 	struct dm_table *map = dm_get_live_table(md);
 	struct dm_target *ti;
 	struct request *rq, *clone;
+	sector_t pos;
 
 	/*
 	 * For suspend, check blk_queue_stopped() and increment
@@ -1738,15 +1650,14 @@ static void dm_request_fn(struct request_queue *q)
 		if (!rq)
 			goto plug_and_out;
 
-		if (unlikely(dm_rq_is_flush_request(rq))) {
-			BUG_ON(md->flush_request);
-			md->flush_request = rq;
-			blk_start_request(rq);
-			queue_work(md->wq, &md->barrier_work);
-			goto out;
-		}
+		/* always use block 0 to find the target for flushes for now */
+		pos = 0;
+		if (!(rq->cmd_flags & REQ_FLUSH))
+			pos = blk_rq_pos(rq);
+
+		ti = dm_table_find_target(map, pos);
+		BUG_ON(!dm_target_is_valid(ti));
 
-		ti = dm_table_find_target(map, blk_rq_pos(rq));
 		if (ti->type->busy && ti->type->busy(ti))
 			goto plug_and_out;
 
@@ -1917,7 +1828,6 @@ out:
 static const struct block_device_operations dm_blk_dops;
 
 static void dm_wq_work(struct work_struct *work);
-static void dm_rq_barrier_work(struct work_struct *work);
 
 static void dm_init_md_queue(struct mapped_device *md)
 {
@@ -1972,7 +1882,6 @@ static struct mapped_device *alloc_dev(int minor)
 	mutex_init(&md->suspend_lock);
 	mutex_init(&md->type_lock);
 	spin_lock_init(&md->deferred_lock);
-	spin_lock_init(&md->barrier_error_lock);
 	rwlock_init(&md->map_lock);
 	atomic_set(&md->holders, 1);
 	atomic_set(&md->open_count, 0);
@@ -1995,7 +1904,6 @@ static struct mapped_device *alloc_dev(int minor)
 	atomic_set(&md->pending[1], 0);
 	init_waitqueue_head(&md->wait);
 	INIT_WORK(&md->work, dm_wq_work);
-	INIT_WORK(&md->barrier_work, dm_rq_barrier_work);
 	init_waitqueue_head(&md->eventq);
 
 	md->disk->major = _major;
@@ -2245,8 +2153,6 @@ static int dm_init_request_based_queue(struct mapped_device *md)
 	blk_queue_softirq_done(md->queue, dm_softirq_done);
 	blk_queue_prep_rq(md->queue, dm_prep_fn);
 	blk_queue_lld_busy(md->queue, dm_lld_busy);
-	/* no flush support for request based dm yet */
-	blk_queue_flush(md->queue, 0);
 
 	elv_register_queue(md->queue);
 
@@ -2483,73 +2389,6 @@ static void dm_queue_flush(struct mapped_device *md)
 	queue_work(md->wq, &md->work);
 }
 
-static void dm_rq_set_target_request_nr(struct request *clone, unsigned request_nr)
-{
-	struct dm_rq_target_io *tio = clone->end_io_data;
-
-	tio->info.target_request_nr = request_nr;
-}
-
-/* Issue barrier requests to targets and wait for their completion. */
-static int dm_rq_barrier(struct mapped_device *md)
-{
-	int i, j;
-	struct dm_table *map = dm_get_live_table(md);
-	unsigned num_targets = dm_table_get_num_targets(map);
-	struct dm_target *ti;
-	struct request *clone;
-
-	md->barrier_error = 0;
-
-	for (i = 0; i < num_targets; i++) {
-		ti = dm_table_get_target(map, i);
-		for (j = 0; j < ti->num_flush_requests; j++) {
-			clone = clone_rq(md->flush_request, md, GFP_NOIO);
-			dm_rq_set_target_request_nr(clone, j);
-			atomic_inc(&md->pending[rq_data_dir(clone)]);
-			map_request(ti, clone, md);
-		}
-	}
-
-	dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
-	dm_table_put(map);
-
-	return md->barrier_error;
-}
-
-static void dm_rq_barrier_work(struct work_struct *work)
-{
-	int error;
-	struct mapped_device *md = container_of(work, struct mapped_device,
-						barrier_work);
-	struct request_queue *q = md->queue;
-	struct request *rq;
-	unsigned long flags;
-
-	/*
-	 * Hold the md reference here and leave it at the last part so that
-	 * the md can't be deleted by device opener when the barrier request
-	 * completes.
-	 */
-	dm_get(md);
-
-	error = dm_rq_barrier(md);
-
-	rq = md->flush_request;
-	md->flush_request = NULL;
-
-	if (error == DM_ENDIO_REQUEUE) {
-		spin_lock_irqsave(q->queue_lock, flags);
-		blk_requeue_request(q, rq);
-		spin_unlock_irqrestore(q->queue_lock, flags);
-	} else
-		blk_end_request_all(rq, error);
-
-	blk_run_queue(q);
-
-	dm_put(md);
-}
-
 /*
  * Swap in a new table, returning the old one for the caller to destroy.
  */
@@ -2686,9 +2525,8 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 	up_write(&md->io_lock);
 
 	/*
-	 * Request-based dm uses md->wq for barrier (dm_rq_barrier_work) which
-	 * can be kicked until md->queue is stopped.  So stop md->queue before
-	 * flushing md->wq.
+	 * Stop md->queue before flushing md->wq in case request-based
+	 * dm defers requests to md->wq from md->queue.
 	 */
 	if (dm_request_based(md))
 		stop_queue(md->queue);
-- 
cgit v1.2.3


From 6a8736d10cb413be95ea443ba40f25c93f4ef9b2 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 8 Sep 2010 18:07:00 +0200
Subject: dm: relax ordering of bio-based flush implementation

Unlike REQ_HARDBARRIER, REQ_FLUSH/FUA doesn't mandate any ordering
against other bio's.  This patch relaxes ordering around flushes.

* A flush bio is no longer deferred to workqueue directly.  It's
  processed like other bio's but __split_and_process_bio() uses
  md->flush_bio as the clone source.  md->flush_bio is initialized to
  empty flush during md initialization and shared for all flushes.

* As a flush bio now travels through the same execution path as other
  bio's, there's no need for dedicated error handling path either.  It
  can use the same error handling path in dec_pending().  Dedicated
  error handling removed along with md->flush_error.

* When dec_pending() detects that a flush has completed, it checks
  whether the original bio has data.  If so, the bio is queued to the
  deferred list w/ REQ_FLUSH cleared; otherwise, it's completed.

* As flush sequencing is handled in the usual issue/completion path,
  dm_wq_work() no longer needs to handle flushes differently.  Now its
  only responsibility is re-issuing deferred bio's the same way as
  _dm_request() would.  REQ_FLUSH handling logic including
  process_flush() is dropped.

* There's no reason for queue_io() and dm_wq_work() write lock
  dm->io_lock.  queue_io() now only uses md->deferred_lock and
  dm_wq_work() read locks dm->io_lock.

* bio's no longer need to be queued on the deferred list while a flush
  is in progress making DMF_QUEUE_IO_TO_THREAD unncessary.  Drop it.

This avoids stalling the device during flushes and simplifies the
implementation.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 drivers/md/dm.c | 157 ++++++++++++++++----------------------------------------
 1 file changed, 45 insertions(+), 112 deletions(-)

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 65114e4d9f65..2011704b8ba0 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -110,7 +110,6 @@ EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo);
 #define DMF_FREEING 3
 #define DMF_DELETING 4
 #define DMF_NOFLUSH_SUSPENDING 5
-#define DMF_QUEUE_IO_TO_THREAD 6
 
 /*
  * Work processed by per-device workqueue.
@@ -143,11 +142,6 @@ struct mapped_device {
 	struct bio_list deferred;
 	spinlock_t deferred_lock;
 
-	/*
-	 * An error from the flush request currently being processed.
-	 */
-	int flush_error;
-
 	/*
 	 * Processing queue (flush)
 	 */
@@ -518,16 +512,10 @@ static void end_io_acct(struct dm_io *io)
  */
 static void queue_io(struct mapped_device *md, struct bio *bio)
 {
-	down_write(&md->io_lock);
-
 	spin_lock_irq(&md->deferred_lock);
 	bio_list_add(&md->deferred, bio);
 	spin_unlock_irq(&md->deferred_lock);
-
-	if (!test_and_set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags))
-		queue_work(md->wq, &md->work);
-
-	up_write(&md->io_lock);
+	queue_work(md->wq, &md->work);
 }
 
 /*
@@ -615,11 +603,9 @@ static void dec_pending(struct dm_io *io, int error)
 			 * Target requested pushing back the I/O.
 			 */
 			spin_lock_irqsave(&md->deferred_lock, flags);
-			if (__noflush_suspending(md)) {
-				if (!(io->bio->bi_rw & REQ_FLUSH))
-					bio_list_add_head(&md->deferred,
-							  io->bio);
-			} else
+			if (__noflush_suspending(md))
+				bio_list_add_head(&md->deferred, io->bio);
+			else
 				/* noflush suspend was interrupted. */
 				io->error = -EIO;
 			spin_unlock_irqrestore(&md->deferred_lock, flags);
@@ -627,26 +613,22 @@ static void dec_pending(struct dm_io *io, int error)
 
 		io_error = io->error;
 		bio = io->bio;
+		end_io_acct(io);
+		free_io(md, io);
+
+		if (io_error == DM_ENDIO_REQUEUE)
+			return;
 
-		if (bio->bi_rw & REQ_FLUSH) {
+		if (!(bio->bi_rw & REQ_FLUSH) || !bio->bi_size) {
+			trace_block_bio_complete(md->queue, bio);
+			bio_endio(bio, io_error);
+		} else {
 			/*
-			 * There can be just one flush request so we use
-			 * a per-device variable for error reporting.
-			 * Note that you can't touch the bio after end_io_acct
+			 * Preflush done for flush with data, reissue
+			 * without REQ_FLUSH.
 			 */
-			if (!md->flush_error)
-				md->flush_error = io_error;
-			end_io_acct(io);
-			free_io(md, io);
-		} else {
-			end_io_acct(io);
-			free_io(md, io);
-
-			if (io_error != DM_ENDIO_REQUEUE) {
-				trace_block_bio_complete(md->queue, bio);
-
-				bio_endio(bio, io_error);
-			}
+			bio->bi_rw &= ~REQ_FLUSH;
+			queue_io(md, bio);
 		}
 	}
 }
@@ -1298,21 +1280,17 @@ static int __clone_and_map(struct clone_info *ci)
  */
 static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
 {
+	bool is_flush = bio->bi_rw & REQ_FLUSH;
 	struct clone_info ci;
 	int error = 0;
 
 	ci.map = dm_get_live_table(md);
 	if (unlikely(!ci.map)) {
-		if (!(bio->bi_rw & REQ_FLUSH))
-			bio_io_error(bio);
-		else
-			if (!md->flush_error)
-				md->flush_error = -EIO;
+		bio_io_error(bio);
 		return;
 	}
 
 	ci.md = md;
-	ci.bio = bio;
 	ci.io = alloc_io(md);
 	ci.io->error = 0;
 	atomic_set(&ci.io->io_count, 1);
@@ -1320,18 +1298,19 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
 	ci.io->md = md;
 	spin_lock_init(&ci.io->endio_lock);
 	ci.sector = bio->bi_sector;
-	if (!(bio->bi_rw & REQ_FLUSH))
+	ci.idx = bio->bi_idx;
+
+	if (!is_flush) {
+		ci.bio = bio;
 		ci.sector_count = bio_sectors(bio);
-	else {
-		/* all FLUSH bio's reaching here should be empty */
-		WARN_ON_ONCE(bio_has_data(bio));
+	} else {
+		ci.bio = &ci.md->flush_bio;
 		ci.sector_count = 1;
 	}
-	ci.idx = bio->bi_idx;
 
 	start_io_acct(ci.io);
 	while (ci.sector_count && !error) {
-		if (!(bio->bi_rw & REQ_FLUSH))
+		if (!is_flush)
 			error = __clone_and_map(&ci);
 		else
 			error = __clone_and_map_flush(&ci);
@@ -1419,22 +1398,14 @@ static int _dm_request(struct request_queue *q, struct bio *bio)
 	part_stat_add(cpu, &dm_disk(md)->part0, sectors[rw], bio_sectors(bio));
 	part_stat_unlock();
 
-	/*
-	 * If we're suspended or the thread is processing flushes
-	 * we have to queue this io for later.
-	 */
-	if (unlikely(test_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) ||
-	    (bio->bi_rw & REQ_FLUSH)) {
+	/* if we're suspended, we have to queue this io for later */
+	if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
 		up_read(&md->io_lock);
 
-		if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) &&
-		    bio_rw(bio) == READA) {
+		if (bio_rw(bio) != READA)
+			queue_io(md, bio);
+		else
 			bio_io_error(bio);
-			return 0;
-		}
-
-		queue_io(md, bio);
-
 		return 0;
 	}
 
@@ -1923,6 +1894,10 @@ static struct mapped_device *alloc_dev(int minor)
 	if (!md->bdev)
 		goto bad_bdev;
 
+	bio_init(&md->flush_bio);
+	md->flush_bio.bi_bdev = md->bdev;
+	md->flush_bio.bi_rw = WRITE_FLUSH;
+
 	/* Populate the mapping, nobody knows we exist yet */
 	spin_lock(&_minor_lock);
 	old_md = idr_replace(&_minor_idr, md, minor);
@@ -2313,37 +2288,6 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
 	return r;
 }
 
-static void process_flush(struct mapped_device *md, struct bio *bio)
-{
-	md->flush_error = 0;
-
-	/* handle REQ_FLUSH */
-	dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
-
-	bio_init(&md->flush_bio);
-	md->flush_bio.bi_bdev = md->bdev;
-	md->flush_bio.bi_rw = WRITE_FLUSH;
-	__split_and_process_bio(md, &md->flush_bio);
-
-	dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
-
-	/* if it's an empty flush or the preflush failed, we're done */
-	if (!bio_has_data(bio) || md->flush_error) {
-		if (md->flush_error != DM_ENDIO_REQUEUE)
-			bio_endio(bio, md->flush_error);
-		else {
-			spin_lock_irq(&md->deferred_lock);
-			bio_list_add_head(&md->deferred, bio);
-			spin_unlock_irq(&md->deferred_lock);
-		}
-		return;
-	}
-
-	/* issue data + REQ_FUA */
-	bio->bi_rw &= ~REQ_FLUSH;
-	__split_and_process_bio(md, bio);
-}
-
 /*
  * Process the deferred bios
  */
@@ -2353,33 +2297,27 @@ static void dm_wq_work(struct work_struct *work)
 						work);
 	struct bio *c;
 
-	down_write(&md->io_lock);
+	down_read(&md->io_lock);
 
 	while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
 		spin_lock_irq(&md->deferred_lock);
 		c = bio_list_pop(&md->deferred);
 		spin_unlock_irq(&md->deferred_lock);
 
-		if (!c) {
-			clear_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags);
+		if (!c)
 			break;
-		}
 
-		up_write(&md->io_lock);
+		up_read(&md->io_lock);
 
 		if (dm_request_based(md))
 			generic_make_request(c);
-		else {
-			if (c->bi_rw & REQ_FLUSH)
-				process_flush(md, c);
-			else
-				__split_and_process_bio(md, c);
-		}
+		else
+			__split_and_process_bio(md, c);
 
-		down_write(&md->io_lock);
+		down_read(&md->io_lock);
 	}
 
-	up_write(&md->io_lock);
+	up_read(&md->io_lock);
 }
 
 static void dm_queue_flush(struct mapped_device *md)
@@ -2511,17 +2449,12 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 	 *
 	 * To get all processes out of __split_and_process_bio in dm_request,
 	 * we take the write lock. To prevent any process from reentering
-	 * __split_and_process_bio from dm_request, we set
-	 * DMF_QUEUE_IO_TO_THREAD.
-	 *
-	 * To quiesce the thread (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND
-	 * and call flush_workqueue(md->wq). flush_workqueue will wait until
-	 * dm_wq_work exits and DMF_BLOCK_IO_FOR_SUSPEND will prevent any
-	 * further calls to __split_and_process_bio from dm_wq_work.
+	 * __split_and_process_bio from dm_request and quiesce the thread
+	 * (dm_wq_work), we set BMF_BLOCK_IO_FOR_SUSPEND and call
+	 * flush_workqueue(md->wq).
 	 */
 	down_write(&md->io_lock);
 	set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
-	set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags);
 	up_write(&md->io_lock);
 
 	/*
-- 
cgit v1.2.3


From 05447420f99c1c44063c7f00054667c022cc1365 Mon Sep 17 00:00:00 2001
From: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Date: Wed, 8 Sep 2010 18:07:01 +0200
Subject: dm: fix locking context in queue_io()

Now queue_io() is called from dec_pending(), which may be called with
interrupts disabled, so queue_io() must not enable interrupts
unconditionally and must save/restore the current interrupts status.

Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 drivers/md/dm.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 2011704b8ba0..cd2f7e77b625 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -512,9 +512,11 @@ static void end_io_acct(struct dm_io *io)
  */
 static void queue_io(struct mapped_device *md, struct bio *bio)
 {
-	spin_lock_irq(&md->deferred_lock);
+	unsigned long flags;
+
+	spin_lock_irqsave(&md->deferred_lock, flags);
 	bio_list_add(&md->deferred, bio);
-	spin_unlock_irq(&md->deferred_lock);
+	spin_unlock_irqrestore(&md->deferred_lock, flags);
 	queue_work(md->wq, &md->work);
 }
 
-- 
cgit v1.2.3


From b372d360df6deaf79a58a02fa0cc0d7e0aa3e92f Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Wed, 8 Sep 2010 18:07:01 +0200
Subject: dm: convey that all flushes are processed as empty

Rename __clone_and_map_flush to __clone_and_map_empty_flush for added
clarity.

Simplify logic associated with REQ_FLUSH conditionals.

Introduce a BUG_ON() and add a few more helpful comments to the code
so that it is clear that all flushes are empty.

Cleanup __split_and_process_bio() so that an empty flush isn't processed
by a 'sector_count' focused while loop.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 drivers/md/dm.c | 34 +++++++++++++++-------------------
 1 file changed, 15 insertions(+), 19 deletions(-)

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index cd2f7e77b625..f934e9878436 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -621,16 +621,17 @@ static void dec_pending(struct dm_io *io, int error)
 		if (io_error == DM_ENDIO_REQUEUE)
 			return;
 
-		if (!(bio->bi_rw & REQ_FLUSH) || !bio->bi_size) {
-			trace_block_bio_complete(md->queue, bio);
-			bio_endio(bio, io_error);
-		} else {
+		if ((bio->bi_rw & REQ_FLUSH) && bio->bi_size) {
 			/*
 			 * Preflush done for flush with data, reissue
 			 * without REQ_FLUSH.
 			 */
 			bio->bi_rw &= ~REQ_FLUSH;
 			queue_io(md, bio);
+		} else {
+			/* done with normal IO or empty flush */
+			trace_block_bio_complete(md->queue, bio);
+			bio_endio(bio, io_error);
 		}
 	}
 }
@@ -1132,16 +1133,15 @@ static void __issue_target_requests(struct clone_info *ci, struct dm_target *ti,
 		__issue_target_request(ci, ti, request_nr, len);
 }
 
-static int __clone_and_map_flush(struct clone_info *ci)
+static int __clone_and_map_empty_flush(struct clone_info *ci)
 {
 	unsigned target_nr = 0;
 	struct dm_target *ti;
 
+	BUG_ON(bio_has_data(ci->bio));
 	while ((ti = dm_table_get_target(ci->map, target_nr++)))
 		__issue_target_requests(ci, ti, ti->num_flush_requests, 0);
 
-	ci->sector_count = 0;
-
 	return 0;
 }
 
@@ -1282,7 +1282,6 @@ static int __clone_and_map(struct clone_info *ci)
  */
 static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
 {
-	bool is_flush = bio->bi_rw & REQ_FLUSH;
 	struct clone_info ci;
 	int error = 0;
 
@@ -1302,20 +1301,17 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
 	ci.sector = bio->bi_sector;
 	ci.idx = bio->bi_idx;
 
-	if (!is_flush) {
+	start_io_acct(ci.io);
+	if (bio->bi_rw & REQ_FLUSH) {
+		ci.bio = &ci.md->flush_bio;
+		ci.sector_count = 0;
+		error = __clone_and_map_empty_flush(&ci);
+		/* dec_pending submits any data associated with flush */
+	} else {
 		ci.bio = bio;
 		ci.sector_count = bio_sectors(bio);
-	} else {
-		ci.bio = &ci.md->flush_bio;
-		ci.sector_count = 1;
-	}
-
-	start_io_acct(ci.io);
-	while (ci.sector_count && !error) {
-		if (!is_flush)
+		while (ci.sector_count && !error)
 			error = __clone_and_map(&ci);
-		else
-			error = __clone_and_map_flush(&ci);
 	}
 
 	/* drop the extra reference count */
-- 
cgit v1.2.3


From 2cf6d26a354ab6362e301b5a323832b02867df47 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Wed, 18 Aug 2010 05:29:10 -0400
Subject: block: pass gfp_mask and flags to sb_issue_discard

We'll need to get rid of the BLKDEV_IFL_BARRIER flag, and to facilitate
that and to make the interface less confusing pass all flags explicitly.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/ext4/mballoc.c      |  3 ++-
 fs/fat/fatent.c        |  4 +++-
 include/linux/blkdev.h | 11 +++++------
 3 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 4b4ad4b7ce57..df44b345f662 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2566,7 +2566,8 @@ static inline void ext4_issue_discard(struct super_block *sb,
 	discard_block = block + ext4_group_first_block_no(sb, block_group);
 	trace_ext4_discard_blocks(sb,
 			(unsigned long long) discard_block, count);
-	ret = sb_issue_discard(sb, discard_block, count);
+	ret = sb_issue_discard(sb, discard_block, count, GFP_NOFS,
+			       BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
 	if (ret == EOPNOTSUPP) {
 		ext4_warning(sb, "discard not supported, disabling");
 		clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD);
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index 81184d3b75a3..3a56a82f5658 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -577,7 +577,9 @@ int fat_free_clusters(struct inode *inode, int cluster)
 
 				sb_issue_discard(sb,
 					fat_clus_to_blknr(sbi, first_cl),
-					nr_clus * sbi->sec_per_clus);
+					nr_clus * sbi->sec_per_clus,
+					GFP_NOFS,
+					BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
 
 				first_cl = cluster;
 			}
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 8ef705f800ab..6b305eb4a343 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -881,13 +881,12 @@ extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 		sector_t nr_sects, gfp_t gfp_mask, unsigned long flags);
 extern int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
 			sector_t nr_sects, gfp_t gfp_mask, unsigned long flags);
-static inline int sb_issue_discard(struct super_block *sb,
-				   sector_t block, sector_t nr_blocks)
+static inline int sb_issue_discard(struct super_block *sb, sector_t block,
+		sector_t nr_blocks, gfp_t gfp_mask, unsigned long flags)
 {
-	block <<= (sb->s_blocksize_bits - 9);
-	nr_blocks <<= (sb->s_blocksize_bits - 9);
-	return blkdev_issue_discard(sb->s_bdev, block, nr_blocks, GFP_NOFS,
-				   BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
+	return blkdev_issue_discard(sb->s_bdev, block << (sb->s_blocksize_bits - 9),
+				    nr_blocks << (sb->s_blocksize_bits - 9),
+				    gfp_mask, flags);
 }
 
 extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm);
-- 
cgit v1.2.3


From 80f6c29d8a758650d5c9eac05074b4b3e8c266df Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Wed, 18 Aug 2010 05:29:11 -0400
Subject: xfs: replace barriers with explicit flush / FUA usage

Switch to the WRITE_FLUSH_FUA flag for log writes and remove the EOPNOTSUPP
detection for barriers.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/xfs/linux-2.6/xfs_buf.c   | 16 ++--------------
 fs/xfs/linux-2.6/xfs_buf.h   | 11 +----------
 fs/xfs/linux-2.6/xfs_trace.h |  1 -
 fs/xfs/xfs_log.c             | 13 -------------
 4 files changed, 3 insertions(+), 38 deletions(-)

diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index ea79072f5210..b93ea3342281 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -929,19 +929,7 @@ xfs_buf_iodone_work(
 	xfs_buf_t		*bp =
 		container_of(work, xfs_buf_t, b_iodone_work);
 
-	/*
-	 * We can get an EOPNOTSUPP to ordered writes.  Here we clear the
-	 * ordered flag and reissue them.  Because we can't tell the higher
-	 * layers directly that they should not issue ordered I/O anymore, they
-	 * need to check if the _XFS_BARRIER_FAILED flag was set during I/O completion.
-	 */
-	if ((bp->b_error == EOPNOTSUPP) &&
-	    (bp->b_flags & (XBF_ORDERED|XBF_ASYNC)) == (XBF_ORDERED|XBF_ASYNC)) {
-		trace_xfs_buf_ordered_retry(bp, _RET_IP_);
-		bp->b_flags &= ~XBF_ORDERED;
-		bp->b_flags |= _XFS_BARRIER_FAILED;
-		xfs_buf_iorequest(bp);
-	} else if (bp->b_iodone)
+	if (bp->b_iodone)
 		(*(bp->b_iodone))(bp);
 	else if (bp->b_flags & XBF_ASYNC)
 		xfs_buf_relse(bp);
@@ -1200,7 +1188,7 @@ _xfs_buf_ioapply(
 
 	if (bp->b_flags & XBF_ORDERED) {
 		ASSERT(!(bp->b_flags & XBF_READ));
-		rw = WRITE_BARRIER;
+		rw = WRITE_FLUSH_FUA;
 	} else if (bp->b_flags & XBF_LOG_BUFFER) {
 		ASSERT(!(bp->b_flags & XBF_READ_AHEAD));
 		bp->b_flags &= ~_XBF_RUN_QUEUES;
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index d072e5ff923b..d533d64e2c3e 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -86,14 +86,6 @@ typedef enum {
  */
 #define _XBF_PAGE_LOCKED	(1 << 22)
 
-/*
- * If we try a barrier write, but it fails we have to communicate
- * this to the upper layers.  Unfortunately b_error gets overwritten
- * when the buffer is re-issued so we have to add another flag to
- * keep this information.
- */
-#define _XFS_BARRIER_FAILED	(1 << 23)
-
 typedef unsigned int xfs_buf_flags_t;
 
 #define XFS_BUF_FLAGS \
@@ -114,8 +106,7 @@ typedef unsigned int xfs_buf_flags_t;
 	{ _XBF_PAGES,		"PAGES" }, \
 	{ _XBF_RUN_QUEUES,	"RUN_QUEUES" }, \
 	{ _XBF_DELWRI_Q,	"DELWRI_Q" }, \
-	{ _XBF_PAGE_LOCKED,	"PAGE_LOCKED" }, \
-	{ _XFS_BARRIER_FAILED,	"BARRIER_FAILED" }
+	{ _XBF_PAGE_LOCKED,	"PAGE_LOCKED" }
 
 
 typedef enum {
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index be5dffd282a1..8fe311a456e2 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -325,7 +325,6 @@ DEFINE_BUF_EVENT(xfs_buf_lock);
 DEFINE_BUF_EVENT(xfs_buf_lock_done);
 DEFINE_BUF_EVENT(xfs_buf_cond_lock);
 DEFINE_BUF_EVENT(xfs_buf_unlock);
-DEFINE_BUF_EVENT(xfs_buf_ordered_retry);
 DEFINE_BUF_EVENT(xfs_buf_iowait);
 DEFINE_BUF_EVENT(xfs_buf_iowait_done);
 DEFINE_BUF_EVENT(xfs_buf_delwri_queue);
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 925d572bf0f4..430a8fc02c1f 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -916,19 +916,6 @@ xlog_iodone(xfs_buf_t *bp)
 	aborted = 0;
 	l = iclog->ic_log;
 
-	/*
-	 * If the _XFS_BARRIER_FAILED flag was set by a lower
-	 * layer, it means the underlying device no longer supports
-	 * barrier I/O. Warn loudly and turn off barriers.
-	 */
-	if (bp->b_flags & _XFS_BARRIER_FAILED) {
-		bp->b_flags &= ~_XFS_BARRIER_FAILED;
-		l->l_mp->m_flags &= ~XFS_MOUNT_BARRIER;
-		xfs_fs_cmn_err(CE_WARN, l->l_mp,
-				"xlog_iodone: Barriers are no longer supported"
-				" by device. Disabling barriers\n");
-	}
-
 	/*
 	 * Race to shutdown the filesystem if we see an error.
 	 */
-- 
cgit v1.2.3


From c3b9a62c8f932f32a733d6b628f61f3f28345727 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Wed, 18 Aug 2010 05:29:12 -0400
Subject: btrfs: replace barriers with explicit flush / FUA usage

Switch to the WRITE_FLUSH_FUA flag for log writes, remove the EOPNOTSUPP
detection for barriers and stop setting the barrier flag for discards.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Chris Mason <chris.mason@oracle.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/btrfs/disk-io.c     | 19 ++++---------------
 fs/btrfs/extent-tree.c |  2 +-
 fs/btrfs/volumes.c     |  4 ----
 fs/btrfs/volumes.h     |  1 -
 4 files changed, 5 insertions(+), 21 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 64f10082f048..5e789f4a3ed0 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2063,7 +2063,7 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
 	if (uptodate) {
 		set_buffer_uptodate(bh);
 	} else {
-		if (!buffer_eopnotsupp(bh) && printk_ratelimit()) {
+		if (printk_ratelimit()) {
 			printk(KERN_WARNING "lost page write due to "
 					"I/O error on %s\n",
 				       bdevname(bh->b_bdev, b));
@@ -2200,21 +2200,10 @@ static int write_dev_supers(struct btrfs_device *device,
 			bh->b_end_io = btrfs_end_buffer_write_sync;
 		}
 
-		if (i == last_barrier && do_barriers && device->barriers) {
-			ret = submit_bh(WRITE_BARRIER, bh);
-			if (ret == -EOPNOTSUPP) {
-				printk("btrfs: disabling barriers on dev %s\n",
-				       device->name);
-				set_buffer_uptodate(bh);
-				device->barriers = 0;
-				/* one reference for submit_bh */
-				get_bh(bh);
-				lock_buffer(bh);
-				ret = submit_bh(WRITE_SYNC, bh);
-			}
-		} else {
+		if (i == last_barrier && do_barriers)
+			ret = submit_bh(WRITE_FLUSH_FUA, bh);
+		else
 			ret = submit_bh(WRITE_SYNC, bh);
-		}
 
 		if (ret)
 			errors++;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 32d094002a57..43dc9ea9aef6 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1696,7 +1696,7 @@ static void btrfs_issue_discard(struct block_device *bdev,
 				u64 start, u64 len)
 {
 	blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL,
-			BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
+			BLKDEV_IFL_WAIT);
 }
 
 static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index dd318ff280b2..e25e46a8b4e2 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -398,7 +398,6 @@ static noinline int device_list_add(const char *path,
 		device->work.func = pending_bios_fn;
 		memcpy(device->uuid, disk_super->dev_item.uuid,
 		       BTRFS_UUID_SIZE);
-		device->barriers = 1;
 		spin_lock_init(&device->io_lock);
 		device->name = kstrdup(path, GFP_NOFS);
 		if (!device->name) {
@@ -462,7 +461,6 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
 		device->devid = orig_dev->devid;
 		device->work.func = pending_bios_fn;
 		memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid));
-		device->barriers = 1;
 		spin_lock_init(&device->io_lock);
 		INIT_LIST_HEAD(&device->dev_list);
 		INIT_LIST_HEAD(&device->dev_alloc_list);
@@ -1489,7 +1487,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	trans = btrfs_start_transaction(root, 0);
 	lock_chunks(root);
 
-	device->barriers = 1;
 	device->writeable = 1;
 	device->work.func = pending_bios_fn;
 	generate_random_uuid(device->uuid);
@@ -3084,7 +3081,6 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
 		return NULL;
 	list_add(&device->dev_list,
 		 &fs_devices->devices);
-	device->barriers = 1;
 	device->dev_root = root->fs_info->dev_root;
 	device->devid = devid;
 	device->work.func = pending_bios_fn;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 31b0fabdd2ea..2b638b6e4eea 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -42,7 +42,6 @@ struct btrfs_device {
 	int running_pending;
 	u64 generation;
 
-	int barriers;
 	int writeable;
 	int in_fs_metadata;
 
-- 
cgit v1.2.3


From f1e4d518c3beddf67f7722f3548eda0ec7006204 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Wed, 18 Aug 2010 05:29:13 -0400
Subject: gfs2: replace barriers with explicit flush / FUA usage

Switch to the WRITE_FLUSH_FUA flag for log writes, remove the EOPNOTSUPP
detection for barriers and stop setting the barrier flag for discards.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Steven Whitehouse <swhiteho@redhat.com>
Acked-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/gfs2/log.c  | 19 +++++--------------
 fs/gfs2/rgrp.c |  5 ++---
 2 files changed, 7 insertions(+), 17 deletions(-)

diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index cde1248a6225..9c65170e932e 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -592,22 +592,13 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
 	lh->lh_hash = cpu_to_be32(hash);
 
 	bh->b_end_io = end_buffer_write_sync;
-	if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags))
-		goto skip_barrier;
 	get_bh(bh);
-	submit_bh(WRITE_BARRIER | REQ_META, bh);
-	wait_on_buffer(bh);
-	if (buffer_eopnotsupp(bh)) {
-		clear_buffer_eopnotsupp(bh);
-		set_buffer_uptodate(bh);
-		fs_info(sdp, "barrier sync failed - disabling barriers\n");
-		set_bit(SDF_NOBARRIERS, &sdp->sd_flags);
-		lock_buffer(bh);
-skip_barrier:
-		get_bh(bh);
+	if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags))
 		submit_bh(WRITE_SYNC | REQ_META, bh);
-		wait_on_buffer(bh);
-	}
+	else
+		submit_bh(WRITE_FLUSH_FUA | REQ_META, bh);
+	wait_on_buffer(bh);
+
 	if (!buffer_uptodate(bh))
 		gfs2_io_error_bh(sdp, bh);
 	brelse(bh);
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 171a744f8e45..379316472918 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -854,8 +854,7 @@ static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
 				if ((start + nr_sects) != blk) {
 					rv = blkdev_issue_discard(bdev, start,
 							    nr_sects, GFP_NOFS,
-							    BLKDEV_IFL_WAIT |
-							    BLKDEV_IFL_BARRIER);
+							    BLKDEV_IFL_WAIT);
 					if (rv)
 						goto fail;
 					nr_sects = 0;
@@ -870,7 +869,7 @@ start_new_extent:
 	}
 	if (nr_sects) {
 		rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS,
-					 BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
+					 BLKDEV_IFL_WAIT);
 		if (rv)
 			goto fail;
 	}
-- 
cgit v1.2.3


From 7cd33ad23ec41d685902159be8b2c6552fab8bd0 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Wed, 18 Aug 2010 05:29:14 -0400
Subject: reiserfs: replace barriers with explicit flush / FUA usage

Switch to the WRITE_FLUSH_FUA flag for log writes and remove the EOPNOTSUPP
detection for barriers.  Note that reiserfs had a fairly different code
path for barriers before as it wa the only filesystem actually making use
of them.  The new code always uses the old non-barrier codepath and just
sets the WRITE_FLUSH_FUA explicitly for the journal commits.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Jan Kara <jack@suse.cz>
Acked-by: Chris Mason <chris.mason@oracle.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/reiserfs/journal.c | 106 ++++++++++----------------------------------------
 1 file changed, 20 insertions(+), 86 deletions(-)

diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 812e2c05aa29..076c8b194682 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -138,13 +138,6 @@ static int reiserfs_clean_and_file_buffer(struct buffer_head *bh)
 	return 0;
 }
 
-static void disable_barrier(struct super_block *s)
-{
-	REISERFS_SB(s)->s_mount_opt &= ~(1 << REISERFS_BARRIER_FLUSH);
-	printk("reiserfs: disabling flush barriers on %s\n",
-	       reiserfs_bdevname(s));
-}
-
 static struct reiserfs_bitmap_node *allocate_bitmap_node(struct super_block
 							 *sb)
 {
@@ -677,30 +670,6 @@ static void submit_ordered_buffer(struct buffer_head *bh)
 	submit_bh(WRITE, bh);
 }
 
-static int submit_barrier_buffer(struct buffer_head *bh)
-{
-	get_bh(bh);
-	bh->b_end_io = reiserfs_end_ordered_io;
-	clear_buffer_dirty(bh);
-	if (!buffer_uptodate(bh))
-		BUG();
-	return submit_bh(WRITE_BARRIER, bh);
-}
-
-static void check_barrier_completion(struct super_block *s,
-				     struct buffer_head *bh)
-{
-	if (buffer_eopnotsupp(bh)) {
-		clear_buffer_eopnotsupp(bh);
-		disable_barrier(s);
-		set_buffer_uptodate(bh);
-		set_buffer_dirty(bh);
-		reiserfs_write_unlock(s);
-		sync_dirty_buffer(bh);
-		reiserfs_write_lock(s);
-	}
-}
-
 #define CHUNK_SIZE 32
 struct buffer_chunk {
 	struct buffer_head *bh[CHUNK_SIZE];
@@ -1009,7 +978,6 @@ static int flush_commit_list(struct super_block *s,
 	struct buffer_head *tbh = NULL;
 	unsigned int trans_id = jl->j_trans_id;
 	struct reiserfs_journal *journal = SB_JOURNAL(s);
-	int barrier = 0;
 	int retval = 0;
 	int write_len;
 
@@ -1094,24 +1062,6 @@ static int flush_commit_list(struct super_block *s,
 	}
 	atomic_dec(&journal->j_async_throttle);
 
-	/* We're skipping the commit if there's an error */
-	if (retval || reiserfs_is_journal_aborted(journal))
-		barrier = 0;
-
-	/* wait on everything written so far before writing the commit
-	 * if we are in barrier mode, send the commit down now
-	 */
-	barrier = reiserfs_barrier_flush(s);
-	if (barrier) {
-		int ret;
-		lock_buffer(jl->j_commit_bh);
-		ret = submit_barrier_buffer(jl->j_commit_bh);
-		if (ret == -EOPNOTSUPP) {
-			set_buffer_uptodate(jl->j_commit_bh);
-			disable_barrier(s);
-			barrier = 0;
-		}
-	}
 	for (i = 0; i < (jl->j_len + 1); i++) {
 		bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) +
 		    (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s);
@@ -1143,27 +1093,22 @@ static int flush_commit_list(struct super_block *s,
 
 	BUG_ON(atomic_read(&(jl->j_commit_left)) != 1);
 
-	if (!barrier) {
-		/* If there was a write error in the journal - we can't commit
-		 * this transaction - it will be invalid and, if successful,
-		 * will just end up propagating the write error out to
-		 * the file system. */
-		if (likely(!retval && !reiserfs_is_journal_aborted (journal))) {
-			if (buffer_dirty(jl->j_commit_bh))
-				BUG();
-			mark_buffer_dirty(jl->j_commit_bh) ;
-			reiserfs_write_unlock(s);
-			sync_dirty_buffer(jl->j_commit_bh) ;
-			reiserfs_write_lock(s);
-		}
-	} else {
+	/* If there was a write error in the journal - we can't commit
+	 * this transaction - it will be invalid and, if successful,
+	 * will just end up propagating the write error out to
+	 * the file system. */
+	if (likely(!retval && !reiserfs_is_journal_aborted (journal))) {
+		if (buffer_dirty(jl->j_commit_bh))
+			BUG();
+		mark_buffer_dirty(jl->j_commit_bh) ;
 		reiserfs_write_unlock(s);
-		wait_on_buffer(jl->j_commit_bh);
+		if (reiserfs_barrier_flush(s))
+			__sync_dirty_buffer(jl->j_commit_bh, WRITE_FLUSH_FUA);
+		else
+			sync_dirty_buffer(jl->j_commit_bh);
 		reiserfs_write_lock(s);
 	}
 
-	check_barrier_completion(s, jl->j_commit_bh);
-
 	/* If there was a write error in the journal - we can't commit this
 	 * transaction - it will be invalid and, if successful, will just end
 	 * up propagating the write error out to the filesystem. */
@@ -1319,26 +1264,15 @@ static int _update_journal_header_block(struct super_block *sb,
 		jh->j_first_unflushed_offset = cpu_to_le32(offset);
 		jh->j_mount_id = cpu_to_le32(journal->j_mount_id);
 
-		if (reiserfs_barrier_flush(sb)) {
-			int ret;
-			lock_buffer(journal->j_header_bh);
-			ret = submit_barrier_buffer(journal->j_header_bh);
-			if (ret == -EOPNOTSUPP) {
-				set_buffer_uptodate(journal->j_header_bh);
-				disable_barrier(sb);
-				goto sync;
-			}
-			reiserfs_write_unlock(sb);
-			wait_on_buffer(journal->j_header_bh);
-			reiserfs_write_lock(sb);
-			check_barrier_completion(sb, journal->j_header_bh);
-		} else {
-		      sync:
-			set_buffer_dirty(journal->j_header_bh);
-			reiserfs_write_unlock(sb);
+		set_buffer_dirty(journal->j_header_bh);
+		reiserfs_write_unlock(sb);
+
+		if (reiserfs_barrier_flush(sb))
+			__sync_dirty_buffer(journal->j_header_bh, WRITE_FLUSH_FUA);
+		else
 			sync_dirty_buffer(journal->j_header_bh);
-			reiserfs_write_lock(sb);
-		}
+
+		reiserfs_write_lock(sb);
 		if (!buffer_uptodate(journal->j_header_bh)) {
 			reiserfs_warning(sb, "journal-837",
 					 "IO error during journal replay");
-- 
cgit v1.2.3


From f8c131f5b6ffc899a70b30e541f367d47f89691c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Wed, 18 Aug 2010 05:29:15 -0400
Subject: nilfs2: replace barriers with explicit flush / FUA usage

Switch to the WRITE_FLUSH_FUA flag for log writes, remove the EOPNOTSUPP
detection for barriers and stop setting the barrier flag for discards.

tj: nilfs is now fixed to wait for discard completion.  Updated this
    patch accordingly and dropped warning about it.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/nilfs2/super.c     | 10 +---------
 fs/nilfs2/the_nilfs.c |  7 ++-----
 2 files changed, 3 insertions(+), 14 deletions(-)

diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 922263393c76..faa5078ff751 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -178,17 +178,9 @@ static int nilfs_sync_super(struct nilfs_sb_info *sbi, int flag)
 
  retry:
 	set_buffer_dirty(nilfs->ns_sbh[0]);
-
 	if (nilfs_test_opt(sbi, BARRIER)) {
 		err = __sync_dirty_buffer(nilfs->ns_sbh[0],
-					  WRITE_SYNC | WRITE_BARRIER);
-		if (err == -EOPNOTSUPP) {
-			nilfs_warning(sbi->s_super, __func__,
-				      "barrier-based sync failed. "
-				      "disabling barriers\n");
-			nilfs_clear_opt(sbi, BARRIER);
-			goto retry;
-		}
+					  WRITE_SYNC | WRITE_FLUSH_FUA);
 	} else {
 		err = sync_dirty_buffer(nilfs->ns_sbh[0]);
 	}
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 4317f177ea7c..400b2caef4d8 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -774,9 +774,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
 			ret = blkdev_issue_discard(nilfs->ns_bdev,
 						   start * sects_per_block,
 						   nblocks * sects_per_block,
-						   GFP_NOFS,
-						   BLKDEV_IFL_WAIT |
-						   BLKDEV_IFL_BARRIER);
+						   GFP_NOFS, BLKDEV_IFL_WAIT);
 			if (ret < 0)
 				return ret;
 			nblocks = 0;
@@ -786,8 +784,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
 		ret = blkdev_issue_discard(nilfs->ns_bdev,
 					   start * sects_per_block,
 					   nblocks * sects_per_block,
-					   GFP_NOFS,
-					  BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
+					   GFP_NOFS, BLKDEV_IFL_WAIT);
 	return ret;
 }
 
-- 
cgit v1.2.3


From 4524451ef7e88c64a868a8f5a0b49bda73beb2a3 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Wed, 18 Aug 2010 05:29:16 -0400
Subject: jbd: replace barriers with explicit flush / FUA usage

Switch to the WRITE_FLUSH_FUA flag for journal commits and remove the
EOPNOTSUPP detection for barriers.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Jan Kara <jack@suse.cz>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/jbd/commit.c | 30 +++---------------------------
 1 file changed, 3 insertions(+), 27 deletions(-)

diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 95d8c11c929e..484c5e5fa8af 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -137,34 +137,10 @@ static int journal_write_commit_record(journal_t *journal,
 	JBUFFER_TRACE(descriptor, "write commit block");
 	set_buffer_dirty(bh);
 
-	if (journal->j_flags & JFS_BARRIER) {
-		ret = __sync_dirty_buffer(bh, WRITE_SYNC | WRITE_BARRIER);
-
-		/*
-		 * Is it possible for another commit to fail at roughly
-		 * the same time as this one?  If so, we don't want to
-		 * trust the barrier flag in the super, but instead want
-		 * to remember if we sent a barrier request
-		 */
-		if (ret == -EOPNOTSUPP) {
-			char b[BDEVNAME_SIZE];
-
-			printk(KERN_WARNING
-				"JBD: barrier-based sync failed on %s - "
-				"disabling barriers\n",
-				bdevname(journal->j_dev, b));
-			spin_lock(&journal->j_state_lock);
-			journal->j_flags &= ~JFS_BARRIER;
-			spin_unlock(&journal->j_state_lock);
-
-			/* And try again, without the barrier */
-			set_buffer_uptodate(bh);
-			set_buffer_dirty(bh);
-			ret = sync_dirty_buffer(bh);
-		}
-	} else {
+	if (journal->j_flags & JFS_BARRIER)
+		ret = __sync_dirty_buffer(bh, WRITE_SYNC | WRITE_FLUSH_FUA);
+	else
 		ret = sync_dirty_buffer(bh);
-	}
 
 	put_bh(bh);		/* One for getblk() */
 	journal_put_journal_head(descriptor);
-- 
cgit v1.2.3


From f73bee49855fe968e87af18e44f2d17e1e675e74 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 18 Aug 2010 15:56:56 +0200
Subject: jbd2: Modify ASYNC_COMMIT code to not rely on queue draining on
 barrier

Currently JBD2 relies blkdev_issue_flush() draining the queue when ASYNC_COMMIT
feature is set. This property is going away so make JBD2 wait for buffers it
needs on its own before submitting the cache flush.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/jbd2/commit.c | 29 ++++++++++++++++-------------
 1 file changed, 16 insertions(+), 13 deletions(-)

diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 7c068c189d80..d6aeb1f6cfe0 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -701,6 +701,16 @@ start_journal_io:
 		}
 	}
 
+	err = journal_finish_inode_data_buffers(journal, commit_transaction);
+	if (err) {
+		printk(KERN_WARNING
+			"JBD2: Detected IO errors while flushing file data "
+		       "on %s\n", journal->j_devname);
+		if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
+			jbd2_journal_abort(journal, err);
+		err = 0;
+	}
+
 	/* 
 	 * If the journal is not located on the file system device,
 	 * then we must flush the file system device before we issue
@@ -719,19 +729,6 @@ start_journal_io:
 						 &cbh, crc32_sum);
 		if (err)
 			__jbd2_journal_abort_hard(journal);
-		if (journal->j_flags & JBD2_BARRIER)
-			blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL,
-				BLKDEV_IFL_WAIT);
-	}
-
-	err = journal_finish_inode_data_buffers(journal, commit_transaction);
-	if (err) {
-		printk(KERN_WARNING
-			"JBD2: Detected IO errors while flushing file data "
-		       "on %s\n", journal->j_devname);
-		if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
-			jbd2_journal_abort(journal, err);
-		err = 0;
 	}
 
 	/* Lo and behold: we have just managed to send a transaction to
@@ -845,6 +842,12 @@ wait_for_iobuf:
 	}
 	if (!err && !is_journal_aborted(journal))
 		err = journal_wait_on_commit_record(journal, cbh);
+	if (JBD2_HAS_INCOMPAT_FEATURE(journal,
+				      JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) &&
+	    journal->j_flags & JBD2_BARRIER) {
+		blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL,
+				   BLKDEV_IFL_WAIT);
+	}
 
 	if (err)
 		jbd2_journal_abort(journal, err);
-- 
cgit v1.2.3


From 9c35575bbe6b1dd4914a5323c8df8b3159edcc75 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Wed, 18 Aug 2010 05:29:17 -0400
Subject: jbd2: replace barriers with explicit flush / FUA usage

Switch to the WRITE_FLUSH_FUA flag for journal commits and remove the
EOPNOTSUPP detection for barriers.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Jan Kara <jack@suse.cz>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/jbd2/commit.c | 43 ++++---------------------------------------
 1 file changed, 4 insertions(+), 39 deletions(-)

diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index d6aeb1f6cfe0..f204e27f44d1 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -134,25 +134,11 @@ static int journal_submit_commit_record(journal_t *journal,
 
 	if (journal->j_flags & JBD2_BARRIER &&
 	    !JBD2_HAS_INCOMPAT_FEATURE(journal,
-				       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
-		ret = submit_bh(WRITE_SYNC_PLUG | WRITE_BARRIER, bh);
-		if (ret == -EOPNOTSUPP) {
-			printk(KERN_WARNING
-			       "JBD2: Disabling barriers on %s, "
-			       "not supported by device\n", journal->j_devname);
-			write_lock(&journal->j_state_lock);
-			journal->j_flags &= ~JBD2_BARRIER;
-			write_unlock(&journal->j_state_lock);
-
-			/* And try again, without the barrier */
-			lock_buffer(bh);
-			set_buffer_uptodate(bh);
-			clear_buffer_dirty(bh);
-			ret = submit_bh(WRITE_SYNC_PLUG, bh);
-		}
-	} else {
+				       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
+		ret = submit_bh(WRITE_SYNC_PLUG | WRITE_FLUSH_FUA, bh);
+	else
 		ret = submit_bh(WRITE_SYNC_PLUG, bh);
-	}
+
 	*cbh = bh;
 	return ret;
 }
@@ -166,29 +152,8 @@ static int journal_wait_on_commit_record(journal_t *journal,
 {
 	int ret = 0;
 
-retry:
 	clear_buffer_dirty(bh);
 	wait_on_buffer(bh);
-	if (buffer_eopnotsupp(bh) && (journal->j_flags & JBD2_BARRIER)) {
-		printk(KERN_WARNING
-		       "JBD2: %s: disabling barries on %s - not supported "
-		       "by device\n", __func__, journal->j_devname);
-		write_lock(&journal->j_state_lock);
-		journal->j_flags &= ~JBD2_BARRIER;
-		write_unlock(&journal->j_state_lock);
-
-		lock_buffer(bh);
-		clear_buffer_dirty(bh);
-		set_buffer_uptodate(bh);
-		bh->b_end_io = journal_end_buffer_io_sync;
-
-		ret = submit_bh(WRITE_SYNC_PLUG, bh);
-		if (ret) {
-			unlock_buffer(bh);
-			return ret;
-		}
-		goto retry;
-	}
 
 	if (unlikely(!buffer_uptodate(bh)))
 		ret = -EIO;
-- 
cgit v1.2.3


From 61002f7db33c7d064cddcdab680fb750fa43d8fd Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Wed, 18 Aug 2010 05:29:18 -0400
Subject: ext4: do not send discards as barriers

ext4 already uses synchronous discards, no need to add I/O barriers.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/ext4/mballoc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index df44b345f662..a22bfef3da95 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2567,7 +2567,7 @@ static inline void ext4_issue_discard(struct super_block *sb,
 	trace_ext4_discard_blocks(sb,
 			(unsigned long long) discard_block, count);
 	ret = sb_issue_discard(sb, discard_block, count, GFP_NOFS,
-			       BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
+			       BLKDEV_IFL_WAIT);
 	if (ret == EOPNOTSUPP) {
 		ext4_warning(sb, "discard not supported, disabling");
 		clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD);
-- 
cgit v1.2.3


From e045db80d07250312500b2ed707b84dc703189d6 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Wed, 18 Aug 2010 05:29:19 -0400
Subject: fat: do not send discards as barriers

fat already uses synchronous discards, no need to add I/O barriers.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/fat/fatent.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index 3a56a82f5658..f9a0b7ae8648 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -579,7 +579,7 @@ int fat_free_clusters(struct inode *inode, int cluster)
 					fat_clus_to_blknr(sbi, first_cl),
 					nr_clus * sbi->sec_per_clus,
 					GFP_NOFS,
-					BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
+					BLKDEV_IFL_WAIT);
 
 				first_cl = cluster;
 			}
-- 
cgit v1.2.3


From 349f429eec36cb743fee20f4c193ecde97a3ed2e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Wed, 18 Aug 2010 05:29:20 -0400
Subject: swap: do not send discards as barriers

The swap code already uses synchronous discards, no need to add I/O barriers.

tj: superflous newlines removed.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Hugh Dickins <hughd@google.com>
Tested-by: Nigel Cunningham <nigel@tuxonice.net>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 mm/swapfile.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/mm/swapfile.c b/mm/swapfile.c
index 1f3f9c59a73a..68cda164dff6 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -141,8 +141,7 @@ static int discard_swap(struct swap_info_struct *si)
 	nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
 	if (nr_blocks) {
 		err = blkdev_issue_discard(si->bdev, start_block,
-				nr_blocks, GFP_KERNEL,
-				BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
+				nr_blocks, GFP_KERNEL, BLKDEV_IFL_WAIT);
 		if (err)
 			return err;
 		cond_resched();
@@ -153,8 +152,7 @@ static int discard_swap(struct swap_info_struct *si)
 		nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
 
 		err = blkdev_issue_discard(si->bdev, start_block,
-				nr_blocks, GFP_KERNEL,
-				BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
+				nr_blocks, GFP_KERNEL, BLKDEV_IFL_WAIT);
 		if (err)
 			break;
 
@@ -193,8 +191,7 @@ static void discard_swap_cluster(struct swap_info_struct *si,
 			start_block <<= PAGE_SHIFT - 9;
 			nr_blocks <<= PAGE_SHIFT - 9;
 			if (blkdev_issue_discard(si->bdev, start_block,
-				    nr_blocks, GFP_NOIO, BLKDEV_IFL_WAIT |
-							BLKDEV_IFL_BARRIER))
+				    nr_blocks, GFP_NOIO, BLKDEV_IFL_WAIT))
 				break;
 		}
 
-- 
cgit v1.2.3


From 31725e65c7214b52b607eba705fc4f306be4d5a5 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Wed, 18 Aug 2010 05:29:21 -0400
Subject: block: remove the WRITE_BARRIER flag

It's unused now.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 include/linux/fs.h | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 352c48627381..d6add69bc170 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -135,7 +135,6 @@ struct inodes_stat_t {
  *			immediately after submission. The write equivalent
  *			of READ_SYNC.
  * WRITE_ODIRECT_PLUG	Special case write for O_DIRECT only.
- * WRITE_BARRIER	DEPRECATED. Always fails. Use FLUSH/FUA instead.
  * WRITE_FLUSH		Like WRITE_SYNC but with preceding cache flush.
  * WRITE_FUA		Like WRITE_SYNC but data is guaranteed to be on
  *			non-volatile media on completion.
@@ -157,8 +156,6 @@ struct inodes_stat_t {
 #define WRITE_SYNC		(WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG)
 #define WRITE_ODIRECT_PLUG	(WRITE | REQ_SYNC)
 #define WRITE_META		(WRITE | REQ_META)
-#define WRITE_BARRIER		(WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG | \
-				 REQ_HARDBARRIER)
 #define WRITE_FLUSH		(WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG | \
 				 REQ_FLUSH)
 #define WRITE_FUA		(WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG | \
-- 
cgit v1.2.3


From 8c5553678237b7121355108e03c36086037d8975 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Wed, 18 Aug 2010 05:29:22 -0400
Subject: block: remove the BLKDEV_IFL_BARRIER flag

Remove support for barriers on discards, which is unused now.  Also
remove the DISCARD_NOBARRIER I/O type in favour of just setting the
rw flags up locally in blkdev_issue_discard.

tj: Also remove DISCARD_SECURE and use REQ_SECURE directly.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-lib.c        | 18 ++----------------
 include/linux/blkdev.h |  2 --
 include/linux/fs.h     |  8 --------
 3 files changed, 2 insertions(+), 26 deletions(-)

diff --git a/block/blk-lib.c b/block/blk-lib.c
index c392029a104e..fe2e6ed0f510 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -39,8 +39,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 {
 	DECLARE_COMPLETION_ONSTACK(wait);
 	struct request_queue *q = bdev_get_queue(bdev);
-	int type = flags & BLKDEV_IFL_BARRIER ?
-		DISCARD_BARRIER : DISCARD_NOBARRIER;
+	int type = REQ_WRITE | REQ_DISCARD;
 	unsigned int max_discard_sectors;
 	struct bio *bio;
 	int ret = 0;
@@ -65,7 +64,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 	if (flags & BLKDEV_IFL_SECURE) {
 		if (!blk_queue_secdiscard(q))
 			return -EOPNOTSUPP;
-		type |= DISCARD_SECURE;
+		type |= REQ_SECURE;
 	}
 
 	while (nr_sects && !ret) {
@@ -162,12 +161,6 @@ int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
 	bb.wait = &wait;
 	bb.end_io = NULL;
 
-	if (flags & BLKDEV_IFL_BARRIER) {
-		/* issue async barrier before the data */
-		ret = blkdev_issue_flush(bdev, gfp_mask, NULL, 0);
-		if (ret)
-			return ret;
-	}
 submit:
 	ret = 0;
 	while (nr_sects != 0) {
@@ -199,13 +192,6 @@ submit:
 		issued++;
 		submit_bio(WRITE, bio);
 	}
-	/*
-	 * When all data bios are in flight. Send final barrier if requeted.
-	 */
-	if (nr_sects == 0 && flags & BLKDEV_IFL_BARRIER)
-		ret = blkdev_issue_flush(bdev, gfp_mask, NULL,
-					flags & BLKDEV_IFL_WAIT);
-
 
 	if (flags & BLKDEV_IFL_WAIT)
 		/* Wait for bios in-flight */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 6b305eb4a343..cfcb3a610605 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -869,11 +869,9 @@ static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt,
 }
 enum{
 	BLKDEV_WAIT,	/* wait for completion */
-	BLKDEV_BARRIER,	/* issue request with barrier */
 	BLKDEV_SECURE,	/* secure discard */
 };
 #define BLKDEV_IFL_WAIT		(1 << BLKDEV_WAIT)
-#define BLKDEV_IFL_BARRIER	(1 << BLKDEV_BARRIER)
 #define BLKDEV_IFL_SECURE	(1 << BLKDEV_SECURE)
 extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *,
 			unsigned long);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index d6add69bc170..6b0f6e9993a3 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -163,14 +163,6 @@ struct inodes_stat_t {
 #define WRITE_FLUSH_FUA		(WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG | \
 				 REQ_FLUSH | REQ_FUA)
 
-/*
- * These aren't really reads or writes, they pass down information about
- * parts of device that are now unused by the file system.
- */
-#define DISCARD_NOBARRIER	(WRITE | REQ_DISCARD)
-#define DISCARD_BARRIER		(WRITE | REQ_DISCARD | REQ_HARDBARRIER)
-#define DISCARD_SECURE		(DISCARD_NOBARRIER | REQ_SECURE)
-
 #define SEL_IN		1
 #define SEL_OUT		2
 #define SEL_EX		4
-- 
cgit v1.2.3


From 0edd55faea7c8081bc826234b917501738a6218f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Wed, 18 Aug 2010 05:29:23 -0400
Subject: block: remove the BH_Eopnotsupp flag

This flag was only set for barrier buffers, which we don't submit
anymore.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/buffer.c                 | 7 +------
 fs/fat/misc.c               | 5 +----
 include/linux/buffer_head.h | 2 --
 3 files changed, 2 insertions(+), 12 deletions(-)

diff --git a/fs/buffer.c b/fs/buffer.c
index 3e7dca279d1c..7f0b9b083f77 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -156,7 +156,7 @@ void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
 	if (uptodate) {
 		set_buffer_uptodate(bh);
 	} else {
-		if (!buffer_eopnotsupp(bh) && !quiet_error(bh)) {
+		if (!quiet_error(bh)) {
 			buffer_io_error(bh);
 			printk(KERN_WARNING "lost page write due to "
 					"I/O error on %s\n",
@@ -2891,7 +2891,6 @@ static void end_bio_bh_io_sync(struct bio *bio, int err)
 
 	if (err == -EOPNOTSUPP) {
 		set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
-		set_bit(BH_Eopnotsupp, &bh->b_state);
 	}
 
 	if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags)))
@@ -3031,10 +3030,6 @@ int __sync_dirty_buffer(struct buffer_head *bh, int rw)
 		bh->b_end_io = end_buffer_write_sync;
 		ret = submit_bh(rw, bh);
 		wait_on_buffer(bh);
-		if (buffer_eopnotsupp(bh)) {
-			clear_buffer_eopnotsupp(bh);
-			ret = -EOPNOTSUPP;
-		}
 		if (!ret && !buffer_uptodate(bh))
 			ret = -EIO;
 	} else {
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index 1736f2356388..970e682ea754 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -255,10 +255,7 @@ int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs)
 
 	for (i = 0; i < nr_bhs; i++) {
 		wait_on_buffer(bhs[i]);
-		if (buffer_eopnotsupp(bhs[i])) {
-			clear_buffer_eopnotsupp(bhs[i]);
-			err = -EOPNOTSUPP;
-		} else if (!err && !buffer_uptodate(bhs[i]))
+		if (!err && !buffer_uptodate(bhs[i]))
 			err = -EIO;
 	}
 	return err;
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index fc999f583fda..dd1b25b2641c 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -32,7 +32,6 @@ enum bh_state_bits {
 	BH_Delay,	/* Buffer is not yet allocated on disk */
 	BH_Boundary,	/* Block is followed by a discontiguity */
 	BH_Write_EIO,	/* I/O error on write */
-	BH_Eopnotsupp,	/* DEPRECATED: operation not supported (barrier) */
 	BH_Unwritten,	/* Buffer is allocated on disk but not written */
 	BH_Quiet,	/* Buffer Error Prinks to be quiet */
 
@@ -124,7 +123,6 @@ BUFFER_FNS(Async_Write, async_write)
 BUFFER_FNS(Delay, delay)
 BUFFER_FNS(Boundary, boundary)
 BUFFER_FNS(Write_EIO, write_io_error)
-BUFFER_FNS(Eopnotsupp, eopnotsupp)
 BUFFER_FNS(Unwritten, unwritten)
 
 #define bh_offset(bh)		((unsigned long)(bh)->b_data & ~PAGE_MASK)
-- 
cgit v1.2.3


From c8bf1336824ebd698d37b71763e1c43190f2229a Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen" <martin.petersen@oracle.com>
Date: Fri, 10 Sep 2010 20:07:38 +0200
Subject: Consolidate min_not_zero

We have several users of min_not_zero, each of them using their own
definition.  Move the define to kernel.h.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <axboe@carl.home.kernel.dk>
---
 block/blk-settings.c               |  5 -----
 drivers/block/drbd/drbd_receiver.c |  1 -
 drivers/md/dm-snap.c               |  2 --
 drivers/md/dm-table.c              |  5 -----
 include/linux/kernel.h             | 10 ++++++++++
 5 files changed, 10 insertions(+), 13 deletions(-)

diff --git a/block/blk-settings.c b/block/blk-settings.c
index a234f4bf1d6f..8d592b559bd3 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -455,11 +455,6 @@ void blk_queue_io_opt(struct request_queue *q, unsigned int opt)
 }
 EXPORT_SYMBOL(blk_queue_io_opt);
 
-/*
- * Returns the minimum that is _not_ zero, unless both are zero.
- */
-#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))
-
 /**
  * blk_queue_stack_limits - inherit underlying queue limits for stacked drivers
  * @t:	the stacking driver (top)
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 081522d3c742..484ecbb6b772 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -2972,7 +2972,6 @@ static int receive_sizes(struct drbd_conf *mdev, struct p_header *h)
 	 * we still need to figure out whether we accept that. */
 	mdev->p_size = p_size;
 
-#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))
 	if (get_ldev(mdev)) {
 		warn_if_differ_considerably(mdev, "lower level device sizes",
 			   p_size, drbd_get_max_capacity(mdev->ldev));
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 5974d3094d97..f30f6e8d594e 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -706,8 +706,6 @@ static int dm_add_exception(void *context, chunk_t old, chunk_t new)
 	return 0;
 }
 
-#define min_not_zero(l, r) (((l) == 0) ? (r) : (((r) == 0) ? (l) : min(l, r)))
-
 /*
  * Return a minimum chunk size of all snapshots that have the specified origin.
  * Return zero if the origin has no snapshots.
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index f9fc07d7a4b9..90267f8d64ee 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -486,11 +486,6 @@ static int __table_get_device(struct dm_table *t, struct dm_target *ti,
 	return 0;
 }
 
-/*
- * Returns the minimum that is _not_ zero, unless both are zero.
- */
-#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))
-
 int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
 			 sector_t start, sector_t len, void *data)
 {
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 2b0a35e6bc69..f5df2f4acb0d 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -640,6 +640,16 @@ static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { }
 	(void) (&_max1 == &_max2);		\
 	_max1 > _max2 ? _max1 : _max2; })
 
+/**
+ * min_not_zero - return the minimum that is _not_ zero, unless both are zero
+ * @x: value1
+ * @y: value2
+ */
+#define min_not_zero(x, y) ({			\
+	typeof(x) __x = (x);			\
+	typeof(y) __y = (y);			\
+	__x == 0 ? __y : ((__y == 0) ? __x : min(__x, __y)); })
+
 /**
  * clamp - return a value clamped to a given range with strict typechecking
  * @val: current value
-- 
cgit v1.2.3


From 13f05c8d8e98bbdce89158bfdb2e380940695a88 Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen" <martin.petersen@oracle.com>
Date: Fri, 10 Sep 2010 20:50:10 +0200
Subject: block/scsi: Provide a limit on the number of integrity segments

Some controllers have a hardware limit on the number of protection
information scatter-gather list segments they can handle.

Introduce a max_integrity_segments limit in the block layer and provide
a new scsi_host_template setting that allows HBA drivers to provide a
value suitable for the hardware.

Add support for honoring the integrity segment limit when merging both
bios and requests.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <axboe@carl.home.kernel.dk>
---
 block/blk-integrity.c     | 93 ++++++++++++++++++++++++++++++++++++-----------
 block/blk-merge.c         | 23 +++++++-----
 block/blk-settings.c      |  3 ++
 block/blk-sysfs.c         | 11 ++++++
 block/blk.h               |  8 ----
 drivers/scsi/hosts.c      |  1 +
 drivers/scsi/scsi_lib.c   | 26 +++++++++----
 drivers/scsi/scsi_sysfs.c |  2 +
 include/linux/bio.h       |  4 ++
 include/linux/blkdev.h    | 33 +++++++++++++++--
 include/scsi/scsi.h       |  6 +++
 include/scsi/scsi_host.h  |  7 ++++
 12 files changed, 167 insertions(+), 50 deletions(-)

diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index edce1ef7933d..885cbb59967e 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -32,24 +32,37 @@ static struct kmem_cache *integrity_cachep;
 
 /**
  * blk_rq_count_integrity_sg - Count number of integrity scatterlist elements
- * @rq:		request with integrity metadata attached
+ * @q:		request queue
+ * @bio:	bio with integrity metadata attached
  *
  * Description: Returns the number of elements required in a
- * scatterlist corresponding to the integrity metadata in a request.
+ * scatterlist corresponding to the integrity metadata in a bio.
  */
-int blk_rq_count_integrity_sg(struct request *rq)
+int blk_rq_count_integrity_sg(struct request_queue *q, struct bio *bio)
 {
-	struct bio_vec *iv, *ivprv;
-	struct req_iterator iter;
-	unsigned int segments;
+	struct bio_vec *iv, *ivprv = NULL;
+	unsigned int segments = 0;
+	unsigned int seg_size = 0;
+	unsigned int i = 0;
 
-	ivprv = NULL;
-	segments = 0;
+	bio_for_each_integrity_vec(iv, bio, i) {
 
-	rq_for_each_integrity_segment(iv, rq, iter) {
+		if (ivprv) {
+			if (!BIOVEC_PHYS_MERGEABLE(ivprv, iv))
+				goto new_segment;
+
+			if (!BIOVEC_SEG_BOUNDARY(q, ivprv, iv))
+				goto new_segment;
+
+			if (seg_size + iv->bv_len > queue_max_segment_size(q))
+				goto new_segment;
 
-		if (!ivprv || !BIOVEC_PHYS_MERGEABLE(ivprv, iv))
+			seg_size += iv->bv_len;
+		} else {
+new_segment:
 			segments++;
+			seg_size = iv->bv_len;
+		}
 
 		ivprv = iv;
 	}
@@ -60,30 +73,34 @@ EXPORT_SYMBOL(blk_rq_count_integrity_sg);
 
 /**
  * blk_rq_map_integrity_sg - Map integrity metadata into a scatterlist
- * @rq:		request with integrity metadata attached
+ * @q:		request queue
+ * @bio:	bio with integrity metadata attached
  * @sglist:	target scatterlist
  *
  * Description: Map the integrity vectors in request into a
  * scatterlist.  The scatterlist must be big enough to hold all
  * elements.  I.e. sized using blk_rq_count_integrity_sg().
  */
-int blk_rq_map_integrity_sg(struct request *rq, struct scatterlist *sglist)
+int blk_rq_map_integrity_sg(struct request_queue *q, struct bio *bio,
+			    struct scatterlist *sglist)
 {
-	struct bio_vec *iv, *ivprv;
-	struct req_iterator iter;
-	struct scatterlist *sg;
-	unsigned int segments;
+	struct bio_vec *iv, *ivprv = NULL;
+	struct scatterlist *sg = NULL;
+	unsigned int segments = 0;
+	unsigned int i = 0;
 
-	ivprv = NULL;
-	sg = NULL;
-	segments = 0;
-
-	rq_for_each_integrity_segment(iv, rq, iter) {
+	bio_for_each_integrity_vec(iv, bio, i) {
 
 		if (ivprv) {
 			if (!BIOVEC_PHYS_MERGEABLE(ivprv, iv))
 				goto new_segment;
 
+			if (!BIOVEC_SEG_BOUNDARY(q, ivprv, iv))
+				goto new_segment;
+
+			if (sg->length + iv->bv_len > queue_max_segment_size(q))
+				goto new_segment;
+
 			sg->length += iv->bv_len;
 		} else {
 new_segment:
@@ -162,6 +179,40 @@ int blk_integrity_compare(struct gendisk *gd1, struct gendisk *gd2)
 }
 EXPORT_SYMBOL(blk_integrity_compare);
 
+int blk_integrity_merge_rq(struct request_queue *q, struct request *req,
+			   struct request *next)
+{
+	if (blk_integrity_rq(req) != blk_integrity_rq(next))
+		return -1;
+
+	if (req->nr_integrity_segments + next->nr_integrity_segments >
+	    q->limits.max_integrity_segments)
+		return -1;
+
+	return 0;
+}
+EXPORT_SYMBOL(blk_integrity_merge_rq);
+
+int blk_integrity_merge_bio(struct request_queue *q, struct request *req,
+			    struct bio *bio)
+{
+	int nr_integrity_segs;
+	struct bio *next = bio->bi_next;
+
+	bio->bi_next = NULL;
+	nr_integrity_segs = blk_rq_count_integrity_sg(q, bio);
+	bio->bi_next = next;
+
+	if (req->nr_integrity_segments + nr_integrity_segs >
+	    q->limits.max_integrity_segments)
+		return -1;
+
+	req->nr_integrity_segments += nr_integrity_segs;
+
+	return 0;
+}
+EXPORT_SYMBOL(blk_integrity_merge_bio);
+
 struct integrity_sysfs_entry {
 	struct attribute attr;
 	ssize_t (*show)(struct blk_integrity *, char *);
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 3b0cd4249671..6a725461654d 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -205,12 +205,11 @@ static inline int ll_new_hw_segment(struct request_queue *q,
 {
 	int nr_phys_segs = bio_phys_segments(q, bio);
 
-	if (req->nr_phys_segments + nr_phys_segs > queue_max_segments(q)) {
-		req->cmd_flags |= REQ_NOMERGE;
-		if (req == q->last_merge)
-			q->last_merge = NULL;
-		return 0;
-	}
+	if (req->nr_phys_segments + nr_phys_segs > queue_max_segments(q))
+		goto no_merge;
+
+	if (bio_integrity(bio) && blk_integrity_merge_bio(q, req, bio))
+		goto no_merge;
 
 	/*
 	 * This will form the start of a new hw segment.  Bump both
@@ -218,6 +217,12 @@ static inline int ll_new_hw_segment(struct request_queue *q,
 	 */
 	req->nr_phys_segments += nr_phys_segs;
 	return 1;
+
+no_merge:
+	req->cmd_flags |= REQ_NOMERGE;
+	if (req == q->last_merge)
+		q->last_merge = NULL;
+	return 0;
 }
 
 int ll_back_merge_fn(struct request_queue *q, struct request *req,
@@ -301,6 +306,9 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
 	if (total_phys_segments > queue_max_segments(q))
 		return 0;
 
+	if (blk_integrity_rq(req) && blk_integrity_merge_rq(q, req, next))
+		return 0;
+
 	/* Merge is OK... */
 	req->nr_phys_segments = total_phys_segments;
 	return 1;
@@ -372,9 +380,6 @@ static int attempt_merge(struct request_queue *q, struct request *req,
 	    || next->special)
 		return 0;
 
-	if (blk_integrity_rq(req) != blk_integrity_rq(next))
-		return 0;
-
 	/*
 	 * If we are allowed to merge, then append bio list
 	 * from next to rq and release next. merge_requests_fn
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 8d592b559bd3..f8f2ddf20613 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -111,6 +111,7 @@ EXPORT_SYMBOL_GPL(blk_queue_lld_busy);
 void blk_set_default_limits(struct queue_limits *lim)
 {
 	lim->max_segments = BLK_MAX_SEGMENTS;
+	lim->max_integrity_segments = 0;
 	lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK;
 	lim->max_segment_size = BLK_MAX_SEGMENT_SIZE;
 	lim->max_sectors = BLK_DEF_MAX_SECTORS;
@@ -509,6 +510,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
 					    b->seg_boundary_mask);
 
 	t->max_segments = min_not_zero(t->max_segments, b->max_segments);
+	t->max_integrity_segments = min_not_zero(t->max_integrity_segments,
+						 b->max_integrity_segments);
 
 	t->max_segment_size = min_not_zero(t->max_segment_size,
 					   b->max_segment_size);
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 001ab18078f5..b014f7739e98 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -112,6 +112,11 @@ static ssize_t queue_max_segments_show(struct request_queue *q, char *page)
 	return queue_var_show(queue_max_segments(q), (page));
 }
 
+static ssize_t queue_max_integrity_segments_show(struct request_queue *q, char *page)
+{
+	return queue_var_show(q->limits.max_integrity_segments, (page));
+}
+
 static ssize_t queue_max_segment_size_show(struct request_queue *q, char *page)
 {
 	if (test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags))
@@ -288,6 +293,11 @@ static struct queue_sysfs_entry queue_max_segments_entry = {
 	.show = queue_max_segments_show,
 };
 
+static struct queue_sysfs_entry queue_max_integrity_segments_entry = {
+	.attr = {.name = "max_integrity_segments", .mode = S_IRUGO },
+	.show = queue_max_integrity_segments_show,
+};
+
 static struct queue_sysfs_entry queue_max_segment_size_entry = {
 	.attr = {.name = "max_segment_size", .mode = S_IRUGO },
 	.show = queue_max_segment_size_show,
@@ -375,6 +385,7 @@ static struct attribute *default_attrs[] = {
 	&queue_max_hw_sectors_entry.attr,
 	&queue_max_sectors_entry.attr,
 	&queue_max_segments_entry.attr,
+	&queue_max_integrity_segments_entry.attr,
 	&queue_max_segment_size_entry.attr,
 	&queue_iosched_entry.attr,
 	&queue_hw_sector_size_entry.attr,
diff --git a/block/blk.h b/block/blk.h
index 6e7dc87141e4..6738831ba447 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -132,14 +132,6 @@ static inline int queue_congestion_off_threshold(struct request_queue *q)
 	return q->nr_congestion_off;
 }
 
-#if defined(CONFIG_BLK_DEV_INTEGRITY)
-
-#define rq_for_each_integrity_segment(bvl, _rq, _iter)		\
-	__rq_for_each_bio(_iter.bio, _rq)			\
-		bip_for_each_vec(bvl, _iter.bio->bi_integrity, _iter.i)
-
-#endif /* BLK_DEV_INTEGRITY */
-
 static inline int blk_cpu_to_group(int cpu)
 {
 #ifdef CONFIG_SCHED_MC
diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c
index 8a8f803439e1..10478153641b 100644
--- a/drivers/scsi/hosts.c
+++ b/drivers/scsi/hosts.c
@@ -376,6 +376,7 @@ struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *sht, int privsize)
 	shost->this_id = sht->this_id;
 	shost->can_queue = sht->can_queue;
 	shost->sg_tablesize = sht->sg_tablesize;
+	shost->sg_prot_tablesize = sht->sg_prot_tablesize;
 	shost->cmd_per_lun = sht->cmd_per_lun;
 	shost->unchecked_isa_dma = sht->unchecked_isa_dma;
 	shost->use_clustering = sht->use_clustering;
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 9ade720422c6..861c0b937ac9 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -968,11 +968,13 @@ static int scsi_init_sgtable(struct request *req, struct scsi_data_buffer *sdb,
  */
 int scsi_init_io(struct scsi_cmnd *cmd, gfp_t gfp_mask)
 {
-	int error = scsi_init_sgtable(cmd->request, &cmd->sdb, gfp_mask);
+	struct request *rq = cmd->request;
+
+	int error = scsi_init_sgtable(rq, &cmd->sdb, gfp_mask);
 	if (error)
 		goto err_exit;
 
-	if (blk_bidi_rq(cmd->request)) {
+	if (blk_bidi_rq(rq)) {
 		struct scsi_data_buffer *bidi_sdb = kmem_cache_zalloc(
 			scsi_sdb_cache, GFP_ATOMIC);
 		if (!bidi_sdb) {
@@ -980,28 +982,28 @@ int scsi_init_io(struct scsi_cmnd *cmd, gfp_t gfp_mask)
 			goto err_exit;
 		}
 
-		cmd->request->next_rq->special = bidi_sdb;
-		error = scsi_init_sgtable(cmd->request->next_rq, bidi_sdb,
-								    GFP_ATOMIC);
+		rq->next_rq->special = bidi_sdb;
+		error = scsi_init_sgtable(rq->next_rq, bidi_sdb, GFP_ATOMIC);
 		if (error)
 			goto err_exit;
 	}
 
-	if (blk_integrity_rq(cmd->request)) {
+	if (blk_integrity_rq(rq)) {
 		struct scsi_data_buffer *prot_sdb = cmd->prot_sdb;
 		int ivecs, count;
 
 		BUG_ON(prot_sdb == NULL);
-		ivecs = blk_rq_count_integrity_sg(cmd->request);
+		ivecs = blk_rq_count_integrity_sg(rq->q, rq->bio);
 
 		if (scsi_alloc_sgtable(prot_sdb, ivecs, gfp_mask)) {
 			error = BLKPREP_DEFER;
 			goto err_exit;
 		}
 
-		count = blk_rq_map_integrity_sg(cmd->request,
+		count = blk_rq_map_integrity_sg(rq->q, rq->bio,
 						prot_sdb->table.sgl);
 		BUG_ON(unlikely(count > ivecs));
+		BUG_ON(unlikely(count > queue_max_integrity_segments(rq->q)));
 
 		cmd->prot_sdb = prot_sdb;
 		cmd->prot_sdb->table.nents = count;
@@ -1625,6 +1627,14 @@ struct request_queue *__scsi_alloc_queue(struct Scsi_Host *shost,
 	blk_queue_max_segments(q, min_t(unsigned short, shost->sg_tablesize,
 					SCSI_MAX_SG_CHAIN_SEGMENTS));
 
+	if (scsi_host_prot_dma(shost)) {
+		shost->sg_prot_tablesize =
+			min_not_zero(shost->sg_prot_tablesize,
+				     (unsigned short)SCSI_MAX_PROT_SG_SEGMENTS);
+		BUG_ON(shost->sg_prot_tablesize < shost->sg_tablesize);
+		blk_queue_max_integrity_segments(q, shost->sg_prot_tablesize);
+	}
+
 	blk_queue_max_hw_sectors(q, shost->max_sectors);
 	blk_queue_bounce_limit(q, scsi_calculate_bounce_limit(shost));
 	blk_queue_segment_boundary(q, shost->dma_boundary);
diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c
index c3f67373a4f8..20ad59dff730 100644
--- a/drivers/scsi/scsi_sysfs.c
+++ b/drivers/scsi/scsi_sysfs.c
@@ -251,6 +251,7 @@ shost_rd_attr(host_busy, "%hu\n");
 shost_rd_attr(cmd_per_lun, "%hd\n");
 shost_rd_attr(can_queue, "%hd\n");
 shost_rd_attr(sg_tablesize, "%hu\n");
+shost_rd_attr(sg_prot_tablesize, "%hu\n");
 shost_rd_attr(unchecked_isa_dma, "%d\n");
 shost_rd_attr(prot_capabilities, "%u\n");
 shost_rd_attr(prot_guard_type, "%hd\n");
@@ -262,6 +263,7 @@ static struct attribute *scsi_sysfs_shost_attrs[] = {
 	&dev_attr_cmd_per_lun.attr,
 	&dev_attr_can_queue.attr,
 	&dev_attr_sg_tablesize.attr,
+	&dev_attr_sg_prot_tablesize.attr,
 	&dev_attr_unchecked_isa_dma.attr,
 	&dev_attr_proc_name.attr,
 	&dev_attr_scan.attr,
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 5274103434ad..2c3fd7421607 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -496,6 +496,10 @@ static inline struct bio *bio_list_get(struct bio_list *bl)
 #define bip_for_each_vec(bvl, bip, i)					\
 	__bip_for_each_vec(bvl, bip, i, (bip)->bip_idx)
 
+#define bio_for_each_integrity_vec(_bvl, _bio, _iter)			\
+	for_each_bio(_bio)						\
+		bip_for_each_vec(_bvl, _bio->bi_integrity, _iter)
+
 #define bio_integrity(bio) (bio->bi_integrity != NULL)
 
 extern struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *, gfp_t, unsigned int, struct bio_set *);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 2c54906f678f..7e661106270a 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -124,6 +124,9 @@ struct request {
 	 * physical address coalescing is performed.
 	 */
 	unsigned short nr_phys_segments;
+#if defined(CONFIG_BLK_DEV_INTEGRITY)
+	unsigned short nr_integrity_segments;
+#endif
 
 	unsigned short ioprio;
 
@@ -243,6 +246,7 @@ struct queue_limits {
 
 	unsigned short		logical_block_size;
 	unsigned short		max_segments;
+	unsigned short		max_integrity_segments;
 
 	unsigned char		misaligned;
 	unsigned char		discard_misaligned;
@@ -1213,8 +1217,13 @@ struct blk_integrity {
 extern int blk_integrity_register(struct gendisk *, struct blk_integrity *);
 extern void blk_integrity_unregister(struct gendisk *);
 extern int blk_integrity_compare(struct gendisk *, struct gendisk *);
-extern int blk_rq_map_integrity_sg(struct request *, struct scatterlist *);
-extern int blk_rq_count_integrity_sg(struct request *);
+extern int blk_rq_map_integrity_sg(struct request_queue *, struct bio *,
+				   struct scatterlist *);
+extern int blk_rq_count_integrity_sg(struct request_queue *, struct bio *);
+extern int blk_integrity_merge_rq(struct request_queue *, struct request *,
+				  struct request *);
+extern int blk_integrity_merge_bio(struct request_queue *, struct request *,
+				   struct bio *);
 
 static inline
 struct blk_integrity *bdev_get_integrity(struct block_device *bdev)
@@ -1235,16 +1244,32 @@ static inline int blk_integrity_rq(struct request *rq)
 	return bio_integrity(rq->bio);
 }
 
+static inline void blk_queue_max_integrity_segments(struct request_queue *q,
+						    unsigned int segs)
+{
+	q->limits.max_integrity_segments = segs;
+}
+
+static inline unsigned short
+queue_max_integrity_segments(struct request_queue *q)
+{
+	return q->limits.max_integrity_segments;
+}
+
 #else /* CONFIG_BLK_DEV_INTEGRITY */
 
 #define blk_integrity_rq(rq)			(0)
-#define blk_rq_count_integrity_sg(a)		(0)
-#define blk_rq_map_integrity_sg(a, b)		(0)
+#define blk_rq_count_integrity_sg(a, b)		(0)
+#define blk_rq_map_integrity_sg(a, b, c)	(0)
 #define bdev_get_integrity(a)			(0)
 #define blk_get_integrity(a)			(0)
 #define blk_integrity_compare(a, b)		(0)
 #define blk_integrity_register(a, b)		(0)
 #define blk_integrity_unregister(a)		do { } while (0);
+#define blk_queue_max_integrity_segments(a, b)	do { } while (0);
+#define queue_max_integrity_segments(a)		(0)
+#define blk_integrity_merge_rq(a, b, c)		(0)
+#define blk_integrity_merge_bio(a, b, c)	(0)
 
 #endif /* CONFIG_BLK_DEV_INTEGRITY */
 
diff --git a/include/scsi/scsi.h b/include/scsi/scsi.h
index 8fcb6e0e9e72..d63533a4a59e 100644
--- a/include/scsi/scsi.h
+++ b/include/scsi/scsi.h
@@ -31,6 +31,12 @@ struct scsi_cmnd;
 #define SCSI_MAX_SG_CHAIN_SEGMENTS	SCSI_MAX_SG_SEGMENTS
 #endif
 
+/*
+ * DIX-capable adapters effectively support infinite chaining for the
+ * protection information scatterlist
+ */
+#define SCSI_MAX_PROT_SG_SEGMENTS	0xFFFF
+
 /*
  * Special value for scanning to specify scanning or rescanning of all
  * possible channels, (target) ids, or luns on a given shost.
diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h
index b7bdecb7b76e..d0a6a845f204 100644
--- a/include/scsi/scsi_host.h
+++ b/include/scsi/scsi_host.h
@@ -388,6 +388,7 @@ struct scsi_host_template {
 	 * of scatter-gather.
 	 */
 	unsigned short sg_tablesize;
+	unsigned short sg_prot_tablesize;
 
 	/*
 	 * Set this if the host adapter has limitations beside segment count.
@@ -599,6 +600,7 @@ struct Scsi_Host {
 	int can_queue;
 	short cmd_per_lun;
 	short unsigned int sg_tablesize;
+	short unsigned int sg_prot_tablesize;
 	short unsigned int max_sectors;
 	unsigned long dma_boundary;
 	/* 
@@ -823,6 +825,11 @@ static inline unsigned int scsi_host_get_prot(struct Scsi_Host *shost)
 	return shost->prot_capabilities;
 }
 
+static inline int scsi_host_prot_dma(struct Scsi_Host *shost)
+{
+	return shost->prot_capabilities >= SHOST_DIX_TYPE0_PROTECTION;
+}
+
 static inline unsigned int scsi_host_dif_capable(struct Scsi_Host *shost, unsigned int target_type)
 {
 	static unsigned char cap[] = { 0,
-- 
cgit v1.2.3


From 175b79f0632544d62aae72e5496c14e3e3ff2ae7 Mon Sep 17 00:00:00 2001
From: Christof Schmitt <christof.schmitt@de.ibm.com>
Date: Fri, 10 Sep 2010 20:50:40 +0200
Subject: zfcp: Report scatter gather limit for DIX protection information

When sending DIX integrity segments with an I/O request, the
restriction for the maximum number of segments is still the same for
the zfcp hardware. Report the new sg_prot_tablesize for the SCSI host,
so that the number of integrity segments plus the number of data
segments is not larger than the hardware limit. This results in using
half of the hardware segments for integrity data and the other half
for regular data.

Reviewed-by: Swen Schillig <swen@vnet.ibm.com>
Signed-off-by: Christof Schmitt <christof.schmitt@de.ibm.com>
Signed-off-by: Jens Axboe <axboe@carl.home.kernel.dk>
---
 drivers/s390/scsi/zfcp_scsi.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/s390/scsi/zfcp_scsi.c b/drivers/s390/scsi/zfcp_scsi.c
index cb000c9833bb..208256e39def 100644
--- a/drivers/s390/scsi/zfcp_scsi.c
+++ b/drivers/s390/scsi/zfcp_scsi.c
@@ -681,6 +681,7 @@ void zfcp_scsi_set_prot(struct zfcp_adapter *adapter)
 	    adapter->adapter_features & FSF_FEATURE_DIX_PROT_TCPIP) {
 		mask |= SHOST_DIX_TYPE1_PROTECTION;
 		scsi_host_set_guard(shost, SHOST_DIX_GUARD_IP);
+		shost->sg_prot_tablesize = ZFCP_QDIO_MAX_SBALES_PER_REQ / 2;
 		shost->sg_tablesize = ZFCP_QDIO_MAX_SBALES_PER_REQ / 2;
 		shost->max_sectors = ZFCP_QDIO_MAX_SBALES_PER_REQ * 8 / 2;
 	}
-- 
cgit v1.2.3


From 8dcbdc742fec9e6c542ff9cb599d2ee620753d07 Mon Sep 17 00:00:00 2001
From: San Mehat <san@android.com>
Date: Tue, 14 Sep 2010 08:48:01 +0200
Subject: block: block_dump: Add number of sectors to debug output

Signed-off-by: San Mehat <san@android.com>
Signed-off-by: Linus Walleij <linus.walleij@stericsson.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-core.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index ee1a1e7e63cc..8d07c1b7e701 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1612,11 +1612,12 @@ void submit_bio(int rw, struct bio *bio)
 
 		if (unlikely(block_dump)) {
 			char b[BDEVNAME_SIZE];
-			printk(KERN_DEBUG "%s(%d): %s block %Lu on %s\n",
+			printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n",
 			current->comm, task_pid_nr(current),
 				(rw & WRITE) ? "WRITE" : "READ",
 				(unsigned long long)bio->bi_sector,
-				bdevname(bio->bi_bdev, b));
+				bdevname(bio->bi_bdev, b),
+				count);
 		}
 	}
 
-- 
cgit v1.2.3


From 144177991ca624841ddbd1e7edff958fc0f6d1fe Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@gmail.com>
Date: Wed, 15 Sep 2010 13:08:27 +0200
Subject: block: fix an address space warning in blk-map.c

Change type of 2nd parameter of blk_rq_aligned() into unsigned long
and remove unnecessary casting. Now we can call it with 'uaddr'
instead of 'ubuf' in __blk_rq_map_user() so that it can remove
following warnings from sparse:

 block/blk-map.c:57:31: warning: incorrect type in argument 2 (different address spaces)
 block/blk-map.c:57:31:    expected void *addr
 block/blk-map.c:57:31:    got void [noderef] <asn:1>*ubuf

However blk_rq_map_kern() needs one more local variable to handle it.

Signed-off-by: Namhyung Kim <namhyung@gmail.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-map.c        | 5 +++--
 include/linux/blkdev.h | 4 ++--
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/block/blk-map.c b/block/blk-map.c
index c65d7593f7f1..ac0f7d46db63 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -54,7 +54,7 @@ static int __blk_rq_map_user(struct request_queue *q, struct request *rq,
 	 * direct dma. else, set up kernel bounce buffers
 	 */
 	uaddr = (unsigned long) ubuf;
-	if (blk_rq_aligned(q, ubuf, len) && !map_data)
+	if (blk_rq_aligned(q, uaddr, len) && !map_data)
 		bio = bio_map_user(q, NULL, uaddr, len, reading, gfp_mask);
 	else
 		bio = bio_copy_user(q, map_data, uaddr, len, reading, gfp_mask);
@@ -288,6 +288,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
 		    unsigned int len, gfp_t gfp_mask)
 {
 	int reading = rq_data_dir(rq) == READ;
+	unsigned long addr = (unsigned long) kbuf;
 	int do_copy = 0;
 	struct bio *bio;
 	int ret;
@@ -297,7 +298,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
 	if (!len || !kbuf)
 		return -EINVAL;
 
-	do_copy = !blk_rq_aligned(q, kbuf, len) || object_is_on_stack(kbuf);
+	do_copy = !blk_rq_aligned(q, addr, len) || object_is_on_stack(kbuf);
 	if (do_copy)
 		bio = bio_copy_kern(q, kbuf, len, gfp_mask, reading);
 	else
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 7e661106270a..780824edac16 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1097,11 +1097,11 @@ static inline int queue_dma_alignment(struct request_queue *q)
 	return q ? q->dma_alignment : 511;
 }
 
-static inline int blk_rq_aligned(struct request_queue *q, void *addr,
+static inline int blk_rq_aligned(struct request_queue *q, unsigned long addr,
 				 unsigned int len)
 {
 	unsigned int alignment = queue_dma_alignment(q) | q->dma_pad_mask;
-	return !((unsigned long)addr & alignment) && !(len & alignment);
+	return !(addr & alignment) && !(len & alignment);
 }
 
 /* assumes size > 256 */
-- 
cgit v1.2.3


From 6d1d8050b4bc89d0165d29b58e894aeba2564a97 Mon Sep 17 00:00:00 2001
From: Will Drewry <wad@chromium.org>
Date: Tue, 31 Aug 2010 15:47:05 -0500
Subject: block, partition: add partition_meta_info to hd_struct

I'm reposting this patch series as v4 since there have been no additional
comments, and I cleaned up one extra bit of unneeded code (in 3/3). The patches
are against Linus's tree: 2bfc96a127bc1cc94d26bfaa40159966064f9c8c
(2.6.36-rc3).

Would this patchset be suitable for inclusion in an mm branch?

This changes adds a partition_meta_info struct which itself contains a
union of structures that provide partition table specific metadata.

This change leaves the union empty. The subsequent patch includes an
implementation for CONFIG_EFI_PARTITION-based metadata.

Signed-off-by: Will Drewry <wad@chromium.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/genhd.c         |  1 +
 block/ioctl.c         |  2 +-
 fs/partitions/check.c | 23 +++++++++++++++++++---
 fs/partitions/check.h |  3 +++
 include/linux/genhd.h | 53 +++++++++++++++++++++++++++++++++++++++++++++++++--
 5 files changed, 76 insertions(+), 6 deletions(-)

diff --git a/block/genhd.c b/block/genhd.c
index 59a2db6fecef..c8da12055264 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1004,6 +1004,7 @@ static void disk_release(struct device *dev)
 	kfree(disk->random);
 	disk_replace_part_tbl(disk, NULL);
 	free_part_stats(&disk->part0);
+	free_part_info(&disk->part0);
 	kfree(disk);
 }
 struct class block_class = {
diff --git a/block/ioctl.c b/block/ioctl.c
index d8052f0dabd3..2c15fe0912c4 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -62,7 +62,7 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
 
 			/* all seems OK */
 			part = add_partition(disk, partno, start, length,
-					     ADDPART_FLAG_NONE);
+					     ADDPART_FLAG_NONE, NULL);
 			mutex_unlock(&bdev->bd_mutex);
 			return IS_ERR(part) ? PTR_ERR(part) : 0;
 		case BLKPG_DEL_PARTITION:
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 79fbf3f390f0..6dfbee03ccc6 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -352,6 +352,7 @@ static void part_release(struct device *dev)
 {
 	struct hd_struct *p = dev_to_part(dev);
 	free_part_stats(p);
+	free_part_info(p);
 	kfree(p);
 }
 
@@ -401,7 +402,8 @@ static DEVICE_ATTR(whole_disk, S_IRUSR | S_IRGRP | S_IROTH,
 		   whole_disk_show, NULL);
 
 struct hd_struct *add_partition(struct gendisk *disk, int partno,
-				sector_t start, sector_t len, int flags)
+				sector_t start, sector_t len, int flags,
+				struct partition_meta_info *info)
 {
 	struct hd_struct *p;
 	dev_t devt = MKDEV(0, 0);
@@ -438,6 +440,14 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
 	p->partno = partno;
 	p->policy = get_disk_ro(disk);
 
+	if (info) {
+		struct partition_meta_info *pinfo = alloc_part_info(disk);
+		if (!pinfo)
+			goto out_free_stats;
+		memcpy(pinfo, info, sizeof(*info));
+		p->info = pinfo;
+	}
+
 	dname = dev_name(ddev);
 	if (isdigit(dname[strlen(dname) - 1]))
 		dev_set_name(pdev, "%sp%d", dname, partno);
@@ -451,7 +461,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
 
 	err = blk_alloc_devt(p, &devt);
 	if (err)
-		goto out_free_stats;
+		goto out_free_info;
 	pdev->devt = devt;
 
 	/* delay uevent until 'holders' subdir is created */
@@ -481,6 +491,8 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
 
 	return p;
 
+out_free_info:
+	free_part_info(p);
 out_free_stats:
 	free_part_stats(p);
 out_free:
@@ -642,6 +654,7 @@ rescan:
 	/* add partitions */
 	for (p = 1; p < state->limit; p++) {
 		sector_t size, from;
+		struct partition_meta_info *info = NULL;
 
 		size = state->parts[p].size;
 		if (!size)
@@ -675,8 +688,12 @@ rescan:
 				size = get_capacity(disk) - from;
 			}
 		}
+
+		if (state->parts[p].has_info)
+			info = &state->parts[p].info;
 		part = add_partition(disk, p, from, size,
-				     state->parts[p].flags);
+				     state->parts[p].flags,
+				     &state->parts[p].info);
 		if (IS_ERR(part)) {
 			printk(KERN_ERR " %s: p%d could not be added: %ld\n",
 			       disk->disk_name, p, -PTR_ERR(part));
diff --git a/fs/partitions/check.h b/fs/partitions/check.h
index 8e4e103ba216..d68bf4dc3bc2 100644
--- a/fs/partitions/check.h
+++ b/fs/partitions/check.h
@@ -1,5 +1,6 @@
 #include <linux/pagemap.h>
 #include <linux/blkdev.h>
+#include <linux/genhd.h>
 
 /*
  * add_gd_partition adds a partitions details to the devices partition
@@ -12,6 +13,8 @@ struct parsed_partitions {
 		sector_t from;
 		sector_t size;
 		int flags;
+		bool has_info;
+		struct partition_meta_info info;
 	} parts[DISK_MAX_PARTS];
 	int next;
 	int limit;
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 5f2f4c4d8fb0..66e26b5a1537 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -12,6 +12,7 @@
 #include <linux/types.h>
 #include <linux/kdev_t.h>
 #include <linux/rcupdate.h>
+#include <linux/slab.h>
 
 #ifdef CONFIG_BLOCK
 
@@ -86,7 +87,15 @@ struct disk_stats {
 	unsigned long io_ticks;
 	unsigned long time_in_queue;
 };
-	
+
+#define PARTITION_META_INFO_VOLNAMELTH	64
+#define PARTITION_META_INFO_UUIDLTH	16
+
+struct partition_meta_info {
+	u8 uuid[PARTITION_META_INFO_UUIDLTH];	/* always big endian */
+	u8 volname[PARTITION_META_INFO_VOLNAMELTH];
+};
+
 struct hd_struct {
 	sector_t start_sect;
 	sector_t nr_sects;
@@ -95,6 +104,7 @@ struct hd_struct {
 	struct device __dev;
 	struct kobject *holder_dir;
 	int policy, partno;
+	struct partition_meta_info *info;
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 	int make_it_fail;
 #endif
@@ -181,6 +191,30 @@ static inline struct gendisk *part_to_disk(struct hd_struct *part)
 	return NULL;
 }
 
+static inline void part_pack_uuid(const u8 *uuid_str, u8 *to)
+{
+	int i;
+	for (i = 0; i < 16; ++i) {
+		*to++ = (hex_to_bin(*uuid_str) << 4) |
+			(hex_to_bin(*(uuid_str + 1)));
+		uuid_str += 2;
+		switch (i) {
+		case 3:
+		case 5:
+		case 7:
+		case 9:
+			uuid_str++;
+			continue;
+		}
+	}
+}
+
+static inline char *part_unpack_uuid(const u8 *uuid, char *out)
+{
+	sprintf(out, "%pU", uuid);
+	return out;
+}
+
 static inline int disk_max_parts(struct gendisk *disk)
 {
 	if (disk->flags & GENHD_FL_EXT_DEVT)
@@ -342,6 +376,19 @@ static inline int part_in_flight(struct hd_struct *part)
 	return part->in_flight[0] + part->in_flight[1];
 }
 
+static inline struct partition_meta_info *alloc_part_info(struct gendisk *disk)
+{
+	if (disk)
+		return kzalloc_node(sizeof(struct partition_meta_info),
+				    GFP_KERNEL, disk->node_id);
+	return kzalloc(sizeof(struct partition_meta_info), GFP_KERNEL);
+}
+
+static inline void free_part_info(struct hd_struct *part)
+{
+	kfree(part->info);
+}
+
 /* block/blk-core.c */
 extern void part_round_stats(int cpu, struct hd_struct *part);
 
@@ -533,7 +580,9 @@ extern int disk_expand_part_tbl(struct gendisk *disk, int target);
 extern int rescan_partitions(struct gendisk *disk, struct block_device *bdev);
 extern struct hd_struct * __must_check add_partition(struct gendisk *disk,
 						     int partno, sector_t start,
-						     sector_t len, int flags);
+						     sector_t len, int flags,
+						     struct partition_meta_info
+						       *info);
 extern void delete_partition(struct gendisk *, int);
 extern void printk_all_partitions(void);
 
-- 
cgit v1.2.3


From eec7ecfede74bb996060efefd5c157acd5794e8a Mon Sep 17 00:00:00 2001
From: Will Drewry <wad@chromium.org>
Date: Tue, 31 Aug 2010 15:47:06 -0500
Subject: genhd, efi: add efi partition metadata to hd_structs

This change extends the partition_meta_info structure to
support EFI GPT-specific metadata and ensures that data
is copied in on partition scanning.

Signed-off-by: Will Drewry <wad@chromium.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/partitions/efi.c | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c
index dbb44d4bb8a7..ac0ccb5026a2 100644
--- a/fs/partitions/efi.c
+++ b/fs/partitions/efi.c
@@ -94,6 +94,7 @@
  *
  ************************************************************/
 #include <linux/crc32.h>
+#include <linux/ctype.h>
 #include <linux/math64.h>
 #include <linux/slab.h>
 #include "check.h"
@@ -604,6 +605,7 @@ int efi_partition(struct parsed_partitions *state)
 	gpt_entry *ptes = NULL;
 	u32 i;
 	unsigned ssz = bdev_logical_block_size(state->bdev) / 512;
+	u8 unparsed_guid[37];
 
 	if (!find_valid_gpt(state, &gpt, &ptes) || !gpt || !ptes) {
 		kfree(gpt);
@@ -614,6 +616,9 @@ int efi_partition(struct parsed_partitions *state)
 	pr_debug("GUID Partition Table is valid!  Yea!\n");
 
 	for (i = 0; i < le32_to_cpu(gpt->num_partition_entries) && i < state->limit-1; i++) {
+		struct partition_meta_info *info;
+		unsigned label_count = 0;
+		unsigned label_max;
 		u64 start = le64_to_cpu(ptes[i].starting_lba);
 		u64 size = le64_to_cpu(ptes[i].ending_lba) -
 			   le64_to_cpu(ptes[i].starting_lba) + 1ULL;
@@ -627,6 +632,26 @@ int efi_partition(struct parsed_partitions *state)
 		if (!efi_guidcmp(ptes[i].partition_type_guid,
 				 PARTITION_LINUX_RAID_GUID))
 			state->parts[i + 1].flags = ADDPART_FLAG_RAID;
+
+		info = &state->parts[i + 1].info;
+		/* Instead of doing a manual swap to big endian, reuse the
+		 * common ASCII hex format as the interim.
+		 */
+		efi_guid_unparse(&ptes[i].unique_partition_guid, unparsed_guid);
+		part_pack_uuid(unparsed_guid, info->uuid);
+
+		/* Naively convert UTF16-LE to 7 bits. */
+		label_max = min(sizeof(info->volname) - 1,
+				sizeof(ptes[i].partition_name));
+		info->volname[label_max] = 0;
+		while (label_count < label_max) {
+			u8 c = ptes[i].partition_name[label_count] & 0xff;
+			if (c && !isprint(c))
+				c = '!';
+			info->volname[label_count] = c;
+			label_count++;
+		}
+		state->parts[i + 1].has_info = true;
 	}
 	kfree(ptes);
 	kfree(gpt);
-- 
cgit v1.2.3


From b5af921ec02333e943efb59aca4f56b78fc0e100 Mon Sep 17 00:00:00 2001
From: Will Drewry <wad@chromium.org>
Date: Tue, 31 Aug 2010 15:47:07 -0500
Subject: init: add support for root devices specified by partition UUID

This is the third patch in a series which adds support for
storing partition metadata, optionally, off of the hd_struct.

One major use for that data is being able to resolve partition
by other identities than just the index on a block device.  Device
enumeration varies by platform and there's a benefit to being able
to use something like EFI GPT's GUIDs to determine the correct
block device and partition to mount as the root.

This change adds that support to root= by adding support for
the following syntax:

  root=PARTUUID=hex-uuid

Signed-off-by: Will Drewry <wad@chromium.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/genhd.c    |  9 ++++++--
 init/do_mounts.c | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 73 insertions(+), 2 deletions(-)

diff --git a/block/genhd.c b/block/genhd.c
index c8da12055264..5c9c503de423 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -642,6 +642,7 @@ void __init printk_all_partitions(void)
 		struct hd_struct *part;
 		char name_buf[BDEVNAME_SIZE];
 		char devt_buf[BDEVT_SIZE];
+		u8 uuid[PARTITION_META_INFO_UUIDLTH * 2 + 1];
 
 		/*
 		 * Don't show empty devices or things that have been
@@ -660,10 +661,14 @@ void __init printk_all_partitions(void)
 		while ((part = disk_part_iter_next(&piter))) {
 			bool is_part0 = part == &disk->part0;
 
-			printk("%s%s %10llu %s", is_part0 ? "" : "  ",
+			uuid[0] = 0;
+			if (part->info)
+				part_unpack_uuid(part->info->uuid, uuid);
+
+			printk("%s%s %10llu %s %s", is_part0 ? "" : "  ",
 			       bdevt_str(part_devt(part), devt_buf),
 			       (unsigned long long)part->nr_sects >> 1,
-			       disk_name(disk, part->partno, name_buf));
+			       disk_name(disk, part->partno, name_buf), uuid);
 			if (is_part0) {
 				if (disk->driverfs_dev != NULL &&
 				    disk->driverfs_dev->driver != NULL)
diff --git a/init/do_mounts.c b/init/do_mounts.c
index 02e3ca4fc527..804f9c6ba216 100644
--- a/init/do_mounts.c
+++ b/init/do_mounts.c
@@ -58,6 +58,60 @@ static int __init readwrite(char *str)
 __setup("ro", readonly);
 __setup("rw", readwrite);
 
+/**
+ * match_dev_by_uuid - callback for finding a partition using its uuid
+ * @dev:	device passed in by the caller
+ * @data:	opaque pointer to a 36 byte char array with a UUID
+ *
+ * Returns 1 if the device matches, and 0 otherwise.
+ */
+static int __init match_dev_by_uuid(struct device *dev, void *data)
+{
+	u8 *uuid = data;
+	struct hd_struct *part = dev_to_part(dev);
+
+	if (!part->info)
+		goto no_match;
+
+	if (memcmp(uuid, part->info->uuid, sizeof(part->info->uuid)))
+			goto no_match;
+
+	return 1;
+no_match:
+	return 0;
+}
+
+
+/**
+ * devt_from_partuuid - looks up the dev_t of a partition by its UUID
+ * @uuid:	36 byte char array containing a hex ascii UUID
+ *
+ * The function will return the first partition which contains a matching
+ * UUID value in its partition_meta_info struct.  This does not search
+ * by filesystem UUIDs.
+ *
+ * Returns the matching dev_t on success or 0 on failure.
+ */
+static dev_t __init devt_from_partuuid(char *uuid_str)
+{
+	dev_t res = 0;
+	struct device *dev = NULL;
+	u8 uuid[16];
+
+	/* Pack the requested UUID in the expected format. */
+	part_pack_uuid(uuid_str, uuid);
+
+	dev = class_find_device(&block_class, NULL, uuid, &match_dev_by_uuid);
+	if (!dev)
+		goto done;
+
+	res = dev->devt;
+	put_device(dev);
+
+done:
+	return res;
+}
+
 /*
  *	Convert a name into device number.  We accept the following variants:
  *
@@ -68,6 +122,8 @@ __setup("rw", readwrite);
  *         of partition - device number of disk plus the partition number
  *	5) /dev/<disk_name>p<decimal> - same as the above, that form is
  *	   used when disk name of partitioned disk ends on a digit.
+ *	6) PARTUUID=00112233-4455-6677-8899-AABBCCDDEEFF representing the
+ *	   unique id of a partition if the partition table provides it.
  *
  *	If name doesn't have fall into the categories above, we return (0,0).
  *	block_class is used to check if something is a disk name. If the disk
@@ -82,6 +138,16 @@ dev_t name_to_dev_t(char *name)
 	dev_t res = 0;
 	int part;
 
+	if (strncmp(name, "PARTUUID=", 9) == 0) {
+		name += 9;
+		if (strlen(name) != 36)
+			goto fail;
+		res = devt_from_partuuid(name);
+		if (!res)
+			goto fail;
+		goto done;
+	}
+
 	if (strncmp(name, "/dev/", 5) != 0) {
 		unsigned maj, min;
 
-- 
cgit v1.2.3


From 2610a25406087ef797f4187e7f82dd04335056c7 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@gmail.com>
Date: Thu, 16 Sep 2010 12:55:57 +0900
Subject: sg: fix a warning in blk_rq_aligned() call

2nd argument of blk_rq_aligned() has changed to 'unsigned long' by
the previous commit 'block: fix an address space warning in blk-map.c'.
That commit neglected to update a user of that function.

Signed-off-by: Namhyung Kim <namhyung@gmail.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 drivers/scsi/sg.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index 78d616315d8e..655ab920cff4 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -1657,7 +1657,7 @@ static int sg_start_req(Sg_request *srp, unsigned char *cmd)
 	if (sg_allow_dio && hp->flags & SG_FLAG_DIRECT_IO &&
 	    dxfer_dir != SG_DXFER_UNKNOWN && !iov_count &&
 	    !sfp->parentdp->device->host->unchecked_isa_dma &&
-	    blk_rq_aligned(q, hp->dxferp, dxfer_len))
+	    blk_rq_aligned(q, (unsigned long)hp->dxferp, dxfer_len))
 		md = NULL;
 	else
 		md = &map_data;
-- 
cgit v1.2.3


From 8786fb70ccb36c7cff64680bb80c46d3a09d44db Mon Sep 17 00:00:00 2001
From: Jens Axboe <jaxboe@fusionio.com>
Date: Thu, 16 Sep 2010 08:27:34 +0200
Subject: aic7xxx_old: removed unused 'req' variable

After the patch "block: remove spurious uses of REQ_HARDBARRIER",
we no longer use 'req' in aic7xxx_buildscb(). Kill it.

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 drivers/scsi/aic7xxx_old.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/scsi/aic7xxx_old.c b/drivers/scsi/aic7xxx_old.c
index e1cd6062776c..aee73fafccc8 100644
--- a/drivers/scsi/aic7xxx_old.c
+++ b/drivers/scsi/aic7xxx_old.c
@@ -10119,7 +10119,6 @@ static void aic7xxx_buildscb(struct aic7xxx_host *p, struct scsi_cmnd *cmd,
   struct aic_dev_data *aic_dev = cmd->device->hostdata;
   struct scsi_device *sdptr = cmd->device;
   unsigned char tindex = TARGET_INDEX(cmd);
-  struct request *req = cmd->request;
   int use_sg;
 
   mask = (0x01 << tindex);
-- 
cgit v1.2.3


From 38b6f45a97bbb8536cc7d095b577f580bd4d643e Mon Sep 17 00:00:00 2001
From: Jens Axboe <jaxboe@fusionio.com>
Date: Thu, 16 Sep 2010 08:33:54 +0200
Subject: core: match_dev_by_uuid() should not be marked __init

It is also called outside the scope of init functions. Stephen
reports:

WARNING: init/mounts.o(.text+0x21a): Section mismatch in reference from the function name_to_dev_t() to the function .init.text:match_dev_by_uuid()
The function name_to_dev_t() references
the function __init match_dev_by_uuid().
This is often because name_to_dev_t lacks a __init
annotation or the annotation of match_dev_by_uuid is wrong.

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 init/do_mounts.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/init/do_mounts.c b/init/do_mounts.c
index 804f9c6ba216..b7fc83994f39 100644
--- a/init/do_mounts.c
+++ b/init/do_mounts.c
@@ -65,7 +65,7 @@ __setup("rw", readwrite);
  *
  * Returns 1 if the device matches, and 0 otherwise.
  */
-static int __init match_dev_by_uuid(struct device *dev, void *data)
+static int match_dev_by_uuid(struct device *dev, void *data)
 {
 	u8 *uuid = data;
 	struct hd_struct *part = dev_to_part(dev);
-- 
cgit v1.2.3


From af41d7bd9b685ab4e8f930627874ba4f4728e128 Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@redhat.com>
Date: Wed, 15 Sep 2010 17:06:32 -0400
Subject: blk-cgroup: Kill the header printed at the start of
 blkio.weight_device file

o Kill extra "dev weight" header which is printed when somebody reads
  blkio.weight_device file. This really seems to be out of convention. No other
  blkio files are printing any header at the start of file. I think it is ok
  to just print values and how to interpret values should be part of
  documentation.

Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-cgroup.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index a6809645d212..c1a39d90d14a 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -811,8 +811,6 @@ static int blkiocg_weight_device_read(struct cgroup *cgrp, struct cftype *cft,
 	struct blkio_cgroup *blkcg;
 	struct blkio_policy_node *pn;
 
-	seq_printf(m, "dev\tweight\n");
-
 	blkcg = cgroup_to_blkio_cgroup(cgrp);
 	if (!list_empty(&blkcg->policy_list)) {
 		spin_lock_irq(&blkcg->lock);
-- 
cgit v1.2.3


From 062a644d6121d5e2f51c0b2ca0cbc5155ebf845b Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@redhat.com>
Date: Wed, 15 Sep 2010 17:06:33 -0400
Subject: blk-cgroup: Prepare the base for supporting more than one IO control
 policies

o This patch prepares the base for introducing new IO control policies.
  Currently all the code is written knowing there is only one policy
  and that is proportional bandwidth. Creating infrastructure for newer
  policies to come in.

o Also there were many functions which were generated using macro. It was
  very confusing. Got rid of those.

Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-cgroup.c  | 544 +++++++++++++++++++++++++++++++++++++---------------
 block/blk-cgroup.h  |  36 +++-
 block/cfq-iosched.c |   1 +
 block/cfq.h         |   2 +-
 4 files changed, 429 insertions(+), 154 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index c1a39d90d14a..7762987fdf9e 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -37,6 +37,12 @@ static void blkiocg_attach(struct cgroup_subsys *, struct cgroup *,
 static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *);
 static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *);
 
+/* for encoding cft->private value on file */
+#define BLKIOFILE_PRIVATE(x, val)	(((x) << 16) | (val))
+/* What policy owns the file, proportional or throttle */
+#define BLKIOFILE_POLICY(val)		(((val) >> 16) & 0xffff)
+#define BLKIOFILE_ATTR(val)		((val) & 0xffff)
+
 struct cgroup_subsys blkio_subsys = {
 	.name = "blkio",
 	.create = blkiocg_create,
@@ -59,6 +65,27 @@ static inline void blkio_policy_insert_node(struct blkio_cgroup *blkcg,
 	list_add(&pn->node, &blkcg->policy_list);
 }
 
+static inline bool cftype_blkg_same_policy(struct cftype *cft,
+			struct blkio_group *blkg)
+{
+	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
+
+	if (blkg->plid == plid)
+		return 1;
+
+	return 0;
+}
+
+/* Determines if policy node matches cgroup file being accessed */
+static inline bool pn_matches_cftype(struct cftype *cft,
+			struct blkio_policy_node *pn)
+{
+	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
+	int fileid = BLKIOFILE_ATTR(cft->private);
+
+	return (plid == pn->plid && fileid == pn->fileid);
+}
+
 /* Must be called with blkcg->lock held */
 static inline void blkio_policy_delete_node(struct blkio_policy_node *pn)
 {
@@ -67,12 +94,13 @@ static inline void blkio_policy_delete_node(struct blkio_policy_node *pn)
 
 /* Must be called with blkcg->lock held */
 static struct blkio_policy_node *
-blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev)
+blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev,
+		enum blkio_policy_id plid, int fileid)
 {
 	struct blkio_policy_node *pn;
 
 	list_for_each_entry(pn, &blkcg->policy_list, node) {
-		if (pn->dev == dev)
+		if (pn->dev == dev && pn->plid == plid && pn->fileid == fileid)
 			return pn;
 	}
 
@@ -86,6 +114,20 @@ struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
 }
 EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);
 
+static inline void
+blkio_update_group_weight(struct blkio_group *blkg, unsigned int weight)
+{
+	struct blkio_policy_type *blkiop;
+
+	list_for_each_entry(blkiop, &blkio_list, list) {
+		/* If this policy does not own the blkg, do not send updates */
+		if (blkiop->plid != blkg->plid)
+			continue;
+		if (blkiop->ops.blkio_update_group_weight_fn)
+			blkiop->ops.blkio_update_group_weight_fn(blkg, weight);
+	}
+}
+
 /*
  * Add to the appropriate stat variable depending on the request type.
  * This should be called with the blkg->stats_lock held.
@@ -341,7 +383,8 @@ void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction,
 EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats);
 
 void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
-			struct blkio_group *blkg, void *key, dev_t dev)
+		struct blkio_group *blkg, void *key, dev_t dev,
+		enum blkio_policy_id plid)
 {
 	unsigned long flags;
 
@@ -350,6 +393,7 @@ void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
 	rcu_assign_pointer(blkg->key, key);
 	blkg->blkcg_id = css_id(&blkcg->css);
 	hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
+	blkg->plid = plid;
 	spin_unlock_irqrestore(&blkcg->lock, flags);
 	/* Need to take css reference ? */
 	cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));
@@ -408,51 +452,6 @@ struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key)
 }
 EXPORT_SYMBOL_GPL(blkiocg_lookup_group);
 
-#define SHOW_FUNCTION(__VAR)						\
-static u64 blkiocg_##__VAR##_read(struct cgroup *cgroup,		\
-				       struct cftype *cftype)		\
-{									\
-	struct blkio_cgroup *blkcg;					\
-									\
-	blkcg = cgroup_to_blkio_cgroup(cgroup);				\
-	return (u64)blkcg->__VAR;					\
-}
-
-SHOW_FUNCTION(weight);
-#undef SHOW_FUNCTION
-
-static int
-blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val)
-{
-	struct blkio_cgroup *blkcg;
-	struct blkio_group *blkg;
-	struct hlist_node *n;
-	struct blkio_policy_type *blkiop;
-	struct blkio_policy_node *pn;
-
-	if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
-		return -EINVAL;
-
-	blkcg = cgroup_to_blkio_cgroup(cgroup);
-	spin_lock(&blkio_list_lock);
-	spin_lock_irq(&blkcg->lock);
-	blkcg->weight = (unsigned int)val;
-
-	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
-		pn = blkio_policy_search_node(blkcg, blkg->dev);
-
-		if (pn)
-			continue;
-
-		list_for_each_entry(blkiop, &blkio_list, list)
-			blkiop->ops.blkio_update_group_weight_fn(blkg,
-					blkcg->weight);
-	}
-	spin_unlock_irq(&blkcg->lock);
-	spin_unlock(&blkio_list_lock);
-	return 0;
-}
-
 static int
 blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
 {
@@ -593,52 +592,6 @@ static uint64_t blkio_get_stat(struct blkio_group *blkg,
 	return disk_total;
 }
 
-#define SHOW_FUNCTION_PER_GROUP(__VAR, type, show_total)		\
-static int blkiocg_##__VAR##_read(struct cgroup *cgroup,		\
-		struct cftype *cftype, struct cgroup_map_cb *cb)	\
-{									\
-	struct blkio_cgroup *blkcg;					\
-	struct blkio_group *blkg;					\
-	struct hlist_node *n;						\
-	uint64_t cgroup_total = 0;					\
-									\
-	if (!cgroup_lock_live_group(cgroup))				\
-		return -ENODEV;						\
-									\
-	blkcg = cgroup_to_blkio_cgroup(cgroup);				\
-	rcu_read_lock();						\
-	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {\
-		if (blkg->dev) {					\
-			spin_lock_irq(&blkg->stats_lock);		\
-			cgroup_total += blkio_get_stat(blkg, cb,	\
-						blkg->dev, type);	\
-			spin_unlock_irq(&blkg->stats_lock);		\
-		}							\
-	}								\
-	if (show_total)							\
-		cb->fill(cb, "Total", cgroup_total);			\
-	rcu_read_unlock();						\
-	cgroup_unlock();						\
-	return 0;							\
-}
-
-SHOW_FUNCTION_PER_GROUP(time, BLKIO_STAT_TIME, 0);
-SHOW_FUNCTION_PER_GROUP(sectors, BLKIO_STAT_SECTORS, 0);
-SHOW_FUNCTION_PER_GROUP(io_service_bytes, BLKIO_STAT_SERVICE_BYTES, 1);
-SHOW_FUNCTION_PER_GROUP(io_serviced, BLKIO_STAT_SERVICED, 1);
-SHOW_FUNCTION_PER_GROUP(io_service_time, BLKIO_STAT_SERVICE_TIME, 1);
-SHOW_FUNCTION_PER_GROUP(io_wait_time, BLKIO_STAT_WAIT_TIME, 1);
-SHOW_FUNCTION_PER_GROUP(io_merged, BLKIO_STAT_MERGED, 1);
-SHOW_FUNCTION_PER_GROUP(io_queued, BLKIO_STAT_QUEUED, 1);
-#ifdef CONFIG_DEBUG_BLK_CGROUP
-SHOW_FUNCTION_PER_GROUP(dequeue, BLKIO_STAT_DEQUEUE, 0);
-SHOW_FUNCTION_PER_GROUP(avg_queue_size, BLKIO_STAT_AVG_QUEUE_SIZE, 0);
-SHOW_FUNCTION_PER_GROUP(group_wait_time, BLKIO_STAT_GROUP_WAIT_TIME, 0);
-SHOW_FUNCTION_PER_GROUP(idle_time, BLKIO_STAT_IDLE_TIME, 0);
-SHOW_FUNCTION_PER_GROUP(empty_time, BLKIO_STAT_EMPTY_TIME, 0);
-#endif
-#undef SHOW_FUNCTION_PER_GROUP
-
 static int blkio_check_dev_num(dev_t dev)
 {
 	int part = 0;
@@ -652,7 +605,7 @@ static int blkio_check_dev_num(dev_t dev)
 }
 
 static int blkio_policy_parse_and_set(char *buf,
-				      struct blkio_policy_node *newpn)
+	struct blkio_policy_node *newpn, enum blkio_policy_id plid, int fileid)
 {
 	char *s[4], *p, *major_s = NULL, *minor_s = NULL;
 	int ret;
@@ -705,12 +658,20 @@ static int blkio_policy_parse_and_set(char *buf,
 	if (s[1] == NULL)
 		return -EINVAL;
 
-	ret = strict_strtoul(s[1], 10, &temp);
-	if (ret || (temp < BLKIO_WEIGHT_MIN && temp > 0) ||
-	    temp > BLKIO_WEIGHT_MAX)
-		return -EINVAL;
+	switch (plid) {
+	case BLKIO_POLICY_PROP:
+		ret = strict_strtoul(s[1], 10, &temp);
+		if (ret || (temp < BLKIO_WEIGHT_MIN && temp > 0) ||
+			temp > BLKIO_WEIGHT_MAX)
+			return -EINVAL;
 
-	newpn->weight =  temp;
+		newpn->plid = plid;
+		newpn->fileid = fileid;
+		newpn->weight = temp;
+		break;
+	default:
+		BUG();
+	}
 
 	return 0;
 }
@@ -720,7 +681,8 @@ unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
 {
 	struct blkio_policy_node *pn;
 
-	pn = blkio_policy_search_node(blkcg, dev);
+	pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_PROP,
+				BLKIO_PROP_weight_device);
 	if (pn)
 		return pn->weight;
 	else
@@ -728,18 +690,86 @@ unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
 }
 EXPORT_SYMBOL_GPL(blkcg_get_weight);
 
+/* Checks whether user asked for deleting a policy rule */
+static bool blkio_delete_rule_command(struct blkio_policy_node *pn)
+{
+	switch(pn->plid) {
+	case BLKIO_POLICY_PROP:
+		if (pn->weight == 0)
+			return 1;
+		break;
+	default:
+		BUG();
+	}
+
+	return 0;
+}
+
+static void blkio_update_policy_rule(struct blkio_policy_node *oldpn,
+					struct blkio_policy_node *newpn)
+{
+	switch(oldpn->plid) {
+	case BLKIO_POLICY_PROP:
+		oldpn->weight = newpn->weight;
+		break;
+	default:
+		BUG();
+	}
+}
+
+/*
+ * Some rules/values in blkg have changed. Propogate those to respective
+ * policies.
+ */
+static void blkio_update_blkg_policy(struct blkio_cgroup *blkcg,
+		struct blkio_group *blkg, struct blkio_policy_node *pn)
+{
+	unsigned int weight;
+
+	switch(pn->plid) {
+	case BLKIO_POLICY_PROP:
+		weight = pn->weight ? pn->weight :
+				blkcg->weight;
+		blkio_update_group_weight(blkg, weight);
+		break;
+	default:
+		BUG();
+	}
+}
 
-static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft,
-				       const char *buffer)
+/*
+ * A policy node rule has been updated. Propogate this update to all the
+ * block groups which might be affected by this update.
+ */
+static void blkio_update_policy_node_blkg(struct blkio_cgroup *blkcg,
+				struct blkio_policy_node *pn)
+{
+	struct blkio_group *blkg;
+	struct hlist_node *n;
+
+	spin_lock(&blkio_list_lock);
+	spin_lock_irq(&blkcg->lock);
+
+	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
+		if (pn->dev != blkg->dev || pn->plid != blkg->plid)
+			continue;
+		blkio_update_blkg_policy(blkcg, blkg, pn);
+	}
+
+	spin_unlock_irq(&blkcg->lock);
+	spin_unlock(&blkio_list_lock);
+}
+
+static int blkiocg_file_write(struct cgroup *cgrp, struct cftype *cft,
+ 				       const char *buffer)
 {
 	int ret = 0;
 	char *buf;
 	struct blkio_policy_node *newpn, *pn;
 	struct blkio_cgroup *blkcg;
-	struct blkio_group *blkg;
 	int keep_newpn = 0;
-	struct hlist_node *n;
-	struct blkio_policy_type *blkiop;
+	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
+	int fileid = BLKIOFILE_ATTR(cft->private);
 
 	buf = kstrdup(buffer, GFP_KERNEL);
 	if (!buf)
@@ -751,7 +781,7 @@ static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft,
 		goto free_buf;
 	}
 
-	ret = blkio_policy_parse_and_set(buf, newpn);
+	ret = blkio_policy_parse_and_set(buf, newpn, plid, fileid);
 	if (ret)
 		goto free_newpn;
 
@@ -759,9 +789,9 @@ static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft,
 
 	spin_lock_irq(&blkcg->lock);
 
-	pn = blkio_policy_search_node(blkcg, newpn->dev);
+	pn = blkio_policy_search_node(blkcg, newpn->dev, plid, fileid);
 	if (!pn) {
-		if (newpn->weight != 0) {
+		if (!blkio_delete_rule_command(newpn)) {
 			blkio_policy_insert_node(blkcg, newpn);
 			keep_newpn = 1;
 		}
@@ -769,33 +799,17 @@ static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft,
 		goto update_io_group;
 	}
 
-	if (newpn->weight == 0) {
-		/* weight == 0 means deleteing a specific weight */
+	if (blkio_delete_rule_command(newpn)) {
 		blkio_policy_delete_node(pn);
 		spin_unlock_irq(&blkcg->lock);
 		goto update_io_group;
 	}
 	spin_unlock_irq(&blkcg->lock);
 
-	pn->weight = newpn->weight;
+	blkio_update_policy_rule(pn, newpn);
 
 update_io_group:
-	/* update weight for each cfqg */
-	spin_lock(&blkio_list_lock);
-	spin_lock_irq(&blkcg->lock);
-
-	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
-		if (newpn->dev == blkg->dev) {
-			list_for_each_entry(blkiop, &blkio_list, list)
-				blkiop->ops.blkio_update_group_weight_fn(blkg,
-							 newpn->weight ?
-							 newpn->weight :
-							 blkcg->weight);
-		}
-	}
-
-	spin_unlock_irq(&blkcg->lock);
-	spin_unlock(&blkio_list_lock);
+	blkio_update_policy_node_blkg(blkcg, newpn);
 
 free_newpn:
 	if (!keep_newpn)
@@ -805,21 +819,219 @@ free_buf:
 	return ret;
 }
 
-static int blkiocg_weight_device_read(struct cgroup *cgrp, struct cftype *cft,
-				      struct seq_file *m)
+static void
+blkio_print_policy_node(struct seq_file *m, struct blkio_policy_node *pn)
+{
+	switch(pn->plid) {
+		case BLKIO_POLICY_PROP:
+			if (pn->fileid == BLKIO_PROP_weight_device)
+				seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev),
+					MINOR(pn->dev), pn->weight);
+			break;
+		default:
+			BUG();
+	}
+}
+
+/* cgroup files which read their data from policy nodes end up here */
+static void blkio_read_policy_node_files(struct cftype *cft,
+			struct blkio_cgroup *blkcg, struct seq_file *m)
 {
-	struct blkio_cgroup *blkcg;
 	struct blkio_policy_node *pn;
 
-	blkcg = cgroup_to_blkio_cgroup(cgrp);
 	if (!list_empty(&blkcg->policy_list)) {
 		spin_lock_irq(&blkcg->lock);
 		list_for_each_entry(pn, &blkcg->policy_list, node) {
-			seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev),
-				   MINOR(pn->dev), pn->weight);
+			if (!pn_matches_cftype(cft, pn))
+				continue;
+			blkio_print_policy_node(m, pn);
 		}
 		spin_unlock_irq(&blkcg->lock);
 	}
+}
+
+static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft,
+				struct seq_file *m)
+{
+	struct blkio_cgroup *blkcg;
+	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
+	int name = BLKIOFILE_ATTR(cft->private);
+
+	blkcg = cgroup_to_blkio_cgroup(cgrp);
+
+	switch(plid) {
+	case BLKIO_POLICY_PROP:
+		switch(name) {
+		case BLKIO_PROP_weight_device:
+			blkio_read_policy_node_files(cft, blkcg, m);
+			return 0;
+		default:
+			BUG();
+		}
+		break;
+	default:
+		BUG();
+	}
+
+	return 0;
+}
+
+static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg,
+		struct cftype *cft, struct cgroup_map_cb *cb, enum stat_type type,
+		bool show_total)
+{
+	struct blkio_group *blkg;
+	struct hlist_node *n;
+	uint64_t cgroup_total = 0;
+
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {
+		if (blkg->dev) {
+			if (!cftype_blkg_same_policy(cft, blkg))
+				continue;
+			spin_lock_irq(&blkg->stats_lock);
+			cgroup_total += blkio_get_stat(blkg, cb, blkg->dev,
+						type);
+			spin_unlock_irq(&blkg->stats_lock);
+		}
+	}
+	if (show_total)
+		cb->fill(cb, "Total", cgroup_total);
+	rcu_read_unlock();
+	return 0;
+}
+
+/* All map kind of cgroup file get serviced by this function */
+static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft,
+				struct cgroup_map_cb *cb)
+{
+	struct blkio_cgroup *blkcg;
+	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
+	int name = BLKIOFILE_ATTR(cft->private);
+
+	blkcg = cgroup_to_blkio_cgroup(cgrp);
+
+	switch(plid) {
+	case BLKIO_POLICY_PROP:
+		switch(name) {
+		case BLKIO_PROP_time:
+			return blkio_read_blkg_stats(blkcg, cft, cb,
+						BLKIO_STAT_TIME, 0);
+		case BLKIO_PROP_sectors:
+			return blkio_read_blkg_stats(blkcg, cft, cb,
+						BLKIO_STAT_SECTORS, 0);
+		case BLKIO_PROP_io_service_bytes:
+			return blkio_read_blkg_stats(blkcg, cft, cb,
+						BLKIO_STAT_SERVICE_BYTES, 1);
+		case BLKIO_PROP_io_serviced:
+			return blkio_read_blkg_stats(blkcg, cft, cb,
+						BLKIO_STAT_SERVICED, 1);
+		case BLKIO_PROP_io_service_time:
+			return blkio_read_blkg_stats(blkcg, cft, cb,
+						BLKIO_STAT_SERVICE_TIME, 1);
+		case BLKIO_PROP_io_wait_time:
+			return blkio_read_blkg_stats(blkcg, cft, cb,
+						BLKIO_STAT_WAIT_TIME, 1);
+		case BLKIO_PROP_io_merged:
+			return blkio_read_blkg_stats(blkcg, cft, cb,
+						BLKIO_STAT_MERGED, 1);
+		case BLKIO_PROP_io_queued:
+			return blkio_read_blkg_stats(blkcg, cft, cb,
+						BLKIO_STAT_QUEUED, 1);
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+		case BLKIO_PROP_dequeue:
+			return blkio_read_blkg_stats(blkcg, cft, cb,
+						BLKIO_STAT_DEQUEUE, 0);
+		case BLKIO_PROP_avg_queue_size:
+			return blkio_read_blkg_stats(blkcg, cft, cb,
+						BLKIO_STAT_AVG_QUEUE_SIZE, 0);
+		case BLKIO_PROP_group_wait_time:
+			return blkio_read_blkg_stats(blkcg, cft, cb,
+						BLKIO_STAT_GROUP_WAIT_TIME, 0);
+		case BLKIO_PROP_idle_time:
+			return blkio_read_blkg_stats(blkcg, cft, cb,
+						BLKIO_STAT_IDLE_TIME, 0);
+		case BLKIO_PROP_empty_time:
+			return blkio_read_blkg_stats(blkcg, cft, cb,
+						BLKIO_STAT_EMPTY_TIME, 0);
+#endif
+		default:
+			BUG();
+		}
+		break;
+
+	default:
+		BUG();
+	}
+
+	return 0;
+}
+
+static int blkio_weight_write(struct blkio_cgroup *blkcg, u64 val)
+{
+	struct blkio_group *blkg;
+	struct hlist_node *n;
+	struct blkio_policy_node *pn;
+
+	if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
+		return -EINVAL;
+
+	spin_lock(&blkio_list_lock);
+	spin_lock_irq(&blkcg->lock);
+	blkcg->weight = (unsigned int)val;
+
+	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
+		pn = blkio_policy_search_node(blkcg, blkg->dev,
+				BLKIO_POLICY_PROP, BLKIO_PROP_weight_device);
+		if (pn)
+			continue;
+
+		blkio_update_group_weight(blkg, blkcg->weight);
+	}
+	spin_unlock_irq(&blkcg->lock);
+	spin_unlock(&blkio_list_lock);
+	return 0;
+}
+
+static u64 blkiocg_file_read_u64 (struct cgroup *cgrp, struct cftype *cft) {
+	struct blkio_cgroup *blkcg;
+	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
+	int name = BLKIOFILE_ATTR(cft->private);
+
+	blkcg = cgroup_to_blkio_cgroup(cgrp);
+
+	switch(plid) {
+	case BLKIO_POLICY_PROP:
+		switch(name) {
+		case BLKIO_PROP_weight:
+			return (u64)blkcg->weight;
+		}
+		break;
+	default:
+		BUG();
+	}
+	return 0;
+}
+
+static int
+blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
+{
+	struct blkio_cgroup *blkcg;
+	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
+	int name = BLKIOFILE_ATTR(cft->private);
+
+	blkcg = cgroup_to_blkio_cgroup(cgrp);
+
+	switch(plid) {
+	case BLKIO_POLICY_PROP:
+		switch(name) {
+		case BLKIO_PROP_weight:
+			return blkio_weight_write(blkcg, val);
+		}
+		break;
+	default:
+		BUG();
+	}
 
 	return 0;
 }
@@ -827,46 +1039,66 @@ static int blkiocg_weight_device_read(struct cgroup *cgrp, struct cftype *cft,
 struct cftype blkio_files[] = {
 	{
 		.name = "weight_device",
-		.read_seq_string = blkiocg_weight_device_read,
-		.write_string = blkiocg_weight_device_write,
+		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
+				BLKIO_PROP_weight_device),
+		.read_seq_string = blkiocg_file_read,
+		.write_string = blkiocg_file_write,
 		.max_write_len = 256,
 	},
 	{
 		.name = "weight",
-		.read_u64 = blkiocg_weight_read,
-		.write_u64 = blkiocg_weight_write,
+		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
+				BLKIO_PROP_weight),
+		.read_u64 = blkiocg_file_read_u64,
+		.write_u64 = blkiocg_file_write_u64,
 	},
 	{
 		.name = "time",
-		.read_map = blkiocg_time_read,
+		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
+				BLKIO_PROP_time),
+		.read_map = blkiocg_file_read_map,
 	},
 	{
 		.name = "sectors",
-		.read_map = blkiocg_sectors_read,
+		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
+				BLKIO_PROP_sectors),
+		.read_map = blkiocg_file_read_map,
 	},
 	{
 		.name = "io_service_bytes",
-		.read_map = blkiocg_io_service_bytes_read,
+		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
+				BLKIO_PROP_io_service_bytes),
+		.read_map = blkiocg_file_read_map,
 	},
 	{
 		.name = "io_serviced",
-		.read_map = blkiocg_io_serviced_read,
+		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
+				BLKIO_PROP_io_serviced),
+		.read_map = blkiocg_file_read_map,
 	},
 	{
 		.name = "io_service_time",
-		.read_map = blkiocg_io_service_time_read,
+		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
+				BLKIO_PROP_io_service_time),
+		.read_map = blkiocg_file_read_map,
 	},
 	{
 		.name = "io_wait_time",
-		.read_map = blkiocg_io_wait_time_read,
+		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
+				BLKIO_PROP_io_wait_time),
+		.read_map = blkiocg_file_read_map,
 	},
 	{
 		.name = "io_merged",
-		.read_map = blkiocg_io_merged_read,
+		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
+				BLKIO_PROP_io_merged),
+		.read_map = blkiocg_file_read_map,
 	},
 	{
 		.name = "io_queued",
-		.read_map = blkiocg_io_queued_read,
+		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
+				BLKIO_PROP_io_queued),
+		.read_map = blkiocg_file_read_map,
 	},
 	{
 		.name = "reset_stats",
@@ -875,23 +1107,33 @@ struct cftype blkio_files[] = {
 #ifdef CONFIG_DEBUG_BLK_CGROUP
 	{
 		.name = "avg_queue_size",
-		.read_map = blkiocg_avg_queue_size_read,
+		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
+				BLKIO_PROP_avg_queue_size),
+		.read_map = blkiocg_file_read_map,
 	},
 	{
 		.name = "group_wait_time",
-		.read_map = blkiocg_group_wait_time_read,
+		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
+				BLKIO_PROP_group_wait_time),
+		.read_map = blkiocg_file_read_map,
 	},
 	{
 		.name = "idle_time",
-		.read_map = blkiocg_idle_time_read,
+		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
+				BLKIO_PROP_idle_time),
+		.read_map = blkiocg_file_read_map,
 	},
 	{
 		.name = "empty_time",
-		.read_map = blkiocg_empty_time_read,
+		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
+				BLKIO_PROP_empty_time),
+		.read_map = blkiocg_file_read_map,
 	},
 	{
 		.name = "dequeue",
-		.read_map = blkiocg_dequeue_read,
+		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
+				BLKIO_PROP_dequeue),
+		.read_map = blkiocg_file_read_map,
 	},
 #endif
 };
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 2b866ec1dcea..c8de2598429d 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -15,6 +15,10 @@
 
 #include <linux/cgroup.h>
 
+enum blkio_policy_id {
+	BLKIO_POLICY_PROP = 0,		/* Proportional Bandwidth division */
+};
+
 #if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE)
 
 #ifndef CONFIG_BLK_CGROUP
@@ -65,6 +69,25 @@ enum blkg_state_flags {
 	BLKG_empty,
 };
 
+/* cgroup files owned by proportional weight policy */
+enum blkcg_file_name_prop {
+	BLKIO_PROP_weight = 1,
+	BLKIO_PROP_weight_device,
+	BLKIO_PROP_io_service_bytes,
+	BLKIO_PROP_io_serviced,
+	BLKIO_PROP_time,
+	BLKIO_PROP_sectors,
+	BLKIO_PROP_io_service_time,
+	BLKIO_PROP_io_wait_time,
+	BLKIO_PROP_io_merged,
+	BLKIO_PROP_io_queued,
+	BLKIO_PROP_avg_queue_size,
+	BLKIO_PROP_group_wait_time,
+	BLKIO_PROP_idle_time,
+	BLKIO_PROP_empty_time,
+	BLKIO_PROP_dequeue,
+};
+
 struct blkio_cgroup {
 	struct cgroup_subsys_state css;
 	unsigned int weight;
@@ -112,6 +135,8 @@ struct blkio_group {
 	char path[128];
 	/* The device MKDEV(major, minor), this group has been created for */
 	dev_t dev;
+	/* policy which owns this blk group */
+	enum blkio_policy_id plid;
 
 	/* Need to serialize the stats in the case of reset/update */
 	spinlock_t stats_lock;
@@ -122,6 +147,10 @@ struct blkio_policy_node {
 	struct list_head node;
 	dev_t dev;
 	unsigned int weight;
+	/* This node belongs to max bw policy or porportional weight policy */
+	enum blkio_policy_id plid;
+	/* cgroup file to which this rule belongs to */
+	int fileid;
 };
 
 extern unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
@@ -139,6 +168,7 @@ struct blkio_policy_ops {
 struct blkio_policy_type {
 	struct list_head list;
 	struct blkio_policy_ops ops;
+	enum blkio_policy_id plid;
 };
 
 /* Blkio controller policy registration */
@@ -212,7 +242,8 @@ static inline void blkiocg_set_start_empty_time(struct blkio_group *blkg) {}
 extern struct blkio_cgroup blkio_root_cgroup;
 extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup);
 extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
-			struct blkio_group *blkg, void *key, dev_t dev);
+	struct blkio_group *blkg, void *key, dev_t dev,
+	enum blkio_policy_id plid);
 extern int blkiocg_del_blkio_group(struct blkio_group *blkg);
 extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg,
 						void *key);
@@ -234,7 +265,8 @@ static inline struct blkio_cgroup *
 cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; }
 
 static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
-			struct blkio_group *blkg, void *key, dev_t dev) {}
+		struct blkio_group *blkg, void *key, dev_t dev,
+		enum blkio_policy_id plid) {}
 
 static inline int
 blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; }
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index eb4086f7dfef..b9f86190763b 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -4013,6 +4013,7 @@ static struct blkio_policy_type blkio_policy_cfq = {
 		.blkio_unlink_group_fn =	cfq_unlink_blkio_group,
 		.blkio_update_group_weight_fn =	cfq_update_blkio_group_weight,
 	},
+	.plid = BLKIO_POLICY_PROP,
 };
 #else
 static struct blkio_policy_type blkio_policy_cfq;
diff --git a/block/cfq.h b/block/cfq.h
index 93448e5a2e41..54a6d90f8e8c 100644
--- a/block/cfq.h
+++ b/block/cfq.h
@@ -69,7 +69,7 @@ static inline void cfq_blkiocg_update_completion_stats(struct blkio_group *blkg,
 
 static inline void cfq_blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
 			struct blkio_group *blkg, void *key, dev_t dev) {
-	blkiocg_add_blkio_group(blkcg, blkg, key, dev);
+	blkiocg_add_blkio_group(blkcg, blkg, key, dev, BLKIO_POLICY_PROP);
 }
 
 static inline int cfq_blkiocg_del_blkio_group(struct blkio_group *blkg)
-- 
cgit v1.2.3


From 4c9eefa16c6f124ffcc736cb719b24ea27f85017 Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@redhat.com>
Date: Wed, 15 Sep 2010 17:06:34 -0400
Subject: blk-cgroup: Introduce cgroup changes for throttling policy

o cgroup chagnes for throttle policy.

o Introduces READ and WRITE bytes per second throttling rules.

Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-cgroup.c | 145 ++++++++++++++++++++++++++++++++++++++++++++++++++---
 block/blk-cgroup.h |  30 ++++++++++-
 2 files changed, 167 insertions(+), 8 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 7762987fdf9e..aae8c930a6f8 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -128,6 +128,27 @@ blkio_update_group_weight(struct blkio_group *blkg, unsigned int weight)
 	}
 }
 
+static inline void blkio_update_group_bps(struct blkio_group *blkg, u64 bps,
+				int fileid)
+{
+	struct blkio_policy_type *blkiop;
+
+	list_for_each_entry(blkiop, &blkio_list, list) {
+
+		/* If this policy does not own the blkg, do not send updates */
+		if (blkiop->plid != blkg->plid)
+			continue;
+
+		if (fileid == BLKIO_THROTL_read_bps_device
+		    && blkiop->ops.blkio_update_group_read_bps_fn)
+			blkiop->ops.blkio_update_group_read_bps_fn(blkg, bps);
+
+		if (fileid == BLKIO_THROTL_write_bps_device
+		    && blkiop->ops.blkio_update_group_write_bps_fn)
+			blkiop->ops.blkio_update_group_write_bps_fn(blkg, bps);
+	}
+}
+
 /*
  * Add to the appropriate stat variable depending on the request type.
  * This should be called with the blkg->stats_lock held.
@@ -612,6 +633,7 @@ static int blkio_policy_parse_and_set(char *buf,
 	unsigned long major, minor, temp;
 	int i = 0;
 	dev_t dev;
+	u64 bps;
 
 	memset(s, 0, sizeof(s));
 
@@ -667,7 +689,16 @@ static int blkio_policy_parse_and_set(char *buf,
 
 		newpn->plid = plid;
 		newpn->fileid = fileid;
-		newpn->weight = temp;
+		newpn->val.weight = temp;
+		break;
+	case BLKIO_POLICY_THROTL:
+		ret = strict_strtoull(s[1], 10, &bps);
+		if (ret)
+			return -EINVAL;
+
+		newpn->plid = plid;
+		newpn->fileid = fileid;
+		newpn->val.bps = bps;
 		break;
 	default:
 		BUG();
@@ -684,18 +715,45 @@ unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
 	pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_PROP,
 				BLKIO_PROP_weight_device);
 	if (pn)
-		return pn->weight;
+		return pn->val.weight;
 	else
 		return blkcg->weight;
 }
 EXPORT_SYMBOL_GPL(blkcg_get_weight);
 
+uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, dev_t dev)
+{
+	struct blkio_policy_node *pn;
+
+	pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
+				BLKIO_THROTL_read_bps_device);
+	if (pn)
+		return pn->val.bps;
+	else
+		return -1;
+}
+
+uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg, dev_t dev)
+{
+	struct blkio_policy_node *pn;
+	pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
+				BLKIO_THROTL_write_bps_device);
+	if (pn)
+		return pn->val.bps;
+	else
+		return -1;
+}
+
 /* Checks whether user asked for deleting a policy rule */
 static bool blkio_delete_rule_command(struct blkio_policy_node *pn)
 {
 	switch(pn->plid) {
 	case BLKIO_POLICY_PROP:
-		if (pn->weight == 0)
+		if (pn->val.weight == 0)
+			return 1;
+		break;
+	case BLKIO_POLICY_THROTL:
+		if (pn->val.bps == 0)
 			return 1;
 		break;
 	default:
@@ -710,7 +768,10 @@ static void blkio_update_policy_rule(struct blkio_policy_node *oldpn,
 {
 	switch(oldpn->plid) {
 	case BLKIO_POLICY_PROP:
-		oldpn->weight = newpn->weight;
+		oldpn->val.weight = newpn->val.weight;
+		break;
+	case BLKIO_POLICY_THROTL:
+		oldpn->val.bps = newpn->val.bps;
 		break;
 	default:
 		BUG();
@@ -725,13 +786,23 @@ static void blkio_update_blkg_policy(struct blkio_cgroup *blkcg,
 		struct blkio_group *blkg, struct blkio_policy_node *pn)
 {
 	unsigned int weight;
+	u64 bps;
 
 	switch(pn->plid) {
 	case BLKIO_POLICY_PROP:
-		weight = pn->weight ? pn->weight :
+		weight = pn->val.weight ? pn->val.weight :
 				blkcg->weight;
 		blkio_update_group_weight(blkg, weight);
 		break;
+	case BLKIO_POLICY_THROTL:
+		switch(pn->fileid) {
+		case BLKIO_THROTL_read_bps_device:
+		case BLKIO_THROTL_write_bps_device:
+			bps = pn->val.bps ? pn->val.bps : (-1);
+			blkio_update_group_bps(blkg, bps, pn->fileid);
+			break;
+		}
+		break;
 	default:
 		BUG();
 	}
@@ -826,7 +897,17 @@ blkio_print_policy_node(struct seq_file *m, struct blkio_policy_node *pn)
 		case BLKIO_POLICY_PROP:
 			if (pn->fileid == BLKIO_PROP_weight_device)
 				seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev),
-					MINOR(pn->dev), pn->weight);
+					MINOR(pn->dev), pn->val.weight);
+			break;
+		case BLKIO_POLICY_THROTL:
+			if (pn->fileid == BLKIO_THROTL_read_bps_device)
+				seq_printf(m, "%u:%u\t%llu\n", MAJOR(pn->dev),
+					MINOR(pn->dev), pn->val.bps);
+			else if (pn->fileid == BLKIO_THROTL_write_bps_device)
+				seq_printf(m, "%u:%u\t%llu\n", MAJOR(pn->dev),
+					MINOR(pn->dev), pn->val.bps);
+			else
+				BUG();
 			break;
 		default:
 			BUG();
@@ -869,6 +950,16 @@ static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft,
 			BUG();
 		}
 		break;
+	case BLKIO_POLICY_THROTL:
+		switch(name){
+		case BLKIO_THROTL_read_bps_device:
+		case BLKIO_THROTL_write_bps_device:
+			blkio_read_policy_node_files(cft, blkcg, m);
+			return 0;
+		default:
+			BUG();
+		}
+		break;
 	default:
 		BUG();
 	}
@@ -959,7 +1050,18 @@ static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft,
 			BUG();
 		}
 		break;
-
+	case BLKIO_POLICY_THROTL:
+		switch(name){
+		case BLKIO_THROTL_io_service_bytes:
+			return blkio_read_blkg_stats(blkcg, cft, cb,
+						BLKIO_STAT_SERVICE_BYTES, 1);
+		case BLKIO_THROTL_io_serviced:
+			return blkio_read_blkg_stats(blkcg, cft, cb,
+						BLKIO_STAT_SERVICED, 1);
+		default:
+			BUG();
+		}
+		break;
 	default:
 		BUG();
 	}
@@ -1052,6 +1154,23 @@ struct cftype blkio_files[] = {
 		.read_u64 = blkiocg_file_read_u64,
 		.write_u64 = blkiocg_file_write_u64,
 	},
+	{
+		.name = "throttle.read_bps_device",
+		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
+				BLKIO_THROTL_read_bps_device),
+		.read_seq_string = blkiocg_file_read,
+		.write_string = blkiocg_file_write,
+		.max_write_len = 256,
+	},
+
+	{
+		.name = "throttle.write_bps_device",
+		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
+				BLKIO_THROTL_write_bps_device),
+		.read_seq_string = blkiocg_file_read,
+		.write_string = blkiocg_file_write,
+		.max_write_len = 256,
+	},
 	{
 		.name = "time",
 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
@@ -1070,12 +1189,24 @@ struct cftype blkio_files[] = {
 				BLKIO_PROP_io_service_bytes),
 		.read_map = blkiocg_file_read_map,
 	},
+	{
+		.name = "throttle.io_service_bytes",
+		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
+				BLKIO_THROTL_io_service_bytes),
+		.read_map = blkiocg_file_read_map,
+	},
 	{
 		.name = "io_serviced",
 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
 				BLKIO_PROP_io_serviced),
 		.read_map = blkiocg_file_read_map,
 	},
+	{
+		.name = "throttle.io_serviced",
+		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
+				BLKIO_THROTL_io_serviced),
+		.read_map = blkiocg_file_read_map,
+	},
 	{
 		.name = "io_service_time",
 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index c8de2598429d..1b738827b2f6 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -17,6 +17,7 @@
 
 enum blkio_policy_id {
 	BLKIO_POLICY_PROP = 0,		/* Proportional Bandwidth division */
+	BLKIO_POLICY_THROTL,		/* Throttling */
 };
 
 #if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE)
@@ -88,6 +89,14 @@ enum blkcg_file_name_prop {
 	BLKIO_PROP_dequeue,
 };
 
+/* cgroup files owned by throttle policy */
+enum blkcg_file_name_throtl {
+	BLKIO_THROTL_read_bps_device,
+	BLKIO_THROTL_write_bps_device,
+	BLKIO_THROTL_io_service_bytes,
+	BLKIO_THROTL_io_serviced,
+};
+
 struct blkio_cgroup {
 	struct cgroup_subsys_state css;
 	unsigned int weight;
@@ -146,23 +155,42 @@ struct blkio_group {
 struct blkio_policy_node {
 	struct list_head node;
 	dev_t dev;
-	unsigned int weight;
 	/* This node belongs to max bw policy or porportional weight policy */
 	enum blkio_policy_id plid;
 	/* cgroup file to which this rule belongs to */
 	int fileid;
+
+	union {
+		unsigned int weight;
+		/*
+		 * Rate read/write in terms of byptes per second
+		 * Whether this rate represents read or write is determined
+		 * by file type "fileid".
+		 */
+		u64 bps;
+	} val;
 };
 
 extern unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
 				     dev_t dev);
+extern uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg,
+				     dev_t dev);
+extern uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg,
+				     dev_t dev);
 
 typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg);
 typedef void (blkio_update_group_weight_fn) (struct blkio_group *blkg,
 						unsigned int weight);
+typedef void (blkio_update_group_read_bps_fn) (struct blkio_group *blkg,
+						u64 read_bps);
+typedef void (blkio_update_group_write_bps_fn) (struct blkio_group *blkg,
+						u64 write_bps);
 
 struct blkio_policy_ops {
 	blkio_unlink_group_fn *blkio_unlink_group_fn;
 	blkio_update_group_weight_fn *blkio_update_group_weight_fn;
+	blkio_update_group_read_bps_fn *blkio_update_group_read_bps_fn;
+	blkio_update_group_write_bps_fn *blkio_update_group_write_bps_fn;
 };
 
 struct blkio_policy_type {
-- 
cgit v1.2.3


From e43473b7f223ec866f7db273697e76c337c390f9 Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@redhat.com>
Date: Wed, 15 Sep 2010 17:06:35 -0400
Subject: blkio: Core implementation of throttle policy

o Actual implementation of throttling policy in block layer. Currently it
  implements READ and WRITE bytes per second throttling logic. IOPS throttling
  comes in later patches.

Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/Kconfig             |  12 +
 block/Makefile            |   1 +
 block/blk-core.c          |  24 ++
 block/blk-throttle.c      | 909 ++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/blk_types.h |   3 +
 include/linux/blkdev.h    |  24 ++
 init/Kconfig              |   9 +-
 7 files changed, 979 insertions(+), 3 deletions(-)
 create mode 100644 block/blk-throttle.c

diff --git a/block/Kconfig b/block/Kconfig
index 9be0b56eaee1..6c9213ef15a1 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -77,6 +77,18 @@ config BLK_DEV_INTEGRITY
 	T10/SCSI Data Integrity Field or the T13/ATA External Path
 	Protection.  If in doubt, say N.
 
+config BLK_DEV_THROTTLING
+	bool "Block layer bio throttling support"
+	depends on BLK_CGROUP=y && EXPERIMENTAL
+	default n
+	---help---
+	Block layer bio throttling support. It can be used to limit
+	the IO rate to a device. IO rate policies are per cgroup and
+	one needs to mount and use blkio cgroup controller for creating
+	cgroups and specifying per device IO rate policies.
+
+	See Documentation/cgroups/blkio-controller.txt for more information.
+
 endif # BLOCK
 
 config BLOCK_COMPAT
diff --git a/block/Makefile b/block/Makefile
index 0bb499a739cd..c850d5ef80a2 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -9,6 +9,7 @@ obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \
 
 obj-$(CONFIG_BLK_DEV_BSG)	+= bsg.o
 obj-$(CONFIG_BLK_CGROUP)	+= blk-cgroup.o
+obj-$(CONFIG_BLK_DEV_THROTTLING)	+= blk-throttle.o
 obj-$(CONFIG_IOSCHED_NOOP)	+= noop-iosched.o
 obj-$(CONFIG_IOSCHED_DEADLINE)	+= deadline-iosched.o
 obj-$(CONFIG_IOSCHED_CFQ)	+= cfq-iosched.o
diff --git a/block/blk-core.c b/block/blk-core.c
index 8d07c1b7e701..797d5095eb83 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -382,6 +382,7 @@ void blk_sync_queue(struct request_queue *q)
 	del_timer_sync(&q->unplug_timer);
 	del_timer_sync(&q->timeout);
 	cancel_work_sync(&q->unplug_work);
+	throtl_shutdown_timer_wq(q);
 }
 EXPORT_SYMBOL(blk_sync_queue);
 
@@ -459,6 +460,8 @@ void blk_cleanup_queue(struct request_queue *q)
 	if (q->elevator)
 		elevator_exit(q->elevator);
 
+	blk_throtl_exit(q);
+
 	blk_put_queue(q);
 }
 EXPORT_SYMBOL(blk_cleanup_queue);
@@ -515,6 +518,11 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 		return NULL;
 	}
 
+	if (blk_throtl_init(q)) {
+		kmem_cache_free(blk_requestq_cachep, q);
+		return NULL;
+	}
+
 	setup_timer(&q->backing_dev_info.laptop_mode_wb_timer,
 		    laptop_mode_timer_fn, (unsigned long) q);
 	init_timer(&q->unplug_timer);
@@ -1522,6 +1530,15 @@ static inline void __generic_make_request(struct bio *bio)
 			goto end_io;
 		}
 
+		blk_throtl_bio(q, &bio);
+
+		/*
+		 * If bio = NULL, bio has been throttled and will be submitted
+		 * later.
+		 */
+		if (!bio)
+			break;
+
 		trace_block_bio_queue(q, bio);
 
 		ret = q->make_request_fn(q, bio);
@@ -2580,6 +2597,13 @@ int kblockd_schedule_work(struct request_queue *q, struct work_struct *work)
 }
 EXPORT_SYMBOL(kblockd_schedule_work);
 
+int kblockd_schedule_delayed_work(struct request_queue *q,
+			struct delayed_work *dwork, unsigned long delay)
+{
+	return queue_delayed_work(kblockd_workqueue, dwork, delay);
+}
+EXPORT_SYMBOL(kblockd_schedule_delayed_work);
+
 int __init blk_dev_init(void)
 {
 	BUILD_BUG_ON(__REQ_NR_BITS > 8 *
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
new file mode 100644
index 000000000000..4b492011e0de
--- /dev/null
+++ b/block/blk-throttle.c
@@ -0,0 +1,909 @@
+/*
+ * Interface for controlling IO bandwidth on a request queue
+ *
+ * Copyright (C) 2010 Vivek Goyal <vgoyal@redhat.com>
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <linux/bio.h>
+#include <linux/blktrace_api.h>
+#include "blk-cgroup.h"
+
+/* Max dispatch from a group in 1 round */
+static int throtl_grp_quantum = 8;
+
+/* Total max dispatch from all groups in one round */
+static int throtl_quantum = 32;
+
+/* Throttling is performed over 100ms slice and after that slice is renewed */
+static unsigned long throtl_slice = HZ/10;	/* 100 ms */
+
+struct throtl_rb_root {
+	struct rb_root rb;
+	struct rb_node *left;
+	unsigned int count;
+	unsigned long min_disptime;
+};
+
+#define THROTL_RB_ROOT	(struct throtl_rb_root) { .rb = RB_ROOT, .left = NULL, \
+			.count = 0, .min_disptime = 0}
+
+#define rb_entry_tg(node)	rb_entry((node), struct throtl_grp, rb_node)
+
+struct throtl_grp {
+	/* List of throtl groups on the request queue*/
+	struct hlist_node tg_node;
+
+	/* active throtl group service_tree member */
+	struct rb_node rb_node;
+
+	/*
+	 * Dispatch time in jiffies. This is the estimated time when group
+	 * will unthrottle and is ready to dispatch more bio. It is used as
+	 * key to sort active groups in service tree.
+	 */
+	unsigned long disptime;
+
+	struct blkio_group blkg;
+	atomic_t ref;
+	unsigned int flags;
+
+	/* Two lists for READ and WRITE */
+	struct bio_list bio_lists[2];
+
+	/* Number of queued bios on READ and WRITE lists */
+	unsigned int nr_queued[2];
+
+	/* bytes per second rate limits */
+	uint64_t bps[2];
+
+	/* Number of bytes disptached in current slice */
+	uint64_t bytes_disp[2];
+
+	/* When did we start a new slice */
+	unsigned long slice_start[2];
+	unsigned long slice_end[2];
+};
+
+struct throtl_data
+{
+	/* List of throtl groups */
+	struct hlist_head tg_list;
+
+	/* service tree for active throtl groups */
+	struct throtl_rb_root tg_service_tree;
+
+	struct throtl_grp root_tg;
+	struct request_queue *queue;
+
+	/* Total Number of queued bios on READ and WRITE lists */
+	unsigned int nr_queued[2];
+
+	/*
+	 * number of total undestroyed groups (excluding root group)
+	 */
+	unsigned int nr_undestroyed_grps;
+
+	/* Work for dispatching throttled bios */
+	struct delayed_work throtl_work;
+};
+
+enum tg_state_flags {
+	THROTL_TG_FLAG_on_rr = 0,	/* on round-robin busy list */
+};
+
+#define THROTL_TG_FNS(name)						\
+static inline void throtl_mark_tg_##name(struct throtl_grp *tg)		\
+{									\
+	(tg)->flags |= (1 << THROTL_TG_FLAG_##name);			\
+}									\
+static inline void throtl_clear_tg_##name(struct throtl_grp *tg)	\
+{									\
+	(tg)->flags &= ~(1 << THROTL_TG_FLAG_##name);			\
+}									\
+static inline int throtl_tg_##name(const struct throtl_grp *tg)		\
+{									\
+	return ((tg)->flags & (1 << THROTL_TG_FLAG_##name)) != 0;	\
+}
+
+THROTL_TG_FNS(on_rr);
+
+#define throtl_log_tg(td, tg, fmt, args...)				\
+	blk_add_trace_msg((td)->queue, "throtl %s " fmt,		\
+				blkg_path(&(tg)->blkg), ##args);      	\
+
+#define throtl_log(td, fmt, args...)	\
+	blk_add_trace_msg((td)->queue, "throtl " fmt, ##args)
+
+static inline struct throtl_grp *tg_of_blkg(struct blkio_group *blkg)
+{
+	if (blkg)
+		return container_of(blkg, struct throtl_grp, blkg);
+
+	return NULL;
+}
+
+static inline int total_nr_queued(struct throtl_data *td)
+{
+	return (td->nr_queued[0] + td->nr_queued[1]);
+}
+
+static inline struct throtl_grp *throtl_ref_get_tg(struct throtl_grp *tg)
+{
+	atomic_inc(&tg->ref);
+	return tg;
+}
+
+static void throtl_put_tg(struct throtl_grp *tg)
+{
+	BUG_ON(atomic_read(&tg->ref) <= 0);
+	if (!atomic_dec_and_test(&tg->ref))
+		return;
+	kfree(tg);
+}
+
+static struct throtl_grp * throtl_find_alloc_tg(struct throtl_data *td,
+			struct cgroup *cgroup)
+{
+	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
+	struct throtl_grp *tg = NULL;
+	void *key = td;
+	struct backing_dev_info *bdi = &td->queue->backing_dev_info;
+	unsigned int major, minor;
+
+	/*
+	 * TODO: Speed up blkiocg_lookup_group() by maintaining a radix
+	 * tree of blkg (instead of traversing through hash list all
+	 * the time.
+	 */
+	tg = tg_of_blkg(blkiocg_lookup_group(blkcg, key));
+
+	/* Fill in device details for root group */
+	if (tg && !tg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
+		sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
+		tg->blkg.dev = MKDEV(major, minor);
+		goto done;
+	}
+
+	if (tg)
+		goto done;
+
+	tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, td->queue->node);
+	if (!tg)
+		goto done;
+
+	INIT_HLIST_NODE(&tg->tg_node);
+	RB_CLEAR_NODE(&tg->rb_node);
+	bio_list_init(&tg->bio_lists[0]);
+	bio_list_init(&tg->bio_lists[1]);
+
+	/*
+	 * Take the initial reference that will be released on destroy
+	 * This can be thought of a joint reference by cgroup and
+	 * request queue which will be dropped by either request queue
+	 * exit or cgroup deletion path depending on who is exiting first.
+	 */
+	atomic_set(&tg->ref, 1);
+
+	/* Add group onto cgroup list */
+	sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
+	blkiocg_add_blkio_group(blkcg, &tg->blkg, (void *)td,
+				MKDEV(major, minor), BLKIO_POLICY_THROTL);
+
+	tg->bps[READ] = blkcg_get_read_bps(blkcg, tg->blkg.dev);
+	tg->bps[WRITE] = blkcg_get_write_bps(blkcg, tg->blkg.dev);
+
+	hlist_add_head(&tg->tg_node, &td->tg_list);
+	td->nr_undestroyed_grps++;
+done:
+	return tg;
+}
+
+static struct throtl_grp * throtl_get_tg(struct throtl_data *td)
+{
+	struct cgroup *cgroup;
+	struct throtl_grp *tg = NULL;
+
+	rcu_read_lock();
+	cgroup = task_cgroup(current, blkio_subsys_id);
+	tg = throtl_find_alloc_tg(td, cgroup);
+	if (!tg)
+		tg = &td->root_tg;
+	rcu_read_unlock();
+	return tg;
+}
+
+static struct throtl_grp *throtl_rb_first(struct throtl_rb_root *root)
+{
+	/* Service tree is empty */
+	if (!root->count)
+		return NULL;
+
+	if (!root->left)
+		root->left = rb_first(&root->rb);
+
+	if (root->left)
+		return rb_entry_tg(root->left);
+
+	return NULL;
+}
+
+static void rb_erase_init(struct rb_node *n, struct rb_root *root)
+{
+	rb_erase(n, root);
+	RB_CLEAR_NODE(n);
+}
+
+static void throtl_rb_erase(struct rb_node *n, struct throtl_rb_root *root)
+{
+	if (root->left == n)
+		root->left = NULL;
+	rb_erase_init(n, &root->rb);
+	--root->count;
+}
+
+static void update_min_dispatch_time(struct throtl_rb_root *st)
+{
+	struct throtl_grp *tg;
+
+	tg = throtl_rb_first(st);
+	if (!tg)
+		return;
+
+	st->min_disptime = tg->disptime;
+}
+
+static void
+tg_service_tree_add(struct throtl_rb_root *st, struct throtl_grp *tg)
+{
+	struct rb_node **node = &st->rb.rb_node;
+	struct rb_node *parent = NULL;
+	struct throtl_grp *__tg;
+	unsigned long key = tg->disptime;
+	int left = 1;
+
+	while (*node != NULL) {
+		parent = *node;
+		__tg = rb_entry_tg(parent);
+
+		if (time_before(key, __tg->disptime))
+			node = &parent->rb_left;
+		else {
+			node = &parent->rb_right;
+			left = 0;
+		}
+	}
+
+	if (left)
+		st->left = &tg->rb_node;
+
+	rb_link_node(&tg->rb_node, parent, node);
+	rb_insert_color(&tg->rb_node, &st->rb);
+}
+
+static void __throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg)
+{
+	struct throtl_rb_root *st = &td->tg_service_tree;
+
+	tg_service_tree_add(st, tg);
+	throtl_mark_tg_on_rr(tg);
+	st->count++;
+}
+
+static void throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg)
+{
+	if (!throtl_tg_on_rr(tg))
+		__throtl_enqueue_tg(td, tg);
+}
+
+static void __throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg)
+{
+	throtl_rb_erase(&tg->rb_node, &td->tg_service_tree);
+	throtl_clear_tg_on_rr(tg);
+}
+
+static void throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg)
+{
+	if (throtl_tg_on_rr(tg))
+		__throtl_dequeue_tg(td, tg);
+}
+
+static void throtl_schedule_next_dispatch(struct throtl_data *td)
+{
+	struct throtl_rb_root *st = &td->tg_service_tree;
+
+	/*
+	 * If there are more bios pending, schedule more work.
+	 */
+	if (!total_nr_queued(td))
+		return;
+
+	BUG_ON(!st->count);
+
+	update_min_dispatch_time(st);
+
+	if (time_before_eq(st->min_disptime, jiffies))
+		throtl_schedule_delayed_work(td->queue, 0);
+	else
+		throtl_schedule_delayed_work(td->queue,
+				(st->min_disptime - jiffies));
+}
+
+static inline void
+throtl_start_new_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
+{
+	tg->bytes_disp[rw] = 0;
+	tg->slice_start[rw] = jiffies;
+	tg->slice_end[rw] = jiffies + throtl_slice;
+	throtl_log_tg(td, tg, "[%c] new slice start=%lu end=%lu jiffies=%lu",
+			rw == READ ? 'R' : 'W', tg->slice_start[rw],
+			tg->slice_end[rw], jiffies);
+}
+
+static inline void throtl_extend_slice(struct throtl_data *td,
+		struct throtl_grp *tg, bool rw, unsigned long jiffy_end)
+{
+	tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
+	throtl_log_tg(td, tg, "[%c] extend slice start=%lu end=%lu jiffies=%lu",
+			rw == READ ? 'R' : 'W', tg->slice_start[rw],
+			tg->slice_end[rw], jiffies);
+}
+
+/* Determine if previously allocated or extended slice is complete or not */
+static bool
+throtl_slice_used(struct throtl_data *td, struct throtl_grp *tg, bool rw)
+{
+	if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw]))
+		return 0;
+
+	return 1;
+}
+
+/* Trim the used slices and adjust slice start accordingly */
+static inline void
+throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
+{
+	unsigned long nr_slices, bytes_trim, time_elapsed;
+
+	BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw]));
+
+	/*
+	 * If bps are unlimited (-1), then time slice don't get
+	 * renewed. Don't try to trim the slice if slice is used. A new
+	 * slice will start when appropriate.
+	 */
+	if (throtl_slice_used(td, tg, rw))
+		return;
+
+	time_elapsed = jiffies - tg->slice_start[rw];
+
+	nr_slices = time_elapsed / throtl_slice;
+
+	if (!nr_slices)
+		return;
+
+	bytes_trim = (tg->bps[rw] * throtl_slice * nr_slices)/HZ;
+
+	if (!bytes_trim)
+		return;
+
+	if (tg->bytes_disp[rw] >= bytes_trim)
+		tg->bytes_disp[rw] -= bytes_trim;
+	else
+		tg->bytes_disp[rw] = 0;
+
+	tg->slice_start[rw] += nr_slices * throtl_slice;
+
+	throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%lu"
+			" start=%lu end=%lu jiffies=%lu",
+			rw == READ ? 'R' : 'W', nr_slices, bytes_trim,
+			tg->slice_start[rw], tg->slice_end[rw], jiffies);
+}
+
+/*
+ * Returns whether one can dispatch a bio or not. Also returns approx number
+ * of jiffies to wait before this bio is with-in IO rate and can be dispatched
+ */
+static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
+				struct bio *bio, unsigned long *wait)
+{
+	bool rw = bio_data_dir(bio);
+	u64 bytes_allowed, extra_bytes;
+	unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
+
+	/*
+	 * Currently whole state machine of group depends on first bio
+	 * queued in the group bio list. So one should not be calling
+	 * this function with a different bio if there are other bios
+	 * queued.
+	 */
+	BUG_ON(tg->nr_queued[rw] && bio != bio_list_peek(&tg->bio_lists[rw]));
+
+	/* If tg->bps = -1, then BW is unlimited */
+	if (tg->bps[rw] == -1) {
+		if (wait)
+			*wait = 0;
+		return 1;
+	}
+
+	/*
+	 * If previous slice expired, start a new one otherwise renew/extend
+	 * existing slice to make sure it is at least throtl_slice interval
+	 * long since now.
+	 */
+	if (throtl_slice_used(td, tg, rw))
+		throtl_start_new_slice(td, tg, rw);
+	else {
+		if (time_before(tg->slice_end[rw], jiffies + throtl_slice))
+			throtl_extend_slice(td, tg, rw, jiffies + throtl_slice);
+	}
+
+	jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
+
+	/* Slice has just started. Consider one slice interval */
+	if (!jiffy_elapsed)
+		jiffy_elapsed_rnd = throtl_slice;
+
+	jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);
+
+	bytes_allowed = (tg->bps[rw] * jiffies_to_msecs(jiffy_elapsed_rnd))
+				/ MSEC_PER_SEC;
+
+	if (tg->bytes_disp[rw] + bio->bi_size <= bytes_allowed) {
+		if (wait)
+			*wait = 0;
+		return 1;
+	}
+
+	/* Calc approx time to dispatch */
+	extra_bytes = tg->bytes_disp[rw] + bio->bi_size - bytes_allowed;
+	jiffy_wait = div64_u64(extra_bytes * HZ, tg->bps[rw]);
+
+	if (!jiffy_wait)
+		jiffy_wait = 1;
+
+	/*
+	 * This wait time is without taking into consideration the rounding
+	 * up we did. Add that time also.
+	 */
+	jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed);
+
+	if (wait)
+		*wait = jiffy_wait;
+
+	if (time_before(tg->slice_end[rw], jiffies + jiffy_wait))
+		throtl_extend_slice(td, tg, rw, jiffies + jiffy_wait);
+
+	return 0;
+}
+
+static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
+{
+	bool rw = bio_data_dir(bio);
+	bool sync = bio->bi_rw & REQ_SYNC;
+
+	/* Charge the bio to the group */
+	tg->bytes_disp[rw] += bio->bi_size;
+
+	/*
+	 * TODO: This will take blkg->stats_lock. Figure out a way
+	 * to avoid this cost.
+	 */
+	blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, rw, sync);
+
+}
+
+static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg,
+			struct bio *bio)
+{
+	bool rw = bio_data_dir(bio);
+
+	bio_list_add(&tg->bio_lists[rw], bio);
+	/* Take a bio reference on tg */
+	throtl_ref_get_tg(tg);
+	tg->nr_queued[rw]++;
+	td->nr_queued[rw]++;
+	throtl_enqueue_tg(td, tg);
+}
+
+static void tg_update_disptime(struct throtl_data *td, struct throtl_grp *tg)
+{
+	unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime;
+	struct bio *bio;
+
+	if ((bio = bio_list_peek(&tg->bio_lists[READ])))
+		tg_may_dispatch(td, tg, bio, &read_wait);
+
+	if ((bio = bio_list_peek(&tg->bio_lists[WRITE])))
+		tg_may_dispatch(td, tg, bio, &write_wait);
+
+	min_wait = min(read_wait, write_wait);
+	disptime = jiffies + min_wait;
+
+	/*
+	 * If group is already on active tree, then update dispatch time
+	 * only if it is lesser than existing dispatch time. Otherwise
+	 * always update the dispatch time
+	 */
+
+	if (throtl_tg_on_rr(tg) && time_before(disptime, tg->disptime))
+		return;
+
+	/* Update dispatch time */
+	throtl_dequeue_tg(td, tg);
+	tg->disptime = disptime;
+	throtl_enqueue_tg(td, tg);
+}
+
+static void tg_dispatch_one_bio(struct throtl_data *td, struct throtl_grp *tg,
+				bool rw, struct bio_list *bl)
+{
+	struct bio *bio;
+
+	bio = bio_list_pop(&tg->bio_lists[rw]);
+	tg->nr_queued[rw]--;
+	/* Drop bio reference on tg */
+	throtl_put_tg(tg);
+
+	BUG_ON(td->nr_queued[rw] <= 0);
+	td->nr_queued[rw]--;
+
+	throtl_charge_bio(tg, bio);
+	bio_list_add(bl, bio);
+	bio->bi_rw |= REQ_THROTTLED;
+
+	throtl_trim_slice(td, tg, rw);
+}
+
+static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg,
+				struct bio_list *bl)
+{
+	unsigned int nr_reads = 0, nr_writes = 0;
+	unsigned int max_nr_reads = throtl_grp_quantum*3/4;
+	unsigned int max_nr_writes = throtl_grp_quantum - nr_reads;
+	struct bio *bio;
+
+	/* Try to dispatch 75% READS and 25% WRITES */
+
+	while ((bio = bio_list_peek(&tg->bio_lists[READ]))
+		&& tg_may_dispatch(td, tg, bio, NULL)) {
+
+		tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl);
+		nr_reads++;
+
+		if (nr_reads >= max_nr_reads)
+			break;
+	}
+
+	while ((bio = bio_list_peek(&tg->bio_lists[WRITE]))
+		&& tg_may_dispatch(td, tg, bio, NULL)) {
+
+		tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl);
+		nr_writes++;
+
+		if (nr_writes >= max_nr_writes)
+			break;
+	}
+
+	return nr_reads + nr_writes;
+}
+
+static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl)
+{
+	unsigned int nr_disp = 0;
+	struct throtl_grp *tg;
+	struct throtl_rb_root *st = &td->tg_service_tree;
+
+	while (1) {
+		tg = throtl_rb_first(st);
+
+		if (!tg)
+			break;
+
+		if (time_before(jiffies, tg->disptime))
+			break;
+
+		throtl_dequeue_tg(td, tg);
+
+		nr_disp += throtl_dispatch_tg(td, tg, bl);
+
+		if (tg->nr_queued[0] || tg->nr_queued[1]) {
+			tg_update_disptime(td, tg);
+			throtl_enqueue_tg(td, tg);
+		}
+
+		if (nr_disp >= throtl_quantum)
+			break;
+	}
+
+	return nr_disp;
+}
+
+/* Dispatch throttled bios. Should be called without queue lock held. */
+static int throtl_dispatch(struct request_queue *q)
+{
+	struct throtl_data *td = q->td;
+	unsigned int nr_disp = 0;
+	struct bio_list bio_list_on_stack;
+	struct bio *bio;
+
+	spin_lock_irq(q->queue_lock);
+
+	if (!total_nr_queued(td))
+		goto out;
+
+	bio_list_init(&bio_list_on_stack);
+
+	throtl_log(td, "dispatch nr_queued=%lu read=%u write=%u",
+			total_nr_queued(td), td->nr_queued[READ],
+			td->nr_queued[WRITE]);
+
+	nr_disp = throtl_select_dispatch(td, &bio_list_on_stack);
+
+	if (nr_disp)
+		throtl_log(td, "bios disp=%u", nr_disp);
+
+	throtl_schedule_next_dispatch(td);
+out:
+	spin_unlock_irq(q->queue_lock);
+
+	/*
+	 * If we dispatched some requests, unplug the queue to make sure
+	 * immediate dispatch
+	 */
+	if (nr_disp) {
+		while((bio = bio_list_pop(&bio_list_on_stack)))
+			generic_make_request(bio);
+		blk_unplug(q);
+	}
+	return nr_disp;
+}
+
+void blk_throtl_work(struct work_struct *work)
+{
+	struct throtl_data *td = container_of(work, struct throtl_data,
+					throtl_work.work);
+	struct request_queue *q = td->queue;
+
+	throtl_dispatch(q);
+}
+
+/* Call with queue lock held */
+void throtl_schedule_delayed_work(struct request_queue *q, unsigned long delay)
+{
+
+	struct throtl_data *td = q->td;
+	struct delayed_work *dwork = &td->throtl_work;
+
+	if (total_nr_queued(td) > 0) {
+		/*
+		 * We might have a work scheduled to be executed in future.
+		 * Cancel that and schedule a new one.
+		 */
+		__cancel_delayed_work(dwork);
+		kblockd_schedule_delayed_work(q, dwork, delay);
+		throtl_log(td, "schedule work. delay=%lu jiffies=%lu",
+				delay, jiffies);
+	}
+}
+EXPORT_SYMBOL(throtl_schedule_delayed_work);
+
+static void
+throtl_destroy_tg(struct throtl_data *td, struct throtl_grp *tg)
+{
+	/* Something wrong if we are trying to remove same group twice */
+	BUG_ON(hlist_unhashed(&tg->tg_node));
+
+	hlist_del_init(&tg->tg_node);
+
+	/*
+	 * Put the reference taken at the time of creation so that when all
+	 * queues are gone, group can be destroyed.
+	 */
+	throtl_put_tg(tg);
+	td->nr_undestroyed_grps--;
+}
+
+static void throtl_release_tgs(struct throtl_data *td)
+{
+	struct hlist_node *pos, *n;
+	struct throtl_grp *tg;
+
+	hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) {
+		/*
+		 * If cgroup removal path got to blk_group first and removed
+		 * it from cgroup list, then it will take care of destroying
+		 * cfqg also.
+		 */
+		if (!blkiocg_del_blkio_group(&tg->blkg))
+			throtl_destroy_tg(td, tg);
+	}
+}
+
+static void throtl_td_free(struct throtl_data *td)
+{
+	kfree(td);
+}
+
+/*
+ * Blk cgroup controller notification saying that blkio_group object is being
+ * delinked as associated cgroup object is going away. That also means that
+ * no new IO will come in this group. So get rid of this group as soon as
+ * any pending IO in the group is finished.
+ *
+ * This function is called under rcu_read_lock(). key is the rcu protected
+ * pointer. That means "key" is a valid throtl_data pointer as long as we are
+ * rcu read lock.
+ *
+ * "key" was fetched from blkio_group under blkio_cgroup->lock. That means
+ * it should not be NULL as even if queue was going away, cgroup deltion
+ * path got to it first.
+ */
+void throtl_unlink_blkio_group(void *key, struct blkio_group *blkg)
+{
+	unsigned long flags;
+	struct throtl_data *td = key;
+
+	spin_lock_irqsave(td->queue->queue_lock, flags);
+	throtl_destroy_tg(td, tg_of_blkg(blkg));
+	spin_unlock_irqrestore(td->queue->queue_lock, flags);
+}
+
+static void throtl_update_blkio_group_read_bps (struct blkio_group *blkg,
+			u64 read_bps)
+{
+	tg_of_blkg(blkg)->bps[READ] = read_bps;
+}
+
+static void throtl_update_blkio_group_write_bps (struct blkio_group *blkg,
+			u64 write_bps)
+{
+	tg_of_blkg(blkg)->bps[WRITE] = write_bps;
+}
+
+void throtl_shutdown_timer_wq(struct request_queue *q)
+{
+	struct throtl_data *td = q->td;
+
+	cancel_delayed_work_sync(&td->throtl_work);
+}
+
+static struct blkio_policy_type blkio_policy_throtl = {
+	.ops = {
+		.blkio_unlink_group_fn = throtl_unlink_blkio_group,
+		.blkio_update_group_read_bps_fn =
+					throtl_update_blkio_group_read_bps,
+		.blkio_update_group_write_bps_fn =
+					throtl_update_blkio_group_write_bps,
+	},
+};
+
+int blk_throtl_bio(struct request_queue *q, struct bio **biop)
+{
+	struct throtl_data *td = q->td;
+	struct throtl_grp *tg;
+	struct bio *bio = *biop;
+	bool rw = bio_data_dir(bio), update_disptime = true;
+
+	if (bio->bi_rw & REQ_THROTTLED) {
+		bio->bi_rw &= ~REQ_THROTTLED;
+		return 0;
+	}
+
+	spin_lock_irq(q->queue_lock);
+	tg = throtl_get_tg(td);
+
+	if (tg->nr_queued[rw]) {
+		/*
+		 * There is already another bio queued in same dir. No
+		 * need to update dispatch time.
+		 */
+		update_disptime = false;
+		goto queue_bio;
+	}
+
+	/* Bio is with-in rate limit of group */
+	if (tg_may_dispatch(td, tg, bio, NULL)) {
+		throtl_charge_bio(tg, bio);
+		goto out;
+	}
+
+queue_bio:
+	throtl_log_tg(td, tg, "[%c] bio. disp=%u sz=%u bps=%llu"
+			" queued=%d/%d", rw == READ ? 'R' : 'W',
+			tg->bytes_disp[rw], bio->bi_size, tg->bps[rw],
+			tg->nr_queued[READ], tg->nr_queued[WRITE]);
+
+	throtl_add_bio_tg(q->td, tg, bio);
+	*biop = NULL;
+
+	if (update_disptime) {
+		tg_update_disptime(td, tg);
+		throtl_schedule_next_dispatch(td);
+	}
+
+out:
+	spin_unlock_irq(q->queue_lock);
+	return 0;
+}
+
+int blk_throtl_init(struct request_queue *q)
+{
+	struct throtl_data *td;
+	struct throtl_grp *tg;
+
+	td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
+	if (!td)
+		return -ENOMEM;
+
+	INIT_HLIST_HEAD(&td->tg_list);
+	td->tg_service_tree = THROTL_RB_ROOT;
+
+	/* Init root group */
+	tg = &td->root_tg;
+	INIT_HLIST_NODE(&tg->tg_node);
+	RB_CLEAR_NODE(&tg->rb_node);
+	bio_list_init(&tg->bio_lists[0]);
+	bio_list_init(&tg->bio_lists[1]);
+
+	/* Practically unlimited BW */
+	tg->bps[0] = tg->bps[1] = -1;
+	atomic_set(&tg->ref, 1);
+
+	INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work);
+
+	rcu_read_lock();
+	blkiocg_add_blkio_group(&blkio_root_cgroup, &tg->blkg, (void *)td,
+					0, BLKIO_POLICY_THROTL);
+	rcu_read_unlock();
+
+	/* Attach throtl data to request queue */
+	td->queue = q;
+	q->td = td;
+	return 0;
+}
+
+void blk_throtl_exit(struct request_queue *q)
+{
+	struct throtl_data *td = q->td;
+	bool wait = false;
+
+	BUG_ON(!td);
+
+	throtl_shutdown_timer_wq(q);
+
+	spin_lock_irq(q->queue_lock);
+	throtl_release_tgs(td);
+	blkiocg_del_blkio_group(&td->root_tg.blkg);
+
+	/* If there are other groups */
+	if (td->nr_undestroyed_grps >= 1)
+		wait = true;
+
+	spin_unlock_irq(q->queue_lock);
+
+	/*
+	 * Wait for tg->blkg->key accessors to exit their grace periods.
+	 * Do this wait only if there are other undestroyed groups out
+	 * there (other than root group). This can happen if cgroup deletion
+	 * path claimed the responsibility of cleaning up a group before
+	 * queue cleanup code get to the group.
+	 *
+	 * Do not call synchronize_rcu() unconditionally as there are drivers
+	 * which create/delete request queue hundreds of times during scan/boot
+	 * and synchronize_rcu() can take significant time and slow down boot.
+	 */
+	if (wait)
+		synchronize_rcu();
+	throtl_td_free(td);
+}
+
+static int __init throtl_init(void)
+{
+	blkio_policy_register(&blkio_policy_throtl);
+	return 0;
+}
+
+module_init(throtl_init);
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index ca83a97c9715..10a0c291b55a 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -130,6 +130,8 @@ enum rq_flag_bits {
 	/* bio only flags */
 	__REQ_UNPLUG,		/* unplug the immediately after submission */
 	__REQ_RAHEAD,		/* read ahead, can fail anytime */
+	__REQ_THROTTLED,	/* This bio has already been subjected to
+				 * throttling rules. Don't do it again. */
 
 	/* request only flags */
 	__REQ_SORTED,		/* elevator knows about this request */
@@ -172,6 +174,7 @@ enum rq_flag_bits {
 
 #define REQ_UNPLUG		(1 << __REQ_UNPLUG)
 #define REQ_RAHEAD		(1 << __REQ_RAHEAD)
+#define REQ_THROTTLED		(1 << __REQ_THROTTLED)
 
 #define REQ_SORTED		(1 << __REQ_SORTED)
 #define REQ_SOFTBARRIER		(1 << __REQ_SOFTBARRIER)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 780824edac16..1341df5806df 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -371,6 +371,11 @@ struct request_queue
 #if defined(CONFIG_BLK_DEV_BSG)
 	struct bsg_class_device bsg_dev;
 #endif
+
+#ifdef CONFIG_BLK_DEV_THROTTLING
+	/* Throttle data */
+	struct throtl_data *td;
+#endif
 };
 
 #define QUEUE_FLAG_CLUSTER	0	/* cluster several segments into 1 */
@@ -1131,6 +1136,7 @@ static inline void put_dev_sector(Sector p)
 
 struct work_struct;
 int kblockd_schedule_work(struct request_queue *q, struct work_struct *work);
+int kblockd_schedule_delayed_work(struct request_queue *q, struct delayed_work *dwork, unsigned long delay);
 
 #ifdef CONFIG_BLK_CGROUP
 /*
@@ -1174,6 +1180,24 @@ static inline uint64_t rq_io_start_time_ns(struct request *req)
 }
 #endif
 
+#ifdef CONFIG_BLK_DEV_THROTTLING
+extern int blk_throtl_init(struct request_queue *q);
+extern void blk_throtl_exit(struct request_queue *q);
+extern int blk_throtl_bio(struct request_queue *q, struct bio **bio);
+extern void throtl_schedule_delayed_work(struct request_queue *q, unsigned long delay);
+extern void throtl_shutdown_timer_wq(struct request_queue *q);
+#else /* CONFIG_BLK_DEV_THROTTLING */
+static inline int blk_throtl_bio(struct request_queue *q, struct bio **bio)
+{
+	return 0;
+}
+
+static inline int blk_throtl_init(struct request_queue *q) { return 0; }
+static inline int blk_throtl_exit(struct request_queue *q) { return 0; }
+static inline void throtl_schedule_delayed_work(struct request_queue *q, unsigned long delay) {}
+static inline void throtl_shutdown_timer_wq(struct request_queue *q) {}
+#endif /* CONFIG_BLK_DEV_THROTTLING */
+
 #define MODULE_ALIAS_BLOCKDEV(major,minor) \
 	MODULE_ALIAS("block-major-" __stringify(major) "-" __stringify(minor))
 #define MODULE_ALIAS_BLOCKDEV_MAJOR(major) \
diff --git a/init/Kconfig b/init/Kconfig
index 2de5b1cbadd9..950ba26f7233 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -634,11 +634,14 @@ config BLK_CGROUP
 
 	Currently, CFQ IO scheduler uses it to recognize task groups and
 	control disk bandwidth allocation (proportional time slice allocation)
-	to such task groups.
+	to such task groups. It is also used by bio throttling logic in
+	block layer to implement upper limit in IO rates on a device.
 
 	This option only enables generic Block IO controller infrastructure.
-	One needs to also enable actual IO controlling logic in CFQ for it
-	to take effect. (CONFIG_CFQ_GROUP_IOSCHED=y).
+	One needs to also enable actual IO controlling logic/policy. For
+	enabling proportional weight division of disk bandwidth in CFQ seti
+	CONFIG_CFQ_GROUP_IOSCHED=y and for enabling throttling policy set
+	CONFIG_BLK_THROTTLE=y.
 
 	See Documentation/cgroups/blkio-controller.txt for more information.
 
-- 
cgit v1.2.3


From 7702e8f45b0a3bb262b9366c60beb5445758d94c Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@redhat.com>
Date: Wed, 15 Sep 2010 17:06:36 -0400
Subject: blk-cgroup: cgroup changes for IOPS limit support

o cgroup changes for IOPS throttling rules.

Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-cgroup.c | 139 ++++++++++++++++++++++++++++++++++++++++++++++-------
 block/blk-cgroup.h |  13 +++++
 2 files changed, 135 insertions(+), 17 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index aae8c930a6f8..20ce6f584e43 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -149,6 +149,27 @@ static inline void blkio_update_group_bps(struct blkio_group *blkg, u64 bps,
 	}
 }
 
+static inline void blkio_update_group_iops(struct blkio_group *blkg,
+			unsigned int iops, int fileid)
+{
+	struct blkio_policy_type *blkiop;
+
+	list_for_each_entry(blkiop, &blkio_list, list) {
+
+		/* If this policy does not own the blkg, do not send updates */
+		if (blkiop->plid != blkg->plid)
+			continue;
+
+		if (fileid == BLKIO_THROTL_read_iops_device
+		    && blkiop->ops.blkio_update_group_read_iops_fn)
+			blkiop->ops.blkio_update_group_read_iops_fn(blkg, iops);
+
+		if (fileid == BLKIO_THROTL_write_iops_device
+		    && blkiop->ops.blkio_update_group_write_iops_fn)
+			blkiop->ops.blkio_update_group_write_iops_fn(blkg,iops);
+	}
+}
+
 /*
  * Add to the appropriate stat variable depending on the request type.
  * This should be called with the blkg->stats_lock held.
@@ -630,7 +651,7 @@ static int blkio_policy_parse_and_set(char *buf,
 {
 	char *s[4], *p, *major_s = NULL, *minor_s = NULL;
 	int ret;
-	unsigned long major, minor, temp;
+	unsigned long major, minor, temp, iops;
 	int i = 0;
 	dev_t dev;
 	u64 bps;
@@ -692,13 +713,28 @@ static int blkio_policy_parse_and_set(char *buf,
 		newpn->val.weight = temp;
 		break;
 	case BLKIO_POLICY_THROTL:
-		ret = strict_strtoull(s[1], 10, &bps);
-		if (ret)
-			return -EINVAL;
+		switch(fileid) {
+		case BLKIO_THROTL_read_bps_device:
+		case BLKIO_THROTL_write_bps_device:
+			ret = strict_strtoull(s[1], 10, &bps);
+			if (ret)
+				return -EINVAL;
 
-		newpn->plid = plid;
-		newpn->fileid = fileid;
-		newpn->val.bps = bps;
+			newpn->plid = plid;
+			newpn->fileid = fileid;
+			newpn->val.bps = bps;
+			break;
+		case BLKIO_THROTL_read_iops_device:
+		case BLKIO_THROTL_write_iops_device:
+			ret = strict_strtoul(s[1], 10, &iops);
+			if (ret)
+				return -EINVAL;
+
+			newpn->plid = plid;
+			newpn->fileid = fileid;
+			newpn->val.iops = iops;
+			break;
+		}
 		break;
 	default:
 		BUG();
@@ -744,6 +780,29 @@ uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg, dev_t dev)
 		return -1;
 }
 
+unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg, dev_t dev)
+{
+	struct blkio_policy_node *pn;
+
+	pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
+				BLKIO_THROTL_read_iops_device);
+	if (pn)
+		return pn->val.iops;
+	else
+		return -1;
+}
+
+unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg, dev_t dev)
+{
+	struct blkio_policy_node *pn;
+	pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
+				BLKIO_THROTL_write_iops_device);
+	if (pn)
+		return pn->val.iops;
+	else
+		return -1;
+}
+
 /* Checks whether user asked for deleting a policy rule */
 static bool blkio_delete_rule_command(struct blkio_policy_node *pn)
 {
@@ -753,8 +812,17 @@ static bool blkio_delete_rule_command(struct blkio_policy_node *pn)
 			return 1;
 		break;
 	case BLKIO_POLICY_THROTL:
-		if (pn->val.bps == 0)
-			return 1;
+		switch(pn->fileid) {
+		case BLKIO_THROTL_read_bps_device:
+		case BLKIO_THROTL_write_bps_device:
+			if (pn->val.bps == 0)
+				return 1;
+			break;
+		case BLKIO_THROTL_read_iops_device:
+		case BLKIO_THROTL_write_iops_device:
+			if (pn->val.iops == 0)
+				return 1;
+		}
 		break;
 	default:
 		BUG();
@@ -771,7 +839,15 @@ static void blkio_update_policy_rule(struct blkio_policy_node *oldpn,
 		oldpn->val.weight = newpn->val.weight;
 		break;
 	case BLKIO_POLICY_THROTL:
-		oldpn->val.bps = newpn->val.bps;
+		switch(newpn->fileid) {
+		case BLKIO_THROTL_read_bps_device:
+		case BLKIO_THROTL_write_bps_device:
+			oldpn->val.bps = newpn->val.bps;
+			break;
+		case BLKIO_THROTL_read_iops_device:
+		case BLKIO_THROTL_write_iops_device:
+			oldpn->val.iops = newpn->val.iops;
+		}
 		break;
 	default:
 		BUG();
@@ -785,7 +861,7 @@ static void blkio_update_policy_rule(struct blkio_policy_node *oldpn,
 static void blkio_update_blkg_policy(struct blkio_cgroup *blkcg,
 		struct blkio_group *blkg, struct blkio_policy_node *pn)
 {
-	unsigned int weight;
+	unsigned int weight, iops;
 	u64 bps;
 
 	switch(pn->plid) {
@@ -801,6 +877,11 @@ static void blkio_update_blkg_policy(struct blkio_cgroup *blkcg,
 			bps = pn->val.bps ? pn->val.bps : (-1);
 			blkio_update_group_bps(blkg, bps, pn->fileid);
 			break;
+		case BLKIO_THROTL_read_iops_device:
+		case BLKIO_THROTL_write_iops_device:
+			iops = pn->val.iops ? pn->val.iops : (-1);
+			blkio_update_group_iops(blkg, iops, pn->fileid);
+			break;
 		}
 		break;
 	default:
@@ -900,14 +981,18 @@ blkio_print_policy_node(struct seq_file *m, struct blkio_policy_node *pn)
 					MINOR(pn->dev), pn->val.weight);
 			break;
 		case BLKIO_POLICY_THROTL:
-			if (pn->fileid == BLKIO_THROTL_read_bps_device)
+			switch(pn->fileid) {
+			case BLKIO_THROTL_read_bps_device:
+			case BLKIO_THROTL_write_bps_device:
 				seq_printf(m, "%u:%u\t%llu\n", MAJOR(pn->dev),
 					MINOR(pn->dev), pn->val.bps);
-			else if (pn->fileid == BLKIO_THROTL_write_bps_device)
-				seq_printf(m, "%u:%u\t%llu\n", MAJOR(pn->dev),
-					MINOR(pn->dev), pn->val.bps);
-			else
-				BUG();
+				break;
+			case BLKIO_THROTL_read_iops_device:
+			case BLKIO_THROTL_write_iops_device:
+				seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev),
+					MINOR(pn->dev), pn->val.iops);
+				break;
+			}
 			break;
 		default:
 			BUG();
@@ -954,6 +1039,8 @@ static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft,
 		switch(name){
 		case BLKIO_THROTL_read_bps_device:
 		case BLKIO_THROTL_write_bps_device:
+		case BLKIO_THROTL_read_iops_device:
+		case BLKIO_THROTL_write_iops_device:
 			blkio_read_policy_node_files(cft, blkcg, m);
 			return 0;
 		default:
@@ -1171,6 +1258,24 @@ struct cftype blkio_files[] = {
 		.write_string = blkiocg_file_write,
 		.max_write_len = 256,
 	},
+
+	{
+		.name = "throttle.read_iops_device",
+		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
+				BLKIO_THROTL_read_iops_device),
+		.read_seq_string = blkiocg_file_read,
+		.write_string = blkiocg_file_write,
+		.max_write_len = 256,
+	},
+
+	{
+		.name = "throttle.write_iops_device",
+		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
+				BLKIO_THROTL_write_iops_device),
+		.read_seq_string = blkiocg_file_read,
+		.write_string = blkiocg_file_write,
+		.max_write_len = 256,
+	},
 	{
 		.name = "time",
 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 1b738827b2f6..2070053a30b1 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -93,6 +93,8 @@ enum blkcg_file_name_prop {
 enum blkcg_file_name_throtl {
 	BLKIO_THROTL_read_bps_device,
 	BLKIO_THROTL_write_bps_device,
+	BLKIO_THROTL_read_iops_device,
+	BLKIO_THROTL_write_iops_device,
 	BLKIO_THROTL_io_service_bytes,
 	BLKIO_THROTL_io_serviced,
 };
@@ -168,6 +170,7 @@ struct blkio_policy_node {
 		 * by file type "fileid".
 		 */
 		u64 bps;
+		unsigned int iops;
 	} val;
 };
 
@@ -177,6 +180,10 @@ extern uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg,
 				     dev_t dev);
 extern uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg,
 				     dev_t dev);
+extern unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg,
+				     dev_t dev);
+extern unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg,
+				     dev_t dev);
 
 typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg);
 typedef void (blkio_update_group_weight_fn) (struct blkio_group *blkg,
@@ -185,12 +192,18 @@ typedef void (blkio_update_group_read_bps_fn) (struct blkio_group *blkg,
 						u64 read_bps);
 typedef void (blkio_update_group_write_bps_fn) (struct blkio_group *blkg,
 						u64 write_bps);
+typedef void (blkio_update_group_read_iops_fn) (struct blkio_group *blkg,
+						unsigned int read_iops);
+typedef void (blkio_update_group_write_iops_fn) (struct blkio_group *blkg,
+						unsigned int write_iops);
 
 struct blkio_policy_ops {
 	blkio_unlink_group_fn *blkio_unlink_group_fn;
 	blkio_update_group_weight_fn *blkio_update_group_weight_fn;
 	blkio_update_group_read_bps_fn *blkio_update_group_read_bps_fn;
 	blkio_update_group_write_bps_fn *blkio_update_group_write_bps_fn;
+	blkio_update_group_read_iops_fn *blkio_update_group_read_iops_fn;
+	blkio_update_group_write_iops_fn *blkio_update_group_write_iops_fn;
 };
 
 struct blkio_policy_type {
-- 
cgit v1.2.3


From 8e89d13f4ede2467629a971618537430fafaaea3 Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@redhat.com>
Date: Wed, 15 Sep 2010 17:06:37 -0400
Subject: blkio: Implementation of IOPS limit logic

o core logic of implementing IOPS throttling.

Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-throttle.c | 164 +++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 127 insertions(+), 37 deletions(-)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 4b492011e0de..af53f37c1b13 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -59,8 +59,13 @@ struct throtl_grp {
 	/* bytes per second rate limits */
 	uint64_t bps[2];
 
+	/* IOPS limits */
+	unsigned int iops[2];
+
 	/* Number of bytes disptached in current slice */
 	uint64_t bytes_disp[2];
+	/* Number of bio's dispatched in current slice */
+	unsigned int io_disp[2];
 
 	/* When did we start a new slice */
 	unsigned long slice_start[2];
@@ -194,6 +199,8 @@ static struct throtl_grp * throtl_find_alloc_tg(struct throtl_data *td,
 
 	tg->bps[READ] = blkcg_get_read_bps(blkcg, tg->blkg.dev);
 	tg->bps[WRITE] = blkcg_get_write_bps(blkcg, tg->blkg.dev);
+	tg->iops[READ] = blkcg_get_read_iops(blkcg, tg->blkg.dev);
+	tg->iops[WRITE] = blkcg_get_write_iops(blkcg, tg->blkg.dev);
 
 	hlist_add_head(&tg->tg_node, &td->tg_list);
 	td->nr_undestroyed_grps++;
@@ -335,6 +342,7 @@ static inline void
 throtl_start_new_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
 {
 	tg->bytes_disp[rw] = 0;
+	tg->io_disp[rw] = 0;
 	tg->slice_start[rw] = jiffies;
 	tg->slice_end[rw] = jiffies + throtl_slice;
 	throtl_log_tg(td, tg, "[%c] new slice start=%lu end=%lu jiffies=%lu",
@@ -365,7 +373,7 @@ throtl_slice_used(struct throtl_data *td, struct throtl_grp *tg, bool rw)
 static inline void
 throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
 {
-	unsigned long nr_slices, bytes_trim, time_elapsed;
+	unsigned long nr_slices, bytes_trim, time_elapsed, io_trim;
 
 	BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw]));
 
@@ -385,8 +393,9 @@ throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
 		return;
 
 	bytes_trim = (tg->bps[rw] * throtl_slice * nr_slices)/HZ;
+	io_trim = (tg->iops[rw] * throtl_slice * nr_slices)/HZ;
 
-	if (!bytes_trim)
+	if (!bytes_trim && !io_trim)
 		return;
 
 	if (tg->bytes_disp[rw] >= bytes_trim)
@@ -394,51 +403,62 @@ throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
 	else
 		tg->bytes_disp[rw] = 0;
 
+	if (tg->io_disp[rw] >= io_trim)
+		tg->io_disp[rw] -= io_trim;
+	else
+		tg->io_disp[rw] = 0;
+
 	tg->slice_start[rw] += nr_slices * throtl_slice;
 
-	throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%lu"
+	throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%lu io=%lu"
 			" start=%lu end=%lu jiffies=%lu",
-			rw == READ ? 'R' : 'W', nr_slices, bytes_trim,
+			rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim,
 			tg->slice_start[rw], tg->slice_end[rw], jiffies);
 }
 
-/*
- * Returns whether one can dispatch a bio or not. Also returns approx number
- * of jiffies to wait before this bio is with-in IO rate and can be dispatched
- */
-static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
-				struct bio *bio, unsigned long *wait)
+static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg,
+		struct bio *bio, unsigned long *wait)
 {
 	bool rw = bio_data_dir(bio);
-	u64 bytes_allowed, extra_bytes;
+	unsigned int io_allowed;
 	unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
 
-	/*
-	 * Currently whole state machine of group depends on first bio
-	 * queued in the group bio list. So one should not be calling
-	 * this function with a different bio if there are other bios
-	 * queued.
-	 */
-	BUG_ON(tg->nr_queued[rw] && bio != bio_list_peek(&tg->bio_lists[rw]));
+	jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
 
-	/* If tg->bps = -1, then BW is unlimited */
-	if (tg->bps[rw] == -1) {
+	/* Slice has just started. Consider one slice interval */
+	if (!jiffy_elapsed)
+		jiffy_elapsed_rnd = throtl_slice;
+
+	jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);
+
+	io_allowed = (tg->iops[rw] * jiffies_to_msecs(jiffy_elapsed_rnd))
+				/ MSEC_PER_SEC;
+
+	if (tg->io_disp[rw] + 1 <= io_allowed) {
 		if (wait)
 			*wait = 0;
 		return 1;
 	}
 
-	/*
-	 * If previous slice expired, start a new one otherwise renew/extend
-	 * existing slice to make sure it is at least throtl_slice interval
-	 * long since now.
-	 */
-	if (throtl_slice_used(td, tg, rw))
-		throtl_start_new_slice(td, tg, rw);
-	else {
-		if (time_before(tg->slice_end[rw], jiffies + throtl_slice))
-			throtl_extend_slice(td, tg, rw, jiffies + throtl_slice);
-	}
+	/* Calc approx time to dispatch */
+	jiffy_wait = ((tg->io_disp[rw] + 1) * HZ)/tg->iops[rw] + 1;
+
+	if (jiffy_wait > jiffy_elapsed)
+		jiffy_wait = jiffy_wait - jiffy_elapsed;
+	else
+		jiffy_wait = 1;
+
+	if (wait)
+		*wait = jiffy_wait;
+	return 0;
+}
+
+static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg,
+		struct bio *bio, unsigned long *wait)
+{
+	bool rw = bio_data_dir(bio);
+	u64 bytes_allowed, extra_bytes;
+	unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
 
 	jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
 
@@ -469,12 +489,62 @@ static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
 	 * up we did. Add that time also.
 	 */
 	jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed);
-
 	if (wait)
 		*wait = jiffy_wait;
+	return 0;
+}
+
+/*
+ * Returns whether one can dispatch a bio or not. Also returns approx number
+ * of jiffies to wait before this bio is with-in IO rate and can be dispatched
+ */
+static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
+				struct bio *bio, unsigned long *wait)
+{
+	bool rw = bio_data_dir(bio);
+	unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0;
+
+	/*
+ 	 * Currently whole state machine of group depends on first bio
+	 * queued in the group bio list. So one should not be calling
+	 * this function with a different bio if there are other bios
+	 * queued.
+	 */
+	BUG_ON(tg->nr_queued[rw] && bio != bio_list_peek(&tg->bio_lists[rw]));
 
-	if (time_before(tg->slice_end[rw], jiffies + jiffy_wait))
-		throtl_extend_slice(td, tg, rw, jiffies + jiffy_wait);
+	/* If tg->bps = -1, then BW is unlimited */
+	if (tg->bps[rw] == -1 && tg->iops[rw] == -1) {
+		if (wait)
+			*wait = 0;
+		return 1;
+	}
+
+	/*
+	 * If previous slice expired, start a new one otherwise renew/extend
+	 * existing slice to make sure it is at least throtl_slice interval
+	 * long since now.
+	 */
+	if (throtl_slice_used(td, tg, rw))
+		throtl_start_new_slice(td, tg, rw);
+	else {
+		if (time_before(tg->slice_end[rw], jiffies + throtl_slice))
+			throtl_extend_slice(td, tg, rw, jiffies + throtl_slice);
+	}
+
+	if (tg_with_in_bps_limit(td, tg, bio, &bps_wait)
+	    && tg_with_in_iops_limit(td, tg, bio, &iops_wait)) {
+		if (wait)
+			*wait = 0;
+		return 1;
+	}
+
+	max_wait = max(bps_wait, iops_wait);
+
+	if (wait)
+		*wait = max_wait;
+
+	if (time_before(tg->slice_end[rw], jiffies + max_wait))
+		throtl_extend_slice(td, tg, rw, jiffies + max_wait);
 
 	return 0;
 }
@@ -486,13 +556,13 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
 
 	/* Charge the bio to the group */
 	tg->bytes_disp[rw] += bio->bi_size;
+	tg->io_disp[rw]++;
 
 	/*
 	 * TODO: This will take blkg->stats_lock. Figure out a way
 	 * to avoid this cost.
 	 */
 	blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, rw, sync);
-
 }
 
 static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg,
@@ -763,6 +833,18 @@ static void throtl_update_blkio_group_write_bps (struct blkio_group *blkg,
 	tg_of_blkg(blkg)->bps[WRITE] = write_bps;
 }
 
+static void throtl_update_blkio_group_read_iops (struct blkio_group *blkg,
+			unsigned int read_iops)
+{
+	tg_of_blkg(blkg)->iops[READ] = read_iops;
+}
+
+static void throtl_update_blkio_group_write_iops (struct blkio_group *blkg,
+			unsigned int write_iops)
+{
+	tg_of_blkg(blkg)->iops[WRITE] = write_iops;
+}
+
 void throtl_shutdown_timer_wq(struct request_queue *q)
 {
 	struct throtl_data *td = q->td;
@@ -777,7 +859,12 @@ static struct blkio_policy_type blkio_policy_throtl = {
 					throtl_update_blkio_group_read_bps,
 		.blkio_update_group_write_bps_fn =
 					throtl_update_blkio_group_write_bps,
+		.blkio_update_group_read_iops_fn =
+					throtl_update_blkio_group_read_iops,
+		.blkio_update_group_write_iops_fn =
+					throtl_update_blkio_group_write_iops,
 	},
+	.plid = BLKIO_POLICY_THROTL,
 };
 
 int blk_throtl_bio(struct request_queue *q, struct bio **biop)
@@ -811,9 +898,11 @@ int blk_throtl_bio(struct request_queue *q, struct bio **biop)
 	}
 
 queue_bio:
-	throtl_log_tg(td, tg, "[%c] bio. disp=%u sz=%u bps=%llu"
-			" queued=%d/%d", rw == READ ? 'R' : 'W',
+	throtl_log_tg(td, tg, "[%c] bio. bdisp=%u sz=%u bps=%llu"
+			" iodisp=%u iops=%u queued=%d/%d",
+			rw == READ ? 'R' : 'W',
 			tg->bytes_disp[rw], bio->bi_size, tg->bps[rw],
+			tg->io_disp[rw], tg->iops[rw],
 			tg->nr_queued[READ], tg->nr_queued[WRITE]);
 
 	throtl_add_bio_tg(q->td, tg, bio);
@@ -850,6 +939,7 @@ int blk_throtl_init(struct request_queue *q)
 
 	/* Practically unlimited BW */
 	tg->bps[0] = tg->bps[1] = -1;
+	tg->iops[0] = tg->iops[1] = -1;
 	atomic_set(&tg->ref, 1);
 
 	INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work);
-- 
cgit v1.2.3


From 2786c4e5e54802c34297e55050fef3e862a27b3f Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@redhat.com>
Date: Wed, 15 Sep 2010 17:06:38 -0400
Subject: blkio: Documentation Update

o Documentation update

Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 Documentation/cgroups/blkio-controller.txt | 106 ++++++++++++++++++++++++++++-
 1 file changed, 103 insertions(+), 3 deletions(-)

diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroups/blkio-controller.txt
index 48e0b21b0059..2f5613fbe680 100644
--- a/Documentation/cgroups/blkio-controller.txt
+++ b/Documentation/cgroups/blkio-controller.txt
@@ -8,12 +8,17 @@ both at leaf nodes as well as at intermediate nodes in a storage hierarchy.
 Plan is to use the same cgroup based management interface for blkio controller
 and based on user options switch IO policies in the background.
 
-In the first phase, this patchset implements proportional weight time based
-division of disk policy. It is implemented in CFQ. Hence this policy takes
-effect only on leaf nodes when CFQ is being used.
+Currently two IO control policies are implemented. First one is proportional
+weight time based division of disk policy. It is implemented in CFQ. Hence
+this policy takes effect only on leaf nodes when CFQ is being used. The second
+one is throttling policy which can be used to specify upper IO rate limits
+on devices. This policy is implemented in generic block layer and can be
+used on leaf nodes as well as higher level logical devices like device mapper.
 
 HOWTO
 =====
+Proportional Weight division of bandwidth
+-----------------------------------------
 You can do a very simple testing of running two dd threads in two different
 cgroups. Here is what you can do.
 
@@ -55,6 +60,35 @@ cgroups. Here is what you can do.
   group dispatched to the disk. We provide fairness in terms of disk time, so
   ideally io.disk_time of cgroups should be in proportion to the weight.
 
+Throttling/Upper Limit policy
+-----------------------------
+- Enable Block IO controller
+	CONFIG_BLK_CGROUP=y
+
+- Enable throttling in block layer
+	CONFIG_BLK_DEV_THROTTLING=y
+
+- Mount blkio controller
+        mount -t cgroup -o blkio none /cgroup/blkio
+
+- Specify a bandwidth rate on particular device for root group. The format
+  for policy is "<major>:<minor>  <byes_per_second>".
+
+        echo "8:16  1048576" > /cgroup/blkio/blkio.read_bps_device
+
+  Above will put a limit of 1MB/second on reads happening for root group
+  on device having major/minor number 8:16.
+
+- Run dd to read a file and see if rate is throttled to 1MB/s or not.
+
+		# dd if=/mnt/common/zerofile of=/dev/null bs=4K count=1024
+		# iflag=direct
+        1024+0 records in
+        1024+0 records out
+        4194304 bytes (4.2 MB) copied, 4.0001 s, 1.0 MB/s
+
+ Limits for writes can be put using blkio.write_bps_device file.
+
 Various user visible config options
 ===================================
 CONFIG_BLK_CGROUP
@@ -68,8 +102,13 @@ CONFIG_CFQ_GROUP_IOSCHED
 	- Enables group scheduling in CFQ. Currently only 1 level of group
 	  creation is allowed.
 
+CONFIG_BLK_DEV_THROTTLING
+	- Enable block device throttling support in block layer.
+
 Details of cgroup files
 =======================
+Proportional weight policy files
+--------------------------------
 - blkio.weight
 	- Specifies per cgroup weight. This is default weight of the group
 	  on all the devices until and unless overridden by per device rule.
@@ -210,6 +249,67 @@ Details of cgroup files
 	  and minor number of the device and third field specifies the number
 	  of times a group was dequeued from a particular device.
 
+Throttling/Upper limit policy files
+-----------------------------------
+- blkio.throttle.read_bps_device
+	- Specifies upper limit on READ rate from the device. IO rate is
+	  specified in bytes per second. Rules are per deivce. Following is
+	  the format.
+
+  echo "<major>:<minor>  <rate_bytes_per_second>" > /cgrp/blkio.read_bps_device
+
+- blkio.throttle.write_bps_device
+	- Specifies upper limit on WRITE rate to the device. IO rate is
+	  specified in bytes per second. Rules are per deivce. Following is
+	  the format.
+
+  echo "<major>:<minor>  <rate_bytes_per_second>" > /cgrp/blkio.write_bps_device
+
+- blkio.throttle.read_iops_device
+	- Specifies upper limit on READ rate from the device. IO rate is
+	  specified in IO per second. Rules are per deivce. Following is
+	  the format.
+
+  echo "<major>:<minor>  <rate_io_per_second>" > /cgrp/blkio.read_iops_device
+
+- blkio.throttle.write_iops_device
+	- Specifies upper limit on WRITE rate to the device. IO rate is
+	  specified in io per second. Rules are per deivce. Following is
+	  the format.
+
+  echo "<major>:<minor>  <rate_io_per_second>" > /cgrp/blkio.write_iops_device
+
+Note: If both BW and IOPS rules are specified for a device, then IO is
+      subjectd to both the constraints.
+
+- blkio.throttle.io_serviced
+	- Number of IOs (bio) completed to/from the disk by the group (as
+	  seen by throttling policy). These are further divided by the type
+	  of operation - read or write, sync or async. First two fields specify
+	  the major and minor number of the device, third field specifies the
+	  operation type and the fourth field specifies the number of IOs.
+
+	  blkio.io_serviced does accounting as seen by CFQ and counts are in
+	  number of requests (struct request). On the other hand,
+	  blkio.throttle.io_serviced counts number of IO in terms of number
+	  of bios as seen by throttling policy.  These bios can later be
+	  merged by elevator and total number of requests completed can be
+	  lesser.
+
+- blkio.throttle.io_service_bytes
+	- Number of bytes transferred to/from the disk by the group. These
+	  are further divided by the type of operation - read or write, sync
+	  or async. First two fields specify the major and minor number of the
+	  device, third field specifies the operation type and the fourth field
+	  specifies the number of bytes.
+
+	  These numbers should roughly be same as blkio.io_service_bytes as
+	  updated by CFQ. The difference between two is that
+	  blkio.io_service_bytes will not be updated if CFQ is not operating
+	  on request queue.
+
+Common files among various policies
+-----------------------------------
 - blkio.reset_stats
 	- Writing an int to this file will result in resetting all the stats
 	  for that cgroup.
-- 
cgit v1.2.3


From 01ea50638bc04ca5259f5711fcdedefcdde1cf43 Mon Sep 17 00:00:00 2001
From: "Signed-off-by: Jan Kara" <jack@suse.cz>
Date: Thu, 16 Sep 2010 20:36:36 +0200
Subject: block: Fix race during disk initialization

When a new disk is being discovered, add_disk() first ties the bdev to gendisk
(via register_disk()->blkdev_get()) and only after that calls
bdi_register_bdev(). Because register_disk() also creates disk's kobject, it
can happen that userspace manages to open and modify the device's data (or
inode) before its BDI is properly initialized leading to a warning in
__mark_inode_dirty().

Fix the problem by registering BDI early enough.

This patch addresses https://bugzilla.kernel.org/show_bug.cgi?id=16312

Cc: stable@kernel.org
Reported-by: Larry Finger <Larry.Finger@lwfinger.net>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/genhd.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/block/genhd.c b/block/genhd.c
index 5c9c503de423..7923e720ddf5 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -541,13 +541,15 @@ void add_disk(struct gendisk *disk)
 	disk->major = MAJOR(devt);
 	disk->first_minor = MINOR(devt);
 
+	/* Register BDI before referencing it from bdev */ 
+	bdi = &disk->queue->backing_dev_info;
+	bdi_register_dev(bdi, disk_devt(disk));
+
 	blk_register_region(disk_devt(disk), disk->minors, NULL,
 			    exact_match, exact_lock, disk);
 	register_disk(disk);
 	blk_register_queue(disk);
 
-	bdi = &disk->queue->backing_dev_info;
-	bdi_register_dev(bdi, disk_devt(disk));
 	retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj,
 				   "bdi");
 	WARN_ON(retval);
-- 
cgit v1.2.3


From dd3932eddf428571762596e17b65f5dc92ca361b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 16 Sep 2010 20:51:46 +0200
Subject: block: remove BLKDEV_IFL_WAIT

All the blkdev_issue_* helpers can only sanely be used for synchronous
caller.  To issue cache flushes or barriers asynchronously the caller needs
to set up a bio by itself with a completion callback to move the asynchronous
state machine ahead.  So drop the BLKDEV_IFL_WAIT flag that is always
specified when calling blkdev_issue_* and also remove the now unused flags
argument to blkdev_issue_flush and blkdev_issue_zeroout.  For
blkdev_issue_discard we need to keep it for the secure discard flag, which
gains a more descriptive name and loses the bitops vs flag confusion.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-flush.c                  | 25 +++++++++++--------------
 block/blk-lib.c                    | 21 ++++++++-------------
 block/ioctl.c                      |  4 ++--
 drivers/block/drbd/drbd_int.h      |  3 +--
 drivers/block/drbd/drbd_receiver.c |  2 +-
 fs/block_dev.c                     |  2 +-
 fs/btrfs/extent-tree.c             |  3 +--
 fs/ext3/fsync.c                    |  3 +--
 fs/ext4/fsync.c                    |  5 ++---
 fs/ext4/mballoc.c                  |  3 +--
 fs/fat/fatent.c                    |  3 +--
 fs/gfs2/rgrp.c                     |  5 ++---
 fs/jbd2/checkpoint.c               |  3 +--
 fs/jbd2/commit.c                   |  6 ++----
 fs/nilfs2/the_nilfs.c              |  4 ++--
 fs/reiserfs/file.c                 |  3 +--
 fs/xfs/linux-2.6/xfs_super.c       |  3 +--
 include/linux/blkdev.h             | 14 +++++---------
 mm/swapfile.c                      |  6 +++---
 19 files changed, 47 insertions(+), 71 deletions(-)

diff --git a/block/blk-flush.c b/block/blk-flush.c
index 62b7df9bca9d..54b123d6563e 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -205,7 +205,6 @@ static void bio_end_flush(struct bio *bio, int err)
  * @bdev:	blockdev to issue flush for
  * @gfp_mask:	memory allocation flags (for bio_alloc)
  * @error_sector:	error sector
- * @flags:	BLKDEV_IFL_* flags to control behaviour
  *
  * Description:
  *    Issue a flush for the block device in question. Caller can supply
@@ -214,7 +213,7 @@ static void bio_end_flush(struct bio *bio, int err)
  *    request was pushed in some internal queue for later handling.
  */
 int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
-		sector_t *error_sector, unsigned long flags)
+		sector_t *error_sector)
 {
 	DECLARE_COMPLETION_ONSTACK(wait);
 	struct request_queue *q;
@@ -240,21 +239,19 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
 	bio = bio_alloc(gfp_mask, 0);
 	bio->bi_end_io = bio_end_flush;
 	bio->bi_bdev = bdev;
-	if (test_bit(BLKDEV_WAIT, &flags))
-		bio->bi_private = &wait;
+	bio->bi_private = &wait;
 
 	bio_get(bio);
 	submit_bio(WRITE_FLUSH, bio);
-	if (test_bit(BLKDEV_WAIT, &flags)) {
-		wait_for_completion(&wait);
-		/*
-		 * The driver must store the error location in ->bi_sector, if
-		 * it supports it. For non-stacked drivers, this should be
-		 * copied from blk_rq_pos(rq).
-		 */
-		if (error_sector)
-			*error_sector = bio->bi_sector;
-	}
+	wait_for_completion(&wait);
+
+	/*
+	 * The driver must store the error location in ->bi_sector, if
+	 * it supports it. For non-stacked drivers, this should be
+	 * copied from blk_rq_pos(rq).
+	 */
+	if (error_sector)
+               *error_sector = bio->bi_sector;
 
 	if (!bio_flagged(bio, BIO_UPTODATE))
 		ret = -EIO;
diff --git a/block/blk-lib.c b/block/blk-lib.c
index fe2e6ed0f510..1a320d2406b0 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -61,7 +61,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 		max_discard_sectors &= ~(disc_sects - 1);
 	}
 
-	if (flags & BLKDEV_IFL_SECURE) {
+	if (flags & BLKDEV_DISCARD_SECURE) {
 		if (!blk_queue_secdiscard(q))
 			return -EOPNOTSUPP;
 		type |= REQ_SECURE;
@@ -77,8 +77,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 		bio->bi_sector = sector;
 		bio->bi_end_io = blkdev_discard_end_io;
 		bio->bi_bdev = bdev;
-		if (flags & BLKDEV_IFL_WAIT)
-			bio->bi_private = &wait;
+		bio->bi_private = &wait;
 
 		if (nr_sects > max_discard_sectors) {
 			bio->bi_size = max_discard_sectors << 9;
@@ -92,8 +91,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 		bio_get(bio);
 		submit_bio(type, bio);
 
-		if (flags & BLKDEV_IFL_WAIT)
-			wait_for_completion(&wait);
+		wait_for_completion(&wait);
 
 		if (bio_flagged(bio, BIO_EOPNOTSUPP))
 			ret = -EOPNOTSUPP;
@@ -139,7 +137,6 @@ static void bio_batch_end_io(struct bio *bio, int err)
  * @sector:	start sector
  * @nr_sects:	number of sectors to write
  * @gfp_mask:	memory allocation flags (for bio_alloc)
- * @flags:	BLKDEV_IFL_* flags to control behaviour
  *
  * Description:
  *  Generate and issue number of bios with zerofiled pages.
@@ -148,7 +145,7 @@ static void bio_batch_end_io(struct bio *bio, int err)
  */
 
 int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
-			sector_t nr_sects, gfp_t gfp_mask, unsigned long flags)
+			sector_t nr_sects, gfp_t gfp_mask)
 {
 	int ret;
 	struct bio *bio;
@@ -174,8 +171,7 @@ submit:
 		bio->bi_sector = sector;
 		bio->bi_bdev   = bdev;
 		bio->bi_end_io = bio_batch_end_io;
-		if (flags & BLKDEV_IFL_WAIT)
-			bio->bi_private = &bb;
+		bio->bi_private = &bb;
 
 		while (nr_sects != 0) {
 			sz = min((sector_t) PAGE_SIZE >> 9 , nr_sects);
@@ -193,10 +189,9 @@ submit:
 		submit_bio(WRITE, bio);
 	}
 
-	if (flags & BLKDEV_IFL_WAIT)
-		/* Wait for bios in-flight */
-		while ( issued != atomic_read(&bb.done))
-			wait_for_completion(&wait);
+	/* Wait for bios in-flight */
+	while (issued != atomic_read(&bb.done))
+		wait_for_completion(&wait);
 
 	if (!test_bit(BIO_UPTODATE, &bb.flags))
 		/* One of bios in the batch was completed with error.*/
diff --git a/block/ioctl.c b/block/ioctl.c
index d8052f0dabd3..cb2b9099862b 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -116,7 +116,7 @@ static int blkdev_reread_part(struct block_device *bdev)
 static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
 			     uint64_t len, int secure)
 {
-	unsigned long flags = BLKDEV_IFL_WAIT;
+	unsigned long flags = 0;
 
 	if (start & 511)
 		return -EINVAL;
@@ -128,7 +128,7 @@ static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
 	if (start + len > (bdev->bd_inode->i_size >> 9))
 		return -EINVAL;
 	if (secure)
-		flags |= BLKDEV_IFL_SECURE;
+		flags |= BLKDEV_DISCARD_SECURE;
 	return blkdev_issue_discard(bdev, start, len, GFP_KERNEL, flags);
 }
 
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 352441b0f92f..c2ef476f5711 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -2321,8 +2321,7 @@ static inline void drbd_md_flush(struct drbd_conf *mdev)
 	if (test_bit(MD_NO_BARRIER, &mdev->flags))
 		return;
 
-	r = blkdev_issue_flush(mdev->ldev->md_bdev, GFP_KERNEL, NULL,
-			BLKDEV_IFL_WAIT);
+	r = blkdev_issue_flush(mdev->ldev->md_bdev, GFP_KERNEL, NULL);
 	if (r) {
 		set_bit(MD_NO_BARRIER, &mdev->flags);
 		dev_err(DEV, "meta data flush failed with status %d, disabling md-flushes\n", r);
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 081522d3c742..df15e7f0e7b7 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -975,7 +975,7 @@ static enum finish_epoch drbd_flush_after_epoch(struct drbd_conf *mdev, struct d
 
 	if (mdev->write_ordering >= WO_bdev_flush && get_ldev(mdev)) {
 		rv = blkdev_issue_flush(mdev->ldev->backing_bdev, GFP_KERNEL,
-					NULL, BLKDEV_IFL_WAIT);
+					NULL);
 		if (rv) {
 			dev_err(DEV, "local disk flush failed with status %d\n", rv);
 			/* would rather check on EOPNOTSUPP, but that is not reliable.
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 50e8c8582faa..b737451e2e9d 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -370,7 +370,7 @@ int blkdev_fsync(struct file *filp, int datasync)
 	 */
 	mutex_unlock(&bd_inode->i_mutex);
 
-	error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL, BLKDEV_IFL_WAIT);
+	error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL);
 	if (error == -EOPNOTSUPP)
 		error = 0;
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 43dc9ea9aef6..0b81ecdb101c 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1695,8 +1695,7 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
 static void btrfs_issue_discard(struct block_device *bdev,
 				u64 start, u64 len)
 {
-	blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL,
-			BLKDEV_IFL_WAIT);
+	blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, 0);
 }
 
 static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c
index d7e9f74dc3a6..09b13bb34c94 100644
--- a/fs/ext3/fsync.c
+++ b/fs/ext3/fsync.c
@@ -90,7 +90,6 @@ int ext3_sync_file(struct file *file, int datasync)
 	 * storage
 	 */
 	if (needs_barrier)
-		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
-				BLKDEV_IFL_WAIT);
+		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
 	return ret;
 }
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 592adf2e546e..3f3ff5ee8f9d 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -128,10 +128,9 @@ int ext4_sync_file(struct file *file, int datasync)
 		    (journal->j_fs_dev != journal->j_dev) &&
 		    (journal->j_flags & JBD2_BARRIER))
 			blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL,
-					NULL, BLKDEV_IFL_WAIT);
+					NULL);
 		ret = jbd2_log_wait_commit(journal, commit_tid);
 	} else if (journal->j_flags & JBD2_BARRIER)
-		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
-			BLKDEV_IFL_WAIT);
+		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
 	return ret;
 }
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index a22bfef3da95..19aa0d44d822 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2566,8 +2566,7 @@ static inline void ext4_issue_discard(struct super_block *sb,
 	discard_block = block + ext4_group_first_block_no(sb, block_group);
 	trace_ext4_discard_blocks(sb,
 			(unsigned long long) discard_block, count);
-	ret = sb_issue_discard(sb, discard_block, count, GFP_NOFS,
-			       BLKDEV_IFL_WAIT);
+	ret = sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
 	if (ret == EOPNOTSUPP) {
 		ext4_warning(sb, "discard not supported, disabling");
 		clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD);
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index f9a0b7ae8648..b47d2c9f4fa1 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -578,8 +578,7 @@ int fat_free_clusters(struct inode *inode, int cluster)
 				sb_issue_discard(sb,
 					fat_clus_to_blknr(sbi, first_cl),
 					nr_clus * sbi->sec_per_clus,
-					GFP_NOFS,
-					BLKDEV_IFL_WAIT);
+					GFP_NOFS, 0);
 
 				first_cl = cluster;
 			}
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 379316472918..38b3ea1abacc 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -854,7 +854,7 @@ static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
 				if ((start + nr_sects) != blk) {
 					rv = blkdev_issue_discard(bdev, start,
 							    nr_sects, GFP_NOFS,
-							    BLKDEV_IFL_WAIT);
+							    0);
 					if (rv)
 						goto fail;
 					nr_sects = 0;
@@ -868,8 +868,7 @@ start_new_extent:
 		}
 	}
 	if (nr_sects) {
-		rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS,
-					 BLKDEV_IFL_WAIT);
+		rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS, 0);
 		if (rv)
 			goto fail;
 	}
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 5247e7ffdcb4..6571a056e55d 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -532,8 +532,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
 	 */
 	if ((journal->j_fs_dev != journal->j_dev) &&
 	    (journal->j_flags & JBD2_BARRIER))
-		blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL,
-			BLKDEV_IFL_WAIT);
+		blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
 	if (!(journal->j_flags & JBD2_ABORT))
 		jbd2_journal_update_superblock(journal, 1);
 	return 0;
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index f204e27f44d1..cb43c605cfaa 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -684,8 +684,7 @@ start_journal_io:
 	if (commit_transaction->t_flushed_data_blocks &&
 	    (journal->j_fs_dev != journal->j_dev) &&
 	    (journal->j_flags & JBD2_BARRIER))
-		blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL,
-			BLKDEV_IFL_WAIT);
+		blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
 
 	/* Done it all: now write the commit record asynchronously. */
 	if (JBD2_HAS_INCOMPAT_FEATURE(journal,
@@ -810,8 +809,7 @@ wait_for_iobuf:
 	if (JBD2_HAS_INCOMPAT_FEATURE(journal,
 				      JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) &&
 	    journal->j_flags & JBD2_BARRIER) {
-		blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL,
-				   BLKDEV_IFL_WAIT);
+		blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL);
 	}
 
 	if (err)
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 400b2caef4d8..d97310f07bef 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -774,7 +774,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
 			ret = blkdev_issue_discard(nilfs->ns_bdev,
 						   start * sects_per_block,
 						   nblocks * sects_per_block,
-						   GFP_NOFS, BLKDEV_IFL_WAIT);
+						   GFP_NOFS, 0);
 			if (ret < 0)
 				return ret;
 			nblocks = 0;
@@ -784,7 +784,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
 		ret = blkdev_issue_discard(nilfs->ns_bdev,
 					   start * sects_per_block,
 					   nblocks * sects_per_block,
-					   GFP_NOFS, BLKDEV_IFL_WAIT);
+					   GFP_NOFS, 0);
 	return ret;
 }
 
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 6846371498b6..91f080cc76c8 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -152,8 +152,7 @@ static int reiserfs_sync_file(struct file *filp, int datasync)
 	barrier_done = reiserfs_commit_for_inode(inode);
 	reiserfs_write_unlock(inode->i_sb);
 	if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb))
-		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL, 
-			BLKDEV_IFL_WAIT);
+		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
 	if (barrier_done < 0)
 		return barrier_done;
 	return (err < 0) ? -EIO : 0;
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 15c35b62ff14..5fa7a30cc3f0 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -693,8 +693,7 @@ void
 xfs_blkdev_issue_flush(
 	xfs_buftarg_t		*buftarg)
 {
-	blkdev_issue_flush(buftarg->bt_bdev, GFP_KERNEL, NULL,
-			BLKDEV_IFL_WAIT);
+	blkdev_issue_flush(buftarg->bt_bdev, GFP_KERNEL, NULL);
 }
 
 STATIC void
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index cfcb3a610605..accbd0e5c893 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -867,18 +867,14 @@ static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt,
 		return NULL;
 	return bqt->tag_index[tag];
 }
-enum{
-	BLKDEV_WAIT,	/* wait for completion */
-	BLKDEV_SECURE,	/* secure discard */
-};
-#define BLKDEV_IFL_WAIT		(1 << BLKDEV_WAIT)
-#define BLKDEV_IFL_SECURE	(1 << BLKDEV_SECURE)
-extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *,
-			unsigned long);
+
+#define BLKDEV_DISCARD_SECURE  0x01    /* secure discard */
+
+extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *);
 extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 		sector_t nr_sects, gfp_t gfp_mask, unsigned long flags);
 extern int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
-			sector_t nr_sects, gfp_t gfp_mask, unsigned long flags);
+			sector_t nr_sects, gfp_t gfp_mask);
 static inline int sb_issue_discard(struct super_block *sb, sector_t block,
 		sector_t nr_blocks, gfp_t gfp_mask, unsigned long flags)
 {
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 68cda164dff6..e132e1708acc 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -141,7 +141,7 @@ static int discard_swap(struct swap_info_struct *si)
 	nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
 	if (nr_blocks) {
 		err = blkdev_issue_discard(si->bdev, start_block,
-				nr_blocks, GFP_KERNEL, BLKDEV_IFL_WAIT);
+				nr_blocks, GFP_KERNEL, 0);
 		if (err)
 			return err;
 		cond_resched();
@@ -152,7 +152,7 @@ static int discard_swap(struct swap_info_struct *si)
 		nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
 
 		err = blkdev_issue_discard(si->bdev, start_block,
-				nr_blocks, GFP_KERNEL, BLKDEV_IFL_WAIT);
+				nr_blocks, GFP_KERNEL, 0);
 		if (err)
 			break;
 
@@ -191,7 +191,7 @@ static void discard_swap_cluster(struct swap_info_struct *si,
 			start_block <<= PAGE_SHIFT - 9;
 			nr_blocks <<= PAGE_SHIFT - 9;
 			if (blkdev_issue_discard(si->bdev, start_block,
-				    nr_blocks, GFP_NOIO, BLKDEV_IFL_WAIT))
+				    nr_blocks, GFP_NOIO, 0))
 				break;
 		}
 
-- 
cgit v1.2.3


From 6d0aed7a38d06284db2a0e46c0a072b0c1c3299b Mon Sep 17 00:00:00 2001
From: Jens Axboe <jaxboe@fusionio.com>
Date: Fri, 17 Sep 2010 10:00:46 +0200
Subject: do_mounts: only enable PARTUUID for CONFIG_BLOCK

When CONFIG_BLOCK is not enabled:

init/do_mounts.c:71: error: implicit declaration of function 'dev_to_part'
init/do_mounts.c:71: warning: initialization makes pointer from integer without a cast
init/do_mounts.c:73: error: dereferencing pointer to incomplete type
init/do_mounts.c:76: error: dereferencing pointer to incomplete type
init/do_mounts.c:76: error: dereferencing pointer to incomplete type
init/do_mounts.c:102: error: implicit declaration of function 'part_pack_uuid'
init/do_mounts.c:104: error: 'block_class' undeclared (first use in this function)

Reported-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 init/do_mounts.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/init/do_mounts.c b/init/do_mounts.c
index b7fc83994f39..42db0551c3aa 100644
--- a/init/do_mounts.c
+++ b/init/do_mounts.c
@@ -58,6 +58,7 @@ static int __init readwrite(char *str)
 __setup("ro", readonly);
 __setup("rw", readwrite);
 
+#ifdef CONFIG_BLOCK
 /**
  * match_dev_by_uuid - callback for finding a partition using its uuid
  * @dev:	device passed in by the caller
@@ -111,6 +112,7 @@ static dev_t __init devt_from_partuuid(char *uuid_str)
 done:
 	return res;
 }
+#endif
 
 /*
  *	Convert a name into device number.  We accept the following variants:
@@ -138,6 +140,7 @@ dev_t name_to_dev_t(char *name)
 	dev_t res = 0;
 	int part;
 
+#ifdef CONFIG_BLOCK
 	if (strncmp(name, "PARTUUID=", 9) == 0) {
 		name += 9;
 		if (strlen(name) != 36)
@@ -147,6 +150,7 @@ dev_t name_to_dev_t(char *name)
 			goto fail;
 		goto done;
 	}
+#endif
 
 	if (strncmp(name, "/dev/", 5) != 0) {
 		unsigned maj, min;
-- 
cgit v1.2.3


From 749ef9f8423054e326f3a246327ed2db4b6d395f Mon Sep 17 00:00:00 2001
From: Corrado Zoccolo <czoccolo@gmail.com>
Date: Mon, 20 Sep 2010 15:24:50 +0200
Subject: cfq: improve fsync performance for small files

Fsync performance for small files achieved by cfq on high-end disks is
lower than what deadline can achieve, due to idling introduced between
the sync write happening in process context and the journal commit.

Moreover, when competing with a sequential reader, a process writing
small files and fsync-ing them is starved.

This patch fixes the two problems by:
- marking journal commits as WRITE_SYNC, so that they get the REQ_NOIDLE
  flag set,
- force all queues that have REQ_NOIDLE requests to be put in the noidle
  tree.

Having the queue associated to the fsync-ing process and the one associated
 to journal commits in the noidle tree allows:
- switching between them without idling,
- fairness vs. competing idling queues, since they will be serviced only
  after the noidle tree expires its slice.

Acked-by: Vivek Goyal <vgoyal@redhat.com>
Reviewed-by: Jeff Moyer <jmoyer@redhat.com>
Tested-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Corrado Zoccolo <czoccolo@gmail.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/cfq-iosched.c | 18 ++++--------------
 fs/jbd/commit.c     |  2 +-
 fs/jbd2/commit.c    |  2 +-
 3 files changed, 6 insertions(+), 16 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index b9f86190763b..684592621736 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -216,7 +216,6 @@ struct cfq_data {
 	enum wl_type_t serving_type;
 	unsigned long workload_expires;
 	struct cfq_group *serving_group;
-	bool noidle_tree_requires_idle;
 
 	/*
 	 * Each priority tree is sorted by next_request position.  These
@@ -2126,7 +2125,6 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
 	slice = max_t(unsigned, slice, CFQ_MIN_TT);
 	cfq_log(cfqd, "workload slice:%d", slice);
 	cfqd->workload_expires = jiffies + slice;
-	cfqd->noidle_tree_requires_idle = false;
 }
 
 static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)
@@ -3108,7 +3106,9 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 	if (cfqq->queued[0] + cfqq->queued[1] >= 4)
 		cfq_mark_cfqq_deep(cfqq);
 
-	if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle ||
+	if (cfqq->next_rq && (cfqq->next_rq->cmd_flags & REQ_NOIDLE))
+		enable_idle = 0;
+	else if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle ||
 	    (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq)))
 		enable_idle = 0;
 	else if (sample_valid(cic->ttime_samples)) {
@@ -3421,17 +3421,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
 			cfq_slice_expired(cfqd, 1);
 		else if (sync && cfqq_empty &&
 			 !cfq_close_cooperator(cfqd, cfqq)) {
-			cfqd->noidle_tree_requires_idle |=
-				!(rq->cmd_flags & REQ_NOIDLE);
-			/*
-			 * Idling is enabled for SYNC_WORKLOAD.
-			 * SYNC_NOIDLE_WORKLOAD idles at the end of the tree
-			 * only if we processed at least one !REQ_NOIDLE request
-			 */
-			if (cfqd->serving_type == SYNC_WORKLOAD
-			    || cfqd->noidle_tree_requires_idle
-			    || cfqq->cfqg->nr_cfqq == 1)
-				cfq_arm_slice_timer(cfqd);
+			cfq_arm_slice_timer(cfqd);
 		}
 	}
 
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 95d8c11c929e..3f030e9efea6 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -318,7 +318,7 @@ void journal_commit_transaction(journal_t *journal)
 	int first_tag = 0;
 	int tag_flag;
 	int i;
-	int write_op = WRITE;
+	int write_op = WRITE_SYNC;
 
 	/*
 	 * First job: lock down the current transaction and wait for
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 7c068c189d80..80910f51d4b4 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -360,7 +360,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	int tag_bytes = journal_tag_bytes(journal);
 	struct buffer_head *cbh = NULL; /* For transactional checksums */
 	__u32 crc32_sum = ~0;
-	int write_op = WRITE;
+	int write_op = WRITE_SYNC;
 
 	/*
 	 * First job: lock down the current transaction and wait for
-- 
cgit v1.2.3


From 8f45d7bdd81814288d38552139fbe906e66a2d04 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jaxboe@fusionio.com>
Date: Mon, 20 Sep 2010 15:26:42 +0200
Subject: ocfs2: update for removed BLKDEV_IFL_WAIT

Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/ocfs2/file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 9a03c151b5ce..77f05b88720e 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -198,7 +198,7 @@ static int ocfs2_sync_file(struct file *file, int datasync)
 		 */
 		if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
 			blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL,
-					   NULL, BLKDEV_IFL_WAIT);
+					   NULL);
 		goto bail;
 	}
 
-- 
cgit v1.2.3


From 488211844e0c3fad6ffa98a6f3c4f2139074e79a Mon Sep 17 00:00:00 2001
From: Jens Axboe <jaxboe@fusionio.com>
Date: Wed, 22 Sep 2010 09:32:36 +0200
Subject: floppy: switch to one queue per drive instead of sharing a queue

Pretty straight forward conversion. Note that we do round-robin
between the drives that have available requests, before we simply
used the drive that the IO scheduler told us to. Since the IO
scheduler doesn't care about multiple devices per queue, the resulting
sort would not have made sense.

Fixed by Vivek to get rid of a double lock problem in set_next_request()

Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
---
 drivers/block/floppy.c | 66 ++++++++++++++++++++++++++++++++++----------------
 1 file changed, 45 insertions(+), 21 deletions(-)

diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index cf04c1b234ed..aa42e7766c6a 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -258,8 +258,8 @@ static int irqdma_allocated;
 #include <linux/completion.h>
 
 static struct request *current_req;
-static struct request_queue *floppy_queue;
 static void do_fd_request(struct request_queue *q);
+static int set_next_request(void);
 
 #ifndef fd_get_dma_residue
 #define fd_get_dma_residue() get_dma_residue(FLOPPY_DMA)
@@ -413,6 +413,7 @@ static struct gendisk *disks[N_DRIVE];
 static struct block_device *opened_bdev[N_DRIVE];
 static DEFINE_MUTEX(open_lock);
 static struct floppy_raw_cmd *raw_cmd, default_raw_cmd;
+static int fdc_queue;
 
 /*
  * This struct defines the different floppy types.
@@ -890,8 +891,8 @@ static void unlock_fdc(void)
 	del_timer(&fd_timeout);
 	cont = NULL;
 	clear_bit(0, &fdc_busy);
-	if (current_req || blk_peek_request(floppy_queue))
-		do_fd_request(floppy_queue);
+	if (current_req || set_next_request())
+		do_fd_request(current_req->q);
 	spin_unlock_irqrestore(&floppy_lock, flags);
 	wake_up(&fdc_wait);
 }
@@ -2243,8 +2244,8 @@ static void floppy_end_request(struct request *req, int error)
  * logical buffer */
 static void request_done(int uptodate)
 {
-	struct request_queue *q = floppy_queue;
 	struct request *req = current_req;
+	struct request_queue *q;
 	unsigned long flags;
 	int block;
 	char msg[sizeof("request done ") + sizeof(int) * 3];
@@ -2258,6 +2259,8 @@ static void request_done(int uptodate)
 		return;
 	}
 
+	q = req->q;
+
 	if (uptodate) {
 		/* maintain values for invalidation on geometry
 		 * change */
@@ -2811,6 +2814,28 @@ static int make_raw_rw_request(void)
 	return 2;
 }
 
+/*
+ * Round-robin between our available drives, doing one request from each
+ */
+static int set_next_request(void)
+{
+	struct request_queue *q;
+	int old_pos = fdc_queue;
+
+	do {
+		q = disks[fdc_queue]->queue;
+		if (++fdc_queue == N_DRIVE)
+			fdc_queue = 0;
+		if (q) {
+			current_req = blk_fetch_request(q);
+			if (current_req)
+				break;
+		}
+	} while (fdc_queue != old_pos);
+
+	return current_req != NULL;
+}
+
 static void redo_fd_request(void)
 {
 	int drive;
@@ -2822,17 +2847,17 @@ static void redo_fd_request(void)
 
 do_request:
 	if (!current_req) {
-		struct request *req;
+		int pending;
+
+		spin_lock_irq(&floppy_lock);
+		pending = set_next_request();
+		spin_unlock_irq(&floppy_lock);
 
-		spin_lock_irq(floppy_queue->queue_lock);
-		req = blk_fetch_request(floppy_queue);
-		spin_unlock_irq(floppy_queue->queue_lock);
-		if (!req) {
+		if (!pending) {
 			do_floppy = NULL;
 			unlock_fdc();
 			return;
 		}
-		current_req = req;
 	}
 	drive = (long)current_req->rq_disk->private_data;
 	set_fdc(drive);
@@ -4165,6 +4190,13 @@ static int __init floppy_init(void)
 			goto out_put_disk;
 		}
 
+		disks[dr]->queue = blk_init_queue(do_fd_request, &floppy_lock);
+		if (!disks[dr]->queue) {
+			err = -ENOMEM;
+			goto out_put_disk;
+		}
+
+		blk_queue_max_hw_sectors(disks[dr]->queue, 64);
 		disks[dr]->major = FLOPPY_MAJOR;
 		disks[dr]->first_minor = TOMINOR(dr);
 		disks[dr]->fops = &floppy_fops;
@@ -4183,13 +4215,6 @@ static int __init floppy_init(void)
 	if (err)
 		goto out_unreg_blkdev;
 
-	floppy_queue = blk_init_queue(do_fd_request, &floppy_lock);
-	if (!floppy_queue) {
-		err = -ENOMEM;
-		goto out_unreg_driver;
-	}
-	blk_queue_max_hw_sectors(floppy_queue, 64);
-
 	blk_register_region(MKDEV(FLOPPY_MAJOR, 0), 256, THIS_MODULE,
 			    floppy_find, NULL, NULL);
 
@@ -4317,7 +4342,6 @@ static int __init floppy_init(void)
 
 		/* to be cleaned up... */
 		disks[drive]->private_data = (void *)(long)drive;
-		disks[drive]->queue = floppy_queue;
 		disks[drive]->flags |= GENHD_FL_REMOVABLE;
 		disks[drive]->driverfs_dev = &floppy_device[drive].dev;
 		add_disk(disks[drive]);
@@ -4333,8 +4357,6 @@ out_flush_work:
 		floppy_release_irq_and_dma();
 out_unreg_region:
 	blk_unregister_region(MKDEV(FLOPPY_MAJOR, 0), 256);
-	blk_cleanup_queue(floppy_queue);
-out_unreg_driver:
 	platform_driver_unregister(&floppy_driver);
 out_unreg_blkdev:
 	unregister_blkdev(FLOPPY_MAJOR, "fd");
@@ -4342,6 +4364,8 @@ out_put_disk:
 	while (dr--) {
 		del_timer(&motor_off_timer[dr]);
 		put_disk(disks[dr]);
+		if (disks[dr]->queue)
+			blk_cleanup_queue(disks[dr]->queue);
 	}
 	return err;
 }
@@ -4550,11 +4574,11 @@ static void __exit floppy_module_exit(void)
 			platform_device_unregister(&floppy_device[drive]);
 		}
 		put_disk(disks[drive]);
+		blk_cleanup_queue(disks[drive]->queue);
 	}
 
 	del_timer_sync(&fd_timeout);
 	del_timer_sync(&fd_timer);
-	blk_cleanup_queue(floppy_queue);
 
 	if (atomic_read(&usage_count))
 		floppy_release_irq_and_dma();
-- 
cgit v1.2.3


From 4b1977698ceb4c4caa800d475127139da49966f9 Mon Sep 17 00:00:00 2001
From: Mark Lord <kernel@teksavvy.com>
Date: Fri, 24 Sep 2010 09:51:13 -0400
Subject: block: Prevent hang_check firing during long I/O

During long I/O operations, the hang_check timer may fire,
trigger stack dumps that unnecessarily alarm the user.

Eg.  hdparm --security-erase NULL /dev/sdb  ## can take *hours* to complete

So, if hang_check is armed, we should wake up periodically
to prevent it from triggering.  This patch uses a wake-up interval
equal to half the hang_check timer period, which keeps overhead low enough.

Signed-off-by: Mark Lord <mlord@pobox.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-exec.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/block/blk-exec.c b/block/blk-exec.c
index e1672f14840e..cf1456a02acd 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -80,6 +80,7 @@ int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk,
 	DECLARE_COMPLETION_ONSTACK(wait);
 	char sense[SCSI_SENSE_BUFFERSIZE];
 	int err = 0;
+	unsigned long hang_check;
 
 	/*
 	 * we need an extra reference to the request, so we can look at
@@ -95,7 +96,13 @@ int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk,
 
 	rq->end_io_data = &wait;
 	blk_execute_rq_nowait(q, bd_disk, rq, at_head, blk_end_sync_rq);
-	wait_for_completion(&wait);
+
+	/* Prevent hang_check timer from firing at us during very long I/O */
+	hang_check = sysctl_hung_task_timeout_secs;
+	if (hang_check)
+		while (!wait_for_completion_timeout(&wait, hang_check * (HZ/2)));
+	else
+		wait_for_completion(&wait);
 
 	if (rq->errors)
 		err = -EIO;
-- 
cgit v1.2.3


From c49825facfd4969585224a896a5e717f88450cad Mon Sep 17 00:00:00 2001
From: Malahal Naineni <malahal@us.ibm.com>
Date: Fri, 24 Sep 2010 20:25:49 +0200
Subject: block: set the bounce_pfn to the actual DMA limit rather than to max
 memory

The bounce_pfn of the request queue in 64 bit systems is set to the
current max_low_pfn. Adding more memory later makes this incorrect.
Memory allocated beyond this boot time max_low_pfn appear to require
bounce buffers (bounce buffers are actually not allocated but used in
calculating segments that may result in "over max segments limit"
errors).

Signed-off-by: Malahal Naineni <malahal@us.ibm.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-settings.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/block/blk-settings.c b/block/blk-settings.c
index f8f2ddf20613..f47af5031eaa 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -214,16 +214,14 @@ void blk_queue_bounce_limit(struct request_queue *q, u64 dma_mask)
 	 */
 	if (b_pfn < (min_t(u64, 0xffffffffUL, BLK_BOUNCE_HIGH) >> PAGE_SHIFT))
 		dma = 1;
-	q->limits.bounce_pfn = max_low_pfn;
 #else
 	if (b_pfn < blk_max_low_pfn)
 		dma = 1;
-	q->limits.bounce_pfn = b_pfn;
 #endif
+	q->limits.bounce_pfn = b_pfn;
 	if (dma) {
 		init_emergency_isa_pool();
 		q->bounce_gfp = GFP_NOIO | GFP_DMA;
-		q->limits.bounce_pfn = b_pfn;
 	}
 }
 EXPORT_SYMBOL(blk_queue_bounce_limit);
-- 
cgit v1.2.3


From 786029ff810ff4a2fd52c0462713985a415417ab Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@redhat.com>
Date: Fri, 24 Sep 2010 20:35:44 +0200
Subject: amiga floppy: Stop sharing request queue across multiple gendisks

o Use one request queue per gendisk instead of sharing request queue

o Don't have hardware. No compile testing or run time testing done. Completely
  untested.

Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 drivers/block/amiflop.c | 59 ++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 48 insertions(+), 11 deletions(-)

diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c
index 76f114f0bba3..ead8b7792c52 100644
--- a/drivers/block/amiflop.c
+++ b/drivers/block/amiflop.c
@@ -114,8 +114,6 @@ static unsigned long int fd_def_df0 = FD_DD_3;     /* default for df0 if it does
 module_param(fd_def_df0, ulong, 0);
 MODULE_LICENSE("GPL");
 
-static struct request_queue *floppy_queue;
-
 /*
  *  Macros
  */
@@ -164,6 +162,7 @@ static volatile int selected = -1;	/* currently selected drive */
 static int writepending;
 static int writefromint;
 static char *raw_buf;
+static int fdc_queue;
 
 static DEFINE_SPINLOCK(amiflop_lock);
 
@@ -1334,6 +1333,42 @@ static int get_track(int drive, int track)
 	return -1;
 }
 
+/*
+ * Round-robin between our available drives, doing one request from each
+ */
+static struct request *set_next_request(void)
+{
+	struct request_queue *q;
+	int cnt = FD_MAX_UNITS;
+	struct request *rq;
+
+	/* Find next queue we can dispatch from */
+	fdc_queue = fdc_queue + 1;
+	if (fdc_queue == FD_MAX_UNITS)
+		fdc_queue = 0;
+
+	for(cnt = FD_MAX_UNITS; cnt > 0, cnt--) {
+
+		if (unit[fdc_queue].type->code == FD_NODRIVE) {
+			if (++fdc_queue == FD_MAX_UNITS)
+				fdc_queue = 0;
+			cotinue;
+		}
+
+		q = unit[fdc_queue].gendisk->queue;
+		if (q) {
+			rq = blk_fetch_request(q);
+			if (rq)
+				break;
+		}
+
+		if (++fdc_queue == FD_MAX_UNITS)
+			fdc_queue = 0;
+	}
+
+	return rq;
+}
+
 static void redo_fd_request(void)
 {
 	struct request *rq;
@@ -1345,7 +1380,7 @@ static void redo_fd_request(void)
 	int err;
 
 next_req:
-	rq = blk_fetch_request(floppy_queue);
+	rq = set_next_request();
 	if (!rq) {
 		/* Nothing left to do */
 		return;
@@ -1682,6 +1717,13 @@ static int __init fd_probe_drives(void)
 			continue;
 		}
 		unit[drive].gendisk = disk;
+
+		disk->queue = blk_init_queue(do_fd_request, &amiflop_lock);
+		if (!disk->queue) {
+			unit[drive].type->code = FD_NODRIVE;
+			continue;
+		}
+
 		drives++;
 		if ((unit[drive].trackbuf = kmalloc(FLOPPY_MAX_SECTORS * 512, GFP_KERNEL)) == NULL) {
 			printk("no mem for ");
@@ -1695,7 +1737,6 @@ static int __init fd_probe_drives(void)
 		disk->fops = &floppy_fops;
 		sprintf(disk->disk_name, "fd%d", drive);
 		disk->private_data = &unit[drive];
-		disk->queue = floppy_queue;
 		set_capacity(disk, 880*2);
 		add_disk(disk);
 	}
@@ -1743,11 +1784,6 @@ static int __init amiga_floppy_probe(struct platform_device *pdev)
 		goto out_irq2;
 	}
 
-	ret = -ENOMEM;
-	floppy_queue = blk_init_queue(do_fd_request, &amiflop_lock);
-	if (!floppy_queue)
-		goto out_queue;
-
 	ret = -ENODEV;
 	if (fd_probe_drives() < 1) /* No usable drives */
 		goto out_probe;
@@ -1791,7 +1827,6 @@ static int __init amiga_floppy_probe(struct platform_device *pdev)
 	return 0;
 
 out_probe:
-	blk_cleanup_queue(floppy_queue);
 out_queue:
 	free_irq(IRQ_AMIGA_CIAA_TB, NULL);
 out_irq2:
@@ -1810,9 +1845,12 @@ static int __exit amiga_floppy_remove(struct platform_device *pdev)
 
 	for( i = 0; i < FD_MAX_UNITS; i++) {
 		if (unit[i].type->code != FD_NODRIVE) {
+			struct request_queue *q = unit[i].gendisk->queue;
 			del_gendisk(unit[i].gendisk);
 			put_disk(unit[i].gendisk);
 			kfree(unit[i].trackbuf);
+			if (q)
+				blk_cleanup_queue(q);
 		}
 	}
 	blk_unregister_region(MKDEV(FLOPPY_MAJOR, 0), 256);
@@ -1820,7 +1858,6 @@ static int __exit amiga_floppy_remove(struct platform_device *pdev)
 	free_irq(IRQ_AMIGA_DSKBLK, NULL);
 	custom.dmacon = DMAF_DISK; /* disable DMA */
 	amiga_chip_free(raw_buf);
-	blk_cleanup_queue(floppy_queue);
 	unregister_blkdev(FLOPPY_MAJOR, "fd");
 }
 #endif
-- 
cgit v1.2.3


From 639e2f2aa76eefaf22078dccbbf2f3483f587aa7 Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@redhat.com>
Date: Fri, 24 Sep 2010 20:35:45 +0200
Subject: atari floppy: Stop sharing request queue across multiple gendisks

o Use one request queue per gendisk instead of sharing the queue.

o Don't have hardware. No compile testing or run time testing done. Completely
  untested.

Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 drivers/block/ataflop.c | 50 ++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 39 insertions(+), 11 deletions(-)

diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c
index aceb96476524..0f4eec442e5d 100644
--- a/drivers/block/ataflop.c
+++ b/drivers/block/ataflop.c
@@ -79,8 +79,8 @@
 
 #undef DEBUG
 
-static struct request_queue *floppy_queue;
 static struct request *fd_request;
+static int fdc_queue;
 
 /* Disk types: DD, HD, ED */
 static struct atari_disk_type {
@@ -1391,6 +1391,29 @@ static void setup_req_params( int drive )
 			ReqTrack, ReqSector, (unsigned long)ReqData ));
 }
 
+/*
+ * Round-robin between our available drives, doing one request from each
+ */
+static struct request *set_next_request(void)
+{
+	struct request_queue *q;
+	int old_pos = fdc_queue;
+	struct request *rq;
+
+	do {
+		q = unit[fdc_queue].disk->queue;
+		if (++fdc_queue == FD_MAX_UNITS)
+			fdc_queue = 0;
+		if (q) {
+			rq = blk_fetch_request(q);
+			if (rq)
+				break;
+		}
+	} while (fdc_queue != old_pos);
+
+	return rq;
+}
+
 
 static void redo_fd_request(void)
 {
@@ -1405,7 +1428,7 @@ static void redo_fd_request(void)
 
 repeat:
 	if (!fd_request) {
-		fd_request = blk_fetch_request(floppy_queue);
+		fd_request = set_next_request();
 		if (!fd_request)
 			goto the_end;
 	}
@@ -1932,10 +1955,6 @@ static int __init atari_floppy_init (void)
 	PhysTrackBuffer = virt_to_phys(TrackBuffer);
 	BufferDrive = BufferSide = BufferTrack = -1;
 
-	floppy_queue = blk_init_queue(do_fd_request, &ataflop_lock);
-	if (!floppy_queue)
-		goto Enomem;
-
 	for (i = 0; i < FD_MAX_UNITS; i++) {
 		unit[i].track = -1;
 		unit[i].flags = 0;
@@ -1944,7 +1963,10 @@ static int __init atari_floppy_init (void)
 		sprintf(unit[i].disk->disk_name, "fd%d", i);
 		unit[i].disk->fops = &floppy_fops;
 		unit[i].disk->private_data = &unit[i];
-		unit[i].disk->queue = floppy_queue;
+		unit[i].disk->queue = blk_init_queue(do_fd_request,
+					&ataflop_lock);
+		if (!unit[i].disk->queue)
+			goto Enomem;
 		set_capacity(unit[i].disk, MAX_DISK_SIZE * 2);
 		add_disk(unit[i].disk);
 	}
@@ -1959,10 +1981,14 @@ static int __init atari_floppy_init (void)
 
 	return 0;
 Enomem:
-	while (i--)
+	while (i--) {
+		struct request_queue *q = unit[i].disk->queue;
+
 		put_disk(unit[i].disk);
-	if (floppy_queue)
-		blk_cleanup_queue(floppy_queue);
+		if (q)
+			blk_cleanup_queue(q);
+	}
+
 	unregister_blkdev(FLOPPY_MAJOR, "fd");
 	return -ENOMEM;
 }
@@ -2011,12 +2037,14 @@ static void __exit atari_floppy_exit(void)
 	int i;
 	blk_unregister_region(MKDEV(FLOPPY_MAJOR, 0), 256);
 	for (i = 0; i < FD_MAX_UNITS; i++) {
+		struct request_queue *q = unit[i].disk->queue;
+
 		del_gendisk(unit[i].disk);
 		put_disk(unit[i].disk);
+		blk_cleanup_queue(q);
 	}
 	unregister_blkdev(FLOPPY_MAJOR, "fd");
 
-	blk_cleanup_queue(floppy_queue);
 	del_timer_sync(&fd_timer);
 	atari_stram_free( DMABuffer );
 }
-- 
cgit v1.2.3


From e4ecda1b60bfd2333c12bbe71b153d3b6bdc831a Mon Sep 17 00:00:00 2001
From: Mark Lord <mlord@pobox.com>
Date: Sat, 25 Sep 2010 11:17:22 +0200
Subject: Fix compile error in blk-exec.c for !CONFIG_DETECT_HUNG_TASK

Ensure that 'sysctl_hung_task_timeout_secs' is defined
even when CONFIG_DETECT_HUNG_TASK is not set.
This way we can safely reference it without need for
ifdefs in the code elsewhere.  eg. in block/blk-exec.c

Signed-off-by: Mark Lord <mlord@pobox.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 include/linux/sched.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1e2a6db2d7dd..dbafa9e34a2d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -336,6 +336,9 @@ extern unsigned long sysctl_hung_task_warnings;
 extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
 					 void __user *buffer,
 					 size_t *lenp, loff_t *ppos);
+#else
+/* Avoid need for ifdefs elsewhere in the code */
+enum { sysctl_hung_task_timeout_secs = 0 };
 #endif
 
 /* Attach to any functions which should be ignored in wchan output. */
-- 
cgit v1.2.3


From 504c6d1b44bc6e694bdba8d9a2a4e046275b5e2b Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@redhat.com>
Date: Sun, 26 Sep 2010 12:23:25 +0900
Subject: amiga floppy: Compile failure fixes

o Compile fixes for amiga floppy driver.

Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 drivers/block/amiflop.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c
index ead8b7792c52..327ed27dfe1a 100644
--- a/drivers/block/amiflop.c
+++ b/drivers/block/amiflop.c
@@ -1347,12 +1347,12 @@ static struct request *set_next_request(void)
 	if (fdc_queue == FD_MAX_UNITS)
 		fdc_queue = 0;
 
-	for(cnt = FD_MAX_UNITS; cnt > 0, cnt--) {
+	for(cnt = FD_MAX_UNITS; cnt > 0; cnt--) {
 
 		if (unit[fdc_queue].type->code == FD_NODRIVE) {
 			if (++fdc_queue == FD_MAX_UNITS)
 				fdc_queue = 0;
-			cotinue;
+			continue;
 		}
 
 		q = unit[fdc_queue].gendisk->queue;
@@ -1827,7 +1827,6 @@ static int __init amiga_floppy_probe(struct platform_device *pdev)
 	return 0;
 
 out_probe:
-out_queue:
 	free_irq(IRQ_AMIGA_CIAA_TB, NULL);
 out_irq2:
 	free_irq(IRQ_AMIGA_DSKBLK, NULL);
-- 
cgit v1.2.3


From 260a67a9e534f0c7d49ddd6451833d54ba39ac81 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jaxboe@fusionio.com>
Date: Fri, 1 Oct 2010 14:42:43 +0200
Subject: block: revert bad fix for memory hotplug causing bounces

Revert "block: set the bounce_pfn to the actual DMA limit rather than to max memory"

This reverts commit c49825facfd4969585224a896a5e717f88450cad.

Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-settings.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/block/blk-settings.c b/block/blk-settings.c
index f47af5031eaa..f8f2ddf20613 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -214,14 +214,16 @@ void blk_queue_bounce_limit(struct request_queue *q, u64 dma_mask)
 	 */
 	if (b_pfn < (min_t(u64, 0xffffffffUL, BLK_BOUNCE_HIGH) >> PAGE_SHIFT))
 		dma = 1;
+	q->limits.bounce_pfn = max_low_pfn;
 #else
 	if (b_pfn < blk_max_low_pfn)
 		dma = 1;
-#endif
 	q->limits.bounce_pfn = b_pfn;
+#endif
 	if (dma) {
 		init_emergency_isa_pool();
 		q->bounce_gfp = GFP_NOIO | GFP_DMA;
+		q->limits.bounce_pfn = b_pfn;
 	}
 }
 EXPORT_SYMBOL(blk_queue_bounce_limit);
-- 
cgit v1.2.3


From efb012b361cf9319cd86ff169afa7550b7aa9336 Mon Sep 17 00:00:00 2001
From: Malahal Naineni <malahal@us.ibm.com>
Date: Fri, 1 Oct 2010 14:45:27 +0200
Subject: block: set the bounce_pfn to the actual DMA limit rather than to max
 memory

The bounce_pfn of the request queue in 64 bit systems is set to the
current max_low_pfn. Adding more memory later makes this incorrect.
Memory allocated beyond this boot time max_low_pfn appear to require
bounce buffers (bounce buffers are actually not allocated but used in
calculating segments that may result in "over max segments limit"
errors).

Signed-off-by: Malahal Naineni <malahal@us.ibm.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-settings.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/blk-settings.c b/block/blk-settings.c
index f8f2ddf20613..a3600a7ab8bb 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -214,7 +214,7 @@ void blk_queue_bounce_limit(struct request_queue *q, u64 dma_mask)
 	 */
 	if (b_pfn < (min_t(u64, 0xffffffffUL, BLK_BOUNCE_HIGH) >> PAGE_SHIFT))
 		dma = 1;
-	q->limits.bounce_pfn = max_low_pfn;
+	q->limits.bounce_pfn = max(max_low_pfn, b_pfn);
 #else
 	if (b_pfn < blk_max_low_pfn)
 		dma = 1;
-- 
cgit v1.2.3


From 13f98250f587b7defa39ed738dfa74b600e46e7b Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@redhat.com>
Date: Fri, 1 Oct 2010 14:49:41 +0200
Subject: blkio: Do not export throttle files if CONFIG_BLK_DEV_THROTTLING=n

Currently throttling related files were visible even if user had disabled
throttling using config options. It was switching off background throttling
of bio but not the cgroup files. This patch fixes it.

Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-cgroup.c | 97 ++++++++++++++++++++++++++++--------------------------
 1 file changed, 50 insertions(+), 47 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 20ce6f584e43..86e7066a0b62 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1241,41 +1241,6 @@ struct cftype blkio_files[] = {
 		.read_u64 = blkiocg_file_read_u64,
 		.write_u64 = blkiocg_file_write_u64,
 	},
-	{
-		.name = "throttle.read_bps_device",
-		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
-				BLKIO_THROTL_read_bps_device),
-		.read_seq_string = blkiocg_file_read,
-		.write_string = blkiocg_file_write,
-		.max_write_len = 256,
-	},
-
-	{
-		.name = "throttle.write_bps_device",
-		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
-				BLKIO_THROTL_write_bps_device),
-		.read_seq_string = blkiocg_file_read,
-		.write_string = blkiocg_file_write,
-		.max_write_len = 256,
-	},
-
-	{
-		.name = "throttle.read_iops_device",
-		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
-				BLKIO_THROTL_read_iops_device),
-		.read_seq_string = blkiocg_file_read,
-		.write_string = blkiocg_file_write,
-		.max_write_len = 256,
-	},
-
-	{
-		.name = "throttle.write_iops_device",
-		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
-				BLKIO_THROTL_write_iops_device),
-		.read_seq_string = blkiocg_file_read,
-		.write_string = blkiocg_file_write,
-		.max_write_len = 256,
-	},
 	{
 		.name = "time",
 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
@@ -1294,24 +1259,12 @@ struct cftype blkio_files[] = {
 				BLKIO_PROP_io_service_bytes),
 		.read_map = blkiocg_file_read_map,
 	},
-	{
-		.name = "throttle.io_service_bytes",
-		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
-				BLKIO_THROTL_io_service_bytes),
-		.read_map = blkiocg_file_read_map,
-	},
 	{
 		.name = "io_serviced",
 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
 				BLKIO_PROP_io_serviced),
 		.read_map = blkiocg_file_read_map,
 	},
-	{
-		.name = "throttle.io_serviced",
-		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
-				BLKIO_THROTL_io_serviced),
-		.read_map = blkiocg_file_read_map,
-	},
 	{
 		.name = "io_service_time",
 		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
@@ -1340,6 +1293,56 @@ struct cftype blkio_files[] = {
 		.name = "reset_stats",
 		.write_u64 = blkiocg_reset_stats,
 	},
+#ifdef CONFIG_BLK_DEV_THROTTLING
+	{
+		.name = "throttle.read_bps_device",
+		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
+				BLKIO_THROTL_read_bps_device),
+		.read_seq_string = blkiocg_file_read,
+		.write_string = blkiocg_file_write,
+		.max_write_len = 256,
+	},
+
+	{
+		.name = "throttle.write_bps_device",
+		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
+				BLKIO_THROTL_write_bps_device),
+		.read_seq_string = blkiocg_file_read,
+		.write_string = blkiocg_file_write,
+		.max_write_len = 256,
+	},
+
+	{
+		.name = "throttle.read_iops_device",
+		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
+				BLKIO_THROTL_read_iops_device),
+		.read_seq_string = blkiocg_file_read,
+		.write_string = blkiocg_file_write,
+		.max_write_len = 256,
+	},
+
+	{
+		.name = "throttle.write_iops_device",
+		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
+				BLKIO_THROTL_write_iops_device),
+		.read_seq_string = blkiocg_file_read,
+		.write_string = blkiocg_file_write,
+		.max_write_len = 256,
+	},
+	{
+		.name = "throttle.io_service_bytes",
+		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
+				BLKIO_THROTL_io_service_bytes),
+		.read_map = blkiocg_file_read_map,
+	},
+	{
+		.name = "throttle.io_serviced",
+		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
+				BLKIO_THROTL_io_serviced),
+		.read_map = blkiocg_file_read_map,
+	},
+#endif /* CONFIG_BLK_DEV_THROTTLING */
+
 #ifdef CONFIG_DEBUG_BLK_CGROUP
 	{
 		.name = "avg_queue_size",
-- 
cgit v1.2.3


From 61014e96e6ed55b8db0af31574eec2a75d4e8755 Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@redhat.com>
Date: Fri, 1 Oct 2010 14:49:44 +0200
Subject: blkio: deletion of a cgroup was causes oops

o Now a cgroup list of blkg elements can contain blkg from multiple policies.
  Before sending an unlink event, make sure blkg belongs to they policy. If
  policy does not own the blkg, do not send update for this blkg.

Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-cgroup.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 86e7066a0b62..b06ca70354e3 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1411,13 +1411,14 @@ static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
 		/*
 		 * This blkio_group is being unlinked as associated cgroup is
 		 * going away. Let all the IO controlling policies know about
-		 * this event. Currently this is static call to one io
-		 * controlling policy. Once we have more policies in place, we
-		 * need some dynamic registration of callback function.
+		 * this event.
 		 */
 		spin_lock(&blkio_list_lock);
-		list_for_each_entry(blkiop, &blkio_list, list)
+		list_for_each_entry(blkiop, &blkio_list, list) {
+			if (blkiop->plid != blkg->plid)
+				continue;
 			blkiop->ops.blkio_unlink_group_fn(key, blkg);
+		}
 		spin_unlock(&blkio_list_lock);
 	} while (1);
 
-- 
cgit v1.2.3


From 02977e4af7ed3b478c505e50491ffdf3e1314cf4 Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@redhat.com>
Date: Fri, 1 Oct 2010 14:49:48 +0200
Subject: blkio: Add root group to td->tg_list

o Currently all the dynamically allocated groups, except root grp is added
  to td->tg_list. This was not a problem so far but in next patch I will
  travel through td->tg_list to process any updates of limits on the group.
  If root group is not in tg_list, then root group's updates are not
  processed.

o It is better to root group also to tg_list instead of doing special
  processing for it during limit updates.

Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-throttle.c | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index af53f37c1b13..bc2936b80add 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -87,7 +87,7 @@ struct throtl_data
 	unsigned int nr_queued[2];
 
 	/*
-	 * number of total undestroyed groups (excluding root group)
+	 * number of total undestroyed groups
 	 */
 	unsigned int nr_undestroyed_grps;
 
@@ -940,7 +940,17 @@ int blk_throtl_init(struct request_queue *q)
 	/* Practically unlimited BW */
 	tg->bps[0] = tg->bps[1] = -1;
 	tg->iops[0] = tg->iops[1] = -1;
-	atomic_set(&tg->ref, 1);
+
+	/*
+	 * Set root group reference to 2. One reference will be dropped when
+	 * all groups on tg_list are being deleted during queue exit. Other
+	 * reference will remain there as we don't want to delete this group
+	 * as it is statically allocated and gets destroyed when throtl_data
+	 * goes away.
+	 */
+	atomic_set(&tg->ref, 2);
+	hlist_add_head(&tg->tg_node, &td->tg_list);
+	td->nr_undestroyed_grps++;
 
 	INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work);
 
@@ -966,10 +976,9 @@ void blk_throtl_exit(struct request_queue *q)
 
 	spin_lock_irq(q->queue_lock);
 	throtl_release_tgs(td);
-	blkiocg_del_blkio_group(&td->root_tg.blkg);
 
 	/* If there are other groups */
-	if (td->nr_undestroyed_grps >= 1)
+	if (td->nr_undestroyed_grps > 0)
 		wait = true;
 
 	spin_unlock_irq(q->queue_lock);
-- 
cgit v1.2.3


From fe0714377ee2ca161bf2afb7773e22f15f1786d4 Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@redhat.com>
Date: Fri, 1 Oct 2010 14:49:49 +0200
Subject: blkio: Recalculate the throttled bio dispatch time upon throttle
 limit change

o Currently any cgroup throttle limit changes are processed asynchronousy and
  the change does not take affect till a new bio is dispatched from same group.

o It might happen that a user sets a redicuously low limit on throttling.
  Say 1 bytes per second on reads. In such cases simple operations like mount
  a disk can wait for a very long time.

o Once bio is throttled, there is no easy way to come out of that wait even if
  user increases the read limit later.

o This patch fixes it. Now if a user changes the cgroup limits, we recalculate
  the bio dispatch time according to new limits.

o Can't take queueu lock under blkcg_lock, hence after the change I wake
  up the dispatch thread again which recalculates the time. So there are some
  variables being synchronized across two threads without lock and I had to
  make use of barriers. Hoping I have used barriers correctly. Any review of
  memory barrier code especially will help.

Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-cgroup.c   |  15 ++++--
 block/blk-cgroup.h   |  21 ++++----
 block/blk-throttle.c | 134 ++++++++++++++++++++++++++++++++++++++++++++-------
 block/cfq-iosched.c  |   4 +-
 4 files changed, 139 insertions(+), 35 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index b06ca70354e3..52c12130a5de 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -124,7 +124,8 @@ blkio_update_group_weight(struct blkio_group *blkg, unsigned int weight)
 		if (blkiop->plid != blkg->plid)
 			continue;
 		if (blkiop->ops.blkio_update_group_weight_fn)
-			blkiop->ops.blkio_update_group_weight_fn(blkg, weight);
+			blkiop->ops.blkio_update_group_weight_fn(blkg->key,
+							blkg, weight);
 	}
 }
 
@@ -141,11 +142,13 @@ static inline void blkio_update_group_bps(struct blkio_group *blkg, u64 bps,
 
 		if (fileid == BLKIO_THROTL_read_bps_device
 		    && blkiop->ops.blkio_update_group_read_bps_fn)
-			blkiop->ops.blkio_update_group_read_bps_fn(blkg, bps);
+			blkiop->ops.blkio_update_group_read_bps_fn(blkg->key,
+								blkg, bps);
 
 		if (fileid == BLKIO_THROTL_write_bps_device
 		    && blkiop->ops.blkio_update_group_write_bps_fn)
-			blkiop->ops.blkio_update_group_write_bps_fn(blkg, bps);
+			blkiop->ops.blkio_update_group_write_bps_fn(blkg->key,
+								blkg, bps);
 	}
 }
 
@@ -162,11 +165,13 @@ static inline void blkio_update_group_iops(struct blkio_group *blkg,
 
 		if (fileid == BLKIO_THROTL_read_iops_device
 		    && blkiop->ops.blkio_update_group_read_iops_fn)
-			blkiop->ops.blkio_update_group_read_iops_fn(blkg, iops);
+			blkiop->ops.blkio_update_group_read_iops_fn(blkg->key,
+								blkg, iops);
 
 		if (fileid == BLKIO_THROTL_write_iops_device
 		    && blkiop->ops.blkio_update_group_write_iops_fn)
-			blkiop->ops.blkio_update_group_write_iops_fn(blkg,iops);
+			blkiop->ops.blkio_update_group_write_iops_fn(blkg->key,
+								blkg,iops);
 	}
 }
 
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 2070053a30b1..034c35562dba 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -186,16 +186,17 @@ extern unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg,
 				     dev_t dev);
 
 typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg);
-typedef void (blkio_update_group_weight_fn) (struct blkio_group *blkg,
-						unsigned int weight);
-typedef void (blkio_update_group_read_bps_fn) (struct blkio_group *blkg,
-						u64 read_bps);
-typedef void (blkio_update_group_write_bps_fn) (struct blkio_group *blkg,
-						u64 write_bps);
-typedef void (blkio_update_group_read_iops_fn) (struct blkio_group *blkg,
-						unsigned int read_iops);
-typedef void (blkio_update_group_write_iops_fn) (struct blkio_group *blkg,
-						unsigned int write_iops);
+
+typedef void (blkio_update_group_weight_fn) (void *key,
+			struct blkio_group *blkg, unsigned int weight);
+typedef void (blkio_update_group_read_bps_fn) (void * key,
+			struct blkio_group *blkg, u64 read_bps);
+typedef void (blkio_update_group_write_bps_fn) (void *key,
+			struct blkio_group *blkg, u64 write_bps);
+typedef void (blkio_update_group_read_iops_fn) (void *key,
+			struct blkio_group *blkg, unsigned int read_iops);
+typedef void (blkio_update_group_write_iops_fn) (void *key,
+			struct blkio_group *blkg, unsigned int write_iops);
 
 struct blkio_policy_ops {
 	blkio_unlink_group_fn *blkio_unlink_group_fn;
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index bc2936b80add..11713ed852f4 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -70,6 +70,9 @@ struct throtl_grp {
 	/* When did we start a new slice */
 	unsigned long slice_start[2];
 	unsigned long slice_end[2];
+
+	/* Some throttle limits got updated for the group */
+	bool limits_changed;
 };
 
 struct throtl_data
@@ -93,6 +96,8 @@ struct throtl_data
 
 	/* Work for dispatching throttled bios */
 	struct delayed_work throtl_work;
+
+	atomic_t limits_changed;
 };
 
 enum tg_state_flags {
@@ -592,15 +597,6 @@ static void tg_update_disptime(struct throtl_data *td, struct throtl_grp *tg)
 	min_wait = min(read_wait, write_wait);
 	disptime = jiffies + min_wait;
 
-	/*
-	 * If group is already on active tree, then update dispatch time
-	 * only if it is lesser than existing dispatch time. Otherwise
-	 * always update the dispatch time
-	 */
-
-	if (throtl_tg_on_rr(tg) && time_before(disptime, tg->disptime))
-		return;
-
 	/* Update dispatch time */
 	throtl_dequeue_tg(td, tg);
 	tg->disptime = disptime;
@@ -691,6 +687,46 @@ static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl)
 	return nr_disp;
 }
 
+static void throtl_process_limit_change(struct throtl_data *td)
+{
+	struct throtl_grp *tg;
+	struct hlist_node *pos, *n;
+
+	/*
+	 * Make sure atomic_inc() effects from
+	 * throtl_update_blkio_group_read_bps(), group of functions are
+	 * visible.
+	 * Is this required or smp_mb__after_atomic_inc() was suffcient
+	 * after the atomic_inc().
+	 */
+	smp_rmb();
+	if (!atomic_read(&td->limits_changed))
+		return;
+
+	throtl_log(td, "limit changed =%d", atomic_read(&td->limits_changed));
+
+	hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) {
+		/*
+		 * Do I need an smp_rmb() here to make sure tg->limits_changed
+		 * update is visible. I am relying on smp_rmb() at the
+		 * beginning of function and not putting a new one here.
+		 */
+
+		if (throtl_tg_on_rr(tg) && tg->limits_changed) {
+			throtl_log_tg(td, tg, "limit change rbps=%llu wbps=%llu"
+				" riops=%u wiops=%u", tg->bps[READ],
+				tg->bps[WRITE], tg->iops[READ],
+				tg->iops[WRITE]);
+			tg_update_disptime(td, tg);
+			tg->limits_changed = false;
+		}
+	}
+
+	smp_mb__before_atomic_dec();
+	atomic_dec(&td->limits_changed);
+	smp_mb__after_atomic_dec();
+}
+
 /* Dispatch throttled bios. Should be called without queue lock held. */
 static int throtl_dispatch(struct request_queue *q)
 {
@@ -701,6 +737,8 @@ static int throtl_dispatch(struct request_queue *q)
 
 	spin_lock_irq(q->queue_lock);
 
+	throtl_process_limit_change(td);
+
 	if (!total_nr_queued(td))
 		goto out;
 
@@ -821,28 +859,74 @@ void throtl_unlink_blkio_group(void *key, struct blkio_group *blkg)
 	spin_unlock_irqrestore(td->queue->queue_lock, flags);
 }
 
-static void throtl_update_blkio_group_read_bps (struct blkio_group *blkg,
-			u64 read_bps)
+/*
+ * For all update functions, key should be a valid pointer because these
+ * update functions are called under blkcg_lock, that means, blkg is
+ * valid and in turn key is valid. queue exit path can not race becuase
+ * of blkcg_lock
+ *
+ * Can not take queue lock in update functions as queue lock under blkcg_lock
+ * is not allowed. Under other paths we take blkcg_lock under queue_lock.
+ */
+static void throtl_update_blkio_group_read_bps(void *key,
+				struct blkio_group *blkg, u64 read_bps)
 {
+	struct throtl_data *td = key;
+
 	tg_of_blkg(blkg)->bps[READ] = read_bps;
+	/* Make sure read_bps is updated before setting limits_changed */
+	smp_wmb();
+	tg_of_blkg(blkg)->limits_changed = true;
+
+	/* Make sure tg->limits_changed is updated before td->limits_changed */
+	smp_mb__before_atomic_inc();
+	atomic_inc(&td->limits_changed);
+	smp_mb__after_atomic_inc();
+
+	/* Schedule a work now to process the limit change */
+	throtl_schedule_delayed_work(td->queue, 0);
 }
 
-static void throtl_update_blkio_group_write_bps (struct blkio_group *blkg,
-			u64 write_bps)
+static void throtl_update_blkio_group_write_bps(void *key,
+				struct blkio_group *blkg, u64 write_bps)
 {
+	struct throtl_data *td = key;
+
 	tg_of_blkg(blkg)->bps[WRITE] = write_bps;
+	smp_wmb();
+	tg_of_blkg(blkg)->limits_changed = true;
+	smp_mb__before_atomic_inc();
+	atomic_inc(&td->limits_changed);
+	smp_mb__after_atomic_inc();
+	throtl_schedule_delayed_work(td->queue, 0);
 }
 
-static void throtl_update_blkio_group_read_iops (struct blkio_group *blkg,
-			unsigned int read_iops)
+static void throtl_update_blkio_group_read_iops(void *key,
+			struct blkio_group *blkg, unsigned int read_iops)
 {
+	struct throtl_data *td = key;
+
 	tg_of_blkg(blkg)->iops[READ] = read_iops;
+	smp_wmb();
+	tg_of_blkg(blkg)->limits_changed = true;
+	smp_mb__before_atomic_inc();
+	atomic_inc(&td->limits_changed);
+	smp_mb__after_atomic_inc();
+	throtl_schedule_delayed_work(td->queue, 0);
 }
 
-static void throtl_update_blkio_group_write_iops (struct blkio_group *blkg,
-			unsigned int write_iops)
+static void throtl_update_blkio_group_write_iops(void *key,
+			struct blkio_group *blkg, unsigned int write_iops)
 {
+	struct throtl_data *td = key;
+
 	tg_of_blkg(blkg)->iops[WRITE] = write_iops;
+	smp_wmb();
+	tg_of_blkg(blkg)->limits_changed = true;
+	smp_mb__before_atomic_inc();
+	atomic_inc(&td->limits_changed);
+	smp_mb__after_atomic_inc();
+	throtl_schedule_delayed_work(td->queue, 0);
 }
 
 void throtl_shutdown_timer_wq(struct request_queue *q)
@@ -886,8 +970,14 @@ int blk_throtl_bio(struct request_queue *q, struct bio **biop)
 		/*
 		 * There is already another bio queued in same dir. No
 		 * need to update dispatch time.
+		 * Still update the disptime if rate limits on this group
+		 * were changed.
 		 */
-		update_disptime = false;
+		if (!tg->limits_changed)
+			update_disptime = false;
+		else
+			tg->limits_changed = false;
+
 		goto queue_bio;
 	}
 
@@ -929,6 +1019,7 @@ int blk_throtl_init(struct request_queue *q)
 
 	INIT_HLIST_HEAD(&td->tg_list);
 	td->tg_service_tree = THROTL_RB_ROOT;
+	atomic_set(&td->limits_changed, 0);
 
 	/* Init root group */
 	tg = &td->root_tg;
@@ -996,6 +1087,13 @@ void blk_throtl_exit(struct request_queue *q)
 	 */
 	if (wait)
 		synchronize_rcu();
+
+	/*
+	 * Just being safe to make sure after previous flush if some body did
+	 * update limits through cgroup and another work got queued, cancel
+	 * it.
+	 */
+	throtl_shutdown_timer_wq(q);
 	throtl_td_free(td);
 }
 
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 684592621736..86338d5d4d09 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -951,8 +951,8 @@ static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg)
 	return NULL;
 }
 
-void
-cfq_update_blkio_group_weight(struct blkio_group *blkg, unsigned int weight)
+void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg,
+					unsigned int weight)
 {
 	cfqg_of_blkg(blkg)->weight = weight;
 }
-- 
cgit v1.2.3


From 3aad5d3ee4e4fce8f4b5bb6ca73342dcade42b33 Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@redhat.com>
Date: Fri, 1 Oct 2010 14:51:14 +0200
Subject: blkio-throttle: Fix link failure failure on i386

o Randy Dunlap reported following linux-next failure. This patch fixes it.

on i386:

blk-throttle.c:(.text+0x1abb8): undefined reference to `__udivdi3'
blk-throttle.c:(.text+0x1b1dc): undefined reference to `__udivdi3'

o bytes_per_second interface is 64bit and I was continuing to do 64 bit
  division even on 32bit platform without help of special macros/functions
  hence the failure.

Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Reported-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-throttle.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 11713ed852f4..a46700255719 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -378,7 +378,8 @@ throtl_slice_used(struct throtl_data *td, struct throtl_grp *tg, bool rw)
 static inline void
 throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
 {
-	unsigned long nr_slices, bytes_trim, time_elapsed, io_trim;
+	unsigned long nr_slices, time_elapsed, io_trim;
+	u64 bytes_trim, tmp;
 
 	BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw]));
 
@@ -396,8 +397,10 @@ throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
 
 	if (!nr_slices)
 		return;
+	tmp = tg->bps[rw] * throtl_slice * nr_slices;
+	do_div(tmp, HZ);
+	bytes_trim = tmp;
 
-	bytes_trim = (tg->bps[rw] * throtl_slice * nr_slices)/HZ;
 	io_trim = (tg->iops[rw] * throtl_slice * nr_slices)/HZ;
 
 	if (!bytes_trim && !io_trim)
@@ -415,7 +418,7 @@ throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
 
 	tg->slice_start[rw] += nr_slices * throtl_slice;
 
-	throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%lu io=%lu"
+	throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%llu io=%lu"
 			" start=%lu end=%lu jiffies=%lu",
 			rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim,
 			tg->slice_start[rw], tg->slice_end[rw], jiffies);
@@ -462,7 +465,7 @@ static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg,
 		struct bio *bio, unsigned long *wait)
 {
 	bool rw = bio_data_dir(bio);
-	u64 bytes_allowed, extra_bytes;
+	u64 bytes_allowed, extra_bytes, tmp;
 	unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
 
 	jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
@@ -473,8 +476,9 @@ static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg,
 
 	jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);
 
-	bytes_allowed = (tg->bps[rw] * jiffies_to_msecs(jiffy_elapsed_rnd))
-				/ MSEC_PER_SEC;
+	tmp = tg->bps[rw] * jiffies_to_msecs(jiffy_elapsed_rnd);
+	do_div(tmp, MSEC_PER_SEC);
+	bytes_allowed = tmp;
 
 	if (tg->bytes_disp[rw] + bio->bi_size <= bytes_allowed) {
 		if (wait)
-- 
cgit v1.2.3


From 5e901a2b95db709c5e40599ff4df6029be1e2a12 Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@redhat.com>
Date: Fri, 1 Oct 2010 21:16:38 +0200
Subject: blkio-throttle: There is no need to convert jiffies to milli seconds

o Do not convert jiffies to mili seconds as it is not required. Just work
  with jiffies and HZ.

Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-throttle.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index a46700255719..c1bc1b6c887a 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -439,8 +439,7 @@ static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg,
 
 	jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);
 
-	io_allowed = (tg->iops[rw] * jiffies_to_msecs(jiffy_elapsed_rnd))
-				/ MSEC_PER_SEC;
+	io_allowed = (tg->iops[rw] * jiffy_elapsed_rnd) / HZ;
 
 	if (tg->io_disp[rw] + 1 <= io_allowed) {
 		if (wait)
@@ -476,8 +475,8 @@ static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg,
 
 	jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);
 
-	tmp = tg->bps[rw] * jiffies_to_msecs(jiffy_elapsed_rnd);
-	do_div(tmp, MSEC_PER_SEC);
+	tmp = tg->bps[rw] * jiffy_elapsed_rnd;
+	do_div(tmp, HZ);
 	bytes_allowed = tmp;
 
 	if (tg->bytes_disp[rw] + bio->bi_size <= bytes_allowed) {
-- 
cgit v1.2.3


From 9355aede5a3c4975e0ba8bbfe2b9d1fd73308916 Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@redhat.com>
Date: Fri, 1 Oct 2010 21:16:41 +0200
Subject: blkio-throttle: limit max iops value to UINT_MAX

- Limit max iops value to UINT_MAX and return error to user if value is more
  than that instead of accepting bigger values and truncating implicitly.

Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-cgroup.c | 11 +++++++----
 block/blk-cgroup.h |  3 +++
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 52c12130a5de..0f59b23096db 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -656,10 +656,10 @@ static int blkio_policy_parse_and_set(char *buf,
 {
 	char *s[4], *p, *major_s = NULL, *minor_s = NULL;
 	int ret;
-	unsigned long major, minor, temp, iops;
+	unsigned long major, minor, temp;
 	int i = 0;
 	dev_t dev;
-	u64 bps;
+	u64 bps, iops;
 
 	memset(s, 0, sizeof(s));
 
@@ -731,13 +731,16 @@ static int blkio_policy_parse_and_set(char *buf,
 			break;
 		case BLKIO_THROTL_read_iops_device:
 		case BLKIO_THROTL_write_iops_device:
-			ret = strict_strtoul(s[1], 10, &iops);
+			ret = strict_strtoull(s[1], 10, &iops);
 			if (ret)
 				return -EINVAL;
 
+			if (iops > THROTL_IOPS_MAX)
+				return -EINVAL;
+
 			newpn->plid = plid;
 			newpn->fileid = fileid;
-			newpn->val.iops = iops;
+			newpn->val.iops = (unsigned int)iops;
 			break;
 		}
 		break;
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 034c35562dba..ea4861bdd549 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -20,6 +20,9 @@ enum blkio_policy_id {
 	BLKIO_POLICY_THROTL,		/* Throttling */
 };
 
+/* Max limits for throttle policy */
+#define THROTL_IOPS_MAX		UINT_MAX
+
 #if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE)
 
 #ifndef CONFIG_BLK_CGROUP
-- 
cgit v1.2.3


From c49c06e4960949a9bced708858433fcf6ca36a9c Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@redhat.com>
Date: Fri, 1 Oct 2010 21:16:42 +0200
Subject: blkio-throttle: Fix possible multiplication overflow in iops
 calculations

o User can specify max iops value of 32bit (UINT_MAX), through cgroup
  interface. If a user has specified say 4294967294 (UNIT_MAX  - 2), then
  on 32bit platform, following multiplication can overflow.

  io_allowed = (tg->iops[rw] * jiffy_elapsed_rnd)

o Explicitly cast the multiplication to 64bit and then perform division and
  then check whether result is still great then UNINT_MAX.

Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-throttle.c | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index c1bc1b6c887a..56ad4531b412 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -430,6 +430,7 @@ static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg,
 	bool rw = bio_data_dir(bio);
 	unsigned int io_allowed;
 	unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
+	u64 tmp;
 
 	jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
 
@@ -439,7 +440,20 @@ static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg,
 
 	jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);
 
-	io_allowed = (tg->iops[rw] * jiffy_elapsed_rnd) / HZ;
+	/*
+	 * jiffy_elapsed_rnd should not be a big value as minimum iops can be
+	 * 1 then at max jiffy elapsed should be equivalent of 1 second as we
+	 * will allow dispatch after 1 second and after that slice should
+	 * have been trimmed.
+	 */
+
+	tmp = (u64)tg->iops[rw] * jiffy_elapsed_rnd;
+	do_div(tmp, HZ);
+
+	if (tmp > UINT_MAX)
+		io_allowed = UINT_MAX;
+	else
+		io_allowed = tmp;
 
 	if (tg->io_disp[rw] + 1 <= io_allowed) {
 		if (wait)
-- 
cgit v1.2.3


From 892b6f90db81cccb723d5d92f4fddc2d68b206e1 Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen" <martin.petersen@oracle.com>
Date: Wed, 13 Oct 2010 21:18:03 +0200
Subject: block: Ensure physical block size is unsigned int

Physical block size was declared unsigned int to accomodate the maximum
size reported by READ CAPACITY(16).  Make sure we use the right type in
the related functions.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Acked-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@kernel.org
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-settings.c   | 2 +-
 include/linux/blkdev.h | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/block/blk-settings.c b/block/blk-settings.c
index a3600a7ab8bb..315b88c8cbbb 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -344,7 +344,7 @@ EXPORT_SYMBOL(blk_queue_logical_block_size);
  *   hardware can operate on without reverting to read-modify-write
  *   operations.
  */
-void blk_queue_physical_block_size(struct request_queue *q, unsigned short size)
+void blk_queue_physical_block_size(struct request_queue *q, unsigned int size)
 {
 	q->limits.physical_block_size = size;
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 1341df5806df..8f3dd981b973 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -860,7 +860,7 @@ extern void blk_queue_max_segment_size(struct request_queue *, unsigned int);
 extern void blk_queue_max_discard_sectors(struct request_queue *q,
 		unsigned int max_discard_sectors);
 extern void blk_queue_logical_block_size(struct request_queue *, unsigned short);
-extern void blk_queue_physical_block_size(struct request_queue *, unsigned short);
+extern void blk_queue_physical_block_size(struct request_queue *, unsigned int);
 extern void blk_queue_alignment_offset(struct request_queue *q,
 				       unsigned int alignment);
 extern void blk_limits_io_min(struct queue_limits *limits, unsigned int min);
@@ -1013,7 +1013,7 @@ static inline unsigned int queue_physical_block_size(struct request_queue *q)
 	return q->limits.physical_block_size;
 }
 
-static inline int bdev_physical_block_size(struct block_device *bdev)
+static inline unsigned int bdev_physical_block_size(struct block_device *bdev)
 {
 	return queue_physical_block_size(bdev_get_queue(bdev));
 }
-- 
cgit v1.2.3


From 7e602c0aaf3e686c36cc742119f0f53f42e9befe Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Thu, 27 May 2010 14:49:27 +0200
Subject: drbd: renamed drbd_tl_epoch.n_req to drbd_tl_epoch.n_writes

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_int.h  |  2 +-
 drivers/block/drbd/drbd_main.c | 12 ++++++------
 drivers/block/drbd/drbd_req.c  |  4 ++--
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 352441b0f92f..11b7c6f84cd3 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -697,7 +697,7 @@ struct drbd_tl_epoch {
 	struct list_head requests; /* requests before */
 	struct drbd_tl_epoch *next; /* pointer to the next barrier */
 	unsigned int br_number;  /* the barriers identifier. */
-	int n_req;	/* number of requests attached before this barrier */
+	int n_writes;	/* number of requests attached before this barrier */
 };
 
 struct drbd_request;
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index fa650dd85b90..a9bc6bc62400 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -199,7 +199,7 @@ static int tl_init(struct drbd_conf *mdev)
 	INIT_LIST_HEAD(&b->w.list);
 	b->next = NULL;
 	b->br_number = 4711;
-	b->n_req = 0;
+	b->n_writes = 0;
 	b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
 
 	mdev->oldest_tle = b;
@@ -240,7 +240,7 @@ void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new)
 	INIT_LIST_HEAD(&new->w.list);
 	new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
 	new->next = NULL;
-	new->n_req = 0;
+	new->n_writes = 0;
 
 	newest_before = mdev->newest_tle;
 	/* never send a barrier number == 0, because that is special-cased
@@ -284,9 +284,9 @@ void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
 			barrier_nr, b->br_number);
 		goto bail;
 	}
-	if (b->n_req != set_size) {
-		dev_err(DEV, "BAD! BarrierAck #%u received with n_req=%u, expected n_req=%u!\n",
-			barrier_nr, set_size, b->n_req);
+	if (b->n_writes != set_size) {
+		dev_err(DEV, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n",
+			barrier_nr, set_size, b->n_writes);
 		goto bail;
 	}
 
@@ -378,7 +378,7 @@ void tl_clear(struct drbd_conf *mdev)
 			INIT_LIST_HEAD(&b->w.list);
 			b->w.cb = NULL;
 			b->br_number = new_initial_bnr;
-			b->n_req = 0;
+			b->n_writes = 0;
 
 			mdev->oldest_tle = b;
 			break;
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index f761d98a4e90..976d7941f71e 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -521,7 +521,7 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
 				&mdev->newest_tle->requests);
 
 		/* increment size of current epoch */
-		mdev->newest_tle->n_req++;
+		mdev->newest_tle->n_writes++;
 
 		/* queue work item to send data */
 		D_ASSERT(req->rq_state & RQ_NET_PENDING);
@@ -530,7 +530,7 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
 		drbd_queue_work(&mdev->data.work, &req->w);
 
 		/* close the epoch, in case it outgrew the limit */
-		if (mdev->newest_tle->n_req >= mdev->net_conf->max_epoch_size)
+		if (mdev->newest_tle->n_writes >= mdev->net_conf->max_epoch_size)
 			queue_barrier(mdev);
 
 		break;
-- 
cgit v1.2.3


From 288f422ec13667de40b278535d2a5fb5c77352c4 Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Thu, 27 May 2010 15:07:43 +0200
Subject: drbd: Track all IO requests on the TL, not writes only

With that the drbd_fail_pending_reads() function becomes obsolete.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_main.c     |  2 ++
 drivers/block/drbd/drbd_receiver.c | 37 -------------------------------------
 drivers/block/drbd/drbd_req.c      | 24 +++++++++++++++---------
 drivers/block/drbd/drbd_req.h      |  7 ++++++-
 4 files changed, 23 insertions(+), 47 deletions(-)

diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index a9bc6bc62400..a86e6f1ff7f4 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -401,6 +401,8 @@ void tl_clear(struct drbd_conf *mdev)
 	/* ensure bit indicating barrier is required is clear */
 	clear_bit(CREATE_BARRIER, &mdev->flags);
 
+	memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *));
+
 	spin_unlock_irq(&mdev->req_lock);
 }
 
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 081522d3c742..88a5e1f4ec1d 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -3666,41 +3666,6 @@ static void drbdd(struct drbd_conf *mdev)
 	}
 }
 
-static void drbd_fail_pending_reads(struct drbd_conf *mdev)
-{
-	struct hlist_head *slot;
-	struct hlist_node *pos;
-	struct hlist_node *tmp;
-	struct drbd_request *req;
-	int i;
-
-	/*
-	 * Application READ requests
-	 */
-	spin_lock_irq(&mdev->req_lock);
-	for (i = 0; i < APP_R_HSIZE; i++) {
-		slot = mdev->app_reads_hash+i;
-		hlist_for_each_entry_safe(req, pos, tmp, slot, colision) {
-			/* it may (but should not any longer!)
-			 * be on the work queue; if that assert triggers,
-			 * we need to also grab the
-			 * spin_lock_irq(&mdev->data.work.q_lock);
-			 * and list_del_init here. */
-			D_ASSERT(list_empty(&req->w.list));
-			/* It would be nice to complete outside of spinlock.
-			 * But this is easier for now. */
-			_req_mod(req, connection_lost_while_pending);
-		}
-	}
-	for (i = 0; i < APP_R_HSIZE; i++)
-		if (!hlist_empty(mdev->app_reads_hash+i))
-			dev_warn(DEV, "ASSERT FAILED: app_reads_hash[%d].first: "
-				"%p, should be NULL\n", i, mdev->app_reads_hash[i].first);
-
-	memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *));
-	spin_unlock_irq(&mdev->req_lock);
-}
-
 void drbd_flush_workqueue(struct drbd_conf *mdev)
 {
 	struct drbd_wq_barrier barr;
@@ -3770,8 +3735,6 @@ static void drbd_disconnect(struct drbd_conf *mdev)
 	if (!mdev->state.susp)
 		tl_clear(mdev);
 
-	drbd_fail_pending_reads(mdev);
-
 	dev_info(DEV, "Connection closed\n");
 
 	drbd_md_sync(mdev);
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 976d7941f71e..4a30e2cae56d 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -59,17 +59,19 @@ static void _drbd_end_io_acct(struct drbd_conf *mdev, struct drbd_request *req)
 static void _req_is_done(struct drbd_conf *mdev, struct drbd_request *req, const int rw)
 {
 	const unsigned long s = req->rq_state;
+
+	/* remove it from the transfer log.
+	 * well, only if it had been there in the first
+	 * place... if it had not (local only or conflicting
+	 * and never sent), it should still be "empty" as
+	 * initialized in drbd_req_new(), so we can list_del() it
+	 * here unconditionally */
+	list_del(&req->tl_requests);
+
 	/* if it was a write, we may have to set the corresponding
 	 * bit(s) out-of-sync first. If it had a local part, we need to
 	 * release the reference to the activity log. */
 	if (rw == WRITE) {
-		/* remove it from the transfer log.
-		 * well, only if it had been there in the first
-		 * place... if it had not (local only or conflicting
-		 * and never sent), it should still be "empty" as
-		 * initialized in drbd_req_new(), so we can list_del() it
-		 * here unconditionally */
-		list_del(&req->tl_requests);
 		/* Set out-of-sync unless both OK flags are set
 		 * (local only or remote failed).
 		 * Other places where we set out-of-sync:
@@ -517,8 +519,6 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
 		D_ASSERT(test_bit(CREATE_BARRIER, &mdev->flags) == 0);
 
 		req->epoch = mdev->newest_tle->br_number;
-		list_add_tail(&req->tl_requests,
-				&mdev->newest_tle->requests);
 
 		/* increment size of current epoch */
 		mdev->newest_tle->n_writes++;
@@ -634,6 +634,9 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
 		break;
 
 	case barrier_acked:
+		if (!(req->rq_state & RQ_WRITE))
+			break;
+
 		if (req->rq_state & RQ_NET_PENDING) {
 			/* barrier came in before all requests have been acked.
 			 * this is bad, because if the connection is lost now,
@@ -892,6 +895,9 @@ allocate_barrier:
 		remote = 0;
 	}
 
+
+	list_add_tail(&req->tl_requests, &mdev->newest_tle->requests);
+
 	/* NOTE remote first: to get the concurrent write detection right,
 	 * we must register the request before start of local IO.  */
 	if (remote) {
diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h
index 02d575d24518..47b931fe0366 100644
--- a/drivers/block/drbd/drbd_req.h
+++ b/drivers/block/drbd/drbd_req.h
@@ -183,6 +183,9 @@ enum drbd_req_state_bits {
 
 	/* keep this last, its for the RQ_NET_MASK */
 	__RQ_NET_MAX,
+
+	/* Set when this is a write, clear for a read */
+	__RQ_WRITE,
 };
 
 #define RQ_LOCAL_PENDING   (1UL << __RQ_LOCAL_PENDING)
@@ -201,6 +204,8 @@ enum drbd_req_state_bits {
 /* 0x1f8 */
 #define RQ_NET_MASK        (((1UL << __RQ_NET_MAX)-1) & ~RQ_LOCAL_MASK)
 
+#define RQ_WRITE           (1UL << __RQ_WRITE)
+
 /* epoch entries */
 static inline
 struct hlist_head *ee_hash_slot(struct drbd_conf *mdev, sector_t sector)
@@ -253,7 +258,7 @@ static inline struct drbd_request *drbd_req_new(struct drbd_conf *mdev,
 	if (likely(req)) {
 		bio = bio_clone(bio_src, GFP_NOIO); /* XXX cannot fail?? */
 
-		req->rq_state    = 0;
+		req->rq_state    = bio_data_dir(bio_src) == WRITE ? RQ_WRITE : 0;
 		req->mdev        = mdev;
 		req->master_bio  = bio_src;
 		req->private_bio = bio;
-- 
cgit v1.2.3


From 2a80699f807885d501f08a7006f6a56c1c937a6e Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Wed, 9 Jun 2010 14:07:43 +0200
Subject: drbd: mod_req has now a return value

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_req.c |  5 ++++-
 drivers/block/drbd/drbd_req.h | 17 ++++++++++++-----
 2 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 4a30e2cae56d..d9df1a1c40b9 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -382,10 +382,11 @@ out_conflict:
  *  and it enforces that we have to think in a very structured manner
  *  about the "events" that may happen to a request during its life time ...
  */
-void __req_mod(struct drbd_request *req, enum drbd_req_event what,
+int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 		struct bio_and_error *m)
 {
 	struct drbd_conf *mdev = req->mdev;
+	int rv = 0;
 	m->bio = NULL;
 
 	switch (what) {
@@ -657,6 +658,8 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
 		_req_may_be_done(req, m);
 		break;
 	};
+
+	return rv;
 }
 
 /* we may do a local read if:
diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h
index 47b931fe0366..db37c6e47fa9 100644
--- a/drivers/block/drbd/drbd_req.h
+++ b/drivers/block/drbd/drbd_req.h
@@ -297,36 +297,43 @@ struct bio_and_error {
 
 extern void _req_may_be_done(struct drbd_request *req,
 		struct bio_and_error *m);
-extern void __req_mod(struct drbd_request *req, enum drbd_req_event what,
+extern int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 		struct bio_and_error *m);
 extern void complete_master_bio(struct drbd_conf *mdev,
 		struct bio_and_error *m);
 
 /* use this if you don't want to deal with calling complete_master_bio()
  * outside the spinlock, e.g. when walking some list on cleanup. */
-static inline void _req_mod(struct drbd_request *req, enum drbd_req_event what)
+static inline int _req_mod(struct drbd_request *req, enum drbd_req_event what)
 {
 	struct drbd_conf *mdev = req->mdev;
 	struct bio_and_error m;
+	int rv;
 
 	/* __req_mod possibly frees req, do not touch req after that! */
-	__req_mod(req, what, &m);
+	rv = __req_mod(req, what, &m);
 	if (m.bio)
 		complete_master_bio(mdev, &m);
+
+	return rv;
 }
 
 /* completion of master bio is outside of spinlock.
  * If you need it irqsave, do it your self! */
-static inline void req_mod(struct drbd_request *req,
+static inline int req_mod(struct drbd_request *req,
 		enum drbd_req_event what)
 {
 	struct drbd_conf *mdev = req->mdev;
 	struct bio_and_error m;
+	int rv;
+
 	spin_lock_irq(&mdev->req_lock);
-	__req_mod(req, what, &m);
+	rv = __req_mod(req, what, &m);
 	spin_unlock_irq(&mdev->req_lock);
 
 	if (m.bio)
 		complete_master_bio(mdev, &m);
+
+	return rv;
 }
 #endif
-- 
cgit v1.2.3


From 11b58e73a3a3d1bbb582370d59f9b2c4d0136b42 Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Wed, 12 May 2010 17:08:26 +0200
Subject: drbd: factored tl_restart() out of tl_clear().

If IO was frozen for a temporal network outage, resend the
content of the transfer-log into the newly established connection.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_int.h      |   2 +
 drivers/block/drbd/drbd_main.c     | 116 +++++++++++++++++++++++++------------
 drivers/block/drbd/drbd_receiver.c |   3 -
 drivers/block/drbd/drbd_req.c      |  14 +++++
 drivers/block/drbd/drbd_req.h      |   8 +++
 5 files changed, 103 insertions(+), 40 deletions(-)

diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 11b7c6f84cd3..bef9138f1975 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -1138,6 +1138,8 @@ extern void drbd_free_resources(struct drbd_conf *mdev);
 extern void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
 		       unsigned int set_size);
 extern void tl_clear(struct drbd_conf *mdev);
+enum drbd_req_event;
+extern void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what);
 extern void _tl_add_barrier(struct drbd_conf *, struct drbd_tl_epoch *);
 extern void drbd_free_sock(struct drbd_conf *mdev);
 extern int drbd_send(struct drbd_conf *mdev, struct socket *sock,
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index a86e6f1ff7f4..a8a0341fce53 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -333,59 +333,94 @@ bail:
 	drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
 }
 
-
 /**
- * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
+ * _tl_restart() - Walks the transfer log, and applies an action to all requests
  * @mdev:	DRBD device.
+ * @what:       The action/event to perform with all request objects
  *
- * This is called after the connection to the peer was lost. The storage covered
- * by the requests on the transfer gets marked as our of sync. Called from the
- * receiver thread and the worker thread.
+ * @what might be one of connection_lost_while_pending, resend, fail_frozen_disk_io,
+ * restart_frozen_disk_io.
  */
-void tl_clear(struct drbd_conf *mdev)
+static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
 {
-	struct drbd_tl_epoch *b, *tmp;
+	struct drbd_tl_epoch *b, *tmp, **pn;
 	struct list_head *le, *tle;
-	struct drbd_request *r;
-	int new_initial_bnr = net_random();
-
-	spin_lock_irq(&mdev->req_lock);
+	struct drbd_request *req;
+	int rv, n_writes, n_reads;
 
 	b = mdev->oldest_tle;
+	pn = &mdev->oldest_tle;
 	while (b) {
+		n_writes = 0;
+		n_reads = 0;
 		list_for_each_safe(le, tle, &b->requests) {
-			r = list_entry(le, struct drbd_request, tl_requests);
-			/* It would be nice to complete outside of spinlock.
-			 * But this is easier for now. */
-			_req_mod(r, connection_lost_while_pending);
+			req = list_entry(le, struct drbd_request, tl_requests);
+			rv = _req_mod(req, what);
+
+			n_writes += (rv & MR_WRITE) >> MR_WRITE_SHIFT;
+			n_reads  += (rv & MR_READ) >> MR_READ_SHIFT;
 		}
 		tmp = b->next;
 
-		/* there could still be requests on that ring list,
-		 * in case local io is still pending */
-		list_del(&b->requests);
-
-		/* dec_ap_pending corresponding to queue_barrier.
-		 * the newest barrier may not have been queued yet,
-		 * in which case w.cb is still NULL. */
-		if (b->w.cb != NULL)
-			dec_ap_pending(mdev);
-
-		if (b == mdev->newest_tle) {
-			/* recycle, but reinit! */
-			D_ASSERT(tmp == NULL);
-			INIT_LIST_HEAD(&b->requests);
-			INIT_LIST_HEAD(&b->w.list);
-			b->w.cb = NULL;
-			b->br_number = new_initial_bnr;
-			b->n_writes = 0;
-
-			mdev->oldest_tle = b;
-			break;
+		if (n_writes + n_reads) {
+			if (what == resend) {
+				b->n_writes = n_writes;
+				if (b->w.cb == NULL) {
+					b->w.cb = w_send_barrier;
+					inc_ap_pending(mdev);
+					set_bit(CREATE_BARRIER, &mdev->flags);
+				}
+
+				drbd_queue_work(&mdev->data.work, &b->w);
+			}
+			pn = &b->next;
+		} else {
+			/* there could still be requests on that ring list,
+			 * in case local io is still pending */
+			list_del(&b->requests);
+
+			/* dec_ap_pending corresponding to queue_barrier.
+			 * the newest barrier may not have been queued yet,
+			 * in which case w.cb is still NULL. */
+			if (b->w.cb != NULL)
+				dec_ap_pending(mdev);
+
+			if (b == mdev->newest_tle) {
+				/* recycle, but reinit! */
+				D_ASSERT(tmp == NULL);
+				INIT_LIST_HEAD(&b->requests);
+				INIT_LIST_HEAD(&b->w.list);
+				b->w.cb = NULL;
+				b->br_number = net_random();
+				b->n_writes = 0;
+
+				*pn = b;
+				break;
+			}
+			*pn = tmp;
+			kfree(b);
 		}
-		kfree(b);
 		b = tmp;
 	}
+}
+
+
+/**
+ * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
+ * @mdev:	DRBD device.
+ *
+ * This is called after the connection to the peer was lost. The storage covered
+ * by the requests on the transfer gets marked as our of sync. Called from the
+ * receiver thread and the worker thread.
+ */
+void tl_clear(struct drbd_conf *mdev)
+{
+	struct list_head *le, *tle;
+	struct drbd_request *r;
+
+	spin_lock_irq(&mdev->req_lock);
+
+	_tl_restart(mdev, connection_lost_while_pending);
 
 	/* we expect this list to be empty. */
 	D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
@@ -406,6 +441,13 @@ void tl_clear(struct drbd_conf *mdev)
 	spin_unlock_irq(&mdev->req_lock);
 }
 
+void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
+{
+	spin_lock_irq(&mdev->req_lock);
+	_tl_restart(mdev, what);
+	spin_unlock_irq(&mdev->req_lock);
+}
+
 /**
  * cl_wide_st_chg() - TRUE if the state change is a cluster wide one
  * @mdev:	DRBD device.
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 88a5e1f4ec1d..8daa920c40a4 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -776,9 +776,6 @@ static int drbd_connect(struct drbd_conf *mdev)
 
 	D_ASSERT(!mdev->data.socket);
 
-	if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags))
-		dev_err(DEV, "CREATE_BARRIER flag was set in drbd_connect - now cleared!\n");
-
 	if (drbd_request_state(mdev, NS(conn, C_WF_CONNECTION)) < SS_SUCCESS)
 		return -2;
 
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index d9df1a1c40b9..39c2cc3614e4 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -634,6 +634,20 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 		/* else: done by handed_over_to_network */
 		break;
 
+	case resend:
+		/* If RQ_NET_OK is already set, we got a P_WRITE_ACK or P_RECV_ACK
+		   before the connection loss; only P_BARRIER_ACK was missing.
+		   Trowing them out of the TL here by pretending we got a BARRIER_ACK
+		   TODO: Either resync them, or ensure peer was not rebooted. */
+		if (!(req->rq_state & RQ_NET_OK)) {
+			if (req->w.cb) {
+				drbd_queue_work(&mdev->data.work, &req->w);
+				rv = req->rq_state & RQ_WRITE ? MR_WRITE : MR_READ;
+			}
+			break;
+		}
+		/* else, fall through to barrier_acked */
+
 	case barrier_acked:
 		if (!(req->rq_state & RQ_WRITE))
 			break;
diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h
index db37c6e47fa9..1bcb85539735 100644
--- a/drivers/block/drbd/drbd_req.h
+++ b/drivers/block/drbd/drbd_req.h
@@ -104,6 +104,7 @@ enum drbd_req_event {
 	read_ahead_completed_with_error,
 	write_completed_with_error,
 	completed_ok,
+	resend,
 	nothing, /* for tracing only */
 };
 
@@ -206,6 +207,13 @@ enum drbd_req_state_bits {
 
 #define RQ_WRITE           (1UL << __RQ_WRITE)
 
+/* For waking up the frozen transfer log mod_req() has to return if the request
+   should be counted in the epoch object*/
+#define MR_WRITE_SHIFT 0
+#define MR_WRITE       (1 << MR_WRITE_SHIFT)
+#define MR_READ_SHIFT  1
+#define MR_READ        (1 << MR_READ_SHIFT)
+
 /* epoch entries */
 static inline
 struct hlist_head *ee_hash_slot(struct drbd_conf *mdev, sector_t sector)
-- 
cgit v1.2.3


From b9b98716f83856b928f1c985ab55520c67663dd2 Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Tue, 22 Jun 2010 11:26:48 +0200
Subject: drbd: Do not send two barriers without any writes between them

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_main.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index a8a0341fce53..7d359863ae32 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -344,7 +344,7 @@ bail:
 static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
 {
 	struct drbd_tl_epoch *b, *tmp, **pn;
-	struct list_head *le, *tle;
+	struct list_head *le, *tle, carry_reads;
 	struct drbd_request *req;
 	int rv, n_writes, n_reads;
 
@@ -353,6 +353,7 @@ static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
 	while (b) {
 		n_writes = 0;
 		n_reads = 0;
+		INIT_LIST_HEAD(&carry_reads);
 		list_for_each_safe(le, tle, &b->requests) {
 			req = list_entry(le, struct drbd_request, tl_requests);
 			rv = _req_mod(req, what);
@@ -362,7 +363,7 @@ static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
 		}
 		tmp = b->next;
 
-		if (n_writes + n_reads) {
+		if (n_writes) {
 			if (what == resend) {
 				b->n_writes = n_writes;
 				if (b->w.cb == NULL) {
@@ -375,6 +376,8 @@ static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
 			}
 			pn = &b->next;
 		} else {
+			if (n_reads)
+				list_add(&carry_reads, &b->requests);
 			/* there could still be requests on that ring list,
 			 * in case local io is still pending */
 			list_del(&b->requests);
@@ -389,6 +392,7 @@ static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
 				/* recycle, but reinit! */
 				D_ASSERT(tmp == NULL);
 				INIT_LIST_HEAD(&b->requests);
+				list_splice(&carry_reads, &b->requests);
 				INIT_LIST_HEAD(&b->w.list);
 				b->w.cb = NULL;
 				b->br_number = net_random();
@@ -401,6 +405,7 @@ static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
 			kfree(b);
 		}
 		b = tmp;
+		list_splice(&carry_reads, &b->requests);
 	}
 }
 
-- 
cgit v1.2.3


From 5ba82308ea766b33404cb130a88fe4113d9c20a3 Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Thu, 10 Jun 2010 13:30:36 +0200
Subject: drbd: factored drbd_req_make_private_bio() out of drbd_req_new()

Preparing tl_thaw_dio()

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_req.h | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h
index 1bcb85539735..07cb3b12edb4 100644
--- a/drivers/block/drbd/drbd_req.h
+++ b/drivers/block/drbd/drbd_req.h
@@ -257,30 +257,36 @@ static inline struct drbd_request *_ar_id_to_req(struct drbd_conf *mdev,
 	return NULL;
 }
 
+static inline void drbd_req_make_private_bio(struct drbd_request *req, struct bio *bio_src)
+{
+	struct bio *bio;
+	bio = bio_clone(bio_src, GFP_NOIO); /* XXX cannot fail?? */
+
+	req->private_bio = bio;
+
+	bio->bi_private  = req;
+	bio->bi_end_io   = drbd_endio_pri;
+	bio->bi_next     = NULL;
+}
+
 static inline struct drbd_request *drbd_req_new(struct drbd_conf *mdev,
 	struct bio *bio_src)
 {
-	struct bio *bio;
 	struct drbd_request *req =
 		mempool_alloc(drbd_request_mempool, GFP_NOIO);
 	if (likely(req)) {
-		bio = bio_clone(bio_src, GFP_NOIO); /* XXX cannot fail?? */
+		drbd_req_make_private_bio(req, bio_src);
 
 		req->rq_state    = bio_data_dir(bio_src) == WRITE ? RQ_WRITE : 0;
 		req->mdev        = mdev;
 		req->master_bio  = bio_src;
-		req->private_bio = bio;
 		req->epoch       = 0;
-		req->sector      = bio->bi_sector;
-		req->size        = bio->bi_size;
+		req->sector      = bio_src->bi_sector;
+		req->size        = bio_src->bi_size;
 		req->start_time  = jiffies;
 		INIT_HLIST_NODE(&req->colision);
 		INIT_LIST_HEAD(&req->tl_requests);
 		INIT_LIST_HEAD(&req->w.list);
-
-		bio->bi_private  = req;
-		bio->bi_end_io   = drbd_endio_pri;
-		bio->bi_next     = NULL;
 	}
 	return req;
 }
-- 
cgit v1.2.3


From 905cd7d8ac9b18e1f122b90dbebe1246b1c364fd Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Mon, 10 May 2010 16:03:10 +0200
Subject: drbd: Removed redundant error checks in the request code path

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_req.c | 15 ---------------
 1 file changed, 15 deletions(-)

diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 39c2cc3614e4..48647589aa0d 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -984,21 +984,6 @@ static int drbd_fail_request_early(struct drbd_conf *mdev, int is_write)
 		return 1;
 	}
 
-	/*
-	 * Paranoia: we might have been primary, but sync target, or
-	 * even diskless, then lost the connection.
-	 * This should have been handled (panic? suspend?) somewhere
-	 * else. But maybe it was not, so check again here.
-	 * Caution: as long as we do not have a read/write lock on mdev,
-	 * to serialize state changes, this is racy, since we may lose
-	 * the connection *after* we test for the cstate.
-	 */
-	if (mdev->state.disk < D_UP_TO_DATE && mdev->state.pdsk < D_UP_TO_DATE) {
-		if (__ratelimit(&drbd_ratelimit_state))
-			dev_err(DEV, "Sorry, I have no access to good data anymore.\n");
-		return 1;
-	}
-
 	return 0;
 }
 
-- 
cgit v1.2.3


From 265be2d09853d425ad14a61cda0ca63345613d0c Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Mon, 31 May 2010 10:14:17 +0200
Subject: drbd: Finished the "on-no-data-accessible suspend-io;" functionality

When no data is accessible (no connection to the peer, nor a local disk)
allow the user to select to freeze all IO operations instead of getting
IO errors.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_int.h    |  1 +
 drivers/block/drbd/drbd_main.c   | 26 +++++++++++++++++++++++++-
 drivers/block/drbd/drbd_nl.c     | 13 +++++++++++++
 drivers/block/drbd/drbd_req.c    | 24 ++++++++++++++++++++++++
 drivers/block/drbd/drbd_req.h    |  2 ++
 drivers/block/drbd/drbd_worker.c | 18 ++++++++++++++++++
 include/linux/drbd.h             |  5 +++++
 include/linux/drbd_limits.h      |  1 +
 include/linux/drbd_nl.h          |  1 +
 9 files changed, 90 insertions(+), 1 deletion(-)

diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index bef9138f1975..03cc975b9e6c 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -1469,6 +1469,7 @@ extern int w_send_barrier(struct drbd_conf *, struct drbd_work *, int);
 extern int w_send_read_req(struct drbd_conf *, struct drbd_work *, int);
 extern int w_prev_work_done(struct drbd_conf *, struct drbd_work *, int);
 extern int w_e_reissue(struct drbd_conf *, struct drbd_work *, int);
+extern int w_restart_disk_io(struct drbd_conf *, struct drbd_work *, int);
 
 extern void resync_timer_fn(unsigned long data);
 
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 7d359863ae32..106b9abdc430 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -925,7 +925,12 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state
 	if (fp == FP_STONITH &&
 	    (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) &&
 	    !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED))
-		ns.susp = 1;
+		ns.susp = 1; /* Suspend IO while fence-peer handler runs (peer lost) */
+
+	if (mdev->sync_conf.on_no_data == OND_SUSPEND_IO &&
+	    (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) &&
+	    !(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE))
+		ns.susp = 1; /* Suspend IO while no data available (no accessible data available) */
 
 	if (ns.aftr_isp || ns.peer_isp || ns.user_isp) {
 		if (ns.conn == C_SYNC_SOURCE)
@@ -1236,6 +1241,25 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
 	/* Here we have the actions that are performed after a
 	   state change. This function might sleep */
 
+	if (os.susp && ns.susp && mdev->sync_conf.on_no_data == OND_SUSPEND_IO) {
+		if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
+			if (ns.conn == C_CONNECTED) {
+				spin_lock_irq(&mdev->req_lock);
+				_tl_restart(mdev, resend);
+				_drbd_set_state(_NS(mdev, susp, 0), CS_VERBOSE, NULL);
+				spin_unlock_irq(&mdev->req_lock);
+			} else /* ns.conn > C_CONNECTED */
+				dev_err(DEV, "Unexpected Resynd going on!\n");
+		}
+
+		if (os.disk == D_ATTACHING && ns.disk > D_ATTACHING) {
+			spin_lock_irq(&mdev->req_lock);
+			_tl_restart(mdev, restart_frozen_disk_io);
+			_drbd_set_state(_NS(mdev, susp, 0), CS_VERBOSE, NULL);
+			spin_unlock_irq(&mdev->req_lock);
+		}
+	}
+
 	if (fp == FP_STONITH && ns.susp) {
 		/* case1: The outdate peer handler is successful:
 		 * case2: The connection was established again: */
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 73131c5ae339..563a6ade0179 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -33,6 +33,7 @@
 #include <linux/blkpg.h>
 #include <linux/cpumask.h>
 #include "drbd_int.h"
+#include "drbd_req.h"
 #include "drbd_wrappers.h"
 #include <asm/unaligned.h>
 #include <linux/drbd_tag_magic.h>
@@ -494,6 +495,8 @@ char *ppsize(char *buf, unsigned long long size)
 void drbd_suspend_io(struct drbd_conf *mdev)
 {
 	set_bit(SUSPEND_IO, &mdev->flags);
+	if (mdev->state.susp)
+		return;
 	wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt));
 }
 
@@ -1557,6 +1560,7 @@ static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n
 		sc.rate       = DRBD_RATE_DEF;
 		sc.after      = DRBD_AFTER_DEF;
 		sc.al_extents = DRBD_AL_EXTENTS_DEF;
+		sc.on_no_data  = DRBD_ON_NO_DATA_DEF;
 	} else
 		memcpy(&sc, &mdev->sync_conf, sizeof(struct syncer_conf));
 
@@ -1765,7 +1769,16 @@ static int drbd_nl_suspend_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl
 static int drbd_nl_resume_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
 			     struct drbd_nl_cfg_reply *reply)
 {
+	drbd_suspend_io(mdev);
 	reply->ret_code = drbd_request_state(mdev, NS(susp, 0));
+	if (reply->ret_code == SS_SUCCESS) {
+		if (mdev->state.conn < C_CONNECTED)
+			tl_clear(mdev);
+		if (mdev->state.disk == D_DISKLESS || mdev->state.disk == D_FAILED)
+			tl_restart(mdev, fail_frozen_disk_io);
+	}
+	drbd_resume_io(mdev);
+
 	return 0;
 }
 
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 48647589aa0d..8259d4f77285 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -226,6 +226,8 @@ void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m)
 		return;
 	if (s & RQ_LOCAL_PENDING)
 		return;
+	if (mdev->state.susp)
+		return;
 
 	if (req->master_bio) {
 		/* this is data_received (remote read)
@@ -634,6 +636,28 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 		/* else: done by handed_over_to_network */
 		break;
 
+	case fail_frozen_disk_io:
+		if (!(req->rq_state & RQ_LOCAL_COMPLETED))
+			break;
+
+		_req_may_be_done(req, m);
+		break;
+
+	case restart_frozen_disk_io:
+		if (!(req->rq_state & RQ_LOCAL_COMPLETED))
+			break;
+
+		req->rq_state &= ~RQ_LOCAL_COMPLETED;
+
+		rv = MR_READ;
+		if (bio_data_dir(req->master_bio) == WRITE)
+			rv = MR_WRITE;
+
+		get_ldev(mdev);
+		req->w.cb = w_restart_disk_io;
+		drbd_queue_work(&mdev->data.work, &req->w);
+		break;
+
 	case resend:
 		/* If RQ_NET_OK is already set, we got a P_WRITE_ACK or P_RECV_ACK
 		   before the connection loss; only P_BARRIER_ACK was missing.
diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h
index 07cb3b12edb4..f2e45aaa2cd5 100644
--- a/drivers/block/drbd/drbd_req.h
+++ b/drivers/block/drbd/drbd_req.h
@@ -105,6 +105,8 @@ enum drbd_req_event {
 	write_completed_with_error,
 	completed_ok,
 	resend,
+	fail_frozen_disk_io,
+	restart_frozen_disk_io,
 	nothing, /* for tracing only */
 };
 
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index ca4a16cea2d8..3c1e88480d37 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -1173,6 +1173,24 @@ int w_send_read_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
 	return ok;
 }
 
+int w_restart_disk_io(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+	struct drbd_request *req = container_of(w, struct drbd_request, w);
+
+	if (bio_data_dir(req->master_bio) == WRITE)
+		drbd_al_begin_io(mdev, req->sector);
+	/* Calling drbd_al_begin_io() out of the worker might deadlocks
+	   theoretically. Practically it can not deadlock, since this is
+	   only used when unfreezing IOs. All the extents of the requests
+	   that made it into the TL are already active */
+
+	drbd_req_make_private_bio(req, req->master_bio);
+	req->private_bio->bi_bdev = mdev->ldev->backing_bdev;
+	generic_make_request(req->private_bio);
+
+	return 1;
+}
+
 static int _drbd_may_sync_now(struct drbd_conf *mdev)
 {
 	struct drbd_conf *odev = mdev;
diff --git a/include/linux/drbd.h b/include/linux/drbd.h
index 479ee3a1d901..7be069fcca57 100644
--- a/include/linux/drbd.h
+++ b/include/linux/drbd.h
@@ -91,6 +91,11 @@ enum drbd_after_sb_p {
 	ASB_VIOLENTLY
 };
 
+enum drbd_on_no_data {
+	OND_IO_ERROR,
+	OND_SUSPEND_IO
+};
+
 /* KEEP the order, do not delete or insert. Only append. */
 enum drbd_ret_codes {
 	ERR_CODE_BASE		= 100,
diff --git a/include/linux/drbd_limits.h b/include/linux/drbd_limits.h
index 440b42e38e89..7eb1e98009ec 100644
--- a/include/linux/drbd_limits.h
+++ b/include/linux/drbd_limits.h
@@ -128,6 +128,7 @@
 #define DRBD_AFTER_SB_1P_DEF ASB_DISCONNECT
 #define DRBD_AFTER_SB_2P_DEF ASB_DISCONNECT
 #define DRBD_RR_CONFLICT_DEF ASB_DISCONNECT
+#define DRBD_ON_NO_DATA_DEF OND_IO_ERROR
 
 #define DRBD_MAX_BIO_BVECS_MIN 0
 #define DRBD_MAX_BIO_BVECS_MAX 128
diff --git a/include/linux/drbd_nl.h b/include/linux/drbd_nl.h
index 5f042810a56c..9aebd0d80a5d 100644
--- a/include/linux/drbd_nl.h
+++ b/include/linux/drbd_nl.h
@@ -87,6 +87,7 @@ NL_PACKET(syncer_conf, 8,
 	NL_STRING(      51,     T_MAY_IGNORE,   cpu_mask,       32)
 	NL_STRING(	64,	T_MAY_IGNORE,	csums_alg,	SHARED_SECRET_MAX)
 	NL_BIT(         65,     T_MAY_IGNORE,   use_rle)
+	NL_INTEGER(	75,	T_MAY_IGNORE,	on_no_data)
 )
 
 NL_PACKET(invalidate, 9, )
-- 
cgit v1.2.3


From 47ff2d0a8e7ce87fed180729e8341f650bf585c8 Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Fri, 18 Jun 2010 13:56:57 +0200
Subject: drbd: Do not allow a fencing-policy of resource-and-stonith with
 protocol A

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_nl.c  | 20 +++++++++++++++++++-
 drivers/block/drbd/drbd_req.c |  2 +-
 include/linux/drbd.h          |  1 +
 3 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 563a6ade0179..5288bd72cd27 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -806,6 +806,15 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
 		goto fail;
 	}
 
+	if (get_net_conf(mdev)) {
+		int prot = mdev->net_conf->wire_protocol;
+		put_net_conf(mdev);
+		if (nbc->dc.fencing == FP_STONITH && prot == DRBD_PROT_A) {
+			retcode = ERR_STONITH_AND_PROT_A;
+			goto fail;
+		}
+	}
+
 	nbc->lo_file = filp_open(nbc->dc.backing_dev, O_RDWR, 0);
 	if (IS_ERR(nbc->lo_file)) {
 		dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.backing_dev,
@@ -1238,7 +1247,16 @@ static int drbd_nl_net_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
 	    && (new_conf->wire_protocol != DRBD_PROT_C)) {
 		retcode = ERR_NOT_PROTO_C;
 		goto fail;
-	};
+	}
+
+	if (get_ldev(mdev)) {
+		enum drbd_fencing_p fp = mdev->ldev->dc.fencing;
+		put_ldev(mdev);
+		if (new_conf->wire_protocol == DRBD_PROT_A && fp == FP_STONITH) {
+			retcode = ERR_STONITH_AND_PROT_A;
+			goto fail;
+		}
+	}
 
 	if (mdev->state.role == R_PRIMARY && new_conf->want_lose) {
 		retcode = ERR_DISCARD;
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 8259d4f77285..fbe027886bad 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -660,7 +660,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 
 	case resend:
 		/* If RQ_NET_OK is already set, we got a P_WRITE_ACK or P_RECV_ACK
-		   before the connection loss; only P_BARRIER_ACK was missing.
+		   before the connection loss (B&C only); only P_BARRIER_ACK was missing.
 		   Trowing them out of the TL here by pretending we got a BARRIER_ACK
 		   TODO: Either resync them, or ensure peer was not rebooted. */
 		if (!(req->rq_state & RQ_NET_OK)) {
diff --git a/include/linux/drbd.h b/include/linux/drbd.h
index 7be069fcca57..0b2bfb58d9c5 100644
--- a/include/linux/drbd.h
+++ b/include/linux/drbd.h
@@ -145,6 +145,7 @@ enum drbd_ret_codes {
 	ERR_CONNECTED		= 151, /* DRBD 8.3 only */
 	ERR_PERM		= 152,
 	ERR_NEED_APV_93		= 153,
+	ERR_STONITH_AND_PROT_A  = 154,
 
 	/* insert new ones above this line */
 	AFTER_LAST_ERR_CODE
-- 
cgit v1.2.3


From 894c6a946199cf91e52bc1864c3dc6529cceb3db Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Fri, 18 Jun 2010 16:03:20 +0200
Subject: drbd: Disabled the crashed_primary detection for re-attach of last
 data while IO is frozen

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_nl.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 5288bd72cd27..cda7cb3202b9 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -1033,7 +1033,9 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
 	else
 		clear_bit(CRASHED_PRIMARY, &mdev->flags);
 
-	if (drbd_md_test_flag(mdev->ldev, MDF_PRIMARY_IND)) {
+	if (drbd_md_test_flag(mdev->ldev, MDF_PRIMARY_IND) &&
+	    !(mdev->state.role == R_PRIMARY && mdev->state.susp &&
+	      mdev->sync_conf.on_no_data == OND_SUSPEND_IO)) {
 		set_bit(CRASHED_PRIMARY, &mdev->flags);
 		cp_discovered = 1;
 	}
-- 
cgit v1.2.3


From 18a50fa213d46d5592f6542c91ab4c4760cf346c Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Mon, 21 Jun 2010 14:14:15 +0200
Subject: drbd: Now we need to handle the ed_uuid of an diskless, unconnected
 primary correctly

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_main.c     | 4 +++-
 drivers/block/drbd/drbd_receiver.c | 5 +++++
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 106b9abdc430..c701805ed4b9 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -1297,8 +1297,10 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
 	}
 
 	if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
-		if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0)
+		if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0) {
 			drbd_uuid_new_current(mdev);
+			drbd_send_uuids(mdev);
+		}
 
 		/* D_DISKLESS Peer becomes secondary */
 		if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 8daa920c40a4..72bc1a130645 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -3104,6 +3104,11 @@ static int receive_uuids(struct drbd_conf *mdev, struct p_header *h)
 			drbd_md_sync(mdev);
 		}
 		put_ldev(mdev);
+	} else if (mdev->state.disk < D_INCONSISTENT &&
+		   mdev->state.role == R_PRIMARY) {
+		/* I am a diskless primary, the peer just created a new current UUID
+		   for me. */
+		drbd_set_ed_uuid(mdev, p_uuid[UI_CURRENT]);
 	}
 
 	/* Before we test for the disk state, we should wait until an eventually
-- 
cgit v1.2.3


From 999122bc188a4d0a4847bdf1915d357bd6ab53dc Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Thu, 10 Jun 2010 16:46:54 +0200
Subject: drbd: Removing a by now obsolete clause in the state sanitizing

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_main.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index c701805ed4b9..c502648b7b40 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -847,9 +847,6 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state
 	if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY)
 		ns.aftr_isp = 0;
 
-	if (ns.conn <= C_DISCONNECTING && ns.disk == D_DISKLESS)
-		ns.pdsk = D_UNKNOWN;
-
 	/* Abort resync if a disk fails/detaches */
 	if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED &&
 	    (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
-- 
cgit v1.2.3


From 1616a25493cce727d582886f162c5bb0abd87e6a Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Thu, 10 Jun 2010 16:55:15 +0200
Subject: drbd: Reduce the verbosity of some state transitions

State transitions in the space of non-allowed states used
to be very noisy. Reduce that, since that has little value
for the majority of the user base.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_main.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index c502648b7b40..40baddd94a5b 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -997,14 +997,8 @@ int __drbd_set_state(struct drbd_conf *mdev,
 			/* If the old state was illegal as well, then let
 			   this happen...*/
 
-			if (is_valid_state(mdev, os) == rv) {
-				dev_err(DEV, "Considering state change from bad state. "
-				    "Error would be: '%s'\n",
-				    drbd_set_st_err_str(rv));
-				print_st(mdev, "old", os);
-				print_st(mdev, "new", ns);
+			if (is_valid_state(mdev, os) == rv)
 				rv = is_valid_state_transition(mdev, ns, os);
-			}
 		} else
 			rv = is_valid_state_transition(mdev, ns, os);
 	}
-- 
cgit v1.2.3


From 87f7be4cf88e93069f4cc63baf2ce70fdfc59c63 Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Fri, 11 Jun 2010 13:56:33 +0200
Subject: drbd: Run the fence-peer helper asynchronously

Since we can not thaw the transfer log, the next logical step is
to allow reconnects while the fence-peer handler runs.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_int.h      |  3 ++-
 drivers/block/drbd/drbd_nl.c       | 21 +++++++++++++++++++++
 drivers/block/drbd/drbd_receiver.c |  8 ++------
 3 files changed, 25 insertions(+), 7 deletions(-)

diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 03cc975b9e6c..ab20c0062d21 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -1423,7 +1423,8 @@ extern void resync_after_online_grow(struct drbd_conf *);
 extern void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int) __must_hold(local);
 extern int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role,
 		int force);
-enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev);
+extern enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev);
+extern void drbd_try_outdate_peer_async(struct drbd_conf *mdev);
 extern int drbd_khelper(struct drbd_conf *mdev, char *cmd);
 
 /* drbd_worker.c */
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index cda7cb3202b9..32d00720470b 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -38,6 +38,8 @@
 #include <asm/unaligned.h>
 #include <linux/drbd_tag_magic.h>
 #include <linux/drbd_limits.h>
+#include <linux/compiler.h>
+#include <linux/kthread.h>
 
 static unsigned short *tl_add_blob(unsigned short *, enum drbd_tags, const void *, int);
 static unsigned short *tl_add_str(unsigned short *, enum drbd_tags, const char *);
@@ -256,6 +258,25 @@ enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev)
 	return nps;
 }
 
+static int _try_outdate_peer_async(void *data)
+{
+	struct drbd_conf *mdev = (struct drbd_conf *)data;
+	enum drbd_disk_state nps;
+
+	nps = drbd_try_outdate_peer(mdev);
+	drbd_request_state(mdev, NS(pdsk, nps));
+
+	return 0;
+}
+
+void drbd_try_outdate_peer_async(struct drbd_conf *mdev)
+{
+	struct task_struct *opa;
+
+	opa = kthread_run(_try_outdate_peer_async, mdev, "drbd%d_a_helper", mdev_to_minor(mdev));
+	if (IS_ERR(opa))
+		dev_err(DEV, "out of mem, failed to invoke fence-peer helper\n");
+}
 
 int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force)
 {
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 72bc1a130645..101ad186244c 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -3747,12 +3747,8 @@ static void drbd_disconnect(struct drbd_conf *mdev)
 		put_ldev(mdev);
 	}
 
-	if (mdev->state.role == R_PRIMARY) {
-		if (fp >= FP_RESOURCE && mdev->state.pdsk >= D_UNKNOWN) {
-			enum drbd_disk_state nps = drbd_try_outdate_peer(mdev);
-			drbd_request_state(mdev, NS(pdsk, nps));
-		}
-	}
+	if (mdev->state.role == R_PRIMARY && fp >= FP_RESOURCE && mdev->state.pdsk >= D_UNKNOWN)
+		drbd_try_outdate_peer_async(mdev);
 
 	spin_lock_irq(&mdev->req_lock);
 	os = mdev->state;
-- 
cgit v1.2.3


From 43a5182cccae5850f7590f78dd9651bd407be440 Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Fri, 11 Jun 2010 11:26:34 +0200
Subject: drbd: Delayed creation of current-UUID

When a fencing policy of "resource-and-stonith" is configured,
and DRBD looses connection to it's peer, we can delay the
creation of a new current-UUID until IO gets thawed.

That allows one to deploy fence-peer handlers that actually
commit suicide on the machine they get started.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_int.h  |  1 +
 drivers/block/drbd/drbd_main.c | 27 +++++++++++++++++++++------
 drivers/block/drbd/drbd_nl.c   |  5 +++++
 3 files changed, 27 insertions(+), 6 deletions(-)

diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index ab20c0062d21..e0e08f5e0a76 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -829,6 +829,7 @@ enum {
 				 * the peer, if it changed there as well. */
 	CONN_DRY_RUN,		/* Expect disconnect after resync handshake. */
 	GOT_PING_ACK,		/* set when we receive a ping_ack packet, misc wait gets woken */
+	NEW_CUR_UUID,		/* Create new current UUID when thawing IO */
 };
 
 struct drbd_bitmap; /* opaque for drbd_conf */
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 40baddd94a5b..440b1d5dcfe2 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -1252,12 +1252,23 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
 	}
 
 	if (fp == FP_STONITH && ns.susp) {
-		/* case1: The outdate peer handler is successful:
-		 * case2: The connection was established again: */
-		if ((os.pdsk > D_OUTDATED  && ns.pdsk <= D_OUTDATED) ||
-		    (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)) {
+		/* case1: The outdate peer handler is successful: */
+		if (os.pdsk > D_OUTDATED  && ns.pdsk <= D_OUTDATED) {
 			tl_clear(mdev);
+			if (test_bit(NEW_CUR_UUID, &mdev->flags)) {
+				drbd_uuid_new_current(mdev);
+				clear_bit(NEW_CUR_UUID, &mdev->flags);
+				drbd_md_sync(mdev);
+			}
+			spin_lock_irq(&mdev->req_lock);
+			_drbd_set_state(_NS(mdev, susp, 0), CS_VERBOSE, NULL);
+			spin_unlock_irq(&mdev->req_lock);
+		}
+		/* case2: The connection was established again: */
+		if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
+			clear_bit(NEW_CUR_UUID, &mdev->flags);
 			spin_lock_irq(&mdev->req_lock);
+			_tl_restart(mdev, resend);
 			_drbd_set_state(_NS(mdev, susp, 0), CS_VERBOSE, NULL);
 			spin_unlock_irq(&mdev->req_lock);
 		}
@@ -1280,8 +1291,12 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
 		if (get_ldev(mdev)) {
 			if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
 			    mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
-				drbd_uuid_new_current(mdev);
-				drbd_send_uuids(mdev);
+				if (mdev->state.susp) {
+					set_bit(NEW_CUR_UUID, &mdev->flags);
+				} else {
+					drbd_uuid_new_current(mdev);
+					drbd_send_uuids(mdev);
+				}
 			}
 			put_ldev(mdev);
 		}
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 32d00720470b..d764f3cd5866 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -1810,6 +1810,11 @@ static int drbd_nl_suspend_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl
 static int drbd_nl_resume_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
 			     struct drbd_nl_cfg_reply *reply)
 {
+	if (test_bit(NEW_CUR_UUID, &mdev->flags)) {
+		drbd_uuid_new_current(mdev);
+		clear_bit(NEW_CUR_UUID, &mdev->flags);
+		drbd_md_sync(mdev);
+	}
 	drbd_suspend_io(mdev);
 	reply->ret_code = drbd_request_state(mdev, NS(susp, 0));
 	if (reply->ret_code == SS_SUCCESS) {
-- 
cgit v1.2.3


From 481c6f503213ab14f69ce88fff7b1ece325522f2 Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Tue, 22 Jun 2010 14:03:27 +0200
Subject: drbd: Ensure that the peer was not rebootet in the meantime before
 resending TL

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_nl.c       |  3 ---
 drivers/block/drbd/drbd_receiver.c | 13 ++++++++++++-
 drivers/block/drbd/drbd_req.c      |  2 +-
 3 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index d764f3cd5866..921793ca18a5 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -208,9 +208,6 @@ enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev)
 		return mdev->state.pdsk;
 	}
 
-	if (fp == FP_STONITH)
-		_drbd_request_state(mdev, NS(susp, 1), CS_WAIT_COMPLETE);
-
 	r = drbd_khelper(mdev, "fence-peer");
 
 	switch ((r>>8) & 0xff) {
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 101ad186244c..3a8131a26559 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -3265,7 +3265,18 @@ static int receive_state(struct drbd_conf *mdev, struct p_header *h)
 	ns.peer_isp = (peer_state.aftr_isp | peer_state.user_isp);
 	if ((nconn == C_CONNECTED || nconn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING)
 		ns.disk = mdev->new_state_tmp.disk;
-
+	if (ns.pdsk == D_CONSISTENT && ns.susp && nconn == C_CONNECTED && oconn < C_CONNECTED &&
+	    test_bit(NEW_CUR_UUID, &mdev->flags)) {
+		/* Do not allow tl_restart(resend) for a rebooted peer. We can only allow this
+		   for temporal network outages! */
+		spin_unlock_irq(&mdev->req_lock);
+		dev_err(DEV, "Aborting Connect, can not thaw IO with an only Consistent peer\n");
+		tl_clear(mdev);
+		drbd_uuid_new_current(mdev);
+		clear_bit(NEW_CUR_UUID, &mdev->flags);
+		drbd_force_state(mdev, NS2(conn, C_PROTOCOL_ERROR, susp, 0));
+		return FALSE;
+	}
 	rv = _drbd_set_state(mdev, ns, CS_VERBOSE | CS_HARD, NULL);
 	ns = mdev->state;
 	spin_unlock_irq(&mdev->req_lock);
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index fbe027886bad..76b668245612 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -662,7 +662,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 		/* If RQ_NET_OK is already set, we got a P_WRITE_ACK or P_RECV_ACK
 		   before the connection loss (B&C only); only P_BARRIER_ACK was missing.
 		   Trowing them out of the TL here by pretending we got a BARRIER_ACK
-		   TODO: Either resync them, or ensure peer was not rebooted. */
+		   We ensure that the peer was not rebooted */
 		if (!(req->rq_state & RQ_NET_OK)) {
 			if (req->w.cb) {
 				drbd_queue_work(&mdev->data.work, &req->w);
-- 
cgit v1.2.3


From 65d922c33ebd359db25d5846929b2eafc4238fcc Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Wed, 16 Jun 2010 16:18:09 +0200
Subject: drbd: Do not do a hard state change when establishing a connection
 [bugz 304]

Make sure the state engine can deny two primaries to connect

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_receiver.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 3a8131a26559..224c79ed16e9 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -3188,6 +3188,7 @@ static int receive_state(struct drbd_conf *mdev, struct p_header *h)
 	enum drbd_conns nconn, oconn;
 	union drbd_state ns, peer_state;
 	enum drbd_disk_state real_peer_disk;
+	enum chg_state_flags cs_flags;
 	int rv;
 
 	ERR_IF(h->length != (sizeof(*p)-sizeof(*h)))
@@ -3265,6 +3266,7 @@ static int receive_state(struct drbd_conf *mdev, struct p_header *h)
 	ns.peer_isp = (peer_state.aftr_isp | peer_state.user_isp);
 	if ((nconn == C_CONNECTED || nconn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING)
 		ns.disk = mdev->new_state_tmp.disk;
+	cs_flags = CS_VERBOSE + (oconn < C_CONNECTED && nconn >= C_CONNECTED ? 0 : CS_HARD);
 	if (ns.pdsk == D_CONSISTENT && ns.susp && nconn == C_CONNECTED && oconn < C_CONNECTED &&
 	    test_bit(NEW_CUR_UUID, &mdev->flags)) {
 		/* Do not allow tl_restart(resend) for a rebooted peer. We can only allow this
@@ -3277,7 +3279,7 @@ static int receive_state(struct drbd_conf *mdev, struct p_header *h)
 		drbd_force_state(mdev, NS2(conn, C_PROTOCOL_ERROR, susp, 0));
 		return FALSE;
 	}
-	rv = _drbd_set_state(mdev, ns, CS_VERBOSE | CS_HARD, NULL);
+	rv = _drbd_set_state(mdev, ns, cs_flags, NULL);
 	ns = mdev->state;
 	spin_unlock_irq(&mdev->req_lock);
 
-- 
cgit v1.2.3


From 84dfb9f564208a0331131d1ab922382c7d61a553 Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Wed, 23 Jun 2010 11:20:05 +0200
Subject: drbd: Fixed a deadlock, probably only affected UP machines

After disconnect (most likely mdev->net_cnt == 0) and we are
still in an unstable state (!drbd_state_is_stable()). When we
get an IO request in drbd_get_max_buffers() (called from
__inc_ap_bio_cond(), called from inc_ap_bio()) we wake up
misc_wait. Misc_wait is also used in inc_ap_bio() to sleep
until the outcome of __inc_ap_bio_cond() changes. => Busy loop!

Solution: Have a dedicated wait queue for get_net_conf() and
put_net_conf().

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_int.h      | 3 ++-
 drivers/block/drbd/drbd_main.c     | 1 +
 drivers/block/drbd/drbd_receiver.c | 2 +-
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index e0e08f5e0a76..aa9bb213fe70 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -947,6 +947,7 @@ struct drbd_conf {
 	union drbd_state state;
 	wait_queue_head_t misc_wait;
 	wait_queue_head_t state_wait;  /* upon each state change. */
+	wait_queue_head_t net_cnt_wait;
 	unsigned int send_cnt;
 	unsigned int recv_cnt;
 	unsigned int read_cnt;
@@ -2018,7 +2019,7 @@ static inline void inc_unacked(struct drbd_conf *mdev)
 static inline void put_net_conf(struct drbd_conf *mdev)
 {
 	if (atomic_dec_and_test(&mdev->net_cnt))
-		wake_up(&mdev->misc_wait);
+		wake_up(&mdev->net_cnt_wait);
 }
 
 /**
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 440b1d5dcfe2..9fe9bdd9e33d 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -2763,6 +2763,7 @@ void drbd_init_set_defaults(struct drbd_conf *mdev)
 
 	init_waitqueue_head(&mdev->misc_wait);
 	init_waitqueue_head(&mdev->state_wait);
+	init_waitqueue_head(&mdev->net_cnt_wait);
 	init_waitqueue_head(&mdev->ee_wait);
 	init_waitqueue_head(&mdev->al_wait);
 	init_waitqueue_head(&mdev->seq_wait);
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 224c79ed16e9..22d74d79ba42 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -3775,7 +3775,7 @@ static void drbd_disconnect(struct drbd_conf *mdev)
 
 	if (os.conn == C_DISCONNECTING) {
 		struct hlist_head *h;
-		wait_event(mdev->misc_wait, atomic_read(&mdev->net_cnt) == 0);
+		wait_event(mdev->net_cnt_wait, atomic_read(&mdev->net_cnt) == 0);
 
 		/* we must not free the tl_hash
 		 * while application io is still on the fly */
-- 
cgit v1.2.3


From cfa03415a14dd0055f2ff8c3d348d4c1452acba6 Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Wed, 23 Jun 2010 17:18:51 +0200
Subject: drbd: Allow tl_restart() to do IO completion while IO is suspended

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_req.c | 34 ++++++++++++++++++++--------------
 1 file changed, 20 insertions(+), 14 deletions(-)

diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 76b668245612..4e1e10d67c4b 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -226,8 +226,6 @@ void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m)
 		return;
 	if (s & RQ_LOCAL_PENDING)
 		return;
-	if (mdev->state.susp)
-		return;
 
 	if (req->master_bio) {
 		/* this is data_received (remote read)
@@ -284,6 +282,14 @@ void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m)
 	 * protocol A or B, barrier ack still pending... */
 }
 
+static void _req_may_be_done_not_susp(struct drbd_request *req, struct bio_and_error *m)
+{
+	struct drbd_conf *mdev = req->mdev;
+
+	if (!mdev->state.susp)
+		_req_may_be_done(req, m);
+}
+
 /*
  * checks whether there was an overlapping request
  * or ee already registered.
@@ -425,7 +431,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 		req->rq_state |= (RQ_LOCAL_COMPLETED|RQ_LOCAL_OK);
 		req->rq_state &= ~RQ_LOCAL_PENDING;
 
-		_req_may_be_done(req, m);
+		_req_may_be_done_not_susp(req, m);
 		put_ldev(mdev);
 		break;
 
@@ -434,7 +440,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 		req->rq_state &= ~RQ_LOCAL_PENDING;
 
 		__drbd_chk_io_error(mdev, FALSE);
-		_req_may_be_done(req, m);
+		_req_may_be_done_not_susp(req, m);
 		put_ldev(mdev);
 		break;
 
@@ -442,7 +448,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 		/* it is legal to fail READA */
 		req->rq_state |= RQ_LOCAL_COMPLETED;
 		req->rq_state &= ~RQ_LOCAL_PENDING;
-		_req_may_be_done(req, m);
+		_req_may_be_done_not_susp(req, m);
 		put_ldev(mdev);
 		break;
 
@@ -460,7 +466,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 		/* no point in retrying if there is no good remote data,
 		 * or we have no connection. */
 		if (mdev->state.pdsk != D_UP_TO_DATE) {
-			_req_may_be_done(req, m);
+			_req_may_be_done_not_susp(req, m);
 			break;
 		}
 
@@ -546,7 +552,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 		req->rq_state &= ~RQ_NET_QUEUED;
 		/* if we did it right, tl_clear should be scheduled only after
 		 * this, so this should not be necessary! */
-		_req_may_be_done(req, m);
+		_req_may_be_done_not_susp(req, m);
 		break;
 
 	case handed_over_to_network:
@@ -571,7 +577,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 		 * "completed_ok" events came in, once we return from
 		 * _drbd_send_zc_bio (drbd_send_dblock), we have to check
 		 * whether it is done already, and end it.  */
-		_req_may_be_done(req, m);
+		_req_may_be_done_not_susp(req, m);
 		break;
 
 	case read_retry_remote_canceled:
@@ -587,7 +593,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 		/* if it is still queued, we may not complete it here.
 		 * it will be canceled soon. */
 		if (!(req->rq_state & RQ_NET_QUEUED))
-			_req_may_be_done(req, m);
+			_req_may_be_done(req, m); /* Allowed while state.susp */
 		break;
 
 	case write_acked_by_peer_and_sis:
@@ -622,7 +628,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 		D_ASSERT(req->rq_state & RQ_NET_PENDING);
 		dec_ap_pending(mdev);
 		req->rq_state &= ~RQ_NET_PENDING;
-		_req_may_be_done(req, m);
+		_req_may_be_done_not_susp(req, m);
 		break;
 
 	case neg_acked:
@@ -632,7 +638,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 		req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING);
 
 		req->rq_state |= RQ_NET_DONE;
-		_req_may_be_done(req, m);
+		_req_may_be_done_not_susp(req, m);
 		/* else: done by handed_over_to_network */
 		break;
 
@@ -640,7 +646,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 		if (!(req->rq_state & RQ_LOCAL_COMPLETED))
 			break;
 
-		_req_may_be_done(req, m);
+		_req_may_be_done(req, m); /* Allowed while state.susp */
 		break;
 
 	case restart_frozen_disk_io:
@@ -685,7 +691,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 		}
 		D_ASSERT(req->rq_state & RQ_NET_SENT);
 		req->rq_state |= RQ_NET_DONE;
-		_req_may_be_done(req, m);
+		_req_may_be_done(req, m); /* Allowed while state.susp */
 		break;
 
 	case data_received:
@@ -693,7 +699,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 		dec_ap_pending(mdev);
 		req->rq_state &= ~RQ_NET_PENDING;
 		req->rq_state |= (RQ_NET_OK|RQ_NET_DONE);
-		_req_may_be_done(req, m);
+		_req_may_be_done_not_susp(req, m);
 		break;
 	};
 
-- 
cgit v1.2.3


From 8f488156c0635dcc9c668737d05386113a745ef9 Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Thu, 24 Jun 2010 12:05:53 +0200
Subject: drbd: Allow attach while IO is suspended

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_nl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 921793ca18a5..dc0a9acbfdf1 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -954,7 +954,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
 
 	drbd_suspend_io(mdev);
 	/* also wait for the last barrier ack. */
-	wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_pending_cnt));
+	wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_pending_cnt) || mdev->state.susp);
 	/* and for any other previously queued work */
 	drbd_flush_workqueue(mdev);
 
-- 
cgit v1.2.3


From f70b3511599c49a3dc20ae349d6cdc5af47659df Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Thu, 24 Jun 2010 14:34:40 +0200
Subject: drbd: Do not try to free tl_hash in drbd_disconnect() when IO is
 suspended

We may not free tl_hash when IO is suspended, since we can not wait
until ap_bio_cnt reaches zero.

We can do this after susp reched 0, since then tl_clear was called

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_int.h      |  1 +
 drivers/block/drbd/drbd_main.c     |  4 +++
 drivers/block/drbd/drbd_nl.c       |  4 +--
 drivers/block/drbd/drbd_receiver.c | 60 +++++++++++++++++++++++---------------
 4 files changed, 43 insertions(+), 26 deletions(-)

diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index aa9bb213fe70..f84ffb17a7e5 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -1493,6 +1493,7 @@ extern void _drbd_wait_ee_list_empty(struct drbd_conf *mdev,
 extern void drbd_set_recv_tcq(struct drbd_conf *mdev, int tcq_enabled);
 extern void _drbd_clear_done_ee(struct drbd_conf *mdev, struct list_head *to_be_freed);
 extern void drbd_flush_workqueue(struct drbd_conf *mdev);
+extern void drbd_free_tl_hash(struct drbd_conf *mdev);
 
 /* yes, there is kernel_setsockopt, but only since 2.6.18. we don't need to
  * mess with get_fs/set_fs, we know we are KERNEL_DS always. */
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 9fe9bdd9e33d..8d14635e7faf 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -1409,6 +1409,10 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
 	    (os.user_isp && !ns.user_isp))
 		resume_next_sg(mdev);
 
+	/* free tl_hash if we Got thawed and are C_STANDALONE */
+	if (ns.conn == C_STANDALONE && ns.susp == 0 && mdev->tl_hash)
+		drbd_free_tl_hash(mdev);
+
 	/* Upon network connection, we need to start the receiver */
 	if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED)
 		drbd_thread_start(&mdev->receiver);
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index dc0a9acbfdf1..6c08e637e25c 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -1391,6 +1391,7 @@ static int drbd_nl_net_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
 		}
 	}
 
+	drbd_flush_workqueue(mdev);
 	spin_lock_irq(&mdev->req_lock);
 	if (mdev->net_conf != NULL) {
 		retcode = ERR_NET_CONFIGURED;
@@ -1429,10 +1430,9 @@ static int drbd_nl_net_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
 	mdev->int_dig_out=int_dig_out;
 	mdev->int_dig_in=int_dig_in;
 	mdev->int_dig_vv=int_dig_vv;
+	retcode = _drbd_set_state(_NS(mdev, conn, C_UNCONNECTED), CS_VERBOSE, NULL);
 	spin_unlock_irq(&mdev->req_lock);
 
-	retcode = _drbd_request_state(mdev, NS(conn, C_UNCONNECTED), CS_VERBOSE);
-
 	kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE);
 	reply->ret_code = retcode;
 	drbd_reconfig_done(mdev);
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 22d74d79ba42..5e49ee75d3c9 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -3691,6 +3691,36 @@ void drbd_flush_workqueue(struct drbd_conf *mdev)
 	wait_for_completion(&barr.done);
 }
 
+void drbd_free_tl_hash(struct drbd_conf *mdev)
+{
+	struct hlist_head *h;
+
+	spin_lock_irq(&mdev->req_lock);
+
+	if (!mdev->tl_hash || mdev->state.conn != C_STANDALONE) {
+		spin_unlock_irq(&mdev->req_lock);
+		return;
+	}
+	/* paranoia code */
+	for (h = mdev->ee_hash; h < mdev->ee_hash + mdev->ee_hash_s; h++)
+		if (h->first)
+			dev_err(DEV, "ASSERT FAILED ee_hash[%u].first == %p, expected NULL\n",
+				(int)(h - mdev->ee_hash), h->first);
+	kfree(mdev->ee_hash);
+	mdev->ee_hash = NULL;
+	mdev->ee_hash_s = 0;
+
+	/* paranoia code */
+	for (h = mdev->tl_hash; h < mdev->tl_hash + mdev->tl_hash_s; h++)
+		if (h->first)
+			dev_err(DEV, "ASSERT FAILED tl_hash[%u] == %p, expected NULL\n",
+				(int)(h - mdev->tl_hash), h->first);
+	kfree(mdev->tl_hash);
+	mdev->tl_hash = NULL;
+	mdev->tl_hash_s = 0;
+	spin_unlock_irq(&mdev->req_lock);
+}
+
 static void drbd_disconnect(struct drbd_conf *mdev)
 {
 	enum drbd_fencing_p fp;
@@ -3774,32 +3804,14 @@ static void drbd_disconnect(struct drbd_conf *mdev)
 	spin_unlock_irq(&mdev->req_lock);
 
 	if (os.conn == C_DISCONNECTING) {
-		struct hlist_head *h;
 		wait_event(mdev->net_cnt_wait, atomic_read(&mdev->net_cnt) == 0);
 
-		/* we must not free the tl_hash
-		 * while application io is still on the fly */
-		wait_event(mdev->misc_wait, atomic_read(&mdev->ap_bio_cnt) == 0);
-
-		spin_lock_irq(&mdev->req_lock);
-		/* paranoia code */
-		for (h = mdev->ee_hash; h < mdev->ee_hash + mdev->ee_hash_s; h++)
-			if (h->first)
-				dev_err(DEV, "ASSERT FAILED ee_hash[%u].first == %p, expected NULL\n",
-						(int)(h - mdev->ee_hash), h->first);
-		kfree(mdev->ee_hash);
-		mdev->ee_hash = NULL;
-		mdev->ee_hash_s = 0;
-
-		/* paranoia code */
-		for (h = mdev->tl_hash; h < mdev->tl_hash + mdev->tl_hash_s; h++)
-			if (h->first)
-				dev_err(DEV, "ASSERT FAILED tl_hash[%u] == %p, expected NULL\n",
-						(int)(h - mdev->tl_hash), h->first);
-		kfree(mdev->tl_hash);
-		mdev->tl_hash = NULL;
-		mdev->tl_hash_s = 0;
-		spin_unlock_irq(&mdev->req_lock);
+		if (!mdev->state.susp) {
+			/* we must not free the tl_hash
+			 * while application io is still on the fly */
+			wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt));
+			drbd_free_tl_hash(mdev);
+		}
 
 		crypto_free_hash(mdev->cram_hmac_tfm);
 		mdev->cram_hmac_tfm = NULL;
-- 
cgit v1.2.3


From 6709893059105d7859ae772af70c7db5bbab7de0 Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Thu, 24 Jun 2010 16:24:25 +0200
Subject: drbd: Make sure tl_restart(, resend) can not get called multiple
 times for a new connection

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_main.c | 31 +++++++++++++++----------------
 1 file changed, 15 insertions(+), 16 deletions(-)

diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 8d14635e7faf..c6658f5a5c1c 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -1209,6 +1209,7 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
 			   union drbd_state ns, enum chg_state_flags flags)
 {
 	enum drbd_fencing_p fp;
+	enum drbd_req_event what = nothing;
 
 	if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) {
 		clear_bit(CRASHED_PRIMARY, &mdev->flags);
@@ -1234,21 +1235,14 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
 
 	if (os.susp && ns.susp && mdev->sync_conf.on_no_data == OND_SUSPEND_IO) {
 		if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
-			if (ns.conn == C_CONNECTED) {
-				spin_lock_irq(&mdev->req_lock);
-				_tl_restart(mdev, resend);
-				_drbd_set_state(_NS(mdev, susp, 0), CS_VERBOSE, NULL);
-				spin_unlock_irq(&mdev->req_lock);
-			} else /* ns.conn > C_CONNECTED */
+			if (ns.conn == C_CONNECTED)
+				what = resend;
+			else /* ns.conn > C_CONNECTED */
 				dev_err(DEV, "Unexpected Resynd going on!\n");
 		}
 
-		if (os.disk == D_ATTACHING && ns.disk > D_ATTACHING) {
-			spin_lock_irq(&mdev->req_lock);
-			_tl_restart(mdev, restart_frozen_disk_io);
-			_drbd_set_state(_NS(mdev, susp, 0), CS_VERBOSE, NULL);
-			spin_unlock_irq(&mdev->req_lock);
-		}
+		if (os.disk == D_ATTACHING && ns.disk > D_ATTACHING)
+			what = restart_frozen_disk_io;
 	}
 
 	if (fp == FP_STONITH && ns.susp) {
@@ -1267,12 +1261,17 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
 		/* case2: The connection was established again: */
 		if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
 			clear_bit(NEW_CUR_UUID, &mdev->flags);
-			spin_lock_irq(&mdev->req_lock);
-			_tl_restart(mdev, resend);
-			_drbd_set_state(_NS(mdev, susp, 0), CS_VERBOSE, NULL);
-			spin_unlock_irq(&mdev->req_lock);
+			what = resend;
 		}
 	}
+
+	if (what != nothing) {
+		spin_lock_irq(&mdev->req_lock);
+		_tl_restart(mdev, what);
+		_drbd_set_state(_NS(mdev, susp, 0), CS_VERBOSE, NULL);
+		spin_unlock_irq(&mdev->req_lock);
+	}
+
 	/* Do not change the order of the if above and the two below... */
 	if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) {      /* attach on the peer */
 		drbd_send_uuids(mdev);
-- 
cgit v1.2.3


From e756414f7daa93b862f1670dd0a6aaa676ea71e3 Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Tue, 29 Jun 2010 17:35:34 +0200
Subject: drbd: Initialize all members of sync_conf to their defaults [Bugz
 315]

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_main.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index c6658f5a5c1c..410d3d4f361e 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -2689,7 +2689,8 @@ static void drbd_set_defaults(struct drbd_conf *mdev)
 		/* .verify_alg = */	{}, 0,
 		/* .cpu_mask = */	{}, 0,
 		/* .csums_alg = */	{}, 0,
-		/* .use_rle = */	0
+		/* .use_rle = */	0,
+		/* .on_no_data = */	DRBD_ON_NO_DATA_DEF
 	};
 
 	/* Have to use that way, because the layout differs between
-- 
cgit v1.2.3


From d28fd092a55b504a0d699b65802a995086d70647 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Fri, 9 Jul 2010 23:28:10 +0200
Subject: drbd: fix list corruption (recent regression)

The commit 288f422ec13667de40b278535d2a5fb5c77352c4
 drbd: Track all IO requests on the TL, not writes only
moved a list_add_tail(req, ) into a region where req
may have just been freed due to conflict detection.

Fix this by adding a proper cleanup section for that code path.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_req.c | 42 +++++++++++++++++-------------------------
 1 file changed, 17 insertions(+), 25 deletions(-)

diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 4e1e10d67c4b..3b61d767d9c4 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -917,31 +917,8 @@ allocate_barrier:
 	/* check this request on the collision detection hash tables.
 	 * if we have a conflict, just complete it here.
 	 * THINK do we want to check reads, too? (I don't think so...) */
-	if (rw == WRITE && _req_conflicts(req)) {
-		/* this is a conflicting request.
-		 * even though it may have been only _partially_
-		 * overlapping with one of the currently pending requests,
-		 * without even submitting or sending it, we will
-		 * pretend that it was successfully served right now.
-		 */
-		if (local) {
-			bio_put(req->private_bio);
-			req->private_bio = NULL;
-			drbd_al_complete_io(mdev, req->sector);
-			put_ldev(mdev);
-			local = 0;
-		}
-		if (remote)
-			dec_ap_pending(mdev);
-		_drbd_end_io_acct(mdev, req);
-		/* THINK: do we want to fail it (-EIO), or pretend success? */
-		bio_endio(req->master_bio, 0);
-		req->master_bio = NULL;
-		dec_ap_bio(mdev);
-		drbd_req_free(req);
-		remote = 0;
-	}
-
+	if (rw == WRITE && _req_conflicts(req))
+		goto fail_conflicting;
 
 	list_add_tail(&req->tl_requests, &mdev->newest_tle->requests);
 
@@ -976,6 +953,21 @@ allocate_barrier:
 
 	return 0;
 
+fail_conflicting:
+	/* this is a conflicting request.
+	 * even though it may have been only _partially_
+	 * overlapping with one of the currently pending requests,
+	 * without even submitting or sending it, we will
+	 * pretend that it was successfully served right now.
+	 */
+	_drbd_end_io_acct(mdev, req);
+	spin_unlock_irq(&mdev->req_lock);
+	if (remote)
+		dec_ap_pending(mdev);
+	/* THINK: do we want to fail it (-EIO), or pretend success?
+	 * this pretends success. */
+	err = 0;
+
 fail_free_complete:
 	if (rw == WRITE && local)
 		drbd_al_complete_io(mdev, sector);
-- 
cgit v1.2.3


From 9a31d7164d409ca59cfadb7957ac7b0acf4545b8 Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Mon, 5 Jul 2010 13:42:03 +0200
Subject: drbd: New sync parameters for the smart resync rate controller

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_main.c |  6 +++++-
 drivers/block/drbd/drbd_nl.c   |  4 ++++
 include/linux/drbd_limits.h    | 24 ++++++++++++------------
 include/linux/drbd_nl.h        |  4 ++++
 4 files changed, 25 insertions(+), 13 deletions(-)

diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 410d3d4f361e..5a484c1f5ce7 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -2690,7 +2690,11 @@ static void drbd_set_defaults(struct drbd_conf *mdev)
 		/* .cpu_mask = */	{}, 0,
 		/* .csums_alg = */	{}, 0,
 		/* .use_rle = */	0,
-		/* .on_no_data = */	DRBD_ON_NO_DATA_DEF
+		/* .on_no_data = */	DRBD_ON_NO_DATA_DEF,
+		/* .c_plan_ahead = */	DRBD_C_PLAN_AHEAD_DEF,
+		/* .c_delay_target = */	DRBD_C_DELAY_TARGET_DEF,
+		/* .c_fill_target = */	DRBD_C_FILL_TARGET_DEF,
+		/* .c_max_rate = */	DRBD_C_MAX_RATE_DEF
 	};
 
 	/* Have to use that way, because the layout differs between
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 6c08e637e25c..7d384fd39c16 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -1599,6 +1599,10 @@ static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n
 		sc.after      = DRBD_AFTER_DEF;
 		sc.al_extents = DRBD_AL_EXTENTS_DEF;
 		sc.on_no_data  = DRBD_ON_NO_DATA_DEF;
+		sc.c_plan_ahead = DRBD_C_PLAN_AHEAD_DEF;
+		sc.c_delay_target = DRBD_C_DELAY_TARGET_DEF;
+		sc.c_fill_target = DRBD_C_FILL_TARGET_DEF;
+		sc.c_max_rate   = DRBD_C_MAX_RATE_DEF;
 	} else
 		memcpy(&sc, &mdev->sync_conf, sizeof(struct syncer_conf));
 
diff --git a/include/linux/drbd_limits.h b/include/linux/drbd_limits.h
index 7eb1e98009ec..06dbba47a8ef 100644
--- a/include/linux/drbd_limits.h
+++ b/include/linux/drbd_limits.h
@@ -134,21 +134,21 @@
 #define DRBD_MAX_BIO_BVECS_MAX 128
 #define DRBD_MAX_BIO_BVECS_DEF 0
 
-#define DRBD_DP_VOLUME_MIN 4
-#define DRBD_DP_VOLUME_MAX 1048576
-#define DRBD_DP_VOLUME_DEF 16384
+#define DRBD_C_PLAN_AHEAD_MIN  0
+#define DRBD_C_PLAN_AHEAD_MAX  300
+#define DRBD_C_PLAN_AHEAD_DEF  0 /* RS rate controller disabled by default */
 
-#define DRBD_DP_INTERVAL_MIN 1
-#define DRBD_DP_INTERVAL_MAX 600
-#define DRBD_DP_INTERVAL_DEF 5
+#define DRBD_C_DELAY_TARGET_MIN 1
+#define DRBD_C_DELAY_TARGET_MAX 100
+#define DRBD_C_DELAY_TARGET_DEF 10
 
-#define DRBD_RS_THROTTLE_TH_MIN 1
-#define DRBD_RS_THROTTLE_TH_MAX 600
-#define DRBD_RS_THROTTLE_TH_DEF 20
+#define DRBD_C_FILL_TARGET_MIN 0
+#define DRBD_C_FILL_TARGET_MAX 100000
+#define DRBD_C_FILL_TARGET_DEF 0 /* By default disabled -> controlled by delay_target */
 
-#define DRBD_RS_HOLD_OFF_TH_MIN 1
-#define DRBD_RS_HOLD_OFF_TH_MAX 6000
-#define DRBD_RS_HOLD_OFF_TH_DEF 100
+#define DRBD_C_MAX_RATE_MIN     250 /* kByte/sec */
+#define DRBD_C_MAX_RATE_MAX     (4 << 20)
+#define DRBD_C_MAX_RATE_DEF     102400
 
 #undef RANGE
 #endif
diff --git a/include/linux/drbd_nl.h b/include/linux/drbd_nl.h
index 9aebd0d80a5d..e23683c87ca1 100644
--- a/include/linux/drbd_nl.h
+++ b/include/linux/drbd_nl.h
@@ -88,6 +88,10 @@ NL_PACKET(syncer_conf, 8,
 	NL_STRING(	64,	T_MAY_IGNORE,	csums_alg,	SHARED_SECRET_MAX)
 	NL_BIT(         65,     T_MAY_IGNORE,   use_rle)
 	NL_INTEGER(	75,	T_MAY_IGNORE,	on_no_data)
+	NL_INTEGER(	76,	T_MAY_IGNORE,	c_plan_ahead)
+	NL_INTEGER(     77,	T_MAY_IGNORE,	c_delay_target)
+	NL_INTEGER(     78,	T_MAY_IGNORE,	c_fill_target)
+	NL_INTEGER(     79,	T_MAY_IGNORE,	c_max_rate)
 )
 
 NL_PACKET(invalidate, 9, )
-- 
cgit v1.2.3


From 8e26f9ccb9be00fdb33551a34c8f6029e89ab79f Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Tue, 6 Jul 2010 17:25:54 +0200
Subject: drbd: New sync_param packet, that includes the parameters of the new
 controller

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_int.h      | 14 ++++++++++++++
 drivers/block/drbd/drbd_main.c     | 11 ++++++++---
 drivers/block/drbd/drbd_receiver.c | 18 +++++++++++++++---
 drivers/block/drbd/drbd_worker.c   |  2 --
 4 files changed, 37 insertions(+), 8 deletions(-)

diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index f84ffb17a7e5..fd2cdd45f155 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -451,6 +451,17 @@ struct p_rs_param_89 {
 	char csums_alg[SHARED_SECRET_MAX];
 } __packed;
 
+struct p_rs_param_95 {
+	struct p_header head;
+	u32 rate;
+	char verify_alg[SHARED_SECRET_MAX];
+	char csums_alg[SHARED_SECRET_MAX];
+	u32 c_plan_ahead;
+	u32 c_delay_target;
+	u32 c_fill_target;
+	u32 c_max_rate;
+} __packed;
+
 enum drbd_conn_flags {
 	CF_WANT_LOSE = 1,
 	CF_DRY_RUN = 2,
@@ -610,6 +621,7 @@ union p_polymorph {
         struct p_barrier         barrier;
         struct p_barrier_ack     barrier_ack;
         struct p_rs_param_89     rs_param_89;
+        struct p_rs_param_95     rs_param_95;
         struct p_protocol        protocol;
         struct p_sizes           sizes;
         struct p_uuids           uuids;
@@ -1268,6 +1280,8 @@ struct bm_extent {
  * Bit 1 ==> local node thinks this block needs to be synced.
  */
 
+#define SLEEP_TIME (HZ/10)
+
 #define BM_BLOCK_SHIFT  12			 /* 4k per bit */
 #define BM_BLOCK_SIZE	 (1<<BM_BLOCK_SHIFT)
 /* (9+3) : 512 bytes @ 8 bits; representing 16M storage
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 5a484c1f5ce7..bff4f598d38f 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -1713,7 +1713,7 @@ int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data,
 
 int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
 {
-	struct p_rs_param_89 *p;
+	struct p_rs_param_95 *p;
 	struct socket *sock;
 	int size, rv;
 	const int apv = mdev->agreed_pro_version;
@@ -1721,7 +1721,8 @@ int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
 	size = apv <= 87 ? sizeof(struct p_rs_param)
 		: apv == 88 ? sizeof(struct p_rs_param)
 			+ strlen(mdev->sync_conf.verify_alg) + 1
-		: /* 89 */    sizeof(struct p_rs_param_89);
+		: apv <= 94 ? sizeof(struct p_rs_param_89)
+		: /* apv >= 95 */ sizeof(struct p_rs_param_95);
 
 	/* used from admin command context and receiver/worker context.
 	 * to avoid kmalloc, grab the socket right here,
@@ -1732,12 +1733,16 @@ int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
 	if (likely(sock != NULL)) {
 		enum drbd_packets cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;
 
-		p = &mdev->data.sbuf.rs_param_89;
+		p = &mdev->data.sbuf.rs_param_95;
 
 		/* initialize verify_alg and csums_alg */
 		memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
 
 		p->rate = cpu_to_be32(sc->rate);
+		p->c_plan_ahead = cpu_to_be32(sc->c_plan_ahead);
+		p->c_delay_target = cpu_to_be32(sc->c_delay_target);
+		p->c_fill_target = cpu_to_be32(sc->c_fill_target);
+		p->c_max_rate = cpu_to_be32(sc->c_max_rate);
 
 		if (apv >= 88)
 			strcpy(p->verify_alg, mdev->sync_conf.verify_alg);
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 5e49ee75d3c9..34bea972f734 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -2805,7 +2805,7 @@ struct crypto_hash *drbd_crypto_alloc_digest_safe(const struct drbd_conf *mdev,
 static int receive_SyncParam(struct drbd_conf *mdev, struct p_header *h)
 {
 	int ok = TRUE;
-	struct p_rs_param_89 *p = (struct p_rs_param_89 *)h;
+	struct p_rs_param_95 *p = (struct p_rs_param_95 *)h;
 	unsigned int header_size, data_size, exp_max_sz;
 	struct crypto_hash *verify_tfm = NULL;
 	struct crypto_hash *csums_tfm = NULL;
@@ -2814,7 +2814,8 @@ static int receive_SyncParam(struct drbd_conf *mdev, struct p_header *h)
 	exp_max_sz  = apv <= 87 ? sizeof(struct p_rs_param)
 		    : apv == 88 ? sizeof(struct p_rs_param)
 					+ SHARED_SECRET_MAX
-		    : /* 89 */    sizeof(struct p_rs_param_89);
+		    : apv <= 94 ? sizeof(struct p_rs_param_89)
+		    : /* apv >= 95 */ sizeof(struct p_rs_param_95);
 
 	if (h->length > exp_max_sz) {
 		dev_err(DEV, "SyncParam packet too long: received %u, expected <= %u bytes\n",
@@ -2825,10 +2826,14 @@ static int receive_SyncParam(struct drbd_conf *mdev, struct p_header *h)
 	if (apv <= 88) {
 		header_size = sizeof(struct p_rs_param) - sizeof(*h);
 		data_size   = h->length  - header_size;
-	} else /* apv >= 89 */ {
+	} else if (apv <= 94) {
 		header_size = sizeof(struct p_rs_param_89) - sizeof(*h);
 		data_size   = h->length  - header_size;
 		D_ASSERT(data_size == 0);
+	} else {
+		header_size = sizeof(struct p_rs_param_95) - sizeof(*h);
+		data_size   = h->length  - header_size;
+		D_ASSERT(data_size == 0);
 	}
 
 	/* initialize verify_alg and csums_alg */
@@ -2893,6 +2898,13 @@ static int receive_SyncParam(struct drbd_conf *mdev, struct p_header *h)
 			}
 		}
 
+		if (apv > 94) {
+			mdev->sync_conf.rate	  = be32_to_cpu(p->rate);
+			mdev->sync_conf.c_plan_ahead = be32_to_cpu(p->c_plan_ahead);
+			mdev->sync_conf.c_delay_target = be32_to_cpu(p->c_delay_target);
+			mdev->sync_conf.c_fill_target = be32_to_cpu(p->c_fill_target);
+			mdev->sync_conf.c_max_rate = be32_to_cpu(p->c_max_rate);
+		}
 
 		spin_lock(&mdev->peer_seq_lock);
 		/* lock against drbd_nl_syncer_conf() */
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 3c1e88480d37..d94720f4bd07 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -39,8 +39,6 @@
 #include "drbd_int.h"
 #include "drbd_req.h"
 
-#define SLEEP_TIME (HZ/10)
-
 static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel);
 
 
-- 
cgit v1.2.3


From 778f271dfe7a7173c0bae2d6cde8d9bd1533e668 Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Tue, 6 Jul 2010 11:14:00 +0200
Subject: drbd: The new, smarter resync speed controller

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_int.h      | 11 +++++
 drivers/block/drbd/drbd_main.c     |  1 +
 drivers/block/drbd/drbd_nl.c       | 22 +++++++++
 drivers/block/drbd/drbd_receiver.c | 20 ++++++++
 drivers/block/drbd/drbd_worker.c   | 98 +++++++++++++++++++++++++++++++++++++-
 5 files changed, 151 insertions(+), 1 deletion(-)

diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index fd2cdd45f155..facb72ccc56b 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -928,6 +928,12 @@ enum write_ordering_e {
 	WO_bio_barrier
 };
 
+struct fifo_buffer {
+	int *values;
+	unsigned int head_index;
+	unsigned int size;
+};
+
 struct drbd_conf {
 	/* things that are stored as / read from meta data on disk */
 	unsigned long flags;
@@ -1068,6 +1074,11 @@ struct drbd_conf {
 	u64 ed_uuid; /* UUID of the exposed data */
 	struct mutex state_mutex;
 	char congestion_reason;  /* Why we where congested... */
+	atomic_t rs_sect_in; /* counter to measure the incoming resync data rate */
+	int c_sync_rate; /* current resync rate after delay_probe magic */
+	struct fifo_buffer rs_plan_s; /* correction values of resync planer */
+	int rs_in_flight; /* resync sectors in flight (to proxy, in proxy and from proxy) */
+	int rs_planed;    /* resync sectors already planed */
 };
 
 static inline struct drbd_conf *minor_to_mdev(unsigned int minor)
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index bff4f598d38f..ed09a840d838 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -2734,6 +2734,7 @@ void drbd_init_set_defaults(struct drbd_conf *mdev)
 	atomic_set(&mdev->net_cnt, 0);
 	atomic_set(&mdev->packet_seq, 0);
 	atomic_set(&mdev->pp_in_use, 0);
+	atomic_set(&mdev->rs_sect_in, 0);
 
 	mutex_init(&mdev->md_io_mutex);
 	mutex_init(&mdev->data.mutex);
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 7d384fd39c16..295b8d593708 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -1587,6 +1587,8 @@ static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n
 	struct crypto_hash *csums_tfm = NULL;
 	struct syncer_conf sc;
 	cpumask_var_t new_cpu_mask;
+	int *rs_plan_s = NULL;
+	int fifo_size;
 
 	if (!zalloc_cpumask_var(&new_cpu_mask, GFP_KERNEL)) {
 		retcode = ERR_NOMEM;
@@ -1687,6 +1689,16 @@ static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n
 	if (retcode != NO_ERROR)
 		goto fail;
 
+	fifo_size = (sc.c_plan_ahead * 10 * SLEEP_TIME) / HZ;
+	if (fifo_size != mdev->rs_plan_s.size && fifo_size > 0) {
+		rs_plan_s   = kzalloc(sizeof(int) * fifo_size, GFP_KERNEL);
+		if (!rs_plan_s) {
+			dev_err(DEV, "kmalloc of fifo_buffer failed");
+			retcode = ERR_NOMEM;
+			goto fail;
+		}
+	}
+
 	/* ok, assign the rest of it as well.
 	 * lock against receive_SyncParam() */
 	spin_lock(&mdev->peer_seq_lock);
@@ -1703,6 +1715,15 @@ static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n
 		mdev->verify_tfm = verify_tfm;
 		verify_tfm = NULL;
 	}
+
+	if (fifo_size != mdev->rs_plan_s.size) {
+		kfree(mdev->rs_plan_s.values);
+		mdev->rs_plan_s.values = rs_plan_s;
+		mdev->rs_plan_s.size   = fifo_size;
+		mdev->rs_planed = 0;
+		rs_plan_s = NULL;
+	}
+
 	spin_unlock(&mdev->peer_seq_lock);
 
 	if (get_ldev(mdev)) {
@@ -1734,6 +1755,7 @@ static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n
 
 	kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE);
 fail:
+	kfree(rs_plan_s);
 	free_cpumask_var(new_cpu_mask);
 	crypto_free_hash(csums_tfm);
 	crypto_free_hash(verify_tfm);
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 34bea972f734..5f80b22e711d 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -1640,6 +1640,8 @@ static int receive_RSDataReply(struct drbd_conf *mdev, struct p_header *h)
 		drbd_send_ack_dp(mdev, P_NEG_ACK, p);
 	}
 
+	atomic_add(data_size >> 9, &mdev->rs_sect_in);
+
 	return ok;
 }
 
@@ -2810,6 +2812,8 @@ static int receive_SyncParam(struct drbd_conf *mdev, struct p_header *h)
 	struct crypto_hash *verify_tfm = NULL;
 	struct crypto_hash *csums_tfm = NULL;
 	const int apv = mdev->agreed_pro_version;
+	int *rs_plan_s = NULL;
+	int fifo_size = 0;
 
 	exp_max_sz  = apv <= 87 ? sizeof(struct p_rs_param)
 		    : apv == 88 ? sizeof(struct p_rs_param)
@@ -2904,6 +2908,15 @@ static int receive_SyncParam(struct drbd_conf *mdev, struct p_header *h)
 			mdev->sync_conf.c_delay_target = be32_to_cpu(p->c_delay_target);
 			mdev->sync_conf.c_fill_target = be32_to_cpu(p->c_fill_target);
 			mdev->sync_conf.c_max_rate = be32_to_cpu(p->c_max_rate);
+
+			fifo_size = (mdev->sync_conf.c_plan_ahead * 10 * SLEEP_TIME) / HZ;
+			if (fifo_size != mdev->rs_plan_s.size && fifo_size > 0) {
+				rs_plan_s   = kzalloc(sizeof(int) * fifo_size, GFP_KERNEL);
+				if (!rs_plan_s) {
+					dev_err(DEV, "kmalloc of fifo_buffer failed");
+					goto disconnect;
+				}
+			}
 		}
 
 		spin_lock(&mdev->peer_seq_lock);
@@ -2922,6 +2935,12 @@ static int receive_SyncParam(struct drbd_conf *mdev, struct p_header *h)
 			mdev->csums_tfm = csums_tfm;
 			dev_info(DEV, "using csums-alg: \"%s\"\n", p->csums_alg);
 		}
+		if (fifo_size != mdev->rs_plan_s.size) {
+			kfree(mdev->rs_plan_s.values);
+			mdev->rs_plan_s.values = rs_plan_s;
+			mdev->rs_plan_s.size   = fifo_size;
+			mdev->rs_planed = 0;
+		}
 		spin_unlock(&mdev->peer_seq_lock);
 	}
 
@@ -4202,6 +4221,7 @@ static int got_IsInSync(struct drbd_conf *mdev, struct p_header *h)
 	/* rs_same_csums is supposed to count in units of BM_BLOCK_SIZE */
 	mdev->rs_same_csum += (blksize >> BM_BLOCK_SHIFT);
 	dec_rs_pending(mdev);
+	atomic_add(blksize >> 9, &mdev->rs_sect_in);
 
 	return TRUE;
 }
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index d94720f4bd07..fd3e1e9561cb 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -422,6 +422,89 @@ void resync_timer_fn(unsigned long data)
 		drbd_queue_work(&mdev->data.work, &mdev->resync_work);
 }
 
+static void fifo_set(struct fifo_buffer *fb, int value)
+{
+	int i;
+
+	for (i = 0; i < fb->size; i++)
+		fb->values[i] += value;
+}
+
+static int fifo_push(struct fifo_buffer *fb, int value)
+{
+	int ov;
+
+	ov = fb->values[fb->head_index];
+	fb->values[fb->head_index++] = value;
+
+	if (fb->head_index >= fb->size)
+		fb->head_index = 0;
+
+	return ov;
+}
+
+static void fifo_add_val(struct fifo_buffer *fb, int value)
+{
+	int i;
+
+	for (i = 0; i < fb->size; i++)
+		fb->values[i] += value;
+}
+
+int drbd_rs_controller(struct drbd_conf *mdev)
+{
+	unsigned int sect_in;  /* Number of sectors that came in since the last turn */
+	unsigned int want;     /* The number of sectors we want in the proxy */
+	int req_sect; /* Number of sectors to request in this turn */
+	int correction; /* Number of sectors more we need in the proxy*/
+	int cps; /* correction per invocation of drbd_rs_controller() */
+	int steps; /* Number of time steps to plan ahead */
+	int curr_corr;
+	int max_sect;
+
+	sect_in = atomic_xchg(&mdev->rs_sect_in, 0); /* Number of sectors that came in */
+	mdev->rs_in_flight -= sect_in;
+
+	spin_lock(&mdev->peer_seq_lock); /* get an atomic view on mdev->rs_plan_s */
+
+	steps = mdev->rs_plan_s.size; /* (mdev->sync_conf.c_plan_ahead * 10 * SLEEP_TIME) / HZ; */
+
+	if (mdev->rs_in_flight + sect_in == 0) { /* At start of resync */
+		want = ((mdev->sync_conf.rate * 2 * SLEEP_TIME) / HZ) * steps;
+	} else { /* normal path */
+		want = mdev->sync_conf.c_fill_target ? mdev->sync_conf.c_fill_target :
+			sect_in * mdev->sync_conf.c_delay_target * HZ / (SLEEP_TIME * 10);
+	}
+
+	correction = want - mdev->rs_in_flight - mdev->rs_planed;
+
+	/* Plan ahead */
+	cps = correction / steps;
+	fifo_add_val(&mdev->rs_plan_s, cps);
+	mdev->rs_planed += cps * steps;
+
+	/* What we do in this step */
+	curr_corr = fifo_push(&mdev->rs_plan_s, 0);
+	spin_unlock(&mdev->peer_seq_lock);
+	mdev->rs_planed -= curr_corr;
+
+	req_sect = sect_in + curr_corr;
+	if (req_sect < 0)
+		req_sect = 0;
+
+	max_sect = (mdev->sync_conf.c_max_rate * 2 * SLEEP_TIME) / HZ;
+	if (req_sect > max_sect)
+		req_sect = max_sect;
+
+	/*
+	dev_warn(DEV, "si=%u if=%d wa=%u co=%d st=%d cps=%d pl=%d cc=%d rs=%d\n",
+		 sect_in, mdev->rs_in_flight, want, correction,
+		 steps, cps, mdev->rs_planed, curr_corr, req_sect);
+	*/
+
+	return req_sect;
+}
+
 int w_make_resync_request(struct drbd_conf *mdev,
 		struct drbd_work *w, int cancel)
 {
@@ -459,7 +542,13 @@ int w_make_resync_request(struct drbd_conf *mdev,
 	max_segment_size = mdev->agreed_pro_version < 94 ?
 		queue_max_segment_size(mdev->rq_queue) : DRBD_MAX_SEGMENT_SIZE;
 
-	number = SLEEP_TIME * mdev->sync_conf.rate  / ((BM_BLOCK_SIZE / 1024) * HZ);
+	if (mdev->rs_plan_s.size) { /* mdev->sync_conf.c_plan_ahead */
+		number = drbd_rs_controller(mdev) >> (BM_BLOCK_SHIFT - 9);
+		mdev->c_sync_rate = number * HZ * (BM_BLOCK_SIZE / 1024) / SLEEP_TIME;
+	} else {
+		mdev->c_sync_rate = mdev->sync_conf.rate;
+		number = SLEEP_TIME * mdev->c_sync_rate  / ((BM_BLOCK_SIZE / 1024) * HZ);
+	}
 	pe = atomic_read(&mdev->rs_pending_cnt);
 
 	mutex_lock(&mdev->data.mutex);
@@ -593,6 +682,7 @@ next_sector:
 	}
 
  requeue:
+	mdev->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9));
 	mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME);
 	put_ldev(mdev);
 	return 1;
@@ -1419,6 +1509,12 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
 			drbd_resync_finished(mdev);
 		}
 
+		atomic_set(&mdev->rs_sect_in, 0);
+		mdev->rs_in_flight = 0;
+		mdev->rs_planed = 0;
+		spin_lock(&mdev->peer_seq_lock);
+		fifo_set(&mdev->rs_plan_s, 0);
+		spin_unlock(&mdev->peer_seq_lock);
 		/* ns.conn may already be != mdev->state.conn,
 		 * we may have been paused in between, or become paused until
 		 * the timer triggers.
-- 
cgit v1.2.3


From d207450cf2731c6a2afa8c78fb31c7206cd35eba Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Thu, 22 Jul 2010 15:27:27 +0200
Subject: drbd: Bugfix: rs_in_flight could become wrong if read_for_csum()
 requested reschedule later

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_worker.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index fd3e1e9561cb..30b8e466a224 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -512,7 +512,7 @@ int w_make_resync_request(struct drbd_conf *mdev,
 	sector_t sector;
 	const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
 	int max_segment_size;
-	int number, i, size, pe, mx;
+	int number, i, rollback_i, size, pe, mx;
 	int align, queued, sndbuf;
 
 	if (unlikely(cancel))
@@ -613,6 +613,7 @@ next_sector:
 		 * be prepared for all stripe sizes of software RAIDs.
 		 */
 		align = 1;
+		rollback_i = i;
 		for (;;) {
 			if (size + BM_BLOCK_SIZE > max_segment_size)
 				break;
@@ -654,6 +655,7 @@ next_sector:
 			case 2: /* Allocation failed */
 				drbd_rs_complete_io(mdev, sector);
 				mdev->bm_resync_fo = BM_SECT_TO_BIT(sector);
+				i = rollback_i;
 				goto requeue;
 			/* case 1: everything ok */
 			}
-- 
cgit v1.2.3


From 85719573dd716bc2ac3e098b44adfed884250bab Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Wed, 21 Jul 2010 10:20:17 +0200
Subject: drbd: Replaced some casts by an union. Improved comments

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_int.h      | 13 ++++++++-----
 drivers/block/drbd/drbd_receiver.c |  3 ++-
 drivers/block/drbd/drbd_worker.c   |  4 ++--
 3 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index facb72ccc56b..b0cbfa143775 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -759,7 +759,7 @@ struct digest_info {
 struct drbd_epoch_entry {
 	struct drbd_work w;
 	struct hlist_node colision;
-	struct drbd_epoch *epoch;
+	struct drbd_epoch *epoch; /* for writes */
 	struct drbd_conf *mdev;
 	struct page *pages;
 	atomic_t pending_bios;
@@ -767,7 +767,10 @@ struct drbd_epoch_entry {
 	/* see comments on ee flag bits below */
 	unsigned long flags;
 	sector_t sector;
-	u64 block_id;
+	union {
+		u64 block_id;
+		struct digest_info *digest;
+	};
 };
 
 /* ee flag bits.
@@ -1032,10 +1035,10 @@ struct drbd_conf {
 	spinlock_t epoch_lock;
 	unsigned int epochs;
 	enum write_ordering_e write_ordering;
-	struct list_head active_ee; /* IO in progress */
-	struct list_head sync_ee;   /* IO in progress */
+	struct list_head active_ee; /* IO in progress (P_DATA gets written to disk) */
+	struct list_head sync_ee;   /* IO in progress (P_RS_DATA_REPLY gets written to disk) */
 	struct list_head done_ee;   /* send ack */
-	struct list_head read_ee;   /* IO in progress */
+	struct list_head read_ee;   /* IO in progress (any read) */
 	struct list_head net_ee;    /* zero-copy network send in progress */
 	struct hlist_head *ee_hash; /* is proteced by req_lock! */
 	unsigned int ee_hash_s;
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 5f80b22e711d..0b03e3174f76 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -2097,7 +2097,7 @@ static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h)
 		if (drbd_recv(mdev, di->digest, digest_size) != digest_size)
 			goto out_free_e;
 
-		e->block_id = (u64)(unsigned long)di;
+		e->digest = di;
 		if (h->command == P_CSUM_RS_REQUEST) {
 			D_ASSERT(mdev->agreed_pro_version >= 89);
 			e->w.cb = w_e_end_csum_rs_req;
@@ -3769,6 +3769,7 @@ static void drbd_disconnect(struct drbd_conf *mdev)
 	drbd_thread_stop(&mdev->asender);
 	drbd_free_sock(mdev);
 
+	/* wait for current activity to cease. */
 	spin_lock_irq(&mdev->req_lock);
 	_drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
 	_drbd_wait_ee_list_empty(mdev, &mdev->sync_ee);
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 30b8e466a224..f979e22cc6fb 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -1018,7 +1018,7 @@ int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
 
 	drbd_rs_complete_io(mdev, e->sector);
 
-	di = (struct digest_info *)(unsigned long)e->block_id;
+	di = e->digest;
 
 	if (likely((e->flags & EE_WAS_ERROR) == 0)) {
 		/* quick hack to try to avoid a race against reconfiguration.
@@ -1126,7 +1126,7 @@ int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
 	 * the resync lru has been cleaned up already */
 	drbd_rs_complete_io(mdev, e->sector);
 
-	di = (struct digest_info *)(unsigned long)e->block_id;
+	di = e->digest;
 
 	if (likely((e->flags & EE_WAS_ERROR) == 0)) {
 		digest_size = crypto_hash_digestsize(mdev->verify_tfm);
-- 
cgit v1.2.3


From c36c3ced692b38d0cf90a5e6f875be2f9ebbc037 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Wed, 11 Aug 2010 20:42:55 +0200
Subject: drbd: let drbd_free_ee implicitly free any digest

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_int.h      | 4 ++++
 drivers/block/drbd/drbd_receiver.c | 7 +++++--
 drivers/block/drbd/drbd_worker.c   | 6 ------
 3 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index b0cbfa143775..3a941744f069 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -796,12 +796,16 @@ enum {
 	 * if any of those fail, we set this flag atomically
 	 * from the endio callback */
 	__EE_WAS_ERROR,
+
+	/* This ee has a pointer to a digest instead of a block id */
+	__EE_HAS_DIGEST,
 };
 #define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO)
 #define EE_MAY_SET_IN_SYNC     (1<<__EE_MAY_SET_IN_SYNC)
 #define EE_IS_BARRIER          (1<<__EE_IS_BARRIER)
 #define	EE_RESUBMITTED         (1<<__EE_RESUBMITTED)
 #define EE_WAS_ERROR           (1<<__EE_WAS_ERROR)
+#define EE_HAS_DIGEST          (1<<__EE_HAS_DIGEST)
 
 /* global flag bits */
 enum {
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 0b03e3174f76..2f9320be4906 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -377,6 +377,8 @@ struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
 
 void drbd_free_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
 {
+	if (e->flags & EE_HAS_DIGEST)
+		kfree(e->digest);
 	drbd_pp_free(mdev, e->pages);
 	D_ASSERT(atomic_read(&e->pending_bios) == 0);
 	D_ASSERT(hlist_unhashed(&e->colision));
@@ -2094,10 +2096,12 @@ static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h)
 		di->digest_size = digest_size;
 		di->digest = (((char *)di)+sizeof(struct digest_info));
 
+		e->digest = di;
+		e->flags |= EE_HAS_DIGEST;
+
 		if (drbd_recv(mdev, di->digest, digest_size) != digest_size)
 			goto out_free_e;
 
-		e->digest = di;
 		if (h->command == P_CSUM_RS_REQUEST) {
 			D_ASSERT(mdev->agreed_pro_version >= 89);
 			e->w.cb = w_e_end_csum_rs_req;
@@ -2159,7 +2163,6 @@ static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h)
 		return TRUE;
 
 out_free_e:
-	kfree(di);
 	put_ldev(mdev);
 	drbd_free_ee(mdev, e);
 	return FALSE;
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index f979e22cc6fb..48452fe83603 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -1052,9 +1052,6 @@ int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
 	}
 
 	dec_unacked(mdev);
-
-	kfree(di);
-
 	move_to_net_ee_or_free(mdev, e);
 
 	if (unlikely(!ok))
@@ -1145,9 +1142,6 @@ int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
 	}
 
 	dec_unacked(mdev);
-
-	kfree(di);
-
 	if (!eq)
 		drbd_ov_oos_found(mdev, e->sector, e->size);
 	else
-- 
cgit v1.2.3


From 0bb70bf601579b0d4c56acbb54b8eb0688541e19 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Wed, 11 Aug 2010 20:53:21 +0200
Subject: drbd: remove outdated comment and dead code

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_int.h | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 3a941744f069..72d204750408 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -858,10 +858,6 @@ struct drbd_bitmap; /* opaque for drbd_conf */
 
 /* THINK maybe we actually want to use the default "event/%s" worker threads
  * or similar in linux 2.6, which uses per cpu data and threads.
- *
- * To be general, this might need a spin_lock member.
- * For now, please use the mdev->req_lock to protect list_head,
- * see drbd_queue_work below.
  */
 struct drbd_work_queue {
 	struct list_head q;
@@ -1894,13 +1890,6 @@ static inline sector_t drbd_md_ss__(struct drbd_conf *mdev,
 	}
 }
 
-static inline void
-_drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w)
-{
-	list_add_tail(&w->list, &q->q);
-	up(&q->s);
-}
-
 static inline void
 drbd_queue_work_front(struct drbd_work_queue *q, struct drbd_work *w)
 {
-- 
cgit v1.2.3


From 1d7734a0df02ff5068ff8baa1447c7baee601db1 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Wed, 11 Aug 2010 21:21:50 +0200
Subject: drbd: use rolling marks for resync speed calculation

The current resync speed as displayed in /proc/drbd fluctuates a lot.
Using an array of rolling marks makes this calculation much more stable.
We used to have this (a long time ago with 0.7), but it got lost somehow.

If "stalled", do not discard the rest of the information, just add a
" (stalled)" tag to the progress line.

This patch also shortens a spinlock critical section somewhat, and
reduces the number of atomic operations in put_ldev.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_actlog.c | 29 +++++++++++++++--------------
 drivers/block/drbd/drbd_int.h    | 25 +++++++++++++++++--------
 drivers/block/drbd/drbd_main.c   | 29 ++++++++++++++++++++---------
 drivers/block/drbd/drbd_proc.c   | 27 +++++++++++++++++----------
 drivers/block/drbd/drbd_worker.c | 14 ++++++++++----
 5 files changed, 79 insertions(+), 45 deletions(-)

diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index 9400845d602e..b895470e53d7 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -965,29 +965,30 @@ void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size,
 	 * ok, (capacity & 7) != 0 sometimes, but who cares...
 	 * we count rs_{total,left} in bits, not sectors.
 	 */
-	spin_lock_irqsave(&mdev->al_lock, flags);
 	count = drbd_bm_clear_bits(mdev, sbnr, ebnr);
-	if (count) {
-		/* we need the lock for drbd_try_clear_on_disk_bm */
-		if (jiffies - mdev->rs_mark_time > HZ*10) {
-			/* should be rolling marks,
-			 * but we estimate only anyways. */
-			if (mdev->rs_mark_left != drbd_bm_total_weight(mdev) &&
+	if (count && get_ldev(mdev)) {
+		unsigned long now = jiffies;
+		unsigned long last = mdev->rs_mark_time[mdev->rs_last_mark];
+		int next = (mdev->rs_last_mark + 1) % DRBD_SYNC_MARKS;
+		if (time_after_eq(now, last + DRBD_SYNC_MARK_STEP)) {
+			unsigned long tw = drbd_bm_total_weight(mdev);
+			if (mdev->rs_mark_left[mdev->rs_last_mark] != tw &&
 			    mdev->state.conn != C_PAUSED_SYNC_T &&
 			    mdev->state.conn != C_PAUSED_SYNC_S) {
-				mdev->rs_mark_time = jiffies;
-				mdev->rs_mark_left = drbd_bm_total_weight(mdev);
+				mdev->rs_mark_time[next] = now;
+				mdev->rs_mark_left[next] = tw;
+				mdev->rs_last_mark = next;
 			}
 		}
-		if (get_ldev(mdev)) {
-			drbd_try_clear_on_disk_bm(mdev, sector, count, TRUE);
-			put_ldev(mdev);
-		}
+		spin_lock_irqsave(&mdev->al_lock, flags);
+		drbd_try_clear_on_disk_bm(mdev, sector, count, TRUE);
+		spin_unlock_irqrestore(&mdev->al_lock, flags);
+
 		/* just wake_up unconditional now, various lc_chaged(),
 		 * lc_put() in drbd_try_clear_on_disk_bm(). */
 		wake_up = 1;
+		put_ldev(mdev);
 	}
-	spin_unlock_irqrestore(&mdev->al_lock, flags);
 	if (wake_up)
 		wake_up(&mdev->al_wait);
 }
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 72d204750408..0fce3f36fc1c 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -997,12 +997,16 @@ struct drbd_conf {
 	unsigned long rs_start;
 	/* cumulated time in PausedSyncX state [unit jiffies] */
 	unsigned long rs_paused;
+	/* skipped because csum was equal [unit BM_BLOCK_SIZE] */
+	unsigned long rs_same_csum;
+#define DRBD_SYNC_MARKS 8
+#define DRBD_SYNC_MARK_STEP (3*HZ)
 	/* block not up-to-date at mark [unit BM_BLOCK_SIZE] */
-	unsigned long rs_mark_left;
+	unsigned long rs_mark_left[DRBD_SYNC_MARKS];
 	/* marks's time [unit jiffies] */
-	unsigned long rs_mark_time;
-	/* skipped because csum was equeal [unit BM_BLOCK_SIZE] */
-	unsigned long rs_same_csum;
+	unsigned long rs_mark_time[DRBD_SYNC_MARKS];
+	/* current index into rs_mark_{left,time} */
+	int rs_last_mark;
 
 	/* where does the admin want us to start? (sector) */
 	sector_t ov_start_sector;
@@ -1077,8 +1081,12 @@ struct drbd_conf {
 	u64 ed_uuid; /* UUID of the exposed data */
 	struct mutex state_mutex;
 	char congestion_reason;  /* Why we where congested... */
-	atomic_t rs_sect_in; /* counter to measure the incoming resync data rate */
-	int c_sync_rate; /* current resync rate after delay_probe magic */
+	atomic_t rs_sect_in; /* for incoming resync data rate, SyncTarget */
+	atomic_t rs_sect_ev; /* for submitted resync data rate, both */
+	int rs_last_sect_ev; /* counter to compare with */
+	int rs_last_events;  /* counter of read or write "events" (unit sectors)
+			      * on the lower level device when we last looked. */
+	int c_sync_rate; /* current resync rate after syncer throttle magic */
 	struct fifo_buffer rs_plan_s; /* correction values of resync planer */
 	int rs_in_flight; /* resync sectors in flight (to proxy, in proxy and from proxy) */
 	int rs_planed;    /* resync sectors already planed */
@@ -2072,10 +2080,11 @@ static inline int get_net_conf(struct drbd_conf *mdev)
 
 static inline void put_ldev(struct drbd_conf *mdev)
 {
+	int i = atomic_dec_return(&mdev->local_cnt);
 	__release(local);
-	if (atomic_dec_and_test(&mdev->local_cnt))
+	D_ASSERT(i >= 0);
+	if (i == 0)
 		wake_up(&mdev->misc_wait);
-	D_ASSERT(atomic_read(&mdev->local_cnt) >= 0);
 }
 
 #ifndef __CHECKER__
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index ed09a840d838..1ff8418ae0fa 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -1064,7 +1064,8 @@ int __drbd_set_state(struct drbd_conf *mdev,
 	if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) &&
 	    (ns.conn == C_SYNC_TARGET  || ns.conn == C_SYNC_SOURCE)) {
 		dev_info(DEV, "Syncer continues.\n");
-		mdev->rs_paused += (long)jiffies-(long)mdev->rs_mark_time;
+		mdev->rs_paused += (long)jiffies
+				  -(long)mdev->rs_mark_time[mdev->rs_last_mark];
 		if (ns.conn == C_SYNC_TARGET) {
 			if (!test_and_clear_bit(STOP_SYNC_TIMER, &mdev->flags))
 				mod_timer(&mdev->resync_timer, jiffies);
@@ -1078,27 +1079,33 @@ int __drbd_set_state(struct drbd_conf *mdev,
 	if ((os.conn == C_SYNC_TARGET  || os.conn == C_SYNC_SOURCE) &&
 	    (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) {
 		dev_info(DEV, "Resync suspended\n");
-		mdev->rs_mark_time = jiffies;
+		mdev->rs_mark_time[mdev->rs_last_mark] = jiffies;
 		if (ns.conn == C_PAUSED_SYNC_T)
 			set_bit(STOP_SYNC_TIMER, &mdev->flags);
 	}
 
 	if (os.conn == C_CONNECTED &&
 	    (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) {
+		unsigned long now = jiffies;
+		int i;
+
 		mdev->ov_position = 0;
-		mdev->rs_total =
-		mdev->rs_mark_left = drbd_bm_bits(mdev);
+		mdev->rs_total = drbd_bm_bits(mdev);
 		if (mdev->agreed_pro_version >= 90)
 			set_ov_position(mdev, ns.conn);
 		else
 			mdev->ov_start_sector = 0;
 		mdev->ov_left = mdev->rs_total
 			      - BM_SECT_TO_BIT(mdev->ov_position);
-		mdev->rs_start     =
-		mdev->rs_mark_time = jiffies;
+		mdev->rs_start = now;
 		mdev->ov_last_oos_size = 0;
 		mdev->ov_last_oos_start = 0;
 
+		for (i = 0; i < DRBD_SYNC_MARKS; i++) {
+			mdev->rs_mark_left[i] = mdev->rs_total;
+			mdev->rs_mark_time[i] = now;
+		}
+
 		if (ns.conn == C_VERIFY_S) {
 			dev_info(DEV, "Starting Online Verify from sector %llu\n",
 					(unsigned long long)mdev->ov_position);
@@ -2793,6 +2800,7 @@ void drbd_init_set_defaults(struct drbd_conf *mdev)
 
 void drbd_mdev_cleanup(struct drbd_conf *mdev)
 {
+	int i;
 	if (mdev->receiver.t_state != None)
 		dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n",
 				mdev->receiver.t_state);
@@ -2809,9 +2817,12 @@ void drbd_mdev_cleanup(struct drbd_conf *mdev)
 	mdev->p_size       =
 	mdev->rs_start     =
 	mdev->rs_total     =
-	mdev->rs_failed    =
-	mdev->rs_mark_left =
-	mdev->rs_mark_time = 0;
+	mdev->rs_failed    = 0;
+	mdev->rs_last_events = 0;
+	for (i = 0; i < DRBD_SYNC_MARKS; i++) {
+		mdev->rs_mark_left[i] = 0;
+		mdev->rs_mark_time[i] = 0;
+	}
 	D_ASSERT(mdev->net_conf == NULL);
 
 	drbd_set_my_capacity(mdev, 0);
diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c
index be3374b68460..c159692c3b56 100644
--- a/drivers/block/drbd/drbd_proc.c
+++ b/drivers/block/drbd/drbd_proc.c
@@ -57,6 +57,7 @@ static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq)
 	unsigned long db, dt, dbdt, rt, rs_left;
 	unsigned int res;
 	int i, x, y;
+	int stalled = 0;
 
 	drbd_get_syncer_progress(mdev, &rs_left, &res);
 
@@ -90,18 +91,17 @@ static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq)
 	 * db: blocks written from mark until now
 	 * rt: remaining time
 	 */
-	dt = (jiffies - mdev->rs_mark_time) / HZ;
-
-	if (dt > 20) {
-		/* if we made no update to rs_mark_time for too long,
-		 * we are stalled. show that. */
-		seq_printf(seq, "stalled\n");
-		return;
-	}
+	/* Rolling marks. last_mark+1 may just now be modified.  last_mark+2 is
+	 * at least (DRBD_SYNC_MARKS-2)*DRBD_SYNC_MARK_STEP old, and has at
+	 * least DRBD_SYNC_MARK_STEP time before it will be modified. */
+	i = (mdev->rs_last_mark + 2) % DRBD_SYNC_MARKS;
+	dt = (jiffies - mdev->rs_mark_time[i]) / HZ;
+	if (dt > (DRBD_SYNC_MARK_STEP * DRBD_SYNC_MARKS))
+		stalled = 1;
 
 	if (!dt)
 		dt++;
-	db = mdev->rs_mark_left - rs_left;
+	db = mdev->rs_mark_left[i] - rs_left;
 	rt = (dt * (rs_left / (db/100+1)))/100; /* seconds */
 
 	seq_printf(seq, "finish: %lu:%02lu:%02lu",
@@ -128,7 +128,14 @@ static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq)
 	else
 		seq_printf(seq, " (%ld)", dbdt);
 
-	seq_printf(seq, " K/sec\n");
+	if (mdev->state.conn == C_SYNC_TARGET) {
+		if (mdev->c_sync_rate > 1000)
+			seq_printf(seq, " want: %d,%03d",
+				   mdev->c_sync_rate / 1000, mdev->c_sync_rate % 1000);
+		else
+			seq_printf(seq, " want: %d", mdev->c_sync_rate);
+	}
+	seq_printf(seq, " K/sec%s\n", stalled ? " (stalled)" : "");
 }
 
 static void resync_dump_detail(struct seq_file *seq, struct lc_element *e)
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 48452fe83603..53b74254b1c2 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -1481,13 +1481,19 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
 		r = SS_UNKNOWN_ERROR;
 
 	if (r == SS_SUCCESS) {
-		mdev->rs_total     =
-		mdev->rs_mark_left = drbd_bm_total_weight(mdev);
+		unsigned long tw = drbd_bm_total_weight(mdev);
+		unsigned long now = jiffies;
+		int i;
+
 		mdev->rs_failed    = 0;
 		mdev->rs_paused    = 0;
-		mdev->rs_start     =
-		mdev->rs_mark_time = jiffies;
 		mdev->rs_same_csum = 0;
+		mdev->rs_total     = tw;
+		mdev->rs_start     = now;
+		for (i = 0; i < DRBD_SYNC_MARKS; i++) {
+			mdev->rs_mark_left[i] = tw;
+			mdev->rs_mark_time[i] = now;
+		}
 		_drbd_pause_after(mdev);
 	}
 	write_unlock_irq(&global_state_lock);
-- 
cgit v1.2.3


From 80a40e439e5a3f30b0a6210a1add6d7c33392e54 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Wed, 11 Aug 2010 23:28:00 +0200
Subject: drbd: reduce code duplication when receiving data requests

also canonicalize the return values of read_for_csum
and drbd_rs_begin_io to return -ESOMETHING, or 0 for success.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_actlog.c   | 12 +++++-----
 drivers/block/drbd/drbd_receiver.c | 46 ++++++++++----------------------------
 drivers/block/drbd/drbd_worker.c   | 22 ++++++++++--------
 3 files changed, 30 insertions(+), 50 deletions(-)

diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index b895470e53d7..ac04ef97eac2 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -1119,7 +1119,7 @@ static int _is_in_al(struct drbd_conf *mdev, unsigned int enr)
  * @mdev:	DRBD device.
  * @sector:	The sector number.
  *
- * This functions sleeps on al_wait. Returns 1 on success, 0 if interrupted.
+ * This functions sleeps on al_wait. Returns 0 on success, -EINTR if interrupted.
  */
 int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector)
 {
@@ -1130,10 +1130,10 @@ int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector)
 	sig = wait_event_interruptible(mdev->al_wait,
 			(bm_ext = _bme_get(mdev, enr)));
 	if (sig)
-		return 0;
+		return -EINTR;
 
 	if (test_bit(BME_LOCKED, &bm_ext->flags))
-		return 1;
+		return 0;
 
 	for (i = 0; i < AL_EXT_PER_BM_SECT; i++) {
 		sig = wait_event_interruptible(mdev->al_wait,
@@ -1146,13 +1146,11 @@ int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector)
 				wake_up(&mdev->al_wait);
 			}
 			spin_unlock_irq(&mdev->al_lock);
-			return 0;
+			return -EINTR;
 		}
 	}
-
 	set_bit(BME_LOCKED, &bm_ext->flags);
-
-	return 1;
+	return 0;
 }
 
 /**
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 2f9320be4906..346aed98027f 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -2068,21 +2068,12 @@ static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h)
 	case P_DATA_REQUEST:
 		e->w.cb = w_e_end_data_req;
 		fault_type = DRBD_FAULT_DT_RD;
-		break;
+		/* application IO, don't drbd_rs_begin_io */
+		goto submit;
+
 	case P_RS_DATA_REQUEST:
 		e->w.cb = w_e_end_rsdata_req;
 		fault_type = DRBD_FAULT_RS_RD;
-		/* Eventually this should become asynchronously. Currently it
-		 * blocks the whole receiver just to delay the reading of a
-		 * resync data block.
-		 * the drbd_work_queue mechanism is made for this...
-		 */
-		if (!drbd_rs_begin_io(mdev, sector)) {
-			/* we have been interrupted,
-			 * probably connection lost! */
-			D_ASSERT(signal_pending(current));
-			goto out_free_e;
-		}
 		break;
 
 	case P_OV_REPLY:
@@ -2108,13 +2099,8 @@ static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h)
 		} else if (h->command == P_OV_REPLY) {
 			e->w.cb = w_e_end_ov_reply;
 			dec_rs_pending(mdev);
-			break;
-		}
-
-		if (!drbd_rs_begin_io(mdev, sector)) {
-			/* we have been interrupted, probably connection lost! */
-			D_ASSERT(signal_pending(current));
-			goto out_free_e;
+			/* drbd_rs_begin_io done when we sent this request */
+			goto submit;
 		}
 		break;
 
@@ -2133,31 +2119,23 @@ static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h)
 		}
 		e->w.cb = w_e_end_ov_req;
 		fault_type = DRBD_FAULT_RS_RD;
-		/* Eventually this should become asynchronous. Currently it
-		 * blocks the whole receiver just to delay the reading of a
-		 * resync data block.
-		 * the drbd_work_queue mechanism is made for this...
-		 */
-		if (!drbd_rs_begin_io(mdev, sector)) {
-			/* we have been interrupted,
-			 * probably connection lost! */
-			D_ASSERT(signal_pending(current));
-			goto out_free_e;
-		}
 		break;
 
-
 	default:
 		dev_err(DEV, "unexpected command (%s) in receive_DataRequest\n",
 		    cmdname(h->command));
 		fault_type = DRBD_FAULT_MAX;
+		goto out_free_e;
 	}
 
-	spin_lock_irq(&mdev->req_lock);
-	list_add(&e->w.list, &mdev->read_ee);
-	spin_unlock_irq(&mdev->req_lock);
+	if (drbd_rs_begin_io(mdev, e->sector))
+		goto out_free_e;
 
+submit:
 	inc_unacked(mdev);
+	spin_lock_irq(&mdev->req_lock);
+	list_add_tail(&e->w.list, &mdev->read_ee);
+	spin_unlock_irq(&mdev->req_lock);
 
 	if (drbd_submit_ee(mdev, e, READ, fault_type) == 0)
 		return TRUE;
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 53b74254b1c2..f5d779b4d685 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -374,26 +374,26 @@ static int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size)
 	struct drbd_epoch_entry *e;
 
 	if (!get_ldev(mdev))
-		return 0;
+		return -EIO;
 
 	/* GFP_TRY, because if there is no memory available right now, this may
 	 * be rescheduled for later. It is "only" background resync, after all. */
 	e = drbd_alloc_ee(mdev, DRBD_MAGIC+0xbeef, sector, size, GFP_TRY);
 	if (!e)
-		goto fail;
+		goto defer;
 
+	e->w.cb = w_e_send_csum;
 	spin_lock_irq(&mdev->req_lock);
 	list_add(&e->w.list, &mdev->read_ee);
 	spin_unlock_irq(&mdev->req_lock);
 
-	e->w.cb = w_e_send_csum;
 	if (drbd_submit_ee(mdev, e, READ, DRBD_FAULT_RS_RD) == 0)
-		return 1;
+		return 0;
 
 	drbd_free_ee(mdev, e);
-fail:
+defer:
 	put_ldev(mdev);
-	return 2;
+	return -EAGAIN;
 }
 
 void resync_timer_fn(unsigned long data)
@@ -649,15 +649,19 @@ next_sector:
 			size = (capacity-sector)<<9;
 		if (mdev->agreed_pro_version >= 89 && mdev->csums_tfm) {
 			switch (read_for_csum(mdev, sector, size)) {
-			case 0: /* Disk failure*/
+			case -EIO: /* Disk failure */
 				put_ldev(mdev);
 				return 0;
-			case 2: /* Allocation failed */
+			case -EAGAIN: /* allocation failed, or ldev busy */
 				drbd_rs_complete_io(mdev, sector);
 				mdev->bm_resync_fo = BM_SECT_TO_BIT(sector);
 				i = rollback_i;
 				goto requeue;
-			/* case 1: everything ok */
+			case 0:
+				/* everything ok */
+				break;
+			default:
+				BUG();
 			}
 		} else {
 			inc_rs_pending(mdev);
-- 
cgit v1.2.3


From 0f0601f4ea2f53cfd8bcae060fb03d9bbde070ec Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Wed, 11 Aug 2010 23:40:24 +0200
Subject: drbd: new configuration parameter c-min-rate

We now track the data rate of locally submitted resync related requests,
and can thus detect non-resync activity on the lower level device.

If the current sync rate is above c-min-rate, and the lower level device
appears to be busy, we throttle the resyncer.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_int.h      |  1 +
 drivers/block/drbd/drbd_main.c     |  7 ++-
 drivers/block/drbd/drbd_nl.c       |  3 +-
 drivers/block/drbd/drbd_receiver.c | 88 +++++++++++++++++++++++++++++++++++---
 drivers/block/drbd/drbd_worker.c   | 29 ++++++++-----
 include/linux/drbd_limits.h        |  4 ++
 include/linux/drbd_nl.h            |  1 +
 7 files changed, 116 insertions(+), 17 deletions(-)

diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 0fce3f36fc1c..0fedcc0b8dc9 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -1513,6 +1513,7 @@ extern int w_restart_disk_io(struct drbd_conf *, struct drbd_work *, int);
 extern void resync_timer_fn(unsigned long data);
 
 /* drbd_receiver.c */
+extern int drbd_rs_should_slow_down(struct drbd_conf *mdev);
 extern int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e,
 		const unsigned rw, const int fault_type);
 extern int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list);
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 1ff8418ae0fa..db93eee7e543 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -1098,6 +1098,8 @@ int __drbd_set_state(struct drbd_conf *mdev,
 		mdev->ov_left = mdev->rs_total
 			      - BM_SECT_TO_BIT(mdev->ov_position);
 		mdev->rs_start = now;
+		mdev->rs_last_events = 0;
+		mdev->rs_last_sect_ev = 0;
 		mdev->ov_last_oos_size = 0;
 		mdev->ov_last_oos_start = 0;
 
@@ -2706,7 +2708,8 @@ static void drbd_set_defaults(struct drbd_conf *mdev)
 		/* .c_plan_ahead = */	DRBD_C_PLAN_AHEAD_DEF,
 		/* .c_delay_target = */	DRBD_C_DELAY_TARGET_DEF,
 		/* .c_fill_target = */	DRBD_C_FILL_TARGET_DEF,
-		/* .c_max_rate = */	DRBD_C_MAX_RATE_DEF
+		/* .c_max_rate = */	DRBD_C_MAX_RATE_DEF,
+		/* .c_min_rate = */	DRBD_C_MIN_RATE_DEF
 	};
 
 	/* Have to use that way, because the layout differs between
@@ -2742,6 +2745,7 @@ void drbd_init_set_defaults(struct drbd_conf *mdev)
 	atomic_set(&mdev->packet_seq, 0);
 	atomic_set(&mdev->pp_in_use, 0);
 	atomic_set(&mdev->rs_sect_in, 0);
+	atomic_set(&mdev->rs_sect_ev, 0);
 
 	mutex_init(&mdev->md_io_mutex);
 	mutex_init(&mdev->data.mutex);
@@ -2819,6 +2823,7 @@ void drbd_mdev_cleanup(struct drbd_conf *mdev)
 	mdev->rs_total     =
 	mdev->rs_failed    = 0;
 	mdev->rs_last_events = 0;
+	mdev->rs_last_sect_ev = 0;
 	for (i = 0; i < DRBD_SYNC_MARKS; i++) {
 		mdev->rs_mark_left[i] = 0;
 		mdev->rs_mark_time[i] = 0;
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 295b8d593708..6b35d41706e4 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -1604,7 +1604,8 @@ static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n
 		sc.c_plan_ahead = DRBD_C_PLAN_AHEAD_DEF;
 		sc.c_delay_target = DRBD_C_DELAY_TARGET_DEF;
 		sc.c_fill_target = DRBD_C_FILL_TARGET_DEF;
-		sc.c_max_rate   = DRBD_C_MAX_RATE_DEF;
+		sc.c_max_rate = DRBD_C_MAX_RATE_DEF;
+		sc.c_min_rate = DRBD_C_MIN_RATE_DEF;
 	} else
 		memcpy(&sc, &mdev->sync_conf, sizeof(struct syncer_conf));
 
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 346aed98027f..0d9967fef528 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -1561,6 +1561,7 @@ static int recv_resync_read(struct drbd_conf *mdev, sector_t sector, int data_si
 	list_add(&e->w.list, &mdev->sync_ee);
 	spin_unlock_irq(&mdev->req_lock);
 
+	atomic_add(data_size >> 9, &mdev->rs_sect_ev);
 	if (drbd_submit_ee(mdev, e, WRITE, DRBD_FAULT_RS_WR) == 0)
 		return TRUE;
 
@@ -2017,17 +2018,66 @@ out_interrupted:
 	return FALSE;
 }
 
+/* We may throttle resync, if the lower device seems to be busy,
+ * and current sync rate is above c_min_rate.
+ *
+ * To decide whether or not the lower device is busy, we use a scheme similar
+ * to MD RAID is_mddev_idle(): if the partition stats reveal "significant"
+ * (more than 64 sectors) of activity we cannot account for with our own resync
+ * activity, it obviously is "busy".
+ *
+ * The current sync rate used here uses only the most recent two step marks,
+ * to have a short time average so we can react faster.
+ */
+int drbd_rs_should_slow_down(struct drbd_conf *mdev)
+{
+	struct gendisk *disk = mdev->ldev->backing_bdev->bd_contains->bd_disk;
+	unsigned long db, dt, dbdt;
+	int curr_events;
+	int throttle = 0;
+
+	/* feature disabled? */
+	if (mdev->sync_conf.c_min_rate == 0)
+		return 0;
+
+	curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
+		      (int)part_stat_read(&disk->part0, sectors[1]) -
+			atomic_read(&mdev->rs_sect_ev);
+	if (!mdev->rs_last_events || curr_events - mdev->rs_last_events > 64) {
+		unsigned long rs_left;
+		int i;
+
+		mdev->rs_last_events = curr_events;
+
+		/* sync speed average over the last 2*DRBD_SYNC_MARK_STEP,
+		 * approx. */
+		i = (mdev->rs_last_mark + DRBD_SYNC_MARKS-2) % DRBD_SYNC_MARKS;
+		rs_left = drbd_bm_total_weight(mdev) - mdev->rs_failed;
+
+		dt = ((long)jiffies - (long)mdev->rs_mark_time[i]) / HZ;
+		if (!dt)
+			dt++;
+		db = mdev->rs_mark_left[i] - rs_left;
+		dbdt = Bit2KB(db/dt);
+
+		if (dbdt > mdev->sync_conf.c_min_rate)
+			throttle = 1;
+	}
+	return throttle;
+}
+
+
 static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h)
 {
 	sector_t sector;
 	const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
 	struct drbd_epoch_entry *e;
 	struct digest_info *di = NULL;
+	struct p_block_req *p = (struct p_block_req *)h;
+	const int brps = sizeof(*p)-sizeof(*h);
 	int size, digest_size;
 	unsigned int fault_type;
-	struct p_block_req *p =
-		(struct p_block_req *)h;
-	const int brps = sizeof(*p)-sizeof(*h);
+
 
 	if (drbd_recv(mdev, h->payload, brps) != brps)
 		return FALSE;
@@ -2099,8 +2149,9 @@ static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h)
 		} else if (h->command == P_OV_REPLY) {
 			e->w.cb = w_e_end_ov_reply;
 			dec_rs_pending(mdev);
-			/* drbd_rs_begin_io done when we sent this request */
-			goto submit;
+			/* drbd_rs_begin_io done when we sent this request,
+			 * but accounting still needs to be done. */
+			goto submit_for_resync;
 		}
 		break;
 
@@ -2128,9 +2179,36 @@ static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h)
 		goto out_free_e;
 	}
 
+	/* Throttle, drbd_rs_begin_io and submit should become asynchronous
+	 * wrt the receiver, but it is not as straightforward as it may seem.
+	 * Various places in the resync start and stop logic assume resync
+	 * requests are processed in order, requeuing this on the worker thread
+	 * introduces a bunch of new code for synchronization between threads.
+	 *
+	 * Unlimited throttling before drbd_rs_begin_io may stall the resync
+	 * "forever", throttling after drbd_rs_begin_io will lock that extent
+	 * for application writes for the same time.  For now, just throttle
+	 * here, where the rest of the code expects the receiver to sleep for
+	 * a while, anyways.
+	 */
+
+	/* Throttle before drbd_rs_begin_io, as that locks out application IO;
+	 * this defers syncer requests for some time, before letting at least
+	 * on request through.  The resync controller on the receiving side
+	 * will adapt to the incoming rate accordingly.
+	 *
+	 * We cannot throttle here if remote is Primary/SyncTarget:
+	 * we would also throttle its application reads.
+	 * In that case, throttling is done on the SyncTarget only.
+	 */
+	if (mdev->state.peer != R_PRIMARY && drbd_rs_should_slow_down(mdev))
+		msleep(100);
 	if (drbd_rs_begin_io(mdev, e->sector))
 		goto out_free_e;
 
+submit_for_resync:
+	atomic_add(size >> 9, &mdev->rs_sect_ev);
+
 submit:
 	inc_unacked(mdev);
 	spin_lock_irq(&mdev->req_lock);
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index f5d779b4d685..99c937acb471 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -215,10 +215,8 @@ void drbd_endio_sec(struct bio *bio, int error)
  */
 void drbd_endio_pri(struct bio *bio, int error)
 {
-	unsigned long flags;
 	struct drbd_request *req = bio->bi_private;
 	struct drbd_conf *mdev = req->mdev;
-	struct bio_and_error m;
 	enum drbd_req_event what;
 	int uptodate = bio_flagged(bio, BIO_UPTODATE);
 
@@ -244,12 +242,7 @@ void drbd_endio_pri(struct bio *bio, int error)
 	bio_put(req->private_bio);
 	req->private_bio = ERR_PTR(error);
 
-	spin_lock_irqsave(&mdev->req_lock, flags);
-	__req_mod(req, what, &m);
-	spin_unlock_irqrestore(&mdev->req_lock, flags);
-
-	if (m.bio)
-		complete_master_bio(mdev, &m);
+	req_mod(req, what);
 }
 
 int w_read_retry_remote(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
@@ -376,6 +369,9 @@ static int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size)
 	if (!get_ldev(mdev))
 		return -EIO;
 
+	if (drbd_rs_should_slow_down(mdev))
+		goto defer;
+
 	/* GFP_TRY, because if there is no memory available right now, this may
 	 * be rescheduled for later. It is "only" background resync, after all. */
 	e = drbd_alloc_ee(mdev, DRBD_MAGIC+0xbeef, sector, size, GFP_TRY);
@@ -387,6 +383,7 @@ static int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size)
 	list_add(&e->w.list, &mdev->read_ee);
 	spin_unlock_irq(&mdev->req_lock);
 
+	atomic_add(size >> 9, &mdev->rs_sect_ev);
 	if (drbd_submit_ee(mdev, e, READ, DRBD_FAULT_RS_RD) == 0)
 		return 0;
 
@@ -512,8 +509,9 @@ int w_make_resync_request(struct drbd_conf *mdev,
 	sector_t sector;
 	const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
 	int max_segment_size;
-	int number, i, rollback_i, size, pe, mx;
+	int number, rollback_i, size, pe, mx;
 	int align, queued, sndbuf;
+	int i = 0;
 
 	if (unlikely(cancel))
 		return 1;
@@ -549,7 +547,14 @@ int w_make_resync_request(struct drbd_conf *mdev,
 		mdev->c_sync_rate = mdev->sync_conf.rate;
 		number = SLEEP_TIME * mdev->c_sync_rate  / ((BM_BLOCK_SIZE / 1024) * HZ);
 	}
-	pe = atomic_read(&mdev->rs_pending_cnt);
+
+	/* Throttle resync on lower level disk activity, which may also be
+	 * caused by application IO on Primary/SyncTarget.
+	 * Keep this after the call to drbd_rs_controller, as that assumes
+	 * to be called as precisely as possible every SLEEP_TIME,
+	 * and would be confused otherwise. */
+	if (drbd_rs_should_slow_down(mdev))
+		goto requeue;
 
 	mutex_lock(&mdev->data.mutex);
 	if (mdev->data.socket)
@@ -563,6 +568,7 @@ int w_make_resync_request(struct drbd_conf *mdev,
 		mx = number;
 
 	/* Limit the number of pending RS requests to no more than the peer's receive buffer */
+	pe = atomic_read(&mdev->rs_pending_cnt);
 	if ((pe + number) > mx) {
 		number = mx - pe;
 	}
@@ -1492,6 +1498,8 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
 		mdev->rs_failed    = 0;
 		mdev->rs_paused    = 0;
 		mdev->rs_same_csum = 0;
+		mdev->rs_last_events = 0;
+		mdev->rs_last_sect_ev = 0;
 		mdev->rs_total     = tw;
 		mdev->rs_start     = now;
 		for (i = 0; i < DRBD_SYNC_MARKS; i++) {
@@ -1516,6 +1524,7 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
 		}
 
 		atomic_set(&mdev->rs_sect_in, 0);
+		atomic_set(&mdev->rs_sect_ev, 0);
 		mdev->rs_in_flight = 0;
 		mdev->rs_planed = 0;
 		spin_lock(&mdev->peer_seq_lock);
diff --git a/include/linux/drbd_limits.h b/include/linux/drbd_limits.h
index 06dbba47a8ef..0b24ded6fffd 100644
--- a/include/linux/drbd_limits.h
+++ b/include/linux/drbd_limits.h
@@ -150,5 +150,9 @@
 #define DRBD_C_MAX_RATE_MAX     (4 << 20)
 #define DRBD_C_MAX_RATE_DEF     102400
 
+#define DRBD_C_MIN_RATE_MIN     0 /* kByte/sec */
+#define DRBD_C_MIN_RATE_MAX     (4 << 20)
+#define DRBD_C_MIN_RATE_DEF     4096
+
 #undef RANGE
 #endif
diff --git a/include/linux/drbd_nl.h b/include/linux/drbd_nl.h
index e23683c87ca1..ade91107c9a5 100644
--- a/include/linux/drbd_nl.h
+++ b/include/linux/drbd_nl.h
@@ -92,6 +92,7 @@ NL_PACKET(syncer_conf, 8,
 	NL_INTEGER(     77,	T_MAY_IGNORE,	c_delay_target)
 	NL_INTEGER(     78,	T_MAY_IGNORE,	c_fill_target)
 	NL_INTEGER(     79,	T_MAY_IGNORE,	c_max_rate)
+	NL_INTEGER(     80,	T_MAY_IGNORE,	c_min_rate)
 )
 
 NL_PACKET(invalidate, 9, )
-- 
cgit v1.2.3


From 48acf8689847c061bd82c808c379f1bd79dfe407 Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Mon, 23 Aug 2010 15:51:56 +0200
Subject: drbd: Microfix: Assigning sector once is sufficient

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_receiver.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 0d9967fef528..591a171291d9 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -365,7 +365,6 @@ struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
 	e->size = data_size;
 	e->flags = 0;
 	e->sector = sector;
-	e->sector = sector;
 	e->block_id = id;
 
 	return e;
-- 
cgit v1.2.3


From 204bba9965c4cc175bf5bc65ddd19889e9085c72 Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Mon, 23 Aug 2010 16:17:13 +0200
Subject: drbd: Bugfix for regression introduced with f9bc8913c06022e

If we intent to use the block_id member of an epoch entry,
we may not use the digest member.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_worker.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 99c937acb471..1eeb55423b3e 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -1052,7 +1052,9 @@ int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
 			ok = drbd_send_ack(mdev, P_RS_IS_IN_SYNC, e);
 		} else {
 			inc_rs_pending(mdev);
-			e->block_id = ID_SYNCER;
+			e->block_id = ID_SYNCER; /* By setting block_id, digest pointer becomes invalid! */
+			e->flags &= ~EE_HAS_DIGEST; /* This e no longer has a digest pointer */
+			kfree(di);
 			ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e);
 		}
 	} else {
-- 
cgit v1.2.3


From 0b70a13dac014ec9274640b9e945bde493ba365e Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Fri, 20 Aug 2010 13:36:10 +0200
Subject: drbd: Sending of big packets, for payloads from 64KByte to 4GByte

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_int.h      | 74 +++++++++++++++++-------------
 drivers/block/drbd/drbd_main.c     | 79 +++++++++++++++++++-------------
 drivers/block/drbd/drbd_receiver.c | 94 +++++++++++++++++++-------------------
 drivers/block/drbd/drbd_worker.c   |  2 +-
 include/linux/drbd.h               |  2 +
 5 files changed, 139 insertions(+), 112 deletions(-)

diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 0fedcc0b8dc9..3f10efc2ac14 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -337,13 +337,25 @@ static inline void bm_xfer_ctx_bit_to_word_offset(struct bm_xfer_ctx *c)
  * NOTE that the payload starts at a long aligned offset,
  * regardless of 32 or 64 bit arch!
  */
-struct p_header {
+struct p_header80 {
 	u32	  magic;
 	u16	  command;
 	u16	  length;	/* bytes of data after this header */
 	u8	  payload[0];
 } __packed;
-/* 8 bytes. packet FIXED for the next century! */
+
+/* Header for big packets, Used for data packets exceeding 64kB */
+struct p_header95 {
+	u16	  magic;	/* use DRBD_MAGIC_BIG here */
+	u16	  command;
+	u32	  length;
+	u8	  payload[0];
+} __packed;
+
+union p_header {
+	struct p_header80 h80;
+	struct p_header95 h95;
+};
 
 /*
  * short commands, packets without payload, plain p_header:
@@ -367,7 +379,7 @@ struct p_header {
 #define DP_MAY_SET_IN_SYNC    4
 
 struct p_data {
-	struct p_header head;
+	union p_header head;
 	u64	    sector;    /* 64 bits sector number */
 	u64	    block_id;  /* to identify the request in protocol B&C */
 	u32	    seq_num;
@@ -383,7 +395,7 @@ struct p_data {
  *   P_DATA_REQUEST, P_RS_DATA_REQUEST
  */
 struct p_block_ack {
-	struct p_header head;
+	struct p_header80 head;
 	u64	    sector;
 	u64	    block_id;
 	u32	    blksize;
@@ -392,7 +404,7 @@ struct p_block_ack {
 
 
 struct p_block_req {
-	struct p_header head;
+	struct p_header80 head;
 	u64 sector;
 	u64 block_id;
 	u32 blksize;
@@ -409,7 +421,7 @@ struct p_block_req {
  */
 
 struct p_handshake {
-	struct p_header head;	/* 8 bytes */
+	struct p_header80 head;	/* 8 bytes */
 	u32 protocol_min;
 	u32 feature_flags;
 	u32 protocol_max;
@@ -424,19 +436,19 @@ struct p_handshake {
 /* 80 bytes, FIXED for the next century */
 
 struct p_barrier {
-	struct p_header head;
+	struct p_header80 head;
 	u32 barrier;	/* barrier number _handle_ only */
 	u32 pad;	/* to multiple of 8 Byte */
 } __packed;
 
 struct p_barrier_ack {
-	struct p_header head;
+	struct p_header80 head;
 	u32 barrier;
 	u32 set_size;
 } __packed;
 
 struct p_rs_param {
-	struct p_header head;
+	struct p_header80 head;
 	u32 rate;
 
 	      /* Since protocol version 88 and higher. */
@@ -444,7 +456,7 @@ struct p_rs_param {
 } __packed;
 
 struct p_rs_param_89 {
-	struct p_header head;
+	struct p_header80 head;
 	u32 rate;
         /* protocol version 89: */
 	char verify_alg[SHARED_SECRET_MAX];
@@ -452,7 +464,7 @@ struct p_rs_param_89 {
 } __packed;
 
 struct p_rs_param_95 {
-	struct p_header head;
+	struct p_header80 head;
 	u32 rate;
 	char verify_alg[SHARED_SECRET_MAX];
 	char csums_alg[SHARED_SECRET_MAX];
@@ -468,7 +480,7 @@ enum drbd_conn_flags {
 };
 
 struct p_protocol {
-	struct p_header head;
+	struct p_header80 head;
 	u32 protocol;
 	u32 after_sb_0p;
 	u32 after_sb_1p;
@@ -482,17 +494,17 @@ struct p_protocol {
 } __packed;
 
 struct p_uuids {
-	struct p_header head;
+	struct p_header80 head;
 	u64 uuid[UI_EXTENDED_SIZE];
 } __packed;
 
 struct p_rs_uuid {
-	struct p_header head;
+	struct p_header80 head;
 	u64	    uuid;
 } __packed;
 
 struct p_sizes {
-	struct p_header head;
+	struct p_header80 head;
 	u64	    d_size;  /* size of disk */
 	u64	    u_size;  /* user requested size */
 	u64	    c_size;  /* current exported size */
@@ -502,18 +514,18 @@ struct p_sizes {
 } __packed;
 
 struct p_state {
-	struct p_header head;
+	struct p_header80 head;
 	u32	    state;
 } __packed;
 
 struct p_req_state {
-	struct p_header head;
+	struct p_header80 head;
 	u32	    mask;
 	u32	    val;
 } __packed;
 
 struct p_req_state_reply {
-	struct p_header head;
+	struct p_header80 head;
 	u32	    retcode;
 } __packed;
 
@@ -528,7 +540,7 @@ struct p_drbd06_param {
 } __packed;
 
 struct p_discard {
-	struct p_header head;
+	struct p_header80 head;
 	u64	    block_id;
 	u32	    seq_num;
 	u32	    pad;
@@ -544,7 +556,7 @@ enum drbd_bitmap_code {
 };
 
 struct p_compressed_bm {
-	struct p_header head;
+	struct p_header80 head;
 	/* (encoding & 0x0f): actual encoding, see enum drbd_bitmap_code
 	 * (encoding & 0x80): polarity (set/unset) of first runlength
 	 * ((encoding >> 4) & 0x07): pad_bits, number of trailing zero bits
@@ -555,10 +567,10 @@ struct p_compressed_bm {
 	u8 code[0];
 } __packed;
 
-struct p_delay_probe {
-	struct p_header head;
-	u32	seq_num; /* sequence number to match the two probe packets */
-	u32	offset;	 /* usecs the probe got sent after the reference time point */
+struct p_delay_probe93 {
+	struct p_header80 head;
+	u32     seq_num; /* sequence number to match the two probe packets */
+	u32     offset;  /* usecs the probe got sent after the reference time point */
 } __packed;
 
 /* DCBP: Drbd Compressed Bitmap Packet ... */
@@ -605,7 +617,7 @@ DCBP_set_pad_bits(struct p_compressed_bm *p, int n)
  * so we need to use the fixed size 4KiB page size
  * most architechtures have used for a long time.
  */
-#define BM_PACKET_PAYLOAD_BYTES (4096 - sizeof(struct p_header))
+#define BM_PACKET_PAYLOAD_BYTES (4096 - sizeof(struct p_header80))
 #define BM_PACKET_WORDS (BM_PACKET_PAYLOAD_BYTES/sizeof(long))
 #define BM_PACKET_VLI_BYTES_MAX (4096 - sizeof(struct p_compressed_bm))
 #if (PAGE_SIZE < 4096)
@@ -614,7 +626,7 @@ DCBP_set_pad_bits(struct p_compressed_bm *p, int n)
 #endif
 
 union p_polymorph {
-        struct p_header          header;
+        struct p_header80        header;
         struct p_handshake       handshake;
         struct p_data            data;
         struct p_block_ack       block_ack;
@@ -1188,12 +1200,12 @@ extern int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_f
 extern int _drbd_send_state(struct drbd_conf *mdev);
 extern int drbd_send_state(struct drbd_conf *mdev);
 extern int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
-			enum drbd_packets cmd, struct p_header *h,
+			enum drbd_packets cmd, struct p_header80 *h,
 			size_t size, unsigned msg_flags);
 #define USE_DATA_SOCKET 1
 #define USE_META_SOCKET 0
 extern int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
-			enum drbd_packets cmd, struct p_header *h,
+			enum drbd_packets cmd, struct p_header80 *h,
 			size_t size);
 extern int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd,
 			char *data, size_t size);
@@ -1936,19 +1948,19 @@ static inline void request_ping(struct drbd_conf *mdev)
 static inline int drbd_send_short_cmd(struct drbd_conf *mdev,
 	enum drbd_packets cmd)
 {
-	struct p_header h;
+	struct p_header80 h;
 	return drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd, &h, sizeof(h));
 }
 
 static inline int drbd_send_ping(struct drbd_conf *mdev)
 {
-	struct p_header h;
+	struct p_header80 h;
 	return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING, &h, sizeof(h));
 }
 
 static inline int drbd_send_ping_ack(struct drbd_conf *mdev)
 {
-	struct p_header h;
+	struct p_header80 h;
 	return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING_ACK, &h, sizeof(h));
 }
 
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index db93eee7e543..f3f4ea9c5eb9 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -1647,7 +1647,7 @@ void drbd_thread_current_set_cpu(struct drbd_conf *mdev)
 
 /* the appropriate socket mutex must be held already */
 int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
-			  enum drbd_packets cmd, struct p_header *h,
+			  enum drbd_packets cmd, struct p_header80 *h,
 			  size_t size, unsigned msg_flags)
 {
 	int sent, ok;
@@ -1657,7 +1657,7 @@ int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
 
 	h->magic   = BE_DRBD_MAGIC;
 	h->command = cpu_to_be16(cmd);
-	h->length  = cpu_to_be16(size-sizeof(struct p_header));
+	h->length  = cpu_to_be16(size-sizeof(struct p_header80));
 
 	sent = drbd_send(mdev, sock, h, size, msg_flags);
 
@@ -1672,7 +1672,7 @@ int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
  * when we hold the appropriate socket mutex.
  */
 int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
-		  enum drbd_packets cmd, struct p_header *h, size_t size)
+		  enum drbd_packets cmd, struct p_header80 *h, size_t size)
 {
 	int ok = 0;
 	struct socket *sock;
@@ -1700,7 +1700,7 @@ int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
 int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data,
 		   size_t size)
 {
-	struct p_header h;
+	struct p_header80 h;
 	int ok;
 
 	h.magic   = BE_DRBD_MAGIC;
@@ -1807,7 +1807,7 @@ int drbd_send_protocol(struct drbd_conf *mdev)
 		strcpy(p->integrity_alg, mdev->net_conf->integrity_alg);
 
 	rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL,
-			   (struct p_header *)p, size);
+			   (struct p_header80 *)p, size);
 	kfree(p);
 	return rv;
 }
@@ -1833,7 +1833,7 @@ int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags)
 	put_ldev(mdev);
 
 	return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS,
-			     (struct p_header *)&p, sizeof(p));
+			     (struct p_header80 *)&p, sizeof(p));
 }
 
 int drbd_send_uuids(struct drbd_conf *mdev)
@@ -1854,7 +1854,7 @@ int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val)
 	p.uuid = cpu_to_be64(val);
 
 	return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID,
-			     (struct p_header *)&p, sizeof(p));
+			     (struct p_header80 *)&p, sizeof(p));
 }
 
 int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags)
@@ -1884,7 +1884,7 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags fl
 	p.dds_flags = cpu_to_be16(flags);
 
 	ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES,
-			   (struct p_header *)&p, sizeof(p));
+			   (struct p_header80 *)&p, sizeof(p));
 	return ok;
 }
 
@@ -1909,7 +1909,7 @@ int drbd_send_state(struct drbd_conf *mdev)
 
 	if (likely(sock != NULL)) {
 		ok = _drbd_send_cmd(mdev, sock, P_STATE,
-				    (struct p_header *)&p, sizeof(p), 0);
+				    (struct p_header80 *)&p, sizeof(p), 0);
 	}
 
 	mutex_unlock(&mdev->data.mutex);
@@ -1927,7 +1927,7 @@ int drbd_send_state_req(struct drbd_conf *mdev,
 	p.val     = cpu_to_be32(val.i);
 
 	return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ,
-			     (struct p_header *)&p, sizeof(p));
+			     (struct p_header80 *)&p, sizeof(p));
 }
 
 int drbd_send_sr_reply(struct drbd_conf *mdev, int retcode)
@@ -1937,7 +1937,7 @@ int drbd_send_sr_reply(struct drbd_conf *mdev, int retcode)
 	p.retcode    = cpu_to_be32(retcode);
 
 	return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY,
-			     (struct p_header *)&p, sizeof(p));
+			     (struct p_header80 *)&p, sizeof(p));
 }
 
 int fill_bitmap_rle_bits(struct drbd_conf *mdev,
@@ -2036,7 +2036,7 @@ int fill_bitmap_rle_bits(struct drbd_conf *mdev,
 
 enum { OK, FAILED, DONE }
 send_bitmap_rle_or_plain(struct drbd_conf *mdev,
-	struct p_header *h, struct bm_xfer_ctx *c)
+	struct p_header80 *h, struct bm_xfer_ctx *c)
 {
 	struct p_compressed_bm *p = (void*)h;
 	unsigned long num_words;
@@ -2066,12 +2066,12 @@ send_bitmap_rle_or_plain(struct drbd_conf *mdev,
 		if (len)
 			drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload);
 		ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BITMAP,
-				   h, sizeof(struct p_header) + len, 0);
+				   h, sizeof(struct p_header80) + len, 0);
 		c->word_offset += num_words;
 		c->bit_offset = c->word_offset * BITS_PER_LONG;
 
 		c->packets[1]++;
-		c->bytes[1] += sizeof(struct p_header) + len;
+		c->bytes[1] += sizeof(struct p_header80) + len;
 
 		if (c->bit_offset > c->bm_bits)
 			c->bit_offset = c->bm_bits;
@@ -2087,14 +2087,14 @@ send_bitmap_rle_or_plain(struct drbd_conf *mdev,
 int _drbd_send_bitmap(struct drbd_conf *mdev)
 {
 	struct bm_xfer_ctx c;
-	struct p_header *p;
+	struct p_header80 *p;
 	int ret;
 
 	ERR_IF(!mdev->bitmap) return FALSE;
 
 	/* maybe we should use some per thread scratch page,
 	 * and allocate that during initial device creation? */
-	p = (struct p_header *) __get_free_page(GFP_NOIO);
+	p = (struct p_header80 *) __get_free_page(GFP_NOIO);
 	if (!p) {
 		dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
 		return FALSE;
@@ -2152,7 +2152,7 @@ int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size)
 	if (mdev->state.conn < C_CONNECTED)
 		return FALSE;
 	ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK,
-			(struct p_header *)&p, sizeof(p));
+			(struct p_header80 *)&p, sizeof(p));
 	return ok;
 }
 
@@ -2180,7 +2180,7 @@ static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
 	if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED)
 		return FALSE;
 	ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd,
-				(struct p_header *)&p, sizeof(p));
+				(struct p_header80 *)&p, sizeof(p));
 	return ok;
 }
 
@@ -2188,8 +2188,8 @@ int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
 		     struct p_data *dp)
 {
 	const int header_size = sizeof(struct p_data)
-			      - sizeof(struct p_header);
-	int data_size  = ((struct p_header *)dp)->length - header_size;
+			      - sizeof(struct p_header80);
+	int data_size  = ((struct p_header80 *)dp)->length - header_size;
 
 	return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size),
 			      dp->block_id);
@@ -2238,7 +2238,7 @@ int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
 	p.blksize  = cpu_to_be32(size);
 
 	ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd,
-				(struct p_header *)&p, sizeof(p));
+				(struct p_header80 *)&p, sizeof(p));
 	return ok;
 }
 
@@ -2256,7 +2256,7 @@ int drbd_send_drequest_csum(struct drbd_conf *mdev,
 
 	p.head.magic   = BE_DRBD_MAGIC;
 	p.head.command = cpu_to_be16(cmd);
-	p.head.length  = cpu_to_be16(sizeof(p) - sizeof(struct p_header) + digest_size);
+	p.head.length  = cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + digest_size);
 
 	mutex_lock(&mdev->data.mutex);
 
@@ -2278,7 +2278,7 @@ int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size)
 	p.blksize  = cpu_to_be32(size);
 
 	ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST,
-			   (struct p_header *)&p, sizeof(p));
+			   (struct p_header80 *)&p, sizeof(p));
 	return ok;
 }
 
@@ -2447,10 +2447,17 @@ int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
 	dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
 		crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
 
-	p.head.magic   = BE_DRBD_MAGIC;
-	p.head.command = cpu_to_be16(P_DATA);
-	p.head.length  =
-		cpu_to_be16(sizeof(p) - sizeof(struct p_header) + dgs + req->size);
+	if (req->size <= (1 << 15)) {
+		p.head.h80.magic   = BE_DRBD_MAGIC;
+		p.head.h80.command = cpu_to_be16(P_DATA);
+		p.head.h80.length  =
+			cpu_to_be16(sizeof(p) - sizeof(union p_header) + dgs + req->size);
+	} else {
+		p.head.h95.magic   = BE_DRBD_MAGIC_BIG;
+		p.head.h95.command = cpu_to_be16(P_DATA);
+		p.head.h95.length  =
+			cpu_to_be32(sizeof(p) - sizeof(union p_header) + dgs + req->size);
+	}
 
 	p.sector   = cpu_to_be64(req->sector);
 	p.block_id = (unsigned long)req;
@@ -2511,10 +2518,17 @@ int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
 	dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
 		crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
 
-	p.head.magic   = BE_DRBD_MAGIC;
-	p.head.command = cpu_to_be16(cmd);
-	p.head.length  =
-		cpu_to_be16(sizeof(p) - sizeof(struct p_header) + dgs + e->size);
+	if (e->size <= (1 << 15)) {
+		p.head.h80.magic   = BE_DRBD_MAGIC;
+		p.head.h80.command = cpu_to_be16(cmd);
+		p.head.h80.length  =
+			cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
+	} else {
+		p.head.h95.magic   = BE_DRBD_MAGIC_BIG;
+		p.head.h95.command = cpu_to_be16(cmd);
+		p.head.h95.length  =
+			cpu_to_be32(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
+	}
 
 	p.sector   = cpu_to_be64(e->sector);
 	p.block_id = e->block_id;
@@ -2527,8 +2541,7 @@ int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
 	if (!drbd_get_data_sock(mdev))
 		return 0;
 
-	ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p,
-					sizeof(p), dgs ? MSG_MORE : 0);
+	ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0);
 	if (ok && dgs) {
 		dgb = mdev->int_dig_out;
 		drbd_csum_ee(mdev, mdev->integrity_w_tfm, e, dgb);
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 591a171291d9..9b3321e2c3cd 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -720,14 +720,14 @@ out:
 static int drbd_send_fp(struct drbd_conf *mdev,
 	struct socket *sock, enum drbd_packets cmd)
 {
-	struct p_header *h = (struct p_header *) &mdev->data.sbuf.header;
+	struct p_header80 *h = (struct p_header80 *) &mdev->data.sbuf.header;
 
 	return _drbd_send_cmd(mdev, sock, cmd, h, sizeof(*h), 0);
 }
 
 static enum drbd_packets drbd_recv_fp(struct drbd_conf *mdev, struct socket *sock)
 {
-	struct p_header *h = (struct p_header *) &mdev->data.sbuf.header;
+	struct p_header80 *h = (struct p_header80 *) &mdev->data.sbuf.header;
 	int rr;
 
 	rr = drbd_recv_short(mdev, sock, h, sizeof(*h), 0);
@@ -944,7 +944,7 @@ out_release_sockets:
 	return -1;
 }
 
-static int drbd_recv_header(struct drbd_conf *mdev, struct p_header *h)
+static int drbd_recv_header(struct drbd_conf *mdev, struct p_header80 *h)
 {
 	int r;
 
@@ -1266,7 +1266,7 @@ int w_e_reissue(struct drbd_conf *mdev, struct drbd_work *w, int cancel) __relea
 	return 1;
 }
 
-static int receive_Barrier(struct drbd_conf *mdev, struct p_header *h)
+static int receive_Barrier(struct drbd_conf *mdev, struct p_header80 *h)
 {
 	int rv, issue_flush;
 	struct p_barrier *p = (struct p_barrier *)h;
@@ -1570,7 +1570,7 @@ fail:
 	return FALSE;
 }
 
-static int receive_DataReply(struct drbd_conf *mdev, struct p_header *h)
+static int receive_DataReply(struct drbd_conf *mdev, struct p_header80 *h)
 {
 	struct drbd_request *req;
 	sector_t sector;
@@ -1610,7 +1610,7 @@ static int receive_DataReply(struct drbd_conf *mdev, struct p_header *h)
 	return ok;
 }
 
-static int receive_RSDataReply(struct drbd_conf *mdev, struct p_header *h)
+static int receive_RSDataReply(struct drbd_conf *mdev, struct p_header80 *h)
 {
 	sector_t sector;
 	unsigned int header_size, data_size;
@@ -1767,7 +1767,7 @@ static int drbd_wait_peer_seq(struct drbd_conf *mdev, const u32 packet_seq)
 }
 
 /* mirrored write */
-static int receive_Data(struct drbd_conf *mdev, struct p_header *h)
+static int receive_Data(struct drbd_conf *mdev, struct p_header80 *h)
 {
 	sector_t sector;
 	struct drbd_epoch_entry *e;
@@ -2066,7 +2066,7 @@ int drbd_rs_should_slow_down(struct drbd_conf *mdev)
 }
 
 
-static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h)
+static int receive_DataRequest(struct drbd_conf *mdev, struct p_header80 *h)
 {
 	sector_t sector;
 	const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
@@ -2756,7 +2756,7 @@ static int cmp_after_sb(enum drbd_after_sb_p peer, enum drbd_after_sb_p self)
 	return 1;
 }
 
-static int receive_protocol(struct drbd_conf *mdev, struct p_header *h)
+static int receive_protocol(struct drbd_conf *mdev, struct p_header80 *h)
 {
 	struct p_protocol *p = (struct p_protocol *)h;
 	int header_size, data_size;
@@ -2862,7 +2862,7 @@ struct crypto_hash *drbd_crypto_alloc_digest_safe(const struct drbd_conf *mdev,
 	return tfm;
 }
 
-static int receive_SyncParam(struct drbd_conf *mdev, struct p_header *h)
+static int receive_SyncParam(struct drbd_conf *mdev, struct p_header80 *h)
 {
 	int ok = TRUE;
 	struct p_rs_param_95 *p = (struct p_rs_param_95 *)h;
@@ -3032,7 +3032,7 @@ static void warn_if_differ_considerably(struct drbd_conf *mdev,
 		     (unsigned long long)a, (unsigned long long)b);
 }
 
-static int receive_sizes(struct drbd_conf *mdev, struct p_header *h)
+static int receive_sizes(struct drbd_conf *mdev, struct p_header80 *h)
 {
 	struct p_sizes *p = (struct p_sizes *)h;
 	enum determine_dev_size dd = unchanged;
@@ -3148,7 +3148,7 @@ static int receive_sizes(struct drbd_conf *mdev, struct p_header *h)
 	return TRUE;
 }
 
-static int receive_uuids(struct drbd_conf *mdev, struct p_header *h)
+static int receive_uuids(struct drbd_conf *mdev, struct p_header80 *h)
 {
 	struct p_uuids *p = (struct p_uuids *)h;
 	u64 *p_uuid;
@@ -3241,7 +3241,7 @@ static union drbd_state convert_state(union drbd_state ps)
 	return ms;
 }
 
-static int receive_req_state(struct drbd_conf *mdev, struct p_header *h)
+static int receive_req_state(struct drbd_conf *mdev, struct p_header80 *h)
 {
 	struct p_req_state *p = (struct p_req_state *)h;
 	union drbd_state mask, val;
@@ -3271,7 +3271,7 @@ static int receive_req_state(struct drbd_conf *mdev, struct p_header *h)
 	return TRUE;
 }
 
-static int receive_state(struct drbd_conf *mdev, struct p_header *h)
+static int receive_state(struct drbd_conf *mdev, struct p_header80 *h)
 {
 	struct p_state *p = (struct p_state *)h;
 	enum drbd_conns nconn, oconn;
@@ -3395,7 +3395,7 @@ static int receive_state(struct drbd_conf *mdev, struct p_header *h)
 	return TRUE;
 }
 
-static int receive_sync_uuid(struct drbd_conf *mdev, struct p_header *h)
+static int receive_sync_uuid(struct drbd_conf *mdev, struct p_header80 *h)
 {
 	struct p_rs_uuid *p = (struct p_rs_uuid *)h;
 
@@ -3428,7 +3428,7 @@ static int receive_sync_uuid(struct drbd_conf *mdev, struct p_header *h)
 enum receive_bitmap_ret { OK, DONE, FAILED };
 
 static enum receive_bitmap_ret
-receive_bitmap_plain(struct drbd_conf *mdev, struct p_header *h,
+receive_bitmap_plain(struct drbd_conf *mdev, struct p_header80 *h,
 	unsigned long *buffer, struct bm_xfer_ctx *c)
 {
 	unsigned num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
@@ -3533,7 +3533,7 @@ void INFO_bm_xfer_stats(struct drbd_conf *mdev,
 		const char *direction, struct bm_xfer_ctx *c)
 {
 	/* what would it take to transfer it "plaintext" */
-	unsigned plain = sizeof(struct p_header) *
+	unsigned plain = sizeof(struct p_header80) *
 		((c->bm_words+BM_PACKET_WORDS-1)/BM_PACKET_WORDS+1)
 		+ c->bm_words * sizeof(long);
 	unsigned total = c->bytes[0] + c->bytes[1];
@@ -3571,7 +3571,7 @@ void INFO_bm_xfer_stats(struct drbd_conf *mdev,
    in order to be agnostic to the 32 vs 64 bits issue.
 
    returns 0 on failure, 1 if we successfully received it. */
-static int receive_bitmap(struct drbd_conf *mdev, struct p_header *h)
+static int receive_bitmap(struct drbd_conf *mdev, struct p_header80 *h)
 {
 	struct bm_xfer_ctx c;
 	void *buffer;
@@ -3623,7 +3623,7 @@ static int receive_bitmap(struct drbd_conf *mdev, struct p_header *h)
 		}
 
 		c.packets[h->command == P_BITMAP]++;
-		c.bytes[h->command == P_BITMAP] += sizeof(struct p_header) + h->length;
+		c.bytes[h->command == P_BITMAP] += sizeof(struct p_header80) + h->length;
 
 		if (ret != OK)
 			break;
@@ -3659,7 +3659,7 @@ static int receive_bitmap(struct drbd_conf *mdev, struct p_header *h)
 	return ok;
 }
 
-static int receive_skip_(struct drbd_conf *mdev, struct p_header *h, int silent)
+static int receive_skip_(struct drbd_conf *mdev, struct p_header80 *h, int silent)
 {
 	/* TODO zero copy sink :) */
 	static char sink[128];
@@ -3679,17 +3679,17 @@ static int receive_skip_(struct drbd_conf *mdev, struct p_header *h, int silent)
 	return size == 0;
 }
 
-static int receive_skip(struct drbd_conf *mdev, struct p_header *h)
+static int receive_skip(struct drbd_conf *mdev, struct p_header80 *h)
 {
 	return receive_skip_(mdev, h, 0);
 }
 
-static int receive_skip_silent(struct drbd_conf *mdev, struct p_header *h)
+static int receive_skip_silent(struct drbd_conf *mdev, struct p_header80 *h)
 {
 	return receive_skip_(mdev, h, 1);
 }
 
-static int receive_UnplugRemote(struct drbd_conf *mdev, struct p_header *h)
+static int receive_UnplugRemote(struct drbd_conf *mdev, struct p_header80 *h)
 {
 	if (mdev->state.disk >= D_INCONSISTENT)
 		drbd_kick_lo(mdev);
@@ -3701,7 +3701,7 @@ static int receive_UnplugRemote(struct drbd_conf *mdev, struct p_header *h)
 	return TRUE;
 }
 
-typedef int (*drbd_cmd_handler_f)(struct drbd_conf *, struct p_header *);
+typedef int (*drbd_cmd_handler_f)(struct drbd_conf *, struct p_header80 *);
 
 static drbd_cmd_handler_f drbd_default_handler[] = {
 	[P_DATA]	    = receive_Data,
@@ -3736,7 +3736,7 @@ static drbd_cmd_handler_f *drbd_opt_cmd_handler;
 static void drbdd(struct drbd_conf *mdev)
 {
 	drbd_cmd_handler_f handler;
-	struct p_header *header = &mdev->data.rbuf.header;
+	struct p_header80 *header = &mdev->data.rbuf.header;
 
 	while (get_t_state(&mdev->receiver) == Running) {
 		drbd_thread_current_set_cpu(mdev);
@@ -3964,7 +3964,7 @@ static int drbd_send_handshake(struct drbd_conf *mdev)
 	p->protocol_min = cpu_to_be32(PRO_VERSION_MIN);
 	p->protocol_max = cpu_to_be32(PRO_VERSION_MAX);
 	ok = _drbd_send_cmd( mdev, mdev->data.socket, P_HAND_SHAKE,
-			     (struct p_header *)p, sizeof(*p), 0 );
+			     (struct p_header80 *)p, sizeof(*p), 0 );
 	mutex_unlock(&mdev->data.mutex);
 	return ok;
 }
@@ -3981,7 +3981,7 @@ static int drbd_do_handshake(struct drbd_conf *mdev)
 	/* ASSERT current == mdev->receiver ... */
 	struct p_handshake *p = &mdev->data.rbuf.handshake;
 	const int expect = sizeof(struct p_handshake)
-			  -sizeof(struct p_header);
+			  -sizeof(struct p_header80);
 	int rv;
 
 	rv = drbd_send_handshake(mdev);
@@ -4058,7 +4058,7 @@ static int drbd_do_auth(struct drbd_conf *mdev)
 	char *response = NULL;
 	char *right_response = NULL;
 	char *peers_ch = NULL;
-	struct p_header p;
+	struct p_header80 p;
 	unsigned int key_len = strlen(mdev->net_conf->shared_secret);
 	unsigned int resp_size;
 	struct hash_desc desc;
@@ -4231,7 +4231,7 @@ int drbdd_init(struct drbd_thread *thi)
 
 /* ********* acknowledge sender ******** */
 
-static int got_RqSReply(struct drbd_conf *mdev, struct p_header *h)
+static int got_RqSReply(struct drbd_conf *mdev, struct p_header80 *h)
 {
 	struct p_req_state_reply *p = (struct p_req_state_reply *)h;
 
@@ -4249,13 +4249,13 @@ static int got_RqSReply(struct drbd_conf *mdev, struct p_header *h)
 	return TRUE;
 }
 
-static int got_Ping(struct drbd_conf *mdev, struct p_header *h)
+static int got_Ping(struct drbd_conf *mdev, struct p_header80 *h)
 {
 	return drbd_send_ping_ack(mdev);
 
 }
 
-static int got_PingAck(struct drbd_conf *mdev, struct p_header *h)
+static int got_PingAck(struct drbd_conf *mdev, struct p_header80 *h)
 {
 	/* restore idle timeout */
 	mdev->meta.socket->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ;
@@ -4265,7 +4265,7 @@ static int got_PingAck(struct drbd_conf *mdev, struct p_header *h)
 	return TRUE;
 }
 
-static int got_IsInSync(struct drbd_conf *mdev, struct p_header *h)
+static int got_IsInSync(struct drbd_conf *mdev, struct p_header80 *h)
 {
 	struct p_block_ack *p = (struct p_block_ack *)h;
 	sector_t sector = be64_to_cpu(p->sector);
@@ -4336,7 +4336,7 @@ static int validate_req_change_req_state(struct drbd_conf *mdev,
 	return TRUE;
 }
 
-static int got_BlockAck(struct drbd_conf *mdev, struct p_header *h)
+static int got_BlockAck(struct drbd_conf *mdev, struct p_header80 *h)
 {
 	struct p_block_ack *p = (struct p_block_ack *)h;
 	sector_t sector = be64_to_cpu(p->sector);
@@ -4376,7 +4376,7 @@ static int got_BlockAck(struct drbd_conf *mdev, struct p_header *h)
 		_ack_id_to_req, __func__ , what);
 }
 
-static int got_NegAck(struct drbd_conf *mdev, struct p_header *h)
+static int got_NegAck(struct drbd_conf *mdev, struct p_header80 *h)
 {
 	struct p_block_ack *p = (struct p_block_ack *)h;
 	sector_t sector = be64_to_cpu(p->sector);
@@ -4396,7 +4396,7 @@ static int got_NegAck(struct drbd_conf *mdev, struct p_header *h)
 		_ack_id_to_req, __func__ , neg_acked);
 }
 
-static int got_NegDReply(struct drbd_conf *mdev, struct p_header *h)
+static int got_NegDReply(struct drbd_conf *mdev, struct p_header80 *h)
 {
 	struct p_block_ack *p = (struct p_block_ack *)h;
 	sector_t sector = be64_to_cpu(p->sector);
@@ -4409,7 +4409,7 @@ static int got_NegDReply(struct drbd_conf *mdev, struct p_header *h)
 		_ar_id_to_req, __func__ , neg_acked);
 }
 
-static int got_NegRSDReply(struct drbd_conf *mdev, struct p_header *h)
+static int got_NegRSDReply(struct drbd_conf *mdev, struct p_header80 *h)
 {
 	sector_t sector;
 	int size;
@@ -4431,7 +4431,7 @@ static int got_NegRSDReply(struct drbd_conf *mdev, struct p_header *h)
 	return TRUE;
 }
 
-static int got_BarrierAck(struct drbd_conf *mdev, struct p_header *h)
+static int got_BarrierAck(struct drbd_conf *mdev, struct p_header80 *h)
 {
 	struct p_barrier_ack *p = (struct p_barrier_ack *)h;
 
@@ -4440,7 +4440,7 @@ static int got_BarrierAck(struct drbd_conf *mdev, struct p_header *h)
 	return TRUE;
 }
 
-static int got_OVResult(struct drbd_conf *mdev, struct p_header *h)
+static int got_OVResult(struct drbd_conf *mdev, struct p_header80 *h)
 {
 	struct p_block_ack *p = (struct p_block_ack *)h;
 	struct drbd_work *w;
@@ -4474,7 +4474,7 @@ static int got_OVResult(struct drbd_conf *mdev, struct p_header *h)
 	return TRUE;
 }
 
-static int got_something_to_ignore_m(struct drbd_conf *mdev, struct p_header *h)
+static int got_something_to_ignore_m(struct drbd_conf *mdev, struct p_header80 *h)
 {
 	/* IGNORE */
 	return TRUE;
@@ -4482,7 +4482,7 @@ static int got_something_to_ignore_m(struct drbd_conf *mdev, struct p_header *h)
 
 struct asender_cmd {
 	size_t pkt_size;
-	int (*process)(struct drbd_conf *mdev, struct p_header *h);
+	int (*process)(struct drbd_conf *mdev, struct p_header80 *h);
 };
 
 static struct asender_cmd *get_asender_cmd(int cmd)
@@ -4491,8 +4491,8 @@ static struct asender_cmd *get_asender_cmd(int cmd)
 		/* anything missing from this table is in
 		 * the drbd_cmd_handler (drbd_default_handler) table,
 		 * see the beginning of drbdd() */
-	[P_PING]	    = { sizeof(struct p_header), got_Ping },
-	[P_PING_ACK]	    = { sizeof(struct p_header), got_PingAck },
+	[P_PING]	    = { sizeof(struct p_header80), got_Ping },
+	[P_PING_ACK]	    = { sizeof(struct p_header80), got_PingAck },
 	[P_RECV_ACK]	    = { sizeof(struct p_block_ack), got_BlockAck },
 	[P_WRITE_ACK]	    = { sizeof(struct p_block_ack), got_BlockAck },
 	[P_RS_WRITE_ACK]    = { sizeof(struct p_block_ack), got_BlockAck },
@@ -4504,7 +4504,7 @@ static struct asender_cmd *get_asender_cmd(int cmd)
 	[P_BARRIER_ACK]	    = { sizeof(struct p_barrier_ack), got_BarrierAck },
 	[P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply },
 	[P_RS_IS_IN_SYNC]   = { sizeof(struct p_block_ack), got_IsInSync },
-	[P_DELAY_PROBE]     = { sizeof(struct p_delay_probe), got_something_to_ignore_m },
+	[P_DELAY_PROBE]     = { sizeof(struct p_delay_probe93), got_something_to_ignore_m },
 	[P_MAX_CMD]	    = { 0, NULL },
 	};
 	if (cmd > P_MAX_CMD || asender_tbl[cmd].process == NULL)
@@ -4515,13 +4515,13 @@ static struct asender_cmd *get_asender_cmd(int cmd)
 int drbd_asender(struct drbd_thread *thi)
 {
 	struct drbd_conf *mdev = thi->mdev;
-	struct p_header *h = &mdev->meta.rbuf.header;
+	struct p_header80 *h = &mdev->meta.rbuf.header;
 	struct asender_cmd *cmd = NULL;
 
 	int rv, len;
 	void *buf    = h;
 	int received = 0;
-	int expect   = sizeof(struct p_header);
+	int expect   = sizeof(struct p_header80);
 	int empty;
 
 	sprintf(current->comm, "drbd%d_asender", mdev_to_minor(mdev));
@@ -4621,7 +4621,7 @@ int drbd_asender(struct drbd_thread *thi)
 				goto disconnect;
 			}
 			expect = cmd->pkt_size;
-			ERR_IF(len != expect-sizeof(struct p_header))
+			ERR_IF(len != expect-sizeof(struct p_header80))
 				goto reconnect;
 		}
 		if (received == expect) {
@@ -4631,7 +4631,7 @@ int drbd_asender(struct drbd_thread *thi)
 
 			buf	 = h;
 			received = 0;
-			expect	 = sizeof(struct p_header);
+			expect	 = sizeof(struct p_header80);
 			cmd	 = NULL;
 		}
 	}
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 1eeb55423b3e..3d0e14e3ade3 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -1204,7 +1204,7 @@ int w_send_barrier(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
 	 * dec_ap_pending will be done in got_BarrierAck
 	 * or (on connection loss) in w_clear_epoch.  */
 	ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BARRIER,
-				(struct p_header *)p, sizeof(*p), 0);
+				(struct p_header80 *)p, sizeof(*p), 0);
 	drbd_put_data_sock(mdev);
 
 	return ok;
diff --git a/include/linux/drbd.h b/include/linux/drbd.h
index 0b2bfb58d9c5..89718a39791e 100644
--- a/include/linux/drbd.h
+++ b/include/linux/drbd.h
@@ -318,6 +318,8 @@ enum drbd_timeout_flag {
 
 #define DRBD_MAGIC 0x83740267
 #define BE_DRBD_MAGIC __constant_cpu_to_be32(DRBD_MAGIC)
+#define DRBD_MAGIC_BIG 0x835a
+#define BE_DRBD_MAGIC_BIG __constant_cpu_to_be16(DRBD_MAGIC_BIG)
 
 /* these are of type "int" */
 #define DRBD_MD_INDEX_INTERNAL -1
-- 
cgit v1.2.3


From 02918be2273a6b086292e0d85b740336eda46e36 Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Fri, 20 Aug 2010 14:35:10 +0200
Subject: drbd: receiving of big packets, for payloads between 64kByte and
 4GByte

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_int.h      |   4 +-
 drivers/block/drbd/drbd_receiver.c | 389 ++++++++++++++++---------------------
 2 files changed, 171 insertions(+), 222 deletions(-)

diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 3f10efc2ac14..db7e65531afa 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -626,7 +626,7 @@ DCBP_set_pad_bits(struct p_compressed_bm *p, int n)
 #endif
 
 union p_polymorph {
-        struct p_header80        header;
+        union p_header           header;
         struct p_handshake       handshake;
         struct p_data            data;
         struct p_block_ack       block_ack;
@@ -641,6 +641,8 @@ union p_polymorph {
         struct p_req_state       req_state;
         struct p_req_state_reply req_state_reply;
         struct p_block_req       block_req;
+	struct p_delay_probe93   delay_probe93;
+	struct p_rs_uuid         rs_uuid;
 } __packed;
 
 /**********************************************************************/
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 9b3321e2c3cd..fe308644a63c 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -720,14 +720,14 @@ out:
 static int drbd_send_fp(struct drbd_conf *mdev,
 	struct socket *sock, enum drbd_packets cmd)
 {
-	struct p_header80 *h = (struct p_header80 *) &mdev->data.sbuf.header;
+	struct p_header80 *h = &mdev->data.sbuf.header.h80;
 
 	return _drbd_send_cmd(mdev, sock, cmd, h, sizeof(*h), 0);
 }
 
 static enum drbd_packets drbd_recv_fp(struct drbd_conf *mdev, struct socket *sock)
 {
-	struct p_header80 *h = (struct p_header80 *) &mdev->data.sbuf.header;
+	struct p_header80 *h = &mdev->data.rbuf.header.h80;
 	int rr;
 
 	rr = drbd_recv_short(mdev, sock, h, sizeof(*h), 0);
@@ -944,22 +944,27 @@ out_release_sockets:
 	return -1;
 }
 
-static int drbd_recv_header(struct drbd_conf *mdev, struct p_header80 *h)
+static int drbd_recv_header(struct drbd_conf *mdev, enum drbd_packets *cmd, unsigned int *packet_size)
 {
+	union p_header *h = &mdev->data.rbuf.header;
 	int r;
 
 	r = drbd_recv(mdev, h, sizeof(*h));
-
 	if (unlikely(r != sizeof(*h))) {
 		dev_err(DEV, "short read expecting header on sock: r=%d\n", r);
 		return FALSE;
-	};
-	h->command = be16_to_cpu(h->command);
-	h->length  = be16_to_cpu(h->length);
-	if (unlikely(h->magic != BE_DRBD_MAGIC)) {
+	}
+
+	if (likely(h->h80.magic == BE_DRBD_MAGIC)) {
+		*cmd = be16_to_cpu(h->h80.command);
+		*packet_size = be16_to_cpu(h->h80.length);
+	} else if (h->h95.magic == BE_DRBD_MAGIC_BIG) {
+		*cmd = be16_to_cpu(h->h95.command);
+		*packet_size = be32_to_cpu(h->h95.length);
+	} else {
 		dev_err(DEV, "magic?? on data m: 0x%lx c: %d l: %d\n",
-		    (long)be32_to_cpu(h->magic),
-		    h->command, h->length);
+		    (long)be32_to_cpu(h->h80.magic),
+		    h->h80.command, h->h80.length);
 		return FALSE;
 	}
 	mdev->last_received = jiffies;
@@ -1266,17 +1271,12 @@ int w_e_reissue(struct drbd_conf *mdev, struct drbd_work *w, int cancel) __relea
 	return 1;
 }
 
-static int receive_Barrier(struct drbd_conf *mdev, struct p_header80 *h)
+static int receive_Barrier(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
 {
 	int rv, issue_flush;
-	struct p_barrier *p = (struct p_barrier *)h;
+	struct p_barrier *p = &mdev->data.rbuf.barrier;
 	struct drbd_epoch *epoch;
 
-	ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
-
-	rv = drbd_recv(mdev, h->payload, h->length);
-	ERR_IF(rv != h->length) return FALSE;
-
 	inc_unacked(mdev);
 
 	if (mdev->net_conf->wire_protocol != DRBD_PROT_C)
@@ -1570,21 +1570,12 @@ fail:
 	return FALSE;
 }
 
-static int receive_DataReply(struct drbd_conf *mdev, struct p_header80 *h)
+static int receive_DataReply(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
 {
 	struct drbd_request *req;
 	sector_t sector;
-	unsigned int header_size, data_size;
 	int ok;
-	struct p_data *p = (struct p_data *)h;
-
-	header_size = sizeof(*p) - sizeof(*h);
-	data_size   = h->length  - header_size;
-
-	ERR_IF(data_size == 0) return FALSE;
-
-	if (drbd_recv(mdev, h->payload, header_size) != header_size)
-		return FALSE;
+	struct p_data *p = &mdev->data.rbuf.data;
 
 	sector = be64_to_cpu(p->sector);
 
@@ -1610,20 +1601,11 @@ static int receive_DataReply(struct drbd_conf *mdev, struct p_header80 *h)
 	return ok;
 }
 
-static int receive_RSDataReply(struct drbd_conf *mdev, struct p_header80 *h)
+static int receive_RSDataReply(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
 {
 	sector_t sector;
-	unsigned int header_size, data_size;
 	int ok;
-	struct p_data *p = (struct p_data *)h;
-
-	header_size = sizeof(*p) - sizeof(*h);
-	data_size   = h->length  - header_size;
-
-	ERR_IF(data_size == 0) return FALSE;
-
-	if (drbd_recv(mdev, h->payload, header_size) != header_size)
-		return FALSE;
+	struct p_data *p = &mdev->data.rbuf.data;
 
 	sector = be64_to_cpu(p->sector);
 	D_ASSERT(p->block_id == ID_SYNCER);
@@ -1767,23 +1749,14 @@ static int drbd_wait_peer_seq(struct drbd_conf *mdev, const u32 packet_seq)
 }
 
 /* mirrored write */
-static int receive_Data(struct drbd_conf *mdev, struct p_header80 *h)
+static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
 {
 	sector_t sector;
 	struct drbd_epoch_entry *e;
-	struct p_data *p = (struct p_data *)h;
-	int header_size, data_size;
+	struct p_data *p = &mdev->data.rbuf.data;
 	int rw = WRITE;
 	u32 dp_flags;
 
-	header_size = sizeof(*p) - sizeof(*h);
-	data_size   = h->length  - header_size;
-
-	ERR_IF(data_size == 0) return FALSE;
-
-	if (drbd_recv(mdev, h->payload, header_size) != header_size)
-		return FALSE;
-
 	if (!get_ldev(mdev)) {
 		if (__ratelimit(&drbd_ratelimit_state))
 			dev_err(DEV, "Can not write mirrored data block "
@@ -2066,20 +2039,15 @@ int drbd_rs_should_slow_down(struct drbd_conf *mdev)
 }
 
 
-static int receive_DataRequest(struct drbd_conf *mdev, struct p_header80 *h)
+static int receive_DataRequest(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int digest_size)
 {
 	sector_t sector;
 	const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
 	struct drbd_epoch_entry *e;
 	struct digest_info *di = NULL;
-	struct p_block_req *p = (struct p_block_req *)h;
-	const int brps = sizeof(*p)-sizeof(*h);
-	int size, digest_size;
+	int size;
 	unsigned int fault_type;
-
-
-	if (drbd_recv(mdev, h->payload, brps) != brps)
-		return FALSE;
+	struct p_block_req *p =	&mdev->data.rbuf.block_req;
 
 	sector = be64_to_cpu(p->sector);
 	size   = be32_to_cpu(p->blksize);
@@ -2099,9 +2067,9 @@ static int receive_DataRequest(struct drbd_conf *mdev, struct p_header80 *h)
 		if (__ratelimit(&drbd_ratelimit_state))
 			dev_err(DEV, "Can not satisfy peer's read request, "
 			    "no local data.\n");
-		drbd_send_ack_rp(mdev, h->command == P_DATA_REQUEST ? P_NEG_DREPLY :
+		drbd_send_ack_rp(mdev, cmd == P_DATA_REQUEST ? P_NEG_DREPLY :
 				 P_NEG_RS_DREPLY , p);
-		return drbd_drain_block(mdev, h->length - brps);
+		return TRUE;
 	}
 
 	/* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
@@ -2113,7 +2081,7 @@ static int receive_DataRequest(struct drbd_conf *mdev, struct p_header80 *h)
 		return FALSE;
 	}
 
-	switch (h->command) {
+	switch (cmd) {
 	case P_DATA_REQUEST:
 		e->w.cb = w_e_end_data_req;
 		fault_type = DRBD_FAULT_DT_RD;
@@ -2128,7 +2096,6 @@ static int receive_DataRequest(struct drbd_conf *mdev, struct p_header80 *h)
 	case P_OV_REPLY:
 	case P_CSUM_RS_REQUEST:
 		fault_type = DRBD_FAULT_RS_RD;
-		digest_size = h->length - brps ;
 		di = kmalloc(sizeof(*di) + digest_size, GFP_NOIO);
 		if (!di)
 			goto out_free_e;
@@ -2142,10 +2109,10 @@ static int receive_DataRequest(struct drbd_conf *mdev, struct p_header80 *h)
 		if (drbd_recv(mdev, di->digest, digest_size) != digest_size)
 			goto out_free_e;
 
-		if (h->command == P_CSUM_RS_REQUEST) {
+		if (cmd == P_CSUM_RS_REQUEST) {
 			D_ASSERT(mdev->agreed_pro_version >= 89);
 			e->w.cb = w_e_end_csum_rs_req;
-		} else if (h->command == P_OV_REPLY) {
+		} else if (cmd == P_OV_REPLY) {
 			e->w.cb = w_e_end_ov_reply;
 			dec_rs_pending(mdev);
 			/* drbd_rs_begin_io done when we sent this request,
@@ -2173,7 +2140,7 @@ static int receive_DataRequest(struct drbd_conf *mdev, struct p_header80 *h)
 
 	default:
 		dev_err(DEV, "unexpected command (%s) in receive_DataRequest\n",
-		    cmdname(h->command));
+		    cmdname(cmd));
 		fault_type = DRBD_FAULT_MAX;
 		goto out_free_e;
 	}
@@ -2756,20 +2723,13 @@ static int cmp_after_sb(enum drbd_after_sb_p peer, enum drbd_after_sb_p self)
 	return 1;
 }
 
-static int receive_protocol(struct drbd_conf *mdev, struct p_header80 *h)
+static int receive_protocol(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
 {
-	struct p_protocol *p = (struct p_protocol *)h;
-	int header_size, data_size;
+	struct p_protocol *p = &mdev->data.rbuf.protocol;
 	int p_proto, p_after_sb_0p, p_after_sb_1p, p_after_sb_2p;
 	int p_want_lose, p_two_primaries, cf;
 	char p_integrity_alg[SHARED_SECRET_MAX] = "";
 
-	header_size = sizeof(*p) - sizeof(*h);
-	data_size   = h->length  - header_size;
-
-	if (drbd_recv(mdev, h->payload, header_size) != header_size)
-		return FALSE;
-
 	p_proto		= be32_to_cpu(p->protocol);
 	p_after_sb_0p	= be32_to_cpu(p->after_sb_0p);
 	p_after_sb_1p	= be32_to_cpu(p->after_sb_1p);
@@ -2862,10 +2822,10 @@ struct crypto_hash *drbd_crypto_alloc_digest_safe(const struct drbd_conf *mdev,
 	return tfm;
 }
 
-static int receive_SyncParam(struct drbd_conf *mdev, struct p_header80 *h)
+static int receive_SyncParam(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int packet_size)
 {
 	int ok = TRUE;
-	struct p_rs_param_95 *p = (struct p_rs_param_95 *)h;
+	struct p_rs_param_95 *p = &mdev->data.rbuf.rs_param_95;
 	unsigned int header_size, data_size, exp_max_sz;
 	struct crypto_hash *verify_tfm = NULL;
 	struct crypto_hash *csums_tfm = NULL;
@@ -2879,29 +2839,29 @@ static int receive_SyncParam(struct drbd_conf *mdev, struct p_header80 *h)
 		    : apv <= 94 ? sizeof(struct p_rs_param_89)
 		    : /* apv >= 95 */ sizeof(struct p_rs_param_95);
 
-	if (h->length > exp_max_sz) {
+	if (packet_size > exp_max_sz) {
 		dev_err(DEV, "SyncParam packet too long: received %u, expected <= %u bytes\n",
-		    h->length, exp_max_sz);
+		    packet_size, exp_max_sz);
 		return FALSE;
 	}
 
 	if (apv <= 88) {
-		header_size = sizeof(struct p_rs_param) - sizeof(*h);
-		data_size   = h->length  - header_size;
+		header_size = sizeof(struct p_rs_param) - sizeof(struct p_header80);
+		data_size   = packet_size  - header_size;
 	} else if (apv <= 94) {
-		header_size = sizeof(struct p_rs_param_89) - sizeof(*h);
-		data_size   = h->length  - header_size;
+		header_size = sizeof(struct p_rs_param_89) - sizeof(struct p_header80);
+		data_size   = packet_size  - header_size;
 		D_ASSERT(data_size == 0);
 	} else {
-		header_size = sizeof(struct p_rs_param_95) - sizeof(*h);
-		data_size   = h->length  - header_size;
+		header_size = sizeof(struct p_rs_param_95) - sizeof(struct p_header80);
+		data_size   = packet_size  - header_size;
 		D_ASSERT(data_size == 0);
 	}
 
 	/* initialize verify_alg and csums_alg */
 	memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
 
-	if (drbd_recv(mdev, h->payload, header_size) != header_size)
+	if (drbd_recv(mdev, &p->head.payload, header_size) != header_size)
 		return FALSE;
 
 	mdev->sync_conf.rate	  = be32_to_cpu(p->rate);
@@ -3032,19 +2992,15 @@ static void warn_if_differ_considerably(struct drbd_conf *mdev,
 		     (unsigned long long)a, (unsigned long long)b);
 }
 
-static int receive_sizes(struct drbd_conf *mdev, struct p_header80 *h)
+static int receive_sizes(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
 {
-	struct p_sizes *p = (struct p_sizes *)h;
+	struct p_sizes *p = &mdev->data.rbuf.sizes;
 	enum determine_dev_size dd = unchanged;
 	unsigned int max_seg_s;
 	sector_t p_size, p_usize, my_usize;
 	int ldsc = 0; /* local disk size changed */
 	enum dds_flags ddsf;
 
-	ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
-	if (drbd_recv(mdev, h->payload, h->length) != h->length)
-		return FALSE;
-
 	p_size = be64_to_cpu(p->d_size);
 	p_usize = be64_to_cpu(p->u_size);
 
@@ -3148,16 +3104,12 @@ static int receive_sizes(struct drbd_conf *mdev, struct p_header80 *h)
 	return TRUE;
 }
 
-static int receive_uuids(struct drbd_conf *mdev, struct p_header80 *h)
+static int receive_uuids(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
 {
-	struct p_uuids *p = (struct p_uuids *)h;
+	struct p_uuids *p = &mdev->data.rbuf.uuids;
 	u64 *p_uuid;
 	int i;
 
-	ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
-	if (drbd_recv(mdev, h->payload, h->length) != h->length)
-		return FALSE;
-
 	p_uuid = kmalloc(sizeof(u64)*UI_EXTENDED_SIZE, GFP_NOIO);
 
 	for (i = UI_CURRENT; i < UI_EXTENDED_SIZE; i++)
@@ -3241,16 +3193,12 @@ static union drbd_state convert_state(union drbd_state ps)
 	return ms;
 }
 
-static int receive_req_state(struct drbd_conf *mdev, struct p_header80 *h)
+static int receive_req_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
 {
-	struct p_req_state *p = (struct p_req_state *)h;
+	struct p_req_state *p = &mdev->data.rbuf.req_state;
 	union drbd_state mask, val;
 	int rv;
 
-	ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
-	if (drbd_recv(mdev, h->payload, h->length) != h->length)
-		return FALSE;
-
 	mask.i = be32_to_cpu(p->mask);
 	val.i = be32_to_cpu(p->val);
 
@@ -3271,21 +3219,15 @@ static int receive_req_state(struct drbd_conf *mdev, struct p_header80 *h)
 	return TRUE;
 }
 
-static int receive_state(struct drbd_conf *mdev, struct p_header80 *h)
+static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
 {
-	struct p_state *p = (struct p_state *)h;
+	struct p_state *p = &mdev->data.rbuf.state;
 	enum drbd_conns nconn, oconn;
 	union drbd_state ns, peer_state;
 	enum drbd_disk_state real_peer_disk;
 	enum chg_state_flags cs_flags;
 	int rv;
 
-	ERR_IF(h->length != (sizeof(*p)-sizeof(*h)))
-		return FALSE;
-
-	if (drbd_recv(mdev, h->payload, h->length) != h->length)
-		return FALSE;
-
 	peer_state.i = be32_to_cpu(p->state);
 
 	real_peer_disk = peer_state.disk;
@@ -3395,9 +3337,9 @@ static int receive_state(struct drbd_conf *mdev, struct p_header80 *h)
 	return TRUE;
 }
 
-static int receive_sync_uuid(struct drbd_conf *mdev, struct p_header80 *h)
+static int receive_sync_uuid(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
 {
-	struct p_rs_uuid *p = (struct p_rs_uuid *)h;
+	struct p_rs_uuid *p = &mdev->data.rbuf.rs_uuid;
 
 	wait_event(mdev->misc_wait,
 		   mdev->state.conn == C_WF_SYNC_UUID ||
@@ -3406,10 +3348,6 @@ static int receive_sync_uuid(struct drbd_conf *mdev, struct p_header80 *h)
 
 	/* D_ASSERT( mdev->state.conn == C_WF_SYNC_UUID ); */
 
-	ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
-	if (drbd_recv(mdev, h->payload, h->length) != h->length)
-		return FALSE;
-
 	/* Here the _drbd_uuid_ functions are right, current should
 	   _not_ be rotated into the history */
 	if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
@@ -3428,14 +3366,14 @@ static int receive_sync_uuid(struct drbd_conf *mdev, struct p_header80 *h)
 enum receive_bitmap_ret { OK, DONE, FAILED };
 
 static enum receive_bitmap_ret
-receive_bitmap_plain(struct drbd_conf *mdev, struct p_header80 *h,
-	unsigned long *buffer, struct bm_xfer_ctx *c)
+receive_bitmap_plain(struct drbd_conf *mdev, unsigned int data_size,
+		     unsigned long *buffer, struct bm_xfer_ctx *c)
 {
 	unsigned num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
 	unsigned want = num_words * sizeof(long);
 
-	if (want != h->length) {
-		dev_err(DEV, "%s:want (%u) != h->length (%u)\n", __func__, want, h->length);
+	if (want != data_size) {
+		dev_err(DEV, "%s:want (%u) != data_size (%u)\n", __func__, want, data_size);
 		return FAILED;
 	}
 	if (want == 0)
@@ -3571,12 +3509,13 @@ void INFO_bm_xfer_stats(struct drbd_conf *mdev,
    in order to be agnostic to the 32 vs 64 bits issue.
 
    returns 0 on failure, 1 if we successfully received it. */
-static int receive_bitmap(struct drbd_conf *mdev, struct p_header80 *h)
+static int receive_bitmap(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
 {
 	struct bm_xfer_ctx c;
 	void *buffer;
 	enum receive_bitmap_ret ret;
 	int ok = FALSE;
+	struct p_header80 *h = &mdev->data.rbuf.header.h80;
 
 	wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt));
 
@@ -3596,21 +3535,21 @@ static int receive_bitmap(struct drbd_conf *mdev, struct p_header80 *h)
 	};
 
 	do {
-		if (h->command == P_BITMAP) {
-			ret = receive_bitmap_plain(mdev, h, buffer, &c);
-		} else if (h->command == P_COMPRESSED_BITMAP) {
+		if (cmd == P_BITMAP) {
+			ret = receive_bitmap_plain(mdev, data_size, buffer, &c);
+		} else if (cmd == P_COMPRESSED_BITMAP) {
 			/* MAYBE: sanity check that we speak proto >= 90,
 			 * and the feature is enabled! */
 			struct p_compressed_bm *p;
 
-			if (h->length > BM_PACKET_PAYLOAD_BYTES) {
+			if (data_size > BM_PACKET_PAYLOAD_BYTES) {
 				dev_err(DEV, "ReportCBitmap packet too large\n");
 				goto out;
 			}
 			/* use the page buff */
 			p = buffer;
 			memcpy(p, h, sizeof(*h));
-			if (drbd_recv(mdev, p->head.payload, h->length) != h->length)
+			if (drbd_recv(mdev, p->head.payload, data_size) != data_size)
 				goto out;
 			if (p->head.length <= (sizeof(*p) - sizeof(p->head))) {
 				dev_err(DEV, "ReportCBitmap packet too small (l:%u)\n", p->head.length);
@@ -3618,17 +3557,17 @@ static int receive_bitmap(struct drbd_conf *mdev, struct p_header80 *h)
 			}
 			ret = decode_bitmap_c(mdev, p, &c);
 		} else {
-			dev_warn(DEV, "receive_bitmap: h->command neither ReportBitMap nor ReportCBitMap (is 0x%x)", h->command);
+			dev_warn(DEV, "receive_bitmap: cmd neither ReportBitMap nor ReportCBitMap (is 0x%x)", cmd);
 			goto out;
 		}
 
-		c.packets[h->command == P_BITMAP]++;
-		c.bytes[h->command == P_BITMAP] += sizeof(struct p_header80) + h->length;
+		c.packets[cmd == P_BITMAP]++;
+		c.bytes[cmd == P_BITMAP] += sizeof(struct p_header80) + data_size;
 
 		if (ret != OK)
 			break;
 
-		if (!drbd_recv_header(mdev, h))
+		if (!drbd_recv_header(mdev, &cmd, &data_size))
 			goto out;
 	} while (ret == OK);
 	if (ret == FAILED)
@@ -3659,17 +3598,16 @@ static int receive_bitmap(struct drbd_conf *mdev, struct p_header80 *h)
 	return ok;
 }
 
-static int receive_skip_(struct drbd_conf *mdev, struct p_header80 *h, int silent)
+static int receive_skip(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
 {
 	/* TODO zero copy sink :) */
 	static char sink[128];
 	int size, want, r;
 
-	if (!silent)
-		dev_warn(DEV, "skipping unknown optional packet type %d, l: %d!\n",
-		     h->command, h->length);
+	dev_warn(DEV, "skipping unknown optional packet type %d, l: %d!\n",
+		 cmd, data_size);
 
-	size = h->length;
+	size = data_size;
 	while (size > 0) {
 		want = min_t(int, size, sizeof(sink));
 		r = drbd_recv(mdev, sink, want);
@@ -3679,17 +3617,7 @@ static int receive_skip_(struct drbd_conf *mdev, struct p_header80 *h, int silen
 	return size == 0;
 }
 
-static int receive_skip(struct drbd_conf *mdev, struct p_header80 *h)
-{
-	return receive_skip_(mdev, h, 0);
-}
-
-static int receive_skip_silent(struct drbd_conf *mdev, struct p_header80 *h)
-{
-	return receive_skip_(mdev, h, 1);
-}
-
-static int receive_UnplugRemote(struct drbd_conf *mdev, struct p_header80 *h)
+static int receive_UnplugRemote(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
 {
 	if (mdev->state.disk >= D_INCONSISTENT)
 		drbd_kick_lo(mdev);
@@ -3701,73 +3629,91 @@ static int receive_UnplugRemote(struct drbd_conf *mdev, struct p_header80 *h)
 	return TRUE;
 }
 
-typedef int (*drbd_cmd_handler_f)(struct drbd_conf *, struct p_header80 *);
-
-static drbd_cmd_handler_f drbd_default_handler[] = {
-	[P_DATA]	    = receive_Data,
-	[P_DATA_REPLY]	    = receive_DataReply,
-	[P_RS_DATA_REPLY]   = receive_RSDataReply,
-	[P_BARRIER]	    = receive_Barrier,
-	[P_BITMAP]	    = receive_bitmap,
-	[P_COMPRESSED_BITMAP]    = receive_bitmap,
-	[P_UNPLUG_REMOTE]   = receive_UnplugRemote,
-	[P_DATA_REQUEST]    = receive_DataRequest,
-	[P_RS_DATA_REQUEST] = receive_DataRequest,
-	[P_SYNC_PARAM]	    = receive_SyncParam,
-	[P_SYNC_PARAM89]	   = receive_SyncParam,
-	[P_PROTOCOL]        = receive_protocol,
-	[P_UUIDS]	    = receive_uuids,
-	[P_SIZES]	    = receive_sizes,
-	[P_STATE]	    = receive_state,
-	[P_STATE_CHG_REQ]   = receive_req_state,
-	[P_SYNC_UUID]       = receive_sync_uuid,
-	[P_OV_REQUEST]      = receive_DataRequest,
-	[P_OV_REPLY]        = receive_DataRequest,
-	[P_CSUM_RS_REQUEST]    = receive_DataRequest,
-	[P_DELAY_PROBE]     = receive_skip_silent,
+typedef int (*drbd_cmd_handler_f)(struct drbd_conf *, enum drbd_packets cmd, unsigned int to_receive);
+
+struct data_cmd {
+	int expect_payload;
+	size_t pkt_size;
+	drbd_cmd_handler_f function;
+};
+
+static struct data_cmd drbd_cmd_handler[] = {
+	[P_DATA]	    = { 1, sizeof(struct p_data), receive_Data },
+	[P_DATA_REPLY]	    = { 1, sizeof(struct p_data), receive_DataReply },
+	[P_RS_DATA_REPLY]   = { 1, sizeof(struct p_data), receive_RSDataReply } ,
+	[P_BARRIER]	    = { 0, sizeof(struct p_barrier), receive_Barrier } ,
+	[P_BITMAP]	    = { 1, sizeof(struct p_header80), receive_bitmap } ,
+	[P_COMPRESSED_BITMAP] = { 1, sizeof(struct p_header80), receive_bitmap } ,
+	[P_UNPLUG_REMOTE]   = { 0, sizeof(struct p_header80), receive_UnplugRemote },
+	[P_DATA_REQUEST]    = { 0, sizeof(struct p_block_req), receive_DataRequest },
+	[P_RS_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
+	[P_SYNC_PARAM]	    = { 1, sizeof(struct p_header80), receive_SyncParam },
+	[P_SYNC_PARAM89]    = { 1, sizeof(struct p_header80), receive_SyncParam },
+	[P_PROTOCOL]        = { 1, sizeof(struct p_protocol), receive_protocol },
+	[P_UUIDS]	    = { 0, sizeof(struct p_uuids), receive_uuids },
+	[P_SIZES]	    = { 0, sizeof(struct p_sizes), receive_sizes },
+	[P_STATE]	    = { 0, sizeof(struct p_state), receive_state },
+	[P_STATE_CHG_REQ]   = { 0, sizeof(struct p_req_state), receive_req_state },
+	[P_SYNC_UUID]       = { 0, sizeof(struct p_rs_uuid), receive_sync_uuid },
+	[P_OV_REQUEST]      = { 0, sizeof(struct p_block_req), receive_DataRequest },
+	[P_OV_REPLY]        = { 1, sizeof(struct p_block_req), receive_DataRequest },
+	[P_CSUM_RS_REQUEST] = { 1, sizeof(struct p_block_req), receive_DataRequest },
+	[P_DELAY_PROBE]     = { 0, sizeof(struct p_delay_probe93), receive_skip },
 	/* anything missing from this table is in
 	 * the asender_tbl, see get_asender_cmd */
-	[P_MAX_CMD]	    = NULL,
+	[P_MAX_CMD]	    = { 0, 0, NULL },
 };
 
-static drbd_cmd_handler_f *drbd_cmd_handler = drbd_default_handler;
-static drbd_cmd_handler_f *drbd_opt_cmd_handler;
+/* All handler functions that expect a sub-header get that sub-heder in
+   mdev->data.rbuf.header.head.payload.
+
+   Usually in mdev->data.rbuf.header.head the callback can find the usual
+   p_header, but they may not rely on that. Since there is also p_header95 !
+ */
 
 static void drbdd(struct drbd_conf *mdev)
 {
-	drbd_cmd_handler_f handler;
-	struct p_header80 *header = &mdev->data.rbuf.header;
+	union p_header *header = &mdev->data.rbuf.header;
+	unsigned int packet_size;
+	enum drbd_packets cmd;
+	size_t shs; /* sub header size */
+	int rv;
 
 	while (get_t_state(&mdev->receiver) == Running) {
 		drbd_thread_current_set_cpu(mdev);
-		if (!drbd_recv_header(mdev, header)) {
-			drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
-			break;
+		if (!drbd_recv_header(mdev, &cmd, &packet_size))
+			goto err_out;
+
+		if (unlikely(cmd >= P_MAX_CMD || !drbd_cmd_handler[cmd].function)) {
+			dev_err(DEV, "unknown packet type %d, l: %d!\n", cmd, packet_size);
+			goto err_out;
 		}
 
-		if (header->command < P_MAX_CMD)
-			handler = drbd_cmd_handler[header->command];
-		else if (P_MAY_IGNORE < header->command
-		     && header->command < P_MAX_OPT_CMD)
-			handler = drbd_opt_cmd_handler[header->command-P_MAY_IGNORE];
-		else if (header->command > P_MAX_OPT_CMD)
-			handler = receive_skip;
-		else
-			handler = NULL;
+		shs = drbd_cmd_handler[cmd].pkt_size - sizeof(union p_header);
+		rv = drbd_recv(mdev, &header->h80.payload, shs);
+		if (unlikely(rv != shs)) {
+			dev_err(DEV, "short read while reading sub header: rv=%d\n", rv);
+			goto err_out;
+		}
 
-		if (unlikely(!handler)) {
-			dev_err(DEV, "unknown packet type %d, l: %d!\n",
-			    header->command, header->length);
-			drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
-			break;
+		if (packet_size - shs > 0 && !drbd_cmd_handler[cmd].expect_payload) {
+			dev_err(DEV, "No payload expected %s l:%d\n", cmdname(cmd), packet_size);
+			goto err_out;
 		}
-		if (unlikely(!handler(mdev, header))) {
+
+		rv = drbd_cmd_handler[cmd].function(mdev, cmd, packet_size - shs);
+
+		if (unlikely(!rv)) {
 			dev_err(DEV, "error receiving %s, l: %d!\n",
-			    cmdname(header->command), header->length);
-			drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
-			break;
+			    cmdname(cmd), packet_size);
+			goto err_out;
 		}
 	}
+
+	if (0) {
+	err_out:
+		drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
+	}
 }
 
 void drbd_flush_workqueue(struct drbd_conf *mdev)
@@ -3980,27 +3926,28 @@ static int drbd_do_handshake(struct drbd_conf *mdev)
 {
 	/* ASSERT current == mdev->receiver ... */
 	struct p_handshake *p = &mdev->data.rbuf.handshake;
-	const int expect = sizeof(struct p_handshake)
-			  -sizeof(struct p_header80);
+	const int expect = sizeof(struct p_handshake) - sizeof(struct p_header80);
+	unsigned int length;
+	enum drbd_packets cmd;
 	int rv;
 
 	rv = drbd_send_handshake(mdev);
 	if (!rv)
 		return 0;
 
-	rv = drbd_recv_header(mdev, &p->head);
+	rv = drbd_recv_header(mdev, &cmd, &length);
 	if (!rv)
 		return 0;
 
-	if (p->head.command != P_HAND_SHAKE) {
+	if (cmd != P_HAND_SHAKE) {
 		dev_err(DEV, "expected HandShake packet, received: %s (0x%04x)\n",
-		     cmdname(p->head.command), p->head.command);
+		     cmdname(cmd), cmd);
 		return -1;
 	}
 
-	if (p->head.length != expect) {
+	if (length != expect) {
 		dev_err(DEV, "expected HandShake length: %u, received: %u\n",
-		     expect, p->head.length);
+		     expect, length);
 		return -1;
 	}
 
@@ -4058,10 +4005,11 @@ static int drbd_do_auth(struct drbd_conf *mdev)
 	char *response = NULL;
 	char *right_response = NULL;
 	char *peers_ch = NULL;
-	struct p_header80 p;
 	unsigned int key_len = strlen(mdev->net_conf->shared_secret);
 	unsigned int resp_size;
 	struct hash_desc desc;
+	enum drbd_packets cmd;
+	unsigned int length;
 	int rv;
 
 	desc.tfm = mdev->cram_hmac_tfm;
@@ -4081,33 +4029,33 @@ static int drbd_do_auth(struct drbd_conf *mdev)
 	if (!rv)
 		goto fail;
 
-	rv = drbd_recv_header(mdev, &p);
+	rv = drbd_recv_header(mdev, &cmd, &length);
 	if (!rv)
 		goto fail;
 
-	if (p.command != P_AUTH_CHALLENGE) {
+	if (cmd != P_AUTH_CHALLENGE) {
 		dev_err(DEV, "expected AuthChallenge packet, received: %s (0x%04x)\n",
-		    cmdname(p.command), p.command);
+		    cmdname(cmd), cmd);
 		rv = 0;
 		goto fail;
 	}
 
-	if (p.length > CHALLENGE_LEN*2) {
+	if (length > CHALLENGE_LEN * 2) {
 		dev_err(DEV, "expected AuthChallenge payload too big.\n");
 		rv = -1;
 		goto fail;
 	}
 
-	peers_ch = kmalloc(p.length, GFP_NOIO);
+	peers_ch = kmalloc(length, GFP_NOIO);
 	if (peers_ch == NULL) {
 		dev_err(DEV, "kmalloc of peers_ch failed\n");
 		rv = -1;
 		goto fail;
 	}
 
-	rv = drbd_recv(mdev, peers_ch, p.length);
+	rv = drbd_recv(mdev, peers_ch, length);
 
-	if (rv != p.length) {
+	if (rv != length) {
 		dev_err(DEV, "short read AuthChallenge: l=%u\n", rv);
 		rv = 0;
 		goto fail;
@@ -4122,7 +4070,7 @@ static int drbd_do_auth(struct drbd_conf *mdev)
 	}
 
 	sg_init_table(&sg, 1);
-	sg_set_buf(&sg, peers_ch, p.length);
+	sg_set_buf(&sg, peers_ch, length);
 
 	rv = crypto_hash_digest(&desc, &sg, sg.length, response);
 	if (rv) {
@@ -4135,18 +4083,18 @@ static int drbd_do_auth(struct drbd_conf *mdev)
 	if (!rv)
 		goto fail;
 
-	rv = drbd_recv_header(mdev, &p);
+	rv = drbd_recv_header(mdev, &cmd, &length);
 	if (!rv)
 		goto fail;
 
-	if (p.command != P_AUTH_RESPONSE) {
+	if (cmd != P_AUTH_RESPONSE) {
 		dev_err(DEV, "expected AuthResponse packet, received: %s (0x%04x)\n",
-		    cmdname(p.command), p.command);
+			cmdname(cmd), cmd);
 		rv = 0;
 		goto fail;
 	}
 
-	if (p.length != resp_size) {
+	if (length != resp_size) {
 		dev_err(DEV, "expected AuthResponse payload of wrong size\n");
 		rv = 0;
 		goto fail;
@@ -4474,9 +4422,8 @@ static int got_OVResult(struct drbd_conf *mdev, struct p_header80 *h)
 	return TRUE;
 }
 
-static int got_something_to_ignore_m(struct drbd_conf *mdev, struct p_header80 *h)
+static int got_skip(struct drbd_conf *mdev, struct p_header80 *h)
 {
-	/* IGNORE */
 	return TRUE;
 }
 
@@ -4504,7 +4451,7 @@ static struct asender_cmd *get_asender_cmd(int cmd)
 	[P_BARRIER_ACK]	    = { sizeof(struct p_barrier_ack), got_BarrierAck },
 	[P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply },
 	[P_RS_IS_IN_SYNC]   = { sizeof(struct p_block_ack), got_IsInSync },
-	[P_DELAY_PROBE]     = { sizeof(struct p_delay_probe93), got_something_to_ignore_m },
+	[P_DELAY_PROBE]     = { sizeof(struct p_delay_probe93), got_skip },
 	[P_MAX_CMD]	    = { 0, NULL },
 	};
 	if (cmd > P_MAX_CMD || asender_tbl[cmd].process == NULL)
@@ -4515,7 +4462,7 @@ static struct asender_cmd *get_asender_cmd(int cmd)
 int drbd_asender(struct drbd_thread *thi)
 {
 	struct drbd_conf *mdev = thi->mdev;
-	struct p_header80 *h = &mdev->meta.rbuf.header;
+	struct p_header80 *h = &mdev->meta.rbuf.header.h80;
 	struct asender_cmd *cmd = NULL;
 
 	int rv, len;
-- 
cgit v1.2.3


From d53733893dc43f4ebb5be510863c5debf0f8990b Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Mon, 23 Aug 2010 15:18:33 +0200
Subject: drbd: Actually allow BIOs up to 128k (was 32k).

Now we have multiple BIOs per ee, packets with a 32 bit length field,
it gets time to use these goodies.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_int.h      | 6 ++++--
 drivers/block/drbd/drbd_main.c     | 4 ++--
 drivers/block/drbd/drbd_nl.c       | 4 +++-
 drivers/block/drbd/drbd_receiver.c | 5 +++++
 4 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index db7e65531afa..58dc02bd16c2 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -1389,11 +1389,13 @@ struct bm_extent {
 #endif
 
 /* Sector shift value for the "hash" functions of tl_hash and ee_hash tables.
- * With a value of 6 all IO in one 32K block make it to the same slot of the
+ * With a value of 8 all IO in one 128K block make it to the same slot of the
  * hash table. */
-#define HT_SHIFT 6
+#define HT_SHIFT 8
 #define DRBD_MAX_SEGMENT_SIZE (1U<<(9+HT_SHIFT))
 
+#define DRBD_MAX_SIZE_H80_PACKET (1 << 15) /* The old header only allows packets up to 32Kib data */
+
 /* Number of elements in the app_reads_hash */
 #define APP_R_HSIZE 15
 
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index f3f4ea9c5eb9..71c4c261573e 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -2447,7 +2447,7 @@ int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
 	dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
 		crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
 
-	if (req->size <= (1 << 15)) {
+	if (req->size <= DRBD_MAX_SIZE_H80_PACKET) {
 		p.head.h80.magic   = BE_DRBD_MAGIC;
 		p.head.h80.command = cpu_to_be16(P_DATA);
 		p.head.h80.length  =
@@ -2518,7 +2518,7 @@ int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
 	dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
 		crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
 
-	if (e->size <= (1 << 15)) {
+	if (e->size <= DRBD_MAX_SIZE_H80_PACKET) {
 		p.head.h80.magic   = BE_DRBD_MAGIC;
 		p.head.h80.command = cpu_to_be16(cmd);
 		p.head.h80.length  =
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 6b35d41706e4..97fb2c2a7a57 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -1063,7 +1063,9 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
 	mdev->read_cnt = 0;
 	mdev->writ_cnt = 0;
 
-	drbd_setup_queue_param(mdev, DRBD_MAX_SEGMENT_SIZE);
+	drbd_setup_queue_param(mdev, mdev->state.conn == C_CONNECTED &&
+			       mdev->agreed_pro_version < 95 ?
+			       DRBD_MAX_SIZE_H80_PACKET : DRBD_MAX_SEGMENT_SIZE);
 
 	/* If I am currently not R_PRIMARY,
 	 * but meta data primary indicator is set,
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index fe308644a63c..4249117f1f67 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -925,6 +925,11 @@ retry:
 
 	drbd_thread_start(&mdev->asender);
 
+	if (mdev->agreed_pro_version < 95 && get_ldev(mdev)) {
+		drbd_setup_queue_param(mdev, DRBD_MAX_SIZE_H80_PACKET);
+		put_ldev(mdev);
+	}
+
 	if (!drbd_send_protocol(mdev))
 		return -1;
 	drbd_send_sync_param(mdev, &mdev->sync_conf);
-- 
cgit v1.2.3


From 0778286a133d2d3f81861a4e5db308e359583006 Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Tue, 31 Aug 2010 12:00:50 +0200
Subject: drbd: Disable activity log updates when the whole device is out of
 sync

When the complete device is marked as out of sync, we can disable
updates of the on disk AL. Currently AL updates are only disabled
if one uses the "invalidate-remote" command on an unconnected,
primary device, or when at attach time all bits in the bitmap are
set.

As of now, AL updated do not get disabled when a all bits becomes
set due to application writes to an unconnected DRBD device.
While this is a missing feature, it is not considered important,
and might get added later.

BTW, after initializing a "one legged" DRBD device
drbdadm create-md resX
drbdadm -- --force primary resX
AL updates also get disabled, until the first connect.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_bitmap.c |  2 +-
 drivers/block/drbd/drbd_int.h    |  2 ++
 drivers/block/drbd/drbd_main.c   | 12 +++++++++
 drivers/block/drbd/drbd_nl.c     | 54 +++++++++++++++++++++++++++++++++++++++-
 drivers/block/drbd/drbd_proc.c   |  3 ++-
 drivers/block/drbd/drbd_req.c    |  7 ++++--
 drivers/block/drbd/drbd_req.h    |  4 +++
 drivers/block/drbd/drbd_worker.c |  2 +-
 8 files changed, 80 insertions(+), 6 deletions(-)

diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
index e3f88d6e1412..fd42832f785b 100644
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -569,7 +569,7 @@ int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity, int set_new_bits)
  *
  * maybe bm_set should be atomic_t ?
  */
-static unsigned long _drbd_bm_total_weight(struct drbd_conf *mdev)
+unsigned long _drbd_bm_total_weight(struct drbd_conf *mdev)
 {
 	struct drbd_bitmap *b = mdev->bitmap;
 	unsigned long s;
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 58dc02bd16c2..bb3a488b6fd6 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -863,6 +863,7 @@ enum {
 	CONN_DRY_RUN,		/* Expect disconnect after resync handshake. */
 	GOT_PING_ACK,		/* set when we receive a ping_ack packet, misc wait gets woken */
 	NEW_CUR_UUID,		/* Create new current UUID when thawing IO */
+	AL_SUSPENDED,		/* Activity logging is currently suspended. */
 };
 
 struct drbd_bitmap; /* opaque for drbd_conf */
@@ -1425,6 +1426,7 @@ extern unsigned long drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_
 /* bm_find_next variants for use while you hold drbd_bm_lock() */
 extern unsigned long _drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo);
 extern unsigned long _drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo);
+extern unsigned long _drbd_bm_total_weight(struct drbd_conf *mdev);
 extern unsigned long drbd_bm_total_weight(struct drbd_conf *mdev);
 extern int drbd_bm_rs_done(struct drbd_conf *mdev);
 /* for receive_bitmap */
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 71c4c261573e..23878ffc43c8 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -963,6 +963,12 @@ static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs)
 	}
 }
 
+static void drbd_resume_al(struct drbd_conf *mdev)
+{
+	if (test_and_clear_bit(AL_SUSPENDED, &mdev->flags))
+		dev_info(DEV, "Resumed AL updates\n");
+}
+
 /**
  * __drbd_set_state() - Set a new DRBD state
  * @mdev:	DRBD device.
@@ -1160,6 +1166,10 @@ int __drbd_set_state(struct drbd_conf *mdev,
 	    ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
 		drbd_thread_restart_nowait(&mdev->receiver);
 
+	/* Resume AL writing if we get a connection */
+	if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
+		drbd_resume_al(mdev);
+
 	ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC);
 	if (ascw) {
 		ascw->os = os;
@@ -2851,6 +2861,7 @@ void drbd_mdev_cleanup(struct drbd_conf *mdev)
 	}
 
 	drbd_free_resources(mdev);
+	clear_bit(AL_SUSPENDED, &mdev->flags);
 
 	/*
 	 * currently we drbd_init_ee only on module load, so
@@ -3652,6 +3663,7 @@ int drbd_bmio_clear_n_write(struct drbd_conf *mdev)
 {
 	int rv = -EIO;
 
+	drbd_resume_al(mdev);
 	if (get_ldev_if_state(mdev, D_ATTACHING)) {
 		drbd_bm_clear_all(mdev);
 		rv = drbd_bm_write(mdev);
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 97fb2c2a7a57..6742652c8abc 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -777,6 +777,29 @@ static void drbd_reconfig_done(struct drbd_conf *mdev)
 	wake_up(&mdev->state_wait);
 }
 
+/* Make sure IO is suspended before calling this function(). */
+static void drbd_suspend_al(struct drbd_conf *mdev)
+{
+	int s = 0;
+
+	if (lc_try_lock(mdev->act_log)) {
+		drbd_al_shrink(mdev);
+		lc_unlock(mdev->act_log);
+	} else {
+		dev_warn(DEV, "Failed to lock al in drbd_suspend_al()\n");
+		return;
+	}
+
+	spin_lock_irq(&mdev->req_lock);
+	if (mdev->state.conn < C_CONNECTED)
+		s = !test_and_set_bit(AL_SUSPENDED, &mdev->flags);
+
+	spin_unlock_irq(&mdev->req_lock);
+
+	if (s)
+		dev_info(DEV, "Suspended AL updates\n");
+}
+
 /* does always return 0;
  * interesting return code is in reply->ret_code */
 static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
@@ -1113,6 +1136,9 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
 		drbd_al_to_on_disk_bm(mdev);
 	}
 
+	if (_drbd_bm_total_weight(mdev) == drbd_bm_bits(mdev))
+		drbd_suspend_al(mdev); /* IO is still suspended here... */
+
 	spin_lock_irq(&mdev->req_lock);
 	os = mdev->state;
 	ns.i = os.i;
@@ -1792,12 +1818,38 @@ static int drbd_nl_invalidate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl
 	return 0;
 }
 
+static int drbd_bmio_set_susp_al(struct drbd_conf *mdev)
+{
+	int rv;
+
+	rv = drbd_bmio_set_n_write(mdev);
+	drbd_suspend_al(mdev);
+	return rv;
+}
+
 static int drbd_nl_invalidate_peer(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
 				   struct drbd_nl_cfg_reply *reply)
 {
+	int retcode;
 
-	reply->ret_code = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S));
+	retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S), CS_ORDERED);
+
+	if (retcode < SS_SUCCESS) {
+		if (retcode == SS_NEED_CONNECTION && mdev->state.role == R_PRIMARY) {
+			/* The peer will get a resync upon connect anyways. Just make that
+			   into a full resync. */
+			retcode = drbd_request_state(mdev, NS(pdsk, D_INCONSISTENT));
+			if (retcode >= SS_SUCCESS) {
+				/* open coded drbd_bitmap_io() */
+				if (drbd_bitmap_io(mdev, &drbd_bmio_set_susp_al,
+						   "set_n_write from invalidate_peer"))
+					retcode = ERR_IO_MD_DISK;
+			}
+		} else
+			retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S));
+	}
 
+	reply->ret_code = retcode;
 	return 0;
 }
 
diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c
index c159692c3b56..a4a4a06908c5 100644
--- a/drivers/block/drbd/drbd_proc.c
+++ b/drivers/block/drbd/drbd_proc.c
@@ -203,7 +203,7 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
 			seq_printf(seq, "%2d: cs:Unconfigured\n", i);
 		} else {
 			seq_printf(seq,
-			   "%2d: cs:%s ro:%s/%s ds:%s/%s %c %c%c%c%c%c\n"
+			   "%2d: cs:%s ro:%s/%s ds:%s/%s %c %c%c%c%c%c%c\n"
 			   "    ns:%u nr:%u dw:%u dr:%u al:%u bm:%u "
 			   "lo:%d pe:%d ua:%d ap:%d ep:%d wo:%c",
 			   i, sn,
@@ -218,6 +218,7 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
 			   mdev->state.peer_isp ? 'p' : '-',
 			   mdev->state.user_isp ? 'u' : '-',
 			   mdev->congestion_reason ?: '-',
+			   test_bit(AL_SUSPENDED, &mdev->flags) ? 's' : '-',
 			   mdev->send_cnt/2,
 			   mdev->recv_cnt/2,
 			   mdev->writ_cnt/2,
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 3b61d767d9c4..af608b39c4e0 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -94,7 +94,8 @@ static void _req_is_done(struct drbd_conf *mdev, struct drbd_request *req, const
 		 */
 		if (s & RQ_LOCAL_MASK) {
 			if (get_ldev_if_state(mdev, D_FAILED)) {
-				drbd_al_complete_io(mdev, req->sector);
+				if (s & RQ_IN_ACT_LOG)
+					drbd_al_complete_io(mdev, req->sector);
 				put_ldev(mdev);
 			} else if (__ratelimit(&drbd_ratelimit_state)) {
 				dev_warn(DEV, "Should have called drbd_al_complete_io(, %llu), "
@@ -802,8 +803,10 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio)
 	 * resync extent to finish, and, if necessary, pulls in the target
 	 * extent into the activity log, which involves further disk io because
 	 * of transactional on-disk meta data updates. */
-	if (rw == WRITE && local)
+	if (rw == WRITE && local && !test_bit(AL_SUSPENDED, &mdev->flags)) {
+		req->rq_state |= RQ_IN_ACT_LOG;
 		drbd_al_begin_io(mdev, sector);
+	}
 
 	remote = remote && (mdev->state.pdsk == D_UP_TO_DATE ||
 			    (mdev->state.pdsk == D_INCONSISTENT &&
diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h
index f2e45aaa2cd5..181ea0364822 100644
--- a/drivers/block/drbd/drbd_req.h
+++ b/drivers/block/drbd/drbd_req.h
@@ -189,6 +189,9 @@ enum drbd_req_state_bits {
 
 	/* Set when this is a write, clear for a read */
 	__RQ_WRITE,
+
+	/* Should call drbd_al_complete_io() for this request... */
+	__RQ_IN_ACT_LOG,
 };
 
 #define RQ_LOCAL_PENDING   (1UL << __RQ_LOCAL_PENDING)
@@ -208,6 +211,7 @@ enum drbd_req_state_bits {
 #define RQ_NET_MASK        (((1UL << __RQ_NET_MAX)-1) & ~RQ_LOCAL_MASK)
 
 #define RQ_WRITE           (1UL << __RQ_WRITE)
+#define RQ_IN_ACT_LOG      (1UL << __RQ_IN_ACT_LOG)
 
 /* For waking up the frozen transfer log mod_req() has to return if the request
    should be counted in the epoch object*/
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 3d0e14e3ade3..8be983263374 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -1273,7 +1273,7 @@ int w_restart_disk_io(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
 {
 	struct drbd_request *req = container_of(w, struct drbd_request, w);
 
-	if (bio_data_dir(req->master_bio) == WRITE)
+	if (bio_data_dir(req->master_bio) == WRITE && req->rq_state & RQ_IN_ACT_LOG)
 		drbd_al_begin_io(mdev, req->sector);
 	/* Calling drbd_al_begin_io() out of the worker might deadlocks
 	   theoretically. Practically it can not deadlock, since this is
-- 
cgit v1.2.3


From c518d04fdec3d8b9d6f8b2228040934de9ee6708 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Wed, 1 Sep 2010 09:50:23 +0200
Subject: drbd: fix race between deconfiguring and reconfiguring network

If a drbd_nl_net_conf hits the small window between the state change
to C_STANDALONE and the corresponding cleanup in after_state_ch,
that cleanup would throw away stuff we now need again,
and later trigger BUG_ON()s.

Fixed by properly serializing the new config request with
any pending cleanup.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_nl.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 6742652c8abc..d066190f997a 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -750,14 +750,16 @@ void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_seg_s) __mu
 /* serialize deconfig (worker exiting, doing cleanup)
  * and reconfig (drbdsetup disk, drbdsetup net)
  *
- * wait for a potentially exiting worker, then restart it,
- * or start a new one.
+ * Wait for a potentially exiting worker, then restart it,
+ * or start a new one.  Flush any pending work, there may still be an
+ * after_state_change queued.
  */
 static void drbd_reconfig_start(struct drbd_conf *mdev)
 {
 	wait_event(mdev->state_wait, !test_and_set_bit(CONFIG_PENDING, &mdev->flags));
 	wait_event(mdev->state_wait, !test_bit(DEVICE_DYING, &mdev->flags));
 	drbd_thread_start(&mdev->worker);
+	drbd_flush_workqueue(mdev);
 }
 
 /* if still unconfigured, stops worker again.
-- 
cgit v1.2.3


From 3f3a9b849d2b703934c07fa17f5eac2dc37c1f6b Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Wed, 1 Sep 2010 15:12:12 +0200
Subject: drbd: fix race on meta-data update

The race:
	drbd_md_mark_dirty()
	drbd_md_sync()
		if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
			return;
		drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)
  ==> RACE
		clear_bit(MD_DIRTY, &mdev->flags); <== spurious

Fixed by removing the spurious clear_bit.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_main.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 23878ffc43c8..73c905d0ef18 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -3446,12 +3446,9 @@ void drbd_md_sync(struct drbd_conf *mdev)
 	D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset);
 	sector = mdev->ldev->md.md_offset;
 
-	if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
-		clear_bit(MD_DIRTY, &mdev->flags);
-	} else {
+	if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
 		/* this was a try anyways ... */
 		dev_err(DEV, "meta data update failed!\n");
-
 		drbd_chk_io_error(mdev, 1, TRUE);
 	}
 
-- 
cgit v1.2.3


From ef50a3e34f93a067ada541346be3175e924331a2 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Wed, 1 Sep 2010 14:39:30 +0200
Subject: drbd: implicitly create unconfigured devices on sync-after
 dependencies

If pacemaker (for example) decided to initialize minor devices not in
the exact sync-after dependency order, the configuration partially
failed with an error "The sync-after minor number is invalid". (Bugz. #322)

We can avoid that by implicitly creating unconfigured minor devices,
if others depend on them.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_nl.c | 76 ++++++++++++++++++++++++--------------------
 1 file changed, 41 insertions(+), 35 deletions(-)

diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index d066190f997a..e0061a906ba8 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -413,6 +413,39 @@ int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force)
 	return r;
 }
 
+static struct drbd_conf *ensure_mdev(int minor, int create)
+{
+	struct drbd_conf *mdev;
+
+	if (minor >= minor_count)
+		return NULL;
+
+	mdev = minor_to_mdev(minor);
+
+	if (!mdev && create) {
+		struct gendisk *disk = NULL;
+		mdev = drbd_new_device(minor);
+
+		spin_lock_irq(&drbd_pp_lock);
+		if (minor_table[minor] == NULL) {
+			minor_table[minor] = mdev;
+			disk = mdev->vdisk;
+			mdev = NULL;
+		} /* else: we lost the race */
+		spin_unlock_irq(&drbd_pp_lock);
+
+		if (disk) /* we won the race above */
+			/* in case we ever add a drbd_delete_device(),
+			 * don't forget the del_gendisk! */
+			add_disk(disk);
+		else /* we lost the race above */
+			drbd_free_mdev(mdev);
+
+		mdev = minor_to_mdev(minor);
+	}
+
+	return mdev;
+}
 
 static int drbd_nl_primary(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
 			   struct drbd_nl_cfg_reply *reply)
@@ -1713,6 +1746,12 @@ static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n
 	}
 #undef AL_MAX
 
+	/* to avoid spurious errors when configuring minors before configuring
+	 * the minors they depend on: if necessary, first create the minor we
+	 * depend on */
+	if (sc.after >= 0)
+		ensure_mdev(sc.after, 1);
+
 	/* most sanity checks done, try to assign the new sync-after
 	 * dependency.  need to hold the global lock in there,
 	 * to avoid a race in the dependency loop check. */
@@ -2080,40 +2119,6 @@ out:
 	return 0;
 }
 
-static struct drbd_conf *ensure_mdev(struct drbd_nl_cfg_req *nlp)
-{
-	struct drbd_conf *mdev;
-
-	if (nlp->drbd_minor >= minor_count)
-		return NULL;
-
-	mdev = minor_to_mdev(nlp->drbd_minor);
-
-	if (!mdev && (nlp->flags & DRBD_NL_CREATE_DEVICE)) {
-		struct gendisk *disk = NULL;
-		mdev = drbd_new_device(nlp->drbd_minor);
-
-		spin_lock_irq(&drbd_pp_lock);
-		if (minor_table[nlp->drbd_minor] == NULL) {
-			minor_table[nlp->drbd_minor] = mdev;
-			disk = mdev->vdisk;
-			mdev = NULL;
-		} /* else: we lost the race */
-		spin_unlock_irq(&drbd_pp_lock);
-
-		if (disk) /* we won the race above */
-			/* in case we ever add a drbd_delete_device(),
-			 * don't forget the del_gendisk! */
-			add_disk(disk);
-		else /* we lost the race above */
-			drbd_free_mdev(mdev);
-
-		mdev = minor_to_mdev(nlp->drbd_minor);
-	}
-
-	return mdev;
-}
-
 struct cn_handler_struct {
 	int (*function)(struct drbd_conf *,
 			 struct drbd_nl_cfg_req *,
@@ -2174,7 +2179,8 @@ static void drbd_connector_callback(struct cn_msg *req, struct netlink_skb_parms
 		goto fail;
 	}
 
-	mdev = ensure_mdev(nlp);
+	mdev = ensure_mdev(nlp->drbd_minor,
+			(nlp->flags & DRBD_NL_CREATE_DEVICE));
 	if (!mdev) {
 		retcode = ERR_MINOR_INVALID;
 		goto fail;
-- 
cgit v1.2.3


From 63106d3c6c769b6219bd04edde513b12abae3f61 Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Wed, 1 Sep 2010 15:47:15 +0200
Subject: drbd: Removed a race that could cause unexpected execution of
 w_make_resync_request()

The actual race happened int the drbd_start_resync() function. Where
drbd_resync_finished() -> __drbd_set_state() set STOP_SYNC_TIMER and
armed the timer.

If the timer fired before execution reaches the mod_timer statement
at the end of drbd_start_resync() the latter would cause an
unexpected call to w_make_resync_request().

Removed the STOP_SYNC_TIMER bit, and base it on the connection state.

The STOP_SYNC_TIMER bit probably originates probably the time before
the state engine.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_int.h      |  1 -
 drivers/block/drbd/drbd_main.c     | 18 ++----------------
 drivers/block/drbd/drbd_receiver.c |  1 -
 drivers/block/drbd/drbd_worker.c   | 21 +++++++++------------
 4 files changed, 11 insertions(+), 30 deletions(-)

diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index bb3a488b6fd6..d5e38de83a19 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -827,7 +827,6 @@ enum {
 	SIGNAL_ASENDER,		/* whether asender wants to be interrupted */
 	SEND_PING,		/* whether asender should send a ping asap */
 
-	STOP_SYNC_TIMER,	/* tell timer to cancel itself */
 	UNPLUG_QUEUED,		/* only relevant with kernel 2.4 */
 	UNPLUG_REMOTE,		/* sending a "UnplugRemote" could help */
 	MD_DIRTY,		/* current uuids and flags not yet on disk */
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 73c905d0ef18..5dd071e5c921 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -1052,12 +1052,6 @@ int __drbd_set_state(struct drbd_conf *mdev,
 	wake_up(&mdev->misc_wait);
 	wake_up(&mdev->state_wait);
 
-	/*   post-state-change actions   */
-	if (os.conn >= C_SYNC_SOURCE   && ns.conn <= C_CONNECTED) {
-		set_bit(STOP_SYNC_TIMER, &mdev->flags);
-		mod_timer(&mdev->resync_timer, jiffies);
-	}
-
 	/* aborted verify run. log the last position */
 	if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) &&
 	    ns.conn < C_CONNECTED) {
@@ -1072,22 +1066,14 @@ int __drbd_set_state(struct drbd_conf *mdev,
 		dev_info(DEV, "Syncer continues.\n");
 		mdev->rs_paused += (long)jiffies
 				  -(long)mdev->rs_mark_time[mdev->rs_last_mark];
-		if (ns.conn == C_SYNC_TARGET) {
-			if (!test_and_clear_bit(STOP_SYNC_TIMER, &mdev->flags))
-				mod_timer(&mdev->resync_timer, jiffies);
-			/* This if (!test_bit) is only needed for the case
-			   that a device that has ceased to used its timer,
-			   i.e. it is already in drbd_resync_finished() gets
-			   paused and resumed. */
-		}
+		if (ns.conn == C_SYNC_TARGET)
+			mod_timer(&mdev->resync_timer, jiffies);
 	}
 
 	if ((os.conn == C_SYNC_TARGET  || os.conn == C_SYNC_SOURCE) &&
 	    (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) {
 		dev_info(DEV, "Resync suspended\n");
 		mdev->rs_mark_time[mdev->rs_last_mark] = jiffies;
-		if (ns.conn == C_PAUSED_SYNC_T)
-			set_bit(STOP_SYNC_TIMER, &mdev->flags);
 	}
 
 	if (os.conn == C_CONNECTED &&
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 4249117f1f67..885471ded2fb 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -3803,7 +3803,6 @@ static void drbd_disconnect(struct drbd_conf *mdev)
 
 	/* make sure syncer is stopped and w_resume_next_sg queued */
 	del_timer_sync(&mdev->resync_timer);
-	set_bit(STOP_SYNC_TIMER, &mdev->flags);
 	resync_timer_fn((unsigned long)mdev);
 
 	/* wait for all w_e_end_data_req, w_e_end_rsdata_req, w_send_barrier,
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 8be983263374..0e5bf8c98293 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -395,25 +395,22 @@ defer:
 
 void resync_timer_fn(unsigned long data)
 {
-	unsigned long flags;
 	struct drbd_conf *mdev = (struct drbd_conf *) data;
 	int queue;
 
-	spin_lock_irqsave(&mdev->req_lock, flags);
-
-	if (likely(!test_and_clear_bit(STOP_SYNC_TIMER, &mdev->flags))) {
-		queue = 1;
-		if (mdev->state.conn == C_VERIFY_S)
-			mdev->resync_work.cb = w_make_ov_request;
-		else
-			mdev->resync_work.cb = w_make_resync_request;
-	} else {
+	queue = 1;
+	switch (mdev->state.conn) {
+	case C_VERIFY_S:
+		mdev->resync_work.cb = w_make_ov_request;
+		break;
+	case C_SYNC_TARGET:
+		mdev->resync_work.cb = w_make_resync_request;
+		break;
+	default:
 		queue = 0;
 		mdev->resync_work.cb = w_resync_inactive;
 	}
 
-	spin_unlock_irqrestore(&mdev->req_lock, flags);
-
 	/* harmless race: list_empty outside data.work.q_lock */
 	if (list_empty(&mdev->resync_work.list) && queue)
 		drbd_queue_work(&mdev->data.work, &mdev->resync_work);
-- 
cgit v1.2.3


From ee15b038164fcf19b798021762dee3cf5cbc6433 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Fri, 3 Sep 2010 10:00:09 +0200
Subject: drbd: fix race on meta-data update, addendum

addendum to baa33ae4eaa4477b60af7c434c0ddd1d182c1ae7

The race:
    drbd_md_sync()
	if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
		return;
    ==> RACE with drbd_md_mark_dirty() rearming the timer.
	del_timer(&mdev->md_sync_timer);

    Fixed by moving the del_timer before the test_and_clear_bit.

Additionally only rearm the timer in drbd_md_mark_dirty, if MD_DIRTY was
not already set, reduce the grace period from five to one second, and
add an ifdef'ed debuging aid to find code paths missing an explicit
drbd_md_sync, if any, as those are the only relevant ones for this race.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_int.h  | 12 ++++++++++++
 drivers/block/drbd/drbd_main.c | 24 +++++++++++++++++++-----
 2 files changed, 31 insertions(+), 5 deletions(-)

diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index d5e38de83a19..f9b75fc30569 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -975,6 +975,12 @@ struct drbd_conf {
 			  md_sync_work;
 	struct timer_list resync_timer;
 	struct timer_list md_sync_timer;
+#ifdef DRBD_DEBUG_MD_SYNC
+	struct {
+		unsigned int line;
+		const char* func;
+	} last_md_mark_dirty;
+#endif
 
 	/* Used after attach while negotiating new disk state. */
 	union drbd_state new_state_tmp;
@@ -1253,7 +1259,13 @@ extern void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
 extern void drbd_md_set_flag(struct drbd_conf *mdev, int flags) __must_hold(local);
 extern void drbd_md_clear_flag(struct drbd_conf *mdev, int flags)__must_hold(local);
 extern int drbd_md_test_flag(struct drbd_backing_dev *, int);
+#ifndef DRBD_DEBUG_MD_SYNC
 extern void drbd_md_mark_dirty(struct drbd_conf *mdev);
+#else
+#define drbd_md_mark_dirty(m)	drbd_md_mark_dirty_(m, __LINE__ , __func__ )
+extern void drbd_md_mark_dirty_(struct drbd_conf *mdev,
+		unsigned int line, const char *func);
+#endif
 extern void drbd_queue_bitmap_io(struct drbd_conf *mdev,
 				 int (*io_fn)(struct drbd_conf *),
 				 void (*done)(struct drbd_conf *, int),
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 5dd071e5c921..ab1244e0045c 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -3402,9 +3402,10 @@ void drbd_md_sync(struct drbd_conf *mdev)
 	sector_t sector;
 	int i;
 
+	del_timer(&mdev->md_sync_timer);
+	/* timer may be rearmed by drbd_md_mark_dirty() now. */
 	if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
 		return;
-	del_timer(&mdev->md_sync_timer);
 
 	/* We use here D_FAILED and not D_ATTACHING because we try to write
 	 * metadata even if we detach due to a disk failure! */
@@ -3529,12 +3530,22 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
  * the meta-data super block. This function sets MD_DIRTY, and starts a
  * timer that ensures that within five seconds you have to call drbd_md_sync().
  */
+#ifdef DRBD_DEBUG_MD_SYNC
+void drbd_md_mark_dirty_(struct drbd_conf *mdev, unsigned int line, const char *func)
+{
+	if (!test_and_set_bit(MD_DIRTY, &mdev->flags)) {
+		mod_timer(&mdev->md_sync_timer, jiffies + HZ);
+		mdev->last_md_mark_dirty.line = line;
+		mdev->last_md_mark_dirty.func = func;
+	}
+}
+#else
 void drbd_md_mark_dirty(struct drbd_conf *mdev)
 {
-	set_bit(MD_DIRTY, &mdev->flags);
-	mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ);
+	if (!test_and_set_bit(MD_DIRTY, &mdev->flags))
+		mod_timer(&mdev->md_sync_timer, jiffies + HZ);
 }
-
+#endif
 
 static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local)
 {
@@ -3775,8 +3786,11 @@ static void md_sync_timer_fn(unsigned long data)
 static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused)
 {
 	dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
+#ifdef DEBUG
+	dev_warn(DEV, "last md_mark_dirty: %s:%u\n",
+		mdev->last_md_mark_dirty.func, mdev->last_md_mark_dirty.line);
+#endif
 	drbd_md_sync(mdev);
-
 	return 1;
 }
 
-- 
cgit v1.2.3


From 1090c056c5eb6d5335cceb381683e77ac24c71ab Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Mon, 19 Jul 2010 17:41:04 +0200
Subject: drbd: drbd_md_sync before calling user space helpers

Just in case we have some pending meta data changes to sync, do it
before we call our userland helper, as that may take some time,
or even cause a hard reboot.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_nl.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index e0061a906ba8..5b30f90cab3e 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -172,6 +172,10 @@ int drbd_khelper(struct drbd_conf *mdev, char *cmd)
 		put_net_conf(mdev);
 	}
 
+	/* The helper may take some time.
+	 * write out any unsynced meta data changes now */
+	drbd_md_sync(mdev);
+
 	dev_info(DEV, "helper command: %s %s %s\n", usermode_helper, cmd, mb);
 
 	drbd_bcast_ev_helper(mdev, cmd);
-- 
cgit v1.2.3


From 76d2e7eca8e7675c6d7a6592f9e747b121cc8a87 Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Wed, 25 Aug 2010 11:58:05 +0200
Subject: drbd: Adding support for BIO/Request flags: REQ_FUA, REQ_FLUSH and
 REQ_DISCARD

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_int.h      |  8 ++++++--
 drivers/block/drbd/drbd_main.c     | 28 ++++++++++++++--------------
 drivers/block/drbd/drbd_receiver.c | 20 ++++++++++++++------
 3 files changed, 34 insertions(+), 22 deletions(-)

diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index f9b75fc30569..79b877db9a39 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -374,9 +374,13 @@ union p_header {
  */
 
 /* these defines must not be changed without changing the protocol version */
-#define DP_HARDBARRIER	      1
-#define DP_RW_SYNC	      2
+#define DP_HARDBARRIER	      1 /* depricated */
+#define DP_RW_SYNC	      2 /* equals REQ_SYNC    */
 #define DP_MAY_SET_IN_SYNC    4
+#define DP_UNPLUG             8 /* equals REQ_UNPLUG  */
+#define DP_FUA               16 /* equals REQ_FUA     */
+#define DP_FLUSH             32 /* equals REQ_FLUSH   */
+#define DP_DISCARD           64 /* equals REQ_DISCARD */
 
 struct p_data {
 	union p_header head;
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index ab1244e0045c..1827cf073c2e 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -2426,6 +2426,18 @@ static int _drbd_send_zc_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
 	return 1;
 }
 
+static u32 bio_flags_to_wire(struct drbd_conf *mdev, unsigned long bi_rw)
+{
+	if (mdev->agreed_pro_version >= 95)
+		return  (bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) |
+			(bi_rw & REQ_UNPLUG ? DP_UNPLUG : 0) |
+			(bi_rw & REQ_FUA ? DP_FUA : 0) |
+			(bi_rw & REQ_FLUSH ? DP_FLUSH : 0) |
+			(bi_rw & REQ_DISCARD ? DP_DISCARD : 0);
+	else
+		return bi_rw & (REQ_SYNC | REQ_UNPLUG) ? DP_RW_SYNC : 0;
+}
+
 /* Used to send write requests
  * R_PRIMARY -> Peer	(P_DATA)
  */
@@ -2459,21 +2471,9 @@ int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
 	p.block_id = (unsigned long)req;
 	p.seq_num  = cpu_to_be32(req->seq_num =
 				 atomic_add_return(1, &mdev->packet_seq));
-	dp_flags = 0;
 
-	/* NOTE: no need to check if barriers supported here as we would
-	 *       not pass the test in make_request_common in that case
-	 */
-	if (req->master_bio->bi_rw & REQ_HARDBARRIER) {
-		dev_err(DEV, "ASSERT FAILED would have set DP_HARDBARRIER\n");
-		/* dp_flags |= DP_HARDBARRIER; */
-	}
-	if (req->master_bio->bi_rw & REQ_SYNC)
-		dp_flags |= DP_RW_SYNC;
-	/* for now handle SYNCIO and UNPLUG
-	 * as if they still were one and the same flag */
-	if (req->master_bio->bi_rw & REQ_UNPLUG)
-		dp_flags |= DP_RW_SYNC;
+	dp_flags = bio_flags_to_wire(mdev, req->master_bio->bi_rw);
+
 	if (mdev->state.conn >= C_SYNC_SOURCE &&
 	    mdev->state.conn <= C_PAUSED_SYNC_T)
 		dp_flags |= DP_MAY_SET_IN_SYNC;
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 885471ded2fb..e96fbb04ea4d 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -1753,6 +1753,18 @@ static int drbd_wait_peer_seq(struct drbd_conf *mdev, const u32 packet_seq)
 	return ret;
 }
 
+static unsigned long write_flags_to_bio(struct drbd_conf *mdev, u32 dpf)
+{
+	if (mdev->agreed_pro_version >= 95)
+		return  (dpf & DP_RW_SYNC ? REQ_SYNC : 0) |
+			(dpf & DP_UNPLUG ? REQ_UNPLUG : 0) |
+			(dpf & DP_FUA ? REQ_FUA : 0) |
+			(dpf & DP_FLUSH ? REQ_FUA : 0) |
+			(dpf & DP_DISCARD ? REQ_DISCARD : 0);
+	else
+		return dpf & DP_RW_SYNC ? (REQ_SYNC | REQ_UNPLUG) : 0;
+}
+
 /* mirrored write */
 static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
 {
@@ -1818,12 +1830,8 @@ static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
 	spin_unlock(&mdev->epoch_lock);
 
 	dp_flags = be32_to_cpu(p->dp_flags);
-	if (dp_flags & DP_HARDBARRIER) {
-		dev_err(DEV, "ASSERT FAILED would have submitted barrier request\n");
-		/* rw |= REQ_HARDBARRIER; */
-	}
-	if (dp_flags & DP_RW_SYNC)
-		rw |= REQ_SYNC | REQ_UNPLUG;
+	rw |= write_flags_to_bio(mdev, dp_flags);
+
 	if (dp_flags & DP_MAY_SET_IN_SYNC)
 		e->flags |= EE_MAY_SET_IN_SYNC;
 
-- 
cgit v1.2.3


From 435f07402b3165b90592073bc0f8c6f8fa160ff9 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Mon, 6 Sep 2010 12:30:25 +0200
Subject: drbd: don't count sendpage()d pages only referenced by tcp as in use

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_int.h      |  8 ++++++--
 drivers/block/drbd/drbd_main.c     |  1 +
 drivers/block/drbd/drbd_receiver.c | 26 ++++++++++++++++----------
 drivers/block/drbd/drbd_worker.c   |  4 ++++
 4 files changed, 27 insertions(+), 12 deletions(-)

diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 79b877db9a39..eb1273d04caf 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -1077,7 +1077,8 @@ struct drbd_conf {
 	int next_barrier_nr;
 	struct hlist_head *app_reads_hash; /* is proteced by req_lock */
 	struct list_head resync_reads;
-	atomic_t pp_in_use;
+	atomic_t pp_in_use;		/* allocated from page pool */
+	atomic_t pp_in_use_by_net;	/* sendpage()d, still referenced by tcp */
 	wait_queue_head_t ee_wait;
 	struct page *md_io_page;	/* one page buffer for md_io */
 	struct page *md_io_tmpp;	/* for logical_block_size != 512 */
@@ -1555,7 +1556,10 @@ extern struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
 					    sector_t sector,
 					    unsigned int data_size,
 					    gfp_t gfp_mask) __must_hold(local);
-extern void drbd_free_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e);
+extern void drbd_free_some_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e,
+		int is_net);
+#define drbd_free_ee(m,e)	drbd_free_some_ee(m, e, 0)
+#define drbd_free_net_ee(m,e)	drbd_free_some_ee(m, e, 1)
 extern void drbd_wait_ee_list_empty(struct drbd_conf *mdev,
 		struct list_head *head);
 extern void _drbd_wait_ee_list_empty(struct drbd_conf *mdev,
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 1827cf073c2e..981cfd178b09 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -2753,6 +2753,7 @@ void drbd_init_set_defaults(struct drbd_conf *mdev)
 	atomic_set(&mdev->net_cnt, 0);
 	atomic_set(&mdev->packet_seq, 0);
 	atomic_set(&mdev->pp_in_use, 0);
+	atomic_set(&mdev->pp_in_use_by_net, 0);
 	atomic_set(&mdev->rs_sect_in, 0);
 	atomic_set(&mdev->rs_sect_ev, 0);
 
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index e96fbb04ea4d..2c3edf0ac5ca 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -241,7 +241,7 @@ static void drbd_kick_lo_and_reclaim_net(struct drbd_conf *mdev)
 	spin_unlock_irq(&mdev->req_lock);
 
 	list_for_each_entry_safe(e, t, &reclaimed, w.list)
-		drbd_free_ee(mdev, e);
+		drbd_free_net_ee(mdev, e);
 }
 
 /**
@@ -298,9 +298,11 @@ static struct page *drbd_pp_alloc(struct drbd_conf *mdev, unsigned number, bool
  * Is also used from inside an other spin_lock_irq(&mdev->req_lock);
  * Either links the page chain back to the global pool,
  * or returns all pages to the system. */
-static void drbd_pp_free(struct drbd_conf *mdev, struct page *page)
+static void drbd_pp_free(struct drbd_conf *mdev, struct page *page, int is_net)
 {
+	atomic_t *a = is_net ? &mdev->pp_in_use_by_net : &mdev->pp_in_use;
 	int i;
+
 	if (drbd_pp_vacant > (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE)*minor_count)
 		i = page_chain_free(page);
 	else {
@@ -311,10 +313,10 @@ static void drbd_pp_free(struct drbd_conf *mdev, struct page *page)
 		drbd_pp_vacant += i;
 		spin_unlock(&drbd_pp_lock);
 	}
-	atomic_sub(i, &mdev->pp_in_use);
-	i = atomic_read(&mdev->pp_in_use);
+	i = atomic_sub_return(i, a);
 	if (i < 0)
-		dev_warn(DEV, "ASSERTION FAILED: pp_in_use: %d < 0\n", i);
+		dev_warn(DEV, "ASSERTION FAILED: %s: %d < 0\n",
+			is_net ? "pp_in_use_by_net" : "pp_in_use", i);
 	wake_up(&drbd_pp_wait);
 }
 
@@ -374,11 +376,11 @@ struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
 	return NULL;
 }
 
-void drbd_free_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
+void drbd_free_some_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, int is_net)
 {
 	if (e->flags & EE_HAS_DIGEST)
 		kfree(e->digest);
-	drbd_pp_free(mdev, e->pages);
+	drbd_pp_free(mdev, e->pages, is_net);
 	D_ASSERT(atomic_read(&e->pending_bios) == 0);
 	D_ASSERT(hlist_unhashed(&e->colision));
 	mempool_free(e, drbd_ee_mempool);
@@ -389,13 +391,14 @@ int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list)
 	LIST_HEAD(work_list);
 	struct drbd_epoch_entry *e, *t;
 	int count = 0;
+	int is_net = list == &mdev->net_ee;
 
 	spin_lock_irq(&mdev->req_lock);
 	list_splice_init(list, &work_list);
 	spin_unlock_irq(&mdev->req_lock);
 
 	list_for_each_entry_safe(e, t, &work_list, w.list) {
-		drbd_free_ee(mdev, e);
+		drbd_free_some_ee(mdev, e, is_net);
 		count++;
 	}
 	return count;
@@ -424,7 +427,7 @@ static int drbd_process_done_ee(struct drbd_conf *mdev)
 	spin_unlock_irq(&mdev->req_lock);
 
 	list_for_each_entry_safe(e, t, &reclaimed, w.list)
-		drbd_free_ee(mdev, e);
+		drbd_free_net_ee(mdev, e);
 
 	/* possible callbacks here:
 	 * e_end_block, and e_end_resync_block, e_send_discard_ack.
@@ -1460,7 +1463,7 @@ static int drbd_drain_block(struct drbd_conf *mdev, int data_size)
 		data_size -= rr;
 	}
 	kunmap(page);
-	drbd_pp_free(mdev, page);
+	drbd_pp_free(mdev, page, 0);
 	return rv;
 }
 
@@ -3879,6 +3882,9 @@ static void drbd_disconnect(struct drbd_conf *mdev)
 	i = drbd_release_ee(mdev, &mdev->net_ee);
 	if (i)
 		dev_info(DEV, "net_ee not empty, killed %u entries\n", i);
+	i = atomic_read(&mdev->pp_in_use_by_net);
+	if (i)
+		dev_info(DEV, "pp_in_use_by_net = %d, expected 0\n", i);
 	i = atomic_read(&mdev->pp_in_use);
 	if (i)
 		dev_info(DEV, "pp_in_use = %d, expected 0\n", i);
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 0e5bf8c98293..01743193f321 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -914,9 +914,13 @@ static void move_to_net_ee_or_free(struct drbd_conf *mdev, struct drbd_epoch_ent
 {
 	if (drbd_ee_has_active_page(e)) {
 		/* This might happen if sendpage() has not finished */
+		int i = DIV_ROUND_UP(e->size, PAGE_SIZE);
+		atomic_add(i, &mdev->pp_in_use_by_net);
+		atomic_sub(i, &mdev->pp_in_use);
 		spin_lock_irq(&mdev->req_lock);
 		list_add_tail(&e->w.list, &mdev->net_ee);
 		spin_unlock_irq(&mdev->req_lock);
+		wake_up(&drbd_pp_wait);
 	} else
 		drbd_free_ee(mdev, e);
 }
-- 
cgit v1.2.3


From 1d53f09e170e477de67babd7a10e277479260d51 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Sun, 5 Sep 2010 01:13:24 +0200
Subject: drbd: fix potential kernel BUG (NULL deref)

BUG trace would look like:
 lc_find
 drbd_rs_complete_io
 got_OVResult
 drbd_asender

Could be triggered by explicit, or IO-error policy based,
detach during online-verify.

We may only dereference mdev->resync, if we first get_ldev(), as the
disk may break any time, causing mdev->resync to disappear once all
ldev references have been returned.
Already in flight online-verify requests or replies may still come in,
which we then need to ignore.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_receiver.c | 15 +++++++++++----
 drivers/block/drbd/drbd_worker.c   | 10 ++++++++--
 2 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 2c3edf0ac5ca..e4e4eddf04f2 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -4241,10 +4241,13 @@ static int got_IsInSync(struct drbd_conf *mdev, struct p_header80 *h)
 
 	update_peer_seq(mdev, be32_to_cpu(p->seq_num));
 
-	drbd_rs_complete_io(mdev, sector);
-	drbd_set_in_sync(mdev, sector, blksize);
-	/* rs_same_csums is supposed to count in units of BM_BLOCK_SIZE */
-	mdev->rs_same_csum += (blksize >> BM_BLOCK_SHIFT);
+	if (get_ldev(mdev)) {
+		drbd_rs_complete_io(mdev, sector);
+		drbd_set_in_sync(mdev, sector, blksize);
+		/* rs_same_csums is supposed to count in units of BM_BLOCK_SIZE */
+		mdev->rs_same_csum += (blksize >> BM_BLOCK_SHIFT);
+		put_ldev(mdev);
+	}
 	dec_rs_pending(mdev);
 	atomic_add(blksize >> 9, &mdev->rs_sect_in);
 
@@ -4423,6 +4426,9 @@ static int got_OVResult(struct drbd_conf *mdev, struct p_header80 *h)
 	else
 		ov_oos_print(mdev);
 
+	if (!get_ldev(mdev))
+		return TRUE;
+
 	drbd_rs_complete_io(mdev, sector);
 	dec_rs_pending(mdev);
 
@@ -4437,6 +4443,7 @@ static int got_OVResult(struct drbd_conf *mdev, struct p_header80 *h)
 			drbd_resync_finished(mdev);
 		}
 	}
+	put_ldev(mdev);
 	return TRUE;
 }
 
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 01743193f321..c72a5fc1c88e 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -1027,7 +1027,10 @@ int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
 		return 1;
 	}
 
-	drbd_rs_complete_io(mdev, e->sector);
+	if (get_ldev(mdev)) {
+		drbd_rs_complete_io(mdev, e->sector);
+		put_ldev(mdev);
+	}
 
 	di = e->digest;
 
@@ -1134,7 +1137,10 @@ int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
 
 	/* after "cancel", because after drbd_disconnect/drbd_rs_cancel_all
 	 * the resync lru has been cleaned up already */
-	drbd_rs_complete_io(mdev, e->sector);
+	if (get_ldev(mdev)) {
+		drbd_rs_complete_io(mdev, e->sector);
+		put_ldev(mdev);
+	}
 
 	di = e->digest;
 
-- 
cgit v1.2.3


From a821cc4a9a8d9e67356f9a5cfc1206aa3cfd30f7 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Mon, 6 Sep 2010 12:31:37 +0200
Subject: drbd: fix spurious protocol error

If we cannot satisfy a request (because our disk just broke),
we still need to drain the payload.  Or we'll get a protocol error
when interpreting the payload as DRBD packet header.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_receiver.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index e4e4eddf04f2..983e49cbd233 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -2085,7 +2085,8 @@ static int receive_DataRequest(struct drbd_conf *mdev, enum drbd_packets cmd, un
 			    "no local data.\n");
 		drbd_send_ack_rp(mdev, cmd == P_DATA_REQUEST ? P_NEG_DREPLY :
 				 P_NEG_RS_DREPLY , p);
-		return TRUE;
+		/* drain possibly payload */
+		return drbd_drain_block(mdev, digest_size);
 	}
 
 	/* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
-- 
cgit v1.2.3


From 02bc7174ae83617b4364dc179d95d848d9fd6db5 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Mon, 6 Sep 2010 12:13:20 +0200
Subject: drbd: cosmetic, don't report resync for online-verify

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_main.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 981cfd178b09..2060db69f182 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -504,7 +504,7 @@ static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns);
 static int is_valid_state_transition(struct drbd_conf *,
 				     union drbd_state, union drbd_state);
 static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
-				       union drbd_state ns, int *warn_sync_abort);
+				       union drbd_state ns, const char **warn_sync_abort);
 int drbd_send_state_req(struct drbd_conf *,
 			union drbd_state, union drbd_state);
 
@@ -812,7 +812,7 @@ static int is_valid_state_transition(struct drbd_conf *mdev,
  * to D_UNKNOWN. This rule and many more along those lines are in this function.
  */
 static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
-				       union drbd_state ns, int *warn_sync_abort)
+				       union drbd_state ns, const char **warn_sync_abort)
 {
 	enum drbd_fencing_p fp;
 
@@ -851,7 +851,9 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state
 	if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED &&
 	    (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
 		if (warn_sync_abort)
-			*warn_sync_abort = 1;
+			*warn_sync_abort =
+				os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ?
+				"Online-verify" : "Resync";
 		ns.conn = C_CONNECTED;
 	}
 
@@ -984,7 +986,7 @@ int __drbd_set_state(struct drbd_conf *mdev,
 {
 	union drbd_state os;
 	int rv = SS_SUCCESS;
-	int warn_sync_abort = 0;
+	const char *warn_sync_abort = NULL;
 	struct after_state_chg_work *ascw;
 
 	os = mdev->state;
@@ -1016,7 +1018,7 @@ int __drbd_set_state(struct drbd_conf *mdev,
 	}
 
 	if (warn_sync_abort)
-		dev_warn(DEV, "Resync aborted.\n");
+		dev_warn(DEV, "%s aborted.\n", warn_sync_abort);
 
 	{
 		char *pbp, pb[300];
-- 
cgit v1.2.3


From f2906e183f5460df9d9e774f5952f5ff670b3913 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Wed, 21 Jul 2010 17:04:32 +0200
Subject: drbd: fix for spurious full sync (becoming sync target looked like
 invalidate)

If a synctarget lost connection while being WFSyncUUID,
due to "state sanitizing", the attempted state change to SyncTarget
looked like an "invalidate" to after_state_ch() later,
thus caused a full sync on next handshake (Bug #318).

drbd0: PingAck did not arrive in time.
drbd0: peer( Primary -> Unknown ) conn( WFSyncUUID -> NetworkFailure ) pdsk( UpToDate -> DUnknown )

        from  : { cs:NetworkFailure ro:Secondary/Unknown ds:UpToDate/DUnknown r--- }
        to    : { cs:SyncTarget ro:Secondary/Unknown ds:Inconsistent/DUnknown r--- }
        after sanizising, resulted in
        state: { cs:NetworkFailure ro:Secondary/Unknown ds:Inconsistent/DUnknown r--- }
        drbd0: disk( UpToDate -> Inconsistent )

Fix:
don't mask state transition errors in "sanitizing",
so the requested state change to SyncTarget fails,
instead of being implicitly "remaped" to invalidate.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_main.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 2060db69f182..04c305d36f8e 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -827,9 +827,10 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state
 	    os.conn <= C_DISCONNECTING)
 		ns.conn = os.conn;
 
-	/* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow */
+	/* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow.
+	 * If you try to go into some Sync* state, that shall fail (elsewhere). */
 	if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN &&
-	    ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING)
+	    ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_TEAR_DOWN)
 		ns.conn = os.conn;
 
 	/* After C_DISCONNECTING only C_STANDALONE may follow */
-- 
cgit v1.2.3


From 5a75cc7cfbb98e896232902214432dae30653dfe Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Thu, 9 Sep 2010 14:22:21 +0200
Subject: drbd: Fixed compatibility with protocol versions smaller than 95

Forgot to consider the max size for the resync requests.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_worker.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index c72a5fc1c88e..daa672fc46e9 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -534,8 +534,9 @@ int w_make_resync_request(struct drbd_conf *mdev,
 
 	/* starting with drbd 8.3.8, we can handle multi-bio EEs,
 	 * if it should be necessary */
-	max_segment_size = mdev->agreed_pro_version < 94 ?
-		queue_max_segment_size(mdev->rq_queue) : DRBD_MAX_SEGMENT_SIZE;
+	max_segment_size =
+		mdev->agreed_pro_version < 94 ? queue_max_segment_size(mdev->rq_queue) :
+		mdev->agreed_pro_version < 95 ?	DRBD_MAX_SIZE_H80_PACKET : DRBD_MAX_SEGMENT_SIZE;
 
 	if (mdev->rs_plan_s.size) { /* mdev->sync_conf.c_plan_ahead */
 		number = drbd_rs_controller(mdev) >> (BM_BLOCK_SHIFT - 9);
-- 
cgit v1.2.3


From 78db89287ce0f146a1f2a019a0b243ea4557caac Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Mon, 13 Sep 2010 13:27:10 +0200
Subject: drbd: DIV_ROUND_UP not needed here

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_worker.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index daa672fc46e9..83ba63ab2358 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -915,7 +915,7 @@ static void move_to_net_ee_or_free(struct drbd_conf *mdev, struct drbd_epoch_ent
 {
 	if (drbd_ee_has_active_page(e)) {
 		/* This might happen if sendpage() has not finished */
-		int i = DIV_ROUND_UP(e->size, PAGE_SIZE);
+		int i = (e->size + PAGE_SIZE -1) >> PAGE_SHIFT;
 		atomic_add(i, &mdev->pp_in_use_by_net);
 		atomic_sub(i, &mdev->pp_in_use);
 		spin_lock_irq(&mdev->req_lock);
-- 
cgit v1.2.3


From fb22c402ffdf61dd121795b5809de587185d5240 Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Wed, 8 Sep 2010 23:20:21 +0200
Subject: drbd: Track the reasons to suspend IO in dedicated state bits

There are three ways to get IO suspended:

 * Loss of any access to data
 * Fence-peer-handler running
 * User requested to suspend IO

Track those in different bits, so that one condition clearing its
state bit does not interfere with the other two conditions.

Only when the user resumes IO he overrules all three bits.

The fact is hidden from the user, he sees only a single suspend
bit.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_int.h      |  9 ++++++++-
 drivers/block/drbd/drbd_main.c     | 36 +++++++++++++++++++++++-------------
 drivers/block/drbd/drbd_nl.c       | 20 ++++++++++++++------
 drivers/block/drbd/drbd_proc.c     |  2 +-
 drivers/block/drbd/drbd_receiver.c |  6 +++---
 drivers/block/drbd/drbd_req.c      |  6 +++---
 include/linux/drbd.h               | 10 +++++++---
 7 files changed, 59 insertions(+), 30 deletions(-)

diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index eb1273d04caf..ff7fffa00dac 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -1681,6 +1681,8 @@ void drbd_bcast_ee(struct drbd_conf *mdev,
 #define susp_MASK 1
 #define user_isp_MASK 1
 #define aftr_isp_MASK 1
+#define susp_nod_MASK 1
+#define susp_fen_MASK 1
 
 #define NS(T, S) \
 	({ union drbd_state mask; mask.i = 0; mask.T = T##_MASK; mask; }), \
@@ -2254,11 +2256,16 @@ static inline int drbd_state_is_stable(union drbd_state s)
 	return 1;
 }
 
+static inline int is_susp(union drbd_state s)
+{
+	return s.susp || s.susp_nod || s.susp_fen;
+}
+
 static inline int __inc_ap_bio_cond(struct drbd_conf *mdev)
 {
 	int mxb = drbd_get_max_buffers(mdev);
 
-	if (mdev->state.susp)
+	if (is_susp(mdev->state))
 		return 0;
 	if (test_bit(SUSPEND_IO, &mdev->flags))
 		return 0;
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 04c305d36f8e..4f33714fb3cd 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -654,7 +654,7 @@ static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns)
 	    drbd_role_str(ns.peer),
 	    drbd_disk_str(ns.disk),
 	    drbd_disk_str(ns.pdsk),
-	    ns.susp ? 's' : 'r',
+	    is_susp(ns) ? 's' : 'r',
 	    ns.aftr_isp ? 'a' : '-',
 	    ns.peer_isp ? 'p' : '-',
 	    ns.user_isp ? 'u' : '-'
@@ -925,12 +925,12 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state
 	if (fp == FP_STONITH &&
 	    (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) &&
 	    !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED))
-		ns.susp = 1; /* Suspend IO while fence-peer handler runs (peer lost) */
+		ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */
 
 	if (mdev->sync_conf.on_no_data == OND_SUSPEND_IO &&
 	    (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) &&
 	    !(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE))
-		ns.susp = 1; /* Suspend IO while no data available (no accessible data available) */
+		ns.susp_nod = 1; /* Suspend IO while no data available (no accessible data available) */
 
 	if (ns.aftr_isp || ns.peer_isp || ns.user_isp) {
 		if (ns.conn == C_SYNC_SOURCE)
@@ -1030,7 +1030,10 @@ int __drbd_set_state(struct drbd_conf *mdev,
 		PSC(conn);
 		PSC(disk);
 		PSC(pdsk);
-		PSC(susp);
+		if (is_susp(ns) != is_susp(os))
+			pbp += sprintf(pbp, "susp( %s -> %s ) ",
+				       drbd_susp_str(is_susp(os)),
+				       drbd_susp_str(is_susp(ns)));
 		PSC(aftr_isp);
 		PSC(peer_isp);
 		PSC(user_isp);
@@ -1218,6 +1221,7 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
 {
 	enum drbd_fencing_p fp;
 	enum drbd_req_event what = nothing;
+	union drbd_state nsm = (union drbd_state){ .i = -1 };
 
 	if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) {
 		clear_bit(CRASHED_PRIMARY, &mdev->flags);
@@ -1241,19 +1245,21 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
 	/* Here we have the actions that are performed after a
 	   state change. This function might sleep */
 
-	if (os.susp && ns.susp && mdev->sync_conf.on_no_data == OND_SUSPEND_IO) {
+	nsm.i = -1;
+	if (ns.susp_nod) {
 		if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
 			if (ns.conn == C_CONNECTED)
-				what = resend;
+				what = resend, nsm.susp_nod = 0;
 			else /* ns.conn > C_CONNECTED */
 				dev_err(DEV, "Unexpected Resynd going on!\n");
 		}
 
 		if (os.disk == D_ATTACHING && ns.disk > D_ATTACHING)
-			what = restart_frozen_disk_io;
+			what = restart_frozen_disk_io, nsm.susp_nod = 0;
+
 	}
 
-	if (fp == FP_STONITH && ns.susp) {
+	if (ns.susp_fen) {
 		/* case1: The outdate peer handler is successful: */
 		if (os.pdsk > D_OUTDATED  && ns.pdsk <= D_OUTDATED) {
 			tl_clear(mdev);
@@ -1263,20 +1269,22 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
 				drbd_md_sync(mdev);
 			}
 			spin_lock_irq(&mdev->req_lock);
-			_drbd_set_state(_NS(mdev, susp, 0), CS_VERBOSE, NULL);
+			_drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL);
 			spin_unlock_irq(&mdev->req_lock);
 		}
 		/* case2: The connection was established again: */
 		if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
 			clear_bit(NEW_CUR_UUID, &mdev->flags);
 			what = resend;
+			nsm.susp_fen = 0;
 		}
 	}
 
 	if (what != nothing) {
 		spin_lock_irq(&mdev->req_lock);
 		_tl_restart(mdev, what);
-		_drbd_set_state(_NS(mdev, susp, 0), CS_VERBOSE, NULL);
+		nsm.i &= mdev->state.i;
+		_drbd_set_state(mdev, nsm, CS_VERBOSE, NULL);
 		spin_unlock_irq(&mdev->req_lock);
 	}
 
@@ -1298,7 +1306,7 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
 		if (get_ldev(mdev)) {
 			if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
 			    mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
-				if (mdev->state.susp) {
+				if (is_susp(mdev->state)) {
 					set_bit(NEW_CUR_UUID, &mdev->flags);
 				} else {
 					drbd_uuid_new_current(mdev);
@@ -1417,7 +1425,7 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
 		resume_next_sg(mdev);
 
 	/* free tl_hash if we Got thawed and are C_STANDALONE */
-	if (ns.conn == C_STANDALONE && ns.susp == 0 && mdev->tl_hash)
+	if (ns.conn == C_STANDALONE && !is_susp(ns) && mdev->tl_hash)
 		drbd_free_tl_hash(mdev);
 
 	/* Upon network connection, we need to start the receiver */
@@ -2732,7 +2740,9 @@ static void drbd_set_defaults(struct drbd_conf *mdev)
 		  .conn = C_STANDALONE,
 		  .disk = D_DISKLESS,
 		  .pdsk = D_UNKNOWN,
-		  .susp = 0
+		  .susp = 0,
+		  .susp_nod = 0,
+		  .susp_fen = 0
 		} };
 }
 
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 5b30f90cab3e..9ee44568dce3 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -209,7 +209,8 @@ enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev)
 		put_ldev(mdev);
 	} else {
 		dev_warn(DEV, "Not fencing peer, I'm not even Consistent myself.\n");
-		return mdev->state.pdsk;
+		nps = mdev->state.pdsk;
+		goto out;
 	}
 
 	r = drbd_khelper(mdev, "fence-peer");
@@ -256,6 +257,14 @@ enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev)
 
 	dev_info(DEV, "fence-peer helper returned %d (%s)\n",
 			(r>>8) & 0xff, ex_to_string);
+
+out:
+	if (mdev->state.susp_fen && nps >= D_UNKNOWN) {
+		/* The handler was not successful... unfreeze here, the
+		   state engine can not unfreeze... */
+		_drbd_request_state(mdev, NS(susp_fen, 0), CS_VERBOSE);
+	}
+
 	return nps;
 }
 
@@ -550,7 +559,7 @@ char *ppsize(char *buf, unsigned long long size)
 void drbd_suspend_io(struct drbd_conf *mdev)
 {
 	set_bit(SUSPEND_IO, &mdev->flags);
-	if (mdev->state.susp)
+	if (is_susp(mdev->state))
 		return;
 	wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt));
 }
@@ -1016,7 +1025,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
 
 	drbd_suspend_io(mdev);
 	/* also wait for the last barrier ack. */
-	wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_pending_cnt) || mdev->state.susp);
+	wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_pending_cnt) || is_susp(mdev->state));
 	/* and for any other previously queued work */
 	drbd_flush_workqueue(mdev);
 
@@ -1114,8 +1123,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
 		clear_bit(CRASHED_PRIMARY, &mdev->flags);
 
 	if (drbd_md_test_flag(mdev->ldev, MDF_PRIMARY_IND) &&
-	    !(mdev->state.role == R_PRIMARY && mdev->state.susp &&
-	      mdev->sync_conf.on_no_data == OND_SUSPEND_IO)) {
+	    !(mdev->state.role == R_PRIMARY && mdev->state.susp_nod)) {
 		set_bit(CRASHED_PRIMARY, &mdev->flags);
 		cp_discovered = 1;
 	}
@@ -1939,7 +1947,7 @@ static int drbd_nl_resume_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
 		drbd_md_sync(mdev);
 	}
 	drbd_suspend_io(mdev);
-	reply->ret_code = drbd_request_state(mdev, NS(susp, 0));
+	reply->ret_code = drbd_request_state(mdev, NS3(susp, 0, susp_nod, 0, susp_fen, 0));
 	if (reply->ret_code == SS_SUCCESS) {
 		if (mdev->state.conn < C_CONNECTED)
 			tl_clear(mdev);
diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c
index a4a4a06908c5..aec8426c1bf6 100644
--- a/drivers/block/drbd/drbd_proc.c
+++ b/drivers/block/drbd/drbd_proc.c
@@ -213,7 +213,7 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
 			   drbd_disk_str(mdev->state.pdsk),
 			   (mdev->net_conf == NULL ? ' ' :
 			    (mdev->net_conf->wire_protocol - DRBD_PROT_A+'A')),
-			   mdev->state.susp ? 's' : 'r',
+			   is_susp(mdev->state) ? 's' : 'r',
 			   mdev->state.aftr_isp ? 'a' : '-',
 			   mdev->state.peer_isp ? 'p' : '-',
 			   mdev->state.user_isp ? 'u' : '-',
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 983e49cbd233..6b69b2f734dc 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -3315,7 +3315,7 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
 	if ((nconn == C_CONNECTED || nconn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING)
 		ns.disk = mdev->new_state_tmp.disk;
 	cs_flags = CS_VERBOSE + (oconn < C_CONNECTED && nconn >= C_CONNECTED ? 0 : CS_HARD);
-	if (ns.pdsk == D_CONSISTENT && ns.susp && nconn == C_CONNECTED && oconn < C_CONNECTED &&
+	if (ns.pdsk == D_CONSISTENT && is_susp(ns) && nconn == C_CONNECTED && oconn < C_CONNECTED &&
 	    test_bit(NEW_CUR_UUID, &mdev->flags)) {
 		/* Do not allow tl_restart(resend) for a rebooted peer. We can only allow this
 		   for temporal network outages! */
@@ -3829,7 +3829,7 @@ static void drbd_disconnect(struct drbd_conf *mdev)
 	kfree(mdev->p_uuid);
 	mdev->p_uuid = NULL;
 
-	if (!mdev->state.susp)
+	if (!is_susp(mdev->state))
 		tl_clear(mdev);
 
 	dev_info(DEV, "Connection closed\n");
@@ -3858,7 +3858,7 @@ static void drbd_disconnect(struct drbd_conf *mdev)
 	if (os.conn == C_DISCONNECTING) {
 		wait_event(mdev->net_cnt_wait, atomic_read(&mdev->net_cnt) == 0);
 
-		if (!mdev->state.susp) {
+		if (!is_susp(mdev->state)) {
 			/* we must not free the tl_hash
 			 * while application io is still on the fly */
 			wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt));
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index af608b39c4e0..9e91a2545fc8 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -287,7 +287,7 @@ static void _req_may_be_done_not_susp(struct drbd_request *req, struct bio_and_e
 {
 	struct drbd_conf *mdev = req->mdev;
 
-	if (!mdev->state.susp)
+	if (!is_susp(mdev->state))
 		_req_may_be_done(req, m);
 }
 
@@ -812,7 +812,7 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio)
 			    (mdev->state.pdsk == D_INCONSISTENT &&
 			     mdev->state.conn >= C_CONNECTED));
 
-	if (!(local || remote) && !mdev->state.susp) {
+	if (!(local || remote) && !is_susp(mdev->state)) {
 		dev_err(DEV, "IO ERROR: neither local nor remote disk\n");
 		goto fail_free_complete;
 	}
@@ -838,7 +838,7 @@ allocate_barrier:
 	/* GOOD, everything prepared, grab the spin_lock */
 	spin_lock_irq(&mdev->req_lock);
 
-	if (mdev->state.susp) {
+	if (is_susp(mdev->state)) {
 		/* If we got suspended, use the retry mechanism of
 		   generic_make_request() to restart processing of this
 		   bio. In the next call to drbd_make_request_26
diff --git a/include/linux/drbd.h b/include/linux/drbd.h
index 89718a39791e..5e72a5d3d48f 100644
--- a/include/linux/drbd.h
+++ b/include/linux/drbd.h
@@ -232,13 +232,17 @@ union drbd_state {
 		unsigned conn:5 ;   /* 17/32	 cstates */
 		unsigned disk:4 ;   /* 8/16	 from D_DISKLESS to D_UP_TO_DATE */
 		unsigned pdsk:4 ;   /* 8/16	 from D_DISKLESS to D_UP_TO_DATE */
-		unsigned susp:1 ;   /* 2/2	 IO suspended  no/yes */
+		unsigned susp:1 ;   /* 2/2	 IO suspended no/yes (by user) */
 		unsigned aftr_isp:1 ; /* isp .. imposed sync pause */
 		unsigned peer_isp:1 ;
 		unsigned user_isp:1 ;
-		unsigned _pad:11;   /* 0	 unused */
+		unsigned susp_nod:1 ; /* IO suspended because no data */
+		unsigned susp_fen:1 ; /* IO suspended because fence peer handler runs*/
+		unsigned _pad:9;   /* 0	 unused */
 #elif defined(__BIG_ENDIAN_BITFIELD)
-		unsigned _pad:11;   /* 0	 unused */
+		unsigned _pad:9;
+		unsigned susp_fen:1 ;
+		unsigned susp_nod:1 ;
 		unsigned user_isp:1 ;
 		unsigned peer_isp:1 ;
 		unsigned aftr_isp:1 ; /* isp .. imposed sync pause */
-- 
cgit v1.2.3


From 8979d9c9e0bc8e54cf5bd7a89abb2145f087b5e1 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Tue, 14 Sep 2010 15:56:29 +0200
Subject: drbd: protocol compatibility for maximum packet sizes

Two missing corner cases to the "maximum packet size" handshake.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_nl.c       | 18 +++++++++++++++---
 drivers/block/drbd/drbd_receiver.c |  2 ++
 2 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 9ee44568dce3..9ae33a5bcf66 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -861,6 +861,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
 	struct inode *inode, *inode2;
 	struct lru_cache *resync_lru = NULL;
 	union drbd_state ns, os;
+	unsigned int max_seg_s;
 	int rv;
 	int cp_discovered = 0;
 	int logical_block_size;
@@ -1133,9 +1134,20 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
 	mdev->read_cnt = 0;
 	mdev->writ_cnt = 0;
 
-	drbd_setup_queue_param(mdev, mdev->state.conn == C_CONNECTED &&
-			       mdev->agreed_pro_version < 95 ?
-			       DRBD_MAX_SIZE_H80_PACKET : DRBD_MAX_SEGMENT_SIZE);
+	max_seg_s = DRBD_MAX_SEGMENT_SIZE;
+	if (mdev->state.conn == C_CONNECTED) {
+		/* We are Primary, Connected, and now attach a new local
+		 * backing store. We must not increase the user visible maximum
+		 * bio size on this device to something the peer may not be
+		 * able to handle. */
+		if (mdev->agreed_pro_version < 94)
+			max_seg_s = queue_max_segment_size(mdev->rq_queue);
+		else if (mdev->agreed_pro_version == 94)
+			max_seg_s = DRBD_MAX_SIZE_H80_PACKET;
+		/* else: drbd 8.3.9 and later, stay with default */
+	}
+
+	drbd_setup_queue_param(mdev, max_seg_s);
 
 	/* If I am currently not R_PRIMARY,
 	 * but meta data primary indicator is set,
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 6b69b2f734dc..9da32ac62c22 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -3088,6 +3088,8 @@ static int receive_sizes(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
 
 		if (mdev->agreed_pro_version < 94)
 			max_seg_s = be32_to_cpu(p->max_segment_size);
+		else if (mdev->agreed_pro_version == 94)
+			max_seg_s = DRBD_MAX_SIZE_H80_PACKET;
 		else /* drbd 8.3.8 onwards */
 			max_seg_s = DRBD_MAX_SEGMENT_SIZE;
 
-- 
cgit v1.2.3


From f65363cfa05fe60874030461a0eeb84b7e60cba4 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Tue, 14 Sep 2010 20:14:09 +0200
Subject: drbd: fix possible access after free

If we release the page pointed to by md_io_tmpp, we need to zero out the
pointer, too, as that may be used later to decide whether we need to
allocate a new page again.

Impact: a previously freed page may be used and clobbered.  Depending on
what that particular page is being used for meanwhile, this may result
in silent data corruption of completely unrelated things.

Only of concern on devices with logical_block_size != 512 byte,
if you re-attach after becoming diskless once.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_main.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 4f33714fb3cd..e1f2c2e54f5f 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -1407,8 +1407,10 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
 			drbd_free_bc(mdev->ldev);
 			mdev->ldev = NULL;);
 
-		if (mdev->md_io_tmpp)
+		if (mdev->md_io_tmpp) {
 			__free_page(mdev->md_io_tmpp);
+			mdev->md_io_tmpp = NULL;
+		}
 	}
 
 	/* Disks got bigger while they were detached */
-- 
cgit v1.2.3


From 00b425377d60e67e86721d4ce6d7cbf131a5d0fd Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Tue, 5 Oct 2010 11:19:39 +0200
Subject: drbd: Allow larger values for c-fill-target.

Connections through a compressing proxy might have more bits
on the fly. 500MByte instead of 50MByte

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_int.h | 2 +-
 include/linux/drbd_limits.h   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index ff7fffa00dac..1680939de101 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -348,7 +348,7 @@ struct p_header80 {
 struct p_header95 {
 	u16	  magic;	/* use DRBD_MAGIC_BIG here */
 	u16	  command;
-	u32	  length;
+	u32	  length;	/* Use only 24 bits of that. Ignore the highest 8 bit. */
 	u8	  payload[0];
 } __packed;
 
diff --git a/include/linux/drbd_limits.h b/include/linux/drbd_limits.h
index 0b24ded6fffd..4ac33f34b77e 100644
--- a/include/linux/drbd_limits.h
+++ b/include/linux/drbd_limits.h
@@ -143,7 +143,7 @@
 #define DRBD_C_DELAY_TARGET_DEF 10
 
 #define DRBD_C_FILL_TARGET_MIN 0
-#define DRBD_C_FILL_TARGET_MAX 100000
+#define DRBD_C_FILL_TARGET_MAX (1<<20) /* 500MByte in sec */
 #define DRBD_C_FILL_TARGET_DEF 0 /* By default disabled -> controlled by delay_target */
 
 #define DRBD_C_MAX_RATE_MIN     250 /* kByte/sec */
-- 
cgit v1.2.3


From f10f262349762c96ab247b6108af3a30b52b6f5a Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Tue, 5 Oct 2010 16:50:17 +0200
Subject: drbd: Fixed a stupid copy and paste error

This caused rs_planed to be not in sync with the content of the fifo.
That in turn could cause that the resync comes to a complete halt.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_worker.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 83ba63ab2358..166b51ec7b67 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -421,7 +421,7 @@ static void fifo_set(struct fifo_buffer *fb, int value)
 	int i;
 
 	for (i = 0; i < fb->size; i++)
-		fb->values[i] += value;
+		fb->values[i] = value;
 }
 
 static int fifo_push(struct fifo_buffer *fb, int value)
-- 
cgit v1.2.3


From 004352fa60345e499379af310de73a2df1a5762a Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Tue, 5 Oct 2010 20:13:58 +0200
Subject: drbd: Fix regression in recv_bm_rle_bits (compressed bitmap)

We used to be16_to_cpu the length field in our received packet header.
drbd commit 17c854fea474a5eb3cfa12e4fb019e46debbc4ec
    drbd: receiving of big packets, for payloads between 64kByte and 4GByte
changed this, but forgot to adjust a few places where we relied on
h->length being in native byte order.

This broke the receiving side of the RLE compressed bitmap exchange.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_receiver.c | 27 +++++++++++++++------------
 1 file changed, 15 insertions(+), 12 deletions(-)

diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 9da32ac62c22..b5d3fa6c7a8b 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -970,9 +970,10 @@ static int drbd_recv_header(struct drbd_conf *mdev, enum drbd_packets *cmd, unsi
 		*cmd = be16_to_cpu(h->h95.command);
 		*packet_size = be32_to_cpu(h->h95.length);
 	} else {
-		dev_err(DEV, "magic?? on data m: 0x%lx c: %d l: %d\n",
-		    (long)be32_to_cpu(h->h80.magic),
-		    h->h80.command, h->h80.length);
+		dev_err(DEV, "magic?? on data m: 0x%08x c: %d l: %d\n",
+		    be32_to_cpu(h->h80.magic),
+		    be16_to_cpu(h->h80.command),
+		    be16_to_cpu(h->h80.length));
 		return FALSE;
 	}
 	mdev->last_received = jiffies;
@@ -3421,7 +3422,7 @@ recv_bm_rle_bits(struct drbd_conf *mdev,
 	u64 tmp;
 	unsigned long s = c->bit_offset;
 	unsigned long e;
-	int len = p->head.length - (sizeof(*p) - sizeof(p->head));
+	int len = be16_to_cpu(p->head.length) - (sizeof(*p) - sizeof(p->head));
 	int toggle = DCBP_get_start(p);
 	int have;
 	int bits;
@@ -3570,8 +3571,8 @@ static int receive_bitmap(struct drbd_conf *mdev, enum drbd_packets cmd, unsigne
 			memcpy(p, h, sizeof(*h));
 			if (drbd_recv(mdev, p->head.payload, data_size) != data_size)
 				goto out;
-			if (p->head.length <= (sizeof(*p) - sizeof(p->head))) {
-				dev_err(DEV, "ReportCBitmap packet too small (l:%u)\n", p->head.length);
+			if (data_size <= (sizeof(*p) - sizeof(p->head))) {
+				dev_err(DEV, "ReportCBitmap packet too small (l:%u)\n", data_size);
 				return FAILED;
 			}
 			ret = decode_bitmap_c(mdev, p, &c);
@@ -4582,17 +4583,19 @@ int drbd_asender(struct drbd_thread *thi)
 
 		if (received == expect && cmd == NULL) {
 			if (unlikely(h->magic != BE_DRBD_MAGIC)) {
-				dev_err(DEV, "magic?? on meta m: 0x%lx c: %d l: %d\n",
-				    (long)be32_to_cpu(h->magic),
-				    h->command, h->length);
+				dev_err(DEV, "magic?? on meta m: 0x%08x c: %d l: %d\n",
+				    be32_to_cpu(h->magic),
+				    be16_to_cpu(h->command),
+				    be16_to_cpu(h->length));
 				goto reconnect;
 			}
 			cmd = get_asender_cmd(be16_to_cpu(h->command));
 			len = be16_to_cpu(h->length);
 			if (unlikely(cmd == NULL)) {
-				dev_err(DEV, "unknown command?? on meta m: 0x%lx c: %d l: %d\n",
-				    (long)be32_to_cpu(h->magic),
-				    h->command, h->length);
+				dev_err(DEV, "unknown command?? on meta m: 0x%08x c: %d l: %d\n",
+				    be32_to_cpu(h->magic),
+				    be16_to_cpu(h->command),
+				    be16_to_cpu(h->length));
 				goto disconnect;
 			}
 			expect = cmd->pkt_size;
-- 
cgit v1.2.3


From 2b2bf2148fd46874ee72a877c951e5c6675d1caa Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Wed, 6 Oct 2010 11:46:55 +0200
Subject: drbd: drbd_send_ack_dp must not rely on header information

drbd commit 17c854fea474a5eb3cfa12e4fb019e46debbc4ec
drbd: receiving of big packets, for payloads between 64kByte and 4GByte
introduced a new on-the-wire packet header format.  We must no longer
assume either format, but use the result of whatever drbd_recv_header
has decoded.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_int.h      |  2 +-
 drivers/block/drbd/drbd_main.c     | 11 ++++++-----
 drivers/block/drbd/drbd_receiver.c |  4 ++--
 3 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 1680939de101..8ab6fed39539 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -1230,7 +1230,7 @@ extern int drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
 extern int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
 			struct p_block_req *rp);
 extern int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
-			struct p_data *dp);
+			struct p_data *dp, int data_size);
 extern int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
 			    sector_t sector, int blksize, u64 block_id);
 extern int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index e1f2c2e54f5f..accb37d1215f 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -2193,13 +2193,14 @@ static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
 	return ok;
 }
 
+/* dp->sector and dp->block_id already/still in network byte order,
+ * data_size is payload size according to dp->head,
+ * and may need to be corrected for digest size. */
 int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
-		     struct p_data *dp)
+		     struct p_data *dp, int data_size)
 {
-	const int header_size = sizeof(struct p_data)
-			      - sizeof(struct p_header80);
-	int data_size  = ((struct p_header80 *)dp)->length - header_size;
-
+	data_size -= (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
+		crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
 	return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size),
 			      dp->block_id);
 }
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index b5d3fa6c7a8b..45a2d610ca1d 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -1630,7 +1630,7 @@ static int receive_RSDataReply(struct drbd_conf *mdev, enum drbd_packets cmd, un
 
 		ok = drbd_drain_block(mdev, data_size);
 
-		drbd_send_ack_dp(mdev, P_NEG_ACK, p);
+		drbd_send_ack_dp(mdev, P_NEG_ACK, p, data_size);
 	}
 
 	atomic_add(data_size >> 9, &mdev->rs_sect_in);
@@ -1787,7 +1787,7 @@ static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
 			mdev->peer_seq++;
 		spin_unlock(&mdev->peer_seq_lock);
 
-		drbd_send_ack_dp(mdev, P_NEG_ACK, p);
+		drbd_send_ack_dp(mdev, P_NEG_ACK, p, data_size);
 		atomic_inc(&mdev->current_epoch->epoch_size);
 		return drbd_drain_block(mdev, data_size);
 	}
-- 
cgit v1.2.3


From 4ac4aadacb5badc45679cd94cd362132daafe8c4 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Thu, 22 Jul 2010 17:39:26 +0200
Subject: drbd: preparation commit, using full state in receive_state()

no functional change, just using full state instead of just the .conn
part of it for comparisons.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_receiver.c | 39 ++++++++++++++++++--------------------
 1 file changed, 18 insertions(+), 21 deletions(-)

diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 45a2d610ca1d..585049dfb711 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -3242,8 +3242,7 @@ static int receive_req_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsi
 static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
 {
 	struct p_state *p = &mdev->data.rbuf.state;
-	enum drbd_conns nconn, oconn;
-	union drbd_state ns, peer_state;
+	union drbd_state os, ns, peer_state;
 	enum drbd_disk_state real_peer_disk;
 	enum chg_state_flags cs_flags;
 	int rv;
@@ -3258,38 +3257,38 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
 
 	spin_lock_irq(&mdev->req_lock);
  retry:
-	oconn = nconn = mdev->state.conn;
+	os = ns = mdev->state;
 	spin_unlock_irq(&mdev->req_lock);
 
-	if (nconn == C_WF_REPORT_PARAMS)
-		nconn = C_CONNECTED;
+	if (ns.conn == C_WF_REPORT_PARAMS)
+		ns.conn = C_CONNECTED;
 
 	if (mdev->p_uuid && peer_state.disk >= D_NEGOTIATING &&
 	    get_ldev_if_state(mdev, D_NEGOTIATING)) {
 		int cr; /* consider resync */
 
 		/* if we established a new connection */
-		cr  = (oconn < C_CONNECTED);
+		cr  = (os.conn < C_CONNECTED);
 		/* if we had an established connection
 		 * and one of the nodes newly attaches a disk */
-		cr |= (oconn == C_CONNECTED &&
+		cr |= (os.conn == C_CONNECTED &&
 		       (peer_state.disk == D_NEGOTIATING ||
-			mdev->state.disk == D_NEGOTIATING));
+			os.disk == D_NEGOTIATING));
 		/* if we have both been inconsistent, and the peer has been
 		 * forced to be UpToDate with --overwrite-data */
 		cr |= test_bit(CONSIDER_RESYNC, &mdev->flags);
 		/* if we had been plain connected, and the admin requested to
 		 * start a sync by "invalidate" or "invalidate-remote" */
-		cr |= (oconn == C_CONNECTED &&
+		cr |= (os.conn == C_CONNECTED &&
 				(peer_state.conn >= C_STARTING_SYNC_S &&
 				 peer_state.conn <= C_WF_BITMAP_T));
 
 		if (cr)
-			nconn = drbd_sync_handshake(mdev, peer_state.role, real_peer_disk);
+			ns.conn = drbd_sync_handshake(mdev, peer_state.role, real_peer_disk);
 
 		put_ldev(mdev);
-		if (nconn == C_MASK) {
-			nconn = C_CONNECTED;
+		if (ns.conn == C_MASK) {
+			ns.conn = C_CONNECTED;
 			if (mdev->state.disk == D_NEGOTIATING) {
 				drbd_force_state(mdev, NS(disk, D_DISKLESS));
 			} else if (peer_state.disk == D_NEGOTIATING) {
@@ -3299,7 +3298,7 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
 			} else {
 				if (test_and_clear_bit(CONN_DRY_RUN, &mdev->flags))
 					return FALSE;
-				D_ASSERT(oconn == C_WF_REPORT_PARAMS);
+				D_ASSERT(os.conn == C_WF_REPORT_PARAMS);
 				drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
 				return FALSE;
 			}
@@ -3307,18 +3306,16 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
 	}
 
 	spin_lock_irq(&mdev->req_lock);
-	if (mdev->state.conn != oconn)
+	if (mdev->state.i != os.i)
 		goto retry;
 	clear_bit(CONSIDER_RESYNC, &mdev->flags);
-	ns.i = mdev->state.i;
-	ns.conn = nconn;
 	ns.peer = peer_state.role;
 	ns.pdsk = real_peer_disk;
 	ns.peer_isp = (peer_state.aftr_isp | peer_state.user_isp);
-	if ((nconn == C_CONNECTED || nconn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING)
+	if ((ns.conn == C_CONNECTED || ns.conn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING)
 		ns.disk = mdev->new_state_tmp.disk;
-	cs_flags = CS_VERBOSE + (oconn < C_CONNECTED && nconn >= C_CONNECTED ? 0 : CS_HARD);
-	if (ns.pdsk == D_CONSISTENT && is_susp(ns) && nconn == C_CONNECTED && oconn < C_CONNECTED &&
+	cs_flags = CS_VERBOSE + (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED ? 0 : CS_HARD);
+	if (ns.pdsk == D_CONSISTENT && is_susp(ns) && ns.conn == C_CONNECTED && os.conn < C_CONNECTED &&
 	    test_bit(NEW_CUR_UUID, &mdev->flags)) {
 		/* Do not allow tl_restart(resend) for a rebooted peer. We can only allow this
 		   for temporal network outages! */
@@ -3339,8 +3336,8 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
 		return FALSE;
 	}
 
-	if (oconn > C_WF_REPORT_PARAMS) {
-		if (nconn > C_CONNECTED && peer_state.conn <= C_CONNECTED &&
+	if (os.conn > C_WF_REPORT_PARAMS) {
+		if (ns.conn > C_CONNECTED && peer_state.conn <= C_CONNECTED &&
 		    peer_state.disk != D_NEGOTIATING ) {
 			/* we want resync, peer has not yet decided to sync... */
 			/* Nowadays only used when forcing a node into primary role and
-- 
cgit v1.2.3


From e9ef7bb6f9696471ddddf0065afac8b435e5d051 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Thu, 7 Oct 2010 15:55:39 +0200
Subject: drbd: allow for explicit resync-finished notifications

Preparation patch so more drbd_send_state() usage on the peer
will not confuse drbd in receive_state().

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_receiver.c | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 585049dfb711..990fe01afa50 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -3260,6 +3260,40 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
 	os = ns = mdev->state;
 	spin_unlock_irq(&mdev->req_lock);
 
+	/* peer says his disk is uptodate, while we think it is inconsistent,
+	 * and this happens while we think we have a sync going on. */
+	if (os.pdsk == D_INCONSISTENT && real_peer_disk == D_UP_TO_DATE &&
+	    os.conn > C_CONNECTED && os.disk == D_UP_TO_DATE) {
+		/* If we are (becoming) SyncSource, but peer is still in sync
+		 * preparation, ignore its uptodate-ness to avoid flapping, it
+		 * will change to inconsistent once the peer reaches active
+		 * syncing states.
+		 * It may have changed syncer-paused flags, however, so we
+		 * cannot ignore this completely. */
+		if (peer_state.conn > C_CONNECTED &&
+		    peer_state.conn < C_SYNC_SOURCE)
+			real_peer_disk = D_INCONSISTENT;
+
+		/* if peer_state changes to connected at the same time,
+		 * it explicitly notifies us that it finished resync.
+		 * Maybe we should finish it up, too? */
+		else if (os.conn >= C_SYNC_SOURCE &&
+			 peer_state.conn == C_CONNECTED) {
+			if (drbd_bm_total_weight(mdev) <= mdev->rs_failed)
+				drbd_resync_finished(mdev);
+			return TRUE;
+		}
+	}
+
+	/* peer says his disk is inconsistent, while we think it is uptodate,
+	 * and this happens while the peer still thinks we have a sync going on,
+	 * but we think we are already done with the sync.
+	 * We ignore this to avoid flapping pdsk.
+	 * This should not happen, if the peer is a recent version of drbd. */
+	if (os.pdsk == D_UP_TO_DATE && real_peer_disk == D_INCONSISTENT &&
+	    os.conn == C_CONNECTED && peer_state.conn > C_SYNC_SOURCE)
+		real_peer_disk = D_UP_TO_DATE;
+
 	if (ns.conn == C_WF_REPORT_PARAMS)
 		ns.conn = C_CONNECTED;
 
-- 
cgit v1.2.3


From af85e8e83d160f72a10e4467852646ac08614260 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Thu, 7 Oct 2010 16:07:55 +0200
Subject: drbd: fix for spurious fullsync (uuids rotated too fast)

If it was an "empty" resync, the SyncSource may have already "finished"
the resync and rotated the UUIDs, before noticing the connection loss
(and generating a new uuid, if Primary, rotating again), while the
SyncTarget did not change its uuids at all, or only got to the previous
sync-uuid.
This would then again lead to a full sync on next handshake
(see also Bug #251).

Fix:
Use explicit resync finished notification even for empty resyncs,
do not finish an empty resync implicitly on the SyncSource.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_main.c   |  5 +++++
 drivers/block/drbd/drbd_worker.c | 42 +++++++++++++++++++++++++++++-----------
 2 files changed, 36 insertions(+), 11 deletions(-)

diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index accb37d1215f..63f45d730f3f 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -1426,6 +1426,11 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
 	    (os.user_isp && !ns.user_isp))
 		resume_next_sg(mdev);
 
+	/* sync target done with resync.  Explicitly notify peer, even though
+	 * it should (at least for non-empty resyncs) already know itself. */
+	if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED)
+		drbd_send_state(mdev);
+
 	/* free tl_hash if we Got thawed and are C_STANDALONE */
 	if (ns.conn == C_STANDALONE && !is_susp(ns) && mdev->tl_hash)
 		drbd_free_tl_hash(mdev);
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 166b51ec7b67..88be45ad84ed 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -522,6 +522,12 @@ int w_make_resync_request(struct drbd_conf *mdev,
 		dev_err(DEV, "%s in w_make_resync_request\n",
 			drbd_conn_str(mdev->state.conn));
 
+	if (mdev->rs_total == 0) {
+		/* empty resync? */
+		drbd_resync_finished(mdev);
+		return 1;
+	}
+
 	if (!get_ldev(mdev)) {
 		/* Since we only need to access mdev->rsync a
 		   get_ldev_if_state(mdev,D_FAILED) would be sufficient, but
@@ -768,6 +774,14 @@ static int w_resync_finished(struct drbd_conf *mdev, struct drbd_work *w, int ca
 	return 1;
 }
 
+static void ping_peer(struct drbd_conf *mdev)
+{
+	clear_bit(GOT_PING_ACK, &mdev->flags);
+	request_ping(mdev);
+	wait_event(mdev->misc_wait,
+		   test_bit(GOT_PING_ACK, &mdev->flags) || mdev->state.conn < C_CONNECTED);
+}
+
 int drbd_resync_finished(struct drbd_conf *mdev)
 {
 	unsigned long db, dt, dbdt;
@@ -807,6 +821,8 @@ int drbd_resync_finished(struct drbd_conf *mdev)
 	if (!get_ldev(mdev))
 		goto out;
 
+	ping_peer(mdev);
+
 	spin_lock_irq(&mdev->req_lock);
 	os = mdev->state;
 
@@ -1420,14 +1436,6 @@ int drbd_alter_sa(struct drbd_conf *mdev, int na)
 	return retcode;
 }
 
-static void ping_peer(struct drbd_conf *mdev)
-{
-	clear_bit(GOT_PING_ACK, &mdev->flags);
-	request_ping(mdev);
-	wait_event(mdev->misc_wait,
-		   test_bit(GOT_PING_ACK, &mdev->flags) || mdev->state.conn < C_CONNECTED);
-}
-
 /**
  * drbd_start_resync() - Start the resync process
  * @mdev:	DRBD device.
@@ -1527,9 +1535,21 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
 		     (unsigned long) mdev->rs_total << (BM_BLOCK_SHIFT-10),
 		     (unsigned long) mdev->rs_total);
 
-		if (mdev->rs_total == 0) {
-			/* Peer still reachable? Beware of failing before-resync-target handlers! */
-			ping_peer(mdev);
+		if (mdev->agreed_pro_version < 95 && mdev->rs_total == 0) {
+			/* This still has a race (about when exactly the peers
+			 * detect connection loss) that can lead to a full sync
+			 * on next handshake. In 8.3.9 we fixed this with explicit
+			 * resync-finished notifications, but the fix
+			 * introduces a protocol change.  Sleeping for some
+			 * time longer than the ping interval + timeout on the
+			 * SyncSource, to give the SyncTarget the chance to
+			 * detect connection loss, then waiting for a ping
+			 * response (implicit in drbd_resync_finished) reduces
+			 * the race considerably, but does not solve it. */
+			if (side == C_SYNC_SOURCE)
+				schedule_timeout_interruptible(
+					mdev->net_conf->ping_int * HZ +
+					mdev->net_conf->ping_timeo*HZ/9);
 			drbd_resync_finished(mdev);
 		}
 
-- 
cgit v1.2.3


From 22cc37a943832c948808884604ec6f5ff2594c1d Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Tue, 14 Sep 2010 20:40:41 +0200
Subject: drbd: fix unlikely access after free and list corruption

Various cleanup paths have been incomplete, for the very unlikely case
that we cannot allocate enough bios from process context when submitting
on behalf of the peer or resync process.

Never observed.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_receiver.c | 25 +++++++++++++++++++++++++
 drivers/block/drbd/drbd_worker.c   |  7 +++++++
 include/linux/drbd.h               |  4 ++--
 3 files changed, 34 insertions(+), 2 deletions(-)

diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 990fe01afa50..71775a9de21d 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -1573,6 +1573,13 @@ static int recv_resync_read(struct drbd_conf *mdev, sector_t sector, int data_si
 	if (drbd_submit_ee(mdev, e, WRITE, DRBD_FAULT_RS_WR) == 0)
 		return TRUE;
 
+	/* drbd_submit_ee currently fails for one reason only:
+	 * not being able to allocate enough bios.
+	 * Is dropping the connection going to help? */
+	spin_lock_irq(&mdev->req_lock);
+	list_del(&e->w.list);
+	spin_unlock_irq(&mdev->req_lock);
+
 	drbd_free_ee(mdev, e);
 fail:
 	put_ldev(mdev);
@@ -1998,6 +2005,16 @@ static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
 	if (drbd_submit_ee(mdev, e, rw, DRBD_FAULT_DT_WR) == 0)
 		return TRUE;
 
+	/* drbd_submit_ee currently fails for one reason only:
+	 * not being able to allocate enough bios.
+	 * Is dropping the connection going to help? */
+	spin_lock_irq(&mdev->req_lock);
+	list_del(&e->w.list);
+	hlist_del_init(&e->colision);
+	spin_unlock_irq(&mdev->req_lock);
+	if (e->flags & EE_CALL_AL_COMPLETE_IO)
+		drbd_al_complete_io(mdev, e->sector);
+
 out_interrupted:
 	/* yes, the epoch_size now is imbalanced.
 	 * but we drop the connection anyways, so we don't have a chance to
@@ -2202,6 +2219,14 @@ submit:
 	if (drbd_submit_ee(mdev, e, READ, fault_type) == 0)
 		return TRUE;
 
+	/* drbd_submit_ee currently fails for one reason only:
+	 * not being able to allocate enough bios.
+	 * Is dropping the connection going to help? */
+	spin_lock_irq(&mdev->req_lock);
+	list_del(&e->w.list);
+	spin_unlock_irq(&mdev->req_lock);
+	/* no drbd_rs_complete_io(), we are dropping the connection anyways */
+
 out_free_e:
 	put_ldev(mdev);
 	drbd_free_ee(mdev, e);
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 88be45ad84ed..f12822d53867 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -387,6 +387,13 @@ static int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size)
 	if (drbd_submit_ee(mdev, e, READ, DRBD_FAULT_RS_RD) == 0)
 		return 0;
 
+	/* drbd_submit_ee currently fails for one reason only:
+	 * not being able to allocate enough bios.
+	 * Is dropping the connection going to help? */
+	spin_lock_irq(&mdev->req_lock);
+	list_del(&e->w.list);
+	spin_unlock_irq(&mdev->req_lock);
+
 	drbd_free_ee(mdev, e);
 defer:
 	put_ldev(mdev);
diff --git a/include/linux/drbd.h b/include/linux/drbd.h
index 5e72a5d3d48f..da7d9bd4f3f0 100644
--- a/include/linux/drbd.h
+++ b/include/linux/drbd.h
@@ -53,10 +53,10 @@
 
 
 extern const char *drbd_buildtag(void);
-#define REL_VERSION "8.3.8.1"
+#define REL_VERSION "8.3.9rc1"
 #define API_VERSION 88
 #define PRO_VERSION_MIN 86
-#define PRO_VERSION_MAX 94
+#define PRO_VERSION_MAX 95
 
 
 enum drbd_io_error_p {
-- 
cgit v1.2.3


From e9e6f3ec535d7b7c9e2ca64ad691e743e7d3c2f0 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Tue, 14 Sep 2010 20:26:27 +0200
Subject: drbd: fix for possible deadlock on IO error during resync

Scenario:

Something (say, flush-147:0) is in drbd_al_begin_io,
holding a local_cnt, waiting for the resync to make progress.

Disk fails, worker in after_state_ch does drbd_rs_cancel_all,
then waits for local_cnt to drop to zero.

flush-147:0 is woken by drbd_rs_cancel_all, needs to write an AL
transaction, and queues that on the worker.

Deadlock.

Fix: do not wait in the worker, have put_ldev() trigger the
state change D_FAILED -> D_DISKLESS when necessary.
put_ldev() cannot do the state change directly, as it may or may not
already hold various spinlocks. We queue a short work instead.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_int.h  |  8 ++++-
 drivers/block/drbd/drbd_main.c | 68 +++++++++++++++++++++++++++++-------------
 2 files changed, 54 insertions(+), 22 deletions(-)

diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 8ab6fed39539..c07c370c4c82 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -852,6 +852,7 @@ enum {
 	BITMAP_IO,		/* suspend application io;
 				   once no more io in flight, start bitmap io */
 	BITMAP_IO_QUEUED,       /* Started bitmap IO */
+	GO_DISKLESS,		/* Disk failed, local_cnt reached zero, we are going diskless */
 	RESYNC_AFTER_NEG,       /* Resync after online grow after the attach&negotiate finished. */
 	NET_CONGESTED,		/* The data socket is congested */
 
@@ -976,6 +977,7 @@ struct drbd_conf {
 	unsigned int ko_count;
 	struct drbd_work  resync_work,
 			  unplug_work,
+			  go_diskless,
 			  md_sync_work;
 	struct timer_list resync_timer;
 	struct timer_list md_sync_timer;
@@ -1278,6 +1280,7 @@ extern void drbd_queue_bitmap_io(struct drbd_conf *mdev,
 extern int drbd_bmio_set_n_write(struct drbd_conf *mdev);
 extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev);
 extern int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why);
+extern void drbd_go_diskless(struct drbd_conf *mdev);
 
 
 /* Meta data layout
@@ -2123,8 +2126,11 @@ static inline void put_ldev(struct drbd_conf *mdev)
 	int i = atomic_dec_return(&mdev->local_cnt);
 	__release(local);
 	D_ASSERT(i >= 0);
-	if (i == 0)
+	if (i == 0) {
+		if (mdev->state.disk == D_FAILED)
+			drbd_go_diskless(mdev);
 		wake_up(&mdev->misc_wait);
+	}
 }
 
 #ifndef __CHECKER__
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 63f45d730f3f..f89b97466d07 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -77,6 +77,7 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
 static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused);
 static void md_sync_timer_fn(unsigned long data);
 static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused);
+static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused);
 
 MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
 	      "Lars Ellenberg <lars@linbit.com>");
@@ -1363,42 +1364,46 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
 	    os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
 		drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL, "set_n_write from invalidate");
 
+	/* first half of local IO error */
 	if (os.disk > D_FAILED && ns.disk == D_FAILED) {
-		enum drbd_io_error_p eh;
+		enum drbd_io_error_p eh = EP_PASS_ON;
+
+		if (drbd_send_state(mdev))
+			dev_warn(DEV, "Notified peer that my disk is broken.\n");
+		else
+			dev_err(DEV, "Sending state for drbd_io_error() failed\n");
+
+		drbd_rs_cancel_all(mdev);
 
-		eh = EP_PASS_ON;
 		if (get_ldev_if_state(mdev, D_FAILED)) {
 			eh = mdev->ldev->dc.on_io_error;
 			put_ldev(mdev);
 		}
+		if (eh == EP_CALL_HELPER)
+			drbd_khelper(mdev, "local-io-error");
+	}
 
-		drbd_rs_cancel_all(mdev);
-		/* since get_ldev() only works as long as disk>=D_INCONSISTENT,
-		   and it is D_DISKLESS here, local_cnt can only go down, it can
-		   not increase... It will reach zero */
-		wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt));
+
+	/* second half of local IO error handling,
+	 * after local_cnt references have reached zero: */
+	if (os.disk == D_FAILED && ns.disk == D_DISKLESS) {
 		mdev->rs_total = 0;
 		mdev->rs_failed = 0;
 		atomic_set(&mdev->rs_pending_cnt, 0);
-
-		spin_lock_irq(&mdev->req_lock);
-		_drbd_set_state(_NS(mdev, disk, D_DISKLESS), CS_HARD, NULL);
-		spin_unlock_irq(&mdev->req_lock);
-
-		if (eh == EP_CALL_HELPER)
-			drbd_khelper(mdev, "local-io-error");
 	}
 
 	if (os.disk > D_DISKLESS && ns.disk == D_DISKLESS) {
+		int c = atomic_read(&mdev->local_cnt);
 
-		if (os.disk == D_FAILED) /* && ns.disk == D_DISKLESS*/ {
-			if (drbd_send_state(mdev))
-				dev_warn(DEV, "Notified peer that my disk is broken.\n");
-			else
-				dev_err(DEV, "Sending state in drbd_io_error() failed\n");
-		}
+		if (drbd_send_state(mdev))
+			dev_warn(DEV, "Notified peer that I detached my disk.\n");
+		else
+			dev_err(DEV, "Sending state for detach failed\n");
 
-		wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt));
+		if (c != 0) {
+			dev_err(DEV, "Logic bug, local_cnt=%d, but should be 0\n", c);
+			wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt));
+		}
 		lc_destroy(mdev->resync);
 		mdev->resync = NULL;
 		lc_destroy(mdev->act_log);
@@ -2803,11 +2808,13 @@ void drbd_init_set_defaults(struct drbd_conf *mdev)
 	INIT_LIST_HEAD(&mdev->meta.work.q);
 	INIT_LIST_HEAD(&mdev->resync_work.list);
 	INIT_LIST_HEAD(&mdev->unplug_work.list);
+	INIT_LIST_HEAD(&mdev->go_diskless.list);
 	INIT_LIST_HEAD(&mdev->md_sync_work.list);
 	INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
 
 	mdev->resync_work.cb  = w_resync_inactive;
 	mdev->unplug_work.cb  = w_send_write_hint;
+	mdev->go_diskless.cb  = w_go_diskless;
 	mdev->md_sync_work.cb = w_md_sync;
 	mdev->bm_io_work.w.cb = w_bitmap_io;
 	init_timer(&mdev->resync_timer);
@@ -2885,6 +2892,7 @@ void drbd_mdev_cleanup(struct drbd_conf *mdev)
 	D_ASSERT(list_empty(&mdev->meta.work.q));
 	D_ASSERT(list_empty(&mdev->resync_work.list));
 	D_ASSERT(list_empty(&mdev->unplug_work.list));
+	D_ASSERT(list_empty(&mdev->go_diskless.list));
 
 }
 
@@ -3712,6 +3720,24 @@ static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused)
 	return 1;
 }
 
+static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused)
+{
+	D_ASSERT(mdev->state.disk == D_FAILED);
+	D_ASSERT(atomic_read(&mdev->local_cnt) == 0);
+
+	drbd_force_state(mdev, NS(disk, D_DISKLESS));
+
+	clear_bit(GO_DISKLESS, &mdev->flags);
+	return 1;
+}
+
+void drbd_go_diskless(struct drbd_conf *mdev)
+{
+	D_ASSERT(mdev->state.disk == D_FAILED);
+	if (!test_and_set_bit(GO_DISKLESS, &mdev->flags))
+		drbd_queue_work_front(&mdev->data.work, &mdev->go_diskless);
+}
+
 /**
  * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
  * @mdev:	DRBD device.
-- 
cgit v1.2.3


From b18b37befb37810ce50e1a9b0a6206dfe363d827 Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Wed, 13 Oct 2010 15:32:44 +0200
Subject: drbd: Do not log an ASSERT for P_OV_REQUEST packets while C_CONNECTED

This might happen if on the VERIFY_S node the disk gets dropped.
Although this is an cluster wide state transition, the VERIFY_T node,
updates it connection state first. Then the ack packet for the
cluster wide state transition travels back, and the VERIFY_S node
stops to produce the P_OV_REQUEST packets.

There is absolutely nothing wrong with that.

Further, do not log "Can not satisfy peer's..." on the VERIFY_S
node in this case, but pretend that they had equal checksum.

[Bugz 327]

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_receiver.c | 30 ++++++++++++++++++++++--------
 1 file changed, 22 insertions(+), 8 deletions(-)

diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 71775a9de21d..4dc6a8870526 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -2079,7 +2079,7 @@ static int receive_DataRequest(struct drbd_conf *mdev, enum drbd_packets cmd, un
 	const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
 	struct drbd_epoch_entry *e;
 	struct digest_info *di = NULL;
-	int size;
+	int size, verb;
 	unsigned int fault_type;
 	struct p_block_req *p =	&mdev->data.rbuf.block_req;
 
@@ -2098,11 +2098,29 @@ static int receive_DataRequest(struct drbd_conf *mdev, enum drbd_packets cmd, un
 	}
 
 	if (!get_ldev_if_state(mdev, D_UP_TO_DATE)) {
-		if (__ratelimit(&drbd_ratelimit_state))
+		verb = 1;
+		switch (cmd) {
+		case P_DATA_REQUEST:
+			drbd_send_ack_rp(mdev, P_NEG_DREPLY, p);
+			break;
+		case P_RS_DATA_REQUEST:
+		case P_CSUM_RS_REQUEST:
+		case P_OV_REQUEST:
+			drbd_send_ack_rp(mdev, P_NEG_RS_DREPLY , p);
+			break;
+		case P_OV_REPLY:
+			verb = 0;
+			dec_rs_pending(mdev);
+			drbd_send_ack_ex(mdev, P_OV_RESULT, sector, size, ID_IN_SYNC);
+			break;
+		default:
+			dev_err(DEV, "unexpected command (%s) in receive_DataRequest\n",
+				cmdname(cmd));
+		}
+		if (verb && __ratelimit(&drbd_ratelimit_state))
 			dev_err(DEV, "Can not satisfy peer's read request, "
 			    "no local data.\n");
-		drbd_send_ack_rp(mdev, cmd == P_DATA_REQUEST ? P_NEG_DREPLY :
-				 P_NEG_RS_DREPLY , p);
+
 		/* drain possibly payload */
 		return drbd_drain_block(mdev, digest_size);
 	}
@@ -2157,10 +2175,6 @@ static int receive_DataRequest(struct drbd_conf *mdev, enum drbd_packets cmd, un
 		break;
 
 	case P_OV_REQUEST:
-		if (mdev->state.conn >= C_CONNECTED &&
-		    mdev->state.conn != C_VERIFY_T)
-			dev_warn(DEV, "ASSERT FAILED: got P_OV_REQUEST while being %s\n",
-				drbd_conn_str(mdev->state.conn));
 		if (mdev->ov_start_sector == ~(sector_t)0 &&
 		    mdev->agreed_pro_version >= 90) {
 			mdev->ov_start_sector = sector;
-- 
cgit v1.2.3


From 13d42685bec1f012dcbc5d187490eb1d15ec8219 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Wed, 13 Oct 2010 17:37:54 +0200
Subject: drbd: add explicit drbd_md_sync to drbd_resync_finished

As we usually update the generation UUIDs here, we should explicitly
sync them to disk.  So far this has been done only implicitly by related
code paths.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_worker.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index f12822d53867..108d58015cd1 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -922,6 +922,8 @@ out:
 	mdev->rs_paused = 0;
 	mdev->ov_start_sector = 0;
 
+	drbd_md_sync(mdev);
+
 	if (test_and_clear_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags)) {
 		dev_warn(DEV, "Writing the whole bitmap, due to failed kmalloc\n");
 		drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL, "write from resync_finished");
-- 
cgit v1.2.3


From 0f8488e1608b6e30e705460f8110888c645f7f9f Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Wed, 13 Oct 2010 18:19:23 +0200
Subject: drbd: cleanup useless leftover warn/error printk's

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_nl.c       | 3 ---
 drivers/block/drbd/drbd_receiver.c | 4 +---
 2 files changed, 1 insertion(+), 6 deletions(-)

diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 9ae33a5bcf66..87925e97e613 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -780,9 +780,6 @@ void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_seg_s) __mu
 	blk_queue_segment_boundary(q, PAGE_SIZE-1);
 	blk_stack_limits(&q->limits, &b->limits, 0);
 
-	if (b->merge_bvec_fn)
-		dev_warn(DEV, "Backing device's merge_bvec_fn() = %p\n",
-		     b->merge_bvec_fn);
 	dev_info(DEV, "max_segment_size ( = BIO size ) = %u\n", queue_max_segment_size(q));
 
 	if (q->backing_dev_info.ra_pages != b->backing_dev_info.ra_pages) {
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 4dc6a8870526..5a4b6dcd48f2 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -4591,10 +4591,8 @@ int drbd_asender(struct drbd_thread *thi)
 		while (1) {
 			clear_bit(SIGNAL_ASENDER, &mdev->flags);
 			flush_signals(current);
-			if (!drbd_process_done_ee(mdev)) {
-				dev_err(DEV, "process_done_ee() = NOT_OK\n");
+			if (!drbd_process_done_ee(mdev))
 				goto reconnect;
-			}
 			/* to avoid race with newly queued ACKs */
 			set_bit(SIGNAL_ASENDER, &mdev->flags);
 			spin_lock_irq(&mdev->req_lock);
-- 
cgit v1.2.3


From 9d282875d85ebc2b49362310677fc0dcd91b9db9 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Thu, 14 Oct 2010 13:57:07 +0200
Subject: drbd: drop wrong debug asserts, fix recently introduced race

 commit 2372c38caadeaebc68a5ee190782c2a0df01edc3
 drbd: fix for possible deadlock on IO error during resync

introduced a new ASSERT, which turns out to be wrong. Drop it.

Also serialize the state change to D_DISKLESS with the after state
change work of the -> D_FAILED transition, don't open a new race.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_main.c | 28 +++++++++++++++++++---------
 1 file changed, 19 insertions(+), 9 deletions(-)

diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index f89b97466d07..342574f6d92e 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -1393,17 +1393,22 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
 	}
 
 	if (os.disk > D_DISKLESS && ns.disk == D_DISKLESS) {
-		int c = atomic_read(&mdev->local_cnt);
-
+		/* We must still be diskless,
+		 * re-attach has to be serialized with this! */
+		if (mdev->state.disk != D_DISKLESS)
+			dev_err(DEV,
+				"ASSERT FAILED: disk is %s while going diskless\n",
+				drbd_disk_str(mdev->state.disk));
+
+		/* we cannot assert local_cnt == 0 here, as get_ldev_if_state
+		 * will inc/dec it frequently. Since we became D_DISKLESS, no
+		 * one has touched the protected members anymore, though, so we
+		 * are safe to free them here. */
 		if (drbd_send_state(mdev))
 			dev_warn(DEV, "Notified peer that I detached my disk.\n");
 		else
 			dev_err(DEV, "Sending state for detach failed\n");
 
-		if (c != 0) {
-			dev_err(DEV, "Logic bug, local_cnt=%d, but should be 0\n", c);
-			wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt));
-		}
 		lc_destroy(mdev->resync);
 		mdev->resync = NULL;
 		lc_destroy(mdev->act_log);
@@ -3723,8 +3728,10 @@ static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused)
 static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused)
 {
 	D_ASSERT(mdev->state.disk == D_FAILED);
-	D_ASSERT(atomic_read(&mdev->local_cnt) == 0);
-
+	/* we cannot assert local_cnt == 0 here, as get_ldev_if_state will
+	 * inc/dec it frequently. Once we are D_DISKLESS, no one will touch
+	 * the protected members anymore, though, so in the after_state_ch work
+	 * it will be safe to free them. */
 	drbd_force_state(mdev, NS(disk, D_DISKLESS));
 
 	clear_bit(GO_DISKLESS, &mdev->flags);
@@ -3735,7 +3742,10 @@ void drbd_go_diskless(struct drbd_conf *mdev)
 {
 	D_ASSERT(mdev->state.disk == D_FAILED);
 	if (!test_and_set_bit(GO_DISKLESS, &mdev->flags))
-		drbd_queue_work_front(&mdev->data.work, &mdev->go_diskless);
+		drbd_queue_work(&mdev->data.work, &mdev->go_diskless);
+		/* don't drbd_queue_work_front,
+		 * we need to serialize with the after_state_ch work
+		 * of the -> D_FAILED transition. */
 }
 
 /**
-- 
cgit v1.2.3


From 856c50c7b616d50e1a3ccd4ce35f7814650fa594 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Thu, 14 Oct 2010 13:37:40 +0200
Subject: drbd: add some more explicit drbd_md_sync

It sometimes may take a while for the after state change work to be
scheduled, which does drbd_md_sync. At convenient places, we should do
explicit drbd_md_sync to have the new state information on disk as soon
as possible.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_receiver.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 5a4b6dcd48f2..6ec922c623a1 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -3804,6 +3804,9 @@ static void drbdd(struct drbd_conf *mdev)
 	err_out:
 		drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
 	}
+	/* If we leave here, we probably want to update at least the
+	 * "Connected" indicator on stable storage. Do so explicitly here. */
+	drbd_md_sync(mdev);
 }
 
 void drbd_flush_workqueue(struct drbd_conf *mdev)
@@ -4685,10 +4688,12 @@ int drbd_asender(struct drbd_thread *thi)
 	if (0) {
 reconnect:
 		drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE));
+		drbd_md_sync(mdev);
 	}
 	if (0) {
 disconnect:
 		drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+		drbd_md_sync(mdev);
 	}
 	clear_bit(SIGNAL_ASENDER, &mdev->flags);
 
-- 
cgit v1.2.3


From ca0e6098aad127a555ba29d12e0503dbb1577aac Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Thu, 14 Oct 2010 15:01:21 +0200
Subject: drbd: relax the grace period of the md_sync timer again

Consolidate the ifdef's for the debug level, accidentally the used both
DEBUG and DRBD_DEBUG_MD_SYNC.  Default to off.

For production, we can safely reduce the grace period for this timer
again the the value we used to have.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_main.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 342574f6d92e..2e6a07e3848d 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -3565,7 +3565,7 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
  * the meta-data super block. This function sets MD_DIRTY, and starts a
  * timer that ensures that within five seconds you have to call drbd_md_sync().
  */
-#ifdef DRBD_DEBUG_MD_SYNC
+#ifdef DEBUG
 void drbd_md_mark_dirty_(struct drbd_conf *mdev, unsigned int line, const char *func)
 {
 	if (!test_and_set_bit(MD_DIRTY, &mdev->flags)) {
@@ -3578,7 +3578,7 @@ void drbd_md_mark_dirty_(struct drbd_conf *mdev, unsigned int line, const char *
 void drbd_md_mark_dirty(struct drbd_conf *mdev)
 {
 	if (!test_and_set_bit(MD_DIRTY, &mdev->flags))
-		mod_timer(&mdev->md_sync_timer, jiffies + HZ);
+		mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ);
 }
 #endif
 
-- 
cgit v1.2.3


From 2265769531afe267f864111c103b04b4427720b6 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <error27@gmail.com>
Date: Thu, 12 Aug 2010 00:38:45 +0200
Subject: drbd: cleanup: change "<= 0" to "== 0"

dt is unsigned so it's never less than zero.  We are calculating the
elapsed time, and that's never less than zero (unless there is a bug or
we invent time travel).  The comparison here is just to guard against
divide by zero bugs.

Signed-off-by: Dan Carpenter <error27@gmail.com>
Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
---
 drivers/block/drbd/drbd_proc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c
index aec8426c1bf6..ad325c5d0ce1 100644
--- a/drivers/block/drbd/drbd_proc.c
+++ b/drivers/block/drbd/drbd_proc.c
@@ -118,7 +118,7 @@ static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq)
 	/* mean speed since syncer started
 	 * we do account for PausedSync periods */
 	dt = (jiffies - mdev->rs_start - mdev->rs_paused) / HZ;
-	if (dt <= 0)
+	if (dt == 0)
 		dt = 1;
 	db = mdev->rs_total - rs_left;
 	dbdt = Bit2KB(db/dt);
-- 
cgit v1.2.3


From be70e2671b95a8982ff133ebaafff6399ad393d4 Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Thu, 14 Oct 2010 11:58:20 +0200
Subject: dynamic_debug.h: Fix dynamic_dev_dbg() macro if CONFIG_DYNAMIC_DEBUG
 not set

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
---
 include/linux/dynamic_debug.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/dynamic_debug.h b/include/linux/dynamic_debug.h
index 52c0da4bdd18..81bc20e36ec0 100644
--- a/include/linux/dynamic_debug.h
+++ b/include/linux/dynamic_debug.h
@@ -80,7 +80,7 @@ static inline int ddebug_remove_module(const char *mod)
 
 #define dynamic_pr_debug(fmt, ...)					\
 	do { if (0) printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); } while (0)
-#define dynamic_dev_dbg(dev, format, ...)				\
+#define dynamic_dev_dbg(dev, fmt, ...)					\
 	do { if (0) dev_printk(KERN_DEBUG, dev, fmt, ##__VA_ARGS__); } while (0)
 #endif
 
-- 
cgit v1.2.3


From ac7241211ded714873e8dc6d2f7c98ae7ea2cc30 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Thu, 7 Oct 2010 15:18:08 +0200
Subject: drbd: use dynamic_dev_dbg to optionally log uuid changes

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_main.c | 32 +++++++++++++++++++++++++++++++-
 1 file changed, 31 insertions(+), 1 deletion(-)

diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 2e6a07e3848d..bbe3bff2cad6 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -3557,6 +3557,28 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
 	return rv;
 }
 
+static void debug_drbd_uuid(struct drbd_conf *mdev, enum drbd_uuid_index index)
+{
+	static char *uuid_str[UI_EXTENDED_SIZE] = {
+		[UI_CURRENT] = "CURRENT",
+		[UI_BITMAP] = "BITMAP",
+		[UI_HISTORY_START] = "HISTORY_START",
+		[UI_HISTORY_END] = "HISTORY_END",
+		[UI_SIZE] = "SIZE",
+		[UI_FLAGS] = "FLAGS",
+	};
+
+	if (index >= UI_EXTENDED_SIZE) {
+		dev_warn(DEV, " uuid_index >= EXTENDED_SIZE\n");
+		return;
+	}
+
+	dynamic_dev_dbg(DEV, " uuid[%s] now %016llX\n",
+		 uuid_str[index],
+		 (unsigned long long)mdev->ldev->md.uuid[index]);
+}
+
+
 /**
  * drbd_md_mark_dirty() - Mark meta data super block as dirty
  * @mdev:	DRBD device.
@@ -3586,8 +3608,10 @@ static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local)
 {
 	int i;
 
-	for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++)
+	for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++) {
 		mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i];
+		debug_drbd_uuid(mdev, i+1);
+	}
 }
 
 void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
@@ -3602,6 +3626,7 @@ void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
 	}
 
 	mdev->ldev->md.uuid[idx] = val;
+	debug_drbd_uuid(mdev, idx);
 	drbd_md_mark_dirty(mdev);
 }
 
@@ -3611,6 +3636,7 @@ void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
 	if (mdev->ldev->md.uuid[idx]) {
 		drbd_uuid_move_history(mdev);
 		mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx];
+		debug_drbd_uuid(mdev, UI_HISTORY_START);
 	}
 	_drbd_uuid_set(mdev, idx, val);
 }
@@ -3629,6 +3655,7 @@ void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local)
 	dev_info(DEV, "Creating new current UUID\n");
 	D_ASSERT(mdev->ldev->md.uuid[UI_BITMAP] == 0);
 	mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT];
+	debug_drbd_uuid(mdev, UI_BITMAP);
 
 	get_random_bytes(&val, sizeof(u64));
 	_drbd_uuid_set(mdev, UI_CURRENT, val);
@@ -3643,6 +3670,8 @@ void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
 		drbd_uuid_move_history(mdev);
 		mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP];
 		mdev->ldev->md.uuid[UI_BITMAP] = 0;
+		debug_drbd_uuid(mdev, UI_HISTORY_START);
+		debug_drbd_uuid(mdev, UI_BITMAP);
 	} else {
 		if (mdev->ldev->md.uuid[UI_BITMAP])
 			dev_warn(DEV, "bm UUID already set");
@@ -3650,6 +3679,7 @@ void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
 		mdev->ldev->md.uuid[UI_BITMAP] = val;
 		mdev->ldev->md.uuid[UI_BITMAP] &= ~((u64)1);
 
+		debug_drbd_uuid(mdev, UI_BITMAP);
 	}
 	drbd_md_mark_dirty(mdev);
 }
-- 
cgit v1.2.3


From 5dbfe7aedf54aa7f62fd659e34371d4ea0e7bffe Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Fri, 15 Oct 2010 09:52:46 +0200
Subject: drbd: add race-breaker to drbd_go_diskless

This adds a necessary race breaker to these commits:
    drbd: fix for possible deadlock on IO error during resync
    drbd: drop wrong debug asserts, fix recently introduced race

What we do is get a refcount, check the state, then depending on the
state and the requested minimum disk state, either hold it (success),
or give it back immediately (failed "try lock").

Some code paths (flushing of drbd metadata) may still grab and hold a
refcount even if we are D_FAILED (application IO won't).
So even if we hit local_cnt == 0 once after being D_FAILED,
we still need to wait for that again after we changed to D_DISKLESS.
Once local_cnt reaches 0 while we are D_DISKLESS, we can be sure that
no one will look at the protected members anymore, so only then is it
safe to free them.

We cannot easily convert to standard locking primitives here, as we want
to be able to use it in atomic context (we always do a "try lock"),
as well as hold references for a "long time" (from IO submission to
completion callback).

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_main.c | 3 +++
 include/linux/drbd.h           | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index bbe3bff2cad6..8bfedc7164fa 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -3763,6 +3763,9 @@ static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused
 	 * the protected members anymore, though, so in the after_state_ch work
 	 * it will be safe to free them. */
 	drbd_force_state(mdev, NS(disk, D_DISKLESS));
+	/* We need to wait for return of references checked out while we still
+	 * have been D_FAILED, though (drbd_md_sync, bitmap io). */
+	wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt));
 
 	clear_bit(GO_DISKLESS, &mdev->flags);
 	return 1;
diff --git a/include/linux/drbd.h b/include/linux/drbd.h
index da7d9bd4f3f0..9b2a0158f399 100644
--- a/include/linux/drbd.h
+++ b/include/linux/drbd.h
@@ -53,7 +53,7 @@
 
 
 extern const char *drbd_buildtag(void);
-#define REL_VERSION "8.3.9rc1"
+#define REL_VERSION "8.3.9rc2"
 #define API_VERSION 88
 #define PRO_VERSION_MIN 86
 #define PRO_VERSION_MAX 95
-- 
cgit v1.2.3


From e817bf3f68f55e7307c3e9abe5f32d0c05c83988 Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen" <martin.petersen@oracle.com>
Date: Fri, 15 Oct 2010 15:49:18 +0200
Subject: block: Fix double free in blk_integrity_unregister

Commit 3839e4b introduced a kobject_put but failed to remove the
kmem_cache_free beneath it, leading to a double free.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-integrity.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index 885cbb59967e..54bcba6c02a7 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -432,7 +432,6 @@ void blk_integrity_unregister(struct gendisk *disk)
 	kobject_uevent(&bi->kobj, KOBJ_REMOVE);
 	kobject_del(&bi->kobj);
 	kobject_put(&bi->kobj);
-	kmem_cache_free(integrity_cachep, bi);
 	disk->integrity = NULL;
 }
 EXPORT_SYMBOL(blk_integrity_unregister);
-- 
cgit v1.2.3


From 495d2b3883682fcd1c3dee3a45e38fd00154ae25 Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen" <martin.petersen@oracle.com>
Date: Fri, 15 Oct 2010 15:49:20 +0200
Subject: block: Make the integrity mapped property a bio flag

Previously we tracked whether the integrity metadata had been remapped
using a request flag. This was fine for low-level retries. However, if
an I/O was redriven by upper layers we would end up remapping again,
causing the retry to fail.

Deprecate the REQ_INTEGRITY flag and introduce BIO_MAPPED_INTEGRITY
which enables filesystems to notify lower layers that the bio in
question has already been remapped.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 drivers/scsi/sd_dif.c     | 11 ++++++-----
 include/linux/blk_types.h |  3 +--
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/scsi/sd_dif.c b/drivers/scsi/sd_dif.c
index 84be62149c6c..0cb39ff21171 100644
--- a/drivers/scsi/sd_dif.c
+++ b/drivers/scsi/sd_dif.c
@@ -375,21 +375,20 @@ int sd_dif_prepare(struct request *rq, sector_t hw_sector, unsigned int sector_s
 	unsigned int i, j;
 	u32 phys, virt;
 
-	/* Already remapped? */
-	if (rq->cmd_flags & REQ_INTEGRITY)
-		return 0;
-
 	sdkp = rq->bio->bi_bdev->bd_disk->private_data;
 
 	if (sdkp->protection_type == SD_DIF_TYPE3_PROTECTION)
 		return 0;
 
-	rq->cmd_flags |= REQ_INTEGRITY;
 	phys = hw_sector & 0xffffffff;
 
 	__rq_for_each_bio(bio, rq) {
 		struct bio_vec *iv;
 
+		/* Already remapped? */
+		if (bio_flagged(bio, BIO_MAPPED_INTEGRITY))
+			break;
+
 		virt = bio->bi_integrity->bip_sector & 0xffffffff;
 
 		bip_for_each_vec(iv, bio->bi_integrity, i) {
@@ -408,6 +407,8 @@ int sd_dif_prepare(struct request *rq, sector_t hw_sector, unsigned int sector_s
 
 			kunmap_atomic(sdt, KM_USER0);
 		}
+
+		bio->bi_flags |= BIO_MAPPED_INTEGRITY;
 	}
 
 	return 0;
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 10a0c291b55a..d36629620a4f 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -97,6 +97,7 @@ struct bio {
 #define BIO_NULL_MAPPED 9	/* contains invalid user pages */
 #define BIO_FS_INTEGRITY 10	/* fs owns integrity data, not block layer */
 #define BIO_QUIET	11	/* Make BIO Quiet */
+#define BIO_MAPPED_INTEGRITY 12/* integrity metadata has been remapped */
 #define bio_flagged(bio, flag)	((bio)->bi_flags & (1 << (flag)))
 
 /*
@@ -148,7 +149,6 @@ enum rq_flag_bits {
 	__REQ_ORDERED_COLOR,	/* is before or after barrier */
 	__REQ_ALLOCED,		/* request came from our alloc pool */
 	__REQ_COPY_USER,	/* contains copies of user pages */
-	__REQ_INTEGRITY,	/* integrity metadata has been remapped */
 	__REQ_FLUSH,		/* request for cache flush */
 	__REQ_IO_STAT,		/* account I/O stat */
 	__REQ_MIXED_MERGE,	/* merge of different types, fail separately */
@@ -190,7 +190,6 @@ enum rq_flag_bits {
 #define REQ_ORDERED_COLOR	(1 << __REQ_ORDERED_COLOR)
 #define REQ_ALLOCED		(1 << __REQ_ALLOCED)
 #define REQ_COPY_USER		(1 << __REQ_COPY_USER)
-#define REQ_INTEGRITY		(1 << __REQ_INTEGRITY)
 #define REQ_FLUSH		(1 << __REQ_FLUSH)
 #define REQ_IO_STAT		(1 << __REQ_IO_STAT)
 #define REQ_MIXED_MERGE		(1 << __REQ_MIXED_MERGE)
-- 
cgit v1.2.3