From 4254bba17d92d53822a56ebc2a0c1eb7e2a71155 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <koverstreet@google.com>
Date: Thu, 6 Sep 2012 15:35:00 -0700
Subject: block: Kill bi_destructor

Now that we've got generic code for freeing bios allocated from bio
pools, this isn't needed anymore.

This patch also makes bio_free() static, since without bi_destructor
there should be no need for it to be called anywhere else.

bio_free() is now only called from bio_put, so we can refactor those a
bit - move some code from bio_put() to bio_free() and kill the redundant
bio->bi_next = NULL.

v5: Switch to BIO_KMALLOC_POOL ((void *)~0), per Boaz
v6: BIO_KMALLOC_POOL now NULL, drop bio_free's EXPORT_SYMBOL
v7: No #define BIO_KMALLOC_POOL anymore

Signed-off-by: Kent Overstreet <koverstreet@google.com>
CC: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 Documentation/block/biodoc.txt | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'Documentation')

diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt
index e418dc0a7086..8df5e8e6dceb 100644
--- a/Documentation/block/biodoc.txt
+++ b/Documentation/block/biodoc.txt
@@ -465,7 +465,6 @@ struct bio {
        bio_end_io_t	*bi_end_io;  /* bi_end_io (bio) */
        atomic_t		bi_cnt;	     /* pin count: free when it hits zero */
        void             *bi_private;
-       bio_destructor_t *bi_destructor; /* bi_destructor (bio) */
 };
 
 With this multipage bio design:
@@ -647,10 +646,6 @@ for a non-clone bio. There are the 6 pools setup for different size biovecs,
 so bio_alloc(gfp_mask, nr_iovecs) will allocate a vec_list of the
 given size from these slabs.
 
-The bi_destructor() routine takes into account the possibility of the bio
-having originated from a different source (see later discussions on
-n/w to block transfers and kvec_cb)
-
 The bio_get() routine may be used to hold an extra reference on a bio prior
 to i/o submission, if the bio fields are likely to be accessed after the
 i/o is issued (since the bio may otherwise get freed in case i/o completion
-- 
cgit v1.2.3


From 4363ac7c13a9a4b763c6e8d9fdbfc2468f3b8ca4 Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen" <martin.petersen@oracle.com>
Date: Tue, 18 Sep 2012 12:19:27 -0400
Subject: block: Implement support for WRITE SAME

The WRITE SAME command supported on some SCSI devices allows the same
block to be efficiently replicated throughout a block range. Only a
single logical block is transferred from the host and the storage device
writes the same data to all blocks described by the I/O.

This patch implements support for WRITE SAME in the block layer. The
blkdev_issue_write_same() function can be used by filesystems and block
drivers to replicate a buffer across a block range. This can be used to
efficiently initialize software RAID devices, etc.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Acked-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 Documentation/ABI/testing/sysfs-block | 14 +++++++
 block/blk-core.c                      | 14 ++++++-
 block/blk-lib.c                       | 74 +++++++++++++++++++++++++++++++++++
 block/blk-merge.c                     |  9 +++++
 block/blk-settings.c                  | 16 ++++++++
 block/blk-sysfs.c                     | 13 ++++++
 drivers/md/raid0.c                    |  1 +
 fs/bio.c                              |  9 +++--
 include/linux/bio.h                   |  3 ++
 include/linux/blk_types.h             |  5 ++-
 include/linux/blkdev.h                | 29 ++++++++++++++
 11 files changed, 181 insertions(+), 6 deletions(-)

(limited to 'Documentation')

diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block
index c1eb41cb9876..279da08f7541 100644
--- a/Documentation/ABI/testing/sysfs-block
+++ b/Documentation/ABI/testing/sysfs-block
@@ -206,3 +206,17 @@ Description:
 		when a discarded area is read the discard_zeroes_data
 		parameter will be set to one. Otherwise it will be 0 and
 		the result of reading a discarded area is undefined.
+
+What:		/sys/block/<disk>/queue/write_same_max_bytes
+Date:		January 2012
+Contact:	Martin K. Petersen <martin.petersen@oracle.com>
+Description:
+		Some devices support a write same operation in which a
+		single data block can be written to a range of several
+		contiguous blocks on storage. This can be used to wipe
+		areas on disk or to initialize drives in a RAID
+		configuration. write_same_max_bytes indicates how many
+		bytes can be written in a single write same command. If
+		write_same_max_bytes is 0, write same is not supported
+		by the device.
+
diff --git a/block/blk-core.c b/block/blk-core.c
index 33eded00c5b1..3b080541098e 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1704,6 +1704,11 @@ generic_make_request_checks(struct bio *bio)
 		goto end_io;
 	}
 
+	if (bio->bi_rw & REQ_WRITE_SAME && !bdev_write_same(bio->bi_bdev)) {
+		err = -EOPNOTSUPP;
+		goto end_io;
+	}
+
 	/*
 	 * Various block parts want %current->io_context and lazy ioc
 	 * allocation ends up trading a lot of pain for a small amount of
@@ -1809,8 +1814,6 @@ EXPORT_SYMBOL(generic_make_request);
  */
 void submit_bio(int rw, struct bio *bio)
 {
-	int count = bio_sectors(bio);
-
 	bio->bi_rw |= rw;
 
 	/*
@@ -1818,6 +1821,13 @@ void submit_bio(int rw, struct bio *bio)
 	 * go through the normal accounting stuff before submission.
 	 */
 	if (bio_has_data(bio)) {
+		unsigned int count;
+
+		if (unlikely(rw & REQ_WRITE_SAME))
+			count = bdev_logical_block_size(bio->bi_bdev) >> 9;
+		else
+			count = bio_sectors(bio);
+
 		if (rw & WRITE) {
 			count_vm_events(PGPGOUT, count);
 		} else {
diff --git a/block/blk-lib.c b/block/blk-lib.c
index 19cc761cacb2..a062543c58ac 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -129,6 +129,80 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 }
 EXPORT_SYMBOL(blkdev_issue_discard);
 
+/**
+ * blkdev_issue_write_same - queue a write same operation
+ * @bdev:	target blockdev
+ * @sector:	start sector
+ * @nr_sects:	number of sectors to write
+ * @gfp_mask:	memory allocation flags (for bio_alloc)
+ * @page:	page containing data to write
+ *
+ * Description:
+ *    Issue a write same request for the sectors in question.
+ */
+int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
+			    sector_t nr_sects, gfp_t gfp_mask,
+			    struct page *page)
+{
+	DECLARE_COMPLETION_ONSTACK(wait);
+	struct request_queue *q = bdev_get_queue(bdev);
+	unsigned int max_write_same_sectors;
+	struct bio_batch bb;
+	struct bio *bio;
+	int ret = 0;
+
+	if (!q)
+		return -ENXIO;
+
+	max_write_same_sectors = q->limits.max_write_same_sectors;
+
+	if (max_write_same_sectors == 0)
+		return -EOPNOTSUPP;
+
+	atomic_set(&bb.done, 1);
+	bb.flags = 1 << BIO_UPTODATE;
+	bb.wait = &wait;
+
+	while (nr_sects) {
+		bio = bio_alloc(gfp_mask, 1);
+		if (!bio) {
+			ret = -ENOMEM;
+			break;
+		}
+
+		bio->bi_sector = sector;
+		bio->bi_end_io = bio_batch_end_io;
+		bio->bi_bdev = bdev;
+		bio->bi_private = &bb;
+		bio->bi_vcnt = 1;
+		bio->bi_io_vec->bv_page = page;
+		bio->bi_io_vec->bv_offset = 0;
+		bio->bi_io_vec->bv_len = bdev_logical_block_size(bdev);
+
+		if (nr_sects > max_write_same_sectors) {
+			bio->bi_size = max_write_same_sectors << 9;
+			nr_sects -= max_write_same_sectors;
+			sector += max_write_same_sectors;
+		} else {
+			bio->bi_size = nr_sects << 9;
+			nr_sects = 0;
+		}
+
+		atomic_inc(&bb.done);
+		submit_bio(REQ_WRITE | REQ_WRITE_SAME, bio);
+	}
+
+	/* Wait for bios in-flight */
+	if (!atomic_dec_and_test(&bb.done))
+		wait_for_completion(&wait);
+
+	if (!test_bit(BIO_UPTODATE, &bb.flags))
+		ret = -ENOTSUPP;
+
+	return ret;
+}
+EXPORT_SYMBOL(blkdev_issue_write_same);
+
 /**
  * blkdev_issue_zeroout - generate number of zero filed write bios
  * @bdev:	blockdev to issue
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 642b862608a1..936a110de0b9 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -419,6 +419,10 @@ static int attempt_merge(struct request_queue *q, struct request *req,
 	    || next->special)
 		return 0;
 
+	if (req->cmd_flags & REQ_WRITE_SAME &&
+	    !blk_write_same_mergeable(req->bio, next->bio))
+		return 0;
+
 	/*
 	 * If we are allowed to merge, then append bio list
 	 * from next to rq and release next. merge_requests_fn
@@ -518,6 +522,11 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
 	if (bio_integrity(bio) != blk_integrity_rq(rq))
 		return false;
 
+	/* must be using the same buffer */
+	if (rq->cmd_flags & REQ_WRITE_SAME &&
+	    !blk_write_same_mergeable(rq->bio, bio))
+		return false;
+
 	return true;
 }
 
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 565a6786032f..779bb7646bcd 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -113,6 +113,7 @@ void blk_set_default_limits(struct queue_limits *lim)
 	lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK;
 	lim->max_segment_size = BLK_MAX_SEGMENT_SIZE;
 	lim->max_sectors = lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS;
+	lim->max_write_same_sectors = 0;
 	lim->max_discard_sectors = 0;
 	lim->discard_granularity = 0;
 	lim->discard_alignment = 0;
@@ -144,6 +145,7 @@ void blk_set_stacking_limits(struct queue_limits *lim)
 	lim->max_segments = USHRT_MAX;
 	lim->max_hw_sectors = UINT_MAX;
 	lim->max_sectors = UINT_MAX;
+	lim->max_write_same_sectors = UINT_MAX;
 }
 EXPORT_SYMBOL(blk_set_stacking_limits);
 
@@ -285,6 +287,18 @@ void blk_queue_max_discard_sectors(struct request_queue *q,
 }
 EXPORT_SYMBOL(blk_queue_max_discard_sectors);
 
+/**
+ * blk_queue_max_write_same_sectors - set max sectors for a single write same
+ * @q:  the request queue for the device
+ * @max_write_same_sectors: maximum number of sectors to write per command
+ **/
+void blk_queue_max_write_same_sectors(struct request_queue *q,
+				      unsigned int max_write_same_sectors)
+{
+	q->limits.max_write_same_sectors = max_write_same_sectors;
+}
+EXPORT_SYMBOL(blk_queue_max_write_same_sectors);
+
 /**
  * blk_queue_max_segments - set max hw segments for a request for this queue
  * @q:  the request queue for the device
@@ -510,6 +524,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
 
 	t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors);
 	t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors);
+	t->max_write_same_sectors = min(t->max_write_same_sectors,
+					b->max_write_same_sectors);
 	t->bounce_pfn = min_not_zero(t->bounce_pfn, b->bounce_pfn);
 
 	t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask,
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index ea51d827a0bb..247dbfd42621 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -180,6 +180,13 @@ static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *pag
 	return queue_var_show(queue_discard_zeroes_data(q), page);
 }
 
+static ssize_t queue_write_same_max_show(struct request_queue *q, char *page)
+{
+	return sprintf(page, "%llu\n",
+		(unsigned long long)q->limits.max_write_same_sectors << 9);
+}
+
+
 static ssize_t
 queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
 {
@@ -385,6 +392,11 @@ static struct queue_sysfs_entry queue_discard_zeroes_data_entry = {
 	.show = queue_discard_zeroes_data_show,
 };
 
+static struct queue_sysfs_entry queue_write_same_max_entry = {
+	.attr = {.name = "write_same_max_bytes", .mode = S_IRUGO },
+	.show = queue_write_same_max_show,
+};
+
 static struct queue_sysfs_entry queue_nonrot_entry = {
 	.attr = {.name = "rotational", .mode = S_IRUGO | S_IWUSR },
 	.show = queue_show_nonrot,
@@ -432,6 +444,7 @@ static struct attribute *default_attrs[] = {
 	&queue_discard_granularity_entry.attr,
 	&queue_discard_max_entry.attr,
 	&queue_discard_zeroes_data_entry.attr,
+	&queue_write_same_max_entry.attr,
 	&queue_nonrot_entry.attr,
 	&queue_nomerges_entry.attr,
 	&queue_rq_affinity_entry.attr,
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index de63a1fc3737..a9e4fa95dfaa 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -422,6 +422,7 @@ static int raid0_run(struct mddev *mddev)
 	if (md_check_no_bitmap(mddev))
 		return -EINVAL;
 	blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors);
+	blk_queue_max_write_same_sectors(mddev->queue, mddev->chunk_sectors);
 
 	/* if private is not null, we are here after takeover */
 	if (mddev->private == NULL) {
diff --git a/fs/bio.c b/fs/bio.c
index 13e956779e10..f855e0e1869c 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -1487,9 +1487,12 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors)
 
 	bp->bv1 = bi->bi_io_vec[0];
 	bp->bv2 = bi->bi_io_vec[0];
-	bp->bv2.bv_offset += first_sectors << 9;
-	bp->bv2.bv_len -= first_sectors << 9;
-	bp->bv1.bv_len = first_sectors << 9;
+
+	if (bio_is_rw(bi)) {
+		bp->bv2.bv_offset += first_sectors << 9;
+		bp->bv2.bv_len -= first_sectors << 9;
+		bp->bv1.bv_len = first_sectors << 9;
+	}
 
 	bp->bio1.bi_io_vec = &bp->bv1;
 	bp->bio2.bi_io_vec = &bp->bv2;
diff --git a/include/linux/bio.h b/include/linux/bio.h
index e54305cacc98..820e7aaad4fd 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -399,6 +399,9 @@ static inline bool bio_is_rw(struct bio *bio)
 	if (!bio_has_data(bio))
 		return false;
 
+	if (bio->bi_rw & REQ_WRITE_SAME)
+		return false;
+
 	return true;
 }
 
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 1b229664f573..cdf11191e645 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -147,6 +147,7 @@ enum rq_flag_bits {
 	__REQ_PRIO,		/* boost priority in cfq */
 	__REQ_DISCARD,		/* request to discard sectors */
 	__REQ_SECURE,		/* secure discard (used with __REQ_DISCARD) */
+	__REQ_WRITE_SAME,	/* write same block many times */
 
 	__REQ_NOIDLE,		/* don't anticipate more IO after this one */
 	__REQ_FUA,		/* forced unit access */
@@ -185,13 +186,15 @@ enum rq_flag_bits {
 #define REQ_META		(1 << __REQ_META)
 #define REQ_PRIO		(1 << __REQ_PRIO)
 #define REQ_DISCARD		(1 << __REQ_DISCARD)
+#define REQ_WRITE_SAME		(1 << __REQ_WRITE_SAME)
 #define REQ_NOIDLE		(1 << __REQ_NOIDLE)
 
 #define REQ_FAILFAST_MASK \
 	(REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER)
 #define REQ_COMMON_MASK \
 	(REQ_WRITE | REQ_FAILFAST_MASK | REQ_SYNC | REQ_META | REQ_PRIO | \
-	 REQ_DISCARD | REQ_NOIDLE | REQ_FLUSH | REQ_FUA | REQ_SECURE)
+	 REQ_DISCARD | REQ_WRITE_SAME | REQ_NOIDLE | REQ_FLUSH | REQ_FUA | \
+	 REQ_SECURE)
 #define REQ_CLONE_MASK		REQ_COMMON_MASK
 
 /* This mask is used for both bio and request merge checking */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 90f7abe8f183..1756001210d2 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -270,6 +270,7 @@ struct queue_limits {
 	unsigned int		io_min;
 	unsigned int		io_opt;
 	unsigned int		max_discard_sectors;
+	unsigned int		max_write_same_sectors;
 	unsigned int		discard_granularity;
 	unsigned int		discard_alignment;
 
@@ -614,9 +615,20 @@ static inline bool blk_check_merge_flags(unsigned int flags1,
 	if ((flags1 & REQ_SECURE) != (flags2 & REQ_SECURE))
 		return false;
 
+	if ((flags1 & REQ_WRITE_SAME) != (flags2 & REQ_WRITE_SAME))
+		return false;
+
 	return true;
 }
 
+static inline bool blk_write_same_mergeable(struct bio *a, struct bio *b)
+{
+	if (bio_data(a) == bio_data(b))
+		return true;
+
+	return false;
+}
+
 /*
  * q->prep_rq_fn return values
  */
@@ -818,6 +830,9 @@ static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q,
 	if (unlikely(cmd_flags & REQ_DISCARD))
 		return q->limits.max_discard_sectors;
 
+	if (unlikely(cmd_flags & REQ_WRITE_SAME))
+		return q->limits.max_write_same_sectors;
+
 	return q->limits.max_sectors;
 }
 
@@ -886,6 +901,8 @@ extern void blk_queue_max_segments(struct request_queue *, unsigned short);
 extern void blk_queue_max_segment_size(struct request_queue *, unsigned int);
 extern void blk_queue_max_discard_sectors(struct request_queue *q,
 		unsigned int max_discard_sectors);
+extern void blk_queue_max_write_same_sectors(struct request_queue *q,
+		unsigned int max_write_same_sectors);
 extern void blk_queue_logical_block_size(struct request_queue *, unsigned short);
 extern void blk_queue_physical_block_size(struct request_queue *, unsigned int);
 extern void blk_queue_alignment_offset(struct request_queue *q,
@@ -1016,6 +1033,8 @@ static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt,
 extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *);
 extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 		sector_t nr_sects, gfp_t gfp_mask, unsigned long flags);
+extern int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
+		sector_t nr_sects, gfp_t gfp_mask, struct page *page);
 extern int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
 			sector_t nr_sects, gfp_t gfp_mask);
 static inline int sb_issue_discard(struct super_block *sb, sector_t block,
@@ -1193,6 +1212,16 @@ static inline unsigned int bdev_discard_zeroes_data(struct block_device *bdev)
 	return queue_discard_zeroes_data(bdev_get_queue(bdev));
 }
 
+static inline unsigned int bdev_write_same(struct block_device *bdev)
+{
+	struct request_queue *q = bdev_get_queue(bdev);
+
+	if (q)
+		return q->limits.max_write_same_sectors;
+
+	return 0;
+}
+
 static inline int queue_dma_alignment(struct request_queue *q)
 {
 	return q ? q->dma_alignment : 511;
-- 
cgit v1.2.3


From 62ac665ff9fc07497ca524bd20d6a96893d11071 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Wed, 26 Sep 2012 07:46:43 +0200
Subject: blockdev: turn a rw semaphore into a percpu rw semaphore

This avoids cache line bouncing when many processes lock the semaphore
for read.

New percpu lock implementation

The lock consists of an array of percpu unsigned integers, a boolean
variable and a mutex.

When we take the lock for read, we enter rcu read section, check for a
"locked" variable. If it is false, we increase a percpu counter on the
current cpu and exit the rcu section. If "locked" is true, we exit the
rcu section, take the mutex and drop it (this waits until a writer
finished) and retry.

Unlocking for read just decreases percpu variable. Note that we can
unlock on a difference cpu than where we locked, in this case the
counter underflows. The sum of all percpu counters represents the number
of processes that hold the lock for read.

When we need to lock for write, we take the mutex, set "locked" variable
to true and synchronize rcu. Since RCU has been synchronized, no
processes can create new read locks. We wait until the sum of percpu
counters is zero - when it is, there are no readers in the critical
section.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 Documentation/percpu-rw-semaphore.txt | 27 +++++++++++
 fs/block_dev.c                        | 27 +++++++----
 include/linux/fs.h                    |  3 +-
 include/linux/percpu-rwsem.h          | 89 +++++++++++++++++++++++++++++++++++
 4 files changed, 135 insertions(+), 11 deletions(-)
 create mode 100644 Documentation/percpu-rw-semaphore.txt
 create mode 100644 include/linux/percpu-rwsem.h

(limited to 'Documentation')

diff --git a/Documentation/percpu-rw-semaphore.txt b/Documentation/percpu-rw-semaphore.txt
new file mode 100644
index 000000000000..eddd77094725
--- /dev/null
+++ b/Documentation/percpu-rw-semaphore.txt
@@ -0,0 +1,27 @@
+Percpu rw semaphores
+--------------------
+
+Percpu rw semaphores is a new read-write semaphore design that is
+optimized for locking for reading.
+
+The problem with traditional read-write semaphores is that when multiple
+cores take the lock for reading, the cache line containing the semaphore
+is bouncing between L1 caches of the cores, causing performance
+degradation.
+
+Locking for reading it very fast, it uses RCU and it avoids any atomic
+instruction in the lock and unlock path. On the other hand, locking for
+writing is very expensive, it calls synchronize_rcu() that can take
+hundreds of microseconds.
+
+The lock is declared with "struct percpu_rw_semaphore" type.
+The lock is initialized percpu_init_rwsem, it returns 0 on success and
+-ENOMEM on allocation failure.
+The lock must be freed with percpu_free_rwsem to avoid memory leak.
+
+The lock is locked for read with percpu_down_read, percpu_up_read and
+for write with percpu_down_write, percpu_up_write.
+
+The idea of using RCU for optimized rw-lock was introduced by
+Eric Dumazet <eric.dumazet@gmail.com>.
+The code was written by Mikulas Patocka <mpatocka@redhat.com>
diff --git a/fs/block_dev.c b/fs/block_dev.c
index cdfb625824e2..7eeb0635338b 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -127,7 +127,7 @@ int set_blocksize(struct block_device *bdev, int size)
 		return -EINVAL;
 
 	/* Prevent starting I/O or mapping the device */
-	down_write(&bdev->bd_block_size_semaphore);
+	percpu_down_write(&bdev->bd_block_size_semaphore);
 
 	/* Check that the block device is not memory mapped */
 	mapping = bdev->bd_inode->i_mapping;
@@ -135,7 +135,7 @@ int set_blocksize(struct block_device *bdev, int size)
 	if (!prio_tree_empty(&mapping->i_mmap) ||
 	    !list_empty(&mapping->i_mmap_nonlinear)) {
 		mutex_unlock(&mapping->i_mmap_mutex);
-		up_write(&bdev->bd_block_size_semaphore);
+		percpu_up_write(&bdev->bd_block_size_semaphore);
 		return -EBUSY;
 	}
 	mutex_unlock(&mapping->i_mmap_mutex);
@@ -148,7 +148,7 @@ int set_blocksize(struct block_device *bdev, int size)
 		kill_bdev(bdev);
 	}
 
-	up_write(&bdev->bd_block_size_semaphore);
+	percpu_up_write(&bdev->bd_block_size_semaphore);
 
 	return 0;
 }
@@ -460,6 +460,12 @@ static struct inode *bdev_alloc_inode(struct super_block *sb)
 	struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL);
 	if (!ei)
 		return NULL;
+
+	if (unlikely(percpu_init_rwsem(&ei->bdev.bd_block_size_semaphore))) {
+		kmem_cache_free(bdev_cachep, ei);
+		return NULL;
+	}
+
 	return &ei->vfs_inode;
 }
 
@@ -468,6 +474,8 @@ static void bdev_i_callback(struct rcu_head *head)
 	struct inode *inode = container_of(head, struct inode, i_rcu);
 	struct bdev_inode *bdi = BDEV_I(inode);
 
+	percpu_free_rwsem(&bdi->bdev.bd_block_size_semaphore);
+
 	kmem_cache_free(bdev_cachep, bdi);
 }
 
@@ -491,7 +499,6 @@ static void init_once(void *foo)
 	inode_init_once(&ei->vfs_inode);
 	/* Initialize mutex for freeze. */
 	mutex_init(&bdev->bd_fsfreeze_mutex);
-	init_rwsem(&bdev->bd_block_size_semaphore);
 }
 
 static inline void __bd_forget(struct inode *inode)
@@ -1593,11 +1600,11 @@ ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov,
 	ssize_t ret;
 	struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
 
-	down_read(&bdev->bd_block_size_semaphore);
+	percpu_down_read(&bdev->bd_block_size_semaphore);
 
 	ret = generic_file_aio_read(iocb, iov, nr_segs, pos);
 
-	up_read(&bdev->bd_block_size_semaphore);
+	percpu_up_read(&bdev->bd_block_size_semaphore);
 
 	return ret;
 }
@@ -1622,7 +1629,7 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
 
 	blk_start_plug(&plug);
 
-	down_read(&bdev->bd_block_size_semaphore);
+	percpu_down_read(&bdev->bd_block_size_semaphore);
 
 	ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
 	if (ret > 0 || ret == -EIOCBQUEUED) {
@@ -1633,7 +1640,7 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
 			ret = err;
 	}
 
-	up_read(&bdev->bd_block_size_semaphore);
+	percpu_up_read(&bdev->bd_block_size_semaphore);
 
 	blk_finish_plug(&plug);
 
@@ -1646,11 +1653,11 @@ int blkdev_mmap(struct file *file, struct vm_area_struct *vma)
 	int ret;
 	struct block_device *bdev = I_BDEV(file->f_mapping->host);
 
-	down_read(&bdev->bd_block_size_semaphore);
+	percpu_down_read(&bdev->bd_block_size_semaphore);
 
 	ret = generic_file_mmap(file, vma);
 
-	up_read(&bdev->bd_block_size_semaphore);
+	percpu_up_read(&bdev->bd_block_size_semaphore);
 
 	return ret;
 }
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e60bbd0225d5..24e1229cdfe0 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -10,6 +10,7 @@
 #include <linux/ioctl.h>
 #include <linux/blk_types.h>
 #include <linux/types.h>
+#include <linux/percpu-rwsem.h>
 
 /*
  * It's silly to have NR_OPEN bigger than NR_FILE, but you can change
@@ -726,7 +727,7 @@ struct block_device {
 	/* Mutex for freeze */
 	struct mutex		bd_fsfreeze_mutex;
 	/* A semaphore that prevents I/O while block size is being changed */
-	struct rw_semaphore	bd_block_size_semaphore;
+	struct percpu_rw_semaphore	bd_block_size_semaphore;
 };
 
 /*
diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h
new file mode 100644
index 000000000000..cf80f7e5277f
--- /dev/null
+++ b/include/linux/percpu-rwsem.h
@@ -0,0 +1,89 @@
+#ifndef _LINUX_PERCPU_RWSEM_H
+#define _LINUX_PERCPU_RWSEM_H
+
+#include <linux/mutex.h>
+#include <linux/percpu.h>
+#include <linux/rcupdate.h>
+#include <linux/delay.h>
+
+struct percpu_rw_semaphore {
+	unsigned __percpu *counters;
+	bool locked;
+	struct mutex mtx;
+};
+
+static inline void percpu_down_read(struct percpu_rw_semaphore *p)
+{
+	rcu_read_lock();
+	if (unlikely(p->locked)) {
+		rcu_read_unlock();
+		mutex_lock(&p->mtx);
+		this_cpu_inc(*p->counters);
+		mutex_unlock(&p->mtx);
+		return;
+	}
+	this_cpu_inc(*p->counters);
+	rcu_read_unlock();
+}
+
+static inline void percpu_up_read(struct percpu_rw_semaphore *p)
+{
+	/*
+	 * On X86, write operation in this_cpu_dec serves as a memory unlock
+	 * barrier (i.e. memory accesses may be moved before the write, but
+	 * no memory accesses are moved past the write).
+	 * On other architectures this may not be the case, so we need smp_mb()
+	 * there.
+	 */
+#if defined(CONFIG_X86) && (!defined(CONFIG_X86_PPRO_FENCE) && !defined(CONFIG_X86_OOSTORE))
+	barrier();
+#else
+	smp_mb();
+#endif
+	this_cpu_dec(*p->counters);
+}
+
+static inline unsigned __percpu_count(unsigned __percpu *counters)
+{
+	unsigned total = 0;
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		total += ACCESS_ONCE(*per_cpu_ptr(counters, cpu));
+
+	return total;
+}
+
+static inline void percpu_down_write(struct percpu_rw_semaphore *p)
+{
+	mutex_lock(&p->mtx);
+	p->locked = true;
+	synchronize_rcu();
+	while (__percpu_count(p->counters))
+		msleep(1);
+	smp_rmb(); /* paired with smp_mb() in percpu_sem_up_read() */
+}
+
+static inline void percpu_up_write(struct percpu_rw_semaphore *p)
+{
+	p->locked = false;
+	mutex_unlock(&p->mtx);
+}
+
+static inline int percpu_init_rwsem(struct percpu_rw_semaphore *p)
+{
+	p->counters = alloc_percpu(unsigned);
+	if (unlikely(!p->counters))
+		return -ENOMEM;
+	p->locked = false;
+	mutex_init(&p->mtx);
+	return 0;
+}
+
+static inline void percpu_free_rwsem(struct percpu_rw_semaphore *p)
+{
+	free_percpu(p->counters);
+	p->counters = NULL; /* catch use after free bugs */
+}
+
+#endif
-- 
cgit v1.2.3


From e6b5c0822bcf43b91a2cdcb535ea828e953b6590 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Wed, 26 Sep 2012 19:56:15 +0200
Subject: percpu-rw-semaphore: fix documentation typos

One more patch for this thing, fixing some typos in the documentation.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 Documentation/percpu-rw-semaphore.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'Documentation')

diff --git a/Documentation/percpu-rw-semaphore.txt b/Documentation/percpu-rw-semaphore.txt
index eddd77094725..7d3c82431909 100644
--- a/Documentation/percpu-rw-semaphore.txt
+++ b/Documentation/percpu-rw-semaphore.txt
@@ -9,10 +9,10 @@ cores take the lock for reading, the cache line containing the semaphore
 is bouncing between L1 caches of the cores, causing performance
 degradation.
 
-Locking for reading it very fast, it uses RCU and it avoids any atomic
+Locking for reading is very fast, it uses RCU and it avoids any atomic
 instruction in the lock and unlock path. On the other hand, locking for
 writing is very expensive, it calls synchronize_rcu() that can take
-hundreds of microseconds.
+hundreds of milliseconds.
 
 The lock is declared with "struct percpu_rw_semaphore" type.
 The lock is initialized percpu_init_rwsem, it returns 0 on success and
-- 
cgit v1.2.3