From 4f024f3797c43cb4b73cd2c50cec728842d0e49e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 11 Oct 2013 15:44:27 -0700 Subject: block: Abstract out bvec iterator MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Immutable biovecs are going to require an explicit iterator. To implement immutable bvecs, a later patch is going to add a bi_bvec_done member to this struct; for now, this patch effectively just renames things. Signed-off-by: Kent Overstreet Cc: Jens Axboe Cc: Geert Uytterhoeven Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: "Ed L. Cashin" Cc: Nick Piggin Cc: Lars Ellenberg Cc: Jiri Kosina Cc: Matthew Wilcox Cc: Geoff Levand Cc: Yehuda Sadeh Cc: Sage Weil Cc: Alex Elder Cc: ceph-devel@vger.kernel.org Cc: Joshua Morris Cc: Philip Kelleher Cc: Rusty Russell Cc: "Michael S. Tsirkin" Cc: Konrad Rzeszutek Wilk Cc: Jeremy Fitzhardinge Cc: Neil Brown Cc: Alasdair Kergon Cc: Mike Snitzer Cc: dm-devel@redhat.com Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: linux390@de.ibm.com Cc: Boaz Harrosh Cc: Benny Halevy Cc: "James E.J. Bottomley" Cc: Greg Kroah-Hartman Cc: "Nicholas A. Bellinger" Cc: Alexander Viro Cc: Chris Mason Cc: "Theodore Ts'o" Cc: Andreas Dilger Cc: Jaegeuk Kim Cc: Steven Whitehouse Cc: Dave Kleikamp Cc: Joern Engel Cc: Prasad Joshi Cc: Trond Myklebust Cc: KONISHI Ryusuke Cc: Mark Fasheh Cc: Joel Becker Cc: Ben Myers Cc: xfs@oss.sgi.com Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Len Brown Cc: Pavel Machek Cc: "Rafael J. Wysocki" Cc: Herton Ronaldo Krzesinski Cc: Ben Hutchings Cc: Andrew Morton Cc: Guo Chao Cc: Tejun Heo Cc: Asai Thambi S P Cc: Selvan Mani Cc: Sam Bradshaw Cc: Wei Yongjun Cc: "Roger Pau Monné" Cc: Jan Beulich Cc: Stefano Stabellini Cc: Ian Campbell Cc: Sebastian Ott Cc: Christian Borntraeger Cc: Minchan Kim Cc: Jiang Liu Cc: Nitin Gupta Cc: Jerome Marchand Cc: Joe Perches Cc: Peng Tao Cc: Andy Adamson Cc: fanchaoting Cc: Jie Liu Cc: Sunil Mushran Cc: "Martin K. Petersen" Cc: Namjae Jeon Cc: Pankaj Kumar Cc: Dan Magenheimer Cc: Mel Gorman 6 --- Documentation/block/biodoc.txt | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'Documentation') diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt index 8df5e8e6dceb..2101e718670d 100644 --- a/Documentation/block/biodoc.txt +++ b/Documentation/block/biodoc.txt @@ -447,14 +447,13 @@ struct bio_vec { * main unit of I/O for the block layer and lower layers (ie drivers) */ struct bio { - sector_t bi_sector; struct bio *bi_next; /* request queue link */ struct block_device *bi_bdev; /* target device */ unsigned long bi_flags; /* status, command, etc */ unsigned long bi_rw; /* low bits: r/w, high: priority */ unsigned int bi_vcnt; /* how may bio_vec's */ - unsigned int bi_idx; /* current index into bio_vec array */ + struct bvec_iter bi_iter; /* current index into bio_vec array */ unsigned int bi_size; /* total size in bytes */ unsigned short bi_phys_segments; /* segments after physaddr coalesce*/ @@ -480,7 +479,7 @@ With this multipage bio design: - Code that traverses the req list can find all the segments of a bio by using rq_for_each_segment. This handles the fact that a request has multiple bios, each of which can have multiple segments. -- Drivers which can't process a large bio in one shot can use the bi_idx +- Drivers which can't process a large bio in one shot can use the bi_iter field to keep track of the next bio_vec entry to process. (e.g a 1MB bio_vec needs to be handled in max 128kB chunks for IDE) [TBD: Should preferably also have a bi_voffset and bi_vlen to avoid modifying @@ -589,7 +588,7 @@ driver should not modify these values. The block layer sets up the nr_sectors and current_nr_sectors fields (based on the corresponding hard_xxx values and the number of bytes transferred) and updates it on every transfer that invokes end_that_request_first. It does the same for the -buffer, bio, bio->bi_idx fields too. +buffer, bio, bio->bi_iter fields too. The buffer field is just a virtual address mapping of the current segment of the i/o buffer in cases where the buffer resides in low-memory. For high -- cgit v1.2.3 From 4550dd6c6b062fc5e5b647296d55da22616123c3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 7 Aug 2013 14:26:21 -0700 Subject: block: Immutable bio vecs This adds a mechanism by which we can advance a bio by an arbitrary number of bytes without modifying the biovec: bio->bi_iter.bi_bvec_done indicates the number of bytes completed in the current bvec. Various driver code still needs to be updated to not refer to the bvec directly before we can use this for interesting things, like efficient bio splitting. Signed-off-by: Kent Overstreet Cc: Jens Axboe Cc: Lars Ellenberg Cc: Paul Clements Cc: drbd-user@lists.linbit.com Cc: nbd-general@lists.sourceforge.net --- Documentation/block/biovecs.txt | 111 ++++++++++++++++++++++++++++++++++++++++ drivers/block/drbd/drbd_main.c | 4 +- drivers/block/nbd.c | 2 +- fs/bio.c | 27 +--------- include/linux/bio.h | 81 ++++++++++++++++++++++++++--- include/linux/blk_types.h | 3 ++ include/linux/blkdev.h | 4 +- 7 files changed, 194 insertions(+), 38 deletions(-) create mode 100644 Documentation/block/biovecs.txt (limited to 'Documentation') diff --git a/Documentation/block/biovecs.txt b/Documentation/block/biovecs.txt new file mode 100644 index 000000000000..74a32ad52f53 --- /dev/null +++ b/Documentation/block/biovecs.txt @@ -0,0 +1,111 @@ + +Immutable biovecs and biovec iterators: +======================================= + +Kent Overstreet + +As of 3.13, biovecs should never be modified after a bio has been submitted. +Instead, we have a new struct bvec_iter which represents a range of a biovec - +the iterator will be modified as the bio is completed, not the biovec. + +More specifically, old code that needed to partially complete a bio would +update bi_sector and bi_size, and advance bi_idx to the next biovec. If it +ended up partway through a biovec, it would increment bv_offset and decrement +bv_len by the number of bytes completed in that biovec. + +In the new scheme of things, everything that must be mutated in order to +partially complete a bio is segregated into struct bvec_iter: bi_sector, +bi_size and bi_idx have been moved there; and instead of modifying bv_offset +and bv_len, struct bvec_iter has bi_bvec_done, which represents the number of +bytes completed in the current bvec. + +There are a bunch of new helper macros for hiding the gory details - in +particular, presenting the illusion of partially completed biovecs so that +normal code doesn't have to deal with bi_bvec_done. + + * Driver code should no longer refer to biovecs directly; we now have + bio_iovec() and bio_iovec_iter() macros that return literal struct biovecs, + constructed from the raw biovecs but taking into account bi_bvec_done and + bi_size. + + bio_for_each_segment() has been updated to take a bvec_iter argument + instead of an integer (that corresponded to bi_idx); for a lot of code the + conversion just required changing the types of the arguments to + bio_for_each_segment(). + + * Advancing a bvec_iter is done with bio_advance_iter(); bio_advance() is a + wrapper around bio_advance_iter() that operates on bio->bi_iter, and also + advances the bio integrity's iter if present. + + There is a lower level advance function - bvec_iter_advance() - which takes + a pointer to a biovec, not a bio; this is used by the bio integrity code. + +What's all this get us? +======================= + +Having a real iterator, and making biovecs immutable, has a number of +advantages: + + * Before, iterating over bios was very awkward when you weren't processing + exactly one bvec at a time - for example, bio_copy_data() in fs/bio.c, + which copies the contents of one bio into another. Because the biovecs + wouldn't necessarily be the same size, the old code was tricky convoluted - + it had to walk two different bios at the same time, keeping both bi_idx and + and offset into the current biovec for each. + + The new code is much more straightforward - have a look. This sort of + pattern comes up in a lot of places; a lot of drivers were essentially open + coding bvec iterators before, and having common implementation considerably + simplifies a lot of code. + + * Before, any code that might need to use the biovec after the bio had been + completed (perhaps to copy the data somewhere else, or perhaps to resubmit + it somewhere else if there was an error) had to save the entire bvec array + - again, this was being done in a fair number of places. + + * Biovecs can be shared between multiple bios - a bvec iter can represent an + arbitrary range of an existing biovec, both starting and ending midway + through biovecs. This is what enables efficient splitting of arbitrary + bios. Note that this means we _only_ use bi_size to determine when we've + reached the end of a bio, not bi_vcnt - and the bio_iovec() macro takes + bi_size into account when constructing biovecs. + + * Splitting bios is now much simpler. The old bio_split() didn't even work on + bios with more than a single bvec! Now, we can efficiently split arbitrary + size bios - because the new bio can share the old bio's biovec. + + Care must be taken to ensure the biovec isn't freed while the split bio is + still using it, in case the original bio completes first, though. Using + bio_chain() when splitting bios helps with this. + + * Submitting partially completed bios is now perfectly fine - this comes up + occasionally in stacking block drivers and various code (e.g. md and + bcache) had some ugly workarounds for this. + + It used to be the case that submitting a partially completed bio would work + fine to _most_ devices, but since accessing the raw bvec array was the + norm, not all drivers would respect bi_idx and those would break. Now, + since all drivers _must_ go through the bvec iterator - and have been + audited to make sure they are - submitting partially completed bios is + perfectly fine. + +Other implications: +=================== + + * Almost all usage of bi_idx is now incorrect and has been removed; instead, + where previously you would have used bi_idx you'd now use a bvec_iter, + probably passing it to one of the helper macros. + + I.e. instead of using bio_iovec_idx() (or bio->bi_iovec[bio->bi_idx]), you + now use bio_iter_iovec(), which takes a bvec_iter and returns a + literal struct bio_vec - constructed on the fly from the raw biovec but + taking into account bi_bvec_done (and bi_size). + + * bi_vcnt can't be trusted or relied upon by driver code - i.e. anything that + doesn't actually own the bio. The reason is twofold: firstly, it's not + actually needed for iterating over the bio anymore - we only use bi_size. + Secondly, when cloning a bio and reusing (a portion of) the original bio's + biovec, in order to calculate bi_vcnt for the new bio we'd have to iterate + over all the biovecs in the new bio - which is silly as it's not needed. + + So, don't use bi_vcnt anymore. diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index f4e5440aba05..929468e1512a 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -1546,7 +1546,7 @@ static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio) err = _drbd_no_send_page(mdev, bvec.bv_page, bvec.bv_offset, bvec.bv_len, - bio_iter_last(bio, iter) + bio_iter_last(bvec, iter) ? 0 : MSG_MORE); if (err) return err; @@ -1565,7 +1565,7 @@ static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio) err = _drbd_send_page(mdev, bvec.bv_page, bvec.bv_offset, bvec.bv_len, - bio_iter_last(bio, iter) ? 0 : MSG_MORE); + bio_iter_last(bvec, iter) ? 0 : MSG_MORE); if (err) return err; } diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index aa362f493216..55298db36b2d 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -278,7 +278,7 @@ static int nbd_send_req(struct nbd_device *nbd, struct request *req) */ rq_for_each_segment(bvec, req, iter) { flags = 0; - if (!rq_iter_last(req, iter)) + if (!rq_iter_last(bvec, iter)) flags = MSG_MORE; dprintk(DBG_TX, "%s: request %p: sending %d bytes data\n", nbd->disk->disk_name, req, bvec.bv_len); diff --git a/fs/bio.c b/fs/bio.c index 8b7f14a95503..07b4b7afa695 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -532,13 +532,11 @@ void __bio_clone(struct bio *bio, struct bio *bio_src) * most users will be overriding ->bi_bdev with a new target, * so we don't set nor calculate new physical/hw segment counts here */ - bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector; bio->bi_bdev = bio_src->bi_bdev; bio->bi_flags |= 1 << BIO_CLONED; bio->bi_rw = bio_src->bi_rw; bio->bi_vcnt = bio_src->bi_vcnt; - bio->bi_iter.bi_size = bio_src->bi_iter.bi_size; - bio->bi_iter.bi_idx = bio_src->bi_iter.bi_idx; + bio->bi_iter = bio_src->bi_iter; } EXPORT_SYMBOL(__bio_clone); @@ -808,28 +806,7 @@ void bio_advance(struct bio *bio, unsigned bytes) if (bio_integrity(bio)) bio_integrity_advance(bio, bytes); - bio->bi_iter.bi_sector += bytes >> 9; - bio->bi_iter.bi_size -= bytes; - - if (bio->bi_rw & BIO_NO_ADVANCE_ITER_MASK) - return; - - while (bytes) { - if (unlikely(bio->bi_iter.bi_idx >= bio->bi_vcnt)) { - WARN_ONCE(1, "bio idx %d >= vcnt %d\n", - bio->bi_iter.bi_idx, bio->bi_vcnt); - break; - } - - if (bytes >= bio_iovec(bio).bv_len) { - bytes -= bio_iovec(bio).bv_len; - bio->bi_iter.bi_idx++; - } else { - bio_iovec(bio).bv_len -= bytes; - bio_iovec(bio).bv_offset += bytes; - bytes = 0; - } - } + bio_advance_iter(bio, &bio->bi_iter, bytes); } EXPORT_SYMBOL(bio_advance); diff --git a/include/linux/bio.h b/include/linux/bio.h index c16adb5f69f8..04e592e74c92 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -64,11 +64,38 @@ #define bio_iovec_idx(bio, idx) (&((bio)->bi_io_vec[(idx)])) #define __bio_iovec(bio) bio_iovec_idx((bio), (bio)->bi_iter.bi_idx) -#define bio_iter_iovec(bio, iter) ((bio)->bi_io_vec[(iter).bi_idx]) +#define __bvec_iter_bvec(bvec, iter) (&(bvec)[(iter).bi_idx]) -#define bio_page(bio) (bio_iovec((bio)).bv_page) -#define bio_offset(bio) (bio_iovec((bio)).bv_offset) -#define bio_iovec(bio) (*__bio_iovec(bio)) +#define bvec_iter_page(bvec, iter) \ + (__bvec_iter_bvec((bvec), (iter))->bv_page) + +#define bvec_iter_len(bvec, iter) \ + min((iter).bi_size, \ + __bvec_iter_bvec((bvec), (iter))->bv_len - (iter).bi_bvec_done) + +#define bvec_iter_offset(bvec, iter) \ + (__bvec_iter_bvec((bvec), (iter))->bv_offset + (iter).bi_bvec_done) + +#define bvec_iter_bvec(bvec, iter) \ +((struct bio_vec) { \ + .bv_page = bvec_iter_page((bvec), (iter)), \ + .bv_len = bvec_iter_len((bvec), (iter)), \ + .bv_offset = bvec_iter_offset((bvec), (iter)), \ +}) + +#define bio_iter_iovec(bio, iter) \ + bvec_iter_bvec((bio)->bi_io_vec, (iter)) + +#define bio_iter_page(bio, iter) \ + bvec_iter_page((bio)->bi_io_vec, (iter)) +#define bio_iter_len(bio, iter) \ + bvec_iter_len((bio)->bi_io_vec, (iter)) +#define bio_iter_offset(bio, iter) \ + bvec_iter_offset((bio)->bi_io_vec, (iter)) + +#define bio_page(bio) bio_iter_page((bio), (bio)->bi_iter) +#define bio_offset(bio) bio_iter_offset((bio), (bio)->bi_iter) +#define bio_iovec(bio) bio_iter_iovec((bio), (bio)->bi_iter) #define bio_segments(bio) ((bio)->bi_vcnt - (bio)->bi_iter.bi_idx) #define bio_sectors(bio) ((bio)->bi_iter.bi_size >> 9) @@ -145,16 +172,54 @@ static inline void *bio_data(struct bio *bio) bvl = bio_iovec_idx((bio), (i)), i < (bio)->bi_vcnt; \ i++) +static inline void bvec_iter_advance(struct bio_vec *bv, struct bvec_iter *iter, + unsigned bytes) +{ + WARN_ONCE(bytes > iter->bi_size, + "Attempted to advance past end of bvec iter\n"); + + while (bytes) { + unsigned len = min(bytes, bvec_iter_len(bv, *iter)); + + bytes -= len; + iter->bi_size -= len; + iter->bi_bvec_done += len; + + if (iter->bi_bvec_done == __bvec_iter_bvec(bv, *iter)->bv_len) { + iter->bi_bvec_done = 0; + iter->bi_idx++; + } + } +} + +#define for_each_bvec(bvl, bio_vec, iter, start) \ + for ((iter) = start; \ + (bvl) = bvec_iter_bvec((bio_vec), (iter)), \ + (iter).bi_size; \ + bvec_iter_advance((bio_vec), &(iter), (bvl).bv_len)) + + +static inline void bio_advance_iter(struct bio *bio, struct bvec_iter *iter, + unsigned bytes) +{ + iter->bi_sector += bytes >> 9; + + if (bio->bi_rw & BIO_NO_ADVANCE_ITER_MASK) + iter->bi_size -= bytes; + else + bvec_iter_advance(bio->bi_io_vec, iter, bytes); +} + #define __bio_for_each_segment(bvl, bio, iter, start) \ for (iter = (start); \ - bvl = bio_iter_iovec((bio), (iter)), \ - (iter).bi_idx < (bio)->bi_vcnt; \ - (iter).bi_idx++) + (iter).bi_size && \ + ((bvl = bio_iter_iovec((bio), (iter))), 1); \ + bio_advance_iter((bio), &(iter), (bvl).bv_len)) #define bio_for_each_segment(bvl, bio, iter) \ __bio_for_each_segment(bvl, bio, iter, (bio)->bi_iter) -#define bio_iter_last(bio, iter) ((iter).bi_idx == (bio)->bi_vcnt - 1) +#define bio_iter_last(bvec, iter) ((iter).bi_size == (bvec).bv_len) /* * get a reference to a bio, so it won't disappear. the intended use is diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 29b5b84d8a29..d369f8f6af79 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -34,6 +34,9 @@ struct bvec_iter { unsigned int bi_size; /* residual I/O count */ unsigned int bi_idx; /* current index into bvl_vec */ + + unsigned int bi_bvec_done; /* number of bytes completed in + current bvec */ }; /* diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 337b92a54658..02cb6f0ea71d 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -750,9 +750,9 @@ struct req_iterator { __rq_for_each_bio(_iter.bio, _rq) \ bio_for_each_segment(bvl, _iter.bio, _iter.iter) -#define rq_iter_last(rq, _iter) \ +#define rq_iter_last(bvec, _iter) \ (_iter.bio->bi_next == NULL && \ - bio_iter_last(_iter.bio, _iter.iter)) + bio_iter_last(bvec, _iter.iter)) #ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE # error "You should define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE for your platform" -- cgit v1.2.3