diff options
author | Stephen Rothwell <sfr@canb.auug.org.au> | 2011-03-11 11:31:58 +1100 |
---|---|---|
committer | Stephen Rothwell <sfr@canb.auug.org.au> | 2011-03-11 11:31:58 +1100 |
commit | 9daf1e0b4e53d211c239b7871aca1f3d7d7aeb2c (patch) | |
tree | 7f61568add083645a6de31a3653cd4de65f56ea3 | |
parent | 6c601062a4906ea362ed46be5b098a2abc79cd2d (diff) | |
parent | 2765df7da540687c4d57ca840182122f074c5b9c (diff) |
Merge remote-tracking branch 'ubifs/linux-next'
-rw-r--r-- | drivers/mtd/ubi/build.c | 14 | ||||
-rw-r--r-- | drivers/mtd/ubi/kapi.c | 2 | ||||
-rw-r--r-- | drivers/mtd/ubi/ubi.h | 3 | ||||
-rw-r--r-- | fs/ubifs/commit.c | 58 | ||||
-rw-r--r-- | fs/ubifs/debug.c | 6 | ||||
-rw-r--r-- | fs/ubifs/io.c | 201 | ||||
-rw-r--r-- | fs/ubifs/recovery.c | 44 | ||||
-rw-r--r-- | fs/ubifs/scan.c | 2 | ||||
-rw-r--r-- | fs/ubifs/super.c | 31 | ||||
-rw-r--r-- | fs/ubifs/tnc.c | 10 | ||||
-rw-r--r-- | fs/ubifs/ubifs.h | 31 | ||||
-rw-r--r-- | include/linux/mtd/ubi.h | 22 |
12 files changed, 336 insertions, 88 deletions
diff --git a/drivers/mtd/ubi/build.c b/drivers/mtd/ubi/build.c index 5ebe280225d6..f38e8de81811 100644 --- a/drivers/mtd/ubi/build.c +++ b/drivers/mtd/ubi/build.c @@ -690,11 +690,25 @@ static int io_init(struct ubi_device *ubi) ubi_assert(ubi->hdrs_min_io_size <= ubi->min_io_size); ubi_assert(ubi->min_io_size % ubi->hdrs_min_io_size == 0); + ubi->max_write_size = ubi->mtd->writebufsize; + /* + * Maximum write size has to be greater or equivalent to min. I/O + * size, and be multiple of min. I/O size. + */ + if (ubi->max_write_size < ubi->min_io_size || + ubi->max_write_size % ubi->min_io_size || + !is_power_of_2(ubi->max_write_size)) { + ubi_err("bad write buffer size %d for %d min. I/O unit", + ubi->max_write_size, ubi->min_io_size); + return -EINVAL; + } + /* Calculate default aligned sizes of EC and VID headers */ ubi->ec_hdr_alsize = ALIGN(UBI_EC_HDR_SIZE, ubi->hdrs_min_io_size); ubi->vid_hdr_alsize = ALIGN(UBI_VID_HDR_SIZE, ubi->hdrs_min_io_size); dbg_msg("min_io_size %d", ubi->min_io_size); + dbg_msg("max_write_size %d", ubi->max_write_size); dbg_msg("hdrs_min_io_size %d", ubi->hdrs_min_io_size); dbg_msg("ec_hdr_alsize %d", ubi->ec_hdr_alsize); dbg_msg("vid_hdr_alsize %d", ubi->vid_hdr_alsize); diff --git a/drivers/mtd/ubi/kapi.c b/drivers/mtd/ubi/kapi.c index 69fa4ef03c53..d39716e5b204 100644 --- a/drivers/mtd/ubi/kapi.c +++ b/drivers/mtd/ubi/kapi.c @@ -40,7 +40,9 @@ void ubi_do_get_device_info(struct ubi_device *ubi, struct ubi_device_info *di) { di->ubi_num = ubi->ubi_num; di->leb_size = ubi->leb_size; + di->leb_start = ubi->leb_start; di->min_io_size = ubi->min_io_size; + di->max_write_size = ubi->max_write_size; di->ro_mode = ubi->ro_mode; di->cdev = ubi->cdev.dev; } diff --git a/drivers/mtd/ubi/ubi.h b/drivers/mtd/ubi/ubi.h index 0b0149c41fe3..b78994330ebc 100644 --- a/drivers/mtd/ubi/ubi.h +++ b/drivers/mtd/ubi/ubi.h @@ -381,6 +381,8 @@ struct ubi_wl_entry; * @bad_allowed: whether the MTD device admits of bad physical eraseblocks or * not * @nor_flash: non-zero if working on top of NOR flash + * @max_write_size: maximum amount of bytes the underlying flash can write at a + * time (MTD write buffer size) * @mtd: MTD device descriptor * * @peb_buf1: a buffer of PEB size used for different purposes @@ -464,6 +466,7 @@ struct ubi_device { int vid_hdr_shift; unsigned int bad_allowed:1; unsigned int nor_flash:1; + int max_write_size; struct mtd_info *mtd; void *peb_buf1; diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c index 02429d81ca33..b148fbc80f8d 100644 --- a/fs/ubifs/commit.c +++ b/fs/ubifs/commit.c @@ -48,6 +48,56 @@ #include <linux/slab.h> #include "ubifs.h" +/* + * nothing_to_commit - check if there is nothing to commit. + * @c: UBIFS file-system description object + * + * This is a helper function which checks if there is anything to commit. It is + * used as an optimization to avoid starting the commit if it is not really + * necessary. Indeed, the commit operation always assumes flash I/O (e.g., + * writing the commit start node to the log), and it is better to avoid doing + * this unnecessarily. E.g., 'ubifs_sync_fs()' runs the commit, but if there is + * nothing to commit, it is more optimal to avoid any flash I/O. + * + * This function has to be called with @c->commit_sem locked for writing - + * this function does not take LPT/TNC locks because the @c->commit_sem + * guarantees that we have exclusive access to the TNC and LPT data structures. + * + * This function returns %1 if there is nothing to commit and %0 otherwise. + */ +static int nothing_to_commit(struct ubifs_info *c) +{ + /* + * During mounting or remounting from R/O mode to R/W mode we may + * commit for various recovery-related reasons. + */ + if (c->mounting || c->remounting_rw) + return 0; + + /* + * If the root TNC node is dirty, we definitely have something to + * commit. + */ + if (c->zroot.znode && test_bit(DIRTY_ZNODE, &c->zroot.znode->flags)) + return 0; + + /* + * Even though the TNC is clean, the LPT tree may have dirty nodes. For + * example, this may happen if the budgeting subsystem invoked GC to + * make some free space, and the GC found an LEB with only dirty and + * free space. In this case GC would just change the lprops of this + * LEB (by turning all space into free space) and unmap it. + */ + if (c->nroot && test_bit(DIRTY_CNODE, &c->nroot->flags)) + return 0; + + ubifs_assert(atomic_long_read(&c->dirty_zn_cnt) == 0); + ubifs_assert(c->dirty_pn_cnt == 0); + ubifs_assert(c->dirty_nn_cnt == 0); + + return 1; +} + /** * do_commit - commit the journal. * @c: UBIFS file-system description object @@ -70,6 +120,12 @@ static int do_commit(struct ubifs_info *c) goto out_up; } + if (nothing_to_commit(c)) { + up_write(&c->commit_sem); + err = 0; + goto out_cancel; + } + /* Sync all write buffers (necessary for recovery) */ for (i = 0; i < c->jhead_cnt; i++) { err = ubifs_wbuf_sync(&c->jheads[i].wbuf); @@ -162,12 +218,12 @@ static int do_commit(struct ubifs_info *c) if (err) goto out; +out_cancel: spin_lock(&c->cs_lock); c->cmt_state = COMMIT_RESTING; wake_up(&c->cmt_wq); dbg_cmt("commit end"); spin_unlock(&c->cs_lock); - return 0; out_up: diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 0bee4dbffc31..bcb1acb79263 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -2813,19 +2813,19 @@ int dbg_debugfs_init_fs(struct ubifs_info *c) } fname = "dump_lprops"; - dent = debugfs_create_file(fname, S_IWUGO, d->dfs_dir, c, &dfs_fops); + dent = debugfs_create_file(fname, S_IWUSR, d->dfs_dir, c, &dfs_fops); if (IS_ERR(dent)) goto out_remove; d->dfs_dump_lprops = dent; fname = "dump_budg"; - dent = debugfs_create_file(fname, S_IWUGO, d->dfs_dir, c, &dfs_fops); + dent = debugfs_create_file(fname, S_IWUSR, d->dfs_dir, c, &dfs_fops); if (IS_ERR(dent)) goto out_remove; d->dfs_dump_budg = dent; fname = "dump_tnc"; - dent = debugfs_create_file(fname, S_IWUGO, d->dfs_dir, c, &dfs_fops); + dent = debugfs_create_file(fname, S_IWUSR, d->dfs_dir, c, &dfs_fops); if (IS_ERR(dent)) goto out_remove; d->dfs_dump_tnc = dent; diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c index d82173182eeb..dfd168b7807e 100644 --- a/fs/ubifs/io.c +++ b/fs/ubifs/io.c @@ -31,6 +31,26 @@ * buffer is full or when it is not used for some time (by timer). This is * similar to the mechanism is used by JFFS2. * + * UBIFS distinguishes between minimum write size (@c->min_io_size) and maximum + * write size (@c->max_write_size). The latter is the maximum amount of bytes + * the underlying flash is able to program at a time, and writing in + * @c->max_write_size units should presumably be faster. Obviously, + * @c->min_io_size <= @c->max_write_size. Write-buffers are of + * @c->max_write_size bytes in size for maximum performance. However, when a + * write-buffer is flushed, only the portion of it (aligned to @c->min_io_size + * boundary) which contains data is written, not the whole write-buffer, + * because this is more space-efficient. + * + * This optimization adds few complications to the code. Indeed, on the one + * hand, we want to write in optimal @c->max_write_size bytes chunks, which + * also means aligning writes at the @c->max_write_size bytes offsets. On the + * other hand, we do not want to waste space when synchronizing the write + * buffer, so during synchronization we writes in smaller chunks. And this makes + * the next write offset to be not aligned to @c->max_write_size bytes. So the + * have to make sure that the write-buffer offset (@wbuf->offs) becomes aligned + * to @c->max_write_size bytes again. We do this by temporarily shrinking + * write-buffer size (@wbuf->size). + * * Write-buffers are defined by 'struct ubifs_wbuf' objects and protected by * mutexes defined inside these objects. Since sometimes upper-level code * has to lock the write-buffer (e.g. journal space reservation code), many @@ -46,8 +66,8 @@ * UBIFS uses padding when it pads to the next min. I/O unit. In this case it * uses padding nodes or padding bytes, if the padding node does not fit. * - * All UBIFS nodes are protected by CRC checksums and UBIFS checks all nodes - * every time they are read from the flash media. + * All UBIFS nodes are protected by CRC checksums and UBIFS checks CRC when + * they are read from the flash media. */ #include <linux/crc32.h> @@ -88,8 +108,12 @@ void ubifs_ro_mode(struct ubifs_info *c, int err) * This function may skip data nodes CRC checking if @c->no_chk_data_crc is * true, which is controlled by corresponding UBIFS mount option. However, if * @must_chk_crc is true, then @c->no_chk_data_crc is ignored and CRC is - * checked. Similarly, if @c->always_chk_crc is true, @c->no_chk_data_crc is - * ignored and CRC is checked. + * checked. Similarly, if @c->mounting or @c->remounting_rw is true (we are + * mounting or re-mounting to R/W mode), @c->no_chk_data_crc is ignored and CRC + * is checked. This is because during mounting or re-mounting from R/O mode to + * R/W mode we may read journal nodes (when replying the journal or doing the + * recovery) and the journal nodes may potentially be corrupted, so checking is + * required. * * This function returns zero in case of success and %-EUCLEAN in case of bad * CRC or magic. @@ -131,8 +155,8 @@ int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum, node_len > c->ranges[type].max_len) goto out_len; - if (!must_chk_crc && type == UBIFS_DATA_NODE && !c->always_chk_crc && - c->no_chk_data_crc) + if (!must_chk_crc && type == UBIFS_DATA_NODE && !c->mounting && + !c->remounting_rw && c->no_chk_data_crc) return 0; crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8); @@ -343,11 +367,17 @@ static void cancel_wbuf_timer_nolock(struct ubifs_wbuf *wbuf) * * This function synchronizes write-buffer @buf and returns zero in case of * success or a negative error code in case of failure. + * + * Note, although write-buffers are of @c->max_write_size, this function does + * not necessarily writes all @c->max_write_size bytes to the flash. Instead, + * if the write-buffer is only partially filled with data, only the used part + * of the write-buffer (aligned on @c->min_io_size boundary) is synchronized. + * This way we waste less space. */ int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf) { struct ubifs_info *c = wbuf->c; - int err, dirt; + int err, dirt, sync_len; cancel_wbuf_timer_nolock(wbuf); if (!wbuf->used || wbuf->lnum == -1) @@ -357,27 +387,53 @@ int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf) dbg_io("LEB %d:%d, %d bytes, jhead %s", wbuf->lnum, wbuf->offs, wbuf->used, dbg_jhead(wbuf->jhead)); ubifs_assert(!(wbuf->avail & 7)); - ubifs_assert(wbuf->offs + c->min_io_size <= c->leb_size); + ubifs_assert(wbuf->offs + wbuf->size <= c->leb_size); + ubifs_assert(wbuf->size >= c->min_io_size); + ubifs_assert(wbuf->size <= c->max_write_size); + ubifs_assert(wbuf->size % c->min_io_size == 0); ubifs_assert(!c->ro_media && !c->ro_mount); + if (c->leb_size - wbuf->offs >= c->max_write_size) + ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size )); if (c->ro_error) return -EROFS; - ubifs_pad(c, wbuf->buf + wbuf->used, wbuf->avail); + /* + * Do not write whole write buffer but write only the minimum necessary + * amount of min. I/O units. + */ + sync_len = ALIGN(wbuf->used, c->min_io_size); + dirt = sync_len - wbuf->used; + if (dirt) + ubifs_pad(c, wbuf->buf + wbuf->used, dirt); err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs, - c->min_io_size, wbuf->dtype); + sync_len, wbuf->dtype); if (err) { ubifs_err("cannot write %d bytes to LEB %d:%d", - c->min_io_size, wbuf->lnum, wbuf->offs); + sync_len, wbuf->lnum, wbuf->offs); dbg_dump_stack(); return err; } - dirt = wbuf->avail; - spin_lock(&wbuf->lock); - wbuf->offs += c->min_io_size; - wbuf->avail = c->min_io_size; + wbuf->offs += sync_len; + /* + * Now @wbuf->offs is not necessarily aligned to @c->max_write_size. + * But our goal is to optimize writes and make sure we write in + * @c->max_write_size chunks and to @c->max_write_size-aligned offset. + * Thus, if @wbuf->offs is not aligned to @c->max_write_size now, make + * sure that @wbuf->offs + @wbuf->size is aligned to + * @c->max_write_size. This way we make sure that after next + * write-buffer flush we are again at the optimal offset (aligned to + * @c->max_write_size). + */ + if (c->leb_size - wbuf->offs < c->max_write_size) + wbuf->size = c->leb_size - wbuf->offs; + else if (wbuf->offs & (c->max_write_size - 1)) + wbuf->size = ALIGN(wbuf->offs, c->max_write_size) - wbuf->offs; + else + wbuf->size = c->max_write_size; + wbuf->avail = wbuf->size; wbuf->used = 0; wbuf->next_ino = 0; spin_unlock(&wbuf->lock); @@ -420,7 +476,13 @@ int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs, spin_lock(&wbuf->lock); wbuf->lnum = lnum; wbuf->offs = offs; - wbuf->avail = c->min_io_size; + if (c->leb_size - wbuf->offs < c->max_write_size) + wbuf->size = c->leb_size - wbuf->offs; + else if (wbuf->offs & (c->max_write_size - 1)) + wbuf->size = ALIGN(wbuf->offs, c->max_write_size) - wbuf->offs; + else + wbuf->size = c->max_write_size; + wbuf->avail = wbuf->size; wbuf->used = 0; spin_unlock(&wbuf->lock); wbuf->dtype = dtype; @@ -500,8 +562,9 @@ out_timers: * * This function writes data to flash via write-buffer @wbuf. This means that * the last piece of the node won't reach the flash media immediately if it - * does not take whole minimal I/O unit. Instead, the node will sit in RAM - * until the write-buffer is synchronized (e.g., by timer). + * does not take whole max. write unit (@c->max_write_size). Instead, the node + * will sit in RAM until the write-buffer is synchronized (e.g., by timer, or + * because more data are appended to the write-buffer). * * This function returns zero in case of success and a negative error code in * case of failure. If the node cannot be written because there is no more @@ -518,9 +581,14 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) ubifs_assert(len > 0 && wbuf->lnum >= 0 && wbuf->lnum < c->leb_cnt); ubifs_assert(wbuf->offs >= 0 && wbuf->offs % c->min_io_size == 0); ubifs_assert(!(wbuf->offs & 7) && wbuf->offs <= c->leb_size); - ubifs_assert(wbuf->avail > 0 && wbuf->avail <= c->min_io_size); + ubifs_assert(wbuf->avail > 0 && wbuf->avail <= wbuf->size); + ubifs_assert(wbuf->size >= c->min_io_size); + ubifs_assert(wbuf->size <= c->max_write_size); + ubifs_assert(wbuf->size % c->min_io_size == 0); ubifs_assert(mutex_is_locked(&wbuf->io_mutex)); ubifs_assert(!c->ro_media && !c->ro_mount); + if (c->leb_size - wbuf->offs >= c->max_write_size) + ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size )); if (c->leb_size - wbuf->offs - wbuf->used < aligned_len) { err = -ENOSPC; @@ -543,14 +611,18 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) dbg_io("flush jhead %s wbuf to LEB %d:%d", dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs); err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, - wbuf->offs, c->min_io_size, + wbuf->offs, wbuf->size, wbuf->dtype); if (err) goto out; spin_lock(&wbuf->lock); - wbuf->offs += c->min_io_size; - wbuf->avail = c->min_io_size; + wbuf->offs += wbuf->size; + if (c->leb_size - wbuf->offs >= c->max_write_size) + wbuf->size = c->max_write_size; + else + wbuf->size = c->leb_size - wbuf->offs; + wbuf->avail = wbuf->size; wbuf->used = 0; wbuf->next_ino = 0; spin_unlock(&wbuf->lock); @@ -564,33 +636,57 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) goto exit; } - /* - * The node is large enough and does not fit entirely within current - * minimal I/O unit. We have to fill and flush write-buffer and switch - * to the next min. I/O unit. - */ - dbg_io("flush jhead %s wbuf to LEB %d:%d", - dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs); - memcpy(wbuf->buf + wbuf->used, buf, wbuf->avail); - err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs, - c->min_io_size, wbuf->dtype); - if (err) - goto out; + offs = wbuf->offs; + written = 0; - offs = wbuf->offs + c->min_io_size; - len -= wbuf->avail; - aligned_len -= wbuf->avail; - written = wbuf->avail; + if (wbuf->used) { + /* + * The node is large enough and does not fit entirely within + * current available space. We have to fill and flush + * write-buffer and switch to the next max. write unit. + */ + dbg_io("flush jhead %s wbuf to LEB %d:%d", + dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs); + memcpy(wbuf->buf + wbuf->used, buf, wbuf->avail); + err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs, + wbuf->size, wbuf->dtype); + if (err) + goto out; + + offs += wbuf->size; + len -= wbuf->avail; + aligned_len -= wbuf->avail; + written += wbuf->avail; + } else if (wbuf->offs & (c->max_write_size - 1)) { + /* + * The write-buffer offset is not aligned to + * @c->max_write_size and @wbuf->size is less than + * @c->max_write_size. Write @wbuf->size bytes to make sure the + * following writes are done in optimal @c->max_write_size + * chunks. + */ + dbg_io("write %d bytes to LEB %d:%d", + wbuf->size, wbuf->lnum, wbuf->offs); + err = ubi_leb_write(c->ubi, wbuf->lnum, buf, wbuf->offs, + wbuf->size, wbuf->dtype); + if (err) + goto out; + + offs += wbuf->size; + len -= wbuf->size; + aligned_len -= wbuf->size; + written += wbuf->size; + } /* - * The remaining data may take more whole min. I/O units, so write the - * remains multiple to min. I/O unit size directly to the flash media. + * The remaining data may take more whole max. write units, so write the + * remains multiple to max. write unit size directly to the flash media. * We align node length to 8-byte boundary because we anyway flash wbuf * if the remaining space is less than 8 bytes. */ - n = aligned_len >> c->min_io_shift; + n = aligned_len >> c->max_write_shift; if (n) { - n <<= c->min_io_shift; + n <<= c->max_write_shift; dbg_io("write %d bytes to LEB %d:%d", n, wbuf->lnum, offs); err = ubi_leb_write(c->ubi, wbuf->lnum, buf + written, offs, n, wbuf->dtype); @@ -606,14 +702,18 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) if (aligned_len) /* * And now we have what's left and what does not take whole - * min. I/O unit, so write it to the write-buffer and we are + * max. write unit, so write it to the write-buffer and we are * done. */ memcpy(wbuf->buf, buf + written, len); wbuf->offs = offs; + if (c->leb_size - wbuf->offs >= c->max_write_size) + wbuf->size = c->max_write_size; + else + wbuf->size = c->leb_size - wbuf->offs; + wbuf->avail = wbuf->size - aligned_len; wbuf->used = aligned_len; - wbuf->avail = c->min_io_size - aligned_len; wbuf->next_ino = 0; spin_unlock(&wbuf->lock); @@ -837,11 +937,11 @@ int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf) { size_t size; - wbuf->buf = kmalloc(c->min_io_size, GFP_KERNEL); + wbuf->buf = kmalloc(c->max_write_size, GFP_KERNEL); if (!wbuf->buf) return -ENOMEM; - size = (c->min_io_size / UBIFS_CH_SZ + 1) * sizeof(ino_t); + size = (c->max_write_size / UBIFS_CH_SZ + 1) * sizeof(ino_t); wbuf->inodes = kmalloc(size, GFP_KERNEL); if (!wbuf->inodes) { kfree(wbuf->buf); @@ -851,7 +951,14 @@ int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf) wbuf->used = 0; wbuf->lnum = wbuf->offs = -1; - wbuf->avail = c->min_io_size; + /* + * If the LEB starts at the max. write size aligned address, then + * write-buffer size has to be set to @c->max_write_size. Otherwise, + * set it to something smaller so that it ends at the closest max. + * write size boundary. + */ + size = c->max_write_size - (c->leb_start % c->max_write_size); + wbuf->avail = wbuf->size = size; wbuf->dtype = UBI_UNKNOWN; wbuf->sync_callback = NULL; mutex_init(&wbuf->io_mutex); diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c index 77e9b874b6c2..936f2cbfe6b6 100644 --- a/fs/ubifs/recovery.c +++ b/fs/ubifs/recovery.c @@ -28,6 +28,23 @@ * UBIFS always cleans away all remnants of an unclean un-mount, so that * errors do not accumulate. However UBIFS defers recovery if it is mounted * read-only, and the flash is not modified in that case. + * + * The general UBIFS approach to the recovery is that it recovers from + * corruptions which could be caused by power cuts, but it refuses to recover + * from corruption caused by other reasons. And UBIFS tries to distinguish + * between these 2 reasons of corruptions and silently recover in the former + * case and loudly complain in the latter case. + * + * UBIFS writes only to erased LEBs, so it writes only to the flash space + * containing only 0xFFs. UBIFS also always writes strictly from the beginning + * of the LEB to the end. And UBIFS assumes that the underlying flash media + * writes in @c->max_write_size bytes at a time. + * + * Hence, if UBIFS finds a corrupted node at offset X, it expects only the min. + * I/O unit corresponding to offset X to contain corrupted data, all the + * following min. I/O units have to contain empty space (all 0xFFs). If this is + * not true, the corruption cannot be the result of a power cut, and UBIFS + * refuses to mount. */ #include <linux/crc32.h> @@ -362,8 +379,9 @@ int ubifs_write_rcvrd_mst_node(struct ubifs_info *c) * @offs: offset to check * * This function returns %1 if @offs was in the last write to the LEB whose data - * is in @buf, otherwise %0 is returned. The determination is made by checking - * for subsequent empty space starting from the next @c->min_io_size boundary. + * is in @buf, otherwise %0 is returned. The determination is made by checking + * for subsequent empty space starting from the next @c->max_write_size + * boundary. */ static int is_last_write(const struct ubifs_info *c, void *buf, int offs) { @@ -371,10 +389,10 @@ static int is_last_write(const struct ubifs_info *c, void *buf, int offs) uint8_t *p; /* - * Round up to the next @c->min_io_size boundary i.e. @offs is in the - * last wbuf written. After that should be empty space. + * Round up to the next @c->max_write_size boundary i.e. @offs is in + * the last wbuf written. After that should be empty space. */ - empty_offs = ALIGN(offs + 1, c->min_io_size); + empty_offs = ALIGN(offs + 1, c->max_write_size); check_len = c->leb_size - empty_offs; p = buf + empty_offs - offs; return is_empty(p, check_len); @@ -429,7 +447,7 @@ static int no_more_nodes(const struct ubifs_info *c, void *buf, int len, int skip, dlen = le32_to_cpu(ch->len); /* Check for empty space after the corrupt node's common header */ - skip = ALIGN(offs + UBIFS_CH_SZ, c->min_io_size) - offs; + skip = ALIGN(offs + UBIFS_CH_SZ, c->max_write_size) - offs; if (is_empty(buf + skip, len - skip)) return 1; /* @@ -441,7 +459,7 @@ static int no_more_nodes(const struct ubifs_info *c, void *buf, int len, return 0; } /* Now we know the corrupt node's length we can skip over it */ - skip = ALIGN(offs + dlen, c->min_io_size) - offs; + skip = ALIGN(offs + dlen, c->max_write_size) - offs; /* After which there should be empty space */ if (is_empty(buf + skip, len - skip)) return 1; @@ -671,10 +689,14 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum, } else { int corruption = first_non_ff(buf, len); + /* + * See header comment for this file for more + * explanations about the reasons we have this check. + */ ubifs_err("corrupt empty space LEB %d:%d, corruption " "starts at %d", lnum, offs, corruption); /* Make sure we dump interesting non-0xFF data */ - offs = corruption; + offs += corruption; buf += corruption; goto corrupted; } @@ -836,12 +858,8 @@ struct ubifs_scan_leb *ubifs_recover_log_leb(struct ubifs_info *c, int lnum, static int recover_head(const struct ubifs_info *c, int lnum, int offs, void *sbuf) { - int len, err; + int len = c->max_write_size, err; - if (c->min_io_size > 1) - len = c->min_io_size; - else - len = 512; if (offs + len > c->leb_size) len = c->leb_size - offs; diff --git a/fs/ubifs/scan.c b/fs/ubifs/scan.c index 3e1ee57dbeaa..36216b46f772 100644 --- a/fs/ubifs/scan.c +++ b/fs/ubifs/scan.c @@ -328,7 +328,7 @@ struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum, if (!quiet) ubifs_err("empty space starts at non-aligned offset %d", offs); - goto corrupted;; + goto corrupted; } ubifs_end_scan(c, sleb, lnum, offs); diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 6e11c2975dcf..c20c6d2a0779 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -512,9 +512,12 @@ static int init_constants_early(struct ubifs_info *c) c->leb_cnt = c->vi.size; c->leb_size = c->vi.usable_leb_size; + c->leb_start = c->di.leb_start; c->half_leb_size = c->leb_size / 2; c->min_io_size = c->di.min_io_size; c->min_io_shift = fls(c->min_io_size) - 1; + c->max_write_size = c->di.max_write_size; + c->max_write_shift = fls(c->max_write_size) - 1; if (c->leb_size < UBIFS_MIN_LEB_SZ) { ubifs_err("too small LEBs (%d bytes), min. is %d bytes", @@ -534,6 +537,18 @@ static int init_constants_early(struct ubifs_info *c) } /* + * Maximum write size has to be greater or equivalent to min. I/O + * size, and be multiple of min. I/O size. + */ + if (c->max_write_size < c->min_io_size || + c->max_write_size % c->min_io_size || + !is_power_of_2(c->max_write_size)) { + ubifs_err("bad write buffer size %d for %d min. I/O unit", + c->max_write_size, c->min_io_size); + return -EINVAL; + } + + /* * UBIFS aligns all node to 8-byte boundary, so to make function in * io.c simpler, assume minimum I/O unit size to be 8 bytes if it is * less than 8. @@ -541,6 +556,10 @@ static int init_constants_early(struct ubifs_info *c) if (c->min_io_size < 8) { c->min_io_size = 8; c->min_io_shift = 3; + if (c->max_write_size < c->min_io_size) { + c->max_write_size = c->min_io_size; + c->max_write_shift = c->min_io_shift; + } } c->ref_node_alsz = ALIGN(UBIFS_REF_NODE_SZ, c->min_io_size); @@ -1202,11 +1221,7 @@ static int mount_ubifs(struct ubifs_info *c) if (c->bulk_read == 1) bu_init(c); - /* - * We have to check all CRCs, even for data nodes, when we mount the FS - * (specifically, when we are replaying). - */ - c->always_chk_crc = 1; + c->mounting = 1; err = ubifs_read_superblock(c); if (err) @@ -1382,7 +1397,7 @@ static int mount_ubifs(struct ubifs_info *c) if (err) goto out_infos; - c->always_chk_crc = 0; + c->mounting = 0; ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"", c->vi.ubi_num, c->vi.vol_id, c->vi.name); @@ -1403,6 +1418,7 @@ static int mount_ubifs(struct ubifs_info *c) dbg_msg("compiled on: " __DATE__ " at " __TIME__); dbg_msg("min. I/O unit size: %d bytes", c->min_io_size); + dbg_msg("max. write size: %d bytes", c->max_write_size); dbg_msg("LEB size: %d bytes (%d KiB)", c->leb_size, c->leb_size >> 10); dbg_msg("data journal heads: %d", @@ -1543,7 +1559,6 @@ static int ubifs_remount_rw(struct ubifs_info *c) mutex_lock(&c->umount_mutex); dbg_save_space_info(c); c->remounting_rw = 1; - c->always_chk_crc = 1; err = check_free_space(c); if (err) @@ -1650,7 +1665,6 @@ static int ubifs_remount_rw(struct ubifs_info *c) dbg_gen("re-mounted read-write"); c->ro_mount = 0; c->remounting_rw = 0; - c->always_chk_crc = 0; err = dbg_check_space_info(c); mutex_unlock(&c->umount_mutex); return err; @@ -1667,7 +1681,6 @@ out: c->ileb_buf = NULL; ubifs_lpt_free(c, 1); c->remounting_rw = 0; - c->always_chk_crc = 0; mutex_unlock(&c->umount_mutex); return err; } diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c index ad9cf0133622..de485979ca39 100644 --- a/fs/ubifs/tnc.c +++ b/fs/ubifs/tnc.c @@ -447,8 +447,11 @@ static int tnc_read_node_nm(struct ubifs_info *c, struct ubifs_zbranch *zbr, * * Note, this function does not check CRC of data nodes if @c->no_chk_data_crc * is true (it is controlled by corresponding mount option). However, if - * @c->always_chk_crc is true, @c->no_chk_data_crc is ignored and CRC is always - * checked. + * @c->mounting or @c->remounting_rw is true (we are mounting or re-mounting to + * R/W mode), @c->no_chk_data_crc is ignored and CRC is checked. This is + * because during mounting or re-mounting from R/O mode to R/W mode we may read + * journal nodes (when replying the journal or doing the recovery) and the + * journal nodes may potentially be corrupted, so checking is required. */ static int try_read_node(const struct ubifs_info *c, void *buf, int type, int len, int lnum, int offs) @@ -476,7 +479,8 @@ static int try_read_node(const struct ubifs_info *c, void *buf, int type, if (node_len != len) return 0; - if (type == UBIFS_DATA_NODE && !c->always_chk_crc && c->no_chk_data_crc) + if (type == UBIFS_DATA_NODE && c->no_chk_data_crc && !c->mounting && + !c->remounting_rw) return 1; crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8); diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 381d6b207a52..362495078489 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -646,6 +646,7 @@ typedef int (*ubifs_lpt_scan_callback)(struct ubifs_info *c, * @offs: write-buffer offset in this logical eraseblock * @avail: number of bytes available in the write-buffer * @used: number of used bytes in the write-buffer + * @size: write-buffer size (in [@c->min_io_size, @c->max_write_size] range) * @dtype: type of data stored in this LEB (%UBI_LONGTERM, %UBI_SHORTTERM, * %UBI_UNKNOWN) * @jhead: journal head the mutex belongs to (note, needed only to shut lockdep @@ -680,6 +681,7 @@ struct ubifs_wbuf { int offs; int avail; int used; + int size; int dtype; int jhead; int (*sync_callback)(struct ubifs_info *c, int lnum, int free, int pad); @@ -1024,7 +1026,12 @@ struct ubifs_debug_info; * * @min_io_size: minimal input/output unit size * @min_io_shift: number of bits in @min_io_size minus one + * @max_write_size: maximum amount of bytes the underlying flash can write at a + * time (MTD write buffer size) + * @max_write_shift: number of bits in @max_write_size minus one * @leb_size: logical eraseblock size in bytes + * @leb_start: starting offset of logical eraseblocks within physical + * eraseblocks * @half_leb_size: half LEB size * @idx_leb_size: how many bytes of an LEB are effectively available when it is * used to store indexing nodes (@leb_size - @max_idx_node_sz) @@ -1166,22 +1173,21 @@ struct ubifs_debug_info; * @rp_uid: reserved pool user ID * @rp_gid: reserved pool group ID * - * @empty: if the UBI device is empty + * @empty: %1 if the UBI device is empty + * @need_recovery: %1 if the file-system needs recovery + * @replaying: %1 during journal replay + * @mounting: %1 while mounting + * @remounting_rw: %1 while re-mounting from R/O mode to R/W mode * @replay_tree: temporary tree used during journal replay * @replay_list: temporary list used during journal replay * @replay_buds: list of buds to replay * @cs_sqnum: sequence number of first node in the log (commit start node) * @replay_sqnum: sequence number of node currently being replayed - * @need_recovery: file-system needs recovery - * @replaying: set to %1 during journal replay * @unclean_leb_list: LEBs to recover when re-mounting R/O mounted FS to R/W * mode * @rcvrd_mst_node: recovered master node to write when re-mounting R/O mounted * FS to R/W mode * @size_tree: inode size information for recovery - * @remounting_rw: set while re-mounting from R/O mode to R/W mode - * @always_chk_crc: always check CRCs (while mounting and remounting to R/W - * mode) * @mount_opts: UBIFS-specific mount options * * @dbg: debugging-related information @@ -1271,7 +1277,10 @@ struct ubifs_info { int min_io_size; int min_io_shift; + int max_write_size; + int max_write_shift; int leb_size; + int leb_start; int half_leb_size; int idx_leb_size; int leb_cnt; @@ -1402,19 +1411,19 @@ struct ubifs_info { gid_t rp_gid; /* The below fields are used only during mounting and re-mounting */ - int empty; + unsigned int empty:1; + unsigned int need_recovery:1; + unsigned int replaying:1; + unsigned int mounting:1; + unsigned int remounting_rw:1; struct rb_root replay_tree; struct list_head replay_list; struct list_head replay_buds; unsigned long long cs_sqnum; unsigned long long replay_sqnum; - int need_recovery; - int replaying; struct list_head unclean_leb_list; struct ubifs_mst_node *rcvrd_mst_node; struct rb_root size_tree; - int remounting_rw; - int always_chk_crc; struct ubifs_mount_opts mount_opts; #ifdef CONFIG_UBIFS_FS_DEBUG diff --git a/include/linux/mtd/ubi.h b/include/linux/mtd/ubi.h index b31bd9e9bca3..84854edf4436 100644 --- a/include/linux/mtd/ubi.h +++ b/include/linux/mtd/ubi.h @@ -116,18 +116,40 @@ struct ubi_volume_info { * struct ubi_device_info - UBI device description data structure. * @ubi_num: ubi device number * @leb_size: logical eraseblock size on this UBI device + * @leb_start: starting offset of logical eraseblocks within physical + * eraseblocks * @min_io_size: minimal I/O unit size + * @max_write_size: maximum amount of bytes the underlying flash can write at a + * time (MTD write buffer size) * @ro_mode: if this device is in read-only mode * @cdev: UBI character device major and minor numbers * * Note, @leb_size is the logical eraseblock size offered by the UBI device. * Volumes of this UBI device may have smaller logical eraseblock size if their * alignment is not equivalent to %1. + * + * The @max_write_size field describes flash write maximum write unit. For + * example, NOR flash allows for changing individual bytes, so @min_io_size is + * %1. However, it does not mean than NOR flash has to write data byte-by-byte. + * Instead, CFI NOR flashes have a write-buffer of, e.g., 64 bytes, and when + * writing large chunks of data, they write 64-bytes at a time. Obviously, this + * improves write throughput. + * + * Also, the MTD device may have N interleaved (striped) flash chips + * underneath, in which case @min_io_size can be physical min. I/O size of + * single flash chip, while @max_write_size can be N * @min_io_size. + * + * The @max_write_size field is always greater or equivalent to @min_io_size. + * E.g., some NOR flashes may have (@min_io_size = 1, @max_write_size = 64). In + * contrast, NAND flashes usually have @min_io_size = @max_write_size = NAND + * page size. */ struct ubi_device_info { int ubi_num; int leb_size; + int leb_start; int min_io_size; + int max_write_size; int ro_mode; dev_t cdev; }; |