diff options
Diffstat (limited to 'fs')
405 files changed, 16871 insertions, 13120 deletions
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h index 731e3d14b67d..0e8418066a48 100644 --- a/fs/9p/v9fs_vfs.h +++ b/fs/9p/v9fs_vfs.h @@ -42,6 +42,7 @@ struct inode *v9fs_alloc_inode(struct super_block *sb); void v9fs_free_inode(struct inode *inode); struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode, dev_t rdev); +void v9fs_set_netfs_context(struct inode *inode); int v9fs_init_inode(struct v9fs_session_info *v9ses, struct inode *inode, umode_t mode, dev_t rdev); void v9fs_evict_inode(struct inode *inode); diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index 8a635999a7d6..047855033d32 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -19,12 +19,45 @@ #include <linux/netfs.h> #include <net/9p/9p.h> #include <net/9p/client.h> +#include <trace/events/netfs.h> #include "v9fs.h" #include "v9fs_vfs.h" #include "cache.h" #include "fid.h" +static void v9fs_upload_to_server(struct netfs_io_subrequest *subreq) +{ + struct p9_fid *fid = subreq->rreq->netfs_priv; + int err, len; + + trace_netfs_sreq(subreq, netfs_sreq_trace_submit); + len = p9_client_write(fid, subreq->start, &subreq->io_iter, &err); + netfs_write_subrequest_terminated(subreq, len ?: err, false); +} + +static void v9fs_upload_to_server_worker(struct work_struct *work) +{ + struct netfs_io_subrequest *subreq = + container_of(work, struct netfs_io_subrequest, work); + + v9fs_upload_to_server(subreq); +} + +/* + * Set up write requests for a writeback slice. We need to add a write request + * for each write we want to make. + */ +static void v9fs_create_write_requests(struct netfs_io_request *wreq, loff_t start, size_t len) +{ + struct netfs_io_subrequest *subreq; + + subreq = netfs_create_write_request(wreq, NETFS_UPLOAD_TO_SERVER, + start, len, v9fs_upload_to_server_worker); + if (subreq) + netfs_queue_write_request(subreq); +} + /** * v9fs_issue_read - Issue a read from 9P * @subreq: The read to make @@ -33,14 +66,10 @@ static void v9fs_issue_read(struct netfs_io_subrequest *subreq) { struct netfs_io_request *rreq = subreq->rreq; struct p9_fid *fid = rreq->netfs_priv; - struct iov_iter to; - loff_t pos = subreq->start + subreq->transferred; - size_t len = subreq->len - subreq->transferred; int total, err; - iov_iter_xarray(&to, ITER_DEST, &rreq->mapping->i_pages, pos, len); - - total = p9_client_read(fid, pos, &to, &err); + total = p9_client_read(fid, subreq->start + subreq->transferred, + &subreq->io_iter, &err); /* if we just extended the file size, any portion not in * cache won't be on server and is zeroes */ @@ -50,25 +79,42 @@ static void v9fs_issue_read(struct netfs_io_subrequest *subreq) } /** - * v9fs_init_request - Initialise a read request + * v9fs_init_request - Initialise a request * @rreq: The read request * @file: The file being read from */ static int v9fs_init_request(struct netfs_io_request *rreq, struct file *file) { - struct p9_fid *fid = file->private_data; - - BUG_ON(!fid); + struct p9_fid *fid; + bool writing = (rreq->origin == NETFS_READ_FOR_WRITE || + rreq->origin == NETFS_WRITEBACK || + rreq->origin == NETFS_WRITETHROUGH || + rreq->origin == NETFS_LAUNDER_WRITE || + rreq->origin == NETFS_UNBUFFERED_WRITE || + rreq->origin == NETFS_DIO_WRITE); + + if (file) { + fid = file->private_data; + if (!fid) + goto no_fid; + p9_fid_get(fid); + } else { + fid = v9fs_fid_find_inode(rreq->inode, writing, INVALID_UID, true); + if (!fid) + goto no_fid; + } /* we might need to read from a fid that was opened write-only * for read-modify-write of page cache, use the writeback fid * for that */ - WARN_ON(rreq->origin == NETFS_READ_FOR_WRITE && - !(fid->mode & P9_ORDWR)); - - p9_fid_get(fid); + WARN_ON(rreq->origin == NETFS_READ_FOR_WRITE && !(fid->mode & P9_ORDWR)); rreq->netfs_priv = fid; return 0; + +no_fid: + WARN_ONCE(1, "folio expected an open fid inode->i_ino=%lx\n", + rreq->inode->i_ino); + return -EINVAL; } /** @@ -82,281 +128,20 @@ static void v9fs_free_request(struct netfs_io_request *rreq) p9_fid_put(fid); } -/** - * v9fs_begin_cache_operation - Begin a cache operation for a read - * @rreq: The read request - */ -static int v9fs_begin_cache_operation(struct netfs_io_request *rreq) -{ -#ifdef CONFIG_9P_FSCACHE - struct fscache_cookie *cookie = v9fs_inode_cookie(V9FS_I(rreq->inode)); - - return fscache_begin_read_operation(&rreq->cache_resources, cookie); -#else - return -ENOBUFS; -#endif -} - const struct netfs_request_ops v9fs_req_ops = { .init_request = v9fs_init_request, .free_request = v9fs_free_request, - .begin_cache_operation = v9fs_begin_cache_operation, .issue_read = v9fs_issue_read, + .create_write_requests = v9fs_create_write_requests, }; -/** - * v9fs_release_folio - release the private state associated with a folio - * @folio: The folio to be released - * @gfp: The caller's allocation restrictions - * - * Returns true if the page can be released, false otherwise. - */ - -static bool v9fs_release_folio(struct folio *folio, gfp_t gfp) -{ - if (folio_test_private(folio)) - return false; -#ifdef CONFIG_9P_FSCACHE - if (folio_test_fscache(folio)) { - if (current_is_kswapd() || !(gfp & __GFP_FS)) - return false; - folio_wait_fscache(folio); - } - fscache_note_page_release(v9fs_inode_cookie(V9FS_I(folio_inode(folio)))); -#endif - return true; -} - -static void v9fs_invalidate_folio(struct folio *folio, size_t offset, - size_t length) -{ - folio_wait_fscache(folio); -} - -#ifdef CONFIG_9P_FSCACHE -static void v9fs_write_to_cache_done(void *priv, ssize_t transferred_or_error, - bool was_async) -{ - struct v9fs_inode *v9inode = priv; - __le32 version; - - if (IS_ERR_VALUE(transferred_or_error) && - transferred_or_error != -ENOBUFS) { - version = cpu_to_le32(v9inode->qid.version); - fscache_invalidate(v9fs_inode_cookie(v9inode), &version, - i_size_read(&v9inode->netfs.inode), 0); - } -} -#endif - -static int v9fs_vfs_write_folio_locked(struct folio *folio) -{ - struct inode *inode = folio_inode(folio); - loff_t start = folio_pos(folio); - loff_t i_size = i_size_read(inode); - struct iov_iter from; - size_t len = folio_size(folio); - struct p9_fid *writeback_fid; - int err; - struct v9fs_inode __maybe_unused *v9inode = V9FS_I(inode); - struct fscache_cookie __maybe_unused *cookie = v9fs_inode_cookie(v9inode); - - if (start >= i_size) - return 0; /* Simultaneous truncation occurred */ - - len = min_t(loff_t, i_size - start, len); - - iov_iter_xarray(&from, ITER_SOURCE, &folio_mapping(folio)->i_pages, start, len); - - writeback_fid = v9fs_fid_find_inode(inode, true, INVALID_UID, true); - if (!writeback_fid) { - WARN_ONCE(1, "folio expected an open fid inode->i_private=%p\n", - inode->i_private); - return -EINVAL; - } - - folio_wait_fscache(folio); - folio_start_writeback(folio); - - p9_client_write(writeback_fid, start, &from, &err); - -#ifdef CONFIG_9P_FSCACHE - if (err == 0 && - fscache_cookie_enabled(cookie) && - test_bit(FSCACHE_COOKIE_IS_CACHING, &cookie->flags)) { - folio_start_fscache(folio); - fscache_write_to_cache(v9fs_inode_cookie(v9inode), - folio_mapping(folio), start, len, i_size, - v9fs_write_to_cache_done, v9inode, - true); - } -#endif - - folio_end_writeback(folio); - p9_fid_put(writeback_fid); - - return err; -} - -static int v9fs_vfs_writepage(struct page *page, struct writeback_control *wbc) -{ - struct folio *folio = page_folio(page); - int retval; - - p9_debug(P9_DEBUG_VFS, "folio %p\n", folio); - - retval = v9fs_vfs_write_folio_locked(folio); - if (retval < 0) { - if (retval == -EAGAIN) { - folio_redirty_for_writepage(wbc, folio); - retval = 0; - } else { - mapping_set_error(folio_mapping(folio), retval); - } - } else - retval = 0; - - folio_unlock(folio); - return retval; -} - -static int v9fs_launder_folio(struct folio *folio) -{ - int retval; - - if (folio_clear_dirty_for_io(folio)) { - retval = v9fs_vfs_write_folio_locked(folio); - if (retval) - return retval; - } - folio_wait_fscache(folio); - return 0; -} - -/** - * v9fs_direct_IO - 9P address space operation for direct I/O - * @iocb: target I/O control block - * @iter: The data/buffer to use - * - * The presence of v9fs_direct_IO() in the address space ops vector - * allowes open() O_DIRECT flags which would have failed otherwise. - * - * In the non-cached mode, we shunt off direct read and write requests before - * the VFS gets them, so this method should never be called. - * - * Direct IO is not 'yet' supported in the cached mode. Hence when - * this routine is called through generic_file_aio_read(), the read/write fails - * with an error. - * - */ -static ssize_t -v9fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) -{ - struct file *file = iocb->ki_filp; - loff_t pos = iocb->ki_pos; - ssize_t n; - int err = 0; - - if (iov_iter_rw(iter) == WRITE) { - n = p9_client_write(file->private_data, pos, iter, &err); - if (n) { - struct inode *inode = file_inode(file); - loff_t i_size = i_size_read(inode); - - if (pos + n > i_size) - inode_add_bytes(inode, pos + n - i_size); - } - } else { - n = p9_client_read(file->private_data, pos, iter, &err); - } - return n ? n : err; -} - -static int v9fs_write_begin(struct file *filp, struct address_space *mapping, - loff_t pos, unsigned int len, - struct page **subpagep, void **fsdata) -{ - int retval; - struct folio *folio; - struct v9fs_inode *v9inode = V9FS_I(mapping->host); - - p9_debug(P9_DEBUG_VFS, "filp %p, mapping %p\n", filp, mapping); - - /* Prefetch area to be written into the cache if we're caching this - * file. We need to do this before we get a lock on the page in case - * there's more than one writer competing for the same cache block. - */ - retval = netfs_write_begin(&v9inode->netfs, filp, mapping, pos, len, &folio, fsdata); - if (retval < 0) - return retval; - - *subpagep = &folio->page; - return retval; -} - -static int v9fs_write_end(struct file *filp, struct address_space *mapping, - loff_t pos, unsigned int len, unsigned int copied, - struct page *subpage, void *fsdata) -{ - loff_t last_pos = pos + copied; - struct folio *folio = page_folio(subpage); - struct inode *inode = mapping->host; - - p9_debug(P9_DEBUG_VFS, "filp %p, mapping %p\n", filp, mapping); - - if (!folio_test_uptodate(folio)) { - if (unlikely(copied < len)) { - copied = 0; - goto out; - } - - folio_mark_uptodate(folio); - } - - /* - * No need to use i_size_read() here, the i_size - * cannot change under us because we hold the i_mutex. - */ - if (last_pos > inode->i_size) { - inode_add_bytes(inode, last_pos - inode->i_size); - i_size_write(inode, last_pos); -#ifdef CONFIG_9P_FSCACHE - fscache_update_cookie(v9fs_inode_cookie(V9FS_I(inode)), NULL, - &last_pos); -#endif - } - folio_mark_dirty(folio); -out: - folio_unlock(folio); - folio_put(folio); - - return copied; -} - -#ifdef CONFIG_9P_FSCACHE -/* - * Mark a page as having been made dirty and thus needing writeback. We also - * need to pin the cache object to write back to. - */ -static bool v9fs_dirty_folio(struct address_space *mapping, struct folio *folio) -{ - struct v9fs_inode *v9inode = V9FS_I(mapping->host); - - return fscache_dirty_folio(mapping, folio, v9fs_inode_cookie(v9inode)); -} -#else -#define v9fs_dirty_folio filemap_dirty_folio -#endif - const struct address_space_operations v9fs_addr_operations = { - .read_folio = netfs_read_folio, - .readahead = netfs_readahead, - .dirty_folio = v9fs_dirty_folio, - .writepage = v9fs_vfs_writepage, - .write_begin = v9fs_write_begin, - .write_end = v9fs_write_end, - .release_folio = v9fs_release_folio, - .invalidate_folio = v9fs_invalidate_folio, - .launder_folio = v9fs_launder_folio, - .direct_IO = v9fs_direct_IO, + .read_folio = netfs_read_folio, + .readahead = netfs_readahead, + .dirty_folio = netfs_dirty_folio, + .release_folio = netfs_release_folio, + .invalidate_folio = netfs_invalidate_folio, + .launder_folio = netfs_launder_folio, + .direct_IO = noop_direct_IO, + .writepages = netfs_writepages, }; diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 11cd8d23f6f2..bae330c2f0cf 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -353,25 +353,15 @@ static ssize_t v9fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct p9_fid *fid = iocb->ki_filp->private_data; - int ret, err = 0; p9_debug(P9_DEBUG_VFS, "fid %d count %zu offset %lld\n", fid->fid, iov_iter_count(to), iocb->ki_pos); - if (!(fid->mode & P9L_DIRECT)) { - p9_debug(P9_DEBUG_VFS, "(cached)\n"); - return generic_file_read_iter(iocb, to); - } - - if (iocb->ki_filp->f_flags & O_NONBLOCK) - ret = p9_client_read_once(fid, iocb->ki_pos, to, &err); - else - ret = p9_client_read(fid, iocb->ki_pos, to, &err); - if (!ret) - return err; + if (fid->mode & P9L_DIRECT) + return netfs_unbuffered_read_iter(iocb, to); - iocb->ki_pos += ret; - return ret; + p9_debug(P9_DEBUG_VFS, "(cached)\n"); + return netfs_file_read_iter(iocb, to); } /* @@ -407,46 +397,14 @@ v9fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct p9_fid *fid = file->private_data; - ssize_t retval; - loff_t origin; - int err = 0; p9_debug(P9_DEBUG_VFS, "fid %d\n", fid->fid); - if (!(fid->mode & (P9L_DIRECT | P9L_NOWRITECACHE))) { - p9_debug(P9_DEBUG_CACHE, "(cached)\n"); - return generic_file_write_iter(iocb, from); - } + if (fid->mode & (P9L_DIRECT | P9L_NOWRITECACHE)) + return netfs_unbuffered_write_iter(iocb, from); - retval = generic_write_checks(iocb, from); - if (retval <= 0) - return retval; - - origin = iocb->ki_pos; - retval = p9_client_write(file->private_data, iocb->ki_pos, from, &err); - if (retval > 0) { - struct inode *inode = file_inode(file); - loff_t i_size; - unsigned long pg_start, pg_end; - - pg_start = origin >> PAGE_SHIFT; - pg_end = (origin + retval - 1) >> PAGE_SHIFT; - if (inode->i_mapping && inode->i_mapping->nrpages) - invalidate_inode_pages2_range(inode->i_mapping, - pg_start, pg_end); - iocb->ki_pos += retval; - i_size = i_size_read(inode); - if (iocb->ki_pos > i_size) { - inode_add_bytes(inode, iocb->ki_pos - i_size); - /* - * Need to serialize against i_size_write() in - * v9fs_stat2inode() - */ - v9fs_i_size_write(inode, iocb->ki_pos); - } - return retval; - } - return err; + p9_debug(P9_DEBUG_CACHE, "(cached)\n"); + return netfs_file_write_iter(iocb, from); } static int v9fs_file_fsync(struct file *filp, loff_t start, loff_t end, @@ -519,36 +477,7 @@ v9fs_file_mmap(struct file *filp, struct vm_area_struct *vma) static vm_fault_t v9fs_vm_page_mkwrite(struct vm_fault *vmf) { - struct folio *folio = page_folio(vmf->page); - struct file *filp = vmf->vma->vm_file; - struct inode *inode = file_inode(filp); - - - p9_debug(P9_DEBUG_VFS, "folio %p fid %lx\n", - folio, (unsigned long)filp->private_data); - - /* Wait for the page to be written to the cache before we allow it to - * be modified. We then assume the entire page will need writing back. - */ -#ifdef CONFIG_9P_FSCACHE - if (folio_test_fscache(folio) && - folio_wait_fscache_killable(folio) < 0) - return VM_FAULT_NOPAGE; -#endif - - /* Update file times before taking page lock */ - file_update_time(filp); - - if (folio_lock_killable(folio) < 0) - return VM_FAULT_RETRY; - if (folio_mapping(folio) != inode->i_mapping) - goto out_unlock; - folio_wait_stable(folio); - - return VM_FAULT_LOCKED; -out_unlock: - folio_unlock(folio); - return VM_FAULT_NOPAGE; + return netfs_page_mkwrite(vmf, NULL); } static void v9fs_mmap_vm_close(struct vm_area_struct *vma) diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index b845ee18a80b..32572982f72e 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -246,10 +246,10 @@ void v9fs_free_inode(struct inode *inode) /* * Set parameters for the netfs library */ -static void v9fs_set_netfs_context(struct inode *inode) +void v9fs_set_netfs_context(struct inode *inode) { struct v9fs_inode *v9inode = V9FS_I(inode); - netfs_inode_init(&v9inode->netfs, &v9fs_req_ops); + netfs_inode_init(&v9inode->netfs, &v9fs_req_ops, true); } int v9fs_init_inode(struct v9fs_session_info *v9ses, @@ -326,8 +326,6 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses, err = -EINVAL; goto error; } - - v9fs_set_netfs_context(inode); error: return err; @@ -359,6 +357,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode, dev_t rdev) iput(inode); return ERR_PTR(err); } + v9fs_set_netfs_context(inode); return inode; } @@ -374,11 +373,8 @@ void v9fs_evict_inode(struct inode *inode) truncate_inode_pages_final(&inode->i_data); -#ifdef CONFIG_9P_FSCACHE version = cpu_to_le32(v9inode->qid.version); - fscache_clear_inode_writeback(v9fs_inode_cookie(v9inode), inode, - &version); -#endif + netfs_clear_inode_writeback(inode, &version); clear_inode(inode); filemap_fdatawrite(&inode->i_data); @@ -464,6 +460,7 @@ static struct inode *v9fs_qid_iget(struct super_block *sb, goto error; v9fs_stat2inode(st, inode, sb, 0); + v9fs_set_netfs_context(inode); v9fs_cache_inode_get_cookie(inode); unlock_new_inode(inode); return inode; @@ -1113,7 +1110,7 @@ static int v9fs_vfs_setattr(struct mnt_idmap *idmap, if ((iattr->ia_valid & ATTR_SIZE) && iattr->ia_size != i_size_read(inode)) { truncate_setsize(inode, iattr->ia_size); - truncate_pagecache(inode, iattr->ia_size); + netfs_resize_file(netfs_inode(inode), iattr->ia_size, true); #ifdef CONFIG_9P_FSCACHE if (v9ses->cache & CACHE_FSCACHE) { @@ -1181,6 +1178,7 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, mode |= inode->i_mode & ~S_IALLUGO; inode->i_mode = mode; + v9inode->netfs.remote_i_size = stat->length; if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE)) v9fs_i_size_write(inode, stat->length); /* not real number of blocks, but 512 byte ones ... */ diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index c7319af2f471..3505227e1704 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -128,6 +128,7 @@ static struct inode *v9fs_qid_iget_dotl(struct super_block *sb, goto error; v9fs_stat2inode_dotl(st, inode, 0); + v9fs_set_netfs_context(inode); v9fs_cache_inode_get_cookie(inode); retval = v9fs_get_acl(inode, fid); if (retval) @@ -598,7 +599,7 @@ int v9fs_vfs_setattr_dotl(struct mnt_idmap *idmap, if ((iattr->ia_valid & ATTR_SIZE) && iattr->ia_size != i_size_read(inode)) { truncate_setsize(inode, iattr->ia_size); - truncate_pagecache(inode, iattr->ia_size); + netfs_resize_file(netfs_inode(inode), iattr->ia_size, true); #ifdef CONFIG_9P_FSCACHE if (v9ses->cache & CACHE_FSCACHE) @@ -655,6 +656,7 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode, mode |= inode->i_mode & ~S_IALLUGO; inode->i_mode = mode; + v9inode->netfs.remote_i_size = stat->st_size; if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE)) v9fs_i_size_write(inode, stat->st_size); inode->i_blocks = stat->st_blocks; @@ -683,8 +685,10 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode, inode->i_mode = mode; } if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE) && - stat->st_result_mask & P9_STATS_SIZE) + stat->st_result_mask & P9_STATS_SIZE) { + v9inode->netfs.remote_i_size = stat->st_size; v9fs_i_size_write(inode, stat->st_size); + } if (stat->st_result_mask & P9_STATS_BLOCKS) inode->i_blocks = stat->st_blocks; } diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 73db55c050bf..941f7d0e0bfa 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -289,31 +289,21 @@ static int v9fs_drop_inode(struct inode *inode) static int v9fs_write_inode(struct inode *inode, struct writeback_control *wbc) { - struct v9fs_inode *v9inode; - /* * send an fsync request to server irrespective of * wbc->sync_mode. */ p9_debug(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode); - - v9inode = V9FS_I(inode); - fscache_unpin_writeback(wbc, v9fs_inode_cookie(v9inode)); - - return 0; + return netfs_unpin_writeback(inode, wbc); } static int v9fs_write_inode_dotl(struct inode *inode, struct writeback_control *wbc) { - struct v9fs_inode *v9inode; - v9inode = V9FS_I(inode); p9_debug(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode); - fscache_unpin_writeback(wbc, v9fs_inode_cookie(v9inode)); - - return 0; + return netfs_unpin_writeback(inode, wbc); } static const struct super_operations v9fs_super_ops = { diff --git a/fs/Kconfig b/fs/Kconfig index a3159831ba98..89fdbefd1075 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -144,7 +144,6 @@ source "fs/overlayfs/Kconfig" menu "Caches" source "fs/netfs/Kconfig" -source "fs/fscache/Kconfig" source "fs/cachefiles/Kconfig" endmenu diff --git a/fs/Makefile b/fs/Makefile index a6962c588962..c09016257f05 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -61,7 +61,6 @@ obj-$(CONFIG_DLM) += dlm/ # Do not add any filesystems before this line obj-$(CONFIG_NETFS_SUPPORT) += netfs/ -obj-$(CONFIG_FSCACHE) += fscache/ obj-$(CONFIG_REISERFS_FS) += reiserfs/ obj-$(CONFIG_EXT4_FS) += ext4/ # We place ext4 before ext2 so that clean ext3 root fs's do NOT mount using the diff --git a/fs/affs/namei.c b/fs/affs/namei.c index d6b9758ee23d..8c154490a2d6 100644 --- a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -532,9 +532,6 @@ static struct dentry *affs_get_parent(struct dentry *child) parent = affs_iget(child->d_sb, be32_to_cpu(AFFS_TAIL(child->d_sb, bh)->parent)); brelse(bh); - if (IS_ERR(parent)) - return ERR_CAST(parent); - return d_obtain_alias(parent); } diff --git a/fs/afs/dir.c b/fs/afs/dir.c index c14533ef108f..b5b8de521f99 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -124,7 +124,7 @@ static void afs_dir_read_cleanup(struct afs_read *req) if (xas_retry(&xas, folio)) continue; BUG_ON(xa_is_value(folio)); - ASSERTCMP(folio_file_mapping(folio), ==, mapping); + ASSERTCMP(folio->mapping, ==, mapping); folio_put(folio); } @@ -202,12 +202,12 @@ static void afs_dir_dump(struct afs_vnode *dvnode, struct afs_read *req) if (xas_retry(&xas, folio)) continue; - BUG_ON(folio_file_mapping(folio) != mapping); + BUG_ON(folio->mapping != mapping); size = min_t(loff_t, folio_size(folio), req->actual_len - folio_pos(folio)); for (offset = 0; offset < size; offset += sizeof(*block)) { block = kmap_local_folio(folio, offset); - pr_warn("[%02lx] %32phN\n", folio_index(folio) + offset, block); + pr_warn("[%02lx] %32phN\n", folio->index + offset, block); kunmap_local(block); } } @@ -233,7 +233,7 @@ static int afs_dir_check(struct afs_vnode *dvnode, struct afs_read *req) if (xas_retry(&xas, folio)) continue; - BUG_ON(folio_file_mapping(folio) != mapping); + BUG_ON(folio->mapping != mapping); if (!afs_dir_check_folio(dvnode, folio, req->actual_len)) { afs_dir_dump(dvnode, req); @@ -474,6 +474,14 @@ static int afs_dir_iterate_block(struct afs_vnode *dvnode, continue; } + /* Don't expose silly rename entries to userspace. */ + if (nlen > 6 && + dire->u.name[0] == '.' && + ctx->actor != afs_lookup_filldir && + ctx->actor != afs_lookup_one_filldir && + memcmp(dire->u.name, ".__afs", 6) == 0) + continue; + /* found the next entry */ if (!dir_emit(ctx, dire->u.name, nlen, ntohl(dire->u.vnode), @@ -708,6 +716,8 @@ static void afs_do_lookup_success(struct afs_operation *op) break; } + if (vp->scb.status.abort_code) + trace_afs_bulkstat_error(op, &vp->fid, i, vp->scb.status.abort_code); if (!vp->scb.have_status && !vp->scb.have_error) continue; @@ -897,12 +907,16 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry, afs_begin_vnode_operation(op); afs_wait_for_operation(op); } - inode = ERR_PTR(afs_op_error(op)); out_op: if (!afs_op_error(op)) { - inode = &op->file[1].vnode->netfs.inode; - op->file[1].vnode = NULL; + if (op->file[1].scb.status.abort_code) { + afs_op_accumulate_error(op, -ECONNABORTED, + op->file[1].scb.status.abort_code); + } else { + inode = &op->file[1].vnode->netfs.inode; + op->file[1].vnode = NULL; + } } if (op->file[0].scb.have_status) @@ -2022,7 +2036,7 @@ static bool afs_dir_release_folio(struct folio *folio, gfp_t gfp_flags) { struct afs_vnode *dvnode = AFS_FS_I(folio_inode(folio)); - _enter("{{%llx:%llu}[%lu]}", dvnode->fid.vid, dvnode->fid.vnode, folio_index(folio)); + _enter("{{%llx:%llu}[%lu]}", dvnode->fid.vid, dvnode->fid.vnode, folio->index); folio_detach_private(folio); diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c index 1f656005018e..c4d2711e20ad 100644 --- a/fs/afs/dynroot.c +++ b/fs/afs/dynroot.c @@ -76,7 +76,7 @@ struct inode *afs_iget_pseudo_dir(struct super_block *sb, bool root) /* there shouldn't be an existing inode */ BUG_ON(!(inode->i_state & I_NEW)); - netfs_inode_init(&vnode->netfs, NULL); + netfs_inode_init(&vnode->netfs, NULL, false); inode->i_size = 0; inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; if (root) { @@ -258,16 +258,7 @@ const struct inode_operations afs_dynroot_inode_operations = { .lookup = afs_dynroot_lookup, }; -/* - * Dirs in the dynamic root don't need revalidation. - */ -static int afs_dynroot_d_revalidate(struct dentry *dentry, unsigned int flags) -{ - return 1; -} - const struct dentry_operations afs_dynroot_dentry_operations = { - .d_revalidate = afs_dynroot_d_revalidate, .d_delete = always_delete_dentry, .d_release = afs_d_release, .d_automount = afs_d_automount, @@ -373,7 +364,7 @@ error: void afs_dynroot_depopulate(struct super_block *sb) { struct afs_net *net = afs_sb2net(sb); - struct dentry *root = sb->s_root, *subdir, *tmp; + struct dentry *root = sb->s_root, *subdir; /* Prevent more subdirs from being created */ mutex_lock(&net->proc_cells_lock); @@ -382,10 +373,11 @@ void afs_dynroot_depopulate(struct super_block *sb) mutex_unlock(&net->proc_cells_lock); if (root) { + struct hlist_node *n; inode_lock(root->d_inode); /* Remove all the pins for dirs created for manually added cells */ - list_for_each_entry_safe(subdir, tmp, &root->d_subdirs, d_child) { + hlist_for_each_entry_safe(subdir, n, &root->d_children, d_sib) { if (subdir->d_fsdata) { subdir->d_fsdata = NULL; dput(subdir); diff --git a/fs/afs/file.c b/fs/afs/file.c index 30914e0d9cb2..3d33b221d9ca 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -20,9 +20,6 @@ static int afs_file_mmap(struct file *file, struct vm_area_struct *vma); static int afs_symlink_read_folio(struct file *file, struct folio *folio); -static void afs_invalidate_folio(struct folio *folio, size_t offset, - size_t length); -static bool afs_release_folio(struct folio *folio, gfp_t gfp_flags); static ssize_t afs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter); static ssize_t afs_file_splice_read(struct file *in, loff_t *ppos, @@ -37,7 +34,7 @@ const struct file_operations afs_file_operations = { .release = afs_release, .llseek = generic_file_llseek, .read_iter = afs_file_read_iter, - .write_iter = afs_file_write, + .write_iter = netfs_file_write_iter, .mmap = afs_file_mmap, .splice_read = afs_file_splice_read, .splice_write = iter_file_splice_write, @@ -53,22 +50,21 @@ const struct inode_operations afs_file_inode_operations = { }; const struct address_space_operations afs_file_aops = { + .direct_IO = noop_direct_IO, .read_folio = netfs_read_folio, .readahead = netfs_readahead, - .dirty_folio = afs_dirty_folio, - .launder_folio = afs_launder_folio, - .release_folio = afs_release_folio, - .invalidate_folio = afs_invalidate_folio, - .write_begin = afs_write_begin, - .write_end = afs_write_end, - .writepages = afs_writepages, + .dirty_folio = netfs_dirty_folio, + .launder_folio = netfs_launder_folio, + .release_folio = netfs_release_folio, + .invalidate_folio = netfs_invalidate_folio, .migrate_folio = filemap_migrate_folio, + .writepages = afs_writepages, }; const struct address_space_operations afs_symlink_aops = { .read_folio = afs_symlink_read_folio, - .release_folio = afs_release_folio, - .invalidate_folio = afs_invalidate_folio, + .release_folio = netfs_release_folio, + .invalidate_folio = netfs_invalidate_folio, .migrate_folio = filemap_migrate_folio, }; @@ -323,11 +319,7 @@ static void afs_issue_read(struct netfs_io_subrequest *subreq) fsreq->len = subreq->len - subreq->transferred; fsreq->key = key_get(subreq->rreq->netfs_priv); fsreq->vnode = vnode; - fsreq->iter = &fsreq->def_iter; - - iov_iter_xarray(&fsreq->def_iter, ITER_DEST, - &fsreq->vnode->netfs.inode.i_mapping->i_pages, - fsreq->pos, fsreq->len); + fsreq->iter = &subreq->io_iter; afs_fetch_data(fsreq->vnode, fsreq); afs_put_read(fsreq); @@ -359,22 +351,13 @@ static int afs_symlink_read_folio(struct file *file, struct folio *folio) static int afs_init_request(struct netfs_io_request *rreq, struct file *file) { - rreq->netfs_priv = key_get(afs_file_key(file)); + if (file) + rreq->netfs_priv = key_get(afs_file_key(file)); + rreq->rsize = 256 * 1024; + rreq->wsize = 256 * 1024; return 0; } -static int afs_begin_cache_operation(struct netfs_io_request *rreq) -{ -#ifdef CONFIG_AFS_FSCACHE - struct afs_vnode *vnode = AFS_FS_I(rreq->inode); - - return fscache_begin_read_operation(&rreq->cache_resources, - afs_vnode_cache(vnode)); -#else - return -ENOBUFS; -#endif -} - static int afs_check_write_begin(struct file *file, loff_t pos, unsigned len, struct folio **foliop, void **_fsdata) { @@ -388,128 +371,37 @@ static void afs_free_request(struct netfs_io_request *rreq) key_put(rreq->netfs_priv); } -const struct netfs_request_ops afs_req_ops = { - .init_request = afs_init_request, - .free_request = afs_free_request, - .begin_cache_operation = afs_begin_cache_operation, - .check_write_begin = afs_check_write_begin, - .issue_read = afs_issue_read, -}; - -int afs_write_inode(struct inode *inode, struct writeback_control *wbc) +static void afs_update_i_size(struct inode *inode, loff_t new_i_size) { - fscache_unpin_writeback(wbc, afs_vnode_cache(AFS_FS_I(inode))); - return 0; -} - -/* - * Adjust the dirty region of the page on truncation or full invalidation, - * getting rid of the markers altogether if the region is entirely invalidated. - */ -static void afs_invalidate_dirty(struct folio *folio, size_t offset, - size_t length) -{ - struct afs_vnode *vnode = AFS_FS_I(folio_inode(folio)); - unsigned long priv; - unsigned int f, t, end = offset + length; - - priv = (unsigned long)folio_get_private(folio); - - /* we clean up only if the entire page is being invalidated */ - if (offset == 0 && length == folio_size(folio)) - goto full_invalidate; - - /* If the page was dirtied by page_mkwrite(), the PTE stays writable - * and we don't get another notification to tell us to expand it - * again. - */ - if (afs_is_folio_dirty_mmapped(priv)) - return; - - /* We may need to shorten the dirty region */ - f = afs_folio_dirty_from(folio, priv); - t = afs_folio_dirty_to(folio, priv); - - if (t <= offset || f >= end) - return; /* Doesn't overlap */ - - if (f < offset && t > end) - return; /* Splits the dirty region - just absorb it */ - - if (f >= offset && t <= end) - goto undirty; + struct afs_vnode *vnode = AFS_FS_I(inode); + loff_t i_size; - if (f < offset) - t = offset; - else - f = end; - if (f == t) - goto undirty; - - priv = afs_folio_dirty(folio, f, t); - folio_change_private(folio, (void *)priv); - trace_afs_folio_dirty(vnode, tracepoint_string("trunc"), folio); - return; - -undirty: - trace_afs_folio_dirty(vnode, tracepoint_string("undirty"), folio); - folio_clear_dirty_for_io(folio); -full_invalidate: - trace_afs_folio_dirty(vnode, tracepoint_string("inval"), folio); - folio_detach_private(folio); + write_seqlock(&vnode->cb_lock); + i_size = i_size_read(&vnode->netfs.inode); + if (new_i_size > i_size) { + i_size_write(&vnode->netfs.inode, new_i_size); + inode_set_bytes(&vnode->netfs.inode, new_i_size); + } + write_sequnlock(&vnode->cb_lock); + fscache_update_cookie(afs_vnode_cache(vnode), NULL, &new_i_size); } -/* - * invalidate part or all of a page - * - release a page and clean up its private data if offset is 0 (indicating - * the entire page) - */ -static void afs_invalidate_folio(struct folio *folio, size_t offset, - size_t length) +static void afs_netfs_invalidate_cache(struct netfs_io_request *wreq) { - _enter("{%lu},%zu,%zu", folio->index, offset, length); - - BUG_ON(!folio_test_locked(folio)); + struct afs_vnode *vnode = AFS_FS_I(wreq->inode); - if (folio_get_private(folio)) - afs_invalidate_dirty(folio, offset, length); - - folio_wait_fscache(folio); - _leave(""); + afs_invalidate_cache(vnode, 0); } -/* - * release a page and clean up its private state if it's not busy - * - return true if the page can now be released, false if not - */ -static bool afs_release_folio(struct folio *folio, gfp_t gfp) -{ - struct afs_vnode *vnode = AFS_FS_I(folio_inode(folio)); - - _enter("{{%llx:%llu}[%lu],%lx},%x", - vnode->fid.vid, vnode->fid.vnode, folio_index(folio), folio->flags, - gfp); - - /* deny if folio is being written to the cache and the caller hasn't - * elected to wait */ -#ifdef CONFIG_AFS_FSCACHE - if (folio_test_fscache(folio)) { - if (current_is_kswapd() || !(gfp & __GFP_FS)) - return false; - folio_wait_fscache(folio); - } - fscache_note_page_release(afs_vnode_cache(vnode)); -#endif - - if (folio_test_private(folio)) { - trace_afs_folio_dirty(vnode, tracepoint_string("rel"), folio); - folio_detach_private(folio); - } - - /* Indicate that the folio can be released */ - _leave(" = T"); - return true; -} +const struct netfs_request_ops afs_req_ops = { + .init_request = afs_init_request, + .free_request = afs_free_request, + .check_write_begin = afs_check_write_begin, + .issue_read = afs_issue_read, + .update_i_size = afs_update_i_size, + .invalidate_cache = afs_netfs_invalidate_cache, + .create_write_requests = afs_create_write_requests, +}; static void afs_add_open_mmap(struct afs_vnode *vnode) { @@ -576,28 +468,39 @@ static vm_fault_t afs_vm_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pg static ssize_t afs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) { - struct afs_vnode *vnode = AFS_FS_I(file_inode(iocb->ki_filp)); + struct inode *inode = file_inode(iocb->ki_filp); + struct afs_vnode *vnode = AFS_FS_I(inode); struct afs_file *af = iocb->ki_filp->private_data; - int ret; + ssize_t ret; - ret = afs_validate(vnode, af->key); + if (iocb->ki_flags & IOCB_DIRECT) + return netfs_unbuffered_read_iter(iocb, iter); + + ret = netfs_start_io_read(inode); if (ret < 0) return ret; - - return generic_file_read_iter(iocb, iter); + ret = afs_validate(vnode, af->key); + if (ret == 0) + ret = filemap_read(iocb, iter, 0); + netfs_end_io_read(inode); + return ret; } static ssize_t afs_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { - struct afs_vnode *vnode = AFS_FS_I(file_inode(in)); + struct inode *inode = file_inode(in); + struct afs_vnode *vnode = AFS_FS_I(inode); struct afs_file *af = in->private_data; - int ret; + ssize_t ret; - ret = afs_validate(vnode, af->key); + ret = netfs_start_io_read(inode); if (ret < 0) return ret; - - return filemap_splice_read(in, ppos, pipe, len, flags); + ret = afs_validate(vnode, af->key); + if (ret == 0) + ret = filemap_splice_read(in, ppos, pipe, len, flags); + netfs_end_io_read(inode); + return ret; } diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 4f04f6f33f46..94fc049aff58 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -58,7 +58,7 @@ static noinline void dump_vnode(struct afs_vnode *vnode, struct afs_vnode *paren */ static void afs_set_netfs_context(struct afs_vnode *vnode) { - netfs_inode_init(&vnode->netfs, &afs_req_ops); + netfs_inode_init(&vnode->netfs, &afs_req_ops, true); } /* @@ -166,6 +166,7 @@ static void afs_apply_status(struct afs_operation *op, struct inode *inode = &vnode->netfs.inode; struct timespec64 t; umode_t mode; + bool unexpected_jump = false; bool data_changed = false; bool change_size = vp->set_size; @@ -230,6 +231,7 @@ static void afs_apply_status(struct afs_operation *op, } change_size = true; data_changed = true; + unexpected_jump = true; } else if (vnode->status.type == AFS_FTYPE_DIR) { /* Expected directory change is handled elsewhere so * that we can locally edit the directory and save on a @@ -249,8 +251,10 @@ static void afs_apply_status(struct afs_operation *op, * what's on the server. */ vnode->netfs.remote_i_size = status->size; - if (change_size) { + if (change_size || status->size > i_size_read(inode)) { afs_set_i_size(vnode, status->size); + if (unexpected_jump) + vnode->netfs.zero_point = status->size; inode_set_ctime_to_ts(inode, t); inode_set_atime_to_ts(inode, t); } @@ -647,7 +651,7 @@ void afs_evict_inode(struct inode *inode) truncate_inode_pages_final(&inode->i_data); afs_set_cache_aux(vnode, &aux); - fscache_clear_inode_writeback(afs_vnode_cache(vnode), inode, &aux); + netfs_clear_inode_writeback(inode, &aux); clear_inode(inode); while (!list_empty(&vnode->wb_keys)) { @@ -689,17 +693,17 @@ static void afs_setattr_success(struct afs_operation *op) static void afs_setattr_edit_file(struct afs_operation *op) { struct afs_vnode_param *vp = &op->file[0]; - struct inode *inode = &vp->vnode->netfs.inode; + struct afs_vnode *vnode = vp->vnode; if (op->setattr.attr->ia_valid & ATTR_SIZE) { loff_t size = op->setattr.attr->ia_size; loff_t i_size = op->setattr.old_i_size; - if (size < i_size) - truncate_pagecache(inode, size); - if (size != i_size) - fscache_resize_cookie(afs_vnode_cache(vp->vnode), - vp->scb.status.size); + if (size != i_size) { + truncate_setsize(&vnode->netfs.inode, size); + netfs_resize_file(&vnode->netfs, size, true); + fscache_resize_cookie(afs_vnode_cache(vnode), size); + } } } @@ -767,11 +771,11 @@ int afs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, */ if (!(attr->ia_valid & (supported & ~ATTR_SIZE & ~ATTR_MTIME)) && attr->ia_size < i_size && - attr->ia_size > vnode->status.size) { - truncate_pagecache(inode, attr->ia_size); + attr->ia_size > vnode->netfs.remote_i_size) { + truncate_setsize(inode, attr->ia_size); + netfs_resize_file(&vnode->netfs, size, false); fscache_resize_cookie(afs_vnode_cache(vnode), attr->ia_size); - i_size_write(inode, attr->ia_size); ret = 0; goto out_unlock; } diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 55aa0679d8ce..9c03fcf7ffaa 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -985,62 +985,6 @@ static inline void afs_invalidate_cache(struct afs_vnode *vnode, unsigned int fl i_size_read(&vnode->netfs.inode), flags); } -/* - * We use folio->private to hold the amount of the folio that we've written to, - * splitting the field into two parts. However, we need to represent a range - * 0...FOLIO_SIZE, so we reduce the resolution if the size of the folio - * exceeds what we can encode. - */ -#ifdef CONFIG_64BIT -#define __AFS_FOLIO_PRIV_MASK 0x7fffffffUL -#define __AFS_FOLIO_PRIV_SHIFT 32 -#define __AFS_FOLIO_PRIV_MMAPPED 0x80000000UL -#else -#define __AFS_FOLIO_PRIV_MASK 0x7fffUL -#define __AFS_FOLIO_PRIV_SHIFT 16 -#define __AFS_FOLIO_PRIV_MMAPPED 0x8000UL -#endif - -static inline unsigned int afs_folio_dirty_resolution(struct folio *folio) -{ - int shift = folio_shift(folio) - (__AFS_FOLIO_PRIV_SHIFT - 1); - return (shift > 0) ? shift : 0; -} - -static inline size_t afs_folio_dirty_from(struct folio *folio, unsigned long priv) -{ - unsigned long x = priv & __AFS_FOLIO_PRIV_MASK; - - /* The lower bound is inclusive */ - return x << afs_folio_dirty_resolution(folio); -} - -static inline size_t afs_folio_dirty_to(struct folio *folio, unsigned long priv) -{ - unsigned long x = (priv >> __AFS_FOLIO_PRIV_SHIFT) & __AFS_FOLIO_PRIV_MASK; - - /* The upper bound is immediately beyond the region */ - return (x + 1) << afs_folio_dirty_resolution(folio); -} - -static inline unsigned long afs_folio_dirty(struct folio *folio, size_t from, size_t to) -{ - unsigned int res = afs_folio_dirty_resolution(folio); - from >>= res; - to = (to - 1) >> res; - return (to << __AFS_FOLIO_PRIV_SHIFT) | from; -} - -static inline unsigned long afs_folio_dirty_mmapped(unsigned long priv) -{ - return priv | __AFS_FOLIO_PRIV_MMAPPED; -} - -static inline bool afs_is_folio_dirty_mmapped(unsigned long priv) -{ - return priv & __AFS_FOLIO_PRIV_MMAPPED; -} - #include <trace/events/afs.h> /*****************************************************************************/ @@ -1167,7 +1111,6 @@ extern int afs_release(struct inode *, struct file *); extern int afs_fetch_data(struct afs_vnode *, struct afs_read *); extern struct afs_read *afs_alloc_read(gfp_t); extern void afs_put_read(struct afs_read *); -extern int afs_write_inode(struct inode *, struct writeback_control *); static inline struct afs_read *afs_get_read(struct afs_read *req) { @@ -1658,24 +1601,11 @@ extern int afs_check_volume_status(struct afs_volume *, struct afs_operation *); /* * write.c */ -#ifdef CONFIG_AFS_FSCACHE -bool afs_dirty_folio(struct address_space *, struct folio *); -#else -#define afs_dirty_folio filemap_dirty_folio -#endif -extern int afs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, - struct page **pagep, void **fsdata); -extern int afs_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata); -extern int afs_writepage(struct page *, struct writeback_control *); extern int afs_writepages(struct address_space *, struct writeback_control *); -extern ssize_t afs_file_write(struct kiocb *, struct iov_iter *); extern int afs_fsync(struct file *, loff_t, loff_t, int); extern vm_fault_t afs_page_mkwrite(struct vm_fault *vmf); extern void afs_prune_wb_keys(struct afs_vnode *); -int afs_launder_folio(struct folio *); +void afs_create_write_requests(struct netfs_io_request *wreq, loff_t start, size_t len); /* * xattr.c diff --git a/fs/afs/proc.c b/fs/afs/proc.c index 3bd02571f30d..15eab053af6d 100644 --- a/fs/afs/proc.c +++ b/fs/afs/proc.c @@ -166,7 +166,7 @@ static int afs_proc_addr_prefs_show(struct seq_file *m, void *v) if (!preflist) { seq_puts(m, "NO PREFS\n"); - return 0; + goto out; } seq_printf(m, "PROT SUBNET PRIOR (v=%u n=%u/%u/%u)\n", @@ -191,7 +191,8 @@ static int afs_proc_addr_prefs_show(struct seq_file *m, void *v) } } - rcu_read_lock(); +out: + rcu_read_unlock(); return 0; } diff --git a/fs/afs/super.c b/fs/afs/super.c index ae2d66a52add..f3ba1c3e72f5 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -55,7 +55,7 @@ int afs_net_id; static const struct super_operations afs_super_ops = { .statfs = afs_statfs, .alloc_inode = afs_alloc_inode, - .write_inode = afs_write_inode, + .write_inode = netfs_unpin_writeback, .drop_inode = afs_drop_inode, .destroy_inode = afs_destroy_inode, .free_inode = afs_free_inode, diff --git a/fs/afs/write.c b/fs/afs/write.c index 61d34ad2ca7d..74402d95a884 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -12,309 +12,17 @@ #include <linux/writeback.h> #include <linux/pagevec.h> #include <linux/netfs.h> +#include <trace/events/netfs.h> #include "internal.h" -static int afs_writepages_region(struct address_space *mapping, - struct writeback_control *wbc, - loff_t start, loff_t end, loff_t *_next, - bool max_one_loop); - -static void afs_write_to_cache(struct afs_vnode *vnode, loff_t start, size_t len, - loff_t i_size, bool caching); - -#ifdef CONFIG_AFS_FSCACHE -/* - * Mark a page as having been made dirty and thus needing writeback. We also - * need to pin the cache object to write back to. - */ -bool afs_dirty_folio(struct address_space *mapping, struct folio *folio) -{ - return fscache_dirty_folio(mapping, folio, - afs_vnode_cache(AFS_FS_I(mapping->host))); -} -static void afs_folio_start_fscache(bool caching, struct folio *folio) -{ - if (caching) - folio_start_fscache(folio); -} -#else -static void afs_folio_start_fscache(bool caching, struct folio *folio) -{ -} -#endif - -/* - * Flush out a conflicting write. This may extend the write to the surrounding - * pages if also dirty and contiguous to the conflicting region.. - */ -static int afs_flush_conflicting_write(struct address_space *mapping, - struct folio *folio) -{ - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = LONG_MAX, - .range_start = folio_pos(folio), - .range_end = LLONG_MAX, - }; - loff_t next; - - return afs_writepages_region(mapping, &wbc, folio_pos(folio), LLONG_MAX, - &next, true); -} - -/* - * prepare to perform part of a write to a page - */ -int afs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, - struct page **_page, void **fsdata) -{ - struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); - struct folio *folio; - unsigned long priv; - unsigned f, from; - unsigned t, to; - pgoff_t index; - int ret; - - _enter("{%llx:%llu},%llx,%x", - vnode->fid.vid, vnode->fid.vnode, pos, len); - - /* Prefetch area to be written into the cache if we're caching this - * file. We need to do this before we get a lock on the page in case - * there's more than one writer competing for the same cache block. - */ - ret = netfs_write_begin(&vnode->netfs, file, mapping, pos, len, &folio, fsdata); - if (ret < 0) - return ret; - - index = folio_index(folio); - from = pos - index * PAGE_SIZE; - to = from + len; - -try_again: - /* See if this page is already partially written in a way that we can - * merge the new write with. - */ - if (folio_test_private(folio)) { - priv = (unsigned long)folio_get_private(folio); - f = afs_folio_dirty_from(folio, priv); - t = afs_folio_dirty_to(folio, priv); - ASSERTCMP(f, <=, t); - - if (folio_test_writeback(folio)) { - trace_afs_folio_dirty(vnode, tracepoint_string("alrdy"), folio); - folio_unlock(folio); - goto wait_for_writeback; - } - /* If the file is being filled locally, allow inter-write - * spaces to be merged into writes. If it's not, only write - * back what the user gives us. - */ - if (!test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags) && - (to < f || from > t)) - goto flush_conflicting_write; - } - - *_page = folio_file_page(folio, pos / PAGE_SIZE); - _leave(" = 0"); - return 0; - - /* The previous write and this write aren't adjacent or overlapping, so - * flush the page out. - */ -flush_conflicting_write: - trace_afs_folio_dirty(vnode, tracepoint_string("confl"), folio); - folio_unlock(folio); - - ret = afs_flush_conflicting_write(mapping, folio); - if (ret < 0) - goto error; - -wait_for_writeback: - ret = folio_wait_writeback_killable(folio); - if (ret < 0) - goto error; - - ret = folio_lock_killable(folio); - if (ret < 0) - goto error; - goto try_again; - -error: - folio_put(folio); - _leave(" = %d", ret); - return ret; -} - -/* - * finalise part of a write to a page - */ -int afs_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *subpage, void *fsdata) -{ - struct folio *folio = page_folio(subpage); - struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); - unsigned long priv; - unsigned int f, from = offset_in_folio(folio, pos); - unsigned int t, to = from + copied; - loff_t i_size, write_end_pos; - - _enter("{%llx:%llu},{%lx}", - vnode->fid.vid, vnode->fid.vnode, folio_index(folio)); - - if (!folio_test_uptodate(folio)) { - if (copied < len) { - copied = 0; - goto out; - } - - folio_mark_uptodate(folio); - } - - if (copied == 0) - goto out; - - write_end_pos = pos + copied; - - i_size = i_size_read(&vnode->netfs.inode); - if (write_end_pos > i_size) { - write_seqlock(&vnode->cb_lock); - i_size = i_size_read(&vnode->netfs.inode); - if (write_end_pos > i_size) - afs_set_i_size(vnode, write_end_pos); - write_sequnlock(&vnode->cb_lock); - fscache_update_cookie(afs_vnode_cache(vnode), NULL, &write_end_pos); - } - - if (folio_test_private(folio)) { - priv = (unsigned long)folio_get_private(folio); - f = afs_folio_dirty_from(folio, priv); - t = afs_folio_dirty_to(folio, priv); - if (from < f) - f = from; - if (to > t) - t = to; - priv = afs_folio_dirty(folio, f, t); - folio_change_private(folio, (void *)priv); - trace_afs_folio_dirty(vnode, tracepoint_string("dirty+"), folio); - } else { - priv = afs_folio_dirty(folio, from, to); - folio_attach_private(folio, (void *)priv); - trace_afs_folio_dirty(vnode, tracepoint_string("dirty"), folio); - } - - if (folio_mark_dirty(folio)) - _debug("dirtied %lx", folio_index(folio)); - -out: - folio_unlock(folio); - folio_put(folio); - return copied; -} - -/* - * kill all the pages in the given range - */ -static void afs_kill_pages(struct address_space *mapping, - loff_t start, loff_t len) -{ - struct afs_vnode *vnode = AFS_FS_I(mapping->host); - struct folio *folio; - pgoff_t index = start / PAGE_SIZE; - pgoff_t last = (start + len - 1) / PAGE_SIZE, next; - - _enter("{%llx:%llu},%llx @%llx", - vnode->fid.vid, vnode->fid.vnode, len, start); - - do { - _debug("kill %lx (to %lx)", index, last); - - folio = filemap_get_folio(mapping, index); - if (IS_ERR(folio)) { - next = index + 1; - continue; - } - - next = folio_next_index(folio); - - folio_clear_uptodate(folio); - folio_end_writeback(folio); - folio_lock(folio); - generic_error_remove_folio(mapping, folio); - folio_unlock(folio); - folio_put(folio); - - } while (index = next, index <= last); - - _leave(""); -} - -/* - * Redirty all the pages in a given range. - */ -static void afs_redirty_pages(struct writeback_control *wbc, - struct address_space *mapping, - loff_t start, loff_t len) -{ - struct afs_vnode *vnode = AFS_FS_I(mapping->host); - struct folio *folio; - pgoff_t index = start / PAGE_SIZE; - pgoff_t last = (start + len - 1) / PAGE_SIZE, next; - - _enter("{%llx:%llu},%llx @%llx", - vnode->fid.vid, vnode->fid.vnode, len, start); - - do { - _debug("redirty %llx @%llx", len, start); - - folio = filemap_get_folio(mapping, index); - if (IS_ERR(folio)) { - next = index + 1; - continue; - } - - next = index + folio_nr_pages(folio); - folio_redirty_for_writepage(wbc, folio); - folio_end_writeback(folio); - folio_put(folio); - } while (index = next, index <= last); - - _leave(""); -} - /* * completion of write to server */ static void afs_pages_written_back(struct afs_vnode *vnode, loff_t start, unsigned int len) { - struct address_space *mapping = vnode->netfs.inode.i_mapping; - struct folio *folio; - pgoff_t end; - - XA_STATE(xas, &mapping->i_pages, start / PAGE_SIZE); - _enter("{%llx:%llu},{%x @%llx}", vnode->fid.vid, vnode->fid.vnode, len, start); - rcu_read_lock(); - - end = (start + len - 1) / PAGE_SIZE; - xas_for_each(&xas, folio, end) { - if (!folio_test_writeback(folio)) { - kdebug("bad %x @%llx page %lx %lx", - len, start, folio_index(folio), end); - ASSERT(folio_test_writeback(folio)); - } - - trace_afs_folio_dirty(vnode, tracepoint_string("clear"), folio); - folio_detach_private(folio); - folio_end_writeback(folio); - } - - rcu_read_unlock(); - afs_prune_wb_keys(vnode); _leave(""); } @@ -451,363 +159,53 @@ try_next_key: return afs_put_operation(op); } -/* - * Extend the region to be written back to include subsequent contiguously - * dirty pages if possible, but don't sleep while doing so. - * - * If this page holds new content, then we can include filler zeros in the - * writeback. - */ -static void afs_extend_writeback(struct address_space *mapping, - struct afs_vnode *vnode, - long *_count, - loff_t start, - loff_t max_len, - bool new_content, - bool caching, - unsigned int *_len) +static void afs_upload_to_server(struct netfs_io_subrequest *subreq) { - struct folio_batch fbatch; - struct folio *folio; - unsigned long priv; - unsigned int psize, filler = 0; - unsigned int f, t; - loff_t len = *_len; - pgoff_t index = (start + len) / PAGE_SIZE; - bool stop = true; - unsigned int i; - - XA_STATE(xas, &mapping->i_pages, index); - folio_batch_init(&fbatch); - - do { - /* Firstly, we gather up a batch of contiguous dirty pages - * under the RCU read lock - but we can't clear the dirty flags - * there if any of those pages are mapped. - */ - rcu_read_lock(); - - xas_for_each(&xas, folio, ULONG_MAX) { - stop = true; - if (xas_retry(&xas, folio)) - continue; - if (xa_is_value(folio)) - break; - if (folio_index(folio) != index) - break; - - if (!folio_try_get_rcu(folio)) { - xas_reset(&xas); - continue; - } - - /* Has the page moved or been split? */ - if (unlikely(folio != xas_reload(&xas))) { - folio_put(folio); - break; - } - - if (!folio_trylock(folio)) { - folio_put(folio); - break; - } - if (!folio_test_dirty(folio) || - folio_test_writeback(folio) || - folio_test_fscache(folio)) { - folio_unlock(folio); - folio_put(folio); - break; - } - - psize = folio_size(folio); - priv = (unsigned long)folio_get_private(folio); - f = afs_folio_dirty_from(folio, priv); - t = afs_folio_dirty_to(folio, priv); - if (f != 0 && !new_content) { - folio_unlock(folio); - folio_put(folio); - break; - } - - len += filler + t; - filler = psize - t; - if (len >= max_len || *_count <= 0) - stop = true; - else if (t == psize || new_content) - stop = false; - - index += folio_nr_pages(folio); - if (!folio_batch_add(&fbatch, folio)) - break; - if (stop) - break; - } - - if (!stop) - xas_pause(&xas); - rcu_read_unlock(); - - /* Now, if we obtained any folios, we can shift them to being - * writable and mark them for caching. - */ - if (!folio_batch_count(&fbatch)) - break; - - for (i = 0; i < folio_batch_count(&fbatch); i++) { - folio = fbatch.folios[i]; - trace_afs_folio_dirty(vnode, tracepoint_string("store+"), folio); - - if (!folio_clear_dirty_for_io(folio)) - BUG(); - folio_start_writeback(folio); - afs_folio_start_fscache(caching, folio); - - *_count -= folio_nr_pages(folio); - folio_unlock(folio); - } + struct afs_vnode *vnode = AFS_FS_I(subreq->rreq->inode); + ssize_t ret; - folio_batch_release(&fbatch); - cond_resched(); - } while (!stop); + _enter("%x[%x],%zx", + subreq->rreq->debug_id, subreq->debug_index, subreq->io_iter.count); - *_len = len; + trace_netfs_sreq(subreq, netfs_sreq_trace_submit); + ret = afs_store_data(vnode, &subreq->io_iter, subreq->start, + subreq->rreq->origin == NETFS_LAUNDER_WRITE); + netfs_write_subrequest_terminated(subreq, ret < 0 ? ret : subreq->len, + false); } -/* - * Synchronously write back the locked page and any subsequent non-locked dirty - * pages. - */ -static ssize_t afs_write_back_from_locked_folio(struct address_space *mapping, - struct writeback_control *wbc, - struct folio *folio, - loff_t start, loff_t end) +static void afs_upload_to_server_worker(struct work_struct *work) { - struct afs_vnode *vnode = AFS_FS_I(mapping->host); - struct iov_iter iter; - unsigned long priv; - unsigned int offset, to, len, max_len; - loff_t i_size = i_size_read(&vnode->netfs.inode); - bool new_content = test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags); - bool caching = fscache_cookie_enabled(afs_vnode_cache(vnode)); - long count = wbc->nr_to_write; - int ret; - - _enter(",%lx,%llx-%llx", folio_index(folio), start, end); - - folio_start_writeback(folio); - afs_folio_start_fscache(caching, folio); - - count -= folio_nr_pages(folio); - - /* Find all consecutive lockable dirty pages that have contiguous - * written regions, stopping when we find a page that is not - * immediately lockable, is not dirty or is missing, or we reach the - * end of the range. - */ - priv = (unsigned long)folio_get_private(folio); - offset = afs_folio_dirty_from(folio, priv); - to = afs_folio_dirty_to(folio, priv); - trace_afs_folio_dirty(vnode, tracepoint_string("store"), folio); - - len = to - offset; - start += offset; - if (start < i_size) { - /* Trim the write to the EOF; the extra data is ignored. Also - * put an upper limit on the size of a single storedata op. - */ - max_len = 65536 * 4096; - max_len = min_t(unsigned long long, max_len, end - start + 1); - max_len = min_t(unsigned long long, max_len, i_size - start); - - if (len < max_len && - (to == folio_size(folio) || new_content)) - afs_extend_writeback(mapping, vnode, &count, - start, max_len, new_content, - caching, &len); - len = min_t(loff_t, len, max_len); - } - - /* We now have a contiguous set of dirty pages, each with writeback - * set; the first page is still locked at this point, but all the rest - * have been unlocked. - */ - folio_unlock(folio); - - if (start < i_size) { - _debug("write back %x @%llx [%llx]", len, start, i_size); - - /* Speculatively write to the cache. We have to fix this up - * later if the store fails. - */ - afs_write_to_cache(vnode, start, len, i_size, caching); - - iov_iter_xarray(&iter, ITER_SOURCE, &mapping->i_pages, start, len); - ret = afs_store_data(vnode, &iter, start, false); - } else { - _debug("write discard %x @%llx [%llx]", len, start, i_size); - - /* The dirty region was entirely beyond the EOF. */ - fscache_clear_page_bits(mapping, start, len, caching); - afs_pages_written_back(vnode, start, len); - ret = 0; - } - - switch (ret) { - case 0: - wbc->nr_to_write = count; - ret = len; - break; + struct netfs_io_subrequest *subreq = + container_of(work, struct netfs_io_subrequest, work); - default: - pr_notice("kAFS: Unexpected error from FS.StoreData %d\n", ret); - fallthrough; - case -EACCES: - case -EPERM: - case -ENOKEY: - case -EKEYEXPIRED: - case -EKEYREJECTED: - case -EKEYREVOKED: - case -ENETRESET: - afs_redirty_pages(wbc, mapping, start, len); - mapping_set_error(mapping, ret); - break; - - case -EDQUOT: - case -ENOSPC: - afs_redirty_pages(wbc, mapping, start, len); - mapping_set_error(mapping, -ENOSPC); - break; - - case -EROFS: - case -EIO: - case -EREMOTEIO: - case -EFBIG: - case -ENOENT: - case -ENOMEDIUM: - case -ENXIO: - trace_afs_file_error(vnode, ret, afs_file_error_writeback_fail); - afs_kill_pages(mapping, start, len); - mapping_set_error(mapping, ret); - break; - } - - _leave(" = %d", ret); - return ret; + afs_upload_to_server(subreq); } /* - * write a region of pages back to the server + * Set up write requests for a writeback slice. We need to add a write request + * for each write we want to make. */ -static int afs_writepages_region(struct address_space *mapping, - struct writeback_control *wbc, - loff_t start, loff_t end, loff_t *_next, - bool max_one_loop) +void afs_create_write_requests(struct netfs_io_request *wreq, loff_t start, size_t len) { - struct folio *folio; - struct folio_batch fbatch; - ssize_t ret; - unsigned int i; - int n, skips = 0; - - _enter("%llx,%llx,", start, end); - folio_batch_init(&fbatch); - - do { - pgoff_t index = start / PAGE_SIZE; - - n = filemap_get_folios_tag(mapping, &index, end / PAGE_SIZE, - PAGECACHE_TAG_DIRTY, &fbatch); - - if (!n) - break; - for (i = 0; i < n; i++) { - folio = fbatch.folios[i]; - start = folio_pos(folio); /* May regress with THPs */ - - _debug("wback %lx", folio_index(folio)); - - /* At this point we hold neither the i_pages lock nor the - * page lock: the page may be truncated or invalidated - * (changing page->mapping to NULL), or even swizzled - * back from swapper_space to tmpfs file mapping - */ -try_again: - if (wbc->sync_mode != WB_SYNC_NONE) { - ret = folio_lock_killable(folio); - if (ret < 0) { - folio_batch_release(&fbatch); - return ret; - } - } else { - if (!folio_trylock(folio)) - continue; - } - - if (folio->mapping != mapping || - !folio_test_dirty(folio)) { - start += folio_size(folio); - folio_unlock(folio); - continue; - } - - if (folio_test_writeback(folio) || - folio_test_fscache(folio)) { - folio_unlock(folio); - if (wbc->sync_mode != WB_SYNC_NONE) { - folio_wait_writeback(folio); -#ifdef CONFIG_AFS_FSCACHE - folio_wait_fscache(folio); -#endif - goto try_again; - } - - start += folio_size(folio); - if (wbc->sync_mode == WB_SYNC_NONE) { - if (skips >= 5 || need_resched()) { - *_next = start; - folio_batch_release(&fbatch); - _leave(" = 0 [%llx]", *_next); - return 0; - } - skips++; - } - continue; - } - - if (!folio_clear_dirty_for_io(folio)) - BUG(); - ret = afs_write_back_from_locked_folio(mapping, wbc, - folio, start, end); - if (ret < 0) { - _leave(" = %zd", ret); - folio_batch_release(&fbatch); - return ret; - } - - start += ret; - } + struct netfs_io_subrequest *subreq; - folio_batch_release(&fbatch); - cond_resched(); - } while (wbc->nr_to_write > 0); + _enter("%x,%llx-%llx", wreq->debug_id, start, start + len); - *_next = start; - _leave(" = 0 [%llx]", *_next); - return 0; + subreq = netfs_create_write_request(wreq, NETFS_UPLOAD_TO_SERVER, + start, len, afs_upload_to_server_worker); + if (subreq) + netfs_queue_write_request(subreq); } /* * write some of the pending data back to the server */ -int afs_writepages(struct address_space *mapping, - struct writeback_control *wbc) +int afs_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct afs_vnode *vnode = AFS_FS_I(mapping->host); - loff_t start, next; int ret; - _enter(""); - /* We have to be careful as we can end up racing with setattr() * truncating the pagecache since the caller doesn't take a lock here * to prevent it. @@ -817,69 +215,12 @@ int afs_writepages(struct address_space *mapping, else if (!down_read_trylock(&vnode->validate_lock)) return 0; - if (wbc->range_cyclic) { - start = mapping->writeback_index * PAGE_SIZE; - ret = afs_writepages_region(mapping, wbc, start, LLONG_MAX, - &next, false); - if (ret == 0) { - mapping->writeback_index = next / PAGE_SIZE; - if (start > 0 && wbc->nr_to_write > 0) { - ret = afs_writepages_region(mapping, wbc, 0, - start, &next, false); - if (ret == 0) - mapping->writeback_index = - next / PAGE_SIZE; - } - } - } else if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) { - ret = afs_writepages_region(mapping, wbc, 0, LLONG_MAX, - &next, false); - if (wbc->nr_to_write > 0 && ret == 0) - mapping->writeback_index = next / PAGE_SIZE; - } else { - ret = afs_writepages_region(mapping, wbc, - wbc->range_start, wbc->range_end, - &next, false); - } - + ret = netfs_writepages(mapping, wbc); up_read(&vnode->validate_lock); - _leave(" = %d", ret); return ret; } /* - * write to an AFS file - */ -ssize_t afs_file_write(struct kiocb *iocb, struct iov_iter *from) -{ - struct afs_vnode *vnode = AFS_FS_I(file_inode(iocb->ki_filp)); - struct afs_file *af = iocb->ki_filp->private_data; - ssize_t result; - size_t count = iov_iter_count(from); - - _enter("{%llx:%llu},{%zu},", - vnode->fid.vid, vnode->fid.vnode, count); - - if (IS_SWAPFILE(&vnode->netfs.inode)) { - printk(KERN_INFO - "AFS: Attempt to write to active swap file!\n"); - return -EBUSY; - } - - if (!count) - return 0; - - result = afs_validate(vnode, af->key); - if (result < 0) - return result; - - result = generic_file_write_iter(iocb, from); - - _leave(" = %zd", result); - return result; -} - -/* * flush any dirty pages for this process, and check for write errors. * - the return status from this call provides a reliable indication of * whether any write errors occurred for this process. @@ -907,59 +248,11 @@ int afs_fsync(struct file *file, loff_t start, loff_t end, int datasync) */ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf) { - struct folio *folio = page_folio(vmf->page); struct file *file = vmf->vma->vm_file; - struct inode *inode = file_inode(file); - struct afs_vnode *vnode = AFS_FS_I(inode); - struct afs_file *af = file->private_data; - unsigned long priv; - vm_fault_t ret = VM_FAULT_RETRY; - - _enter("{{%llx:%llu}},{%lx}", vnode->fid.vid, vnode->fid.vnode, folio_index(folio)); - - afs_validate(vnode, af->key); - sb_start_pagefault(inode->i_sb); - - /* Wait for the page to be written to the cache before we allow it to - * be modified. We then assume the entire page will need writing back. - */ -#ifdef CONFIG_AFS_FSCACHE - if (folio_test_fscache(folio) && - folio_wait_fscache_killable(folio) < 0) - goto out; -#endif - - if (folio_wait_writeback_killable(folio)) - goto out; - - if (folio_lock_killable(folio) < 0) - goto out; - - /* We mustn't change folio->private until writeback is complete as that - * details the portion of the page we need to write back and we might - * need to redirty the page if there's a problem. - */ - if (folio_wait_writeback_killable(folio) < 0) { - folio_unlock(folio); - goto out; - } - - priv = afs_folio_dirty(folio, 0, folio_size(folio)); - priv = afs_folio_dirty_mmapped(priv); - if (folio_test_private(folio)) { - folio_change_private(folio, (void *)priv); - trace_afs_folio_dirty(vnode, tracepoint_string("mkwrite+"), folio); - } else { - folio_attach_private(folio, (void *)priv); - trace_afs_folio_dirty(vnode, tracepoint_string("mkwrite"), folio); - } - file_update_time(file); - - ret = VM_FAULT_LOCKED; -out: - sb_end_pagefault(inode->i_sb); - return ret; + if (afs_validate(AFS_FS_I(file_inode(file)), afs_file_key(file)) < 0) + return VM_FAULT_SIGBUS; + return netfs_page_mkwrite(vmf, NULL); } /* @@ -989,64 +282,3 @@ void afs_prune_wb_keys(struct afs_vnode *vnode) afs_put_wb_key(wbk); } } - -/* - * Clean up a page during invalidation. - */ -int afs_launder_folio(struct folio *folio) -{ - struct afs_vnode *vnode = AFS_FS_I(folio_inode(folio)); - struct iov_iter iter; - struct bio_vec bv; - unsigned long priv; - unsigned int f, t; - int ret = 0; - - _enter("{%lx}", folio->index); - - priv = (unsigned long)folio_get_private(folio); - if (folio_clear_dirty_for_io(folio)) { - f = 0; - t = folio_size(folio); - if (folio_test_private(folio)) { - f = afs_folio_dirty_from(folio, priv); - t = afs_folio_dirty_to(folio, priv); - } - - bvec_set_folio(&bv, folio, t - f, f); - iov_iter_bvec(&iter, ITER_SOURCE, &bv, 1, bv.bv_len); - - trace_afs_folio_dirty(vnode, tracepoint_string("launder"), folio); - ret = afs_store_data(vnode, &iter, folio_pos(folio) + f, true); - } - - trace_afs_folio_dirty(vnode, tracepoint_string("laundered"), folio); - folio_detach_private(folio); - folio_wait_fscache(folio); - return ret; -} - -/* - * Deal with the completion of writing the data to the cache. - */ -static void afs_write_to_cache_done(void *priv, ssize_t transferred_or_error, - bool was_async) -{ - struct afs_vnode *vnode = priv; - - if (IS_ERR_VALUE(transferred_or_error) && - transferred_or_error != -ENOBUFS) - afs_invalidate_cache(vnode, 0); -} - -/* - * Save the write to the cache also. - */ -static void afs_write_to_cache(struct afs_vnode *vnode, - loff_t start, size_t len, loff_t i_size, - bool caching) -{ - fscache_write_to_cache(afs_vnode_cache(vnode), - vnode->netfs.inode.i_mapping, start, len, i_size, - afs_write_to_cache_done, vnode, caching); -} @@ -239,7 +239,6 @@ static struct ctl_table aio_sysctls[] = { .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, - {} }; static void __init aio_sysctl_init(void) diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index d26222b7eefe..0496cb5b6eab 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -79,7 +79,7 @@ static struct file *__anon_inode_getfile(const char *name, const struct file_operations *fops, void *priv, int flags, const struct inode *context_inode, - bool secure) + bool make_inode) { struct inode *inode; struct file *file; @@ -87,7 +87,7 @@ static struct file *__anon_inode_getfile(const char *name, if (fops->owner && !try_module_get(fops->owner)) return ERR_PTR(-ENOENT); - if (secure) { + if (make_inode) { inode = anon_inode_make_secure_inode(name, context_inode); if (IS_ERR(inode)) { file = ERR_CAST(inode); @@ -149,13 +149,10 @@ struct file *anon_inode_getfile(const char *name, EXPORT_SYMBOL_GPL(anon_inode_getfile); /** - * anon_inode_getfile_secure - Like anon_inode_getfile(), but creates a new + * anon_inode_create_getfile - Like anon_inode_getfile(), but creates a new * !S_PRIVATE anon inode rather than reuse the * singleton anon inode and calls the - * inode_init_security_anon() LSM hook. This - * allows for both the inode to have its own - * security context and for the LSM to enforce - * policy on the inode's creation. + * inode_init_security_anon() LSM hook. * * @name: [in] name of the "class" of the new file * @fops: [in] file operations for the new file @@ -164,11 +161,21 @@ EXPORT_SYMBOL_GPL(anon_inode_getfile); * @context_inode: * [in] the logical relationship with the new inode (optional) * + * Create a new anonymous inode and file pair. This can be done for two + * reasons: + * + * - for the inode to have its own security context, so that LSMs can enforce + * policy on the inode's creation; + * + * - if the caller needs a unique inode, for example in order to customize + * the size returned by fstat() + * * The LSM may use @context_inode in inode_init_security_anon(), but a - * reference to it is not held. Returns the newly created file* or an error - * pointer. See the anon_inode_getfile() documentation for more information. + * reference to it is not held. + * + * Returns the newly created file* or an error pointer. */ -struct file *anon_inode_getfile_secure(const char *name, +struct file *anon_inode_create_getfile(const char *name, const struct file_operations *fops, void *priv, int flags, const struct inode *context_inode) @@ -176,12 +183,13 @@ struct file *anon_inode_getfile_secure(const char *name, return __anon_inode_getfile(name, fops, priv, flags, context_inode, true); } +EXPORT_SYMBOL_GPL(anon_inode_create_getfile); static int __anon_inode_getfd(const char *name, const struct file_operations *fops, void *priv, int flags, const struct inode *context_inode, - bool secure) + bool make_inode) { int error, fd; struct file *file; @@ -192,7 +200,7 @@ static int __anon_inode_getfd(const char *name, fd = error; file = __anon_inode_getfile(name, fops, priv, flags, context_inode, - secure); + make_inode); if (IS_ERR(file)) { error = PTR_ERR(file); goto err_put_unused_fd; @@ -231,10 +239,9 @@ int anon_inode_getfd(const char *name, const struct file_operations *fops, EXPORT_SYMBOL_GPL(anon_inode_getfd); /** - * anon_inode_getfd_secure - Like anon_inode_getfd(), but creates a new + * anon_inode_create_getfd - Like anon_inode_getfd(), but creates a new * !S_PRIVATE anon inode rather than reuse the singleton anon inode, and calls - * the inode_init_security_anon() LSM hook. This allows the inode to have its - * own security context and for a LSM to reject creation of the inode. + * the inode_init_security_anon() LSM hook. * * @name: [in] name of the "class" of the new file * @fops: [in] file operations for the new file @@ -243,16 +250,26 @@ EXPORT_SYMBOL_GPL(anon_inode_getfd); * @context_inode: * [in] the logical relationship with the new inode (optional) * + * Create a new anonymous inode and file pair. This can be done for two + * reasons: + * + * - for the inode to have its own security context, so that LSMs can enforce + * policy on the inode's creation; + * + * - if the caller needs a unique inode, for example in order to customize + * the size returned by fstat() + * * The LSM may use @context_inode in inode_init_security_anon(), but a * reference to it is not held. + * + * Returns a newly created file descriptor or an error code. */ -int anon_inode_getfd_secure(const char *name, const struct file_operations *fops, +int anon_inode_create_getfd(const char *name, const struct file_operations *fops, void *priv, int flags, const struct inode *context_inode) { return __anon_inode_getfd(name, fops, priv, flags, context_inode, true); } -EXPORT_SYMBOL_GPL(anon_inode_getfd_secure); static int __init anon_inode_init(void) { diff --git a/fs/autofs/expire.c b/fs/autofs/expire.c index 038b3d2d9f57..39d8c84c16f4 100644 --- a/fs/autofs/expire.c +++ b/fs/autofs/expire.c @@ -73,12 +73,9 @@ done: /* p->d_lock held */ static struct dentry *positive_after(struct dentry *p, struct dentry *child) { - if (child) - child = list_next_entry(child, d_child); - else - child = list_first_entry(&p->d_subdirs, struct dentry, d_child); + child = child ? d_next_sibling(child) : d_first_child(p); - list_for_each_entry_from(child, &p->d_subdirs, d_child) { + hlist_for_each_entry_from(child, d_sib) { spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED); if (simple_positive(child)) { dget_dlock(child); diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig index fddc7be58022..5cdfef3b551a 100644 --- a/fs/bcachefs/Kconfig +++ b/fs/bcachefs/Kconfig @@ -50,14 +50,6 @@ config BCACHEFS_POSIX_ACL depends on BCACHEFS_FS select FS_POSIX_ACL -config BCACHEFS_DEBUG_TRANSACTIONS - bool "bcachefs runtime info" - depends on BCACHEFS_FS - help - This makes the list of running btree transactions available in debugfs. - - This is a highly useful debugging feature but does add a small amount of overhead. - config BCACHEFS_DEBUG bool "bcachefs debugging" depends on BCACHEFS_FS @@ -85,6 +77,16 @@ config BCACHEFS_NO_LATENCY_ACCT help This disables device latency tracking and time stats, only for performance testing +config BCACHEFS_SIX_OPTIMISTIC_SPIN + bool "Optimistic spinning for six locks" + depends on BCACHEFS_FS + depends on SMP + default y + help + Instead of immediately sleeping when attempting to take a six lock that + is held by another thread, spin for a short while, as long as the + thread owning the lock is running. + config MEAN_AND_VARIANCE_UNIT_TEST tristate "mean_and_variance unit tests" if !KUNIT_ALL_TESTS depends on KUNIT diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index b81268418174..1a05cecda7cc 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -27,7 +27,6 @@ bcachefs-y := \ checksum.o \ clock.o \ compress.o \ - counters.o \ darray.o \ debug.o \ dirent.o \ @@ -71,6 +70,7 @@ bcachefs-y := \ reflink.o \ replicas.o \ sb-clean.o \ + sb-counters.o \ sb-downgrade.o \ sb-errors.o \ sb-members.o \ @@ -82,6 +82,7 @@ bcachefs-y := \ super-io.o \ sysfs.o \ tests.o \ + thread_with_file.o \ trace.o \ two_state_shared_lock.o \ util.o \ diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 1fec0e67891f..fd3e175d8342 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -261,10 +261,8 @@ int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k, case BCH_DATA_free: case BCH_DATA_need_gc_gens: case BCH_DATA_need_discard: - bkey_fsck_err_on(a.v->dirty_sectors || - a.v->cached_sectors || - a.v->stripe, c, err, - alloc_key_empty_but_have_data, + bkey_fsck_err_on(bch2_bucket_sectors(*a.v) || a.v->stripe, + c, err, alloc_key_empty_but_have_data, "empty data type free but have data"); break; case BCH_DATA_sb: @@ -272,22 +270,21 @@ int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k, case BCH_DATA_btree: case BCH_DATA_user: case BCH_DATA_parity: - bkey_fsck_err_on(!a.v->dirty_sectors, c, err, - alloc_key_dirty_sectors_0, + bkey_fsck_err_on(!bch2_bucket_sectors_dirty(*a.v), + c, err, alloc_key_dirty_sectors_0, "data_type %s but dirty_sectors==0", - bch2_data_types[a.v->data_type]); + bch2_data_type_str(a.v->data_type)); break; case BCH_DATA_cached: bkey_fsck_err_on(!a.v->cached_sectors || - a.v->dirty_sectors || - a.v->stripe, c, err, - alloc_key_cached_inconsistency, + bch2_bucket_sectors_dirty(*a.v) || + a.v->stripe, + c, err, alloc_key_cached_inconsistency, "data type inconsistency"); bkey_fsck_err_on(!a.v->io_time[READ] && c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs, - c, err, - alloc_key_cached_but_read_time_zero, + c, err, alloc_key_cached_but_read_time_zero, "cached bucket with read_time == 0"); break; case BCH_DATA_stripe: @@ -324,16 +321,12 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c { struct bch_alloc_v4 _a; const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a); - unsigned i; prt_newline(out); printbuf_indent_add(out, 2); - prt_printf(out, "gen %u oldest_gen %u data_type %s", - a->gen, a->oldest_gen, - a->data_type < BCH_DATA_NR - ? bch2_data_types[a->data_type] - : "(invalid data type)"); + prt_printf(out, "gen %u oldest_gen %u data_type ", a->gen, a->oldest_gen); + bch2_prt_data_type(out, a->data_type); prt_newline(out); prt_printf(out, "journal_seq %llu", a->journal_seq); prt_newline(out); @@ -356,23 +349,6 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c prt_printf(out, "fragmentation %llu", a->fragmentation_lru); prt_newline(out); prt_printf(out, "bp_start %llu", BCH_ALLOC_V4_BACKPOINTERS_START(a)); - prt_newline(out); - - if (BCH_ALLOC_V4_NR_BACKPOINTERS(a)) { - struct bkey_s_c_alloc_v4 a_raw = bkey_s_c_to_alloc_v4(k); - const struct bch_backpointer *bps = alloc_v4_backpointers_c(a_raw.v); - - prt_printf(out, "backpointers: %llu", BCH_ALLOC_V4_NR_BACKPOINTERS(a_raw.v)); - printbuf_indent_add(out, 2); - - for (i = 0; i < BCH_ALLOC_V4_NR_BACKPOINTERS(a_raw.v); i++) { - prt_newline(out); - bch2_backpointer_to_text(out, &bps[i]); - } - - printbuf_indent_sub(out, 2); - } - printbuf_indent_sub(out, 2); } @@ -537,18 +513,12 @@ void bch2_bucket_gens_to_text(struct printbuf *out, struct bch_fs *c, struct bke int bch2_bucket_gens_init(struct bch_fs *c) { struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_s_c k; - struct bch_alloc_v4 a; struct bkey_i_bucket_gens g; bool have_bucket_gens_key = false; - unsigned offset; - struct bpos pos; - u8 gen; int ret; - for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, - BTREE_ITER_PREFETCH, k, ret) { + ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, + BTREE_ITER_PREFETCH, k, ({ /* * Not a fsck error because this is checked/repaired by * bch2_check_alloc_key() which runs later: @@ -556,13 +526,14 @@ int bch2_bucket_gens_init(struct bch_fs *c) if (!bch2_dev_bucket_exists(c, k.k->p)) continue; - gen = bch2_alloc_to_v4(k, &a)->gen; - pos = alloc_gens_pos(iter.pos, &offset); + struct bch_alloc_v4 a; + u8 gen = bch2_alloc_to_v4(k, &a)->gen; + unsigned offset; + struct bpos pos = alloc_gens_pos(iter.pos, &offset); if (have_bucket_gens_key && bkey_cmp(iter.pos, pos)) { ret = commit_do(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW, + BCH_TRANS_COMMIT_no_enospc, bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0)); if (ret) break; @@ -576,45 +547,37 @@ int bch2_bucket_gens_init(struct bch_fs *c) } g.v.gens[offset] = gen; - } - bch2_trans_iter_exit(trans, &iter); + 0; + })); if (have_bucket_gens_key && !ret) ret = commit_do(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW, + BCH_TRANS_COMMIT_no_enospc, bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0)); bch2_trans_put(trans); - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; } int bch2_alloc_read(struct bch_fs *c) { struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_s_c k; - struct bch_dev *ca; int ret; down_read(&c->gc_lock); if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_bucket_gens) { - const struct bch_bucket_gens *g; - u64 b; - - for_each_btree_key(trans, iter, BTREE_ID_bucket_gens, POS_MIN, - BTREE_ITER_PREFETCH, k, ret) { + ret = for_each_btree_key(trans, iter, BTREE_ID_bucket_gens, POS_MIN, + BTREE_ITER_PREFETCH, k, ({ u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset; u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset; if (k.k->type != KEY_TYPE_bucket_gens) continue; - g = bkey_s_c_to_bucket_gens(k).v; + const struct bch_bucket_gens *g = bkey_s_c_to_bucket_gens(k).v; /* * Not a fsck error because this is checked/repaired by @@ -623,19 +586,17 @@ int bch2_alloc_read(struct bch_fs *c) if (!bch2_dev_exists2(c, k.k->p.inode)) continue; - ca = bch_dev_bkey_exists(c, k.k->p.inode); + struct bch_dev *ca = bch_dev_bkey_exists(c, k.k->p.inode); - for (b = max_t(u64, ca->mi.first_bucket, start); + for (u64 b = max_t(u64, ca->mi.first_bucket, start); b < min_t(u64, ca->mi.nbuckets, end); b++) *bucket_gen(ca, b) = g->gens[b & KEY_TYPE_BUCKET_GENS_MASK]; - } - bch2_trans_iter_exit(trans, &iter); + 0; + })); } else { - struct bch_alloc_v4 a; - - for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, - BTREE_ITER_PREFETCH, k, ret) { + ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, + BTREE_ITER_PREFETCH, k, ({ /* * Not a fsck error because this is checked/repaired by * bch2_check_alloc_key() which runs later: @@ -643,19 +604,18 @@ int bch2_alloc_read(struct bch_fs *c) if (!bch2_dev_bucket_exists(c, k.k->p)) continue; - ca = bch_dev_bkey_exists(c, k.k->p.inode); + struct bch_dev *ca = bch_dev_bkey_exists(c, k.k->p.inode); + struct bch_alloc_v4 a; *bucket_gen(ca, k.k->p.offset) = bch2_alloc_to_v4(k, &a)->gen; - } - bch2_trans_iter_exit(trans, &iter); + 0; + })); } bch2_trans_put(trans); up_read(&c->gc_lock); - if (ret) - bch_err_fn(c, ret); - + bch_err_fn(c, ret); return ret; } @@ -768,83 +728,177 @@ static noinline int bch2_bucket_gen_update(struct btree_trans *trans, return ret; } -int bch2_trans_mark_alloc(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, struct bkey_i *new, - unsigned flags) +int bch2_trigger_alloc(struct btree_trans *trans, + enum btree_id btree, unsigned level, + struct bkey_s_c old, struct bkey_s new, + unsigned flags) { struct bch_fs *c = trans->c; - struct bch_alloc_v4 old_a_convert, *new_a; - const struct bch_alloc_v4 *old_a; - u64 old_lru, new_lru; int ret = 0; - /* - * Deletion only happens in the device removal path, with - * BTREE_TRIGGER_NORUN: - */ - BUG_ON(new->k.type != KEY_TYPE_alloc_v4); + if (bch2_trans_inconsistent_on(!bch2_dev_bucket_exists(c, new.k->p), trans, + "alloc key for invalid device or bucket")) + return -EIO; - old_a = bch2_alloc_to_v4(old, &old_a_convert); - new_a = &bkey_i_to_alloc_v4(new)->v; + struct bch_dev *ca = bch_dev_bkey_exists(c, new.k->p.inode); - new_a->data_type = alloc_data_type(*new_a, new_a->data_type); + struct bch_alloc_v4 old_a_convert; + const struct bch_alloc_v4 *old_a = bch2_alloc_to_v4(old, &old_a_convert); - if (new_a->dirty_sectors > old_a->dirty_sectors || - new_a->cached_sectors > old_a->cached_sectors) { - new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); - new_a->io_time[WRITE]= max_t(u64, 1, atomic64_read(&c->io_clock[WRITE].now)); - SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true); - SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true); - } + if (flags & BTREE_TRIGGER_TRANSACTIONAL) { + struct bch_alloc_v4 *new_a = bkey_s_to_alloc_v4(new).v; - if (data_type_is_empty(new_a->data_type) && - BCH_ALLOC_V4_NEED_INC_GEN(new_a) && - !bch2_bucket_is_open_safe(c, new->k.p.inode, new->k.p.offset)) { - new_a->gen++; - SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false); - } + new_a->data_type = alloc_data_type(*new_a, new_a->data_type); - if (old_a->data_type != new_a->data_type || - (new_a->data_type == BCH_DATA_free && - alloc_freespace_genbits(*old_a) != alloc_freespace_genbits(*new_a))) { - ret = bch2_bucket_do_index(trans, old, old_a, false) ?: - bch2_bucket_do_index(trans, bkey_i_to_s_c(new), new_a, true); - if (ret) - return ret; - } + if (bch2_bucket_sectors(*new_a) > bch2_bucket_sectors(*old_a)) { + new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); + new_a->io_time[WRITE]= max_t(u64, 1, atomic64_read(&c->io_clock[WRITE].now)); + SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true); + SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true); + } - if (new_a->data_type == BCH_DATA_cached && - !new_a->io_time[READ]) - new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); + if (data_type_is_empty(new_a->data_type) && + BCH_ALLOC_V4_NEED_INC_GEN(new_a) && + !bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset)) { + new_a->gen++; + SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false); + } + + if (old_a->data_type != new_a->data_type || + (new_a->data_type == BCH_DATA_free && + alloc_freespace_genbits(*old_a) != alloc_freespace_genbits(*new_a))) { + ret = bch2_bucket_do_index(trans, old, old_a, false) ?: + bch2_bucket_do_index(trans, new.s_c, new_a, true); + if (ret) + return ret; + } - old_lru = alloc_lru_idx_read(*old_a); - new_lru = alloc_lru_idx_read(*new_a); + if (new_a->data_type == BCH_DATA_cached && + !new_a->io_time[READ]) + new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); - if (old_lru != new_lru) { - ret = bch2_lru_change(trans, new->k.p.inode, - bucket_to_u64(new->k.p), - old_lru, new_lru); - if (ret) - return ret; + u64 old_lru = alloc_lru_idx_read(*old_a); + u64 new_lru = alloc_lru_idx_read(*new_a); + if (old_lru != new_lru) { + ret = bch2_lru_change(trans, new.k->p.inode, + bucket_to_u64(new.k->p), + old_lru, new_lru); + if (ret) + return ret; + } + + new_a->fragmentation_lru = alloc_lru_idx_fragmentation(*new_a, + bch_dev_bkey_exists(c, new.k->p.inode)); + if (old_a->fragmentation_lru != new_a->fragmentation_lru) { + ret = bch2_lru_change(trans, + BCH_LRU_FRAGMENTATION_START, + bucket_to_u64(new.k->p), + old_a->fragmentation_lru, new_a->fragmentation_lru); + if (ret) + return ret; + } + + if (old_a->gen != new_a->gen) { + ret = bch2_bucket_gen_update(trans, new.k->p, new_a->gen); + if (ret) + return ret; + } + + /* + * need to know if we're getting called from the invalidate path or + * not: + */ + + if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) && + old_a->cached_sectors) { + ret = bch2_update_cached_sectors_list(trans, new.k->p.inode, + -((s64) old_a->cached_sectors)); + if (ret) + return ret; + } } - new_a->fragmentation_lru = alloc_lru_idx_fragmentation(*new_a, - bch_dev_bkey_exists(c, new->k.p.inode)); + if ((flags & BTREE_TRIGGER_ATOMIC) && (flags & BTREE_TRIGGER_INSERT)) { + struct bch_alloc_v4 *new_a = bkey_s_to_alloc_v4(new).v; + u64 journal_seq = trans->journal_res.seq; + u64 bucket_journal_seq = new_a->journal_seq; - if (old_a->fragmentation_lru != new_a->fragmentation_lru) { - ret = bch2_lru_change(trans, - BCH_LRU_FRAGMENTATION_START, - bucket_to_u64(new->k.p), - old_a->fragmentation_lru, new_a->fragmentation_lru); - if (ret) - return ret; + if ((flags & BTREE_TRIGGER_INSERT) && + data_type_is_empty(old_a->data_type) != + data_type_is_empty(new_a->data_type) && + new.k->type == KEY_TYPE_alloc_v4) { + struct bch_alloc_v4 *v = bkey_s_to_alloc_v4(new).v; + + /* + * If the btree updates referring to a bucket weren't flushed + * before the bucket became empty again, then the we don't have + * to wait on a journal flush before we can reuse the bucket: + */ + v->journal_seq = bucket_journal_seq = + data_type_is_empty(new_a->data_type) && + (journal_seq == v->journal_seq || + bch2_journal_noflush_seq(&c->journal, v->journal_seq)) + ? 0 : journal_seq; + } + + if (!data_type_is_empty(old_a->data_type) && + data_type_is_empty(new_a->data_type) && + bucket_journal_seq) { + ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, + c->journal.flushed_seq_ondisk, + new.k->p.inode, new.k->p.offset, + bucket_journal_seq); + if (ret) { + bch2_fs_fatal_error(c, + "error setting bucket_needs_journal_commit: %i", ret); + return ret; + } + } + + percpu_down_read(&c->mark_lock); + if (new_a->gen != old_a->gen) + *bucket_gen(ca, new.k->p.offset) = new_a->gen; + + bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, false); + + if (new_a->data_type == BCH_DATA_free && + (!new_a->journal_seq || new_a->journal_seq < c->journal.flushed_seq_ondisk)) + closure_wake_up(&c->freelist_wait); + + if (new_a->data_type == BCH_DATA_need_discard && + (!bucket_journal_seq || bucket_journal_seq < c->journal.flushed_seq_ondisk)) + bch2_do_discards(c); + + if (old_a->data_type != BCH_DATA_cached && + new_a->data_type == BCH_DATA_cached && + should_invalidate_buckets(ca, bch2_dev_usage_read(ca))) + bch2_do_invalidates(c); + + if (new_a->data_type == BCH_DATA_need_gc_gens) + bch2_do_gc_gens(c); + percpu_up_read(&c->mark_lock); } - if (old_a->gen != new_a->gen) { - ret = bch2_bucket_gen_update(trans, new->k.p, new_a->gen); - if (ret) - return ret; + if ((flags & BTREE_TRIGGER_GC) && + (flags & BTREE_TRIGGER_BUCKET_INVALIDATE)) { + struct bch_alloc_v4 new_a_convert; + const struct bch_alloc_v4 *new_a = bch2_alloc_to_v4(new.s_c, &new_a_convert); + + percpu_down_read(&c->mark_lock); + struct bucket *g = gc_bucket(ca, new.k->p.offset); + + bucket_lock(g); + + g->gen_valid = 1; + g->gen = new_a->gen; + g->data_type = new_a->data_type; + g->stripe = new_a->stripe; + g->stripe_redundancy = new_a->stripe_redundancy; + g->dirty_sectors = new_a->dirty_sectors; + g->cached_sectors = new_a->cached_sectors; + + bucket_unlock(g); + percpu_up_read(&c->mark_lock); } return 0; @@ -869,8 +923,9 @@ static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos bch2_trans_copy_iter(&iter2, iter); - if (!bpos_eq(iter->path->l[0].b->key.k.p, SPOS_MAX)) - end = bkey_min(end, bpos_nosnap_successor(iter->path->l[0].b->key.k.p)); + struct btree_path *path = btree_iter_path(iter->trans, iter); + if (!bpos_eq(path->l[0].b->key.k.p, SPOS_MAX)) + end = bkey_min(end, bpos_nosnap_successor(path->l[0].b->key.k.p)); end = bkey_min(end, POS(iter->pos.inode, iter->pos.offset + U32_MAX - 1)); @@ -898,7 +953,6 @@ static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos static bool next_bucket(struct bch_fs *c, struct bpos *bucket) { struct bch_dev *ca; - unsigned iter; if (bch2_dev_bucket_exists(c, *bucket)) return true; @@ -916,8 +970,7 @@ static bool next_bucket(struct bch_fs *c, struct bpos *bucket) } rcu_read_lock(); - iter = bucket->inode; - ca = __bch2_next_dev(c, &iter, NULL); + ca = __bch2_next_dev_idx(c, bucket->inode, NULL); if (ca) *bucket = POS(ca->dev_idx, ca->mi.first_bucket); rcu_read_unlock(); @@ -1158,9 +1211,6 @@ int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans, unsigned i, gens_offset, gens_end_offset; int ret; - if (c->sb.version < bcachefs_metadata_version_bucket_gens) - return 0; - bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(start, &gens_offset)); k = bch2_btree_iter_peek_slot(bucket_gens_iter); @@ -1212,7 +1262,7 @@ fsck_err: return ret; } -static noinline_for_stack int __bch2_check_discard_freespace_key(struct btree_trans *trans, +static noinline_for_stack int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_iter *iter) { struct bch_fs *c = trans->c; @@ -1267,28 +1317,10 @@ delete: ret = bch2_btree_delete_extent_at(trans, iter, iter->btree_id == BTREE_ID_freespace ? 1 : 0, 0) ?: bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW); + BCH_TRANS_COMMIT_no_enospc); goto out; } -static int bch2_check_discard_freespace_key(struct btree_trans *trans, - struct btree_iter *iter, - struct bpos end) -{ - if (!btree_id_is_extents(iter->btree_id)) { - return __bch2_check_discard_freespace_key(trans, iter); - } else { - int ret = 0; - - while (!bkey_eq(iter->pos, end) && - !(ret = btree_trans_too_many_iters(trans) ?: - __bch2_check_discard_freespace_key(trans, iter))) - bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos)); - - return ret; - } -} - /* * We've already checked that generation numbers in the bucket_gens btree are * valid for buckets that exist; this just checks for keys for nonexistent @@ -1422,8 +1454,7 @@ int bch2_check_alloc_info(struct bch_fs *c) } ret = bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW); + BCH_TRANS_COMMIT_no_enospc); if (ret) goto bkey_err; @@ -1442,23 +1473,50 @@ bkey_err: if (ret < 0) goto err; - ret = for_each_btree_key2(trans, iter, + ret = for_each_btree_key(trans, iter, BTREE_ID_need_discard, POS_MIN, BTREE_ITER_PREFETCH, k, - bch2_check_discard_freespace_key(trans, &iter, k.k->p)) ?: - for_each_btree_key2(trans, iter, - BTREE_ID_freespace, POS_MIN, - BTREE_ITER_PREFETCH, k, - bch2_check_discard_freespace_key(trans, &iter, k.k->p)) ?: - for_each_btree_key_commit(trans, iter, + bch2_check_discard_freespace_key(trans, &iter)); + if (ret) + goto err; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_freespace, POS_MIN, + BTREE_ITER_PREFETCH); + while (1) { + bch2_trans_begin(trans); + k = bch2_btree_iter_peek(&iter); + if (!k.k) + break; + + ret = bkey_err(k) ?: + bch2_check_discard_freespace_key(trans, &iter); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { + ret = 0; + continue; + } + if (ret) { + struct printbuf buf = PRINTBUF; + bch2_bkey_val_to_text(&buf, c, k); + + bch_err(c, "while checking %s", buf.buf); + printbuf_exit(&buf); + break; + } + + bch2_btree_iter_set_pos(&iter, bpos_nosnap_successor(iter.pos)); + } + bch2_trans_iter_exit(trans, &iter); + if (ret) + goto err; + + ret = for_each_btree_key_commit(trans, iter, BTREE_ID_bucket_gens, POS_MIN, BTREE_ITER_PREFETCH, k, - NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_check_bucket_gens_key(trans, &iter, k)); err: bch2_trans_put(trans); - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; } @@ -1486,6 +1544,27 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, if (a->data_type != BCH_DATA_cached) return 0; + if (fsck_err_on(!a->io_time[READ], c, + alloc_key_cached_but_read_time_zero, + "cached bucket with read_time 0\n" + " %s", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { + struct bkey_i_alloc_v4 *a_mut = + bch2_alloc_to_v4_mut(trans, alloc_k); + ret = PTR_ERR_OR_ZERO(a_mut); + if (ret) + goto err; + + a_mut->v.io_time[READ] = atomic64_read(&c->io_clock[READ].now); + ret = bch2_trans_update(trans, alloc_iter, + &a_mut->k_i, BTREE_TRIGGER_NORUN); + if (ret) + goto err; + + a = &a_mut->v; + } + lru_k = bch2_bkey_get_iter(trans, &lru_iter, BTREE_ID_lru, lru_pos(alloc_k.k->p.inode, bucket_to_u64(alloc_k.k->p), @@ -1494,41 +1573,18 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, if (ret) return ret; - if (fsck_err_on(!a->io_time[READ], c, - alloc_key_cached_but_read_time_zero, - "cached bucket with read_time 0\n" - " %s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)) || - fsck_err_on(lru_k.k->type != KEY_TYPE_set, c, + if (fsck_err_on(lru_k.k->type != KEY_TYPE_set, c, alloc_key_to_missing_lru_entry, "missing lru entry\n" " %s", (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { - u64 read_time = a->io_time[READ] ?: - atomic64_read(&c->io_clock[READ].now); - ret = bch2_lru_set(trans, alloc_k.k->p.inode, bucket_to_u64(alloc_k.k->p), - read_time); + a->io_time[READ]); if (ret) goto err; - - if (a->io_time[READ] != read_time) { - struct bkey_i_alloc_v4 *a_mut = - bch2_alloc_to_v4_mut(trans, alloc_k); - ret = PTR_ERR_OR_ZERO(a_mut); - if (ret) - goto err; - - a_mut->v.io_time[READ] = read_time; - ret = bch2_trans_update(trans, alloc_iter, - &a_mut->k_i, BTREE_TRIGGER_NORUN); - if (ret) - goto err; - } } err: fsck_err: @@ -1539,27 +1595,45 @@ fsck_err: int bch2_check_alloc_to_lru_refs(struct bch_fs *c) { - struct btree_iter iter; - struct bkey_s_c k; - int ret = 0; - - ret = bch2_trans_run(c, + int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_alloc, POS_MIN, BTREE_ITER_PREFETCH, k, - NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_check_alloc_to_lru_ref(trans, &iter))); - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; } +struct discard_buckets_state { + u64 seen; + u64 open; + u64 need_journal_commit; + u64 discarded; + struct bch_dev *ca; + u64 need_journal_commit_this_dev; +}; + +static void discard_buckets_next_dev(struct bch_fs *c, struct discard_buckets_state *s, struct bch_dev *ca) +{ + if (s->ca == ca) + return; + + if (s->ca && s->need_journal_commit_this_dev > + bch2_dev_usage_read(s->ca).d[BCH_DATA_free].buckets) + bch2_journal_flush_async(&c->journal, NULL); + + if (s->ca) + percpu_ref_put(&s->ca->ref); + if (ca) + percpu_ref_get(&ca->ref); + s->ca = ca; + s->need_journal_commit_this_dev = 0; +} + static int bch2_discard_one_bucket(struct btree_trans *trans, struct btree_iter *need_discard_iter, struct bpos *discard_pos_done, - u64 *seen, - u64 *open, - u64 *need_journal_commit, - u64 *discarded) + struct discard_buckets_state *s) { struct bch_fs *c = trans->c; struct bpos pos = need_discard_iter->pos; @@ -1571,20 +1645,24 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, int ret = 0; ca = bch_dev_bkey_exists(c, pos.inode); + if (!percpu_ref_tryget(&ca->io_ref)) { bch2_btree_iter_set_pos(need_discard_iter, POS(pos.inode + 1, 0)); return 0; } + discard_buckets_next_dev(c, s, ca); + if (bch2_bucket_is_open_safe(c, pos.inode, pos.offset)) { - (*open)++; + s->open++; goto out; } if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, c->journal.flushed_seq_ondisk, pos.inode, pos.offset)) { - (*need_journal_commit)++; + s->need_journal_commit++; + s->need_journal_commit_this_dev++; goto out; } @@ -1637,7 +1715,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, * This works without any other locks because this is the only * thread that removes items from the need_discard tree */ - bch2_trans_unlock(trans); + bch2_trans_unlock_long(trans); blkdev_issue_discard(ca->disk_sb.bdev, k.k->p.offset * ca->mi.bucket_size, ca->mi.bucket_size, @@ -1655,14 +1733,14 @@ write: ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?: bch2_trans_commit(trans, NULL, NULL, BCH_WATERMARK_btree| - BTREE_INSERT_NOFAIL); + BCH_TRANS_COMMIT_no_enospc); if (ret) goto out; - this_cpu_inc(c->counters[BCH_COUNTER_bucket_discard]); - (*discarded)++; + count_event(c, bucket_discard); + s->discarded++; out: - (*seen)++; + s->seen++; bch2_trans_iter_exit(trans, &iter); percpu_ref_put(&ca->io_ref); printbuf_exit(&buf); @@ -1672,9 +1750,7 @@ out: static void bch2_do_discards_work(struct work_struct *work) { struct bch_fs *c = container_of(work, struct bch_fs, discard_work); - struct btree_iter iter; - struct bkey_s_c k; - u64 seen = 0, open = 0, need_journal_commit = 0, discarded = 0; + struct discard_buckets_state s = {}; struct bpos discard_pos_done = POS_MAX; int ret; @@ -1684,21 +1760,16 @@ static void bch2_do_discards_work(struct work_struct *work) * successful commit: */ ret = bch2_trans_run(c, - for_each_btree_key2(trans, iter, - BTREE_ID_need_discard, POS_MIN, 0, k, - bch2_discard_one_bucket(trans, &iter, &discard_pos_done, - &seen, - &open, - &need_journal_commit, - &discarded))); - - if (need_journal_commit * 2 > seen) - bch2_journal_flush_async(&c->journal, NULL); + for_each_btree_key(trans, iter, + BTREE_ID_need_discard, POS_MIN, 0, k, + bch2_discard_one_bucket(trans, &iter, &discard_pos_done, &s))); - bch2_write_ref_put(c, BCH_WRITE_REF_discard); + discard_buckets_next_dev(c, &s, NULL); - trace_discard_buckets(c, seen, open, need_journal_commit, discarded, + trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret)); + + bch2_write_ref_put(c, BCH_WRITE_REF_discard); } void bch2_do_discards(struct bch_fs *c) @@ -1760,7 +1831,7 @@ static int invalidate_one_bucket(struct btree_trans *trans, BTREE_TRIGGER_BUCKET_INVALIDATE) ?: bch2_trans_commit(trans, NULL, NULL, BCH_WATERMARK_btree| - BTREE_INSERT_NOFAIL); + BCH_TRANS_COMMIT_no_enospc); if (ret) goto out; @@ -1795,22 +1866,18 @@ err: static void bch2_do_invalidates_work(struct work_struct *work) { struct bch_fs *c = container_of(work, struct bch_fs, invalidate_work); - struct bch_dev *ca; struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_s_c k; - unsigned i; int ret = 0; - ret = bch2_btree_write_buffer_flush(trans); + ret = bch2_btree_write_buffer_tryflush(trans); if (ret) goto err; - for_each_member_device(ca, c, i) { + for_each_member_device(c, ca) { s64 nr_to_invalidate = should_invalidate_buckets(ca, bch2_dev_usage_read(ca)); - ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_lru, + ret = for_each_btree_key_upto(trans, iter, BTREE_ID_lru, lru_pos(ca->dev_idx, 0, 0), lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX), BTREE_ITER_INTENT, k, @@ -1884,8 +1951,7 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, ret = bch2_bucket_do_index(trans, k, a, true) ?: bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_NOFAIL); + BCH_TRANS_COMMIT_no_enospc); if (ret) goto bkey_err; @@ -1905,8 +1971,7 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, ret = bch2_btree_insert_trans(trans, BTREE_ID_freespace, freespace, 0) ?: bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_NOFAIL); + BCH_TRANS_COMMIT_no_enospc); if (ret) goto bkey_err; @@ -1937,8 +2002,6 @@ bkey_err: int bch2_fs_freespace_init(struct bch_fs *c) { - struct bch_dev *ca; - unsigned i; int ret = 0; bool doing_init = false; @@ -1947,7 +2010,7 @@ int bch2_fs_freespace_init(struct bch_fs *c) * every mount: */ - for_each_member_device(ca, c, i) { + for_each_member_device(c, ca) { if (ca->mi.freespace_initialized) continue; @@ -2007,15 +2070,13 @@ out: void bch2_recalc_capacity(struct bch_fs *c) { - struct bch_dev *ca; u64 capacity = 0, reserved_sectors = 0, gc_reserve; unsigned bucket_size_max = 0; unsigned long ra_pages = 0; - unsigned i; lockdep_assert_held(&c->state_lock); - for_each_online_member(ca, c, i) { + for_each_online_member(c, ca) { struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_disk->bdi; ra_pages += bdi->ra_pages; @@ -2023,7 +2084,7 @@ void bch2_recalc_capacity(struct bch_fs *c) bch2_set_ra_pages(c, ra_pages); - for_each_rw_member(ca, c, i) { + for_each_rw_member(c, ca) { u64 dev_reserve = 0; /* @@ -2079,11 +2140,9 @@ void bch2_recalc_capacity(struct bch_fs *c) u64 bch2_min_rw_member_capacity(struct bch_fs *c) { - struct bch_dev *ca; - unsigned i; u64 ret = U64_MAX; - for_each_rw_member(ca, c, i) + for_each_rw_member(c, ca) ret = min(ret, ca->mi.nbuckets * ca->mi.bucket_size); return ret; } diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index 73faf99a222a..e7f7e842ee1b 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -71,6 +71,24 @@ static inline enum bch_data_type bucket_data_type(enum bch_data_type data_type) return data_type == BCH_DATA_stripe ? BCH_DATA_user : data_type; } +static inline unsigned bch2_bucket_sectors(struct bch_alloc_v4 a) +{ + return a.dirty_sectors + a.cached_sectors; +} + +static inline unsigned bch2_bucket_sectors_dirty(struct bch_alloc_v4 a) +{ + return a.dirty_sectors; +} + +static inline unsigned bch2_bucket_sectors_fragmented(struct bch_dev *ca, + struct bch_alloc_v4 a) +{ + int d = bch2_bucket_sectors_dirty(a); + + return d ? max(0, ca->mi.bucket_size - d) : 0; +} + static inline u64 alloc_lru_idx_read(struct bch_alloc_v4 a) { return a.data_type == BCH_DATA_cached ? a.io_time[READ] : 0; @@ -90,10 +108,11 @@ static inline u64 alloc_lru_idx_fragmentation(struct bch_alloc_v4 a, struct bch_dev *ca) { if (!data_type_movable(a.data_type) || - a.dirty_sectors >= ca->mi.bucket_size) + !bch2_bucket_sectors_fragmented(ca, a)) return 0; - return div_u64((u64) a.dirty_sectors * (1ULL << 31), ca->mi.bucket_size); + u64 d = bch2_bucket_sectors_dirty(a); + return div_u64(d * (1ULL << 31), ca->mi.bucket_size); } static inline u64 alloc_freespace_genbits(struct bch_alloc_v4 a) @@ -163,24 +182,21 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_alloc ((struct bkey_ops) { \ .key_invalid = bch2_alloc_v1_invalid, \ .val_to_text = bch2_alloc_to_text, \ - .trans_trigger = bch2_trans_mark_alloc, \ - .atomic_trigger = bch2_mark_alloc, \ + .trigger = bch2_trigger_alloc, \ .min_val_size = 8, \ }) #define bch2_bkey_ops_alloc_v2 ((struct bkey_ops) { \ .key_invalid = bch2_alloc_v2_invalid, \ .val_to_text = bch2_alloc_to_text, \ - .trans_trigger = bch2_trans_mark_alloc, \ - .atomic_trigger = bch2_mark_alloc, \ + .trigger = bch2_trigger_alloc, \ .min_val_size = 8, \ }) #define bch2_bkey_ops_alloc_v3 ((struct bkey_ops) { \ .key_invalid = bch2_alloc_v3_invalid, \ .val_to_text = bch2_alloc_to_text, \ - .trans_trigger = bch2_trans_mark_alloc, \ - .atomic_trigger = bch2_mark_alloc, \ + .trigger = bch2_trigger_alloc, \ .min_val_size = 16, \ }) @@ -188,8 +204,7 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); .key_invalid = bch2_alloc_v4_invalid, \ .val_to_text = bch2_alloc_to_text, \ .swab = bch2_alloc_v4_swab, \ - .trans_trigger = bch2_trans_mark_alloc, \ - .atomic_trigger = bch2_mark_alloc, \ + .trigger = bch2_trigger_alloc, \ .min_val_size = 48, \ }) @@ -213,8 +228,8 @@ static inline bool bkey_is_alloc(const struct bkey *k) int bch2_alloc_read(struct bch_fs *); -int bch2_trans_mark_alloc(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_i *, unsigned); +int bch2_trigger_alloc(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_s, unsigned); int bch2_check_alloc_info(struct bch_fs *); int bch2_check_alloc_to_lru_refs(struct bch_fs *); void bch2_do_discards(struct bch_fs *); diff --git a/fs/bcachefs/alloc_background_format.h b/fs/bcachefs/alloc_background_format.h new file mode 100644 index 000000000000..b4ec20be93b8 --- /dev/null +++ b/fs/bcachefs/alloc_background_format.h @@ -0,0 +1,92 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_ALLOC_BACKGROUND_FORMAT_H +#define _BCACHEFS_ALLOC_BACKGROUND_FORMAT_H + +struct bch_alloc { + struct bch_val v; + __u8 fields; + __u8 gen; + __u8 data[]; +} __packed __aligned(8); + +#define BCH_ALLOC_FIELDS_V1() \ + x(read_time, 16) \ + x(write_time, 16) \ + x(data_type, 8) \ + x(dirty_sectors, 16) \ + x(cached_sectors, 16) \ + x(oldest_gen, 8) \ + x(stripe, 32) \ + x(stripe_redundancy, 8) + +enum { +#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name, + BCH_ALLOC_FIELDS_V1() +#undef x +}; + +struct bch_alloc_v2 { + struct bch_val v; + __u8 nr_fields; + __u8 gen; + __u8 oldest_gen; + __u8 data_type; + __u8 data[]; +} __packed __aligned(8); + +#define BCH_ALLOC_FIELDS_V2() \ + x(read_time, 64) \ + x(write_time, 64) \ + x(dirty_sectors, 32) \ + x(cached_sectors, 32) \ + x(stripe, 32) \ + x(stripe_redundancy, 8) + +struct bch_alloc_v3 { + struct bch_val v; + __le64 journal_seq; + __le32 flags; + __u8 nr_fields; + __u8 gen; + __u8 oldest_gen; + __u8 data_type; + __u8 data[]; +} __packed __aligned(8); + +LE32_BITMASK(BCH_ALLOC_V3_NEED_DISCARD,struct bch_alloc_v3, flags, 0, 1) +LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags, 1, 2) + +struct bch_alloc_v4 { + struct bch_val v; + __u64 journal_seq; + __u32 flags; + __u8 gen; + __u8 oldest_gen; + __u8 data_type; + __u8 stripe_redundancy; + __u32 dirty_sectors; + __u32 cached_sectors; + __u64 io_time[2]; + __u32 stripe; + __u32 nr_external_backpointers; + __u64 fragmentation_lru; +} __packed __aligned(8); + +#define BCH_ALLOC_V4_U64s_V0 6 +#define BCH_ALLOC_V4_U64s (sizeof(struct bch_alloc_v4) / sizeof(__u64)) + +BITMASK(BCH_ALLOC_V4_NEED_DISCARD, struct bch_alloc_v4, flags, 0, 1) +BITMASK(BCH_ALLOC_V4_NEED_INC_GEN, struct bch_alloc_v4, flags, 1, 2) +BITMASK(BCH_ALLOC_V4_BACKPOINTERS_START,struct bch_alloc_v4, flags, 2, 8) +BITMASK(BCH_ALLOC_V4_NR_BACKPOINTERS, struct bch_alloc_v4, flags, 8, 14) + +#define KEY_TYPE_BUCKET_GENS_BITS 8 +#define KEY_TYPE_BUCKET_GENS_NR (1U << KEY_TYPE_BUCKET_GENS_BITS) +#define KEY_TYPE_BUCKET_GENS_MASK (KEY_TYPE_BUCKET_GENS_NR - 1) + +struct bch_bucket_gens { + struct bch_val v; + u8 gens[KEY_TYPE_BUCKET_GENS_NR]; +} __packed __aligned(8); + +#endif /* _BCACHEFS_ALLOC_BACKGROUND_FORMAT_H */ diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 0e6157982607..633d3223b353 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -69,11 +69,8 @@ const char * const bch2_watermarks[] = { void bch2_reset_alloc_cursors(struct bch_fs *c) { - struct bch_dev *ca; - unsigned i; - rcu_read_lock(); - for_each_member_device_rcu(ca, c, i, NULL) + for_each_member_device_rcu(c, ca, NULL) ca->alloc_cursor = 0; rcu_read_unlock(); } @@ -239,9 +236,8 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * if (cl) closure_wait(&c->open_buckets_wait, cl); - if (!c->blocked_allocate_open_bucket) - c->blocked_allocate_open_bucket = local_clock(); - + track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], + &c->blocked_allocate_open_bucket, true); spin_unlock(&c->freelist_lock); return ERR_PTR(-BCH_ERR_open_buckets_empty); } @@ -267,19 +263,11 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * ca->nr_open_buckets++; bch2_open_bucket_hash_add(c, ob); - if (c->blocked_allocate_open_bucket) { - bch2_time_stats_update( - &c->times[BCH_TIME_blocked_allocate_open_bucket], - c->blocked_allocate_open_bucket); - c->blocked_allocate_open_bucket = 0; - } + track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], + &c->blocked_allocate_open_bucket, false); - if (c->blocked_allocate) { - bch2_time_stats_update( - &c->times[BCH_TIME_blocked_allocate], - c->blocked_allocate); - c->blocked_allocate = 0; - } + track_event_change(&c->times[BCH_TIME_blocked_allocate], + &c->blocked_allocate, false); spin_unlock(&c->freelist_lock); return ob; @@ -377,9 +365,9 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc ob = __try_alloc_bucket(c, ca, b, watermark, a, s, cl); if (!ob) - iter.path->preserve = false; + set_btree_iter_dontneed(&iter); err: - if (iter.trans && iter.path) + if (iter.path) set_btree_iter_dontneed(&iter); bch2_trans_iter_exit(trans, &iter); printbuf_exit(&buf); @@ -447,7 +435,7 @@ again: ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, watermark, a, s, cl); next: - citer.path->preserve = false; + set_btree_iter_dontneed(&citer); bch2_trans_iter_exit(trans, &citer); if (ob) break; @@ -502,7 +490,7 @@ again: ob = try_alloc_bucket(trans, ca, watermark, alloc_cursor, s, k, cl); if (ob) { - iter.path->preserve = false; + set_btree_iter_dontneed(&iter); break; } } @@ -567,8 +555,8 @@ again: goto again; } - if (!c->blocked_allocate) - c->blocked_allocate = local_clock(); + track_event_change(&c->times[BCH_TIME_blocked_allocate], + &c->blocked_allocate, true); ob = ERR_PTR(-BCH_ERR_freelist_empty); goto err; @@ -697,11 +685,9 @@ static int add_new_bucket(struct bch_fs *c, bch_dev_bkey_exists(c, ob->dev)->mi.durability; BUG_ON(*nr_effective >= nr_replicas); - BUG_ON(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS); __clear_bit(ob->dev, devs_may_alloc->d); - *nr_effective += (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS) - ? durability : 1; + *nr_effective += durability; *have_cache |= !durability; ob_push(c, ptrs, ob); @@ -972,8 +958,8 @@ static int __open_bucket_add_buckets(struct btree_trans *trans, devs = target_rw_devs(c, wp->data_type, target); /* Don't allocate from devices we already have pointers to: */ - for (i = 0; i < devs_have->nr; i++) - __clear_bit(devs_have->devs[i], devs.d); + darray_for_each(*devs_have, i) + __clear_bit(*i, devs.d); open_bucket_for_each(c, ptrs, ob, i) __clear_bit(ob->dev, devs.d); @@ -1539,10 +1525,11 @@ static void bch2_open_bucket_to_text(struct printbuf *out, struct bch_fs *c, str unsigned data_type = ob->data_type; barrier(); /* READ_ONCE() doesn't work on bitfields */ - prt_printf(out, "%zu ref %u %s %u:%llu gen %u allocated %u/%u", + prt_printf(out, "%zu ref %u ", ob - c->open_buckets, - atomic_read(&ob->pin), - data_type < BCH_DATA_NR ? bch2_data_types[data_type] : "invalid data type", + atomic_read(&ob->pin)); + bch2_prt_data_type(out, data_type); + prt_printf(out, " %u:%llu gen %u allocated %u/%u", ob->dev, ob->bucket, ob->gen, ca->mi.bucket_size - ob->sectors_free, ca->mi.bucket_size); if (ob->ec) diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 23c0834a97a4..b4dc319bcb2b 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -3,6 +3,7 @@ #include "bbpos.h" #include "alloc_background.h" #include "backpointers.h" +#include "bkey_buf.h" #include "btree_cache.h" #include "btree_update.h" #include "btree_update_interior.h" @@ -136,15 +137,30 @@ static noinline int backpointer_mod_err(struct btree_trans *trans, } int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans, - struct bkey_i_backpointer *bp_k, + struct bpos bucket, struct bch_backpointer bp, struct bkey_s_c orig_k, bool insert) { struct btree_iter bp_iter; struct bkey_s_c k; + struct bkey_i_backpointer *bp_k; int ret; + bp_k = bch2_trans_kmalloc_nomemzero(trans, sizeof(struct bkey_i_backpointer)); + ret = PTR_ERR_OR_ZERO(bp_k); + if (ret) + return ret; + + bkey_backpointer_init(&bp_k->k_i); + bp_k->k.p = bucket_pos_to_bp(trans->c, bucket, bp.bucket_offset); + bp_k->v = bp; + + if (!insert) { + bp_k->k.type = KEY_TYPE_deleted; + set_bkey_val_u64s(&bp_k->k, 0); + } + k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, bp_k->k.p, BTREE_ITER_INTENT| @@ -375,41 +391,45 @@ fsck_err: /* verify that every backpointer has a corresponding alloc key */ int bch2_check_btree_backpointers(struct bch_fs *c) { - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - ret = bch2_trans_run(c, + int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_backpointers, POS_MIN, 0, k, - NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_check_btree_backpointer(trans, &iter, k))); - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; } -struct bpos_level { - unsigned level; - struct bpos pos; +static inline bool bkey_and_val_eq(struct bkey_s_c l, struct bkey_s_c r) +{ + return bpos_eq(l.k->p, r.k->p) && + bkey_bytes(l.k) == bkey_bytes(r.k) && + !memcmp(l.v, r.v, bkey_val_bytes(l.k)); +} + +struct extents_to_bp_state { + struct bpos bucket_start; + struct bpos bucket_end; + struct bkey_buf last_flushed; }; static int check_bp_exists(struct btree_trans *trans, + struct extents_to_bp_state *s, struct bpos bucket, struct bch_backpointer bp, - struct bkey_s_c orig_k, - struct bpos bucket_start, - struct bpos bucket_end, - struct bpos_level *last_flushed) + struct bkey_s_c orig_k) { struct bch_fs *c = trans->c; struct btree_iter bp_iter = { NULL }; struct printbuf buf = PRINTBUF; struct bkey_s_c bp_k; + struct bkey_buf tmp; int ret; - if (bpos_lt(bucket, bucket_start) || - bpos_gt(bucket, bucket_end)) + bch2_bkey_buf_init(&tmp); + + if (bpos_lt(bucket, s->bucket_start) || + bpos_gt(bucket, s->bucket_end)) return 0; if (!bch2_dev_bucket_exists(c, bucket)) @@ -424,13 +444,20 @@ static int check_bp_exists(struct btree_trans *trans, if (bp_k.k->type != KEY_TYPE_backpointer || memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp, sizeof(bp))) { - if (last_flushed->level != bp.level || - !bpos_eq(last_flushed->pos, orig_k.k->p)) { - last_flushed->level = bp.level; - last_flushed->pos = orig_k.k->p; + bch2_bkey_buf_reassemble(&tmp, c, orig_k); + + if (!bkey_and_val_eq(orig_k, bkey_i_to_s_c(s->last_flushed.k))) { + if (bp.level) { + bch2_trans_unlock(trans); + bch2_btree_interior_updates_flush(c); + } + + ret = bch2_btree_write_buffer_flush_sync(trans); + if (ret) + goto err; - ret = bch2_btree_write_buffer_flush_sync(trans) ?: - -BCH_ERR_transaction_restart_write_buffer_flush; + bch2_bkey_buf_copy(&s->last_flushed, c, tmp.k); + ret = -BCH_ERR_transaction_restart_write_buffer_flush; goto out; } goto missing; @@ -439,6 +466,7 @@ out: err: fsck_err: bch2_trans_iter_exit(trans, &bp_iter); + bch2_bkey_buf_exit(&tmp, c); printbuf_exit(&buf); return ret; missing: @@ -448,8 +476,7 @@ missing: prt_printf(&buf, "\nbp pos "); bch2_bpos_to_text(&buf, bp_iter.pos); - if (c->sb.version_upgrade_complete < bcachefs_metadata_version_backpointers || - c->opts.reconstruct_alloc || + if (c->opts.reconstruct_alloc || fsck_err(c, ptr_to_missing_backpointer, "%s", buf.buf)) ret = bch2_bucket_backpointer_mod(trans, bucket, bp, orig_k, true); @@ -457,25 +484,16 @@ missing: } static int check_extent_to_backpointers(struct btree_trans *trans, - struct btree_iter *iter, - struct bpos bucket_start, - struct bpos bucket_end, - struct bpos_level *last_flushed) + struct extents_to_bp_state *s, + enum btree_id btree, unsigned level, + struct bkey_s_c k) { struct bch_fs *c = trans->c; struct bkey_ptrs_c ptrs; const union bch_extent_entry *entry; struct extent_ptr_decoded p; - struct bkey_s_c k; int ret; - k = bch2_btree_iter_peek_all_levels(iter); - ret = bkey_err(k); - if (ret) - return ret; - if (!k.k) - return 0; - ptrs = bch2_bkey_ptrs_c(k); bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { struct bpos bucket_pos; @@ -484,12 +502,10 @@ static int check_extent_to_backpointers(struct btree_trans *trans, if (p.ptr.cached) continue; - bch2_extent_ptr_to_bp(c, iter->btree_id, iter->path->level, + bch2_extent_ptr_to_bp(c, btree, level, k, p, &bucket_pos, &bp); - ret = check_bp_exists(trans, bucket_pos, bp, k, - bucket_start, bucket_end, - last_flushed); + ret = check_bp_exists(trans, s, bucket_pos, bp, k); if (ret) return ret; } @@ -498,47 +514,32 @@ static int check_extent_to_backpointers(struct btree_trans *trans, } static int check_btree_root_to_backpointers(struct btree_trans *trans, + struct extents_to_bp_state *s, enum btree_id btree_id, - struct bpos bucket_start, - struct bpos bucket_end, - struct bpos_level *last_flushed) + int *level) { struct bch_fs *c = trans->c; - struct btree_root *r = bch2_btree_id_root(c, btree_id); struct btree_iter iter; struct btree *b; struct bkey_s_c k; - struct bkey_ptrs_c ptrs; - struct extent_ptr_decoded p; - const union bch_extent_entry *entry; int ret; - - bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, r->level, 0); +retry: + bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, + 0, bch2_btree_id_root(c, btree_id)->b->c.level, 0); b = bch2_btree_iter_peek_node(&iter); ret = PTR_ERR_OR_ZERO(b); if (ret) goto err; - BUG_ON(b != btree_node_root(c, b)); - - k = bkey_i_to_s_c(&b->key); - ptrs = bch2_bkey_ptrs_c(k); - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - struct bpos bucket_pos; - struct bch_backpointer bp; - - if (p.ptr.cached) - continue; + if (b != btree_node_root(c, b)) { + bch2_trans_iter_exit(trans, &iter); + goto retry; + } - bch2_extent_ptr_to_bp(c, iter.btree_id, b->c.level + 1, - k, p, &bucket_pos, &bp); + *level = b->c.level; - ret = check_bp_exists(trans, bucket_pos, bp, k, - bucket_start, bucket_end, - last_flushed); - if (ret) - goto err; - } + k = bkey_i_to_s_c(&b->key); + ret = check_extent_to_backpointers(trans, s, btree_id, b->c.level + 1, k); err: bch2_trans_iter_exit(trans, &iter); return ret; @@ -559,7 +560,7 @@ static size_t btree_nodes_fit_in_ram(struct bch_fs *c) si_meminfo(&i); mem_bytes = i.totalram * i.mem_unit; - return div_u64(mem_bytes >> 1, btree_bytes(c)); + return div_u64(mem_bytes >> 1, c->opts.btree_node_size); } static int bch2_get_btree_in_memory_pos(struct btree_trans *trans, @@ -610,49 +611,57 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans, } static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, - struct bpos bucket_start, - struct bpos bucket_end) + struct extents_to_bp_state *s) { struct bch_fs *c = trans->c; - struct btree_iter iter; - enum btree_id btree_id; - struct bpos_level last_flushed = { UINT_MAX, POS_MIN }; int ret = 0; - for (btree_id = 0; btree_id < btree_id_nr_alive(c); btree_id++) { - unsigned depth = btree_type_has_ptrs(btree_id) ? 0 : 1; + for (enum btree_id btree_id = 0; + btree_id < btree_id_nr_alive(c); + btree_id++) { + int level, depth = btree_type_has_ptrs(btree_id) ? 0 : 1; - bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, - depth, - BTREE_ITER_ALL_LEVELS| - BTREE_ITER_PREFETCH); - - do { - ret = commit_do(trans, NULL, NULL, - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_NOFAIL, - check_extent_to_backpointers(trans, &iter, - bucket_start, bucket_end, - &last_flushed)); - if (ret) - break; - } while (!bch2_btree_iter_advance(&iter)); + ret = commit_do(trans, NULL, NULL, + BCH_TRANS_COMMIT_no_enospc, + check_btree_root_to_backpointers(trans, s, btree_id, &level)); + if (ret) + return ret; - bch2_trans_iter_exit(trans, &iter); + while (level >= depth) { + struct btree_iter iter; + bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, + level, + BTREE_ITER_PREFETCH); + while (1) { + bch2_trans_begin(trans); + + struct bkey_s_c k = bch2_btree_iter_peek(&iter); + if (!k.k) + break; + ret = bkey_err(k) ?: + check_extent_to_backpointers(trans, s, btree_id, level, k) ?: + bch2_trans_commit(trans, NULL, NULL, + BCH_TRANS_COMMIT_no_enospc); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { + ret = 0; + continue; + } + if (ret) + break; + if (bpos_eq(iter.pos, SPOS_MAX)) + break; + bch2_btree_iter_advance(&iter); + } + bch2_trans_iter_exit(trans, &iter); - if (ret) - break; + if (ret) + return ret; - ret = commit_do(trans, NULL, NULL, - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_NOFAIL, - check_btree_root_to_backpointers(trans, btree_id, - bucket_start, bucket_end, - &last_flushed)); - if (ret) - break; + --level; + } } - return ret; + + return 0; } static struct bpos bucket_pos_to_bp_safe(const struct bch_fs *c, @@ -714,40 +723,45 @@ static int bch2_get_alloc_in_memory_pos(struct btree_trans *trans, int bch2_check_extents_to_backpointers(struct bch_fs *c) { struct btree_trans *trans = bch2_trans_get(c); - struct bpos start = POS_MIN, end; + struct extents_to_bp_state s = { .bucket_start = POS_MIN }; int ret; + bch2_bkey_buf_init(&s.last_flushed); + bkey_init(&s.last_flushed.k->k); + while (1) { - ret = bch2_get_alloc_in_memory_pos(trans, start, &end); + ret = bch2_get_alloc_in_memory_pos(trans, s.bucket_start, &s.bucket_end); if (ret) break; - if (bpos_eq(start, POS_MIN) && !bpos_eq(end, SPOS_MAX)) + if ( bpos_eq(s.bucket_start, POS_MIN) && + !bpos_eq(s.bucket_end, SPOS_MAX)) bch_verbose(c, "%s(): alloc info does not fit in ram, running in multiple passes with %zu nodes per pass", __func__, btree_nodes_fit_in_ram(c)); - if (!bpos_eq(start, POS_MIN) || !bpos_eq(end, SPOS_MAX)) { + if (!bpos_eq(s.bucket_start, POS_MIN) || + !bpos_eq(s.bucket_end, SPOS_MAX)) { struct printbuf buf = PRINTBUF; prt_str(&buf, "check_extents_to_backpointers(): "); - bch2_bpos_to_text(&buf, start); + bch2_bpos_to_text(&buf, s.bucket_start); prt_str(&buf, "-"); - bch2_bpos_to_text(&buf, end); + bch2_bpos_to_text(&buf, s.bucket_end); bch_verbose(c, "%s", buf.buf); printbuf_exit(&buf); } - ret = bch2_check_extents_to_backpointers_pass(trans, start, end); - if (ret || bpos_eq(end, SPOS_MAX)) + ret = bch2_check_extents_to_backpointers_pass(trans, &s); + if (ret || bpos_eq(s.bucket_end, SPOS_MAX)) break; - start = bpos_successor(end); + s.bucket_start = bpos_successor(s.bucket_end); } bch2_trans_put(trans); + bch2_bkey_buf_exit(&s.last_flushed, c); - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; } @@ -801,13 +815,11 @@ static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans, struct bbpos start, struct bbpos end) { - struct btree_iter iter; - struct bkey_s_c k; struct bpos last_flushed_pos = SPOS_MAX; return for_each_btree_key_commit(trans, iter, BTREE_ID_backpointers, POS_MIN, BTREE_ITER_PREFETCH, k, - NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, check_one_backpointer(trans, start, end, bkey_s_c_to_backpointer(k), &last_flushed_pos)); @@ -854,7 +866,6 @@ int bch2_check_backpointers_to_extents(struct bch_fs *c) } bch2_trans_put(trans); - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; } diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h index ab866feeaf66..327365a9feac 100644 --- a/fs/bcachefs/backpointers.h +++ b/fs/bcachefs/backpointers.h @@ -2,6 +2,7 @@ #ifndef _BCACHEFS_BACKPOINTERS_BACKGROUND_H #define _BCACHEFS_BACKPOINTERS_BACKGROUND_H +#include "btree_cache.h" #include "btree_iter.h" #include "btree_update.h" #include "buckets.h" @@ -63,7 +64,7 @@ static inline struct bpos bucket_pos_to_bp(const struct bch_fs *c, return ret; } -int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *, struct bkey_i_backpointer *, +int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *, struct bpos bucket, struct bch_backpointer, struct bkey_s_c, bool); static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans, @@ -72,28 +73,21 @@ static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans, struct bkey_s_c orig_k, bool insert) { - struct bch_fs *c = trans->c; - struct bkey_i_backpointer *bp_k; - int ret; + if (unlikely(bch2_backpointers_no_use_write_buffer)) + return bch2_bucket_backpointer_mod_nowritebuffer(trans, bucket, bp, orig_k, insert); - bp_k = bch2_trans_kmalloc_nomemzero(trans, sizeof(struct bkey_i_backpointer)); - ret = PTR_ERR_OR_ZERO(bp_k); - if (ret) - return ret; + struct bkey_i_backpointer bp_k; - bkey_backpointer_init(&bp_k->k_i); - bp_k->k.p = bucket_pos_to_bp(c, bucket, bp.bucket_offset); - bp_k->v = bp; + bkey_backpointer_init(&bp_k.k_i); + bp_k.k.p = bucket_pos_to_bp(trans->c, bucket, bp.bucket_offset); + bp_k.v = bp; if (!insert) { - bp_k->k.type = KEY_TYPE_deleted; - set_bkey_val_u64s(&bp_k->k, 0); + bp_k.k.type = KEY_TYPE_deleted; + set_bkey_val_u64s(&bp_k.k, 0); } - if (unlikely(bch2_backpointers_no_use_write_buffer)) - return bch2_bucket_backpointer_mod_nowritebuffer(trans, bp_k, bp, orig_k, insert); - - return bch2_trans_update_buffered(trans, BTREE_ID_backpointers, &bp_k->k_i); + return bch2_trans_update_buffered(trans, BTREE_ID_backpointers, &bp_k.k_i); } static inline enum bch_data_type bkey_ptr_data_type(enum btree_id btree_id, unsigned level, diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index b62737fdf5ab..b80c6c9efd8c 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -193,6 +193,7 @@ #include <linux/mutex.h> #include <linux/percpu-refcount.h> #include <linux/percpu-rwsem.h> +#include <linux/refcount.h> #include <linux/rhashtable.h> #include <linux/rwsem.h> #include <linux/semaphore.h> @@ -223,9 +224,11 @@ #define race_fault(...) dynamic_fault("bcachefs:race") +#define count_event(_c, _name) this_cpu_inc((_c)->counters[BCH_COUNTER_##_name]) + #define trace_and_count(_c, _name, ...) \ do { \ - this_cpu_inc((_c)->counters[BCH_COUNTER_##_name]); \ + count_event(_c, _name); \ trace_##_name(__VA_ARGS__); \ } while (0) @@ -262,46 +265,76 @@ do { \ #define bch2_fmt(_c, fmt) bch2_log_msg(_c, fmt "\n") +__printf(2, 3) +void __bch2_print(struct bch_fs *c, const char *fmt, ...); + +#define maybe_dev_to_fs(_c) _Generic((_c), \ + struct bch_dev *: ((struct bch_dev *) (_c))->fs, \ + struct bch_fs *: (_c)) + +#define bch2_print(_c, ...) __bch2_print(maybe_dev_to_fs(_c), __VA_ARGS__) + +#define bch2_print_ratelimited(_c, ...) \ +do { \ + static DEFINE_RATELIMIT_STATE(_rs, \ + DEFAULT_RATELIMIT_INTERVAL, \ + DEFAULT_RATELIMIT_BURST); \ + \ + if (__ratelimit(&_rs)) \ + bch2_print(_c, __VA_ARGS__); \ +} while (0) + #define bch_info(c, fmt, ...) \ - printk(KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__) + bch2_print(c, KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__) #define bch_notice(c, fmt, ...) \ - printk(KERN_NOTICE bch2_fmt(c, fmt), ##__VA_ARGS__) + bch2_print(c, KERN_NOTICE bch2_fmt(c, fmt), ##__VA_ARGS__) #define bch_warn(c, fmt, ...) \ - printk(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__) + bch2_print(c, KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__) #define bch_warn_ratelimited(c, fmt, ...) \ - printk_ratelimited(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__) + bch2_print_ratelimited(c, KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__) #define bch_err(c, fmt, ...) \ - printk(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__) + bch2_print(c, KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__) #define bch_err_dev(ca, fmt, ...) \ - printk(KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__) + bch2_print(c, KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__) #define bch_err_dev_offset(ca, _offset, fmt, ...) \ - printk(KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__) + bch2_print(c, KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__) #define bch_err_inum(c, _inum, fmt, ...) \ - printk(KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__) + bch2_print(c, KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__) #define bch_err_inum_offset(c, _inum, _offset, fmt, ...) \ - printk(KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__) + bch2_print(c, KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__) #define bch_err_ratelimited(c, fmt, ...) \ - printk_ratelimited(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__) + bch2_print_ratelimited(c, KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__) #define bch_err_dev_ratelimited(ca, fmt, ...) \ - printk_ratelimited(KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__) + bch2_print_ratelimited(ca, KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__) #define bch_err_dev_offset_ratelimited(ca, _offset, fmt, ...) \ - printk_ratelimited(KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__) + bch2_print_ratelimited(ca, KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__) #define bch_err_inum_ratelimited(c, _inum, fmt, ...) \ - printk_ratelimited(KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__) + bch2_print_ratelimited(c, KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__) #define bch_err_inum_offset_ratelimited(c, _inum, _offset, fmt, ...) \ - printk_ratelimited(KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__) + bch2_print_ratelimited(c, KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__) + +static inline bool should_print_err(int err) +{ + return err && !bch2_err_matches(err, BCH_ERR_transaction_restart); +} #define bch_err_fn(_c, _ret) \ do { \ - if (_ret && !bch2_err_matches(_ret, BCH_ERR_transaction_restart))\ + if (should_print_err(_ret)) \ bch_err(_c, "%s(): error %s", __func__, bch2_err_str(_ret));\ } while (0) +#define bch_err_fn_ratelimited(_c, _ret) \ +do { \ + if (should_print_err(_ret)) \ + bch_err_ratelimited(_c, "%s(): error %s", __func__, bch2_err_str(_ret));\ +} while (0) + #define bch_err_msg(_c, _ret, _msg, ...) \ do { \ - if (_ret && !bch2_err_matches(_ret, BCH_ERR_transaction_restart))\ + if (should_print_err(_ret)) \ bch_err(_c, "%s(): error " _msg " %s", __func__, \ ##__VA_ARGS__, bch2_err_str(_ret)); \ } while (0) @@ -392,6 +425,7 @@ BCH_DEBUG_PARAMS_DEBUG() x(btree_node_merge) \ x(btree_node_sort) \ x(btree_node_read) \ + x(btree_node_read_done) \ x(btree_interior_update_foreground) \ x(btree_interior_update_total) \ x(btree_gc) \ @@ -401,9 +435,12 @@ BCH_DEBUG_PARAMS_DEBUG() x(journal_flush_write) \ x(journal_noflush_write) \ x(journal_flush_seq) \ - x(blocked_journal) \ + x(blocked_journal_low_on_space) \ + x(blocked_journal_low_on_pin) \ + x(blocked_journal_max_in_flight) \ x(blocked_allocate) \ x(blocked_allocate_open_bucket) \ + x(blocked_write_buffer_full) \ x(nocow_lock_contended) enum bch_time_stats { @@ -428,6 +465,7 @@ enum bch_time_stats { #include "replicas_types.h" #include "subvolume_types.h" #include "super_types.h" +#include "thread_with_file_types.h" /* Number of nodes btree coalesce will try to coalesce at once */ #define GC_MERGE_NODES 4U @@ -564,32 +602,35 @@ struct bch_dev { struct io_count __percpu *io_done; }; -enum { - /* startup: */ - BCH_FS_STARTED, - BCH_FS_MAY_GO_RW, - BCH_FS_RW, - BCH_FS_WAS_RW, - - /* shutdown: */ - BCH_FS_STOPPING, - BCH_FS_EMERGENCY_RO, - BCH_FS_GOING_RO, - BCH_FS_WRITE_DISABLE_COMPLETE, - BCH_FS_CLEAN_SHUTDOWN, - - /* fsck passes: */ - BCH_FS_FSCK_DONE, - BCH_FS_INITIAL_GC_UNFIXED, /* kill when we enumerate fsck errors */ - BCH_FS_NEED_ANOTHER_GC, - - BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, - - /* errors: */ - BCH_FS_ERROR, - BCH_FS_TOPOLOGY_ERROR, - BCH_FS_ERRORS_FIXED, - BCH_FS_ERRORS_NOT_FIXED, +/* + * initial_gc_unfixed + * error + * topology error + */ + +#define BCH_FS_FLAGS() \ + x(started) \ + x(may_go_rw) \ + x(rw) \ + x(was_rw) \ + x(stopping) \ + x(emergency_ro) \ + x(going_ro) \ + x(write_disable_complete) \ + x(clean_shutdown) \ + x(fsck_running) \ + x(initial_gc_unfixed) \ + x(need_another_gc) \ + x(need_delete_dead_snapshots) \ + x(error) \ + x(topology_error) \ + x(errors_fixed) \ + x(errors_not_fixed) + +enum bch_fs_flags { +#define x(n) BCH_FS_##n, + BCH_FS_FLAGS() +#undef x }; struct btree_debug { @@ -599,10 +640,11 @@ struct btree_debug { #define BCH_TRANSACTIONS_NR 128 struct btree_transaction_stats { + struct bch2_time_stats duration; struct bch2_time_stats lock_hold_times; struct mutex lock; unsigned nr_max_paths; - unsigned wb_updates_size; + unsigned journal_entries_size; unsigned max_mem; char *max_paths_text; }; @@ -664,7 +706,8 @@ struct btree_trans_buf { x(invalidate) \ x(delete_dead_snapshots) \ x(snapshot_delete_pagecache) \ - x(sysfs) + x(sysfs) \ + x(btree_write_buffer) enum bch_write_ref { #define x(n) BCH_WRITE_REF_##n, @@ -689,6 +732,8 @@ struct bch_fs { struct super_block *vfs_sb; dev_t dev; char name[40]; + struct stdio_redirect *stdio; + struct task_struct *stdio_filter; /* ro/rw, add/remove/resize devices: */ struct rw_semaphore state_lock; @@ -699,6 +744,13 @@ struct bch_fs { #else struct percpu_ref writes; #endif + /* + * Analagous to c->writes, for asynchronous ops that don't necessarily + * need fs to be read-write + */ + refcount_t ro_ref; + wait_queue_head_t ro_ref_wait; + struct work_struct read_only_work; struct bch_dev __rcu *devs[BCH_SB_MEMBERS_MAX]; @@ -1002,10 +1054,21 @@ struct bch_fs { /* RECOVERY */ u64 journal_replay_seq_start; u64 journal_replay_seq_end; + /* + * Two different uses: + * "Has this fsck pass?" - i.e. should this type of error be an + * emergency read-only + * And, in certain situations fsck will rewind to an earlier pass: used + * for signaling to the toplevel code which pass we want to run now. + */ enum bch_recovery_pass curr_recovery_pass; /* bitmap of explicitly enabled recovery passes: */ u64 recovery_passes_explicit; + /* bitmask of recovery passes that we actually ran */ u64 recovery_passes_complete; + /* never rewinds version of curr_recovery_pass */ + enum bch_recovery_pass recovery_pass_done; + struct semaphore online_fsck_mutex; /* DEBUG JUNK */ struct dentry *fs_debug_dir; @@ -1065,10 +1128,20 @@ static inline void bch2_write_ref_get(struct bch_fs *c, enum bch_write_ref ref) #endif } +static inline bool __bch2_write_ref_tryget(struct bch_fs *c, enum bch_write_ref ref) +{ +#ifdef BCH_WRITE_REF_DEBUG + return !test_bit(BCH_FS_going_ro, &c->flags) && + atomic_long_inc_not_zero(&c->writes[ref]); +#else + return percpu_ref_tryget(&c->writes); +#endif +} + static inline bool bch2_write_ref_tryget(struct bch_fs *c, enum bch_write_ref ref) { #ifdef BCH_WRITE_REF_DEBUG - return !test_bit(BCH_FS_GOING_RO, &c->flags) && + return !test_bit(BCH_FS_going_ro, &c->flags) && atomic_long_inc_not_zero(&c->writes[ref]); #else return percpu_ref_tryget_live(&c->writes); @@ -1087,13 +1160,27 @@ static inline void bch2_write_ref_put(struct bch_fs *c, enum bch_write_ref ref) if (atomic_long_read(&c->writes[i])) return; - set_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags); + set_bit(BCH_FS_write_disable_complete, &c->flags); wake_up(&bch2_read_only_wait); #else percpu_ref_put(&c->writes); #endif } +static inline bool bch2_ro_ref_tryget(struct bch_fs *c) +{ + if (test_bit(BCH_FS_stopping, &c->flags)) + return false; + + return refcount_inc_not_zero(&c->ro_ref); +} + +static inline void bch2_ro_ref_put(struct bch_fs *c) +{ + if (refcount_dec_and_test(&c->ro_ref)) + wake_up(&c->ro_ref_wait); +} + static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages) { #ifndef NO_BCACHEFS_FS @@ -1117,11 +1204,6 @@ static inline unsigned block_sectors(const struct bch_fs *c) return c->opts.block_size >> 9; } -static inline size_t btree_sectors(const struct bch_fs *c) -{ - return c->opts.btree_node_size >> 9; -} - static inline bool btree_id_cached(const struct bch_fs *c, enum btree_id btree) { return c->btree_key_cache_btrees & (1U << btree); @@ -1158,6 +1240,15 @@ static inline bool bch2_dev_exists2(const struct bch_fs *c, unsigned dev) return dev < c->sb.nr_devices && c->devs[dev]; } +static inline struct stdio_redirect *bch2_fs_stdio_redirect(struct bch_fs *c) +{ + struct stdio_redirect *stdio = c->stdio; + + if (c->stdio_filter && c->stdio_filter != current) + stdio = NULL; + return stdio; +} + #define BKEY_PADDED_ONSTACK(key, pad) \ struct { struct bkey_i key; __u64 key ## _pad[pad]; } diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index fe78e87603fc..0668b682a21c 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -307,6 +307,13 @@ struct bkey_i { struct bch_val v; }; +#define POS_KEY(_pos) \ +((struct bkey) { \ + .u64s = BKEY_U64s, \ + .format = KEY_FORMAT_CURRENT, \ + .p = _pos, \ +}) + #define KEY(_inode, _offset, _size) \ ((struct bkey) { \ .u64s = BKEY_U64s, \ @@ -410,600 +417,12 @@ struct bch_set { struct bch_val v; }; -/* Extents */ - -/* - * In extent bkeys, the value is a list of pointers (bch_extent_ptr), optionally - * preceded by checksum/compression information (bch_extent_crc32 or - * bch_extent_crc64). - * - * One major determining factor in the format of extents is how we handle and - * represent extents that have been partially overwritten and thus trimmed: - * - * If an extent is not checksummed or compressed, when the extent is trimmed we - * don't have to remember the extent we originally allocated and wrote: we can - * merely adjust ptr->offset to point to the start of the data that is currently - * live. The size field in struct bkey records the current (live) size of the - * extent, and is also used to mean "size of region on disk that we point to" in - * this case. - * - * Thus an extent that is not checksummed or compressed will consist only of a - * list of bch_extent_ptrs, with none of the fields in - * bch_extent_crc32/bch_extent_crc64. - * - * When an extent is checksummed or compressed, it's not possible to read only - * the data that is currently live: we have to read the entire extent that was - * originally written, and then return only the part of the extent that is - * currently live. - * - * Thus, in addition to the current size of the extent in struct bkey, we need - * to store the size of the originally allocated space - this is the - * compressed_size and uncompressed_size fields in bch_extent_crc32/64. Also, - * when the extent is trimmed, instead of modifying the offset field of the - * pointer, we keep a second smaller offset field - "offset into the original - * extent of the currently live region". - * - * The other major determining factor is replication and data migration: - * - * Each pointer may have its own bch_extent_crc32/64. When doing a replicated - * write, we will initially write all the replicas in the same format, with the - * same checksum type and compression format - however, when copygc runs later (or - * tiering/cache promotion, anything that moves data), it is not in general - * going to rewrite all the pointers at once - one of the replicas may be in a - * bucket on one device that has very little fragmentation while another lives - * in a bucket that has become heavily fragmented, and thus is being rewritten - * sooner than the rest. - * - * Thus it will only move a subset of the pointers (or in the case of - * tiering/cache promotion perhaps add a single pointer without dropping any - * current pointers), and if the extent has been partially overwritten it must - * write only the currently live portion (or copygc would not be able to reduce - * fragmentation!) - which necessitates a different bch_extent_crc format for - * the new pointer. - * - * But in the interests of space efficiency, we don't want to store one - * bch_extent_crc for each pointer if we don't have to. - * - * Thus, a bch_extent consists of bch_extent_crc32s, bch_extent_crc64s, and - * bch_extent_ptrs appended arbitrarily one after the other. We determine the - * type of a given entry with a scheme similar to utf8 (except we're encoding a - * type, not a size), encoding the type in the position of the first set bit: - * - * bch_extent_crc32 - 0b1 - * bch_extent_ptr - 0b10 - * bch_extent_crc64 - 0b100 - * - * We do it this way because bch_extent_crc32 is _very_ constrained on bits (and - * bch_extent_crc64 is the least constrained). - * - * Then, each bch_extent_crc32/64 applies to the pointers that follow after it, - * until the next bch_extent_crc32/64. - * - * If there are no bch_extent_crcs preceding a bch_extent_ptr, then that pointer - * is neither checksummed nor compressed. - */ - /* 128 bits, sufficient for cryptographic MACs: */ struct bch_csum { __le64 lo; __le64 hi; } __packed __aligned(8); -#define BCH_EXTENT_ENTRY_TYPES() \ - x(ptr, 0) \ - x(crc32, 1) \ - x(crc64, 2) \ - x(crc128, 3) \ - x(stripe_ptr, 4) \ - x(rebalance, 5) -#define BCH_EXTENT_ENTRY_MAX 6 - -enum bch_extent_entry_type { -#define x(f, n) BCH_EXTENT_ENTRY_##f = n, - BCH_EXTENT_ENTRY_TYPES() -#undef x -}; - -/* Compressed/uncompressed size are stored biased by 1: */ -struct bch_extent_crc32 { -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u32 type:2, - _compressed_size:7, - _uncompressed_size:7, - offset:7, - _unused:1, - csum_type:4, - compression_type:4; - __u32 csum; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u32 csum; - __u32 compression_type:4, - csum_type:4, - _unused:1, - offset:7, - _uncompressed_size:7, - _compressed_size:7, - type:2; -#endif -} __packed __aligned(8); - -#define CRC32_SIZE_MAX (1U << 7) -#define CRC32_NONCE_MAX 0 - -struct bch_extent_crc64 { -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u64 type:3, - _compressed_size:9, - _uncompressed_size:9, - offset:9, - nonce:10, - csum_type:4, - compression_type:4, - csum_hi:16; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u64 csum_hi:16, - compression_type:4, - csum_type:4, - nonce:10, - offset:9, - _uncompressed_size:9, - _compressed_size:9, - type:3; -#endif - __u64 csum_lo; -} __packed __aligned(8); - -#define CRC64_SIZE_MAX (1U << 9) -#define CRC64_NONCE_MAX ((1U << 10) - 1) - -struct bch_extent_crc128 { -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u64 type:4, - _compressed_size:13, - _uncompressed_size:13, - offset:13, - nonce:13, - csum_type:4, - compression_type:4; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u64 compression_type:4, - csum_type:4, - nonce:13, - offset:13, - _uncompressed_size:13, - _compressed_size:13, - type:4; -#endif - struct bch_csum csum; -} __packed __aligned(8); - -#define CRC128_SIZE_MAX (1U << 13) -#define CRC128_NONCE_MAX ((1U << 13) - 1) - -/* - * @reservation - pointer hasn't been written to, just reserved - */ -struct bch_extent_ptr { -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u64 type:1, - cached:1, - unused:1, - unwritten:1, - offset:44, /* 8 petabytes */ - dev:8, - gen:8; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u64 gen:8, - dev:8, - offset:44, - unwritten:1, - unused:1, - cached:1, - type:1; -#endif -} __packed __aligned(8); - -struct bch_extent_stripe_ptr { -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u64 type:5, - block:8, - redundancy:4, - idx:47; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u64 idx:47, - redundancy:4, - block:8, - type:5; -#endif -}; - -struct bch_extent_rebalance { -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u64 type:6, - unused:34, - compression:8, /* enum bch_compression_opt */ - target:16; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u64 target:16, - compression:8, - unused:34, - type:6; -#endif -}; - -union bch_extent_entry { -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || __BITS_PER_LONG == 64 - unsigned long type; -#elif __BITS_PER_LONG == 32 - struct { - unsigned long pad; - unsigned long type; - }; -#else -#error edit for your odd byteorder. -#endif - -#define x(f, n) struct bch_extent_##f f; - BCH_EXTENT_ENTRY_TYPES() -#undef x -}; - -struct bch_btree_ptr { - struct bch_val v; - - __u64 _data[0]; - struct bch_extent_ptr start[]; -} __packed __aligned(8); - -struct bch_btree_ptr_v2 { - struct bch_val v; - - __u64 mem_ptr; - __le64 seq; - __le16 sectors_written; - __le16 flags; - struct bpos min_key; - __u64 _data[0]; - struct bch_extent_ptr start[]; -} __packed __aligned(8); - -LE16_BITMASK(BTREE_PTR_RANGE_UPDATED, struct bch_btree_ptr_v2, flags, 0, 1); - -struct bch_extent { - struct bch_val v; - - __u64 _data[0]; - union bch_extent_entry start[]; -} __packed __aligned(8); - -struct bch_reservation { - struct bch_val v; - - __le32 generation; - __u8 nr_replicas; - __u8 pad[3]; -} __packed __aligned(8); - -/* Maximum size (in u64s) a single pointer could be: */ -#define BKEY_EXTENT_PTR_U64s_MAX\ - ((sizeof(struct bch_extent_crc128) + \ - sizeof(struct bch_extent_ptr)) / sizeof(__u64)) - -/* Maximum possible size of an entire extent value: */ -#define BKEY_EXTENT_VAL_U64s_MAX \ - (1 + BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1)) - -/* * Maximum possible size of an entire extent, key + value: */ -#define BKEY_EXTENT_U64s_MAX (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX) - -/* Btree pointers don't carry around checksums: */ -#define BKEY_BTREE_PTR_VAL_U64s_MAX \ - ((sizeof(struct bch_btree_ptr_v2) + \ - sizeof(struct bch_extent_ptr) * BCH_REPLICAS_MAX) / sizeof(__u64)) -#define BKEY_BTREE_PTR_U64s_MAX \ - (BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX) - -/* Inodes */ - -#define BLOCKDEV_INODE_MAX 4096 - -#define BCACHEFS_ROOT_INO 4096 - -struct bch_inode { - struct bch_val v; - - __le64 bi_hash_seed; - __le32 bi_flags; - __le16 bi_mode; - __u8 fields[]; -} __packed __aligned(8); - -struct bch_inode_v2 { - struct bch_val v; - - __le64 bi_journal_seq; - __le64 bi_hash_seed; - __le64 bi_flags; - __le16 bi_mode; - __u8 fields[]; -} __packed __aligned(8); - -struct bch_inode_v3 { - struct bch_val v; - - __le64 bi_journal_seq; - __le64 bi_hash_seed; - __le64 bi_flags; - __le64 bi_sectors; - __le64 bi_size; - __le64 bi_version; - __u8 fields[]; -} __packed __aligned(8); - -#define INODEv3_FIELDS_START_INITIAL 6 -#define INODEv3_FIELDS_START_CUR (offsetof(struct bch_inode_v3, fields) / sizeof(__u64)) - -struct bch_inode_generation { - struct bch_val v; - - __le32 bi_generation; - __le32 pad; -} __packed __aligned(8); - -/* - * bi_subvol and bi_parent_subvol are only set for subvolume roots: - */ - -#define BCH_INODE_FIELDS_v2() \ - x(bi_atime, 96) \ - x(bi_ctime, 96) \ - x(bi_mtime, 96) \ - x(bi_otime, 96) \ - x(bi_size, 64) \ - x(bi_sectors, 64) \ - x(bi_uid, 32) \ - x(bi_gid, 32) \ - x(bi_nlink, 32) \ - x(bi_generation, 32) \ - x(bi_dev, 32) \ - x(bi_data_checksum, 8) \ - x(bi_compression, 8) \ - x(bi_project, 32) \ - x(bi_background_compression, 8) \ - x(bi_data_replicas, 8) \ - x(bi_promote_target, 16) \ - x(bi_foreground_target, 16) \ - x(bi_background_target, 16) \ - x(bi_erasure_code, 16) \ - x(bi_fields_set, 16) \ - x(bi_dir, 64) \ - x(bi_dir_offset, 64) \ - x(bi_subvol, 32) \ - x(bi_parent_subvol, 32) - -#define BCH_INODE_FIELDS_v3() \ - x(bi_atime, 96) \ - x(bi_ctime, 96) \ - x(bi_mtime, 96) \ - x(bi_otime, 96) \ - x(bi_uid, 32) \ - x(bi_gid, 32) \ - x(bi_nlink, 32) \ - x(bi_generation, 32) \ - x(bi_dev, 32) \ - x(bi_data_checksum, 8) \ - x(bi_compression, 8) \ - x(bi_project, 32) \ - x(bi_background_compression, 8) \ - x(bi_data_replicas, 8) \ - x(bi_promote_target, 16) \ - x(bi_foreground_target, 16) \ - x(bi_background_target, 16) \ - x(bi_erasure_code, 16) \ - x(bi_fields_set, 16) \ - x(bi_dir, 64) \ - x(bi_dir_offset, 64) \ - x(bi_subvol, 32) \ - x(bi_parent_subvol, 32) \ - x(bi_nocow, 8) - -/* subset of BCH_INODE_FIELDS */ -#define BCH_INODE_OPTS() \ - x(data_checksum, 8) \ - x(compression, 8) \ - x(project, 32) \ - x(background_compression, 8) \ - x(data_replicas, 8) \ - x(promote_target, 16) \ - x(foreground_target, 16) \ - x(background_target, 16) \ - x(erasure_code, 16) \ - x(nocow, 8) - -enum inode_opt_id { -#define x(name, ...) \ - Inode_opt_##name, - BCH_INODE_OPTS() -#undef x - Inode_opt_nr, -}; - -#define BCH_INODE_FLAGS() \ - x(sync, 0) \ - x(immutable, 1) \ - x(append, 2) \ - x(nodump, 3) \ - x(noatime, 4) \ - x(i_size_dirty, 5) \ - x(i_sectors_dirty, 6) \ - x(unlinked, 7) \ - x(backptr_untrusted, 8) - -/* bits 20+ reserved for packed fields below: */ - -enum bch_inode_flags { -#define x(t, n) BCH_INODE_##t = 1U << n, - BCH_INODE_FLAGS() -#undef x -}; - -enum __bch_inode_flags { -#define x(t, n) __BCH_INODE_##t = n, - BCH_INODE_FLAGS() -#undef x -}; - -LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24); -LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 31); -LE32_BITMASK(INODE_NEW_VARINT, struct bch_inode, bi_flags, 31, 32); - -LE64_BITMASK(INODEv2_STR_HASH, struct bch_inode_v2, bi_flags, 20, 24); -LE64_BITMASK(INODEv2_NR_FIELDS, struct bch_inode_v2, bi_flags, 24, 31); - -LE64_BITMASK(INODEv3_STR_HASH, struct bch_inode_v3, bi_flags, 20, 24); -LE64_BITMASK(INODEv3_NR_FIELDS, struct bch_inode_v3, bi_flags, 24, 31); - -LE64_BITMASK(INODEv3_FIELDS_START, - struct bch_inode_v3, bi_flags, 31, 36); -LE64_BITMASK(INODEv3_MODE, struct bch_inode_v3, bi_flags, 36, 52); - -/* Dirents */ - -/* - * Dirents (and xattrs) have to implement string lookups; since our b-tree - * doesn't support arbitrary length strings for the key, we instead index by a - * 64 bit hash (currently truncated sha1) of the string, stored in the offset - * field of the key - using linear probing to resolve hash collisions. This also - * provides us with the readdir cookie posix requires. - * - * Linear probing requires us to use whiteouts for deletions, in the event of a - * collision: - */ - -struct bch_dirent { - struct bch_val v; - - /* Target inode number: */ - union { - __le64 d_inum; - struct { /* DT_SUBVOL */ - __le32 d_child_subvol; - __le32 d_parent_subvol; - }; - }; - - /* - * Copy of mode bits 12-15 from the target inode - so userspace can get - * the filetype without having to do a stat() - */ - __u8 d_type; - - __u8 d_name[]; -} __packed __aligned(8); - -#define DT_SUBVOL 16 -#define BCH_DT_MAX 17 - -#define BCH_NAME_MAX 512 - -/* Xattrs */ - -#define KEY_TYPE_XATTR_INDEX_USER 0 -#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS 1 -#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT 2 -#define KEY_TYPE_XATTR_INDEX_TRUSTED 3 -#define KEY_TYPE_XATTR_INDEX_SECURITY 4 - -struct bch_xattr { - struct bch_val v; - __u8 x_type; - __u8 x_name_len; - __le16 x_val_len; - __u8 x_name[]; -} __packed __aligned(8); - -/* Bucket/allocation information: */ - -struct bch_alloc { - struct bch_val v; - __u8 fields; - __u8 gen; - __u8 data[]; -} __packed __aligned(8); - -#define BCH_ALLOC_FIELDS_V1() \ - x(read_time, 16) \ - x(write_time, 16) \ - x(data_type, 8) \ - x(dirty_sectors, 16) \ - x(cached_sectors, 16) \ - x(oldest_gen, 8) \ - x(stripe, 32) \ - x(stripe_redundancy, 8) - -enum { -#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name, - BCH_ALLOC_FIELDS_V1() -#undef x -}; - -struct bch_alloc_v2 { - struct bch_val v; - __u8 nr_fields; - __u8 gen; - __u8 oldest_gen; - __u8 data_type; - __u8 data[]; -} __packed __aligned(8); - -#define BCH_ALLOC_FIELDS_V2() \ - x(read_time, 64) \ - x(write_time, 64) \ - x(dirty_sectors, 32) \ - x(cached_sectors, 32) \ - x(stripe, 32) \ - x(stripe_redundancy, 8) - -struct bch_alloc_v3 { - struct bch_val v; - __le64 journal_seq; - __le32 flags; - __u8 nr_fields; - __u8 gen; - __u8 oldest_gen; - __u8 data_type; - __u8 data[]; -} __packed __aligned(8); - -LE32_BITMASK(BCH_ALLOC_V3_NEED_DISCARD,struct bch_alloc_v3, flags, 0, 1) -LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags, 1, 2) - -struct bch_alloc_v4 { - struct bch_val v; - __u64 journal_seq; - __u32 flags; - __u8 gen; - __u8 oldest_gen; - __u8 data_type; - __u8 stripe_redundancy; - __u32 dirty_sectors; - __u32 cached_sectors; - __u64 io_time[2]; - __u32 stripe; - __u32 nr_external_backpointers; - __u64 fragmentation_lru; -} __packed __aligned(8); - -#define BCH_ALLOC_V4_U64s_V0 6 -#define BCH_ALLOC_V4_U64s (sizeof(struct bch_alloc_v4) / sizeof(__u64)) - -BITMASK(BCH_ALLOC_V4_NEED_DISCARD, struct bch_alloc_v4, flags, 0, 1) -BITMASK(BCH_ALLOC_V4_NEED_INC_GEN, struct bch_alloc_v4, flags, 1, 2) -BITMASK(BCH_ALLOC_V4_BACKPOINTERS_START,struct bch_alloc_v4, flags, 2, 8) -BITMASK(BCH_ALLOC_V4_NR_BACKPOINTERS, struct bch_alloc_v4, flags, 8, 14) - -#define BCH_ALLOC_V4_NR_BACKPOINTERS_MAX 40 - struct bch_backpointer { struct bch_val v; __u8 btree_id; @@ -1014,154 +433,6 @@ struct bch_backpointer { struct bpos pos; } __packed __aligned(8); -#define KEY_TYPE_BUCKET_GENS_BITS 8 -#define KEY_TYPE_BUCKET_GENS_NR (1U << KEY_TYPE_BUCKET_GENS_BITS) -#define KEY_TYPE_BUCKET_GENS_MASK (KEY_TYPE_BUCKET_GENS_NR - 1) - -struct bch_bucket_gens { - struct bch_val v; - u8 gens[KEY_TYPE_BUCKET_GENS_NR]; -} __packed __aligned(8); - -/* Quotas: */ - -enum quota_types { - QTYP_USR = 0, - QTYP_GRP = 1, - QTYP_PRJ = 2, - QTYP_NR = 3, -}; - -enum quota_counters { - Q_SPC = 0, - Q_INO = 1, - Q_COUNTERS = 2, -}; - -struct bch_quota_counter { - __le64 hardlimit; - __le64 softlimit; -}; - -struct bch_quota { - struct bch_val v; - struct bch_quota_counter c[Q_COUNTERS]; -} __packed __aligned(8); - -/* Erasure coding */ - -struct bch_stripe { - struct bch_val v; - __le16 sectors; - __u8 algorithm; - __u8 nr_blocks; - __u8 nr_redundant; - - __u8 csum_granularity_bits; - __u8 csum_type; - __u8 pad; - - struct bch_extent_ptr ptrs[]; -} __packed __aligned(8); - -/* Reflink: */ - -struct bch_reflink_p { - struct bch_val v; - __le64 idx; - /* - * A reflink pointer might point to an indirect extent which is then - * later split (by copygc or rebalance). If we only pointed to part of - * the original indirect extent, and then one of the fragments is - * outside the range we point to, we'd leak a refcount: so when creating - * reflink pointers, we need to store pad values to remember the full - * range we were taking a reference on. - */ - __le32 front_pad; - __le32 back_pad; -} __packed __aligned(8); - -struct bch_reflink_v { - struct bch_val v; - __le64 refcount; - union bch_extent_entry start[0]; - __u64 _data[]; -} __packed __aligned(8); - -struct bch_indirect_inline_data { - struct bch_val v; - __le64 refcount; - u8 data[]; -}; - -/* Inline data */ - -struct bch_inline_data { - struct bch_val v; - u8 data[]; -}; - -/* Subvolumes: */ - -#define SUBVOL_POS_MIN POS(0, 1) -#define SUBVOL_POS_MAX POS(0, S32_MAX) -#define BCACHEFS_ROOT_SUBVOL 1 - -struct bch_subvolume { - struct bch_val v; - __le32 flags; - __le32 snapshot; - __le64 inode; - /* - * Snapshot subvolumes form a tree, separate from the snapshot nodes - * tree - if this subvolume is a snapshot, this is the ID of the - * subvolume it was created from: - */ - __le32 parent; - __le32 pad; - bch_le128 otime; -}; - -LE32_BITMASK(BCH_SUBVOLUME_RO, struct bch_subvolume, flags, 0, 1) -/* - * We need to know whether a subvolume is a snapshot so we can know whether we - * can delete it (or whether it should just be rm -rf'd) - */ -LE32_BITMASK(BCH_SUBVOLUME_SNAP, struct bch_subvolume, flags, 1, 2) -LE32_BITMASK(BCH_SUBVOLUME_UNLINKED, struct bch_subvolume, flags, 2, 3) - -/* Snapshots */ - -struct bch_snapshot { - struct bch_val v; - __le32 flags; - __le32 parent; - __le32 children[2]; - __le32 subvol; - /* corresponds to a bch_snapshot_tree in BTREE_ID_snapshot_trees */ - __le32 tree; - __le32 depth; - __le32 skip[3]; -}; - -LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 0, 1) - -/* True if a subvolume points to this snapshot node: */ -LE32_BITMASK(BCH_SNAPSHOT_SUBVOL, struct bch_snapshot, flags, 1, 2) - -/* - * Snapshot trees: - * - * The snapshot_trees btree gives us persistent indentifier for each tree of - * bch_snapshot nodes, and allow us to record and easily find the root/master - * subvolume that other snapshots were created from: - */ -struct bch_snapshot_tree { - struct bch_val v; - __le32 master_subvol; - __le32 root_snapshot; -}; - /* LRU btree: */ struct bch_lru { @@ -1171,33 +442,6 @@ struct bch_lru { #define LRU_ID_STRIPES (1U << 16) -/* Logged operations btree: */ - -struct bch_logged_op_truncate { - struct bch_val v; - __le32 subvol; - __le32 pad; - __le64 inum; - __le64 new_i_size; -}; - -enum logged_op_finsert_state { - LOGGED_OP_FINSERT_start, - LOGGED_OP_FINSERT_shift_extents, - LOGGED_OP_FINSERT_finish, -}; - -struct bch_logged_op_finsert { - struct bch_val v; - __u8 state; - __u8 pad[3]; - __le32 subvol; - __le64 inum; - __le64 dst_offset; - __le64 src_offset; - __le64 pos; -}; - /* Optional/variable size superblock sections: */ struct bch_sb_field { @@ -1223,6 +467,19 @@ struct bch_sb_field { x(ext, 13) \ x(downgrade, 14) +#include "alloc_background_format.h" +#include "extents_format.h" +#include "reflink_format.h" +#include "ec_format.h" +#include "inode_format.h" +#include "dirent_format.h" +#include "xattr_format.h" +#include "quota_format.h" +#include "logged_ops_format.h" +#include "snapshot_format.h" +#include "subvolume_format.h" +#include "sb-counters_format.h" + enum bch_sb_field_type { #define x(f, nr) BCH_SB_FIELD_##f = nr, BCH_SB_FIELDS() @@ -1296,6 +553,7 @@ struct bch_member { __le64 errors[BCH_MEMBER_ERROR_NR]; __le64 errors_at_reset[BCH_MEMBER_ERROR_NR]; __le64 errors_reset_time; + __le64 seq; }; #define BCH_MEMBER_V1_BYTES 56 @@ -1442,7 +700,7 @@ struct bch_sb_field_replicas_v0 { struct bch_replicas_entry_v0 entries[]; } __packed __aligned(8); -struct bch_replicas_entry { +struct bch_replicas_entry_v1 { __u8 data_type; __u8 nr_devs; __u8 nr_required; @@ -1454,24 +712,7 @@ struct bch_replicas_entry { struct bch_sb_field_replicas { struct bch_sb_field field; - struct bch_replicas_entry entries[]; -} __packed __aligned(8); - -/* BCH_SB_FIELD_quota: */ - -struct bch_sb_quota_counter { - __le32 timelimit; - __le32 warnlimit; -}; - -struct bch_sb_quota_type { - __le64 flags; - struct bch_sb_quota_counter c[Q_COUNTERS]; -}; - -struct bch_sb_field_quota { - struct bch_sb_field field; - struct bch_sb_quota_type q[QTYP_NR]; + struct bch_replicas_entry_v1 entries[]; } __packed __aligned(8); /* BCH_SB_FIELD_disk_groups: */ @@ -1492,99 +733,6 @@ struct bch_sb_field_disk_groups { struct bch_disk_group entries[]; } __packed __aligned(8); -/* BCH_SB_FIELD_counters */ - -#define BCH_PERSISTENT_COUNTERS() \ - x(io_read, 0) \ - x(io_write, 1) \ - x(io_move, 2) \ - x(bucket_invalidate, 3) \ - x(bucket_discard, 4) \ - x(bucket_alloc, 5) \ - x(bucket_alloc_fail, 6) \ - x(btree_cache_scan, 7) \ - x(btree_cache_reap, 8) \ - x(btree_cache_cannibalize, 9) \ - x(btree_cache_cannibalize_lock, 10) \ - x(btree_cache_cannibalize_lock_fail, 11) \ - x(btree_cache_cannibalize_unlock, 12) \ - x(btree_node_write, 13) \ - x(btree_node_read, 14) \ - x(btree_node_compact, 15) \ - x(btree_node_merge, 16) \ - x(btree_node_split, 17) \ - x(btree_node_rewrite, 18) \ - x(btree_node_alloc, 19) \ - x(btree_node_free, 20) \ - x(btree_node_set_root, 21) \ - x(btree_path_relock_fail, 22) \ - x(btree_path_upgrade_fail, 23) \ - x(btree_reserve_get_fail, 24) \ - x(journal_entry_full, 25) \ - x(journal_full, 26) \ - x(journal_reclaim_finish, 27) \ - x(journal_reclaim_start, 28) \ - x(journal_write, 29) \ - x(read_promote, 30) \ - x(read_bounce, 31) \ - x(read_split, 33) \ - x(read_retry, 32) \ - x(read_reuse_race, 34) \ - x(move_extent_read, 35) \ - x(move_extent_write, 36) \ - x(move_extent_finish, 37) \ - x(move_extent_fail, 38) \ - x(move_extent_start_fail, 39) \ - x(copygc, 40) \ - x(copygc_wait, 41) \ - x(gc_gens_end, 42) \ - x(gc_gens_start, 43) \ - x(trans_blocked_journal_reclaim, 44) \ - x(trans_restart_btree_node_reused, 45) \ - x(trans_restart_btree_node_split, 46) \ - x(trans_restart_fault_inject, 47) \ - x(trans_restart_iter_upgrade, 48) \ - x(trans_restart_journal_preres_get, 49) \ - x(trans_restart_journal_reclaim, 50) \ - x(trans_restart_journal_res_get, 51) \ - x(trans_restart_key_cache_key_realloced, 52) \ - x(trans_restart_key_cache_raced, 53) \ - x(trans_restart_mark_replicas, 54) \ - x(trans_restart_mem_realloced, 55) \ - x(trans_restart_memory_allocation_failure, 56) \ - x(trans_restart_relock, 57) \ - x(trans_restart_relock_after_fill, 58) \ - x(trans_restart_relock_key_cache_fill, 59) \ - x(trans_restart_relock_next_node, 60) \ - x(trans_restart_relock_parent_for_fill, 61) \ - x(trans_restart_relock_path, 62) \ - x(trans_restart_relock_path_intent, 63) \ - x(trans_restart_too_many_iters, 64) \ - x(trans_restart_traverse, 65) \ - x(trans_restart_upgrade, 66) \ - x(trans_restart_would_deadlock, 67) \ - x(trans_restart_would_deadlock_write, 68) \ - x(trans_restart_injected, 69) \ - x(trans_restart_key_cache_upgrade, 70) \ - x(trans_traverse_all, 71) \ - x(transaction_commit, 72) \ - x(write_super, 73) \ - x(trans_restart_would_deadlock_recursion_limit, 74) \ - x(trans_restart_write_buffer_flush, 75) \ - x(trans_restart_split_race, 76) - -enum bch_persistent_counters { -#define x(t, n, ...) BCH_COUNTER_##t, - BCH_PERSISTENT_COUNTERS() -#undef x - BCH_COUNTER_NR -}; - -struct bch_sb_field_counters { - struct bch_sb_field field; - __le64 d[]; -}; - /* * On clean shutdown, store btree roots and current journal sequence number in * the superblock: @@ -1662,69 +810,41 @@ struct bch_sb_field_downgrade { #define BCH_VERSION_MINOR(_v) ((__u16) ((_v) & ~(~0U << 10))) #define BCH_VERSION(_major, _minor) (((_major) << 10)|(_minor) << 0) -#define RECOVERY_PASS_ALL_FSCK (1ULL << 63) - /* * field 1: version name * field 2: BCH_VERSION(major, minor) * field 3: recovery passess required on upgrade */ #define BCH_METADATA_VERSIONS() \ - x(bkey_renumber, BCH_VERSION(0, 10), \ - RECOVERY_PASS_ALL_FSCK) \ - x(inode_btree_change, BCH_VERSION(0, 11), \ - RECOVERY_PASS_ALL_FSCK) \ - x(snapshot, BCH_VERSION(0, 12), \ - RECOVERY_PASS_ALL_FSCK) \ - x(inode_backpointers, BCH_VERSION(0, 13), \ - RECOVERY_PASS_ALL_FSCK) \ - x(btree_ptr_sectors_written, BCH_VERSION(0, 14), \ - RECOVERY_PASS_ALL_FSCK) \ - x(snapshot_2, BCH_VERSION(0, 15), \ - BIT_ULL(BCH_RECOVERY_PASS_fs_upgrade_for_subvolumes)| \ - BIT_ULL(BCH_RECOVERY_PASS_initialize_subvolumes)| \ - RECOVERY_PASS_ALL_FSCK) \ - x(reflink_p_fix, BCH_VERSION(0, 16), \ - BIT_ULL(BCH_RECOVERY_PASS_fix_reflink_p)) \ - x(subvol_dirent, BCH_VERSION(0, 17), \ - RECOVERY_PASS_ALL_FSCK) \ - x(inode_v2, BCH_VERSION(0, 18), \ - RECOVERY_PASS_ALL_FSCK) \ - x(freespace, BCH_VERSION(0, 19), \ - RECOVERY_PASS_ALL_FSCK) \ - x(alloc_v4, BCH_VERSION(0, 20), \ - RECOVERY_PASS_ALL_FSCK) \ - x(new_data_types, BCH_VERSION(0, 21), \ - RECOVERY_PASS_ALL_FSCK) \ - x(backpointers, BCH_VERSION(0, 22), \ - RECOVERY_PASS_ALL_FSCK) \ - x(inode_v3, BCH_VERSION(0, 23), \ - RECOVERY_PASS_ALL_FSCK) \ - x(unwritten_extents, BCH_VERSION(0, 24), \ - RECOVERY_PASS_ALL_FSCK) \ - x(bucket_gens, BCH_VERSION(0, 25), \ - BIT_ULL(BCH_RECOVERY_PASS_bucket_gens_init)| \ - RECOVERY_PASS_ALL_FSCK) \ - x(lru_v2, BCH_VERSION(0, 26), \ - RECOVERY_PASS_ALL_FSCK) \ - x(fragmentation_lru, BCH_VERSION(0, 27), \ - RECOVERY_PASS_ALL_FSCK) \ - x(no_bps_in_alloc_keys, BCH_VERSION(0, 28), \ - RECOVERY_PASS_ALL_FSCK) \ - x(snapshot_trees, BCH_VERSION(0, 29), \ - RECOVERY_PASS_ALL_FSCK) \ - x(major_minor, BCH_VERSION(1, 0), \ - 0) \ - x(snapshot_skiplists, BCH_VERSION(1, 1), \ - BIT_ULL(BCH_RECOVERY_PASS_check_snapshots)) \ - x(deleted_inodes, BCH_VERSION(1, 2), \ - BIT_ULL(BCH_RECOVERY_PASS_check_inodes)) \ - x(rebalance_work, BCH_VERSION(1, 3), \ - BIT_ULL(BCH_RECOVERY_PASS_set_fs_needs_rebalance)) + x(bkey_renumber, BCH_VERSION(0, 10)) \ + x(inode_btree_change, BCH_VERSION(0, 11)) \ + x(snapshot, BCH_VERSION(0, 12)) \ + x(inode_backpointers, BCH_VERSION(0, 13)) \ + x(btree_ptr_sectors_written, BCH_VERSION(0, 14)) \ + x(snapshot_2, BCH_VERSION(0, 15)) \ + x(reflink_p_fix, BCH_VERSION(0, 16)) \ + x(subvol_dirent, BCH_VERSION(0, 17)) \ + x(inode_v2, BCH_VERSION(0, 18)) \ + x(freespace, BCH_VERSION(0, 19)) \ + x(alloc_v4, BCH_VERSION(0, 20)) \ + x(new_data_types, BCH_VERSION(0, 21)) \ + x(backpointers, BCH_VERSION(0, 22)) \ + x(inode_v3, BCH_VERSION(0, 23)) \ + x(unwritten_extents, BCH_VERSION(0, 24)) \ + x(bucket_gens, BCH_VERSION(0, 25)) \ + x(lru_v2, BCH_VERSION(0, 26)) \ + x(fragmentation_lru, BCH_VERSION(0, 27)) \ + x(no_bps_in_alloc_keys, BCH_VERSION(0, 28)) \ + x(snapshot_trees, BCH_VERSION(0, 29)) \ + x(major_minor, BCH_VERSION(1, 0)) \ + x(snapshot_skiplists, BCH_VERSION(1, 1)) \ + x(deleted_inodes, BCH_VERSION(1, 2)) \ + x(rebalance_work, BCH_VERSION(1, 3)) \ + x(member_seq, BCH_VERSION(1, 4)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, -#define x(t, n, upgrade_passes) bcachefs_metadata_version_##t = n, +#define x(t, n) bcachefs_metadata_version_##t = n, BCH_METADATA_VERSIONS() #undef x bcachefs_metadata_version_max @@ -1786,7 +906,8 @@ struct bch_sb { __le32 time_base_hi; __le32 time_precision; - __le64 flags[8]; + __le64 flags[7]; + __le64 write_time; __le64 features[2]; __le64 compat[2]; @@ -2153,7 +1274,8 @@ static inline __u64 __bset_magic(struct bch_sb *sb) x(clock, 7) \ x(dev_usage, 8) \ x(log, 9) \ - x(overwrite, 10) + x(overwrite, 10) \ + x(write_buffer_keys, 11) enum { #define x(f, nr) BCH_JSET_ENTRY_##f = nr, @@ -2162,6 +1284,19 @@ enum { BCH_JSET_ENTRY_NR }; +static inline bool jset_entry_is_key(struct jset_entry *e) +{ + switch (e->type) { + case BCH_JSET_ENTRY_btree_keys: + case BCH_JSET_ENTRY_btree_root: + case BCH_JSET_ENTRY_overwrite: + case BCH_JSET_ENTRY_write_buffer_keys: + return true; + } + + return false; +} + /* * Journal sequence numbers can be blacklisted: bsets record the max sequence * number of all the journal entries they contain updates for, so that on @@ -2203,7 +1338,7 @@ struct jset_entry_usage { struct jset_entry_data_usage { struct jset_entry entry; __le64 v; - struct bch_replicas_entry r; + struct bch_replicas_entry_v1 r; } __packed; struct jset_entry_clock { @@ -2224,8 +1359,8 @@ struct jset_entry_dev_usage { __le32 dev; __u32 pad; - __le64 buckets_ec; - __le64 _buckets_unavailable; /* No longer used */ + __le64 _buckets_ec; /* No longer used */ + __le64 _buckets_unavailable; /* No longer used */ struct jset_entry_dev_usage_type d[]; }; @@ -2239,7 +1374,7 @@ static inline unsigned jset_entry_dev_usage_nr_types(struct jset_entry_dev_usage struct jset_entry_log { struct jset_entry entry; u8 d[]; -} __packed; +} __packed __aligned(8); /* * On disk format for a journal entry: diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h index f05881f7e113..4b8fba754b1c 100644 --- a/fs/bcachefs/bcachefs_ioctl.h +++ b/fs/bcachefs/bcachefs_ioctl.h @@ -81,6 +81,11 @@ struct bch_ioctl_incremental { #define BCH_IOCTL_SUBVOLUME_CREATE _IOW(0xbc, 16, struct bch_ioctl_subvolume) #define BCH_IOCTL_SUBVOLUME_DESTROY _IOW(0xbc, 17, struct bch_ioctl_subvolume) +#define BCH_IOCTL_DEV_USAGE_V2 _IOWR(0xbc, 18, struct bch_ioctl_dev_usage_v2) + +#define BCH_IOCTL_FSCK_OFFLINE _IOW(0xbc, 19, struct bch_ioctl_fsck_offline) +#define BCH_IOCTL_FSCK_ONLINE _IOW(0xbc, 20, struct bch_ioctl_fsck_online) + /* ioctl below act on a particular file, not the filesystem as a whole: */ #define BCHFS_IOC_REINHERIT_ATTRS _IOR(0xbc, 64, const char __user *) @@ -173,12 +178,18 @@ struct bch_ioctl_disk_set_state { __u64 dev; }; +#define BCH_DATA_OPS() \ + x(scrub, 0) \ + x(rereplicate, 1) \ + x(migrate, 2) \ + x(rewrite_old_nodes, 3) \ + x(drop_extra_replicas, 4) + enum bch_data_ops { - BCH_DATA_OP_SCRUB = 0, - BCH_DATA_OP_REREPLICATE = 1, - BCH_DATA_OP_MIGRATE = 2, - BCH_DATA_OP_REWRITE_OLD_NODES = 3, - BCH_DATA_OP_NR = 4, +#define x(t, n) BCH_DATA_OP_##t = n, + BCH_DATA_OPS() +#undef x + BCH_DATA_OP_NR }; /* @@ -237,7 +248,7 @@ struct bch_ioctl_data_event { struct bch_replicas_usage { __u64 sectors; - struct bch_replicas_entry r; + struct bch_replicas_entry_v1 r; } __packed; static inline struct bch_replicas_usage * @@ -268,7 +279,7 @@ struct bch_ioctl_fs_usage { __u32 replica_entries_bytes; __u32 pad; - struct bch_replicas_usage replicas[0]; + struct bch_replicas_usage replicas[]; }; /* @@ -292,7 +303,20 @@ struct bch_ioctl_dev_usage { __u64 buckets; __u64 sectors; __u64 fragmented; - } d[BCH_DATA_NR]; + } d[10]; +}; + +struct bch_ioctl_dev_usage_v2 { + __u64 dev; + __u32 flags; + __u8 state; + __u8 nr_data_types; + __u8 pad[6]; + + __u32 bucket_size; + __u64 nr_buckets; + + struct bch_ioctl_dev_usage_type d[]; }; /* @@ -365,4 +389,24 @@ struct bch_ioctl_subvolume { #define BCH_SUBVOL_SNAPSHOT_CREATE (1U << 0) #define BCH_SUBVOL_SNAPSHOT_RO (1U << 1) +/* + * BCH_IOCTL_FSCK_OFFLINE: run fsck from the 'bcachefs fsck' userspace command, + * but with the kernel's implementation of fsck: + */ +struct bch_ioctl_fsck_offline { + __u64 flags; + __u64 opts; /* string */ + __u64 nr_devs; + __u64 devs[] __counted_by(nr_devs); +}; + +/* + * BCH_IOCTL_FSCK_ONLINE: run fsck from the 'bcachefs fsck' userspace command, + * but with the kernel's implementation of fsck: + */ +struct bch_ioctl_fsck_online { + __u64 flags; + __u64 opts; /* string */ +}; + #endif /* _BCACHEFS_IOCTL_H */ diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c index abdb05507d16..76e79a15ba08 100644 --- a/fs/bcachefs/bkey.c +++ b/fs/bcachefs/bkey.c @@ -33,7 +33,7 @@ void bch2_bkey_packed_to_binary_text(struct printbuf *out, next_key_bits -= 64; } - bch2_prt_u64_binary(out, v, min(word_bits, nr_key_bits)); + bch2_prt_u64_base2_nbits(out, v, min(word_bits, nr_key_bits)); if (!next_key_bits) break; diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 761f5e33b1e6..5e52684764eb 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -63,8 +63,17 @@ static int key_type_cookie_invalid(struct bch_fs *c, struct bkey_s_c k, return 0; } +static void key_type_cookie_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_s_c_cookie ck = bkey_s_c_to_cookie(k); + + prt_printf(out, "%llu", le64_to_cpu(ck.v->cookie)); +} + #define bch2_bkey_ops_cookie ((struct bkey_ops) { \ .key_invalid = key_type_cookie_invalid, \ + .val_to_text = key_type_cookie_to_text, \ .min_val_size = 8, \ }) diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h index 3a370b7087ac..03efe8ee565a 100644 --- a/fs/bcachefs/bkey_methods.h +++ b/fs/bcachefs/bkey_methods.h @@ -28,10 +28,8 @@ struct bkey_ops { void (*swab)(struct bkey_s); bool (*key_normalize)(struct bch_fs *, struct bkey_s); bool (*key_merge)(struct bch_fs *, struct bkey_s, struct bkey_s_c); - int (*trans_trigger)(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_i *, unsigned); - int (*atomic_trigger)(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s_c, unsigned); + int (*trigger)(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_s, unsigned); void (*compat)(enum btree_id id, unsigned version, unsigned big_endian, int write, struct bkey_s); @@ -78,84 +76,88 @@ static inline bool bch2_bkey_maybe_mergable(const struct bkey *l, const struct b bool bch2_bkey_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); -static inline int bch2_mark_key(struct btree_trans *trans, - enum btree_id btree, unsigned level, - struct bkey_s_c old, struct bkey_s_c new, - unsigned flags) -{ - const struct bkey_ops *ops = bch2_bkey_type_ops(old.k->type ?: new.k->type); - - return ops->atomic_trigger - ? ops->atomic_trigger(trans, btree, level, old, new, flags) - : 0; -} - enum btree_update_flags { __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE = __BTREE_ITER_FLAGS_END, __BTREE_UPDATE_NOJOURNAL, - __BTREE_UPDATE_PREJOURNAL, __BTREE_UPDATE_KEY_CACHE_RECLAIM, - __BTREE_TRIGGER_NORUN, /* Don't run triggers at all */ - + __BTREE_TRIGGER_NORUN, + __BTREE_TRIGGER_TRANSACTIONAL, + __BTREE_TRIGGER_ATOMIC, + __BTREE_TRIGGER_GC, __BTREE_TRIGGER_INSERT, __BTREE_TRIGGER_OVERWRITE, - - __BTREE_TRIGGER_GC, __BTREE_TRIGGER_BUCKET_INVALIDATE, - __BTREE_TRIGGER_NOATOMIC, }; #define BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE (1U << __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) #define BTREE_UPDATE_NOJOURNAL (1U << __BTREE_UPDATE_NOJOURNAL) -#define BTREE_UPDATE_PREJOURNAL (1U << __BTREE_UPDATE_PREJOURNAL) #define BTREE_UPDATE_KEY_CACHE_RECLAIM (1U << __BTREE_UPDATE_KEY_CACHE_RECLAIM) +/* Don't run triggers at all */ #define BTREE_TRIGGER_NORUN (1U << __BTREE_TRIGGER_NORUN) +/* + * If set, we're running transactional triggers as part of a transaction commit: + * triggers may generate new updates + * + * If cleared, and either BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE are set, + * we're running atomic triggers during a transaction commit: we have our + * journal reservation, we're holding btree node write locks, and we know the + * transaction is going to commit (returning an error here is a fatal error, + * causing us to go emergency read-only) + */ +#define BTREE_TRIGGER_TRANSACTIONAL (1U << __BTREE_TRIGGER_TRANSACTIONAL) +#define BTREE_TRIGGER_ATOMIC (1U << __BTREE_TRIGGER_ATOMIC) + +/* We're in gc/fsck: running triggers to recalculate e.g. disk usage */ +#define BTREE_TRIGGER_GC (1U << __BTREE_TRIGGER_GC) + +/* @new is entering the btree */ #define BTREE_TRIGGER_INSERT (1U << __BTREE_TRIGGER_INSERT) + +/* @old is leaving the btree */ #define BTREE_TRIGGER_OVERWRITE (1U << __BTREE_TRIGGER_OVERWRITE) -#define BTREE_TRIGGER_GC (1U << __BTREE_TRIGGER_GC) +/* signal from bucket invalidate path to alloc trigger */ #define BTREE_TRIGGER_BUCKET_INVALIDATE (1U << __BTREE_TRIGGER_BUCKET_INVALIDATE) -#define BTREE_TRIGGER_NOATOMIC (1U << __BTREE_TRIGGER_NOATOMIC) -static inline int bch2_trans_mark_key(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, struct bkey_i *new, - unsigned flags) +static inline int bch2_key_trigger(struct btree_trans *trans, + enum btree_id btree, unsigned level, + struct bkey_s_c old, struct bkey_s new, + unsigned flags) { - const struct bkey_ops *ops = bch2_bkey_type_ops(old.k->type ?: new->k.type); + const struct bkey_ops *ops = bch2_bkey_type_ops(old.k->type ?: new.k->type); - return ops->trans_trigger - ? ops->trans_trigger(trans, btree_id, level, old, new, flags) + return ops->trigger + ? ops->trigger(trans, btree, level, old, new, flags) : 0; } -static inline int bch2_trans_mark_old(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, unsigned flags) +static inline int bch2_key_trigger_old(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, unsigned flags) { struct bkey_i deleted; bkey_init(&deleted.k); deleted.k.p = old.k->p; - return bch2_trans_mark_key(trans, btree_id, level, old, &deleted, - BTREE_TRIGGER_OVERWRITE|flags); + return bch2_key_trigger(trans, btree_id, level, old, bkey_i_to_s(&deleted), + BTREE_TRIGGER_OVERWRITE|flags); } -static inline int bch2_trans_mark_new(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_i *new, unsigned flags) +static inline int bch2_key_trigger_new(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s new, unsigned flags) { struct bkey_i deleted; bkey_init(&deleted.k); - deleted.k.p = new->k.p; + deleted.k.p = new.k->p; - return bch2_trans_mark_key(trans, btree_id, level, bkey_i_to_s_c(&deleted), new, - BTREE_TRIGGER_INSERT|flags); + return bch2_key_trigger(trans, btree_id, level, bkey_i_to_s_c(&deleted), new, + BTREE_TRIGGER_INSERT|flags); } void bch2_bkey_renumber(enum btree_node_type, struct bkey_packed *, int); diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index bb73ba9017b0..3fd1085b6c61 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -68,6 +68,12 @@ void bch2_dump_bset(struct bch_fs *c, struct btree *b, _k = _n) { _n = bkey_p_next(_k); + if (!_k->u64s) { + printk(KERN_ERR "block %u key %5zu - u64s 0? aieee!\n", set, + _k->_data - i->_data); + break; + } + k = bkey_disassemble(b, _k, &uk); printbuf_reset(&buf); @@ -714,7 +720,7 @@ static noinline void __build_ro_aux_tree(struct btree *b, struct bset_tree *t) { struct bkey_packed *prev = NULL, *k = btree_bkey_first(b, t); struct bkey_i min_key, max_key; - unsigned j, cacheline = 1; + unsigned cacheline = 1; t->size = min(bkey_to_cacheline(b, t, btree_bkey_last(b, t)), bset_ro_tree_capacity(b, t)); @@ -817,13 +823,12 @@ void bch2_bset_init_first(struct btree *b, struct bset *i) set_btree_bset(b, t, i); } -void bch2_bset_init_next(struct bch_fs *c, struct btree *b, - struct btree_node_entry *bne) +void bch2_bset_init_next(struct btree *b, struct btree_node_entry *bne) { struct bset *i = &bne->keys; struct bset_tree *t; - BUG_ON(bset_byte_offset(b, bne) >= btree_bytes(c)); + BUG_ON(bset_byte_offset(b, bne) >= btree_buf_bytes(b)); BUG_ON((void *) bne < (void *) btree_bkey_last(b, bset_tree_last(b))); BUG_ON(b->nsets >= MAX_BSETS); diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h index 632c2b8c5460..79c77baaa383 100644 --- a/fs/bcachefs/bset.h +++ b/fs/bcachefs/bset.h @@ -264,8 +264,7 @@ static inline struct bset *bset_next_set(struct btree *b, void bch2_btree_keys_init(struct btree *); void bch2_bset_init_first(struct btree *, struct bset *); -void bch2_bset_init_next(struct bch_fs *, struct btree *, - struct btree_node_entry *); +void bch2_bset_init_next(struct btree *, struct btree_node_entry *); void bch2_bset_build_aux_tree(struct btree *, struct bset_tree *, bool); void bch2_bset_insert(struct btree *, struct btree_node_iter *, diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 79495cd7a794..d7c81beac14a 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -60,7 +60,7 @@ static void btree_node_data_free(struct bch_fs *c, struct btree *b) clear_btree_node_just_written(b); - kvpfree(b->data, btree_bytes(c)); + kvpfree(b->data, btree_buf_bytes(b)); b->data = NULL; #ifdef __KERNEL__ kvfree(b->aux_data); @@ -94,7 +94,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) { BUG_ON(b->data || b->aux_data); - b->data = kvpmalloc(btree_bytes(c), gfp); + b->data = kvpmalloc(btree_buf_bytes(b), gfp); if (!b->data) return -BCH_ERR_ENOMEM_btree_node_mem_alloc; #ifdef __KERNEL__ @@ -107,7 +107,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) b->aux_data = NULL; #endif if (!b->aux_data) { - kvpfree(b->data, btree_bytes(c)); + kvpfree(b->data, btree_buf_bytes(b)); b->data = NULL; return -BCH_ERR_ENOMEM_btree_node_mem_alloc; } @@ -126,7 +126,7 @@ static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp) bkey_btree_ptr_init(&b->key); INIT_LIST_HEAD(&b->list); INIT_LIST_HEAD(&b->write_blocked); - b->byte_order = ilog2(btree_bytes(c)); + b->byte_order = ilog2(c->opts.btree_node_size); return b; } @@ -408,7 +408,7 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c) if (c->verify_data) list_move(&c->verify_data->list, &bc->live); - kvpfree(c->verify_ondisk, btree_bytes(c)); + kvpfree(c->verify_ondisk, c->opts.btree_node_size); for (i = 0; i < btree_id_nr_alive(c); i++) { struct btree_root *r = bch2_btree_id_root(c, i); @@ -500,19 +500,21 @@ void bch2_fs_btree_cache_init_early(struct btree_cache *bc) * cannibalize_bucket() will take. This means every time we unlock the root of * the btree, we need to release this lock if we have it held. */ -void bch2_btree_cache_cannibalize_unlock(struct bch_fs *c) +void bch2_btree_cache_cannibalize_unlock(struct btree_trans *trans) { + struct bch_fs *c = trans->c; struct btree_cache *bc = &c->btree_cache; if (bc->alloc_lock == current) { - trace_and_count(c, btree_cache_cannibalize_unlock, c); + trace_and_count(c, btree_cache_cannibalize_unlock, trans); bc->alloc_lock = NULL; closure_wake_up(&bc->alloc_wait); } } -int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl) +int bch2_btree_cache_cannibalize_lock(struct btree_trans *trans, struct closure *cl) { + struct bch_fs *c = trans->c; struct btree_cache *bc = &c->btree_cache; struct task_struct *old; @@ -521,7 +523,7 @@ int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl) goto success; if (!cl) { - trace_and_count(c, btree_cache_cannibalize_lock_fail, c); + trace_and_count(c, btree_cache_cannibalize_lock_fail, trans); return -BCH_ERR_ENOMEM_btree_cache_cannibalize_lock; } @@ -535,11 +537,11 @@ int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl) goto success; } - trace_and_count(c, btree_cache_cannibalize_lock_fail, c); + trace_and_count(c, btree_cache_cannibalize_lock_fail, trans); return -BCH_ERR_btree_cache_cannibalize_lock_blocked; success: - trace_and_count(c, btree_cache_cannibalize_lock, c); + trace_and_count(c, btree_cache_cannibalize_lock, trans); return 0; } @@ -673,7 +675,7 @@ err: mutex_unlock(&bc->lock); - trace_and_count(c, btree_cache_cannibalize, c); + trace_and_count(c, btree_cache_cannibalize, trans); goto out; } @@ -717,12 +719,6 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans, if (IS_ERR(b)) return b; - /* - * Btree nodes read in from disk should not have the accessed bit set - * initially, so that linear scans don't thrash the cache: - */ - clear_btree_node_accessed(b); - bkey_copy(&b->key, k); if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) { /* raced with another fill: */ @@ -749,7 +745,7 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans, if (path && sync) bch2_trans_unlock_noassert(trans); - bch2_btree_node_read(c, b, sync); + bch2_btree_node_read(trans, b, sync); if (!sync) return NULL; @@ -1039,7 +1035,7 @@ retry: goto retry; if (IS_ERR(b) && - !bch2_btree_cache_cannibalize_lock(c, NULL)) + !bch2_btree_cache_cannibalize_lock(trans, NULL)) goto retry; if (IS_ERR(b)) @@ -1087,7 +1083,7 @@ lock_node: EBUG_ON(BTREE_NODE_LEVEL(b->data) != level); btree_check_header(c, b); out: - bch2_btree_cache_cannibalize_unlock(c); + bch2_btree_cache_cannibalize_unlock(trans); return b; } @@ -1196,7 +1192,7 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struc " failed unpacked %zu\n", b->unpack_fn_len, b->nr.live_u64s * sizeof(u64), - btree_bytes(c) - sizeof(struct btree_node), + btree_buf_bytes(b) - sizeof(struct btree_node), b->nr.live_u64s * 100 / btree_max_u64s(c), b->sib_u64s[0], b->sib_u64s[1], diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index cfb80b201d61..6d33885fdbde 100644 --- a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ -17,8 +17,8 @@ int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *); int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *, unsigned, enum btree_id); -void bch2_btree_cache_cannibalize_unlock(struct bch_fs *); -int bch2_btree_cache_cannibalize_lock(struct bch_fs *, struct closure *); +void bch2_btree_cache_cannibalize_unlock(struct btree_trans *); +int bch2_btree_cache_cannibalize_lock(struct btree_trans *, struct closure *); struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *); struct btree *bch2_btree_node_mem_alloc(struct btree_trans *, bool); @@ -74,22 +74,27 @@ static inline bool btree_node_hashed(struct btree *b) _iter = 0; _iter < (_tbl)->size; _iter++) \ rht_for_each_entry_rcu((_b), (_pos), _tbl, _iter, hash) -static inline size_t btree_bytes(struct bch_fs *c) +static inline size_t btree_buf_bytes(const struct btree *b) { - return c->opts.btree_node_size; + return 1UL << b->byte_order; } -static inline size_t btree_max_u64s(struct bch_fs *c) +static inline size_t btree_buf_max_u64s(const struct btree *b) { - return (btree_bytes(c) - sizeof(struct btree_node)) / sizeof(u64); + return (btree_buf_bytes(b) - sizeof(struct btree_node)) / sizeof(u64); } -static inline size_t btree_pages(struct bch_fs *c) +static inline size_t btree_max_u64s(const struct bch_fs *c) { - return btree_bytes(c) / PAGE_SIZE; + return (c->opts.btree_node_size - sizeof(struct btree_node)) / sizeof(u64); } -static inline unsigned btree_blocks(struct bch_fs *c) +static inline size_t btree_sectors(const struct bch_fs *c) +{ + return c->opts.btree_node_size >> SECTOR_SHIFT; +} + +static inline unsigned btree_blocks(const struct bch_fs *c) { return btree_sectors(c) >> c->block_bits; } diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 30ab78a24517..1102995643b1 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -41,6 +41,14 @@ #define DROP_THIS_NODE 10 #define DROP_PREV_NODE 11 +static struct bkey_s unsafe_bkey_s_c_to_s(struct bkey_s_c k) +{ + return (struct bkey_s) {{{ + (struct bkey *) k.k, + (struct bch_val *) k.v + }}}; +} + static bool should_restart_for_topology_repair(struct bch_fs *c) { return c->opts.fix_errors != FSCK_FIX_no && @@ -108,7 +116,7 @@ static int bch2_gc_check_topology(struct bch_fs *c, ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology); goto err; } else { - set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags); + set_bit(BCH_FS_initial_gc_unfixed, &c->flags); } } } @@ -134,7 +142,7 @@ static int bch2_gc_check_topology(struct bch_fs *c, ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology); goto err; } else { - set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags); + set_bit(BCH_FS_initial_gc_unfixed, &c->flags); } } @@ -414,10 +422,9 @@ again: continue; } - if (ret) { - bch_err_msg(c, ret, "getting btree node"); + bch_err_msg(c, ret, "getting btree node"); + if (ret) break; - } ret = btree_repair_node_boundaries(c, b, prev, cur); @@ -482,10 +489,9 @@ again: false); ret = PTR_ERR_OR_ZERO(cur); - if (ret) { - bch_err_msg(c, ret, "getting btree node"); + bch_err_msg(c, ret, "getting btree node"); + if (ret) goto err; - } ret = bch2_btree_repair_topology_recurse(trans, cur); six_unlock_read(&cur->c.lock); @@ -591,7 +597,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n" "while marking %s", p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_types[ptr_data_type(k->k, &p.ptr)], + bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), p.ptr.gen, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) { @@ -609,7 +615,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n" "while marking %s", p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_types[ptr_data_type(k->k, &p.ptr)], + bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), p.ptr.gen, g->gen, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) { @@ -619,7 +625,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id g->data_type = 0; g->dirty_sectors = 0; g->cached_sectors = 0; - set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); + set_bit(BCH_FS_need_another_gc, &c->flags); } else { do_update = true; } @@ -631,7 +637,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" "while marking %s", p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen, - bch2_data_types[ptr_data_type(k->k, &p.ptr)], + bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), p.ptr.gen, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) @@ -643,7 +649,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n" "while marking %s", p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_types[ptr_data_type(k->k, &p.ptr)], + bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), p.ptr.gen, g->gen, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) @@ -658,13 +664,13 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id "bucket %u:%zu different types of data in same bucket: %s, %s\n" "while marking %s", p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_types[g->data_type], - bch2_data_types[data_type], + bch2_data_type_str(g->data_type), + bch2_data_type_str(data_type), (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) { if (data_type == BCH_DATA_btree) { g->data_type = data_type; - set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); + set_bit(BCH_FS_need_another_gc, &c->flags); } else { do_update = true; } @@ -707,8 +713,8 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id new = kmalloc(bkey_bytes(k->k), GFP_KERNEL); if (!new) { - bch_err_msg(c, ret, "allocating new key"); ret = -BCH_ERR_ENOMEM_gc_repair_key; + bch_err_msg(c, ret, "allocating new key"); goto err; } @@ -807,9 +813,6 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, struct bch_fs *c = trans->c; struct bkey deleted = KEY(0, 0, 0); struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL }; - unsigned flags = - BTREE_TRIGGER_GC| - (initial ? BTREE_TRIGGER_NOATOMIC : 0); int ret = 0; deleted.p = k->k->p; @@ -831,11 +834,10 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, } ret = commit_do(trans, NULL, NULL, 0, - bch2_mark_key(trans, btree_id, level, old, *k, flags)); + bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(*k), BTREE_TRIGGER_GC)); fsck_err: err: - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; } @@ -996,7 +998,7 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b /* Continue marking when opted to not * fix the error: */ ret = 0; - set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags); + set_bit(BCH_FS_initial_gc_unfixed, &c->flags); continue; } } else if (ret) { @@ -1068,8 +1070,7 @@ static int bch2_gc_btree_init(struct btree_trans *trans, fsck_err: six_unlock_read(&b->c.lock); - if (ret < 0) - bch_err_fn(c, ret); + bch_err_fn(c, ret); printbuf_exit(&buf); return ret; } @@ -1105,10 +1106,8 @@ static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only) : bch2_gc_btree(trans, i, initial, metadata_only); } - if (ret < 0) - bch_err_fn(c, ret); - bch2_trans_put(trans); + bch_err_fn(c, ret); return ret; } @@ -1159,13 +1158,10 @@ static void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca, static void bch2_mark_superblocks(struct bch_fs *c) { - struct bch_dev *ca; - unsigned i; - mutex_lock(&c->sb_lock); gc_pos_set(c, gc_phase(GC_PHASE_SB)); - for_each_online_member(ca, c, i) + for_each_online_member(c, ca) bch2_mark_dev_superblock(c, ca, BTREE_TRIGGER_GC); mutex_unlock(&c->sb_lock); } @@ -1190,13 +1186,10 @@ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c) static void bch2_gc_free(struct bch_fs *c) { - struct bch_dev *ca; - unsigned i; - genradix_free(&c->reflink_gc_table); genradix_free(&c->gc_stripes); - for_each_member_device(ca, c, i) { + for_each_member_device(c, ca) { kvpfree(rcu_dereference_protected(ca->buckets_gc, 1), sizeof(struct bucket_array) + ca->mi.nbuckets * sizeof(struct bucket)); @@ -1218,7 +1211,7 @@ static int bch2_gc_done(struct bch_fs *c, bool verify = !metadata_only && !c->opts.reconstruct_alloc && (!initial || (c->sb.compat & (1ULL << BCH_COMPAT_alloc_info))); - unsigned i, dev; + unsigned i; int ret = 0; percpu_down_write(&c->mark_lock); @@ -1230,14 +1223,14 @@ static int bch2_gc_done(struct bch_fs *c, , ##__VA_ARGS__, dst->_f, src->_f))) \ dst->_f = src->_f #define copy_dev_field(_err, _f, _msg, ...) \ - copy_field(_err, _f, "dev %u has wrong " _msg, dev, ##__VA_ARGS__) + copy_field(_err, _f, "dev %u has wrong " _msg, ca->dev_idx, ##__VA_ARGS__) #define copy_fs_field(_err, _f, _msg, ...) \ copy_field(_err, _f, "fs has wrong " _msg, ##__VA_ARGS__) for (i = 0; i < ARRAY_SIZE(c->usage); i++) bch2_fs_usage_acc_to_base(c, i); - for_each_member_device(ca, c, dev) { + __for_each_member_device(c, ca) { struct bch_dev_usage *dst = ca->usage_base; struct bch_dev_usage *src = (void *) bch2_acc_percpu_u64s((u64 __percpu *) ca->usage_gc, @@ -1245,15 +1238,12 @@ static int bch2_gc_done(struct bch_fs *c, for (i = 0; i < BCH_DATA_NR; i++) { copy_dev_field(dev_usage_buckets_wrong, - d[i].buckets, "%s buckets", bch2_data_types[i]); + d[i].buckets, "%s buckets", bch2_data_type_str(i)); copy_dev_field(dev_usage_sectors_wrong, - d[i].sectors, "%s sectors", bch2_data_types[i]); + d[i].sectors, "%s sectors", bch2_data_type_str(i)); copy_dev_field(dev_usage_fragmented_wrong, - d[i].fragmented, "%s fragmented", bch2_data_types[i]); + d[i].fragmented, "%s fragmented", bch2_data_type_str(i)); } - - copy_dev_field(dev_usage_buckets_ec_wrong, - buckets_ec, "buckets_ec"); } { @@ -1263,19 +1253,19 @@ static int bch2_gc_done(struct bch_fs *c, bch2_acc_percpu_u64s((u64 __percpu *) c->usage_gc, nr); copy_fs_field(fs_usage_hidden_wrong, - hidden, "hidden"); + b.hidden, "hidden"); copy_fs_field(fs_usage_btree_wrong, - btree, "btree"); + b.btree, "btree"); if (!metadata_only) { copy_fs_field(fs_usage_data_wrong, - data, "data"); + b.data, "data"); copy_fs_field(fs_usage_cached_wrong, - cached, "cached"); + b.cached, "cached"); copy_fs_field(fs_usage_reserved_wrong, - reserved, "reserved"); + b.reserved, "reserved"); copy_fs_field(fs_usage_nr_inodes_wrong, - nr_inodes,"nr_inodes"); + b.nr_inodes,"nr_inodes"); for (i = 0; i < BCH_REPLICAS_MAX; i++) copy_fs_field(fs_usage_persistent_reserved_wrong, @@ -1284,7 +1274,7 @@ static int bch2_gc_done(struct bch_fs *c, } for (i = 0; i < c->replicas.nr; i++) { - struct bch_replicas_entry *e = + struct bch_replicas_entry_v1 *e = cpu_replicas_entry(&c->replicas, i); if (metadata_only && @@ -1307,8 +1297,7 @@ static int bch2_gc_done(struct bch_fs *c, fsck_err: if (ca) percpu_ref_put(&ca->ref); - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); percpu_up_write(&c->mark_lock); printbuf_exit(&buf); @@ -1317,9 +1306,6 @@ fsck_err: static int bch2_gc_start(struct bch_fs *c) { - struct bch_dev *ca = NULL; - unsigned i; - BUG_ON(c->usage_gc); c->usage_gc = __alloc_percpu_gfp(fs_usage_u64s(c) * sizeof(u64), @@ -1329,7 +1315,7 @@ static int bch2_gc_start(struct bch_fs *c) return -BCH_ERR_ENOMEM_gc_start; } - for_each_member_device(ca, c, i) { + for_each_member_device(c, ca) { BUG_ON(ca->usage_gc); ca->usage_gc = alloc_percpu(struct bch_dev_usage); @@ -1348,10 +1334,7 @@ static int bch2_gc_start(struct bch_fs *c) static int bch2_gc_reset(struct bch_fs *c) { - struct bch_dev *ca; - unsigned i; - - for_each_member_device(ca, c, i) { + for_each_member_device(c, ca) { free_percpu(ca->usage_gc); ca->usage_gc = NULL; } @@ -1389,9 +1372,6 @@ static int bch2_alloc_write_key(struct btree_trans *trans, enum bch_data_type type; int ret; - if (bkey_ge(iter->pos, POS(ca->dev_idx, ca->mi.nbuckets))) - return 1; - old = bch2_alloc_to_v4(k, &old_convert); new = *old; @@ -1437,8 +1417,8 @@ static int bch2_alloc_write_key(struct btree_trans *trans, ": got %s, should be %s", iter->pos.inode, iter->pos.offset, gc.gen, - bch2_data_types[new.data_type], - bch2_data_types[gc.data_type])) + bch2_data_type_str(new.data_type), + bch2_data_type_str(gc.data_type))) new.data_type = gc.data_type; #define copy_bucket_field(_errtype, _f) \ @@ -1448,7 +1428,7 @@ static int bch2_alloc_write_key(struct btree_trans *trans, ": got %u, should be %u", \ iter->pos.inode, iter->pos.offset, \ gc.gen, \ - bch2_data_types[gc.data_type], \ + bch2_data_type_str(gc.data_type), \ new._f, gc._f)) \ new._f = gc._f; \ @@ -1488,52 +1468,36 @@ fsck_err: static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only) { - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_s_c k; - struct bch_dev *ca; - unsigned i; int ret = 0; - for_each_member_device(ca, c, i) { - ret = for_each_btree_key_commit(trans, iter, BTREE_ID_alloc, - POS(ca->dev_idx, ca->mi.first_bucket), - BTREE_ITER_SLOTS|BTREE_ITER_PREFETCH, k, - NULL, NULL, BTREE_INSERT_LAZY_RW, - bch2_alloc_write_key(trans, &iter, k, metadata_only)); - - if (ret < 0) { - bch_err_fn(c, ret); + for_each_member_device(c, ca) { + ret = bch2_trans_run(c, + for_each_btree_key_upto_commit(trans, iter, BTREE_ID_alloc, + POS(ca->dev_idx, ca->mi.first_bucket), + POS(ca->dev_idx, ca->mi.nbuckets - 1), + BTREE_ITER_SLOTS|BTREE_ITER_PREFETCH, k, + NULL, NULL, BCH_TRANS_COMMIT_lazy_rw, + bch2_alloc_write_key(trans, &iter, k, metadata_only))); + if (ret) { percpu_ref_put(&ca->ref); break; } } - bch2_trans_put(trans); - return ret < 0 ? ret : 0; + bch_err_fn(c, ret); + return ret; } static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only) { - struct bch_dev *ca; - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_s_c k; - struct bucket *g; - struct bch_alloc_v4 a_convert; - const struct bch_alloc_v4 *a; - unsigned i; - int ret; - - for_each_member_device(ca, c, i) { + for_each_member_device(c, ca) { struct bucket_array *buckets = kvpmalloc(sizeof(struct bucket_array) + ca->mi.nbuckets * sizeof(struct bucket), GFP_KERNEL|__GFP_ZERO); if (!buckets) { percpu_ref_put(&ca->ref); bch_err(c, "error allocating ca->buckets[gc]"); - ret = -BCH_ERR_ENOMEM_gc_alloc_start; - goto err; + return -BCH_ERR_ENOMEM_gc_alloc_start; } buckets->first_bucket = ca->mi.first_bucket; @@ -1541,42 +1505,38 @@ static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only) rcu_assign_pointer(ca->buckets_gc, buckets); } - ret = for_each_btree_key2(trans, iter, BTREE_ID_alloc, POS_MIN, - BTREE_ITER_PREFETCH, k, ({ - ca = bch_dev_bkey_exists(c, k.k->p.inode); - g = gc_bucket(ca, k.k->p.offset); - - a = bch2_alloc_to_v4(k, &a_convert); - - g->gen_valid = 1; - g->gen = a->gen; - - if (metadata_only && - (a->data_type == BCH_DATA_user || - a->data_type == BCH_DATA_cached || - a->data_type == BCH_DATA_parity)) { - g->data_type = a->data_type; - g->dirty_sectors = a->dirty_sectors; - g->cached_sectors = a->cached_sectors; - g->stripe = a->stripe; - g->stripe_redundancy = a->stripe_redundancy; - } + int ret = bch2_trans_run(c, + for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, + BTREE_ITER_PREFETCH, k, ({ + struct bch_dev *ca = bch_dev_bkey_exists(c, k.k->p.inode); + struct bucket *g = gc_bucket(ca, k.k->p.offset); - 0; - })); -err: - bch2_trans_put(trans); - if (ret) - bch_err_fn(c, ret); + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert); + + g->gen_valid = 1; + g->gen = a->gen; + + if (metadata_only && + (a->data_type == BCH_DATA_user || + a->data_type == BCH_DATA_cached || + a->data_type == BCH_DATA_parity)) { + g->data_type = a->data_type; + g->dirty_sectors = a->dirty_sectors; + g->cached_sectors = a->cached_sectors; + g->stripe = a->stripe; + g->stripe_redundancy = a->stripe_redundancy; + } + + 0; + }))); + bch_err_fn(c, ret); return ret; } static void bch2_gc_alloc_reset(struct bch_fs *c, bool metadata_only) { - struct bch_dev *ca; - unsigned i; - - for_each_member_device(ca, c, i) { + for_each_member_device(c, ca) { struct bucket_array *buckets = gc_bucket_array(ca); struct bucket *g; @@ -1634,7 +1594,7 @@ static int bch2_gc_write_reflink_key(struct btree_trans *trans, if (!r->refcount) new->k.type = KEY_TYPE_deleted; else - *bkey_refcount(new) = cpu_to_le64(r->refcount); + *bkey_refcount(bkey_i_to_s(new)) = cpu_to_le64(r->refcount); } fsck_err: printbuf_exit(&buf); @@ -1643,64 +1603,52 @@ fsck_err: static int bch2_gc_reflink_done(struct bch_fs *c, bool metadata_only) { - struct btree_trans *trans; - struct btree_iter iter; - struct bkey_s_c k; size_t idx = 0; - int ret = 0; if (metadata_only) return 0; - trans = bch2_trans_get(c); - - ret = for_each_btree_key_commit(trans, iter, - BTREE_ID_reflink, POS_MIN, - BTREE_ITER_PREFETCH, k, - NULL, NULL, BTREE_INSERT_NOFAIL, - bch2_gc_write_reflink_key(trans, &iter, k, &idx)); - + int ret = bch2_trans_run(c, + for_each_btree_key_commit(trans, iter, + BTREE_ID_reflink, POS_MIN, + BTREE_ITER_PREFETCH, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + bch2_gc_write_reflink_key(trans, &iter, k, &idx))); c->reflink_gc_nr = 0; - bch2_trans_put(trans); return ret; } static int bch2_gc_reflink_start(struct bch_fs *c, bool metadata_only) { - struct btree_trans *trans; - struct btree_iter iter; - struct bkey_s_c k; - struct reflink_gc *r; - int ret = 0; if (metadata_only) return 0; - trans = bch2_trans_get(c); c->reflink_gc_nr = 0; - for_each_btree_key(trans, iter, BTREE_ID_reflink, POS_MIN, - BTREE_ITER_PREFETCH, k, ret) { - const __le64 *refcount = bkey_refcount_c(k); + int ret = bch2_trans_run(c, + for_each_btree_key(trans, iter, BTREE_ID_reflink, POS_MIN, + BTREE_ITER_PREFETCH, k, ({ + const __le64 *refcount = bkey_refcount_c(k); - if (!refcount) - continue; + if (!refcount) + continue; - r = genradix_ptr_alloc(&c->reflink_gc_table, c->reflink_gc_nr++, - GFP_KERNEL); - if (!r) { - ret = -BCH_ERR_ENOMEM_gc_reflink_start; - break; - } + struct reflink_gc *r = genradix_ptr_alloc(&c->reflink_gc_table, + c->reflink_gc_nr++, GFP_KERNEL); + if (!r) { + ret = -BCH_ERR_ENOMEM_gc_reflink_start; + break; + } - r->offset = k.k->p.offset; - r->size = k.k->size; - r->refcount = 0; - } - bch2_trans_iter_exit(trans, &iter); + r->offset = k.k->p.offset; + r->size = k.k->size; + r->refcount = 0; + 0; + }))); - bch2_trans_put(trans); + bch_err_fn(c, ret); return ret; } @@ -1768,24 +1716,15 @@ fsck_err: static int bch2_gc_stripes_done(struct bch_fs *c, bool metadata_only) { - struct btree_trans *trans; - struct btree_iter iter; - struct bkey_s_c k; - int ret = 0; - if (metadata_only) return 0; - trans = bch2_trans_get(c); - - ret = for_each_btree_key_commit(trans, iter, - BTREE_ID_stripes, POS_MIN, - BTREE_ITER_PREFETCH, k, - NULL, NULL, BTREE_INSERT_NOFAIL, - bch2_gc_write_stripes_key(trans, &iter, k)); - - bch2_trans_put(trans); - return ret; + return bch2_trans_run(c, + for_each_btree_key_commit(trans, iter, + BTREE_ID_stripes, POS_MIN, + BTREE_ITER_PREFETCH, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + bch2_gc_write_stripes_key(trans, &iter, k))); } static void bch2_gc_stripes_reset(struct bch_fs *c, bool metadata_only) @@ -1848,7 +1787,7 @@ again: #endif c->gc_count++; - if (test_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags) || + if (test_bit(BCH_FS_need_another_gc, &c->flags) || (!iter && bch2_test_restart_gc)) { if (iter++ > 2) { bch_info(c, "Unable to fix bucket gens, looping"); @@ -1860,7 +1799,7 @@ again: * XXX: make sure gens we fixed got saved */ bch_info(c, "Second GC pass needed, restarting:"); - clear_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); + clear_bit(BCH_FS_need_another_gc, &c->flags); __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); bch2_gc_stripes_reset(c, metadata_only); @@ -1900,9 +1839,7 @@ out: * allocator thread - issue wakeup in case they blocked on gc_lock: */ closure_wake_up(&c->freelist_wait); - - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; } @@ -1912,7 +1849,6 @@ static int gc_btree_gens_key(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr; struct bkey_i *u; int ret; @@ -1970,12 +1906,7 @@ static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_i int bch2_gc_gens(struct bch_fs *c) { - struct btree_trans *trans; - struct btree_iter iter; - struct bkey_s_c k; - struct bch_dev *ca; u64 b, start_time = local_clock(); - unsigned i; int ret; /* @@ -1988,9 +1919,8 @@ int bch2_gc_gens(struct bch_fs *c) trace_and_count(c, gc_gens_start, c); down_read(&c->gc_lock); - trans = bch2_trans_get(c); - for_each_member_device(ca, c, i) { + for_each_member_device(c, ca) { struct bucket_gens *gens = bucket_gens(ca); BUG_ON(ca->oldest_gen); @@ -2007,33 +1937,31 @@ int bch2_gc_gens(struct bch_fs *c) ca->oldest_gen[b] = gens->b[b]; } - for (i = 0; i < BTREE_ID_NR; i++) + for (unsigned i = 0; i < BTREE_ID_NR; i++) if (btree_type_has_ptrs(i)) { c->gc_gens_btree = i; c->gc_gens_pos = POS_MIN; - ret = for_each_btree_key_commit(trans, iter, i, - POS_MIN, - BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, - k, - NULL, NULL, - BTREE_INSERT_NOFAIL, - gc_btree_gens_key(trans, &iter, k)); - if (ret && !bch2_err_matches(ret, EROFS)) - bch_err_fn(c, ret); + ret = bch2_trans_run(c, + for_each_btree_key_commit(trans, iter, i, + POS_MIN, + BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, + k, + NULL, NULL, + BCH_TRANS_COMMIT_no_enospc, + gc_btree_gens_key(trans, &iter, k))); if (ret) goto err; } - ret = for_each_btree_key_commit(trans, iter, BTREE_ID_alloc, - POS_MIN, - BTREE_ITER_PREFETCH, - k, - NULL, NULL, - BTREE_INSERT_NOFAIL, - bch2_alloc_write_oldest_gen(trans, &iter, k)); - if (ret && !bch2_err_matches(ret, EROFS)) - bch_err_fn(c, ret); + ret = bch2_trans_run(c, + for_each_btree_key_commit(trans, iter, BTREE_ID_alloc, + POS_MIN, + BTREE_ITER_PREFETCH, + k, + NULL, NULL, + BCH_TRANS_COMMIT_no_enospc, + bch2_alloc_write_oldest_gen(trans, &iter, k))); if (ret) goto err; @@ -2045,14 +1973,15 @@ int bch2_gc_gens(struct bch_fs *c) bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time); trace_and_count(c, gc_gens_end, c); err: - for_each_member_device(ca, c, i) { + for_each_member_device(c, ca) { kvfree(ca->oldest_gen); ca->oldest_gen = NULL; } - bch2_trans_put(trans); up_read(&c->gc_lock); mutex_unlock(&c->gc_gens_lock); + if (!bch2_err_matches(ret, EROFS)) + bch_err_fn(c, ret); return ret; } @@ -2062,7 +1991,6 @@ static int bch2_gc_thread(void *arg) struct io_clock *clock = &c->io_clock[WRITE]; unsigned long last = atomic64_read(&clock->now); unsigned last_kick = atomic_read(&c->kick_gc); - int ret; set_freezable(); @@ -2102,11 +2030,8 @@ static int bch2_gc_thread(void *arg) #if 0 ret = bch2_gc(c, false, false); #else - ret = bch2_gc_gens(c); + bch2_gc_gens(c); #endif - if (ret < 0) - bch_err_fn(c, ret); - debug_check_no_locks_held(); } diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 5a720f0cd5a6..aa9b6cbe3226 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -112,7 +112,7 @@ static void *btree_bounce_alloc(struct bch_fs *c, size_t size, unsigned flags = memalloc_nofs_save(); void *p; - BUG_ON(size > btree_bytes(c)); + BUG_ON(size > c->opts.btree_node_size); *used_mempool = false; p = vpmalloc(size, __GFP_NOWARN|GFP_NOWAIT); @@ -174,8 +174,8 @@ static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b) ptrs = ptrs_end = ((void *) new_whiteouts + bytes); - for (k = unwritten_whiteouts_start(c, b); - k != unwritten_whiteouts_end(c, b); + for (k = unwritten_whiteouts_start(b); + k != unwritten_whiteouts_end(b); k = bkey_p_next(k)) *--ptrs = k; @@ -192,7 +192,7 @@ static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b) verify_no_dups(b, new_whiteouts, (void *) ((u64 *) new_whiteouts + b->whiteout_u64s)); - memcpy_u64s(unwritten_whiteouts_start(c, b), + memcpy_u64s(unwritten_whiteouts_start(b), new_whiteouts, b->whiteout_u64s); btree_bounce_free(c, bytes, used_mempool, new_whiteouts); @@ -313,7 +313,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b, } bytes = sorting_entire_node - ? btree_bytes(c) + ? btree_buf_bytes(b) : __vstruct_bytes(struct btree_node, u64s); out = btree_bounce_alloc(c, bytes, &used_mempool); @@ -338,7 +338,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b, if (sorting_entire_node) { u64s = le16_to_cpu(out->keys.u64s); - BUG_ON(bytes != btree_bytes(c)); + BUG_ON(bytes != btree_buf_bytes(b)); /* * Our temporary buffer is the same size as the btree node's @@ -502,7 +502,7 @@ void bch2_btree_init_next(struct btree_trans *trans, struct btree *b) bne = want_new_bset(c, b); if (bne) - bch2_bset_init_next(c, b, bne); + bch2_bset_init_next(b, bne); bch2_btree_build_aux_trees(b); @@ -524,7 +524,8 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c, prt_printf(out, "at btree "); bch2_btree_pos_to_text(out, c, b); - prt_printf(out, "\n node offset %u", b->written); + prt_printf(out, "\n node offset %u/%u", + b->written, btree_ptr_sectors_written(&b->key)); if (i) prt_printf(out, " bset u64s %u", le16_to_cpu(i->u64s)); prt_str(out, ": "); @@ -830,6 +831,23 @@ static int bset_key_invalid(struct bch_fs *c, struct btree *b, (rw == WRITE ? bch2_bkey_val_invalid(c, k, READ, err) : 0); } +static bool __bkey_valid(struct bch_fs *c, struct btree *b, + struct bset *i, struct bkey_packed *k) +{ + if (bkey_p_next(k) > vstruct_last(i)) + return false; + + if (k->format > KEY_FORMAT_CURRENT) + return false; + + struct printbuf buf = PRINTBUF; + struct bkey tmp; + struct bkey_s u = __bkey_disassemble(b, k, &tmp); + bool ret = __bch2_bkey_invalid(c, u.s_c, btree_node_type(b), READ, &buf); + printbuf_exit(&buf); + return ret; +} + static int validate_bset_keys(struct bch_fs *c, struct btree *b, struct bset *i, int write, bool have_retry, bool *saw_error) @@ -845,6 +863,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, k != vstruct_last(i);) { struct bkey_s u; struct bkey tmp; + unsigned next_good_key; if (btree_err_on(bkey_p_next(k) > vstruct_last(i), -BCH_ERR_btree_node_read_err_fixable, @@ -859,12 +878,8 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, btree_node_bkey_bad_format, - "invalid bkey format %u", k->format)) { - i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); - memmove_u64s_down(k, bkey_p_next(k), - (u64 *) vstruct_end(i) - (u64 *) k); - continue; - } + "invalid bkey format %u", k->format)) + goto drop_this_key; /* XXX: validate k->u64s */ if (!write) @@ -885,11 +900,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, c, NULL, b, i, btree_node_bad_bkey, "invalid bkey: %s", buf.buf); - - i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); - memmove_u64s_down(k, bkey_p_next(k), - (u64 *) vstruct_end(i) - (u64 *) k); - continue; + goto drop_this_key; } if (write) @@ -906,21 +917,45 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, prt_printf(&buf, " > "); bch2_bkey_to_text(&buf, u.k); - bch2_dump_bset(c, b, i, 0); - if (btree_err(-BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, btree_node_bkey_out_of_order, - "%s", buf.buf)) { - i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); - memmove_u64s_down(k, bkey_p_next(k), - (u64 *) vstruct_end(i) - (u64 *) k); - continue; - } + "%s", buf.buf)) + goto drop_this_key; } prev = k; k = bkey_p_next(k); + continue; +drop_this_key: + next_good_key = k->u64s; + + if (!next_good_key || + (BSET_BIG_ENDIAN(i) == CPU_BIG_ENDIAN && + version >= bcachefs_metadata_version_snapshot)) { + /* + * only do scanning if bch2_bkey_compat() has nothing to + * do + */ + + if (!__bkey_valid(c, b, i, (void *) ((u64 *) k + next_good_key))) { + for (next_good_key = 1; + next_good_key < (u64 *) vstruct_last(i) - (u64 *) k; + next_good_key++) + if (__bkey_valid(c, b, i, (void *) ((u64 *) k + next_good_key))) + goto got_good_key; + + } + + /* + * didn't find a good key, have to truncate the rest of + * the bset + */ + next_good_key = (u64 *) vstruct_last(i) - (u64 *) k; + } +got_good_key: + le16_add_cpu(&i->u64s, -next_good_key); + memmove_u64s_down(k, bkey_p_next(k), (u64 *) vstruct_end(i) - (u64 *) k); } fsck_err: printbuf_exit(&buf); @@ -934,7 +969,6 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, struct sort_iter *iter; struct btree_node *sorted; struct bkey_packed *k; - struct bch_extent_ptr *ptr; struct bset *i; bool used_mempool, blacklisted; bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 && @@ -943,6 +977,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, unsigned ptr_written = btree_ptr_sectors_written(&b->key); struct printbuf buf = PRINTBUF; int ret = 0, retry_read = 0, write = READ; + u64 start_time = local_clock(); b->version_ondisk = U16_MAX; /* We might get called multiple times on read retry: */ @@ -968,12 +1003,20 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, struct bch_btree_ptr_v2 *bp = &bkey_i_to_btree_ptr_v2(&b->key)->v; + bch2_bpos_to_text(&buf, b->data->min_key); + prt_str(&buf, "-"); + bch2_bpos_to_text(&buf, b->data->max_key); + btree_err_on(b->data->keys.seq != bp->seq, -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL, btree_node_bad_seq, - "got wrong btree node (seq %llx want %llx)", - b->data->keys.seq, bp->seq); + "got wrong btree node (want %llx got %llx)\n" + "got btree %s level %llu pos %s", + bp->seq, b->data->keys.seq, + bch2_btree_id_str(BTREE_NODE_ID(b->data)), + BTREE_NODE_LEVEL(b->data), + buf.buf); } else { btree_err_on(!b->data->keys.seq, -BCH_ERR_btree_node_read_err_must_retry, @@ -999,8 +1042,8 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, nonce = btree_nonce(i, b->written << 9); - csum_bad = bch2_crc_cmp(b->data->csum, - csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data)); + struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data); + csum_bad = bch2_crc_cmp(b->data->csum, csum); if (csum_bad) bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); @@ -1008,7 +1051,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, i, bset_bad_csum, - "invalid checksum"); + "%s", + (printbuf_reset(&buf), + bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), b->data->csum, csum), + buf.buf)); ret = bset_encrypt(c, i, b->written << 9); if (bch2_fs_fatal_err_on(ret, c, @@ -1037,8 +1083,8 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, "unknown checksum type %llu", BSET_CSUM_TYPE(i)); nonce = btree_nonce(i, b->written << 9); - csum_bad = bch2_crc_cmp(bne->csum, - csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne)); + struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); + csum_bad = bch2_crc_cmp(bne->csum, csum); if (csum_bad) bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); @@ -1046,7 +1092,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, i, bset_bad_csum, - "invalid checksum"); + "%s", + (printbuf_reset(&buf), + bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), bne->csum, csum), + buf.buf)); ret = bset_encrypt(c, i, b->written << 9); if (bch2_fs_fatal_err_on(ret, c, @@ -1111,7 +1160,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, ptr_written, b->written); } else { for (bne = write_block(b); - bset_byte_offset(b, bne) < btree_bytes(c); + bset_byte_offset(b, bne) < btree_buf_bytes(b); bne = (void *) bne + block_bytes(c)) btree_err_on(bne->keys.seq == b->data->keys.seq && !bch2_journal_seq_is_blacklisted(c, @@ -1123,7 +1172,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, "found bset signature after last bset"); } - sorted = btree_bounce_alloc(c, btree_bytes(c), &used_mempool); + sorted = btree_bounce_alloc(c, btree_buf_bytes(b), &used_mempool); sorted->keys.u64s = 0; set_btree_bset(b, b->set, &b->data->keys); @@ -1139,7 +1188,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, BUG_ON(b->nr.live_u64s != u64s); - btree_bounce_free(c, btree_bytes(c), used_mempool, sorted); + btree_bounce_free(c, btree_buf_bytes(b), used_mempool, sorted); if (updated_range) bch2_btree_node_drop_keys_outside_node(b); @@ -1202,6 +1251,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, out: mempool_free(iter, &c->fill_iter); printbuf_exit(&buf); + bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read_done], start_time); return retry_read; fsck_err: if (ret == -BCH_ERR_btree_node_read_err_want_retry || @@ -1234,7 +1284,7 @@ static void btree_node_read_work(struct work_struct *work) rb->have_ioref = bch2_dev_get_ioref(ca, READ); bio_reset(bio, NULL, REQ_OP_READ|REQ_SYNC|REQ_META); bio->bi_iter.bi_sector = rb->pick.ptr.offset; - bio->bi_iter.bi_size = btree_bytes(c); + bio->bi_iter.bi_size = btree_buf_bytes(b); if (rb->have_ioref) { bio_set_dev(bio, ca->disk_sb.bdev); @@ -1462,7 +1512,7 @@ fsck_err: } if (best >= 0) { - memcpy(b->data, ra->buf[best], btree_bytes(c)); + memcpy(b->data, ra->buf[best], btree_buf_bytes(b)); ret = bch2_btree_node_read_done(c, NULL, b, false, saw_error); } else { ret = -1; @@ -1528,7 +1578,7 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool for (i = 0; i < ra->nr; i++) { ra->buf[i] = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS); ra->bio[i] = bio_alloc_bioset(NULL, - buf_pages(ra->buf[i], btree_bytes(c)), + buf_pages(ra->buf[i], btree_buf_bytes(b)), REQ_OP_READ|REQ_SYNC|REQ_META, GFP_NOFS, &c->btree_bio); @@ -1548,7 +1598,7 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool rb->pick = pick; rb->bio.bi_iter.bi_sector = pick.ptr.offset; rb->bio.bi_end_io = btree_node_read_all_replicas_endio; - bch2_bio_map(&rb->bio, ra->buf[i], btree_bytes(c)); + bch2_bio_map(&rb->bio, ra->buf[i], btree_buf_bytes(b)); if (rb->have_ioref) { this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree], @@ -1575,16 +1625,17 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool return 0; } -void bch2_btree_node_read(struct bch_fs *c, struct btree *b, +void bch2_btree_node_read(struct btree_trans *trans, struct btree *b, bool sync) { + struct bch_fs *c = trans->c; struct extent_ptr_decoded pick; struct btree_read_bio *rb; struct bch_dev *ca; struct bio *bio; int ret; - trace_and_count(c, btree_node_read, c, b); + trace_and_count(c, btree_node_read, trans, b); if (bch2_verify_all_btree_replicas && !btree_node_read_all_replicas(c, b, sync)) @@ -1614,7 +1665,7 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b, ca = bch_dev_bkey_exists(c, pick.ptr.dev); bio = bio_alloc_bioset(NULL, - buf_pages(b->data, btree_bytes(c)), + buf_pages(b->data, btree_buf_bytes(b)), REQ_OP_READ|REQ_SYNC|REQ_META, GFP_NOFS, &c->btree_bio); @@ -1628,7 +1679,7 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b, INIT_WORK(&rb->work, btree_node_read_work); bio->bi_iter.bi_sector = pick.ptr.offset; bio->bi_end_io = btree_node_read_endio; - bch2_bio_map(bio, b->data, btree_bytes(c)); + bch2_bio_map(bio, b->data, btree_buf_bytes(b)); if (rb->have_ioref) { this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree], @@ -1637,7 +1688,7 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b, if (sync) { submit_bio_wait(bio); - + bch2_latency_acct(ca, rb->start_time, READ); btree_node_read_work(&rb->work); } else { submit_bio(bio); @@ -1663,12 +1714,12 @@ static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id, closure_init_stack(&cl); do { - ret = bch2_btree_cache_cannibalize_lock(c, &cl); + ret = bch2_btree_cache_cannibalize_lock(trans, &cl); closure_sync(&cl); } while (ret); b = bch2_btree_node_mem_alloc(trans, level != 0); - bch2_btree_cache_cannibalize_unlock(c); + bch2_btree_cache_cannibalize_unlock(trans); BUG_ON(IS_ERR(b)); @@ -1677,7 +1728,7 @@ static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id, set_btree_node_read_in_flight(b); - bch2_btree_node_read(c, b, true); + bch2_btree_node_read(trans, b, true); if (btree_node_read_error(b)) { bch2_btree_node_hash_remove(&c->btree_cache, b); @@ -1789,8 +1840,10 @@ static void btree_node_write_work(struct work_struct *work) bch2_bkey_drop_ptrs(bkey_i_to_s(&wbio->key), ptr, bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev)); - if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&wbio->key))) + if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&wbio->key))) { + ret = -BCH_ERR_btree_write_all_failed; goto err; + } if (wbio->wbio.first_btree_write) { if (wbio->wbio.failed.nr) { @@ -1800,9 +1853,9 @@ static void btree_node_write_work(struct work_struct *work) ret = bch2_trans_do(c, NULL, NULL, 0, bch2_btree_node_update_key_get_iter(trans, b, &wbio->key, BCH_WATERMARK_reclaim| - BTREE_INSERT_JOURNAL_RECLAIM| - BTREE_INSERT_NOFAIL| - BTREE_INSERT_NOCHECK_RW, + BCH_TRANS_COMMIT_journal_reclaim| + BCH_TRANS_COMMIT_no_enospc| + BCH_TRANS_COMMIT_no_check_rw, !wbio->wbio.failed.nr)); if (ret) goto err; @@ -1885,7 +1938,6 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b, static void btree_write_submit(struct work_struct *work) { struct btree_write_bio *wbio = container_of(work, struct btree_write_bio, work); - struct bch_extent_ptr *ptr; BKEY_PADDED_ONSTACK(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; bkey_copy(&tmp.k, &wbio->key); @@ -2022,8 +2074,8 @@ do_write: i->u64s = 0; sort_iter_add(&sort_iter.iter, - unwritten_whiteouts_start(c, b), - unwritten_whiteouts_end(c, b)); + unwritten_whiteouts_start(b), + unwritten_whiteouts_end(b)); SET_BSET_SEPARATE_WHITEOUTS(i, false); b->whiteout_u64s = 0; @@ -2199,7 +2251,7 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b) bne = want_new_bset(c, b); if (bne) - bch2_bset_init_next(c, b, bne); + bch2_bset_init_next(b, bne); bch2_btree_build_aux_trees(b); diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index e0d7fa5b1dfb..e251cb6b965f 100644 --- a/fs/bcachefs/btree_io.h +++ b/fs/bcachefs/btree_io.h @@ -130,7 +130,7 @@ void bch2_btree_init_next(struct btree_trans *, struct btree *); int bch2_btree_node_read_done(struct bch_fs *, struct bch_dev *, struct btree *, bool, bool *); -void bch2_btree_node_read(struct bch_fs *, struct btree *, bool); +void bch2_btree_node_read(struct btree_trans *, struct btree *, bool); int bch2_btree_root_read(struct bch_fs *, enum btree_id, const struct bkey_i *, unsigned); diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index da594e006769..5467a8635be1 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -13,6 +13,7 @@ #include "error.h" #include "extents.h" #include "journal.h" +#include "journal_io.h" #include "replicas.h" #include "snapshot.h" #include "trace.h" @@ -21,8 +22,8 @@ #include <linux/prefetch.h> static inline void btree_path_list_remove(struct btree_trans *, struct btree_path *); -static inline void btree_path_list_add(struct btree_trans *, struct btree_path *, - struct btree_path *); +static inline void btree_path_list_add(struct btree_trans *, + btree_path_idx_t, btree_path_idx_t); static inline unsigned long btree_iter_ip_allocated(struct btree_iter *iter) { @@ -33,7 +34,8 @@ static inline unsigned long btree_iter_ip_allocated(struct btree_iter *iter) #endif } -static struct btree_path *btree_path_alloc(struct btree_trans *, struct btree_path *); +static btree_path_idx_t btree_path_alloc(struct btree_trans *, btree_path_idx_t); +static void bch2_trans_srcu_lock(struct btree_trans *); static inline int __btree_path_cmp(const struct btree_path *l, enum btree_id r_btree_id, @@ -239,8 +241,9 @@ static void bch2_btree_path_verify(struct btree_trans *trans, void bch2_trans_verify_paths(struct btree_trans *trans) { struct btree_path *path; + unsigned iter; - trans_for_each_path(trans, path) + trans_for_each_path(trans, path, iter) bch2_btree_path_verify(trans, path); } @@ -250,7 +253,7 @@ static void bch2_btree_iter_verify(struct btree_iter *iter) BUG_ON(iter->btree_id >= BTREE_ID_NR); - BUG_ON(!!(iter->flags & BTREE_ITER_CACHED) != iter->path->cached); + BUG_ON(!!(iter->flags & BTREE_ITER_CACHED) != btree_iter_path(trans, iter)->cached); BUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) && (iter->flags & BTREE_ITER_ALL_SNAPSHOTS)); @@ -260,8 +263,8 @@ static void bch2_btree_iter_verify(struct btree_iter *iter) !btree_type_has_snapshot_field(iter->btree_id)); if (iter->update_path) - bch2_btree_path_verify(trans, iter->update_path); - bch2_btree_path_verify(trans, iter->path); + bch2_btree_path_verify(trans, &trans->paths[iter->update_path]); + bch2_btree_path_verify(trans, btree_iter_path(trans, iter)); } static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) @@ -330,12 +333,12 @@ void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, struct bpos pos, bool key_cache) { struct btree_path *path; - unsigned idx; + struct trans_for_each_path_inorder_iter iter; struct printbuf buf = PRINTBUF; btree_trans_sort_paths(trans); - trans_for_each_path_inorder(trans, path, idx) { + trans_for_each_path_inorder(trans, path, iter) { int cmp = cmp_int(path->btree_id, id) ?: cmp_int(path->cached, key_cache); @@ -415,8 +418,9 @@ void bch2_btree_path_fix_key_modified(struct btree_trans *trans, struct bkey_packed *where) { struct btree_path *path; + unsigned i; - trans_for_each_path_with_node(trans, b, path) { + trans_for_each_path_with_node(trans, b, path, i) { __bch2_btree_path_fix_key_modified(path, b, where); bch2_btree_path_verify_level(trans, path, b->c.level); } @@ -523,6 +527,7 @@ void bch2_btree_node_iter_fix(struct btree_trans *trans, { struct bset_tree *t = bch2_bkey_to_bset_inlined(b, where); struct btree_path *linked; + unsigned i; if (node_iter != &path->l[b->c.level].iter) { __bch2_btree_node_iter_fix(path, b, node_iter, t, @@ -532,7 +537,7 @@ void bch2_btree_node_iter_fix(struct btree_trans *trans, bch2_btree_node_iter_verify(node_iter, b); } - trans_for_each_path_with_node(trans, b, linked) { + trans_for_each_path_with_node(trans, b, linked, i) { __bch2_btree_node_iter_fix(linked, b, &linked->l[b->c.level].iter, t, where, clobber_u64s, new_u64s); @@ -647,7 +652,6 @@ void bch2_btree_path_level_init(struct btree_trans *trans, static void bch2_trans_revalidate_updates_in_node(struct btree_trans *trans, struct btree *b) { struct bch_fs *c = trans->c; - struct btree_insert_entry *i; trans_for_each_update(trans, i) if (!i->cached && @@ -655,7 +659,7 @@ static void bch2_trans_revalidate_updates_in_node(struct btree_trans *trans, str i->btree_id == b->c.btree_id && bpos_cmp(i->k->k.p, b->data->min_key) >= 0 && bpos_cmp(i->k->k.p, b->data->max_key) <= 0) { - i->old_v = bch2_btree_path_peek_slot(i->path, &i->old_k).v; + i->old_v = bch2_btree_path_peek_slot(trans->paths + i->path, &i->old_k).v; if (unlikely(trans->journal_replay_not_finished)) { struct bkey_i *j_k = @@ -674,14 +678,22 @@ static void bch2_trans_revalidate_updates_in_node(struct btree_trans *trans, str * A btree node is being replaced - update the iterator to point to the new * node: */ -void bch2_trans_node_add(struct btree_trans *trans, struct btree *b) +void bch2_trans_node_add(struct btree_trans *trans, + struct btree_path *path, + struct btree *b) { - struct btree_path *path; + struct btree_path *prev; + + BUG_ON(!btree_path_pos_in_node(path, b)); + + while ((prev = prev_btree_path(trans, path)) && + btree_path_pos_in_node(prev, b)) + path = prev; - trans_for_each_path(trans, path) - if (path->uptodate == BTREE_ITER_UPTODATE && - !path->cached && - btree_path_pos_in_node(path, b)) { + for (; + path && btree_path_pos_in_node(path, b); + path = next_btree_path(trans, path)) + if (path->uptodate == BTREE_ITER_UPTODATE && !path->cached) { enum btree_node_locked_type t = btree_lock_want(path, b->c.level); @@ -704,8 +716,9 @@ void bch2_trans_node_add(struct btree_trans *trans, struct btree *b) void bch2_trans_node_reinit_iter(struct btree_trans *trans, struct btree *b) { struct btree_path *path; + unsigned i; - trans_for_each_path_with_node(trans, b, path) + trans_for_each_path_with_node(trans, b, path, i) __btree_path_level_init(path, b->c.level); bch2_trans_revalidate_updates_in_node(trans, b); @@ -781,7 +794,7 @@ static int btree_path_prefetch(struct btree_trans *trans, struct btree_path *pat struct btree_node_iter node_iter = l->iter; struct bkey_packed *k; struct bkey_buf tmp; - unsigned nr = test_bit(BCH_FS_STARTED, &c->flags) + unsigned nr = test_bit(BCH_FS_started, &c->flags) ? (path->level > 1 ? 0 : 2) : (path->level > 1 ? 1 : 16); bool was_locked = btree_node_locked(path, path->level); @@ -816,7 +829,7 @@ static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *p struct bch_fs *c = trans->c; struct bkey_s_c k; struct bkey_buf tmp; - unsigned nr = test_bit(BCH_FS_STARTED, &c->flags) + unsigned nr = test_bit(BCH_FS_started, &c->flags) ? (path->level > 1 ? 0 : 2) : (path->level > 1 ? 1 : 16); bool was_locked = btree_node_locked(path, path->level); @@ -884,7 +897,8 @@ static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans, bch2_bkey_buf_reassemble(out, c, k); - if (flags & BTREE_ITER_PREFETCH) + if ((flags & BTREE_ITER_PREFETCH) && + c->opts.btree_node_prefetch) ret = btree_path_prefetch_j(trans, path, &jiter); bch2_btree_and_journal_iter_exit(&jiter); @@ -916,7 +930,8 @@ static __always_inline int btree_path_down(struct btree_trans *trans, bch2_bkey_buf_unpack(&tmp, c, l->b, bch2_btree_node_iter_peek(&l->iter, l->b)); - if (flags & BTREE_ITER_PREFETCH) { + if ((flags & BTREE_ITER_PREFETCH) && + c->opts.btree_node_prefetch) { ret = btree_path_prefetch(trans, path); if (ret) goto err; @@ -953,7 +968,8 @@ static int bch2_btree_path_traverse_all(struct btree_trans *trans) struct bch_fs *c = trans->c; struct btree_path *path; unsigned long trace_ip = _RET_IP_; - int i, ret = 0; + unsigned i; + int ret = 0; if (trans->in_traverse_all) return -BCH_ERR_transaction_restart_in_traverse_all; @@ -963,7 +979,7 @@ retry_all: trans->restarted = 0; trans->last_restarted_ip = 0; - trans_for_each_path(trans, path) + trans_for_each_path(trans, path, i) path->should_be_locked = false; btree_trans_sort_paths(trans); @@ -977,7 +993,7 @@ retry_all: closure_init_stack(&cl); do { - ret = bch2_btree_cache_cannibalize_lock(c, &cl); + ret = bch2_btree_cache_cannibalize_lock(trans, &cl); closure_sync(&cl); } while (ret); } @@ -985,16 +1001,16 @@ retry_all: /* Now, redo traversals in correct order: */ i = 0; while (i < trans->nr_sorted) { - path = trans->paths + trans->sorted[i]; + btree_path_idx_t idx = trans->sorted[i]; /* * Traversing a path can cause another path to be added at about * the same position: */ - if (path->uptodate) { - __btree_path_get(path, false); - ret = bch2_btree_path_traverse_one(trans, path, 0, _THIS_IP_); - __btree_path_put(path, false); + if (trans->paths[idx].uptodate) { + __btree_path_get(&trans->paths[idx], false); + ret = bch2_btree_path_traverse_one(trans, idx, 0, _THIS_IP_); + __btree_path_put(&trans->paths[idx], false); if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || bch2_err_matches(ret, ENOMEM)) @@ -1013,7 +1029,7 @@ retry_all: * then failed to relock a path - that's fine. */ err: - bch2_btree_cache_cannibalize_unlock(c); + bch2_btree_cache_cannibalize_unlock(trans); trans->in_traverse_all = false; @@ -1099,10 +1115,11 @@ static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans, * stashed in the iterator and returned from bch2_trans_exit(). */ int bch2_btree_path_traverse_one(struct btree_trans *trans, - struct btree_path *path, + btree_path_idx_t path_idx, unsigned flags, unsigned long trace_ip) { + struct btree_path *path = &trans->paths[path_idx]; unsigned depth_want = path->level; int ret = -((int) trans->restarted); @@ -1126,6 +1143,8 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans, goto out; } + path = &trans->paths[path_idx]; + if (unlikely(path->level >= BTREE_MAX_DEPTH)) goto out; @@ -1188,39 +1207,38 @@ static inline void btree_path_copy(struct btree_trans *trans, struct btree_path } } -static struct btree_path *btree_path_clone(struct btree_trans *trans, struct btree_path *src, - bool intent) +static btree_path_idx_t btree_path_clone(struct btree_trans *trans, btree_path_idx_t src, + bool intent) { - struct btree_path *new = btree_path_alloc(trans, src); - - btree_path_copy(trans, new, src); - __btree_path_get(new, intent); + btree_path_idx_t new = btree_path_alloc(trans, src); + btree_path_copy(trans, trans->paths + new, trans->paths + src); + __btree_path_get(trans->paths + new, intent); return new; } __flatten -struct btree_path *__bch2_btree_path_make_mut(struct btree_trans *trans, - struct btree_path *path, bool intent, - unsigned long ip) +btree_path_idx_t __bch2_btree_path_make_mut(struct btree_trans *trans, + btree_path_idx_t path, bool intent, unsigned long ip) { - __btree_path_put(path, intent); + __btree_path_put(trans->paths + path, intent); path = btree_path_clone(trans, path, intent); - path->preserve = false; + trans->paths[path].preserve = false; return path; } -struct btree_path * __must_check +btree_path_idx_t __must_check __bch2_btree_path_set_pos(struct btree_trans *trans, - struct btree_path *path, struct bpos new_pos, - bool intent, unsigned long ip, int cmp) + btree_path_idx_t path_idx, struct bpos new_pos, + bool intent, unsigned long ip) { - unsigned level = path->level; + int cmp = bpos_cmp(new_pos, trans->paths[path_idx].pos); bch2_trans_verify_not_in_restart(trans); - EBUG_ON(!path->ref); + EBUG_ON(!trans->paths[path_idx].ref); - path = bch2_btree_path_make_mut(trans, path, intent, ip); + path_idx = bch2_btree_path_make_mut(trans, path_idx, intent, ip); + struct btree_path *path = trans->paths + path_idx; path->pos = new_pos; trans->paths_sorted = false; @@ -1231,7 +1249,7 @@ __bch2_btree_path_set_pos(struct btree_trans *trans, goto out; } - level = btree_path_up_until_good_node(trans, path, cmp); + unsigned level = btree_path_up_until_good_node(trans, path, cmp); if (btree_path_node(path, level)) { struct btree_path_level *l = &path->l[level]; @@ -1261,7 +1279,7 @@ __bch2_btree_path_set_pos(struct btree_trans *trans, } out: bch2_btree_path_verify(trans, path); - return path; + return path_idx; } /* Btree path: main interface: */ @@ -1296,19 +1314,16 @@ static struct btree_path *have_node_at_pos(struct btree_trans *trans, struct btr return NULL; } -static inline void __bch2_path_free(struct btree_trans *trans, struct btree_path *path) +static inline void __bch2_path_free(struct btree_trans *trans, btree_path_idx_t path) { - __bch2_btree_path_unlock(trans, path); - btree_path_list_remove(trans, path); - trans->paths_allocated &= ~(1ULL << path->idx); + __bch2_btree_path_unlock(trans, trans->paths + path); + btree_path_list_remove(trans, trans->paths + path); + __clear_bit(path, trans->paths_allocated); } -void bch2_path_put(struct btree_trans *trans, struct btree_path *path, bool intent) +void bch2_path_put(struct btree_trans *trans, btree_path_idx_t path_idx, bool intent) { - struct btree_path *dup; - - EBUG_ON(trans->paths + path->idx != path); - EBUG_ON(!path->ref); + struct btree_path *path = trans->paths + path_idx, *dup; if (!__btree_path_put(path, intent)) return; @@ -1322,7 +1337,7 @@ void bch2_path_put(struct btree_trans *trans, struct btree_path *path, bool inte if (path->should_be_locked && !trans->restarted && - (!dup || !bch2_btree_path_relock_norestart(trans, dup, _THIS_IP_))) + (!dup || !bch2_btree_path_relock_norestart(trans, dup))) return; if (dup) { @@ -1330,16 +1345,13 @@ void bch2_path_put(struct btree_trans *trans, struct btree_path *path, bool inte dup->should_be_locked |= path->should_be_locked; } - __bch2_path_free(trans, path); + __bch2_path_free(trans, path_idx); } -static void bch2_path_put_nokeep(struct btree_trans *trans, struct btree_path *path, +static void bch2_path_put_nokeep(struct btree_trans *trans, btree_path_idx_t path, bool intent) { - EBUG_ON(trans->paths + path->idx != path); - EBUG_ON(!path->ref); - - if (!__btree_path_put(path, intent)) + if (!__btree_path_put(trans->paths + path, intent)) return; __bch2_path_free(trans, path); @@ -1362,9 +1374,6 @@ void __noreturn bch2_trans_in_restart_error(struct btree_trans *trans) noinline __cold void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans) { - struct btree_insert_entry *i; - struct btree_write_buffered_key *wb; - prt_printf(buf, "transaction updates for %s journal seq %llu", trans->fn, trans->journal_res.seq); prt_newline(buf); @@ -1388,16 +1397,10 @@ void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans) prt_newline(buf); } - trans_for_each_wb_update(trans, wb) { - prt_printf(buf, "update: btree=%s wb=1 %pS", - bch2_btree_id_str(wb->btree), - (void *) i->ip_allocated); - prt_newline(buf); - - prt_printf(buf, " new "); - bch2_bkey_val_to_text(buf, trans->c, bkey_i_to_s_c(&wb->k)); - prt_newline(buf); - } + for (struct jset_entry *e = trans->journal_entries; + e != btree_trans_journal_entries_top(trans); + e = vstruct_next(e)) + bch2_journal_entry_to_text(buf, trans->c, e); printbuf_indent_sub(buf, 2); } @@ -1412,11 +1415,12 @@ void bch2_dump_trans_updates(struct btree_trans *trans) printbuf_exit(&buf); } -noinline __cold -void bch2_btree_path_to_text(struct printbuf *out, struct btree_path *path) +static void bch2_btree_path_to_text(struct printbuf *out, struct btree_trans *trans, btree_path_idx_t path_idx) { + struct btree_path *path = trans->paths + path_idx; + prt_printf(out, "path: idx %2u ref %u:%u %c %c btree=%s l=%u pos ", - path->idx, path->ref, path->intent_ref, + path_idx, path->ref, path->intent_ref, path->preserve ? 'P' : ' ', path->should_be_locked ? 'S' : ' ', bch2_btree_id_str(path->btree_id), @@ -1434,14 +1438,13 @@ static noinline __cold void __bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans, bool nosort) { - struct btree_path *path; - unsigned idx; + struct trans_for_each_path_inorder_iter iter; if (!nosort) btree_trans_sort_paths(trans); - trans_for_each_path_inorder(trans, path, idx) - bch2_btree_path_to_text(out, path); + trans_for_each_path_idx_inorder(trans, iter) + bch2_btree_path_to_text(out, trans, iter.path_idx); } noinline __cold @@ -1473,17 +1476,14 @@ static void bch2_trans_update_max_paths(struct btree_trans *trans) { struct btree_transaction_stats *s = btree_trans_stats(trans); struct printbuf buf = PRINTBUF; - - if (!s) - return; + size_t nr = bitmap_weight(trans->paths_allocated, trans->nr_paths); bch2_trans_paths_to_text(&buf, trans); if (!buf.allocation_failure) { mutex_lock(&s->lock); - if (s->nr_max_paths < hweight64(trans->paths_allocated)) { - s->nr_max_paths = trans->nr_max_paths = - hweight64(trans->paths_allocated); + if (nr > s->nr_max_paths) { + s->nr_max_paths = nr; swap(s->max_paths_text, buf.buf); } mutex_unlock(&s->lock); @@ -1491,64 +1491,121 @@ static void bch2_trans_update_max_paths(struct btree_trans *trans) printbuf_exit(&buf); - trans->nr_max_paths = hweight64(trans->paths_allocated); + trans->nr_paths_max = nr; +} + +noinline __cold +int __bch2_btree_trans_too_many_iters(struct btree_trans *trans) +{ + if (trace_trans_restart_too_many_iters_enabled()) { + struct printbuf buf = PRINTBUF; + + bch2_trans_paths_to_text(&buf, trans); + trace_trans_restart_too_many_iters(trans, _THIS_IP_, buf.buf); + printbuf_exit(&buf); + } + + count_event(trans->c, trans_restart_too_many_iters); + + return btree_trans_restart(trans, BCH_ERR_transaction_restart_too_many_iters); } static noinline void btree_path_overflow(struct btree_trans *trans) { bch2_dump_trans_paths_updates(trans); - panic("trans path overflow\n"); + bch_err(trans->c, "trans path overflow"); } -static inline struct btree_path *btree_path_alloc(struct btree_trans *trans, - struct btree_path *pos) +static noinline void btree_paths_realloc(struct btree_trans *trans) { - struct btree_path *path; - unsigned idx; + unsigned nr = trans->nr_paths * 2; + + void *p = kzalloc(BITS_TO_LONGS(nr) * sizeof(unsigned long) + + sizeof(struct btree_trans_paths) + + nr * sizeof(struct btree_path) + + nr * sizeof(btree_path_idx_t) + 8 + + nr * sizeof(struct btree_insert_entry), GFP_KERNEL|__GFP_NOFAIL); + + unsigned long *paths_allocated = p; + memcpy(paths_allocated, trans->paths_allocated, BITS_TO_LONGS(trans->nr_paths) * sizeof(unsigned long)); + p += BITS_TO_LONGS(nr) * sizeof(unsigned long); + + p += sizeof(struct btree_trans_paths); + struct btree_path *paths = p; + *trans_paths_nr(paths) = nr; + memcpy(paths, trans->paths, trans->nr_paths * sizeof(struct btree_path)); + p += nr * sizeof(struct btree_path); + + btree_path_idx_t *sorted = p; + memcpy(sorted, trans->sorted, trans->nr_sorted * sizeof(btree_path_idx_t)); + p += nr * sizeof(btree_path_idx_t) + 8; + + struct btree_insert_entry *updates = p; + memcpy(updates, trans->updates, trans->nr_paths * sizeof(struct btree_insert_entry)); + + unsigned long *old = trans->paths_allocated; - if (unlikely(trans->paths_allocated == - ~((~0ULL << 1) << (BTREE_ITER_MAX - 1)))) - btree_path_overflow(trans); + rcu_assign_pointer(trans->paths_allocated, paths_allocated); + rcu_assign_pointer(trans->paths, paths); + rcu_assign_pointer(trans->sorted, sorted); + rcu_assign_pointer(trans->updates, updates); - idx = __ffs64(~trans->paths_allocated); + trans->nr_paths = nr; + + if (old != trans->_paths_allocated) + kfree_rcu_mightsleep(old); +} + +static inline btree_path_idx_t btree_path_alloc(struct btree_trans *trans, + btree_path_idx_t pos) +{ + btree_path_idx_t idx = find_first_zero_bit(trans->paths_allocated, trans->nr_paths); + + if (unlikely(idx == trans->nr_paths)) { + if (trans->nr_paths == BTREE_ITER_MAX) { + btree_path_overflow(trans); + return 0; + } + + btree_paths_realloc(trans); + } /* * Do this before marking the new path as allocated, since it won't be * initialized yet: */ - if (unlikely(idx > trans->nr_max_paths)) + if (unlikely(idx > trans->nr_paths_max)) bch2_trans_update_max_paths(trans); - trans->paths_allocated |= 1ULL << idx; + __set_bit(idx, trans->paths_allocated); - path = &trans->paths[idx]; - path->idx = idx; + struct btree_path *path = &trans->paths[idx]; path->ref = 0; path->intent_ref = 0; path->nodes_locked = 0; - path->alloc_seq++; - btree_path_list_add(trans, pos, path); + btree_path_list_add(trans, pos, idx); trans->paths_sorted = false; - return path; + return idx; } -struct btree_path *bch2_path_get(struct btree_trans *trans, - enum btree_id btree_id, struct bpos pos, - unsigned locks_want, unsigned level, - unsigned flags, unsigned long ip) +btree_path_idx_t bch2_path_get(struct btree_trans *trans, + enum btree_id btree_id, struct bpos pos, + unsigned locks_want, unsigned level, + unsigned flags, unsigned long ip) { - struct btree_path *path, *path_pos = NULL; + struct btree_path *path; bool cached = flags & BTREE_ITER_CACHED; bool intent = flags & BTREE_ITER_INTENT; - int i; + struct trans_for_each_path_inorder_iter iter; + btree_path_idx_t path_pos = 0, path_idx; bch2_trans_verify_not_in_restart(trans); bch2_trans_verify_locks(trans); btree_trans_sort_paths(trans); - trans_for_each_path_inorder(trans, path, i) { + trans_for_each_path_inorder(trans, path, iter) { if (__btree_path_cmp(path, btree_id, cached, @@ -1556,18 +1613,19 @@ struct btree_path *bch2_path_get(struct btree_trans *trans, level) > 0) break; - path_pos = path; + path_pos = iter.path_idx; } if (path_pos && - path_pos->cached == cached && - path_pos->btree_id == btree_id && - path_pos->level == level) { - __btree_path_get(path_pos, intent); - path = bch2_btree_path_set_pos(trans, path_pos, pos, intent, ip); + trans->paths[path_pos].cached == cached && + trans->paths[path_pos].btree_id == btree_id && + trans->paths[path_pos].level == level) { + __btree_path_get(trans->paths + path_pos, intent); + path_idx = bch2_btree_path_set_pos(trans, path_pos, pos, intent, ip); + path = trans->paths + path_idx; } else { - path = btree_path_alloc(trans, path_pos); - path_pos = NULL; + path_idx = btree_path_alloc(trans, path_pos); + path = trans->paths + path_idx; __btree_path_get(path, intent); path->pos = pos; @@ -1578,7 +1636,7 @@ struct btree_path *bch2_path_get(struct btree_trans *trans, path->level = level; path->locks_want = locks_want; path->nodes_locked = 0; - for (i = 0; i < ARRAY_SIZE(path->l); i++) + for (unsigned i = 0; i < ARRAY_SIZE(path->l); i++) path->l[i].b = ERR_PTR(-BCH_ERR_no_btree_node_init); #ifdef TRACK_PATH_ALLOCATED path->ip_allocated = ip; @@ -1604,7 +1662,7 @@ struct btree_path *bch2_path_get(struct btree_trans *trans, if (locks_want > path->locks_want) bch2_btree_path_upgrade_noupgrade_sibs(trans, path, locks_want, NULL); - return path; + return path_idx; } struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *u) @@ -1659,9 +1717,10 @@ __bch2_btree_iter_traverse(struct btree_iter *iter) int __must_check bch2_btree_iter_traverse(struct btree_iter *iter) { + struct btree_trans *trans = iter->trans; int ret; - iter->path = bch2_btree_path_set_pos(iter->trans, iter->path, + iter->path = bch2_btree_path_set_pos(trans, iter->path, btree_iter_search_key(iter), iter->flags & BTREE_ITER_INTENT, btree_iter_ip_allocated(iter)); @@ -1670,7 +1729,7 @@ bch2_btree_iter_traverse(struct btree_iter *iter) if (ret) return ret; - btree_path_set_should_be_locked(iter->path); + btree_path_set_should_be_locked(trans->paths + iter->path); return 0; } @@ -1682,14 +1741,15 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) struct btree *b = NULL; int ret; - EBUG_ON(iter->path->cached); + EBUG_ON(trans->paths[iter->path].cached); bch2_btree_iter_verify(iter); ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); if (ret) goto err; - b = btree_path_node(iter->path, iter->path->level); + struct btree_path *path = btree_iter_path(trans, iter); + b = btree_path_node(path, path->level); if (!b) goto out; @@ -1701,7 +1761,7 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p, iter->flags & BTREE_ITER_INTENT, btree_iter_ip_allocated(iter)); - btree_path_set_should_be_locked(iter->path); + btree_path_set_should_be_locked(btree_iter_path(trans, iter)); out: bch2_btree_iter_verify_entry_exit(iter); bch2_btree_iter_verify(iter); @@ -1726,14 +1786,15 @@ struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *iter) struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) { struct btree_trans *trans = iter->trans; - struct btree_path *path = iter->path; struct btree *b = NULL; int ret; + EBUG_ON(trans->paths[iter->path].cached); bch2_trans_verify_not_in_restart(trans); - EBUG_ON(iter->path->cached); bch2_btree_iter_verify(iter); + struct btree_path *path = btree_iter_path(trans, iter); + /* already at end? */ if (!btree_path_node(path, path->level)) return NULL; @@ -1763,17 +1824,19 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) * Haven't gotten to the end of the parent node: go back down to * the next child node */ - path = iter->path = - bch2_btree_path_set_pos(trans, path, bpos_successor(iter->pos), - iter->flags & BTREE_ITER_INTENT, - btree_iter_ip_allocated(iter)); + iter->path = bch2_btree_path_set_pos(trans, iter->path, + bpos_successor(iter->pos), + iter->flags & BTREE_ITER_INTENT, + btree_iter_ip_allocated(iter)); + path = btree_iter_path(trans, iter); btree_path_set_level_down(trans, path, iter->min_depth); - ret = bch2_btree_path_traverse(trans, path, iter->flags); + ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); if (ret) goto err; + path = btree_iter_path(trans, iter); b = path->l[path->level].b; } @@ -1783,8 +1846,8 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p, iter->flags & BTREE_ITER_INTENT, btree_iter_ip_allocated(iter)); - btree_path_set_should_be_locked(iter->path); - BUG_ON(iter->path->uptodate); + btree_path_set_should_be_locked(btree_iter_path(trans, iter)); + EBUG_ON(btree_iter_path(trans, iter)->uptodate); out: bch2_btree_iter_verify_entry_exit(iter); bch2_btree_iter_verify(iter); @@ -1799,23 +1862,15 @@ err: inline bool bch2_btree_iter_advance(struct btree_iter *iter) { - if (likely(!(iter->flags & BTREE_ITER_ALL_LEVELS))) { - struct bpos pos = iter->k.p; - bool ret = !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS - ? bpos_eq(pos, SPOS_MAX) - : bkey_eq(pos, SPOS_MAX)); - - if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) - pos = bkey_successor(iter, pos); - bch2_btree_iter_set_pos(iter, pos); - return ret; - } else { - if (!btree_path_node(iter->path, iter->path->level)) - return true; + struct bpos pos = iter->k.p; + bool ret = !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS + ? bpos_eq(pos, SPOS_MAX) + : bkey_eq(pos, SPOS_MAX)); - iter->advanced = true; - return false; - } + if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) + pos = bkey_successor(iter, pos); + bch2_btree_iter_set_pos(iter, pos); + return ret; } inline bool bch2_btree_iter_rewind(struct btree_iter *iter) @@ -1832,58 +1887,70 @@ inline bool bch2_btree_iter_rewind(struct btree_iter *iter) } static noinline -struct bkey_i *__bch2_btree_trans_peek_updates(struct btree_iter *iter) +void bch2_btree_trans_peek_prev_updates(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c *k) { - struct btree_insert_entry *i; - struct bkey_i *ret = NULL; + struct bpos end = path_l(btree_iter_path(trans, iter))->b->data->min_key; - trans_for_each_update(iter->trans, i) { - if (i->btree_id < iter->btree_id) - continue; - if (i->btree_id > iter->btree_id) - break; - if (bpos_lt(i->k->k.p, iter->path->pos)) - continue; - if (i->key_cache_already_flushed) - continue; - if (!ret || bpos_lt(i->k->k.p, ret->k.p)) - ret = i->k; - } + trans_for_each_update(trans, i) + if (!i->key_cache_already_flushed && + i->btree_id == iter->btree_id && + bpos_le(i->k->k.p, iter->pos) && + bpos_ge(i->k->k.p, k->k ? k->k->p : end)) { + iter->k = i->k->k; + *k = bkey_i_to_s_c(i->k); + } +} - return ret; +static noinline +void bch2_btree_trans_peek_updates(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c *k) +{ + struct btree_path *path = btree_iter_path(trans, iter); + struct bpos end = path_l(path)->b->key.k.p; + + trans_for_each_update(trans, i) + if (!i->key_cache_already_flushed && + i->btree_id == iter->btree_id && + bpos_ge(i->k->k.p, path->pos) && + bpos_le(i->k->k.p, k->k ? k->k->p : end)) { + iter->k = i->k->k; + *k = bkey_i_to_s_c(i->k); + } } -static inline struct bkey_i *btree_trans_peek_updates(struct btree_iter *iter) +static noinline +void bch2_btree_trans_peek_slot_updates(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c *k) { - return iter->flags & BTREE_ITER_WITH_UPDATES - ? __bch2_btree_trans_peek_updates(iter) - : NULL; + trans_for_each_update(trans, i) + if (!i->key_cache_already_flushed && + i->btree_id == iter->btree_id && + bpos_eq(i->k->k.p, iter->pos)) { + iter->k = i->k->k; + *k = bkey_i_to_s_c(i->k); + } } static struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans, struct btree_iter *iter, struct bpos end_pos) { - struct bkey_i *k; - - if (bpos_lt(iter->path->pos, iter->journal_pos)) - iter->journal_idx = 0; + struct btree_path *path = btree_iter_path(trans, iter); - k = bch2_journal_keys_peek_upto(trans->c, iter->btree_id, - iter->path->level, - iter->path->pos, - end_pos, - &iter->journal_idx); - - iter->journal_pos = k ? k->k.p : end_pos; - return k; + return bch2_journal_keys_peek_upto(trans->c, iter->btree_id, + path->level, + path->pos, + end_pos, + &iter->journal_idx); } static noinline struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans, struct btree_iter *iter) { - struct bkey_i *k = bch2_btree_journal_peek(trans, iter, iter->path->pos); + struct btree_path *path = btree_iter_path(trans, iter); + struct bkey_i *k = bch2_btree_journal_peek(trans, iter, path->pos); if (k) { iter->k = k->k; @@ -1898,9 +1965,10 @@ struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k) { + struct btree_path *path = btree_iter_path(trans, iter); struct bkey_i *next_journal = bch2_btree_journal_peek(trans, iter, - k.k ? k.k->p : path_l(iter->path)->b->key.k.p); + k.k ? k.k->p : path_l(path)->b->key.k.p); if (next_journal) { iter->k = next_journal->k; @@ -1943,13 +2011,13 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos ret = bch2_btree_path_traverse(trans, iter->key_cache_path, iter->flags|BTREE_ITER_CACHED) ?: - bch2_btree_path_relock(trans, iter->path, _THIS_IP_); + bch2_btree_path_relock(trans, btree_iter_path(trans, iter), _THIS_IP_); if (unlikely(ret)) return bkey_s_c_err(ret); - btree_path_set_should_be_locked(iter->key_cache_path); + btree_path_set_should_be_locked(trans->paths + iter->key_cache_path); - k = bch2_btree_path_peek_slot(iter->key_cache_path, &u); + k = bch2_btree_path_peek_slot(trans->paths + iter->key_cache_path, &u); if (k.k && !bkey_err(k)) { iter->k = u; k.k = &iter->k; @@ -1960,11 +2028,10 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bpos search_key) { struct btree_trans *trans = iter->trans; - struct bkey_i *next_update; struct bkey_s_c k, k2; int ret; - EBUG_ON(iter->path->cached); + EBUG_ON(btree_iter_path(trans, iter)->cached); bch2_btree_iter_verify(iter); while (1) { @@ -1982,7 +2049,8 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp goto out; } - l = path_l(iter->path); + struct btree_path *path = btree_iter_path(trans, iter); + l = path_l(path); if (unlikely(!l->b)) { /* No btree nodes at requested level: */ @@ -1991,7 +2059,7 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp goto out; } - btree_path_set_should_be_locked(iter->path); + btree_path_set_should_be_locked(path); k = btree_path_level_peek_all(trans->c, l, &iter->k); @@ -2009,14 +2077,9 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL)) k = btree_trans_peek_journal(trans, iter, k); - next_update = btree_trans_peek_updates(iter); - - if (next_update && - bpos_le(next_update->k.p, - k.k ? k.k->p : l->b->key.k.p)) { - iter->k = next_update->k; - k = bkey_i_to_s_c(next_update); - } + if (unlikely((iter->flags & BTREE_ITER_WITH_UPDATES) && + trans->nr_updates)) + bch2_btree_trans_peek_updates(trans, iter, &k); if (k.k && bkey_deleted(k.k)) { /* @@ -2066,13 +2129,12 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e struct bpos iter_pos; int ret; - EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS); EBUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && bkey_eq(end, POS_MAX)); if (iter->update_path) { bch2_path_put_nokeep(trans, iter->update_path, iter->flags & BTREE_ITER_INTENT); - iter->update_path = NULL; + iter->update_path = 0; } bch2_btree_iter_verify_entry_exit(iter); @@ -2098,10 +2160,10 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e goto end; if (iter->update_path && - !bkey_eq(iter->update_path->pos, k.k->p)) { + !bkey_eq(trans->paths[iter->update_path].pos, k.k->p)) { bch2_path_put_nokeep(trans, iter->update_path, iter->flags & BTREE_ITER_INTENT); - iter->update_path = NULL; + iter->update_path = 0; } if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && @@ -2121,7 +2183,7 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e * advance, same as on exit for iter->path, but only up * to snapshot */ - __btree_path_get(iter->path, iter->flags & BTREE_ITER_INTENT); + __btree_path_get(trans->paths + iter->path, iter->flags & BTREE_ITER_INTENT); iter->update_path = iter->path; iter->update_path = bch2_btree_path_set_pos(trans, @@ -2177,14 +2239,14 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e iter->flags & BTREE_ITER_INTENT, btree_iter_ip_allocated(iter)); - btree_path_set_should_be_locked(iter->path); + btree_path_set_should_be_locked(btree_iter_path(trans, iter)); out_no_locked: if (iter->update_path) { - ret = bch2_btree_path_relock(trans, iter->update_path, _THIS_IP_); + ret = bch2_btree_path_relock(trans, trans->paths + iter->update_path, _THIS_IP_); if (unlikely(ret)) k = bkey_s_c_err(ret); else - btree_path_set_should_be_locked(iter->update_path); + btree_path_set_should_be_locked(trans->paths + iter->update_path); } if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) @@ -2206,103 +2268,6 @@ end: } /** - * bch2_btree_iter_peek_all_levels() - returns the first key greater than or - * equal to iterator's current position, returning keys from every level of the - * btree. For keys at different levels of the btree that compare equal, the key - * from the lower level (leaf) is returned first. - * @iter: iterator to peek from - * - * Returns: key if found, or an error extractable with bkey_err(). - */ -struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *iter) -{ - struct btree_trans *trans = iter->trans; - struct bkey_s_c k; - int ret; - - EBUG_ON(iter->path->cached); - bch2_btree_iter_verify(iter); - BUG_ON(iter->path->level < iter->min_depth); - BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)); - EBUG_ON(!(iter->flags & BTREE_ITER_ALL_LEVELS)); - - while (1) { - iter->path = bch2_btree_path_set_pos(trans, iter->path, iter->pos, - iter->flags & BTREE_ITER_INTENT, - btree_iter_ip_allocated(iter)); - - ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); - if (unlikely(ret)) { - /* ensure that iter->k is consistent with iter->pos: */ - bch2_btree_iter_set_pos(iter, iter->pos); - k = bkey_s_c_err(ret); - goto out_no_locked; - } - - /* Already at end? */ - if (!btree_path_node(iter->path, iter->path->level)) { - k = bkey_s_c_null; - goto out_no_locked; - } - - k = btree_path_level_peek_all(trans->c, - &iter->path->l[iter->path->level], &iter->k); - - /* Check if we should go up to the parent node: */ - if (!k.k || - (iter->advanced && - bpos_eq(path_l(iter->path)->b->key.k.p, iter->pos))) { - iter->pos = path_l(iter->path)->b->key.k.p; - btree_path_set_level_up(trans, iter->path); - iter->advanced = false; - continue; - } - - /* - * Check if we should go back down to a leaf: - * If we're not in a leaf node, we only return the current key - * if it exactly matches iter->pos - otherwise we first have to - * go back to the leaf: - */ - if (iter->path->level != iter->min_depth && - (iter->advanced || - !k.k || - !bpos_eq(iter->pos, k.k->p))) { - btree_path_set_level_down(trans, iter->path, iter->min_depth); - iter->pos = bpos_successor(iter->pos); - iter->advanced = false; - continue; - } - - /* Check if we should go to the next key: */ - if (iter->path->level == iter->min_depth && - iter->advanced && - k.k && - bpos_eq(iter->pos, k.k->p)) { - iter->pos = bpos_successor(iter->pos); - iter->advanced = false; - continue; - } - - if (iter->advanced && - iter->path->level == iter->min_depth && - !bpos_eq(k.k->p, iter->pos)) - iter->advanced = false; - - BUG_ON(iter->advanced); - BUG_ON(!k.k); - break; - } - - iter->pos = k.k->p; - btree_path_set_should_be_locked(iter->path); -out_no_locked: - bch2_btree_iter_verify(iter); - - return k; -} - -/** * bch2_btree_iter_next() - returns first key greater than iterator's current * position * @iter: iterator to peek from @@ -2328,14 +2293,14 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) { struct btree_trans *trans = iter->trans; struct bpos search_key = iter->pos; - struct btree_path *saved_path = NULL; struct bkey_s_c k; struct bkey saved_k; const struct bch_val *saved_v; + btree_path_idx_t saved_path = 0; int ret; - EBUG_ON(iter->path->cached || iter->path->level); - EBUG_ON(iter->flags & BTREE_ITER_WITH_UPDATES); + EBUG_ON(btree_iter_path(trans, iter)->cached || + btree_iter_path(trans, iter)->level); if (iter->flags & BTREE_ITER_WITH_JOURNAL) return bkey_s_c_err(-EIO); @@ -2359,14 +2324,18 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) goto out_no_locked; } - k = btree_path_level_peek(trans, iter->path, - &iter->path->l[0], &iter->k); + struct btree_path *path = btree_iter_path(trans, iter); + + k = btree_path_level_peek(trans, path, &path->l[0], &iter->k); if (!k.k || ((iter->flags & BTREE_ITER_IS_EXTENTS) ? bpos_ge(bkey_start_pos(k.k), search_key) : bpos_gt(k.k->p, search_key))) - k = btree_path_level_prev(trans, iter->path, - &iter->path->l[0], &iter->k); + k = btree_path_level_prev(trans, path, &path->l[0], &iter->k); + + if (unlikely((iter->flags & BTREE_ITER_WITH_UPDATES) && + trans->nr_updates)) + bch2_btree_trans_peek_prev_updates(trans, iter, &k); if (likely(k.k)) { if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) { @@ -2382,13 +2351,13 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) bch2_path_put_nokeep(trans, iter->path, iter->flags & BTREE_ITER_INTENT); iter->path = saved_path; - saved_path = NULL; + saved_path = 0; iter->k = saved_k; k.v = saved_v; goto got_key; } - if (bch2_snapshot_is_ancestor(iter->trans->c, + if (bch2_snapshot_is_ancestor(trans->c, iter->snapshot, k.k->p.snapshot)) { if (saved_path) @@ -2396,6 +2365,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) iter->flags & BTREE_ITER_INTENT); saved_path = btree_path_clone(trans, iter->path, iter->flags & BTREE_ITER_INTENT); + path = btree_iter_path(trans, iter); saved_k = *k.k; saved_v = k.v; } @@ -2412,10 +2382,11 @@ got_key: continue; } + btree_path_set_should_be_locked(path); break; - } else if (likely(!bpos_eq(iter->path->l[0].b->data->min_key, POS_MIN))) { + } else if (likely(!bpos_eq(path->l[0].b->data->min_key, POS_MIN))) { /* Advance to previous leaf node: */ - search_key = bpos_predecessor(iter->path->l[0].b->data->min_key); + search_key = bpos_predecessor(path->l[0].b->data->min_key); } else { /* Start of btree: */ bch2_btree_iter_set_pos(iter, POS_MIN); @@ -2432,8 +2403,6 @@ got_key: if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) iter->pos.snapshot = iter->snapshot; - - btree_path_set_should_be_locked(iter->path); out_no_locked: if (saved_path) bch2_path_put_nokeep(trans, saved_path, iter->flags & BTREE_ITER_INTENT); @@ -2468,8 +2437,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) bch2_btree_iter_verify(iter); bch2_btree_iter_verify_entry_exit(iter); - EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS); - EBUG_ON(iter->path->level && (iter->flags & BTREE_ITER_WITH_KEY_CACHE)); + EBUG_ON(btree_iter_path(trans, iter)->level && (iter->flags & BTREE_ITER_WITH_KEY_CACHE)); /* extents can't span inode numbers: */ if ((iter->flags & BTREE_ITER_IS_EXTENTS) && @@ -2493,13 +2461,13 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) if ((iter->flags & BTREE_ITER_CACHED) || !(iter->flags & (BTREE_ITER_IS_EXTENTS|BTREE_ITER_FILTER_SNAPSHOTS))) { - struct bkey_i *next_update; + k = bkey_s_c_null; - if ((next_update = btree_trans_peek_updates(iter)) && - bpos_eq(next_update->k.p, iter->pos)) { - iter->k = next_update->k; - k = bkey_i_to_s_c(next_update); - goto out; + if (unlikely((iter->flags & BTREE_ITER_WITH_UPDATES) && + trans->nr_updates)) { + bch2_btree_trans_peek_slot_updates(trans, iter, &k); + if (k.k) + goto out; } if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL) && @@ -2514,7 +2482,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) goto out_no_locked; } - k = bch2_btree_path_peek_slot(iter->path, &iter->k); + k = bch2_btree_path_peek_slot(trans->paths + iter->path, &iter->k); if (unlikely(!k.k)) goto out_no_locked; } else { @@ -2524,7 +2492,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) if (iter->flags & BTREE_ITER_IS_EXTENTS) end.offset = U64_MAX; - EBUG_ON(iter->path->level); + EBUG_ON(btree_iter_path(trans, iter)->level); if (iter->flags & BTREE_ITER_INTENT) { struct btree_iter iter2; @@ -2570,7 +2538,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) } } out: - btree_path_set_should_be_locked(iter->path); + btree_path_set_should_be_locked(btree_iter_path(trans, iter)); out_no_locked: bch2_btree_iter_verify_entry_exit(iter); bch2_btree_iter_verify(iter); @@ -2617,17 +2585,17 @@ static void btree_trans_verify_sorted_refs(struct btree_trans *trans) struct btree_path *path; unsigned i; - BUG_ON(trans->nr_sorted != hweight64(trans->paths_allocated)); + BUG_ON(trans->nr_sorted != bitmap_weight(trans->paths_allocated, trans->nr_paths) - 1); - trans_for_each_path(trans, path) { + trans_for_each_path(trans, path, i) { BUG_ON(path->sorted_idx >= trans->nr_sorted); - BUG_ON(trans->sorted[path->sorted_idx] != path->idx); + BUG_ON(trans->sorted[path->sorted_idx] != i); } for (i = 0; i < trans->nr_sorted; i++) { unsigned idx = trans->sorted[i]; - EBUG_ON(!(trans->paths_allocated & (1ULL << idx))); + BUG_ON(!test_bit(idx, trans->paths_allocated)); BUG_ON(trans->paths[idx].sorted_idx != i); } } @@ -2635,12 +2603,12 @@ static void btree_trans_verify_sorted_refs(struct btree_trans *trans) static void btree_trans_verify_sorted(struct btree_trans *trans) { struct btree_path *path, *prev = NULL; - unsigned i; + struct trans_for_each_path_inorder_iter iter; if (!bch2_debug_check_iterators) return; - trans_for_each_path_inorder(trans, path, i) { + trans_for_each_path_inorder(trans, path, iter) { if (prev && btree_path_cmp(prev, path) > 0) { __bch2_dump_trans_paths_updates(trans, true); panic("trans paths out of order!\n"); @@ -2697,42 +2665,40 @@ out: static inline void btree_path_list_remove(struct btree_trans *trans, struct btree_path *path) { - unsigned i; - EBUG_ON(path->sorted_idx >= trans->nr_sorted); #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS trans->nr_sorted--; memmove_u64s_down_small(trans->sorted + path->sorted_idx, trans->sorted + path->sorted_idx + 1, - DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx, 8)); + DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx, + sizeof(u64) / sizeof(btree_path_idx_t))); #else array_remove_item(trans->sorted, trans->nr_sorted, path->sorted_idx); #endif - for (i = path->sorted_idx; i < trans->nr_sorted; i++) + for (unsigned i = path->sorted_idx; i < trans->nr_sorted; i++) trans->paths[trans->sorted[i]].sorted_idx = i; - - path->sorted_idx = U8_MAX; } static inline void btree_path_list_add(struct btree_trans *trans, - struct btree_path *pos, - struct btree_path *path) + btree_path_idx_t pos, + btree_path_idx_t path_idx) { - unsigned i; + struct btree_path *path = trans->paths + path_idx; - path->sorted_idx = pos ? pos->sorted_idx + 1 : trans->nr_sorted; + path->sorted_idx = pos ? trans->paths[pos].sorted_idx + 1 : trans->nr_sorted; #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS memmove_u64s_up_small(trans->sorted + path->sorted_idx + 1, trans->sorted + path->sorted_idx, - DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx, 8)); + DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx, + sizeof(u64) / sizeof(btree_path_idx_t))); trans->nr_sorted++; - trans->sorted[path->sorted_idx] = path->idx; + trans->sorted[path->sorted_idx] = path_idx; #else - array_insert_item(trans->sorted, trans->nr_sorted, path->sorted_idx, path->idx); + array_insert_item(trans->sorted, trans->nr_sorted, path->sorted_idx, path_idx); #endif - for (i = path->sorted_idx; i < trans->nr_sorted; i++) + for (unsigned i = path->sorted_idx; i < trans->nr_sorted; i++) trans->paths[trans->sorted[i]].sorted_idx = i; btree_trans_verify_sorted_refs(trans); @@ -2749,9 +2715,10 @@ void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter) if (iter->key_cache_path) bch2_path_put(trans, iter->key_cache_path, iter->flags & BTREE_ITER_INTENT); - iter->path = NULL; - iter->update_path = NULL; - iter->key_cache_path = NULL; + iter->path = 0; + iter->update_path = 0; + iter->key_cache_path = 0; + iter->trans = NULL; } void bch2_trans_iter_init_outlined(struct btree_trans *trans, @@ -2782,41 +2749,46 @@ void bch2_trans_node_iter_init(struct btree_trans *trans, iter->min_depth = depth; - BUG_ON(iter->path->locks_want < min(locks_want, BTREE_MAX_DEPTH)); - BUG_ON(iter->path->level != depth); - BUG_ON(iter->min_depth != depth); + struct btree_path *path = btree_iter_path(trans, iter); + BUG_ON(path->locks_want < min(locks_want, BTREE_MAX_DEPTH)); + BUG_ON(path->level != depth); + BUG_ON(iter->min_depth != depth); } void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src) { + struct btree_trans *trans = src->trans; + *dst = *src; if (src->path) - __btree_path_get(src->path, src->flags & BTREE_ITER_INTENT); + __btree_path_get(trans->paths + src->path, src->flags & BTREE_ITER_INTENT); if (src->update_path) - __btree_path_get(src->update_path, src->flags & BTREE_ITER_INTENT); - dst->key_cache_path = NULL; + __btree_path_get(trans->paths + src->update_path, src->flags & BTREE_ITER_INTENT); + dst->key_cache_path = 0; } void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) { + struct bch_fs *c = trans->c; unsigned new_top = trans->mem_top + size; - size_t old_bytes = trans->mem_bytes; - size_t new_bytes = roundup_pow_of_two(new_top); + unsigned old_bytes = trans->mem_bytes; + unsigned new_bytes = roundup_pow_of_two(new_top); int ret; void *new_mem; void *p; - trans->mem_max = max(trans->mem_max, new_top); - WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX); + struct btree_transaction_stats *s = btree_trans_stats(trans); + s->max_mem = max(s->max_mem, new_bytes); + new_mem = krealloc(trans->mem, new_bytes, GFP_NOWAIT|__GFP_NOWARN); if (unlikely(!new_mem)) { bch2_trans_unlock(trans); new_mem = krealloc(trans->mem, new_bytes, GFP_KERNEL); if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) { - new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL); + new_mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL); new_bytes = BTREE_TRANS_MEM_MAX; kfree(trans->mem); } @@ -2836,7 +2808,7 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) trans->mem_bytes = new_bytes; if (old_bytes) { - trace_and_count(trans->c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes); + trace_and_count(c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes); return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced)); } @@ -2858,8 +2830,9 @@ void bch2_trans_srcu_unlock(struct btree_trans *trans) if (trans->srcu_held) { struct bch_fs *c = trans->c; struct btree_path *path; + unsigned i; - trans_for_each_path(trans, path) + trans_for_each_path(trans, path, i) if (path->cached && !btree_node_locked(path, 0)) path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_srcu_reset); @@ -2869,7 +2842,7 @@ void bch2_trans_srcu_unlock(struct btree_trans *trans) } } -void bch2_trans_srcu_lock(struct btree_trans *trans) +static void bch2_trans_srcu_lock(struct btree_trans *trans) { if (!trans->srcu_held) { trans->srcu_idx = srcu_read_lock(&trans->c->btree_trans_barrier); @@ -2891,14 +2864,16 @@ void bch2_trans_srcu_lock(struct btree_trans *trans) u32 bch2_trans_begin(struct btree_trans *trans) { struct btree_path *path; + unsigned i; u64 now; bch2_trans_reset_updates(trans); trans->restart_count++; trans->mem_top = 0; + trans->journal_entries = NULL; - trans_for_each_path(trans, path) { + trans_for_each_path(trans, path, i) { path->should_be_locked = false; /* @@ -2915,15 +2890,21 @@ u32 bch2_trans_begin(struct btree_trans *trans) * iterators if we do that */ if (!path->ref && !path->preserve) - __bch2_path_free(trans, path); + __bch2_path_free(trans, i); else path->preserve = false; } now = local_clock(); + + if (!IS_ENABLED(CONFIG_BCACHEFS_NO_LATENCY_ACCT) && + time_after64(now, trans->last_begin_time + 10)) + __bch2_time_stats_update(&btree_trans_stats(trans)->duration, + trans->last_begin_time, now); + if (!trans->restarted && (need_resched() || - now - trans->last_begin_time > BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS)) { + time_after64(now, trans->last_begin_time + BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS))) { drop_locks_do(trans, (cond_resched(), 0)); now = local_clock(); } @@ -2942,32 +2923,11 @@ u32 bch2_trans_begin(struct btree_trans *trans) return trans->restart_count; } -static struct btree_trans *bch2_trans_alloc(struct bch_fs *c) -{ - struct btree_trans *trans; - - if (IS_ENABLED(__KERNEL__)) { - trans = this_cpu_xchg(c->btree_trans_bufs->trans, NULL); - if (trans) - return trans; - } - - trans = mempool_alloc(&c->btree_trans_pool, GFP_NOFS); - /* - * paths need to be zeroed, bch2_check_for_deadlock looks at - * paths in other threads - */ - memset(&trans->paths, 0, sizeof(trans->paths)); - return trans; -} - -const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR]; +const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR] = { "(unknown)" }; unsigned bch2_trans_get_fn_idx(const char *fn) { - unsigned i; - - for (i = 0; i < ARRAY_SIZE(bch2_btree_transaction_fns); i++) + for (unsigned i = 0; i < ARRAY_SIZE(bch2_btree_transaction_fns); i++) if (!bch2_btree_transaction_fns[i] || bch2_btree_transaction_fns[i] == fn) { bch2_btree_transaction_fns[i] = fn; @@ -2975,76 +2935,92 @@ unsigned bch2_trans_get_fn_idx(const char *fn) } pr_warn_once("BCH_TRANSACTIONS_NR not big enough!"); - return i; + return 0; } struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx) __acquires(&c->btree_trans_barrier) { struct btree_trans *trans; - struct btree_transaction_stats *s; - trans = bch2_trans_alloc(c); - - memset(trans, 0, sizeof(*trans)); - trans->c = c; - trans->fn = fn_idx < ARRAY_SIZE(bch2_btree_transaction_fns) - ? bch2_btree_transaction_fns[fn_idx] : NULL; - trans->last_begin_time = local_clock(); - trans->fn_idx = fn_idx; - trans->locking_wait.task = current; - trans->journal_replay_not_finished = - unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) && - atomic_inc_not_zero(&c->journal_keys.ref); - closure_init_stack(&trans->ref); - - s = btree_trans_stats(trans); - if (s && s->max_mem) { - unsigned expected_mem_bytes = roundup_pow_of_two(s->max_mem); - - trans->mem = kmalloc(expected_mem_bytes, GFP_KERNEL); - - if (!unlikely(trans->mem)) { - trans->mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL); - trans->mem_bytes = BTREE_TRANS_MEM_MAX; - } else { - trans->mem_bytes = expected_mem_bytes; + if (IS_ENABLED(__KERNEL__)) { + trans = this_cpu_xchg(c->btree_trans_bufs->trans, NULL); + if (trans) { + memset(trans, 0, offsetof(struct btree_trans, list)); + goto got_trans; } } - if (s) { - trans->nr_max_paths = s->nr_max_paths; - trans->wb_updates_size = s->wb_updates_size; - } - - trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); - trans->srcu_lock_time = jiffies; - trans->srcu_held = true; + trans = mempool_alloc(&c->btree_trans_pool, GFP_NOFS); + memset(trans, 0, sizeof(*trans)); + closure_init_stack(&trans->ref); - if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) { + seqmutex_lock(&c->btree_trans_lock); + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { struct btree_trans *pos; + pid_t pid = current->pid; + + trans->locking_wait.task = current; - seqmutex_lock(&c->btree_trans_lock); list_for_each_entry(pos, &c->btree_trans_list, list) { + struct task_struct *pos_task = READ_ONCE(pos->locking_wait.task); /* * We'd much prefer to be stricter here and completely * disallow multiple btree_trans in the same thread - * but the data move path calls bch2_write when we * already have a btree_trans initialized. */ - BUG_ON(trans->locking_wait.task->pid == pos->locking_wait.task->pid && + BUG_ON(pos_task && + pid == pos_task->pid && bch2_trans_locked(pos)); - if (trans->locking_wait.task->pid < pos->locking_wait.task->pid) { + if (pos_task && pid < pos_task->pid) { list_add_tail(&trans->list, &pos->list); goto list_add_done; } } - list_add_tail(&trans->list, &c->btree_trans_list); + } + list_add_tail(&trans->list, &c->btree_trans_list); list_add_done: - seqmutex_unlock(&c->btree_trans_lock); + seqmutex_unlock(&c->btree_trans_lock); +got_trans: + trans->c = c; + trans->last_begin_time = local_clock(); + trans->fn_idx = fn_idx; + trans->locking_wait.task = current; + trans->journal_replay_not_finished = + unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) && + atomic_inc_not_zero(&c->journal_keys.ref); + trans->nr_paths = ARRAY_SIZE(trans->_paths); + trans->paths_allocated = trans->_paths_allocated; + trans->sorted = trans->_sorted; + trans->paths = trans->_paths; + trans->updates = trans->_updates; + + *trans_paths_nr(trans->paths) = BTREE_ITER_INITIAL; + + trans->paths_allocated[0] = 1; + + if (fn_idx < BCH_TRANSACTIONS_NR) { + trans->fn = bch2_btree_transaction_fns[fn_idx]; + + struct btree_transaction_stats *s = &c->btree_transaction_stats[fn_idx]; + + if (s->max_mem) { + unsigned expected_mem_bytes = roundup_pow_of_two(s->max_mem); + + trans->mem = kmalloc(expected_mem_bytes, GFP_KERNEL); + if (likely(trans->mem)) + trans->mem_bytes = expected_mem_bytes; + } + + trans->nr_paths_max = s->nr_max_paths; + trans->journal_entries_size = s->journal_entries_size; } + trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); + trans->srcu_lock_time = jiffies; + trans->srcu_held = true; return trans; } @@ -3053,14 +3029,15 @@ static void check_btree_paths_leaked(struct btree_trans *trans) #ifdef CONFIG_BCACHEFS_DEBUG struct bch_fs *c = trans->c; struct btree_path *path; + unsigned i; - trans_for_each_path(trans, path) + trans_for_each_path(trans, path, i) if (path->ref) goto leaked; return; leaked: bch_err(c, "btree paths leaked from %s!", trans->fn); - trans_for_each_path(trans, path) + trans_for_each_path(trans, path, i) if (path->ref) printk(KERN_ERR " btree %s %pS\n", bch2_btree_id_str(path->btree_id), @@ -3073,26 +3050,14 @@ leaked: void bch2_trans_put(struct btree_trans *trans) __releases(&c->btree_trans_barrier) { - struct btree_insert_entry *i; struct bch_fs *c = trans->c; - struct btree_transaction_stats *s = btree_trans_stats(trans); bch2_trans_unlock(trans); - if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) { - seqmutex_lock(&c->btree_trans_lock); - list_del(&trans->list); - seqmutex_unlock(&c->btree_trans_lock); - } - - closure_sync(&trans->ref); - - if (s) - s->max_mem = max(s->max_mem, trans->mem_max); - trans_for_each_update(trans, i) - __btree_path_put(i->path, true); - trans->nr_updates = 0; + __btree_path_put(trans->paths + i->path, true); + trans->nr_updates = 0; + trans->locking_wait.task = NULL; check_btree_paths_leaked(trans); @@ -3101,8 +3066,6 @@ void bch2_trans_put(struct btree_trans *trans) srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx); } - kfree(trans->extra_journal_entries.data); - if (trans->fs_usage_deltas) { if (trans->fs_usage_deltas->size + sizeof(trans->fs_usage_deltas) == REPLICAS_DELTA_LIST_MAX) @@ -3115,6 +3078,13 @@ void bch2_trans_put(struct btree_trans *trans) if (unlikely(trans->journal_replay_not_finished)) bch2_journal_keys_put(c); + unsigned long *paths_allocated = trans->paths_allocated; + trans->paths_allocated = NULL; + trans->paths = NULL; + + if (paths_allocated != trans->_paths_allocated) + kfree_rcu_mightsleep(paths_allocated); + if (trans->mem_bytes == BTREE_TRANS_MEM_MAX) mempool_free(trans->mem, &c->btree_trans_mem_pool); else @@ -3123,8 +3093,16 @@ void bch2_trans_put(struct btree_trans *trans) /* Userspace doesn't have a real percpu implementation: */ if (IS_ENABLED(__KERNEL__)) trans = this_cpu_xchg(c->btree_trans_bufs->trans, trans); - if (trans) + + if (trans) { + closure_sync(&trans->ref); + + seqmutex_lock(&c->btree_trans_lock); + list_del(&trans->list); + seqmutex_unlock(&c->btree_trans_lock); + mempool_free(trans, &c->btree_trans_pool); + } } static void __maybe_unused @@ -3152,24 +3130,38 @@ bch2_btree_bkey_cached_common_to_text(struct printbuf *out, void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) { - struct btree_path *path; struct btree_bkey_cached_common *b; static char lock_types[] = { 'r', 'i', 'w' }; + struct task_struct *task = READ_ONCE(trans->locking_wait.task); unsigned l, idx; + /* before rcu_read_lock(): */ + bch2_printbuf_make_room(out, 4096); + if (!out->nr_tabstops) { printbuf_tabstop_push(out, 16); printbuf_tabstop_push(out, 32); } - prt_printf(out, "%i %s\n", trans->locking_wait.task->pid, trans->fn); + prt_printf(out, "%i %s\n", task ? task->pid : 0, trans->fn); + + /* trans->paths is rcu protected vs. freeing */ + rcu_read_lock(); + out->atomic++; + + struct btree_path *paths = rcu_dereference(trans->paths); + if (!paths) + goto out; + + unsigned long *paths_allocated = trans_paths_allocated(paths); - trans_for_each_path_safe(trans, path, idx) { + trans_for_each_path_idx_from(paths_allocated, *trans_paths_nr(paths), idx, 1) { + struct btree_path *path = paths + idx; if (!path->nodes_locked) continue; prt_printf(out, " path %u %c l=%u %s:", - path->idx, + idx, path->cached ? 'c' : 'b', path->level, bch2_btree_id_str(path->btree_id)); @@ -3197,6 +3189,9 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) bch2_btree_bkey_cached_common_to_text(out, b); prt_newline(out); } +out: + --out->atomic; + rcu_read_unlock(); } void bch2_fs_btree_iter_exit(struct bch_fs *c) @@ -3205,15 +3200,26 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c) struct btree_trans *trans; int cpu; + if (c->btree_trans_bufs) + for_each_possible_cpu(cpu) { + struct btree_trans *trans = + per_cpu_ptr(c->btree_trans_bufs, cpu)->trans; + + if (trans) { + closure_sync(&trans->ref); + + seqmutex_lock(&c->btree_trans_lock); + list_del(&trans->list); + seqmutex_unlock(&c->btree_trans_lock); + } + kfree(trans); + } + free_percpu(c->btree_trans_bufs); + trans = list_first_entry_or_null(&c->btree_trans_list, struct btree_trans, list); if (trans) panic("%s leaked btree_trans\n", trans->fn); - if (c->btree_trans_bufs) - for_each_possible_cpu(cpu) - kfree(per_cpu_ptr(c->btree_trans_bufs, cpu)->trans); - free_percpu(c->btree_trans_bufs); - for (s = c->btree_transaction_stats; s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats); s++) { @@ -3234,6 +3240,7 @@ void bch2_fs_btree_iter_init_early(struct bch_fs *c) for (s = c->btree_transaction_stats; s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats); s++) { + bch2_time_stats_init(&s->duration); bch2_time_stats_init(&s->lock_hold_times); mutex_init(&s->lock); } diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index eaffced4c132..24772538e4cc 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -63,60 +63,57 @@ static inline void btree_trans_sort_paths(struct btree_trans *trans) __bch2_btree_trans_sort_paths(trans); } -static inline struct btree_path * -__trans_next_path(struct btree_trans *trans, unsigned idx) +static inline unsigned long *trans_paths_nr(struct btree_path *paths) { - u64 l; - - if (idx == BTREE_ITER_MAX) - return NULL; - - l = trans->paths_allocated >> idx; - if (!l) - return NULL; - - idx += __ffs64(l); - EBUG_ON(idx >= BTREE_ITER_MAX); - EBUG_ON(trans->paths[idx].idx != idx); - return &trans->paths[idx]; + return &container_of(paths, struct btree_trans_paths, paths[0])->nr_paths; } -#define trans_for_each_path_from(_trans, _path, _start) \ - for (_path = __trans_next_path((_trans), _start); \ - (_path); \ - _path = __trans_next_path((_trans), (_path)->idx + 1)) - -#define trans_for_each_path(_trans, _path) \ - trans_for_each_path_from(_trans, _path, 0) - -static inline struct btree_path * -__trans_next_path_safe(struct btree_trans *trans, unsigned *idx) +static inline unsigned long *trans_paths_allocated(struct btree_path *paths) { - u64 l; + unsigned long *v = trans_paths_nr(paths); + return v - BITS_TO_LONGS(*v); +} - if (*idx == BTREE_ITER_MAX) - return NULL; +#define trans_for_each_path_idx_from(_paths_allocated, _nr, _idx, _start)\ + for (_idx = _start; \ + (_idx = find_next_bit(_paths_allocated, _nr, _idx)) < _nr; \ + _idx++) - l = trans->paths_allocated >> *idx; - if (!l) - return NULL; +static inline struct btree_path * +__trans_next_path(struct btree_trans *trans, unsigned *idx) +{ + unsigned long *w = trans->paths_allocated + *idx / BITS_PER_LONG; + /* + * Open coded find_next_bit(), because + * - this is fast path, we can't afford the function call + * - and we know that nr_paths is a multiple of BITS_PER_LONG, + */ + while (*idx < trans->nr_paths) { + unsigned long v = *w >> (*idx & (BITS_PER_LONG - 1)); + if (v) { + *idx += __ffs(v); + return trans->paths + *idx; + } + + *idx += BITS_PER_LONG; + *idx &= ~(BITS_PER_LONG - 1); + w++; + } - *idx += __ffs64(l); - EBUG_ON(*idx >= BTREE_ITER_MAX); - return &trans->paths[*idx]; + return NULL; } /* * This version is intended to be safe for use on a btree_trans that is owned by * another thread, for bch2_btree_trans_to_text(); */ -#define trans_for_each_path_safe_from(_trans, _path, _idx, _start) \ +#define trans_for_each_path_from(_trans, _path, _idx, _start) \ for (_idx = _start; \ - (_path = __trans_next_path_safe((_trans), &_idx)); \ + (_path = __trans_next_path((_trans), &_idx)); \ _idx++) -#define trans_for_each_path_safe(_trans, _path, _idx) \ - trans_for_each_path_safe_from(_trans, _path, _idx, 0) +#define trans_for_each_path(_trans, _path, _idx) \ + trans_for_each_path_from(_trans, _path, _idx, 1) static inline struct btree_path *next_btree_path(struct btree_trans *trans, struct btree_path *path) { @@ -138,10 +135,23 @@ static inline struct btree_path *prev_btree_path(struct btree_trans *trans, stru : NULL; } -#define trans_for_each_path_inorder(_trans, _path, _i) \ - for (_i = 0; \ - ((_path) = (_trans)->paths + trans->sorted[_i]), (_i) < (_trans)->nr_sorted;\ - _i++) +#define trans_for_each_path_idx_inorder(_trans, _iter) \ + for (_iter = (struct trans_for_each_path_inorder_iter) { 0 }; \ + (_iter.path_idx = trans->sorted[_iter.sorted_idx], \ + _iter.sorted_idx < (_trans)->nr_sorted); \ + _iter.sorted_idx++) + +struct trans_for_each_path_inorder_iter { + btree_path_idx_t sorted_idx; + btree_path_idx_t path_idx; +}; + +#define trans_for_each_path_inorder(_trans, _path, _iter) \ + for (_iter = (struct trans_for_each_path_inorder_iter) { 0 }; \ + (_iter.path_idx = trans->sorted[_iter.sorted_idx], \ + _path = (_trans)->paths + _iter.path_idx, \ + _iter.sorted_idx < (_trans)->nr_sorted); \ + _iter.sorted_idx++) #define trans_for_each_path_inorder_reverse(_trans, _path, _i) \ for (_i = trans->nr_sorted - 1; \ @@ -157,67 +167,65 @@ static inline bool __path_has_node(const struct btree_path *path, static inline struct btree_path * __trans_next_path_with_node(struct btree_trans *trans, struct btree *b, - unsigned idx) + unsigned *idx) { - struct btree_path *path = __trans_next_path(trans, idx); + struct btree_path *path; - while (path && !__path_has_node(path, b)) - path = __trans_next_path(trans, path->idx + 1); + while ((path = __trans_next_path(trans, idx)) && + !__path_has_node(path, b)) + (*idx)++; return path; } -#define trans_for_each_path_with_node(_trans, _b, _path) \ - for (_path = __trans_next_path_with_node((_trans), (_b), 0); \ - (_path); \ - _path = __trans_next_path_with_node((_trans), (_b), \ - (_path)->idx + 1)) +#define trans_for_each_path_with_node(_trans, _b, _path, _iter) \ + for (_iter = 1; \ + (_path = __trans_next_path_with_node((_trans), (_b), &_iter));\ + _iter++) -struct btree_path *__bch2_btree_path_make_mut(struct btree_trans *, struct btree_path *, - bool, unsigned long); +btree_path_idx_t __bch2_btree_path_make_mut(struct btree_trans *, btree_path_idx_t, + bool, unsigned long); -static inline struct btree_path * __must_check +static inline btree_path_idx_t __must_check bch2_btree_path_make_mut(struct btree_trans *trans, - struct btree_path *path, bool intent, + btree_path_idx_t path, bool intent, unsigned long ip) { - if (path->ref > 1 || path->preserve) + if (trans->paths[path].ref > 1 || + trans->paths[path].preserve) path = __bch2_btree_path_make_mut(trans, path, intent, ip); - path->should_be_locked = false; + trans->paths[path].should_be_locked = false; return path; } -struct btree_path * __must_check -__bch2_btree_path_set_pos(struct btree_trans *, struct btree_path *, - struct bpos, bool, unsigned long, int); +btree_path_idx_t __must_check +__bch2_btree_path_set_pos(struct btree_trans *, btree_path_idx_t, + struct bpos, bool, unsigned long); -static inline struct btree_path * __must_check +static inline btree_path_idx_t __must_check bch2_btree_path_set_pos(struct btree_trans *trans, - struct btree_path *path, struct bpos new_pos, - bool intent, unsigned long ip) + btree_path_idx_t path, struct bpos new_pos, + bool intent, unsigned long ip) { - int cmp = bpos_cmp(new_pos, path->pos); - - return cmp - ? __bch2_btree_path_set_pos(trans, path, new_pos, intent, ip, cmp) + return !bpos_eq(new_pos, trans->paths[path].pos) + ? __bch2_btree_path_set_pos(trans, path, new_pos, intent, ip) : path; } -int __must_check bch2_btree_path_traverse_one(struct btree_trans *, struct btree_path *, +int __must_check bch2_btree_path_traverse_one(struct btree_trans *, + btree_path_idx_t, unsigned, unsigned long); static inline int __must_check bch2_btree_path_traverse(struct btree_trans *trans, - struct btree_path *path, unsigned flags) + btree_path_idx_t path, unsigned flags) { - if (path->uptodate < BTREE_ITER_NEED_RELOCK) + if (trans->paths[path].uptodate < BTREE_ITER_NEED_RELOCK) return 0; return bch2_btree_path_traverse_one(trans, path, flags, _RET_IP_); } -int __must_check bch2_btree_path_traverse(struct btree_trans *, - struct btree_path *, unsigned); -struct btree_path *bch2_path_get(struct btree_trans *, enum btree_id, struct bpos, +btree_path_idx_t bch2_path_get(struct btree_trans *, enum btree_id, struct bpos, unsigned, unsigned, unsigned, unsigned long); struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bkey *); @@ -269,7 +277,7 @@ void bch2_btree_node_iter_fix(struct btree_trans *trans, struct btree_path *, int bch2_btree_path_relock_intent(struct btree_trans *, struct btree_path *); -void bch2_path_put(struct btree_trans *, struct btree_path *, bool); +void bch2_path_put(struct btree_trans *, btree_path_idx_t, bool); int bch2_trans_relock(struct btree_trans *); int bch2_trans_relock_notrace(struct btree_trans *); @@ -335,7 +343,7 @@ static inline void bch2_btree_path_downgrade(struct btree_trans *trans, void bch2_trans_downgrade(struct btree_trans *); -void bch2_trans_node_add(struct btree_trans *trans, struct btree *); +void bch2_trans_node_add(struct btree_trans *trans, struct btree_path *, struct btree *); void bch2_trans_node_reinit_iter(struct btree_trans *, struct btree *); int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter); @@ -348,8 +356,6 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *); struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *, struct bpos); struct bkey_s_c bch2_btree_iter_next(struct btree_iter *); -struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *); - static inline struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) { return bch2_btree_iter_peek_upto(iter, SPOS_MAX); @@ -376,10 +382,12 @@ static inline void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpo static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) { + struct btree_trans *trans = iter->trans; + if (unlikely(iter->update_path)) - bch2_path_put(iter->trans, iter->update_path, + bch2_path_put(trans, iter->update_path, iter->flags & BTREE_ITER_INTENT); - iter->update_path = NULL; + iter->update_path = 0; if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) new_pos.snapshot = iter->snapshot; @@ -408,9 +416,6 @@ static inline unsigned __bch2_btree_iter_flags(struct btree_trans *trans, unsigned btree_id, unsigned flags) { - if (flags & BTREE_ITER_ALL_LEVELS) - flags |= BTREE_ITER_ALL_SNAPSHOTS|__BTREE_ITER_ALL_SNAPSHOTS; - if (!(flags & (BTREE_ITER_ALL_SNAPSHOTS|BTREE_ITER_NOT_EXTENTS)) && btree_id_is_extents(btree_id)) flags |= BTREE_ITER_IS_EXTENTS; @@ -450,14 +455,16 @@ static inline void bch2_trans_iter_init_common(struct btree_trans *trans, unsigned flags, unsigned long ip) { - memset(iter, 0, sizeof(*iter)); - iter->trans = trans; - iter->btree_id = btree_id; - iter->flags = flags; - iter->snapshot = pos.snapshot; - iter->pos = pos; - iter->k.p = pos; - + iter->trans = trans; + iter->update_path = 0; + iter->key_cache_path = 0; + iter->btree_id = btree_id; + iter->min_depth = 0; + iter->flags = flags; + iter->snapshot = pos.snapshot; + iter->pos = pos; + iter->k = POS_KEY(pos); + iter->journal_idx = 0; #ifdef CONFIG_BCACHEFS_DEBUG iter->ip_allocated = ip; #endif @@ -489,8 +496,10 @@ void bch2_trans_copy_iter(struct btree_iter *, struct btree_iter *); static inline void set_btree_iter_dontneed(struct btree_iter *iter) { - if (!iter->trans->restarted) - iter->path->preserve = false; + struct btree_trans *trans = iter->trans; + + if (!trans->restarted) + btree_iter_path(trans, iter)->preserve = false; } void *__bch2_trans_kmalloc(struct btree_trans *, size_t); @@ -512,7 +521,7 @@ static inline void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) static inline void *bch2_trans_kmalloc_nomemzero(struct btree_trans *trans, size_t size) { - size = roundup(size, 8); + size = round_up(size, 8); if (likely(trans->mem_top + size <= trans->mem_bytes)) { void *p = trans->mem + trans->mem_top; @@ -581,7 +590,6 @@ static inline int __bch2_bkey_get_val_typed(struct btree_trans *trans, KEY_TYPE_##_type, sizeof(*_val), _val) void bch2_trans_srcu_unlock(struct btree_trans *); -void bch2_trans_srcu_lock(struct btree_trans *); u32 bch2_trans_begin(struct btree_trans *); @@ -606,8 +614,6 @@ u32 bch2_trans_begin(struct btree_trans *); static inline struct bkey_s_c bch2_btree_iter_peek_prev_type(struct btree_iter *iter, unsigned flags) { - BUG_ON(flags & BTREE_ITER_ALL_LEVELS); - return flags & BTREE_ITER_SLOTS ? bch2_btree_iter_peek_slot(iter) : bch2_btree_iter_peek_prev(iter); } @@ -615,8 +621,7 @@ static inline struct bkey_s_c bch2_btree_iter_peek_prev_type(struct btree_iter * static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_iter *iter, unsigned flags) { - return flags & BTREE_ITER_ALL_LEVELS ? bch2_btree_iter_peek_all_levels(iter) : - flags & BTREE_ITER_SLOTS ? bch2_btree_iter_peek_slot(iter) : + return flags & BTREE_ITER_SLOTS ? bch2_btree_iter_peek_slot(iter) : bch2_btree_iter_peek(iter); } @@ -633,61 +638,34 @@ static inline struct bkey_s_c bch2_btree_iter_peek_upto_type(struct btree_iter * return bch2_btree_iter_peek_slot(iter); } +int __bch2_btree_trans_too_many_iters(struct btree_trans *); + static inline int btree_trans_too_many_iters(struct btree_trans *trans) { - if (hweight64(trans->paths_allocated) > BTREE_ITER_MAX - 8) { - trace_and_count(trans->c, trans_restart_too_many_iters, trans, _THIS_IP_); - return btree_trans_restart(trans, BCH_ERR_transaction_restart_too_many_iters); - } + if (bitmap_weight(trans->paths_allocated, trans->nr_paths) > BTREE_ITER_INITIAL - 8) + return __bch2_btree_trans_too_many_iters(trans); return 0; } -struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *); - -static inline struct bkey_s_c -__bch2_btree_iter_peek_and_restart(struct btree_trans *trans, - struct btree_iter *iter, unsigned flags) -{ - struct bkey_s_c k; - - while (btree_trans_too_many_iters(trans) || - (k = bch2_btree_iter_peek_type(iter, flags), - bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart))) - bch2_trans_begin(trans); - - return k; -} - -static inline struct bkey_s_c -__bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans, - struct btree_iter *iter, - struct bpos end, - unsigned flags) -{ - struct bkey_s_c k; - - while (btree_trans_too_many_iters(trans) || - (k = bch2_btree_iter_peek_upto_type(iter, end, flags), - bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart))) - bch2_trans_begin(trans); - - return k; -} - +/* + * goto instead of loop, so that when used inside for_each_btree_key2() + * break/continue work correctly + */ #define lockrestart_do(_trans, _do) \ ({ \ + __label__ transaction_restart; \ u32 _restart_count; \ int _ret2; \ +transaction_restart: \ + _restart_count = bch2_trans_begin(_trans); \ + _ret2 = (_do); \ \ - do { \ - _restart_count = bch2_trans_begin(_trans); \ - _ret2 = (_do); \ - } while (bch2_err_matches(_ret2, BCH_ERR_transaction_restart)); \ + if (bch2_err_matches(_ret2, BCH_ERR_transaction_restart)) \ + goto transaction_restart; \ \ if (!_ret2) \ bch2_trans_verify_not_restarted(_trans, _restart_count);\ - \ _ret2; \ }) @@ -716,91 +694,56 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans, _ret2 ?: trans_was_restarted(_trans, _restart_count); \ }) -#define for_each_btree_key2(_trans, _iter, _btree_id, \ - _start, _flags, _k, _do) \ +#define for_each_btree_key_upto(_trans, _iter, _btree_id, \ + _start, _end, _flags, _k, _do) \ ({ \ + struct btree_iter _iter; \ + struct bkey_s_c _k; \ int _ret3 = 0; \ \ bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ (_start), (_flags)); \ \ - while (1) { \ - u32 _restart_count = bch2_trans_begin(_trans); \ - \ - _ret3 = 0; \ - (_k) = bch2_btree_iter_peek_type(&(_iter), (_flags)); \ - if (!(_k).k) \ - break; \ + do { \ + _ret3 = lockrestart_do(_trans, ({ \ + (_k) = bch2_btree_iter_peek_upto_type(&(_iter), \ + _end, (_flags)); \ + if (!(_k).k) \ + break; \ \ - _ret3 = bkey_err(_k) ?: (_do); \ - if (bch2_err_matches(_ret3, BCH_ERR_transaction_restart))\ - continue; \ - if (_ret3) \ - break; \ - bch2_trans_verify_not_restarted(_trans, _restart_count);\ - if (!bch2_btree_iter_advance(&(_iter))) \ - break; \ - } \ + bkey_err(_k) ?: (_do); \ + })); \ + } while (!_ret3 && bch2_btree_iter_advance(&(_iter))); \ \ bch2_trans_iter_exit((_trans), &(_iter)); \ _ret3; \ }) -#define for_each_btree_key2_upto(_trans, _iter, _btree_id, \ - _start, _end, _flags, _k, _do) \ -({ \ - int _ret3 = 0; \ - \ - bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ - (_start), (_flags)); \ - \ - while (1) { \ - u32 _restart_count = bch2_trans_begin(_trans); \ - \ - _ret3 = 0; \ - (_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, (_flags));\ - if (!(_k).k) \ - break; \ - \ - _ret3 = bkey_err(_k) ?: (_do); \ - if (bch2_err_matches(_ret3, BCH_ERR_transaction_restart))\ - continue; \ - if (_ret3) \ - break; \ - bch2_trans_verify_not_restarted(_trans, _restart_count);\ - if (!bch2_btree_iter_advance(&(_iter))) \ - break; \ - } \ - \ - bch2_trans_iter_exit((_trans), &(_iter)); \ - _ret3; \ -}) +#define for_each_btree_key(_trans, _iter, _btree_id, \ + _start, _flags, _k, _do) \ + for_each_btree_key_upto(_trans, _iter, _btree_id, _start, \ + SPOS_MAX, _flags, _k, _do) #define for_each_btree_key_reverse(_trans, _iter, _btree_id, \ _start, _flags, _k, _do) \ ({ \ + struct btree_iter _iter; \ + struct bkey_s_c _k; \ int _ret3 = 0; \ \ bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ (_start), (_flags)); \ \ - while (1) { \ - u32 _restart_count = bch2_trans_begin(_trans); \ - (_k) = bch2_btree_iter_peek_prev_type(&(_iter), (_flags));\ - if (!(_k).k) { \ - _ret3 = 0; \ - break; \ - } \ + do { \ + _ret3 = lockrestart_do(_trans, ({ \ + (_k) = bch2_btree_iter_peek_prev_type(&(_iter), \ + (_flags)); \ + if (!(_k).k) \ + break; \ \ - _ret3 = bkey_err(_k) ?: (_do); \ - if (bch2_err_matches(_ret3, BCH_ERR_transaction_restart))\ - continue; \ - if (_ret3) \ - break; \ - bch2_trans_verify_not_restarted(_trans, _restart_count);\ - if (!bch2_btree_iter_rewind(&(_iter))) \ - break; \ - } \ + bkey_err(_k) ?: (_do); \ + })); \ + } while (!_ret3 && bch2_btree_iter_rewind(&(_iter))); \ \ bch2_trans_iter_exit((_trans), &(_iter)); \ _ret3; \ @@ -810,7 +753,7 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans, _start, _iter_flags, _k, \ _disk_res, _journal_seq, _commit_flags,\ _do) \ - for_each_btree_key2(_trans, _iter, _btree_id, _start, _iter_flags, _k,\ + for_each_btree_key(_trans, _iter, _btree_id, _start, _iter_flags, _k,\ (_do) ?: bch2_trans_commit(_trans, (_disk_res),\ (_journal_seq), (_commit_flags))) @@ -826,32 +769,31 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans, _start, _end, _iter_flags, _k, \ _disk_res, _journal_seq, _commit_flags,\ _do) \ - for_each_btree_key2_upto(_trans, _iter, _btree_id, _start, _end, _iter_flags, _k,\ + for_each_btree_key_upto(_trans, _iter, _btree_id, _start, _end, _iter_flags, _k,\ (_do) ?: bch2_trans_commit(_trans, (_disk_res),\ (_journal_seq), (_commit_flags))) -#define for_each_btree_key(_trans, _iter, _btree_id, \ - _start, _flags, _k, _ret) \ - for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ - (_start), (_flags)); \ - (_k) = __bch2_btree_iter_peek_and_restart((_trans), &(_iter), _flags),\ - !((_ret) = bkey_err(_k)) && (_k).k; \ - bch2_btree_iter_advance(&(_iter))) +struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *); -#define for_each_btree_key_upto(_trans, _iter, _btree_id, \ - _start, _end, _flags, _k, _ret) \ - for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ - (_start), (_flags)); \ - (_k) = __bch2_btree_iter_peek_upto_and_restart((_trans), \ - &(_iter), _end, _flags),\ - !((_ret) = bkey_err(_k)) && (_k).k; \ - bch2_btree_iter_advance(&(_iter))) +static inline struct bkey_s_c +__bch2_btree_iter_peek_and_restart(struct btree_trans *trans, + struct btree_iter *iter, unsigned flags) +{ + struct bkey_s_c k; -#define for_each_btree_key_norestart(_trans, _iter, _btree_id, \ + while (btree_trans_too_many_iters(trans) || + (k = bch2_btree_iter_peek_type(iter, flags), + bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart))) + bch2_trans_begin(trans); + + return k; +} + +#define for_each_btree_key_old(_trans, _iter, _btree_id, \ _start, _flags, _k, _ret) \ for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ (_start), (_flags)); \ - (_k) = bch2_btree_iter_peek_type(&(_iter), _flags), \ + (_k) = __bch2_btree_iter_peek_and_restart((_trans), &(_iter), _flags),\ !((_ret) = bkey_err(_k)) && (_k).k; \ bch2_btree_iter_advance(&(_iter))) @@ -863,24 +805,25 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans, !((_ret) = bkey_err(_k)) && (_k).k; \ bch2_btree_iter_advance(&(_iter))) -#define for_each_btree_key_continue(_trans, _iter, _flags, _k, _ret) \ - for (; \ - (_k) = __bch2_btree_iter_peek_and_restart((_trans), &(_iter), _flags),\ - !((_ret) = bkey_err(_k)) && (_k).k; \ - bch2_btree_iter_advance(&(_iter))) - -#define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret) \ - for (; \ - (_k) = bch2_btree_iter_peek_type(&(_iter), _flags), \ - !((_ret) = bkey_err(_k)) && (_k).k; \ - bch2_btree_iter_advance(&(_iter))) - #define for_each_btree_key_upto_continue_norestart(_iter, _end, _flags, _k, _ret)\ for (; \ (_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, _flags), \ !((_ret) = bkey_err(_k)) && (_k).k; \ bch2_btree_iter_advance(&(_iter))) +#define for_each_btree_key_norestart(_trans, _iter, _btree_id, \ + _start, _flags, _k, _ret) \ + for_each_btree_key_upto_norestart(_trans, _iter, _btree_id, _start,\ + SPOS_MAX, _flags, _k, _ret) + +#define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret) \ + for_each_btree_key_upto_continue_norestart(_iter, SPOS_MAX, _flags, _k, _ret) + +/* + * This should not be used in a fastpath, without first trying _do in + * nonblocking mode - it will cause excessive transaction restarts and + * potentially livelocking: + */ #define drop_locks_do(_trans, _do) \ ({ \ bch2_trans_unlock(_trans); \ @@ -912,10 +855,7 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans, _p; \ }) -/* new multiple iterator interface: */ - void bch2_trans_updates_to_text(struct printbuf *, struct btree_trans *); -void bch2_btree_path_to_text(struct printbuf *, struct btree_path *); void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *); void bch2_dump_trans_updates(struct btree_trans *); void bch2_dump_trans_paths_updates(struct btree_trans *); diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c index ec52f50d249d..719a94a84950 100644 --- a/fs/bcachefs/btree_journal_iter.c +++ b/fs/bcachefs/btree_journal_iter.c @@ -73,6 +73,7 @@ static size_t bch2_journal_key_search(struct journal_keys *keys, return idx_to_pos(keys, __bch2_journal_key_search(keys, id, level, pos)); } +/* Returns first non-overwritten key >= search key: */ struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree_id, unsigned level, struct bpos pos, struct bpos end_pos, size_t *idx) @@ -86,12 +87,26 @@ search: if (!*idx) *idx = __bch2_journal_key_search(keys, btree_id, level, pos); + while (*idx && + __journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx - 1)) <= 0) { + --(*idx); + iters++; + if (iters == 10) { + *idx = 0; + goto search; + } + } + while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) { if (__journal_key_cmp(btree_id, level, end_pos, k) < 0) return NULL; - if (__journal_key_cmp(btree_id, level, pos, k) <= 0 && - !k->overwritten) + if (k->overwritten) { + (*idx)++; + continue; + } + + if (__journal_key_cmp(btree_id, level, pos, k) <= 0) return k->k; (*idx)++; @@ -162,7 +177,7 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, struct journal_keys *keys = &c->journal_keys; size_t idx = bch2_journal_key_search(keys, id, level, k->k.p); - BUG_ON(test_bit(BCH_FS_RW, &c->flags)); + BUG_ON(test_bit(BCH_FS_rw, &c->flags)); if (idx < keys->size && journal_key_cmp(&n, &keys->d[idx]) == 0) { @@ -452,9 +467,7 @@ static void __journal_keys_sort(struct journal_keys *keys) src = dst = keys->d; while (src < keys->d + keys->nr) { while (src + 1 < keys->d + keys->nr && - src[0].btree_id == src[1].btree_id && - src[0].level == src[1].level && - bpos_eq(src[0].k->k.p, src[1].k->k.p)) + !journal_key_cmp(src, src + 1)) src++; *dst++ = *src++; diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 1b7a5668df7c..74e52fd28abe 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -630,7 +630,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, if (ret) goto out; - ck = (void *) c_iter.path->l[0].b; + ck = (void *) btree_iter_path(trans, &c_iter)->l[0].b; if (!ck) goto out; @@ -645,22 +645,29 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, if (journal_seq && ck->journal.seq != journal_seq) goto out; + trans->journal_res.seq = ck->journal.seq; + /* - * Since journal reclaim depends on us making progress here, and the - * allocator/copygc depend on journal reclaim making progress, we need - * to be using alloc reserves: + * If we're at the end of the journal, we really want to free up space + * in the journal right away - we don't want to pin that old journal + * sequence number with a new btree node write, we want to re-journal + * the update */ + if (ck->journal.seq == journal_last_seq(j)) + commit_flags |= BCH_WATERMARK_reclaim; + + if (ck->journal.seq != journal_last_seq(j) || + j->watermark == BCH_WATERMARK_stripe) + commit_flags |= BCH_TRANS_COMMIT_no_journal_res; + ret = bch2_btree_iter_traverse(&b_iter) ?: bch2_trans_update(trans, &b_iter, ck->k, BTREE_UPDATE_KEY_CACHE_RECLAIM| BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| BTREE_TRIGGER_NORUN) ?: bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOCHECK_RW| - BTREE_INSERT_NOFAIL| - (ck->journal.seq == journal_last_seq(j) - ? BCH_WATERMARK_reclaim - : 0)| + BCH_TRANS_COMMIT_no_check_rw| + BCH_TRANS_COMMIT_no_enospc| commit_flags); bch2_fs_fatal_err_on(ret && @@ -673,7 +680,8 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, bch2_journal_pin_drop(j, &ck->journal); - BUG_ON(!btree_node_locked(c_iter.path, 0)); + struct btree_path *path = btree_iter_path(trans, &c_iter); + BUG_ON(!btree_node_locked(path, 0)); if (!evict) { if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { @@ -682,19 +690,20 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, } } else { struct btree_path *path2; + unsigned i; evict: - trans_for_each_path(trans, path2) - if (path2 != c_iter.path) + trans_for_each_path(trans, path2, i) + if (path2 != path) __bch2_btree_path_unlock(trans, path2); - bch2_btree_node_lock_write_nofail(trans, c_iter.path, &ck->c); + bch2_btree_node_lock_write_nofail(trans, path, &ck->c); if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { clear_bit(BKEY_CACHED_DIRTY, &ck->flags); atomic_long_dec(&c->btree_key_cache.nr_dirty); } - mark_btree_node_locked_noreset(c_iter.path, 0, BTREE_NODE_UNLOCKED); + mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED); bkey_cached_evict(&c->btree_key_cache, ck); bkey_cached_free_fast(&c->btree_key_cache, ck); } @@ -732,9 +741,9 @@ int bch2_btree_key_cache_journal_flush(struct journal *j, } six_unlock_read(&ck->c.lock); - ret = commit_do(trans, NULL, NULL, 0, + ret = lockrestart_do(trans, btree_key_cache_flush_pos(trans, key, seq, - BTREE_INSERT_JOURNAL_RECLAIM, false)); + BCH_TRANS_COMMIT_journal_reclaim, false)); unlock: srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); @@ -742,28 +751,12 @@ unlock: return ret; } -/* - * Flush and evict a key from the key cache: - */ -int bch2_btree_key_cache_flush(struct btree_trans *trans, - enum btree_id id, struct bpos pos) -{ - struct bch_fs *c = trans->c; - struct bkey_cached_key key = { id, pos }; - - /* Fastpath - assume it won't be found: */ - if (!bch2_btree_key_cache_find(c, id, pos)) - return 0; - - return btree_key_cache_flush_pos(trans, key, 0, 0, true); -} - bool bch2_btree_insert_key_cached(struct btree_trans *trans, unsigned flags, struct btree_insert_entry *insert_entry) { struct bch_fs *c = trans->c; - struct bkey_cached *ck = (void *) insert_entry->path->l[0].b; + struct bkey_cached *ck = (void *) (trans->paths + insert_entry->path)->l[0].b; struct bkey_i *insert = insert_entry->k; bool kick_reclaim = false; @@ -773,7 +766,7 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans, ck->valid = true; if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { - EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags)); + EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags)); set_bit(BKEY_CACHED_DIRTY, &ck->flags); atomic_long_inc(&c->btree_key_cache.nr_dirty); @@ -1000,7 +993,7 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) if (atomic_long_read(&bc->nr_dirty) && !bch2_journal_error(&c->journal) && - test_bit(BCH_FS_WAS_RW, &c->flags)) + test_bit(BCH_FS_was_rw, &c->flags)) panic("btree key cache shutdown error: nr_dirty nonzero (%li)\n", atomic_long_read(&bc->nr_dirty)); diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h index be3acde2caa0..e6b2cd0dd2c1 100644 --- a/fs/bcachefs/btree_key_cache.h +++ b/fs/bcachefs/btree_key_cache.h @@ -31,8 +31,6 @@ int bch2_btree_path_traverse_cached(struct btree_trans *, struct btree_path *, bool bch2_btree_insert_key_cached(struct btree_trans *, unsigned, struct btree_insert_entry *); -int bch2_btree_key_cache_flush(struct btree_trans *, - enum btree_id, struct bpos); void bch2_btree_key_cache_drop(struct btree_trans *, struct btree_path *); diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index 3d48834d091f..684397442338 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -32,13 +32,14 @@ struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *trans, { struct btree_path *path; struct six_lock_count ret; + unsigned i; memset(&ret, 0, sizeof(ret)); if (IS_ERR_OR_NULL(b)) return ret; - trans_for_each_path(trans, path) + trans_for_each_path(trans, path, i) if (path != skip && &path->l[level].b->c == b) { int t = btree_node_locked_type(path, level); @@ -85,8 +86,14 @@ static noinline void print_cycle(struct printbuf *out, struct lock_graph *g) prt_printf(out, "Found lock cycle (%u entries):", g->nr); prt_newline(out); - for (i = g->g; i < g->g + g->nr; i++) + for (i = g->g; i < g->g + g->nr; i++) { + struct task_struct *task = READ_ONCE(i->trans->locking_wait.task); + if (!task) + continue; + bch2_btree_trans_to_text(out, i->trans); + bch2_prt_task_backtrace(out, task, i == g->g ? 5 : 1, GFP_NOWAIT); + } } static noinline void print_chain(struct printbuf *out, struct lock_graph *g) @@ -94,9 +101,10 @@ static noinline void print_chain(struct printbuf *out, struct lock_graph *g) struct trans_waiting_for_lock *i; for (i = g->g; i != g->g + g->nr; i++) { + struct task_struct *task = i->trans->locking_wait.task; if (i != g->g) prt_str(out, "<- "); - prt_printf(out, "%u ", i->trans->locking_wait.task->pid); + prt_printf(out, "%u ", task ?task->pid : 0); } prt_newline(out); } @@ -142,10 +150,27 @@ static bool lock_graph_remove_non_waiters(struct lock_graph *g) return false; } +static void trace_would_deadlock(struct lock_graph *g, struct btree_trans *trans) +{ + struct bch_fs *c = trans->c; + + count_event(c, trans_restart_would_deadlock); + + if (trace_trans_restart_would_deadlock_enabled()) { + struct printbuf buf = PRINTBUF; + + buf.atomic++; + print_cycle(&buf, g); + + trace_trans_restart_would_deadlock(trans, buf.buf); + printbuf_exit(&buf); + } +} + static int abort_lock(struct lock_graph *g, struct trans_waiting_for_lock *i) { if (i == g->g) { - trace_and_count(i->trans->c, trans_restart_would_deadlock, i->trans, _RET_IP_); + trace_would_deadlock(g, i->trans); return btree_trans_restart(i->trans, BCH_ERR_transaction_restart_would_deadlock); } else { i->trans->lock_must_abort = true; @@ -202,7 +227,7 @@ static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle) prt_printf(&buf, "backtrace:"); prt_newline(&buf); printbuf_indent_add(&buf, 2); - bch2_prt_task_backtrace(&buf, trans->locking_wait.task); + bch2_prt_task_backtrace(&buf, trans->locking_wait.task, 2, GFP_NOWAIT); printbuf_indent_sub(&buf, 2); prt_newline(&buf); } @@ -262,27 +287,40 @@ int bch2_check_for_deadlock(struct btree_trans *trans, struct printbuf *cycle) struct lock_graph g; struct trans_waiting_for_lock *top; struct btree_bkey_cached_common *b; - struct btree_path *path; - unsigned path_idx; - int ret; + btree_path_idx_t path_idx; + int ret = 0; + + g.nr = 0; if (trans->lock_must_abort) { if (cycle) return -1; - trace_and_count(trans->c, trans_restart_would_deadlock, trans, _RET_IP_); + trace_would_deadlock(&g, trans); return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock); } - g.nr = 0; lock_graph_down(&g, trans); + + /* trans->paths is rcu protected vs. freeing */ + rcu_read_lock(); + if (cycle) + cycle->atomic++; next: if (!g.nr) - return 0; + goto out; top = &g.g[g.nr - 1]; - trans_for_each_path_safe_from(top->trans, path, path_idx, top->path_idx) { + struct btree_path *paths = rcu_dereference(top->trans->paths); + if (!paths) + goto up; + + unsigned long *paths_allocated = trans_paths_allocated(paths); + + trans_for_each_path_idx_from(paths_allocated, *trans_paths_nr(paths), + path_idx, top->path_idx) { + struct btree_path *path = paths + path_idx; if (!path->nodes_locked) continue; @@ -348,18 +386,23 @@ next: ret = lock_graph_descend(&g, trans, cycle); if (ret) - return ret; + goto out; goto next; } raw_spin_unlock(&b->lock.wait_lock); } } - +up: if (g.nr > 1 && cycle) print_chain(cycle, &g); lock_graph_up(&g); goto next; +out: + if (cycle) + --cycle->atomic; + rcu_read_unlock(); + return ret; } int bch2_six_check_for_deadlock(struct six_lock *lock, void *p) @@ -398,7 +441,7 @@ void bch2_btree_node_lock_write_nofail(struct btree_trans *trans, struct btree_bkey_cached_common *b) { struct btree_path *linked; - unsigned i; + unsigned i, iter; int ret; /* @@ -412,7 +455,7 @@ void bch2_btree_node_lock_write_nofail(struct btree_trans *trans, * already taken are no longer needed: */ - trans_for_each_path(trans, linked) { + trans_for_each_path(trans, linked, iter) { if (!linked->nodes_locked) continue; @@ -588,8 +631,7 @@ int bch2_btree_path_relock_intent(struct btree_trans *trans, } __flatten -bool bch2_btree_path_relock_norestart(struct btree_trans *trans, - struct btree_path *path, unsigned long trace_ip) +bool bch2_btree_path_relock_norestart(struct btree_trans *trans, struct btree_path *path) { struct get_locks_fail f; @@ -599,7 +641,7 @@ bool bch2_btree_path_relock_norestart(struct btree_trans *trans, int __bch2_btree_path_relock(struct btree_trans *trans, struct btree_path *path, unsigned long trace_ip) { - if (!bch2_btree_path_relock_norestart(trans, path, trace_ip)) { + if (!bch2_btree_path_relock_norestart(trans, path)) { trace_and_count(trans->c, trans_restart_relock_path, trans, trace_ip, path); return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path); } @@ -624,8 +666,6 @@ bool __bch2_btree_path_upgrade(struct btree_trans *trans, unsigned new_locks_want, struct get_locks_fail *f) { - struct btree_path *linked; - if (bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want, f)) return true; @@ -648,8 +688,11 @@ bool __bch2_btree_path_upgrade(struct btree_trans *trans, * before interior nodes - now that's handled by * bch2_btree_path_traverse_all(). */ - if (!path->cached && !trans->in_traverse_all) - trans_for_each_path(trans, linked) + if (!path->cached && !trans->in_traverse_all) { + struct btree_path *linked; + unsigned i; + + trans_for_each_path(trans, linked, i) if (linked != path && linked->cached == path->cached && linked->btree_id == path->btree_id && @@ -657,6 +700,7 @@ bool __bch2_btree_path_upgrade(struct btree_trans *trans, linked->locks_want = new_locks_want; btree_path_get_locks(trans, linked, true, NULL); } + } return false; } @@ -665,7 +709,7 @@ void __bch2_btree_path_downgrade(struct btree_trans *trans, struct btree_path *path, unsigned new_locks_want) { - unsigned l; + unsigned l, old_locks_want = path->locks_want; if (trans->restarted) return; @@ -689,8 +733,7 @@ void __bch2_btree_path_downgrade(struct btree_trans *trans, bch2_btree_path_verify_locks(path); - path->downgrade_seq++; - trace_path_downgrade(trans, _RET_IP_, path); + trace_path_downgrade(trans, _RET_IP_, path, old_locks_want); } /* Btree transaction locking: */ @@ -698,40 +741,70 @@ void __bch2_btree_path_downgrade(struct btree_trans *trans, void bch2_trans_downgrade(struct btree_trans *trans) { struct btree_path *path; + unsigned i; if (trans->restarted) return; - trans_for_each_path(trans, path) + trans_for_each_path(trans, path, i) bch2_btree_path_downgrade(trans, path); } int bch2_trans_relock(struct btree_trans *trans) { struct btree_path *path; + unsigned i; if (unlikely(trans->restarted)) return -((int) trans->restarted); - trans_for_each_path(trans, path) + trans_for_each_path(trans, path, i) { + struct get_locks_fail f; + if (path->should_be_locked && - !bch2_btree_path_relock_norestart(trans, path, _RET_IP_)) { - trace_and_count(trans->c, trans_restart_relock, trans, _RET_IP_, path); + !btree_path_get_locks(trans, path, false, &f)) { + if (trace_trans_restart_relock_enabled()) { + struct printbuf buf = PRINTBUF; + + bch2_bpos_to_text(&buf, path->pos); + prt_printf(&buf, " l=%u seq=%u node seq=", + f.l, path->l[f.l].lock_seq); + if (IS_ERR_OR_NULL(f.b)) { + prt_str(&buf, bch2_err_str(PTR_ERR(f.b))); + } else { + prt_printf(&buf, "%u", f.b->c.lock.seq); + + struct six_lock_count c = + bch2_btree_node_lock_counts(trans, NULL, &f.b->c, f.l); + prt_printf(&buf, " self locked %u.%u.%u", c.n[0], c.n[1], c.n[2]); + + c = six_lock_counts(&f.b->c.lock); + prt_printf(&buf, " total locked %u.%u.%u", c.n[0], c.n[1], c.n[2]); + } + + trace_trans_restart_relock(trans, _RET_IP_, buf.buf); + printbuf_exit(&buf); + } + + count_event(trans->c, trans_restart_relock); return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock); } + } + return 0; } int bch2_trans_relock_notrace(struct btree_trans *trans) { struct btree_path *path; + unsigned i; if (unlikely(trans->restarted)) return -((int) trans->restarted); - trans_for_each_path(trans, path) + trans_for_each_path(trans, path, i) if (path->should_be_locked && - !bch2_btree_path_relock_norestart(trans, path, _RET_IP_)) { + !bch2_btree_path_relock_norestart(trans, path)) { return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock); } return 0; @@ -740,16 +813,18 @@ int bch2_trans_relock_notrace(struct btree_trans *trans) void bch2_trans_unlock_noassert(struct btree_trans *trans) { struct btree_path *path; + unsigned i; - trans_for_each_path(trans, path) + trans_for_each_path(trans, path, i) __bch2_btree_path_unlock(trans, path); } void bch2_trans_unlock(struct btree_trans *trans) { struct btree_path *path; + unsigned i; - trans_for_each_path(trans, path) + trans_for_each_path(trans, path, i) __bch2_btree_path_unlock(trans, path); } @@ -762,8 +837,9 @@ void bch2_trans_unlock_long(struct btree_trans *trans) bool bch2_trans_locked(struct btree_trans *trans) { struct btree_path *path; + unsigned i; - trans_for_each_path(trans, path) + trans_for_each_path(trans, path, i) if (path->nodes_locked) return true; return false; @@ -809,8 +885,9 @@ void bch2_btree_path_verify_locks(struct btree_path *path) void bch2_trans_verify_locks(struct btree_trans *trans) { struct btree_path *path; + unsigned i; - trans_for_each_path(trans, path) + trans_for_each_path(trans, path, i) bch2_btree_path_verify_locks(path); } diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index 11b0a2c8cd69..4bd72c855da1 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -122,12 +122,9 @@ static void btree_trans_lock_hold_time_update(struct btree_trans *trans, struct btree_path *path, unsigned level) { #ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS - struct btree_transaction_stats *s = btree_trans_stats(trans); - - if (s) - __bch2_time_stats_update(&s->lock_hold_times, - path->l[level].lock_taken_time, - local_clock()); + __bch2_time_stats_update(&btree_trans_stats(trans)->lock_hold_times, + path->l[level].lock_taken_time, + local_clock()); #endif } @@ -175,6 +172,7 @@ bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_pat struct btree *b) { struct btree_path *linked; + unsigned i; EBUG_ON(path->l[b->c.level].b != b); EBUG_ON(path->l[b->c.level].lock_seq != six_lock_seq(&b->c.lock)); @@ -182,7 +180,7 @@ bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_pat mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED); - trans_for_each_path_with_node(trans, b, linked) + trans_for_each_path_with_node(trans, b, linked, i) linked->l[b->c.level].lock_seq++; six_unlock_write(&b->c.lock); @@ -242,8 +240,9 @@ static inline bool btree_node_lock_increment(struct btree_trans *trans, enum btree_node_locked_type want) { struct btree_path *path; + unsigned i; - trans_for_each_path(trans, path) + trans_for_each_path(trans, path, i) if (&path->l[level].b->c == b && btree_node_locked_type(path, level) >= want) { six_lock_increment(&b->lock, (enum six_lock_type) want); @@ -263,7 +262,6 @@ static inline int btree_node_lock(struct btree_trans *trans, int ret = 0; EBUG_ON(level >= BTREE_MAX_DEPTH); - EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx))); if (likely(six_trylock_type(&b->lock, type)) || btree_node_lock_increment(trans, b, level, (enum btree_node_locked_type) type) || @@ -314,8 +312,7 @@ void bch2_btree_node_lock_write_nofail(struct btree_trans *, /* relock: */ -bool bch2_btree_path_relock_norestart(struct btree_trans *, - struct btree_path *, unsigned long); +bool bch2_btree_path_relock_norestart(struct btree_trans *, struct btree_path *); int __bch2_btree_path_relock(struct btree_trans *, struct btree_path *, unsigned long); @@ -355,12 +352,6 @@ static inline bool bch2_btree_node_relock_notrace(struct btree_trans *trans, /* upgrade */ - -struct get_locks_fail { - unsigned l; - struct btree *b; -}; - bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *, struct btree_path *, unsigned, struct get_locks_fail *); diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 12907beda98c..30d69a6d133e 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -12,6 +12,7 @@ #include "errcode.h" #include "error.h" #include "journal.h" +#include "journal_io.h" #include "journal_reclaim.h" #include "replicas.h" #include "snapshot.h" @@ -23,7 +24,7 @@ static void verify_update_old_key(struct btree_trans *trans, struct btree_insert #ifdef CONFIG_BCACHEFS_DEBUG struct bch_fs *c = trans->c; struct bkey u; - struct bkey_s_c k = bch2_btree_path_peek_slot_exact(i->path, &u); + struct bkey_s_c k = bch2_btree_path_peek_slot_exact(trans->paths + i->path, &u); if (unlikely(trans->journal_replay_not_finished)) { struct bkey_i *j_k = @@ -41,23 +42,23 @@ static void verify_update_old_key(struct btree_trans *trans, struct btree_insert #endif } -static inline struct btree_path_level *insert_l(struct btree_insert_entry *i) +static inline struct btree_path_level *insert_l(struct btree_trans *trans, struct btree_insert_entry *i) { - return i->path->l + i->level; + return (trans->paths + i->path)->l + i->level; } static inline bool same_leaf_as_prev(struct btree_trans *trans, struct btree_insert_entry *i) { return i != trans->updates && - insert_l(&i[0])->b == insert_l(&i[-1])->b; + insert_l(trans, &i[0])->b == insert_l(trans, &i[-1])->b; } static inline bool same_leaf_as_next(struct btree_trans *trans, struct btree_insert_entry *i) { return i + 1 < trans->updates + trans->nr_updates && - insert_l(&i[0])->b == insert_l(&i[1])->b; + insert_l(trans, &i[0])->b == insert_l(trans, &i[1])->b; } inline void bch2_btree_node_prep_for_write(struct btree_trans *trans, @@ -84,7 +85,7 @@ static noinline int trans_lock_write_fail(struct btree_trans *trans, struct btre if (same_leaf_as_prev(trans, i)) continue; - bch2_btree_node_unlock_write(trans, i->path, insert_l(i)->b); + bch2_btree_node_unlock_write(trans, trans->paths + i->path, insert_l(trans, i)->b); } trace_and_count(trans->c, trans_restart_would_deadlock_write, trans); @@ -93,19 +94,17 @@ static noinline int trans_lock_write_fail(struct btree_trans *trans, struct btre static inline int bch2_trans_lock_write(struct btree_trans *trans) { - struct btree_insert_entry *i; - EBUG_ON(trans->write_locked); trans_for_each_update(trans, i) { if (same_leaf_as_prev(trans, i)) continue; - if (bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c)) + if (bch2_btree_node_lock_write(trans, trans->paths + i->path, &insert_l(trans, i)->b->c)) return trans_lock_write_fail(trans, i); if (!i->cached) - bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b); + bch2_btree_node_prep_for_write(trans, trans->paths + i->path, insert_l(trans, i)->b); } trans->write_locked = true; @@ -115,12 +114,10 @@ static inline int bch2_trans_lock_write(struct btree_trans *trans) static inline void bch2_trans_unlock_write(struct btree_trans *trans) { if (likely(trans->write_locked)) { - struct btree_insert_entry *i; - trans_for_each_update(trans, i) if (!same_leaf_as_prev(trans, i)) - bch2_btree_node_unlock_write_inlined(trans, i->path, - insert_l(i)->b); + bch2_btree_node_unlock_write_inlined(trans, + trans->paths + i->path, insert_l(trans, i)->b); trans->write_locked = false; } } @@ -142,8 +139,7 @@ bool bch2_btree_bset_insert_key(struct btree_trans *trans, EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k)); EBUG_ON(bpos_lt(insert->k.p, b->data->min_key)); EBUG_ON(bpos_gt(insert->k.p, b->data->max_key)); - EBUG_ON(insert->k.u64s > - bch_btree_keys_u64s_remaining(trans->c, b)); + EBUG_ON(insert->k.u64s > bch2_btree_keys_u64s_remaining(b)); EBUG_ON(!b->c.level && !bpos_eq(insert->k.p, path->pos)); k = bch2_btree_node_iter_peek_all(node_iter, b); @@ -163,7 +159,7 @@ bool bch2_btree_bset_insert_key(struct btree_trans *trans, k->type = KEY_TYPE_deleted; if (k->needs_whiteout) - push_whiteout(trans->c, b, insert->k.p); + push_whiteout(b, insert->k.p); k->needs_whiteout = false; if (k >= btree_bset_last(b)->start) { @@ -287,7 +283,7 @@ inline void bch2_btree_insert_key_leaf(struct btree_trans *trans, bch2_btree_add_journal_pin(c, b, journal_seq); if (unlikely(!btree_node_dirty(b))) { - EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags)); + EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags)); set_btree_node_dirty_acct(c, b); } @@ -311,10 +307,12 @@ inline void bch2_btree_insert_key_leaf(struct btree_trans *trans, static inline void btree_insert_entry_checks(struct btree_trans *trans, struct btree_insert_entry *i) { - BUG_ON(!bpos_eq(i->k->k.p, i->path->pos)); - BUG_ON(i->cached != i->path->cached); - BUG_ON(i->level != i->path->level); - BUG_ON(i->btree_id != i->path->btree_id); + struct btree_path *path = trans->paths + i->path; + + BUG_ON(!bpos_eq(i->k->k.p, path->pos)); + BUG_ON(i->cached != path->cached); + BUG_ON(i->level != path->level); + BUG_ON(i->btree_id != path->btree_id); EBUG_ON(!i->level && btree_type_has_snapshots(i->btree_id) && !(i->flags & BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) && @@ -349,9 +347,7 @@ static noinline void journal_transaction_name(struct btree_trans *trans) static inline int btree_key_can_insert(struct btree_trans *trans, struct btree *b, unsigned u64s) { - struct bch_fs *c = trans->c; - - if (!bch2_btree_node_insert_fits(c, b, u64s)) + if (!bch2_btree_node_insert_fits(b, u64s)) return -BCH_ERR_btree_insert_btree_node_full; return 0; @@ -361,8 +357,6 @@ noinline static int btree_key_can_insert_cached_slowpath(struct btree_trans *trans, unsigned flags, struct btree_path *path, unsigned new_u64s) { - struct bch_fs *c = trans->c; - struct btree_insert_entry *i; struct bkey_cached *ck = (void *) path->l[0].b; struct bkey_i *new_k; int ret; @@ -372,7 +366,7 @@ btree_key_can_insert_cached_slowpath(struct btree_trans *trans, unsigned flags, new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL); if (!new_k) { - bch_err(c, "error allocating memory for key cache key, btree %s u64s %u", + bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u", bch2_btree_id_str(path->btree_id), new_u64s); return -BCH_ERR_ENOMEM_btree_key_cache_insert; } @@ -401,7 +395,6 @@ static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags { struct bch_fs *c = trans->c; struct bkey_cached *ck = (void *) path->l[0].b; - struct btree_insert_entry *i; unsigned new_u64s; struct bkey_i *new_k; @@ -409,7 +402,7 @@ static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && bch2_btree_key_cache_must_wait(c) && - !(flags & BTREE_INSERT_JOURNAL_RECLAIM)) + !(flags & BCH_TRANS_COMMIT_journal_reclaim)) return -BCH_ERR_btree_insert_need_journal_reclaim; /* @@ -422,7 +415,7 @@ static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags return 0; new_u64s = roundup_pow_of_two(u64s); - new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOWAIT); + new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOWAIT|__GFP_NOWARN); if (unlikely(!new_k)) return btree_key_can_insert_cached_slowpath(trans, flags, path, new_u64s); @@ -452,25 +445,15 @@ static int run_one_mem_trigger(struct btree_trans *trans, if (unlikely(flags & BTREE_TRIGGER_NORUN)) return 0; - if (!btree_node_type_needs_gc(__btree_node_type(i->level, i->btree_id))) - return 0; - - if (old_ops->atomic_trigger == new_ops->atomic_trigger) { - ret = bch2_mark_key(trans, i->btree_id, i->level, - old, bkey_i_to_s_c(new), + if (old_ops->trigger == new_ops->trigger) { + ret = bch2_key_trigger(trans, i->btree_id, i->level, + old, bkey_i_to_s(new), BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); } else { - struct bkey _deleted = KEY(0, 0, 0); - struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL }; - - _deleted.p = i->path->pos; - - ret = bch2_mark_key(trans, i->btree_id, i->level, - deleted, bkey_i_to_s_c(new), - BTREE_TRIGGER_INSERT|flags) ?: - bch2_mark_key(trans, i->btree_id, i->level, - old, deleted, - BTREE_TRIGGER_OVERWRITE|flags); + ret = bch2_key_trigger_new(trans, i->btree_id, i->level, + bkey_i_to_s(new), flags) ?: + bch2_key_trigger_old(trans, i->btree_id, i->level, + old, flags); } return ret; @@ -488,6 +471,7 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_ struct bkey_s_c old = { &old_k, i->old_v }; const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type); const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type); + unsigned flags = i->flags|BTREE_TRIGGER_TRANSACTIONAL; verify_update_old_key(trans, i); @@ -497,19 +481,18 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_ if (!i->insert_trigger_run && !i->overwrite_trigger_run && - old_ops->trans_trigger == new_ops->trans_trigger) { + old_ops->trigger == new_ops->trigger) { i->overwrite_trigger_run = true; i->insert_trigger_run = true; - return bch2_trans_mark_key(trans, i->btree_id, i->level, old, i->k, - BTREE_TRIGGER_INSERT| - BTREE_TRIGGER_OVERWRITE| - i->flags) ?: 1; + return bch2_key_trigger(trans, i->btree_id, i->level, old, bkey_i_to_s(i->k), + BTREE_TRIGGER_INSERT| + BTREE_TRIGGER_OVERWRITE|flags) ?: 1; } else if (overwrite && !i->overwrite_trigger_run) { i->overwrite_trigger_run = true; - return bch2_trans_mark_old(trans, i->btree_id, i->level, old, i->flags) ?: 1; + return bch2_key_trigger_old(trans, i->btree_id, i->level, old, flags) ?: 1; } else if (!overwrite && !i->insert_trigger_run) { i->insert_trigger_run = true; - return bch2_trans_mark_new(trans, i->btree_id, i->level, i->k, i->flags) ?: 1; + return bch2_key_trigger_new(trans, i->btree_id, i->level, bkey_i_to_s(i->k), flags) ?: 1; } else { return 0; } @@ -551,7 +534,7 @@ static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id, static int bch2_trans_commit_run_triggers(struct btree_trans *trans) { - struct btree_insert_entry *i = NULL, *btree_id_start = trans->updates; + struct btree_insert_entry *btree_id_start = trans->updates; unsigned btree_id = 0; int ret = 0; @@ -597,10 +580,6 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans) static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans) { - struct bch_fs *c = trans->c; - struct btree_insert_entry *i; - int ret = 0; - trans_for_each_update(trans, i) { /* * XXX: synchronization of cached update triggers with gc @@ -608,14 +587,15 @@ static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans) */ BUG_ON(i->cached || i->level); - if (gc_visited(c, gc_pos_btree_node(insert_l(i)->b))) { - ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_GC); + if (btree_node_type_needs_gc(__btree_node_type(i->level, i->btree_id)) && + gc_visited(trans->c, gc_pos_btree_node(insert_l(trans, i)->b))) { + int ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_GC); if (ret) - break; + return ret; } } - return ret; + return 0; } static inline int @@ -624,8 +604,6 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, unsigned long trace_ip) { struct bch_fs *c = trans->c; - struct btree_insert_entry *i; - struct btree_write_buffered_key *wb; struct btree_trans_commit_hook *h; unsigned u64s = 0; int ret; @@ -650,23 +628,21 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, u64s += i->k->k.u64s; ret = !i->cached - ? btree_key_can_insert(trans, insert_l(i)->b, u64s) - : btree_key_can_insert_cached(trans, flags, i->path, u64s); + ? btree_key_can_insert(trans, insert_l(trans, i)->b, u64s) + : btree_key_can_insert_cached(trans, flags, trans->paths + i->path, u64s); if (ret) { *stopped_at = i; return ret; } - } - if (trans->nr_wb_updates && - trans->nr_wb_updates + c->btree_write_buffer.state.nr > c->btree_write_buffer.size) - return -BCH_ERR_btree_insert_need_flush_buffer; + i->k->k.needs_whiteout = false; + } /* * Don't get journal reservation until after we know insert will * succeed: */ - if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) { + if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) { ret = bch2_trans_journal_res_get(trans, (flags & BCH_WATERMARK_MASK)| JOURNAL_RES_GET_NONBLOCK); @@ -675,8 +651,6 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, if (unlikely(trans->journal_transaction_names)) journal_transaction_name(trans); - } else { - trans->journal_res.seq = c->journal.replay_journal_seq; } /* @@ -685,7 +659,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, */ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && - !(flags & BTREE_INSERT_JOURNAL_REPLAY)) { + !(flags & BCH_TRANS_COMMIT_no_journal_res)) { if (bch2_journal_seq_verify) trans_for_each_update(trans, i) i->k->k.version.lo = trans->journal_res.seq; @@ -698,13 +672,8 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas)) return -BCH_ERR_btree_insert_need_mark_replicas; - if (trans->nr_wb_updates) { - EBUG_ON(flags & BTREE_INSERT_JOURNAL_REPLAY); - - ret = bch2_btree_insert_keys_write_buffer(trans); - if (ret) - goto revert_fs_usage; - } + /* XXX: we only want to run this if deltas are nonzero */ + bch2_trans_account_disk_usage_change(trans); h = trans->hooks; while (h) { @@ -715,8 +684,8 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, } trans_for_each_update(trans, i) - if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) { - ret = run_one_mem_trigger(trans, i, i->flags); + if (BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS & (1U << i->bkey_type)) { + ret = run_one_mem_trigger(trans, i, BTREE_TRIGGER_ATOMIC|i->flags); if (ret) goto fatal_err; } @@ -727,16 +696,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, goto fatal_err; } - if (unlikely(trans->extra_journal_entries.nr)) { - memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res), - trans->extra_journal_entries.data, - trans->extra_journal_entries.nr); - - trans->journal_res.offset += trans->extra_journal_entries.nr; - trans->journal_res.u64s -= trans->extra_journal_entries.nr; - } - - if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) { + if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) { struct journal *j = &c->journal; struct jset_entry *entry; @@ -765,33 +725,27 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, bkey_copy((struct bkey_i *) entry->start, i->k); } - trans_for_each_wb_update(trans, wb) { - entry = bch2_journal_add_entry(j, &trans->journal_res, - BCH_JSET_ENTRY_btree_keys, - wb->btree, 0, - wb->k.k.u64s); - bkey_copy((struct bkey_i *) entry->start, &wb->k); - } + memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res), + trans->journal_entries, + trans->journal_entries_u64s); + + trans->journal_res.offset += trans->journal_entries_u64s; + trans->journal_res.u64s -= trans->journal_entries_u64s; if (trans->journal_seq) *trans->journal_seq = trans->journal_res.seq; } trans_for_each_update(trans, i) { - i->k->k.needs_whiteout = false; + struct btree_path *path = trans->paths + i->path; if (!i->cached) { - u64 seq = trans->journal_res.seq; - - if (i->flags & BTREE_UPDATE_PREJOURNAL) - seq = i->seq; - - bch2_btree_insert_key_leaf(trans, i->path, i->k, seq); + bch2_btree_insert_key_leaf(trans, path, i->k, trans->journal_res.seq); } else if (!i->key_cache_already_flushed) bch2_btree_insert_key_cached(trans, flags, i); else { - bch2_btree_key_cache_drop(trans, i->path); - btree_path_set_dirty(i->path, BTREE_ITER_NEED_TRAVERSE); + bch2_btree_key_cache_drop(trans, path); + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); } } @@ -806,14 +760,8 @@ revert_fs_usage: static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans) { - struct btree_insert_entry *i; - struct btree_write_buffered_key *wb; - trans_for_each_update(trans, i) bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p); - - trans_for_each_wb_update(trans, wb) - bch2_journal_key_overwritten(trans->c, wb->btree, 0, wb->k.k.p); } static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans, @@ -841,6 +789,33 @@ static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans, return -EINVAL; } +static noinline int bch2_trans_commit_journal_entry_invalid(struct btree_trans *trans, + struct jset_entry *i) +{ + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; + + prt_printf(&buf, "invalid bkey on insert from %s", trans->fn); + prt_newline(&buf); + printbuf_indent_add(&buf, 2); + + bch2_journal_entry_to_text(&buf, c, i); + prt_newline(&buf); + + bch2_print_string_as_lines(KERN_ERR, buf.buf); + + bch2_inconsistent_error(c); + bch2_dump_trans_updates(trans); + + return -EINVAL; +} + +static int bch2_trans_commit_journal_pin_flush(struct journal *j, + struct journal_entry_pin *_pin, u64 seq) +{ + return 0; +} + /* * Get journal reservation, take write locks, and attempt to do btree update(s): */ @@ -849,7 +824,6 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags unsigned long trace_ip) { struct bch_fs *c = trans->c; - struct btree_insert_entry *i; int ret = 0, u64s_delta = 0; trans_for_each_update(trans, i) { @@ -884,13 +858,15 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags if (!ret && trans->journal_pin) bch2_journal_pin_add(&c->journal, trans->journal_res.seq, - trans->journal_pin, NULL); + trans->journal_pin, + bch2_trans_commit_journal_pin_flush); /* * Drop journal reservation after dropping write locks, since dropping * the journal reservation may kick off a journal write: */ - bch2_journal_res_put(&c->journal, &trans->journal_res); + if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) + bch2_journal_res_put(&c->journal, &trans->journal_res); return ret; } @@ -916,7 +892,8 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, case -BCH_ERR_btree_insert_btree_node_full: ret = bch2_btree_split_leaf(trans, i->path, flags); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - trace_and_count(c, trans_restart_btree_node_split, trans, trace_ip, i->path); + trace_and_count(c, trans_restart_btree_node_split, trans, + trace_ip, trans->paths + i->path); break; case -BCH_ERR_btree_insert_need_mark_replicas: ret = drop_locks_do(trans, @@ -927,7 +904,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, * XXX: this should probably be a separate BTREE_INSERT_NONBLOCK * flag */ - if ((flags & BTREE_INSERT_JOURNAL_RECLAIM) && + if ((flags & BCH_TRANS_COMMIT_journal_reclaim) && (flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim) { ret = -BCH_ERR_journal_reclaim_would_deadlock; break; @@ -950,30 +927,6 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, ret = bch2_trans_relock(trans); break; - case -BCH_ERR_btree_insert_need_flush_buffer: { - struct btree_write_buffer *wb = &c->btree_write_buffer; - - ret = 0; - - if (wb->state.nr > wb->size * 3 / 4) { - bch2_trans_unlock(trans); - mutex_lock(&wb->flush_lock); - - if (wb->state.nr > wb->size * 3 / 4) { - bch2_trans_begin(trans); - ret = __bch2_btree_write_buffer_flush(trans, - flags|BTREE_INSERT_NOCHECK_RW, true); - if (!ret) { - trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_); - ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush); - } - } else { - mutex_unlock(&wb->flush_lock); - ret = bch2_trans_relock(trans); - } - } - break; - } default: BUG_ON(ret >= 0); break; @@ -982,8 +935,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted); bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOSPC) && - !(flags & BTREE_INSERT_NOWAIT) && - (flags & BTREE_INSERT_NOFAIL), c, + (flags & BCH_TRANS_COMMIT_no_enospc), c, "%s: incorrectly got %s\n", __func__, bch2_err_str(ret)); return ret; @@ -995,8 +947,8 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans, unsigned flags) struct bch_fs *c = trans->c; int ret; - if (likely(!(flags & BTREE_INSERT_LAZY_RW)) || - test_bit(BCH_FS_STARTED, &c->flags)) + if (likely(!(flags & BCH_TRANS_COMMIT_lazy_rw)) || + test_bit(BCH_FS_started, &c->flags)) return -BCH_ERR_erofs_trans_commit; ret = drop_locks_do(trans, bch2_fs_read_write_early(c)); @@ -1016,7 +968,6 @@ static noinline int do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans) { struct bch_fs *c = trans->c; - struct btree_insert_entry *i; int ret = 0; trans_for_each_update(trans, i) { @@ -1030,18 +981,15 @@ do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans) int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) { + struct btree_insert_entry *errored_at = NULL; struct bch_fs *c = trans->c; - struct btree_insert_entry *i = NULL; - struct btree_write_buffered_key *wb; int ret = 0; if (!trans->nr_updates && - !trans->nr_wb_updates && - !trans->extra_journal_entries.nr) + !trans->journal_entries_u64s) goto out_reset; - if (flags & BTREE_INSERT_GC_LOCK_HELD) - lockdep_assert_held(&c->gc_lock); + memset(&trans->fs_usage_delta, 0, sizeof(trans->fs_usage_delta)); ret = bch2_trans_commit_run_triggers(trans); if (ret) @@ -1051,7 +999,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) struct printbuf buf = PRINTBUF; enum bkey_invalid_flags invalid_flags = 0; - if (!(flags & BTREE_INSERT_JOURNAL_REPLAY)) + if (!(flags & BCH_TRANS_COMMIT_no_journal_res)) invalid_flags |= BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT; if (unlikely(bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), @@ -1064,47 +1012,52 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) return ret; } - if (unlikely(!test_bit(BCH_FS_MAY_GO_RW, &c->flags))) { + for (struct jset_entry *i = trans->journal_entries; + i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s); + i = vstruct_next(i)) { + enum bkey_invalid_flags invalid_flags = 0; + + if (!(flags & BCH_TRANS_COMMIT_no_journal_res)) + invalid_flags |= BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT; + + if (unlikely(bch2_journal_entry_validate(c, NULL, i, + bcachefs_metadata_version_current, + CPU_BIG_ENDIAN, invalid_flags))) + ret = bch2_trans_commit_journal_entry_invalid(trans, i); + + if (ret) + return ret; + } + + if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags))) { ret = do_bch2_trans_commit_to_journal_replay(trans); goto out_reset; } - if (!(flags & BTREE_INSERT_NOCHECK_RW) && + if (!(flags & BCH_TRANS_COMMIT_no_check_rw) && unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_trans))) { ret = bch2_trans_commit_get_rw_cold(trans, flags); if (ret) goto out_reset; } - if (c->btree_write_buffer.state.nr > c->btree_write_buffer.size / 2 && - mutex_trylock(&c->btree_write_buffer.flush_lock)) { - bch2_trans_begin(trans); - bch2_trans_unlock(trans); - - ret = __bch2_btree_write_buffer_flush(trans, - flags|BTREE_INSERT_NOCHECK_RW, true); - if (!ret) { - trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_); - ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush); - } - goto out; - } - - EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags)); + EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags)); - trans->journal_u64s = trans->extra_journal_entries.nr; + trans->journal_u64s = trans->journal_entries_u64s; trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names); if (trans->journal_transaction_names) trans->journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s); trans_for_each_update(trans, i) { - EBUG_ON(!i->path->should_be_locked); + struct btree_path *path = trans->paths + i->path; + + EBUG_ON(!path->should_be_locked); - ret = bch2_btree_path_upgrade(trans, i->path, i->level + 1); + ret = bch2_btree_path_upgrade(trans, path, i->level + 1); if (unlikely(ret)) goto out; - EBUG_ON(!btree_node_intent_locked(i->path, i->level)); + EBUG_ON(!btree_node_intent_locked(path, i->level)); if (i->key_cache_already_flushed) continue; @@ -1120,22 +1073,21 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) trans->journal_u64s += jset_u64s(i->old_k.u64s); } - trans_for_each_wb_update(trans, wb) - trans->journal_u64s += jset_u64s(wb->k.k.u64s); - - if (trans->extra_journal_res) { + if (trans->extra_disk_res) { ret = bch2_disk_reservation_add(c, trans->disk_res, - trans->extra_journal_res, - (flags & BTREE_INSERT_NOFAIL) + trans->extra_disk_res, + (flags & BCH_TRANS_COMMIT_no_enospc) ? BCH_DISK_RESERVATION_NOFAIL : 0); if (ret) goto err; } retry: + errored_at = NULL; bch2_trans_verify_not_in_restart(trans); - memset(&trans->journal_res, 0, sizeof(trans->journal_res)); + if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) + memset(&trans->journal_res, 0, sizeof(trans->journal_res)); - ret = do_bch2_trans_commit(trans, flags, &i, _RET_IP_); + ret = do_bch2_trans_commit(trans, flags, &errored_at, _RET_IP_); /* make sure we didn't drop or screw up locks: */ bch2_trans_verify_locks(trans); @@ -1145,7 +1097,7 @@ retry: trace_and_count(c, transaction_commit, trans, _RET_IP_); out: - if (likely(!(flags & BTREE_INSERT_NOCHECK_RW))) + if (likely(!(flags & BCH_TRANS_COMMIT_no_check_rw))) bch2_write_ref_put(c, BCH_WRITE_REF_trans); out_reset: if (!ret) @@ -1154,9 +1106,21 @@ out_reset: return ret; err: - ret = bch2_trans_commit_error(trans, flags, i, ret, _RET_IP_); + ret = bch2_trans_commit_error(trans, flags, errored_at, ret, _RET_IP_); if (ret) goto out; + /* + * We might have done another transaction commit in the error path - + * i.e. btree write buffer flush - which will have made use of + * trans->journal_res, but with BCH_TRANS_COMMIT_no_journal_res that is + * how the journal sequence number to pin is passed in - so we must + * restart: + */ + if (flags & BCH_TRANS_COMMIT_no_journal_res) { + ret = -BCH_ERR_transaction_restart_nested; + goto out; + } + goto retry; } diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 60453ba86c4b..4a5a64499eb7 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -185,33 +185,32 @@ struct btree_node_iter { * Iterate over all possible positions, synthesizing deleted keys for holes: */ static const __maybe_unused u16 BTREE_ITER_SLOTS = 1 << 0; -static const __maybe_unused u16 BTREE_ITER_ALL_LEVELS = 1 << 1; /* * Indicates that intent locks should be taken on leaf nodes, because we expect * to be doing updates: */ -static const __maybe_unused u16 BTREE_ITER_INTENT = 1 << 2; +static const __maybe_unused u16 BTREE_ITER_INTENT = 1 << 1; /* * Causes the btree iterator code to prefetch additional btree nodes from disk: */ -static const __maybe_unused u16 BTREE_ITER_PREFETCH = 1 << 3; +static const __maybe_unused u16 BTREE_ITER_PREFETCH = 1 << 2; /* * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for * @pos or the first key strictly greater than @pos */ -static const __maybe_unused u16 BTREE_ITER_IS_EXTENTS = 1 << 4; -static const __maybe_unused u16 BTREE_ITER_NOT_EXTENTS = 1 << 5; -static const __maybe_unused u16 BTREE_ITER_CACHED = 1 << 6; -static const __maybe_unused u16 BTREE_ITER_WITH_KEY_CACHE = 1 << 7; -static const __maybe_unused u16 BTREE_ITER_WITH_UPDATES = 1 << 8; -static const __maybe_unused u16 BTREE_ITER_WITH_JOURNAL = 1 << 9; -static const __maybe_unused u16 __BTREE_ITER_ALL_SNAPSHOTS = 1 << 10; -static const __maybe_unused u16 BTREE_ITER_ALL_SNAPSHOTS = 1 << 11; -static const __maybe_unused u16 BTREE_ITER_FILTER_SNAPSHOTS = 1 << 12; -static const __maybe_unused u16 BTREE_ITER_NOPRESERVE = 1 << 13; -static const __maybe_unused u16 BTREE_ITER_CACHED_NOFILL = 1 << 14; -static const __maybe_unused u16 BTREE_ITER_KEY_CACHE_FILL = 1 << 15; -#define __BTREE_ITER_FLAGS_END 16 +static const __maybe_unused u16 BTREE_ITER_IS_EXTENTS = 1 << 3; +static const __maybe_unused u16 BTREE_ITER_NOT_EXTENTS = 1 << 4; +static const __maybe_unused u16 BTREE_ITER_CACHED = 1 << 5; +static const __maybe_unused u16 BTREE_ITER_WITH_KEY_CACHE = 1 << 6; +static const __maybe_unused u16 BTREE_ITER_WITH_UPDATES = 1 << 7; +static const __maybe_unused u16 BTREE_ITER_WITH_JOURNAL = 1 << 8; +static const __maybe_unused u16 __BTREE_ITER_ALL_SNAPSHOTS = 1 << 9; +static const __maybe_unused u16 BTREE_ITER_ALL_SNAPSHOTS = 1 << 10; +static const __maybe_unused u16 BTREE_ITER_FILTER_SNAPSHOTS = 1 << 11; +static const __maybe_unused u16 BTREE_ITER_NOPRESERVE = 1 << 12; +static const __maybe_unused u16 BTREE_ITER_CACHED_NOFILL = 1 << 13; +static const __maybe_unused u16 BTREE_ITER_KEY_CACHE_FILL = 1 << 14; +#define __BTREE_ITER_FLAGS_END 15 enum btree_path_uptodate { BTREE_ITER_UPTODATE = 0, @@ -223,13 +222,12 @@ enum btree_path_uptodate { #define TRACK_PATH_ALLOCATED #endif +typedef u16 btree_path_idx_t; + struct btree_path { - u8 idx; - u8 sorted_idx; + btree_path_idx_t sorted_idx; u8 ref; u8 intent_ref; - u32 alloc_seq; - u32 downgrade_seq; /* btree_iter_copy starts here: */ struct bpos pos; @@ -283,13 +281,12 @@ static inline unsigned long btree_path_ip_allocated(struct btree_path *path) */ struct btree_iter { struct btree_trans *trans; - struct btree_path *path; - struct btree_path *update_path; - struct btree_path *key_cache_path; + btree_path_idx_t path; + btree_path_idx_t update_path; + btree_path_idx_t key_cache_path; enum btree_id btree_id:8; - unsigned min_depth:3; - unsigned advanced:1; + u8 min_depth; /* btree_iter_copy starts here: */ u16 flags; @@ -306,7 +303,6 @@ struct btree_iter { /* BTREE_ITER_WITH_JOURNAL: */ size_t journal_idx; - struct bpos journal_pos; #ifdef TRACK_PATH_ALLOCATED unsigned long ip_allocated; #endif @@ -354,16 +350,16 @@ struct btree_insert_entry { * to the size of the key being overwritten in the btree: */ u8 old_btree_u64s; + btree_path_idx_t path; struct bkey_i *k; - struct btree_path *path; - u64 seq; /* key being overwritten: */ struct bkey old_k; const struct bch_val *old_v; unsigned long ip_allocated; }; -#define BTREE_ITER_MAX 64 +#define BTREE_ITER_INITIAL 64 +#define BTREE_ITER_MAX (1U << 10) struct btree_trans_commit_hook; typedef int (btree_trans_commit_hook_fn)(struct btree_trans *, struct btree_trans_commit_hook *); @@ -377,25 +373,30 @@ struct btree_trans_commit_hook { #define BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS 10000 +struct btree_trans_paths { + unsigned long nr_paths; + struct btree_path paths[]; +}; + struct btree_trans { struct bch_fs *c; - const char *fn; - struct closure ref; - struct list_head list; - u64 last_begin_time; - u8 lock_may_not_fail; - u8 lock_must_abort; - struct btree_bkey_cached_common *locking; - struct six_lock_waiter locking_wait; + unsigned long *paths_allocated; + struct btree_path *paths; + btree_path_idx_t *sorted; + struct btree_insert_entry *updates; - int srcu_idx; + void *mem; + unsigned mem_top; + unsigned mem_bytes; + btree_path_idx_t nr_sorted; + btree_path_idx_t nr_paths; + btree_path_idx_t nr_paths_max; u8 fn_idx; - u8 nr_sorted; u8 nr_updates; - u8 nr_wb_updates; - u8 wb_updates_size; + u8 lock_must_abort; + bool lock_may_not_fail:1; bool srcu_held:1; bool used_mempool:1; bool in_traverse_all:1; @@ -407,41 +408,59 @@ struct btree_trans { bool write_locked:1; enum bch_errcode restarted:16; u32 restart_count; + + u64 last_begin_time; unsigned long last_begin_ip; unsigned long last_restarted_ip; unsigned long srcu_lock_time; - /* - * For when bch2_trans_update notices we'll be splitting a compressed - * extent: - */ - unsigned extra_journal_res; - unsigned nr_max_paths; - - u64 paths_allocated; - - unsigned mem_top; - unsigned mem_max; - unsigned mem_bytes; - void *mem; - - u8 sorted[BTREE_ITER_MAX + 8]; - struct btree_path paths[BTREE_ITER_MAX]; - struct btree_insert_entry updates[BTREE_ITER_MAX]; - struct btree_write_buffered_key *wb_updates; + const char *fn; + struct btree_bkey_cached_common *locking; + struct six_lock_waiter locking_wait; + int srcu_idx; /* update path: */ + u16 journal_entries_u64s; + u16 journal_entries_size; + struct jset_entry *journal_entries; + struct btree_trans_commit_hook *hooks; - darray_u64 extra_journal_entries; struct journal_entry_pin *journal_pin; struct journal_res journal_res; u64 *journal_seq; struct disk_reservation *disk_res; + + struct bch_fs_usage_base fs_usage_delta; + unsigned journal_u64s; + unsigned extra_disk_res; /* XXX kill */ struct replicas_delta_list *fs_usage_deltas; + + /* Entries before this are zeroed out on every bch2_trans_get() call */ + + struct list_head list; + struct closure ref; + + unsigned long _paths_allocated[BITS_TO_LONGS(BTREE_ITER_INITIAL)]; + struct btree_trans_paths trans_paths; + struct btree_path _paths[BTREE_ITER_INITIAL]; + btree_path_idx_t _sorted[BTREE_ITER_INITIAL + 4]; + struct btree_insert_entry _updates[BTREE_ITER_INITIAL]; }; +static inline struct btree_path *btree_iter_path(struct btree_trans *trans, struct btree_iter *iter) +{ + return trans->paths + iter->path; +} + +static inline struct btree_path *btree_iter_key_cache_path(struct btree_trans *trans, struct btree_iter *iter) +{ + return iter->key_cache_path + ? trans->paths + iter->key_cache_path + : NULL; +} + #define BCH_BTREE_WRITE_TYPES() \ x(initial, 0) \ x(init_next_bset, 1) \ @@ -637,7 +656,7 @@ const char *bch2_btree_node_type_str(enum btree_node_type); BIT_ULL(BKEY_TYPE_reflink)| \ BIT_ULL(BKEY_TYPE_btree)) -#define BTREE_NODE_TYPE_HAS_MEM_TRIGGERS \ +#define BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS \ (BIT_ULL(BKEY_TYPE_alloc)| \ BIT_ULL(BKEY_TYPE_inodes)| \ BIT_ULL(BKEY_TYPE_stripes)| \ @@ -645,7 +664,7 @@ const char *bch2_btree_node_type_str(enum btree_node_type); #define BTREE_NODE_TYPE_HAS_TRIGGERS \ (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS| \ - BTREE_NODE_TYPE_HAS_MEM_TRIGGERS) + BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS) static inline bool btree_node_type_needs_gc(enum btree_node_type type) { @@ -722,4 +741,9 @@ enum btree_node_sibling { btree_next_sib, }; +struct get_locks_fail { + unsigned l; + struct btree *b; +}; + #endif /* _BCACHEFS_BTREE_TYPES_H */ diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c index 2fd3c8cc6f51..c3ff365acce9 100644 --- a/fs/bcachefs/btree_update.c +++ b/fs/bcachefs/btree_update.c @@ -24,7 +24,7 @@ static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l, } static int __must_check -bch2_trans_update_by_path(struct btree_trans *, struct btree_path *, +bch2_trans_update_by_path(struct btree_trans *, btree_path_idx_t, struct bkey_i *, enum btree_update_flags, unsigned long ip); @@ -200,7 +200,7 @@ int bch2_trans_update_extent_overwrite(struct btree_trans *trans, */ if (nr_splits > 1 && (compressed_sectors = bch2_bkey_sectors_compressed(old))) - trans->extra_journal_res += compressed_sectors * (nr_splits - 1); + trans->extra_disk_res += compressed_sectors * (nr_splits - 1); if (front_split) { update = bch2_bkey_make_mut_noupdate(trans, old); @@ -339,21 +339,22 @@ err: } static noinline int flush_new_cached_update(struct btree_trans *trans, - struct btree_path *path, struct btree_insert_entry *i, enum btree_update_flags flags, unsigned long ip) { - struct btree_path *btree_path; struct bkey k; int ret; - btree_path = bch2_path_get(trans, path->btree_id, path->pos, 1, 0, - BTREE_ITER_INTENT, _THIS_IP_); - ret = bch2_btree_path_traverse(trans, btree_path, 0); + btree_path_idx_t path_idx = + bch2_path_get(trans, i->btree_id, i->old_k.p, 1, 0, + BTREE_ITER_INTENT, _THIS_IP_); + ret = bch2_btree_path_traverse(trans, path_idx, 0); if (ret) goto out; + struct btree_path *btree_path = trans->paths + path_idx; + /* * The old key in the insert entry might actually refer to an existing * key in the btree that has been deleted from cache and not yet @@ -368,43 +369,34 @@ static noinline int flush_new_cached_update(struct btree_trans *trans, i->flags |= BTREE_TRIGGER_NORUN; btree_path_set_should_be_locked(btree_path); - ret = bch2_trans_update_by_path(trans, btree_path, i->k, flags, ip); + ret = bch2_trans_update_by_path(trans, path_idx, i->k, flags, ip); out: - bch2_path_put(trans, btree_path, true); + bch2_path_put(trans, path_idx, true); return ret; } static int __must_check -bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path, +bch2_trans_update_by_path(struct btree_trans *trans, btree_path_idx_t path_idx, struct bkey_i *k, enum btree_update_flags flags, unsigned long ip) { struct bch_fs *c = trans->c; struct btree_insert_entry *i, n; - u64 seq = 0; int cmp; + struct btree_path *path = trans->paths + path_idx; EBUG_ON(!path->should_be_locked); - EBUG_ON(trans->nr_updates >= BTREE_ITER_MAX); + EBUG_ON(trans->nr_updates >= trans->nr_paths); EBUG_ON(!bpos_eq(k->k.p, path->pos)); - /* - * The transaction journal res hasn't been allocated at this point. - * That occurs at commit time. Reuse the seq field to pass in the seq - * of a prejournaled key. - */ - if (flags & BTREE_UPDATE_PREJOURNAL) - seq = trans->journal_res.seq; - n = (struct btree_insert_entry) { .flags = flags, .bkey_type = __btree_node_type(path->level, path->btree_id), .btree_id = path->btree_id, .level = path->level, .cached = path->cached, - .path = path, + .path = path_idx, .k = k, - .seq = seq, .ip_allocated = ip, }; @@ -418,7 +410,7 @@ bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path, * Pending updates are kept sorted: first, find position of new update, * then delete/trim any updates the new update overwrites: */ - trans_for_each_update(trans, i) { + for (i = trans->updates; i < trans->updates + trans->nr_updates; i++) { cmp = btree_insert_entry_cmp(&n, i); if (cmp <= 0) break; @@ -432,7 +424,6 @@ bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path, i->cached = n.cached; i->k = n.k; i->path = n.path; - i->seq = n.seq; i->ip_allocated = n.ip_allocated; } else { array_insert_item(trans->updates, trans->nr_updates, @@ -452,7 +443,7 @@ bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path, } } - __btree_path_get(i->path, true); + __btree_path_get(trans->paths + i->path, true); /* * If a key is present in the key cache, it must also exist in the @@ -462,7 +453,7 @@ bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path, * work: */ if (path->cached && bkey_deleted(&i->old_k)) - return flush_new_cached_update(trans, path, i, flags, ip); + return flush_new_cached_update(trans, i, flags, ip); return 0; } @@ -471,9 +462,11 @@ static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans, struct btree_iter *iter, struct btree_path *path) { - if (!iter->key_cache_path || - !iter->key_cache_path->should_be_locked || - !bpos_eq(iter->key_cache_path->pos, iter->pos)) { + struct btree_path *key_cache_path = btree_iter_key_cache_path(trans, iter); + + if (!key_cache_path || + !key_cache_path->should_be_locked || + !bpos_eq(key_cache_path->pos, iter->pos)) { struct bkey_cached *ck; int ret; @@ -488,19 +481,18 @@ static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans, iter->flags & BTREE_ITER_INTENT, _THIS_IP_); - ret = bch2_btree_path_traverse(trans, iter->key_cache_path, - BTREE_ITER_CACHED); + ret = bch2_btree_path_traverse(trans, iter->key_cache_path, BTREE_ITER_CACHED); if (unlikely(ret)) return ret; - ck = (void *) iter->key_cache_path->l[0].b; + ck = (void *) trans->paths[iter->key_cache_path].l[0].b; if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { trace_and_count(trans->c, trans_restart_key_cache_raced, trans, _RET_IP_); return btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced); } - btree_path_set_should_be_locked(iter->key_cache_path); + btree_path_set_should_be_locked(trans->paths + iter->key_cache_path); } return 0; @@ -509,7 +501,7 @@ static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans, int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, struct bkey_i *k, enum btree_update_flags flags) { - struct btree_path *path = iter->update_path ?: iter->path; + btree_path_idx_t path_idx = iter->update_path ?: iter->path; int ret; if (iter->flags & BTREE_ITER_IS_EXTENTS) @@ -529,6 +521,7 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter /* * Ensure that updates to cached btrees go to the key cache: */ + struct btree_path *path = trans->paths + path_idx; if (!(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) && !path->cached && !path->level && @@ -537,27 +530,15 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter if (ret) return ret; - path = iter->key_cache_path; + path_idx = iter->key_cache_path; } - return bch2_trans_update_by_path(trans, path, k, flags, _RET_IP_); + return bch2_trans_update_by_path(trans, path_idx, k, flags, _RET_IP_); } -/* - * Add a transaction update for a key that has already been journaled. - */ -int __must_check bch2_trans_update_seq(struct btree_trans *trans, u64 seq, - struct btree_iter *iter, struct bkey_i *k, - enum btree_update_flags flags) -{ - trans->journal_res.seq = seq; - return bch2_trans_update(trans, iter, k, flags|BTREE_UPDATE_NOJOURNAL| - BTREE_UPDATE_PREJOURNAL); -} - -static noinline int bch2_btree_insert_clone_trans(struct btree_trans *trans, - enum btree_id btree, - struct bkey_i *k) +int bch2_btree_insert_clone_trans(struct btree_trans *trans, + enum btree_id btree, + struct bkey_i *k) { struct bkey_i *n = bch2_trans_kmalloc(trans, bkey_bytes(&k->k)); int ret = PTR_ERR_OR_ZERO(n); @@ -568,60 +549,30 @@ static noinline int bch2_btree_insert_clone_trans(struct btree_trans *trans, return bch2_btree_insert_trans(trans, btree, n, 0); } -int __must_check bch2_trans_update_buffered(struct btree_trans *trans, - enum btree_id btree, - struct bkey_i *k) +struct jset_entry *__bch2_trans_jset_entry_alloc(struct btree_trans *trans, unsigned u64s) { - struct btree_write_buffered_key *i; - int ret; - - EBUG_ON(trans->nr_wb_updates > trans->wb_updates_size); - EBUG_ON(k->k.u64s > BTREE_WRITE_BUFERED_U64s_MAX); - - if (unlikely(trans->journal_replay_not_finished)) - return bch2_btree_insert_clone_trans(trans, btree, k); - - trans_for_each_wb_update(trans, i) { - if (i->btree == btree && bpos_eq(i->k.k.p, k->k.p)) { - bkey_copy(&i->k, k); - return 0; - } - } + unsigned new_top = trans->journal_entries_u64s + u64s; + unsigned old_size = trans->journal_entries_size; - if (!trans->wb_updates || - trans->nr_wb_updates == trans->wb_updates_size) { - struct btree_write_buffered_key *u; + if (new_top > trans->journal_entries_size) { + trans->journal_entries_size = roundup_pow_of_two(new_top); - if (trans->nr_wb_updates == trans->wb_updates_size) { - struct btree_transaction_stats *s = btree_trans_stats(trans); - - BUG_ON(trans->wb_updates_size > U8_MAX / 2); - trans->wb_updates_size = max(1, trans->wb_updates_size * 2); - if (s) - s->wb_updates_size = trans->wb_updates_size; - } - - u = bch2_trans_kmalloc_nomemzero(trans, - trans->wb_updates_size * - sizeof(struct btree_write_buffered_key)); - ret = PTR_ERR_OR_ZERO(u); - if (ret) - return ret; - - if (trans->nr_wb_updates) - memcpy(u, trans->wb_updates, trans->nr_wb_updates * - sizeof(struct btree_write_buffered_key)); - trans->wb_updates = u; + btree_trans_stats(trans)->journal_entries_size = trans->journal_entries_size; } - trans->wb_updates[trans->nr_wb_updates] = (struct btree_write_buffered_key) { - .btree = btree, - }; + struct jset_entry *n = + bch2_trans_kmalloc_nomemzero(trans, + trans->journal_entries_size * sizeof(u64)); + if (IS_ERR(n)) + return ERR_CAST(n); - bkey_copy(&trans->wb_updates[trans->nr_wb_updates].k, k); - trans->nr_wb_updates++; + if (trans->journal_entries) + memcpy(n, trans->journal_entries, old_size * sizeof(u64)); + trans->journal_entries = n; - return 0; + struct jset_entry *e = btree_trans_journal_entries_top(trans); + trans->journal_entries_u64s = new_top; + return e; } int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter, @@ -733,20 +684,6 @@ int bch2_btree_delete_at(struct btree_trans *trans, return bch2_btree_delete_extent_at(trans, iter, 0, update_flags); } -int bch2_btree_delete_at_buffered(struct btree_trans *trans, - enum btree_id btree, struct bpos pos) -{ - struct bkey_i *k; - - k = bch2_trans_kmalloc(trans, sizeof(*k)); - if (IS_ERR(k)) - return PTR_ERR(k); - - bkey_init(&k->k); - k->k.p = pos; - return bch2_trans_update_buffered(trans, btree, k); -} - int bch2_btree_delete(struct btree_trans *trans, enum btree_id btree, struct bpos pos, unsigned update_flags) @@ -809,7 +746,7 @@ int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, ret = bch2_trans_update(trans, &iter, &delete, update_flags) ?: bch2_trans_commit(trans, &disk_res, journal_seq, - BTREE_INSERT_NOFAIL); + BCH_TRANS_COMMIT_no_enospc); bch2_disk_reservation_put(trans->c, &disk_res); err: /* @@ -851,56 +788,26 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree, struct bpos pos, bool set) { - struct bkey_i *k; - int ret = 0; + struct bkey_i k; - k = bch2_trans_kmalloc_nomemzero(trans, sizeof(*k)); - ret = PTR_ERR_OR_ZERO(k); - if (unlikely(ret)) - return ret; + bkey_init(&k.k); + k.k.type = set ? KEY_TYPE_set : KEY_TYPE_deleted; + k.k.p = pos; - bkey_init(&k->k); - k->k.type = set ? KEY_TYPE_set : KEY_TYPE_deleted; - k->k.p = pos; - - return bch2_trans_update_buffered(trans, btree, k); + return bch2_trans_update_buffered(trans, btree, &k); } -__printf(2, 0) -static int __bch2_trans_log_msg(darray_u64 *entries, const char *fmt, va_list args) +static int __bch2_trans_log_msg(struct btree_trans *trans, struct printbuf *buf, unsigned u64s) { - struct printbuf buf = PRINTBUF; - struct jset_entry_log *l; - unsigned u64s; - int ret; - - prt_vprintf(&buf, fmt, args); - ret = buf.allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0; - if (ret) - goto err; - - u64s = DIV_ROUND_UP(buf.pos, sizeof(u64)); - - ret = darray_make_room(entries, jset_u64s(u64s)); + struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(u64s)); + int ret = PTR_ERR_OR_ZERO(e); if (ret) - goto err; + return ret; - l = (void *) &darray_top(*entries); - l->entry.u64s = cpu_to_le16(u64s); - l->entry.btree_id = 0; - l->entry.level = 1; - l->entry.type = BCH_JSET_ENTRY_log; - l->entry.pad[0] = 0; - l->entry.pad[1] = 0; - l->entry.pad[2] = 0; - memcpy(l->d, buf.buf, buf.pos); - while (buf.pos & 7) - l->d[buf.pos++] = '\0'; - - entries->nr += jset_u64s(u64s); -err: - printbuf_exit(&buf); - return ret; + struct jset_entry_log *l = container_of(e, struct jset_entry_log, entry); + journal_entry_init(e, BCH_JSET_ENTRY_log, 0, 1, u64s); + memcpy(l->d, buf->buf, buf->pos); + return 0; } __printf(3, 0) @@ -908,16 +815,32 @@ static int __bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt, va_list args) { - int ret; + struct printbuf buf = PRINTBUF; + prt_vprintf(&buf, fmt, args); + + unsigned u64s = DIV_ROUND_UP(buf.pos, sizeof(u64)); + prt_chars(&buf, '\0', u64s * sizeof(u64) - buf.pos); + + int ret = buf.allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0; + if (ret) + goto err; if (!test_bit(JOURNAL_STARTED, &c->journal.flags)) { - ret = __bch2_trans_log_msg(&c->journal.early_journal_entries, fmt, args); + ret = darray_make_room(&c->journal.early_journal_entries, jset_u64s(u64s)); + if (ret) + goto err; + + struct jset_entry_log *l = (void *) &darray_top(c->journal.early_journal_entries); + journal_entry_init(&l->entry, BCH_JSET_ENTRY_log, 0, 1, u64s); + memcpy(l->d, buf.buf, buf.pos); + c->journal.early_journal_entries.nr += jset_u64s(u64s); } else { ret = bch2_trans_do(c, NULL, NULL, - BTREE_INSERT_LAZY_RW|commit_flags, - __bch2_trans_log_msg(&trans->extra_journal_entries, fmt, args)); + BCH_TRANS_COMMIT_lazy_rw|commit_flags, + __bch2_trans_log_msg(trans, &buf, u64s)); } - +err: + printbuf_exit(&buf); return ret; } diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 9816d2286540..b9382b7b288b 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -21,42 +21,32 @@ void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64); void bch2_btree_insert_key_leaf(struct btree_trans *, struct btree_path *, struct bkey_i *, u64); -enum btree_insert_flags { +#define BCH_TRANS_COMMIT_FLAGS() \ + x(no_enospc, "don't check for enospc") \ + x(no_check_rw, "don't attempt to take a ref on c->writes") \ + x(lazy_rw, "go read-write if we haven't yet - only for use in recovery") \ + x(no_journal_res, "don't take a journal reservation, instead " \ + "pin journal entry referred to by trans->journal_res.seq") \ + x(journal_reclaim, "operation required for journal reclaim; may return error" \ + "instead of deadlocking if BCH_WATERMARK_reclaim not specified")\ + +enum __bch_trans_commit_flags { /* First bits for bch_watermark: */ - __BTREE_INSERT_NOFAIL = BCH_WATERMARK_BITS, - __BTREE_INSERT_NOCHECK_RW, - __BTREE_INSERT_LAZY_RW, - __BTREE_INSERT_JOURNAL_REPLAY, - __BTREE_INSERT_JOURNAL_RECLAIM, - __BTREE_INSERT_NOWAIT, - __BTREE_INSERT_GC_LOCK_HELD, - __BCH_HASH_SET_MUST_CREATE, - __BCH_HASH_SET_MUST_REPLACE, + __BCH_TRANS_COMMIT_FLAGS_START = BCH_WATERMARK_BITS, +#define x(n, ...) __BCH_TRANS_COMMIT_##n, + BCH_TRANS_COMMIT_FLAGS() +#undef x }; -/* Don't check for -ENOSPC: */ -#define BTREE_INSERT_NOFAIL BIT(__BTREE_INSERT_NOFAIL) - -#define BTREE_INSERT_NOCHECK_RW BIT(__BTREE_INSERT_NOCHECK_RW) -#define BTREE_INSERT_LAZY_RW BIT(__BTREE_INSERT_LAZY_RW) - -/* Insert is for journal replay - don't get journal reservations: */ -#define BTREE_INSERT_JOURNAL_REPLAY BIT(__BTREE_INSERT_JOURNAL_REPLAY) - -/* Insert is being called from journal reclaim path: */ -#define BTREE_INSERT_JOURNAL_RECLAIM BIT(__BTREE_INSERT_JOURNAL_RECLAIM) - -/* Don't block on allocation failure (for new btree nodes: */ -#define BTREE_INSERT_NOWAIT BIT(__BTREE_INSERT_NOWAIT) -#define BTREE_INSERT_GC_LOCK_HELD BIT(__BTREE_INSERT_GC_LOCK_HELD) - -#define BCH_HASH_SET_MUST_CREATE BIT(__BCH_HASH_SET_MUST_CREATE) -#define BCH_HASH_SET_MUST_REPLACE BIT(__BCH_HASH_SET_MUST_REPLACE) +enum bch_trans_commit_flags { +#define x(n, ...) BCH_TRANS_COMMIT_##n = BIT(__BCH_TRANS_COMMIT_##n), + BCH_TRANS_COMMIT_FLAGS() +#undef x +}; int bch2_btree_delete_extent_at(struct btree_trans *, struct btree_iter *, unsigned, unsigned); int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, unsigned); -int bch2_btree_delete_at_buffered(struct btree_trans *, enum btree_id, struct bpos); int bch2_btree_delete(struct btree_trans *, enum btree_id, struct bpos, unsigned); int bch2_btree_insert_nonextent(struct btree_trans *, enum btree_id, @@ -74,6 +64,12 @@ int bch2_btree_delete_range(struct bch_fs *, enum btree_id, int bch2_btree_bit_mod(struct btree_trans *, enum btree_id, struct bpos, bool); +static inline int bch2_btree_delete_at_buffered(struct btree_trans *trans, + enum btree_id btree, struct bpos pos) +{ + return bch2_btree_bit_mod(trans, btree, pos, false); +} + int __bch2_insert_snapshot_whiteouts(struct btree_trans *, enum btree_id, struct bpos, struct bpos); @@ -105,10 +101,44 @@ int bch2_bkey_get_empty_slot(struct btree_trans *, struct btree_iter *, int __must_check bch2_trans_update(struct btree_trans *, struct btree_iter *, struct bkey_i *, enum btree_update_flags); -int __must_check bch2_trans_update_seq(struct btree_trans *, u64, struct btree_iter *, - struct bkey_i *, enum btree_update_flags); -int __must_check bch2_trans_update_buffered(struct btree_trans *, - enum btree_id, struct bkey_i *); + +struct jset_entry *__bch2_trans_jset_entry_alloc(struct btree_trans *, unsigned); + +static inline struct jset_entry *btree_trans_journal_entries_top(struct btree_trans *trans) +{ + return (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s); +} + +static inline struct jset_entry * +bch2_trans_jset_entry_alloc(struct btree_trans *trans, unsigned u64s) +{ + if (!trans->journal_entries || + trans->journal_entries_u64s + u64s > trans->journal_entries_size) + return __bch2_trans_jset_entry_alloc(trans, u64s); + + struct jset_entry *e = btree_trans_journal_entries_top(trans); + trans->journal_entries_u64s += u64s; + return e; +} + +int bch2_btree_insert_clone_trans(struct btree_trans *, enum btree_id, struct bkey_i *); + +static inline int __must_check bch2_trans_update_buffered(struct btree_trans *trans, + enum btree_id btree, + struct bkey_i *k) +{ + if (unlikely(trans->journal_replay_not_finished)) + return bch2_btree_insert_clone_trans(trans, btree, k); + + struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(k->k.u64s)); + int ret = PTR_ERR_OR_ZERO(e); + if (ret) + return ret; + + journal_entry_init(e, BCH_JSET_ENTRY_write_buffer_keys, btree, 0, k->k.u64s); + bkey_copy(e->start, k); + return 0; +} void bch2_trans_commit_hook(struct btree_trans *, struct btree_trans_commit_hook *); @@ -157,28 +187,19 @@ static inline int bch2_trans_commit(struct btree_trans *trans, bch2_trans_run(_c, commit_do(trans, _disk_res, _journal_seq, _flags, _do)) #define trans_for_each_update(_trans, _i) \ - for ((_i) = (_trans)->updates; \ + for (struct btree_insert_entry *_i = (_trans)->updates; \ (_i) < (_trans)->updates + (_trans)->nr_updates; \ (_i)++) -#define trans_for_each_wb_update(_trans, _i) \ - for ((_i) = (_trans)->wb_updates; \ - (_i) < (_trans)->wb_updates + (_trans)->nr_wb_updates; \ - (_i)++) - static inline void bch2_trans_reset_updates(struct btree_trans *trans) { - struct btree_insert_entry *i; - trans_for_each_update(trans, i) bch2_path_put(trans, i->path, true); - trans->extra_journal_res = 0; trans->nr_updates = 0; - trans->nr_wb_updates = 0; - trans->wb_updates = NULL; + trans->journal_entries_u64s = 0; trans->hooks = NULL; - trans->extra_journal_entries.nr = 0; + trans->extra_disk_res = 0; if (trans->fs_usage_deltas) { trans->fs_usage_deltas->used = 0; diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 239fcc3c7c99..17a5938aa71a 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -25,24 +25,24 @@ #include <linux/random.h> static int bch2_btree_insert_node(struct btree_update *, struct btree_trans *, - struct btree_path *, struct btree *, + btree_path_idx_t, struct btree *, struct keylist *, unsigned); static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *); -static struct btree_path *get_unlocked_mut_path(struct btree_trans *trans, - enum btree_id btree_id, - unsigned level, - struct bpos pos) +static btree_path_idx_t get_unlocked_mut_path(struct btree_trans *trans, + enum btree_id btree_id, + unsigned level, + struct bpos pos) { - struct btree_path *path; - - path = bch2_path_get(trans, btree_id, pos, level + 1, level, + btree_path_idx_t path_idx = bch2_path_get(trans, btree_id, pos, level + 1, level, BTREE_ITER_NOPRESERVE| BTREE_ITER_INTENT, _RET_IP_); - path = bch2_btree_path_make_mut(trans, path, true, _RET_IP_); + path_idx = bch2_btree_path_make_mut(trans, path_idx, true, _RET_IP_); + + struct btree_path *path = trans->paths + path_idx; bch2_btree_path_downgrade(trans, path); __bch2_btree_path_unlock(trans, path); - return path; + return path_idx; } /* Debug code: */ @@ -159,14 +159,16 @@ static bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b, { size_t u64s = btree_node_u64s_with_format(nr, &b->format, new_f); - return __vstruct_bytes(struct btree_node, u64s) < btree_bytes(c); + return __vstruct_bytes(struct btree_node, u64s) < btree_buf_bytes(b); } /* Btree node freeing/allocation: */ -static void __btree_node_free(struct bch_fs *c, struct btree *b) +static void __btree_node_free(struct btree_trans *trans, struct btree *b) { - trace_and_count(c, btree_node_free, c, b); + struct bch_fs *c = trans->c; + + trace_and_count(c, btree_node_free, trans, b); BUG_ON(btree_node_write_blocked(b)); BUG_ON(btree_node_dirty(b)); @@ -188,15 +190,15 @@ static void bch2_btree_node_free_inmem(struct btree_trans *trans, struct btree *b) { struct bch_fs *c = trans->c; - unsigned level = b->c.level; + unsigned i, level = b->c.level; bch2_btree_node_lock_write_nofail(trans, path, &b->c); bch2_btree_node_hash_remove(&c->btree_cache, b); - __btree_node_free(c, b); + __btree_node_free(trans, b); six_unlock_write(&b->c.lock); mark_btree_node_locked_noreset(path, level, BTREE_NODE_INTENT_LOCKED); - trans_for_each_path(trans, path) + trans_for_each_path(trans, path, i) if (path->l[level].b == b) { btree_node_unlock(trans, path, level); path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init); @@ -210,7 +212,7 @@ static void bch2_btree_node_free_never_used(struct btree_update *as, struct bch_fs *c = as->c; struct prealloc_nodes *p = &as->prealloc_nodes[b->c.lock.readers != NULL]; struct btree_path *path; - unsigned level = b->c.level; + unsigned i, level = b->c.level; BUG_ON(!list_empty(&b->write_blocked)); BUG_ON(b->will_make_reachable != (1UL|(unsigned long) as)); @@ -233,7 +235,7 @@ static void bch2_btree_node_free_never_used(struct btree_update *as, six_unlock_intent(&b->c.lock); - trans_for_each_path(trans, path) + trans_for_each_path(trans, path, i) if (path->l[level].b == b) { btree_node_unlock(trans, path, level); path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init); @@ -363,7 +365,7 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, ret = bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id); BUG_ON(ret); - trace_and_count(c, btree_node_alloc, c, b); + trace_and_count(c, btree_node_alloc, trans, b); bch2_increment_clock(c, btree_sectors(c), WRITE); return b; } @@ -453,7 +455,7 @@ static void bch2_btree_reserve_put(struct btree_update *as, struct btree_trans * btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent); btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write); - __btree_node_free(c, b); + __btree_node_free(trans, b); six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); } @@ -466,7 +468,6 @@ static int bch2_btree_reserve_get(struct btree_trans *trans, unsigned flags, struct closure *cl) { - struct bch_fs *c = as->c; struct btree *b; unsigned interior; int ret = 0; @@ -476,11 +477,8 @@ static int bch2_btree_reserve_get(struct btree_trans *trans, /* * Protects reaping from the btree node cache and using the btree node * open bucket reserve: - * - * BTREE_INSERT_NOWAIT only applies to btree node allocation, not - * blocking on this lock: */ - ret = bch2_btree_cache_cannibalize_lock(c, cl); + ret = bch2_btree_cache_cannibalize_lock(trans, cl); if (ret) return ret; @@ -488,9 +486,8 @@ static int bch2_btree_reserve_get(struct btree_trans *trans, struct prealloc_nodes *p = as->prealloc_nodes + interior; while (p->nr < nr_nodes[interior]) { - b = __bch2_btree_node_alloc(trans, &as->disk_res, - flags & BTREE_INSERT_NOWAIT ? NULL : cl, - interior, flags); + b = __bch2_btree_node_alloc(trans, &as->disk_res, cl, + interior, flags); if (IS_ERR(b)) { ret = PTR_ERR(b); goto err; @@ -500,7 +497,7 @@ static int bch2_btree_reserve_get(struct btree_trans *trans, } } err: - bch2_btree_cache_cannibalize_unlock(c); + bch2_btree_cache_cannibalize_unlock(trans); return ret; } @@ -559,24 +556,20 @@ static void btree_update_add_key(struct btree_update *as, static int btree_update_nodes_written_trans(struct btree_trans *trans, struct btree_update *as) { - struct bkey_i *k; - int ret; - - ret = darray_make_room(&trans->extra_journal_entries, as->journal_u64s); + struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, as->journal_u64s); + int ret = PTR_ERR_OR_ZERO(e); if (ret) return ret; - memcpy(&darray_top(trans->extra_journal_entries), - as->journal_entries, - as->journal_u64s * sizeof(u64)); - trans->extra_journal_entries.nr += as->journal_u64s; + memcpy(e, as->journal_entries, as->journal_u64s * sizeof(u64)); trans->journal_pin = &as->journal; for_each_keylist_key(&as->old_keys, k) { unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr; - ret = bch2_trans_mark_old(trans, as->btree_id, level, bkey_i_to_s_c(k), 0); + ret = bch2_key_trigger_old(trans, as->btree_id, level, bkey_i_to_s_c(k), + BTREE_TRIGGER_TRANSACTIONAL); if (ret) return ret; } @@ -584,7 +577,8 @@ static int btree_update_nodes_written_trans(struct btree_trans *trans, for_each_keylist_key(&as->new_keys, k) { unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr; - ret = bch2_trans_mark_new(trans, as->btree_id, level, k, 0); + ret = bch2_key_trigger_new(trans, as->btree_id, level, bkey_i_to_s(k), + BTREE_TRIGGER_TRANSACTIONAL); if (ret) return ret; } @@ -645,9 +639,9 @@ static void btree_update_nodes_written(struct btree_update *as) */ ret = commit_do(trans, &as->disk_res, &journal_seq, BCH_WATERMARK_reclaim| - BTREE_INSERT_NOFAIL| - BTREE_INSERT_NOCHECK_RW| - BTREE_INSERT_JOURNAL_RECLAIM, + BCH_TRANS_COMMIT_no_enospc| + BCH_TRANS_COMMIT_no_check_rw| + BCH_TRANS_COMMIT_journal_reclaim, btree_update_nodes_written_trans(trans, as)); bch2_trans_unlock(trans); @@ -655,10 +649,11 @@ static void btree_update_nodes_written(struct btree_update *as) "%s(): error %s", __func__, bch2_err_str(ret)); err: if (as->b) { - struct btree_path *path; b = as->b; - path = get_unlocked_mut_path(trans, as->btree_id, b->c.level, b->key.k.p); + btree_path_idx_t path_idx = get_unlocked_mut_path(trans, + as->btree_id, b->c.level, b->key.k.p); + struct btree_path *path = trans->paths + path_idx; /* * @b is the node we did the final insert into: * @@ -728,7 +723,7 @@ err: btree_node_write_if_need(c, b, SIX_LOCK_intent); btree_node_unlock(trans, path, b->c.level); - bch2_path_put(trans, path, true); + bch2_path_put(trans, path_idx, true); } bch2_journal_pin_drop(&c->journal, &as->journal); @@ -815,6 +810,12 @@ static void btree_update_updated_node(struct btree_update *as, struct btree *b) mutex_unlock(&c->btree_interior_update_lock); } +static int bch2_update_reparent_journal_pin_flush(struct journal *j, + struct journal_entry_pin *_pin, u64 seq) +{ + return 0; +} + static void btree_update_reparent(struct btree_update *as, struct btree_update *child) { @@ -825,7 +826,8 @@ static void btree_update_reparent(struct btree_update *as, child->b = NULL; child->mode = BTREE_INTERIOR_UPDATING_AS; - bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal, NULL); + bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal, + bch2_update_reparent_journal_pin_flush); } static void btree_update_updated_root(struct btree_update *as, struct btree *b) @@ -934,6 +936,12 @@ static void bch2_btree_update_get_open_buckets(struct btree_update *as, struct b b->ob.v[--b->ob.nr]; } +static int bch2_btree_update_will_free_node_journal_pin_flush(struct journal *j, + struct journal_entry_pin *_pin, u64 seq) +{ + return 0; +} + /* * @b is being split/rewritten: it may have pointers to not-yet-written btree * nodes and thus outstanding btree_updates - redirect @b's @@ -985,11 +993,13 @@ static void bch2_btree_interior_update_will_free_node(struct btree_update *as, * when the new nodes are persistent and reachable on disk: */ w = btree_current_write(b); - bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL); + bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, + bch2_btree_update_will_free_node_journal_pin_flush); bch2_journal_pin_drop(&c->journal, &w->journal); w = btree_prev_write(b); - bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL); + bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, + bch2_btree_update_will_free_node_journal_pin_flush); bch2_journal_pin_drop(&c->journal, &w->journal); mutex_unlock(&c->btree_interior_update_lock); @@ -1039,7 +1049,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, struct bch_fs *c = trans->c; struct btree_update *as; u64 start_time = local_clock(); - int disk_res_flags = (flags & BTREE_INSERT_NOFAIL) + int disk_res_flags = (flags & BCH_TRANS_COMMIT_no_enospc) ? BCH_DISK_RESERVATION_NOFAIL : 0; unsigned nr_nodes[2] = { 0, 0 }; unsigned update_level = level; @@ -1057,7 +1067,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, flags &= ~BCH_WATERMARK_MASK; flags |= watermark; - if (!(flags & BTREE_INSERT_JOURNAL_RECLAIM) && + if (!(flags & BCH_TRANS_COMMIT_journal_reclaim) && watermark < c->journal.watermark) { struct journal_res res = { 0 }; @@ -1087,16 +1097,14 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, * Always check for space for two keys, even if we won't have to * split at prior level - it might have been a merge instead: */ - if (bch2_btree_node_insert_fits(c, path->l[update_level].b, + if (bch2_btree_node_insert_fits(path->l[update_level].b, BKEY_BTREE_PTR_U64s_MAX * 2)) break; split = path->l[update_level].b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c); } - if (flags & BTREE_INSERT_GC_LOCK_HELD) - lockdep_assert_held(&c->gc_lock); - else if (!down_read_trylock(&c->gc_lock)) { + if (!down_read_trylock(&c->gc_lock)) { ret = drop_locks_do(trans, (down_read(&c->gc_lock), 0)); if (ret) { up_read(&c->gc_lock); @@ -1110,7 +1118,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, as->c = c; as->start_time = start_time; as->mode = BTREE_INTERIOR_NO_UPDATE; - as->took_gc_lock = !(flags & BTREE_INSERT_GC_LOCK_HELD); + as->took_gc_lock = true; as->btree_id = path->btree_id; as->update_level = update_level; INIT_LIST_HEAD(&as->list); @@ -1153,7 +1161,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, * flag */ if (bch2_err_matches(ret, ENOSPC) && - (flags & BTREE_INSERT_JOURNAL_RECLAIM) && + (flags & BCH_TRANS_COMMIT_journal_reclaim) && watermark != BCH_WATERMARK_reclaim) { ret = -BCH_ERR_journal_reclaim_would_deadlock; goto err; @@ -1183,6 +1191,9 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, return as; err: bch2_btree_update_free(as, trans); + if (!bch2_err_matches(ret, ENOSPC) && + !bch2_err_matches(ret, EROFS)) + bch_err_fn_ratelimited(c, ret); return ERR_PTR(ret); } @@ -1214,7 +1225,7 @@ static void bch2_btree_set_root(struct btree_update *as, struct bch_fs *c = as->c; struct btree *old; - trace_and_count(c, btree_node_set_root, c, b); + trace_and_count(c, btree_node_set_root, trans, b); old = btree_node_root(c, b); @@ -1390,7 +1401,7 @@ static void __btree_split_node(struct btree_update *as, unsigned u64s = nr_keys[i].nr_keys * n[i]->data->format.key_u64s + nr_keys[i].val_u64s; - if (__vstruct_bytes(struct btree_node, u64s) > btree_bytes(as->c)) + if (__vstruct_bytes(struct btree_node, u64s) > btree_buf_bytes(b)) n[i]->data->format = b->format; btree_node_set_format(n[i], n[i]->data->format); @@ -1445,10 +1456,12 @@ static void __btree_split_node(struct btree_update *as, */ static void btree_split_insert_keys(struct btree_update *as, struct btree_trans *trans, - struct btree_path *path, + btree_path_idx_t path_idx, struct btree *b, struct keylist *keys) { + struct btree_path *path = trans->paths + path_idx; + if (!bch2_keylist_empty(keys) && bpos_le(bch2_keylist_front(keys)->k.p, b->data->max_key)) { struct btree_node_iter node_iter; @@ -1462,25 +1475,25 @@ static void btree_split_insert_keys(struct btree_update *as, } static int btree_split(struct btree_update *as, struct btree_trans *trans, - struct btree_path *path, struct btree *b, + btree_path_idx_t path, struct btree *b, struct keylist *keys, unsigned flags) { struct bch_fs *c = as->c; - struct btree *parent = btree_node_parent(path, b); + struct btree *parent = btree_node_parent(trans->paths + path, b); struct btree *n1, *n2 = NULL, *n3 = NULL; - struct btree_path *path1 = NULL, *path2 = NULL; + btree_path_idx_t path1 = 0, path2 = 0; u64 start_time = local_clock(); int ret = 0; BUG_ON(!parent && (b != btree_node_root(c, b))); - BUG_ON(parent && !btree_node_intent_locked(path, b->c.level + 1)); + BUG_ON(parent && !btree_node_intent_locked(trans->paths + path, b->c.level + 1)); bch2_btree_interior_update_will_free_node(as, b); if (b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c)) { struct btree *n[2]; - trace_and_count(c, btree_node_split, c, b); + trace_and_count(c, btree_node_split, trans, b); n[0] = n1 = bch2_btree_node_alloc(as, trans, b->c.level); n[1] = n2 = bch2_btree_node_alloc(as, trans, b->c.level); @@ -1501,15 +1514,15 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, six_unlock_write(&n2->c.lock); six_unlock_write(&n1->c.lock); - path1 = get_unlocked_mut_path(trans, path->btree_id, n1->c.level, n1->key.k.p); + path1 = get_unlocked_mut_path(trans, as->btree_id, n1->c.level, n1->key.k.p); six_lock_increment(&n1->c.lock, SIX_LOCK_intent); - mark_btree_node_locked(trans, path1, n1->c.level, BTREE_NODE_INTENT_LOCKED); - bch2_btree_path_level_init(trans, path1, n1); + mark_btree_node_locked(trans, trans->paths + path1, n1->c.level, BTREE_NODE_INTENT_LOCKED); + bch2_btree_path_level_init(trans, trans->paths + path1, n1); - path2 = get_unlocked_mut_path(trans, path->btree_id, n2->c.level, n2->key.k.p); + path2 = get_unlocked_mut_path(trans, as->btree_id, n2->c.level, n2->key.k.p); six_lock_increment(&n2->c.lock, SIX_LOCK_intent); - mark_btree_node_locked(trans, path2, n2->c.level, BTREE_NODE_INTENT_LOCKED); - bch2_btree_path_level_init(trans, path2, n2); + mark_btree_node_locked(trans, trans->paths + path2, n2->c.level, BTREE_NODE_INTENT_LOCKED); + bch2_btree_path_level_init(trans, trans->paths + path2, n2); /* * Note that on recursive parent_keys == keys, so we @@ -1526,11 +1539,11 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, bch2_btree_update_add_new_node(as, n3); six_unlock_write(&n3->c.lock); - path2->locks_want++; - BUG_ON(btree_node_locked(path2, n3->c.level)); + trans->paths[path2].locks_want++; + BUG_ON(btree_node_locked(trans->paths + path2, n3->c.level)); six_lock_increment(&n3->c.lock, SIX_LOCK_intent); - mark_btree_node_locked(trans, path2, n3->c.level, BTREE_NODE_INTENT_LOCKED); - bch2_btree_path_level_init(trans, path2, n3); + mark_btree_node_locked(trans, trans->paths + path2, n3->c.level, BTREE_NODE_INTENT_LOCKED); + bch2_btree_path_level_init(trans, trans->paths + path2, n3); n3->sib_u64s[0] = U16_MAX; n3->sib_u64s[1] = U16_MAX; @@ -1538,7 +1551,7 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, btree_split_insert_keys(as, trans, path, n3, &as->parent_keys); } } else { - trace_and_count(c, btree_node_compact, c, b); + trace_and_count(c, btree_node_compact, trans, b); n1 = bch2_btree_node_alloc_replacement(as, trans, b); @@ -1551,10 +1564,10 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, bch2_btree_update_add_new_node(as, n1); six_unlock_write(&n1->c.lock); - path1 = get_unlocked_mut_path(trans, path->btree_id, n1->c.level, n1->key.k.p); + path1 = get_unlocked_mut_path(trans, as->btree_id, n1->c.level, n1->key.k.p); six_lock_increment(&n1->c.lock, SIX_LOCK_intent); - mark_btree_node_locked(trans, path1, n1->c.level, BTREE_NODE_INTENT_LOCKED); - bch2_btree_path_level_init(trans, path1, n1); + mark_btree_node_locked(trans, trans->paths + path1, n1->c.level, BTREE_NODE_INTENT_LOCKED); + bch2_btree_path_level_init(trans, trans->paths + path1, n1); if (parent) bch2_keylist_add(&as->parent_keys, &n1->key); @@ -1568,10 +1581,10 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, if (ret) goto err; } else if (n3) { - bch2_btree_set_root(as, trans, path, n3); + bch2_btree_set_root(as, trans, trans->paths + path, n3); } else { /* Root filled up but didn't need to be split */ - bch2_btree_set_root(as, trans, path, n1); + bch2_btree_set_root(as, trans, trans->paths + path, n1); } if (n3) { @@ -1591,13 +1604,13 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, * node after another thread has locked and updated the new node, thus * seeing stale data: */ - bch2_btree_node_free_inmem(trans, path, b); + bch2_btree_node_free_inmem(trans, trans->paths + path, b); if (n3) - bch2_trans_node_add(trans, n3); + bch2_trans_node_add(trans, trans->paths + path, n3); if (n2) - bch2_trans_node_add(trans, n2); - bch2_trans_node_add(trans, n1); + bch2_trans_node_add(trans, trans->paths + path2, n2); + bch2_trans_node_add(trans, trans->paths + path1, n1); if (n3) six_unlock_intent(&n3->c.lock); @@ -1606,11 +1619,11 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, six_unlock_intent(&n1->c.lock); out: if (path2) { - __bch2_btree_path_unlock(trans, path2); + __bch2_btree_path_unlock(trans, trans->paths + path2); bch2_path_put(trans, path2, true); } if (path1) { - __bch2_btree_path_unlock(trans, path1); + __bch2_btree_path_unlock(trans, trans->paths + path1); bch2_path_put(trans, path1, true); } @@ -1638,13 +1651,14 @@ bch2_btree_insert_keys_interior(struct btree_update *as, struct keylist *keys) { struct btree_path *linked; + unsigned i; __bch2_btree_insert_keys_interior(as, trans, path, b, path->l[b->c.level].iter, keys); btree_update_updated_node(as, b); - trans_for_each_path_with_node(trans, b, linked) + trans_for_each_path_with_node(trans, b, linked, i) bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b); bch2_trans_verify_paths(trans); @@ -1655,7 +1669,7 @@ bch2_btree_insert_keys_interior(struct btree_update *as, * * @as: btree_update object * @trans: btree_trans object - * @path: path that points to current node + * @path_idx: path that points to current node * @b: node to insert keys into * @keys: list of keys to insert * @flags: transaction commit flags @@ -1667,10 +1681,11 @@ bch2_btree_insert_keys_interior(struct btree_update *as, * for leaf nodes -- inserts into interior nodes have to be atomic. */ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *trans, - struct btree_path *path, struct btree *b, + btree_path_idx_t path_idx, struct btree *b, struct keylist *keys, unsigned flags) { struct bch_fs *c = as->c; + struct btree_path *path = trans->paths + path_idx; int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s); int old_live_u64s = b->nr.live_u64s; int live_u64s_added, u64s_added; @@ -1688,7 +1703,7 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t bch2_btree_node_prep_for_write(trans, path, b); - if (!bch2_btree_node_insert_fits(c, b, bch2_keylist_u64s(keys))) { + if (!bch2_btree_node_insert_fits(b, bch2_keylist_u64s(keys))) { bch2_btree_node_unlock_write(trans, path, b); goto split; } @@ -1723,19 +1738,22 @@ split: return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race); } - return btree_split(as, trans, path, b, keys, flags); + return btree_split(as, trans, path_idx, b, keys, flags); } int bch2_btree_split_leaf(struct btree_trans *trans, - struct btree_path *path, + btree_path_idx_t path, unsigned flags) { - struct btree *b = path_l(path)->b; + /* btree_split & merge may both cause paths array to be reallocated */ + + struct btree *b = path_l(trans->paths + path)->b; struct btree_update *as; unsigned l; int ret = 0; - as = bch2_btree_update_start(trans, path, path->level, + as = bch2_btree_update_start(trans, trans->paths + path, + trans->paths[path].level, true, flags); if (IS_ERR(as)) return PTR_ERR(as); @@ -1748,20 +1766,21 @@ int bch2_btree_split_leaf(struct btree_trans *trans, bch2_btree_update_done(as, trans); - for (l = path->level + 1; btree_node_intent_locked(path, l) && !ret; l++) + for (l = trans->paths[path].level + 1; + btree_node_intent_locked(&trans->paths[path], l) && !ret; + l++) ret = bch2_foreground_maybe_merge(trans, path, l, flags); return ret; } int __bch2_foreground_maybe_merge(struct btree_trans *trans, - struct btree_path *path, + btree_path_idx_t path, unsigned level, unsigned flags, enum btree_node_sibling sib) { struct bch_fs *c = trans->c; - struct btree_path *sib_path = NULL, *new_path = NULL; struct btree_update *as; struct bkey_format_state new_s; struct bkey_format new_f; @@ -1769,13 +1788,15 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, struct btree *b, *m, *n, *prev, *next, *parent; struct bpos sib_pos; size_t sib_u64s; + enum btree_id btree = trans->paths[path].btree_id; + btree_path_idx_t sib_path = 0, new_path = 0; u64 start_time = local_clock(); int ret = 0; - BUG_ON(!path->should_be_locked); - BUG_ON(!btree_node_locked(path, level)); + BUG_ON(!trans->paths[path].should_be_locked); + BUG_ON(!btree_node_locked(&trans->paths[path], level)); - b = path->l[level].b; + b = trans->paths[path].l[level].b; if ((sib == btree_prev_sib && bpos_eq(b->data->min_key, POS_MIN)) || (sib == btree_next_sib && bpos_eq(b->data->max_key, SPOS_MAX))) { @@ -1787,18 +1808,18 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, ? bpos_predecessor(b->data->min_key) : bpos_successor(b->data->max_key); - sib_path = bch2_path_get(trans, path->btree_id, sib_pos, + sib_path = bch2_path_get(trans, btree, sib_pos, U8_MAX, level, BTREE_ITER_INTENT, _THIS_IP_); ret = bch2_btree_path_traverse(trans, sib_path, false); if (ret) goto err; - btree_path_set_should_be_locked(sib_path); + btree_path_set_should_be_locked(trans->paths + sib_path); - m = sib_path->l[level].b; + m = trans->paths[sib_path].l[level].b; - if (btree_node_parent(path, b) != - btree_node_parent(sib_path, m)) { + if (btree_node_parent(trans->paths + path, b) != + btree_node_parent(trans->paths + sib_path, m)) { b->sib_u64s[sib] = U16_MAX; goto out; } @@ -1851,14 +1872,14 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold) goto out; - parent = btree_node_parent(path, b); - as = bch2_btree_update_start(trans, path, level, false, - BTREE_INSERT_NOFAIL|flags); + parent = btree_node_parent(trans->paths + path, b); + as = bch2_btree_update_start(trans, trans->paths + path, level, false, + BCH_TRANS_COMMIT_no_enospc|flags); ret = PTR_ERR_OR_ZERO(as); if (ret) goto err; - trace_and_count(c, btree_node_merge, c, b); + trace_and_count(c, btree_node_merge, trans, b); bch2_btree_interior_update_will_free_node(as, b); bch2_btree_interior_update_will_free_node(as, m); @@ -1882,10 +1903,10 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, bch2_btree_update_add_new_node(as, n); six_unlock_write(&n->c.lock); - new_path = get_unlocked_mut_path(trans, path->btree_id, n->c.level, n->key.k.p); + new_path = get_unlocked_mut_path(trans, btree, n->c.level, n->key.k.p); six_lock_increment(&n->c.lock, SIX_LOCK_intent); - mark_btree_node_locked(trans, new_path, n->c.level, BTREE_NODE_INTENT_LOCKED); - bch2_btree_path_level_init(trans, new_path, n); + mark_btree_node_locked(trans, trans->paths + new_path, n->c.level, BTREE_NODE_INTENT_LOCKED); + bch2_btree_path_level_init(trans, trans->paths + new_path, n); bkey_init(&delete.k); delete.k.p = prev->key.k.p; @@ -1903,10 +1924,10 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, bch2_btree_update_get_open_buckets(as, n); bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); - bch2_btree_node_free_inmem(trans, path, b); - bch2_btree_node_free_inmem(trans, sib_path, m); + bch2_btree_node_free_inmem(trans, trans->paths + path, b); + bch2_btree_node_free_inmem(trans, trans->paths + sib_path, m); - bch2_trans_node_add(trans, n); + bch2_trans_node_add(trans, trans->paths + path, n); bch2_trans_verify_paths(trans); @@ -1934,16 +1955,16 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, unsigned flags) { struct bch_fs *c = trans->c; - struct btree_path *new_path = NULL; struct btree *n, *parent; struct btree_update *as; + btree_path_idx_t new_path = 0; int ret; - flags |= BTREE_INSERT_NOFAIL; + flags |= BCH_TRANS_COMMIT_no_enospc; - parent = btree_node_parent(iter->path, b); - as = bch2_btree_update_start(trans, iter->path, b->c.level, - false, flags); + struct btree_path *path = btree_iter_path(trans, iter); + parent = btree_node_parent(path, b); + as = bch2_btree_update_start(trans, path, b->c.level, false, flags); ret = PTR_ERR_OR_ZERO(as); if (ret) goto out; @@ -1958,27 +1979,27 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, new_path = get_unlocked_mut_path(trans, iter->btree_id, n->c.level, n->key.k.p); six_lock_increment(&n->c.lock, SIX_LOCK_intent); - mark_btree_node_locked(trans, new_path, n->c.level, BTREE_NODE_INTENT_LOCKED); - bch2_btree_path_level_init(trans, new_path, n); + mark_btree_node_locked(trans, trans->paths + new_path, n->c.level, BTREE_NODE_INTENT_LOCKED); + bch2_btree_path_level_init(trans, trans->paths + new_path, n); - trace_and_count(c, btree_node_rewrite, c, b); + trace_and_count(c, btree_node_rewrite, trans, b); if (parent) { bch2_keylist_add(&as->parent_keys, &n->key); - ret = bch2_btree_insert_node(as, trans, iter->path, parent, - &as->parent_keys, flags); + ret = bch2_btree_insert_node(as, trans, iter->path, + parent, &as->parent_keys, flags); if (ret) goto err; } else { - bch2_btree_set_root(as, trans, iter->path, n); + bch2_btree_set_root(as, trans, btree_iter_path(trans, iter), n); } bch2_btree_update_get_open_buckets(as, n); bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); - bch2_btree_node_free_inmem(trans, iter->path, b); + bch2_btree_node_free_inmem(trans, btree_iter_path(trans, iter), b); - bch2_trans_node_add(trans, n); + bch2_trans_node_add(trans, trans->paths + iter->path, n); six_unlock_intent(&n->c.lock); bch2_btree_update_done(as, trans); @@ -2047,8 +2068,7 @@ static void async_btree_node_rewrite_work(struct work_struct *work) ret = bch2_trans_do(c, NULL, NULL, 0, async_btree_node_rewrite_trans(trans, a)); - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); bch2_write_ref_put(c, BCH_WRITE_REF_node_rewrite); kfree(a); } @@ -2071,7 +2091,7 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b) a->seq = b->data->keys.seq; INIT_WORK(&a->work, async_btree_node_rewrite_work); - if (unlikely(!test_bit(BCH_FS_MAY_GO_RW, &c->flags))) { + if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags))) { mutex_lock(&c->pending_node_rewrites_lock); list_add(&a->list, &c->pending_node_rewrites); mutex_unlock(&c->pending_node_rewrites_lock); @@ -2079,15 +2099,15 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b) } if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_node_rewrite)) { - if (test_bit(BCH_FS_STARTED, &c->flags)) { + if (test_bit(BCH_FS_started, &c->flags)) { bch_err(c, "%s: error getting c->writes ref", __func__); kfree(a); return; } ret = bch2_fs_read_write_early(c); + bch_err_msg(c, ret, "going read-write"); if (ret) { - bch_err_msg(c, ret, "going read-write"); kfree(a); return; } @@ -2138,13 +2158,12 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, int ret; if (!skip_triggers) { - ret = bch2_trans_mark_old(trans, b->c.btree_id, b->c.level + 1, - bkey_i_to_s_c(&b->key), 0); - if (ret) - return ret; - - ret = bch2_trans_mark_new(trans, b->c.btree_id, b->c.level + 1, - new_key, 0); + ret = bch2_key_trigger_old(trans, b->c.btree_id, b->c.level + 1, + bkey_i_to_s_c(&b->key), + BTREE_TRIGGER_TRANSACTIONAL) ?: + bch2_key_trigger_new(trans, b->c.btree_id, b->c.level + 1, + bkey_i_to_s(new_key), + BTREE_TRIGGER_TRANSACTIONAL); if (ret) return ret; } @@ -2156,7 +2175,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, BUG_ON(ret); } - parent = btree_node_parent(iter->path, b); + parent = btree_node_parent(btree_iter_path(trans, iter), b); if (parent) { bch2_trans_copy_iter(&iter2, iter); @@ -2164,10 +2183,11 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, iter2.flags & BTREE_ITER_INTENT, _THIS_IP_); - BUG_ON(iter2.path->level != b->c.level); - BUG_ON(!bpos_eq(iter2.path->pos, new_key->k.p)); + struct btree_path *path2 = btree_iter_path(trans, &iter2); + BUG_ON(path2->level != b->c.level); + BUG_ON(!bpos_eq(path2->pos, new_key->k.p)); - btree_path_set_level_up(trans, iter2.path); + btree_path_set_level_up(trans, path2); trans->paths_sorted = false; @@ -2178,23 +2198,23 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, } else { BUG_ON(btree_node_root(c, b) != b); - ret = darray_make_room(&trans->extra_journal_entries, + struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(new_key->k.u64s)); + ret = PTR_ERR_OR_ZERO(e); if (ret) return ret; - journal_entry_set((void *) &darray_top(trans->extra_journal_entries), + journal_entry_set(e, BCH_JSET_ENTRY_btree_root, b->c.btree_id, b->c.level, new_key, new_key->k.u64s); - trans->extra_journal_entries.nr += jset_u64s(new_key->k.u64s); } ret = bch2_trans_commit(trans, NULL, NULL, commit_flags); if (ret) goto err; - bch2_btree_node_lock_write_nofail(trans, iter->path, &b->c); + bch2_btree_node_lock_write_nofail(trans, btree_iter_path(trans, iter), &b->c); if (new_hash) { mutex_lock(&c->btree_cache.lock); @@ -2209,7 +2229,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, bkey_copy(&b->key, new_key); } - bch2_btree_node_unlock_write(trans, iter->path, b); + bch2_btree_node_unlock_write(trans, btree_iter_path(trans, iter), b); out: bch2_trans_iter_exit(trans, &iter2); return ret; @@ -2228,7 +2248,7 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite { struct bch_fs *c = trans->c; struct btree *new_hash = NULL; - struct btree_path *path = iter->path; + struct btree_path *path = btree_iter_path(trans, iter); struct closure cl; int ret = 0; @@ -2243,7 +2263,7 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite * btree_iter_traverse(): */ if (btree_ptr_hash_val(new_key) != b->hash_val) { - ret = bch2_btree_cache_cannibalize_lock(c, &cl); + ret = bch2_btree_cache_cannibalize_lock(trans, &cl); if (ret) { ret = drop_locks_do(trans, (closure_sync(&cl), 0)); if (ret) @@ -2267,7 +2287,7 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite six_unlock_intent(&new_hash->c.lock); } closure_sync(&cl); - bch2_btree_cache_cannibalize_unlock(c); + bch2_btree_cache_cannibalize_unlock(trans); return ret; } @@ -2286,7 +2306,7 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *trans, goto out; /* has node been freed? */ - if (iter.path->l[b->c.level].b != b) { + if (btree_iter_path(trans, &iter)->l[b->c.level].b != b) { /* node has been freed: */ BUG_ON(!btree_node_dying(b)); goto out; @@ -2328,12 +2348,12 @@ static int __bch2_btree_root_alloc(struct btree_trans *trans, enum btree_id id) closure_init_stack(&cl); do { - ret = bch2_btree_cache_cannibalize_lock(c, &cl); + ret = bch2_btree_cache_cannibalize_lock(trans, &cl); closure_sync(&cl); } while (ret); b = bch2_btree_node_mem_alloc(trans, false); - bch2_btree_cache_cannibalize_unlock(c); + bch2_btree_cache_cannibalize_unlock(trans); set_btree_node_fake(b); set_btree_node_need_rewrite(b); diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index a6668992a272..c593c925d1e3 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -117,16 +117,17 @@ struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *, struct btree *, struct bkey_format); -int bch2_btree_split_leaf(struct btree_trans *, struct btree_path *, unsigned); +int bch2_btree_split_leaf(struct btree_trans *, btree_path_idx_t, unsigned); -int __bch2_foreground_maybe_merge(struct btree_trans *, struct btree_path *, +int __bch2_foreground_maybe_merge(struct btree_trans *, btree_path_idx_t, unsigned, unsigned, enum btree_node_sibling); static inline int bch2_foreground_maybe_merge_sibling(struct btree_trans *trans, - struct btree_path *path, + btree_path_idx_t path_idx, unsigned level, unsigned flags, enum btree_node_sibling sib) { + struct btree_path *path = trans->paths + path_idx; struct btree *b; EBUG_ON(!btree_node_locked(path, level)); @@ -135,11 +136,11 @@ static inline int bch2_foreground_maybe_merge_sibling(struct btree_trans *trans, if (b->sib_u64s[sib] > trans->c->btree_foreground_merge_threshold) return 0; - return __bch2_foreground_maybe_merge(trans, path, level, flags, sib); + return __bch2_foreground_maybe_merge(trans, path_idx, level, flags, sib); } static inline int bch2_foreground_maybe_merge(struct btree_trans *trans, - struct btree_path *path, + btree_path_idx_t path, unsigned level, unsigned flags) { @@ -183,21 +184,19 @@ static inline void btree_node_reset_sib_u64s(struct btree *b) b->sib_u64s[1] = b->nr.live_u64s; } -static inline void *btree_data_end(struct bch_fs *c, struct btree *b) +static inline void *btree_data_end(struct btree *b) { - return (void *) b->data + btree_bytes(c); + return (void *) b->data + btree_buf_bytes(b); } -static inline struct bkey_packed *unwritten_whiteouts_start(struct bch_fs *c, - struct btree *b) +static inline struct bkey_packed *unwritten_whiteouts_start(struct btree *b) { - return (void *) ((u64 *) btree_data_end(c, b) - b->whiteout_u64s); + return (void *) ((u64 *) btree_data_end(b) - b->whiteout_u64s); } -static inline struct bkey_packed *unwritten_whiteouts_end(struct bch_fs *c, - struct btree *b) +static inline struct bkey_packed *unwritten_whiteouts_end(struct btree *b) { - return btree_data_end(c, b); + return btree_data_end(b); } static inline void *write_block(struct btree *b) @@ -220,13 +219,11 @@ static inline bool bkey_written(struct btree *b, struct bkey_packed *k) return __btree_addr_written(b, k); } -static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c, - struct btree *b, - void *end) +static inline ssize_t __bch2_btree_u64s_remaining(struct btree *b, void *end) { ssize_t used = bset_byte_offset(b, end) / sizeof(u64) + b->whiteout_u64s; - ssize_t total = c->opts.btree_node_size >> 3; + ssize_t total = btree_buf_bytes(b) >> 3; /* Always leave one extra u64 for bch2_varint_decode: */ used++; @@ -234,10 +231,9 @@ static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c, return total - used; } -static inline size_t bch_btree_keys_u64s_remaining(struct bch_fs *c, - struct btree *b) +static inline size_t bch2_btree_keys_u64s_remaining(struct btree *b) { - ssize_t remaining = __bch_btree_u64s_remaining(c, b, + ssize_t remaining = __bch2_btree_u64s_remaining(b, btree_bkey_last(b, bset_tree_last(b))); BUG_ON(remaining < 0); @@ -259,14 +255,13 @@ static inline unsigned btree_write_set_buffer(struct btree *b) return 8 << BTREE_WRITE_SET_U64s_BITS; } -static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, - struct btree *b) +static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, struct btree *b) { struct bset_tree *t = bset_tree_last(b); struct btree_node_entry *bne = max(write_block(b), (void *) btree_bkey_last(b, bset_tree_last(b))); ssize_t remaining_space = - __bch_btree_u64s_remaining(c, b, bne->keys.start); + __bch2_btree_u64s_remaining(b, bne->keys.start); if (unlikely(bset_written(b, bset(b, t)))) { if (remaining_space > (ssize_t) (block_bytes(c) >> 3)) @@ -280,12 +275,11 @@ static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, return NULL; } -static inline void push_whiteout(struct bch_fs *c, struct btree *b, - struct bpos pos) +static inline void push_whiteout(struct btree *b, struct bpos pos) { struct bkey_packed k; - BUG_ON(bch_btree_keys_u64s_remaining(c, b) < BKEY_U64s); + BUG_ON(bch2_btree_keys_u64s_remaining(b) < BKEY_U64s); EBUG_ON(btree_node_just_written(b)); if (!bkey_pack_pos(&k, pos, b)) { @@ -298,20 +292,19 @@ static inline void push_whiteout(struct bch_fs *c, struct btree *b, k.needs_whiteout = true; b->whiteout_u64s += k.u64s; - bkey_p_copy(unwritten_whiteouts_start(c, b), &k); + bkey_p_copy(unwritten_whiteouts_start(b), &k); } /* * write lock must be held on @b (else the dirty bset that we were going to * insert into could be written out from under us) */ -static inline bool bch2_btree_node_insert_fits(struct bch_fs *c, - struct btree *b, unsigned u64s) +static inline bool bch2_btree_node_insert_fits(struct btree *b, unsigned u64s) { if (unlikely(btree_node_need_rewrite(b))) return false; - return u64s <= bch_btree_keys_u64s_remaining(c, b); + return u64s <= bch2_btree_keys_u64s_remaining(b); } void bch2_btree_updates_to_text(struct printbuf *, struct bch_fs *); diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c index 4e6241db518b..ac7844861966 100644 --- a/fs/bcachefs/btree_write_buffer.c +++ b/fs/bcachefs/btree_write_buffer.c @@ -7,45 +7,143 @@ #include "btree_write_buffer.h" #include "error.h" #include "journal.h" +#include "journal_io.h" #include "journal_reclaim.h" -#include <linux/sort.h> +#include <linux/prefetch.h> -static int btree_write_buffered_key_cmp(const void *_l, const void *_r) +static int bch2_btree_write_buffer_journal_flush(struct journal *, + struct journal_entry_pin *, u64); + +static int bch2_journal_keys_to_write_buffer(struct bch_fs *, struct journal_buf *); + +static inline bool __wb_key_ref_cmp(const struct wb_key_ref *l, const struct wb_key_ref *r) { - const struct btree_write_buffered_key *l = _l; - const struct btree_write_buffered_key *r = _r; + return (cmp_int(l->hi, r->hi) ?: + cmp_int(l->mi, r->mi) ?: + cmp_int(l->lo, r->lo)) >= 0; +} - return cmp_int(l->btree, r->btree) ?: - bpos_cmp(l->k.k.p, r->k.k.p) ?: - cmp_int(l->journal_seq, r->journal_seq) ?: - cmp_int(l->journal_offset, r->journal_offset); +static inline bool wb_key_ref_cmp(const struct wb_key_ref *l, const struct wb_key_ref *r) +{ +#ifdef CONFIG_X86_64 + int cmp; + + asm("mov (%[l]), %%rax;" + "sub (%[r]), %%rax;" + "mov 8(%[l]), %%rax;" + "sbb 8(%[r]), %%rax;" + "mov 16(%[l]), %%rax;" + "sbb 16(%[r]), %%rax;" + : "=@ccae" (cmp) + : [l] "r" (l), [r] "r" (r) + : "rax", "cc"); + + EBUG_ON(cmp != __wb_key_ref_cmp(l, r)); + return cmp; +#else + return __wb_key_ref_cmp(l, r); +#endif } -static int btree_write_buffered_journal_cmp(const void *_l, const void *_r) +/* Compare excluding idx, the low 24 bits: */ +static inline bool wb_key_eq(const void *_l, const void *_r) { - const struct btree_write_buffered_key *l = _l; - const struct btree_write_buffered_key *r = _r; + const struct wb_key_ref *l = _l; + const struct wb_key_ref *r = _r; - return cmp_int(l->journal_seq, r->journal_seq); + return !((l->hi ^ r->hi)| + (l->mi ^ r->mi)| + ((l->lo >> 24) ^ (r->lo >> 24))); } -static int bch2_btree_write_buffer_flush_one(struct btree_trans *trans, - struct btree_iter *iter, - struct btree_write_buffered_key *wb, - unsigned commit_flags, - bool *write_locked, - size_t *fast) +static noinline void wb_sort(struct wb_key_ref *base, size_t num) +{ + size_t n = num, a = num / 2; + + if (!a) /* num < 2 || size == 0 */ + return; + + for (;;) { + size_t b, c, d; + + if (a) /* Building heap: sift down --a */ + --a; + else if (--n) /* Sorting: Extract root to --n */ + swap(base[0], base[n]); + else /* Sort complete */ + break; + + /* + * Sift element at "a" down into heap. This is the + * "bottom-up" variant, which significantly reduces + * calls to cmp_func(): we find the sift-down path all + * the way to the leaves (one compare per level), then + * backtrack to find where to insert the target element. + * + * Because elements tend to sift down close to the leaves, + * this uses fewer compares than doing two per level + * on the way down. (A bit more than half as many on + * average, 3/4 worst-case.) + */ + for (b = a; c = 2*b + 1, (d = c + 1) < n;) + b = wb_key_ref_cmp(base + c, base + d) ? c : d; + if (d == n) /* Special case last leaf with no sibling */ + b = c; + + /* Now backtrack from "b" to the correct location for "a" */ + while (b != a && wb_key_ref_cmp(base + a, base + b)) + b = (b - 1) / 2; + c = b; /* Where "a" belongs */ + while (b != a) { /* Shift it into place */ + b = (b - 1) / 2; + swap(base[b], base[c]); + } + } +} + +static noinline int wb_flush_one_slowpath(struct btree_trans *trans, + struct btree_iter *iter, + struct btree_write_buffered_key *wb) +{ + struct btree_path *path = btree_iter_path(trans, iter); + + bch2_btree_node_unlock_write(trans, path, path->l[0].b); + + trans->journal_res.seq = wb->journal_seq; + + return bch2_trans_update(trans, iter, &wb->k, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: + bch2_trans_commit(trans, NULL, NULL, + BCH_TRANS_COMMIT_no_enospc| + BCH_TRANS_COMMIT_no_check_rw| + BCH_TRANS_COMMIT_no_journal_res| + BCH_TRANS_COMMIT_journal_reclaim); +} + +static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *iter, + struct btree_write_buffered_key *wb, + bool *write_locked, size_t *fast) { - struct bch_fs *c = trans->c; struct btree_path *path; int ret; + EBUG_ON(!wb->journal_seq); + EBUG_ON(!trans->c->btree_write_buffer.flushing.pin.seq); + EBUG_ON(trans->c->btree_write_buffer.flushing.pin.seq > wb->journal_seq); + ret = bch2_btree_iter_traverse(iter); if (ret) return ret; - path = iter->path; + /* + * We can't clone a path that has write locks: unshare it now, before + * set_pos and traverse(): + */ + if (btree_iter_path(trans, iter)->ref > 1) + iter->path = __bch2_btree_path_make_mut(trans, iter->path, true, _THIS_IP_); + + path = btree_iter_path(trans, iter); if (!*write_locked) { ret = bch2_btree_node_lock_write(trans, path, &path->l[0].b->c); @@ -56,52 +154,14 @@ static int bch2_btree_write_buffer_flush_one(struct btree_trans *trans, *write_locked = true; } - if (!bch2_btree_node_insert_fits(c, path->l[0].b, wb->k.k.u64s)) { - bch2_btree_node_unlock_write(trans, path, path->l[0].b); + if (unlikely(!bch2_btree_node_insert_fits(path->l[0].b, wb->k.k.u64s))) { *write_locked = false; - goto trans_commit; + return wb_flush_one_slowpath(trans, iter, wb); } bch2_btree_insert_key_leaf(trans, path, &wb->k, wb->journal_seq); (*fast)++; - - if (path->ref > 1) { - /* - * We can't clone a path that has write locks: if the path is - * shared, unlock before set_pos(), traverse(): - */ - bch2_btree_node_unlock_write(trans, path, path->l[0].b); - *write_locked = false; - } return 0; -trans_commit: - return bch2_trans_update_seq(trans, wb->journal_seq, iter, &wb->k, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: - bch2_trans_commit(trans, NULL, NULL, - commit_flags| - BTREE_INSERT_NOCHECK_RW| - BTREE_INSERT_NOFAIL| - BTREE_INSERT_JOURNAL_RECLAIM); -} - -static union btree_write_buffer_state btree_write_buffer_switch(struct btree_write_buffer *wb) -{ - union btree_write_buffer_state old, new; - u64 v = READ_ONCE(wb->state.v); - - do { - old.v = new.v = v; - - new.nr = 0; - new.idx++; - } while ((v = atomic64_cmpxchg_acquire(&wb->state.counter, old.v, new.v)) != old.v); - - while (old.idx == 0 ? wb->state.ref0 : wb->state.ref1) - cpu_relax(); - - smp_mb(); - - return old; } /* @@ -124,41 +184,87 @@ btree_write_buffered_insert(struct btree_trans *trans, bch2_trans_iter_init(trans, &iter, wb->btree, bkey_start_pos(&wb->k.k), BTREE_ITER_CACHED|BTREE_ITER_INTENT); + trans->journal_res.seq = wb->journal_seq; + ret = bch2_btree_iter_traverse(&iter) ?: - bch2_trans_update_seq(trans, wb->journal_seq, &iter, &wb->k, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + bch2_trans_update(trans, &iter, &wb->k, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); bch2_trans_iter_exit(trans, &iter); return ret; } -int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_flags, - bool locked) +static void move_keys_from_inc_to_flushing(struct btree_write_buffer *wb) +{ + struct bch_fs *c = container_of(wb, struct bch_fs, btree_write_buffer); + struct journal *j = &c->journal; + + if (!wb->inc.keys.nr) + return; + + bch2_journal_pin_add(j, wb->inc.keys.data[0].journal_seq, &wb->flushing.pin, + bch2_btree_write_buffer_journal_flush); + + darray_resize(&wb->flushing.keys, min_t(size_t, 1U << 20, wb->flushing.keys.nr + wb->inc.keys.nr)); + darray_resize(&wb->sorted, wb->flushing.keys.size); + + if (!wb->flushing.keys.nr && wb->sorted.size >= wb->inc.keys.nr) { + swap(wb->flushing.keys, wb->inc.keys); + goto out; + } + + size_t nr = min(darray_room(wb->flushing.keys), + wb->sorted.size - wb->flushing.keys.nr); + nr = min(nr, wb->inc.keys.nr); + + memcpy(&darray_top(wb->flushing.keys), + wb->inc.keys.data, + sizeof(wb->inc.keys.data[0]) * nr); + + memmove(wb->inc.keys.data, + wb->inc.keys.data + nr, + sizeof(wb->inc.keys.data[0]) * (wb->inc.keys.nr - nr)); + + wb->flushing.keys.nr += nr; + wb->inc.keys.nr -= nr; +out: + if (!wb->inc.keys.nr) + bch2_journal_pin_drop(j, &wb->inc.pin); + else + bch2_journal_pin_update(j, wb->inc.keys.data[0].journal_seq, &wb->inc.pin, + bch2_btree_write_buffer_journal_flush); + + if (j->watermark) { + spin_lock(&j->lock); + bch2_journal_set_watermark(j); + spin_unlock(&j->lock); + } + + BUG_ON(wb->sorted.size < wb->flushing.keys.nr); +} + +static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) { struct bch_fs *c = trans->c; struct journal *j = &c->journal; struct btree_write_buffer *wb = &c->btree_write_buffer; - struct journal_entry_pin pin; - struct btree_write_buffered_key *i, *keys; struct btree_iter iter = { NULL }; - size_t nr = 0, skipped = 0, fast = 0, slowpath = 0; + size_t skipped = 0, fast = 0, slowpath = 0; bool write_locked = false; - union btree_write_buffer_state s; int ret = 0; - memset(&pin, 0, sizeof(pin)); - - if (!locked && !mutex_trylock(&wb->flush_lock)) - return 0; - - bch2_journal_pin_copy(j, &pin, &wb->journal_pin, NULL); - bch2_journal_pin_drop(j, &wb->journal_pin); + bch2_trans_unlock(trans); + bch2_trans_begin(trans); - s = btree_write_buffer_switch(wb); - keys = wb->keys[s.idx]; - nr = s.nr; + mutex_lock(&wb->inc.lock); + move_keys_from_inc_to_flushing(wb); + mutex_unlock(&wb->inc.lock); - if (race_fault()) - goto slowpath; + for (size_t i = 0; i < wb->flushing.keys.nr; i++) { + wb->sorted.data[i].idx = i; + wb->sorted.data[i].btree = wb->flushing.keys.data[i].btree; + memcpy(&wb->sorted.data[i].pos, &wb->flushing.keys.data[i].k.k.p, sizeof(struct bpos)); + } + wb->sorted.nr = wb->flushing.keys.nr; /* * We first sort so that we can detect and skip redundant updates, and @@ -168,208 +274,373 @@ int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_f * However, since we're not flushing in the order they appear in the * journal we won't be able to drop our journal pin until everything is * flushed - which means this could deadlock the journal if we weren't - * passing BTREE_INSERT_JOURNAL_RECLAIM. This causes the update to fail + * passing BCH_TRANS_COMMIT_journal_reclaim. This causes the update to fail * if it would block taking a journal reservation. * * If that happens, simply skip the key so we can optimistically insert * as many keys as possible in the fast path. */ - sort(keys, nr, sizeof(keys[0]), - btree_write_buffered_key_cmp, NULL); + wb_sort(wb->sorted.data, wb->sorted.nr); + + darray_for_each(wb->sorted, i) { + struct btree_write_buffered_key *k = &wb->flushing.keys.data[i->idx]; + + for (struct wb_key_ref *n = i + 1; n < min(i + 4, &darray_top(wb->sorted)); n++) + prefetch(&wb->flushing.keys.data[n->idx]); + + BUG_ON(!k->journal_seq); + + if (i + 1 < &darray_top(wb->sorted) && + wb_key_eq(i, i + 1)) { + struct btree_write_buffered_key *n = &wb->flushing.keys.data[i[1].idx]; - for (i = keys; i < keys + nr; i++) { - if (i + 1 < keys + nr && - i[0].btree == i[1].btree && - bpos_eq(i[0].k.k.p, i[1].k.k.p)) { skipped++; - i->journal_seq = 0; + n->journal_seq = min_t(u64, n->journal_seq, k->journal_seq); + k->journal_seq = 0; continue; } - if (write_locked && - (iter.path->btree_id != i->btree || - bpos_gt(i->k.k.p, iter.path->l[0].b->key.k.p))) { - bch2_btree_node_unlock_write(trans, iter.path, iter.path->l[0].b); - write_locked = false; + if (write_locked) { + struct btree_path *path = btree_iter_path(trans, &iter); + + if (path->btree_id != i->btree || + bpos_gt(k->k.k.p, path->l[0].b->key.k.p)) { + bch2_btree_node_unlock_write(trans, path, path->l[0].b); + write_locked = false; + } } - if (!iter.path || iter.path->btree_id != i->btree) { + if (!iter.path || iter.btree_id != k->btree) { bch2_trans_iter_exit(trans, &iter); - bch2_trans_iter_init(trans, &iter, i->btree, i->k.k.p, + bch2_trans_iter_init(trans, &iter, k->btree, k->k.k.p, BTREE_ITER_INTENT|BTREE_ITER_ALL_SNAPSHOTS); } - bch2_btree_iter_set_pos(&iter, i->k.k.p); - iter.path->preserve = false; + bch2_btree_iter_set_pos(&iter, k->k.k.p); + btree_iter_path(trans, &iter)->preserve = false; do { - ret = bch2_btree_write_buffer_flush_one(trans, &iter, i, - commit_flags, &write_locked, &fast); + if (race_fault()) { + ret = -BCH_ERR_journal_reclaim_would_deadlock; + break; + } + + ret = wb_flush_one(trans, &iter, k, &write_locked, &fast); if (!write_locked) bch2_trans_begin(trans); } while (bch2_err_matches(ret, BCH_ERR_transaction_restart)); - if (ret == -BCH_ERR_journal_reclaim_would_deadlock) { + if (!ret) { + k->journal_seq = 0; + } else if (ret == -BCH_ERR_journal_reclaim_would_deadlock) { slowpath++; - continue; - } - if (ret) + ret = 0; + } else break; - - i->journal_seq = 0; } - if (write_locked) - bch2_btree_node_unlock_write(trans, iter.path, iter.path->l[0].b); + if (write_locked) { + struct btree_path *path = btree_iter_path(trans, &iter); + bch2_btree_node_unlock_write(trans, path, path->l[0].b); + } bch2_trans_iter_exit(trans, &iter); - trace_write_buffer_flush(trans, nr, skipped, fast, wb->size); - - if (slowpath) - goto slowpath; + if (ret) + goto err; + if (slowpath) { + /* + * Flush in the order they were present in the journal, so that + * we can release journal pins: + * The fastpath zapped the seq of keys that were successfully flushed so + * we can skip those here. + */ + trace_and_count(c, write_buffer_flush_slowpath, trans, slowpath, wb->flushing.keys.nr); + + darray_for_each(wb->flushing.keys, i) { + if (!i->journal_seq) + continue; + + bch2_journal_pin_update(j, i->journal_seq, &wb->flushing.pin, + bch2_btree_write_buffer_journal_flush); + + bch2_trans_begin(trans); + + ret = commit_do(trans, NULL, NULL, + BCH_WATERMARK_reclaim| + BCH_TRANS_COMMIT_no_check_rw| + BCH_TRANS_COMMIT_no_enospc| + BCH_TRANS_COMMIT_no_journal_res| + BCH_TRANS_COMMIT_journal_reclaim, + btree_write_buffered_insert(trans, i)); + if (ret) + goto err; + } + } +err: bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret)); -out: - bch2_journal_pin_drop(j, &pin); - mutex_unlock(&wb->flush_lock); + trace_write_buffer_flush(trans, wb->flushing.keys.nr, skipped, fast, 0); + bch2_journal_pin_drop(j, &wb->flushing.pin); + wb->flushing.keys.nr = 0; return ret; -slowpath: - trace_write_buffer_flush_slowpath(trans, i - keys, nr); +} - /* - * Now sort the rest by journal seq and bump the journal pin as we go. - * The slowpath zapped the seq of keys that were successfully flushed so - * we can skip those here. - */ - sort(keys, nr, sizeof(keys[0]), - btree_write_buffered_journal_cmp, - NULL); +static int fetch_wb_keys_from_journal(struct bch_fs *c, u64 seq) +{ + struct journal *j = &c->journal; + struct journal_buf *buf; + int ret = 0; - commit_flags &= ~BCH_WATERMARK_MASK; - commit_flags |= BCH_WATERMARK_reclaim; + while (!ret && (buf = bch2_next_write_buffer_flush_journal_buf(j, seq))) { + ret = bch2_journal_keys_to_write_buffer(c, buf); + mutex_unlock(&j->buf_lock); + } - for (i = keys; i < keys + nr; i++) { - if (!i->journal_seq) - continue; + return ret; +} - if (i->journal_seq > pin.seq) { - struct journal_entry_pin pin2; +static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 seq) +{ + struct bch_fs *c = trans->c; + struct btree_write_buffer *wb = &c->btree_write_buffer; + int ret = 0, fetch_from_journal_err; - memset(&pin2, 0, sizeof(pin2)); + do { + bch2_trans_unlock(trans); - bch2_journal_pin_add(j, i->journal_seq, &pin2, NULL); - bch2_journal_pin_drop(j, &pin); - bch2_journal_pin_copy(j, &pin, &pin2, NULL); - bch2_journal_pin_drop(j, &pin2); - } + fetch_from_journal_err = fetch_wb_keys_from_journal(c, seq); - ret = commit_do(trans, NULL, NULL, - commit_flags| - BTREE_INSERT_NOFAIL| - BTREE_INSERT_JOURNAL_RECLAIM, - btree_write_buffered_insert(trans, i)); - if (bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret))) - break; - } + /* + * On memory allocation failure, bch2_btree_write_buffer_flush_locked() + * is not guaranteed to empty wb->inc: + */ + mutex_lock(&wb->flushing.lock); + ret = bch2_btree_write_buffer_flush_locked(trans); + mutex_unlock(&wb->flushing.lock); + } while (!ret && + (fetch_from_journal_err || + (wb->inc.pin.seq && wb->inc.pin.seq <= seq) || + (wb->flushing.pin.seq && wb->flushing.pin.seq <= seq))); - goto out; + return ret; } -int bch2_btree_write_buffer_flush_sync(struct btree_trans *trans) +static int bch2_btree_write_buffer_journal_flush(struct journal *j, + struct journal_entry_pin *_pin, u64 seq) { - bch2_trans_unlock(trans); - mutex_lock(&trans->c->btree_write_buffer.flush_lock); - return __bch2_btree_write_buffer_flush(trans, 0, true); + struct bch_fs *c = container_of(j, struct bch_fs, journal); + + return bch2_trans_run(c, btree_write_buffer_flush_seq(trans, seq)); } -int bch2_btree_write_buffer_flush(struct btree_trans *trans) +int bch2_btree_write_buffer_flush_sync(struct btree_trans *trans) { - return __bch2_btree_write_buffer_flush(trans, 0, false); + struct bch_fs *c = trans->c; + + trace_and_count(c, write_buffer_flush_sync, trans, _RET_IP_); + + return btree_write_buffer_flush_seq(trans, journal_cur_seq(&c->journal)); } -static int bch2_btree_write_buffer_journal_flush(struct journal *j, - struct journal_entry_pin *_pin, u64 seq) +int bch2_btree_write_buffer_flush_nocheck_rw(struct btree_trans *trans) { - struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct bch_fs *c = trans->c; struct btree_write_buffer *wb = &c->btree_write_buffer; + int ret = 0; - mutex_lock(&wb->flush_lock); + if (mutex_trylock(&wb->flushing.lock)) { + ret = bch2_btree_write_buffer_flush_locked(trans); + mutex_unlock(&wb->flushing.lock); + } - return bch2_trans_run(c, - __bch2_btree_write_buffer_flush(trans, BTREE_INSERT_NOCHECK_RW, true)); + return ret; } -static inline u64 btree_write_buffer_ref(int idx) +int bch2_btree_write_buffer_tryflush(struct btree_trans *trans) { - return ((union btree_write_buffer_state) { - .ref0 = idx == 0, - .ref1 = idx == 1, - }).v; + struct bch_fs *c = trans->c; + + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_btree_write_buffer)) + return -BCH_ERR_erofs_no_writes; + + int ret = bch2_btree_write_buffer_flush_nocheck_rw(trans); + bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer); + return ret; } -int bch2_btree_insert_keys_write_buffer(struct btree_trans *trans) +static void bch2_btree_write_buffer_flush_work(struct work_struct *work) { - struct bch_fs *c = trans->c; + struct bch_fs *c = container_of(work, struct bch_fs, btree_write_buffer.flush_work); struct btree_write_buffer *wb = &c->btree_write_buffer; - struct btree_write_buffered_key *i; - union btree_write_buffer_state old, new; - int ret = 0; - u64 v; + int ret; + + mutex_lock(&wb->flushing.lock); + do { + ret = bch2_trans_run(c, bch2_btree_write_buffer_flush_locked(trans)); + } while (!ret && bch2_btree_write_buffer_should_flush(c)); + mutex_unlock(&wb->flushing.lock); + + bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer); +} - trans_for_each_wb_update(trans, i) { - EBUG_ON(i->k.k.u64s > BTREE_WRITE_BUFERED_U64s_MAX); +int bch2_journal_key_to_wb_slowpath(struct bch_fs *c, + struct journal_keys_to_wb *dst, + enum btree_id btree, struct bkey_i *k) +{ + struct btree_write_buffer *wb = &c->btree_write_buffer; + int ret; +retry: + ret = darray_make_room_gfp(&dst->wb->keys, 1, GFP_KERNEL); + if (!ret && dst->wb == &wb->flushing) + ret = darray_resize(&wb->sorted, wb->flushing.keys.size); + + if (unlikely(ret)) { + if (dst->wb == &c->btree_write_buffer.flushing) { + mutex_unlock(&dst->wb->lock); + dst->wb = &c->btree_write_buffer.inc; + bch2_journal_pin_add(&c->journal, dst->seq, &dst->wb->pin, + bch2_btree_write_buffer_journal_flush); + goto retry; + } - i->journal_seq = trans->journal_res.seq; - i->journal_offset = trans->journal_res.offset; + return ret; } - preempt_disable(); - v = READ_ONCE(wb->state.v); - do { - old.v = new.v = v; + dst->room = darray_room(dst->wb->keys); + if (dst->wb == &wb->flushing) + dst->room = min(dst->room, wb->sorted.size - wb->flushing.keys.nr); + BUG_ON(!dst->room); + BUG_ON(!dst->seq); + + struct btree_write_buffered_key *wb_k = &darray_top(dst->wb->keys); + wb_k->journal_seq = dst->seq; + wb_k->btree = btree; + bkey_copy(&wb_k->k, k); + dst->wb->keys.nr++; + dst->room--; + return 0; +} + +void bch2_journal_keys_to_write_buffer_start(struct bch_fs *c, struct journal_keys_to_wb *dst, u64 seq) +{ + struct btree_write_buffer *wb = &c->btree_write_buffer; + + if (mutex_trylock(&wb->flushing.lock)) { + mutex_lock(&wb->inc.lock); + move_keys_from_inc_to_flushing(wb); - new.v += btree_write_buffer_ref(new.idx); - new.nr += trans->nr_wb_updates; - if (new.nr > wb->size) { - ret = -BCH_ERR_btree_insert_need_flush_buffer; - goto out; + /* + * Attempt to skip wb->inc, and add keys directly to + * wb->flushing, saving us a copy later: + */ + + if (!wb->inc.keys.nr) { + dst->wb = &wb->flushing; + } else { + mutex_unlock(&wb->flushing.lock); + dst->wb = &wb->inc; } - } while ((v = atomic64_cmpxchg_acquire(&wb->state.counter, old.v, new.v)) != old.v); + } else { + mutex_lock(&wb->inc.lock); + dst->wb = &wb->inc; + } - memcpy(wb->keys[new.idx] + old.nr, - trans->wb_updates, - sizeof(trans->wb_updates[0]) * trans->nr_wb_updates); + dst->room = darray_room(dst->wb->keys); + if (dst->wb == &wb->flushing) + dst->room = min(dst->room, wb->sorted.size - wb->flushing.keys.nr); + dst->seq = seq; - bch2_journal_pin_add(&c->journal, trans->journal_res.seq, &wb->journal_pin, + bch2_journal_pin_add(&c->journal, seq, &dst->wb->pin, bch2_btree_write_buffer_journal_flush); +} - atomic64_sub_return_release(btree_write_buffer_ref(new.idx), &wb->state.counter); +void bch2_journal_keys_to_write_buffer_end(struct bch_fs *c, struct journal_keys_to_wb *dst) +{ + struct btree_write_buffer *wb = &c->btree_write_buffer; + + if (!dst->wb->keys.nr) + bch2_journal_pin_drop(&c->journal, &dst->wb->pin); + + if (bch2_btree_write_buffer_should_flush(c) && + __bch2_write_ref_tryget(c, BCH_WRITE_REF_btree_write_buffer) && + !queue_work(system_unbound_wq, &c->btree_write_buffer.flush_work)) + bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer); + + if (dst->wb == &wb->flushing) + mutex_unlock(&wb->flushing.lock); + mutex_unlock(&wb->inc.lock); +} + +static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_buf *buf) +{ + struct journal_keys_to_wb dst; + struct jset_entry *entry; + struct bkey_i *k; + int ret = 0; + + bch2_journal_keys_to_write_buffer_start(c, &dst, le64_to_cpu(buf->data->seq)); + + for_each_jset_entry_type(entry, buf->data, BCH_JSET_ENTRY_write_buffer_keys) { + jset_entry_for_each_key(entry, k) { + ret = bch2_journal_key_to_wb(c, &dst, entry->btree_id, k); + if (ret) + goto out; + } + + entry->type = BCH_JSET_ENTRY_btree_keys; + } + + buf->need_flush_to_write_buffer = false; out: - preempt_enable(); + bch2_journal_keys_to_write_buffer_end(c, &dst); + return ret; +} + +static int wb_keys_resize(struct btree_write_buffer_keys *wb, size_t new_size) +{ + if (wb->keys.size >= new_size) + return 0; + + if (!mutex_trylock(&wb->lock)) + return -EINTR; + + int ret = darray_resize(&wb->keys, new_size); + mutex_unlock(&wb->lock); return ret; } +int bch2_btree_write_buffer_resize(struct bch_fs *c, size_t new_size) +{ + struct btree_write_buffer *wb = &c->btree_write_buffer; + + return wb_keys_resize(&wb->flushing, new_size) ?: + wb_keys_resize(&wb->inc, new_size); +} + void bch2_fs_btree_write_buffer_exit(struct bch_fs *c) { struct btree_write_buffer *wb = &c->btree_write_buffer; - BUG_ON(wb->state.nr && !bch2_journal_error(&c->journal)); + BUG_ON((wb->inc.keys.nr || wb->flushing.keys.nr) && + !bch2_journal_error(&c->journal)); - kvfree(wb->keys[1]); - kvfree(wb->keys[0]); + darray_exit(&wb->sorted); + darray_exit(&wb->flushing.keys); + darray_exit(&wb->inc.keys); } int bch2_fs_btree_write_buffer_init(struct bch_fs *c) { struct btree_write_buffer *wb = &c->btree_write_buffer; - mutex_init(&wb->flush_lock); - wb->size = c->opts.btree_write_buffer_size; + mutex_init(&wb->inc.lock); + mutex_init(&wb->flushing.lock); + INIT_WORK(&wb->flush_work, bch2_btree_write_buffer_flush_work); - wb->keys[0] = kvmalloc_array(wb->size, sizeof(*wb->keys[0]), GFP_KERNEL); - wb->keys[1] = kvmalloc_array(wb->size, sizeof(*wb->keys[1]), GFP_KERNEL); - if (!wb->keys[0] || !wb->keys[1]) - return -BCH_ERR_ENOMEM_fs_btree_write_buffer_init; + /* Will be resized by journal as needed: */ + unsigned initial_size = 1 << 16; - return 0; + return darray_make_room(&wb->inc.keys, initial_size) ?: + darray_make_room(&wb->flushing.keys, initial_size) ?: + darray_make_room(&wb->sorted, initial_size); } diff --git a/fs/bcachefs/btree_write_buffer.h b/fs/bcachefs/btree_write_buffer.h index 322df1c8304e..eebcd2b15249 100644 --- a/fs/bcachefs/btree_write_buffer.h +++ b/fs/bcachefs/btree_write_buffer.h @@ -2,12 +2,59 @@ #ifndef _BCACHEFS_BTREE_WRITE_BUFFER_H #define _BCACHEFS_BTREE_WRITE_BUFFER_H -int __bch2_btree_write_buffer_flush(struct btree_trans *, unsigned, bool); +#include "bkey.h" + +static inline bool bch2_btree_write_buffer_should_flush(struct bch_fs *c) +{ + struct btree_write_buffer *wb = &c->btree_write_buffer; + + return wb->inc.keys.nr + wb->flushing.keys.nr > wb->inc.keys.size / 4; +} + +static inline bool bch2_btree_write_buffer_must_wait(struct bch_fs *c) +{ + struct btree_write_buffer *wb = &c->btree_write_buffer; + + return wb->inc.keys.nr > wb->inc.keys.size * 3 / 4; +} + +struct btree_trans; int bch2_btree_write_buffer_flush_sync(struct btree_trans *); -int bch2_btree_write_buffer_flush(struct btree_trans *); +int bch2_btree_write_buffer_flush_nocheck_rw(struct btree_trans *); +int bch2_btree_write_buffer_tryflush(struct btree_trans *); + +struct journal_keys_to_wb { + struct btree_write_buffer_keys *wb; + size_t room; + u64 seq; +}; + +int bch2_journal_key_to_wb_slowpath(struct bch_fs *, + struct journal_keys_to_wb *, + enum btree_id, struct bkey_i *); + +static inline int bch2_journal_key_to_wb(struct bch_fs *c, + struct journal_keys_to_wb *dst, + enum btree_id btree, struct bkey_i *k) +{ + EBUG_ON(!dst->seq); + + if (unlikely(!dst->room)) + return bch2_journal_key_to_wb_slowpath(c, dst, btree, k); + + struct btree_write_buffered_key *wb_k = &darray_top(dst->wb->keys); + wb_k->journal_seq = dst->seq; + wb_k->btree = btree; + bkey_copy(&wb_k->k, k); + dst->wb->keys.nr++; + dst->room--; + return 0; +} -int bch2_btree_insert_keys_write_buffer(struct btree_trans *); +void bch2_journal_keys_to_write_buffer_start(struct bch_fs *, struct journal_keys_to_wb *, u64); +void bch2_journal_keys_to_write_buffer_end(struct bch_fs *, struct journal_keys_to_wb *); +int bch2_btree_write_buffer_resize(struct bch_fs *, size_t); void bch2_fs_btree_write_buffer_exit(struct bch_fs *); int bch2_fs_btree_write_buffer_init(struct bch_fs *); diff --git a/fs/bcachefs/btree_write_buffer_types.h b/fs/bcachefs/btree_write_buffer_types.h index 99993ba77aea..9b9433de9c36 100644 --- a/fs/bcachefs/btree_write_buffer_types.h +++ b/fs/bcachefs/btree_write_buffer_types.h @@ -2,43 +2,56 @@ #ifndef _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H #define _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H +#include "darray.h" #include "journal_types.h" #define BTREE_WRITE_BUFERED_VAL_U64s_MAX 4 #define BTREE_WRITE_BUFERED_U64s_MAX (BKEY_U64s + BTREE_WRITE_BUFERED_VAL_U64s_MAX) -struct btree_write_buffered_key { - u64 journal_seq; - unsigned journal_offset; - enum btree_id btree; - __BKEY_PADDED(k, BTREE_WRITE_BUFERED_VAL_U64s_MAX); -}; - -union btree_write_buffer_state { +struct wb_key_ref { +union { struct { - atomic64_t counter; - }; - - struct { - u64 v; - }; - +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + unsigned idx:24; + u8 pos[sizeof(struct bpos)]; + enum btree_id btree:8; +#else + enum btree_id btree:8; + u8 pos[sizeof(struct bpos)]; + unsigned idx:24; +#endif + } __packed; struct { - u64 nr:23; - u64 idx:1; - u64 ref0:20; - u64 ref1:20; +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + u64 lo; + u64 mi; + u64 hi; +#else + u64 hi; + u64 mi; + u64 lo; +#endif }; }; +}; -struct btree_write_buffer { - struct mutex flush_lock; - struct journal_entry_pin journal_pin; +struct btree_write_buffered_key { + enum btree_id btree:8; + u64 journal_seq:56; + __BKEY_PADDED(k, BTREE_WRITE_BUFERED_VAL_U64s_MAX); +}; - union btree_write_buffer_state state; - size_t size; +struct btree_write_buffer_keys { + DARRAY(struct btree_write_buffered_key) keys; + struct journal_entry_pin pin; + struct mutex lock; +}; - struct btree_write_buffered_key *keys[2]; +struct btree_write_buffer { + DARRAY(struct wb_key_ref) sorted; + struct btree_write_buffer_keys inc; + struct btree_write_buffer_keys flushing; + struct work_struct flush_work; }; #endif /* _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H */ diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 5a91d3189fcf..54f7826ac498 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -25,7 +25,7 @@ #include <linux/preempt.h> -static inline void fs_usage_data_type_to_base(struct bch_fs_usage *fs_usage, +static inline void fs_usage_data_type_to_base(struct bch_fs_usage_base *fs_usage, enum bch_data_type data_type, s64 sectors) { @@ -47,31 +47,27 @@ static inline void fs_usage_data_type_to_base(struct bch_fs_usage *fs_usage, void bch2_fs_usage_initialize(struct bch_fs *c) { - struct bch_fs_usage *usage; - struct bch_dev *ca; - unsigned i; - percpu_down_write(&c->mark_lock); - usage = c->usage_base; + struct bch_fs_usage *usage = c->usage_base; - for (i = 0; i < ARRAY_SIZE(c->usage); i++) + for (unsigned i = 0; i < ARRAY_SIZE(c->usage); i++) bch2_fs_usage_acc_to_base(c, i); - for (i = 0; i < BCH_REPLICAS_MAX; i++) - usage->reserved += usage->persistent_reserved[i]; + for (unsigned i = 0; i < BCH_REPLICAS_MAX; i++) + usage->b.reserved += usage->persistent_reserved[i]; - for (i = 0; i < c->replicas.nr; i++) { - struct bch_replicas_entry *e = + for (unsigned i = 0; i < c->replicas.nr; i++) { + struct bch_replicas_entry_v1 *e = cpu_replicas_entry(&c->replicas, i); - fs_usage_data_type_to_base(usage, e->data_type, usage->replicas[i]); + fs_usage_data_type_to_base(&usage->b, e->data_type, usage->replicas[i]); } - for_each_member_device(ca, c, i) { + for_each_member_device(c, ca) { struct bch_dev_usage dev = bch2_dev_usage_read(ca); - usage->hidden += (dev.d[BCH_DATA_sb].buckets + - dev.d[BCH_DATA_journal].buckets) * + usage->b.hidden += (dev.d[BCH_DATA_sb].buckets + + dev.d[BCH_DATA_journal].buckets) * ca->mi.bucket_size; } @@ -158,8 +154,7 @@ retry: void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx) { - struct bch_dev *ca; - unsigned i, u64s = fs_usage_u64s(c); + unsigned u64s = fs_usage_u64s(c); BUG_ON(idx >= ARRAY_SIZE(c->usage)); @@ -171,7 +166,7 @@ void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx) percpu_memset(c->usage[idx], 0, u64s * sizeof(u64)); rcu_read_lock(); - for_each_member_device_rcu(ca, c, i, NULL) { + for_each_member_device_rcu(c, ca, NULL) { u64s = dev_usage_u64s(); acc_u64s_percpu((u64 *) ca->usage_base, @@ -193,15 +188,15 @@ void bch2_fs_usage_to_text(struct printbuf *out, prt_printf(out, "capacity:\t\t\t%llu\n", c->capacity); prt_printf(out, "hidden:\t\t\t\t%llu\n", - fs_usage->u.hidden); + fs_usage->u.b.hidden); prt_printf(out, "data:\t\t\t\t%llu\n", - fs_usage->u.data); + fs_usage->u.b.data); prt_printf(out, "cached:\t\t\t\t%llu\n", - fs_usage->u.cached); + fs_usage->u.b.cached); prt_printf(out, "reserved:\t\t\t%llu\n", - fs_usage->u.reserved); + fs_usage->u.b.reserved); prt_printf(out, "nr_inodes:\t\t\t%llu\n", - fs_usage->u.nr_inodes); + fs_usage->u.b.nr_inodes); prt_printf(out, "online reserved:\t\t%llu\n", fs_usage->online_reserved); @@ -214,7 +209,7 @@ void bch2_fs_usage_to_text(struct printbuf *out, } for (i = 0; i < c->replicas.nr; i++) { - struct bch_replicas_entry *e = + struct bch_replicas_entry_v1 *e = cpu_replicas_entry(&c->replicas, i); prt_printf(out, "\t"); @@ -230,10 +225,10 @@ static u64 reserve_factor(u64 r) u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage_online *fs_usage) { - return min(fs_usage->u.hidden + - fs_usage->u.btree + - fs_usage->u.data + - reserve_factor(fs_usage->u.reserved + + return min(fs_usage->u.b.hidden + + fs_usage->u.b.btree + + fs_usage->u.b.data + + reserve_factor(fs_usage->u.b.reserved + fs_usage->online_reserved), c->capacity); } @@ -245,17 +240,17 @@ __bch2_fs_usage_read_short(struct bch_fs *c) u64 data, reserved; ret.capacity = c->capacity - - bch2_fs_usage_read_one(c, &c->usage_base->hidden); + bch2_fs_usage_read_one(c, &c->usage_base->b.hidden); - data = bch2_fs_usage_read_one(c, &c->usage_base->data) + - bch2_fs_usage_read_one(c, &c->usage_base->btree); - reserved = bch2_fs_usage_read_one(c, &c->usage_base->reserved) + + data = bch2_fs_usage_read_one(c, &c->usage_base->b.data) + + bch2_fs_usage_read_one(c, &c->usage_base->b.btree); + reserved = bch2_fs_usage_read_one(c, &c->usage_base->b.reserved) + percpu_u64_get(c->online_reserved); ret.used = min(ret.capacity, data + reserve_factor(reserved)); ret.free = ret.capacity - ret.used; - ret.nr_inodes = bch2_fs_usage_read_one(c, &c->usage_base->nr_inodes); + ret.nr_inodes = bch2_fs_usage_read_one(c, &c->usage_base->b.nr_inodes); return ret; } @@ -277,18 +272,34 @@ void bch2_dev_usage_init(struct bch_dev *ca) ca->usage_base->d[BCH_DATA_free].buckets = ca->mi.nbuckets - ca->mi.first_bucket; } -static inline int bucket_sectors_fragmented(struct bch_dev *ca, - struct bch_alloc_v4 a) +void bch2_dev_usage_to_text(struct printbuf *out, struct bch_dev_usage *usage) { - return a.dirty_sectors - ? max(0, (int) ca->mi.bucket_size - (int) a.dirty_sectors) - : 0; + prt_tab(out); + prt_str(out, "buckets"); + prt_tab_rjust(out); + prt_str(out, "sectors"); + prt_tab_rjust(out); + prt_str(out, "fragmented"); + prt_tab_rjust(out); + prt_newline(out); + + for (unsigned i = 0; i < BCH_DATA_NR; i++) { + bch2_prt_data_type(out, i); + prt_tab(out); + prt_u64(out, usage->d[i].buckets); + prt_tab_rjust(out); + prt_u64(out, usage->d[i].sectors); + prt_tab_rjust(out); + prt_u64(out, usage->d[i].fragmented); + prt_tab_rjust(out); + prt_newline(out); + } } -static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, - struct bch_alloc_v4 old, - struct bch_alloc_v4 new, - u64 journal_seq, bool gc) +void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, + const struct bch_alloc_v4 *old, + const struct bch_alloc_v4 *new, + u64 journal_seq, bool gc) { struct bch_fs_usage *fs_usage; struct bch_dev_usage *u; @@ -296,56 +307,51 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, preempt_disable(); fs_usage = fs_usage_ptr(c, journal_seq, gc); - if (data_type_is_hidden(old.data_type)) - fs_usage->hidden -= ca->mi.bucket_size; - if (data_type_is_hidden(new.data_type)) - fs_usage->hidden += ca->mi.bucket_size; + if (data_type_is_hidden(old->data_type)) + fs_usage->b.hidden -= ca->mi.bucket_size; + if (data_type_is_hidden(new->data_type)) + fs_usage->b.hidden += ca->mi.bucket_size; u = dev_usage_ptr(ca, journal_seq, gc); - u->d[old.data_type].buckets--; - u->d[new.data_type].buckets++; - - u->buckets_ec -= (int) !!old.stripe; - u->buckets_ec += (int) !!new.stripe; + u->d[old->data_type].buckets--; + u->d[new->data_type].buckets++; - u->d[old.data_type].sectors -= old.dirty_sectors; - u->d[new.data_type].sectors += new.dirty_sectors; + u->d[old->data_type].sectors -= bch2_bucket_sectors_dirty(*old); + u->d[new->data_type].sectors += bch2_bucket_sectors_dirty(*new); - u->d[BCH_DATA_cached].sectors += new.cached_sectors; - u->d[BCH_DATA_cached].sectors -= old.cached_sectors; + u->d[BCH_DATA_cached].sectors += new->cached_sectors; + u->d[BCH_DATA_cached].sectors -= old->cached_sectors; - u->d[old.data_type].fragmented -= bucket_sectors_fragmented(ca, old); - u->d[new.data_type].fragmented += bucket_sectors_fragmented(ca, new); + u->d[old->data_type].fragmented -= bch2_bucket_sectors_fragmented(ca, *old); + u->d[new->data_type].fragmented += bch2_bucket_sectors_fragmented(ca, *new); preempt_enable(); } -static void bch2_dev_usage_update_m(struct bch_fs *c, struct bch_dev *ca, - struct bucket old, struct bucket new, - u64 journal_seq, bool gc) +static inline struct bch_alloc_v4 bucket_m_to_alloc(struct bucket b) { - struct bch_alloc_v4 old_a = { - .gen = old.gen, - .data_type = old.data_type, - .dirty_sectors = old.dirty_sectors, - .cached_sectors = old.cached_sectors, - .stripe = old.stripe, - }; - struct bch_alloc_v4 new_a = { - .gen = new.gen, - .data_type = new.data_type, - .dirty_sectors = new.dirty_sectors, - .cached_sectors = new.cached_sectors, - .stripe = new.stripe, + return (struct bch_alloc_v4) { + .gen = b.gen, + .data_type = b.data_type, + .dirty_sectors = b.dirty_sectors, + .cached_sectors = b.cached_sectors, + .stripe = b.stripe, }; +} - bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, gc); +void bch2_dev_usage_update_m(struct bch_fs *c, struct bch_dev *ca, + struct bucket *old, struct bucket *new) +{ + struct bch_alloc_v4 old_a = bucket_m_to_alloc(*old); + struct bch_alloc_v4 new_a = bucket_m_to_alloc(*new); + + bch2_dev_usage_update(c, ca, &old_a, &new_a, 0, true); } static inline int __update_replicas(struct bch_fs *c, struct bch_fs_usage *fs_usage, - struct bch_replicas_entry *r, + struct bch_replicas_entry_v1 *r, s64 sectors) { int idx = bch2_replicas_entry_idx(c, r); @@ -353,14 +359,14 @@ static inline int __update_replicas(struct bch_fs *c, if (idx < 0) return -1; - fs_usage_data_type_to_base(fs_usage, r->data_type, sectors); + fs_usage_data_type_to_base(&fs_usage->b, r->data_type, sectors); fs_usage->replicas[idx] += sectors; return 0; } -static inline int update_replicas(struct bch_fs *c, struct bkey_s_c k, - struct bch_replicas_entry *r, s64 sectors, - unsigned journal_seq, bool gc) +int bch2_update_replicas(struct bch_fs *c, struct bkey_s_c k, + struct bch_replicas_entry_v1 *r, s64 sectors, + unsigned journal_seq, bool gc) { struct bch_fs_usage *fs_usage; int idx, ret = 0; @@ -388,7 +394,7 @@ static inline int update_replicas(struct bch_fs *c, struct bkey_s_c k, preempt_disable(); fs_usage = fs_usage_ptr(c, journal_seq, gc); - fs_usage_data_type_to_base(fs_usage, r->data_type, sectors); + fs_usage_data_type_to_base(&fs_usage->b, r->data_type, sectors); fs_usage->replicas[idx] += sectors; preempt_enable(); err: @@ -407,7 +413,7 @@ static inline int update_cached_sectors(struct bch_fs *c, bch2_replicas_entry_cached(&r.e, dev); - return update_replicas(c, k, &r.e, sectors, journal_seq, gc); + return bch2_update_replicas(c, k, &r.e, sectors, journal_seq, gc); } static int __replicas_deltas_realloc(struct btree_trans *trans, unsigned more, @@ -453,9 +459,9 @@ int bch2_replicas_deltas_realloc(struct btree_trans *trans, unsigned more) __replicas_deltas_realloc(trans, more, _gfp)); } -static inline int update_replicas_list(struct btree_trans *trans, - struct bch_replicas_entry *r, - s64 sectors) +int bch2_update_replicas_list(struct btree_trans *trans, + struct bch_replicas_entry_v1 *r, + s64 sectors) { struct replicas_delta_list *d; struct replicas_delta *n; @@ -481,139 +487,13 @@ static inline int update_replicas_list(struct btree_trans *trans, return 0; } -static inline int update_cached_sectors_list(struct btree_trans *trans, - unsigned dev, s64 sectors) +int bch2_update_cached_sectors_list(struct btree_trans *trans, unsigned dev, s64 sectors) { struct bch_replicas_padded r; bch2_replicas_entry_cached(&r.e, dev); - return update_replicas_list(trans, &r.e, sectors); -} - -int bch2_mark_alloc(struct btree_trans *trans, - enum btree_id btree, unsigned level, - struct bkey_s_c old, struct bkey_s_c new, - unsigned flags) -{ - bool gc = flags & BTREE_TRIGGER_GC; - u64 journal_seq = trans->journal_res.seq; - u64 bucket_journal_seq; - struct bch_fs *c = trans->c; - struct bch_alloc_v4 old_a_convert, new_a_convert; - const struct bch_alloc_v4 *old_a, *new_a; - struct bch_dev *ca; - int ret = 0; - - /* - * alloc btree is read in by bch2_alloc_read, not gc: - */ - if ((flags & BTREE_TRIGGER_GC) && - !(flags & BTREE_TRIGGER_BUCKET_INVALIDATE)) - return 0; - - if (bch2_trans_inconsistent_on(!bch2_dev_bucket_exists(c, new.k->p), trans, - "alloc key for invalid device or bucket")) - return -EIO; - - ca = bch_dev_bkey_exists(c, new.k->p.inode); - - old_a = bch2_alloc_to_v4(old, &old_a_convert); - new_a = bch2_alloc_to_v4(new, &new_a_convert); - - bucket_journal_seq = new_a->journal_seq; - - if ((flags & BTREE_TRIGGER_INSERT) && - data_type_is_empty(old_a->data_type) != - data_type_is_empty(new_a->data_type) && - new.k->type == KEY_TYPE_alloc_v4) { - struct bch_alloc_v4 *v = (struct bch_alloc_v4 *) new.v; - - EBUG_ON(!journal_seq); - - /* - * If the btree updates referring to a bucket weren't flushed - * before the bucket became empty again, then the we don't have - * to wait on a journal flush before we can reuse the bucket: - */ - v->journal_seq = bucket_journal_seq = - data_type_is_empty(new_a->data_type) && - (journal_seq == v->journal_seq || - bch2_journal_noflush_seq(&c->journal, v->journal_seq)) - ? 0 : journal_seq; - } - - if (!data_type_is_empty(old_a->data_type) && - data_type_is_empty(new_a->data_type) && - bucket_journal_seq) { - ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, - c->journal.flushed_seq_ondisk, - new.k->p.inode, new.k->p.offset, - bucket_journal_seq); - if (ret) { - bch2_fs_fatal_error(c, - "error setting bucket_needs_journal_commit: %i", ret); - return ret; - } - } - - percpu_down_read(&c->mark_lock); - if (!gc && new_a->gen != old_a->gen) - *bucket_gen(ca, new.k->p.offset) = new_a->gen; - - bch2_dev_usage_update(c, ca, *old_a, *new_a, journal_seq, gc); - - if (gc) { - struct bucket *g = gc_bucket(ca, new.k->p.offset); - - bucket_lock(g); - - g->gen_valid = 1; - g->gen = new_a->gen; - g->data_type = new_a->data_type; - g->stripe = new_a->stripe; - g->stripe_redundancy = new_a->stripe_redundancy; - g->dirty_sectors = new_a->dirty_sectors; - g->cached_sectors = new_a->cached_sectors; - - bucket_unlock(g); - } - percpu_up_read(&c->mark_lock); - - /* - * need to know if we're getting called from the invalidate path or - * not: - */ - - if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) && - old_a->cached_sectors) { - ret = update_cached_sectors(c, new, ca->dev_idx, - -((s64) old_a->cached_sectors), - journal_seq, gc); - if (ret) { - bch2_fs_fatal_error(c, "%s(): no replicas entry while updating cached sectors", - __func__); - return ret; - } - } - - if (new_a->data_type == BCH_DATA_free && - (!new_a->journal_seq || new_a->journal_seq < c->journal.flushed_seq_ondisk)) - closure_wake_up(&c->freelist_wait); - - if (new_a->data_type == BCH_DATA_need_discard && - (!bucket_journal_seq || bucket_journal_seq < c->journal.flushed_seq_ondisk)) - bch2_do_discards(c); - - if (old_a->data_type != BCH_DATA_cached && - new_a->data_type == BCH_DATA_cached && - should_invalidate_buckets(ca, bch2_dev_usage_read(ca))) - bch2_do_invalidates(c); - - if (new_a->data_type == BCH_DATA_need_gc_gens) - bch2_do_gc_gens(c); - - return 0; + return bch2_update_replicas_list(trans, &r.e, sectors); } int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, @@ -643,8 +523,8 @@ int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, if (bch2_fs_inconsistent_on(g->data_type && g->data_type != data_type, c, "different types of data in same bucket: %s, %s", - bch2_data_types[g->data_type], - bch2_data_types[data_type])) { + bch2_data_type_str(g->data_type), + bch2_data_type_str(data_type))) { ret = -EIO; goto err; } @@ -652,37 +532,33 @@ int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, if (bch2_fs_inconsistent_on((u64) g->dirty_sectors + sectors > ca->mi.bucket_size, c, "bucket %u:%zu gen %u data type %s sector count overflow: %u + %u > bucket size", ca->dev_idx, b, g->gen, - bch2_data_types[g->data_type ?: data_type], + bch2_data_type_str(g->data_type ?: data_type), g->dirty_sectors, sectors)) { ret = -EIO; goto err; } - g->data_type = data_type; g->dirty_sectors += sectors; new = *g; err: bucket_unlock(g); if (!ret) - bch2_dev_usage_update_m(c, ca, old, new, 0, true); + bch2_dev_usage_update_m(c, ca, &old, &new); percpu_up_read(&c->mark_lock); return ret; } -static int check_bucket_ref(struct btree_trans *trans, - struct bkey_s_c k, - const struct bch_extent_ptr *ptr, - s64 sectors, enum bch_data_type ptr_data_type, - u8 b_gen, u8 bucket_data_type, - u32 dirty_sectors, u32 cached_sectors) +int bch2_check_bucket_ref(struct btree_trans *trans, + struct bkey_s_c k, + const struct bch_extent_ptr *ptr, + s64 sectors, enum bch_data_type ptr_data_type, + u8 b_gen, u8 bucket_data_type, + u32 bucket_sectors) { struct bch_fs *c = trans->c; struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); size_t bucket_nr = PTR_BUCKET_NR(ca, ptr); - u32 bucket_sectors = !ptr->cached - ? dirty_sectors - : cached_sectors; struct printbuf buf = PRINTBUF; int ret = 0; @@ -699,7 +575,7 @@ static int check_bucket_ref(struct btree_trans *trans, "bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n" "while marking %s", ptr->dev, bucket_nr, b_gen, - bch2_data_types[bucket_data_type ?: ptr_data_type], + bch2_data_type_str(bucket_data_type ?: ptr_data_type), ptr->gen, (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); ret = -EIO; @@ -712,7 +588,7 @@ static int check_bucket_ref(struct btree_trans *trans, "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" "while marking %s", ptr->dev, bucket_nr, b_gen, - bch2_data_types[bucket_data_type ?: ptr_data_type], + bch2_data_type_str(bucket_data_type ?: ptr_data_type), ptr->gen, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf)); @@ -727,7 +603,7 @@ static int check_bucket_ref(struct btree_trans *trans, "while marking %s", ptr->dev, bucket_nr, b_gen, *bucket_gen(ca, bucket_nr), - bch2_data_types[bucket_data_type ?: ptr_data_type], + bch2_data_type_str(bucket_data_type ?: ptr_data_type), ptr->gen, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf)); @@ -748,8 +624,8 @@ static int check_bucket_ref(struct btree_trans *trans, "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n" "while marking %s", ptr->dev, bucket_nr, b_gen, - bch2_data_types[bucket_data_type], - bch2_data_types[ptr_data_type], + bch2_data_type_str(bucket_data_type), + bch2_data_type_str(ptr_data_type), (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf)); ret = -EIO; @@ -762,7 +638,7 @@ static int check_bucket_ref(struct btree_trans *trans, "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U32_MAX\n" "while marking %s", ptr->dev, bucket_nr, b_gen, - bch2_data_types[bucket_data_type ?: ptr_data_type], + bch2_data_type_str(bucket_data_type ?: ptr_data_type), bucket_sectors, sectors, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf)); @@ -777,508 +653,6 @@ err: goto out; } -static int mark_stripe_bucket(struct btree_trans *trans, - struct bkey_s_c k, - unsigned ptr_idx, - unsigned flags) -{ - struct bch_fs *c = trans->c; - u64 journal_seq = trans->journal_res.seq; - const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; - unsigned nr_data = s->nr_blocks - s->nr_redundant; - bool parity = ptr_idx >= nr_data; - enum bch_data_type data_type = parity ? BCH_DATA_parity : BCH_DATA_stripe; - s64 sectors = parity ? le16_to_cpu(s->sectors) : 0; - const struct bch_extent_ptr *ptr = s->ptrs + ptr_idx; - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - struct bucket old, new, *g; - struct printbuf buf = PRINTBUF; - int ret = 0; - - BUG_ON(!(flags & BTREE_TRIGGER_GC)); - - /* * XXX doesn't handle deletion */ - - percpu_down_read(&c->mark_lock); - g = PTR_GC_BUCKET(ca, ptr); - - if (g->dirty_sectors || - (g->stripe && g->stripe != k.k->p.offset)) { - bch2_fs_inconsistent(c, - "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s", - ptr->dev, PTR_BUCKET_NR(ca, ptr), g->gen, - (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); - ret = -EINVAL; - goto err; - } - - bucket_lock(g); - old = *g; - - ret = check_bucket_ref(trans, k, ptr, sectors, data_type, - g->gen, g->data_type, - g->dirty_sectors, g->cached_sectors); - if (ret) - goto err; - - g->data_type = data_type; - g->dirty_sectors += sectors; - - g->stripe = k.k->p.offset; - g->stripe_redundancy = s->nr_redundant; - new = *g; -err: - bucket_unlock(g); - if (!ret) - bch2_dev_usage_update_m(c, ca, old, new, journal_seq, true); - percpu_up_read(&c->mark_lock); - printbuf_exit(&buf); - return ret; -} - -static int __mark_pointer(struct btree_trans *trans, - struct bkey_s_c k, - const struct bch_extent_ptr *ptr, - s64 sectors, enum bch_data_type ptr_data_type, - u8 bucket_gen, u8 *bucket_data_type, - u32 *dirty_sectors, u32 *cached_sectors) -{ - u32 *dst_sectors = !ptr->cached - ? dirty_sectors - : cached_sectors; - int ret = check_bucket_ref(trans, k, ptr, sectors, ptr_data_type, - bucket_gen, *bucket_data_type, - *dirty_sectors, *cached_sectors); - - if (ret) - return ret; - - *dst_sectors += sectors; - - if (!*dirty_sectors && !*cached_sectors) - *bucket_data_type = 0; - else if (*bucket_data_type != BCH_DATA_stripe) - *bucket_data_type = ptr_data_type; - - return 0; -} - -static int bch2_mark_pointer(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c k, - struct extent_ptr_decoded p, - s64 sectors, - unsigned flags) -{ - u64 journal_seq = trans->journal_res.seq; - struct bch_fs *c = trans->c; - struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); - struct bucket old, new, *g; - enum bch_data_type data_type = bkey_ptr_data_type(btree_id, level, k, p); - u8 bucket_data_type; - int ret = 0; - - BUG_ON(!(flags & BTREE_TRIGGER_GC)); - - percpu_down_read(&c->mark_lock); - g = PTR_GC_BUCKET(ca, &p.ptr); - bucket_lock(g); - old = *g; - - bucket_data_type = g->data_type; - ret = __mark_pointer(trans, k, &p.ptr, sectors, - data_type, g->gen, - &bucket_data_type, - &g->dirty_sectors, - &g->cached_sectors); - if (!ret) - g->data_type = bucket_data_type; - - new = *g; - bucket_unlock(g); - if (!ret) - bch2_dev_usage_update_m(c, ca, old, new, journal_seq, true); - percpu_up_read(&c->mark_lock); - - return ret; -} - -static int bch2_mark_stripe_ptr(struct btree_trans *trans, - struct bkey_s_c k, - struct bch_extent_stripe_ptr p, - enum bch_data_type data_type, - s64 sectors, - unsigned flags) -{ - struct bch_fs *c = trans->c; - struct bch_replicas_padded r; - struct gc_stripe *m; - - BUG_ON(!(flags & BTREE_TRIGGER_GC)); - - m = genradix_ptr_alloc(&c->gc_stripes, p.idx, GFP_KERNEL); - if (!m) { - bch_err(c, "error allocating memory for gc_stripes, idx %llu", - (u64) p.idx); - return -BCH_ERR_ENOMEM_mark_stripe_ptr; - } - - mutex_lock(&c->ec_stripes_heap_lock); - - if (!m || !m->alive) { - mutex_unlock(&c->ec_stripes_heap_lock); - bch_err_ratelimited(c, "pointer to nonexistent stripe %llu", - (u64) p.idx); - bch2_inconsistent_error(c); - return -EIO; - } - - m->block_sectors[p.block] += sectors; - - r = m->r; - mutex_unlock(&c->ec_stripes_heap_lock); - - r.e.data_type = data_type; - update_replicas(c, k, &r.e, sectors, trans->journal_res.seq, true); - - return 0; -} - -static int __mark_extent(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c k, unsigned flags) -{ - u64 journal_seq = trans->journal_res.seq; - struct bch_fs *c = trans->c; - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - struct bch_replicas_padded r; - enum bch_data_type data_type = bkey_is_btree_ptr(k.k) - ? BCH_DATA_btree - : BCH_DATA_user; - s64 sectors = bkey_is_btree_ptr(k.k) - ? btree_sectors(c) - : k.k->size; - s64 dirty_sectors = 0; - bool stale; - int ret; - - BUG_ON(!(flags & BTREE_TRIGGER_GC)); - - r.e.data_type = data_type; - r.e.nr_devs = 0; - r.e.nr_required = 1; - - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - s64 disk_sectors = ptr_disk_sectors(sectors, p); - - if (flags & BTREE_TRIGGER_OVERWRITE) - disk_sectors = -disk_sectors; - - ret = bch2_mark_pointer(trans, btree_id, level, k, p, disk_sectors, flags); - if (ret < 0) - return ret; - - stale = ret > 0; - - if (p.ptr.cached) { - if (!stale) { - ret = update_cached_sectors(c, k, p.ptr.dev, - disk_sectors, journal_seq, true); - if (ret) { - bch2_fs_fatal_error(c, "%s(): no replicas entry while updating cached sectors", - __func__); - return ret; - } - } - } else if (!p.has_ec) { - dirty_sectors += disk_sectors; - r.e.devs[r.e.nr_devs++] = p.ptr.dev; - } else { - ret = bch2_mark_stripe_ptr(trans, k, p.ec, data_type, - disk_sectors, flags); - if (ret) - return ret; - - /* - * There may be other dirty pointers in this extent, but - * if so they're not required for mounting if we have an - * erasure coded pointer in this extent: - */ - r.e.nr_required = 0; - } - } - - if (r.e.nr_devs) { - ret = update_replicas(c, k, &r.e, dirty_sectors, journal_seq, true); - if (ret) { - struct printbuf buf = PRINTBUF; - - bch2_bkey_val_to_text(&buf, c, k); - bch2_fs_fatal_error(c, "%s(): no replicas entry for %s", __func__, buf.buf); - printbuf_exit(&buf); - return ret; - } - } - - return 0; -} - -int bch2_mark_extent(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, struct bkey_s_c new, - unsigned flags) -{ - return mem_trigger_run_overwrite_then_insert(__mark_extent, trans, btree_id, level, old, new, flags); -} - -int bch2_mark_stripe(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, struct bkey_s_c new, - unsigned flags) -{ - bool gc = flags & BTREE_TRIGGER_GC; - u64 journal_seq = trans->journal_res.seq; - struct bch_fs *c = trans->c; - u64 idx = new.k->p.offset; - const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe - ? bkey_s_c_to_stripe(old).v : NULL; - const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe - ? bkey_s_c_to_stripe(new).v : NULL; - unsigned i; - int ret; - - BUG_ON(gc && old_s); - - if (!gc) { - struct stripe *m = genradix_ptr(&c->stripes, idx); - - if (!m) { - struct printbuf buf1 = PRINTBUF; - struct printbuf buf2 = PRINTBUF; - - bch2_bkey_val_to_text(&buf1, c, old); - bch2_bkey_val_to_text(&buf2, c, new); - bch_err_ratelimited(c, "error marking nonexistent stripe %llu while marking\n" - "old %s\n" - "new %s", idx, buf1.buf, buf2.buf); - printbuf_exit(&buf2); - printbuf_exit(&buf1); - bch2_inconsistent_error(c); - return -1; - } - - if (!new_s) { - bch2_stripes_heap_del(c, m, idx); - - memset(m, 0, sizeof(*m)); - } else { - m->sectors = le16_to_cpu(new_s->sectors); - m->algorithm = new_s->algorithm; - m->nr_blocks = new_s->nr_blocks; - m->nr_redundant = new_s->nr_redundant; - m->blocks_nonempty = 0; - - for (i = 0; i < new_s->nr_blocks; i++) - m->blocks_nonempty += !!stripe_blockcount_get(new_s, i); - - if (!old_s) - bch2_stripes_heap_insert(c, m, idx); - else - bch2_stripes_heap_update(c, m, idx); - } - } else { - struct gc_stripe *m = - genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL); - - if (!m) { - bch_err(c, "error allocating memory for gc_stripes, idx %llu", - idx); - return -BCH_ERR_ENOMEM_mark_stripe; - } - /* - * This will be wrong when we bring back runtime gc: we should - * be unmarking the old key and then marking the new key - */ - m->alive = true; - m->sectors = le16_to_cpu(new_s->sectors); - m->nr_blocks = new_s->nr_blocks; - m->nr_redundant = new_s->nr_redundant; - - for (i = 0; i < new_s->nr_blocks; i++) - m->ptrs[i] = new_s->ptrs[i]; - - bch2_bkey_to_replicas(&m->r.e, new); - - /* - * gc recalculates this field from stripe ptr - * references: - */ - memset(m->block_sectors, 0, sizeof(m->block_sectors)); - - for (i = 0; i < new_s->nr_blocks; i++) { - ret = mark_stripe_bucket(trans, new, i, flags); - if (ret) - return ret; - } - - ret = update_replicas(c, new, &m->r.e, - ((s64) m->sectors * m->nr_redundant), - journal_seq, gc); - if (ret) { - struct printbuf buf = PRINTBUF; - - bch2_bkey_val_to_text(&buf, c, new); - bch2_fs_fatal_error(c, "no replicas entry for %s", buf.buf); - printbuf_exit(&buf); - return ret; - } - } - - return 0; -} - -static int __mark_reservation(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c k, unsigned flags) -{ - struct bch_fs *c = trans->c; - struct bch_fs_usage *fs_usage; - unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; - s64 sectors = (s64) k.k->size; - - BUG_ON(!(flags & BTREE_TRIGGER_GC)); - - if (flags & BTREE_TRIGGER_OVERWRITE) - sectors = -sectors; - sectors *= replicas; - - percpu_down_read(&c->mark_lock); - preempt_disable(); - - fs_usage = fs_usage_ptr(c, trans->journal_res.seq, flags & BTREE_TRIGGER_GC); - replicas = clamp_t(unsigned, replicas, 1, - ARRAY_SIZE(fs_usage->persistent_reserved)); - - fs_usage->reserved += sectors; - fs_usage->persistent_reserved[replicas - 1] += sectors; - - preempt_enable(); - percpu_up_read(&c->mark_lock); - - return 0; -} - -int bch2_mark_reservation(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, struct bkey_s_c new, - unsigned flags) -{ - return mem_trigger_run_overwrite_then_insert(__mark_reservation, trans, btree_id, level, old, new, flags); -} - -static s64 __bch2_mark_reflink_p(struct btree_trans *trans, - struct bkey_s_c_reflink_p p, - u64 start, u64 end, - u64 *idx, unsigned flags, size_t r_idx) -{ - struct bch_fs *c = trans->c; - struct reflink_gc *r; - int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1; - u64 next_idx = end; - s64 ret = 0; - struct printbuf buf = PRINTBUF; - - if (r_idx >= c->reflink_gc_nr) - goto not_found; - - r = genradix_ptr(&c->reflink_gc_table, r_idx); - next_idx = min(next_idx, r->offset - r->size); - if (*idx < next_idx) - goto not_found; - - BUG_ON((s64) r->refcount + add < 0); - - r->refcount += add; - *idx = r->offset; - return 0; -not_found: - if (fsck_err(c, reflink_p_to_missing_reflink_v, - "pointer to missing indirect extent\n" - " %s\n" - " missing range %llu-%llu", - (bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf), - *idx, next_idx)) { - struct bkey_i_error *new; - - new = bch2_trans_kmalloc(trans, sizeof(*new)); - ret = PTR_ERR_OR_ZERO(new); - if (ret) - goto err; - - bkey_init(&new->k); - new->k.type = KEY_TYPE_error; - new->k.p = bkey_start_pos(p.k); - new->k.p.offset += *idx - start; - bch2_key_resize(&new->k, next_idx - *idx); - ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, &new->k_i, - BTREE_TRIGGER_NORUN); - } - - *idx = next_idx; -err: -fsck_err: - printbuf_exit(&buf); - return ret; -} - -static int __mark_reflink_p(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c k, unsigned flags) -{ - struct bch_fs *c = trans->c; - struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); - struct reflink_gc *ref; - size_t l, r, m; - u64 idx = le64_to_cpu(p.v->idx), start = idx; - u64 end = le64_to_cpu(p.v->idx) + p.k->size; - int ret = 0; - - BUG_ON(!(flags & BTREE_TRIGGER_GC)); - - if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_reflink_p_fix) { - idx -= le32_to_cpu(p.v->front_pad); - end += le32_to_cpu(p.v->back_pad); - } - - l = 0; - r = c->reflink_gc_nr; - while (l < r) { - m = l + (r - l) / 2; - - ref = genradix_ptr(&c->reflink_gc_table, m); - if (ref->offset <= idx) - l = m + 1; - else - r = m; - } - - while (idx < end && !ret) - ret = __bch2_mark_reflink_p(trans, p, start, end, - &idx, flags, l++); - - return ret; -} - -int bch2_mark_reflink_p(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, struct bkey_s_c new, - unsigned flags) -{ - return mem_trigger_run_overwrite_then_insert(__mark_reflink_p, trans, btree_id, level, old, new, flags); -} - void bch2_trans_fs_usage_revert(struct btree_trans *trans, struct replicas_delta_list *deltas) { @@ -1303,11 +677,11 @@ void bch2_trans_fs_usage_revert(struct btree_trans *trans, BUG_ON(__update_replicas(c, dst, &d->r, -d->delta)); } - dst->nr_inodes -= deltas->nr_inodes; + dst->b.nr_inodes -= deltas->nr_inodes; for (i = 0; i < BCH_REPLICAS_MAX; i++) { added -= deltas->persistent_reserved[i]; - dst->reserved -= deltas->persistent_reserved[i]; + dst->b.reserved -= deltas->persistent_reserved[i]; dst->persistent_reserved[i] -= deltas->persistent_reserved[i]; } @@ -1320,48 +694,25 @@ void bch2_trans_fs_usage_revert(struct btree_trans *trans, percpu_up_read(&c->mark_lock); } -int bch2_trans_fs_usage_apply(struct btree_trans *trans, - struct replicas_delta_list *deltas) +void bch2_trans_account_disk_usage_change(struct btree_trans *trans) { struct bch_fs *c = trans->c; + u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0; static int warned_disk_usage = 0; bool warn = false; - u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0; - struct replicas_delta *d, *d2; - struct replicas_delta *top = (void *) deltas->d + deltas->used; - struct bch_fs_usage *dst; - s64 added = 0, should_not_have_added; - unsigned i; percpu_down_read(&c->mark_lock); preempt_disable(); - dst = fs_usage_ptr(c, trans->journal_res.seq, false); + struct bch_fs_usage_base *dst = &fs_usage_ptr(c, trans->journal_res.seq, false)->b; + struct bch_fs_usage_base *src = &trans->fs_usage_delta; - for (d = deltas->d; d != top; d = replicas_delta_next(d)) { - switch (d->r.data_type) { - case BCH_DATA_btree: - case BCH_DATA_user: - case BCH_DATA_parity: - added += d->delta; - } - - if (__update_replicas(c, dst, &d->r, d->delta)) - goto need_mark; - } - - dst->nr_inodes += deltas->nr_inodes; - - for (i = 0; i < BCH_REPLICAS_MAX; i++) { - added += deltas->persistent_reserved[i]; - dst->reserved += deltas->persistent_reserved[i]; - dst->persistent_reserved[i] += deltas->persistent_reserved[i]; - } + s64 added = src->btree + src->data + src->reserved; /* * Not allowed to reduce sectors_available except by getting a * reservation: */ - should_not_have_added = added - (s64) disk_res_sectors; + s64 should_not_have_added = added - (s64) disk_res_sectors; if (unlikely(should_not_have_added > 0)) { u64 old, new, v = atomic64_read(&c->sectors_available); @@ -1380,6 +731,13 @@ int bch2_trans_fs_usage_apply(struct btree_trans *trans, this_cpu_sub(*c->online_reserved, added); } + dst->hidden += src->hidden; + dst->btree += src->btree; + dst->data += src->data; + dst->cached += src->cached; + dst->reserved += src->reserved; + dst->nr_inodes += src->nr_inodes; + preempt_enable(); percpu_up_read(&c->mark_lock); @@ -1387,6 +745,34 @@ int bch2_trans_fs_usage_apply(struct btree_trans *trans, bch2_trans_inconsistent(trans, "disk usage increased %lli more than %llu sectors reserved)", should_not_have_added, disk_res_sectors); +} + +int bch2_trans_fs_usage_apply(struct btree_trans *trans, + struct replicas_delta_list *deltas) +{ + struct bch_fs *c = trans->c; + struct replicas_delta *d, *d2; + struct replicas_delta *top = (void *) deltas->d + deltas->used; + struct bch_fs_usage *dst; + unsigned i; + + percpu_down_read(&c->mark_lock); + preempt_disable(); + dst = fs_usage_ptr(c, trans->journal_res.seq, false); + + for (d = deltas->d; d != top; d = replicas_delta_next(d)) + if (__update_replicas(c, dst, &d->r, d->delta)) + goto need_mark; + + dst->b.nr_inodes += deltas->nr_inodes; + + for (i = 0; i < BCH_REPLICAS_MAX; i++) { + dst->b.reserved += deltas->persistent_reserved[i]; + dst->persistent_reserved[i] += deltas->persistent_reserved[i]; + } + + preempt_enable(); + percpu_up_read(&c->mark_lock); return 0; need_mark: /* revert changes: */ @@ -1398,92 +784,184 @@ need_mark: return -1; } -/* trans_mark: */ +/* KEY_TYPE_extent: */ + +static int __mark_pointer(struct btree_trans *trans, + struct bkey_s_c k, + const struct bch_extent_ptr *ptr, + s64 sectors, enum bch_data_type ptr_data_type, + u8 bucket_gen, u8 *bucket_data_type, + u32 *dirty_sectors, u32 *cached_sectors) +{ + u32 *dst_sectors = !ptr->cached + ? dirty_sectors + : cached_sectors; + int ret = bch2_check_bucket_ref(trans, k, ptr, sectors, ptr_data_type, + bucket_gen, *bucket_data_type, *dst_sectors); + + if (ret) + return ret; + + *dst_sectors += sectors; + + if (!*dirty_sectors && !*cached_sectors) + *bucket_data_type = 0; + else if (*bucket_data_type != BCH_DATA_stripe) + *bucket_data_type = ptr_data_type; + + return 0; +} -static inline int bch2_trans_mark_pointer(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c k, struct extent_ptr_decoded p, - unsigned flags) +static int bch2_trigger_pointer(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c k, struct extent_ptr_decoded p, + s64 *sectors, + unsigned flags) { bool insert = !(flags & BTREE_TRIGGER_OVERWRITE); - struct btree_iter iter; - struct bkey_i_alloc_v4 *a; struct bpos bucket; struct bch_backpointer bp; - s64 sectors; - int ret; bch2_extent_ptr_to_bp(trans->c, btree_id, level, k, p, &bucket, &bp); - sectors = bp.bucket_len; - if (!insert) - sectors = -sectors; + *sectors = insert ? bp.bucket_len : -((s64) bp.bucket_len); - a = bch2_trans_start_alloc_update(trans, &iter, bucket); - if (IS_ERR(a)) - return PTR_ERR(a); + if (flags & BTREE_TRIGGER_TRANSACTIONAL) { + struct btree_iter iter; + struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update(trans, &iter, bucket); + int ret = PTR_ERR_OR_ZERO(a); + if (ret) + return ret; - ret = __mark_pointer(trans, k, &p.ptr, sectors, bp.data_type, - a->v.gen, &a->v.data_type, - &a->v.dirty_sectors, &a->v.cached_sectors) ?: - bch2_trans_update(trans, &iter, &a->k_i, 0); - bch2_trans_iter_exit(trans, &iter); + ret = __mark_pointer(trans, k, &p.ptr, *sectors, bp.data_type, + a->v.gen, &a->v.data_type, + &a->v.dirty_sectors, &a->v.cached_sectors) ?: + bch2_trans_update(trans, &iter, &a->k_i, 0); + bch2_trans_iter_exit(trans, &iter); - if (ret) - return ret; - - if (!p.ptr.cached) { - ret = bch2_bucket_backpointer_mod(trans, bucket, bp, k, insert); if (ret) return ret; + + if (!p.ptr.cached) { + ret = bch2_bucket_backpointer_mod(trans, bucket, bp, k, insert); + if (ret) + return ret; + } + } + + if (flags & BTREE_TRIGGER_GC) { + struct bch_fs *c = trans->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); + enum bch_data_type data_type = bkey_ptr_data_type(btree_id, level, k, p); + + percpu_down_read(&c->mark_lock); + struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); + bucket_lock(g); + struct bucket old = *g; + + u8 bucket_data_type = g->data_type; + int ret = __mark_pointer(trans, k, &p.ptr, *sectors, + data_type, g->gen, + &bucket_data_type, + &g->dirty_sectors, + &g->cached_sectors); + if (ret) { + bucket_unlock(g); + percpu_up_read(&c->mark_lock); + return ret; + } + + g->data_type = bucket_data_type; + struct bucket new = *g; + bucket_unlock(g); + bch2_dev_usage_update_m(c, ca, &old, &new); + percpu_up_read(&c->mark_lock); } return 0; } -static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, - struct extent_ptr_decoded p, - s64 sectors, enum bch_data_type data_type) +static int bch2_trigger_stripe_ptr(struct btree_trans *trans, + struct bkey_s_c k, + struct extent_ptr_decoded p, + enum bch_data_type data_type, + s64 sectors, unsigned flags) { - struct btree_iter iter; - struct bkey_i_stripe *s; - struct bch_replicas_padded r; - int ret = 0; + if (flags & BTREE_TRIGGER_TRANSACTIONAL) { + struct btree_iter iter; + struct bkey_i_stripe *s = bch2_bkey_get_mut_typed(trans, &iter, + BTREE_ID_stripes, POS(0, p.ec.idx), + BTREE_ITER_WITH_UPDATES, stripe); + int ret = PTR_ERR_OR_ZERO(s); + if (unlikely(ret)) { + bch2_trans_inconsistent_on(bch2_err_matches(ret, ENOENT), trans, + "pointer to nonexistent stripe %llu", + (u64) p.ec.idx); + goto err; + } - s = bch2_bkey_get_mut_typed(trans, &iter, - BTREE_ID_stripes, POS(0, p.ec.idx), - BTREE_ITER_WITH_UPDATES, stripe); - ret = PTR_ERR_OR_ZERO(s); - if (unlikely(ret)) { - bch2_trans_inconsistent_on(bch2_err_matches(ret, ENOENT), trans, - "pointer to nonexistent stripe %llu", - (u64) p.ec.idx); - goto err; - } + if (!bch2_ptr_matches_stripe(&s->v, p)) { + bch2_trans_inconsistent(trans, + "stripe pointer doesn't match stripe %llu", + (u64) p.ec.idx); + ret = -EIO; + goto err; + } - if (!bch2_ptr_matches_stripe(&s->v, p)) { - bch2_trans_inconsistent(trans, - "stripe pointer doesn't match stripe %llu", - (u64) p.ec.idx); - ret = -EIO; - goto err; + stripe_blockcount_set(&s->v, p.ec.block, + stripe_blockcount_get(&s->v, p.ec.block) + + sectors); + + struct bch_replicas_padded r; + bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(&s->k_i)); + r.e.data_type = data_type; + ret = bch2_update_replicas_list(trans, &r.e, sectors); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; } - stripe_blockcount_set(&s->v, p.ec.block, - stripe_blockcount_get(&s->v, p.ec.block) + - sectors); + if (flags & BTREE_TRIGGER_GC) { + struct bch_fs *c = trans->c; - bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(&s->k_i)); - r.e.data_type = data_type; - ret = update_replicas_list(trans, &r.e, sectors); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; + BUG_ON(!(flags & BTREE_TRIGGER_GC)); + + struct gc_stripe *m = genradix_ptr_alloc(&c->gc_stripes, p.ec.idx, GFP_KERNEL); + if (!m) { + bch_err(c, "error allocating memory for gc_stripes, idx %llu", + (u64) p.ec.idx); + return -BCH_ERR_ENOMEM_mark_stripe_ptr; + } + + mutex_lock(&c->ec_stripes_heap_lock); + + if (!m || !m->alive) { + mutex_unlock(&c->ec_stripes_heap_lock); + struct printbuf buf = PRINTBUF; + bch2_bkey_val_to_text(&buf, c, k); + bch_err_ratelimited(c, "pointer to nonexistent stripe %llu\n while marking %s", + (u64) p.ec.idx, buf.buf); + printbuf_exit(&buf); + bch2_inconsistent_error(c); + return -EIO; + } + + m->block_sectors[p.ec.block] += sectors; + + struct bch_replicas_padded r = m->r; + mutex_unlock(&c->ec_stripes_heap_lock); + + r.e.data_type = data_type; + bch2_update_replicas(c, k, &r.e, sectors, trans->journal_res.seq, true); + } + + return 0; } -static int __trans_mark_extent(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c k, unsigned flags) +static int __trigger_extent(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c k, unsigned flags) { + bool gc = flags & BTREE_TRIGGER_GC; struct bch_fs *c = trans->c; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; @@ -1492,11 +970,7 @@ static int __trans_mark_extent(struct btree_trans *trans, enum bch_data_type data_type = bkey_is_btree_ptr(k.k) ? BCH_DATA_btree : BCH_DATA_user; - s64 sectors = bkey_is_btree_ptr(k.k) - ? btree_sectors(c) - : k.k->size; s64 dirty_sectors = 0; - bool stale; int ret = 0; r.e.data_type = data_type; @@ -1504,21 +978,20 @@ static int __trans_mark_extent(struct btree_trans *trans, r.e.nr_required = 1; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - s64 disk_sectors = ptr_disk_sectors(sectors, p); - - if (flags & BTREE_TRIGGER_OVERWRITE) - disk_sectors = -disk_sectors; - - ret = bch2_trans_mark_pointer(trans, btree_id, level, k, p, flags); + s64 disk_sectors; + ret = bch2_trigger_pointer(trans, btree_id, level, k, p, &disk_sectors, flags); if (ret < 0) return ret; - stale = ret > 0; + bool stale = ret > 0; if (p.ptr.cached) { if (!stale) { - ret = update_cached_sectors_list(trans, p.ptr.dev, - disk_sectors); + ret = !gc + ? bch2_update_cached_sectors_list(trans, p.ptr.dev, disk_sectors) + : update_cached_sectors(c, k, p.ptr.dev, disk_sectors, 0, true); + bch2_fs_fatal_err_on(ret && gc, c, "%s(): no replicas entry while updating cached sectors", + __func__); if (ret) return ret; } @@ -1526,324 +999,122 @@ static int __trans_mark_extent(struct btree_trans *trans, dirty_sectors += disk_sectors; r.e.devs[r.e.nr_devs++] = p.ptr.dev; } else { - ret = bch2_trans_mark_stripe_ptr(trans, p, - disk_sectors, data_type); + ret = bch2_trigger_stripe_ptr(trans, k, p, data_type, disk_sectors, flags); if (ret) return ret; + /* + * There may be other dirty pointers in this extent, but + * if so they're not required for mounting if we have an + * erasure coded pointer in this extent: + */ r.e.nr_required = 0; } } - if (r.e.nr_devs) - ret = update_replicas_list(trans, &r.e, dirty_sectors); - - return ret; -} - -int bch2_trans_mark_extent(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, struct bkey_i *new, - unsigned flags) -{ - struct bch_fs *c = trans->c; - int mod = (int) bch2_bkey_needs_rebalance(c, bkey_i_to_s_c(new)) - - (int) bch2_bkey_needs_rebalance(c, old); + if (r.e.nr_devs) { + ret = !gc + ? bch2_update_replicas_list(trans, &r.e, dirty_sectors) + : bch2_update_replicas(c, k, &r.e, dirty_sectors, 0, true); + if (unlikely(ret && gc)) { + struct printbuf buf = PRINTBUF; - if (mod) { - int ret = bch2_btree_bit_mod(trans, BTREE_ID_rebalance_work, new->k.p, mod > 0); + bch2_bkey_val_to_text(&buf, c, k); + bch2_fs_fatal_error(c, "%s(): no replicas entry for %s", __func__, buf.buf); + printbuf_exit(&buf); + } if (ret) return ret; } - return trigger_run_overwrite_then_insert(__trans_mark_extent, trans, btree_id, level, old, new, flags); -} - -static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans, - struct bkey_s_c_stripe s, - unsigned idx, bool deleting) -{ - struct bch_fs *c = trans->c; - const struct bch_extent_ptr *ptr = &s.v->ptrs[idx]; - struct btree_iter iter; - struct bkey_i_alloc_v4 *a; - enum bch_data_type data_type = idx >= s.v->nr_blocks - s.v->nr_redundant - ? BCH_DATA_parity : 0; - s64 sectors = data_type ? le16_to_cpu(s.v->sectors) : 0; - int ret = 0; - - if (deleting) - sectors = -sectors; - - a = bch2_trans_start_alloc_update(trans, &iter, PTR_BUCKET_POS(c, ptr)); - if (IS_ERR(a)) - return PTR_ERR(a); - - ret = check_bucket_ref(trans, s.s_c, ptr, sectors, data_type, - a->v.gen, a->v.data_type, - a->v.dirty_sectors, a->v.cached_sectors); - if (ret) - goto err; - - if (!deleting) { - if (bch2_trans_inconsistent_on(a->v.stripe || - a->v.stripe_redundancy, trans, - "bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)", - iter.pos.inode, iter.pos.offset, a->v.gen, - bch2_data_types[a->v.data_type], - a->v.dirty_sectors, - a->v.stripe, s.k->p.offset)) { - ret = -EIO; - goto err; - } - - if (bch2_trans_inconsistent_on(data_type && a->v.dirty_sectors, trans, - "bucket %llu:%llu gen %u data type %s dirty_sectors %u: data already in stripe bucket %llu", - iter.pos.inode, iter.pos.offset, a->v.gen, - bch2_data_types[a->v.data_type], - a->v.dirty_sectors, - s.k->p.offset)) { - ret = -EIO; - goto err; - } - - a->v.stripe = s.k->p.offset; - a->v.stripe_redundancy = s.v->nr_redundant; - a->v.data_type = BCH_DATA_stripe; - } else { - if (bch2_trans_inconsistent_on(a->v.stripe != s.k->p.offset || - a->v.stripe_redundancy != s.v->nr_redundant, trans, - "bucket %llu:%llu gen %u: not marked as stripe when deleting stripe %llu (got %u)", - iter.pos.inode, iter.pos.offset, a->v.gen, - s.k->p.offset, a->v.stripe)) { - ret = -EIO; - goto err; - } - - a->v.stripe = 0; - a->v.stripe_redundancy = 0; - a->v.data_type = alloc_data_type(a->v, BCH_DATA_user); - } - - a->v.dirty_sectors += sectors; - if (data_type) - a->v.data_type = !deleting ? data_type : 0; - - ret = bch2_trans_update(trans, &iter, &a->k_i, 0); - if (ret) - goto err; -err: - bch2_trans_iter_exit(trans, &iter); - return ret; + return 0; } -int bch2_trans_mark_stripe(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, struct bkey_i *new, - unsigned flags) +int bch2_trigger_extent(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, struct bkey_s new, + unsigned flags) { - const struct bch_stripe *old_s = NULL; - struct bch_stripe *new_s = NULL; - struct bch_replicas_padded r; - unsigned i, nr_blocks; - int ret = 0; - - if (old.k->type == KEY_TYPE_stripe) - old_s = bkey_s_c_to_stripe(old).v; - if (new->k.type == KEY_TYPE_stripe) - new_s = &bkey_i_to_stripe(new)->v; - - /* - * If the pointers aren't changing, we don't need to do anything: - */ - if (new_s && old_s && - new_s->nr_blocks == old_s->nr_blocks && - new_s->nr_redundant == old_s->nr_redundant && - !memcmp(old_s->ptrs, new_s->ptrs, - new_s->nr_blocks * sizeof(struct bch_extent_ptr))) + struct bkey_ptrs_c new_ptrs = bch2_bkey_ptrs_c(new.s_c); + struct bkey_ptrs_c old_ptrs = bch2_bkey_ptrs_c(old); + unsigned new_ptrs_bytes = (void *) new_ptrs.end - (void *) new_ptrs.start; + unsigned old_ptrs_bytes = (void *) old_ptrs.end - (void *) old_ptrs.start; + + /* if pointers aren't changing - nothing to do: */ + if (new_ptrs_bytes == old_ptrs_bytes && + !memcmp(new_ptrs.start, + old_ptrs.start, + new_ptrs_bytes)) return 0; - BUG_ON(new_s && old_s && - (new_s->nr_blocks != old_s->nr_blocks || - new_s->nr_redundant != old_s->nr_redundant)); + if (flags & BTREE_TRIGGER_TRANSACTIONAL) { + struct bch_fs *c = trans->c; + int mod = (int) bch2_bkey_needs_rebalance(c, new.s_c) - + (int) bch2_bkey_needs_rebalance(c, old); - nr_blocks = new_s ? new_s->nr_blocks : old_s->nr_blocks; - - if (new_s) { - s64 sectors = le16_to_cpu(new_s->sectors); - - bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(new)); - ret = update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant); - if (ret) - return ret; - } - - if (old_s) { - s64 sectors = -((s64) le16_to_cpu(old_s->sectors)); - - bch2_bkey_to_replicas(&r.e, old); - ret = update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant); - if (ret) - return ret; - } - - for (i = 0; i < nr_blocks; i++) { - if (new_s && old_s && - !memcmp(&new_s->ptrs[i], - &old_s->ptrs[i], - sizeof(new_s->ptrs[i]))) - continue; - - if (new_s) { - ret = bch2_trans_mark_stripe_bucket(trans, - bkey_i_to_s_c_stripe(new), i, false); - if (ret) - break; - } - - if (old_s) { - ret = bch2_trans_mark_stripe_bucket(trans, - bkey_s_c_to_stripe(old), i, true); + if (mod) { + int ret = bch2_btree_bit_mod(trans, BTREE_ID_rebalance_work, new.k->p, mod > 0); if (ret) - break; + return ret; } } - return ret; + if (flags & (BTREE_TRIGGER_TRANSACTIONAL|BTREE_TRIGGER_GC)) + return trigger_run_overwrite_then_insert(__trigger_extent, trans, btree_id, level, old, new, flags); + + return 0; } -static int __trans_mark_reservation(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c k, unsigned flags) +/* KEY_TYPE_reservation */ + +static int __trigger_reservation(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c k, unsigned flags) { + struct bch_fs *c = trans->c; unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; - s64 sectors = (s64) k.k->size; - struct replicas_delta_list *d; - int ret; + s64 sectors = (s64) k.k->size * replicas; if (flags & BTREE_TRIGGER_OVERWRITE) sectors = -sectors; - sectors *= replicas; - - ret = bch2_replicas_deltas_realloc(trans, 0); - if (ret) - return ret; - d = trans->fs_usage_deltas; - replicas = clamp_t(unsigned, replicas, 1, - ARRAY_SIZE(d->persistent_reserved)); - - d->persistent_reserved[replicas - 1] += sectors; - return 0; -} - -int bch2_trans_mark_reservation(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, - struct bkey_i *new, - unsigned flags) -{ - return trigger_run_overwrite_then_insert(__trans_mark_reservation, trans, btree_id, level, old, new, flags); -} - -static int trans_mark_reflink_p_segment(struct btree_trans *trans, - struct bkey_s_c_reflink_p p, - u64 *idx, unsigned flags) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_i *k; - __le64 *refcount; - int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1; - struct printbuf buf = PRINTBUF; - int ret; + if (flags & BTREE_TRIGGER_TRANSACTIONAL) { + int ret = bch2_replicas_deltas_realloc(trans, 0); + if (ret) + return ret; - k = bch2_bkey_get_mut_noupdate(trans, &iter, - BTREE_ID_reflink, POS(0, *idx), - BTREE_ITER_WITH_UPDATES); - ret = PTR_ERR_OR_ZERO(k); - if (ret) - goto err; + struct replicas_delta_list *d = trans->fs_usage_deltas; + replicas = min(replicas, ARRAY_SIZE(d->persistent_reserved)); - refcount = bkey_refcount(k); - if (!refcount) { - bch2_bkey_val_to_text(&buf, c, p.s_c); - bch2_trans_inconsistent(trans, - "nonexistent indirect extent at %llu while marking\n %s", - *idx, buf.buf); - ret = -EIO; - goto err; + d->persistent_reserved[replicas - 1] += sectors; } - if (!*refcount && (flags & BTREE_TRIGGER_OVERWRITE)) { - bch2_bkey_val_to_text(&buf, c, p.s_c); - bch2_trans_inconsistent(trans, - "indirect extent refcount underflow at %llu while marking\n %s", - *idx, buf.buf); - ret = -EIO; - goto err; - } + if (flags & BTREE_TRIGGER_GC) { + percpu_down_read(&c->mark_lock); + preempt_disable(); - if (flags & BTREE_TRIGGER_INSERT) { - struct bch_reflink_p *v = (struct bch_reflink_p *) p.v; - u64 pad; + struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage_gc); - pad = max_t(s64, le32_to_cpu(v->front_pad), - le64_to_cpu(v->idx) - bkey_start_offset(&k->k)); - BUG_ON(pad > U32_MAX); - v->front_pad = cpu_to_le32(pad); + replicas = min(replicas, ARRAY_SIZE(fs_usage->persistent_reserved)); + fs_usage->b.reserved += sectors; + fs_usage->persistent_reserved[replicas - 1] += sectors; - pad = max_t(s64, le32_to_cpu(v->back_pad), - k->k.p.offset - p.k->size - le64_to_cpu(v->idx)); - BUG_ON(pad > U32_MAX); - v->back_pad = cpu_to_le32(pad); + preempt_enable(); + percpu_up_read(&c->mark_lock); } - le64_add_cpu(refcount, add); - - bch2_btree_iter_set_pos_to_extent_start(&iter); - ret = bch2_trans_update(trans, &iter, k, 0); - if (ret) - goto err; - - *idx = k->k.p.offset; -err: - bch2_trans_iter_exit(trans, &iter); - printbuf_exit(&buf); - return ret; + return 0; } -static int __trans_mark_reflink_p(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c k, unsigned flags) +int bch2_trigger_reservation(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, struct bkey_s new, + unsigned flags) { - struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); - u64 idx, end_idx; - int ret = 0; - - idx = le64_to_cpu(p.v->idx) - le32_to_cpu(p.v->front_pad); - end_idx = le64_to_cpu(p.v->idx) + p.k->size + - le32_to_cpu(p.v->back_pad); - - while (idx < end_idx && !ret) - ret = trans_mark_reflink_p_segment(trans, p, &idx, flags); - return ret; + return trigger_run_overwrite_then_insert(__trigger_reservation, trans, btree_id, level, old, new, flags); } -int bch2_trans_mark_reflink_p(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, - struct bkey_i *new, - unsigned flags) -{ - if (flags & BTREE_TRIGGER_INSERT) { - struct bch_reflink_p *v = &bkey_i_to_reflink_p(new)->v; - - v->front_pad = v->back_pad = 0; - } - - return trigger_run_overwrite_then_insert(__trans_mark_reflink_p, trans, btree_id, level, old, new, flags); -} +/* Mark superblocks: */ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, struct bch_dev *ca, size_t b, @@ -1871,9 +1142,9 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n" "while marking %s", iter.pos.inode, iter.pos.offset, a->v.gen, - bch2_data_types[a->v.data_type], - bch2_data_types[type], - bch2_data_types[type]); + bch2_data_type_str(a->v.data_type), + bch2_data_type_str(type), + bch2_data_type_str(type)); ret = -EIO; goto err; } @@ -1974,17 +1245,13 @@ int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca) { int ret = bch2_trans_run(c, __bch2_trans_mark_dev_sb(trans, ca)); - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; } int bch2_trans_mark_dev_sbs(struct bch_fs *c) { - struct bch_dev *ca; - unsigned i; - - for_each_online_member(ca, c, i) { + for_each_online_member(c, ca) { int ret = bch2_trans_mark_dev_sb(c, ca); if (ret) { percpu_ref_put(&ca->ref); diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 21f6cb356921..6387e039f789 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -203,6 +203,7 @@ static inline struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca) } void bch2_dev_usage_init(struct bch_dev *); +void bch2_dev_usage_to_text(struct printbuf *, struct bch_dev_usage *); static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum bch_watermark watermark) { @@ -301,6 +302,12 @@ u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage_online *); struct bch_fs_usage_short bch2_fs_usage_read_short(struct bch_fs *); +void bch2_dev_usage_update(struct bch_fs *, struct bch_dev *, + const struct bch_alloc_v4 *, + const struct bch_alloc_v4 *, u64, bool); +void bch2_dev_usage_update_m(struct bch_fs *, struct bch_dev *, + struct bucket *, struct bucket *); + /* key/bucket marking: */ static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c, @@ -315,43 +322,41 @@ static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c, : c->usage[journal_seq & JOURNAL_BUF_MASK]); } +int bch2_update_replicas(struct bch_fs *, struct bkey_s_c, + struct bch_replicas_entry_v1 *, s64, + unsigned, bool); +int bch2_update_replicas_list(struct btree_trans *, + struct bch_replicas_entry_v1 *, s64); +int bch2_update_cached_sectors_list(struct btree_trans *, unsigned, s64); int bch2_replicas_deltas_realloc(struct btree_trans *, unsigned); void bch2_fs_usage_initialize(struct bch_fs *); +int bch2_check_bucket_ref(struct btree_trans *, struct bkey_s_c, + const struct bch_extent_ptr *, + s64, enum bch_data_type, u8, u8, u32); + int bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, size_t, enum bch_data_type, unsigned, struct gc_pos, unsigned); -int bch2_mark_alloc(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s_c, unsigned); -int bch2_mark_extent(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s_c, unsigned); -int bch2_mark_stripe(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s_c, unsigned); -int bch2_mark_reservation(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s_c, unsigned); -int bch2_mark_reflink_p(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s_c, unsigned); - -int bch2_trans_mark_extent(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); -int bch2_trans_mark_stripe(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); -int bch2_trans_mark_reservation(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); -int bch2_trans_mark_reflink_p(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); - -#define mem_trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, _new, _flags)\ +int bch2_trigger_extent(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_s, unsigned); +int bch2_trigger_reservation(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_s, unsigned); + +#define trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, _new, _flags)\ ({ \ int ret = 0; \ \ if (_old.k->type) \ ret = _fn(_trans, _btree_id, _level, _old, _flags & ~BTREE_TRIGGER_INSERT); \ if (!ret && _new.k->type) \ - ret = _fn(_trans, _btree_id, _level, _new, _flags & ~BTREE_TRIGGER_OVERWRITE); \ + ret = _fn(_trans, _btree_id, _level, _new.s_c, _flags & ~BTREE_TRIGGER_OVERWRITE);\ ret; \ }) -#define trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, _new, _flags) \ - mem_trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, bkey_i_to_s_c(_new), _flags) +void bch2_trans_account_disk_usage_change(struct btree_trans *); void bch2_trans_fs_usage_revert(struct btree_trans *, struct replicas_delta_list *); int bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *); @@ -382,6 +387,21 @@ static inline bool is_superblock_bucket(struct bch_dev *ca, u64 b) return false; } +static inline const char *bch2_data_type_str(enum bch_data_type type) +{ + return type < BCH_DATA_NR + ? __bch2_data_types[type] + : "(invalid data type)"; +} + +static inline void bch2_prt_data_type(struct printbuf *out, enum bch_data_type type) +{ + if (type < BCH_DATA_NR) + prt_str(out, __bch2_data_types[type]); + else + prt_printf(out, "(invalid data type %u)", type); +} + /* disk reservations: */ static inline void bch2_disk_reservation_put(struct bch_fs *c, diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index 2a9dab9006ef..6a31740222a7 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -33,8 +33,6 @@ struct bucket_gens { }; struct bch_dev_usage { - u64 buckets_ec; - struct { u64 buckets; u64 sectors; /* _compressed_ sectors: */ @@ -47,23 +45,18 @@ struct bch_dev_usage { } d[BCH_DATA_NR]; }; -struct bch_fs_usage { - /* all fields are in units of 512 byte sectors: */ +struct bch_fs_usage_base { u64 hidden; u64 btree; u64 data; u64 cached; u64 reserved; u64 nr_inodes; +}; - /* XXX: add stats for compression ratio */ -#if 0 - u64 uncompressed; - u64 compressed; -#endif - - /* broken out: */ - +struct bch_fs_usage { + /* all fields are in units of 512 byte sectors: */ + struct bch_fs_usage_base b; u64 persistent_reserved[BCH_REPLICAS_MAX]; u64 replicas[]; }; diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index 4bb88aefed12..226b39c17667 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -7,22 +7,27 @@ #include "chardev.h" #include "journal.h" #include "move.h" +#include "recovery.h" #include "replicas.h" #include "super.h" #include "super-io.h" +#include "thread_with_file.h" -#include <linux/anon_inodes.h> #include <linux/cdev.h> #include <linux/device.h> -#include <linux/file.h> #include <linux/fs.h> #include <linux/ioctl.h> -#include <linux/kthread.h> #include <linux/major.h> #include <linux/sched/task.h> #include <linux/slab.h> #include <linux/uaccess.h> +__must_check +static int copy_to_user_errcode(void __user *to, const void *from, unsigned long n) +{ + return copy_to_user(to, from, n) ? -EFAULT : 0; +} + /* returns with ref on ca->ref */ static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev, unsigned flags) @@ -132,8 +137,106 @@ static long bch2_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg } #endif +struct fsck_thread { + struct thread_with_stdio thr; + struct bch_fs *c; + char **devs; + size_t nr_devs; + struct bch_opts opts; +}; + +static void bch2_fsck_thread_exit(struct thread_with_stdio *_thr) +{ + struct fsck_thread *thr = container_of(_thr, struct fsck_thread, thr); + if (thr->devs) + for (size_t i = 0; i < thr->nr_devs; i++) + kfree(thr->devs[i]); + kfree(thr->devs); + kfree(thr); +} + +static int bch2_fsck_offline_thread_fn(void *arg) +{ + struct fsck_thread *thr = container_of(arg, struct fsck_thread, thr); + struct bch_fs *c = bch2_fs_open(thr->devs, thr->nr_devs, thr->opts); + + thr->thr.thr.ret = PTR_ERR_OR_ZERO(c); + if (!thr->thr.thr.ret) + bch2_fs_stop(c); + + thread_with_stdio_done(&thr->thr); + return 0; +} + +static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg) +{ + struct bch_ioctl_fsck_offline arg; + struct fsck_thread *thr = NULL; + u64 *devs = NULL; + long ret = 0; + + if (copy_from_user(&arg, user_arg, sizeof(arg))) + return -EFAULT; + + if (arg.flags) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!(devs = kcalloc(arg.nr_devs, sizeof(*devs), GFP_KERNEL)) || + !(thr = kzalloc(sizeof(*thr), GFP_KERNEL)) || + !(thr->devs = kcalloc(arg.nr_devs, sizeof(*thr->devs), GFP_KERNEL))) { + ret = -ENOMEM; + goto err; + } + + thr->opts = bch2_opts_empty(); + thr->nr_devs = arg.nr_devs; + + if (copy_from_user(devs, &user_arg->devs[0], + array_size(sizeof(user_arg->devs[0]), arg.nr_devs))) { + ret = -EINVAL; + goto err; + } + + for (size_t i = 0; i < arg.nr_devs; i++) { + thr->devs[i] = strndup_user((char __user *)(unsigned long) devs[i], PATH_MAX); + ret = PTR_ERR_OR_ZERO(thr->devs[i]); + if (ret) + goto err; + } + + if (arg.opts) { + char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16); + + ret = PTR_ERR_OR_ZERO(optstr) ?: + bch2_parse_mount_opts(NULL, &thr->opts, optstr); + kfree(optstr); + + if (ret) + goto err; + } + + opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio); + + ret = bch2_run_thread_with_stdio(&thr->thr, + bch2_fsck_thread_exit, + bch2_fsck_offline_thread_fn); +err: + if (ret < 0) { + if (thr) + bch2_fsck_thread_exit(&thr->thr); + pr_err("ret %s", bch2_err_str(ret)); + } + kfree(devs); + return ret; +} + static long bch2_global_ioctl(unsigned cmd, void __user *arg) { + long ret; + switch (cmd) { #if 0 case BCH_IOCTL_ASSEMBLE: @@ -141,18 +244,25 @@ static long bch2_global_ioctl(unsigned cmd, void __user *arg) case BCH_IOCTL_INCREMENTAL: return bch2_ioctl_incremental(arg); #endif + case BCH_IOCTL_FSCK_OFFLINE: { + ret = bch2_ioctl_fsck_offline(arg); + break; + } default: - return -ENOTTY; + ret = -ENOTTY; + break; } + + if (ret < 0) + ret = bch2_err_class(ret); + return ret; } static long bch2_ioctl_query_uuid(struct bch_fs *c, struct bch_ioctl_query_uuid __user *user_arg) { - if (copy_to_user(&user_arg->uuid, &c->sb.user_uuid, - sizeof(c->sb.user_uuid))) - return -EFAULT; - return 0; + return copy_to_user_errcode(&user_arg->uuid, &c->sb.user_uuid, + sizeof(c->sb.user_uuid)); } #if 0 @@ -295,31 +405,27 @@ static long bch2_ioctl_disk_set_state(struct bch_fs *c, } struct bch_data_ctx { + struct thread_with_file thr; + struct bch_fs *c; struct bch_ioctl_data arg; struct bch_move_stats stats; - - int ret; - - struct task_struct *thread; }; static int bch2_data_thread(void *arg) { - struct bch_data_ctx *ctx = arg; - - ctx->ret = bch2_data_job(ctx->c, &ctx->stats, ctx->arg); + struct bch_data_ctx *ctx = container_of(arg, struct bch_data_ctx, thr); + ctx->thr.ret = bch2_data_job(ctx->c, &ctx->stats, ctx->arg); ctx->stats.data_type = U8_MAX; return 0; } static int bch2_data_job_release(struct inode *inode, struct file *file) { - struct bch_data_ctx *ctx = file->private_data; + struct bch_data_ctx *ctx = container_of(file->private_data, struct bch_data_ctx, thr); - kthread_stop(ctx->thread); - put_task_struct(ctx->thread); + bch2_thread_with_file_exit(&ctx->thr); kfree(ctx); return 0; } @@ -327,7 +433,7 @@ static int bch2_data_job_release(struct inode *inode, struct file *file) static ssize_t bch2_data_job_read(struct file *file, char __user *buf, size_t len, loff_t *ppos) { - struct bch_data_ctx *ctx = file->private_data; + struct bch_data_ctx *ctx = container_of(file->private_data, struct bch_data_ctx, thr); struct bch_fs *c = ctx->c; struct bch_ioctl_data_event e = { .type = BCH_DATA_EVENT_PROGRESS, @@ -341,10 +447,7 @@ static ssize_t bch2_data_job_read(struct file *file, char __user *buf, if (len < sizeof(e)) return -EINVAL; - if (copy_to_user(buf, &e, sizeof(e))) - return -EFAULT; - - return sizeof(e); + return copy_to_user_errcode(buf, &e, sizeof(e)) ?: sizeof(e); } static const struct file_operations bcachefs_data_ops = { @@ -356,10 +459,8 @@ static const struct file_operations bcachefs_data_ops = { static long bch2_ioctl_data(struct bch_fs *c, struct bch_ioctl_data arg) { - struct bch_data_ctx *ctx = NULL; - struct file *file = NULL; - unsigned flags = O_RDONLY|O_CLOEXEC|O_NONBLOCK; - int ret, fd = -1; + struct bch_data_ctx *ctx; + int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -374,36 +475,11 @@ static long bch2_ioctl_data(struct bch_fs *c, ctx->c = c; ctx->arg = arg; - ctx->thread = kthread_create(bch2_data_thread, ctx, - "bch-data/%s", c->name); - if (IS_ERR(ctx->thread)) { - ret = PTR_ERR(ctx->thread); - goto err; - } - - ret = get_unused_fd_flags(flags); + ret = bch2_run_thread_with_file(&ctx->thr, + &bcachefs_data_ops, + bch2_data_thread); if (ret < 0) - goto err; - fd = ret; - - file = anon_inode_getfile("[bcachefs]", &bcachefs_data_ops, ctx, flags); - if (IS_ERR(file)) { - ret = PTR_ERR(file); - goto err; - } - - fd_install(fd, file); - - get_task_struct(ctx->thread); - wake_up_process(ctx->thread); - - return fd; -err: - if (fd >= 0) - put_unused_fd(fd); - if (!IS_ERR_OR_NULL(ctx->thread)) - kthread_stop(ctx->thread); - kfree(ctx); + kfree(ctx); return ret; } @@ -417,7 +493,7 @@ static long bch2_ioctl_fs_usage(struct bch_fs *c, unsigned i; int ret = 0; - if (!test_bit(BCH_FS_STARTED, &c->flags)) + if (!test_bit(BCH_FS_started, &c->flags)) return -EINVAL; if (get_user(replica_entries_bytes, &user_arg->replica_entries_bytes)) @@ -444,7 +520,7 @@ static long bch2_ioctl_fs_usage(struct bch_fs *c, dst_end = (void *) arg->replicas + replica_entries_bytes; for (i = 0; i < c->replicas.nr; i++) { - struct bch_replicas_entry *src_e = + struct bch_replicas_entry_v1 *src_e = cpu_replicas_entry(&c->replicas, i); /* check that we have enough space for one replicas entry */ @@ -474,14 +550,15 @@ static long bch2_ioctl_fs_usage(struct bch_fs *c, if (ret) goto err; - if (copy_to_user(user_arg, arg, - sizeof(*arg) + arg->replica_entries_bytes)) - ret = -EFAULT; + + ret = copy_to_user_errcode(user_arg, arg, + sizeof(*arg) + arg->replica_entries_bytes); err: kfree(arg); return ret; } +/* obsolete, didn't allow for new data types: */ static long bch2_ioctl_dev_usage(struct bch_fs *c, struct bch_ioctl_dev_usage __user *user_arg) { @@ -490,7 +567,7 @@ static long bch2_ioctl_dev_usage(struct bch_fs *c, struct bch_dev *ca; unsigned i; - if (!test_bit(BCH_FS_STARTED, &c->flags)) + if (!test_bit(BCH_FS_started, &c->flags)) return -EINVAL; if (copy_from_user(&arg, user_arg, sizeof(arg))) @@ -511,7 +588,6 @@ static long bch2_ioctl_dev_usage(struct bch_fs *c, arg.state = ca->mi.state; arg.bucket_size = ca->mi.bucket_size; arg.nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket; - arg.buckets_ec = src.buckets_ec; for (i = 0; i < BCH_DATA_NR; i++) { arg.d[i].buckets = src.d[i].buckets; @@ -521,10 +597,58 @@ static long bch2_ioctl_dev_usage(struct bch_fs *c, percpu_ref_put(&ca->ref); - if (copy_to_user(user_arg, &arg, sizeof(arg))) + return copy_to_user_errcode(user_arg, &arg, sizeof(arg)); +} + +static long bch2_ioctl_dev_usage_v2(struct bch_fs *c, + struct bch_ioctl_dev_usage_v2 __user *user_arg) +{ + struct bch_ioctl_dev_usage_v2 arg; + struct bch_dev_usage src; + struct bch_dev *ca; + int ret = 0; + + if (!test_bit(BCH_FS_started, &c->flags)) + return -EINVAL; + + if (copy_from_user(&arg, user_arg, sizeof(arg))) return -EFAULT; - return 0; + if ((arg.flags & ~BCH_BY_INDEX) || + arg.pad[0] || + arg.pad[1] || + arg.pad[2]) + return -EINVAL; + + ca = bch2_device_lookup(c, arg.dev, arg.flags); + if (IS_ERR(ca)) + return PTR_ERR(ca); + + src = bch2_dev_usage_read(ca); + + arg.state = ca->mi.state; + arg.bucket_size = ca->mi.bucket_size; + arg.nr_data_types = min(arg.nr_data_types, BCH_DATA_NR); + arg.nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket; + + ret = copy_to_user_errcode(user_arg, &arg, sizeof(arg)); + if (ret) + goto err; + + for (unsigned i = 0; i < arg.nr_data_types; i++) { + struct bch_ioctl_dev_usage_type t = { + .buckets = src.d[i].buckets, + .sectors = src.d[i].sectors, + .fragmented = src.d[i].fragmented, + }; + + ret = copy_to_user_errcode(&user_arg->d[i], &t, sizeof(t)); + if (ret) + goto err; + } +err: + percpu_ref_put(&ca->ref); + return ret; } static long bch2_ioctl_read_super(struct bch_fs *c, @@ -561,9 +685,8 @@ static long bch2_ioctl_read_super(struct bch_fs *c, goto err; } - if (copy_to_user((void __user *)(unsigned long)arg.sb, sb, - vstruct_bytes(sb))) - ret = -EFAULT; + ret = copy_to_user_errcode((void __user *)(unsigned long)arg.sb, sb, + vstruct_bytes(sb)); err: if (!IS_ERR_OR_NULL(ca)) percpu_ref_put(&ca->ref); @@ -575,8 +698,6 @@ static long bch2_ioctl_disk_get_idx(struct bch_fs *c, struct bch_ioctl_disk_get_idx arg) { dev_t dev = huge_decode_dev(arg.dev); - struct bch_dev *ca; - unsigned i; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -584,10 +705,10 @@ static long bch2_ioctl_disk_get_idx(struct bch_fs *c, if (!dev) return -EINVAL; - for_each_online_member(ca, c, i) + for_each_online_member(c, ca) if (ca->dev == dev) { percpu_ref_put(&ca->io_ref); - return i; + return ca->dev_idx; } return -BCH_ERR_ENOENT_dev_idx_not_found; @@ -642,6 +763,97 @@ static long bch2_ioctl_disk_resize_journal(struct bch_fs *c, return ret; } +static int bch2_fsck_online_thread_fn(void *arg) +{ + struct fsck_thread *thr = container_of(arg, struct fsck_thread, thr); + struct bch_fs *c = thr->c; + + c->stdio_filter = current; + c->stdio = &thr->thr.stdio; + + /* + * XXX: can we figure out a way to do this without mucking with c->opts? + */ + unsigned old_fix_errors = c->opts.fix_errors; + if (opt_defined(thr->opts, fix_errors)) + c->opts.fix_errors = thr->opts.fix_errors; + else + c->opts.fix_errors = FSCK_FIX_ask; + + c->opts.fsck = true; + set_bit(BCH_FS_fsck_running, &c->flags); + + c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info; + int ret = bch2_run_online_recovery_passes(c); + + clear_bit(BCH_FS_fsck_running, &c->flags); + bch_err_fn(c, ret); + + c->stdio = NULL; + c->stdio_filter = NULL; + c->opts.fix_errors = old_fix_errors; + + thread_with_stdio_done(&thr->thr); + + up(&c->online_fsck_mutex); + bch2_ro_ref_put(c); + return 0; +} + +static long bch2_ioctl_fsck_online(struct bch_fs *c, + struct bch_ioctl_fsck_online arg) +{ + struct fsck_thread *thr = NULL; + long ret = 0; + + if (arg.flags) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!bch2_ro_ref_tryget(c)) + return -EROFS; + + if (down_trylock(&c->online_fsck_mutex)) { + bch2_ro_ref_put(c); + return -EAGAIN; + } + + thr = kzalloc(sizeof(*thr), GFP_KERNEL); + if (!thr) { + ret = -ENOMEM; + goto err; + } + + thr->c = c; + thr->opts = bch2_opts_empty(); + + if (arg.opts) { + char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16); + + ret = PTR_ERR_OR_ZERO(optstr) ?: + bch2_parse_mount_opts(c, &thr->opts, optstr); + kfree(optstr); + + if (ret) + goto err; + } + + ret = bch2_run_thread_with_stdio(&thr->thr, + bch2_fsck_thread_exit, + bch2_fsck_online_thread_fn); +err: + if (ret < 0) { + bch_err_fn(c, ret); + if (thr) + bch2_fsck_thread_exit(&thr->thr); + up(&c->online_fsck_mutex); + bch2_ro_ref_put(c); + } + return ret; +} + #define BCH_IOCTL(_name, _argtype) \ do { \ _argtype i; \ @@ -663,6 +875,8 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg) return bch2_ioctl_fs_usage(c, arg); case BCH_IOCTL_DEV_USAGE: return bch2_ioctl_dev_usage(c, arg); + case BCH_IOCTL_DEV_USAGE_V2: + return bch2_ioctl_dev_usage_v2(c, arg); #if 0 case BCH_IOCTL_START: BCH_IOCTL(start, struct bch_ioctl_start); @@ -675,7 +889,7 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg) BCH_IOCTL(disk_get_idx, struct bch_ioctl_disk_get_idx); } - if (!test_bit(BCH_FS_STARTED, &c->flags)) + if (!test_bit(BCH_FS_started, &c->flags)) return -EINVAL; switch (cmd) { @@ -695,7 +909,8 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg) BCH_IOCTL(disk_resize, struct bch_ioctl_disk_resize); case BCH_IOCTL_DISK_RESIZE_JOURNAL: BCH_IOCTL(disk_resize_journal, struct bch_ioctl_disk_resize_journal); - + case BCH_IOCTL_FSCK_ONLINE: + BCH_IOCTL(fsck_online, struct bch_ioctl_fsck_online); default: return -ENOTTY; } diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h index 13998388c545..1b8c2c1016dc 100644 --- a/fs/bcachefs/checksum.h +++ b/fs/bcachefs/checksum.h @@ -45,6 +45,29 @@ struct bch_csum bch2_checksum(struct bch_fs *, unsigned, struct nonce, bch2_checksum(_c, _type, _nonce, _start, vstruct_end(_i) - _start);\ }) +static inline void bch2_csum_to_text(struct printbuf *out, + enum bch_csum_type type, + struct bch_csum csum) +{ + const u8 *p = (u8 *) &csum; + unsigned bytes = type < BCH_CSUM_NR ? bch_crc_bytes[type] : 16; + + for (unsigned i = 0; i < bytes; i++) + prt_hex_byte(out, p[i]); +} + +static inline void bch2_csum_err_msg(struct printbuf *out, + enum bch_csum_type type, + struct bch_csum expected, + struct bch_csum got) +{ + prt_printf(out, "checksum error: got "); + bch2_csum_to_text(out, type, got); + prt_str(out, " should be "); + bch2_csum_to_text(out, type, expected); + prt_printf(out, " type %s", bch2_csum_types[type]); +} + int bch2_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t); int bch2_request_key(struct bch_sb *, struct bch_key *); #ifndef __KERNEL__ diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c index f41889093a2c..363644451106 100644 --- a/fs/bcachefs/clock.c +++ b/fs/bcachefs/clock.c @@ -109,7 +109,7 @@ void bch2_kthread_io_clock_wait(struct io_clock *clock, if (cpu_timeout != MAX_SCHEDULE_TIMEOUT) mod_timer(&wait.cpu_timer, cpu_timeout + jiffies); - while (1) { + do { set_current_state(TASK_INTERRUPTIBLE); if (kthread && kthread_should_stop()) break; @@ -119,7 +119,7 @@ void bch2_kthread_io_clock_wait(struct io_clock *clock, schedule(); try_to_freeze(); - } + } while (0); __set_current_state(TASK_RUNNING); del_timer_sync(&wait.cpu_timer); diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index 51af8ea230ed..33df8cf86bd8 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -572,10 +572,6 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) ZSTD_parameters params = zstd_get_params(zstd_max_clevel(), c->opts.encoded_extent_max); - /* - * ZSTD is lying: if we allocate the size of the workspace it says it - * requires, it returns memory allocation errors - */ c->zstd_workspace_size = zstd_cctx_workspace_bound(¶ms.cParams); struct { diff --git a/fs/bcachefs/compress.h b/fs/bcachefs/compress.h index 607fd5e232c9..58c2eb45570f 100644 --- a/fs/bcachefs/compress.h +++ b/fs/bcachefs/compress.h @@ -47,6 +47,14 @@ static inline enum bch_compression_type bch2_compression_opt_to_type(unsigned v) return __bch2_compression_opt_to_type[bch2_compression_decode(v).type]; } +static inline void bch2_prt_compression_type(struct printbuf *out, enum bch_compression_type type) +{ + if (type < BCH_COMPRESSION_TYPE_NR) + prt_str(out, __bch2_compression_types[type]); + else + prt_printf(out, "(invalid compression type %u)", type); +} + int bch2_bio_uncompress_inplace(struct bch_fs *, struct bio *, struct bch_extent_crc_unpacked *); int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *, diff --git a/fs/bcachefs/darray.h b/fs/bcachefs/darray.h index e367c625f057..4b340d13caac 100644 --- a/fs/bcachefs/darray.h +++ b/fs/bcachefs/darray.h @@ -20,6 +20,7 @@ struct { \ #define DARRAY(_type) DARRAY_PREALLOCATED(_type, 0) typedef DARRAY(char) darray_char; +typedef DARRAY(char *) darray_str; int __bch2_darray_resize(darray_char *, size_t, size_t, gfp_t); @@ -81,11 +82,14 @@ static inline int __darray_make_room(darray_char *d, size_t t_size, size_t more, #define darray_remove_item(_d, _pos) \ array_remove_item((_d)->data, (_d)->nr, (_pos) - (_d)->data) +#define __darray_for_each(_d, _i) \ + for ((_i) = (_d).data; _i < (_d).data + (_d).nr; _i++) + #define darray_for_each(_d, _i) \ - for (_i = (_d).data; _i < (_d).data + (_d).nr; _i++) + for (typeof(&(_d).data[0]) _i = (_d).data; _i < (_d).data + (_d).nr; _i++) #define darray_for_each_reverse(_d, _i) \ - for (_i = (_d).data + (_d).nr - 1; _i >= (_d).data; --_i) + for (typeof(&(_d).data[0]) _i = (_d).data + (_d).nr - 1; _i >= (_d).data; --_i) #define darray_init(_d) \ do { \ diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 37d6ecae8c30..4150feca42a2 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -267,19 +267,31 @@ restart_drop_extra_replicas: goto out; } + if (trace_data_update_enabled()) { + struct printbuf buf = PRINTBUF; + + prt_str(&buf, "\nold: "); + bch2_bkey_val_to_text(&buf, c, old); + prt_str(&buf, "\nk: "); + bch2_bkey_val_to_text(&buf, c, k); + prt_str(&buf, "\nnew: "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); + + trace_data_update(c, buf.buf); + printbuf_exit(&buf); + } + ret = bch2_insert_snapshot_whiteouts(trans, m->btree_id, k.k->p, bkey_start_pos(&insert->k)) ?: bch2_insert_snapshot_whiteouts(trans, m->btree_id, k.k->p, insert->k.p) ?: - bch2_bkey_set_needs_rebalance(c, insert, - op->opts.background_target, - op->opts.background_compression) ?: + bch2_bkey_set_needs_rebalance(c, insert, &op->opts) ?: bch2_trans_update(trans, &iter, insert, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: bch2_trans_commit(trans, &op->res, NULL, - BTREE_INSERT_NOCHECK_RW| - BTREE_INSERT_NOFAIL| + BCH_TRANS_COMMIT_no_check_rw| + BCH_TRANS_COMMIT_no_enospc| m->data_opts.btree_insert_flags); if (!ret) { bch2_btree_iter_set_pos(&iter, next_pos); @@ -300,14 +312,14 @@ next: } continue; nowork: - if (m->stats && m->stats) { + if (m->stats) { BUG_ON(k.k->p.offset <= iter.pos.offset); atomic64_inc(&m->stats->keys_raced); atomic64_add(k.k->p.offset - iter.pos.offset, &m->stats->sectors_raced); } - this_cpu_inc(c->counters[BCH_COUNTER_move_extent_fail]); + count_event(c, move_extent_fail); bch2_btree_iter_advance(&iter); goto next; @@ -342,7 +354,6 @@ void bch2_data_update_exit(struct data_update *update) struct bch_fs *c = update->op.c; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(update->k.k)); - const struct bch_extent_ptr *ptr; bkey_for_each_ptr(ptrs, ptr) { if (c->opts.nocow_enabled) @@ -363,7 +374,6 @@ static void bch2_update_unwritten_extent(struct btree_trans *trans, struct bio *bio = &update->op.wbio.bio; struct bkey_i_extent *e; struct write_point *wp; - struct bch_extent_ptr *ptr; struct closure cl; struct btree_iter iter; struct bkey_s_c k; @@ -404,6 +414,8 @@ static void bch2_update_unwritten_extent(struct btree_trans *trans, continue; } + bch_err_fn_ratelimited(c, ret); + if (ret) return; @@ -476,7 +488,7 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans, return bch2_trans_relock(trans) ?: bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: - bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL); + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); } int bch2_data_update_init(struct btree_trans *trans, @@ -493,7 +505,6 @@ int bch2_data_update_init(struct btree_trans *trans, struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; - const struct bch_extent_ptr *ptr; unsigned i, reserve_sectors = k.k->size * data_opts.extra_replicas; unsigned ptrs_locked = 0; int ret = 0; @@ -516,7 +527,7 @@ int bch2_data_update_init(struct btree_trans *trans, BCH_WRITE_DATA_ENCODED| BCH_WRITE_MOVE| m->data_opts.write_flags; - m->op.compression_opt = io_opts.background_compression ?: io_opts.compression; + m->op.compression_opt = background_compression(io_opts); m->op.watermark = m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK; bkey_for_each_ptr(ptrs, ptr) @@ -639,7 +650,6 @@ done: void bch2_data_update_opts_normalize(struct bkey_s_c k, struct data_update_opts *opts) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr; unsigned i = 0; bkey_for_each_ptr(ptrs, ptr) { diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 57c5128db173..7bdba8507fc9 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -44,19 +44,19 @@ static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b, return false; bio = bio_alloc_bioset(ca->disk_sb.bdev, - buf_pages(n_sorted, btree_bytes(c)), + buf_pages(n_sorted, btree_buf_bytes(b)), REQ_OP_READ|REQ_META, GFP_NOFS, &c->btree_bio); bio->bi_iter.bi_sector = pick.ptr.offset; - bch2_bio_map(bio, n_sorted, btree_bytes(c)); + bch2_bio_map(bio, n_sorted, btree_buf_bytes(b)); submit_bio_wait(bio); bio_put(bio); percpu_ref_put(&ca->io_ref); - memcpy(n_ondisk, n_sorted, btree_bytes(c)); + memcpy(n_ondisk, n_sorted, btree_buf_bytes(b)); v->written = 0; if (bch2_btree_node_read_done(c, ca, v, false, &saw_error) || saw_error) @@ -137,7 +137,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) mutex_lock(&c->verify_lock); if (!c->verify_ondisk) { - c->verify_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL); + c->verify_ondisk = kvpmalloc(btree_buf_bytes(b), GFP_KERNEL); if (!c->verify_ondisk) goto out; } @@ -199,19 +199,19 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c, return; } - n_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL); + n_ondisk = kvpmalloc(btree_buf_bytes(b), GFP_KERNEL); if (!n_ondisk) { prt_printf(out, "memory allocation failure\n"); goto out; } bio = bio_alloc_bioset(ca->disk_sb.bdev, - buf_pages(n_ondisk, btree_bytes(c)), + buf_pages(n_ondisk, btree_buf_bytes(b)), REQ_OP_READ|REQ_META, GFP_NOFS, &c->btree_bio); bio->bi_iter.bi_sector = pick.ptr.offset; - bch2_bio_map(bio, n_ondisk, btree_bytes(c)); + bch2_bio_map(bio, n_ondisk, btree_buf_bytes(b)); ret = submit_bio_wait(bio); if (ret) { @@ -293,7 +293,7 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c, out: if (bio) bio_put(bio); - kvpfree(n_ondisk, btree_bytes(c)); + kvpfree(n_ondisk, btree_buf_bytes(b)); percpu_ref_put(&ca->io_ref); } @@ -366,35 +366,23 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, size_t size, loff_t *ppos) { struct dump_iter *i = file->private_data; - struct btree_trans *trans; - struct btree_iter iter; - struct bkey_s_c k; - ssize_t ret; i->ubuf = buf; i->size = size; i->ret = 0; - ret = flush_buf(i); - if (ret) - return ret; - - trans = bch2_trans_get(i->c); - ret = for_each_btree_key2(trans, iter, i->id, i->from, - BTREE_ITER_PREFETCH| - BTREE_ITER_ALL_SNAPSHOTS, k, ({ - bch2_bkey_val_to_text(&i->buf, i->c, k); - prt_newline(&i->buf); - drop_locks_do(trans, flush_buf(i)); - })); - i->from = iter.pos; - - bch2_trans_put(trans); - - if (!ret) - ret = flush_buf(i); - - return ret ?: i->ret; + return flush_buf(i) ?: + bch2_trans_run(i->c, + for_each_btree_key(trans, iter, i->id, i->from, + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS, k, ({ + bch2_bkey_val_to_text(&i->buf, i->c, k); + prt_newline(&i->buf); + bch2_trans_unlock(trans); + i->from = bpos_successor(iter.pos); + flush_buf(i); + }))) ?: + i->ret; } static const struct file_operations btree_debug_ops = { @@ -462,44 +450,32 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, size_t size, loff_t *ppos) { struct dump_iter *i = file->private_data; - struct btree_trans *trans; - struct btree_iter iter; - struct bkey_s_c k; - ssize_t ret; i->ubuf = buf; i->size = size; i->ret = 0; - ret = flush_buf(i); - if (ret) - return ret; - - trans = bch2_trans_get(i->c); - - ret = for_each_btree_key2(trans, iter, i->id, i->from, - BTREE_ITER_PREFETCH| - BTREE_ITER_ALL_SNAPSHOTS, k, ({ - struct btree_path_level *l = &iter.path->l[0]; - struct bkey_packed *_k = - bch2_btree_node_iter_peek(&l->iter, l->b); - - if (bpos_gt(l->b->key.k.p, i->prev_node)) { - bch2_btree_node_to_text(&i->buf, i->c, l->b); - i->prev_node = l->b->key.k.p; - } - - bch2_bfloat_to_text(&i->buf, l->b, _k); - drop_locks_do(trans, flush_buf(i)); - })); - i->from = iter.pos; - - bch2_trans_put(trans); - - if (!ret) - ret = flush_buf(i); - - return ret ?: i->ret; + return flush_buf(i) ?: + bch2_trans_run(i->c, + for_each_btree_key(trans, iter, i->id, i->from, + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS, k, ({ + struct btree_path_level *l = + &btree_iter_path(trans, &iter)->l[0]; + struct bkey_packed *_k = + bch2_btree_node_iter_peek(&l->iter, l->b); + + if (bpos_gt(l->b->key.k.p, i->prev_node)) { + bch2_btree_node_to_text(&i->buf, i->c, l->b); + i->prev_node = l->b->key.k.p; + } + + bch2_bfloat_to_text(&i->buf, l->b, _k); + bch2_trans_unlock(trans); + i->from = bpos_successor(iter.pos); + flush_buf(i); + }))) ?: + i->ret; } static const struct file_operations bfloat_failed_debug_ops = { @@ -616,7 +592,6 @@ static const struct file_operations cached_btree_nodes_ops = { .read = bch2_cached_btree_nodes_read, }; -#ifdef CONFIG_BCACHEFS_DEBUG_TRANSACTIONS static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) { @@ -632,7 +607,9 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf, restart: seqmutex_lock(&c->btree_trans_lock); list_for_each_entry(trans, &c->btree_trans_list, list) { - if (trans->locking_wait.task->pid <= i->iter) + struct task_struct *task = READ_ONCE(trans->locking_wait.task); + + if (!task || task->pid <= i->iter) continue; closure_get(&trans->ref); @@ -650,11 +627,11 @@ restart: prt_printf(&i->buf, "backtrace:"); prt_newline(&i->buf); printbuf_indent_add(&i->buf, 2); - bch2_prt_task_backtrace(&i->buf, trans->locking_wait.task); + bch2_prt_task_backtrace(&i->buf, task, 0, GFP_KERNEL); printbuf_indent_sub(&i->buf, 2); prt_newline(&i->buf); - i->iter = trans->locking_wait.task->pid; + i->iter = task->pid; closure_put(&trans->ref); @@ -678,7 +655,6 @@ static const struct file_operations btree_transactions_ops = { .release = bch2_dump_release, .read = bch2_btree_transactions_read, }; -#endif /* CONFIG_BCACHEFS_DEBUG_TRANSACTIONS */ static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) @@ -717,7 +693,7 @@ static const struct file_operations journal_pins_ops = { .read = bch2_journal_pins_read, }; -static int lock_held_stats_open(struct inode *inode, struct file *file) +static int btree_transaction_stats_open(struct inode *inode, struct file *file) { struct bch_fs *c = inode->i_private; struct dump_iter *i; @@ -727,7 +703,7 @@ static int lock_held_stats_open(struct inode *inode, struct file *file) if (!i) return -ENOMEM; - i->iter = 0; + i->iter = 1; i->c = c; i->buf = PRINTBUF; file->private_data = i; @@ -735,7 +711,7 @@ static int lock_held_stats_open(struct inode *inode, struct file *file) return 0; } -static int lock_held_stats_release(struct inode *inode, struct file *file) +static int btree_transaction_stats_release(struct inode *inode, struct file *file) { struct dump_iter *i = file->private_data; @@ -745,8 +721,8 @@ static int lock_held_stats_release(struct inode *inode, struct file *file) return 0; } -static ssize_t lock_held_stats_read(struct file *file, char __user *buf, - size_t size, loff_t *ppos) +static ssize_t btree_transaction_stats_read(struct file *file, char __user *buf, + size_t size, loff_t *ppos) { struct dump_iter *i = file->private_data; struct bch_fs *c = i->c; @@ -779,6 +755,13 @@ static ssize_t lock_held_stats_read(struct file *file, char __user *buf, prt_printf(&i->buf, "Max mem used: %u", s->max_mem); prt_newline(&i->buf); + prt_printf(&i->buf, "Transaction duration:"); + prt_newline(&i->buf); + + printbuf_indent_add(&i->buf, 2); + bch2_time_stats_to_text(&i->buf, &s->duration); + printbuf_indent_sub(&i->buf, 2); + if (IS_ENABLED(CONFIG_BCACHEFS_LOCK_TIME_STATS)) { prt_printf(&i->buf, "Lock hold times:"); prt_newline(&i->buf); @@ -810,11 +793,11 @@ static ssize_t lock_held_stats_read(struct file *file, char __user *buf, return i->ret; } -static const struct file_operations lock_held_stats_op = { - .owner = THIS_MODULE, - .open = lock_held_stats_open, - .release = lock_held_stats_release, - .read = lock_held_stats_read, +static const struct file_operations btree_transaction_stats_op = { + .owner = THIS_MODULE, + .open = btree_transaction_stats_open, + .release = btree_transaction_stats_release, + .read = btree_transaction_stats_read, }; static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf, @@ -835,7 +818,9 @@ static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf, restart: seqmutex_lock(&c->btree_trans_lock); list_for_each_entry(trans, &c->btree_trans_list, list) { - if (trans->locking_wait.task->pid <= i->iter) + struct task_struct *task = READ_ONCE(trans->locking_wait.task); + + if (!task || task->pid <= i->iter) continue; closure_get(&trans->ref); @@ -850,7 +835,7 @@ restart: bch2_check_for_deadlock(trans, &i->buf); - i->iter = trans->locking_wait.task->pid; + i->iter = task->pid; closure_put(&trans->ref); @@ -897,16 +882,14 @@ void bch2_fs_debug_init(struct bch_fs *c) debugfs_create_file("cached_btree_nodes", 0400, c->fs_debug_dir, c->btree_debug, &cached_btree_nodes_ops); -#ifdef CONFIG_BCACHEFS_DEBUG_TRANSACTIONS debugfs_create_file("btree_transactions", 0400, c->fs_debug_dir, c->btree_debug, &btree_transactions_ops); -#endif debugfs_create_file("journal_pins", 0400, c->fs_debug_dir, c->btree_debug, &journal_pins_ops); debugfs_create_file("btree_transaction_stats", 0400, c->fs_debug_dir, - c, &lock_held_stats_op); + c, &btree_transaction_stats_op); debugfs_create_file("btree_deadlock", 0400, c->fs_debug_dir, c->btree_debug, &btree_deadlock_ops); @@ -947,8 +930,6 @@ void bch2_debug_exit(void) int __init bch2_debug_init(void) { - int ret = 0; - bch_debug = debugfs_create_dir("bcachefs", NULL); - return ret; + return 0; } diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 2bfff0da7000..4ae1e9f002a0 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -65,7 +65,7 @@ static bool dirent_cmp_key(struct bkey_s_c _l, const void *_r) const struct qstr l_name = bch2_dirent_get_name(l); const struct qstr *r_name = _r; - return l_name.len - r_name->len ?: memcmp(l_name.name, r_name->name, l_name.len); + return !qstr_eq(l_name, *r_name); } static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) @@ -75,7 +75,7 @@ static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) const struct qstr l_name = bch2_dirent_get_name(l); const struct qstr r_name = bch2_dirent_get_name(r); - return l_name.len - r_name.len ?: memcmp(l_name.name, r_name.name, l_name.len); + return !qstr_eq(l_name, r_name); } static bool dirent_is_visible(subvol_inum inum, struct bkey_s_c k) @@ -198,10 +198,39 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, return dirent; } +int bch2_dirent_create_snapshot(struct btree_trans *trans, + u64 dir, u32 snapshot, + const struct bch_hash_info *hash_info, + u8 type, const struct qstr *name, u64 dst_inum, + u64 *dir_offset, + bch_str_hash_flags_t str_hash_flags) +{ + subvol_inum zero_inum = { 0 }; + struct bkey_i_dirent *dirent; + int ret; + + dirent = dirent_create_key(trans, zero_inum, type, name, dst_inum); + ret = PTR_ERR_OR_ZERO(dirent); + if (ret) + return ret; + + dirent->k.p.inode = dir; + dirent->k.p.snapshot = snapshot; + + ret = bch2_hash_set_snapshot(trans, bch2_dirent_hash_desc, hash_info, + zero_inum, snapshot, + &dirent->k_i, str_hash_flags, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + *dir_offset = dirent->k.p.offset; + + return ret; +} + int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir, const struct bch_hash_info *hash_info, u8 type, const struct qstr *name, u64 dst_inum, - u64 *dir_offset, int flags) + u64 *dir_offset, + bch_str_hash_flags_t str_hash_flags) { struct bkey_i_dirent *dirent; int ret; @@ -212,7 +241,7 @@ int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir, return ret; ret = bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info, - dir, &dirent->k_i, flags); + dir, &dirent->k_i, str_hash_flags); *dir_offset = dirent->k.p.offset; return ret; @@ -470,17 +499,11 @@ u64 bch2_dirent_lookup(struct bch_fs *c, subvol_inum dir, const struct qstr *name, subvol_inum *inum) { struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - int ret; -retry: - bch2_trans_begin(trans); + struct btree_iter iter = { NULL }; - ret = __bch2_dirent_lookup_trans(trans, &iter, dir, hash_info, - name, inum, 0); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - if (!ret) - bch2_trans_iter_exit(trans, &iter); + int ret = lockrestart_do(trans, + __bch2_dirent_lookup_trans(trans, &iter, dir, hash_info, name, inum, 0)); + bch2_trans_iter_exit(trans, &iter); bch2_trans_put(trans); return ret; } diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h index 1e3431990abd..21ffeb78f02e 100644 --- a/fs/bcachefs/dirent.h +++ b/fs/bcachefs/dirent.h @@ -35,9 +35,14 @@ static inline unsigned dirent_val_u64s(unsigned len) int bch2_dirent_read_target(struct btree_trans *, subvol_inum, struct bkey_s_c_dirent, subvol_inum *); +int bch2_dirent_create_snapshot(struct btree_trans *, u64, u32, + const struct bch_hash_info *, u8, + const struct qstr *, u64, u64 *, + bch_str_hash_flags_t); int bch2_dirent_create(struct btree_trans *, subvol_inum, const struct bch_hash_info *, u8, - const struct qstr *, u64, u64 *, int); + const struct qstr *, u64, u64 *, + bch_str_hash_flags_t); static inline unsigned vfs_d_type(unsigned type) { diff --git a/fs/bcachefs/dirent_format.h b/fs/bcachefs/dirent_format.h new file mode 100644 index 000000000000..5e116b88e814 --- /dev/null +++ b/fs/bcachefs/dirent_format.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_DIRENT_FORMAT_H +#define _BCACHEFS_DIRENT_FORMAT_H + +/* + * Dirents (and xattrs) have to implement string lookups; since our b-tree + * doesn't support arbitrary length strings for the key, we instead index by a + * 64 bit hash (currently truncated sha1) of the string, stored in the offset + * field of the key - using linear probing to resolve hash collisions. This also + * provides us with the readdir cookie posix requires. + * + * Linear probing requires us to use whiteouts for deletions, in the event of a + * collision: + */ + +struct bch_dirent { + struct bch_val v; + + /* Target inode number: */ + union { + __le64 d_inum; + struct { /* DT_SUBVOL */ + __le32 d_child_subvol; + __le32 d_parent_subvol; + }; + }; + + /* + * Copy of mode bits 12-15 from the target inode - so userspace can get + * the filetype without having to do a stat() + */ + __u8 d_type; + + __u8 d_name[]; +} __packed __aligned(8); + +#define DT_SUBVOL 16 +#define BCH_DT_MAX 17 + +#define BCH_NAME_MAX 512 + +#endif /* _BCACHEFS_DIRENT_FORMAT_H */ diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c index 4d0cb0ccff32..06a7df529b40 100644 --- a/fs/bcachefs/disk_groups.c +++ b/fs/bcachefs/disk_groups.c @@ -89,19 +89,14 @@ err: void bch2_disk_groups_to_text(struct printbuf *out, struct bch_fs *c) { - struct bch_disk_groups_cpu *g; - struct bch_dev *ca; - int i; - unsigned iter; - out->atomic++; rcu_read_lock(); - g = rcu_dereference(c->disk_groups); + struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups); if (!g) goto out; - for (i = 0; i < g->nr; i++) { + for (unsigned i = 0; i < g->nr; i++) { if (i) prt_printf(out, " "); @@ -111,7 +106,7 @@ void bch2_disk_groups_to_text(struct printbuf *out, struct bch_fs *c) } prt_printf(out, "[parent %d devs", g->entries[i].parent); - for_each_member_device_rcu(ca, c, iter, &g->entries[i].devs) + for_each_member_device_rcu(c, ca, &g->entries[i].devs) prt_printf(out, " %s", ca->name); prt_printf(out, "]"); } @@ -562,7 +557,7 @@ void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v) : NULL; if (ca && percpu_ref_tryget(&ca->io_ref)) { - prt_printf(out, "/dev/%pg", ca->disk_sb.bdev); + prt_printf(out, "/dev/%s", ca->name); percpu_ref_put(&ca->io_ref); } else if (ca) { prt_printf(out, "offline device %u", t.dev); diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 2a77de18c004..d503af270024 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -3,6 +3,7 @@ /* erasure coding */ #include "bcachefs.h" +#include "alloc_background.h" #include "alloc_foreground.h" #include "backpointers.h" #include "bkey_buf.h" @@ -156,12 +157,311 @@ void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, } } +/* Triggers: */ + +static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans, + struct bkey_s_c_stripe s, + unsigned idx, bool deleting) +{ + struct bch_fs *c = trans->c; + const struct bch_extent_ptr *ptr = &s.v->ptrs[idx]; + struct btree_iter iter; + struct bkey_i_alloc_v4 *a; + enum bch_data_type data_type = idx >= s.v->nr_blocks - s.v->nr_redundant + ? BCH_DATA_parity : 0; + s64 sectors = data_type ? le16_to_cpu(s.v->sectors) : 0; + int ret = 0; + + if (deleting) + sectors = -sectors; + + a = bch2_trans_start_alloc_update(trans, &iter, PTR_BUCKET_POS(c, ptr)); + if (IS_ERR(a)) + return PTR_ERR(a); + + ret = bch2_check_bucket_ref(trans, s.s_c, ptr, sectors, data_type, + a->v.gen, a->v.data_type, + a->v.dirty_sectors); + if (ret) + goto err; + + if (!deleting) { + if (bch2_trans_inconsistent_on(a->v.stripe || + a->v.stripe_redundancy, trans, + "bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)", + iter.pos.inode, iter.pos.offset, a->v.gen, + bch2_data_type_str(a->v.data_type), + a->v.dirty_sectors, + a->v.stripe, s.k->p.offset)) { + ret = -EIO; + goto err; + } + + if (bch2_trans_inconsistent_on(data_type && a->v.dirty_sectors, trans, + "bucket %llu:%llu gen %u data type %s dirty_sectors %u: data already in stripe bucket %llu", + iter.pos.inode, iter.pos.offset, a->v.gen, + bch2_data_type_str(a->v.data_type), + a->v.dirty_sectors, + s.k->p.offset)) { + ret = -EIO; + goto err; + } + + a->v.stripe = s.k->p.offset; + a->v.stripe_redundancy = s.v->nr_redundant; + a->v.data_type = BCH_DATA_stripe; + } else { + if (bch2_trans_inconsistent_on(a->v.stripe != s.k->p.offset || + a->v.stripe_redundancy != s.v->nr_redundant, trans, + "bucket %llu:%llu gen %u: not marked as stripe when deleting stripe %llu (got %u)", + iter.pos.inode, iter.pos.offset, a->v.gen, + s.k->p.offset, a->v.stripe)) { + ret = -EIO; + goto err; + } + + a->v.stripe = 0; + a->v.stripe_redundancy = 0; + a->v.data_type = alloc_data_type(a->v, BCH_DATA_user); + } + + a->v.dirty_sectors += sectors; + if (data_type) + a->v.data_type = !deleting ? data_type : 0; + + ret = bch2_trans_update(trans, &iter, &a->k_i, 0); + if (ret) + goto err; +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int mark_stripe_bucket(struct btree_trans *trans, + struct bkey_s_c k, + unsigned ptr_idx, + unsigned flags) +{ + struct bch_fs *c = trans->c; + const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; + unsigned nr_data = s->nr_blocks - s->nr_redundant; + bool parity = ptr_idx >= nr_data; + enum bch_data_type data_type = parity ? BCH_DATA_parity : BCH_DATA_stripe; + s64 sectors = parity ? le16_to_cpu(s->sectors) : 0; + const struct bch_extent_ptr *ptr = s->ptrs + ptr_idx; + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + struct bucket old, new, *g; + struct printbuf buf = PRINTBUF; + int ret = 0; + + BUG_ON(!(flags & BTREE_TRIGGER_GC)); + + /* * XXX doesn't handle deletion */ + + percpu_down_read(&c->mark_lock); + g = PTR_GC_BUCKET(ca, ptr); + + if (g->dirty_sectors || + (g->stripe && g->stripe != k.k->p.offset)) { + bch2_fs_inconsistent(c, + "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s", + ptr->dev, PTR_BUCKET_NR(ca, ptr), g->gen, + (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); + ret = -EINVAL; + goto err; + } + + bucket_lock(g); + old = *g; + + ret = bch2_check_bucket_ref(trans, k, ptr, sectors, data_type, + g->gen, g->data_type, + g->dirty_sectors); + if (ret) + goto err; + + g->data_type = data_type; + g->dirty_sectors += sectors; + + g->stripe = k.k->p.offset; + g->stripe_redundancy = s->nr_redundant; + new = *g; +err: + bucket_unlock(g); + if (!ret) + bch2_dev_usage_update_m(c, ca, &old, &new); + percpu_up_read(&c->mark_lock); + printbuf_exit(&buf); + return ret; +} + +int bch2_trigger_stripe(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, struct bkey_s _new, + unsigned flags) +{ + struct bkey_s_c new = _new.s_c; + struct bch_fs *c = trans->c; + u64 idx = new.k->p.offset; + const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe + ? bkey_s_c_to_stripe(old).v : NULL; + const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe + ? bkey_s_c_to_stripe(new).v : NULL; + + if (flags & BTREE_TRIGGER_TRANSACTIONAL) { + /* + * If the pointers aren't changing, we don't need to do anything: + */ + if (new_s && old_s && + new_s->nr_blocks == old_s->nr_blocks && + new_s->nr_redundant == old_s->nr_redundant && + !memcmp(old_s->ptrs, new_s->ptrs, + new_s->nr_blocks * sizeof(struct bch_extent_ptr))) + return 0; + + BUG_ON(new_s && old_s && + (new_s->nr_blocks != old_s->nr_blocks || + new_s->nr_redundant != old_s->nr_redundant)); + + if (new_s) { + s64 sectors = le16_to_cpu(new_s->sectors); + + struct bch_replicas_padded r; + bch2_bkey_to_replicas(&r.e, new); + int ret = bch2_update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant); + if (ret) + return ret; + } + + if (old_s) { + s64 sectors = -((s64) le16_to_cpu(old_s->sectors)); + + struct bch_replicas_padded r; + bch2_bkey_to_replicas(&r.e, old); + int ret = bch2_update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant); + if (ret) + return ret; + } + + unsigned nr_blocks = new_s ? new_s->nr_blocks : old_s->nr_blocks; + for (unsigned i = 0; i < nr_blocks; i++) { + if (new_s && old_s && + !memcmp(&new_s->ptrs[i], + &old_s->ptrs[i], + sizeof(new_s->ptrs[i]))) + continue; + + if (new_s) { + int ret = bch2_trans_mark_stripe_bucket(trans, + bkey_s_c_to_stripe(new), i, false); + if (ret) + return ret; + } + + if (old_s) { + int ret = bch2_trans_mark_stripe_bucket(trans, + bkey_s_c_to_stripe(old), i, true); + if (ret) + return ret; + } + } + } + + if (flags & BTREE_TRIGGER_ATOMIC) { + struct stripe *m = genradix_ptr(&c->stripes, idx); + + if (!m) { + struct printbuf buf1 = PRINTBUF; + struct printbuf buf2 = PRINTBUF; + + bch2_bkey_val_to_text(&buf1, c, old); + bch2_bkey_val_to_text(&buf2, c, new); + bch_err_ratelimited(c, "error marking nonexistent stripe %llu while marking\n" + "old %s\n" + "new %s", idx, buf1.buf, buf2.buf); + printbuf_exit(&buf2); + printbuf_exit(&buf1); + bch2_inconsistent_error(c); + return -1; + } + + if (!new_s) { + bch2_stripes_heap_del(c, m, idx); + + memset(m, 0, sizeof(*m)); + } else { + m->sectors = le16_to_cpu(new_s->sectors); + m->algorithm = new_s->algorithm; + m->nr_blocks = new_s->nr_blocks; + m->nr_redundant = new_s->nr_redundant; + m->blocks_nonempty = 0; + + for (unsigned i = 0; i < new_s->nr_blocks; i++) + m->blocks_nonempty += !!stripe_blockcount_get(new_s, i); + + if (!old_s) + bch2_stripes_heap_insert(c, m, idx); + else + bch2_stripes_heap_update(c, m, idx); + } + } + + if (flags & BTREE_TRIGGER_GC) { + struct gc_stripe *m = + genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL); + + if (!m) { + bch_err(c, "error allocating memory for gc_stripes, idx %llu", + idx); + return -BCH_ERR_ENOMEM_mark_stripe; + } + /* + * This will be wrong when we bring back runtime gc: we should + * be unmarking the old key and then marking the new key + */ + m->alive = true; + m->sectors = le16_to_cpu(new_s->sectors); + m->nr_blocks = new_s->nr_blocks; + m->nr_redundant = new_s->nr_redundant; + + for (unsigned i = 0; i < new_s->nr_blocks; i++) + m->ptrs[i] = new_s->ptrs[i]; + + bch2_bkey_to_replicas(&m->r.e, new); + + /* + * gc recalculates this field from stripe ptr + * references: + */ + memset(m->block_sectors, 0, sizeof(m->block_sectors)); + + for (unsigned i = 0; i < new_s->nr_blocks; i++) { + int ret = mark_stripe_bucket(trans, new, i, flags); + if (ret) + return ret; + } + + int ret = bch2_update_replicas(c, new, &m->r.e, + ((s64) m->sectors * m->nr_redundant), + 0, true); + if (ret) { + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, new); + bch2_fs_fatal_error(c, "no replicas entry for %s", buf.buf); + printbuf_exit(&buf); + return ret; + } + } + + return 0; +} + /* returns blocknr in stripe that we matched: */ static const struct bch_extent_ptr *bkey_matches_stripe(struct bch_stripe *s, struct bkey_s_c k, unsigned *block) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr; unsigned i, nr_data = s->nr_blocks - s->nr_redundant; bkey_for_each_ptr(ptrs, ptr) @@ -791,28 +1091,22 @@ static void ec_stripe_delete_work(struct work_struct *work) { struct bch_fs *c = container_of(work, struct bch_fs, ec_stripe_delete_work); - struct btree_trans *trans = bch2_trans_get(c); - int ret; - u64 idx; while (1) { mutex_lock(&c->ec_stripes_heap_lock); - idx = stripe_idx_to_delete(c); + u64 idx = stripe_idx_to_delete(c); mutex_unlock(&c->ec_stripes_heap_lock); if (!idx) break; - ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, - ec_stripe_delete(trans, idx)); - if (ret) { - bch_err_fn(c, ret); + int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + ec_stripe_delete(trans, idx)); + bch_err_fn(c, ret); + if (ret) break; - } } - bch2_trans_put(trans); - bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete); } @@ -983,8 +1277,8 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b while (1) { ret = commit_do(trans, NULL, NULL, - BTREE_INSERT_NOCHECK_RW| - BTREE_INSERT_NOFAIL, + BCH_TRANS_COMMIT_no_check_rw| + BCH_TRANS_COMMIT_no_enospc, ec_stripe_update_extent(trans, bucket_pos, bucket.gen, s, &bp_pos)); if (ret) @@ -1005,7 +1299,7 @@ static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s) unsigned i, nr_data = v->nr_blocks - v->nr_redundant; int ret = 0; - ret = bch2_btree_write_buffer_flush(trans); + ret = bch2_btree_write_buffer_flush_sync(trans); if (ret) goto err; @@ -1121,21 +1415,20 @@ static void ec_stripe_create(struct ec_stripe_new *s) } ret = bch2_trans_do(c, &s->res, NULL, - BTREE_INSERT_NOCHECK_RW| - BTREE_INSERT_NOFAIL, + BCH_TRANS_COMMIT_no_check_rw| + BCH_TRANS_COMMIT_no_enospc, ec_stripe_key_update(trans, bkey_i_to_stripe(&s->new_stripe.key), !s->have_existing_stripe)); + bch_err_msg(c, ret, "creating stripe key"); if (ret) { - bch_err(c, "error creating stripe: error creating stripe key"); goto err; } ret = ec_stripe_update_extents(c, &s->new_stripe); - if (ret) { - bch_err_msg(c, ret, "creating stripe: error updating pointers"); + bch_err_msg(c, ret, "error updating extents"); + if (ret) goto err; - } err: bch2_disk_reservation_put(c, &s->res); @@ -1250,18 +1543,17 @@ static int unsigned_cmp(const void *_l, const void *_r) static unsigned pick_blocksize(struct bch_fs *c, struct bch_devs_mask *devs) { - struct bch_dev *ca; - unsigned i, nr = 0, sizes[BCH_SB_MEMBERS_MAX]; + unsigned nr = 0, sizes[BCH_SB_MEMBERS_MAX]; struct { unsigned nr, size; } cur = { 0, 0 }, best = { 0, 0 }; - for_each_member_device_rcu(ca, c, i, devs) + for_each_member_device_rcu(c, ca, devs) sizes[nr++] = ca->mi.bucket_size; sort(sizes, nr, sizeof(unsigned), unsigned_cmp, NULL); - for (i = 0; i < nr; i++) { + for (unsigned i = 0; i < nr; i++) { if (sizes[i] != cur.size) { if (cur.nr > best.nr) best = cur; @@ -1344,8 +1636,6 @@ ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target, enum bch_watermark watermark) { struct ec_stripe_head *h; - struct bch_dev *ca; - unsigned i; h = kzalloc(sizeof(*h), GFP_KERNEL); if (!h) @@ -1362,13 +1652,13 @@ ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target, rcu_read_lock(); h->devs = target_rw_devs(c, BCH_DATA_user, target); - for_each_member_device_rcu(ca, c, i, &h->devs) + for_each_member_device_rcu(c, ca, &h->devs) if (!ca->mi.durability) - __clear_bit(i, h->devs.d); + __clear_bit(ca->dev_idx, h->devs.d); h->blocksize = pick_blocksize(c, &h->devs); - for_each_member_device_rcu(ca, c, i, &h->devs) + for_each_member_device_rcu(c, ca, &h->devs) if (ca->mi.bucket_size == h->blocksize) h->nr_active_devs++; @@ -1415,7 +1705,7 @@ __bch2_ec_stripe_head_get(struct btree_trans *trans, if (ret) return ERR_PTR(ret); - if (test_bit(BCH_FS_GOING_RO, &c->flags)) { + if (test_bit(BCH_FS_going_ro, &c->flags)) { h = ERR_PTR(-BCH_ERR_erofs_no_writes); goto found; } @@ -1833,44 +2123,32 @@ void bch2_fs_ec_flush(struct bch_fs *c) int bch2_stripes_read(struct bch_fs *c) { - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_s_c k; - const struct bch_stripe *s; - struct stripe *m; - unsigned i; - int ret; - - for_each_btree_key(trans, iter, BTREE_ID_stripes, POS_MIN, - BTREE_ITER_PREFETCH, k, ret) { - if (k.k->type != KEY_TYPE_stripe) - continue; - - ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL); - if (ret) - break; - - s = bkey_s_c_to_stripe(k).v; - - m = genradix_ptr(&c->stripes, k.k->p.offset); - m->sectors = le16_to_cpu(s->sectors); - m->algorithm = s->algorithm; - m->nr_blocks = s->nr_blocks; - m->nr_redundant = s->nr_redundant; - m->blocks_nonempty = 0; + int ret = bch2_trans_run(c, + for_each_btree_key(trans, iter, BTREE_ID_stripes, POS_MIN, + BTREE_ITER_PREFETCH, k, ({ + if (k.k->type != KEY_TYPE_stripe) + continue; - for (i = 0; i < s->nr_blocks; i++) - m->blocks_nonempty += !!stripe_blockcount_get(s, i); + ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL); + if (ret) + break; - bch2_stripes_heap_insert(c, m, k.k->p.offset); - } - bch2_trans_iter_exit(trans, &iter); + const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; - bch2_trans_put(trans); + struct stripe *m = genradix_ptr(&c->stripes, k.k->p.offset); + m->sectors = le16_to_cpu(s->sectors); + m->algorithm = s->algorithm; + m->nr_blocks = s->nr_blocks; + m->nr_redundant = s->nr_redundant; + m->blocks_nonempty = 0; - if (ret) - bch_err_fn(c, ret); + for (unsigned i = 0; i < s->nr_blocks; i++) + m->blocks_nonempty += !!stripe_blockcount_get(s, i); + bch2_stripes_heap_insert(c, m, k.k->p.offset); + 0; + }))); + bch_err_fn(c, ret); return ret; } diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index 7d0237c9819f..f4369b02e805 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -12,13 +12,14 @@ int bch2_stripe_invalid(struct bch_fs *, struct bkey_s_c, enum bkey_invalid_flags, struct printbuf *); void bch2_stripe_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +int bch2_trigger_stripe(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_s, unsigned); #define bch2_bkey_ops_stripe ((struct bkey_ops) { \ .key_invalid = bch2_stripe_invalid, \ .val_to_text = bch2_stripe_to_text, \ .swab = bch2_ptr_swab, \ - .trans_trigger = bch2_trans_mark_stripe, \ - .atomic_trigger = bch2_mark_stripe, \ + .trigger = bch2_trigger_stripe, \ .min_val_size = 8, \ }) diff --git a/fs/bcachefs/ec_format.h b/fs/bcachefs/ec_format.h new file mode 100644 index 000000000000..44ce88ba08d7 --- /dev/null +++ b/fs/bcachefs/ec_format.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_EC_FORMAT_H +#define _BCACHEFS_EC_FORMAT_H + +struct bch_stripe { + struct bch_val v; + __le16 sectors; + __u8 algorithm; + __u8 nr_blocks; + __u8 nr_redundant; + + __u8 csum_granularity_bits; + __u8 csum_type; + __u8 pad; + + struct bch_extent_ptr ptrs[]; +} __packed __aligned(8); + +#endif /* _BCACHEFS_EC_FORMAT_H */ diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h index e2b02a82de32..976426da3a12 100644 --- a/fs/bcachefs/ec_types.h +++ b/fs/bcachefs/ec_types.h @@ -5,7 +5,7 @@ #include "bcachefs_format.h" struct bch_replicas_padded { - struct bch_replicas_entry e; + struct bch_replicas_entry_v1 e; u8 pad[BCH_BKEY_PTRS_MAX]; }; diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 9ce29681eec9..8c40c2067a04 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -73,7 +73,6 @@ x(ENOMEM, ENOMEM_fsck_add_nlink) \ x(ENOMEM, ENOMEM_journal_key_insert) \ x(ENOMEM, ENOMEM_journal_keys_sort) \ - x(ENOMEM, ENOMEM_journal_replay) \ x(ENOMEM, ENOMEM_read_superblock_clean) \ x(ENOMEM, ENOMEM_fs_alloc) \ x(ENOMEM, ENOMEM_fs_name_alloc) \ @@ -152,7 +151,6 @@ x(BCH_ERR_btree_insert_fail, btree_insert_need_mark_replicas) \ x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_res) \ x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_reclaim) \ - x(BCH_ERR_btree_insert_fail, btree_insert_need_flush_buffer) \ x(0, backpointer_to_overwritten_btree_node) \ x(0, lock_fail_root_changed) \ x(0, journal_reclaim_would_deadlock) \ @@ -172,10 +170,12 @@ x(EINVAL, device_size_too_small) \ x(EINVAL, device_not_a_member_of_filesystem) \ x(EINVAL, device_has_been_removed) \ + x(EINVAL, device_splitbrain) \ x(EINVAL, device_already_online) \ x(EINVAL, insufficient_devices_to_start) \ x(EINVAL, invalid) \ x(EINVAL, internal_fsck_err) \ + x(EINVAL, opt_parse_error) \ x(EROFS, erofs_trans_commit) \ x(EROFS, erofs_no_writes) \ x(EROFS, erofs_journal_err) \ @@ -224,6 +224,8 @@ x(BCH_ERR_invalid, invalid_bkey) \ x(BCH_ERR_operation_blocked, nocow_lock_blocked) \ x(EIO, btree_node_read_err) \ + x(EIO, sb_not_downgraded) \ + x(EIO, btree_write_all_failed) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \ @@ -235,6 +237,7 @@ x(BCH_ERR_nopromote, nopromote_unwritten) \ x(BCH_ERR_nopromote, nopromote_congested) \ x(BCH_ERR_nopromote, nopromote_in_flight) \ + x(BCH_ERR_nopromote, nopromote_no_writes) \ x(BCH_ERR_nopromote, nopromote_enomem) enum bch_errcode { diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 25cf78a7b946..d32c8bebe46c 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -2,12 +2,13 @@ #include "bcachefs.h" #include "error.h" #include "super.h" +#include "thread_with_file.h" #define FSCK_ERR_RATELIMIT_NR 10 bool bch2_inconsistent_error(struct bch_fs *c) { - set_bit(BCH_FS_ERROR, &c->flags); + set_bit(BCH_FS_error, &c->flags); switch (c->opts.errors) { case BCH_ON_ERROR_continue: @@ -26,8 +27,8 @@ bool bch2_inconsistent_error(struct bch_fs *c) void bch2_topology_error(struct bch_fs *c) { - set_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags); - if (test_bit(BCH_FS_FSCK_DONE, &c->flags)) + set_bit(BCH_FS_topology_error, &c->flags); + if (!test_bit(BCH_FS_fsck_running, &c->flags)) bch2_inconsistent_error(c); } @@ -69,40 +70,66 @@ enum ask_yn { YN_ALLYES, }; +static enum ask_yn parse_yn_response(char *buf) +{ + buf = strim(buf); + + if (strlen(buf) == 1) + switch (buf[0]) { + case 'n': + return YN_NO; + case 'y': + return YN_YES; + case 'N': + return YN_ALLNO; + case 'Y': + return YN_ALLYES; + } + return -1; +} + #ifdef __KERNEL__ -#define bch2_fsck_ask_yn() YN_NO +static enum ask_yn bch2_fsck_ask_yn(struct bch_fs *c) +{ + struct stdio_redirect *stdio = c->stdio; + + if (c->stdio_filter && c->stdio_filter != current) + stdio = NULL; + + if (!stdio) + return YN_NO; + + char buf[100]; + int ret; + + do { + bch2_print(c, " (y,n, or Y,N for all errors of this type) "); + + int r = bch2_stdio_redirect_readline(stdio, buf, sizeof(buf) - 1); + if (r < 0) + return YN_NO; + buf[r] = '\0'; + } while ((ret = parse_yn_response(buf)) < 0); + + return ret; +} #else #include "tools-util.h" -enum ask_yn bch2_fsck_ask_yn(void) +static enum ask_yn bch2_fsck_ask_yn(struct bch_fs *c) { char *buf = NULL; size_t buflen = 0; - bool ret; + int ret; - while (true) { + do { fputs(" (y,n, or Y,N for all errors of this type) ", stdout); fflush(stdout); if (getline(&buf, &buflen, stdin) < 0) die("error reading from standard input"); - - strim(buf); - if (strlen(buf) != 1) - continue; - - switch (buf[0]) { - case 'n': - return YN_NO; - case 'y': - return YN_YES; - case 'N': - return YN_ALLNO; - case 'Y': - return YN_ALLYES; - } - } + } while ((ret = parse_yn_response(buf)) < 0); free(buf); return ret; @@ -114,7 +141,7 @@ static struct fsck_err_state *fsck_err_get(struct bch_fs *c, const char *fmt) { struct fsck_err_state *s; - if (test_bit(BCH_FS_FSCK_DONE, &c->flags)) + if (!test_bit(BCH_FS_fsck_running, &c->flags)) return NULL; list_for_each_entry(s, &c->fsck_error_msgs, list) @@ -152,7 +179,8 @@ int bch2_fsck_err(struct bch_fs *c, struct printbuf buf = PRINTBUF, *out = &buf; int ret = -BCH_ERR_fsck_ignore; - if (test_bit(err, c->sb.errors_silent)) + if ((flags & FSCK_CAN_FIX) && + test_bit(err, c->sb.errors_silent)) return -BCH_ERR_fsck_fix; bch2_sb_error_count(c, err); @@ -196,7 +224,7 @@ int bch2_fsck_err(struct bch_fs *c, prt_printf(out, bch2_log_msg(c, "")); #endif - if (test_bit(BCH_FS_FSCK_DONE, &c->flags)) { + if (!test_bit(BCH_FS_fsck_running, &c->flags)) { if (c->opts.errors != BCH_ON_ERROR_continue || !(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) { prt_str(out, ", shutting down"); @@ -221,10 +249,13 @@ int bch2_fsck_err(struct bch_fs *c, int ask; prt_str(out, ": fix?"); - bch2_print_string_as_lines(KERN_ERR, out->buf); + if (bch2_fs_stdio_redirect(c)) + bch2_print(c, "%s", out->buf); + else + bch2_print_string_as_lines(KERN_ERR, out->buf); print = false; - ask = bch2_fsck_ask_yn(); + ask = bch2_fsck_ask_yn(c); if (ask >= YN_ALLNO && s) s->fix = ask == YN_ALLNO @@ -253,10 +284,14 @@ int bch2_fsck_err(struct bch_fs *c, !(flags & FSCK_CAN_IGNORE))) ret = -BCH_ERR_fsck_errors_not_fixed; - if (print) - bch2_print_string_as_lines(KERN_ERR, out->buf); + if (print) { + if (bch2_fs_stdio_redirect(c)) + bch2_print(c, "%s\n", out->buf); + else + bch2_print_string_as_lines(KERN_ERR, out->buf); + } - if (!test_bit(BCH_FS_FSCK_DONE, &c->flags) && + if (test_bit(BCH_FS_fsck_running, &c->flags) && (ret != -BCH_ERR_fsck_fix && ret != -BCH_ERR_fsck_ignore)) bch_err(c, "Unable to continue, halting"); @@ -274,10 +309,10 @@ int bch2_fsck_err(struct bch_fs *c, bch2_inconsistent_error(c); if (ret == -BCH_ERR_fsck_fix) { - set_bit(BCH_FS_ERRORS_FIXED, &c->flags); + set_bit(BCH_FS_errors_fixed, &c->flags); } else { - set_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags); - set_bit(BCH_FS_ERROR, &c->flags); + set_bit(BCH_FS_errors_not_fixed, &c->flags); + set_bit(BCH_FS_error, &c->flags); } return ret; diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c index 21af6fb8cecf..b9033bb4f11c 100644 --- a/fs/bcachefs/extent_update.c +++ b/fs/bcachefs/extent_update.c @@ -100,7 +100,7 @@ static int count_iters_for_insert(struct btree_trans *trans, return ret2 ?: ret; } -#define EXTENT_ITERS_MAX (BTREE_ITER_MAX / 3) +#define EXTENT_ITERS_MAX (BTREE_ITER_INITIAL / 3) int bch2_extent_atomic_end(struct btree_trans *trans, struct btree_iter *iter, diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 9d8afcb5979a..61395b113df9 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -8,6 +8,7 @@ #include "bcachefs.h" #include "bkey_methods.h" +#include "btree_cache.h" #include "btree_gc.h" #include "btree_io.h" #include "btree_iter.h" @@ -843,7 +844,6 @@ void bch2_bkey_drop_device_noerror(struct bkey_s k, unsigned dev) const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c k, unsigned dev) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr; bkey_for_each_ptr(ptrs, ptr) if (ptr->dev == dev) @@ -855,7 +855,6 @@ const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c k, unsigned bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr; bkey_for_each_ptr(ptrs, ptr) if (bch2_dev_in_target(c, ptr->dev, target) && @@ -1020,12 +1019,12 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, struct bch_extent_crc_unpacked crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); - prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress %s", + prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress ", crc.compressed_size, crc.uncompressed_size, crc.offset, crc.nonce, - bch2_csum_types[crc.csum_type], - bch2_compression_types[crc.compression_type]); + bch2_csum_types[crc.csum_type]); + bch2_prt_compression_type(out, crc.compression_type); break; } case BCH_EXTENT_ENTRY_stripe_ptr: { @@ -1065,7 +1064,6 @@ static int extent_ptr_invalid(struct bch_fs *c, struct printbuf *err) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr2; u64 bucket; u32 bucket_offset; struct bch_dev *ca; @@ -1307,7 +1305,6 @@ unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, struct bkey_s_c k, } incompressible: if (target && bch2_target_accepts_data(c, BCH_DATA_user, target)) { - const struct bch_extent_ptr *ptr; unsigned i = 0; bkey_for_each_ptr(ptrs, ptr) { @@ -1338,10 +1335,12 @@ bool bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k) } int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bkey_i *_k, - unsigned target, unsigned compression) + struct bch_io_opts *opts) { struct bkey_s k = bkey_i_to_s(_k); struct bch_extent_rebalance *r; + unsigned target = opts->background_target; + unsigned compression = background_compression(*opts); bool needs_rebalance; if (!bkey_extent_is_direct_data(k.k)) diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index a2ce8a3be13c..6bf839d69e84 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -300,7 +300,7 @@ static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k) bkey_extent_entry_for_each_from(_p, _entry, _p.start) #define __bkey_for_each_ptr(_start, _end, _ptr) \ - for ((_ptr) = (_start); \ + for (typeof(_start) (_ptr) = (_start); \ ((_ptr) = __bkey_ptr_next(_ptr, _end)); \ (_ptr)++) @@ -415,8 +415,7 @@ void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, .key_invalid = bch2_btree_ptr_invalid, \ .val_to_text = bch2_btree_ptr_to_text, \ .swab = bch2_ptr_swab, \ - .trans_trigger = bch2_trans_mark_extent, \ - .atomic_trigger = bch2_mark_extent, \ + .trigger = bch2_trigger_extent, \ }) #define bch2_bkey_ops_btree_ptr_v2 ((struct bkey_ops) { \ @@ -424,8 +423,7 @@ void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, .val_to_text = bch2_btree_ptr_v2_to_text, \ .swab = bch2_ptr_swab, \ .compat = bch2_btree_ptr_v2_compat, \ - .trans_trigger = bch2_trans_mark_extent, \ - .atomic_trigger = bch2_mark_extent, \ + .trigger = bch2_trigger_extent, \ .min_val_size = 40, \ }) @@ -439,8 +437,7 @@ bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); .swab = bch2_ptr_swab, \ .key_normalize = bch2_extent_normalize, \ .key_merge = bch2_extent_merge, \ - .trans_trigger = bch2_trans_mark_extent, \ - .atomic_trigger = bch2_mark_extent, \ + .trigger = bch2_trigger_extent, \ }) /* KEY_TYPE_reservation: */ @@ -454,8 +451,7 @@ bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); .key_invalid = bch2_reservation_invalid, \ .val_to_text = bch2_reservation_to_text, \ .key_merge = bch2_reservation_merge, \ - .trans_trigger = bch2_trans_mark_reservation, \ - .atomic_trigger = bch2_mark_reservation, \ + .trigger = bch2_trigger_reservation, \ .min_val_size = 8, \ }) @@ -547,7 +543,6 @@ static inline bool bkey_extent_is_allocation(const struct bkey *k) static inline bool bkey_extent_is_unwritten(struct bkey_s_c k) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr; bkey_for_each_ptr(ptrs, ptr) if (ptr->unwritten) @@ -565,10 +560,9 @@ static inline struct bch_devs_list bch2_bkey_devs(struct bkey_s_c k) { struct bch_devs_list ret = (struct bch_devs_list) { 0 }; struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr; bkey_for_each_ptr(p, ptr) - ret.devs[ret.nr++] = ptr->dev; + ret.data[ret.nr++] = ptr->dev; return ret; } @@ -577,11 +571,10 @@ static inline struct bch_devs_list bch2_bkey_dirty_devs(struct bkey_s_c k) { struct bch_devs_list ret = (struct bch_devs_list) { 0 }; struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr; bkey_for_each_ptr(p, ptr) if (!ptr->cached) - ret.devs[ret.nr++] = ptr->dev; + ret.data[ret.nr++] = ptr->dev; return ret; } @@ -590,11 +583,10 @@ static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k) { struct bch_devs_list ret = (struct bch_devs_list) { 0 }; struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr; bkey_for_each_ptr(p, ptr) if (ptr->cached) - ret.devs[ret.nr++] = ptr->dev; + ret.data[ret.nr++] = ptr->dev; return ret; } @@ -716,7 +708,7 @@ unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *, struct bkey_s_c, bool bch2_bkey_needs_rebalance(struct bch_fs *, struct bkey_s_c); int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bkey_i *, - unsigned, unsigned); + struct bch_io_opts *); /* Generic extent code: */ diff --git a/fs/bcachefs/extents_format.h b/fs/bcachefs/extents_format.h new file mode 100644 index 000000000000..3bd2fdbb0817 --- /dev/null +++ b/fs/bcachefs/extents_format.h @@ -0,0 +1,295 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_EXTENTS_FORMAT_H +#define _BCACHEFS_EXTENTS_FORMAT_H + +/* + * In extent bkeys, the value is a list of pointers (bch_extent_ptr), optionally + * preceded by checksum/compression information (bch_extent_crc32 or + * bch_extent_crc64). + * + * One major determining factor in the format of extents is how we handle and + * represent extents that have been partially overwritten and thus trimmed: + * + * If an extent is not checksummed or compressed, when the extent is trimmed we + * don't have to remember the extent we originally allocated and wrote: we can + * merely adjust ptr->offset to point to the start of the data that is currently + * live. The size field in struct bkey records the current (live) size of the + * extent, and is also used to mean "size of region on disk that we point to" in + * this case. + * + * Thus an extent that is not checksummed or compressed will consist only of a + * list of bch_extent_ptrs, with none of the fields in + * bch_extent_crc32/bch_extent_crc64. + * + * When an extent is checksummed or compressed, it's not possible to read only + * the data that is currently live: we have to read the entire extent that was + * originally written, and then return only the part of the extent that is + * currently live. + * + * Thus, in addition to the current size of the extent in struct bkey, we need + * to store the size of the originally allocated space - this is the + * compressed_size and uncompressed_size fields in bch_extent_crc32/64. Also, + * when the extent is trimmed, instead of modifying the offset field of the + * pointer, we keep a second smaller offset field - "offset into the original + * extent of the currently live region". + * + * The other major determining factor is replication and data migration: + * + * Each pointer may have its own bch_extent_crc32/64. When doing a replicated + * write, we will initially write all the replicas in the same format, with the + * same checksum type and compression format - however, when copygc runs later (or + * tiering/cache promotion, anything that moves data), it is not in general + * going to rewrite all the pointers at once - one of the replicas may be in a + * bucket on one device that has very little fragmentation while another lives + * in a bucket that has become heavily fragmented, and thus is being rewritten + * sooner than the rest. + * + * Thus it will only move a subset of the pointers (or in the case of + * tiering/cache promotion perhaps add a single pointer without dropping any + * current pointers), and if the extent has been partially overwritten it must + * write only the currently live portion (or copygc would not be able to reduce + * fragmentation!) - which necessitates a different bch_extent_crc format for + * the new pointer. + * + * But in the interests of space efficiency, we don't want to store one + * bch_extent_crc for each pointer if we don't have to. + * + * Thus, a bch_extent consists of bch_extent_crc32s, bch_extent_crc64s, and + * bch_extent_ptrs appended arbitrarily one after the other. We determine the + * type of a given entry with a scheme similar to utf8 (except we're encoding a + * type, not a size), encoding the type in the position of the first set bit: + * + * bch_extent_crc32 - 0b1 + * bch_extent_ptr - 0b10 + * bch_extent_crc64 - 0b100 + * + * We do it this way because bch_extent_crc32 is _very_ constrained on bits (and + * bch_extent_crc64 is the least constrained). + * + * Then, each bch_extent_crc32/64 applies to the pointers that follow after it, + * until the next bch_extent_crc32/64. + * + * If there are no bch_extent_crcs preceding a bch_extent_ptr, then that pointer + * is neither checksummed nor compressed. + */ + +#define BCH_EXTENT_ENTRY_TYPES() \ + x(ptr, 0) \ + x(crc32, 1) \ + x(crc64, 2) \ + x(crc128, 3) \ + x(stripe_ptr, 4) \ + x(rebalance, 5) +#define BCH_EXTENT_ENTRY_MAX 6 + +enum bch_extent_entry_type { +#define x(f, n) BCH_EXTENT_ENTRY_##f = n, + BCH_EXTENT_ENTRY_TYPES() +#undef x +}; + +/* Compressed/uncompressed size are stored biased by 1: */ +struct bch_extent_crc32 { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u32 type:2, + _compressed_size:7, + _uncompressed_size:7, + offset:7, + _unused:1, + csum_type:4, + compression_type:4; + __u32 csum; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u32 csum; + __u32 compression_type:4, + csum_type:4, + _unused:1, + offset:7, + _uncompressed_size:7, + _compressed_size:7, + type:2; +#endif +} __packed __aligned(8); + +#define CRC32_SIZE_MAX (1U << 7) +#define CRC32_NONCE_MAX 0 + +struct bch_extent_crc64 { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u64 type:3, + _compressed_size:9, + _uncompressed_size:9, + offset:9, + nonce:10, + csum_type:4, + compression_type:4, + csum_hi:16; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u64 csum_hi:16, + compression_type:4, + csum_type:4, + nonce:10, + offset:9, + _uncompressed_size:9, + _compressed_size:9, + type:3; +#endif + __u64 csum_lo; +} __packed __aligned(8); + +#define CRC64_SIZE_MAX (1U << 9) +#define CRC64_NONCE_MAX ((1U << 10) - 1) + +struct bch_extent_crc128 { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u64 type:4, + _compressed_size:13, + _uncompressed_size:13, + offset:13, + nonce:13, + csum_type:4, + compression_type:4; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u64 compression_type:4, + csum_type:4, + nonce:13, + offset:13, + _uncompressed_size:13, + _compressed_size:13, + type:4; +#endif + struct bch_csum csum; +} __packed __aligned(8); + +#define CRC128_SIZE_MAX (1U << 13) +#define CRC128_NONCE_MAX ((1U << 13) - 1) + +/* + * @reservation - pointer hasn't been written to, just reserved + */ +struct bch_extent_ptr { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u64 type:1, + cached:1, + unused:1, + unwritten:1, + offset:44, /* 8 petabytes */ + dev:8, + gen:8; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u64 gen:8, + dev:8, + offset:44, + unwritten:1, + unused:1, + cached:1, + type:1; +#endif +} __packed __aligned(8); + +struct bch_extent_stripe_ptr { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u64 type:5, + block:8, + redundancy:4, + idx:47; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u64 idx:47, + redundancy:4, + block:8, + type:5; +#endif +}; + +struct bch_extent_rebalance { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u64 type:6, + unused:34, + compression:8, /* enum bch_compression_opt */ + target:16; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u64 target:16, + compression:8, + unused:34, + type:6; +#endif +}; + +union bch_extent_entry { +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || __BITS_PER_LONG == 64 + unsigned long type; +#elif __BITS_PER_LONG == 32 + struct { + unsigned long pad; + unsigned long type; + }; +#else +#error edit for your odd byteorder. +#endif + +#define x(f, n) struct bch_extent_##f f; + BCH_EXTENT_ENTRY_TYPES() +#undef x +}; + +struct bch_btree_ptr { + struct bch_val v; + + __u64 _data[0]; + struct bch_extent_ptr start[]; +} __packed __aligned(8); + +struct bch_btree_ptr_v2 { + struct bch_val v; + + __u64 mem_ptr; + __le64 seq; + __le16 sectors_written; + __le16 flags; + struct bpos min_key; + __u64 _data[0]; + struct bch_extent_ptr start[]; +} __packed __aligned(8); + +LE16_BITMASK(BTREE_PTR_RANGE_UPDATED, struct bch_btree_ptr_v2, flags, 0, 1); + +struct bch_extent { + struct bch_val v; + + __u64 _data[0]; + union bch_extent_entry start[]; +} __packed __aligned(8); + +/* Maximum size (in u64s) a single pointer could be: */ +#define BKEY_EXTENT_PTR_U64s_MAX\ + ((sizeof(struct bch_extent_crc128) + \ + sizeof(struct bch_extent_ptr)) / sizeof(__u64)) + +/* Maximum possible size of an entire extent value: */ +#define BKEY_EXTENT_VAL_U64s_MAX \ + (1 + BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1)) + +/* * Maximum possible size of an entire extent, key + value: */ +#define BKEY_EXTENT_U64s_MAX (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX) + +/* Btree pointers don't carry around checksums: */ +#define BKEY_BTREE_PTR_VAL_U64s_MAX \ + ((sizeof(struct bch_btree_ptr_v2) + \ + sizeof(struct bch_extent_ptr) * BCH_REPLICAS_MAX) / sizeof(__u64)) +#define BKEY_BTREE_PTR_U64s_MAX \ + (BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX) + +struct bch_reservation { + struct bch_val v; + + __le32 generation; + __u8 nr_replicas; + __u8 pad[3]; +} __packed __aligned(8); + +struct bch_inline_data { + struct bch_val v; + u8 data[]; +}; + +#endif /* _BCACHEFS_EXTENTS_FORMAT_H */ diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h index 05429c9631cd..b04750dbf870 100644 --- a/fs/bcachefs/eytzinger.h +++ b/fs/bcachefs/eytzinger.h @@ -156,7 +156,7 @@ static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size) } #define eytzinger1_for_each(_i, _size) \ - for ((_i) = eytzinger1_first((_size)); \ + for (unsigned (_i) = eytzinger1_first((_size)); \ (_i) != 0; \ (_i) = eytzinger1_next((_i), (_size))) @@ -227,7 +227,7 @@ static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size) } #define eytzinger0_for_each(_i, _size) \ - for ((_i) = eytzinger0_first((_size)); \ + for (unsigned (_i) = eytzinger0_first((_size)); \ (_i) != -1; \ (_i) = eytzinger0_next((_i), (_size))) @@ -261,11 +261,11 @@ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size, #define eytzinger0_find(base, nr, size, _cmp, search) \ ({ \ - void *_base = (base); \ - void *_search = (search); \ - size_t _nr = (nr); \ - size_t _size = (size); \ - size_t _i = 0; \ + void *_base = (base); \ + const void *_search = (search); \ + size_t _nr = (nr); \ + size_t _size = (size); \ + size_t _i = 0; \ int _res; \ \ while (_i < _nr && \ diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c index 4496cf91a4c1..1c1ea0f0c692 100644 --- a/fs/bcachefs/fs-common.c +++ b/fs/bcachefs/fs-common.c @@ -166,10 +166,8 @@ int bch2_create_trans(struct btree_trans *trans, if (ret) goto err; - if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) { - new_inode->bi_dir = dir_u->bi_inum; - new_inode->bi_dir_offset = dir_offset; - } + new_inode->bi_dir = dir_u->bi_inum; + new_inode->bi_dir_offset = dir_offset; } inode_iter.flags &= ~BTREE_ITER_ALL_SNAPSHOTS; @@ -228,10 +226,8 @@ int bch2_link_trans(struct btree_trans *trans, if (ret) goto err; - if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) { - inode_u->bi_dir = dir.inum; - inode_u->bi_dir_offset = dir_offset; - } + inode_u->bi_dir = dir.inum; + inode_u->bi_dir_offset = dir_offset; ret = bch2_inode_write(trans, &dir_iter, dir_u) ?: bch2_inode_write(trans, &inode_iter, inode_u); @@ -414,21 +410,19 @@ int bch2_rename_trans(struct btree_trans *trans, goto err; } - if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) { - src_inode_u->bi_dir = dst_dir_u->bi_inum; - src_inode_u->bi_dir_offset = dst_offset; + src_inode_u->bi_dir = dst_dir_u->bi_inum; + src_inode_u->bi_dir_offset = dst_offset; - if (mode == BCH_RENAME_EXCHANGE) { - dst_inode_u->bi_dir = src_dir_u->bi_inum; - dst_inode_u->bi_dir_offset = src_offset; - } + if (mode == BCH_RENAME_EXCHANGE) { + dst_inode_u->bi_dir = src_dir_u->bi_inum; + dst_inode_u->bi_dir_offset = src_offset; + } - if (mode == BCH_RENAME_OVERWRITE && - dst_inode_u->bi_dir == dst_dir_u->bi_inum && - dst_inode_u->bi_dir_offset == src_offset) { - dst_inode_u->bi_dir = 0; - dst_inode_u->bi_dir_offset = 0; - } + if (mode == BCH_RENAME_OVERWRITE && + dst_inode_u->bi_dir == dst_dir_u->bi_inum && + dst_inode_u->bi_dir_offset == src_offset) { + dst_inode_u->bi_dir = 0; + dst_inode_u->bi_dir_offset = 0; } if (mode == BCH_RENAME_OVERWRITE) { diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c index 52f0e7acda3d..73c12e565af5 100644 --- a/fs/bcachefs/fs-io-buffered.c +++ b/fs/bcachefs/fs-io-buffered.c @@ -52,26 +52,20 @@ struct readpages_iter { static int readpages_iter_init(struct readpages_iter *iter, struct readahead_control *ractl) { - struct folio **fi; - int ret; - - memset(iter, 0, sizeof(*iter)); + struct folio *folio; - iter->mapping = ractl->mapping; + *iter = (struct readpages_iter) { ractl->mapping }; - ret = bch2_filemap_get_contig_folios_d(iter->mapping, - ractl->_index << PAGE_SHIFT, - (ractl->_index + ractl->_nr_pages) << PAGE_SHIFT, - 0, mapping_gfp_mask(iter->mapping), - &iter->folios); - if (ret) - return ret; + while ((folio = __readahead_folio(ractl))) { + if (!bch2_folio_create(folio, GFP_KERNEL) || + darray_push(&iter->folios, folio)) { + bch2_folio_release(folio); + ractl->_nr_pages += folio_nr_pages(folio); + ractl->_index -= folio_nr_pages(folio); + return iter->folios.nr ? 0 : -ENOMEM; + } - darray_for_each(iter->folios, fi) { - ractl->_nr_pages -= 1U << folio_order(*fi); - __bch2_folio_create(*fi, __GFP_NOFAIL|GFP_KERNEL); - folio_put(*fi); - folio_put(*fi); + folio_put(folio); } return 0; @@ -273,12 +267,12 @@ void bch2_readahead(struct readahead_control *ractl) struct btree_trans *trans = bch2_trans_get(c); struct folio *folio; struct readpages_iter readpages_iter; - int ret; bch2_inode_opts_get(&opts, c, &inode->ei_inode); - ret = readpages_iter_init(&readpages_iter, ractl); - BUG_ON(ret); + int ret = readpages_iter_init(&readpages_iter, ractl); + if (ret) + return; bch2_pagecache_add_get(inode); @@ -638,7 +632,7 @@ do_io: /* Check for writing past i_size: */ WARN_ONCE((bio_end_sector(&w->io->op.wbio.bio) << 9) > round_up(i_size, block_bytes(c)) && - !test_bit(BCH_FS_EMERGENCY_RO, &c->flags), + !test_bit(BCH_FS_emergency_ro, &c->flags), "writing past i_size: %llu > %llu (unrounded %llu)\n", bio_end_sector(&w->io->op.wbio.bio) << 9, round_up(i_size, block_bytes(c)), @@ -826,7 +820,7 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch2_folio_reservation res; folios fs; - struct folio **fi, *f; + struct folio *f; unsigned copied = 0, f_offset, f_copied; u64 end = pos + len, f_pos, f_len; loff_t last_folio_pos = inode->v.i_size; diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c index 84e20c3ada6c..e3b219e19e10 100644 --- a/fs/bcachefs/fs-io-direct.c +++ b/fs/bcachefs/fs-io-direct.c @@ -77,7 +77,8 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) bch2_inode_opts_get(&opts, c, &inode->ei_inode); - if ((offset|iter->count) & (block_bytes(c) - 1)) + /* bios must be 512 byte aligned: */ + if ((offset|iter->count) & (SECTOR_SIZE - 1)) return -EINVAL; ret = min_t(loff_t, iter->count, diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c index ff664fd0d8ef..d359aa9b33b8 100644 --- a/fs/bcachefs/fs-io-pagecache.c +++ b/fs/bcachefs/fs-io-pagecache.c @@ -309,39 +309,49 @@ void bch2_mark_pagecache_unallocated(struct bch_inode_info *inode, } } -void bch2_mark_pagecache_reserved(struct bch_inode_info *inode, - u64 start, u64 end) +int bch2_mark_pagecache_reserved(struct bch_inode_info *inode, + u64 *start, u64 end, + bool nonblocking) { struct bch_fs *c = inode->v.i_sb->s_fs_info; - pgoff_t index = start >> PAGE_SECTORS_SHIFT; + pgoff_t index = *start >> PAGE_SECTORS_SHIFT; pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT; struct folio_batch fbatch; s64 i_sectors_delta = 0; - unsigned i, j; + int ret = 0; - if (end <= start) - return; + if (end <= *start) + return 0; folio_batch_init(&fbatch); while (filemap_get_folios(inode->v.i_mapping, &index, end_index, &fbatch)) { - for (i = 0; i < folio_batch_count(&fbatch); i++) { + for (unsigned i = 0; i < folio_batch_count(&fbatch); i++) { struct folio *folio = fbatch.folios[i]; + + if (!nonblocking) + folio_lock(folio); + else if (!folio_trylock(folio)) { + folio_batch_release(&fbatch); + ret = -EAGAIN; + break; + } + u64 folio_start = folio_sector(folio); u64 folio_end = folio_end_sector(folio); - unsigned folio_offset = max(start, folio_start) - folio_start; - unsigned folio_len = min(end, folio_end) - folio_offset - folio_start; - struct bch_folio *s; BUG_ON(end <= folio_start); - folio_lock(folio); - s = bch2_folio(folio); + *start = min(end, folio_end); + struct bch_folio *s = bch2_folio(folio); if (s) { + unsigned folio_offset = max(*start, folio_start) - folio_start; + unsigned folio_len = min(end, folio_end) - folio_offset - folio_start; + spin_lock(&s->lock); - for (j = folio_offset; j < folio_offset + folio_len; j++) { + for (unsigned j = folio_offset; j < folio_offset + folio_len; j++) { i_sectors_delta -= s->s[j].state == SECTOR_dirty; bch2_folio_sector_set(folio, s, j, folio_sector_reserve(s->s[j].state)); @@ -356,6 +366,7 @@ void bch2_mark_pagecache_reserved(struct bch_inode_info *inode, } bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); + return ret; } static inline unsigned sectors_to_reserve(struct bch_folio_sector *s, diff --git a/fs/bcachefs/fs-io-pagecache.h b/fs/bcachefs/fs-io-pagecache.h index 27f712ae37a6..8cbaba6565b4 100644 --- a/fs/bcachefs/fs-io-pagecache.h +++ b/fs/bcachefs/fs-io-pagecache.h @@ -143,7 +143,7 @@ int bch2_folio_set(struct bch_fs *, subvol_inum, struct folio **, unsigned); void bch2_bio_page_state_set(struct bio *, struct bkey_s_c); void bch2_mark_pagecache_unallocated(struct bch_inode_info *, u64, u64); -void bch2_mark_pagecache_reserved(struct bch_inode_info *, u64, u64); +int bch2_mark_pagecache_reserved(struct bch_inode_info *, u64 *, u64, bool); int bch2_get_folio_disk_reservation(struct bch_fs *, struct bch_inode_info *, diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index b0e8144ec550..8c70123b6a0c 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -79,7 +79,7 @@ void bch2_inode_flush_nocow_writes_async(struct bch_fs *c, continue; bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev, 0, - REQ_OP_FLUSH, + REQ_OP_WRITE|REQ_PREFLUSH, GFP_KERNEL, &c->nocow_flush_bioset), struct nocow_flush, bio); @@ -192,13 +192,17 @@ int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct bch_inode_info *inode = file_bch_inode(file); struct bch_fs *c = inode->v.i_sb->s_fs_info; - int ret, ret2, ret3; + int ret; ret = file_write_and_wait_range(file, start, end); - ret2 = sync_inode_metadata(&inode->v, 1); - ret3 = bch2_flush_inode(c, inode); - - return bch2_err_class(ret ?: ret2 ?: ret3); + if (ret) + goto out; + ret = sync_inode_metadata(&inode->v, 1); + if (ret) + goto out; + ret = bch2_flush_inode(c, inode); +out: + return bch2_err_class(ret); } /* truncate: */ @@ -671,8 +675,11 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, bch2_i_sectors_acct(c, inode, "a_res, i_sectors_delta); - drop_locks_do(trans, - (bch2_mark_pagecache_reserved(inode, hole_start, iter.pos.offset), 0)); + if (bch2_mark_pagecache_reserved(inode, &hole_start, + iter.pos.offset, true)) + drop_locks_do(trans, + bch2_mark_pagecache_reserved(inode, &hole_start, + iter.pos.offset, false)); bkey_err: bch2_quota_reservation_put(c, inode, "a_res); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) @@ -861,7 +868,8 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, abs(pos_src - pos_dst) < len) return -EINVAL; - bch2_lock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst); + lock_two_nondirectories(&src->v, &dst->v); + bch2_lock_inodes(INODE_PAGECACHE_BLOCK, src, dst); inode_dio_wait(&src->v); inode_dio_wait(&dst->v); @@ -914,7 +922,8 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, ret = bch2_flush_inode(c, dst); err: bch2_quota_reservation_put(c, dst, "a_res); - bch2_unlock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst); + bch2_unlock_inodes(INODE_PAGECACHE_BLOCK, src, dst); + unlock_two_nondirectories(&src->v, &dst->v); return bch2_err_class(ret); } diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index 94e5a567fa44..3a4c24c28e7f 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -285,34 +285,26 @@ static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg) bch_notice(c, "shutdown by ioctl type %u", flags); - down_write(&c->vfs_sb->s_umount); - switch (flags) { case FSOP_GOING_FLAGS_DEFAULT: ret = bdev_freeze(c->vfs_sb->s_bdev); if (ret) - goto err; - + break; bch2_journal_flush(&c->journal); - c->vfs_sb->s_flags |= SB_RDONLY; bch2_fs_emergency_read_only(c); bdev_thaw(c->vfs_sb->s_bdev); break; - case FSOP_GOING_FLAGS_LOGFLUSH: bch2_journal_flush(&c->journal); fallthrough; - case FSOP_GOING_FLAGS_NOLOGFLUSH: - c->vfs_sb->s_flags |= SB_RDONLY; bch2_fs_emergency_read_only(c); break; default: ret = -EINVAL; break; } -err: - up_write(&c->vfs_sb->s_umount); + return ret; } @@ -345,11 +337,12 @@ static long __bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp, if (arg.flags & BCH_SUBVOL_SNAPSHOT_RO) create_flags |= BCH_CREATE_SNAPSHOT_RO; - /* why do we need this lock? */ - down_read(&c->vfs_sb->s_umount); - - if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) + if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) { + /* sync_inodes_sb enforce s_umount is locked */ + down_read(&c->vfs_sb->s_umount); sync_inodes_sb(c->vfs_sb); + up_read(&c->vfs_sb->s_umount); + } retry: if (arg.src_ptr) { error = user_path_at(arg.dirfd, @@ -433,8 +426,6 @@ err2: goto retry; } err1: - up_read(&c->vfs_sb->s_umount); - return error; } @@ -451,33 +442,36 @@ static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp, static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp, struct bch_ioctl_subvolume arg) { + const char __user *name = (void __user *)(unsigned long)arg.dst_ptr; struct path path; struct inode *dir; + struct dentry *victim; int ret = 0; if (arg.flags) return -EINVAL; - ret = user_path_at(arg.dirfd, - (const char __user *)(unsigned long)arg.dst_ptr, - LOOKUP_FOLLOW, &path); - if (ret) - return ret; + victim = user_path_locked_at(arg.dirfd, name, &path); + if (IS_ERR(victim)) + return PTR_ERR(victim); - if (path.dentry->d_sb->s_fs_info != c) { + if (victim->d_sb->s_fs_info != c) { ret = -EXDEV; goto err; } - - dir = path.dentry->d_parent->d_inode; - - ret = __bch2_unlink(dir, path.dentry, true); - if (ret) + if (!d_is_positive(victim)) { + ret = -ENOENT; goto err; - - fsnotify_rmdir(dir, path.dentry); - d_delete(path.dentry); + } + dir = d_inode(path.dentry); + ret = __bch2_unlink(dir, victim, true); + if (!ret) { + fsnotify_rmdir(dir, victim); + d_delete(victim); + } + inode_unlock(dir); err: + dput(victim); path_put(&path); return ret; } diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index c1895df1bffe..ec419b8e2c43 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -93,7 +93,7 @@ retry: BTREE_ITER_INTENT) ?: (set ? set(trans, inode, &inode_u, p) : 0) ?: bch2_inode_write(trans, &iter, &inode_u) ?: - bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL); + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); /* * the btree node lock protects inode->ei_inode, not ei_update_lock; @@ -455,7 +455,7 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry, bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode); ret = commit_do(trans, NULL, NULL, - BTREE_INSERT_NOFAIL, + BCH_TRANS_COMMIT_no_enospc, bch2_unlink_trans(trans, inode_inum(dir), &dir_u, &inode_u, &dentry->d_name, @@ -729,7 +729,7 @@ retry: ret = bch2_inode_write(trans, &inode_iter, &inode_u) ?: bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL); + BCH_TRANS_COMMIT_no_enospc); btree_err: bch2_trans_iter_exit(trans, &inode_iter); @@ -1012,15 +1012,13 @@ static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx) { struct bch_inode_info *inode = file_bch_inode(file); struct bch_fs *c = inode->v.i_sb->s_fs_info; - int ret; if (!dir_emit_dots(file, ctx)) return 0; - ret = bch2_readdir(c, inode_inum(inode), ctx); - if (ret) - bch_err_fn(c, ret); + int ret = bch2_readdir(c, inode_inum(inode), ctx); + bch_err_fn(c, ret); return bch2_err_class(ret); } @@ -1500,7 +1498,7 @@ static void bch2_evict_inode(struct inode *vinode) void bch2_evict_subvolume_inodes(struct bch_fs *c, snapshot_id_list *s) { - struct bch_inode_info *inode, **i; + struct bch_inode_info *inode; DARRAY(struct bch_inode_info *) grabbed; bool clean_pass = false, this_pass_clean; @@ -1626,43 +1624,18 @@ static struct bch_fs *bch2_path_to_fs(const char *path) return c ?: ERR_PTR(-ENOENT); } -static char **split_devs(const char *_dev_name, unsigned *nr) -{ - char *dev_name = NULL, **devs = NULL, *s; - size_t i = 0, nr_devs = 0; - - dev_name = kstrdup(_dev_name, GFP_KERNEL); - if (!dev_name) - return NULL; - - for (s = dev_name; s; s = strchr(s + 1, ':')) - nr_devs++; - - devs = kcalloc(nr_devs + 1, sizeof(const char *), GFP_KERNEL); - if (!devs) { - kfree(dev_name); - return NULL; - } - - while ((s = strsep(&dev_name, ":"))) - devs[i++] = s; - - *nr = nr_devs; - return devs; -} - static int bch2_remount(struct super_block *sb, int *flags, char *data) { struct bch_fs *c = sb->s_fs_info; struct bch_opts opts = bch2_opts_empty(); int ret; - opt_set(opts, read_only, (*flags & SB_RDONLY) != 0); - ret = bch2_parse_mount_opts(c, &opts, data); if (ret) goto err; + opt_set(opts, read_only, (*flags & SB_RDONLY) != 0); + if (opts.read_only != c->opts.read_only) { down_write(&c->state_lock); @@ -1696,11 +1669,9 @@ err: static int bch2_show_devname(struct seq_file *seq, struct dentry *root) { struct bch_fs *c = root->d_sb->s_fs_info; - struct bch_dev *ca; - unsigned i; bool first = true; - for_each_online_member(ca, c, i) { + for_each_online_member(c, ca) { if (!first) seq_putc(seq, ':'); first = false; @@ -1770,7 +1741,7 @@ static int bch2_unfreeze(struct super_block *sb) struct bch_fs *c = sb->s_fs_info; int ret; - if (test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) + if (test_bit(BCH_FS_emergency_ro, &c->flags)) return 0; down_write(&c->state_lock); @@ -1805,17 +1776,18 @@ static int bch2_noset_super(struct super_block *s, void *data) return -EBUSY; } +typedef DARRAY(struct bch_fs *) darray_fs; + static int bch2_test_super(struct super_block *s, void *data) { struct bch_fs *c = s->s_fs_info; - struct bch_fs **devs = data; - unsigned i; + darray_fs *d = data; if (!c) return false; - for (i = 0; devs[i]; i++) - if (c != devs[i]) + darray_for_each(*d, i) + if (c != *i) return false; return true; } @@ -1824,13 +1796,9 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { struct bch_fs *c; - struct bch_dev *ca; struct super_block *sb; struct inode *vinode; struct bch_opts opts = bch2_opts_empty(); - char **devs; - struct bch_fs **devs_to_fs = NULL; - unsigned i, nr_devs; int ret; opt_set(opts, read_only, (flags & SB_RDONLY) != 0); @@ -1842,25 +1810,25 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type, if (!dev_name || strlen(dev_name) == 0) return ERR_PTR(-EINVAL); - devs = split_devs(dev_name, &nr_devs); - if (!devs) - return ERR_PTR(-ENOMEM); + darray_str devs; + ret = bch2_split_devs(dev_name, &devs); + if (ret) + return ERR_PTR(ret); - devs_to_fs = kcalloc(nr_devs + 1, sizeof(void *), GFP_KERNEL); - if (!devs_to_fs) { - sb = ERR_PTR(-ENOMEM); - goto got_sb; + darray_fs devs_to_fs = {}; + darray_for_each(devs, i) { + ret = darray_push(&devs_to_fs, bch2_path_to_fs(*i)); + if (ret) { + sb = ERR_PTR(ret); + goto got_sb; + } } - for (i = 0; i < nr_devs; i++) - devs_to_fs[i] = bch2_path_to_fs(devs[i]); - - sb = sget(fs_type, bch2_test_super, bch2_noset_super, - flags|SB_NOSEC, devs_to_fs); + sb = sget(fs_type, bch2_test_super, bch2_noset_super, flags|SB_NOSEC, &devs_to_fs); if (!IS_ERR(sb)) goto got_sb; - c = bch2_fs_open(devs, nr_devs, opts); + c = bch2_fs_open(devs.data, devs.nr, opts); if (IS_ERR(c)) { sb = ERR_CAST(c); goto got_sb; @@ -1880,9 +1848,8 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type, if (IS_ERR(sb)) bch2_fs_stop(c); got_sb: - kfree(devs_to_fs); - kfree(devs[0]); - kfree(devs); + darray_exit(&devs_to_fs); + bch2_darray_str_exit(&devs); if (IS_ERR(sb)) { ret = PTR_ERR(sb); @@ -1923,7 +1890,7 @@ got_sb: sb->s_bdi->ra_pages = VM_READAHEAD_PAGES; - for_each_online_member(ca, c, i) { + for_each_online_member(c, ca) { struct block_device *bdev = ca->disk_sb.bdev; /* XXX: create an anonymous device for multi device filesystems */ @@ -1944,10 +1911,9 @@ got_sb: vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM); ret = PTR_ERR_OR_ZERO(vinode); - if (ret) { - bch_err_msg(c, ret, "mounting: error getting root inode"); + bch_err_msg(c, ret, "mounting: error getting root inode"); + if (ret) goto err_put_super; - } sb->s_root = d_make_root(vinode); if (!sb->s_root) { diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index 5edf1d4b9e6b..c3af7225ff69 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -77,9 +77,8 @@ static inline int ptrcmp(void *l, void *r) } enum bch_inode_lock_op { - INODE_LOCK = (1U << 0), - INODE_PAGECACHE_BLOCK = (1U << 1), - INODE_UPDATE_LOCK = (1U << 2), + INODE_PAGECACHE_BLOCK = (1U << 0), + INODE_UPDATE_LOCK = (1U << 1), }; #define bch2_lock_inodes(_locks, ...) \ @@ -91,8 +90,6 @@ do { \ \ for (i = 1; i < ARRAY_SIZE(a); i++) \ if (a[i] != a[i - 1]) { \ - if ((_locks) & INODE_LOCK) \ - down_write_nested(&a[i]->v.i_rwsem, i); \ if ((_locks) & INODE_PAGECACHE_BLOCK) \ bch2_pagecache_block_get(a[i]);\ if ((_locks) & INODE_UPDATE_LOCK) \ @@ -109,8 +106,6 @@ do { \ \ for (i = 1; i < ARRAY_SIZE(a); i++) \ if (a[i] != a[i - 1]) { \ - if ((_locks) & INODE_LOCK) \ - up_write(&a[i]->v.i_rwsem); \ if ((_locks) & INODE_PAGECACHE_BLOCK) \ bch2_pagecache_block_put(a[i]);\ if ((_locks) & INODE_UPDATE_LOCK) \ diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index e0c5cd119acc..6a760777bafb 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -20,8 +20,6 @@ #include <linux/bsearch.h> #include <linux/dcache.h> /* struct qstr */ -#define QSTR(n) { { { .len = strlen(n) } }, .name = n } - /* * XXX: this is handling transaction restarts without returning * -BCH_ERR_transaction_restart_nested, this is not how we do things anymore: @@ -29,19 +27,16 @@ static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum, u32 snapshot) { - struct btree_iter iter; - struct bkey_s_c k; u64 sectors = 0; - int ret; - for_each_btree_key_upto(trans, iter, BTREE_ID_extents, + int ret = for_each_btree_key_upto(trans, iter, BTREE_ID_extents, SPOS(inum, 0, snapshot), POS(inum, U64_MAX), - 0, k, ret) + 0, k, ({ if (bkey_extent_is_allocation(k.k)) sectors += k.k->size; - - bch2_trans_iter_exit(trans, &iter); + 0; + })); return ret ?: sectors; } @@ -49,45 +44,23 @@ static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum, static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum, u32 snapshot) { - struct btree_iter iter; - struct bkey_s_c k; - struct bkey_s_c_dirent d; u64 subdirs = 0; - int ret; - - for_each_btree_key_upto(trans, iter, BTREE_ID_dirents, - SPOS(inum, 0, snapshot), - POS(inum, U64_MAX), - 0, k, ret) { - if (k.k->type != KEY_TYPE_dirent) - continue; - d = bkey_s_c_to_dirent(k); - if (d.v->d_type == DT_DIR) + int ret = for_each_btree_key_upto(trans, iter, BTREE_ID_dirents, + SPOS(inum, 0, snapshot), + POS(inum, U64_MAX), + 0, k, ({ + if (k.k->type == KEY_TYPE_dirent && + bkey_s_c_to_dirent(k).v->d_type == DT_DIR) subdirs++; - } - bch2_trans_iter_exit(trans, &iter); + 0; + })); return ret ?: subdirs; } -static int __snapshot_lookup_subvol(struct btree_trans *trans, u32 snapshot, - u32 *subvol) -{ - struct bch_snapshot s; - int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots, - POS(0, snapshot), 0, - snapshot, &s); - if (!ret) - *subvol = le32_to_cpu(s.subvol); - else if (bch2_err_matches(ret, ENOENT)) - bch_err(trans->c, "snapshot %u not found", snapshot); - return ret; - -} - -static int __subvol_lookup(struct btree_trans *trans, u32 subvol, - u32 *snapshot, u64 *inum) +static int subvol_lookup(struct btree_trans *trans, u32 subvol, + u32 *snapshot, u64 *inum) { struct bch_subvolume s; int ret; @@ -99,12 +72,6 @@ static int __subvol_lookup(struct btree_trans *trans, u32 subvol, return ret; } -static int subvol_lookup(struct btree_trans *trans, u32 subvol, - u32 *snapshot, u64 *inum) -{ - return lockrestart_do(trans, __subvol_lookup(trans, subvol, snapshot, inum)); -} - static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr, struct bch_inode_unpacked *inode) { @@ -132,7 +99,7 @@ err: return ret; } -static int __lookup_inode(struct btree_trans *trans, u64 inode_nr, +static int lookup_inode(struct btree_trans *trans, u64 inode_nr, struct bch_inode_unpacked *inode, u32 *snapshot) { @@ -152,29 +119,19 @@ static int __lookup_inode(struct btree_trans *trans, u64 inode_nr, if (!ret) *snapshot = iter.pos.snapshot; err: - bch_err_msg(trans->c, ret, "fetching inode %llu:%u", inode_nr, *snapshot); bch2_trans_iter_exit(trans, &iter); return ret; } -static int lookup_inode(struct btree_trans *trans, u64 inode_nr, - struct bch_inode_unpacked *inode, - u32 *snapshot) -{ - return lockrestart_do(trans, __lookup_inode(trans, inode_nr, inode, snapshot)); -} - -static int __lookup_dirent(struct btree_trans *trans, +static int lookup_dirent_in_snapshot(struct btree_trans *trans, struct bch_hash_info hash_info, subvol_inum dir, struct qstr *name, - u64 *target, unsigned *type) + u64 *target, unsigned *type, u32 snapshot) { struct btree_iter iter; struct bkey_s_c_dirent d; - int ret; - - ret = bch2_hash_lookup(trans, &iter, bch2_dirent_hash_desc, - &hash_info, dir, name, 0); + int ret = bch2_hash_lookup_in_snapshot(trans, &iter, bch2_dirent_hash_desc, + &hash_info, dir, name, 0, snapshot); if (ret) return ret; @@ -207,12 +164,9 @@ static int fsck_write_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode, u32 snapshot) { - int ret = commit_do(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW, - __write_inode(trans, inode, snapshot)); - if (ret) - bch_err_fn(trans->c, ret); + int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + __write_inode(trans, inode, snapshot)); + bch_err_fn(trans->c, ret); return ret; } @@ -242,35 +196,44 @@ err: } /* Get lost+found, create if it doesn't exist: */ -static int lookup_lostfound(struct btree_trans *trans, u32 subvol, +static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, struct bch_inode_unpacked *lostfound) { struct bch_fs *c = trans->c; - struct bch_inode_unpacked root; - struct bch_hash_info root_hash_info; struct qstr lostfound_str = QSTR("lost+found"); - subvol_inum root_inum = { .subvol = subvol }; u64 inum = 0; unsigned d_type = 0; - u32 snapshot; int ret; - ret = __subvol_lookup(trans, subvol, &snapshot, &root_inum.inum); + struct bch_snapshot_tree st; + ret = bch2_snapshot_tree_lookup(trans, + bch2_snapshot_tree(c, snapshot), &st); + if (ret) + return ret; + + subvol_inum root_inum = { .subvol = le32_to_cpu(st.master_subvol) }; + u32 subvol_snapshot; + + ret = subvol_lookup(trans, le32_to_cpu(st.master_subvol), + &subvol_snapshot, &root_inum.inum); + bch_err_msg(c, ret, "looking up root subvol"); if (ret) return ret; - ret = __lookup_inode(trans, root_inum.inum, &root, &snapshot); + struct bch_inode_unpacked root_inode; + struct bch_hash_info root_hash_info; + u32 root_inode_snapshot = snapshot; + ret = lookup_inode(trans, root_inum.inum, &root_inode, &root_inode_snapshot); + bch_err_msg(c, ret, "looking up root inode"); if (ret) return ret; - root_hash_info = bch2_hash_info_init(c, &root); + root_hash_info = bch2_hash_info_init(c, &root_inode); - ret = __lookup_dirent(trans, root_hash_info, root_inum, - &lostfound_str, &inum, &d_type); - if (bch2_err_matches(ret, ENOENT)) { - bch_notice(c, "creating lost+found"); + ret = lookup_dirent_in_snapshot(trans, root_hash_info, root_inum, + &lostfound_str, &inum, &d_type, snapshot); + if (bch2_err_matches(ret, ENOENT)) goto create_lostfound; - } bch_err_fn(c, ret); if (ret) @@ -285,20 +248,53 @@ static int lookup_lostfound(struct btree_trans *trans, u32 subvol, * The bch2_check_dirents pass has already run, dangling dirents * shouldn't exist here: */ - return __lookup_inode(trans, inum, lostfound, &snapshot); + ret = lookup_inode(trans, inum, lostfound, &snapshot); + bch_err_msg(c, ret, "looking up lost+found %llu:%u in (root inode %llu, snapshot root %u)", + inum, snapshot, root_inum.inum, bch2_snapshot_root(c, snapshot)); + return ret; create_lostfound: + /* + * XXX: we could have a nicer log message here if we had a nice way to + * walk backpointers to print a path + */ + bch_notice(c, "creating lost+found in snapshot %u", le32_to_cpu(st.root_snapshot)); + + u64 now = bch2_current_time(c); + struct btree_iter lostfound_iter = { NULL }; + u64 cpu = raw_smp_processor_id(); + bch2_inode_init_early(c, lostfound); + bch2_inode_init_late(lostfound, now, 0, 0, S_IFDIR|0700, 0, &root_inode); + lostfound->bi_dir = root_inode.bi_inum; + + root_inode.bi_nlink++; + + ret = bch2_inode_create(trans, &lostfound_iter, lostfound, snapshot, cpu); + if (ret) + goto err; - ret = bch2_create_trans(trans, root_inum, &root, - lostfound, &lostfound_str, - 0, 0, S_IFDIR|0700, 0, NULL, NULL, - (subvol_inum) { }, 0); + bch2_btree_iter_set_snapshot(&lostfound_iter, snapshot); + ret = bch2_btree_iter_traverse(&lostfound_iter); + if (ret) + goto err; + + ret = bch2_dirent_create_snapshot(trans, + root_inode.bi_inum, snapshot, &root_hash_info, + mode_to_type(lostfound->bi_mode), + &lostfound_str, + lostfound->bi_inum, + &lostfound->bi_dir_offset, + BCH_HASH_SET_MUST_CREATE) ?: + bch2_inode_write_flags(trans, &lostfound_iter, lostfound, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); +err: bch_err_msg(c, ret, "creating lost+found"); + bch2_trans_iter_exit(trans, &lostfound_iter); return ret; } -static int __reattach_inode(struct btree_trans *trans, +static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode, u32 inode_snapshot) { @@ -307,14 +303,9 @@ static int __reattach_inode(struct btree_trans *trans, char name_buf[20]; struct qstr name; u64 dir_offset = 0; - u32 subvol; int ret; - ret = __snapshot_lookup_subvol(trans, inode_snapshot, &subvol); - if (ret) - return ret; - - ret = lookup_lostfound(trans, subvol, &lostfound); + ret = lookup_lostfound(trans, inode_snapshot, &lostfound); if (ret) return ret; @@ -331,15 +322,12 @@ static int __reattach_inode(struct btree_trans *trans, snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum); name = (struct qstr) QSTR(name_buf); - ret = bch2_dirent_create(trans, - (subvol_inum) { - .subvol = subvol, - .inum = lostfound.bi_inum, - }, - &dir_hash, - inode_d_type(inode), - &name, inode->bi_inum, &dir_offset, - BCH_HASH_SET_MUST_CREATE); + ret = bch2_dirent_create_snapshot(trans, + lostfound.bi_inum, inode_snapshot, + &dir_hash, + inode_d_type(inode), + &name, inode->bi_inum, &dir_offset, + BCH_HASH_SET_MUST_CREATE); if (ret) return ret; @@ -349,18 +337,6 @@ static int __reattach_inode(struct btree_trans *trans, return __write_inode(trans, inode, inode_snapshot); } -static int reattach_inode(struct btree_trans *trans, - struct bch_inode_unpacked *inode, - u32 inode_snapshot) -{ - int ret = commit_do(trans, NULL, NULL, - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_NOFAIL, - __reattach_inode(trans, inode, inode_snapshot)); - bch_err_msg(trans->c, ret, "reattaching inode %llu", inode->bi_inum); - return ret; -} - static int remove_backpointer(struct btree_trans *trans, struct bch_inode_unpacked *inode) { @@ -405,7 +381,7 @@ static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s }; int ret = 0; - darray_for_each(s->ids, i) { + __darray_for_each(s->ids, i) { if (i->id == id) return 0; if (i->id > id) @@ -422,7 +398,7 @@ static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, enum btree_id btree_id, struct bpos pos) { - struct snapshots_seen_entry *i, n = { + struct snapshots_seen_entry n = { .id = pos.snapshot, .equiv = bch2_snapshot_equiv(c, pos.snapshot), }; @@ -448,7 +424,7 @@ static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, bch2_btree_id_str(btree_id), pos.inode, pos.offset, i->id, n.id, n.equiv); - set_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags); + set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags); return bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_delete_dead_snapshots); } } @@ -593,14 +569,13 @@ static int get_inodes_all_snapshots(struct btree_trans *trans, struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_s_c k; - u32 restart_count = trans->restart_count; int ret; w->recalculate_sums = false; w->inodes.nr = 0; - for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, inum), - BTREE_ITER_ALL_SNAPSHOTS, k, ret) { + for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum), + BTREE_ITER_ALL_SNAPSHOTS, k, ret) { if (k.k->p.offset != inum) break; @@ -613,8 +588,7 @@ static int get_inodes_all_snapshots(struct btree_trans *trans, return ret; w->first_this_inode = true; - - return trans_was_restarted(trans, restart_count); + return 0; } static struct inode_walker_entry * @@ -625,7 +599,7 @@ lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, snapshot = bch2_snapshot_equiv(c, snapshot); - darray_for_each(w->inodes, i) + __darray_for_each(w->inodes, i) if (bch2_snapshot_is_ancestor(c, snapshot, i->snapshot)) goto found; @@ -667,11 +641,8 @@ static struct inode_walker_entry *walk_inode(struct btree_trans *trans, if (ret) return ERR_PTR(ret); } else if (bkey_cmp(w->last_pos, pos)) { - struct inode_walker_entry *i; - darray_for_each(w->inodes, i) i->seen_this_pos = false; - } w->last_pos = pos; @@ -756,9 +727,7 @@ static int hash_redo_key(struct btree_trans *trans, k.k->p.snapshot, tmp, BCH_HASH_SET_MUST_CREATE, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: - bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW); + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); } static int hash_check_key(struct btree_trans *trans, @@ -826,6 +795,18 @@ fsck_err: goto out; } +static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p) +{ + struct btree_iter iter; + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_deleted_inodes, p, 0); + int ret = bkey_err(k); + if (ret) + return ret; + + bch2_trans_iter_exit(trans, &iter); + return k.k->type == KEY_TYPE_set; +} + static int check_inode(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, @@ -867,7 +848,7 @@ static int check_inode(struct btree_trans *trans, c, inode_snapshot_mismatch, "inodes in different snapshots don't match")) { bch_err(c, "repair not implemented yet"); - return -EINVAL; + return -BCH_ERR_fsck_repair_unimplemented; } if ((u.bi_flags & (BCH_INODE_i_size_dirty|BCH_INODE_unlinked)) && @@ -890,14 +871,22 @@ static int check_inode(struct btree_trans *trans, return 0; } + if (u.bi_flags & BCH_INODE_unlinked) { + ret = check_inode_deleted_list(trans, k.k->p); + if (ret < 0) + return ret; + + fsck_err_on(ret, c, unlinked_inode_not_on_deleted_list, + "inode %llu:%u unlinked, but not on deleted list", + u.bi_inum, k.k->p.snapshot); + ret = 0; + } + if (u.bi_flags & BCH_INODE_unlinked && (!c->sb.clean || fsck_err(c, inode_unlinked_but_clean, "filesystem marked clean, but inode %llu unlinked", u.bi_inum))) { - bch2_trans_unlock(trans); - bch2_fs_lazy_rw(c); - ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot); bch_err_msg(c, ret, "in fsck deleting inode"); return ret; @@ -910,9 +899,6 @@ static int check_inode(struct btree_trans *trans, u.bi_inum))) { bch_verbose(c, "truncating inode %llu", u.bi_inum); - bch2_trans_unlock(trans); - bch2_fs_lazy_rw(c); - /* * XXX: need to truncate partial blocks too here - or ideally * just switch units to bytes and that issue goes away @@ -976,27 +962,22 @@ fsck_err: return ret; } -noinline_for_stack int bch2_check_inodes(struct bch_fs *c) { bool full = c->opts.fsck; - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; struct bch_inode_unpacked prev = { 0 }; struct snapshots_seen s; - struct bkey_s_c k; - int ret; snapshots_seen_init(&s); - ret = for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, - POS_MIN, - BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, - NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, - check_inode(trans, &iter, k, &prev, &s, full)); + int ret = bch2_trans_run(c, + for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, + POS_MIN, + BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + check_inode(trans, &iter, k, &prev, &s, full))); snapshots_seen_exit(&s); - bch2_trans_put(trans); bch_err_fn(c, ret); return ret; } @@ -1023,29 +1004,9 @@ static bool dirent_points_to_inode(struct bkey_s_c_dirent d, : le64_to_cpu(d.v->d_inum) == inode->bi_inum; } -static int inode_backpointer_exists(struct btree_trans *trans, - struct bch_inode_unpacked *inode, - u32 snapshot) -{ - struct btree_iter iter; - struct bkey_s_c_dirent d; - int ret; - - d = dirent_get_by_pos(trans, &iter, - SPOS(inode->bi_dir, inode->bi_dir_offset, snapshot)); - ret = bkey_err(d); - if (ret) - return bch2_err_matches(ret, ENOENT) ? 0 : ret; - - ret = dirent_points_to_inode(d, inode); - bch2_trans_iter_exit(trans, &iter); - return ret; -} - static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w) { struct bch_fs *c = trans->c; - struct inode_walker_entry *i; u32 restart_count = trans->restart_count; int ret = 0; s64 count2; @@ -1094,11 +1055,8 @@ struct extent_ends { static void extent_ends_reset(struct extent_ends *extent_ends) { - struct extent_end *i; - darray_for_each(extent_ends->e, i) snapshots_seen_exit(&i->seen); - extent_ends->e.nr = 0; } @@ -1130,7 +1088,7 @@ static int extent_ends_at(struct bch_fs *c, if (!n.seen.ids.data) return -BCH_ERR_ENOMEM_fsck_extent_ends_at; - darray_for_each(extent_ends->e, i) { + __darray_for_each(extent_ends->e, i) { if (i->snapshot == k.k->p.snapshot) { snapshots_seen_exit(&i->seen); *i = n; @@ -1220,13 +1178,12 @@ static int overlapping_extents_found(struct btree_trans *trans, swap(k1, k2); } - trans->extra_journal_res += bch2_bkey_sectors_compressed(k2); + trans->extra_disk_res += bch2_bkey_sectors_compressed(k2); ret = bch2_trans_update_extent_overwrite(trans, old_iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE, k1, k2) ?: - bch2_trans_commit(trans, &res, NULL, - BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL); + bch2_trans_commit(trans, &res, NULL, BCH_TRANS_COMMIT_no_enospc); bch2_disk_reservation_put(c, &res); if (ret) @@ -1270,7 +1227,6 @@ static int check_overlapping_extents(struct btree_trans *trans, bool *fixed) { struct bch_fs *c = trans->c; - struct extent_end *i; int ret = 0; /* transaction restart, running again */ @@ -1451,32 +1407,28 @@ int bch2_check_extents(struct bch_fs *c) { struct inode_walker w = inode_walker_init(); struct snapshots_seen s; - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_s_c k; struct extent_ends extent_ends; struct disk_reservation res = { 0 }; - int ret = 0; snapshots_seen_init(&s); extent_ends_init(&extent_ends); - ret = for_each_btree_key_commit(trans, iter, BTREE_ID_extents, - POS(BCACHEFS_ROOT_INO, 0), - BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, - &res, NULL, - BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ({ - bch2_disk_reservation_put(c, &res); - check_extent(trans, &iter, k, &w, &s, &extent_ends) ?: - check_extent_overbig(trans, &iter, k); - })) ?: - check_i_sectors(trans, &w); + int ret = bch2_trans_run(c, + for_each_btree_key_commit(trans, iter, BTREE_ID_extents, + POS(BCACHEFS_ROOT_INO, 0), + BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + &res, NULL, + BCH_TRANS_COMMIT_no_enospc, ({ + bch2_disk_reservation_put(c, &res); + check_extent(trans, &iter, k, &w, &s, &extent_ends) ?: + check_extent_overbig(trans, &iter, k); + })) ?: + check_i_sectors(trans, &w)); bch2_disk_reservation_put(c, &res); extent_ends_exit(&extent_ends); inode_walker_exit(&w); snapshots_seen_exit(&s); - bch2_trans_put(trans); bch_err_fn(c, ret); return ret; @@ -1484,24 +1436,19 @@ int bch2_check_extents(struct bch_fs *c) int bch2_check_indirect_extents(struct bch_fs *c) { - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_s_c k; struct disk_reservation res = { 0 }; - int ret = 0; - ret = for_each_btree_key_commit(trans, iter, BTREE_ID_reflink, - POS_MIN, - BTREE_ITER_PREFETCH, k, - &res, NULL, - BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ({ - bch2_disk_reservation_put(c, &res); - check_extent_overbig(trans, &iter, k); - })); + int ret = bch2_trans_run(c, + for_each_btree_key_commit(trans, iter, BTREE_ID_reflink, + POS_MIN, + BTREE_ITER_PREFETCH, k, + &res, NULL, + BCH_TRANS_COMMIT_no_enospc, ({ + bch2_disk_reservation_put(c, &res); + check_extent_overbig(trans, &iter, k); + }))); bch2_disk_reservation_put(c, &res); - bch2_trans_put(trans); - bch_err_fn(c, ret); return ret; } @@ -1509,7 +1456,6 @@ int bch2_check_indirect_extents(struct bch_fs *c) static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w) { struct bch_fs *c = trans->c; - struct inode_walker_entry *i; u32 restart_count = trans->restart_count; int ret = 0; s64 count2; @@ -1553,8 +1499,8 @@ static int check_dirent_target(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct bkey_i_dirent *n; - bool backpointer_exists = true; struct printbuf buf = PRINTBUF; + struct btree_iter bp_iter = { NULL }; int ret = 0; if (!target->bi_dir && @@ -1568,25 +1514,37 @@ static int check_dirent_target(struct btree_trans *trans, } if (!inode_points_to_dirent(target, d)) { - ret = inode_backpointer_exists(trans, target, d.k->p.snapshot); - if (ret < 0) + struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter, + SPOS(target->bi_dir, target->bi_dir_offset, target_snapshot)); + ret = bkey_err(bp_dirent); + if (ret && !bch2_err_matches(ret, ENOENT)) goto err; - backpointer_exists = ret; + bool backpointer_exists = !ret; ret = 0; + bch2_bkey_val_to_text(&buf, c, d.s_c); + prt_newline(&buf); + if (backpointer_exists) + bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c); + if (fsck_err_on(S_ISDIR(target->bi_mode) && backpointer_exists, c, inode_dir_multiple_links, - "directory %llu with multiple links", - target->bi_inum)) { + "directory %llu:%u with multiple links\n%s", + target->bi_inum, target_snapshot, buf.buf)) { ret = __remove_dirent(trans, d.k->p); goto out; } + /* + * hardlinked file with nlink 0: + * We're just adjusting nlink here so check_nlinks() will pick + * it up, it ignores inodes with nlink 0 + */ if (fsck_err_on(backpointer_exists && !target->bi_nlink, c, inode_multiple_links_but_nlink_0, - "inode %llu type %s has multiple links but i_nlink 0", - target->bi_inum, bch2_d_types[d.v->d_type])) { + "inode %llu:%u type %s has multiple links but i_nlink 0\n%s", + target->bi_inum, target_snapshot, bch2_d_types[d.v->d_type], buf.buf)) { target->bi_nlink++; target->bi_flags &= ~BCH_INODE_unlinked; @@ -1636,13 +1594,12 @@ static int check_dirent_target(struct btree_trans *trans, d = dirent_i_to_s_c(n); } - if (d.v->d_type == DT_SUBVOL && - target->bi_parent_subvol != le32_to_cpu(d.v->d_parent_subvol) && - (c->sb.version < bcachefs_metadata_version_subvol_dirent || - fsck_err(c, dirent_d_parent_subvol_wrong, - "dirent has wrong d_parent_subvol field: got %u, should be %u", - le32_to_cpu(d.v->d_parent_subvol), - target->bi_parent_subvol))) { + if (fsck_err_on(d.v->d_type == DT_SUBVOL && + target->bi_parent_subvol != le32_to_cpu(d.v->d_parent_subvol), + c, dirent_d_parent_subvol_wrong, + "dirent has wrong d_parent_subvol field: got %u, should be %u", + le32_to_cpu(d.v->d_parent_subvol), + target->bi_parent_subvol)) { n = bch2_trans_kmalloc(trans, bkey_bytes(d.k)); ret = PTR_ERR_OR_ZERO(n); if (ret) @@ -1660,6 +1617,7 @@ static int check_dirent_target(struct btree_trans *trans, out: err: fsck_err: + bch2_trans_iter_exit(trans, &bp_iter); printbuf_exit(&buf); bch_err_fn(c, ret); return ret; @@ -1701,7 +1659,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, goto err; } - BUG_ON(!iter->path->should_be_locked); + BUG_ON(!btree_iter_path(trans, iter)->should_be_locked); i = walk_inode(trans, dir, equiv, k.k->type == KEY_TYPE_whiteout); ret = PTR_ERR_OR_ZERO(i); @@ -1754,7 +1712,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, u32 target_snapshot; u64 target_inum; - ret = __subvol_lookup(trans, target_subvol, + ret = subvol_lookup(trans, target_subvol, &target_snapshot, &target_inum); if (ret && !bch2_err_matches(ret, ENOENT)) goto err; @@ -1766,7 +1724,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, goto err; } - ret = __lookup_inode(trans, target_inum, + ret = lookup_inode(trans, target_inum, &subvol_root, &target_snapshot); if (ret && !bch2_err_matches(ret, ENOENT)) goto err; @@ -1842,22 +1800,18 @@ int bch2_check_dirents(struct bch_fs *c) struct inode_walker target = inode_walker_init(); struct snapshots_seen s; struct bch_hash_info hash_info; - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_s_c k; - int ret = 0; snapshots_seen_init(&s); - ret = for_each_btree_key_commit(trans, iter, BTREE_ID_dirents, - POS(BCACHEFS_ROOT_INO, 0), - BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, - k, - NULL, NULL, - BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, - check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s)); + int ret = bch2_trans_run(c, + for_each_btree_key_commit(trans, iter, BTREE_ID_dirents, + POS(BCACHEFS_ROOT_INO, 0), + BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, + k, + NULL, NULL, + BCH_TRANS_COMMIT_no_enospc, + check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s))); - bch2_trans_put(trans); snapshots_seen_exit(&s); inode_walker_exit(&dir); inode_walker_exit(&target); @@ -1908,8 +1862,6 @@ int bch2_check_xattrs(struct bch_fs *c) { struct inode_walker inode = inode_walker_init(); struct bch_hash_info hash_info; - struct btree_iter iter; - struct bkey_s_c k; int ret = 0; ret = bch2_trans_run(c, @@ -1918,7 +1870,7 @@ int bch2_check_xattrs(struct bch_fs *c) BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, NULL, NULL, - BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, + BCH_TRANS_COMMIT_no_enospc, check_xattr(trans, &iter, k, &hash_info, &inode))); bch_err_fn(c, ret); return ret; @@ -1932,7 +1884,7 @@ static int check_root_trans(struct btree_trans *trans) u64 inum; int ret; - ret = __subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum); + ret = subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum); if (ret && !bch2_err_matches(ret, ENOENT)) return ret; @@ -1948,18 +1900,13 @@ static int check_root_trans(struct btree_trans *trans) root_subvol.v.flags = 0; root_subvol.v.snapshot = cpu_to_le32(snapshot); root_subvol.v.inode = cpu_to_le64(inum); - ret = commit_do(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW, - bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, - &root_subvol.k_i, 0)); + ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &root_subvol.k_i, 0); bch_err_msg(c, ret, "writing root subvol"); if (ret) goto err; - } - ret = __lookup_inode(trans, BCACHEFS_ROOT_INO, &root_inode, &snapshot); + ret = lookup_inode(trans, BCACHEFS_ROOT_INO, &root_inode, &snapshot); if (ret && !bch2_err_matches(ret, ENOENT)) return ret; @@ -1983,11 +1930,7 @@ fsck_err: /* Get root directory, create if it doesn't exist: */ int bch2_check_root(struct bch_fs *c) { - int ret; - - ret = bch2_trans_do(c, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW, + int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, check_root_trans(trans)); bch_err_fn(c, ret); return ret; @@ -2002,13 +1945,10 @@ typedef DARRAY(struct pathbuf_entry) pathbuf; static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot) { - struct pathbuf_entry *i; - darray_for_each(*p, i) if (i->inum == inum && i->snapshot == snapshot) return true; - return false; } @@ -2057,10 +1997,10 @@ static int check_path(struct btree_trans *trans, break; } - ret = lockrestart_do(trans, - PTR_ERR_OR_ZERO((d = dirent_get_by_pos(trans, &dirent_iter, - SPOS(inode->bi_dir, inode->bi_dir_offset, - parent_snapshot))).k)); + d = dirent_get_by_pos(trans, &dirent_iter, + SPOS(inode->bi_dir, inode->bi_dir_offset, + parent_snapshot)); + ret = bkey_err(d.s_c); if (ret && !bch2_err_matches(ret, ENOENT)) break; @@ -2097,13 +2037,12 @@ static int check_path(struct btree_trans *trans, ret = lookup_inode(trans, inode->bi_dir, inode, &snapshot); if (ret) { /* Should have been caught in dirents pass */ - bch_err(c, "error looking up parent directory: %i", ret); + if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) + bch_err(c, "error looking up parent directory: %i", ret); break; } if (path_is_dup(p, inode->bi_inum, snapshot)) { - struct pathbuf_entry *i; - /* XXX print path */ bch_err(c, "directory structure loop"); @@ -2111,20 +2050,19 @@ static int check_path(struct btree_trans *trans, pr_err("%llu:%u", i->inum, i->snapshot); pr_err("%llu:%u", inode->bi_inum, snapshot); - if (!fsck_err(c, dir_loop, - "directory structure loop")) + if (!fsck_err(c, dir_loop, "directory structure loop")) return 0; - ret = commit_do(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW, - remove_backpointer(trans, inode)); - if (ret) { - bch_err(c, "error removing dirent: %i", ret); + ret = remove_backpointer(trans, inode); + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) + bch_err_msg(c, ret, "removing dirent"); + if (ret) break; - } ret = reattach_inode(trans, inode, snapshot); + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) + bch_err_msg(c, ret, "reattaching inode %llu", inode->bi_inum); + break; } } fsck_err: @@ -2139,37 +2077,28 @@ fsck_err: */ int bch2_check_directory_structure(struct bch_fs *c) { - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_s_c k; struct bch_inode_unpacked u; pathbuf path = { 0, }; int ret; - for_each_btree_key(trans, iter, BTREE_ID_inodes, POS_MIN, - BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH| - BTREE_ITER_ALL_SNAPSHOTS, k, ret) { - if (!bkey_is_inode(k.k)) - continue; + ret = bch2_trans_run(c, + for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS_MIN, + BTREE_ITER_INTENT| + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ + if (!bkey_is_inode(k.k)) + continue; - ret = bch2_inode_unpack(k, &u); - if (ret) { - /* Should have been caught earlier in fsck: */ - bch_err(c, "error unpacking inode %llu: %i", k.k->p.offset, ret); - break; - } + BUG_ON(bch2_inode_unpack(k, &u)); - if (u.bi_flags & BCH_INODE_unlinked) - continue; + if (u.bi_flags & BCH_INODE_unlinked) + continue; - ret = check_path(trans, &path, &u, iter.pos.snapshot); - if (ret) - break; - } - bch2_trans_iter_exit(trans, &iter); - bch2_trans_put(trans); + check_path(trans, &path, &u, iter.pos.snapshot); + }))); darray_exit(&path); + bch_err_fn(c, ret); return ret; } @@ -2255,47 +2184,39 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c, struct nlink_table *t, u64 start, u64 *end) { - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_s_c k; - struct bch_inode_unpacked u; - int ret = 0; - - for_each_btree_key(trans, iter, BTREE_ID_inodes, - POS(0, start), - BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH| - BTREE_ITER_ALL_SNAPSHOTS, k, ret) { - if (!bkey_is_inode(k.k)) - continue; - - /* Should never fail, checked by bch2_inode_invalid: */ - BUG_ON(bch2_inode_unpack(k, &u)); - - /* - * Backpointer and directory structure checks are sufficient for - * directories, since they can't have hardlinks: - */ - if (S_ISDIR(u.bi_mode)) - continue; + int ret = bch2_trans_run(c, + for_each_btree_key(trans, iter, BTREE_ID_inodes, + POS(0, start), + BTREE_ITER_INTENT| + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS, k, ({ + if (!bkey_is_inode(k.k)) + continue; - if (!u.bi_nlink) - continue; + /* Should never fail, checked by bch2_inode_invalid: */ + struct bch_inode_unpacked u; + BUG_ON(bch2_inode_unpack(k, &u)); - ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot); - if (ret) { - *end = k.k->p.offset; - ret = 0; - break; - } + /* + * Backpointer and directory structure checks are sufficient for + * directories, since they can't have hardlinks: + */ + if (S_ISDIR(u.bi_mode)) + continue; - } - bch2_trans_iter_exit(trans, &iter); - bch2_trans_put(trans); + if (!u.bi_nlink) + continue; - if (ret) - bch_err(c, "error in fsck: btree error %i while walking inodes", ret); + ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot); + if (ret) { + *end = k.k->p.offset; + ret = 0; + break; + } + 0; + }))); + bch_err_fn(c, ret); return ret; } @@ -2303,42 +2224,34 @@ noinline_for_stack static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links, u64 range_start, u64 range_end) { - struct btree_trans *trans = bch2_trans_get(c); struct snapshots_seen s; - struct btree_iter iter; - struct bkey_s_c k; - struct bkey_s_c_dirent d; - int ret; snapshots_seen_init(&s); - for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN, - BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH| - BTREE_ITER_ALL_SNAPSHOTS, k, ret) { - ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p); - if (ret) - break; - - switch (k.k->type) { - case KEY_TYPE_dirent: - d = bkey_s_c_to_dirent(k); + int ret = bch2_trans_run(c, + for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN, + BTREE_ITER_INTENT| + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS, k, ({ + ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p); + if (ret) + break; - if (d.v->d_type != DT_DIR && - d.v->d_type != DT_SUBVOL) - inc_link(c, &s, links, range_start, range_end, - le64_to_cpu(d.v->d_inum), - bch2_snapshot_equiv(c, d.k->p.snapshot)); - break; - } - } - bch2_trans_iter_exit(trans, &iter); + if (k.k->type == KEY_TYPE_dirent) { + struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); - if (ret) - bch_err(c, "error in fsck: btree error %i while walking dirents", ret); + if (d.v->d_type != DT_DIR && + d.v->d_type != DT_SUBVOL) + inc_link(c, &s, links, range_start, range_end, + le64_to_cpu(d.v->d_inum), + bch2_snapshot_equiv(c, d.k->p.snapshot)); + } + 0; + }))); - bch2_trans_put(trans); snapshots_seen_exit(&s); + + bch_err_fn(c, ret); return ret; } @@ -2389,19 +2302,16 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c, struct nlink_table *links, u64 range_start, u64 range_end) { - struct btree_iter iter; - struct bkey_s_c k; size_t idx = 0; - int ret = 0; - ret = bch2_trans_run(c, + int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS(0, range_start), BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, - NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, check_nlinks_update_inode(trans, &iter, k, links, &idx, range_end))); if (ret < 0) { - bch_err(c, "error in fsck: btree error %i while walking inodes", ret); + bch_err(c, "error in fsck walking inodes: %s", bch2_err_str(ret)); return ret; } @@ -2447,7 +2357,6 @@ static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter, { struct bkey_s_c_reflink_p p; struct bkey_i_reflink_p *u; - int ret; if (k.k->type != KEY_TYPE_reflink_p) return 0; @@ -2458,7 +2367,7 @@ static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter, return 0; u = bch2_trans_kmalloc(trans, sizeof(*u)); - ret = PTR_ERR_OR_ZERO(u); + int ret = PTR_ERR_OR_ZERO(u); if (ret) return ret; @@ -2471,19 +2380,15 @@ static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter, int bch2_fix_reflink_p(struct bch_fs *c) { - struct btree_iter iter; - struct bkey_s_c k; - int ret; - if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix) return 0; - ret = bch2_trans_run(c, + int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_extents, POS_MIN, BTREE_ITER_INTENT|BTREE_ITER_PREFETCH| BTREE_ITER_ALL_SNAPSHOTS, k, - NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, fix_reflink_p_key(trans, &iter, k))); bch_err_fn(c, ret); return ret; diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 9309cfeecd8d..086f0090b03a 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -506,22 +506,33 @@ fsck_err: static void __bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode) { - prt_printf(out, "mode=%o ", inode->bi_mode); + printbuf_indent_add(out, 2); + prt_printf(out, "mode=%o", inode->bi_mode); + prt_newline(out); prt_str(out, "flags="); prt_bitflags(out, bch2_inode_flag_strs, inode->bi_flags & ((1U << 20) - 1)); prt_printf(out, " (%x)", inode->bi_flags); + prt_newline(out); - prt_printf(out, " journal_seq=%llu bi_size=%llu bi_sectors=%llu bi_version=%llu", - inode->bi_journal_seq, - inode->bi_size, - inode->bi_sectors, - inode->bi_version); + prt_printf(out, "journal_seq=%llu", inode->bi_journal_seq); + prt_newline(out); + + prt_printf(out, "bi_size=%llu", inode->bi_size); + prt_newline(out); + + prt_printf(out, "bi_sectors=%llu", inode->bi_sectors); + prt_newline(out); + + prt_newline(out); + prt_printf(out, "bi_version=%llu", inode->bi_version); #define x(_name, _bits) \ - prt_printf(out, " "#_name "=%llu", (u64) inode->_name); + prt_printf(out, #_name "=%llu", (u64) inode->_name); \ + prt_newline(out); BCH_INODE_FIELDS_v3() #undef x + printbuf_indent_sub(out, 2); } void bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode) @@ -561,64 +572,46 @@ static inline bool bkey_is_deleted_inode(struct bkey_s_c k) return bkey_inode_flags(k) & BCH_INODE_unlinked; } -int bch2_trans_mark_inode(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, - struct bkey_i *new, - unsigned flags) +int bch2_trigger_inode(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, + struct bkey_s new, + unsigned flags) { - int nr = bkey_is_inode(&new->k) - bkey_is_inode(old.k); - bool old_deleted = bkey_is_deleted_inode(old); - bool new_deleted = bkey_is_deleted_inode(bkey_i_to_s_c(new)); + s64 nr = bkey_is_inode(new.k) - bkey_is_inode(old.k); - if (nr) { - int ret = bch2_replicas_deltas_realloc(trans, 0); - struct replicas_delta_list *d = trans->fs_usage_deltas; + if (flags & BTREE_TRIGGER_TRANSACTIONAL) { + if (nr) { + int ret = bch2_replicas_deltas_realloc(trans, 0); + if (ret) + return ret; - if (ret) - return ret; + trans->fs_usage_deltas->nr_inodes += nr; + } - d->nr_inodes += nr; + bool old_deleted = bkey_is_deleted_inode(old); + bool new_deleted = bkey_is_deleted_inode(new.s_c); + if (old_deleted != new_deleted) { + int ret = bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, new.k->p, new_deleted); + if (ret) + return ret; + } } - if (old_deleted != new_deleted) { - int ret = bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, new->k.p, new_deleted); - if (ret) - return ret; - } + if ((flags & BTREE_TRIGGER_ATOMIC) && (flags & BTREE_TRIGGER_INSERT)) { + BUG_ON(!trans->journal_res.seq); - return 0; -} - -int bch2_mark_inode(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, struct bkey_s_c new, - unsigned flags) -{ - struct bch_fs *c = trans->c; - struct bch_fs_usage *fs_usage; - u64 journal_seq = trans->journal_res.seq; - - if (flags & BTREE_TRIGGER_INSERT) { - struct bch_inode_v3 *v = (struct bch_inode_v3 *) new.v; - - BUG_ON(!journal_seq); - BUG_ON(new.k->type != KEY_TYPE_inode_v3); - - v->bi_journal_seq = cpu_to_le64(journal_seq); + bkey_s_to_inode_v3(new).v->bi_journal_seq = cpu_to_le64(trans->journal_res.seq); } if (flags & BTREE_TRIGGER_GC) { - percpu_down_read(&c->mark_lock); - preempt_disable(); + struct bch_fs *c = trans->c; - fs_usage = fs_usage_ptr(c, journal_seq, flags & BTREE_TRIGGER_GC); - fs_usage->nr_inodes += bkey_is_inode(new.k); - fs_usage->nr_inodes -= bkey_is_inode(old.k); - - preempt_enable(); + percpu_down_read(&c->mark_lock); + this_cpu_add(c->usage_gc->b.nr_inodes, nr); percpu_up_read(&c->mark_lock); } + return 0; } @@ -831,7 +824,7 @@ static int bch2_inode_delete_keys(struct btree_trans *trans, ret = bch2_trans_update(trans, &iter, &delete, 0) ?: bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL); + BCH_TRANS_COMMIT_no_enospc); err: if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) break; @@ -894,7 +887,7 @@ retry: ret = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?: bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL); + BCH_TRANS_COMMIT_no_enospc); err: bch2_trans_iter_exit(trans, &iter); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) @@ -1058,7 +1051,7 @@ retry: ret = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?: bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL); + BCH_TRANS_COMMIT_no_enospc); err: bch2_trans_iter_exit(trans, &iter); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) @@ -1155,51 +1148,48 @@ delete: int bch2_delete_dead_inodes(struct bch_fs *c) { struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_s_c k; bool need_another_pass; int ret; again: need_another_pass = false; - ret = bch2_btree_write_buffer_flush_sync(trans); - if (ret) - goto err; - /* * Weird transaction restart handling here because on successful delete, * bch2_inode_rm_snapshot() will return a nested transaction restart, * but we can't retry because the btree write buffer won't have been * flushed and we'd spin: */ - for_each_btree_key(trans, iter, BTREE_ID_deleted_inodes, POS_MIN, - BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, ret) { - ret = commit_do(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW, - may_delete_deleted_inode(trans, &iter, k.k->p, &need_another_pass)); - if (ret < 0) - break; - - if (ret) { - if (!test_bit(BCH_FS_RW, &c->flags)) { - bch2_trans_unlock(trans); - bch2_fs_lazy_rw(c); - } - + ret = for_each_btree_key_commit(trans, iter, BTREE_ID_deleted_inodes, POS_MIN, + BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ + ret = may_delete_deleted_inode(trans, &iter, k.k->p, &need_another_pass); + if (ret > 0) { bch_verbose(c, "deleting unlinked inode %llu:%u", k.k->p.offset, k.k->p.snapshot); ret = bch2_inode_rm_snapshot(trans, k.k->p.offset, k.k->p.snapshot); - if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - break; + /* + * We don't want to loop here: a transaction restart + * error here means we handled a transaction restart and + * we're actually done, but if we loop we'll retry the + * same key because the write buffer hasn't been flushed + * yet + */ + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { + ret = 0; + continue; + } } - } - bch2_trans_iter_exit(trans, &iter); - if (!ret && need_another_pass) + ret; + })); + + if (!ret && need_another_pass) { + ret = bch2_btree_write_buffer_flush_sync(trans); + if (ret) + goto err; goto again; + } err: bch2_trans_put(trans); - return ret; } diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index 88818a332b1e..b63f312581cf 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -17,32 +17,27 @@ int bch2_inode_v3_invalid(struct bch_fs *, struct bkey_s_c, enum bkey_invalid_flags, struct printbuf *); void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -int bch2_trans_mark_inode(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_i *, unsigned); -int bch2_mark_inode(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s_c, unsigned); +int bch2_trigger_inode(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_s, unsigned); #define bch2_bkey_ops_inode ((struct bkey_ops) { \ .key_invalid = bch2_inode_invalid, \ .val_to_text = bch2_inode_to_text, \ - .trans_trigger = bch2_trans_mark_inode, \ - .atomic_trigger = bch2_mark_inode, \ + .trigger = bch2_trigger_inode, \ .min_val_size = 16, \ }) #define bch2_bkey_ops_inode_v2 ((struct bkey_ops) { \ .key_invalid = bch2_inode_v2_invalid, \ .val_to_text = bch2_inode_to_text, \ - .trans_trigger = bch2_trans_mark_inode, \ - .atomic_trigger = bch2_mark_inode, \ + .trigger = bch2_trigger_inode, \ .min_val_size = 32, \ }) #define bch2_bkey_ops_inode_v3 ((struct bkey_ops) { \ .key_invalid = bch2_inode_v3_invalid, \ .val_to_text = bch2_inode_to_text, \ - .trans_trigger = bch2_trans_mark_inode, \ - .atomic_trigger = bch2_mark_inode, \ + .trigger = bch2_trigger_inode, \ .min_val_size = 48, \ }) diff --git a/fs/bcachefs/inode_format.h b/fs/bcachefs/inode_format.h new file mode 100644 index 000000000000..83d107331edf --- /dev/null +++ b/fs/bcachefs/inode_format.h @@ -0,0 +1,166 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_INODE_FORMAT_H +#define _BCACHEFS_INODE_FORMAT_H + +#define BLOCKDEV_INODE_MAX 4096 +#define BCACHEFS_ROOT_INO 4096 + +struct bch_inode { + struct bch_val v; + + __le64 bi_hash_seed; + __le32 bi_flags; + __le16 bi_mode; + __u8 fields[]; +} __packed __aligned(8); + +struct bch_inode_v2 { + struct bch_val v; + + __le64 bi_journal_seq; + __le64 bi_hash_seed; + __le64 bi_flags; + __le16 bi_mode; + __u8 fields[]; +} __packed __aligned(8); + +struct bch_inode_v3 { + struct bch_val v; + + __le64 bi_journal_seq; + __le64 bi_hash_seed; + __le64 bi_flags; + __le64 bi_sectors; + __le64 bi_size; + __le64 bi_version; + __u8 fields[]; +} __packed __aligned(8); + +#define INODEv3_FIELDS_START_INITIAL 6 +#define INODEv3_FIELDS_START_CUR (offsetof(struct bch_inode_v3, fields) / sizeof(__u64)) + +struct bch_inode_generation { + struct bch_val v; + + __le32 bi_generation; + __le32 pad; +} __packed __aligned(8); + +/* + * bi_subvol and bi_parent_subvol are only set for subvolume roots: + */ + +#define BCH_INODE_FIELDS_v2() \ + x(bi_atime, 96) \ + x(bi_ctime, 96) \ + x(bi_mtime, 96) \ + x(bi_otime, 96) \ + x(bi_size, 64) \ + x(bi_sectors, 64) \ + x(bi_uid, 32) \ + x(bi_gid, 32) \ + x(bi_nlink, 32) \ + x(bi_generation, 32) \ + x(bi_dev, 32) \ + x(bi_data_checksum, 8) \ + x(bi_compression, 8) \ + x(bi_project, 32) \ + x(bi_background_compression, 8) \ + x(bi_data_replicas, 8) \ + x(bi_promote_target, 16) \ + x(bi_foreground_target, 16) \ + x(bi_background_target, 16) \ + x(bi_erasure_code, 16) \ + x(bi_fields_set, 16) \ + x(bi_dir, 64) \ + x(bi_dir_offset, 64) \ + x(bi_subvol, 32) \ + x(bi_parent_subvol, 32) + +#define BCH_INODE_FIELDS_v3() \ + x(bi_atime, 96) \ + x(bi_ctime, 96) \ + x(bi_mtime, 96) \ + x(bi_otime, 96) \ + x(bi_uid, 32) \ + x(bi_gid, 32) \ + x(bi_nlink, 32) \ + x(bi_generation, 32) \ + x(bi_dev, 32) \ + x(bi_data_checksum, 8) \ + x(bi_compression, 8) \ + x(bi_project, 32) \ + x(bi_background_compression, 8) \ + x(bi_data_replicas, 8) \ + x(bi_promote_target, 16) \ + x(bi_foreground_target, 16) \ + x(bi_background_target, 16) \ + x(bi_erasure_code, 16) \ + x(bi_fields_set, 16) \ + x(bi_dir, 64) \ + x(bi_dir_offset, 64) \ + x(bi_subvol, 32) \ + x(bi_parent_subvol, 32) \ + x(bi_nocow, 8) + +/* subset of BCH_INODE_FIELDS */ +#define BCH_INODE_OPTS() \ + x(data_checksum, 8) \ + x(compression, 8) \ + x(project, 32) \ + x(background_compression, 8) \ + x(data_replicas, 8) \ + x(promote_target, 16) \ + x(foreground_target, 16) \ + x(background_target, 16) \ + x(erasure_code, 16) \ + x(nocow, 8) + +enum inode_opt_id { +#define x(name, ...) \ + Inode_opt_##name, + BCH_INODE_OPTS() +#undef x + Inode_opt_nr, +}; + +#define BCH_INODE_FLAGS() \ + x(sync, 0) \ + x(immutable, 1) \ + x(append, 2) \ + x(nodump, 3) \ + x(noatime, 4) \ + x(i_size_dirty, 5) \ + x(i_sectors_dirty, 6) \ + x(unlinked, 7) \ + x(backptr_untrusted, 8) + +/* bits 20+ reserved for packed fields below: */ + +enum bch_inode_flags { +#define x(t, n) BCH_INODE_##t = 1U << n, + BCH_INODE_FLAGS() +#undef x +}; + +enum __bch_inode_flags { +#define x(t, n) __BCH_INODE_##t = n, + BCH_INODE_FLAGS() +#undef x +}; + +LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24); +LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 31); +LE32_BITMASK(INODE_NEW_VARINT, struct bch_inode, bi_flags, 31, 32); + +LE64_BITMASK(INODEv2_STR_HASH, struct bch_inode_v2, bi_flags, 20, 24); +LE64_BITMASK(INODEv2_NR_FIELDS, struct bch_inode_v2, bi_flags, 24, 31); + +LE64_BITMASK(INODEv3_STR_HASH, struct bch_inode_v3, bi_flags, 20, 24); +LE64_BITMASK(INODEv3_NR_FIELDS, struct bch_inode_v3, bi_flags, 24, 31); + +LE64_BITMASK(INODEv3_FIELDS_START, + struct bch_inode_v3, bi_flags, 31, 36); +LE64_BITMASK(INODEv3_MODE, struct bch_inode_v3, bi_flags, 36, 52); + +#endif /* _BCACHEFS_INODE_FORMAT_H */ diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c index bebc11444ef5..1baf78594cca 100644 --- a/fs/bcachefs/io_misc.c +++ b/fs/bcachefs/io_misc.c @@ -34,8 +34,7 @@ int bch2_extent_fallocate(struct btree_trans *trans, struct open_buckets open_buckets = { 0 }; struct bkey_s_c k; struct bkey_buf old, new; - unsigned sectors_allocated = 0; - bool have_reservation = false; + unsigned sectors_allocated = 0, new_replicas; bool unwritten = opts.nocow && c->sb.version >= bcachefs_metadata_version_unwritten_extents; int ret; @@ -50,28 +49,20 @@ int bch2_extent_fallocate(struct btree_trans *trans, return ret; sectors = min_t(u64, sectors, k.k->p.offset - iter->pos.offset); + new_replicas = max(0, (int) opts.data_replicas - + (int) bch2_bkey_nr_ptrs_fully_allocated(k)); - if (!have_reservation) { - unsigned new_replicas = - max(0, (int) opts.data_replicas - - (int) bch2_bkey_nr_ptrs_fully_allocated(k)); - /* - * Get a disk reservation before (in the nocow case) calling - * into the allocator: - */ - ret = bch2_disk_reservation_get(c, &disk_res, sectors, new_replicas, 0); - if (unlikely(ret)) - goto err; - - bch2_bkey_buf_reassemble(&old, c, k); - } + /* + * Get a disk reservation before (in the nocow case) calling + * into the allocator: + */ + ret = bch2_disk_reservation_get(c, &disk_res, sectors, new_replicas, 0); + if (unlikely(ret)) + goto err_noprint; - if (have_reservation) { - if (!bch2_extents_match(k, bkey_i_to_s_c(old.k))) - goto err; + bch2_bkey_buf_reassemble(&old, c, k); - bch2_key_resize(&new.k->k, sectors); - } else if (!unwritten) { + if (!unwritten) { struct bkey_i_reservation *reservation; bch2_bkey_buf_realloc(&new, c, sizeof(*reservation) / sizeof(u64)); @@ -83,7 +74,6 @@ int bch2_extent_fallocate(struct btree_trans *trans, struct bkey_i_extent *e; struct bch_devs_list devs_have; struct write_point *wp; - struct bch_extent_ptr *ptr; devs_have.nr = 0; @@ -118,14 +108,17 @@ int bch2_extent_fallocate(struct btree_trans *trans, ptr->unwritten = true; } - have_reservation = true; - ret = bch2_extent_update(trans, inum, iter, new.k, &disk_res, 0, i_sectors_delta, true); err: if (!ret && sectors_allocated) bch2_increment_clock(c, sectors_allocated, WRITE); - + if (should_print_err(ret)) + bch_err_inum_offset_ratelimited(c, + inum.inum, + iter->pos.offset << 9, + "%s(): error: %s", __func__, bch2_err_str(ret)); +err_noprint: bch2_open_buckets_put(c, &open_buckets); bch2_disk_reservation_put(c, &disk_res); bch2_bkey_buf_exit(&new, c); @@ -256,7 +249,7 @@ static int __bch2_resume_logged_op_truncate(struct btree_trans *trans, u64 new_i_size = le64_to_cpu(op->v.new_i_size); int ret; - ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, + ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, truncate_set_isize(trans, inum, new_i_size)); if (ret) goto err; @@ -378,7 +371,7 @@ case LOGGED_OP_FINSERT_start: op->v.state = LOGGED_OP_FINSERT_shift_extents; if (insert) { - ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, + ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, adjust_i_size(trans, inum, src_offset, len) ?: bch2_logged_op_update(trans, &op->k_i)); if (ret) @@ -390,7 +383,7 @@ case LOGGED_OP_FINSERT_start: if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto err; - ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, + ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_logged_op_update(trans, &op->k_i)); } @@ -449,13 +442,11 @@ case LOGGED_OP_FINSERT_shift_extents: op->v.pos = cpu_to_le64(insert ? bkey_start_offset(&delete.k) : delete.k.p.offset); - ret = bch2_bkey_set_needs_rebalance(c, copy, - opts.background_target, - opts.background_compression) ?: + ret = bch2_bkey_set_needs_rebalance(c, copy, &opts) ?: bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?: bch2_btree_insert_trans(trans, BTREE_ID_extents, copy, 0) ?: bch2_logged_op_update(trans, &op->k_i) ?: - bch2_trans_commit(trans, &disk_res, NULL, BTREE_INSERT_NOFAIL); + bch2_trans_commit(trans, &disk_res, NULL, BCH_TRANS_COMMIT_no_enospc); btree_err: bch2_disk_reservation_put(c, &disk_res); @@ -470,12 +461,12 @@ btree_err: op->v.state = LOGGED_OP_FINSERT_finish; if (!insert) { - ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, + ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, adjust_i_size(trans, inum, src_offset, shift) ?: bch2_logged_op_update(trans, &op->k_i)); } else { /* We need an inode update to update bi_journal_seq for fsync: */ - ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, + ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, adjust_i_size(trans, inum, 0, 0) ?: bch2_logged_op_update(trans, &op->k_i)); } diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index 36763865facd..3c574d8873a1 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -80,7 +80,7 @@ struct promote_op { struct bpos pos; struct data_update write; - struct bio_vec bi_inline_vecs[0]; /* must be last */ + struct bio_vec bi_inline_vecs[]; /* must be last */ }; static const struct rhashtable_params bch_promote_params = { @@ -172,11 +172,13 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans, int ret; if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote)) - return NULL; + return ERR_PTR(-BCH_ERR_nopromote_no_writes); - op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOFS); - if (!op) + op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_KERNEL); + if (!op) { + ret = -BCH_ERR_nopromote_enomem; goto err; + } op->start_time = local_clock(); op->pos = pos; @@ -187,24 +189,29 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans, */ *rbio = kzalloc(sizeof(struct bch_read_bio) + sizeof(struct bio_vec) * pages, - GFP_NOFS); - if (!*rbio) + GFP_KERNEL); + if (!*rbio) { + ret = -BCH_ERR_nopromote_enomem; goto err; + } rbio_init(&(*rbio)->bio, opts); bio_init(&(*rbio)->bio, NULL, (*rbio)->bio.bi_inline_vecs, pages, 0); - if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9, - GFP_NOFS)) + if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9, GFP_KERNEL)) { + ret = -BCH_ERR_nopromote_enomem; goto err; + } (*rbio)->bounce = true; (*rbio)->split = true; (*rbio)->kmalloc = true; if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash, - bch_promote_params)) + bch_promote_params)) { + ret = -BCH_ERR_nopromote_in_flight; goto err; + } bio = &op->write.op.wbio.bio; bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0); @@ -223,9 +230,8 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans, * -BCH_ERR_ENOSPC_disk_reservation: */ if (ret) { - ret = rhashtable_remove_fast(&c->promote_table, &op->hash, - bch_promote_params); - BUG_ON(ret); + BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash, + bch_promote_params)); goto err; } @@ -239,7 +245,7 @@ err: *rbio = NULL; kfree(op); bch2_write_ref_put(c, BCH_WRITE_REF_promote); - return NULL; + return ERR_PTR(ret); } noinline @@ -274,10 +280,9 @@ static struct promote_op *promote_alloc(struct btree_trans *trans, ? BTREE_ID_reflink : BTREE_ID_extents, k, pos, pick, opts, sectors, rbio); - if (!promote) { - ret = -BCH_ERR_nopromote_enomem; + ret = PTR_ERR_OR_ZERO(promote); + if (ret) goto nopromote; - } *bounce = true; *read_full = promote_full; @@ -526,7 +531,7 @@ out: static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) { - bch2_trans_do(rbio->c, NULL, NULL, BTREE_INSERT_NOFAIL, + bch2_trans_do(rbio->c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, __bch2_rbio_narrow_crcs(trans, rbio)); } @@ -637,12 +642,17 @@ csum_err: goto out; } + struct printbuf buf = PRINTBUF; + buf.atomic++; + prt_str(&buf, "data "); + bch2_csum_err_msg(&buf, crc.csum_type, rbio->pick.crc.csum, csum); + bch_err_inum_offset_ratelimited(ca, rbio->read_pos.inode, rbio->read_pos.offset << 9, - "data checksum error: expected %0llx:%0llx got %0llx:%0llx (type %s)", - rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo, - csum.hi, csum.lo, bch2_csum_types[crc.csum_type]); + "data %s", buf.buf); + printbuf_exit(&buf); + bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); goto out; diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index 8c8cb1541ac9..ef3a53f9045a 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -316,8 +316,8 @@ int bch2_extent_update(struct btree_trans *trans, i_sectors_delta) ?: bch2_trans_update(trans, iter, k, 0) ?: bch2_trans_commit(trans, disk_res, NULL, - BTREE_INSERT_NOCHECK_RW| - BTREE_INSERT_NOFAIL); + BCH_TRANS_COMMIT_no_check_rw| + BCH_TRANS_COMMIT_no_enospc); if (unlikely(ret)) return ret; @@ -362,9 +362,7 @@ static int bch2_write_index_default(struct bch_write_op *op) bkey_start_pos(&sk.k->k), BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - ret = bch2_bkey_set_needs_rebalance(c, sk.k, - op->opts.background_target, - op->opts.background_compression) ?: + ret = bch2_bkey_set_needs_rebalance(c, sk.k, &op->opts) ?: bch2_extent_update(trans, inum, &iter, sk.k, &op->res, op->new_i_size, &op->i_sectors_delta, @@ -396,17 +394,14 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, bool nocow) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k)); - const struct bch_extent_ptr *ptr; struct bch_write_bio *n; - struct bch_dev *ca; BUG_ON(c->opts.nochanges); bkey_for_each_ptr(ptrs, ptr) { - BUG_ON(ptr->dev >= BCH_SB_MEMBERS_MAX || - !c->devs[ptr->dev]); + BUG_ON(!bch2_dev_exists2(c, ptr->dev)); - ca = bch_dev_bkey_exists(c, ptr->dev); + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); if (to_entry(ptr + 1) < ptrs.end) { n = to_wbio(bio_alloc_clone(NULL, &wbio->bio, @@ -1109,16 +1104,14 @@ static bool bch2_extent_is_writeable(struct bch_write_op *op, static inline void bch2_nocow_write_unlock(struct bch_write_op *op) { struct bch_fs *c = op->c; - const struct bch_extent_ptr *ptr; - struct bkey_i *k; for_each_keylist_key(&op->insert_keys, k) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k)); bkey_for_each_ptr(ptrs, ptr) bch2_bucket_nocow_unlock(&c->nocow_locks, - PTR_BUCKET_POS(c, ptr), - BUCKET_NOCOW_LOCK_UPDATE); + PTR_BUCKET_POS(c, ptr), + BUCKET_NOCOW_LOCK_UPDATE); } } @@ -1128,25 +1121,20 @@ static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans, struct bkey_s_c k, u64 new_i_size) { - struct bkey_i *new; - struct bkey_ptrs ptrs; - struct bch_extent_ptr *ptr; - int ret; - if (!bch2_extents_match(bkey_i_to_s_c(orig), k)) { /* trace this */ return 0; } - new = bch2_bkey_make_mut_noupdate(trans, k); - ret = PTR_ERR_OR_ZERO(new); + struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); + int ret = PTR_ERR_OR_ZERO(new); if (ret) return ret; bch2_cut_front(bkey_start_pos(&orig->k), new); bch2_cut_back(orig->k.p, new); - ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); + struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); bkey_for_each_ptr(ptrs, ptr) ptr->unwritten = 0; @@ -1167,16 +1155,12 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) { struct bch_fs *c = op->c; struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_i *orig; - struct bkey_s_c k; - int ret; for_each_keylist_key(&op->insert_keys, orig) { - ret = for_each_btree_key_upto_commit(trans, iter, BTREE_ID_extents, + int ret = for_each_btree_key_upto_commit(trans, iter, BTREE_ID_extents, bkey_start_pos(&orig->k), orig->k.p, BTREE_ITER_INTENT, k, - NULL, NULL, BTREE_INSERT_NOFAIL, ({ + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ bch2_nocow_write_convert_one_unwritten(trans, &iter, orig, k, op->new_i_size); })); @@ -1228,10 +1212,7 @@ static void bch2_nocow_write(struct bch_write_op *op) struct btree_trans *trans; struct btree_iter iter; struct bkey_s_c k; - struct bkey_ptrs_c ptrs; - const struct bch_extent_ptr *ptr; DARRAY_PREALLOCATED(struct bucket_to_lock, 3) buckets; - struct bucket_to_lock *i; u32 snapshot; struct bucket_to_lock *stale_at; int ret; @@ -1273,7 +1254,7 @@ retry: break; /* Get iorefs before dropping btree locks: */ - ptrs = bch2_bkey_ptrs_c(k); + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); bkey_for_each_ptr(ptrs, ptr) { struct bpos b = PTR_BUCKET_POS(c, ptr); struct nocow_lock_bucket *l = @@ -1464,6 +1445,11 @@ err: op->flags |= BCH_WRITE_DONE; if (ret < 0) { + if (!(op->flags & BCH_WRITE_ALLOC_NOWAIT)) + bch_err_inum_offset_ratelimited(c, + op->pos.inode, + op->pos.offset << 9, + "%s(): error: %s", __func__, bch2_err_str(ret)); op->error = ret; break; } diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 8cf238be6213..bc890776eb57 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -10,6 +10,7 @@ #include "bkey_methods.h" #include "btree_gc.h" #include "btree_update.h" +#include "btree_write_buffer.h" #include "buckets.h" #include "error.h" #include "journal.h" @@ -26,6 +27,47 @@ static const char * const bch2_journal_errors[] = { NULL }; +static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u64 seq) +{ + union journal_res_state s = READ_ONCE(j->reservations); + unsigned i = seq & JOURNAL_BUF_MASK; + struct journal_buf *buf = j->buf + i; + + prt_printf(out, "seq:"); + prt_tab(out); + prt_printf(out, "%llu", seq); + prt_newline(out); + printbuf_indent_add(out, 2); + + prt_printf(out, "refcount:"); + prt_tab(out); + prt_printf(out, "%u", journal_state_count(s, i)); + prt_newline(out); + + prt_printf(out, "size:"); + prt_tab(out); + prt_human_readable_u64(out, vstruct_bytes(buf->data)); + prt_newline(out); + + prt_printf(out, "expires"); + prt_tab(out); + prt_printf(out, "%li jiffies", buf->expires - jiffies); + prt_newline(out); + + printbuf_indent_sub(out, 2); +} + +static void bch2_journal_bufs_to_text(struct printbuf *out, struct journal *j) +{ + if (!out->nr_tabstops) + printbuf_tabstop_push(out, 24); + + for (u64 seq = journal_last_unwritten_seq(j); + seq <= journal_cur_seq(j); + seq++) + bch2_journal_buf_to_text(out, j, seq); +} + static inline bool journal_seq_unwritten(struct journal *j, u64 seq) { return seq > j->seq_ondisk; @@ -155,7 +197,7 @@ void bch2_journal_buf_put_final(struct journal *j, u64 seq, bool write) * We don't close a journal_buf until the next journal_buf is finished writing, * and can be opened again - this also initializes the next journal_buf: */ -static void __journal_entry_close(struct journal *j, unsigned closed_val) +static void __journal_entry_close(struct journal *j, unsigned closed_val, bool trace) { struct bch_fs *c = container_of(j, struct bch_fs, journal); struct journal_buf *buf = journal_cur_buf(j); @@ -184,6 +226,18 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val) /* Close out old buffer: */ buf->data->u64s = cpu_to_le32(old.cur_entry_offset); + if (trace_journal_entry_close_enabled() && trace) { + struct printbuf pbuf = PRINTBUF; + pbuf.atomic++; + + prt_str(&pbuf, "entry size: "); + prt_human_readable_u64(&pbuf, vstruct_bytes(buf->data)); + prt_newline(&pbuf); + bch2_prt_task_backtrace(&pbuf, current, 1, GFP_NOWAIT); + trace_journal_entry_close(c, pbuf.buf); + printbuf_exit(&pbuf); + } + sectors = vstruct_blocks_plus(buf->data, c->block_bits, buf->u64s_reserved) << c->block_bits; BUG_ON(sectors > buf->sectors); @@ -222,7 +276,7 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val) void bch2_journal_halt(struct journal *j) { spin_lock(&j->lock); - __journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL); + __journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL, true); if (!j->err_seq) j->err_seq = journal_cur_seq(j); journal_wake(j); @@ -236,7 +290,7 @@ static bool journal_entry_want_write(struct journal *j) /* Don't close it yet if we already have a write in flight: */ if (ret) - __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); + __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true); else if (nr_unwritten_journal_entries(j)) { struct journal_buf *buf = journal_cur_buf(j); @@ -330,6 +384,7 @@ static int journal_entry_open(struct journal *j) buf->must_flush = false; buf->separate_flush = false; buf->flush_time = 0; + buf->need_flush_to_write_buffer = true; memset(buf->data, 0, sizeof(*buf->data)); buf->data->seq = cpu_to_le64(journal_cur_seq(j)); @@ -363,11 +418,6 @@ static int journal_entry_open(struct journal *j) } while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v); - if (j->res_get_blocked_start) - bch2_time_stats_update(j->blocked_time, - j->res_get_blocked_start); - j->res_get_blocked_start = 0; - mod_delayed_work(c->io_complete_wq, &j->write_work, msecs_to_jiffies(c->opts.journal_flush_delay)); @@ -407,7 +457,7 @@ static void journal_write_work(struct work_struct *work) if (delta > 0) mod_delayed_work(c->io_complete_wq, &j->write_work, delta); else - __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); + __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true); unlock: spin_unlock(&j->lock); } @@ -464,18 +514,23 @@ retry: buf->buf_size < JOURNAL_ENTRY_SIZE_MAX) j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1); - __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); + __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, false); ret = journal_entry_open(j); - if (ret == JOURNAL_ERR_max_in_flight) - trace_and_count(c, journal_entry_full, c); -unlock: - if ((ret && ret != JOURNAL_ERR_insufficient_devices) && - !j->res_get_blocked_start) { - j->res_get_blocked_start = local_clock() ?: 1; - trace_and_count(c, journal_full, c); - } + if (ret == JOURNAL_ERR_max_in_flight) { + track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], + &j->max_in_flight_start, true); + if (trace_journal_entry_full_enabled()) { + struct printbuf buf = PRINTBUF; + buf.atomic++; + bch2_journal_bufs_to_text(&buf, j); + trace_journal_entry_full(c, buf.buf); + printbuf_exit(&buf); + } + count_event(c, journal_entry_full); + } +unlock: can_discard = j->can_discard; spin_unlock(&j->lock); @@ -553,7 +608,7 @@ void bch2_journal_entry_res_resize(struct journal *j, /* * Not enough room in current journal entry, have to flush it: */ - __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); + __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true); } else { journal_cur_buf(j)->u64s_reserved += d; } @@ -610,7 +665,7 @@ recheck_need_open: struct journal_res res = { 0 }; if (journal_entry_is_open(j)) - __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); + __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true); spin_unlock(&j->lock); @@ -774,6 +829,48 @@ void bch2_journal_block(struct journal *j) journal_quiesce(j); } +static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq) +{ + struct journal_buf *ret = NULL; + + mutex_lock(&j->buf_lock); + spin_lock(&j->lock); + max_seq = min(max_seq, journal_cur_seq(j)); + + for (u64 seq = journal_last_unwritten_seq(j); + seq <= max_seq; + seq++) { + unsigned idx = seq & JOURNAL_BUF_MASK; + struct journal_buf *buf = j->buf + idx; + + if (buf->need_flush_to_write_buffer) { + if (seq == journal_cur_seq(j)) + __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true); + + union journal_res_state s; + s.v = atomic64_read_acquire(&j->reservations.counter); + + ret = journal_state_count(s, idx) + ? ERR_PTR(-EAGAIN) + : buf; + break; + } + } + + spin_unlock(&j->lock); + if (IS_ERR_OR_NULL(ret)) + mutex_unlock(&j->buf_lock); + return ret; +} + +struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq) +{ + struct journal_buf *ret; + + wait_event(j->wait, (ret = __bch2_next_write_buffer_flush_journal_buf(j, max_seq)) != ERR_PTR(-EAGAIN)); + return ret; +} + /* allocate journal on a device: */ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, @@ -955,8 +1052,7 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, break; } - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); unlock: up_write(&c->state_lock); return ret; @@ -986,17 +1082,13 @@ int bch2_dev_journal_alloc(struct bch_dev *ca) ret = __bch2_set_nr_journal_buckets(ca, nr, true, NULL); err: - if (ret) - bch_err_fn(ca, ret); + bch_err_fn(ca, ret); return ret; } int bch2_fs_journal_alloc(struct bch_fs *c) { - struct bch_dev *ca; - unsigned i; - - for_each_online_member(ca, c, i) { + for_each_online_member(c, ca) { if (ca->journal.nr) continue; @@ -1225,6 +1317,7 @@ int bch2_fs_journal_init(struct journal *j) static struct lock_class_key res_key; unsigned i; + mutex_init(&j->buf_lock); spin_lock_init(&j->lock); spin_lock_init(&j->err_lock); init_waitqueue_head(&j->wait); @@ -1260,10 +1353,8 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) { struct bch_fs *c = container_of(j, struct bch_fs, journal); union journal_res_state s; - struct bch_dev *ca; unsigned long now = jiffies; - u64 seq; - unsigned i; + u64 nr_writes = j->nr_flush_writes + j->nr_noflush_writes; if (!out->nr_tabstops) printbuf_tabstop_push(out, 24); @@ -1275,20 +1366,23 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) prt_printf(out, "dirty journal entries:\t%llu/%llu\n", fifo_used(&j->pin), j->pin.size); prt_printf(out, "seq:\t\t\t%llu\n", journal_cur_seq(j)); prt_printf(out, "seq_ondisk:\t\t%llu\n", j->seq_ondisk); - prt_printf(out, "last_seq:\t\t%llu\n", journal_last_seq(j)); + prt_printf(out, "last_seq:\t\t%llu\n", journal_last_seq(j)); prt_printf(out, "last_seq_ondisk:\t%llu\n", j->last_seq_ondisk); - prt_printf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk); - prt_printf(out, "watermark:\t\t%s\n", bch2_watermarks[j->watermark]); - prt_printf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved); + prt_printf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk); + prt_printf(out, "watermark:\t\t%s\n", bch2_watermarks[j->watermark]); + prt_printf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved); prt_printf(out, "nr flush writes:\t%llu\n", j->nr_flush_writes); - prt_printf(out, "nr noflush writes:\t%llu\n", j->nr_noflush_writes); - prt_printf(out, "nr direct reclaim:\t%llu\n", j->nr_direct_reclaim); + prt_printf(out, "nr noflush writes:\t%llu\n", j->nr_noflush_writes); + prt_printf(out, "average write size:\t"); + prt_human_readable_u64(out, nr_writes ? div64_u64(j->entry_bytes_written, nr_writes) : 0); + prt_newline(out); + prt_printf(out, "nr direct reclaim:\t%llu\n", j->nr_direct_reclaim); prt_printf(out, "nr background reclaim:\t%llu\n", j->nr_background_reclaim); prt_printf(out, "reclaim kicked:\t\t%u\n", j->reclaim_kicked); - prt_printf(out, "reclaim runs in:\t%u ms\n", time_after(j->next_reclaim, now) + prt_printf(out, "reclaim runs in:\t%u ms\n", time_after(j->next_reclaim, now) ? jiffies_to_msecs(j->next_reclaim - jiffies) : 0); - prt_printf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors); - prt_printf(out, "current entry error:\t%s\n", bch2_journal_errors[j->cur_entry_error]); + prt_printf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors); + prt_printf(out, "current entry error:\t%s\n", bch2_journal_errors[j->cur_entry_error]); prt_printf(out, "current entry:\t\t"); switch (s.cur_entry_offset) { @@ -1304,35 +1398,9 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) } prt_newline(out); - - for (seq = journal_cur_seq(j); - seq >= journal_last_unwritten_seq(j); - --seq) { - i = seq & JOURNAL_BUF_MASK; - - prt_printf(out, "unwritten entry:"); - prt_tab(out); - prt_printf(out, "%llu", seq); - prt_newline(out); - printbuf_indent_add(out, 2); - - prt_printf(out, "refcount:"); - prt_tab(out); - prt_printf(out, "%u", journal_state_count(s, i)); - prt_newline(out); - - prt_printf(out, "sectors:"); - prt_tab(out); - prt_printf(out, "%u", j->buf[i].sectors); - prt_newline(out); - - prt_printf(out, "expires"); - prt_tab(out); - prt_printf(out, "%li jiffies", j->buf[i].expires - jiffies); - prt_newline(out); - - printbuf_indent_sub(out, 2); - } + prt_printf(out, "unwritten entries:"); + prt_newline(out); + bch2_journal_bufs_to_text(out, j); prt_printf(out, "replay done:\t\t%i\n", @@ -1352,8 +1420,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) j->space[journal_space_total].next_entry, j->space[journal_space_total].total); - for_each_member_device_rcu(ca, c, i, - &c->rw_devs[BCH_DATA_journal]) { + for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) { struct journal_device *ja = &ca->journal; if (!test_bit(ca->dev_idx, c->rw_devs[BCH_DATA_journal].d)) @@ -1362,7 +1429,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) if (!ja->nr) continue; - prt_printf(out, "dev %u:\n", i); + prt_printf(out, "dev %u:\n", ca->dev_idx); prt_printf(out, "\tnr\t\t%u\n", ja->nr); prt_printf(out, "\tbucket size\t%u\n", ca->mi.bucket_size); prt_printf(out, "\tavailable\t%u:%u\n", bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), ja->sectors_free); diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 2f768e11aec9..4544ce24bb8a 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -119,7 +119,6 @@ static inline void journal_wake(struct journal *j) { wake_up(&j->wait); closure_wake_up(&j->async_wait); - closure_wake_up(&j->preres_wait); } static inline struct journal_buf *journal_cur_buf(struct journal *j) @@ -239,8 +238,6 @@ bch2_journal_add_entry(struct journal *j, struct journal_res *res, static inline bool journal_entry_empty(struct jset *j) { - struct jset_entry *i; - if (j->seq != j->last_seq) return false; @@ -426,6 +423,7 @@ static inline void bch2_journal_set_replay_done(struct journal *j) void bch2_journal_unblock(struct journal *); void bch2_journal_block(struct journal *); +struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq); void __bch2_journal_debug_to_text(struct printbuf *, struct journal *); void bch2_journal_debug_to_text(struct printbuf *, struct journal *); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 3eb6c3f62a81..bfd6585e746d 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -4,6 +4,7 @@ #include "alloc_foreground.h" #include "btree_io.h" #include "btree_update_interior.h" +#include "btree_write_buffer.h" #include "buckets.h" #include "checksum.h" #include "disk_groups.h" @@ -26,11 +27,15 @@ static struct nonce journal_nonce(const struct jset *jset) }}; } -static bool jset_csum_good(struct bch_fs *c, struct jset *j) +static bool jset_csum_good(struct bch_fs *c, struct jset *j, struct bch_csum *csum) { - return bch2_checksum_type_valid(c, JSET_CSUM_TYPE(j)) && - !bch2_crc_cmp(j->csum, - csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j)); + if (!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(j))) { + *csum = (struct bch_csum) {}; + return false; + } + + *csum = csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j); + return !bch2_crc_cmp(j->csum, *csum); } static inline u32 journal_entry_radix_idx(struct bch_fs *c, u64 seq) @@ -678,17 +683,12 @@ static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs prt_printf(out, "dev=%u", le32_to_cpu(u->dev)); for (i = 0; i < nr_types; i++) { - if (i < BCH_DATA_NR) - prt_printf(out, " %s", bch2_data_types[i]); - else - prt_printf(out, " (unknown data type %u)", i); + bch2_prt_data_type(out, i); prt_printf(out, ": buckets=%llu sectors=%llu fragmented=%llu", le64_to_cpu(u->d[i].buckets), le64_to_cpu(u->d[i].sectors), le64_to_cpu(u->d[i].fragmented)); } - - prt_printf(out, " buckets_ec: %llu", le64_to_cpu(u->buckets_ec)); } static int journal_entry_log_validate(struct bch_fs *c, @@ -725,6 +725,22 @@ static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs journal_entry_btree_keys_to_text(out, c, entry); } +static int journal_entry_write_buffer_keys_validate(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, + enum bkey_invalid_flags flags) +{ + return journal_entry_btree_keys_validate(c, jset, entry, + version, big_endian, READ); +} + +static void journal_entry_write_buffer_keys_to_text(struct printbuf *out, struct bch_fs *c, + struct jset_entry *entry) +{ + journal_entry_btree_keys_to_text(out, c, entry); +} + struct jset_entry_ops { int (*validate)(struct bch_fs *, struct jset *, struct jset_entry *, unsigned, int, @@ -768,7 +784,6 @@ void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c, static int jset_validate_entries(struct bch_fs *c, struct jset *jset, enum bkey_invalid_flags flags) { - struct jset_entry *entry; unsigned version = le32_to_cpu(jset->version); int ret = 0; @@ -920,6 +935,7 @@ static int journal_read_bucket(struct bch_dev *ca, u64 offset = bucket_to_sector(ca, ja->buckets[bucket]), end = offset + ca->mi.bucket_size; bool saw_bad = false, csum_good; + struct printbuf err = PRINTBUF; int ret = 0; pr_debug("reading %u", bucket); @@ -952,7 +968,7 @@ reread: * found on a different device, and missing or * no journal entries will be handled later */ - return 0; + goto out; } j = buf->data; @@ -969,12 +985,12 @@ reread: ret = journal_read_buf_realloc(buf, vstruct_bytes(j)); if (ret) - return ret; + goto err; } goto reread; case JOURNAL_ENTRY_NONE: if (!saw_bad) - return 0; + goto out; /* * On checksum error we don't really trust the size * field of the journal entry we read, so try reading @@ -983,7 +999,7 @@ reread: sectors = block_sectors(c); goto next_block; default: - return ret; + goto err; } /* @@ -993,20 +1009,28 @@ reread: * bucket: */ if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket]) - return 0; + goto out; ja->bucket_seq[bucket] = le64_to_cpu(j->seq); - csum_good = jset_csum_good(c, j); + enum bch_csum_type csum_type = JSET_CSUM_TYPE(j); + struct bch_csum csum; + csum_good = jset_csum_good(c, j, &csum); + if (bch2_dev_io_err_on(!csum_good, ca, BCH_MEMBER_ERROR_checksum, - "journal checksum error")) + "%s", + (printbuf_reset(&err), + prt_str(&err, "journal "), + bch2_csum_err_msg(&err, csum_type, j->csum, csum), + err.buf))) saw_bad = true; ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j), j->encrypted_start, vstruct_end(j) - (void *) j->encrypted_start); bch2_fs_fatal_err_on(ret, c, - "error decrypting journal entry: %i", ret); + "error decrypting journal entry: %s", + bch2_err_str(ret)); mutex_lock(&jlist->lock); ret = journal_entry_add(c, ca, (struct journal_ptr) { @@ -1025,7 +1049,7 @@ reread: case JOURNAL_ENTRY_ADD_OUT_OF_RANGE: break; default: - return ret; + goto err; } next_block: pr_debug("next"); @@ -1034,7 +1058,11 @@ next_block: j = ((void *) j) + (sectors << 9); } - return 0; +out: + ret = 0; +err: + printbuf_exit(&err); + return ret; } static CLOSURE_CALLBACK(bch2_journal_read_device) @@ -1156,8 +1184,6 @@ int bch2_journal_read(struct bch_fs *c, struct journal_list jlist; struct journal_replay *i, **_i, *prev = NULL; struct genradix_iter radix_iter; - struct bch_dev *ca; - unsigned iter; struct printbuf buf = PRINTBUF; bool degraded = false, last_write_torn = false; u64 seq; @@ -1168,7 +1194,7 @@ int bch2_journal_read(struct bch_fs *c, jlist.last_seq = 0; jlist.ret = 0; - for_each_member_device(ca, c, iter) { + for_each_member_device(c, ca) { if (!c->opts.fsck && !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal))) continue; @@ -1334,7 +1360,7 @@ int bch2_journal_read(struct bch_fs *c, continue; for (ptr = 0; ptr < i->nr_ptrs; ptr++) { - ca = bch_dev_bkey_exists(c, i->ptrs[ptr].dev); + struct bch_dev *ca = bch_dev_bkey_exists(c, i->ptrs[ptr].dev); if (!i->ptrs[ptr].csum_good) bch_err_dev_offset(ca, i->ptrs[ptr].sector, @@ -1505,6 +1531,8 @@ done: static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) { + struct bch_fs *c = container_of(j, struct bch_fs, journal); + /* we aren't holding j->lock: */ unsigned new_size = READ_ONCE(j->buf_size_want); void *new_buf; @@ -1512,6 +1540,11 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) if (buf->buf_size >= new_size) return; + size_t btree_write_buffer_size = new_size / 64; + + if (bch2_btree_write_buffer_resize(c, btree_write_buffer_size)) + return; + new_buf = kvpmalloc(new_size, GFP_NOFS|__GFP_NOWARN); if (!new_buf) return; @@ -1604,6 +1637,9 @@ static CLOSURE_CALLBACK(journal_write_done) bch2_journal_reclaim_fast(j); bch2_journal_space_available(j); + track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], + &j->max_in_flight_start, false); + closure_wake_up(&w->wait); journal_wake(j); @@ -1656,7 +1692,6 @@ static CLOSURE_CALLBACK(do_journal_write) struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_dev *ca; struct journal_buf *w = journal_last_unwritten_buf(j); - struct bch_extent_ptr *ptr; struct bio *bio; unsigned sectors = vstruct_sectors(w->data, c->block_bits); @@ -1700,11 +1735,13 @@ static CLOSURE_CALLBACK(do_journal_write) static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) { struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct jset_entry *start, *end, *i, *next, *prev = NULL; + struct jset_entry *start, *end; struct jset *jset = w->data; + struct journal_keys_to_wb wb = { NULL }; unsigned sectors, bytes, u64s; - bool validate_before_checksum = false; unsigned long btree_roots_have = 0; + bool validate_before_checksum = false; + u64 seq = le64_to_cpu(jset->seq); int ret; /* @@ -1715,7 +1752,7 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) * If we wanted to be really fancy here, we could sort all the keys in * the jset and drop keys that were overwritten - probably not worth it: */ - vstruct_for_each_safe(jset, i, next) { + vstruct_for_each(jset, i) { unsigned u64s = le16_to_cpu(i->u64s); /* Empty entry: */ @@ -1732,40 +1769,40 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) * to c->btree_roots we have to get any missing btree roots and * add them to this journal entry: */ - if (i->type == BCH_JSET_ENTRY_btree_root) { + switch (i->type) { + case BCH_JSET_ENTRY_btree_root: bch2_journal_entry_to_btree_root(c, i); __set_bit(i->btree_id, &btree_roots_have); + break; + case BCH_JSET_ENTRY_write_buffer_keys: + EBUG_ON(!w->need_flush_to_write_buffer); + + if (!wb.wb) + bch2_journal_keys_to_write_buffer_start(c, &wb, seq); + + struct bkey_i *k; + jset_entry_for_each_key(i, k) { + ret = bch2_journal_key_to_wb(c, &wb, i->btree_id, k); + if (ret) { + bch2_fs_fatal_error(c, "-ENOMEM flushing journal keys to btree write buffer"); + bch2_journal_keys_to_write_buffer_end(c, &wb); + return ret; + } + } + i->type = BCH_JSET_ENTRY_btree_keys; + break; } - - /* Can we merge with previous entry? */ - if (prev && - i->btree_id == prev->btree_id && - i->level == prev->level && - i->type == prev->type && - i->type == BCH_JSET_ENTRY_btree_keys && - le16_to_cpu(prev->u64s) + u64s <= U16_MAX) { - memmove_u64s_down(vstruct_next(prev), - i->_data, - u64s); - le16_add_cpu(&prev->u64s, u64s); - continue; - } - - /* Couldn't merge, move i into new position (after prev): */ - prev = prev ? vstruct_next(prev) : jset->start; - if (i != prev) - memmove_u64s_down(prev, i, jset_u64s(u64s)); } - prev = prev ? vstruct_next(prev) : jset->start; - jset->u64s = cpu_to_le32((u64 *) prev - jset->_data); + if (wb.wb) + bch2_journal_keys_to_write_buffer_end(c, &wb); + w->need_flush_to_write_buffer = false; start = end = vstruct_last(jset); end = bch2_btree_roots_to_journal_entries(c, end, btree_roots_have); - bch2_journal_super_entries_add_common(c, &end, - le64_to_cpu(jset->seq)); + bch2_journal_super_entries_add_common(c, &end, seq); u64s = (u64 *) end - (u64 *) start; BUG_ON(u64s > j->entry_u64s_reserved); @@ -1788,7 +1825,7 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c)); if (!JSET_NO_FLUSH(jset) && journal_entry_empty(jset)) - j->last_empty_seq = le64_to_cpu(jset->seq); + j->last_empty_seq = seq; if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset))) validate_before_checksum = true; @@ -1847,7 +1884,7 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf * (!w->must_flush && (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) && test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags))) { - w->noflush = true; + w->noflush = true; SET_JSET_NO_FLUSH(w->data, true); w->data->last_seq = 0; w->last_seq = 0; @@ -1866,12 +1903,11 @@ CLOSURE_CALLBACK(bch2_journal_write) { closure_type(j, struct journal, io); struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct bch_dev *ca; struct journal_buf *w = journal_last_unwritten_buf(j); struct bch_replicas_padded replicas; struct bio *bio; struct printbuf journal_debug_buf = PRINTBUF; - unsigned i, nr_rw_members = 0; + unsigned nr_rw_members = 0; int ret; BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); @@ -1884,12 +1920,16 @@ CLOSURE_CALLBACK(bch2_journal_write) if (ret) goto err; + mutex_lock(&j->buf_lock); journal_buf_realloc(j, w); ret = bch2_journal_write_prep(j, w); + mutex_unlock(&j->buf_lock); if (ret) goto err; + j->entry_bytes_written += vstruct_bytes(w->data); + while (1) { spin_lock(&j->lock); ret = journal_write_alloc(j, w); @@ -1927,7 +1967,7 @@ CLOSURE_CALLBACK(bch2_journal_write) if (c->opts.nochanges) goto no_io; - for_each_rw_member(ca, c, i) + for_each_rw_member(c, ca) nr_rw_members++; if (nr_rw_members > 1) @@ -1944,11 +1984,12 @@ CLOSURE_CALLBACK(bch2_journal_write) goto err; if (!JSET_NO_FLUSH(w->data) && w->separate_flush) { - for_each_rw_member(ca, c, i) { + for_each_rw_member(c, ca) { percpu_ref_get(&ca->io_ref); bio = ca->journal.bio; - bio_reset(bio, ca->disk_sb.bdev, REQ_OP_FLUSH); + bio_reset(bio, ca->disk_sb.bdev, + REQ_OP_WRITE|REQ_PREFLUSH); bio->bi_end_io = journal_write_endio; bio->bi_private = ca; closure_bio_submit(bio, cl); diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index ec712104addb..820d25e19e5f 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -3,6 +3,7 @@ #include "bcachefs.h" #include "btree_key_cache.h" #include "btree_update.h" +#include "btree_write_buffer.h" #include "buckets.h" #include "errcode.h" #include "error.h" @@ -50,17 +51,24 @@ unsigned bch2_journal_dev_buckets_available(struct journal *j, return available; } -static inline void journal_set_watermark(struct journal *j, bool low_on_space) +void bch2_journal_set_watermark(struct journal *j) { - unsigned watermark = BCH_WATERMARK_stripe; - - if (low_on_space) - watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim); - if (fifo_free(&j->pin) < j->pin.size / 4) - watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim); - - if (watermark == j->watermark) - return; + struct bch_fs *c = container_of(j, struct bch_fs, journal); + bool low_on_space = j->space[journal_space_clean].total * 4 <= + j->space[journal_space_total].total; + bool low_on_pin = fifo_free(&j->pin) < j->pin.size / 4; + bool low_on_wb = bch2_btree_write_buffer_must_wait(c); + unsigned watermark = low_on_space || low_on_pin || low_on_wb + ? BCH_WATERMARK_reclaim + : BCH_WATERMARK_stripe; + + if (track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_space], + &j->low_on_space_start, low_on_space) || + track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_pin], + &j->low_on_pin_start, low_on_pin) || + track_event_change(&c->times[BCH_TIME_blocked_write_buffer_full], + &j->write_buffer_full_start, low_on_wb)) + trace_and_count(c, journal_full, c); swap(watermark, j->watermark); if (watermark > j->watermark) @@ -128,15 +136,13 @@ static struct journal_space __journal_space_available(struct journal *j, unsigne enum journal_space_from from) { struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct bch_dev *ca; - unsigned i, pos, nr_devs = 0; + unsigned pos, nr_devs = 0; struct journal_space space, dev_space[BCH_SB_MEMBERS_MAX]; BUG_ON(nr_devs_want > ARRAY_SIZE(dev_space)); rcu_read_lock(); - for_each_member_device_rcu(ca, c, i, - &c->rw_devs[BCH_DATA_journal]) { + for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) { if (!ca->journal.nr) continue; @@ -165,19 +171,17 @@ static struct journal_space __journal_space_available(struct journal *j, unsigne void bch2_journal_space_available(struct journal *j) { struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct bch_dev *ca; unsigned clean, clean_ondisk, total; unsigned max_entry_size = min(j->buf[0].buf_size >> 9, j->buf[1].buf_size >> 9); - unsigned i, nr_online = 0, nr_devs_want; + unsigned nr_online = 0, nr_devs_want; bool can_discard = false; int ret = 0; lockdep_assert_held(&j->lock); rcu_read_lock(); - for_each_member_device_rcu(ca, c, i, - &c->rw_devs[BCH_DATA_journal]) { + for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) { struct journal_device *ja = &ca->journal; if (!ja->nr) @@ -208,7 +212,7 @@ void bch2_journal_space_available(struct journal *j) nr_devs_want = min_t(unsigned, nr_online, c->opts.metadata_replicas); - for (i = 0; i < journal_space_nr; i++) + for (unsigned i = 0; i < journal_space_nr; i++) j->space[i] = __journal_space_available(j, nr_devs_want, i); clean_ondisk = j->space[journal_space_clean_ondisk].total; @@ -226,7 +230,7 @@ void bch2_journal_space_available(struct journal *j) else clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags); - journal_set_watermark(j, clean * 4 <= total); + bch2_journal_set_watermark(j); out: j->cur_entry_sectors = !ret ? j->space[journal_space_discarded].next_entry : 0; j->cur_entry_error = ret; @@ -255,12 +259,10 @@ static bool should_discard_bucket(struct journal *j, struct journal_device *ja) void bch2_journal_do_discards(struct journal *j) { struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct bch_dev *ca; - unsigned iter; mutex_lock(&j->discard_lock); - for_each_rw_member(ca, c, iter) { + for_each_rw_member(c, ca) { struct journal_device *ja = &ca->journal; while (should_discard_bucket(j, ja)) { @@ -299,6 +301,7 @@ void bch2_journal_reclaim_fast(struct journal *j) * all btree nodes got written out */ while (!fifo_empty(&j->pin) && + j->pin.front <= j->seq_ondisk && !atomic_read(&fifo_peek_front(&j->pin).count)) { j->pin.front++; popped = true; @@ -367,15 +370,36 @@ static enum journal_pin_type journal_pin_type(journal_pin_flush_fn fn) return JOURNAL_PIN_other; } -void bch2_journal_pin_set(struct journal *j, u64 seq, +static inline void bch2_journal_pin_set_locked(struct journal *j, u64 seq, struct journal_entry_pin *pin, - journal_pin_flush_fn flush_fn) + journal_pin_flush_fn flush_fn, + enum journal_pin_type type) +{ + struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq); + + /* + * flush_fn is how we identify journal pins in debugfs, so must always + * exist, even if it doesn't do anything: + */ + BUG_ON(!flush_fn); + + atomic_inc(&pin_list->count); + pin->seq = seq; + pin->flush = flush_fn; + list_add(&pin->list, &pin_list->list[type]); +} + +void bch2_journal_pin_copy(struct journal *j, + struct journal_entry_pin *dst, + struct journal_entry_pin *src, + journal_pin_flush_fn flush_fn) { - struct journal_entry_pin_list *pin_list; bool reclaim; spin_lock(&j->lock); + u64 seq = READ_ONCE(src->seq); + if (seq < journal_last_seq(j)) { /* * bch2_journal_pin_copy() raced with bch2_journal_pin_drop() on @@ -387,18 +411,34 @@ void bch2_journal_pin_set(struct journal *j, u64 seq, return; } - pin_list = journal_seq_pin(j, seq); + reclaim = __journal_pin_drop(j, dst); - reclaim = __journal_pin_drop(j, pin); + bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(flush_fn)); - atomic_inc(&pin_list->count); - pin->seq = seq; - pin->flush = flush_fn; + if (reclaim) + bch2_journal_reclaim_fast(j); + spin_unlock(&j->lock); - if (flush_fn) - list_add(&pin->list, &pin_list->list[journal_pin_type(flush_fn)]); - else - list_add(&pin->list, &pin_list->flushed); + /* + * If the journal is currently full, we might want to call flush_fn + * immediately: + */ + journal_wake(j); +} + +void bch2_journal_pin_set(struct journal *j, u64 seq, + struct journal_entry_pin *pin, + journal_pin_flush_fn flush_fn) +{ + bool reclaim; + + spin_lock(&j->lock); + + BUG_ON(seq < journal_last_seq(j)); + + reclaim = __journal_pin_drop(j, pin); + + bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(flush_fn)); if (reclaim) bch2_journal_reclaim_fast(j); @@ -537,13 +577,11 @@ static size_t journal_flush_pins(struct journal *j, static u64 journal_seq_to_flush(struct journal *j) { struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct bch_dev *ca; u64 seq_to_flush = 0; - unsigned iter; spin_lock(&j->lock); - for_each_rw_member(ca, c, iter) { + for_each_rw_member(c, ca) { struct journal_device *ja = &ca->journal; unsigned nr_buckets, bucket_to_flush; @@ -747,10 +785,9 @@ int bch2_journal_reclaim_start(struct journal *j) p = kthread_create(bch2_journal_reclaim_thread, j, "bch-reclaim/%s", c->name); ret = PTR_ERR_OR_ZERO(p); - if (ret) { - bch_err_msg(c, ret, "creating journal reclaim thread"); + bch_err_msg(c, ret, "creating journal reclaim thread"); + if (ret) return ret; - } get_task_struct(p); j->reclaim_thread = p; @@ -796,6 +833,7 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush, bool bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush) { + /* time_stats this */ bool did_work = false; if (!test_bit(JOURNAL_STARTED, &j->flags)) diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h index 494d1a6eddb0..ec84c3345281 100644 --- a/fs/bcachefs/journal_reclaim.h +++ b/fs/bcachefs/journal_reclaim.h @@ -16,6 +16,7 @@ static inline void journal_reclaim_kick(struct journal *j) unsigned bch2_journal_dev_buckets_available(struct journal *, struct journal_device *, enum journal_space_from); +void bch2_journal_set_watermark(struct journal *); void bch2_journal_space_available(struct journal *); static inline bool journal_pin_active(struct journal_entry_pin *pin) @@ -47,17 +48,10 @@ static inline void bch2_journal_pin_add(struct journal *j, u64 seq, bch2_journal_pin_set(j, seq, pin, flush_fn); } -static inline void bch2_journal_pin_copy(struct journal *j, - struct journal_entry_pin *dst, - struct journal_entry_pin *src, - journal_pin_flush_fn flush_fn) -{ - /* Guard against racing with journal_pin_drop(src): */ - u64 seq = READ_ONCE(src->seq); - - if (seq) - bch2_journal_pin_add(j, seq, dst, flush_fn); -} +void bch2_journal_pin_copy(struct journal *, + struct journal_entry_pin *, + struct journal_entry_pin *, + journal_pin_flush_fn); static inline void bch2_journal_pin_update(struct journal *j, u64 seq, struct journal_entry_pin *pin, diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c index f9d9aa95bf3a..0200e299cfbb 100644 --- a/fs/bcachefs/journal_seq_blacklist.c +++ b/fs/bcachefs/journal_seq_blacklist.c @@ -267,7 +267,7 @@ retry: while (!(ret = PTR_ERR_OR_ZERO(b)) && b && - !test_bit(BCH_FS_STOPPING, &c->flags)) + !test_bit(BCH_FS_stopping, &c->flags)) b = bch2_btree_iter_next_node(&iter); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index a756b69582e3..38817c7a0851 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -36,6 +36,7 @@ struct journal_buf { bool noflush; /* write has already been kicked off, and was noflush */ bool must_flush; /* something wants a flush */ bool separate_flush; + bool need_flush_to_write_buffer; }; /* @@ -182,6 +183,12 @@ struct journal { darray_u64 early_journal_entries; /* + * Protects journal_buf->data, when accessing without a jorunal + * reservation: for synchronization between the btree write buffer code + * and the journal write path: + */ + struct mutex buf_lock; + /* * Two journal entries -- one is currently open for new entries, the * other is possibly being written out. */ @@ -195,7 +202,6 @@ struct journal { /* Used when waiting because the journal was full */ wait_queue_head_t wait; struct closure_waitlist async_wait; - struct closure_waitlist preres_wait; struct closure io; struct delayed_work write_work; @@ -262,15 +268,19 @@ struct journal { unsigned long last_flush_write; - u64 res_get_blocked_start; u64 write_start_time; u64 nr_flush_writes; u64 nr_noflush_writes; + u64 entry_bytes_written; + + u64 low_on_space_start; + u64 low_on_pin_start; + u64 max_in_flight_start; + u64 write_buffer_full_start; struct bch2_time_stats *flush_write_time; struct bch2_time_stats *noflush_write_time; - struct bch2_time_stats *blocked_time; struct bch2_time_stats *flush_seq_time; #ifdef CONFIG_DEBUG_LOCK_ALLOC diff --git a/fs/bcachefs/keylist.c b/fs/bcachefs/keylist.c index 5699cd4873c8..1b828bddd11b 100644 --- a/fs/bcachefs/keylist.c +++ b/fs/bcachefs/keylist.c @@ -43,8 +43,6 @@ void bch2_keylist_pop_front(struct keylist *l) #ifdef CONFIG_BCACHEFS_DEBUG void bch2_verify_keylist_sorted(struct keylist *l) { - struct bkey_i *k; - for_each_keylist_key(l, k) BUG_ON(bkey_next(k) != l->top && bpos_ge(k->k.p, bkey_next(k)->k.p)); diff --git a/fs/bcachefs/keylist.h b/fs/bcachefs/keylist.h index fe759c7031e0..e687e0e9aede 100644 --- a/fs/bcachefs/keylist.h +++ b/fs/bcachefs/keylist.h @@ -50,18 +50,16 @@ static inline struct bkey_i *bch2_keylist_front(struct keylist *l) } #define for_each_keylist_key(_keylist, _k) \ - for (_k = (_keylist)->keys; \ + for (struct bkey_i *_k = (_keylist)->keys; \ _k != (_keylist)->top; \ _k = bkey_next(_k)) static inline u64 keylist_sectors(struct keylist *keys) { - struct bkey_i *k; u64 ret = 0; for_each_keylist_key(keys, k) ret += k->k.size; - return ret; } diff --git a/fs/bcachefs/logged_ops.c b/fs/bcachefs/logged_ops.c index 8640f7dee0de..ad598105c587 100644 --- a/fs/bcachefs/logged_ops.c +++ b/fs/bcachefs/logged_ops.c @@ -54,16 +54,12 @@ static int resume_logged_op(struct btree_trans *trans, struct btree_iter *iter, int bch2_resume_logged_ops(struct bch_fs *c) { - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - ret = bch2_trans_run(c, - for_each_btree_key2(trans, iter, - BTREE_ID_logged_ops, POS_MIN, BTREE_ITER_PREFETCH, k, + int ret = bch2_trans_run(c, + for_each_btree_key(trans, iter, + BTREE_ID_logged_ops, POS_MIN, + BTREE_ITER_PREFETCH, k, resume_logged_op(trans, &iter, k))); - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; } @@ -85,13 +81,13 @@ static int __bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k) int bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k) { - return commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, + return commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, __bch2_logged_op_start(trans, k)); } void bch2_logged_op_finish(struct btree_trans *trans, struct bkey_i *k) { - int ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, + int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_btree_delete(trans, BTREE_ID_logged_ops, k->k.p, 0)); /* * This needs to be a fatal error because we've left an unfinished diff --git a/fs/bcachefs/logged_ops_format.h b/fs/bcachefs/logged_ops_format.h new file mode 100644 index 000000000000..6a4bf7129dba --- /dev/null +++ b/fs/bcachefs/logged_ops_format.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_LOGGED_OPS_FORMAT_H +#define _BCACHEFS_LOGGED_OPS_FORMAT_H + +struct bch_logged_op_truncate { + struct bch_val v; + __le32 subvol; + __le32 pad; + __le64 inum; + __le64 new_i_size; +}; + +enum logged_op_finsert_state { + LOGGED_OP_FINSERT_start, + LOGGED_OP_FINSERT_shift_extents, + LOGGED_OP_FINSERT_finish, +}; + +struct bch_logged_op_finsert { + struct bch_val v; + __u8 state; + __u8 pad[3]; + __le32 subvol; + __le64 inum; + __le64 dst_offset; + __le64 src_offset; + __le64 pos; +}; + +#endif /* _BCACHEFS_LOGGED_OPS_FORMAT_H */ diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c index a5cc0ed195d6..7a4ca5a28b3e 100644 --- a/fs/bcachefs/lru.c +++ b/fs/bcachefs/lru.c @@ -147,18 +147,13 @@ fsck_err: int bch2_check_lrus(struct bch_fs *c) { - struct btree_iter iter; - struct bkey_s_c k; struct bpos last_flushed_pos = POS_MIN; - int ret = 0; - - ret = bch2_trans_run(c, + int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_lru, POS_MIN, BTREE_ITER_PREFETCH, k, - NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw, bch2_check_lru_key(trans, &iter, k, &last_flushed_pos))); - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; } diff --git a/fs/bcachefs/mean_and_variance.c b/fs/bcachefs/mean_and_variance.c index 1f0801e2e565..bf0ef668fd38 100644 --- a/fs/bcachefs/mean_and_variance.c +++ b/fs/bcachefs/mean_and_variance.c @@ -62,6 +62,7 @@ EXPORT_SYMBOL_GPL(u128_div); /** * mean_and_variance_get_mean() - get mean from @s + * @s: mean and variance number of samples and their sums */ s64 mean_and_variance_get_mean(struct mean_and_variance s) { @@ -71,6 +72,7 @@ EXPORT_SYMBOL_GPL(mean_and_variance_get_mean); /** * mean_and_variance_get_variance() - get variance from @s1 + * @s1: mean and variance number of samples and sums * * see linked pdf equation 12. */ @@ -89,6 +91,7 @@ EXPORT_SYMBOL_GPL(mean_and_variance_get_variance); /** * mean_and_variance_get_stddev() - get standard deviation from @s + * @s: mean and variance number of samples and their sums */ u32 mean_and_variance_get_stddev(struct mean_and_variance s) { @@ -98,8 +101,8 @@ EXPORT_SYMBOL_GPL(mean_and_variance_get_stddev); /** * mean_and_variance_weighted_update() - exponentially weighted variant of mean_and_variance_update() - * @s1: .. - * @s2: .. + * @s: mean and variance number of samples and their sums + * @x: new value to include in the &mean_and_variance_weighted * * see linked pdf: function derived from equations 140-143 where alpha = 2^w. * values are stored bitshifted for performance and added precision. @@ -129,6 +132,7 @@ EXPORT_SYMBOL_GPL(mean_and_variance_weighted_update); /** * mean_and_variance_weighted_get_mean() - get mean from @s + * @s: mean and variance number of samples and their sums */ s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s) { @@ -138,6 +142,7 @@ EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_mean); /** * mean_and_variance_weighted_get_variance() -- get variance from @s + * @s: mean and variance number of samples and their sums */ u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s) { @@ -148,6 +153,7 @@ EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_variance); /** * mean_and_variance_weighted_get_stddev() - get standard deviation from @s + * @s: mean and variance number of samples and their sums */ u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s) { diff --git a/fs/bcachefs/mean_and_variance.h b/fs/bcachefs/mean_and_variance.h index 647505010b39..b2be565bb8f2 100644 --- a/fs/bcachefs/mean_and_variance.h +++ b/fs/bcachefs/mean_and_variance.h @@ -12,9 +12,12 @@ /* * u128_u: u128 user mode, because not all architectures support a real int128 * type + * + * We don't use this version in userspace, because in userspace we link with + * Rust and rustc has issues with u128. */ -#ifdef __SIZEOF_INT128__ +#if defined(__SIZEOF_INT128__) && defined(__KERNEL__) typedef struct { unsigned __int128 v; diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index e3a51f6d6c9b..5623cee3ef86 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -79,8 +79,6 @@ static int bch2_dev_usrdata_drop_key(struct btree_trans *trans, static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) { struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_s_c k; enum btree_id id; int ret = 0; @@ -90,7 +88,7 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) ret = for_each_btree_key_commit(trans, iter, id, POS_MIN, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, - NULL, NULL, BTREE_INSERT_NOFAIL, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags)); if (ret) break; @@ -145,10 +143,9 @@ retry: continue; } - if (ret) { - bch_err_msg(c, ret, "updating btree node key"); + bch_err_msg(c, ret, "updating btree node key"); + if (ret) break; - } next: bch2_btree_iter_next_node(&iter); } diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 54830ee0ed88..bf68ea49447b 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -6,9 +6,11 @@ #include "backpointers.h" #include "bkey_buf.h" #include "btree_gc.h" +#include "btree_io.h" #include "btree_update.h" #include "btree_update_interior.h" #include "btree_write_buffer.h" +#include "compress.h" #include "disk_groups.h" #include "ec.h" #include "errcode.h" @@ -27,12 +29,53 @@ #include <linux/ioprio.h> #include <linux/kthread.h> -static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k) +const char * const bch2_data_ops_strs[] = { +#define x(t, n, ...) [n] = #t, + BCH_DATA_OPS() +#undef x + NULL +}; + +static void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) +{ + printbuf_tabstop_push(out, 20); + prt_str(out, "rewrite ptrs:"); + prt_tab(out); + bch2_prt_u64_base2(out, data_opts->rewrite_ptrs); + prt_newline(out); + + prt_str(out, "kill ptrs: "); + prt_tab(out); + bch2_prt_u64_base2(out, data_opts->kill_ptrs); + prt_newline(out); + + prt_str(out, "target: "); + prt_tab(out); + bch2_target_to_text(out, c, data_opts->target); + prt_newline(out); + + prt_str(out, "compression: "); + prt_tab(out); + bch2_compression_opt_to_text(out, background_compression(*io_opts)); + prt_newline(out); + + prt_str(out, "extra replicas: "); + prt_tab(out); + prt_u64(out, data_opts->extra_replicas); +} + +static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) { if (trace_move_extent_enabled()) { struct printbuf buf = PRINTBUF; bch2_bkey_val_to_text(&buf, c, k); + prt_newline(&buf); + bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts); trace_move_extent(c, buf.buf); printbuf_exit(&buf); } @@ -63,7 +106,7 @@ struct moving_io { struct data_update write; /* Must be last since it is variable size */ - struct bio_vec bi_inline_vecs[0]; + struct bio_vec bi_inline_vecs[]; }; static void move_free(struct moving_io *io) @@ -104,6 +147,15 @@ static void move_write(struct moving_io *io) return; } + if (trace_move_extent_write_enabled()) { + struct bch_fs *c = io->write.op.c; + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(io->write.k.k)); + trace_move_extent_write(c, buf.buf); + printbuf_exit(&buf); + } + closure_get(&io->write.ctxt->cl); atomic_add(io->write_sectors, &io->write.ctxt->write_sectors); atomic_inc(&io->write.ctxt->write_ios); @@ -152,7 +204,7 @@ void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt) atomic_read(&ctxt->write_sectors) != sectors_pending); } -static void bch2_moving_ctxt_flush_all(struct moving_context *ctxt) +void bch2_moving_ctxt_flush_all(struct moving_context *ctxt) { move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads)); bch2_trans_unlock_long(ctxt->trans); @@ -211,7 +263,7 @@ void bch2_move_stats_exit(struct bch_move_stats *stats, struct bch_fs *c) trace_move_data(c, stats); } -void bch2_move_stats_init(struct bch_move_stats *stats, char *name) +void bch2_move_stats_init(struct bch_move_stats *stats, const char *name) { memset(stats, 0, sizeof(*stats)); stats->data_type = BCH_DATA_user; @@ -234,9 +286,10 @@ int bch2_move_extent(struct moving_context *ctxt, unsigned sectors = k.k->size, pages; int ret = -ENOMEM; + trace_move_extent2(c, k, &io_opts, &data_opts); + if (ctxt->stats) ctxt->stats->pos = BBPOS(iter->btree_id, iter->pos); - trace_move_extent2(c, k); bch2_data_update_opts_normalize(k, &data_opts); @@ -342,7 +395,8 @@ err: bch2_err_matches(ret, BCH_ERR_transaction_restart)) return ret; - this_cpu_inc(c->counters[BCH_COUNTER_move_extent_start_fail]); + count_event(c, move_extent_start_fail); + if (trace_move_extent_start_fail_enabled()) { struct printbuf buf = PRINTBUF; @@ -364,13 +418,10 @@ struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, int ret = 0; if (io_opts->cur_inum != extent_k.k->p.inode) { - struct btree_iter iter; - struct bkey_s_c k; - io_opts->d.nr = 0; - for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_k.k->p.inode), - BTREE_ITER_ALL_SNAPSHOTS, k, ret) { + ret = for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_k.k->p.inode), + BTREE_ITER_ALL_SNAPSHOTS, k, ({ if (k.k->p.offset != extent_k.k->p.inode) break; @@ -383,11 +434,8 @@ struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, struct snapshot_io_opts_entry e = { .snapshot = k.k->p.snapshot }; bch2_inode_opts_get(&e.io_opts, trans->c, &inode); - ret = darray_push(&io_opts->d, e); - if (ret) - break; - } - bch2_trans_iter_exit(trans, &iter); + darray_push(&io_opts->d, e); + })); io_opts->cur_inum = extent_k.k->p.inode; } @@ -395,12 +443,10 @@ struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, if (ret) return ERR_PTR(ret); - if (extent_k.k->p.snapshot) { - struct snapshot_io_opts_entry *i; + if (extent_k.k->p.snapshot) darray_for_each(io_opts->d, i) if (bch2_snapshot_is_ancestor(c, extent_k.k->p.snapshot, i->snapshot)) return &i->io_opts; - } return &io_opts->fs_io_opts; } @@ -628,7 +674,7 @@ int bch2_move_data(struct bch_fs *c, return ret; } -int __bch2_evacuate_bucket(struct moving_context *ctxt, +int bch2_evacuate_bucket(struct moving_context *ctxt, struct move_bucket_in_flight *bucket_in_flight, struct bpos bucket, int gen, struct data_update_opts _data_opts) @@ -664,21 +710,19 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt, bkey_err(k = bch2_btree_iter_peek_slot(&iter))); bch2_trans_iter_exit(trans, &iter); - if (ret) { - bch_err_msg(c, ret, "looking up alloc key"); + bch_err_msg(c, ret, "looking up alloc key"); + if (ret) goto err; - } a = bch2_alloc_to_v4(k, &a_convert); - dirty_sectors = a->dirty_sectors; + dirty_sectors = bch2_bucket_sectors_dirty(*a); bucket_size = bch_dev_bkey_exists(c, bucket.inode)->mi.bucket_size; fragmentation = a->fragmentation_lru; - ret = bch2_btree_write_buffer_flush(trans); - if (ret) { - bch_err_msg(c, ret, "flushing btree write buffer"); + ret = bch2_btree_write_buffer_tryflush(trans); + bch_err_msg(c, ret, "flushing btree write buffer"); + if (ret) goto err; - } while (!(ret = bch2_move_ratelimit(ctxt))) { if (is_kthread && kthread_should_stop()) @@ -697,9 +741,6 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt, break; if (!bp.level) { - const struct bch_extent_ptr *ptr; - unsigned i = 0; - k = bch2_backpointer_get_key(trans, &iter, bp_pos, bp, 0); ret = bkey_err(k); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) @@ -722,6 +763,7 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt, data_opts.target = io_opts.background_target; data_opts.rewrite_ptrs = 0; + unsigned i = 0; bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) { if (ptr->dev == bucket.inode) { data_opts.rewrite_ptrs |= 1U << i; @@ -763,6 +805,8 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt, if (!b) goto next; + unsigned sectors = btree_ptr_sectors_written(&b->key); + ret = bch2_btree_node_rewrite(trans, &iter, b, 0); bch2_trans_iter_exit(trans, &iter); @@ -772,11 +816,10 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt, goto err; if (ctxt->rate) - bch2_ratelimit_increment(ctxt->rate, - c->opts.btree_node_size >> 9); + bch2_ratelimit_increment(ctxt->rate, sectors); if (ctxt->stats) { - atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_seen); - atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_moved); + atomic64_add(sectors, &ctxt->stats->sectors_seen); + atomic64_add(sectors, &ctxt->stats->sectors_moved); } } next: @@ -789,31 +832,13 @@ err: return ret; } -int bch2_evacuate_bucket(struct bch_fs *c, - struct bpos bucket, int gen, - struct data_update_opts data_opts, - struct bch_ratelimit *rate, - struct bch_move_stats *stats, - struct write_point_specifier wp, - bool wait_on_copygc) -{ - struct moving_context ctxt; - int ret; - - bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc); - ret = __bch2_evacuate_bucket(&ctxt, NULL, bucket, gen, data_opts); - bch2_moving_ctxt_exit(&ctxt); - - return ret; -} - typedef bool (*move_btree_pred)(struct bch_fs *, void *, struct btree *, struct bch_io_opts *, struct data_update_opts *); static int bch2_move_btree(struct bch_fs *c, - enum btree_id start_btree_id, struct bpos start_pos, - enum btree_id end_btree_id, struct bpos end_pos, + struct bbpos start, + struct bbpos end, move_btree_pred pred, void *arg, struct bch_move_stats *stats) { @@ -823,7 +848,7 @@ static int bch2_move_btree(struct bch_fs *c, struct btree_trans *trans; struct btree_iter iter; struct btree *b; - enum btree_id id; + enum btree_id btree; struct data_update_opts data_opts; int ret = 0; @@ -834,15 +859,15 @@ static int bch2_move_btree(struct bch_fs *c, stats->data_type = BCH_DATA_btree; - for (id = start_btree_id; - id <= min_t(unsigned, end_btree_id, btree_id_nr_alive(c) - 1); - id++) { - stats->pos = BBPOS(id, POS_MIN); + for (btree = start.btree; + btree <= min_t(unsigned, end.btree, btree_id_nr_alive(c) - 1); + btree ++) { + stats->pos = BBPOS(btree, POS_MIN); - if (!bch2_btree_id_root(c, id)->b) + if (!bch2_btree_id_root(c, btree)->b) continue; - bch2_trans_node_iter_init(trans, &iter, id, POS_MIN, 0, 0, + bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, 0, 0, BTREE_ITER_PREFETCH); retry: ret = 0; @@ -852,8 +877,8 @@ retry: if (kthread && kthread_should_stop()) break; - if ((cmp_int(id, end_btree_id) ?: - bpos_cmp(b->key.k.p, end_pos)) > 0) + if ((cmp_int(btree, end.btree) ?: + bpos_cmp(b->key.k.p, end.pos)) > 0) break; stats->pos = BBPOS(iter.btree_id, iter.pos); @@ -910,7 +935,6 @@ static bool migrate_pred(struct bch_fs *c, void *arg, struct data_update_opts *data_opts) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr; struct bch_ioctl_data *op = arg; unsigned i = 0; @@ -990,8 +1014,8 @@ int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats) int ret; ret = bch2_move_btree(c, - 0, POS_MIN, - BTREE_ID_NR, SPOS_MAX, + BBPOS_MIN, + BBPOS_MAX, rewrite_old_nodes_pred, c, stats); if (!ret) { mutex_lock(&c->sb_lock); @@ -1006,79 +1030,109 @@ int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats) return ret; } +static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg, + struct bkey_s_c k, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) +{ + unsigned durability = bch2_bkey_durability(c, k); + unsigned replicas = bkey_is_btree_ptr(k.k) + ? c->opts.metadata_replicas + : io_opts->data_replicas; + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + unsigned i = 0; + + bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) { + unsigned d = bch2_extent_ptr_durability(c, &p); + + if (d && durability - d >= replicas) { + data_opts->kill_ptrs |= BIT(i); + durability -= d; + } + + i++; + } + + return data_opts->kill_ptrs != 0; +} + +static bool drop_extra_replicas_btree_pred(struct bch_fs *c, void *arg, + struct btree *b, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) +{ + return drop_extra_replicas_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts); +} + int bch2_data_job(struct bch_fs *c, struct bch_move_stats *stats, struct bch_ioctl_data op) { + struct bbpos start = BBPOS(op.start_btree, op.start_pos); + struct bbpos end = BBPOS(op.end_btree, op.end_pos); int ret = 0; + if (op.op >= BCH_DATA_OP_NR) + return -EINVAL; + + bch2_move_stats_init(stats, bch2_data_ops_strs[op.op]); + switch (op.op) { - case BCH_DATA_OP_REREPLICATE: - bch2_move_stats_init(stats, "rereplicate"); + case BCH_DATA_OP_rereplicate: stats->data_type = BCH_DATA_journal; ret = bch2_journal_flush_device_pins(&c->journal, -1); - - ret = bch2_move_btree(c, - op.start_btree, op.start_pos, - op.end_btree, op.end_pos, + ret = bch2_move_btree(c, start, end, rereplicate_btree_pred, c, stats) ?: ret; - ret = bch2_replicas_gc2(c) ?: ret; - - ret = bch2_move_data(c, - (struct bbpos) { op.start_btree, op.start_pos }, - (struct bbpos) { op.end_btree, op.end_pos }, + ret = bch2_move_data(c, start, end, NULL, stats, writepoint_hashed((unsigned long) current), true, rereplicate_pred, c) ?: ret; ret = bch2_replicas_gc2(c) ?: ret; - - bch2_move_stats_exit(stats, c); break; - case BCH_DATA_OP_MIGRATE: + case BCH_DATA_OP_migrate: if (op.migrate.dev >= c->sb.nr_devices) return -EINVAL; - bch2_move_stats_init(stats, "migrate"); stats->data_type = BCH_DATA_journal; ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev); - - ret = bch2_move_btree(c, - op.start_btree, op.start_pos, - op.end_btree, op.end_pos, + ret = bch2_move_btree(c, start, end, migrate_btree_pred, &op, stats) ?: ret; - ret = bch2_replicas_gc2(c) ?: ret; - - ret = bch2_move_data(c, - (struct bbpos) { op.start_btree, op.start_pos }, - (struct bbpos) { op.end_btree, op.end_pos }, + ret = bch2_move_data(c, start, end, NULL, stats, writepoint_hashed((unsigned long) current), true, migrate_pred, &op) ?: ret; ret = bch2_replicas_gc2(c) ?: ret; - - bch2_move_stats_exit(stats, c); break; - case BCH_DATA_OP_REWRITE_OLD_NODES: - bch2_move_stats_init(stats, "rewrite_old_nodes"); + case BCH_DATA_OP_rewrite_old_nodes: ret = bch2_scan_old_btree_nodes(c, stats); - bch2_move_stats_exit(stats, c); + break; + case BCH_DATA_OP_drop_extra_replicas: + ret = bch2_move_btree(c, start, end, + drop_extra_replicas_btree_pred, c, stats) ?: ret; + ret = bch2_move_data(c, start, end, NULL, stats, + writepoint_hashed((unsigned long) current), + true, + drop_extra_replicas_pred, c) ?: ret; + ret = bch2_replicas_gc2(c) ?: ret; break; default: ret = -EINVAL; } + bch2_move_stats_exit(stats, c); return ret; } void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats) { - prt_printf(out, "%s: data type=%s pos=", - stats->name, - bch2_data_types[stats->data_type]); + prt_printf(out, "%s: data type==", stats->name); + bch2_prt_data_type(out, stats->data_type); + prt_str(out, " pos="); bch2_bbpos_to_text(out, stats->pos); prt_newline(out); printbuf_indent_add(out, 2); diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h index 0906aa2d1de2..9baf3093a678 100644 --- a/fs/bcachefs/move.h +++ b/fs/bcachefs/move.h @@ -75,12 +75,15 @@ do { \ typedef bool (*move_pred_fn)(struct bch_fs *, void *, struct bkey_s_c, struct bch_io_opts *, struct data_update_opts *); +extern const char * const bch2_data_ops_strs[]; + void bch2_moving_ctxt_exit(struct moving_context *); void bch2_moving_ctxt_init(struct moving_context *, struct bch_fs *, struct bch_ratelimit *, struct bch_move_stats *, struct write_point_specifier, bool); struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *); void bch2_moving_ctxt_do_pending_writes(struct moving_context *); +void bch2_moving_ctxt_flush_all(struct moving_context *); void bch2_move_ctxt_wait_for_io(struct moving_context *); int bch2_move_ratelimit(struct moving_context *); @@ -133,23 +136,17 @@ int bch2_move_data(struct bch_fs *, bool, move_pred_fn, void *); -int __bch2_evacuate_bucket(struct moving_context *, +int bch2_evacuate_bucket(struct moving_context *, struct move_bucket_in_flight *, struct bpos, int, struct data_update_opts); -int bch2_evacuate_bucket(struct bch_fs *, struct bpos, int, - struct data_update_opts, - struct bch_ratelimit *, - struct bch_move_stats *, - struct write_point_specifier, - bool); int bch2_data_job(struct bch_fs *, struct bch_move_stats *, struct bch_ioctl_data); void bch2_move_stats_to_text(struct printbuf *, struct bch_move_stats *); void bch2_move_stats_exit(struct bch_move_stats *, struct bch_fs *); -void bch2_move_stats_init(struct bch_move_stats *, char *); +void bch2_move_stats_init(struct bch_move_stats *, const char *); void bch2_fs_moving_ctxts_to_text(struct printbuf *, struct bch_fs *); diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index a84e79f79e5e..69e06a84dad4 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -91,7 +91,7 @@ static int bch2_bucket_is_movable(struct btree_trans *trans, a = bch2_alloc_to_v4(k, &_a); b->k.gen = a->gen; - b->sectors = a->dirty_sectors; + b->sectors = bch2_bucket_sectors_dirty(*a); ret = data_type_movable(a->data_type) && a->fragmentation_lru && @@ -145,20 +145,21 @@ static int bch2_copygc_get_buckets(struct moving_context *ctxt, { struct btree_trans *trans = ctxt->trans; struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; size_t nr_to_get = max_t(size_t, 16U, buckets_in_flight->nr / 4); size_t saw = 0, in_flight = 0, not_movable = 0, sectors = 0; int ret; move_buckets_wait(ctxt, buckets_in_flight, false); - ret = bch2_btree_write_buffer_flush(trans); - if (bch2_fs_fatal_err_on(ret, c, "%s: error %s from bch2_btree_write_buffer_flush()", + ret = bch2_btree_write_buffer_tryflush(trans); + if (bch2_err_matches(ret, EROFS)) + return ret; + + if (bch2_fs_fatal_err_on(ret, c, "%s: error %s from bch2_btree_write_buffer_tryflush()", __func__, bch2_err_str(ret))) return ret; - ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_lru, + ret = for_each_btree_key_upto(trans, iter, BTREE_ID_lru, lru_pos(BCH_LRU_FRAGMENTATION_START, 0, 0), lru_pos(BCH_LRU_FRAGMENTATION_START, U64_MAX, LRU_TIME_MAX), 0, k, ({ @@ -167,15 +168,23 @@ static int bch2_copygc_get_buckets(struct moving_context *ctxt, saw++; - if (!bch2_bucket_is_movable(trans, &b, lru_pos_time(k.k->p))) + ret2 = bch2_bucket_is_movable(trans, &b, lru_pos_time(k.k->p)); + if (ret2 < 0) + goto err; + + if (!ret2) not_movable++; else if (bucket_in_flight(buckets_in_flight, b.k)) in_flight++; else { - ret2 = darray_push(buckets, b) ?: buckets->nr >= nr_to_get; - if (ret2 >= 0) - sectors += b.sectors; + ret2 = darray_push(buckets, b); + if (ret2) + goto err; + sectors += b.sectors; } + + ret2 = buckets->nr >= nr_to_get; +err: ret2; })); @@ -198,7 +207,6 @@ static int bch2_copygc(struct moving_context *ctxt, }; move_buckets buckets = { 0 }; struct move_bucket_in_flight *f; - struct move_bucket *i; u64 moved = atomic64_read(&ctxt->stats->sectors_moved); int ret = 0; @@ -221,7 +229,7 @@ static int bch2_copygc(struct moving_context *ctxt, break; } - ret = __bch2_evacuate_bucket(ctxt, f, f->bucket.k.bucket, + ret = bch2_evacuate_bucket(ctxt, f, f->bucket.k.bucket, f->bucket.k.gen, data_opts); if (ret) goto err; @@ -259,19 +267,16 @@ err: */ unsigned long bch2_copygc_wait_amount(struct bch_fs *c) { - struct bch_dev *ca; - unsigned dev_idx; s64 wait = S64_MAX, fragmented_allowed, fragmented; - unsigned i; - for_each_rw_member(ca, c, dev_idx) { + for_each_rw_member(c, ca) { struct bch_dev_usage usage = bch2_dev_usage_read(ca); fragmented_allowed = ((__dev_buckets_available(ca, usage, BCH_WATERMARK_stripe) * ca->mi.bucket_size) >> 1); fragmented = 0; - for (i = 0; i < BCH_DATA_NR; i++) + for (unsigned i = 0; i < BCH_DATA_NR; i++) if (data_type_movable(i)) fragmented += usage.d[i].fragmented; @@ -313,9 +318,9 @@ static int bch2_copygc_thread(void *arg) if (!buckets) return -ENOMEM; ret = rhashtable_init(&buckets->table, &bch_move_bucket_params); + bch_err_msg(c, ret, "allocating copygc buckets in flight"); if (ret) { kfree(buckets); - bch_err_msg(c, ret, "allocating copygc buckets in flight"); return ret; } @@ -334,7 +339,8 @@ static int bch2_copygc_thread(void *arg) if (!c->copy_gc_enabled) { move_buckets_wait(&ctxt, buckets, true); - kthread_wait_freezable(c->copy_gc_enabled); + kthread_wait_freezable(c->copy_gc_enabled || + kthread_should_stop()); } if (unlikely(freezing(current))) { @@ -411,10 +417,9 @@ int bch2_copygc_start(struct bch_fs *c) t = kthread_create(bch2_copygc_thread, c, "bch-copygc/%s", c->name); ret = PTR_ERR_OR_ZERO(t); - if (ret) { - bch_err_msg(c, ret, "creating copygc thread"); + bch_err_msg(c, ret, "creating copygc thread"); + if (ret) return ret; - } get_task_struct(t); diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index 8dd4046cca41..b1ed0b9a20d3 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -52,7 +52,7 @@ const char * const bch2_csum_opts[] = { NULL }; -const char * const bch2_compression_types[] = { +const char * const __bch2_compression_types[] = { BCH_COMPRESSION_TYPES() NULL }; @@ -72,7 +72,7 @@ const char * const bch2_str_hash_opts[] = { NULL }; -const char * const bch2_data_types[] = { +const char * const __bch2_data_types[] = { BCH_DATA_TYPES() NULL }; @@ -279,14 +279,14 @@ int bch2_opt_validate(const struct bch_option *opt, u64 v, struct printbuf *err) if (err) prt_printf(err, "%s: not a multiple of 512", opt->attr.name); - return -EINVAL; + return -BCH_ERR_opt_parse_error; } if ((opt->flags & OPT_MUST_BE_POW_2) && !is_power_of_2(v)) { if (err) prt_printf(err, "%s: must be a power of two", opt->attr.name); - return -EINVAL; + return -BCH_ERR_opt_parse_error; } if (opt->fn.validate) diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 8526f177450a..9a4b7faa3765 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -18,11 +18,11 @@ extern const char * const bch2_sb_compat[]; extern const char * const __bch2_btree_ids[]; extern const char * const bch2_csum_types[]; extern const char * const bch2_csum_opts[]; -extern const char * const bch2_compression_types[]; +extern const char * const __bch2_compression_types[]; extern const char * const bch2_compression_opts[]; extern const char * const bch2_str_hash_types[]; extern const char * const bch2_str_hash_opts[]; -extern const char * const bch2_data_types[]; +extern const char * const __bch2_data_types[]; extern const char * const bch2_member_states[]; extern const char * const bch2_jset_entry_types[]; extern const char * const bch2_fs_usage_types[]; @@ -233,11 +233,6 @@ enum fsck_err_opts { OPT_BOOL(), \ BCH2_NO_SB_OPT, true, \ NULL, "Stash pointer to in memory btree node in btree ptr")\ - x(btree_write_buffer_size, u32, \ - OPT_FS|OPT_MOUNT, \ - OPT_UINT(16, (1U << 20) - 1), \ - BCH2_NO_SB_OPT, 1U << 13, \ - NULL, "Number of btree write buffer entries") \ x(gc_reserve_percent, u8, \ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_UINT(5, 21), \ @@ -394,7 +389,7 @@ enum fsck_err_opts { BCH2_NO_SB_OPT, BCH_SB_SECTOR, \ "offset", "Sector offset of superblock") \ x(read_only, u8, \ - OPT_FS, \ + OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ BCH2_NO_SB_OPT, false, \ NULL, NULL) \ @@ -419,6 +414,11 @@ enum fsck_err_opts { OPT_BOOL(), \ BCH2_NO_SB_OPT, false, \ NULL, "Allocate the buckets_nouse bitmap") \ + x(stdio, u64, \ + 0, \ + OPT_UINT(0, S64_MAX), \ + BCH2_NO_SB_OPT, false, \ + NULL, "Pointer to a struct stdio_redirect") \ x(project, u8, \ OPT_INODE, \ OPT_BOOL(), \ @@ -458,7 +458,13 @@ enum fsck_err_opts { OPT_UINT(0, BCH_REPLICAS_MAX), \ BCH2_NO_SB_OPT, 1, \ "n", "Data written to this device will be considered\n"\ - "to have already been replicated n times") + "to have already been replicated n times") \ + x(btree_node_prefetch, u8, \ + OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, true, \ + NULL, "BTREE_ITER_PREFETCH casuse btree nodes to be\n"\ + " prefetched sequentially") struct bch_opts { #define x(_name, _bits, ...) unsigned _name##_defined:1; @@ -558,6 +564,11 @@ struct bch_io_opts { #undef x }; +static inline unsigned background_compression(struct bch_io_opts opts) +{ + return opts.background_compression ?: opts.compression; +} + struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts); bool bch2_opt_is_inode_opt(enum bch_opt_id); diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index a54647c36b85..e68b34eab90a 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -599,14 +599,9 @@ advance: int bch2_fs_quota_read(struct bch_fs *c) { - struct bch_sb_field_quota *sb_quota; - struct btree_trans *trans; - struct btree_iter iter; - struct bkey_s_c k; - int ret; mutex_lock(&c->sb_lock); - sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb); + struct bch_sb_field_quota *sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb); if (!sb_quota) { mutex_unlock(&c->sb_lock); return -BCH_ERR_ENOSPC_sb_quota; @@ -615,19 +610,14 @@ int bch2_fs_quota_read(struct bch_fs *c) bch2_sb_quota_read(c); mutex_unlock(&c->sb_lock); - trans = bch2_trans_get(c); - - ret = for_each_btree_key2(trans, iter, BTREE_ID_quotas, - POS_MIN, BTREE_ITER_PREFETCH, k, - __bch2_quota_set(c, k, NULL)) ?: - for_each_btree_key2(trans, iter, BTREE_ID_inodes, - POS_MIN, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, - bch2_fs_quota_read_inode(trans, &iter, k)); - - bch2_trans_put(trans); - - if (ret) - bch_err_fn(c, ret); + int ret = bch2_trans_run(c, + for_each_btree_key(trans, iter, BTREE_ID_quotas, POS_MIN, + BTREE_ITER_PREFETCH, k, + __bch2_quota_set(c, k, NULL)) ?: + for_each_btree_key(trans, iter, BTREE_ID_inodes, POS_MIN, + BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + bch2_fs_quota_read_inode(trans, &iter, k))); + bch_err_fn(c, ret); return ret; } diff --git a/fs/bcachefs/quota_format.h b/fs/bcachefs/quota_format.h new file mode 100644 index 000000000000..dc34347ef6c7 --- /dev/null +++ b/fs/bcachefs/quota_format.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_QUOTA_FORMAT_H +#define _BCACHEFS_QUOTA_FORMAT_H + +/* KEY_TYPE_quota: */ + +enum quota_types { + QTYP_USR = 0, + QTYP_GRP = 1, + QTYP_PRJ = 2, + QTYP_NR = 3, +}; + +enum quota_counters { + Q_SPC = 0, + Q_INO = 1, + Q_COUNTERS = 2, +}; + +struct bch_quota_counter { + __le64 hardlimit; + __le64 softlimit; +}; + +struct bch_quota { + struct bch_val v; + struct bch_quota_counter c[Q_COUNTERS]; +} __packed __aligned(8); + +/* BCH_SB_FIELD_quota: */ + +struct bch_sb_quota_counter { + __le32 timelimit; + __le32 warnlimit; +}; + +struct bch_sb_quota_type { + __le64 flags; + struct bch_sb_quota_counter c[Q_COUNTERS]; +}; + +struct bch_sb_field_quota { + struct bch_sb_field field; + struct bch_sb_quota_type q[QTYP_NR]; +} __packed __aligned(8); + +#endif /* _BCACHEFS_QUOTA_FORMAT_H */ diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 3319190b8d9c..22d1017aa49b 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -69,7 +69,7 @@ err: int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum) { - int ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, + int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw, __bch2_set_rebalance_needs_scan(trans, inum)); rebalance_wakeup(c); return ret; @@ -125,7 +125,7 @@ static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans, extent_entry_drop(bkey_i_to_s(n), (void *) bch2_bkey_rebalance_opts(bkey_i_to_s_c(n))); - return bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL); + return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); } static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, @@ -171,6 +171,20 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, return bkey_s_c_null; } + if (trace_rebalance_extent_enabled()) { + struct printbuf buf = PRINTBUF; + + prt_str(&buf, "target="); + bch2_target_to_text(&buf, c, r->target); + prt_str(&buf, " compression="); + bch2_compression_opt_to_text(&buf, r->compression); + prt_str(&buf, " "); + bch2_bkey_val_to_text(&buf, c, k); + + trace_rebalance_extent(c, buf.buf); + printbuf_exit(&buf); + } + return k; } @@ -239,13 +253,12 @@ static bool rebalance_pred(struct bch_fs *c, void *arg, if (k.k->p.inode) { target = io_opts->background_target; - compression = io_opts->background_compression ?: io_opts->compression; + compression = background_compression(*io_opts); } else { const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k); target = r ? r->target : io_opts->background_target; - compression = r ? r->compression : - (io_opts->background_compression ?: io_opts->compression); + compression = r ? r->compression : background_compression(*io_opts); } data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, k, target, compression); @@ -273,7 +286,7 @@ static int do_rebalance_scan(struct moving_context *ctxt, u64 inum, u64 cookie) r->state = BCH_REBALANCE_scanning; ret = __bch2_move_data(ctxt, r->scan_start, r->scan_end, rebalance_pred, NULL) ?: - commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, + commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_clear_rebalance_needs_scan(trans, inum, cookie)); bch2_move_stats_exit(&r->scan_stats, trans->c); @@ -317,8 +330,16 @@ static int do_rebalance(struct moving_context *ctxt) BTREE_ID_rebalance_work, POS_MIN, BTREE_ITER_ALL_SNAPSHOTS); - while (!bch2_move_ratelimit(ctxt) && - !kthread_wait_freezable(r->enabled)) { + while (!bch2_move_ratelimit(ctxt)) { + if (!r->enabled) { + bch2_moving_ctxt_flush_all(ctxt); + kthread_wait_freezable(r->enabled || + kthread_should_stop()); + } + + if (kthread_should_stop()) + break; + bch2_trans_begin(trans); ret = bkey_err(k = next_rebalance_entry(trans, &rebalance_work_iter)); @@ -348,6 +369,7 @@ static int do_rebalance(struct moving_context *ctxt) !kthread_should_stop() && !atomic64_read(&r->work_stats.sectors_seen) && !atomic64_read(&r->scan_stats.sectors_seen)) { + bch2_moving_ctxt_flush_all(ctxt); bch2_trans_unlock_long(trans); rebalance_wait(c); } @@ -362,7 +384,6 @@ static int bch2_rebalance_thread(void *arg) struct bch_fs *c = arg; struct bch_fs_rebalance *r = &c->rebalance; struct moving_context ctxt; - int ret; set_freezable(); @@ -370,8 +391,7 @@ static int bch2_rebalance_thread(void *arg) writepoint_ptr(&c->rebalance_write_point), true); - while (!kthread_should_stop() && - !(ret = do_rebalance(&ctxt))) + while (!kthread_should_stop() && !do_rebalance(&ctxt)) ; bch2_moving_ctxt_exit(&ctxt); @@ -447,10 +467,9 @@ int bch2_rebalance_start(struct bch_fs *c) p = kthread_create(bch2_rebalance_thread, c, "bch-rebalance/%s", c->name); ret = PTR_ERR_OR_ZERO(p); - if (ret) { - bch_err_msg(c, ret, "creating rebalance thread"); + bch_err_msg(c, ret, "creating rebalance thread"); + if (ret) return ret; - } get_task_struct(p); rcu_assign_pointer(c->rebalance.thread, p); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 5cf7d0532002..9127d0e3ca2f 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -99,6 +99,11 @@ static int bch2_journal_replay_key(struct btree_trans *trans, unsigned update_flags = BTREE_TRIGGER_NORUN; int ret; + if (k->overwritten) + return 0; + + trans->journal_res.seq = k->journal_seq; + /* * BTREE_UPDATE_KEY_CACHE_RECLAIM disables key cache lookup/update to * keep the key cache coherent with the underlying btree. Nothing @@ -140,27 +145,13 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r) static int bch2_journal_replay(struct bch_fs *c) { struct journal_keys *keys = &c->journal_keys; - struct journal_key **keys_sorted, *k; + DARRAY(struct journal_key *) keys_sorted = { 0 }; struct journal *j = &c->journal; u64 start_seq = c->journal_replay_seq_start; u64 end_seq = c->journal_replay_seq_start; - size_t i; + struct btree_trans *trans = bch2_trans_get(c); int ret = 0; - move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr); - keys->gap = keys->nr; - - keys_sorted = kvmalloc_array(keys->nr, sizeof(*keys_sorted), GFP_KERNEL); - if (!keys_sorted) - return -BCH_ERR_ENOMEM_journal_replay; - - for (i = 0; i < keys->nr; i++) - keys_sorted[i] = &keys->d[i]; - - sort(keys_sorted, keys->nr, - sizeof(keys_sorted[0]), - journal_sort_seq_cmp, NULL); - if (keys->nr) { ret = bch2_journal_log_msg(c, "Starting journal replay (%zu keys in entries %llu-%llu)", keys->nr, start_seq, end_seq); @@ -170,27 +161,67 @@ static int bch2_journal_replay(struct bch_fs *c) BUG_ON(!atomic_read(&keys->ref)); - for (i = 0; i < keys->nr; i++) { - k = keys_sorted[i]; + /* + * First, attempt to replay keys in sorted order. This is more + * efficient - better locality of btree access - but some might fail if + * that would cause a journal deadlock. + */ + for (size_t i = 0; i < keys->nr; i++) { + cond_resched(); + + struct journal_key *k = keys->d + i; + + /* Skip fastpath if we're low on space in the journal */ + ret = c->journal.watermark ? -1 : + commit_do(trans, NULL, NULL, + BCH_TRANS_COMMIT_no_enospc| + BCH_TRANS_COMMIT_journal_reclaim| + (!k->allocated ? BCH_TRANS_COMMIT_no_journal_res : 0), + bch2_journal_replay_key(trans, k)); + BUG_ON(!ret && !k->overwritten); + if (ret) { + ret = darray_push(&keys_sorted, k); + if (ret) + goto err; + } + } + /* + * Now, replay any remaining keys in the order in which they appear in + * the journal, unpinning those journal entries as we go: + */ + sort(keys_sorted.data, keys_sorted.nr, + sizeof(keys_sorted.data[0]), + journal_sort_seq_cmp, NULL); + + darray_for_each(keys_sorted, kp) { cond_resched(); + struct journal_key *k = *kp; + replay_now_at(j, k->journal_seq); - ret = bch2_trans_do(c, NULL, NULL, - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_NOFAIL| - (!k->allocated - ? BTREE_INSERT_JOURNAL_REPLAY|BCH_WATERMARK_reclaim - : 0), + ret = commit_do(trans, NULL, NULL, + BCH_TRANS_COMMIT_no_enospc| + (!k->allocated + ? BCH_TRANS_COMMIT_no_journal_res|BCH_WATERMARK_reclaim + : 0), bch2_journal_replay_key(trans, k)); - if (ret) { - bch_err(c, "journal replay: error while replaying key at btree %s level %u: %s", - bch2_btree_id_str(k->btree_id), k->level, bch2_err_str(ret)); + bch_err_msg(c, ret, "while replaying key at btree %s level %u:", + bch2_btree_id_str(k->btree_id), k->level); + if (ret) goto err; - } + + BUG_ON(!k->overwritten); } + /* + * We need to put our btree_trans before calling flush_all_pins(), since + * that will use a btree_trans internally + */ + bch2_trans_put(trans); + trans = NULL; + if (!c->opts.keep_journal) bch2_journal_keys_put_initial(c); @@ -198,16 +229,14 @@ static int bch2_journal_replay(struct bch_fs *c) j->replay_journal_seq = 0; bch2_journal_set_replay_done(j); - bch2_journal_flush_all_pins(j); - ret = bch2_journal_error(j); - if (keys->nr && !ret) + if (keys->nr) bch2_journal_log_msg(c, "journal replay finished"); err: - kvfree(keys_sorted); - - if (ret) - bch_err_fn(c, ret); + if (trans) + bch2_trans_put(trans); + darray_exit(&keys_sorted); + bch_err_fn(c, ret); return ret; } @@ -251,7 +280,7 @@ static int journal_replay_entry_early(struct bch_fs *c, le64_to_cpu(u->v); break; case BCH_FS_USAGE_inodes: - c->usage_base->nr_inodes = le64_to_cpu(u->v); + c->usage_base->b.nr_inodes = le64_to_cpu(u->v); break; case BCH_FS_USAGE_key_version: atomic64_set(&c->key_version, @@ -275,8 +304,6 @@ static int journal_replay_entry_early(struct bch_fs *c, struct bch_dev *ca = bch_dev_bkey_exists(c, le32_to_cpu(u->dev)); unsigned i, nr_types = jset_entry_dev_usage_nr_types(u); - ca->usage_base->buckets_ec = le64_to_cpu(u->buckets_ec); - for (i = 0; i < min_t(unsigned, nr_types, BCH_DATA_NR); i++) { ca->usage_base->d[i].buckets = le64_to_cpu(u->d[i].buckets); ca->usage_base->d[i].sectors = le64_to_cpu(u->d[i].sectors); @@ -317,14 +344,11 @@ static int journal_replay_entry_early(struct bch_fs *c, static int journal_replay_early(struct bch_fs *c, struct bch_sb_field_clean *clean) { - struct jset_entry *entry; - int ret; - if (clean) { - for (entry = clean->start; + for (struct jset_entry *entry = clean->start; entry != vstruct_end(&clean->field); entry = vstruct_next(entry)) { - ret = journal_replay_entry_early(c, entry); + int ret = journal_replay_entry_early(c, entry); if (ret) return ret; } @@ -339,7 +363,7 @@ static int journal_replay_early(struct bch_fs *c, continue; vstruct_for_each(&i->j, entry) { - ret = journal_replay_entry_early(c, entry); + int ret = journal_replay_entry_early(c, entry); if (ret) return ret; } @@ -435,8 +459,7 @@ static int bch2_initialize_subvolumes(struct bch_fs *c) ret = bch2_btree_insert(c, BTREE_ID_snapshot_trees, &root_tree.k_i, NULL, 0) ?: bch2_btree_insert(c, BTREE_ID_snapshots, &root_snapshot.k_i, NULL, 0) ?: bch2_btree_insert(c, BTREE_ID_subvolumes, &root_volume.k_i, NULL, 0); - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; } @@ -474,10 +497,9 @@ err: noinline_for_stack static int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c) { - int ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_LAZY_RW, + int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_lazy_rw, __bch2_fs_upgrade_for_subvolumes(trans)); - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; } @@ -495,7 +517,20 @@ static int bch2_check_allocations(struct bch_fs *c) static int bch2_set_may_go_rw(struct bch_fs *c) { - set_bit(BCH_FS_MAY_GO_RW, &c->flags); + struct journal_keys *keys = &c->journal_keys; + + /* + * After we go RW, the journal keys buffer can't be modified (except for + * setting journal_key->overwritten: it will be accessed by multiple + * threads + */ + move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr); + keys->gap = keys->nr; + + set_bit(BCH_FS_may_go_rw, &c->flags); + + if (keys->nr || c->opts.fsck || !c->sb.clean) + return bch2_fs_read_write_early(c); return 0; } @@ -589,17 +624,15 @@ static bool check_version_upgrade(struct bch_fs *c) bch2_version_to_text(&buf, new_version); prt_newline(&buf); - u64 recovery_passes = bch2_upgrade_recovery_passes(c, old_version, new_version); - if (recovery_passes) { - if ((recovery_passes & RECOVERY_PASS_ALL_FSCK) == RECOVERY_PASS_ALL_FSCK) - prt_str(&buf, "fsck required"); - else { - prt_str(&buf, "running recovery passes: "); - prt_bitflags(&buf, bch2_recovery_passes, recovery_passes); - } - - c->recovery_passes_explicit |= recovery_passes; - c->opts.fix_errors = FSCK_FIX_yes; + struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); + __le64 passes = ext->recovery_passes_required[0]; + bch2_sb_set_upgrade(c, old_version, new_version); + passes = ext->recovery_passes_required[0] & ~passes; + + if (passes) { + prt_str(&buf, " running recovery passes: "); + prt_bitflags(&buf, bch2_recovery_passes, + bch2_recovery_passes_from_stable(le64_to_cpu(passes))); } bch_info(c, "%s", buf.buf); @@ -625,7 +658,7 @@ u64 bch2_fsck_recovery_passes(void) static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) { - struct recovery_pass_fn *p = recovery_pass_fns + c->curr_recovery_pass; + struct recovery_pass_fn *p = recovery_pass_fns + pass; if (c->opts.norecovery && pass > BCH_RECOVERY_PASS_snapshots_read) return false; @@ -642,39 +675,62 @@ static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pa static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) { + struct recovery_pass_fn *p = recovery_pass_fns + pass; int ret; - c->curr_recovery_pass = pass; + if (!(p->when & PASS_SILENT)) + bch2_print(c, KERN_INFO bch2_log_msg(c, "%s..."), + bch2_recovery_passes[pass]); + ret = p->fn(c); + if (ret) + return ret; + if (!(p->when & PASS_SILENT)) + bch2_print(c, KERN_CONT " done\n"); - if (should_run_recovery_pass(c, pass)) { - struct recovery_pass_fn *p = recovery_pass_fns + pass; + return 0; +} - if (!(p->when & PASS_SILENT)) - printk(KERN_INFO bch2_log_msg(c, "%s..."), - bch2_recovery_passes[pass]); - ret = p->fn(c); - if (ret) - return ret; - if (!(p->when & PASS_SILENT)) - printk(KERN_CONT " done\n"); +static int bch2_run_recovery_passes(struct bch_fs *c) +{ + int ret = 0; - c->recovery_passes_complete |= BIT_ULL(pass); + while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns)) { + if (should_run_recovery_pass(c, c->curr_recovery_pass)) { + unsigned pass = c->curr_recovery_pass; + + ret = bch2_run_recovery_pass(c, c->curr_recovery_pass); + if (bch2_err_matches(ret, BCH_ERR_restart_recovery) || + (ret && c->curr_recovery_pass < pass)) + continue; + if (ret) + break; + + c->recovery_passes_complete |= BIT_ULL(c->curr_recovery_pass); + } + c->curr_recovery_pass++; + c->recovery_pass_done = max(c->recovery_pass_done, c->curr_recovery_pass); } - return 0; + return ret; } -static int bch2_run_recovery_passes(struct bch_fs *c) +int bch2_run_online_recovery_passes(struct bch_fs *c) { int ret = 0; - while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns)) { - ret = bch2_run_recovery_pass(c, c->curr_recovery_pass); - if (bch2_err_matches(ret, BCH_ERR_restart_recovery)) + for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) { + struct recovery_pass_fn *p = recovery_pass_fns + i; + + if (!(p->when & PASS_ONLINE)) + continue; + + ret = bch2_run_recovery_pass(c, i); + if (bch2_err_matches(ret, BCH_ERR_restart_recovery)) { + i = c->curr_recovery_pass; continue; + } if (ret) break; - c->curr_recovery_pass++; } return ret; @@ -779,6 +835,9 @@ int bch2_fs_recovery(struct bch_fs *c) if (c->opts.fsck && IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_topology); + if (c->opts.fsck) + set_bit(BCH_FS_fsck_running, &c->flags); + ret = bch2_blacklist_table_initialize(c); if (ret) { bch_err(c, "error initializing blacklist table"); @@ -919,13 +978,17 @@ use_clean: if (ret) goto err; + clear_bit(BCH_FS_fsck_running, &c->flags); + /* If we fixed errors, verify that fs is actually clean now: */ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && - test_bit(BCH_FS_ERRORS_FIXED, &c->flags) && - !test_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags) && - !test_bit(BCH_FS_ERROR, &c->flags)) { + test_bit(BCH_FS_errors_fixed, &c->flags) && + !test_bit(BCH_FS_errors_not_fixed, &c->flags) && + !test_bit(BCH_FS_error, &c->flags)) { + bch2_flush_fsck_errs(c); + bch_info(c, "Fixed errors, running fsck a second time to verify fs is clean"); - clear_bit(BCH_FS_ERRORS_FIXED, &c->flags); + clear_bit(BCH_FS_errors_fixed, &c->flags); c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info; @@ -933,13 +996,13 @@ use_clean: if (ret) goto err; - if (test_bit(BCH_FS_ERRORS_FIXED, &c->flags) || - test_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags)) { + if (test_bit(BCH_FS_errors_fixed, &c->flags) || + test_bit(BCH_FS_errors_not_fixed, &c->flags)) { bch_err(c, "Second fsck run was not clean"); - set_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags); + set_bit(BCH_FS_errors_not_fixed, &c->flags); } - set_bit(BCH_FS_ERRORS_FIXED, &c->flags); + set_bit(BCH_FS_errors_fixed, &c->flags); } if (enabled_qtypes(c)) { @@ -958,13 +1021,13 @@ use_clean: write_sb = true; } - if (!test_bit(BCH_FS_ERROR, &c->flags) && + if (!test_bit(BCH_FS_error, &c->flags) && !(c->disk_sb.sb->compat[0] & cpu_to_le64(1ULL << BCH_COMPAT_alloc_info))) { c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info); write_sb = true; } - if (!test_bit(BCH_FS_ERROR, &c->flags)) { + if (!test_bit(BCH_FS_error, &c->flags)) { struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); if (ext && (!bch2_is_zero(ext->recovery_passes_required, sizeof(ext->recovery_passes_required)) || @@ -976,8 +1039,8 @@ use_clean: } if (c->opts.fsck && - !test_bit(BCH_FS_ERROR, &c->flags) && - !test_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags)) { + !test_bit(BCH_FS_error, &c->flags) && + !test_bit(BCH_FS_errors_not_fixed, &c->flags)) { SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 0); SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, 0); write_sb = true; @@ -993,8 +1056,12 @@ use_clean: bch2_move_stats_init(&stats, "recovery"); - bch_info(c, "scanning for old btree nodes"); - ret = bch2_fs_read_write(c) ?: + struct printbuf buf = PRINTBUF; + bch2_version_to_text(&buf, c->sb.version_min); + bch_info(c, "scanning for old btree nodes: min_version %s", buf.buf); + printbuf_exit(&buf); + + ret = bch2_fs_read_write_early(c) ?: bch2_scan_old_btree_nodes(c, &stats); if (ret) goto err; @@ -1007,7 +1074,6 @@ use_clean: ret = 0; out: - set_bit(BCH_FS_FSCK_DONE, &c->flags); bch2_flush_fsck_errs(c); if (!c->opts.keep_journal && @@ -1015,13 +1081,14 @@ out: bch2_journal_keys_put_initial(c); kfree(clean); - if (!ret && test_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags)) { + if (!ret && + test_bit(BCH_FS_need_delete_dead_snapshots, &c->flags) && + !c->opts.nochanges) { bch2_fs_read_write_early(c); bch2_delete_dead_snapshots_async(c); } - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; err: fsck_err: @@ -1034,8 +1101,6 @@ int bch2_fs_initialize(struct bch_fs *c) struct bch_inode_unpacked root_inode, lostfound_inode; struct bkey_inode_buf packed_inode; struct qstr lostfound = QSTR("lost+found"); - struct bch_dev *ca; - unsigned i; int ret; bch_notice(c, "initializing new filesystem"); @@ -1054,13 +1119,12 @@ int bch2_fs_initialize(struct bch_fs *c) mutex_unlock(&c->sb_lock); c->curr_recovery_pass = ARRAY_SIZE(recovery_pass_fns); - set_bit(BCH_FS_MAY_GO_RW, &c->flags); - set_bit(BCH_FS_FSCK_DONE, &c->flags); + set_bit(BCH_FS_may_go_rw, &c->flags); - for (i = 0; i < BTREE_ID_NR; i++) + for (unsigned i = 0; i < BTREE_ID_NR; i++) bch2_btree_root_alloc(c, i); - for_each_member_device(ca, c, i) + for_each_member_device(c, ca) bch2_dev_usage_init(ca); ret = bch2_fs_journal_alloc(c); @@ -1088,7 +1152,7 @@ int bch2_fs_initialize(struct bch_fs *c) if (ret) goto err; - for_each_online_member(ca, c, i) + for_each_online_member(c, ca) ca->new_fs_bucket_idx = 0; ret = bch2_fs_freespace_init(c); @@ -1112,10 +1176,9 @@ int bch2_fs_initialize(struct bch_fs *c) packed_inode.inode.k.p.snapshot = U32_MAX; ret = bch2_btree_insert(c, BTREE_ID_inodes, &packed_inode.inode.k_i, NULL, 0); - if (ret) { - bch_err_msg(c, ret, "creating root directory"); + bch_err_msg(c, ret, "creating root directory"); + if (ret) goto err; - } bch2_inode_init_early(c, &lostfound_inode); @@ -1126,10 +1189,11 @@ int bch2_fs_initialize(struct bch_fs *c) &lostfound, 0, 0, S_IFDIR|0700, 0, NULL, NULL, (subvol_inum) { 0 }, 0)); - if (ret) { - bch_err_msg(c, ret, "creating lost+found"); + bch_err_msg(c, ret, "creating lost+found"); + if (ret) goto err; - } + + c->recovery_pass_done = ARRAY_SIZE(recovery_pass_fns) - 1; if (enabled_qtypes(c)) { ret = bch2_fs_quota_read(c); @@ -1138,10 +1202,9 @@ int bch2_fs_initialize(struct bch_fs *c) } ret = bch2_journal_flush(&c->journal); - if (ret) { - bch_err_msg(c, ret, "writing first journal entry"); + bch_err_msg(c, ret, "writing first journal entry"); + if (ret) goto err; - } mutex_lock(&c->sb_lock); SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true); @@ -1152,6 +1215,6 @@ int bch2_fs_initialize(struct bch_fs *c) return 0; err: - bch_err_fn(ca, ret); + bch_err_fn(c, ret); return ret; } diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h index 3a554b0751d0..4e9d24719b2e 100644 --- a/fs/bcachefs/recovery.h +++ b/fs/bcachefs/recovery.h @@ -31,6 +31,7 @@ static inline int bch2_run_explicit_recovery_pass(struct bch_fs *c, } } +int bch2_run_online_recovery_passes(struct bch_fs *); u64 bch2_fsck_recovery_passes(void); int bch2_fs_recovery(struct bch_fs *); diff --git a/fs/bcachefs/recovery_types.h b/fs/bcachefs/recovery_types.h index d37c6fd30e38..fa0c8efd2a1b 100644 --- a/fs/bcachefs/recovery_types.h +++ b/fs/bcachefs/recovery_types.h @@ -6,6 +6,7 @@ #define PASS_FSCK BIT(1) #define PASS_UNCLEAN BIT(2) #define PASS_ALWAYS BIT(3) +#define PASS_ONLINE BIT(4) /* * Passes may be reordered, but the second field is a persistent identifier and @@ -22,18 +23,18 @@ x(fs_journal_alloc, 7, PASS_ALWAYS|PASS_SILENT) \ x(set_may_go_rw, 8, PASS_ALWAYS|PASS_SILENT) \ x(journal_replay, 9, PASS_ALWAYS) \ - x(check_alloc_info, 10, PASS_FSCK) \ - x(check_lrus, 11, PASS_FSCK) \ - x(check_btree_backpointers, 12, PASS_FSCK) \ - x(check_backpointers_to_extents, 13, PASS_FSCK) \ - x(check_extents_to_backpointers, 14, PASS_FSCK) \ - x(check_alloc_to_lru_refs, 15, PASS_FSCK) \ + x(check_alloc_info, 10, PASS_ONLINE|PASS_FSCK) \ + x(check_lrus, 11, PASS_ONLINE|PASS_FSCK) \ + x(check_btree_backpointers, 12, PASS_ONLINE|PASS_FSCK) \ + x(check_backpointers_to_extents, 13, PASS_ONLINE|PASS_FSCK) \ + x(check_extents_to_backpointers, 14, PASS_ONLINE|PASS_FSCK) \ + x(check_alloc_to_lru_refs, 15, PASS_ONLINE|PASS_FSCK) \ x(fs_freespace_init, 16, PASS_ALWAYS|PASS_SILENT) \ x(bucket_gens_init, 17, 0) \ - x(check_snapshot_trees, 18, PASS_FSCK) \ - x(check_snapshots, 19, PASS_FSCK) \ - x(check_subvols, 20, PASS_FSCK) \ - x(delete_dead_snapshots, 21, PASS_FSCK) \ + x(check_snapshot_trees, 18, PASS_ONLINE|PASS_FSCK) \ + x(check_snapshots, 19, PASS_ONLINE|PASS_FSCK) \ + x(check_subvols, 20, PASS_ONLINE|PASS_FSCK) \ + x(delete_dead_snapshots, 21, PASS_ONLINE|PASS_FSCK) \ x(fs_upgrade_for_subvolumes, 22, 0) \ x(resume_logged_ops, 23, PASS_ALWAYS) \ x(check_inodes, 24, PASS_FSCK) \ @@ -41,8 +42,8 @@ x(check_indirect_extents, 26, PASS_FSCK) \ x(check_dirents, 27, PASS_FSCK) \ x(check_xattrs, 28, PASS_FSCK) \ - x(check_root, 29, PASS_FSCK) \ - x(check_directory_structure, 30, PASS_FSCK) \ + x(check_root, 29, PASS_ONLINE|PASS_FSCK) \ + x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \ x(check_nlinks, 31, PASS_FSCK) \ x(delete_dead_inodes, 32, PASS_FSCK|PASS_UNCLEAN) \ x(fix_reflink_p, 33, 0) \ diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 37d16e04e671..c47c66c2b394 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -3,6 +3,7 @@ #include "bkey_buf.h" #include "btree_update.h" #include "buckets.h" +#include "error.h" #include "extents.h" #include "inode.h" #include "io_misc.h" @@ -33,15 +34,14 @@ int bch2_reflink_p_invalid(struct bch_fs *c, struct bkey_s_c k, struct printbuf *err) { struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); + int ret = 0; - if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix && - le64_to_cpu(p.v->idx) < le32_to_cpu(p.v->front_pad)) { - prt_printf(err, "idx < front_pad (%llu < %u)", - le64_to_cpu(p.v->idx), le32_to_cpu(p.v->front_pad)); - return -EINVAL; - } - - return 0; + bkey_fsck_err_on(le64_to_cpu(p.v->idx) < le32_to_cpu(p.v->front_pad), + c, err, reflink_p_front_pad_bad, + "idx < front_pad (%llu < %u)", + le64_to_cpu(p.v->idx), le32_to_cpu(p.v->front_pad)); +fsck_err: + return ret; } void bch2_reflink_p_to_text(struct printbuf *out, struct bch_fs *c, @@ -73,6 +73,184 @@ bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r return true; } +static int trans_trigger_reflink_p_segment(struct btree_trans *trans, + struct bkey_s_c_reflink_p p, + u64 *idx, unsigned flags) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_i *k; + __le64 *refcount; + int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1; + struct printbuf buf = PRINTBUF; + int ret; + + k = bch2_bkey_get_mut_noupdate(trans, &iter, + BTREE_ID_reflink, POS(0, *idx), + BTREE_ITER_WITH_UPDATES); + ret = PTR_ERR_OR_ZERO(k); + if (ret) + goto err; + + refcount = bkey_refcount(bkey_i_to_s(k)); + if (!refcount) { + bch2_bkey_val_to_text(&buf, c, p.s_c); + bch2_trans_inconsistent(trans, + "nonexistent indirect extent at %llu while marking\n %s", + *idx, buf.buf); + ret = -EIO; + goto err; + } + + if (!*refcount && (flags & BTREE_TRIGGER_OVERWRITE)) { + bch2_bkey_val_to_text(&buf, c, p.s_c); + bch2_trans_inconsistent(trans, + "indirect extent refcount underflow at %llu while marking\n %s", + *idx, buf.buf); + ret = -EIO; + goto err; + } + + if (flags & BTREE_TRIGGER_INSERT) { + struct bch_reflink_p *v = (struct bch_reflink_p *) p.v; + u64 pad; + + pad = max_t(s64, le32_to_cpu(v->front_pad), + le64_to_cpu(v->idx) - bkey_start_offset(&k->k)); + BUG_ON(pad > U32_MAX); + v->front_pad = cpu_to_le32(pad); + + pad = max_t(s64, le32_to_cpu(v->back_pad), + k->k.p.offset - p.k->size - le64_to_cpu(v->idx)); + BUG_ON(pad > U32_MAX); + v->back_pad = cpu_to_le32(pad); + } + + le64_add_cpu(refcount, add); + + bch2_btree_iter_set_pos_to_extent_start(&iter); + ret = bch2_trans_update(trans, &iter, k, 0); + if (ret) + goto err; + + *idx = k->k.p.offset; +err: + bch2_trans_iter_exit(trans, &iter); + printbuf_exit(&buf); + return ret; +} + +static s64 gc_trigger_reflink_p_segment(struct btree_trans *trans, + struct bkey_s_c_reflink_p p, + u64 *idx, unsigned flags, size_t r_idx) +{ + struct bch_fs *c = trans->c; + struct reflink_gc *r; + int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1; + u64 start = le64_to_cpu(p.v->idx); + u64 end = le64_to_cpu(p.v->idx) + p.k->size; + u64 next_idx = end + le32_to_cpu(p.v->back_pad); + s64 ret = 0; + struct printbuf buf = PRINTBUF; + + if (r_idx >= c->reflink_gc_nr) + goto not_found; + + r = genradix_ptr(&c->reflink_gc_table, r_idx); + next_idx = min(next_idx, r->offset - r->size); + if (*idx < next_idx) + goto not_found; + + BUG_ON((s64) r->refcount + add < 0); + + r->refcount += add; + *idx = r->offset; + return 0; +not_found: + if (fsck_err(c, reflink_p_to_missing_reflink_v, + "pointer to missing indirect extent\n" + " %s\n" + " missing range %llu-%llu", + (bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf), + *idx, next_idx)) { + struct bkey_i *update = bch2_bkey_make_mut_noupdate(trans, p.s_c); + ret = PTR_ERR_OR_ZERO(update); + if (ret) + goto err; + + if (next_idx <= start) { + bkey_i_to_reflink_p(update)->v.front_pad = cpu_to_le32(start - next_idx); + } else if (*idx >= end) { + bkey_i_to_reflink_p(update)->v.back_pad = cpu_to_le32(*idx - end); + } else { + bkey_error_init(update); + update->k.p = p.k->p; + update->k.p.offset = next_idx; + update->k.size = next_idx - *idx; + set_bkey_val_u64s(&update->k, 0); + } + + ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, update, BTREE_TRIGGER_NORUN); + } + + *idx = next_idx; +err: +fsck_err: + printbuf_exit(&buf); + return ret; +} + +static int __trigger_reflink_p(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c k, unsigned flags) +{ + struct bch_fs *c = trans->c; + struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); + int ret = 0; + + u64 idx = le64_to_cpu(p.v->idx) - le32_to_cpu(p.v->front_pad); + u64 end = le64_to_cpu(p.v->idx) + p.k->size + le32_to_cpu(p.v->back_pad); + + if (flags & BTREE_TRIGGER_TRANSACTIONAL) { + while (idx < end && !ret) + ret = trans_trigger_reflink_p_segment(trans, p, &idx, flags); + } + + if (flags & BTREE_TRIGGER_GC) { + size_t l = 0, r = c->reflink_gc_nr; + + while (l < r) { + size_t m = l + (r - l) / 2; + struct reflink_gc *ref = genradix_ptr(&c->reflink_gc_table, m); + if (ref->offset <= idx) + l = m + 1; + else + r = m; + } + + while (idx < end && !ret) + ret = gc_trigger_reflink_p_segment(trans, p, &idx, flags, l++); + } + + return ret; +} + +int bch2_trigger_reflink_p(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, + struct bkey_s new, + unsigned flags) +{ + if ((flags & BTREE_TRIGGER_TRANSACTIONAL) && + (flags & BTREE_TRIGGER_INSERT)) { + struct bch_reflink_p *v = bkey_s_to_reflink_p(new).v; + + v->front_pad = v->back_pad = 0; + } + + return trigger_run_overwrite_then_insert(__trigger_reflink_p, trans, btree_id, level, old, new, flags); +} + /* indirect extents */ int bch2_reflink_v_invalid(struct bch_fs *c, struct bkey_s_c k, @@ -104,32 +282,26 @@ bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r } #endif -static inline void check_indirect_extent_deleting(struct bkey_i *new, unsigned *flags) +static inline void check_indirect_extent_deleting(struct bkey_s new, unsigned *flags) { if ((*flags & BTREE_TRIGGER_INSERT) && !*bkey_refcount(new)) { - new->k.type = KEY_TYPE_deleted; - new->k.size = 0; - set_bkey_val_u64s(&new->k, 0);; + new.k->type = KEY_TYPE_deleted; + new.k->size = 0; + set_bkey_val_u64s(new.k, 0); *flags &= ~BTREE_TRIGGER_INSERT; } } -int bch2_trans_mark_reflink_v(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, struct bkey_i *new, - unsigned flags) +int bch2_trigger_reflink_v(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, struct bkey_s new, + unsigned flags) { - check_indirect_extent_deleting(new, &flags); - - if (old.k->type == KEY_TYPE_reflink_v && - new->k.type == KEY_TYPE_reflink_v && - old.k->u64s == new->k.u64s && - !memcmp(bkey_s_c_to_reflink_v(old).v->start, - bkey_i_to_reflink_v(new)->v.start, - bkey_val_bytes(&new->k) - 8)) - return 0; + if ((flags & BTREE_TRIGGER_TRANSACTIONAL) && + (flags & BTREE_TRIGGER_INSERT)) + check_indirect_extent_deleting(new, &flags); - return bch2_trans_mark_extent(trans, btree_id, level, old, new, flags); + return bch2_trigger_extent(trans, btree_id, level, old, new, flags); } /* indirect inline data */ @@ -152,9 +324,9 @@ void bch2_indirect_inline_data_to_text(struct printbuf *out, min(datalen, 32U), d.v->data); } -int bch2_trans_mark_indirect_inline_data(struct btree_trans *trans, +int bch2_trigger_indirect_inline_data(struct btree_trans *trans, enum btree_id btree_id, unsigned level, - struct bkey_s_c old, struct bkey_i *new, + struct bkey_s_c old, struct bkey_s new, unsigned flags) { check_indirect_extent_deleting(new, &flags); @@ -197,7 +369,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, set_bkey_val_bytes(&r_v->k, sizeof(__le64) + bkey_val_bytes(&orig->k)); - refcount = bkey_refcount(r_v); + refcount = bkey_refcount(bkey_i_to_s(r_v)); *refcount = 0; memcpy(refcount + 1, &orig->v, bkey_val_bytes(&orig->k)); @@ -314,6 +486,13 @@ s64 bch2_remap_range(struct bch_fs *c, bch2_btree_iter_set_snapshot(&dst_iter, dst_snapshot); + if (dst_inum.inum < src_inum.inum) { + /* Avoid some lock cycle transaction restarts */ + ret = bch2_btree_iter_traverse(&dst_iter); + if (ret) + continue; + } + dst_done = dst_iter.pos.offset - dst_start.offset; src_want = POS(src_start.inode, src_start.offset + dst_done); bch2_btree_iter_set_pos(&src_iter, src_want); @@ -366,9 +545,7 @@ s64 bch2_remap_range(struct bch_fs *c, min(src_k.k->p.offset - src_want.offset, dst_end.offset - dst_iter.pos.offset)); - ret = bch2_bkey_set_needs_rebalance(c, new_dst.k, - opts.background_target, - opts.background_compression) ?: + ret = bch2_bkey_set_needs_rebalance(c, new_dst.k, &opts) ?: bch2_extent_update(trans, dst_inum, &dst_iter, new_dst.k, &disk_res, new_i_size, i_sectors_delta, @@ -398,7 +575,7 @@ s64 bch2_remap_range(struct bch_fs *c, inode_u.bi_size = new_i_size; ret2 = bch2_inode_write(trans, &inode_iter, &inode_u) ?: bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL); + BCH_TRANS_COMMIT_no_enospc); } bch2_trans_iter_exit(trans, &inode_iter); diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h index 8ccf3f9c4939..4d8867289717 100644 --- a/fs/bcachefs/reflink.h +++ b/fs/bcachefs/reflink.h @@ -9,13 +9,14 @@ int bch2_reflink_p_invalid(struct bch_fs *, struct bkey_s_c, void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); +int bch2_trigger_reflink_p(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_s, unsigned); #define bch2_bkey_ops_reflink_p ((struct bkey_ops) { \ .key_invalid = bch2_reflink_p_invalid, \ .val_to_text = bch2_reflink_p_to_text, \ .key_merge = bch2_reflink_p_merge, \ - .trans_trigger = bch2_trans_mark_reflink_p, \ - .atomic_trigger = bch2_mark_reflink_p, \ + .trigger = bch2_trigger_reflink_p, \ .min_val_size = 16, \ }) @@ -23,15 +24,14 @@ int bch2_reflink_v_invalid(struct bch_fs *, struct bkey_s_c, enum bkey_invalid_flags, struct printbuf *); void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -int bch2_trans_mark_reflink_v(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_i *, unsigned); +int bch2_trigger_reflink_v(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_s, unsigned); #define bch2_bkey_ops_reflink_v ((struct bkey_ops) { \ .key_invalid = bch2_reflink_v_invalid, \ .val_to_text = bch2_reflink_v_to_text, \ .swab = bch2_ptr_swab, \ - .trans_trigger = bch2_trans_mark_reflink_v, \ - .atomic_trigger = bch2_mark_extent, \ + .trigger = bch2_trigger_reflink_v, \ .min_val_size = 8, \ }) @@ -39,15 +39,15 @@ int bch2_indirect_inline_data_invalid(struct bch_fs *, struct bkey_s_c, enum bkey_invalid_flags, struct printbuf *); void bch2_indirect_inline_data_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -int bch2_trans_mark_indirect_inline_data(struct btree_trans *, +int bch2_trigger_indirect_inline_data(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_i *, + struct bkey_s_c, struct bkey_s, unsigned); #define bch2_bkey_ops_indirect_inline_data ((struct bkey_ops) { \ .key_invalid = bch2_indirect_inline_data_invalid, \ .val_to_text = bch2_indirect_inline_data_to_text, \ - .trans_trigger = bch2_trans_mark_indirect_inline_data, \ + .trigger = bch2_trigger_indirect_inline_data, \ .min_val_size = 8, \ }) @@ -63,13 +63,13 @@ static inline const __le64 *bkey_refcount_c(struct bkey_s_c k) } } -static inline __le64 *bkey_refcount(struct bkey_i *k) +static inline __le64 *bkey_refcount(struct bkey_s k) { - switch (k->k.type) { + switch (k.k->type) { case KEY_TYPE_reflink_v: - return &bkey_i_to_reflink_v(k)->v.refcount; + return &bkey_s_to_reflink_v(k).v->refcount; case KEY_TYPE_indirect_inline_data: - return &bkey_i_to_indirect_inline_data(k)->v.refcount; + return &bkey_s_to_indirect_inline_data(k).v->refcount; default: return NULL; } diff --git a/fs/bcachefs/reflink_format.h b/fs/bcachefs/reflink_format.h new file mode 100644 index 000000000000..6772eebb1fc6 --- /dev/null +++ b/fs/bcachefs/reflink_format.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_REFLINK_FORMAT_H +#define _BCACHEFS_REFLINK_FORMAT_H + +struct bch_reflink_p { + struct bch_val v; + __le64 idx; + /* + * A reflink pointer might point to an indirect extent which is then + * later split (by copygc or rebalance). If we only pointed to part of + * the original indirect extent, and then one of the fragments is + * outside the range we point to, we'd leak a refcount: so when creating + * reflink pointers, we need to store pad values to remember the full + * range we were taking a reference on. + */ + __le32 front_pad; + __le32 back_pad; +} __packed __aligned(8); + +struct bch_reflink_v { + struct bch_val v; + __le64 refcount; + union bch_extent_entry start[0]; + __u64 _data[]; +} __packed __aligned(8); + +struct bch_indirect_inline_data { + struct bch_val v; + __le64 refcount; + u8 data[]; +}; + +#endif /* _BCACHEFS_REFLINK_FORMAT_H */ diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 2008fe8bf706..cc2672c12031 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -9,9 +9,15 @@ static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *, struct bch_replicas_cpu *); +/* Some (buggy!) compilers don't allow memcmp to be passed as a pointer */ +static int bch2_memcmp(const void *l, const void *r, size_t size) +{ + return memcmp(l, r, size); +} + /* Replicas tracking - in memory: */ -static void verify_replicas_entry(struct bch_replicas_entry *e) +static void verify_replicas_entry(struct bch_replicas_entry_v1 *e) { #ifdef CONFIG_BCACHEFS_DEBUG unsigned i; @@ -26,49 +32,39 @@ static void verify_replicas_entry(struct bch_replicas_entry *e) #endif } -void bch2_replicas_entry_sort(struct bch_replicas_entry *e) +void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *e) { bubble_sort(e->devs, e->nr_devs, u8_cmp); } static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r) { - eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL); + eytzinger0_sort(r->entries, r->nr, r->entry_size, bch2_memcmp, NULL); } static void bch2_replicas_entry_v0_to_text(struct printbuf *out, struct bch_replicas_entry_v0 *e) { - unsigned i; - - if (e->data_type < BCH_DATA_NR) - prt_printf(out, "%s", bch2_data_types[e->data_type]); - else - prt_printf(out, "(invalid data type %u)", e->data_type); + bch2_prt_data_type(out, e->data_type); prt_printf(out, ": %u [", e->nr_devs); - for (i = 0; i < e->nr_devs; i++) + for (unsigned i = 0; i < e->nr_devs; i++) prt_printf(out, i ? " %u" : "%u", e->devs[i]); prt_printf(out, "]"); } void bch2_replicas_entry_to_text(struct printbuf *out, - struct bch_replicas_entry *e) + struct bch_replicas_entry_v1 *e) { - unsigned i; - - if (e->data_type < BCH_DATA_NR) - prt_printf(out, "%s", bch2_data_types[e->data_type]); - else - prt_printf(out, "(invalid data type %u)", e->data_type); + bch2_prt_data_type(out, e->data_type); prt_printf(out, ": %u/%u [", e->nr_required, e->nr_devs); - for (i = 0; i < e->nr_devs; i++) + for (unsigned i = 0; i < e->nr_devs; i++) prt_printf(out, i ? " %u" : "%u", e->devs[i]); prt_printf(out, "]"); } -int bch2_replicas_entry_validate(struct bch_replicas_entry *r, +int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *r, struct bch_sb *sb, struct printbuf *err) { @@ -98,7 +94,7 @@ bad: void bch2_cpu_replicas_to_text(struct printbuf *out, struct bch_replicas_cpu *r) { - struct bch_replicas_entry *e; + struct bch_replicas_entry_v1 *e; bool first = true; for_each_cpu_replicas_entry(r, e) { @@ -111,7 +107,7 @@ void bch2_cpu_replicas_to_text(struct printbuf *out, } static void extent_to_replicas(struct bkey_s_c k, - struct bch_replicas_entry *r) + struct bch_replicas_entry_v1 *r) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; @@ -131,7 +127,7 @@ static void extent_to_replicas(struct bkey_s_c k, } static void stripe_to_replicas(struct bkey_s_c k, - struct bch_replicas_entry *r) + struct bch_replicas_entry_v1 *r) { struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); const struct bch_extent_ptr *ptr; @@ -144,7 +140,7 @@ static void stripe_to_replicas(struct bkey_s_c k, r->devs[r->nr_devs++] = ptr->dev; } -void bch2_bkey_to_replicas(struct bch_replicas_entry *e, +void bch2_bkey_to_replicas(struct bch_replicas_entry_v1 *e, struct bkey_s_c k) { e->nr_devs = 0; @@ -169,12 +165,10 @@ void bch2_bkey_to_replicas(struct bch_replicas_entry *e, bch2_replicas_entry_sort(e); } -void bch2_devlist_to_replicas(struct bch_replicas_entry *e, +void bch2_devlist_to_replicas(struct bch_replicas_entry_v1 *e, enum bch_data_type data_type, struct bch_devs_list devs) { - unsigned i; - BUG_ON(!data_type || data_type == BCH_DATA_sb || data_type >= BCH_DATA_NR); @@ -183,8 +177,8 @@ void bch2_devlist_to_replicas(struct bch_replicas_entry *e, e->nr_devs = 0; e->nr_required = 1; - for (i = 0; i < devs.nr; i++) - e->devs[e->nr_devs++] = devs.devs[i]; + darray_for_each(devs, i) + e->devs[e->nr_devs++] = *i; bch2_replicas_entry_sort(e); } @@ -192,7 +186,7 @@ void bch2_devlist_to_replicas(struct bch_replicas_entry *e, static struct bch_replicas_cpu cpu_replicas_add_entry(struct bch_fs *c, struct bch_replicas_cpu *old, - struct bch_replicas_entry *new_entry) + struct bch_replicas_entry_v1 *new_entry) { unsigned i; struct bch_replicas_cpu new = { @@ -225,7 +219,7 @@ cpu_replicas_add_entry(struct bch_fs *c, } static inline int __replicas_entry_idx(struct bch_replicas_cpu *r, - struct bch_replicas_entry *search) + struct bch_replicas_entry_v1 *search) { int idx, entry_size = replicas_entry_bytes(search); @@ -243,7 +237,7 @@ static inline int __replicas_entry_idx(struct bch_replicas_cpu *r, } int bch2_replicas_entry_idx(struct bch_fs *c, - struct bch_replicas_entry *search) + struct bch_replicas_entry_v1 *search) { bch2_replicas_entry_sort(search); @@ -251,13 +245,13 @@ int bch2_replicas_entry_idx(struct bch_fs *c, } static bool __replicas_has_entry(struct bch_replicas_cpu *r, - struct bch_replicas_entry *search) + struct bch_replicas_entry_v1 *search) { return __replicas_entry_idx(r, search) >= 0; } bool bch2_replicas_marked(struct bch_fs *c, - struct bch_replicas_entry *search) + struct bch_replicas_entry_v1 *search) { bool marked; @@ -374,7 +368,7 @@ err: static unsigned reserve_journal_replicas(struct bch_fs *c, struct bch_replicas_cpu *r) { - struct bch_replicas_entry *e; + struct bch_replicas_entry_v1 *e; unsigned journal_res_u64s = 0; /* nr_inodes: */ @@ -399,7 +393,7 @@ static unsigned reserve_journal_replicas(struct bch_fs *c, noinline static int bch2_mark_replicas_slowpath(struct bch_fs *c, - struct bch_replicas_entry *new_entry) + struct bch_replicas_entry_v1 *new_entry) { struct bch_replicas_cpu new_r, new_gc; int ret = 0; @@ -464,7 +458,7 @@ err: goto out; } -int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry *r) +int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry_v1 *r) { return likely(bch2_replicas_marked(c, r)) ? 0 : bch2_mark_replicas_slowpath(c, r); @@ -515,7 +509,7 @@ int bch2_replicas_gc_end(struct bch_fs *c, int ret) int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask) { - struct bch_replicas_entry *e; + struct bch_replicas_entry_v1 *e; unsigned i = 0; lockdep_assert_held(&c->replicas_gc_lock); @@ -590,7 +584,7 @@ retry: } for (i = 0; i < c->replicas.nr; i++) { - struct bch_replicas_entry *e = + struct bch_replicas_entry_v1 *e = cpu_replicas_entry(&c->replicas, i); if (e->data_type == BCH_DATA_journal || @@ -621,7 +615,7 @@ retry: } int bch2_replicas_set_usage(struct bch_fs *c, - struct bch_replicas_entry *r, + struct bch_replicas_entry_v1 *r, u64 sectors) { int ret, idx = bch2_replicas_entry_idx(c, r); @@ -654,7 +648,7 @@ static int __bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r, struct bch_replicas_cpu *cpu_r) { - struct bch_replicas_entry *e, *dst; + struct bch_replicas_entry_v1 *e, *dst; unsigned nr = 0, entry_size = 0, idx = 0; for_each_replicas_entry(sb_r, e) { @@ -692,7 +686,7 @@ __bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r, nr++; } - entry_size += sizeof(struct bch_replicas_entry) - + entry_size += sizeof(struct bch_replicas_entry_v1) - sizeof(struct bch_replicas_entry_v0); cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL); @@ -703,7 +697,7 @@ __bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r, cpu_r->entry_size = entry_size; for_each_replicas_entry(sb_r, e) { - struct bch_replicas_entry *dst = + struct bch_replicas_entry_v1 *dst = cpu_replicas_entry(cpu_r, idx++); dst->data_type = e->data_type; @@ -747,7 +741,7 @@ static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c, { struct bch_sb_field_replicas_v0 *sb_r; struct bch_replicas_entry_v0 *dst; - struct bch_replicas_entry *src; + struct bch_replicas_entry_v1 *src; size_t bytes; bytes = sizeof(struct bch_sb_field_replicas); @@ -785,7 +779,7 @@ static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c, struct bch_replicas_cpu *r) { struct bch_sb_field_replicas *sb_r; - struct bch_replicas_entry *dst, *src; + struct bch_replicas_entry_v1 *dst, *src; bool need_v1 = false; size_t bytes; @@ -833,10 +827,10 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r, sort_cmp_size(cpu_r->entries, cpu_r->nr, cpu_r->entry_size, - memcmp, NULL); + bch2_memcmp, NULL); for (i = 0; i < cpu_r->nr; i++) { - struct bch_replicas_entry *e = + struct bch_replicas_entry_v1 *e = cpu_replicas_entry(cpu_r, i); int ret = bch2_replicas_entry_validate(e, sb, err); @@ -844,7 +838,7 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r, return ret; if (i + 1 < cpu_r->nr) { - struct bch_replicas_entry *n = + struct bch_replicas_entry_v1 *n = cpu_replicas_entry(cpu_r, i + 1); BUG_ON(memcmp(e, n, cpu_r->entry_size) > 0); @@ -881,7 +875,7 @@ static void bch2_sb_replicas_to_text(struct printbuf *out, struct bch_sb_field *f) { struct bch_sb_field_replicas *r = field_to_type(f, replicas); - struct bch_replicas_entry *e; + struct bch_replicas_entry_v1 *e; bool first = true; for_each_replicas_entry(r, e) { @@ -943,7 +937,7 @@ const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = { bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs, unsigned flags, bool print) { - struct bch_replicas_entry *e; + struct bch_replicas_entry_v1 *e; bool ret = true; percpu_down_read(&c->mark_lock); @@ -1003,7 +997,7 @@ unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev) replicas_v0 = bch2_sb_field_get(sb, replicas_v0); if (replicas) { - struct bch_replicas_entry *r; + struct bch_replicas_entry_v1 *r; for_each_replicas_entry(replicas, r) for (i = 0; i < r->nr_devs; i++) diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h index f70a642775d1..654a4b26d3a3 100644 --- a/fs/bcachefs/replicas.h +++ b/fs/bcachefs/replicas.h @@ -6,28 +6,28 @@ #include "eytzinger.h" #include "replicas_types.h" -void bch2_replicas_entry_sort(struct bch_replicas_entry *); +void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *); void bch2_replicas_entry_to_text(struct printbuf *, - struct bch_replicas_entry *); -int bch2_replicas_entry_validate(struct bch_replicas_entry *, + struct bch_replicas_entry_v1 *); +int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *, struct bch_sb *, struct printbuf *); void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *); -static inline struct bch_replicas_entry * +static inline struct bch_replicas_entry_v1 * cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i) { return (void *) r->entries + r->entry_size * i; } int bch2_replicas_entry_idx(struct bch_fs *, - struct bch_replicas_entry *); + struct bch_replicas_entry_v1 *); -void bch2_devlist_to_replicas(struct bch_replicas_entry *, +void bch2_devlist_to_replicas(struct bch_replicas_entry_v1 *, enum bch_data_type, struct bch_devs_list); -bool bch2_replicas_marked(struct bch_fs *, struct bch_replicas_entry *); +bool bch2_replicas_marked(struct bch_fs *, struct bch_replicas_entry_v1 *); int bch2_mark_replicas(struct bch_fs *, - struct bch_replicas_entry *); + struct bch_replicas_entry_v1 *); static inline struct replicas_delta * replicas_delta_next(struct replicas_delta *d) @@ -37,9 +37,9 @@ replicas_delta_next(struct replicas_delta *d) int bch2_replicas_delta_list_mark(struct bch_fs *, struct replicas_delta_list *); -void bch2_bkey_to_replicas(struct bch_replicas_entry *, struct bkey_s_c); +void bch2_bkey_to_replicas(struct bch_replicas_entry_v1 *, struct bkey_s_c); -static inline void bch2_replicas_entry_cached(struct bch_replicas_entry *e, +static inline void bch2_replicas_entry_cached(struct bch_replicas_entry_v1 *e, unsigned dev) { e->data_type = BCH_DATA_cached; @@ -59,7 +59,7 @@ int bch2_replicas_gc_start(struct bch_fs *, unsigned); int bch2_replicas_gc2(struct bch_fs *); int bch2_replicas_set_usage(struct bch_fs *, - struct bch_replicas_entry *, + struct bch_replicas_entry_v1 *, u64); #define for_each_cpu_replicas_entry(_r, _i) \ diff --git a/fs/bcachefs/replicas_types.h b/fs/bcachefs/replicas_types.h index 5cfff489bbc3..ac90d142c4e8 100644 --- a/fs/bcachefs/replicas_types.h +++ b/fs/bcachefs/replicas_types.h @@ -5,12 +5,12 @@ struct bch_replicas_cpu { unsigned nr; unsigned entry_size; - struct bch_replicas_entry *entries; + struct bch_replicas_entry_v1 *entries; }; struct replicas_delta { s64 delta; - struct bch_replicas_entry r; + struct bch_replicas_entry_v1 r; } __packed; struct replicas_delta_list { @@ -21,7 +21,7 @@ struct replicas_delta_list { u64 nr_inodes; u64 persistent_reserved[BCH_REPLICAS_MAX]; struct {} memset_end; - struct replicas_delta d[0]; + struct replicas_delta d[]; }; #endif /* _BCACHEFS_REPLICAS_TYPES_H */ diff --git a/fs/bcachefs/sb-clean.c b/fs/bcachefs/sb-clean.c index c76ad8ea5e4a..b6bf0ebe7e84 100644 --- a/fs/bcachefs/sb-clean.c +++ b/fs/bcachefs/sb-clean.c @@ -191,13 +191,10 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c, struct jset_entry **end, u64 journal_seq) { - struct bch_dev *ca; - unsigned i, dev; - percpu_down_read(&c->mark_lock); if (!journal_seq) { - for (i = 0; i < ARRAY_SIZE(c->usage); i++) + for (unsigned i = 0; i < ARRAY_SIZE(c->usage); i++) bch2_fs_usage_acc_to_base(c, i); } else { bch2_fs_usage_acc_to_base(c, journal_seq & JOURNAL_BUF_MASK); @@ -210,7 +207,7 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c, u->entry.type = BCH_JSET_ENTRY_usage; u->entry.btree_id = BCH_FS_USAGE_inodes; - u->v = cpu_to_le64(c->usage_base->nr_inodes); + u->v = cpu_to_le64(c->usage_base->b.nr_inodes); } { @@ -223,7 +220,7 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c, u->v = cpu_to_le64(atomic64_read(&c->key_version)); } - for (i = 0; i < BCH_REPLICAS_MAX; i++) { + for (unsigned i = 0; i < BCH_REPLICAS_MAX; i++) { struct jset_entry_usage *u = container_of(jset_entry_init(end, sizeof(*u)), struct jset_entry_usage, entry); @@ -234,8 +231,8 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c, u->v = cpu_to_le64(c->usage_base->persistent_reserved[i]); } - for (i = 0; i < c->replicas.nr; i++) { - struct bch_replicas_entry *e = + for (unsigned i = 0; i < c->replicas.nr; i++) { + struct bch_replicas_entry_v1 *e = cpu_replicas_entry(&c->replicas, i); struct jset_entry_data_usage *u = container_of(jset_entry_init(end, sizeof(*u) + e->nr_devs), @@ -247,7 +244,7 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c, "embedded variable length struct"); } - for_each_member_device(ca, c, dev) { + for_each_member_device(c, ca) { unsigned b = sizeof(struct jset_entry_dev_usage) + sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR; struct jset_entry_dev_usage *u = @@ -255,10 +252,9 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c, struct jset_entry_dev_usage, entry); u->entry.type = BCH_JSET_ENTRY_dev_usage; - u->dev = cpu_to_le32(dev); - u->buckets_ec = cpu_to_le64(ca->usage_base->buckets_ec); + u->dev = cpu_to_le32(ca->dev_idx); - for (i = 0; i < BCH_DATA_NR; i++) { + for (unsigned i = 0; i < BCH_DATA_NR; i++) { u->d[i].buckets = cpu_to_le64(ca->usage_base->d[i].buckets); u->d[i].sectors = cpu_to_le64(ca->usage_base->d[i].sectors); u->d[i].fragmented = cpu_to_le64(ca->usage_base->d[i].fragmented); @@ -267,7 +263,7 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c, percpu_up_read(&c->mark_lock); - for (i = 0; i < 2; i++) { + for (unsigned i = 0; i < 2; i++) { struct jset_entry_clock *clock = container_of(jset_entry_init(end, sizeof(*clock)), struct jset_entry_clock, entry); diff --git a/fs/bcachefs/counters.c b/fs/bcachefs/sb-counters.c index 02a996e06a64..7dc898761bb3 100644 --- a/fs/bcachefs/counters.c +++ b/fs/bcachefs/sb-counters.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" #include "super-io.h" -#include "counters.h" +#include "sb-counters.h" /* BCH_SB_FIELD_counters */ diff --git a/fs/bcachefs/counters.h b/fs/bcachefs/sb-counters.h index 4778aa19bf34..81f8aec9fcb1 100644 --- a/fs/bcachefs/counters.h +++ b/fs/bcachefs/sb-counters.h @@ -1,11 +1,10 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_COUNTERS_H -#define _BCACHEFS_COUNTERS_H +#ifndef _BCACHEFS_SB_COUNTERS_H +#define _BCACHEFS_SB_COUNTERS_H #include "bcachefs.h" #include "super-io.h" - int bch2_sb_counters_to_cpu(struct bch_fs *); int bch2_sb_counters_from_cpu(struct bch_fs *); @@ -14,4 +13,4 @@ int bch2_fs_counters_init(struct bch_fs *); extern const struct bch_sb_field_ops bch_sb_field_ops_counters; -#endif // _BCACHEFS_COUNTERS_H +#endif // _BCACHEFS_SB_COUNTERS_H diff --git a/fs/bcachefs/sb-counters_format.h b/fs/bcachefs/sb-counters_format.h new file mode 100644 index 000000000000..62ea478215d0 --- /dev/null +++ b/fs/bcachefs/sb-counters_format.h @@ -0,0 +1,98 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SB_COUNTERS_FORMAT_H +#define _BCACHEFS_SB_COUNTERS_FORMAT_H + +#define BCH_PERSISTENT_COUNTERS() \ + x(io_read, 0) \ + x(io_write, 1) \ + x(io_move, 2) \ + x(bucket_invalidate, 3) \ + x(bucket_discard, 4) \ + x(bucket_alloc, 5) \ + x(bucket_alloc_fail, 6) \ + x(btree_cache_scan, 7) \ + x(btree_cache_reap, 8) \ + x(btree_cache_cannibalize, 9) \ + x(btree_cache_cannibalize_lock, 10) \ + x(btree_cache_cannibalize_lock_fail, 11) \ + x(btree_cache_cannibalize_unlock, 12) \ + x(btree_node_write, 13) \ + x(btree_node_read, 14) \ + x(btree_node_compact, 15) \ + x(btree_node_merge, 16) \ + x(btree_node_split, 17) \ + x(btree_node_rewrite, 18) \ + x(btree_node_alloc, 19) \ + x(btree_node_free, 20) \ + x(btree_node_set_root, 21) \ + x(btree_path_relock_fail, 22) \ + x(btree_path_upgrade_fail, 23) \ + x(btree_reserve_get_fail, 24) \ + x(journal_entry_full, 25) \ + x(journal_full, 26) \ + x(journal_reclaim_finish, 27) \ + x(journal_reclaim_start, 28) \ + x(journal_write, 29) \ + x(read_promote, 30) \ + x(read_bounce, 31) \ + x(read_split, 33) \ + x(read_retry, 32) \ + x(read_reuse_race, 34) \ + x(move_extent_read, 35) \ + x(move_extent_write, 36) \ + x(move_extent_finish, 37) \ + x(move_extent_fail, 38) \ + x(move_extent_start_fail, 39) \ + x(copygc, 40) \ + x(copygc_wait, 41) \ + x(gc_gens_end, 42) \ + x(gc_gens_start, 43) \ + x(trans_blocked_journal_reclaim, 44) \ + x(trans_restart_btree_node_reused, 45) \ + x(trans_restart_btree_node_split, 46) \ + x(trans_restart_fault_inject, 47) \ + x(trans_restart_iter_upgrade, 48) \ + x(trans_restart_journal_preres_get, 49) \ + x(trans_restart_journal_reclaim, 50) \ + x(trans_restart_journal_res_get, 51) \ + x(trans_restart_key_cache_key_realloced, 52) \ + x(trans_restart_key_cache_raced, 53) \ + x(trans_restart_mark_replicas, 54) \ + x(trans_restart_mem_realloced, 55) \ + x(trans_restart_memory_allocation_failure, 56) \ + x(trans_restart_relock, 57) \ + x(trans_restart_relock_after_fill, 58) \ + x(trans_restart_relock_key_cache_fill, 59) \ + x(trans_restart_relock_next_node, 60) \ + x(trans_restart_relock_parent_for_fill, 61) \ + x(trans_restart_relock_path, 62) \ + x(trans_restart_relock_path_intent, 63) \ + x(trans_restart_too_many_iters, 64) \ + x(trans_restart_traverse, 65) \ + x(trans_restart_upgrade, 66) \ + x(trans_restart_would_deadlock, 67) \ + x(trans_restart_would_deadlock_write, 68) \ + x(trans_restart_injected, 69) \ + x(trans_restart_key_cache_upgrade, 70) \ + x(trans_traverse_all, 71) \ + x(transaction_commit, 72) \ + x(write_super, 73) \ + x(trans_restart_would_deadlock_recursion_limit, 74) \ + x(trans_restart_write_buffer_flush, 75) \ + x(trans_restart_split_race, 76) \ + x(write_buffer_flush_slowpath, 77) \ + x(write_buffer_flush_sync, 78) + +enum bch_persistent_counters { +#define x(t, n, ...) BCH_COUNTER_##t, + BCH_PERSISTENT_COUNTERS() +#undef x + BCH_COUNTER_NR +}; + +struct bch_sb_field_counters { + struct bch_sb_field field; + __le64 d[]; +}; + +#endif /* _BCACHEFS_SB_COUNTERS_FORMAT_H */ diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index 4919237bbe73..441dcb1bf160 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -12,33 +12,105 @@ #include "sb-errors.h" #include "super-io.h" +#define RECOVERY_PASS_ALL_FSCK BIT_ULL(63) + /* - * Downgrade table: - * When dowgrading past certain versions, we need to run certain recovery passes - * and fix certain errors: + * Upgrade, downgrade tables - run certain recovery passes, fix certain errors * * x(version, recovery_passes, errors...) */ +#define UPGRADE_TABLE() \ + x(backpointers, \ + RECOVERY_PASS_ALL_FSCK) \ + x(inode_v3, \ + RECOVERY_PASS_ALL_FSCK) \ + x(unwritten_extents, \ + RECOVERY_PASS_ALL_FSCK) \ + x(bucket_gens, \ + BIT_ULL(BCH_RECOVERY_PASS_bucket_gens_init)| \ + RECOVERY_PASS_ALL_FSCK) \ + x(lru_v2, \ + RECOVERY_PASS_ALL_FSCK) \ + x(fragmentation_lru, \ + RECOVERY_PASS_ALL_FSCK) \ + x(no_bps_in_alloc_keys, \ + RECOVERY_PASS_ALL_FSCK) \ + x(snapshot_trees, \ + RECOVERY_PASS_ALL_FSCK) \ + x(snapshot_skiplists, \ + BIT_ULL(BCH_RECOVERY_PASS_check_snapshots), \ + BCH_FSCK_ERR_snapshot_bad_depth, \ + BCH_FSCK_ERR_snapshot_bad_skiplist) \ + x(deleted_inodes, \ + BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \ + BCH_FSCK_ERR_unlinked_inode_not_on_deleted_list) \ + x(rebalance_work, \ + BIT_ULL(BCH_RECOVERY_PASS_set_fs_needs_rebalance)) #define DOWNGRADE_TABLE() -struct downgrade_entry { +struct upgrade_downgrade_entry { u64 recovery_passes; u16 version; u16 nr_errors; const u16 *errors; }; -#define x(ver, passes, ...) static const u16 ver_##errors[] = { __VA_ARGS__ }; +#define x(ver, passes, ...) static const u16 upgrade_##ver##_errors[] = { __VA_ARGS__ }; +UPGRADE_TABLE() +#undef x + +static const struct upgrade_downgrade_entry upgrade_table[] = { +#define x(ver, passes, ...) { \ + .recovery_passes = passes, \ + .version = bcachefs_metadata_version_##ver,\ + .nr_errors = ARRAY_SIZE(upgrade_##ver##_errors), \ + .errors = upgrade_##ver##_errors, \ +}, +UPGRADE_TABLE() +#undef x +}; + +void bch2_sb_set_upgrade(struct bch_fs *c, + unsigned old_version, + unsigned new_version) +{ + lockdep_assert_held(&c->sb_lock); + + struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); + + for (const struct upgrade_downgrade_entry *i = upgrade_table; + i < upgrade_table + ARRAY_SIZE(upgrade_table); + i++) + if (i->version > old_version && i->version <= new_version) { + u64 passes = i->recovery_passes; + + if (passes & RECOVERY_PASS_ALL_FSCK) + passes |= bch2_fsck_recovery_passes(); + passes &= ~RECOVERY_PASS_ALL_FSCK; + + ext->recovery_passes_required[0] |= + cpu_to_le64(bch2_recovery_passes_to_stable(passes)); + + for (const u16 *e = i->errors; + e < i->errors + i->nr_errors; + e++) { + __set_bit(*e, c->sb.errors_silent); + ext->errors_silent[*e / 64] |= cpu_to_le64(BIT_ULL(*e % 64)); + } + } +} + +#define x(ver, passes, ...) static const u16 downgrade_ver_##errors[] = { __VA_ARGS__ }; DOWNGRADE_TABLE() #undef x -static const struct downgrade_entry downgrade_table[] = { +static const struct upgrade_downgrade_entry downgrade_table[] = { #define x(ver, passes, ...) { \ .recovery_passes = passes, \ .version = bcachefs_metadata_version_##ver,\ - .nr_errors = ARRAY_SIZE(ver_##errors), \ - .errors = ver_##errors, \ + .nr_errors = ARRAY_SIZE(downgrade_##ver##_errors), \ + .errors = downgrade_##ver##_errors, \ }, DOWNGRADE_TABLE() #undef x @@ -118,7 +190,7 @@ int bch2_sb_downgrade_update(struct bch_fs *c) darray_char table = {}; int ret = 0; - for (const struct downgrade_entry *src = downgrade_table; + for (const struct upgrade_downgrade_entry *src = downgrade_table; src < downgrade_table + ARRAY_SIZE(downgrade_table); src++) { if (BCH_VERSION_MAJOR(src->version) != BCH_VERSION_MAJOR(le16_to_cpu(c->disk_sb.sb->version))) diff --git a/fs/bcachefs/sb-downgrade.h b/fs/bcachefs/sb-downgrade.h index bc48fd2ca70e..57e6c916fc73 100644 --- a/fs/bcachefs/sb-downgrade.h +++ b/fs/bcachefs/sb-downgrade.h @@ -5,6 +5,7 @@ extern const struct bch_sb_field_ops bch_sb_field_ops_downgrade; int bch2_sb_downgrade_update(struct bch_fs *); +void bch2_sb_set_upgrade(struct bch_fs *, unsigned, unsigned); void bch2_sb_set_downgrade(struct bch_fs *, unsigned, unsigned); #endif /* _BCACHEFS_SB_DOWNGRADE_H */ diff --git a/fs/bcachefs/sb-errors_types.h b/fs/bcachefs/sb-errors_types.h index 3504c2d09c29..c08aacdfd073 100644 --- a/fs/bcachefs/sb-errors_types.h +++ b/fs/bcachefs/sb-errors_types.h @@ -248,7 +248,9 @@ x(root_inode_not_dir, 240) \ x(dir_loop, 241) \ x(hash_table_key_duplicate, 242) \ - x(hash_table_key_wrong_offset, 243) + x(hash_table_key_wrong_offset, 243) \ + x(unlinked_inode_not_on_deleted_list, 244) \ + x(reflink_p_front_pad_bad, 245) enum bch_sb_error_id { #define x(t, n) BCH_FSCK_ERR_##t = n, diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c index bed0f857fe5b..a45354d2acde 100644 --- a/fs/bcachefs/sb-members.c +++ b/fs/bcachefs/sb-members.c @@ -235,6 +235,11 @@ static void member_to_text(struct printbuf *out, prt_printf(out, "(never)"); prt_newline(out); + prt_printf(out, "Last superblock write:"); + prt_tab(out); + prt_u64(out, le64_to_cpu(m.seq)); + prt_newline(out); + prt_printf(out, "State:"); prt_tab(out); prt_printf(out, "%s", @@ -246,7 +251,7 @@ static void member_to_text(struct printbuf *out, prt_printf(out, "Data allowed:"); prt_tab(out); if (BCH_MEMBER_DATA_ALLOWED(&m)) - prt_bitflags(out, bch2_data_types, BCH_MEMBER_DATA_ALLOWED(&m)); + prt_bitflags(out, __bch2_data_types, BCH_MEMBER_DATA_ALLOWED(&m)); else prt_printf(out, "(none)"); prt_newline(out); @@ -254,11 +259,16 @@ static void member_to_text(struct printbuf *out, prt_printf(out, "Has data:"); prt_tab(out); if (data_have) - prt_bitflags(out, bch2_data_types, data_have); + prt_bitflags(out, __bch2_data_types, data_have); else prt_printf(out, "(none)"); prt_newline(out); + prt_str(out, "Durability:"); + prt_tab(out); + prt_printf(out, "%llu", BCH_MEMBER_DURABILITY(&m) ? BCH_MEMBER_DURABILITY(&m) - 1 : 1); + prt_newline(out); + prt_printf(out, "Discard:"); prt_tab(out); prt_printf(out, "%llu", BCH_MEMBER_DISCARD(&m)); @@ -353,14 +363,12 @@ const struct bch_sb_field_ops bch_sb_field_ops_members_v2 = { void bch2_sb_members_from_cpu(struct bch_fs *c) { struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2); - struct bch_dev *ca; - unsigned i, e; rcu_read_lock(); - for_each_member_device_rcu(ca, c, i, NULL) { - struct bch_member *m = __bch2_members_v2_get_mut(mi, i); + for_each_member_device_rcu(c, ca, NULL) { + struct bch_member *m = __bch2_members_v2_get_mut(mi, ca->dev_idx); - for (e = 0; e < BCH_MEMBER_ERROR_NR; e++) + for (unsigned e = 0; e < BCH_MEMBER_ERROR_NR; e++) m->errors[e] = cpu_to_le64(atomic64_read(&ca->errors[e])); } rcu_read_unlock(); diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h index 03613e3eb8e3..be0a94183271 100644 --- a/fs/bcachefs/sb-members.h +++ b/fs/bcachefs/sb-members.h @@ -2,6 +2,8 @@ #ifndef _BCACHEFS_SB_MEMBERS_H #define _BCACHEFS_SB_MEMBERS_H +#include "darray.h" + extern char * const bch2_member_error_strs[]; static inline struct bch_member * @@ -47,23 +49,18 @@ static inline unsigned dev_mask_nr(const struct bch_devs_mask *devs) static inline bool bch2_dev_list_has_dev(struct bch_devs_list devs, unsigned dev) { - unsigned i; - - for (i = 0; i < devs.nr; i++) - if (devs.devs[i] == dev) + darray_for_each(devs, i) + if (*i == dev) return true; - return false; } static inline void bch2_dev_list_drop_dev(struct bch_devs_list *devs, unsigned dev) { - unsigned i; - - for (i = 0; i < devs->nr; i++) - if (devs->devs[i] == dev) { - array_remove_item(devs->devs, devs->nr, i); + darray_for_each(*devs, i) + if (*i == dev) { + darray_remove_item(devs, i); return; } } @@ -72,40 +69,48 @@ static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs, unsigned dev) { if (!bch2_dev_list_has_dev(*devs, dev)) { - BUG_ON(devs->nr >= ARRAY_SIZE(devs->devs)); - devs->devs[devs->nr++] = dev; + BUG_ON(devs->nr >= ARRAY_SIZE(devs->data)); + devs->data[devs->nr++] = dev; } } static inline struct bch_devs_list bch2_dev_list_single(unsigned dev) { - return (struct bch_devs_list) { .nr = 1, .devs[0] = dev }; + return (struct bch_devs_list) { .nr = 1, .data[0] = dev }; } -static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, unsigned *iter, - const struct bch_devs_mask *mask) +static inline struct bch_dev *__bch2_next_dev_idx(struct bch_fs *c, unsigned idx, + const struct bch_devs_mask *mask) { struct bch_dev *ca = NULL; - while ((*iter = mask - ? find_next_bit(mask->d, c->sb.nr_devices, *iter) - : *iter) < c->sb.nr_devices && - !(ca = rcu_dereference_check(c->devs[*iter], + while ((idx = mask + ? find_next_bit(mask->d, c->sb.nr_devices, idx) + : idx) < c->sb.nr_devices && + !(ca = rcu_dereference_check(c->devs[idx], lockdep_is_held(&c->state_lock)))) - (*iter)++; + idx++; return ca; } -#define for_each_member_device_rcu(ca, c, iter, mask) \ - for ((iter) = 0; ((ca) = __bch2_next_dev((c), &(iter), mask)); (iter)++) +static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, struct bch_dev *ca, + const struct bch_devs_mask *mask) +{ + return __bch2_next_dev_idx(c, ca ? ca->dev_idx + 1 : 0, mask); +} -static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, unsigned *iter) +#define for_each_member_device_rcu(_c, _ca, _mask) \ + for (struct bch_dev *_ca = NULL; \ + (_ca = __bch2_next_dev((_c), _ca, (_mask)));) + +static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, struct bch_dev *ca) { - struct bch_dev *ca; + if (ca) + percpu_ref_put(&ca->ref); rcu_read_lock(); - if ((ca = __bch2_next_dev(c, iter, NULL))) + if ((ca = __bch2_next_dev(c, ca, NULL))) percpu_ref_get(&ca->ref); rcu_read_unlock(); @@ -115,41 +120,42 @@ static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, unsigned *iter /* * If you break early, you must drop your ref on the current device */ -#define for_each_member_device(ca, c, iter) \ - for ((iter) = 0; \ - (ca = bch2_get_next_dev(c, &(iter))); \ - percpu_ref_put(&ca->ref), (iter)++) +#define __for_each_member_device(_c, _ca) \ + for (; (_ca = bch2_get_next_dev(_c, _ca));) + +#define for_each_member_device(_c, _ca) \ + for (struct bch_dev *_ca = NULL; \ + (_ca = bch2_get_next_dev(_c, _ca));) static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c, - unsigned *iter, - int state_mask) + struct bch_dev *ca, + unsigned state_mask) { - struct bch_dev *ca; + if (ca) + percpu_ref_put(&ca->io_ref); rcu_read_lock(); - while ((ca = __bch2_next_dev(c, iter, NULL)) && + while ((ca = __bch2_next_dev(c, ca, NULL)) && (!((1 << ca->mi.state) & state_mask) || !percpu_ref_tryget(&ca->io_ref))) - (*iter)++; + ; rcu_read_unlock(); return ca; } -#define __for_each_online_member(ca, c, iter, state_mask) \ - for ((iter) = 0; \ - (ca = bch2_get_next_online_dev(c, &(iter), state_mask)); \ - percpu_ref_put(&ca->io_ref), (iter)++) +#define __for_each_online_member(_c, _ca, state_mask) \ + for (struct bch_dev *_ca = NULL; \ + (_ca = bch2_get_next_online_dev(_c, _ca, state_mask));) -#define for_each_online_member(ca, c, iter) \ - __for_each_online_member(ca, c, iter, ~0) +#define for_each_online_member(c, ca) \ + __for_each_online_member(c, ca, ~0) -#define for_each_rw_member(ca, c, iter) \ - __for_each_online_member(ca, c, iter, 1 << BCH_MEMBER_STATE_rw) +#define for_each_rw_member(c, ca) \ + __for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw)) -#define for_each_readable_member(ca, c, iter) \ - __for_each_online_member(ca, c, iter, \ - (1 << BCH_MEMBER_STATE_rw)|(1 << BCH_MEMBER_STATE_ro)) +#define for_each_readable_member(c, ca) \ + __for_each_online_member(c, ca, BIT( BCH_MEMBER_STATE_rw)|BIT(BCH_MEMBER_STATE_ro)) /* * If a key exists that references a device, the device won't be going away and @@ -175,11 +181,9 @@ static inline struct bch_dev *bch_dev_locked(struct bch_fs *c, unsigned idx) static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c) { struct bch_devs_mask devs; - struct bch_dev *ca; - unsigned i; memset(&devs, 0, sizeof(devs)); - for_each_online_member(ca, c, i) + for_each_online_member(c, ca) __set_bit(ca->dev_idx, devs.d); return devs; } diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c index 97790445e67a..3a494c5d1247 100644 --- a/fs/bcachefs/six.c +++ b/fs/bcachefs/six.c @@ -324,101 +324,57 @@ bool six_relock_ip(struct six_lock *lock, enum six_lock_type type, } EXPORT_SYMBOL_GPL(six_relock_ip); -#ifdef CONFIG_SIX_LOCK_SPIN_ON_OWNER +#ifdef CONFIG_BCACHEFS_SIX_OPTIMISTIC_SPIN -static inline bool six_can_spin_on_owner(struct six_lock *lock) +static inline bool six_owner_running(struct six_lock *lock) { - struct task_struct *owner; - bool ret; - - if (need_resched()) - return false; - + /* + * When there's no owner, we might have preempted between the owner + * acquiring the lock and setting the owner field. If we're an RT task + * that will live-lock because we won't let the owner complete. + */ rcu_read_lock(); - owner = READ_ONCE(lock->owner); - ret = !owner || owner_on_cpu(owner); + struct task_struct *owner = READ_ONCE(lock->owner); + bool ret = owner ? owner_on_cpu(owner) : !rt_task(current); rcu_read_unlock(); return ret; } -static inline bool six_spin_on_owner(struct six_lock *lock, - struct task_struct *owner, - u64 end_time) +static inline bool six_optimistic_spin(struct six_lock *lock, + struct six_lock_waiter *wait, + enum six_lock_type type) { - bool ret = true; unsigned loop = 0; - - rcu_read_lock(); - while (lock->owner == owner) { - /* - * Ensure we emit the owner->on_cpu, dereference _after_ - * checking lock->owner still matches owner. If that fails, - * owner might point to freed memory. If it still matches, - * the rcu_read_lock() ensures the memory stays valid. - */ - barrier(); - - if (!owner_on_cpu(owner) || need_resched()) { - ret = false; - break; - } - - if (!(++loop & 0xf) && (time_after64(sched_clock(), end_time))) { - six_set_bitmask(lock, SIX_LOCK_NOSPIN); - ret = false; - break; - } - - cpu_relax(); - } - rcu_read_unlock(); - - return ret; -} - -static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type) -{ - struct task_struct *task = current; u64 end_time; if (type == SIX_LOCK_write) return false; - preempt_disable(); - if (!six_can_spin_on_owner(lock)) - goto fail; + if (lock->wait_list.next != &wait->list) + return false; - if (!osq_lock(&lock->osq)) - goto fail; + if (atomic_read(&lock->state) & SIX_LOCK_NOSPIN) + return false; + preempt_disable(); end_time = sched_clock() + 10 * NSEC_PER_USEC; - while (1) { - struct task_struct *owner; - + while (!need_resched() && six_owner_running(lock)) { /* - * If there's an owner, wait for it to either - * release the lock or go to sleep. + * Ensures that writes to the waitlist entry happen after we see + * wait->lock_acquired: pairs with the smp_store_release in + * __six_lock_wakeup */ - owner = READ_ONCE(lock->owner); - if (owner && !six_spin_on_owner(lock, owner, end_time)) - break; - - if (do_six_trylock(lock, type, false)) { - osq_unlock(&lock->osq); + if (smp_load_acquire(&wait->lock_acquired)) { preempt_enable(); return true; } - /* - * When there's no owner, we might have preempted between the - * owner acquiring the lock and setting the owner field. If - * we're an RT task that will live-lock because we won't let - * the owner complete. - */ - if (!owner && (need_resched() || rt_task(task))) + if (!(++loop & 0xf) && (time_after64(sched_clock(), end_time))) { + six_set_bitmask(lock, SIX_LOCK_NOSPIN); break; + } /* * The cpu_relax() call is a compiler barrier which forces @@ -429,24 +385,15 @@ static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type cpu_relax(); } - osq_unlock(&lock->osq); -fail: preempt_enable(); - - /* - * If we fell out of the spin path because of need_resched(), - * reschedule now, before we try-lock again. This avoids getting - * scheduled out right after we obtained the lock. - */ - if (need_resched()) - schedule(); - return false; } -#else /* CONFIG_SIX_LOCK_SPIN_ON_OWNER */ +#else /* CONFIG_LOCK_SPIN_ON_OWNER */ -static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type) +static inline bool six_optimistic_spin(struct six_lock *lock, + struct six_lock_waiter *wait, + enum six_lock_type type) { return false; } @@ -470,9 +417,6 @@ static int six_lock_slowpath(struct six_lock *lock, enum six_lock_type type, trace_contention_begin(lock, 0); lock_contended(&lock->dep_map, ip); - if (six_optimistic_spin(lock, type)) - goto out; - wait->task = current; wait->lock_want = type; wait->lock_acquired = false; @@ -510,6 +454,9 @@ static int six_lock_slowpath(struct six_lock *lock, enum six_lock_type type, ret = 0; } + if (six_optimistic_spin(lock, wait, type)) + goto out; + while (1) { set_current_state(TASK_UNINTERRUPTIBLE); diff --git a/fs/bcachefs/six.h b/fs/bcachefs/six.h index 4c268b0b8316..68d46fd7f391 100644 --- a/fs/bcachefs/six.h +++ b/fs/bcachefs/six.h @@ -15,7 +15,7 @@ * will have to take write locks for the full duration of the operation. * * But by adding an intent state, which is exclusive with other intent locks but - * not with readers, we can take intent locks at thte start of the operation, + * not with readers, we can take intent locks at the start of the operation, * and then take write locks only for the actual update to each individual * nodes, without deadlocking. * @@ -65,8 +65,8 @@ * * Reentrancy: * - * Six locks are not by themselves reentrent, but have counters for both the - * read and intent states that can be used to provide reentrency by an upper + * Six locks are not by themselves reentrant, but have counters for both the + * read and intent states that can be used to provide reentrancy by an upper * layer that tracks held locks. If a lock is known to already be held in the * read or intent state, six_lock_increment() can be used to bump the "lock * held in this state" counter, increasing the number of unlock calls that @@ -127,10 +127,6 @@ #include <linux/sched.h> #include <linux/types.h> -#ifdef CONFIG_SIX_LOCK_SPIN_ON_OWNER -#include <linux/osq_lock.h> -#endif - enum six_lock_type { SIX_LOCK_read, SIX_LOCK_intent, @@ -143,9 +139,6 @@ struct six_lock { unsigned intent_lock_recurse; struct task_struct *owner; unsigned __percpu *readers; -#ifdef CONFIG_SIX_LOCK_SPIN_ON_OWNER - struct optimistic_spin_queue osq; -#endif raw_spinlock_t wait_lock; struct list_head wait_list; #ifdef CONFIG_DEBUG_LOCK_ALLOC diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 5dac038f0851..45f67e8b29eb 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -123,7 +123,7 @@ bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) struct snapshot_table *t; bool ret; - EBUG_ON(c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_snapshots); + EBUG_ON(c->recovery_pass_done <= BCH_RECOVERY_PASS_check_snapshots); rcu_read_lock(); t = rcu_dereference(c->snapshots); @@ -276,7 +276,7 @@ static void set_is_ancestor_bitmap(struct bch_fs *c, u32 id) mutex_unlock(&c->snapshot_table_lock); } -int bch2_mark_snapshot(struct btree_trans *trans, +static int __bch2_mark_snapshot(struct btree_trans *trans, enum btree_id btree, unsigned level, struct bkey_s_c old, struct bkey_s_c new, unsigned flags) @@ -318,7 +318,7 @@ int bch2_mark_snapshot(struct btree_trans *trans, __set_is_ancestor_bitmap(c, id); if (BCH_SNAPSHOT_DELETED(s.v)) { - set_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags); + set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags); if (c->curr_recovery_pass > BCH_RECOVERY_PASS_delete_dead_snapshots) bch2_delete_dead_snapshots_async(c); } @@ -330,6 +330,14 @@ err: return ret; } +int bch2_mark_snapshot(struct btree_trans *trans, + enum btree_id btree, unsigned level, + struct bkey_s_c old, struct bkey_s new, + unsigned flags) +{ + return __bch2_mark_snapshot(trans, btree, level, old, new.s_c, flags); +} + int bch2_snapshot_lookup(struct btree_trans *trans, u32 id, struct bch_snapshot *s) { @@ -459,7 +467,6 @@ static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans, struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_s_c k; - struct bkey_s_c_subvolume s; bool found = false; int ret; @@ -468,7 +475,7 @@ static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans, if (k.k->type != KEY_TYPE_subvolume) continue; - s = bkey_s_c_to_subvolume(k); + struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k); if (!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.v->snapshot), snapshot_root)) continue; if (!BCH_SUBVOLUME_SNAP(s.v)) { @@ -582,19 +589,13 @@ fsck_err: */ int bch2_check_snapshot_trees(struct bch_fs *c) { - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - ret = bch2_trans_run(c, + int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_snapshot_trees, POS_MIN, BTREE_ITER_PREFETCH, k, - NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, check_snapshot_tree(trans, &iter, k))); - - if (ret) - bch_err(c, "error %i checking snapshot trees", ret); + bch_err_fn(c, ret); return ret; } @@ -813,11 +814,10 @@ static int check_snapshot(struct btree_trans *trans, real_depth = bch2_snapshot_depth(c, parent_id); - if (le32_to_cpu(s.depth) != real_depth && - (c->sb.version_upgrade_complete < bcachefs_metadata_version_snapshot_skiplists || - fsck_err(c, snapshot_bad_depth, - "snapshot with incorrect depth field, should be %u:\n %s", - real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) { + if (fsck_err_on(le32_to_cpu(s.depth) != real_depth, + c, snapshot_bad_depth, + "snapshot with incorrect depth field, should be %u:\n %s", + real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); ret = PTR_ERR_OR_ZERO(u); if (ret) @@ -831,11 +831,9 @@ static int check_snapshot(struct btree_trans *trans, if (ret < 0) goto err; - if (!ret && - (c->sb.version_upgrade_complete < bcachefs_metadata_version_snapshot_skiplists || - fsck_err(c, snapshot_bad_skiplist, - "snapshot with bad skiplist field:\n %s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) { + if (fsck_err_on(!ret, c, snapshot_bad_skiplist, + "snapshot with bad skiplist field:\n %s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); ret = PTR_ERR_OR_ZERO(u); if (ret) @@ -856,22 +854,17 @@ fsck_err: int bch2_check_snapshots(struct bch_fs *c) { - struct btree_iter iter; - struct bkey_s_c k; - int ret; - /* * We iterate backwards as checking/fixing the depth field requires that * the parent's depth already be correct: */ - ret = bch2_trans_run(c, + int ret = bch2_trans_run(c, for_each_btree_key_reverse_commit(trans, iter, - BTREE_ID_snapshots, POS_MAX, - BTREE_ITER_PREFETCH, k, - NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, - check_snapshot(trans, &iter, k))); - if (ret) - bch_err_fn(c, ret); + BTREE_ID_snapshots, POS_MAX, + BTREE_ITER_PREFETCH, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + check_snapshot(trans, &iter, k))); + bch_err_fn(c, ret); return ret; } @@ -1060,6 +1053,8 @@ static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree, n->v.subvol = cpu_to_le32(snapshot_subvols[i]); n->v.tree = cpu_to_le32(tree); n->v.depth = cpu_to_le32(depth); + n->v.btime.lo = cpu_to_le64(bch2_current_time(c)); + n->v.btime.hi = 0; for (j = 0; j < ARRAY_SIZE(n->v.skip); j++) n->v.skip[j] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent)); @@ -1067,7 +1062,7 @@ static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree, bubble_sort(n->v.skip, ARRAY_SIZE(n->v.skip), cmp_le32); SET_BCH_SNAPSHOT_SUBVOL(&n->v, true); - ret = bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, + ret = __bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0); if (ret) goto err; @@ -1315,7 +1310,6 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans, struct bch_fs *c = trans->c; u32 nr_deleted_ancestors = 0; struct bkey_i_snapshot *s; - u32 *i; int ret; if (k.k->type != KEY_TYPE_snapshot) @@ -1368,23 +1362,19 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans, int bch2_delete_dead_snapshots(struct bch_fs *c) { struct btree_trans *trans; - struct btree_iter iter; - struct bkey_s_c k; - struct bkey_s_c_snapshot snap; snapshot_id_list deleted = { 0 }; snapshot_id_list deleted_interior = { 0 }; - u32 *i, id; + u32 id; int ret = 0; - if (!test_and_clear_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags)) + if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags)) return 0; - if (!test_bit(BCH_FS_STARTED, &c->flags)) { + if (!test_bit(BCH_FS_started, &c->flags)) { ret = bch2_fs_read_write_early(c); - if (ret) { - bch_err_msg(c, ret, "deleting dead snapshots: error going rw"); + bch_err_msg(c, ret, "deleting dead snapshots: error going rw"); + if (ret) return ret; - } } trans = bch2_trans_get(c); @@ -1397,37 +1387,29 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) POS_MIN, 0, k, NULL, NULL, 0, bch2_delete_redundant_snapshot(trans, k)); - if (ret) { - bch_err_msg(c, ret, "deleting redundant snapshots"); + bch_err_msg(c, ret, "deleting redundant snapshots"); + if (ret) goto err; - } - ret = for_each_btree_key2(trans, iter, BTREE_ID_snapshots, - POS_MIN, 0, k, + ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, bch2_snapshot_set_equiv(trans, k)); - if (ret) { - bch_err_msg(c, ret, "in bch2_snapshots_set_equiv"); + bch_err_msg(c, ret, "in bch2_snapshots_set_equiv"); + if (ret) goto err; - } - for_each_btree_key(trans, iter, BTREE_ID_snapshots, - POS_MIN, 0, k, ret) { + ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, ({ if (k.k->type != KEY_TYPE_snapshot) continue; - snap = bkey_s_c_to_snapshot(k); - if (BCH_SNAPSHOT_DELETED(snap.v)) { - ret = snapshot_list_add(c, &deleted, k.k->p.offset); - if (ret) - break; - } - } - bch2_trans_iter_exit(trans, &iter); - - if (ret) { - bch_err_msg(c, ret, "walking snapshots"); + BCH_SNAPSHOT_DELETED(bkey_s_c_to_snapshot(k).v) + ? snapshot_list_add(c, &deleted, k.k->p.offset) + : 0; + })); + bch_err_msg(c, ret, "walking snapshots"); + if (ret) goto err; - } for (id = 0; id < BTREE_ID_NR; id++) { struct bpos last_pos = POS_MIN; @@ -1449,36 +1431,36 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) ret = for_each_btree_key_commit(trans, iter, id, POS_MIN, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, - &res, NULL, BTREE_INSERT_NOFAIL, + &res, NULL, BCH_TRANS_COMMIT_no_enospc, snapshot_delete_key(trans, &iter, k, &deleted, &equiv_seen, &last_pos)) ?: for_each_btree_key_commit(trans, iter, id, POS_MIN, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, - &res, NULL, BTREE_INSERT_NOFAIL, + &res, NULL, BCH_TRANS_COMMIT_no_enospc, move_key_to_correct_snapshot(trans, &iter, k)); bch2_disk_reservation_put(c, &res); darray_exit(&equiv_seen); - if (ret) { - bch_err_msg(c, ret, "deleting keys from dying snapshots"); + bch_err_msg(c, ret, "deleting keys from dying snapshots"); + if (ret) goto err; - } } bch2_trans_unlock(trans); down_write(&c->snapshot_create_lock); - for_each_btree_key(trans, iter, BTREE_ID_snapshots, - POS_MIN, 0, k, ret) { + ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, ({ u32 snapshot = k.k->p.offset; u32 equiv = bch2_snapshot_equiv(c, snapshot); - if (equiv != snapshot) - snapshot_list_add(c, &deleted_interior, snapshot); - } - bch2_trans_iter_exit(trans, &iter); + equiv != snapshot + ? snapshot_list_add(c, &deleted_interior, snapshot) + : 0; + })); + bch_err_msg(c, ret, "walking snapshots"); if (ret) goto err_create_lock; @@ -1489,7 +1471,7 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) */ ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN, BTREE_ITER_INTENT, k, - NULL, NULL, BTREE_INSERT_NOFAIL, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &deleted_interior)); if (ret) goto err_create_lock; @@ -1497,19 +1479,17 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) darray_for_each(deleted, i) { ret = commit_do(trans, NULL, NULL, 0, bch2_snapshot_node_delete(trans, *i)); - if (ret) { - bch_err_msg(c, ret, "deleting snapshot %u", *i); + bch_err_msg(c, ret, "deleting snapshot %u", *i); + if (ret) goto err_create_lock; - } } darray_for_each(deleted_interior, i) { ret = commit_do(trans, NULL, NULL, 0, bch2_snapshot_node_delete(trans, *i)); - if (ret) { - bch_err_msg(c, ret, "deleting snapshot %u", *i); + bch_err_msg(c, ret, "deleting snapshot %u", *i); + if (ret) goto err_create_lock; - } } err_create_lock: up_write(&c->snapshot_create_lock); @@ -1517,8 +1497,7 @@ err: darray_exit(&deleted_interior); darray_exit(&deleted); bch2_trans_put(trans); - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; } @@ -1680,7 +1659,7 @@ static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct if (BCH_SNAPSHOT_DELETED(snap.v) || bch2_snapshot_equiv(c, k.k->p.offset) != k.k->p.offset || (ret = bch2_snapshot_needs_delete(trans, k)) > 0) { - set_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags); + set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags); return 0; } @@ -1689,25 +1668,20 @@ static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct int bch2_snapshots_read(struct bch_fs *c) { - struct btree_iter iter; - struct bkey_s_c k; - int ret = 0; - - ret = bch2_trans_run(c, - for_each_btree_key2(trans, iter, BTREE_ID_snapshots, - POS_MIN, 0, k, - bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?: + int ret = bch2_trans_run(c, + for_each_btree_key(trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, + __bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?: bch2_snapshot_set_equiv(trans, k) ?: bch2_check_snapshot_needs_deletion(trans, k)) ?: - for_each_btree_key2(trans, iter, BTREE_ID_snapshots, - POS_MIN, 0, k, + for_each_btree_key(trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, (set_is_ancestor_bitmap(c, k.k->p.offset), 0))); - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; } void bch2_fs_snapshots_exit(struct bch_fs *c) { - kfree(rcu_dereference_protected(c->snapshots, true)); + kvfree(rcu_dereference_protected(c->snapshots, true)); } diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h index f09a22f44239..7c66ffc06385 100644 --- a/fs/bcachefs/snapshot.h +++ b/fs/bcachefs/snapshot.h @@ -22,12 +22,12 @@ void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_snapshot_invalid(struct bch_fs *, struct bkey_s_c, enum bkey_invalid_flags, struct printbuf *); int bch2_mark_snapshot(struct btree_trans *, enum btree_id, unsigned, - struct bkey_s_c, struct bkey_s_c, unsigned); + struct bkey_s_c, struct bkey_s, unsigned); #define bch2_bkey_ops_snapshot ((struct bkey_ops) { \ .key_invalid = bch2_snapshot_invalid, \ .val_to_text = bch2_snapshot_to_text, \ - .atomic_trigger = bch2_mark_snapshot, \ + .trigger = bch2_mark_snapshot, \ .min_val_size = 24, \ }) @@ -202,8 +202,6 @@ static inline bool bch2_snapshot_has_children(struct bch_fs *c, u32 id) static inline bool snapshot_list_has_id(snapshot_id_list *s, u32 id) { - u32 *i; - darray_for_each(*s, i) if (*i == id) return true; @@ -212,8 +210,6 @@ static inline bool snapshot_list_has_id(snapshot_id_list *s, u32 id) static inline bool snapshot_list_has_ancestor(struct bch_fs *c, snapshot_id_list *s, u32 id) { - u32 *i; - darray_for_each(*s, i) if (bch2_snapshot_is_ancestor(c, id, *i)) return true; diff --git a/fs/bcachefs/snapshot_format.h b/fs/bcachefs/snapshot_format.h new file mode 100644 index 000000000000..aabcd3a74cd9 --- /dev/null +++ b/fs/bcachefs/snapshot_format.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SNAPSHOT_FORMAT_H +#define _BCACHEFS_SNAPSHOT_FORMAT_H + +struct bch_snapshot { + struct bch_val v; + __le32 flags; + __le32 parent; + __le32 children[2]; + __le32 subvol; + /* corresponds to a bch_snapshot_tree in BTREE_ID_snapshot_trees */ + __le32 tree; + __le32 depth; + __le32 skip[3]; + bch_le128 btime; +}; + +LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 0, 1) + +/* True if a subvolume points to this snapshot node: */ +LE32_BITMASK(BCH_SNAPSHOT_SUBVOL, struct bch_snapshot, flags, 1, 2) + +/* + * Snapshot trees: + * + * The snapshot_trees btree gives us persistent indentifier for each tree of + * bch_snapshot nodes, and allow us to record and easily find the root/master + * subvolume that other snapshots were created from: + */ +struct bch_snapshot_tree { + struct bch_val v; + __le32 master_subvol; + __le32 root_snapshot; +}; + +#endif /* _BCACHEFS_SNAPSHOT_FORMAT_H */ diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index ae21a8cca1b4..fcaa5a888744 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -15,6 +15,16 @@ #include <crypto/hash.h> #include <crypto/sha2.h> +typedef unsigned __bitwise bch_str_hash_flags_t; + +enum bch_str_hash_flags { + __BCH_HASH_SET_MUST_CREATE, + __BCH_HASH_SET_MUST_REPLACE, +}; + +#define BCH_HASH_SET_MUST_CREATE (__force bch_str_hash_flags_t) BIT(__BCH_HASH_SET_MUST_CREATE) +#define BCH_HASH_SET_MUST_REPLACE (__force bch_str_hash_flags_t) BIT(__BCH_HASH_SET_MUST_REPLACE) + static inline enum bch_str_hash_type bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt) { @@ -150,21 +160,16 @@ static inline bool is_visible_key(struct bch_hash_desc desc, subvol_inum inum, s } static __always_inline int -bch2_hash_lookup(struct btree_trans *trans, +bch2_hash_lookup_in_snapshot(struct btree_trans *trans, struct btree_iter *iter, const struct bch_hash_desc desc, const struct bch_hash_info *info, subvol_inum inum, const void *key, - unsigned flags) + unsigned flags, u32 snapshot) { struct bkey_s_c k; - u32 snapshot; int ret; - ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); - if (ret) - return ret; - for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id, SPOS(inum.inum, desc.hash_key(info, key), snapshot), POS(inum.inum, U64_MAX), @@ -185,6 +190,19 @@ bch2_hash_lookup(struct btree_trans *trans, } static __always_inline int +bch2_hash_lookup(struct btree_trans *trans, + struct btree_iter *iter, + const struct bch_hash_desc desc, + const struct bch_hash_info *info, + subvol_inum inum, const void *key, + unsigned flags) +{ + u32 snapshot; + return bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot) ?: + bch2_hash_lookup_in_snapshot(trans, iter, desc, info, inum, key, flags, snapshot); +} + +static __always_inline int bch2_hash_hole(struct btree_trans *trans, struct btree_iter *iter, const struct bch_hash_desc desc, @@ -246,7 +264,7 @@ int bch2_hash_set_snapshot(struct btree_trans *trans, const struct bch_hash_info *info, subvol_inum inum, u32 snapshot, struct bkey_i *insert, - int flags, + bch_str_hash_flags_t str_hash_flags, int update_flags) { struct btree_iter iter, slot = { NULL }; @@ -269,7 +287,7 @@ int bch2_hash_set_snapshot(struct btree_trans *trans, } if (!slot.path && - !(flags & BCH_HASH_SET_MUST_REPLACE)) + !(str_hash_flags & BCH_HASH_SET_MUST_REPLACE)) bch2_trans_copy_iter(&slot, &iter); if (k.k->type != KEY_TYPE_hash_whiteout) @@ -287,16 +305,16 @@ found: found = true; not_found: - if (!found && (flags & BCH_HASH_SET_MUST_REPLACE)) { + if (!found && (str_hash_flags & BCH_HASH_SET_MUST_REPLACE)) { ret = -BCH_ERR_ENOENT_str_hash_set_must_replace; - } else if (found && (flags & BCH_HASH_SET_MUST_CREATE)) { + } else if (found && (str_hash_flags & BCH_HASH_SET_MUST_CREATE)) { ret = -EEXIST; } else { if (!found && slot.path) swap(iter, slot); insert->k.p = iter.pos; - ret = bch2_trans_update(trans, &iter, insert, 0); + ret = bch2_trans_update(trans, &iter, insert, update_flags); } goto out; @@ -307,7 +325,8 @@ int bch2_hash_set(struct btree_trans *trans, const struct bch_hash_desc desc, const struct bch_hash_info *info, subvol_inum inum, - struct bkey_i *insert, int flags) + struct bkey_i *insert, + bch_str_hash_flags_t str_hash_flags) { u32 snapshot; int ret; @@ -319,7 +338,7 @@ int bch2_hash_set(struct btree_trans *trans, insert->k.p.inode = inum.inum; return bch2_hash_set_snapshot(trans, desc, info, inum, - snapshot, insert, flags, 0); + snapshot, insert, str_hash_flags, 0); } static __always_inline diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index 22b34a8e4d6e..7c67c28d3ef8 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -37,11 +37,8 @@ static int check_subvol(struct btree_trans *trans, return ret; if (BCH_SUBVOLUME_UNLINKED(subvol.v)) { - bch2_fs_lazy_rw(c); - ret = bch2_subvolume_delete(trans, iter->pos.offset); - if (ret) - bch_err_msg(c, ret, "deleting subvolume %llu", iter->pos.offset); + bch_err_msg(c, ret, "deleting subvolume %llu", iter->pos.offset); return ret ?: -BCH_ERR_transaction_restart_nested; } @@ -82,17 +79,12 @@ fsck_err: int bch2_check_subvols(struct bch_fs *c) { - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - ret = bch2_trans_run(c, + int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, - BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k, - NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, - check_subvol(trans, &iter, k))); - if (ret) - bch_err_fn(c, ret); + BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + check_subvol(trans, &iter, k))); + bch_err_fn(c, ret); return ret; } @@ -228,8 +220,6 @@ static int bch2_subvolume_reparent(struct btree_trans *trans, */ static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_delete) { - struct btree_iter iter; - struct bkey_s_c k; struct bch_subvolume s; return lockrestart_do(trans, @@ -237,7 +227,7 @@ static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_d BTREE_ITER_CACHED, &s)) ?: for_each_btree_key_commit(trans, iter, BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k, - NULL, NULL, BTREE_INSERT_NOFAIL, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_subvolume_reparent(trans, &iter, k, subvolid_to_delete, le32_to_cpu(s.parent))); } @@ -274,7 +264,7 @@ static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid) static int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid) { return bch2_subvolumes_reparent(trans, subvolid) ?: - commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, + commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, __bch2_subvolume_delete(trans, subvolid)); } @@ -299,10 +289,9 @@ static void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *wor for (id = s.data; id < s.data + s.nr; id++) { ret = bch2_trans_run(c, bch2_subvolume_delete(trans, *id)); - if (ret) { - bch_err_msg(c, ret, "deleting subvolume %u", *id); + bch_err_msg(c, ret, "deleting subvolume %u", *id); + if (ret) break; - } } darray_exit(&s); diff --git a/fs/bcachefs/subvolume_format.h b/fs/bcachefs/subvolume_format.h new file mode 100644 index 000000000000..af79134b07d6 --- /dev/null +++ b/fs/bcachefs/subvolume_format.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SUBVOLUME_FORMAT_H +#define _BCACHEFS_SUBVOLUME_FORMAT_H + +#define SUBVOL_POS_MIN POS(0, 1) +#define SUBVOL_POS_MAX POS(0, S32_MAX) +#define BCACHEFS_ROOT_SUBVOL 1 + +struct bch_subvolume { + struct bch_val v; + __le32 flags; + __le32 snapshot; + __le64 inode; + /* + * Snapshot subvolumes form a tree, separate from the snapshot nodes + * tree - if this subvolume is a snapshot, this is the ID of the + * subvolume it was created from: + * + * This is _not_ necessarily the subvolume of the directory containing + * this subvolume: + */ + __le32 parent; + __le32 pad; + bch_le128 otime; +}; + +LE32_BITMASK(BCH_SUBVOLUME_RO, struct bch_subvolume, flags, 0, 1) +/* + * We need to know whether a subvolume is a snapshot so we can know whether we + * can delete it (or whether it should just be rm -rf'd) + */ +LE32_BITMASK(BCH_SUBVOLUME_SNAP, struct bch_subvolume, flags, 1, 2) +LE32_BITMASK(BCH_SUBVOLUME_UNLINKED, struct bch_subvolume, flags, 2, 3) + +#endif /* _BCACHEFS_SUBVOLUME_FORMAT_H */ diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h index 2d2e66a4e468..ae644adfc391 100644 --- a/fs/bcachefs/subvolume_types.h +++ b/fs/bcachefs/subvolume_types.h @@ -20,7 +20,11 @@ struct snapshot_t { }; struct snapshot_table { +#ifndef RUST_BINDGEN DECLARE_FLEX_ARRAY(struct snapshot_t, s); +#else + struct snapshot_t s[0]; +#endif }; typedef struct { diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 78013deda9df..d60c7d27a047 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -2,7 +2,6 @@ #include "bcachefs.h" #include "checksum.h" -#include "counters.h" #include "disk_groups.h" #include "ec.h" #include "error.h" @@ -13,6 +12,7 @@ #include "replicas.h" #include "quota.h" #include "sb-clean.h" +#include "sb-counters.h" #include "sb-downgrade.h" #include "sb-errors.h" #include "sb-members.h" @@ -30,14 +30,12 @@ static const struct blk_holder_ops bch2_sb_handle_bdev_ops = { struct bch2_metadata_version { u16 version; const char *name; - u64 recovery_passes; }; static const struct bch2_metadata_version bch2_metadata_versions[] = { -#define x(n, v, _recovery_passes) { \ +#define x(n, v) { \ .version = v, \ .name = #n, \ - .recovery_passes = _recovery_passes, \ }, BCH_METADATA_VERSIONS() #undef x @@ -70,24 +68,6 @@ unsigned bch2_latest_compatible_version(unsigned v) return v; } -u64 bch2_upgrade_recovery_passes(struct bch_fs *c, - unsigned old_version, - unsigned new_version) -{ - u64 ret = 0; - - for (const struct bch2_metadata_version *i = bch2_metadata_versions; - i < bch2_metadata_versions + ARRAY_SIZE(bch2_metadata_versions); - i++) - if (i->version > old_version && i->version <= new_version) { - if (i->recovery_passes & RECOVERY_PASS_ALL_FSCK) - ret |= bch2_fsck_recovery_passes(); - ret |= i->recovery_passes; - } - - return ret &= ~RECOVERY_PASS_ALL_FSCK; -} - const char * const bch2_sb_fields[] = { #define x(name, nr) #name, BCH_SB_FIELDS() @@ -101,8 +81,6 @@ static int bch2_sb_field_validate(struct bch_sb *, struct bch_sb_field *, struct bch_sb_field *bch2_sb_field_get_id(struct bch_sb *sb, enum bch_sb_field_type type) { - struct bch_sb_field *f; - /* XXX: need locking around superblock to access optional fields */ vstruct_for_each(sb, f) @@ -192,8 +170,12 @@ int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s) u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits; if (new_bytes > max_bytes) { - pr_err("%pg: superblock too big: want %zu but have %llu", - sb->bdev, new_bytes, max_bytes); + struct printbuf buf = PRINTBUF; + + prt_bdevname(&buf, sb->bdev); + prt_printf(&buf, ": superblock too big: want %zu but have %llu", new_bytes, max_bytes); + pr_err("%s", buf.buf); + printbuf_exit(&buf); return -BCH_ERR_ENOSPC_sb; } } @@ -241,14 +223,12 @@ struct bch_sb_field *bch2_sb_field_resize_id(struct bch_sb_handle *sb, if (sb->fs_sb) { struct bch_fs *c = container_of(sb, struct bch_fs, disk_sb); - struct bch_dev *ca; - unsigned i; lockdep_assert_held(&c->sb_lock); /* XXX: we're not checking that offline device have enough space */ - for_each_online_member(ca, c, i) { + for_each_online_member(c, ca) { struct bch_sb_handle *dev_sb = &ca->disk_sb; if (bch2_sb_realloc(dev_sb, le32_to_cpu(dev_sb->sb->u64s) + d)) { @@ -368,7 +348,6 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out, int rw) { struct bch_sb *sb = disk_sb->sb; - struct bch_sb_field *f; struct bch_sb_field_members_v1 *mi; enum bch_opt_id opt_id; u16 block_size; @@ -514,8 +493,6 @@ static void le_bitvector_to_cpu(unsigned long *dst, unsigned long *src, unsigned static void bch2_sb_update(struct bch_fs *c) { struct bch_sb *src = c->disk_sb.sb; - struct bch_dev *ca; - unsigned i; lockdep_assert_held(&c->sb_lock); @@ -546,7 +523,7 @@ static void bch2_sb_update(struct bch_fs *c) le_bitvector_to_cpu(c->sb.errors_silent, (void *) ext->errors_silent, sizeof(c->sb.errors_silent) * 8); - for_each_member_device(ca, c, i) { + for_each_member_device(c, ca) { struct bch_member m = bch2_sb_member_get(src, ca->dev_idx); ca->mi = bch2_mi_to_cpu(&m); } @@ -571,6 +548,7 @@ static int __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src) dst->time_base_lo = src->time_base_lo; dst->time_base_hi = src->time_base_hi; dst->time_precision = src->time_precision; + dst->write_time = src->write_time; memcpy(dst->flags, src->flags, sizeof(dst->flags)); memcpy(dst->features, src->features, sizeof(dst->features)); @@ -634,7 +612,6 @@ int bch2_sb_from_fs(struct bch_fs *c, struct bch_dev *ca) static int read_one_super(struct bch_sb_handle *sb, u64 offset, struct printbuf *err) { - struct bch_csum csum; size_t bytes; int ret; reread: @@ -650,7 +627,9 @@ reread: if (!uuid_equal(&sb->sb->magic, &BCACHE_MAGIC) && !uuid_equal(&sb->sb->magic, &BCHFS_MAGIC)) { - prt_printf(err, "Not a bcachefs superblock"); + prt_str(err, "Not a bcachefs superblock (got magic "); + pr_uuid(err, sb->sb->magic.b); + prt_str(err, ")"); return -BCH_ERR_invalid_sb_magic; } @@ -673,17 +652,16 @@ reread: goto reread; } - if (BCH_SB_CSUM_TYPE(sb->sb) >= BCH_CSUM_NR) { + enum bch_csum_type csum_type = BCH_SB_CSUM_TYPE(sb->sb); + if (csum_type >= BCH_CSUM_NR) { prt_printf(err, "unknown checksum type %llu", BCH_SB_CSUM_TYPE(sb->sb)); return -BCH_ERR_invalid_sb_csum_type; } /* XXX: verify MACs */ - csum = csum_vstruct(NULL, BCH_SB_CSUM_TYPE(sb->sb), - null_nonce(), sb->sb); - + struct bch_csum csum = csum_vstruct(NULL, csum_type, null_nonce(), sb->sb); if (bch2_crc_cmp(csum, sb->sb->csum)) { - prt_printf(err, "bad checksum"); + bch2_csum_err_msg(err, csum_type, sb->sb->csum, csum); return -BCH_ERR_invalid_sb_csum; } @@ -692,12 +670,13 @@ reread: return 0; } -int bch2_read_super(const char *path, struct bch_opts *opts, - struct bch_sb_handle *sb) +static int __bch2_read_super(const char *path, struct bch_opts *opts, + struct bch_sb_handle *sb, bool ignore_notbchfs_msg) { u64 offset = opt_get(*opts, sb); struct bch_sb_layout layout; struct printbuf err = PRINTBUF; + struct printbuf err2 = PRINTBUF; __le64 *i; int ret; #ifndef __KERNEL__ @@ -761,8 +740,14 @@ retry: if (opt_defined(*opts, sb)) goto err; - printk(KERN_ERR "bcachefs (%s): error reading default superblock: %s\n", + prt_printf(&err2, "bcachefs (%s): error reading default superblock: %s\n", path, err.buf); + if (ret == -BCH_ERR_invalid_sb_magic && ignore_notbchfs_msg) + printk(KERN_INFO "%s", err2.buf); + else + printk(KERN_ERR "%s", err2.buf); + + printbuf_exit(&err2); printbuf_reset(&err); /* @@ -838,6 +823,20 @@ err_no_print: goto out; } +int bch2_read_super(const char *path, struct bch_opts *opts, + struct bch_sb_handle *sb) +{ + return __bch2_read_super(path, opts, sb, false); +} + +/* provide a silenced version for mount.bcachefs */ + +int bch2_read_super_silent(const char *path, struct bch_opts *opts, + struct bch_sb_handle *sb) +{ + return __bch2_read_super(path, opts, sb, true); +} + /* write superblock: */ static void write_super_endio(struct bio *bio) @@ -906,9 +905,8 @@ static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx) int bch2_write_super(struct bch_fs *c) { struct closure *cl = &c->sb_write; - struct bch_dev *ca; struct printbuf err = PRINTBUF; - unsigned i, sb = 0, nr_wrote; + unsigned sb = 0, nr_wrote; struct bch_devs_mask sb_written; bool wrote, can_mount_without_written, can_mount_with_written; unsigned degraded_flags = BCH_FORCE_IF_DEGRADED; @@ -930,9 +928,14 @@ int bch2_write_super(struct bch_fs *c) le64_add_cpu(&c->disk_sb.sb->seq, 1); - if (test_bit(BCH_FS_ERROR, &c->flags)) + struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2); + for_each_online_member(c, ca) + __bch2_members_v2_get_mut(mi, ca->dev_idx)->seq = c->disk_sb.sb->seq; + c->disk_sb.sb->write_time = cpu_to_le64(ktime_get_real_seconds()); + + if (test_bit(BCH_FS_error, &c->flags)) SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 1); - if (test_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags)) + if (test_bit(BCH_FS_topology_error, &c->flags)) SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, 1); SET_BCH_SB_BIG_ENDIAN(c->disk_sb.sb, CPU_BIG_ENDIAN); @@ -943,10 +946,10 @@ int bch2_write_super(struct bch_fs *c) bch2_sb_errors_from_cpu(c); bch2_sb_downgrade_update(c); - for_each_online_member(ca, c, i) + for_each_online_member(c, ca) bch2_sb_from_fs(c, ca); - for_each_online_member(ca, c, i) { + for_each_online_member(c, ca) { printbuf_reset(&err); ret = bch2_sb_validate(&ca->disk_sb, &err, WRITE); @@ -967,16 +970,28 @@ int bch2_write_super(struct bch_fs *c) if (!BCH_SB_INITIALIZED(c->disk_sb.sb)) goto out; - for_each_online_member(ca, c, i) { + if (le16_to_cpu(c->disk_sb.sb->version) > bcachefs_metadata_version_current) { + struct printbuf buf = PRINTBUF; + prt_printf(&buf, "attempting to write superblock that wasn't version downgraded ("); + bch2_version_to_text(&buf, le16_to_cpu(c->disk_sb.sb->version)); + prt_str(&buf, " > "); + bch2_version_to_text(&buf, bcachefs_metadata_version_current); + prt_str(&buf, ")"); + bch2_fs_fatal_error(c, "%s", buf.buf); + printbuf_exit(&buf); + return -BCH_ERR_sb_not_downgraded; + } + + for_each_online_member(c, ca) { __set_bit(ca->dev_idx, sb_written.d); ca->sb_write_error = 0; } - for_each_online_member(ca, c, i) + for_each_online_member(c, ca) read_back_super(c, ca); closure_sync(cl); - for_each_online_member(ca, c, i) { + for_each_online_member(c, ca) { if (ca->sb_write_error) continue; @@ -1003,7 +1018,7 @@ int bch2_write_super(struct bch_fs *c) do { wrote = false; - for_each_online_member(ca, c, i) + for_each_online_member(c, ca) if (!ca->sb_write_error && sb < ca->disk_sb.sb->layout.nr_superblocks) { write_one_super(c, ca, sb); @@ -1013,7 +1028,7 @@ int bch2_write_super(struct bch_fs *c) sb++; } while (wrote); - for_each_online_member(ca, c, i) { + for_each_online_member(c, ca) { if (ca->sb_write_error) __clear_bit(ca->dev_idx, sb_written.d); else @@ -1025,7 +1040,7 @@ int bch2_write_super(struct bch_fs *c) can_mount_with_written = bch2_have_enough_devs(c, sb_written, degraded_flags, false); - for (i = 0; i < ARRAY_SIZE(sb_written.d); i++) + for (unsigned i = 0; i < ARRAY_SIZE(sb_written.d); i++) sb_written.d[i] = ~sb_written.d[i]; can_mount_without_written = @@ -1074,13 +1089,22 @@ bool bch2_check_version_downgrade(struct bch_fs *c) /* * Downgrade, if superblock is at a higher version than currently * supported: + * + * c->sb will be checked before we write the superblock, so update it as + * well: */ - if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) > bcachefs_metadata_version_current) + if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) > bcachefs_metadata_version_current) { SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current); - if (c->sb.version > bcachefs_metadata_version_current) + c->sb.version_upgrade_complete = bcachefs_metadata_version_current; + } + if (c->sb.version > bcachefs_metadata_version_current) { c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current); - if (c->sb.version_min > bcachefs_metadata_version_current) + c->sb.version = bcachefs_metadata_version_current; + } + if (c->sb.version_min > bcachefs_metadata_version_current) { c->disk_sb.sb->version_min = cpu_to_le16(bcachefs_metadata_version_current); + c->sb.version_min = bcachefs_metadata_version_current; + } c->disk_sb.sb->compat[0] &= cpu_to_le64((1ULL << BCH_COMPAT_NR) - 1); return ret; } @@ -1173,8 +1197,8 @@ static int bch2_sb_field_validate(struct bch_sb *sb, struct bch_sb_field *f, return ret; } -void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb, - struct bch_sb_field *f) +void __bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb, + struct bch_sb_field *f) { unsigned type = le32_to_cpu(f->type); const struct bch_sb_field_ops *ops = bch2_sb_field_type_ops(type); @@ -1182,6 +1206,15 @@ void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb, if (!out->nr_tabstops) printbuf_tabstop_push(out, 32); + if (ops->to_text) + ops->to_text(out, sb, f); +} + +void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb, + struct bch_sb_field *f) +{ + unsigned type = le32_to_cpu(f->type); + if (type < BCH_SB_FIELD_NR) prt_printf(out, "%s", bch2_sb_fields[type]); else @@ -1190,11 +1223,7 @@ void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb, prt_printf(out, " (size %zu):", vstruct_bytes(f)); prt_newline(out); - if (ops->to_text) { - printbuf_indent_add(out, 2); - ops->to_text(out, sb, f); - printbuf_indent_sub(out, 2); - } + __bch2_sb_field_to_text(out, sb, f); } void bch2_sb_layout_to_text(struct printbuf *out, struct bch_sb_layout *l) @@ -1223,7 +1252,6 @@ void bch2_sb_layout_to_text(struct printbuf *out, struct bch_sb_layout *l) void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, bool print_layout, unsigned fields) { - struct bch_sb_field *f; u64 fields_have = 0; unsigned nr_devices = 0; @@ -1243,6 +1271,11 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, pr_uuid(out, sb->uuid.b); prt_newline(out); + prt_printf(out, "Magic number:"); + prt_tab(out); + pr_uuid(out, sb->magic.b); + prt_newline(out); + prt_str(out, "Device index:"); prt_tab(out); prt_printf(out, "%u", sb->dev_idx); @@ -1281,9 +1314,16 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, prt_printf(out, "%llu", le64_to_cpu(sb->seq)); prt_newline(out); + prt_printf(out, "Time of last write:"); + prt_tab(out); + bch2_prt_datetime(out, le64_to_cpu(sb->write_time)); + prt_newline(out); + prt_printf(out, "Superblock size:"); prt_tab(out); - prt_printf(out, "%zu", vstruct_bytes(sb)); + prt_units_u64(out, vstruct_bytes(sb)); + prt_str(out, "/"); + prt_units_u64(out, 512ULL << sb->layout.sb_max_size_bits); prt_newline(out); prt_printf(out, "Clean:"); diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h index e41e5de531a0..95e80e06316b 100644 --- a/fs/bcachefs/super-io.h +++ b/fs/bcachefs/super-io.h @@ -19,10 +19,6 @@ static inline bool bch2_version_compatible(u16 version) void bch2_version_to_text(struct printbuf *, unsigned); unsigned bch2_latest_compatible_version(unsigned); -u64 bch2_upgrade_recovery_passes(struct bch_fs *c, - unsigned, - unsigned); - static inline size_t bch2_sb_field_bytes(struct bch_sb_field *f) { return le32_to_cpu(f->u64s) * sizeof(u64); @@ -84,6 +80,7 @@ void bch2_free_super(struct bch_sb_handle *); int bch2_sb_realloc(struct bch_sb_handle *, unsigned); int bch2_read_super(const char *, struct bch_opts *, struct bch_sb_handle *); +int bch2_read_super_silent(const char *, struct bch_opts *, struct bch_sb_handle *); int bch2_write_super(struct bch_fs *); void __bch2_check_set_feature(struct bch_fs *, unsigned); @@ -96,6 +93,8 @@ static inline void bch2_check_set_feature(struct bch_fs *c, unsigned feat) bool bch2_check_version_downgrade(struct bch_fs *); void bch2_sb_upgrade(struct bch_fs *, unsigned); +void __bch2_sb_field_to_text(struct printbuf *, struct bch_sb *, + struct bch_sb_field *); void bch2_sb_field_to_text(struct printbuf *, struct bch_sb *, struct bch_sb_field *); void bch2_sb_layout_to_text(struct printbuf *, struct bch_sb_layout *); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 818ec467a06b..b9911402b175 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -23,7 +23,6 @@ #include "checksum.h" #include "clock.h" #include "compress.h" -#include "counters.h" #include "debug.h" #include "disk_groups.h" #include "ec.h" @@ -49,6 +48,7 @@ #include "recovery.h" #include "replicas.h" #include "sb-clean.h" +#include "sb-counters.h" #include "sb-errors.h" #include "sb-members.h" #include "snapshot.h" @@ -79,6 +79,36 @@ MODULE_SOFTDEP("pre: chacha20"); MODULE_SOFTDEP("pre: poly1305"); MODULE_SOFTDEP("pre: xxhash"); +const char * const bch2_fs_flag_strs[] = { +#define x(n) #n, + BCH_FS_FLAGS() +#undef x + NULL +}; + +void __bch2_print(struct bch_fs *c, const char *fmt, ...) +{ + struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c); + + va_list args; + va_start(args, fmt); + if (likely(!stdio)) { + vprintk(fmt, args); + } else { + unsigned long flags; + + if (fmt[0] == KERN_SOH[0]) + fmt += 2; + + spin_lock_irqsave(&stdio->output_lock, flags); + prt_vprintf(&stdio->output_buf, fmt, args); + spin_unlock_irqrestore(&stdio->output_lock, flags); + + wake_up(&stdio->output_wait); + } + va_end(args); +} + #define KTYPE(type) \ static const struct attribute_group type ## _group = { \ .attrs = type ## _files \ @@ -134,14 +164,12 @@ static void __bch2_dev_read_only(struct bch_fs *, struct bch_dev *); struct bch_fs *bch2_dev_to_fs(dev_t dev) { struct bch_fs *c; - struct bch_dev *ca; - unsigned i; mutex_lock(&bch_fs_list_lock); rcu_read_lock(); list_for_each_entry(c, &bch_fs_list, list) - for_each_member_device_rcu(ca, c, i, NULL) + for_each_member_device_rcu(c, ca, NULL) if (ca->disk_sb.bdev && ca->disk_sb.bdev->bd_dev == dev) { closure_get(&c->cl); goto found; @@ -182,14 +210,13 @@ struct bch_fs *bch2_uuid_to_fs(__uuid_t uuid) static void bch2_dev_usage_journal_reserve(struct bch_fs *c) { - struct bch_dev *ca; - unsigned i, nr = 0, u64s = + unsigned nr = 0, u64s = ((sizeof(struct jset_entry_dev_usage) + sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR)) / sizeof(u64); rcu_read_lock(); - for_each_member_device_rcu(ca, c, i, NULL) + for_each_member_device_rcu(c, ca, NULL) nr++; rcu_read_unlock(); @@ -216,8 +243,7 @@ static void bch2_dev_usage_journal_reserve(struct bch_fs *c) static void __bch2_fs_read_only(struct bch_fs *c) { - struct bch_dev *ca; - unsigned i, clean_passes = 0; + unsigned clean_passes = 0; u64 seq = 0; bch2_fs_ec_stop(c); @@ -246,14 +272,14 @@ static void __bch2_fs_read_only(struct bch_fs *c) journal_cur_seq(&c->journal)); if (test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags) && - !test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) - set_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags); + !test_bit(BCH_FS_emergency_ro, &c->flags)) + set_bit(BCH_FS_clean_shutdown, &c->flags); bch2_fs_journal_stop(&c->journal); /* * After stopping journal: */ - for_each_member_device(ca, c, i) + for_each_member_device(c, ca) bch2_dev_allocator_remove(c, ca); } @@ -262,25 +288,27 @@ static void bch2_writes_disabled(struct percpu_ref *writes) { struct bch_fs *c = container_of(writes, struct bch_fs, writes); - set_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags); + set_bit(BCH_FS_write_disable_complete, &c->flags); wake_up(&bch2_read_only_wait); } #endif void bch2_fs_read_only(struct bch_fs *c) { - if (!test_bit(BCH_FS_RW, &c->flags)) { + if (!test_bit(BCH_FS_rw, &c->flags)) { bch2_journal_reclaim_stop(&c->journal); return; } - BUG_ON(test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags)); + BUG_ON(test_bit(BCH_FS_write_disable_complete, &c->flags)); + + bch_verbose(c, "going read-only"); /* * Block new foreground-end write operations from starting - any new * writes will return -EROFS: */ - set_bit(BCH_FS_GOING_RO, &c->flags); + set_bit(BCH_FS_going_ro, &c->flags); #ifndef BCH_WRITE_REF_DEBUG percpu_ref_kill(&c->writes); #else @@ -300,33 +328,42 @@ void bch2_fs_read_only(struct bch_fs *c) * that going RO is complete: */ wait_event(bch2_read_only_wait, - test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags) || - test_bit(BCH_FS_EMERGENCY_RO, &c->flags)); + test_bit(BCH_FS_write_disable_complete, &c->flags) || + test_bit(BCH_FS_emergency_ro, &c->flags)); + + bool writes_disabled = test_bit(BCH_FS_write_disable_complete, &c->flags); + if (writes_disabled) + bch_verbose(c, "finished waiting for writes to stop"); __bch2_fs_read_only(c); wait_event(bch2_read_only_wait, - test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags)); + test_bit(BCH_FS_write_disable_complete, &c->flags)); - clear_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags); - clear_bit(BCH_FS_GOING_RO, &c->flags); + if (!writes_disabled) + bch_verbose(c, "finished waiting for writes to stop"); + + clear_bit(BCH_FS_write_disable_complete, &c->flags); + clear_bit(BCH_FS_going_ro, &c->flags); + clear_bit(BCH_FS_rw, &c->flags); if (!bch2_journal_error(&c->journal) && - !test_bit(BCH_FS_ERROR, &c->flags) && - !test_bit(BCH_FS_EMERGENCY_RO, &c->flags) && - test_bit(BCH_FS_STARTED, &c->flags) && - test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags) && + !test_bit(BCH_FS_error, &c->flags) && + !test_bit(BCH_FS_emergency_ro, &c->flags) && + test_bit(BCH_FS_started, &c->flags) && + test_bit(BCH_FS_clean_shutdown, &c->flags) && !c->opts.norecovery) { BUG_ON(c->journal.last_empty_seq != journal_cur_seq(&c->journal)); BUG_ON(atomic_read(&c->btree_cache.dirty)); BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty)); - BUG_ON(c->btree_write_buffer.state.nr); + BUG_ON(c->btree_write_buffer.inc.keys.nr); + BUG_ON(c->btree_write_buffer.flushing.keys.nr); bch_verbose(c, "marking filesystem clean"); bch2_fs_mark_clean(c); + } else { + bch_verbose(c, "done going read-only, filesystem not clean"); } - - clear_bit(BCH_FS_RW, &c->flags); } static void bch2_fs_read_only_work(struct work_struct *work) @@ -346,7 +383,7 @@ static void bch2_fs_read_only_async(struct bch_fs *c) bool bch2_fs_emergency_read_only(struct bch_fs *c) { - bool ret = !test_and_set_bit(BCH_FS_EMERGENCY_RO, &c->flags); + bool ret = !test_and_set_bit(BCH_FS_emergency_ro, &c->flags); bch2_journal_halt(&c->journal); bch2_fs_read_only_async(c); @@ -383,28 +420,16 @@ static int bch2_fs_read_write_late(struct bch_fs *c) static int __bch2_fs_read_write(struct bch_fs *c, bool early) { - struct bch_dev *ca; - unsigned i; int ret; - if (test_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags)) { + if (test_bit(BCH_FS_initial_gc_unfixed, &c->flags)) { bch_err(c, "cannot go rw, unfixed btree errors"); return -BCH_ERR_erofs_unfixed_errors; } - if (test_bit(BCH_FS_RW, &c->flags)) + if (test_bit(BCH_FS_rw, &c->flags)) return 0; - if (c->opts.norecovery) - return -BCH_ERR_erofs_norecovery; - - /* - * nochanges is used for fsck -n mode - we have to allow going rw - * during recovery for that to work: - */ - if (c->opts.nochanges && (!early || c->opts.read_only)) - return -BCH_ERR_erofs_nochanges; - bch_info(c, "going read-write"); ret = bch2_sb_members_v2_init(c); @@ -415,7 +440,7 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) if (ret) goto err; - clear_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags); + clear_bit(BCH_FS_clean_shutdown, &c->flags); /* * First journal write must be a flush write: after a clean shutdown we @@ -425,17 +450,17 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) */ set_bit(JOURNAL_NEED_FLUSH_WRITE, &c->journal.flags); - for_each_rw_member(ca, c, i) + for_each_rw_member(c, ca) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); - set_bit(BCH_FS_RW, &c->flags); - set_bit(BCH_FS_WAS_RW, &c->flags); + set_bit(BCH_FS_rw, &c->flags); + set_bit(BCH_FS_was_rw, &c->flags); #ifndef BCH_WRITE_REF_DEBUG percpu_ref_reinit(&c->writes); #else - for (i = 0; i < BCH_WRITE_REF_NR; i++) { + for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++) { BUG_ON(atomic_long_read(&c->writes[i])); atomic_long_inc(&c->writes[i]); } @@ -463,7 +488,7 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) bch2_do_pending_node_rewrites(c); return 0; err: - if (test_bit(BCH_FS_RW, &c->flags)) + if (test_bit(BCH_FS_rw, &c->flags)) bch2_fs_read_only(c); else __bch2_fs_read_only(c); @@ -472,6 +497,12 @@ err: int bch2_fs_read_write(struct bch_fs *c) { + if (c->opts.norecovery) + return -BCH_ERR_erofs_norecovery; + + if (c->opts.nochanges) + return -BCH_ERR_erofs_nochanges; + return __bch2_fs_read_write(c, false); } @@ -558,12 +589,9 @@ static void bch2_fs_release(struct kobject *kobj) void __bch2_fs_stop(struct bch_fs *c) { - struct bch_dev *ca; - unsigned i; - bch_verbose(c, "shutting down"); - set_bit(BCH_FS_STOPPING, &c->flags); + set_bit(BCH_FS_stopping, &c->flags); cancel_work_sync(&c->journal_seq_blacklist_gc_work); @@ -571,7 +599,7 @@ void __bch2_fs_stop(struct bch_fs *c) bch2_fs_read_only(c); up_write(&c->state_lock); - for_each_member_device(ca, c, i) + for_each_member_device(c, ca) if (ca->kobj.state_in_sysfs && ca->disk_sb.bdev) sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs"); @@ -582,6 +610,9 @@ void __bch2_fs_stop(struct bch_fs *c) bch2_fs_debug_exit(c); bch2_fs_chardev_exit(c); + bch2_ro_ref_put(c); + wait_event(c->ro_ref_wait, !refcount_read(&c->ro_ref)); + kobject_put(&c->counters_kobj); kobject_put(&c->time_stats); kobject_put(&c->opts_dir); @@ -590,7 +621,7 @@ void __bch2_fs_stop(struct bch_fs *c) /* btree prefetch might have kicked off reads in the background: */ bch2_btree_flush_all_reads(c); - for_each_member_device(ca, c, i) + for_each_member_device(c, ca) cancel_work_sync(&ca->io_error_work); cancel_work_sync(&c->read_only_work); @@ -629,8 +660,6 @@ void bch2_fs_stop(struct bch_fs *c) static int bch2_fs_online(struct bch_fs *c) { - struct bch_dev *ca; - unsigned i; int ret = 0; lockdep_assert_held(&bch_fs_list_lock); @@ -651,7 +680,9 @@ static int bch2_fs_online(struct bch_fs *c) ret = kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ?: kobject_add(&c->internal, &c->kobj, "internal") ?: kobject_add(&c->opts_dir, &c->kobj, "options") ?: +#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT kobject_add(&c->time_stats, &c->kobj, "time_stats") ?: +#endif kobject_add(&c->counters_kobj, &c->kobj, "counters") ?: bch2_opts_create_sysfs_files(&c->opts_dir); if (ret) { @@ -661,7 +692,7 @@ static int bch2_fs_online(struct bch_fs *c) down_write(&c->state_lock); - for_each_member_device(ca, c, i) { + for_each_member_device(c, ca) { ret = bch2_dev_sysfs_online(c, ca); if (ret) { bch_err(c, "error creating sysfs objects"); @@ -690,6 +721,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) goto out; } + c->stdio = (void *)(unsigned long) opts.stdio; + __module_get(THIS_MODULE); closure_init(&c->cl, NULL); @@ -710,6 +743,10 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) mutex_init(&c->btree_root_lock); INIT_WORK(&c->read_only_work, bch2_fs_read_only_work); + refcount_set(&c->ro_ref, 1); + init_waitqueue_head(&c->ro_ref_wait); + sema_init(&c->online_fsck_mutex, 1); + init_rwsem(&c->gc_lock); mutex_init(&c->gc_gens_lock); atomic_set(&c->journal_keys.ref, 1); @@ -763,7 +800,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) c->journal.flush_write_time = &c->times[BCH_TIME_journal_flush_write]; c->journal.noflush_write_time = &c->times[BCH_TIME_journal_noflush_write]; - c->journal.blocked_time = &c->times[BCH_TIME_blocked_journal]; c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq]; bch2_fs_btree_cache_init_early(&c->btree_cache); @@ -832,7 +868,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) !(c->copygc_wq = alloc_workqueue("bcachefs_copygc", WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || !(c->io_complete_wq = alloc_workqueue("bcachefs_io", - WQ_FREEZABLE|WQ_HIGHPRI|WQ_MEM_RECLAIM, 1)) || + WQ_FREEZABLE|WQ_HIGHPRI|WQ_MEM_RECLAIM, 512)) || !(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref", WQ_FREEZABLE, 0)) || #ifndef BCH_WRITE_REF_DEBUG @@ -847,7 +883,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) || !(c->online_reserved = alloc_percpu(u64)) || mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1, - btree_bytes(c)) || + c->opts.btree_node_size) || mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) || !(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits, sizeof(u64), GFP_KERNEL))) { @@ -946,16 +982,14 @@ static void print_mount_opts(struct bch_fs *c) int bch2_fs_start(struct bch_fs *c) { - struct bch_dev *ca; time64_t now = ktime_get_real_seconds(); - unsigned i; int ret; print_mount_opts(c); down_write(&c->state_lock); - BUG_ON(test_bit(BCH_FS_STARTED, &c->flags)); + BUG_ON(test_bit(BCH_FS_started, &c->flags)); mutex_lock(&c->sb_lock); @@ -965,12 +999,12 @@ int bch2_fs_start(struct bch_fs *c) goto err; } - for_each_online_member(ca, c, i) - bch2_members_v2_get_mut(c->disk_sb.sb, i)->last_mount = cpu_to_le64(now); + for_each_online_member(c, ca) + bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount = cpu_to_le64(now); mutex_unlock(&c->sb_lock); - for_each_rw_member(ca, c, i) + for_each_rw_member(c, ca) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); @@ -990,12 +1024,12 @@ int bch2_fs_start(struct bch_fs *c) goto err; } - set_bit(BCH_FS_STARTED, &c->flags); + set_bit(BCH_FS_started, &c->flags); - if (c->opts.read_only || c->opts.nochanges) { + if (c->opts.read_only) { bch2_fs_read_only(c); } else { - ret = !test_bit(BCH_FS_RW, &c->flags) + ret = !test_bit(BCH_FS_rw, &c->flags) ? bch2_fs_read_write(c) : bch2_fs_read_write_late(c); if (ret) @@ -1003,12 +1037,13 @@ int bch2_fs_start(struct bch_fs *c) } ret = 0; -out: +err: + if (ret) + bch_err_msg(c, ret, "starting filesystem"); + else + bch_verbose(c, "done starting filesystem"); up_write(&c->state_lock); return ret; -err: - bch_err_msg(c, ret, "starting filesystem"); - goto out; } static int bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c) @@ -1025,20 +1060,83 @@ static int bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c) return 0; } -static int bch2_dev_in_fs(struct bch_sb *fs, struct bch_sb *sb) +static int bch2_dev_in_fs(struct bch_sb_handle *fs, + struct bch_sb_handle *sb) { - struct bch_sb *newest = - le64_to_cpu(fs->seq) > le64_to_cpu(sb->seq) ? fs : sb; + if (fs == sb) + return 0; - if (!uuid_equal(&fs->uuid, &sb->uuid)) + if (!uuid_equal(&fs->sb->uuid, &sb->sb->uuid)) return -BCH_ERR_device_not_a_member_of_filesystem; - if (!bch2_dev_exists(newest, sb->dev_idx)) + if (!bch2_dev_exists(fs->sb, sb->sb->dev_idx)) return -BCH_ERR_device_has_been_removed; - if (fs->block_size != sb->block_size) + if (fs->sb->block_size != sb->sb->block_size) return -BCH_ERR_mismatched_block_size; + if (le16_to_cpu(fs->sb->version) < bcachefs_metadata_version_member_seq || + le16_to_cpu(sb->sb->version) < bcachefs_metadata_version_member_seq) + return 0; + + if (fs->sb->seq == sb->sb->seq && + fs->sb->write_time != sb->sb->write_time) { + struct printbuf buf = PRINTBUF; + + prt_str(&buf, "Split brain detected between "); + prt_bdevname(&buf, sb->bdev); + prt_str(&buf, " and "); + prt_bdevname(&buf, fs->bdev); + prt_char(&buf, ':'); + prt_newline(&buf); + prt_printf(&buf, "seq=%llu but write_time different, got", le64_to_cpu(sb->sb->seq)); + prt_newline(&buf); + + prt_bdevname(&buf, fs->bdev); + prt_char(&buf, ' '); + bch2_prt_datetime(&buf, le64_to_cpu(fs->sb->write_time));; + prt_newline(&buf); + + prt_bdevname(&buf, sb->bdev); + prt_char(&buf, ' '); + bch2_prt_datetime(&buf, le64_to_cpu(sb->sb->write_time));; + prt_newline(&buf); + + prt_printf(&buf, "Not using older sb"); + + pr_err("%s", buf.buf); + printbuf_exit(&buf); + return -BCH_ERR_device_splitbrain; + } + + struct bch_member m = bch2_sb_member_get(fs->sb, sb->sb->dev_idx); + u64 seq_from_fs = le64_to_cpu(m.seq); + u64 seq_from_member = le64_to_cpu(sb->sb->seq); + + if (seq_from_fs && seq_from_fs < seq_from_member) { + struct printbuf buf = PRINTBUF; + + prt_str(&buf, "Split brain detected between "); + prt_bdevname(&buf, sb->bdev); + prt_str(&buf, " and "); + prt_bdevname(&buf, fs->bdev); + prt_char(&buf, ':'); + prt_newline(&buf); + + prt_bdevname(&buf, fs->bdev); + prt_str(&buf, "believes seq of "); + prt_bdevname(&buf, sb->bdev); + prt_printf(&buf, " to be %llu, but ", seq_from_fs); + prt_bdevname(&buf, sb->bdev); + prt_printf(&buf, " has %llu\n", seq_from_member); + prt_str(&buf, "Not using "); + prt_bdevname(&buf, sb->bdev); + + pr_err("%s", buf.buf); + printbuf_exit(&buf); + return -BCH_ERR_device_splitbrain; + } + return 0; } @@ -1284,9 +1382,14 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb) bch2_dev_sysfs_online(c, ca); + struct printbuf name = PRINTBUF; + prt_bdevname(&name, ca->disk_sb.bdev); + if (c->sb.nr_devices == 1) - snprintf(c->name, sizeof(c->name), "%pg", ca->disk_sb.bdev); - snprintf(ca->name, sizeof(ca->name), "%pg", ca->disk_sb.bdev); + strscpy(c->name, name.buf, sizeof(c->name)); + strscpy(ca->name, name.buf, sizeof(ca->name)); + + printbuf_exit(&name); rebalance_wakeup(c); return 0; @@ -1307,8 +1410,7 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca, enum bch_member_state new_state, int flags) { struct bch_devs_mask new_online_devs; - struct bch_dev *ca2; - int i, nr_rw = 0, required; + int nr_rw = 0, required; lockdep_assert_held(&c->state_lock); @@ -1320,7 +1422,7 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca, return true; /* do we have enough devices to write to? */ - for_each_member_device(ca2, c, i) + for_each_member_device(c, ca2) if (ca2 != ca) nr_rw += ca2->mi.state == BCH_MEMBER_STATE_rw; @@ -1468,9 +1570,7 @@ static int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) BTREE_TRIGGER_NORUN, NULL) ?: bch2_btree_delete_range(c, BTREE_ID_bucket_gens, start, end, BTREE_TRIGGER_NORUN, NULL); - if (ret) - bch_err_msg(c, ret, "removing dev alloc info"); - + bch_err_msg(c, ret, "removing dev alloc info"); return ret; } @@ -1497,40 +1597,35 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) __bch2_dev_read_only(c, ca); ret = bch2_dev_data_drop(c, ca->dev_idx, flags); - if (ret) { - bch_err_msg(ca, ret, "dropping data"); + bch_err_msg(ca, ret, "dropping data"); + if (ret) goto err; - } ret = bch2_dev_remove_alloc(c, ca); - if (ret) { - bch_err_msg(ca, ret, "deleting alloc info"); + bch_err_msg(ca, ret, "deleting alloc info"); + if (ret) goto err; - } ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx); - if (ret) { - bch_err_msg(ca, ret, "flushing journal"); + bch_err_msg(ca, ret, "flushing journal"); + if (ret) goto err; - } ret = bch2_journal_flush(&c->journal); - if (ret) { - bch_err(ca, "journal error"); + bch_err(ca, "journal error"); + if (ret) goto err; - } ret = bch2_replicas_gc2(c); - if (ret) { - bch_err_msg(ca, ret, "in replicas_gc2()"); + bch_err_msg(ca, ret, "in replicas_gc2()"); + if (ret) goto err; - } data = bch2_dev_has_data(c, ca); if (data) { struct printbuf data_has = PRINTBUF; - prt_bitflags(&data_has, bch2_data_types, data); + prt_bitflags(&data_has, __bch2_data_types, data); bch_err(ca, "Remove failed, still has data (%s)", data_has.buf); printbuf_exit(&data_has); ret = -EBUSY; @@ -1596,10 +1691,9 @@ int bch2_dev_add(struct bch_fs *c, const char *path) int ret; ret = bch2_read_super(path, &opts, &sb); - if (ret) { - bch_err_msg(c, ret, "reading super"); + bch_err_msg(c, ret, "reading super"); + if (ret) goto err; - } dev_mi = bch2_sb_member_get(sb.sb, sb.sb->dev_idx); @@ -1612,10 +1706,8 @@ int bch2_dev_add(struct bch_fs *c, const char *path) } ret = bch2_dev_may_add(sb.sb, c); - if (ret) { - bch_err_fn(c, ret); + if (ret) goto err; - } ca = __bch2_dev_alloc(c, &dev_mi); if (!ca) { @@ -1630,19 +1722,17 @@ int bch2_dev_add(struct bch_fs *c, const char *path) goto err; ret = bch2_dev_journal_alloc(ca); - if (ret) { - bch_err_msg(c, ret, "allocating journal"); + bch_err_msg(c, ret, "allocating journal"); + if (ret) goto err; - } down_write(&c->state_lock); mutex_lock(&c->sb_lock); ret = bch2_sb_from_fs(c, ca); - if (ret) { - bch_err_msg(c, ret, "setting up new superblock"); + bch_err_msg(c, ret, "setting up new superblock"); + if (ret) goto err_unlock; - } if (dynamic_fault("bcachefs:add:no_slot")) goto no_slot; @@ -1681,10 +1771,9 @@ have_slot: if (BCH_MEMBER_GROUP(&dev_mi)) { ret = __bch2_dev_group_set(c, ca, label.buf); - if (ret) { - bch_err_msg(c, ret, "creating new label"); + bch_err_msg(c, ret, "creating new label"); + if (ret) goto err_unlock; - } } bch2_write_super(c); @@ -1693,16 +1782,14 @@ have_slot: bch2_dev_usage_journal_reserve(c); ret = bch2_trans_mark_dev_sb(c, ca); - if (ret) { - bch_err_msg(ca, ret, "marking new superblock"); + bch_err_msg(ca, ret, "marking new superblock"); + if (ret) goto err_late; - } ret = bch2_fs_freespace_init(c); - if (ret) { - bch_err_msg(ca, ret, "initializing free space"); + bch_err_msg(ca, ret, "initializing free space"); + if (ret) goto err_late; - } ca->new_fs_bucket_idx = 0; @@ -1721,6 +1808,7 @@ err: bch2_free_super(&sb); printbuf_exit(&label); printbuf_exit(&errbuf); + bch_err_fn(c, ret); return ret; err_late: up_write(&c->state_lock); @@ -1747,11 +1835,10 @@ int bch2_dev_online(struct bch_fs *c, const char *path) dev_idx = sb.sb->dev_idx; - ret = bch2_dev_in_fs(c->disk_sb.sb, sb.sb); - if (ret) { - bch_err_msg(c, ret, "bringing %s online", path); + ret = bch2_dev_in_fs(&c->disk_sb, &sb); + bch_err_msg(c, ret, "bringing %s online", path); + if (ret) goto err; - } ret = bch2_dev_attach_bdev(c, &sb); if (ret) @@ -1760,10 +1847,9 @@ int bch2_dev_online(struct bch_fs *c, const char *path) ca = bch_dev_locked(c, dev_idx); ret = bch2_trans_mark_dev_sb(c, ca); - if (ret) { - bch_err_msg(c, ret, "bringing %s online: error from bch2_trans_mark_dev_sb", path); + bch_err_msg(c, ret, "bringing %s online: error from bch2_trans_mark_dev_sb", path); + if (ret) goto err; - } if (ca->mi.state == BCH_MEMBER_STATE_rw) __bch2_dev_read_write(c, ca); @@ -1842,10 +1928,9 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) } ret = bch2_dev_buckets_resize(c, ca, nbuckets); - if (ret) { - bch_err_msg(ca, ret, "resizing buckets"); + bch_err_msg(ca, ret, "resizing buckets"); + if (ret) goto err; - } ret = bch2_trans_mark_dev_sb(c, ca); if (ret) @@ -1879,28 +1964,30 @@ err: /* return with ref on ca->ref: */ struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name) { - struct bch_dev *ca; - unsigned i; - rcu_read_lock(); - for_each_member_device_rcu(ca, c, i, NULL) - if (!strcmp(name, ca->name)) - goto found; - ca = ERR_PTR(-BCH_ERR_ENOENT_dev_not_found); -found: + for_each_member_device_rcu(c, ca, NULL) + if (!strcmp(name, ca->name)) { + rcu_read_unlock(); + return ca; + } rcu_read_unlock(); - - return ca; + return ERR_PTR(-BCH_ERR_ENOENT_dev_not_found); } /* Filesystem open: */ +static inline int sb_cmp(struct bch_sb *l, struct bch_sb *r) +{ + return cmp_int(le64_to_cpu(l->seq), le64_to_cpu(r->seq)) ?: + cmp_int(le64_to_cpu(l->write_time), le64_to_cpu(r->write_time)); +} + struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, struct bch_opts opts) { DARRAY(struct bch_sb_handle) sbs = { 0 }; struct bch_fs *c = NULL; - struct bch_sb_handle *sb, *best = NULL; + struct bch_sb_handle *best = NULL; struct printbuf errbuf = PRINTBUF; int ret = 0; @@ -1926,20 +2013,27 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, BUG_ON(darray_push(&sbs, sb)); } + if (opts.nochanges && !opts.read_only) { + ret = -BCH_ERR_erofs_nochanges; + goto err_print; + } + darray_for_each(sbs, sb) - if (!best || le64_to_cpu(sb->sb->seq) > le64_to_cpu(best->sb->seq)) + if (!best || sb_cmp(sb->sb, best->sb) > 0) best = sb; darray_for_each_reverse(sbs, sb) { - if (sb != best && !bch2_dev_exists(best->sb, sb->sb->dev_idx)) { - pr_info("%pg has been removed, skipping", sb->bdev); + ret = bch2_dev_in_fs(best, sb); + + if (ret == -BCH_ERR_device_has_been_removed || + ret == -BCH_ERR_device_splitbrain) { bch2_free_super(sb); darray_remove_item(&sbs, sb); best -= best > sb; + ret = 0; continue; } - ret = bch2_dev_in_fs(best->sb, sb->sb); if (ret) goto err_print; } diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h index bf762df18012..dada09331d2e 100644 --- a/fs/bcachefs/super.h +++ b/fs/bcachefs/super.h @@ -8,6 +8,8 @@ #include <linux/math64.h> +extern const char * const bch2_fs_flag_strs[]; + struct bch_fs *bch2_dev_to_fs(dev_t); struct bch_fs *bch2_uuid_to_fs(__uuid_t); @@ -37,8 +39,8 @@ int bch2_fs_read_write_early(struct bch_fs *); */ static inline void bch2_fs_lazy_rw(struct bch_fs *c) { - if (!test_bit(BCH_FS_RW, &c->flags) && - !test_bit(BCH_FS_WAS_RW, &c->flags)) + if (!test_bit(BCH_FS_rw, &c->flags) && + !test_bit(BCH_FS_was_rw, &c->flags)) bch2_fs_read_write_early(c); } diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h index b2119686e2e1..0e5a14fc8e7f 100644 --- a/fs/bcachefs/super_types.h +++ b/fs/bcachefs/super_types.h @@ -23,7 +23,7 @@ struct bch_devs_mask { struct bch_devs_list { u8 nr; - u8 devs[BCH_BKEY_PTRS_MAX]; + u8 data[BCH_BKEY_PTRS_MAX]; }; struct bch_member_cpu { diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index f3cb7115b530..cee80c47feea 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -21,6 +21,7 @@ #include "btree_gc.h" #include "buckets.h" #include "clock.h" +#include "compress.h" #include "disk_groups.h" #include "ec.h" #include "inode.h" @@ -145,6 +146,7 @@ rw_attribute(gc_gens_pos); read_attribute(uuid); read_attribute(minor); +read_attribute(flags); read_attribute(bucket_size); read_attribute(first_bucket); read_attribute(nbuckets); @@ -246,7 +248,7 @@ static size_t bch2_btree_cache_size(struct bch_fs *c) mutex_lock(&c->btree_cache.lock); list_for_each_entry(b, &c->btree_cache.live, list) - ret += btree_bytes(c); + ret += btree_buf_bytes(b); mutex_unlock(&c->btree_cache.lock); return ret; @@ -255,19 +257,18 @@ static size_t bch2_btree_cache_size(struct bch_fs *c) static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c) { struct btree_trans *trans; - struct btree_iter iter; - struct bkey_s_c k; enum btree_id id; - u64 nr_uncompressed_extents = 0, - nr_compressed_extents = 0, - nr_incompressible_extents = 0, - uncompressed_sectors = 0, - incompressible_sectors = 0, - compressed_sectors_compressed = 0, - compressed_sectors_uncompressed = 0; + struct compression_type_stats { + u64 nr_extents; + u64 sectors_compressed; + u64 sectors_uncompressed; + } s[BCH_COMPRESSION_TYPE_NR]; + u64 compressed_incompressible = 0; int ret = 0; - if (!test_bit(BCH_FS_STARTED, &c->flags)) + memset(s, 0, sizeof(s)); + + if (!test_bit(BCH_FS_started, &c->flags)) return -EPERM; trans = bch2_trans_get(c); @@ -276,39 +277,33 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c if (!btree_type_has_ptrs(id)) continue; - ret = for_each_btree_key2(trans, iter, id, POS_MIN, - BTREE_ITER_ALL_SNAPSHOTS, k, ({ + ret = for_each_btree_key(trans, iter, id, POS_MIN, + BTREE_ITER_ALL_SNAPSHOTS, k, ({ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + struct bch_extent_crc_unpacked crc; const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - bool compressed = false, uncompressed = false, incompressible = false; - - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - switch (p.crc.compression_type) { - case BCH_COMPRESSION_TYPE_none: - uncompressed = true; - uncompressed_sectors += k.k->size; - break; - case BCH_COMPRESSION_TYPE_incompressible: - incompressible = true; - incompressible_sectors += k.k->size; - break; - default: - compressed_sectors_compressed += - p.crc.compressed_size; - compressed_sectors_uncompressed += - p.crc.uncompressed_size; - compressed = true; - break; + bool compressed = false, incompressible = false; + + bkey_for_each_crc(k.k, ptrs, crc, entry) { + incompressible |= crc.compression_type == BCH_COMPRESSION_TYPE_incompressible; + compressed |= crc_is_compressed(crc); + + if (crc_is_compressed(crc)) { + s[crc.compression_type].nr_extents++; + s[crc.compression_type].sectors_compressed += crc.compressed_size; + s[crc.compression_type].sectors_uncompressed += crc.uncompressed_size; } } - if (incompressible) - nr_incompressible_extents++; - else if (uncompressed) - nr_uncompressed_extents++; - else if (compressed) - nr_compressed_extents++; + compressed_incompressible += compressed && incompressible; + + if (!compressed) { + unsigned t = incompressible ? BCH_COMPRESSION_TYPE_incompressible : 0; + + s[t].nr_extents++; + s[t].sectors_compressed += k.k->size; + s[t].sectors_uncompressed += k.k->size; + } 0; })); } @@ -318,26 +313,45 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c if (ret) return ret; - prt_printf(out, "uncompressed:\n"); - prt_printf(out, " nr extents: %llu\n", nr_uncompressed_extents); - prt_printf(out, " size: "); - prt_human_readable_u64(out, uncompressed_sectors << 9); - prt_printf(out, "\n"); + prt_str(out, "type"); + printbuf_tabstop_push(out, 12); + prt_tab(out); - prt_printf(out, "compressed:\n"); - prt_printf(out, " nr extents: %llu\n", nr_compressed_extents); - prt_printf(out, " compressed size: "); - prt_human_readable_u64(out, compressed_sectors_compressed << 9); - prt_printf(out, "\n"); - prt_printf(out, " uncompressed size: "); - prt_human_readable_u64(out, compressed_sectors_uncompressed << 9); - prt_printf(out, "\n"); + prt_str(out, "compressed"); + printbuf_tabstop_push(out, 16); + prt_tab_rjust(out); + + prt_str(out, "uncompressed"); + printbuf_tabstop_push(out, 16); + prt_tab_rjust(out); + + prt_str(out, "average extent size"); + printbuf_tabstop_push(out, 24); + prt_tab_rjust(out); + prt_newline(out); + + for (unsigned i = 0; i < ARRAY_SIZE(s); i++) { + bch2_prt_compression_type(out, i); + prt_tab(out); + + prt_human_readable_u64(out, s[i].sectors_compressed << 9); + prt_tab_rjust(out); + + prt_human_readable_u64(out, s[i].sectors_uncompressed << 9); + prt_tab_rjust(out); + + prt_human_readable_u64(out, s[i].nr_extents + ? div_u64(s[i].sectors_uncompressed << 9, s[i].nr_extents) + : 0); + prt_tab_rjust(out); + prt_newline(out); + } + + if (compressed_incompressible) { + prt_printf(out, "%llu compressed & incompressible extents", compressed_incompressible); + prt_newline(out); + } - prt_printf(out, "incompressible:\n"); - prt_printf(out, " nr extents: %llu\n", nr_incompressible_extents); - prt_printf(out, " size: "); - prt_human_readable_u64(out, incompressible_sectors << 9); - prt_printf(out, "\n"); return 0; } @@ -370,6 +384,9 @@ SHOW(bch2_fs) sysfs_print(minor, c->minor); sysfs_printf(internal_uuid, "%pU", c->sb.uuid.b); + if (attr == &sysfs_flags) + prt_bitflags(out, bch2_fs_flag_strs, c->flags); + sysfs_hprint(btree_cache_size, bch2_btree_cache_size(c)); if (attr == &sysfs_btree_write_stats) @@ -483,12 +500,12 @@ STORE(bch2_fs) /* Debugging: */ - if (!test_bit(BCH_FS_STARTED, &c->flags)) + if (!test_bit(BCH_FS_started, &c->flags)) return -EPERM; /* Debugging: */ - if (!test_bit(BCH_FS_RW, &c->flags)) + if (!test_bit(BCH_FS_rw, &c->flags)) return -EROFS; if (attr == &sysfs_prune_cache) { @@ -620,6 +637,7 @@ STORE(bch2_fs_internal) SYSFS_OPS(bch2_fs_internal); struct attribute *bch2_fs_internal_files[] = { + &sysfs_flags, &sysfs_journal_debug, &sysfs_btree_updates, &sysfs_btree_cache, @@ -708,8 +726,10 @@ STORE(bch2_fs_opts_dir) bch2_opt_set_sb(c, opt, v); bch2_opt_set_by_id(&c->opts, id, v); - if ((id == Opt_background_target || - id == Opt_background_compression) && v) + if (v && + (id == Opt_background_target || + id == Opt_background_compression || + (id == Opt_compression && !c->opts.background_compression))) bch2_set_rebalance_needs_scan(c, 0); ret = size; @@ -786,32 +806,7 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) printbuf_tabstop_push(out, 16); printbuf_tabstop_push(out, 16); - prt_tab(out); - prt_str(out, "buckets"); - prt_tab_rjust(out); - prt_str(out, "sectors"); - prt_tab_rjust(out); - prt_str(out, "fragmented"); - prt_tab_rjust(out); - prt_newline(out); - - for (i = 0; i < BCH_DATA_NR; i++) { - prt_str(out, bch2_data_types[i]); - prt_tab(out); - prt_u64(out, stats.d[i].buckets); - prt_tab_rjust(out); - prt_u64(out, stats.d[i].sectors); - prt_tab_rjust(out); - prt_u64(out, stats.d[i].fragmented); - prt_tab_rjust(out); - prt_newline(out); - } - - prt_str(out, "ec"); - prt_tab(out); - prt_u64(out, stats.buckets_ec); - prt_tab_rjust(out); - prt_newline(out); + bch2_dev_usage_to_text(out, &stats); prt_newline(out); @@ -891,7 +886,7 @@ static void dev_io_done_to_text(struct printbuf *out, struct bch_dev *ca) for (i = 1; i < BCH_DATA_NR; i++) prt_printf(out, "%-12s:%12llu\n", - bch2_data_types[i], + bch2_data_type_str(i), percpu_u64_get(&ca->io_done->sectors[rw][i]) << 9); } } @@ -916,7 +911,7 @@ SHOW(bch2_dev) } if (attr == &sysfs_has_data) { - prt_bitflags(out, bch2_data_types, bch2_dev_has_data(c, ca)); + prt_bitflags(out, __bch2_data_types, bch2_dev_has_data(c, ca)); prt_char(out, '\n'); } diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index 2fc9e60c754b..b3fe9fc57747 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -107,9 +107,6 @@ err: static int test_iterate(struct bch_fs *c, u64 nr) { - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter = { NULL }; - struct bkey_s_c k; u64 i; int ret = 0; @@ -127,49 +124,43 @@ static int test_iterate(struct bch_fs *c, u64 nr) ret = bch2_btree_insert(c, BTREE_ID_xattrs, &ck.k_i, NULL, 0); bch_err_msg(c, ret, "insert error"); if (ret) - goto err; + return ret; } pr_info("iterating forwards"); - i = 0; - ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), POS(0, U64_MAX), - 0, k, ({ - BUG_ON(k.k->p.offset != i++); - 0; - })); + ret = bch2_trans_run(c, + for_each_btree_key_upto(trans, iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), POS(0, U64_MAX), + 0, k, ({ + BUG_ON(k.k->p.offset != i++); + 0; + }))); bch_err_msg(c, ret, "error iterating forwards"); if (ret) - goto err; + return ret; BUG_ON(i != nr); pr_info("iterating backwards"); - ret = for_each_btree_key_reverse(trans, iter, BTREE_ID_xattrs, - SPOS(0, U64_MAX, U32_MAX), 0, k, - ({ + ret = bch2_trans_run(c, + for_each_btree_key_reverse(trans, iter, BTREE_ID_xattrs, + SPOS(0, U64_MAX, U32_MAX), 0, k, ({ BUG_ON(k.k->p.offset != --i); 0; - })); + }))); bch_err_msg(c, ret, "error iterating backwards"); if (ret) - goto err; + return ret; BUG_ON(i); -err: - bch2_trans_iter_exit(trans, &iter); - bch2_trans_put(trans); - return ret; + return 0; } static int test_iterate_extents(struct bch_fs *c, u64 nr) { - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter = { NULL }; - struct bkey_s_c k; u64 i; int ret = 0; @@ -188,51 +179,45 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr) ret = bch2_btree_insert(c, BTREE_ID_extents, &ck.k_i, NULL, 0); bch_err_msg(c, ret, "insert error"); if (ret) - goto err; + return ret; } pr_info("iterating forwards"); - i = 0; - ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_extents, - SPOS(0, 0, U32_MAX), POS(0, U64_MAX), - 0, k, ({ - BUG_ON(bkey_start_offset(k.k) != i); - i = k.k->p.offset; - 0; - })); + ret = bch2_trans_run(c, + for_each_btree_key_upto(trans, iter, BTREE_ID_extents, + SPOS(0, 0, U32_MAX), POS(0, U64_MAX), + 0, k, ({ + BUG_ON(bkey_start_offset(k.k) != i); + i = k.k->p.offset; + 0; + }))); bch_err_msg(c, ret, "error iterating forwards"); if (ret) - goto err; + return ret; BUG_ON(i != nr); pr_info("iterating backwards"); - ret = for_each_btree_key_reverse(trans, iter, BTREE_ID_extents, - SPOS(0, U64_MAX, U32_MAX), 0, k, - ({ + ret = bch2_trans_run(c, + for_each_btree_key_reverse(trans, iter, BTREE_ID_extents, + SPOS(0, U64_MAX, U32_MAX), 0, k, ({ BUG_ON(k.k->p.offset != i); i = bkey_start_offset(k.k); 0; - })); + }))); bch_err_msg(c, ret, "error iterating backwards"); if (ret) - goto err; + return ret; BUG_ON(i); -err: - bch2_trans_iter_exit(trans, &iter); - bch2_trans_put(trans); - return ret; + return 0; } static int test_iterate_slots(struct bch_fs *c, u64 nr) { - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter = { NULL }; - struct bkey_s_c k; u64 i; int ret = 0; @@ -250,57 +235,48 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) ret = bch2_btree_insert(c, BTREE_ID_xattrs, &ck.k_i, NULL, 0); bch_err_msg(c, ret, "insert error"); if (ret) - goto err; + return ret; } pr_info("iterating forwards"); - i = 0; - ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), POS(0, U64_MAX), - 0, k, ({ - BUG_ON(k.k->p.offset != i); - i += 2; - 0; - })); + ret = bch2_trans_run(c, + for_each_btree_key_upto(trans, iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), POS(0, U64_MAX), + 0, k, ({ + BUG_ON(k.k->p.offset != i); + i += 2; + 0; + }))); bch_err_msg(c, ret, "error iterating forwards"); if (ret) - goto err; + return ret; BUG_ON(i != nr * 2); pr_info("iterating forwards by slots"); - i = 0; - ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), POS(0, U64_MAX), - BTREE_ITER_SLOTS, k, ({ - if (i >= nr * 2) - break; + ret = bch2_trans_run(c, + for_each_btree_key_upto(trans, iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), POS(0, U64_MAX), + BTREE_ITER_SLOTS, k, ({ + if (i >= nr * 2) + break; - BUG_ON(k.k->p.offset != i); - BUG_ON(bkey_deleted(k.k) != (i & 1)); + BUG_ON(k.k->p.offset != i); + BUG_ON(bkey_deleted(k.k) != (i & 1)); - i++; - 0; - })); - if (ret < 0) { - bch_err_msg(c, ret, "error iterating forwards by slots"); - goto err; - } - ret = 0; -err: - bch2_trans_put(trans); + i++; + 0; + }))); + bch_err_msg(c, ret, "error iterating forwards by slots"); return ret; } static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) { - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter = { NULL }; - struct bkey_s_c k; u64 i; int ret = 0; @@ -319,50 +295,45 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) ret = bch2_btree_insert(c, BTREE_ID_extents, &ck.k_i, NULL, 0); bch_err_msg(c, ret, "insert error"); if (ret) - goto err; + return ret; } pr_info("iterating forwards"); - i = 0; - ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_extents, - SPOS(0, 0, U32_MAX), POS(0, U64_MAX), - 0, k, ({ - BUG_ON(bkey_start_offset(k.k) != i + 8); - BUG_ON(k.k->size != 8); - i += 16; - 0; - })); + ret = bch2_trans_run(c, + for_each_btree_key_upto(trans, iter, BTREE_ID_extents, + SPOS(0, 0, U32_MAX), POS(0, U64_MAX), + 0, k, ({ + BUG_ON(bkey_start_offset(k.k) != i + 8); + BUG_ON(k.k->size != 8); + i += 16; + 0; + }))); bch_err_msg(c, ret, "error iterating forwards"); if (ret) - goto err; + return ret; BUG_ON(i != nr); pr_info("iterating forwards by slots"); - i = 0; - ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_extents, - SPOS(0, 0, U32_MAX), POS(0, U64_MAX), - BTREE_ITER_SLOTS, k, ({ - if (i == nr) - break; - BUG_ON(bkey_deleted(k.k) != !(i % 16)); + ret = bch2_trans_run(c, + for_each_btree_key_upto(trans, iter, BTREE_ID_extents, + SPOS(0, 0, U32_MAX), POS(0, U64_MAX), + BTREE_ITER_SLOTS, k, ({ + if (i == nr) + break; + BUG_ON(bkey_deleted(k.k) != !(i % 16)); - BUG_ON(bkey_start_offset(k.k) != i); - BUG_ON(k.k->size != 8); - i = k.k->p.offset; - 0; - })); + BUG_ON(bkey_start_offset(k.k) != i); + BUG_ON(k.k->size != 8); + i = k.k->p.offset; + 0; + }))); bch_err_msg(c, ret, "error iterating forwards by slots"); - if (ret) - goto err; - ret = 0; -err: - bch2_trans_put(trans); - return 0; + return ret; } /* @@ -736,8 +707,6 @@ static int rand_delete(struct bch_fs *c, u64 nr) static int seq_insert(struct bch_fs *c, u64 nr) { - struct btree_iter iter; - struct bkey_s_c k; struct bkey_i_cookie insert; bkey_cookie_init(&insert.k_i); @@ -756,11 +725,8 @@ static int seq_insert(struct bch_fs *c, u64 nr) static int seq_lookup(struct bch_fs *c, u64 nr) { - struct btree_iter iter; - struct bkey_s_c k; - return bch2_trans_run(c, - for_each_btree_key2_upto(trans, iter, BTREE_ID_xattrs, + for_each_btree_key_upto(trans, iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), POS(0, U64_MAX), 0, k, 0)); @@ -768,9 +734,6 @@ static int seq_lookup(struct bch_fs *c, u64 nr) static int seq_overwrite(struct bch_fs *c, u64 nr) { - struct btree_iter iter; - struct bkey_s_c k; - return bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), diff --git a/fs/bcachefs/thread_with_file.c b/fs/bcachefs/thread_with_file.c new file mode 100644 index 000000000000..b1c867aa2b58 --- /dev/null +++ b/fs/bcachefs/thread_with_file.c @@ -0,0 +1,299 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef NO_BCACHEFS_FS + +#include "bcachefs.h" +#include "printbuf.h" +#include "thread_with_file.h" + +#include <linux/anon_inodes.h> +#include <linux/file.h> +#include <linux/kthread.h> +#include <linux/pagemap.h> +#include <linux/poll.h> + +void bch2_thread_with_file_exit(struct thread_with_file *thr) +{ + if (thr->task) { + kthread_stop(thr->task); + put_task_struct(thr->task); + } +} + +int bch2_run_thread_with_file(struct thread_with_file *thr, + const struct file_operations *fops, + int (*fn)(void *)) +{ + struct file *file = NULL; + int ret, fd = -1; + unsigned fd_flags = O_CLOEXEC; + + if (fops->read && fops->write) + fd_flags |= O_RDWR; + else if (fops->read) + fd_flags |= O_RDONLY; + else if (fops->write) + fd_flags |= O_WRONLY; + + char name[TASK_COMM_LEN]; + get_task_comm(name, current); + + thr->ret = 0; + thr->task = kthread_create(fn, thr, "%s", name); + ret = PTR_ERR_OR_ZERO(thr->task); + if (ret) + return ret; + + ret = get_unused_fd_flags(fd_flags); + if (ret < 0) + goto err; + fd = ret; + + file = anon_inode_getfile(name, fops, thr, fd_flags); + ret = PTR_ERR_OR_ZERO(file); + if (ret) + goto err; + + fd_install(fd, file); + get_task_struct(thr->task); + wake_up_process(thr->task); + return fd; +err: + if (fd >= 0) + put_unused_fd(fd); + if (thr->task) + kthread_stop(thr->task); + return ret; +} + +static inline bool thread_with_stdio_has_output(struct thread_with_stdio *thr) +{ + return thr->stdio.output_buf.pos || + thr->output2.nr || + thr->thr.done; +} + +static ssize_t thread_with_stdio_read(struct file *file, char __user *buf, + size_t len, loff_t *ppos) +{ + struct thread_with_stdio *thr = + container_of(file->private_data, struct thread_with_stdio, thr); + size_t copied = 0, b; + int ret = 0; + + if ((file->f_flags & O_NONBLOCK) && + !thread_with_stdio_has_output(thr)) + return -EAGAIN; + + ret = wait_event_interruptible(thr->stdio.output_wait, + thread_with_stdio_has_output(thr)); + if (ret) + return ret; + + if (thr->thr.done) + return 0; + + while (len) { + ret = darray_make_room(&thr->output2, thr->stdio.output_buf.pos); + if (ret) + break; + + spin_lock_irq(&thr->stdio.output_lock); + b = min_t(size_t, darray_room(thr->output2), thr->stdio.output_buf.pos); + + memcpy(&darray_top(thr->output2), thr->stdio.output_buf.buf, b); + memmove(thr->stdio.output_buf.buf, + thr->stdio.output_buf.buf + b, + thr->stdio.output_buf.pos - b); + + thr->output2.nr += b; + thr->stdio.output_buf.pos -= b; + spin_unlock_irq(&thr->stdio.output_lock); + + b = min(len, thr->output2.nr); + if (!b) + break; + + b -= copy_to_user(buf, thr->output2.data, b); + if (!b) { + ret = -EFAULT; + break; + } + + copied += b; + buf += b; + len -= b; + + memmove(thr->output2.data, + thr->output2.data + b, + thr->output2.nr - b); + thr->output2.nr -= b; + } + + return copied ?: ret; +} + +static int thread_with_stdio_release(struct inode *inode, struct file *file) +{ + struct thread_with_stdio *thr = + container_of(file->private_data, struct thread_with_stdio, thr); + + bch2_thread_with_file_exit(&thr->thr); + printbuf_exit(&thr->stdio.input_buf); + printbuf_exit(&thr->stdio.output_buf); + darray_exit(&thr->output2); + thr->exit(thr); + return 0; +} + +#define WRITE_BUFFER 4096 + +static inline bool thread_with_stdio_has_input_space(struct thread_with_stdio *thr) +{ + return thr->stdio.input_buf.pos < WRITE_BUFFER || thr->thr.done; +} + +static ssize_t thread_with_stdio_write(struct file *file, const char __user *ubuf, + size_t len, loff_t *ppos) +{ + struct thread_with_stdio *thr = + container_of(file->private_data, struct thread_with_stdio, thr); + struct printbuf *buf = &thr->stdio.input_buf; + size_t copied = 0; + ssize_t ret = 0; + + while (len) { + if (thr->thr.done) { + ret = -EPIPE; + break; + } + + size_t b = len - fault_in_readable(ubuf, len); + if (!b) { + ret = -EFAULT; + break; + } + + spin_lock(&thr->stdio.input_lock); + if (buf->pos < WRITE_BUFFER) + bch2_printbuf_make_room(buf, min(b, WRITE_BUFFER - buf->pos)); + b = min(len, printbuf_remaining_size(buf)); + + if (b && !copy_from_user_nofault(&buf->buf[buf->pos], ubuf, b)) { + ubuf += b; + len -= b; + copied += b; + buf->pos += b; + } + spin_unlock(&thr->stdio.input_lock); + + if (b) { + wake_up(&thr->stdio.input_wait); + } else { + if ((file->f_flags & O_NONBLOCK)) { + ret = -EAGAIN; + break; + } + + ret = wait_event_interruptible(thr->stdio.input_wait, + thread_with_stdio_has_input_space(thr)); + if (ret) + break; + } + } + + return copied ?: ret; +} + +static __poll_t thread_with_stdio_poll(struct file *file, struct poll_table_struct *wait) +{ + struct thread_with_stdio *thr = + container_of(file->private_data, struct thread_with_stdio, thr); + + poll_wait(file, &thr->stdio.output_wait, wait); + poll_wait(file, &thr->stdio.input_wait, wait); + + __poll_t mask = 0; + + if (thread_with_stdio_has_output(thr)) + mask |= EPOLLIN; + if (thread_with_stdio_has_input_space(thr)) + mask |= EPOLLOUT; + if (thr->thr.done) + mask |= EPOLLHUP|EPOLLERR; + return mask; +} + +static const struct file_operations thread_with_stdio_fops = { + .release = thread_with_stdio_release, + .read = thread_with_stdio_read, + .write = thread_with_stdio_write, + .poll = thread_with_stdio_poll, + .llseek = no_llseek, +}; + +int bch2_run_thread_with_stdio(struct thread_with_stdio *thr, + void (*exit)(struct thread_with_stdio *), + int (*fn)(void *)) +{ + thr->stdio.input_buf = PRINTBUF; + thr->stdio.input_buf.atomic++; + spin_lock_init(&thr->stdio.input_lock); + init_waitqueue_head(&thr->stdio.input_wait); + + thr->stdio.output_buf = PRINTBUF; + thr->stdio.output_buf.atomic++; + spin_lock_init(&thr->stdio.output_lock); + init_waitqueue_head(&thr->stdio.output_wait); + + darray_init(&thr->output2); + thr->exit = exit; + + return bch2_run_thread_with_file(&thr->thr, &thread_with_stdio_fops, fn); +} + +int bch2_stdio_redirect_read(struct stdio_redirect *stdio, char *buf, size_t len) +{ + wait_event(stdio->input_wait, + stdio->input_buf.pos || stdio->done); + + if (stdio->done) + return -1; + + spin_lock(&stdio->input_lock); + int ret = min(len, stdio->input_buf.pos); + stdio->input_buf.pos -= ret; + memcpy(buf, stdio->input_buf.buf, ret); + memmove(stdio->input_buf.buf, + stdio->input_buf.buf + ret, + stdio->input_buf.pos); + spin_unlock(&stdio->input_lock); + + wake_up(&stdio->input_wait); + return ret; +} + +int bch2_stdio_redirect_readline(struct stdio_redirect *stdio, char *buf, size_t len) +{ + wait_event(stdio->input_wait, + stdio->input_buf.pos || stdio->done); + + if (stdio->done) + return -1; + + spin_lock(&stdio->input_lock); + int ret = min(len, stdio->input_buf.pos); + char *n = memchr(stdio->input_buf.buf, '\n', ret); + if (n) + ret = min(ret, n + 1 - stdio->input_buf.buf); + stdio->input_buf.pos -= ret; + memcpy(buf, stdio->input_buf.buf, ret); + memmove(stdio->input_buf.buf, + stdio->input_buf.buf + ret, + stdio->input_buf.pos); + spin_unlock(&stdio->input_lock); + + wake_up(&stdio->input_wait); + return ret; +} + +#endif /* NO_BCACHEFS_FS */ diff --git a/fs/bcachefs/thread_with_file.h b/fs/bcachefs/thread_with_file.h new file mode 100644 index 000000000000..05879c5048c8 --- /dev/null +++ b/fs/bcachefs/thread_with_file.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_THREAD_WITH_FILE_H +#define _BCACHEFS_THREAD_WITH_FILE_H + +#include "thread_with_file_types.h" + +struct task_struct; + +struct thread_with_file { + struct task_struct *task; + int ret; + bool done; +}; + +void bch2_thread_with_file_exit(struct thread_with_file *); +int bch2_run_thread_with_file(struct thread_with_file *, + const struct file_operations *, + int (*fn)(void *)); + +struct thread_with_stdio { + struct thread_with_file thr; + struct stdio_redirect stdio; + DARRAY(char) output2; + void (*exit)(struct thread_with_stdio *); +}; + +static inline void thread_with_stdio_done(struct thread_with_stdio *thr) +{ + thr->thr.done = true; + thr->stdio.done = true; + wake_up(&thr->stdio.input_wait); + wake_up(&thr->stdio.output_wait); +} + +int bch2_run_thread_with_stdio(struct thread_with_stdio *, + void (*exit)(struct thread_with_stdio *), + int (*fn)(void *)); +int bch2_stdio_redirect_read(struct stdio_redirect *, char *, size_t); +int bch2_stdio_redirect_readline(struct stdio_redirect *, char *, size_t); + +#endif /* _BCACHEFS_THREAD_WITH_FILE_H */ diff --git a/fs/bcachefs/thread_with_file_types.h b/fs/bcachefs/thread_with_file_types.h new file mode 100644 index 000000000000..90b5e645e98c --- /dev/null +++ b/fs/bcachefs/thread_with_file_types.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_THREAD_WITH_FILE_TYPES_H +#define _BCACHEFS_THREAD_WITH_FILE_TYPES_H + +struct stdio_redirect { + spinlock_t output_lock; + wait_queue_head_t output_wait; + struct printbuf output_buf; + + spinlock_t input_lock; + wait_queue_head_t input_wait; + struct printbuf input_buf; + bool done; +}; + +#endif /* _BCACHEFS_THREAD_WITH_FILE_TYPES_H */ diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index fd49b63562c3..293b90d704fb 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -32,22 +32,68 @@ DECLARE_EVENT_CLASS(bpos, TP_printk("%llu:%llu:%u", __entry->p_inode, __entry->p_offset, __entry->p_snapshot) ); -DECLARE_EVENT_CLASS(bkey, - TP_PROTO(struct bch_fs *c, const char *k), - TP_ARGS(c, k), +DECLARE_EVENT_CLASS(fs_str, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str), TP_STRUCT__entry( - __string(k, k ) + __field(dev_t, dev ) + __string(str, str ) ), TP_fast_assign( - __assign_str(k, k); + __entry->dev = c->dev; + __assign_str(str, str); ), - TP_printk("%s", __get_str(k)) + TP_printk("%d,%d\n%s", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(str)) ); -DECLARE_EVENT_CLASS(btree_node, +DECLARE_EVENT_CLASS(trans_str, + TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, const char *str), + TP_ARGS(trans, caller_ip, str), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __array(char, trans_fn, 32 ) + __field(unsigned long, caller_ip ) + __string(str, str ) + ), + + TP_fast_assign( + __entry->dev = trans->c->dev; + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + __assign_str(str, str); + ), + + TP_printk("%d,%d %s %pS %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->trans_fn, (void *) __entry->caller_ip, __get_str(str)) +); + +DECLARE_EVENT_CLASS(trans_str_nocaller, + TP_PROTO(struct btree_trans *trans, const char *str), + TP_ARGS(trans, str), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __array(char, trans_fn, 32 ) + __string(str, str ) + ), + + TP_fast_assign( + __entry->dev = trans->c->dev; + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __assign_str(str, str); + ), + + TP_printk("%d,%d %s %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->trans_fn, __get_str(str)) +); + +DECLARE_EVENT_CLASS(btree_node_nofs, TP_PROTO(struct bch_fs *c, struct btree *b), TP_ARGS(c, b), @@ -72,6 +118,33 @@ DECLARE_EVENT_CLASS(btree_node, __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot) ); +DECLARE_EVENT_CLASS(btree_node, + TP_PROTO(struct btree_trans *trans, struct btree *b), + TP_ARGS(trans, b), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __array(char, trans_fn, 32 ) + __field(u8, level ) + __field(u8, btree_id ) + TRACE_BPOS_entries(pos) + ), + + TP_fast_assign( + __entry->dev = trans->c->dev; + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __entry->level = b->c.level; + __entry->btree_id = b->c.btree_id; + TRACE_BPOS_assign(pos, b->key.k.p); + ), + + TP_printk("%d,%d %s %u %s %llu:%llu:%u", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->trans_fn, + __entry->level, + bch2_btree_id_str(__entry->btree_id), + __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot) +); + DECLARE_EVENT_CLASS(bch_fs, TP_PROTO(struct bch_fs *c), TP_ARGS(c), @@ -87,6 +160,23 @@ DECLARE_EVENT_CLASS(bch_fs, TP_printk("%d,%d", MAJOR(__entry->dev), MINOR(__entry->dev)) ); +DECLARE_EVENT_CLASS(btree_trans, + TP_PROTO(struct btree_trans *trans), + TP_ARGS(trans), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __array(char, trans_fn, 32 ) + ), + + TP_fast_assign( + __entry->dev = trans->c->dev; + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + ), + + TP_printk("%d,%d %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->trans_fn) +); + DECLARE_EVENT_CLASS(bio, TP_PROTO(struct bio *bio), TP_ARGS(bio), @@ -183,9 +273,14 @@ DEFINE_EVENT(bch_fs, journal_full, TP_ARGS(c) ); -DEFINE_EVENT(bch_fs, journal_entry_full, - TP_PROTO(struct bch_fs *c), - TP_ARGS(c) +DEFINE_EVENT(fs_str, journal_entry_full, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) +); + +DEFINE_EVENT(fs_str, journal_entry_close, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); DEFINE_EVENT(bio, journal_write, @@ -286,36 +381,36 @@ TRACE_EVENT(btree_cache_scan, __entry->nr_to_scan, __entry->can_free, __entry->ret) ); -DEFINE_EVENT(btree_node, btree_cache_reap, +DEFINE_EVENT(btree_node_nofs, btree_cache_reap, TP_PROTO(struct bch_fs *c, struct btree *b), TP_ARGS(c, b) ); -DEFINE_EVENT(bch_fs, btree_cache_cannibalize_lock_fail, - TP_PROTO(struct bch_fs *c), - TP_ARGS(c) +DEFINE_EVENT(btree_trans, btree_cache_cannibalize_lock_fail, + TP_PROTO(struct btree_trans *trans), + TP_ARGS(trans) ); -DEFINE_EVENT(bch_fs, btree_cache_cannibalize_lock, - TP_PROTO(struct bch_fs *c), - TP_ARGS(c) +DEFINE_EVENT(btree_trans, btree_cache_cannibalize_lock, + TP_PROTO(struct btree_trans *trans), + TP_ARGS(trans) ); -DEFINE_EVENT(bch_fs, btree_cache_cannibalize, - TP_PROTO(struct bch_fs *c), - TP_ARGS(c) +DEFINE_EVENT(btree_trans, btree_cache_cannibalize, + TP_PROTO(struct btree_trans *trans), + TP_ARGS(trans) ); -DEFINE_EVENT(bch_fs, btree_cache_cannibalize_unlock, - TP_PROTO(struct bch_fs *c), - TP_ARGS(c) +DEFINE_EVENT(btree_trans, btree_cache_cannibalize_unlock, + TP_PROTO(struct btree_trans *trans), + TP_ARGS(trans) ); /* Btree */ DEFINE_EVENT(btree_node, btree_node_read, - TP_PROTO(struct bch_fs *c, struct btree *b), - TP_ARGS(c, b) + TP_PROTO(struct btree_trans *trans, struct btree *b), + TP_ARGS(trans, b) ); TRACE_EVENT(btree_node_write, @@ -339,13 +434,13 @@ TRACE_EVENT(btree_node_write, ); DEFINE_EVENT(btree_node, btree_node_alloc, - TP_PROTO(struct bch_fs *c, struct btree *b), - TP_ARGS(c, b) + TP_PROTO(struct btree_trans *trans, struct btree *b), + TP_ARGS(trans, b) ); DEFINE_EVENT(btree_node, btree_node_free, - TP_PROTO(struct bch_fs *c, struct btree *b), - TP_ARGS(c, b) + TP_PROTO(struct btree_trans *trans, struct btree *b), + TP_ARGS(trans, b) ); TRACE_EVENT(btree_reserve_get_fail, @@ -377,28 +472,28 @@ TRACE_EVENT(btree_reserve_get_fail, ); DEFINE_EVENT(btree_node, btree_node_compact, - TP_PROTO(struct bch_fs *c, struct btree *b), - TP_ARGS(c, b) + TP_PROTO(struct btree_trans *trans, struct btree *b), + TP_ARGS(trans, b) ); DEFINE_EVENT(btree_node, btree_node_merge, - TP_PROTO(struct bch_fs *c, struct btree *b), - TP_ARGS(c, b) + TP_PROTO(struct btree_trans *trans, struct btree *b), + TP_ARGS(trans, b) ); DEFINE_EVENT(btree_node, btree_node_split, - TP_PROTO(struct bch_fs *c, struct btree *b), - TP_ARGS(c, b) + TP_PROTO(struct btree_trans *trans, struct btree *b), + TP_ARGS(trans, b) ); DEFINE_EVENT(btree_node, btree_node_rewrite, - TP_PROTO(struct bch_fs *c, struct btree *b), - TP_ARGS(c, b) + TP_PROTO(struct btree_trans *trans, struct btree *b), + TP_ARGS(trans, b) ); DEFINE_EVENT(btree_node, btree_node_set_root, - TP_PROTO(struct bch_fs *c, struct btree *b), - TP_ARGS(c, b) + TP_PROTO(struct btree_trans *trans, struct btree *b), + TP_ARGS(trans, b) ); TRACE_EVENT(btree_path_relock_fail, @@ -433,7 +528,7 @@ TRACE_EVENT(btree_path_relock_fail, __entry->level = path->level; TRACE_BPOS_assign(pos, path->pos); - c = bch2_btree_node_lock_counts(trans, NULL, &path->l[level].b->c, level), + c = bch2_btree_node_lock_counts(trans, NULL, &path->l[level].b->c, level); __entry->self_read_count = c.n[SIX_LOCK_read]; __entry->self_intent_count = c.n[SIX_LOCK_intent]; @@ -717,44 +812,32 @@ TRACE_EVENT(bucket_evacuate, __entry->dev_idx, __entry->bucket) ); -DEFINE_EVENT(bkey, move_extent, - TP_PROTO(struct bch_fs *c, const char *k), - TP_ARGS(c, k) +DEFINE_EVENT(fs_str, move_extent, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); -DEFINE_EVENT(bkey, move_extent_read, - TP_PROTO(struct bch_fs *c, const char *k), - TP_ARGS(c, k) +DEFINE_EVENT(fs_str, move_extent_read, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); -DEFINE_EVENT(bkey, move_extent_write, - TP_PROTO(struct bch_fs *c, const char *k), - TP_ARGS(c, k) +DEFINE_EVENT(fs_str, move_extent_write, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); -DEFINE_EVENT(bkey, move_extent_finish, - TP_PROTO(struct bch_fs *c, const char *k), - TP_ARGS(c, k) +DEFINE_EVENT(fs_str, move_extent_finish, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); -TRACE_EVENT(move_extent_fail, - TP_PROTO(struct bch_fs *c, const char *msg), - TP_ARGS(c, msg), - - TP_STRUCT__entry( - __field(dev_t, dev ) - __string(msg, msg ) - ), - - TP_fast_assign( - __entry->dev = c->dev; - __assign_str(msg, msg); - ), - - TP_printk("%d:%d %s", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(msg)) +DEFINE_EVENT(fs_str, move_extent_fail, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); -DEFINE_EVENT(bkey, move_extent_start_fail, +DEFINE_EVENT(fs_str, move_extent_start_fail, TP_PROTO(struct bch_fs *c, const char *str), TP_ARGS(c, str) ); @@ -930,7 +1013,7 @@ TRACE_EVENT(trans_restart_split_race, __entry->level = b->c.level; __entry->written = b->written; __entry->blocks = btree_blocks(trans->c); - __entry->u64s_remaining = bch_btree_keys_u64s_remaining(trans->c, b); + __entry->u64s_remaining = bch2_btree_keys_u64s_remaining(b); ), TP_printk("%s %pS l=%u written %u/%u u64s remaining %u", @@ -987,10 +1070,11 @@ DEFINE_EVENT(transaction_event, trans_restart_key_cache_raced, TP_ARGS(trans, caller_ip) ); -DEFINE_EVENT(transaction_event, trans_restart_too_many_iters, +DEFINE_EVENT(trans_str, trans_restart_too_many_iters, TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip), - TP_ARGS(trans, caller_ip) + unsigned long caller_ip, + const char *paths), + TP_ARGS(trans, caller_ip, paths) ); DECLARE_EVENT_CLASS(transaction_restart_iter, @@ -1036,8 +1120,6 @@ DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_split, TP_ARGS(trans, caller_ip, path) ); -struct get_locks_fail; - TRACE_EVENT(trans_restart_upgrade, TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, @@ -1056,8 +1138,6 @@ TRACE_EVENT(trans_restart_upgrade, __field(u8, level ) __field(u32, path_seq ) __field(u32, node_seq ) - __field(u32, path_alloc_seq ) - __field(u32, downgrade_seq) TRACE_BPOS_entries(pos) ), @@ -1070,12 +1150,10 @@ TRACE_EVENT(trans_restart_upgrade, __entry->level = f->l; __entry->path_seq = path->l[f->l].lock_seq; __entry->node_seq = IS_ERR_OR_NULL(f->b) ? 0 : f->b->c.lock.seq; - __entry->path_alloc_seq = path->alloc_seq; - __entry->downgrade_seq = path->downgrade_seq; TRACE_BPOS_assign(pos, path->pos) ), - TP_printk("%s %pS btree %s pos %llu:%llu:%u locks_want %u -> %u level %u path seq %u node seq %u alloc_seq %u downgrade_seq %u", + TP_printk("%s %pS btree %s pos %llu:%llu:%u locks_want %u -> %u level %u path seq %u node seq %u", __entry->trans_fn, (void *) __entry->caller_ip, bch2_btree_id_str(__entry->btree_id), @@ -1086,16 +1164,12 @@ TRACE_EVENT(trans_restart_upgrade, __entry->new_locks_want, __entry->level, __entry->path_seq, - __entry->node_seq, - __entry->path_alloc_seq, - __entry->downgrade_seq) + __entry->node_seq) ); -DEFINE_EVENT(transaction_restart_iter, trans_restart_relock, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip, - struct btree_path *path), - TP_ARGS(trans, caller_ip, path) +DEFINE_EVENT(trans_str, trans_restart_relock, + TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, const char *str), + TP_ARGS(trans, caller_ip, str) ); DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_next_node, @@ -1160,10 +1234,10 @@ DEFINE_EVENT(transaction_restart_iter, trans_restart_memory_allocation_failure, TP_ARGS(trans, caller_ip, path) ); -DEFINE_EVENT(transaction_event, trans_restart_would_deadlock, +DEFINE_EVENT(trans_str_nocaller, trans_restart_would_deadlock, TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip), - TP_ARGS(trans, caller_ip) + const char *cycle), + TP_ARGS(trans, cycle) ); DEFINE_EVENT(transaction_event, trans_restart_would_deadlock_recursion_limit, @@ -1252,22 +1326,37 @@ TRACE_EVENT(trans_restart_key_cache_key_realloced, TRACE_EVENT(path_downgrade, TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, - struct btree_path *path), - TP_ARGS(trans, caller_ip, path), + struct btree_path *path, + unsigned old_locks_want), + TP_ARGS(trans, caller_ip, path, old_locks_want), TP_STRUCT__entry( __array(char, trans_fn, 32 ) __field(unsigned long, caller_ip ) + __field(unsigned, old_locks_want ) + __field(unsigned, new_locks_want ) + __field(unsigned, btree ) + TRACE_BPOS_entries(pos) ), TP_fast_assign( strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); __entry->caller_ip = caller_ip; + __entry->old_locks_want = old_locks_want; + __entry->new_locks_want = path->locks_want; + __entry->btree = path->btree_id; + TRACE_BPOS_assign(pos, path->pos); ), - TP_printk("%s %pS", + TP_printk("%s %pS locks_want %u -> %u %s %llu:%llu:%u", __entry->trans_fn, - (void *) __entry->caller_ip) + (void *) __entry->caller_ip, + __entry->old_locks_want, + __entry->new_locks_want, + bch2_btree_id_str(__entry->btree), + __entry->pos_inode, + __entry->pos_offset, + __entry->pos_snapshot) ); DEFINE_EVENT(transaction_event, trans_restart_write_buffer_flush, @@ -1298,21 +1387,48 @@ TRACE_EVENT(write_buffer_flush, __entry->nr, __entry->size, __entry->skipped, __entry->fast) ); +TRACE_EVENT(write_buffer_flush_sync, + TP_PROTO(struct btree_trans *trans, unsigned long caller_ip), + TP_ARGS(trans, caller_ip), + + TP_STRUCT__entry( + __array(char, trans_fn, 32 ) + __field(unsigned long, caller_ip ) + ), + + TP_fast_assign( + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + ), + + TP_printk("%s %pS", __entry->trans_fn, (void *) __entry->caller_ip) +); + TRACE_EVENT(write_buffer_flush_slowpath, - TP_PROTO(struct btree_trans *trans, size_t nr, size_t size), - TP_ARGS(trans, nr, size), + TP_PROTO(struct btree_trans *trans, size_t slowpath, size_t total), + TP_ARGS(trans, slowpath, total), TP_STRUCT__entry( - __field(size_t, nr ) - __field(size_t, size ) + __field(size_t, slowpath ) + __field(size_t, total ) ), TP_fast_assign( - __entry->nr = nr; - __entry->size = size; + __entry->slowpath = slowpath; + __entry->total = total; ), - TP_printk("%zu/%zu", __entry->nr, __entry->size) + TP_printk("%zu/%zu", __entry->slowpath, __entry->total) +); + +DEFINE_EVENT(fs_str, rebalance_extent, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) +); + +DEFINE_EVENT(fs_str, data_update, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); #endif /* _TRACE_BCACHEFS_H */ diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index 84b142fcc3df..56b815fd9fc6 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -241,12 +241,17 @@ bool bch2_is_zero(const void *_p, size_t n) return true; } -void bch2_prt_u64_binary(struct printbuf *out, u64 v, unsigned nr_bits) +void bch2_prt_u64_base2_nbits(struct printbuf *out, u64 v, unsigned nr_bits) { while (nr_bits) prt_char(out, '0' + ((v >> --nr_bits) & 1)); } +void bch2_prt_u64_base2(struct printbuf *out, u64 v) +{ + bch2_prt_u64_base2_nbits(out, v, fls64(v) ?: 1); +} + void bch2_print_string_as_lines(const char *prefix, const char *lines) { const char *p; @@ -267,14 +272,14 @@ void bch2_print_string_as_lines(const char *prefix, const char *lines) console_unlock(); } -int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task) +int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task, unsigned skipnr, + gfp_t gfp) { #ifdef CONFIG_STACKTRACE unsigned nr_entries = 0; - int ret = 0; stack->nr = 0; - ret = darray_make_room(stack, 32); + int ret = darray_make_room_gfp(stack, 32, gfp); if (ret) return ret; @@ -282,7 +287,7 @@ int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task) return -1; do { - nr_entries = stack_trace_save_tsk(task, stack->data, stack->size, 0); + nr_entries = stack_trace_save_tsk(task, stack->data, stack->size, skipnr + 1); } while (nr_entries == stack->size && !(ret = darray_make_room(stack, stack->size * 2))); @@ -297,24 +302,74 @@ int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task) void bch2_prt_backtrace(struct printbuf *out, bch_stacktrace *stack) { - unsigned long *i; - darray_for_each(*stack, i) { prt_printf(out, "[<0>] %pB", (void *) *i); prt_newline(out); } } -int bch2_prt_task_backtrace(struct printbuf *out, struct task_struct *task) +int bch2_prt_task_backtrace(struct printbuf *out, struct task_struct *task, unsigned skipnr, gfp_t gfp) { bch_stacktrace stack = { 0 }; - int ret = bch2_save_backtrace(&stack, task); + int ret = bch2_save_backtrace(&stack, task, skipnr + 1, gfp); bch2_prt_backtrace(out, &stack); darray_exit(&stack); return ret; } +#ifndef __KERNEL__ +#include <time.h> +void bch2_prt_datetime(struct printbuf *out, time64_t sec) +{ + time_t t = sec; + char buf[64]; + ctime_r(&t, buf); + strim(buf); + prt_str(out, buf); +} +#else +void bch2_prt_datetime(struct printbuf *out, time64_t sec) +{ + char buf[64]; + snprintf(buf, sizeof(buf), "%ptT", &sec); + prt_u64(out, sec); +} +#endif + +static const struct time_unit { + const char *name; + u64 nsecs; +} time_units[] = { + { "ns", 1 }, + { "us", NSEC_PER_USEC }, + { "ms", NSEC_PER_MSEC }, + { "s", NSEC_PER_SEC }, + { "m", (u64) NSEC_PER_SEC * 60}, + { "h", (u64) NSEC_PER_SEC * 3600}, + { "eon", U64_MAX }, +}; + +static const struct time_unit *pick_time_units(u64 ns) +{ + const struct time_unit *u; + + for (u = time_units; + u + 1 < time_units + ARRAY_SIZE(time_units) && + ns >= u[1].nsecs << 1; + u++) + ; + + return u; +} + +void bch2_pr_time_units(struct printbuf *out, u64 ns) +{ + const struct time_unit *u = pick_time_units(ns); + + prt_printf(out, "%llu %s", div_u64(ns, u->nsecs), u->name); +} + /* time stats: */ #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT @@ -359,6 +414,7 @@ static inline void bch2_time_stats_update_one(struct bch2_time_stats *stats, mean_and_variance_weighted_update(&stats->duration_stats_weighted, duration); stats->max_duration = max(stats->max_duration, duration); stats->min_duration = min(stats->min_duration, duration); + stats->total_duration += duration; bch2_quantiles_update(&stats->quantiles, duration); } @@ -372,29 +428,33 @@ static inline void bch2_time_stats_update_one(struct bch2_time_stats *stats, } } +static void __bch2_time_stats_clear_buffer(struct bch2_time_stats *stats, + struct bch2_time_stat_buffer *b) +{ + for (struct bch2_time_stat_buffer_entry *i = b->entries; + i < b->entries + ARRAY_SIZE(b->entries); + i++) + bch2_time_stats_update_one(stats, i->start, i->end); + b->nr = 0; +} + static noinline void bch2_time_stats_clear_buffer(struct bch2_time_stats *stats, struct bch2_time_stat_buffer *b) { - struct bch2_time_stat_buffer_entry *i; unsigned long flags; spin_lock_irqsave(&stats->lock, flags); - for (i = b->entries; - i < b->entries + ARRAY_SIZE(b->entries); - i++) - bch2_time_stats_update_one(stats, i->start, i->end); + __bch2_time_stats_clear_buffer(stats, b); spin_unlock_irqrestore(&stats->lock, flags); - - b->nr = 0; } void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) { unsigned long flags; - WARN_RATELIMIT(!stats->min_duration || !stats->min_freq, - "time_stats: min_duration = %llu, min_freq = %llu", - stats->min_duration, stats->min_freq); + WARN_ONCE(!stats->duration_stats_weighted.weight || + !stats->freq_stats_weighted.weight, + "uninitialized time_stats"); if (!stats->buffer) { spin_lock_irqsave(&stats->lock, flags); @@ -423,40 +483,6 @@ void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) preempt_enable(); } } -#endif - -static const struct time_unit { - const char *name; - u64 nsecs; -} time_units[] = { - { "ns", 1 }, - { "us", NSEC_PER_USEC }, - { "ms", NSEC_PER_MSEC }, - { "s", NSEC_PER_SEC }, - { "m", (u64) NSEC_PER_SEC * 60}, - { "h", (u64) NSEC_PER_SEC * 3600}, - { "eon", U64_MAX }, -}; - -static const struct time_unit *pick_time_units(u64 ns) -{ - const struct time_unit *u; - - for (u = time_units; - u + 1 < time_units + ARRAY_SIZE(time_units) && - ns >= u[1].nsecs << 1; - u++) - ; - - return u; -} - -void bch2_pr_time_units(struct printbuf *out, u64 ns) -{ - const struct time_unit *u = pick_time_units(ns); - - prt_printf(out, "%llu %s", div_u64(ns, u->nsecs), u->name); -} static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns) { @@ -467,26 +493,6 @@ static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns) prt_printf(out, "%s", u->name); } -#ifndef __KERNEL__ -#include <time.h> -void bch2_prt_datetime(struct printbuf *out, time64_t sec) -{ - time_t t = sec; - char buf[64]; - ctime_r(&t, buf); - prt_str(out, buf); -} -#else -void bch2_prt_datetime(struct printbuf *out, time64_t sec) -{ - char buf[64]; - snprintf(buf, sizeof(buf), "%ptT", &sec); - prt_u64(out, sec); -} -#endif - -#define TABSTOP_SIZE 12 - static inline void pr_name_and_units(struct printbuf *out, const char *name, u64 ns) { prt_str(out, name); @@ -495,12 +501,24 @@ static inline void pr_name_and_units(struct printbuf *out, const char *name, u64 prt_newline(out); } +#define TABSTOP_SIZE 12 + void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats) { const struct time_unit *u; s64 f_mean = 0, d_mean = 0; u64 q, last_q = 0, f_stddev = 0, d_stddev = 0; int i; + + if (stats->buffer) { + int cpu; + + spin_lock_irq(&stats->lock); + for_each_possible_cpu(cpu) + __bch2_time_stats_clear_buffer(stats, per_cpu_ptr(stats->buffer, cpu)); + spin_unlock_irq(&stats->lock); + } + /* * avoid divide by zero */ @@ -546,6 +564,7 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats pr_name_and_units(out, "min:", stats->min_duration); pr_name_and_units(out, "max:", stats->max_duration); + pr_name_and_units(out, "total:", stats->total_duration); prt_printf(out, "mean:"); prt_tab(out); @@ -603,6 +622,9 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats last_q = q; } } +#else +void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats) {} +#endif void bch2_time_stats_exit(struct bch2_time_stats *stats) { @@ -1157,3 +1179,39 @@ u64 *bch2_acc_percpu_u64s(u64 __percpu *p, unsigned nr) return ret; } + +void bch2_darray_str_exit(darray_str *d) +{ + darray_for_each(*d, i) + kfree(*i); + darray_exit(d); +} + +int bch2_split_devs(const char *_dev_name, darray_str *ret) +{ + darray_init(ret); + + char *dev_name, *s, *orig; + + dev_name = orig = kstrdup(_dev_name, GFP_KERNEL); + if (!dev_name) + return -ENOMEM; + + while ((s = strsep(&dev_name, ":"))) { + char *p = kstrdup(s, GFP_KERNEL); + if (!p) + goto err; + + if (darray_push(ret, p)) { + kfree(p); + goto err; + } + } + + kfree(orig); + return 0; +err: + bch2_darray_str_exit(ret); + kfree(orig); + return -ENOMEM; +} diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index b701f7fe0784..b414736d59a5 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -342,14 +342,24 @@ bool bch2_is_zero(const void *, size_t); u64 bch2_read_flag_list(char *, const char * const[]); -void bch2_prt_u64_binary(struct printbuf *, u64, unsigned); +void bch2_prt_u64_base2_nbits(struct printbuf *, u64, unsigned); +void bch2_prt_u64_base2(struct printbuf *, u64); void bch2_print_string_as_lines(const char *prefix, const char *lines); typedef DARRAY(unsigned long) bch_stacktrace; -int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *); +int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *, unsigned, gfp_t); void bch2_prt_backtrace(struct printbuf *, bch_stacktrace *); -int bch2_prt_task_backtrace(struct printbuf *, struct task_struct *); +int bch2_prt_task_backtrace(struct printbuf *, struct task_struct *, unsigned, gfp_t); + +static inline void prt_bdevname(struct printbuf *out, struct block_device *bdev) +{ +#ifdef __KERNEL__ + prt_printf(out, "%pg", bdev); +#else + prt_str(out, bdev->name); +#endif +} #define NR_QUANTILES 15 #define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES) @@ -374,8 +384,9 @@ struct bch2_time_stat_buffer { struct bch2_time_stats { spinlock_t lock; /* all fields are in nanoseconds */ - u64 max_duration; u64 min_duration; + u64 max_duration; + u64 total_duration; u64 max_freq; u64 min_freq; u64 last_event; @@ -390,15 +401,39 @@ struct bch2_time_stats { #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT void __bch2_time_stats_update(struct bch2_time_stats *stats, u64, u64); -#else -static inline void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) {} -#endif static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start) { __bch2_time_stats_update(stats, start, local_clock()); } +static inline bool track_event_change(struct bch2_time_stats *stats, + u64 *start, bool v) +{ + if (v != !!*start) { + if (!v) { + bch2_time_stats_update(stats, *start); + *start = 0; + } else { + *start = local_clock() ?: 1; + return true; + } + } + + return false; +} +#else +static inline void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) {} +static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start) {} +static inline bool track_event_change(struct bch2_time_stats *stats, + u64 *start, bool v) +{ + bool ret = v && !*start; + *start = v; + return ret; +} +#endif + void bch2_time_stats_to_text(struct printbuf *, struct bch2_time_stats *); void bch2_time_stats_exit(struct bch2_time_stats *); @@ -831,4 +866,14 @@ static inline int cmp_le32(__le32 l, __le32 r) #include <linux/uuid.h> +#define QSTR(n) { { { .len = strlen(n) } }, .name = n } + +static inline bool qstr_eq(const struct qstr l, const struct qstr r) +{ + return l.len == r.len && !memcmp(l.name, r.name, l.len); +} + +void bch2_darray_str_exit(darray_str *); +int bch2_split_devs(const char *, darray_str *); + #endif /* _BCACHEFS_UTIL_H */ diff --git a/fs/bcachefs/vstructs.h b/fs/bcachefs/vstructs.h index a6561b4b36a6..2ad338e282da 100644 --- a/fs/bcachefs/vstructs.h +++ b/fs/bcachefs/vstructs.h @@ -48,14 +48,14 @@ ((void *) ((u64 *) (_s)->_data + __vstruct_u64s(_s))) #define vstruct_for_each(_s, _i) \ - for (_i = (_s)->start; \ + for (typeof(&(_s)->start[0]) _i = (_s)->start; \ _i < vstruct_last(_s); \ _i = vstruct_next(_i)) -#define vstruct_for_each_safe(_s, _i, _t) \ - for (_i = (_s)->start; \ - _i < vstruct_last(_s) && (_t = vstruct_next(_i), true); \ - _i = _t) +#define vstruct_for_each_safe(_s, _i) \ + for (typeof(&(_s)->start[0]) _next, _i = (_s)->start; \ + _i < vstruct_last(_s) && (_next = vstruct_next(_i), true); \ + _i = _next) #define vstruct_idx(_s, _idx) \ ((typeof(&(_s)->start[0])) ((_s)->_data + (_idx))) diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index 5a1858fb9879..9c0d2316031b 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -590,8 +590,9 @@ err: mutex_unlock(&inode->ei_update_lock); if (value && - (opt_id == Opt_background_compression || - opt_id == Opt_background_target)) + (opt_id == Opt_background_target || + opt_id == Opt_background_compression || + (opt_id == Opt_compression && !inode_opt_get(c, &inode->ei_inode, background_compression)))) bch2_set_rebalance_needs_scan(c, inode->ei_inode.bi_inum); return bch2_err_class(ret); diff --git a/fs/bcachefs/xattr_format.h b/fs/bcachefs/xattr_format.h new file mode 100644 index 000000000000..e9f810539552 --- /dev/null +++ b/fs/bcachefs/xattr_format.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_XATTR_FORMAT_H +#define _BCACHEFS_XATTR_FORMAT_H + +#define KEY_TYPE_XATTR_INDEX_USER 0 +#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS 1 +#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT 2 +#define KEY_TYPE_XATTR_INDEX_TRUSTED 3 +#define KEY_TYPE_XATTR_INDEX_SECURITY 4 + +struct bch_xattr { + struct bch_val v; + __u8 x_type; + __u8 x_name_len; + __le16 x_val_len; + __u8 x_name[]; +} __packed __aligned(8); + +#endif /* _BCACHEFS_XATTR_FORMAT_H */ diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index a93d76df8ed8..2b4dda047450 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -671,9 +671,6 @@ static struct dentry *befs_get_parent(struct dentry *child) parent = befs_iget(child->d_sb, (unsigned long)befs_ino->i_parent.start); - if (IS_ERR(parent)) - return ERR_CAST(parent); - return d_obtain_alias(parent); } diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c index fbc4ae80a4b2..c375e22c4c0c 100644 --- a/fs/bfs/dir.c +++ b/fs/bfs/dir.c @@ -275,11 +275,6 @@ static int bfs_add_entry(struct inode *dir, const struct qstr *child, int ino) dprintf("name=%s, namelen=%d\n", name, namelen); - if (!namelen) - return -ENOENT; - if (namelen > BFS_NAMELEN) - return -ENAMETOOLONG; - sblock = BFS_I(dir)->i_sblock; eblock = BFS_I(dir)->i_eblock; for (block = sblock; block <= eblock; block++) { diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 193168214eeb..68345f73d429 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -141,16 +141,16 @@ static int compression_decompress_bio(struct list_head *ws, } static int compression_decompress(int type, struct list_head *ws, - const u8 *data_in, struct page *dest_page, - unsigned long start_byte, size_t srclen, size_t destlen) + const u8 *data_in, struct page *dest_page, + unsigned long dest_pgoff, size_t srclen, size_t destlen) { switch (type) { case BTRFS_COMPRESS_ZLIB: return zlib_decompress(ws, data_in, dest_page, - start_byte, srclen, destlen); + dest_pgoff, srclen, destlen); case BTRFS_COMPRESS_LZO: return lzo_decompress(ws, data_in, dest_page, - start_byte, srclen, destlen); + dest_pgoff, srclen, destlen); case BTRFS_COMPRESS_ZSTD: return zstd_decompress(ws, data_in, dest_page, - start_byte, srclen, destlen); + dest_pgoff, srclen, destlen); case BTRFS_COMPRESS_NONE: default: /* @@ -1037,14 +1037,23 @@ static int btrfs_decompress_bio(struct compressed_bio *cb) * start_byte tells us the offset into the compressed data we're interested in */ int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page, - unsigned long start_byte, size_t srclen, size_t destlen) + unsigned long dest_pgoff, size_t srclen, size_t destlen) { + struct btrfs_fs_info *fs_info = btrfs_sb(dest_page->mapping->host->i_sb); struct list_head *workspace; + const u32 sectorsize = fs_info->sectorsize; int ret; + /* + * The full destination page range should not exceed the page size. + * And the @destlen should not exceed sectorsize, as this is only called for + * inline file extents, which should not exceed sectorsize. + */ + ASSERT(dest_pgoff + destlen <= PAGE_SIZE && destlen <= sectorsize); + workspace = get_workspace(type, 0); ret = compression_decompress(type, workspace, data_in, dest_page, - start_byte, srclen, destlen); + dest_pgoff, srclen, destlen); put_workspace(type, workspace); return ret; diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 93cc92974dee..afd7e50d073d 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -148,7 +148,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, unsigned long *total_in, unsigned long *total_out); int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb); int zlib_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long start_byte, size_t srclen, + struct page *dest_page, unsigned long dest_pgoff, size_t srclen, size_t destlen); struct list_head *zlib_alloc_workspace(unsigned int level); void zlib_free_workspace(struct list_head *ws); @@ -159,7 +159,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, unsigned long *total_in, unsigned long *total_out); int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb); int lzo_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long start_byte, size_t srclen, + struct page *dest_page, unsigned long dest_pgoff, size_t srclen, size_t destlen); struct list_head *lzo_alloc_workspace(unsigned int level); void lzo_free_workspace(struct list_head *ws); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f396aba92c57..8e8cc1111277 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1260,7 +1260,8 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, u64 bytes_left, end; u64 aligned_start = ALIGN(start, 1 << SECTOR_SHIFT); - if (WARN_ON(start != aligned_start)) { + /* Adjust the range to be aligned to 512B sectors if necessary. */ + if (start != aligned_start) { len -= aligned_start - start; len = round_down(len, 1 << SECTOR_SHIFT); start = aligned_start; @@ -4298,6 +4299,42 @@ static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info, return 0; } +static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info, + struct find_free_extent_ctl *ffe_ctl) +{ + if (ffe_ctl->for_treelog) { + spin_lock(&fs_info->treelog_bg_lock); + if (fs_info->treelog_bg) + ffe_ctl->hint_byte = fs_info->treelog_bg; + spin_unlock(&fs_info->treelog_bg_lock); + } else if (ffe_ctl->for_data_reloc) { + spin_lock(&fs_info->relocation_bg_lock); + if (fs_info->data_reloc_bg) + ffe_ctl->hint_byte = fs_info->data_reloc_bg; + spin_unlock(&fs_info->relocation_bg_lock); + } else if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) { + struct btrfs_block_group *block_group; + + spin_lock(&fs_info->zone_active_bgs_lock); + list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) { + /* + * No lock is OK here because avail is monotinically + * decreasing, and this is just a hint. + */ + u64 avail = block_group->zone_capacity - block_group->alloc_offset; + + if (block_group_bits(block_group, ffe_ctl->flags) && + avail >= ffe_ctl->num_bytes) { + ffe_ctl->hint_byte = block_group->start; + break; + } + } + spin_unlock(&fs_info->zone_active_bgs_lock); + } + + return 0; +} + static int prepare_allocation(struct btrfs_fs_info *fs_info, struct find_free_extent_ctl *ffe_ctl, struct btrfs_space_info *space_info, @@ -4308,19 +4345,7 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info, return prepare_allocation_clustered(fs_info, ffe_ctl, space_info, ins); case BTRFS_EXTENT_ALLOC_ZONED: - if (ffe_ctl->for_treelog) { - spin_lock(&fs_info->treelog_bg_lock); - if (fs_info->treelog_bg) - ffe_ctl->hint_byte = fs_info->treelog_bg; - spin_unlock(&fs_info->treelog_bg_lock); - } - if (ffe_ctl->for_data_reloc) { - spin_lock(&fs_info->relocation_bg_lock); - if (fs_info->data_reloc_bg) - ffe_ctl->hint_byte = fs_info->data_reloc_bg; - spin_unlock(&fs_info->relocation_bg_lock); - } - return 0; + return prepare_allocation_zoned(fs_info, ffe_ctl); default: BUG(); } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 809b11472a80..1eb93d3962aa 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4458,6 +4458,8 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) u64 root_flags; int ret; + down_write(&fs_info->subvol_sem); + /* * Don't allow to delete a subvolume with send in progress. This is * inside the inode lock so the error handling that has to drop the bit @@ -4469,25 +4471,25 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) btrfs_warn(fs_info, "attempt to delete subvolume %llu during send", dest->root_key.objectid); - return -EPERM; + ret = -EPERM; + goto out_up_write; } if (atomic_read(&dest->nr_swapfiles)) { spin_unlock(&dest->root_item_lock); btrfs_warn(fs_info, "attempt to delete subvolume %llu with active swapfile", root->root_key.objectid); - return -EPERM; + ret = -EPERM; + goto out_up_write; } root_flags = btrfs_root_flags(&dest->root_item); btrfs_set_root_flags(&dest->root_item, root_flags | BTRFS_ROOT_SUBVOL_DEAD); spin_unlock(&dest->root_item_lock); - down_write(&fs_info->subvol_sem); - ret = may_destroy_subvol(dest); if (ret) - goto out_up_write; + goto out_undead; btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP); /* @@ -4497,7 +4499,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) */ ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true); if (ret) - goto out_up_write; + goto out_undead; trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { @@ -4563,15 +4565,17 @@ out_end_trans: inode->i_flags |= S_DEAD; out_release: btrfs_subvolume_release_metadata(root, &block_rsv); -out_up_write: - up_write(&fs_info->subvol_sem); +out_undead: if (ret) { spin_lock(&dest->root_item_lock); root_flags = btrfs_root_flags(&dest->root_item); btrfs_set_root_flags(&dest->root_item, root_flags & ~BTRFS_ROOT_SUBVOL_DEAD); spin_unlock(&dest->root_item_lock); - } else { + } +out_up_write: + up_write(&fs_info->subvol_sem); + if (!ret) { d_invalidate(dentry); btrfs_prune_dentries(dest); ASSERT(dest->send_in_progress == 0); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 41b479861b3c..dfed9dd9c2d7 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -790,6 +790,9 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, return -EOPNOTSUPP; } + if (btrfs_root_refs(&root->root_item) == 0) + return -ENOENT; + if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) return -EINVAL; @@ -2608,6 +2611,10 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) ret = -EFAULT; goto out; } + if (range.flags & ~BTRFS_DEFRAG_RANGE_FLAGS_SUPP) { + ret = -EOPNOTSUPP; + goto out; + } /* compression requires us to start the IO */ if ((range.flags & BTRFS_DEFRAG_RANGE_COMPRESS)) { range.flags |= BTRFS_DEFRAG_RANGE_START_IO; diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 1131d5a29d61..e43bc0fdc74e 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -425,16 +425,16 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) } int lzo_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long start_byte, size_t srclen, + struct page *dest_page, unsigned long dest_pgoff, size_t srclen, size_t destlen) { struct workspace *workspace = list_entry(ws, struct workspace, list); + struct btrfs_fs_info *fs_info = btrfs_sb(dest_page->mapping->host->i_sb); + const u32 sectorsize = fs_info->sectorsize; size_t in_len; size_t out_len; size_t max_segment_len = WORKSPACE_BUF_LENGTH; int ret = 0; - char *kaddr; - unsigned long bytes; if (srclen < LZO_LEN || srclen > max_segment_len + LZO_LEN * 2) return -EUCLEAN; @@ -451,7 +451,7 @@ int lzo_decompress(struct list_head *ws, const u8 *data_in, } data_in += LZO_LEN; - out_len = PAGE_SIZE; + out_len = sectorsize; ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len); if (ret != LZO_E_OK) { pr_warn("BTRFS: decompress failed!\n"); @@ -459,29 +459,13 @@ int lzo_decompress(struct list_head *ws, const u8 *data_in, goto out; } - if (out_len < start_byte) { + ASSERT(out_len <= sectorsize); + memcpy_to_page(dest_page, dest_pgoff, workspace->buf, out_len); + /* Early end, considered as an error. */ + if (unlikely(out_len < destlen)) { ret = -EIO; - goto out; + memzero_page(dest_page, dest_pgoff + out_len, destlen - out_len); } - - /* - * the caller is already checking against PAGE_SIZE, but lets - * move this check closer to the memcpy/memset - */ - destlen = min_t(unsigned long, destlen, PAGE_SIZE); - bytes = min_t(unsigned long, destlen, out_len - start_byte); - - kaddr = kmap_local_page(dest_page); - memcpy(kaddr, workspace->buf + start_byte, bytes); - - /* - * btrfs_getblock is doing a zero on the tail of the page too, - * but this will cover anything missing from the decompressed - * data. - */ - if (bytes < destlen) - memset(kaddr+bytes, 0, destlen-bytes); - kunmap_local(kaddr); out: return ret; } diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index 6486f0d7e993..8c4fc98ca9ce 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -889,8 +889,10 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, out_unlock: spin_unlock(&fs_info->ref_verify_lock); out: - if (ret) + if (ret) { + btrfs_free_ref_cache(fs_info); btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY); + } return ret; } @@ -1021,8 +1023,8 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info) } } if (ret) { - btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY); btrfs_free_ref_cache(fs_info); + btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY); } btrfs_free_path(path); return ret; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index a01807cbd4d4..0123d2728923 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1098,12 +1098,22 @@ out: static void scrub_read_endio(struct btrfs_bio *bbio) { struct scrub_stripe *stripe = bbio->private; + struct bio_vec *bvec; + int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio)); + int num_sectors; + u32 bio_size = 0; + int i; + + ASSERT(sector_nr < stripe->nr_sectors); + bio_for_each_bvec_all(bvec, &bbio->bio, i) + bio_size += bvec->bv_len; + num_sectors = bio_size >> stripe->bg->fs_info->sectorsize_bits; if (bbio->bio.bi_status) { - bitmap_set(&stripe->io_error_bitmap, 0, stripe->nr_sectors); - bitmap_set(&stripe->error_bitmap, 0, stripe->nr_sectors); + bitmap_set(&stripe->io_error_bitmap, sector_nr, num_sectors); + bitmap_set(&stripe->error_bitmap, sector_nr, num_sectors); } else { - bitmap_clear(&stripe->io_error_bitmap, 0, stripe->nr_sectors); + bitmap_clear(&stripe->io_error_bitmap, sector_nr, num_sectors); } bio_put(&bbio->bio); if (atomic_dec_and_test(&stripe->pending_io)) { @@ -1636,6 +1646,9 @@ static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx, { struct btrfs_fs_info *fs_info = stripe->bg->fs_info; struct btrfs_bio *bbio = NULL; + unsigned int nr_sectors = min(BTRFS_STRIPE_LEN, stripe->bg->start + + stripe->bg->length - stripe->logical) >> + fs_info->sectorsize_bits; u64 stripe_len = BTRFS_STRIPE_LEN; int mirror = stripe->mirror_num; int i; @@ -1646,6 +1659,10 @@ static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx, struct page *page = scrub_stripe_get_page(stripe, i); unsigned int pgoff = scrub_stripe_get_page_offset(stripe, i); + /* We're beyond the chunk boundary, no need to read anymore. */ + if (i >= nr_sectors) + break; + /* The current sector cannot be merged, submit the bio. */ if (bbio && ((i > 0 && @@ -1701,6 +1718,9 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx, { struct btrfs_fs_info *fs_info = sctx->fs_info; struct btrfs_bio *bbio; + unsigned int nr_sectors = min(BTRFS_STRIPE_LEN, stripe->bg->start + + stripe->bg->length - stripe->logical) >> + fs_info->sectorsize_bits; int mirror = stripe->mirror_num; ASSERT(stripe->bg); @@ -1715,14 +1735,16 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx, bbio = btrfs_bio_alloc(SCRUB_STRIPE_PAGES, REQ_OP_READ, fs_info, scrub_read_endio, stripe); - /* Read the whole stripe. */ bbio->bio.bi_iter.bi_sector = stripe->logical >> SECTOR_SHIFT; - for (int i = 0; i < BTRFS_STRIPE_LEN >> PAGE_SHIFT; i++) { + /* Read the whole range inside the chunk boundary. */ + for (unsigned int cur = 0; cur < nr_sectors; cur++) { + struct page *page = scrub_stripe_get_page(stripe, cur); + unsigned int pgoff = scrub_stripe_get_page_offset(stripe, cur); int ret; - ret = bio_add_page(&bbio->bio, stripe->pages[i], PAGE_SIZE, 0); + ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff); /* We should have allocated enough bio vectors. */ - ASSERT(ret == PAGE_SIZE); + ASSERT(ret == fs_info->sectorsize); } atomic_inc(&stripe->pending_io); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 4e36550618e5..2d7519a6ce72 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -8205,8 +8205,8 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) goto out; } - sctx->clone_roots = kvcalloc(sizeof(*sctx->clone_roots), - arg->clone_sources_count + 1, + sctx->clone_roots = kvcalloc(arg->clone_sources_count + 1, + sizeof(*sctx->clone_roots), GFP_KERNEL); if (!sctx->clone_roots) { ret = -ENOMEM; diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 93511d54abf8..0e49dab8dad2 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -475,7 +475,8 @@ void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info, spin_lock_irqsave(&subpage->lock, flags); bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); - folio_start_writeback(folio); + if (!folio_test_writeback(folio)) + folio_start_writeback(folio); spin_unlock_irqrestore(&subpage->lock, flags); } diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 896acfda1789..101f786963d4 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1457,6 +1457,14 @@ static int btrfs_reconfigure(struct fs_context *fc) btrfs_info_to_ctx(fs_info, &old_ctx); + /* + * This is our "bind mount" trick, we don't want to allow the user to do + * anything other than mount a different ro/rw and a different subvol, + * all of the mount options should be maintained. + */ + if (mount_reconfigure) + ctx->mount_opt = old_ctx.mount_opt; + sync_filesystem(sb); set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 50fdc69fdddf..6eccf8496486 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -1436,7 +1436,7 @@ static int check_extent_item(struct extent_buffer *leaf, if (unlikely(ptr + btrfs_extent_inline_ref_size(inline_type) > end)) { extent_err(leaf, slot, "inline ref item overflows extent item, ptr %lu iref size %u end %lu", - ptr, inline_type, end); + ptr, btrfs_extent_inline_ref_size(inline_type), end); return -EUCLEAN; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 4c32497311d2..d67785be2c77 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3087,7 +3087,6 @@ struct btrfs_chunk_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, map = btrfs_find_chunk_map(fs_info, logical, length); if (unlikely(!map)) { - read_unlock(&fs_info->mapping_tree_lock); btrfs_crit(fs_info, "unable to find chunk map for logical %llu length %llu", logical, length); @@ -3095,7 +3094,6 @@ struct btrfs_chunk_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, } if (unlikely(map->start > logical || map->start + map->chunk_len <= logical)) { - read_unlock(&fs_info->mapping_tree_lock); btrfs_crit(fs_info, "found a bad chunk map, wanted %llu-%llu, found %llu-%llu", logical, logical + length, map->start, diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 36cf1f0e338e..8da66ea699e8 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -354,18 +354,13 @@ done: } int zlib_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long start_byte, size_t srclen, + struct page *dest_page, unsigned long dest_pgoff, size_t srclen, size_t destlen) { struct workspace *workspace = list_entry(ws, struct workspace, list); int ret = 0; int wbits = MAX_WBITS; - unsigned long bytes_left; - unsigned long total_out = 0; - unsigned long pg_offset = 0; - - destlen = min_t(unsigned long, destlen, PAGE_SIZE); - bytes_left = destlen; + unsigned long to_copy; workspace->strm.next_in = data_in; workspace->strm.avail_in = srclen; @@ -390,60 +385,30 @@ int zlib_decompress(struct list_head *ws, const u8 *data_in, return -EIO; } - while (bytes_left > 0) { - unsigned long buf_start; - unsigned long buf_offset; - unsigned long bytes; - - ret = zlib_inflate(&workspace->strm, Z_NO_FLUSH); - if (ret != Z_OK && ret != Z_STREAM_END) - break; - - buf_start = total_out; - total_out = workspace->strm.total_out; - - if (total_out == buf_start) { - ret = -EIO; - break; - } - - if (total_out <= start_byte) - goto next; - - if (total_out > start_byte && buf_start < start_byte) - buf_offset = start_byte - buf_start; - else - buf_offset = 0; - - bytes = min(PAGE_SIZE - pg_offset, - PAGE_SIZE - (buf_offset % PAGE_SIZE)); - bytes = min(bytes, bytes_left); + /* + * Everything (in/out buf) should be at most one sector, there should + * be no need to switch any input/output buffer. + */ + ret = zlib_inflate(&workspace->strm, Z_FINISH); + to_copy = min(workspace->strm.total_out, destlen); + if (ret != Z_STREAM_END) + goto out; - memcpy_to_page(dest_page, pg_offset, - workspace->buf + buf_offset, bytes); + memcpy_to_page(dest_page, dest_pgoff, workspace->buf, to_copy); - pg_offset += bytes; - bytes_left -= bytes; -next: - workspace->strm.next_out = workspace->buf; - workspace->strm.avail_out = workspace->buf_size; - } - - if (ret != Z_STREAM_END && bytes_left != 0) +out: + if (unlikely(to_copy != destlen)) { + pr_warn_ratelimited("BTRFS: infalte failed, decompressed=%lu expected=%zu\n", + to_copy, destlen); ret = -EIO; - else + } else { ret = 0; + } zlib_inflateEnd(&workspace->strm); - /* - * this should only happen if zlib returned fewer bytes than we - * expected. btrfs_get_block is responsible for zeroing from the - * end of the inline extent (destlen) to the end of the page - */ - if (pg_offset < destlen) { - memzero_page(dest_page, pg_offset, destlen - pg_offset); - } + if (unlikely(to_copy < destlen)) + memzero_page(dest_page, dest_pgoff + to_copy, destlen - to_copy); return ret; } diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 12066afc235c..168af9d000d1 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -578,26 +578,12 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) kvfree(zones); - switch (bdev_zoned_model(bdev)) { - case BLK_ZONED_HM: + if (bdev_is_zoned(bdev)) { model = "host-managed zoned"; emulated = ""; - break; - case BLK_ZONED_HA: - model = "host-aware zoned"; - emulated = ""; - break; - case BLK_ZONED_NONE: + } else { model = "regular"; emulated = "emulated "; - break; - default: - /* Just in case */ - btrfs_err_in_rcu(fs_info, "zoned: unsupported model %d on %s", - bdev_zoned_model(bdev), - rcu_str_deref(device->name)); - ret = -EOPNOTSUPP; - goto out_free_zone_info; } btrfs_info_in_rcu(fs_info, @@ -609,9 +595,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) out: kvfree(zones); -out_free_zone_info: btrfs_destroy_dev_zone_info(device); - return ret; } @@ -688,8 +672,7 @@ static int btrfs_check_for_zoned_device(struct btrfs_fs_info *fs_info) struct btrfs_device *device; list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) { - if (device->bdev && - bdev_zoned_model(device->bdev) == BLK_ZONED_HM) { + if (device->bdev && bdev_is_zoned(device->bdev)) { btrfs_err(fs_info, "zoned: mode not enabled but zoned device found: %pg", device->bdev); @@ -2072,6 +2055,7 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) map = block_group->physical_map; + spin_lock(&fs_info->zone_active_bgs_lock); spin_lock(&block_group->lock); if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) { ret = true; @@ -2084,7 +2068,6 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) goto out_unlock; } - spin_lock(&fs_info->zone_active_bgs_lock); for (i = 0; i < map->num_stripes; i++) { struct btrfs_zoned_device_info *zinfo; int reserved = 0; @@ -2104,20 +2087,17 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) */ if (atomic_read(&zinfo->active_zones_left) <= reserved) { ret = false; - spin_unlock(&fs_info->zone_active_bgs_lock); goto out_unlock; } if (!btrfs_dev_set_active_zone(device, physical)) { /* Cannot activate the zone */ ret = false; - spin_unlock(&fs_info->zone_active_bgs_lock); goto out_unlock; } if (!is_data) zinfo->reserved_active_zones--; } - spin_unlock(&fs_info->zone_active_bgs_lock); /* Successfully activated all the zones */ set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags); @@ -2125,8 +2105,6 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) /* For the active block group list */ btrfs_get_block_group(block_group); - - spin_lock(&fs_info->zone_active_bgs_lock); list_add_tail(&block_group->active_bg_list, &fs_info->zone_active_bgs); spin_unlock(&fs_info->zone_active_bgs_lock); @@ -2134,6 +2112,7 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) out_unlock: spin_unlock(&block_group->lock); + spin_unlock(&fs_info->zone_active_bgs_lock); return ret; } diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index f24a5ffb7807..f573bda496fb 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -320,7 +320,7 @@ static inline bool btrfs_check_device_zone_type(const struct btrfs_fs_info *fs_i } /* Do not allow Host Managed zoned device. */ - return bdev_zoned_model(bdev) != BLK_ZONED_HM; + return !bdev_is_zoned(bdev); } static inline bool btrfs_check_super_location(struct btrfs_device *device, u64 pos) diff --git a/fs/cachefiles/Kconfig b/fs/cachefiles/Kconfig index 8df715640a48..c5a070550ee3 100644 --- a/fs/cachefiles/Kconfig +++ b/fs/cachefiles/Kconfig @@ -2,7 +2,7 @@ config CACHEFILES tristate "Filesystem caching on files" - depends on FSCACHE && BLOCK + depends on NETFS_SUPPORT && FSCACHE && BLOCK help This permits use of a mounted filesystem as a cache for other filesystems - primarily networking filesystems - thus allowing fast diff --git a/fs/cachefiles/error_inject.c b/fs/cachefiles/error_inject.c index 18de8a876b02..1715d5ca2b2d 100644 --- a/fs/cachefiles/error_inject.c +++ b/fs/cachefiles/error_inject.c @@ -19,7 +19,6 @@ static struct ctl_table cachefiles_sysctls[] = { .mode = 0644, .proc_handler = proc_douintvec, }, - {} }; int __init cachefiles_register_error_injection(void) diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index 4a87c9d714a9..d33169f0018b 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -246,7 +246,7 @@ extern bool cachefiles_begin_operation(struct netfs_cache_resources *cres, enum fscache_want_state want_state); extern int __cachefiles_prepare_write(struct cachefiles_object *object, struct file *file, - loff_t *_start, size_t *_len, + loff_t *_start, size_t *_len, size_t upper_len, bool no_space_allocated_yet); extern int __cachefiles_write(struct cachefiles_object *object, struct file *file, diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c index 5857241c5918..1d685357e67f 100644 --- a/fs/cachefiles/io.c +++ b/fs/cachefiles/io.c @@ -517,18 +517,26 @@ cachefiles_prepare_ondemand_read(struct netfs_cache_resources *cres, */ int __cachefiles_prepare_write(struct cachefiles_object *object, struct file *file, - loff_t *_start, size_t *_len, + loff_t *_start, size_t *_len, size_t upper_len, bool no_space_allocated_yet) { struct cachefiles_cache *cache = object->volume->cache; loff_t start = *_start, pos; - size_t len = *_len, down; + size_t len = *_len; int ret; /* Round to DIO size */ - down = start - round_down(start, PAGE_SIZE); - *_start = start - down; - *_len = round_up(down + len, PAGE_SIZE); + start = round_down(*_start, PAGE_SIZE); + if (start != *_start || *_len > upper_len) { + /* Probably asked to cache a streaming write written into the + * pagecache when the cookie was temporarily out of service to + * culling. + */ + fscache_count_dio_misfit(); + return -ENOBUFS; + } + + *_len = round_up(len, PAGE_SIZE); /* We need to work out whether there's sufficient disk space to perform * the write - but we can skip that check if we have space already @@ -539,7 +547,7 @@ int __cachefiles_prepare_write(struct cachefiles_object *object, pos = cachefiles_inject_read_error(); if (pos == 0) - pos = vfs_llseek(file, *_start, SEEK_DATA); + pos = vfs_llseek(file, start, SEEK_DATA); if (pos < 0 && pos >= (loff_t)-MAX_ERRNO) { if (pos == -ENXIO) goto check_space; /* Unallocated tail */ @@ -547,7 +555,7 @@ int __cachefiles_prepare_write(struct cachefiles_object *object, cachefiles_trace_seek_error); return pos; } - if ((u64)pos >= (u64)*_start + *_len) + if ((u64)pos >= (u64)start + *_len) goto check_space; /* Unallocated region */ /* We have a block that's at least partially filled - if we're low on @@ -560,13 +568,13 @@ int __cachefiles_prepare_write(struct cachefiles_object *object, pos = cachefiles_inject_read_error(); if (pos == 0) - pos = vfs_llseek(file, *_start, SEEK_HOLE); + pos = vfs_llseek(file, start, SEEK_HOLE); if (pos < 0 && pos >= (loff_t)-MAX_ERRNO) { trace_cachefiles_io_error(object, file_inode(file), pos, cachefiles_trace_seek_error); return pos; } - if ((u64)pos >= (u64)*_start + *_len) + if ((u64)pos >= (u64)start + *_len) return 0; /* Fully allocated */ /* Partially allocated, but insufficient space: cull. */ @@ -574,7 +582,7 @@ int __cachefiles_prepare_write(struct cachefiles_object *object, ret = cachefiles_inject_remove_error(); if (ret == 0) ret = vfs_fallocate(file, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, - *_start, *_len); + start, *_len); if (ret < 0) { trace_cachefiles_io_error(object, file_inode(file), ret, cachefiles_trace_fallocate_error); @@ -591,8 +599,8 @@ check_space: } static int cachefiles_prepare_write(struct netfs_cache_resources *cres, - loff_t *_start, size_t *_len, loff_t i_size, - bool no_space_allocated_yet) + loff_t *_start, size_t *_len, size_t upper_len, + loff_t i_size, bool no_space_allocated_yet) { struct cachefiles_object *object = cachefiles_cres_object(cres); struct cachefiles_cache *cache = object->volume->cache; @@ -608,7 +616,7 @@ static int cachefiles_prepare_write(struct netfs_cache_resources *cres, cachefiles_begin_secure(cache, &saved_cred); ret = __cachefiles_prepare_write(object, cachefiles_cres_file(cres), - _start, _len, + _start, _len, upper_len, no_space_allocated_yet); cachefiles_end_secure(cache, saved_cred); return ret; diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 7bf7a5fcc045..7ade836beb58 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -305,6 +305,8 @@ try_again: /* do the multiway lock magic */ trap = lock_rename(cache->graveyard, dir); + if (IS_ERR(trap)) + return PTR_ERR(trap); /* do some checks before getting the grave dentry */ if (rep->d_parent != dir || IS_DEADDIR(d_inode(rep))) { diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c index b8fbbb1961bb..4ba42f1fa3b4 100644 --- a/fs/cachefiles/ondemand.c +++ b/fs/cachefiles/ondemand.c @@ -50,7 +50,7 @@ static ssize_t cachefiles_ondemand_fd_write_iter(struct kiocb *kiocb, return -ENOBUFS; cachefiles_begin_secure(cache, &saved_cred); - ret = __cachefiles_prepare_write(object, file, &pos, &len, true); + ret = __cachefiles_prepare_write(object, file, &pos, &len, len, true); cachefiles_end_secure(cache, saved_cred); if (ret < 0) return ret; @@ -539,6 +539,9 @@ int cachefiles_ondemand_init_object(struct cachefiles_object *object) struct fscache_volume *volume = object->volume->vcookie; size_t volume_key_size, cookie_key_size, data_len; + if (!object->ondemand) + return 0; + /* * CacheFiles will firstly check the cache file under the root cache * directory. If the coherency check failed, it will fallback to diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig index 94df854147d3..7249d70e1a43 100644 --- a/fs/ceph/Kconfig +++ b/fs/ceph/Kconfig @@ -7,6 +7,7 @@ config CEPH_FS select CRYPTO_AES select CRYPTO select NETFS_SUPPORT + select FS_ENCRYPTION_ALGS if FS_ENCRYPTION default n help Choose Y or M here to include support for mounting the diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 13af429ab030..1340d77124ae 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -159,27 +159,7 @@ static void ceph_invalidate_folio(struct folio *folio, size_t offset, ceph_put_snap_context(snapc); } - folio_wait_fscache(folio); -} - -static bool ceph_release_folio(struct folio *folio, gfp_t gfp) -{ - struct inode *inode = folio->mapping->host; - struct ceph_client *cl = ceph_inode_to_client(inode); - - doutc(cl, "%llx.%llx idx %lu (%sdirty)\n", ceph_vinop(inode), - folio->index, folio_test_dirty(folio) ? "" : "not "); - - if (folio_test_private(folio)) - return false; - - if (folio_test_fscache(folio)) { - if (current_is_kswapd() || !(gfp & __GFP_FS)) - return false; - folio_wait_fscache(folio); - } - ceph_fscache_note_page_release(inode); - return true; + netfs_invalidate_folio(folio, offset, length); } static void ceph_netfs_expand_readahead(struct netfs_io_request *rreq) @@ -357,6 +337,7 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) u64 len = subreq->len; bool sparse = IS_ENCRYPTED(inode) || ceph_test_mount_opt(fsc, SPARSEREAD); u64 off = subreq->start; + int extent_cnt; if (ceph_inode_is_shutdown(inode)) { err = -EIO; @@ -370,8 +351,8 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, off, &len, 0, 1, sparse ? CEPH_OSD_OP_SPARSE_READ : CEPH_OSD_OP_READ, - CEPH_OSD_FLAG_READ | fsc->client->osdc.client->options->read_from_replica, - NULL, ci->i_truncate_seq, ci->i_truncate_size, false); + CEPH_OSD_FLAG_READ, NULL, ci->i_truncate_seq, + ci->i_truncate_size, false); if (IS_ERR(req)) { err = PTR_ERR(req); req = NULL; @@ -379,7 +360,8 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) } if (sparse) { - err = ceph_alloc_sparse_ext_map(&req->r_ops[0]); + extent_cnt = __ceph_sparse_read_ext_count(inode, len); + err = ceph_alloc_sparse_ext_map(&req->r_ops[0], extent_cnt); if (err) goto out; } @@ -509,7 +491,6 @@ static void ceph_netfs_free_request(struct netfs_io_request *rreq) const struct netfs_request_ops ceph_netfs_ops = { .init_request = ceph_init_request, .free_request = ceph_netfs_free_request, - .begin_cache_operation = ceph_begin_cache_operation, .issue_read = ceph_netfs_issue_read, .expand_readahead = ceph_netfs_expand_readahead, .clamp_length = ceph_netfs_clamp_length, @@ -1586,7 +1567,7 @@ const struct address_space_operations ceph_aops = { .write_end = ceph_write_end, .dirty_folio = ceph_dirty_folio, .invalidate_folio = ceph_invalidate_folio, - .release_folio = ceph_release_folio, + .release_folio = netfs_release_folio, .direct_IO = noop_direct_IO, }; diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h index dc502daac49a..20efac020394 100644 --- a/fs/ceph/cache.h +++ b/fs/ceph/cache.h @@ -43,38 +43,19 @@ static inline void ceph_fscache_resize(struct inode *inode, loff_t to) } } -static inline void ceph_fscache_unpin_writeback(struct inode *inode, +static inline int ceph_fscache_unpin_writeback(struct inode *inode, struct writeback_control *wbc) { - fscache_unpin_writeback(wbc, ceph_fscache_cookie(ceph_inode(inode))); + return netfs_unpin_writeback(inode, wbc); } -static inline int ceph_fscache_dirty_folio(struct address_space *mapping, - struct folio *folio) -{ - struct ceph_inode_info *ci = ceph_inode(mapping->host); - - return fscache_dirty_folio(mapping, folio, ceph_fscache_cookie(ci)); -} - -static inline int ceph_begin_cache_operation(struct netfs_io_request *rreq) -{ - struct fscache_cookie *cookie = ceph_fscache_cookie(ceph_inode(rreq->inode)); - - return fscache_begin_read_operation(&rreq->cache_resources, cookie); -} +#define ceph_fscache_dirty_folio netfs_dirty_folio static inline bool ceph_is_cache_enabled(struct inode *inode) { return fscache_cookie_enabled(ceph_fscache_cookie(ceph_inode(inode))); } -static inline void ceph_fscache_note_page_release(struct inode *inode) -{ - struct ceph_inode_info *ci = ceph_inode(inode); - - fscache_note_page_release(ceph_fscache_cookie(ci)); -} #else /* CONFIG_CEPH_FSCACHE */ static inline int ceph_fscache_register_fs(struct ceph_fs_client* fsc, struct fs_context *fc) @@ -119,30 +100,18 @@ static inline void ceph_fscache_resize(struct inode *inode, loff_t to) { } -static inline void ceph_fscache_unpin_writeback(struct inode *inode, - struct writeback_control *wbc) +static inline int ceph_fscache_unpin_writeback(struct inode *inode, + struct writeback_control *wbc) { + return 0; } -static inline int ceph_fscache_dirty_folio(struct address_space *mapping, - struct folio *folio) -{ - return filemap_dirty_folio(mapping, folio); -} +#define ceph_fscache_dirty_folio filemap_dirty_folio static inline bool ceph_is_cache_enabled(struct inode *inode) { return false; } - -static inline int ceph_begin_cache_operation(struct netfs_io_request *rreq) -{ - return -ENOBUFS; -} - -static inline void ceph_fscache_note_page_release(struct inode *inode) -{ -} #endif /* CONFIG_CEPH_FSCACHE */ #endif diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 2c0b8dc3dd0d..9c02f328c966 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -4887,13 +4887,15 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry, struct inode *dir, int mds, int drop, int unless) { - struct dentry *parent = NULL; struct ceph_mds_request_release *rel = *p; struct ceph_dentry_info *di = ceph_dentry(dentry); struct ceph_client *cl; int force = 0; int ret; + /* This shouldn't happen */ + BUG_ON(!dir); + /* * force an record for the directory caps if we have a dentry lease. * this is racy (can't take i_ceph_lock and d_lock together), but it @@ -4903,14 +4905,9 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry, spin_lock(&dentry->d_lock); if (di->lease_session && di->lease_session->s_mds == mds) force = 1; - if (!dir) { - parent = dget(dentry->d_parent); - dir = d_inode(parent); - } spin_unlock(&dentry->d_lock); ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force); - dput(parent); cl = ceph_inode_to_client(dir); spin_lock(&dentry->d_lock); diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 91709934c8b1..0e9f56eaba1e 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -174,7 +174,7 @@ __dcache_find_get_entry(struct dentry *parent, u64 idx, /* * When possible, we try to satisfy a readdir by peeking at the * dcache. We make this work by carefully ordering dentries on - * d_child when we initially get results back from the MDS, and + * d_children when we initially get results back from the MDS, and * falling back to a "normal" sync readdir if any dentries in the dir * are dropped. * @@ -1593,10 +1593,12 @@ struct ceph_lease_walk_control { unsigned long dir_lease_ttl; }; +static int __dir_lease_check(const struct dentry *, struct ceph_lease_walk_control *); +static int __dentry_lease_check(const struct dentry *); + static unsigned long __dentry_leases_walk(struct ceph_mds_client *mdsc, - struct ceph_lease_walk_control *lwc, - int (*check)(struct dentry*, void*)) + struct ceph_lease_walk_control *lwc) { struct ceph_dentry_info *di, *tmp; struct dentry *dentry, *last = NULL; @@ -1624,7 +1626,10 @@ __dentry_leases_walk(struct ceph_mds_client *mdsc, goto next; } - ret = check(dentry, lwc); + if (lwc->dir_lease) + ret = __dir_lease_check(dentry, lwc); + else + ret = __dentry_lease_check(dentry); if (ret & TOUCH) { /* move it into tail of dir lease list */ __dentry_dir_lease_touch(mdsc, di); @@ -1681,7 +1686,7 @@ next: return freed; } -static int __dentry_lease_check(struct dentry *dentry, void *arg) +static int __dentry_lease_check(const struct dentry *dentry) { struct ceph_dentry_info *di = ceph_dentry(dentry); int ret; @@ -1696,9 +1701,9 @@ static int __dentry_lease_check(struct dentry *dentry, void *arg) return DELETE; } -static int __dir_lease_check(struct dentry *dentry, void *arg) +static int __dir_lease_check(const struct dentry *dentry, + struct ceph_lease_walk_control *lwc) { - struct ceph_lease_walk_control *lwc = arg; struct ceph_dentry_info *di = ceph_dentry(dentry); int ret = __dir_lease_try_check(dentry); @@ -1737,7 +1742,7 @@ int ceph_trim_dentries(struct ceph_mds_client *mdsc) lwc.dir_lease = false; lwc.nr_to_scan = CEPH_CAPS_PER_RELEASE * 2; - freed = __dentry_leases_walk(mdsc, &lwc, __dentry_lease_check); + freed = __dentry_leases_walk(mdsc, &lwc); if (!lwc.nr_to_scan) /* more invalid leases */ return -EAGAIN; @@ -1747,7 +1752,7 @@ int ceph_trim_dentries(struct ceph_mds_client *mdsc) lwc.dir_lease = true; lwc.expire_dir_lease = freed < count; lwc.dir_lease_ttl = mdsc->fsc->mount_options->caps_wanted_delay_max * HZ; - freed +=__dentry_leases_walk(mdsc, &lwc, __dir_lease_check); + freed +=__dentry_leases_walk(mdsc, &lwc); if (!lwc.nr_to_scan) /* more to check */ return -EAGAIN; diff --git a/fs/ceph/export.c b/fs/ceph/export.c index 726af69d4d62..a79f163ae4ed 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -286,8 +286,6 @@ static struct dentry *__snapfh_to_dentry(struct super_block *sb, doutc(cl, "%llx.%llx parent %llx hash %x err=%d", vino.ino, vino.snap, sfh->parent_ino, sfh->hash, err); } - if (IS_ERR(inode)) - return ERR_CAST(inode); /* see comments in ceph_get_parent() */ return unlinked ? d_obtain_root(inode) : d_obtain_alias(inode); } diff --git a/fs/ceph/file.c b/fs/ceph/file.c index d380d9dad0e0..abe8028d95bf 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1029,6 +1029,7 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, struct ceph_osd_req_op *op; u64 read_off = off; u64 read_len = len; + int extent_cnt; /* determine new offset/length if encrypted */ ceph_fscrypt_adjust_off_and_len(inode, &read_off, &read_len); @@ -1068,7 +1069,8 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, op = &req->r_ops[0]; if (sparse) { - ret = ceph_alloc_sparse_ext_map(op); + extent_cnt = __ceph_sparse_read_ext_count(inode, read_len); + ret = ceph_alloc_sparse_ext_map(op, extent_cnt); if (ret) { ceph_osdc_put_request(req); break; @@ -1465,6 +1467,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, ssize_t len; struct ceph_osd_req_op *op; int readop = sparse ? CEPH_OSD_OP_SPARSE_READ : CEPH_OSD_OP_READ; + int extent_cnt; if (write) size = min_t(u64, size, fsc->mount_options->wsize); @@ -1528,7 +1531,8 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, osd_req_op_extent_osd_data_bvecs(req, 0, bvecs, num_pages, len); op = &req->r_ops[0]; if (sparse) { - ret = ceph_alloc_sparse_ext_map(op); + extent_cnt = __ceph_sparse_read_ext_count(inode, size); + ret = ceph_alloc_sparse_ext_map(op, extent_cnt); if (ret) { ceph_osdc_put_request(req); break; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 0679240f06db..0c25d326afc4 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -574,7 +574,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb) doutc(fsc->client, "%p\n", &ci->netfs.inode); /* Set parameters for the netfs library */ - netfs_inode_init(&ci->netfs, &ceph_netfs_ops); + netfs_inode_init(&ci->netfs, &ceph_netfs_ops, false); spin_lock_init(&ci->i_ceph_lock); @@ -694,7 +694,7 @@ void ceph_evict_inode(struct inode *inode) percpu_counter_dec(&mdsc->metric.total_inodes); truncate_inode_pages_final(&inode->i_data); - if (inode->i_state & I_PINNING_FSCACHE_WB) + if (inode->i_state & I_PINNING_NETFS_WB) ceph_fscache_unuse_cookie(inode, true); clear_inode(inode); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index d95eb525519a..548d1de379f3 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1534,7 +1534,8 @@ static int encode_metric_spec(void **p, void *end) * session message, specialization for CEPH_SESSION_REQUEST_OPEN * to include additional client metadata fields. */ -static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u64 seq) +static struct ceph_msg * +create_session_full_msg(struct ceph_mds_client *mdsc, int op, u64 seq) { struct ceph_msg *msg; struct ceph_mds_session_head *h; @@ -1578,6 +1579,9 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6 size = METRIC_BYTES(count); extra_bytes += 2 + 4 + 4 + size; + /* flags, mds auth caps and oldest_client_tid */ + extra_bytes += 4 + 4 + 8; + /* Allocate the message */ msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h) + extra_bytes, GFP_NOFS, false); @@ -1589,16 +1593,16 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6 end = p + msg->front.iov_len; h = p; - h->op = cpu_to_le32(CEPH_SESSION_REQUEST_OPEN); + h->op = cpu_to_le32(op); h->seq = cpu_to_le64(seq); /* * Serialize client metadata into waiting buffer space, using * the format that userspace expects for map<string, string> * - * ClientSession messages with metadata are v4 + * ClientSession messages with metadata are v7 */ - msg->hdr.version = cpu_to_le16(4); + msg->hdr.version = cpu_to_le16(7); msg->hdr.compat_version = cpu_to_le16(1); /* The write pointer, following the session_head structure */ @@ -1634,6 +1638,15 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6 return ERR_PTR(ret); } + /* version == 5, flags */ + ceph_encode_32(&p, 0); + + /* version == 6, mds auth caps */ + ceph_encode_32(&p, 0); + + /* version == 7, oldest_client_tid */ + ceph_encode_64(&p, mdsc->oldest_tid); + msg->front.iov_len = p - msg->front.iov_base; msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); @@ -1663,7 +1676,8 @@ static int __open_session(struct ceph_mds_client *mdsc, session->s_renew_requested = jiffies; /* send connect message */ - msg = create_session_open_msg(mdsc, session->s_seq); + msg = create_session_full_msg(mdsc, CEPH_SESSION_REQUEST_OPEN, + session->s_seq); if (IS_ERR(msg)) return PTR_ERR(msg); ceph_con_send(&session->s_con, msg); @@ -2028,10 +2042,10 @@ static int send_renew_caps(struct ceph_mds_client *mdsc, doutc(cl, "to mds%d (%s)\n", session->s_mds, ceph_mds_state_name(state)); - msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS, + msg = create_session_full_msg(mdsc, CEPH_SESSION_REQUEST_RENEWCAPS, ++session->s_renew_seq); - if (!msg) - return -ENOMEM; + if (IS_ERR(msg)) + return PTR_ERR(msg); ceph_con_send(&session->s_con, msg); return 0; } @@ -2128,7 +2142,7 @@ static bool drop_negative_children(struct dentry *dentry) goto out; spin_lock(&dentry->d_lock); - list_for_each_entry(child, &dentry->d_subdirs, d_child) { + hlist_for_each_entry(child, &dentry->d_children, d_sib) { if (d_really_is_positive(child)) { all_negative = false; break; @@ -4128,12 +4142,12 @@ static void handle_session(struct ceph_mds_session *session, pr_info_client(cl, "mds%d reconnect success\n", session->s_mds); + session->s_features = features; if (session->s_state == CEPH_MDS_SESSION_OPEN) { pr_notice_client(cl, "mds%d is already opened\n", session->s_mds); } else { session->s_state = CEPH_MDS_SESSION_OPEN; - session->s_features = features; renewed_caps(mdsc, session, 0); if (test_bit(CEPHFS_FEATURE_METRIC_COLLECT, &session->s_features)) @@ -5870,7 +5884,8 @@ static void mds_peer_reset(struct ceph_connection *con) pr_warn_client(mdsc->fsc->client, "mds%d closed our session\n", s->s_mds); - if (READ_ONCE(mdsc->fsc->mount_state) != CEPH_MOUNT_FENCE_IO) + if (READ_ONCE(mdsc->fsc->mount_state) != CEPH_MOUNT_FENCE_IO && + ceph_mdsmap_get_state(mdsc->mdsmap, s->s_mds) >= CEPH_MDS_STATE_RECONNECT) send_mds_reconnect(mdsc, s); } diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c index 9d36c3532de1..06ee397e0c3a 100644 --- a/fs/ceph/quota.c +++ b/fs/ceph/quota.c @@ -197,10 +197,10 @@ void ceph_cleanup_quotarealms_inodes(struct ceph_mds_client *mdsc) } /* - * This function walks through the snaprealm for an inode and returns the - * ceph_snap_realm for the first snaprealm that has quotas set (max_files, + * This function walks through the snaprealm for an inode and set the + * realmp with the first snaprealm that has quotas set (max_files, * max_bytes, or any, depending on the 'which_quota' argument). If the root is - * reached, return the root ceph_snap_realm instead. + * reached, set the realmp with the root ceph_snap_realm instead. * * Note that the caller is responsible for calling ceph_put_snap_realm() on the * returned realm. @@ -211,10 +211,9 @@ void ceph_cleanup_quotarealms_inodes(struct ceph_mds_client *mdsc) * this function will return -EAGAIN; otherwise, the snaprealms walk-through * will be restarted. */ -static struct ceph_snap_realm *get_quota_realm(struct ceph_mds_client *mdsc, - struct inode *inode, - enum quota_get_realm which_quota, - bool retry) +static int get_quota_realm(struct ceph_mds_client *mdsc, struct inode *inode, + enum quota_get_realm which_quota, + struct ceph_snap_realm **realmp, bool retry) { struct ceph_client *cl = mdsc->fsc->client; struct ceph_inode_info *ci = NULL; @@ -222,8 +221,10 @@ static struct ceph_snap_realm *get_quota_realm(struct ceph_mds_client *mdsc, struct inode *in; bool has_quota; + if (realmp) + *realmp = NULL; if (ceph_snap(inode) != CEPH_NOSNAP) - return NULL; + return 0; restart: realm = ceph_inode(inode)->i_snap_realm; @@ -250,7 +251,7 @@ restart: break; ceph_put_snap_realm(mdsc, realm); if (!retry) - return ERR_PTR(-EAGAIN); + return -EAGAIN; goto restart; } @@ -259,8 +260,11 @@ restart: iput(in); next = realm->parent; - if (has_quota || !next) - return realm; + if (has_quota || !next) { + if (realmp) + *realmp = realm; + return 0; + } ceph_get_snap_realm(mdsc, next); ceph_put_snap_realm(mdsc, realm); @@ -269,7 +273,7 @@ restart: if (realm) ceph_put_snap_realm(mdsc, realm); - return NULL; + return 0; } bool ceph_quota_is_same_realm(struct inode *old, struct inode *new) @@ -277,6 +281,7 @@ bool ceph_quota_is_same_realm(struct inode *old, struct inode *new) struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(old->i_sb); struct ceph_snap_realm *old_realm, *new_realm; bool is_same; + int ret; restart: /* @@ -286,9 +291,9 @@ restart: * dropped and we can then restart the whole operation. */ down_read(&mdsc->snap_rwsem); - old_realm = get_quota_realm(mdsc, old, QUOTA_GET_ANY, true); - new_realm = get_quota_realm(mdsc, new, QUOTA_GET_ANY, false); - if (PTR_ERR(new_realm) == -EAGAIN) { + get_quota_realm(mdsc, old, QUOTA_GET_ANY, &old_realm, true); + ret = get_quota_realm(mdsc, new, QUOTA_GET_ANY, &new_realm, false); + if (ret == -EAGAIN) { up_read(&mdsc->snap_rwsem); if (old_realm) ceph_put_snap_realm(mdsc, old_realm); @@ -492,8 +497,8 @@ bool ceph_quota_update_statfs(struct ceph_fs_client *fsc, struct kstatfs *buf) bool is_updated = false; down_read(&mdsc->snap_rwsem); - realm = get_quota_realm(mdsc, d_inode(fsc->sb->s_root), - QUOTA_GET_MAX_BYTES, true); + get_quota_realm(mdsc, d_inode(fsc->sb->s_root), QUOTA_GET_MAX_BYTES, + &realm, true); up_read(&mdsc->snap_rwsem); if (!realm) return false; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index fe0f64a0acb2..b06e2bc86221 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -3,6 +3,7 @@ #define _FS_CEPH_SUPER_H #include <linux/ceph/ceph_debug.h> +#include <linux/ceph/osd_client.h> #include <asm/unaligned.h> #include <linux/backing-dev.h> @@ -1407,6 +1408,19 @@ static inline void __ceph_update_quota(struct ceph_inode_info *ci, ceph_adjust_quota_realms_count(&ci->netfs.inode, has_quota); } +static inline int __ceph_sparse_read_ext_count(struct inode *inode, u64 len) +{ + int cnt = 0; + + if (IS_ENCRYPTED(inode)) { + cnt = len >> CEPH_FSCRYPT_BLOCK_SHIFT; + if (cnt > CEPH_SPARSE_EXT_ARRAY_INITIAL) + cnt = 0; + } + + return cnt; +} + extern void ceph_handle_quota(struct ceph_mds_client *mdsc, struct ceph_mds_session *session, struct ceph_msg *msg); diff --git a/fs/coda/cache.c b/fs/coda/cache.c index 3b8c4513118f..970f0022ec52 100644 --- a/fs/coda/cache.c +++ b/fs/coda/cache.c @@ -93,13 +93,13 @@ static void coda_flag_children(struct dentry *parent, int flag) struct dentry *de; spin_lock(&parent->d_lock); - list_for_each_entry(de, &parent->d_subdirs, d_child) { + hlist_for_each_entry(de, &parent->d_children, d_sib) { + struct inode *inode = d_inode_rcu(de); /* don't know what to do with negative dentries */ - if (d_inode(de) ) - coda_flag_inode(d_inode(de), flag); + if (inode) + coda_flag_inode(inode, flag); } spin_unlock(&parent->d_lock); - return; } void coda_flag_inode_children(struct inode *inode, int flag) diff --git a/fs/coda/sysctl.c b/fs/coda/sysctl.c index a247c14aaab7..9f2d5743e2c8 100644 --- a/fs/coda/sysctl.c +++ b/fs/coda/sysctl.c @@ -36,7 +36,6 @@ static struct ctl_table coda_table[] = { .mode = 0600, .proc_handler = proc_dointvec }, - {} }; void coda_sysctl_init(void) diff --git a/fs/coredump.c b/fs/coredump.c index 9d235fa14ab9..f258c17c1841 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -981,7 +981,6 @@ static struct ctl_table coredump_sysctls[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { } }; static int __init init_fs_coredump_sysctls(void) diff --git a/fs/dcache.c b/fs/dcache.c index 2ba37643b9c5..b813528fb147 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -51,8 +51,8 @@ * - d_lru * - d_count * - d_unhashed() - * - d_parent and d_subdirs - * - childrens' d_child and d_parent + * - d_parent and d_chilren + * - childrens' d_sib and d_parent * - d_u.d_alias, d_inode * * Ordering: @@ -191,7 +191,6 @@ static struct ctl_table fs_dcache_sysctls[] = { .mode = 0444, .proc_handler = proc_nr_dentry, }, - { } }; static int __init init_fs_dcache_sysctls(void) @@ -344,7 +343,7 @@ static inline void __d_set_inode_and_type(struct dentry *dentry, dentry->d_inode = inode; flags = READ_ONCE(dentry->d_flags); - flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU); + flags &= ~DCACHE_ENTRY_TYPE; flags |= type_flags; smp_store_release(&dentry->d_flags, flags); } @@ -353,7 +352,7 @@ static inline void __d_clear_type_and_inode(struct dentry *dentry) { unsigned flags = READ_ONCE(dentry->d_flags); - flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU); + flags &= ~DCACHE_ENTRY_TYPE; WRITE_ONCE(dentry->d_flags, flags); dentry->d_inode = NULL; if (dentry->d_flags & DCACHE_LRU_LIST) @@ -539,7 +538,7 @@ void d_drop(struct dentry *dentry) } EXPORT_SYMBOL(d_drop); -static inline void dentry_unlist(struct dentry *dentry, struct dentry *parent) +static inline void dentry_unlist(struct dentry *dentry) { struct dentry *next; /* @@ -547,12 +546,12 @@ static inline void dentry_unlist(struct dentry *dentry, struct dentry *parent) * attached to the dentry tree */ dentry->d_flags |= DCACHE_DENTRY_KILLED; - if (unlikely(list_empty(&dentry->d_child))) + if (unlikely(hlist_unhashed(&dentry->d_sib))) return; - __list_del_entry(&dentry->d_child); + __hlist_del(&dentry->d_sib); /* * Cursors can move around the list of children. While we'd been - * a normal list member, it didn't matter - ->d_child.next would've + * a normal list member, it didn't matter - ->d_sib.next would've * been updated. However, from now on it won't be and for the * things like d_walk() it might end up with a nasty surprise. * Normally d_walk() doesn't care about cursors moving around - @@ -560,29 +559,27 @@ static inline void dentry_unlist(struct dentry *dentry, struct dentry *parent) * of its own, we get through it without ever unlocking the parent. * There is one exception, though - if we ascend from a child that * gets killed as soon as we unlock it, the next sibling is found - * using the value left in its ->d_child.next. And if _that_ + * using the value left in its ->d_sib.next. And if _that_ * pointed to a cursor, and cursor got moved (e.g. by lseek()) * before d_walk() regains parent->d_lock, we'll end up skipping * everything the cursor had been moved past. * - * Solution: make sure that the pointer left behind in ->d_child.next + * Solution: make sure that the pointer left behind in ->d_sib.next * points to something that won't be moving around. I.e. skip the * cursors. */ - while (dentry->d_child.next != &parent->d_subdirs) { - next = list_entry(dentry->d_child.next, struct dentry, d_child); + while (dentry->d_sib.next) { + next = hlist_entry(dentry->d_sib.next, struct dentry, d_sib); if (likely(!(next->d_flags & DCACHE_DENTRY_CURSOR))) break; - dentry->d_child.next = next->d_child.next; + dentry->d_sib.next = next->d_sib.next; } } -static void __dentry_kill(struct dentry *dentry) +static struct dentry *__dentry_kill(struct dentry *dentry) { struct dentry *parent = NULL; bool can_free = true; - if (!IS_ROOT(dentry)) - parent = dentry->d_parent; /* * The dentry is now unrecoverably dead to the world. @@ -602,9 +599,6 @@ static void __dentry_kill(struct dentry *dentry) } /* if it was on the hash then remove it */ __d_drop(dentry); - dentry_unlist(dentry, parent); - if (parent) - spin_unlock(&parent->d_lock); if (dentry->d_inode) dentry_unlink_inode(dentry); else @@ -613,80 +607,114 @@ static void __dentry_kill(struct dentry *dentry) if (dentry->d_op && dentry->d_op->d_release) dentry->d_op->d_release(dentry); - spin_lock(&dentry->d_lock); - if (dentry->d_flags & DCACHE_SHRINK_LIST) { - dentry->d_flags |= DCACHE_MAY_FREE; - can_free = false; + cond_resched(); + /* now that it's negative, ->d_parent is stable */ + if (!IS_ROOT(dentry)) { + parent = dentry->d_parent; + spin_lock(&parent->d_lock); } + spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); + dentry_unlist(dentry); + if (dentry->d_flags & DCACHE_SHRINK_LIST) + can_free = false; spin_unlock(&dentry->d_lock); if (likely(can_free)) dentry_free(dentry); - cond_resched(); -} - -static struct dentry *__lock_parent(struct dentry *dentry) -{ - struct dentry *parent; - rcu_read_lock(); - spin_unlock(&dentry->d_lock); -again: - parent = READ_ONCE(dentry->d_parent); - spin_lock(&parent->d_lock); - /* - * We can't blindly lock dentry until we are sure - * that we won't violate the locking order. - * Any changes of dentry->d_parent must have - * been done with parent->d_lock held, so - * spin_lock() above is enough of a barrier - * for checking if it's still our child. - */ - if (unlikely(parent != dentry->d_parent)) { + if (parent && --parent->d_lockref.count) { spin_unlock(&parent->d_lock); - goto again; + return NULL; } - rcu_read_unlock(); - if (parent != dentry) - spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); - else - parent = NULL; return parent; } -static inline struct dentry *lock_parent(struct dentry *dentry) +/* + * Lock a dentry for feeding it to __dentry_kill(). + * Called under rcu_read_lock() and dentry->d_lock; the former + * guarantees that nothing we access will be freed under us. + * Note that dentry is *not* protected from concurrent dentry_kill(), + * d_delete(), etc. + * + * Return false if dentry is busy. Otherwise, return true and have + * that dentry's inode locked. + */ + +static bool lock_for_kill(struct dentry *dentry) { - struct dentry *parent = dentry->d_parent; - if (IS_ROOT(dentry)) - return NULL; - if (likely(spin_trylock(&parent->d_lock))) - return parent; - return __lock_parent(dentry); + struct inode *inode = dentry->d_inode; + + if (unlikely(dentry->d_lockref.count)) + return false; + + if (!inode || likely(spin_trylock(&inode->i_lock))) + return true; + + do { + spin_unlock(&dentry->d_lock); + spin_lock(&inode->i_lock); + spin_lock(&dentry->d_lock); + if (likely(inode == dentry->d_inode)) + break; + spin_unlock(&inode->i_lock); + inode = dentry->d_inode; + } while (inode); + if (likely(!dentry->d_lockref.count)) + return true; + if (inode) + spin_unlock(&inode->i_lock); + return false; } -static inline bool retain_dentry(struct dentry *dentry) +/* + * Decide if dentry is worth retaining. Usually this is called with dentry + * locked; if not locked, we are more limited and might not be able to tell + * without a lock. False in this case means "punt to locked path and recheck". + * + * In case we aren't locked, these predicates are not "stable". However, it is + * sufficient that at some point after we dropped the reference the dentry was + * hashed and the flags had the proper value. Other dentry users may have + * re-gotten a reference to the dentry and change that, but our work is done - + * we can leave the dentry around with a zero refcount. + */ +static inline bool retain_dentry(struct dentry *dentry, bool locked) { - WARN_ON(d_in_lookup(dentry)); + unsigned int d_flags; - /* Unreachable? Get rid of it */ + smp_rmb(); + d_flags = READ_ONCE(dentry->d_flags); + + // Unreachable? Nobody would be able to look it up, no point retaining if (unlikely(d_unhashed(dentry))) return false; - if (unlikely(dentry->d_flags & DCACHE_DISCONNECTED)) + // Same if it's disconnected + if (unlikely(d_flags & DCACHE_DISCONNECTED)) return false; - if (unlikely(dentry->d_flags & DCACHE_OP_DELETE)) { - if (dentry->d_op->d_delete(dentry)) + // ->d_delete() might tell us not to bother, but that requires + // ->d_lock; can't decide without it + if (unlikely(d_flags & DCACHE_OP_DELETE)) { + if (!locked || dentry->d_op->d_delete(dentry)) return false; } - if (unlikely(dentry->d_flags & DCACHE_DONTCACHE)) + // Explicitly told not to bother + if (unlikely(d_flags & DCACHE_DONTCACHE)) return false; - /* retain; LRU fodder */ - dentry->d_lockref.count--; - if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST))) + // At this point it looks like we ought to keep it. We also might + // need to do something - put it on LRU if it wasn't there already + // and mark it referenced if it was on LRU, but not marked yet. + // Unfortunately, both actions require ->d_lock, so in lockless + // case we'd have to punt rather than doing those. + if (unlikely(!(d_flags & DCACHE_LRU_LIST))) { + if (!locked) + return false; d_lru_add(dentry); - else if (unlikely(!(dentry->d_flags & DCACHE_REFERENCED))) + } else if (unlikely(!(d_flags & DCACHE_REFERENCED))) { + if (!locked) + return false; dentry->d_flags |= DCACHE_REFERENCED; + } return true; } @@ -706,60 +734,11 @@ void d_mark_dontcache(struct inode *inode) EXPORT_SYMBOL(d_mark_dontcache); /* - * Finish off a dentry we've decided to kill. - * dentry->d_lock must be held, returns with it unlocked. - * Returns dentry requiring refcount drop, or NULL if we're done. - */ -static struct dentry *dentry_kill(struct dentry *dentry) - __releases(dentry->d_lock) -{ - struct inode *inode = dentry->d_inode; - struct dentry *parent = NULL; - - if (inode && unlikely(!spin_trylock(&inode->i_lock))) - goto slow_positive; - - if (!IS_ROOT(dentry)) { - parent = dentry->d_parent; - if (unlikely(!spin_trylock(&parent->d_lock))) { - parent = __lock_parent(dentry); - if (likely(inode || !dentry->d_inode)) - goto got_locks; - /* negative that became positive */ - if (parent) - spin_unlock(&parent->d_lock); - inode = dentry->d_inode; - goto slow_positive; - } - } - __dentry_kill(dentry); - return parent; - -slow_positive: - spin_unlock(&dentry->d_lock); - spin_lock(&inode->i_lock); - spin_lock(&dentry->d_lock); - parent = lock_parent(dentry); -got_locks: - if (unlikely(dentry->d_lockref.count != 1)) { - dentry->d_lockref.count--; - } else if (likely(!retain_dentry(dentry))) { - __dentry_kill(dentry); - return parent; - } - /* we are keeping it, after all */ - if (inode) - spin_unlock(&inode->i_lock); - if (parent) - spin_unlock(&parent->d_lock); - spin_unlock(&dentry->d_lock); - return NULL; -} - -/* * Try to do a lockless dput(), and return whether that was successful. * * If unsuccessful, we return false, having already taken the dentry lock. + * In that case refcount is guaranteed to be zero and we have already + * decided that it's not worth keeping around. * * The caller needs to hold the RCU read lock, so that the dentry is * guaranteed to stay around even if the refcount goes down to zero! @@ -767,18 +746,9 @@ got_locks: static inline bool fast_dput(struct dentry *dentry) { int ret; - unsigned int d_flags; - - /* - * If we have a d_op->d_delete() operation, we sould not - * let the dentry count go to zero, so use "put_or_lock". - */ - if (unlikely(dentry->d_flags & DCACHE_OP_DELETE)) - return lockref_put_or_lock(&dentry->d_lockref); /* - * .. otherwise, we can try to just decrement the - * lockref optimistically. + * try to decrement the lockref optimistically. */ ret = lockref_put_return(&dentry->d_lockref); @@ -789,12 +759,12 @@ static inline bool fast_dput(struct dentry *dentry) */ if (unlikely(ret < 0)) { spin_lock(&dentry->d_lock); - if (dentry->d_lockref.count > 1) { - dentry->d_lockref.count--; + if (WARN_ON_ONCE(dentry->d_lockref.count <= 0)) { spin_unlock(&dentry->d_lock); return true; } - return false; + dentry->d_lockref.count--; + goto locked; } /* @@ -804,45 +774,18 @@ static inline bool fast_dput(struct dentry *dentry) return true; /* - * Careful, careful. The reference count went down - * to zero, but we don't hold the dentry lock, so - * somebody else could get it again, and do another - * dput(), and we need to not race with that. - * - * However, there is a very special and common case - * where we don't care, because there is nothing to - * do: the dentry is still hashed, it does not have - * a 'delete' op, and it's referenced and already on - * the LRU list. - * - * NOTE! Since we aren't locked, these values are - * not "stable". However, it is sufficient that at - * some point after we dropped the reference the - * dentry was hashed and the flags had the proper - * value. Other dentry users may have re-gotten - * a reference to the dentry and change that, but - * our work is done - we can leave the dentry - * around with a zero refcount. - * - * Nevertheless, there are two cases that we should kill - * the dentry anyway. - * 1. free disconnected dentries as soon as their refcount - * reached zero. - * 2. free dentries if they should not be cached. + * Can we decide that decrement of refcount is all we needed without + * taking the lock? There's a very common case when it's all we need - + * dentry looks like it ought to be retained and there's nothing else + * to do. */ - smp_rmb(); - d_flags = READ_ONCE(dentry->d_flags); - d_flags &= DCACHE_REFERENCED | DCACHE_LRU_LIST | - DCACHE_DISCONNECTED | DCACHE_DONTCACHE; - - /* Nothing to do? Dropping the reference was all we needed? */ - if (d_flags == (DCACHE_REFERENCED | DCACHE_LRU_LIST) && !d_unhashed(dentry)) + if (retain_dentry(dentry, false)) return true; /* - * Not the fast normal case? Get the lock. We've already decremented - * the refcount, but we'll need to re-check the situation after - * getting the lock. + * Either not worth retaining or we can't tell without the lock. + * Get the lock, then. We've already decremented the refcount to 0, + * but we'll need to re-check the situation after getting the lock. */ spin_lock(&dentry->d_lock); @@ -852,17 +795,11 @@ static inline bool fast_dput(struct dentry *dentry) * else could have killed it and marked it dead. Either way, we * don't need to do anything else. */ - if (dentry->d_lockref.count) { +locked: + if (dentry->d_lockref.count || retain_dentry(dentry, true)) { spin_unlock(&dentry->d_lock); return true; } - - /* - * Re-get the reference we optimistically dropped. We hold the - * lock, and we just tested that it was zero, so we can just - * set it to 1. - */ - dentry->d_lockref.count = 1; return false; } @@ -895,39 +832,37 @@ static inline bool fast_dput(struct dentry *dentry) */ void dput(struct dentry *dentry) { - while (dentry) { - might_sleep(); - - rcu_read_lock(); - if (likely(fast_dput(dentry))) { - rcu_read_unlock(); - return; - } - - /* Slow case: now with the dentry lock held */ + if (!dentry) + return; + might_sleep(); + rcu_read_lock(); + if (likely(fast_dput(dentry))) { rcu_read_unlock(); - - if (likely(retain_dentry(dentry))) { + return; + } + while (lock_for_kill(dentry)) { + rcu_read_unlock(); + dentry = __dentry_kill(dentry); + if (!dentry) + return; + if (retain_dentry(dentry, true)) { spin_unlock(&dentry->d_lock); return; } - - dentry = dentry_kill(dentry); + rcu_read_lock(); } + rcu_read_unlock(); + spin_unlock(&dentry->d_lock); } EXPORT_SYMBOL(dput); -static void __dput_to_list(struct dentry *dentry, struct list_head *list) +static void to_shrink_list(struct dentry *dentry, struct list_head *list) __must_hold(&dentry->d_lock) { - if (dentry->d_flags & DCACHE_SHRINK_LIST) { - /* let the owner of the list it's on deal with it */ - --dentry->d_lockref.count; - } else { + if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) { if (dentry->d_flags & DCACHE_LRU_LIST) d_lru_del(dentry); - if (!--dentry->d_lockref.count) - d_shrink_add(dentry, list); + d_shrink_add(dentry, list); } } @@ -939,22 +874,10 @@ void dput_to_list(struct dentry *dentry, struct list_head *list) return; } rcu_read_unlock(); - if (!retain_dentry(dentry)) - __dput_to_list(dentry, list); + to_shrink_list(dentry, list); spin_unlock(&dentry->d_lock); } -/* This must be called with d_lock held */ -static inline void __dget_dlock(struct dentry *dentry) -{ - dentry->d_lockref.count++; -} - -static inline void __dget(struct dentry *dentry) -{ - lockref_get(&dentry->d_lockref); -} - struct dentry *dget_parent(struct dentry *dentry) { int gotref; @@ -1004,7 +927,7 @@ static struct dentry * __d_find_any_alias(struct inode *inode) if (hlist_empty(&inode->i_dentry)) return NULL; alias = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias); - __dget(alias); + lockref_get(&alias->d_lockref); return alias; } @@ -1036,7 +959,7 @@ static struct dentry *__d_find_alias(struct inode *inode) hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) { spin_lock(&alias->d_lock); if (!d_unhashed(alias)) { - __dget_dlock(alias); + dget_dlock(alias); spin_unlock(&alias->d_lock); return alias; } @@ -1103,104 +1026,53 @@ struct dentry *d_find_alias_rcu(struct inode *inode) */ void d_prune_aliases(struct inode *inode) { + LIST_HEAD(dispose); struct dentry *dentry; -restart: + spin_lock(&inode->i_lock); hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) { spin_lock(&dentry->d_lock); - if (!dentry->d_lockref.count) { - struct dentry *parent = lock_parent(dentry); - if (likely(!dentry->d_lockref.count)) { - __dentry_kill(dentry); - dput(parent); - goto restart; - } - if (parent) - spin_unlock(&parent->d_lock); - } + if (!dentry->d_lockref.count) + to_shrink_list(dentry, &dispose); spin_unlock(&dentry->d_lock); } spin_unlock(&inode->i_lock); + shrink_dentry_list(&dispose); } EXPORT_SYMBOL(d_prune_aliases); -/* - * Lock a dentry from shrink list. - * Called under rcu_read_lock() and dentry->d_lock; the former - * guarantees that nothing we access will be freed under us. - * Note that dentry is *not* protected from concurrent dentry_kill(), - * d_delete(), etc. - * - * Return false if dentry has been disrupted or grabbed, leaving - * the caller to kick it off-list. Otherwise, return true and have - * that dentry's inode and parent both locked. - */ -static bool shrink_lock_dentry(struct dentry *dentry) +static inline void shrink_kill(struct dentry *victim) { - struct inode *inode; - struct dentry *parent; - - if (dentry->d_lockref.count) - return false; - - inode = dentry->d_inode; - if (inode && unlikely(!spin_trylock(&inode->i_lock))) { - spin_unlock(&dentry->d_lock); - spin_lock(&inode->i_lock); - spin_lock(&dentry->d_lock); - if (unlikely(dentry->d_lockref.count)) - goto out; - /* changed inode means that somebody had grabbed it */ - if (unlikely(inode != dentry->d_inode)) - goto out; - } - - parent = dentry->d_parent; - if (IS_ROOT(dentry) || likely(spin_trylock(&parent->d_lock))) - return true; - - spin_unlock(&dentry->d_lock); - spin_lock(&parent->d_lock); - if (unlikely(parent != dentry->d_parent)) { - spin_unlock(&parent->d_lock); - spin_lock(&dentry->d_lock); - goto out; - } - spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); - if (likely(!dentry->d_lockref.count)) - return true; - spin_unlock(&parent->d_lock); -out: - if (inode) - spin_unlock(&inode->i_lock); - return false; + do { + rcu_read_unlock(); + victim = __dentry_kill(victim); + rcu_read_lock(); + } while (victim && lock_for_kill(victim)); + rcu_read_unlock(); + if (victim) + spin_unlock(&victim->d_lock); } void shrink_dentry_list(struct list_head *list) { while (!list_empty(list)) { - struct dentry *dentry, *parent; + struct dentry *dentry; dentry = list_entry(list->prev, struct dentry, d_lru); spin_lock(&dentry->d_lock); rcu_read_lock(); - if (!shrink_lock_dentry(dentry)) { - bool can_free = false; + if (!lock_for_kill(dentry)) { + bool can_free; rcu_read_unlock(); d_shrink_del(dentry); - if (dentry->d_lockref.count < 0) - can_free = dentry->d_flags & DCACHE_MAY_FREE; + can_free = dentry->d_flags & DCACHE_DENTRY_KILLED; spin_unlock(&dentry->d_lock); if (can_free) dentry_free(dentry); continue; } - rcu_read_unlock(); d_shrink_del(dentry); - parent = dentry->d_parent; - if (parent != dentry) - __dput_to_list(parent, list); - __dentry_kill(dentry); + shrink_kill(dentry); } } @@ -1350,8 +1222,7 @@ enum d_walk_ret { static void d_walk(struct dentry *parent, void *data, enum d_walk_ret (*enter)(void *, struct dentry *)) { - struct dentry *this_parent; - struct list_head *next; + struct dentry *this_parent, *dentry; unsigned seq = 0; enum d_walk_ret ret; bool retry = true; @@ -1373,13 +1244,9 @@ again: break; } repeat: - next = this_parent->d_subdirs.next; + dentry = d_first_child(this_parent); resume: - while (next != &this_parent->d_subdirs) { - struct list_head *tmp = next; - struct dentry *dentry = list_entry(tmp, struct dentry, d_child); - next = tmp->next; - + hlist_for_each_entry_from(dentry, d_sib) { if (unlikely(dentry->d_flags & DCACHE_DENTRY_CURSOR)) continue; @@ -1400,7 +1267,7 @@ resume: continue; } - if (!list_empty(&dentry->d_subdirs)) { + if (!hlist_empty(&dentry->d_children)) { spin_unlock(&this_parent->d_lock); spin_release(&dentry->d_lock.dep_map, _RET_IP_); this_parent = dentry; @@ -1415,24 +1282,23 @@ resume: rcu_read_lock(); ascend: if (this_parent != parent) { - struct dentry *child = this_parent; - this_parent = child->d_parent; + dentry = this_parent; + this_parent = dentry->d_parent; - spin_unlock(&child->d_lock); + spin_unlock(&dentry->d_lock); spin_lock(&this_parent->d_lock); /* might go back up the wrong parent if we have had a rename. */ if (need_seqretry(&rename_lock, seq)) goto rename_retry; /* go into the first sibling still alive */ - do { - next = child->d_child.next; - if (next == &this_parent->d_subdirs) - goto ascend; - child = list_entry(next, struct dentry, d_child); - } while (unlikely(child->d_flags & DCACHE_DENTRY_KILLED)); - rcu_read_unlock(); - goto resume; + hlist_for_each_entry_continue(dentry, d_sib) { + if (likely(!(dentry->d_flags & DCACHE_DENTRY_KILLED))) { + rcu_read_unlock(); + goto resume; + } + } + goto ascend; } if (need_seqretry(&rename_lock, seq)) goto rename_retry; @@ -1532,7 +1398,7 @@ out: * Search the dentry child list of the specified parent, * and move any unused dentries to the end of the unused * list for prune_dcache(). We descend to the next level - * whenever the d_subdirs list is non-empty and continue + * whenever the d_children list is non-empty and continue * searching. * * It returns zero iff there are no unused children, @@ -1562,13 +1428,11 @@ static enum d_walk_ret select_collect(void *_data, struct dentry *dentry) if (dentry->d_flags & DCACHE_SHRINK_LIST) { data->found++; - } else { - if (dentry->d_flags & DCACHE_LRU_LIST) - d_lru_del(dentry); - if (!dentry->d_lockref.count) { - d_shrink_add(dentry, &data->dispose); - data->found++; - } + } else if (!dentry->d_lockref.count) { + to_shrink_list(dentry, &data->dispose); + data->found++; + } else if (dentry->d_lockref.count < 0) { + data->found++; } /* * We can return to the caller if we have found some (this @@ -1589,17 +1453,13 @@ static enum d_walk_ret select_collect2(void *_data, struct dentry *dentry) if (data->start == dentry) goto out; - if (dentry->d_flags & DCACHE_SHRINK_LIST) { - if (!dentry->d_lockref.count) { + if (!dentry->d_lockref.count) { + if (dentry->d_flags & DCACHE_SHRINK_LIST) { rcu_read_lock(); data->victim = dentry; return D_WALK_QUIT; } - } else { - if (dentry->d_flags & DCACHE_LRU_LIST) - d_lru_del(dentry); - if (!dentry->d_lockref.count) - d_shrink_add(dentry, &data->dispose); + to_shrink_list(dentry, &data->dispose); } /* * We can return to the caller if we have found some (this @@ -1637,17 +1497,12 @@ void shrink_dcache_parent(struct dentry *parent) data.victim = NULL; d_walk(parent, &data, select_collect2); if (data.victim) { - struct dentry *parent; spin_lock(&data.victim->d_lock); - if (!shrink_lock_dentry(data.victim)) { + if (!lock_for_kill(data.victim)) { spin_unlock(&data.victim->d_lock); rcu_read_unlock(); } else { - rcu_read_unlock(); - parent = data.victim->d_parent; - if (parent != data.victim) - __dput_to_list(parent, &data.dispose); - __dentry_kill(data.victim); + shrink_kill(data.victim); } } if (!list_empty(&data.dispose)) @@ -1659,7 +1514,7 @@ EXPORT_SYMBOL(shrink_dcache_parent); static enum d_walk_ret umount_check(void *_data, struct dentry *dentry) { /* it has busy descendents; complain about those instead */ - if (!list_empty(&dentry->d_subdirs)) + if (!hlist_empty(&dentry->d_children)) return D_WALK_CONTINUE; /* root with refcount 1 is fine */ @@ -1709,8 +1564,7 @@ static enum d_walk_ret find_submount(void *_data, struct dentry *dentry) { struct dentry **victim = _data; if (d_mountpoint(dentry)) { - __dget_dlock(dentry); - *victim = dentry; + *victim = dget_dlock(dentry); return D_WALK_QUIT; } return D_WALK_CONTINUE; @@ -1816,9 +1670,9 @@ static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) dentry->d_fsdata = NULL; INIT_HLIST_BL_NODE(&dentry->d_hash); INIT_LIST_HEAD(&dentry->d_lru); - INIT_LIST_HEAD(&dentry->d_subdirs); + INIT_HLIST_HEAD(&dentry->d_children); INIT_HLIST_NODE(&dentry->d_u.d_alias); - INIT_LIST_HEAD(&dentry->d_child); + INIT_HLIST_NODE(&dentry->d_sib); d_set_d_op(dentry, dentry->d_sb->s_d_op); if (dentry->d_op && dentry->d_op->d_init) { @@ -1855,9 +1709,8 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name) * don't need child lock because it is not subject * to concurrency here */ - __dget_dlock(parent); - dentry->d_parent = parent; - list_add(&dentry->d_child, &parent->d_subdirs); + dentry->d_parent = dget_dlock(parent); + hlist_add_head(&dentry->d_sib, &parent->d_children); spin_unlock(&parent->d_lock); return dentry; @@ -1897,9 +1750,15 @@ struct dentry *d_alloc_cursor(struct dentry * parent) */ struct dentry *d_alloc_pseudo(struct super_block *sb, const struct qstr *name) { + static const struct dentry_operations anon_ops = { + .d_dname = simple_dname + }; struct dentry *dentry = __d_alloc(sb, name); - if (likely(dentry)) + if (likely(dentry)) { dentry->d_flags |= DCACHE_NORCU; + if (!sb->s_d_op) + d_set_d_op(dentry, &anon_ops); + } return dentry; } @@ -1943,22 +1802,6 @@ void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op) } EXPORT_SYMBOL(d_set_d_op); - -/* - * d_set_fallthru - Mark a dentry as falling through to a lower layer - * @dentry - The dentry to mark - * - * Mark a dentry as falling through to the lower layer (as set with - * d_pin_lower()). This flag may be recorded on the medium. - */ -void d_set_fallthru(struct dentry *dentry) -{ - spin_lock(&dentry->d_lock); - dentry->d_flags |= DCACHE_FALLTHRU; - spin_unlock(&dentry->d_lock); -} -EXPORT_SYMBOL(d_set_fallthru); - static unsigned d_flags_for_inode(struct inode *inode) { unsigned add_flags = DCACHE_REGULAR_TYPE; @@ -2077,75 +1920,55 @@ struct dentry *d_make_root(struct inode *root_inode) } EXPORT_SYMBOL(d_make_root); -static struct dentry *__d_instantiate_anon(struct dentry *dentry, - struct inode *inode, - bool disconnected) -{ - struct dentry *res; - unsigned add_flags; - - security_d_instantiate(dentry, inode); - spin_lock(&inode->i_lock); - res = __d_find_any_alias(inode); - if (res) { - spin_unlock(&inode->i_lock); - dput(dentry); - goto out_iput; - } - - /* attach a disconnected dentry */ - add_flags = d_flags_for_inode(inode); - - if (disconnected) - add_flags |= DCACHE_DISCONNECTED; - - spin_lock(&dentry->d_lock); - __d_set_inode_and_type(dentry, inode, add_flags); - hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry); - if (!disconnected) { - hlist_bl_lock(&dentry->d_sb->s_roots); - hlist_bl_add_head(&dentry->d_hash, &dentry->d_sb->s_roots); - hlist_bl_unlock(&dentry->d_sb->s_roots); - } - spin_unlock(&dentry->d_lock); - spin_unlock(&inode->i_lock); - - return dentry; - - out_iput: - iput(inode); - return res; -} - -struct dentry *d_instantiate_anon(struct dentry *dentry, struct inode *inode) -{ - return __d_instantiate_anon(dentry, inode, true); -} -EXPORT_SYMBOL(d_instantiate_anon); - static struct dentry *__d_obtain_alias(struct inode *inode, bool disconnected) { - struct dentry *tmp; - struct dentry *res; + struct super_block *sb; + struct dentry *new, *res; if (!inode) return ERR_PTR(-ESTALE); if (IS_ERR(inode)) return ERR_CAST(inode); - res = d_find_any_alias(inode); + sb = inode->i_sb; + + res = d_find_any_alias(inode); /* existing alias? */ if (res) - goto out_iput; + goto out; - tmp = d_alloc_anon(inode->i_sb); - if (!tmp) { + new = d_alloc_anon(sb); + if (!new) { res = ERR_PTR(-ENOMEM); - goto out_iput; + goto out; } - return __d_instantiate_anon(tmp, inode, disconnected); + security_d_instantiate(new, inode); + spin_lock(&inode->i_lock); + res = __d_find_any_alias(inode); /* recheck under lock */ + if (likely(!res)) { /* still no alias, attach a disconnected dentry */ + unsigned add_flags = d_flags_for_inode(inode); + + if (disconnected) + add_flags |= DCACHE_DISCONNECTED; -out_iput: + spin_lock(&new->d_lock); + __d_set_inode_and_type(new, inode, add_flags); + hlist_add_head(&new->d_u.d_alias, &inode->i_dentry); + if (!disconnected) { + hlist_bl_lock(&sb->s_roots); + hlist_bl_add_head(&new->d_hash, &sb->s_roots); + hlist_bl_unlock(&sb->s_roots); + } + spin_unlock(&new->d_lock); + spin_unlock(&inode->i_lock); + inode = NULL; /* consumed by new->d_inode */ + res = new; + } else { + spin_unlock(&inode->i_lock); + dput(new); + } + + out: iput(inode); return res; } @@ -2729,7 +2552,7 @@ retry: /* we can't take ->d_lock here; it's OK, though. */ new->d_flags |= DCACHE_PAR_LOOKUP; new->d_wait = wq; - hlist_bl_add_head_rcu(&new->d_u.d_in_lookup_hash, b); + hlist_bl_add_head(&new->d_u.d_in_lookup_hash, b); hlist_bl_unlock(b); return new; mismatch: @@ -2853,7 +2676,7 @@ struct dentry *d_exact_alias(struct dentry *entry, struct inode *inode) spin_unlock(&alias->d_lock); alias = NULL; } else { - __dget_dlock(alias); + dget_dlock(alias); __d_rehash(alias); spin_unlock(&alias->d_lock); } @@ -2995,11 +2818,15 @@ static void __d_move(struct dentry *dentry, struct dentry *target, } else { target->d_parent = old_parent; swap_names(dentry, target); - list_move(&target->d_child, &target->d_parent->d_subdirs); + if (!hlist_unhashed(&target->d_sib)) + __hlist_del(&target->d_sib); + hlist_add_head(&target->d_sib, &target->d_parent->d_children); __d_rehash(target); fsnotify_update_flags(target); } - list_move(&dentry->d_child, &dentry->d_parent->d_subdirs); + if (!hlist_unhashed(&dentry->d_sib)) + __hlist_del(&dentry->d_sib); + hlist_add_head(&dentry->d_sib, &dentry->d_parent->d_children); __d_rehash(dentry); fsnotify_update_flags(dentry); fscrypt_handle_d_move(dentry); @@ -3082,8 +2909,7 @@ struct dentry *d_ancestor(struct dentry *p1, struct dentry *p2) * Note: If ever the locking in lock_rename() changes, then please * remember to update this too... */ -static int __d_unalias(struct inode *inode, - struct dentry *dentry, struct dentry *alias) +static int __d_unalias(struct dentry *dentry, struct dentry *alias) { struct mutex *m1 = NULL; struct rw_semaphore *m2 = NULL; @@ -3164,7 +2990,7 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) inode->i_sb->s_id); } else if (!IS_ROOT(new)) { struct dentry *old_parent = dget(new->d_parent); - int err = __d_unalias(inode, dentry, new); + int err = __d_unalias(dentry, new); write_sequnlock(&rename_lock); if (err) { dput(new); @@ -3235,10 +3061,7 @@ static enum d_walk_ret d_genocide_kill(void *data, struct dentry *dentry) if (d_unhashed(dentry) || !dentry->d_inode) return D_WALK_SKIP; - if (!(dentry->d_flags & DCACHE_GENOCIDE)) { - dentry->d_flags |= DCACHE_GENOCIDE; - dentry->d_lockref.count--; - } + dentry->d_lockref.count--; } return D_WALK_CONTINUE; } diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index c830261aa883..b20e565b9c5e 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -69,7 +69,6 @@ static struct ctl_table pty_table[] = { .data = &pty_count, .proc_handler = proc_dointvec, }, - {} }; struct pts_mount_opts { diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index d7193687b9b4..5ed1e4cf6c0b 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -607,6 +607,8 @@ ecryptfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, target_inode = d_inode(new_dentry); trap = lock_rename(lower_old_dir_dentry, lower_new_dir_dentry); + if (IS_ERR(trap)) + return PTR_ERR(trap); dget(lower_new_dentry); rc = -EINVAL; if (lower_old_dentry->d_parent != lower_old_dir_dentry) diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig index 1d318f85232d..fffd3919343e 100644 --- a/fs/erofs/Kconfig +++ b/fs/erofs/Kconfig @@ -114,8 +114,11 @@ config EROFS_FS_ZIP_DEFLATE config EROFS_FS_ONDEMAND bool "EROFS fscache-based on-demand read support" - depends on CACHEFILES_ONDEMAND && (EROFS_FS=m && FSCACHE || EROFS_FS=y && FSCACHE=y) - default n + depends on EROFS_FS + select NETFS_SUPPORT + select FSCACHE + select CACHEFILES + select CACHEFILES_ONDEMAND help This permits EROFS to use fscache-backed data blobs with on-demand read support. diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h index 279933e007d2..7cc5841577b2 100644 --- a/fs/erofs/compress.h +++ b/fs/erofs/compress.h @@ -11,13 +11,12 @@ struct z_erofs_decompress_req { struct super_block *sb; struct page **in, **out; - unsigned short pageofs_in, pageofs_out; unsigned int inputsize, outputsize; - /* indicate the algorithm will be used for decompression */ - unsigned int alg; + unsigned int alg; /* the algorithm for decompression */ bool inplace_io, partial_decoding, fillgaps; + gfp_t gfp; /* allocation flags for extra temporary buffers */ }; struct z_erofs_decompressor { diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index 1d65b9f60a39..d4cee95af14c 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -111,8 +111,9 @@ static int z_erofs_lz4_prepare_dstpages(struct z_erofs_lz4_decompress_ctx *ctx, victim = availables[--top]; get_page(victim); } else { - victim = erofs_allocpage(pagepool, - GFP_KERNEL | __GFP_NOFAIL); + victim = erofs_allocpage(pagepool, rq->gfp); + if (!victim) + return -ENOMEM; set_page_private(victim, Z_EROFS_SHORTLIVED_PAGE); } rq->out[i] = victim; @@ -408,7 +409,7 @@ int z_erofs_parse_cfgs(struct super_block *sb, struct erofs_super_block *dsb) int size, ret = 0; if (!erofs_sb_has_compr_cfgs(sbi)) { - sbi->available_compr_algs = Z_EROFS_COMPRESSION_LZ4; + sbi->available_compr_algs = 1 << Z_EROFS_COMPRESSION_LZ4; return z_erofs_load_lz4_config(sb, dsb, NULL, 0); } diff --git a/fs/erofs/decompressor_deflate.c b/fs/erofs/decompressor_deflate.c index 4a64a9c91dd3..b98872058abe 100644 --- a/fs/erofs/decompressor_deflate.c +++ b/fs/erofs/decompressor_deflate.c @@ -95,7 +95,7 @@ int z_erofs_load_deflate_config(struct super_block *sb, } int z_erofs_deflate_decompress(struct z_erofs_decompress_req *rq, - struct page **pagepool) + struct page **pgpl) { const unsigned int nrpages_out = PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; @@ -158,8 +158,12 @@ again: strm->z.avail_out = min_t(u32, outsz, PAGE_SIZE - pofs); outsz -= strm->z.avail_out; if (!rq->out[no]) { - rq->out[no] = erofs_allocpage(pagepool, - GFP_KERNEL | __GFP_NOFAIL); + rq->out[no] = erofs_allocpage(pgpl, rq->gfp); + if (!rq->out[no]) { + kout = NULL; + err = -ENOMEM; + break; + } set_page_private(rq->out[no], Z_EROFS_SHORTLIVED_PAGE); } @@ -211,8 +215,11 @@ again: DBG_BUGON(erofs_page_is_managed(EROFS_SB(sb), rq->in[j])); - tmppage = erofs_allocpage(pagepool, - GFP_KERNEL | __GFP_NOFAIL); + tmppage = erofs_allocpage(pgpl, rq->gfp); + if (!tmppage) { + err = -ENOMEM; + goto failed; + } set_page_private(tmppage, Z_EROFS_SHORTLIVED_PAGE); copy_highpage(tmppage, rq->in[j]); rq->in[j] = tmppage; @@ -230,7 +237,7 @@ again: break; } } - +failed: if (zlib_inflateEnd(&strm->z) != Z_OK && !err) err = -EIO; if (kout) diff --git a/fs/erofs/decompressor_lzma.c b/fs/erofs/decompressor_lzma.c index 2dd14f99c1dc..6ca357d83cfa 100644 --- a/fs/erofs/decompressor_lzma.c +++ b/fs/erofs/decompressor_lzma.c @@ -148,7 +148,7 @@ again: } int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq, - struct page **pagepool) + struct page **pgpl) { const unsigned int nrpages_out = PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; @@ -215,8 +215,11 @@ again: PAGE_SIZE - pageofs); outlen -= strm->buf.out_size; if (!rq->out[no] && rq->fillgaps) { /* deduped */ - rq->out[no] = erofs_allocpage(pagepool, - GFP_KERNEL | __GFP_NOFAIL); + rq->out[no] = erofs_allocpage(pgpl, rq->gfp); + if (!rq->out[no]) { + err = -ENOMEM; + break; + } set_page_private(rq->out[no], Z_EROFS_SHORTLIVED_PAGE); } @@ -258,8 +261,11 @@ again: DBG_BUGON(erofs_page_is_managed(EROFS_SB(rq->sb), rq->in[j])); - tmppage = erofs_allocpage(pagepool, - GFP_KERNEL | __GFP_NOFAIL); + tmppage = erofs_allocpage(pgpl, rq->gfp); + if (!tmppage) { + err = -ENOMEM; + goto failed; + } set_page_private(tmppage, Z_EROFS_SHORTLIVED_PAGE); copy_highpage(tmppage, rq->in[j]); rq->in[j] = tmppage; @@ -277,6 +283,7 @@ again: break; } } +failed: if (no < nrpages_out && strm->buf.out) kunmap(rq->out[no]); if (ni < nrpages_in) diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c index 87ff35bff8d5..5ff90026fd43 100644 --- a/fs/erofs/fscache.c +++ b/fs/erofs/fscache.c @@ -165,10 +165,10 @@ static int erofs_fscache_read_folios_async(struct fscache_cookie *cookie, static int erofs_fscache_meta_read_folio(struct file *data, struct folio *folio) { int ret; - struct erofs_fscache *ctx = folio_mapping(folio)->host->i_private; + struct erofs_fscache *ctx = folio->mapping->host->i_private; struct erofs_fscache_request *req; - req = erofs_fscache_req_alloc(folio_mapping(folio), + req = erofs_fscache_req_alloc(folio->mapping, folio_pos(folio), folio_size(folio)); if (IS_ERR(req)) { folio_unlock(folio); @@ -276,7 +276,7 @@ static int erofs_fscache_read_folio(struct file *file, struct folio *folio) struct erofs_fscache_request *req; int ret; - req = erofs_fscache_req_alloc(folio_mapping(folio), + req = erofs_fscache_req_alloc(folio->mapping, folio_pos(folio), folio_size(folio)); if (IS_ERR(req)) { folio_unlock(folio); @@ -459,7 +459,7 @@ static struct erofs_fscache *erofs_fscache_acquire_cookie(struct super_block *sb inode->i_size = OFFSET_MAX; inode->i_mapping->a_ops = &erofs_fscache_meta_aops; - mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); + mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL); inode->i_blkbits = EROFS_SB(sb)->blkszbits; inode->i_private = ctx; diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index 3d616dea55dc..36e638e8b53a 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -60,7 +60,7 @@ static void *erofs_read_inode(struct erofs_buf *buf, } else { const unsigned int gotten = sb->s_blocksize - *ofs; - copied = kmalloc(vi->inode_isize, GFP_NOFS); + copied = kmalloc(vi->inode_isize, GFP_KERNEL); if (!copied) { err = -ENOMEM; goto err_out; diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c index 5dea308764b4..e146d09151af 100644 --- a/fs/erofs/utils.c +++ b/fs/erofs/utils.c @@ -81,7 +81,7 @@ struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb, repeat: xa_lock(&sbi->managed_pslots); pre = __xa_cmpxchg(&sbi->managed_pslots, grp->index, - NULL, grp, GFP_NOFS); + NULL, grp, GFP_KERNEL); if (pre) { if (xa_is_err(pre)) { pre = ERR_PTR(xa_err(pre)); diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 692c0c39be63..ff0aa72b0db3 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -82,6 +82,9 @@ struct z_erofs_pcluster { /* L: indicate several pageofs_outs or not */ bool multibases; + /* L: whether extra buffer allocations are best-effort */ + bool besteffort; + /* A: compressed bvecs (can be cached or inplaced pages) */ struct z_erofs_bvec compressed_bvecs[]; }; @@ -230,7 +233,7 @@ static int z_erofs_bvec_enqueue(struct z_erofs_bvec_iter *iter, struct page *nextpage = *candidate_bvpage; if (!nextpage) { - nextpage = erofs_allocpage(pagepool, GFP_NOFS); + nextpage = erofs_allocpage(pagepool, GFP_KERNEL); if (!nextpage) return -ENOMEM; set_page_private(nextpage, Z_EROFS_SHORTLIVED_PAGE); @@ -302,7 +305,7 @@ static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int size) if (nrpages > pcs->maxpages) continue; - pcl = kmem_cache_zalloc(pcs->slab, GFP_NOFS); + pcl = kmem_cache_zalloc(pcs->slab, GFP_KERNEL); if (!pcl) return ERR_PTR(-ENOMEM); pcl->pclustersize = size; @@ -563,21 +566,19 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe) __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; unsigned int i; - if (i_blocksize(fe->inode) != PAGE_SIZE) - return; - if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED) + if (i_blocksize(fe->inode) != PAGE_SIZE || + fe->mode < Z_EROFS_PCLUSTER_FOLLOWED) return; for (i = 0; i < pclusterpages; ++i) { struct page *page, *newpage; void *t; /* mark pages just found for debugging */ - /* the compressed page was loaded before */ + /* Inaccurate check w/o locking to avoid unneeded lookups */ if (READ_ONCE(pcl->compressed_bvecs[i].page)) continue; page = find_get_page(mc, pcl->obj.index + i); - if (page) { t = (void *)((unsigned long)page | 1); newpage = NULL; @@ -597,9 +598,13 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe) set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE); t = (void *)((unsigned long)newpage | 1); } - - if (!cmpxchg_relaxed(&pcl->compressed_bvecs[i].page, NULL, t)) + spin_lock(&pcl->obj.lockref.lock); + if (!pcl->compressed_bvecs[i].page) { + pcl->compressed_bvecs[i].page = t; + spin_unlock(&pcl->obj.lockref.lock); continue; + } + spin_unlock(&pcl->obj.lockref.lock); if (page) put_page(page); @@ -694,7 +699,7 @@ static void z_erofs_cache_invalidate_folio(struct folio *folio, DBG_BUGON(stop > folio_size(folio) || stop < length); if (offset == 0 && stop == folio_size(folio)) - while (!z_erofs_cache_release_folio(folio, GFP_NOFS)) + while (!z_erofs_cache_release_folio(folio, 0)) cond_resched(); } @@ -713,36 +718,30 @@ int erofs_init_managed_cache(struct super_block *sb) set_nlink(inode, 1); inode->i_size = OFFSET_MAX; inode->i_mapping->a_ops = &z_erofs_cache_aops; - mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); + mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL); EROFS_SB(sb)->managed_cache = inode; return 0; } -static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe, - struct z_erofs_bvec *bvec) -{ - struct z_erofs_pcluster *const pcl = fe->pcl; - - while (fe->icur > 0) { - if (!cmpxchg(&pcl->compressed_bvecs[--fe->icur].page, - NULL, bvec->page)) { - pcl->compressed_bvecs[fe->icur] = *bvec; - return true; - } - } - return false; -} - /* callers must be with pcluster lock held */ static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe, struct z_erofs_bvec *bvec, bool exclusive) { + struct z_erofs_pcluster *pcl = fe->pcl; int ret; if (exclusive) { /* give priority for inplaceio to use file pages first */ - if (z_erofs_try_inplace_io(fe, bvec)) + spin_lock(&pcl->obj.lockref.lock); + while (fe->icur > 0) { + if (pcl->compressed_bvecs[--fe->icur].page) + continue; + pcl->compressed_bvecs[fe->icur] = *bvec; + spin_unlock(&pcl->obj.lockref.lock); return 0; + } + spin_unlock(&pcl->obj.lockref.lock); + /* otherwise, check if it can be used as a bvpage */ if (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED && !fe->candidate_bvpage) @@ -964,7 +963,7 @@ static int z_erofs_read_fragment(struct super_block *sb, struct page *page, } static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, - struct page *page) + struct page *page, bool ra) { struct inode *const inode = fe->inode; struct erofs_map_blocks *const map = &fe->map; @@ -1014,6 +1013,7 @@ repeat: err = z_erofs_pcluster_begin(fe); if (err) goto out; + fe->pcl->besteffort |= !ra; } /* @@ -1280,6 +1280,9 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, .inplace_io = overlapped, .partial_decoding = pcl->partial, .fillgaps = pcl->multibases, + .gfp = pcl->besteffort ? + GFP_KERNEL | __GFP_NOFAIL : + GFP_NOWAIT | __GFP_NORETRY }, be->pagepool); /* must handle all compressed pages before actual file pages */ @@ -1322,6 +1325,7 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, pcl->length = 0; pcl->partial = true; pcl->multibases = false; + pcl->besteffort = false; pcl->bvset.nextpage = NULL; pcl->vcnt = 0; @@ -1423,23 +1427,26 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec, { gfp_t gfp = mapping_gfp_mask(mc); bool tocache = false; - struct z_erofs_bvec *zbv = pcl->compressed_bvecs + nr; + struct z_erofs_bvec zbv; struct address_space *mapping; - struct page *page, *oldpage; + struct page *page; int justfound, bs = i_blocksize(f->inode); /* Except for inplace pages, the entire page can be used for I/Os */ bvec->bv_offset = 0; bvec->bv_len = PAGE_SIZE; repeat: - oldpage = READ_ONCE(zbv->page); - if (!oldpage) + spin_lock(&pcl->obj.lockref.lock); + zbv = pcl->compressed_bvecs[nr]; + page = zbv.page; + justfound = (unsigned long)page & 1UL; + page = (struct page *)((unsigned long)page & ~1UL); + pcl->compressed_bvecs[nr].page = page; + spin_unlock(&pcl->obj.lockref.lock); + if (!page) goto out_allocpage; - justfound = (unsigned long)oldpage & 1UL; - page = (struct page *)((unsigned long)oldpage & ~1UL); bvec->bv_page = page; - DBG_BUGON(z_erofs_is_shortlived_page(page)); /* * Handle preallocated cached pages. We tried to allocate such pages @@ -1448,7 +1455,6 @@ repeat: */ if (page->private == Z_EROFS_PREALLOCATED_PAGE) { set_page_private(page, 0); - WRITE_ONCE(zbv->page, page); tocache = true; goto out_tocache; } @@ -1459,9 +1465,9 @@ repeat: * therefore it is impossible for `mapping` to be NULL. */ if (mapping && mapping != mc) { - if (zbv->offset < 0) - bvec->bv_offset = round_up(-zbv->offset, bs); - bvec->bv_len = round_up(zbv->end, bs) - bvec->bv_offset; + if (zbv.offset < 0) + bvec->bv_offset = round_up(-zbv.offset, bs); + bvec->bv_len = round_up(zbv.end, bs) - bvec->bv_offset; return; } @@ -1471,7 +1477,6 @@ repeat: /* the cached page is still in managed cache */ if (page->mapping == mc) { - WRITE_ONCE(zbv->page, page); /* * The cached page is still available but without a valid * `->private` pcluster hint. Let's reconnect them. @@ -1503,11 +1508,15 @@ repeat: put_page(page); out_allocpage: page = erofs_allocpage(&f->pagepool, gfp | __GFP_NOFAIL); - if (oldpage != cmpxchg(&zbv->page, oldpage, page)) { + spin_lock(&pcl->obj.lockref.lock); + if (pcl->compressed_bvecs[nr].page) { erofs_pagepool_add(&f->pagepool, page); + spin_unlock(&pcl->obj.lockref.lock); cond_resched(); goto repeat; } + pcl->compressed_bvecs[nr].page = page; + spin_unlock(&pcl->obj.lockref.lock); bvec->bv_page = page; out_tocache: if (!tocache || bs != PAGE_SIZE || @@ -1685,6 +1694,7 @@ submit_bio_retry: if (cur + bvec.bv_len > end) bvec.bv_len = end - cur; + DBG_BUGON(bvec.bv_len < sb->s_blocksize); if (!bio_add_page(bio, bvec.bv_page, bvec.bv_len, bvec.bv_offset)) goto submit_bio_retry; @@ -1785,7 +1795,7 @@ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f, if (PageUptodate(page)) unlock_page(page); else - (void)z_erofs_do_read_page(f, page); + (void)z_erofs_do_read_page(f, page, !!rac); put_page(page); } @@ -1806,7 +1816,7 @@ static int z_erofs_read_folio(struct file *file, struct folio *folio) f.headoffset = (erofs_off_t)folio->index << PAGE_SHIFT; z_erofs_pcluster_readmore(&f, NULL, true); - err = z_erofs_do_read_page(&f, &folio->page); + err = z_erofs_do_read_page(&f, &folio->page, false); z_erofs_pcluster_readmore(&f, NULL, false); z_erofs_pcluster_end(&f); @@ -1847,7 +1857,7 @@ static void z_erofs_readahead(struct readahead_control *rac) folio = head; head = folio_get_private(folio); - err = z_erofs_do_read_page(&f, &folio->page); + err = z_erofs_do_read_page(&f, &folio->page, true); if (err && err != -EINTR) erofs_err(inode->i_sb, "readahead error at folio %lu @ nid %llu", folio->index, EROFS_I(inode)->nid); diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 9753875e41cb..e313c936351d 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -454,7 +454,7 @@ static int z_erofs_do_map_blocks(struct inode *inode, .map = map, }; int err = 0; - unsigned int lclusterbits, endoff; + unsigned int lclusterbits, endoff, afmt; unsigned long initial_lcn; unsigned long long ofs, end; @@ -543,17 +543,20 @@ static int z_erofs_do_map_blocks(struct inode *inode, err = -EFSCORRUPTED; goto unmap_out; } - if (vi->z_advise & Z_EROFS_ADVISE_INTERLACED_PCLUSTER) - map->m_algorithmformat = - Z_EROFS_COMPRESSION_INTERLACED; - else - map->m_algorithmformat = - Z_EROFS_COMPRESSION_SHIFTED; - } else if (m.headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2) { - map->m_algorithmformat = vi->z_algorithmtype[1]; + afmt = vi->z_advise & Z_EROFS_ADVISE_INTERLACED_PCLUSTER ? + Z_EROFS_COMPRESSION_INTERLACED : + Z_EROFS_COMPRESSION_SHIFTED; } else { - map->m_algorithmformat = vi->z_algorithmtype[0]; + afmt = m.headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2 ? + vi->z_algorithmtype[1] : vi->z_algorithmtype[0]; + if (!(EROFS_I_SB(inode)->available_compr_algs & (1 << afmt))) { + erofs_err(inode->i_sb, "inconsistent algorithmtype %u for nid %llu", + afmt, vi->nid); + err = -EFSCORRUPTED; + goto unmap_out; + } } + map->m_algorithmformat = afmt; if ((flags & EROFS_GET_BLOCKS_FIEMAP) || ((flags & EROFS_GET_BLOCKS_READMORE) && diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 2877cc01cff1..3534d36a1474 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -322,7 +322,6 @@ static struct ctl_table epoll_table[] = { .extra1 = &long_zero, .extra2 = &long_max, }, - { } }; static void __init epoll_sysctls_init(void) diff --git a/fs/exec.c b/fs/exec.c index ee43597cb453..af4fbb61cd53 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -66,6 +66,7 @@ #include <linux/coredump.h> #include <linux/time_namespace.h> #include <linux/user_events.h> +#include <linux/rseq.h> #include <linux/uaccess.h> #include <asm/mmu_context.h> @@ -127,7 +128,7 @@ SYSCALL_DEFINE1(uselib, const char __user *, library) struct filename *tmp = getname(library); int error = PTR_ERR(tmp); static const struct open_flags uselib_flags = { - .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, + .open_flag = O_LARGEFILE | O_RDONLY, .acc_mode = MAY_READ | MAY_EXEC, .intent = LOOKUP_OPEN, .lookup_flags = LOOKUP_FOLLOW, @@ -903,6 +904,10 @@ EXPORT_SYMBOL(transfer_args_to_stack); #endif /* CONFIG_MMU */ +/* + * On success, caller must call do_close_execat() on the returned + * struct file to close it. + */ static struct file *do_open_execat(int fd, struct filename *name, int flags) { struct file *file; @@ -947,6 +952,17 @@ exit: return ERR_PTR(err); } +/** + * open_exec - Open a path name for execution + * + * @name: path name to open with the intent of executing it. + * + * Returns ERR_PTR on failure or allocated struct file on success. + * + * As this is a wrapper for the internal do_open_execat(), callers + * must call allow_write_access() before fput() on release. Also see + * do_close_execat(). + */ struct file *open_exec(const char *name) { struct filename *filename = getname_kernel(name); @@ -1408,6 +1424,9 @@ int begin_new_exec(struct linux_binprm * bprm) out_unlock: up_write(&me->signal->exec_update_lock); + if (!bprm->cred) + mutex_unlock(&me->signal->cred_guard_mutex); + out: return retval; } @@ -1483,6 +1502,15 @@ static int prepare_bprm_creds(struct linux_binprm *bprm) return -ENOMEM; } +/* Matches do_open_execat() */ +static void do_close_execat(struct file *file) +{ + if (!file) + return; + allow_write_access(file); + fput(file); +} + static void free_bprm(struct linux_binprm *bprm) { if (bprm->mm) { @@ -1494,10 +1522,7 @@ static void free_bprm(struct linux_binprm *bprm) mutex_unlock(¤t->signal->cred_guard_mutex); abort_creds(bprm->cred); } - if (bprm->file) { - allow_write_access(bprm->file); - fput(bprm->file); - } + do_close_execat(bprm->file); if (bprm->executable) fput(bprm->executable); /* If a binfmt changed the interp, free it. */ @@ -1507,12 +1532,23 @@ static void free_bprm(struct linux_binprm *bprm) kfree(bprm); } -static struct linux_binprm *alloc_bprm(int fd, struct filename *filename) +static struct linux_binprm *alloc_bprm(int fd, struct filename *filename, int flags) { - struct linux_binprm *bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); + struct linux_binprm *bprm; + struct file *file; int retval = -ENOMEM; - if (!bprm) - goto out; + + file = do_open_execat(fd, filename, flags); + if (IS_ERR(file)) + return ERR_CAST(file); + + bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); + if (!bprm) { + do_close_execat(file); + return ERR_PTR(-ENOMEM); + } + + bprm->file = file; if (fd == AT_FDCWD || filename->name[0] == '/') { bprm->filename = filename->name; @@ -1525,18 +1561,28 @@ static struct linux_binprm *alloc_bprm(int fd, struct filename *filename) if (!bprm->fdpath) goto out_free; + /* + * Record that a name derived from an O_CLOEXEC fd will be + * inaccessible after exec. This allows the code in exec to + * choose to fail when the executable is not mmaped into the + * interpreter and an open file descriptor is not passed to + * the interpreter. This makes for a better user experience + * than having the interpreter start and then immediately fail + * when it finds the executable is inaccessible. + */ + if (get_close_on_exec(fd)) + bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE; + bprm->filename = bprm->fdpath; } bprm->interp = bprm->filename; retval = bprm_mm_init(bprm); - if (retval) - goto out_free; - return bprm; + if (!retval) + return bprm; out_free: free_bprm(bprm); -out: return ERR_PTR(retval); } @@ -1587,6 +1633,7 @@ static void check_unsafe_exec(struct linux_binprm *bprm) } rcu_read_unlock(); + /* "users" and "in_exec" locked for copy_fs() */ if (p->fs->users > n_fs) bprm->unsafe |= LSM_UNSAFE_SHARE; else @@ -1803,13 +1850,8 @@ static int exec_binprm(struct linux_binprm *bprm) return 0; } -/* - * sys_execve() executes a new program. - */ -static int bprm_execve(struct linux_binprm *bprm, - int fd, struct filename *filename, int flags) +static int bprm_execve(struct linux_binprm *bprm) { - struct file *file; int retval; retval = prepare_bprm_creds(bprm); @@ -1825,26 +1867,8 @@ static int bprm_execve(struct linux_binprm *bprm, current->in_execve = 1; sched_mm_cid_before_execve(current); - file = do_open_execat(fd, filename, flags); - retval = PTR_ERR(file); - if (IS_ERR(file)) - goto out_unmark; - sched_exec(); - bprm->file = file; - /* - * Record that a name derived from an O_CLOEXEC fd will be - * inaccessible after exec. This allows the code in exec to - * choose to fail when the executable is not mmaped into the - * interpreter and an open file descriptor is not passed to - * the interpreter. This makes for a better user experience - * than having the interpreter start and then immediately fail - * when it finds the executable is inaccessible. - */ - if (bprm->fdpath && get_close_on_exec(fd)) - bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE; - /* Set the unchanging part of bprm->cred */ retval = security_bprm_creds_for_exec(bprm); if (retval) @@ -1874,7 +1898,6 @@ out: if (bprm->point_of_no_return && !fatal_signal_pending(current)) force_fatal_sig(SIGSEGV); -out_unmark: sched_mm_cid_after_execve(current); current->fs->in_exec = 0; current->in_execve = 0; @@ -1909,7 +1932,7 @@ static int do_execveat_common(int fd, struct filename *filename, * further execve() calls fail. */ current->flags &= ~PF_NPROC_EXCEEDED; - bprm = alloc_bprm(fd, filename); + bprm = alloc_bprm(fd, filename, flags); if (IS_ERR(bprm)) { retval = PTR_ERR(bprm); goto out_ret; @@ -1958,7 +1981,7 @@ static int do_execveat_common(int fd, struct filename *filename, bprm->argc = 1; } - retval = bprm_execve(bprm, fd, filename, flags); + retval = bprm_execve(bprm); out_free: free_bprm(bprm); @@ -1983,7 +2006,7 @@ int kernel_execve(const char *kernel_filename, if (IS_ERR(filename)) return PTR_ERR(filename); - bprm = alloc_bprm(fd, filename); + bprm = alloc_bprm(fd, filename, 0); if (IS_ERR(bprm)) { retval = PTR_ERR(bprm); goto out_ret; @@ -2018,7 +2041,7 @@ int kernel_execve(const char *kernel_filename, if (retval < 0) goto out_free; - retval = bprm_execve(bprm, fd, filename, 0); + retval = bprm_execve(bprm); out_free: free_bprm(bprm); out_ret: @@ -2164,7 +2187,6 @@ static struct ctl_table fs_exec_sysctls[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, - { } }; static int __init init_fs_exec_sysctls(void) diff --git a/fs/exfat/balloc.c b/fs/exfat/balloc.c index e918decb3735..0356c88252bd 100644 --- a/fs/exfat/balloc.c +++ b/fs/exfat/balloc.c @@ -5,42 +5,23 @@ #include <linux/blkdev.h> #include <linux/slab.h> +#include <linux/bitmap.h> #include <linux/buffer_head.h> #include "exfat_raw.h" #include "exfat_fs.h" -static const unsigned char free_bit[] = { - 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2,/* 0 ~ 19*/ - 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3,/* 20 ~ 39*/ - 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2,/* 40 ~ 59*/ - 0, 1, 0, 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4,/* 60 ~ 79*/ - 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2,/* 80 ~ 99*/ - 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3,/*100 ~ 119*/ - 0, 1, 0, 2, 0, 1, 0, 7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2,/*120 ~ 139*/ - 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5,/*140 ~ 159*/ - 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2,/*160 ~ 179*/ - 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 6, 0, 1, 0, 2, 0, 1, 0, 3,/*180 ~ 199*/ - 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2,/*200 ~ 219*/ - 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4,/*220 ~ 239*/ - 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 /*240 ~ 254*/ -}; - -static const unsigned char used_bit[] = { - 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3,/* 0 ~ 19*/ - 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4,/* 20 ~ 39*/ - 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5,/* 40 ~ 59*/ - 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,/* 60 ~ 79*/ - 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4,/* 80 ~ 99*/ - 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6,/*100 ~ 119*/ - 4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4,/*120 ~ 139*/ - 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,/*140 ~ 159*/ - 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5,/*160 ~ 179*/ - 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, 3, 4, 4, 5,/*180 ~ 199*/ - 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6,/*200 ~ 219*/ - 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,/*220 ~ 239*/ - 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8 /*240 ~ 255*/ -}; +#if BITS_PER_LONG == 32 +#define __le_long __le32 +#define lel_to_cpu(A) le32_to_cpu(A) +#define cpu_to_lel(A) cpu_to_le32(A) +#elif BITS_PER_LONG == 64 +#define __le_long __le64 +#define lel_to_cpu(A) le64_to_cpu(A) +#define cpu_to_lel(A) cpu_to_le64(A) +#else +#error "BITS_PER_LONG not 32 or 64" +#endif /* * Allocation Bitmap Management Functions @@ -200,32 +181,35 @@ unsigned int exfat_find_free_bitmap(struct super_block *sb, unsigned int clu) { unsigned int i, map_i, map_b, ent_idx; unsigned int clu_base, clu_free; - unsigned char k, clu_mask; + unsigned long clu_bits, clu_mask; struct exfat_sb_info *sbi = EXFAT_SB(sb); + __le_long bitval; WARN_ON(clu < EXFAT_FIRST_CLUSTER); - ent_idx = CLUSTER_TO_BITMAP_ENT(clu); - clu_base = BITMAP_ENT_TO_CLUSTER(ent_idx & ~(BITS_PER_BYTE_MASK)); + ent_idx = ALIGN_DOWN(CLUSTER_TO_BITMAP_ENT(clu), BITS_PER_LONG); + clu_base = BITMAP_ENT_TO_CLUSTER(ent_idx); clu_mask = IGNORED_BITS_REMAINED(clu, clu_base); map_i = BITMAP_OFFSET_SECTOR_INDEX(sb, ent_idx); map_b = BITMAP_OFFSET_BYTE_IN_SECTOR(sb, ent_idx); for (i = EXFAT_FIRST_CLUSTER; i < sbi->num_clusters; - i += BITS_PER_BYTE) { - k = *(sbi->vol_amap[map_i]->b_data + map_b); + i += BITS_PER_LONG) { + bitval = *(__le_long *)(sbi->vol_amap[map_i]->b_data + map_b); if (clu_mask > 0) { - k |= clu_mask; + bitval |= cpu_to_lel(clu_mask); clu_mask = 0; } - if (k < 0xFF) { - clu_free = clu_base + free_bit[k]; + if (lel_to_cpu(bitval) != ULONG_MAX) { + clu_bits = lel_to_cpu(bitval); + clu_free = clu_base + ffz(clu_bits); if (clu_free < sbi->num_clusters) return clu_free; } - clu_base += BITS_PER_BYTE; + clu_base += BITS_PER_LONG; + map_b += sizeof(long); - if (++map_b >= sb->s_blocksize || + if (map_b >= sb->s_blocksize || clu_base >= sbi->num_clusters) { if (++map_i >= sbi->map_sectors) { clu_base = EXFAT_FIRST_CLUSTER; @@ -244,25 +228,24 @@ int exfat_count_used_clusters(struct super_block *sb, unsigned int *ret_count) unsigned int count = 0; unsigned int i, map_i = 0, map_b = 0; unsigned int total_clus = EXFAT_DATA_CLUSTER_COUNT(sbi); - unsigned int last_mask = total_clus & BITS_PER_BYTE_MASK; - unsigned char clu_bits; - const unsigned char last_bit_mask[] = {0, 0b00000001, 0b00000011, - 0b00000111, 0b00001111, 0b00011111, 0b00111111, 0b01111111}; + unsigned int last_mask = total_clus & (BITS_PER_LONG - 1); + unsigned long *bitmap, clu_bits; total_clus &= ~last_mask; - for (i = 0; i < total_clus; i += BITS_PER_BYTE) { - clu_bits = *(sbi->vol_amap[map_i]->b_data + map_b); - count += used_bit[clu_bits]; - if (++map_b >= (unsigned int)sb->s_blocksize) { + for (i = 0; i < total_clus; i += BITS_PER_LONG) { + bitmap = (void *)(sbi->vol_amap[map_i]->b_data + map_b); + count += hweight_long(*bitmap); + map_b += sizeof(long); + if (map_b >= (unsigned int)sb->s_blocksize) { map_i++; map_b = 0; } } if (last_mask) { - clu_bits = *(sbi->vol_amap[map_i]->b_data + map_b); - clu_bits &= last_bit_mask[last_mask]; - count += used_bit[clu_bits]; + bitmap = (void *)(sbi->vol_amap[map_i]->b_data + map_b); + clu_bits = lel_to_cpu(*(__le_long *)bitmap); + count += hweight_long(clu_bits & BITMAP_LAST_WORD_MASK(last_mask)); } *ret_count = count; diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h index a7a2c35d74fb..9474cd50da6d 100644 --- a/fs/exfat/exfat_fs.h +++ b/fs/exfat/exfat_fs.h @@ -135,8 +135,7 @@ enum { #define BITMAP_OFFSET_BIT_IN_SECTOR(sb, ent) (ent & BITS_PER_SECTOR_MASK(sb)) #define BITMAP_OFFSET_BYTE_IN_SECTOR(sb, ent) \ ((ent / BITS_PER_BYTE) & ((sb)->s_blocksize - 1)) -#define BITS_PER_BYTE_MASK 0x7 -#define IGNORED_BITS_REMAINED(clu, clu_base) ((1 << ((clu) - (clu_base))) - 1) +#define IGNORED_BITS_REMAINED(clu, clu_base) ((1UL << ((clu) - (clu_base))) - 1) #define ES_ENTRY_NUM(name_len) (ES_IDX_LAST_FILENAME(name_len) + 1) /* 19 entries = 1 file entry + 1 stream entry + 17 filename entries */ @@ -208,6 +207,7 @@ struct exfat_dir_entry { unsigned char flags; unsigned short attr; loff_t size; + loff_t valid_size; unsigned int num_subdirs; struct timespec64 atime; struct timespec64 mtime; @@ -317,6 +317,7 @@ struct exfat_inode_info { loff_t i_size_aligned; /* on-disk position of directory entry or 0 */ loff_t i_pos; + loff_t valid_size; /* hash by i_location */ struct hlist_node i_hash_fat; /* protect bmap against truncate */ diff --git a/fs/exfat/file.c b/fs/exfat/file.c index bfdfafe00993..d25a96a148af 100644 --- a/fs/exfat/file.c +++ b/fs/exfat/file.c @@ -11,37 +11,76 @@ #include <linux/fsnotify.h> #include <linux/security.h> #include <linux/msdos_fs.h> +#include <linux/writeback.h> #include "exfat_raw.h" #include "exfat_fs.h" static int exfat_cont_expand(struct inode *inode, loff_t size) { - struct address_space *mapping = inode->i_mapping; - loff_t start = i_size_read(inode), count = size - i_size_read(inode); - int err, err2; + int ret; + unsigned int num_clusters, new_num_clusters, last_clu; + struct exfat_inode_info *ei = EXFAT_I(inode); + struct super_block *sb = inode->i_sb; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + struct exfat_chain clu; - err = generic_cont_expand_simple(inode, size); - if (err) - return err; + ret = inode_newsize_ok(inode, size); + if (ret) + return ret; + + num_clusters = EXFAT_B_TO_CLU_ROUND_UP(ei->i_size_ondisk, sbi); + new_num_clusters = EXFAT_B_TO_CLU_ROUND_UP(size, sbi); + + if (new_num_clusters == num_clusters) + goto out; + + exfat_chain_set(&clu, ei->start_clu, num_clusters, ei->flags); + ret = exfat_find_last_cluster(sb, &clu, &last_clu); + if (ret) + return ret; + + clu.dir = (last_clu == EXFAT_EOF_CLUSTER) ? + EXFAT_EOF_CLUSTER : last_clu + 1; + clu.size = 0; + clu.flags = ei->flags; + + ret = exfat_alloc_cluster(inode, new_num_clusters - num_clusters, + &clu, IS_DIRSYNC(inode)); + if (ret) + return ret; + + /* Append new clusters to chain */ + if (clu.flags != ei->flags) { + exfat_chain_cont_cluster(sb, ei->start_clu, num_clusters); + ei->flags = ALLOC_FAT_CHAIN; + } + if (clu.flags == ALLOC_FAT_CHAIN) + if (exfat_ent_set(sb, last_clu, clu.dir)) + goto free_clu; + if (num_clusters == 0) + ei->start_clu = clu.dir; + +out: inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); - mark_inode_dirty(inode); + /* Expanded range not zeroed, do not update valid_size */ + i_size_write(inode, size); - if (!IS_SYNC(inode)) - return 0; + ei->i_size_aligned = round_up(size, sb->s_blocksize); + ei->i_size_ondisk = ei->i_size_aligned; + inode->i_blocks = round_up(size, sbi->cluster_size) >> 9; - err = filemap_fdatawrite_range(mapping, start, start + count - 1); - err2 = sync_mapping_buffers(mapping); - if (!err) - err = err2; - err2 = write_inode_now(inode, 1); - if (!err) - err = err2; - if (err) - return err; + if (IS_DIRSYNC(inode)) + return write_inode_now(inode, 1); + + mark_inode_dirty(inode); - return filemap_fdatawait_range(mapping, start, start + count - 1); + return 0; + +free_clu: + exfat_free_cluster(inode, &clu); + return -EIO; } static bool exfat_allow_set_time(struct exfat_sb_info *sbi, struct inode *inode) @@ -146,6 +185,9 @@ int __exfat_truncate(struct inode *inode) ei->start_clu = EXFAT_EOF_CLUSTER; } + if (i_size_read(inode) < ei->valid_size) + ei->valid_size = i_size_read(inode); + if (ei->type == TYPE_FILE) ei->attr |= EXFAT_ATTR_ARCHIVE; @@ -474,15 +516,124 @@ int exfat_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync) return blkdev_issue_flush(inode->i_sb->s_bdev); } +static int exfat_file_zeroed_range(struct file *file, loff_t start, loff_t end) +{ + int err; + struct inode *inode = file_inode(file); + struct address_space *mapping = inode->i_mapping; + const struct address_space_operations *ops = mapping->a_ops; + + while (start < end) { + u32 zerofrom, len; + struct page *page = NULL; + + zerofrom = start & (PAGE_SIZE - 1); + len = PAGE_SIZE - zerofrom; + if (start + len > end) + len = end - start; + + err = ops->write_begin(file, mapping, start, len, &page, NULL); + if (err) + goto out; + + zero_user_segment(page, zerofrom, zerofrom + len); + + err = ops->write_end(file, mapping, start, len, len, page, NULL); + if (err < 0) + goto out; + start += len; + + balance_dirty_pages_ratelimited(mapping); + cond_resched(); + } + +out: + return err; +} + +static ssize_t exfat_file_write_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + ssize_t ret; + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + struct exfat_inode_info *ei = EXFAT_I(inode); + loff_t pos = iocb->ki_pos; + loff_t valid_size; + + inode_lock(inode); + + valid_size = ei->valid_size; + + ret = generic_write_checks(iocb, iter); + if (ret < 0) + goto unlock; + + if (pos > valid_size) { + ret = exfat_file_zeroed_range(file, valid_size, pos); + if (ret < 0 && ret != -ENOSPC) { + exfat_err(inode->i_sb, + "write: fail to zero from %llu to %llu(%zd)", + valid_size, pos, ret); + } + if (ret < 0) + goto unlock; + } + + ret = __generic_file_write_iter(iocb, iter); + if (ret < 0) + goto unlock; + + inode_unlock(inode); + + if (pos > valid_size) + pos = valid_size; + + if (iocb_is_dsync(iocb) && iocb->ki_pos > pos) { + ssize_t err = vfs_fsync_range(file, pos, iocb->ki_pos - 1, + iocb->ki_flags & IOCB_SYNC); + if (err < 0) + return err; + } + + return ret; + +unlock: + inode_unlock(inode); + + return ret; +} + +static int exfat_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + int ret; + struct inode *inode = file_inode(file); + struct exfat_inode_info *ei = EXFAT_I(inode); + loff_t start = ((loff_t)vma->vm_pgoff << PAGE_SHIFT); + loff_t end = min_t(loff_t, i_size_read(inode), + start + vma->vm_end - vma->vm_start); + + if ((vma->vm_flags & VM_WRITE) && ei->valid_size < end) { + ret = exfat_file_zeroed_range(file, ei->valid_size, end); + if (ret < 0) { + exfat_err(inode->i_sb, + "mmap: fail to zero from %llu to %llu(%d)", + start, end, ret); + return ret; + } + } + + return generic_file_mmap(file, vma); +} + const struct file_operations exfat_file_operations = { .llseek = generic_file_llseek, .read_iter = generic_file_read_iter, - .write_iter = generic_file_write_iter, + .write_iter = exfat_file_write_iter, .unlocked_ioctl = exfat_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = exfat_compat_ioctl, #endif - .mmap = generic_file_mmap, + .mmap = exfat_file_mmap, .fsync = exfat_file_fsync, .splice_read = filemap_splice_read, .splice_write = iter_file_splice_write, diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c index e7ff58b8e68c..0687f952956c 100644 --- a/fs/exfat/inode.c +++ b/fs/exfat/inode.c @@ -75,8 +75,17 @@ int __exfat_write_inode(struct inode *inode, int sync) if (ei->start_clu == EXFAT_EOF_CLUSTER) on_disk_size = 0; - ep2->dentry.stream.valid_size = cpu_to_le64(on_disk_size); - ep2->dentry.stream.size = ep2->dentry.stream.valid_size; + ep2->dentry.stream.size = cpu_to_le64(on_disk_size); + /* + * mmap write does not use exfat_write_end(), valid_size may be + * extended to the sector-aligned length in exfat_get_block(). + * So we need to fixup valid_size to the writren length. + */ + if (on_disk_size < ei->valid_size) + ep2->dentry.stream.valid_size = ep2->dentry.stream.size; + else + ep2->dentry.stream.valid_size = cpu_to_le64(ei->valid_size); + if (on_disk_size) { ep2->dentry.stream.flags = ei->flags; ep2->dentry.stream.start_clu = cpu_to_le32(ei->start_clu); @@ -278,6 +287,7 @@ static int exfat_get_block(struct inode *inode, sector_t iblock, unsigned int cluster, sec_offset; sector_t last_block; sector_t phys = 0; + sector_t valid_blks; loff_t pos; mutex_lock(&sbi->s_lock); @@ -306,17 +316,32 @@ static int exfat_get_block(struct inode *inode, sector_t iblock, mapped_blocks = sbi->sect_per_clus - sec_offset; max_blocks = min(mapped_blocks, max_blocks); - /* Treat newly added block / cluster */ - if (iblock < last_block) - create = 0; - - if (create || buffer_delay(bh_result)) { - pos = EXFAT_BLK_TO_B((iblock + 1), sb); + pos = EXFAT_BLK_TO_B((iblock + 1), sb); + if ((create && iblock >= last_block) || buffer_delay(bh_result)) { if (ei->i_size_ondisk < pos) ei->i_size_ondisk = pos; } + map_bh(bh_result, sb, phys); + if (buffer_delay(bh_result)) + clear_buffer_delay(bh_result); + if (create) { + valid_blks = EXFAT_B_TO_BLK_ROUND_UP(ei->valid_size, sb); + + if (iblock + max_blocks < valid_blks) { + /* The range has been written, map it */ + goto done; + } else if (iblock < valid_blks) { + /* + * The range has been partially written, + * map the written part. + */ + max_blocks = valid_blks - iblock; + goto done; + } + + /* The area has not been written, map and mark as new. */ err = exfat_map_new_buffer(ei, bh_result, pos); if (err) { exfat_fs_error(sb, @@ -324,11 +349,58 @@ static int exfat_get_block(struct inode *inode, sector_t iblock, pos, ei->i_size_aligned); goto unlock_ret; } - } - if (buffer_delay(bh_result)) - clear_buffer_delay(bh_result); - map_bh(bh_result, sb, phys); + ei->valid_size = EXFAT_BLK_TO_B(iblock + max_blocks, sb); + mark_inode_dirty(inode); + } else { + valid_blks = EXFAT_B_TO_BLK(ei->valid_size, sb); + + if (iblock + max_blocks < valid_blks) { + /* The range has been written, map it */ + goto done; + } else if (iblock < valid_blks) { + /* + * The area has been partially written, + * map the written part. + */ + max_blocks = valid_blks - iblock; + goto done; + } else if (iblock == valid_blks && + (ei->valid_size & (sb->s_blocksize - 1))) { + /* + * The block has been partially written, + * zero the unwritten part and map the block. + */ + loff_t size, off; + + max_blocks = 1; + + /* + * For direct read, the unwritten part will be zeroed in + * exfat_direct_IO() + */ + if (!bh_result->b_folio) + goto done; + + pos -= sb->s_blocksize; + size = ei->valid_size - pos; + off = pos & (PAGE_SIZE - 1); + + folio_set_bh(bh_result, bh_result->b_folio, off); + err = bh_read(bh_result, 0); + if (err < 0) + goto unlock_ret; + + folio_zero_segment(bh_result->b_folio, off + size, + off + sb->s_blocksize); + } else { + /* + * The range has not been written, clear the mapped flag + * to only zero the cache and do not read from disk. + */ + clear_buffer_mapped(bh_result); + } + } done: bh_result->b_size = EXFAT_BLK_TO_B(max_blocks, sb); unlock_ret: @@ -343,6 +415,17 @@ static int exfat_read_folio(struct file *file, struct folio *folio) static void exfat_readahead(struct readahead_control *rac) { + struct address_space *mapping = rac->mapping; + struct inode *inode = mapping->host; + struct exfat_inode_info *ei = EXFAT_I(inode); + loff_t pos = readahead_pos(rac); + + /* Range cross valid_size, read it page by page. */ + if (ei->valid_size < i_size_read(inode) && + pos <= ei->valid_size && + ei->valid_size < pos + readahead_length(rac)) + return; + mpage_readahead(rac, exfat_get_block); } @@ -370,9 +453,7 @@ static int exfat_write_begin(struct file *file, struct address_space *mapping, int ret; *pagep = NULL; - ret = cont_write_begin(file, mapping, pos, len, pagep, fsdata, - exfat_get_block, - &EXFAT_I(mapping->host)->i_size_ondisk); + ret = block_write_begin(mapping, pos, len, pagep, exfat_get_block); if (ret < 0) exfat_write_failed(mapping, pos+len); @@ -400,6 +481,11 @@ static int exfat_write_end(struct file *file, struct address_space *mapping, if (err < len) exfat_write_failed(mapping, pos+len); + if (!(err < 0) && pos + err > ei->valid_size) { + ei->valid_size = pos + err; + mark_inode_dirty(inode); + } + if (!(err < 0) && !(ei->attr & EXFAT_ATTR_ARCHIVE)) { inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); ei->attr |= EXFAT_ATTR_ARCHIVE; @@ -413,7 +499,9 @@ static ssize_t exfat_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct address_space *mapping = iocb->ki_filp->f_mapping; struct inode *inode = mapping->host; - loff_t size = iocb->ki_pos + iov_iter_count(iter); + struct exfat_inode_info *ei = EXFAT_I(inode); + loff_t pos = iocb->ki_pos; + loff_t size = pos + iov_iter_count(iter); int rw = iov_iter_rw(iter); ssize_t ret; @@ -436,8 +524,20 @@ static ssize_t exfat_direct_IO(struct kiocb *iocb, struct iov_iter *iter) * condition of exfat_get_block() and ->truncate(). */ ret = blockdev_direct_IO(iocb, inode, iter, exfat_get_block); - if (ret < 0 && (rw & WRITE)) - exfat_write_failed(mapping, size); + if (ret < 0) { + if (rw == WRITE && ret != -EIOCBQUEUED) + exfat_write_failed(mapping, size); + + return ret; + } else + size = pos + ret; + + /* zero the unwritten part in the partially written block */ + if (rw == READ && pos < ei->valid_size && ei->valid_size < size) { + iov_iter_revert(iter, size - ei->valid_size); + iov_iter_zero(size - ei->valid_size, iter); + } + return ret; } @@ -537,6 +637,7 @@ static int exfat_fill_inode(struct inode *inode, struct exfat_dir_entry *info) ei->start_clu = info->start_clu; ei->flags = info->flags; ei->type = info->type; + ei->valid_size = info->valid_size; ei->version = 0; ei->hint_stat.eidx = 0; diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c index 5d737e0b639a..9c549fd11fc8 100644 --- a/fs/exfat/namei.c +++ b/fs/exfat/namei.c @@ -406,6 +406,7 @@ static int exfat_find_empty_entry(struct inode *inode, i_size_write(inode, size); ei->i_size_ondisk += sbi->cluster_size; ei->i_size_aligned += sbi->cluster_size; + ei->valid_size += sbi->cluster_size; ei->flags = p_dir->flags; inode->i_blocks += sbi->cluster_size >> 9; } @@ -558,6 +559,8 @@ static int exfat_add_entry(struct inode *inode, const char *path, info->size = clu_size; info->num_subdirs = EXFAT_MIN_SUBDIR; } + info->valid_size = info->size; + memset(&info->crtime, 0, sizeof(info->crtime)); memset(&info->mtime, 0, sizeof(info->mtime)); memset(&info->atime, 0, sizeof(info->atime)); @@ -660,6 +663,8 @@ static int exfat_find(struct inode *dir, struct qstr *qname, info->type = exfat_get_entry_type(ep); info->attr = le16_to_cpu(ep->dentry.file.attr); info->size = le64_to_cpu(ep2->dentry.stream.valid_size); + info->valid_size = le64_to_cpu(ep2->dentry.stream.valid_size); + info->size = le64_to_cpu(ep2->dentry.stream.size); if (info->size == 0) { info->flags = ALLOC_NO_FAT_CHAIN; info->start_clu = EXFAT_EOF_CLUSTER; @@ -1288,6 +1293,7 @@ static int __exfat_rename(struct inode *old_parent_inode, } i_size_write(new_inode, 0); + new_ei->valid_size = 0; new_ei->start_clu = EXFAT_EOF_CLUSTER; new_ei->flags = ALLOC_NO_FAT_CHAIN; } diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index 65f702b1da5b..8346ab9534c1 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -325,6 +325,7 @@ static int ext2_rename (struct mnt_idmap * idmap, struct ext2_dir_entry_2 * dir_de = NULL; struct folio * old_folio; struct ext2_dir_entry_2 * old_de; + bool old_is_dir = S_ISDIR(old_inode->i_mode); int err; if (flags & ~RENAME_NOREPLACE) @@ -342,7 +343,7 @@ static int ext2_rename (struct mnt_idmap * idmap, if (IS_ERR(old_de)) return PTR_ERR(old_de); - if (S_ISDIR(old_inode->i_mode)) { + if (old_is_dir && old_dir != new_dir) { err = -EIO; dir_de = ext2_dotdot(old_inode, &dir_folio); if (!dir_de) @@ -354,7 +355,7 @@ static int ext2_rename (struct mnt_idmap * idmap, struct ext2_dir_entry_2 *new_de; err = -ENOTEMPTY; - if (dir_de && !ext2_empty_dir (new_inode)) + if (old_is_dir && !ext2_empty_dir(new_inode)) goto out_dir; new_de = ext2_find_entry(new_dir, &new_dentry->d_name, @@ -368,14 +369,14 @@ static int ext2_rename (struct mnt_idmap * idmap, if (err) goto out_dir; inode_set_ctime_current(new_inode); - if (dir_de) + if (old_is_dir) drop_nlink(new_inode); inode_dec_link_count(new_inode); } else { err = ext2_add_link(new_dentry, old_inode); if (err) goto out_dir; - if (dir_de) + if (old_is_dir) inode_inc_link_count(new_dir); } @@ -387,7 +388,7 @@ static int ext2_rename (struct mnt_idmap * idmap, mark_inode_dirty(old_inode); err = ext2_delete_entry(old_de, old_folio); - if (!err && dir_de) { + if (!err && old_is_dir) { if (old_dir != new_dir) err = ext2_set_link(old_inode, dir_de, dir_folio, new_dir, false); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index d252935f9c8a..05b647e6bc19 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2388,8 +2388,6 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, sb = dir->i_sb; blocksize = sb->s_blocksize; - if (!dentry->d_name.len) - return -EINVAL; if (fscrypt_is_nokey_name(dentry)) return -ENOKEY; @@ -3591,10 +3589,14 @@ struct ext4_renament { int dir_inlined; }; -static int ext4_rename_dir_prepare(handle_t *handle, struct ext4_renament *ent) +static int ext4_rename_dir_prepare(handle_t *handle, struct ext4_renament *ent, bool is_cross) { int retval; + ent->is_dir = true; + if (!is_cross) + return 0; + ent->dir_bh = ext4_get_first_dir_block(handle, ent->inode, &retval, &ent->parent_de, &ent->dir_inlined); @@ -3612,6 +3614,9 @@ static int ext4_rename_dir_finish(handle_t *handle, struct ext4_renament *ent, { int retval; + if (!ent->dir_bh) + return 0; + ent->parent_de->inode = cpu_to_le32(dir_ino); BUFFER_TRACE(ent->dir_bh, "call ext4_handle_dirty_metadata"); if (!ent->dir_inlined) { @@ -3900,7 +3905,7 @@ static int ext4_rename(struct mnt_idmap *idmap, struct inode *old_dir, if (new.dir != old.dir && EXT4_DIR_LINK_MAX(new.dir)) goto end_rename; } - retval = ext4_rename_dir_prepare(handle, &old); + retval = ext4_rename_dir_prepare(handle, &old, new.dir != old.dir); if (retval) goto end_rename; } @@ -3964,7 +3969,7 @@ static int ext4_rename(struct mnt_idmap *idmap, struct inode *old_dir, } inode_set_mtime_to_ts(old.dir, inode_set_ctime_current(old.dir)); ext4_update_dx_flag(old.dir); - if (old.dir_bh) { + if (old.is_dir) { retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino); if (retval) goto end_rename; @@ -3987,7 +3992,7 @@ static int ext4_rename(struct mnt_idmap *idmap, struct inode *old_dir, if (unlikely(retval)) goto end_rename; - if (S_ISDIR(old.inode->i_mode)) { + if (old.is_dir) { /* * We disable fast commits here that's because the * replay code is not yet capable of changing dot dot @@ -4114,14 +4119,12 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, ext4_handle_sync(handle); if (S_ISDIR(old.inode->i_mode)) { - old.is_dir = true; - retval = ext4_rename_dir_prepare(handle, &old); + retval = ext4_rename_dir_prepare(handle, &old, new.dir != old.dir); if (retval) goto end_rename; } if (S_ISDIR(new.inode->i_mode)) { - new.is_dir = true; - retval = ext4_rename_dir_prepare(handle, &new); + retval = ext4_rename_dir_prepare(handle, &new, new.dir != old.dir); if (retval) goto end_rename; } diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 6b2af514660d..531517dac079 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1036,8 +1036,10 @@ static void set_cluster_dirty(struct compress_ctx *cc) int i; for (i = 0; i < cc->cluster_size; i++) - if (cc->rpages[i]) + if (cc->rpages[i]) { set_page_dirty(cc->rpages[i]); + set_page_private_gcing(cc->rpages[i]); + } } static int prepare_compress_overwrite(struct compress_ctx *cc, @@ -1369,8 +1371,6 @@ unlock_continue: add_compr_block_stat(inode, cc->valid_nr_cpages); set_inode_flag(cc->inode, FI_APPEND_WRITE); - if (cc->cluster_idx == 0) - set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); f2fs_put_dnode(&dn); if (quota_inode) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 4e42b5f24deb..26e317696b33 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -995,7 +995,7 @@ static bool is_end_zone_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr) } blkaddr -= FDEV(devi).start_blk; } - return bdev_zoned_model(FDEV(devi).bdev) == BLK_ZONED_HM && + return bdev_is_zoned(FDEV(devi).bdev) && f2fs_blkz_is_seq(sbi, devi, blkaddr) && (blkaddr % sbi->blocks_per_blkz == sbi->blocks_per_blkz - 1); } @@ -1179,18 +1179,12 @@ static int f2fs_submit_page_read(struct inode *inode, struct page *page, return 0; } -static void __set_data_blkaddr(struct dnode_of_data *dn) +static void __set_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr) { - struct f2fs_node *rn = F2FS_NODE(dn->node_page); - __le32 *addr_array; - int base = 0; + __le32 *addr = get_dnode_addr(dn->inode, dn->node_page); - if (IS_INODE(dn->node_page) && f2fs_has_extra_attr(dn->inode)) - base = get_extra_isize(dn->inode); - - /* Get physical address of data block */ - addr_array = blkaddr_in_node(rn); - addr_array[base + dn->ofs_in_node] = cpu_to_le32(dn->data_blkaddr); + dn->data_blkaddr = blkaddr; + addr[dn->ofs_in_node] = cpu_to_le32(dn->data_blkaddr); } /* @@ -1199,18 +1193,17 @@ static void __set_data_blkaddr(struct dnode_of_data *dn) * ->node_page * update block addresses in the node page */ -void f2fs_set_data_blkaddr(struct dnode_of_data *dn) +void f2fs_set_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr) { f2fs_wait_on_page_writeback(dn->node_page, NODE, true, true); - __set_data_blkaddr(dn); + __set_data_blkaddr(dn, blkaddr); if (set_page_dirty(dn->node_page)) dn->node_changed = true; } void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr) { - dn->data_blkaddr = blkaddr; - f2fs_set_data_blkaddr(dn); + f2fs_set_data_blkaddr(dn, blkaddr); f2fs_update_read_extent_cache(dn); } @@ -1237,8 +1230,7 @@ int f2fs_reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count) block_t blkaddr = f2fs_data_blkaddr(dn); if (blkaddr == NULL_ADDR) { - dn->data_blkaddr = NEW_ADDR; - __set_data_blkaddr(dn); + __set_data_blkaddr(dn, NEW_ADDR); count--; } } @@ -1492,11 +1484,9 @@ static int __allocate_data_block(struct dnode_of_data *dn, int seg_type) old_blkaddr = dn->data_blkaddr; f2fs_allocate_data_block(sbi, NULL, old_blkaddr, &dn->data_blkaddr, &sum, seg_type, NULL); - if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) { - invalidate_mapping_pages(META_MAPPING(sbi), - old_blkaddr, old_blkaddr); - f2fs_invalidate_compress_page(sbi, old_blkaddr); - } + if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) + f2fs_invalidate_internal_cache(sbi, old_blkaddr); + f2fs_update_data_blkaddr(dn, dn->data_blkaddr); return 0; } @@ -1992,7 +1982,7 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (ret) return ret; - inode_lock(inode); + inode_lock_shared(inode); maxbytes = max_file_blocks(inode) << F2FS_BLKSIZE_BITS; if (start > maxbytes) { @@ -2112,7 +2102,7 @@ out: if (ret == 1) ret = 0; - inode_unlock(inode); + inode_unlock_shared(inode); return ret; } @@ -2566,9 +2556,6 @@ int f2fs_encrypt_one_page(struct f2fs_io_info *fio) page = fio->compressed_page ? fio->compressed_page : fio->page; - /* wait for GCed page writeback via META_MAPPING */ - f2fs_wait_on_block_writeback(inode, fio->old_blkaddr); - if (fscrypt_inode_uses_inline_crypto(inode)) return 0; @@ -2755,6 +2742,10 @@ got_it: goto out_writepage; } + /* wait for GCed page writeback via META_MAPPING */ + if (fio->post_read) + f2fs_wait_on_block_writeback(inode, fio->old_blkaddr); + /* * If current allocation needs SSR, * it had better in-place writes for updated data. @@ -2810,8 +2801,6 @@ got_it: f2fs_outplace_write_data(&dn, fio); trace_f2fs_do_write_data_page(page, OPU); set_inode_flag(inode, FI_APPEND_WRITE); - if (page->index == 0) - set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); out_writepage: f2fs_put_dnode(&dn); out: @@ -2894,9 +2883,6 @@ int f2fs_write_single_data_page(struct page *page, int *submitted, zero_user_segment(page, offset, PAGE_SIZE); write: - if (f2fs_is_drop_cache(inode)) - goto out; - /* Dentry/quota blocks are controlled by checkpoint */ if (S_ISDIR(inode->i_mode) || quota_inode) { /* diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 9043cedfa12b..65294e3b0bef 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -374,6 +374,12 @@ enum { MAX_DPOLICY, }; +enum { + DPOLICY_IO_AWARE_DISABLE, /* force to not be aware of IO */ + DPOLICY_IO_AWARE_ENABLE, /* force to be aware of IO */ + DPOLICY_IO_AWARE_MAX, +}; + struct discard_policy { int type; /* type of discard */ unsigned int min_interval; /* used for candidates exist */ @@ -406,6 +412,7 @@ struct discard_cmd_control { unsigned int discard_urgent_util; /* utilization which issue discard proactively */ unsigned int discard_granularity; /* discard granularity */ unsigned int max_ordered_discard; /* maximum discard granularity issued by lba order */ + unsigned int discard_io_aware; /* io_aware policy */ unsigned int undiscard_blks; /* # of undiscard blocks */ unsigned int next_pos; /* next discard position */ atomic_t issued_discard; /* # of issued discard */ @@ -774,8 +781,6 @@ enum { FI_UPDATE_WRITE, /* inode has in-place-update data */ FI_NEED_IPU, /* used for ipu per file */ FI_ATOMIC_FILE, /* indicate atomic file */ - FI_FIRST_BLOCK_WRITTEN, /* indicate #0 data block was written */ - FI_DROP_CACHE, /* drop dirty page cache */ FI_DATA_EXIST, /* indicate data exists */ FI_INLINE_DOTS, /* indicate inline dot dentries */ FI_SKIP_WRITES, /* should skip data page writeback */ @@ -3272,22 +3277,13 @@ static inline bool f2fs_is_cow_file(struct inode *inode) return is_inode_flag_set(inode, FI_COW_FILE); } -static inline bool f2fs_is_first_block_written(struct inode *inode) -{ - return is_inode_flag_set(inode, FI_FIRST_BLOCK_WRITTEN); -} - -static inline bool f2fs_is_drop_cache(struct inode *inode) -{ - return is_inode_flag_set(inode, FI_DROP_CACHE); -} - +static inline __le32 *get_dnode_addr(struct inode *inode, + struct page *node_page); static inline void *inline_data_addr(struct inode *inode, struct page *page) { - struct f2fs_inode *ri = F2FS_INODE(page); - int extra_size = get_extra_isize(inode); + __le32 *addr = get_dnode_addr(inode, page); - return (void *)&(ri->i_addr[extra_size + DEF_INLINE_RESERVED_SIZE]); + return (void *)(addr + DEF_INLINE_RESERVED_SIZE); } static inline int f2fs_has_inline_dentry(struct inode *inode) @@ -3432,6 +3428,17 @@ static inline int get_inline_xattr_addrs(struct inode *inode) return F2FS_I(inode)->i_inline_xattr_size; } +static inline __le32 *get_dnode_addr(struct inode *inode, + struct page *node_page) +{ + int base = 0; + + if (IS_INODE(node_page) && f2fs_has_extra_attr(inode)) + base = get_extra_isize(inode); + + return blkaddr_in_node(F2FS_NODE(node_page)) + base; +} + #define f2fs_get_inode_mode(i) \ ((is_inode_flag_set(i, FI_ACL_MODE)) ? \ (F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) @@ -3815,7 +3822,7 @@ void f2fs_submit_page_write(struct f2fs_io_info *fio); struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi, block_t blk_addr, sector_t *sector); int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr); -void f2fs_set_data_blkaddr(struct dnode_of_data *dn); +void f2fs_set_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr); void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr); int f2fs_reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count); int f2fs_reserve_new_block(struct dnode_of_data *dn); @@ -4606,6 +4613,13 @@ static inline bool f2fs_is_readonly(struct f2fs_sb_info *sbi) return f2fs_sb_has_readonly(sbi) || f2fs_readonly(sbi->sb); } +static inline void f2fs_invalidate_internal_cache(struct f2fs_sb_info *sbi, + block_t blkaddr) +{ + invalidate_mapping_pages(META_MAPPING(sbi), blkaddr, blkaddr); + f2fs_invalidate_compress_page(sbi, blkaddr); +} + #define EFSBADCRC EBADMSG /* Bad CRC detected */ #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 4580dfefd5e9..b58ab1157b7e 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -42,11 +42,11 @@ static vm_fault_t f2fs_filemap_fault(struct vm_fault *vmf) vm_fault_t ret; ret = filemap_fault(vmf); - if (!ret) + if (ret & VM_FAULT_LOCKED) f2fs_update_iostat(F2FS_I_SB(inode), inode, APP_MAPPED_READ_IO, F2FS_BLKSIZE); - trace_f2fs_filemap_fault(inode, vmf->pgoff, (unsigned long)ret); + trace_f2fs_filemap_fault(inode, vmf->pgoff, vmf->vma->vm_flags, ret); return ret; } @@ -59,26 +59,29 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) struct dnode_of_data dn; bool need_alloc = true; int err = 0; + vm_fault_t ret; if (unlikely(IS_IMMUTABLE(inode))) return VM_FAULT_SIGBUS; - if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) - return VM_FAULT_SIGBUS; + if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) { + err = -EIO; + goto out; + } if (unlikely(f2fs_cp_error(sbi))) { err = -EIO; - goto err; + goto out; } if (!f2fs_is_checkpoint_ready(sbi)) { err = -ENOSPC; - goto err; + goto out; } err = f2fs_convert_inline_inode(inode); if (err) - goto err; + goto out; #ifdef CONFIG_F2FS_FS_COMPRESSION if (f2fs_compressed_file(inode)) { @@ -86,7 +89,7 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) if (ret < 0) { err = ret; - goto err; + goto out; } else if (ret) { need_alloc = false; } @@ -153,13 +156,15 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) f2fs_update_iostat(sbi, inode, APP_MAPPED_IO, F2FS_BLKSIZE); f2fs_update_time(sbi, REQ_TIME); - trace_f2fs_vm_page_mkwrite(page, DATA); out_sem: filemap_invalidate_unlock_shared(inode->i_mapping); sb_end_pagefault(inode->i_sb); -err: - return vmf_fs_error(err); +out: + ret = vmf_fs_error(err); + + trace_f2fs_vm_page_mkwrite(inode, page->index, vmf->vma->vm_flags, ret); + return ret; } static const struct vm_operations_struct f2fs_file_vm_ops = { @@ -418,7 +423,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) loff_t isize; int err = 0; - inode_lock(inode); + inode_lock_shared(inode); isize = i_size_read(inode); if (offset >= isize) @@ -483,10 +488,10 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) found: if (whence == SEEK_HOLE && data_ofs > isize) data_ofs = isize; - inode_unlock(inode); + inode_unlock_shared(inode); return vfs_setpos(file, data_ofs, maxbytes); fail: - inode_unlock(inode); + inode_unlock_shared(inode); return -ENXIO; } @@ -557,20 +562,14 @@ static int f2fs_file_open(struct inode *inode, struct file *filp) void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count) { struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); - struct f2fs_node *raw_node; int nr_free = 0, ofs = dn->ofs_in_node, len = count; __le32 *addr; - int base = 0; bool compressed_cluster = false; int cluster_index = 0, valid_blocks = 0; int cluster_size = F2FS_I(dn->inode)->i_cluster_size; bool released = !atomic_read(&F2FS_I(dn->inode)->i_compr_blocks); - if (IS_INODE(dn->node_page) && f2fs_has_extra_attr(dn->inode)) - base = get_extra_isize(dn->inode); - - raw_node = F2FS_NODE(dn->node_page); - addr = blkaddr_in_node(raw_node) + base + ofs; + addr = get_dnode_addr(dn->inode, dn->node_page) + ofs; /* Assumption: truncation starts with cluster */ for (; count > 0; count--, addr++, dn->ofs_in_node++, cluster_index++) { @@ -588,8 +587,7 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count) if (blkaddr == NULL_ADDR) continue; - dn->data_blkaddr = NULL_ADDR; - f2fs_set_data_blkaddr(dn); + f2fs_set_data_blkaddr(dn, NULL_ADDR); if (__is_valid_data_blkaddr(blkaddr)) { if (!f2fs_is_valid_blkaddr(sbi, blkaddr, @@ -599,9 +597,6 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count) valid_blocks++; } - if (dn->ofs_in_node == 0 && IS_INODE(dn->node_page)) - clear_inode_flag(dn->inode, FI_FIRST_BLOCK_WRITTEN); - f2fs_invalidate_blocks(sbi, blkaddr); if (!released || blkaddr != COMPRESS_ADDR) @@ -1317,6 +1312,7 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode, } memcpy_page(pdst, 0, psrc, 0, PAGE_SIZE); set_page_dirty(pdst); + set_page_private_gcing(pdst); f2fs_put_page(pdst, 1); f2fs_put_page(psrc, 1); @@ -1487,8 +1483,7 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, } f2fs_invalidate_blocks(sbi, dn->data_blkaddr); - dn->data_blkaddr = NEW_ADDR; - f2fs_set_data_blkaddr(dn); + f2fs_set_data_blkaddr(dn, NEW_ADDR); } f2fs_update_read_extent_cache_range(dn, start, 0, index - start); @@ -2818,6 +2813,11 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, goto out; } + if (f2fs_compressed_file(src) || f2fs_compressed_file(dst)) { + ret = -EOPNOTSUPP; + goto out_unlock; + } + ret = -EINVAL; if (pos_in + len > src->i_size || pos_in + len < pos_in) goto out_unlock; @@ -3463,8 +3463,7 @@ static int release_compress_blocks(struct dnode_of_data *dn, pgoff_t count) if (blkaddr != NEW_ADDR) continue; - dn->data_blkaddr = NULL_ADDR; - f2fs_set_data_blkaddr(dn); + f2fs_set_data_blkaddr(dn, NULL_ADDR); } f2fs_i_compr_blocks_update(dn->inode, compr_blocks, false); @@ -3630,8 +3629,7 @@ static int reserve_compress_blocks(struct dnode_of_data *dn, pgoff_t count) continue; } - dn->data_blkaddr = NEW_ADDR; - f2fs_set_data_blkaddr(dn); + f2fs_set_data_blkaddr(dn, NEW_ADDR); } reserved = cluster_size - compr_blocks; @@ -4054,6 +4052,7 @@ static int redirty_blocks(struct inode *inode, pgoff_t page_idx, int len) f2fs_bug_on(F2FS_I_SB(inode), !page); set_page_dirty(page); + set_page_private_gcing(page); f2fs_put_page(page, 1); f2fs_put_page(page, 0); } @@ -4568,7 +4567,8 @@ static int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *iter, if (map.m_len > map.m_lblk) map.m_len -= map.m_lblk; else - map.m_len = 0; + return 0; + map.m_may_create = true; if (dio) { map.m_seg_type = f2fs_rw_hint_to_seg_type(inode->i_write_hint); diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index f550cdeaa663..a079eebfb080 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -46,8 +46,8 @@ static int gc_thread_func(void *data) do { bool sync_mode, foreground = false; - wait_event_interruptible_timeout(*wq, - kthread_should_stop() || freezing(current) || + wait_event_freezable_timeout(*wq, + kthread_should_stop() || waitqueue_active(fggc_wq) || gc_th->gc_wake, msecs_to_jiffies(wait_ms)); @@ -59,7 +59,7 @@ static int gc_thread_func(void *data) if (gc_th->gc_wake) gc_th->gc_wake = false; - if (try_to_freeze() || f2fs_readonly(sbi->sb)) { + if (f2fs_readonly(sbi->sb)) { stat_other_skip_bggc_count(sbi); continue; } @@ -1380,9 +1380,8 @@ static int move_data_block(struct inode *inode, block_t bidx, memcpy(page_address(fio.encrypted_page), page_address(mpage), PAGE_SIZE); f2fs_put_page(mpage, 1); - invalidate_mapping_pages(META_MAPPING(fio.sbi), - fio.old_blkaddr, fio.old_blkaddr); - f2fs_invalidate_compress_page(fio.sbi, fio.old_blkaddr); + + f2fs_invalidate_internal_cache(fio.sbi, fio.old_blkaddr); set_page_dirty(fio.encrypted_page); if (clear_page_dirty_for_io(fio.encrypted_page)) @@ -1405,8 +1404,6 @@ static int move_data_block(struct inode *inode, block_t bidx, f2fs_update_data_blkaddr(&dn, newaddr); set_inode_flag(inode, FI_APPEND_WRITE); - if (page->index == 0) - set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); put_page_out: f2fs_put_page(fio.encrypted_page, 1); recover_block: @@ -1868,6 +1865,9 @@ retry: seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type, gc_control->should_migrate_blocks); + if (seg_freed < 0) + goto stop; + total_freed += seg_freed; if (seg_freed == f2fs_usable_segs_in_sec(sbi, segno)) { diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index a9eb3891f417..c26effdce9aa 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -61,49 +61,31 @@ void f2fs_set_inode_flags(struct inode *inode) S_ENCRYPTED|S_VERITY|S_CASEFOLD); } -static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri) +static void __get_inode_rdev(struct inode *inode, struct page *node_page) { - int extra_size = get_extra_isize(inode); + __le32 *addr = get_dnode_addr(inode, node_page); if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { - if (ri->i_addr[extra_size]) - inode->i_rdev = old_decode_dev( - le32_to_cpu(ri->i_addr[extra_size])); + if (addr[0]) + inode->i_rdev = old_decode_dev(le32_to_cpu(addr[0])); else - inode->i_rdev = new_decode_dev( - le32_to_cpu(ri->i_addr[extra_size + 1])); + inode->i_rdev = new_decode_dev(le32_to_cpu(addr[1])); } } -static int __written_first_block(struct f2fs_sb_info *sbi, - struct f2fs_inode *ri) +static void __set_inode_rdev(struct inode *inode, struct page *node_page) { - block_t addr = le32_to_cpu(ri->i_addr[offset_in_addr(ri)]); - - if (!__is_valid_data_blkaddr(addr)) - return 1; - if (!f2fs_is_valid_blkaddr(sbi, addr, DATA_GENERIC_ENHANCE)) { - f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); - return -EFSCORRUPTED; - } - return 0; -} - -static void __set_inode_rdev(struct inode *inode, struct f2fs_inode *ri) -{ - int extra_size = get_extra_isize(inode); + __le32 *addr = get_dnode_addr(inode, node_page); if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { if (old_valid_dev(inode->i_rdev)) { - ri->i_addr[extra_size] = - cpu_to_le32(old_encode_dev(inode->i_rdev)); - ri->i_addr[extra_size + 1] = 0; + addr[0] = cpu_to_le32(old_encode_dev(inode->i_rdev)); + addr[1] = 0; } else { - ri->i_addr[extra_size] = 0; - ri->i_addr[extra_size + 1] = - cpu_to_le32(new_encode_dev(inode->i_rdev)); - ri->i_addr[extra_size + 2] = 0; + addr[0] = 0; + addr[1] = cpu_to_le32(new_encode_dev(inode->i_rdev)); + addr[2] = 0; } } } @@ -398,7 +380,6 @@ static int do_read_inode(struct inode *inode) struct page *node_page; struct f2fs_inode *ri; projid_t i_projid; - int err; /* Check if ino is within scope */ if (f2fs_check_nid_range(sbi, inode->i_ino)) @@ -478,17 +459,7 @@ static int do_read_inode(struct inode *inode) } /* get rdev by using inline_info */ - __get_inode_rdev(inode, ri); - - if (S_ISREG(inode->i_mode)) { - err = __written_first_block(sbi, ri); - if (err < 0) { - f2fs_put_page(node_page, 1); - return err; - } - if (!err) - set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); - } + __get_inode_rdev(inode, node_page); if (!f2fs_need_inode_block_update(sbi, inode->i_ino)) fi->last_disk_size = inode->i_size; @@ -761,7 +732,7 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page) } } - __set_inode_rdev(inode, ri); + __set_inode_rdev(inode, node_page); /* deleted inode */ if (inode->i_nlink == 0) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index d0053b0284d8..b3bb815fc6aa 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -459,7 +459,6 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct qstr dot = QSTR_INIT(".", 1); - struct qstr dotdot = QSTR_INIT("..", 2); struct f2fs_dir_entry *de; struct page *page; int err = 0; @@ -497,13 +496,13 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino) goto out; } - de = f2fs_find_entry(dir, &dotdot, &page); + de = f2fs_find_entry(dir, &dotdot_name, &page); if (de) f2fs_put_page(page, 0); else if (IS_ERR(page)) err = PTR_ERR(page); else - err = f2fs_do_add_link(dir, &dotdot, NULL, pino, S_IFDIR); + err = f2fs_do_add_link(dir, &dotdot_name, NULL, pino, S_IFDIR); out: if (!err) clear_inode_flag(dir, FI_INLINE_DOTS); @@ -963,6 +962,7 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct f2fs_dir_entry *old_dir_entry = NULL; struct f2fs_dir_entry *old_entry; struct f2fs_dir_entry *new_entry; + bool old_is_dir = S_ISDIR(old_inode->i_mode); int err; if (unlikely(f2fs_cp_error(sbi))) @@ -1017,7 +1017,7 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir, goto out; } - if (S_ISDIR(old_inode->i_mode)) { + if (old_is_dir && old_dir != new_dir) { old_dir_entry = f2fs_parent_dir(old_inode, &old_dir_page); if (!old_dir_entry) { if (IS_ERR(old_dir_page)) @@ -1029,7 +1029,7 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir, if (new_inode) { err = -ENOTEMPTY; - if (old_dir_entry && !f2fs_empty_dir(new_inode)) + if (old_is_dir && !f2fs_empty_dir(new_inode)) goto out_dir; err = -ENOENT; @@ -1054,7 +1054,7 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir, inode_set_ctime_current(new_inode); f2fs_down_write(&F2FS_I(new_inode)->i_sem); - if (old_dir_entry) + if (old_is_dir) f2fs_i_links_write(new_inode, false); f2fs_i_links_write(new_inode, false); f2fs_up_write(&F2FS_I(new_inode)->i_sem); @@ -1074,12 +1074,12 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir, goto out_dir; } - if (old_dir_entry) + if (old_is_dir) f2fs_i_links_write(new_dir, true); } f2fs_down_write(&F2FS_I(old_inode)->i_sem); - if (!old_dir_entry || whiteout) + if (!old_is_dir || whiteout) file_lost_pino(old_inode); else /* adjust dir's i_pino to pass fsck check */ @@ -1105,8 +1105,8 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir, iput(whiteout); } - if (old_dir_entry) { - if (old_dir != new_dir && !whiteout) + if (old_is_dir) { + if (old_dir_entry) f2fs_set_link(old_inode, old_dir_entry, old_dir_page, new_dir); else @@ -1316,21 +1316,27 @@ static int f2fs_rename2(struct mnt_idmap *idmap, if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return -EINVAL; + trace_f2fs_rename_start(old_dir, old_dentry, new_dir, new_dentry, + flags); + err = fscrypt_prepare_rename(old_dir, old_dentry, new_dir, new_dentry, flags); if (err) return err; - if (flags & RENAME_EXCHANGE) { - return f2fs_cross_rename(old_dir, old_dentry, - new_dir, new_dentry); - } + if (flags & RENAME_EXCHANGE) + err = f2fs_cross_rename(old_dir, old_dentry, + new_dir, new_dentry); + else /* * VFS has already handled the new dentry existence case, * here, we just deal with "RENAME_NOREPLACE" as regular rename. */ - return f2fs_rename(idmap, old_dir, old_dentry, + err = f2fs_rename(idmap, old_dir, old_dentry, new_dir, new_dentry, flags); + + trace_f2fs_rename_end(old_dentry, new_dentry, flags, err); + return err; } static const char *f2fs_encrypted_get_link(struct dentry *dentry, diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 6c7f6a649d27..9b546fd21010 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2751,11 +2751,11 @@ recover_xnid: f2fs_update_inode_page(inode); /* 3: update and set xattr node page dirty */ - if (page) + if (page) { memcpy(F2FS_NODE(xpage), F2FS_NODE(page), VALID_XATTR_BLOCK_SIZE); - - set_page_dirty(xpage); + set_page_dirty(xpage); + } f2fs_put_page(xpage, 1); return 0; diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index b56d0f1078a7..d0f24ccbd1ac 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -712,7 +712,16 @@ retry_dn: */ if (dest == NEW_ADDR) { f2fs_truncate_data_blocks_range(&dn, 1); - f2fs_reserve_new_block(&dn); + do { + err = f2fs_reserve_new_block(&dn); + if (err == -ENOSPC) { + f2fs_bug_on(sbi, 1); + break; + } + } while (err && + IS_ENABLED(CONFIG_F2FS_FAULT_INJECTION)); + if (err) + goto err; continue; } @@ -720,12 +729,14 @@ retry_dn: if (f2fs_is_valid_blkaddr(sbi, dest, META_POR)) { if (src == NULL_ADDR) { - err = f2fs_reserve_new_block(&dn); - while (err && - IS_ENABLED(CONFIG_F2FS_FAULT_INJECTION)) + do { err = f2fs_reserve_new_block(&dn); - /* We should not get -ENOSPC */ - f2fs_bug_on(sbi, err); + if (err == -ENOSPC) { + f2fs_bug_on(sbi, 1); + break; + } + } while (err && + IS_ENABLED(CONFIG_F2FS_FAULT_INJECTION)); if (err) goto err; } @@ -906,6 +917,8 @@ skip: if (!err && fix_curseg_write_pointer && !f2fs_readonly(sbi->sb) && f2fs_sb_has_blkzoned(sbi)) { err = f2fs_fix_curseg_write_pointer(sbi); + if (!err) + err = f2fs_check_write_pointer(sbi); ret = err; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 727d016318f9..4c8836ded90f 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1172,7 +1172,10 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi, dpolicy->min_interval = dcc->min_discard_issue_time; dpolicy->mid_interval = dcc->mid_discard_issue_time; dpolicy->max_interval = dcc->max_discard_issue_time; - dpolicy->io_aware = true; + if (dcc->discard_io_aware == DPOLICY_IO_AWARE_ENABLE) + dpolicy->io_aware = true; + else if (dcc->discard_io_aware == DPOLICY_IO_AWARE_DISABLE) + dpolicy->io_aware = false; dpolicy->sync = false; dpolicy->ordered = true; if (utilization(sbi) > dcc->discard_urgent_util) { @@ -1380,7 +1383,8 @@ static void __insert_discard_cmd(struct f2fs_sb_info *sbi, p = &(*p)->rb_right; leftmost = false; } else { - f2fs_bug_on(sbi, 1); + /* Let's skip to add, if exists */ + return; } } @@ -1883,9 +1887,8 @@ static int issue_discard_thread(void *data) set_freezable(); do { - wait_event_interruptible_timeout(*q, - kthread_should_stop() || freezing(current) || - dcc->discard_wake, + wait_event_freezable_timeout(*q, + kthread_should_stop() || dcc->discard_wake, msecs_to_jiffies(wait_ms)); if (sbi->gc_mode == GC_URGENT_HIGH || @@ -1903,8 +1906,6 @@ static int issue_discard_thread(void *data) if (atomic_read(&dcc->queued_discard)) __wait_all_discard_cmd(sbi, NULL); - if (try_to_freeze()) - continue; if (f2fs_readonly(sbi->sb)) continue; if (kthread_should_stop()) @@ -2274,6 +2275,7 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) dcc->discard_io_aware_gran = MAX_PLIST_NUM; dcc->discard_granularity = DEFAULT_DISCARD_GRANULARITY; dcc->max_ordered_discard = DEFAULT_MAX_ORDERED_DISCARD_GRANULARITY; + dcc->discard_io_aware = DPOLICY_IO_AWARE_ENABLE; if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SEGMENT) dcc->discard_granularity = sbi->blocks_per_seg; else if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SECTION) @@ -2495,8 +2497,7 @@ void f2fs_invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr) if (addr == NEW_ADDR || addr == COMPRESS_ADDR) return; - invalidate_mapping_pages(META_MAPPING(sbi), addr, addr); - f2fs_invalidate_compress_page(sbi, addr); + f2fs_invalidate_internal_cache(sbi, addr); /* add it into sit main buffer */ down_write(&sit_i->sentry_lock); @@ -3557,11 +3558,8 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) reallocate: f2fs_allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr, &fio->new_blkaddr, sum, type, fio); - if (GET_SEGNO(fio->sbi, fio->old_blkaddr) != NULL_SEGNO) { - invalidate_mapping_pages(META_MAPPING(fio->sbi), - fio->old_blkaddr, fio->old_blkaddr); - f2fs_invalidate_compress_page(fio->sbi, fio->old_blkaddr); - } + if (GET_SEGNO(fio->sbi, fio->old_blkaddr) != NULL_SEGNO) + f2fs_invalidate_internal_cache(fio->sbi, fio->old_blkaddr); /* writeout dirty page into bdev */ f2fs_submit_page_write(fio); @@ -3757,9 +3755,7 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, update_sit_entry(sbi, new_blkaddr, 1); } if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) { - invalidate_mapping_pages(META_MAPPING(sbi), - old_blkaddr, old_blkaddr); - f2fs_invalidate_compress_page(sbi, old_blkaddr); + f2fs_invalidate_internal_cache(sbi, old_blkaddr); if (!from_gc) update_segment_mtime(sbi, old_blkaddr, 0); update_sit_entry(sbi, old_blkaddr, -1); @@ -4865,99 +4861,56 @@ static int check_zone_write_pointer(struct f2fs_sb_info *sbi, struct f2fs_dev_info *fdev, struct blk_zone *zone) { - unsigned int wp_segno, wp_blkoff, zone_secno, zone_segno, segno; - block_t zone_block, wp_block, last_valid_block; + unsigned int zone_segno; + block_t zone_block, valid_block_cnt; unsigned int log_sectors_per_block = sbi->log_blocksize - SECTOR_SHIFT; - int i, s, b, ret; - struct seg_entry *se; + int ret; if (zone->type != BLK_ZONE_TYPE_SEQWRITE_REQ) return 0; - wp_block = fdev->start_blk + (zone->wp >> log_sectors_per_block); - wp_segno = GET_SEGNO(sbi, wp_block); - wp_blkoff = wp_block - START_BLOCK(sbi, wp_segno); zone_block = fdev->start_blk + (zone->start >> log_sectors_per_block); zone_segno = GET_SEGNO(sbi, zone_block); - zone_secno = GET_SEC_FROM_SEG(sbi, zone_segno); - - if (zone_segno >= MAIN_SEGS(sbi)) - return 0; /* * Skip check of zones cursegs point to, since * fix_curseg_write_pointer() checks them. */ - for (i = 0; i < NO_CHECK_TYPE; i++) - if (zone_secno == GET_SEC_FROM_SEG(sbi, - CURSEG_I(sbi, i)->segno)) - return 0; + if (zone_segno >= MAIN_SEGS(sbi) || + IS_CURSEC(sbi, GET_SEC_FROM_SEG(sbi, zone_segno))) + return 0; /* - * Get last valid block of the zone. + * Get # of valid block of the zone. */ - last_valid_block = zone_block - 1; - for (s = sbi->segs_per_sec - 1; s >= 0; s--) { - segno = zone_segno + s; - se = get_seg_entry(sbi, segno); - for (b = sbi->blocks_per_seg - 1; b >= 0; b--) - if (f2fs_test_bit(b, se->cur_valid_map)) { - last_valid_block = START_BLOCK(sbi, segno) + b; - break; - } - if (last_valid_block >= zone_block) - break; - } + valid_block_cnt = get_valid_blocks(sbi, zone_segno, true); - /* - * When safely unmounted in the previous mount, we can trust write - * pointers. Otherwise, finish zones. - */ - if (is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) { - /* - * The write pointer matches with the valid blocks or - * already points to the end of the zone. - */ - if ((last_valid_block + 1 == wp_block) || - (zone->wp == zone->start + zone->len)) - return 0; - } + if ((!valid_block_cnt && zone->cond == BLK_ZONE_COND_EMPTY) || + (valid_block_cnt && zone->cond == BLK_ZONE_COND_FULL)) + return 0; - if (last_valid_block + 1 == zone_block) { - if (is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) { - /* - * If there is no valid block in the zone and if write - * pointer is not at zone start, reset the write - * pointer. - */ - f2fs_notice(sbi, - "Zone without valid block has non-zero write " - "pointer. Reset the write pointer: wp[0x%x,0x%x]", - wp_segno, wp_blkoff); - } + if (!valid_block_cnt) { + f2fs_notice(sbi, "Zone without valid block has non-zero write " + "pointer. Reset the write pointer: cond[0x%x]", + zone->cond); ret = __f2fs_issue_discard_zone(sbi, fdev->bdev, zone_block, zone->len >> log_sectors_per_block); if (ret) f2fs_err(sbi, "Discard zone failed: %s (errno=%d)", fdev->path, ret); - return ret; } - if (is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) { - /* - * If there are valid blocks and the write pointer doesn't match - * with them, we need to report the inconsistency and fill - * the zone till the end to close the zone. This inconsistency - * does not cause write error because the zone will not be - * selected for write operation until it get discarded. - */ - f2fs_notice(sbi, "Valid blocks are not aligned with write " - "pointer: valid block[0x%x,0x%x] wp[0x%x,0x%x]", - GET_SEGNO(sbi, last_valid_block), - GET_BLKOFF_FROM_SEG0(sbi, last_valid_block), - wp_segno, wp_blkoff); - } + /* + * If there are valid blocks and the write pointer doesn't match + * with them, we need to report the inconsistency and fill + * the zone till the end to close the zone. This inconsistency + * does not cause write error because the zone will not be + * selected for write operation until it get discarded. + */ + f2fs_notice(sbi, "Valid blocks are not aligned with write " + "pointer: valid block[0x%x,0x%x] cond[0x%x]", + zone_segno, valid_block_cnt, zone->cond); ret = blkdev_zone_mgmt(fdev->bdev, REQ_OP_ZONE_FINISH, zone->start, zone->len, GFP_NOFS); @@ -5048,15 +5001,18 @@ static int fix_curseg_write_pointer(struct f2fs_sb_info *sbi, int type) f2fs_notice(sbi, "Unaligned curseg[%d] with write pointer: " "curseg[0x%x,0x%x] wp[0x%x,0x%x]", type, cs->segno, cs->next_blkoff, wp_segno, wp_blkoff); - } else { - f2fs_notice(sbi, "Not successfully unmounted in the previous " - "mount"); } - f2fs_notice(sbi, "Assign new section to curseg[%d]: " - "curseg[0x%x,0x%x]", type, cs->segno, cs->next_blkoff); + /* Allocate a new section if it's not new. */ + if (cs->next_blkoff) { + unsigned int old_segno = cs->segno, old_blkoff = cs->next_blkoff; - f2fs_allocate_new_section(sbi, type, true); + f2fs_allocate_new_section(sbi, type, true); + f2fs_notice(sbi, "Assign new section to curseg[%d]: " + "[0x%x,0x%x] -> [0x%x,0x%x]", + type, old_segno, old_blkoff, + cs->segno, cs->next_blkoff); + } /* check consistency of the zone curseg pointed to */ if (check_zone_write_pointer(sbi, zbd, &zone)) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index d66e0692ac02..d45ab0992ae5 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1422,11 +1422,6 @@ default_check: } } - if (test_opt(sbi, DISABLE_CHECKPOINT) && f2fs_lfs_mode(sbi)) { - f2fs_err(sbi, "LFS is not compatible with checkpoint=disable"); - return -EINVAL; - } - if (test_opt(sbi, ATGC) && f2fs_lfs_mode(sbi)) { f2fs_err(sbi, "LFS is not compatible with ATGC"); return -EINVAL; @@ -3361,6 +3356,14 @@ loff_t max_file_blocks(struct inode *inode) leaf_count *= NIDS_PER_BLOCK; result += leaf_count; + /* + * For compatibility with FSCRYPT_POLICY_FLAG_IV_INO_LBLK_{64,32} with + * a 4K crypto data unit, we must restrict the max filesize to what can + * fit within U32_MAX + 1 data units. + */ + + result = min(result, (((loff_t)U32_MAX + 1) * 4096) >> F2FS_BLKSIZE_BITS); + return result; } @@ -4279,24 +4282,21 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) sbi->aligned_blksize = false; #ifdef CONFIG_BLK_DEV_ZONED - if (bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HM && - !f2fs_sb_has_blkzoned(sbi)) { - f2fs_err(sbi, "Zoned block device feature not enabled"); - return -EINVAL; - } - if (bdev_zoned_model(FDEV(i).bdev) != BLK_ZONED_NONE) { + if (bdev_is_zoned(FDEV(i).bdev)) { + if (!f2fs_sb_has_blkzoned(sbi)) { + f2fs_err(sbi, "Zoned block device feature not enabled"); + return -EINVAL; + } if (init_blkz_info(sbi, i)) { f2fs_err(sbi, "Failed to initialize F2FS blkzone information"); return -EINVAL; } if (max_devices == 1) break; - f2fs_info(sbi, "Mount Device [%2d]: %20s, %8u, %8x - %8x (zone: %s)", + f2fs_info(sbi, "Mount Device [%2d]: %20s, %8u, %8x - %8x (zone: Host-managed)", i, FDEV(i).path, FDEV(i).total_segments, - FDEV(i).start_blk, FDEV(i).end_blk, - bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HA ? - "Host-aware" : "Host-managed"); + FDEV(i).start_blk, FDEV(i).end_blk); continue; } #endif @@ -4743,7 +4743,7 @@ try_onemore: #ifdef CONFIG_QUOTA f2fs_recover_quota_end(sbi, quota_enabled); #endif - +reset_checkpoint: /* * If the f2fs is not readonly and fsync data recovery succeeds, * check zoned block devices' write pointer consistency. @@ -4754,7 +4754,6 @@ try_onemore: goto free_meta; } -reset_checkpoint: f2fs_init_inmem_curseg(sbi); /* f2fs_recover_fsync_data() cleared this already */ @@ -4881,6 +4880,7 @@ free_sbi: if (sbi->s_chksum_driver) crypto_free_shash(sbi->s_chksum_driver); kfree(sbi); + sb->s_fs_info = NULL; /* give only one another chance */ if (retry_cnt > 0 && skip_recovery) { diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 417fae96890f..a7ec55c7bb20 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -143,6 +143,33 @@ static ssize_t pending_discard_show(struct f2fs_attr *a, &SM_I(sbi)->dcc_info->discard_cmd_cnt)); } +static ssize_t issued_discard_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + if (!SM_I(sbi)->dcc_info) + return -EINVAL; + return sysfs_emit(buf, "%llu\n", (unsigned long long)atomic_read( + &SM_I(sbi)->dcc_info->issued_discard)); +} + +static ssize_t queued_discard_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + if (!SM_I(sbi)->dcc_info) + return -EINVAL; + return sysfs_emit(buf, "%llu\n", (unsigned long long)atomic_read( + &SM_I(sbi)->dcc_info->queued_discard)); +} + +static ssize_t undiscard_blks_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + if (!SM_I(sbi)->dcc_info) + return -EINVAL; + return sysfs_emit(buf, "%u\n", + SM_I(sbi)->dcc_info->undiscard_blks); +} + static ssize_t gc_mode_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { @@ -516,6 +543,13 @@ out: return count; } + if (!strcmp(a->attr.name, "discard_io_aware")) { + if (t >= DPOLICY_IO_AWARE_MAX) + return -EINVAL; + *ui = t; + return count; + } + if (!strcmp(a->attr.name, "migration_granularity")) { if (t == 0 || t > sbi->segs_per_sec) return -EINVAL; @@ -734,6 +768,13 @@ out: return count; } + if (!strcmp(a->attr.name, "dir_level")) { + if (t > MAX_DIR_HASH_DEPTH) + return -EINVAL; + sbi->dir_level = t; + return count; + } + *ui = (unsigned int)t; return count; @@ -926,6 +967,7 @@ DCC_INFO_GENERAL_RW_ATTR(discard_io_aware_gran); DCC_INFO_GENERAL_RW_ATTR(discard_urgent_util); DCC_INFO_GENERAL_RW_ATTR(discard_granularity); DCC_INFO_GENERAL_RW_ATTR(max_ordered_discard); +DCC_INFO_GENERAL_RW_ATTR(discard_io_aware); /* NM_INFO ATTR */ NM_INFO_RW_ATTR(max_roll_forward_node_blocks, max_rf_node_blocks); @@ -1074,6 +1116,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(discard_urgent_util), ATTR_LIST(discard_granularity), ATTR_LIST(max_ordered_discard), + ATTR_LIST(discard_io_aware), ATTR_LIST(pending_discard), ATTR_LIST(gc_mode), ATTR_LIST(ipu_policy), @@ -1197,9 +1240,16 @@ ATTRIBUTE_GROUPS(f2fs_feat); F2FS_GENERAL_RO_ATTR(sb_status); F2FS_GENERAL_RO_ATTR(cp_status); +F2FS_GENERAL_RO_ATTR(issued_discard); +F2FS_GENERAL_RO_ATTR(queued_discard); +F2FS_GENERAL_RO_ATTR(undiscard_blks); + static struct attribute *f2fs_stat_attrs[] = { ATTR_LIST(sb_status), ATTR_LIST(cp_status), + ATTR_LIST(issued_discard), + ATTR_LIST(queued_discard), + ATTR_LIST(undiscard_blks), NULL, }; ATTRIBUTE_GROUPS(f2fs_stat); diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 47e88b4d4e7d..f290fe9327c4 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -660,11 +660,14 @@ retry: here = __find_xattr(base_addr, last_base_addr, NULL, index, len, name); if (!here) { if (!F2FS_I(inode)->i_xattr_nid) { + error = f2fs_recover_xattr_data(inode, NULL); f2fs_notice(F2FS_I_SB(inode), - "recover xattr in inode (%lu)", inode->i_ino); - f2fs_recover_xattr_data(inode, NULL); - kfree(base_addr); - goto retry; + "recover xattr in inode (%lu), error(%d)", + inode->i_ino, error); + if (!error) { + kfree(base_addr); + goto retry; + } } f2fs_err(F2FS_I_SB(inode), "set inode (%lu) has corrupted xattr", inode->i_ino); @@ -754,6 +757,12 @@ retry: memcpy(pval, value, size); last->e_value_size = cpu_to_le16(size); new_hsize += newsize; + /* + * Explicitly add the null terminator. The unused xattr space + * is supposed to always be zeroed, which would make this + * unnecessary, but don't depend on that. + */ + *(u32 *)((u8 *)last + newsize) = 0; } error = write_all_xattrs(inode, new_hsize, base_addr, ipage); diff --git a/fs/file_table.c b/fs/file_table.c index 3ba764d73fc9..b991f90571b4 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -130,7 +130,6 @@ static struct ctl_table fs_stat_sysctls[] = { .extra1 = &sysctl_nr_open_min, .extra2 = &sysctl_nr_open_max, }, - { } }; static int __init init_fs_stat_sysctls(void) @@ -317,9 +316,6 @@ struct file *alloc_file_pseudo(struct inode *inode, struct vfsmount *mnt, const char *name, int flags, const struct file_operations *fops) { - static const struct dentry_operations anon_ops = { - .d_dname = simple_dname - }; struct qstr this = QSTR_INIT(name, strlen(name)); struct path path; struct file *file; @@ -327,8 +323,6 @@ struct file *alloc_file_pseudo(struct inode *inode, struct vfsmount *mnt, path.dentry = d_alloc_pseudo(mnt->mnt_sb, &this); if (!path.dentry) return ERR_PTR(-ENOMEM); - if (!mnt->mnt_sb->s_d_op) - d_set_d_op(path.dentry, &anon_ops); path.mnt = mntget(mnt); d_instantiate(path.dentry, inode); file = alloc_file(&path, flags, fops); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 1767493dffda..3d84fcc471c6 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1675,11 +1675,11 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) inode->i_state |= I_DIRTY_PAGES; - else if (unlikely(inode->i_state & I_PINNING_FSCACHE_WB)) { + else if (unlikely(inode->i_state & I_PINNING_NETFS_WB)) { if (!(inode->i_state & I_DIRTY_PAGES)) { - inode->i_state &= ~I_PINNING_FSCACHE_WB; - wbc->unpinned_fscache_wb = true; - dirty |= I_PINNING_FSCACHE_WB; /* Cause write_inode */ + inode->i_state &= ~I_PINNING_NETFS_WB; + wbc->unpinned_netfs_wb = true; + dirty |= I_PINNING_NETFS_WB; /* Cause write_inode */ } } @@ -1691,7 +1691,7 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) if (ret == 0) ret = err; } - wbc->unpinned_fscache_wb = false; + wbc->unpinned_netfs_wb = false; trace_writeback_single_inode(inode, wbc, nr_to_write); return ret; } diff --git a/fs/fscache/Kconfig b/fs/fscache/Kconfig deleted file mode 100644 index b313a978ae0a..000000000000 --- a/fs/fscache/Kconfig +++ /dev/null @@ -1,40 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - -config FSCACHE - tristate "General filesystem local caching manager" - select NETFS_SUPPORT - help - This option enables a generic filesystem caching manager that can be - used by various network and other filesystems to cache data locally. - Different sorts of caches can be plugged in, depending on the - resources available. - - See Documentation/filesystems/caching/fscache.rst for more information. - -config FSCACHE_STATS - bool "Gather statistical information on local caching" - depends on FSCACHE && PROC_FS - select NETFS_STATS - help - This option causes statistical information to be gathered on local - caching and exported through file: - - /proc/fs/fscache/stats - - The gathering of statistics adds a certain amount of overhead to - execution as there are a quite a few stats gathered, and on a - multi-CPU system these may be on cachelines that keep bouncing - between CPUs. On the other hand, the stats are very useful for - debugging purposes. Saying 'Y' here is recommended. - - See Documentation/filesystems/caching/fscache.rst for more information. - -config FSCACHE_DEBUG - bool "Debug FS-Cache" - depends on FSCACHE - help - This permits debugging to be dynamically enabled in the local caching - management module. If this is set, the debugging output may be - enabled by setting bits in /sys/modules/fscache/parameter/debug. - - See Documentation/filesystems/caching/fscache.rst for more information. diff --git a/fs/fscache/Makefile b/fs/fscache/Makefile deleted file mode 100644 index afb090ea16c4..000000000000 --- a/fs/fscache/Makefile +++ /dev/null @@ -1,16 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -# -# Makefile for general filesystem caching code -# - -fscache-y := \ - cache.o \ - cookie.o \ - io.o \ - main.o \ - volume.o - -fscache-$(CONFIG_PROC_FS) += proc.o -fscache-$(CONFIG_FSCACHE_STATS) += stats.o - -obj-$(CONFIG_FSCACHE) := fscache.o diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h deleted file mode 100644 index 1336f517e9b1..000000000000 --- a/fs/fscache/internal.h +++ /dev/null @@ -1,277 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* Internal definitions for FS-Cache - * - * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - */ - -#ifdef pr_fmt -#undef pr_fmt -#endif - -#define pr_fmt(fmt) "FS-Cache: " fmt - -#include <linux/slab.h> -#include <linux/fscache-cache.h> -#include <trace/events/fscache.h> -#include <linux/sched.h> -#include <linux/seq_file.h> - -/* - * cache.c - */ -#ifdef CONFIG_PROC_FS -extern const struct seq_operations fscache_caches_seq_ops; -#endif -bool fscache_begin_cache_access(struct fscache_cache *cache, enum fscache_access_trace why); -void fscache_end_cache_access(struct fscache_cache *cache, enum fscache_access_trace why); -struct fscache_cache *fscache_lookup_cache(const char *name, bool is_cache); -void fscache_put_cache(struct fscache_cache *cache, enum fscache_cache_trace where); - -static inline enum fscache_cache_state fscache_cache_state(const struct fscache_cache *cache) -{ - return smp_load_acquire(&cache->state); -} - -static inline bool fscache_cache_is_live(const struct fscache_cache *cache) -{ - return fscache_cache_state(cache) == FSCACHE_CACHE_IS_ACTIVE; -} - -static inline void fscache_set_cache_state(struct fscache_cache *cache, - enum fscache_cache_state new_state) -{ - smp_store_release(&cache->state, new_state); - -} - -static inline bool fscache_set_cache_state_maybe(struct fscache_cache *cache, - enum fscache_cache_state old_state, - enum fscache_cache_state new_state) -{ - return try_cmpxchg_release(&cache->state, &old_state, new_state); -} - -/* - * cookie.c - */ -extern struct kmem_cache *fscache_cookie_jar; -#ifdef CONFIG_PROC_FS -extern const struct seq_operations fscache_cookies_seq_ops; -#endif -extern struct timer_list fscache_cookie_lru_timer; - -extern void fscache_print_cookie(struct fscache_cookie *cookie, char prefix); -extern bool fscache_begin_cookie_access(struct fscache_cookie *cookie, - enum fscache_access_trace why); - -static inline void fscache_see_cookie(struct fscache_cookie *cookie, - enum fscache_cookie_trace where) -{ - trace_fscache_cookie(cookie->debug_id, refcount_read(&cookie->ref), - where); -} - -/* - * main.c - */ -extern unsigned fscache_debug; - -extern unsigned int fscache_hash(unsigned int salt, const void *data, size_t len); - -/* - * proc.c - */ -#ifdef CONFIG_PROC_FS -extern int __init fscache_proc_init(void); -extern void fscache_proc_cleanup(void); -#else -#define fscache_proc_init() (0) -#define fscache_proc_cleanup() do {} while (0) -#endif - -/* - * stats.c - */ -#ifdef CONFIG_FSCACHE_STATS -extern atomic_t fscache_n_volumes; -extern atomic_t fscache_n_volumes_collision; -extern atomic_t fscache_n_volumes_nomem; -extern atomic_t fscache_n_cookies; -extern atomic_t fscache_n_cookies_lru; -extern atomic_t fscache_n_cookies_lru_expired; -extern atomic_t fscache_n_cookies_lru_removed; -extern atomic_t fscache_n_cookies_lru_dropped; - -extern atomic_t fscache_n_acquires; -extern atomic_t fscache_n_acquires_ok; -extern atomic_t fscache_n_acquires_oom; - -extern atomic_t fscache_n_invalidates; - -extern atomic_t fscache_n_relinquishes; -extern atomic_t fscache_n_relinquishes_retire; -extern atomic_t fscache_n_relinquishes_dropped; - -extern atomic_t fscache_n_resizes; -extern atomic_t fscache_n_resizes_null; - -static inline void fscache_stat(atomic_t *stat) -{ - atomic_inc(stat); -} - -static inline void fscache_stat_d(atomic_t *stat) -{ - atomic_dec(stat); -} - -#define __fscache_stat(stat) (stat) - -int fscache_stats_show(struct seq_file *m, void *v); -#else - -#define __fscache_stat(stat) (NULL) -#define fscache_stat(stat) do {} while (0) -#define fscache_stat_d(stat) do {} while (0) -#endif - -/* - * volume.c - */ -#ifdef CONFIG_PROC_FS -extern const struct seq_operations fscache_volumes_seq_ops; -#endif - -struct fscache_volume *fscache_get_volume(struct fscache_volume *volume, - enum fscache_volume_trace where); -void fscache_put_volume(struct fscache_volume *volume, - enum fscache_volume_trace where); -bool fscache_begin_volume_access(struct fscache_volume *volume, - struct fscache_cookie *cookie, - enum fscache_access_trace why); -void fscache_create_volume(struct fscache_volume *volume, bool wait); - - -/*****************************************************************************/ -/* - * debug tracing - */ -#define dbgprintk(FMT, ...) \ - printk("[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__) - -#define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__) -#define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__) -#define kdebug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__) - -#define kjournal(FMT, ...) no_printk(FMT, ##__VA_ARGS__) - -#ifdef __KDEBUG -#define _enter(FMT, ...) kenter(FMT, ##__VA_ARGS__) -#define _leave(FMT, ...) kleave(FMT, ##__VA_ARGS__) -#define _debug(FMT, ...) kdebug(FMT, ##__VA_ARGS__) - -#elif defined(CONFIG_FSCACHE_DEBUG) -#define _enter(FMT, ...) \ -do { \ - if (__do_kdebug(ENTER)) \ - kenter(FMT, ##__VA_ARGS__); \ -} while (0) - -#define _leave(FMT, ...) \ -do { \ - if (__do_kdebug(LEAVE)) \ - kleave(FMT, ##__VA_ARGS__); \ -} while (0) - -#define _debug(FMT, ...) \ -do { \ - if (__do_kdebug(DEBUG)) \ - kdebug(FMT, ##__VA_ARGS__); \ -} while (0) - -#else -#define _enter(FMT, ...) no_printk("==> %s("FMT")", __func__, ##__VA_ARGS__) -#define _leave(FMT, ...) no_printk("<== %s()"FMT"", __func__, ##__VA_ARGS__) -#define _debug(FMT, ...) no_printk(FMT, ##__VA_ARGS__) -#endif - -/* - * determine whether a particular optional debugging point should be logged - * - we need to go through three steps to persuade cpp to correctly join the - * shorthand in FSCACHE_DEBUG_LEVEL with its prefix - */ -#define ____do_kdebug(LEVEL, POINT) \ - unlikely((fscache_debug & \ - (FSCACHE_POINT_##POINT << (FSCACHE_DEBUG_ ## LEVEL * 3)))) -#define ___do_kdebug(LEVEL, POINT) \ - ____do_kdebug(LEVEL, POINT) -#define __do_kdebug(POINT) \ - ___do_kdebug(FSCACHE_DEBUG_LEVEL, POINT) - -#define FSCACHE_DEBUG_CACHE 0 -#define FSCACHE_DEBUG_COOKIE 1 -#define FSCACHE_DEBUG_OBJECT 2 -#define FSCACHE_DEBUG_OPERATION 3 - -#define FSCACHE_POINT_ENTER 1 -#define FSCACHE_POINT_LEAVE 2 -#define FSCACHE_POINT_DEBUG 4 - -#ifndef FSCACHE_DEBUG_LEVEL -#define FSCACHE_DEBUG_LEVEL CACHE -#endif - -/* - * assertions - */ -#if 1 /* defined(__KDEBUGALL) */ - -#define ASSERT(X) \ -do { \ - if (unlikely(!(X))) { \ - pr_err("\n"); \ - pr_err("Assertion failed\n"); \ - BUG(); \ - } \ -} while (0) - -#define ASSERTCMP(X, OP, Y) \ -do { \ - if (unlikely(!((X) OP (Y)))) { \ - pr_err("\n"); \ - pr_err("Assertion failed\n"); \ - pr_err("%lx " #OP " %lx is false\n", \ - (unsigned long)(X), (unsigned long)(Y)); \ - BUG(); \ - } \ -} while (0) - -#define ASSERTIF(C, X) \ -do { \ - if (unlikely((C) && !(X))) { \ - pr_err("\n"); \ - pr_err("Assertion failed\n"); \ - BUG(); \ - } \ -} while (0) - -#define ASSERTIFCMP(C, X, OP, Y) \ -do { \ - if (unlikely((C) && !((X) OP (Y)))) { \ - pr_err("\n"); \ - pr_err("Assertion failed\n"); \ - pr_err("%lx " #OP " %lx is false\n", \ - (unsigned long)(X), (unsigned long)(Y)); \ - BUG(); \ - } \ -} while (0) - -#else - -#define ASSERT(X) do {} while (0) -#define ASSERTCMP(X, OP, Y) do {} while (0) -#define ASSERTIF(C, X) do {} while (0) -#define ASSERTIFCMP(C, X, OP, Y) do {} while (0) - -#endif /* assert or not */ diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c index 177f1f41f225..2e215e8c3c88 100644 --- a/fs/gfs2/dentry.c +++ b/fs/gfs2/dentry.c @@ -32,25 +32,21 @@ static int gfs2_drevalidate(struct dentry *dentry, unsigned int flags) { - struct dentry *parent = NULL; + struct dentry *parent; struct gfs2_sbd *sdp; struct gfs2_inode *dip; - struct inode *dinode, *inode; + struct inode *inode; struct gfs2_holder d_gh; struct gfs2_inode *ip = NULL; int error, valid = 0; int had_lock = 0; - if (flags & LOOKUP_RCU) { - dinode = d_inode_rcu(READ_ONCE(dentry->d_parent)); - if (!dinode) - return -ECHILD; - } else { - parent = dget_parent(dentry); - dinode = d_inode(parent); - } - sdp = GFS2_SB(dinode); - dip = GFS2_I(dinode); + if (flags & LOOKUP_RCU) + return -ECHILD; + + parent = dget_parent(dentry); + sdp = GFS2_SB(d_inode(parent)); + dip = GFS2_I(d_inode(parent)); inode = d_inode(dentry); if (inode) { @@ -66,8 +62,7 @@ static int gfs2_drevalidate(struct dentry *dentry, unsigned int flags) had_lock = (gfs2_glock_is_locked_by_me(dip->i_gl) != NULL); if (!had_lock) { - error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, - flags & LOOKUP_RCU ? GL_NOBLOCK : 0, &d_gh); + error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh); if (error) goto out; } diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 6bfc9383b7b8..1b95db2c3aac 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -1882,10 +1882,10 @@ int gfs2_permission(struct mnt_idmap *idmap, struct inode *inode, WARN_ON_ONCE(!may_not_block); return -ECHILD; } - if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) { - int noblock = may_not_block ? GL_NOBLOCK : 0; - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, - LM_FLAG_ANY | noblock, &i_gh); + if (gfs2_glock_is_locked_by_me(gl) == NULL) { + if (may_not_block) + return -ECHILD; + error = gfs2_glock_nq_init(gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); if (error) return error; } diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index ea87f24c6c3f..a73d27c4dd58 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -637,12 +637,8 @@ static struct dentry *hostfs_lookup(struct inode *ino, struct dentry *dentry, inode = hostfs_iget(ino->i_sb, name); __putname(name); - if (IS_ERR(inode)) { - if (PTR_ERR(inode) == -ENOENT) - inode = NULL; - else - return ERR_CAST(inode); - } + if (inode == ERR_PTR(-ENOENT)) + inode = NULL; return d_splice_alias(inode, dentry); } diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index ea5b8e57d904..671664fed307 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -340,7 +340,7 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to) } else { folio_unlock(folio); - if (!folio_test_has_hwpoisoned(folio)) + if (!folio_test_hwpoison(folio)) want = nr; else { /* diff --git a/fs/inode.c b/fs/inode.c index 99d8754a74a3..91048c4c9c9e 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -129,7 +129,6 @@ static struct ctl_table inodes_sysctls[] = { .mode = 0444, .proc_handler = proc_nr_inodes, }, - { } }; static int __init init_fs_inode_sysctls(void) @@ -1090,48 +1089,6 @@ void discard_new_inode(struct inode *inode) EXPORT_SYMBOL(discard_new_inode); /** - * lock_two_inodes - lock two inodes (may be regular files but also dirs) - * - * Lock any non-NULL argument. The caller must make sure that if he is passing - * in two directories, one is not ancestor of the other. Zero, one or two - * objects may be locked by this function. - * - * @inode1: first inode to lock - * @inode2: second inode to lock - * @subclass1: inode lock subclass for the first lock obtained - * @subclass2: inode lock subclass for the second lock obtained - */ -void lock_two_inodes(struct inode *inode1, struct inode *inode2, - unsigned subclass1, unsigned subclass2) -{ - if (!inode1 || !inode2) { - /* - * Make sure @subclass1 will be used for the acquired lock. - * This is not strictly necessary (no current caller cares) but - * let's keep things consistent. - */ - if (!inode1) - swap(inode1, inode2); - goto lock; - } - - /* - * If one object is directory and the other is not, we must make sure - * to lock directory first as the other object may be its child. - */ - if (S_ISDIR(inode2->i_mode) == S_ISDIR(inode1->i_mode)) { - if (inode1 > inode2) - swap(inode1, inode2); - } else if (!S_ISDIR(inode1->i_mode)) - swap(inode1, inode2); -lock: - if (inode1) - inode_lock_nested(inode1, subclass1); - if (inode2 && inode2 != inode1) - inode_lock_nested(inode2, subclass2); -} - -/** * lock_two_nondirectories - take two i_mutexes on non-directory objects * * Lock any non-NULL argument. Passed objects must not be directories. @@ -1146,7 +1103,12 @@ void lock_two_nondirectories(struct inode *inode1, struct inode *inode2) WARN_ON_ONCE(S_ISDIR(inode1->i_mode)); if (inode2) WARN_ON_ONCE(S_ISDIR(inode2->i_mode)); - lock_two_inodes(inode1, inode2, I_MUTEX_NORMAL, I_MUTEX_NONDIR2); + if (inode1 > inode2) + swap(inode1, inode2); + if (inode1) + inode_lock(inode1); + if (inode2 && inode2 != inode1) + inode_lock_nested(inode2, I_MUTEX_NONDIR2); } EXPORT_SYMBOL(lock_two_nondirectories); diff --git a/fs/internal.h b/fs/internal.h index bf2ee2e0d45d..b67406435fc0 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -197,8 +197,6 @@ extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc); int dentry_needs_remove_privs(struct mnt_idmap *, struct dentry *dentry); bool in_group_or_capable(struct mnt_idmap *idmap, const struct inode *inode, vfsgid_t vfsgid); -void lock_two_inodes(struct inode *inode1, struct inode *inode2, - unsigned subclass1, unsigned subclass2); /* * fs-writeback.c @@ -216,6 +214,11 @@ extern struct dentry * d_alloc_pseudo(struct super_block *, const struct qstr *) extern char *simple_dname(struct dentry *, char *, int); extern void dput_to_list(struct dentry *, struct list_head *); extern void shrink_dentry_list(struct list_head *); +extern void shrink_dcache_for_umount(struct super_block *); +extern struct dentry *__d_lookup(const struct dentry *, const struct qstr *); +extern struct dentry *__d_lookup_rcu(const struct dentry *parent, + const struct qstr *name, unsigned *seq); +extern void d_genocide(struct dentry *); /* * pipe.c diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c index 8eec84c651bf..cb3cda1390ad 100644 --- a/fs/jfs/jfs_dmap.c +++ b/fs/jfs/jfs_dmap.c @@ -2763,9 +2763,7 @@ static int dbBackSplit(dmtree_t *tp, int leafno, bool is_ctl) * leafno - the number of the leaf to be updated. * newval - the new value for the leaf. * - * RETURN VALUES: - * 0 - success - * -EIO - i/o error + * RETURN VALUES: none */ static int dbJoin(dmtree_t *tp, int leafno, int newval, bool is_ctl) { @@ -2792,10 +2790,6 @@ static int dbJoin(dmtree_t *tp, int leafno, int newval, bool is_ctl) * get the buddy size (number of words covered) of * the new value. */ - - if ((newval - tp->dmt_budmin) > BUDMIN) - return -EIO; - budsz = BUDSIZE(newval, tp->dmt_budmin); /* try to join. diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 8b2bd65d70e7..bce1d7ac95ca 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -54,9 +54,9 @@ static bool kernfs_lockdep(struct kernfs_node *kn) static int kernfs_name_locked(struct kernfs_node *kn, char *buf, size_t buflen) { if (!kn) - return strlcpy(buf, "(null)", buflen); + return strscpy(buf, "(null)", buflen); - return strlcpy(buf, kn->parent ? kn->name : "/", buflen); + return strscpy(buf, kn->parent ? kn->name : "/", buflen); } /* kernfs_node_depth - compute depth from @from to @to */ @@ -127,7 +127,7 @@ static struct kernfs_node *kernfs_common_ancestor(struct kernfs_node *a, * * [3] when @kn_to is %NULL result will be "(null)" * - * Return: the length of the full path. If the full length is equal to or + * Return: the length of the constructed path. If the path would have been * greater than @buflen, @buf contains the truncated path with the trailing * '\0'. On error, -errno is returned. */ @@ -138,16 +138,17 @@ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to, struct kernfs_node *kn, *common; const char parent_str[] = "/.."; size_t depth_from, depth_to, len = 0; + ssize_t copied; int i, j; if (!kn_to) - return strlcpy(buf, "(null)", buflen); + return strscpy(buf, "(null)", buflen); if (!kn_from) kn_from = kernfs_root(kn_to)->kn; if (kn_from == kn_to) - return strlcpy(buf, "/", buflen); + return strscpy(buf, "/", buflen); common = kernfs_common_ancestor(kn_from, kn_to); if (WARN_ON(!common)) @@ -158,18 +159,19 @@ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to, buf[0] = '\0'; - for (i = 0; i < depth_from; i++) - len += strlcpy(buf + len, parent_str, - len < buflen ? buflen - len : 0); + for (i = 0; i < depth_from; i++) { + copied = strscpy(buf + len, parent_str, buflen - len); + if (copied < 0) + return copied; + len += copied; + } /* Calculate how many bytes we need for the rest */ for (i = depth_to - 1; i >= 0; i--) { for (kn = kn_to, j = 0; j < i; j++) kn = kn->parent; - len += strlcpy(buf + len, "/", - len < buflen ? buflen - len : 0); - len += strlcpy(buf + len, kn->name, - len < buflen ? buflen - len : 0); + + len += scnprintf(buf + len, buflen - len, "/%s", kn->name); } return len; @@ -182,12 +184,12 @@ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to, * @buflen: size of @buf * * Copies the name of @kn into @buf of @buflen bytes. The behavior is - * similar to strlcpy(). + * similar to strscpy(). * * Fills buffer with "(null)" if @kn is %NULL. * - * Return: the length of @kn's name and if @buf isn't long enough, - * it's filled up to @buflen-1 and nul terminated. + * Return: the resulting length of @buf. If @buf isn't long enough, + * it's filled up to @buflen-1 and nul terminated, and returns -E2BIG. * * This function can be called from any context. */ @@ -214,7 +216,7 @@ int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen) * path (which includes '..'s) as needed to reach from @from to @to is * returned. * - * Return: the length of the full path. If the full length is equal to or + * Return: the length of the constructed path. If the path would have been * greater than @buflen, @buf contains the truncated path with the trailing * '\0'. On error, -errno is returned. */ @@ -265,12 +267,10 @@ void pr_cont_kernfs_path(struct kernfs_node *kn) sz = kernfs_path_from_node(kn, NULL, kernfs_pr_cont_buf, sizeof(kernfs_pr_cont_buf)); if (sz < 0) { - pr_cont("(error)"); - goto out; - } - - if (sz >= sizeof(kernfs_pr_cont_buf)) { - pr_cont("(name too long)"); + if (sz == -E2BIG) + pr_cont("(name too long)"); + else + pr_cont("(error)"); goto out; } @@ -676,6 +676,18 @@ struct kernfs_node *kernfs_new_node(struct kernfs_node *parent, { struct kernfs_node *kn; + if (parent->mode & S_ISGID) { + /* this code block imitates inode_init_owner() for + * kernfs + */ + + if (parent->iattr) + gid = parent->iattr->ia_gid; + + if (flags & KERNFS_DIR) + mode |= S_ISGID; + } + kn = __kernfs_new_node(kernfs_root(parent), parent, name, mode, uid, gid, flags); if (kn) { @@ -850,16 +862,16 @@ static struct kernfs_node *kernfs_walk_ns(struct kernfs_node *parent, const unsigned char *path, const void *ns) { - size_t len; + ssize_t len; char *p, *name; lockdep_assert_held_read(&kernfs_root(parent)->kernfs_rwsem); spin_lock_irq(&kernfs_pr_cont_lock); - len = strlcpy(kernfs_pr_cont_buf, path, sizeof(kernfs_pr_cont_buf)); + len = strscpy(kernfs_pr_cont_buf, path, sizeof(kernfs_pr_cont_buf)); - if (len >= sizeof(kernfs_pr_cont_buf)) { + if (len < 0) { spin_unlock_irq(&kernfs_pr_cont_lock); return NULL; } diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index f0cb729e9a97..ffa4565c275a 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -447,7 +447,7 @@ static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma) * warnings and we don't want to add spurious locking dependency * between the two. Check whether mmap is actually implemented * without grabbing @of->mutex by testing HAS_MMAP flag. See the - * comment in kernfs_file_open() for more details. + * comment in kernfs_fop_open() for more details. */ if (!(of->kn->flags & KERNFS_HAS_MMAP)) return -ENODEV; diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index 4628edde2e7e..0c93cad0f0ac 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -125,9 +125,6 @@ static struct dentry *__kernfs_fh_to_dentry(struct super_block *sb, inode = kernfs_get_inode(sb, kn); kernfs_put(kn); - if (!inode) - return ERR_PTR(-ESTALE); - return d_obtain_alias(inode); } diff --git a/fs/libfs.c b/fs/libfs.c index c2aa6fd4795c..eec6031b0155 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -104,15 +104,16 @@ EXPORT_SYMBOL(dcache_dir_close); * If no such element exists, NULL is returned. */ static struct dentry *scan_positives(struct dentry *cursor, - struct list_head *p, + struct hlist_node **p, loff_t count, struct dentry *last) { struct dentry *dentry = cursor->d_parent, *found = NULL; spin_lock(&dentry->d_lock); - while ((p = p->next) != &dentry->d_subdirs) { - struct dentry *d = list_entry(p, struct dentry, d_child); + while (*p) { + struct dentry *d = hlist_entry(*p, struct dentry, d_sib); + p = &d->d_sib.next; // we must at least skip cursors, to avoid livelocks if (d->d_flags & DCACHE_DENTRY_CURSOR) continue; @@ -126,8 +127,10 @@ static struct dentry *scan_positives(struct dentry *cursor, count = 1; } if (need_resched()) { - list_move(&cursor->d_child, p); - p = &cursor->d_child; + if (!hlist_unhashed(&cursor->d_sib)) + __hlist_del(&cursor->d_sib); + hlist_add_behind(&cursor->d_sib, &d->d_sib); + p = &cursor->d_sib.next; spin_unlock(&dentry->d_lock); cond_resched(); spin_lock(&dentry->d_lock); @@ -159,13 +162,12 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence) inode_lock_shared(dentry->d_inode); if (offset > 2) - to = scan_positives(cursor, &dentry->d_subdirs, + to = scan_positives(cursor, &dentry->d_children.first, offset - 2, NULL); spin_lock(&dentry->d_lock); + hlist_del_init(&cursor->d_sib); if (to) - list_move(&cursor->d_child, &to->d_child); - else - list_del_init(&cursor->d_child); + hlist_add_behind(&cursor->d_sib, &to->d_sib); spin_unlock(&dentry->d_lock); dput(to); @@ -187,19 +189,16 @@ int dcache_readdir(struct file *file, struct dir_context *ctx) { struct dentry *dentry = file->f_path.dentry; struct dentry *cursor = file->private_data; - struct list_head *anchor = &dentry->d_subdirs; struct dentry *next = NULL; - struct list_head *p; + struct hlist_node **p; if (!dir_emit_dots(file, ctx)) return 0; if (ctx->pos == 2) - p = anchor; - else if (!list_empty(&cursor->d_child)) - p = &cursor->d_child; + p = &dentry->d_children.first; else - return 0; + p = &cursor->d_sib.next; while ((next = scan_positives(cursor, p, 1, next)) != NULL) { if (!dir_emit(ctx, next->d_name.name, next->d_name.len, @@ -207,13 +206,12 @@ int dcache_readdir(struct file *file, struct dir_context *ctx) fs_umode_to_dtype(d_inode(next)->i_mode))) break; ctx->pos++; - p = &next->d_child; + p = &next->d_sib.next; } spin_lock(&dentry->d_lock); + hlist_del_init(&cursor->d_sib); if (next) - list_move_tail(&cursor->d_child, &next->d_child); - else - list_del_init(&cursor->d_child); + hlist_add_before(&cursor->d_sib, &next->d_sib); spin_unlock(&dentry->d_lock); dput(next); @@ -500,12 +498,11 @@ const struct file_operations simple_offset_dir_operations = { static struct dentry *find_next_child(struct dentry *parent, struct dentry *prev) { - struct dentry *child = NULL; - struct list_head *p = prev ? &prev->d_child : &parent->d_subdirs; + struct dentry *child = NULL, *d; spin_lock(&parent->d_lock); - while ((p = p->next) != &parent->d_subdirs) { - struct dentry *d = container_of(p, struct dentry, d_child); + d = prev ? d_next_sibling(prev) : d_first_child(parent); + hlist_for_each_entry_from(d, d_sib) { if (simple_positive(d)) { spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED); if (simple_positive(d)) @@ -666,7 +663,7 @@ int simple_empty(struct dentry *dentry) int ret = 0; spin_lock(&dentry->d_lock); - list_for_each_entry(child, &dentry->d_subdirs, d_child) { + hlist_for_each_entry(child, &dentry->d_children, d_sib) { spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED); if (simple_positive(child)) { spin_unlock(&child->d_lock); @@ -920,7 +917,6 @@ int simple_fill_super(struct super_block *s, unsigned long magic, const struct tree_descr *files) { struct inode *inode; - struct dentry *root; struct dentry *dentry; int i; @@ -943,8 +939,8 @@ int simple_fill_super(struct super_block *s, unsigned long magic, inode->i_op = &simple_dir_inode_operations; inode->i_fop = &simple_dir_operations; set_nlink(inode, 2); - root = d_make_root(inode); - if (!root) + s->s_root = d_make_root(inode); + if (!s->s_root) return -ENOMEM; for (i = 0; !files->name || files->name[0]; i++, files++) { if (!files->name) @@ -956,13 +952,13 @@ int simple_fill_super(struct super_block *s, unsigned long magic, "with an index of 1!\n", __func__, s->s_type->name); - dentry = d_alloc_name(root, files->name); + dentry = d_alloc_name(s->s_root, files->name); if (!dentry) - goto out; + return -ENOMEM; inode = new_inode(s); if (!inode) { dput(dentry); - goto out; + return -ENOMEM; } inode->i_mode = S_IFREG | files->mode; simple_inode_init_ts(inode); @@ -970,13 +966,7 @@ int simple_fill_super(struct super_block *s, unsigned long magic, inode->i_ino = i; d_add(dentry, inode); } - s->s_root = root; return 0; -out: - d_genocide(root); - shrink_dcache_parent(root); - dput(root); - return -ENOMEM; } EXPORT_SYMBOL(simple_fill_super); diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 0d6cb3fdc0e1..ce5862482097 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -473,7 +473,6 @@ static struct ctl_table nlm_sysctls[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { } }; #endif /* CONFIG_SYSCTL */ diff --git a/fs/locks.c b/fs/locks.c index 46d88b9e222c..cc7c117ee192 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -111,7 +111,6 @@ static struct ctl_table locks_sysctls[] = { .proc_handler = proc_dointvec, }, #endif /* CONFIG_MMU */ - {} }; static int __init init_fs_locks_sysctls(void) diff --git a/fs/minix/dir.c b/fs/minix/dir.c index 62c313fc9a49..a224cf222570 100644 --- a/fs/minix/dir.c +++ b/fs/minix/dir.c @@ -26,12 +26,6 @@ const struct file_operations minix_dir_operations = { .fsync = generic_file_fsync, }; -static inline void dir_put_page(struct page *page) -{ - kunmap(page); - put_page(page); -} - /* * Return the offset into page `page_nr' of the last valid * byte in that page, plus one. @@ -70,13 +64,14 @@ static int minix_handle_dirsync(struct inode *dir) return err; } -static struct page * dir_get_page(struct inode *dir, unsigned long n) +static void *dir_get_page(struct inode *dir, unsigned long n, struct page **p) { struct address_space *mapping = dir->i_mapping; struct page *page = read_mapping_page(mapping, n, NULL); - if (!IS_ERR(page)) - kmap(page); - return page; + if (IS_ERR(page)) + return ERR_CAST(page); + *p = page; + return kmap_local_page(page); } static inline void *minix_next_entry(void *de, struct minix_sb_info *sbi) @@ -104,11 +99,11 @@ static int minix_readdir(struct file *file, struct dir_context *ctx) for ( ; n < npages; n++, offset = 0) { char *p, *kaddr, *limit; - struct page *page = dir_get_page(inode, n); + struct page *page; - if (IS_ERR(page)) + kaddr = dir_get_page(inode, n, &page); + if (IS_ERR(kaddr)) continue; - kaddr = (char *)page_address(page); p = kaddr+offset; limit = kaddr + minix_last_byte(inode, n) - chunk_size; for ( ; p <= limit; p = minix_next_entry(p, sbi)) { @@ -127,13 +122,13 @@ static int minix_readdir(struct file *file, struct dir_context *ctx) unsigned l = strnlen(name, sbi->s_namelen); if (!dir_emit(ctx, name, l, inumber, DT_UNKNOWN)) { - dir_put_page(page); + unmap_and_put_page(page, p); return 0; } } ctx->pos += chunk_size; } - dir_put_page(page); + unmap_and_put_page(page, kaddr); } return 0; } @@ -173,11 +168,10 @@ minix_dirent *minix_find_entry(struct dentry *dentry, struct page **res_page) for (n = 0; n < npages; n++) { char *kaddr, *limit; - page = dir_get_page(dir, n); - if (IS_ERR(page)) + kaddr = dir_get_page(dir, n, &page); + if (IS_ERR(kaddr)) continue; - kaddr = (char*)page_address(page); limit = kaddr + minix_last_byte(dir, n) - sbi->s_dirsize; for (p = kaddr; p <= limit; p = minix_next_entry(p, sbi)) { if (sbi->s_version == MINIX_V3) { @@ -194,7 +188,7 @@ minix_dirent *minix_find_entry(struct dentry *dentry, struct page **res_page) if (namecompare(namelen, sbi->s_namelen, name, namx)) goto found; } - dir_put_page(page); + unmap_and_put_page(page, kaddr); } return NULL; @@ -229,12 +223,10 @@ int minix_add_link(struct dentry *dentry, struct inode *inode) for (n = 0; n <= npages; n++) { char *limit, *dir_end; - page = dir_get_page(dir, n); - err = PTR_ERR(page); - if (IS_ERR(page)) - goto out; + kaddr = dir_get_page(dir, n, &page); + if (IS_ERR(kaddr)) + return PTR_ERR(kaddr); lock_page(page); - kaddr = (char*)page_address(page); dir_end = kaddr + minix_last_byte(dir, n); limit = kaddr + PAGE_SIZE - sbi->s_dirsize; for (p = kaddr; p <= limit; p = minix_next_entry(p, sbi)) { @@ -262,13 +254,13 @@ int minix_add_link(struct dentry *dentry, struct inode *inode) goto out_unlock; } unlock_page(page); - dir_put_page(page); + unmap_and_put_page(page, kaddr); } BUG(); return -EINVAL; got_it: - pos = page_offset(page) + p - (char *)page_address(page); + pos = page_offset(page) + offset_in_page(p); err = minix_prepare_chunk(page, pos, sbi->s_dirsize); if (err) goto out_unlock; @@ -285,8 +277,7 @@ got_it: mark_inode_dirty(dir); err = minix_handle_dirsync(dir); out_put: - dir_put_page(page); -out: + unmap_and_put_page(page, kaddr); return err; out_unlock: unlock_page(page); @@ -296,8 +287,7 @@ out_unlock: int minix_delete_entry(struct minix_dir_entry *de, struct page *page) { struct inode *inode = page->mapping->host; - char *kaddr = page_address(page); - loff_t pos = page_offset(page) + (char*)de - kaddr; + loff_t pos = page_offset(page) + offset_in_page(de); struct minix_sb_info *sbi = minix_sb(inode->i_sb); unsigned len = sbi->s_dirsize; int err; @@ -333,7 +323,7 @@ int minix_make_empty(struct inode *inode, struct inode *dir) goto fail; } - kaddr = kmap_atomic(page); + kaddr = kmap_local_page(page); memset(kaddr, 0, PAGE_SIZE); if (sbi->s_version == MINIX_V3) { @@ -353,7 +343,7 @@ int minix_make_empty(struct inode *inode, struct inode *dir) de->inode = dir->i_ino; strcpy(de->name, ".."); } - kunmap_atomic(kaddr); + kunmap_local(kaddr); dir_commit_chunk(page, 0, 2 * sbi->s_dirsize); err = minix_handle_dirsync(inode); @@ -370,17 +360,16 @@ int minix_empty_dir(struct inode * inode) struct page *page = NULL; unsigned long i, npages = dir_pages(inode); struct minix_sb_info *sbi = minix_sb(inode->i_sb); - char *name; + char *name, *kaddr; __u32 inumber; for (i = 0; i < npages; i++) { - char *p, *kaddr, *limit; + char *p, *limit; - page = dir_get_page(inode, i); - if (IS_ERR(page)) + kaddr = dir_get_page(inode, i, &page); + if (IS_ERR(kaddr)) continue; - kaddr = (char *)page_address(page); limit = kaddr + minix_last_byte(inode, i) - sbi->s_dirsize; for (p = kaddr; p <= limit; p = minix_next_entry(p, sbi)) { if (sbi->s_version == MINIX_V3) { @@ -406,12 +395,12 @@ int minix_empty_dir(struct inode * inode) goto not_empty; } } - dir_put_page(page); + unmap_and_put_page(page, kaddr); } return 1; not_empty: - dir_put_page(page); + unmap_and_put_page(page, kaddr); return 0; } @@ -421,8 +410,7 @@ int minix_set_link(struct minix_dir_entry *de, struct page *page, { struct inode *dir = page->mapping->host; struct minix_sb_info *sbi = minix_sb(dir->i_sb); - loff_t pos = page_offset(page) + - (char *)de-(char*)page_address(page); + loff_t pos = page_offset(page) + offset_in_page(de); int err; lock_page(page); @@ -443,15 +431,12 @@ int minix_set_link(struct minix_dir_entry *de, struct page *page, struct minix_dir_entry * minix_dotdot (struct inode *dir, struct page **p) { - struct page *page = dir_get_page(dir, 0); struct minix_sb_info *sbi = minix_sb(dir->i_sb); - struct minix_dir_entry *de = NULL; + struct minix_dir_entry *de = dir_get_page(dir, 0, p); - if (!IS_ERR(page)) { - de = minix_next_entry(page_address(page), sbi); - *p = page; - } - return de; + if (!IS_ERR(de)) + return minix_next_entry(de, sbi); + return NULL; } ino_t minix_inode_by_name(struct dentry *dentry) @@ -469,7 +454,7 @@ ino_t minix_inode_by_name(struct dentry *dentry) res = ((minix3_dirent *) de)->inode; else res = de->inode; - dir_put_page(page); + unmap_and_put_page(page, de); } return res; } diff --git a/fs/minix/namei.c b/fs/minix/namei.c index 114084d5636a..d6031acc34f0 100644 --- a/fs/minix/namei.c +++ b/fs/minix/namei.c @@ -149,8 +149,7 @@ static int minix_unlink(struct inode * dir, struct dentry *dentry) if (!de) return -ENOENT; err = minix_delete_entry(de, page); - kunmap(page); - put_page(page); + unmap_and_put_page(page, de); if (err) return err; @@ -242,13 +241,10 @@ static int minix_rename(struct mnt_idmap *idmap, inode_dec_link_count(old_dir); } out_dir: - if (dir_de) { - kunmap(dir_page); - put_page(dir_page); - } + if (dir_de) + unmap_and_put_page(dir_page, dir_de); out_old: - kunmap(old_page); - put_page(old_page); + unmap_and_put_page(old_page, old_de); out: return err; } diff --git a/fs/namei.c b/fs/namei.c index faae721e4d63..4e0de939fea1 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1071,7 +1071,6 @@ static struct ctl_table namei_sysctls[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, - { } }; static int __init init_fs_namei_sysctls(void) @@ -2573,13 +2572,13 @@ static int filename_parentat(int dfd, struct filename *name, } /* does lookup, returns the object with parent locked */ -static struct dentry *__kern_path_locked(struct filename *name, struct path *path) +static struct dentry *__kern_path_locked(int dfd, struct filename *name, struct path *path) { struct dentry *d; struct qstr last; int type, error; - error = filename_parentat(AT_FDCWD, name, 0, path, &last, &type); + error = filename_parentat(dfd, name, 0, path, &last, &type); if (error) return ERR_PTR(error); if (unlikely(type != LAST_NORM)) { @@ -2598,12 +2597,22 @@ static struct dentry *__kern_path_locked(struct filename *name, struct path *pat struct dentry *kern_path_locked(const char *name, struct path *path) { struct filename *filename = getname_kernel(name); - struct dentry *res = __kern_path_locked(filename, path); + struct dentry *res = __kern_path_locked(AT_FDCWD, filename, path); putname(filename); return res; } +struct dentry *user_path_locked_at(int dfd, const char __user *name, struct path *path) +{ + struct filename *filename = getname(name); + struct dentry *res = __kern_path_locked(dfd, filename, path); + + putname(filename); + return res; +} +EXPORT_SYMBOL(user_path_locked_at); + int kern_path(const char *name, unsigned int flags, struct path *path) { struct filename *filename = getname_kernel(name); @@ -3014,27 +3023,37 @@ static inline int may_create(struct mnt_idmap *idmap, return inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC); } +// p1 != p2, both are on the same filesystem, ->s_vfs_rename_mutex is held static struct dentry *lock_two_directories(struct dentry *p1, struct dentry *p2) { - struct dentry *p; + struct dentry *p = p1, *q = p2, *r; - p = d_ancestor(p2, p1); - if (p) { + while ((r = p->d_parent) != p2 && r != p) + p = r; + if (r == p2) { + // p is a child of p2 and an ancestor of p1 or p1 itself inode_lock_nested(p2->d_inode, I_MUTEX_PARENT); - inode_lock_nested(p1->d_inode, I_MUTEX_CHILD); + inode_lock_nested(p1->d_inode, I_MUTEX_PARENT2); return p; } - - p = d_ancestor(p1, p2); - if (p) { + // p is the root of connected component that contains p1 + // p2 does not occur on the path from p to p1 + while ((r = q->d_parent) != p1 && r != p && r != q) + q = r; + if (r == p1) { + // q is a child of p1 and an ancestor of p2 or p2 itself inode_lock_nested(p1->d_inode, I_MUTEX_PARENT); - inode_lock_nested(p2->d_inode, I_MUTEX_CHILD); - return p; + inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2); + return q; + } else if (likely(r == p)) { + // both p2 and p1 are descendents of p + inode_lock_nested(p1->d_inode, I_MUTEX_PARENT); + inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2); + return NULL; + } else { // no common ancestor at the time we'd been called + mutex_unlock(&p1->d_sb->s_vfs_rename_mutex); + return ERR_PTR(-EXDEV); } - - lock_two_inodes(p1->d_inode, p2->d_inode, - I_MUTEX_PARENT, I_MUTEX_PARENT2); - return NULL; } /* @@ -4713,11 +4732,12 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname * * a) we can get into loop creation. * b) race potential - two innocent renames can create a loop together. - * That's where 4.4 screws up. Current fix: serialization on + * That's where 4.4BSD screws up. Current fix: serialization on * sb->s_vfs_rename_mutex. We might be more accurate, but that's another * story. - * c) we have to lock _four_ objects - parents and victim (if it exists), - * and source. + * c) we may have to lock up to _four_ objects - parents and victim (if it exists), + * and source (if it's a non-directory or a subdirectory that moves to + * different parent). * And that - after we got ->i_mutex on parents (until then we don't know * whether the target exists). Solution: try to be smart with locking * order for inodes. We rely on the fact that tree topology may change @@ -4749,6 +4769,7 @@ int vfs_rename(struct renamedata *rd) bool new_is_dir = false; unsigned max_links = new_dir->i_sb->s_max_links; struct name_snapshot old_name; + bool lock_old_subdir, lock_new_subdir; if (source == target) return 0; @@ -4802,15 +4823,32 @@ int vfs_rename(struct renamedata *rd) take_dentry_name_snapshot(&old_name, old_dentry); dget(new_dentry); /* - * Lock all moved children. Moved directories may need to change parent - * pointer so they need the lock to prevent against concurrent - * directory changes moving parent pointer. For regular files we've - * historically always done this. The lockdep locking subclasses are - * somewhat arbitrary but RENAME_EXCHANGE in particular can swap - * regular files and directories so it's difficult to tell which - * subclasses to use. + * Lock children. + * The source subdirectory needs to be locked on cross-directory + * rename or cross-directory exchange since its parent changes. + * The target subdirectory needs to be locked on cross-directory + * exchange due to parent change and on any rename due to becoming + * a victim. + * Non-directories need locking in all cases (for NFS reasons); + * they get locked after any subdirectories (in inode address order). + * + * NOTE: WE ONLY LOCK UNRELATED DIRECTORIES IN CROSS-DIRECTORY CASE. + * NEVER, EVER DO THAT WITHOUT ->s_vfs_rename_mutex. */ - lock_two_inodes(source, target, I_MUTEX_NORMAL, I_MUTEX_NONDIR2); + lock_old_subdir = new_dir != old_dir; + lock_new_subdir = new_dir != old_dir || !(flags & RENAME_EXCHANGE); + if (is_dir) { + if (lock_old_subdir) + inode_lock_nested(source, I_MUTEX_CHILD); + if (target && (!new_is_dir || lock_new_subdir)) + inode_lock(target); + } else if (new_is_dir) { + if (lock_new_subdir) + inode_lock_nested(target, I_MUTEX_CHILD); + inode_lock(source); + } else { + lock_two_nondirectories(source, target); + } error = -EPERM; if (IS_SWAPFILE(source) || (target && IS_SWAPFILE(target))) @@ -4858,8 +4896,9 @@ int vfs_rename(struct renamedata *rd) d_exchange(old_dentry, new_dentry); } out: - inode_unlock(source); - if (target) + if (!is_dir || lock_old_subdir) + inode_unlock(source); + if (target && (!new_is_dir || lock_new_subdir)) inode_unlock(target); dput(new_dentry); if (!error) { @@ -4930,6 +4969,10 @@ retry: retry_deleg: trap = lock_rename(new_path.dentry, old_path.dentry); + if (IS_ERR(trap)) { + error = PTR_ERR(trap); + goto exit_lock_rename; + } old_dentry = lookup_one_qstr_excl(&old_last, old_path.dentry, lookup_flags); @@ -4997,6 +5040,7 @@ exit4: dput(old_dentry); exit3: unlock_rename(new_path.dentry, old_path.dentry); +exit_lock_rename: if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) diff --git a/fs/namespace.c b/fs/namespace.c index 95b2fff91f67..437f60e96d40 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -5042,13 +5042,12 @@ static struct mount *listmnt_next(struct mount *curr) return node_to_mount(rb_next(&curr->mnt_node)); } -static ssize_t do_listmount(struct mount *first, struct path *orig, u64 mnt_id, - u64 __user *buf, size_t bufsize, - const struct path *root) +static ssize_t do_listmount(struct mount *first, struct path *orig, + u64 mnt_parent_id, u64 __user *mnt_ids, + size_t nr_mnt_ids, const struct path *root) { struct mount *r; - ssize_t ctr; - int err; + ssize_t ret; /* * Don't trigger audit denials. We just want to determine what @@ -5058,50 +5057,57 @@ static ssize_t do_listmount(struct mount *first, struct path *orig, u64 mnt_id, !ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN)) return -EPERM; - err = security_sb_statfs(orig->dentry); - if (err) - return err; + ret = security_sb_statfs(orig->dentry); + if (ret) + return ret; - for (ctr = 0, r = first; r && ctr < bufsize; r = listmnt_next(r)) { - if (r->mnt_id_unique == mnt_id) + for (ret = 0, r = first; r && nr_mnt_ids; r = listmnt_next(r)) { + if (r->mnt_id_unique == mnt_parent_id) continue; if (!is_path_reachable(r, r->mnt.mnt_root, orig)) continue; - ctr = array_index_nospec(ctr, bufsize); - if (put_user(r->mnt_id_unique, buf + ctr)) + if (put_user(r->mnt_id_unique, mnt_ids)) return -EFAULT; - if (check_add_overflow(ctr, 1, &ctr)) - return -ERANGE; + mnt_ids++; + nr_mnt_ids--; + ret++; } - return ctr; + return ret; } -SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req, - u64 __user *, buf, size_t, bufsize, unsigned int, flags) +SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req, u64 __user *, + mnt_ids, size_t, nr_mnt_ids, unsigned int, flags) { struct mnt_namespace *ns = current->nsproxy->mnt_ns; struct mnt_id_req kreq; struct mount *first; struct path root, orig; - u64 mnt_id, last_mnt_id; + u64 mnt_parent_id, last_mnt_id; + const size_t maxcount = (size_t)-1 >> 3; ssize_t ret; if (flags) return -EINVAL; + if (unlikely(nr_mnt_ids > maxcount)) + return -EFAULT; + + if (!access_ok(mnt_ids, nr_mnt_ids * sizeof(*mnt_ids))) + return -EFAULT; + ret = copy_mnt_id_req(req, &kreq); if (ret) return ret; - mnt_id = kreq.mnt_id; + mnt_parent_id = kreq.mnt_id; last_mnt_id = kreq.param; down_read(&namespace_sem); get_fs_root(current->fs, &root); - if (mnt_id == LSMT_ROOT) { + if (mnt_parent_id == LSMT_ROOT) { orig = root; } else { ret = -ENOENT; - orig.mnt = lookup_mnt_in_ns(mnt_id, ns); + orig.mnt = lookup_mnt_in_ns(mnt_parent_id, ns); if (!orig.mnt) goto err; orig.dentry = orig.mnt->mnt_root; @@ -5111,7 +5117,7 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req, else first = mnt_find_id_at(ns, last_mnt_id + 1); - ret = do_listmount(first, &orig, mnt_id, buf, bufsize, &root); + ret = do_listmount(first, &orig, mnt_parent_id, mnt_ids, nr_mnt_ids, &root); err: path_put(&root); up_read(&namespace_sem); @@ -5447,7 +5453,6 @@ static struct ctl_table fs_namespace_sysctls[] = { .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE, }, - { } }; static int __init init_fs_namespace_sysctls(void) diff --git a/fs/netfs/Kconfig b/fs/netfs/Kconfig index b4db21022cb4..bec805e0c44c 100644 --- a/fs/netfs/Kconfig +++ b/fs/netfs/Kconfig @@ -21,3 +21,42 @@ config NETFS_STATS multi-CPU system these may be on cachelines that keep bouncing between CPUs. On the other hand, the stats are very useful for debugging purposes. Saying 'Y' here is recommended. + +config FSCACHE + bool "General filesystem local caching manager" + depends on NETFS_SUPPORT + help + This option enables a generic filesystem caching manager that can be + used by various network and other filesystems to cache data locally. + Different sorts of caches can be plugged in, depending on the + resources available. + + See Documentation/filesystems/caching/fscache.rst for more information. + +config FSCACHE_STATS + bool "Gather statistical information on local caching" + depends on FSCACHE && PROC_FS + select NETFS_STATS + help + This option causes statistical information to be gathered on local + caching and exported through file: + + /proc/fs/fscache/stats + + The gathering of statistics adds a certain amount of overhead to + execution as there are a quite a few stats gathered, and on a + multi-CPU system these may be on cachelines that keep bouncing + between CPUs. On the other hand, the stats are very useful for + debugging purposes. Saying 'Y' here is recommended. + + See Documentation/filesystems/caching/fscache.rst for more information. + +config FSCACHE_DEBUG + bool "Debug FS-Cache" + depends on FSCACHE + help + This permits debugging to be dynamically enabled in the local caching + management module. If this is set, the debugging output may be + enabled by setting bits in /sys/modules/fscache/parameter/debug. + + See Documentation/filesystems/caching/fscache.rst for more information. diff --git a/fs/netfs/Makefile b/fs/netfs/Makefile index 386d6fb92793..d4d1d799819e 100644 --- a/fs/netfs/Makefile +++ b/fs/netfs/Makefile @@ -2,11 +2,29 @@ netfs-y := \ buffered_read.o \ + buffered_write.o \ + direct_read.o \ + direct_write.o \ io.o \ iterator.o \ + locking.o \ main.o \ - objects.o + misc.o \ + objects.o \ + output.o netfs-$(CONFIG_NETFS_STATS) += stats.o -obj-$(CONFIG_NETFS_SUPPORT) := netfs.o +netfs-$(CONFIG_FSCACHE) += \ + fscache_cache.o \ + fscache_cookie.o \ + fscache_io.o \ + fscache_main.o \ + fscache_volume.o + +ifeq ($(CONFIG_PROC_FS),y) +netfs-$(CONFIG_FSCACHE) += fscache_proc.o +endif +netfs-$(CONFIG_FSCACHE_STATS) += fscache_stats.o + +obj-$(CONFIG_NETFS_SUPPORT) += netfs.o diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index 2cd3ccf4c439..3298c29b5548 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -16,6 +16,7 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) { struct netfs_io_subrequest *subreq; + struct netfs_folio *finfo; struct folio *folio; pgoff_t start_page = rreq->start / PAGE_SIZE; pgoff_t last_page = ((rreq->start + rreq->len) / PAGE_SIZE) - 1; @@ -63,6 +64,7 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) break; } if (!folio_started && test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) { + trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache); folio_start_fscache(folio); folio_started = true; } @@ -86,11 +88,20 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) if (!pg_failed) { flush_dcache_folio(folio); + finfo = netfs_folio_info(folio); + if (finfo) { + trace_netfs_folio(folio, netfs_folio_trace_filled_gaps); + if (finfo->netfs_group) + folio_change_private(folio, finfo->netfs_group); + else + folio_detach_private(folio); + kfree(finfo); + } folio_mark_uptodate(folio); } if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) { - if (folio_index(folio) == rreq->no_unlock_folio && + if (folio->index == rreq->no_unlock_folio && test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags)) _debug("no unlock"); else @@ -147,6 +158,15 @@ static void netfs_rreq_expand(struct netfs_io_request *rreq, } } +/* + * Begin an operation, and fetch the stored zero point value from the cookie if + * available. + */ +static int netfs_begin_cache_read(struct netfs_io_request *rreq, struct netfs_inode *ctx) +{ + return fscache_begin_read_operation(&rreq->cache_resources, netfs_i_cookie(ctx)); +} + /** * netfs_readahead - Helper to manage a read request * @ractl: The description of the readahead request @@ -180,11 +200,9 @@ void netfs_readahead(struct readahead_control *ractl) if (IS_ERR(rreq)) return; - if (ctx->ops->begin_cache_operation) { - ret = ctx->ops->begin_cache_operation(rreq); - if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS) - goto cleanup_free; - } + ret = netfs_begin_cache_read(rreq, ctx); + if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS) + goto cleanup_free; netfs_stat(&netfs_n_rh_readahead); trace_netfs_read(rreq, readahead_pos(ractl), readahead_length(ractl), @@ -192,6 +210,10 @@ void netfs_readahead(struct readahead_control *ractl) netfs_rreq_expand(rreq, ractl); + /* Set up the output buffer */ + iov_iter_xarray(&rreq->iter, ITER_DEST, &ractl->mapping->i_pages, + rreq->start, rreq->len); + /* Drop the refs on the folios here rather than in the cache or * filesystem. The locks will be dropped in netfs_rreq_unlock(). */ @@ -199,6 +221,7 @@ void netfs_readahead(struct readahead_control *ractl) ; netfs_begin_read(rreq, false); + netfs_put_request(rreq, false, netfs_rreq_trace_put_return); return; cleanup_free: @@ -223,12 +246,13 @@ EXPORT_SYMBOL(netfs_readahead); */ int netfs_read_folio(struct file *file, struct folio *folio) { - struct address_space *mapping = folio_file_mapping(folio); + struct address_space *mapping = folio->mapping; struct netfs_io_request *rreq; struct netfs_inode *ctx = netfs_inode(mapping->host); + struct folio *sink = NULL; int ret; - _enter("%lx", folio_index(folio)); + _enter("%lx", folio->index); rreq = netfs_alloc_request(mapping, file, folio_file_pos(folio), folio_size(folio), @@ -238,15 +262,64 @@ int netfs_read_folio(struct file *file, struct folio *folio) goto alloc_error; } - if (ctx->ops->begin_cache_operation) { - ret = ctx->ops->begin_cache_operation(rreq); - if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS) - goto discard; - } + ret = netfs_begin_cache_read(rreq, ctx); + if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS) + goto discard; netfs_stat(&netfs_n_rh_readpage); trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_readpage); - return netfs_begin_read(rreq, true); + + /* Set up the output buffer */ + if (folio_test_dirty(folio)) { + /* Handle someone trying to read from an unflushed streaming + * write. We fiddle the buffer so that a gap at the beginning + * and/or a gap at the end get copied to, but the middle is + * discarded. + */ + struct netfs_folio *finfo = netfs_folio_info(folio); + struct bio_vec *bvec; + unsigned int from = finfo->dirty_offset; + unsigned int to = from + finfo->dirty_len; + unsigned int off = 0, i = 0; + size_t flen = folio_size(folio); + size_t nr_bvec = flen / PAGE_SIZE + 2; + size_t part; + + ret = -ENOMEM; + bvec = kmalloc_array(nr_bvec, sizeof(*bvec), GFP_KERNEL); + if (!bvec) + goto discard; + + sink = folio_alloc(GFP_KERNEL, 0); + if (!sink) + goto discard; + + trace_netfs_folio(folio, netfs_folio_trace_read_gaps); + + rreq->direct_bv = bvec; + rreq->direct_bv_count = nr_bvec; + if (from > 0) { + bvec_set_folio(&bvec[i++], folio, from, 0); + off = from; + } + while (off < to) { + part = min_t(size_t, to - off, PAGE_SIZE); + bvec_set_folio(&bvec[i++], sink, part, 0); + off += part; + } + if (to < flen) + bvec_set_folio(&bvec[i++], folio, flen - to, to); + iov_iter_bvec(&rreq->iter, ITER_DEST, bvec, i, rreq->len); + } else { + iov_iter_xarray(&rreq->iter, ITER_DEST, &mapping->i_pages, + rreq->start, rreq->len); + } + + ret = netfs_begin_read(rreq, true); + if (sink) + folio_put(sink); + netfs_put_request(rreq, false, netfs_rreq_trace_put_return); + return ret < 0 ? ret : 0; discard: netfs_put_request(rreq, false, netfs_rreq_trace_put_discard); @@ -387,14 +460,12 @@ retry: ret = PTR_ERR(rreq); goto error; } - rreq->no_unlock_folio = folio_index(folio); + rreq->no_unlock_folio = folio->index; __set_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags); - if (ctx->ops->begin_cache_operation) { - ret = ctx->ops->begin_cache_operation(rreq); - if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS) - goto error_put; - } + ret = netfs_begin_cache_read(rreq, ctx); + if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS) + goto error_put; netfs_stat(&netfs_n_rh_write_begin); trace_netfs_read(rreq, pos, len, netfs_read_trace_write_begin); @@ -405,6 +476,10 @@ retry: ractl._nr_pages = folio_nr_pages(folio); netfs_rreq_expand(rreq, &ractl); + /* Set up the output buffer */ + iov_iter_xarray(&rreq->iter, ITER_DEST, &mapping->i_pages, + rreq->start, rreq->len); + /* We hold the folio locks, so we can drop the references */ folio_get(folio); while (readahead_folio(&ractl)) @@ -413,6 +488,7 @@ retry: ret = netfs_begin_read(rreq, true); if (ret < 0) goto error; + netfs_put_request(rreq, false, netfs_rreq_trace_put_return); have_folio: ret = folio_wait_fscache_killable(folio); @@ -434,3 +510,124 @@ error: return ret; } EXPORT_SYMBOL(netfs_write_begin); + +/* + * Preload the data into a page we're proposing to write into. + */ +int netfs_prefetch_for_write(struct file *file, struct folio *folio, + size_t offset, size_t len) +{ + struct netfs_io_request *rreq; + struct address_space *mapping = folio->mapping; + struct netfs_inode *ctx = netfs_inode(mapping->host); + unsigned long long start = folio_pos(folio); + size_t flen = folio_size(folio); + int ret; + + _enter("%zx @%llx", flen, start); + + ret = -ENOMEM; + + rreq = netfs_alloc_request(mapping, file, start, flen, + NETFS_READ_FOR_WRITE); + if (IS_ERR(rreq)) { + ret = PTR_ERR(rreq); + goto error; + } + + rreq->no_unlock_folio = folio->index; + __set_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags); + ret = netfs_begin_cache_read(rreq, ctx); + if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS) + goto error_put; + + netfs_stat(&netfs_n_rh_write_begin); + trace_netfs_read(rreq, start, flen, netfs_read_trace_prefetch_for_write); + + /* Set up the output buffer */ + iov_iter_xarray(&rreq->iter, ITER_DEST, &mapping->i_pages, + rreq->start, rreq->len); + + ret = netfs_begin_read(rreq, true); + netfs_put_request(rreq, false, netfs_rreq_trace_put_return); + return ret; + +error_put: + netfs_put_request(rreq, false, netfs_rreq_trace_put_discard); +error: + _leave(" = %d", ret); + return ret; +} + +/** + * netfs_buffered_read_iter - Filesystem buffered I/O read routine + * @iocb: kernel I/O control block + * @iter: destination for the data read + * + * This is the ->read_iter() routine for all filesystems that can use the page + * cache directly. + * + * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall be + * returned when no data can be read without waiting for I/O requests to + * complete; it doesn't prevent readahead. + * + * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O requests + * shall be made for the read or for readahead. When no data can be read, + * -EAGAIN shall be returned. When readahead would be triggered, a partial, + * possibly empty read shall be returned. + * + * Return: + * * number of bytes copied, even for partial reads + * * negative error code (or 0 if IOCB_NOIO) if nothing was read + */ +ssize_t netfs_buffered_read_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct netfs_inode *ictx = netfs_inode(inode); + ssize_t ret; + + if (WARN_ON_ONCE((iocb->ki_flags & IOCB_DIRECT) || + test_bit(NETFS_ICTX_UNBUFFERED, &ictx->flags))) + return -EINVAL; + + ret = netfs_start_io_read(inode); + if (ret == 0) { + ret = filemap_read(iocb, iter, 0); + netfs_end_io_read(inode); + } + return ret; +} +EXPORT_SYMBOL(netfs_buffered_read_iter); + +/** + * netfs_file_read_iter - Generic filesystem read routine + * @iocb: kernel I/O control block + * @iter: destination for the data read + * + * This is the ->read_iter() routine for all filesystems that can use the page + * cache directly. + * + * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall be + * returned when no data can be read without waiting for I/O requests to + * complete; it doesn't prevent readahead. + * + * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O requests + * shall be made for the read or for readahead. When no data can be read, + * -EAGAIN shall be returned. When readahead would be triggered, a partial, + * possibly empty read shall be returned. + * + * Return: + * * number of bytes copied, even for partial reads + * * negative error code (or 0 if IOCB_NOIO) if nothing was read + */ +ssize_t netfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + struct netfs_inode *ictx = netfs_inode(iocb->ki_filp->f_mapping->host); + + if ((iocb->ki_flags & IOCB_DIRECT) || + test_bit(NETFS_ICTX_UNBUFFERED, &ictx->flags)) + return netfs_unbuffered_read_iter(iocb, iter); + + return netfs_buffered_read_iter(iocb, iter); +} +EXPORT_SYMBOL(netfs_file_read_iter); diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c new file mode 100644 index 000000000000..a3059b3168fd --- /dev/null +++ b/fs/netfs/buffered_write.c @@ -0,0 +1,1254 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Network filesystem high-level write support. + * + * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/export.h> +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/pagemap.h> +#include <linux/slab.h> +#include <linux/pagevec.h> +#include "internal.h" + +/* + * Determined write method. Adjust netfs_folio_traces if this is changed. + */ +enum netfs_how_to_modify { + NETFS_FOLIO_IS_UPTODATE, /* Folio is uptodate already */ + NETFS_JUST_PREFETCH, /* We have to read the folio anyway */ + NETFS_WHOLE_FOLIO_MODIFY, /* We're going to overwrite the whole folio */ + NETFS_MODIFY_AND_CLEAR, /* We can assume there is no data to be downloaded. */ + NETFS_STREAMING_WRITE, /* Store incomplete data in non-uptodate page. */ + NETFS_STREAMING_WRITE_CONT, /* Continue streaming write. */ + NETFS_FLUSH_CONTENT, /* Flush incompatible content. */ +}; + +static void netfs_cleanup_buffered_write(struct netfs_io_request *wreq); + +static void netfs_set_group(struct folio *folio, struct netfs_group *netfs_group) +{ + if (netfs_group && !folio_get_private(folio)) + folio_attach_private(folio, netfs_get_group(netfs_group)); +} + +#if IS_ENABLED(CONFIG_FSCACHE) +static void netfs_folio_start_fscache(bool caching, struct folio *folio) +{ + if (caching) + folio_start_fscache(folio); +} +#else +static void netfs_folio_start_fscache(bool caching, struct folio *folio) +{ +} +#endif + +/* + * Decide how we should modify a folio. We might be attempting to do + * write-streaming, in which case we don't want to a local RMW cycle if we can + * avoid it. If we're doing local caching or content crypto, we award that + * priority over avoiding RMW. If the file is open readably, then we also + * assume that we may want to read what we wrote. + */ +static enum netfs_how_to_modify netfs_how_to_modify(struct netfs_inode *ctx, + struct file *file, + struct folio *folio, + void *netfs_group, + size_t flen, + size_t offset, + size_t len, + bool maybe_trouble) +{ + struct netfs_folio *finfo = netfs_folio_info(folio); + loff_t pos = folio_file_pos(folio); + + _enter(""); + + if (netfs_folio_group(folio) != netfs_group) + return NETFS_FLUSH_CONTENT; + + if (folio_test_uptodate(folio)) + return NETFS_FOLIO_IS_UPTODATE; + + if (pos >= ctx->zero_point) + return NETFS_MODIFY_AND_CLEAR; + + if (!maybe_trouble && offset == 0 && len >= flen) + return NETFS_WHOLE_FOLIO_MODIFY; + + if (file->f_mode & FMODE_READ) + goto no_write_streaming; + if (test_bit(NETFS_ICTX_NO_WRITE_STREAMING, &ctx->flags)) + goto no_write_streaming; + + if (netfs_is_cache_enabled(ctx)) { + /* We don't want to get a streaming write on a file that loses + * caching service temporarily because the backing store got + * culled. + */ + if (!test_bit(NETFS_ICTX_NO_WRITE_STREAMING, &ctx->flags)) + set_bit(NETFS_ICTX_NO_WRITE_STREAMING, &ctx->flags); + goto no_write_streaming; + } + + if (!finfo) + return NETFS_STREAMING_WRITE; + + /* We can continue a streaming write only if it continues on from the + * previous. If it overlaps, we must flush lest we suffer a partial + * copy and disjoint dirty regions. + */ + if (offset == finfo->dirty_offset + finfo->dirty_len) + return NETFS_STREAMING_WRITE_CONT; + return NETFS_FLUSH_CONTENT; + +no_write_streaming: + if (finfo) { + netfs_stat(&netfs_n_wh_wstream_conflict); + return NETFS_FLUSH_CONTENT; + } + return NETFS_JUST_PREFETCH; +} + +/* + * Grab a folio for writing and lock it. Attempt to allocate as large a folio + * as possible to hold as much of the remaining length as possible in one go. + */ +static struct folio *netfs_grab_folio_for_write(struct address_space *mapping, + loff_t pos, size_t part) +{ + pgoff_t index = pos / PAGE_SIZE; + fgf_t fgp_flags = FGP_WRITEBEGIN; + + if (mapping_large_folio_support(mapping)) + fgp_flags |= fgf_set_order(pos % PAGE_SIZE + part); + + return __filemap_get_folio(mapping, index, fgp_flags, + mapping_gfp_mask(mapping)); +} + +/** + * netfs_perform_write - Copy data into the pagecache. + * @iocb: The operation parameters + * @iter: The source buffer + * @netfs_group: Grouping for dirty pages (eg. ceph snaps). + * + * Copy data into pagecache pages attached to the inode specified by @iocb. + * The caller must hold appropriate inode locks. + * + * Dirty pages are tagged with a netfs_folio struct if they're not up to date + * to indicate the range modified. Dirty pages may also be tagged with a + * netfs-specific grouping such that data from an old group gets flushed before + * a new one is started. + */ +ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, + struct netfs_group *netfs_group) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + struct address_space *mapping = inode->i_mapping; + struct netfs_inode *ctx = netfs_inode(inode); + struct writeback_control wbc = { + .sync_mode = WB_SYNC_NONE, + .for_sync = true, + .nr_to_write = LONG_MAX, + .range_start = iocb->ki_pos, + .range_end = iocb->ki_pos + iter->count, + }; + struct netfs_io_request *wreq = NULL; + struct netfs_folio *finfo; + struct folio *folio; + enum netfs_how_to_modify howto; + enum netfs_folio_trace trace; + unsigned int bdp_flags = (iocb->ki_flags & IOCB_SYNC) ? 0: BDP_ASYNC; + ssize_t written = 0, ret; + loff_t i_size, pos = iocb->ki_pos, from, to; + size_t max_chunk = PAGE_SIZE << MAX_PAGECACHE_ORDER; + bool maybe_trouble = false; + + if (unlikely(test_bit(NETFS_ICTX_WRITETHROUGH, &ctx->flags) || + iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC)) + ) { + if (pos < i_size_read(inode)) { + ret = filemap_write_and_wait_range(mapping, pos, pos + iter->count); + if (ret < 0) { + goto out; + } + } + + wbc_attach_fdatawrite_inode(&wbc, mapping->host); + + wreq = netfs_begin_writethrough(iocb, iter->count); + if (IS_ERR(wreq)) { + wbc_detach_inode(&wbc); + ret = PTR_ERR(wreq); + wreq = NULL; + goto out; + } + if (!is_sync_kiocb(iocb)) + wreq->iocb = iocb; + wreq->cleanup = netfs_cleanup_buffered_write; + } + + do { + size_t flen; + size_t offset; /* Offset into pagecache folio */ + size_t part; /* Bytes to write to folio */ + size_t copied; /* Bytes copied from user */ + + ret = balance_dirty_pages_ratelimited_flags(mapping, bdp_flags); + if (unlikely(ret < 0)) + break; + + offset = pos & (max_chunk - 1); + part = min(max_chunk - offset, iov_iter_count(iter)); + + /* Bring in the user pages that we will copy from _first_ lest + * we hit a nasty deadlock on copying from the same page as + * we're writing to, without it being marked uptodate. + * + * Not only is this an optimisation, but it is also required to + * check that the address is actually valid, when atomic + * usercopies are used below. + * + * We rely on the page being held onto long enough by the LRU + * that we can grab it below if this causes it to be read. + */ + ret = -EFAULT; + if (unlikely(fault_in_iov_iter_readable(iter, part) == part)) + break; + + folio = netfs_grab_folio_for_write(mapping, pos, part); + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); + break; + } + + flen = folio_size(folio); + offset = pos & (flen - 1); + part = min_t(size_t, flen - offset, part); + + if (signal_pending(current)) { + ret = written ? -EINTR : -ERESTARTSYS; + goto error_folio_unlock; + } + + /* See if we need to prefetch the area we're going to modify. + * We need to do this before we get a lock on the folio in case + * there's more than one writer competing for the same cache + * block. + */ + howto = netfs_how_to_modify(ctx, file, folio, netfs_group, + flen, offset, part, maybe_trouble); + _debug("howto %u", howto); + switch (howto) { + case NETFS_JUST_PREFETCH: + ret = netfs_prefetch_for_write(file, folio, offset, part); + if (ret < 0) { + _debug("prefetch = %zd", ret); + goto error_folio_unlock; + } + break; + case NETFS_FOLIO_IS_UPTODATE: + case NETFS_WHOLE_FOLIO_MODIFY: + case NETFS_STREAMING_WRITE_CONT: + break; + case NETFS_MODIFY_AND_CLEAR: + zero_user_segment(&folio->page, 0, offset); + break; + case NETFS_STREAMING_WRITE: + ret = -EIO; + if (WARN_ON(folio_get_private(folio))) + goto error_folio_unlock; + break; + case NETFS_FLUSH_CONTENT: + trace_netfs_folio(folio, netfs_flush_content); + from = folio_pos(folio); + to = from + folio_size(folio) - 1; + folio_unlock(folio); + folio_put(folio); + ret = filemap_write_and_wait_range(mapping, from, to); + if (ret < 0) + goto error_folio_unlock; + continue; + } + + if (mapping_writably_mapped(mapping)) + flush_dcache_folio(folio); + + copied = copy_folio_from_iter_atomic(folio, offset, part, iter); + + flush_dcache_folio(folio); + + /* Deal with a (partially) failed copy */ + if (copied == 0) { + ret = -EFAULT; + goto error_folio_unlock; + } + + trace = (enum netfs_folio_trace)howto; + switch (howto) { + case NETFS_FOLIO_IS_UPTODATE: + case NETFS_JUST_PREFETCH: + netfs_set_group(folio, netfs_group); + break; + case NETFS_MODIFY_AND_CLEAR: + zero_user_segment(&folio->page, offset + copied, flen); + netfs_set_group(folio, netfs_group); + folio_mark_uptodate(folio); + break; + case NETFS_WHOLE_FOLIO_MODIFY: + if (unlikely(copied < part)) { + maybe_trouble = true; + iov_iter_revert(iter, copied); + copied = 0; + goto retry; + } + netfs_set_group(folio, netfs_group); + folio_mark_uptodate(folio); + break; + case NETFS_STREAMING_WRITE: + if (offset == 0 && copied == flen) { + netfs_set_group(folio, netfs_group); + folio_mark_uptodate(folio); + trace = netfs_streaming_filled_page; + break; + } + finfo = kzalloc(sizeof(*finfo), GFP_KERNEL); + if (!finfo) { + iov_iter_revert(iter, copied); + ret = -ENOMEM; + goto error_folio_unlock; + } + finfo->netfs_group = netfs_get_group(netfs_group); + finfo->dirty_offset = offset; + finfo->dirty_len = copied; + folio_attach_private(folio, (void *)((unsigned long)finfo | + NETFS_FOLIO_INFO)); + break; + case NETFS_STREAMING_WRITE_CONT: + finfo = netfs_folio_info(folio); + finfo->dirty_len += copied; + if (finfo->dirty_offset == 0 && finfo->dirty_len == flen) { + if (finfo->netfs_group) + folio_change_private(folio, finfo->netfs_group); + else + folio_detach_private(folio); + folio_mark_uptodate(folio); + kfree(finfo); + trace = netfs_streaming_cont_filled_page; + } + break; + default: + WARN(true, "Unexpected modify type %u ix=%lx\n", + howto, folio->index); + ret = -EIO; + goto error_folio_unlock; + } + + trace_netfs_folio(folio, trace); + + /* Update the inode size if we moved the EOF marker */ + i_size = i_size_read(inode); + pos += copied; + if (pos > i_size) { + if (ctx->ops->update_i_size) { + ctx->ops->update_i_size(inode, pos); + } else { + i_size_write(inode, pos); +#if IS_ENABLED(CONFIG_FSCACHE) + fscache_update_cookie(ctx->cache, NULL, &pos); +#endif + } + } + written += copied; + + if (likely(!wreq)) { + folio_mark_dirty(folio); + } else { + if (folio_test_dirty(folio)) + /* Sigh. mmap. */ + folio_clear_dirty_for_io(folio); + /* We make multiple writes to the folio... */ + if (!folio_test_writeback(folio)) { + folio_wait_fscache(folio); + folio_start_writeback(folio); + folio_start_fscache(folio); + if (wreq->iter.count == 0) + trace_netfs_folio(folio, netfs_folio_trace_wthru); + else + trace_netfs_folio(folio, netfs_folio_trace_wthru_plus); + } + netfs_advance_writethrough(wreq, copied, + offset + copied == flen); + } + retry: + folio_unlock(folio); + folio_put(folio); + folio = NULL; + + cond_resched(); + } while (iov_iter_count(iter)); + +out: + if (unlikely(wreq)) { + ret = netfs_end_writethrough(wreq, iocb); + wbc_detach_inode(&wbc); + if (ret == -EIOCBQUEUED) + return ret; + } + + iocb->ki_pos += written; + _leave(" = %zd [%zd]", written, ret); + return written ? written : ret; + +error_folio_unlock: + folio_unlock(folio); + folio_put(folio); + goto out; +} +EXPORT_SYMBOL(netfs_perform_write); + +/** + * netfs_buffered_write_iter_locked - write data to a file + * @iocb: IO state structure (file, offset, etc.) + * @from: iov_iter with data to write + * @netfs_group: Grouping for dirty pages (eg. ceph snaps). + * + * This function does all the work needed for actually writing data to a + * file. It does all basic checks, removes SUID from the file, updates + * modification times and calls proper subroutines depending on whether we + * do direct IO or a standard buffered write. + * + * The caller must hold appropriate locks around this function and have called + * generic_write_checks() already. The caller is also responsible for doing + * any necessary syncing afterwards. + * + * This function does *not* take care of syncing data in case of O_SYNC write. + * A caller has to handle it. This is mainly due to the fact that we want to + * avoid syncing under i_rwsem. + * + * Return: + * * number of bytes written, even for truncated writes + * * negative error code if no data has been written at all + */ +ssize_t netfs_buffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *from, + struct netfs_group *netfs_group) +{ + struct file *file = iocb->ki_filp; + ssize_t ret; + + trace_netfs_write_iter(iocb, from); + + ret = file_remove_privs(file); + if (ret) + return ret; + + ret = file_update_time(file); + if (ret) + return ret; + + return netfs_perform_write(iocb, from, netfs_group); +} +EXPORT_SYMBOL(netfs_buffered_write_iter_locked); + +/** + * netfs_file_write_iter - write data to a file + * @iocb: IO state structure + * @from: iov_iter with data to write + * + * Perform a write to a file, writing into the pagecache if possible and doing + * an unbuffered write instead if not. + * + * Return: + * * Negative error code if no data has been written at all of + * vfs_fsync_range() failed for a synchronous write + * * Number of bytes written, even for truncated writes + */ +ssize_t netfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + struct netfs_inode *ictx = netfs_inode(inode); + ssize_t ret; + + _enter("%llx,%zx,%llx", iocb->ki_pos, iov_iter_count(from), i_size_read(inode)); + + if ((iocb->ki_flags & IOCB_DIRECT) || + test_bit(NETFS_ICTX_UNBUFFERED, &ictx->flags)) + return netfs_unbuffered_write_iter(iocb, from); + + ret = netfs_start_io_write(inode); + if (ret < 0) + return ret; + + ret = generic_write_checks(iocb, from); + if (ret > 0) + ret = netfs_buffered_write_iter_locked(iocb, from, NULL); + netfs_end_io_write(inode); + if (ret > 0) + ret = generic_write_sync(iocb, ret); + return ret; +} +EXPORT_SYMBOL(netfs_file_write_iter); + +/* + * Notification that a previously read-only page is about to become writable. + * Note that the caller indicates a single page of a multipage folio. + */ +vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_group) +{ + struct folio *folio = page_folio(vmf->page); + struct file *file = vmf->vma->vm_file; + struct inode *inode = file_inode(file); + vm_fault_t ret = VM_FAULT_RETRY; + int err; + + _enter("%lx", folio->index); + + sb_start_pagefault(inode->i_sb); + + if (folio_wait_writeback_killable(folio)) + goto out; + + if (folio_lock_killable(folio) < 0) + goto out; + + /* Can we see a streaming write here? */ + if (WARN_ON(!folio_test_uptodate(folio))) { + ret = VM_FAULT_SIGBUS | VM_FAULT_LOCKED; + goto out; + } + + if (netfs_folio_group(folio) != netfs_group) { + folio_unlock(folio); + err = filemap_fdatawait_range(inode->i_mapping, + folio_pos(folio), + folio_pos(folio) + folio_size(folio)); + switch (err) { + case 0: + ret = VM_FAULT_RETRY; + goto out; + case -ENOMEM: + ret = VM_FAULT_OOM; + goto out; + default: + ret = VM_FAULT_SIGBUS; + goto out; + } + } + + if (folio_test_dirty(folio)) + trace_netfs_folio(folio, netfs_folio_trace_mkwrite_plus); + else + trace_netfs_folio(folio, netfs_folio_trace_mkwrite); + netfs_set_group(folio, netfs_group); + file_update_time(file); + ret = VM_FAULT_LOCKED; +out: + sb_end_pagefault(inode->i_sb); + return ret; +} +EXPORT_SYMBOL(netfs_page_mkwrite); + +/* + * Kill all the pages in the given range + */ +static void netfs_kill_pages(struct address_space *mapping, + loff_t start, loff_t len) +{ + struct folio *folio; + pgoff_t index = start / PAGE_SIZE; + pgoff_t last = (start + len - 1) / PAGE_SIZE, next; + + _enter("%llx-%llx", start, start + len - 1); + + do { + _debug("kill %lx (to %lx)", index, last); + + folio = filemap_get_folio(mapping, index); + if (IS_ERR(folio)) { + next = index + 1; + continue; + } + + next = folio_next_index(folio); + + trace_netfs_folio(folio, netfs_folio_trace_kill); + folio_clear_uptodate(folio); + if (folio_test_fscache(folio)) + folio_end_fscache(folio); + folio_end_writeback(folio); + folio_lock(folio); + generic_error_remove_folio(mapping, folio); + folio_unlock(folio); + folio_put(folio); + + } while (index = next, index <= last); + + _leave(""); +} + +/* + * Redirty all the pages in a given range. + */ +static void netfs_redirty_pages(struct address_space *mapping, + loff_t start, loff_t len) +{ + struct folio *folio; + pgoff_t index = start / PAGE_SIZE; + pgoff_t last = (start + len - 1) / PAGE_SIZE, next; + + _enter("%llx-%llx", start, start + len - 1); + + do { + _debug("redirty %llx @%llx", len, start); + + folio = filemap_get_folio(mapping, index); + if (IS_ERR(folio)) { + next = index + 1; + continue; + } + + next = folio_next_index(folio); + trace_netfs_folio(folio, netfs_folio_trace_redirty); + filemap_dirty_folio(mapping, folio); + if (folio_test_fscache(folio)) + folio_end_fscache(folio); + folio_end_writeback(folio); + folio_put(folio); + } while (index = next, index <= last); + + balance_dirty_pages_ratelimited(mapping); + + _leave(""); +} + +/* + * Completion of write to server + */ +static void netfs_pages_written_back(struct netfs_io_request *wreq) +{ + struct address_space *mapping = wreq->mapping; + struct netfs_folio *finfo; + struct netfs_group *group = NULL; + struct folio *folio; + pgoff_t last; + int gcount = 0; + + XA_STATE(xas, &mapping->i_pages, wreq->start / PAGE_SIZE); + + _enter("%llx-%llx", wreq->start, wreq->start + wreq->len); + + rcu_read_lock(); + + last = (wreq->start + wreq->len - 1) / PAGE_SIZE; + xas_for_each(&xas, folio, last) { + WARN(!folio_test_writeback(folio), + "bad %zx @%llx page %lx %lx\n", + wreq->len, wreq->start, folio->index, last); + + if ((finfo = netfs_folio_info(folio))) { + /* Streaming writes cannot be redirtied whilst under + * writeback, so discard the streaming record. + */ + folio_detach_private(folio); + group = finfo->netfs_group; + gcount++; + trace_netfs_folio(folio, netfs_folio_trace_clear_s); + kfree(finfo); + } else if ((group = netfs_folio_group(folio))) { + /* Need to detach the group pointer if the page didn't + * get redirtied. If it has been redirtied, then it + * must be within the same group. + */ + if (folio_test_dirty(folio)) { + trace_netfs_folio(folio, netfs_folio_trace_redirtied); + goto end_wb; + } + if (folio_trylock(folio)) { + if (!folio_test_dirty(folio)) { + folio_detach_private(folio); + gcount++; + trace_netfs_folio(folio, netfs_folio_trace_clear_g); + } else { + trace_netfs_folio(folio, netfs_folio_trace_redirtied); + } + folio_unlock(folio); + goto end_wb; + } + + xas_pause(&xas); + rcu_read_unlock(); + folio_lock(folio); + if (!folio_test_dirty(folio)) { + folio_detach_private(folio); + gcount++; + trace_netfs_folio(folio, netfs_folio_trace_clear_g); + } else { + trace_netfs_folio(folio, netfs_folio_trace_redirtied); + } + folio_unlock(folio); + rcu_read_lock(); + } else { + trace_netfs_folio(folio, netfs_folio_trace_clear); + } + end_wb: + if (folio_test_fscache(folio)) + folio_end_fscache(folio); + xas_advance(&xas, folio_next_index(folio) - 1); + folio_end_writeback(folio); + } + + rcu_read_unlock(); + netfs_put_group_many(group, gcount); + _leave(""); +} + +/* + * Deal with the disposition of the folios that are under writeback to close + * out the operation. + */ +static void netfs_cleanup_buffered_write(struct netfs_io_request *wreq) +{ + struct address_space *mapping = wreq->mapping; + + _enter(""); + + switch (wreq->error) { + case 0: + netfs_pages_written_back(wreq); + break; + + default: + pr_notice("R=%08x Unexpected error %d\n", wreq->debug_id, wreq->error); + fallthrough; + case -EACCES: + case -EPERM: + case -ENOKEY: + case -EKEYEXPIRED: + case -EKEYREJECTED: + case -EKEYREVOKED: + case -ENETRESET: + case -EDQUOT: + case -ENOSPC: + netfs_redirty_pages(mapping, wreq->start, wreq->len); + break; + + case -EROFS: + case -EIO: + case -EREMOTEIO: + case -EFBIG: + case -ENOENT: + case -ENOMEDIUM: + case -ENXIO: + netfs_kill_pages(mapping, wreq->start, wreq->len); + break; + } + + if (wreq->error) + mapping_set_error(mapping, wreq->error); + if (wreq->netfs_ops->done) + wreq->netfs_ops->done(wreq); +} + +/* + * Extend the region to be written back to include subsequent contiguously + * dirty pages if possible, but don't sleep while doing so. + * + * If this page holds new content, then we can include filler zeros in the + * writeback. + */ +static void netfs_extend_writeback(struct address_space *mapping, + struct netfs_group *group, + struct xa_state *xas, + long *_count, + loff_t start, + loff_t max_len, + bool caching, + size_t *_len, + size_t *_top) +{ + struct netfs_folio *finfo; + struct folio_batch fbatch; + struct folio *folio; + unsigned int i; + pgoff_t index = (start + *_len) / PAGE_SIZE; + size_t len; + void *priv; + bool stop = true; + + folio_batch_init(&fbatch); + + do { + /* Firstly, we gather up a batch of contiguous dirty pages + * under the RCU read lock - but we can't clear the dirty flags + * there if any of those pages are mapped. + */ + rcu_read_lock(); + + xas_for_each(xas, folio, ULONG_MAX) { + stop = true; + if (xas_retry(xas, folio)) + continue; + if (xa_is_value(folio)) + break; + if (folio->index != index) { + xas_reset(xas); + break; + } + + if (!folio_try_get_rcu(folio)) { + xas_reset(xas); + continue; + } + + /* Has the folio moved or been split? */ + if (unlikely(folio != xas_reload(xas))) { + folio_put(folio); + xas_reset(xas); + break; + } + + if (!folio_trylock(folio)) { + folio_put(folio); + xas_reset(xas); + break; + } + if (!folio_test_dirty(folio) || + folio_test_writeback(folio) || + folio_test_fscache(folio)) { + folio_unlock(folio); + folio_put(folio); + xas_reset(xas); + break; + } + + stop = false; + len = folio_size(folio); + priv = folio_get_private(folio); + if ((const struct netfs_group *)priv != group) { + stop = true; + finfo = netfs_folio_info(folio); + if (finfo->netfs_group != group || + finfo->dirty_offset > 0) { + folio_unlock(folio); + folio_put(folio); + xas_reset(xas); + break; + } + len = finfo->dirty_len; + } + + *_top += folio_size(folio); + index += folio_nr_pages(folio); + *_count -= folio_nr_pages(folio); + *_len += len; + if (*_len >= max_len || *_count <= 0) + stop = true; + + if (!folio_batch_add(&fbatch, folio)) + break; + if (stop) + break; + } + + xas_pause(xas); + rcu_read_unlock(); + + /* Now, if we obtained any folios, we can shift them to being + * writable and mark them for caching. + */ + if (!folio_batch_count(&fbatch)) + break; + + for (i = 0; i < folio_batch_count(&fbatch); i++) { + folio = fbatch.folios[i]; + trace_netfs_folio(folio, netfs_folio_trace_store_plus); + + if (!folio_clear_dirty_for_io(folio)) + BUG(); + folio_start_writeback(folio); + netfs_folio_start_fscache(caching, folio); + folio_unlock(folio); + } + + folio_batch_release(&fbatch); + cond_resched(); + } while (!stop); +} + +/* + * Synchronously write back the locked page and any subsequent non-locked dirty + * pages. + */ +static ssize_t netfs_write_back_from_locked_folio(struct address_space *mapping, + struct writeback_control *wbc, + struct netfs_group *group, + struct xa_state *xas, + struct folio *folio, + unsigned long long start, + unsigned long long end) +{ + struct netfs_io_request *wreq; + struct netfs_folio *finfo; + struct netfs_inode *ctx = netfs_inode(mapping->host); + unsigned long long i_size = i_size_read(&ctx->inode); + size_t len, max_len; + bool caching = netfs_is_cache_enabled(ctx); + long count = wbc->nr_to_write; + int ret; + + _enter(",%lx,%llx-%llx,%u", folio->index, start, end, caching); + + wreq = netfs_alloc_request(mapping, NULL, start, folio_size(folio), + NETFS_WRITEBACK); + if (IS_ERR(wreq)) { + folio_unlock(folio); + return PTR_ERR(wreq); + } + + if (!folio_clear_dirty_for_io(folio)) + BUG(); + folio_start_writeback(folio); + netfs_folio_start_fscache(caching, folio); + + count -= folio_nr_pages(folio); + + /* Find all consecutive lockable dirty pages that have contiguous + * written regions, stopping when we find a page that is not + * immediately lockable, is not dirty or is missing, or we reach the + * end of the range. + */ + trace_netfs_folio(folio, netfs_folio_trace_store); + + len = wreq->len; + finfo = netfs_folio_info(folio); + if (finfo) { + start += finfo->dirty_offset; + if (finfo->dirty_offset + finfo->dirty_len != len) { + len = finfo->dirty_len; + goto cant_expand; + } + len = finfo->dirty_len; + } + + if (start < i_size) { + /* Trim the write to the EOF; the extra data is ignored. Also + * put an upper limit on the size of a single storedata op. + */ + max_len = 65536 * 4096; + max_len = min_t(unsigned long long, max_len, end - start + 1); + max_len = min_t(unsigned long long, max_len, i_size - start); + + if (len < max_len) + netfs_extend_writeback(mapping, group, xas, &count, start, + max_len, caching, &len, &wreq->upper_len); + } + +cant_expand: + len = min_t(unsigned long long, len, i_size - start); + + /* We now have a contiguous set of dirty pages, each with writeback + * set; the first page is still locked at this point, but all the rest + * have been unlocked. + */ + folio_unlock(folio); + wreq->start = start; + wreq->len = len; + + if (start < i_size) { + _debug("write back %zx @%llx [%llx]", len, start, i_size); + + /* Speculatively write to the cache. We have to fix this up + * later if the store fails. + */ + wreq->cleanup = netfs_cleanup_buffered_write; + + iov_iter_xarray(&wreq->iter, ITER_SOURCE, &mapping->i_pages, start, + wreq->upper_len); + __set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags); + ret = netfs_begin_write(wreq, true, netfs_write_trace_writeback); + if (ret == 0 || ret == -EIOCBQUEUED) + wbc->nr_to_write -= len / PAGE_SIZE; + } else { + _debug("write discard %zx @%llx [%llx]", len, start, i_size); + + /* The dirty region was entirely beyond the EOF. */ + fscache_clear_page_bits(mapping, start, len, caching); + netfs_pages_written_back(wreq); + ret = 0; + } + + netfs_put_request(wreq, false, netfs_rreq_trace_put_return); + _leave(" = 1"); + return 1; +} + +/* + * Write a region of pages back to the server + */ +static ssize_t netfs_writepages_begin(struct address_space *mapping, + struct writeback_control *wbc, + struct netfs_group *group, + struct xa_state *xas, + unsigned long long *_start, + unsigned long long end) +{ + const struct netfs_folio *finfo; + struct folio *folio; + unsigned long long start = *_start; + ssize_t ret; + void *priv; + int skips = 0; + + _enter("%llx,%llx,", start, end); + +search_again: + /* Find the first dirty page in the group. */ + rcu_read_lock(); + + for (;;) { + folio = xas_find_marked(xas, end / PAGE_SIZE, PAGECACHE_TAG_DIRTY); + if (xas_retry(xas, folio) || xa_is_value(folio)) + continue; + if (!folio) + break; + + if (!folio_try_get_rcu(folio)) { + xas_reset(xas); + continue; + } + + if (unlikely(folio != xas_reload(xas))) { + folio_put(folio); + xas_reset(xas); + continue; + } + + /* Skip any dirty folio that's not in the group of interest. */ + priv = folio_get_private(folio); + if ((const struct netfs_group *)priv != group) { + finfo = netfs_folio_info(folio); + if (finfo->netfs_group != group) { + folio_put(folio); + continue; + } + } + + xas_pause(xas); + break; + } + rcu_read_unlock(); + if (!folio) + return 0; + + start = folio_pos(folio); /* May regress with THPs */ + + _debug("wback %lx", folio->index); + + /* At this point we hold neither the i_pages lock nor the page lock: + * the page may be truncated or invalidated (changing page->mapping to + * NULL), or even swizzled back from swapper_space to tmpfs file + * mapping + */ +lock_again: + if (wbc->sync_mode != WB_SYNC_NONE) { + ret = folio_lock_killable(folio); + if (ret < 0) + return ret; + } else { + if (!folio_trylock(folio)) + goto search_again; + } + + if (folio->mapping != mapping || + !folio_test_dirty(folio)) { + start += folio_size(folio); + folio_unlock(folio); + goto search_again; + } + + if (folio_test_writeback(folio) || + folio_test_fscache(folio)) { + folio_unlock(folio); + if (wbc->sync_mode != WB_SYNC_NONE) { + folio_wait_writeback(folio); +#ifdef CONFIG_FSCACHE + folio_wait_fscache(folio); +#endif + goto lock_again; + } + + start += folio_size(folio); + if (wbc->sync_mode == WB_SYNC_NONE) { + if (skips >= 5 || need_resched()) { + ret = 0; + goto out; + } + skips++; + } + goto search_again; + } + + ret = netfs_write_back_from_locked_folio(mapping, wbc, group, xas, + folio, start, end); +out: + if (ret > 0) + *_start = start + ret; + _leave(" = %zd [%llx]", ret, *_start); + return ret; +} + +/* + * Write a region of pages back to the server + */ +static int netfs_writepages_region(struct address_space *mapping, + struct writeback_control *wbc, + struct netfs_group *group, + unsigned long long *_start, + unsigned long long end) +{ + ssize_t ret; + + XA_STATE(xas, &mapping->i_pages, *_start / PAGE_SIZE); + + do { + ret = netfs_writepages_begin(mapping, wbc, group, &xas, + _start, end); + if (ret > 0 && wbc->nr_to_write > 0) + cond_resched(); + } while (ret > 0 && wbc->nr_to_write > 0); + + return ret > 0 ? 0 : ret; +} + +/* + * write some of the pending data back to the server + */ +int netfs_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct netfs_group *group = NULL; + loff_t start, end; + int ret; + + _enter(""); + + /* We have to be careful as we can end up racing with setattr() + * truncating the pagecache since the caller doesn't take a lock here + * to prevent it. + */ + + if (wbc->range_cyclic && mapping->writeback_index) { + start = mapping->writeback_index * PAGE_SIZE; + ret = netfs_writepages_region(mapping, wbc, group, + &start, LLONG_MAX); + if (ret < 0) + goto out; + + if (wbc->nr_to_write <= 0) { + mapping->writeback_index = start / PAGE_SIZE; + goto out; + } + + start = 0; + end = mapping->writeback_index * PAGE_SIZE; + mapping->writeback_index = 0; + ret = netfs_writepages_region(mapping, wbc, group, &start, end); + if (ret == 0) + mapping->writeback_index = start / PAGE_SIZE; + } else if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) { + start = 0; + ret = netfs_writepages_region(mapping, wbc, group, + &start, LLONG_MAX); + if (wbc->nr_to_write > 0 && ret == 0) + mapping->writeback_index = start / PAGE_SIZE; + } else { + start = wbc->range_start; + ret = netfs_writepages_region(mapping, wbc, group, + &start, wbc->range_end); + } + +out: + _leave(" = %d", ret); + return ret; +} +EXPORT_SYMBOL(netfs_writepages); + +/* + * Deal with the disposition of a laundered folio. + */ +static void netfs_cleanup_launder_folio(struct netfs_io_request *wreq) +{ + if (wreq->error) { + pr_notice("R=%08x Laundering error %d\n", wreq->debug_id, wreq->error); + mapping_set_error(wreq->mapping, wreq->error); + } +} + +/** + * netfs_launder_folio - Clean up a dirty folio that's being invalidated + * @folio: The folio to clean + * + * This is called to write back a folio that's being invalidated when an inode + * is getting torn down. Ideally, writepages would be used instead. + */ +int netfs_launder_folio(struct folio *folio) +{ + struct netfs_io_request *wreq; + struct address_space *mapping = folio->mapping; + struct netfs_folio *finfo = netfs_folio_info(folio); + struct netfs_group *group = netfs_folio_group(folio); + struct bio_vec bvec; + unsigned long long i_size = i_size_read(mapping->host); + unsigned long long start = folio_pos(folio); + size_t offset = 0, len; + int ret = 0; + + if (finfo) { + offset = finfo->dirty_offset; + start += offset; + len = finfo->dirty_len; + } else { + len = folio_size(folio); + } + len = min_t(unsigned long long, len, i_size - start); + + wreq = netfs_alloc_request(mapping, NULL, start, len, NETFS_LAUNDER_WRITE); + if (IS_ERR(wreq)) { + ret = PTR_ERR(wreq); + goto out; + } + + if (!folio_clear_dirty_for_io(folio)) + goto out_put; + + trace_netfs_folio(folio, netfs_folio_trace_launder); + + _debug("launder %llx-%llx", start, start + len - 1); + + /* Speculatively write to the cache. We have to fix this up later if + * the store fails. + */ + wreq->cleanup = netfs_cleanup_launder_folio; + + bvec_set_folio(&bvec, folio, len, offset); + iov_iter_bvec(&wreq->iter, ITER_SOURCE, &bvec, 1, len); + __set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags); + ret = netfs_begin_write(wreq, true, netfs_write_trace_launder); + +out_put: + folio_detach_private(folio); + netfs_put_group(group); + kfree(finfo); + netfs_put_request(wreq, false, netfs_rreq_trace_put_return); +out: + folio_wait_fscache(folio); + _leave(" = %d", ret); + return ret; +} +EXPORT_SYMBOL(netfs_launder_folio); diff --git a/fs/netfs/direct_read.c b/fs/netfs/direct_read.c new file mode 100644 index 000000000000..ad4370b3935d --- /dev/null +++ b/fs/netfs/direct_read.c @@ -0,0 +1,125 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Direct I/O support. + * + * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/export.h> +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/pagemap.h> +#include <linux/slab.h> +#include <linux/uio.h> +#include <linux/sched/mm.h> +#include <linux/task_io_accounting_ops.h> +#include <linux/netfs.h> +#include "internal.h" + +/** + * netfs_unbuffered_read_iter_locked - Perform an unbuffered or direct I/O read + * @iocb: The I/O control descriptor describing the read + * @iter: The output buffer (also specifies read length) + * + * Perform an unbuffered I/O or direct I/O from the file in @iocb to the + * output buffer. No use is made of the pagecache. + * + * The caller must hold any appropriate locks. + */ +static ssize_t netfs_unbuffered_read_iter_locked(struct kiocb *iocb, struct iov_iter *iter) +{ + struct netfs_io_request *rreq; + ssize_t ret; + size_t orig_count = iov_iter_count(iter); + bool async = !is_sync_kiocb(iocb); + + _enter(""); + + if (!orig_count) + return 0; /* Don't update atime */ + + ret = kiocb_write_and_wait(iocb, orig_count); + if (ret < 0) + return ret; + file_accessed(iocb->ki_filp); + + rreq = netfs_alloc_request(iocb->ki_filp->f_mapping, iocb->ki_filp, + iocb->ki_pos, orig_count, + NETFS_DIO_READ); + if (IS_ERR(rreq)) + return PTR_ERR(rreq); + + netfs_stat(&netfs_n_rh_dio_read); + trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_dio_read); + + /* If this is an async op, we have to keep track of the destination + * buffer for ourselves as the caller's iterator will be trashed when + * we return. + * + * In such a case, extract an iterator to represent as much of the the + * output buffer as we can manage. Note that the extraction might not + * be able to allocate a sufficiently large bvec array and may shorten + * the request. + */ + if (user_backed_iter(iter)) { + ret = netfs_extract_user_iter(iter, rreq->len, &rreq->iter, 0); + if (ret < 0) + goto out; + rreq->direct_bv = (struct bio_vec *)rreq->iter.bvec; + rreq->direct_bv_count = ret; + rreq->direct_bv_unpin = iov_iter_extract_will_pin(iter); + rreq->len = iov_iter_count(&rreq->iter); + } else { + rreq->iter = *iter; + rreq->len = orig_count; + rreq->direct_bv_unpin = false; + iov_iter_advance(iter, orig_count); + } + + // TODO: Set up bounce buffer if needed + + if (async) + rreq->iocb = iocb; + + ret = netfs_begin_read(rreq, is_sync_kiocb(iocb)); + if (ret < 0) + goto out; /* May be -EIOCBQUEUED */ + if (!async) { + // TODO: Copy from bounce buffer + iocb->ki_pos += rreq->transferred; + ret = rreq->transferred; + } + +out: + netfs_put_request(rreq, false, netfs_rreq_trace_put_return); + if (ret > 0) + orig_count -= ret; + if (ret != -EIOCBQUEUED) + iov_iter_revert(iter, orig_count - iov_iter_count(iter)); + return ret; +} + +/** + * netfs_unbuffered_read_iter - Perform an unbuffered or direct I/O read + * @iocb: The I/O control descriptor describing the read + * @iter: The output buffer (also specifies read length) + * + * Perform an unbuffered I/O or direct I/O from the file in @iocb to the + * output buffer. No use is made of the pagecache. + */ +ssize_t netfs_unbuffered_read_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + struct inode *inode = file_inode(iocb->ki_filp); + ssize_t ret; + + if (!iter->count) + return 0; /* Don't update atime */ + + ret = netfs_start_io_direct(inode); + if (ret == 0) { + ret = netfs_unbuffered_read_iter_locked(iocb, iter); + netfs_end_io_direct(inode); + } + return ret; +} +EXPORT_SYMBOL(netfs_unbuffered_read_iter); diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c new file mode 100644 index 000000000000..60a40d293c87 --- /dev/null +++ b/fs/netfs/direct_write.c @@ -0,0 +1,171 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Unbuffered and direct write support. + * + * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/export.h> +#include <linux/uio.h> +#include "internal.h" + +static void netfs_cleanup_dio_write(struct netfs_io_request *wreq) +{ + struct inode *inode = wreq->inode; + unsigned long long end = wreq->start + wreq->len; + + if (!wreq->error && + i_size_read(inode) < end) { + if (wreq->netfs_ops->update_i_size) + wreq->netfs_ops->update_i_size(inode, end); + else + i_size_write(inode, end); + } +} + +/* + * Perform an unbuffered write where we may have to do an RMW operation on an + * encrypted file. This can also be used for direct I/O writes. + */ +static ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *iter, + struct netfs_group *netfs_group) +{ + struct netfs_io_request *wreq; + unsigned long long start = iocb->ki_pos; + unsigned long long end = start + iov_iter_count(iter); + ssize_t ret, n; + bool async = !is_sync_kiocb(iocb); + + _enter(""); + + /* We're going to need a bounce buffer if what we transmit is going to + * be different in some way to the source buffer, e.g. because it gets + * encrypted/compressed or because it needs expanding to a block size. + */ + // TODO + + _debug("uw %llx-%llx", start, end); + + wreq = netfs_alloc_request(iocb->ki_filp->f_mapping, iocb->ki_filp, + start, end - start, + iocb->ki_flags & IOCB_DIRECT ? + NETFS_DIO_WRITE : NETFS_UNBUFFERED_WRITE); + if (IS_ERR(wreq)) + return PTR_ERR(wreq); + + { + /* If this is an async op and we're not using a bounce buffer, + * we have to save the source buffer as the iterator is only + * good until we return. In such a case, extract an iterator + * to represent as much of the the output buffer as we can + * manage. Note that the extraction might not be able to + * allocate a sufficiently large bvec array and may shorten the + * request. + */ + if (async || user_backed_iter(iter)) { + n = netfs_extract_user_iter(iter, wreq->len, &wreq->iter, 0); + if (n < 0) { + ret = n; + goto out; + } + wreq->direct_bv = (struct bio_vec *)wreq->iter.bvec; + wreq->direct_bv_count = n; + wreq->direct_bv_unpin = iov_iter_extract_will_pin(iter); + wreq->len = iov_iter_count(&wreq->iter); + } else { + wreq->iter = *iter; + } + + wreq->io_iter = wreq->iter; + } + + /* Copy the data into the bounce buffer and encrypt it. */ + // TODO + + /* Dispatch the write. */ + __set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags); + if (async) + wreq->iocb = iocb; + wreq->cleanup = netfs_cleanup_dio_write; + ret = netfs_begin_write(wreq, is_sync_kiocb(iocb), + iocb->ki_flags & IOCB_DIRECT ? + netfs_write_trace_dio_write : + netfs_write_trace_unbuffered_write); + if (ret < 0) { + _debug("begin = %zd", ret); + goto out; + } + + if (!async) { + trace_netfs_rreq(wreq, netfs_rreq_trace_wait_ip); + wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS, + TASK_UNINTERRUPTIBLE); + + ret = wreq->error; + _debug("waited = %zd", ret); + if (ret == 0) { + ret = wreq->transferred; + iocb->ki_pos += ret; + } + } else { + ret = -EIOCBQUEUED; + } + +out: + netfs_put_request(wreq, false, netfs_rreq_trace_put_return); + return ret; +} + +/** + * netfs_unbuffered_write_iter - Unbuffered write to a file + * @iocb: IO state structure + * @from: iov_iter with data to write + * + * Do an unbuffered write to a file, writing the data directly to the server + * and not lodging the data in the pagecache. + * + * Return: + * * Negative error code if no data has been written at all of + * vfs_fsync_range() failed for a synchronous write + * * Number of bytes written, even for truncated writes + */ +ssize_t netfs_unbuffered_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + struct netfs_inode *ictx = netfs_inode(inode); + unsigned long long end; + ssize_t ret; + + _enter("%llx,%zx,%llx", iocb->ki_pos, iov_iter_count(from), i_size_read(inode)); + + trace_netfs_write_iter(iocb, from); + netfs_stat(&netfs_n_rh_dio_write); + + ret = netfs_start_io_direct(inode); + if (ret < 0) + return ret; + ret = generic_write_checks(iocb, from); + if (ret < 0) + goto out; + ret = file_remove_privs(file); + if (ret < 0) + goto out; + ret = file_update_time(file); + if (ret < 0) + goto out; + ret = kiocb_invalidate_pages(iocb, iov_iter_count(from)); + if (ret < 0) + goto out; + end = iocb->ki_pos + iov_iter_count(from); + if (end > ictx->zero_point) + ictx->zero_point = end; + + fscache_invalidate(netfs_i_cookie(ictx), NULL, i_size_read(inode), + FSCACHE_INVAL_DIO_WRITE); + ret = netfs_unbuffered_write_iter_locked(iocb, from, NULL); +out: + netfs_end_io_direct(inode); + return ret; +} +EXPORT_SYMBOL(netfs_unbuffered_write_iter); diff --git a/fs/fscache/cache.c b/fs/netfs/fscache_cache.c index d645f8b302a2..9397ed39b0b4 100644 --- a/fs/fscache/cache.c +++ b/fs/netfs/fscache_cache.c @@ -179,13 +179,14 @@ EXPORT_SYMBOL(fscache_acquire_cache); void fscache_put_cache(struct fscache_cache *cache, enum fscache_cache_trace where) { - unsigned int debug_id = cache->debug_id; + unsigned int debug_id; bool zero; int ref; if (IS_ERR_OR_NULL(cache)) return; + debug_id = cache->debug_id; zero = __refcount_dec_and_test(&cache->ref, &ref); trace_fscache_cache(debug_id, ref - 1, where); diff --git a/fs/fscache/cookie.c b/fs/netfs/fscache_cookie.c index bce2492186d0..bce2492186d0 100644 --- a/fs/fscache/cookie.c +++ b/fs/netfs/fscache_cookie.c diff --git a/fs/netfs/fscache_internal.h b/fs/netfs/fscache_internal.h new file mode 100644 index 000000000000..a09b948fcef2 --- /dev/null +++ b/fs/netfs/fscache_internal.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* Internal definitions for FS-Cache + * + * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include "internal.h" + +#ifdef pr_fmt +#undef pr_fmt +#endif + +#define pr_fmt(fmt) "FS-Cache: " fmt diff --git a/fs/fscache/io.c b/fs/netfs/fscache_io.c index 0d2b8dec8f82..ad572f7ee897 100644 --- a/fs/fscache/io.c +++ b/fs/netfs/fscache_io.c @@ -158,46 +158,6 @@ int __fscache_begin_write_operation(struct netfs_cache_resources *cres, } EXPORT_SYMBOL(__fscache_begin_write_operation); -/** - * fscache_dirty_folio - Mark folio dirty and pin a cache object for writeback - * @mapping: The mapping the folio belongs to. - * @folio: The folio being dirtied. - * @cookie: The cookie referring to the cache object - * - * Set the dirty flag on a folio and pin an in-use cache object in memory - * so that writeback can later write to it. This is intended - * to be called from the filesystem's ->dirty_folio() method. - * - * Return: true if the dirty flag was set on the folio, false otherwise. - */ -bool fscache_dirty_folio(struct address_space *mapping, struct folio *folio, - struct fscache_cookie *cookie) -{ - struct inode *inode = mapping->host; - bool need_use = false; - - _enter(""); - - if (!filemap_dirty_folio(mapping, folio)) - return false; - if (!fscache_cookie_valid(cookie)) - return true; - - if (!(inode->i_state & I_PINNING_FSCACHE_WB)) { - spin_lock(&inode->i_lock); - if (!(inode->i_state & I_PINNING_FSCACHE_WB)) { - inode->i_state |= I_PINNING_FSCACHE_WB; - need_use = true; - } - spin_unlock(&inode->i_lock); - - if (need_use) - fscache_use_cookie(cookie, true); - } - return true; -} -EXPORT_SYMBOL(fscache_dirty_folio); - struct fscache_write_request { struct netfs_cache_resources cache_resources; struct address_space *mapping; @@ -277,7 +237,7 @@ void __fscache_write_to_cache(struct fscache_cookie *cookie, fscache_access_io_write) < 0) goto abandon_free; - ret = cres->ops->prepare_write(cres, &start, &len, i_size, false); + ret = cres->ops->prepare_write(cres, &start, &len, len, i_size, false); if (ret < 0) goto abandon_end; diff --git a/fs/fscache/main.c b/fs/netfs/fscache_main.c index dad85fd84f6f..42e98bb523e3 100644 --- a/fs/fscache/main.c +++ b/fs/netfs/fscache_main.c @@ -8,18 +8,9 @@ #define FSCACHE_DEBUG_LEVEL CACHE #include <linux/module.h> #include <linux/init.h> -#define CREATE_TRACE_POINTS #include "internal.h" - -MODULE_DESCRIPTION("FS Cache Manager"); -MODULE_AUTHOR("Red Hat, Inc."); -MODULE_LICENSE("GPL"); - -unsigned fscache_debug; -module_param_named(debug, fscache_debug, uint, - S_IWUSR | S_IRUGO); -MODULE_PARM_DESC(fscache_debug, - "FS-Cache debugging mask"); +#define CREATE_TRACE_POINTS +#include <trace/events/fscache.h> EXPORT_TRACEPOINT_SYMBOL(fscache_access_cache); EXPORT_TRACEPOINT_SYMBOL(fscache_access_volume); @@ -71,7 +62,7 @@ unsigned int fscache_hash(unsigned int salt, const void *data, size_t len) /* * initialise the fs caching module */ -static int __init fscache_init(void) +int __init fscache_init(void) { int ret = -ENOMEM; @@ -92,7 +83,7 @@ static int __init fscache_init(void) goto error_cookie_jar; } - pr_notice("Loaded\n"); + pr_notice("FS-Cache loaded\n"); return 0; error_cookie_jar: @@ -103,19 +94,15 @@ error_wq: return ret; } -fs_initcall(fscache_init); - /* * clean up on module removal */ -static void __exit fscache_exit(void) +void __exit fscache_exit(void) { _enter(""); kmem_cache_destroy(fscache_cookie_jar); fscache_proc_cleanup(); destroy_workqueue(fscache_wq); - pr_notice("Unloaded\n"); + pr_notice("FS-Cache unloaded\n"); } - -module_exit(fscache_exit); diff --git a/fs/fscache/proc.c b/fs/netfs/fscache_proc.c index dc3b0e9c8cce..874d951bc390 100644 --- a/fs/fscache/proc.c +++ b/fs/netfs/fscache_proc.c @@ -12,41 +12,34 @@ #include "internal.h" /* - * initialise the /proc/fs/fscache/ directory + * Add files to /proc/fs/netfs/. */ int __init fscache_proc_init(void) { - if (!proc_mkdir("fs/fscache", NULL)) - goto error_dir; + if (!proc_symlink("fs/fscache", NULL, "netfs")) + goto error_sym; - if (!proc_create_seq("fs/fscache/caches", S_IFREG | 0444, NULL, + if (!proc_create_seq("fs/netfs/caches", S_IFREG | 0444, NULL, &fscache_caches_seq_ops)) goto error; - if (!proc_create_seq("fs/fscache/volumes", S_IFREG | 0444, NULL, + if (!proc_create_seq("fs/netfs/volumes", S_IFREG | 0444, NULL, &fscache_volumes_seq_ops)) goto error; - if (!proc_create_seq("fs/fscache/cookies", S_IFREG | 0444, NULL, + if (!proc_create_seq("fs/netfs/cookies", S_IFREG | 0444, NULL, &fscache_cookies_seq_ops)) goto error; - -#ifdef CONFIG_FSCACHE_STATS - if (!proc_create_single("fs/fscache/stats", S_IFREG | 0444, NULL, - fscache_stats_show)) - goto error; -#endif - return 0; error: remove_proc_entry("fs/fscache", NULL); -error_dir: +error_sym: return -ENOMEM; } /* - * clean up the /proc/fs/fscache/ directory + * Clean up the /proc/fs/fscache symlink. */ void fscache_proc_cleanup(void) { diff --git a/fs/fscache/stats.c b/fs/netfs/fscache_stats.c index fc94e5e79f1c..add21abdf713 100644 --- a/fs/fscache/stats.c +++ b/fs/netfs/fscache_stats.c @@ -48,13 +48,15 @@ atomic_t fscache_n_no_create_space; EXPORT_SYMBOL(fscache_n_no_create_space); atomic_t fscache_n_culled; EXPORT_SYMBOL(fscache_n_culled); +atomic_t fscache_n_dio_misfit; +EXPORT_SYMBOL(fscache_n_dio_misfit); /* * display the general statistics */ -int fscache_stats_show(struct seq_file *m, void *v) +int fscache_stats_show(struct seq_file *m) { - seq_puts(m, "FS-Cache statistics\n"); + seq_puts(m, "-- FS-Cache statistics --\n"); seq_printf(m, "Cookies: n=%d v=%d vcol=%u voom=%u\n", atomic_read(&fscache_n_cookies), atomic_read(&fscache_n_volumes), @@ -93,10 +95,9 @@ int fscache_stats_show(struct seq_file *m, void *v) atomic_read(&fscache_n_no_create_space), atomic_read(&fscache_n_culled)); - seq_printf(m, "IO : rd=%u wr=%u\n", + seq_printf(m, "IO : rd=%u wr=%u mis=%u\n", atomic_read(&fscache_n_read), - atomic_read(&fscache_n_write)); - - netfs_stats_show(m); + atomic_read(&fscache_n_write), + atomic_read(&fscache_n_dio_misfit)); return 0; } diff --git a/fs/fscache/volume.c b/fs/netfs/fscache_volume.c index cdf991bdd9de..cdf991bdd9de 100644 --- a/fs/fscache/volume.c +++ b/fs/netfs/fscache_volume.c diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h index 43fac1b14e40..ec7045d24400 100644 --- a/fs/netfs/internal.h +++ b/fs/netfs/internal.h @@ -5,9 +5,13 @@ * Written by David Howells (dhowells@redhat.com) */ +#include <linux/slab.h> +#include <linux/seq_file.h> #include <linux/netfs.h> #include <linux/fscache.h> +#include <linux/fscache-cache.h> #include <trace/events/netfs.h> +#include <trace/events/fscache.h> #ifdef pr_fmt #undef pr_fmt @@ -19,6 +23,8 @@ * buffered_read.c */ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq); +int netfs_prefetch_for_write(struct file *file, struct folio *folio, + size_t offset, size_t len); /* * io.c @@ -29,6 +35,41 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync); * main.c */ extern unsigned int netfs_debug; +extern struct list_head netfs_io_requests; +extern spinlock_t netfs_proc_lock; + +#ifdef CONFIG_PROC_FS +static inline void netfs_proc_add_rreq(struct netfs_io_request *rreq) +{ + spin_lock(&netfs_proc_lock); + list_add_tail_rcu(&rreq->proc_link, &netfs_io_requests); + spin_unlock(&netfs_proc_lock); +} +static inline void netfs_proc_del_rreq(struct netfs_io_request *rreq) +{ + if (!list_empty(&rreq->proc_link)) { + spin_lock(&netfs_proc_lock); + list_del_rcu(&rreq->proc_link); + spin_unlock(&netfs_proc_lock); + } +} +#else +static inline void netfs_proc_add_rreq(struct netfs_io_request *rreq) {} +static inline void netfs_proc_del_rreq(struct netfs_io_request *rreq) {} +#endif + +/* + * misc.c + */ +#define NETFS_FLAG_PUT_MARK BIT(0) +#define NETFS_FLAG_PAGECACHE_MARK BIT(1) +int netfs_xa_store_and_mark(struct xarray *xa, unsigned long index, + struct folio *folio, unsigned int flags, + gfp_t gfp_mask); +int netfs_add_folios_to_buffer(struct xarray *buffer, + struct address_space *mapping, + pgoff_t index, pgoff_t to, gfp_t gfp_mask); +void netfs_clear_buffer(struct xarray *buffer); /* * objects.c @@ -50,9 +91,20 @@ static inline void netfs_see_request(struct netfs_io_request *rreq, } /* + * output.c + */ +int netfs_begin_write(struct netfs_io_request *wreq, bool may_wait, + enum netfs_write_trace what); +struct netfs_io_request *netfs_begin_writethrough(struct kiocb *iocb, size_t len); +int netfs_advance_writethrough(struct netfs_io_request *wreq, size_t copied, bool to_page_end); +int netfs_end_writethrough(struct netfs_io_request *wreq, struct kiocb *iocb); + +/* * stats.c */ #ifdef CONFIG_NETFS_STATS +extern atomic_t netfs_n_rh_dio_read; +extern atomic_t netfs_n_rh_dio_write; extern atomic_t netfs_n_rh_readahead; extern atomic_t netfs_n_rh_readpage; extern atomic_t netfs_n_rh_rreq; @@ -71,7 +123,15 @@ extern atomic_t netfs_n_rh_write_begin; extern atomic_t netfs_n_rh_write_done; extern atomic_t netfs_n_rh_write_failed; extern atomic_t netfs_n_rh_write_zskip; +extern atomic_t netfs_n_wh_wstream_conflict; +extern atomic_t netfs_n_wh_upload; +extern atomic_t netfs_n_wh_upload_done; +extern atomic_t netfs_n_wh_upload_failed; +extern atomic_t netfs_n_wh_write; +extern atomic_t netfs_n_wh_write_done; +extern atomic_t netfs_n_wh_write_failed; +int netfs_stats_show(struct seq_file *m, void *v); static inline void netfs_stat(atomic_t *stat) { @@ -103,6 +163,176 @@ static inline bool netfs_is_cache_enabled(struct netfs_inode *ctx) #endif } +/* + * Get a ref on a netfs group attached to a dirty page (e.g. a ceph snap). + */ +static inline struct netfs_group *netfs_get_group(struct netfs_group *netfs_group) +{ + if (netfs_group) + refcount_inc(&netfs_group->ref); + return netfs_group; +} + +/* + * Dispose of a netfs group attached to a dirty page (e.g. a ceph snap). + */ +static inline void netfs_put_group(struct netfs_group *netfs_group) +{ + if (netfs_group && refcount_dec_and_test(&netfs_group->ref)) + netfs_group->free(netfs_group); +} + +/* + * Dispose of a netfs group attached to a dirty page (e.g. a ceph snap). + */ +static inline void netfs_put_group_many(struct netfs_group *netfs_group, int nr) +{ + if (netfs_group && refcount_sub_and_test(nr, &netfs_group->ref)) + netfs_group->free(netfs_group); +} + +/* + * fscache-cache.c + */ +#ifdef CONFIG_PROC_FS +extern const struct seq_operations fscache_caches_seq_ops; +#endif +bool fscache_begin_cache_access(struct fscache_cache *cache, enum fscache_access_trace why); +void fscache_end_cache_access(struct fscache_cache *cache, enum fscache_access_trace why); +struct fscache_cache *fscache_lookup_cache(const char *name, bool is_cache); +void fscache_put_cache(struct fscache_cache *cache, enum fscache_cache_trace where); + +static inline enum fscache_cache_state fscache_cache_state(const struct fscache_cache *cache) +{ + return smp_load_acquire(&cache->state); +} + +static inline bool fscache_cache_is_live(const struct fscache_cache *cache) +{ + return fscache_cache_state(cache) == FSCACHE_CACHE_IS_ACTIVE; +} + +static inline void fscache_set_cache_state(struct fscache_cache *cache, + enum fscache_cache_state new_state) +{ + smp_store_release(&cache->state, new_state); + +} + +static inline bool fscache_set_cache_state_maybe(struct fscache_cache *cache, + enum fscache_cache_state old_state, + enum fscache_cache_state new_state) +{ + return try_cmpxchg_release(&cache->state, &old_state, new_state); +} + +/* + * fscache-cookie.c + */ +extern struct kmem_cache *fscache_cookie_jar; +#ifdef CONFIG_PROC_FS +extern const struct seq_operations fscache_cookies_seq_ops; +#endif +extern struct timer_list fscache_cookie_lru_timer; + +extern void fscache_print_cookie(struct fscache_cookie *cookie, char prefix); +extern bool fscache_begin_cookie_access(struct fscache_cookie *cookie, + enum fscache_access_trace why); + +static inline void fscache_see_cookie(struct fscache_cookie *cookie, + enum fscache_cookie_trace where) +{ + trace_fscache_cookie(cookie->debug_id, refcount_read(&cookie->ref), + where); +} + +/* + * fscache-main.c + */ +extern unsigned int fscache_hash(unsigned int salt, const void *data, size_t len); +#ifdef CONFIG_FSCACHE +int __init fscache_init(void); +void __exit fscache_exit(void); +#else +static inline int fscache_init(void) { return 0; } +static inline void fscache_exit(void) {} +#endif + +/* + * fscache-proc.c + */ +#ifdef CONFIG_PROC_FS +extern int __init fscache_proc_init(void); +extern void fscache_proc_cleanup(void); +#else +#define fscache_proc_init() (0) +#define fscache_proc_cleanup() do {} while (0) +#endif + +/* + * fscache-stats.c + */ +#ifdef CONFIG_FSCACHE_STATS +extern atomic_t fscache_n_volumes; +extern atomic_t fscache_n_volumes_collision; +extern atomic_t fscache_n_volumes_nomem; +extern atomic_t fscache_n_cookies; +extern atomic_t fscache_n_cookies_lru; +extern atomic_t fscache_n_cookies_lru_expired; +extern atomic_t fscache_n_cookies_lru_removed; +extern atomic_t fscache_n_cookies_lru_dropped; + +extern atomic_t fscache_n_acquires; +extern atomic_t fscache_n_acquires_ok; +extern atomic_t fscache_n_acquires_oom; + +extern atomic_t fscache_n_invalidates; + +extern atomic_t fscache_n_relinquishes; +extern atomic_t fscache_n_relinquishes_retire; +extern atomic_t fscache_n_relinquishes_dropped; + +extern atomic_t fscache_n_resizes; +extern atomic_t fscache_n_resizes_null; + +static inline void fscache_stat(atomic_t *stat) +{ + atomic_inc(stat); +} + +static inline void fscache_stat_d(atomic_t *stat) +{ + atomic_dec(stat); +} + +#define __fscache_stat(stat) (stat) + +int fscache_stats_show(struct seq_file *m); +#else + +#define __fscache_stat(stat) (NULL) +#define fscache_stat(stat) do {} while (0) +#define fscache_stat_d(stat) do {} while (0) + +static inline int fscache_stats_show(struct seq_file *m) { return 0; } +#endif + +/* + * fscache-volume.c + */ +#ifdef CONFIG_PROC_FS +extern const struct seq_operations fscache_volumes_seq_ops; +#endif + +struct fscache_volume *fscache_get_volume(struct fscache_volume *volume, + enum fscache_volume_trace where); +void fscache_put_volume(struct fscache_volume *volume, + enum fscache_volume_trace where); +bool fscache_begin_volume_access(struct fscache_volume *volume, + struct fscache_cookie *cookie, + enum fscache_access_trace why); +void fscache_create_volume(struct fscache_volume *volume, bool wait); + /*****************************************************************************/ /* * debug tracing @@ -143,3 +373,57 @@ do { \ #define _leave(FMT, ...) no_printk("<== %s()"FMT"", __func__, ##__VA_ARGS__) #define _debug(FMT, ...) no_printk(FMT, ##__VA_ARGS__) #endif + +/* + * assertions + */ +#if 1 /* defined(__KDEBUGALL) */ + +#define ASSERT(X) \ +do { \ + if (unlikely(!(X))) { \ + pr_err("\n"); \ + pr_err("Assertion failed\n"); \ + BUG(); \ + } \ +} while (0) + +#define ASSERTCMP(X, OP, Y) \ +do { \ + if (unlikely(!((X) OP (Y)))) { \ + pr_err("\n"); \ + pr_err("Assertion failed\n"); \ + pr_err("%lx " #OP " %lx is false\n", \ + (unsigned long)(X), (unsigned long)(Y)); \ + BUG(); \ + } \ +} while (0) + +#define ASSERTIF(C, X) \ +do { \ + if (unlikely((C) && !(X))) { \ + pr_err("\n"); \ + pr_err("Assertion failed\n"); \ + BUG(); \ + } \ +} while (0) + +#define ASSERTIFCMP(C, X, OP, Y) \ +do { \ + if (unlikely((C) && !((X) OP (Y)))) { \ + pr_err("\n"); \ + pr_err("Assertion failed\n"); \ + pr_err("%lx " #OP " %lx is false\n", \ + (unsigned long)(X), (unsigned long)(Y)); \ + BUG(); \ + } \ +} while (0) + +#else + +#define ASSERT(X) do {} while (0) +#define ASSERTCMP(X, OP, Y) do {} while (0) +#define ASSERTIF(C, X) do {} while (0) +#define ASSERTIFCMP(C, X, OP, Y) do {} while (0) + +#endif /* assert or not */ diff --git a/fs/netfs/io.c b/fs/netfs/io.c index 7f753380e047..e8ff1e61ce79 100644 --- a/fs/netfs/io.c +++ b/fs/netfs/io.c @@ -21,12 +21,7 @@ */ static void netfs_clear_unread(struct netfs_io_subrequest *subreq) { - struct iov_iter iter; - - iov_iter_xarray(&iter, ITER_DEST, &subreq->rreq->mapping->i_pages, - subreq->start + subreq->transferred, - subreq->len - subreq->transferred); - iov_iter_zero(iov_iter_count(&iter), &iter); + iov_iter_zero(iov_iter_count(&subreq->io_iter), &subreq->io_iter); } static void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error, @@ -46,14 +41,9 @@ static void netfs_read_from_cache(struct netfs_io_request *rreq, enum netfs_read_from_hole read_hole) { struct netfs_cache_resources *cres = &rreq->cache_resources; - struct iov_iter iter; netfs_stat(&netfs_n_rh_read); - iov_iter_xarray(&iter, ITER_DEST, &rreq->mapping->i_pages, - subreq->start + subreq->transferred, - subreq->len - subreq->transferred); - - cres->ops->read(cres, subreq->start, &iter, read_hole, + cres->ops->read(cres, subreq->start, &subreq->io_iter, read_hole, netfs_cache_read_terminated, subreq); } @@ -88,6 +78,13 @@ static void netfs_read_from_server(struct netfs_io_request *rreq, struct netfs_io_subrequest *subreq) { netfs_stat(&netfs_n_rh_download); + + if (rreq->origin != NETFS_DIO_READ && + iov_iter_count(&subreq->io_iter) != subreq->len - subreq->transferred) + pr_warn("R=%08x[%u] ITER PRE-MISMATCH %zx != %zx-%zx %lx\n", + rreq->debug_id, subreq->debug_index, + iov_iter_count(&subreq->io_iter), subreq->len, + subreq->transferred, subreq->flags); rreq->netfs_ops->issue_read(subreq); } @@ -127,9 +124,10 @@ static void netfs_rreq_unmark_after_write(struct netfs_io_request *rreq, /* We might have multiple writes from the same huge * folio, but we mustn't unlock a folio more than once. */ - if (have_unlocked && folio_index(folio) <= unlocked) + if (have_unlocked && folio->index <= unlocked) continue; - unlocked = folio_index(folio); + unlocked = folio_next_index(folio) - 1; + trace_netfs_folio(folio, netfs_folio_trace_end_copy); folio_end_fscache(folio); have_unlocked = true; } @@ -201,7 +199,7 @@ static void netfs_rreq_do_write_to_cache(struct netfs_io_request *rreq) } ret = cres->ops->prepare_write(cres, &subreq->start, &subreq->len, - rreq->i_size, true); + subreq->len, rreq->i_size, true); if (ret < 0) { trace_netfs_failure(rreq, subreq, ret, netfs_fail_prepare_write); trace_netfs_sreq(subreq, netfs_sreq_trace_write_skip); @@ -260,6 +258,30 @@ static void netfs_rreq_short_read(struct netfs_io_request *rreq, } /* + * Reset the subrequest iterator prior to resubmission. + */ +static void netfs_reset_subreq_iter(struct netfs_io_request *rreq, + struct netfs_io_subrequest *subreq) +{ + size_t remaining = subreq->len - subreq->transferred; + size_t count = iov_iter_count(&subreq->io_iter); + + if (count == remaining) + return; + + _debug("R=%08x[%u] ITER RESUB-MISMATCH %zx != %zx-%zx-%llx %x\n", + rreq->debug_id, subreq->debug_index, + iov_iter_count(&subreq->io_iter), subreq->transferred, + subreq->len, rreq->i_size, + subreq->io_iter.iter_type); + + if (count < remaining) + iov_iter_revert(&subreq->io_iter, remaining - count); + else + iov_iter_advance(&subreq->io_iter, count - remaining); +} + +/* * Resubmit any short or failed operations. Returns true if we got the rreq * ref back. */ @@ -287,6 +309,7 @@ static bool netfs_rreq_perform_resubmissions(struct netfs_io_request *rreq) trace_netfs_sreq(subreq, netfs_sreq_trace_download_instead); netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); atomic_inc(&rreq->nr_outstanding); + netfs_reset_subreq_iter(rreq, subreq); netfs_read_from_server(rreq, subreq); } else if (test_bit(NETFS_SREQ_SHORT_IO, &subreq->flags)) { netfs_rreq_short_read(rreq, subreq); @@ -321,6 +344,43 @@ static void netfs_rreq_is_still_valid(struct netfs_io_request *rreq) } /* + * Determine how much we can admit to having read from a DIO read. + */ +static void netfs_rreq_assess_dio(struct netfs_io_request *rreq) +{ + struct netfs_io_subrequest *subreq; + unsigned int i; + size_t transferred = 0; + + for (i = 0; i < rreq->direct_bv_count; i++) + flush_dcache_page(rreq->direct_bv[i].bv_page); + + list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { + if (subreq->error || subreq->transferred == 0) + break; + transferred += subreq->transferred; + if (subreq->transferred < subreq->len) + break; + } + + for (i = 0; i < rreq->direct_bv_count; i++) + flush_dcache_page(rreq->direct_bv[i].bv_page); + + rreq->transferred = transferred; + task_io_account_read(transferred); + + if (rreq->iocb) { + rreq->iocb->ki_pos += transferred; + if (rreq->iocb->ki_complete) + rreq->iocb->ki_complete( + rreq->iocb, rreq->error ? rreq->error : transferred); + } + if (rreq->netfs_ops->done) + rreq->netfs_ops->done(rreq); + inode_dio_end(rreq->inode); +} + +/* * Assess the state of a read request and decide what to do next. * * Note that we could be in an ordinary kernel thread, on a workqueue or in @@ -340,8 +400,12 @@ again: return; } - netfs_rreq_unlock_folios(rreq); + if (rreq->origin != NETFS_DIO_READ) + netfs_rreq_unlock_folios(rreq); + else + netfs_rreq_assess_dio(rreq); + trace_netfs_rreq(rreq, netfs_rreq_trace_wake_ip); clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &rreq->flags); wake_up_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS); @@ -399,9 +463,9 @@ void netfs_subreq_terminated(struct netfs_io_subrequest *subreq, struct netfs_io_request *rreq = subreq->rreq; int u; - _enter("[%u]{%llx,%lx},%zd", - subreq->debug_index, subreq->start, subreq->flags, - transferred_or_error); + _enter("R=%x[%x]{%llx,%lx},%zd", + rreq->debug_id, subreq->debug_index, + subreq->start, subreq->flags, transferred_or_error); switch (subreq->source) { case NETFS_READ_FROM_CACHE: @@ -501,15 +565,20 @@ static enum netfs_io_source netfs_cache_prepare_read(struct netfs_io_subrequest */ static enum netfs_io_source netfs_rreq_prepare_read(struct netfs_io_request *rreq, - struct netfs_io_subrequest *subreq) + struct netfs_io_subrequest *subreq, + struct iov_iter *io_iter) { - enum netfs_io_source source; + enum netfs_io_source source = NETFS_DOWNLOAD_FROM_SERVER; + struct netfs_inode *ictx = netfs_inode(rreq->inode); + size_t lsize; _enter("%llx-%llx,%llx", subreq->start, subreq->start + subreq->len, rreq->i_size); - source = netfs_cache_prepare_read(subreq, rreq->i_size); - if (source == NETFS_INVALID_READ) - goto out; + if (rreq->origin != NETFS_DIO_READ) { + source = netfs_cache_prepare_read(subreq, rreq->i_size); + if (source == NETFS_INVALID_READ) + goto out; + } if (source == NETFS_DOWNLOAD_FROM_SERVER) { /* Call out to the netfs to let it shrink the request to fit @@ -518,19 +587,52 @@ netfs_rreq_prepare_read(struct netfs_io_request *rreq, * to make serial calls, it can indicate a short read and then * we will call it again. */ + if (rreq->origin != NETFS_DIO_READ) { + if (subreq->start >= ictx->zero_point) { + source = NETFS_FILL_WITH_ZEROES; + goto set; + } + if (subreq->len > ictx->zero_point - subreq->start) + subreq->len = ictx->zero_point - subreq->start; + } if (subreq->len > rreq->i_size - subreq->start) subreq->len = rreq->i_size - subreq->start; + if (rreq->rsize && subreq->len > rreq->rsize) + subreq->len = rreq->rsize; if (rreq->netfs_ops->clamp_length && !rreq->netfs_ops->clamp_length(subreq)) { source = NETFS_INVALID_READ; goto out; } + + if (subreq->max_nr_segs) { + lsize = netfs_limit_iter(io_iter, 0, subreq->len, + subreq->max_nr_segs); + if (subreq->len > lsize) { + subreq->len = lsize; + trace_netfs_sreq(subreq, netfs_sreq_trace_limited); + } + } } - if (WARN_ON(subreq->len == 0)) +set: + if (subreq->len > rreq->len) + pr_warn("R=%08x[%u] SREQ>RREQ %zx > %zx\n", + rreq->debug_id, subreq->debug_index, + subreq->len, rreq->len); + + if (WARN_ON(subreq->len == 0)) { source = NETFS_INVALID_READ; + goto out; + } + subreq->source = source; + trace_netfs_sreq(subreq, netfs_sreq_trace_prepare); + + subreq->io_iter = *io_iter; + iov_iter_truncate(&subreq->io_iter, subreq->len); + iov_iter_advance(io_iter, subreq->len); out: subreq->source = source; trace_netfs_sreq(subreq, netfs_sreq_trace_prepare); @@ -541,6 +643,7 @@ out: * Slice off a piece of a read request and submit an I/O request for it. */ static bool netfs_rreq_submit_slice(struct netfs_io_request *rreq, + struct iov_iter *io_iter, unsigned int *_debug_index) { struct netfs_io_subrequest *subreq; @@ -552,7 +655,7 @@ static bool netfs_rreq_submit_slice(struct netfs_io_request *rreq, subreq->debug_index = (*_debug_index)++; subreq->start = rreq->start + rreq->submitted; - subreq->len = rreq->len - rreq->submitted; + subreq->len = io_iter->count; _debug("slice %llx,%zx,%zx", subreq->start, subreq->len, rreq->submitted); list_add_tail(&subreq->rreq_link, &rreq->subrequests); @@ -565,7 +668,7 @@ static bool netfs_rreq_submit_slice(struct netfs_io_request *rreq, * (the starts must coincide), in which case, we go around the loop * again and ask it to download the next piece. */ - source = netfs_rreq_prepare_read(rreq, subreq); + source = netfs_rreq_prepare_read(rreq, subreq, io_iter); if (source == NETFS_INVALID_READ) goto subreq_failed; @@ -603,6 +706,7 @@ subreq_failed: */ int netfs_begin_read(struct netfs_io_request *rreq, bool sync) { + struct iov_iter io_iter; unsigned int debug_index = 0; int ret; @@ -611,50 +715,71 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync) if (rreq->len == 0) { pr_err("Zero-sized read [R=%x]\n", rreq->debug_id); - netfs_put_request(rreq, false, netfs_rreq_trace_put_zero_len); return -EIO; } - INIT_WORK(&rreq->work, netfs_rreq_work); + if (rreq->origin == NETFS_DIO_READ) + inode_dio_begin(rreq->inode); - if (sync) - netfs_get_request(rreq, netfs_rreq_trace_get_hold); + // TODO: Use bounce buffer if requested + rreq->io_iter = rreq->iter; + + INIT_WORK(&rreq->work, netfs_rreq_work); /* Chop the read into slices according to what the cache and the netfs * want and submit each one. */ + netfs_get_request(rreq, netfs_rreq_trace_get_for_outstanding); atomic_set(&rreq->nr_outstanding, 1); + io_iter = rreq->io_iter; do { - if (!netfs_rreq_submit_slice(rreq, &debug_index)) + _debug("submit %llx + %zx >= %llx", + rreq->start, rreq->submitted, rreq->i_size); + if (rreq->origin == NETFS_DIO_READ && + rreq->start + rreq->submitted >= rreq->i_size) + break; + if (!netfs_rreq_submit_slice(rreq, &io_iter, &debug_index)) + break; + if (test_bit(NETFS_RREQ_BLOCKED, &rreq->flags) && + test_bit(NETFS_RREQ_NONBLOCK, &rreq->flags)) break; } while (rreq->submitted < rreq->len); + if (!rreq->submitted) { + netfs_put_request(rreq, false, netfs_rreq_trace_put_no_submit); + ret = 0; + goto out; + } + if (sync) { - /* Keep nr_outstanding incremented so that the ref always belongs to - * us, and the service code isn't punted off to a random thread pool to - * process. + /* Keep nr_outstanding incremented so that the ref always + * belongs to us, and the service code isn't punted off to a + * random thread pool to process. Note that this might start + * further work, such as writing to the cache. */ - for (;;) { - wait_var_event(&rreq->nr_outstanding, - atomic_read(&rreq->nr_outstanding) == 1); + wait_var_event(&rreq->nr_outstanding, + atomic_read(&rreq->nr_outstanding) == 1); + if (atomic_dec_and_test(&rreq->nr_outstanding)) netfs_rreq_assess(rreq, false); - if (!test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags)) - break; - cond_resched(); - } + + trace_netfs_rreq(rreq, netfs_rreq_trace_wait_ip); + wait_on_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS, + TASK_UNINTERRUPTIBLE); ret = rreq->error; - if (ret == 0 && rreq->submitted < rreq->len) { + if (ret == 0 && rreq->submitted < rreq->len && + rreq->origin != NETFS_DIO_READ) { trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_read); ret = -EIO; } - netfs_put_request(rreq, false, netfs_rreq_trace_put_hold); } else { /* If we decrement nr_outstanding to 0, the ref belongs to us. */ if (atomic_dec_and_test(&rreq->nr_outstanding)) netfs_rreq_assess(rreq, false); - ret = 0; + ret = -EIOCBQUEUED; } + +out: return ret; } diff --git a/fs/netfs/iterator.c b/fs/netfs/iterator.c index 2ff07ba655a0..b781bbbf1d8d 100644 --- a/fs/netfs/iterator.c +++ b/fs/netfs/iterator.c @@ -101,3 +101,100 @@ ssize_t netfs_extract_user_iter(struct iov_iter *orig, size_t orig_len, return npages; } EXPORT_SYMBOL_GPL(netfs_extract_user_iter); + +/* + * Select the span of a bvec iterator we're going to use. Limit it by both maximum + * size and maximum number of segments. Returns the size of the span in bytes. + */ +static size_t netfs_limit_bvec(const struct iov_iter *iter, size_t start_offset, + size_t max_size, size_t max_segs) +{ + const struct bio_vec *bvecs = iter->bvec; + unsigned int nbv = iter->nr_segs, ix = 0, nsegs = 0; + size_t len, span = 0, n = iter->count; + size_t skip = iter->iov_offset + start_offset; + + if (WARN_ON(!iov_iter_is_bvec(iter)) || + WARN_ON(start_offset > n) || + n == 0) + return 0; + + while (n && ix < nbv && skip) { + len = bvecs[ix].bv_len; + if (skip < len) + break; + skip -= len; + n -= len; + ix++; + } + + while (n && ix < nbv) { + len = min3(n, bvecs[ix].bv_len - skip, max_size); + span += len; + nsegs++; + ix++; + if (span >= max_size || nsegs >= max_segs) + break; + skip = 0; + n -= len; + } + + return min(span, max_size); +} + +/* + * Select the span of an xarray iterator we're going to use. Limit it by both + * maximum size and maximum number of segments. It is assumed that segments + * can be larger than a page in size, provided they're physically contiguous. + * Returns the size of the span in bytes. + */ +static size_t netfs_limit_xarray(const struct iov_iter *iter, size_t start_offset, + size_t max_size, size_t max_segs) +{ + struct folio *folio; + unsigned int nsegs = 0; + loff_t pos = iter->xarray_start + iter->iov_offset; + pgoff_t index = pos / PAGE_SIZE; + size_t span = 0, n = iter->count; + + XA_STATE(xas, iter->xarray, index); + + if (WARN_ON(!iov_iter_is_xarray(iter)) || + WARN_ON(start_offset > n) || + n == 0) + return 0; + max_size = min(max_size, n - start_offset); + + rcu_read_lock(); + xas_for_each(&xas, folio, ULONG_MAX) { + size_t offset, flen, len; + if (xas_retry(&xas, folio)) + continue; + if (WARN_ON(xa_is_value(folio))) + break; + if (WARN_ON(folio_test_hugetlb(folio))) + break; + + flen = folio_size(folio); + offset = offset_in_folio(folio, pos); + len = min(max_size, flen - offset); + span += len; + nsegs++; + if (span >= max_size || nsegs >= max_segs) + break; + } + + rcu_read_unlock(); + return min(span, max_size); +} + +size_t netfs_limit_iter(const struct iov_iter *iter, size_t start_offset, + size_t max_size, size_t max_segs) +{ + if (iov_iter_is_bvec(iter)) + return netfs_limit_bvec(iter, start_offset, max_size, max_segs); + if (iov_iter_is_xarray(iter)) + return netfs_limit_xarray(iter, start_offset, max_size, max_segs); + BUG(); +} +EXPORT_SYMBOL(netfs_limit_iter); diff --git a/fs/netfs/locking.c b/fs/netfs/locking.c new file mode 100644 index 000000000000..75dc52a49b3a --- /dev/null +++ b/fs/netfs/locking.c @@ -0,0 +1,216 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * I/O and data path helper functionality. + * + * Borrowed from NFS Copyright (c) 2016 Trond Myklebust + */ + +#include <linux/kernel.h> +#include <linux/netfs.h> +#include "internal.h" + +/* + * inode_dio_wait_interruptible - wait for outstanding DIO requests to finish + * @inode: inode to wait for + * + * Waits for all pending direct I/O requests to finish so that we can + * proceed with a truncate or equivalent operation. + * + * Must be called under a lock that serializes taking new references + * to i_dio_count, usually by inode->i_mutex. + */ +static int inode_dio_wait_interruptible(struct inode *inode) +{ + if (!atomic_read(&inode->i_dio_count)) + return 0; + + wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_DIO_WAKEUP); + DEFINE_WAIT_BIT(q, &inode->i_state, __I_DIO_WAKEUP); + + for (;;) { + prepare_to_wait(wq, &q.wq_entry, TASK_INTERRUPTIBLE); + if (!atomic_read(&inode->i_dio_count)) + break; + if (signal_pending(current)) + break; + schedule(); + } + finish_wait(wq, &q.wq_entry); + + return atomic_read(&inode->i_dio_count) ? -ERESTARTSYS : 0; +} + +/* Call with exclusively locked inode->i_rwsem */ +static int netfs_block_o_direct(struct netfs_inode *ictx) +{ + if (!test_bit(NETFS_ICTX_ODIRECT, &ictx->flags)) + return 0; + clear_bit(NETFS_ICTX_ODIRECT, &ictx->flags); + return inode_dio_wait_interruptible(&ictx->inode); +} + +/** + * netfs_start_io_read - declare the file is being used for buffered reads + * @inode: file inode + * + * Declare that a buffered read operation is about to start, and ensure + * that we block all direct I/O. + * On exit, the function ensures that the NETFS_ICTX_ODIRECT flag is unset, + * and holds a shared lock on inode->i_rwsem to ensure that the flag + * cannot be changed. + * In practice, this means that buffered read operations are allowed to + * execute in parallel, thanks to the shared lock, whereas direct I/O + * operations need to wait to grab an exclusive lock in order to set + * NETFS_ICTX_ODIRECT. + * Note that buffered writes and truncates both take a write lock on + * inode->i_rwsem, meaning that those are serialised w.r.t. the reads. + */ +int netfs_start_io_read(struct inode *inode) + __acquires(inode->i_rwsem) +{ + struct netfs_inode *ictx = netfs_inode(inode); + + /* Be an optimist! */ + if (down_read_interruptible(&inode->i_rwsem) < 0) + return -ERESTARTSYS; + if (test_bit(NETFS_ICTX_ODIRECT, &ictx->flags) == 0) + return 0; + up_read(&inode->i_rwsem); + + /* Slow path.... */ + if (down_write_killable(&inode->i_rwsem) < 0) + return -ERESTARTSYS; + if (netfs_block_o_direct(ictx) < 0) { + up_write(&inode->i_rwsem); + return -ERESTARTSYS; + } + downgrade_write(&inode->i_rwsem); + return 0; +} +EXPORT_SYMBOL(netfs_start_io_read); + +/** + * netfs_end_io_read - declare that the buffered read operation is done + * @inode: file inode + * + * Declare that a buffered read operation is done, and release the shared + * lock on inode->i_rwsem. + */ +void netfs_end_io_read(struct inode *inode) + __releases(inode->i_rwsem) +{ + up_read(&inode->i_rwsem); +} +EXPORT_SYMBOL(netfs_end_io_read); + +/** + * netfs_start_io_write - declare the file is being used for buffered writes + * @inode: file inode + * + * Declare that a buffered read operation is about to start, and ensure + * that we block all direct I/O. + */ +int netfs_start_io_write(struct inode *inode) + __acquires(inode->i_rwsem) +{ + struct netfs_inode *ictx = netfs_inode(inode); + + if (down_write_killable(&inode->i_rwsem) < 0) + return -ERESTARTSYS; + if (netfs_block_o_direct(ictx) < 0) { + up_write(&inode->i_rwsem); + return -ERESTARTSYS; + } + return 0; +} +EXPORT_SYMBOL(netfs_start_io_write); + +/** + * netfs_end_io_write - declare that the buffered write operation is done + * @inode: file inode + * + * Declare that a buffered write operation is done, and release the + * lock on inode->i_rwsem. + */ +void netfs_end_io_write(struct inode *inode) + __releases(inode->i_rwsem) +{ + up_write(&inode->i_rwsem); +} +EXPORT_SYMBOL(netfs_end_io_write); + +/* Call with exclusively locked inode->i_rwsem */ +static int netfs_block_buffered(struct inode *inode) +{ + struct netfs_inode *ictx = netfs_inode(inode); + int ret; + + if (!test_bit(NETFS_ICTX_ODIRECT, &ictx->flags)) { + set_bit(NETFS_ICTX_ODIRECT, &ictx->flags); + if (inode->i_mapping->nrpages != 0) { + unmap_mapping_range(inode->i_mapping, 0, 0, 0); + ret = filemap_fdatawait(inode->i_mapping); + if (ret < 0) { + clear_bit(NETFS_ICTX_ODIRECT, &ictx->flags); + return ret; + } + } + } + return 0; +} + +/** + * netfs_start_io_direct - declare the file is being used for direct i/o + * @inode: file inode + * + * Declare that a direct I/O operation is about to start, and ensure + * that we block all buffered I/O. + * On exit, the function ensures that the NETFS_ICTX_ODIRECT flag is set, + * and holds a shared lock on inode->i_rwsem to ensure that the flag + * cannot be changed. + * In practice, this means that direct I/O operations are allowed to + * execute in parallel, thanks to the shared lock, whereas buffered I/O + * operations need to wait to grab an exclusive lock in order to clear + * NETFS_ICTX_ODIRECT. + * Note that buffered writes and truncates both take a write lock on + * inode->i_rwsem, meaning that those are serialised w.r.t. O_DIRECT. + */ +int netfs_start_io_direct(struct inode *inode) + __acquires(inode->i_rwsem) +{ + struct netfs_inode *ictx = netfs_inode(inode); + int ret; + + /* Be an optimist! */ + if (down_read_interruptible(&inode->i_rwsem) < 0) + return -ERESTARTSYS; + if (test_bit(NETFS_ICTX_ODIRECT, &ictx->flags) != 0) + return 0; + up_read(&inode->i_rwsem); + + /* Slow path.... */ + if (down_write_killable(&inode->i_rwsem) < 0) + return -ERESTARTSYS; + ret = netfs_block_buffered(inode); + if (ret < 0) { + up_write(&inode->i_rwsem); + return ret; + } + downgrade_write(&inode->i_rwsem); + return 0; +} +EXPORT_SYMBOL(netfs_start_io_direct); + +/** + * netfs_end_io_direct - declare that the direct i/o operation is done + * @inode: file inode + * + * Declare that a direct I/O operation is done, and release the shared + * lock on inode->i_rwsem. + */ +void netfs_end_io_direct(struct inode *inode) + __releases(inode->i_rwsem) +{ + up_read(&inode->i_rwsem); +} +EXPORT_SYMBOL(netfs_end_io_direct); diff --git a/fs/netfs/main.c b/fs/netfs/main.c index 068568702957..5e77618a7940 100644 --- a/fs/netfs/main.c +++ b/fs/netfs/main.c @@ -7,6 +7,8 @@ #include <linux/module.h> #include <linux/export.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> #include "internal.h" #define CREATE_TRACE_POINTS #include <trace/events/netfs.h> @@ -15,6 +17,113 @@ MODULE_DESCRIPTION("Network fs support"); MODULE_AUTHOR("Red Hat, Inc."); MODULE_LICENSE("GPL"); +EXPORT_TRACEPOINT_SYMBOL(netfs_sreq); + unsigned netfs_debug; module_param_named(debug, netfs_debug, uint, S_IWUSR | S_IRUGO); MODULE_PARM_DESC(netfs_debug, "Netfs support debugging mask"); + +#ifdef CONFIG_PROC_FS +LIST_HEAD(netfs_io_requests); +DEFINE_SPINLOCK(netfs_proc_lock); + +static const char *netfs_origins[nr__netfs_io_origin] = { + [NETFS_READAHEAD] = "RA", + [NETFS_READPAGE] = "RP", + [NETFS_READ_FOR_WRITE] = "RW", + [NETFS_WRITEBACK] = "WB", + [NETFS_WRITETHROUGH] = "WT", + [NETFS_LAUNDER_WRITE] = "LW", + [NETFS_UNBUFFERED_WRITE] = "UW", + [NETFS_DIO_READ] = "DR", + [NETFS_DIO_WRITE] = "DW", +}; + +/* + * Generate a list of I/O requests in /proc/fs/netfs/requests + */ +static int netfs_requests_seq_show(struct seq_file *m, void *v) +{ + struct netfs_io_request *rreq; + + if (v == &netfs_io_requests) { + seq_puts(m, + "REQUEST OR REF FL ERR OPS COVERAGE\n" + "======== == === == ==== === =========\n" + ); + return 0; + } + + rreq = list_entry(v, struct netfs_io_request, proc_link); + seq_printf(m, + "%08x %s %3d %2lx %4d %3d @%04llx %zx/%zx", + rreq->debug_id, + netfs_origins[rreq->origin], + refcount_read(&rreq->ref), + rreq->flags, + rreq->error, + atomic_read(&rreq->nr_outstanding), + rreq->start, rreq->submitted, rreq->len); + seq_putc(m, '\n'); + return 0; +} + +static void *netfs_requests_seq_start(struct seq_file *m, loff_t *_pos) + __acquires(rcu) +{ + rcu_read_lock(); + return seq_list_start_head(&netfs_io_requests, *_pos); +} + +static void *netfs_requests_seq_next(struct seq_file *m, void *v, loff_t *_pos) +{ + return seq_list_next(v, &netfs_io_requests, _pos); +} + +static void netfs_requests_seq_stop(struct seq_file *m, void *v) + __releases(rcu) +{ + rcu_read_unlock(); +} + +static const struct seq_operations netfs_requests_seq_ops = { + .start = netfs_requests_seq_start, + .next = netfs_requests_seq_next, + .stop = netfs_requests_seq_stop, + .show = netfs_requests_seq_show, +}; +#endif /* CONFIG_PROC_FS */ + +static int __init netfs_init(void) +{ + int ret = -ENOMEM; + + if (!proc_mkdir("fs/netfs", NULL)) + goto error; + if (!proc_create_seq("fs/netfs/requests", S_IFREG | 0444, NULL, + &netfs_requests_seq_ops)) + goto error_proc; +#ifdef CONFIG_FSCACHE_STATS + if (!proc_create_single("fs/netfs/stats", S_IFREG | 0444, NULL, + netfs_stats_show)) + goto error_proc; +#endif + + ret = fscache_init(); + if (ret < 0) + goto error_proc; + return 0; + +error_proc: + remove_proc_entry("fs/netfs", NULL); +error: + return ret; +} +fs_initcall(netfs_init); + +static void __exit netfs_exit(void) +{ + fscache_exit(); + remove_proc_entry("fs/netfs", NULL); +} +module_exit(netfs_exit); diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c new file mode 100644 index 000000000000..90051ced8e2a --- /dev/null +++ b/fs/netfs/misc.c @@ -0,0 +1,260 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Miscellaneous routines. + * + * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/swap.h> +#include "internal.h" + +/* + * Attach a folio to the buffer and maybe set marks on it to say that we need + * to put the folio later and twiddle the pagecache flags. + */ +int netfs_xa_store_and_mark(struct xarray *xa, unsigned long index, + struct folio *folio, unsigned int flags, + gfp_t gfp_mask) +{ + XA_STATE_ORDER(xas, xa, index, folio_order(folio)); + +retry: + xas_lock(&xas); + for (;;) { + xas_store(&xas, folio); + if (!xas_error(&xas)) + break; + xas_unlock(&xas); + if (!xas_nomem(&xas, gfp_mask)) + return xas_error(&xas); + goto retry; + } + + if (flags & NETFS_FLAG_PUT_MARK) + xas_set_mark(&xas, NETFS_BUF_PUT_MARK); + if (flags & NETFS_FLAG_PAGECACHE_MARK) + xas_set_mark(&xas, NETFS_BUF_PAGECACHE_MARK); + xas_unlock(&xas); + return xas_error(&xas); +} + +/* + * Create the specified range of folios in the buffer attached to the read + * request. The folios are marked with NETFS_BUF_PUT_MARK so that we know that + * these need freeing later. + */ +int netfs_add_folios_to_buffer(struct xarray *buffer, + struct address_space *mapping, + pgoff_t index, pgoff_t to, gfp_t gfp_mask) +{ + struct folio *folio; + int ret; + + if (to + 1 == index) /* Page range is inclusive */ + return 0; + + do { + /* TODO: Figure out what order folio can be allocated here */ + folio = filemap_alloc_folio(readahead_gfp_mask(mapping), 0); + if (!folio) + return -ENOMEM; + folio->index = index; + ret = netfs_xa_store_and_mark(buffer, index, folio, + NETFS_FLAG_PUT_MARK, gfp_mask); + if (ret < 0) { + folio_put(folio); + return ret; + } + + index += folio_nr_pages(folio); + } while (index <= to && index != 0); + + return 0; +} + +/* + * Clear an xarray buffer, putting a ref on the folios that have + * NETFS_BUF_PUT_MARK set. + */ +void netfs_clear_buffer(struct xarray *buffer) +{ + struct folio *folio; + XA_STATE(xas, buffer, 0); + + rcu_read_lock(); + xas_for_each_marked(&xas, folio, ULONG_MAX, NETFS_BUF_PUT_MARK) { + folio_put(folio); + } + rcu_read_unlock(); + xa_destroy(buffer); +} + +/** + * netfs_dirty_folio - Mark folio dirty and pin a cache object for writeback + * @mapping: The mapping the folio belongs to. + * @folio: The folio being dirtied. + * + * Set the dirty flag on a folio and pin an in-use cache object in memory so + * that writeback can later write to it. This is intended to be called from + * the filesystem's ->dirty_folio() method. + * + * Return: true if the dirty flag was set on the folio, false otherwise. + */ +bool netfs_dirty_folio(struct address_space *mapping, struct folio *folio) +{ + struct inode *inode = mapping->host; + struct netfs_inode *ictx = netfs_inode(inode); + struct fscache_cookie *cookie = netfs_i_cookie(ictx); + bool need_use = false; + + _enter(""); + + if (!filemap_dirty_folio(mapping, folio)) + return false; + if (!fscache_cookie_valid(cookie)) + return true; + + if (!(inode->i_state & I_PINNING_NETFS_WB)) { + spin_lock(&inode->i_lock); + if (!(inode->i_state & I_PINNING_NETFS_WB)) { + inode->i_state |= I_PINNING_NETFS_WB; + need_use = true; + } + spin_unlock(&inode->i_lock); + + if (need_use) + fscache_use_cookie(cookie, true); + } + return true; +} +EXPORT_SYMBOL(netfs_dirty_folio); + +/** + * netfs_unpin_writeback - Unpin writeback resources + * @inode: The inode on which the cookie resides + * @wbc: The writeback control + * + * Unpin the writeback resources pinned by netfs_dirty_folio(). This is + * intended to be called as/by the netfs's ->write_inode() method. + */ +int netfs_unpin_writeback(struct inode *inode, struct writeback_control *wbc) +{ + struct fscache_cookie *cookie = netfs_i_cookie(netfs_inode(inode)); + + if (wbc->unpinned_netfs_wb) + fscache_unuse_cookie(cookie, NULL, NULL); + return 0; +} +EXPORT_SYMBOL(netfs_unpin_writeback); + +/** + * netfs_clear_inode_writeback - Clear writeback resources pinned by an inode + * @inode: The inode to clean up + * @aux: Auxiliary data to apply to the inode + * + * Clear any writeback resources held by an inode when the inode is evicted. + * This must be called before clear_inode() is called. + */ +void netfs_clear_inode_writeback(struct inode *inode, const void *aux) +{ + struct fscache_cookie *cookie = netfs_i_cookie(netfs_inode(inode)); + + if (inode->i_state & I_PINNING_NETFS_WB) { + loff_t i_size = i_size_read(inode); + fscache_unuse_cookie(cookie, aux, &i_size); + } +} +EXPORT_SYMBOL(netfs_clear_inode_writeback); + +/** + * netfs_invalidate_folio - Invalidate or partially invalidate a folio + * @folio: Folio proposed for release + * @offset: Offset of the invalidated region + * @length: Length of the invalidated region + * + * Invalidate part or all of a folio for a network filesystem. The folio will + * be removed afterwards if the invalidated region covers the entire folio. + */ +void netfs_invalidate_folio(struct folio *folio, size_t offset, size_t length) +{ + struct netfs_folio *finfo = NULL; + size_t flen = folio_size(folio); + + _enter("{%lx},%zx,%zx", folio->index, offset, length); + + folio_wait_fscache(folio); + + if (!folio_test_private(folio)) + return; + + finfo = netfs_folio_info(folio); + + if (offset == 0 && length >= flen) + goto erase_completely; + + if (finfo) { + /* We have a partially uptodate page from a streaming write. */ + unsigned int fstart = finfo->dirty_offset; + unsigned int fend = fstart + finfo->dirty_len; + unsigned int end = offset + length; + + if (offset >= fend) + return; + if (end <= fstart) + return; + if (offset <= fstart && end >= fend) + goto erase_completely; + if (offset <= fstart && end > fstart) + goto reduce_len; + if (offset > fstart && end >= fend) + goto move_start; + /* A partial write was split. The caller has already zeroed + * it, so just absorb the hole. + */ + } + return; + +erase_completely: + netfs_put_group(netfs_folio_group(folio)); + folio_detach_private(folio); + folio_clear_uptodate(folio); + kfree(finfo); + return; +reduce_len: + finfo->dirty_len = offset + length - finfo->dirty_offset; + return; +move_start: + finfo->dirty_len -= offset - finfo->dirty_offset; + finfo->dirty_offset = offset; +} +EXPORT_SYMBOL(netfs_invalidate_folio); + +/** + * netfs_release_folio - Try to release a folio + * @folio: Folio proposed for release + * @gfp: Flags qualifying the release + * + * Request release of a folio and clean up its private state if it's not busy. + * Returns true if the folio can now be released, false if not + */ +bool netfs_release_folio(struct folio *folio, gfp_t gfp) +{ + struct netfs_inode *ctx = netfs_inode(folio_inode(folio)); + unsigned long long end; + + end = folio_pos(folio) + folio_size(folio); + if (end > ctx->zero_point) + ctx->zero_point = end; + + if (folio_test_private(folio)) + return false; + if (folio_test_fscache(folio)) { + if (current_is_kswapd() || !(gfp & __GFP_FS)) + return false; + folio_wait_fscache(folio); + } + + fscache_note_page_release(netfs_i_cookie(ctx)); + return true; +} +EXPORT_SYMBOL(netfs_release_folio); diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c index e17cdf53f6a7..610ceb5bd86c 100644 --- a/fs/netfs/objects.c +++ b/fs/netfs/objects.c @@ -20,14 +20,20 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, struct inode *inode = file ? file_inode(file) : mapping->host; struct netfs_inode *ctx = netfs_inode(inode); struct netfs_io_request *rreq; + bool is_unbuffered = (origin == NETFS_UNBUFFERED_WRITE || + origin == NETFS_DIO_READ || + origin == NETFS_DIO_WRITE); + bool cached = !is_unbuffered && netfs_is_cache_enabled(ctx); int ret; - rreq = kzalloc(sizeof(struct netfs_io_request), GFP_KERNEL); + rreq = kzalloc(ctx->ops->io_request_size ?: sizeof(struct netfs_io_request), + GFP_KERNEL); if (!rreq) return ERR_PTR(-ENOMEM); rreq->start = start; rreq->len = len; + rreq->upper_len = len; rreq->origin = origin; rreq->netfs_ops = ctx->ops; rreq->mapping = mapping; @@ -35,8 +41,14 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, rreq->i_size = i_size_read(inode); rreq->debug_id = atomic_inc_return(&debug_ids); INIT_LIST_HEAD(&rreq->subrequests); + INIT_WORK(&rreq->work, NULL); refcount_set(&rreq->ref, 1); + __set_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags); + if (cached) + __set_bit(NETFS_RREQ_WRITE_TO_CACHE, &rreq->flags); + if (file && file->f_flags & O_NONBLOCK) + __set_bit(NETFS_RREQ_NONBLOCK, &rreq->flags); if (rreq->netfs_ops->init_request) { ret = rreq->netfs_ops->init_request(rreq, file); if (ret < 0) { @@ -45,6 +57,8 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, } } + trace_netfs_rreq_ref(rreq->debug_id, 1, netfs_rreq_trace_new); + netfs_proc_add_rreq(rreq); netfs_stat(&netfs_n_rh_rreq); return rreq; } @@ -74,33 +88,47 @@ static void netfs_free_request(struct work_struct *work) { struct netfs_io_request *rreq = container_of(work, struct netfs_io_request, work); + unsigned int i; trace_netfs_rreq(rreq, netfs_rreq_trace_free); + netfs_proc_del_rreq(rreq); netfs_clear_subrequests(rreq, false); if (rreq->netfs_ops->free_request) rreq->netfs_ops->free_request(rreq); if (rreq->cache_resources.ops) rreq->cache_resources.ops->end_operation(&rreq->cache_resources); - kfree(rreq); + if (rreq->direct_bv) { + for (i = 0; i < rreq->direct_bv_count; i++) { + if (rreq->direct_bv[i].bv_page) { + if (rreq->direct_bv_unpin) + unpin_user_page(rreq->direct_bv[i].bv_page); + } + } + kvfree(rreq->direct_bv); + } + kfree_rcu(rreq, rcu); netfs_stat_d(&netfs_n_rh_rreq); } void netfs_put_request(struct netfs_io_request *rreq, bool was_async, enum netfs_rreq_ref_trace what) { - unsigned int debug_id = rreq->debug_id; + unsigned int debug_id; bool dead; int r; - dead = __refcount_dec_and_test(&rreq->ref, &r); - trace_netfs_rreq_ref(debug_id, r - 1, what); - if (dead) { - if (was_async) { - rreq->work.func = netfs_free_request; - if (!queue_work(system_unbound_wq, &rreq->work)) - BUG(); - } else { - netfs_free_request(&rreq->work); + if (rreq) { + debug_id = rreq->debug_id; + dead = __refcount_dec_and_test(&rreq->ref, &r); + trace_netfs_rreq_ref(debug_id, r - 1, what); + if (dead) { + if (was_async) { + rreq->work.func = netfs_free_request; + if (!queue_work(system_unbound_wq, &rreq->work)) + BUG(); + } else { + netfs_free_request(&rreq->work); + } } } } @@ -112,8 +140,11 @@ struct netfs_io_subrequest *netfs_alloc_subrequest(struct netfs_io_request *rreq { struct netfs_io_subrequest *subreq; - subreq = kzalloc(sizeof(struct netfs_io_subrequest), GFP_KERNEL); + subreq = kzalloc(rreq->netfs_ops->io_subrequest_size ?: + sizeof(struct netfs_io_subrequest), + GFP_KERNEL); if (subreq) { + INIT_WORK(&subreq->work, NULL); INIT_LIST_HEAD(&subreq->rreq_link); refcount_set(&subreq->ref, 2); subreq->rreq = rreq; @@ -140,6 +171,8 @@ static void netfs_free_subrequest(struct netfs_io_subrequest *subreq, struct netfs_io_request *rreq = subreq->rreq; trace_netfs_sreq(subreq, netfs_sreq_trace_free); + if (rreq->netfs_ops->free_subrequest) + rreq->netfs_ops->free_subrequest(subreq); kfree(subreq); netfs_stat_d(&netfs_n_rh_sreq); netfs_put_request(rreq, was_async, netfs_rreq_trace_put_subreq); diff --git a/fs/netfs/output.c b/fs/netfs/output.c new file mode 100644 index 000000000000..625eb68f3e5a --- /dev/null +++ b/fs/netfs/output.c @@ -0,0 +1,478 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Network filesystem high-level write support. + * + * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/pagemap.h> +#include <linux/slab.h> +#include <linux/writeback.h> +#include <linux/pagevec.h> +#include "internal.h" + +/** + * netfs_create_write_request - Create a write operation. + * @wreq: The write request this is storing from. + * @dest: The destination type + * @start: Start of the region this write will modify + * @len: Length of the modification + * @worker: The worker function to handle the write(s) + * + * Allocate a write operation, set it up and add it to the list on a write + * request. + */ +struct netfs_io_subrequest *netfs_create_write_request(struct netfs_io_request *wreq, + enum netfs_io_source dest, + loff_t start, size_t len, + work_func_t worker) +{ + struct netfs_io_subrequest *subreq; + + subreq = netfs_alloc_subrequest(wreq); + if (subreq) { + INIT_WORK(&subreq->work, worker); + subreq->source = dest; + subreq->start = start; + subreq->len = len; + subreq->debug_index = wreq->subreq_counter++; + + switch (subreq->source) { + case NETFS_UPLOAD_TO_SERVER: + netfs_stat(&netfs_n_wh_upload); + break; + case NETFS_WRITE_TO_CACHE: + netfs_stat(&netfs_n_wh_write); + break; + default: + BUG(); + } + + subreq->io_iter = wreq->io_iter; + iov_iter_advance(&subreq->io_iter, subreq->start - wreq->start); + iov_iter_truncate(&subreq->io_iter, subreq->len); + + trace_netfs_sreq_ref(wreq->debug_id, subreq->debug_index, + refcount_read(&subreq->ref), + netfs_sreq_trace_new); + atomic_inc(&wreq->nr_outstanding); + list_add_tail(&subreq->rreq_link, &wreq->subrequests); + trace_netfs_sreq(subreq, netfs_sreq_trace_prepare); + } + + return subreq; +} +EXPORT_SYMBOL(netfs_create_write_request); + +/* + * Process a completed write request once all the component operations have + * been completed. + */ +static void netfs_write_terminated(struct netfs_io_request *wreq, bool was_async) +{ + struct netfs_io_subrequest *subreq; + struct netfs_inode *ctx = netfs_inode(wreq->inode); + size_t transferred = 0; + + _enter("R=%x[]", wreq->debug_id); + + trace_netfs_rreq(wreq, netfs_rreq_trace_write_done); + + list_for_each_entry(subreq, &wreq->subrequests, rreq_link) { + if (subreq->error || subreq->transferred == 0) + break; + transferred += subreq->transferred; + if (subreq->transferred < subreq->len) + break; + } + wreq->transferred = transferred; + + list_for_each_entry(subreq, &wreq->subrequests, rreq_link) { + if (!subreq->error) + continue; + switch (subreq->source) { + case NETFS_UPLOAD_TO_SERVER: + /* Depending on the type of failure, this may prevent + * writeback completion unless we're in disconnected + * mode. + */ + if (!wreq->error) + wreq->error = subreq->error; + break; + + case NETFS_WRITE_TO_CACHE: + /* Failure doesn't prevent writeback completion unless + * we're in disconnected mode. + */ + if (subreq->error != -ENOBUFS) + ctx->ops->invalidate_cache(wreq); + break; + + default: + WARN_ON_ONCE(1); + if (!wreq->error) + wreq->error = -EIO; + return; + } + } + + wreq->cleanup(wreq); + + if (wreq->origin == NETFS_DIO_WRITE && + wreq->mapping->nrpages) { + pgoff_t first = wreq->start >> PAGE_SHIFT; + pgoff_t last = (wreq->start + wreq->transferred - 1) >> PAGE_SHIFT; + invalidate_inode_pages2_range(wreq->mapping, first, last); + } + + if (wreq->origin == NETFS_DIO_WRITE) + inode_dio_end(wreq->inode); + + _debug("finished"); + trace_netfs_rreq(wreq, netfs_rreq_trace_wake_ip); + clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &wreq->flags); + wake_up_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS); + + if (wreq->iocb) { + wreq->iocb->ki_pos += transferred; + if (wreq->iocb->ki_complete) + wreq->iocb->ki_complete( + wreq->iocb, wreq->error ? wreq->error : transferred); + } + + netfs_clear_subrequests(wreq, was_async); + netfs_put_request(wreq, was_async, netfs_rreq_trace_put_complete); +} + +/* + * Deal with the completion of writing the data to the cache. + */ +void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error, + bool was_async) +{ + struct netfs_io_subrequest *subreq = _op; + struct netfs_io_request *wreq = subreq->rreq; + unsigned int u; + + _enter("%x[%x] %zd", wreq->debug_id, subreq->debug_index, transferred_or_error); + + switch (subreq->source) { + case NETFS_UPLOAD_TO_SERVER: + netfs_stat(&netfs_n_wh_upload_done); + break; + case NETFS_WRITE_TO_CACHE: + netfs_stat(&netfs_n_wh_write_done); + break; + case NETFS_INVALID_WRITE: + break; + default: + BUG(); + } + + if (IS_ERR_VALUE(transferred_or_error)) { + subreq->error = transferred_or_error; + trace_netfs_failure(wreq, subreq, transferred_or_error, + netfs_fail_write); + goto failed; + } + + if (WARN(transferred_or_error > subreq->len - subreq->transferred, + "Subreq excess write: R%x[%x] %zd > %zu - %zu", + wreq->debug_id, subreq->debug_index, + transferred_or_error, subreq->len, subreq->transferred)) + transferred_or_error = subreq->len - subreq->transferred; + + subreq->error = 0; + subreq->transferred += transferred_or_error; + + if (iov_iter_count(&subreq->io_iter) != subreq->len - subreq->transferred) + pr_warn("R=%08x[%u] ITER POST-MISMATCH %zx != %zx-%zx %x\n", + wreq->debug_id, subreq->debug_index, + iov_iter_count(&subreq->io_iter), subreq->len, + subreq->transferred, subreq->io_iter.iter_type); + + if (subreq->transferred < subreq->len) + goto incomplete; + + __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags); +out: + trace_netfs_sreq(subreq, netfs_sreq_trace_terminated); + + /* If we decrement nr_outstanding to 0, the ref belongs to us. */ + u = atomic_dec_return(&wreq->nr_outstanding); + if (u == 0) + netfs_write_terminated(wreq, was_async); + else if (u == 1) + wake_up_var(&wreq->nr_outstanding); + + netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_terminated); + return; + +incomplete: + if (transferred_or_error == 0) { + if (__test_and_set_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags)) { + subreq->error = -ENODATA; + goto failed; + } + } else { + __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags); + } + + __set_bit(NETFS_SREQ_SHORT_IO, &subreq->flags); + set_bit(NETFS_RREQ_INCOMPLETE_IO, &wreq->flags); + goto out; + +failed: + switch (subreq->source) { + case NETFS_WRITE_TO_CACHE: + netfs_stat(&netfs_n_wh_write_failed); + set_bit(NETFS_RREQ_INCOMPLETE_IO, &wreq->flags); + break; + case NETFS_UPLOAD_TO_SERVER: + netfs_stat(&netfs_n_wh_upload_failed); + set_bit(NETFS_RREQ_FAILED, &wreq->flags); + wreq->error = subreq->error; + break; + default: + break; + } + goto out; +} +EXPORT_SYMBOL(netfs_write_subrequest_terminated); + +static void netfs_write_to_cache_op(struct netfs_io_subrequest *subreq) +{ + struct netfs_io_request *wreq = subreq->rreq; + struct netfs_cache_resources *cres = &wreq->cache_resources; + + trace_netfs_sreq(subreq, netfs_sreq_trace_submit); + + cres->ops->write(cres, subreq->start, &subreq->io_iter, + netfs_write_subrequest_terminated, subreq); +} + +static void netfs_write_to_cache_op_worker(struct work_struct *work) +{ + struct netfs_io_subrequest *subreq = + container_of(work, struct netfs_io_subrequest, work); + + netfs_write_to_cache_op(subreq); +} + +/** + * netfs_queue_write_request - Queue a write request for attention + * @subreq: The write request to be queued + * + * Queue the specified write request for processing by a worker thread. We + * pass the caller's ref on the request to the worker thread. + */ +void netfs_queue_write_request(struct netfs_io_subrequest *subreq) +{ + if (!queue_work(system_unbound_wq, &subreq->work)) + netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_wip); +} +EXPORT_SYMBOL(netfs_queue_write_request); + +/* + * Set up a op for writing to the cache. + */ +static void netfs_set_up_write_to_cache(struct netfs_io_request *wreq) +{ + struct netfs_cache_resources *cres = &wreq->cache_resources; + struct netfs_io_subrequest *subreq; + struct netfs_inode *ctx = netfs_inode(wreq->inode); + struct fscache_cookie *cookie = netfs_i_cookie(ctx); + loff_t start = wreq->start; + size_t len = wreq->len; + int ret; + + if (!fscache_cookie_enabled(cookie)) { + clear_bit(NETFS_RREQ_WRITE_TO_CACHE, &wreq->flags); + return; + } + + _debug("write to cache"); + ret = fscache_begin_write_operation(cres, cookie); + if (ret < 0) + return; + + ret = cres->ops->prepare_write(cres, &start, &len, wreq->upper_len, + i_size_read(wreq->inode), true); + if (ret < 0) + return; + + subreq = netfs_create_write_request(wreq, NETFS_WRITE_TO_CACHE, start, len, + netfs_write_to_cache_op_worker); + if (!subreq) + return; + + netfs_write_to_cache_op(subreq); +} + +/* + * Begin the process of writing out a chunk of data. + * + * We are given a write request that holds a series of dirty regions and + * (partially) covers a sequence of folios, all of which are present. The + * pages must have been marked as writeback as appropriate. + * + * We need to perform the following steps: + * + * (1) If encrypting, create an output buffer and encrypt each block of the + * data into it, otherwise the output buffer will point to the original + * folios. + * + * (2) If the data is to be cached, set up a write op for the entire output + * buffer to the cache, if the cache wants to accept it. + * + * (3) If the data is to be uploaded (ie. not merely cached): + * + * (a) If the data is to be compressed, create a compression buffer and + * compress the data into it. + * + * (b) For each destination we want to upload to, set up write ops to write + * to that destination. We may need multiple writes if the data is not + * contiguous or the span exceeds wsize for a server. + */ +int netfs_begin_write(struct netfs_io_request *wreq, bool may_wait, + enum netfs_write_trace what) +{ + struct netfs_inode *ctx = netfs_inode(wreq->inode); + + _enter("R=%x %llx-%llx f=%lx", + wreq->debug_id, wreq->start, wreq->start + wreq->len - 1, + wreq->flags); + + trace_netfs_write(wreq, what); + if (wreq->len == 0 || wreq->iter.count == 0) { + pr_err("Zero-sized write [R=%x]\n", wreq->debug_id); + return -EIO; + } + + if (wreq->origin == NETFS_DIO_WRITE) + inode_dio_begin(wreq->inode); + + wreq->io_iter = wreq->iter; + + /* ->outstanding > 0 carries a ref */ + netfs_get_request(wreq, netfs_rreq_trace_get_for_outstanding); + atomic_set(&wreq->nr_outstanding, 1); + + /* Start the encryption/compression going. We can do that in the + * background whilst we generate a list of write ops that we want to + * perform. + */ + // TODO: Encrypt or compress the region as appropriate + + /* We need to write all of the region to the cache */ + if (test_bit(NETFS_RREQ_WRITE_TO_CACHE, &wreq->flags)) + netfs_set_up_write_to_cache(wreq); + + /* However, we don't necessarily write all of the region to the server. + * Caching of reads is being managed this way also. + */ + if (test_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags)) + ctx->ops->create_write_requests(wreq, wreq->start, wreq->len); + + if (atomic_dec_and_test(&wreq->nr_outstanding)) + netfs_write_terminated(wreq, false); + + if (!may_wait) + return -EIOCBQUEUED; + + wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS, + TASK_UNINTERRUPTIBLE); + return wreq->error; +} + +/* + * Begin a write operation for writing through the pagecache. + */ +struct netfs_io_request *netfs_begin_writethrough(struct kiocb *iocb, size_t len) +{ + struct netfs_io_request *wreq; + struct file *file = iocb->ki_filp; + + wreq = netfs_alloc_request(file->f_mapping, file, iocb->ki_pos, len, + NETFS_WRITETHROUGH); + if (IS_ERR(wreq)) + return wreq; + + trace_netfs_write(wreq, netfs_write_trace_writethrough); + + __set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags); + iov_iter_xarray(&wreq->iter, ITER_SOURCE, &wreq->mapping->i_pages, wreq->start, 0); + wreq->io_iter = wreq->iter; + + /* ->outstanding > 0 carries a ref */ + netfs_get_request(wreq, netfs_rreq_trace_get_for_outstanding); + atomic_set(&wreq->nr_outstanding, 1); + return wreq; +} + +static void netfs_submit_writethrough(struct netfs_io_request *wreq, bool final) +{ + struct netfs_inode *ictx = netfs_inode(wreq->inode); + unsigned long long start; + size_t len; + + if (!test_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags)) + return; + + start = wreq->start + wreq->submitted; + len = wreq->iter.count - wreq->submitted; + if (!final) { + len /= wreq->wsize; /* Round to number of maximum packets */ + len *= wreq->wsize; + } + + ictx->ops->create_write_requests(wreq, start, len); + wreq->submitted += len; +} + +/* + * Advance the state of the write operation used when writing through the + * pagecache. Data has been copied into the pagecache that we need to append + * to the request. If we've added more than wsize then we need to create a new + * subrequest. + */ +int netfs_advance_writethrough(struct netfs_io_request *wreq, size_t copied, bool to_page_end) +{ + _enter("ic=%zu sb=%zu ws=%u cp=%zu tp=%u", + wreq->iter.count, wreq->submitted, wreq->wsize, copied, to_page_end); + + wreq->iter.count += copied; + wreq->io_iter.count += copied; + if (to_page_end && wreq->io_iter.count - wreq->submitted >= wreq->wsize) + netfs_submit_writethrough(wreq, false); + + return wreq->error; +} + +/* + * End a write operation used when writing through the pagecache. + */ +int netfs_end_writethrough(struct netfs_io_request *wreq, struct kiocb *iocb) +{ + int ret = -EIOCBQUEUED; + + _enter("ic=%zu sb=%zu ws=%u", + wreq->iter.count, wreq->submitted, wreq->wsize); + + if (wreq->submitted < wreq->io_iter.count) + netfs_submit_writethrough(wreq, true); + + if (atomic_dec_and_test(&wreq->nr_outstanding)) + netfs_write_terminated(wreq, false); + + if (is_sync_kiocb(iocb)) { + wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS, + TASK_UNINTERRUPTIBLE); + ret = wreq->error; + } + + netfs_put_request(wreq, false, netfs_rreq_trace_put_return); + return ret; +} diff --git a/fs/netfs/stats.c b/fs/netfs/stats.c index 5510a7a14a40..deeba9f9dcf5 100644 --- a/fs/netfs/stats.c +++ b/fs/netfs/stats.c @@ -9,6 +9,8 @@ #include <linux/seq_file.h> #include "internal.h" +atomic_t netfs_n_rh_dio_read; +atomic_t netfs_n_rh_dio_write; atomic_t netfs_n_rh_readahead; atomic_t netfs_n_rh_readpage; atomic_t netfs_n_rh_rreq; @@ -27,32 +29,48 @@ atomic_t netfs_n_rh_write_begin; atomic_t netfs_n_rh_write_done; atomic_t netfs_n_rh_write_failed; atomic_t netfs_n_rh_write_zskip; +atomic_t netfs_n_wh_wstream_conflict; +atomic_t netfs_n_wh_upload; +atomic_t netfs_n_wh_upload_done; +atomic_t netfs_n_wh_upload_failed; +atomic_t netfs_n_wh_write; +atomic_t netfs_n_wh_write_done; +atomic_t netfs_n_wh_write_failed; -void netfs_stats_show(struct seq_file *m) +int netfs_stats_show(struct seq_file *m, void *v) { - seq_printf(m, "RdHelp : RA=%u RP=%u WB=%u WBZ=%u rr=%u sr=%u\n", + seq_printf(m, "Netfs : DR=%u DW=%u RA=%u RP=%u WB=%u WBZ=%u\n", + atomic_read(&netfs_n_rh_dio_read), + atomic_read(&netfs_n_rh_dio_write), atomic_read(&netfs_n_rh_readahead), atomic_read(&netfs_n_rh_readpage), atomic_read(&netfs_n_rh_write_begin), - atomic_read(&netfs_n_rh_write_zskip), - atomic_read(&netfs_n_rh_rreq), - atomic_read(&netfs_n_rh_sreq)); - seq_printf(m, "RdHelp : ZR=%u sh=%u sk=%u\n", + atomic_read(&netfs_n_rh_write_zskip)); + seq_printf(m, "Netfs : ZR=%u sh=%u sk=%u\n", atomic_read(&netfs_n_rh_zero), atomic_read(&netfs_n_rh_short_read), atomic_read(&netfs_n_rh_write_zskip)); - seq_printf(m, "RdHelp : DL=%u ds=%u df=%u di=%u\n", + seq_printf(m, "Netfs : DL=%u ds=%u df=%u di=%u\n", atomic_read(&netfs_n_rh_download), atomic_read(&netfs_n_rh_download_done), atomic_read(&netfs_n_rh_download_failed), atomic_read(&netfs_n_rh_download_instead)); - seq_printf(m, "RdHelp : RD=%u rs=%u rf=%u\n", + seq_printf(m, "Netfs : RD=%u rs=%u rf=%u\n", atomic_read(&netfs_n_rh_read), atomic_read(&netfs_n_rh_read_done), atomic_read(&netfs_n_rh_read_failed)); - seq_printf(m, "RdHelp : WR=%u ws=%u wf=%u\n", - atomic_read(&netfs_n_rh_write), - atomic_read(&netfs_n_rh_write_done), - atomic_read(&netfs_n_rh_write_failed)); + seq_printf(m, "Netfs : UL=%u us=%u uf=%u\n", + atomic_read(&netfs_n_wh_upload), + atomic_read(&netfs_n_wh_upload_done), + atomic_read(&netfs_n_wh_upload_failed)); + seq_printf(m, "Netfs : WR=%u ws=%u wf=%u\n", + atomic_read(&netfs_n_wh_write), + atomic_read(&netfs_n_wh_write_done), + atomic_read(&netfs_n_wh_write_failed)); + seq_printf(m, "Netfs : rr=%u sr=%u wsc=%u\n", + atomic_read(&netfs_n_rh_rreq), + atomic_read(&netfs_n_rh_sreq), + atomic_read(&netfs_n_wh_wstream_conflict)); + return fscache_stats_show(m); } EXPORT_SYMBOL(netfs_stats_show); diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index 01ac733a6320..f7e32d76e34d 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -169,8 +169,8 @@ config ROOT_NFS config NFS_FSCACHE bool "Provide NFS client caching support" - depends on NFS_FS=m && FSCACHE || NFS_FS=y && FSCACHE=y - select NETFS_SUPPORT + depends on NFS_FS=m && NETFS_SUPPORT || NFS_FS=y && NETFS_SUPPORT=y + select FSCACHE help Say Y here if you want NFS data to be cached locally on disc through the general filesystem cache manager diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 943aeea1eb16..6be13e0ec170 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -580,6 +580,8 @@ retry: nfs4_delete_deviceid(node->ld, node->nfs_client, id); goto retry; } + + nfs4_put_deviceid_node(node); return ERR_PTR(-ENODEV); } @@ -893,10 +895,9 @@ bl_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) } if (pgio->pg_dreq == NULL) - wb_size = pnfs_num_cont_bytes(pgio->pg_inode, - req->wb_index); + wb_size = pnfs_num_cont_bytes(pgio->pg_inode, req->wb_index); else - wb_size = nfs_dreq_bytes_left(pgio->pg_dreq); + wb_size = nfs_dreq_bytes_left(pgio->pg_dreq, req_offset(req)); pnfs_generic_pg_init_write(pgio, req, wb_size); diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c index f318a05a80e1..c97ebc42ec0f 100644 --- a/fs/nfs/blocklayout/dev.c +++ b/fs/nfs/blocklayout/dev.c @@ -351,6 +351,9 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d, d->map = bl_map_simple; d->pr_key = v->scsi.pr_key; + if (d->len == 0) + return -ENODEV; + pr_info("pNFS: using block device %s (reservation key 0x%llx)\n", d->bdev_handle->bdev->bd_disk->disk_name, d->pr_key); diff --git a/fs/nfs/blocklayout/rpc_pipefs.c b/fs/nfs/blocklayout/rpc_pipefs.c index 6c977288cc28..d8d50a88de04 100644 --- a/fs/nfs/blocklayout/rpc_pipefs.c +++ b/fs/nfs/blocklayout/rpc_pipefs.c @@ -75,7 +75,7 @@ bl_resolve_deviceid(struct nfs_server *server, struct pnfs_block_volume *b, msg->len = sizeof(*bl_msg) + b->simple.len; msg->data = kzalloc(msg->len, gfp_mask); if (!msg->data) - goto out_free_data; + goto out_unlock; bl_msg = msg->data; bl_msg->type = BL_DEVICE_MOUNT; diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h index 0279b78b5fc9..650758ee0d5f 100644 --- a/fs/nfs/callback.h +++ b/fs/nfs/callback.h @@ -21,11 +21,12 @@ enum nfs4_callback_procnum { struct nfs4_slot; struct cb_process_state { - __be32 drc_status; struct nfs_client *clp; struct nfs4_slot *slot; - u32 minorversion; struct net *net; + u32 minorversion; + __be32 drc_status; + unsigned int referring_calls; }; struct cb_compound_hdr_arg { diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 96a4923080ae..76cea34477ae 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -207,7 +207,8 @@ static struct inode *nfs_layout_find_inode(struct nfs_client *clp, * Enforce RFC5661 section 12.5.5.2.1. (Layout Recall and Return Sequencing) */ static u32 pnfs_check_callback_stateid(struct pnfs_layout_hdr *lo, - const nfs4_stateid *new) + const nfs4_stateid *new, + struct cb_process_state *cps) { u32 oldseq, newseq; @@ -221,28 +222,29 @@ static u32 pnfs_check_callback_stateid(struct pnfs_layout_hdr *lo, newseq = be32_to_cpu(new->seqid); /* Are we already in a layout recall situation? */ - if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags) && - lo->plh_return_seq != 0) { - if (newseq < lo->plh_return_seq) - return NFS4ERR_OLD_STATEID; - if (newseq > lo->plh_return_seq) - return NFS4ERR_DELAY; - goto out; - } + if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) + return NFS4ERR_DELAY; - /* Check that the stateid matches what we think it should be. */ + /* + * Check that the stateid matches what we think it should be. + * Note that if the server sent us a list of referring calls, + * and we know that those have completed, then we trust the + * stateid argument is correct. + */ oldseq = be32_to_cpu(lo->plh_stateid.seqid); - if (newseq > oldseq + 1) + if (newseq > oldseq + 1 && !cps->referring_calls) return NFS4ERR_DELAY; + /* Crazy server! */ if (newseq <= oldseq) return NFS4ERR_OLD_STATEID; -out: + return NFS_OK; } static u32 initiate_file_draining(struct nfs_client *clp, - struct cb_layoutrecallargs *args) + struct cb_layoutrecallargs *args, + struct cb_process_state *cps) { struct inode *ino; struct pnfs_layout_hdr *lo; @@ -266,7 +268,7 @@ static u32 initiate_file_draining(struct nfs_client *clp, goto out; } pnfs_get_layout_hdr(lo); - rv = pnfs_check_callback_stateid(lo, &args->cbl_stateid); + rv = pnfs_check_callback_stateid(lo, &args->cbl_stateid, cps); if (rv != NFS_OK) goto unlock; @@ -326,10 +328,11 @@ static u32 initiate_bulk_draining(struct nfs_client *clp, } static u32 do_callback_layoutrecall(struct nfs_client *clp, - struct cb_layoutrecallargs *args) + struct cb_layoutrecallargs *args, + struct cb_process_state *cps) { if (args->cbl_recall_type == RETURN_FILE) - return initiate_file_draining(clp, args); + return initiate_file_draining(clp, args, cps); return initiate_bulk_draining(clp, args); } @@ -340,11 +343,12 @@ __be32 nfs4_callback_layoutrecall(void *argp, void *resp, u32 res = NFS4ERR_OP_NOT_IN_SESSION; if (cps->clp) - res = do_callback_layoutrecall(cps->clp, args); + res = do_callback_layoutrecall(cps->clp, args, cps); return cpu_to_be32(res); } -static void pnfs_recall_all_layouts(struct nfs_client *clp) +static void pnfs_recall_all_layouts(struct nfs_client *clp, + struct cb_process_state *cps) { struct cb_layoutrecallargs args; @@ -352,7 +356,7 @@ static void pnfs_recall_all_layouts(struct nfs_client *clp) memset(&args, 0, sizeof(args)); args.cbl_recall_type = RETURN_ALL; /* FIXME we ignore errors, what should we do? */ - do_callback_layoutrecall(clp, &args); + do_callback_layoutrecall(clp, &args, cps); } __be32 nfs4_callback_devicenotify(void *argp, void *resp, @@ -450,6 +454,7 @@ static int referring_call_exists(struct nfs_client *clp, __acquires(lock) { int status = 0; + int found = 0; int i, j; struct nfs4_session *session; struct nfs4_slot_table *tbl; @@ -478,11 +483,12 @@ static int referring_call_exists(struct nfs_client *clp, spin_lock(lock); if (status) goto out; + found++; } } out: - return status; + return status < 0 ? status : found; } __be32 nfs4_callback_sequence(void *argp, void *resp, @@ -493,6 +499,7 @@ __be32 nfs4_callback_sequence(void *argp, void *resp, struct nfs4_slot_table *tbl; struct nfs4_slot *slot; struct nfs_client *clp; + int ret; int i; __be32 status = htonl(NFS4ERR_BADSESSION); @@ -552,11 +559,13 @@ __be32 nfs4_callback_sequence(void *argp, void *resp, * related callback was received before the response to the original * call. */ - if (referring_call_exists(clp, args->csa_nrclists, args->csa_rclists, - &tbl->slot_tbl_lock) < 0) { + ret = referring_call_exists(clp, args->csa_nrclists, args->csa_rclists, + &tbl->slot_tbl_lock); + if (ret < 0) { status = htonl(NFS4ERR_DELAY); goto out_unlock; } + cps->referring_calls = ret; /* * RFC5661 20.9.3 @@ -617,7 +626,7 @@ __be32 nfs4_callback_recallany(void *argp, void *resp, nfs_expire_unused_delegation_types(cps->clp, flags); if (args->craa_type_mask & BIT(RCA4_TYPE_MASK_FILE_LAYOUT)) - pnfs_recall_all_layouts(cps->clp); + pnfs_recall_all_layouts(cps->clp, cps); if (args->craa_type_mask & BIT(PNFS_FF_RCA4_TYPE_MASK_READ)) { set_bit(NFS4CLNT_RECALL_ANY_LAYOUT_READ, &cps->clp->cl_state); diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 321af81c456e..9369488f2ed4 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -967,6 +967,11 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp) nops--; } + if (svc_is_backchannel(rqstp) && cps.clp) { + rqstp->bc_to_initval = cps.clp->cl_rpcclient->cl_timeout->to_initval; + rqstp->bc_to_retries = cps.clp->cl_rpcclient->cl_timeout->to_retries; + } + *hdr_res.status = status; *hdr_res.nops = htonl(nops); nfs4_cb_free_slot(&cps); diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 13dffe4201e6..c8ecbe999059 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2194,6 +2194,8 @@ nfs4_do_lookup_revalidate(struct inode *dir, struct dentry *dentry, { struct inode *inode; + trace_nfs_lookup_revalidate_enter(dir, dentry, flags); + if (!(flags & LOOKUP_OPEN) || (flags & LOOKUP_DIRECTORY)) goto full_reval; if (d_mountpoint(dentry)) @@ -2963,7 +2965,7 @@ static u64 nfs_access_login_time(const struct task_struct *task, rcu_read_lock(); for (;;) { parent = rcu_dereference(task->real_parent); - pcred = rcu_dereference(parent->cred); + pcred = __task_cred(parent); if (parent == task || cred_fscmp(pcred, cred) != 0) break; task = parent; diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index f6c74f424691..c03926a1cc73 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -205,9 +205,10 @@ static void nfs_direct_req_release(struct nfs_direct_req *dreq) kref_put(&dreq->kref, nfs_direct_req_free); } -ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq) +ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq, loff_t offset) { - return dreq->bytes_left; + loff_t start = offset - dreq->io_start; + return dreq->max_count - start; } EXPORT_SYMBOL_GPL(nfs_dreq_bytes_left); @@ -368,7 +369,6 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, bytes -= req_len; requested_bytes += req_len; pos += req_len; - dreq->bytes_left -= req_len; } nfs_direct_release_pages(pagevec, npages); kvfree(pagevec); @@ -440,7 +440,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter, goto out; dreq->inode = inode; - dreq->bytes_left = dreq->max_count = count; + dreq->max_count = count; dreq->io_start = iocb->ki_pos; dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); l_ctx = nfs_get_lock_context(dreq->ctx); @@ -873,7 +873,6 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, bytes -= req_len; requested_bytes += req_len; pos += req_len; - dreq->bytes_left -= req_len; if (defer) { nfs_mark_request_commit(req, NULL, &cinfo, 0); @@ -980,7 +979,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter, goto out; dreq->inode = inode; - dreq->bytes_left = dreq->max_count = count; + dreq->max_count = count; dreq->io_start = pos; dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); l_ctx = nfs_get_lock_context(dreq->ctx); diff --git a/fs/nfs/file.c b/fs/nfs/file.c index e8cccb94b927..8577ccf621f5 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -558,7 +558,6 @@ const struct address_space_operations nfs_file_aops = { .read_folio = nfs_read_folio, .readahead = nfs_readahead, .dirty_folio = filemap_dirty_folio, - .writepage = nfs_writepage, .writepages = nfs_writepages, .write_begin = nfs_write_begin, .write_end = nfs_write_end, diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index b05717fe0d4e..2d1bfee225c3 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -274,12 +274,6 @@ static void nfs_netfs_free_request(struct netfs_io_request *rreq) put_nfs_open_context(rreq->netfs_priv); } -static inline int nfs_netfs_begin_cache_operation(struct netfs_io_request *rreq) -{ - return fscache_begin_read_operation(&rreq->cache_resources, - netfs_i_cookie(netfs_inode(rreq->inode))); -} - static struct nfs_netfs_io_data *nfs_netfs_alloc(struct netfs_io_subrequest *sreq) { struct nfs_netfs_io_data *netfs; @@ -387,7 +381,6 @@ void nfs_netfs_read_completion(struct nfs_pgio_header *hdr) const struct netfs_request_ops nfs_netfs_ops = { .init_request = nfs_netfs_init_request, .free_request = nfs_netfs_free_request, - .begin_cache_operation = nfs_netfs_begin_cache_operation, .issue_read = nfs_netfs_issue_read, .clamp_length = nfs_netfs_clamp_length }; diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h index 5407ab8c8783..e3cb4923316b 100644 --- a/fs/nfs/fscache.h +++ b/fs/nfs/fscache.h @@ -80,7 +80,7 @@ static inline void nfs_netfs_put(struct nfs_netfs_io_data *netfs) } static inline void nfs_netfs_inode_init(struct nfs_inode *nfsi) { - netfs_inode_init(&nfsi->netfs, &nfs_netfs_ops); + netfs_inode_init(&nfsi->netfs, &nfs_netfs_ops, false); } extern void nfs_netfs_initiate_read(struct nfs_pgio_header *hdr); extern void nfs_netfs_read_completion(struct nfs_pgio_header *hdr); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 9c9cf764f600..e3722ce6722e 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -655,7 +655,7 @@ extern int nfs_sillyrename(struct inode *dir, struct dentry *dentry); /* direct.c */ void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo, struct nfs_direct_req *dreq); -extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq); +extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq, loff_t offset); /* nfs4proc.c */ extern struct nfs_client *nfs4_init_client(struct nfs_client *clp, @@ -936,7 +936,6 @@ struct nfs_direct_req { loff_t io_start; /* Start offset for I/O */ ssize_t count, /* bytes actually processed */ max_count, /* max expected count */ - bytes_left, /* bytes left to be sent */ error; /* any reported error */ struct completion completion; /* wait for i/o completion */ diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 8a943fffaad5..23819a756508 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -170,6 +170,7 @@ static int nfs4_map_errors(int err) case -NFS4ERR_RESOURCE: case -NFS4ERR_LAYOUTTRYLATER: case -NFS4ERR_RECALLCONFLICT: + case -NFS4ERR_RETURNCONFLICT: return -EREMOTEIO; case -NFS4ERR_WRONGSEC: case -NFS4ERR_WRONG_CRED: @@ -558,6 +559,7 @@ static int nfs4_do_handle_exception(struct nfs_server *server, case -NFS4ERR_GRACE: case -NFS4ERR_LAYOUTTRYLATER: case -NFS4ERR_RECALLCONFLICT: + case -NFS4ERR_RETURNCONFLICT: exception->delay = 1; return 0; @@ -9691,6 +9693,7 @@ nfs4_layoutget_handle_exception(struct rpc_task *task, status = -EBUSY; break; case -NFS4ERR_RECALLCONFLICT: + case -NFS4ERR_RETURNCONFLICT: status = -ERECALLCONFLICT; break; case -NFS4ERR_DELEG_REVOKED: diff --git a/fs/nfs/nfs4sysctl.c b/fs/nfs/nfs4sysctl.c index e776200e9a11..886a7c4c60b3 100644 --- a/fs/nfs/nfs4sysctl.c +++ b/fs/nfs/nfs4sysctl.c @@ -34,7 +34,6 @@ static struct ctl_table nfs4_cb_sysctls[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { } }; int nfs4_register_sysctl(void) diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index deec76cf5afe..69406e60f391 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1602,7 +1602,8 @@ static void encode_read(struct xdr_stream *xdr, const struct nfs_pgio_args *args static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg *readdir, struct rpc_rqst *req, struct compound_hdr *hdr) { uint32_t attrs[3] = { - FATTR4_WORD0_RDATTR_ERROR, + FATTR4_WORD0_TYPE + | FATTR4_WORD0_RDATTR_ERROR, FATTR4_WORD1_MOUNTED_ON_FILEID, }; uint32_t dircount = readdir->count; @@ -1612,12 +1613,20 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg unsigned int i; if (readdir->plus) { - attrs[0] |= FATTR4_WORD0_TYPE|FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE| - FATTR4_WORD0_FSID|FATTR4_WORD0_FILEHANDLE|FATTR4_WORD0_FILEID; - attrs[1] |= FATTR4_WORD1_MODE|FATTR4_WORD1_NUMLINKS|FATTR4_WORD1_OWNER| - FATTR4_WORD1_OWNER_GROUP|FATTR4_WORD1_RAWDEV| - FATTR4_WORD1_SPACE_USED|FATTR4_WORD1_TIME_ACCESS| - FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY; + attrs[0] |= FATTR4_WORD0_CHANGE + | FATTR4_WORD0_SIZE + | FATTR4_WORD0_FSID + | FATTR4_WORD0_FILEHANDLE + | FATTR4_WORD0_FILEID; + attrs[1] |= FATTR4_WORD1_MODE + | FATTR4_WORD1_NUMLINKS + | FATTR4_WORD1_OWNER + | FATTR4_WORD1_OWNER_GROUP + | FATTR4_WORD1_RAWDEV + | FATTR4_WORD1_SPACE_USED + | FATTR4_WORD1_TIME_ACCESS + | FATTR4_WORD1_TIME_METADATA + | FATTR4_WORD1_TIME_MODIFY; attrs[2] |= FATTR4_WORD2_SECURITY_LABEL; } /* Use mounted_on_fileid only if the server supports it */ diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 4e90ca531176..afedb449b54f 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -400,6 +400,7 @@ DECLARE_EVENT_CLASS(nfs_lookup_event, __field(unsigned long, flags) __field(dev_t, dev) __field(u64, dir) + __field(u64, fileid) __string(name, dentry->d_name.name) ), @@ -407,16 +408,18 @@ DECLARE_EVENT_CLASS(nfs_lookup_event, __entry->dev = dir->i_sb->s_dev; __entry->dir = NFS_FILEID(dir); __entry->flags = flags; + __entry->fileid = d_is_negative(dentry) ? 0 : NFS_FILEID(d_inode(dentry)); __assign_str(name, dentry->d_name.name); ), TP_printk( - "flags=0x%lx (%s) name=%02x:%02x:%llu/%s", + "flags=0x%lx (%s) name=%02x:%02x:%llu/%s fileid=%llu", __entry->flags, show_fs_lookup_flags(__entry->flags), MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->dir, - __get_str(name) + __get_str(name), + __entry->fileid ) ); @@ -444,6 +447,7 @@ DECLARE_EVENT_CLASS(nfs_lookup_event_done, __field(unsigned long, flags) __field(dev_t, dev) __field(u64, dir) + __field(u64, fileid) __string(name, dentry->d_name.name) ), @@ -452,17 +456,19 @@ DECLARE_EVENT_CLASS(nfs_lookup_event_done, __entry->dir = NFS_FILEID(dir); __entry->error = error < 0 ? -error : 0; __entry->flags = flags; + __entry->fileid = d_is_negative(dentry) ? 0 : NFS_FILEID(d_inode(dentry)); __assign_str(name, dentry->d_name.name); ), TP_printk( - "error=%ld (%s) flags=0x%lx (%s) name=%02x:%02x:%llu/%s", + "error=%ld (%s) flags=0x%lx (%s) name=%02x:%02x:%llu/%s fileid=%llu", -__entry->error, show_nfs_status(__entry->error), __entry->flags, show_fs_lookup_flags(__entry->flags), MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->dir, - __get_str(name) + __get_str(name), + __entry->fileid ) ); @@ -893,7 +899,7 @@ DECLARE_EVENT_CLASS(nfs_rename_event_done, DEFINE_NFS_RENAME_EVENT(nfs_rename_enter); DEFINE_NFS_RENAME_EVENT_DONE(nfs_rename_exit); -DEFINE_NFS_RENAME_EVENT_DONE(nfs_sillyrename_rename); +DEFINE_NFS_RENAME_EVENT_DONE(nfs_async_rename_done); TRACE_EVENT(nfs_sillyrename_unlink, TP_PROTO( @@ -1539,7 +1545,6 @@ DECLARE_EVENT_CLASS(nfs_direct_req_class, __field(u32, fhandle) __field(loff_t, offset) __field(ssize_t, count) - __field(ssize_t, bytes_left) __field(ssize_t, error) __field(int, flags) ), @@ -1554,19 +1559,18 @@ DECLARE_EVENT_CLASS(nfs_direct_req_class, __entry->fhandle = nfs_fhandle_hash(fh); __entry->offset = dreq->io_start; __entry->count = dreq->count; - __entry->bytes_left = dreq->bytes_left; __entry->error = dreq->error; __entry->flags = dreq->flags; ), TP_printk( "error=%zd fileid=%02x:%02x:%llu fhandle=0x%08x " - "offset=%lld count=%zd bytes_left=%zd flags=%s", + "offset=%lld count=%zd flags=%s", __entry->error, MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle, __entry->offset, - __entry->count, __entry->bytes_left, + __entry->count, nfs_show_direct_req_flags(__entry->flags) ) ); diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 21a365357629..0c0fed1ecd0b 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -2733,7 +2733,8 @@ pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *r if (pgio->pg_dreq == NULL) rd_size = i_size_read(pgio->pg_inode) - req_offset(req); else - rd_size = nfs_dreq_bytes_left(pgio->pg_dreq); + rd_size = nfs_dreq_bytes_left(pgio->pg_dreq, + req_offset(req)); pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, nfs_req_openctx(req), diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c index f39e2089bc4c..e645be1a3381 100644 --- a/fs/nfs/sysctl.c +++ b/fs/nfs/sysctl.c @@ -29,7 +29,6 @@ static struct ctl_table nfs_cb_sysctls[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { } }; int nfs_register_sysctl(void) diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index 150a953a8be9..0110299643a2 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -267,7 +267,7 @@ static void nfs_async_rename_done(struct rpc_task *task, void *calldata) struct inode *new_dir = data->new_dir; struct dentry *old_dentry = data->old_dentry; - trace_nfs_sillyrename_rename(old_dir, old_dentry, + trace_nfs_async_rename_done(old_dir, old_dentry, new_dir, data->new_dentry, task->tk_status); if (!NFS_PROTO(old_dir)->rename_done(task, old_dir, new_dir)) { rpc_restart_call_prepare(task); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 7248705faef4..bb79d3a886ae 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -680,17 +680,6 @@ static int nfs_writepage_locked(struct folio *folio, return err; } -int nfs_writepage(struct page *page, struct writeback_control *wbc) -{ - struct folio *folio = page_folio(page); - int ret; - - ret = nfs_writepage_locked(folio, wbc); - if (ret != AOP_WRITEPAGE_ACTIVATE) - unlock_page(page); - return ret; -} - static int nfs_writepages_callback(struct folio *folio, struct writeback_control *wbc, void *data) { diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 2fa54cfd4882..6dc6340e2852 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -7911,14 +7911,16 @@ check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner) { struct file_lock *fl; int status = false; - struct nfsd_file *nf = find_any_file(fp); + struct nfsd_file *nf; struct inode *inode; struct file_lock_context *flctx; + spin_lock(&fp->fi_lock); + nf = find_any_file_locked(fp); if (!nf) { /* Any valid lock stateid should have some sort of access */ WARN_ON_ONCE(1); - return status; + goto out; } inode = file_inode(nf->nf_file); @@ -7934,7 +7936,8 @@ check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner) } spin_unlock(&flctx->flc_lock); } - nfsd_file_put(nf); +out: + spin_unlock(&fp->fi_lock); return status; } @@ -7944,10 +7947,8 @@ check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner) * @cstate: NFSv4 COMPOUND state * @u: RELEASE_LOCKOWNER arguments * - * The lockowner's so_count is bumped when a lock record is added - * or when copying a conflicting lock. The latter case is brief, - * but can lead to fleeting false positives when looking for - * locks-in-use. + * Check if theree are any locks still held and if not - free the lockowner + * and any lock state that is owned. * * Return values: * %nfs_ok: lockowner released or not found @@ -7983,10 +7984,13 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, spin_unlock(&clp->cl_lock); return nfs_ok; } - if (atomic_read(&lo->lo_owner.so_count) != 2) { - spin_unlock(&clp->cl_lock); - nfs4_put_stateowner(&lo->lo_owner); - return nfserr_locks_held; + + list_for_each_entry(stp, &lo->lo_owner.so_stateids, st_perstateowner) { + if (check_for_locks(stp->st_stid.sc_file, lo)) { + spin_unlock(&clp->cl_lock); + nfs4_put_stateowner(&lo->lo_owner); + return nfserr_locks_held; + } } unhash_lockowner_locked(lo); while (!list_empty(&lo->lo_owner.so_stateids)) { diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 8e6dbe9e0b65..f206ca32e7f5 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -48,10 +48,6 @@ enum { NFSD_MaxBlkSize, NFSD_MaxConnections, NFSD_Filecache, - /* - * The below MUST come last. Otherwise we leave a hole in nfsd_files[] - * with !CONFIG_NFSD_V4 and simple_fill_super() goes oops - */ #ifdef CONFIG_NFSD_V4 NFSD_Leasetime, NFSD_Gracetime, @@ -1242,63 +1238,34 @@ static inline void _nfsd_symlink(struct dentry *parent, const char *name, #endif -static void clear_ncl(struct inode *inode) +static void clear_ncl(struct dentry *dentry) { + struct inode *inode = d_inode(dentry); struct nfsdfs_client *ncl = inode->i_private; + spin_lock(&inode->i_lock); inode->i_private = NULL; + spin_unlock(&inode->i_lock); kref_put(&ncl->cl_ref, ncl->cl_release); } -static struct nfsdfs_client *__get_nfsdfs_client(struct inode *inode) -{ - struct nfsdfs_client *nc = inode->i_private; - - if (nc) - kref_get(&nc->cl_ref); - return nc; -} - struct nfsdfs_client *get_nfsdfs_client(struct inode *inode) { struct nfsdfs_client *nc; - inode_lock_shared(inode); - nc = __get_nfsdfs_client(inode); - inode_unlock_shared(inode); + spin_lock(&inode->i_lock); + nc = inode->i_private; + if (nc) + kref_get(&nc->cl_ref); + spin_unlock(&inode->i_lock); return nc; } -/* from __rpc_unlink */ -static void nfsdfs_remove_file(struct inode *dir, struct dentry *dentry) -{ - int ret; - - clear_ncl(d_inode(dentry)); - dget(dentry); - ret = simple_unlink(dir, dentry); - d_drop(dentry); - fsnotify_unlink(dir, dentry); - dput(dentry); - WARN_ON_ONCE(ret); -} - -static void nfsdfs_remove_files(struct dentry *root) -{ - struct dentry *dentry, *tmp; - - list_for_each_entry_safe(dentry, tmp, &root->d_subdirs, d_child) { - if (!simple_positive(dentry)) { - WARN_ON_ONCE(1); /* I think this can't happen? */ - continue; - } - nfsdfs_remove_file(d_inode(root), dentry); - } -} /* XXX: cut'n'paste from simple_fill_super; figure out if we could share * code instead. */ static int nfsdfs_create_files(struct dentry *root, const struct tree_descr *files, + struct nfsdfs_client *ncl, struct dentry **fdentries) { struct inode *dir = d_inode(root); @@ -1317,8 +1284,9 @@ static int nfsdfs_create_files(struct dentry *root, dput(dentry); goto out; } + kref_get(&ncl->cl_ref); inode->i_fop = files->ops; - inode->i_private = __get_nfsdfs_client(dir); + inode->i_private = ncl; d_add(dentry, inode); fsnotify_create(dir, dentry); if (fdentries) @@ -1327,7 +1295,6 @@ static int nfsdfs_create_files(struct dentry *root, inode_unlock(dir); return 0; out: - nfsdfs_remove_files(root); inode_unlock(dir); return -ENOMEM; } @@ -1347,7 +1314,7 @@ struct dentry *nfsd_client_mkdir(struct nfsd_net *nn, dentry = nfsd_mkdir(nn->nfsd_client_dir, ncl, name); if (IS_ERR(dentry)) /* XXX: tossing errors? */ return NULL; - ret = nfsdfs_create_files(dentry, files, fdentries); + ret = nfsdfs_create_files(dentry, files, ncl, fdentries); if (ret) { nfsd_client_rmdir(dentry); return NULL; @@ -1358,20 +1325,7 @@ struct dentry *nfsd_client_mkdir(struct nfsd_net *nn, /* Taken from __rpc_rmdir: */ void nfsd_client_rmdir(struct dentry *dentry) { - struct inode *dir = d_inode(dentry->d_parent); - struct inode *inode = d_inode(dentry); - int ret; - - inode_lock(dir); - nfsdfs_remove_files(dentry); - clear_ncl(inode); - dget(dentry); - ret = simple_rmdir(dir, dentry); - WARN_ON_ONCE(ret); - d_drop(dentry); - fsnotify_rmdir(dir, dentry); - dput(dentry); - inode_unlock(dir); + simple_recursive_removal(dentry, clear_ncl); } static int nfsd_fill_super(struct super_block *sb, struct fs_context *fc) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 6e7e37192461..b7c7a9273ea0 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1831,6 +1831,10 @@ retry: } trap = lock_rename(tdentry, fdentry); + if (IS_ERR(trap)) { + err = (rqstp->rq_vers == 2) ? nfserr_acces : nfserr_xdev; + goto out; + } err = fh_fill_pre_attrs(ffhp); if (err != nfs_ok) goto out_unlock; diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index 959bd9fb3d81..c950139db6ef 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -441,7 +441,6 @@ out: static struct dentry *nilfs_get_parent(struct dentry *child) { unsigned long ino; - struct inode *inode; struct nilfs_root *root; ino = nilfs_inode_by_name(d_inode(child), &dotdot_name); @@ -450,11 +449,7 @@ static struct dentry *nilfs_get_parent(struct dentry *child) root = NILFS_I(d_inode(child))->i_root; - inode = nilfs_iget(child->d_sb, root, ino); - if (IS_ERR(inode)) - return ERR_CAST(inode); - - return d_obtain_alias(inode); + return d_obtain_alias(nilfs_iget(child->d_sb, root, ino)); } static struct dentry *nilfs_get_dentry(struct super_block *sb, u64 cno, diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index 1cb9ad7e884e..3464fa7e8538 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -29,7 +29,6 @@ static struct ctl_table dnotify_sysctls[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - {} }; static void __init dnotify_sysctl_init(void) { diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index f83e7cc5ccf2..fbdc63cc10d9 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -86,7 +86,6 @@ static struct ctl_table fanotify_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO }, - { } }; static void __init fanotify_sysctls_init(void) diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 7974e91ffe13..8bfd690e9f10 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -124,7 +124,7 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode) * d_flags to indicate parental interest (their parent is the * original inode) */ spin_lock(&alias->d_lock); - list_for_each_entry(child, &alias->d_subdirs, d_child) { + hlist_for_each_entry(child, &alias->d_children, d_sib) { if (!child->d_inode) continue; diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index a3809ae92170..85d8fdd55329 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -85,7 +85,6 @@ static struct ctl_table inotify_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO }, - { } }; static void __init inotify_sysctls_init(void) diff --git a/fs/nsfs.c b/fs/nsfs.c index 9a4b228d42fa..34e1e3e36733 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -90,12 +90,9 @@ slow: inode->i_fop = &ns_file_operations; inode->i_private = ns; - dentry = d_alloc_anon(mnt->mnt_sb); - if (!dentry) { - iput(inode); + dentry = d_make_root(inode); /* not the normal use, but... */ + if (!dentry) return -ENOMEM; - } - d_instantiate(dentry, inode); dentry->d_fsdata = (void *)ns->ops; d = atomic_long_cmpxchg(&ns->stashed, 0, (unsigned long)dentry); if (d) { diff --git a/fs/ntfs/sysctl.c b/fs/ntfs/sysctl.c index 174fe536a1c0..4e980170d86a 100644 --- a/fs/ntfs/sysctl.c +++ b/fs/ntfs/sysctl.c @@ -28,7 +28,6 @@ static struct ctl_table ntfs_sysctls[] = { .mode = 0644, /* Mode, proc handler. */ .proc_handler = proc_dointvec }, - {} }; /* Storage for the sysctls header. */ diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c index 04fc8344063a..a9b8688aaf30 100644 --- a/fs/ocfs2/dcache.c +++ b/fs/ocfs2/dcache.c @@ -124,17 +124,10 @@ static int ocfs2_match_dentry(struct dentry *dentry, if (!dentry->d_fsdata) return 0; - if (!dentry->d_parent) - return 0; - if (skip_unhashed && d_unhashed(dentry)) return 0; parent = d_inode(dentry->d_parent); - /* Negative parent dentry? */ - if (!parent) - return 0; - /* Name is in a different directory. */ if (OCFS2_I(parent)->ip_blkno != parent_blkno) return 0; diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index a14c8fee6ee5..d620d4c53c6f 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -1593,9 +1593,6 @@ int __ocfs2_add_entry(handle_t *handle, struct buffer_head *insert_bh = lookup->dl_leaf_bh; char *data_start = insert_bh->b_data; - if (!namelen) - return -EINVAL; - if (ocfs2_dir_indexed(dir)) { struct buffer_head *bh; @@ -4245,12 +4242,6 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb, trace_ocfs2_prepare_dir_for_insert( (unsigned long long)OCFS2_I(dir)->ip_blkno, namelen); - if (!namelen) { - ret = -EINVAL; - mlog_errno(ret); - goto out; - } - /* * Do this up front to reduce confusion. * diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 814733ba2f4b..9221a33f917b 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -1336,7 +1336,7 @@ static int ocfs2_rename(struct mnt_idmap *idmap, goto bail; } - if (S_ISDIR(old_inode->i_mode)) { + if (S_ISDIR(old_inode->i_mode) && new_dir != old_dir) { u64 old_inode_parent; update_dot_dot = 1; @@ -1353,8 +1353,7 @@ static int ocfs2_rename(struct mnt_idmap *idmap, goto bail; } - if (!new_inode && new_dir != old_dir && - new_dir->i_nlink >= ocfs2_link_max(osb)) { + if (!new_inode && new_dir->i_nlink >= ocfs2_link_max(osb)) { status = -EMLINK; goto bail; } @@ -1601,6 +1600,9 @@ static int ocfs2_rename(struct mnt_idmap *idmap, mlog_errno(status); goto bail; } + } + + if (S_ISDIR(old_inode->i_mode)) { drop_nlink(old_dir); if (new_inode) { drop_nlink(new_inode); diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index a8d5ca98fa57..20aa37b67cfb 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -658,7 +658,6 @@ static struct ctl_table ocfs2_nm_table[] = { .mode = 0644, .proc_handler = proc_dostring, }, - { } }; static struct ctl_table_header *ocfs2_table_header; diff --git a/fs/orangefs/dir.c b/fs/orangefs/dir.c index 9cacce5d55c1..6d1fbeca9d81 100644 --- a/fs/orangefs/dir.c +++ b/fs/orangefs/dir.c @@ -58,10 +58,10 @@ struct orangefs_dir { * first part of the part list. */ -static int do_readdir(struct orangefs_inode_s *oi, - struct orangefs_dir *od, struct dentry *dentry, +static int do_readdir(struct orangefs_dir *od, struct inode *inode, struct orangefs_kernel_op_s *op) { + struct orangefs_inode_s *oi = ORANGEFS_I(inode); struct orangefs_readdir_response_s *resp; int bufi, r; @@ -87,7 +87,7 @@ again: op->upcall.req.readdir.buf_index = bufi; r = service_operation(op, "orangefs_readdir", - get_interruptible_flag(dentry->d_inode)); + get_interruptible_flag(inode)); orangefs_readdir_index_put(bufi); @@ -158,8 +158,7 @@ static int parse_readdir(struct orangefs_dir *od, return 0; } -static int orangefs_dir_more(struct orangefs_inode_s *oi, - struct orangefs_dir *od, struct dentry *dentry) +static int orangefs_dir_more(struct orangefs_dir *od, struct inode *inode) { struct orangefs_kernel_op_s *op; int r; @@ -169,7 +168,7 @@ static int orangefs_dir_more(struct orangefs_inode_s *oi, od->error = -ENOMEM; return -ENOMEM; } - r = do_readdir(oi, od, dentry, op); + r = do_readdir(od, inode, op); if (r) { od->error = r; goto out; @@ -238,9 +237,7 @@ next: return 1; } -static int orangefs_dir_fill(struct orangefs_inode_s *oi, - struct orangefs_dir *od, struct dentry *dentry, - struct dir_context *ctx) +static int orangefs_dir_fill(struct orangefs_dir *od, struct dir_context *ctx) { struct orangefs_dir_part *part; size_t count; @@ -304,15 +301,10 @@ static loff_t orangefs_dir_llseek(struct file *file, loff_t offset, static int orangefs_dir_iterate(struct file *file, struct dir_context *ctx) { - struct orangefs_inode_s *oi; - struct orangefs_dir *od; - struct dentry *dentry; + struct orangefs_dir *od = file->private_data; + struct inode *inode = file_inode(file); int r; - dentry = file->f_path.dentry; - oi = ORANGEFS_I(dentry->d_inode); - od = file->private_data; - if (od->error) return od->error; @@ -342,7 +334,7 @@ static int orangefs_dir_iterate(struct file *file, */ while (od->token != ORANGEFS_ITERATE_END && ctx->pos > od->end) { - r = orangefs_dir_more(oi, od, dentry); + r = orangefs_dir_more(od, inode); if (r) return r; } @@ -351,17 +343,17 @@ static int orangefs_dir_iterate(struct file *file, /* Then try to fill if there's any left in the buffer. */ if (ctx->pos < od->end) { - r = orangefs_dir_fill(oi, od, dentry, ctx); + r = orangefs_dir_fill(od, ctx); if (r) return r; } /* Finally get some more and try to fill. */ if (od->token != ORANGEFS_ITERATE_END) { - r = orangefs_dir_more(oi, od, dentry); + r = orangefs_dir_more(od, inode); if (r) return r; - r = orangefs_dir_fill(oi, od, dentry, ctx); + r = orangefs_dir_fill(od, ctx); } return r; diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 696478f09cc1..b8e25ca51016 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -744,7 +744,7 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c) struct inode *inode; struct inode *udir = d_inode(c->destdir), *wdir = d_inode(c->workdir); struct path path = { .mnt = ovl_upper_mnt(ofs) }; - struct dentry *temp, *upper; + struct dentry *temp, *upper, *trap; struct ovl_cu_creds cc; int err; struct ovl_cattr cattr = { @@ -781,11 +781,13 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c) * temp wasn't moved before copy up completion or cleanup. */ ovl_start_write(c->dentry); - if (lock_rename(c->workdir, c->destdir) != NULL || - temp->d_parent != c->workdir) { + trap = lock_rename(c->workdir, c->destdir); + if (trap || temp->d_parent != c->workdir) { /* temp or workdir moved underneath us? abort without cleanup */ dput(temp); err = -EIO; + if (IS_ERR(trap)) + goto out; goto unlock; } else if (err) { goto cleanup; @@ -826,6 +828,7 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c) ovl_set_flag(OVL_WHITEOUTS, inode); unlock: unlock_rename(c->workdir, c->destdir); +out: ovl_end_write(c->dentry); return err; diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index aab3f5d93556..0f8b4a719237 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -1180,6 +1180,10 @@ static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir, } trap = lock_rename(new_upperdir, old_upperdir); + if (IS_ERR(trap)) { + err = PTR_ERR(trap); + goto out_revert_creds; + } olddentry = ovl_lookup_upper(ofs, old->d_name.name, old_upperdir, old->d_name.len); diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c index 6909c4a5da56..063409069f56 100644 --- a/fs/overlayfs/export.c +++ b/fs/overlayfs/export.c @@ -289,7 +289,6 @@ static struct dentry *ovl_obtain_alias(struct super_block *sb, { struct dentry *lower = lowerpath ? lowerpath->dentry : NULL; struct dentry *upper = upper_alias ?: index; - struct dentry *dentry; struct inode *inode = NULL; struct ovl_entry *oe; struct ovl_inode_params oip = { @@ -320,27 +319,7 @@ static struct dentry *ovl_obtain_alias(struct super_block *sb, if (upper) ovl_set_flag(OVL_UPPERDATA, inode); - dentry = d_find_any_alias(inode); - if (dentry) - goto out_iput; - - dentry = d_alloc_anon(inode->i_sb); - if (unlikely(!dentry)) - goto nomem; - - if (upper_alias) - ovl_dentry_set_upper_alias(dentry); - - ovl_dentry_init_reval(dentry, upper, OVL_I_E(inode)); - - return d_instantiate_anon(dentry, inode); - -nomem: - dput(dentry); - dentry = ERR_PTR(-ENOMEM); -out_iput: - iput(inode); - return dentry; + return d_obtain_alias(inode); } /* Get the upper or lower dentry in stack whose on layer @idx */ diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index 984ffdaeed6c..5764f91d283e 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -18,10 +18,11 @@ struct ovl_lookup_data { struct super_block *sb; - struct vfsmount *mnt; + const struct ovl_layer *layer; struct qstr name; bool is_dir; bool opaque; + bool xwhiteouts; bool stop; bool last; char *redirect; @@ -201,17 +202,13 @@ struct dentry *ovl_decode_real_fh(struct ovl_fs *ofs, struct ovl_fh *fh, return real; } -static bool ovl_is_opaquedir(struct ovl_fs *ofs, const struct path *path) -{ - return ovl_path_check_dir_xattr(ofs, path, OVL_XATTR_OPAQUE); -} - static struct dentry *ovl_lookup_positive_unlocked(struct ovl_lookup_data *d, const char *name, struct dentry *base, int len, bool drop_negative) { - struct dentry *ret = lookup_one_unlocked(mnt_idmap(d->mnt), name, base, len); + struct dentry *ret = lookup_one_unlocked(mnt_idmap(d->layer->mnt), name, + base, len); if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) { if (drop_negative && ret->d_lockref.count == 1) { @@ -232,10 +229,13 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d, size_t prelen, const char *post, struct dentry **ret, bool drop_negative) { + struct ovl_fs *ofs = OVL_FS(d->sb); struct dentry *this; struct path path; int err; bool last_element = !post[0]; + bool is_upper = d->layer->idx == 0; + char val; this = ovl_lookup_positive_unlocked(d, name, base, namelen, drop_negative); if (IS_ERR(this)) { @@ -253,8 +253,8 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d, } path.dentry = this; - path.mnt = d->mnt; - if (ovl_path_is_whiteout(OVL_FS(d->sb), &path)) { + path.mnt = d->layer->mnt; + if (ovl_path_is_whiteout(ofs, &path)) { d->stop = d->opaque = true; goto put_and_out; } @@ -272,7 +272,7 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d, d->stop = true; goto put_and_out; } - err = ovl_check_metacopy_xattr(OVL_FS(d->sb), &path, NULL); + err = ovl_check_metacopy_xattr(ofs, &path, NULL); if (err < 0) goto out_err; @@ -292,7 +292,12 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d, if (d->last) goto out; - if (ovl_is_opaquedir(OVL_FS(d->sb), &path)) { + /* overlay.opaque=x means xwhiteouts directory */ + val = ovl_get_opaquedir_val(ofs, &path); + if (last_element && !is_upper && val == 'x') { + d->xwhiteouts = true; + ovl_layer_set_xwhiteouts(ofs, d->layer); + } else if (val == 'y') { d->stop = true; if (last_element) d->opaque = true; @@ -863,7 +868,8 @@ fail: * Returns next layer in stack starting from top. * Returns -1 if this is the last layer. */ -int ovl_path_next(int idx, struct dentry *dentry, struct path *path) +int ovl_path_next(int idx, struct dentry *dentry, struct path *path, + const struct ovl_layer **layer) { struct ovl_entry *oe = OVL_E(dentry); struct ovl_path *lowerstack = ovl_lowerstack(oe); @@ -871,13 +877,16 @@ int ovl_path_next(int idx, struct dentry *dentry, struct path *path) BUG_ON(idx < 0); if (idx == 0) { ovl_path_upper(dentry, path); - if (path->dentry) + if (path->dentry) { + *layer = &OVL_FS(dentry->d_sb)->layers[0]; return ovl_numlower(oe) ? 1 : -1; + } idx++; } BUG_ON(idx > ovl_numlower(oe)); path->dentry = lowerstack[idx - 1].dentry; - path->mnt = lowerstack[idx - 1].layer->mnt; + *layer = lowerstack[idx - 1].layer; + path->mnt = (*layer)->mnt; return (idx < ovl_numlower(oe)) ? idx + 1 : -1; } @@ -1055,7 +1064,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, old_cred = ovl_override_creds(dentry->d_sb); upperdir = ovl_dentry_upper(dentry->d_parent); if (upperdir) { - d.mnt = ovl_upper_mnt(ofs); + d.layer = &ofs->layers[0]; err = ovl_lookup_layer(upperdir, &d, &upperdentry, true); if (err) goto out; @@ -1111,7 +1120,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, else if (d.is_dir || !ofs->numdatalayer) d.last = lower.layer->idx == ovl_numlower(roe); - d.mnt = lower.layer->mnt; + d.layer = lower.layer; err = ovl_lookup_layer(lower.dentry, &d, &this, false); if (err) goto out_put; @@ -1278,6 +1287,8 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, if (upperopaque) ovl_dentry_set_opaque(dentry); + if (d.xwhiteouts) + ovl_dentry_set_xwhiteouts(dentry); if (upperdentry) ovl_dentry_set_upper_alias(dentry); diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 5ba11eb43767..ee949f3e7c77 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -50,7 +50,6 @@ enum ovl_xattr { OVL_XATTR_METACOPY, OVL_XATTR_PROTATTR, OVL_XATTR_XWHITEOUT, - OVL_XATTR_XWHITEOUTS, }; enum ovl_inode_flag { @@ -70,6 +69,8 @@ enum ovl_entry_flag { OVL_E_UPPER_ALIAS, OVL_E_OPAQUE, OVL_E_CONNECTED, + /* Lower stack may contain xwhiteout entries */ + OVL_E_XWHITEOUTS, }; enum { @@ -477,6 +478,10 @@ bool ovl_dentry_test_flag(unsigned long flag, struct dentry *dentry); bool ovl_dentry_is_opaque(struct dentry *dentry); bool ovl_dentry_is_whiteout(struct dentry *dentry); void ovl_dentry_set_opaque(struct dentry *dentry); +bool ovl_dentry_has_xwhiteouts(struct dentry *dentry); +void ovl_dentry_set_xwhiteouts(struct dentry *dentry); +void ovl_layer_set_xwhiteouts(struct ovl_fs *ofs, + const struct ovl_layer *layer); bool ovl_dentry_has_upper_alias(struct dentry *dentry); void ovl_dentry_set_upper_alias(struct dentry *dentry); bool ovl_dentry_needs_data_copy_up(struct dentry *dentry, int flags); @@ -494,11 +499,10 @@ struct file *ovl_path_open(const struct path *path, int flags); int ovl_copy_up_start(struct dentry *dentry, int flags); void ovl_copy_up_end(struct dentry *dentry); bool ovl_already_copied_up(struct dentry *dentry, int flags); -bool ovl_path_check_dir_xattr(struct ovl_fs *ofs, const struct path *path, - enum ovl_xattr ox); +char ovl_get_dir_xattr_val(struct ovl_fs *ofs, const struct path *path, + enum ovl_xattr ox); bool ovl_path_check_origin_xattr(struct ovl_fs *ofs, const struct path *path); bool ovl_path_check_xwhiteout_xattr(struct ovl_fs *ofs, const struct path *path); -bool ovl_path_check_xwhiteouts_xattr(struct ovl_fs *ofs, const struct path *path); bool ovl_init_uuid_xattr(struct super_block *sb, struct ovl_fs *ofs, const struct path *upperpath); @@ -573,7 +577,13 @@ static inline bool ovl_is_impuredir(struct super_block *sb, .mnt = ovl_upper_mnt(ofs), }; - return ovl_path_check_dir_xattr(ofs, &upperpath, OVL_XATTR_IMPURE); + return ovl_get_dir_xattr_val(ofs, &upperpath, OVL_XATTR_IMPURE) == 'y'; +} + +static inline char ovl_get_opaquedir_val(struct ovl_fs *ofs, + const struct path *path) +{ + return ovl_get_dir_xattr_val(ofs, path, OVL_XATTR_OPAQUE); } static inline bool ovl_redirect_follow(struct ovl_fs *ofs) @@ -680,7 +690,8 @@ int ovl_get_index_name(struct ovl_fs *ofs, struct dentry *origin, struct dentry *ovl_get_index_fh(struct ovl_fs *ofs, struct ovl_fh *fh); struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper, struct dentry *origin, bool verify); -int ovl_path_next(int idx, struct dentry *dentry, struct path *path); +int ovl_path_next(int idx, struct dentry *dentry, struct path *path, + const struct ovl_layer **layer); int ovl_verify_lowerdata(struct dentry *dentry); struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags); diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h index 5fa9c58af65f..cb449ab310a7 100644 --- a/fs/overlayfs/ovl_entry.h +++ b/fs/overlayfs/ovl_entry.h @@ -40,6 +40,8 @@ struct ovl_layer { int idx; /* One fsid per unique underlying sb (upper fsid == 0) */ int fsid; + /* xwhiteouts were found on this layer */ + bool has_xwhiteouts; }; struct ovl_path { @@ -59,7 +61,7 @@ struct ovl_fs { unsigned int numfs; /* Number of data-only lower layers */ unsigned int numdatalayer; - const struct ovl_layer *layers; + struct ovl_layer *layers; struct ovl_sb *fs; /* workbasedir is the path at workdir= mount option */ struct dentry *workbasedir; diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index e71156baa7bc..0ca8af060b0c 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -305,8 +305,6 @@ static inline int ovl_dir_read(const struct path *realpath, if (IS_ERR(realfile)) return PTR_ERR(realfile); - rdd->in_xwhiteouts_dir = rdd->dentry && - ovl_path_check_xwhiteouts_xattr(OVL_FS(rdd->dentry->d_sb), realpath); rdd->first_maybe_whiteout = NULL; rdd->ctx.pos = 0; do { @@ -359,10 +357,13 @@ static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list, .is_lowest = false, }; int idx, next; + const struct ovl_layer *layer; for (idx = 0; idx != -1; idx = next) { - next = ovl_path_next(idx, dentry, &realpath); + next = ovl_path_next(idx, dentry, &realpath, &layer); rdd.is_upper = ovl_dentry_upper(dentry) == realpath.dentry; + rdd.in_xwhiteouts_dir = layer->has_xwhiteouts && + ovl_dentry_has_xwhiteouts(dentry); if (next != -1) { err = ovl_dir_read(&realpath, &rdd); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 0bbbe4818f67..2eef6c70b2ae 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -439,8 +439,10 @@ static bool ovl_workdir_ok(struct dentry *workdir, struct dentry *upperdir) bool ok = false; if (workdir != upperdir) { - ok = (lock_rename(workdir, upperdir) == NULL); - unlock_rename(workdir, upperdir); + struct dentry *trap = lock_rename(workdir, upperdir); + if (!IS_ERR(trap)) + unlock_rename(workdir, upperdir); + ok = (trap == NULL); } return ok; } @@ -1247,6 +1249,7 @@ static struct dentry *ovl_get_root(struct super_block *sb, struct ovl_entry *oe) { struct dentry *root; + struct ovl_fs *ofs = OVL_FS(sb); struct ovl_path *lowerpath = ovl_lowerstack(oe); unsigned long ino = d_inode(lowerpath->dentry)->i_ino; int fsid = lowerpath->layer->fsid; @@ -1268,6 +1271,20 @@ static struct dentry *ovl_get_root(struct super_block *sb, ovl_set_flag(OVL_IMPURE, d_inode(root)); } + /* Look for xwhiteouts marker except in the lowermost layer */ + for (int i = 0; i < ovl_numlower(oe) - 1; i++, lowerpath++) { + struct path path = { + .mnt = lowerpath->layer->mnt, + .dentry = lowerpath->dentry, + }; + + /* overlay.opaque=x means xwhiteouts directory */ + if (ovl_get_opaquedir_val(ofs, &path) == 'x') { + ovl_layer_set_xwhiteouts(ofs, lowerpath->layer); + ovl_dentry_set_xwhiteouts(root); + } + } + /* Root is always merge -> can have whiteouts */ ovl_set_flag(OVL_WHITEOUTS, d_inode(root)); ovl_dentry_set_flag(OVL_E_CONNECTED, root); diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index 22b519763267..a8e17f14d7a2 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -461,6 +461,33 @@ void ovl_dentry_set_opaque(struct dentry *dentry) ovl_dentry_set_flag(OVL_E_OPAQUE, dentry); } +bool ovl_dentry_has_xwhiteouts(struct dentry *dentry) +{ + return ovl_dentry_test_flag(OVL_E_XWHITEOUTS, dentry); +} + +void ovl_dentry_set_xwhiteouts(struct dentry *dentry) +{ + ovl_dentry_set_flag(OVL_E_XWHITEOUTS, dentry); +} + +/* + * ovl_layer_set_xwhiteouts() is called before adding the overlay dir + * dentry to dcache, while readdir of that same directory happens after + * the overlay dir dentry is in dcache, so if some cpu observes that + * ovl_dentry_is_xwhiteouts(), it will also observe layer->has_xwhiteouts + * for the layers where xwhiteouts marker was found in that merge dir. + */ +void ovl_layer_set_xwhiteouts(struct ovl_fs *ofs, + const struct ovl_layer *layer) +{ + if (layer->has_xwhiteouts) + return; + + /* Write once to read-mostly layer properties */ + ofs->layers[layer->idx].has_xwhiteouts = true; +} + /* * For hard links and decoded file handles, it's possible for ovl_dentry_upper() * to return positive, while there's no actual upper alias for the inode. @@ -739,19 +766,6 @@ bool ovl_path_check_xwhiteout_xattr(struct ovl_fs *ofs, const struct path *path) return res >= 0; } -bool ovl_path_check_xwhiteouts_xattr(struct ovl_fs *ofs, const struct path *path) -{ - struct dentry *dentry = path->dentry; - int res; - - /* xattr.whiteouts must be a directory */ - if (!d_is_dir(dentry)) - return false; - - res = ovl_path_getxattr(ofs, path, OVL_XATTR_XWHITEOUTS, NULL, 0); - return res >= 0; -} - /* * Load persistent uuid from xattr into s_uuid if found, or store a new * random generated value in s_uuid and in xattr. @@ -811,20 +825,17 @@ fail: return false; } -bool ovl_path_check_dir_xattr(struct ovl_fs *ofs, const struct path *path, - enum ovl_xattr ox) +char ovl_get_dir_xattr_val(struct ovl_fs *ofs, const struct path *path, + enum ovl_xattr ox) { int res; char val; if (!d_is_dir(path->dentry)) - return false; + return 0; res = ovl_path_getxattr(ofs, path, ox, &val, 1); - if (res == 1 && val == 'y') - return true; - - return false; + return res == 1 ? val : 0; } #define OVL_XATTR_OPAQUE_POSTFIX "opaque" @@ -837,7 +848,6 @@ bool ovl_path_check_dir_xattr(struct ovl_fs *ofs, const struct path *path, #define OVL_XATTR_METACOPY_POSTFIX "metacopy" #define OVL_XATTR_PROTATTR_POSTFIX "protattr" #define OVL_XATTR_XWHITEOUT_POSTFIX "whiteout" -#define OVL_XATTR_XWHITEOUTS_POSTFIX "whiteouts" #define OVL_XATTR_TAB_ENTRY(x) \ [x] = { [false] = OVL_XATTR_TRUSTED_PREFIX x ## _POSTFIX, \ @@ -854,7 +864,6 @@ const char *const ovl_xattr_table[][2] = { OVL_XATTR_TAB_ENTRY(OVL_XATTR_METACOPY), OVL_XATTR_TAB_ENTRY(OVL_XATTR_PROTATTR), OVL_XATTR_TAB_ENTRY(OVL_XATTR_XWHITEOUT), - OVL_XATTR_TAB_ENTRY(OVL_XATTR_XWHITEOUTS), }; int ovl_check_setxattr(struct ovl_fs *ofs, struct dentry *upperdentry, @@ -1198,12 +1207,17 @@ void ovl_nlink_end(struct dentry *dentry) int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *upperdir) { + struct dentry *trap; + /* Workdir should not be the same as upperdir */ if (workdir == upperdir) goto err; /* Workdir should not be subdir of upperdir and vice versa */ - if (lock_rename(workdir, upperdir) != NULL) + trap = lock_rename(workdir, upperdir); + if (IS_ERR(trap)) + goto err; + if (trap) goto err_unlock; return 0; diff --git a/fs/pipe.c b/fs/pipe.c index 8d9286a1f2e8..f1adbfe743d4 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -1507,7 +1507,6 @@ static struct ctl_table fs_pipe_sysctls[] = { .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, - { } }; #endif diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 8064ea76f80b..37cde0efee57 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -44,7 +44,7 @@ static struct ctl_table sysctl_mount_point[] = { */ struct ctl_table_header *register_sysctl_mount_point(const char *path) { - return register_sysctl_sz(path, sysctl_mount_point, 0); + return register_sysctl(path, sysctl_mount_point); } EXPORT_SYMBOL(register_sysctl_mount_point); @@ -71,7 +71,6 @@ static struct ctl_table root_table[] = { .procname = "", .mode = S_IFDIR|S_IRUGO|S_IXUGO, }, - { } }; static struct ctl_table_root sysctl_table_root = { .default_set.dir.header = { @@ -233,7 +232,8 @@ static int insert_header(struct ctl_dir *dir, struct ctl_table_header *header) return -EROFS; /* Am I creating a permanently empty directory? */ - if (sysctl_is_perm_empty_ctl_table(header->ctl_table)) { + if (header->ctl_table_size > 0 && + sysctl_is_perm_empty_ctl_table(header->ctl_table)) { if (!RB_EMPTY_ROOT(&dir->root)) return -EINVAL; sysctl_set_perm_empty_ctl_header(dir_h); @@ -534,13 +534,8 @@ static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry, goto out; } - inode = proc_sys_make_inode(dir->i_sb, h ? h : head, p); - if (IS_ERR(inode)) { - err = ERR_CAST(inode); - goto out; - } - d_set_d_op(dentry, &proc_sys_dentry_operations); + inode = proc_sys_make_inode(dir->i_sb, h ? h : head, p); err = d_splice_alias(inode, dentry); out: @@ -698,13 +693,8 @@ static bool proc_sys_fill_cache(struct file *file, return false; if (d_in_lookup(child)) { struct dentry *res; - inode = proc_sys_make_inode(dir->d_sb, head, table); - if (IS_ERR(inode)) { - d_lookup_done(child); - dput(child); - return false; - } d_set_d_op(child, &proc_sys_dentry_operations); + inode = proc_sys_make_inode(dir->d_sb, head, table); res = d_splice_alias(inode, child); d_lookup_done(child); if (unlikely(res)) { @@ -1213,6 +1203,10 @@ static bool get_links(struct ctl_dir *dir, struct ctl_table_header *tmp_head; struct ctl_table *entry, *link; + if (header->ctl_table_size == 0 || + sysctl_is_perm_empty_ctl_table(header->ctl_table)) + return true; + /* Are there links available for every entry in table? */ list_for_each_table_entry(entry, header) { const char *procname = entry->procname; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 62b16f42d5d2..3f78ebbb795f 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -2432,7 +2432,6 @@ static long pagemap_scan_flush_buffer(struct pagemap_scan_private *p) static long do_pagemap_scan(struct mm_struct *mm, unsigned long uarg) { - struct mmu_notifier_range range; struct pagemap_scan_private p = {0}; unsigned long walk_start; size_t n_ranges_out = 0; @@ -2448,15 +2447,9 @@ static long do_pagemap_scan(struct mm_struct *mm, unsigned long uarg) if (ret) return ret; - /* Protection change for the range is going to happen. */ - if (p.arg.flags & PM_SCAN_WP_MATCHING) { - mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, 0, - mm, p.arg.start, p.arg.end); - mmu_notifier_invalidate_range_start(&range); - } - for (walk_start = p.arg.start; walk_start < p.arg.end; walk_start = p.arg.walk_end) { + struct mmu_notifier_range range; long n_out; if (fatal_signal_pending(current)) { @@ -2467,8 +2460,20 @@ static long do_pagemap_scan(struct mm_struct *mm, unsigned long uarg) ret = mmap_read_lock_killable(mm); if (ret) break; + + /* Protection change for the range is going to happen. */ + if (p.arg.flags & PM_SCAN_WP_MATCHING) { + mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, 0, + mm, walk_start, p.arg.end); + mmu_notifier_invalidate_range_start(&range); + } + ret = walk_page_range(mm, walk_start, p.arg.end, &pagemap_scan_ops, &p); + + if (p.arg.flags & PM_SCAN_WP_MATCHING) + mmu_notifier_invalidate_range_end(&range); + mmap_read_unlock(mm); n_out = pagemap_scan_flush_buffer(&p); @@ -2494,9 +2499,6 @@ static long do_pagemap_scan(struct mm_struct *mm, unsigned long uarg) if (pagemap_scan_writeback_args(&p.arg, uarg)) ret = -EFAULT; - if (p.arg.flags & PM_SCAN_WP_MATCHING) - mmu_notifier_invalidate_range_end(&range); - kfree(p.vec_buf); return ret; } diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 44ff2813ae51..1f0c754416b6 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2969,7 +2969,6 @@ static struct ctl_table fs_dqstats_table[] = { .proc_handler = proc_dointvec, }, #endif - { }, }; static int __init dquot_init(void) diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index 994d6e6995ab..7e7b531fcc49 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -451,13 +451,6 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th, BUG_ON(!th->t_trans_id); - /* cannot allow items to be added into a busy deleted directory */ - if (!namelen) - return -EINVAL; - - if (namelen > REISERFS_MAX_NAME(dir->i_sb->s_blocksize)) - return -ENAMETOOLONG; - /* each entry has unique key. compose it */ make_cpu_key(&entry_key, dir, get_third_component(dir->i_sb, name, namelen), @@ -1324,8 +1317,8 @@ static int reiserfs_rename(struct mnt_idmap *idmap, struct inode *old_inode, *new_dentry_inode; struct reiserfs_transaction_handle th; int jbegin_count; - umode_t old_inode_mode; unsigned long savelink = 1; + bool update_dir_parent = false; if (flags & ~RENAME_NOREPLACE) return -EINVAL; @@ -1375,8 +1368,7 @@ static int reiserfs_rename(struct mnt_idmap *idmap, return -ENOENT; } - old_inode_mode = old_inode->i_mode; - if (S_ISDIR(old_inode_mode)) { + if (S_ISDIR(old_inode->i_mode)) { /* * make sure that directory being renamed has correct ".." * and that its new parent directory has not too many links @@ -1389,24 +1381,28 @@ static int reiserfs_rename(struct mnt_idmap *idmap, } } - /* - * directory is renamed, its parent directory will be changed, - * so find ".." entry - */ - dot_dot_de.de_gen_number_bit_string = NULL; - retval = - reiserfs_find_entry(old_inode, "..", 2, &dot_dot_entry_path, + if (old_dir != new_dir) { + /* + * directory is renamed, its parent directory will be + * changed, so find ".." entry + */ + dot_dot_de.de_gen_number_bit_string = NULL; + retval = + reiserfs_find_entry(old_inode, "..", 2, + &dot_dot_entry_path, &dot_dot_de); - pathrelse(&dot_dot_entry_path); - if (retval != NAME_FOUND) { - reiserfs_write_unlock(old_dir->i_sb); - return -EIO; - } + pathrelse(&dot_dot_entry_path); + if (retval != NAME_FOUND) { + reiserfs_write_unlock(old_dir->i_sb); + return -EIO; + } - /* inode number of .. must equal old_dir->i_ino */ - if (dot_dot_de.de_objectid != old_dir->i_ino) { - reiserfs_write_unlock(old_dir->i_sb); - return -EIO; + /* inode number of .. must equal old_dir->i_ino */ + if (dot_dot_de.de_objectid != old_dir->i_ino) { + reiserfs_write_unlock(old_dir->i_sb); + return -EIO; + } + update_dir_parent = true; } } @@ -1486,7 +1482,7 @@ static int reiserfs_rename(struct mnt_idmap *idmap, reiserfs_prepare_for_journal(old_inode->i_sb, new_de.de_bh, 1); - if (S_ISDIR(old_inode->i_mode)) { + if (update_dir_parent) { if ((retval = search_by_entry_key(new_dir->i_sb, &dot_dot_de.de_entry_key, @@ -1534,14 +1530,14 @@ static int reiserfs_rename(struct mnt_idmap *idmap, new_de.de_bh); reiserfs_restore_prepared_buffer(old_inode->i_sb, old_de.de_bh); - if (S_ISDIR(old_inode_mode)) + if (update_dir_parent) reiserfs_restore_prepared_buffer(old_inode-> i_sb, dot_dot_de. de_bh); continue; } - if (S_ISDIR(old_inode_mode)) { + if (update_dir_parent) { if (item_moved(&dot_dot_ih, &dot_dot_entry_path) || !entry_points_to_object("..", 2, &dot_dot_de, old_dir)) { @@ -1559,7 +1555,7 @@ static int reiserfs_rename(struct mnt_idmap *idmap, } } - RFALSE(S_ISDIR(old_inode_mode) && + RFALSE(update_dir_parent && !buffer_journal_prepared(dot_dot_de.de_bh), ""); break; @@ -1592,11 +1588,12 @@ static int reiserfs_rename(struct mnt_idmap *idmap, savelink = new_dentry_inode->i_nlink; } - if (S_ISDIR(old_inode_mode)) { + if (update_dir_parent) { /* adjust ".." of renamed directory */ set_ino_in_dir_entry(&dot_dot_de, INODE_PKEY(new_dir)); journal_mark_dirty(&th, dot_dot_de.de_bh); - + } + if (S_ISDIR(old_inode->i_mode)) { /* * there (in new_dir) was no directory, so it got new link * (".." of renamed directory) diff --git a/fs/smb/client/cached_dir.c b/fs/smb/client/cached_dir.c index d64a306a414b..1daeb5714faa 100644 --- a/fs/smb/client/cached_dir.c +++ b/fs/smb/client/cached_dir.c @@ -145,21 +145,27 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, struct cached_fid *cfid; struct cached_fids *cfids; const char *npath; + int retries = 0, cur_sleep = 1; if (tcon == NULL || tcon->cfids == NULL || tcon->nohandlecache || is_smb1_server(tcon->ses->server) || (dir_cache_timeout == 0)) return -EOPNOTSUPP; ses = tcon->ses; - server = ses->server; cfids = tcon->cfids; - if (!server->ops->new_lease_key) - return -EIO; - if (cifs_sb->root == NULL) return -ENOENT; +replay_again: + /* reinitialize for possible replay */ + flags = 0; + oplock = SMB2_OPLOCK_LEVEL_II; + server = cifs_pick_channel(ses); + + if (!server->ops->new_lease_key) + return -EIO; + utf16_path = cifs_convert_path_to_utf16(path, cifs_sb); if (!utf16_path) return -ENOMEM; @@ -268,6 +274,11 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, */ cfid->has_lease = true; + if (retries) { + smb2_set_replay(server, &rqst[0]); + smb2_set_replay(server, &rqst[1]); + } + rc = compound_send_recv(xid, ses, server, flags, 2, rqst, resp_buftype, rsp_iov); @@ -367,6 +378,11 @@ out: atomic_inc(&tcon->num_remote_opens); } kfree(utf16_path); + + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } diff --git a/fs/smb/client/cifs_debug.c b/fs/smb/client/cifs_debug.c index 60027f5aebe8..3e4209f41c18 100644 --- a/fs/smb/client/cifs_debug.c +++ b/fs/smb/client/cifs_debug.c @@ -659,6 +659,7 @@ static ssize_t cifs_stats_proc_write(struct file *file, spin_lock(&tcon->stat_lock); tcon->bytes_read = 0; tcon->bytes_written = 0; + tcon->stats_from_time = ktime_get_real_seconds(); spin_unlock(&tcon->stat_lock); if (server->ops->clear_stats) server->ops->clear_stats(tcon); @@ -737,8 +738,9 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v) seq_printf(m, "\n%d) %s", i, tcon->tree_name); if (tcon->need_reconnect) seq_puts(m, "\tDISCONNECTED "); - seq_printf(m, "\nSMBs: %d", - atomic_read(&tcon->num_smbs_sent)); + seq_printf(m, "\nSMBs: %d since %ptTs UTC", + atomic_read(&tcon->num_smbs_sent), + &tcon->stats_from_time); if (server->ops->print_stats) server->ops->print_stats(m, tcon); } diff --git a/fs/smb/client/cifsencrypt.c b/fs/smb/client/cifsencrypt.c index ef4c2e3c9fa6..6322f0f68a17 100644 --- a/fs/smb/client/cifsencrypt.c +++ b/fs/smb/client/cifsencrypt.c @@ -572,7 +572,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, len = cifs_strtoUTF16(user, ses->user_name, len, nls_cp); UniStrupr(user); } else { - memset(user, '\0', 2); + *(u16 *)user = 0; } rc = crypto_shash_update(ses->server->secmech.hmacmd5, diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index 99b0ade833aa..2a4a4e3a8751 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -396,7 +396,7 @@ cifs_alloc_inode(struct super_block *sb) spin_lock_init(&cifs_inode->writers_lock); cifs_inode->writers = 0; cifs_inode->netfs.inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */ - cifs_inode->server_eof = 0; + cifs_inode->netfs.remote_i_size = 0; cifs_inode->uniqueid = 0; cifs_inode->createtime = 0; cifs_inode->epoch = 0; @@ -430,7 +430,7 @@ static void cifs_evict_inode(struct inode *inode) { truncate_inode_pages_final(&inode->i_data); - if (inode->i_state & I_PINNING_FSCACHE_WB) + if (inode->i_state & I_PINNING_NETFS_WB) cifs_fscache_unuse_inode_cookie(inode, true); cifs_fscache_release_inode_cookie(inode); clear_inode(inode); @@ -681,6 +681,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root) seq_printf(s, ",rasize=%u", cifs_sb->ctx->rasize); if (tcon->ses->server->min_offload) seq_printf(s, ",esize=%u", tcon->ses->server->min_offload); + if (tcon->ses->server->retrans) + seq_printf(s, ",retrans=%u", tcon->ses->server->retrans); seq_printf(s, ",echo_interval=%lu", tcon->ses->server->echo_interval / HZ); @@ -793,8 +795,7 @@ static int cifs_show_stats(struct seq_file *s, struct dentry *root) static int cifs_write_inode(struct inode *inode, struct writeback_control *wbc) { - fscache_unpin_writeback(wbc, cifs_inode_cookie(inode)); - return 0; + return netfs_unpin_writeback(inode, wbc); } static int cifs_drop_inode(struct inode *inode) @@ -1222,7 +1223,7 @@ static int cifs_precopy_set_eof(struct inode *src_inode, struct cifsInodeInfo *s if (rc < 0) goto set_failed; - netfs_resize_file(&src_cifsi->netfs, src_end); + netfs_resize_file(&src_cifsi->netfs, src_end, true); fscache_resize_cookie(cifs_inode_cookie(src_inode), src_end); return 0; @@ -1353,7 +1354,7 @@ static loff_t cifs_remap_file_range(struct file *src_file, loff_t off, smb_file_src, smb_file_target, off, len, destoff); if (rc == 0 && new_size > i_size_read(target_inode)) { truncate_setsize(target_inode, new_size); - netfs_resize_file(&target_cifsi->netfs, new_size); + netfs_resize_file(&target_cifsi->netfs, new_size, true); fscache_resize_cookie(cifs_inode_cookie(target_inode), new_size); } @@ -1379,6 +1380,7 @@ ssize_t cifs_file_copychunk_range(unsigned int xid, struct inode *src_inode = file_inode(src_file); struct inode *target_inode = file_inode(dst_file); struct cifsInodeInfo *src_cifsi = CIFS_I(src_inode); + struct cifsInodeInfo *target_cifsi = CIFS_I(target_inode); struct cifsFileInfo *smb_file_src; struct cifsFileInfo *smb_file_target; struct cifs_tcon *src_tcon; @@ -1427,7 +1429,7 @@ ssize_t cifs_file_copychunk_range(unsigned int xid, * Advance the EOF marker after the flush above to the end of the range * if it's short of that. */ - if (src_cifsi->server_eof < off + len) { + if (src_cifsi->netfs.remote_i_size < off + len) { rc = cifs_precopy_set_eof(src_inode, src_cifsi, src_tcon, xid, off + len); if (rc < 0) goto unlock; @@ -1451,12 +1453,22 @@ ssize_t cifs_file_copychunk_range(unsigned int xid, /* Discard all the folios that overlap the destination region. */ truncate_inode_pages_range(&target_inode->i_data, fstart, fend); + fscache_invalidate(cifs_inode_cookie(target_inode), NULL, + i_size_read(target_inode), 0); + rc = file_modified(dst_file); if (!rc) { rc = target_tcon->ses->server->ops->copychunk_range(xid, smb_file_src, smb_file_target, off, len, destoff); - if (rc > 0 && destoff + rc > i_size_read(target_inode)) + if (rc > 0 && destoff + rc > i_size_read(target_inode)) { truncate_setsize(target_inode, destoff + rc); + netfs_resize_file(&target_cifsi->netfs, + i_size_read(target_inode), true); + fscache_resize_cookie(cifs_inode_cookie(target_inode), + i_size_read(target_inode)); + } + if (rc > 0 && destoff + rc > target_cifsi->netfs.zero_point) + target_cifsi->netfs.zero_point = destoff + rc; } file_accessed(src_file); diff --git a/fs/smb/client/cifsfs.h b/fs/smb/client/cifsfs.h index 3adea10aa9da..685f7d1139c6 100644 --- a/fs/smb/client/cifsfs.h +++ b/fs/smb/client/cifsfs.h @@ -152,6 +152,6 @@ extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ /* when changing internal version - update following two lines at same time */ -#define SMB3_PRODUCT_BUILD 46 -#define CIFS_VERSION "2.46" +#define SMB3_PRODUCT_BUILD 47 +#define CIFS_VERSION "2.47" #endif /* _CIFSFS_H */ diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 5e32c79f03a7..c86a72c9d9ec 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -50,6 +50,11 @@ #define CIFS_DEF_ACTIMEO (1 * HZ) /* + * max sleep time before retry to server + */ +#define CIFS_MAX_SLEEP 2000 + +/* * max attribute cache timeout (jiffies) - 2^30 */ #define CIFS_MAX_ACTIMEO (1 << 30) @@ -82,7 +87,7 @@ #define SMB_INTERFACE_POLL_INTERVAL 600 /* maximum number of PDUs in one compound */ -#define MAX_COMPOUND 5 +#define MAX_COMPOUND 7 /* * Default number of credits to keep available for SMB3. @@ -192,6 +197,11 @@ struct cifs_open_info_data { bool symlink; }; struct { + /* ioctl response buffer */ + struct { + int buftype; + struct kvec iov; + } io; __u32 tag; union { struct reparse_data_buffer *buf; @@ -199,19 +209,25 @@ struct cifs_open_info_data { }; } reparse; char *symlink_target; + struct cifs_sid posix_owner; + struct cifs_sid posix_group; union { struct smb2_file_all_info fi; struct smb311_posix_qinfo posix_fi; }; }; -#define cifs_open_data_reparse(d) \ - ((d)->reparse_point || \ - (le32_to_cpu((d)->fi.Attributes) & ATTR_REPARSE)) - -static inline void cifs_free_open_info(struct cifs_open_info_data *data) +static inline bool cifs_open_data_reparse(struct cifs_open_info_data *data) { - kfree(data->symlink_target); + struct smb2_file_all_info *fi = &data->fi; + u32 attrs = le32_to_cpu(fi->Attributes); + bool ret; + + ret = data->reparse_point || (attrs & ATTR_REPARSE); + if (ret) + attrs |= ATTR_REPARSE; + fi->Attributes = cpu_to_le32(attrs); + return ret; } /* @@ -390,12 +406,17 @@ struct smb_version_operations { int (*rename_pending_delete)(const char *, struct dentry *, const unsigned int); /* send rename request */ - int (*rename)(const unsigned int, struct cifs_tcon *, const char *, - const char *, struct cifs_sb_info *); + int (*rename)(const unsigned int xid, + struct cifs_tcon *tcon, + struct dentry *source_dentry, + const char *from_name, const char *to_name, + struct cifs_sb_info *cifs_sb); /* send create hardlink request */ - int (*create_hardlink)(const unsigned int, struct cifs_tcon *, - const char *, const char *, - struct cifs_sb_info *); + int (*create_hardlink)(const unsigned int xid, + struct cifs_tcon *tcon, + struct dentry *source_dentry, + const char *from_name, const char *to_name, + struct cifs_sb_info *cifs_sb); /* query symlink target */ int (*query_symlink)(const unsigned int xid, struct cifs_tcon *tcon, @@ -560,6 +581,12 @@ struct smb_version_operations { int (*parse_reparse_point)(struct cifs_sb_info *cifs_sb, struct kvec *rsp_iov, struct cifs_open_info_data *data); + int (*create_reparse_symlink)(const unsigned int xid, + struct inode *inode, + struct dentry *dentry, + struct cifs_tcon *tcon, + const char *full_path, + const char *symname); }; struct smb_version_values { @@ -731,6 +758,7 @@ struct TCP_Server_Info { unsigned int max_read; unsigned int max_write; unsigned int min_offload; + unsigned int retrans; __le16 compress_algorithm; __u16 signing_algorithm; __le16 cipher_type; @@ -1004,6 +1032,8 @@ struct cifs_chan { __u8 signkey[SMB3_SIGN_KEY_SIZE]; }; +#define CIFS_SES_FLAG_SCALE_CHANNELS (0x1) + /* * Session structure. One of these for each uid session with a particular host */ @@ -1036,6 +1066,7 @@ struct cifs_ses { enum securityEnum sectype; /* what security flavor was specified? */ bool sign; /* is signing required? */ bool domainAuto:1; + unsigned int flags; __u16 session_flags; __u8 smb3signingkey[SMB3_SIGN_KEY_SIZE]; __u8 smb3encryptionkey[SMB3_ENC_DEC_KEY_SIZE]; @@ -1187,6 +1218,7 @@ struct cifs_tcon { __u64 bytes_read; __u64 bytes_written; spinlock_t stat_lock; /* protects the two fields above */ + time64_t stats_from_time; FILE_SYSTEM_DEVICE_INFO fsDevInfo; FILE_SYSTEM_ATTRIBUTE_INFO fsAttrInfo; /* ok if fs name truncated */ FILE_SYSTEM_UNIX_INFO fsUnixInfo; @@ -1477,6 +1509,7 @@ struct cifs_writedata { struct smbd_mr *mr; #endif struct cifs_credits credits; + bool replay; }; /* @@ -1537,7 +1570,6 @@ struct cifsInodeInfo { spinlock_t writers_lock; unsigned int writers; /* Number of writers on this inode */ unsigned long time; /* jiffies of last update of inode */ - u64 server_eof; /* current file size on server -- protected by i_lock */ u64 uniqueid; /* server inode number */ u64 createtime; /* creation time on server */ __u8 lease_key[SMB2_LEASE_KEY_SIZE]; /* lease key for this inode */ @@ -1545,6 +1577,7 @@ struct cifsInodeInfo { spinlock_t deferred_lock; /* protection on deferred list */ bool lease_granted; /* Flag to indicate whether lease or oplock is granted. */ char *symlink_target; + __u32 reparse_tag; }; static inline struct cifsInodeInfo * @@ -1806,6 +1839,13 @@ static inline bool is_retryable_error(int error) return false; } +static inline bool is_replayable_error(int error) +{ + if (error == -EAGAIN || error == -ECONNABORTED) + return true; + return false; +} + /* cifs_get_writable_file() flags */ #define FIND_WR_ANY 0 @@ -2238,8 +2278,8 @@ static inline void cifs_sg_set_buf(struct sg_table *sgtable, struct smb2_compound_vars { struct cifs_open_parms oparms; - struct kvec rsp_iov[3]; - struct smb_rqst rqst[3]; + struct kvec rsp_iov[MAX_COMPOUND]; + struct smb_rqst rqst[MAX_COMPOUND]; struct kvec open_iov[SMB2_CREATE_IOV_SIZE]; struct kvec qi_iov; struct kvec io_iov[SMB2_IOCTL_IOV_SIZE]; diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h index 46feaa0880bd..a841bf4967fa 100644 --- a/fs/smb/client/cifsproto.h +++ b/fs/smb/client/cifsproto.h @@ -211,8 +211,12 @@ int cifs_get_inode_info(struct inode **inode, const char *full_path, bool cifs_reparse_point_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, struct cifs_open_info_data *data); -extern int smb311_posix_get_inode_info(struct inode **pinode, const char *search_path, - struct super_block *sb, unsigned int xid); + +extern int smb311_posix_get_inode_info(struct inode **inode, + const char *full_path, + struct cifs_open_info_data *data, + struct super_block *sb, + const unsigned int xid); extern int cifs_get_inode_info_unix(struct inode **pinode, const unsigned char *search_path, struct super_block *sb, unsigned int xid); @@ -435,16 +439,19 @@ extern int CIFSPOSIXDelFile(const unsigned int xid, struct cifs_tcon *tcon, int remap_special_chars); extern int CIFSSMBDelFile(const unsigned int xid, struct cifs_tcon *tcon, const char *name, struct cifs_sb_info *cifs_sb); -extern int CIFSSMBRename(const unsigned int xid, struct cifs_tcon *tcon, - const char *from_name, const char *to_name, - struct cifs_sb_info *cifs_sb); +int CIFSSMBRename(const unsigned int xid, struct cifs_tcon *tcon, + struct dentry *source_dentry, + const char *from_name, const char *to_name, + struct cifs_sb_info *cifs_sb); extern int CIFSSMBRenameOpenFile(const unsigned int xid, struct cifs_tcon *tcon, int netfid, const char *target_name, const struct nls_table *nls_codepage, int remap_special_chars); -extern int CIFSCreateHardLink(const unsigned int xid, struct cifs_tcon *tcon, - const char *from_name, const char *to_name, - struct cifs_sb_info *cifs_sb); +int CIFSCreateHardLink(const unsigned int xid, + struct cifs_tcon *tcon, + struct dentry *source_dentry, + const char *from_name, const char *to_name, + struct cifs_sb_info *cifs_sb); extern int CIFSUnixCreateHardLink(const unsigned int xid, struct cifs_tcon *tcon, const char *fromName, const char *toName, @@ -649,7 +656,7 @@ cifs_chan_is_iface_active(struct cifs_ses *ses, struct TCP_Server_Info *server); void cifs_disable_secondary_channels(struct cifs_ses *ses); -int +void cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server); int SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon, bool in_mount); @@ -760,4 +767,11 @@ static inline void release_mid(struct mid_q_entry *mid) kref_put(&mid->refcount, __release_mid); } +static inline void cifs_free_open_info(struct cifs_open_info_data *data) +{ + kfree(data->symlink_target); + free_rsp_buf(data->reparse.io.buftype, data->reparse.io.iov.iov_base); + memset(data, 0, sizeof(*data)); +} + #endif /* _CIFSPROTO_H */ diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c index 9ee348e6d106..01e89070df5a 100644 --- a/fs/smb/client/cifssmb.c +++ b/fs/smb/client/cifssmb.c @@ -2149,10 +2149,10 @@ CIFSSMBFlush(const unsigned int xid, struct cifs_tcon *tcon, int smb_file_id) return rc; } -int -CIFSSMBRename(const unsigned int xid, struct cifs_tcon *tcon, - const char *from_name, const char *to_name, - struct cifs_sb_info *cifs_sb) +int CIFSSMBRename(const unsigned int xid, struct cifs_tcon *tcon, + struct dentry *source_dentry, + const char *from_name, const char *to_name, + struct cifs_sb_info *cifs_sb) { int rc = 0; RENAME_REQ *pSMB = NULL; @@ -2530,10 +2530,11 @@ createHardLinkRetry: return rc; } -int -CIFSCreateHardLink(const unsigned int xid, struct cifs_tcon *tcon, - const char *from_name, const char *to_name, - struct cifs_sb_info *cifs_sb) +int CIFSCreateHardLink(const unsigned int xid, + struct cifs_tcon *tcon, + struct dentry *source_dentry, + const char *from_name, const char *to_name, + struct cifs_sb_info *cifs_sb) { int rc = 0; NT_RENAME_REQ *pSMB = NULL; @@ -2699,11 +2700,12 @@ int cifs_query_reparse_point(const unsigned int xid, u32 *tag, struct kvec *rsp, int *rsp_buftype) { + struct reparse_data_buffer *buf; struct cifs_open_parms oparms; TRANSACT_IOCTL_REQ *io_req = NULL; TRANSACT_IOCTL_RSP *io_rsp = NULL; struct cifs_fid fid; - __u32 data_offset, data_count; + __u32 data_offset, data_count, len; __u8 *start, *end; int io_rsp_len; int oplock = 0; @@ -2773,7 +2775,16 @@ int cifs_query_reparse_point(const unsigned int xid, goto error; } - *tag = le32_to_cpu(((struct reparse_data_buffer *)start)->ReparseTag); + data_count = le16_to_cpu(io_rsp->ByteCount); + buf = (struct reparse_data_buffer *)start; + len = sizeof(*buf); + if (data_count < len || + data_count < le16_to_cpu(buf->ReparseDataLength) + len) { + rc = -EIO; + goto error; + } + + *tag = le32_to_cpu(buf->ReparseTag); rsp->iov_base = io_rsp; rsp->iov_len = io_rsp_len; *rsp_buftype = CIFS_LARGE_BUFFER; diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index dc9b95ca71e6..bfd568f89710 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -483,6 +483,7 @@ static int reconnect_target_unlocked(struct TCP_Server_Info *server, struct dfs_ static int reconnect_dfs_server(struct TCP_Server_Info *server) { struct dfs_cache_tgt_iterator *target_hint = NULL; + DFS_CACHE_TGT_LIST(tl); int num_targets = 0; int rc = 0; @@ -745,6 +746,7 @@ cifs_read_from_socket(struct TCP_Server_Info *server, char *buf, { struct msghdr smb_msg = {}; struct kvec iov = {.iov_base = buf, .iov_len = to_read}; + iov_iter_kvec(&smb_msg.msg_iter, ITER_DEST, &iov, 1, to_read); return cifs_readv_from_socket(server, &smb_msg); @@ -1400,11 +1402,13 @@ cifs_match_ipaddr(struct sockaddr *srcaddr, struct sockaddr *rhs) case AF_INET: { struct sockaddr_in *saddr4 = (struct sockaddr_in *)srcaddr; struct sockaddr_in *vaddr4 = (struct sockaddr_in *)rhs; + return (saddr4->sin_addr.s_addr == vaddr4->sin_addr.s_addr); } case AF_INET6: { struct sockaddr_in6 *saddr6 = (struct sockaddr_in6 *)srcaddr; struct sockaddr_in6 *vaddr6 = (struct sockaddr_in6 *)rhs; + return (ipv6_addr_equal(&saddr6->sin6_addr, &vaddr6->sin6_addr) && saddr6->sin6_scope_id == vaddr6->sin6_scope_id); } @@ -1570,6 +1574,9 @@ static int match_server(struct TCP_Server_Info *server, if (server->min_offload != ctx->min_offload) return 0; + if (server->retrans != ctx->retrans) + return 0; + return 1; } @@ -1794,6 +1801,7 @@ smbd_connected: goto out_err_crypto_release; } tcp_ses->min_offload = ctx->min_offload; + tcp_ses->retrans = ctx->retrans; /* * at this point we are the only ones with the pointer * to the struct since the kernel thread not created yet @@ -2599,8 +2607,8 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx) rc = -EOPNOTSUPP; goto out_fail; } else { - cifs_dbg(VFS, "Check vers= mount option. SMB3.11 " - "disabled but required for POSIX extensions\n"); + cifs_dbg(VFS, + "Check vers= mount option. SMB3.11 disabled but required for POSIX extensions\n"); rc = -EOPNOTSUPP; goto out_fail; } @@ -2743,7 +2751,6 @@ cifs_put_tlink(struct tcon_link *tlink) if (!IS_ERR(tlink_tcon(tlink))) cifs_put_tcon(tlink_tcon(tlink)); kfree(tlink); - return; } static int @@ -2884,6 +2891,7 @@ static inline void cifs_reclassify_socket4(struct socket *sock) { struct sock *sk = sock->sk; + BUG_ON(!sock_allow_reclassification(sk)); sock_lock_init_class_and_name(sk, "slock-AF_INET-CIFS", &cifs_slock_key[0], "sk_lock-AF_INET-CIFS", &cifs_key[0]); @@ -2893,6 +2901,7 @@ static inline void cifs_reclassify_socket6(struct socket *sock) { struct sock *sk = sock->sk; + BUG_ON(!sock_allow_reclassification(sk)); sock_lock_init_class_and_name(sk, "slock-AF_INET6-CIFS", &cifs_slock_key[1], "sk_lock-AF_INET6-CIFS", &cifs_key[1]); @@ -2927,15 +2936,18 @@ static int bind_socket(struct TCP_Server_Info *server) { int rc = 0; + if (server->srcaddr.ss_family != AF_UNSPEC) { /* Bind to the specified local IP address */ struct socket *socket = server->ssocket; + rc = kernel_bind(socket, (struct sockaddr *) &server->srcaddr, sizeof(server->srcaddr)); if (rc < 0) { struct sockaddr_in *saddr4; struct sockaddr_in6 *saddr6; + saddr4 = (struct sockaddr_in *)&server->srcaddr; saddr6 = (struct sockaddr_in6 *)&server->srcaddr; if (saddr6->sin6_family == AF_INET6) @@ -3165,6 +3177,7 @@ void reset_cifs_unix_caps(unsigned int xid, struct cifs_tcon *tcon, if (!CIFSSMBQFSUnixInfo(xid, tcon)) { __u64 cap = le64_to_cpu(tcon->fsUnixInfo.Capability); + cifs_dbg(FYI, "unix caps which server supports %lld\n", cap); /* * check for reconnect case in which we do not @@ -3668,7 +3681,7 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses, smb_buffer_response = smb_buffer; header_assemble(smb_buffer, SMB_COM_TREE_CONNECT_ANDX, - NULL /*no tid */ , 4 /*wct */ ); + NULL /*no tid */, 4 /*wct */); smb_buffer->Mid = get_next_mid(ses->server); smb_buffer->Uid = ses->Suid; @@ -3687,12 +3700,12 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses, if (ses->server->sign) smb_buffer->Flags2 |= SMBFLG2_SECURITY_SIGNATURE; - if (ses->capabilities & CAP_STATUS32) { + if (ses->capabilities & CAP_STATUS32) smb_buffer->Flags2 |= SMBFLG2_ERR_STATUS; - } - if (ses->capabilities & CAP_DFS) { + + if (ses->capabilities & CAP_DFS) smb_buffer->Flags2 |= SMBFLG2_DFS; - } + if (ses->capabilities & CAP_UNICODE) { smb_buffer->Flags2 |= SMBFLG2_UNICODE; length = diff --git a/fs/smb/client/dir.c b/fs/smb/client/dir.c index 580a27a3a7e6..89333d9bce36 100644 --- a/fs/smb/client/dir.c +++ b/fs/smb/client/dir.c @@ -680,9 +680,10 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, full_path, d_inode(direntry)); again: - if (pTcon->posix_extensions) - rc = smb311_posix_get_inode_info(&newInode, full_path, parent_dir_inode->i_sb, xid); - else if (pTcon->unix_ext) { + if (pTcon->posix_extensions) { + rc = smb311_posix_get_inode_info(&newInode, full_path, NULL, + parent_dir_inode->i_sb, xid); + } else if (pTcon->unix_ext) { rc = cifs_get_inode_info_unix(&newInode, full_path, parent_dir_inode->i_sb, xid); } else { diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index 4e84e88b47e3..b75282c204da 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -87,7 +87,7 @@ void cifs_pages_written_back(struct inode *inode, loff_t start, unsigned int len continue; if (!folio_test_writeback(folio)) { WARN_ONCE(1, "bad %x @%llx page %lx %lx\n", - len, start, folio_index(folio), end); + len, start, folio->index, end); continue; } @@ -120,7 +120,7 @@ void cifs_pages_write_failed(struct inode *inode, loff_t start, unsigned int len continue; if (!folio_test_writeback(folio)) { WARN_ONCE(1, "bad %x @%llx page %lx %lx\n", - len, start, folio_index(folio), end); + len, start, folio->index, end); continue; } @@ -151,7 +151,7 @@ void cifs_pages_write_redirty(struct inode *inode, loff_t start, unsigned int le xas_for_each(&xas, folio, end) { if (!folio_test_writeback(folio)) { WARN_ONCE(1, "bad %x @%llx page %lx %lx\n", - len, start, folio_index(folio), end); + len, start, folio->index, end); continue; } @@ -1020,14 +1020,16 @@ reopen_success: if (!is_interrupt_error(rc)) mapping_set_error(inode->i_mapping, rc); - if (tcon->posix_extensions) - rc = smb311_posix_get_inode_info(&inode, full_path, inode->i_sb, xid); - else if (tcon->unix_ext) + if (tcon->posix_extensions) { + rc = smb311_posix_get_inode_info(&inode, full_path, + NULL, inode->i_sb, xid); + } else if (tcon->unix_ext) { rc = cifs_get_inode_info_unix(&inode, full_path, inode->i_sb, xid); - else + } else { rc = cifs_get_inode_info(&inode, full_path, NULL, inode->i_sb, xid, NULL); + } } /* * Else we are writing out data to server already and could deadlock if @@ -2118,8 +2120,8 @@ cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset, { loff_t end_of_write = offset + bytes_written; - if (end_of_write > cifsi->server_eof) - cifsi->server_eof = end_of_write; + if (end_of_write > cifsi->netfs.remote_i_size) + netfs_resize_file(&cifsi->netfs, end_of_write, true); } static ssize_t @@ -2649,7 +2651,7 @@ static void cifs_extend_writeback(struct address_space *mapping, continue; if (xa_is_value(folio)) break; - if (folio_index(folio) != index) + if (folio->index != index) break; if (!folio_try_get_rcu(folio)) { xas_reset(&xas); @@ -2897,7 +2899,7 @@ redo_folio: goto skip_write; } - if (folio_mapping(folio) != mapping || + if (folio->mapping != mapping || !folio_test_dirty(folio)) { start += folio_size(folio); folio_unlock(folio); @@ -3245,8 +3247,8 @@ cifs_uncached_writev_complete(struct work_struct *work) spin_lock(&inode->i_lock); cifs_update_eof(cifsi, wdata->offset, wdata->bytes); - if (cifsi->server_eof > inode->i_size) - i_size_write(inode, cifsi->server_eof); + if (cifsi->netfs.remote_i_size > inode->i_size) + i_size_write(inode, cifsi->netfs.remote_i_size); spin_unlock(&inode->i_lock); complete(&wdata->done); @@ -3298,6 +3300,7 @@ cifs_resend_wdata(struct cifs_writedata *wdata, struct list_head *wdata_list, if (wdata->cfile->invalidHandle) rc = -EAGAIN; else { + wdata->replay = true; #ifdef CONFIG_CIFS_SMB_DIRECT if (wdata->mr) { wdata->mr->need_invalidate = true; @@ -5041,27 +5044,13 @@ static void cifs_swap_deactivate(struct file *file) /* do we need to unpin (or unlock) the file */ } -/* - * Mark a page as having been made dirty and thus needing writeback. We also - * need to pin the cache object to write back to. - */ -#ifdef CONFIG_CIFS_FSCACHE -static bool cifs_dirty_folio(struct address_space *mapping, struct folio *folio) -{ - return fscache_dirty_folio(mapping, folio, - cifs_inode_cookie(mapping->host)); -} -#else -#define cifs_dirty_folio filemap_dirty_folio -#endif - const struct address_space_operations cifs_addr_ops = { .read_folio = cifs_read_folio, .readahead = cifs_readahead, .writepages = cifs_writepages, .write_begin = cifs_write_begin, .write_end = cifs_write_end, - .dirty_folio = cifs_dirty_folio, + .dirty_folio = netfs_dirty_folio, .release_folio = cifs_release_folio, .direct_IO = cifs_direct_io, .invalidate_folio = cifs_invalidate_folio, @@ -5085,7 +5074,7 @@ const struct address_space_operations cifs_addr_ops_smallbuf = { .writepages = cifs_writepages, .write_begin = cifs_write_begin, .write_end = cifs_write_end, - .dirty_folio = cifs_dirty_folio, + .dirty_folio = netfs_dirty_folio, .release_folio = cifs_release_folio, .invalidate_folio = cifs_invalidate_folio, .launder_folio = cifs_launder_folio, diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c index a3493da12ad1..52cbef2eeb28 100644 --- a/fs/smb/client/fs_context.c +++ b/fs/smb/client/fs_context.c @@ -139,6 +139,7 @@ const struct fs_parameter_spec smb3_fs_parameters[] = { fsparam_u32("dir_mode", Opt_dirmode), fsparam_u32("port", Opt_port), fsparam_u32("min_enc_offload", Opt_min_enc_offload), + fsparam_u32("retrans", Opt_retrans), fsparam_u32("esize", Opt_min_enc_offload), fsparam_u32("bsize", Opt_blocksize), fsparam_u32("rasize", Opt_rasize), @@ -1064,6 +1065,9 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, case Opt_min_enc_offload: ctx->min_offload = result.uint_32; break; + case Opt_retrans: + ctx->retrans = result.uint_32; + break; case Opt_blocksize: /* * inode blocksize realistically should never need to be @@ -1619,6 +1623,8 @@ int smb3_init_fs_context(struct fs_context *fc) ctx->backupuid_specified = false; /* no backup intent for a user */ ctx->backupgid_specified = false; /* no backup intent for a group */ + ctx->retrans = 1; + /* * short int override_uid = -1; * short int override_gid = -1; diff --git a/fs/smb/client/fs_context.h b/fs/smb/client/fs_context.h index cf46916286d0..182ce11cbe93 100644 --- a/fs/smb/client/fs_context.h +++ b/fs/smb/client/fs_context.h @@ -118,6 +118,7 @@ enum cifs_param { Opt_file_mode, Opt_dirmode, Opt_min_enc_offload, + Opt_retrans, Opt_blocksize, Opt_rasize, Opt_rsize, @@ -245,6 +246,7 @@ struct smb3_fs_context { unsigned int rsize; unsigned int wsize; unsigned int min_offload; + unsigned int retrans; bool sockopt_tcp_nodelay:1; /* attribute cache timemout for files and directories in jiffies */ unsigned long acregmax; diff --git a/fs/smb/client/fscache.c b/fs/smb/client/fscache.c index e5cad149f5a2..c4a3cb736881 100644 --- a/fs/smb/client/fscache.c +++ b/fs/smb/client/fscache.c @@ -180,7 +180,7 @@ static int fscache_fallback_write_pages(struct inode *inode, loff_t start, size_ if (ret < 0) return ret; - ret = cres.ops->prepare_write(&cres, &start, &len, i_size_read(inode), + ret = cres.ops->prepare_write(&cres, &start, &len, len, i_size_read(inode), no_space_allocated_yet); if (ret == 0) ret = fscache_write(&cres, start, &iter, NULL, NULL); diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index 09c5c0f5c96e..d02f8ba29cb5 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -104,7 +104,7 @@ cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr) fattr->cf_mtime = timestamp_truncate(fattr->cf_mtime, inode); mtime = inode_get_mtime(inode); if (timespec64_equal(&mtime, &fattr->cf_mtime) && - cifs_i->server_eof == fattr->cf_eof) { + cifs_i->netfs.remote_i_size == fattr->cf_eof) { cifs_dbg(FYI, "%s: inode %llu is unchanged\n", __func__, cifs_i->uniqueid); return; @@ -182,6 +182,7 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr) inode->i_mode = fattr->cf_mode; cifs_i->cifsAttrs = fattr->cf_cifsattrs; + cifs_i->reparse_tag = fattr->cf_cifstag; if (fattr->cf_flags & CIFS_FATTR_NEED_REVAL) cifs_i->time = 0; @@ -193,7 +194,7 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr) else clear_bit(CIFS_INO_DELETE_PENDING, &cifs_i->flags); - cifs_i->server_eof = fattr->cf_eof; + cifs_i->netfs.remote_i_size = fattr->cf_eof; /* * Can't safely change the file size here if the client is writing to * it due to potential races. @@ -209,7 +210,7 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr) inode->i_blocks = (512 - 1 + fattr->cf_bytes) >> 9; } - if (S_ISLNK(fattr->cf_mode)) { + if (S_ISLNK(fattr->cf_mode) && fattr->cf_symlink_target) { kfree(cifs_i->symlink_target); cifs_i->symlink_target = fattr->cf_symlink_target; fattr->cf_symlink_target = NULL; @@ -664,8 +665,6 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path, /* Fill a cifs_fattr struct with info from POSIX info struct */ static void smb311_posix_info_to_fattr(struct cifs_fattr *fattr, struct cifs_open_info_data *data, - struct cifs_sid *owner, - struct cifs_sid *group, struct super_block *sb) { struct smb311_posix_qinfo *info = &data->posix_fi; @@ -691,31 +690,38 @@ static void smb311_posix_info_to_fattr(struct cifs_fattr *fattr, fattr->cf_mtime.tv_sec += tcon->ses->server->timeAdj; } + /* + * The srv fs device id is overridden on network mount so setting + * @fattr->cf_rdev isn't needed here. + */ fattr->cf_eof = le64_to_cpu(info->EndOfFile); fattr->cf_bytes = le64_to_cpu(info->AllocationSize); fattr->cf_createtime = le64_to_cpu(info->CreationTime); - fattr->cf_nlink = le32_to_cpu(info->HardLinks); fattr->cf_mode = (umode_t) le32_to_cpu(info->Mode); - /* The srv fs device id is overridden on network mount so setting rdev isn't needed here */ - /* fattr->cf_rdev = le32_to_cpu(info->DeviceId); */ - if (data->symlink) { - fattr->cf_mode |= S_IFLNK; - fattr->cf_dtype = DT_LNK; - fattr->cf_symlink_target = data->symlink_target; - data->symlink_target = NULL; - } else if (fattr->cf_cifsattrs & ATTR_DIRECTORY) { + if (cifs_open_data_reparse(data) && + cifs_reparse_point_to_fattr(cifs_sb, fattr, data)) + goto out_reparse; + + fattr->cf_mode &= ~S_IFMT; + if (fattr->cf_cifsattrs & ATTR_DIRECTORY) { fattr->cf_mode |= S_IFDIR; fattr->cf_dtype = DT_DIR; } else { /* file */ fattr->cf_mode |= S_IFREG; fattr->cf_dtype = DT_REG; } - /* else if reparse point ... TODO: add support for FIFO and blk dev; special file types */ - sid_to_id(cifs_sb, owner, fattr, SIDOWNER); - sid_to_id(cifs_sb, group, fattr, SIDGROUP); +out_reparse: + if (S_ISLNK(fattr->cf_mode)) { + if (likely(data->symlink_target)) + fattr->cf_eof = strnlen(data->symlink_target, PATH_MAX); + fattr->cf_symlink_target = data->symlink_target; + data->symlink_target = NULL; + } + sid_to_id(cifs_sb, &data->posix_owner, fattr, SIDOWNER); + sid_to_id(cifs_sb, &data->posix_group, fattr, SIDGROUP); cifs_dbg(FYI, "POSIX query info: mode 0x%x uniqueid 0x%llx nlink %d\n", fattr->cf_mode, fattr->cf_uniqueid, fattr->cf_nlink); @@ -738,25 +744,25 @@ bool cifs_reparse_point_to_fattr(struct cifs_sb_info *cifs_sb, if (tag == IO_REPARSE_TAG_NFS && buf) { switch (le64_to_cpu(buf->InodeType)) { case NFS_SPECFILE_CHR: - fattr->cf_mode |= S_IFCHR | cifs_sb->ctx->file_mode; + fattr->cf_mode |= S_IFCHR; fattr->cf_dtype = DT_CHR; fattr->cf_rdev = nfs_mkdev(buf); break; case NFS_SPECFILE_BLK: - fattr->cf_mode |= S_IFBLK | cifs_sb->ctx->file_mode; + fattr->cf_mode |= S_IFBLK; fattr->cf_dtype = DT_BLK; fattr->cf_rdev = nfs_mkdev(buf); break; case NFS_SPECFILE_FIFO: - fattr->cf_mode |= S_IFIFO | cifs_sb->ctx->file_mode; + fattr->cf_mode |= S_IFIFO; fattr->cf_dtype = DT_FIFO; break; case NFS_SPECFILE_SOCK: - fattr->cf_mode |= S_IFSOCK | cifs_sb->ctx->file_mode; + fattr->cf_mode |= S_IFSOCK; fattr->cf_dtype = DT_SOCK; break; case NFS_SPECFILE_LNK: - fattr->cf_mode = S_IFLNK | cifs_sb->ctx->file_mode; + fattr->cf_mode |= S_IFLNK; fattr->cf_dtype = DT_LNK; break; default: @@ -768,29 +774,29 @@ bool cifs_reparse_point_to_fattr(struct cifs_sb_info *cifs_sb, switch (tag) { case IO_REPARSE_TAG_LX_SYMLINK: - fattr->cf_mode |= S_IFLNK | cifs_sb->ctx->file_mode; + fattr->cf_mode |= S_IFLNK; fattr->cf_dtype = DT_LNK; break; case IO_REPARSE_TAG_LX_FIFO: - fattr->cf_mode |= S_IFIFO | cifs_sb->ctx->file_mode; + fattr->cf_mode |= S_IFIFO; fattr->cf_dtype = DT_FIFO; break; case IO_REPARSE_TAG_AF_UNIX: - fattr->cf_mode |= S_IFSOCK | cifs_sb->ctx->file_mode; + fattr->cf_mode |= S_IFSOCK; fattr->cf_dtype = DT_SOCK; break; case IO_REPARSE_TAG_LX_CHR: - fattr->cf_mode |= S_IFCHR | cifs_sb->ctx->file_mode; + fattr->cf_mode |= S_IFCHR; fattr->cf_dtype = DT_CHR; break; case IO_REPARSE_TAG_LX_BLK: - fattr->cf_mode |= S_IFBLK | cifs_sb->ctx->file_mode; + fattr->cf_mode |= S_IFBLK; fattr->cf_dtype = DT_BLK; break; case 0: /* SMB1 symlink */ case IO_REPARSE_TAG_SYMLINK: case IO_REPARSE_TAG_NFS: - fattr->cf_mode = S_IFLNK | cifs_sb->ctx->file_mode; + fattr->cf_mode |= S_IFLNK; fattr->cf_dtype = DT_LNK; break; default: @@ -830,6 +836,7 @@ static void cifs_open_info_to_fattr(struct cifs_fattr *fattr, fattr->cf_createtime = le64_to_cpu(info->CreationTime); fattr->cf_nlink = le32_to_cpu(info->NumberOfLinks); + fattr->cf_mode = cifs_sb->ctx->file_mode; if (cifs_open_data_reparse(data) && cifs_reparse_point_to_fattr(cifs_sb, fattr, data)) goto out_reparse; @@ -1076,6 +1083,9 @@ static int reparse_info_to_fattr(struct cifs_open_info_data *data, &rsp_iov, &rsp_buftype); if (!rc) iov = &rsp_iov; + } else if (data->reparse.io.buftype != CIFS_NO_BUFFER && + data->reparse.io.iov.iov_base) { + iov = &data->reparse.io.iov; } rc = -EOPNOTSUPP; @@ -1092,17 +1102,22 @@ static int reparse_info_to_fattr(struct cifs_open_info_data *data, rc = 0; goto out; default: - if (data->symlink_target) { + /* Check for cached reparse point data */ + if (data->symlink_target || data->reparse.buf) { rc = 0; - } else if (server->ops->parse_reparse_point) { + } else if (iov && server->ops->parse_reparse_point) { rc = server->ops->parse_reparse_point(cifs_sb, iov, data); } break; } - cifs_open_info_to_fattr(fattr, data, sb); + if (tcon->posix_extensions) + smb311_posix_info_to_fattr(fattr, data, sb); + else + cifs_open_info_to_fattr(fattr, data, sb); out: + fattr->cf_cifstag = data->reparse.tag; free_rsp_buf(rsp_buftype, rsp_iov.iov_base); return rc; } @@ -1290,31 +1305,34 @@ out: return rc; } -static int smb311_posix_get_fattr(struct cifs_fattr *fattr, +static int smb311_posix_get_fattr(struct cifs_open_info_data *data, + struct cifs_fattr *fattr, const char *full_path, struct super_block *sb, const unsigned int xid) { - struct cifs_open_info_data data = {}; + struct cifs_open_info_data tmp_data = {}; + struct TCP_Server_Info *server; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); struct cifs_tcon *tcon; struct tcon_link *tlink; - struct cifs_sid owner, group; int tmprc; - int rc; + int rc = 0; tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) return PTR_ERR(tlink); tcon = tlink_tcon(tlink); + server = tcon->ses->server; /* - * 1. Fetch file metadata + * 1. Fetch file metadata if not provided (data) */ - - rc = smb311_posix_query_path_info(xid, tcon, cifs_sb, - full_path, &data, - &owner, &group); + if (!data) { + rc = server->ops->query_path_info(xid, tcon, cifs_sb, + full_path, &tmp_data); + data = &tmp_data; + } /* * 2. Convert it to internal cifs metadata (fattr) @@ -1322,7 +1340,12 @@ static int smb311_posix_get_fattr(struct cifs_fattr *fattr, switch (rc) { case 0: - smb311_posix_info_to_fattr(fattr, &data, &owner, &group, sb); + if (cifs_open_data_reparse(data)) { + rc = reparse_info_to_fattr(data, sb, xid, tcon, + full_path, fattr); + } else { + smb311_posix_info_to_fattr(fattr, data, sb); + } break; case -EREMOTE: /* DFS link, no metadata available on this server */ @@ -1353,12 +1376,15 @@ static int smb311_posix_get_fattr(struct cifs_fattr *fattr, out: cifs_put_tlink(tlink); - cifs_free_open_info(&data); + cifs_free_open_info(data); return rc; } -int smb311_posix_get_inode_info(struct inode **inode, const char *full_path, - struct super_block *sb, const unsigned int xid) +int smb311_posix_get_inode_info(struct inode **inode, + const char *full_path, + struct cifs_open_info_data *data, + struct super_block *sb, + const unsigned int xid) { struct cifs_fattr fattr = {}; int rc; @@ -1368,7 +1394,7 @@ int smb311_posix_get_inode_info(struct inode **inode, const char *full_path, return 0; } - rc = smb311_posix_get_fattr(&fattr, full_path, sb, xid); + rc = smb311_posix_get_fattr(data, &fattr, full_path, sb, xid); if (rc) goto out; @@ -1516,7 +1542,7 @@ struct inode *cifs_root_iget(struct super_block *sb) convert_delimiter(path, CIFS_DIR_SEP(cifs_sb)); if (tcon->posix_extensions) - rc = smb311_posix_get_fattr(&fattr, path, sb, xid); + rc = smb311_posix_get_fattr(NULL, &fattr, path, sb, xid); else rc = cifs_get_fattr(NULL, sb, xid, NULL, &fattr, &inode, path); @@ -1889,16 +1915,18 @@ cifs_mkdir_qinfo(struct inode *parent, struct dentry *dentry, umode_t mode, int rc = 0; struct inode *inode = NULL; - if (tcon->posix_extensions) - rc = smb311_posix_get_inode_info(&inode, full_path, parent->i_sb, xid); + if (tcon->posix_extensions) { + rc = smb311_posix_get_inode_info(&inode, full_path, + NULL, parent->i_sb, xid); #ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY - else if (tcon->unix_ext) + } else if (tcon->unix_ext) { rc = cifs_get_inode_info_unix(&inode, full_path, parent->i_sb, xid); #endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ - else + } else { rc = cifs_get_inode_info(&inode, full_path, NULL, parent->i_sb, xid, NULL); + } if (rc) return rc; @@ -2219,7 +2247,8 @@ cifs_do_rename(const unsigned int xid, struct dentry *from_dentry, return -ENOSYS; /* try path-based rename first */ - rc = server->ops->rename(xid, tcon, from_path, to_path, cifs_sb); + rc = server->ops->rename(xid, tcon, from_dentry, + from_path, to_path, cifs_sb); /* * Don't bother with rename by filehandle unless file is busy and @@ -2579,13 +2608,15 @@ int cifs_revalidate_dentry_attr(struct dentry *dentry) dentry, cifs_get_time(dentry), jiffies); again: - if (cifs_sb_master_tcon(CIFS_SB(sb))->posix_extensions) - rc = smb311_posix_get_inode_info(&inode, full_path, sb, xid); - else if (cifs_sb_master_tcon(CIFS_SB(sb))->unix_ext) + if (cifs_sb_master_tcon(CIFS_SB(sb))->posix_extensions) { + rc = smb311_posix_get_inode_info(&inode, full_path, + NULL, sb, xid); + } else if (cifs_sb_master_tcon(CIFS_SB(sb))->unix_ext) { rc = cifs_get_inode_info_unix(&inode, full_path, sb, xid); - else + } else { rc = cifs_get_inode_info(&inode, full_path, NULL, sb, xid, NULL); + } if (rc == -EAGAIN && count++ < 10) goto again; out: @@ -2827,7 +2858,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs, set_size_out: if (rc == 0) { - cifsInode->server_eof = attrs->ia_size; + netfs_resize_file(&cifsInode->netfs, attrs->ia_size, true); cifs_setsize(inode, attrs->ia_size); /* * i_blocks is not related to (i_size / i_blksize), but instead @@ -2980,6 +3011,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs) if ((attrs->ia_valid & ATTR_SIZE) && attrs->ia_size != i_size_read(inode)) { truncate_setsize(inode, attrs->ia_size); + netfs_resize_file(&cifsInode->netfs, attrs->ia_size, true); fscache_resize_cookie(cifs_inode_cookie(inode), attrs->ia_size); } @@ -3179,6 +3211,7 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) if ((attrs->ia_valid & ATTR_SIZE) && attrs->ia_size != i_size_read(inode)) { truncate_setsize(inode, attrs->ia_size); + netfs_resize_file(&cifsInode->netfs, attrs->ia_size, true); fscache_resize_cookie(cifs_inode_cookie(inode), attrs->ia_size); } diff --git a/fs/smb/client/link.c b/fs/smb/client/link.c index a1da50e66fbb..d86da949a919 100644 --- a/fs/smb/client/link.c +++ b/fs/smb/client/link.c @@ -510,8 +510,8 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode, rc = -ENOSYS; goto cifs_hl_exit; } - rc = server->ops->create_hardlink(xid, tcon, from_name, to_name, - cifs_sb); + rc = server->ops->create_hardlink(xid, tcon, old_file, + from_name, to_name, cifs_sb); if ((rc == -EIO) || (rc == -EINVAL)) rc = -EOPNOTSUPP; } @@ -569,6 +569,7 @@ cifs_symlink(struct mnt_idmap *idmap, struct inode *inode, int rc = -EOPNOTSUPP; unsigned int xid; struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct TCP_Server_Info *server; struct tcon_link *tlink; struct cifs_tcon *pTcon; const char *full_path; @@ -590,6 +591,7 @@ cifs_symlink(struct mnt_idmap *idmap, struct inode *inode, goto symlink_exit; } pTcon = tlink_tcon(tlink); + server = cifs_pick_channel(pTcon->ses); full_path = build_path_from_dentry(direntry, page); if (IS_ERR(full_path)) { @@ -601,27 +603,32 @@ cifs_symlink(struct mnt_idmap *idmap, struct inode *inode, cifs_dbg(FYI, "symname is %s\n", symname); /* BB what if DFS and this volume is on different share? BB */ - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) { rc = create_mf_symlink(xid, pTcon, cifs_sb, full_path, symname); #ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY - else if (pTcon->unix_ext) + } else if (pTcon->unix_ext) { rc = CIFSUnixCreateSymLink(xid, pTcon, full_path, symname, cifs_sb->local_nls, cifs_remap(cifs_sb)); #endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ - /* else - rc = CIFSCreateReparseSymLink(xid, pTcon, fromName, toName, - cifs_sb_target->local_nls); */ + } else if (server->ops->create_reparse_symlink) { + rc = server->ops->create_reparse_symlink(xid, inode, direntry, + pTcon, full_path, + symname); + goto symlink_exit; + } if (rc == 0) { - if (pTcon->posix_extensions) - rc = smb311_posix_get_inode_info(&newinode, full_path, inode->i_sb, xid); - else if (pTcon->unix_ext) + if (pTcon->posix_extensions) { + rc = smb311_posix_get_inode_info(&newinode, full_path, + NULL, inode->i_sb, xid); + } else if (pTcon->unix_ext) { rc = cifs_get_inode_info_unix(&newinode, full_path, inode->i_sb, xid); - else + } else { rc = cifs_get_inode_info(&newinode, full_path, NULL, inode->i_sb, xid, NULL); + } if (rc != 0) { cifs_dbg(FYI, "Create symlink ok, getinodeinfo fail rc = %d\n", diff --git a/fs/smb/client/misc.c b/fs/smb/client/misc.c index c2137ea3c253..0748d7b757b9 100644 --- a/fs/smb/client/misc.c +++ b/fs/smb/client/misc.c @@ -140,6 +140,7 @@ tcon_info_alloc(bool dir_leases_enabled) spin_lock_init(&ret_buf->stat_lock); atomic_set(&ret_buf->num_local_opens, 0); atomic_set(&ret_buf->num_remote_opens, 0); + ret_buf->stats_from_time = ktime_get_real_seconds(); #ifdef CONFIG_CIFS_DFS_UPCALL INIT_LIST_HEAD(&ret_buf->dfs_ses_list); #endif diff --git a/fs/smb/client/readdir.c b/fs/smb/client/readdir.c index d30ea2005eb3..3b1b01d10f7d 100644 --- a/fs/smb/client/readdir.c +++ b/fs/smb/client/readdir.c @@ -56,6 +56,23 @@ static inline void dump_cifs_file_struct(struct file *file, char *label) #endif /* DEBUG2 */ /* + * Match a reparse point inode if reparse tag and ctime haven't changed. + * + * Windows Server updates ctime of reparse points when their data have changed. + * The server doesn't allow changing reparse tags from existing reparse points, + * though it's worth checking. + */ +static inline bool reparse_inode_match(struct inode *inode, + struct cifs_fattr *fattr) +{ + struct timespec64 ctime = inode_get_ctime(inode); + + return (CIFS_I(inode)->cifsAttrs & ATTR_REPARSE) && + CIFS_I(inode)->reparse_tag == fattr->cf_cifstag && + timespec64_equal(&ctime, &fattr->cf_ctime); +} + +/* * Attempt to preload the dcache with the results from the FIND_FIRST/NEXT * * Find the dentry that matches "name". If there isn't one, create one. If it's @@ -71,6 +88,7 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name, struct super_block *sb = parent->d_sb; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); + int rc; cifs_dbg(FYI, "%s: for %s\n", __func__, name->name); @@ -82,9 +100,11 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name, * We'll end up doing an on the wire call either way and * this spares us an invalidation. */ - if (fattr->cf_flags & CIFS_FATTR_NEED_REVAL) - return; retry: + if ((fattr->cf_cifsattrs & ATTR_REPARSE) || + (fattr->cf_flags & CIFS_FATTR_NEED_REVAL)) + return; + dentry = d_alloc_parallel(parent, name, &wq); } if (IS_ERR(dentry)) @@ -104,12 +124,34 @@ retry: if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)) fattr->cf_uniqueid = CIFS_I(inode)->uniqueid; - /* update inode in place - * if both i_ino and i_mode didn't change */ - if (CIFS_I(inode)->uniqueid == fattr->cf_uniqueid && - cifs_fattr_to_inode(inode, fattr) == 0) { - dput(dentry); - return; + /* + * Update inode in place if both i_ino and i_mode didn't + * change. + */ + if (CIFS_I(inode)->uniqueid == fattr->cf_uniqueid) { + /* + * Query dir responses don't provide enough + * information about reparse points other than + * their reparse tags. Save an invalidation by + * not clobbering some existing attributes when + * reparse tag and ctime haven't changed. + */ + rc = 0; + if (fattr->cf_cifsattrs & ATTR_REPARSE) { + if (likely(reparse_inode_match(inode, fattr))) { + fattr->cf_mode = inode->i_mode; + fattr->cf_rdev = inode->i_rdev; + fattr->cf_eof = CIFS_I(inode)->netfs.remote_i_size; + fattr->cf_symlink_target = NULL; + } else { + CIFS_I(inode)->time = 0; + rc = -ESTALE; + } + } + if (!rc && !cifs_fattr_to_inode(inode, fattr)) { + dput(dentry); + return; + } } } d_invalidate(dentry); @@ -127,29 +169,6 @@ retry: dput(dentry); } -static bool reparse_file_needs_reval(const struct cifs_fattr *fattr) -{ - if (!(fattr->cf_cifsattrs & ATTR_REPARSE)) - return false; - /* - * The DFS tags should be only intepreted by server side as per - * MS-FSCC 2.1.2.1, but let's include them anyway. - * - * Besides, if cf_cifstag is unset (0), then we still need it to be - * revalidated to know exactly what reparse point it is. - */ - switch (fattr->cf_cifstag) { - case IO_REPARSE_TAG_DFS: - case IO_REPARSE_TAG_DFSR: - case IO_REPARSE_TAG_SYMLINK: - case IO_REPARSE_TAG_NFS: - case IO_REPARSE_TAG_MOUNT_POINT: - case 0: - return true; - } - return false; -} - static void cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb) { @@ -181,14 +200,6 @@ cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb) } out_reparse: - /* - * We need to revalidate it further to make a decision about whether it - * is a symbolic link, DFS referral or a reparse point with a direct - * access like junctions, deduplicated files, NFS symlinks. - */ - if (reparse_file_needs_reval(fattr)) - fattr->cf_flags |= CIFS_FATTR_NEED_REVAL; - /* non-unix readdir doesn't provide nlink */ fattr->cf_flags |= CIFS_FATTR_UNKNOWN_NLINK; @@ -269,9 +280,6 @@ cifs_posix_to_fattr(struct cifs_fattr *fattr, struct smb2_posix_info *info, fattr->cf_dtype = DT_REG; } - if (reparse_file_needs_reval(fattr)) - fattr->cf_flags |= CIFS_FATTR_NEED_REVAL; - sid_to_id(cifs_sb, &parsed.owner, fattr, SIDOWNER); sid_to_id(cifs_sb, &parsed.group, fattr, SIDGROUP); } @@ -331,38 +339,6 @@ cifs_std_info_to_fattr(struct cifs_fattr *fattr, FIND_FILE_STANDARD_INFO *info, cifs_fill_common_info(fattr, cifs_sb); } -/* BB eventually need to add the following helper function to - resolve NT_STATUS_STOPPED_ON_SYMLINK return code when - we try to do FindFirst on (NTFS) directory symlinks */ -/* -int get_symlink_reparse_path(char *full_path, struct cifs_sb_info *cifs_sb, - unsigned int xid) -{ - __u16 fid; - int len; - int oplock = 0; - int rc; - struct cifs_tcon *ptcon = cifs_sb_tcon(cifs_sb); - char *tmpbuffer; - - rc = CIFSSMBOpen(xid, ptcon, full_path, FILE_OPEN, GENERIC_READ, - OPEN_REPARSE_POINT, &fid, &oplock, NULL, - cifs_sb->local_nls, - cifs_remap(cifs_sb); - if (!rc) { - tmpbuffer = kmalloc(maxpath); - rc = CIFSSMBQueryReparseLinkInfo(xid, ptcon, full_path, - tmpbuffer, - maxpath -1, - fid, - cifs_sb->local_nls); - if (CIFSSMBClose(xid, ptcon, fid)) { - cifs_dbg(FYI, "Error closing temporary reparsepoint open\n"); - } - } -} - */ - static int _initiate_cifs_search(const unsigned int xid, struct file *file, const char *full_path) @@ -431,13 +407,10 @@ ffirst_retry: &cifsFile->fid, search_flags, &cifsFile->srch_inf); - if (rc == 0) + if (rc == 0) { cifsFile->invalidHandle = false; - /* BB add following call to handle readdir on new NTFS symlink errors - else if STATUS_STOPPED_ON_SYMLINK - call get_symlink_reparse_path and retry with new path */ - else if ((rc == -EOPNOTSUPP) && - (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)) { + } else if ((rc == -EOPNOTSUPP) && + (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)) { cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM; goto ffirst_retry; } @@ -672,10 +645,10 @@ static int cifs_entry_is_dot(struct cifs_dirent *de, bool is_unicode) static int is_dir_changed(struct file *file) { struct inode *inode = file_inode(file); - struct cifsInodeInfo *cifsInfo = CIFS_I(inode); + struct cifsInodeInfo *cifs_inode_info = CIFS_I(inode); - if (cifsInfo->time == 0) - return 1; /* directory was changed, perhaps due to unlink */ + if (cifs_inode_info->time == 0) + return 1; /* directory was changed, e.g. unlink or new file */ else return 0; diff --git a/fs/smb/client/sess.c b/fs/smb/client/sess.c index 2d3b332a79a1..ed4bd88dd528 100644 --- a/fs/smb/client/sess.c +++ b/fs/smb/client/sess.c @@ -75,6 +75,10 @@ cifs_ses_get_chan_index(struct cifs_ses *ses, { unsigned int i; + /* if the channel is waiting for termination */ + if (server->terminate) + return CIFS_INVAL_CHAN_INDEX; + for (i = 0; i < ses->chan_count; i++) { if (ses->chans[i].server == server) return i; @@ -269,6 +273,8 @@ int cifs_try_adding_channels(struct cifs_ses *ses) &iface->sockaddr, rc); kref_put(&iface->refcount, release_iface); + /* failure to add chan should increase weight */ + iface->weight_fulfilled++; continue; } @@ -356,10 +362,9 @@ done: /* * update the iface for the channel if necessary. - * will return 0 when iface is updated, 1 if removed, 2 otherwise * Must be called with chan_lock held. */ -int +void cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server) { unsigned int chan_index; @@ -368,20 +373,19 @@ cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server) struct cifs_server_iface *old_iface = NULL; struct cifs_server_iface *last_iface = NULL; struct sockaddr_storage ss; - int rc = 0; spin_lock(&ses->chan_lock); chan_index = cifs_ses_get_chan_index(ses, server); if (chan_index == CIFS_INVAL_CHAN_INDEX) { spin_unlock(&ses->chan_lock); - return 0; + return; } if (ses->chans[chan_index].iface) { old_iface = ses->chans[chan_index].iface; if (old_iface->is_active) { spin_unlock(&ses->chan_lock); - return 1; + return; } } spin_unlock(&ses->chan_lock); @@ -394,7 +398,7 @@ cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server) if (!ses->iface_count) { spin_unlock(&ses->iface_lock); cifs_dbg(VFS, "server %s does not advertise interfaces\n", ses->server->hostname); - return 0; + return; } last_iface = list_last_entry(&ses->iface_list, struct cifs_server_iface, @@ -434,16 +438,21 @@ cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server) } if (list_entry_is_head(iface, &ses->iface_list, iface_head)) { - rc = 1; iface = NULL; cifs_dbg(FYI, "unable to find a suitable iface\n"); } if (!iface) { - cifs_dbg(FYI, "unable to get the interface matching: %pIS\n", - &ss); + if (!chan_index) + cifs_dbg(FYI, "unable to get the interface matching: %pIS\n", + &ss); + else { + cifs_dbg(FYI, "unable to find another interface to replace: %pIS\n", + &old_iface->sockaddr); + } + spin_unlock(&ses->iface_lock); - return 0; + return; } /* now drop the ref to the current iface */ @@ -459,34 +468,24 @@ cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server) iface->weight_fulfilled++; kref_put(&old_iface->refcount, release_iface); - } else if (old_iface) { - /* if a new candidate is not found, keep things as is */ - cifs_dbg(FYI, "could not replace iface: %pIS\n", - &old_iface->sockaddr); } else if (!chan_index) { /* special case: update interface for primary channel */ - if (iface) { - cifs_dbg(FYI, "referencing primary channel iface: %pIS\n", - &iface->sockaddr); - iface->num_channels++; - iface->weight_fulfilled++; - } + cifs_dbg(FYI, "referencing primary channel iface: %pIS\n", + &iface->sockaddr); + iface->num_channels++; + iface->weight_fulfilled++; } spin_unlock(&ses->iface_lock); - if (iface) { - spin_lock(&ses->chan_lock); - chan_index = cifs_ses_get_chan_index(ses, server); - if (chan_index == CIFS_INVAL_CHAN_INDEX) { - spin_unlock(&ses->chan_lock); - return 0; - } - - ses->chans[chan_index].iface = iface; + spin_lock(&ses->chan_lock); + chan_index = cifs_ses_get_chan_index(ses, server); + if (chan_index == CIFS_INVAL_CHAN_INDEX) { spin_unlock(&ses->chan_lock); + return; } - return rc; + ses->chans[chan_index].iface = iface; + spin_unlock(&ses->chan_lock); } /* diff --git a/fs/smb/client/smb2glob.h b/fs/smb/client/smb2glob.h index 82e916ad167c..a0c156996fc5 100644 --- a/fs/smb/client/smb2glob.h +++ b/fs/smb/client/smb2glob.h @@ -23,17 +23,21 @@ * Identifiers for functions that use the open, operation, close pattern * in smb2inode.c:smb2_compound_op() */ -#define SMB2_OP_SET_DELETE 1 -#define SMB2_OP_SET_INFO 2 -#define SMB2_OP_QUERY_INFO 3 -#define SMB2_OP_QUERY_DIR 4 -#define SMB2_OP_MKDIR 5 -#define SMB2_OP_RENAME 6 -#define SMB2_OP_DELETE 7 -#define SMB2_OP_HARDLINK 8 -#define SMB2_OP_SET_EOF 9 -#define SMB2_OP_RMDIR 10 -#define SMB2_OP_POSIX_QUERY_INFO 11 +enum smb2_compound_ops { + SMB2_OP_SET_DELETE = 1, + SMB2_OP_SET_INFO, + SMB2_OP_QUERY_INFO, + SMB2_OP_QUERY_DIR, + SMB2_OP_MKDIR, + SMB2_OP_RENAME, + SMB2_OP_DELETE, + SMB2_OP_HARDLINK, + SMB2_OP_SET_EOF, + SMB2_OP_RMDIR, + SMB2_OP_POSIX_QUERY_INFO, + SMB2_OP_SET_REPARSE, + SMB2_OP_GET_REPARSE +}; /* Used when constructing chained read requests. */ #define CHAINED_REQUEST 1 diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c index c94940af5d4b..05818cd6d932 100644 --- a/fs/smb/client/smb2inode.c +++ b/fs/smb/client/smb2inode.c @@ -26,13 +26,63 @@ #include "cached_dir.h" #include "smb2status.h" -static void -free_set_inf_compound(struct smb_rqst *rqst) +static struct reparse_data_buffer *reparse_buf_ptr(struct kvec *iov) { - if (rqst[1].rq_iov) - SMB2_set_info_free(&rqst[1]); - if (rqst[2].rq_iov) - SMB2_close_free(&rqst[2]); + struct reparse_data_buffer *buf; + struct smb2_ioctl_rsp *io = iov->iov_base; + u32 off, count, len; + + count = le32_to_cpu(io->OutputCount); + off = le32_to_cpu(io->OutputOffset); + if (check_add_overflow(off, count, &len) || len > iov->iov_len) + return ERR_PTR(-EIO); + + buf = (struct reparse_data_buffer *)((u8 *)io + off); + len = sizeof(*buf); + if (count < len || count < le16_to_cpu(buf->ReparseDataLength) + len) + return ERR_PTR(-EIO); + return buf; +} + +static inline __u32 file_create_options(struct dentry *dentry) +{ + struct cifsInodeInfo *ci; + + if (dentry) { + ci = CIFS_I(d_inode(dentry)); + if (ci->cifsAttrs & ATTR_REPARSE) + return OPEN_REPARSE_POINT; + } + return 0; +} + +/* Parse owner and group from SMB3.1.1 POSIX query info */ +static int parse_posix_sids(struct cifs_open_info_data *data, + struct kvec *rsp_iov) +{ + struct smb2_query_info_rsp *qi = rsp_iov->iov_base; + unsigned int out_len = le32_to_cpu(qi->OutputBufferLength); + unsigned int qi_len = sizeof(data->posix_fi); + int owner_len, group_len; + u8 *sidsbuf, *sidsbuf_end; + + if (out_len <= qi_len) + return -EINVAL; + + sidsbuf = (u8 *)qi + le16_to_cpu(qi->OutputBufferOffset) + qi_len; + sidsbuf_end = sidsbuf + out_len - qi_len; + + owner_len = posix_info_sid_size(sidsbuf, sidsbuf_end); + if (owner_len == -1) + return -EINVAL; + + memcpy(&data->posix_owner, sidsbuf, owner_len); + group_len = posix_info_sid_size(sidsbuf + owner_len, sidsbuf_end); + if (group_len == -1) + return -EINVAL; + + memcpy(&data->posix_group, sidsbuf + owner_len, group_len); + return 0; } /* @@ -45,13 +95,15 @@ free_set_inf_compound(struct smb_rqst *rqst) */ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *full_path, - __u32 desired_access, __u32 create_disposition, __u32 create_options, - umode_t mode, void *ptr, int command, struct cifsFileInfo *cfile, - __u8 **extbuf, size_t *extbuflen, + __u32 desired_access, __u32 create_disposition, + __u32 create_options, umode_t mode, struct kvec *in_iov, + int *cmds, int num_cmds, struct cifsFileInfo *cfile, struct kvec *out_iov, int *out_buftype) { + + struct reparse_data_buffer *rbuf; struct smb2_compound_vars *vars = NULL; - struct kvec *rsp_iov; + struct kvec *rsp_iov, *iov; struct smb_rqst *rqst; int rc; __le16 *utf16_path = NULL; @@ -59,8 +111,8 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid fid; struct cifs_ses *ses = tcon->ses; struct TCP_Server_Info *server; - int num_rqst = 0; - int resp_buftype[3]; + int num_rqst = 0, i; + int resp_buftype[MAX_COMPOUND]; struct smb2_query_info_rsp *qi_rsp = NULL; struct cifs_open_info_data *idata; int flags = 0; @@ -68,6 +120,14 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, unsigned int size[2]; void *data[2]; int len; + int retries = 0, cur_sleep = 1; + +replay_again: + /* reinitialize for possible replay */ + flags = 0; + oplock = SMB2_OPLOCK_LEVEL_NONE; + num_rqst = 0; + server = cifs_pick_channel(ses); vars = kzalloc(sizeof(*vars), GFP_ATOMIC); if (vars == NULL) @@ -75,12 +135,11 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, rqst = &vars->rqst[0]; rsp_iov = &vars->rsp_iov[0]; - server = cifs_pick_channel(ses); - if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; - resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER; + for (i = 0; i < ARRAY_SIZE(resp_buftype); i++) + resp_buftype[i] = CIFS_NO_BUFFER; /* We already have a handle so we can skip the open */ if (cfile) @@ -118,242 +177,277 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, num_rqst++; rc = 0; - /* Operation */ - switch (command) { - case SMB2_OP_QUERY_INFO: - rqst[num_rqst].rq_iov = &vars->qi_iov; - rqst[num_rqst].rq_nvec = 1; - - if (cfile) - rc = SMB2_query_info_init(tcon, server, - &rqst[num_rqst], - cfile->fid.persistent_fid, - cfile->fid.volatile_fid, - FILE_ALL_INFORMATION, - SMB2_O_INFO_FILE, 0, - sizeof(struct smb2_file_all_info) + - PATH_MAX * 2, 0, NULL); - else { - rc = SMB2_query_info_init(tcon, server, - &rqst[num_rqst], - COMPOUND_FID, - COMPOUND_FID, - FILE_ALL_INFORMATION, - SMB2_O_INFO_FILE, 0, - sizeof(struct smb2_file_all_info) + - PATH_MAX * 2, 0, NULL); - if (!rc) { - smb2_set_next_command(tcon, &rqst[num_rqst]); - smb2_set_related(&rqst[num_rqst]); + for (i = 0; i < num_cmds; i++) { + /* Operation */ + switch (cmds[i]) { + case SMB2_OP_QUERY_INFO: + rqst[num_rqst].rq_iov = &vars->qi_iov; + rqst[num_rqst].rq_nvec = 1; + + if (cfile) { + rc = SMB2_query_info_init(tcon, server, + &rqst[num_rqst], + cfile->fid.persistent_fid, + cfile->fid.volatile_fid, + FILE_ALL_INFORMATION, + SMB2_O_INFO_FILE, 0, + sizeof(struct smb2_file_all_info) + + PATH_MAX * 2, 0, NULL); + } else { + rc = SMB2_query_info_init(tcon, server, + &rqst[num_rqst], + COMPOUND_FID, + COMPOUND_FID, + FILE_ALL_INFORMATION, + SMB2_O_INFO_FILE, 0, + sizeof(struct smb2_file_all_info) + + PATH_MAX * 2, 0, NULL); + if (!rc) { + smb2_set_next_command(tcon, &rqst[num_rqst]); + smb2_set_related(&rqst[num_rqst]); + } } - } - if (rc) - goto finished; - num_rqst++; - trace_smb3_query_info_compound_enter(xid, ses->Suid, tcon->tid, - full_path); - break; - case SMB2_OP_POSIX_QUERY_INFO: - rqst[num_rqst].rq_iov = &vars->qi_iov; - rqst[num_rqst].rq_nvec = 1; - - if (cfile) - rc = SMB2_query_info_init(tcon, server, - &rqst[num_rqst], - cfile->fid.persistent_fid, - cfile->fid.volatile_fid, - SMB_FIND_FILE_POSIX_INFO, - SMB2_O_INFO_FILE, 0, + if (rc) + goto finished; + num_rqst++; + trace_smb3_query_info_compound_enter(xid, ses->Suid, + tcon->tid, full_path); + break; + case SMB2_OP_POSIX_QUERY_INFO: + rqst[num_rqst].rq_iov = &vars->qi_iov; + rqst[num_rqst].rq_nvec = 1; + + if (cfile) { /* TBD: fix following to allow for longer SIDs */ - sizeof(struct smb311_posix_qinfo *) + (PATH_MAX * 2) + - (sizeof(struct cifs_sid) * 2), 0, NULL); - else { - rc = SMB2_query_info_init(tcon, server, - &rqst[num_rqst], - COMPOUND_FID, - COMPOUND_FID, - SMB_FIND_FILE_POSIX_INFO, - SMB2_O_INFO_FILE, 0, - sizeof(struct smb311_posix_qinfo *) + (PATH_MAX * 2) + - (sizeof(struct cifs_sid) * 2), 0, NULL); - if (!rc) { - smb2_set_next_command(tcon, &rqst[num_rqst]); - smb2_set_related(&rqst[num_rqst]); + rc = SMB2_query_info_init(tcon, server, + &rqst[num_rqst], + cfile->fid.persistent_fid, + cfile->fid.volatile_fid, + SMB_FIND_FILE_POSIX_INFO, + SMB2_O_INFO_FILE, 0, + sizeof(struct smb311_posix_qinfo *) + + (PATH_MAX * 2) + + (sizeof(struct cifs_sid) * 2), 0, NULL); + } else { + rc = SMB2_query_info_init(tcon, server, + &rqst[num_rqst], + COMPOUND_FID, + COMPOUND_FID, + SMB_FIND_FILE_POSIX_INFO, + SMB2_O_INFO_FILE, 0, + sizeof(struct smb311_posix_qinfo *) + + (PATH_MAX * 2) + + (sizeof(struct cifs_sid) * 2), 0, NULL); + if (!rc) { + smb2_set_next_command(tcon, &rqst[num_rqst]); + smb2_set_related(&rqst[num_rqst]); + } } - } - if (rc) - goto finished; - num_rqst++; - trace_smb3_posix_query_info_compound_enter(xid, ses->Suid, tcon->tid, full_path); - break; - case SMB2_OP_DELETE: - trace_smb3_delete_enter(xid, ses->Suid, tcon->tid, full_path); - break; - case SMB2_OP_MKDIR: - /* - * Directories are created through parameters in the - * SMB2_open() call. - */ - trace_smb3_mkdir_enter(xid, ses->Suid, tcon->tid, full_path); - break; - case SMB2_OP_RMDIR: - rqst[num_rqst].rq_iov = &vars->si_iov[0]; - rqst[num_rqst].rq_nvec = 1; - - size[0] = 1; /* sizeof __u8 See MS-FSCC section 2.4.11 */ - data[0] = &delete_pending[0]; - - rc = SMB2_set_info_init(tcon, server, - &rqst[num_rqst], COMPOUND_FID, - COMPOUND_FID, current->tgid, - FILE_DISPOSITION_INFORMATION, - SMB2_O_INFO_FILE, 0, data, size); - if (rc) - goto finished; - smb2_set_next_command(tcon, &rqst[num_rqst]); - smb2_set_related(&rqst[num_rqst++]); - trace_smb3_rmdir_enter(xid, ses->Suid, tcon->tid, full_path); - break; - case SMB2_OP_SET_EOF: - rqst[num_rqst].rq_iov = &vars->si_iov[0]; - rqst[num_rqst].rq_nvec = 1; + if (rc) + goto finished; + num_rqst++; + trace_smb3_posix_query_info_compound_enter(xid, ses->Suid, + tcon->tid, full_path); + break; + case SMB2_OP_DELETE: + trace_smb3_delete_enter(xid, ses->Suid, tcon->tid, full_path); + break; + case SMB2_OP_MKDIR: + /* + * Directories are created through parameters in the + * SMB2_open() call. + */ + trace_smb3_mkdir_enter(xid, ses->Suid, tcon->tid, full_path); + break; + case SMB2_OP_RMDIR: + rqst[num_rqst].rq_iov = &vars->si_iov[0]; + rqst[num_rqst].rq_nvec = 1; - size[0] = 8; /* sizeof __le64 */ - data[0] = ptr; + size[0] = 1; /* sizeof __u8 See MS-FSCC section 2.4.11 */ + data[0] = &delete_pending[0]; - if (cfile) { - rc = SMB2_set_info_init(tcon, server, - &rqst[num_rqst], - cfile->fid.persistent_fid, - cfile->fid.volatile_fid, - current->tgid, - FILE_END_OF_FILE_INFORMATION, - SMB2_O_INFO_FILE, 0, - data, size); - } else { rc = SMB2_set_info_init(tcon, server, - &rqst[num_rqst], - COMPOUND_FID, - COMPOUND_FID, - current->tgid, - FILE_END_OF_FILE_INFORMATION, - SMB2_O_INFO_FILE, 0, - data, size); - if (!rc) { - smb2_set_next_command(tcon, &rqst[num_rqst]); - smb2_set_related(&rqst[num_rqst]); + &rqst[num_rqst], COMPOUND_FID, + COMPOUND_FID, current->tgid, + FILE_DISPOSITION_INFORMATION, + SMB2_O_INFO_FILE, 0, data, size); + if (rc) + goto finished; + smb2_set_next_command(tcon, &rqst[num_rqst]); + smb2_set_related(&rqst[num_rqst++]); + trace_smb3_rmdir_enter(xid, ses->Suid, tcon->tid, full_path); + break; + case SMB2_OP_SET_EOF: + rqst[num_rqst].rq_iov = &vars->si_iov[0]; + rqst[num_rqst].rq_nvec = 1; + + size[0] = in_iov[i].iov_len; + data[0] = in_iov[i].iov_base; + + if (cfile) { + rc = SMB2_set_info_init(tcon, server, + &rqst[num_rqst], + cfile->fid.persistent_fid, + cfile->fid.volatile_fid, + current->tgid, + FILE_END_OF_FILE_INFORMATION, + SMB2_O_INFO_FILE, 0, + data, size); + } else { + rc = SMB2_set_info_init(tcon, server, + &rqst[num_rqst], + COMPOUND_FID, + COMPOUND_FID, + current->tgid, + FILE_END_OF_FILE_INFORMATION, + SMB2_O_INFO_FILE, 0, + data, size); + if (!rc) { + smb2_set_next_command(tcon, &rqst[num_rqst]); + smb2_set_related(&rqst[num_rqst]); + } } - } - if (rc) - goto finished; - num_rqst++; - trace_smb3_set_eof_enter(xid, ses->Suid, tcon->tid, full_path); - break; - case SMB2_OP_SET_INFO: - rqst[num_rqst].rq_iov = &vars->si_iov[0]; - rqst[num_rqst].rq_nvec = 1; - - - size[0] = sizeof(FILE_BASIC_INFO); - data[0] = ptr; - - if (cfile) - rc = SMB2_set_info_init(tcon, server, - &rqst[num_rqst], - cfile->fid.persistent_fid, - cfile->fid.volatile_fid, current->tgid, - FILE_BASIC_INFORMATION, - SMB2_O_INFO_FILE, 0, data, size); - else { - rc = SMB2_set_info_init(tcon, server, - &rqst[num_rqst], - COMPOUND_FID, - COMPOUND_FID, current->tgid, - FILE_BASIC_INFORMATION, - SMB2_O_INFO_FILE, 0, data, size); - if (!rc) { - smb2_set_next_command(tcon, &rqst[num_rqst]); - smb2_set_related(&rqst[num_rqst]); + if (rc) + goto finished; + num_rqst++; + trace_smb3_set_eof_enter(xid, ses->Suid, tcon->tid, full_path); + break; + case SMB2_OP_SET_INFO: + rqst[num_rqst].rq_iov = &vars->si_iov[0]; + rqst[num_rqst].rq_nvec = 1; + + size[0] = in_iov[i].iov_len; + data[0] = in_iov[i].iov_base; + + if (cfile) { + rc = SMB2_set_info_init(tcon, server, + &rqst[num_rqst], + cfile->fid.persistent_fid, + cfile->fid.volatile_fid, current->tgid, + FILE_BASIC_INFORMATION, + SMB2_O_INFO_FILE, 0, data, size); + } else { + rc = SMB2_set_info_init(tcon, server, + &rqst[num_rqst], + COMPOUND_FID, + COMPOUND_FID, current->tgid, + FILE_BASIC_INFORMATION, + SMB2_O_INFO_FILE, 0, data, size); + if (!rc) { + smb2_set_next_command(tcon, &rqst[num_rqst]); + smb2_set_related(&rqst[num_rqst]); + } } - } - if (rc) - goto finished; - num_rqst++; - trace_smb3_set_info_compound_enter(xid, ses->Suid, tcon->tid, - full_path); - break; - case SMB2_OP_RENAME: - rqst[num_rqst].rq_iov = &vars->si_iov[0]; - rqst[num_rqst].rq_nvec = 2; + if (rc) + goto finished; + num_rqst++; + trace_smb3_set_info_compound_enter(xid, ses->Suid, + tcon->tid, full_path); + break; + case SMB2_OP_RENAME: + rqst[num_rqst].rq_iov = &vars->si_iov[0]; + rqst[num_rqst].rq_nvec = 2; - len = (2 * UniStrnlen((wchar_t *)ptr, PATH_MAX)); + len = in_iov[i].iov_len; - vars->rename_info.ReplaceIfExists = 1; - vars->rename_info.RootDirectory = 0; - vars->rename_info.FileNameLength = cpu_to_le32(len); + vars->rename_info.ReplaceIfExists = 1; + vars->rename_info.RootDirectory = 0; + vars->rename_info.FileNameLength = cpu_to_le32(len); - size[0] = sizeof(struct smb2_file_rename_info); - data[0] = &vars->rename_info; + size[0] = sizeof(struct smb2_file_rename_info); + data[0] = &vars->rename_info; - size[1] = len + 2 /* null */; - data[1] = (__le16 *)ptr; + size[1] = len + 2 /* null */; + data[1] = in_iov[i].iov_base; - if (cfile) - rc = SMB2_set_info_init(tcon, server, - &rqst[num_rqst], - cfile->fid.persistent_fid, - cfile->fid.volatile_fid, - current->tgid, FILE_RENAME_INFORMATION, - SMB2_O_INFO_FILE, 0, data, size); - else { - rc = SMB2_set_info_init(tcon, server, - &rqst[num_rqst], - COMPOUND_FID, COMPOUND_FID, - current->tgid, FILE_RENAME_INFORMATION, - SMB2_O_INFO_FILE, 0, data, size); - if (!rc) { - smb2_set_next_command(tcon, &rqst[num_rqst]); - smb2_set_related(&rqst[num_rqst]); + if (cfile) { + rc = SMB2_set_info_init(tcon, server, + &rqst[num_rqst], + cfile->fid.persistent_fid, + cfile->fid.volatile_fid, + current->tgid, FILE_RENAME_INFORMATION, + SMB2_O_INFO_FILE, 0, data, size); + } else { + rc = SMB2_set_info_init(tcon, server, + &rqst[num_rqst], + COMPOUND_FID, COMPOUND_FID, + current->tgid, FILE_RENAME_INFORMATION, + SMB2_O_INFO_FILE, 0, data, size); + if (!rc) { + smb2_set_next_command(tcon, &rqst[num_rqst]); + smb2_set_related(&rqst[num_rqst]); + } } - } - if (rc) - goto finished; - num_rqst++; - trace_smb3_rename_enter(xid, ses->Suid, tcon->tid, full_path); - break; - case SMB2_OP_HARDLINK: - rqst[num_rqst].rq_iov = &vars->si_iov[0]; - rqst[num_rqst].rq_nvec = 2; + if (rc) + goto finished; + num_rqst++; + trace_smb3_rename_enter(xid, ses->Suid, tcon->tid, full_path); + break; + case SMB2_OP_HARDLINK: + rqst[num_rqst].rq_iov = &vars->si_iov[0]; + rqst[num_rqst].rq_nvec = 2; - len = (2 * UniStrnlen((wchar_t *)ptr, PATH_MAX)); + len = in_iov[i].iov_len; - vars->link_info.ReplaceIfExists = 0; - vars->link_info.RootDirectory = 0; - vars->link_info.FileNameLength = cpu_to_le32(len); + vars->link_info.ReplaceIfExists = 0; + vars->link_info.RootDirectory = 0; + vars->link_info.FileNameLength = cpu_to_le32(len); - size[0] = sizeof(struct smb2_file_link_info); - data[0] = &vars->link_info; + size[0] = sizeof(struct smb2_file_link_info); + data[0] = &vars->link_info; - size[1] = len + 2 /* null */; - data[1] = (__le16 *)ptr; + size[1] = len + 2 /* null */; + data[1] = in_iov[i].iov_base; - rc = SMB2_set_info_init(tcon, server, - &rqst[num_rqst], COMPOUND_FID, - COMPOUND_FID, current->tgid, - FILE_LINK_INFORMATION, - SMB2_O_INFO_FILE, 0, data, size); - if (rc) - goto finished; - smb2_set_next_command(tcon, &rqst[num_rqst]); - smb2_set_related(&rqst[num_rqst++]); - trace_smb3_hardlink_enter(xid, ses->Suid, tcon->tid, full_path); - break; - default: - cifs_dbg(VFS, "Invalid command\n"); - rc = -EINVAL; + rc = SMB2_set_info_init(tcon, server, + &rqst[num_rqst], COMPOUND_FID, + COMPOUND_FID, current->tgid, + FILE_LINK_INFORMATION, + SMB2_O_INFO_FILE, 0, data, size); + if (rc) + goto finished; + smb2_set_next_command(tcon, &rqst[num_rqst]); + smb2_set_related(&rqst[num_rqst++]); + trace_smb3_hardlink_enter(xid, ses->Suid, tcon->tid, full_path); + break; + case SMB2_OP_SET_REPARSE: + rqst[num_rqst].rq_iov = vars->io_iov; + rqst[num_rqst].rq_nvec = ARRAY_SIZE(vars->io_iov); + + rc = SMB2_ioctl_init(tcon, server, &rqst[num_rqst], + COMPOUND_FID, COMPOUND_FID, + FSCTL_SET_REPARSE_POINT, + in_iov[i].iov_base, + in_iov[i].iov_len, 0); + if (rc) + goto finished; + smb2_set_next_command(tcon, &rqst[num_rqst]); + smb2_set_related(&rqst[num_rqst++]); + trace_smb3_set_reparse_compound_enter(xid, ses->Suid, + tcon->tid, full_path); + break; + case SMB2_OP_GET_REPARSE: + rqst[num_rqst].rq_iov = vars->io_iov; + rqst[num_rqst].rq_nvec = ARRAY_SIZE(vars->io_iov); + + rc = SMB2_ioctl_init(tcon, server, &rqst[num_rqst], + COMPOUND_FID, COMPOUND_FID, + FSCTL_GET_REPARSE_POINT, + NULL, 0, CIFSMaxBufSize); + if (rc) + goto finished; + smb2_set_next_command(tcon, &rqst[num_rqst]); + smb2_set_related(&rqst[num_rqst++]); + trace_smb3_get_reparse_compound_enter(xid, ses->Suid, + tcon->tid, full_path); + break; + default: + cifs_dbg(VFS, "Invalid command\n"); + rc = -EINVAL; + } } if (rc) goto finished; @@ -375,157 +469,191 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, num_rqst++; if (cfile) { + if (retries) + for (i = 1; i < num_rqst - 2; i++) + smb2_set_replay(server, &rqst[i]); + rc = compound_send_recv(xid, ses, server, flags, num_rqst - 2, &rqst[1], &resp_buftype[1], &rsp_iov[1]); - } else + } else { + if (retries) + for (i = 0; i < num_rqst; i++) + smb2_set_replay(server, &rqst[i]); + rc = compound_send_recv(xid, ses, server, flags, num_rqst, rqst, resp_buftype, rsp_iov); + } - finished: - SMB2_open_free(&rqst[0]); +finished: + num_rqst = 0; + SMB2_open_free(&rqst[num_rqst++]); if (rc == -EREMCHG) { pr_warn_once("server share %s deleted\n", tcon->tree_name); tcon->need_reconnect = true; } - switch (command) { - case SMB2_OP_QUERY_INFO: - idata = ptr; - if (rc == 0 && cfile && cfile->symlink_target) { - idata->symlink_target = kstrdup(cfile->symlink_target, GFP_KERNEL); - if (!idata->symlink_target) - rc = -ENOMEM; - } - if (rc == 0) { - qi_rsp = (struct smb2_query_info_rsp *) - rsp_iov[1].iov_base; - rc = smb2_validate_and_copy_iov( - le16_to_cpu(qi_rsp->OutputBufferOffset), - le32_to_cpu(qi_rsp->OutputBufferLength), - &rsp_iov[1], sizeof(idata->fi), (char *)&idata->fi); - } - if (rqst[1].rq_iov) - SMB2_query_info_free(&rqst[1]); - if (rqst[2].rq_iov) - SMB2_close_free(&rqst[2]); - if (rc) - trace_smb3_query_info_compound_err(xid, ses->Suid, - tcon->tid, rc); - else - trace_smb3_query_info_compound_done(xid, ses->Suid, - tcon->tid); - break; - case SMB2_OP_POSIX_QUERY_INFO: - idata = ptr; - if (rc == 0 && cfile && cfile->symlink_target) { - idata->symlink_target = kstrdup(cfile->symlink_target, GFP_KERNEL); - if (!idata->symlink_target) - rc = -ENOMEM; - } - if (rc == 0) { - qi_rsp = (struct smb2_query_info_rsp *) - rsp_iov[1].iov_base; - rc = smb2_validate_and_copy_iov( - le16_to_cpu(qi_rsp->OutputBufferOffset), - le32_to_cpu(qi_rsp->OutputBufferLength), - &rsp_iov[1], sizeof(idata->posix_fi) /* add SIDs */, - (char *)&idata->posix_fi); - } - if (rc == 0) { - unsigned int length = le32_to_cpu(qi_rsp->OutputBufferLength); - - if (length > sizeof(idata->posix_fi)) { - char *base = (char *)rsp_iov[1].iov_base + - le16_to_cpu(qi_rsp->OutputBufferOffset) + - sizeof(idata->posix_fi); - *extbuflen = length - sizeof(idata->posix_fi); - *extbuf = kmemdup(base, *extbuflen, GFP_KERNEL); - if (!*extbuf) + for (i = 0; i < num_cmds; i++) { + switch (cmds[i]) { + case SMB2_OP_QUERY_INFO: + idata = in_iov[i].iov_base; + if (rc == 0 && cfile && cfile->symlink_target) { + idata->symlink_target = kstrdup(cfile->symlink_target, GFP_KERNEL); + if (!idata->symlink_target) rc = -ENOMEM; + } + if (rc == 0) { + qi_rsp = (struct smb2_query_info_rsp *) + rsp_iov[i + 1].iov_base; + rc = smb2_validate_and_copy_iov( + le16_to_cpu(qi_rsp->OutputBufferOffset), + le32_to_cpu(qi_rsp->OutputBufferLength), + &rsp_iov[i + 1], sizeof(idata->fi), (char *)&idata->fi); + } + SMB2_query_info_free(&rqst[num_rqst++]); + if (rc) + trace_smb3_query_info_compound_err(xid, ses->Suid, + tcon->tid, rc); + else + trace_smb3_query_info_compound_done(xid, ses->Suid, + tcon->tid); + break; + case SMB2_OP_POSIX_QUERY_INFO: + idata = in_iov[i].iov_base; + if (rc == 0 && cfile && cfile->symlink_target) { + idata->symlink_target = kstrdup(cfile->symlink_target, GFP_KERNEL); + if (!idata->symlink_target) + rc = -ENOMEM; + } + if (rc == 0) { + qi_rsp = (struct smb2_query_info_rsp *) + rsp_iov[i + 1].iov_base; + rc = smb2_validate_and_copy_iov( + le16_to_cpu(qi_rsp->OutputBufferOffset), + le32_to_cpu(qi_rsp->OutputBufferLength), + &rsp_iov[i + 1], sizeof(idata->posix_fi) /* add SIDs */, + (char *)&idata->posix_fi); + } + if (rc == 0) + rc = parse_posix_sids(idata, &rsp_iov[i + 1]); + + SMB2_query_info_free(&rqst[num_rqst++]); + if (rc) + trace_smb3_posix_query_info_compound_err(xid, ses->Suid, + tcon->tid, rc); + else + trace_smb3_posix_query_info_compound_done(xid, ses->Suid, + tcon->tid); + break; + case SMB2_OP_DELETE: + if (rc) + trace_smb3_delete_err(xid, ses->Suid, tcon->tid, rc); + else + trace_smb3_delete_done(xid, ses->Suid, tcon->tid); + break; + case SMB2_OP_MKDIR: + if (rc) + trace_smb3_mkdir_err(xid, ses->Suid, tcon->tid, rc); + else + trace_smb3_mkdir_done(xid, ses->Suid, tcon->tid); + break; + case SMB2_OP_HARDLINK: + if (rc) + trace_smb3_hardlink_err(xid, ses->Suid, tcon->tid, rc); + else + trace_smb3_hardlink_done(xid, ses->Suid, tcon->tid); + SMB2_set_info_free(&rqst[num_rqst++]); + break; + case SMB2_OP_RENAME: + if (rc) + trace_smb3_rename_err(xid, ses->Suid, tcon->tid, rc); + else + trace_smb3_rename_done(xid, ses->Suid, tcon->tid); + SMB2_set_info_free(&rqst[num_rqst++]); + break; + case SMB2_OP_RMDIR: + if (rc) + trace_smb3_rmdir_err(xid, ses->Suid, tcon->tid, rc); + else + trace_smb3_rmdir_done(xid, ses->Suid, tcon->tid); + SMB2_set_info_free(&rqst[num_rqst++]); + break; + case SMB2_OP_SET_EOF: + if (rc) + trace_smb3_set_eof_err(xid, ses->Suid, tcon->tid, rc); + else + trace_smb3_set_eof_done(xid, ses->Suid, tcon->tid); + SMB2_set_info_free(&rqst[num_rqst++]); + break; + case SMB2_OP_SET_INFO: + if (rc) + trace_smb3_set_info_compound_err(xid, ses->Suid, + tcon->tid, rc); + else + trace_smb3_set_info_compound_done(xid, ses->Suid, + tcon->tid); + SMB2_set_info_free(&rqst[num_rqst++]); + break; + case SMB2_OP_SET_REPARSE: + if (rc) { + trace_smb3_set_reparse_compound_err(xid, ses->Suid, + tcon->tid, rc); + } else { + trace_smb3_set_reparse_compound_done(xid, ses->Suid, + tcon->tid); + } + SMB2_ioctl_free(&rqst[num_rqst++]); + break; + case SMB2_OP_GET_REPARSE: + if (!rc) { + iov = &rsp_iov[i + 1]; + idata = in_iov[i].iov_base; + idata->reparse.io.iov = *iov; + idata->reparse.io.buftype = resp_buftype[i + 1]; + rbuf = reparse_buf_ptr(iov); + if (IS_ERR(rbuf)) { + rc = PTR_ERR(rbuf); + trace_smb3_set_reparse_compound_err(xid, ses->Suid, + tcon->tid, rc); + } else { + idata->reparse.tag = le32_to_cpu(rbuf->ReparseTag); + trace_smb3_set_reparse_compound_done(xid, ses->Suid, + tcon->tid); + } + memset(iov, 0, sizeof(*iov)); + resp_buftype[i + 1] = CIFS_NO_BUFFER; } else { - rc = -EINVAL; + trace_smb3_set_reparse_compound_err(xid, ses->Suid, + tcon->tid, rc); } + SMB2_ioctl_free(&rqst[num_rqst++]); + break; } - if (rqst[1].rq_iov) - SMB2_query_info_free(&rqst[1]); - if (rqst[2].rq_iov) - SMB2_close_free(&rqst[2]); - if (rc) - trace_smb3_posix_query_info_compound_err(xid, ses->Suid, tcon->tid, rc); - else - trace_smb3_posix_query_info_compound_done(xid, ses->Suid, tcon->tid); - break; - case SMB2_OP_DELETE: - if (rc) - trace_smb3_delete_err(xid, ses->Suid, tcon->tid, rc); - else - trace_smb3_delete_done(xid, ses->Suid, tcon->tid); - if (rqst[1].rq_iov) - SMB2_close_free(&rqst[1]); - break; - case SMB2_OP_MKDIR: - if (rc) - trace_smb3_mkdir_err(xid, ses->Suid, tcon->tid, rc); - else - trace_smb3_mkdir_done(xid, ses->Suid, tcon->tid); - if (rqst[1].rq_iov) - SMB2_close_free(&rqst[1]); - break; - case SMB2_OP_HARDLINK: - if (rc) - trace_smb3_hardlink_err(xid, ses->Suid, tcon->tid, rc); - else - trace_smb3_hardlink_done(xid, ses->Suid, tcon->tid); - free_set_inf_compound(rqst); - break; - case SMB2_OP_RENAME: - if (rc) - trace_smb3_rename_err(xid, ses->Suid, tcon->tid, rc); - else - trace_smb3_rename_done(xid, ses->Suid, tcon->tid); - free_set_inf_compound(rqst); - break; - case SMB2_OP_RMDIR: - if (rc) - trace_smb3_rmdir_err(xid, ses->Suid, tcon->tid, rc); - else - trace_smb3_rmdir_done(xid, ses->Suid, tcon->tid); - free_set_inf_compound(rqst); - break; - case SMB2_OP_SET_EOF: - if (rc) - trace_smb3_set_eof_err(xid, ses->Suid, tcon->tid, rc); - else - trace_smb3_set_eof_done(xid, ses->Suid, tcon->tid); - free_set_inf_compound(rqst); - break; - case SMB2_OP_SET_INFO: - if (rc) - trace_smb3_set_info_compound_err(xid, ses->Suid, - tcon->tid, rc); - else - trace_smb3_set_info_compound_done(xid, ses->Suid, - tcon->tid); - free_set_inf_compound(rqst); - break; } + SMB2_close_free(&rqst[num_rqst]); - if (cfile) - cifsFileInfo_put(cfile); - + num_cmds += 2; if (out_iov && out_buftype) { - memcpy(out_iov, rsp_iov, 3 * sizeof(*out_iov)); - memcpy(out_buftype, resp_buftype, 3 * sizeof(*out_buftype)); + memcpy(out_iov, rsp_iov, num_cmds * sizeof(*out_iov)); + memcpy(out_buftype, resp_buftype, + num_cmds * sizeof(*out_buftype)); } else { - free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); - free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); - free_rsp_buf(resp_buftype[2], rsp_iov[2].iov_base); + for (i = 0; i < num_cmds; i++) + free_rsp_buf(resp_buftype[i], rsp_iov[i].iov_base); } + num_cmds -= 2; /* correct num_cmds as there could be a retry */ kfree(vars); + + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + + if (cfile) + cifsFileInfo_put(cfile); + return rc; } @@ -569,34 +697,57 @@ int smb2_query_path_info(const unsigned int xid, struct cifsFileInfo *cfile; struct cached_fid *cfid = NULL; struct smb2_hdr *hdr; - struct kvec out_iov[3] = {}; + struct kvec in_iov[2], out_iov[3] = {}; int out_buftype[3] = {}; + int cmds[2]; bool islink; + int i, num_cmds; int rc, rc2; data->adjust_tz = false; data->reparse_point = false; - if (strcmp(full_path, "")) - rc = -ENOENT; - else - rc = open_cached_dir(xid, tcon, full_path, cifs_sb, false, &cfid); - /* If it is a root and its handle is cached then use it */ - if (!rc) { - if (cfid->file_all_info_is_valid) { - memcpy(&data->fi, &cfid->file_all_info, sizeof(data->fi)); + /* + * BB TODO: Add support for using cached root handle in SMB3.1.1 POSIX. + * Create SMB2_query_posix_info worker function to do non-compounded + * query when we already have an open file handle for this. For now this + * is fast enough (always using the compounded version). + */ + if (!tcon->posix_extensions) { + if (*full_path) { + rc = -ENOENT; } else { - rc = SMB2_query_info(xid, tcon, cfid->fid.persistent_fid, - cfid->fid.volatile_fid, &data->fi); + rc = open_cached_dir(xid, tcon, full_path, + cifs_sb, false, &cfid); } - close_cached_dir(cfid); - return rc; + /* If it is a root and its handle is cached then use it */ + if (!rc) { + if (cfid->file_all_info_is_valid) { + memcpy(&data->fi, &cfid->file_all_info, + sizeof(data->fi)); + } else { + rc = SMB2_query_info(xid, tcon, + cfid->fid.persistent_fid, + cfid->fid.volatile_fid, + &data->fi); + } + close_cached_dir(cfid); + return rc; + } + cmds[0] = SMB2_OP_QUERY_INFO; + } else { + cmds[0] = SMB2_OP_POSIX_QUERY_INFO; } + in_iov[0].iov_base = data; + in_iov[0].iov_len = sizeof(*data); + in_iov[1] = in_iov[0]; + cifs_get_readable_path(tcon, full_path, &cfile); - rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN, - create_options, ACL_NO_MODE, data, SMB2_OP_QUERY_INFO, cfile, - NULL, NULL, out_iov, out_buftype); + rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, + FILE_READ_ATTRIBUTES, FILE_OPEN, + create_options, ACL_NO_MODE, in_iov, + cmds, 1, cfile, out_iov, out_buftype); hdr = out_iov[0].iov_base; /* * If first iov is unset, then SMB session was dropped or we've got a @@ -608,18 +759,27 @@ int smb2_query_path_info(const unsigned int xid, switch (rc) { case 0: case -EOPNOTSUPP: + /* + * BB TODO: When support for special files added to Samba + * re-verify this path. + */ rc = parse_create_response(data, cifs_sb, &out_iov[0]); if (rc || !data->reparse_point) goto out; + if (data->reparse.tag == IO_REPARSE_TAG_SYMLINK) { + /* symlink already parsed in create response */ + num_cmds = 1; + } else { + cmds[1] = SMB2_OP_GET_REPARSE; + num_cmds = 2; + } create_options |= OPEN_REPARSE_POINT; - /* Failed on a symbolic link - query a reparse point info */ cifs_get_readable_path(tcon, full_path, &cfile); rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN, - create_options, ACL_NO_MODE, data, - SMB2_OP_QUERY_INFO, cfile, NULL, NULL, - NULL, NULL); + create_options, ACL_NO_MODE, in_iov, + cmds, num_cmds, cfile, NULL, NULL); break; case -EREMOTE: break; @@ -637,93 +797,8 @@ int smb2_query_path_info(const unsigned int xid, } out: - free_rsp_buf(out_buftype[0], out_iov[0].iov_base); - free_rsp_buf(out_buftype[1], out_iov[1].iov_base); - free_rsp_buf(out_buftype[2], out_iov[2].iov_base); - return rc; -} - -int smb311_posix_query_path_info(const unsigned int xid, - struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, - const char *full_path, - struct cifs_open_info_data *data, - struct cifs_sid *owner, - struct cifs_sid *group) -{ - int rc; - __u32 create_options = 0; - struct cifsFileInfo *cfile; - struct kvec out_iov[3] = {}; - int out_buftype[3] = {}; - __u8 *sidsbuf = NULL; - __u8 *sidsbuf_end = NULL; - size_t sidsbuflen = 0; - size_t owner_len, group_len; - - data->adjust_tz = false; - data->reparse_point = false; - - /* - * BB TODO: Add support for using the cached root handle. - * Create SMB2_query_posix_info worker function to do non-compounded query - * when we already have an open file handle for this. For now this is fast enough - * (always using the compounded version). - */ - - cifs_get_readable_path(tcon, full_path, &cfile); - rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN, - create_options, ACL_NO_MODE, data, SMB2_OP_POSIX_QUERY_INFO, cfile, - &sidsbuf, &sidsbuflen, out_iov, out_buftype); - /* - * If first iov is unset, then SMB session was dropped or we've got a - * cached open file (@cfile). - */ - if (!out_iov[0].iov_base || out_buftype[0] == CIFS_NO_BUFFER) - goto out; - - switch (rc) { - case 0: - case -EOPNOTSUPP: - /* BB TODO: When support for special files added to Samba re-verify this path */ - rc = parse_create_response(data, cifs_sb, &out_iov[0]); - if (rc || !data->reparse_point) - goto out; - - create_options |= OPEN_REPARSE_POINT; - /* Failed on a symbolic link - query a reparse point info */ - cifs_get_readable_path(tcon, full_path, &cfile); - rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, - FILE_OPEN, create_options, ACL_NO_MODE, data, - SMB2_OP_POSIX_QUERY_INFO, cfile, - &sidsbuf, &sidsbuflen, NULL, NULL); - break; - } - -out: - if (rc == 0) { - sidsbuf_end = sidsbuf + sidsbuflen; - - owner_len = posix_info_sid_size(sidsbuf, sidsbuf_end); - if (owner_len == -1) { - rc = -EINVAL; - goto out; - } - memcpy(owner, sidsbuf, owner_len); - - group_len = posix_info_sid_size( - sidsbuf + owner_len, sidsbuf_end); - if (group_len == -1) { - rc = -EINVAL; - goto out; - } - memcpy(group, sidsbuf + owner_len, group_len); - } - - kfree(sidsbuf); - free_rsp_buf(out_buftype[0], out_iov[0].iov_base); - free_rsp_buf(out_buftype[1], out_iov[1].iov_base); - free_rsp_buf(out_buftype[2], out_iov[2].iov_base); + for (i = 0; i < ARRAY_SIZE(out_buftype); i++) + free_rsp_buf(out_buftype[i], out_iov[i].iov_base); return rc; } @@ -734,8 +809,9 @@ smb2_mkdir(const unsigned int xid, struct inode *parent_inode, umode_t mode, { return smb2_compound_op(xid, tcon, cifs_sb, name, FILE_WRITE_ATTRIBUTES, FILE_CREATE, - CREATE_NOT_FILE, mode, NULL, SMB2_OP_MKDIR, - NULL, NULL, NULL, NULL, NULL); + CREATE_NOT_FILE, mode, + NULL, &(int){SMB2_OP_MKDIR}, 1, + NULL, NULL, NULL); } void @@ -743,21 +819,24 @@ smb2_mkdir_setinfo(struct inode *inode, const char *name, struct cifs_sb_info *cifs_sb, struct cifs_tcon *tcon, const unsigned int xid) { - FILE_BASIC_INFO data; + FILE_BASIC_INFO data = {}; struct cifsInodeInfo *cifs_i; struct cifsFileInfo *cfile; + struct kvec in_iov; u32 dosattrs; int tmprc; - memset(&data, 0, sizeof(data)); + in_iov.iov_base = &data; + in_iov.iov_len = sizeof(data); cifs_i = CIFS_I(inode); dosattrs = cifs_i->cifsAttrs | ATTR_READONLY; data.Attributes = cpu_to_le32(dosattrs); cifs_get_writable_path(tcon, name, FIND_WR_ANY, &cfile); tmprc = smb2_compound_op(xid, tcon, cifs_sb, name, FILE_WRITE_ATTRIBUTES, FILE_CREATE, - CREATE_NOT_FILE, ACL_NO_MODE, - &data, SMB2_OP_SET_INFO, cfile, NULL, NULL, NULL, NULL); + CREATE_NOT_FILE, ACL_NO_MODE, &in_iov, + &(int){SMB2_OP_SET_INFO}, 1, + cfile, NULL, NULL); if (tmprc == 0) cifs_i->cifsAttrs = dosattrs; } @@ -767,9 +846,11 @@ smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name, struct cifs_sb_info *cifs_sb) { drop_cached_dir_by_name(xid, tcon, name, cifs_sb); - return smb2_compound_op(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN, - CREATE_NOT_FILE, ACL_NO_MODE, - NULL, SMB2_OP_RMDIR, NULL, NULL, NULL, NULL, NULL); + return smb2_compound_op(xid, tcon, cifs_sb, name, + DELETE, FILE_OPEN, CREATE_NOT_FILE, + ACL_NO_MODE, NULL, + &(int){SMB2_OP_RMDIR}, 1, + NULL, NULL, NULL); } int @@ -778,15 +859,18 @@ smb2_unlink(const unsigned int xid, struct cifs_tcon *tcon, const char *name, { return smb2_compound_op(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN, CREATE_DELETE_ON_CLOSE | OPEN_REPARSE_POINT, - ACL_NO_MODE, NULL, SMB2_OP_DELETE, NULL, NULL, NULL, NULL, NULL); + ACL_NO_MODE, NULL, + &(int){SMB2_OP_DELETE}, 1, + NULL, NULL, NULL); } -static int -smb2_set_path_attr(const unsigned int xid, struct cifs_tcon *tcon, - const char *from_name, const char *to_name, - struct cifs_sb_info *cifs_sb, __u32 access, int command, - struct cifsFileInfo *cfile) +static int smb2_set_path_attr(const unsigned int xid, struct cifs_tcon *tcon, + const char *from_name, const char *to_name, + struct cifs_sb_info *cifs_sb, + __u32 create_options, __u32 access, + int command, struct cifsFileInfo *cfile) { + struct kvec in_iov; __le16 *smb2_to_name = NULL; int rc; @@ -795,36 +879,43 @@ smb2_set_path_attr(const unsigned int xid, struct cifs_tcon *tcon, rc = -ENOMEM; goto smb2_rename_path; } + in_iov.iov_base = smb2_to_name; + in_iov.iov_len = 2 * UniStrnlen((wchar_t *)smb2_to_name, PATH_MAX); rc = smb2_compound_op(xid, tcon, cifs_sb, from_name, access, - FILE_OPEN, 0, ACL_NO_MODE, smb2_to_name, - command, cfile, NULL, NULL, NULL, NULL); + FILE_OPEN, create_options, ACL_NO_MODE, + &in_iov, &command, 1, cfile, NULL, NULL); smb2_rename_path: kfree(smb2_to_name); return rc; } -int -smb2_rename_path(const unsigned int xid, struct cifs_tcon *tcon, - const char *from_name, const char *to_name, - struct cifs_sb_info *cifs_sb) +int smb2_rename_path(const unsigned int xid, + struct cifs_tcon *tcon, + struct dentry *source_dentry, + const char *from_name, const char *to_name, + struct cifs_sb_info *cifs_sb) { struct cifsFileInfo *cfile; + __u32 co = file_create_options(source_dentry); drop_cached_dir_by_name(xid, tcon, from_name, cifs_sb); cifs_get_writable_path(tcon, from_name, FIND_WR_WITH_DELETE, &cfile); - return smb2_set_path_attr(xid, tcon, from_name, to_name, - cifs_sb, DELETE, SMB2_OP_RENAME, cfile); + return smb2_set_path_attr(xid, tcon, from_name, to_name, cifs_sb, + co, DELETE, SMB2_OP_RENAME, cfile); } -int -smb2_create_hardlink(const unsigned int xid, struct cifs_tcon *tcon, - const char *from_name, const char *to_name, - struct cifs_sb_info *cifs_sb) +int smb2_create_hardlink(const unsigned int xid, + struct cifs_tcon *tcon, + struct dentry *source_dentry, + const char *from_name, const char *to_name, + struct cifs_sb_info *cifs_sb) { - return smb2_set_path_attr(xid, tcon, from_name, to_name, cifs_sb, - FILE_READ_ATTRIBUTES, SMB2_OP_HARDLINK, - NULL); + __u32 co = file_create_options(source_dentry); + + return smb2_set_path_attr(xid, tcon, from_name, to_name, + cifs_sb, co, FILE_READ_ATTRIBUTES, + SMB2_OP_HARDLINK, NULL); } int @@ -832,13 +923,18 @@ smb2_set_path_size(const unsigned int xid, struct cifs_tcon *tcon, const char *full_path, __u64 size, struct cifs_sb_info *cifs_sb, bool set_alloc) { - __le64 eof = cpu_to_le64(size); struct cifsFileInfo *cfile; + struct kvec in_iov; + __le64 eof = cpu_to_le64(size); + in_iov.iov_base = &eof; + in_iov.iov_len = sizeof(eof); cifs_get_writable_path(tcon, full_path, FIND_WR_ANY, &cfile); return smb2_compound_op(xid, tcon, cifs_sb, full_path, - FILE_WRITE_DATA, FILE_OPEN, 0, ACL_NO_MODE, - &eof, SMB2_OP_SET_EOF, cfile, NULL, NULL, NULL, NULL); + FILE_WRITE_DATA, FILE_OPEN, + 0, ACL_NO_MODE, &in_iov, + &(int){SMB2_OP_SET_EOF}, 1, + cfile, NULL, NULL); } int @@ -849,6 +945,7 @@ smb2_set_file_info(struct inode *inode, const char *full_path, struct tcon_link *tlink; struct cifs_tcon *tcon; struct cifsFileInfo *cfile; + struct kvec in_iov = { .iov_base = buf, .iov_len = sizeof(*buf), }; int rc; if ((buf->CreationTime == 0) && (buf->LastAccessTime == 0) && @@ -864,8 +961,91 @@ smb2_set_file_info(struct inode *inode, const char *full_path, cifs_get_writable_path(tcon, full_path, FIND_WR_ANY, &cfile); rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_WRITE_ATTRIBUTES, FILE_OPEN, - 0, ACL_NO_MODE, buf, SMB2_OP_SET_INFO, cfile, - NULL, NULL, NULL, NULL); + 0, ACL_NO_MODE, &in_iov, + &(int){SMB2_OP_SET_INFO}, 1, + cfile, NULL, NULL); cifs_put_tlink(tlink); return rc; } + +struct inode *smb2_get_reparse_inode(struct cifs_open_info_data *data, + struct super_block *sb, + const unsigned int xid, + struct cifs_tcon *tcon, + const char *full_path, + struct kvec *iov) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + struct cifsFileInfo *cfile; + struct inode *new = NULL; + struct kvec in_iov[2]; + int cmds[2]; + int da, co, cd; + int rc; + + da = SYNCHRONIZE | DELETE | + FILE_READ_ATTRIBUTES | + FILE_WRITE_ATTRIBUTES; + co = CREATE_NOT_DIR | OPEN_REPARSE_POINT; + cd = FILE_CREATE; + cmds[0] = SMB2_OP_SET_REPARSE; + in_iov[0] = *iov; + in_iov[1].iov_base = data; + in_iov[1].iov_len = sizeof(*data); + + if (tcon->posix_extensions) { + cmds[1] = SMB2_OP_POSIX_QUERY_INFO; + cifs_get_writable_path(tcon, full_path, FIND_WR_ANY, &cfile); + rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, + da, cd, co, ACL_NO_MODE, in_iov, + cmds, 2, cfile, NULL, NULL); + if (!rc) { + rc = smb311_posix_get_inode_info(&new, full_path, + data, sb, xid); + } + } else { + cmds[1] = SMB2_OP_QUERY_INFO; + cifs_get_writable_path(tcon, full_path, FIND_WR_ANY, &cfile); + rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, + da, cd, co, ACL_NO_MODE, in_iov, + cmds, 2, cfile, NULL, NULL); + if (!rc) { + rc = cifs_get_inode_info(&new, full_path, + data, sb, xid, NULL); + } + } + return rc ? ERR_PTR(rc) : new; +} + +int smb2_query_reparse_point(const unsigned int xid, + struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, + const char *full_path, + u32 *tag, struct kvec *rsp, + int *rsp_buftype) +{ + struct cifs_open_info_data data = {}; + struct cifsFileInfo *cfile; + struct kvec in_iov = { .iov_base = &data, .iov_len = sizeof(data), }; + int rc; + + cifs_dbg(FYI, "%s: path: %s\n", __func__, full_path); + + cifs_get_readable_path(tcon, full_path, &cfile); + rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, + FILE_READ_ATTRIBUTES, FILE_OPEN, + OPEN_REPARSE_POINT, ACL_NO_MODE, &in_iov, + &(int){SMB2_OP_GET_REPARSE}, 1, + cfile, NULL, NULL); + if (rc) + goto out; + + *tag = data.reparse.tag; + *rsp = data.reparse.io.iov; + *rsp_buftype = data.reparse.io.buftype; + memset(&data.reparse.io.iov, 0, sizeof(data.reparse.io.iov)); + data.reparse.io.buftype = CIFS_NO_BUFFER; +out: + cifs_free_open_info(&data); + return rc; +} diff --git a/fs/smb/client/smb2maperror.c b/fs/smb/client/smb2maperror.c index 1a90dd78b238..ac1895358908 100644 --- a/fs/smb/client/smb2maperror.c +++ b/fs/smb/client/smb2maperror.c @@ -1210,6 +1210,8 @@ static const struct status_to_posix_error smb2_error_map_table[] = { {STATUS_INVALID_TASK_INDEX, -EIO, "STATUS_INVALID_TASK_INDEX"}, {STATUS_THREAD_ALREADY_IN_TASK, -EIO, "STATUS_THREAD_ALREADY_IN_TASK"}, {STATUS_CALLBACK_BYPASS, -EIO, "STATUS_CALLBACK_BYPASS"}, + {STATUS_SERVER_UNAVAILABLE, -EAGAIN, "STATUS_SERVER_UNAVAILABLE"}, + {STATUS_FILE_NOT_AVAILABLE, -EAGAIN, "STATUS_FILE_NOT_AVAILABLE"}, {STATUS_PORT_CLOSED, -EIO, "STATUS_PORT_CLOSED"}, {STATUS_MESSAGE_LOST, -EIO, "STATUS_MESSAGE_LOST"}, {STATUS_INVALID_MESSAGE, -EIO, "STATUS_INVALID_MESSAGE"}, diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index 14bc745de199..83c898afc835 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -614,7 +614,8 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf, "multichannel not available\n" "Empty network interface list returned by server %s\n", ses->server->hostname); - rc = -EINVAL; + rc = -EOPNOTSUPP; + ses->iface_last_update = jiffies; goto out; } @@ -712,7 +713,6 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf, ses->iface_count++; spin_unlock(&ses->iface_lock); - ses->iface_last_update = jiffies; next_iface: nb_iface++; next = le32_to_cpu(p->Next); @@ -734,11 +734,7 @@ next_iface: if ((bytes_left > 8) || p->Next) cifs_dbg(VFS, "%s: incomplete interface info\n", __func__); - - if (!ses->iface_count) { - rc = -EINVAL; - goto out; - } + ses->iface_last_update = jiffies; out: /* @@ -1112,7 +1108,7 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, { struct smb2_compound_vars *vars; struct cifs_ses *ses = tcon->ses; - struct TCP_Server_Info *server = cifs_pick_channel(ses); + struct TCP_Server_Info *server; struct smb_rqst *rqst; struct kvec *rsp_iov; __le16 *utf16_path = NULL; @@ -1128,6 +1124,13 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, struct smb2_file_full_ea_info *ea = NULL; struct smb2_query_info_rsp *rsp; int rc, used_len = 0; + int retries = 0, cur_sleep = 1; + +replay_again: + /* reinitialize for possible replay */ + flags = CIFS_CP_CREATE_CLOSE_OP; + oplock = SMB2_OPLOCK_LEVEL_NONE; + server = cifs_pick_channel(ses); if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; @@ -1248,6 +1251,12 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, goto sea_exit; smb2_set_related(&rqst[2]); + if (retries) { + smb2_set_replay(server, &rqst[0]); + smb2_set_replay(server, &rqst[1]); + smb2_set_replay(server, &rqst[2]); + } + rc = compound_send_recv(xid, ses, server, flags, 3, rqst, resp_buftype, rsp_iov); @@ -1264,6 +1273,11 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, kfree(vars); out_free_path: kfree(utf16_path); + + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } #endif @@ -1488,7 +1502,7 @@ smb2_ioctl_query_info(const unsigned int xid, struct smb_rqst *rqst; struct kvec *rsp_iov; struct cifs_ses *ses = tcon->ses; - struct TCP_Server_Info *server = cifs_pick_channel(ses); + struct TCP_Server_Info *server; char __user *arg = (char __user *)p; struct smb_query_info qi; struct smb_query_info __user *pqi; @@ -1505,6 +1519,13 @@ smb2_ioctl_query_info(const unsigned int xid, void *data[2]; int create_options = is_dir ? CREATE_NOT_FILE : CREATE_NOT_DIR; void (*free_req1_func)(struct smb_rqst *r); + int retries = 0, cur_sleep = 1; + +replay_again: + /* reinitialize for possible replay */ + flags = CIFS_CP_CREATE_CLOSE_OP; + oplock = SMB2_OPLOCK_LEVEL_NONE; + server = cifs_pick_channel(ses); vars = kzalloc(sizeof(*vars), GFP_ATOMIC); if (vars == NULL) @@ -1645,6 +1666,12 @@ smb2_ioctl_query_info(const unsigned int xid, goto free_req_1; smb2_set_related(&rqst[2]); + if (retries) { + smb2_set_replay(server, &rqst[0]); + smb2_set_replay(server, &rqst[1]); + smb2_set_replay(server, &rqst[2]); + } + rc = compound_send_recv(xid, ses, server, flags, 3, rqst, resp_buftype, rsp_iov); @@ -1705,6 +1732,11 @@ free_output_buffer: kfree(buffer); free_vars: kfree(vars); + + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } @@ -1935,7 +1967,6 @@ static int smb2_set_file_size(const unsigned int xid, struct cifs_tcon *tcon, struct cifsFileInfo *cfile, __u64 size, bool set_alloc) { - __le64 eof = cpu_to_le64(size); struct inode *inode; /* @@ -1952,7 +1983,7 @@ smb2_set_file_size(const unsigned int xid, struct cifs_tcon *tcon, } return SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid, - cfile->fid.volatile_fid, cfile->pid, &eof); + cfile->fid.volatile_fid, cfile->pid, size); } static int @@ -2232,8 +2263,14 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_open_parms oparms; struct smb2_query_directory_rsp *qd_rsp = NULL; struct smb2_create_rsp *op_rsp = NULL; - struct TCP_Server_Info *server = cifs_pick_channel(tcon->ses); - int retry_count = 0; + struct TCP_Server_Info *server; + int retries = 0, cur_sleep = 1; + +replay_again: + /* reinitialize for possible replay */ + flags = 0; + oplock = SMB2_OPLOCK_LEVEL_NONE; + server = cifs_pick_channel(tcon->ses); utf16_path = cifs_convert_path_to_utf16(path, cifs_sb); if (!utf16_path) @@ -2283,14 +2320,15 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon, smb2_set_related(&rqst[1]); -again: + if (retries) { + smb2_set_replay(server, &rqst[0]); + smb2_set_replay(server, &rqst[1]); + } + rc = compound_send_recv(xid, tcon->ses, server, flags, 2, rqst, resp_buftype, rsp_iov); - if (rc == -EAGAIN && retry_count++ < 10) - goto again; - /* If the open failed there is nothing to do */ op_rsp = (struct smb2_create_rsp *)rsp_iov[0].iov_base; if (op_rsp == NULL || op_rsp->hdr.Status != STATUS_SUCCESS) { @@ -2338,6 +2376,11 @@ again: SMB2_query_directory_free(&rqst[1]); free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); + + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } @@ -2463,6 +2506,22 @@ smb2_oplock_response(struct cifs_tcon *tcon, __u64 persistent_fid, } void +smb2_set_replay(struct TCP_Server_Info *server, struct smb_rqst *rqst) +{ + struct smb2_hdr *shdr; + + if (server->dialect < SMB30_PROT_ID) + return; + + shdr = (struct smb2_hdr *)(rqst->rq_iov[0].iov_base); + if (shdr == NULL) { + cifs_dbg(FYI, "shdr NULL in smb2_set_related\n"); + return; + } + shdr->Flags |= SMB2_FLAGS_REPLAY_OPERATION; +} + +void smb2_set_related(struct smb_rqst *rqst) { struct smb2_hdr *shdr; @@ -2535,6 +2594,27 @@ smb2_set_next_command(struct cifs_tcon *tcon, struct smb_rqst *rqst) } /* + * helper function for exponential backoff and check if replayable + */ +bool smb2_should_replay(struct cifs_tcon *tcon, + int *pretries, + int *pcur_sleep) +{ + if (!pretries || !pcur_sleep) + return false; + + if (tcon->retry || (*pretries)++ < tcon->ses->server->retrans) { + msleep(*pcur_sleep); + (*pcur_sleep) = ((*pcur_sleep) << 1); + if ((*pcur_sleep) > CIFS_MAX_SLEEP) + (*pcur_sleep) = CIFS_MAX_SLEEP; + return true; + } + + return false; +} + +/* * Passes the query info response back to the caller on success. * Caller need to free this with free_rsp_buf(). */ @@ -2547,7 +2627,7 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon, { struct smb2_compound_vars *vars; struct cifs_ses *ses = tcon->ses; - struct TCP_Server_Info *server = cifs_pick_channel(ses); + struct TCP_Server_Info *server; int flags = CIFS_CP_CREATE_CLOSE_OP; struct smb_rqst *rqst; int resp_buftype[3]; @@ -2558,6 +2638,13 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon, int rc; __le16 *utf16_path; struct cached_fid *cfid = NULL; + int retries = 0, cur_sleep = 1; + +replay_again: + /* reinitialize for possible replay */ + flags = CIFS_CP_CREATE_CLOSE_OP; + oplock = SMB2_OPLOCK_LEVEL_NONE; + server = cifs_pick_channel(ses); if (!path) path = ""; @@ -2638,6 +2725,14 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon, goto qic_exit; smb2_set_related(&rqst[2]); + if (retries) { + if (!cfid) { + smb2_set_replay(server, &rqst[0]); + smb2_set_replay(server, &rqst[2]); + } + smb2_set_replay(server, &rqst[1]); + } + if (cfid) { rc = compound_send_recv(xid, ses, server, flags, 1, &rqst[1], @@ -2670,6 +2765,11 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon, kfree(vars); out_free_path: kfree(utf16_path); + + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } @@ -2948,18 +3048,6 @@ int parse_reparse_point(struct reparse_data_buffer *buf, u32 plen, struct cifs_sb_info *cifs_sb, bool unicode, struct cifs_open_info_data *data) { - if (plen < sizeof(*buf)) { - cifs_dbg(VFS, "%s: reparse buffer is too small. Must be at least 8 bytes but was %d\n", - __func__, plen); - return -EIO; - } - - if (plen < le16_to_cpu(buf->ReparseDataLength) + sizeof(*buf)) { - cifs_dbg(VFS, "%s: invalid reparse buf length: %d\n", - __func__, plen); - return -EIO; - } - data->reparse.buf = buf; /* See MS-FSCC 2.1.2 */ @@ -2997,145 +3085,6 @@ static int smb2_parse_reparse_point(struct cifs_sb_info *cifs_sb, return parse_reparse_point(buf, plen, cifs_sb, true, data); } -static int smb2_query_reparse_point(const unsigned int xid, - struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, - const char *full_path, - u32 *tag, struct kvec *rsp, - int *rsp_buftype) -{ - struct smb2_compound_vars *vars; - int rc; - __le16 *utf16_path = NULL; - __u8 oplock = SMB2_OPLOCK_LEVEL_NONE; - struct cifs_open_parms oparms; - struct cifs_fid fid; - struct TCP_Server_Info *server = cifs_pick_channel(tcon->ses); - int flags = CIFS_CP_CREATE_CLOSE_OP; - struct smb_rqst *rqst; - int resp_buftype[3]; - struct kvec *rsp_iov; - struct smb2_ioctl_rsp *ioctl_rsp; - struct reparse_data_buffer *reparse_buf; - u32 off, count, len; - - cifs_dbg(FYI, "%s: path: %s\n", __func__, full_path); - - if (smb3_encryption_required(tcon)) - flags |= CIFS_TRANSFORM_REQ; - - utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb); - if (!utf16_path) - return -ENOMEM; - - resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER; - vars = kzalloc(sizeof(*vars), GFP_KERNEL); - if (!vars) { - rc = -ENOMEM; - goto out_free_path; - } - rqst = vars->rqst; - rsp_iov = vars->rsp_iov; - - /* - * setup smb2open - TODO add optimization to call cifs_get_readable_path - * to see if there is a handle already open that we can use - */ - rqst[0].rq_iov = vars->open_iov; - rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE; - - oparms = (struct cifs_open_parms) { - .tcon = tcon, - .path = full_path, - .desired_access = FILE_READ_ATTRIBUTES, - .disposition = FILE_OPEN, - .create_options = cifs_create_options(cifs_sb, OPEN_REPARSE_POINT), - .fid = &fid, - }; - - rc = SMB2_open_init(tcon, server, - &rqst[0], &oplock, &oparms, utf16_path); - if (rc) - goto query_rp_exit; - smb2_set_next_command(tcon, &rqst[0]); - - - /* IOCTL */ - rqst[1].rq_iov = vars->io_iov; - rqst[1].rq_nvec = SMB2_IOCTL_IOV_SIZE; - - rc = SMB2_ioctl_init(tcon, server, - &rqst[1], COMPOUND_FID, - COMPOUND_FID, FSCTL_GET_REPARSE_POINT, NULL, 0, - CIFSMaxBufSize - - MAX_SMB2_CREATE_RESPONSE_SIZE - - MAX_SMB2_CLOSE_RESPONSE_SIZE); - if (rc) - goto query_rp_exit; - - smb2_set_next_command(tcon, &rqst[1]); - smb2_set_related(&rqst[1]); - - /* Close */ - rqst[2].rq_iov = &vars->close_iov; - rqst[2].rq_nvec = 1; - - rc = SMB2_close_init(tcon, server, - &rqst[2], COMPOUND_FID, COMPOUND_FID, false); - if (rc) - goto query_rp_exit; - - smb2_set_related(&rqst[2]); - - rc = compound_send_recv(xid, tcon->ses, server, - flags, 3, rqst, - resp_buftype, rsp_iov); - - ioctl_rsp = rsp_iov[1].iov_base; - - /* - * Open was successful and we got an ioctl response. - */ - if (rc == 0) { - /* See MS-FSCC 2.3.23 */ - off = le32_to_cpu(ioctl_rsp->OutputOffset); - count = le32_to_cpu(ioctl_rsp->OutputCount); - if (check_add_overflow(off, count, &len) || - len > rsp_iov[1].iov_len) { - cifs_tcon_dbg(VFS, "%s: invalid ioctl: off=%d count=%d\n", - __func__, off, count); - rc = -EIO; - goto query_rp_exit; - } - - reparse_buf = (void *)((u8 *)ioctl_rsp + off); - len = sizeof(*reparse_buf); - if (count < len || - count < le16_to_cpu(reparse_buf->ReparseDataLength) + len) { - cifs_tcon_dbg(VFS, "%s: invalid ioctl: off=%d count=%d\n", - __func__, off, count); - rc = -EIO; - goto query_rp_exit; - } - *tag = le32_to_cpu(reparse_buf->ReparseTag); - *rsp = rsp_iov[1]; - *rsp_buftype = resp_buftype[1]; - resp_buftype[1] = CIFS_NO_BUFFER; - } - - query_rp_exit: - SMB2_open_free(&rqst[0]); - SMB2_ioctl_free(&rqst[1]); - SMB2_close_free(&rqst[2]); - free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); - free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); - free_rsp_buf(resp_buftype[2], rsp_iov[2].iov_base); - kfree(vars); -out_free_path: - kfree(utf16_path); - return rc; -} - static struct cifs_ntsd * get_smb2_acl_by_fid(struct cifs_sb_info *cifs_sb, const struct cifs_fid *cifsfid, u32 *pacllen, u32 info) @@ -3336,7 +3285,6 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon, unsigned long long new_size; long rc; unsigned int xid; - __le64 eof; xid = get_xid(); @@ -3366,11 +3314,13 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon, */ new_size = offset + len; if (keep_size == false && (unsigned long long)i_size_read(inode) < new_size) { - eof = cpu_to_le64(new_size); rc = SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid, - cfile->fid.volatile_fid, cfile->pid, &eof); + cfile->fid.volatile_fid, cfile->pid, new_size); if (rc >= 0) { truncate_setsize(inode, new_size); + netfs_resize_file(&cifsi->netfs, new_size, true); + if (offset < cifsi->netfs.zero_point) + cifsi->netfs.zero_point = offset; fscache_resize_cookie(cifs_inode_cookie(inode), new_size); } } @@ -3561,7 +3511,7 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon, struct cifsFileInfo *cfile = file->private_data; long rc = -EOPNOTSUPP; unsigned int xid; - __le64 eof; + loff_t new_eof; xid = get_xid(); @@ -3590,14 +3540,14 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon, if (cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE) smb2_set_sparse(xid, tcon, cfile, inode, false); - eof = cpu_to_le64(off + len); + new_eof = off + len; rc = SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid, - cfile->fid.volatile_fid, cfile->pid, &eof); + cfile->fid.volatile_fid, cfile->pid, new_eof); if (rc == 0) { - cifsi->server_eof = off + len; - cifs_setsize(inode, off + len); + netfs_resize_file(&cifsi->netfs, new_eof, true); + cifs_setsize(inode, new_eof); cifs_truncate_page(inode->i_mapping, inode->i_size); - truncate_setsize(inode, off + len); + truncate_setsize(inode, new_eof); } goto out; } @@ -3686,10 +3636,10 @@ static long smb3_collapse_range(struct file *file, struct cifs_tcon *tcon, int rc; unsigned int xid; struct inode *inode = file_inode(file); - struct cifsFileInfo *cfile = file->private_data; struct cifsInodeInfo *cifsi = CIFS_I(inode); - __le64 eof; - loff_t old_eof; + struct cifsFileInfo *cfile = file->private_data; + struct netfs_inode *ictx = &cifsi->netfs; + loff_t old_eof, new_eof; xid = get_xid(); @@ -3708,23 +3658,25 @@ static long smb3_collapse_range(struct file *file, struct cifs_tcon *tcon, goto out_2; truncate_pagecache_range(inode, off, old_eof); + ictx->zero_point = old_eof; rc = smb2_copychunk_range(xid, cfile, cfile, off + len, old_eof - off - len, off); if (rc < 0) goto out_2; - eof = cpu_to_le64(old_eof - len); + new_eof = old_eof - len; rc = SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid, - cfile->fid.volatile_fid, cfile->pid, &eof); + cfile->fid.volatile_fid, cfile->pid, new_eof); if (rc < 0) goto out_2; rc = 0; - cifsi->server_eof = i_size_read(inode) - len; - truncate_setsize(inode, cifsi->server_eof); - fscache_resize_cookie(cifs_inode_cookie(inode), cifsi->server_eof); + truncate_setsize(inode, new_eof); + netfs_resize_file(&cifsi->netfs, new_eof, true); + ictx->zero_point = new_eof; + fscache_resize_cookie(cifs_inode_cookie(inode), new_eof); out_2: filemap_invalidate_unlock(inode->i_mapping); out: @@ -3740,8 +3692,8 @@ static long smb3_insert_range(struct file *file, struct cifs_tcon *tcon, unsigned int xid; struct cifsFileInfo *cfile = file->private_data; struct inode *inode = file_inode(file); - __le64 eof; - __u64 count, old_eof; + struct cifsInodeInfo *cifsi = CIFS_I(inode); + __u64 count, old_eof, new_eof; xid = get_xid(); @@ -3754,20 +3706,21 @@ static long smb3_insert_range(struct file *file, struct cifs_tcon *tcon, } count = old_eof - off; - eof = cpu_to_le64(old_eof + len); + new_eof = old_eof + len; filemap_invalidate_lock(inode->i_mapping); - rc = filemap_write_and_wait_range(inode->i_mapping, off, old_eof + len - 1); + rc = filemap_write_and_wait_range(inode->i_mapping, off, new_eof - 1); if (rc < 0) goto out_2; truncate_pagecache_range(inode, off, old_eof); rc = SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid, - cfile->fid.volatile_fid, cfile->pid, &eof); + cfile->fid.volatile_fid, cfile->pid, new_eof); if (rc < 0) goto out_2; - truncate_setsize(inode, old_eof + len); + truncate_setsize(inode, new_eof); + netfs_resize_file(&cifsi->netfs, i_size_read(inode), true); fscache_resize_cookie(cifs_inode_cookie(inode), i_size_read(inode)); rc = smb2_copychunk_range(xid, cfile, cfile, off, count, off + len); @@ -5171,11 +5124,154 @@ int cifs_sfu_make_node(unsigned int xid, struct inode *inode, return rc; } +static inline u64 mode_nfs_type(mode_t mode) +{ + switch (mode & S_IFMT) { + case S_IFBLK: return NFS_SPECFILE_BLK; + case S_IFCHR: return NFS_SPECFILE_CHR; + case S_IFIFO: return NFS_SPECFILE_FIFO; + case S_IFSOCK: return NFS_SPECFILE_SOCK; + } + return 0; +} + +static int nfs_set_reparse_buf(struct reparse_posix_data *buf, + mode_t mode, dev_t dev, + struct kvec *iov) +{ + u64 type; + u16 len, dlen; + + len = sizeof(*buf); + + switch ((type = mode_nfs_type(mode))) { + case NFS_SPECFILE_BLK: + case NFS_SPECFILE_CHR: + dlen = sizeof(__le64); + break; + case NFS_SPECFILE_FIFO: + case NFS_SPECFILE_SOCK: + dlen = 0; + break; + default: + return -EOPNOTSUPP; + } + + buf->ReparseTag = cpu_to_le32(IO_REPARSE_TAG_NFS); + buf->Reserved = 0; + buf->InodeType = cpu_to_le64(type); + buf->ReparseDataLength = cpu_to_le16(len + dlen - + sizeof(struct reparse_data_buffer)); + *(__le64 *)buf->DataBuffer = cpu_to_le64(((u64)MAJOR(dev) << 32) | + MINOR(dev)); + iov->iov_base = buf; + iov->iov_len = len + dlen; + return 0; +} + +static int nfs_make_node(unsigned int xid, struct inode *inode, + struct dentry *dentry, struct cifs_tcon *tcon, + const char *full_path, umode_t mode, dev_t dev) +{ + struct cifs_open_info_data data; + struct reparse_posix_data *p; + struct inode *new; + struct kvec iov; + __u8 buf[sizeof(*p) + sizeof(__le64)]; + int rc; + + p = (struct reparse_posix_data *)buf; + rc = nfs_set_reparse_buf(p, mode, dev, &iov); + if (rc) + return rc; + + data = (struct cifs_open_info_data) { + .reparse_point = true, + .reparse = { .tag = IO_REPARSE_TAG_NFS, .posix = p, }, + }; + + new = smb2_get_reparse_inode(&data, inode->i_sb, xid, + tcon, full_path, &iov); + if (!IS_ERR(new)) + d_instantiate(dentry, new); + else + rc = PTR_ERR(new); + cifs_free_open_info(&data); + return rc; +} + +static int smb2_create_reparse_symlink(const unsigned int xid, + struct inode *inode, + struct dentry *dentry, + struct cifs_tcon *tcon, + const char *full_path, + const char *symname) +{ + struct reparse_symlink_data_buffer *buf = NULL; + struct cifs_open_info_data data; + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct inode *new; + struct kvec iov; + __le16 *path; + char *sym; + u16 len, plen; + int rc = 0; + + sym = kstrdup(symname, GFP_KERNEL); + if (!sym) + return -ENOMEM; + + data = (struct cifs_open_info_data) { + .reparse_point = true, + .reparse = { .tag = IO_REPARSE_TAG_SYMLINK, }, + .symlink_target = sym, + }; + + path = cifs_convert_path_to_utf16(symname, cifs_sb); + if (!path) { + rc = -ENOMEM; + goto out; + } + + plen = 2 * UniStrnlen((wchar_t *)path, PATH_MAX); + len = sizeof(*buf) + plen * 2; + buf = kzalloc(len, GFP_KERNEL); + if (!buf) { + rc = -ENOMEM; + goto out; + } + + buf->ReparseTag = cpu_to_le32(IO_REPARSE_TAG_SYMLINK); + buf->ReparseDataLength = cpu_to_le16(len - sizeof(struct reparse_data_buffer)); + buf->SubstituteNameOffset = cpu_to_le16(plen); + buf->SubstituteNameLength = cpu_to_le16(plen); + memcpy(&buf->PathBuffer[plen], path, plen); + buf->PrintNameOffset = 0; + buf->PrintNameLength = cpu_to_le16(plen); + memcpy(buf->PathBuffer, path, plen); + buf->Flags = cpu_to_le32(*symname != '/' ? SYMLINK_FLAG_RELATIVE : 0); + + iov.iov_base = buf; + iov.iov_len = len; + new = smb2_get_reparse_inode(&data, inode->i_sb, xid, + tcon, full_path, &iov); + if (!IS_ERR(new)) + d_instantiate(dentry, new); + else + rc = PTR_ERR(new); +out: + kfree(path); + cifs_free_open_info(&data); + kfree(buf); + return rc; +} + static int smb2_make_node(unsigned int xid, struct inode *inode, struct dentry *dentry, struct cifs_tcon *tcon, const char *full_path, umode_t mode, dev_t dev) { struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + int rc; /* * Check if mounted with mount parm 'sfu' mount parm. @@ -5183,15 +5279,14 @@ static int smb2_make_node(unsigned int xid, struct inode *inode, * supports block and char device (no socket & fifo), * and was used by default in earlier versions of Windows */ - if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)) - return -EPERM; - /* - * TODO: Add ability to create instead via reparse point. Windows (e.g. - * their current NFS server) uses this approach to expose special files - * over SMB2/SMB3 and Samba will do this with SMB3.1.1 POSIX Extensions - */ - return cifs_sfu_make_node(xid, inode, dentry, tcon, - full_path, mode, dev); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) { + rc = cifs_sfu_make_node(xid, inode, dentry, tcon, + full_path, mode, dev); + } else { + rc = nfs_make_node(xid, inode, dentry, tcon, + full_path, mode, dev); + } + return rc; } #ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY @@ -5247,6 +5342,7 @@ struct smb_version_operations smb20_operations = { .parse_reparse_point = smb2_parse_reparse_point, .query_mf_symlink = smb3_query_mf_symlink, .create_mf_symlink = smb3_create_mf_symlink, + .create_reparse_symlink = smb2_create_reparse_symlink, .open = smb2_open_file, .set_fid = smb2_set_fid, .close = smb2_close_file, @@ -5349,6 +5445,7 @@ struct smb_version_operations smb21_operations = { .parse_reparse_point = smb2_parse_reparse_point, .query_mf_symlink = smb3_query_mf_symlink, .create_mf_symlink = smb3_create_mf_symlink, + .create_reparse_symlink = smb2_create_reparse_symlink, .open = smb2_open_file, .set_fid = smb2_set_fid, .close = smb2_close_file, @@ -5454,6 +5551,7 @@ struct smb_version_operations smb30_operations = { .parse_reparse_point = smb2_parse_reparse_point, .query_mf_symlink = smb3_query_mf_symlink, .create_mf_symlink = smb3_create_mf_symlink, + .create_reparse_symlink = smb2_create_reparse_symlink, .open = smb2_open_file, .set_fid = smb2_set_fid, .close = smb2_close_file, @@ -5568,6 +5666,7 @@ struct smb_version_operations smb311_operations = { .parse_reparse_point = smb2_parse_reparse_point, .query_mf_symlink = smb3_query_mf_symlink, .create_mf_symlink = smb3_create_mf_symlink, + .create_reparse_symlink = smb2_create_reparse_symlink, .open = smb2_open_file, .set_fid = smb2_set_fid, .close = smb2_close_file, diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index 4f971c1061f0..c58fa44dd6b0 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -156,6 +156,56 @@ out: return; } +/* helper function for code reuse */ +static int +cifs_chan_skip_or_disable(struct cifs_ses *ses, + struct TCP_Server_Info *server, + bool from_reconnect) +{ + struct TCP_Server_Info *pserver; + unsigned int chan_index; + + if (SERVER_IS_CHAN(server)) { + cifs_dbg(VFS, + "server %s does not support multichannel anymore. Skip secondary channel\n", + ses->server->hostname); + + spin_lock(&ses->chan_lock); + chan_index = cifs_ses_get_chan_index(ses, server); + if (chan_index == CIFS_INVAL_CHAN_INDEX) { + spin_unlock(&ses->chan_lock); + goto skip_terminate; + } + + ses->chans[chan_index].server = NULL; + server->terminate = true; + spin_unlock(&ses->chan_lock); + + /* + * the above reference of server by channel + * needs to be dropped without holding chan_lock + * as cifs_put_tcp_session takes a higher lock + * i.e. cifs_tcp_ses_lock + */ + cifs_put_tcp_session(server, from_reconnect); + + cifs_signal_cifsd_for_reconnect(server, false); + + /* mark primary server as needing reconnect */ + pserver = server->primary_server; + cifs_signal_cifsd_for_reconnect(pserver, false); +skip_terminate: + return -EHOSTDOWN; + } + + cifs_server_dbg(VFS, + "server does not support multichannel anymore. Disable all other channels\n"); + cifs_disable_secondary_channels(ses); + + + return 0; +} + static int smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, struct TCP_Server_Info *server, bool from_reconnect) @@ -164,8 +214,6 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, struct nls_table *nls_codepage = NULL; struct cifs_ses *ses; int xid; - struct TCP_Server_Info *pserver; - unsigned int chan_index; /* * SMB2s NegProt, SessSetup, Logoff do not have tcon yet so @@ -310,44 +358,11 @@ again: */ if (ses->chan_count > 1 && !(server->capabilities & SMB2_GLOBAL_CAP_MULTI_CHANNEL)) { - if (SERVER_IS_CHAN(server)) { - cifs_dbg(VFS, "server %s does not support " \ - "multichannel anymore. skipping secondary channel\n", - ses->server->hostname); - - spin_lock(&ses->chan_lock); - chan_index = cifs_ses_get_chan_index(ses, server); - if (chan_index == CIFS_INVAL_CHAN_INDEX) { - spin_unlock(&ses->chan_lock); - goto skip_terminate; - } - - ses->chans[chan_index].server = NULL; - spin_unlock(&ses->chan_lock); - - /* - * the above reference of server by channel - * needs to be dropped without holding chan_lock - * as cifs_put_tcp_session takes a higher lock - * i.e. cifs_tcp_ses_lock - */ - cifs_put_tcp_session(server, from_reconnect); - - server->terminate = true; - cifs_signal_cifsd_for_reconnect(server, false); - - /* mark primary server as needing reconnect */ - pserver = server->primary_server; - cifs_signal_cifsd_for_reconnect(pserver, false); - -skip_terminate: + rc = cifs_chan_skip_or_disable(ses, server, + from_reconnect); + if (rc) { mutex_unlock(&ses->session_mutex); - rc = -EHOSTDOWN; goto out; - } else { - cifs_server_dbg(VFS, "does not support " \ - "multichannel anymore. disabling all other channels\n"); - cifs_disable_secondary_channels(ses); } } @@ -384,6 +399,15 @@ skip_sess_setup: goto out; } + spin_lock(&ses->ses_lock); + if (ses->flags & CIFS_SES_FLAG_SCALE_CHANNELS) { + spin_unlock(&ses->ses_lock); + mutex_unlock(&ses->session_mutex); + goto skip_add_channels; + } + ses->flags |= CIFS_SES_FLAG_SCALE_CHANNELS; + spin_unlock(&ses->ses_lock); + if (!rc && (server->capabilities & SMB2_GLOBAL_CAP_MULTI_CHANNEL)) { mutex_unlock(&ses->session_mutex); @@ -395,14 +419,29 @@ skip_sess_setup: rc = SMB3_request_interfaces(xid, tcon, false); free_xid(xid); - if (rc) + if (rc == -EOPNOTSUPP && ses->chan_count > 1) { + /* + * some servers like Azure SMB server do not advertise + * that multichannel has been disabled with server + * capabilities, rather return STATUS_NOT_IMPLEMENTED. + * treat this as server not supporting multichannel + */ + + rc = cifs_chan_skip_or_disable(ses, server, + from_reconnect); + goto skip_add_channels; + } else if (rc) cifs_dbg(FYI, "%s: failed to query server interfaces: %d\n", __func__, rc); if (ses->chan_max > ses->chan_count && + ses->iface_count && !SERVER_IS_CHAN(server)) { - if (ses->chan_count == 1) + if (ses->chan_count == 1) { cifs_server_dbg(VFS, "supports multichannel now\n"); + queue_delayed_work(cifsiod_wq, &tcon->query_interfaces, + (SMB_INTERFACE_POLL_INTERVAL * HZ)); + } cifs_try_adding_channels(ses); } @@ -410,6 +449,11 @@ skip_sess_setup: mutex_unlock(&ses->session_mutex); } +skip_add_channels: + spin_lock(&ses->ses_lock); + ses->flags &= ~CIFS_SES_FLAG_SCALE_CHANNELS; + spin_unlock(&ses->ses_lock); + if (smb2_command != SMB2_INTERNAL_CMD) mod_delayed_work(cifsiod_wq, &server->reconnect, 0); @@ -1958,10 +2002,7 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, __le16 *unc_path = NULL; int flags = 0; unsigned int total_len; - struct TCP_Server_Info *server; - - /* always use master channel */ - server = ses->server; + struct TCP_Server_Info *server = cifs_pick_channel(ses); cifs_dbg(FYI, "TCON\n"); @@ -2094,6 +2135,7 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon) struct smb2_tree_disconnect_req *req; /* response is trivial */ int rc = 0; struct cifs_ses *ses = tcon->ses; + struct TCP_Server_Info *server = cifs_pick_channel(ses); int flags = 0; unsigned int total_len; struct kvec iov[1]; @@ -2116,7 +2158,7 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon) invalidate_all_cached_dirs(tcon); - rc = smb2_plain_req_init(SMB2_TREE_DISCONNECT, tcon, ses->server, + rc = smb2_plain_req_init(SMB2_TREE_DISCONNECT, tcon, server, (void **) &req, &total_len); if (rc) @@ -2134,7 +2176,7 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon) rqst.rq_iov = iov; rqst.rq_nvec = 1; - rc = cifs_send_recv(xid, ses, ses->server, + rc = cifs_send_recv(xid, ses, server, &rqst, &resp_buf_type, flags, &rsp_iov); cifs_small_buf_release(req); if (rc) { @@ -2279,7 +2321,7 @@ int smb2_parse_contexts(struct TCP_Server_Info *server, noff = le16_to_cpu(cc->NameOffset); nlen = le16_to_cpu(cc->NameLength); - if (noff + nlen >= doff) + if (noff + nlen > doff) return -EINVAL; name = (char *)cc + noff; @@ -2736,7 +2778,14 @@ int smb311_posix_mkdir(const unsigned int xid, struct inode *inode, int flags = 0; unsigned int total_len; __le16 *utf16_path = NULL; - struct TCP_Server_Info *server = cifs_pick_channel(ses); + struct TCP_Server_Info *server; + int retries = 0, cur_sleep = 1; + +replay_again: + /* reinitialize for possible replay */ + flags = 0; + n_iov = 2; + server = cifs_pick_channel(ses); cifs_dbg(FYI, "mkdir\n"); @@ -2840,6 +2889,10 @@ int smb311_posix_mkdir(const unsigned int xid, struct inode *inode, /* no need to inc num_remote_opens because we close it just below */ trace_smb3_posix_mkdir_enter(xid, tcon->tid, ses->Suid, full_path, CREATE_NOT_FILE, FILE_WRITE_ATTRIBUTES); + + if (retries) + smb2_set_replay(server, &rqst); + /* resource #4: response buffer */ rc = cifs_send_recv(xid, ses, server, &rqst, &resp_buftype, flags, &rsp_iov); @@ -2877,6 +2930,11 @@ err_free_req: cifs_small_buf_release(req); err_free_path: kfree(utf16_path); + + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } @@ -3072,12 +3130,18 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, struct smb2_create_rsp *rsp = NULL; struct cifs_tcon *tcon = oparms->tcon; struct cifs_ses *ses = tcon->ses; - struct TCP_Server_Info *server = cifs_pick_channel(ses); + struct TCP_Server_Info *server; struct kvec iov[SMB2_CREATE_IOV_SIZE]; struct kvec rsp_iov = {NULL, 0}; int resp_buftype = CIFS_NO_BUFFER; int rc = 0; int flags = 0; + int retries = 0, cur_sleep = 1; + +replay_again: + /* reinitialize for possible replay */ + flags = 0; + server = cifs_pick_channel(ses); cifs_dbg(FYI, "create/open\n"); if (!ses || !server) @@ -3099,6 +3163,9 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, trace_smb3_open_enter(xid, tcon->tid, tcon->ses->Suid, oparms->path, oparms->create_options, oparms->desired_access); + if (retries) + smb2_set_replay(server, &rqst); + rc = cifs_send_recv(xid, ses, server, &rqst, &resp_buftype, flags, &rsp_iov); @@ -3152,6 +3219,11 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, creat_exit: SMB2_open_free(&rqst); free_rsp_buf(resp_buftype, rsp); + + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } @@ -3276,15 +3348,7 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, int resp_buftype = CIFS_NO_BUFFER; int rc = 0; int flags = 0; - - cifs_dbg(FYI, "SMB2 IOCTL\n"); - - if (out_data != NULL) - *out_data = NULL; - - /* zero out returned data len, in case of error */ - if (plen) - *plen = 0; + int retries = 0, cur_sleep = 1; if (!tcon) return -EIO; @@ -3293,10 +3357,23 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, if (!ses) return -EIO; +replay_again: + /* reinitialize for possible replay */ + flags = 0; server = cifs_pick_channel(ses); + if (!server) return -EIO; + cifs_dbg(FYI, "SMB2 IOCTL\n"); + + if (out_data != NULL) + *out_data = NULL; + + /* zero out returned data len, in case of error */ + if (plen) + *plen = 0; + if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; @@ -3311,6 +3388,9 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, if (rc) goto ioctl_exit; + if (retries) + smb2_set_replay(server, &rqst); + rc = cifs_send_recv(xid, ses, server, &rqst, &resp_buftype, flags, &rsp_iov); @@ -3380,6 +3460,11 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, ioctl_exit: SMB2_ioctl_free(&rqst); free_rsp_buf(resp_buftype, rsp); + + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } @@ -3451,13 +3536,20 @@ __SMB2_close(const unsigned int xid, struct cifs_tcon *tcon, struct smb_rqst rqst; struct smb2_close_rsp *rsp = NULL; struct cifs_ses *ses = tcon->ses; - struct TCP_Server_Info *server = cifs_pick_channel(ses); + struct TCP_Server_Info *server; struct kvec iov[1]; struct kvec rsp_iov; int resp_buftype = CIFS_NO_BUFFER; int rc = 0; int flags = 0; bool query_attrs = false; + int retries = 0, cur_sleep = 1; + +replay_again: + /* reinitialize for possible replay */ + flags = 0; + query_attrs = false; + server = cifs_pick_channel(ses); cifs_dbg(FYI, "Close\n"); @@ -3483,6 +3575,9 @@ __SMB2_close(const unsigned int xid, struct cifs_tcon *tcon, if (rc) goto close_exit; + if (retries) + smb2_set_replay(server, &rqst); + rc = cifs_send_recv(xid, ses, server, &rqst, &resp_buftype, flags, &rsp_iov); rsp = (struct smb2_close_rsp *)rsp_iov.iov_base; @@ -3516,6 +3611,11 @@ close_exit: cifs_dbg(VFS, "handle cancelled close fid 0x%llx returned error %d\n", persistent_fid, tmp_rc); } + + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } @@ -3646,12 +3746,19 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon, struct TCP_Server_Info *server; int flags = 0; bool allocated = false; + int retries = 0, cur_sleep = 1; cifs_dbg(FYI, "Query Info\n"); if (!ses) return -EIO; + +replay_again: + /* reinitialize for possible replay */ + flags = 0; + allocated = false; server = cifs_pick_channel(ses); + if (!server) return -EIO; @@ -3673,6 +3780,9 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon, trace_smb3_query_info_enter(xid, persistent_fid, tcon->tid, ses->Suid, info_class, (__u32)info_type); + if (retries) + smb2_set_replay(server, &rqst); + rc = cifs_send_recv(xid, ses, server, &rqst, &resp_buftype, flags, &rsp_iov); rsp = (struct smb2_query_info_rsp *)rsp_iov.iov_base; @@ -3715,6 +3825,11 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon, qinf_exit: SMB2_query_info_free(&rqst); free_rsp_buf(resp_buftype, rsp); + + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } @@ -3815,7 +3930,7 @@ SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon, u32 *plen /* returned data len */) { struct cifs_ses *ses = tcon->ses; - struct TCP_Server_Info *server = cifs_pick_channel(ses); + struct TCP_Server_Info *server; struct smb_rqst rqst; struct smb2_change_notify_rsp *smb_rsp; struct kvec iov[1]; @@ -3823,6 +3938,12 @@ SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon, int resp_buftype = CIFS_NO_BUFFER; int flags = 0; int rc = 0; + int retries = 0, cur_sleep = 1; + +replay_again: + /* reinitialize for possible replay */ + flags = 0; + server = cifs_pick_channel(ses); cifs_dbg(FYI, "change notify\n"); if (!ses || !server) @@ -3847,6 +3968,10 @@ SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon, trace_smb3_notify_enter(xid, persistent_fid, tcon->tid, ses->Suid, (u8)watch_tree, completion_filter); + + if (retries) + smb2_set_replay(server, &rqst); + rc = cifs_send_recv(xid, ses, server, &rqst, &resp_buftype, flags, &rsp_iov); @@ -3881,6 +4006,11 @@ SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon, if (rqst.rq_iov) cifs_small_buf_release(rqst.rq_iov[0].iov_base); /* request */ free_rsp_buf(resp_buftype, rsp_iov.iov_base); + + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } @@ -3918,7 +4048,7 @@ void smb2_reconnect_server(struct work_struct *work) struct cifs_ses *ses, *ses2; struct cifs_tcon *tcon, *tcon2; struct list_head tmp_list, tmp_ses_list; - bool tcon_exist = false, ses_exist = false; + bool ses_exist = false; bool tcon_selected = false; int rc; bool resched = false; @@ -3964,7 +4094,7 @@ void smb2_reconnect_server(struct work_struct *work) if (tcon->need_reconnect || tcon->need_reopen_files) { tcon->tc_count++; list_add_tail(&tcon->rlist, &tmp_list); - tcon_selected = tcon_exist = true; + tcon_selected = true; } } /* @@ -3973,7 +4103,7 @@ void smb2_reconnect_server(struct work_struct *work) */ if (ses->tcon_ipc && ses->tcon_ipc->need_reconnect) { list_add_tail(&ses->tcon_ipc->rlist, &tmp_list); - tcon_selected = tcon_exist = true; + tcon_selected = true; cifs_smb_ses_inc_refcount(ses); } /* @@ -4123,10 +4253,16 @@ SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, struct smb_rqst rqst; struct kvec iov[1]; struct kvec rsp_iov = {NULL, 0}; - struct TCP_Server_Info *server = cifs_pick_channel(ses); + struct TCP_Server_Info *server; int resp_buftype = CIFS_NO_BUFFER; int flags = 0; int rc = 0; + int retries = 0, cur_sleep = 1; + +replay_again: + /* reinitialize for possible replay */ + flags = 0; + server = cifs_pick_channel(ses); cifs_dbg(FYI, "flush\n"); if (!ses || !(ses->server)) @@ -4146,6 +4282,10 @@ SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, goto flush_exit; trace_smb3_flush_enter(xid, persistent_fid, tcon->tid, ses->Suid); + + if (retries) + smb2_set_replay(server, &rqst); + rc = cifs_send_recv(xid, ses, server, &rqst, &resp_buftype, flags, &rsp_iov); @@ -4160,6 +4300,11 @@ SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, flush_exit: SMB2_flush_free(&rqst); free_rsp_buf(resp_buftype, rsp_iov.iov_base); + + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } @@ -4639,7 +4784,7 @@ smb2_async_writev(struct cifs_writedata *wdata, struct cifs_io_parms *io_parms = NULL; int credit_request; - if (!wdata->server) + if (!wdata->server || wdata->replay) server = wdata->server = cifs_pick_channel(tcon->ses); /* @@ -4724,6 +4869,8 @@ smb2_async_writev(struct cifs_writedata *wdata, rqst.rq_nvec = 1; rqst.rq_iter = wdata->iter; rqst.rq_iter_size = iov_iter_count(&rqst.rq_iter); + if (wdata->replay) + smb2_set_replay(server, &rqst); #ifdef CONFIG_CIFS_SMB_DIRECT if (wdata->mr) iov[0].iov_len += sizeof(struct smbd_buffer_descriptor_v1); @@ -4797,18 +4944,21 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms, int flags = 0; unsigned int total_len; struct TCP_Server_Info *server; + int retries = 0, cur_sleep = 1; +replay_again: + /* reinitialize for possible replay */ + flags = 0; *nbytes = 0; - - if (n_vec < 1) - return rc; - if (!io_parms->server) io_parms->server = cifs_pick_channel(io_parms->tcon->ses); server = io_parms->server; if (server == NULL) return -ECONNABORTED; + if (n_vec < 1) + return rc; + rc = smb2_plain_req_init(SMB2_WRITE, io_parms->tcon, server, (void **) &req, &total_len); if (rc) @@ -4842,6 +4992,9 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms, rqst.rq_iov = iov; rqst.rq_nvec = n_vec + 1; + if (retries) + smb2_set_replay(server, &rqst); + rc = cifs_send_recv(xid, io_parms->tcon->ses, server, &rqst, &resp_buftype, flags, &rsp_iov); @@ -4866,6 +5019,11 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms, cifs_small_buf_release(req); free_rsp_buf(resp_buftype, rsp); + + if (is_replayable_error(rc) && + smb2_should_replay(io_parms->tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } @@ -5177,8 +5335,14 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, struct kvec rsp_iov; int rc = 0; struct cifs_ses *ses = tcon->ses; - struct TCP_Server_Info *server = cifs_pick_channel(ses); + struct TCP_Server_Info *server; int flags = 0; + int retries = 0, cur_sleep = 1; + +replay_again: + /* reinitialize for possible replay */ + flags = 0; + server = cifs_pick_channel(ses); if (!ses || !(ses->server)) return -EIO; @@ -5198,6 +5362,9 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, if (rc) goto qdir_exit; + if (retries) + smb2_set_replay(server, &rqst); + rc = cifs_send_recv(xid, ses, server, &rqst, &resp_buftype, flags, &rsp_iov); rsp = (struct smb2_query_directory_rsp *)rsp_iov.iov_base; @@ -5232,6 +5399,11 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, qdir_exit: SMB2_query_directory_free(&rqst); free_rsp_buf(resp_buftype, rsp); + + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } @@ -5298,8 +5470,14 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon, int rc = 0; int resp_buftype; struct cifs_ses *ses = tcon->ses; - struct TCP_Server_Info *server = cifs_pick_channel(ses); + struct TCP_Server_Info *server; int flags = 0; + int retries = 0, cur_sleep = 1; + +replay_again: + /* reinitialize for possible replay */ + flags = 0; + server = cifs_pick_channel(ses); if (!ses || !server) return -EIO; @@ -5327,6 +5505,8 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon, return rc; } + if (retries) + smb2_set_replay(server, &rqst); rc = cifs_send_recv(xid, ses, server, &rqst, &resp_buftype, flags, @@ -5342,23 +5522,28 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon, free_rsp_buf(resp_buftype, rsp); kfree(iov); + + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } int SMB2_set_eof(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, - u64 volatile_fid, u32 pid, __le64 *eof) + u64 volatile_fid, u32 pid, loff_t new_eof) { struct smb2_file_eof_info info; void *data; unsigned int size; - info.EndOfFile = *eof; + info.EndOfFile = cpu_to_le64(new_eof); data = &info; size = sizeof(struct smb2_file_eof_info); - trace_smb3_set_eof(xid, persistent_fid, tcon->tid, tcon->ses->Suid, le64_to_cpu(*eof)); + trace_smb3_set_eof(xid, persistent_fid, tcon->tid, tcon->ses->Suid, new_eof); return send_set_info(xid, tcon, persistent_fid, volatile_fid, pid, FILE_END_OF_FILE_INFORMATION, SMB2_O_INFO_FILE, @@ -5394,12 +5579,18 @@ SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon, int rc; struct smb2_oplock_break *req = NULL; struct cifs_ses *ses = tcon->ses; - struct TCP_Server_Info *server = cifs_pick_channel(ses); + struct TCP_Server_Info *server; int flags = CIFS_OBREAK_OP; unsigned int total_len; struct kvec iov[1]; struct kvec rsp_iov; int resp_buf_type; + int retries = 0, cur_sleep = 1; + +replay_again: + /* reinitialize for possible replay */ + flags = CIFS_OBREAK_OP; + server = cifs_pick_channel(ses); cifs_dbg(FYI, "SMB2_oplock_break\n"); rc = smb2_plain_req_init(SMB2_OPLOCK_BREAK, tcon, server, @@ -5424,15 +5615,21 @@ SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon, rqst.rq_iov = iov; rqst.rq_nvec = 1; + if (retries) + smb2_set_replay(server, &rqst); + rc = cifs_send_recv(xid, ses, server, &rqst, &resp_buf_type, flags, &rsp_iov); cifs_small_buf_release(req); - if (rc) { cifs_stats_fail_inc(tcon, SMB2_OPLOCK_BREAK_HE); cifs_dbg(FYI, "Send error in Oplock Break = %d\n", rc); } + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } @@ -5518,9 +5715,15 @@ SMB311_posix_qfs_info(const unsigned int xid, struct cifs_tcon *tcon, int rc = 0; int resp_buftype; struct cifs_ses *ses = tcon->ses; - struct TCP_Server_Info *server = cifs_pick_channel(ses); + struct TCP_Server_Info *server; FILE_SYSTEM_POSIX_INFO *info = NULL; int flags = 0; + int retries = 0, cur_sleep = 1; + +replay_again: + /* reinitialize for possible replay */ + flags = 0; + server = cifs_pick_channel(ses); rc = build_qfs_info_req(&iov, tcon, server, FS_POSIX_INFORMATION, @@ -5536,6 +5739,9 @@ SMB311_posix_qfs_info(const unsigned int xid, struct cifs_tcon *tcon, rqst.rq_iov = &iov; rqst.rq_nvec = 1; + if (retries) + smb2_set_replay(server, &rqst); + rc = cifs_send_recv(xid, ses, server, &rqst, &resp_buftype, flags, &rsp_iov); free_qfs_info_req(&iov); @@ -5555,6 +5761,11 @@ SMB311_posix_qfs_info(const unsigned int xid, struct cifs_tcon *tcon, posix_qfsinf_exit: free_rsp_buf(resp_buftype, rsp_iov.iov_base); + + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } @@ -5569,9 +5780,15 @@ SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon, int rc = 0; int resp_buftype; struct cifs_ses *ses = tcon->ses; - struct TCP_Server_Info *server = cifs_pick_channel(ses); + struct TCP_Server_Info *server; struct smb2_fs_full_size_info *info = NULL; int flags = 0; + int retries = 0, cur_sleep = 1; + +replay_again: + /* reinitialize for possible replay */ + flags = 0; + server = cifs_pick_channel(ses); rc = build_qfs_info_req(&iov, tcon, server, FS_FULL_SIZE_INFORMATION, @@ -5587,6 +5804,9 @@ SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon, rqst.rq_iov = &iov; rqst.rq_nvec = 1; + if (retries) + smb2_set_replay(server, &rqst); + rc = cifs_send_recv(xid, ses, server, &rqst, &resp_buftype, flags, &rsp_iov); free_qfs_info_req(&iov); @@ -5606,6 +5826,11 @@ SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon, qfsinf_exit: free_rsp_buf(resp_buftype, rsp_iov.iov_base); + + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } @@ -5620,9 +5845,15 @@ SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon, int rc = 0; int resp_buftype, max_len, min_len; struct cifs_ses *ses = tcon->ses; - struct TCP_Server_Info *server = cifs_pick_channel(ses); + struct TCP_Server_Info *server; unsigned int rsp_len, offset; int flags = 0; + int retries = 0, cur_sleep = 1; + +replay_again: + /* reinitialize for possible replay */ + flags = 0; + server = cifs_pick_channel(ses); if (level == FS_DEVICE_INFORMATION) { max_len = sizeof(FILE_SYSTEM_DEVICE_INFO); @@ -5654,6 +5885,9 @@ SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon, rqst.rq_iov = &iov; rqst.rq_nvec = 1; + if (retries) + smb2_set_replay(server, &rqst); + rc = cifs_send_recv(xid, ses, server, &rqst, &resp_buftype, flags, &rsp_iov); free_qfs_info_req(&iov); @@ -5691,6 +5925,11 @@ SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon, qfsattr_exit: free_rsp_buf(resp_buftype, rsp_iov.iov_base); + + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } @@ -5708,7 +5947,13 @@ smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon, unsigned int count; int flags = CIFS_NO_RSP_BUF; unsigned int total_len; - struct TCP_Server_Info *server = cifs_pick_channel(tcon->ses); + struct TCP_Server_Info *server; + int retries = 0, cur_sleep = 1; + +replay_again: + /* reinitialize for possible replay */ + flags = CIFS_NO_RSP_BUF; + server = cifs_pick_channel(tcon->ses); cifs_dbg(FYI, "smb2_lockv num lock %d\n", num_lock); @@ -5739,6 +5984,9 @@ smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon, rqst.rq_iov = iov; rqst.rq_nvec = 2; + if (retries) + smb2_set_replay(server, &rqst); + rc = cifs_send_recv(xid, tcon->ses, server, &rqst, &resp_buf_type, flags, &rsp_iov); @@ -5750,6 +5998,10 @@ smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon, tcon->ses->Suid, rc); } + if (is_replayable_error(rc) && + smb2_should_replay(tcon, &retries, &cur_sleep)) + goto replay_again; + return rc; } diff --git a/fs/smb/client/smb2proto.h b/fs/smb/client/smb2proto.h index 0e371f7e2854..b3069911e9dd 100644 --- a/fs/smb/client/smb2proto.h +++ b/fs/smb/client/smb2proto.h @@ -56,6 +56,18 @@ extern int smb3_handle_read_data(struct TCP_Server_Info *server, extern int smb2_query_reparse_tag(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *path, __u32 *reparse_tag); +struct inode *smb2_get_reparse_inode(struct cifs_open_info_data *data, + struct super_block *sb, + const unsigned int xid, + struct cifs_tcon *tcon, + const char *full_path, + struct kvec *iov); +int smb2_query_reparse_point(const unsigned int xid, + struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, + const char *full_path, + u32 *tag, struct kvec *rsp, + int *rsp_buftype); int smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, @@ -80,12 +92,16 @@ extern int smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name, struct cifs_sb_info *cifs_sb); extern int smb2_unlink(const unsigned int xid, struct cifs_tcon *tcon, const char *name, struct cifs_sb_info *cifs_sb); -extern int smb2_rename_path(const unsigned int xid, struct cifs_tcon *tcon, - const char *from_name, const char *to_name, - struct cifs_sb_info *cifs_sb); -extern int smb2_create_hardlink(const unsigned int xid, struct cifs_tcon *tcon, - const char *from_name, const char *to_name, - struct cifs_sb_info *cifs_sb); +int smb2_rename_path(const unsigned int xid, + struct cifs_tcon *tcon, + struct dentry *source_dentry, + const char *from_name, const char *to_name, + struct cifs_sb_info *cifs_sb); +int smb2_create_hardlink(const unsigned int xid, + struct cifs_tcon *tcon, + struct dentry *source_dentry, + const char *from_name, const char *to_name, + struct cifs_sb_info *cifs_sb); extern int smb3_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const unsigned char *path, char *pbuf, unsigned int *pbytes_written); @@ -106,6 +122,11 @@ extern unsigned long smb_rqst_len(struct TCP_Server_Info *server, extern void smb2_set_next_command(struct cifs_tcon *tcon, struct smb_rqst *rqst); extern void smb2_set_related(struct smb_rqst *rqst); +extern void smb2_set_replay(struct TCP_Server_Info *server, + struct smb_rqst *rqst); +extern bool smb2_should_replay(struct cifs_tcon *tcon, + int *pretries, + int *pcur_sleep); /* * SMB2 Worker functions - most of protocol specific implementation details @@ -205,7 +226,7 @@ extern int SMB2_query_directory_init(unsigned int xid, struct cifs_tcon *tcon, extern void SMB2_query_directory_free(struct smb_rqst *rqst); extern int SMB2_set_eof(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, u32 pid, - __le64 *eof); + loff_t new_eof); extern int SMB2_set_info_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server, struct smb_rqst *rqst, @@ -283,10 +304,9 @@ int smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *full_path, - struct cifs_open_info_data *data, - struct cifs_sid *owner, - struct cifs_sid *group); + struct cifs_open_info_data *data); int posix_info_parse(const void *beg, const void *end, struct smb2_posix_info_parsed *out); int posix_info_sid_size(const void *beg, const void *end); + #endif /* _SMB2PROTO_H */ diff --git a/fs/smb/client/smb2status.h b/fs/smb/client/smb2status.h index a9e958166fc5..9c6d79b0bd49 100644 --- a/fs/smb/client/smb2status.h +++ b/fs/smb/client/smb2status.h @@ -982,6 +982,8 @@ struct ntstatus { #define STATUS_INVALID_TASK_INDEX cpu_to_le32(0xC0000501) #define STATUS_THREAD_ALREADY_IN_TASK cpu_to_le32(0xC0000502) #define STATUS_CALLBACK_BYPASS cpu_to_le32(0xC0000503) +#define STATUS_SERVER_UNAVAILABLE cpu_to_le32(0xC0000466) +#define STATUS_FILE_NOT_AVAILABLE cpu_to_le32(0xC0000467) #define STATUS_PORT_CLOSED cpu_to_le32(0xC0000700) #define STATUS_MESSAGE_LOST cpu_to_le32(0xC0000701) #define STATUS_INVALID_MESSAGE cpu_to_le32(0xC0000702) diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index 94df9eec3d8d..d74e829de51c 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -2136,7 +2136,7 @@ static int allocate_mr_list(struct smbd_connection *info) for (i = 0; i < info->responder_resources * 2; i++) { smbdirect_mr = kzalloc(sizeof(*smbdirect_mr), GFP_KERNEL); if (!smbdirect_mr) - goto out; + goto cleanup_entries; smbdirect_mr->mr = ib_alloc_mr(info->pd, info->mr_type, info->max_frmr_depth); if (IS_ERR(smbdirect_mr->mr)) { @@ -2162,7 +2162,7 @@ static int allocate_mr_list(struct smbd_connection *info) out: kfree(smbdirect_mr); - +cleanup_entries: list_for_each_entry_safe(smbdirect_mr, tmp, &info->mr_list, list) { list_del(&smbdirect_mr->list); ib_dereg_mr(smbdirect_mr->mr); diff --git a/fs/smb/client/smbencrypt.c b/fs/smb/client/smbencrypt.c index f0ce26414f17..1d1ee9f18f37 100644 --- a/fs/smb/client/smbencrypt.c +++ b/fs/smb/client/smbencrypt.c @@ -26,13 +26,6 @@ #include "cifsproto.h" #include "../common/md4.h" -#ifndef false -#define false 0 -#endif -#ifndef true -#define true 1 -#endif - /* following came from the other byteorder.h to avoid include conflicts */ #define CVAL(buf,pos) (((unsigned char *)(buf))[pos]) #define SSVALX(buf,pos,val) (CVAL(buf,pos)=(val)&0xFF,CVAL(buf,pos+1)=(val)>>8) diff --git a/fs/smb/client/trace.h b/fs/smb/client/trace.h index de199ec9f726..522fa387fcfd 100644 --- a/fs/smb/client/trace.h +++ b/fs/smb/client/trace.h @@ -370,11 +370,12 @@ DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(rename_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(rmdir_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(set_eof_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(set_info_compound_enter); +DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(set_reparse_compound_enter); +DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(get_reparse_compound_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(delete_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(mkdir_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(tdis_enter); - DECLARE_EVENT_CLASS(smb3_inf_compound_done_class, TP_PROTO(unsigned int xid, __u32 tid, @@ -408,6 +409,8 @@ DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(rename_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(rmdir_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_eof_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_info_compound_done); +DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_reparse_compound_done); +DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(get_reparse_compound_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(delete_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(mkdir_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(tdis_done); @@ -451,6 +454,8 @@ DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(rename_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(rmdir_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_eof_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_info_compound_err); +DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_reparse_compound_err); +DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(get_reparse_compound_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(mkdir_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(delete_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(tdis_err); diff --git a/fs/smb/client/transport.c b/fs/smb/client/transport.c index 4f717ad7c21b..994d70193432 100644 --- a/fs/smb/client/transport.c +++ b/fs/smb/client/transport.c @@ -400,10 +400,17 @@ unmask: server->conn_id, server->hostname); } smbd_done: - if (rc < 0 && rc != -EINTR) + /* + * there's hardly any use for the layers above to know the + * actual error code here. All they should do at this point is + * to retry the connection and hope it goes away. + */ + if (rc < 0 && rc != -EINTR && rc != -EAGAIN) { cifs_server_dbg(VFS, "Error %d sending data on socket to server\n", rc); - else if (rc > 0) + rc = -ECONNABORTED; + cifs_signal_cifsd_for_reconnect(server, false); + } else if (rc > 0) rc = 0; out: cifs_in_send_dec(server); @@ -428,8 +435,8 @@ smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, if (!(flags & CIFS_TRANSFORM_REQ)) return __smb_send_rqst(server, num_rqst, rqst); - if (num_rqst > MAX_COMPOUND - 1) - return -ENOMEM; + if (WARN_ON_ONCE(num_rqst > MAX_COMPOUND - 1)) + return -EIO; if (!server->ops->init_transform_rq) { cifs_server_dbg(VFS, "Encryption requested but transform callback is missing\n"); @@ -1026,6 +1033,9 @@ struct TCP_Server_Info *cifs_pick_channel(struct cifs_ses *ses) if (!server || server->terminate) continue; + if (CIFS_CHAN_NEEDS_RECONNECT(ses, i)) + continue; + /* * strictly speaking, we should pick up req_lock to read * server->in_flight. But it shouldn't matter much here if we diff --git a/fs/smb/server/asn1.c b/fs/smb/server/asn1.c index 4a4b2b03ff33..b931a99ab9c8 100644 --- a/fs/smb/server/asn1.c +++ b/fs/smb/server/asn1.c @@ -214,10 +214,15 @@ static int ksmbd_neg_token_alloc(void *context, size_t hdrlen, { struct ksmbd_conn *conn = context; + if (!vlen) + return -EINVAL; + conn->mechToken = kmemdup_nul(value, vlen, GFP_KERNEL); if (!conn->mechToken) return -ENOMEM; + conn->mechTokenLen = (unsigned int)vlen; + return 0; } diff --git a/fs/smb/server/auth.c b/fs/smb/server/auth.c index 229a6527870d..09b20039636e 100644 --- a/fs/smb/server/auth.c +++ b/fs/smb/server/auth.c @@ -208,10 +208,12 @@ out: /** * ksmbd_auth_ntlmv2() - NTLMv2 authentication handler - * @sess: session of connection + * @conn: connection + * @sess: session of connection * @ntlmv2: NTLMv2 challenge response * @blen: NTLMv2 blob length * @domain_name: domain name + * @cryptkey: session crypto key * * Return: 0 on success, error number on error */ @@ -294,7 +296,8 @@ out: * ksmbd_decode_ntlmssp_auth_blob() - helper function to construct * authenticate blob * @authblob: authenticate blob source pointer - * @usr: user details + * @blob_len: length of the @authblob message + * @conn: connection * @sess: session of connection * * Return: 0 on success, error number on error @@ -376,8 +379,8 @@ int ksmbd_decode_ntlmssp_auth_blob(struct authenticate_message *authblob, * ksmbd_decode_ntlmssp_neg_blob() - helper function to construct * negotiate blob * @negblob: negotiate blob source pointer - * @rsp: response header pointer to be updated - * @sess: session of connection + * @blob_len: length of the @authblob message + * @conn: connection * */ int ksmbd_decode_ntlmssp_neg_blob(struct negotiate_message *negblob, @@ -403,8 +406,7 @@ int ksmbd_decode_ntlmssp_neg_blob(struct negotiate_message *negblob, * ksmbd_build_ntlmssp_challenge_blob() - helper function to construct * challenge blob * @chgblob: challenge blob source pointer to initialize - * @rsp: response header pointer to be updated - * @sess: session of connection + * @conn: connection * */ unsigned int diff --git a/fs/smb/server/connection.c b/fs/smb/server/connection.c index b6fa1e285c40..09e1e7771592 100644 --- a/fs/smb/server/connection.c +++ b/fs/smb/server/connection.c @@ -284,6 +284,7 @@ int ksmbd_conn_handler_loop(void *p) goto out; conn->last_active = jiffies; + set_freezable(); while (ksmbd_conn_alive(conn)) { if (try_to_freeze()) continue; @@ -415,13 +416,7 @@ static void stop_sessions(void) again: down_read(&conn_list_lock); list_for_each_entry(conn, &conn_list, conns_list) { - struct task_struct *task; - t = conn->transport; - task = t->handler; - if (task) - ksmbd_debug(CONN, "Stop session handler %s/%d\n", - task->comm, task_pid_nr(task)); ksmbd_conn_set_exiting(conn); if (t->ops->shutdown) { up_read(&conn_list_lock); diff --git a/fs/smb/server/connection.h b/fs/smb/server/connection.h index 3c005246a32e..0e04cf8b1d89 100644 --- a/fs/smb/server/connection.h +++ b/fs/smb/server/connection.h @@ -88,6 +88,7 @@ struct ksmbd_conn { __u16 dialect; char *mechToken; + unsigned int mechTokenLen; struct ksmbd_conn_ops *conn_ops; @@ -134,7 +135,6 @@ struct ksmbd_transport_ops { struct ksmbd_transport { struct ksmbd_conn *conn; struct ksmbd_transport_ops *ops; - struct task_struct *handler; }; #define KSMBD_TCP_RECV_TIMEOUT (7 * HZ) diff --git a/fs/smb/server/ksmbd_netlink.h b/fs/smb/server/ksmbd_netlink.h index b7521e41402e..0ebf91ffa236 100644 --- a/fs/smb/server/ksmbd_netlink.h +++ b/fs/smb/server/ksmbd_netlink.h @@ -304,7 +304,8 @@ enum ksmbd_event { KSMBD_EVENT_SPNEGO_AUTHEN_REQUEST, KSMBD_EVENT_SPNEGO_AUTHEN_RESPONSE = 15, - KSMBD_EVENT_MAX + __KSMBD_EVENT_MAX, + KSMBD_EVENT_MAX = __KSMBD_EVENT_MAX - 1 }; /* diff --git a/fs/smb/server/mgmt/ksmbd_ida.c b/fs/smb/server/mgmt/ksmbd_ida.c index 54194d959a5e..a18e27e9e0cd 100644 --- a/fs/smb/server/mgmt/ksmbd_ida.c +++ b/fs/smb/server/mgmt/ksmbd_ida.c @@ -5,42 +5,33 @@ #include "ksmbd_ida.h" -static inline int __acquire_id(struct ida *ida, int from, int to) -{ - return ida_simple_get(ida, from, to, GFP_KERNEL); -} - int ksmbd_acquire_smb2_tid(struct ida *ida) { - int id; - - id = __acquire_id(ida, 1, 0xFFFFFFFF); - - return id; + return ida_alloc_range(ida, 1, 0xFFFFFFFE, GFP_KERNEL); } int ksmbd_acquire_smb2_uid(struct ida *ida) { int id; - id = __acquire_id(ida, 1, 0); + id = ida_alloc_min(ida, 1, GFP_KERNEL); if (id == 0xFFFE) - id = __acquire_id(ida, 1, 0); + id = ida_alloc_min(ida, 1, GFP_KERNEL); return id; } int ksmbd_acquire_async_msg_id(struct ida *ida) { - return __acquire_id(ida, 1, 0); + return ida_alloc_min(ida, 1, GFP_KERNEL); } int ksmbd_acquire_id(struct ida *ida) { - return __acquire_id(ida, 0, 0); + return ida_alloc(ida, GFP_KERNEL); } void ksmbd_release_id(struct ida *ida, int id) { - ida_simple_remove(ida, id); + ida_free(ida, id); } diff --git a/fs/smb/server/oplock.c b/fs/smb/server/oplock.c index 562b180459a1..53dfaac425c6 100644 --- a/fs/smb/server/oplock.c +++ b/fs/smb/server/oplock.c @@ -105,7 +105,7 @@ static int alloc_lease(struct oplock_info *opinfo, struct lease_ctx_info *lctx) lease->is_dir = lctx->is_dir; memcpy(lease->parent_lease_key, lctx->parent_lease_key, SMB2_LEASE_KEY_SIZE); lease->version = lctx->version; - lease->epoch = le16_to_cpu(lctx->epoch); + lease->epoch = le16_to_cpu(lctx->epoch) + 1; INIT_LIST_HEAD(&opinfo->lease_entry); opinfo->o_lease = lease; @@ -546,6 +546,7 @@ static struct oplock_info *same_client_has_lease(struct ksmbd_inode *ci, atomic_read(&ci->sop_count)) == 1) { if (lease->state != SMB2_LEASE_NONE_LE && lease->state == (lctx->req_state & lease->state)) { + lease->epoch++; lease->state |= lctx->req_state; if (lctx->req_state & SMB2_LEASE_WRITE_CACHING_LE) @@ -556,13 +557,17 @@ static struct oplock_info *same_client_has_lease(struct ksmbd_inode *ci, atomic_read(&ci->sop_count)) > 1) { if (lctx->req_state == (SMB2_LEASE_READ_CACHING_LE | - SMB2_LEASE_HANDLE_CACHING_LE)) + SMB2_LEASE_HANDLE_CACHING_LE)) { + lease->epoch++; lease->state = lctx->req_state; + } } if (lctx->req_state && lease->state == - SMB2_LEASE_NONE_LE) + SMB2_LEASE_NONE_LE) { + lease->epoch++; lease_none_upgrade(opinfo, lctx->req_state); + } } read_lock(&ci->m_lock); } @@ -1035,7 +1040,8 @@ static void copy_lease(struct oplock_info *op1, struct oplock_info *op2) SMB2_LEASE_KEY_SIZE); lease2->duration = lease1->duration; lease2->flags = lease1->flags; - lease2->epoch = lease1->epoch++; + lease2->epoch = lease1->epoch; + lease2->version = lease1->version; } static int add_lease_global_list(struct oplock_info *opinfo) @@ -1191,6 +1197,12 @@ int smb_grant_oplock(struct ksmbd_work *work, int req_op_level, u64 pid, bool prev_op_has_lease; __le32 prev_op_state = 0; + /* Only v2 leases handle the directory */ + if (S_ISDIR(file_inode(fp->filp)->i_mode)) { + if (!lctx || lctx->version != 2) + return 0; + } + opinfo = alloc_opinfo(work, pid, tid); if (!opinfo) return -ENOMEM; @@ -1447,7 +1459,7 @@ void create_lease_buf(u8 *rbuf, struct lease *lease) memcpy(buf->lcontext.LeaseKey, lease->lease_key, SMB2_LEASE_KEY_SIZE); buf->lcontext.LeaseFlags = lease->flags; - buf->lcontext.Epoch = cpu_to_le16(++lease->epoch); + buf->lcontext.Epoch = cpu_to_le16(lease->epoch); buf->lcontext.LeaseState = lease->state; memcpy(buf->lcontext.ParentLeaseKey, lease->parent_lease_key, SMB2_LEASE_KEY_SIZE); diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 652ab429bf2e..ba7a72a6a4f4 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -1414,7 +1414,10 @@ static struct ksmbd_user *session_user(struct ksmbd_conn *conn, char *name; unsigned int name_off, name_len, secbuf_len; - secbuf_len = le16_to_cpu(req->SecurityBufferLength); + if (conn->use_spnego && conn->mechToken) + secbuf_len = conn->mechTokenLen; + else + secbuf_len = le16_to_cpu(req->SecurityBufferLength); if (secbuf_len < sizeof(struct authenticate_message)) { ksmbd_debug(SMB, "blob len %d too small\n", secbuf_len); return NULL; @@ -1505,7 +1508,10 @@ static int ntlm_authenticate(struct ksmbd_work *work, struct authenticate_message *authblob; authblob = user_authblob(conn, req); - sz = le16_to_cpu(req->SecurityBufferLength); + if (conn->use_spnego && conn->mechToken) + sz = conn->mechTokenLen; + else + sz = le16_to_cpu(req->SecurityBufferLength); rc = ksmbd_decode_ntlmssp_auth_blob(authblob, sz, conn, sess); if (rc) { set_user_flag(sess->user, KSMBD_USER_FLAG_BAD_PASSWORD); @@ -1778,8 +1784,7 @@ int smb2_sess_setup(struct ksmbd_work *work) negblob_off = le16_to_cpu(req->SecurityBufferOffset); negblob_len = le16_to_cpu(req->SecurityBufferLength); - if (negblob_off < offsetof(struct smb2_sess_setup_req, Buffer) || - negblob_len < offsetof(struct negotiate_message, NegotiateFlags)) { + if (negblob_off < offsetof(struct smb2_sess_setup_req, Buffer)) { rc = -EINVAL; goto out_err; } @@ -1788,8 +1793,15 @@ int smb2_sess_setup(struct ksmbd_work *work) negblob_off); if (decode_negotiation_token(conn, negblob, negblob_len) == 0) { - if (conn->mechToken) + if (conn->mechToken) { negblob = (struct negotiate_message *)conn->mechToken; + negblob_len = conn->mechTokenLen; + } + } + + if (negblob_len < offsetof(struct negotiate_message, NegotiateFlags)) { + rc = -EINVAL; + goto out_err; } if (server_conf.auth_mechs & conn->auth_mechs) { @@ -2311,11 +2323,12 @@ out: * @eabuf: set info command buffer * @buf_len: set info command buffer length * @path: dentry path for get ea + * @get_write: get write access to a mount * * Return: 0 on success, otherwise error */ static int smb2_set_ea(struct smb2_ea_info *eabuf, unsigned int buf_len, - const struct path *path) + const struct path *path, bool get_write) { struct mnt_idmap *idmap = mnt_idmap(path->mnt); char *attr_name = NULL, *value; @@ -2971,7 +2984,7 @@ int smb2_open(struct ksmbd_work *work) &may_flags); if (!test_tree_conn_flag(tcon, KSMBD_TREE_CONN_FLAG_WRITABLE)) { - if (open_flags & O_CREAT) { + if (open_flags & (O_CREAT | O_TRUNC)) { ksmbd_debug(SMB, "User does not have write permission\n"); rc = -EACCES; @@ -3003,7 +3016,7 @@ int smb2_open(struct ksmbd_work *work) rc = smb2_set_ea(&ea_buf->ea, le32_to_cpu(ea_buf->ccontext.DataLength), - &path); + &path, false); if (rc == -EOPNOTSUPP) rc = 0; else if (rc) @@ -5568,6 +5581,7 @@ static int smb2_rename(struct ksmbd_work *work, if (!file_info->ReplaceIfExists) flags = RENAME_NOREPLACE; + smb_break_all_levII_oplock(work, fp, 0); rc = ksmbd_vfs_rename(work, &fp->filp->f_path, new_name, flags); out: kfree(new_name); @@ -5943,12 +5957,6 @@ static int smb2_set_info_file(struct ksmbd_work *work, struct ksmbd_file *fp, } case FILE_RENAME_INFORMATION: { - if (!test_tree_conn_flag(work->tcon, KSMBD_TREE_CONN_FLAG_WRITABLE)) { - ksmbd_debug(SMB, - "User does not have write permission\n"); - return -EACCES; - } - if (buf_len < sizeof(struct smb2_file_rename_info)) return -EINVAL; @@ -5968,12 +5976,6 @@ static int smb2_set_info_file(struct ksmbd_work *work, struct ksmbd_file *fp, } case FILE_DISPOSITION_INFORMATION: { - if (!test_tree_conn_flag(work->tcon, KSMBD_TREE_CONN_FLAG_WRITABLE)) { - ksmbd_debug(SMB, - "User does not have write permission\n"); - return -EACCES; - } - if (buf_len < sizeof(struct smb2_file_disposition_info)) return -EINVAL; @@ -5992,7 +5994,7 @@ static int smb2_set_info_file(struct ksmbd_work *work, struct ksmbd_file *fp, return -EINVAL; return smb2_set_ea((struct smb2_ea_info *)req->Buffer, - buf_len, &fp->filp->f_path); + buf_len, &fp->filp->f_path, true); } case FILE_POSITION_INFORMATION: { @@ -6035,7 +6037,7 @@ int smb2_set_info(struct ksmbd_work *work) { struct smb2_set_info_req *req; struct smb2_set_info_rsp *rsp; - struct ksmbd_file *fp; + struct ksmbd_file *fp = NULL; int rc = 0; unsigned int id = KSMBD_NO_FID, pid = KSMBD_NO_FID; @@ -6055,6 +6057,13 @@ int smb2_set_info(struct ksmbd_work *work) rsp = smb2_get_msg(work->response_buf); } + if (!test_tree_conn_flag(work->tcon, KSMBD_TREE_CONN_FLAG_WRITABLE)) { + ksmbd_debug(SMB, "User does not have write permission\n"); + pr_err("User does not have write permission\n"); + rc = -EACCES; + goto err_out; + } + if (!has_file_id(id)) { id = req->VolatileFileId; pid = req->PersistentFileId; diff --git a/fs/smb/server/smb_common.c b/fs/smb/server/smb_common.c index 6691ae68af0c..7c98bf699772 100644 --- a/fs/smb/server/smb_common.c +++ b/fs/smb/server/smb_common.c @@ -158,8 +158,12 @@ int ksmbd_verify_smb_message(struct ksmbd_work *work) */ bool ksmbd_smb_request(struct ksmbd_conn *conn) { - __le32 *proto = (__le32 *)smb2_get_msg(conn->request_buf); + __le32 *proto; + if (conn->request_buf[0] != 0) + return false; + + proto = (__le32 *)smb2_get_msg(conn->request_buf); if (*proto == SMB2_COMPRESSION_TRANSFORM_ID) { pr_err_ratelimited("smb2 compression not support yet"); return false; diff --git a/fs/smb/server/smbacl.c b/fs/smb/server/smbacl.c index 1164365533f0..1c9775f1efa5 100644 --- a/fs/smb/server/smbacl.c +++ b/fs/smb/server/smbacl.c @@ -401,10 +401,6 @@ static void parse_dacl(struct mnt_idmap *idmap, if (num_aces > ULONG_MAX / sizeof(struct smb_ace *)) return; - ppace = kmalloc_array(num_aces, sizeof(struct smb_ace *), GFP_KERNEL); - if (!ppace) - return; - ret = init_acl_state(&acl_state, num_aces); if (ret) return; @@ -414,6 +410,13 @@ static void parse_dacl(struct mnt_idmap *idmap, return; } + ppace = kmalloc_array(num_aces, sizeof(struct smb_ace *), GFP_KERNEL); + if (!ppace) { + free_acl_state(&default_acl_state); + free_acl_state(&acl_state); + return; + } + /* * reset rwx permissions for user/group/other. * Also, if num_aces is 0 i.e. DACL has no ACEs, diff --git a/fs/smb/server/transport_ipc.c b/fs/smb/server/transport_ipc.c index b49d47bdafc9..f29bb03f0dc4 100644 --- a/fs/smb/server/transport_ipc.c +++ b/fs/smb/server/transport_ipc.c @@ -74,7 +74,7 @@ static int handle_unsupported_event(struct sk_buff *skb, struct genl_info *info) static int handle_generic_event(struct sk_buff *skb, struct genl_info *info); static int ksmbd_ipc_heartbeat_request(void); -static const struct nla_policy ksmbd_nl_policy[KSMBD_EVENT_MAX] = { +static const struct nla_policy ksmbd_nl_policy[KSMBD_EVENT_MAX + 1] = { [KSMBD_EVENT_UNSPEC] = { .len = 0, }, @@ -403,7 +403,7 @@ static int handle_generic_event(struct sk_buff *skb, struct genl_info *info) return -EPERM; #endif - if (type >= KSMBD_EVENT_MAX) { + if (type > KSMBD_EVENT_MAX) { WARN_ON(1); return -EINVAL; } diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index c5629a68c8b7..8faa25c6e129 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -2039,6 +2039,7 @@ static bool rdma_frwr_is_supported(struct ib_device_attr *attrs) static int smb_direct_handle_connect_request(struct rdma_cm_id *new_cm_id) { struct smb_direct_transport *t; + struct task_struct *handler; int ret; if (!rdma_frwr_is_supported(&new_cm_id->device->attrs)) { @@ -2056,11 +2057,11 @@ static int smb_direct_handle_connect_request(struct rdma_cm_id *new_cm_id) if (ret) goto out_err; - KSMBD_TRANS(t)->handler = kthread_run(ksmbd_conn_handler_loop, - KSMBD_TRANS(t)->conn, "ksmbd:r%u", - smb_direct_port); - if (IS_ERR(KSMBD_TRANS(t)->handler)) { - ret = PTR_ERR(KSMBD_TRANS(t)->handler); + handler = kthread_run(ksmbd_conn_handler_loop, + KSMBD_TRANS(t)->conn, "ksmbd:r%u", + smb_direct_port); + if (IS_ERR(handler)) { + ret = PTR_ERR(handler); pr_err("Can't start thread\n"); goto out_err; } diff --git a/fs/smb/server/transport_tcp.c b/fs/smb/server/transport_tcp.c index eff7a1d793f0..002a3f0dc7c5 100644 --- a/fs/smb/server/transport_tcp.c +++ b/fs/smb/server/transport_tcp.c @@ -185,6 +185,7 @@ static int ksmbd_tcp_new_connection(struct socket *client_sk) struct sockaddr *csin; int rc = 0; struct tcp_transport *t; + struct task_struct *handler; t = alloc_transport(client_sk); if (!t) { @@ -199,13 +200,13 @@ static int ksmbd_tcp_new_connection(struct socket *client_sk) goto out_error; } - KSMBD_TRANS(t)->handler = kthread_run(ksmbd_conn_handler_loop, - KSMBD_TRANS(t)->conn, - "ksmbd:%u", - ksmbd_tcp_get_port(csin)); - if (IS_ERR(KSMBD_TRANS(t)->handler)) { + handler = kthread_run(ksmbd_conn_handler_loop, + KSMBD_TRANS(t)->conn, + "ksmbd:%u", + ksmbd_tcp_get_port(csin)); + if (IS_ERR(handler)) { pr_err("cannot start conn thread\n"); - rc = PTR_ERR(KSMBD_TRANS(t)->handler); + rc = PTR_ERR(handler); free_transport(t); } return rc; @@ -364,6 +365,7 @@ static int ksmbd_tcp_readv(struct tcp_transport *t, struct kvec *iov_orig, * @t: TCP transport instance * @buf: buffer to store read data from socket * @to_read: number of bytes to read from socket + * @max_retries: number of retries if reading from socket fails * * Return: on success return number of bytes read from socket, * otherwise return error number @@ -415,6 +417,7 @@ static void tcp_destroy_socket(struct socket *ksmbd_socket) /** * create_socket - create socket for ksmbd/0 + * @iface: interface to bind the created socket to * * Return: 0 on success, error number otherwise */ diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c index 4277750a6da1..a6961bfe3e13 100644 --- a/fs/smb/server/vfs.c +++ b/fs/smb/server/vfs.c @@ -49,6 +49,10 @@ static void ksmbd_vfs_inherit_owner(struct ksmbd_work *work, /** * ksmbd_vfs_lock_parent() - lock parent dentry if it is stable + * @parent: parent dentry + * @child: child dentry + * + * Returns: %0 on success, %-ENOENT if the parent dentry is not stable */ int ksmbd_vfs_lock_parent(struct dentry *parent, struct dentry *child) { @@ -360,7 +364,7 @@ out: /** * ksmbd_vfs_read() - vfs helper for smb file read * @work: smb work - * @fid: file id of open file + * @fp: ksmbd file pointer * @count: read byte count * @pos: file pos * @rbuf: read data buffer @@ -474,7 +478,7 @@ out: /** * ksmbd_vfs_write() - vfs helper for smb file write * @work: work - * @fid: file id of open file + * @fp: ksmbd file pointer * @buf: buf containing data for writing * @count: read byte count * @pos: file pos @@ -545,10 +549,8 @@ out: /** * ksmbd_vfs_getattr() - vfs helper for smb getattr - * @work: work - * @fid: file id of open file - * @attrs: inode attributes - * + * @path: path of dentry + * @stat: pointer to returned kernel stat structure * Return: 0 on success, otherwise error */ int ksmbd_vfs_getattr(const struct path *path, struct kstat *stat) @@ -565,6 +567,7 @@ int ksmbd_vfs_getattr(const struct path *path, struct kstat *stat) * ksmbd_vfs_fsync() - vfs helper for smb fsync * @work: work * @fid: file id of open file + * @p_id: persistent file id * * Return: 0 on success, otherwise error */ @@ -587,7 +590,8 @@ int ksmbd_vfs_fsync(struct ksmbd_work *work, u64 fid, u64 p_id) /** * ksmbd_vfs_remove_file() - vfs helper for smb rmdir or unlink - * @name: directory or file name that is relative to share + * @work: work + * @path: path of dentry * * Return: 0 on success, otherwise error */ @@ -623,6 +627,7 @@ out_err: /** * ksmbd_vfs_link() - vfs helper for creating smb hardlink + * @work: work * @oldname: source file name * @newname: hardlink name that is relative to share * @@ -715,6 +720,10 @@ retry: goto out2; trap = lock_rename_child(old_child, new_path.dentry); + if (IS_ERR(trap)) { + err = PTR_ERR(trap); + goto out_drop_write; + } old_parent = dget(old_child->d_parent); if (d_unhashed(old_child)) { @@ -777,6 +786,7 @@ out4: out3: dput(old_parent); unlock_rename(old_parent, new_path.dentry); +out_drop_write: mnt_drop_write(old_path->mnt); out2: path_put(&new_path); @@ -795,7 +805,7 @@ revert_fsids: /** * ksmbd_vfs_truncate() - vfs helper for smb file truncate * @work: work - * @fid: file id of old file + * @fp: ksmbd file pointer * @size: truncate to given size * * Return: 0 on success, otherwise error @@ -838,7 +848,6 @@ int ksmbd_vfs_truncate(struct ksmbd_work *work, * ksmbd_vfs_listxattr() - vfs helper for smb list extended attributes * @dentry: dentry of file for listing xattrs * @list: destination buffer - * @size: destination buffer length * * Return: xattr list length on success, otherwise error */ @@ -947,7 +956,7 @@ int ksmbd_vfs_setxattr(struct mnt_idmap *idmap, /** * ksmbd_vfs_set_fadvise() - convert smb IO caching options to linux options * @filp: file pointer for IO - * @options: smb IO options + * @option: smb IO options */ void ksmbd_vfs_set_fadvise(struct file *filp, __le32 option) { @@ -1159,6 +1168,7 @@ static bool __caseless_lookup(struct dir_context *ctx, const char *name, * @dir: path info * @name: filename to lookup * @namelen: filename length + * @um: &struct unicode_map to use * * Return: 0 on success, otherwise error */ @@ -1189,6 +1199,7 @@ static int ksmbd_vfs_lookup_in_dir(const struct path *dir, char *name, /** * ksmbd_vfs_kern_path_locked() - lookup a file and get path info + * @work: work * @name: file path that is relative to share * @flags: lookup flags * @parent_path: if lookup succeed, return parent_path info @@ -1636,6 +1647,8 @@ int ksmbd_vfs_get_dos_attrib_xattr(struct mnt_idmap *idmap, * ksmbd_vfs_init_kstat() - convert unix stat information to smb stat format * @p: destination buffer * @ksmbd_kstat: ksmbd kstat wrapper + * + * Returns: pointer to the converted &struct file_directory_info */ void *ksmbd_vfs_init_kstat(char **p, struct ksmbd_kstat *ksmbd_kstat) { diff --git a/fs/sysctls.c b/fs/sysctls.c index 76a0aee8c229..8dbde9a802fa 100644 --- a/fs/sysctls.c +++ b/fs/sysctls.c @@ -26,7 +26,6 @@ static struct ctl_table fs_shared_sysctls[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_MAXOLDUID, }, - { } }; static int __init init_fs_sysctls(void) diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index b6b6796e1616..4df2afa551dc 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -81,7 +81,7 @@ void sysfs_remove_dir(struct kobject *kobj) struct kernfs_node *kn = kobj->sd; /* - * In general, kboject owner is responsible for ensuring removal + * In general, kobject owner is responsible for ensuring removal * doesn't race with other operations and sysfs doesn't provide any * protection; however, when @kobj is used as a symlink target, the * symlinking entity usually doesn't own @kobj and thus has no diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c index f0677ea0ec24..110e8a272189 100644 --- a/fs/tracefs/event_inode.c +++ b/fs/tracefs/event_inode.c @@ -32,6 +32,18 @@ */ static DEFINE_MUTEX(eventfs_mutex); +/* Choose something "unique" ;-) */ +#define EVENTFS_FILE_INODE_INO 0x12c4e37 + +/* Just try to make something consistent and unique */ +static int eventfs_dir_ino(struct eventfs_inode *ei) +{ + if (!ei->ino) + ei->ino = get_next_ino(); + + return ei->ino; +} + /* * The eventfs_inode (ei) itself is protected by SRCU. It is released from * its parent's list and will have is_freed set (under eventfs_mutex). @@ -45,16 +57,55 @@ enum { EVENTFS_SAVE_MODE = BIT(16), EVENTFS_SAVE_UID = BIT(17), EVENTFS_SAVE_GID = BIT(18), + EVENTFS_TOPLEVEL = BIT(19), }; #define EVENTFS_MODE_MASK (EVENTFS_SAVE_MODE - 1) +/* + * eventfs_inode reference count management. + * + * NOTE! We count only references from dentries, in the + * form 'dentry->d_fsdata'. There are also references from + * directory inodes ('ti->private'), but the dentry reference + * count is always a superset of the inode reference count. + */ +static void release_ei(struct kref *ref) +{ + struct eventfs_inode *ei = container_of(ref, struct eventfs_inode, kref); + + WARN_ON_ONCE(!ei->is_freed); + + kfree(ei->entry_attrs); + kfree_const(ei->name); + kfree_rcu(ei, rcu); +} + +static inline void put_ei(struct eventfs_inode *ei) +{ + if (ei) + kref_put(&ei->kref, release_ei); +} + +static inline void free_ei(struct eventfs_inode *ei) +{ + if (ei) { + ei->is_freed = 1; + put_ei(ei); + } +} + +static inline struct eventfs_inode *get_ei(struct eventfs_inode *ei) +{ + if (ei) + kref_get(&ei->kref); + return ei; +} + static struct dentry *eventfs_root_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags); -static int dcache_dir_open_wrapper(struct inode *inode, struct file *file); -static int dcache_readdir_wrapper(struct file *file, struct dir_context *ctx); -static int eventfs_release(struct inode *inode, struct file *file); +static int eventfs_iterate(struct file *file, struct dir_context *ctx); static void update_attr(struct eventfs_attr *attr, struct iattr *iattr) { @@ -94,7 +145,7 @@ static int eventfs_set_attr(struct mnt_idmap *idmap, struct dentry *dentry, /* Preallocate the children mode array if necessary */ if (!(dentry->d_inode->i_mode & S_IFDIR)) { if (!ei->entry_attrs) { - ei->entry_attrs = kzalloc(sizeof(*ei->entry_attrs) * ei->nr_entries, + ei->entry_attrs = kcalloc(ei->nr_entries, sizeof(*ei->entry_attrs), GFP_NOFS); if (!ei->entry_attrs) { ret = -ENOMEM; @@ -117,10 +168,17 @@ static int eventfs_set_attr(struct mnt_idmap *idmap, struct dentry *dentry, * The events directory dentry is never freed, unless its * part of an instance that is deleted. It's attr is the * default for its child files and directories. - * Do not update it. It's not used for its own mode or ownership + * Do not update it. It's not used for its own mode or ownership. */ - if (!ei->is_events) + if (ei->is_events) { + /* But it still needs to know if it was modified */ + if (iattr->ia_valid & ATTR_UID) + ei->attr.mode |= EVENTFS_SAVE_UID; + if (iattr->ia_valid & ATTR_GID) + ei->attr.mode |= EVENTFS_SAVE_GID; + } else { update_attr(&ei->attr, iattr); + } } else { name = dentry->d_name.name; @@ -138,9 +196,63 @@ static int eventfs_set_attr(struct mnt_idmap *idmap, struct dentry *dentry, return ret; } +static void update_top_events_attr(struct eventfs_inode *ei, struct super_block *sb) +{ + struct inode *root; + + /* Only update if the "events" was on the top level */ + if (!ei || !(ei->attr.mode & EVENTFS_TOPLEVEL)) + return; + + /* Get the tracefs root inode. */ + root = d_inode(sb->s_root); + ei->attr.uid = root->i_uid; + ei->attr.gid = root->i_gid; +} + +static void set_top_events_ownership(struct inode *inode) +{ + struct tracefs_inode *ti = get_tracefs(inode); + struct eventfs_inode *ei = ti->private; + + /* The top events directory doesn't get automatically updated */ + if (!ei || !ei->is_events || !(ei->attr.mode & EVENTFS_TOPLEVEL)) + return; + + update_top_events_attr(ei, inode->i_sb); + + if (!(ei->attr.mode & EVENTFS_SAVE_UID)) + inode->i_uid = ei->attr.uid; + + if (!(ei->attr.mode & EVENTFS_SAVE_GID)) + inode->i_gid = ei->attr.gid; +} + +static int eventfs_get_attr(struct mnt_idmap *idmap, + const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags) +{ + struct dentry *dentry = path->dentry; + struct inode *inode = d_backing_inode(dentry); + + set_top_events_ownership(inode); + + generic_fillattr(idmap, request_mask, inode, stat); + return 0; +} + +static int eventfs_permission(struct mnt_idmap *idmap, + struct inode *inode, int mask) +{ + set_top_events_ownership(inode); + return generic_permission(idmap, inode, mask); +} + static const struct inode_operations eventfs_root_dir_inode_operations = { .lookup = eventfs_root_lookup, .setattr = eventfs_set_attr, + .getattr = eventfs_get_attr, + .permission = eventfs_permission, }; static const struct inode_operations eventfs_file_inode_operations = { @@ -148,11 +260,9 @@ static const struct inode_operations eventfs_file_inode_operations = { }; static const struct file_operations eventfs_file_operations = { - .open = dcache_dir_open_wrapper, .read = generic_read_dir, - .iterate_shared = dcache_readdir_wrapper, + .iterate_shared = eventfs_iterate, .llseek = generic_file_llseek, - .release = eventfs_release, }; /* Return the evenfs_inode of the "events" directory */ @@ -160,10 +270,11 @@ static struct eventfs_inode *eventfs_find_events(struct dentry *dentry) { struct eventfs_inode *ei; - mutex_lock(&eventfs_mutex); do { - /* The parent always has an ei, except for events itself */ - ei = dentry->d_parent->d_fsdata; + // The parent is stable because we do not do renames + dentry = dentry->d_parent; + // ... and directories always have d_fsdata + ei = dentry->d_fsdata; /* * If the ei is being freed, the ownership of the children @@ -173,10 +284,10 @@ static struct eventfs_inode *eventfs_find_events(struct dentry *dentry) ei = NULL; break; } - - dentry = ei->dentry; + // Walk upwards until you find the events inode } while (!ei->is_events); - mutex_unlock(&eventfs_mutex); + + update_top_events_attr(ei, dentry->d_sb); return ei; } @@ -206,50 +317,11 @@ static void update_inode_attr(struct dentry *dentry, struct inode *inode, inode->i_gid = attr->gid; } -static void update_gid(struct eventfs_inode *ei, kgid_t gid, int level) -{ - struct eventfs_inode *ei_child; - - /* at most we have events/system/event */ - if (WARN_ON_ONCE(level > 3)) - return; - - ei->attr.gid = gid; - - if (ei->entry_attrs) { - for (int i = 0; i < ei->nr_entries; i++) { - ei->entry_attrs[i].gid = gid; - } - } - - /* - * Only eventfs_inode with dentries are updated, make sure - * all eventfs_inodes are updated. If one of the children - * do not have a dentry, this function must traverse it. - */ - list_for_each_entry_srcu(ei_child, &ei->children, list, - srcu_read_lock_held(&eventfs_srcu)) { - if (!ei_child->dentry) - update_gid(ei_child, gid, level + 1); - } -} - -void eventfs_update_gid(struct dentry *dentry, kgid_t gid) -{ - struct eventfs_inode *ei = dentry->d_fsdata; - int idx; - - idx = srcu_read_lock(&eventfs_srcu); - update_gid(ei, gid, 0); - srcu_read_unlock(&eventfs_srcu, idx); -} - /** - * create_file - create a file in the tracefs filesystem - * @name: the name of the file to create. + * lookup_file - look up a file in the tracefs filesystem + * @dentry: the dentry to look up * @mode: the permission that the file should have. * @attr: saved attributes changed by user - * @parent: parent dentry for this file. * @data: something that the caller will want to get to later on. * @fop: struct file_operations that should be used for this file. * @@ -257,30 +329,25 @@ void eventfs_update_gid(struct dentry *dentry, kgid_t gid) * directory. The inode.i_private pointer will point to @data in the open() * call. */ -static struct dentry *create_file(const char *name, umode_t mode, +static struct dentry *lookup_file(struct eventfs_inode *parent_ei, + struct dentry *dentry, + umode_t mode, struct eventfs_attr *attr, - struct dentry *parent, void *data, + void *data, const struct file_operations *fop) { struct tracefs_inode *ti; - struct dentry *dentry; struct inode *inode; if (!(mode & S_IFMT)) mode |= S_IFREG; if (WARN_ON_ONCE(!S_ISREG(mode))) - return NULL; - - WARN_ON_ONCE(!parent); - dentry = eventfs_start_creating(name, parent); - - if (IS_ERR(dentry)) - return dentry; + return ERR_PTR(-EIO); inode = tracefs_get_inode(dentry->d_sb); if (unlikely(!inode)) - return eventfs_failed_creating(dentry); + return ERR_PTR(-ENOMEM); /* If the user updated the directory's attributes, use them */ update_inode_attr(dentry, inode, attr, mode); @@ -289,34 +356,36 @@ static struct dentry *create_file(const char *name, umode_t mode, inode->i_fop = fop; inode->i_private = data; + /* All files will have the same inode number */ + inode->i_ino = EVENTFS_FILE_INODE_INO; + ti = get_tracefs(inode); ti->flags |= TRACEFS_EVENT_INODE; - d_instantiate(dentry, inode); - fsnotify_create(dentry->d_parent->d_inode, dentry); - return eventfs_end_creating(dentry); + + // Files have their parent's ei as their fsdata + dentry->d_fsdata = get_ei(parent_ei); + + d_add(dentry, inode); + return NULL; }; /** - * create_dir - create a dir in the tracefs filesystem + * lookup_dir_entry - look up a dir in the tracefs filesystem + * @dentry: the directory to look up * @ei: the eventfs_inode that represents the directory to create - * @parent: parent dentry for this file. * - * This function will create a dentry for a directory represented by + * This function will look up a dentry for a directory represented by * a eventfs_inode. */ -static struct dentry *create_dir(struct eventfs_inode *ei, struct dentry *parent) +static struct dentry *lookup_dir_entry(struct dentry *dentry, + struct eventfs_inode *pei, struct eventfs_inode *ei) { struct tracefs_inode *ti; - struct dentry *dentry; struct inode *inode; - dentry = eventfs_start_creating(ei->name, parent); - if (IS_ERR(dentry)) - return dentry; - inode = tracefs_get_inode(dentry->d_sb); if (unlikely(!inode)) - return eventfs_failed_creating(dentry); + return ERR_PTR(-ENOMEM); /* If the user updated the directory's attributes, use them */ update_inode_attr(dentry, inode, &ei->attr, @@ -325,247 +394,72 @@ static struct dentry *create_dir(struct eventfs_inode *ei, struct dentry *parent inode->i_op = &eventfs_root_dir_inode_operations; inode->i_fop = &eventfs_file_operations; + /* All directories will have the same inode number */ + inode->i_ino = eventfs_dir_ino(ei); + ti = get_tracefs(inode); ti->flags |= TRACEFS_EVENT_INODE; + /* Only directories have ti->private set to an ei, not files */ + ti->private = ei; - inc_nlink(inode); - d_instantiate(dentry, inode); - inc_nlink(dentry->d_parent->d_inode); - fsnotify_mkdir(dentry->d_parent->d_inode, dentry); - return eventfs_end_creating(dentry); + dentry->d_fsdata = get_ei(ei); + + d_add(dentry, inode); + return NULL; } -static void free_ei(struct eventfs_inode *ei) +static inline struct eventfs_inode *alloc_ei(const char *name) { - kfree_const(ei->name); - kfree(ei->d_children); - kfree(ei->entry_attrs); - kfree(ei); + struct eventfs_inode *ei = kzalloc(sizeof(*ei), GFP_KERNEL); + + if (!ei) + return NULL; + + ei->name = kstrdup_const(name, GFP_KERNEL); + if (!ei->name) { + kfree(ei); + return NULL; + } + kref_init(&ei->kref); + return ei; } /** - * eventfs_set_ei_status_free - remove the dentry reference from an eventfs_inode - * @ti: the tracefs_inode of the dentry + * eventfs_d_release - dentry is going away * @dentry: dentry which has the reference to remove. * * Remove the association between a dentry from an eventfs_inode. */ -void eventfs_set_ei_status_free(struct tracefs_inode *ti, struct dentry *dentry) +void eventfs_d_release(struct dentry *dentry) { - struct eventfs_inode *ei; - int i; - - mutex_lock(&eventfs_mutex); - - ei = dentry->d_fsdata; - if (!ei) - goto out; - - /* This could belong to one of the files of the ei */ - if (ei->dentry != dentry) { - for (i = 0; i < ei->nr_entries; i++) { - if (ei->d_children[i] == dentry) - break; - } - if (WARN_ON_ONCE(i == ei->nr_entries)) - goto out; - ei->d_children[i] = NULL; - } else if (ei->is_freed) { - free_ei(ei); - } else { - ei->dentry = NULL; - } - - dentry->d_fsdata = NULL; - out: - mutex_unlock(&eventfs_mutex); + put_ei(dentry->d_fsdata); } /** - * create_file_dentry - create a dentry for a file of an eventfs_inode + * lookup_file_dentry - create a dentry for a file of an eventfs_inode * @ei: the eventfs_inode that the file will be created under - * @idx: the index into the d_children[] of the @ei + * @idx: the index into the entry_attrs[] of the @ei * @parent: The parent dentry of the created file. * @name: The name of the file to create * @mode: The mode of the file. * @data: The data to use to set the inode of the file with on open() * @fops: The fops of the file to be created. - * @lookup: If called by the lookup routine, in which case, dput() the created dentry. * * Create a dentry for a file of an eventfs_inode @ei and place it into the - * address located at @e_dentry. If the @e_dentry already has a dentry, then - * just do a dget() on it and return. Otherwise create the dentry and attach it. + * address located at @e_dentry. */ static struct dentry * -create_file_dentry(struct eventfs_inode *ei, int idx, - struct dentry *parent, const char *name, umode_t mode, void *data, - const struct file_operations *fops, bool lookup) +lookup_file_dentry(struct dentry *dentry, + struct eventfs_inode *ei, int idx, + umode_t mode, void *data, + const struct file_operations *fops) { struct eventfs_attr *attr = NULL; - struct dentry **e_dentry = &ei->d_children[idx]; - struct dentry *dentry; - - WARN_ON_ONCE(!inode_is_locked(parent->d_inode)); - mutex_lock(&eventfs_mutex); - if (ei->is_freed) { - mutex_unlock(&eventfs_mutex); - return NULL; - } - /* If the e_dentry already has a dentry, use it */ - if (*e_dentry) { - /* lookup does not need to up the ref count */ - if (!lookup) - dget(*e_dentry); - mutex_unlock(&eventfs_mutex); - return *e_dentry; - } - - /* ei->entry_attrs are protected by SRCU */ if (ei->entry_attrs) attr = &ei->entry_attrs[idx]; - mutex_unlock(&eventfs_mutex); - - dentry = create_file(name, mode, attr, parent, data, fops); - - mutex_lock(&eventfs_mutex); - - if (IS_ERR_OR_NULL(dentry)) { - /* - * When the mutex was released, something else could have - * created the dentry for this e_dentry. In which case - * use that one. - * - * If ei->is_freed is set, the e_dentry is currently on its - * way to being freed, don't return it. If e_dentry is NULL - * it means it was already freed. - */ - if (ei->is_freed) - dentry = NULL; - else - dentry = *e_dentry; - /* The lookup does not need to up the dentry refcount */ - if (dentry && !lookup) - dget(dentry); - mutex_unlock(&eventfs_mutex); - return dentry; - } - - if (!*e_dentry && !ei->is_freed) { - *e_dentry = dentry; - dentry->d_fsdata = ei; - } else { - /* - * Should never happen unless we get here due to being freed. - * Otherwise it means two dentries exist with the same name. - */ - WARN_ON_ONCE(!ei->is_freed); - dentry = NULL; - } - mutex_unlock(&eventfs_mutex); - - if (lookup) - dput(dentry); - - return dentry; -} - -/** - * eventfs_post_create_dir - post create dir routine - * @ei: eventfs_inode of recently created dir - * - * Map the meta-data of files within an eventfs dir to their parent dentry - */ -static void eventfs_post_create_dir(struct eventfs_inode *ei) -{ - struct eventfs_inode *ei_child; - struct tracefs_inode *ti; - - lockdep_assert_held(&eventfs_mutex); - - /* srcu lock already held */ - /* fill parent-child relation */ - list_for_each_entry_srcu(ei_child, &ei->children, list, - srcu_read_lock_held(&eventfs_srcu)) { - ei_child->d_parent = ei->dentry; - } - - ti = get_tracefs(ei->dentry->d_inode); - ti->private = ei; -} - -/** - * create_dir_dentry - Create a directory dentry for the eventfs_inode - * @pei: The eventfs_inode parent of ei. - * @ei: The eventfs_inode to create the directory for - * @parent: The dentry of the parent of this directory - * @lookup: True if this is called by the lookup code - * - * This creates and attaches a directory dentry to the eventfs_inode @ei. - */ -static struct dentry * -create_dir_dentry(struct eventfs_inode *pei, struct eventfs_inode *ei, - struct dentry *parent, bool lookup) -{ - struct dentry *dentry = NULL; - - WARN_ON_ONCE(!inode_is_locked(parent->d_inode)); - - mutex_lock(&eventfs_mutex); - if (pei->is_freed || ei->is_freed) { - mutex_unlock(&eventfs_mutex); - return NULL; - } - if (ei->dentry) { - /* If the dentry already has a dentry, use it */ - dentry = ei->dentry; - /* lookup does not need to up the ref count */ - if (!lookup) - dget(dentry); - mutex_unlock(&eventfs_mutex); - return dentry; - } - mutex_unlock(&eventfs_mutex); - - dentry = create_dir(ei, parent); - - mutex_lock(&eventfs_mutex); - - if (IS_ERR_OR_NULL(dentry) && !ei->is_freed) { - /* - * When the mutex was released, something else could have - * created the dentry for this e_dentry. In which case - * use that one. - * - * If ei->is_freed is set, the e_dentry is currently on its - * way to being freed. - */ - dentry = ei->dentry; - if (dentry && !lookup) - dget(dentry); - mutex_unlock(&eventfs_mutex); - return dentry; - } - - if (!ei->dentry && !ei->is_freed) { - ei->dentry = dentry; - eventfs_post_create_dir(ei); - dentry->d_fsdata = ei; - } else { - /* - * Should never happen unless we get here due to being freed. - * Otherwise it means two dentries exist with the same name. - */ - WARN_ON_ONCE(!ei->is_freed); - dentry = NULL; - } - mutex_unlock(&eventfs_mutex); - - if (lookup) - dput(dentry); - - return dentry; + return lookup_file(ei, dentry, mode, attr, data, fops); } /** @@ -582,250 +476,153 @@ static struct dentry *eventfs_root_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { - const struct file_operations *fops; - const struct eventfs_entry *entry; struct eventfs_inode *ei_child; struct tracefs_inode *ti; struct eventfs_inode *ei; - struct dentry *ei_dentry = NULL; - struct dentry *ret = NULL; const char *name = dentry->d_name.name; - bool created = false; - umode_t mode; - void *data; - int idx; - int i; - int r; + struct dentry *result = NULL; ti = get_tracefs(dir); if (!(ti->flags & TRACEFS_EVENT_INODE)) - return NULL; - - /* Grab srcu to prevent the ei from going away */ - idx = srcu_read_lock(&eventfs_srcu); + return ERR_PTR(-EIO); - /* - * Grab the eventfs_mutex to consistent value from ti->private. - * This s - */ mutex_lock(&eventfs_mutex); - ei = READ_ONCE(ti->private); - if (ei && !ei->is_freed) - ei_dentry = READ_ONCE(ei->dentry); - mutex_unlock(&eventfs_mutex); - if (!ei || !ei_dentry) + ei = ti->private; + if (!ei || ei->is_freed) goto out; - data = ei->data; - - list_for_each_entry_srcu(ei_child, &ei->children, list, - srcu_read_lock_held(&eventfs_srcu)) { + list_for_each_entry(ei_child, &ei->children, list) { if (strcmp(ei_child->name, name) != 0) continue; - ret = simple_lookup(dir, dentry, flags); - if (IS_ERR(ret)) + if (ei_child->is_freed) goto out; - create_dir_dentry(ei, ei_child, ei_dentry, true); - created = true; - break; - } - - if (created) + result = lookup_dir_entry(dentry, ei, ei_child); goto out; - - for (i = 0; i < ei->nr_entries; i++) { - entry = &ei->entries[i]; - if (strcmp(name, entry->name) == 0) { - void *cdata = data; - mutex_lock(&eventfs_mutex); - /* If ei->is_freed, then the event itself may be too */ - if (!ei->is_freed) - r = entry->callback(name, &mode, &cdata, &fops); - else - r = -1; - mutex_unlock(&eventfs_mutex); - if (r <= 0) - continue; - ret = simple_lookup(dir, dentry, flags); - if (IS_ERR(ret)) - goto out; - create_file_dentry(ei, i, ei_dentry, name, mode, cdata, - fops, true); - break; - } } - out: - srcu_read_unlock(&eventfs_srcu, idx); - return ret; -} - -struct dentry_list { - void *cursor; - struct dentry **dentries; -}; -/** - * eventfs_release - called to release eventfs file/dir - * @inode: inode to be released - * @file: file to be released (not used) - */ -static int eventfs_release(struct inode *inode, struct file *file) -{ - struct tracefs_inode *ti; - struct dentry_list *dlist = file->private_data; - void *cursor; - int i; + for (int i = 0; i < ei->nr_entries; i++) { + void *data; + umode_t mode; + const struct file_operations *fops; + const struct eventfs_entry *entry = &ei->entries[i]; - ti = get_tracefs(inode); - if (!(ti->flags & TRACEFS_EVENT_INODE)) - return -EINVAL; + if (strcmp(name, entry->name) != 0) + continue; - if (WARN_ON_ONCE(!dlist)) - return -EINVAL; + data = ei->data; + if (entry->callback(name, &mode, &data, &fops) <= 0) + goto out; - for (i = 0; dlist->dentries && dlist->dentries[i]; i++) { - dput(dlist->dentries[i]); + result = lookup_file_dentry(dentry, ei, i, mode, data, fops); + goto out; } - - cursor = dlist->cursor; - kfree(dlist->dentries); - kfree(dlist); - file->private_data = cursor; - return dcache_dir_close(inode, file); -} - -static int add_dentries(struct dentry ***dentries, struct dentry *d, int cnt) -{ - struct dentry **tmp; - - tmp = krealloc(*dentries, sizeof(d) * (cnt + 2), GFP_NOFS); - if (!tmp) - return -1; - tmp[cnt] = d; - tmp[cnt + 1] = NULL; - *dentries = tmp; - return 0; + out: + mutex_unlock(&eventfs_mutex); + return result; } -/** - * dcache_dir_open_wrapper - eventfs open wrapper - * @inode: not used - * @file: dir to be opened (to create it's children) - * - * Used to dynamic create file/dir with-in @file, all the - * file/dir will be created. If already created then references - * will be increased +/* + * Walk the children of a eventfs_inode to fill in getdents(). */ -static int dcache_dir_open_wrapper(struct inode *inode, struct file *file) +static int eventfs_iterate(struct file *file, struct dir_context *ctx) { const struct file_operations *fops; + struct inode *f_inode = file_inode(file); const struct eventfs_entry *entry; struct eventfs_inode *ei_child; struct tracefs_inode *ti; struct eventfs_inode *ei; - struct dentry_list *dlist; - struct dentry **dentries = NULL; - struct dentry *parent = file_dentry(file); - struct dentry *d; - struct inode *f_inode = file_inode(file); - const char *name = parent->d_name.name; + const char *name; umode_t mode; - void *data; - int cnt = 0; int idx; - int ret; - int i; - int r; + int ret = -EINVAL; + int ino; + int i, r, c; + + if (!dir_emit_dots(file, ctx)) + return 0; ti = get_tracefs(f_inode); if (!(ti->flags & TRACEFS_EVENT_INODE)) return -EINVAL; - if (WARN_ON_ONCE(file->private_data)) - return -EINVAL; + c = ctx->pos - 2; idx = srcu_read_lock(&eventfs_srcu); mutex_lock(&eventfs_mutex); ei = READ_ONCE(ti->private); + if (ei && ei->is_freed) + ei = NULL; mutex_unlock(&eventfs_mutex); - if (!ei) { - srcu_read_unlock(&eventfs_srcu, idx); - return -EINVAL; - } - + if (!ei) + goto out; - data = ei->data; + /* + * Need to create the dentries and inodes to have a consistent + * inode number. + */ + ret = 0; - dlist = kmalloc(sizeof(*dlist), GFP_KERNEL); - if (!dlist) { - srcu_read_unlock(&eventfs_srcu, idx); - return -ENOMEM; - } + /* Start at 'c' to jump over already read entries */ + for (i = c; i < ei->nr_entries; i++, ctx->pos++) { + void *cdata = ei->data; - inode_lock(parent->d_inode); - list_for_each_entry_srcu(ei_child, &ei->children, list, - srcu_read_lock_held(&eventfs_srcu)) { - d = create_dir_dentry(ei, ei_child, parent, false); - if (d) { - ret = add_dentries(&dentries, d, cnt); - if (ret < 0) - break; - cnt++; - } - } - - for (i = 0; i < ei->nr_entries; i++) { - void *cdata = data; entry = &ei->entries[i]; name = entry->name; + mutex_lock(&eventfs_mutex); - /* If ei->is_freed, then the event itself may be too */ - if (!ei->is_freed) - r = entry->callback(name, &mode, &cdata, &fops); - else - r = -1; + /* If ei->is_freed then just bail here, nothing more to do */ + if (ei->is_freed) { + mutex_unlock(&eventfs_mutex); + goto out; + } + r = entry->callback(name, &mode, &cdata, &fops); mutex_unlock(&eventfs_mutex); if (r <= 0) continue; - d = create_file_dentry(ei, i, parent, name, mode, cdata, fops, false); - if (d) { - ret = add_dentries(&dentries, d, cnt); - if (ret < 0) - break; - cnt++; + + ino = EVENTFS_FILE_INODE_INO; + + if (!dir_emit(ctx, name, strlen(name), ino, DT_REG)) + goto out; + } + + /* Subtract the skipped entries above */ + c -= min((unsigned int)c, (unsigned int)ei->nr_entries); + + list_for_each_entry_srcu(ei_child, &ei->children, list, + srcu_read_lock_held(&eventfs_srcu)) { + + if (c > 0) { + c--; + continue; } + + ctx->pos++; + + if (ei_child->is_freed) + continue; + + name = ei_child->name; + + ino = eventfs_dir_ino(ei_child); + + if (!dir_emit(ctx, name, strlen(name), ino, DT_DIR)) + goto out_dec; } - inode_unlock(parent->d_inode); + ret = 1; + out: srcu_read_unlock(&eventfs_srcu, idx); - ret = dcache_dir_open(inode, file); - /* - * dcache_dir_open() sets file->private_data to a dentry cursor. - * Need to save that but also save all the dentries that were - * opened by this function. - */ - dlist->cursor = file->private_data; - dlist->dentries = dentries; - file->private_data = dlist; return ret; -} - -/* - * This just sets the file->private_data back to the cursor and back. - */ -static int dcache_readdir_wrapper(struct file *file, struct dir_context *ctx) -{ - struct dentry_list *dlist = file->private_data; - int ret; - file->private_data = dlist->cursor; - ret = dcache_readdir(file, ctx); - dlist->cursor = file->private_data; - file->private_data = dlist; - return ret; + out_dec: + /* Incremented ctx->pos without adding something, reset it */ + ctx->pos--; + goto out; } /** @@ -872,25 +669,10 @@ struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode if (!parent) return ERR_PTR(-EINVAL); - ei = kzalloc(sizeof(*ei), GFP_KERNEL); + ei = alloc_ei(name); if (!ei) return ERR_PTR(-ENOMEM); - ei->name = kstrdup_const(name, GFP_KERNEL); - if (!ei->name) { - kfree(ei); - return ERR_PTR(-ENOMEM); - } - - if (size) { - ei->d_children = kzalloc(sizeof(*ei->d_children) * size, GFP_KERNEL); - if (!ei->d_children) { - kfree_const(ei->name); - kfree(ei); - return ERR_PTR(-ENOMEM); - } - } - ei->entries = entries; ei->nr_entries = size; ei->data = data; @@ -898,10 +680,8 @@ struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode INIT_LIST_HEAD(&ei->list); mutex_lock(&eventfs_mutex); - if (!parent->is_freed) { + if (!parent->is_freed) list_add_tail(&ei->list, &parent->children); - ei->d_parent = parent->dentry; - } mutex_unlock(&eventfs_mutex); /* Was the parent freed? */ @@ -941,33 +721,33 @@ struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry if (IS_ERR(dentry)) return ERR_CAST(dentry); - ei = kzalloc(sizeof(*ei), GFP_KERNEL); + ei = alloc_ei(name); if (!ei) - goto fail_ei; + goto fail; inode = tracefs_get_inode(dentry->d_sb); if (unlikely(!inode)) goto fail; - if (size) { - ei->d_children = kzalloc(sizeof(*ei->d_children) * size, GFP_KERNEL); - if (!ei->d_children) - goto fail; - } - - ei->dentry = dentry; + // Note: we have a ref to the dentry from tracefs_start_creating() + ei->events_dir = dentry; ei->entries = entries; ei->nr_entries = size; ei->is_events = 1; ei->data = data; - ei->name = kstrdup_const(name, GFP_KERNEL); - if (!ei->name) - goto fail; /* Save the ownership of this directory */ uid = d_inode(dentry->d_parent)->i_uid; gid = d_inode(dentry->d_parent)->i_gid; + /* + * If the events directory is of the top instance, then parent + * is NULL. Set the attr.mode to reflect this and its permissions will + * default to the tracefs root dentry. + */ + if (!parent) + ei->attr.mode = EVENTFS_TOPLEVEL; + /* This is used as the default ownership of the files and directories */ ei->attr.uid = uid; ei->attr.gid = gid; @@ -985,11 +765,19 @@ struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry inode->i_op = &eventfs_root_dir_inode_operations; inode->i_fop = &eventfs_file_operations; - dentry->d_fsdata = ei; + dentry->d_fsdata = get_ei(ei); - /* directory inodes start off with i_nlink == 2 (for "." entry) */ - inc_nlink(inode); + /* + * Keep all eventfs directories with i_nlink == 1. + * Due to the dynamic nature of the dentry creations and not + * wanting to add a pointer to the parent eventfs_inode in the + * eventfs_inode structure, keeping the i_nlink in sync with the + * number of directories would cause too much complexity for + * something not worth much. Keeping directory links at 1 + * tells userspace not to trust the link number. + */ d_instantiate(dentry, inode); + /* The dentry of the "events" parent does keep track though */ inc_nlink(dentry->d_parent->d_inode); fsnotify_mkdir(dentry->d_parent->d_inode, dentry); tracefs_end_creating(dentry); @@ -997,72 +785,11 @@ struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry return ei; fail: - kfree(ei->d_children); - kfree(ei); - fail_ei: + free_ei(ei); tracefs_failed_creating(dentry); return ERR_PTR(-ENOMEM); } -static LLIST_HEAD(free_list); - -static void eventfs_workfn(struct work_struct *work) -{ - struct eventfs_inode *ei, *tmp; - struct llist_node *llnode; - - llnode = llist_del_all(&free_list); - llist_for_each_entry_safe(ei, tmp, llnode, llist) { - /* This dput() matches the dget() from unhook_dentry() */ - for (int i = 0; i < ei->nr_entries; i++) { - if (ei->d_children[i]) - dput(ei->d_children[i]); - } - /* This should only get here if it had a dentry */ - if (!WARN_ON_ONCE(!ei->dentry)) - dput(ei->dentry); - } -} - -static DECLARE_WORK(eventfs_work, eventfs_workfn); - -static void free_rcu_ei(struct rcu_head *head) -{ - struct eventfs_inode *ei = container_of(head, struct eventfs_inode, rcu); - - if (ei->dentry) { - /* Do not free the ei until all references of dentry are gone */ - if (llist_add(&ei->llist, &free_list)) - queue_work(system_unbound_wq, &eventfs_work); - return; - } - - /* If the ei doesn't have a dentry, neither should its children */ - for (int i = 0; i < ei->nr_entries; i++) { - WARN_ON_ONCE(ei->d_children[i]); - } - - free_ei(ei); -} - -static void unhook_dentry(struct dentry *dentry) -{ - if (!dentry) - return; - /* - * Need to add a reference to the dentry that is expected by - * simple_recursive_removal(), which will include a dput(). - */ - dget(dentry); - - /* - * Also add a reference for the dput() in eventfs_workfn(). - * That is required as that dput() will free the ei after - * the SRCU grace period is over. - */ - dget(dentry); -} - /** * eventfs_remove_rec - remove eventfs dir or file from list * @ei: eventfs_inode to be removed. @@ -1075,8 +802,6 @@ static void eventfs_remove_rec(struct eventfs_inode *ei, int level) { struct eventfs_inode *ei_child; - if (!ei) - return; /* * Check recursion depth. It should never be greater than 3: * 0 - events/ @@ -1088,28 +813,11 @@ static void eventfs_remove_rec(struct eventfs_inode *ei, int level) return; /* search for nested folders or files */ - list_for_each_entry_srcu(ei_child, &ei->children, list, - lockdep_is_held(&eventfs_mutex)) { - /* Children only have dentry if parent does */ - WARN_ON_ONCE(ei_child->dentry && !ei->dentry); + list_for_each_entry(ei_child, &ei->children, list) eventfs_remove_rec(ei_child, level + 1); - } - - - ei->is_freed = 1; - for (int i = 0; i < ei->nr_entries; i++) { - if (ei->d_children[i]) { - /* Children only have dentry if parent does */ - WARN_ON_ONCE(!ei->dentry); - unhook_dentry(ei->d_children[i]); - } - } - - unhook_dentry(ei->dentry); - - list_del_rcu(&ei->list); - call_srcu(&eventfs_srcu, &ei->rcu, free_rcu_ei); + list_del(&ei->list); + free_ei(ei); } /** @@ -1120,22 +828,12 @@ static void eventfs_remove_rec(struct eventfs_inode *ei, int level) */ void eventfs_remove_dir(struct eventfs_inode *ei) { - struct dentry *dentry; - if (!ei) return; mutex_lock(&eventfs_mutex); - dentry = ei->dentry; eventfs_remove_rec(ei, 0); mutex_unlock(&eventfs_mutex); - - /* - * If any of the ei children has a dentry, then the ei itself - * must have a dentry. - */ - if (dentry) - simple_recursive_removal(dentry, NULL); } /** @@ -1148,7 +846,11 @@ void eventfs_remove_events_dir(struct eventfs_inode *ei) { struct dentry *dentry; - dentry = ei->dentry; + dentry = ei->events_dir; + if (!dentry) + return; + + ei->events_dir = NULL; eventfs_remove_dir(ei); /* @@ -1158,5 +860,6 @@ void eventfs_remove_events_dir(struct eventfs_inode *ei) * sticks around while the other ei->dentry are created * and destroyed dynamically. */ + d_invalidate(dentry); dput(dentry); } diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index bc86ffdb103b..d65ffad4c327 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -38,8 +38,6 @@ static struct inode *tracefs_alloc_inode(struct super_block *sb) if (!ti) return NULL; - ti->flags = 0; - return &ti->vfs_inode; } @@ -91,6 +89,7 @@ static int tracefs_syscall_mkdir(struct mnt_idmap *idmap, struct inode *inode, struct dentry *dentry, umode_t mode) { + struct tracefs_inode *ti; char *name; int ret; @@ -99,6 +98,15 @@ static int tracefs_syscall_mkdir(struct mnt_idmap *idmap, return -ENOMEM; /* + * This is a new directory that does not take the default of + * the rootfs. It becomes the default permissions for all the + * files and directories underneath it. + */ + ti = get_tracefs(inode); + ti->flags |= TRACEFS_INSTANCE_INODE; + ti->private = inode; + + /* * The mkdir call can call the generic functions that create * the files within the tracefs system. It is up to the individual * mkdir routine to handle races. @@ -141,10 +149,76 @@ static int tracefs_syscall_rmdir(struct inode *inode, struct dentry *dentry) return ret; } -static const struct inode_operations tracefs_dir_inode_operations = { +static void set_tracefs_inode_owner(struct inode *inode) +{ + struct tracefs_inode *ti = get_tracefs(inode); + struct inode *root_inode = ti->private; + + /* + * If this inode has never been referenced, then update + * the permissions to the superblock. + */ + if (!(ti->flags & TRACEFS_UID_PERM_SET)) + inode->i_uid = root_inode->i_uid; + + if (!(ti->flags & TRACEFS_GID_PERM_SET)) + inode->i_gid = root_inode->i_gid; +} + +static int tracefs_permission(struct mnt_idmap *idmap, + struct inode *inode, int mask) +{ + set_tracefs_inode_owner(inode); + return generic_permission(idmap, inode, mask); +} + +static int tracefs_getattr(struct mnt_idmap *idmap, + const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags) +{ + struct inode *inode = d_backing_inode(path->dentry); + + set_tracefs_inode_owner(inode); + generic_fillattr(idmap, request_mask, inode, stat); + return 0; +} + +static int tracefs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, + struct iattr *attr) +{ + unsigned int ia_valid = attr->ia_valid; + struct inode *inode = d_inode(dentry); + struct tracefs_inode *ti = get_tracefs(inode); + + if (ia_valid & ATTR_UID) + ti->flags |= TRACEFS_UID_PERM_SET; + + if (ia_valid & ATTR_GID) + ti->flags |= TRACEFS_GID_PERM_SET; + + return simple_setattr(idmap, dentry, attr); +} + +static const struct inode_operations tracefs_instance_dir_inode_operations = { .lookup = simple_lookup, .mkdir = tracefs_syscall_mkdir, .rmdir = tracefs_syscall_rmdir, + .permission = tracefs_permission, + .getattr = tracefs_getattr, + .setattr = tracefs_setattr, +}; + +static const struct inode_operations tracefs_dir_inode_operations = { + .lookup = simple_lookup, + .permission = tracefs_permission, + .getattr = tracefs_getattr, + .setattr = tracefs_setattr, +}; + +static const struct inode_operations tracefs_file_inode_operations = { + .permission = tracefs_permission, + .getattr = tracefs_getattr, + .setattr = tracefs_setattr, }; struct inode *tracefs_get_inode(struct super_block *sb) @@ -183,87 +257,6 @@ struct tracefs_fs_info { struct tracefs_mount_opts mount_opts; }; -static void change_gid(struct dentry *dentry, kgid_t gid) -{ - if (!dentry->d_inode) - return; - dentry->d_inode->i_gid = gid; -} - -/* - * Taken from d_walk, but without he need for handling renames. - * Nothing can be renamed while walking the list, as tracefs - * does not support renames. This is only called when mounting - * or remounting the file system, to set all the files to - * the given gid. - */ -static void set_gid(struct dentry *parent, kgid_t gid) -{ - struct dentry *this_parent; - struct list_head *next; - - this_parent = parent; - spin_lock(&this_parent->d_lock); - - change_gid(this_parent, gid); -repeat: - next = this_parent->d_subdirs.next; -resume: - while (next != &this_parent->d_subdirs) { - struct tracefs_inode *ti; - struct list_head *tmp = next; - struct dentry *dentry = list_entry(tmp, struct dentry, d_child); - next = tmp->next; - - /* Note, getdents() can add a cursor dentry with no inode */ - if (!dentry->d_inode) - continue; - - spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); - - change_gid(dentry, gid); - - /* If this is the events directory, update that too */ - ti = get_tracefs(dentry->d_inode); - if (ti && (ti->flags & TRACEFS_EVENT_INODE)) - eventfs_update_gid(dentry, gid); - - if (!list_empty(&dentry->d_subdirs)) { - spin_unlock(&this_parent->d_lock); - spin_release(&dentry->d_lock.dep_map, _RET_IP_); - this_parent = dentry; - spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_); - goto repeat; - } - spin_unlock(&dentry->d_lock); - } - /* - * All done at this level ... ascend and resume the search. - */ - rcu_read_lock(); -ascend: - if (this_parent != parent) { - struct dentry *child = this_parent; - this_parent = child->d_parent; - - spin_unlock(&child->d_lock); - spin_lock(&this_parent->d_lock); - - /* go into the first sibling still alive */ - do { - next = child->d_child.next; - if (next == &this_parent->d_subdirs) - goto ascend; - child = list_entry(next, struct dentry, d_child); - } while (unlikely(child->d_flags & DCACHE_DENTRY_KILLED)); - rcu_read_unlock(); - goto resume; - } - rcu_read_unlock(); - spin_unlock(&this_parent->d_lock); - return; -} - static int tracefs_parse_options(char *data, struct tracefs_mount_opts *opts) { substring_t args[MAX_OPT_ARGS]; @@ -336,10 +329,8 @@ static int tracefs_apply_options(struct super_block *sb, bool remount) if (!remount || opts->opts & BIT(Opt_uid)) inode->i_uid = opts->uid; - if (!remount || opts->opts & BIT(Opt_gid)) { - /* Set all the group ids to the mount option */ - set_gid(sb->s_root, opts->gid); - } + if (!remount || opts->opts & BIT(Opt_gid)) + inode->i_gid = opts->gid; return 0; } @@ -386,21 +377,30 @@ static const struct super_operations tracefs_super_operations = { .show_options = tracefs_show_options, }; -static void tracefs_dentry_iput(struct dentry *dentry, struct inode *inode) +/* + * It would be cleaner if eventfs had its own dentry ops. + * + * Note that d_revalidate is called potentially under RCU, + * so it can't take the eventfs mutex etc. It's fine - if + * we open a file just as it's marked dead, things will + * still work just fine, and just see the old stale case. + */ +static void tracefs_d_release(struct dentry *dentry) { - struct tracefs_inode *ti; + if (dentry->d_fsdata) + eventfs_d_release(dentry); +} - if (!dentry || !inode) - return; +static int tracefs_d_revalidate(struct dentry *dentry, unsigned int flags) +{ + struct eventfs_inode *ei = dentry->d_fsdata; - ti = get_tracefs(inode); - if (ti && ti->flags & TRACEFS_EVENT_INODE) - eventfs_set_ei_status_free(ti, dentry); - iput(inode); + return !(ei && ei->is_freed); } static const struct dentry_operations tracefs_dentry_operations = { - .d_iput = tracefs_dentry_iput, + .d_revalidate = tracefs_d_revalidate, + .d_release = tracefs_d_release, }; static int trace_fill_super(struct super_block *sb, void *data, int silent) @@ -504,73 +504,24 @@ struct dentry *tracefs_end_creating(struct dentry *dentry) return dentry; } -/** - * eventfs_start_creating - start the process of creating a dentry - * @name: Name of the file created for the dentry - * @parent: The parent dentry where this dentry will be created - * - * This is a simple helper function for the dynamically created eventfs - * files. When the directory of the eventfs files are accessed, their - * dentries are created on the fly. This function is used to start that - * process. - */ -struct dentry *eventfs_start_creating(const char *name, struct dentry *parent) +/* Find the inode that this will use for default */ +static struct inode *instance_inode(struct dentry *parent, struct inode *inode) { - struct dentry *dentry; - int error; - - /* Must always have a parent. */ - if (WARN_ON_ONCE(!parent)) - return ERR_PTR(-EINVAL); - - error = simple_pin_fs(&trace_fs_type, &tracefs_mount, - &tracefs_mount_count); - if (error) - return ERR_PTR(error); + struct tracefs_inode *ti; - if (unlikely(IS_DEADDIR(parent->d_inode))) - dentry = ERR_PTR(-ENOENT); - else - dentry = lookup_one_len(name, parent, strlen(name)); + /* If parent is NULL then use root inode */ + if (!parent) + return d_inode(inode->i_sb->s_root); - if (!IS_ERR(dentry) && dentry->d_inode) { - dput(dentry); - dentry = ERR_PTR(-EEXIST); + /* Find the inode that is flagged as an instance or the root inode */ + while (!IS_ROOT(parent)) { + ti = get_tracefs(d_inode(parent)); + if (ti->flags & TRACEFS_INSTANCE_INODE) + break; + parent = parent->d_parent; } - if (IS_ERR(dentry)) - simple_release_fs(&tracefs_mount, &tracefs_mount_count); - - return dentry; -} - -/** - * eventfs_failed_creating - clean up a failed eventfs dentry creation - * @dentry: The dentry to clean up - * - * If after calling eventfs_start_creating(), a failure is detected, the - * resources created by eventfs_start_creating() needs to be cleaned up. In - * that case, this function should be called to perform that clean up. - */ -struct dentry *eventfs_failed_creating(struct dentry *dentry) -{ - dput(dentry); - simple_release_fs(&tracefs_mount, &tracefs_mount_count); - return NULL; -} - -/** - * eventfs_end_creating - Finish the process of creating a eventfs dentry - * @dentry: The dentry that has successfully been created. - * - * This function is currently just a place holder to match - * eventfs_start_creating(). In case any synchronization needs to be added, - * this function will be used to implement that without having to modify - * the callers of eventfs_start_creating(). - */ -struct dentry *eventfs_end_creating(struct dentry *dentry) -{ - return dentry; + return d_inode(parent); } /** @@ -603,6 +554,7 @@ struct dentry *tracefs_create_file(const char *name, umode_t mode, struct dentry *parent, void *data, const struct file_operations *fops) { + struct tracefs_inode *ti; struct dentry *dentry; struct inode *inode; @@ -621,7 +573,11 @@ struct dentry *tracefs_create_file(const char *name, umode_t mode, if (unlikely(!inode)) return tracefs_failed_creating(dentry); + ti = get_tracefs(inode); + ti->private = instance_inode(parent, inode); + inode->i_mode = mode; + inode->i_op = &tracefs_file_inode_operations; inode->i_fop = fops ? fops : &tracefs_file_operations; inode->i_private = data; inode->i_uid = d_inode(dentry->d_parent)->i_uid; @@ -634,6 +590,7 @@ struct dentry *tracefs_create_file(const char *name, umode_t mode, static struct dentry *__create_dir(const char *name, struct dentry *parent, const struct inode_operations *ops) { + struct tracefs_inode *ti; struct dentry *dentry = tracefs_start_creating(name, parent); struct inode *inode; @@ -651,6 +608,9 @@ static struct dentry *__create_dir(const char *name, struct dentry *parent, inode->i_uid = d_inode(dentry->d_parent)->i_uid; inode->i_gid = d_inode(dentry->d_parent)->i_gid; + ti = get_tracefs(inode); + ti->private = instance_inode(parent, inode); + /* directory inodes start off with i_nlink == 2 (for "." entry) */ inc_nlink(inode); d_instantiate(dentry, inode); @@ -681,7 +641,7 @@ struct dentry *tracefs_create_dir(const char *name, struct dentry *parent) if (security_locked_down(LOCKDOWN_TRACEFS)) return NULL; - return __create_dir(name, parent, &simple_dir_inode_operations); + return __create_dir(name, parent, &tracefs_dir_inode_operations); } /** @@ -712,7 +672,7 @@ __init struct dentry *tracefs_create_instance_dir(const char *name, if (WARN_ON(tracefs_ops.mkdir || tracefs_ops.rmdir)) return NULL; - dentry = __create_dir(name, parent, &tracefs_dir_inode_operations); + dentry = __create_dir(name, parent, &tracefs_instance_dir_inode_operations); if (!dentry) return NULL; @@ -757,7 +717,11 @@ static void init_once(void *foo) { struct tracefs_inode *ti = (struct tracefs_inode *) foo; + /* inode_init_once() calls memset() on the vfs_inode portion */ inode_init_once(&ti->vfs_inode); + + /* Zero out the rest */ + memset_after(ti, 0, vfs_inode); } static int __init tracefs_init(void) diff --git a/fs/tracefs/internal.h b/fs/tracefs/internal.h index 42bdeb471a07..beb3dcd0e434 100644 --- a/fs/tracefs/internal.h +++ b/fs/tracefs/internal.h @@ -5,12 +5,16 @@ enum { TRACEFS_EVENT_INODE = BIT(1), TRACEFS_EVENT_TOP_INODE = BIT(2), + TRACEFS_GID_PERM_SET = BIT(3), + TRACEFS_UID_PERM_SET = BIT(4), + TRACEFS_INSTANCE_INODE = BIT(5), }; struct tracefs_inode { + struct inode vfs_inode; + /* The below gets initialized with memset_after(ti, 0, vfs_inode) */ unsigned long flags; void *private; - struct inode vfs_inode; }; /* @@ -28,42 +32,37 @@ struct eventfs_attr { /* * struct eventfs_inode - hold the properties of the eventfs directories. * @list: link list into the parent directory + * @rcu: Union with @list for freeing + * @children: link list into the child eventfs_inode * @entries: the array of entries representing the files in the directory * @name: the name of the directory to create - * @children: link list into the child eventfs_inode - * @dentry: the dentry of the directory - * @d_parent: pointer to the parent's dentry - * @d_children: The array of dentries to represent the files when created + * @events_dir: the dentry of the events directory * @entry_attrs: Saved mode and ownership of the @d_children - * @attr: Saved mode and ownership of eventfs_inode itself * @data: The private data to pass to the callbacks + * @attr: Saved mode and ownership of eventfs_inode itself * @is_freed: Flag set if the eventfs is on its way to be freed * Note if is_freed is set, then dentry is corrupted. + * @is_events: Flag set for only the top level "events" directory * @nr_entries: The number of items in @entries + * @ino: The saved inode number */ struct eventfs_inode { - struct list_head list; + union { + struct list_head list; + struct rcu_head rcu; + }; + struct list_head children; const struct eventfs_entry *entries; const char *name; - struct list_head children; - struct dentry *dentry; /* Check is_freed to access */ - struct dentry *d_parent; - struct dentry **d_children; + struct dentry *events_dir; struct eventfs_attr *entry_attrs; - struct eventfs_attr attr; void *data; - /* - * Union - used for deletion - * @llist: for calling dput() if needed after RCU - * @rcu: eventfs_inode to delete in RCU - */ - union { - struct llist_node llist; - struct rcu_head rcu; - }; + struct eventfs_attr attr; + struct kref kref; unsigned int is_freed:1; unsigned int is_events:1; unsigned int nr_entries:30; + unsigned int ino; }; static inline struct tracefs_inode *get_tracefs(const struct inode *inode) @@ -75,10 +74,7 @@ struct dentry *tracefs_start_creating(const char *name, struct dentry *parent); struct dentry *tracefs_end_creating(struct dentry *dentry); struct dentry *tracefs_failed_creating(struct dentry *dentry); struct inode *tracefs_get_inode(struct super_block *sb); -struct dentry *eventfs_start_creating(const char *name, struct dentry *parent); -struct dentry *eventfs_failed_creating(struct dentry *dentry); -struct dentry *eventfs_end_creating(struct dentry *dentry); -void eventfs_update_gid(struct dentry *dentry, kgid_t gid); -void eventfs_set_ei_status_free(struct tracefs_inode *ti, struct dentry *dentry); + +void eventfs_d_release(struct dentry *dentry); #endif /* _TRACEFS_INTERNAL_H */ diff --git a/fs/ubifs/auth.c b/fs/ubifs/auth.c index 0d561ecb6869..a4a0158f712d 100644 --- a/fs/ubifs/auth.c +++ b/fs/ubifs/auth.c @@ -18,7 +18,7 @@ #include "ubifs.h" /** - * ubifs_node_calc_hash - calculate the hash of a UBIFS node + * __ubifs_node_calc_hash - calculate the hash of a UBIFS node * @c: UBIFS file-system description object * @node: the node to calculate a hash for * @hash: the returned hash @@ -507,28 +507,13 @@ out: */ int ubifs_hmac_wkm(struct ubifs_info *c, u8 *hmac) { - SHASH_DESC_ON_STACK(shash, c->hmac_tfm); - int err; const char well_known_message[] = "UBIFS"; if (!ubifs_authenticated(c)) return 0; - shash->tfm = c->hmac_tfm; - - err = crypto_shash_init(shash); - if (err) - return err; - - err = crypto_shash_update(shash, well_known_message, - sizeof(well_known_message) - 1); - if (err < 0) - return err; - - err = crypto_shash_final(shash, hmac); - if (err) - return err; - return 0; + return crypto_shash_tfm_digest(c->hmac_tfm, well_known_message, + sizeof(well_known_message) - 1, hmac); } /* diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c index c4fc1047fc07..5b3a840098b0 100644 --- a/fs/ubifs/commit.c +++ b/fs/ubifs/commit.c @@ -70,18 +70,29 @@ static int nothing_to_commit(struct ubifs_info *c) return 0; /* + * Increasing @c->dirty_pn_cnt/@c->dirty_nn_cnt and marking + * nnodes/pnodes as dirty in run_gc() could race with following + * checking, which leads inconsistent states between @c->nroot + * and @c->dirty_pn_cnt/@c->dirty_nn_cnt, holding @c->lp_mutex + * to avoid that. + */ + mutex_lock(&c->lp_mutex); + /* * Even though the TNC is clean, the LPT tree may have dirty nodes. For * example, this may happen if the budgeting subsystem invoked GC to * make some free space, and the GC found an LEB with only dirty and * free space. In this case GC would just change the lprops of this * LEB (by turning all space into free space) and unmap it. */ - if (c->nroot && test_bit(DIRTY_CNODE, &c->nroot->flags)) + if (c->nroot && test_bit(DIRTY_CNODE, &c->nroot->flags)) { + mutex_unlock(&c->lp_mutex); return 0; + } ubifs_assert(c, atomic_long_read(&c->dirty_zn_cnt) == 0); ubifs_assert(c, c->dirty_pn_cnt == 0); ubifs_assert(c, c->dirty_nn_cnt == 0); + mutex_unlock(&c->lp_mutex); return 1; } diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 3b13c648d490..e413a9cf8ee3 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -1234,6 +1234,8 @@ out_cancel: dir_ui->ui_size = dir->i_size; mutex_unlock(&dir_ui->ui_mutex); out_inode: + /* Free inode->i_link before inode is marked as bad. */ + fscrypt_free_inode(inode); make_bad_inode(inode); iput(inode); out_fname: diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 2d2b39f843ce..5029eb3390a5 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -318,8 +318,9 @@ static int write_begin_slow(struct address_space *mapping, * This is a helper function for 'ubifs_write_begin()' which allocates budget * for the operation. The budget is allocated differently depending on whether * this is appending, whether the page is dirty or not, and so on. This - * function leaves the @ui->ui_mutex locked in case of appending. Returns zero - * in case of success and %-ENOSPC in case of failure. + * function leaves the @ui->ui_mutex locked in case of appending. + * + * Returns: %0 in case of success and %-ENOSPC in case of failure. */ static int allocate_budget(struct ubifs_info *c, struct page *page, struct ubifs_inode *ui, int appending) @@ -600,7 +601,7 @@ out: * @bu: bulk-read information * @n: next zbranch slot * - * This function returns %0 on success and a negative error code on failure. + * Returns: %0 on success and a negative error code on failure. */ static int populate_page(struct ubifs_info *c, struct page *page, struct bu_info *bu, int *n) @@ -711,7 +712,7 @@ out_err: * @bu: bulk-read information * @page1: first page to read * - * This function returns %1 if the bulk-read is done, otherwise %0 is returned. + * Returns: %1 if the bulk-read is done, otherwise %0 is returned. */ static int ubifs_do_bulk_read(struct ubifs_info *c, struct bu_info *bu, struct page *page1) @@ -821,7 +822,9 @@ out_bu_off: * Some flash media are capable of reading sequentially at faster rates. UBIFS * bulk-read facility is designed to take advantage of that, by reading in one * go consecutive data nodes that are also located consecutively in the same - * LEB. This function returns %1 if a bulk-read is done and %0 otherwise. + * LEB. + * + * Returns: %1 if a bulk-read is done and %0 otherwise. */ static int ubifs_bulk_read(struct page *page) { @@ -1109,7 +1112,9 @@ static void do_attr_changes(struct inode *inode, const struct iattr *attr) * @attr: inode attribute changes description * * This function implements VFS '->setattr()' call when the inode is truncated - * to a smaller size. Returns zero in case of success and a negative error code + * to a smaller size. + * + * Returns: %0 in case of success and a negative error code * in case of failure. */ static int do_truncation(struct ubifs_info *c, struct inode *inode, @@ -1215,7 +1220,9 @@ out_budg: * @attr: inode attribute changes description * * This function implements VFS '->setattr()' call for all cases except - * truncations to smaller size. Returns zero in case of success and a negative + * truncations to smaller size. + * + * Returns: %0 in case of success and a negative * error code in case of failure. */ static int do_setattr(struct ubifs_info *c, struct inode *inode, @@ -1360,6 +1367,8 @@ out: * This helper function checks if the inode mtime/ctime should be updated or * not. If current values of the time-stamps are within the UBIFS inode time * granularity, they are not updated. This is an optimization. + * + * Returns: %1 if time update is needed, %0 if not */ static inline int mctime_update_needed(const struct inode *inode, const struct timespec64 *now) @@ -1375,11 +1384,12 @@ static inline int mctime_update_needed(const struct inode *inode, /** * ubifs_update_time - update time of inode. * @inode: inode to update - * @time: timespec structure to hold the current time value * @flags: time updating control flag determines updating * which time fields of @inode * * This function updates time of the inode. + * + * Returns: %0 for success or a negative error code otherwise. */ int ubifs_update_time(struct inode *inode, int flags) { @@ -1413,7 +1423,9 @@ int ubifs_update_time(struct inode *inode, int flags) * @inode: inode to update * * This function updates mtime and ctime of the inode if it is not equivalent to - * current time. Returns zero in case of success and a negative error code in + * current time. + * + * Returns: %0 in case of success and a negative error code in * case of failure. */ static int update_mctime(struct inode *inode) diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c index c59d47fe7939..17da28d6247a 100644 --- a/fs/ubifs/replay.c +++ b/fs/ubifs/replay.c @@ -365,6 +365,7 @@ static void destroy_replay_list(struct ubifs_info *c) * @lnum: node logical eraseblock number * @offs: node offset * @len: node length + * @hash: node hash * @key: node key * @sqnum: sequence number * @deletion: non-zero if this is a deletion @@ -417,6 +418,7 @@ static int insert_node(struct ubifs_info *c, int lnum, int offs, int len, * @lnum: node logical eraseblock number * @offs: node offset * @len: node length + * @hash: node hash * @key: node key * @name: directory entry name * @nlen: directory entry name length diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 3508ac484da3..1bb6ed948927 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -125,8 +125,6 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry, udf_fiiter_release(&iter); inode = udf_iget(dir->i_sb, &loc); - if (IS_ERR(inode)) - return ERR_CAST(inode); } return d_splice_alias(inode, dentry); @@ -230,8 +228,6 @@ static int udf_fiiter_add_entry(struct inode *dir, struct dentry *dentry, char name[UDF_NAME_LEN_CS0]; if (dentry) { - if (!dentry->d_name.len) - return -EINVAL; namelen = udf_put_filename(dir->i_sb, dentry->d_name.name, dentry->d_name.len, name, UDF_NAME_LEN_CS0); @@ -766,7 +762,7 @@ static int udf_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct inode *old_inode = d_inode(old_dentry); struct inode *new_inode = d_inode(new_dentry); struct udf_fileident_iter oiter, niter, diriter; - bool has_diriter = false; + bool has_diriter = false, is_dir = false; int retval; struct kernel_lb_addr tloc; @@ -789,6 +785,9 @@ static int udf_rename(struct mnt_idmap *idmap, struct inode *old_dir, if (!empty_dir(new_inode)) goto out_oiter; } + is_dir = true; + } + if (is_dir && old_dir != new_dir) { retval = udf_fiiter_find_entry(old_inode, &dotdot_name, &diriter); if (retval == -ENOENT) { @@ -878,7 +877,9 @@ static int udf_rename(struct mnt_idmap *idmap, struct inode *old_dir, udf_dir_entry_len(&diriter.fi)); udf_fiiter_write_fi(&diriter, NULL); udf_fiiter_release(&diriter); + } + if (is_dir) { inode_dec_link_count(old_dir); if (new_inode) inode_dec_link_count(new_inode); @@ -899,7 +900,6 @@ out_oiter: static struct dentry *udf_get_parent(struct dentry *child) { struct kernel_lb_addr tloc; - struct inode *inode = NULL; struct udf_fileident_iter iter; int err; @@ -909,11 +909,7 @@ static struct dentry *udf_get_parent(struct dentry *child) tloc = lelb_to_cpu(iter.fi.icb.extLocation); udf_fiiter_release(&iter); - inode = udf_iget(child->d_sb, &tloc); - if (IS_ERR(inode)) - return ERR_CAST(inode); - - return d_obtain_alias(inode); + return d_obtain_alias(udf_iget(child->d_sb, &tloc)); } diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 6e2a4d6a0d8f..959551ff9a95 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -45,7 +45,6 @@ static struct ctl_table vm_userfaultfd_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, - { } }; #endif @@ -1033,7 +1032,7 @@ static int resolve_userfault_fork(struct userfaultfd_ctx *new, { int fd; - fd = anon_inode_getfd_secure("[userfaultfd]", &userfaultfd_fops, new, + fd = anon_inode_create_getfd("[userfaultfd]", &userfaultfd_fops, new, O_RDONLY | (new->flags & UFFD_SHARED_FCNTL_FLAGS), inode); if (fd < 0) return fd; @@ -2261,7 +2260,8 @@ static int new_userfaultfd(int flags) /* prevent the mm struct to be freed */ mmgrab(ctx->mm); - fd = anon_inode_getfd_secure("[userfaultfd]", &userfaultfd_fops, ctx, + /* Create a new inode so that the LSM can block the creation. */ + fd = anon_inode_create_getfd("[userfaultfd]", &userfaultfd_fops, ctx, O_RDONLY | (flags & UFFD_SHARED_FCNTL_FLAGS), NULL); if (fd < 0) { mmdrop(ctx->mm); diff --git a/fs/vboxsf/vboxsf_wrappers.c b/fs/vboxsf/vboxsf_wrappers.c index bfc78a097dae..5e8d4359e171 100644 --- a/fs/vboxsf/vboxsf_wrappers.c +++ b/fs/vboxsf/vboxsf_wrappers.c @@ -114,7 +114,7 @@ int vboxsf_unmap_folder(u32 root) * vboxsf_create - Create a new file or folder * @root: Root of the shared folder in which to create the file * @parsed_path: The path of the file or folder relative to the shared folder - * @param: create_parms Parameters for file/folder creation. + * @create_parms: Parameters for file/folder creation. * * Create a new file or folder or open an existing one in a shared folder. * Note this function always returns 0 / success unless an exceptional condition diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h index d071a6e32581..a6a6b2749241 100644 --- a/fs/verity/fsverity_private.h +++ b/fs/verity/fsverity_private.h @@ -100,6 +100,16 @@ fsverity_msg(const struct inode *inode, const char *level, #define fsverity_err(inode, fmt, ...) \ fsverity_msg((inode), KERN_ERR, fmt, ##__VA_ARGS__) +/* measure.c */ + +#ifdef CONFIG_BPF_SYSCALL +void __init fsverity_init_bpf(void); +#else +static inline void fsverity_init_bpf(void) +{ +} +#endif + /* open.c */ int fsverity_init_merkle_tree_params(struct merkle_tree_params *params, diff --git a/fs/verity/init.c b/fs/verity/init.c index a29f062f6047..cb2c9aac61ed 100644 --- a/fs/verity/init.c +++ b/fs/verity/init.c @@ -24,7 +24,6 @@ static struct ctl_table fsverity_sysctl_table[] = { .extra2 = SYSCTL_ONE, }, #endif - { } }; static void __init fsverity_init_sysctl(void) @@ -69,6 +68,7 @@ static int __init fsverity_init(void) fsverity_init_workqueue(); fsverity_init_sysctl(); fsverity_init_signature(); + fsverity_init_bpf(); return 0; } late_initcall(fsverity_init) diff --git a/fs/verity/measure.c b/fs/verity/measure.c index eec5956141da..bf7a5f4cccaf 100644 --- a/fs/verity/measure.c +++ b/fs/verity/measure.c @@ -7,6 +7,8 @@ #include "fsverity_private.h" +#include <linux/bpf.h> +#include <linux/btf.h> #include <linux/uaccess.h> /** @@ -100,3 +102,85 @@ int fsverity_get_digest(struct inode *inode, return hash_alg->digest_size; } EXPORT_SYMBOL_GPL(fsverity_get_digest); + +#ifdef CONFIG_BPF_SYSCALL + +/* bpf kfuncs */ +__bpf_kfunc_start_defs(); + +/** + * bpf_get_fsverity_digest: read fsverity digest of file + * @file: file to get digest from + * @digest_ptr: (out) dynptr for struct fsverity_digest + * + * Read fsverity_digest of *file* into *digest_ptr*. + * + * Return: 0 on success, a negative value on error. + */ +__bpf_kfunc int bpf_get_fsverity_digest(struct file *file, struct bpf_dynptr_kern *digest_ptr) +{ + const struct inode *inode = file_inode(file); + u32 dynptr_sz = __bpf_dynptr_size(digest_ptr); + struct fsverity_digest *arg; + const struct fsverity_info *vi; + const struct fsverity_hash_alg *hash_alg; + int out_digest_sz; + + if (dynptr_sz < sizeof(struct fsverity_digest)) + return -EINVAL; + + arg = __bpf_dynptr_data_rw(digest_ptr, dynptr_sz); + if (!arg) + return -EINVAL; + + if (!IS_ALIGNED((uintptr_t)arg, __alignof__(*arg))) + return -EINVAL; + + vi = fsverity_get_info(inode); + if (!vi) + return -ENODATA; /* not a verity file */ + + hash_alg = vi->tree_params.hash_alg; + + arg->digest_algorithm = hash_alg - fsverity_hash_algs; + arg->digest_size = hash_alg->digest_size; + + out_digest_sz = dynptr_sz - sizeof(struct fsverity_digest); + + /* copy digest */ + memcpy(arg->digest, vi->file_digest, min_t(int, hash_alg->digest_size, out_digest_sz)); + + /* fill the extra buffer with zeros */ + if (out_digest_sz > hash_alg->digest_size) + memset(arg->digest + arg->digest_size, 0, out_digest_sz - hash_alg->digest_size); + + return 0; +} + +__bpf_kfunc_end_defs(); + +BTF_SET8_START(fsverity_set_ids) +BTF_ID_FLAGS(func, bpf_get_fsverity_digest, KF_TRUSTED_ARGS) +BTF_SET8_END(fsverity_set_ids) + +static int bpf_get_fsverity_digest_filter(const struct bpf_prog *prog, u32 kfunc_id) +{ + if (!btf_id_set8_contains(&fsverity_set_ids, kfunc_id)) + return 0; + + /* Only allow to attach from LSM hooks, to avoid recursion */ + return prog->type != BPF_PROG_TYPE_LSM ? -EACCES : 0; +} + +static const struct btf_kfunc_id_set bpf_fsverity_set = { + .owner = THIS_MODULE, + .set = &fsverity_set_ids, + .filter = bpf_get_fsverity_digest_filter, +}; + +void __init fsverity_init_bpf(void) +{ + register_btf_kfunc_id_set(BPF_PROG_TYPE_LSM, &bpf_fsverity_set); +} + +#endif /* CONFIG_BPF_SYSCALL */ diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 9976a00a73f9..e965a48e7db9 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -421,10 +421,10 @@ xfs_attr_complete_op( bool do_replace = args->op_flags & XFS_DA_OP_REPLACE; args->op_flags &= ~XFS_DA_OP_REPLACE; - if (do_replace) { - args->attr_filter &= ~XFS_ATTR_INCOMPLETE; + args->attr_filter &= ~XFS_ATTR_INCOMPLETE; + if (do_replace) return replace_state; - } + return XFS_DAS_DONE; } diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 98aaca933bdd..f362345467fa 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -3277,7 +3277,7 @@ xfs_bmap_alloc_account( struct xfs_bmalloca *ap) { bool isrt = XFS_IS_REALTIME_INODE(ap->ip) && - (ap->flags & XFS_BMAPI_ATTRFORK); + !(ap->flags & XFS_BMAPI_ATTRFORK); uint fld; if (ap->flags & XFS_BMAPI_COWFORK) { diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index 31100120b2c5..e31663cb7b43 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -1119,20 +1119,6 @@ xfs_rtbitmap_blockcount( } /* - * Compute the maximum level number of the realtime summary file, as defined by - * mkfs. The historic use of highbit32 on a 64-bit quantity prohibited correct - * use of rt volumes with more than 2^32 extents. - */ -uint8_t -xfs_compute_rextslog( - xfs_rtbxlen_t rtextents) -{ - if (!rtextents) - return 0; - return xfs_highbit64(rtextents); -} - -/* * Compute the number of rtbitmap words needed to populate every block of a * bitmap that is large enough to track the given number of rt extents. */ diff --git a/fs/xfs/libxfs/xfs_rtbitmap.h b/fs/xfs/libxfs/xfs_rtbitmap.h index 274dc7dae1fa..152a66750af5 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.h +++ b/fs/xfs/libxfs/xfs_rtbitmap.h @@ -351,20 +351,6 @@ xfs_rtfree_extent( int xfs_rtfree_blocks(struct xfs_trans *tp, xfs_fsblock_t rtbno, xfs_filblks_t rtlen); -uint8_t xfs_compute_rextslog(xfs_rtbxlen_t rtextents); - -/* Do we support an rt volume having this number of rtextents? */ -static inline bool -xfs_validate_rtextents( - xfs_rtbxlen_t rtextents) -{ - /* No runt rt volumes */ - if (rtextents == 0) - return false; - - return true; -} - xfs_filblks_t xfs_rtbitmap_blockcount(struct xfs_mount *mp, xfs_rtbxlen_t rtextents); unsigned long long xfs_rtbitmap_wordcount(struct xfs_mount *mp, @@ -383,8 +369,6 @@ unsigned long long xfs_rtsummary_wordcount(struct xfs_mount *mp, # define xfs_rtsummary_read_buf(a,b) (-ENOSYS) # define xfs_rtbuf_cache_relse(a) (0) # define xfs_rtalloc_extent_is_free(m,t,s,l,i) (-ENOSYS) -# define xfs_compute_rextslog(rtx) (0) -# define xfs_validate_rtextents(rtx) (false) static inline xfs_filblks_t xfs_rtbitmap_blockcount(struct xfs_mount *mp, xfs_rtbxlen_t rtextents) { diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 4a9e8588f4c9..5bb6e2bd6dee 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -1377,3 +1377,17 @@ xfs_validate_stripe_geometry( } return true; } + +/* + * Compute the maximum level number of the realtime summary file, as defined by + * mkfs. The historic use of highbit32 on a 64-bit quantity prohibited correct + * use of rt volumes with more than 2^32 extents. + */ +uint8_t +xfs_compute_rextslog( + xfs_rtbxlen_t rtextents) +{ + if (!rtextents) + return 0; + return xfs_highbit64(rtextents); +} diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h index 19134b23c10b..2e8e8d63d4eb 100644 --- a/fs/xfs/libxfs/xfs_sb.h +++ b/fs/xfs/libxfs/xfs_sb.h @@ -38,4 +38,6 @@ extern int xfs_sb_get_secondary(struct xfs_mount *mp, extern bool xfs_validate_stripe_geometry(struct xfs_mount *mp, __s64 sunit, __s64 swidth, int sectorsize, bool silent); +uint8_t xfs_compute_rextslog(xfs_rtbxlen_t rtextents); + #endif /* __XFS_SB_H__ */ diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index 20b5375f2d9c..62e02d5380ad 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -251,4 +251,16 @@ bool xfs_verify_fileoff(struct xfs_mount *mp, xfs_fileoff_t off); bool xfs_verify_fileext(struct xfs_mount *mp, xfs_fileoff_t off, xfs_fileoff_t len); +/* Do we support an rt volume having this number of rtextents? */ +static inline bool +xfs_validate_rtextents( + xfs_rtbxlen_t rtextents) +{ + /* No runt rt volumes */ + if (rtextents == 0) + return false; + + return true; +} + #endif /* __XFS_TYPES_H__ */ diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c index 441ca9977652..46583517377f 100644 --- a/fs/xfs/scrub/rtbitmap.c +++ b/fs/xfs/scrub/rtbitmap.c @@ -15,6 +15,7 @@ #include "xfs_inode.h" #include "xfs_bmap.h" #include "xfs_bit.h" +#include "xfs_sb.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/repair.h" diff --git a/fs/xfs/scrub/rtsummary.c b/fs/xfs/scrub/rtsummary.c index fabd0ed9dfa6..b1ff4f33324a 100644 --- a/fs/xfs/scrub/rtsummary.c +++ b/fs/xfs/scrub/rtsummary.c @@ -16,6 +16,7 @@ #include "xfs_rtbitmap.h" #include "xfs_bit.h" #include "xfs_bmap.h" +#include "xfs_sb.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index aff20ddd4a9f..5a2512d20bd0 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1496,6 +1496,18 @@ xfs_fs_fill_super( mp->m_super = sb; + /* + * Copy VFS mount flags from the context now that all parameter parsing + * is guaranteed to have been completed by either the old mount API or + * the newer fsopen/fsconfig API. + */ + if (fc->sb_flags & SB_RDONLY) + set_bit(XFS_OPSTATE_READONLY, &mp->m_opstate); + if (fc->sb_flags & SB_DIRSYNC) + mp->m_features |= XFS_FEAT_DIRSYNC; + if (fc->sb_flags & SB_SYNCHRONOUS) + mp->m_features |= XFS_FEAT_WSYNC; + error = xfs_fs_validate_params(mp); if (error) return error; @@ -1965,6 +1977,11 @@ static const struct fs_context_operations xfs_context_ops = { .free = xfs_fs_free, }; +/* + * WARNING: do not initialise any parameters in this function that depend on + * mount option parsing having already been performed as this can be called from + * fsopen() before any parameters have been set. + */ static int xfs_init_fs_context( struct fs_context *fc) { @@ -1996,16 +2013,6 @@ static int xfs_init_fs_context( mp->m_logbsize = -1; mp->m_allocsize_log = 16; /* 64k */ - /* - * Copy binary VFS mount flags we are interested in. - */ - if (fc->sb_flags & SB_RDONLY) - set_bit(XFS_OPSTATE_READONLY, &mp->m_opstate); - if (fc->sb_flags & SB_DIRSYNC) - mp->m_features |= XFS_FEAT_DIRSYNC; - if (fc->sb_flags & SB_SYNCHRONOUS) - mp->m_features |= XFS_FEAT_WSYNC; - fc->s_fs_info = mp; fc->ops = &xfs_context_ops; diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c index fade33735393..a191f6560f98 100644 --- a/fs/xfs/xfs_sysctl.c +++ b/fs/xfs/xfs_sysctl.c @@ -206,8 +206,6 @@ static struct ctl_table xfs_table[] = { .extra2 = &xfs_params.stats_clear.max }, #endif /* CONFIG_PROC_FS */ - - {} }; int diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index e6a75401677d..93971742613a 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -747,8 +747,6 @@ static struct dentry *zonefs_lookup(struct inode *dir, struct dentry *dentry, inode = zonefs_get_dir_inode(dir, dentry); else inode = zonefs_get_file_inode(dir, dentry); - if (IS_ERR(inode)) - return ERR_CAST(inode); return d_splice_alias(inode, dentry); } |