Diffstat (limited to 'mm')
40 files changed, 812 insertions, 583 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 1b501db06417..0b7f4bb5cb80 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -242,6 +242,10 @@ menu "Slab allocator options" config SLUB def_bool y +config KVFREE_RCU_BATCHED + def_bool y + depends on !SLUB_TINY && !TINY_RCU + config SLUB_TINY bool "Configure for minimal memory footprint" depends on EXPERT diff --git a/mm/compaction.c b/mm/compaction.c index 12ed8425fa17..a3203d97123e 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -3181,6 +3181,7 @@ static int kcompactd(void *p) long default_timeout = msecs_to_jiffies(HPAGE_FRAG_CHECK_INTERVAL_MSEC); long timeout = default_timeout; + current->flags |= PF_KCOMPACTD; set_freezable(); pgdat->kcompactd_max_order = 0; @@ -3237,6 +3238,8 @@ static int kcompactd(void *p) pgdat->proactive_compact_trigger = false; } + current->flags &= ~PF_KCOMPACTD; + return 0; } diff --git a/mm/damon/core.c b/mm/damon/core.c index c7b981308862..384935ef4e65 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -373,6 +373,7 @@ struct damos *damon_new_scheme(struct damos_access_pattern *pattern, * or damon_attrs are updated. */ scheme->next_apply_sis = 0; + scheme->walk_completed = false; INIT_LIST_HEAD(&scheme->filters); scheme->stat = (struct damos_stat){}; INIT_LIST_HEAD(&scheme->list); @@ -1429,9 +1430,13 @@ static bool damos_filter_out(struct damon_ctx *ctx, struct damon_target *t, { struct damos_filter *filter; + s->core_filters_allowed = false; damos_for_each_filter(filter, s) { - if (damos_filter_match(ctx, t, r, filter)) + if (damos_filter_match(ctx, t, r, filter)) { + if (filter->allow) + s->core_filters_allowed = true; return !filter->allow; + } } return false; } diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 0f9ae14f884d..c834aa217835 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -236,6 +236,9 @@ static bool damos_pa_filter_out(struct damos *scheme, struct folio *folio) { struct damos_filter *filter; + if (scheme->core_filters_allowed) + return false; + damos_for_each_filter(filter, scheme) { if (damos_pa_filter_match(filter, folio)) return !filter->allow; diff --git a/mm/execmem.c b/mm/execmem.c index 317b6a8d35be..e6c4f5076ca8 100644 --- a/mm/execmem.c +++ b/mm/execmem.c @@ -257,7 +257,6 @@ out_unlock: static int execmem_cache_populate(struct execmem_range *range, size_t size) { unsigned long vm_flags = VM_ALLOW_HUGE_VMAP; - unsigned long start, end; struct vm_struct *vm; size_t alloc_size; int err = -ENOMEM; @@ -275,26 +274,18 @@ static int execmem_cache_populate(struct execmem_range *range, size_t size) /* fill memory with instructions that will trap */ execmem_fill_trapping_insns(p, alloc_size, /* writable = */ true); - start = (unsigned long)p; - end = start + alloc_size; - - vunmap_range(start, end); - - err = execmem_set_direct_map_valid(vm, false); - if (err) - goto err_free_mem; - - err = vmap_pages_range_noflush(start, end, range->pgprot, vm->pages, - PMD_SHIFT); + err = set_memory_rox((unsigned long)p, vm->nr_pages); if (err) goto err_free_mem; err = execmem_cache_add(p, alloc_size); if (err) - goto err_free_mem; + goto err_reset_direct_map; return 0; +err_reset_direct_map: + execmem_set_direct_map_valid(vm, true); err_free_mem: vfree(p); return err; @@ -344,6 +335,28 @@ static bool execmem_cache_free(void *ptr) return true; } + +int execmem_make_temp_rw(void *ptr, size_t size) +{ + unsigned int nr = PAGE_ALIGN(size) >> PAGE_SHIFT; + unsigned long addr = (unsigned long)ptr; + int ret; + + ret = set_memory_nx(addr, nr); + if (ret) + return ret; + + return set_memory_rw(addr, nr); 
+} + +int execmem_restore_rox(void *ptr, size_t size) +{ + unsigned int nr = PAGE_ALIGN(size) >> PAGE_SHIFT; + unsigned long addr = (unsigned long)ptr; + + return set_memory_rox(addr, nr); +} + #else /* CONFIG_ARCH_HAS_EXECMEM_ROX */ static void *execmem_cache_alloc(struct execmem_range *range, size_t size) { diff --git a/mm/filemap.c b/mm/filemap.c index 804d7365680c..e9404290f2c6 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -47,7 +47,6 @@ #include <linux/splice.h> #include <linux/rcupdate_wait.h> #include <linux/sched/mm.h> -#include <linux/fsnotify.h> #include <asm/pgalloc.h> #include <asm/tlbflush.h> #include "internal.h" @@ -445,7 +444,7 @@ EXPORT_SYMBOL(filemap_fdatawrite_range); * filemap_fdatawrite_range_kick - start writeback on a range * @mapping: target address_space * @start: index to start writeback on - * @end: last (non-inclusive) index for writeback + * @end: last (inclusive) index for writeback * * This is a non-integrity writeback helper, to start writing back folios * for the indicated range. @@ -1986,8 +1985,19 @@ no_page: if (err == -EEXIST) goto repeat; - if (err) + if (err) { + /* + * When NOWAIT I/O fails to allocate folios this could + * be due to a nonblocking memory allocation and not + * because the system actually is out of memory. + * Return -EAGAIN so that there caller retries in a + * blocking fashion instead of propagating -ENOMEM + * to the application. + */ + if ((fgp_flags & FGP_NOWAIT) && err == -ENOMEM) + err = -EAGAIN; return ERR_PTR(err); + } /* * filemap_add_folio locks the page, and for mmap * we expect an unlocked page. @@ -2897,8 +2907,7 @@ size_t splice_folio_into_pipe(struct pipe_inode_info *pipe, size = min(size, folio_size(folio) - offset); offset %= PAGE_SIZE; - while (spliced < size && - !pipe_full(pipe->head, pipe->tail, pipe->max_usage)) { + while (spliced < size && !pipe_is_full(pipe)) { struct pipe_buffer *buf = pipe_head_buf(pipe); size_t part = min_t(size_t, PAGE_SIZE - offset, size - spliced); @@ -2955,7 +2964,7 @@ ssize_t filemap_splice_read(struct file *in, loff_t *ppos, iocb.ki_pos = *ppos; /* Work out how much data we can actually add into the pipe */ - used = pipe_occupancy(pipe->head, pipe->tail); + used = pipe_buf_usage(pipe); npages = max_t(ssize_t, pipe->max_usage - used, 0); len = min_t(size_t, len, npages * PAGE_SIZE); @@ -3015,7 +3024,7 @@ ssize_t filemap_splice_read(struct file *in, loff_t *ppos, total_spliced += n; *ppos += n; in->f_ra.prev_pos = *ppos; - if (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) + if (pipe_is_full(pipe)) goto out; } @@ -3199,14 +3208,6 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) unsigned long vm_flags = vmf->vma->vm_flags; unsigned int mmap_miss; - /* - * If we have pre-content watches we need to disable readahead to make - * sure that we don't populate our mapping with 0 filled pages that we - * never emitted an event for. - */ - if (unlikely(FMODE_FSNOTIFY_HSM(file->f_mode))) - return fpin; - #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* Use the readahead code, even if readahead is disabled */ if ((vm_flags & VM_HUGEPAGE) && HPAGE_PMD_ORDER <= MAX_PAGECACHE_ORDER) { @@ -3275,10 +3276,6 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf, struct file *fpin = NULL; unsigned int mmap_miss; - /* See comment in do_sync_mmap_readahead. 
*/ - if (unlikely(FMODE_FSNOTIFY_HSM(file->f_mode))) - return fpin; - /* If we don't want any read-ahead, don't bother */ if (vmf->vma->vm_flags & VM_RAND_READ || !ra->ra_pages) return fpin; @@ -3338,48 +3335,6 @@ static vm_fault_t filemap_fault_recheck_pte_none(struct vm_fault *vmf) } /** - * filemap_fsnotify_fault - maybe emit a pre-content event. - * @vmf: struct vm_fault containing details of the fault. - * - * If we have a pre-content watch on this file we will emit an event for this - * range. If we return anything the fault caller should return immediately, we - * will return VM_FAULT_RETRY if we had to emit an event, which will trigger the - * fault again and then the fault handler will run the second time through. - * - * Return: a bitwise-OR of %VM_FAULT_ codes, 0 if nothing happened. - */ -vm_fault_t filemap_fsnotify_fault(struct vm_fault *vmf) -{ - struct file *fpin = NULL; - int mask = (vmf->flags & FAULT_FLAG_WRITE) ? MAY_WRITE : MAY_ACCESS; - loff_t pos = vmf->pgoff >> PAGE_SHIFT; - size_t count = PAGE_SIZE; - int err; - - /* - * We already did this and now we're retrying with everything locked, - * don't emit the event and continue. - */ - if (vmf->flags & FAULT_FLAG_TRIED) - return 0; - - /* No watches, we're done. */ - if (likely(!FMODE_FSNOTIFY_HSM(vmf->vma->vm_file->f_mode))) - return 0; - - fpin = maybe_unlock_mmap_for_io(vmf, fpin); - if (!fpin) - return VM_FAULT_SIGBUS; - - err = fsnotify_file_area_perm(fpin, mask, &pos, count); - fput(fpin); - if (err) - return VM_FAULT_SIGBUS; - return VM_FAULT_RETRY; -} -EXPORT_SYMBOL_GPL(filemap_fsnotify_fault); - -/** * filemap_fault - read in file data for page fault handling * @vmf: struct vm_fault containing details of the fault * @@ -3483,37 +3438,6 @@ retry_find: */ if (unlikely(!folio_test_uptodate(folio))) { /* - * If this is a precontent file we have can now emit an event to - * try and populate the folio. - */ - if (!(vmf->flags & FAULT_FLAG_TRIED) && - unlikely(FMODE_FSNOTIFY_HSM(file->f_mode))) { - loff_t pos = folio_pos(folio); - size_t count = folio_size(folio); - - /* We're NOWAIT, we have to retry. */ - if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) { - folio_unlock(folio); - goto out_retry; - } - - if (mapping_locked) - filemap_invalidate_unlock_shared(mapping); - mapping_locked = false; - - folio_unlock(folio); - fpin = maybe_unlock_mmap_for_io(vmf, fpin); - if (!fpin) - goto out_retry; - - error = fsnotify_file_area_perm(fpin, MAY_ACCESS, &pos, - count); - if (error) - ret = VM_FAULT_SIGBUS; - goto out_retry; - } - - /* * If the invalidate lock is not held, the folio was in cache * and uptodate and now it is not. Strange but possible since we * didn't hold the page lock all the time. Let's drop @@ -4170,17 +4094,6 @@ retry: bytes = min(chunk - offset, bytes); balance_dirty_pages_ratelimited(mapping); - /* - * Bring in the user page that we will copy from _first_. - * Otherwise there's a nasty deadlock on copying from the - * same page as we're writing to, without it being marked - * up-to-date. - */ - if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes)) { - status = -EFAULT; - break; - } - if (fatal_signal_pending(current)) { status = -EINTR; break; @@ -4198,6 +4111,12 @@ retry: if (mapping_writably_mapped(mapping)) flush_dcache_folio(folio); + /* + * Faults here on mmap()s can recurse into arbitrary + * filesystem code. Lots of locks are held that can + * deadlock. Use an atomic copy to avoid deadlocking + * in page fault handling. 
+ */ copied = copy_folio_from_iter_atomic(folio, offset, bytes, i); flush_dcache_folio(folio); @@ -4223,6 +4142,16 @@ retry: bytes = copied; goto retry; } + + /* + * 'folio' is now unlocked and faults on it can be + * handled. Ensure forward progress by trying to + * fault it in now. + */ + if (fault_in_iov_iter_readable(i, bytes) == bytes) { + status = -EFAULT; + break; + } } else { pos += status; written += status; @@ -2254,6 +2254,7 @@ EXPORT_SYMBOL(fault_in_readable); /** * get_dump_page() - pin user page in memory while writing it to core dump * @addr: user address + * @locked: a pointer to an int denoting whether the mmap sem is held * * Returns struct page pointer of user page pinned for dump, * to be freed afterwards by put_page(). @@ -2266,13 +2267,12 @@ EXPORT_SYMBOL(fault_in_readable); * Called without mmap_lock (takes and releases the mmap_lock by itself). */ #ifdef CONFIG_ELF_CORE -struct page *get_dump_page(unsigned long addr) +struct page *get_dump_page(unsigned long addr, int *locked) { struct page *page; - int locked = 0; int ret; - ret = __get_user_pages_locked(current->mm, addr, 1, &page, &locked, + ret = __get_user_pages_locked(current->mm, addr, 1, &page, locked, FOLL_FORCE | FOLL_DUMP | FOLL_GET); return (ret == 1) ? page : NULL; } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 3d3ebdc002d5..373781b21e5c 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3304,7 +3304,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, folio_account_cleaned(tail, inode_to_wb(folio->mapping->host)); __filemap_remove_folio(tail, NULL); - folio_put(tail); + folio_put_refs(tail, folio_nr_pages(tail)); } else if (!folio_test_anon(folio)) { __xa_store(&folio->mapping->i_pages, tail->index, tail, 0); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 65068671e460..318624c96584 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2135,6 +2135,8 @@ retry: if (!folio_ref_count(folio)) { struct hstate *h = folio_hstate(folio); + bool adjust_surplus = false; + if (!available_huge_pages(h)) goto out; @@ -2157,7 +2159,9 @@ retry: goto retry; } - remove_hugetlb_folio(h, folio, false); + if (h->surplus_huge_pages_node[folio_nid(folio)]) + adjust_surplus = true; + remove_hugetlb_folio(h, folio, adjust_surplus); h->max_huge_pages--; spin_unlock_irq(&hugetlb_lock); @@ -2177,7 +2181,7 @@ retry: rc = hugetlb_vmemmap_restore_folio(h, folio); if (rc) { spin_lock_irq(&hugetlb_lock); - add_hugetlb_folio(h, folio, false); + add_hugetlb_folio(h, folio, adjust_surplus); h->max_huge_pages++; goto out; } @@ -2943,6 +2947,14 @@ int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn) return ret; } +void wait_for_freed_hugetlb_folios(void) +{ + if (llist_empty(&hpage_freelist)) + return; + + flush_work(&free_hpage_work); +} + typedef enum { /* * For either 0/1: we checked the per-vma resv map, and one resv @@ -3145,7 +3157,7 @@ int __alloc_bootmem_huge_page(struct hstate *h, int nid) /* do node specific alloc */ if (nid != NUMA_NO_NODE) { - m = memblock_alloc_try_nid_raw(huge_page_size(h), huge_page_size(h), + m = memblock_alloc_exact_nid_raw(huge_page_size(h), huge_page_size(h), 0, MEMBLOCK_ALLOC_ACCESSIBLE, nid); if (!m) return 0; @@ -5447,7 +5459,7 @@ static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr, if (src_ptl != dst_ptl) spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); - pte = huge_ptep_get_and_clear(mm, old_addr, src_pte); + pte = huge_ptep_get_and_clear(mm, old_addr, src_pte, sz); if (need_clear_uffd_wp && 
pte_marker_uffd_wp(pte)) huge_pte_clear(mm, new_addr, dst_pte, sz); @@ -5622,7 +5634,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED); } - pte = huge_ptep_get_and_clear(mm, address, ptep); + pte = huge_ptep_get_and_clear(mm, address, ptep, sz); tlb_remove_huge_tlb_entry(h, tlb, ptep, address); if (huge_pte_dirty(pte)) set_page_dirty(page); diff --git a/mm/internal.h b/mm/internal.h index 109ef30fee11..20b3535935a3 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1115,7 +1115,7 @@ static inline int find_next_best_node(int node, nodemask_t *used_node_mask) * mm/memory-failure.c */ #ifdef CONFIG_MEMORY_FAILURE -void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu); +int unmap_poisoned_folio(struct folio *folio, unsigned long pfn, bool must_kill); void shake_folio(struct folio *folio); extern int hwpoison_filter(struct page *p); @@ -1138,8 +1138,9 @@ unsigned long page_mapped_in_vma(const struct page *page, struct vm_area_struct *vma); #else -static inline void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu) +static inline int unmap_poisoned_folio(struct folio *folio, unsigned long pfn, bool must_kill) { + return -EBUSY; } #endif diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 3fe77a360f1c..8357e1a33699 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -370,6 +370,36 @@ static inline bool init_task_stack_addr(const void *addr) sizeof(init_thread_union.stack)); } +/* + * This function is invoked with report_lock (a raw_spinlock) held. A + * PREEMPT_RT kernel cannot call find_vm_area() as it will acquire a sleeping + * rt_spinlock. + * + * For !RT kernel, the PROVE_RAW_LOCK_NESTING config option will print a + * lockdep warning for this raw_spinlock -> spinlock dependency. This config + * option is enabled by default to ensure better test coverage to expose this + * kind of RT kernel problem. This lockdep splat, however, can be suppressed + * by using DEFINE_WAIT_OVERRIDE_MAP() if it serves a useful purpose and the + * invalid PREEMPT_RT case has been taken care of. + */ +static inline struct vm_struct *kasan_find_vm_area(void *addr) +{ + static DEFINE_WAIT_OVERRIDE_MAP(vmalloc_map, LD_WAIT_SLEEP); + struct vm_struct *va; + + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + return NULL; + + /* + * Suppress lockdep warning and fetch vmalloc area of the + * offending address. 
+ */ + lock_map_acquire_try(&vmalloc_map); + va = find_vm_area(addr); + lock_map_release(&vmalloc_map); + return va; +} + static void print_address_description(void *addr, u8 tag, struct kasan_report_info *info) { @@ -399,7 +429,7 @@ static void print_address_description(void *addr, u8 tag, } if (is_vmalloc_addr(addr)) { - struct vm_struct *va = find_vm_area(addr); + struct vm_struct *va = kasan_find_vm_area(addr); if (va) { pr_err("The buggy address belongs to the virtual mapping at\n" @@ -409,6 +439,8 @@ static void print_address_description(void *addr, u8 tag, pr_err("\n"); page = vmalloc_to_page(addr); + } else { + pr_err("The buggy address %px belongs to a vmalloc virtual mapping\n", addr); } } diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c index 3ea50f09311f..3df45c25c1f6 100644 --- a/mm/kmsan/hooks.c +++ b/mm/kmsan/hooks.c @@ -357,6 +357,7 @@ void kmsan_handle_dma(struct page *page, size_t offset, size_t size, size -= to_go; } } +EXPORT_SYMBOL_GPL(kmsan_handle_dma); void kmsan_handle_dma_sg(struct scatterlist *sg, int nents, enum dma_data_direction dir) diff --git a/mm/madvise.c b/mm/madvise.c index 49f3a75046f6..08b207f8e61e 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -933,7 +933,16 @@ static long madvise_dontneed_free(struct vm_area_struct *vma, */ end = vma->vm_end; } - VM_WARN_ON(start >= end); + /* + * If the memory region between start and end was + * originally backed by 4kB pages and then remapped to + * be backed by hugepages while mmap_lock was dropped, + * the adjustment for hugetlb vma above may have rounded + * end down to the start address. + */ + if (start == end) + return 0; + VM_WARN_ON(start > end); } if (behavior == MADV_DONTNEED || behavior == MADV_DONTNEED_LOCKED) diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c index 2be6b9112808..2e9fa431bbf5 100644 --- a/mm/memcontrol-v1.c +++ b/mm/memcontrol-v1.c @@ -1855,9 +1855,11 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, if (val > MAX_SWAPPINESS) return -EINVAL; - if (!mem_cgroup_is_root(memcg)) + if (!mem_cgroup_is_root(memcg)) { + pr_info_once("Per memcg swappiness does not exist in cgroup v2. 
" + "See memory.reclaim or memory.swap.max there\n "); WRITE_ONCE(memcg->swappiness, val); - else + } else WRITE_ONCE(vm_swappiness, val); return 0; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 46f8b372d212..a037ec92881d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1921,9 +1921,18 @@ void drain_all_stock(struct mem_cgroup *root_memcg) static int memcg_hotplug_cpu_dead(unsigned int cpu) { struct memcg_stock_pcp *stock; + struct obj_cgroup *old; + unsigned long flags; stock = &per_cpu(memcg_stock, cpu); + + /* drain_obj_stock requires stock_lock */ + local_lock_irqsave(&memcg_stock.stock_lock, flags); + old = drain_obj_stock(stock); + local_unlock_irqrestore(&memcg_stock.stock_lock, flags); + drain_stock(stock); + obj_cgroup_put(old); return 0; } @@ -4166,6 +4175,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, memcg_memory_event(memcg, MEMCG_OOM); if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0)) break; + cond_resched(); } memcg_wb_domain_size_changed(memcg); @@ -4992,7 +5002,7 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry) mem_cgroup_id_get_many(swap_memcg, nr_entries - 1); mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries); - swap_cgroup_record(folio, entry); + swap_cgroup_record(folio, mem_cgroup_id(swap_memcg), entry); folio_unqueue_deferred_split(folio); folio->memcg_data = 0; @@ -5054,7 +5064,7 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) mem_cgroup_id_get_many(memcg, nr_pages - 1); mod_memcg_state(memcg, MEMCG_SWAP, nr_pages); - swap_cgroup_record(folio, entry); + swap_cgroup_record(folio, mem_cgroup_id(memcg), entry); return 0; } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 995a15eb67e2..327e02fdc029 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1556,11 +1556,35 @@ static int get_hwpoison_page(struct page *p, unsigned long flags) return ret; } -void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu) +int unmap_poisoned_folio(struct folio *folio, unsigned long pfn, bool must_kill) { - if (folio_test_hugetlb(folio) && !folio_test_anon(folio)) { - struct address_space *mapping; + enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_SYNC | TTU_HWPOISON; + struct address_space *mapping; + + if (folio_test_swapcache(folio)) { + pr_err("%#lx: keeping poisoned page in swap cache\n", pfn); + ttu &= ~TTU_HWPOISON; + } + /* + * Propagate the dirty bit from PTEs to struct page first, because we + * need this to decide if we should kill or just drop the page. + * XXX: the dirty test could be racy: set_page_dirty() may not always + * be called inside page lock (it's recommended but not enforced). + */ + mapping = folio_mapping(folio); + if (!must_kill && !folio_test_dirty(folio) && mapping && + mapping_can_writeback(mapping)) { + if (folio_mkclean(folio)) { + folio_set_dirty(folio); + } else { + ttu &= ~TTU_HWPOISON; + pr_info("%#lx: corrupted page was clean: dropped without side effects\n", + pfn); + } + } + + if (folio_test_hugetlb(folio) && !folio_test_anon(folio)) { /* * For hugetlb folios in shared mappings, try_to_unmap * could potentially call huge_pmd_unshare. 
Because of @@ -1572,7 +1596,7 @@ void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu) if (!mapping) { pr_info("%#lx: could not lock mapping for mapped hugetlb folio\n", folio_pfn(folio)); - return; + return -EBUSY; } try_to_unmap(folio, ttu|TTU_RMAP_LOCKED); @@ -1580,6 +1604,8 @@ void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu) } else { try_to_unmap(folio, ttu); } + + return folio_mapped(folio) ? -EBUSY : 0; } /* @@ -1589,8 +1615,6 @@ void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu) static bool hwpoison_user_mappings(struct folio *folio, struct page *p, unsigned long pfn, int flags) { - enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_SYNC | TTU_HWPOISON; - struct address_space *mapping; LIST_HEAD(tokill); bool unmap_success; int forcekill; @@ -1613,29 +1637,6 @@ static bool hwpoison_user_mappings(struct folio *folio, struct page *p, if (!folio_mapped(folio)) return true; - if (folio_test_swapcache(folio)) { - pr_err("%#lx: keeping poisoned page in swap cache\n", pfn); - ttu &= ~TTU_HWPOISON; - } - - /* - * Propagate the dirty bit from PTEs to struct page first, because we - * need this to decide if we should kill or just drop the page. - * XXX: the dirty test could be racy: set_page_dirty() may not always - * be called inside page lock (it's recommended but not enforced). - */ - mapping = folio_mapping(folio); - if (!(flags & MF_MUST_KILL) && !folio_test_dirty(folio) && mapping && - mapping_can_writeback(mapping)) { - if (folio_mkclean(folio)) { - folio_set_dirty(folio); - } else { - ttu &= ~TTU_HWPOISON; - pr_info("%#lx: corrupted page was clean: dropped without side effects\n", - pfn); - } - } - /* * First collect all the processes that have the page * mapped in dirty form. This has to be done before try_to_unmap, @@ -1643,9 +1644,7 @@ static bool hwpoison_user_mappings(struct folio *folio, struct page *p, */ collect_procs(folio, p, &tokill, flags & MF_ACTION_REQUIRED); - unmap_poisoned_folio(folio, ttu); - - unmap_success = !folio_mapped(folio); + unmap_success = !unmap_poisoned_folio(folio, pfn, flags & MF_MUST_KILL); if (!unmap_success) pr_err("%#lx: failed to unmap page (folio mapcount=%d)\n", pfn, folio_mapcount(folio)); diff --git a/mm/memory.c b/mm/memory.c index 539c0f7c6d54..4f6d9766a046 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -76,7 +76,6 @@ #include <linux/ptrace.h> #include <linux/vmalloc.h> #include <linux/sched/sysctl.h> -#include <linux/fsnotify.h> #include <trace/events/kmem.h> @@ -1719,7 +1718,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, pmd_t pmdval; unsigned long start = addr; bool can_reclaim_pt = reclaim_pt_is_enabled(start, end, details); - bool direct_reclaim = false; + bool direct_reclaim = true; int nr; retry: @@ -1734,8 +1733,10 @@ retry: do { bool any_skipped = false; - if (need_resched()) + if (need_resched()) { + direct_reclaim = false; break; + } nr = do_zap_pte_range(tlb, vma, pte, addr, end, details, rss, &force_flush, &force_break, &any_skipped); @@ -1743,11 +1744,20 @@ retry: can_reclaim_pt = false; if (unlikely(force_break)) { addr += nr * PAGE_SIZE; + direct_reclaim = false; break; } } while (pte += nr, addr += PAGE_SIZE * nr, addr != end); - if (can_reclaim_pt && addr == end) + /* + * Fast path: try to hold the pmd lock and unmap the PTE page. 
+ * + * If the pte lock was released midway (retry case), or if the attempt + * to hold the pmd lock failed, then we need to recheck all pte entries + * to ensure they are still none, thereby preventing the pte entries + * from being repopulated by another thread. + */ + if (can_reclaim_pt && direct_reclaim && addr == end) direct_reclaim = try_get_and_clear_pmd(mm, pmd, &pmdval); add_mm_rss_vec(mm, rss); @@ -3040,8 +3050,10 @@ static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr, next = pgd_addr_end(addr, end); if (pgd_none(*pgd) && !create) continue; - if (WARN_ON_ONCE(pgd_leaf(*pgd))) - return -EINVAL; + if (WARN_ON_ONCE(pgd_leaf(*pgd))) { + err = -EINVAL; + break; + } if (!pgd_none(*pgd) && WARN_ON_ONCE(pgd_bad(*pgd))) { if (!create) continue; @@ -5172,7 +5184,11 @@ vm_fault_t finish_fault(struct vm_fault *vmf) bool is_cow = (vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED); int type, nr_pages; - unsigned long addr = vmf->address; + unsigned long addr; + bool needs_fallback = false; + +fallback: + addr = vmf->address; /* Did we COW the page? */ if (is_cow) @@ -5211,7 +5227,8 @@ vm_fault_t finish_fault(struct vm_fault *vmf) * approach also applies to non-anonymous-shmem faults to avoid * inflating the RSS of the process. */ - if (!vma_is_anon_shmem(vma) || unlikely(userfaultfd_armed(vma))) { + if (!vma_is_anon_shmem(vma) || unlikely(userfaultfd_armed(vma)) || + unlikely(needs_fallback)) { nr_pages = 1; } else if (nr_pages > 1) { pgoff_t idx = folio_page_idx(folio, page); @@ -5247,9 +5264,9 @@ vm_fault_t finish_fault(struct vm_fault *vmf) ret = VM_FAULT_NOPAGE; goto unlock; } else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) { - update_mmu_tlb_range(vma, addr, vmf->pte, nr_pages); - ret = VM_FAULT_NOPAGE; - goto unlock; + needs_fallback = true; + pte_unmap_unlock(vmf->pte, vmf->ptl); + goto fallback; } folio_ref_add(folio, nr_pages - 1); @@ -5732,17 +5749,8 @@ out_map: static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; - if (vma_is_anonymous(vma)) return do_huge_pmd_anonymous_page(vmf); - /* - * Currently we just emit PAGE_SIZE for our fault events, so don't allow - * a huge fault if we have a pre content watch on this file. This would - * be trivial to support, but there would need to be tests to ensure - * this works properly and those don't exist currently. - */ - if (unlikely(FMODE_FSNOTIFY_HSM(vma->vm_file->f_mode))) - return VM_FAULT_FALLBACK; if (vma->vm_ops->huge_fault) return vma->vm_ops->huge_fault(vmf, PMD_ORDER); return VM_FAULT_FALLBACK; @@ -5766,9 +5774,6 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf) } if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) { - /* See comment in create_huge_pmd. */ - if (unlikely(FMODE_FSNOTIFY_HSM(vma->vm_file->f_mode))) - goto split; if (vma->vm_ops->huge_fault) { ret = vma->vm_ops->huge_fault(vmf, PMD_ORDER); if (!(ret & VM_FAULT_FALLBACK)) @@ -5791,9 +5796,6 @@ static vm_fault_t create_huge_pud(struct vm_fault *vmf) /* No support for anonymous transparent PUD pages yet */ if (vma_is_anonymous(vma)) return VM_FAULT_FALLBACK; - /* See comment in create_huge_pmd. 
*/ - if (unlikely(FMODE_FSNOTIFY_HSM(vma->vm_file->f_mode))) - return VM_FAULT_FALLBACK; if (vma->vm_ops->huge_fault) return vma->vm_ops->huge_fault(vmf, PUD_ORDER); #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -5811,9 +5813,6 @@ static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud) if (vma_is_anonymous(vma)) goto split; if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) { - /* See comment in create_huge_pmd. */ - if (unlikely(FMODE_FSNOTIFY_HSM(vma->vm_file->f_mode))) - goto split; if (vma->vm_ops->huge_fault) { ret = vma->vm_ops->huge_fault(vmf, PUD_ORDER); if (!(ret & VM_FAULT_FALLBACK)) @@ -6835,10 +6834,8 @@ void __might_fault(const char *file, int line) if (pagefault_disabled()) return; __might_sleep(file, line); -#if defined(CONFIG_DEBUG_ATOMIC_SLEEP) if (current->mm) might_lock_read(¤t->mm->mmap_lock); -#endif } EXPORT_SYMBOL(__might_fault); #endif diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index e3655f07dd6e..16cf9e17077e 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1822,26 +1822,24 @@ static void do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) if (folio_test_large(folio)) pfn = folio_pfn(folio) + folio_nr_pages(folio) - 1; - /* - * HWPoison pages have elevated reference counts so the migration would - * fail on them. It also doesn't make any sense to migrate them in the - * first place. Still try to unmap such a page in case it is still mapped - * (keep the unmap as the catch all safety net). - */ + if (!folio_try_get(folio)) + continue; + + if (unlikely(page_folio(page) != folio)) + goto put_folio; + if (folio_test_hwpoison(folio) || (folio_test_large(folio) && folio_test_has_hwpoisoned(folio))) { if (WARN_ON(folio_test_lru(folio))) folio_isolate_lru(folio); - if (folio_mapped(folio)) - unmap_poisoned_folio(folio, TTU_IGNORE_MLOCK); - continue; - } - - if (!folio_try_get(folio)) - continue; + if (folio_mapped(folio)) { + folio_lock(folio); + unmap_poisoned_folio(folio, pfn, false); + folio_unlock(folio); + } - if (unlikely(page_folio(page) != folio)) goto put_folio; + } if (!isolate_folio_to_list(folio, &source)) { if (__ratelimit(&migrate_rs)) { diff --git a/mm/mempolicy.c b/mm/mempolicy.c index bbaadbeeb291..a9eea051b0d6 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -196,6 +196,37 @@ int numa_nearest_node(int node, unsigned int state) } EXPORT_SYMBOL_GPL(numa_nearest_node); +/** + * nearest_node_nodemask - Find the node in @mask at the nearest distance + * from @node. + * + * @node: a valid node ID to start the search from. + * @mask: a pointer to a nodemask representing the allowed nodes. + * + * This function iterates over all nodes in @mask and calculates the + * distance from the starting @node, then it returns the node ID that is + * the closest to @node, or MAX_NUMNODES if no node is found. + * + * Note that @node must be a valid node ID usable with node_distance(), + * providing an invalid node ID (e.g., NUMA_NO_NODE) may result in crashes + * or unexpected behavior. 
+ */ +int nearest_node_nodemask(int node, nodemask_t *mask) +{ + int dist, n, min_dist = INT_MAX, min_node = MAX_NUMNODES; + + for_each_node_mask(n, *mask) { + dist = node_distance(node, n); + if (dist < min_dist) { + min_dist = dist; + min_node = n; + } + } + + return min_node; +} +EXPORT_SYMBOL_GPL(nearest_node_nodemask); + struct mempolicy *get_task_policy(struct task_struct *p) { struct mempolicy *pol = p->mempolicy; diff --git a/mm/migrate.c b/mm/migrate.c index fb19a18892c8..97f0edf0c032 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -518,15 +518,13 @@ static int __folio_migrate_mapping(struct address_space *mapping, if (folio_test_anon(folio) && folio_test_large(folio)) mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1); folio_ref_add(newfolio, nr); /* add cache reference */ - if (folio_test_swapbacked(folio)) { + if (folio_test_swapbacked(folio)) __folio_set_swapbacked(newfolio); - if (folio_test_swapcache(folio)) { - folio_set_swapcache(newfolio); - newfolio->private = folio_get_private(folio); - } + if (folio_test_swapcache(folio)) { + folio_set_swapcache(newfolio); + newfolio->private = folio_get_private(folio); entries = nr; } else { - VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio); entries = 1; } diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 9cf26592ac93..5bd888223cc8 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -840,20 +840,15 @@ void migrate_device_finalize(unsigned long *src_pfns, dst = src; } + if (!folio_is_zone_device(dst)) + folio_add_lru(dst); remove_migration_ptes(src, dst, 0); folio_unlock(src); - - if (folio_is_zone_device(src)) - folio_put(src); - else - folio_putback_lru(src); + folio_put(src); if (dst != src) { folio_unlock(dst); - if (folio_is_zone_device(dst)) - folio_put(dst); - else - folio_putback_lru(dst); + folio_put(dst); } } } diff --git a/mm/nommu.c b/mm/nommu.c index baa79abdaf03..9cb6e99215e2 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1613,13 +1613,6 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, } EXPORT_SYMBOL(remap_vmalloc_range); -vm_fault_t filemap_fsnotify_fault(struct vm_fault *vmf) -{ - BUG(); - return 0; -} -EXPORT_SYMBOL_GPL(filemap_fsnotify_fault); - vm_fault_t filemap_fault(struct vm_fault *vmf) { BUG(); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 579789600a3c..542d25f77be8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4243,6 +4243,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, restart: compaction_retries = 0; no_progress_loops = 0; + compact_result = COMPACT_SKIPPED; compact_priority = DEF_COMPACT_PRIORITY; cpuset_mems_cookie = read_mems_allowed_begin(); zonelist_iter_cookie = zonelist_iter_begin(); @@ -5849,11 +5850,10 @@ static void setup_per_zone_lowmem_reserve(void) for (j = i + 1; j < MAX_NR_ZONES; j++) { struct zone *upper_zone = &pgdat->node_zones[j]; - bool empty = !zone_managed_pages(upper_zone); managed_pages += zone_managed_pages(upper_zone); - if (clear || empty) + if (clear) zone->lowmem_reserve[j] = 0; else zone->lowmem_reserve[j] = managed_pages / ratio; @@ -7004,7 +7004,7 @@ static inline bool has_unaccepted_memory(void) static bool cond_accept_memory(struct zone *zone, unsigned int order) { - long to_accept; + long to_accept, wmark; bool ret = false; if (!has_unaccepted_memory()) @@ -7013,8 +7013,18 @@ static bool cond_accept_memory(struct zone *zone, unsigned int order) if (list_empty(&zone->unaccepted_pages)) return false; + wmark = promo_wmark_pages(zone); + + /* + * Watermarks have not been initialized yet. 
+ * + * Accepting one MAX_ORDER page to ensure progress. + */ + if (!wmark) + return try_to_accept_memory_one(zone); + /* How much to accept to get to promo watermark? */ - to_accept = promo_wmark_pages(zone) - + to_accept = wmark - (zone_page_state(zone, NR_FREE_PAGES) - __zone_watermark_unusable_free(zone, order, 0) - zone_page_state(zone, NR_UNACCEPTED)); diff --git a/mm/page_isolation.c b/mm/page_isolation.c index c608e9d72865..a051a29e95ad 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -608,6 +608,16 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, int ret; /* + * Due to the deferred freeing of hugetlb folios, the hugepage folios may + * not immediately release to the buddy system. This can cause PageBuddy() + * to fail in __test_page_isolated_in_pageblock(). To ensure that the + * hugetlb folios are properly released back to the buddy system, we + * invoke the wait_for_freed_hugetlb_folios() function to wait for the + * release to complete. + */ + wait_for_freed_hugetlb_folios(); + + /* * Note: pageblock_nr_pages != MAX_PAGE_ORDER. Then, chunks of free * pages are not aligned to pageblock_nr_pages. * Then we just check migratetype first. diff --git a/mm/percpu.c b/mm/percpu.c index ac61e3fc5f15..7b5835356d1e 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -3071,7 +3071,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, continue; } /* copy and return the unused part */ - memcpy(ptr, __per_cpu_load, ai->static_size); + memcpy(ptr, __per_cpu_start, ai->static_size); pcpu_fc_free(ptr + size_sum, ai->unit_size - size_sum); } } @@ -3240,7 +3240,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_cpu_to_node_fn_t flush_cache_vmap_early(unit_addr, unit_addr + ai->unit_size); /* copy static data */ - memcpy((void *)unit_addr, __per_cpu_load, ai->static_size); + memcpy((void *)unit_addr, __per_cpu_start, ai->static_size); } /* we're ready, commit */ diff --git a/mm/readahead.c b/mm/readahead.c index 220155a5c964..6a4e96b69702 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -128,7 +128,6 @@ #include <linux/blk-cgroup.h> #include <linux/fadvise.h> #include <linux/sched/mm.h> -#include <linux/fsnotify.h> #include "internal.h" @@ -559,15 +558,6 @@ void page_cache_sync_ra(struct readahead_control *ractl, pgoff_t prev_index, miss; /* - * If we have pre-content watches we need to disable readahead to make - * sure that we don't find 0 filled pages in cache that we never emitted - * events for. Filesystems supporting HSM must make sure to not call - * this function with ractl->file unset for files handled by HSM. - */ - if (ractl->file && unlikely(FMODE_FSNOTIFY_HSM(ractl->file->f_mode))) - return; - - /* * Even if readahead is disabled, issue this request as readahead * as we'll need it to satisfy the requested range. The forced * readahead will do the right thing and limit the read to just the @@ -645,10 +635,6 @@ void page_cache_async_ra(struct readahead_control *ractl, if (!ra->ra_pages) return; - /* See the comment in page_cache_sync_ra. */ - if (ractl->file && unlikely(FMODE_FSNOTIFY_HSM(ractl->file->f_mode))) - return; - /* * Same bit is used for PG_readahead and PG_reclaim. 
*/ diff --git a/mm/shmem.c b/mm/shmem.c index 4ea6109a8043..ab61c8bb20e1 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1548,7 +1548,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) if (WARN_ON_ONCE(!wbc->for_reclaim)) goto redirty; - if (WARN_ON_ONCE((info->flags & VM_LOCKED) || sbinfo->noswap)) + if ((info->flags & VM_LOCKED) || sbinfo->noswap) goto redirty; if (!total_swap_pages) @@ -2253,7 +2253,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, struct folio *folio = NULL; bool skip_swapcache = false; swp_entry_t swap; - int error, nr_pages; + int error, nr_pages, order, split_order; VM_BUG_ON(!*foliop || !xa_is_value(*foliop)); swap = radix_to_swp_entry(*foliop); @@ -2272,10 +2272,9 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, /* Look it up and read it in.. */ folio = swap_cache_get_folio(swap, NULL, 0); + order = xa_get_order(&mapping->i_pages, index); if (!folio) { - int order = xa_get_order(&mapping->i_pages, index); bool fallback_order0 = false; - int split_order; /* Or update major stats only when swapin succeeds?? */ if (fault_type) { @@ -2339,6 +2338,29 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, error = -ENOMEM; goto failed; } + } else if (order != folio_order(folio)) { + /* + * Swap readahead may swap in order 0 folios into swapcache + * asynchronously, while the shmem mapping can still stores + * large swap entries. In such cases, we should split the + * large swap entry to prevent possible data corruption. + */ + split_order = shmem_split_large_entry(inode, index, swap, gfp); + if (split_order < 0) { + error = split_order; + goto failed; + } + + /* + * If the large swap entry has already been split, it is + * necessary to recalculate the new swap entry based on + * the old order alignment. 
+ */ + if (split_order > 0) { + pgoff_t offset = index - round_down(index, 1 << split_order); + + swap = swp_entry(swp_type(swap), swp_offset(swap) + offset); + } } alloced: @@ -2346,7 +2368,8 @@ alloced: folio_lock(folio); if ((!skip_swapcache && !folio_test_swapcache(folio)) || folio->swap.val != swap.val || - !shmem_confirm_swap(mapping, index, swap)) { + !shmem_confirm_swap(mapping, index, swap) || + xa_get_order(&mapping->i_pages, index) != folio_order(folio)) { error = -EEXIST; goto unlock; } @@ -3487,7 +3510,7 @@ static size_t splice_zeropage_into_pipe(struct pipe_inode_info *pipe, size = min_t(size_t, size, PAGE_SIZE - offset); - if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage)) { + if (!pipe_is_full(pipe)) { struct pipe_buffer *buf = pipe_head_buf(pipe); *buf = (struct pipe_buffer) { @@ -3514,7 +3537,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, int error = 0; /* Work out how much data we can actually add into the pipe */ - used = pipe_occupancy(pipe->head, pipe->tail); + used = pipe_buf_usage(pipe); npages = max_t(ssize_t, pipe->max_usage - used, 0); len = min_t(size_t, len, npages * PAGE_SIZE); @@ -3601,7 +3624,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, total_spliced += n; *ppos += n; in->f_ra.prev_pos = *ppos; - if (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) + if (pipe_is_full(pipe)) break; cond_resched(); @@ -3889,16 +3912,16 @@ out_iput: return error; } -static int shmem_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *shmem_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { int error; error = shmem_mknod(idmap, dir, dentry, mode | S_IFDIR, 0); if (error) - return error; + return ERR_PTR(error); inc_nlink(dir); - return 0; + return NULL; } static int shmem_create(struct mnt_idmap *idmap, struct inode *dir, diff --git a/mm/slab.h b/mm/slab.h index e9fd9bf0bfa6..05a21dc796e0 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -457,39 +457,17 @@ static inline bool is_kmalloc_normal(struct kmem_cache *s) return !(s->flags & (SLAB_CACHE_DMA|SLAB_ACCOUNT|SLAB_RECLAIM_ACCOUNT)); } -/* Legal flag mask for kmem_cache_create(), for various configurations */ #define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | \ SLAB_CACHE_DMA32 | SLAB_PANIC | \ - SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS ) + SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS | \ + SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \ + SLAB_TEMPORARY | SLAB_ACCOUNT | \ + SLAB_NO_USER_FLAGS | SLAB_KMALLOC | SLAB_NO_MERGE) -#ifdef CONFIG_SLUB_DEBUG #define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ SLAB_TRACE | SLAB_CONSISTENCY_CHECKS) -#else -#define SLAB_DEBUG_FLAGS (0) -#endif -#define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \ - SLAB_TEMPORARY | SLAB_ACCOUNT | \ - SLAB_NO_USER_FLAGS | SLAB_KMALLOC | SLAB_NO_MERGE) - -/* Common flags available with current configuration */ -#define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS) - -/* Common flags permitted for kmem_cache_create */ -#define SLAB_FLAGS_PERMITTED (SLAB_CORE_FLAGS | \ - SLAB_RED_ZONE | \ - SLAB_POISON | \ - SLAB_STORE_USER | \ - SLAB_TRACE | \ - SLAB_CONSISTENCY_CHECKS | \ - SLAB_NOLEAKTRACE | \ - SLAB_RECLAIM_ACCOUNT | \ - SLAB_TEMPORARY | \ - SLAB_ACCOUNT | \ - SLAB_KMALLOC | \ - SLAB_NO_MERGE | \ - SLAB_NO_USER_FLAGS) +#define SLAB_FLAGS_PERMITTED (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS) bool __kmem_cache_empty(struct kmem_cache 
*); int __kmem_cache_shutdown(struct kmem_cache *); @@ -604,6 +582,8 @@ void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, int objects, struct slabobj_ext *obj_exts); #endif +void kvfree_rcu_cb(struct rcu_head *head); + size_t __ksize(const void *objp); static inline size_t slab_ksize(const struct kmem_cache *s) diff --git a/mm/slab_common.c b/mm/slab_common.c index 4030907b6b7d..5be257e03c7c 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -298,6 +298,8 @@ struct kmem_cache *__kmem_cache_create_args(const char *name, static_branch_enable(&slub_debug_enabled); if (flags & SLAB_STORE_USER) stack_depot_init(); +#else + flags &= ~SLAB_DEBUG_FLAGS; #endif mutex_lock(&slab_mutex); @@ -307,20 +309,11 @@ struct kmem_cache *__kmem_cache_create_args(const char *name, goto out_unlock; } - /* Refuse requests with allocator specific flags */ if (flags & ~SLAB_FLAGS_PERMITTED) { err = -EINVAL; goto out_unlock; } - /* - * Some allocators will constraint the set of valid flags to a subset - * of all flags. We expect them to define CACHE_CREATE_MASK in this - * case, and we'll just provide them with a sanitized version of the - * passed flags. - */ - flags &= CACHE_CREATE_MASK; - /* Fail closed on bad usersize of useroffset values. */ if (!IS_ENABLED(CONFIG_HARDENED_USERCOPY) || WARN_ON(!args->usersize && args->useroffset) || @@ -1284,6 +1277,29 @@ EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); EXPORT_TRACEPOINT_SYMBOL(kfree); EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free); +#ifndef CONFIG_KVFREE_RCU_BATCHED + +void kvfree_call_rcu(struct rcu_head *head, void *ptr) +{ + if (head) { + kasan_record_aux_stack(ptr); + call_rcu(head, kvfree_rcu_cb); + return; + } + + // kvfree_rcu(one_arg) call. + might_sleep(); + synchronize_rcu(); + kvfree(ptr); +} +EXPORT_SYMBOL_GPL(kvfree_call_rcu); + +void __init kvfree_rcu_init(void) +{ +} + +#else /* CONFIG_KVFREE_RCU_BATCHED */ + /* * This rcu parameter is runtime-read-only. It reflects * a minimum allowed number of objects which can be cached @@ -1304,6 +1320,8 @@ module_param(rcu_min_cached_objs, int, 0444); static int rcu_delay_page_cache_fill_msec = 5000; module_param(rcu_delay_page_cache_fill_msec, int, 0444); +static struct workqueue_struct *rcu_reclaim_wq; + /* Maximum number of jiffies to wait before draining a batch. */ #define KFREE_DRAIN_JIFFIES (5 * HZ) #define KFREE_N_BATCHES 2 @@ -1532,8 +1550,7 @@ kvfree_rcu_list(struct rcu_head *head) rcu_lock_acquire(&rcu_callback_map); trace_rcu_invoke_kvfree_callback("slab", head, offset); - if (!WARN_ON_ONCE(!__is_kvfree_rcu_offset(offset))) - kvfree(ptr); + kvfree(ptr); rcu_lock_release(&rcu_callback_map); cond_resched_tasks_rcu_qs(); @@ -1632,10 +1649,10 @@ __schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp) if (delayed_work_pending(&krcp->monitor_work)) { delay_left = krcp->monitor_work.timer.expires - jiffies; if (delay < delay_left) - mod_delayed_work(system_unbound_wq, &krcp->monitor_work, delay); + mod_delayed_work(rcu_reclaim_wq, &krcp->monitor_work, delay); return; } - queue_delayed_work(system_unbound_wq, &krcp->monitor_work, delay); + queue_delayed_work(rcu_reclaim_wq, &krcp->monitor_work, delay); } static void @@ -1733,7 +1750,7 @@ kvfree_rcu_queue_batch(struct kfree_rcu_cpu *krcp) // "free channels", the batch can handle. Break // the loop since it is done with this CPU thus // queuing an RCU work is _always_ success here. 
- queued = queue_rcu_work(system_unbound_wq, &krwp->rcu_work); + queued = queue_rcu_work(rcu_reclaim_wq, &krwp->rcu_work); WARN_ON_ONCE(!queued); break; } @@ -1861,8 +1878,6 @@ add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp, return true; } -#if !defined(CONFIG_TINY_RCU) - static enum hrtimer_restart schedule_page_work_fn(struct hrtimer *t) { @@ -1883,12 +1898,12 @@ run_page_cache_worker(struct kfree_rcu_cpu *krcp) if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING && !atomic_xchg(&krcp->work_in_progress, 1)) { if (atomic_read(&krcp->backoff_page_cache_fill)) { - queue_delayed_work(system_unbound_wq, + queue_delayed_work(rcu_reclaim_wq, &krcp->page_cache_work, msecs_to_jiffies(rcu_delay_page_cache_fill_msec)); } else { - hrtimer_init(&krcp->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - krcp->hrtimer.function = schedule_page_work_fn; + hrtimer_setup(&krcp->hrtimer, schedule_page_work_fn, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL); } } @@ -2071,8 +2086,6 @@ void kvfree_rcu_barrier(void) } EXPORT_SYMBOL_GPL(kvfree_rcu_barrier); -#endif /* #if !defined(CONFIG_TINY_RCU) */ - static unsigned long kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { @@ -2120,6 +2133,10 @@ void __init kvfree_rcu_init(void) int i, j; struct shrinker *kfree_rcu_shrinker; + rcu_reclaim_wq = alloc_workqueue("kvfree_rcu_reclaim", + WQ_UNBOUND | WQ_MEM_RECLAIM, 0); + WARN_ON(!rcu_reclaim_wq); + /* Clamp it to [0:100] seconds interval. */ if (rcu_delay_page_cache_fill_msec < 0 || rcu_delay_page_cache_fill_msec > 100 * MSEC_PER_SEC) { @@ -2162,3 +2179,6 @@ void __init kvfree_rcu_init(void) shrinker_register(kfree_rcu_shrinker); } + +#endif /* CONFIG_KVFREE_RCU_BATCHED */ + diff --git a/mm/slub.c b/mm/slub.c index 1f50129dcfb3..5eac408e818e 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -19,6 +19,7 @@ #include <linux/bitops.h> #include <linux/slab.h> #include "slab.h" +#include <linux/vmalloc.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/kasan.h> @@ -1017,22 +1018,31 @@ void skip_orig_size_check(struct kmem_cache *s, const void *object) set_orig_size(s, (void *)object, s->object_size); } -static void slab_bug(struct kmem_cache *s, char *fmt, ...) +static void __slab_bug(struct kmem_cache *s, const char *fmt, va_list argsp) { struct va_format vaf; va_list args; - va_start(args, fmt); + va_copy(args, argsp); vaf.fmt = fmt; vaf.va = &args; pr_err("=============================================================================\n"); - pr_err("BUG %s (%s): %pV\n", s->name, print_tainted(), &vaf); + pr_err("BUG %s (%s): %pV\n", s ? s->name : "<unknown>", print_tainted(), &vaf); pr_err("-----------------------------------------------------------------------------\n\n"); va_end(args); } +static void slab_bug(struct kmem_cache *s, const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + __slab_bug(s, fmt, args); + va_end(args); +} + __printf(2, 3) -static void slab_fix(struct kmem_cache *s, char *fmt, ...) +static void slab_fix(struct kmem_cache *s, const char *fmt, ...) 
{ struct va_format vaf; va_list args; @@ -1085,19 +1095,19 @@ static void print_trailer(struct kmem_cache *s, struct slab *slab, u8 *p) /* Beginning of the filler is the free pointer */ print_section(KERN_ERR, "Padding ", p + off, size_from_object(s) - off); - - dump_stack(); } static void object_err(struct kmem_cache *s, struct slab *slab, - u8 *object, char *reason) + u8 *object, const char *reason) { if (slab_add_kunit_errors()) return; - slab_bug(s, "%s", reason); + slab_bug(s, reason); print_trailer(s, slab, object); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); + + WARN_ON(1); } static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab, @@ -1114,22 +1124,30 @@ static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab, return false; } +static void __slab_err(struct slab *slab) +{ + if (slab_in_kunit_test()) + return; + + print_slab_info(slab); + add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); + + WARN_ON(1); +} + static __printf(3, 4) void slab_err(struct kmem_cache *s, struct slab *slab, const char *fmt, ...) { va_list args; - char buf[100]; if (slab_add_kunit_errors()) return; va_start(args, fmt); - vsnprintf(buf, sizeof(buf), fmt, args); + __slab_bug(s, fmt, args); va_end(args); - slab_bug(s, "%s", buf); - print_slab_info(slab); - dump_stack(); - add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); + + __slab_err(slab); } static void init_object(struct kmem_cache *s, void *object, u8 val) @@ -1166,7 +1184,7 @@ static void init_object(struct kmem_cache *s, void *object, u8 val) s->inuse - poison_size); } -static void restore_bytes(struct kmem_cache *s, char *message, u8 data, +static void restore_bytes(struct kmem_cache *s, const char *message, u8 data, void *from, void *to) { slab_fix(s, "Restoring %s 0x%p-0x%p=0x%x", message, from, to - 1, data); @@ -1181,8 +1199,8 @@ static void restore_bytes(struct kmem_cache *s, char *message, u8 data, static pad_check_attributes int check_bytes_and_report(struct kmem_cache *s, struct slab *slab, - u8 *object, char *what, - u8 *start, unsigned int value, unsigned int bytes) + u8 *object, const char *what, u8 *start, unsigned int value, + unsigned int bytes, bool slab_obj_print) { u8 *fault; u8 *end; @@ -1201,10 +1219,11 @@ check_bytes_and_report(struct kmem_cache *s, struct slab *slab, if (slab_add_kunit_errors()) goto skip_bug_print; - slab_bug(s, "%s overwritten", what); - pr_err("0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n", - fault, end - 1, fault - addr, - fault[0], value); + pr_err("[%s overwritten] 0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n", + what, fault, end - 1, fault - addr, fault[0], value); + + if (slab_obj_print) + object_err(s, slab, object, "Object corrupt"); skip_bug_print: restore_bytes(s, what, value, fault, end); @@ -1268,7 +1287,7 @@ static int check_pad_bytes(struct kmem_cache *s, struct slab *slab, u8 *p) return 1; return check_bytes_and_report(s, slab, p, "Object padding", - p + off, POISON_INUSE, size_from_object(s) - off); + p + off, POISON_INUSE, size_from_object(s) - off, true); } /* Check the pad bytes at the end of a slab page */ @@ -1301,9 +1320,10 @@ slab_pad_check(struct kmem_cache *s, struct slab *slab) while (end > fault && end[-1] == POISON_INUSE) end--; - slab_err(s, slab, "Padding overwritten. 0x%p-0x%p @offset=%tu", - fault, end - 1, fault - start); + slab_bug(s, "Padding overwritten. 
0x%p-0x%p @offset=%tu", + fault, end - 1, fault - start); print_section(KERN_ERR, "Padding ", pad, remainder); + __slab_err(slab); restore_bytes(s, "slab padding", POISON_INUSE, fault, end); } @@ -1318,11 +1338,11 @@ static int check_object(struct kmem_cache *s, struct slab *slab, if (s->flags & SLAB_RED_ZONE) { if (!check_bytes_and_report(s, slab, object, "Left Redzone", - object - s->red_left_pad, val, s->red_left_pad)) + object - s->red_left_pad, val, s->red_left_pad, ret)) ret = 0; if (!check_bytes_and_report(s, slab, object, "Right Redzone", - endobject, val, s->inuse - s->object_size)) + endobject, val, s->inuse - s->object_size, ret)) ret = 0; if (slub_debug_orig_size(s) && val == SLUB_RED_ACTIVE) { @@ -1331,7 +1351,7 @@ static int check_object(struct kmem_cache *s, struct slab *slab, if (s->object_size > orig_size && !check_bytes_and_report(s, slab, object, "kmalloc Redzone", p + orig_size, - val, s->object_size - orig_size)) { + val, s->object_size - orig_size, ret)) { ret = 0; } } @@ -1339,7 +1359,7 @@ static int check_object(struct kmem_cache *s, struct slab *slab, if ((s->flags & SLAB_POISON) && s->object_size < s->inuse) { if (!check_bytes_and_report(s, slab, p, "Alignment padding", endobject, POISON_INUSE, - s->inuse - s->object_size)) + s->inuse - s->object_size, ret)) ret = 0; } } @@ -1355,11 +1375,11 @@ static int check_object(struct kmem_cache *s, struct slab *slab, if (kasan_meta_size < s->object_size - 1 && !check_bytes_and_report(s, slab, p, "Poison", p + kasan_meta_size, POISON_FREE, - s->object_size - kasan_meta_size - 1)) + s->object_size - kasan_meta_size - 1, ret)) ret = 0; if (kasan_meta_size < s->object_size && !check_bytes_and_report(s, slab, p, "End Poison", - p + s->object_size - 1, POISON_END, 1)) + p + s->object_size - 1, POISON_END, 1, ret)) ret = 0; } /* @@ -1385,11 +1405,6 @@ static int check_object(struct kmem_cache *s, struct slab *slab, ret = 0; } - if (!ret && !slab_in_kunit_test()) { - print_trailer(s, slab, object); - add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); - } - return ret; } @@ -1427,7 +1442,7 @@ static int check_slab(struct kmem_cache *s, struct slab *slab) * Determine if a certain object in a slab is on the freelist. Must hold the * slab lock to guarantee that the chains are in a consistent state. 
*/ -static int on_freelist(struct kmem_cache *s, struct slab *slab, void *search) +static bool on_freelist(struct kmem_cache *s, struct slab *slab, void *search) { int nr = 0; void *fp; @@ -1437,26 +1452,34 @@ static int on_freelist(struct kmem_cache *s, struct slab *slab, void *search) fp = slab->freelist; while (fp && nr <= slab->objects) { if (fp == search) - return 1; + return true; if (!check_valid_pointer(s, slab, fp)) { if (object) { object_err(s, slab, object, "Freechain corrupt"); set_freepointer(s, object, NULL); + break; } else { slab_err(s, slab, "Freepointer corrupt"); slab->freelist = NULL; slab->inuse = slab->objects; slab_fix(s, "Freelist cleared"); - return 0; + return false; } - break; } object = fp; fp = get_freepointer(s, object); nr++; } + if (nr > slab->objects) { + slab_err(s, slab, "Freelist cycle detected"); + slab->freelist = NULL; + slab->inuse = slab->objects; + slab_fix(s, "Freelist cleared"); + return false; + } + max_objects = order_objects(slab_order(slab), s->size); if (max_objects > MAX_OBJS_PER_PAGE) max_objects = MAX_OBJS_PER_PAGE; @@ -1624,12 +1647,12 @@ static inline int free_consistency_checks(struct kmem_cache *s, slab_err(s, slab, "Attempt to free object(0x%p) outside of slab", object); } else if (!slab->slab_cache) { - pr_err("SLUB <none>: no slab for object 0x%p.\n", - object); - dump_stack(); - } else + slab_err(NULL, slab, "No slab cache for object 0x%p", + object); + } else { object_err(s, slab, object, - "page slab pointer corrupt."); + "page slab pointer corrupt."); + } return 0; } return 1; @@ -4241,6 +4264,7 @@ static void *___kmalloc_large_node(size_t size, gfp_t flags, int node) ptr = folio_address(folio); lruvec_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B, PAGE_SIZE << order); + __folio_set_large_kmalloc(folio); } ptr = kasan_kmalloc_large(ptr, size, flags); @@ -4716,6 +4740,11 @@ static void free_large_kmalloc(struct folio *folio, void *object) { unsigned int order = folio_order(folio); + if (WARN_ON_ONCE(!folio_test_large_kmalloc(folio))) { + dump_page(&folio->page, "Not a kmalloc allocation"); + return; + } + if (WARN_ON_ONCE(order == 0)) pr_warn_once("object pointer: 0x%p\n", object); @@ -4725,9 +4754,55 @@ static void free_large_kmalloc(struct folio *folio, void *object) lruvec_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B, -(PAGE_SIZE << order)); + __folio_clear_large_kmalloc(folio); folio_put(folio); } +/* + * Given an rcu_head embedded within an object obtained from kvmalloc at an + * offset < 4k, free the object in question. 
+ */ +void kvfree_rcu_cb(struct rcu_head *head) +{ + void *obj = head; + struct folio *folio; + struct slab *slab; + struct kmem_cache *s; + void *slab_addr; + + if (is_vmalloc_addr(obj)) { + obj = (void *) PAGE_ALIGN_DOWN((unsigned long)obj); + vfree(obj); + return; + } + + folio = virt_to_folio(obj); + if (!folio_test_slab(folio)) { + /* + * rcu_head offset can be only less than page size so no need to + * consider folio order + */ + obj = (void *) PAGE_ALIGN_DOWN((unsigned long)obj); + free_large_kmalloc(folio, obj); + return; + } + + slab = folio_slab(folio); + s = slab->slab_cache; + slab_addr = folio_address(folio); + + if (is_kfence_address(obj)) { + obj = kfence_object_start(obj); + } else { + unsigned int idx = __obj_to_index(s, slab_addr, obj); + + obj = slab_addr + s->size * idx; + obj = fixup_red_left(s, obj); + } + + slab_free(s, slab, obj, _RET_IP_); +} + /** * kfree - free previously allocated memory * @object: pointer returned by kmalloc() or kmem_cache_alloc() @@ -4878,6 +4953,168 @@ void *krealloc_noprof(const void *p, size_t new_size, gfp_t flags) } EXPORT_SYMBOL(krealloc_noprof); +static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size) +{ + /* + * We want to attempt a large physically contiguous block first because + * it is less likely to fragment multiple larger blocks and therefore + * contribute to a long term fragmentation less than vmalloc fallback. + * However make sure that larger requests are not too disruptive - no + * OOM killer and no allocation failure warnings as we have a fallback. + */ + if (size > PAGE_SIZE) { + flags |= __GFP_NOWARN; + + if (!(flags & __GFP_RETRY_MAYFAIL)) + flags |= __GFP_NORETRY; + + /* nofail semantic is implemented by the vmalloc fallback */ + flags &= ~__GFP_NOFAIL; + } + + return flags; +} + +/** + * __kvmalloc_node - attempt to allocate physically contiguous memory, but upon + * failure, fall back to non-contiguous (vmalloc) allocation. + * @size: size of the request. + * @b: which set of kmalloc buckets to allocate from. + * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL. + * @node: numa node to allocate from + * + * Uses kmalloc to get the memory but if the allocation fails then falls back + * to the vmalloc allocator. Use kvfree for freeing the memory. + * + * GFP_NOWAIT and GFP_ATOMIC are not supported, neither is the __GFP_NORETRY modifier. + * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is + * preferable to the vmalloc fallback, due to visible performance drawbacks. + * + * Return: pointer to the allocated memory of %NULL in case of failure + */ +void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node) +{ + void *ret; + + /* + * It doesn't really make sense to fallback to vmalloc for sub page + * requests + */ + ret = __do_kmalloc_node(size, PASS_BUCKET_PARAM(b), + kmalloc_gfp_adjust(flags, size), + node, _RET_IP_); + if (ret || size <= PAGE_SIZE) + return ret; + + /* non-sleeping allocations are not supported by vmalloc */ + if (!gfpflags_allow_blocking(flags)) + return NULL; + + /* Don't even allow crazy sizes */ + if (unlikely(size > INT_MAX)) { + WARN_ON_ONCE(!(flags & __GFP_NOWARN)); + return NULL; + } + + /* + * kvmalloc() can always use VM_ALLOW_HUGE_VMAP, + * since the callers already cannot assume anything + * about the resulting pointer, and cannot play + * protection games. 
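kvfree_rcu_cb() has to map an interior pointer (the rcu_head is embedded at an offset smaller than a page) back to the start of the allocation: page-align down for vmalloc and large kmalloc memory, or recompute the slot from the slab base address and object size for slab objects. A user-space sketch of just that arithmetic, with hypothetical sizes and without the kfence and red-left-pad cases:

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

#define PAGE_SIZE_DEMO 4096UL

/* Mirror of the __obj_to_index()-style math: which slot does 'ptr' live in? */
static void *obj_start(void *slab_base, size_t obj_size, void *ptr)
{
    size_t idx = ((uintptr_t)ptr - (uintptr_t)slab_base) / obj_size;

    return (char *)slab_base + idx * obj_size;
}

/* Page-based allocations only need aligning down, as the offset is < page size. */
static void *page_start(void *ptr)
{
    return (void *)((uintptr_t)ptr & ~(PAGE_SIZE_DEMO - 1));
}

int main(void)
{
    /* Pretend this is a slab of 192-byte objects. */
    size_t obj_size = 192;
    char *base = aligned_alloc(PAGE_SIZE_DEMO, PAGE_SIZE_DEMO);
    void *rcu_head;

    if (!base)
        return 1;

    rcu_head = base + 3 * obj_size + 40;   /* interior pointer */

    printf("slot start: %p (expected %p)\n",
           obj_start(base, obj_size, rcu_head), (void *)(base + 3 * obj_size));
    printf("page start: %p (expected %p)\n",
           page_start(base + 100), (void *)base);

    free(base);
    return 0;
}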
+ */ + return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END, + flags, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP, + node, __builtin_return_address(0)); +} +EXPORT_SYMBOL(__kvmalloc_node_noprof); + +/** + * kvfree() - Free memory. + * @addr: Pointer to allocated memory. + * + * kvfree frees memory allocated by any of vmalloc(), kmalloc() or kvmalloc(). + * It is slightly more efficient to use kfree() or vfree() if you are certain + * that you know which one to use. + * + * Context: Either preemptible task context or not-NMI interrupt. + */ +void kvfree(const void *addr) +{ + if (is_vmalloc_addr(addr)) + vfree(addr); + else + kfree(addr); +} +EXPORT_SYMBOL(kvfree); + +/** + * kvfree_sensitive - Free a data object containing sensitive information. + * @addr: address of the data object to be freed. + * @len: length of the data object. + * + * Use the special memzero_explicit() function to clear the content of a + * kvmalloc'ed object containing sensitive data to make sure that the + * compiler won't optimize out the data clearing. + */ +void kvfree_sensitive(const void *addr, size_t len) +{ + if (likely(!ZERO_OR_NULL_PTR(addr))) { + memzero_explicit((void *)addr, len); + kvfree(addr); + } +} +EXPORT_SYMBOL(kvfree_sensitive); + +/** + * kvrealloc - reallocate memory; contents remain unchanged + * @p: object to reallocate memory for + * @size: the size to reallocate + * @flags: the flags for the page level allocator + * + * If @p is %NULL, kvrealloc() behaves exactly like kvmalloc(). If @size is 0 + * and @p is not a %NULL pointer, the object pointed to is freed. + * + * If __GFP_ZERO logic is requested, callers must ensure that, starting with the + * initial memory allocation, every subsequent call to this API for the same + * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that + * __GFP_ZERO is not fully honored by this API. + * + * In any case, the contents of the object pointed to are preserved up to the + * lesser of the new and old sizes. + * + * This function must not be called concurrently with itself or kvfree() for the + * same memory allocation. + * + * Return: pointer to the allocated memory or %NULL in case of error + */ +void *kvrealloc_noprof(const void *p, size_t size, gfp_t flags) +{ + void *n; + + if (is_vmalloc_addr(p)) + return vrealloc_noprof(p, size, flags); + + n = krealloc_noprof(p, size, kmalloc_gfp_adjust(flags, size)); + if (!n) { + /* We failed to krealloc(), fall back to kvmalloc(). */ + n = kvmalloc_noprof(size, flags); + if (!n) + return NULL; + + if (p) { + /* We already know that `p` is not a vmalloc address. 
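kmalloc_gfp_adjust() encodes the policy for multi-page requests: keep the kmalloc attempt quiet and non-insistent because vmalloc is the real fallback, i.e. add __GFP_NOWARN, add __GFP_NORETRY unless the caller asked for __GFP_RETRY_MAYFAIL, and strip __GFP_NOFAIL. The same bit manipulation with stand-in flag values (the DEMO_* constants are illustrative, not the kernel's gfp bits):

#include <stdio.h>
#include <stddef.h>

/* Illustrative stand-ins for the gfp bits used by kmalloc_gfp_adjust(). */
#define DEMO_NOWARN        0x01u
#define DEMO_NORETRY       0x02u
#define DEMO_RETRY_MAYFAIL 0x04u
#define DEMO_NOFAIL        0x08u
#define DEMO_PAGE_SIZE     4096u

static unsigned int gfp_adjust_demo(unsigned int flags, size_t size)
{
    if (size > DEMO_PAGE_SIZE) {
        flags |= DEMO_NOWARN;                 /* no failure warnings */
        if (!(flags & DEMO_RETRY_MAYFAIL))
            flags |= DEMO_NORETRY;            /* don't try too hard */
        flags &= ~DEMO_NOFAIL;                /* vmalloc provides nofail */
    }
    return flags;
}

int main(void)
{
    printf("small:         %#x\n", gfp_adjust_demo(DEMO_NOFAIL, 512));
    printf("large:         %#x\n", gfp_adjust_demo(DEMO_NOFAIL, 64 * 1024));
    printf("large+mayfail: %#x\n", gfp_adjust_demo(DEMO_RETRY_MAYFAIL, 64 * 1024));
    return 0;
}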
*/ + kasan_disable_current(); + memcpy(n, kasan_reset_tag(p), ksize(p)); + kasan_enable_current(); + + kfree(p); + } + } + + return n; +} +EXPORT_SYMBOL(kvrealloc_noprof); + struct detached_freelist { struct slab *slab; void *tail; @@ -5570,14 +5807,14 @@ static int calculate_sizes(struct kmem_cache_args *args, struct kmem_cache *s) return !!oo_objects(s->oo); } -static void list_slab_objects(struct kmem_cache *s, struct slab *slab, - const char *text) +static void list_slab_objects(struct kmem_cache *s, struct slab *slab) { #ifdef CONFIG_SLUB_DEBUG void *addr = slab_address(slab); void *p; - slab_err(s, slab, text, s->name); + if (!slab_add_kunit_errors()) + slab_bug(s, "Objects remaining on __kmem_cache_shutdown()"); spin_lock(&object_map_lock); __fill_map(object_map, s, slab); @@ -5592,6 +5829,8 @@ static void list_slab_objects(struct kmem_cache *s, struct slab *slab, } } spin_unlock(&object_map_lock); + + __slab_err(slab); #endif } @@ -5612,8 +5851,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) remove_partial(n, slab); list_add(&slab->slab_list, &discard); } else { - list_slab_objects(s, slab, - "Objects remaining in %s on __kmem_cache_shutdown()"); + list_slab_objects(s, slab); } } spin_unlock_irq(&n->list_lock); diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c index be39078f255b..1007c30f12e2 100644 --- a/mm/swap_cgroup.c +++ b/mm/swap_cgroup.c @@ -58,9 +58,11 @@ static unsigned short __swap_cgroup_id_xchg(struct swap_cgroup *map, * entries must not have been charged * * @folio: the folio that the swap entry belongs to + * @id: mem_cgroup ID to be recorded * @ent: the first swap entry to be recorded */ -void swap_cgroup_record(struct folio *folio, swp_entry_t ent) +void swap_cgroup_record(struct folio *folio, unsigned short id, + swp_entry_t ent) { unsigned int nr_ents = folio_nr_pages(folio); struct swap_cgroup *map; @@ -72,8 +74,7 @@ void swap_cgroup_record(struct folio *folio, swp_entry_t ent) map = swap_cgroup_ctrl[swp_type(ent)].map; do { - old = __swap_cgroup_id_xchg(map, offset, - mem_cgroup_id(folio_memcg(folio))); + old = __swap_cgroup_id_xchg(map, offset, id); VM_BUG_ON(old); } while (++offset != end); } diff --git a/mm/swapfile.c b/mm/swapfile.c index ba19430dd4ea..df7c4e8b089c 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -653,7 +653,8 @@ static void relocate_cluster(struct swap_info_struct *si, return; if (!ci->count) { - free_cluster(si, ci); + if (ci->flags != CLUSTER_FLAG_FREE) + free_cluster(si, ci); } else if (ci->count != SWAPFILE_CLUSTER) { if (ci->flags != CLUSTER_FLAG_FRAG) move_cluster(si, ci, &si->frag_clusters[ci->order], @@ -858,6 +859,10 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force) offset++; } + /* in case no swap cache is reclaimed */ + if (ci->flags == CLUSTER_FLAG_NONE) + relocate_cluster(si, ci); + unlock_cluster(ci); if (to_scan <= 0) break; @@ -2641,7 +2646,6 @@ static void wait_for_allocation(struct swap_info_struct *si) for (offset = 0; offset < end; offset += SWAPFILE_CLUSTER) { ci = lock_cluster(si, offset); unlock_cluster(ci); - offset += SWAPFILE_CLUSTER; } } @@ -3542,6 +3546,10 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr) int err, i; si = swp_swap_info(entry); + if (WARN_ON_ONCE(!si)) { + pr_err("%s%08lx\n", Bad_file, entry.val); + return -EINVAL; + } offset = swp_offset(entry); VM_WARN_ON(nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER); diff --git a/mm/truncate.c b/mm/truncate.c index e2e115adfbc5..76d8fcd89bd0 100644 --- 
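The wait_for_allocation() hunk above drops an extra `offset += SWAPFILE_CLUSTER;` from the loop body: the for-statement already advances by one cluster, so the duplicated increment made the scan step two clusters at a time and skip half of them. A minimal reproduction of that bug shape:

#include <stdio.h>

#define CLUSTER 512
#define END     (8 * CLUSTER)

int main(void)
{
    int visited;

    /* Buggy: increment in both the for-statement and the body. */
    visited = 0;
    for (unsigned long off = 0; off < END; off += CLUSTER) {
        visited++;
        off += CLUSTER;           /* skips every other cluster */
    }
    printf("buggy loop visited %d of %d clusters\n", visited, END / CLUSTER);

    /* Fixed: let the for-statement do the stepping. */
    visited = 0;
    for (unsigned long off = 0; off < END; off += CLUSTER)
        visited++;
    printf("fixed loop visited %d of %d clusters\n", visited, END / CLUSTER);

    return 0;
}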
a/mm/truncate.c +++ b/mm/truncate.c @@ -548,8 +548,6 @@ int folio_unmap_invalidate(struct address_space *mapping, struct folio *folio, VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); - if (folio_test_dirty(folio)) - return 0; if (folio_mapped(folio)) unmap_mapping_folio(folio); BUG_ON(folio_mapped(folio)); diff --git a/mm/usercopy.c b/mm/usercopy.c index 83c164aba6e0..dbdcc43964fb 100644 --- a/mm/usercopy.c +++ b/mm/usercopy.c @@ -17,7 +17,7 @@ #include <linux/sched.h> #include <linux/sched/task.h> #include <linux/sched/task_stack.h> -#include <linux/thread_info.h> +#include <linux/ucopysize.h> #include <linux/vmalloc.h> #include <linux/atomic.h> #include <linux/jump_label.h> @@ -201,7 +201,9 @@ static inline void check_heap_object(const void *ptr, unsigned long n, } } -static DEFINE_STATIC_KEY_FALSE_RO(bypass_usercopy_checks); +DEFINE_STATIC_KEY_MAYBE_RO(CONFIG_HARDENED_USERCOPY_DEFAULT_ON, + validate_usercopy_range); +EXPORT_SYMBOL(validate_usercopy_range); /* * Validates that the given object is: @@ -212,9 +214,6 @@ static DEFINE_STATIC_KEY_FALSE_RO(bypass_usercopy_checks); */ void __check_object_size(const void *ptr, unsigned long n, bool to_user) { - if (static_branch_unlikely(&bypass_usercopy_checks)) - return; - /* Skip all tests if size is zero. */ if (!n) return; @@ -255,7 +254,8 @@ void __check_object_size(const void *ptr, unsigned long n, bool to_user) } EXPORT_SYMBOL(__check_object_size); -static bool enable_checks __initdata = true; +static bool enable_checks __initdata = + IS_ENABLED(CONFIG_HARDENED_USERCOPY_DEFAULT_ON); static int __init parse_hardened_usercopy(char *str) { @@ -269,8 +269,10 @@ __setup("hardened_usercopy=", parse_hardened_usercopy); static int __init set_hardened_usercopy(void) { - if (enable_checks == false) - static_branch_enable(&bypass_usercopy_checks); + if (enable_checks) + static_branch_enable(&validate_usercopy_range); + else + static_branch_disable(&validate_usercopy_range); return 1; } diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index af3dfc3633db..d06453fa8aba 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -18,6 +18,7 @@ #include <asm/tlbflush.h> #include <asm/tlb.h> #include "internal.h" +#include "swap.h" static __always_inline bool validate_dst_vma(struct vm_area_struct *dst_vma, unsigned long dst_end) @@ -1076,16 +1077,14 @@ out: return err; } -static int move_swap_pte(struct mm_struct *mm, +static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, pte_t *dst_pte, pte_t *src_pte, pte_t orig_dst_pte, pte_t orig_src_pte, pmd_t *dst_pmd, pmd_t dst_pmdval, - spinlock_t *dst_ptl, spinlock_t *src_ptl) + spinlock_t *dst_ptl, spinlock_t *src_ptl, + struct folio *src_folio) { - if (!pte_swp_exclusive(orig_src_pte)) - return -EBUSY; - double_pt_lock(dst_ptl, src_ptl); if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte, @@ -1094,6 +1093,16 @@ static int move_swap_pte(struct mm_struct *mm, return -EAGAIN; } + /* + * The src_folio resides in the swapcache, requiring an update to its + * index and mapping to align with the dst_vma, where a swap-in may + * occur and hit the swapcache after moving the PTE. 
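The usercopy changes invert the static key: instead of a bypass key that gets enabled when checks are switched off, there is now a validate_usercopy_range key whose boot-time default follows CONFIG_HARDENED_USERCOPY_DEFAULT_ON and which `hardened_usercopy=` can flip in either direction. A small user-space analog of that default-plus-override pattern (an ordinary bool stands in for the static key; the names are illustrative):

#include <stdio.h>
#include <stddef.h>
#include <string.h>
#include <stdbool.h>

/* Compile-time default, standing in for CONFIG_HARDENED_USERCOPY_DEFAULT_ON. */
#ifndef DEMO_DEFAULT_ON
#define DEMO_DEFAULT_ON 1
#endif

static bool validate_range = DEMO_DEFAULT_ON;

/* Boot-parameter style override: "on"/"off" beats the compiled-in default. */
static void parse_param(const char *val)
{
    if (!val)
        return;
    if (!strcmp(val, "on") || !strcmp(val, "1"))
        validate_range = true;
    else if (!strcmp(val, "off") || !strcmp(val, "0"))
        validate_range = false;
}

static void check_object_size_demo(size_t n)
{
    if (!validate_range)
        return;                       /* checks disabled */
    printf("validating %zu-byte copy\n", n);
}

int main(int argc, char **argv)
{
    parse_param(argc > 1 ? argv[1] : NULL);
    check_object_size_demo(128);
    return 0;
}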
+ */ + if (src_folio) { + folio_move_anon_rmap(src_folio, dst_vma); + src_folio->index = linear_page_index(dst_vma, dst_addr); + } + orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte); set_pte_at(mm, dst_addr, dst_pte, orig_src_pte); double_pt_unlock(dst_ptl, src_ptl); @@ -1141,6 +1150,7 @@ static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, __u64 mode) { swp_entry_t entry; + struct swap_info_struct *si = NULL; pte_t orig_src_pte, orig_dst_pte; pte_t src_folio_pte; spinlock_t *src_ptl, *dst_ptl; @@ -1240,6 +1250,7 @@ retry: */ if (!src_folio) { struct folio *folio; + bool locked; /* * Pin the page while holding the lock to be sure the @@ -1259,14 +1270,28 @@ retry: goto out; } + locked = folio_trylock(folio); + /* + * We avoid waiting for folio lock with a raised + * refcount for large folios because extra refcounts + * will result in split_folio() failing later and + * retrying. If multiple tasks are trying to move a + * large folio we can end up livelocking. + */ + if (!locked && folio_test_large(folio)) { + spin_unlock(src_ptl); + err = -EAGAIN; + goto out; + } + folio_get(folio); src_folio = folio; src_folio_pte = orig_src_pte; spin_unlock(src_ptl); - if (!folio_trylock(src_folio)) { - pte_unmap(&orig_src_pte); - pte_unmap(&orig_dst_pte); + if (!locked) { + pte_unmap(src_pte); + pte_unmap(dst_pte); src_pte = dst_pte = NULL; /* now we can block and wait */ folio_lock(src_folio); @@ -1282,8 +1307,8 @@ retry: /* at this point we have src_folio locked */ if (folio_test_large(src_folio)) { /* split_folio() can block */ - pte_unmap(&orig_src_pte); - pte_unmap(&orig_dst_pte); + pte_unmap(src_pte); + pte_unmap(dst_pte); src_pte = dst_pte = NULL; err = split_folio(src_folio); if (err) @@ -1308,8 +1333,8 @@ retry: goto out; } if (!anon_vma_trylock_write(src_anon_vma)) { - pte_unmap(&orig_src_pte); - pte_unmap(&orig_dst_pte); + pte_unmap(src_pte); + pte_unmap(dst_pte); src_pte = dst_pte = NULL; /* now we can block and wait */ anon_vma_lock_write(src_anon_vma); @@ -1322,11 +1347,13 @@ retry: orig_dst_pte, orig_src_pte, dst_pmd, dst_pmdval, dst_ptl, src_ptl, src_folio); } else { + struct folio *folio = NULL; + entry = pte_to_swp_entry(orig_src_pte); if (non_swap_entry(entry)) { if (is_migration_entry(entry)) { - pte_unmap(&orig_src_pte); - pte_unmap(&orig_dst_pte); + pte_unmap(src_pte); + pte_unmap(dst_pte); src_pte = dst_pte = NULL; migration_entry_wait(mm, src_pmd, src_addr); err = -EAGAIN; @@ -1335,9 +1362,53 @@ retry: goto out; } - err = move_swap_pte(mm, dst_addr, src_addr, dst_pte, src_pte, - orig_dst_pte, orig_src_pte, dst_pmd, - dst_pmdval, dst_ptl, src_ptl); + if (!pte_swp_exclusive(orig_src_pte)) { + err = -EBUSY; + goto out; + } + + si = get_swap_device(entry); + if (unlikely(!si)) { + err = -EAGAIN; + goto out; + } + /* + * Verify the existence of the swapcache. If present, the folio's + * index and mapping must be updated even when the PTE is a swap + * entry. The anon_vma lock is not taken during this process since + * the folio has already been unmapped, and the swap entry is + * exclusive, preventing rmap walks. + * + * For large folios, return -EBUSY immediately, as split_folio() + * also returns -EBUSY when attempting to split unmapped large + * folios in the swapcache. This issue needs to be resolved + * separately to allow proper handling. 
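move_pages_pte() follows a trylock-first discipline: take the folio or anon_vma lock opportunistically while the page-table locks are held; on failure, unmap the PTEs, sleep on the lock, and retry the whole operation. The new twist is that large folios are not waited on with an extra reference held (that would make split_folio() fail and can livelock concurrent movers), so -EAGAIN is returned instead. A simplified user-space skeleton of the trylock/drop/block/retry shape, with pthread mutexes standing in for the kernel locks and hypothetical names:

#include <stdio.h>
#include <pthread.h>

static pthread_mutex_t ptl = PTHREAD_MUTEX_INITIALIZER;        /* ~ PTE lock   */
static pthread_mutex_t folio_lock = PTHREAD_MUTEX_INITIALIZER; /* ~ folio lock */

static int move_one(void)
{
retry:
    pthread_mutex_lock(&ptl);

    if (pthread_mutex_trylock(&folio_lock)) {
        /*
         * Can't take the sleeping lock under the spinlock-like ptl:
         * drop it, wait until the folio lock is free, then retry.
         */
        pthread_mutex_unlock(&ptl);
        pthread_mutex_lock(&folio_lock);
        pthread_mutex_unlock(&folio_lock);
        goto retry;
    }

    /* both locks held: do the actual move here */
    printf("moved under both locks\n");

    pthread_mutex_unlock(&folio_lock);
    pthread_mutex_unlock(&ptl);
    return 0;
}

int main(void)
{
    return move_one();
}

Build with -pthread; in this single-threaded demo the trylock always succeeds, the point is the control flow taken on the contended path.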
+ */ + if (!src_folio) + folio = filemap_get_folio(swap_address_space(entry), + swap_cache_index(entry)); + if (!IS_ERR_OR_NULL(folio)) { + if (folio_test_large(folio)) { + err = -EBUSY; + folio_put(folio); + goto out; + } + src_folio = folio; + src_folio_pte = orig_src_pte; + if (!folio_trylock(src_folio)) { + pte_unmap(src_pte); + pte_unmap(dst_pte); + src_pte = dst_pte = NULL; + put_swap_device(si); + si = NULL; + /* now we can block and wait */ + folio_lock(src_folio); + goto retry; + } + } + err = move_swap_pte(mm, dst_vma, dst_addr, src_addr, dst_pte, src_pte, + orig_dst_pte, orig_src_pte, dst_pmd, dst_pmdval, + dst_ptl, src_ptl, src_folio); } out: @@ -1354,6 +1425,8 @@ out: if (src_pte) pte_unmap(src_pte); mmu_notifier_invalidate_range_end(&range); + if (si) + put_swap_device(si); return err; } diff --git a/mm/util.c b/mm/util.c index b6b9684a1438..e7d81371032b 100644 --- a/mm/util.c +++ b/mm/util.c @@ -23,6 +23,7 @@ #include <linux/processor.h> #include <linux/sizes.h> #include <linux/compat.h> +#include <linux/fsnotify.h> #include <linux/uaccess.h> @@ -569,6 +570,8 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, LIST_HEAD(uf); ret = security_mmap_file(file, prot, flag); + if (!ret) + ret = fsnotify_mmap_perm(file, prot, pgoff >> PAGE_SHIFT, len); if (!ret) { if (mmap_write_lock_killable(mm)) return -EINTR; @@ -612,168 +615,6 @@ unsigned long vm_mmap(struct file *file, unsigned long addr, } EXPORT_SYMBOL(vm_mmap); -static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size) -{ - /* - * We want to attempt a large physically contiguous block first because - * it is less likely to fragment multiple larger blocks and therefore - * contribute to a long term fragmentation less than vmalloc fallback. - * However make sure that larger requests are not too disruptive - no - * OOM killer and no allocation failure warnings as we have a fallback. - */ - if (size > PAGE_SIZE) { - flags |= __GFP_NOWARN; - - if (!(flags & __GFP_RETRY_MAYFAIL)) - flags |= __GFP_NORETRY; - - /* nofail semantic is implemented by the vmalloc fallback */ - flags &= ~__GFP_NOFAIL; - } - - return flags; -} - -/** - * __kvmalloc_node - attempt to allocate physically contiguous memory, but upon - * failure, fall back to non-contiguous (vmalloc) allocation. - * @size: size of the request. - * @b: which set of kmalloc buckets to allocate from. - * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL. - * @node: numa node to allocate from - * - * Uses kmalloc to get the memory but if the allocation fails then falls back - * to the vmalloc allocator. Use kvfree for freeing the memory. - * - * GFP_NOWAIT and GFP_ATOMIC are not supported, neither is the __GFP_NORETRY modifier. - * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is - * preferable to the vmalloc fallback, due to visible performance drawbacks. 
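The new swap-entry path takes a reference on the swap device with get_swap_device() and has to drop it on every exit: explicitly before blocking on the folio lock and retrying, and once more in the shared `out:` cleanup for all other paths. A compact illustration of keeping such get/put pairs balanced with goto-style cleanup (hypothetical refcounted device, not the kernel API):

#include <stdio.h>

struct demo_dev {
    int refs;
};

static struct demo_dev *get_dev(struct demo_dev *d) { d->refs++; return d; }
static void put_dev(struct demo_dev *d)             { d->refs--; }

/* Returns 0 on success, -1 on a simulated transient failure. */
static int move_entry(struct demo_dev *dev, int fail_early)
{
    struct demo_dev *si = NULL;
    int err = 0;

    si = get_dev(dev);

    if (fail_early) {
        /* e.g. must drop the reference before sleeping and retrying */
        put_dev(si);
        si = NULL;
        err = -1;
        goto out;
    }

    /* ... do the work while holding the reference ... */

out:
    if (si)                     /* common exit drops whatever is still held */
        put_dev(si);
    return err;
}

int main(void)
{
    struct demo_dev dev = { .refs = 0 };

    move_entry(&dev, 1);
    move_entry(&dev, 0);
    printf("refs after both calls: %d (expect 0)\n", dev.refs);
    return 0;
}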
- * - * Return: pointer to the allocated memory of %NULL in case of failure - */ -void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node) -{ - void *ret; - - /* - * It doesn't really make sense to fallback to vmalloc for sub page - * requests - */ - ret = __kmalloc_node_noprof(PASS_BUCKET_PARAMS(size, b), - kmalloc_gfp_adjust(flags, size), - node); - if (ret || size <= PAGE_SIZE) - return ret; - - /* non-sleeping allocations are not supported by vmalloc */ - if (!gfpflags_allow_blocking(flags)) - return NULL; - - /* Don't even allow crazy sizes */ - if (unlikely(size > INT_MAX)) { - WARN_ON_ONCE(!(flags & __GFP_NOWARN)); - return NULL; - } - - /* - * kvmalloc() can always use VM_ALLOW_HUGE_VMAP, - * since the callers already cannot assume anything - * about the resulting pointer, and cannot play - * protection games. - */ - return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END, - flags, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP, - node, __builtin_return_address(0)); -} -EXPORT_SYMBOL(__kvmalloc_node_noprof); - -/** - * kvfree() - Free memory. - * @addr: Pointer to allocated memory. - * - * kvfree frees memory allocated by any of vmalloc(), kmalloc() or kvmalloc(). - * It is slightly more efficient to use kfree() or vfree() if you are certain - * that you know which one to use. - * - * Context: Either preemptible task context or not-NMI interrupt. - */ -void kvfree(const void *addr) -{ - if (is_vmalloc_addr(addr)) - vfree(addr); - else - kfree(addr); -} -EXPORT_SYMBOL(kvfree); - -/** - * kvfree_sensitive - Free a data object containing sensitive information. - * @addr: address of the data object to be freed. - * @len: length of the data object. - * - * Use the special memzero_explicit() function to clear the content of a - * kvmalloc'ed object containing sensitive data to make sure that the - * compiler won't optimize out the data clearing. - */ -void kvfree_sensitive(const void *addr, size_t len) -{ - if (likely(!ZERO_OR_NULL_PTR(addr))) { - memzero_explicit((void *)addr, len); - kvfree(addr); - } -} -EXPORT_SYMBOL(kvfree_sensitive); - -/** - * kvrealloc - reallocate memory; contents remain unchanged - * @p: object to reallocate memory for - * @size: the size to reallocate - * @flags: the flags for the page level allocator - * - * If @p is %NULL, kvrealloc() behaves exactly like kvmalloc(). If @size is 0 - * and @p is not a %NULL pointer, the object pointed to is freed. - * - * If __GFP_ZERO logic is requested, callers must ensure that, starting with the - * initial memory allocation, every subsequent call to this API for the same - * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that - * __GFP_ZERO is not fully honored by this API. - * - * In any case, the contents of the object pointed to are preserved up to the - * lesser of the new and old sizes. - * - * This function must not be called concurrently with itself or kvfree() for the - * same memory allocation. - * - * Return: pointer to the allocated memory or %NULL in case of error - */ -void *kvrealloc_noprof(const void *p, size_t size, gfp_t flags) -{ - void *n; - - if (is_vmalloc_addr(p)) - return vrealloc_noprof(p, size, flags); - - n = krealloc_noprof(p, size, kmalloc_gfp_adjust(flags, size)); - if (!n) { - /* We failed to krealloc(), fall back to kvmalloc(). */ - n = kvmalloc_noprof(size, flags); - if (!n) - return NULL; - - if (p) { - /* We already know that `p` is not a vmalloc address. 
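This hunk only deletes the mm/util.c copy of the kvmalloc family, which now lives beside the slab allocator (see the additions earlier in this diff); the calling convention is unchanged. For orientation, a typical caller, sketched here with a hypothetical structure that is not part of this series, pairs kvcalloc()/kvmalloc() with kvfree() and never assumes the memory is physically contiguous:

#include <linux/slab.h>
#include <linux/types.h>

/* Hypothetical example structure - for illustration only. */
struct demo_table {
    size_t nr;
    u64 *slots;
};

static struct demo_table *demo_table_alloc(size_t nr)
{
    struct demo_table *t;

    t = kmalloc(sizeof(*t), GFP_KERNEL);
    if (!t)
        return NULL;

    /* Large arrays may come from kmalloc or from the vmalloc fallback. */
    t->slots = kvcalloc(nr, sizeof(*t->slots), GFP_KERNEL);
    if (!t->slots) {
        kfree(t);
        return NULL;
    }
    t->nr = nr;
    return t;
}

static void demo_table_free(struct demo_table *t)
{
    if (!t)
        return;
    kvfree(t->slots);    /* correct for either backing allocator */
    kfree(t);
}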
*/ - kasan_disable_current(); - memcpy(n, kasan_reset_tag(p), ksize(p)); - kasan_enable_current(); - - kfree(p); - } - } - - return n; -} -EXPORT_SYMBOL(kvrealloc_noprof); - /** * __vmalloc_array - allocate memory for a virtually contiguous array. * @n: number of elements. @@ -1509,24 +1509,28 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, static struct vm_area_struct *vma_modify(struct vma_merge_struct *vmg) { struct vm_area_struct *vma = vmg->vma; + unsigned long start = vmg->start; + unsigned long end = vmg->end; struct vm_area_struct *merged; /* First, try to merge. */ merged = vma_merge_existing_range(vmg); if (merged) return merged; + if (vmg_nomem(vmg)) + return ERR_PTR(-ENOMEM); /* Split any preceding portion of the VMA. */ - if (vma->vm_start < vmg->start) { - int err = split_vma(vmg->vmi, vma, vmg->start, 1); + if (vma->vm_start < start) { + int err = split_vma(vmg->vmi, vma, start, 1); if (err) return ERR_PTR(err); } /* Split any trailing portion of the VMA. */ - if (vma->vm_end > vmg->end) { - int err = split_vma(vmg->vmi, vma, vmg->end, 0); + if (vma->vm_end > end) { + int err = split_vma(vmg->vmi, vma, end, 0); if (err) return ERR_PTR(err); @@ -2377,7 +2381,8 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap) * vma_merge_new_range() calls khugepaged_enter_vma() too, the below * call covers the non-merge case. */ - khugepaged_enter_vma(vma, map->flags); + if (!vma_is_anonymous(vma)) + khugepaged_enter_vma(vma, map->flags); ksm_add_vma(vma); *vmap = vma; return 0; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index a6e7acebe9ad..61981ee1c9d2 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -586,13 +586,13 @@ static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end, mask |= PGTBL_PGD_MODIFIED; err = vmap_pages_p4d_range(pgd, addr, next, prot, pages, &nr, &mask); if (err) - return err; + break; } while (pgd++, addr = next, addr != end); if (mask & ARCH_PAGE_TABLE_SYNC_MASK) arch_sync_kernel_mappings(start, end); - return 0; + return err; } /* diff --git a/mm/vmstat.c b/mm/vmstat.c index 16bfe1c694dd..88998725f1c5 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1435,6 +1435,8 @@ const char * const vmstat_text[] = { #ifdef CONFIG_X86 "direct_map_level2_splits", "direct_map_level3_splits", + "direct_map_level2_collapses", + "direct_map_level3_collapses", #endif #ifdef CONFIG_PER_VMA_LOCK_STATS "vma_lock_success", diff --git a/mm/zswap.c b/mm/zswap.c index 6504174fbc6a..23365e76a3ce 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -43,7 +43,7 @@ * statistics **********************************/ /* The number of compressed pages currently stored in zswap */ -atomic_long_t zswap_stored_pages = ATOMIC_INIT(0); +atomic_long_t zswap_stored_pages = ATOMIC_LONG_INIT(0); /* * The statistics below are not protected from concurrent access for @@ -1445,9 +1445,9 @@ resched: * main API **********************************/ -static ssize_t zswap_store_page(struct page *page, - struct obj_cgroup *objcg, - struct zswap_pool *pool) +static bool zswap_store_page(struct page *page, + struct obj_cgroup *objcg, + struct zswap_pool *pool) { swp_entry_t page_swpentry = page_swap_entry(page); struct zswap_entry *entry, *old; @@ -1456,7 +1456,7 @@ static ssize_t zswap_store_page(struct page *page, entry = zswap_entry_cache_alloc(GFP_KERNEL, page_to_nid(page)); if (!entry) { zswap_reject_kmemcache_fail++; - return -EINVAL; + return false; } if (!zswap_compress(page, entry, pool)) @@ -1483,13 +1483,17 @@ static ssize_t zswap_store_page(struct page 
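In vmap_small_pages_range_noflush() the error path now breaks out of the PGD loop instead of returning immediately, so arch_sync_kernel_mappings() still runs for whatever part of the range was modified before the error is propagated. The shape of that "finish the epilogue even on error" pattern, in a self-contained form with made-up helpers:

#include <stdio.h>

/* Simulated per-chunk mapping step; fails on chunk 3. */
static int map_chunk(int i)
{
    return i == 3 ? -12 /* -ENOMEM */ : 0;
}

static int map_range_demo(int nr_chunks, int *synced)
{
    int err = 0;
    int modified = 0;

    for (int i = 0; i < nr_chunks; i++) {
        err = map_chunk(i);
        if (err)
            break;          /* don't return yet - epilogue must still run */
        modified++;
    }

    if (modified)
        *synced = modified; /* stands in for arch_sync_kernel_mappings() */

    return err;             /* the error (if any) is still reported */
}

int main(void)
{
    int synced = 0;
    int err = map_range_demo(8, &synced);

    printf("err=%d, synced %d chunks before failing\n", err, synced);
    return 0;
}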
*page, /* * The entry is successfully compressed and stored in the tree, there is - * no further possibility of failure. Grab refs to the pool and objcg. - * These refs will be dropped by zswap_entry_free() when the entry is - * removed from the tree. + * no further possibility of failure. Grab refs to the pool and objcg, + * charge zswap memory, and increment zswap_stored_pages. + * The opposite actions will be performed by zswap_entry_free() + * when the entry is removed from the tree. */ zswap_pool_get(pool); - if (objcg) + if (objcg) { obj_cgroup_get(objcg); + obj_cgroup_charge_zswap(objcg, entry->length); + } + atomic_long_inc(&zswap_stored_pages); /* * We finish initializing the entry while it's already in xarray. @@ -1510,13 +1514,13 @@ static ssize_t zswap_store_page(struct page *page, zswap_lru_add(&zswap_list_lru, entry); } - return entry->length; + return true; store_failed: zpool_free(pool->zpool, entry->handle); compress_failed: zswap_entry_cache_free(entry); - return -EINVAL; + return false; } bool zswap_store(struct folio *folio) @@ -1526,7 +1530,6 @@ bool zswap_store(struct folio *folio) struct obj_cgroup *objcg = NULL; struct mem_cgroup *memcg = NULL; struct zswap_pool *pool; - size_t compressed_bytes = 0; bool ret = false; long index; @@ -1564,20 +1567,14 @@ bool zswap_store(struct folio *folio) for (index = 0; index < nr_pages; ++index) { struct page *page = folio_page(folio, index); - ssize_t bytes; - bytes = zswap_store_page(page, objcg, pool); - if (bytes < 0) + if (!zswap_store_page(page, objcg, pool)) goto put_pool; - compressed_bytes += bytes; } - if (objcg) { - obj_cgroup_charge_zswap(objcg, compressed_bytes); + if (objcg) count_objcg_events(objcg, ZSWPOUT, nr_pages); - } - atomic_long_add(nr_pages, &zswap_stored_pages); count_vm_events(ZSWPOUT, nr_pages); ret = true; |
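zswap_store_page() now returns a plain bool and does its own accounting, the objcg charge plus the zswap_stored_pages increment, for each page it successfully commits, instead of zswap_store() batching the charge after the loop; that keeps the counters consistent when a later page of the folio fails. A sketch of the same "account where you commit" structure, with plain counters in place of objcg charging and atomics and with made-up names:

#include <stdio.h>
#include <stddef.h>
#include <stdbool.h>

static long stored_pages;
static long charged_bytes;

/* Pretend compression fails for every fourth page. */
static bool store_page_demo(int index, size_t compressed_len)
{
    if (index % 4 == 3)
        return false;                 /* nothing committed, nothing charged */

    /* Success: commit and account in the same place. */
    stored_pages++;
    charged_bytes += compressed_len;
    return true;
}

static bool store_folio_demo(int nr_pages)
{
    for (int i = 0; i < nr_pages; i++) {
        if (!store_page_demo(i, 100 + i))
            return false;             /* earlier pages stay correctly accounted */
    }
    return true;
}

int main(void)
{
    bool ok = store_folio_demo(8);

    printf("ok=%d stored=%ld charged=%ld\n", ok, stored_pages, charged_bytes);
    return 0;
}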