diff options
author | Stephen Rothwell <sfr@canb.auug.org.au> | 2022-06-20 16:21:49 +1000 |
---|---|---|
committer | Stephen Rothwell <sfr@canb.auug.org.au> | 2022-06-20 16:21:49 +1000 |
commit | 71ce136b883e202d3d714e00641442649d37de42 (patch) | |
tree | 34f7a625000e88c692bbf5f5c8bfe1f0cca20cb8 /include | |
parent | 8b9dfef619c00f1631bb582c4f4b9d65e3c7dd86 (diff) | |
parent | 67978e7f29f00dcfb21158570418ca19c8cbb285 (diff) |
next-20220617/mm
Diffstat (limited to 'include')
34 files changed, 1284 insertions, 266 deletions
diff --git a/include/linux/damon.h b/include/linux/damon.h index 2765c7d99beb..7b1f4a488230 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -86,6 +86,8 @@ struct damon_target { * @DAMOS_PAGEOUT: Call ``madvise()`` for the region with MADV_PAGEOUT. * @DAMOS_HUGEPAGE: Call ``madvise()`` for the region with MADV_HUGEPAGE. * @DAMOS_NOHUGEPAGE: Call ``madvise()`` for the region with MADV_NOHUGEPAGE. + * @DAMOS_LRU_PRIO: Prioritize the region on its LRU lists. + * @DAMOS_LRU_DEPRIO: Deprioritize the region on its LRU lists. * @DAMOS_STAT: Do nothing but count the stat. * @NR_DAMOS_ACTIONS: Total number of DAMOS actions */ @@ -95,6 +97,8 @@ enum damos_action { DAMOS_PAGEOUT, DAMOS_HUGEPAGE, DAMOS_NOHUGEPAGE, + DAMOS_LRU_PRIO, + DAMOS_LRU_DEPRIO, DAMOS_STAT, /* Do nothing but only record the stat */ NR_DAMOS_ACTIONS, }; @@ -525,6 +529,12 @@ bool damon_is_registered_ops(enum damon_ops_id id); int damon_register_ops(struct damon_operations *ops); int damon_select_ops(struct damon_ctx *ctx, enum damon_ops_id id); +static inline bool damon_target_has_pid(const struct damon_ctx *ctx) +{ + return ctx->ops.id == DAMON_OPS_VADDR || ctx->ops.id == DAMON_OPS_FVADDR; +} + + int damon_start(struct damon_ctx **ctxs, int nr_ctxs, bool exclusive); int damon_stop(struct damon_ctx **ctxs, int nr_ctxs); diff --git a/include/linux/dax.h b/include/linux/dax.h index e7b81634c52a..ba985333e26b 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -43,8 +43,21 @@ struct dax_operations { void *addr, size_t bytes, struct iov_iter *iter); }; +struct dax_holder_operations { + /* + * notify_failure - notify memory failure into inner holder device + * @dax_dev: the dax device which contains the holder + * @offset: offset on this dax device where memory failure occurs + * @len: length of this memory failure event + * @flags: action flags for memory failure handler + */ + int (*notify_failure)(struct dax_device *dax_dev, u64 offset, + u64 len, int mf_flags); +}; + #if IS_ENABLED(CONFIG_DAX) struct dax_device *alloc_dax(void *private, const struct dax_operations *ops); +void *dax_holder(struct dax_device *dax_dev); void put_dax(struct dax_device *dax_dev); void kill_dax(struct dax_device *dax_dev); void dax_write_cache(struct dax_device *dax_dev, bool wc); @@ -66,6 +79,10 @@ static inline bool daxdev_mapping_supported(struct vm_area_struct *vma, return dax_synchronous(dax_dev); } #else +static inline void *dax_holder(struct dax_device *dax_dev) +{ + return NULL; +} static inline struct dax_device *alloc_dax(void *private, const struct dax_operations *ops) { @@ -114,12 +131,9 @@ struct writeback_control; #if defined(CONFIG_BLOCK) && defined(CONFIG_FS_DAX) int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk); void dax_remove_host(struct gendisk *disk); -struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, - u64 *start_off); -static inline void fs_put_dax(struct dax_device *dax_dev) -{ - put_dax(dax_dev); -} +struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, u64 *start_off, + void *holder, const struct dax_holder_operations *ops); +void fs_put_dax(struct dax_device *dax_dev, void *holder); #else static inline int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk) { @@ -129,11 +143,12 @@ static inline void dax_remove_host(struct gendisk *disk) { } static inline struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, - u64 *start_off) + u64 *start_off, void *holder, + const struct dax_holder_operations *ops) { return NULL; } -static inline void fs_put_dax(struct dax_device *dax_dev) +static inline void fs_put_dax(struct dax_device *dax_dev, void *holder) { } #endif /* CONFIG_BLOCK && CONFIG_FS_DAX */ @@ -146,6 +161,10 @@ struct page *dax_layout_busy_page(struct address_space *mapping); struct page *dax_layout_busy_page_range(struct address_space *mapping, loff_t start, loff_t end); dax_entry_t dax_lock_page(struct page *page); void dax_unlock_page(struct page *page, dax_entry_t cookie); +dax_entry_t dax_lock_mapping_entry(struct address_space *mapping, + unsigned long index, struct page **page); +void dax_unlock_mapping_entry(struct address_space *mapping, + unsigned long index, dax_entry_t cookie); #else static inline struct page *dax_layout_busy_page(struct address_space *mapping) { @@ -173,6 +192,17 @@ static inline dax_entry_t dax_lock_page(struct page *page) static inline void dax_unlock_page(struct page *page, dax_entry_t cookie) { } + +static inline dax_entry_t dax_lock_mapping_entry(struct address_space *mapping, + unsigned long index, struct page **page) +{ + return 0; +} + +static inline void dax_unlock_mapping_entry(struct address_space *mapping, + unsigned long index, dax_entry_t cookie) +{ +} #endif int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, @@ -203,6 +233,8 @@ size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i); int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff, size_t nr_pages); +int dax_holder_notify_failure(struct dax_device *dax_dev, u64 off, u64 len, + int mf_flags); void dax_flush(struct dax_device *dax_dev, void *addr, size_t size); ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, @@ -214,6 +246,14 @@ vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index); int dax_invalidate_mapping_entry_sync(struct address_space *mapping, pgoff_t index); +int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff, + struct inode *dest, loff_t destoff, + loff_t len, bool *is_same, + const struct iomap_ops *ops); +int dax_remap_file_range_prep(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t *len, unsigned int remap_flags, + const struct iomap_ops *ops); static inline bool dax_mapping(struct address_space *mapping) { return mapping->host && IS_DAX(mapping->host); diff --git a/include/linux/fs.h b/include/linux/fs.h index 7e06919b8f60..0ef49fed1300 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -74,6 +74,7 @@ struct fsverity_operations; struct fs_context; struct fs_parameter_spec; struct fileattr; +struct iomap_ops; extern void __init inode_init(void); extern void __init inode_init_early(void); @@ -2068,10 +2069,13 @@ extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *, extern ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, size_t len, unsigned int flags); -extern int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - loff_t *count, - unsigned int remap_flags); +int __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t *len, unsigned int remap_flags, + const struct iomap_ops *dax_read_ops); +int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t *count, unsigned int remap_flags); extern loff_t do_clone_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t len, unsigned int remap_flags); diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 2d2ccae933c2..9a88cce23e17 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -39,7 +39,7 @@ struct vm_area_struct; #define ___GFP_IO 0x40u #define ___GFP_FS 0x80u #define ___GFP_ZERO 0x100u -#define ___GFP_ATOMIC 0x200u +/* 0x200u unused */ #define ___GFP_DIRECT_RECLAIM 0x400u #define ___GFP_KSWAPD_RECLAIM 0x800u #define ___GFP_WRITE 0x1000u @@ -124,11 +124,8 @@ struct vm_area_struct; * * %__GFP_HIGH indicates that the caller is high-priority and that granting * the request is necessary before the system can make forward progress. - * For example, creating an IO context to clean pages. - * - * %__GFP_ATOMIC indicates that the caller cannot reclaim or sleep and is - * high priority. Users are typically interrupt handlers. This may be - * used in conjunction with %__GFP_HIGH + * For example creating an IO context to clean pages and requests + * from atomic context. * * %__GFP_MEMALLOC allows access to all memory. This should only be used when * the caller guarantees the allocation will allow more memory to be freed @@ -143,7 +140,6 @@ struct vm_area_struct; * %__GFP_NOMEMALLOC is used to explicitly forbid access to emergency reserves. * This takes precedence over the %__GFP_MEMALLOC flag if both are set. */ -#define __GFP_ATOMIC ((__force gfp_t)___GFP_ATOMIC) #define __GFP_HIGH ((__force gfp_t)___GFP_HIGH) #define __GFP_MEMALLOC ((__force gfp_t)___GFP_MEMALLOC) #define __GFP_NOMEMALLOC ((__force gfp_t)___GFP_NOMEMALLOC) @@ -337,7 +333,7 @@ struct vm_area_struct; * version does not attempt reclaim/compaction at all and is by default used * in page fault path, while the non-light is used by khugepaged. */ -#define GFP_ATOMIC (__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM) +#define GFP_ATOMIC (__GFP_HIGH|__GFP_KSWAPD_RECLAIM) #define GFP_KERNEL (__GFP_RECLAIM | __GFP_IO | __GFP_FS) #define GFP_KERNEL_ACCOUNT (GFP_KERNEL | __GFP_ACCOUNT) #define GFP_NOWAIT (__GFP_KSWAPD_RECLAIM) diff --git a/include/linux/highmem.h b/include/linux/highmem.h index fee9835e3793..22379a63e293 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -243,6 +243,16 @@ static inline void clear_highpage(struct page *page) kunmap_local(kaddr); } +static inline void clear_highpage_kasan_tagged(struct page *page) +{ + u8 tag; + + tag = page_kasan_tag(page); + page_kasan_tag_reset(page); + clear_highpage(page); + page_kasan_tag_set(page, tag); +} + #ifndef __HAVE_ARCH_TAG_CLEAR_HIGHPAGE static inline void tag_clear_highpage(struct page *page) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index de29821231c9..ae3d8e2fd9e2 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -116,9 +116,30 @@ extern struct kobj_attribute shmem_enabled_attr; extern unsigned long transparent_hugepage_flags; +#define hugepage_flags_enabled() \ + (transparent_hugepage_flags & \ + ((1<<TRANSPARENT_HUGEPAGE_FLAG) | \ + (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG))) +#define hugepage_flags_always() \ + (transparent_hugepage_flags & \ + (1<<TRANSPARENT_HUGEPAGE_FLAG)) + +/* + * Do the below checks: + * - For file vma, check if the linear page offset of vma is + * HPAGE_PMD_NR aligned within the file. The hugepage is + * guaranteed to be hugepage-aligned within the file, but we must + * check that the PMD-aligned addresses in the VMA map to + * PMD-aligned offsets within the file, else the hugepage will + * not be PMD-mappable. + * - For all vmas, check if the haddr is in an aligned HPAGE_PMD_SIZE + * area. + */ static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, - unsigned long haddr) + unsigned long addr) { + unsigned long haddr; + /* Don't have to check pgoff for anonymous vma */ if (!vma_is_anonymous(vma)) { if (!IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff, @@ -126,53 +147,13 @@ static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, return false; } - if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end) - return false; - return true; -} + haddr = addr & HPAGE_PMD_MASK; -static inline bool transhuge_vma_enabled(struct vm_area_struct *vma, - unsigned long vm_flags) -{ - /* Explicitly disabled through madvise. */ - if ((vm_flags & VM_NOHUGEPAGE) || - test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) + if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end) return false; return true; } -/* - * to be used on vmas which are known to support THP. - * Use transparent_hugepage_active otherwise - */ -static inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma) -{ - - /* - * If the hardware/firmware marked hugepage support disabled. - */ - if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_NEVER_DAX)) - return false; - - if (!transhuge_vma_enabled(vma, vma->vm_flags)) - return false; - - if (vma_is_temporary_stack(vma)) - return false; - - if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_FLAG)) - return true; - - if (vma_is_dax(vma)) - return true; - - if (transparent_hugepage_flags & - (1 << TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)) - return !!(vma->vm_flags & VM_HUGEPAGE); - - return false; -} - static inline bool file_thp_enabled(struct vm_area_struct *vma) { struct inode *inode; @@ -187,7 +168,9 @@ static inline bool file_thp_enabled(struct vm_area_struct *vma) !inode_is_open_for_write(inode) && S_ISREG(inode->i_mode); } -bool transparent_hugepage_active(struct vm_area_struct *vma); +bool hugepage_vma_check(struct vm_area_struct *vma, + unsigned long vm_flags, + bool smaps, bool in_pf); #define transparent_hugepage_use_zero_page() \ (transparent_hugepage_flags & \ @@ -331,24 +314,15 @@ static inline bool folio_test_pmd_mappable(struct folio *folio) return false; } -static inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma) -{ - return false; -} - -static inline bool transparent_hugepage_active(struct vm_area_struct *vma) -{ - return false; -} - static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, - unsigned long haddr) + unsigned long addr) { return false; } -static inline bool transhuge_vma_enabled(struct vm_area_struct *vma, - unsigned long vm_flags) +static inline bool hugepage_vma_check(struct vm_area_struct *vma, + unsigned long vm_flags, + bool smaps, bool in_pf) { return false; } diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index e4cff27d1198..642a39016f9a 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -43,6 +43,9 @@ enum { SUBPAGE_INDEX_CGROUP_RSVD, /* reuse page->private */ __MAX_CGROUP_SUBPAGE_INDEX = SUBPAGE_INDEX_CGROUP_RSVD, #endif +#ifdef CONFIG_MEMORY_FAILURE + SUBPAGE_INDEX_HWPOISON, +#endif __NR_USED_SUBPAGE, }; @@ -170,7 +173,7 @@ bool hugetlb_reserve_pages(struct inode *inode, long from, long to, vm_flags_t vm_flags); long hugetlb_unreserve_pages(struct inode *inode, long start, long end, long freed); -bool isolate_huge_page(struct page *page, struct list_head *list); +int isolate_hugetlb(struct page *page, struct list_head *list); int get_hwpoison_huge_page(struct page *page, bool *hugetlb); int get_huge_page_for_hwpoison(unsigned long pfn, int flags); void putback_active_hugepage(struct page *page); @@ -376,9 +379,9 @@ static inline pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, return NULL; } -static inline bool isolate_huge_page(struct page *page, struct list_head *list) +static inline int isolate_hugetlb(struct page *page, struct list_head *list) { - return false; + return -EBUSY; } static inline int get_hwpoison_huge_page(struct page *page, bool *hugetlb) @@ -798,6 +801,27 @@ extern int dissolve_free_huge_page(struct page *page); extern int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn); +#ifdef CONFIG_MEMORY_FAILURE +/* + * pointer to raw error page is located in hpage[SUBPAGE_INDEX_HWPOISON].private + */ +static inline struct page *hugetlb_page_hwpoison(struct page *hpage) +{ + return (void *)page_private(hpage + SUBPAGE_INDEX_HWPOISON); +} + +static inline void hugetlb_set_page_hwpoison(struct page *hpage, + struct page *page) +{ + set_page_private(hpage + SUBPAGE_INDEX_HWPOISON, (unsigned long)page); +} +#else +static inline struct page *hugetlb_page_hwpoison(struct page *hpage) +{ + return NULL; +} +#endif + #ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION #ifndef arch_hugetlb_migration_supported static inline bool arch_hugetlb_migration_supported(struct hstate *h) diff --git a/include/linux/kconfig.h b/include/linux/kconfig.h index 20d1079e92b4..7044032b9f42 100644 --- a/include/linux/kconfig.h +++ b/include/linux/kconfig.h @@ -10,6 +10,7 @@ #define __LITTLE_ENDIAN 1234 #endif +#define __ARG_PLACEHOLDER_ 0, #define __ARG_PLACEHOLDER_1 0, #define __take_second_arg(__ignored, val, ...) val diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h index 392d34c3c59a..384f034ae947 100644 --- a/include/linux/khugepaged.h +++ b/include/linux/khugepaged.h @@ -10,8 +10,6 @@ extern struct attribute_group khugepaged_attr_group; extern int khugepaged_init(void); extern void khugepaged_destroy(void); extern int start_stop_khugepaged(void); -extern bool hugepage_vma_check(struct vm_area_struct *vma, - unsigned long vm_flags); extern void __khugepaged_enter(struct mm_struct *mm); extern void __khugepaged_exit(struct mm_struct *mm); extern void khugepaged_enter_vma(struct vm_area_struct *vma, @@ -26,20 +24,6 @@ static inline void collapse_pte_mapped_thp(struct mm_struct *mm, } #endif -#define khugepaged_enabled() \ - (transparent_hugepage_flags & \ - ((1<<TRANSPARENT_HUGEPAGE_FLAG) | \ - (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG))) -#define khugepaged_always() \ - (transparent_hugepage_flags & \ - (1<<TRANSPARENT_HUGEPAGE_FLAG)) -#define khugepaged_req_madv() \ - (transparent_hugepage_flags & \ - (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)) -#define khugepaged_defrag() \ - (transparent_hugepage_flags & \ - (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)) - static inline void khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm) { if (test_bit(MMF_VM_HUGEPAGE, &oldmm->flags)) @@ -51,16 +35,6 @@ static inline void khugepaged_exit(struct mm_struct *mm) if (test_bit(MMF_VM_HUGEPAGE, &mm->flags)) __khugepaged_exit(mm); } - -static inline void khugepaged_enter(struct vm_area_struct *vma, - unsigned long vm_flags) -{ - if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) && - khugepaged_enabled()) { - if (hugepage_vma_check(vma, vm_flags)) - __khugepaged_enter(vma->vm_mm); - } -} #else /* CONFIG_TRANSPARENT_HUGEPAGE */ static inline void khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm) { @@ -68,10 +42,6 @@ static inline void khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm static inline void khugepaged_exit(struct mm_struct *mm) { } -static inline void khugepaged_enter(struct vm_area_struct *vma, - unsigned long vm_flags) -{ -} static inline void khugepaged_enter_vma(struct vm_area_struct *vma, unsigned long vm_flags) { diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h new file mode 100644 index 000000000000..3c6642358904 --- /dev/null +++ b/include/linux/maple_tree.h @@ -0,0 +1,685 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +#ifndef _LINUX_MAPLE_TREE_H +#define _LINUX_MAPLE_TREE_H +/* + * Maple Tree - An RCU-safe adaptive tree for storing ranges + * Copyright (c) 2018-2022 Oracle + * Authors: Liam R. Howlett <Liam.Howlett@Oracle.com> + * Matthew Wilcox <willy@infradead.org> + */ + +#include <linux/kernel.h> +#include <linux/rcupdate.h> +#include <linux/spinlock.h> +/* #define CONFIG_MAPLE_RCU_DISABLED */ +/* #define CONFIG_DEBUG_MAPLE_TREE_VERBOSE */ + +/* + * Allocated nodes are mutable until they have been inserted into the tree, + * at which time they cannot change their type until they have been removed + * from the tree and an RCU grace period has passed. + * + * Removed nodes have their ->parent set to point to themselves. RCU readers + * check ->parent before relying on the value that they loaded from the + * slots array. This lets us reuse the slots array for the RCU head. + * + * Nodes in the tree point to their parent unless bit 0 is set. + */ +#if defined(CONFIG_64BIT) || defined(BUILD_VDSO32_64) +/* 64bit sizes */ +#define MAPLE_NODE_SLOTS 31 /* 256 bytes including ->parent */ +#define MAPLE_RANGE64_SLOTS 16 /* 256 bytes */ +#define MAPLE_ARANGE64_SLOTS 10 /* 240 bytes */ +#define MAPLE_ARANGE64_META_MAX 15 /* Out of range for metadata */ +#define MAPLE_ALLOC_SLOTS (MAPLE_NODE_SLOTS - 1) +#else +/* 32bit sizes */ +#define MAPLE_NODE_SLOTS 63 /* 256 bytes including ->parent */ +#define MAPLE_RANGE64_SLOTS 32 /* 256 bytes */ +#define MAPLE_ARANGE64_SLOTS 21 /* 240 bytes */ +#define MAPLE_ARANGE64_META_MAX 31 /* Out of range for metadata */ +#define MAPLE_ALLOC_SLOTS (MAPLE_NODE_SLOTS - 2) +#endif /* defined(CONFIG_64BIT) || defined(BUILD_VDSO32_64) */ + +#define MAPLE_NODE_MASK 255UL + +/* + * The node->parent of the root node has bit 0 set and the rest of the pointer + * is a pointer to the tree itself. No more bits are available in this pointer + * (on m68k, the data structure may only be 2-byte aligned). + * + * Internal non-root nodes can only have maple_range_* nodes as parents. The + * parent pointer is 256B aligned like all other tree nodes. When storing a 32 + * or 64 bit values, the offset can fit into 4 bits. The 16 bit values need an + * extra bit to store the offset. This extra bit comes from a reuse of the last + * bit in the node type. This is possible by using bit 1 to indicate if bit 2 + * is part of the type or the slot. + * + * Once the type is decided, the decision of an allocation range type or a range + * type is done by examining the immutable tree flag for the MAPLE_ALLOC_RANGE + * flag. + * + * Node types: + * 0x??1 = Root + * 0x?00 = 16 bit nodes + * 0x010 = 32 bit nodes + * 0x110 = 64 bit nodes + * + * Slot size and location in the parent pointer: + * type : slot location + * 0x??1 : Root + * 0x?00 : 16 bit values, type in 0-1, slot in 2-6 + * 0x010 : 32 bit values, type in 0-2, slot in 3-6 + * 0x110 : 64 bit values, type in 0-2, slot in 3-6 + */ +typedef struct maple_enode *maple_enode; /* encoded node */ +typedef struct maple_pnode *maple_pnode; /* parent node */ + +/* + * This metadata is used to optimize the gap updating code and in reverse + * searching for gaps or any other code that needs to find the end of the data. + */ +struct maple_metadata { + unsigned char end; + unsigned char gap; +}; + +/* + * Leaf nodes do not store pointers to nodes, they store user data. Users may + * store almost any bit pattern. As noted above, the optimisation of storing an + * entry at 0 in the root pointer cannot be done for data which have the bottom + * two bits set to '10'. We also reserve values with the bottom two bits set to + * '10' which are below 4096 (ie 2, 6, 10 .. 4094) for internal use. Some APIs + * return errnos as a negative errno shifted right by two bits and the bottom + * two bits set to '10', and while choosing to store these values in the array + * is not an error, it may lead to confusion if you're testing for an error with + * mas_is_err(). + * + * Non-leaf nodes store the type of the node pointed to (enum maple_type in bits + * 3-6), bit 2 is reserved. That leaves bits 0-1 unused for now. + * + * In regular B-Tree terms, pivots are called keys. The term pivot is used to + * indicate that the tree is specifying ranges, Pivots may appear in the + * subtree with an entry attached to the value whereas keys are unique to a + * specific position of a B-tree. Pivot values are inclusive of the slot with + * the same index. + */ + +struct maple_range_64 { + struct maple_pnode *parent; + unsigned long pivot[MAPLE_RANGE64_SLOTS - 1]; + union { + void __rcu *slot[MAPLE_RANGE64_SLOTS]; + struct { + void __rcu *pad[MAPLE_RANGE64_SLOTS - 1]; + struct maple_metadata meta; + }; + }; +}; + +/* + * At tree creation time, the user can specify that they're willing to trade off + * storing fewer entries in a tree in return for storing more information in + * each node. + * + * The maple tree supports recording the largest range of NULL entries available + * in this node, also called gaps. This optimises the tree for allocating a + * range. + */ +struct maple_arange_64 { + struct maple_pnode *parent; + unsigned long pivot[MAPLE_ARANGE64_SLOTS - 1]; + void __rcu *slot[MAPLE_ARANGE64_SLOTS]; + unsigned long gap[MAPLE_ARANGE64_SLOTS]; + struct maple_metadata meta; +}; + +struct maple_alloc { + unsigned long total; + unsigned char node_count; + unsigned int request_count; + struct maple_alloc *slot[MAPLE_ALLOC_SLOTS]; +}; + +struct maple_topiary { + struct maple_pnode *parent; + struct maple_enode *next; /* Overlaps the pivot */ +}; + +enum maple_type { + maple_dense, + maple_leaf_64, + maple_range_64, + maple_arange_64, +}; + + +/** + * DOC: Maple tree flags + * + * * MT_FLAGS_ALLOC_RANGE - Track gaps in this tree + * * MT_FLAGS_USE_RCU - Operate in RCU mode + * * MT_FLAGS_HEIGHT_OFFSET - The position of the tree height in the flags + * * MT_FLAGS_HEIGHT_MASK - The mask for the maple tree height value + * * MT_FLAGS_LOCK_MASK - How the mt_lock is used + * * MT_FLAGS_LOCK_IRQ - Acquired irq-safe + * * MT_FLAGS_LOCK_BH - Acquired bh-safe + * * MT_FLAGS_LOCK_EXTERN - mt_lock is not used + * + * MAPLE_HEIGHT_MAX The largest height that can be stored + */ +#define MT_FLAGS_ALLOC_RANGE 0x01 +#define MT_FLAGS_USE_RCU 0x02 +#define MT_FLAGS_HEIGHT_OFFSET 0x02 +#define MT_FLAGS_HEIGHT_MASK 0x7C +#define MT_FLAGS_LOCK_MASK 0x300 +#define MT_FLAGS_LOCK_IRQ 0x100 +#define MT_FLAGS_LOCK_BH 0x200 +#define MT_FLAGS_LOCK_EXTERN 0x300 + +#define MAPLE_HEIGHT_MAX 31 + + +#define MAPLE_NODE_TYPE_MASK 0x0F +#define MAPLE_NODE_TYPE_SHIFT 0x03 + +#define MAPLE_RESERVED_RANGE 4096 + +#ifdef CONFIG_LOCKDEP +typedef struct lockdep_map *lockdep_map_p; +#define mt_lock_is_held(mt) lock_is_held(mt->ma_external_lock) +#define mt_set_external_lock(mt, lock) \ + (mt)->ma_external_lock = &(lock)->dep_map +#else +typedef struct { /* nothing */ } lockdep_map_p; +#define mt_lock_is_held(mt) 1 +#define mt_set_external_lock(mt, lock) do { } while (0) +#endif + +/* + * If the tree contains a single entry at index 0, it is usually stored in + * tree->ma_root. To optimise for the page cache, an entry which ends in '00', + * '01' or '11' is stored in the root, but an entry which ends in '10' will be + * stored in a node. Bits 3-6 are used to store enum maple_type. + * + * The flags are used both to store some immutable information about this tree + * (set at tree creation time) and dynamic information set under the spinlock. + * + * Another use of flags are to indicate global states of the tree. This is the + * case with the MAPLE_USE_RCU flag, which indicates the tree is currently in + * RCU mode. This mode was added to allow the tree to reuse nodes instead of + * re-allocating and RCU freeing nodes when there is a single user. + */ +struct maple_tree { + union { + spinlock_t ma_lock; + lockdep_map_p ma_external_lock; + }; + void __rcu *ma_root; + unsigned int ma_flags; +}; + +/** + * MTREE_INIT() - Initialize a maple tree + * @name: The maple tree name + * @flags: The maple tree flags + * + */ +#define MTREE_INIT(name, flags) { \ + .ma_lock = __SPIN_LOCK_UNLOCKED(name.ma_lock), \ + .ma_flags = flags, \ + .ma_root = NULL, \ +} + +/** + * MTREE_INIT_EXT() - Initialize a maple tree with an external lock. + * @name: The tree name + * @flags: The maple tree flags + * @lock: The external lock + */ +#ifdef CONFIG_LOCKDEP +#define MTREE_INIT_EXT(name, flags, lock) { \ + .ma_external_lock = &(lock).dep_map, \ + .ma_flags = flags, \ + .ma_root = NULL, \ +} +#else +#define MTREE_INIT_EXT(name, flags, lock) MTREE_INIT(name, flags) +#endif + +#define DEFINE_MTREE(name) \ + struct maple_tree name = MTREE_INIT(name, 0) + +#define mtree_lock(mt) spin_lock((&(mt)->ma_lock)) +#define mtree_unlock(mt) spin_unlock((&(mt)->ma_lock)) + +/* + * The Maple Tree squeezes various bits in at various points which aren't + * necessarily obvious. Usually, this is done by observing that pointers are + * N-byte aligned and thus the bottom log_2(N) bits are available for use. We + * don't use the high bits of pointers to store additional information because + * we don't know what bits are unused on any given architecture. + * + * Nodes are 256 bytes in size and are also aligned to 256 bytes, giving us 8 + * low bits for our own purposes. Nodes are currently of 4 types: + * 1. Single pointer (Range is 0-0) + * 2. Non-leaf Allocation Range nodes + * 3. Non-leaf Range nodes + * 4. Leaf Range nodes All nodes consist of a number of node slots, + * pivots, and a parent pointer. + */ + +struct maple_node { + union { + struct { + struct maple_pnode *parent; + void __rcu *slot[MAPLE_NODE_SLOTS]; + }; + struct { + void *pad; + struct rcu_head rcu; + struct maple_enode *piv_parent; + unsigned char parent_slot; + enum maple_type type; + unsigned char slot_len; + unsigned int ma_flags; + }; + struct maple_range_64 mr64; + struct maple_arange_64 ma64; + struct maple_alloc alloc; + }; +}; + +/* + * More complicated stores can cause two nodes to become one or three and + * potentially alter the height of the tree. Either half of the tree may need + * to be rebalanced against the other. The ma_topiary struct is used to track + * which nodes have been 'cut' from the tree so that the change can be done + * safely at a later date. This is done to support RCU. + */ +struct ma_topiary { + struct maple_enode *head; + struct maple_enode *tail; + struct maple_tree *mtree; +}; + +void *mtree_load(struct maple_tree *mt, unsigned long index); + +int mtree_insert(struct maple_tree *mt, unsigned long index, + void *entry, gfp_t gfp); +int mtree_insert_range(struct maple_tree *mt, unsigned long first, + unsigned long last, void *entry, gfp_t gfp); +int mtree_alloc_range(struct maple_tree *mt, unsigned long *startp, + void *entry, unsigned long size, unsigned long min, + unsigned long max, gfp_t gfp); +int mtree_alloc_rrange(struct maple_tree *mt, unsigned long *startp, + void *entry, unsigned long size, unsigned long min, + unsigned long max, gfp_t gfp); + +int mtree_store_range(struct maple_tree *mt, unsigned long first, + unsigned long last, void *entry, gfp_t gfp); +int mtree_store(struct maple_tree *mt, unsigned long index, + void *entry, gfp_t gfp); +void *mtree_erase(struct maple_tree *mt, unsigned long index); + +void mtree_destroy(struct maple_tree *mt); +void __mt_destroy(struct maple_tree *mt); + +/** + * mtree_empty() - Determine if a tree has any present entries. + * @mt: Maple Tree. + * + * Context: Any context. + * Return: %true if the tree contains only NULL pointers. + */ +static inline bool mtree_empty(const struct maple_tree *mt) +{ + return mt->ma_root == NULL; +} + +/* Advanced API */ + +/* + * The maple state is defined in the struct ma_state and is used to keep track + * of information during operations, and even between operations when using the + * advanced API. + * + * If state->node has bit 0 set then it references a tree location which is not + * a node (eg the root). If bit 1 is set, the rest of the bits are a negative + * errno. Bit 2 (the 'unallocated slots' bit) is clear. Bits 3-6 indicate the + * node type. + * + * state->alloc either has a request number of nodes or an allocated node. If + * stat->alloc has a requested number of nodes, the first bit will be set (0x1) + * and the remaining bits are the value. If state->alloc is a node, then the + * node will be of type maple_alloc. maple_alloc has MAPLE_NODE_SLOTS - 1 for + * storing more allocated nodes, a total number of nodes allocated, and the + * node_count in this node. node_count is the number of allocated nodes in this + * node. The scaling beyond MAPLE_NODE_SLOTS - 1 is handled by storing further + * nodes into state->alloc->slot[0]'s node. Nodes are taken from state->alloc + * by removing a node from the state->alloc node until state->alloc->node_count + * is 1, when state->alloc is returned and the state->alloc->slot[0] is promoted + * to state->alloc. Nodes are pushed onto state->alloc by putting the current + * state->alloc into the pushed node's slot[0]. + * + * The state also contains the implied min/max of the state->node, the depth of + * this search, and the offset. The implied min/max are either from the parent + * node or are 0-oo for the root node. The depth is incremented or decremented + * every time a node is walked down or up. The offset is the slot/pivot of + * interest in the node - either for reading or writing. + * + * When returning a value the maple state index and last respectively contain + * the start and end of the range for the entry. Ranges are inclusive in the + * Maple Tree. + */ +struct ma_state { + struct maple_tree *tree; /* The tree we're operating in */ + unsigned long index; /* The index we're operating on - range start */ + unsigned long last; /* The last index we're operating on - range end */ + struct maple_enode *node; /* The node containing this entry */ + unsigned long min; /* The minimum index of this node - implied pivot min */ + unsigned long max; /* The maximum index of this node - implied pivot max */ + struct maple_alloc *alloc; /* Allocated nodes for this operation */ + unsigned char depth; /* depth of tree descent during write */ + unsigned char offset; + unsigned char mas_flags; +}; + +struct ma_wr_state { + struct ma_state *mas; + struct maple_node *node; /* Decoded mas->node */ + unsigned long r_min; /* range min */ + unsigned long r_max; /* range max */ + enum maple_type type; /* mas->node type */ + unsigned char offset_end; /* The offset where the write ends */ + unsigned char node_end; /* mas->node end */ + unsigned long *pivots; /* mas->node->pivots pointer */ + unsigned long end_piv; /* The pivot at the offset end */ + void __rcu **slots; /* mas->node->slots pointer */ + void *entry; /* The entry to write */ + void *content; /* The existing entry that is being overwritten */ +}; + +#define mas_lock(mas) spin_lock(&((mas)->tree->ma_lock)) +#define mas_unlock(mas) spin_unlock(&((mas)->tree->ma_lock)) + + +/* + * Special values for ma_state.node. + * MAS_START means we have not searched the tree. + * MAS_ROOT means we have searched the tree and the entry we found lives in + * the root of the tree (ie it has index 0, length 1 and is the only entry in + * the tree). + * MAS_NONE means we have searched the tree and there is no node in the + * tree for this entry. For example, we searched for index 1 in an empty + * tree. Or we have a tree which points to a full leaf node and we + * searched for an entry which is larger than can be contained in that + * leaf node. + * MA_ERROR represents an errno. After dropping the lock and attempting + * to resolve the error, the walk would have to be restarted from the + * top of the tree as the tree may have been modified. + */ +#define MAS_START ((struct maple_enode *)1UL) +#define MAS_ROOT ((struct maple_enode *)5UL) +#define MAS_NONE ((struct maple_enode *)9UL) +#define MAS_PAUSE ((struct maple_enode *)17UL) +#define MA_ERROR(err) \ + ((struct maple_enode *)(((unsigned long)err << 2) | 2UL)) + +#define MA_STATE(name, mt, first, end) \ + struct ma_state name = { \ + .tree = mt, \ + .index = first, \ + .last = end, \ + .node = MAS_START, \ + .min = 0, \ + .max = ULONG_MAX, \ + } + +#define MA_WR_STATE(name, ma_state, wr_entry) \ + struct ma_wr_state name = { \ + .mas = ma_state, \ + .content = NULL, \ + .entry = wr_entry, \ + } + +#define MA_TOPIARY(name, tree) \ + struct ma_topiary name = { \ + .head = NULL, \ + .tail = NULL, \ + .mtree = tree, \ + } + +void *mas_walk(struct ma_state *mas); +void *mas_store(struct ma_state *mas, void *entry); +void *mas_erase(struct ma_state *mas); +int mas_store_gfp(struct ma_state *mas, void *entry, gfp_t gfp); +void mas_store_prealloc(struct ma_state *mas, void *entry); +void *mas_find(struct ma_state *mas, unsigned long max); +void *mas_find_rev(struct ma_state *mas, unsigned long min); +int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp); + +bool mas_nomem(struct ma_state *mas, gfp_t gfp); +void mas_pause(struct ma_state *mas); +void maple_tree_init(void); +void mas_destroy(struct ma_state *mas); +int mas_expected_entries(struct ma_state *mas, unsigned long nr_entries); + +void *mas_prev(struct ma_state *mas, unsigned long min); +void *mas_next(struct ma_state *mas, unsigned long max); + +int mas_empty_area(struct ma_state *mas, unsigned long min, unsigned long max, + unsigned long size); + +/* Checks if a mas has not found anything */ +static inline bool mas_is_none(struct ma_state *mas) +{ + return mas->node == MAS_NONE; +} + +/* Checks if a mas has been paused */ +static inline bool mas_is_paused(struct ma_state *mas) +{ + return mas->node == MAS_PAUSE; +} + +void mas_dup_tree(struct ma_state *oldmas, struct ma_state *mas); +void mas_dup_store(struct ma_state *mas, void *entry); + +/* + * This finds an empty area from the highest address to the lowest. + * AKA "Topdown" version, + */ +int mas_empty_area_rev(struct ma_state *mas, unsigned long min, + unsigned long max, unsigned long size); +/** + * mas_reset() - Reset a Maple Tree operation state. + * @mas: Maple Tree operation state. + * + * Resets the error or walk state of the @mas so future walks of the + * array will start from the root. Use this if you have dropped the + * lock and want to reuse the ma_state. + * + * Context: Any context. + */ +static inline void mas_reset(struct ma_state *mas) +{ + mas->node = MAS_START; +} + +/** + * mas_for_each() - Iterate over a range of the maple tree. + * @mas: Maple Tree operation state (maple_state) + * @entry: Entry retrieved from the tree + * @max: maximum index to retrieve from the tree + * + * When returned, mas->index and mas->last will hold the entire range for the + * entry. + * + * Note: may return the zero entry. + * + */ +#define mas_for_each(mas, entry, max) \ + while (((entry) = mas_find((mas), (max))) != NULL) + + +/** + * mas_set_range() - Set up Maple Tree operation state for a different index. + * @mas: Maple Tree operation state. + * @start: New start of range in the Maple Tree. + * @last: New end of range in the Maple Tree. + * + * Move the operation state to refer to a different range. This will + * have the effect of starting a walk from the top; see mas_next() + * to move to an adjacent index. + */ +static inline +void mas_set_range(struct ma_state *mas, unsigned long start, unsigned long last) +{ + mas->index = start; + mas->last = last; + mas->node = MAS_START; +} + +/** + * mas_set() - Set up Maple Tree operation state for a different index. + * @mas: Maple Tree operation state. + * @index: New index into the Maple Tree. + * + * Move the operation state to refer to a different index. This will + * have the effect of starting a walk from the top; see mas_next() + * to move to an adjacent index. + */ +static inline void mas_set(struct ma_state *mas, unsigned long index) +{ + + mas_set_range(mas, index, index); +} + +static inline bool mt_external_lock(const struct maple_tree *mt) +{ + return (mt->ma_flags & MT_FLAGS_LOCK_MASK) == MT_FLAGS_LOCK_EXTERN; +} + +/** + * mt_init_flags() - Initialise an empty maple tree with flags. + * @mt: Maple Tree + * @flags: maple tree flags. + * + * If you need to initialise a Maple Tree with special flags (eg, an + * allocation tree), use this function. + * + * Context: Any context. + */ +static inline void mt_init_flags(struct maple_tree *mt, unsigned int flags) +{ + mt->ma_flags = flags; + if (!mt_external_lock(mt)) + spin_lock_init(&mt->ma_lock); + rcu_assign_pointer(mt->ma_root, NULL); +} + +/** + * mt_init() - Initialise an empty maple tree. + * @mt: Maple Tree + * + * An empty Maple Tree. + * + * Context: Any context. + */ +static inline void mt_init(struct maple_tree *mt) +{ + mt_init_flags(mt, 0); +} + +static inline bool mt_in_rcu(struct maple_tree *mt) +{ +#ifdef CONFIG_MAPLE_RCU_DISABLED + return false; +#endif + return mt->ma_flags & MT_FLAGS_USE_RCU; +} + +/** + * mt_clear_in_rcu() - Switch the tree to non-RCU mode. + * @mt: The Maple Tree + */ +static inline void mt_clear_in_rcu(struct maple_tree *mt) +{ + if (!mt_in_rcu(mt)) + return; + + if (mt_external_lock(mt)) { + BUG_ON(!mt_lock_is_held(mt)); + mt->ma_flags &= ~MT_FLAGS_USE_RCU; + } else { + mtree_lock(mt); + mt->ma_flags &= ~MT_FLAGS_USE_RCU; + mtree_unlock(mt); + } +} + +/** + * mt_set_in_rcu() - Switch the tree to RCU safe mode. + * @mt: The Maple Tree + */ +static inline void mt_set_in_rcu(struct maple_tree *mt) +{ + if (mt_in_rcu(mt)) + return; + + if (mt_external_lock(mt)) { + BUG_ON(!mt_lock_is_held(mt)); + mt->ma_flags |= MT_FLAGS_USE_RCU; + } else { + mtree_lock(mt); + mt->ma_flags |= MT_FLAGS_USE_RCU; + mtree_unlock(mt); + } +} + +void *mt_find(struct maple_tree *mt, unsigned long *index, unsigned long max); +void *mt_find_after(struct maple_tree *mt, unsigned long *index, + unsigned long max); +void *mt_prev(struct maple_tree *mt, unsigned long index, unsigned long min); +void *mt_next(struct maple_tree *mt, unsigned long index, unsigned long max); + +/** + * mt_for_each - Iterate over each entry starting at index until max. + * @tree: The Maple Tree + * @entry: The current entry + * @index: The index to update to track the location in the tree + * @max: The maximum limit for @index + * + * Note: Will not return the zero entry. + */ +#define mt_for_each(tree, entry, index, max) \ + for (entry = mt_find(tree, &(index), max); \ + entry; entry = mt_find_after(tree, &(index), max)) + + +#ifdef CONFIG_DEBUG_MAPLE_TREE +extern atomic_t maple_tree_tests_run; +extern atomic_t maple_tree_tests_passed; + +void mt_dump(const struct maple_tree *mt); +void mt_validate(struct maple_tree *mt); +#define MT_BUG_ON(tree, x) do { \ + atomic_inc(&maple_tree_tests_run); \ + if (x) { \ + pr_info("BUG at %s:%d (%u)\n", \ + __func__, __LINE__, x); \ + mt_dump(tree); \ + pr_info("Pass: %u Run:%u\n", \ + atomic_read(&maple_tree_tests_passed), \ + atomic_read(&maple_tree_tests_run)); \ + dump_stack(); \ + } else { \ + atomic_inc(&maple_tree_tests_passed); \ + } \ +} while (0) +#else +#define MT_BUG_ON(tree, x) BUG_ON(x) +#endif /* CONFIG_DEBUG_MAPLE_TREE */ + +#endif /*_LINUX_MAPLE_TREE_H */ diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 04f2f33607e9..4d31ce55b1c0 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -837,6 +837,15 @@ static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) } struct mem_cgroup *mem_cgroup_from_id(unsigned short id); +#ifdef CONFIG_SHRINKER_DEBUG +static inline unsigned long mem_cgroup_ino(struct mem_cgroup *memcg) +{ + return memcg ? cgroup_ino(memcg->css.cgroup) : 0; +} + +struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino); +#endif + static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m) { return mem_cgroup_from_css(seq_css(m)); @@ -1343,6 +1352,18 @@ static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) return NULL; } +#ifdef CONFIG_SHRINKER_DEBUG +static inline unsigned long mem_cgroup_ino(struct mem_cgroup *memcg) +{ + return 0; +} + +static inline struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino) +{ + return NULL; +} +#endif + static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m) { return NULL; diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 20d7edf62a6a..e0b2209ab71c 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -351,13 +351,4 @@ void arch_remove_linear_mapping(u64 start, u64 size); extern bool mhp_supports_memmap_on_memory(unsigned long size); #endif /* CONFIG_MEMORY_HOTPLUG */ -#ifdef CONFIG_MHP_MEMMAP_ON_MEMORY -bool mhp_memmap_on_memory(void); -#else -static inline bool mhp_memmap_on_memory(void) -{ - return false; -} -#endif - #endif /* __LINUX_MEMORY_HOTPLUG_H */ diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 8af304f6b504..334ce79a3b91 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -41,6 +41,13 @@ struct vmem_altmap { * A more complete discussion of unaddressable memory may be found in * include/linux/hmm.h and Documentation/vm/hmm.rst. * + * MEMORY_DEVICE_COHERENT: + * Device memory that is cache coherent from device and CPU point of view. This + * is used on platforms that have an advanced system bus (like CAPI or CXL). A + * driver can hotplug the device memory using ZONE_DEVICE and with that memory + * type. Any page of a process can be migrated to such memory. However no one + * should be allowed to pin such memory so that it can always be evicted. + * * MEMORY_DEVICE_FS_DAX: * Host memory that has similar access semantics as System RAM i.e. DMA * coherent and supports page pinning. In support of coordinating page @@ -61,6 +68,7 @@ struct vmem_altmap { enum memory_type { /* 0 is reserved to catch uninitialized type fields */ MEMORY_DEVICE_PRIVATE = 1, + MEMORY_DEVICE_COHERENT, MEMORY_DEVICE_FS_DAX, MEMORY_DEVICE_GENERIC, MEMORY_DEVICE_PCI_P2PDMA, @@ -79,6 +87,18 @@ struct dev_pagemap_ops { * the page back to a CPU accessible page. */ vm_fault_t (*migrate_to_ram)(struct vm_fault *vmf); + + /* + * Handle the memory failure happens on a range of pfns. Notify the + * processes who are using these pfns, and try to recover the data on + * them if necessary. The mf_flags is finally passed to the recover + * function through the whole notify routine. + * + * When this is not implemented, or it returns -EOPNOTSUPP, the caller + * will fall back to a common handler called mf_generic_kill_procs(). + */ + int (*memory_failure)(struct dev_pagemap *pgmap, unsigned long pfn, + unsigned long nr_pages, int mf_flags); }; #define PGMAP_ALTMAP_VALID (1 << 0) @@ -143,6 +163,17 @@ static inline bool folio_is_device_private(const struct folio *folio) return is_device_private_page(&folio->page); } +static inline bool is_device_coherent_page(const struct page *page) +{ + return is_zone_device_page(page) && + page->pgmap->type == MEMORY_DEVICE_COHERENT; +} + +static inline bool folio_is_device_coherent(const struct folio *folio) +{ + return is_device_coherent_page(&folio->page); +} + static inline bool is_pci_p2pdma_page(const struct page *page) { return IS_ENABLED(CONFIG_PCI_P2PDMA) && diff --git a/include/linux/migrate.h b/include/linux/migrate.h index ae5bb67a9ba1..22c0a0cf5e0c 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -182,6 +182,7 @@ static inline unsigned long migrate_pfn(unsigned long pfn) enum migrate_vma_direction { MIGRATE_VMA_SELECT_SYSTEM = 1 << 0, MIGRATE_VMA_SELECT_DEVICE_PRIVATE = 1 << 1, + MIGRATE_VMA_SELECT_DEVICE_COHERENT = 1 << 2, }; struct migrate_vma { diff --git a/include/linux/mm.h b/include/linux/mm.h index cf3d0d673f6b..82b680261cf0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -601,7 +601,7 @@ struct vm_operations_struct { #endif /* * Called by vm_normal_page() for special PTEs to find the - * page for @addr. This is useful if the default behavior + * page for @addr. This is useful if the default behavior * (using pte_page()) would not find the correct page. */ struct page *(*find_special_page)(struct vm_area_struct *vma, @@ -658,6 +658,37 @@ static inline bool vma_is_accessible(struct vm_area_struct *vma) return vma->vm_flags & VM_ACCESS_FLAGS; } +static inline +struct vm_area_struct *vma_find(struct vma_iterator *vmi, unsigned long max) +{ + return mas_find(&vmi->mas, max); +} + +static inline struct vm_area_struct *vma_next(struct vma_iterator *vmi) +{ + /* + * Uses vma_find() to get the first VMA when the iterator starts. + * Calling mas_next() could skip the first entry. + */ + return vma_find(vmi, ULONG_MAX); +} + +static inline struct vm_area_struct *vma_prev(struct vma_iterator *vmi) +{ + return mas_prev(&vmi->mas, 0); +} + +static inline unsigned long vma_iter_addr(struct vma_iterator *vmi) +{ + return vmi->mas.index; +} + +#define for_each_vma(vmi, vma) while ((vma = vma_next(&(vmi))) != NULL) + +/* The MM code likes to work with exclusive end addresses */ +#define for_each_vma_range(vmi, vma, end) \ + while ((vma = vma_find(&(vmi), (end) - 1)) != NULL) + #ifdef CONFIG_SHMEM /* * The vma_is_shmem is not inline because it is used only by slow @@ -1842,8 +1873,9 @@ void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, unsigned long size); void zap_page_range(struct vm_area_struct *vma, unsigned long address, unsigned long size); -void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma, - unsigned long start, unsigned long end); +void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt, + struct vm_area_struct *start_vma, unsigned long start, + unsigned long end); struct mmu_notifier_range; @@ -1962,8 +1994,12 @@ extern unsigned long move_page_tables(struct vm_area_struct *vma, * for now all the callers are only use one of the flags at the same * time. */ -/* Whether we should allow dirty bit accounting */ -#define MM_CP_DIRTY_ACCT (1UL << 0) +/* + * Whether we should manually check if we can map individual PTEs writable, + * because something (e.g., COW, uffd-wp) blocks that from happening for all + * PTEs automatically in a writable mapping. + */ +#define MM_CP_TRY_CHANGE_WRITABLE (1UL << 0) /* Whether this protection change is for NUMA hints */ #define MM_CP_PROT_NUMA (1UL << 1) /* Whether this change is for write protecting */ @@ -2636,14 +2672,15 @@ extern int __split_vma(struct mm_struct *, struct vm_area_struct *, extern int split_vma(struct mm_struct *, struct vm_area_struct *, unsigned long addr, int new_below); extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *); -extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *, - struct rb_node **, struct rb_node *); extern void unlink_file_vma(struct vm_area_struct *); extern struct vm_area_struct *copy_vma(struct vm_area_struct **, unsigned long addr, unsigned long len, pgoff_t pgoff, bool *need_rmap_locks); extern void exit_mmap(struct mm_struct *); +void vma_mas_store(struct vm_area_struct *vma, struct ma_state *mas); +void vma_mas_remove(struct vm_area_struct *vma, struct ma_state *mas); + static inline int check_data_rlimit(unsigned long rlim, unsigned long new, unsigned long start, @@ -2691,8 +2728,9 @@ extern unsigned long mmap_region(struct file *file, unsigned long addr, extern unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long pgoff, unsigned long *populate, struct list_head *uf); -extern int __do_munmap(struct mm_struct *, unsigned long, size_t, - struct list_head *uf, bool downgrade); +extern int do_mas_munmap(struct ma_state *mas, struct mm_struct *mm, + unsigned long start, size_t len, struct list_head *uf, + bool downgrade); extern int do_munmap(struct mm_struct *, unsigned long, size_t, struct list_head *uf); extern int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior); @@ -2759,26 +2797,12 @@ extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long add extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr, struct vm_area_struct **pprev); -/** - * find_vma_intersection() - Look up the first VMA which intersects the interval - * @mm: The process address space. - * @start_addr: The inclusive start user address. - * @end_addr: The exclusive end user address. - * - * Returns: The first VMA within the provided range, %NULL otherwise. Assumes - * start_addr < end_addr. +/* + * Look up the first VMA which intersects the interval [start_addr, end_addr) + * NULL if none. Assume start_addr < end_addr. */ -static inline struct vm_area_struct *find_vma_intersection(struct mm_struct *mm, - unsigned long start_addr, - unsigned long end_addr) -{ - struct vm_area_struct *vma = find_vma(mm, start_addr); - - if (vma && end_addr <= vma->vm_start) - vma = NULL; - return vma; -} + unsigned long start_addr, unsigned long end_addr); /** * vma_lookup() - Find a VMA at a specific address @@ -2790,12 +2814,7 @@ struct vm_area_struct *find_vma_intersection(struct mm_struct *mm, static inline struct vm_area_struct *vma_lookup(struct mm_struct *mm, unsigned long addr) { - struct vm_area_struct *vma = find_vma(mm, addr); - - if (vma && addr < vma->vm_start) - vma = NULL; - - return vma; + return mtree_load(&mm->mm_mt, addr); } static inline unsigned long vm_start_gap(struct vm_area_struct *vma) @@ -2831,7 +2850,7 @@ static inline unsigned long vma_pages(struct vm_area_struct *vma) static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm, unsigned long vm_start, unsigned long vm_end) { - struct vm_area_struct *vma = find_vma(mm, vm_start); + struct vm_area_struct *vma = vma_lookup(mm, vm_start); if (vma && (vma->vm_start != vm_start || vma->vm_end != vm_end)) vma = NULL; @@ -2934,6 +2953,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, #define FOLL_NUMA 0x200 /* force NUMA hinting page fault */ #define FOLL_MIGRATION 0x400 /* wait for page to replace migration entry */ #define FOLL_TRIED 0x800 /* a retry, previous pass started an IO */ +#define FOLL_LRU 0x1000 /* return only LRU (anon or page cache) */ #define FOLL_REMOTE 0x2000 /* we are working on non-current tsk/mm */ #define FOLL_COW 0x4000 /* internal GUP flag */ #define FOLL_ANON 0x8000 /* don't do file mappings */ @@ -3233,7 +3253,10 @@ enum mf_flags { MF_SOFT_OFFLINE = 1 << 3, MF_UNPOISON = 1 << 4, MF_SW_SIMULATED = 1 << 5, + MF_NO_RETRY = 1 << 6, }; +int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index, + unsigned long count, int mf_flags); extern int memory_failure(unsigned long pfn, int flags); extern void memory_failure_queue(unsigned long pfn, int flags); extern void memory_failure_queue_kick(int cpu); @@ -3283,7 +3306,6 @@ enum mf_action_page_type { MF_MSG_DIFFERENT_COMPOUND, MF_MSG_HUGE, MF_MSG_FREE_HUGE, - MF_MSG_NON_PMD_HUGE, MF_MSG_UNMAP_FAILED, MF_MSG_DIRTY_SWAPCACHE, MF_MSG_CLEAN_SWAPCACHE, diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 6b961a29bf26..acbd8d03e01e 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -9,6 +9,7 @@ #include <linux/list.h> #include <linux/spinlock.h> #include <linux/rbtree.h> +#include <linux/maple_tree.h> #include <linux/rwsem.h> #include <linux/completion.h> #include <linux/cpumask.h> @@ -87,6 +88,7 @@ struct page { */ union { struct list_head lru; + /* Or, for the Unevictable "LRU list" slot */ struct { /* Always even, to negate PageTail */ @@ -94,6 +96,10 @@ struct page { /* Count page's or folio's mlocks */ unsigned int mlock_count; }; + + /* Or, free page */ + struct list_head buddy_list; + struct list_head pcp_list; }; /* See page-flags.h for PAGE_MAPPING_FLAGS */ struct address_space *mapping; @@ -402,21 +408,6 @@ struct vm_area_struct { unsigned long vm_end; /* The first byte after our end address within vm_mm. */ - /* linked list of VM areas per task, sorted by address */ - struct vm_area_struct *vm_next, *vm_prev; - - struct rb_node vm_rb; - - /* - * Largest free memory gap in bytes to the left of this VMA. - * Either between this VMA and vma->vm_prev, or between one of the - * VMAs below us in the VMA rbtree and its ->vm_prev. This helps - * get_unmapped_area find a free area of the right size. - */ - unsigned long rb_subtree_gap; - - /* Second cache line starts here. */ - struct mm_struct *vm_mm; /* The address space we belong to. */ /* @@ -480,9 +471,7 @@ struct vm_area_struct { struct kioctx_table; struct mm_struct { struct { - struct vm_area_struct *mmap; /* list of VMAs */ - struct rb_root mm_rb; - u64 vmacache_seqnum; /* per-thread vmacache */ + struct maple_tree mm_mt; #ifdef CONFIG_MMU unsigned long (*get_unmapped_area) (struct file *filp, unsigned long addr, unsigned long len, @@ -496,7 +485,6 @@ struct mm_struct { unsigned long mmap_compat_legacy_base; #endif unsigned long task_size; /* size of task vm space */ - unsigned long highest_vm_end; /* highest vma end address */ pgd_t * pgd; #ifdef CONFIG_MEMBARRIER @@ -676,6 +664,7 @@ struct mm_struct { unsigned long cpu_bitmap[]; }; +#define MM_MT_FLAGS (MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN) extern struct mm_struct init_mm; /* Pointer magic because the dynamic array size confuses some compilers. */ @@ -693,6 +682,27 @@ static inline cpumask_t *mm_cpumask(struct mm_struct *mm) return (struct cpumask *)&mm->cpu_bitmap; } +struct vma_iterator { + struct ma_state mas; +}; + +#define VMA_ITERATOR(name, mm, addr) \ + struct vma_iterator name = { \ + .mas = { \ + .tree = &mm->mm_mt, \ + .index = addr, \ + .node = MAS_START, \ + }, \ + } + +static inline void vma_iter_init(struct vma_iterator *vmi, + struct mm_struct *mm, unsigned long addr) +{ + vmi->mas.tree = &mm->mm_mt; + vmi->mas.index = addr; + vmi->mas.node = MAS_START; +} + struct mmu_gather; extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm); extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm); diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h index c1bc6731125c..0bb4b6da9993 100644 --- a/include/linux/mm_types_task.h +++ b/include/linux/mm_types_task.h @@ -25,18 +25,6 @@ #define ALLOC_SPLIT_PTLOCKS (SPINLOCK_SIZE > BITS_PER_LONG/8) /* - * The per task VMA cache array: - */ -#define VMACACHE_BITS 2 -#define VMACACHE_SIZE (1U << VMACACHE_BITS) -#define VMACACHE_MASK (VMACACHE_SIZE - 1) - -struct vmacache { - u64 seqnum; - struct vm_area_struct *vmas[VMACACHE_SIZE]; -}; - -/* * When updating this, please also update struct resident_page_types[] in * kernel/fork.c */ diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index aab70355d64f..607a4fcabbd4 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -355,15 +355,18 @@ enum zone_watermarks { }; /* - * One per migratetype for each PAGE_ALLOC_COSTLY_ORDER plus one additional - * for pageblock size for THP if configured. + * One per migratetype for each PAGE_ALLOC_COSTLY_ORDER. One additional list + * for THP which will usually be GFP_MOVABLE. Even if it is another type, + * it should not contribute to serious fragmentation causing THP allocation + * failures. */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define NR_PCP_THP 1 #else #define NR_PCP_THP 0 #endif -#define NR_PCP_LISTS (MIGRATE_PCPTYPES * (PAGE_ALLOC_COSTLY_ORDER + 1 + NR_PCP_THP)) +#define NR_LOWORDER_PCP_LISTS (MIGRATE_PCPTYPES * (PAGE_ALLOC_COSTLY_ORDER + 1)) +#define NR_PCP_LISTS (NR_LOWORDER_PCP_LISTS + NR_PCP_THP) /* * Shift to encode migratetype and order in the same integer, with order @@ -379,6 +382,7 @@ enum zone_watermarks { /* Fields and list protected by pagesets local_lock in page_alloc.c */ struct per_cpu_pages { + spinlock_t lock; /* Protects lists field */ int count; /* number of pages in the list */ int high; /* high watermark, emptying needed */ int batch; /* chunk size for buddy add/remove */ @@ -389,7 +393,7 @@ struct per_cpu_pages { /* Lists of pages, one per migrate type stored on the pcp-lists */ struct list_head lists[NR_PCP_LISTS]; -}; +} ____cacheline_aligned_in_smp; struct per_cpu_zonestat { #ifdef CONFIG_SMP @@ -1418,16 +1422,48 @@ extern size_t mem_section_usage_size(void); * (equal SECTION_SIZE_BITS - PAGE_SHIFT), and the * worst combination is powerpc with 256k pages, * which results in PFN_SECTION_SHIFT equal 6. - * To sum it up, at least 6 bits are available. + * To sum it up, at least 6 bits are available on all architectures. + * However, we can exceed 6 bits on some other architectures except + * powerpc (e.g. 15 bits are available on x86_64, 13 bits are available + * with the worst case of 64K pages on arm64) if we make sure the + * exceeded bit is not applicable to powerpc. */ -#define SECTION_MARKED_PRESENT (1UL<<0) -#define SECTION_HAS_MEM_MAP (1UL<<1) -#define SECTION_IS_ONLINE (1UL<<2) -#define SECTION_IS_EARLY (1UL<<3) -#define SECTION_TAINT_ZONE_DEVICE (1UL<<4) -#define SECTION_MAP_LAST_BIT (1UL<<5) +#define ENUM_SECTION_FLAG(MAPPER) \ + MAPPER(MARKED_PRESENT) \ + MAPPER(HAS_MEM_MAP) \ + MAPPER(IS_ONLINE) \ + MAPPER(IS_EARLY) \ + MAPPER(TAINT_ZONE_DEVICE, CONFIG_ZONE_DEVICE) \ + MAPPER(CANNOT_OPTIMIZE_VMEMMAP, CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP) \ + MAPPER(MAP_LAST_BIT) + +#define __SECTION_SHIFT_FLAG_MAPPER_0(x) +#define __SECTION_SHIFT_FLAG_MAPPER_1(x) SECTION_##x##_SHIFT, +#define __SECTION_SHIFT_FLAG_MAPPER(x, ...) \ + __PASTE(__SECTION_SHIFT_FLAG_MAPPER_, IS_ENABLED(__VA_ARGS__))(x) + +#define __SECTION_FLAG_MAPPER_0(x) +#define __SECTION_FLAG_MAPPER_1(x) SECTION_##x = BIT(SECTION_##x##_SHIFT), +#define __SECTION_FLAG_MAPPER(x, ...) \ + __PASTE(__SECTION_FLAG_MAPPER_, IS_ENABLED(__VA_ARGS__))(x) + +enum { + /* + * Generate a series of enumeration flags like SECTION_$name_SHIFT. + * Each entry in ENUM_SECTION_FLAG() macro will be generated to one + * enumeration iff the 2nd parameter of MAPPER() is defined or absent. + * The $name comes from the 1st parameter of MAPPER() macro. + */ + ENUM_SECTION_FLAG(__SECTION_SHIFT_FLAG_MAPPER) + /* + * Generate a series of enumeration flags like: + * SECTION_$name = BIT(SECTION_$name_SHIFT) + */ + ENUM_SECTION_FLAG(__SECTION_FLAG_MAPPER) +}; + #define SECTION_MAP_MASK (~(SECTION_MAP_LAST_BIT-1)) -#define SECTION_NID_SHIFT 6 +#define SECTION_NID_SHIFT SECTION_MAP_LAST_BIT_SHIFT static inline struct page *__section_mem_map_addr(struct mem_section *section) { @@ -1436,6 +1472,22 @@ static inline struct page *__section_mem_map_addr(struct mem_section *section) return (struct page *)map; } +#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP +static inline void section_mark_cannot_optimize_vmemmap(struct mem_section *ms) +{ + ms->section_mem_map |= SECTION_CANNOT_OPTIMIZE_VMEMMAP; +} + +static inline int section_cannot_optimize_vmemmap(struct mem_section *ms) +{ + return (ms && (ms->section_mem_map & SECTION_CANNOT_OPTIMIZE_VMEMMAP)); +} +#else +static inline void section_mark_cannot_optimize_vmemmap(struct mem_section *ms) +{ +} +#endif + static inline int present_section(struct mem_section *section) { return (section && (section->section_mem_map & SECTION_MARKED_PRESENT)); @@ -1466,12 +1518,19 @@ static inline int online_section(struct mem_section *section) return (section && (section->section_mem_map & SECTION_IS_ONLINE)); } +#ifdef CONFIG_ZONE_DEVICE static inline int online_device_section(struct mem_section *section) { unsigned long flags = SECTION_IS_ONLINE | SECTION_TAINT_ZONE_DEVICE; return section && ((section->section_mem_map & flags) == flags); } +#else +static inline int online_device_section(struct mem_section *section) +{ + return 0; +} +#endif static inline int online_section_nr(unsigned long nr) { diff --git a/include/linux/oom.h b/include/linux/oom.h index 02d1e7bbd8cd..7d0c9c48a0c5 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -78,15 +78,6 @@ static inline bool tsk_is_oom_victim(struct task_struct * tsk) } /* - * Use this helper if tsk->mm != mm and the victim mm needs a special - * handling. This is guaranteed to stay true after once set. - */ -static inline bool mm_is_oom_victim(struct mm_struct *mm) -{ - return test_bit(MMF_OOM_VICTIM, &mm->flags); -} - -/* * Checks whether a page fault on the given mm is still reliable. * This is no longer true if the oom reaper started to reap the * address space which is reflected by MMF_UNSTABLE flag set in @@ -106,8 +97,6 @@ static inline vm_fault_t check_stable_address_space(struct mm_struct *mm) return 0; } -bool __oom_reap_task_mm(struct mm_struct *mm); - long oom_badness(struct task_struct *p, unsigned long totalpages); diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 3f5490f6f038..de80f0c26b2f 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -650,6 +650,12 @@ __PAGEFLAG(Reported, reported, PF_NO_COMPOUND) #define PAGE_MAPPING_KSM (PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE) #define PAGE_MAPPING_FLAGS (PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE) +/* + * Different with flags above, this flag is used only for fsdax mode. It + * indicates that this page->mapping is now under reflink case. + */ +#define PAGE_MAPPING_DAX_COW 0x1 + static __always_inline bool folio_mapping_flags(struct folio *folio) { return ((unsigned long)folio->mapping & PAGE_MAPPING_FLAGS) != 0; diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 9ec23138e410..bf80adca980b 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -325,8 +325,8 @@ struct page_vma_mapped_walk { #define DEFINE_PAGE_VMA_WALK(name, _page, _vma, _address, _flags) \ struct page_vma_mapped_walk name = { \ .pfn = page_to_pfn(_page), \ - .nr_pages = compound_nr(page), \ - .pgoff = page_to_pgoff(page), \ + .nr_pages = compound_nr(_page), \ + .pgoff = page_to_pgoff(_page), \ .vma = _vma, \ .address = _address, \ .flags = _flags, \ diff --git a/include/linux/sched.h b/include/linux/sched.h index 72242bc73d85..9776dee75048 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -860,7 +860,6 @@ struct task_struct { struct mm_struct *active_mm; /* Per-thread vma caching: */ - struct vmacache vmacache; #ifdef SPLIT_RSS_COUNTING struct task_rss_stat rss_stat; diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h index 4d0a5be28b70..8270ad7ae14c 100644 --- a/include/linux/sched/coredump.h +++ b/include/linux/sched/coredump.h @@ -71,9 +71,8 @@ static inline int get_dumpable(struct mm_struct *mm) #define MMF_UNSTABLE 22 /* mm is unstable for copy_from_user */ #define MMF_HUGE_ZERO_PAGE 23 /* mm has ever used the global huge zero page */ #define MMF_DISABLE_THP 24 /* disable THP for all VMAs */ -#define MMF_OOM_VICTIM 25 /* mm is the oom victim */ -#define MMF_OOM_REAP_QUEUED 26 /* mm was queued for oom_reaper */ -#define MMF_MULTIPROCESS 27 /* mm is shared between processes */ +#define MMF_OOM_REAP_QUEUED 25 /* mm was queued for oom_reaper */ +#define MMF_MULTIPROCESS 26 /* mm is shared between processes */ /* * MMF_HAS_PINNED: Whether this mm has pinned any pages. This can be either * replaced in the future by mm.pinned_vm when it becomes stable, or grow into @@ -81,7 +80,7 @@ static inline int get_dumpable(struct mm_struct *mm) * pinned pages were unpinned later on, we'll still keep this bit set for the * lifecycle of this mm, just for simplicity. */ -#define MMF_HAS_PINNED 28 /* FOLL_PIN has run, never cleared */ +#define MMF_HAS_PINNED 27 /* FOLL_PIN has run, never cleared */ #define MMF_DISABLE_THP_MASK (1 << MMF_DISABLE_THP) #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\ diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index 76fbf92b04d9..64416f3e0a1f 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -73,6 +73,11 @@ struct shrinker { /* ID in shrinker_idr */ int id; #endif +#ifdef CONFIG_SHRINKER_DEBUG + int debugfs_id; + const char *name; + struct dentry *debugfs_entry; +#endif /* objs pending delete, per node */ atomic_long_t *nr_deferred; }; @@ -88,10 +93,30 @@ struct shrinker { */ #define SHRINKER_NONSLAB (1 << 3) -extern int prealloc_shrinker(struct shrinker *shrinker); +extern int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...); extern void register_shrinker_prepared(struct shrinker *shrinker); -extern int register_shrinker(struct shrinker *shrinker); +extern int register_shrinker(struct shrinker *shrinker, const char *fmt, ...); extern void unregister_shrinker(struct shrinker *shrinker); extern void free_prealloced_shrinker(struct shrinker *shrinker); extern void synchronize_shrinkers(void); -#endif + +#ifdef CONFIG_SHRINKER_DEBUG +extern int shrinker_debugfs_add(struct shrinker *shrinker); +extern void shrinker_debugfs_remove(struct shrinker *shrinker); +extern int shrinker_debugfs_rename(struct shrinker *shrinker, + const char *fmt, ...); +#else /* CONFIG_SHRINKER_DEBUG */ +static inline int shrinker_debugfs_add(struct shrinker *shrinker) +{ + return 0; +} +static inline void shrinker_debugfs_remove(struct shrinker *shrinker) +{ +} +static inline int shrinker_debugfs_rename(struct shrinker *shrinker, + const char *fmt, ...) +{ + return 0; +} +#endif /* CONFIG_SHRINKER_DEBUG */ +#endif /* _LINUX_SHRINKER_H */ diff --git a/include/linux/swapops.h b/include/linux/swapops.h index f24775b41880..bb7afd03a324 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -244,8 +244,10 @@ extern void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, spinlock_t *ptl); extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, unsigned long address); -extern void migration_entry_wait_huge(struct vm_area_struct *vma, - struct mm_struct *mm, pte_t *pte); +#ifdef CONFIG_HUGETLB_PAGE +extern void __migration_entry_wait_huge(pte_t *ptep, spinlock_t *ptl); +extern void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte); +#endif #else static inline swp_entry_t make_readable_migration_entry(pgoff_t offset) { @@ -271,8 +273,10 @@ static inline void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, spinlock_t *ptl) { } static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, unsigned long address) { } -static inline void migration_entry_wait_huge(struct vm_area_struct *vma, - struct mm_struct *mm, pte_t *pte) { } +#ifdef CONFIG_HUGETLB_PAGE +static inline void __migration_entry_wait_huge(pte_t *ptep, spinlock_t *ptl) { } +static inline void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte) { } +#endif static inline int is_writable_migration_entry(swp_entry_t entry) { return 0; diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 732b522bacb7..eee374c29c85 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -173,9 +173,8 @@ extern bool userfaultfd_remove(struct vm_area_struct *vma, unsigned long start, unsigned long end); -extern int userfaultfd_unmap_prep(struct vm_area_struct *vma, - unsigned long start, unsigned long end, - struct list_head *uf); +extern int userfaultfd_unmap_prep(struct mm_struct *mm, unsigned long start, + unsigned long end, struct list_head *uf); extern void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf); @@ -256,7 +255,7 @@ static inline bool userfaultfd_remove(struct vm_area_struct *vma, return true; } -static inline int userfaultfd_unmap_prep(struct vm_area_struct *vma, +static inline int userfaultfd_unmap_prep(struct mm_struct *mm, unsigned long start, unsigned long end, struct list_head *uf) { diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 404024486fa5..1ce8fadb2b1c 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -122,10 +122,6 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, NR_TLB_LOCAL_FLUSH_ALL, NR_TLB_LOCAL_FLUSH_ONE, #endif /* CONFIG_DEBUG_TLBFLUSH */ -#ifdef CONFIG_DEBUG_VM_VMACACHE - VMACACHE_FIND_CALLS, - VMACACHE_FIND_HITS, -#endif #ifdef CONFIG_SWAP SWAP_RA, SWAP_RA_HIT, diff --git a/include/linux/vmacache.h b/include/linux/vmacache.h deleted file mode 100644 index 6fce268a4588..000000000000 --- a/include/linux/vmacache.h +++ /dev/null @@ -1,28 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __LINUX_VMACACHE_H -#define __LINUX_VMACACHE_H - -#include <linux/sched.h> -#include <linux/mm.h> - -static inline void vmacache_flush(struct task_struct *tsk) -{ - memset(tsk->vmacache.vmas, 0, sizeof(tsk->vmacache.vmas)); -} - -extern void vmacache_update(unsigned long addr, struct vm_area_struct *newvma); -extern struct vm_area_struct *vmacache_find(struct mm_struct *mm, - unsigned long addr); - -#ifndef CONFIG_MMU -extern struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm, - unsigned long start, - unsigned long end); -#endif - -static inline void vmacache_invalidate(struct mm_struct *mm) -{ - mm->vmacache_seqnum++; -} - -#endif /* __LINUX_VMACACHE_H */ diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index bfe38869498d..19cf5b6892ce 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -125,12 +125,6 @@ static inline void vm_events_fold_cpu(int cpu) #define count_vm_tlb_events(x, y) do { (void)(y); } while (0) #endif -#ifdef CONFIG_DEBUG_VM_VMACACHE -#define count_vm_vmacache_event(x) count_vm_event(x) -#else -#define count_vm_vmacache_event(x) do {} while (0) -#endif - #define __count_zid_vm_events(item, zid, delta) \ __count_vm_events(item##_NORMAL - ZONE_NORMAL + zid, delta) diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h index d0337a41141c..cbd3ddd7c33d 100644 --- a/include/ras/ras_event.h +++ b/include/ras/ras_event.h @@ -360,7 +360,6 @@ TRACE_EVENT(aer_event, EM ( MF_MSG_DIFFERENT_COMPOUND, "different compound page after locking" ) \ EM ( MF_MSG_HUGE, "huge page" ) \ EM ( MF_MSG_FREE_HUGE, "free huge page" ) \ - EM ( MF_MSG_NON_PMD_HUGE, "non-pmd-sized huge page" ) \ EM ( MF_MSG_UNMAP_FAILED, "unmapping failed page" ) \ EM ( MF_MSG_DIRTY_SWAPCACHE, "dirty swapcache page" ) \ EM ( MF_MSG_CLEAN_SWAPCACHE, "clean swapcache page" ) \ diff --git a/include/trace/events/maple_tree.h b/include/trace/events/maple_tree.h new file mode 100644 index 000000000000..2be403bdc2bd --- /dev/null +++ b/include/trace/events/maple_tree.h @@ -0,0 +1,123 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM maple_tree + +#if !defined(_TRACE_MM_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_MM_H + + +#include <linux/tracepoint.h> + +struct ma_state; + +TRACE_EVENT(ma_op, + + TP_PROTO(const char *fn, struct ma_state *mas), + + TP_ARGS(fn, mas), + + TP_STRUCT__entry( + __field(const char *, fn) + __field(unsigned long, min) + __field(unsigned long, max) + __field(unsigned long, index) + __field(unsigned long, last) + __field(void *, node) + ), + + TP_fast_assign( + __entry->fn = fn; + __entry->min = mas->min; + __entry->max = mas->max; + __entry->index = mas->index; + __entry->last = mas->last; + __entry->node = mas->node; + ), + + TP_printk("%s\tNode: %p (%lu %lu) range: %lu-%lu", + __entry->fn, + (void *) __entry->node, + (unsigned long) __entry->min, + (unsigned long) __entry->max, + (unsigned long) __entry->index, + (unsigned long) __entry->last + ) +) +TRACE_EVENT(ma_read, + + TP_PROTO(const char *fn, struct ma_state *mas), + + TP_ARGS(fn, mas), + + TP_STRUCT__entry( + __field(const char *, fn) + __field(unsigned long, min) + __field(unsigned long, max) + __field(unsigned long, index) + __field(unsigned long, last) + __field(void *, node) + ), + + TP_fast_assign( + __entry->fn = fn; + __entry->min = mas->min; + __entry->max = mas->max; + __entry->index = mas->index; + __entry->last = mas->last; + __entry->node = mas->node; + ), + + TP_printk("%s\tNode: %p (%lu %lu) range: %lu-%lu", + __entry->fn, + (void *) __entry->node, + (unsigned long) __entry->min, + (unsigned long) __entry->max, + (unsigned long) __entry->index, + (unsigned long) __entry->last + ) +) + +TRACE_EVENT(ma_write, + + TP_PROTO(const char *fn, struct ma_state *mas, unsigned long piv, + void *val), + + TP_ARGS(fn, mas, piv, val), + + TP_STRUCT__entry( + __field(const char *, fn) + __field(unsigned long, min) + __field(unsigned long, max) + __field(unsigned long, index) + __field(unsigned long, last) + __field(unsigned long, piv) + __field(void *, val) + __field(void *, node) + ), + + TP_fast_assign( + __entry->fn = fn; + __entry->min = mas->min; + __entry->max = mas->max; + __entry->index = mas->index; + __entry->last = mas->last; + __entry->piv = piv; + __entry->val = val; + __entry->node = mas->node; + ), + + TP_printk("%s\tNode %p (%lu %lu) range:%lu-%lu piv (%lu) val %p", + __entry->fn, + (void *) __entry->node, + (unsigned long) __entry->min, + (unsigned long) __entry->max, + (unsigned long) __entry->index, + (unsigned long) __entry->last, + (unsigned long) __entry->piv, + (void *) __entry->val + ) +) +#endif /* _TRACE_MM_H */ + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/include/trace/events/mmap.h b/include/trace/events/mmap.h index 4661f7ba07c0..216de5f03621 100644 --- a/include/trace/events/mmap.h +++ b/include/trace/events/mmap.h @@ -42,6 +42,79 @@ TRACE_EVENT(vm_unmapped_area, __entry->low_limit, __entry->high_limit, __entry->align_mask, __entry->align_offset) ); + +TRACE_EVENT(vma_mas_szero, + TP_PROTO(struct maple_tree *mt, unsigned long start, + unsigned long end), + + TP_ARGS(mt, start, end), + + TP_STRUCT__entry( + __field(struct maple_tree *, mt) + __field(unsigned long, start) + __field(unsigned long, end) + ), + + TP_fast_assign( + __entry->mt = mt; + __entry->start = start; + __entry->end = end; + ), + + TP_printk("mt_mod %p, (NULL), SNULL, %lu, %lu,", + __entry->mt, + (unsigned long) __entry->start, + (unsigned long) __entry->end + ) +); + +TRACE_EVENT(vma_store, + TP_PROTO(struct maple_tree *mt, struct vm_area_struct *vma), + + TP_ARGS(mt, vma), + + TP_STRUCT__entry( + __field(struct maple_tree *, mt) + __field(struct vm_area_struct *, vma) + __field(unsigned long, vm_start) + __field(unsigned long, vm_end) + ), + + TP_fast_assign( + __entry->mt = mt; + __entry->vma = vma; + __entry->vm_start = vma->vm_start; + __entry->vm_end = vma->vm_end - 1; + ), + + TP_printk("mt_mod %p, (%p), STORE, %lu, %lu,", + __entry->mt, __entry->vma, + (unsigned long) __entry->vm_start, + (unsigned long) __entry->vm_end + ) +); + + +TRACE_EVENT(exit_mmap, + TP_PROTO(struct mm_struct *mm), + + TP_ARGS(mm), + + TP_STRUCT__entry( + __field(struct mm_struct *, mm) + __field(struct maple_tree *, mt) + ), + + TP_fast_assign( + __entry->mm = mm; + __entry->mt = &mm->mm_mt; + ), + + TP_printk("mt_mod %p, DESTROY\n", + __entry->mt + ) +); + #endif /* This part must be outside protection */ diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index e87cb2b80ed3..11524cda4a95 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -31,7 +31,6 @@ gfpflag_string(__GFP_HIGHMEM), \ gfpflag_string(GFP_DMA32), \ gfpflag_string(__GFP_HIGH), \ - gfpflag_string(__GFP_ATOMIC), \ gfpflag_string(__GFP_IO), \ gfpflag_string(__GFP_FS), \ gfpflag_string(__GFP_NOWARN), \ diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 7d32b1e797fb..005e5e306266 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -12,6 +12,10 @@ #include <linux/types.h> +/* ioctls for /dev/userfaultfd */ +#define USERFAULTFD_IOC 0xAA +#define USERFAULTFD_IOC_NEW _IO(USERFAULTFD_IOC, 0x00) + /* * If the UFFDIO_API is upgraded someday, the UFFDIO_UNREGISTER and * UFFDIO_WAKE ioctls should be defined as _IOW and not as _IOR. In |